First attempt at scraping with Python
A few days ago, with some time to kill, I wrote a scraper in PHP. When I ran it, the number of pages to walk through turned out to be huge and the process simply died. In hindsight, pausing a few seconds between pages would probably have helped (with that many pages of data, a crash was hardly surprising). The code is below:
class NovelSpider {  // class wrapper restored; the original snippet showed only the static methods

    /**
     * Fetch the raw HTML of a page
     * @param $url
     * @return string
     **/
    public static function getPageData($url){
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $content = curl_exec($ch);
        curl_close($ch);
        return $content;
    }

    /**
     * Fetch data from every listing page
     * @param $url starting link
     * @param int $currentPage
     * @return array
     **/
    public static function getAllData($url, $currentPage = 1){
        $data = array();
        $page = $currentPage;
        $nextPage = self::getNextLink($url);
        $data[0] = self::getNovelList($url);
        do {
            $page++;
            $data[$page] = self::getNovelList($nextPage);
            $nextPage = self::getNextLink($nextPage);
        } while ($nextPage);
        return $data;
    }

    // Parse the novel list (title, author, detail link) out of one listing page
    public static function getNovelList($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/<ul>.*<li class="qq_a">(.*)<\/li>.*<li class="qq_l">(.*)<\/li>.*<li class="qq_m">(.*)<\/li>.*<li class="qq_r">(.*)<\/li>.*<li class="qq_g"><a href="(.*)">(.*)<\/a><\/li>.*<\/ul>/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            foreach($result[5] as $key => $singleLink){
                $result['download-link'][$key] = self::getNovelDownloadPage($singleLink);
            }
            return $result;
        }else{
            return false;
        }
    }

    // Find the next-page link ('下一页' is the site's "next page" link text)
    public static function getNextLink($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/<li><a href="(\S{45,60})">下一页<\/a><\/li>/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }

    // Extract the download-page link from a novel's detail page
    public static function getNovelDownloadPage($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/.*<div class="downbox">.*<a href="(.*)".*title/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }

    // Extract the final download link ('下载地址【手机' is part of the site's link text)
    public static function getNovelDownloadLink($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/.*<div class="shuji".*<li><a rel="nofollow" href="(.*)" target="_blank">下载地址【手机/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }
}
The result after running it was not ideal, as shown in the screenshot below.
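For what it's worth, the pause-between-pages fix mentioned above only takes a few lines. Here is a minimal sketch in Python, since that is where this post ends up anyway; the fetch_page helper, the 2-second delay, and the 10-second timeout are my own choices, not from the original code:

import time
import requests

# Minimal sketch of throttled fetching: sleep a few seconds before each
# request so a long pagination walk does not hammer the server (or the
# local process). Delay and timeout values are assumptions.
def fetch_page(url, delay = 2):
    time.sleep(delay)
    return requests.get(url, timeout = 10).content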
I had been meaning to look at Python for a while anyway, so this seemed like a good place to start (the server runs Python 2.7, my local machine 3.5). The code is below:
***config.py

#!/usr/local/python2.7
# encoding=utf-8

user = 'root'
passwd = 'root'
host = 'localhost'
port = 3306
db = 'your_db'
charset = 'utf8'

(For Python 3.5, just use pymysql for MySQL instead of MySQLdb; when creating the connection, pass the parameters in the order host, port, user, passwd, db, charset.)

***mysqlModel.py

#!/usr/local/python2.7
# encoding=utf-8

import MySQLdb
import config


class mysqlModel:
    def __init__(self):
        self.mysqlConnect = MySQLdb.connect(
            user = config.user,
            passwd = config.passwd,
            host = config.host,
            port = config.port,
            db = config.db,
            charset = config.charset
        )
        self.mysqlHandle = self.mysqlConnect.cursor()

    def select(self, table, field = '*', where = '', groupBy = '', orderBy = '', limit = ''):
        self.mysqlHandle.execute(
            'select ' + field + ' from ' + table + ' ' + where + ' ' + groupBy + ' ' + orderBy + ' ' + limit)
        result = self.mysqlHandle.fetchall()
        return result

    def find(self, table, field = '*', where = '', groupBy = '', orderBy = '', limit = ''):
        self.mysqlHandle.execute(
            'select ' + field + ' from ' + table + ' ' + where + ' ' + groupBy + ' ' + orderBy + ' ' + limit)
        result = self.mysqlHandle.fetchone()
        return result

    def create(self, sql, data):
        try:
            self.mysqlHandle.execute(sql, data)
            self.mysqlConnect.commit()
            last_id = self.mysqlHandle.lastrowid
            return last_id
        except Exception:
            self.mysqlConnect.rollback()
            last_id = 0
            return last_id

    def edit(self, sql):
        try:
            self.mysqlHandle.execute(sql)
            self.mysqlConnect.commit()
            # rowcount is an attribute, not a method
            res = self.mysqlHandle.rowcount
        except Exception:
            self.mysqlConnect.rollback()
            res = 0
        return res

    def delete(self, sql):
        try:
            self.mysqlHandle.execute(sql)
            self.mysqlConnect.commit()
            res = self.mysqlHandle.rowcount
        except Exception:
            self.mysqlConnect.rollback()
            res = 0
        return res

    def mysqlClose(self):
        self.mysqlConnect.close()

    def __del__(self):
        # print('mysql closed')
        self.mysqlClose()


if __name__ == '__main__':
    print('you can do something...')

***connect.py

#!/usr/local/python2.7
# encoding=utf-8

import MySQLdb
import mysqlModel
import requests
from bs4 import BeautifulSoup
import time
import sys

URL = 'http://www.txt53.com/html/xuanhuan/list_21_7.html'


def get_page_data(url):
    if url == '':
        return ''
    page_data = requests.get(url).content
    return BeautifulSoup(page_data, 'html.parser')


# Fetch novel details from a single listing page
def get_single_novels(url):
    if url == '':
        return ''
    page_content = get_page_data(url)
    novel_wraps = page_content.find('div', class_ = 'xiashu')
    novel_list = []
    for single_wrap in novel_wraps.find_all('ul'):
        novel_detail = []
        # Novel title
        novel_title = single_wrap.find('li', class_ = 'qq_g').find('a').text
        # Link to the novel's chapter page
        novel_chapter_link = single_wrap.find('li', class_ = 'qq_g').find('a')['href']
        # Author
        novel_author = single_wrap.find('li', class_ = 'qq_r').text
        novel_detail.append(novel_title)
        novel_detail.append(int(time.time()))
        novel_detail.append(novel_author)
        # Store the novel record
        handle = mysqlModel.mysqlModel()
        create_sql = 'insert into novel_name(novel_title, create_at, author) values(%s, %s, %s)'
        novel_id = handle.create(create_sql, novel_detail)
        # Fetch the novel's chapters
        get_novel_chapters(novel_chapter_link, novel_id)
        novel_list.append(novel_id)
    return novel_list


def get_novel_chapters(url, novel_id):
    if url == '':
        return ''
    tmp_page_data = get_page_data(url)
    tmp_content = tmp_page_data.find('div', class_ = 'downbox')
    num = 0
    for content in tmp_content.find_all('a'):
        num += 1
        # Skip the first link in the download box
        if num == 1:
            continue
        chapter_url = content['href']
        page_data = get_page_data(chapter_url)
        read_list_wrap = page_data.find('div', class_ = 'read_list')
        chapter_count = 0
        for single_chapter in read_list_wrap.find_all('a'):
            chapter_link = single_chapter['href']
            chapter_content = single_chapter_content(chapter_link, novel_id)
            if chapter_content == '':
                continue
            chapter = []
            chapter.append(single_chapter.text)
            chapter.append(chapter_content)
            chapter.append(novel_id)
            handle = mysqlModel.mysqlModel()
            create_sql = 'insert into novel(title, content, novel_name_id) values(%s, %s, %s)'
            handle.create(create_sql, chapter)
            chapter_count += 1
        update_sql = 'update novel_name set chapter = ' + str(chapter_count) + ' where id = ' + str(novel_id)
        result = handle.edit(update_sql)
        if result != 0:
            print("download success~\r\n")
        else:
            print("download failed~\r\n")


# Fetch the text of a single chapter
def single_chapter_content(url, novel_id):
    if url == '':
        return ''
    page_data = get_page_data(url)
    chapter_content = page_data.find(id = 'view_content_txt').text
    if chapter_content == '':
        return ''
    return chapter_content


# Find the next-page link ('下一页' is the site's "next page" link text)
def get_next_page_link(page_content):
    page_wrap = page_content.find('div', class_ = 'yemian').find('ul')
    next_link = ''
    for link in page_wrap.find_all('a'):
        if link.text != '下一页':
            continue
        else:
            next_link = link['href']
    return next_link


# Crawl every novel in the section, page by page
def get_all_novel(url):
    while True:
        get_single_novels(url)
        page_data = get_page_data(url)
        next_page_link = get_next_page_link(page_data)
        url = next_page_link
        if next_page_link == '':
            break


def main():
    # Python 2.7 only; reload/setdefaultencoding do not exist on Python 3
    reload(sys)
    sys.setdefaultencoding('utf-8')
    url = sys.argv[1]
    # url = URL
    get_all_novel(url)


if __name__ == '__main__':
    main()
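One thing worth noting: connect.py assumes the two tables already exist. The schema is not included in the post, but the column names can be read off the INSERT and UPDATE statements; here is a sketch of what the tables might look like (the column types and lengths are my guesses, not from the original):

# Hypothetical schema inferred from the INSERT/UPDATE statements in connect.py;
# column names match the SQL above, the types are assumptions.
import MySQLdb
import config

conn = MySQLdb.connect(host = config.host, port = config.port, user = config.user,
                       passwd = config.passwd, db = config.db, charset = config.charset)
cur = conn.cursor()
# One row per novel; chapter is updated with the chapter count after crawling
cur.execute('''
    create table if not exists novel_name (
        id int auto_increment primary key,
        novel_title varchar(255),
        create_at int,
        author varchar(100),
        chapter int default 0
    )
''')
# One row per chapter, linked back to novel_name by novel_name_id
cur.execute('''
    create table if not exists novel (
        id int auto_increment primary key,
        title varchar(255),
        content longtext,
        novel_name_id int
    )
''')
conn.commit()
conn.close()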
To run it, just execute python connect.py url. This is the first piece of code I have written in Python, so it no doubt has plenty of shortcomings; if anything needs correcting, feel free to leave a comment.
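For example, using the section link that is hard-coded as the URL constant at the top of connect.py:

python connect.py http://www.txt53.com/html/xuanhuan/list_21_7.html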
Below is a sample of the collected results.