使用python初试采集

作者:Shine 发布于:2017-12-05 11:23:34 浏览:987次 分类:PHP

使用python初试采集

前些天闲着无聊用php写了段采集的代码,执行后发现要遍历的页面有些多,直接炸了,其实在中间加几秒的停留应该会好些(那么多页数据 不炸就奇怪了),代码如下

    /**
     * 获取页面html数据
     * @param $url
     * @return string
     **/
    public static function getPageData($url){
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $content = curl_exec($ch);
        return $content;
    }

    /**
     * 获取全部数据
     * @param $url 初始链接
     * @param int $currentPage
     * @return data
     **/
    public static function getAllData($url, $currentPage = 1){
        $data     = array();
        $page     = $currentPage;
        $nextPage = self::getNextLink($url);
        $data[0]  = self::getNovelList($url);
        do {
            $page++;
            $data[$page] = self::getNovelList($nextPage);
            $nextPage    = self::getNextLink($nextPage);
        } while ($nextPage);

        return $data;
    }

    public static function getNovelList($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/<ul>.*<li class="qq_a">(.*)<\/li>.*<li class="qq_l">(.*)<\/li>.*<li class="qq_m">(.*)<\/li>.*<li class="qq_r">(.*)<\/li>.*<li class="qq_g"><a href="(.*)">(.*)<\/a><\/li>.*<\/ul>/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            foreach($result[5] as $key => $singleLink){
                $result['download-link'][$key] = self::getNovelDownloadPage($singleLink);
            }
            return $result;
        }else{
            return false;
        }
    }

    public static function getNextLink($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/<li><a href="(\S{45,60})">下一页<\/a><\/li>/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }

    public static function getNovelDownloadPage($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/.*<div class="downbox">.*<a href="(.*)".*title/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }

    public static function getNovelDownloadLink($url){
        $content = self::getPageData($url);
        preg_match_all(
            '/.*<div class="shuji".*<li><a rel="nofollow" href="(.*)" target="_blank">下载地址【手机/Us',
            $content,
            $result
        );
        if(!empty($result[0])){
            unset($result[0]);
            return $result[1][0];
        }else{
            return false;
        }
    }

执行后效果并不理想,如下图

image.png

之前一直想看python来着,于是乎就先从这里入手了(服务器python版本为2.7的 本地是3.5的),代码如下

***config.py
#!/usr/local/python2.7
# encoding=utf-8
# MySQL connection settings consumed by mysqlModel.py.
user = 'root'        # database user
passwd = 'root'      # database password
host = 'localhost'   # MySQL server host
port = 3306          # MySQL server port
db = 'your_db'       # target database name
charset = 'utf8'     # connection character set

3.5版本的python mysql使用pymysql即可
创建mysql.connect将参数顺序依次改为host port user passwd db charset即可

***mysqlModel.py
#!/usr/local/python2.7
# encoding=utf-8
import MySQLdb
import config


class mysqlModel:
    """Thin wrapper around a MySQLdb connection with CRUD helper methods.

    Connection parameters come from config.py. One connection and one
    cursor are opened per instance and closed when the instance dies.
    """

    def __init__(self):
        self.mysqlConnect = MySQLdb.connect(
                user = config.user,
                passwd = config.passwd,
                host = config.host,
                port = config.port,
                db = config.db,
                charset = config.charset
        )
        self.mysqlHandle = self.mysqlConnect.cursor()

    def select(self, table, field = '*', where = '', groupBy = '', orderBy = '', limit = ''):
        """Run a SELECT and return all rows.

        NOTE(review): clauses are concatenated directly into the SQL string,
        so callers must never pass untrusted input here (SQL injection risk).
        """
        self.mysqlHandle.execute(
                'select ' + field + ' from ' + table + ' ' + where + ' ' + groupBy + ' ' + orderBy + ' ' + limit)
        result = self.mysqlHandle.fetchall()
        return result

    def find(self, table, field = '*', where = '', groupBy = '', orderBy = '', limit = ''):
        """Run a SELECT and return only the first row (same injection caveat as select)."""
        self.mysqlHandle.execute(
                'select ' + field + ' from ' + table + ' ' + where + ' ' + groupBy + ' ' + orderBy + ' ' + limit)
        result = self.mysqlHandle.fetchone()
        return result

    def create(self, sql, data):
        """Execute a parameterized INSERT; return the new row id, or 0 on failure."""
        try:
            self.mysqlHandle.execute(sql, data)
            self.mysqlConnect.commit()
            return self.mysqlHandle.lastrowid
        except Exception:
            self.mysqlConnect.rollback()
            return 0

    def edit(self, sql):
        """Execute an UPDATE; return the affected row count, or 0 on failure.

        Fix: cursor.rowcount is an attribute, not a method. The original
        called rowcount(), which raised TypeError *after* the commit, was
        swallowed by a bare except, and made edit() always return 0.
        """
        try:
            self.mysqlHandle.execute(sql)
            self.mysqlConnect.commit()
            return self.mysqlHandle.rowcount
        except Exception:
            self.mysqlConnect.rollback()
            return 0

    def delete(self, sql):
        """Execute a DELETE; return the affected row count, or 0 on failure.

        Same rowcount-attribute fix as edit().
        """
        try:
            self.mysqlHandle.execute(sql)
            self.mysqlConnect.commit()
            return self.mysqlHandle.rowcount
        except Exception:
            self.mysqlConnect.rollback()
            return 0

    def mysqlClose(self):
        """Close the underlying connection."""
        self.mysqlConnect.close()

    def __del__(self):
        # Best-effort cleanup; the connection may already be gone at
        # interpreter shutdown.
        self.mysqlClose()


# Moved out of the class body: the original guard sat inside the class and
# ran at class-definition time, which worked only by accident (class scope
# falls through to module globals for __name__). Behavior is unchanged.
if __name__ == '__main__':
    print('you can do something...')
        

***connect.py

#!/usr/local/python2.7
# encoding=utf-8

import MySQLdb
import mysqlModel
import requests
from bs4 import BeautifulSoup
import time
import sys

URL = 'http://www.txt53.com/html/xuanhuan/list_21_7.html'

def get_page_data(url):
    """Download *url* and return a BeautifulSoup tree ('' for an empty url)."""
    if url == '':
        return ''

    raw_html = requests.get(url).content
    soup = BeautifulSoup(raw_html, 'html.parser')
    return soup


# Scrape one listing page: store each novel plus its chapters, return the ids.
def get_single_novels(url):
    if url == '':
        return ''

    listing = get_page_data(url)
    novel_wrap = listing.find('div', class_ = 'xiashu')

    stored_ids = []
    insert_sql = 'insert into novel_name(novel_title, create_at, author) values(%s, %s, %s)'

    for block in novel_wrap.find_all('ul'):
        # Title and detail-page link live in the same anchor.
        anchor = block.find('li', class_ = 'qq_g').find('a')
        title = anchor.text
        chapter_link = anchor['href']

        # Author column.
        author = block.find('li', class_ = 'qq_r').text

        # Persist the novel record: (title, creation timestamp, author).
        db = mysqlModel.mysqlModel()
        novel_id = db.create(insert_sql, [title, int(time.time()), author])

        # Pull and store the novel's chapters.
        get_novel_chapters(chapter_link, novel_id)

        stored_ids.append(novel_id)

    return stored_ids


def get_novel_chapters(url, novel_id):
    """Scrape every chapter of one novel, insert each into `novel`, and
    write the final chapter count back onto the `novel_name` row.

    :param url: novel detail-page URL (the page with the download box)
    :param novel_id: primary key of the already-inserted novel_name row
    :return: '' on bad input / missing links, otherwise None
    """
    if url == '':
        return ''
    detail_page = get_page_data(url)
    downbox = detail_page.find('div', class_ = 'downbox')

    # The reading page is the LAST anchor in the download box; the first
    # anchor is deliberately skipped (as in the original loop). Guard
    # against a box with fewer than two links — the original left
    # chapter_url unbound and crashed with UnboundLocalError.
    anchors = downbox.find_all('a')
    if len(anchors) < 2:
        return ''
    chapter_url = anchors[-1]['href']

    page_data = get_page_data(chapter_url)
    read_list_wrap = page_data.find('div', class_ = 'read_list')

    # One shared DB handle, created BEFORE the loop: the original opened a
    # new connection per chapter and, when a novel had no chapters, hit a
    # NameError at the final handle.edit() because `handle` was only ever
    # bound inside the loop body.
    handle = mysqlModel.mysqlModel()
    create_sql = 'insert into novel(title, content, novel_name_id) values(%s, %s, %s)'

    chapter_count = 0
    for single_chapter in read_list_wrap.find_all('a'):
        chapter_content = single_chapter_content(single_chapter['href'], novel_id)
        if chapter_content == '':
            continue

        handle.create(create_sql, [single_chapter.text, chapter_content, novel_id])
        chapter_count += 1

    # Both operands are ints we computed ourselves, so interpolation is safe.
    update_sql = 'update novel_name set chapter = ' + str(chapter_count) + ' where id = ' + str(novel_id)
    result = handle.edit(update_sql)

    if result != 0:
        print("download success~\r\n")
    else:
        print("download failed~\r\n")


# Fetch the text of a single chapter page.
def single_chapter_content(url, novel_id):
    """Return the chapter body text, or '' for an empty url or empty body.

    novel_id is accepted for signature compatibility but not used here.
    """
    if url == '':
        return ''

    page = get_page_data(url)
    body_text = page.find(id = 'view_content_txt').text

    # An empty body and the early-exit case both yield '' — a plain
    # return covers them identically.
    return body_text


# Find the next-page link in the pagination bar.
def get_next_page_link(page_content):
    """Return the href of the '下一页' (next page) anchor, or '' when absent.

    Fix: the original mixed a tab with spaces on the `if` line, which is a
    SyntaxError under Python 3 and fragile under Python 2. As before, the
    LAST matching anchor wins if several are present.
    """
    page_wrap = page_content.find('div', class_ = 'yemian').find('ul')

    next_link = ''
    for link in page_wrap.find_all('a'):
        if link.text == '下一页':
            next_link = link['href']
    return next_link


# Crawl an entire section: scrape each listing page, then follow the
# next-page link until there is none.
def get_all_novel(url):
    """Drive the full crawl starting at *url*.

    Fix: the original mixed a tab with spaces on the first loop line —
    a SyntaxError under Python 3 and fragile under Python 2.
    """
    while True:
        get_single_novels(url)
        # The page is fetched a second time here solely to locate the
        # next-page link (same as the original flow).
        page_data = get_page_data(url)

        next_page_link = get_next_page_link(page_data)

        url = next_page_link

        if next_page_link == '':
            break


def main():
    """Entry point: `python connect.py <start-url>` (falls back to URL)."""
    # Python 2-only UTF-8 hack (reload/setdefaultencoding do not exist on
    # Python 3); kept because the shebang targets python2.7.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    # Fix: the original indexed sys.argv[1] unconditionally and crashed
    # with IndexError when no argument was given; fall back to the
    # module-level default URL instead.
    url = sys.argv[1] if len(sys.argv) > 1 else URL
    get_all_novel(url)


if __name__ == '__main__':
    main()

运行时只需执行python connect.py url 即可,这是我用python写的第一段代码,难免有许多不足的地方,有需要指正的地方大家可以留言

下图是采集到的部分结果

image.png



标签: python
声明:文章内容由作者原创或整理,转载请标明出处!
暂留位置!--请勿随意修改