python多进程实践-d

作者:Shine 发布于:2019-04-28 16:05:59 浏览:894次 分类:PHP

python多进程实践

  • fake_useragent(UserAgent 类):生成随机 User-Agent 请求头以模拟真实请求

  • peewee:轻量级ORM模块

  • multiprocessing:Python 标准库中最简单基础的多进程实现(本文使用其进程池 Pool)

  • logging:调试以及记录程序中出现的问题

  • traceback:提取、格式化和打印程序的stack traces信息

#!/usr/local/bin/python3
import requests
from fake_useragent import UserAgent
import json
from peewee import *
import datetime
import time
from multiprocessing import Process, Pool
import os
import logging
import traceback

# Root logger: only ERROR and above are written, appended to a local log file.
logging.basicConfig(
    filename='collect-multiprocess.log',
    filemode='a',
    level=logging.ERROR,
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)

# Genre display name -> numeric genre id used as the `genres` query parameter.
id_all = {
    "动作": 1,
    "角色扮演": 5,
    "横版过关": 41,
    "冒险": 4,
    "射击": 48,
    "第一人称射击": 32,
    "策略": 2,
    "益智": 18,
    "模拟": 7,
    "体育": 3,
    "竞速": 6,
    "格斗": 9,
    "乱斗/清版": 37,
    "即时战略": 12,
    "音乐/旋律": 19,
}

# Search endpoint template: first placeholder is the genre id, second is the
# `more` paging value.
comment_api = 'https://www.douban.com/j/ilmen/game/search?genres={}&platforms=&q=&sort=rating&more={}'

# Database handle the peewee model below is bound to.
db = MySQLDatabase('thinkphp', user='root', charset='utf8mb4')


class DouBanGame(Model):
    """peewee model for one scraped game stored in the ``douban_games`` table.

    Every column except ``create_at`` is declared as a ``CharField``; rows are
    bulk-inserted by ``get_data`` from the search API response.
    """
    # NOTE(review): `content` holds review text in a CharField with no explicit
    # max_length -- confirm the column in the real table is wide enough.

    # title, cover, star, rating, platforms, n_ratings and genres are copied
    # from the same-named keys of each game entry in the API response.
    title = CharField()
    cover = CharField()
    star = CharField()
    # Genre display name (a key of `id_all`) the crawl was run for.
    type = CharField()
    rating = CharField()
    platforms = CharField()
    n_ratings = CharField()
    genres = CharField()
    # Review text of the game, or '' when the API gives no string content.
    content = CharField()
    # Timestamp taken when the row was prepared for insertion.
    create_at = DateTimeField()

    class Meta:
        # Bind the model to the module-level database and fix the table name.
        database = db
        table_name = 'douban_games'


def _fetch_page(genres, more, headers):
    """Request one page of search results for a genre and return the parsed JSON."""
    response = requests.get(
        comment_api.format(genres, more),
        headers=headers,
        # Without a timeout a stalled connection would block this process forever.
        timeout=30,
    )
    return json.loads(response.text)


def _build_row(game, game_type):
    """Turn one game entry of the API response into a DouBanGame row dict."""
    # `review` may be absent or null; only a string `content` is stored.
    review = game.get('review')
    content = review.get('content') if isinstance(review, dict) else None
    return {
        'title': game['title'],
        'cover': game['cover'],
        'type': game_type,
        'star': game['star'],
        'rating': game['rating'],
        'platforms': game['platforms'],
        'n_ratings': game['n_ratings'],
        'genres': game['genres'],
        'content': content if isinstance(content, str) else '',
        'create_at': datetime.datetime.now(),
    }


def get_data(genres):
    """Crawl every game of one genre and store the rows in ``douban_games``.

    Pages are fetched one after another: the first with ``more=1``, each
    following one with the ``more`` value returned by the previous response.
    Each page is bulk-inserted with ``DouBanGame.insert_many``. A page that
    fails to convert or insert is logged at ERROR level and skipped, so one
    bad page does not abort the whole genre.

    Args:
        genres: Numeric genre id; must be one of the values of ``id_all``.

    Returns:
        None.

    Raises:
        KeyError: If ``genres`` is not a known genre id, or a response lacks
            the ``total`` / ``more`` keys.
        requests.RequestException: If an HTTP request fails or times out.
    """
    logging.info(genres)
    # print('Run task as %s (%s)...' % (genres, os.getpid()))

    # Reverse lookup: genre id -> display name.
    game_type = {genre_id: name for name, genre_id in id_all.items()}[genres]

    headers = {"User-Agent": UserAgent(verify_ssl=False).random}

    # The first page provides both the total count and the first batch of
    # games, so it is processed directly instead of being requested twice.
    page = _fetch_page(genres, 1, headers)
    logging.info(page)

    total = page['total']
    print('{}类别共{}个游戏,开始爬取!'.format(game_type, total))

    fetched = 0
    while fetched < total:
        games = page.get('games') or []
        if not games:
            print('empty data!')
            print('NO%s' % fetched)
            break

        # Count the page before storing it so the loop always makes progress,
        # even when the conversion or the insert below fails.
        fetched += len(games)

        try:
            # Every game of the page is stored, including the last one.
            rows = [_build_row(game, game_type) for game in games]
            last_id = DouBanGame.insert_many(rows).execute()
            print(last_id)
        except Exception:
            # The root logger is configured at ERROR level, so failures must
            # be logged at ERROR (with traceback) to reach the log file.
            logging.exception('failed to store a page for genres=%s', genres)

        if fetched < total:
            page = _fetch_page(genres, page['more'], headers)


if __name__ == '__main__':
    # Entry point: crawl every genre of `id_all` in a pool of 4 worker processes.
    print('Parent process %s.' % os.getpid())

    try:
        # The context manager releases the pool even if scheduling fails.
        with Pool(4) as p:
            # Keep the AsyncResult handles: a worker's exception is only
            # re-raised in the parent when .get() is called on its result.
            pending = [
                (genres, p.apply_async(get_data, args=(genres,)))
                for genres in id_all.values()
            ]

            print("waiting for all subProcesses done...")
            p.close()
            p.join()

        for genres, result in pending:
            try:
                result.get()
            except Exception:
                # Logged at ERROR so it passes the configured log level.
                logging.exception('get_data failed for genres=%s', genres)

        print('All subProcesses done.')
    except Exception:
        logging.exception('multiprocess crawl failed')

代码地址 https://github.com/jumoshen/python-practice

标签: python process
声明:文章内容由作者原创或整理,转载请标明出处!
暂留位置!--请勿随意修改