# python 进程/线程/协程 测试 (process/thread/coroutine benchmark)

# Author: yeshengbao
# --      coding: utf-8     --
# @Time  : 2018/5/24  21:38

import datetime
import requests
import os
import hashlib
from multiprocessing import Process
from lxml import etree
from threading import Thread
from gevent import monkey
import gevent

monkey.patch_socket()    # must run before any socket use so gevent can switch greenlets on I/O

# Directory that receives the downloaded text. The original value was '',
# which made os.path.exists('') return False and os.mkdir('') raise
# FileNotFoundError on every run.
bag = 'novel_download'
os.makedirs(bag, exist_ok=True)  # idempotent: no race between exists() and mkdir()



class DouTu(object):
    """Scrape every chapter of one novel from 23us.so and append each
    chapter to a single text file.

    Chapter pages are fetched concurrently with gevent greenlets; the
    socket module must be monkey-patched before this class is used.
    """

    def __init__(self):
        # Index page listing all chapters of the target novel.
        self.url = 'http://www.23us.so/files/article/html/6/6926/index.html'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/64.0.3282.186 Safari/537.36",
        }

    def md5(self, strs):
        """Return the hex MD5 digest of *strs* (UTF-8 encoded)."""
        return hashlib.md5(strs.encode('utf-8')).hexdigest()

    def get_source(self, url, headers, retries=5):
        """GET *url* and return the raw body as bytes.

        The original implementation retried by calling itself recursively
        with no bound, which exhausts the stack when a URL stays
        unreachable. Retry at most *retries* times instead, and re-raise
        the last request error if every attempt fails.
        """
        last_err = None
        for _ in range(retries):
            try:
                return requests.get(url, headers=headers, timeout=10).content
            except requests.RequestException as err:  # narrow: only request failures retry
                last_err = err
        raise last_err

    def get_detail_content(self, frction_detail_url):
        """Download one chapter page and append 'title :content' to text.txt."""
        if not frction_detail_url:
            return
        html = self.get_source(frction_detail_url, self.headers).decode('utf-8')
        doc = etree.HTML(html)
        title = doc.xpath('.//div[@class="bdsub"]/dl/dd[1]/h1/text()')[0]
        content = ''.join(doc.xpath('.//div[@class="bdsub"]/dl/dd[@id="contents"]/text()')).strip().replace('\n', '').replace('\t', '')
        if content:
            # os.path.join instead of the original hard-coded '\\' separator,
            # so the path also works outside Windows.
            with open(os.path.join(bag, 'text.txt'), 'a+', encoding='utf-8') as fp:
                fp.write(title + ' :' + content + '\n')
                print('正在写入{}_{}'.format(title, content))

    def analysis_index(self, html):
        """Parse the chapter index and fetch every chapter concurrently."""
        doc = etree.HTML(html)
        td_list = doc.xpath('.//table[@id="at"]//td[@class="L"]')
        # gevent.spawn already schedules the greenlet — the original extra
        # start() call was redundant.
        greenlets = [
            gevent.spawn(self.get_detail_content, td.xpath('./a/@href')[0])
            for td in td_list
        ]
        gevent.joinall(greenlets)  # block until every download has finished

    def begin_spider(self):
        """Entry point: fetch the index page and crawl all chapters."""
        html = self.get_source(self.url, self.headers).decode('utf-8')
        self.analysis_index(html)


if __name__ == '__main__':
    # Guard so the scrape does not fire on import; also required for any
    # multiprocessing variant of this script to work on Windows.
    start_time = datetime.datetime.now()  # program start time
    doutu = DouTu()
    doutu.begin_spider()
    over_time = datetime.datetime.now()   # program end time
    total_time = (over_time - start_time).total_seconds()
    print('程序共计%s秒' % total_time)



# 线程 620页 约40s
# 协程         18s


# 进程的用法 必须作用在  if __name__ == '__main__': 里

# thread_lists = []
# for page in range(50, 81):
#     while True:
#         if len(thread_lists) < 8:
#             # th = threading.Thread(target=dou.begin_by_page,args=(page,))
#             th = multiprocessing.Process(target=dou.begin_by_page, args=(page,))
#             th.start()
#             thread_lists.append(th)
#             break
#         else:
#             time.sleep(3)
#             print(thread_lists)
#             print('进程池已经满了')
#             for ths in thread_lists:
#                 if not ths.is_alive():
#                     thread_lists.remove(ths)
# for ths in thread_lists:
#     ths.join()

# 猜你喜欢 (blog "you may also like" footer — pasted text, not code)
# 转载自 www.cnblogs.com/yijian001/p/9085766.html