爬虫 - 多线程、线程池、协程

线程池

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
#import threading

# multiprocessing.dummy 是多进程类库里的一个多线程模块, 其中有一个类 Pool, 表示线程池
from multiprocessing.dummy import Pool
import requests
import Queue
import time

class Douban(object):
    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

        # 创建队列保存数据
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        print url
        html = requests.get(url, headers = self.headers).content
        time.sleep(1)
        self.parse_page(html)


    def parse_page(self, html):
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            # 获取电影标题
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # 获取电影评分
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]

            # 将数据存储到队列里
            self.data_queue.put(score + "\t" + title)
            #print score, title

    def start_work(self):
        #for url in self.url_list:
            # 发送请求
        #    html = self.send_request(url)
            # 解析响应

        # 创建线程池
        pool = Pool(len(self.url_list))
        pool.map(self.send_request, self.url_list)
        pool.close()
        # 主线程等待所有子线程执行结束,主线程再执行后面的代码
        pool.join()

        while not self.data_queue.empty():
            print self.data_queue.get()
            self.count += 1

        print "\n%d" % self.count


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()

    print "[INFO]: Useing %f secend" % (time.time() - start)

多线程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import threading
import Queue
import time

class Douban(object):
    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

        # 创建队列保存数据
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        print url
        html = requests.get(url, headers = self.headers).content
        time.sleep(1)
        self.parse_page(html)


    def parse_page(self, html):
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            # 获取电影标题
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # 获取电影评分
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]

            # 将数据存储到队列里
            self.data_queue.put(score + "\t" + title)
            #print score, title

    def start_work(self):
        thread_list = []
        for url in self.url_list:
            # 发送请求
            #html = self.send_request(url)
            thread = threading.Thread(target = self.send_request, args = [url])
            thread.start()
            thread_list.append(thread)


        # 让主线程等待,等待所有子线程执行结束,再向下执行代码
        for thread in thread_list:
            thread.join()

        while not self.data_queue.empty():
            print self.data_queue.get()
            self.count += 1

        print "\n%d" % self.count


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()

    print "[INFO]: Useing %f secend" % (time.time() - start)

协程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import Queue
import time

import gevent
from gevent import monkey
monkey.patch_all()
# gevent 可以用同步的语法写异步的程序。
# monkey.patch_all() 在Python程序执行的时候,会动态的将网络库(socket, select)打个补丁,变为异步的库。
# 让程序在进行网络操作的时候,都变为异步的方式去执行。

class Douban(object):
    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

        # 创建队列保存数据
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        print url
        html = requests.get(url, headers = self.headers).content
        time.sleep(1)
        self.parse_page(html)


    def parse_page(self, html):
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")

        for node in node_list:
            # 获取电影标题
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # 获取电影评分
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]

            # 将数据存储到队列里
            self.data_queue.put(score + "\t" + title)
            #print score, title

    def start_work(self):
        #for url in self.url_list:
            # 发送请求
        #    html = self.send_request(url)
            # 解析响应

        # 创建任务列表,保存所有的协程任务
        job_list = []
        for url in self.url_list:
            # 创建一个协程任务
            job = gevent.spawn(self.send_request, url)
            # 将任务添加到列表里
            job_list.append(job)
        # 将所有的协程任务添加到任务队列里执行
        gevent.joinall(job_list)


        #gevent.joinall([gevent.spawn(self.send_request, url) for url in self.url_list])
        #job_list = [gevent.spawn(self.send_request, url) for url in self.url_list]
        #gevent.joinall(job_list)

        while not self.data_queue.empty():
            print self.data_queue.get()
            self.count += 1

        print "\n%d" % self.count


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()

    print "[INFO]: Useing %f secend" % (time.time() - start)

猜你喜欢

转载自blog.csdn.net/redpintings/article/details/80745750