线程池
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree
#import threading
# multiprocessing.dummy is the thread-based module inside the multiprocessing library; its Pool class is a thread pool.
from multiprocessing.dummy import Pool
import requests
import Queue
import time
class Douban(object):
    """Scrape the Douban Top 250 movie list with a thread pool.

    Each list page is fetched and parsed by a pool worker, which pushes one
    "score<TAB>title" string per movie onto a shared queue. After all workers
    have finished, the main thread drains the queue, prints every record and
    finally prints the number of records.
    """

    # Seconds to wait on the server before giving up on a page. Without a
    # timeout a stalled connection would block its worker (and therefore
    # the final join) indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per list page: start offsets 0, 25, ..., 225 (ten pages).
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
        # Queue that collects the records produced by the workers.
        self.data_queue = Queue.Queue()
        # Number of records printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its HTML to parse_page().

        A failed or timed-out request is reported and skipped so that a
        single bad page does not discard the results of the other pages.
        """
        print(url)
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as err:
            print("[WARN]: request failed for %s: %s" % (url, err))
            return
        # Pause after each download, presumably to go easy on the server.
        time.sleep(1)
        self.parse_page(response.content)

    @staticmethod
    def _format_record(titles, scores):
        """Build "score<TAB>title" from xpath results, or None if one is empty."""
        if not titles or not scores:
            return None
        return scores[0] + "\t" + titles[0]

    def parse_page(self, html):
        """Queue a "score<TAB>title" record for every movie found in *html*."""
        html_obj = etree.HTML(html)
        if html_obj is None:
            # NOTE(review): lxml can hand back None when the markup has no
            # root element; there is nothing to extract in that case.
            return
        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title (first <span> inside the header link).
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating.
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Entries missing either field are skipped; indexing [0] on an
            # empty result would raise IndexError and drop the rest of the page.
            record = self._format_record(titles, scores)
            if record is not None:
                self.data_queue.put(record)

    def start_work(self):
        """Fetch all pages concurrently, then print every record and the total."""
        # One worker thread per page.
        pool = Pool(len(self.url_list))
        try:
            pool.map(self.send_request, self.url_list)
        finally:
            # Always release the workers, even if a task raised.
            pool.close()
            # Block until every worker has finished before reading results.
            pool.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)
if __name__ == "__main__":
    # Build the crawler first so that only the crawl itself is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()
    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)
多线程
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree
import requests
import threading
import Queue
import time
class Douban(object):
    """Scrape the Douban Top 250 movie list with one thread per page.

    Each list page is fetched and parsed by its own thread, which pushes one
    "score<TAB>title" string per movie onto a shared queue. After all threads
    have finished, the main thread drains the queue, prints every record and
    finally prints the number of records.
    """

    # Seconds to wait on the server before giving up on a page. Without a
    # timeout a stalled connection would keep its thread (and therefore the
    # join in start_work) blocked indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per list page: start offsets 0, 25, ..., 225 (ten pages).
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
        # Queue that collects the records produced by the threads.
        self.data_queue = Queue.Queue()
        # Number of records printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its HTML to parse_page().

        A failed or timed-out request is reported and skipped instead of
        killing the thread with an unhandled exception.
        """
        print(url)
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as err:
            print("[WARN]: request failed for %s: %s" % (url, err))
            return
        # Pause after each download, presumably to go easy on the server.
        time.sleep(1)
        self.parse_page(response.content)

    @staticmethod
    def _format_record(titles, scores):
        """Build "score<TAB>title" from xpath results, or None if one is empty."""
        if not titles or not scores:
            return None
        return scores[0] + "\t" + titles[0]

    def parse_page(self, html):
        """Queue a "score<TAB>title" record for every movie found in *html*."""
        html_obj = etree.HTML(html)
        if html_obj is None:
            # NOTE(review): lxml can hand back None when the markup has no
            # root element; there is nothing to extract in that case.
            return
        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title (first <span> inside the header link).
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating.
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Entries missing either field are skipped; indexing [0] on an
            # empty result would raise IndexError and drop the rest of the page.
            record = self._format_record(titles, scores)
            if record is not None:
                self.data_queue.put(record)

    def start_work(self):
        """Fetch all pages in parallel threads, then print records and total."""
        thread_list = []
        for url in self.url_list:
            # Each page gets its own thread running send_request(url).
            thread = threading.Thread(target=self.send_request, args=(url,))
            thread.start()
            thread_list.append(thread)

        # Block the main thread until every worker has finished, so the
        # queue is complete before it is drained.
        for thread in thread_list:
            thread.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)
if __name__ == "__main__":
    # Build the crawler first so that only the crawl itself is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()
    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)
协程
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Apply gevent's monkey patches before any other module is imported.
# gevent documents that patching should happen as early as possible; the
# original order imported requests (and with it the networking modules)
# before patch_all() ran.
from gevent import monkey
monkey.patch_all()

from lxml import etree
import requests
import Queue
import time
import gevent
# gevent lets asynchronous programs be written with synchronous-looking code.
# monkey.patch_all() dynamically patches the networking libraries
# (socket, select) at run time and turns them into asynchronous ones,
# so that the program's network operations are carried out asynchronously.
class Douban(object):
    """Scrape the Douban Top 250 movie list with gevent coroutines.

    Each list page is fetched and parsed by its own greenlet, which pushes
    one "score<TAB>title" string per movie onto a shared queue. After all
    greenlets have finished, the queue is drained, every record is printed
    and finally the number of records is printed.
    """

    # Seconds to wait on the server before giving up on a page. Without a
    # timeout a stalled connection would keep its greenlet (and therefore
    # gevent.joinall) waiting indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per list page: start offsets 0, 25, ..., 225 (ten pages).
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
        # Queue that collects the records produced by the greenlets.
        self.data_queue = Queue.Queue()
        # Number of records printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its HTML to parse_page().

        A failed or timed-out request is reported and skipped instead of
        killing the greenlet with an unhandled exception.
        """
        print(url)
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as err:
            print("[WARN]: request failed for %s: %s" % (url, err))
            return
        # Pause after each download, presumably to go easy on the server.
        time.sleep(1)
        self.parse_page(response.content)

    @staticmethod
    def _format_record(titles, scores):
        """Build "score<TAB>title" from xpath results, or None if one is empty."""
        if not titles or not scores:
            return None
        return scores[0] + "\t" + titles[0]

    def parse_page(self, html):
        """Queue a "score<TAB>title" record for every movie found in *html*."""
        html_obj = etree.HTML(html)
        if html_obj is None:
            # NOTE(review): lxml can hand back None when the markup has no
            # root element; there is nothing to extract in that case.
            return
        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title (first <span> inside the header link).
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating.
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")
            # Entries missing either field are skipped; indexing [0] on an
            # empty result would raise IndexError and drop the rest of the page.
            record = self._format_record(titles, scores)
            if record is not None:
                self.data_queue.put(record)

    def start_work(self):
        """Fetch all pages in concurrent greenlets, then print records and total."""
        # Spawn one coroutine task per page and keep the handles so they
        # can all be waited on together.
        job_list = [gevent.spawn(self.send_request, url) for url in self.url_list]
        # Run the tasks and wait until every one of them has finished.
        gevent.joinall(job_list)

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)
if __name__ == "__main__":
    # Build the crawler first so that only the crawl itself is timed.
    crawler = Douban()
    began_at = time.time()
    crawler.start_work()
    elapsed = time.time() - began_at
    print("[INFO]: Useing %f secend" % elapsed)