A simple process-pool-style spider program

# http://www.doutula.com/article/list/?page=1   first page
# http://www.doutula.com/article/list/?page=2   pagination pattern
# <a href="http://www.doutula.com/article/detail/1535518" class="list-group-item random_list">  # link to an emoji pack on the list page
# <img src="http://ww1.sinaimg.cn/large/9150e4e5ly1flj1mvb3hcj20c809cjrm.jpg" alt="猫爪" onerror="this.src='http://img.doutula.com/production/uploads/image/2017/11/15/20171115749279_RPFWHq.jpg'">  # image link inside a pack page

# To match across multiple lines, compile with the DOTALL flag: reg = re.compile(reg, re.S)
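# A minimal demo of what re.S changes (the HTML snippet below is made up for
# illustration; it mimics the multi-line <img> tags on the site):
import re
demo = '<img src="a.jpg" alt="cat" onerror="x\ny">'
print(re.findall(r'<img src="(.*?)" alt="(.*?)" .*?>', demo))        # [] -- '.' stops at '\n'
print(re.findall(r'<img src="(.*?)" alt="(.*?)" .*?>', demo, re.S))  # [('a.jpg', 'cat')]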

# Required Python packages
import requests
import time
import gevent   # gevent/monkey are needed only by the commented-out coroutine variant below
import re
import os
from gevent import monkey



# monkey.patch_all()
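# Why monkey.patch_all() matters for the gevent variant at the bottom: it
# replaces blocking socket calls so that requests.get() yields to other
# greenlets instead of blocking the whole process. A minimal sketch
# (illustrative only; example.com is a placeholder URL):
#
# from gevent import monkey
# monkey.patch_all()   # must run before requests/socket are first used
# import gevent, requests
# jobs = [gevent.spawn(requests.get, 'http://example.com') for _ in range(3)]
# gevent.joinall(jobs)   # the three downloads now overlap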

# 1. Page numbers run from 1 to 531; build each list-page URL with a for loop.
# 2. Open the URL, then use re.findall to grab that page's emoji-pack links and store them in package_list.
def get_pic_url(page):
    # One folder per list page; ignore the error if it already exists.
    try:
        os.mkdir(str(page))
    except OSError:
        pass
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'}
    url = 'http://www.doutula.com/article/list/?page={}'.format(page)
    res = requests.get(url, headers=head)
    # [^#] skips the bare href="#" navigation links.
    tmp = re.findall(r'<a href="([^#].*?)" class=".*?">', res.text)
    print(tmp)
    package_list.extend(tmp)
    # 3. Loop over package_list; each entry is a detail-page URL.
    # 4. Open that URL, grab its jpg/gif links, and save them into the local folder.
    for pic_url in package_list:
        # Crude filter: real detail URLs all have the same length; stop at the first link that doesn't.
        if len(pic_url) != len('http://www.doutula.com/article/detail/1070805'):
            break
        res_pic = requests.get(pic_url, headers=head)
        reg = re.compile(r'<img src="(.*?)" alt="(.*?)" .*?>', re.S)  # re.S: '.' also matches '\n'
        tmp = re.findall(reg, res_pic.text)
        print(tmp)
        for i in tmp:
            num = tmp.index(i)
            picture_res = requests.get(i[0], headers=head)
            if not picture_res:   # a falsy Response means an HTTP error status
                break
            # Drop characters that are illegal in Windows file names.
            tmp_str = ''.join(c for c in i[1] if c not in '\\/*?"|><')
            ext = 'jpg' if i[0][-3:] == 'jpg' else 'gif'
            with open(r'./{}/{}-{}{}.{}'.format(page, package_list.index(pic_url), num, tmp_str, ext), 'wb') as f:
                f.write(picture_res.content)
            time.sleep(1)   # be polite: at most one image per second
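# A compact alternative to the character-by-character file-name cleanup above,
# as a minimal sketch (clean_name is a hypothetical helper; same character set,
# re is already imported):
def clean_name(name):
    """Strip characters that Windows forbids in file names."""
    return re.sub(r'[\\/*?"|><]', '', name)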
package_list = []

def f1():   # pages 1-531
    for page in range(1, 532):
        package_list.clear()   # keep only the current page's pack links
        get_pic_url(page)

f1()
# Coroutine variant: split the page range between two greenlets. For the
# downloads to actually overlap, monkey.patch_all() above must be uncommented,
# f1 would loop over range(1, 266), and the shared package_list would need care.
#
# def f2():   # pages 266-531
#     for page in range(266, 532):
#         get_pic_url(page)
#
# g1 = gevent.spawn(f1)
# g2 = gevent.spawn(f2)
# g1.join()
# g2.join()
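The title promises a process-pool version; here is a minimal sketch of one built on multiprocessing.Pool (assumptions: the unconditional f1() call above is removed or moved under the same __main__ guard, and get_pic_url clears package_list itself at the start of each call, since worker processes do not share the module-level list):

import multiprocessing

if __name__ == '__main__':
    with multiprocessing.Pool(4) as pool:       # 4 worker processes, one page per task
        pool.map(get_pic_url, range(1, 532))    # same page range as f1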


Reprinted from www.cnblogs.com/guducp/p/9026682.html