- Environment: Windows 10, Python 3.7, requests, lxml, multiprocessing
- Multiprocessing: the downloads run through a process pool.
- The code works in three steps (a minimal sketch of the pattern follows this list, then the full code):
  1. Send the request and get the response with requests.get.
  2. Parse the returned content with XPath.
  3. Download the resources through a process pool.
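Before the full class, here is a minimal, self-contained sketch of the fetch → parse → pool-download pattern. The URL, the XPath, and the helper names (`fetch`, `extract_img_urls`, `download`) are illustrative placeholders, not the real target site; it assumes the page exposes absolute image URLs:

```python
from multiprocessing import Pool

import requests
from lxml import etree


def fetch(url):
    """Step 1: send the request and return the parsed HTML tree."""
    resp = requests.get(url, timeout=10)
    return etree.HTML(resp.text)


def extract_img_urls(htm):
    """Step 2: pull image URLs out of the page with XPath."""
    return htm.xpath("//img/@src")


def download(url):
    """Step 3: worker function run inside the process pool."""
    data = requests.get(url, timeout=10).content
    with open(url.split("/")[-1], "wb") as f:
        f.write(data)


if __name__ == "__main__":
    htm = fetch("https://example.com/gallery")  # placeholder URL
    with Pool(4) as pool:
        pool.map(download, extract_img_urls(htm))
```

The full crawler below follows the same three steps, organized as a class: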
```python
import os
from multiprocessing import Manager, Pool

import requests
from lxml import etree


class SpiderGirls(object):
    url = ""  # base URL of the target site (left blank in the original)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    def get_total_page(self):
        """Build the list of list-page URLs (pages 1 through 237)."""
        base_url = SpiderGirls.url + "/{}/"  # the original referenced an undefined `config`
        page_url = [base_url.format(i) for i in range(2, 238)]
        page_url.insert(0, SpiderGirls.url)  # page 1 has no page number in its URL
        return page_url
    def get_parse(self, url, headers):
        """Send the request and return the response plus the parsed HTML tree."""
        resp = requests.get(url, headers=headers)
        htm = etree.HTML(resp.text)
        return resp, htm
    def get_pic_list(self, htm):
        """Extract {album name: album URL} pairs from a list page."""
        pic_list = htm.xpath("//ul[@id='pins']/li")
        item_info = [{i.xpath('./span/a/text()')[0].strip(): i.xpath('./a/@href')[0]}
                     for i in pic_list]
        return item_info
    def get_personal_url(self, url, headers, name, q):
        """Walk every page of one album and collect its image URLs."""
        item_img = {name: []}
        headers["referer"] = SpiderGirls.url  # the original referenced an undefined `config`
        while True:
            resp, htm = self.get_parse(url, headers)
            headers["referer"] = url  # the site checks the referer header
            img_url = htm.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
            item_img[name].append(img_url)
            next_page = htm.xpath("//span[text()='下一页»']/../@href")
            if not next_page:
                break
            url = next_page[0]
        q.put(item_img)  # hand the finished album back to the parent process
    def save_img(self, url):
        """Download one image and write it into self.path."""
        print(url)
        path = os.path.join(self.path, url.split("/")[-1])
        headers = SpiderGirls.headers.copy()  # copy so the shared class-level dict stays untouched
        headers["referer"] = url
        resp, _ = self.get_parse(url, headers)
        with open(path, "wb") as f:
            f.write(resp.content)
    def get_path(self, name):
        """Create (if needed) and return the save directory for one album."""
        desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
        save_dir = os.path.join(desktop, "Girls", name)  # avoid shadowing the builtin `dir`
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        return save_dir
    def run(self):
        page_url = self.get_total_page()
        for url in page_url[:1]:  # only the first list page for now
            headers = SpiderGirls.headers.copy()
            _, htm = self.get_parse(url, headers)
            item_info = self.get_pic_list(htm)
            po = Pool(4)
            q = Manager().Queue()  # a plain multiprocessing.Queue cannot be passed to Pool workers
            print(item_info)
            for item in item_info[12:]:  # skip the first 12 entries on the page
                (name, url), = item.items()
                po.apply_async(self.get_personal_url, (url, headers, name, q))
                item_img = q.get()  # blocks until the worker finishes this album
                self.path = self.get_path(name)
                print(self.path)
                p = Pool()
                p.map(self.save_img, item_img[name])
                p.close()
                p.join()
            po.close()
            po.join()
if __name__ == '__main__':
    SpiderGirls().run()
```
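One design note on the queue: Pool workers receive their arguments by pickling, and a plain multiprocessing.Queue refuses to be pickled that way (it raises "Queue objects should only be shared between processes through inheritance"), which is why the code uses Manager().Queue() instead. A minimal sketch of the same hand-back pattern, with a hypothetical `worker` function:

```python
from multiprocessing import Manager, Pool


def worker(n, q):
    q.put(n * n)  # hand the result back through the managed queue


if __name__ == "__main__":
    q = Manager().Queue()  # a proxy object, so it pickles into pool workers
    pool = Pool(2)
    for n in range(4):
        pool.apply_async(worker, (n, q))
    pool.close()
    pool.join()
    while not q.empty():
        print(q.get())
```

Also note that in run() the blocking q.get() sits inside the submit loop, so albums are effectively processed one at a time; submitting all tasks first and collecting results afterwards would let the four workers actually run concurrently.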