Scraping high-definition images with the requests library and multithreading
The site's overall flow: the page first sends an AJAX request to fetch JSON data and renders it; from that JSON we can extract the URL of each image thread, load each thread page to collect the image URLs, and then download the images.
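Before the full spider, here is a minimal sketch of just the first step of that flow, using the same endpoint and JSON fields as the script below (a real request will likely also need the cookies and browser headers the full script sends):

import requests

API_URL = "https://www.laosiji.com/proxy/api"
post_data = {
    "method": "/community/getsnslistbysort",
    "communityid": "3773",
    "communitysubid": "0",
    "page": "1",
    "sorttype": "0",
}
# POST the AJAX form data and read the JSON body.
sns = requests.post(API_URL, data=post_data).json()["body"]["sns"]
print("total items:", sns["count"])
for item in sns["list"]:
    # Each item's resourceid maps to a thread page that holds the images.
    print("https://www.laosiji.com/thread/{}.html".format(item["resourceid"]))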
# coding=utf-8
import requests
from lxml import etree
import os
import json
import threading
from queue import Queue
class laoshiji(object):
    def __init__(self):
        self.headers = {
            "Accept": "*/*",
            # Advertise only encodings requests can always decode; "br" may
            # return brotli bodies the stock library cannot decompress.
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Host": "www.laosiji.com",
            # Content-Length is computed by requests itself; hardcoding "90"
            # would break any request whose form body has a different length.
            "Origin": "https://www.laosiji.com",
            "Referer": "https://www.laosiji.com/community/3773",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
        }
        self.url = "https://www.laosiji.com/proxy/api"
        self.page_count = 1
        self.Total_page_Count = 22
        self.post_data = {
            "method": "/community/getsnslistbysort",
            "communityid": "3773",
            "communitysubid": "0",
            "page": "1",
            "sorttype": "0"
        }
        self.file_name = "Pic"
        self.cookies = "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936; _gid=GA1.2.252215242.1557817656; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557021882,1557817656; JSESSIONID=E9B7FAF0EF6AE1BA7DD23C4C296BEFEE; tgw_l7_route=83a50c6e17958c25ad3462765ddb8a87; CNZZDATA1261736092=756492161-1557017437-%7C1557822828; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557826565"
        # Split only on the first "=" so cookie values that contain "=" survive;
        # strip() removes the space left after each ";".
        self.cookies = {i.split("=", 1)[0].strip(): i.split("=", 1)[1] for i in self.cookies.split(";")}
        # One queue per pipeline stage:
        self.json_queue = Queue()     # JSON item lists from the AJAX API
        self.url_queue = Queue()      # thread page URLs
        self.html_queue = Queue()     # downloaded thread page HTML
        self.img_url_queue = Queue()  # image URLs waiting to be saved
    def run(self):
        # Fetch page 1 synchronously to learn the total item count, then
        # leave the remaining pages to the worker threads.
        response = requests.post(
            self.url,
            headers=self.headers,
            cookies=self.cookies,
            data=self.post_data
        )
        tmp = json.loads(response.content.decode())["body"]["sns"]
        self.Total_page_Count = int(tmp["count"])
        self.json_queue.put(tmp["list"])
        # Increment once only; the original incremented twice here, which
        # made the workers skip page 2.
        self.page_count += 1
        self.start_thread()
        print("Main thread finished")
    def start_thread(self):
        threads = []
        # One thread paginates the JSON API; each later stage gets its own pool.
        threads.append(threading.Thread(target=self.parse_json))
        for i in range(4):
            threads.append(threading.Thread(target=self.parse_url))
        for i in range(10):
            threads.append(threading.Thread(target=self.send_post))
        for i in range(10):
            threads.append(threading.Thread(target=self.get_img_url))
        for i in range(10):
            threads.append(threading.Thread(target=self.get_img))
        for th in threads:
            th.daemon = True  # setDaemon() is deprecated; assign the attribute instead
            th.start()
        # Wait until every queued task in every stage has been marked done;
        # the daemon workers are then discarded when the main thread exits.
        for q in [self.json_queue, self.url_queue, self.html_queue, self.img_url_queue]:
            q.join()
    def parse_json(self):
        # The original recursed into itself once per page, which would hit the
        # interpreter's recursion limit on large communities; a loop does the
        # same job. Each API page holds 20 items.
        while self.Total_page_Count > self.page_count * 20:
            self.post_data["page"] = str(self.page_count)
            print(self.post_data)
            response = requests.post(
                self.url,
                headers=self.headers,
                cookies=self.cookies,
                data=self.post_data
            )
            tmp = json.loads(response.content.decode())["body"]["sns"]
            self.Total_page_Count = int(tmp["count"])
            self.json_queue.put(tmp["list"])
            self.page_count += 1
    def parse_url(self):
        # Turn every JSON item into the URL of its thread page.
        while True:
            str_list = self.json_queue.get()
            for str_item in str_list:
                str_url = "https://www.laosiji.com/thread/{}.html".format(str_item["resourceid"])
                self.url_queue.put(str_url)
            self.json_queue.task_done()
    def send_post(self):
        # Download each thread page (a plain GET, despite the method name).
        while True:
            url = self.url_queue.get()
            response = requests.get(url)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
    def get_img_url(self):
        """Extract the image URLs from a thread page with XPath."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            # The original had a stray space in "@ href"; "@href" is the
            # intended attribute test.
            img_url_list = html.xpath('//div[@class="main fl"]//a[@class="thread-images"]/@href')
            for i in img_url_list:
                self.img_url_queue.put(i)
            self.html_queue.task_done()
    def get_img(self):
        """Download an image and save it under the Pic directory."""
        while True:
            url = self.img_url_queue.get()
            # Build a filesystem-safe file name from the URL.
            str_name = str(url).replace("/", "").replace("http:", "") \
                .replace(".", "").replace("_", "").replace("?", "").replace("|", "")
            # exist_ok guards against two threads creating the directory at once.
            os.makedirs(self.file_name, exist_ok=True)
            file_path = "{}/{}.png".format(self.file_name, str_name)
            # os._exists() is not a public API; use os.path.exists() and skip
            # images already on disk instead of appending to them ("wb", not "ab").
            if not os.path.exists(file_path):
                response_img = requests.get(url)
                with open(file_path, "wb") as f:
                    f.write(response_img.content)
                print("Saved", file_path)
            self.img_url_queue.task_done()
if __name__ == "__main__":
laoshiji_spider = laoshiji()
laoshiji_spider.run()
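Design notes: each stage talks to the next only through its queue, so the thread counts (1 JSON fetcher, 4 URL builders, and 10 workers for each remaining stage) can be tuned independently. Because all workers are daemon threads, run() only has to wait on the four queue.join() calls; the process exits once every task_done() has been recorded. One caveat of this design: json_queue can be fully drained while parse_json is still between page requests, so the joins may return before the last pages are fetched. A sentinel value per stage, or an explicit join on the parse_json thread, would close that gap.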