Python Learning (3): Crawling Images

Use the requests library, together with multi-threading, to grab high-definition images.
The site first sends an AJAX request to fetch JSON data, which it then renders into the page. From that JSON you can extract the URL of each image thread; requesting each thread page in turn yields the image URLs, which can then be downloaded.
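The script below wires these steps into a producer-consumer pipeline: daemon worker threads pass work through Queue objects, and the main thread blocks on join() until every queue is drained. A minimal, site-independent sketch of that pattern (the names here are illustrative, not from the crawler):

import threading
from queue import Queue

work = Queue()

def worker():
    while True:
        item = work.get()        # blocks until an item is available
        print("processed", item)
        work.task_done()         # pair every get() with one task_done()

t = threading.Thread(target=worker)
t.daemon = True                  # daemon threads exit with the main thread
t.start()

for i in range(5):
    work.put(i)
work.join()                      # returns once every item is marked done

The full crawler chains four such queues together.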

# coding=utf-8
import requests
from lxml import etree
import os
import json
import threading
from queue import Queue

class LaoshijiSpider(object):

    def __init__(self):
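        # Request headers captured from the browser's developer tools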
        self.headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Host": "www.laosiji.com",
            "Content-Length": "90",
            "Origin": "https://www.laosiji.com",
            "Referer": "https://www.laosiji.com/community/3773",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
        }
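        # AJAX proxy endpoint that serves the community listing JSON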
        self.url = "https://www.laosiji.com/proxy/api"
        self.page_count = 1    # next page of the listing to request
        self.total_count = 22  # total number of items, updated from each response
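        # Form body of the listing request; "page" is rewritten as we paginate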
        self.post_data = {
                "method": "/community/getsnslistbysort",
                "communityid": "3773",
                "communitysubid": "0",
                "page": "1",
                "sorttype": "0"
            }
        self.file_name = "Pic"  # output directory for downloaded images
        self.cookies= "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936; _gid=GA1.2.252215242.1557817656; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557021882,1557817656; JSESSIONID=E9B7FAF0EF6AE1BA7DD23C4C296BEFEE; tgw_l7_route=83a50c6e17958c25ad3462765ddb8a87; CNZZDATA1261736092=756492161-1557017437-%7C1557822828; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557826565"
        # Turn the raw Cookie header into a dict; split("=", 1) keeps values that contain "="
        self.cookies = {i.strip().split("=", 1)[0]: i.strip().split("=", 1)[1] for i in self.cookies.split(";")}
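        # Four queues chain the pipeline: JSON pages -> thread URLs -> HTML -> image URLs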
        self.json_queue=Queue()
        self.url_queue=Queue()
        self.html_queue=Queue()
        self.img_url_queue=Queue()


    def run(self):
        # Fetch page 1 synchronously to learn the total item count
        response = requests.post(
            self.url,
            headers=self.headers,
            cookies=self.cookies,
            data=self.post_data
        )
        tmp = json.loads(response.content.decode())["body"]["sns"]
        self.total_count = int(tmp["count"])
        str_list = tmp["list"]
        self.page_count += 1  # page 1 is done; the worker continues from page 2
        self.json_queue.put(str_list)
        for i in str_list:
            print(i["resourceid"])
        self.start_thread()
        print("Main thread finished")

    def start_thread(self):
        t = []
        # A single thread paginates the JSON API; the others consume the queues
        t_json = threading.Thread(target=self.parse_json)
        t.append(t_json)
        for i in range(4):
            t_url = threading.Thread(target=self.parse_url)
            t.append(t_url)
        for i in range(10):
            t_fetch = threading.Thread(target=self.fetch_html)
            t.append(t_fetch)
        for i in range(10):
            t_img_url = threading.Thread(target=self.get_img_url)
            t.append(t_img_url)
        for i in range(10):
            t_get_img = threading.Thread(target=self.get_img)
            t.append(t_get_img)
        for th in t:
            th.daemon = True  # daemon threads are killed once the main thread exits
            th.start()
        # Block until every queue has been fully processed
        for q in [self.json_queue, self.url_queue, self.html_queue, self.img_url_queue]:
            q.join()

    def parse_json(self):
        # Request the remaining listing pages (20 items per page) until the
        # reported total has been covered; page 1 was already fetched in run()
        while self.total_count > (self.page_count - 1) * 20:
            self.post_data["page"] = str(self.page_count)
            print(self.post_data)
            response = requests.post(
                self.url,
                headers=self.headers,
                cookies=self.cookies,
                data=self.post_data
            )
            tmp = json.loads(response.content.decode())["body"]["sns"]
            self.total_count = int(tmp["count"])
            str_list = tmp["list"]
            self.page_count += 1
            self.json_queue.put(str_list)

    def parse_url(self):
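        # Turn every resourceid in a JSON page into its thread-page URL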
        while True:
            str_list = self.json_queue.get()
            for str_item in str_list:
                str_url = str_item["resourceid"]
                str_url = "https://www.laosiji.com/thread/" + str(str_url) + ".html"
                self.url_queue.put(str_url)
            self.json_queue.task_done()

    def fetch_html(self):
        # Download each thread page's HTML (a plain GET, unlike the JSON API)
        while True:
            url = self.url_queue.get()
            response = requests.get(url)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_img_url(self):
        """
        Extract the image page URLs from each thread's HTML via XPath.
        """
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            img_url_list = html.xpath('//div[@class="main fl"]//a[@class="thread-images"]/@href')
            for i in img_url_list:
                self.img_url_queue.put(i)
            self.html_queue.task_done()

    def get_img(self):
        """
        Download an image from its URL and save it to disk.
        """
        while True:
            url = self.img_url_queue.get()
            # Build a filesystem-safe file name out of the URL
            str_name = str(url).replace("/", "").replace("http:", "")\
                .replace(".", "").replace("_", "").replace("?", "").replace("|", "")
            if not os.path.exists(self.file_name):
                os.mkdir(self.file_name)
            file_path = (self.file_name + "/{}.png").format(str_name)
            if not os.path.exists(file_path):  # skip images that were already downloaded
                response_img = requests.get(url)
                with open(file_path, "wb") as f:
                    f.write(response_img.content)
                    print("Saved successfully")
            self.img_url_queue.task_done()


if __name__ == "__main__":
    laoshiji_spider = LaoshijiSpider()
    laoshiji_spider.run()
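
Note that the headers and cookies above were captured from a logged-in browser session on laosiji.com; the JSESSIONID and other session tokens will expire, so you would need to replace them with fresh values from your own browser's developer tools before running the script. The downloaded images land in the Pic directory next to the script.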

Origin: blog.csdn.net/m_cainiaokuaifei/article/details/92797323