Python Learning (3): Scraping Images

Use the requests library with multithreading to scrape high-resolution images.
The site works like this: the page sends an AJAX POST request to fetch JSON data, which is rendered into the page. From that JSON we can extract the URL of each image thread; loading each thread page yields the image URLs, and then we download the images.
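
Before the full script, here is that first AJAX step in isolation. The endpoint and form fields are the ones used in the script below; the response layout (body.sns.count, body.sns.list[*].resourceid) is an assumption inferred from how the script parses it, and the real site may additionally require the cookies and headers the full script sends.

import requests

post_data = {
    "method": "/community/getsnslistbysort",
    "communityid": "3773",
    "communitysubid": "0",
    "page": "1",
    "sorttype": "0",
}
# one AJAX POST returns one page (20 posts) of JSON
resp = requests.post("https://www.laosiji.com/proxy/api", data=post_data)
sns = resp.json()["body"]["sns"]
print(sns["count"])               # total number of posts in the community
for item in sns["list"]:
    print(item["resourceid"])     # id used to build each thread URL

The full script: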

# coding=utf-8
import requests
from lxml import etree
import os
import json
import threading
from queue import Queue

class Laoshiji(object):

    def __init__(self):
        self.headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Host": "www.laosiji.com",
            "Content-Length": "90",
            "Origin": "https://www.laosiji.com",
            "Referer": "https://www.laosiji.com/community/3773",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
        }
        self.url = "https://www.laosiji.com/proxy/api"
        self.page_count = 1
        self.Total_page_Count = 22
        self.str_page = "{}".format(self.page_count)
        self.post_data = {
                "method": "/community/getsnslistbysort",
                "communityid": "3773",
                "communitysubid": "0",
                "page": "1",
                "sorttype": "0"
            }
        self.file_name = "Pic"
        self.cookies= "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936; _gid=GA1.2.252215242.1557817656; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557021882,1557817656; JSESSIONID=E9B7FAF0EF6AE1BA7DD23C4C296BEFEE; tgw_l7_route=83a50c6e17958c25ad3462765ddb8a87; CNZZDATA1261736092=756492161-1557017437-%7C1557822828; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557826565"
        # split the raw Cookie header into a dict; strip whitespace and split on
        # the first "=" only, so values that contain "=" are kept intact
        self.cookies = {k.strip(): v for k, v in (i.split("=", 1) for i in self.cookies.split(";"))}
        # pipeline queues: JSON pages -> thread URLs -> thread HTML -> image URLs
        self.json_queue = Queue()
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.img_url_queue = Queue()


    def run(self):
        # fetch the first page synchronously to learn the total post count
        response = requests.post(
            self.url,
            headers=self.headers,
            cookies=self.cookies,
            data=self.post_data
        )
        tmp = json.loads(response.content.decode())["body"]["sns"]
        self.total_count = int(tmp["count"])
        str_list = tmp["list"]
        self.json_queue.put(str_list)
        for i in str_list:
            print(i["resourceid"])
        self.page_count += 1   # page 1 is done; the worker thread continues from page 2
        self.start_thread()
        print("main thread finished")

    def start_thread(self):
        t = []
        # one producer keeps fetching JSON pages; the rest are consumers
        t_json = threading.Thread(target=self.parse_json)
        t.append(t_json)
        for i in range(4):
            t_url = threading.Thread(target=self.parse_url)
            t.append(t_url)
        for i in range(10):
            t_send = threading.Thread(target=self.send_post)
            t.append(t_send)
        for i in range(10):
            t_img_url = threading.Thread(target=self.get_img_url)
            t.append(t_img_url)
        for i in range(10):
            t_get_img = threading.Thread(target=self.get_img)
            t.append(t_get_img)
        for th in t:
            th.daemon = True   # daemon threads exit with the main thread
            th.start()
        # block until every queue has been fully drained and acknowledged
        for q in [self.json_queue, self.url_queue, self.html_queue, self.img_url_queue]:
            q.join()

    def parse_json(self):
        # keep requesting pages (20 posts each) until every post has been fetched;
        # a loop instead of recursion avoids hitting the recursion limit
        while self.total_count > self.page_count * 20:
            self.post_data["page"] = str(self.page_count)
            print(self.post_data)
            response = requests.post(
                self.url,
                headers=self.headers,
                cookies=self.cookies,
                data=self.post_data
            )
            tmp = json.loads(response.content.decode())["body"]["sns"]
            self.total_count = int(tmp["count"])
            str_list = tmp["list"]
            self.page_count += 1
            self.json_queue.put(str_list)

    def parse_url(self):
        # turn each post's resourceid into a full thread URL
        while True:
            str_list = self.json_queue.get()
            for str_item in str_list:
                str_url = "https://www.laosiji.com/thread/" + str(str_item["resourceid"]) + ".html"
                self.url_queue.put(str_url)
            self.json_queue.task_done()

    def send_post(self):
        # fetch each thread page's HTML; send the browser headers so the
        # page is served normally
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_img_url(self):
        """Extract the image-page URLs from each thread's HTML via XPath."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            # note: "@href" must not contain a space
            img_url_list = html.xpath('//div[@class="main fl"]//a[@class="thread-images"]/@href')
            for i in img_url_list:
                self.img_url_queue.put(i)
            self.html_queue.task_done()

    def get_img(self):
        """Download each image URL and save it under the Pic directory."""
        while True:
            url = self.img_url_queue.get()
            # build a file name from the URL by stripping characters that are
            # illegal or awkward in file names
            str_name = str(url).replace("/", "").replace("http:", "") \
                .replace(".", "").replace("_", "").replace("?", "").replace("|", "")
            file_path = (self.file_name + "/{}.png").format(str_name)
            if not os.path.exists(self.file_name):
                os.mkdir(self.file_name)
            if os.path.exists(file_path):
                # already downloaded; skip it
                self.img_url_queue.task_done()
                continue
            response_img = requests.get(url)
            with open(file_path, "wb") as f:
                f.write(response_img.content)
                print("saved", file_path)
            self.img_url_queue.task_done()


if __name__ == "__main__":
    laoshiji_spider = Laoshiji()
    laoshiji_spider.run()
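
The threading pattern the script relies on, distilled: daemon worker threads loop over queue.get()/task_done(), and queue.join() in the main thread blocks until every queued item has been acknowledged. A minimal standalone sketch:

import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        item = q.get()
        print("processing", item)   # real work goes here
        q.task_done()               # acknowledge, so q.join() can return

for _ in range(4):
    t = threading.Thread(target=worker)
    t.daemon = True                 # daemon threads exit with the main thread
    t.start()

for i in range(10):
    q.put(i)
q.join()                            # returns once every item is task_done()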


Reposted from blog.csdn.net/m_cainiaokuaifei/article/details/92797323