Python3 爬虫实例 妹子图

  • 环境:win10 python 3.7 requests lxml multiprocessing
  • 多进程
  • 使用进程池下载
  • 代码如下:
    -1 第一步:发送请求,获取响应,使用 requests.get 方法。
    -2 第二步:使用 xpath 对获取的内容进行解析。
    -3 第三步:使用进程池下载资源。
import os
from multiprocessing import Pool
import requests
from lxml import etree
from multiprocessing import Manager


class SpiderGirls(object):
    """Crawler that downloads gallery images from the target site.

    Workflow: fetch each listing page, parse the gallery links with XPath,
    walk every gallery's pagination to collect image URLs, then download
    the images with a process pool.
    """

    # Base URL of the crawled site.  The original snippet referenced an
    # undefined ``config`` module; this class attribute is used instead.
    url = "https://www.mzitu.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    def get_total_page(self):
        """Return the list of listing-page URLs: page 1, then /2/ .. /237/."""
        base_url = SpiderGirls.url + "/{}/"
        page_url = [base_url.format(i) for i in range(2, 238)]
        page_url.insert(0, SpiderGirls.url)
        return page_url

    def get_parse(self, url, headers):
        """Send a GET request and return ``(response, parsed lxml tree)``."""
        resp = requests.get(url, headers=headers)
        htm = etree.HTML(resp.text)
        return resp, htm

    def get_pic_list(self, htm):
        """Extract ``{gallery title: gallery URL}`` dicts from a listing page."""
        pic_list = htm.xpath("//ul[@id='pins']/li")
        item_info = [{i.xpath('./span/a/text()')[0].strip(): i.xpath('./a/@href')[0]}
                     for i in pic_list]
        return item_info

    def get_personal_url(self, url, headers, name, q):
        """Walk one gallery's pagination, collect its image URLs, put on ``q``.

        The Referer header is set per page — presumably required by the
        site's hotlink protection (TODO confirm).
        """
        item_img = {name: []}
        # Copy before mutating: the caller may pass the shared class-level
        # headers dict, and workers must not clobber each other's Referer.
        headers = dict(headers)
        headers["referer"] = SpiderGirls.url
        while True:
            resp, htm = self.get_parse(url, headers)
            headers["referer"] = url
            img_url = htm.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
            item_img[name].append(img_url)
            # Evaluate the "next page" XPath once instead of twice.
            next_page = htm.xpath("//span[text()='下一页»']/../@href")
            if not next_page:
                break
            url = next_page[0]
        q.put(item_img)

    def save_img(self, url):
        """Download one image into ``self.path`` (set by ``run``)."""
        print(url)
        path = os.path.join(self.path, url.split("/")[-1])
        # Copy so we never mutate the shared class attribute in place.
        headers = dict(SpiderGirls.headers)
        headers["referer"] = url
        resp, _ = self.get_parse(url, headers)
        with open(path, "wb") as f:
            f.write(resp.content)

    def get_path(self, name):
        """Return (creating it if missing) the download directory for a gallery."""
        desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
        # os.path.join builds a portable path instead of hand-gluing "//".
        directory = os.path.join(desktop, "Girls", name)
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory

    def run(self):
        """Crawl the listing pages and download every gallery found."""
        page_url = self.get_total_page()
        for url in page_url[:1]:  # only the first listing page, as in the original
            headers = dict(SpiderGirls.headers)
            _, htm = self.get_parse(url, headers)
            item_info = self.get_pic_list(htm)
            po = Pool(4)
            q = Manager().Queue()
            print(item_info)
            # Skips the first 12 entries — presumably pinned/ad items; confirm.
            for item in item_info[12:]:
                (name, url), = item.items()
                po.apply_async(self.get_personal_url, (url, headers, name, q))
                # NOTE(review): q.get() blocks until the async task finishes,
                # so gallery scraping is effectively serial despite the pool.
                item_img = q.get()
                self.path = self.get_path(name)  # consumed by save_img workers
                print(self.path)
                p = Pool()  # download this gallery's images in parallel
                p.map(self.save_img, item_img[name])

if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = SpiderGirls()
    spider.run()
发布了127 篇原创文章 · 获赞 25 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/weixin_44224529/article/details/103551723