Python爬虫小试牛刀:爬取pixiv图片

Python爬虫小试牛刀

1.分析pixiv.net的页面结构及xhr内容:

我们可以看到页面的图片不是原图,而是被压缩过的,并且发现html结构中的class name几乎都是随机无意义的字符串,很明显是为了反爬虫.继续分析
找到了discovery页面的json文件,里面给出了推荐的1000条作品id。可以直接用requests.get下载这个文件,再用json.loads解析出id,十分省事.
接下来打开作品详情页.
我们搜索作品的id,竟然在html的script中发现了神奇的东西,很明显这一段script就是用来生成随机class name的东西.

{"illustId":"61048675","illustTitle":"\u53e4\u306e\u9b54\u5c0e\u6a5f\u68b0","id":"61048675","title":"\u53e4\u306e\u9b54\u5c0e\u6a5f\u68b0","illustType":0,"xRestrict":0,"restrict":0,"sl":2,"url":"https:\/\/i.pximg.net\/c\/250x250_80_a2\/img-master\/img\/2017\/01\/22\/01\/02\/01\/61048675_p0_square1200.jpg","description":"","tags":["\u30d5\u30a1\u30f3\u30bf\u30b8\u30fc","\u98a8\u666f","\u826f\u4f5c\u306f\u3075\u3068\u3057\u305f\u3068\u3053\u308d\u306b","\u30aa\u30ea\u30b8\u30ca\u30eb5000users\u5165\u308a"],"userId":"455626","width":3000,"height":1500,"pageCount":3,"isBookmarkable":true,"bookmarkData":null},"57833748"

把它复制出来,搜索后发现未压缩的原图url就在"original"字段中: "original":"https:\/\/i.pximg.net\/img-original\/img\/2016\/07\/10\/12\/16\/57\/57833748_p0.jpg"
2.用正则表达式及split整理出图片信息:

# Pull the title, the raw original-image URL and the author id out of the
# page source held in `jsr`, in that order (a missing field raises
# AttributeError because re.search returns None).
title, img_url_temp, user_id = (
    re.search(pattern, jsr).group(1)
    for pattern in (
        '"illustTitle":"(.*?)"',
        '"original":"(http.*?)"',
        '"userId":"(.*?)"',
    )
)
# The URL is JSON-escaped ("https:\/\/..."); drop every backslash.
img_url = img_url_temp.replace('\\', '')

3.敲代码:

废话少说，直接上代码
调用requests时要加上lantern的本地代理地址，不然就会收到gfw送来的问候，抛出如下异常：requests.exceptions.SSLError: HTTPSConnectionPool(host='www.pixiv.net', port=443): Max retries exceeded with url: /member_illust.php?mode=medium&illust_id=64930973 (Caused by SSLError(SSLError("bad handshake: SysCallError(104, 'ECONNRESET')",),))

#coding:utf-8
import requests
import json
import re
import pymongo
import random
import time
import logging
import os


class GetPixiv(object):
    """Download original-size pixiv illustrations.

    The illustration ids come from the discovery recommender endpoint
    (``down_pic``) or from a user's profile (``fetch_all_pic``).  For each
    id the detail page is fetched, the title / author id / original image
    URL are extracted from the JSON embedded in the page, the image is
    written to disk and a record is stored in MongoDB
    (``spider.pixiv_illust_id``) so the id is skipped on later runs.

    Note: constructing an instance creates ``./img`` if needed and changes
    the process working directory to it; images are saved relative to it.
    """

    # The page embeds JSON, so the values are JSON string bodies.  The
    # ``(?:[^"\\]|\\.)*`` form keeps an escaped quote (\") inside the value
    # instead of stopping at it.
    _TITLE_RE = re.compile(r'"illustTitle":"((?:[^"\\]|\\.)*)"')
    _ORIGINAL_RE = re.compile(r'"original":"(http(?:[^"\\]|\\.)*)"')
    _USER_ID_RE = re.compile(r'"userId":"((?:[^"\\]|\\.)*)"')
    # Characters that are path separators or illegal in file names on
    # common file systems.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self):
        """Set up HTTP headers, proxy, MongoDB collection, logging and ./img."""
        self.header = {'accept': '*/*',
                       # Paste the cookies copied from a logged-in browser here.
                       'Cookie': '',
                       'accept-encoding': 'gzip, deflate, br',
                       'accept-language': 'zh-CN,zh;q=0.9',
                       'referer': 'https://www.pixiv.net/discovery',
                       'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36', }
        self.host = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id='
        self.url = 'https://www.pixiv.net/rpc/recommender.php?type=illust&sample_illusts=auto&num_recommendations=1000&page=discovery&mode=all'
        self.proxy = {'https': '127.0.0.1:45981'}
        client = pymongo.MongoClient(host='127.0.0.1')
        self.post = client['spider']['pixiv_illust_id']

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(level=logging.INFO)
        # getLogger returns the same logger object on every call, so only
        # attach handlers once; otherwise a second instance would make every
        # message appear twice.
        if not self.logger.handlers:
            file_handler = logging.FileHandler("pic_log.txt")
            file_handler.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)

            console = logging.StreamHandler()
            console.setLevel(logging.INFO)

            self.logger.addHandler(file_handler)
            self.logger.addHandler(console)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('./img', exist_ok=True)
        os.chdir('./img')
        print('init finished.')

    @staticmethod
    def _decode_json_string(raw):
        """Decode the body of a JSON string literal (``\\uXXXX``, ``\\/``, ...).

        Falls back to the raw text with ``\\/`` unescaped when *raw* is not a
        valid JSON string body.
        """
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw.replace('\\/', '/')

    @classmethod
    def _parse_illust_page(cls, page_source):
        """Extract ``title``, ``url`` and ``user_id`` from a detail page.

        Returns:
            dict with the three decoded string values.

        Raises:
            ValueError: if any of the fields is missing from *page_source*.
        """
        patterns = (
            ('title', cls._TITLE_RE),
            ('url', cls._ORIGINAL_RE),
            ('user_id', cls._USER_ID_RE),
        )
        fields = {}
        for key, pattern in patterns:
            match = pattern.search(page_source)
            if match is None:
                raise ValueError(f'{key!r} not found in page source')
            fields[key] = cls._decode_json_string(match.group(1))
        return fields

    @classmethod
    def _safe_filename(cls, name):
        """Replace characters that cannot appear in a file name with ``_``."""
        return cls._UNSAFE_FILENAME_RE.sub('_', name)

    def down_pic(self):
        """Download every recommended illustration that is not stored yet.

        Errors while fetching or parsing the recommendation list propagate
        to the caller; per-illustration errors are logged by ``down_detail``.
        """
        jss = self.random_proxy_down_content(self.url)
        print('download id list done.')
        ill_list = json.loads(jss)['recommendations']
        for ill in ill_list:
            # Skip ids already recorded.  find_one works on every PyMongo
            # release, unlike Cursor.count() which was removed in 4.0.
            if self.post.find_one({'pic_id': ill}) is not None:
                print(f'id {ill} already exists.')
                continue
            self.down_detail(ill)

    def down_detail(self, ill):
        """Fetch one illustration page, save its image and record it.

        Args:
            ill: illustration id (appended to ``self.host``).

        Any failure is logged with its traceback and swallowed so that one
        bad illustration does not stop a batch.
        """
        try:
            jsr = self.random_proxy_down_content(url=self.host + str(ill)).decode('utf-8')
            print('download page source done.')
            info = self._parse_illust_page(jsr)
            illust_info = {
                'pic_id': ill,
                'user_id': info['user_id'],
                'title': info['title'],
                'url': info['url'],
            }
            # Record the id only after the image is on disk; otherwise a
            # failed download would be treated as done and never retried.
            if not self.save_image(info['url'], info['title'], filepath='./'):
                return
            self.post.insert_one(illust_info)
            print(f"process pic url:{info['url']} done!")
        except Exception:
            self.logger.error('Fail.', exc_info=True)

    def save_image(self, img_url, title, filepath):
        """Download *img_url* and write it to ``filepath/<title><basename>``.

        Args:
            img_url: absolute URL of the image.
            title: illustration title (``str``; ``bytes`` are decoded as UTF-8).
            filepath: target directory, created if missing.

        Returns:
            True when the file was written, False when an error was logged.
        """
        try:
            if isinstance(title, bytes):
                title = title.decode('utf-8', errors='replace')
            afterfix = img_url.split('/')[-1]
            filename = self._safe_filename(title + afterfix)

            os.makedirs(filepath, exist_ok=True)

            # Download before opening the file so a failed request does not
            # leave an empty file behind.
            img_content = self.random_proxy_down_content(img_url)
            with open(os.path.join(filepath, filename), 'wb') as f:
                f.write(img_content)
            print(f'save image done,current path is {os.getcwd()}')
            return True
        except Exception:
            self.logger.error('save image error.', exc_info=True)
            return False

    def random_proxy_down_content(self, url, timeout=30):
        """GET *url* through the configured proxy after a random 1-4 s pause.

        Args:
            url: address to fetch.
            timeout: seconds passed to ``requests.get`` so a stalled
                connection cannot hang the crawl forever.

        Returns:
            The response body as ``bytes``.

        Raises:
            requests.exceptions.RequestException: on connection errors,
                timeouts, or a 4xx/5xx status.
        """
        # Throttle requests to reduce the chance of being blocked.
        time.sleep(random.randint(1, 4))
        response = requests.get(url, headers=self.header, proxies=self.proxy,
                                timeout=timeout)
        # Without this an error page would be returned (and saved) as if it
        # were the requested content.
        response.raise_for_status()
        print('request done.')
        return response.content

    def fetch_all_pic(self, user_id):
        """Download every illustration listed in a user's profile.

        Args:
            user_id: pixiv user id used to build the profile ajax URL.
        """
        json_url = f'https://www.pixiv.net/ajax/user/{user_id}/profile/all'
        json_content = self.random_proxy_down_content(json_url).decode()
        pic_list = json.loads(json_content)['body']['illusts']
        print('fetch pic list done.')
        for pic in pic_list:
            self.down_detail(pic)
            
# Script entry: build the downloader (this connects to MongoDB, sets up
# logging and changes the working directory to ./img), then download every
# recommended illustration that has not been recorded yet.
handler = GetPixiv()
handler.down_pic()
print('All done!')

本人比较懒，没有写详细的注释，不过都是基础的知识。。。
由于访问被墙网站本身就必须走本地代理，没有什么好办法再通过更换代理来规避反爬虫，折衷的做法就是设置合理的延时，不然被反爬虫封禁是迟早的。
现在的一个问题是：用'wb'模式打开image文件时，写入的文件名是未解码的unicode转义序列，如\u8840\u5c0f\u677f，
待解决。