Python多线程爬虫学习

此代码包含以下学习内容：

程序封装
网页获取与解析
retry 装饰器与 threadpool 线程池（多线程）

Anime code

# -*- coding: utf-8 -*-
import requests
import demjson
from retry import retry
from config import *


class DongMan:
    """Interactive scraper that downloads Baidu image-search thumbnails.

    The keywords and the number of result pages are read from stdin when the
    object is created; :meth:`index` then walks every (keyword, page) pair and
    hands the returned result list to :meth:`details`, which downloads each
    thumbnail and stores it through ``img_save``.
    """

    # Headers sent with both the search request and the image requests.
    HEADERS = {
        'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1539824775271_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&word=%E6%89%8B',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
    }

    # Search endpoint; the placeholders are (queryWord, word, pn).
    SEARCH_URL = 'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%s&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&pn=%s&rn=30&gsm=10e&1539825490240='

    # Seconds to wait for any single HTTP request before giving up.
    TIMEOUT = 10

    def __init__(self):
        """
        Ask the user for the search keywords and the number of pages.

        Keywords are comma separated; surrounding whitespace is stripped and
        empty entries are dropped. The page count is kept as the raw input
        string and converted to ``int`` in :meth:`index`.
        """
        raw_keywords = input('input content you wanna scratch, use \\, to separate  >')
        self.dongman_list = [kw.strip() for kw in raw_keywords.split(',') if kw.strip()]
        self.page_list = input('input pages:>')

    @retry(tries=3)
    def index(self):
        """
        Query the search endpoint for every keyword and page, then download.

        A failing attempt is retried (three tries in total); note that a retry
        restarts the crawl from the first keyword.

        :raises ValueError: if the page count entered by the user is not an integer
        """
        page_count = int(self.page_list)
        for keyword in self.dongman_list:
            # Remember which keyword is currently being crawled.
            self.j = keyword
            for page in range(1, page_count + 1):
                # NOTE(review): pn starts at 30, so results 0-29 are never
                # requested - confirm whether that offset is intended.
                url = self.SEARCH_URL % (keyword, keyword, 30 * page)
                response = requests.get(url=url, headers=self.HEADERS, timeout=self.TIMEOUT)
                # NOTE(review): re-reading the UTF-8 bytes as latin-1 garbles
                # non-ASCII text; presumably a workaround so demjson accepts
                # the payload - confirm. Only 'data' is read from the result.
                text = response.text.encode('utf-8').decode('latin-1')
                payload = demjson.decode(text)    # decode the json format
                self.details(payload['data'])

    @retry(tries=3)
    def details(self, all_img):
        """
        Download and save the thumbnail of every search result.

        Entries without a ``thumbURL`` are skipped. A thumbnail whose request
        fails is reported and skipped so one bad link does not stop the batch.

        :param all_img: list of result dicts taken from the search response
        :return: None
        """
        for entry in all_img:
            thumb_url = entry.get('thumbURL')
            if not thumb_url:
                continue
            print(thumb_url)
            try:
                response = requests.get(url=thumb_url, headers=self.HEADERS, timeout=self.TIMEOUT)
            except requests.RequestException as e:
                # Without this `continue` the code below would use an unbound
                # (or stale, from the previous iteration) response object.
                print(e)
                continue
            img_save(response.content)   # raw image bytes


if __name__ == '__main__':
    # Run the crawl directly. The previous form, use_threadpool(dong_man.index()),
    # evaluated index() first and handed its return value (None) to the pool,
    # so the download was sequential anyway. The surrounding
    # `except TypeError` printed 'FINISH!!!' only when something had failed,
    # hiding the real error.
    dong_man = DongMan()
    dong_man.index()
    print('FINISH!!!')

config code（保存为 config.py，供上面的 `from config import *` 导入）

# -*- coding: utf-8 -*-
import sys
import random
import threadpool
import pytesseract
from PIL import Image
from io import BytesIO


def use_threadpool(method, url_all=None, num_workers=5):
    """
    Run ``method`` once per item of ``url_all`` on a thread pool.

    One work request is created per item (e.g. 3000 URLs give 3000 requests).
    The worker threads keep taking the next queued request as soon as they
    finish the current one, until the queue is empty.

    :param method: callable invoked with a single item of ``url_all``
    :param url_all: iterable of arguments, one per task; ``None`` means no work
    :param num_workers: number of worker threads (default 5)
    :return: None
    :raises TypeError: if there is work to do but ``method`` is not callable
    """
    tasks = list(url_all) if url_all is not None else []
    print(len(tasks))
    if not tasks:
        # Nothing to schedule: skip creating worker threads altogether.
        return
    if not callable(method):
        raise TypeError('method must be callable, got %r' % (method,))

    task_pool = threadpool.ThreadPool(num_workers)
    for req in threadpool.makeRequests(method, tasks):
        task_pool.putRequest(req)
    task_pool.wait()    # block until every queued request has been processed


def get_captch(captcha_content):
    """
    Read the text of a captcha image with pytesseract.

    The image is converted to greyscale and then binarised (levels below 140
    map to 0, all others to 1) before being passed to
    ``pytesseract.image_to_string``.

    :param captcha_content: the captcha image as raw bytes
    :return: the recognised text, stripped of surrounding whitespace and upper-cased
    """
    # Lookup table used to threshold the 256 grey levels at 140.
    threshold_lut = [int(level >= 140) for level in range(256)]

    grey = Image.open(BytesIO(captcha_content)).convert('L')
    # Map every pixel through the table to make the glyphs stand out.
    binarised = grey.point(threshold_lut, '1')

    return pytesseract.image_to_string(binarised).strip().upper()


def img_save(img_content):
    """
    Save downloaded image bytes as a ``.jpg`` file under ``./image``.

    Payloads shorter than 11 bytes are reported and skipped. Everything else
    is first opened with ``Image.open`` so that data the imaging library
    cannot identify raises instead of being written to disk.

    :param img_content: picture bytes, e.g. b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF...'
    :return: None
    """
    import os

    # Check the payload itself: sys.getsizeof() on the Image object measured
    # the Python wrapper, which is always larger than 11 bytes.
    if not img_content or len(img_content) < 11:
        print('this pic less than 11 bytes')
        return

    # 1. wrap the bytes in an in-memory file
    # 2. let Image.open parse it; this raises if the data is not an image
    Image.open(BytesIO(img_content))

    # Make sure the target directory exists before writing into it.
    os.makedirs('./image', exist_ok=True)
    with open('./image/image%s.jpg' % random.random(), 'wb') as f:
        f.write(img_content)

Python多线程爬虫学习

此代码包含以下学习内容：

Anime code

config code

猜你喜欢