# -*- coding: utf-8 -*-
import requests
import demjson
from retry import retry
from config import *


class DongMan:
    """Interactive scraper that downloads Baidu image-search thumbnails.

    The search keywords and the number of result pages are read from stdin
    when the object is created.  :meth:`index` then walks every keyword/page
    pair and hands each page's result list to :meth:`details`, which
    downloads the thumbnails and stores them through ``img_save``.
    """

    # Headers sent with every request (search API and thumbnails alike).
    _HEADERS = {
        'Referer': (
            'http://image.baidu.com/search/index?tn=baiduimage&ipn=r'
            '&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1'
            '&fmq=1539824775271_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0'
            '&width=&height=&face=0&istype=2&ie=utf-8&hs=2&word=%E6%89%8B'
        ),
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
        ),
    }

    # Search endpoint; the three %s slots are queryWord, word and pn (offset).
    _SEARCH_URL = (
        'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
        '&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8'
        '&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=%s&s=&se=&tab=&width='
        '&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&pn=%s&rn=30'
        '&gsm=10e&1539825490240='
    )

    # Seconds to wait for any single HTTP request.
    _TIMEOUT = 10

    def __init__(self):
        """
        Ask for the search keywords (comma separated) and the page count.

        ``dongman_list`` ends up as a list of non-empty, stripped keywords;
        ``page_list`` keeps the raw text the user typed and is converted to
        an int by :meth:`index`.
        """
        # '\\,' yields the same prompt text as before without relying on the
        # invalid escape sequence '\,'.
        raw_keywords = input('input content you wanna scratch, use \\, to separate >')
        # Drop surrounding blanks and empty entries so "a, b," searches
        # exactly "a" and "b".
        self.dongman_list = [
            word.strip() for word in raw_keywords.split(',') if word.strip()
        ]
        self.page_list = input('input pages:>')

    # `tries` must be passed by keyword: the first positional parameter of
    # retry() is `exceptions`, so the former `@retry(3)` never retried.
    # Retrying only the HTTP call (instead of a whole method) avoids
    # re-downloading images that were already saved.
    @retry(requests.RequestException, tries=3)
    def _fetch(self, url):
        """GET *url* with the shared headers; up to 3 attempts.

        :param url: absolute URL to request
        :return: the ``requests`` response object
        :raises requests.RequestException: when the last attempt fails too
            (connection problem, timeout or HTTP error status)
        """
        response = requests.get(url=url, headers=self._HEADERS, timeout=self._TIMEOUT)
        response.raise_for_status()
        return response

    def index(self):
        """
        Fetch every requested result page for every keyword and pass the
        page's result list to :meth:`details`.

        :raises ValueError: if the page count typed by the user is not an int
        :raises requests.RequestException: if a search request keeps failing
        """
        page_count = int(self.page_list)
        for j in self.dongman_list:
            self.j = j  # keyword currently being processed
            # NOTE(review): offsets start at 30, so the first 30 results
            # (pn=0) are never requested -- confirm this is intended.
            for i in range(1, page_count + 1):
                url = self._SEARCH_URL % (j, j, 30 * i)
                reponse = self._fetch(url)
                # Kept from the original: the text is re-read as latin-1
                # before parsing, which turns non-ASCII characters into
                # single-byte ones.  NOTE(review): presumably a workaround
                # for demjson choking on the payload -- confirm.
                img_res = reponse.text.encode('utf-8').decode('latin-1')
                img_res = demjson.decode(img_res)  # decode the json format
                all_img = img_res['data']  # one dict per search hit
                self.details(all_img)

    def details(self, all_img):
        """
        Download the thumbnail of every entry and store it with ``img_save``.

        :param all_img: list of result dicts; only the ``'thumbURL'`` key is
            read, entries without it are skipped
        :return: None
        """
        for each_url in all_img:
            each_img_url = each_url.get('thumbURL')
            print(each_img_url)
            if not each_img_url:
                continue
            try:
                reponse = self._fetch(each_img_url)
            except requests.RequestException as e:
                # Best effort: report the failure and move on.  Skipping is
                # essential -- falling through would reuse the previous
                # image's response (or an unbound name on the first entry).
                print(e)
                continue
            img_save(reponse.content)  # hand over the raw image bytes


if __name__ == '__main__':
    dong_man = DongMan()
    # index() performs all downloads itself and returns None, so there is
    # nothing to hand to use_threadpool(); completion is reported directly
    # instead of being inferred from a TypeError.
    dong_man.index()
    print('FINISH!!!')
# config code: contents of the `config` module that the scraper script above imports
# -*- coding: utf-8 -*-import sys
import random
import threadpool
import pytesseract
from PIL import Image
from io import BytesIO
def use_threadpool(method):
    """
    Run the queued download jobs on a pool of five worker threads and block
    until all of them have finished.

    NOTE(review): ``method`` is never used, and ``url_down`` / ``url_all``
    are not defined in the visible part of this module -- confirm where
    they are supposed to come from.

    :param method: unused
    :return: None
    """
    pool = threadpool.ThreadPool(5)  # five worker threads
    # One work request per entry of url_all (e.g. 3000 urls -> 3000 requests).
    jobs = threadpool.makeRequests(url_down, url_all)
    print(len(url_all))  # e.g. 3000
    # Queue every request; the five workers keep pulling the next pending
    # job from the pool until none is left.
    for job in jobs:
        pool.putRequest(job)
    pool.wait()


def get_captch(captcha_content):
    """
    Read the text of a captcha image with pytesseract.

    The image is converted to grayscale and then thresholded at 140
    (levels below 140 map to 0, all others to 1) before it is handed to the
    OCR engine.

    :param captcha_content: the captcha image as bytes
    :return: the recognised text, stripped and upper-cased
    """
    grayscale = Image.open(BytesIO(captcha_content)).convert('L')
    # Lookup table for all 256 gray levels: 0 below the threshold, 1 from it on.
    threshold_table = [0] * 140 + [1] * 116
    binarised = grayscale.point(threshold_table, '1')
    recognised = pytesseract.image_to_string(binarised)
    return recognised.strip().upper()
def img_save(img_content, show=False):
    """
    Save downloaded image bytes as ``./image/image<random>.jpg``.

    Payloads shorter than 11 bytes are reported and skipped.  Anything
    longer is first opened with PIL, so data that PIL cannot identify as an
    image raises instead of being written to disk.

    :param img_content: the picture as raw bytes (for a JPEG these start
        with the bytes FF D8 FF E0 ... "JFIF")
    :param show: when true, also open the decoded image in the system
        viewer.  NOTE(review): the original called ``image.show()`` but its
        nesting is ambiguous in the source; it is opt-in here so a bulk
        download does not open one viewer window per picture -- confirm.
    :return: path of the written file, or None when the payload was skipped
    """
    import os  # local import: only needed to create the target directory

    # Measure the payload itself.  sys.getsizeof() on the PIL object reports
    # the size of the Python wrapper, which is never below 11, so the
    # "too small" branch could not trigger before.  Checking first also
    # keeps PIL from being fed data that is too short to be an image.
    if img_content is None or len(img_content) < 11:
        print('this pic less than 11 bytes')
        return None

    # Load the bytes from memory; this fails for data PIL cannot identify.
    image = Image.open(BytesIO(img_content))

    # open(..., 'wb') does not create missing directories.
    os.makedirs('./image', exist_ok=True)
    path = './image/image%s.jpg' % random.random()
    with open(path, 'wb') as f:
        f.write(img_content)

    if show:
        image.show()
    return path