import os
import random
import re
import time
from urllib import parse

import requests
from fake_useragent import UserAgent


class BaiduImgSpider(object):
    """Download the thumbnail images listed on a Baidu image-search page.

    The search page is fetched once, every ``thumbURL`` value matched by
    ``re_str`` is extracted from the returned text, and each URL is saved
    as ``<count>.jpg`` inside a per-keyword directory under ``savepath``.
    """

    def __init__(self, savepath='/home/user/work/spider/day03/', timeout=10):
        """Set up the spider.

        Args:
            savepath: Root directory under which one sub-directory per
                search keyword is created. Defaults to the original
                hard-coded location.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.baseurl = 'https://image.baidu.com/search/index?tn=baiduimage&word={}'
        # Running file-name counter; shared across searches on this instance.
        self.count = 1
        self.ua = UserAgent()
        self.savepath = savepath
        self.timeout = timeout
        # Captures whatever sits between "thumbURL":" and ","replaceUrl":
        self.re_str = r'{"thumbURL":"(.*?)","replaceUrl":'

    def get_html(self, name, orgname):
        """Fetch the search page and download every image link found in it.

        Args:
            name: URL-quoted search keyword, substituted into ``baseurl``.
            orgname: Unquoted keyword, used as the sub-directory name.

        Raises:
            requests.RequestException: If the search page itself cannot be
                fetched (connection error, timeout, or HTTP error status).
        """
        header = {'User-Agent': self.ua.random}
        url = self.baseurl.format(name)
        # Without a timeout requests waits forever on a stalled connection.
        response = requests.get(url=url, headers=header, timeout=self.timeout)
        response.raise_for_status()

        pattern = re.compile(self.re_str, re.S)
        img_list = pattern.findall(response.text)

        # Strip leading separators so os.path.join cannot discard savepath
        # when the keyword happens to start with one.
        path = os.path.join(self.savepath, orgname.lstrip(os.sep))
        # makedirs also creates missing parents and tolerates an existing dir.
        os.makedirs(path, exist_ok=True)

        for img_link in img_list:
            print(img_link)
            try:
                self.save_img(img_link, path)
            except requests.RequestException as err:
                # One dead link must not abort the remaining downloads.
                print('下载失败', img_link, err)
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(1, 2))

    def save_img(self, url, path):
        """Download one image and write it to ``path`` as ``<count>.jpg``.

        ``self.count`` is incremented only after the file has been written.

        Args:
            url: Image URL to download.
            path: Existing directory the file is written into.

        Raises:
            requests.RequestException: If the download fails or the server
                answers with an HTTP error status.
        """
        header = {'User-Agent': self.ua.random}
        response = requests.get(url=url, headers=header, timeout=self.timeout)
        # Refuse to store an error page under a .jpg name.
        response.raise_for_status()

        filename = os.path.join(path, str(self.count) + '.jpg')
        with open(filename, 'wb') as f:
            f.write(response.content)
        print('下载成功', filename)
        self.count += 1

    def run(self):
        """Prompt for a keyword on stdin and download its search results."""
        search_name = input('输入要获取的名字>')
        word = parse.quote(search_name)
        self.get_html(word, search_name)


if __name__ == '__main__':
    spider = BaiduImgSpider()
    spider.run()
# 直接上代码了,非常简单。