Use the third-party Python request library `requests` and the parsing library `lxml` to crawl all the images from a Baidu Tieba (Post Bar) forum of any given name:
Requirements are as follows:
1. Use the object-oriented programming paradigm.
2. Take simple anti-anti-crawling measures: do not send requests too frequently, and use a randomly generated `User-Agent` request header (via `fake_useragent`) to evade anti-crawling checks.
3. Only crawl images posted by the thread starter (the first floor of each thread); do not crawl any other images.
The code is shown below:
import requests
from lxml import etree
import os
import time
import random
import warnings
from fake_useragent import UserAgent

warnings.filterwarnings('ignore')


class BaiduSpider(object):
    """Crawl images posted by the thread starter (first floor only) from a
    Baidu Tieba forum of the given name, saving them under ./<keyword>/.

    Anti-anti-crawl measures: a random User-Agent per run and a random
    1-10 second pause between index pages.
    """

    def __init__(self, keyword, page_number):
        # keyword: forum (tieba) name to crawl; also used as the output directory.
        # page_number: how many index pages to walk (50 threads per page).
        self.url = 'http://tieba.baidu.com/'
        self.useragent = UserAgent()
        # Pick one random User-Agent for this session to look like a real browser.
        self.headers = {'User-Agent': self.useragent.random}
        self.keyword = keyword
        self.page_number = page_number

    def get_tlink(self, data):
        """Fetch one forum index page and visit every thread link found on it.

        data: query-string dict ({'kw': forum name, 'pn': offset}).
        """
        res = requests.get(self.url, headers=self.headers, params=data)
        res.encoding = 'utf-8'
        html = res.text
        # The thread list is delivered inside an HTML comment; strip the
        # comment markers so lxml can parse the real markup.
        html = html.replace(r"<!--", '').replace(r"-->", '')
        parse_html = etree.HTML(html)
        t_list = parse_html.xpath(
            '//ul[@id="thread_list"]/li[@class="j_thread_list clearfix"]'
            '/div//a/@href')
        for t in t_list:
            # Hrefs are site-relative; build the absolute thread URL.
            t_link = 'http://tieba.baidu.com' + t
            # Request the thread page, extract image links, download each image.
            self.get_ilink(t_link)

    def get_ilink(self, t_link):
        """Fetch a thread page, extract first-floor image URLs, download each."""
        res = requests.get(t_link, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # Restrict the XPath to the first-floor container so only the thread
        # starter's images are collected (requirement 3).
        i_list = parse_html.xpath(
            '//div[@class="d_post_content_main d_post_content_firstfloor"]'
            '//div[@class="d_post_content j_d_post_content clearfix"]'
            '/img[@class="BDE_Image"]/@src')
        print(i_list)
        for i in i_list:
            # BUG FIX: original passed the misspelled keyword `heasers=`,
            # which raises TypeError on every image request.
            content = requests.get(i, headers=self.headers).content
            self.write_image(content, i)

    def write_image(self, html, i):
        """Save raw image bytes to ./<keyword>/; the file name is the last
        10 characters of the image URL."""
        filename = './' + self.keyword + '/' + i[-10:]
        with open(filename, 'wb') as f:
            f.write(html)

    def main(self):
        """Drive the crawl across all requested index pages."""
        # BUG FIX: the original called os.remove() on the keyword path, which
        # fails on a directory, and never created the output directory, so
        # write_image() would raise FileNotFoundError. Create it instead.
        os.makedirs(self.keyword, exist_ok=True)
        for i in range(1, self.page_number + 1):
            # Each index page holds 50 threads; 'pn' is the thread offset.
            data = {'kw': self.keyword, 'pn': str((i - 1) * 50)}
            self.get_tlink(data)
            print('Page %d downloaded' % i)
            # Throttle between pages to avoid triggering anti-crawl defenses.
            time.sleep(random.randint(1, 10))


if __name__ == '__main__':
    spider = BaiduSpider('entrance it', 1)
    spider.main()