import urllib
import urllib2
from lxml import etree
def loadPage(url):
    """Fetch one tieba list page and hand every thread link to loadImg.

    url: full list-page URL, e.g. "https://tieba.baidu.com/f?kw=...&pn=0".
    Side effects: network I/O; image downloading is delegated to loadImg().
    """
    # Send the same User-Agent the other fetchers use; without it Baidu may
    # serve an anti-bot page to this request while the rest succeed.
    h = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    request = urllib2.Request(url, headers=h)
    html = urllib2.urlopen(request).read()
    con = etree.HTML(html)
    # NOTE: "cleafix" is Baidu's own misspelled CSS class name -- do not "fix" it.
    link_list = con.xpath(r'//div[@class ="t_con cleafix"]/div[2]/div/div/a/@href')
    for link in link_list:
        # Thread hrefs are site-relative (e.g. "/p/123456"); prepend the host.
        full_link = "http://tieba.baidu.com" + link
        loadImg(full_link)
def loadImg(link):
    """Fetch a thread page and download every in-post image it contains.

    link: absolute URL of one tieba thread.
    Side effects: network I/O; saving is delegated to writeImg().
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    page = urllib2.urlopen(urllib2.Request(link, headers=headers)).read()
    tree = etree.HTML(page)
    # class="BDE_Image" marks images embedded in the post bodies.
    for img_src in tree.xpath('//img[@class="BDE_Image"]/@src'):
        writeImg(img_src)
def writeImg(link):
    """Download one image URL and save it in the current directory.

    link: absolute URL of the image file.
    Side effects: network I/O; writes the image bytes to disk and prints
    a confirmation message.
    """
    h = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    request = urllib2.Request(link, headers=h)
    image = urllib2.urlopen(request).read()
    # Use the URL's basename rather than the raw last 10 characters: the
    # old slice could contain '/' (invalid in a filename) or cut an image
    # name mid-way, causing open() failures or silent overwrites. Fall back
    # to the old slice if the URL unexpectedly ends with '/'.
    filename = link.split('/')[-1] or link[-10:]
    with open(filename, 'wb') as f:
        f.write(image)
    print("已经成功下载" + filename)
def tiebaSpider(url, beginPage, endPage):
    """Crawl list pages beginPage..endPage (inclusive) of one tieba.

    url: base search URL ("https://tieba.baidu.com/f?kw=...").
    beginPage, endPage: 1-based page numbers.
    """
    # Baidu paginates the list view 50 threads per page via the "pn" param.
    for page in range(beginPage, endPage + 1):
        offset = (page - 1) * 50
        loadPage("%s&pn=%d" % (url, offset))
    print("谢谢使用!")
if __name__ == "__main__":
    # Interactive entry point: ask which tieba to crawl and the page range,
    # then build the search URL and start the spider.
    kw = raw_input("请输入需要爬取的贴吧名字:")
    beginPage = int(raw_input("请输入起始页:"))
    endPage = int(raw_input("请输入结束页:"))
    base = "https://tieba.baidu.com/f?"
    query = urllib.urlencode({"kw": kw})
    tiebaSpider(base + query, beginPage, endPage)