现在我们用XPath来做一个简单的爬虫,我们尝试爬取某个贴吧里的所有帖子,并且将每个帖子里每个楼层发布的图片下载到本地。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Tieba (Baidu forum) image spider.

Crawls every thread listed on the requested pages of a tieba board and
downloads the images posted by each floor's author to the local disk.

Ported from Python 2 (urllib2 / raw_input / print statements) to the
Python 3 standard library; function names and signatures are unchanged.
"""

import os
import urllib.parse
import urllib.request

from lxml import etree

# One shared browser identity for every request.  The original code
# defined this dict separately in two functions and left it commented
# out in loadPage, so the listing request went out with no User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
    )
}


def loadPage(url):
    """Fetch one board listing page and crawl every thread found on it.

    url: full URL of a tieba listing page (including kw= and pn= query).
    """
    request = urllib.request.Request(url, headers=HEADERS)
    html = urllib.request.urlopen(request).read()
    # Parse the raw HTML into a DOM tree we can run XPath against.
    content = etree.HTML(html)
    # Relative hrefs of every thread shown on this listing page.
    link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    for link in link_list:
        # Build the thread's absolute URL and harvest its images.
        fulllink = "http://tieba.baidu.com" + link
        loadImage(fulllink)


def loadImage(link):
    """Fetch one thread page and download each floor author's images.

    link: absolute URL of a single thread.
    """
    request = urllib.request.Request(link, headers=HEADERS)
    html = urllib.request.urlopen(request).read()
    content = etree.HTML(html)
    # Images posted by a floor's author carry the BDE_Image CSS class.
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for img_link in link_list:
        writeImage(img_link)


def writeImage(link):
    """Download one image and write it to the current directory.

    link: absolute URL of the image.
    """
    request = urllib.request.Request(link, headers=HEADERS)
    # Raw image bytes.
    image = urllib.request.urlopen(request).read()
    # Name the file after the URL's basename instead of the original
    # "last 10 characters of the link" scheme, which could collide and
    # could even contain path separators; fall back to the old scheme
    # only if the URL path has no basename at all.
    filename = os.path.basename(urllib.parse.urlparse(link).path) or link[-10:]
    with open(filename, "wb") as f:
        f.write(image)
    print("已经成功下载 " + filename)


def tiebaSpider(url, beginPage, endPage):
    """Scheduler: build and fetch the URL of every listing page.

    url       : board URL prefix, already carrying the kw= query
    beginPage : first page number (1-based, inclusive)
    endPage   : last page number (inclusive)
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates with pn = 50 * (page - 1) posts per page.
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)

    print("谢谢使用")


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)