A spider built with XPath: downloading images from Baidu Tieba

Now let's use XPath to build a simple spider: we will crawl all the threads in a given Tieba forum and download the images posted on each floor (reply) of those threads to the local disk.
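Before diving into the full script below, here is a minimal, self-contained sketch of how the two XPath queries the spider relies on pull attribute values out of a page with lxml. The HTML fragment is made up purely for illustration; only the two XPath expressions come from the actual spider.

from lxml import etree

# A made-up fragment that mimics the two structures the spider looks for:
# the thread list on a forum page, and an image inside a thread.
html = '''
<div class="threadlist_lz clearfix">
    <div><a href="/p/123456">a sample thread</a></div>
</div>
<img class="BDE_Image" src="http://example.com/sample.jpg">
'''

content = etree.HTML(html)

# Thread links: the href attribute of <a> tags nested under div.threadlist_lz
print content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
# ['/p/123456']

# Image URLs: the src attribute of <img class="BDE_Image">
print content.xpath('//img[@class="BDE_Image"]/@src')
# ['http://example.com/sample.jpg']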

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib
import urllib2
from lxml import etree

def loadPage(url):
    """
        Purpose: send a request to the given url and fetch the server's response
        url: the url to crawl
    """
    #print url
    #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}

    request = urllib2.Request(url)
    html = urllib2.urlopen(request).read()
    # Parse the HTML document into an HTML DOM model
    content = etree.HTML(html)
    #print content
    # xpath() returns the list of all successful matches
    link_list = content.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    for link in link_list:
        # Combine into the full link of each thread
        fulllink = "http://tieba.baidu.com" + link
        #print fulllink
        loadImage(fulllink)

# Extract every image link inside a thread
def loadImage(link):
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib2.Request(link, headers = headers)
    html = urllib2.urlopen(request).read()
    # Parse
    content = etree.HTML(html)
    # Collect the links of the images posted on each floor of the thread
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    # Handle each image link
    for link in link_list:
        #print link
        writeImage(link)

def writeImage(link):
    """
        Purpose: write the image data to a local file
        link: the image link
    """
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib2.Request(link, headers = headers)
    # Raw image data
    image = urllib2.urlopen(request).read()
    # Use the last 10 characters of the link as the filename
    filename = link[-10:]
    # Write to a file on the local disk
    with open(filename, "wb") as f:
        f.write(image)
    print "Downloaded " + filename

def tiebaSpider(url, beginPage, endPage):
    """
        Purpose: spider scheduler, responsible for building the url of each list page
        url : the fixed front part of the Tieba url
        beginPage : start page
        endPage : end page
    """
    for page in range(beginPage, endPage + 1):
        # Each list page shows 50 threads, so pn is an offset of (page - 1) * 50
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        #print fullurl
        loadPage(fullurl)

    print "Thanks for using"

if __name__ == "__main__":
    kw = raw_input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
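One detail of tiebaSpider worth spelling out: a Tieba list page shows 50 threads, and the pn query parameter is an offset rather than a page number, hence pn = (page - 1) * 50 (page 1 gives pn=0, page 2 gives pn=50, and so on). A small sketch of how the list-page URLs are assembled; the forum name here is just an example:

# -*- coding:utf-8 -*-
import urllib

kw = "python"                       # example forum name, purely illustrative
key = urllib.urlencode({"kw": kw})  # -> "kw=python"
base = "http://tieba.baidu.com/f?" + key

for page in range(1, 4):            # pages 1 to 3
    pn = (page - 1) * 50            # 0, 50, 100
    print base + "&pn=" + str(pn)
# http://tieba.baidu.com/f?kw=python&pn=0
# http://tieba.baidu.com/f?kw=python&pn=50
# http://tieba.baidu.com/f?kw=python&pn=100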
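Note that the script is written for Python 2: urllib2, urllib.urlencode, raw_input and the print statement no longer exist in Python 3. If you want to adapt it, the request side maps roughly onto urllib.request and urllib.parse as sketched below; this is only an outline of the equivalents (load_html is an illustrative helper name), not a tested port of the whole spider.

import urllib.request
import urllib.parse
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

def load_html(url):
    # urllib2.Request / urllib2.urlopen become urllib.request.Request / urlopen
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()
    return etree.HTML(html)

# urllib.urlencode becomes urllib.parse.urlencode
key = urllib.parse.urlencode({"kw": "python"})   # example forum name
# raw_input becomes input(), and print becomes a function: print(...)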

Reposted from www.cnblogs.com/wanglinjie/p/9190536.html