还是老家的旅游网址:http://www.patour.cn/site/pananzxw/tcgl/index.html,将这些特产的图片及其介绍都爬取下来!
源码:
1 # -*- coding:utf-8 -*-
2 import urllib2
3 import re
4 from lxml import etree
5
class Spider:
    """Scraper for the local-specialty pages on www.patour.cn.

    Downloads the specialty index page, follows each product's detail
    page, saves every product image into the working directory, and
    appends the descriptive text to wenzi.txt.

    NOTE(review): written for Python 2 (`urllib2`, lxml's `etree` are
    expected to be imported at module level).
    """

    # Shared request headers. The original repeated this dict in three
    # methods, two of which carried a typo ("Gecko/20 100101" with an
    # embedded space); this is the one correct copy.
    HEADERS = {
        "User-Agent": ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) "
                       "Gecko/20100101 Firefox/45.0")
    }
    # Site root, prepended to the relative hrefs/srcs found in the pages.
    BASE_URL = "http://www.patour.cn"

    def __init__(self):
        pass

    def loadPage(self):
        """Fetch the specialty index page and pass its HTML on."""
        url = 'http://www.patour.cn/site/pananzxw/tcgl/index.html'
        request = urllib2.Request(url, headers=self.HEADERS)
        response = urllib2.urlopen(request)
        html = response.read()
        self.getfullUrl(html)

    def getfullUrl(self, html):
        """Extract each detail-page link from the index and crawl it."""
        content = etree.HTML(html)
        link_list = content.xpath('//div[@class="box_con"]/a[@class="mtit"]/@href')
        for item in link_list:
            # hrefs are site-relative; build the absolute URL.
            full_url = self.BASE_URL + str(item)
            self.loadlittlePage(full_url)

    def loadlittlePage(self, url):
        """Download one detail page and extract its images and text."""
        request = urllib2.Request(url, headers=self.HEADERS)
        html_little = urllib2.urlopen(request).read()
        self.getImageUrl(html_little)
        self.getWenzi(html_little)

    def getImageUrl(self, html):
        """Find every product-image URL on a detail page and download it."""
        content = etree.HTML(html)
        link_list = content.xpath('//div[@class="news_text"]/p/img/@src')
        for item in link_list:
            fullImage_url = self.BASE_URL + str(item)
            self.loadImage(fullImage_url)

    def getWenzi(self, html):
        """Extract the descriptive paragraphs and append them to wenzi.txt."""
        pattern = re.compile(r'<p\sstyle="line-height:\s2em;">(.*?)</p>', re.S)
        content_list = pattern.findall(html)
        for content in content_list:
            # Strip the <br> tags left inside the paragraph body.
            content = content.replace("<br>", "").replace("<br/>", "")
            self.loadWenzi(content)

    def loadWenzi(self, content):
        """Append one chunk of description text to wenzi.txt."""
        with open("wenzi.txt", "a") as f:
            f.write(content)

    def loadImage(self, link):
        """Download one image and save it under its own basename.

        The original sliced the last 15 characters of the URL, which can
        include '/' (breaking open()) and collide between images; the
        URL basename is used instead.
        """
        request = urllib2.Request(link, headers=self.HEADERS)
        image = urllib2.urlopen(request).read()
        filename = link.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(image)
        print('下载成功!')  # parenthesized: valid on both Python 2 and 3
72
if __name__ == "__main__":
    # Entry point: kick off the crawl from the specialty index page.
    Spider().loadPage()
结果: