Python爬虫——XPath结合re同时爬取文字与图片

还是老家的旅游网址: http://www.patour.cn/site/pananzxw/tcgl/index.html 。这次将该页面上各种特产的图片及其文字介绍都爬取下来!

源码:

1 # -*- coding:utf-8 -*- 
  2 import urllib2
  3 import re
  4 from lxml import etree
  5 
  6 class Spider:
  7     def __init__(self):
  8         pass
  9     def loadPage(self):
 10         #将网页的源码爬取下来
 11         url = 'http://www.patour.cn/site/pananzxw/tcgl/index.html'
 12         headers ={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
 13         request = urllib2.Request(url,headers=headers)
 14         response = urllib2.urlopen(request)
 15         html = response.read()
 16         self.getfullUrl(html)
 17         #print html
 18     def getfullUrl(self,html):
 19         #利用xpath将分网页拿取出来
 20         content = etree.HTML(html)
 21         link_list = content.xpath('//div[@class="box_con"]/a[@class="mtit"]/@href')
 22         #print link_list
 23         for item in link_list:
 24             full_url = "http://www.patour.cn"+str(item)
 25             #print full_url
 26             self.loadlittlePage(full_url)
 27 
 28     def loadlittlePage(self,url):
 29         #将分网页的源码拿出
 30         headers ={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20    100101 Firefox/45.0"}
 31         request = urllib2.Request(url,headers=headers)
 32         html_little = urllib2.urlopen(request).read()
 33         #print html_little
 34 
 35         self.getImageUrl(html_little)
 36         self.getWenzi(html_little)
 37 
 38     def getImageUrl(self,html):
 39         #分析拿出图片的url
 40         content = etree.HTML(html)
 41         link_list = content.xpath('//div[@class="news_text"]/p/img/@src')
 42         for item in link_list:
 43             fullImage_url = "http://www.patour.cn"+str(item)
 44             #print fullImage_url
 45             self.loadImage(fullImage_url)#下载图片
 46     
 47     def getWenzi(self,html):
 48         #分析文字
49         pattern = re.compile('<p\sstyle="line-height:\s2em;">(.*?)</p>',re.S)
 50         content_list = pattern.findall(html)
 51 
 52         for content in content_list:
 53             #print content
 54             content = content.replace("<br>","").replace("<br/>","")
 55             self.loadWenzi(content)
 56 
 57     def loadWenzi(self,content):
 58         #下载文字并保存
 59         with open("wenzi.txt","a") as f:
 60             f.write(content)
 61 
 62     def loadImage(self,link):
 63         #将图片下载下来
 64         headers ={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20        100101 Firefox/45.0"}
 65         request = urllib2.Request(link,headers=headers)
 66         image = urllib2.urlopen(request).read()
 67         filename = link[-15:]
 68         with open(filename,'wb') as f:
 69             f.write(image)
 70         print '下载成功!'
 71 
 72 
 73 if __name__ == "__main__":
 74     techanspider = Spider()
 75     techanspider.loadPage()
                                                              

结果:

猜你喜欢

转载自blog.csdn.net/jerechen/article/details/79330332