Python 之正则、Xpath

很久之前就研究过正则表达式与 XPath,但一直没有做记录

今天记录下,以备随时查阅

上代码片段,抓取的是 豆瓣的TOP250 电影信息,网址为

https://movie.douban.com/top250?start=0&filter=

正则:
def spider(self, html):
    """Demonstrate four regex patterns for scraping a Douban Top 250 page.

    All patterns are compiled with ``re.S`` so that ``.*?`` can span line
    breaks in the HTML. Only the title pattern is actually run here; the
    others are shown for comparison.

    Args:
        html: HTML text of one list page.

    Returns:
        The list of strings captured by the title pattern (also printed).
    """
    # Ranking/ID number (alternative style: copy the tag, capture its text).
    pa_id = re.compile('<em class.*?>(.*?)</em>', re.S)
    # Detail-page URL (standard regex style: skip ahead lazily to href="...").
    pa_url = re.compile('<div.*?hd.*?href="(.*?)"', re.S)
    # Cover image URL (standard regex style).
    pa_pic = re.compile('<img.*?100.*?src="(.*?)".*?', re.S)
    # Movie title block (alternative style, unlike the URL/cover patterns):
    # everything between <div class="hd"> and the next </a>.
    pa_title = re.compile('<div class="hd">(.*?)</a>', re.S)

    titles = re.findall(pa_title, html)
    print(titles)
    return titles
我这里是测试,如果想顺序输出,直接把正则写到一起就好
 
注意到了吗?第一个和第四个正则非常简单:直接复制网页源码中的标签,把想提取的部分换成 (.*?) 就能抓取,有点像 XPath 的抓取方式

Xpath:
def spider(self, html):
    """Extract titles and the director/cast text from one list page via XPath.

    Appends one ``(titles, tokens)`` tuple per ``<li>`` entry to the
    module-level ``testobj`` list, then prints the accumulated list.

    Args:
        html: HTML text of one list page.
    """
    html1 = etree.HTML(html)
    li_list = html1.xpath("//ol[@class='grid_view']/li")
    for i in li_list:
        movies_title = i.xpath(".//span[@class='title']/text()")
        movies_director1 = i.xpath(".//div[@class='bd']/p/text()")
        # Join the text nodes and split on any whitespace. Stringifying the
        # list instead would leave brackets, quotes and escape sequences
        # embedded in the tokens.
        str1 = " ".join(movies_director1).split()
        testobj.append((movies_title, str1))
    # The original snippet lost its indentation; printing once after the loop
    # is assumed rather than once per entry.
    print(testobj)
 
#正则

import requests,re
# HTTP headers sent with every request made by douban.get_url. The User-Agent
# is a desktop Chrome string; presumably this keeps the site from rejecting
# the default client identifier (verify against the live site).
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

class douban(object):
    """Regex-based scraper for the Douban Top 250 movie list."""

    # Captures three groups per match: the first title span, the second title
    # span, and the contents of the "other" span. re.S lets ".*?" cross line
    # breaks in the HTML.
    # NOTE(review): an entry with only one title span would let the lazy match
    # run on into the following entry - confirm against the live markup.
    _PA_CONTENT = re.compile(
        '<span class="title">(.*?)</span>.*?<span class="title">(.*?)</span>.*?"other">(.*?)</span>',
        re.S,
    )

    def __init__(self):
        pass

    def get_url(self, url, timeout=10):
        """Fetch a page and return its text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl.

        Returns:
            The response body on HTTP 200, otherwise None.
        """
        rer = requests.request("get", url, headers=headers, timeout=timeout)
        if rer.status_code == 200:
            return rer.text
        return None

    def spider(self, html):
        """Print and return the title tuples found in one list page.

        Args:
            html: HTML text of a list page, or None/empty when the download
                failed.

        Returns:
            A list of (title, second_title, other) tuples; empty when html
            is None or empty.
        """
        if not html:
            # get_url returns None on a non-200 response; nothing to parse.
            return []
        matches = self._PA_CONTENT.findall(html)
        print(matches)
        return matches

    def url(self, num):
        """Build the list-page URL whose ``start`` query parameter is num."""
        return f"https://movie.douban.com/top250?start={num}&filter="

if __name__ == '__main__':
    # Use a distinct name so the instance does not shadow the douban class.
    crawler = douban()
    # "start" is an item offset and the Top 250 list shows 25 entries per
    # page, so step by 25 to visit each of the 10 pages exactly once.
    for offset in range(0, 250, 25):
        page_url = crawler.url(offset)
        html = crawler.get_url(page_url)
        if html is None:
            # get_url returns None on a non-200 response; skip that page.
            continue
        crawler.spider(html)

  





#Xpath

from lxml import etree
import requests

# HTTP headers sent with every request made by douban.get_url.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

# Accumulates one (titles, tokens) tuple per movie entry across all pages.
testobj = []


class douban(object):
    """XPath-based scraper for the Douban Top 250 movie list."""

    def __init__(self):
        pass

    def get_url(self, url, timeout=10):
        """Fetch a page and return its text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl.

        Returns:
            The response body on HTTP 200, otherwise None.
        """
        rer = requests.request("get", url, headers=headers, timeout=timeout)
        if rer.status_code == 200:
            return rer.text
        return None

    def spider(self, html):
        """Collect titles and director/cast text from one list page.

        Appends one ``(titles, tokens)`` tuple per ``<li>`` entry to the
        module-level ``testobj`` list, then prints the accumulated list.

        Args:
            html: HTML text of a list page, or None/empty when the download
                failed (in which case nothing is parsed or printed).
        """
        if not html:
            return
        html1 = etree.HTML(html)
        li_list = html1.xpath("//ol[@class='grid_view']/li")
        for i in li_list:
            movies_title = i.xpath(".//span[@class='title']/text()")
            movies_director1 = i.xpath(".//div[@class='bd']/p/text()")
            # Join the text nodes and split on any whitespace. Stringifying
            # the list instead would leave brackets, quotes and escape
            # sequences embedded in the tokens.
            str1 = " ".join(movies_director1).split()
            testobj.append((movies_title, str1))
        print(testobj)

    def url(self, num):
        """Build the list-page URL whose ``start`` query parameter is num."""
        return f"https://movie.douban.com/top250?start={num}&filter="


if __name__ == '__main__':
    # Use a distinct name so the instance does not shadow the douban class.
    crawler = douban()
    # "start" is an item offset and the Top 250 list shows 25 entries per
    # page, so step by 25 to visit each of the 10 pages exactly once.
    for offset in range(0, 250, 25):
        page_url = crawler.url(offset)
        html = crawler.get_url(page_url)
        crawler.spider(html)

  

猜你喜欢

转载自www.cnblogs.com/fbhell/p/12432480.html