爬虫，爬取豆瓣书城首页的书籍信息，requests下载页面，三种解析方式（正则，bs4，xpath）

import requests
# Download the Douban Books home page; the parsing sections below read `r`
# (raw response) and `content` (decoded text).
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of silently parsing
# an error page.
# NOTE(review): some sites reject the default python-requests User-Agent; if
# this call raises, a `headers={'User-Agent': ...}` argument may be needed.
r = requests.get('https://book.douban.com/', timeout=10)
r.raise_for_status()
content = r.text

需要解析的主要HTML

# <div class="info">
# <div class="title">
# <a class="" href="https://book.douban.com/subject/30163860/?icn=index-editionrecommend"
# title="绿毛水怪">绿毛水怪</a>
# </div>
# <div class="author">
# 王小波
# </div>
# <div class="more-meta">
# <h4 class="title">
# 绿毛水怪
# </h4>
# <p>
# <span class="author">
# 王小波
# </span>
# /
# <span class="year">
# 2018-5-1
# </span>
# /
# <span class="publisher">
# 北京十月文艺出版社
# </span>

正则

import re
# Regex approach: in the page markup an <h4> title is followed by a <p> whose
# author / year / publisher <span>s hold the remaining fields; capture all four.
# The wildcards must be the lazy `.*?` -- a bare `.?` matches at most one
# character and can never span the tag attributes and line breaks between the
# elements. re.S lets `.` match newlines as well.
pattern = re.compile(
    r'<h4.*?>(.*?)</h4>.*?<p>.*?author">.*?(.*?)</span>'
    r'.*?year">.*?(.*?)</span>.*?publisher">.*?(.*?)</span>.*?</p>',
    re.S,
)

results = re.findall(pattern, content)
print(results)
for result in results:
    # Each match is a 4-tuple: (title, author, publication date, publisher).
    name, author, time, chuban = result
    # The captured text still carries the newlines/indentation of the markup;
    # remove every whitespace character before printing.
    name = re.sub(r"\s", '', name)
    author = re.sub(r"\s", '', author)
    time = re.sub(r"\s", '', time)
    chuban = re.sub(r"\s", '', chuban)
    print(name, author, time, chuban)

bs4

# bs4 approach: parse the downloaded page with BeautifulSoup (lxml backend) and
# collect the title / author / year / publisher tags by tag name and CSS class.
# The class to import is `BeautifulSoup`; bs4 exports no name `beautiful`.
from bs4 import BeautifulSoup

html = r.content

soup = BeautifulSoup(html, "lxml")
print(type(soup))

# `find_all` / `string=` are the current spellings of the deprecated
# `findAll` / `text=`. The pattern ".?" matches any string, so the `string`
# filter effectively keeps only tags that have a single string child.
name = soup.find_all(name='h4', class_='title', string=re.compile(".?"))

author = soup.find_all(name='span', class_='author', string=re.compile(".?"))

time = soup.find_all(name='span', class_="year", string=re.compile(".?"))

chuban = soup.find_all(name="span", class_="publisher", string=re.compile(".?"))

xpath

from lxml import etree

# XPath approach: build an lxml element tree from the raw response bytes and
# pull out the text nodes of the title / author / year / publisher elements.
html = r.content
tree = etree.HTML(html)

# One XPath query per field, evaluated in order and unpacked into the same
# four result lists the other sections produce.
name, author, time, chuban = (
    tree.xpath(query)
    for query in (
        "//h4/text()",
        "//span[@class='author']/text()",
        "//span[@class='year']/text()",
        "//span[@class='publisher']/text()",
    )
)