1. 数据解析
-谷歌浏览器离线安装Xpath Helper
1.1. xpath语法与lxml库
xpath语法文档链接:http://www.w3school.com.cn/xpath/index.asp
lxml库 官方链接:https://lxml.de/index.html
- 解析html字符串,使用
lxml.etree.HTML
进行解析,示例代码如下
from lxml import etree
# A small HTML fragment used to demonstrate parsing an in-memory string
# with lxml's HTML parser.
html_doc = """
<table>
<tr>
<td>1</td>
<td>2</td>
<td>3</td>
</tr>
<tr>
<td>a</td>
<td>b</td>
<td>c</td>
</tr>
<tr>
<td>中国</td>
<td>美国</td>
<td>日本</td>
</tr>
</table>
"""

# etree.HTML builds an Element tree from the string, tolerating the
# missing <html>/<body> wrapper tags.
root = etree.HTML(html_doc)
# Serialize the tree back to text to confirm the parse round-trips.
serialized = etree.tostring(root, encoding="utf-8").decode()
print(serialized)
- 解析html文件,使用
lxml.etree.parse
进行解析,但是这个函数默认使用的是XML
解析器,所以要创建相应的HTMLParser解析器进行解析,示例如下。
from lxml import etree
# etree.parse defaults to the strict XML parser, which fails on typical
# real-world HTML, so hand it an explicit HTML parser instead.
html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("renren.html", parser=html_parser)
# Dump the parsed tree back to text to verify the file was read correctly.
print(etree.tostring(tree, encoding="utf-8").decode())
1.2. 豆瓣电影正在上映的电影爬取
import requests
from lxml import etree
# Browser-like headers: douban rejects requests without a User-Agent/Referer.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
url = "https://movie.douban.com/cinema/nowplaying/beijing/"
response = requests.get(url, headers=headers)

# Parse the now-playing list; each <li> carries the movie's metadata
# in data-* attributes.
html = etree.HTML(response.content.decode())
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    movie = {
        'title': li.xpath("@data-title")[0],
        'score': li.xpath("@data-score")[0],
        'duration': li.xpath("@data-duration")[0],
        'region': li.xpath("@data-region")[0],
        'actors': li.xpath("@data-actors")[0],
        # Poster image URL (key was previously misspelled 'thumnails').
        'thumbnails': li.xpath(".//img/@src")[0],
    }
    movies.append(movie)
print(movies)
1.3. 电影天堂数据爬取
import requests
from lxml import etree
# Shared request headers: a browser User-Agent so the site serves normal pages.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}
# Site root, prepended to the relative detail-page paths found on list pages.
# (Removed the stray trailing semicolon — not idiomatic Python.)
BASE_URL = "https://dytt8.net"
def get_detail_urls(url):
    """
    Collect the detail-page links found on one list page.

    :param url: URL of the n-th list page
    :return: iterator of absolute detail-page URLs
    """
    response = requests.get(url, headers=HEADERS)
    # NOTE: some list pages (e.g. page 3) contain characters that break a
    # strict GBK decode; since we only extract links here, the default
    # encoding guess from requests (response.text) is good enough.
    html = etree.HTML(response.text)
    relative_paths = html.xpath("//a[@class='ulink']/@href")
    # The hrefs are site-relative; prefix the site root to make them absolute.
    return (BASE_URL + path for path in relative_paths)
def parse_detail_page(detail_url):
    """
    Fetch one movie detail page and extract its metadata.

    :param detail_url: absolute URL of the movie detail page
    :return: dict with 'title' plus, when present on the page,
             'cover', 'screenshot', 'year', 'director' and 'actors'
    """
    movie = {}
    response = requests.get(detail_url, headers=HEADERS)
    # The detail pages are served in GBK, so decode explicitly.
    htmlE = etree.HTML(response.content.decode("gbk"))
    title = htmlE.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoomE = htmlE.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # Guard against pages that ship fewer than two images: previously
    # imgs[0]/imgs[1] raised IndexError on such pages.
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]
    infos = zoomE.xpath(".//text()")
    # Only a subset of the ◎-prefixed fields is extracted here.
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            year = info.replace("◎年 代", "").strip()
            movie['year'] = year
        elif info.startswith("◎导 演"):
            director = info.replace("◎导 演", "").strip()
            movie['director'] = director
        elif info.startswith("◎主 演"):
            # The lead actor shares the line with the marker; the remaining
            # cast members each occupy their own text node until the next
            # ◎-prefixed field begins.
            actor = info.replace("◎主 演", "").strip()
            actors = [actor]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
    return movie
def spider():
    """Crawl the newest-movies list (page 1 only) and print each movie."""
    list_url_template = "https://dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # range(1, 2) deliberately limits the crawl to the first list page;
    # widen the range to fetch more pages.
    for page in range(1, 2):
        for detail_url in get_detail_urls(list_url_template.format(page)):
            info = parse_detail_page(detail_url)
            movies.append(info)
            print(info)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    spider()
2. 数据存储
2.1. json文件处理(参考python案例之爬虫基础案例 )
2.2. csv文件的处理