16.爬虫之数据解析和数据存储

1. 数据解析

-谷歌浏览器离线安装Xpath Helper

1.1. xpath语法与lxml库

xpath语法文档链接:http://www.w3school.com.cn/xpath/index.asp

lxml库 官方链接:https://lxml.de/index.html

  • 解析html字符串,使用lxml.etree.HTML进行解析,示例代码如下。
from lxml import etree

# Demo: parse an HTML snippet held in a string with lxml.etree.HTML,
# then serialize the resulting element tree back to text.
sample_html = """
<table>
    <tr>
        <td>1</td>
        <td>2</td>
        <td>3</td>
    </tr>
    <tr>
        <td>a</td>
        <td>b</td>
        <td>c</td>
    </tr>
    <tr>
        <td>中国</td>
        <td>美国</td>
        <td>日本</td>
    </tr>
</table>
"""

root = etree.HTML(sample_html)
# tostring returns bytes when an encoding is given; decode for printing.
print(etree.tostring(root, encoding="utf-8").decode())

  • 解析html文件,使用lxml.etree.parse进行解析,但是这个函数默认使用的是XML解析器,所以要创建相应的HTMLParser解析器进行解析,示例如下。
from lxml import etree

# Demo: parse an HTML *file*. etree.parse defaults to the strict XML
# parser, so an explicit HTMLParser is supplied to cope with real-world
# (non-well-formed) HTML.
html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("renren.html", parser=html_parser)
print(etree.tostring(tree, encoding="utf-8").decode())

1.2. 豆瓣电影正在上映的电影爬取

import requests
from lxml import etree

# Scrape the "now playing" movie list for Beijing from Douban.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}

url = "https://movie.douban.com/cinema/nowplaying/beijing/"

response = requests.get(url, headers=headers)

# Each <li> in the listing carries the movie metadata as data-* attributes,
# so no detail-page requests are needed.
html = etree.HTML(response.content.decode())
ul = html.xpath("//ul[@class='lists']")[0]
movies = [
    {
        'title': li.xpath("@data-title")[0],
        'score': li.xpath("@data-score")[0],
        'duration': li.xpath("@data-duration")[0],
        'region': li.xpath("@data-region")[0],
        'actors': li.xpath("@data-actors")[0],
        # NOTE(review): key spelling ('thumnails') kept as in the original.
        'thumnails': li.xpath(".//img/@src")[0],
    }
    for li in ul.xpath("./li")
]
print(movies)

1.3. 电影天堂数据爬取

import requests
from lxml import etree

# Request headers shared by every page fetch; a browser-like User-Agent
# avoids trivial bot blocking.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}

# Site root; relative detail-page paths from the list pages are joined
# onto this. (Fix: removed stray trailing semicolon.)
BASE_URL = "https://dytt8.net"


def get_detail_urls(url):
    """Fetch one list page and return an iterator of absolute detail-page URLs.

    :param url: URL of the n-th list page.
    :return: lazy iterator of absolute detail-page URLs.
    """
    resp = requests.get(url, headers=HEADERS)
    # The list pages contain odd characters that break a strict gbk decode
    # (page 3 raised an error). Since only the hrefs are needed here, the
    # default-decoded text is good enough.
    doc = etree.HTML(resp.text)
    hrefs = doc.xpath("//a[@class='ulink']/@href")
    # Generator keeps the same lazy-iterator semantics as the original map().
    return (BASE_URL + href for href in hrefs)


def parse_detail_page(detail_url):
    """Fetch a movie detail page and extract its metadata.

    :param detail_url: URL of the movie's detail page.
    :return: dict with the keys found on the page, among 'title', 'cover',
             'screenshot', 'year', 'director', 'actors'.
    """
    movie = {}
    response = requests.get(detail_url, headers=HEADERS)
    # The detail pages are gbk-encoded (unlike the list pages).
    htmlE = etree.HTML(response.content.decode("gbk"))
    # Guard against pages missing the expected title element; the original
    # code raised IndexError here.
    titles = htmlE.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    if titles:
        movie['title'] = titles[0]
    zoomE = htmlE.xpath("//div[@id='Zoom']")[0]
    # Some pages ship fewer than two images; only set the keys that exist
    # instead of raising IndexError.
    imgs = zoomE.xpath(".//img/@src")
    if len(imgs) >= 1:
        movie['cover'] = imgs[0]
    if len(imgs) >= 2:
        movie['screenshot'] = imgs[1]

    # The info block is plain text lines, each field prefixed with "◎".
    # (Only a subset of the fields is extracted.)
    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['year'] = info.replace("◎年  代", "").strip()
        elif info.startswith("◎导  演"):
            movie['director'] = info.replace("◎导  演", "").strip()
        elif info.startswith("◎主  演"):
            # First actor sits on the field line itself; the rest follow,
            # one per text node, until the next "◎" field marker.
            actors = [info.replace("◎主  演", "").strip()]
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎"):
                    break
                actors.append(line)
            movie['actors'] = actors
    return movie


def spider():
    """Crawl the list page(s) of the latest-movies section and collect
    one metadata dict per movie.

    :return: list of movie dicts (also printed one by one as they arrive).
    """
    base_url = "https://dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Only the first list page is crawled; widen the range to fetch more.
    for page in range(1, 2):
        for detail_url in get_detail_urls(base_url.format(page)):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Fix: the accumulated list was previously built and then discarded.
    return movies


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    spider()



1.4. 正则表达式参考

2. 数据存储

2.1. json文件处理(参考python案例之爬虫基础案例 )

2.2. csv文件的处理

发布了85 篇原创文章 · 获赞 12 · 访问量 3745

猜你喜欢

转载自blog.csdn.net/fanjianhai/article/details/103679681