pyquery——以jQuery的语法来操作解析xml文档

pyquery允许对xml文档进行jquery查询。该API尽可能类似于jquery。pyquery使用lxml进行快速的xml和html操作，能够以jQuery的语法来操作解析 HTML 文档。

实例：爬取疫情报告 https://voice.baidu.com/act/newpneumonia/newpneumonia

（今天报错还未调试成功，明天继续）

import requests
from pyquery import PyQuery as pq

def get_page(url):
    """发起请求 获得源码"""
    r = requests.get(url)
    r.encoding = 'utf8'
    html = r.text
    return html


def parse(text):
    """解析数据 写入文件"""
    doc = pq(text)
    # 获得每一行的tr标签
    ths  = doc('table.table thead tr.VirusTable_1-1-156_26gN5Z').items()
    for th in ths:
        area = th.find('span').text()  # 地区
        confirm = th.find('td:nth-child(2)').text()  # 确诊
        death = th.find('td:nth-child(3)').text()  # 死亡
        cure = th.find('td:nth-child(4)').text()  # 治愈
        with open('D:\yiqing.csv', 'a+', encoding='utf8') as f:
            f.write(area + '\t\t')
            f.write(confirm + '\t\t')
            f.write(death + '\t\t')
            f.write(cure + '\t\t\n')
    print("写入完成")
    """
    tds = doc('table.table tbody tr').items()
    for td in tds:
        rank = td.find('td:first-child').text()     # 排名
        name = td.find('div').text()  # 大学名称
        city = td.find('td:nth-child(3)').text()    # 城市
        score = td.find('td:nth-child(4)').text()   # 总分
        with open('D:\yiqing.csv', 'a+', encoding='utf8') as f:
            f.write(rank + '\t\t')
            f.write(name + '\t\t')
            f.write(city + '\t\t')
            f.write(score + '\t\t\n')
    print("写入完成")
    """

if __name__ == "__main__":
    url = "https://voice.baidu.com/act/newpneumonia/newpneumonia"
    text = get_page(url)
    parse(text)

pyquery——以jQuery的语法来操作解析xml文档

猜你喜欢