Crawling the top ten entries of Baidu's hot list with Python

The specific code is as follows:

import re
import requests
import json


def get_html(url, timeout=10):
    """Fetch *url* and return the page source as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded response body when the server answers with HTTP 200,
        otherwise ``None`` (a short message is printed describing the
        failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "fetch failed";
        # programming errors and KeyboardInterrupt still propagate.
        print("获取失败!")
        return None

    if response.status_code != 200:
        print("连接异常!")
        return None

    # Decode with the encoding detected from the body rather than the one
    # guessed from the headers.
    response.encoding = response.apparent_encoding
    return response.text


def get_result(html, limit=10):
    """Extract the leading hot-list entries from the page source.

    Four regular expressions pull the rank, keyword, popularity index and
    link out of the ``first`` / ``keyword`` / ``last`` table cells. The
    matches are combined position by position.

    Args:
        html: Page source to parse. ``None`` or an empty string yields an
            empty result instead of raising.
        limit: Maximum number of entries to return. A value of zero or
            less yields an empty result.

    Returns:
        A dict keyed by the rank string; each value is a dict holding the
        keyword, popularity index and search link of that entry. Fewer
        than ``limit`` entries are returned when the page contains fewer
        complete rows.
    """
    if not html:
        return {}

    rank_re = re.compile(r'<td class="first">.*?>(\d+)</span>', re.S)  # rank
    keyword_re = re.compile(r'<td class="keyword">.*?>(.*?)</a>', re.S)  # keyword text
    index_re = re.compile(r'<td class="last">.*?>(\d+)</span>', re.S)  # popularity index
    link_re = re.compile(r'<td class="keyword">.*?<a href="(.*?)"', re.S)  # link

    ranks = rank_re.findall(html)
    keywords = keyword_re.findall(html)
    indexes = index_re.findall(html)
    links = link_re.findall(html)

    result = {}
    # zip stops at the shortest sequence, so a page with fewer rows than
    # expected no longer raises IndexError; range(limit) caps the count.
    for _, rank, keyword, index, link in zip(range(limit), ranks, keywords, indexes, links):
        # Use the rank as the outer key so an entry can be looked up by rank.
        result[rank] = {
            "关键字": keyword,
            "流行指数": index,
            # Turn the relative detail link into a Baidu search URL.
            "链接": link.replace(
                './detail?b=1&c=513&w',
                'https://www.baidu.com/baidu?cl=3&tn=SE_baiduhomet8_jmjb7mjw&rsv_dl=fyb_top&fr=top1000&wd',
            ),
        }
    return result


def main():
    """Download the Baidu hot-list page and print its entries as JSON."""
    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    html = get_html(url)
    if html is None:
        # get_html has already printed why the download failed; there is
        # nothing to parse, so stop instead of crashing in get_result.
        return
    result1 = get_result(html)
    # ensure_ascii=False keeps the Chinese keys and keywords readable.
    result = json.dumps(result1, indent=4, ensure_ascii=False)
    print(result)


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

You may also like

Source: blog.csdn.net/m0_46202060/article/details/111063966