关于python抓取网页数据

就以当前的新型冠状病毒肺炎疫情为例,

利用各平台的疫情实时追踪来抓取网页数据,这里以腾讯新闻为例。

步骤

  1. 使用浏览器打开腾讯新闻疫情实况
  2. 利用浏览器自带的开发者工具抓取数据
  3. 从数据中分析得出获取各个数据的方式
  4. 开工

需要用到的模块(requests 是第三方模块,不知道如何安装的话请百度搜索“python requests模块如何安装”,看其教程即可;re 和 json 是 Python 标准库,无需另外安装)

  1. requests模块
  2. re模块
  3. json模块

进入正题

从步骤3中可以得出,获取全球疫情总人数使用的是GET请求,地址如下:
https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_global_vars&callback
访问后观察里面的数据,显然,返回的是一段json数据,其中data字段本身又是一个json格式的字符串。于是我们可以先利用json模块解析外层数据,再把data里的对象取出来解析一次。
之后就是数据处理了。

代码如下

import requests
import re
import json
# Parallel result columns filled by GetCityData(): position n in every list
# belongs to the same region record.
country, area, city = [], [], []  # country / region ("area" field) / city names
dead, confirm, suspect, heal = [], [], [], []  # death / confirmed / suspected / recovered counts

# Tencent News epidemic-tracker endpoints.
# Overall case totals:
Get_China = r"https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_global_vars&callback"
# Per-region data:
Get_City = r"https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_area_datas&callback"
# Per-region counts:
Get_City_V1 = r"https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_area_counts&callback"
def GetHtmlText(url):
    """Download *url* with an HTTP GET and return the response body as text.

    The request times out after 30 seconds, and a non-2xx status is treated
    as a failure. The body is decoded with the encoding detected from its
    content (``apparent_encoding``) rather than the one in the headers.

    Returns:
        The decoded response text, or the literal string ``"Error"`` when the
        request fails (connection problem, timeout, bad URL, HTTP error).
    """
    try:
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; KeyboardInterrupt
        # and genuine programming errors are no longer silently swallowed.
        return "Error"
# Raw response text for the overall-totals endpoint; GetHtmlText returns the
# string "Error" instead when the request fails.
China = GetHtmlText(Get_China)
def GetTextCenter(Text, TextLeft, TextRight):
    """Return the part of *Text* between *TextLeft* and *TextRight*.

    The first occurrence of *TextLeft* is used, followed by the first
    occurrence of *TextRight* after it.

    Returns:
        The text strictly between the two delimiters, or ``""`` when either
        delimiter cannot be found.
    """
    start = Text.find(TextLeft)
    if start == -1:
        # Without this guard the -1 from find() would be added to
        # len(TextLeft) and produce an arbitrary slice of Text.
        return ""
    start += len(TextLeft)
    end = Text.find(TextRight, start)
    if end == -1:
        # Without this guard Text[:-1] would silently drop the last character.
        return ""
    return Text[start:end]
# Decode the overall-totals response. Its "data" field is a string, so the
# first {...} object is cut out of it with a regex and decoded a second time.
# NOTE: if the download failed, China is "Error" and json.loads raises here.
City_Count_json = json.loads(
    re.findall(r"{[^}]+}", json.loads(China)["data"])[0]
)

# Pull out each summary field as text, ready for string concatenation.
recentTime, confirmCount, suspectCount, deadCount, cure, hintWords = (
    str(City_Count_json[field])
    for field in (
        "recentTime",    # last update time
        "confirmCount",  # confirmed cases
        "suspectCount",  # suspected cases
        "deadCount",     # deaths
        "cure",          # recovered
        "hintWords",     # latest headline
    )
)

print("".join((
    "更新时间:", recentTime, "\n",
    "确诊人数为:", confirmCount, "人\n",
    "死亡人数为:", deadCount, "人\n",
    "疑似人数为:", suspectCount, "人\n",
    "治愈人数为:", cure, "人\n",
    "最新消息:", hintWords, "\n",
)))
# Download the per-region counts, then split the payload's "data" string into
# one raw {...} JSON snippet per record; GetCityData decodes them afterwards.
City_json = GetHtmlText(Get_City_V1)
City_Data = re.findall(r"{[^}]+}", json.loads(City_json)["data"])
def GetCityData(CitysJson):
    """Decode every raw record and append its fields to the module-level lists.

    Each item of *CitysJson* is a JSON object string. Its "country", "area",
    "city", "dead", "confirm", "suspect" and "heal" values are appended, in
    that order, to the module-level lists of the same names.

    Returns:
        The number of records in *CitysJson*.
    """
    total = len(CitysJson)
    # The lists are only mutated, never rebound, so no `global` is required.
    targets = (
        ("country", country),
        ("area", area),
        ("city", city),
        ("dead", dead),
        ("confirm", confirm),
        ("suspect", suspect),
        ("heal", heal),
    )
    for raw_record in CitysJson:
        record = json.loads(raw_record)
        for key, column in targets:
            column.append(record[key])
    return total
# Fill the column lists, then print one line per record. Records outside
# "中国" show only the country; records for "中国" also show the region, plus
# the city when it is not empty.
length = GetCityData(City_Data)
for n in range(length):
    pieces = ["国家:" + str(country[n]) + "\t"]
    if country[n] == "中国":
        pieces.append("地区:" + str(area[n]) + "\t")
        if city[n] != "":
            pieces.append("城市:" + str(city[n]) + "\t")
    # The counts section is the same for every kind of record.
    pieces.append(
        "确诊:" + str(confirm[n]) + "例\t"
        + "治愈:" + str(heal[n]) + "例\t"
        + "死亡:" + str(dead[n]) + "例\n"
    )
    print("".join(pieces))
发布了10 篇原创文章 · 获赞 6 · 访问量 878

猜你喜欢

转载自blog.csdn.net/WildSky_/article/details/104087058