二月九号博客

今天爬取全国新型冠状病毒感染的肺炎疫情数据

设计思路

1,获取网页数据

2,提取想要的内容

3,储存到excel表格中

代码设计

  1 #_*_coding:utf-8_*_
  2 # 爬取新型冠状病毒肺炎疫情
import re

import requests
import xlwt
  6 
  7 
  8 def get_info(txt2_3):
  9     Newly_confirmed_cases = re.findall('.*?新增确诊病例(.*?)例', txt2_3, re.S)
 10     print("新增确诊病例", Newly_confirmed_cases[0])
 11 
 12     New_severe_cases = re.findall('.*?新增重症病例(.*?)例', txt2_3, re.S)
 13     print("新增重症病例", New_severe_cases[0])
 14 
 15     New_deaths = re.findall('.*?新增死亡病例(.*?)例', txt2_3, re.S)
 16     print("新增死亡病例", New_deaths[0])
 17 
 18     New_cured_discharge_cases = re.findall('.*?新增治愈出院病例(.*?)例', txt2_3, re.S)
 19     if New_cured_discharge_cases:
 20         print("新增治愈出院病例", New_cured_discharge_cases[0])
 21     else:
 22         New_cured_discharge_cases = re.findall('.*?新增治愈出院(.*?)例', txt2_3, re.S)
 23         print("新增治愈出院病例", New_cured_discharge_cases[0])
 24 
 25     New_suspected_cases = re.findall('.*?新增疑似病例(.*?)例', txt2_3, re.S)
 26     print("新增疑似病例", New_suspected_cases[0])
 27 
 28     Cumulative_reported_confirmed_cases = re.findall('.*?累计报告确诊病例(.*?)例', txt2_3, re.S)
 29     print("累计报告确诊病例", Cumulative_reported_confirmed_cases[0])
 30 
 31     Severe_cases = re.findall('.*?重症病例(.*?)例', txt2_3, re.S)
 32     print("重症病例", Severe_cases[1])
 33 
 34     Cumulative_deaths = re.findall('.*?累计死亡病例(.*?)例', txt2_3, re.S)
 35     if Cumulative_deaths:
 36         print("累计死亡病例", Cumulative_deaths[0])
 37     else :
 38         Cumulative_deaths = re.findall('.*?累计死亡(.*?)例', txt2_3, re.S)
 39         print("累计死亡病例", Cumulative_deaths[0])
 40 
 41     Accumulative_cured_discharged_cases = re.findall('.*?累计治愈出院病例(.*?)例', txt2_3, re.S)
 42     if Accumulative_cured_discharged_cases:
 43         print("累计治愈出院病例", Accumulative_cured_discharged_cases[0])
 44     else:
 45         Accumulative_cured_discharged_cases = re.findall('.*?累计治愈出院(.*?)例', txt2_3, re.S)
 46         print("累计治愈出院病例", Accumulative_cured_discharged_cases[0])
 47 
 48     Cumulative_suspected_cases = re.findall('.*?有疑似病例(.*?)例', txt2_3, re.S)
 49     print("累计疑似病例", Cumulative_suspected_cases[0])
 50 
 51     Close_contacts = re.findall('.*?追踪到密切接触者(.*?)人', txt2_3, re.S)
 52     print("累计追踪到密切接触者", Close_contacts[0])
 53 
 54     Under_medical_observation = re.findall('.*?人,现有(.*?)人正在接受医学观察', txt2_3, re.S)
 55     if Under_medical_observation:
 56         print("正在接受医学观察人数", Under_medical_observation[0])
 57     else:
 58         Under_medical_observation = re.findall('.*?人,共有(.*?)人正在接受医学观察', txt2_3, re.S)
 59         if Under_medical_observation:
 60             print("正在接受医学观察人数", Under_medical_observation[0])
 61         else:
 62             Under_medical_observation = re.findall('.*?尚在医学观察的密切接触者(.*?)人', txt2_3, re.S)
 63             print("正在接受医学观察人数", Under_medical_observation[0])
 64     day = re.findall('(.*?)0—24时',txt2_3,re.S)
 65     if day:
 66         print("日期", day[0])
 67     else :
 68         day = re.findall('(.*?)0-24时', txt2_3, re.S)
 69         print("日期", day[0])
 70 
 71     data = {}
 72     data['Newly_confirmed_cases'] = Newly_confirmed_cases[0]
 73     data['New_severe_cases'] = New_severe_cases[0]
 74     data['New_deaths'] = New_deaths[0]
 75     data['New_cured_discharge_cases'] = New_cured_discharge_cases[0]
 76     data['New_suspected_cases'] = New_suspected_cases[0]
 77     data['Cumulative_reported_confirmed_cases'] = Cumulative_reported_confirmed_cases[0]
 78     data['Severe_cases'] = Severe_cases[1]
 79     data['Cumulative_deaths'] = Cumulative_deaths[0]
 80     data['Accumulative_cured_discharged_cases'] = Accumulative_cured_discharged_cases[0]
 81     data['Cumulative_suspected_cases'] = Cumulative_suspected_cases[0]
 82     data['Close_contacts'] = Close_contacts[0]
 83     data['Under_medical_observation'] = Under_medical_observation[0]
 84     data['day'] = day[0]
 85     return data
 86 
 87 def write_TXT():
 88     DATA = []
 89     f = open('virus.txt', 'r', encoding='utf-8')
 90     edges = [line.strip('————') for line in f]
 91     for i in range(len(edges)):
 92         edges[i] = edges[i].replace('\n', '')
 93     contents = []
 94     content = ''
 95     for item in edges:
 96         if item:
 97             content += item
 98         else:
 99             contents.append(content)
100             content = ''
101             continue
102     contents.append(content)
103     for item in contents:
104         datas = get_info(item)
105         DATA.append(datas)  # 将所有的数据添加到DATA里
106 
107 
108     f = xlwt.Workbook(encoding='utf-8')
109     sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
110     sheet01.write(0, 0, '新增确诊病例')  # 第一行第一列
111     sheet01.write(0, 1, '新增重症病例')
112     sheet01.write(0, 2, '新增死亡病例')
113     sheet01.write(0, 3, '新增治愈出院病例')
114     sheet01.write(0, 4, '新增疑似病例')
115     sheet01.write(0, 5, '累计报告确诊病例')
116     sheet01.write(0, 6, '重症病例')
117     sheet01.write(0, 7, '累计死亡病例')
118     sheet01.write(0, 8, '累计治愈出院病例')
119     sheet01.write(0, 9, '累计疑似病例')
120     sheet01.write(0, 10, '累计追踪到密切接触者')
121     sheet01.write(0, 11, '正在接受医学观察人数')
122     sheet01.write(0, 12, '日期')
123     # 写内容
124     for i in range(len(DATA)):
125         sheet01.write(i + 1, 0, DATA[i]['Newly_confirmed_cases'])
126         sheet01.write(i + 1, 1, DATA[i]['New_severe_cases'])
127         sheet01.write(i + 1, 2, DATA[i]['New_deaths'])
128         sheet01.write(i + 1, 3, DATA[i]['New_cured_discharge_cases'])
129         sheet01.write(i + 1, 4, DATA[i]['New_suspected_cases'])
130         sheet01.write(i + 1, 5, DATA[i]['Cumulative_reported_confirmed_cases'])
131         sheet01.write(i + 1, 6, DATA[i]['Severe_cases'])
132         sheet01.write(i + 1, 7, DATA[i]['Cumulative_deaths'])
133         sheet01.write(i + 1, 8, DATA[i]['Accumulative_cured_discharged_cases'])
134         sheet01.write(i + 1, 9, DATA[i]['Cumulative_suspected_cases'])
135         sheet01.write(i + 1, 10, DATA[i]['Close_contacts'])
136         sheet01.write(i + 1, 11, DATA[i]['Under_medical_observation'])
137         sheet01.write(i + 1, 12, DATA[i]['day'])
138         print('p', end='')
139     f.save('D:\\爬虫数据\\新型冠状病毒感染的肺炎疫情数据.xls')
140 
# Script entry point: build the spreadsheet only when this file is run
# directly, not when it is imported.
if __name__ == '__main__':
    write_TXT()

 遇见的问题:

  一,获取不了国家卫健委网页数据

    解决办法:手动打开网页复制网页数据储存在文本中

  二,要提取的文本并没有完全固定的格式,表述会有细微的变化。比如表示累计出院病例的数量时,可能写作“累计治愈出院病例xxx例”,也可能写作“累计治愈出院xxx例”;除此之外,各项内容出现的顺序也会变化,比如新增重症人数可能排在新增治愈人数之前,也可能排在其后。所以用一个固定的正则表达式一次性提取全部数据的传统方法就不可取了。

    解决办法:对每一项要提取的信息单独进行匹配,对可能出现多种表达方式的信息,依次尝试多种匹配模式。

猜你喜欢

转载自www.cnblogs.com/goubb/p/12287182.html