版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_34884302/article/details/85986332
最近项目里需要省市区村的数据,网上找了很多方法,都没有如意的,有的老数据竟然还要钱,要积分的我也还认可,我在网上查了下,参考了这位老兄的博客,自己又动手把第五级村级行政加了上去。下面请看大屏幕,我要划重点了:
# -*- coding: utf-8 -*-
import re
import requests
import time
import operator
from functools import reduce
save_route = 'E://China_Province_2017_test.txt' #数据储存路径
results2 = []
results3 = []
results4 = []
results5 = []
Dates1 = []
kv = {'user-agent': 'Mozilla/5.0'}
n = 0
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
r = requests.get(url, headers=kv)
r.raise_for_status()
r.encoding = r.apparent_encoding
pattern = re.compile("<a href='(.*?)'>(.*?)<") # 正则表达式
result1 = list(set(re.findall(pattern, r.text))) # 从主页面获取子页面的html
print('result1')
#print(result1)
i2 = 0
for i2 in range(len(result1)):
try:
url2a = result1[i2][0]
address1 = result1[i2][1] # 一级地址
i2 += 1
url2 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + url2a
#http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/44.html
#print(url2)
#print(address1)
r2 = requests.get(url2, headers=kv)
r2.raise_for_status()
r2.encoding = r2.apparent_encoding
pattern2 = re.compile("<a href='(.*?)'>(.*?)<") # 正则表达式提取目标字段
result2 = list(set(re.findall(pattern2, r2.text)))
#print(result2)
result2a = []
for i2a in result2: # 爬取的城市信息和城市代码混在一起,需要将代码清除
if '0' in i2a[1]:
n += 1
else:
result2a.append(i2a)
print('result2a')
# print(result2a)
except:
print('错误')
with open(save_route, 'a', encoding='utf-8')as f:
f.write('一级错误 一级错误 一级错误 一级错误')
f.write('\n')
f.close()
time.sleep(10)
continue
i3 = 0
for i3 in range(len(result2a)):
try:
url3a = result2a[i3][0]
address2 = result2a[i3][1] #二级地址
i3 += 1
url3 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + url3a
#http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/34/3401.html
r3 = requests.get(url3, headers=kv)
r3.raise_for_status()
r3.encoding = r3.apparent_encoding
pattern3 = re.compile("<a href='(.*?)'>(.*?)<")
result3 = list(set(re.findall(pattern3, r3.text)))
#print(result3)
result3a = []
for i3a in result3:
if '0' in i3a[1]:
n += 1
else:
result3a.append(i3a)
print('result3a')
except:
print('错误')
with open(save_route, 'a', encoding='utf-8')as f:
f.write('二级错误 二级错误 二级错误 二级错误')
f.write('\n')
f.close()
time.sleep(10)
continue
#print(result3a)
i4 = 0
for i4 in range(len(result3a)):
try:
url4a = result3a[i4][0]
address3 = result3a[i4][1] # 二级地址
i4 += 1
url4 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + url4a[3:5] + '/' + url4a
#print(url4)
#http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/41/04/410481.html
r4 = requests.get(url4, headers=kv)
r4.raise_for_status()
r4.encoding = r4.apparent_encoding
pattern4 = re.compile("<a href='(.*?)'>(.*?)<")
result4 = list(set(re.findall(pattern3, r4.text)))
#print(result4)
result4a = []
for i4a in result4:
if '0' in i4a[1]:
n += 1
else:
result4a.append(i4a)
print('result4a')
except:
print('错误')
with open(save_route, 'a', encoding='utf-8')as f:
f.write('三级错误 三级错误 三级错误 三级错误')
f.write('\n')
f.close()
time.sleep(10)
continue
#print(result4a)
i5 = 0
for i5 in range(len(result4a)):
try:
url5a = result4a[i5][0]
address4 = result4a[i5][1] # 二级地址
i5 += 1
url5 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + url5a[3:5] + '/' + url5a[5:7] + '/' + url5a
#http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/41/04/410481.html
r5 = requests.get(url5, headers=kv)
r5.raise_for_status()
r5.encoding = r5.apparent_encoding
pattern5 = re.compile("<tr class='villagetr'><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>")
result5 = list(set(re.findall(pattern5, r5.text)))
#print(result4)
print(len (result5))
result5a = []
for i in range(len(result5)):
cun=result5[i][2]
result5a.append(cun)
print('result5a')
except:
print('错误')
with open(save_route, 'a', encoding='utf-8')as f:
f.write('四级错误 四级错误 四级错误 四级错误')
f.write('\n')
f.close()
time.sleep(10)
continue
#print(result5a)
results6 = []
i6 = 0
for i6 in range(len(result5a)):
try:
address5 = result5a[i6]
i6 += 1
address = str(address1) + ',' + str(address2) + ',' + str(address3) + ',' + str(address4)+ ',' + str(address5)
print(address)
with open(save_route, 'a', encoding='utf-8')as f:
f.write(address)
f.write('\n')
f.close()
except:
print('错误')
with open(save_route, 'a', encoding='utf-8')as f:
f.write('五级错误 五级错误 五级错误 五级错误')
f.write('\n')
f.close()
time.sleep(10)
continue
print('well_done')
在获取五级村级机构的时候遇到了困难:如果按照 我是酱油谁要打我 老哥的语法,肯定是获取不到的。首先是 url,这里我们可以打个 print 输出,看下上级数组传来的值是多少,然后截取拼接一下即可。接着就是取村级的值了,只好去查看五级机构的页面元素,发现不再是 href,都是直接的文字了。然而有个共同点,就是每行都带有 <tr class='villagetr'></tr>,此处引起极度舒适。这样取到的 list 是一个每项含三个元素的集合,我们要取第三列,所以就用了 result5[i][2]。后面的事情就理所当然了。
总之想取哪些数据自己扩展就行了,我用的是python3.6,安装时要自定义安装,把pip安装上,获取包会方便些,这里百度吧。嘿嘿,需要最新的就改下网址就行了。当然是官网不改变写法的前提。
数据有点多,66万条,有需要的可以联系我QQ:1244500959,我是新手欢迎交流
附上参考 我是酱油谁要打我 的链接:https://blog.csdn.net/MG_ApinG/article/details/82992913