#爬虫国家统计局全国行政区划数据
#注意第13、14行,例如第13行URL为宁夏,则爬虫宁夏行政区划区划表,14行为全国区划代码表。
import re
import time

import requests
import xlsxwriter
time_start = time.time()

# Browser-like request header so stats.gov.cn serves the pages normally.
agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

# Code-prefix length kept at each level: [2, 4, 6, 9, 12] for
# province / city / county / town / village respectively.
choose_ls = [depth * 2 if depth <= 3 else 3 * (depth - 1) for depth in range(1, 6)]

# CSS class marking each level's table rows on the division pages.
match_level = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']

# Start URL: the 2020 region page 64.html (per the original note, Ningxia/
# Yinchuan); swap in the commented index.html line to crawl the whole country.
initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/64.html'
# initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'

total_dict = {}  # finished leaf rows: 'code-code-...' -> 'name-name-...'
depth = 0        # level currently being crawled (0 = pages in each_root below)
each_root = {
    initurl: ('', '')}  # pages for this round: url -> (code prefix, name prefix)
max_depth = 5  # 可选: levels to crawl, 1-5 = province, city, county, town, village

# Breadth-first crawl: each_root holds this level's pages, next_root collects
# the next level's pages, and leaf rows (code -> name) accumulate in total_dict.
while depth < max_depth:
    total_count = 0
    next_root = {}
    for url in each_root:
        # Prefixes inherited from the parent page; '-' separates the levels.
        code_join = each_root[url][0] + '-' if depth != 0 else each_root[url][0]
        zone_join = each_root[url][1] + '-' if depth != 0 else each_root[url][1]
        # Links on the page are relative to the page's own directory.
        change_root = '/'.join(url.split('/')[:-1]) + '/'
        while True:  # retry until the page yields a usable table
            try:
                req = requests.get(url, headers=agent)
                req.encoding = 'GBK'  # the site serves GBK-encoded Chinese text
                text = req.text
                # Escape raw newlines so the regexes below can match across lines.
                text = text.replace('\n', '\\n')
                special_sigh = False
                if match_level[depth] in text:
                    match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[depth], text)[0]
                    break
                else:
                    # 东莞、中山、儋州 (Dongguan/Zhongshan/Danzhou) have no county
                    # level: look for a deeper table class and requeue the page.
                    search = False
                    for level in range(depth, 5):
                        if match_level[level] in text:
                            match_text = re.findall(r"class='%s'>(.*?)</table" % match_level[level], text)[0]
                            search = True
                            special_sigh = True
                            print('特殊区划:%s' % each_root[url][1])
                            break
                    if search:
                        break
                    else:
                        print('服务器繁忙')
                        time.sleep(2)
            except Exception:  # network/parse hiccup: back off and retry
                print('服务器繁忙')
                time.sleep(2)
        if special_sigh:
            # The expected level is absent here; carry the page unchanged
            # into the next round so the deeper table is parsed then.
            next_root[url] = (code_join, zone_join)
        else:
            if depth != 0:
                # Linked rows: (href, code, name).
                has_tree = re.findall(r"href='(.*?)'>(\d+?)<.*?html'>(.*?)</a></td></tr>", match_text)
            else:
                # Top-level page lists only (href, name); derive the code from the href.
                base_tree = re.findall(r"href='(.*?)'>(.*?)<br/", match_text)
                has_tree = [(each[0], each[0].split('.html')[0], each[1]) for each in base_tree]
            # Link-less rows are leaves: (code, name), name possibly still wrapped in <td>.
            base_no = re.findall(r"td>(\d+?)</td><td>(.*?)</td></tr>", match_text)
            no_tree = [(each[0], re.findall(r'<td>(.+)', each[1])[0] if 'td' in each[1] else each[1]) for each in base_no]
            for each in has_tree:
                each_dir = change_root + each[0]
                next_root[each_dir] = (code_join + each[1][:choose_ls[depth]], zone_join + each[2])
                if depth == 3:
                    # Town level is chatty: only report every 100th entry.
                    if (total_count + 1) % 100 == 0:
                        print('已爬取%d个,在路径%s处' % (total_count + 1, zone_join + each[2]))
                else:
                    print('在路径%s处' % (zone_join + each[2]))
            if no_tree:
                for each in no_tree:
                    total_dict[code_join + each[0][:choose_ls[depth]]] = zone_join + each[1]
                    if depth == 4:
                        # Village level: only report every 800th entry.
                        if (total_count + 1) % 800 == 0:
                            print('已爬取%d个,在路径%s处' % (total_count + 1, zone_join + each[1]))
                    else:
                        print('已获取路径%s' % (zone_join + each[1]))
                    # NOTE(review): the mangled source shows the counter only in
                    # this leaf loop, yet the depth==3 progress check above also
                    # reads it — confirm placement against the original layout.
                    total_count += 1
    depth += 1
    each_root = next_root
def decompose(each):
    """Flatten one total_dict entry into [code, name, code, name, ...].

    Finished leaves are stored as key 'c1-c2-...' -> value 'n1-n2-...';
    unfinished pages (merged in from each_root) are key url ->
    (code chain, name chain) tuple. Either way both chains are split on
    '-', padded with empty strings up to the global `depth`, and the
    (code, name) pairs are interleaved into one flat row.
    """
    entry = total_dict[each]
    if isinstance(entry, tuple):  # unfinished page: value holds both chains
        codelist = entry[0].split('-')
        namelist = entry[1].split('-')
    else:  # finished leaf: key holds the codes, value holds the names
        codelist = each.split('-')
        namelist = entry.split('-')
    # Pad short chains so every row has exactly `depth` (code, name) pairs.
    while len(codelist) < depth:
        codelist.append('')
        namelist.append('')
    return [field for pair in zip(codelist, namelist) for field in pair]
# Column headers: one (code, name) pair per crawled level.
sort_name = ['省级', '地级', '县级', '乡级', '村级']
real_column = [(sort_name[each] + '代码', sort_name[each] + '名称') for each in range(depth)]
flat_col = [i for each in real_column for i in each]

# Fold any still-unexpanded pages (tuple entries) into the result set.
total_dict.update(each_root)

if depth <= 3:
    # County level and above is small (~3k rows): an Excel workbook is handy.
    wk = xlsxwriter.Workbook('五级联动.xlsx')
    sh = wk.add_worksheet('sheet1')
    for col, header in enumerate(flat_col):
        sh.write(0, col, header)
    totalrow = 1
    for each in total_dict:
        flatlist = decompose(each)
        for i in range(2 * depth):
            sh.write(totalrow, i, flatlist[i])
        totalrow += 1
    wk.close()
else:
    # Below county level the data is large; plain CSV is the better fit.
    # `with` guarantees the handle is closed even if a write fails.
    with open('五级联动.csv', 'w', encoding='utf-8') as book:
        book.write(','.join(flat_col) + '\n')
        for each in total_dict:
            book.write(','.join(decompose(each)) + '\n')
# Report total elapsed wall-clock time as minutes and seconds.
time_end = time.time()
rest_second = time_end - time_start
print('用时%d分%d秒' % divmod(rest_second, 60))