# Python crawler for administrative division code tables.
#
# Crawls the administrative division code table for Ningxia from the
# National Bureau of Statistics website (can also crawl the whole country).

# Crawls the National Bureau of Statistics' national administrative division data.
# Note the two `initurl` assignments below: the URL ending in 64.html crawls the
# Ningxia division table; the commented-out index.html URL crawls the whole country.
#

import requests
import re
import xlsxwriter
import time
# Record the start time so the script can report elapsed time at the end.
time_start = time.time()

# Browser-like User-Agent header so the stats site accepts our requests.
agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}

# Number of leading code digits that belong to each level (province..village):
# levels 1-3 use 2/4/6 digits, levels 4-5 use 9/12 digits.
choose_ls = []
for lvl in range(1, 6):
    choose_ls.append(lvl * 2 if lvl <= 3 else 3 * (lvl - 1))

# CSS class names that mark each level's table in the page HTML.
match_level = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']

# Starting page: Ningxia (code 64). Swap in the commented URL to crawl the whole country.
initurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/64.html'
#initurl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'

total_dict = {}  # accumulated results: full code path -> full name path

depth = 0  # current crawl level (0-based index into match_level)

# Pages to visit at the current level: url -> (code prefix, name prefix).
each_root = {initurl: ('', '')}

max_depth = 5  # 1-5 = province / prefecture / county / township / village
while depth<max_depth:
    total_count=0
    # URLs discovered at this level, to be crawled on the next pass.
    next_root={
    
    }
    for url in each_root:
        # Prefixes accumulated from parent levels; '-' separates levels.
        code_join=each_root[url][0]+'-' if depth!=0 else each_root[url][0]
        zone_join=each_root[url][1]+'-' if depth!=0 else each_root[url][1]
        # Directory of the current page; child hrefs are relative to it.
        change_root='/'.join(url.split('/')[:-1])+'/'
        # Retry forever until the page yields a recognizable division table.
        while True:
            try:
                req=requests.get(url,headers=agent)
                req.encoding='GBK'# pages are GBK-encoded Chinese
                text=req.text
                # Flatten newlines so the regexes below can match across line breaks.
                text=text.replace('\n','\\n')
                special_sigh=False
                if match_level[depth] in text:
                    # Normal case: the page contains the table for the current depth.
                    match_text=re.findall(r"class='%s'>(.*?)</table"%match_level[depth],text)[0]
                    break
                else:
                    search=False
                    # Dongguan, Zhongshan and Danzhou skip the county level, so look for
                    # a deeper-level table and re-queue the page for the next pass.
                    for level in range(depth,5):
                        if match_level[level] in text:
                            match_text=re.findall(r"class='%s'>(.*?)</table"%match_level[level],text)[0]
                            search=True
                            special_sigh=True
                            print('特殊区划:%s'%each_root[url][1])
                            break
                    if search:
                        break
                    else:
                        # No table found at all: assume a throttled response and retry.
                        print('服务器繁忙')
                        time.sleep(2)
            except:
                # Network error or parse failure: wait and retry the same URL.
                print('服务器繁忙')
                time.sleep(2)
        if special_sigh:
            # This level is missing here; carry the prefixes forward unchanged.
            next_root[url]=(code_join,zone_join)
        else:
            if depth!=0:
                # Rows with links: (href, code, name) -> there are more pages below.
                has_tree=re.findall(r"href='(.*?)'>(\d+?)<.*?html'>(.*?)</a></td></tr>",match_text)
            else:
                # Province rows only carry a name; derive the code from the href stem.
                base_tree=re.findall(r"href='(.*?)'>(.*?)<br/",match_text)
                has_tree=[(each[0],each[0].split('.html')[0],each[1]) for each in base_tree]
            # Leaf rows without links: (code, name) pairs that are final entries.
            base_no=re.findall(r"td>(\d+?)</td><td>(.*?)</td></tr>",match_text)
            no_tree=[(each[0],re.findall(r'<td>(.+)',each[1])[0] if 'td' in each[1] else each[1]) for each in base_no]
            for each in has_tree:
                each_dir=change_root+each[0]
                # Queue the child page; keep only this level's digits of the code.
                next_root[each_dir]=(code_join+each[1][:choose_ls[depth]],zone_join+each[2])
                if depth==3:
                    # Township level is large: print progress every 100 entries.
                    if (total_count+1)%100==0:
                        print('已爬取%d个,在路径%s处'%(total_count+1,zone_join+each[2]))
                else:
                    print('在路径%s处'%(zone_join+each[2]))
            if no_tree:
                for each in no_tree:
                    # Leaf entry: record full code path -> full name path.
                    total_dict[code_join+each[0][:choose_ls[depth]]]=zone_join+each[1]
                    if depth==4:
                        # Village level is huge: print progress every 800 entries.
                        if (total_count+1)%800==0:
                            print('已爬取%d个,在路径%s处'%(total_count+1,zone_join+each[1]))
                    else:
                        print('已获取路径%s'%(zone_join+each[1]))
        total_count+=1
    depth+=1
    each_root=next_root
def decompose(each, data=None, levels=None):
    """Flatten one crawled entry into [code1, name1, code2, name2, ...].

    Parameters
    ----------
    each : str
        Key in *data*: either a '-'-joined code path (leaf entry), or a URL
        whose stored value is a (code_path, name_path) tuple for pages that
        were never expanded.
    data : dict, optional
        Mapping to read the entry from; defaults to the module-level
        ``total_dict`` (backward compatible with the original call sites).
    levels : int, optional
        Number of (code, name) pairs to pad the row to; defaults to the
        module-level ``depth``.

    Returns
    -------
    list of str
        Codes and names interleaved, padded with '' up to *levels* pairs.
    """
    if data is None:
        data = total_dict
    if levels is None:
        levels = depth
    value = data[each]
    if isinstance(value, tuple):
        # Unexpanded page: the stored value already holds both paths.
        codelist = value[0].split('-')
        namelist = value[1].split('-')
    else:
        # Leaf entry: the key is the code path, the value is the name path.
        codelist = each.split('-')
        namelist = value.split('-')
    # Pad short paths so every output row has the same number of columns.
    pad = levels - len(codelist)
    if pad > 0:
        codelist += [''] * pad
        namelist += [''] * pad
    return [item for pair in zip(codelist, namelist) for item in pair]
# Column header labels: one (code, name) pair of headers per crawled level.
sort_name=['省级','地级','县级','乡级','村级']
real_column=[(sort_name[each]+'代码',sort_name[each]+'名称') for each in range(depth)]
flat_col=[i for each in real_column for i in each]
# Merge pages that were never expanded (special divisions / depth cutoff)
# into the results; their values are (code_path, name_path) tuples.
total_dict.update(each_root)
if depth<=3:# county level and above is small (~3k rows), so Excel is convenient
    wk=xlsxwriter.Workbook('五级联动.xlsx')
    sh=wk.add_worksheet('sheet1')
    # Header row.
    for each in range(2*depth):
        sh.write(0,each,flat_col[each])
    totalrow=1
    # One row per entry, flattened to interleaved code/name columns.
    for each in total_dict:
        flatlist=decompose(each)
        for i in range(2*depth):
            sh.write(totalrow,i,flatlist[i])
        totalrow+=1
    wk.close()
else:# below county level the data is large; CSV is faster to write than Excel
    book=open('五级联动.csv','w',encoding='utf-8')
    book.write(','.join(flat_col)+'\n')
    for each in total_dict:
        flatten=decompose(each)
        book.write(','.join(flatten)+'\n')
    book.close()
time_end=time.time()
rest_second=time_end-time_start
print('用时%d分%d秒'%divmod(rest_second,60))

# Origin: blog.csdn.net/weixin_43401243/article/details/119104696