全国地区编码python爬取代码

全国行政区划代码


因为要用地区编码,找了一遍,不是太旧,就是要收费,所以自己去爬一下,需要的可以自取

  • 数据来源 国家统计局
  • 数据范围: 省、市 、区、镇、居委会 5级行政代码,其中四个直辖市是4级
  • 获取数据时间:2023/4/5
  • 数据量:662892条
  • 写入excel容量:19.3M
  • 此版本使用单线程抓取数据,此过程极其耗时,抓取耗时75分钟(此过程可用线程池加速),写入excel 耗时5分钟
# encoding:utf-8
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from openpyxl.workbook import Workbook

'''
存放抓取的数据
'''
class Entry:
    """One scraped administrative-division record.

    Attributes:
        code: 12-digit administrative-division code.
        desc: human-readable name of the division.
        url: relative URL of the division's detail page ('' for leaves).
        parent_code: code of the parent division.
        level: hierarchy depth (1 = province, deeper levels follow).
        arr: child Entry objects collected during the crawl.
    """

    def __init__(self, code: str = None, desc: str = None, url: str = '', parent_code: str = '000000000000',
                 level: int = 1):
        self.code = code
        self.desc = desc
        self.url = url
        self.parent_code = parent_code
        self.level = level
        # Children are always attached after construction by the crawler.
        self.arr = []

    def __repr__(self):
        return f'({self.code} {self.desc} {self.level} {self.url})'

'''
抓取数据,写入excel
'''
class China:
    """Crawl the 2022 administrative-division codes and save them to Excel.

    Fetches every page under the National Bureau of Statistics 2022
    division-code site, builds a tree of :class:`Entry` objects, then
    flattens it into one worksheet with columns
    (code, desc, parent_code, level). All work happens in ``__init__``;
    the crawl is single-threaded and long-running (~75 minutes per the
    author's notes).
    """

    # Root of the 2022 division-code pages; page paths are appended to it.
    _BASE_URL = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/'

    # CSS class of the data rows on a page, keyed by the level of the Entry
    # whose page is being parsed (a level-1 province page lists cities in
    # 'citytr' rows, ..., a level-4 town page lists villages in 'villagetr').
    _ROW_CLASS = {1: 'citytr', 2: 'countytr', 3: 'towntr', 4: 'villagetr'}

    def __init__(self):
        """Scrape everything, then write the workbook (blocking, network I/O)."""
        print('开始抓取数据,当前时间:', datetime.now().strftime('%X'), '\n')

        self.se = requests.Session()

        # The index page lists provinces as <a> tags that carry no class.
        self.province = []
        for a in self.__parse_url('index.html', 'td a:not([class])'):
            # The pages are GBK-encoded but get decoded as Latin-1; the
            # raw_unicode_escape round-trip recovers the original text.
            desc = a.text.encode('raw_unicode_escape').decode()
            url = a['href']
            # Pad the 2-digit province prefix out to the full 12-digit code.
            code = '%d0000000000' % int(url[0:2])
            self.province.append(Entry(code, desc, url, '000000000000', 1))

        # The site lists only the 31 mainland provinces; append the three
        # divisions that have no detail pages of their own.
        if len(self.province) == 31:
            self.province.append(Entry('710000000000', '台湾省', '', '000000000000', 1))
            self.province.append(Entry('810000000000', '香港特别行政区', '', '000000000000', 1))
            self.province.append(Entry('820000000000', '澳门特别行政区', '', '000000000000', 1))

        # NOTE: single-threaded; a thread pool over the provinces would cut
        # the crawl time dramatically.
        for obj in self.province:
            self.__parse_data(obj)

        print('\n抓取数据,完成时间:', datetime.now().strftime('%X'), '\n')
        print(f'读取数据:{read_num}条')

        # Column headers: code, name, parent code, hierarchy level.
        th = ('code', 'desc', 'parent_code', 'level')
        workbook = Workbook()
        sheet = workbook.active
        sheet.title = '汇总数据'
        sheet.append(th)
        self.sheet = sheet

        for data in self.province:
            self.__write(data)

        workbook.save('中华人民共和国行政区划代码.xlsx')

        print('\n写入数据,完成时间:', datetime.now().strftime('%X'), '\n')
        print(f'写入数据:{write_num}条')

    def __write(self, data: Entry):
        """Append *data* and all of its descendants to the worksheet.

        The placeholder rows 市辖区 / 县 that municipalities insert are not
        written; their children are re-parented onto the grandparent and
        keep the grandparent's level, so municipalities end up one level
        shallower than regular provinces.
        """
        global write_num

        if data.desc == '市辖区' or data.desc == '县':
            # Skip the synthetic row: hoist its children one level up.
            for child in data.arr:
                child.parent_code = data.parent_code
                child.level = data.level
                self.__write(child)
            return

        write_num += 1
        self.sheet.append((data.code, data.desc, data.parent_code, data.level))

        for child in data.arr:
            child.level = data.level + 1
            self.__write(child)

    def __parse_url(self, index, selector):
        """Fetch page *index* under the base URL and return the elements
        matching the CSS *selector*.
        """
        url = f'{self._BASE_URL}{index}'
        # Finite timeout so one stalled connection cannot hang the whole
        # crawl (the original used timeout=None, i.e. wait forever).
        html = self.se.get(url, timeout=60)
        soup = BeautifulSoup(html.text, "lxml")
        return soup.select(selector)

    def __parse_data(self, obj: Entry):
        """Recursively scrape the children of *obj* into ``obj.arr``."""
        global read_num

        index = obj.url
        if index == '':
            # Leaf entry, or a division without a detail page.
            return

        level = obj.level
        key = self._ROW_CLASS.get(level, '')

        arr = obj.arr
        parent_code = obj.code

        for tr in self.__parse_url(index, f'tr[class={key}]'):
            read_num += 1

            a_arr = tr.select('a')
            if not a_arr:
                # No links: a leaf row. Two-cell rows are (code, name);
                # rows with more cells appear to carry an extra
                # classification code before the name — per site layout.
                tds = tr.select('td')
                code = tds[0].text
                url = ''
                if len(tds) == 2:
                    desc = tds[1].text.encode('raw_unicode_escape').decode()
                else:
                    desc = tds[2].text.encode('raw_unicode_escape').decode()
            else:
                a = a_arr[0]
                code = a.text
                url = a['href']
                desc = a_arr[1].text.encode('raw_unicode_escape').decode()

            # A 市辖区 placeholder without its own page carries no data.
            if desc == '市辖区' and url == '':
                continue

            if url != '':
                # Hrefs are relative to the current page's directory; prefix
                # the province (and city) directory segments so the next
                # fetch resolves from the base URL.
                pre = url[url.index('/') + 1:]
                if level == 2:
                    url = f'{pre[0:2]}/{url}'
                if level == 3:
                    url = f'{pre[0:2]}/{pre[2:4]}/{url}'

            arr.append(Entry(code, desc, url, parent_code, level + 1))

        # Depth-first: descend into every child just collected.
        for child in arr:
            self.__parse_data(child)

# Module-level crawl counters, incremented by China.__parse_data (rows read
# from the site) and China.__write (rows written to the workbook).
read_num=0
write_num=0

# Running the script performs the whole crawl + export in China.__init__.
if __name__ == '__main__':
    China()


猜你喜欢

转载自blog.csdn.net/weixin_46488959/article/details/129975953
今日推荐