Data crawler for provinces, cities and counties nationwide

The project needed data for every province, city and county in the country. What I could find online was either outdated or behind a paywall, so I spent some time writing a crawler and scraped the basic data myself, which was enough for my purposes. The data comes from the National Bureau of Statistics and is currently updated to 2019. The code is as follows:

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import re
import time


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/67.0.3396.99 Safari/537.36'}


def get_page(url):
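    # Fetch a page with a small retry budget; return the HTML text, or '' on failure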
    try:
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=2))      # number of retries
        r = s.get(url, headers=headers, timeout=3)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        return ''


# Parse province-level data; returns (province link, id, name)
def parse_province(page):
    provinces = []
    id_pattern = r'(.*)\.html'       # used to extract the id from the link
    soup = BeautifulSoup(page, 'lxml')

    province_trs = soup.find_all(class_='provincetr')
    # Some tags are empty, so several checks are needed
    for tr in province_trs:
        if tr:
            province_items = tr.find_all(name='td')
            for item in province_items:
                if item:
                    a = item.find(name='a')
                    if a:
                        next_url = a.get('href')
                        id_ = re.search(id_pattern, next_url).group(1)
                        id_ += '0' * (12 - len(id_))        # the province id only gives the leading digits; pad it to 12 digits
                        name = a.text
                        provinces.append((next_url, id_, name))
    return provinces


# Parse city-level data; returns (city link, id, name)
def parse_city(page):
    cities = []
    soup = BeautifulSoup(page, 'lxml')

    city_trs = soup.find_all(class_='citytr')
    for tr in city_trs:
        if tr:
            tds = tr.find_all(name='td')
            next_url = tds[0].find(name='a').get('href')
            id_ = tds[0].text
            name = tds[1].text
            cities.append((next_url, id_, name))
    return cities


# Parse county-level data; the next level down is not needed, so to keep the format consistent this returns (None, id, name)
def parse_district(page):
    districts = []
    soup = BeautifulSoup(page, 'lxml')

    district_trs = soup.find_all(class_='countytr')
    for tr in district_trs:
        if tr:
            tds = tr.find_all(name='td')
            id_ = tds[0].text
            name = tds[1].text
            districts.append((None, id_, name))
    return districts


# Write a batch of records to the given file
def write_lines(file, mode, data_list, parent_id, level):
    with open(file, mode, encoding='utf-8') as f:
        for data in data_list:
            f.write(data[1] + '\t' + data[2] + '\t' + parent_id + '\t' + level + '\n')


if __name__ == "__main__":
    root = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    province_list = []
    city_list = []

    # Crawl province-level data
    province_page = get_page(root)
    if province_page:
        province_list = parse_province(province_page)
        if province_list:
            write_lines('areas.txt', 'w', province_list, '0', '1')
            print('='*20)
            print('Province data written')
            print('='*20)
        else:
            print('Province list is empty')
    else:
        print('Failed to fetch the province page')

    # Crawl city-level data
    if province_list:
        for province in province_list:
            province_url = root + province[0]
            province_id = province[1]
            province_name = province[2]

            city_page = get_page(province_url)
            if city_page:
                city_list_tmp = parse_city(city_page)
                if city_list_tmp:
                    city_list += city_list_tmp
                    write_lines('areas.txt', 'a', city_list_tmp, province_id, '2')
                    print(province_name + ': city-level data written')
                    time.sleep(1)
                else:
                    print(province_name + ': city-level list is empty')
            else:
                print(province_name + ': failed to fetch the city-level page')
    print('='*20)
    print('City-level data written')
    print('='*20)

    # Crawl county-level data
    if city_list:
        for city in city_list:
            city_url = root + city[0]
            city_id = city[1]
            city_name = city[2]

            district_page = get_page(city_url)
            if district_page:
                district_list = parse_district(district_page)
                if district_list:
                    write_lines('areas.txt', 'a', district_list, city_id, '3')
                    print(city_name + ': county data written')
                    time.sleep(1)
                else:
                    print(city_name + ': county list is empty')
            else:
                print(city_name + ': failed to fetch the county page')
    print('='*20)
    print('County data written')
    print('='*20)

The final data format is (id, name, parent id, level), where levels 1, 2 and 3 denote province, city and county respectively:
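For illustration, the first few lines of areas.txt should look roughly like the sample below (tab-separated; the codes and names are an assumed example of the format the script writes, not verified output):

110000000000	北京市	0	1
110100000000	市辖区	110000000000	2
110101000000	东城区	110100000000	3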


In total this produces more than 3,600 records.
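If you want the hierarchy in memory rather than as a flat file, here is a minimal sketch (not part of the original script) that reads areas.txt back, assuming the tab-separated layout above:

from collections import defaultdict

# areas.txt layout: id \t name \t parent_id \t level
areas = {}                       # id -> (name, parent_id, level)
children = defaultdict(list)     # parent_id -> list of child ids

with open('areas.txt', encoding='utf-8') as f:
    for line in f:
        id_, name, parent_id, level = line.rstrip('\n').split('\t')
        areas[id_] = (name, parent_id, level)
        children[parent_id].append(id_)

# Example: print each province followed by its cities
for province_id in children['0']:
    print(areas[province_id][0])
    for city_id in children[province_id]:
        print('    ' + areas[city_id][0])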
