2020 年省份数据拉取

前言:

参考文章:

https://www.cnblogs.com/yangzhilong/p/3530700.html

https://www.cnblogs.com/liushaofeng89/p/4873086.html

最近因为用户反馈省份数据表单有部分缺失,百度了一圈以后决定还是自己拉取一下。省份数据来源于国家统计局,笔者拉取的是2019年的数据(2020-02-25 的版本)。

省份数据来源:国家统计局

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

笔者用的是 Java 中的 jsoup,关于 jsoup 的用法,可参考下面这篇文章:https://www.open-open.com/jsoup/

开始

1.准备一张表 region_directory

-- Region lookup table. Each row is one region; `pid` holds the `id` of the parent
-- row, so the rows form a tree.
-- NOTE(review): `name_CN` is commented as the English name of the region although the
-- column suffix is "CN" -- confirm which language this column is meant to hold.
CREATE TABLE `region_directory` (
  `id` int(32) NOT NULL AUTO_INCREMENT, -- surrogate primary key
  `pid` int(32) DEFAULT NULL COMMENT '父级ID', -- parent region id
  `name` varchar(64) DEFAULT NULL COMMENT '地域名称', -- region name
  `name_CN` varchar(64) DEFAULT NULL COMMENT '地域英文名', -- commented as the English region name
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', -- set on insert
  `update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间', -- refreshed on every update
  `create_user` varchar(255) DEFAULT NULL COMMENT '创建人', -- creator
  `update_user` varchar(255) DEFAULT NULL COMMENT '修改人', -- last modifier
  `is_open` char(2) DEFAULT NULL COMMENT '是否开启 (0代表未开启 1代表开启)', -- enabled flag: 0 = disabled, 1 = enabled
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2421 DEFAULT CHARSET=utf8 COMMENT='地域表';

2.需要在pom文件中引入jsoup 的jar 包。

 官方上现在有更高版本,我这边使用的是目前使用人数比较多的版本。

     <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

3.拉取数据的代码主要在getRegionDirectory 这个接口中。

4.需要注意的一点是:下图中的 name 代表的是全国省份一级的数据。我加了一个判断,先只拉取北京市的数据,原因是数据量比较大,如果一次性拉取过多,连接会报 502。现在很多网站都会做这种针对恶意攻击的防范,这里需要注意。

4.1 这就是上述图片中描述的502报错

5.接下来就可以在浏览器上访问拉取数据的接口:

 控制台打印一下 获取的数据:

存到数据库中的数据:

6.文章中涉及的所有代码

RegionDirectoryController

package com.bos.controller.basic;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.bos.data.model.RegionDirectoryModel;
import com.bos.data.model.vo.basic.RegionVo;
import com.bos.data.repositories.jpa.setting.RegionDirectoryJPARepository;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.interceptor.TransactionAspectSupport;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * @Author tanghh
 * @Date 2020/6/23 10:37
 */
@RestController
@RequestMapping(value = "/region")
public class RegionDirectoryController {
    private Logger logger = LoggerFactory.getLogger(RegionDirectoryController.class);

    @Autowired
    private RegionDirectoryJPARepository regionDirectoryJPARepository;


    private static List<String> types = new ArrayList<>();
    private static List<String> specialCitys = new ArrayList<>();
    /**
     * 省份
     */
    public static final String LEVEL_PROVINCE = "provincetr";
    /**
     * 城市
     */
    public static final String LEVEL_CITY = "citytr";
    /**
     * 区
     */
    public static final String LEVEL_COUNTY = "countytr";
    /**
     * 街道
     */
    public static final String LEVEL_TOWN = "towntr";
    /**
     * 居委会
     */
    public static final String LEVEL_VILLAGE = "villagetr";

    public static final int LEVEL_MODE_STRING = 1;
    public static final int LEVEL_MODE_NUMBER = 2;
    public static final String CHARSET = "GBK";


    static {
        types.add(LEVEL_PROVINCE);
        types.add(LEVEL_CITY);
        types.add(LEVEL_COUNTY);
        types.add(LEVEL_TOWN);
        types.add(LEVEL_VILLAGE);
    }


    /**
     * 这个列表存放的是比较特殊的市,它们是属于LEVEL_CITY,但下一级却跳过了LEVEL_COUNTY,而直接到LEVEL_TOWN
     * 由于数据较多,不能一一比对,使用当中发现属于这种情况的城市加入到这里即可
     */
    static {
        specialCitys.add("东莞市");
        specialCitys.add("中山市");
        specialCitys.add("儋州市");
    }

    //**************************以下值请根据实际情况修改*************************************
    /**
     * 抓取的首页
     */
    public static final String webUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
    /**
     * 保存路径
     */
    public static final String savePath = "C:/project/latestEbo/ebo-web/ebo/src/main/resources/china.json";

    /**
     * 抓取数据的范围[]支持第一级和中国,比如中国,广东省,北京市
     */
    public static final String AREA = "中国";

    public static int TARGET_LEVEL = 3;
    /**
     * 表示抓取数据的层级采用的模式:LEVEL_MODE_STRING--表示按字符级别 LEVEL_MODE_NUMBER--表示按数字级别
     */
    public static int LEVEL_MODE = LEVEL_MODE_NUMBER;
    //**************************以上值请根据实际情况修改*************************************

    @GetMapping(value = "/getRegionDirectory")
    public void getRegionDirectory() {
        try {
            System.out.println("开始抓取,请耐心等待!!!");
            System.out.println("抓取范围:" + AREA + ",抓取模式(1--字符 2--数字):" + LEVEL_MODE + ",抓取层级:" + TARGET_LEVEL + "(模式为字符:1--province,2--city,3--county,4--town,5--village;)");
            long starttime = System.currentTimeMillis();

            RegionVo region = new RegionVo("000000000000", "中国", 0);
            region.child = new ArrayList<>();
            Document doc = getDocument(webUrl);
            Elements provincetr = doc.getElementsByClass(LEVEL_PROVINCE);
            for (Element e : provincetr) {
                Elements a = e.getElementsByTag("a");
                for (Element ea : a) {
                    //拿到绝对路径
                    String nextUrl = ea.attr("abs:href");
                    String[] arr = nextUrl.split("/");
                    String code = arr[arr.length - 1].split("\\.")[0] + "0000000000";
                    String name = ea.text();

                    if (name.equals("北京市")) {
                        if (AREA.equals("中国") || AREA.equals(name)) {
                            System.out.println(name);
                            RegionVo child = new RegionVo(code, name, 1);
                            region.child.add(child);
                            int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(LEVEL_PROVINCE) : child.level;
                            //表示还需要继续解析
                            if (currentlevel < TARGET_LEVEL) {
                                parseNext(types.get(1), nextUrl, child);
                            }
                        }
                    }
                }
            }
            //解析json
            String jsonStr = new Gson().toJson(region);
            System.out.println(jsonStr);
            JSONObject jsonObject = JSONObject.parseObject(jsonStr);
            JSONArray childJsonArray = jsonObject.getJSONArray("child");
            for (int i = 0; i < childJsonArray.size(); i++) {
                JSONObject childJsonObject = (JSONObject) childJsonArray.get(i);
                //获取省份级别数据
                String provinceName = childJsonObject.getString("name");
                RegionDirectoryModel regionDirectoryModel = new RegionDirectoryModel(0, provinceName, "汤辉红", "汤辉红");
                RegionDirectoryModel provinceModel = regionDirectoryJPARepository.saveAndFlush(regionDirectoryModel);
                JSONArray jsonArray = childJsonObject.getJSONArray("child");
                for (Object o : jsonArray) {
                    //获取省份下的市数据
                    JSONObject itemJsonObject = (JSONObject) o;
                    String cityName = itemJsonObject.getString("name");
                    RegionDirectoryModel cityModel = new RegionDirectoryModel(provinceModel.getId(), cityName, "汤辉红", "汤辉红");
                    RegionDirectoryModel newCityModel = regionDirectoryJPARepository.saveAndFlush(cityModel);
                    JSONArray finalChildJsonArray = itemJsonObject.getJSONArray("child");
                    for (Object o1 : finalChildJsonArray) {
                        //获取城市下的县数据
                        JSONObject finalJsonObject = (JSONObject) o1;
                        String prefectureName = finalJsonObject.getString("name");
                        RegionDirectoryModel prefectureModel = new RegionDirectoryModel(newCityModel.getId(), prefectureName, "汤辉红", "汤辉红");
                        regionDirectoryJPARepository.save(prefectureModel);
                    }
                }

            }
            long endtime = System.currentTimeMillis();
            System.out.println("抓取完毕!!!耗时:" + (endtime - starttime) / 1000 / 60 + "min");
        } catch (Exception e) {
            logger.error("获取省份数据失败", e);
            TransactionAspectSupport.currentTransactionStatus().setRollbackOnly();
        }

    }

    private static Document getDocument(String url) throws IOException {
        return Jsoup.parse(new URL(url).openStream(), CHARSET, url);
    }

    /**
     * @param type 见LEVEL_
     * @return
     */
    private static int getLevel(String type) {
        return types.indexOf(type) + 1;
    }

    private static void saveJson(RegionVo region) throws IOException {
        FileWriter fw = new FileWriter(new File(savePath));
        BufferedWriter bw = new BufferedWriter(fw);
        bw.write(new Gson().toJson(region));
        bw.flush();
        bw.close();
    }

    /**
     * 解析下一级数据
     *
     * @param type   见LEVEL_开头
     * @param url    要抓取的网页url
     * @param region 将要保存的数据
     * @throws Exception
     */
    public static void parseNext(String type, String url, RegionVo region) throws Exception {
        region.child = new ArrayList<>();
        Document doc = getDocument(url);
        Elements es = doc.getElementsByClass(type);
        if (LEVEL_VILLAGE.equals(type)) {
            //<tr class="villagetr"><td>110101001001</td><td>111</td><td>多福巷社区居委会</td></tr>
            for (Element e : es) {
                Elements tds = e.getElementsByTag("td");
                String code = tds.get(0).text();
                String name = tds.get(2).text();
                RegionVo child = new RegionVo(code, name, region.level + 1);
                region.child.add(child);
                System.out.println(space(child.level) + name);
            }
        } else {
            //需要处理以下两种情况
            //第一种:<tr class="countytr"><td>130101000000</td><td>市辖区</td></tr>
            //第二种:<tr class="countytr"><td><a href="01/130102.html">130102000000</a></td><td><a href="01/130102.html">长安区</a></td></tr>
            for (Element e : es) {
                String code = null;
                String name = null;
                String nextUrl = null;
                Elements a = e.getElementsByTag("a");
                if (a.isEmpty()) {
                    //属于第一种情况
                    Elements tds = e.getElementsByTag("td");
                    code = tds.get(0).text();
                    name = tds.get(1).text();
                } else {
                    //13/1301.html
                    nextUrl = a.get(0).attr("abs:href");
                    code = a.get(0).text();
                    name = a.get(1).text();
                }
                RegionVo child = new RegionVo(code, name, region.level + 1);
                region.child.add(child);
                System.out.println(space(child.level) + name);
                int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(type) : child.level;
                if (!a.isEmpty() && currentlevel < TARGET_LEVEL) {
                    //如果是东莞市,LEVEL_CITY下一级是LEVEL_TOWN,而不是LEVEL_COUNTY这里需要特殊处理
                    String nextType = null;
                    if (LEVEL_MODE == LEVEL_MODE_NUMBER
                            && (specialCitys.contains(name))) {
                        nextType = LEVEL_TOWN;
                    } else {
                        nextType = types.get(types.indexOf(type) + 1);
                    }

                    parseNext(nextType, nextUrl, child);
                }
            }
        }
    }

    private static String space(int level) {
        if (level > 5) {
            return "";
        }
        return "      ".substring(0, level);
    }


}

RegionVo

package com.bos.data.model.vo.basic;

import lombok.Data;

import java.util.List;

/**
 * One node of the crawled region tree.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:41
 */
@Data
public class RegionVo {
    /** Region code. */
    public String code;

    /** Region name. */
    public String name;

    /** Level of this node. */
    public int level;

    /** Child nodes; stays {@code null} until assigned. */
    public List<RegionVo> child;

    /**
     * Creates a node without children.
     *
     * @param code  region code
     * @param name  region name
     * @param level level of the node
     */
    public RegionVo(final String code, final String name, final int level) {
        this.level = level;
        this.name = name;
        this.code = code;
    }

}

RegionDirectoryModel

package com.bos.data.model;

import javax.persistence.*;
import java.io.Serializable;
import java.sql.Timestamp;
import java.util.Objects;

/**
 * JPA entity mapped to the {@code region_directory} table. Mapping annotations are
 * placed on the getters.
 *
 * @author luojie 2018/7/4
 */
@Entity
@Table(name = "region_directory", schema = "test", catalog = "")
public class RegionDirectoryModel implements Serializable {
    private Integer id;
    private Integer pid;
    private String name;
    private String nameCn;
    /** Defaults to "0" for newly created instances. */
    private String isOpen="0";
    private Timestamp createTime;
    private Timestamp updateTime;
    private String createUser;
    private String updateUser;

    /**
     * @return the primary key, generated with the IDENTITY strategy
     */
    @Id
    @Column(name = "id")
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @return value of the {@code name} column
     */
    @Basic
    @Column(name = "name")
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return value of the {@code name_CN} column
     */
    @Basic
    @Column(name = "name_CN")
    public String getNameCn() {
        return nameCn;
    }

    public void setNameCn(String nameCn) {
        this.nameCn = nameCn;
    }

    /**
     * @return value of the {@code pid} column
     */
    @Basic
    @Column(name = "pid")
    public Integer getPid() {
        return pid;
    }

    public void setPid(Integer pid) {
        this.pid = pid;
    }

    /**
     * @return value of the {@code is_open} column
     */
    @Basic
    @Column(name = "is_open")
    public String getIsOpen() {
        return isOpen;
    }

    public void setIsOpen(String isOpen) {
        this.isOpen = isOpen;
    }

    /**
     * @return value of the {@code create_time} column
     */
    @Basic
    @Column(name = "create_time")
    public Timestamp getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Timestamp createTime) {
        this.createTime = createTime;
    }

    /**
     * @return value of the {@code update_time} column
     */
    @Basic
    @Column(name = "update_time")
    public Timestamp getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Timestamp updateTime) {
        this.updateTime = updateTime;
    }

    /**
     * @return value of the {@code create_user} column
     */
    @Basic
    @Column(name = "create_user")
    public String getCreateUser() {
        return createUser;
    }

    public void setCreateUser(String createUser) {
        this.createUser = createUser;
    }

    /**
     * @return value of the {@code update_user} column
     */
    @Basic
    @Column(name = "update_user")
    public String getUpdateUser() {
        return updateUser;
    }

    public void setUpdateUser(String updateUser) {
        this.updateUser = updateUser;
    }

    /** No-argument constructor. */
    public RegionDirectoryModel() {

    }

    /**
     * Creates an instance without an id.
     *
     * @param pid        value for the {@code pid} column
     * @param name       value for the {@code name} column
     * @param createUser value for the {@code create_user} column
     * @param updateUser value for the {@code update_user} column
     */
    public RegionDirectoryModel(Integer pid, String name, String createUser, String updateUser) {
        this.pid = pid;
        this.name = name;
        this.createUser = createUser;
        this.updateUser = updateUser;
    }

    /**
     * Two instances are equal when their id, name, nameCn and pid are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        RegionDirectoryModel that = (RegionDirectoryModel) o;
        // id is a boxed Integer: compare by value, since == compares references and
        // only happens to work for small cached values.
        return Objects.equals(id, that.id) &&
                Objects.equals(name, that.name) &&
                Objects.equals(nameCn, that.nameCn) &&
                Objects.equals(pid, that.pid);
    }

    /**
     * Hashes the same fields that {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {

        return Objects.hash(id, name, nameCn, pid);
    }
}

本篇文章就到这里,

如果觉得笔者写的不错的话,欢迎评论点赞。

下篇文章贴出所有省份数据。

猜你喜欢

转载自blog.csdn.net/tangthh123/article/details/106883150