Java 爬取全国省、市、县、镇、村(五级行政单位)行政区划代码

爬取国家统计局的行政区代码

所需 jar 包:jsoup(HTML 解析)和 MySQL JDBC 驱动,具体参照各类中的 import 语句

数据库交互存储类

package com.ssj.dao;

import com.ssj.RegionCode;
import com.ssj.domain.RegionTable;

import java.io.IOException;
import java.sql.*;
import java.util.List;

/**
 * Command-line entry point that runs the {@link RegionCode} crawler and stores every
 * returned {@link RegionTable} row in the {@code china_region_code} MySQL table.
 *
 * @author aikang
 * @create 2020 05 19 16:51
 */
public class jdbc {
    // Database URL, user name and password.
    static final String DB_URL = "jdbc:mysql://localhost:3306/code?zeroDateTimeBehavior=convertToNull&serverTimezone=Asia/Shanghai&allowMultiQueries=true&characterEncoding=UTF-8";
    static final String USER = "root";
    static final String PASS = "root";

    /** Parameterized insert; scraped values are bound, never concatenated into the SQL text. */
    private static final String INSERT_SQL =
            "INSERT INTO china_region_code(`code`, `name`, `type`, `classification`) VALUES (?, ?, ?, ?)";

    /**
     * Crawls all region codes and inserts them one row at a time.
     * Prints " 保存成功 " when every insert affected exactly one row, " 保存失败 " otherwise.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // 1. Register the JDBC driver.
            Class.forName("com.mysql.jdbc.Driver");

            // 2. Fail fast on a bad URL or bad credentials before the crawl starts.
            try (Connection probe = DriverManager.getConnection(DB_URL, USER, PASS)) {
                // Connectivity check only; the working connection is opened after the crawl.
            }

            // 3. Crawl first: the crawler sleeps before every page request, so a connection
            //    held open across the crawl could be dropped by the server as idle.
            List<RegionTable> preservation = new RegionCode().preservation();

            // 4. Insert the rows; try-with-resources closes statement and connection on every path.
            boolean r = true;
            try (Connection connection = DriverManager.getConnection(DB_URL, USER, PASS);
                 PreparedStatement statement = connection.prepareStatement(INSERT_SQL)) {
                for (RegionTable regionTable : preservation) {
                    // Rows above the village level carry no classification; store "0" for them.
                    String classification = regionTable.getClassification();
                    if (classification == null) {
                        classification = "0";
                    }
                    statement.setString(1, regionTable.getCode());
                    statement.setString(2, regionTable.getName());
                    statement.setString(3, regionTable.getType());
                    statement.setString(4, classification);
                    if (statement.executeUpdate() != 1) {
                        r = false;
                    }
                }
            }

            if (r) {
                System.out.println(" 保存成功 ");
            } else {
                System.out.println(" 保存失败 ");
            }
        } catch (ClassNotFoundException | SQLException | IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM/caller can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

爬取类

package com.ssj;

import com.ssj.domain.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawls the province / city / county / town / village region codes of China from the
 * statistics-bureau index page and flattens them into {@link RegionTable} rows.
 *
 * @author aikang
 * @create 2020 05 19 16:38
 */
public class RegionCode {
    /** Pause before every page request so the remote server is not overloaded (ms). */
    private static final int REQUEST_DELAY_MS = 3 * 1000;
    /** Connect and read timeout applied to every page request (ms). */
    private static final int TIMEOUT_MS = 10 * 1000;
    /** Pause before retrying a failed page request (ms). */
    private static final int RETRY_DELAY_MS = 1 * 1000;
    /** Number of attempts per page before the crawl gives up with an {@link IOException}. */
    private static final int MAX_ATTEMPTS = 5;
    /** Charset handed to Jsoup when decoding the pages. */
    private static final String PAGE_CHARSET = "GBK";

    /**
     * Crawls the whole tree and returns it as a flat list in depth-first order.
     * Levels are tagged "A" (province), "B" (city), "C" (county), "D" (town) and "E" (village);
     * only "E" rows carry a classification, all other rows leave it {@code null}.
     *
     * @return every crawled region as a {@link RegionTable} row
     * @throws IOException          if a page still cannot be loaded after all retries
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public List<RegionTable> preservation() throws IOException, InterruptedException {
        String listurl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
        List<RegionTable> nations = new ArrayList<>();
        for (Province province : getProvince(listurl)) {
            nations.add(row(province.getCode(), province.getName(), "A", null));
            appendCities(nations, province.getCities());
        }
        return nations;
    }

    /** Appends the given cities (type "B") and everything below them; {@code null} is treated as empty. */
    private void appendCities(List<RegionTable> rows, List<City> cities) {
        if (cities == null) {
            return;
        }
        for (City city : cities) {
            rows.add(row(city.getCode(), city.getName(), "B", null));
            appendCounties(rows, city.getCounties());
        }
    }

    /** Appends the given counties (type "C") and everything below them; {@code null} is treated as empty. */
    private void appendCounties(List<RegionTable> rows, List<County> counties) {
        if (counties == null) {
            return;
        }
        for (County county : counties) {
            rows.add(row(county.getCode(), county.getName(), "C", null));
            appendCountries(rows, county.getCountries());
        }
    }

    /** Appends the given town-level entries (type "D") and their villages; {@code null} is treated as empty. */
    private void appendCountries(List<RegionTable> rows, List<Country> countries) {
        if (countries == null) {
            return;
        }
        for (Country country : countries) {
            rows.add(row(country.getCode(), country.getName(), "D", null));
            appendTowns(rows, country.getTowns());
        }
    }

    /** Appends the given village-level entries (type "E") with their classification; {@code null} is treated as empty. */
    private void appendTowns(List<RegionTable> rows, List<Town> towns) {
        if (towns == null) {
            return;
        }
        for (Town town : towns) {
            rows.add(row(town.getCode(), town.getName(), "E", town.getClassification()));
        }
    }

    /** Builds one flat output row. */
    private RegionTable row(String code, String name, String type, String classification) {
        RegionTable table = new RegionTable();
        table.setCode(code);
        table.setName(name);
        table.setClassification(classification);
        table.setType(type);
        return table;
    }

    /**
     * Reads the province index page. Every link inside a ".provincetr" row is one province;
     * its code is the link target without the file extension.
     *
     * @param listurl URL of the index page
     * @return the provinces with their cities already crawled
     */
    private List<Province> getProvince(final String listurl) throws InterruptedException, IOException {
        Document doc = fetchDocument(listurl);
        List<Province> provinces = new ArrayList<>();
        for (Element row : doc.select(".provincetr")) {
            for (Element link : row.getElementsByTag("a")) {
                Province province = new Province();
                String href = getUTF8BytesFromGBKString(link.attr("href"));
                // Guard against an href without an extension instead of failing in substring().
                int dot = href.lastIndexOf('.');
                province.setCode(dot >= 0 ? href.substring(0, dot) : href);
                String name = getUTF8BytesFromGBKString(link.text());
                province.setName(name);
                System.out.println(name);
                province.setCities(indexCity(link.absUrl("href")));
                provinces.add(province);
            }
        }
        return provinces;
    }

    /**
     * Reads one province page and returns its cities (".citytr" rows).
     *
     * @param listurl URL of the province page
     * @return the cities with their counties already crawled
     */
    private List<City> indexCity(String listurl) throws InterruptedException, IOException {
        Document doc = fetchDocument(listurl);
        List<City> cities = new ArrayList<>();
        for (Element[] pair : linkPairs(doc, ".citytr")) {
            City city = new City();
            city.setCode(getUTF8BytesFromGBKString(pair[0].text()));
            String name = getUTF8BytesFromGBKString(pair[1].text());
            city.setName(name);
            System.out.println(name);
            city.setCounties(indexCounty(pair[0].absUrl("href")));
            cities.add(city);
        }
        return cities;
    }

    /**
     * Reads one city page and returns its counties (".countytr" rows).
     *
     * @param listurl URL of the city page
     * @return the counties with their town-level entries already crawled
     */
    private List<County> indexCounty(String listurl) throws InterruptedException, IOException {
        Document doc = fetchDocument(listurl);
        List<County> counties = new ArrayList<>();
        for (Element[] pair : linkPairs(doc, ".countytr")) {
            County county = new County();
            county.setCode(getUTF8BytesFromGBKString(pair[0].text()));
            String name = getUTF8BytesFromGBKString(pair[1].text());
            county.setName(name);
            System.out.println(name);
            county.setCountries(indexCountry(pair[0].absUrl("href")));
            counties.add(county);
        }
        return counties;
    }

    /**
     * Reads one county page and returns its town-level entries (".towntr" rows).
     *
     * @param listurl URL of the county page
     * @return the town-level entries with their villages already crawled
     */
    private List<Country> indexCountry(String listurl) throws InterruptedException, IOException {
        Document doc = fetchDocument(listurl);
        List<Country> countries = new ArrayList<>();
        for (Element[] pair : linkPairs(doc, ".towntr")) {
            Country country = new Country();
            country.setCode(getUTF8BytesFromGBKString(pair[0].text()));
            String name = getUTF8BytesFromGBKString(pair[1].text());
            country.setName(name);
            System.out.println(name);
            country.setTowns(indexTown(pair[0].absUrl("href")));
            countries.add(country);
        }
        return countries;
    }

    /**
     * Reads one town page and returns its village-level entries (".villagetr" rows).
     * Each row is read from its cells: code, classification, name.
     *
     * @param listurl URL of the town page
     * @return the village-level entries
     */
    private List<Town> indexTown(String listurl) throws InterruptedException, IOException {
        Document doc = fetchDocument(listurl);
        List<Town> towns = new ArrayList<>();
        for (Element row : doc.select(".villagetr")) {
            Elements td = row.getElementsByTag("td");
            if (td.size() < 3) {
                // Malformed row: skip it rather than abort the whole crawl.
                continue;
            }
            Town town = new Town();
            town.setCode(getUTF8BytesFromGBKString(td.get(0).text()));
            town.setClassification(getUTF8BytesFromGBKString(td.get(1).text()));
            String name = getUTF8BytesFromGBKString(td.get(2).text());
            town.setName(name);
            System.out.println(name);
            towns.add(town);
        }
        return towns;
    }

    /**
     * Collects the (code link, name link) pairs from every row matching the selector.
     * Links are consumed two at a time; a trailing unpaired link is ignored.
     * NOTE(review): rows that contain no links at all yield no pair, so such rows are not
     * emitted (same as before this change) - confirm that this is intended.
     */
    private List<Element[]> linkPairs(Document doc, String rowSelector) {
        List<Element[]> pairs = new ArrayList<>();
        for (Element row : doc.select(rowSelector)) {
            Elements links = row.getElementsByTag("a");
            for (int i = 0; i + 1 < links.size(); i += 2) {
                pairs.add(new Element[] {links.get(i), links.get(i + 1)});
            }
        }
        return pairs;
    }

    /**
     * Downloads and parses one page, pausing before the request and retrying on I/O errors.
     * Never returns {@code null}: after {@link #MAX_ATTEMPTS} failed attempts the last
     * error is rethrown wrapped in an {@link IOException}.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException          if every attempt failed
     * @throws InterruptedException if the thread is interrupted while pausing
     */
    private Document fetchDocument(String url) throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            Thread.sleep(attempt == 1 ? REQUEST_DELAY_MS : RETRY_DELAY_MS);
            try {
                URLConnection connection = new URL(url).openConnection();
                // Without timeouts a stalled server would block the crawl forever.
                connection.setConnectTimeout(TIMEOUT_MS);
                connection.setReadTimeout(TIMEOUT_MS);
                try (InputStream in = connection.getInputStream()) {
                    return Jsoup.parse(in, PAGE_CHARSET, url);
                }
            } catch (IOException e) {
                lastFailure = e;
                if (attempt < MAX_ATTEMPTS) {
                    System.err.println("重新载入");
                }
            }
        }
        throw new IOException("Failed to load " + url + " after " + MAX_ATTEMPTS + " attempts", lastFailure);
    }

    /**
     * Normalizes a string by round-tripping it through UTF-8.
     * For well-formed text the result equals the input. The previous hand-written encoder
     * emitted three bytes for every non-ASCII char, which is malformed UTF-8 for
     * U+0080..U+07FF and for surrogate pairs, so those characters were corrupted.
     *
     * @param gbkStr text to normalize; may be {@code null}
     * @return the normalized text, or {@code null} when the input is {@code null}
     */
    public String getUTF8BytesFromGBKString(String gbkStr) {
        if (gbkStr == null) {
            return null;
        }
        return new String(gbkStr.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);
    }
}

实体类

package com.ssj.domain;

import java.util.List;

/**
 * City-level node of the region tree: a code, a name and the counties below it.
 *
 * @author aikang
 * @create 2020 05 19 10:45
 */
public class City {
    // Region code of the city.
    String code;
    // Display name of the city.
    String name;
    // Counties that belong to this city.
    List<County> counties;

    /** @return the region code */
    public String getCode() {
        return this.code;
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    /** @return the counties of this city, or {@code null} if never set */
    public List<County> getCounties() {
        return this.counties;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param counties the counties of this city */
    public void setCounties(List<County> counties) {
        this.counties = counties;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * Node one level below {@link County}: a code, a name and the {@link Town} entries below it.
 *
 * @author aikang
 * @create 2020 05 19 10:36
 */
public class Country {
    // Region code of this entry.
    String code;
    // Display name of this entry.
    String name;
    // Child entries one level below.
    List<Town> towns;

    /** @return the region code */
    public String getCode() {
        return this.code;
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    /** @return the child entries, or {@code null} if never set */
    public List<Town> getTowns() {
        return this.towns;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param towns the child entries */
    public void setTowns(List<Town> towns) {
        this.towns = towns;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * County-level node of the region tree: a code, a name and the {@link Country} entries below it.
 *
 * @author aikang
 * @create 2020 05 19 10:41
 */
public class County {
    // Region code of the county.
    String code;
    // Display name of the county.
    String name;
    // Child entries one level below.
    List<Country> countries;

    /** @return the region code */
    public String getCode() {
        return this.code;
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    /** @return the child entries, or {@code null} if never set */
    public List<Country> getCountries() {
        return this.countries;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param countries the child entries */
    public void setCountries(List<Country> countries) {
        this.countries = countries;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * Province-level node of the region tree: a code, a name and the cities below it.
 *
 * @author aikang
 * @create 2020 05 19 10:48
 */
public class Province {
    // Region code of the province.
    String code;
    // Display name of the province.
    String name;
    // Cities that belong to this province.
    List<City> cities;

    /** @return the region code */
    public String getCode() {
        return this.code;
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    /** @return the cities of this province, or {@code null} if never set */
    public List<City> getCities() {
        return this.cities;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param cities the cities of this province */
    public void setCities(List<City> cities) {
        this.cities = cities;
    }
}
package com.ssj.domain;

import java.io.Serializable;

/**
 * Flat row describing one region: primary key, code, name, level type and
 * urban-rural classification code.
 *
 * 2020-05-19
 */
public class RegionTable implements Serializable {
    /** Serialization version (region_table). */
    private static final long serialVersionUID = 1L;

    /** Primary key. */
    private Integer id;

    /** Region code. */
    private String code;

    /** Region name. */
    private String name;

    /** Level type. */
    private String type;

    /** Urban-rural classification code. */
    private String classification;

    /**
     * @return id the primary key
     */
    public Integer getId() {
        return this.id;
    }

    /**
     * @return code the region code
     */
    public String getCode() {
        return this.code;
    }

    /**
     * @return name the region name
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return type the level type
     */
    public String getType() {
        return this.type;
    }

    /**
     * @return classification the urban-rural classification code
     */
    public String getClassification() {
        return this.classification;
    }

    /**
     * @param id the primary key
     */
    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @param code the region code
     */
    public void setCode(String code) {
        this.code = code;
    }

    /**
     * @param name the region name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @param type the level type
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * @param classification the urban-rural classification code
     */
    public void setClassification(String classification) {
        this.classification = classification;
    }

    /**
     * Renders the simple class name, the hash code and every field.
     *
     * @mbggenerated 2020-05-19
     */
    @Override
    public String toString() {
        return getClass().getSimpleName()
                + " ["
                + "Hash = " + hashCode()
                + ", id=" + id
                + ", code=" + code
                + ", name=" + name
                + ", type=" + type
                + ", classification=" + classification
                + ", serialVersionUID=" + serialVersionUID
                + "]";
    }
}
package com.ssj.domain;

/**
 * Leaf node of the region tree: a code, an urban-rural classification code and a name.
 *
 * @author aikang
 * @create 2020 05 19 10:31
 */
public class Town {
    // Region code of this entry.
    String code;
    // Urban-rural classification code.
    String classification;
    // Display name of this entry.
    String name;

    /** @return the region code */
    public String getCode() {
        return this.code;
    }

    /** @return the urban-rural classification code */
    public String getClassification() {
        return this.classification;
    }

    /** @return the display name */
    public String getName() {
        return this.name;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @param classification the urban-rural classification code */
    public void setClassification(String classification) {
        this.classification = classification;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }
}

猜你喜欢

转载自www.cnblogs.com/aikang525/p/12930888.html
今日推荐