爬取国家统计局的行政区代码
需要的 jar 包:参照各文件的 import(jsoup 与 MySQL JDBC 驱动)
数据库交互存储类
package com.ssj.dao;

import com.ssj.RegionCode;
import com.ssj.domain.RegionTable;

import java.io.IOException;
import java.sql.*;
import java.util.List;

/**
 * Command-line entry point: runs {@link RegionCode#preservation()} and inserts every
 * returned row into the {@code china_region_code} table.
 *
 * @author aikang
 * @create 2020 05 19 16:51
 */
public class jdbc {

    // Database URL, user name and password.
    static final String DB_URL = "jdbc:mysql://localhost:3306/code?zeroDateTimeBehavior=convertToNull&serverTimezone=Asia/Shanghai&allowMultiQueries=true&characterEncoding=UTF-8";
    static final String USER = "root";
    static final String PASS = "root";

    // Parameterized insert: crawled names are bound as values, never spliced into the SQL text.
    private static final String INSERT_SQL =
            "INSERT INTO china_region_code(`code`, `name`, `type`, `classification`) VALUES (?, ?, ?, ?)";

    /**
     * Crawls the region codes and stores them, printing whether every insert affected
     * exactly one row.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // 1. Register the JDBC driver.
            Class.forName("com.mysql.jdbc.Driver");

            // 2. Open the connection and statement; try-with-resources closes both
            //    even when the crawl or an insert throws.
            try (Connection connection = DriverManager.getConnection(DB_URL, USER, PASS);
                 PreparedStatement statement = connection.prepareStatement(INSERT_SQL)) {

                // 3. Crawl, then insert row by row.
                RegionCode regionCode = new RegionCode();
                List<RegionTable> preservation = regionCode.preservation();

                boolean allSaved = true;
                for (RegionTable regionTable : preservation) {
                    // Rows without a classification are stored as "0".
                    String classification = regionTable.getClassification();
                    if (classification == null) {
                        classification = "0";
                    }

                    statement.setString(1, regionTable.getCode());
                    statement.setString(2, regionTable.getName());
                    statement.setString(3, regionTable.getType());
                    statement.setString(4, classification);

                    if (statement.executeUpdate() != 1) {
                        allSaved = false;
                    }
                }

                if (allSaved) {
                    System.out.println(" 保存成功 ");
                } else {
                    System.out.println(" 保存失败 ");
                }
            }
        } catch (ClassNotFoundException | SQLException | IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
爬取类
package com.ssj;

import com.ssj.domain.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Crawls the nationwide province / city / county / township / village region codes
 * from the 2019 index page of stats.gov.cn.
 *
 * <p>Naming note: in this project {@link County} is the third level, {@link Country}
 * the fourth (rows selected by {@code .towntr}) and {@link Town} the fifth (rows
 * selected by {@code .villagetr}).
 *
 * @author aikang
 * @create 2020 05 19 16:38
 */
public class RegionCode {

    /** Pause before every page request, in milliseconds, so the remote server is not overloaded. */
    private static final int T = 3 * 1000;
    /** Connect and read timeout for a single request, in milliseconds. */
    private static final int T1 = 10 * 1000;
    /** Pause before each retry after a failed request, in milliseconds. */
    private static final int T2 = 1 * 1000;
    /** Number of retries after the first failed request before giving up on a page. */
    private static final int MAX_RETRIES = 5;

    /**
     * Crawls the whole hierarchy and flattens it, parent before children, into table rows.
     * Row types are "A" (province), "B" (city), "C" (county), "D" (fourth level) and
     * "E" (fifth level); only "E" rows carry a classification.
     *
     * @return the flattened rows in crawl order
     * @throws IOException          if a page still cannot be loaded after all retries
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public List<RegionTable> preservation() throws IOException, InterruptedException {
        String listurl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
        List<RegionTable> nations = new ArrayList<>();
        for (Province province : getProvince(listurl)) {
            nations.add(toRow(province.getCode(), province.getName(), "A", null));
            for (City city : orEmpty(province.getCities())) {
                nations.add(toRow(city.getCode(), city.getName(), "B", null));
                for (County county : orEmpty(city.getCounties())) {
                    nations.add(toRow(county.getCode(), county.getName(), "C", null));
                    for (Country country : orEmpty(county.getCountries())) {
                        nations.add(toRow(country.getCode(), country.getName(), "D", null));
                        for (Town town : orEmpty(country.getTowns())) {
                            nations.add(toRow(town.getCode(), town.getName(), "E", town.getClassification()));
                        }
                    }
                }
            }
        }
        return nations;
    }

    /** Builds one table row from its four column values. */
    private static RegionTable toRow(String code, String name, String type, String classification) {
        RegionTable row = new RegionTable();
        row.setCode(code);
        row.setName(name);
        row.setType(type);
        row.setClassification(classification);
        return row;
    }

    /** Lets the flattening loops treat an unset child list as having no children. */
    private static <E> List<E> orEmpty(List<E> list) {
        return list == null ? Collections.<E>emptyList() : list;
    }

    /**
     * Reads the provinces from the index page and crawls each province's cities.
     * The province code is the link's href with its file extension removed.
     *
     * @param listurl URL of the index page
     * @return the provinces, each with its cities populated
     */
    private List<Province> getProvince(final String listurl) throws InterruptedException, IOException {
        Document doc = fetch(listurl);
        List<Province> provinces = new ArrayList<>();
        for (Element row : doc.select(".provincetr")) {
            // Every link in a province row is one province.
            for (Element link : row.getElementsByTag("a")) {
                String href = getUTF8BytesFromGBKString(link.attr("href"));
                int dot = href.lastIndexOf('.');

                Province province = new Province();
                province.setCode(dot < 0 ? href : href.substring(0, dot));
                province.setName(getUTF8BytesFromGBKString(link.text()));
                System.out.println(link.text());
                province.setCities(indexCity(link.absUrl("href")));
                provinces.add(province);
            }
        }
        return provinces;
    }

    /**
     * Reads the cities listed on a province page and crawls each city's counties.
     *
     * @param listurl URL of the province page
     * @return the cities, each with its counties populated
     */
    private List<City> indexCity(String listurl) throws InterruptedException, IOException {
        List<City> cities = new ArrayList<>();
        for (String[] row : linkedRows(fetch(listurl), ".citytr")) {
            City city = new City();
            city.setCode(row[0]);
            city.setName(row[1]);
            System.out.println(row[1]);
            city.setCounties(indexCounty(row[2]));
            cities.add(city);
        }
        return cities;
    }

    /**
     * Reads the counties listed on a city page and crawls each county's fourth-level entries.
     *
     * @param listurl URL of the city page
     * @return the counties, each with its children populated
     */
    private List<County> indexCounty(String listurl) throws InterruptedException, IOException {
        List<County> counties = new ArrayList<>();
        for (String[] row : linkedRows(fetch(listurl), ".countytr")) {
            County county = new County();
            county.setCode(row[0]);
            county.setName(row[1]);
            System.out.println(row[1]);
            county.setCountries(indexCountry(row[2]));
            counties.add(county);
        }
        return counties;
    }

    /**
     * Reads the fourth-level entries ({@code .towntr} rows) on a county page and crawls
     * each entry's fifth-level children.
     *
     * @param listurl URL of the county page
     * @return the fourth-level entries, each with its children populated
     */
    private List<Country> indexCountry(String listurl) throws InterruptedException, IOException {
        List<Country> countries = new ArrayList<>();
        for (String[] row : linkedRows(fetch(listurl), ".towntr")) {
            Country country = new Country();
            country.setCode(row[0]);
            country.setName(row[1]);
            System.out.println(row[1]);
            country.setTowns(indexTown(row[2]));
            countries.add(country);
        }
        return countries;
    }

    /**
     * Reads the fifth-level entries ({@code .villagetr} rows) on a fourth-level page.
     * Each row's cells are read as code, classification, name, in that order.
     *
     * @param listurl URL of the fourth-level page
     * @return the fifth-level entries
     */
    private List<Town> indexTown(String listurl) throws InterruptedException, IOException {
        Document doc = fetch(listurl);
        List<Town> towns = new ArrayList<>();
        for (Element row : doc.select(".villagetr")) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() < 3) {
                // Malformed row: skip it instead of failing the whole crawl.
                continue;
            }
            Town town = new Town();
            town.setCode(getUTF8BytesFromGBKString(cells.get(0).text()));
            town.setClassification(getUTF8BytesFromGBKString(cells.get(1).text()));
            String name = getUTF8BytesFromGBKString(cells.get(2).text());
            town.setName(name);
            System.out.println(name);
            towns.add(town);
        }
        return towns;
    }

    /**
     * Extracts {code, name, child URL} triples from the rows matching a selector.
     * Links are consumed in pairs: the first link's text is the code, the second's is
     * the name, and the child URL is the first link's absolute href.
     *
     * <p>NOTE(review): rows that contain no links contribute nothing, so entries the
     * site lists without a hyperlink are not captured. Confirm whether that is intended.
     */
    private List<String[]> linkedRows(Document doc, String rowSelector) {
        List<String[]> rows = new ArrayList<>();
        for (Element row : doc.select(rowSelector)) {
            Elements links = row.getElementsByTag("a");
            // A trailing unpaired link is ignored rather than read past the end.
            for (int i = 0; i + 1 < links.size(); i += 2) {
                Element codeLink = links.get(i);
                rows.add(new String[] {
                        getUTF8BytesFromGBKString(codeLink.text()),
                        getUTF8BytesFromGBKString(links.get(i + 1).text()),
                        codeLink.absUrl("href")
                });
            }
        }
        return rows;
    }

    /**
     * Loads one page politely: pauses {@link #T} ms, tries once, and falls back to
     * {@link #heavyLoadIn(String)} on an I/O failure. An interrupt is propagated,
     * never treated as a retryable failure.
     *
     * @param listurl URL of the page
     * @return the parsed page, never {@code null}
     * @throws IOException          if the URL is malformed or every attempt failed
     * @throws InterruptedException if interrupted while pausing
     */
    private Document fetch(String listurl) throws IOException, InterruptedException {
        // A malformed URL will not get better with retries, so fail before sleeping.
        URL url = new URL(listurl);
        Thread.sleep(T);
        try {
            return loadOnce(url, listurl);
        } catch (IOException e) {
            System.err.println("重新载入");
            return heavyLoadIn(listurl);
        }
    }

    /**
     * Reloads a page after a failure, pausing {@link #T2} ms before each of at most
     * {@link #MAX_RETRIES} attempts.
     *
     * @param listurl URL of the page
     * @return the parsed page, never {@code null}
     * @throws IOException          if every retry failed; the last failure is the cause
     * @throws InterruptedException if interrupted while pausing
     */
    private Document heavyLoadIn(String listurl) throws IOException, InterruptedException {
        URL url = new URL(listurl);
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            Thread.sleep(T2);
            try {
                return loadOnce(url, listurl);
            } catch (IOException e) {
                lastFailure = e;
            }
        }
        throw new IOException("Failed to load " + listurl + " after " + MAX_RETRIES + " retries", lastFailure);
    }

    /** Performs a single request with connect/read timeouts and parses the body as GBK. */
    private Document loadOnce(URL url, String baseUri) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(T1);
        connection.setReadTimeout(T1);
        try (InputStream in = connection.getInputStream()) {
            return Jsoup.parse(in, "GBK", baseUri);
        }
    }

    /**
     * Round-trips a string through UTF-8 using the standard encoder.
     *
     * <p>The input is already a decoded Java string, so well-formed text comes back
     * unchanged. The encoder handles the two-byte range (U+0080–U+07FF) and surrogate
     * pairs correctly, so those characters are no longer replaced by U+FFFD.
     *
     * @param gbkStr the text to convert; must not be {@code null}
     * @return the text after a UTF-8 encode/decode round trip
     */
    public String getUTF8BytesFromGBKString(String gbkStr) {
        byte[] utfBytes = gbkStr.getBytes(StandardCharsets.UTF_8);
        return new String(utfBytes, StandardCharsets.UTF_8);
    }
}
实体类
package com.ssj.domain;

import java.util.List;

/**
 * A city-level region: its code, its name and the counties beneath it.
 *
 * @author aikang
 * @create 2020 05 19 10:45
 */
public class City {

    /** Region code. */
    String code;

    /** Region name. */
    String name;

    /** Counties belonging to this city; {@code null} until set. */
    List<County> counties;

    /** @return the region code */
    public String getCode() {
        return code;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the region name */
    public String getName() {
        return name;
    }

    /** @param name the region name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the counties of this city, or {@code null} if never set */
    public List<County> getCounties() {
        return counties;
    }

    /** @param counties the counties of this city */
    public void setCounties(List<County> counties) {
        this.counties = counties;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * A region that sits between {@link County} and {@link Town} in this project's
 * hierarchy: its code, its name and the {@link Town} entries beneath it.
 *
 * @author aikang
 * @create 2020 05 19 10:36
 */
public class Country {

    /** Region code. */
    String code;

    /** Region name. */
    String name;

    /** Village-level entries belonging to this region; {@code null} until set. */
    List<Town> towns;

    /** @return the child entries, or {@code null} if never set */
    public List<Town> getTowns() {
        return towns;
    }

    /** @param towns the child entries */
    public void setTowns(List<Town> towns) {
        this.towns = towns;
    }

    /** @return the region code */
    public String getCode() {
        return code;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the region name */
    public String getName() {
        return name;
    }

    /** @param name the region name */
    public void setName(String name) {
        this.name = name;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * A county-level region: its code, its name and the {@link Country} entries beneath it.
 *
 * @author aikang
 * @create 2020 05 19 10:41
 */
public class County {

    /** Region code. */
    String code;

    /** Region name. */
    String name;

    /** Township-level entries belonging to this county; {@code null} until set. */
    List<Country> countries;

    /** @return the region code */
    public String getCode() {
        return code;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the region name */
    public String getName() {
        return name;
    }

    /** @param name the region name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the child entries, or {@code null} if never set */
    public List<Country> getCountries() {
        return countries;
    }

    /** @param countries the child entries */
    public void setCountries(List<Country> countries) {
        this.countries = countries;
    }
}
package com.ssj.domain;

import java.util.List;

/**
 * A province-level region: its code, its name and the cities beneath it.
 *
 * @author aikang
 * @create 2020 05 19 10:48
 */
public class Province {

    /** Region code. */
    String code;

    /** Region name. */
    String name;

    /** Cities belonging to this province; {@code null} until set. */
    List<City> cities;

    /** @return the region code */
    public String getCode() {
        return code;
    }

    /** @param code the region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the region name */
    public String getName() {
        return name;
    }

    /** @param name the region name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the cities of this province, or {@code null} if never set */
    public List<City> getCities() {
        return cities;
    }

    /** @param cities the cities of this province */
    public void setCities(List<City> cities) {
        this.cities = cities;
    }
}
package com.ssj.domain;

import java.io.Serializable;

/**
 * One flattened region row: primary key, region code, region name, level type and
 * urban/rural classification code.
 *
 * 2020-05-19
 */
public class RegionTable implements Serializable {

    /** Primary key. */
    private Integer id;

    /** Region code. */
    private String code;

    /** Region name. */
    private String name;

    /** Level type. */
    private String type;

    /** Urban/rural classification code. */
    private String classification;

    /** region_table */
    private static final long serialVersionUID = 1L;

    /**
     * @return id the primary key
     */
    public Integer getId() {
        return this.id;
    }

    /**
     * @param id the primary key
     */
    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @return code the region code
     */
    public String getCode() {
        return this.code;
    }

    /**
     * @param code the region code
     */
    public void setCode(String code) {
        this.code = code;
    }

    /**
     * @return name the region name
     */
    public String getName() {
        return this.name;
    }

    /**
     * @param name the region name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return type the level type
     */
    public String getType() {
        return this.type;
    }

    /**
     * @param type the level type
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * @return classification the urban/rural classification code
     */
    public String getClassification() {
        return this.classification;
    }

    /**
     * @param classification the urban/rural classification code
     */
    public void setClassification(String classification) {
        this.classification = classification;
    }

    /**
     * Renders the simple class name, the hash code, every field and the serial
     * version UID inside square brackets.
     *
     * @mbggenerated 2020-05-19
     */
    @Override
    public String toString() {
        return getClass().getSimpleName()
                + " [Hash = " + hashCode()
                + ", id=" + id
                + ", code=" + code
                + ", name=" + name
                + ", type=" + type
                + ", classification=" + classification
                + ", serialVersionUID=" + serialVersionUID
                + "]";
    }
}
package com.ssj.domain; /** * @author aikang * @create 2020 05 19 10:31 */ public class Town { //code String code; //城乡分类代码 String classification; //name String name; public String getCode() { return code; } public void setCode(String code) { this.code = code; } public String getClassification() { return classification; } public void setClassification(String classification) { this.classification = classification; } public String getName() { return name; } public void setName(String name) { this.name = name; } }