行政区划获取


背景:
公司的行政区划代码有问题:有的没有街道信息,有的关联信息有误。后来找到了国家统计局网站的行政区划页面,里面包含了所有的行政区划信息,但全是HTML页面,没有可以直接下载的数据,只能通过爬虫来抓取。Java语言有第三方类库Jsoup,它是一个HTML请求与解析库,可以通过它来获取并解析页面信息。

一、导入jar包

下面是笔者用到的全部jar包

	<dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.9</version>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>30.1.1-jre</version>
    </dependency>
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-json</artifactId>
        <version>5.4.0</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.44</version>
    </dependency>

    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>

    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.5</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
        <version>2.5.4</version>
    </dependency>

这里说明一下:建议把Maven仓库配置为阿里云镜像来下载jar包,若是直接从中央仓库下载将会非常慢。
阿里云私服:http://maven.aliyun.com/nexus/content/repositories/central/

二、代码展示

下面是代码展示。代码是笔者在网上搜到的代码基础上改造的:该网站有反爬机制,大概爬取2000条左右就会中断,笔者加了延时来避开反爬(可能还有别的规避措施)。这里爬取的是4级行政区划:省、市、区县、街道。

package com.cheng.controller;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.util.*;

/**
 * Crawls the 2022 administrative-division pages below {@link #url1} with Jsoup and writes
 * one {@code .xlsx} workbook per province.
 *
 * <p>Four levels are exported: province, city, county and street. Every data row has four
 * columns: {@code id} (division code), {@code district_name}, {@code pid} (code of the
 * parent division) and {@code type} (level marker written by the crawler).
 *
 * <p>To stay under the site's anti-crawling limit the crawler sleeps for ten minutes each
 * time the global row counter {@link #i} reaches a multiple of 1000.
 *
 * <p>Not thread-safe: progress is tracked in static mutable fields.
 *
 * @author pcc
 * @version 1.0.0
 */
public class JsoupTestPluMdm {

    /** Global progress counter: number of rows emitted so far plus one. Drives the throttle. */
    static int i = 1;

    /** Root URL of the 2022 division pages; province pages live directly below it. */
    static String url1 = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2022";

    /** URL of the most recently crawled province page with ".html" removed. */
    static String url2 = "";

    /** CSS path to the {@code tbody} that holds the division table on every page level. */
    private static final String TABLE_BODY_SELECTOR =
            "body > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr:nth-child(2) > td > table > tbody > tr > td > table > tbody";

    /**
     * Cookie header sent with the index-page request.
     * NOTE(review): presumably captured from a browser session and likely to expire — confirm
     * and refresh before running.
     */
    private static final String SESSION_COOKIE =
            "wzws_sessionid=oGQAAyWBMmNlMWZkgjdlZDJkMIAyMjEuMjM4LjEzMi41MA==; SF_cookie_1=15502425; wzws_cid=6e8cdc0aea81349b05c8a0b6c05cd7204b6e0f10e5a48d462175473d23abcb4891edf1ceb73464398cb1ce7e6f53999f7545dd0014a15b1fb4eec5c6cf37421f0c2b08528de36f728ec4c676ed264c7d";

    /** Output directory used when no directory is passed on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "C:\\Users\\pcc\\Desktop\\xingzhengquhua\\";

    /** The first two table cells are the column captions, so data starts at index 2. */
    private static final int HEADER_CELLS = 2;

    /** Number of rows kept in memory by the streaming workbook. */
    private static final int ROW_WINDOW = 100;

    /** The crawler pauses each time the row counter reaches a multiple of this value. */
    private static final int ROWS_PER_PAUSE = 1000;

    /** Length of the anti-crawling pause: ten minutes. */
    private static final long PAUSE_MILLIS = 10L * 60L * 1000L;

    /** How many times a single page is requested before the failure is propagated. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    /** Base delay between fetch attempts; multiplied by the attempt number. */
    private static final long RETRY_DELAY_MILLIS = 2000L;

    /**
     * Entry point: reads the province index and exports every province to its own workbook.
     *
     * @param args optional; {@code args[0]} is the output directory, otherwise
     *             {@link #DEFAULT_OUTPUT_DIR} is used
     * @throws IOException declared for backward compatibility; failures are printed, not thrown
     */
    public static void main(String[] args) throws IOException {
        String outputDir = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_DIR;
        try {
            // Iterate over what the index page actually lists instead of assuming 31 provinces.
            for (Map.Entry<String, String> province : loadProvinces().entrySet()) {
                exportProvince(province.getKey(), province.getValue(), outputDir);
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls a province page (city level) and appends one row per city to {@code sheet},
     * descending into {@link #jsoupList3} for every city that has a hyperlink.
     *
     * @param url          absolute URL of the province page
     * @param provinceName province name, passed down to the lower levels
     * @param provinceCode province code, written as the parent id of each city
     * @param sheet        sheet that receives the rows
     * @throws Exception if a page cannot be fetched after all retries or the thread is interrupted
     */
    public static void jsoupList2(String url, String provinceName, String provinceCode, SXSSFSheet sheet) throws Exception {
        url2 = url.replace(".html", "");
        Elements cells = selectDataCells(fetch(url, null));
        String cityCode = "";
        Row pending = null;
        // Cells alternate: even index = code, odd index = name.
        for (int j = HEADER_CELLS; j < cells.size(); j++) {
            boolean codeCell = j % 2 == 0;
            Elements links = cells.get(j).select("td > a");
            if (links.isEmpty()) {
                // Entries without a hyperlink have no sub-page but are still recorded.
                String text = cells.get(j).text();
                if (codeCell) {
                    System.out.println("市代码:" + text);
                    pending = startRow(sheet, text);
                } else {
                    System.out.println("市名称:" + text);
                    // NOTE(review): unlinked city rows are typed "3" while linked ones are "2";
                    // kept as in the original — confirm whether this is intended.
                    finishRow(pending, text, provinceCode, "3");
                }
                continue;
            }
            for (int k = 0; k < links.size(); k++) {
                String text = links.get(k).text();
                if (codeCell) {
                    System.out.println("市代码:" + text);
                    cityCode = text;
                    pending = startRow(sheet, text);
                } else {
                    System.out.println("市名称:" + text);
                    finishRow(pending, text, provinceCode, "2");
                    // City hrefs are relative to the root directory (they include the province folder).
                    String href = links.get(k).attr("href");
                    jsoupList3(url1 + "/" + href, text, cityCode, provinceName, provinceCode, sheet);
                }
            }
        }
    }

    /**
     * Crawls a city page (county level) and appends one row per county to {@code sheet},
     * descending into {@link #jsoupList4} for every county that has a hyperlink.
     *
     * <p>NOTE(review): the original source carried a TODO about excluding some cities; pages
     * of cities without a county level may not match the code/name layout assumed here — verify.
     *
     * @param url          absolute URL of the city page
     * @param cityName     city name, passed down to the street level
     * @param cityCode     city code, written as the parent id of each county
     * @param provinceName province name, passed down to the street level
     * @param provinceCode province code; its first two characters form the URL folder
     * @param sheet        sheet that receives the rows
     * @throws Exception if a page cannot be fetched after all retries or the thread is interrupted
     */
    public static void jsoupList3(String url, String cityName, String cityCode, String provinceName, String provinceCode, SXSSFSheet sheet) throws Exception {
        Elements cells = selectDataCells(fetch(url, null));
        String xianCode = "";
        Row pending = null;
        for (int j = HEADER_CELLS; j < cells.size(); j++) {
            boolean codeCell = j % 2 == 0;
            Elements links = cells.get(j).select("td > a");
            if (links.isEmpty()) {
                String text = cells.get(j).text();
                if (codeCell) {
                    System.out.println("县代码:" + text);
                    pending = startRow(sheet, text);
                } else {
                    System.out.println("县名称:" + text);
                    finishRow(pending, text, cityCode, "3");
                }
                continue;
            }
            for (int k = 0; k < links.size(); k++) {
                String text = links.get(k).text();
                if (codeCell) {
                    xianCode = text;
                    System.out.println("县代码:" + xianCode);
                    pending = startRow(sheet, text);
                } else {
                    System.out.println("县名称:" + text);
                    finishRow(pending, text, cityCode, "3");
                    // County hrefs are relative to the province folder.
                    String href = links.get(k).attr("href");
                    String streetUrl = url1 + "/" + provinceCode.substring(0, 2) + "/" + href;
                    jsoupList4(streetUrl, text, xianCode, cityName, cityCode, provinceName, provinceCode, sheet);
                }
            }
        }
    }

    /**
     * Crawls a county page (street level) and appends one row per street to {@code sheet}.
     * This is the deepest level; hyperlinks on the page are not followed.
     *
     * @param url          absolute URL of the county page
     * @param xianName     county name (unused, kept for signature compatibility)
     * @param xianCode     county code, written as the parent id of each street
     * @param cityName     city name (unused)
     * @param cityCode     city code (unused)
     * @param provinceName province name (unused)
     * @param provinceCode province code (unused)
     * @param sheet        sheet that receives the rows
     * @throws Exception if the page cannot be fetched after all retries or the thread is interrupted
     */
    public static void jsoupList4(String url, String xianName, String xianCode, String cityName, String cityCode, String provinceName, String provinceCode, SXSSFSheet sheet) throws Exception {
        Elements cells = selectDataCells(fetch(url, null));
        Row pending = null;
        for (int j = HEADER_CELLS; j < cells.size(); j++) {
            boolean codeCell = j % 2 == 0;
            Elements links = cells.get(j).select("td > a");
            if (links.isEmpty()) {
                pending = writeStreetCell(sheet, pending, codeCell, cells.get(j).text(), xianCode);
                continue;
            }
            for (int k = 0; k < links.size(); k++) {
                pending = writeStreetCell(sheet, pending, codeCell, links.get(k).text(), xianCode);
            }
        }
    }

    /** Writes one street-level cell: starts a row for a code cell, completes it for a name cell. */
    private static Row writeStreetCell(SXSSFSheet sheet, Row pending, boolean codeCell, String text, String xianCode)
            throws InterruptedException {
        if (codeCell) {
            // The full street code is stored; truncating it would lose street-level digits.
            System.out.println("街道代码:" + text);
            return startRow(sheet, text);
        }
        System.out.println("街道名称:" + text);
        finishRow(pending, text, xianCode, "4");
        return pending;
    }

    /** Reads the index page and returns province code (href stem + "0000") to name, in page order. */
    private static Map<String, String> loadProvinces() throws IOException, InterruptedException {
        Elements links = fetch(url1, SESSION_COOKIE).select(TABLE_BODY_SELECTOR).select("tbody > tr > td > a");
        Map<String, String> provinces = new LinkedHashMap<>();
        for (int j = 0; j < links.size(); j++) {
            // Literal replace: replaceAll would treat '.' as a regex wildcard.
            String provinceCode = links.get(j).attr("href").replace(".html", "") + "0000";
            String provinceName = links.get(j).text();
            System.out.println("省代码:" + provinceCode);
            System.out.println("省名称:" + provinceName);
            provinces.put(provinceCode, provinceName);
        }
        return provinces;
    }

    /** Crawls one province and writes it to {@code <outputDir>/<provinceName>.xlsx}. */
    private static void exportProvince(String provinceCode, String provinceName, String outputDir) throws Exception {
        SXSSFWorkbook wb = new SXSSFWorkbook(ROW_WINDOW);
        // The stream is opened before crawling so an unwritable path fails immediately.
        try (FileOutputStream fileOut = new FileOutputStream(new java.io.File(outputDir, provinceName + ".xlsx"))) {
            SXSSFSheet sheet = (SXSSFSheet) wb.createSheet();
            // The header must be filled in before any data row: once more than ROW_WINDOW rows
            // exist, row 0 has been flushed to disk and later changes to it are lost.
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("id");
            header.createCell(1).setCellValue("district_name");
            header.createCell(2).setCellValue("pid");
            header.createCell(3).setCellValue("type");

            finishRow(startRow(sheet, provinceCode), provinceName, "", "1");
            jsoupList2(url1 + "/" + provinceCode.substring(0, 2) + ".html", provinceName, provinceCode, sheet);
            wb.write(fileOut);
        } finally {
            // Streaming workbooks spill rows to temporary files that must be removed explicitly.
            wb.dispose();
        }
    }

    /**
     * Downloads a page, retrying on timeouts and refused connections.
     * Retrying here, rather than re-running a whole sub-crawl, avoids writing duplicate rows.
     *
     * @param url    page to download
     * @param cookie value for the Cookie header, or {@code null} to send none
     */
    private static Document fetch(String url, String cookie) throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            try {
                if (cookie == null) {
                    return Jsoup.connect(url).get();
                }
                return Jsoup.connect(url).header("Cookie", cookie).get();
            } catch (SocketTimeoutException | ConnectException e) {
                lastFailure = e;
                e.printStackTrace();
                if (attempt < MAX_FETCH_ATTEMPTS) {
                    Thread.sleep(RETRY_DELAY_MILLIS * attempt);
                }
            }
        }
        throw lastFailure;
    }

    /** Returns every {@code td} of the division table, caption cells included. */
    private static Elements selectDataCells(Document page) {
        return page.select(TABLE_BODY_SELECTOR).select("tbody > tr > td");
    }

    /** Appends a new row directly below the sheet's last row and stores {@code code} in column 0. */
    private static Row startRow(SXSSFSheet sheet, String code) {
        // Per-sheet position, so each province file is contiguous regardless of the global counter.
        Row row = sheet.createRow(sheet.getLastRowNum() + 1);
        row.createCell(0).setCellValue(code);
        return row;
    }

    /** Fills name, parent id and type, advances the global counter and throttles once per row. */
    private static void finishRow(Row row, String name, String parentCode, String type) throws InterruptedException {
        row.createCell(1).setCellValue(name);
        row.createCell(2).setCellValue(parentCode);
        row.createCell(3).setCellValue(type);
        i++;
        System.out.println("**********************i********************:" + i);
        if (i % ROWS_PER_PAUSE == 0) {
            Thread.sleep(PAUSE_MILLIS);
        }
    }
}

猜你喜欢

转载自blog.csdn.net/m0_46897923/article/details/129475721