Java爬虫爬取国家统计局行政区划数据(2021年最新数据)

一、引入jsoup依赖

        <!-- jsoup: HTML fetching/parsing library used by the crawler below.
             Version 1.9.2 predates several published security fixes
             (e.g. CVE-2021-37714, fixed in 1.14.2), so a newer release is declared here. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>

二、爬虫程序

/**
 * Crawls the administrative-division code pages and flattens them into a list of
 * {@link SysArea} records, one per division, across five levels
 * (province, city, county, town, village).
 *
 * <p>Only the province named by {@link #TARGET_PROVINCE_NAME} is crawled. A page that cannot
 * be fetched, or a table row that cannot be parsed, is reported and skipped so that one bad
 * page does not abort the whole crawl.
 */
@Service
public class JavaJsoupService {

    /** The only province that is crawled (this example is limited to Ningxia). */
    private static final String TARGET_PROVINCE_NAME = "宁夏回族自治区";

    /** Request timeout in milliseconds; same value as the original {@code 200 * 2000}. */
    private static final int TIMEOUT_MILLIS = 200 * 2000;

    /** File suffix of every page; stripped or replaced when deriving codes and child URLs. */
    private static final String HTML_SUFFIX = ".html";

    /** Appended to the province page name (href without suffix) to form the province area code. */
    private static final String PROVINCE_CODE_PADDING = "0000000000";

    /** Value written to both {@code delFlag} and {@code status} of every record. */
    private static final String FLAG_ON = "1";

    /**
     * Fetches and parses the page at {@code url}.
     *
     * @param url address to request; must be non-empty
     * @return the parsed document, or {@code null} when the request fails with an I/O error
     * @throws IllegalArgumentException if {@code url} is null or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Keep the cause visible: any I/O failure (timeout, HTTP error, ...) ends up here.
            System.out.println(url + "地址不存在: " + e);
            return null;
        }
    }

    /**
     * Builds one area record. The record id is set to the area code.
     *
     * @param areaCode   division code
     * @param areaName   division name
     * @param parentCode code of the parent division
     * @param level      level marker, "1" (province) to "5" (village)
     * @param fullName   name prefixed with the names of all ancestors
     * @return the populated record
     */
    private static SysArea newArea(String areaCode, String areaName, String parentCode,
                                   String level, String fullName) {
        SysArea area = new SysArea();
        area.setAreaCode(areaCode);
        area.setId(area.getAreaCode());
        area.setAreaName(areaName);
        area.setParentCode(parentCode);
        area.setLevel(level);
        area.setDelFlag(FLAG_ON);
        area.setStatus(FLAG_ON);
        area.setFullName(fullName);
        return area;
    }

    /**
     * Splits the space-joined text of a row's {@code td} cells.
     *
     * @param row      table row to read
     * @param minCells minimum number of fields the caller needs
     * @return the fields, or {@code null} (after logging) when the row has fewer than {@code minCells}
     */
    private static String[] splitCells(Element row, int minCells) {
        String[] cells = row.select("td").text().split(" ");
        if (cells.length < minCells) {
            System.err.println("跳过无法解析的行: " + row.text());
            return null;
        }
        return cells;
    }

    /**
     * Crawls the target province and everything beneath it.
     *
     * @param url address of the index page, e.g.
     *            http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html
     * @return the province record followed by all descendant records; empty when the index page
     *         cannot be fetched or does not list the target province
     */
    public List<SysArea> getProvinces(String url) {
        List<SysArea> sysAreas = new ArrayList<>();
        Document document = connect(url);
        if (document == null) {
            return sysAreas;
        }
        for (Element provinceLink : document.select("tr.provincetr a")) {
            String name = provinceLink.text();
            if (!TARGET_PROVINCE_NAME.equals(name)) {
                continue;
            }
            // The link target is the province page file name; it doubles as the code prefix.
            String href = provinceLink.attr("href");
            String provinceCode = href.replace(HTML_SUFFIX, PROVINCE_CODE_PADDING);
            sysAreas.add(newArea(provinceCode, name, "0", "1", name));

            String provinceUrl = url.replace("index.html", href);
            System.err.println("++++++++++++++++++++++++++开始获取"+ name +"下属市区行政区划信息++++++++++++++++++++++++");
            sysAreas.addAll(getCityAreaCode(provinceUrl, provinceCode, name));
        }
        return sysAreas;
    }

    /**
     * Crawls the cities of a province and, recursively, everything beneath them.
     *
     * @param provinceUrl address of the province page
     * @param parentCode  area code of the province (becomes the parent code of each city)
     * @param upAreaName  full name of the province, used as prefix of each city's full name
     * @return city records, each followed by its descendants; empty when the page cannot be fetched
     */
    public static List<SysArea> getCityAreaCode(String provinceUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document document = connect(provinceUrl);
        if (document == null) {
            return sysAreas;
        }
        for (Element row : document.select("tr.citytr")) {
            String[] cells = splitCells(row, 2);
            if (cells == null) {
                continue;
            }
            String cityCode = cells[0];
            String cityName = cells[1];
            String fullName = upAreaName + cityName;
            sysAreas.add(newArea(cityCode, cityName, parentCode, "2", fullName));

            if (cityCode.length() < 4) {
                // Cannot derive the city page name from a code this short.
                continue;
            }
            // City page lives in a folder named after the province page: xx.html -> xx/xxxx.html
            String cityUrl = provinceUrl.replace(HTML_SUFFIX, "/" + cityCode.substring(0, 4) + HTML_SUFFIX);
            System.err.println("-------------------开始获取"+cityName+"下属区县行政区划信息-----------------------");
            sysAreas.addAll(getDownAreaCode(cityUrl, cityCode, fullName));
        }
        return sysAreas;
    }

    /**
     * Crawls the counties/districts of a city and, recursively, everything beneath them.
     * Rows named "市辖区" are skipped entirely.
     *
     * @param cityUrl    address of the city page
     * @param parentCode area code of the city (becomes the parent code of each county)
     * @param upAreaName full name of the city, used as prefix of each county's full name
     * @return county records, each followed by its descendants; empty when the page cannot be fetched
     */
    public static List<SysArea> getDownAreaCode(String cityUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document document = connect(cityUrl);
        if (document == null) {
            return sysAreas;
        }
        for (Element row : document.select("tr.countytr")) {
            String[] cells = splitCells(row, 2);
            if (cells == null) {
                continue;
            }
            String countyCode = cells[0];
            String countyName = cells[1];
            if ("市辖区".equals(countyName)) {
                continue;
            }
            String fullName = upAreaName + countyName;
            sysAreas.add(newArea(countyCode, countyName, parentCode, "3", fullName));

            String href = row.select("a").attr("href");
            if (href.isEmpty() || parentCode.length() < 4) {
                // No link in this row: there is no child page to descend into.
                continue;
            }
            String downUrl = cityUrl.replace(parentCode.substring(0, 4) + HTML_SUFFIX, href);
            System.err.println("====================开始获取"+countyName+"下属区划信息");
            sysAreas.addAll(getCountryAreaCodeList(downUrl, countyCode, fullName));
        }
        return sysAreas;
    }


    /**
     * Crawls the towns of a county and, recursively, their villages.
     *
     * @param downUrl    address of the county page
     * @param parentCode area code of the county (becomes the parent code of each town)
     * @param upAreaName full name of the county, used as prefix of each town's full name
     * @return town records, each followed by its villages; empty when the page cannot be fetched
     */
    public static List<SysArea> getCountryAreaCodeList(String downUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document document = connect(downUrl);
        if (document == null) {
            return sysAreas;
        }
        for (Element row : document.select("tr.towntr")) {
            String[] cells = splitCells(row, 2);
            if (cells == null) {
                continue;
            }
            String townCode = cells[0];
            String townName = cells[1];
            String fullName = upAreaName + townName;
            sysAreas.add(newArea(townCode, townName, parentCode, "4", fullName));

            String href = row.select("a").attr("href");
            if (href.isEmpty() || parentCode.length() < 6) {
                // No link in this row: there is no village page to descend into.
                continue;
            }
            String countryUrl = downUrl.replace(parentCode.substring(0, 6) + HTML_SUFFIX, href);
            System.err.println("====================开始获取"+townName+"下属区划信息");
            sysAreas.addAll(getVillageAreaCodeList(countryUrl, townCode, fullName));
        }
        return sysAreas;
    }


    /**
     * Crawls the villages of a town. Village rows carry three fields; the first is used as the
     * code and the third as the name (the middle field is not stored).
     *
     * @param countryUrl address of the town page
     * @param parentCode area code of the town (becomes the parent code of each village)
     * @param upAreaName full name of the town, used as prefix of each village's full name
     * @return village records; empty when the page cannot be fetched
     */
    public static List<SysArea> getVillageAreaCodeList(String countryUrl,String parentCode,String upAreaName){
        List<SysArea> villageAreaCodeList = new ArrayList<>();
        Document document = connect(countryUrl);
        if (document == null) {
            return villageAreaCodeList;
        }
        for (Element row : document.select("tr.villagetr")) {
            String[] cells = splitCells(row, 3);
            if (cells == null) {
                continue;
            }
            String villageName = cells[2];
            villageAreaCodeList.add(newArea(cells[0], villageName, parentCode, "5", upAreaName + villageName));
        }
        return villageAreaCodeList;
    }
}

三、单元测试

// Demonstrates crawling the division data and batch-inserting it into the database.
@Test
    public void 爬虫批量写入数据(){
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html";
        List<SysArea> sysAreas = javaJsoupService.getProvinces(url);
        // Fail with a clear message instead of sending an INSERT without any value tuples,
        // which is what the batch mapper would produce for an empty list.
        if (sysAreas == null || sysAreas.isEmpty()) {
            throw new AssertionError("爬虫未获取到任何行政区划数据, url=" + url);
        }
        System.err.println("爬虫相应数据为:"+JSON.toJSONString(sysAreas));
        int result = surveyCommonMapper.insertAreaInfo(sysAreas);
        System.err.println("插入数据条数:"+result);
    }

四、批量插入数据库

<!-- Test/demo statement: batch-inserts the crawled administrative-division records.
     Builds a single multi-row INSERT into sys_area_20220304; the <foreach> emits one
     comma-separated value tuple per element of the list parameter.
     NOTE(review): for an empty list no tuple is emitted, so the statement would end right
     after "values" - callers should presumably skip the call in that case. -->
    <insert id="insertAreaInfo" parameterType="java.util.List">
        insert into sys_area_20220304(id, area_code, area_name,parent_code, full_name,level,status,del_flag) values
        <foreach collection="list" item="item" index="index" separator=",">
            (#{item.id},#{item.areaCode},#{item.areaName},#{item.parentCode},#{item.fullName},#{item.level},#{item.status},#{item.delFlag})
        </foreach>
    </insert>

猜你喜欢

转载自blog.csdn.net/m0_43584016/article/details/123279345