Crawling national administrative division data with Spring Boot + jsoup

1. The code

If the crawl is interrupted partway through, you can resume it by filtering out the provinces whose data has already been pulled (see the province-name check in the loop below).

/**
 * TODO
 *
 * @author kevin
 * @createTime 2019-11-18 19:37
 */
@RestController
public class CityController {

    @Autowired
    private ProvinceService provinceService;
    @Autowired
    private HttpUtil httpUtil;
    private String yearHref = "";
    private int index;

    // {"provincetr", "citytr", "countytr", "towntr", "villagetr"};
    @GetMapping("/start")
    public ResultTemplate<String> spider() throws Exception {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        String charset = "gb2312";
        Document rootDoc = httpUtil.get(url, charset);

        if (rootDoc == null) {
            return of("fail");
        }
        Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0);
        // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
        yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接
        Document doc = httpUtil.get(yearHref, charset);
        // 遍历所有的省
        Elements provinceElements = doc.getElementsByClass("provincetr");
        for (Element element : provinceElements) {
            Elements aEles = element.select("a");
            for (Element aEle : aEles) {
                String name = aEle.text();
                // 11.html
                String provincesHref = aEle.attr("href");
                String code = provincesHref.substring(0, provincesHref.indexOf("."));
                index = yearHref.lastIndexOf("/") + 1;
                // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
                provincesHref = yearHref.substring(0, index) + provincesHref;
                DicProvince province = new DicProvince()
                        .setProvinceName(name)
                        .setProvinceCode(code)
                        .setCountryId(1196612453660643329L)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                if( "Beijing" .equals (name) || "Tianjin" .equals (name) || "in Hebei Province" .equals (name) ) { 
                    System.out.println ( "unexecuted City:" + name); 
                } the else { 
                    System.out.println ( "start time:" + LocalDateTime.now ()); 
                    System.out.println ( "prefecture name:" + name); 
                    Long ID = provinceService.insertProvince (Province); 
                    getCites (provincesHref , charset, ID); 
                } 
            } 
        } 
        return of ( "Spider crawl End." );
    }

    private void getCites(String url, String charset, Long provinceId) throws Exception {
        Document rootDoc = null;
        int i = 0;
        while (rootDoc == null) {
            try {
                i++;
                if (i >= 3) {
                    System.out.println("循环次数:" + i);
                }
                rootDoc = httpUtil.get(url, charset);
            } catch (Exception e) {
                rootDoc = null ; 
                System.out.println ( "Web Link Request error" ); 
            } 
        } 
        I = 0 ;
         IF (RootDoc =! null ) { 
            Elements cityElements = rootDoc.getElementsByClass ( "citytr" );
             for (the Element cityElement: cityElements) { 
                aEle Element = cityElement.select ( "a") GET (1);. // the second is the city's name 
                String name = aEle.text ();
                 // 11 / 1101.html 
                String cityHref = aEle.attr ( " href " );
                int start = cityHref.lastIndexOf("/") + 1;
                String code = cityHref.substring(start, cityHref.indexOf("."));
                cityHref = yearHref.substring(0, index) + cityHref;
                DicCity city = new DicCity()
                        .setCityName(name)
                        .setCityCode(code)
                        .setProvinceId(provinceId)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertCity(city);
                //Long id=1L;

                getDistrict(cityHref, charset, id);
            }
        }
    }

    // 区县
    private void getDistrict(String url, String charset, Long idDis) throws Exception {
        Document rootDoc = null;
        int i = 0;
        while (rootDoc == null) {
            try {
                i++;
                if (i >= 3) { 
                    System.out.println ( "cycles:" + I); 
                } 
                RootDoc = httpUtil.get (URL, charset); 
            } the catch (Exception E) { 
                RootDoc = null ; 
                System.out.println ( "Web Link Request error " ); 
            } 
        } 
        I = 0 ;
         IF ! (RootDoc = null ) { 
            Elements cityElements = rootDoc.getElementsByClass (" countytr " );
             for (Element cityElement : cityElements) {
                try {
                    Element aEle = cityElement.select("a").get(1);
                    String name = aEle.text();
                    String cityHref = aEle.attr("href");
                    int start = cityHref.lastIndexOf("/") + 1;
                    String code = cityHref.substring(start, cityHref.indexOf("."));

                    int index = url.lastIndexOf("/") + 1;
                    cityHref = url.substring(0, index) + cityHref;

                    DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis)
                            .setCreateDate(LocalDateTime.now())
                            .setCreateUserid(1L)
                            .setCreateUsername("admin");
                    Long id = provinceService.insertDistrict(district);
                    //Long id=1L;
                    getStreet(cityHref, charset, id);
                } catch (Exception e) {
                    System.out.println("市辖区");
                    Element aEle = cityElement.select("td").get(0);
                    String code = aEle.text();

                    Element aEle2 = cityElement.select("td").get(1);
                    String name = aEle2.text();

                    DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis);
                    Long id = provinceService.insertDistrict(district);
                    System.out.println("执行完毕");

                }

            }
        }
    }

    // 街道
    private void getStreet(String url, String charset, Long idStr) throws Exception {
        Document rootDoc = null;
        int i = 0;
        while (rootDoc == null) {
            try {
                i++;
                if (i >= 3) {
                    System.out.println("循环次数:" + i);
                }
                rootDoc = httpUtil.get(url, charset);
            } catch (Exception e) {
                RootDoc= Null ; 
                System.out.println ( "Web Link Request error" ); 
            } 
        } 
        I = 0 ;
         IF (RootDoc =! Null ) { 
            Elements cityElements = rootDoc.getElementsByClass ( "towntr" );
             for (the Element cityElement: cityElements) { 
                Element aEle = cityElement.select ( "a") GET (1);. // the second is the city's name 
                String name = aEle.text (); 
                String cityHref = aEle.attr ( "href" );
                 int start = cityHref.lastIndexOf("/") + 1;
                String code = cityHref.substring(start, cityHref.indexOf("."));
                int index = url.lastIndexOf("/") + 1;
                cityHref = url.substring(0, index) + cityHref;
                DicStreet street = new DicStreet()
                        .setStreetName(name)
                        .setStreetCode(code)
                        .setDistrictId(idStr)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertStreet(street);
                //Long id=1L;
                getCommunity(cityHref, charset, id);
            }
        }
    }

    // 社区
    private void getCommunity(String url, String charset, Long idPro) throws Exception {
        Document rootDoc = null;
        int i = 0;
        while (rootDoc == null) {
            try {
                I ++ ;
                if(I> =. 3 ) { 
                    System.out.println ( "cycles:" + I); 
                } 
                RootDoc = httpUtil.get (URL, charset); 
            } the catch (Exception E) { 
                RootDoc = null ; 
                System.out.println ( "web link request error" ); 
            } 
        } 
        I = 0 ;
         IF (! RootDoc = null ) { 
            Elements cityElements = rootDoc.getElementsByClass ( "villagetr" );
             for (Element cityElement : cityElements) {
                Element aEle = cityElement.select("td").get(0);
                String code = aEle.text();

                Element aEle2 = cityElement.select("td").get(1);
                String cl_code = aEle2.text();

                Element aEle3 = cityElement.select("td").get(2);
                String name = aEle3.text();

                DicCommunity community = new DicCommunity()
                        .setCommunityName(name)
                        .setCommunityCode(code)
                        .setClassificationCode(cl_code)
                        .setStreetId(idPro)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertCommunity(community);
            }
        }
    }

}

 

2. The HttpUtil utility class

/**
 * TODO
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {
    public Document get(String url, String charset) throws IOException {
        String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";
        URL url2 = new URL(url);
        HttpURLConnection connection = (HttpURLConnection)url2.openConnection();
        connection.setRequestMethod("GET");
        // whether to allow caching, default true. 
        connection.setUseCaches (Boolean.FALSE The);
         // setting request header 
        connection.addRequestProperty ( "Connection", "Close" ); 
        connection.addRequestProperty ( "User-Agent" , the userAgent);
         // Set the connection timeout host (unit: ms) 
        connection.setConnectTimeout (80000 );
         // set the timeout data read from the master (unit: ms) 
        connection.setReadTimeout (80000 );
         // start request 
        the try { 
            the Document DOC = Jsoup.parse (connection.getInputStream (), charset , URL);
             return DOC; 
        }catch (Exception e) {
            System.out.println("parse error: " + url);
        }
        return null;
    }

}

 

3. The service layer (define the database tables yourself as required)

/**
 * TODO
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;


    @Override
    public Long insertProvince(DicProvince dicProvince) {
        int res=0;
        while (res!=1){
            try {
                res=provinceMapper.insert(dicProvince);
            } catch (Exception e) {
                res=0;
                System.out.println("插入省数据失败");
                e.printStackTrace();
            }
        }
        return dicProvince.getProvinceId();
    }

    @Override
    public Long insertCity(DicCity dicCity) {
        int res=0;
        while(res!=1){
            try {
                res=cityMapper.insert(dicCity);
            } catch (Exception e) {
                res=0;
                System.out.println("插入市数据失败");
                e.printStackTrace();
            }
        }
        return dicCity.getCityId();
    }


    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        int res=0;
        while (res!=1){
            try {
                res=districtMapper.insert(dicDistrict);
            } catch (Exception e) {
                res=0;
                System.out.println("插入区县数据失败");
                e.printStackTrace();
            }
        }
        return dicDistrict.getDistrictId();
    }

    @Override
    public Long insertStreet(DicStreet dicStreet) {
        int res=0;
        while (res!=1){
            {the try 
                RES = streetMapper.insert (dicStreet); 
            } the catch (Exception E) { 
                RES = 0; 
                System.out.println ( "insert street data failed"); 
                e.printStackTrace (); 
            } 
        } 
        return dicStreet.getStreetId () ; 
    } 

    @Override 
    public Long insertCommunity (dicCommunity dicCommunity) { 
        int RES = 0; 
        the while (! = RES. 1) { 
            the try { 
                RES = communityMapper.insert (dicCommunity); 
            } the catch (Exception E) { 
                RES = 0; 
                the System.out .println ( "insertion community data failed");
                e.printStackTrace();
            }
        }
        return dicCommunity.getCommunityId();
    }


}

  

Guess you like

Origin www.cnblogs.com/kevin-ying/p/11925782.html