First, the crawler code is shown below.
If the crawl is interrupted midway, you can filter out the provinces whose data has already been pulled (see the hard-coded province-name check in `spider()`) and then run it again.
/** * TODO * * @author kevin * @createTime 2019-11-18 19:37 */ @RestController public class CityController { @Autowired private ProvinceService provinceService; @Autowired private HttpUtil httpUtil; private String yearHref = ""; private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"}; @GetMapping("/start") public ResultTemplate<String> spider() throws Exception { String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"; String charset = "gb2312"; Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) { return of("fail"); } Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0); // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接 Document doc = httpUtil.get(yearHref, charset); // 遍历所有的省 Elements provinceElements = doc.getElementsByClass("provincetr"); for (Element element : provinceElements) { Elements aEles = element.select("a"); for (Element aEle : aEles) { String name = aEle.text(); // 11.html String provincesHref = aEle.attr("href"); String code = provincesHref.substring(0, provincesHref.indexOf(".")); index = yearHref.lastIndexOf("/") + 1; // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html provincesHref = yearHref.substring(0, index) + provincesHref; DicProvince province = new DicProvince() .setProvinceName(name) .setProvinceCode(code) .setCountryId(1196612453660643329L) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); if( "Beijing" .equals (name) || "Tianjin" .equals (name) || "in Hebei Province" .equals (name) ) { System.out.println ( "unexecuted City:" + name); } the else { System.out.println ( "start time:" + LocalDateTime.now ()); System.out.println ( "prefecture name:" + name); Long ID = provinceService.insertProvince (Province); getCites (provincesHref , charset, ID); } } } return of ( "Spider crawl End." 
); } private void getCites(String url, String charset, Long provinceId) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { rootDoc = null ; System.out.println ( "Web Link Request error" ); } } I = 0 ; IF (RootDoc =! null ) { Elements cityElements = rootDoc.getElementsByClass ( "citytr" ); for (the Element cityElement: cityElements) { aEle Element = cityElement.select ( "a") GET (1);. // the second is the city's name String name = aEle.text (); // 11 / 1101.html String cityHref = aEle.attr ( " href " ); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); cityHref = yearHref.substring(0, index) + cityHref; DicCity city = new DicCity() .setCityName(name) .setCityCode(code) .setProvinceId(provinceId) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCity(city); //Long id=1L; getDistrict(cityHref, charset, id); } } } // 区县 private void getDistrict(String url, String charset, Long idDis) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println ( "cycles:" + I); } RootDoc = httpUtil.get (URL, charset); } the catch (Exception E) { RootDoc = null ; System.out.println ( "Web Link Request error " ); } } I = 0 ; IF ! 
(RootDoc = null ) { Elements cityElements = rootDoc.getElementsByClass (" countytr " ); for (Element cityElement : cityElements) { try { Element aEle = cityElement.select("a").get(1); String name = aEle.text(); String cityHref = aEle.attr("href"); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertDistrict(district); //Long id=1L; getStreet(cityHref, charset, id); } catch (Exception e) { System.out.println("市辖区"); Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1); String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis); Long id = provinceService.insertDistrict(district); System.out.println("执行完毕"); } } } } // 街道 private void getStreet(String url, String charset, Long idStr) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { i++; if (i >= 3) { System.out.println("循环次数:" + i); } rootDoc = httpUtil.get(url, charset); } catch (Exception e) { RootDoc= Null ; System.out.println ( "Web Link Request error" ); } } I = 0 ; IF (RootDoc =! Null ) { Elements cityElements = rootDoc.getElementsByClass ( "towntr" ); for (the Element cityElement: cityElements) { Element aEle = cityElement.select ( "a") GET (1);. 
// the second is the city's name String name = aEle.text (); String cityHref = aEle.attr ( "href" ); int start = cityHref.lastIndexOf("/") + 1; String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1; cityHref = url.substring(0, index) + cityHref; DicStreet street = new DicStreet() .setStreetName(name) .setStreetCode(code) .setDistrictId(idStr) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertStreet(street); //Long id=1L; getCommunity(cityHref, charset, id); } } } // 社区 private void getCommunity(String url, String charset, Long idPro) throws Exception { Document rootDoc = null; int i = 0; while (rootDoc == null) { try { I ++ ; if(I> =. 3 ) { System.out.println ( "cycles:" + I); } RootDoc = httpUtil.get (URL, charset); } the catch (Exception E) { RootDoc = null ; System.out.println ( "web link request error" ); } } I = 0 ; IF (! RootDoc = null ) { Elements cityElements = rootDoc.getElementsByClass ( "villagetr" ); for (Element cityElement : cityElements) { Element aEle = cityElement.select("td").get(0); String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1); String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2); String name = aEle3.text(); DicCommunity community = new DicCommunity() .setCommunityName(name) .setCommunityCode(code) .setClassificationCode(cl_code) .setStreetId(idPro) .setCreateDate(LocalDateTime.now()) .setCreateUserid(1L) .setCreateUsername("admin"); Long id = provinceService.insertCommunity(community); } } } }
Second, the HttpUtil helper class
/** * TODO * * @author kevin * @createTime 2019-11-20 9:17 */ @Component public class HttpUtil { public Document get(String url, String charset) throws IOException { String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"; URL url2 = new URL(url); HttpURLConnection connection = (HttpURLConnection)url2.openConnection(); connection.setRequestMethod("GET"); // whether to allow caching, default true. connection.setUseCaches (Boolean.FALSE The); // setting request header connection.addRequestProperty ( "Connection", "Close" ); connection.addRequestProperty ( "User-Agent" , the userAgent); // Set the connection timeout host (unit: ms) connection.setConnectTimeout (80000 ); // set the timeout data read from the master (unit: ms) connection.setReadTimeout (80000 ); // start request the try { the Document DOC = Jsoup.parse (connection.getInputStream (), charset , URL); return DOC; }catch (Exception e) { System.out.println("parse error: " + url); } return null; } }
Third, the service layer. Define the database tables yourself as required.
/** * TODO * * @author kevin * @createTime 2019-11-18 20:41 */ @Service public class ProvinceServiceImpl implements ProvinceService { @Autowired private ProvinceMapper provinceMapper; @Autowired private CityMapper cityMapper; @Autowired private DistrictMapper districtMapper; @Autowired private StreetMapper streetMapper; @Autowired private CommunityMapper communityMapper; @Override public Long insertProvince(DicProvince dicProvince) { int res=0; while (res!=1){ try { res=provinceMapper.insert(dicProvince); } catch (Exception e) { res=0; System.out.println("插入省数据失败"); e.printStackTrace(); } } return dicProvince.getProvinceId(); } @Override public Long insertCity(DicCity dicCity) { int res=0; while(res!=1){ try { res=cityMapper.insert(dicCity); } catch (Exception e) { res=0; System.out.println("插入市数据失败"); e.printStackTrace(); } } return dicCity.getCityId(); } @Override public Long insertDistrict(DicDistrict dicDistrict) { int res=0; while (res!=1){ try { res=districtMapper.insert(dicDistrict); } catch (Exception e) { res=0; System.out.println("插入区县数据失败"); e.printStackTrace(); } } return dicDistrict.getDistrictId(); } @Override public Long insertStreet(DicStreet dicStreet) { int res=0; while (res!=1){ {the try RES = streetMapper.insert (dicStreet); } the catch (Exception E) { RES = 0; System.out.println ( "insert street data failed"); e.printStackTrace (); } } return dicStreet.getStreetId () ; } @Override public Long insertCommunity (dicCommunity dicCommunity) { int RES = 0; the while (! = RES. 1) { the try { RES = communityMapper.insert (dicCommunity); } the catch (Exception E) { RES = 0; the System.out .println ( "insertion community data failed"); e.printStackTrace(); } } return dicCommunity.getCommunityId(); } }