Java爬统计局12位区划代码

有朋友需要获取全国2013年统计用区划代码和城乡划分代码，于是写了个爬虫抓数据。
仔细分析页面后发现：12位区划代码本身已包含上级信息（前2位为省、前4位为市，依此类推），由最底层的代码即可反推各上级代码，故只保存最底层页面的数据。
第一次接触爬虫，边做边研究。只写了单线程版本，全部下载完耗时41分钟。
后来研究多线程爬虫，又写了个多线程爬知乎话题+回答的程序。由于暂时无法正确保存页面上各种程序语言的代码，半成品的程序就不放出来了。
下面是单线程下载统计局全国区划代码的源码，要改成多线程也不难。不过已经下载到了结果，就懒得再做无用功了。
[java] view plain copy
package king.statitics;  
  
import java.io.IOException;  
import java.io.PrintStream;  
import java.util.ArrayList;  
import java.util.List;  
import java.util.Map;  
import java.util.TreeMap;  
  
import org.jsoup.Jsoup;  
import org.jsoup.nodes.Document;  
import org.jsoup.nodes.Element;  
import org.jsoup.select.Elements;  
  
/**
 * Single-threaded crawler that walks the statistics-bureau division-code pages
 * level by level and writes every bottom-level "code address" pair to a file.
 *
 * <p>Each level is held as a map from absolute page URL to
 * {@code code|address}. Rows that carry no link are treated as bottom-level
 * rows and go straight into the final result.
 *
 * <p>The class keeps its working state in static fields and is not thread-safe.
 */
public class GetNoAndAddress {
    /** Separator between code and address inside a stored value, e.g. {@code 11|北京市}. */
    private static final String SEPARATOR = "|";

    /**
     * Cells whose whole text is exactly three digits are skipped when reading a row
     * (presumably the urban-rural classification column - confirm against the pages).
     * Compiled once instead of on every cell.
     */
    private static final java.util.regex.Pattern THREE_DIGITS =
            java.util.regex.Pattern.compile("\\d{3}");

    /** Connect/read timeout for a single page request. */
    private static final int TIMEOUT_MILLIS = 10 * 1000;
    /** How many times one page is requested before the crawl gives up. */
    private static final int MAX_ATTEMPTS = 3;
    /** Pause between two attempts for the same page. */
    private static final long RETRY_PAUSE_MILLIS = 1000L;

    // The page depth is not known in advance, so one map is stored per level.
    // Key: absolute URL of a child page; value: code and address joined by SEPARATOR.
    private static final List<Map<String, String>> tempList = new ArrayList<Map<String, String>>();
    // Bottom-level rows: code -> full address, sorted by code.
    private static final Map<String, String> result = new TreeMap<String, String>();

    /**
     * Runs the crawl and prints the elapsed time in minutes.
     *
     * @param args optional: {@code args[0]} overrides the start URL,
     *             {@code args[1]} overrides the output file path
     */
    public static void main(String[] args) throws Exception {
        // Index page: 2013 statistical division codes and urban-rural codes (as of 2013-08-31).
        String urlStr = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/index.html";
        String filePath = "E:/result.txt";
        if (args != null && args.length > 0) {
            urlStr = args[0];
        }
        if (args != null && args.length > 1) {
            filePath = args[1];
        }

        long start = System.currentTimeMillis();
        execute(urlStr, filePath);
        long elapsedMinutes = (System.currentTimeMillis() - start) / (1000 * 60);
        System.out.println("总运行时间" + elapsedMinutes + "分钟");
    }

    /**
     * Crawls every level reachable from {@code urlStr} and writes the collected
     * bottom-level rows to {@code filePath}, one {@code code address} pair per line.
     *
     * <p>State left over from a previous call is discarded first, so the method
     * can be called more than once. The file is written as UTF-8 so the Chinese
     * text does not depend on the platform default charset.
     *
     * @param urlStr   URL of the top-level index page
     * @param filePath path of the output file (overwritten if it exists)
     * @throws Exception if a page cannot be fetched or the file cannot be written
     */
    public static void execute(String urlStr, String filePath) throws Exception {
        // Without this reset a second call would start from stale levels and fail.
        tempList.clear();
        result.clear();

        Map<String, String> level = new TreeMap<String, String>();
        analysisHtml(urlStr, "", level); // top-level page has no parent address
        while (!level.isEmpty()) {
            tempList.add(level);
            Map<String, String> next = new TreeMap<String, String>();
            for (Map.Entry<String, String> entry : level.entrySet()) {
                analysisHtml(entry.getKey(), getAddress(entry.getValue()), next);
            }
            level = next;
        }

        System.out.println("下载完成，开始写入文件：");

        PrintStream output = new PrintStream(filePath, "UTF-8");
        try {
            for (Map.Entry<String, String> entry : result.entrySet()) {
                output.println(entry.getKey() + " " + entry.getValue());
            }
            // PrintStream swallows write errors; surface them instead of reporting success.
            if (output.checkError()) {
                throw new IOException("Failed to write results to " + filePath);
            }
        } finally {
            output.close();
        }
        System.out.println("OK!");
    }

    /**
     * Downloads one page and hands every data row to
     * {@link #getStatistics(Map, Element, String)}.
     *
     * @param urlStr     absolute URL of the page
     * @param parentPath address accumulated from the parent levels; empty for the top-level page
     * @param store      receives the child pages found on this page
     * @throws IOException if the page still cannot be fetched after all attempts
     */
    public static void analysisHtml(String urlStr, String parentPath
            , Map<String, String> store) throws IOException {
        Document doc = fetch(urlStr);
        // Data rows are the <tr> elements whose class attribute ends with "tr".
        Elements rows = doc.select("tr[class$=tr]");
        for (Element row : rows) {
            getStatistics(store, row, parentPath);
        }
    }

    /**
     * Extracts code, address and child URL from one table row.
     *
     * <p>On the top-level page (empty {@code parentPath}) every linked cell is a
     * separate entry. On deeper pages the whole row is one entry: a row with a
     * link is queued in {@code store}, a row without one is recorded as a final result.
     *
     * @param store      child pages found on the current level (URL to {@code code|address})
     * @param link       the table row element to read
     * @param parentPath address accumulated from the parent levels; empty for the top-level page
     */
    public static void getStatistics(Map<String, String> store
            , Element link, String parentPath) {
        if ("".equals(parentPath)) {
            // Top-level page: each <td> holds one independent link.
            for (Element cell : link.getElementsByTag("td")) {
                Elements anchors = cell.getElementsByAttribute("href");
                String url = anchors.attr("abs:href");
                if ("".equals(url)) {
                    // A cell without a link has nothing to crawl; storing it would
                    // queue an empty URL for the next level.
                    continue;
                }
                String code = anchors.attr("href");
                if (code.toLowerCase().endsWith(".html")) {
                    code = code.substring(0, code.indexOf(".")); // strip the extension to get the code
                }
                store.put(url, code + SEPARATOR + cell.text());
            }
            return;
        }

        String url = link.getElementsByAttribute("href").attr("abs:href"); // absolute URL of the child page
        String code = "";    // division code: the first accepted cell
        String address = ""; // parent address + text of the last accepted cell
        // All <td> cells of this row belong to the same record.
        for (Element cell : link.getElementsByTag("td")) {
            String text = cell.text();
            if (THREE_DIGITS.matcher(text).matches()) {
                continue;
            }
            if ("".equals(code)) {
                code = text;
            } else {
                address = parentPath + text;
            }
        }

        if (url == null || "".equals(url)) {
            // No link on the row: nothing deeper to crawl, keep it as a final result.
            result.put(code, address);
        } else {
            store.put(url, code + SEPARATOR + address);
        }
        // Progress output so a long run can be seen to be alive.
        System.out.println(code + "---->" + address);
    }

    /**
     * Fetches a page with an explicit timeout, retrying a few times so that one
     * transient network failure does not abort the whole crawl.
     */
    private static Document fetch(String urlStr) throws IOException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(urlStr).timeout(TIMEOUT_MILLIS).get();
            } catch (IOException e) {
                lastFailure = e;
                if (attempt < MAX_ATTEMPTS) {
                    try {
                        Thread.sleep(RETRY_PAUSE_MILLIS);
                    } catch (InterruptedException interrupted) {
                        // Restore the flag and stop retrying.
                        Thread.currentThread().interrupt();
                        throw lastFailure;
                    }
                }
            }
        }
        throw lastFailure;
    }

    /**
     * Returns the address part of a stored value.
     *
     * @param group value of the form {@code code|address}, e.g. {@code 11|北京市}
     */
    private static String getAddress(String group) {
        return group.substring(group.indexOf(SEPARATOR) + 1);
    }

    /** Debug helper: prints the entries collected from the top-level page. */
    private static void print() {
        if (tempList.isEmpty()) {
            return;
        }
        for (Map.Entry<String, String> entry : tempList.get(0).entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }
}


最终获取的文件部分内容：
[java] view plain copy
110101001001 北京市市辖区东城区东华门街道办事处多福巷社区居委会  
110101001002 北京市市辖区东城区东华门街道办事处银闸社区居委会  
110101001005 北京市市辖区东城区东华门街道办事处东厂社区居委会  
110101001006 北京市市辖区东城区东华门街道办事处智德社区居委会  
110101001007 北京市市辖区东城区东华门街道办事处南池子社区居委会  
110101001008 北京市市辖区东城区东华门街道办事处黄图岗社区居委会  
110101001009 北京市市辖区东城区东华门街道办事处灯市口社区居委会  
110101001010 北京市市辖区东城区东华门街道办事处正义路社区居委会  
110101001011 北京市市辖区东城区东华门街道办事处甘雨社区居委会  
110101001013 北京市市辖区东城区东华门街道办事处台基厂社区居委会  
110101001014 北京市市辖区东城区东华门街道办事处韶九社区居委会  
110101001015 北京市市辖区东城区东华门街道办事处王府井社区居委会  
110101002001 北京市市辖区东城区景山街道办事处隆福寺社区居委会
Java爬统计局12位区划代码

猜你喜欢