抓取解析ip并入库

  • 以下代码用于抓取并解析 IP 地址,以便入库分析;代码仅供参考,未做任何处理(如异常处理)。
  • public static void main(String[] args) {
    String crawl_url = "http://www.cnblogs.com/xioxu/archive/2009/05/03/1448322.html";
    StringBuilder sb = new StringBuilder();

    try{
    URL instance = new URL(crawl_url);
    URLConnection con = instance.openConnection();
    BufferedReader bufferReader = new BufferedReader(new InputStreamReader(con.getInputStream()));
    String readLine;
    while((readLine = bufferReader.readLine()) != null){
    // cache in memory
    sb.append(readLine);
    }
    }catch(Exception ex){

    }

    parseIp(sb.toString());
    }

    /**
     * Finds every record of the form {@code startIp__endIp__text__text} in the given
     * page text, prints each one, converts the records with {@code toIpList}, and
     * prints each resulting {@code Ip}.
     *
     * @param html the page text to scan; must not be {@code null}
     */
    private static void parseIp(String html){
        List<String> records = new ArrayList<String>();
        // An IPv4 octet has one to three digits. The previous "\\d{2,}" skipped every
        // address containing a one-digit octet (e.g. 58.14.0.0). The lookbehind stops
        // a match from starting in the middle of a longer digit run.
        String ipv4 = "((\\d{1,3}\\.){3}\\d{1,3})";
        String regex = "(?<!\\d)" + ipv4 + "__" + ipv4 + "__([^a-zA-Z]{2,})__([^a-zA-Z]{2,})";
        Matcher matcher = Pattern.compile(regex).matcher(html);
        while (matcher.find()) {
            // The trailing [^a-zA-Z] group can run into the '<' and '/' of a following
            // tag; strip those characters from the record.
            String record = matcher.group().replaceAll("[\\<\\/]", "");
            records.add(record);
            System.out.println(record);
        }

        for (Ip ip : toIpList(records)) {
            System.out.println(ip.toString());
        }
    }

    /**
     * Builds one {@code Ip} per input line.
     *
     * <p>Each line is split with {@code toIpArray}; elements 0 to 3 of the result are
     * assigned to the start IP, end IP, province and route type, in that order.
     *
     * @param list the record lines to convert
     * @return a new list holding one {@code Ip} per line, in input order
     */
    private static List<Ip> toIpList(List<String> list){
        List<Ip> result = new ArrayList<Ip>();
        for (String record : list) {
            String[] fields = toIpArray(record);
            Ip entry = new Ip();
            entry.setStartIp(fields[0]);
            entry.setEndIp(fields[1]);
            entry.setProvince(fields[2]);
            entry.setRouteType(fields[3]);
            result.add(entry);
        }
        return result;
    }

    /**
     * Splits a record on the {@code "__"} separator into a fixed-size array.
     *
     * <p>The segment after the last separator is stored too; previously it was
     * dropped, so the fourth field was always {@code null}. At most five segments are
     * kept and any further ones are ignored, instead of overflowing the array.
     *
     * @param line the record to split; must not be {@code null}
     * @return an array of length 5 holding the segments in order; unused trailing
     *         slots are {@code null}
     */
    private static String[] toIpArray(String line){
        final String separator = "__";
        String[] fields = new String[5];
        int count = 0;
        int from = 0;
        int pos;

        // Bound the write index so over-long input cannot overflow the array.
        while (count < fields.length && (pos = line.indexOf(separator, from)) > -1) {
            fields[count++] = line.substring(from, pos);
            from = pos + separator.length();
        }

        // Keep the text after the last separator. An empty remainder (empty line, or
        // a line ending in the separator) is not stored, as before.
        if (count < fields.length && from < line.length()) {
            fields[count] = line.substring(from);
        }

        return fields;
    }

猜你喜欢

转载自sunwch.iteye.com/blog/1565458