Web crawler java version

The basic idea of the Java version of the web crawler is to first fetch the page source, and then use regular expressions to extract the desired content from the page.

package xuexi;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;




/**
 * Minimal web-crawler demo: downloads the source of a page and extracts
 * substrings from it with a regular expression.
 */
public class webtest {

    /**
     * Reads the resource at {@code urlStr} and returns its text.
     *
     * <p>The content is read line by line and the lines are concatenated
     * without their line terminators. This method is best-effort: if the URL
     * is malformed, the charset name is invalid or unsupported, or an I/O
     * error occurs, the failure is reported on {@code System.err} and whatever
     * was read up to that point (possibly the empty string) is returned.
     *
     * @param urlStr  the URL to read from
     * @param charset name of the charset used to decode the bytes, e.g. {@code "utf-8"}
     * @return the decoded content with line terminators removed; never {@code null}
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            // Resolve the charset before opening the stream so a bad charset
            // name cannot leave an already-opened connection unclosed.
            Charset cs = Charset.forName(charset);
            URL url = new URL(urlStr);
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), cs))) {
                String line;
                // readLine() returns null at end of stream; test it BEFORE
                // appending so the literal text "null" never ends up in the result.
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (IOException | IllegalArgumentException e) {
            // Keep the original best-effort contract: report and return what we have.
            System.err.println("Failed to read " + urlStr + ": " + e);
        }
        return sb.toString();
    }

    /**
     * Collects every substring of {@code destStr} matched by {@code regexStr}.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return the full text of each match (group 0), in order of occurrence;
     *         empty if there is no match
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        List<String> result = new ArrayList<String>();
        Matcher m = Pattern.compile(regexStr).matcher(destStr);
        while (m.find()) {
            result.add(m.group(0));
        }
        return result;
    }

    /**
     * Demo entry point: fetches https://www.qq.com/ and prints every
     * {@code href="..."} attribute matched by the pattern below.
     *
     * @param args unused
     * @throws IOException declared for compatibility with the original signature
     */
    public static void main(String[] args) throws IOException {
        String content = getURLContent("https://www.qq.com/", "utf-8");
        List<String> list = getMatherSubstrs(content, "href=\"+[\\w./:]+\"");
        for (String a : list) {
            System.out.println(a);
        }
    }
}

Guess you like

Origin www.cnblogs.com/hzcjd/p/12227948.html