Java web crawler. The basic idea is to first fetch the page content, and then extract the desired parts of the page using regular expressions.
// NOTE(review): the original header was garbled ("Package Penalty for xuexi");
// restore "package xuexi;" here if this file lives under the xuexi/ package directory.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler: downloads a page's HTML source and extracts
 * substrings (href links in {@link #main}) with a regular expression.
 */
public class webtest {

    /**
     * Downloads the page at {@code urlStr} and returns its source as one string
     * (line separators are dropped, as in the original).
     *
     * @param urlStr  URL of the page to fetch
     * @param charset name of the charset used to decode the response bytes
     * @return the page source, or whatever was read before an error occurred
     *         (empty string if nothing was read)
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources closes the underlying stream even on failure
            // (the original leaked the reader)
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), Charset.forName(charset)))) {
                String line;
                // Test for EOF BEFORE appending: the original do/while appended the
                // terminal null, which StringBuffer renders as the literal text "null".
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (Exception e) {
            // Best-effort crawler: report the failure and return what was read so far.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Returns every substring of {@code destStr} matched by {@code regexStr}.
     *
     * @param destStr  text to search
     * @param regexStr regular expression to apply
     * @return all matches in order of occurrence; empty list if none
     */
    // Original declaration "(deststr String, ...)" had the type and name reversed
    // and did not match the "destStr" used in the body — fixed to compile.
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        List<String> result = new ArrayList<String>();
        Pattern p = Pattern.compile(regexStr);
        Matcher m = p.matcher(destStr);
        while (m.find()) {
            result.add(m.group(0));
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        String content = getURLContent("https://www.qq.com/", "utf-8");
        // href="..." where the URL consists of word chars, dots, slashes, colons
        List<String> list = getMatherSubstrs(content, "href=\"+[\\w./:]+\"");
        for (String a : list) {
            System.out.println(a);
        }
    }
}