Java 爬虫 Demo

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal web-scraping demo: fetches an HTML page, searches it for image addresses that
 * match a fixed pattern, and downloads every match into a local directory.
 *
 * <p>Not thread-safe: the download counter is a plain {@code int} field.
 */
class getHtml2 {

    /** Directory that downloaded images are written to. */
    private static final String SAVE_PATH = "//sdcard//Download//browser";

    /** Page scraped by {@link #main(String[])} when no URL is given on the command line. */
    private static final String DEFAULT_PAGE_URL =
            "https://m.woyaogexing.com/shouji/dongman/2018/4838.html";

    /**
     * Scheme-less image addresses of the form {@code host/digits/.../name!variant.jpg}.
     * Compiled once because the pattern never changes between calls.
     */
    private static final Pattern IMAGE_PATTERN =
            Pattern.compile("(\\w*\\.)(\\w+\\.)(\\w*)/(\\d*/)*(\\w+)!(\\w+\\.jpg)");

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /** Number of images downloaded successfully so far. */
    private int num;

    /**
     * Returns the number of images downloaded successfully so far.
     *
     * @return the current download count
     */
    public int getNum() {
        return num;
    }

    /**
     * Overwrites the download counter.
     *
     * @param num the new counter value
     */
    public void setNum(int num) {
        this.num = num;
    }

    /**
     * Downloads the resource at {@code url} into the save directory, naming the file after
     * the text that follows the last {@code '/'} of the URL. The counter is incremented only
     * when the whole download succeeds.
     *
     * <p>This method is best-effort: any failure (malformed URL, network or file error) is
     * reported on {@code System.err} and the method returns normally.
     *
     * @param url absolute URL of the image to download
     */
    public void getPicture(String url) {
        try {
            System.out.println("==========抓取网络图片 Start==========");
            String fileName = url.substring(url.lastIndexOf('/') + 1);
            // Parse the URL before touching the filesystem so a bad URL has no side effects.
            URL httpUrl = new URL(url);

            File directory = new File(SAVE_PATH);
            if (!directory.isDirectory()) {
                // mkdirs (not mkdir) so missing parent directories are created as well.
                // A failure here surfaces when the output file is opened below.
                directory.mkdirs();
            }

            // try-with-resources closes both streams even when the copy fails half-way.
            try (BufferedInputStream in = new BufferedInputStream(httpUrl.openStream());
                    FileOutputStream out = new FileOutputStream(new File(directory, fileName))) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
            System.out.println("==========抓取网络图片 End==========");
            num++;
        } catch (Exception e) {
            // Deliberate boundary catch: one failed image must not abort the whole crawl.
            System.err.println("Failed to download " + url + ": " + e);
        }
    }

    /**
     * Reads the document at {@code url} line by line, decoding it as UTF-8.
     *
     * @param url URL of the document to read
     * @return the document text, with every line terminated by {@code '\n'}
     * @throws Exception if the URL is malformed or the document cannot be read
     */
    public String getHtmlCode(String url) throws Exception {
        StringBuilder content = new StringBuilder();
        URL httpUrl = new URL(url);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(httpUrl.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Fetches the page at {@code url}, prints it, and downloads every image address found in
     * it. Each match is printed after its download attempt.
     *
     * @param url URL of the page to scan
     * @throws Exception if the page itself cannot be fetched
     */
    public void get(String url) throws Exception {
        String content = this.getHtmlCode(url);
        System.out.println(content);
        Matcher matcher = IMAGE_PATTERN.matcher(content);
        while (matcher.find()) {
            String imageAddress = matcher.group(0);
            // Matches carry no scheme, so one is prepended before downloading.
            this.getPicture("http://" + imageAddress);
            System.out.println(imageAddress);
        }
    }

    /**
     * Runs the crawler against the URL given as the first argument, or against the built-in
     * default page when no argument is supplied, then prints the number of images downloaded.
     *
     * @param args optional: the page URL to crawl
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_PAGE_URL;
        getHtml2 ge = new getHtml2();
        ge.get(url);
        System.out.println("抓取完毕，本次抓取到" + ge.getNum() + "张图片");
    }
}
猜你喜欢