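// Note: some sites (e.g. Baidu) deploy dedicated anti-crawler measures, so image extraction is not guaranteed to work for every site.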
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal image scraper: downloads a web page, extracts the {@code src} of every
 * {@code <img>} tag with a regular expression, and saves each image to a local path.
 */
public class NetSpider {

    /** Page scraped by default (other candidates: http://www.mmonly.cc, http://www.photophoto.cn). */
    private static final String DEFAULT_URL = "http://www.mmonly.cc/wmtp/";

    /** Regular expression matching an {@code <img>} tag; group 1 ({@code imgUrl}) captures the src value. */
    private static final String IMG_REGEX = "<img\\b[^<>]*?\\bsrc[\\s\\t\\r\\n]*=[\\s\\t\\r\\n]*[\"\"']?[\\s\\t\\r\\n]*(?<imgUrl>[^\\s\\t\\r\\n\"\"'<>]*)[^<>]*?/?[\\s\\t\\r\\n]*>";

    /** Default path prefix under which images are saved. */
    private static final String DEFAULT_DEST_PATH = "/home/daem/桌面/picture/";

    /** Default page encoding (check the page source, or extract the charset yourself). */
    private static final String DEFAULT_ENCODING = "gbk";

    /** Copy buffer size in bytes. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Entry point. Optional arguments override the built-in defaults:
     * {@code args[0]} page URL, {@code args[1]} destination path prefix,
     * {@code args[2]} page encoding. With no arguments the defaults are used.
     *
     * @param args optional overrides as described above; may be empty or null
     * @throws Exception declared for compatibility; I/O failures are reported, not rethrown
     */
    public static void main(String[] args) throws Exception {
        String urlStr = argOrDefault(args, 0, DEFAULT_URL);
        String destPath = argOrDefault(args, 1, DEFAULT_DEST_PATH);
        String encode = argOrDefault(args, 2, DEFAULT_ENCODING);
        List<String> picUrl = srcWeb(getNetContent(urlStr, encode), IMG_REGEX, 1); // extracted image URLs
        getPicture(picUrl, destPath); // download the images and save them locally
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index && args[index] != null) ? args[index] : fallback;
    }

    /**
     * Downloads every URL in the list and saves it as {@code destPath + <last path segment>}.
     * Each URL is printed to standard output before it is processed. A failure on one URL
     * (malformed URL, I/O error, no usable file name) is reported and does not stop the
     * remaining downloads. Missing parent directories of the target file are created.
     *
     * @param picUrl   image URLs; a null list is treated as empty, null entries are skipped
     * @param destPath path prefix prepended to the image file name (normally a directory
     *                 ending with a separator)
     */
    public static void getPicture(List<String> picUrl, String destPath) {
        if (picUrl == null) {
            return;
        }
        for (String temp : picUrl) {
            System.out.println(temp);
            if (temp == null) {
                continue;
            }
            // split() drops trailing empty segments, so "/" or "//" yields an empty array;
            // guard against that instead of indexing blindly.
            String[] segments = temp.split("/");
            if (segments.length == 0 || segments[segments.length - 1].isEmpty()) {
                System.err.println("Skipping URL without a file name: " + temp);
                continue;
            }
            String name = segments[segments.length - 1]; // file name = last path segment

            File target = new File(destPath + name);
            File parent = target.getParentFile();
            if (parent != null) {
                parent.mkdirs(); // best effort; a failure surfaces when the file is opened
            }

            try {
                URL url = new URL(temp);
                // try-with-resources closes (and thereby flushes) both streams, even on error.
                try (BufferedInputStream bis = new BufferedInputStream(url.openStream());
                     BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(target))) {
                    byte[] buffer = new byte[BUFFER_SIZE];
                    int len;
                    while ((len = bis.read(buffer)) != -1) {
                        bos.write(buffer, 0, len);
                    }
                }
            } catch (IOException e) { // includes MalformedURLException
                e.printStackTrace();
            }
        }
    }

    /**
     * Collects the given capture group of every regex match found in the text.
     *
     * @param urlStr   the text to search (despite the name, this is the page content, not a URL)
     * @param regexStr regular expression to apply
     * @param group    index of the capture group to collect from each match
     * @return the captured values in match order; empty if nothing matches
     */
    public static List<String> srcWeb(String urlStr, String regexStr, int group) {
        List<String> list = new ArrayList<>();
        Matcher m = Pattern.compile(regexStr).matcher(urlStr);
        while (m.find()) {
            list.add(m.group(group));
        }
        return list;
    }

    /**
     * Reads the content at the given URL as text, normalising every line ending to CRLF,
     * and echoes the full content to standard output once it has been read completely.
     * On a malformed URL or I/O error the stack trace is printed and whatever was read
     * so far (possibly an empty string) is returned.
     *
     * @param urlStr address of the page to read
     * @param encode charset name used to decode the page
     * @return the page content, each line terminated by {@code "\r\n"}
     */
    public static String getNetContent(String urlStr, String encode) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources guarantees the reader (and underlying connection stream) is closed.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(), encode))) {
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line).append("\r\n");
                }
            }
            System.out.println(sb.toString());
        } catch (IOException e) { // includes MalformedURLException and unsupported encodings
            e.printStackTrace();
        }
        return sb.toString();
    }
}