import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes the image references out of a web page and downloads them to a local directory.
 *
 * <p>Images are discovered with regular expressions applied to the raw page text, looking at
 * {@code src=} and {@code background=} attributes whose value ends in {@code .jpg}, {@code .png}
 * or {@code .gif}. No HTML parsing is performed.
 *
 * @author 方小洲
 *
 * Created 2013-8-5 8:46:54 AM
 */
public class ImageParse {

    /**
     * Matches a quoted {@code src}/{@code background} value that is a site-relative image path
     * (optionally starting with a slash). Group 1 is the path, including any leading slash.
     */
    private static final Pattern RELATIVE_IMAGE = Pattern.compile(
            "(?:src|background)=['\"](/?(?:[\\w-]+/)*[\\w-]+\\.(?:jpg|png|gif))['\"]",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches a quoted {@code src}/{@code background} value that is a fully qualified
     * http or https image URL. Group 1 is the URL.
     */
    private static final Pattern ABSOLUTE_IMAGE = Pattern.compile(
            "(?:src|background)=['\"](https?://(?:[\\w-]+\\.)+[\\w-]+(?::[0-9]+)*(?:/[\\w-]+)*"
                    + "/[\\w-]+\\.(?:jpg|png|gif))['\"]",
            Pattern.CASE_INSENSITIVE);

    /** Size of the chunk used when copying a downloaded image to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Collects the image addresses referenced by the page at the given URL.
     *
     * <p>Relative references are listed first, resolved against {@code url} using standard URL
     * resolution; fully qualified references follow, returned exactly as written in the page.
     *
     * @param url address of the page to scan
     * @return the image addresses found, in match order; empty if none were found
     * @throws MalformedURLException if {@code url} is not a valid URL
     * @throws Exception if the page cannot be read
     */
    public List<String> getImagesPath(String url) throws Exception {
        List<String> imagePaths = new ArrayList<String>();
        String htmlCode = getHtmlCode(url);
        URL pageUrl = new URL(url);

        // References without a host: resolve them against the page address rather than
        // concatenating strings, so trailing slashes and page file names are handled correctly.
        Matcher matcher = RELATIVE_IMAGE.matcher(htmlCode);
        while (matcher.find()) {
            imagePaths.add(new URL(pageUrl, matcher.group(1)).toString());
        }

        // References that already carry the full address.
        matcher = ABSOLUTE_IMAGE.matcher(htmlCode);
        while (matcher.find()) {
            imagePaths.add(matcher.group(1));
        }
        return imagePaths;
    }

    /**
     * Fetches the text of the page at the given URL.
     *
     * <p>Lines are concatenated without line separators, and the bytes are decoded with the
     * platform default charset.
     *
     * @param url address of the page to fetch
     * @return the page text with line breaks removed
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getHtmlCode(String url) throws Exception {
        StringBuilder html = new StringBuilder();
        URL httpUrl = new URL(url);
        BufferedReader reader = new BufferedReader(new InputStreamReader(httpUrl.openStream()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        } finally {
            // Always release the underlying connection, even when reading fails.
            reader.close();
        }
        return html.toString();
    }

    /**
     * Downloads every image referenced by the page at the given URL.
     *
     * @param targetUrl address of the page to scan
     * @param outputPath directory the images are written to
     * @throws Exception if the page or any image cannot be fetched or written
     */
    public void downLoadImages(String targetUrl, String outputPath) throws Exception {
        List<String> imagePaths = getImagesPath(targetUrl);
        for (String imagePath : imagePaths) {
            generatorImageBathByUrl(imagePath, outputPath);
        }
    }

    /**
     * Downloads a single image into the output directory.
     *
     * <p>The file is named after the current time in milliseconds plus the extension of
     * {@code imagePath}; if that name is already taken the number is incremented until a free
     * name is found. The output directory is created when it does not exist.
     *
     * @param imagePath address of the image to download
     * @param outputPath directory the image is written to
     * @throws IOException if the output directory cannot be created
     * @throws Exception if the image cannot be fetched or written
     */
    public void generatorImageBathByUrl(String imagePath, String outputPath) throws Exception {
        File outputDir = new File(outputPath);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory: " + outputPath);
        }
        File target = uniqueTarget(outputPath, extensionOf(imagePath));

        URL imageUrl = new URL(imagePath);
        BufferedInputStream bis = new BufferedInputStream(imageUrl.openStream());
        try {
            FileOutputStream fos = new FileOutputStream(target);
            try {
                // Copy in chunks; writing single bytes to an unbuffered file stream is very slow.
                byte[] buffer = new byte[COPY_BUFFER_SIZE];
                int count;
                while ((count = bis.read(buffer)) != -1) {
                    fos.write(buffer, 0, count);
                }
            } finally {
                fos.close();
            }
        } finally {
            bis.close();
        }
    }

    /** Returns the extension of the last path segment including the dot, or "" if it has none. */
    private static String extensionOf(String imagePath) {
        int dot = imagePath.lastIndexOf('.');
        if (dot < 0 || dot < imagePath.lastIndexOf('/')) {
            return "";
        }
        return imagePath.substring(dot);
    }

    /** Picks a timestamp-based file in the directory that does not exist yet. */
    private static File uniqueTarget(String outputPath, String extension) {
        long stamp = System.currentTimeMillis();
        File candidate = new File(outputPath + "/" + stamp + extension);
        while (candidate.exists()) {
            // Two downloads within the same millisecond must not overwrite each other.
            stamp++;
            candidate = new File(outputPath + "/" + stamp + extension);
        }
        return candidate;
    }

    public static void main(String[] args) throws Exception {
        ImageParse parse = new ImageParse();
        parse.downLoadImages("http://www.fjboda.cn", "d:\\image");
    }
}
java 抓取网页图片
猜你喜欢
转载自fxz-2008.iteye.com/blog/1920920
今日推荐
周排行