// java期末作业 (Java final-term assignment)
package main; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.io.DataInputStream; import java.io.DataOutputStream; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; public class Wrom { //定义爬取网站,方便获取url所以定义两个 static String baseurl = "http://www.acgwallpaper.com"; static String geturl = "http://www.acgwallpaper.com/"; static String filepath = "E:\\testwrom";//定义图片存放的文件夹 static int j = 0;//计数用的变量 public static void main(String[] args) { System.out.println("初始下载页面:" + geturl); String html = getHtml(geturl); //从url解析得到html的body List<String> href_list = getNextUrl(html); //将href和图片所在网址构成新的url存在链表list for (int i = 0; i < href_list.size(); i++) { System.out.println("图片下载界面:" + href_list.get(i)); String newhtml = getHtml(href_list.get(i));//解析得到由href和网址构成的新的html的body List<String> photo_list = getImgSrcListFromHtml(newhtml);//查找src路径 downloadImg(photo_list, filepath); //下载图片 } System.out.println("下载完毕"); } public static String getHtml(String url){ String html = ""; try { html = Jsoup.connect(url).timeout(5000).execute().body();//测试发现有时会断开连接,故设置连接时间为5s } catch (IOException e) { e.printStackTrace(); } return html; } public static List<String> getImgSrcListFromHtml(String html) { List<String> list = new ArrayList<>();//存放图片超链接 Document doc = Jsoup.parse(html);//解析html页面 Elements elements = doc.select("img[src$=.jpg]");//获取目标尾缀是.jpg的图像 System.out.println("此页面图像个数:" + elements.size()); for(int i = 0; i < elements.size(); i++) { list.add(elements.get(i).attr("src"));//将图片超链接放入String链表内,方便下载 } return list; } public static List<String> getNextUrl(String html){ String forget="/"; List<String> list = new ArrayList<>(); Document document = Jsoup.parse(html);//解析成html页面 Elements elements = document.select("div [class=artwork-block 
col-xs-6 col-sm-4 col-md-3]").select("a");//获取目标 System.out.println("页面抓取个数" + elements.size()); for (int i = 0;i < elements.size(); i++) { String url = baseurl + elements.get(i).attr("href")+ forget; //图片所在的网页url list.add(url); } return list; } public static void downloadImg(List <String> list, String filepath){ URL newUrl = null; HttpURLConnection hconnection = null; try { for(int i = 0; i < list.size(); i++, j++) { String newlist = baseurl + list.get(i); System.out.println(newlist); String filename = "/img" + j + ".jpg";//文件命名,不能重名 System.out.println(filename); newUrl = new URL(newlist); hconnection = (HttpURLConnection) newUrl.openConnection(); //打开连接 DataInputStream in = new DataInputStream(hconnection.getInputStream());//获取输入流对象 DataOutputStream out = new DataOutputStream(new FileOutputStream(filepath + filename));//输出流 byte[] buffer = new byte[4096]; int count = 0; //将输入流以字节的形式读取并写入buffer中 while ((count = in.read(buffer)) > 0) { out.write(buffer, 0, count); } //关闭该关的东西 out.close(); in.close(); hconnection.disconnect(); } } catch (Exception e) { System.out.println("失误"); } } }