webcollector + selenium 爬取空间图片

  1 package cn.hb.util;
  2 
  3 import java.io.File;
  4 import java.io.FileNotFoundException;
  5 import java.io.FileWriter;
  6 import java.io.IOException;
  7 import java.util.ArrayList;
  8 import java.util.List;
  9 import java.util.Set;
 10 import java.util.UUID;
 11 import java.util.concurrent.TimeUnit;
 12 
 13 import org.apache.commons.io.IOUtils;
 14 import org.openqa.selenium.By;
 15 import org.openqa.selenium.Cookie;
 16 import org.openqa.selenium.JavascriptExecutor;
 17 import org.openqa.selenium.Keys;
 18 import org.openqa.selenium.WebDriver;
 19 import org.openqa.selenium.WebElement;
 20 import org.openqa.selenium.firefox.FirefoxDriver;
 21 import org.openqa.selenium.firefox.FirefoxOptions;
 22 import org.openqa.selenium.interactions.Actions;
 23 import cn.edu.hfut.dmic.webcollector.conf.Configuration;
 24 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
 25 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
 26 import cn.edu.hfut.dmic.webcollector.model.Page;
 27 import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
 28 import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
 29 import cn.edu.hfut.dmic.webcollector.util.FileUtils;
 30 
 31 /**
 32  * 爬取空间图片 selenium登录后提取链接给webcollector处理即可
 33  * 
 34  * @author tele
 35  *
 36  */
 37 public class QZoneCrawler extends BreadthCrawler {
 38     static String url = "https://user.qzone.qq.com/qq号";
 39     static String cookies = "";
 40     static final int pageSize = 98;
 41     static List<String> crawdataList = new ArrayList<String>();
 42     static File baseDir = new File("F:/qz/image");
 43 
 44     public QZoneCrawler(String crawlPath, boolean autoParse) {
 45         super(crawlPath, autoParse);
 46     }
 47 
 48     @Override
 49     public void visit(Page page, CrawlDatums next) {
 50         try {
 51             Thread.sleep(3000);
 52         } catch (InterruptedException e) {
 53             e.printStackTrace();
 54         }
 55         String name = UUID.randomUUID().toString() + ".jpg";
 56         try {
 57             FileUtils.write(new File(baseDir, name), page.content());
 58         } catch (FileNotFoundException e) {
 59             e.printStackTrace();
 60         } catch (IOException e) {
 61             e.printStackTrace();
 62         }
 63     }
 64 
 65     String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0";
 66 
 67     // 设置cookies
 68     @Override
 69     public Page getResponse(CrawlDatum crawlDatum) throws Exception {
 70         HttpRequest request = new HttpRequest(crawlDatum);
 71         request.setCookie(cookies);
 72         request.setUserAgent(userAgent);
 73         return request.responsePage();
 74     }
 75 
 76     public static void main(String[] args) throws Exception {
 77 
 78         QZoneCrawler qz = new QZoneCrawler("F:/qz/image/webcollector", true);
 79 
 80         Configuration conf = Configuration.copyDefault();
 81         conf.setAutoDetectImg(true);
 82         conf.setConnectTimeout(5000);
 83         conf.setReadTimeout(10000);
 84 
 85         // 线程爬取间隔
 86         conf.setExecuteInterval(5000);
 87         qz.setConf(conf);
 88         qz.setThreads(100);
 89 
 90         login();
 91         qz.addSeed(crawdataList);
 92         qz.start(1);
 93 
 94     }
 95 
 96     /**
 97      * 登录
 98      * 
 99      * @throws InterruptedException
100      * @throws IOException
101      */
102     public static void login() throws InterruptedException, IOException {
103         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
104 
105         FirefoxOptions options = new FirefoxOptions();
106         options.setBinary("F:/ff/firefox.exe");
107 
108         WebDriver driver = new FirefoxDriver(options);
109         driver.manage().window().maximize();
110         // 超时
111         try {
112             driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
113             driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
114             driver.get(url);
115         } catch (Exception e) {
116             System.out.println("所需元素已出现,停止加载页面");
117         } finally {
118             // 切换到登录login
119             driver.switchTo().frame("login_frame");
120 
121             WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
122             System.out.println(switcher_plogin.getText());
123             if (switcher_plogin.isDisplayed()) {
124                 switcher_plogin.click();
125             }
126             // 用户名
127             driver.findElement(By.id("u")).clear();
128             driver.findElement(By.id("u")).sendKeys("账号");
129 
130             // 密码
131             driver.findElement(By.id("p")).clear();
132             driver.findElement(By.id("p")).sendKeys("密码");
133 
134             // 登录
135             try {
136                 driver.findElement(By.id("login_button")).click();
137                 Thread.sleep(3000);
138             } catch (Exception e) {
139                 e.printStackTrace();
140             } finally {
141                 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
142                     System.out.println("登录失败!5秒后再次尝试登录");
143                     Thread.sleep(5000);
144                     driver.findElement(By.id("login_button")).click();
145                 }
146             }
147 
148             // 退出frame
149             driver.switchTo().defaultContent();
150 
151             System.out.println(driver.getCurrentUrl());
152 
153             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
154 
155             // 如果有亲密度提示
156             /*
157              * try { WebElement fs_guide = driver.findElement(By.xpath(
158              * "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"
159              * )); if(fs_guide != null && fs_guide.isDisplayed()) {
160              * fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
161              * }finally {
162              * 
163              * }
164              */
165 
166             // 点击相册
167             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
168 
169             Thread.sleep(2000);
170 
171             // 切换到frame
172             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
173 
174             // 进入图片列表(说说相册)
175             // driver.findElement(By.xpath("//ul[@class='js-album-list-ul']/li[1]/div[1]/div[1]/a")).click();
176 
177             // 拼接cookie
178             StringBuilder builder = new StringBuilder();
179             Set<Cookie> cookieSet = driver.manage().getCookies();
180             cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
181             cookies = builder.toString();
182 
183             // 获得相册列表
184             List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
185             if (photoList == null || photoList.size() == 0) {
186                 throw new RuntimeException("定位相册列表元素失败!");
187             }
188 
189             // 构造不同相册的xpath路径
190             List<String> xpathList = new ArrayList<String>();
191             for (int i = 0; i < photoList.size(); i++) {
192                 xpathList.add("//ul[@class='js-album-list-ul']/li[" + (i + 1) + "]");
193             }
194 
195             // 窗口句柄
196             List<String> allHandles = new ArrayList<String>(driver.getWindowHandles());
197 
198             // 遍历xpath
199             String newUrl = driver.getCurrentUrl();
200             for (int i = 0; i < xpathList.size(); i++) {
201                 // 打开新标签页
202                 jsExecutor.executeScript("window.open('" + newUrl + "');");
203                 allHandles = new ArrayList<String>(driver.getWindowHandles());
204 
205                 Thread.sleep(2000);
206                 String xpath = xpathList.get(i);
207 
208                 // 句柄切换需要时间
209                 driver.switchTo().window(allHandles.get(i + 1));
210                 Thread.sleep(2000);
211 
212                 List<String> urlList = getImageUrl(driver, xpath);
213                 if (urlList == null) {
214                     break;
215                 }
216                 crawdataList.addAll(urlList);
217             }
218 
219             System.out.println("所有相册图片链接提取完毕,退出浏览器");
220             driver.quit();
221 
222         }
223     }
224 
225     /**
226      * 提取图片url
227      * 
228      * @param driver
229      * @param xpath
230      * @throws InterruptedException
231      * @throws IOException
232      */
233     public static List<String> getImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {
234         List<String> urlList = new ArrayList<String>();
235 
236         // 点击相册
237         driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
238 
239         // 切换到图片的frame
240         driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
241         Thread.sleep(1000);
242 
243         // 获得相册名称
244         String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();
245 
246         //// 文件夹检测
247         File imageUrl = new File("f:/qz/" + photo_name + ".txt");
248         if (!imageUrl.getParentFile().exists()) {
249             imageUrl.mkdirs();
250         } else {
251             imageUrl.delete();
252         }
253 
254         // 获得图片总数,每页最多98张图片
255         WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));
256         String text = span.getText();
257         int count = Integer.parseInt(text);
258 
259         // 进入列表
260         driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();
261         Thread.sleep(3000);
262 
263         // 计算页数
264         int totalPage = (int) Math.ceil((double) count / (double) pageSize);
265         System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");
266 
267         FileWriter fileWriter = new FileWriter(imageUrl, true);
268 
269         for (int i = 0; i < totalPage; i++) {
270 
271             // 模拟按键加载图片
272             Actions actions = new Actions(driver);
273             for (int j = 0; j < 50; j++) {
274                 if (j % 5 == 0) {
275                     Thread.sleep(1000);
276                 }
277                 actions.sendKeys(Keys.ARROW_DOWN).perform();
278             }
279 
280             // 提取本页的image链接
281             List<WebElement> list = driver.findElements(
282                     By.xpath("//a[@class='item-cover j-pl-photoitem-imgctn']/img[@class='j-pl-photoitem-img']"));
283             if (list == null || list.size() == 0) {
284                 // 相册无权限访问或定位失败
285                 System.out.println("无法提取图片链接!");
286                 return null;
287             }
288             for (WebElement element : list) {
289                 String src = element.getAttribute("src") + "\n";
290                 IOUtils.write(src, fileWriter);
291                 System.out.println(src);
292                 // 添加链接
293                 urlList.add(src);
294             }
295             System.out.println("第" + (i + 1) + "页图片链接提取完毕");
296             Thread.sleep(1000);
297             // 跳转到下一页
298             if ((i + 2) <= totalPage) {
299                 driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();
300                 ;
301             }
302         }
303 
304         fileWriter.close();
305         return urlList;
306     }
307 
308 }
爬取结果
webcollector + selenium 爬取空间图片

猜你喜欢