我们使用webmagic爬取网站时,最大的难点不是webmagic的使用,而是各大网站的反爬虫机制,比如内容需要登录后才可见,或者限制同一IP一天中的访问次数、访问频率。今天我们就用webdriver来实现自动登录CSDN,拿到登录后的cookies,从而模拟登录。
首先在pom.xml中加入依赖:
<dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-selenium</artifactId> <version>0.7.3</version> </dependency>
import java.util.Iterator; import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.http.client.CookieStore; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.cookie.BasicClientCookie; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; public class Main { public static void main(String[] args) throws Exception { // 初始化参数据 System.setProperty("webdriver.chrome.driver", "C:/bin/chromedriver.exe"); WebDriver driver = new ChromeDriver(); String baseUrl = "https://passport.csdn.net/account/login"; // 加载url driver.get(baseUrl); // 等待加载完成 driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS); // 获取页面元素 WebElement elemUsername = driver.findElement(By.name("username")); WebElement elemPassword = driver.findElement(By.name("password")); WebElement btn = driver.findElement(By.className("logging")); WebElement rememberMe = driver.findElement(By.id("rememberMe")); // 操作页面元素 elemUsername.clear(); elemPassword.clear(); elemUsername.sendKeys("username"); elemPassword.sendKeys("password"); rememberMe.click(); btn.click(); // 提交表单 //btn.submit(); Thread.sleep(5000); //driver.get("http://msg.csdn.net/"); Thread.sleep(5000); // 获取cookies //driver.manage().getCookies(); Set<org.openqa.selenium.Cookie> cookies = driver.manage().getCookies(); System.out.println("Size: " + cookies.size()); Iterator<org.openqa.selenium.Cookie> itr = cookies.iterator(); CookieStore cookieStore = new BasicCookieStore(); while (itr.hasNext()) { Cookie cookie = itr.next(); BasicClientCookie bcco = new BasicClientCookie(cookie.getName(), cookie.getValue()); bcco.setDomain(cookie.getDomain()); bcco.setPath(cookie.getPath()); cookieStore.addCookie(bcco); } } }
如此便能拿到登录后的cookie。后续需要访问该网站的其他网页时,只需将拿到的cookie放到请求中“骗过”服务器即可。