抓取 开发者头条 分享的所有文章

使用 HttpClient 和 jsoup 抓取开发者头条中分享的所有文章(截至目前共 15000 多条)。

数据: 点击下载
代码: 点击下载




/**
 * Crawls the daily archive pages of toutiao.io and appends one CSV row per
 * shared article to a local file.
 *
 * <p>Each row has the form {@code date,rank,title,originLink,link}, where
 * {@code link} is the href found on the archive page and {@code originLink}
 * is the redirect target that href points to (empty when it could not be
 * resolved).
 */
public class ToutiaoArticles {

	/** Prefix of the daily archive URL; the date ({@code yyyy-MM-dd}) is appended. */
	private static final String BASE_URL = "http://toutiao.io/prev/";

	/** File the CSV rows are appended to. */
	private static final String OUTPUT_PATH = "D://data.csv";

	/** Encoding used when writing the CSV file. */
	private static final String OUTPUT_CHARSET = "GBK";

	/** Upper bound for both the total and the per-route pooled connections. */
	private static final int MAX_CONNECTIONS = 5;

	public static void main(String[] args) {
		new ToutiaoArticles().fetch();
	}

	/**
	 * Walks every day from 2014-09-27 up to and including today, downloads that
	 * day's archive page and appends the articles found there to the CSV file.
	 *
	 * <p>A failure on a single day is reported on stderr and does not stop the
	 * crawl. The connection pool is shut down once the crawl ends.
	 */
	public void fetch() {
		LocalDate endDate = LocalDate.now();
		File outputFile = new File(OUTPUT_PATH);

		PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
		mgr.setMaxTotal(MAX_CONNECTIONS);
		mgr.setDefaultMaxPerRoute(MAX_CONNECTIONS);
		HttpClient httpClient = HttpClientBuilder.create().setConnectionManager(mgr).build();

		try {
			for (LocalDate day = new LocalDate(2014, 9, 27); !day.isAfter(endDate); day = day.plusDays(1)) {
				fetchDay(httpClient, day, outputFile);
			}
		} finally {
			// Release pooled sockets; the original never shut the manager down.
			mgr.shutdown();
		}
	}

	/**
	 * Downloads the archive page of one day and appends its articles to {@code outputFile}.
	 *
	 * @param httpClient client used for the archive page request
	 * @param day        the day whose archive page is fetched
	 * @param outputFile CSV file that receives the rows (append mode)
	 */
	private void fetchDay(HttpClient httpClient, LocalDate day, File outputFile) {
		String date = day.toString("yyyy-MM-dd");
		String url = BASE_URL + date;
		System.out.println("[URL]-----" + url);

		HttpGet httpGet = new HttpGet(url);
		try {
			List<Link> linkInfos = httpClient.execute(httpGet, new PageResponseHandler());
			if (linkInfos == null) {
				// The handler returns null for non-2xx responses and empty bodies.
				return;
			}
			StringBuilder rows = new StringBuilder();
			for (int i = 0; i < linkInfos.size(); i++) {
				Link k = linkInfos.get(i);
				String data = date + "," + (i + 1) + "," + csvField(k.getTitle())
						+ "," + csvField(k.getOriginLink()) + "," + csvField(k.getLink());
				System.out.println(data);
				rows.append(data).append("\r\n");
			}
			FileUtils.writeStringToFile(outputFile, rows.toString(), OUTPUT_CHARSET, true);
		} catch (Exception e) {
			// Best effort: report the failing day and carry on with the next one.
			System.err.println("Failed to fetch " + url);
			e.printStackTrace();
		} finally {
			httpGet.releaseConnection();
		}
	}

	/**
	 * Renders a value as a single CSV field.
	 *
	 * <p>{@code null} becomes an empty field instead of the text "null". Values
	 * containing a comma, a double quote or a line break are wrapped in double
	 * quotes with embedded quotes doubled, so they cannot split or merge columns.
	 *
	 * @param value raw field value, may be {@code null}
	 * @return the value in a form that is safe to join with commas
	 */
	private static String csvField(String value) {
		if (value == null) {
			return "";
		}
		boolean needsQuoting = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
				|| value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
		if (!needsQuoting) {
			return value;
		}
		return "\"" + value.replace("\"", "\"\"") + "\"";
	}

	/**
	 * Removes the toutiao.io referral markers from a redirect target.
	 *
	 * <p>Uses literal replacement: the previous {@code replaceAll} treated the
	 * dot in "toutiao.io" as a regex wildcard. A "?" left dangling at the end
	 * after the removal is dropped as well.
	 *
	 * @param location value of the {@code Location} header, not {@code null}
	 * @return the location without the referral markers
	 */
	private static String stripTrackingParams(String location) {
		String cleaned = location
				.replace("hmsr=toutiao.io", "")
				.replace("&utm_medium=toutiao.io", "")
				.replace("&utm_source=toutiao.io", "");
		if (cleaned.endsWith("?")) {
			cleaned = cleaned.substring(0, cleaned.length() - 1);
		}
		return cleaned;
	}

	/**
	 * Turns one archive page into a list of {@link Link}s: every anchor with
	 * {@code target="_blank"} becomes a link, and each href is requested once
	 * with redirects disabled so the {@code Location} header can be recorded as
	 * the origin link.
	 */
	class PageResponseHandler implements ResponseHandler<List<Link>> {

		/**
		 * @param response the archive page response
		 * @return the links found on the page, or {@code null} when the status
		 *         code is 300 or above or the response has no body
		 */
		@Override
		public List<Link> handleResponse(HttpResponse response) throws ClientProtocolException, IOException {

			HttpEntity entity = response.getEntity();

			if (response.getStatusLine().getStatusCode() >= 300) {
				EntityUtils.consume(entity);
				return null;
			}

			if (entity == null) {
				return null;
			}

			// Fall back to UTF-8 (instead of the library default) when the
			// response does not declare a charset.
			String html = EntityUtils.toString(entity, "UTF-8");
			Document document = Jsoup.parse(html);
			Elements links = document.getElementsByAttributeValue("target", "_blank");

			// Redirects are disabled so the 3xx response itself is visible.
			RequestConfig requestConfig = RequestConfig.custom().setRedirectsEnabled(false).build();
			PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
			mgr.setMaxTotal(MAX_CONNECTIONS);
			mgr.setDefaultMaxPerRoute(MAX_CONNECTIONS);
			HttpClient noRedirectClient = HttpClientBuilder.create()
					.setDefaultRequestConfig(requestConfig)
					.setConnectionManager(mgr)
					.build();

			List<Link> linkInfos = new ArrayList<Link>();
			try {
				for (int i = 0; i < links.size(); i++) {
					Link lk = new Link();
					lk.setLink(links.get(i).attr("href"));
					lk.setTitle(links.get(i).text());
					lk.setOriginLink(resolveOriginLink(noRedirectClient, lk.getLink()));
					linkInfos.add(lk);
				}
			} finally {
				// This pool is created per page, so it must be released per page.
				mgr.shutdown();
			}

			return linkInfos;
		}

		/**
		 * Requests {@code link} without following redirects and returns the
		 * cleaned redirect target.
		 *
		 * @param client client configured with redirects disabled
		 * @param link   href taken from the archive page
		 * @return the redirect target without referral markers, or {@code null}
		 *         when the response is not a redirect, carries no
		 *         {@code Location} header, or the request fails
		 */
		private String resolveOriginLink(HttpClient client, String link) {
			HttpGet httpGet = null;
			try {
				// Constructed inside the try: a malformed href throws here and
				// must not abort the whole page.
				httpGet = new HttpGet(link);
				HttpResponse httpResponse = client.execute(httpGet);
				int status = httpResponse.getStatusLine().getStatusCode();
				if (status >= 300 && status < 400 && httpResponse.containsHeader("Location")) {
					return stripTrackingParams(httpResponse.getLastHeader("Location").getValue());
				}
			} catch (Exception e) {
				// Best effort: the link is still recorded, just without an origin.
				System.err.println("Failed to resolve origin link for " + link);
				e.printStackTrace();
			} finally {
				if (httpGet != null) {
					httpGet.releaseConnection();
				}
			}
			return null;
		}

	}

	/** One shared article as listed on an archive page. */
	class Link {
		/** Anchor text of the article link. */
		private String title;
		/** The href found on the archive page. */
		private String link;
		/** Redirect target of {@link #link}; {@code null} when it was not resolved. */
		private String originLink;

		public String getTitle() {
			return title;
		}
		public void setTitle(String title) {
			this.title = title;
		}
		public String getLink() {
			return link;
		}
		public void setLink(String link) {
			this.link = link;
		}
		public String getOriginLink() {
			return originLink;
		}
		public void setOriginLink(String originLink) {
			this.originLink = originLink;
		}
	}

}

猜你喜欢

转载自rensanning.iteye.com/blog/2314282