Java爬取糗百段子

继前篇文章介绍如何使用Java爬取百姐视频之后，我就想着如何获取糗百的段子。在仔细研究了其页面规则之后，总算成功爬取了。中间也遇到了一些问题，比如网页源码和程序实际获取到的内容不一致的问题，当时被困扰了很久，改了几次匹配规则才得以解决。下面直接放码（Talk is cheap, show me the code）。

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cn.cxd.tools.FileTools;

/**
 * Small crawler that collects text jokes from qiushibaike.com and appends them to a local file.
 *
 * <p>It first scans a fixed number of listing pages for article paths, then downloads every
 * article page and writes the first line that carries the joke text to the output file.
 */
public class WebSpiderDemo2 {

	/** Listing URL prefix; the page number and a trailing slash are appended to it. */
	private static final String LIST_URL_PREFIX = "https://www.qiushibaike.com/text/page/";

	/** Site root that the relative article paths are appended to. */
	private static final String SITE_ROOT = "https://www.qiushibaike.com";

	/** Number of listing pages to scan (pages 1..PAGE_COUNT). */
	private static final int PAGE_COUNT = 13;

	/** Output file; jokes are appended to it. */
	private static final String DEST_PATH = "D:/joke.txt";

	/** Charset used to decode every downloaded page. */
	private static final String PAGE_CHARSET = "UTF-8";

	/** Browser-like user agent sent when requesting listing pages. */
	private static final String USER_AGENT =
			"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";

	/** Marker that the line holding the joke text starts with. */
	private static final String JOKE_LINE_PREFIX = "<span class=\"big-begin\">";

	/** Captures everything that follows the marker on the same line. */
	private static final Pattern JOKE_PATTERN = Pattern.compile(JOKE_LINE_PREFIX + "(.+)");

	/** Matches a relative article path such as {@code /article/123}. */
	private static final Pattern ARTICLE_PATH_PATTERN = Pattern.compile("/article/\\d+");

	/** At most this many lines of an article page are inspected. */
	private static final int MAX_ARTICLE_LINES = 500;

	/** At most this many lines of a listing page are inspected. */
	private static final int MAX_LISTING_LINES = 3000;

	/**
	 * Collects the article paths of all listing pages and appends each article's joke text to
	 * the output file.
	 *
	 * @param args unused
	 * @throws Exception if a page cannot be fetched or the output file cannot be written
	 */
	public static void main(String[] args) throws Exception {

		// A set de-duplicates paths that show up on more than one listing page.
		Set<String> articlePaths = new HashSet<>();
		for (int page = 1; page <= PAGE_COUNT; page++) {
			articlePaths.addAll(getArticleUrl(LIST_URL_PREFIX + page + "/"));
		}

		// try-with-resources closes the file even when one of the downloads fails.
		// Note: FileWriter encodes with the platform default charset.
		try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(DEST_PATH), true))) {
			for (String path : articlePaths) {
				saveToLocal(SITE_ROOT + path, bw);
			}
		}
	}

	/**
	 * Downloads one article page and writes its joke text, followed by an empty line, to
	 * {@code bw}. Nothing is written when no matching line is found within the inspected lines.
	 *
	 * @param articleUrl absolute URL of the article page
	 * @param bw writer the text is appended to; it is flushed but not closed
	 * @throws Exception if the page cannot be read or the writer fails
	 */
	private static void saveToLocal(String articleUrl, BufferedWriter bw) throws Exception {

		URL url = new URL(articleUrl);

		// Decode explicitly as UTF-8 (the same charset the listing pages are read with) instead
		// of the platform default, and make sure the stream is released on every exit path.
		try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(), PAGE_CHARSET))) {
			String line;
			for (int i = 0; i < MAX_ARTICLE_LINES && (line = br.readLine()) != null; i++) {
				if (!line.startsWith(JOKE_LINE_PREFIX)) {
					continue;
				}
				Matcher matcher = JOKE_PATTERN.matcher(line);
				if (matcher.find()) {
					// Strip the closing tag and turn HTML line breaks into two spaces.
					String text = matcher.group(1).replace("</span>", "").replace("<br/>", "  ");
					bw.write(text);
					bw.newLine();
					bw.newLine();
					// Only the first matching line is wanted.
					break;
				}
			}
		}

		bw.flush();
	}

	/**
	 * Fetches a listing page and extracts relative article paths from it.
	 *
	 * <p>Only lines containing {@code target="_blank"} are inspected, and only the first
	 * {@code /article/<digits>} match on such a line is considered. A path is returned only
	 * once it has been seen on at least two such lines.
	 *
	 * @param source absolute URL of the listing page
	 * @return the set of article paths found; never {@code null}, possibly empty
	 * @throws Exception if the page cannot be fetched or read
	 */
	public static Set<String> getArticleUrl(String source) throws Exception {

		HttpURLConnection conn = (HttpURLConnection) new URL(source).openConnection();
		conn.setRequestMethod("GET");
		conn.setRequestProperty("user-agent", USER_AGENT);

		Set<String> seenOnce = new HashSet<>();
		Set<String> repeated = new HashSet<>();

		try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), PAGE_CHARSET))) {
			String line;
			for (int i = 0; i < MAX_LISTING_LINES && (line = br.readLine()) != null; i++) {
				if (!line.contains("target=\"_blank\"")) {
					continue;
				}
				Matcher matcher = ARTICLE_PATH_PATTERN.matcher(line);
				// Set.add returns false when the path was already recorded, i.e. this is a
				// repeated occurrence and the path qualifies for the result.
				if (matcher.find() && !seenOnce.add(matcher.group(0))) {
					repeated.add(matcher.group(0));
				}
			}
		}

		return repeated;
	}

}

其中的FileTools为一个工具类，用于关闭Java的IO，其代码如下：

import java.io.Closeable;
import java.io.IOException;

/**
 * Helper for closing I/O resources without propagating close failures to the caller.
 */
public class FileTools {

	/**
	 * Closes the given resources in argument order.
	 *
	 * <p>{@code null} elements are skipped, and a {@code null} array is treated as "nothing to
	 * close". An {@link IOException} thrown by one resource is printed to standard error and
	 * does not prevent the remaining resources from being closed.
	 *
	 * @param close resources to close; may be {@code null} or contain {@code null} elements
	 */
	public static void close(Closeable... close) {

		// An explicit null array (e.g. close((Closeable[]) null)) would otherwise make the
		// for-each loop throw a NullPointerException.
		if (null == close) {
			return;
		}

		for (Closeable io : close) {
			if (null == io) {
				continue;
			}
			try {
				io.close();
			} catch (IOException e) {
				// Best effort: report the failure and keep closing the remaining resources.
				e.printStackTrace();
			}
		}
	}
}

最后爬取结果如下

注：此代码只能作为学习交流之用，千万不能作恶，千万不能作恶，千万不能作恶，千万不能作恶，千万不能作恶，千万不能作恶。

猜你喜欢