继前篇文章介绍如何使用Java爬取百姐视频之后,就想着如何获取糗百段子。在仔细研究了其规则之后,总算成功爬取了。中间也遇到了一些问题,例如网页源码和实际获取到的内容不一致,当时被困扰了很久,改了几次匹配规则,还好最终解决了。下面直接放码(Talk is cheap. Show me the code)。
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import cn.cxd.tools.FileTools;
/**
 * Small crawler that collects text jokes from qiushibaike.com and appends them to a local file.
 *
 * <p>Flow: the paginated listing pages are scanned for article paths, then every article page is
 * fetched and the first joke body found on it is appended to the output file.
 */
public class WebSpiderDemo2 {

    /** Listing URL prefix; the page number and a trailing slash are appended to it. */
    private static final String LIST_URL_PREFIX = "https://www.qiushibaike.com/text/page/";

    /** Site root that the relative article paths are appended to. */
    private static final String SITE_ROOT = "https://www.qiushibaike.com";

    /** First and last listing page to scan (both inclusive). */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 13;

    /** File the jokes are appended to. */
    private static final String DEST_PATH = "D:/joke.txt";

    /** Charset used to decode every downloaded page. */
    private static final String PAGE_CHARSET = "UTF-8";

    /** Browser-like user agent sent when requesting listing pages. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";

    /** Only this many leading lines of an article page are searched for the joke. */
    private static final int MAX_ARTICLE_LINES = 500;

    /** Only this many leading lines of a listing page are searched for article links. */
    private static final int MAX_LIST_LINES = 3000;

    /** A line holding the joke body starts with this tag. */
    private static final String JOKE_LINE_PREFIX = "<span class=\"big-begin\">";

    /** Captures everything that follows the joke tag on the same line. */
    private static final Pattern JOKE_PATTERN = Pattern.compile(JOKE_LINE_PREFIX + "(.+)");

    /** Relative path of an article page. */
    private static final Pattern ARTICLE_PATH_PATTERN = Pattern.compile("/article/\\d+");

    /** Only lines containing this attribute are inspected for article links. */
    private static final String LINK_MARKER = "target=\"_blank\"";

    /**
     * Crawls the listing pages, then downloads every collected article and appends its joke to
     * {@code D:/joke.txt}.
     *
     * @param args unused
     * @throws Exception if any page cannot be fetched or the output file cannot be written;
     *                   the first failure aborts the run
     */
    public static void main(String[] args) throws Exception {
        Set<String> articlePaths = new HashSet<>();
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            articlePaths.addAll(getArticleUrl(LIST_URL_PREFIX + page + "/"));
        }

        BufferedWriter bw = null;
        try {
            // Append mode: repeated runs add to the file instead of overwriting it.
            // FileWriter encodes with the JVM's default charset.
            bw = new BufferedWriter(new FileWriter(new File(DEST_PATH), true));
            for (String path : articlePaths) {
                saveToLocal(SITE_ROOT + path, bw);
            }
        } finally {
            // Close the writer even when one of the downloads fails.
            FileTools.close(bw);
        }
    }

    /**
     * Downloads one article page and appends the first joke found on it to {@code bw}, followed
     * by an empty line. Nothing is written when no joke line is found within the first
     * {@code MAX_ARTICLE_LINES} lines.
     *
     * @param articleUrl absolute URL of the article page
     * @param bw         writer the joke is appended to; it is flushed but not closed here
     * @throws Exception if the page cannot be read or the writer fails
     */
    private static void saveToLocal(String articleUrl, BufferedWriter bw) throws Exception {
        InputStream is = null;
        BufferedReader br = null;
        try {
            is = new URL(articleUrl).openStream();
            // Decode explicitly instead of relying on the platform default charset, so the text
            // is decoded the same way as the listing pages of this site.
            br = new BufferedReader(new InputStreamReader(is, PAGE_CHARSET));

            for (int lineNo = 0; lineNo < MAX_ARTICLE_LINES; lineNo++) {
                String line = br.readLine();
                if (null == line) {
                    break; // end of page reached before the line limit
                }
                if (!line.startsWith(JOKE_LINE_PREFIX)) {
                    continue;
                }
                Matcher matcher = JOKE_PATTERN.matcher(line);
                if (matcher.find()) {
                    // Drop the closing tag and turn HTML line breaks into spaces.
                    String text = matcher.group(1).replace("</span>", "").replace("<br/>", " ");
                    bw.write(text);
                    bw.newLine();
                    bw.newLine();
                    break; // only the first joke of the page is kept
                }
            }
            bw.flush();
        } finally {
            // Closing the reader also closes the stream; the stream is passed as well so it is
            // released even if the reader could not be created.
            FileTools.close(br, is);
        }
    }

    /**
     * Fetches one listing page and returns the article paths found on it.
     *
     * <p>Only lines containing {@code target="_blank"} are inspected, only the first
     * {@code /article/<digits>} match of such a line is used, and a path is returned only if it
     * was matched on at least two lines. NOTE(review): presumably each real article is linked
     * more than once on the listing page - confirm against the live markup.
     *
     * @param source absolute URL of the listing page
     * @return relative article paths such as {@code /article/123}; never {@code null}
     * @throws Exception if the page cannot be fetched or read
     */
    public static Set<String> getArticleUrl(String source) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(source).openConnection();
        conn.setRequestMethod("GET");
        conn.setRequestProperty("user-agent", USER_AGENT);

        Set<String> seenOnce = new HashSet<>();
        Set<String> repeated = new HashSet<>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(conn.getInputStream(), PAGE_CHARSET));
            for (int lineNo = 0; lineNo < MAX_LIST_LINES; lineNo++) {
                String line = br.readLine();
                if (null == line) {
                    break; // end of page reached before the line limit
                }
                if (!line.contains(LINK_MARKER)) {
                    continue;
                }
                Matcher matcher = ARTICLE_PATH_PATTERN.matcher(line);
                // add() returns false when the path was already recorded by an earlier line.
                if (matcher.find() && !seenOnce.add(matcher.group(0))) {
                    repeated.add(matcher.group(0));
                }
            }
        } finally {
            FileTools.close(br);
        }
        return repeated;
    }
}
其中的FileTools为一个工具类,用于关闭Java的IO流,其代码如下:
import java.io.Closeable;
import java.io.IOException;
/**
 * Helper for closing I/O resources without propagating close-time failures.
 */
public class FileTools {

    /**
     * Closes every given resource in argument order.
     *
     * <p>{@code null} entries are skipped, and a {@code null} argument array is treated as
     * "nothing to close". An {@link IOException} thrown by one resource is printed to standard
     * error and does not prevent the remaining resources from being closed.
     *
     * @param close resources to close; may be empty, may contain {@code null}, may be
     *              {@code null}
     */
    public static void close(Closeable... close) {
        // A caller passing a null array (e.g. a cast null) would otherwise hit an NPE in the loop.
        if (null == close) {
            return;
        }
        for (Closeable io : close) {
            if (null == io) {
                continue;
            }
            try {
                io.close();
            } catch (IOException e) {
                // Best effort: report the failure and keep closing the rest.
                e.printStackTrace();
            }
        }
    }
}
最后爬取结果如下
注:此代码只能作为学习交流之用,千万不能作恶,千万不能作恶,千万不能作恶,千万不能作恶,千万不能作恶,千万不能作恶。