Csdn爬虫自动评论

前言:最近发现自己的csdn博客被机器人自动评论。这些机器人账号大多是靠评论别人的文章来换取对方的回访、点赞或关注,所以总积分通常非常高。为了覆盖掉这些机器评论,本篇主要用java爬虫实现对自己博客的所有文章进行自动评论。

一、准备分析工作

工具:webmagic

材料:由文件加载的自动随机评论语


1、创建一个可以随机生成评论语、并能从文件自动加载评论语的类CommentLoad。

/**
 * Loads candidate comment texts from a classpath resource and caches them, re-reading the
 * resource once the refresh interval has elapsed. Also provides a small helper that shuffles
 * an array of phrases and concatenates them into one comment string.
 */
public class CommentLoad {

	// Shared generator: java.util.Random is thread-safe, and reusing one instance avoids
	// re-seeding on every call.
	private static final Random RANDOM = new Random();

	// Default refresh interval: 20 seconds
	private static final long DEFAULT_REFRESH_TIME = 20000L;
	private static final String DEFAULT_PATH = "comment.txt";

	// Set to true after the first load attempt has completed
	private final AtomicBoolean inited = new AtomicBoolean(false);

	// Cached comment lines; volatile because loadComments() reads it without holding the lock
	private volatile List<String> urls = new ArrayList<>();

	// Time of the most recent load attempt
	private long beforeTime;

	// Time after which the cache is considered stale; volatile for the unlocked check in
	// loadComments()
	private volatile long endTime;

	// Refresh interval in milliseconds
	private long refreshTime = DEFAULT_REFRESH_TIME;

	public CommentLoad() {
	}

	/**
	 * @param refreshTime interval in milliseconds after which the comment resource is re-read
	 */
	public CommentLoad(long refreshTime) {
		super();
		this.refreshTime = refreshTime;
	}

	/**
	 * Demo entry point: polls the comment list once per second and prints it, which makes the
	 * periodic reload visible. Runs until the process is stopped.
	 */
	public static void main(String[] args) throws InterruptedException, IOException {

		// Feature 1): load the comment list from the resource file
		String path = DEFAULT_PATH;
		CommentLoad commentLoad = new CommentLoad();
		int i = 0;
		while (true) {
			Thread.sleep(1000L);
			List<String> list = commentLoad.loadComments(path);
			System.out.println("计数时间:" + ++i);
			System.out.println(list.size());
			System.out.println(list);
		}

		// Feature 2): generate a batch of comments and write them to the file.
		// The file is written as UTF-8 because doReadComments() reads it back as UTF-8.
		// path =
		// CommentLoad.class.getClassLoader().getResource(path).getPath();
		//
		// System.out.println(path);
		//
		// // write the generated comments to the comment file
		// PrintWriter printWriter = new PrintWriter(path, "UTF-8");
		// String[] str = new String[] { "文章", "很好", "思路清晰,", "大佬", "66", "加油",
		// "学习了", "你真棒!" };
		// for (int i = 0; i < 50; i++) {
		// // System.out.println(flushArrToString(str));
		// printWriter.println(flushArrToString(str));
		// printWriter.flush();
		// }
		// printWriter.close();
	}

	/**
	 * Shuffles {@code arr} in place and returns the concatenation of its elements in the new
	 * order.
	 *
	 * @param arr the elements to shuffle; the array itself is reordered
	 * @return the elements' {@code toString()} values joined without a separator
	 */
	public static <T> String flushArrToString(T[] arr) {
		// Fisher-Yates: slot i receives an element drawn from [0, i] inclusive. Drawing from
		// [0, i) instead would forbid an element from staying where it is, so for example a
		// two-element array would always come back swapped.
		for (int i = arr.length - 1; i > 0; i--) {
			int j = createRandom(i + 1);
			T temp = arr[j];
			arr[j] = arr[i];
			arr[i] = temp;
		}
		StringBuilder builder = new StringBuilder();
		for (T t : arr) {
			builder.append(t.toString());
		}
		return builder.toString();
	}

	/**
	 * Returns a random int in {@code [0, end)}.
	 *
	 * @param end exclusive upper bound; must be positive
	 * @throws IllegalArgumentException if {@code end} is not positive
	 */
	public static int createRandom(int end) {
		return RANDOM.nextInt(end);
	}

	/**
	 * Returns the cached comment list, (re)loading it first when nothing has been loaded yet or
	 * the refresh interval has elapsed.
	 *
	 * @param path classpath resource name; {@code null} selects the default {@code comment.txt}
	 * @return the most recently loaded comments (empty if the first load failed)
	 */
	public List<String> loadComments(String path) {
		String resource = path == null ? DEFAULT_PATH : path;
		if (!inited.get() || System.currentTimeMillis() > this.endTime) {
			readComments(resource);
		}
		return urls;
	}

	/**
	 * Reloads the cache under the instance lock. The staleness condition is re-checked inside
	 * the lock so that callers racing through loadComments() trigger only one reload.
	 */
	private synchronized void readComments(String path) {
		if (!inited.get() || System.currentTimeMillis() > this.endTime) {
			try {
				// Honour the requested resource instead of always reading the default file.
				urls = doReadComments(path);
			} catch (IOException e) {
				// Best effort: keep serving the previous list and try again after the next
				// refresh interval.
				e.printStackTrace();
			}
			this.beforeTime = System.currentTimeMillis();
			this.endTime = beforeTime + this.refreshTime;
			inited.set(true);
		}
	}

	/**
	 * Reads the resource line by line as UTF-8, trimming each line and skipping blank ones so
	 * that an empty comment is never handed out.
	 *
	 * @throws FileNotFoundException if the resource is not on the classpath
	 * @throws IOException if reading fails
	 */
	private List<String> doReadComments(String path) throws FileNotFoundException, IOException {
		// Open the resource as a stream: converting the resource URL to a file path fails for
		// URL-encoded locations and dereferences null when the resource is missing.
		java.io.InputStream in = CommentLoad.class.getClassLoader().getResourceAsStream(path);
		if (in == null) {
			throw new FileNotFoundException("comment resource not found on classpath: " + path);
		}
		List<String> comments = new ArrayList<>();
		try (BufferedReader reader = new BufferedReader(
				new java.io.InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
			String line;
			while ((line = reader.readLine()) != null) {
				String comment = line.trim();
				if (!comment.isEmpty()) {
					comments.add(comment);
				}
			}
		}
		return comments;
	}

}

此类的主要功能就是从指定的文件path加载评论语到list列表

2、有了评论语,先进行一个评论的测试

经过测试,评论文章只需要知道文章id,并带上登录态(cookie)即可。

	// Single-comment smoke test: POST one comment to one article.
	String commentText = "这个文章非常好啊"; // comment body
	String targetArticleId = "109261723"; // id of the article to comment on

	// Form fields sent to the comment-submit endpoint; commentId is sent empty.
	Map<String, Object> params = new HashMap<>();
	params.put("commentId", "");
	params.put("content", commentText);
	params.put("articleId", targetArticleId);

	Request request = new Request("https://blog.csdn.net/phoenix/web/v1/comment/submit");
	request.setMethod(HttpConstant.Method.POST);
	request.setRequestBody(HttpRequestBody.form(params, "utf-8"));

	// The processor must be configured with a logged-in cookie.
	Spider spider = Spider.create(new ComentTest());
	spider.addRequest(request).thread(1).run();

3、单篇评论测试通过后,要对多篇文章评论,关键就是从哪里采集需要评论的文章。

待评论的文章可以从最近发表博客的列表等途径获取。本次批量评论采用对单个博主的全部文章逐一评论的方式,文章列表从 https://blog.csdn.net/用户名/article/list/页码 逐页采集。

/**
	 * Entry point: comments automatically on every article of a single blogger.
	 */
	public static void main(String[] args) {
		final int page = 3; // number of article-list pages this user has
		final String user = "shuixiou1"; // CSDN user name

		// One seed URL per article-list page.
		String[] alls = createInitUrls(user, page);

		Spider spider = Spider.create(new CsdnConmentSpider());
		spider.addUrl(alls).thread(1).run();
	}

	/**
	 * Builds the seed URLs: one article-list URL for each page from 1 to {@code page}.
	 */
	private static String[] createInitUrls(String user, int page) {
		// The prefix is the same for every page, so format it once.
		String prefix = String.format(listUrl, user);
		List<String> seeds = new ArrayList<>();
		int pageNo = 1;
		while (pageNo <= page) {
			seeds.add(prefix + pageNo);
			pageNo++;
		}
		return seeds.toArray(new String[seeds.size()]);
	}

二、完整代码

1、代码

package com.pc.demos.csdn;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.pc.util.CookieUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.HttpConstant;

/**
 * WebMagic page processor that posts one randomly chosen comment on every article of a single
 * CSDN blogger.
 *
 * <p>Requests are told apart by URL: article-list pages yield article links, article detail
 * pages yield a POST to the comment-submit endpoint, and any other response is treated as the
 * result of such a POST.
 */
public class CsdnConmentSpider implements PageProcessor {

	private static final Logger logger = LoggerFactory.getLogger(CsdnConmentSpider.class);

	// Article-list URL template; %s is the user name and the page number is appended
	private static final String listUrl = "https://blog.csdn.net/%s/article/list/";

	// Pattern matching article-list URLs
	private static final String listUrlRegex = "https://blog\\.csdn\\.net/(.+)/article/list/(.*)";

	// Pattern matching article detail URLs
	private static final String detailUrlRegex = "https://blog\\.csdn\\.net/(.+)/article/details/(.*)";

	// Endpoint that accepts the comment form
	private static final String commentSubmitUrl = "https://blog.csdn.net/phoenix/web/v1/comment/submit";

	// Source of comment texts
	private CommentLoad commentLoad = new CommentLoad();

	// Reused for picking a comment instead of allocating a generator per article
	private final Random random = new Random();

	// Built once; getSite() hands out the same configuration on every call
	private final Site site = buildSite();

	@Override
	public void process(Page page) {
		String url = page.getRequest().getUrl();
		if (url.matches(listUrlRegex)) {
			processListPage(page);
		} else if (url.matches(detailUrlRegex)) {
			processDetailPage(page, url);
		} else {
			// Response of a comment POST
			String res = page.getJson().jsonPath("$..data").toString();
			System.out.println("评论成功:返回id是" + res);
		}
	}

	/**
	 * Queues every article link found on an article-list page.
	 */
	private void processListPage(Page page) {
		List<String> anchors = page.getHtml()
				.xpath("//div[@class='article-item-box csdn-tracking-statistics']/h4/a").all();
		for (String anchor : anchors) {
			String link = Jsoup.parse(anchor).select("a").attr("href");
			// attr() returns an empty string when the attribute is absent; do not queue that.
			if (link != null && !link.trim().isEmpty()) {
				page.addTargetRequest(link);
			}
		}
	}

	/**
	 * Queues a comment POST for the article shown on a detail page.
	 */
	private void processDetailPage(Page page, String url) {
		System.out.println("详情页面加载:" + url);

		String articleId = extractArticleId(url);
		if (articleId.isEmpty()) {
			logger.warn("Could not extract an article id from {}", url);
			return;
		}

		List<String> comments = commentLoad.loadComments(null);
		// Random.nextInt(0) throws, so an empty comment list has to be handled explicitly.
		if (comments == null || comments.isEmpty()) {
			logger.warn("No comment texts available; skipping article {}", articleId);
			return;
		}

		Map<String, Object> params = new HashMap<>();
		params.put("commentId", "");
		params.put("content", comments.get(random.nextInt(comments.size())));
		params.put("articleId", articleId);

		Request request = new Request(commentSubmitUrl);
		request.setMethod(HttpConstant.Method.POST);
		request.setRequestBody(HttpRequestBody.form(params, "utf-8"));

		Map<String, Object> extras = new HashMap<>();
		extras.put("articleId", articleId);
		request.setExtras(extras);
		page.addTargetRequest(request);
	}

	/**
	 * Returns the last path segment of a detail URL, ignoring any query string or fragment
	 * (for example {@code .../details/123?spm=x} yields {@code 123}).
	 */
	private static String extractArticleId(String url) {
		String path = url;
		int query = path.indexOf('?');
		if (query >= 0) {
			path = path.substring(0, query);
		}
		int fragment = path.indexOf('#');
		if (fragment >= 0) {
			path = path.substring(0, fragment);
		}
		return path.substring(path.lastIndexOf('/') + 1);
	}

	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Creates the crawl configuration: retry count, delay between requests, request headers
	 * and the login cookies.
	 */
	private static Site buildSite() {
		Site site = Site.me().setCycleRetryTimes(3).setSleepTime(2000);
		// The HTTP/2 pseudo-headers (":authority", ":method", ":path", ":scheme") are
		// deliberately not added: they are not valid header field names and cannot be sent as
		// ordinary headers.
		site.addHeader("accept", "application/json, text/javascript, */*; q=0.01");
		// "br" is not advertised because nothing in this class decodes a brotli response body.
		site.addHeader("accept-encoding", "gzip, deflate");
		site.addHeader("accept-language", "zh-CN,zh;q=0.9");
		site.addHeader("origin", "https://blog.csdn.net");
		site.addHeader("referer", "https://blog.csdn.net");

		// Cookie string of a logged-in session (placeholder; must be replaced before use)
		String cookieSpec = "################";

		CookieUtil.setSiteCookies(site, cookieSpec);

		return site;
	}

	/**
	 * Entry point: comments automatically on every article of a single blogger.
	 *
	 * @param args optional: {@code args[0]} is the CSDN user name (default {@code shuixiou1}),
	 *             {@code args[1]} is the number of article-list pages (default 3)
	 */
	public static void main(String[] args) {

		String user = args != null && args.length > 0 ? args[0] : "shuixiou1"; // CSDN user
		int page = args != null && args.length > 1 ? Integer.parseInt(args[1]) : 3; // list pages

		String[] alls = createInitUrls(user, page);

		Spider.create(new CsdnConmentSpider()).addUrl(alls).thread(1).run();
	}

	/**
	 * Builds the seed URLs: one article-list URL for each page from 1 to {@code page}.
	 */
	private static String[] createInitUrls(String user, int page) {
		List<String> urls = new ArrayList<>();
		for (int i = 1; i <= page; i++) {
			urls.add(String.format(listUrl, user) + i);
		}
		String[] result = urls.toArray(new String[urls.size()]);
		return result;
	}
}

2、 使用说明

经过一轮测试,没有被频率限制

1) 必须要设置登录后的cookie字符串 (代码中已经替换成######################)

2) 需要拿去使用的注意改写csdn博主名称!!!!

扫描二维码关注公众号,回复: 12858652 查看本文章

 

猜你喜欢

转载自blog.csdn.net/shuixiou1/article/details/114371765