Java爬虫获取天猫商品类目

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/pengjunlee/article/details/85317438

https://blog.csdn.net/pengjunlee/article/details/85257375 

import java.text.MessageFormat;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.script.ScriptException;

import org.apache.commons.lang.StringEscapeUtils;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import com.wpp.dc.task.common.utils.SpiderHttpUtils;
import com.wpp.dc.task.tmall.dao.CategoryDao;
import com.wpp.dc.task.tmall.domain.CategoryEntity;

/**
 * Crawls the Tmall/Taobao category tree from the open.taobao.com API tools
 * pages and persists every category through {@link CategoryDao}.
 *
 * <p>
 * {@link #getFirstLevelCategories()} stores the top-level categories;
 * {@link #getFollowedLevelCategories()} then walks down level by level,
 * starting from whatever is already stored.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
public class TmallCategoryTest {

	private static final Gson gson = new Gson();

	/**
	 * Extracts the escaped JSON payload that the API tools page embeds in the
	 * JavaScript assignment {@code var cid1_api = '0|...|0';}.
	 */
	private static final Pattern FIRST_LEVEL_PATTERN = Pattern.compile("var cid1_api = '0\\|([\\s\\S]*)\\|0';");

	/**
	 * Exclusive upper bound for the parent level visited while descending the
	 * tree (parents of level 1..9 are expanded).
	 */
	private static final int MAX_LEVEL = 10;

	@Autowired
	private CategoryDao dao;

	/**
	 * Crawls the first-level categories and saves each one with level 1.
	 *
	 * <p>
	 * If the page does not contain the expected {@code cid1_api} assignment,
	 * nothing is saved.
	 *
	 * @throws JSONException
	 *             if the embedded payload is not the expected JSON structure
	 * @throws IllegalStateException
	 *             if the HTTP helper returns no response body
	 */
	@Test
	public void getFirstLevelCategories() throws ScriptException, JSONException {

		// Page that embeds the first-level category list
		String requestUrl = "http://open.taobao.com/apitools/apiPropTools.htm";

		// Send the request
		String htmlStr = SpiderHttpUtils.sendGet(false, requestUrl, null, buildHeaders(), "utf-8");
		// Print the response body
		System.out.println(htmlStr);
		requireResponse(htmlStr, requestUrl);

		// Cut the useful part out of the page with the regular expression
		Matcher mat = FIRST_LEVEL_PATTERN.matcher(htmlStr);
		if (mat.find()) {
			saveWithLevel(parseCategories(mat.group(1)), dao, 1);
		}
	}

	/**
	 * Crawls the categories below the first level.
	 *
	 * <p>
	 * For each level, loads the stored categories via
	 * {@code dao.findCategories(true, level)} (presumably those flagged as
	 * parents -- confirm against the DAO) and saves their children one level
	 * deeper. Stops at the first level that yields no categories.
	 *
	 * @throws JSONException
	 *             if a response is not the expected JSON structure
	 */
	@Test
	public void getFollowedLevelCategories() throws JSONException {
		// Request URL template; {0} is replaced by the parent category id
		String requestUrl = "http://open.taobao.com/apitools/ajax_props.do?_tb_token_=3b54e6e53e65e&cid={0}&act=childCid&restBool=false";

		Map<String, String> headers = buildHeaders();

		// Descend level by level, fetching the children of each stored category
		for (int level = 1; level < MAX_LEVEL; level++) {
			List<CategoryEntity> parents = dao.findCategories(true, level);
			if (parents.isEmpty()) {
				break;
			}
			for (CategoryEntity parent : parents) {
				saveChildrenCategories(parent, dao, requestUrl, headers, level + 1);
			}
		}
	}

	/**
	 * Fetches the direct children of {@code parent} and saves each of them with
	 * the given {@code level}.
	 *
	 * @param parent
	 *            category whose children are requested
	 * @param dao
	 *            DAO used to persist the children
	 * @param requestUrl
	 *            {@link MessageFormat} pattern whose {@code {0}} placeholder
	 *            receives the parent's cid
	 * @param headers
	 *            HTTP request headers
	 * @param level
	 *            level assigned to every child that is saved
	 * @throws JSONException
	 *             if the response is not the expected JSON structure
	 * @throws IllegalStateException
	 *             if the HTTP helper returns no response body
	 */
	public void saveChildrenCategories(CategoryEntity parent, CategoryDao dao, String requestUrl,
			Map<String, String> headers, int level) throws JSONException {
		// Convert the cid to a String first: MessageFormat renders Number
		// arguments with the locale's NumberFormat (e.g. "50,011,740"), which
		// would corrupt the URL if getCid() is numeric.
		String formatUrl = MessageFormat.format(requestUrl, String.valueOf(parent.getCid()));
		// Send the request
		String htmlStr = SpiderHttpUtils.sendGet(false, formatUrl, null, headers, "utf-8");
		// Print the response body
		System.out.println(htmlStr);
		requireResponse(htmlStr, formatUrl);

		// Use the level supplied by the caller; saving children as level 1
		// would prevent the level-by-level descent from ever finding them.
		saveWithLevel(parseCategories(htmlStr), dao, level);
	}

	/**
	 * Builds the browser-like request headers shared by all requests. The
	 * Cookie value is a placeholder that has to be replaced by a real cookie.
	 */
	private static Map<String, String> buildHeaders() {
		Map<String, String> headers = new HashMap<String, String>();
		headers.put("Host", "open.taobao.com");
		headers.put("Connection", "keep-alive");
		headers.put("Cache-Control", "max-age=0");
		headers.put("Upgrade-Insecure-Requests", "1");
		headers.put("User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
		headers.put("Cookie", "你的Cookie");
		return headers;
	}

	/**
	 * Fails with a descriptive message when a request produced no body, instead
	 * of letting a bare NullPointerException surface later.
	 */
	private static void requireResponse(String body, String url) {
		if (body == null) {
			throw new IllegalStateException("No response received from " + url);
		}
	}

	/**
	 * Unescapes a raw payload and converts the array found at
	 * {@code itemcats_get_response.item_cats.item_cat} into category entities.
	 */
	private static List<CategoryEntity> parseCategories(String escapedJson) throws JSONException {
		// Undo the extra escaping layer: \" becomes " and \\/ becomes /
		String newStr = escapedJson.replaceAll("\\\\\"", "\"").replaceAll("\\\\\\\\\\/", "\\/");
		String finalJsonStr = StringEscapeUtils.unescapeJava(newStr);

		// Navigate to the category array
		JSONObject response = new JSONObject(finalJsonStr).getJSONObject("itemcats_get_response");
		JSONArray itemList = response.getJSONObject("item_cats").getJSONArray("item_cat");

		// Map the JSON array onto a list of entities
		return gson.fromJson(itemList.toString(), new TypeToken<List<CategoryEntity>>() {
		}.getType());
	}

	/** Assigns {@code level} to every category and saves it through the DAO. */
	private static void saveWithLevel(List<CategoryEntity> cats, CategoryDao dao, int level) {
		for (CategoryEntity cat : cats) {
			cat.setLevel(level);
			dao.saveCategory(cat);
		}
	}

}

猜你喜欢

转载自blog.csdn.net/pengjunlee/article/details/85317438