pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the "spider" crawler (single jar module). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spider</groupId>
  <artifactId>spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spider</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single place where the Java language level is chosen; the compiler plugin below refers to these. -->
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.3</version>
    </dependency>
    <!-- NOTE(review): compile scope puts the JUnit 5 API on the main classpath; presumably this
         should be test scope like junit above. Confirm that no main code imports it before changing. -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.0.3</version>
      <scope>compile</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/net.sf.json-lib/json-lib -->
    <!-- json-lib 2.4 is published only with a JDK classifier (jdk13 / jdk15); without one the
         artifact cannot be resolved. -->
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
      <classifier>jdk15</classifier>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.53</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <!-- Take the level from the properties above so the two declarations cannot contradict each other. -->
          <source>${maven.compiler.source}</source>
          <target>${maven.compiler.target}</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
代码:
package com.spider;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.net.URL;
import java.util.Date;
/**
* @author 赵鑫
* @Time: 2018/7/14
* @Email:[email protected]
*/
public class JD {
public static void main(String[] args) throws Exception{
for (int i=0;i<3;i++){
BufferedWriter bw1=new BufferedWriter(new FileWriter("spider/comment/第"+(i+1)+"页评论内容"+".txt"));
BufferedWriter bw2=new BufferedWriter(new FileWriter("第"+(i+1)+"页评论图片连接"+".txt"));
// 1.用Jsoup解析网页
String url="https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv73104&productId=7437788&score=0&sortType=5&page="+i+"&pageSize=10&isShadowSku=0&fold=1";
CloseableHttpResponse indexRes = sendGet(url);
// 获取json内容,将其转换为字符串
String indexHtml = EntityUtils.toString(indexRes.getEntity(), "UTF-8");
// 截取成json字符串
String json2=indexHtml.substring(indexHtml.indexOf('(')+1,indexHtml.lastIndexOf(')'));
// 获取评论
JSONArray array = JSON.parseObject(json2).getJSONArray("comments");
for (Object item : array) {
//获取评论中的内容
System.out.println(JSON.parseObject(item.toString()).getString("content"));
bw1.write(JSON.parseObject(item.toString()).getString("content"));
JSONArray array1 = JSON.parseObject(item.toString()).getJSONArray("images");
System.out.println(array1.size());
for (Object item1 : array1) {
String s=JSON.parseObject(item1.toString()).getString("imgUrl");
System.out.println(s);
Download("http:"+s);
bw2.write(JSON.parseObject(item1.toString()).getString("imgUrl")+"\n");
}
}
bw1.close();
bw2.close();
}
}
//发送get请求,获取响应结果
public static CloseableHttpResponse sendGet(String url) throws IOException {
//创建httpClient客户端
CloseableHttpClient httpClient = HttpClients.createDefault();
//创建请求对象,发送请求
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36");
httpGet.setHeader("Cookie", "__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; unpl=V2_ZzNtbUBTFkV9XxUDLEkMA2IGQVpLBBRAd19ABi4QWgIwVkZZclRCFXwURlRnGloUZAEZWUVcQBRFCEdkexhdBGYBGlhLVXNILGYFAX5SCQBXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543032744.1543334183.4; __jdc=122270672; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd4zd4|tuiguang|35da9fbffaa744b68bfd3f7cd876fde5|1543334183097; PCSYCityID=412; _gcl_au=1.1.1618589019.1543334587; wlfstk_smdl=kpyxn7dgfu7ntzeriqf1nuoyf1pvmmz6; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; logintype=wx; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; npin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; pinId=KCcQw4HqqMKLC3rBXJmYjQ; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sB2NQRwVbUAJGSkkcCBliBBdXQVECWB9VS19SblEUWlkMB1tKeRpdBW4fElJBW1tLHkgSXAxsBhBiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; JSESSIONID=3AD0CE01D03F107D0B4F45BED45F806D.s1; shshshsID=52730810bd55258389660fdba736586f_15_1543337522651; __jdb=122270672.17.1011906297|4.1543334183; thor=__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; PCSYCityID=412; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; jwotest_product=99; 
unpl=V2_ZzNtbRVUQBAmD0EGexlVBmJQQlsSBEQUcQxCXHxKXVFnBEFZclRCFXwURlRnGlQUZwEZXkJcRhJFCEdkexhdBGYBGlhLVXNILGYFAX1BDFlXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543342650.1543440108.7; __jdc=122270672; thor=EBE4F58722D3C53DD96909AC59EFA6D1BC94658FBAC118C866693EA1EBF5169985038D47FF0615A96C39E195C704E4269C0AE2B142F2A6CF58BFF0E3C588B282CFE4B6DB95B893DBB7528C8A117BF09C2BB8B1A6955DEA1B2D00A191464B5CC90B094977CD8D55F54EAE17D856F65E1A4577319BD2627227472617F7462C4E24; pinId=KCcQw4HqqMKLC3rBXJmYjQ; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd7iam|tuiguang|f224b66c11824ba7ab6055596b0e16b5|1543440132756; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sUWJRElFbDAFGHkgRXRliAxMCQVFSXEtVGl8GYFYbWloIUwkceRpdBW4fElJBW1pLH0sSXwZsABRiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; _gcl_au=1.1.816281936.1543440149; shshshsID=d687c8924c94ae61b9d19c6054056eef_4_1543440174304; __jdb=122270672.7.1011906297|7.1543440108");
httpGet.setHeader("Connection", "keep-alive");
CloseableHttpResponse response = httpClient.execute(httpGet);
return response;
}
//下载图片
private static void Download(String listImgSrc) throws Exception {
try {
String url=listImgSrc;
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
InputStream in = uri.openStream();
FileOutputStream fo = new FileOutputStream(new File("spider/img/"+imageName));//文件输出流
byte[] buf = new byte[1024];
int length = 0;
System.out.println("开始下载:" + url);
while ((length = in.read(buf, 0, buf.length)) != -1) {
fo.write(buf, 0, length);
}
in.close();
fo.close();
System.out.println(imageName + "下载完成");
Date overdate = new Date();
} catch (Exception e) {
System.out.println("下载失败");
}
}
}