Java 爬取京东评论和图片

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spider</groupId>
  <artifactId>spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spider</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single source of truth for the Java level: maven-compiler-plugin reads these two
         properties. Java 8 is the minimum because the JUnit Jupiter API requires it. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.3</version>
    </dependency>
    <!-- Test API only: keep it off the runtime classpath of the crawler. -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.0.3</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/net.sf.json-lib/json-lib -->
    <!-- json-lib is only published with a JDK classifier; without it the artifact cannot be resolved. -->
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
      <classifier>jdk15</classifier>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- 1.2.83 is the 1.x release that closes the known autoType deserialization vulnerabilities;
         the crawler parses JSON that comes from a remote server. -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.83</version>
    </dependency>
    <!-- Note: a project must not declare itself (com.spider:spider) as a dependency;
         Maven rejects such a POM as self-referencing. -->
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <!-- No <source>/<target> override here: the level comes from the maven.compiler.* properties,
           so the two declarations can no longer contradict each other. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>

代码:

package com.spider;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.*;
import java.net.URL;
import java.util.Date;
/**
 * @author 赵鑫
 * @Time: 2018/7/14
 * @Email:[email protected]
 */
public class JD {
    public static void main(String[] args) throws Exception{
        for (int i=0;i<3;i++){
        BufferedWriter bw1=new BufferedWriter(new FileWriter("spider/comment/第"+(i+1)+"页评论内容"+".txt"));
        BufferedWriter bw2=new BufferedWriter(new FileWriter("第"+(i+1)+"页评论图片连接"+".txt"));
//     1.用Jsoup解析网页
        String url="https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv73104&productId=7437788&score=0&sortType=5&page="+i+"&pageSize=10&isShadowSku=0&fold=1";
        CloseableHttpResponse indexRes = sendGet(url);
//        获取json内容,将其转换为字符串
        String indexHtml = EntityUtils.toString(indexRes.getEntity(), "UTF-8");
//        截取成json字符串
        String json2=indexHtml.substring(indexHtml.indexOf('(')+1,indexHtml.lastIndexOf(')'));
//        获取评论
        JSONArray array = JSON.parseObject(json2).getJSONArray("comments");
        for (Object item : array) {
            //获取评论中的内容
            System.out.println(JSON.parseObject(item.toString()).getString("content"));
            bw1.write(JSON.parseObject(item.toString()).getString("content"));
            JSONArray array1 = JSON.parseObject(item.toString()).getJSONArray("images");
            System.out.println(array1.size());
            for (Object item1 : array1) {
                String s=JSON.parseObject(item1.toString()).getString("imgUrl");
                System.out.println(s);
                Download("http:"+s);
                bw2.write(JSON.parseObject(item1.toString()).getString("imgUrl")+"\n");
            }
        }
        bw1.close();
        bw2.close();
        }
    }
    //发送get请求,获取响应结果
    public static CloseableHttpResponse sendGet(String url) throws IOException {
        //创建httpClient客户端
        CloseableHttpClient httpClient = HttpClients.createDefault();
        //创建请求对象,发送请求
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36");
        httpGet.setHeader("Cookie", "__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; unpl=V2_ZzNtbUBTFkV9XxUDLEkMA2IGQVpLBBRAd19ABi4QWgIwVkZZclRCFXwURlRnGloUZAEZWUVcQBRFCEdkexhdBGYBGlhLVXNILGYFAX5SCQBXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543032744.1543334183.4; __jdc=122270672; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd4zd4|tuiguang|35da9fbffaa744b68bfd3f7cd876fde5|1543334183097; PCSYCityID=412; _gcl_au=1.1.1618589019.1543334587; wlfstk_smdl=kpyxn7dgfu7ntzeriqf1nuoyf1pvmmz6; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; logintype=wx; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; npin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; pinId=KCcQw4HqqMKLC3rBXJmYjQ; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sB2NQRwVbUAJGSkkcCBliBBdXQVECWB9VS19SblEUWlkMB1tKeRpdBW4fElJBW1tLHkgSXAxsBhBiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; JSESSIONID=3AD0CE01D03F107D0B4F45BED45F806D.s1; shshshsID=52730810bd55258389660fdba736586f_15_1543337522651; __jdb=122270672.17.1011906297|4.1543334183; thor=__jdu=1011906297; shshshfpa=546abe25-2650-27ef-387c-cbbd473fdf61-1541645859; shshshfpb=010d34f55e5d106e8262dc1639c7243399901f2c0392299fd5be3a6228; ipLoc-djd=1-72-2799-0; PCSYCityID=412; _pst=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; unick=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; pin=%E9%AD%82%E5%AF%84%E6%A2%A6%E4%B9%A1; _tp=Zj9a6fHTft48nybOFAAH4sbL2FOhPiV6ww52%2BkgoNl8MBXdBGCUdKuMfROuV8QHU; jwotest_product=99; 
unpl=V2_ZzNtbRVUQBAmD0EGexlVBmJQQlsSBEQUcQxCXHxKXVFnBEFZclRCFXwURlRnGlQUZwEZXkJcRhJFCEdkexhdBGYBGlhLVXNILGYFAX1BDFlXMxFdcl9zFXQIRlx6Hl8NYTMiWnJnHk0qUh8EI1wMW1cFFlhBV0YUfQh2VUsYbE4JAl9dQ1dDHXQPRVx9KV01ZA%3d%3d; __jda=122270672.1011906297.1537011991.1543342650.1543440108.7; __jdc=122270672; thor=EBE4F58722D3C53DD96909AC59EFA6D1BC94658FBAC118C866693EA1EBF5169985038D47FF0615A96C39E195C704E4269C0AE2B142F2A6CF58BFF0E3C588B282CFE4B6DB95B893DBB7528C8A117BF09C2BB8B1A6955DEA1B2D00A191464B5CC90B094977CD8D55F54EAE17D856F65E1A4577319BD2627227472617F7462C4E24; pinId=KCcQw4HqqMKLC3rBXJmYjQ; __jdv=122270672|www.linkhaitao.com|t_1000039483_lh_rd7iam|tuiguang|f224b66c11824ba7ab6055596b0e16b5|1543440132756; 3AB9D23F7A4B3C9B=GPGP3C2BU4NAMC7CA2PZAKXKNW6757AO6KM6ENFTRAQ47S4RFIN3BMNNUQ2B3CUWAEQKVQEI5GA7Z245JTB2BIDIBU; mt_xid=V2_52007VwMTUl1QU10cQR9sUWJRElFbDAFGHkgRXRliAxMCQVFSXEtVGl8GYFYbWloIUwkceRpdBW4fElJBW1pLH0sSXwZsABRiX2hSahxMHFoMZQYSV21YV1wY; shshshfp=cc874848aa1eb0d35dfd56e0e4ba0fb3; _gcl_au=1.1.816281936.1543440149; shshshsID=d687c8924c94ae61b9d19c6054056eef_4_1543440174304; __jdb=122270672.7.1011906297|7.1543440108");
        httpGet.setHeader("Connection", "keep-alive");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response;
    }
    //下载图片
    private static void Download(String listImgSrc) throws Exception {
        try {
            String url=listImgSrc;
            String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
            URL uri = new URL(url);
            InputStream in = uri.openStream();
            FileOutputStream fo = new FileOutputStream(new File("spider/img/"+imageName));//文件输出流
            byte[] buf = new byte[1024];
            int length = 0;
            System.out.println("开始下载:" + url);
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                fo.write(buf, 0, length);
            }
            in.close();
            fo.close();
            System.out.println(imageName + "下载完成");
            Date overdate = new Date();
        } catch (Exception e) {
            System.out.println("下载失败");
        }
    }
}

猜你喜欢

转载自blog.csdn.net/qq_37668945/article/details/84640903