爬取京东手机商品信息

在练习爬虫项目时,遇到了一个知识点分享给大家。

我们在爬取商品价格时,有时页面的标签内并没有直接包含价格,价格是通过 Ajax 异步请求加载的。

所以在爬取这类数据时,需要单独请求价格接口,并用 Gson 解析返回的结果。Gson 是谷歌提供的、专门用于解析 JSON 的库,可以将 JSON 串转换成一个对象,从而获取我们要爬取的价格。


核心代码如下:

/**
 * Program entry point: processes the landing search page first, then the
 * paginated search pages.
 *
 * @param args command-line arguments (not read)
 * @throws Exception propagated from {@code parserIndex()} or {@code dopaging()}
 */
public static void main(String[] args) throws Exception {
    // Step 1: parse the landing search page and handle its product list.
    parserIndex();
    // Step 2: parse the paginated search pages and handle their product lists.
    dopaging();
}

/**
 * Iterates over 100 search-result pages, printing each page URL, downloading
 * the page and handing the HTML to {@code getSearchResultInfo}.
 *
 * <p>The {@code page} query parameter sent to the site is {@code 2 * page - 1},
 * i.e. the odd numbers 1, 3, 5, ... 199.
 *
 * @throws Exception propagated from {@code getHtml}
 */
private static void dopaging() throws Exception {
    final String pagingUrlPrefix = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=";
    for (int page = 1; page <= 100; page++) {
        String url = pagingUrlPrefix + (2 * page - 1);
        System.out.println(url);
        getSearchResultInfo(getHtml(url));
    }
}

/**
 * Downloads the landing search page and passes its HTML to
 * {@code getSearchResultInfo}.
 *
 * @throws Exception propagated from {@code getHtml}
 */
private static void parserIndex() throws Exception {
    // Fixed URL of the first search-result page.
    getSearchResultInfo(getHtml(
            "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=4462d633d7774a1dafc55419260fae59"));
}

/**
 * Parses a search-result page and processes every product listed on it.
 *
 * <p>Each {@code li} carrying a {@code data-pid} attribute under
 * {@code #J_goodsList} is passed to {@code parserProductDetail}. A failure on
 * one product is printed and does not stop the remaining products.
 *
 * @param indexHtml HTML of a search-result page; {@code null} is ignored
 */
private static void getSearchResultInfo(String indexHtml) {
    if (indexHtml == null) {
        return;
    }
    // Locate the product list items.
    Elements productItems = Jsoup.parse(indexHtml).select("#J_goodsList li[data-pid]");
    for (Element item : productItems) {
        String pid = item.attr("data-pid");
        // Visit the detail page of each product in turn and parse its data.
        try {
            parserProductDetail(pid);
        } catch (Exception e) {
            System.out.println("商品url访问失败!   " + pid + e);
        }
    }
}

/**
 * Issues an HTTP GET for the given URL and returns the response body decoded
 * as UTF-8.
 *
 * <p>The HTTP client and the response are both closed before this method
 * returns, so no connection is left open after each call.
 *
 * @param indexUrl the URL to download
 * @return the response body when the status code is 200, otherwise {@code null}
 * @throws IOException if the request fails or the body cannot be read
 * @throws ClientProtocolException if an HTTP protocol error occurs
 */
private static String getHtml(String indexUrl) throws IOException, ClientProtocolException {
    // Wrap the URL in a GET request.
    HttpGet indexHttpGet = new HttpGet(indexUrl);
    // try-with-resources closes the response first, then the client, on every exit path.
    try (CloseableHttpClient indexHttpClient = HttpClients.createDefault();
            CloseableHttpResponse indexRes = indexHttpClient.execute(indexHttpGet)) {
        if (200 == indexRes.getStatusLine().getStatusCode()) {
            // The entity must be consumed here, before the response is closed.
            return EntityUtils.toString(indexRes.getEntity(), Charset.forName("utf-8"));
        }
        return null;
    }
}

/**
 * Downloads a product detail page, extracts the product data, fills in the
 * price from the separate price endpoint and prints the resulting product.
 *
 * <p>Nothing is printed when the detail page does not answer with status 200.
 * When the price cannot be obtained the product is still printed, with its
 * price left unset.
 *
 * @param pId product id, used to build both the detail URL
 *            ({@code https://item.jd.com/<pId>.html}) and the price URL
 * @throws Exception if an HTTP request fails or the detail page cannot be parsed
 */
private static void parserProductDetail(String pId) throws Exception {
    // Example: https://item.jd.com/3367822.html
    String pUrl = "https://item.jd.com/" + pId + ".html";

    // A single client serves both requests and is closed when the method exits.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        String detailHtml;
        try (CloseableHttpResponse detailRes = httpClient.execute(new HttpGet(pUrl))) {
            if (200 != detailRes.getStatusLine().getStatusCode()) {
                return;
            }
            // Read the body before the response is closed.
            detailHtml = EntityUtils.toString(detailRes.getEntity(), Charset.forName("utf-8"));
        }

        // Parse the document and extract the fields available in the HTML.
        Document detailDoc = Jsoup.parse(detailHtml);
        Product product = getProductInfo(detailDoc);
        product.setId(pId);
        product.setUrl(pUrl);

        // The price is not in the page HTML; it comes from a separate JSON endpoint.
        String price = fetchPrice(httpClient, pId);
        if (price != null) {
            product.setPrice(price);
        }

        System.out.println(product);
    }
}

/**
 * Requests the price endpoint for one product and returns the value stored
 * under the {@code "op"} key of the first result entry.
 *
 * @param httpClient an open client used to send the request
 * @param pId product id
 * @return the price text, or {@code null} when the status is not 200 or the
 *         payload holds no usable entry
 * @throws IOException if the request fails or the body cannot be read
 */
private static String fetchPrice(CloseableHttpClient httpClient, String pId) throws IOException {
    String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + pId;
    try (CloseableHttpResponse priceRes = httpClient.execute(new HttpGet(priceUrl))) {
        if (200 != priceRes.getStatusLine().getStatusCode()) {
            return null;
        }
        String priceJson = EntityUtils.toString(priceRes.getEntity(), Charset.forName("utf-8"));
        return extractPrice(priceJson);
    }
}

/**
 * Pulls the {@code "op"} value out of a price payload shaped like
 * {@code [{"op":"1999.00","m":"3000.00","id":"J_3367822","p":"1999.00"}]}.
 *
 * @param priceJson raw JSON text returned by the price endpoint
 * @return the {@code "op"} value as text, or {@code null} when the payload is
 *         empty, is not a list of objects, or has no {@code "op"} entry
 */
private static String extractPrice(String priceJson) {
    // Gson turns the JSON array into a list whose elements are maps.
    ArrayList<?> resultList = new Gson().fromJson(priceJson, ArrayList.class);
    // Guard against an empty body (Gson returns null) and an empty array.
    if (resultList == null || resultList.isEmpty()) {
        return null;
    }
    Object first = resultList.get(0);
    if (!(first instanceof Map)) {
        return null;
    }
    Object op = ((Map<?, ?>) first).get("op");
    return op == null ? null : String.valueOf(op);
}

private static Product getProductInfo(Document detailDoc) {

*//**

 * private String name; private String title; private String price;

 * private String maidian; private String pinpai; private String

 * xinghao;

 *//*

Product product = new Product();

//获取商品名称

String name = detailDoc.select("[class=parameter2 p-parameter-list] li").get(0).text();

product.setName(name);

//获取标题

String title = detailDoc.select(".sku-name").get(0).text();

product.setTitle(title);

//获取卖点信息

// String maidian = detailDoc.select("#p-ad").get(0).attr("title");

// product.setMaidian(maidian);

return product;

}

}

猜你喜欢

转载自blog.csdn.net/huarui0820/article/details/78816852