Java crawler + HTML web page parsing

Java crawler + HTML web page parsing

Special thanks: the "Kuangshen Shuo Java" (狂神说Java) video course: https://www.bilibili.com/video/BV17a4y1x7zq?p=17

1. In a Spring Boot project, add the jsoup dependency

<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.10.2</version>
		</dependency>

2. Prepare the object that will hold the parsed data

Content.java

package com.asia.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder for one product entry extracted from a search-result page.
 *
 * <p>All members are generated by Lombok: {@code @Data} supplies getters, setters,
 * {@code equals}, {@code hashCode} and {@code toString}; {@code @NoArgsConstructor}
 * supplies an empty constructor; {@code @AllArgsConstructor} supplies a constructor
 * whose parameters follow the field declaration order below
 * ({@code title}, {@code img}, {@code price}).
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Content {
	/** Product title text. */
	private String title;
	/** Product image reference, stored as a plain string. */
	private String img;
	/** Product price, kept as the text shown on the page rather than a number. */
	private String price;
}

3. Crawler utility class

HtmlParseUtil.java

package com.asia.utils;

import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.asia.pojo.Content;

/**
 * Utility that downloads a JD.com search-result page with jsoup and extracts the
 * products listed on it.
 */
public class HtmlParseUtil {

	/** Search endpoint; the URL-encoded keyword is appended to this prefix. */
	private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

	/** Character set used to percent-encode the keyword and announced via {@code enc}. */
	private static final String KEYWORD_CHARSET = "UTF-8";

	/** Timeout, in milliseconds, handed to jsoup for fetching the page. */
	private static final int TIMEOUT_MILLIS = 30000;

	/**
	 * Demo entry point: searches for a sample keyword and prints every result.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws Exception {
		new HtmlParseUtil().parseJD("西瓜").forEach(System.out::println);
	}

	/**
	 * Fetches the search-result page for the given keyword and converts every
	 * {@code <li>} inside the {@code J_goodsList} container into a {@link Content}.
	 *
	 * @param keywords raw (not yet URL-encoded) search keyword
	 * @return a mutable list with one entry per list item; empty when the page has
	 *         no {@code J_goodsList} element
	 * @throws Exception if the URL is malformed or the page cannot be downloaded
	 */
	public List<Content> parseJD(String keywords) throws Exception {
		// The keyword goes INTO a query string, so it must be percent-encoded
		// (not decoded): this keeps non-ASCII text and characters such as
		// '%', '+', '&' or spaces from corrupting the request.
		// NOTE(review): "enc=utf-8" declares the keyword encoding to the server;
		// confirm it is still honoured if the site changes.
		String url = SEARCH_URL + URLEncoder.encode(keywords, KEYWORD_CHARSET) + "&enc=utf-8";

		// Parse the web page. (The Document returned by jsoup corresponds to the
		// browser's Document object.)
		Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

		List<Content> list = new ArrayList<>();

		// getElementById returns null when the container is absent (e.g. a redirect
		// or a changed layout); report "no results" instead of failing with an NPE.
		Element goodsList = document.getElementById("J_goodsList");
		if (goodsList == null) {
			return list;
		}

		Elements items = goodsList.getElementsByTag("li");
		for (Element el : items) {
			// Only the first matching element of each kind is read.
			// The image address is taken from the "data-lazy-img" attribute.
			String src = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
			String price = el.getElementsByClass("p-price").eq(0).text();
			String name = el.getElementsByClass("p-name").eq(0).text();
			list.add(new Content(name, src, price));
		}
		return list;
	}
}

Running result:
[Image: screenshot of the program output]

Guess you like

Source: blog.csdn.net/Asia1752/article/details/111468162