Java crawler

This crawler uses Apache HttpClient to fetch web pages and jsoup to parse them. It was developed in IntelliJ IDEA on top of the Spring Boot framework.

The source code is adapted from the Gecco crawler framework found on the Internet; if you run into a problem, you can look up the Gecco framework on GitHub.

 

1.

The Requestor class encapsulates fetching the web page response object.

 

package com.example.demo.httpclient;

import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Thin wrapper around an Apache {@link CloseableHttpClient} that sends GET and
 * form-encoded POST requests with browser-like headers.
 *
 * <p>The responses returned by {@link #doGet(String)} and
 * {@link #doPost(String, Map)} are not consumed or closed here; the caller is
 * responsible for reading the entity.
 *
 * <p>NOTE(review): the {@link #context} and {@link #cookieStore} built by
 * {@link #setContext()} and {@link #setCookieStore(HttpResponse)} are stored on
 * this object but are never passed to {@code client.execute(...)} in this class,
 * so they do not influence the requests sent by {@code doGet}/{@code doPost}.
 * Confirm whether that is intended before wiring them in.
 */
public class Requestor {

    protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
    protected static final String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    protected static final String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

    protected CookieStore cookieStore;
    protected HttpClientContext context;
    protected CloseableHttpClient client;

    /** Creates a requestor backed by a default HttpClient instance. */
    public Requestor() {
        client = HttpClients.createDefault();
    }

    /**
     * Posts the given form parameters to the login URL and prints the response.
     *
     * @param loginUrl URL the form is posted to
     * @param params   form fields to send
     */
    public void doLogin(String loginUrl, Map<String, String> params) {
        HttpResponse httpResponse = doPost(loginUrl, params);
        printResponse(httpResponse);
    }

    /**
     * Prints the status line, all headers and the body of a response to standard output.
     *
     * @param httpResponse response to print; {@code null} (which {@code doGet}/{@code doPost}
     *                     return on an I/O failure) is ignored
     */
    public void printResponse(HttpResponse httpResponse) {
        if (httpResponse == null) {
            // The failed request has already been reported by doGet/doPost.
            return;
        }
        // Response status
        System.out.println("status:" + httpResponse.getStatusLine());
        System.out.println("headers:");
        for (Header header : httpResponse.getAllHeaders()) {
            System.out.println("\t" + header);
        }
        // Response message entity; may be absent
        HttpEntity entity = httpResponse.getEntity();
        if (entity == null) {
            return;
        }
        try {
            // Use the charset declared by the server; fall back to UTF-8 instead of
            // the library default (ISO-8859-1) when none is declared.
            String responseString = EntityUtils.toString(entity, "UTF-8");
            System.out.println("response length:" + responseString.length());
            System.out.println("response content:" + responseString.replace("\r\n", ""));
        } catch (org.apache.http.ParseException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a fresh {@link HttpClientContext} with the default cookie spec registered
     * and the current {@link #cookieStore} attached, and stores it in {@link #context}.
     */
    public void setContext() {
        context = HttpClientContext.create();
        // The original registered CookieSpecs.DEFAULT twice; one registration is enough.
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                .build();
        context.setCookieSpecRegistry(registry);
        context.setCookieStore(cookieStore);
    }

    /**
     * Replaces {@link #cookieStore} with a new store and, if the response carries a
     * {@code Set-Cookie} header, adds an {@code oscid} cookie for {@code .oschina.net}
     * whose value is the value of the last such header.
     *
     * <p>NOTE(review): the raw header value is stored as the cookie value, including
     * any attributes that follow the first {@code ;} — confirm this is what the server expects.
     *
     * @param httpResponse response whose {@code Set-Cookie} headers are read
     */
    public void setCookieStore(HttpResponse httpResponse) {
        cookieStore = new BasicCookieStore();
        Header[] headers = httpResponse.getHeaders("Set-Cookie");

        String cookieValue = null;
        for (Header header : headers) {
            System.out.println(header.getName() + ":" + header.getValue());
            cookieValue = header.getValue();
        }
        if (cookieValue == null) {
            // No Set-Cookie header: do not store a cookie with a null value.
            return;
        }
        // Create a new cookie
        BasicClientCookie cookie = new BasicClientCookie("oscid", cookieValue);
        cookie.setDomain(".oschina.net");
        cookie.setPath("/");
        cookieStore.addCookie(cookie);
    }

    /**
     * Converts a parameter map into a list of name/value pairs in the map's iteration order.
     *
     * @param parameterMap form fields; {@code null} is treated as empty
     * @return a new mutable list with one pair per map entry
     */
    public List<NameValuePair> getParam(Map<String, String> parameterMap) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (parameterMap == null) {
            return nameValuePairs;
        }
        for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        return nameValuePairs;
    }

    /**
     * Sends a GET request to the given URL.
     *
     * @param url target URL
     * @return the response, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed)
     */
    public HttpResponse doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        config(httpGet);
        try {
            return client.execute(httpGet);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a UTF-8 form-encoded POST request to the given URL.
     *
     * @param url    target URL
     * @param params form fields to send
     * @return the response, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed)
     */
    public HttpResponse doPost(String url, Map<String, String> params) {
        HttpPost httpPost = new HttpPost(url);
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(getParam(params), "UTF-8"));
            config(httpPost);
            return client.execute(httpPost);
        } catch (IOException e) {
            // UnsupportedEncodingException and ClientProtocolException are both
            // IOException subclasses and were handled identically.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Applies the shared request headers and timeout settings to a request.
     *
     * @param httpRequestBase request to configure
     */
    protected void config(HttpRequestBase httpRequestBase) {
        httpRequestBase.setHeader("User-Agent", USER_AGENT);
        httpRequestBase.setHeader("Accept", Accept);
        httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
        httpRequestBase.setHeader("Referer", "https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F");
        // Timeout settings for the request (all three values are in milliseconds)
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(300000)
                .setConnectTimeout(300000)
                .setSocketTimeout(300000)
                .build();
        httpRequestBase.setConfig(requestConfig);
    }

}

2.
AbstractClient class
package com.example.demo.httpclient;

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
import org.apache.http.message.BasicNameValuePair;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Base HTTP client wrapper around an Apache {@link CloseableHttpClient} that sends
 * GET and form-encoded POST requests with browser-like headers and 3-second timeouts.
 *
 * <p>The responses returned by {@link #doGet(String)} and
 * {@link #doPost(String, Map)} are not consumed or closed here; the caller is
 * responsible for reading the entity.
 *
 * <p>NOTE(review): the {@link #context} and {@link #cookieStore} built by
 * {@link #setContext()} and {@link #setCookieStore(HttpResponse)} are stored on
 * this object but are never passed to {@code client.execute(...)} in this class,
 * so they do not influence the requests sent by {@code doGet}/{@code doPost}.
 * Confirm whether that is intended before wiring them in.
 */
public class AbstractClient {

    protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
    protected static final String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    protected static final String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

    protected CookieStore cookieStore;
    protected HttpClientContext context;
    protected CloseableHttpClient client;

    /** Creates a client wrapper backed by a default HttpClient instance. */
    public AbstractClient() {
        client = HttpClients.createDefault();
    }

    /**
     * Creates a fresh {@link HttpClientContext} with the default cookie spec registered
     * and the current {@link #cookieStore} attached, and stores it in {@link #context}.
     */
    public void setContext() {
        context = HttpClientContext.create();
        // The original registered CookieSpecs.DEFAULT twice; one registration is enough.
        Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                .build();
        context.setCookieSpecRegistry(registry);
        context.setCookieStore(cookieStore);
    }

    /**
     * Replaces {@link #cookieStore} with a new store and, if the response carries a
     * {@code Set-Cookie} header, adds an {@code oscid} cookie for {@code .oschina.net}
     * whose value is the value of the last such header.
     *
     * <p>NOTE(review): the raw header value is stored as the cookie value, including
     * any attributes that follow the first {@code ;} — confirm this is what the server expects.
     *
     * @param httpResponse response whose {@code Set-Cookie} headers are read
     */
    public void setCookieStore(HttpResponse httpResponse) {
        cookieStore = new BasicCookieStore();
        Header[] headers = httpResponse.getHeaders("Set-Cookie");

        String cookieValue = null;
        for (Header header : headers) {
            System.out.println(header.getName() + ":" + header.getValue());
            cookieValue = header.getValue();
        }
        if (cookieValue == null) {
            // No Set-Cookie header: do not store a cookie with a null value.
            return;
        }
        // Create a new cookie
        BasicClientCookie cookie = new BasicClientCookie("oscid", cookieValue);
        cookie.setDomain(".oschina.net");
        cookie.setPath("/");
        cookieStore.addCookie(cookie);
    }

    /**
     * Converts a parameter map into a list of name/value pairs in the map's iteration order.
     *
     * @param parameterMap form fields; {@code null} is treated as empty
     * @return a new mutable list with one pair per map entry
     */
    public List<NameValuePair> getParam(Map<String, String> parameterMap) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (parameterMap == null) {
            return nameValuePairs;
        }
        for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        return nameValuePairs;
    }

    /**
     * Sends a GET request to the given URL.
     *
     * @param url target URL
     * @return the response, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed)
     */
    public HttpResponse doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        config(httpGet);
        try {
            return client.execute(httpGet);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a UTF-8 form-encoded POST request to the given URL.
     *
     * @param url    target URL
     * @param params form fields to send
     * @return the response, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed)
     */
    public HttpResponse doPost(String url, Map<String, String> params) {
        HttpPost httpPost = new HttpPost(url);
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(getParam(params), "UTF-8"));
            config(httpPost);
            return client.execute(httpPost);
        } catch (IOException e) {
            // UnsupportedEncodingException and ClientProtocolException are both
            // IOException subclasses and were handled identically.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Applies the shared request headers and timeout settings to a request.
     *
     * @param httpRequestBase request to configure
     */
    protected void config(HttpRequestBase httpRequestBase) {
        httpRequestBase.setHeader("User-Agent", USER_AGENT);
        httpRequestBase.setHeader("Accept", Accept);
        httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
        // Timeout settings for the request (all three values are in milliseconds)
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(3000)
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .build();
        httpRequestBase.setConfig(requestConfig);
    }

}

3. Usage -- an example class (CarGet) that uses the Requestor
package com.example.demo.getpage;

import com.example.demo.entity.CarBrand;
import com.example.demo.entity.CarDemio;
import com.example.demo.entity.CarVehicle;
import com.alibaba.fastjson.JSONArray;
import com.example.demo.httpclient.Requestor;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Collects car brands, series and vehicle models from che300.com.
 *
 * <p>Brands are scraped from the home page with jsoup; series and models are
 * fetched as JSON arrays through a {@link Requestor} and mapped with fastjson.
 * Every method is best-effort: on a failure the problem is printed and whatever
 * was collected so far (possibly an empty list) is returned.
 */
public class CarGet {

    private Requestor requestor = new Requestor();

    /**
     * Gets the car brands listed on the che300.com home page.
     *
     * @return the brands found; empty if the page could not be loaded or the
     *         brand container is missing
     */
    public List<CarBrand> getCarBrands() {
        List<CarBrand> carBrands = new ArrayList<CarBrand>();
        try {
            Document allDocument = Jsoup.connect("https://www.che300.com/?from=bd_seo&city=11").get();
            Element brandContainer = allDocument
                    .getElementsByAttributeValue("class", "ucarselecttype_pinpaibottom_ul brand")
                    .first();
            if (brandContainer == null) {
                // Page layout changed or the container is absent: nothing to read.
                return carBrands;
            }
            for (Element e : brandContainer.getElementsByTag("p")) {
                // Entries whose id is a single letter A-Z are excluded
                if (!isIndexLetter(e.id())) {
                    CarBrand cb = new CarBrand();
                    cb.setSeries_brand(e.id());
                    cb.setBrand_name(e.html());
                    cb.setRel(e.attr("rel"));
                    carBrands.add(cb);
                }
            }
        } catch (IOException | RuntimeException e) {
            // The original returned from a finally block, which silently discarded
            // every runtime exception; keep the best-effort result but report the failure.
            e.printStackTrace();
        }
        return carBrands;
    }

    /**
     * Gets the series of every given brand.
     *
     * @param carBrands brands whose series are requested
     * @return the series of all brands, concatenated in brand order
     */
    public List<CarDemio> getCarDemio(List<CarBrand> carBrands) {
        List<CarDemio> carDemios = new ArrayList<CarDemio>();
        for (CarBrand cb : carBrands) {
            // Interface address of the series list for one brand
            String url = "https://ssl-meta.che300.com/meta/series/series_brand" + cb.getSeries_brand() + ".json?v=159";
            carDemios.addAll(fetchJsonList(url, CarDemio.class));
        }
        return carDemios;
    }

    /**
     * Gets the vehicle models of every given series.
     *
     * @param carDemios series whose models are requested
     * @return the models of all series, concatenated in series order
     */
    public List<CarVehicle> getCarVehicles(List<CarDemio> carDemios) {
        List<CarVehicle> carVehicles = new ArrayList<CarVehicle>();
        for (CarDemio cd : carDemios) {
            String url = "https://ssl-meta.che300.com/meta/model/model_series" + cd.getSeries_id() + ".json?v=159";
            carVehicles.addAll(fetchJsonList(url, CarVehicle.class));
        }
        return carVehicles;
    }

    /** Returns whether the id is exactly one upper-case letter A-Z. */
    private static boolean isIndexLetter(String id) {
        return id != null && id.length() == 1 && id.charAt(0) >= 'A' && id.charAt(0) <= 'Z';
    }

    /**
     * Fetches a URL and maps the JSON array in its body to a list of {@code type}.
     *
     * @return the mapped elements; empty if the request failed, the body is empty,
     *         or the body could not be parsed
     */
    private <T> List<T> fetchJsonList(String url, Class<T> type) {
        List<T> result = new ArrayList<T>();
        try {
            HttpResponse response = requestor.doGet(url);
            if (response == null) {
                // doGet returns null after an I/O failure.
                return result;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return result;
            }
            // Fall back to UTF-8 (not the library default ISO-8859-1) when the
            // server declares no charset.
            String body = EntityUtils.toString(entity, "UTF-8");
            JSONArray array = JSONArray.parseArray(body);
            if (array != null) {
                result = array.toJavaList(type);
            }
        } catch (IOException | RuntimeException e) {
            // Covers I/O failures and malformed JSON; report instead of silently dropping.
            e.printStackTrace();
        }
        return result;
    }
}

4. Main usage (calling the Requestor from a JUnit test)
private Requestor requestor = new Requestor();

/**
 * Requests {@code testUrl} through the requestor, parses the response body as a
 * JSON array of {@code CarDemio} entries and prints each entry to standard output.
 * NOTE(review): {@code testUrl} is declared outside this snippet.
 *
 * @throws Exception if reading the response body fails
 */
@Test
public void testVisitBlog() throws Exception {
    String json = EntityUtils.toString(requestor.doGet(testUrl).getEntity());
    List<CarDemio> carDemioList = JSONArray.parseArray(json).toJavaList(CarDemio.class);
    for (int i = 0; i < carDemioList.size(); i++) {
        System.out.println(carDemioList.get(i));
    }
}
Console output:

CarDemio{series_id='2476', series_group_name='Zhidou Electric Vehicle', series_name='Zhidou', is_green='1'}
CarDemio{series_id='2477', series_group_name='Zhidou Electric Vehicle', series_name=' Zhidou D1', is_green='1'}
CarDemio{series_id='2478', series_group_name='Zhidou Electric Vehicle', series_name='Zhidou D2', is_green='1'}
CarDemio{series_id='33135', series_group_name='Zhidou Electric Vehicle', series_name='Zhidou D3', is_green='1'}

 

5. JAR dependencies (Maven)

<!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>

<dependency>
   <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>







Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325648069&siteId=291194637