Jsoup web crawler

First, write a simple crawler project: fetch a Baidu search-results page with jsoup and print the link and text of each result.

package com.kgc;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
	public static void main(String[] args) {
		String url = "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=monline_3_dg&wd=jsoup%E8%A7%A3%E6%9E%90html&oq=httpclient4.4.9&rsv_pq=d7f6243e00006886&rsv_t=1c21FPkhF%2BgQg6I4fQ2ZuApWm%2B5jszdGTEjEmVgQAeQV1%2FQcJwcpl1e9fVIk6IexhrHV&rqlang=cn&rsv_enter=1&inputT=5488&rsv_sug3=34&rsv_sug1=35&rsv_sug7=100&rsv_sug2=1&prefixsug=jsoup&rsp=1&rsv_sug4=6912&rsv_sug=1";
		try {
			Document doc = Jsoup.connect(url).get();
			// System.out.println(doc.html());
			Elements es = doc.select("h3.t a");
			for (Element e : es) {
				System.out.println("h3.t a:\n" + e.attr("href") + "\n" + e.text());
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

	}
}
Anti-Spider (anti-crawling)
Why anti-crawling: sites want to stop automated crawlers from scraping their content and putting extra load on their servers.
Anti-crawling strategies
1. Limit by IP
2. Limit by User-Agent
3. A combination of both


User-Agent restrictions
Concept: the server inspects the User-Agent request header and rejects requests that do not carry a browser-like value.


Anti-anti-spider (counter-anti-crawling) strategy
1. Set the User-Agent
2. Dynamic proxy IP: use a different IP for each request

Setting the User-Agent with jsoup
1. Syntax: Jsoup.connect(url).header("User-Agent", "Mozilla/5.0 ...."); (see the sketch after this list)

Setting the proxy IP with jsoup
1. Set the IP on the connection
2. Ways to get an IP:
   (1) A free proxy IP library
   (2) A paid IP proxy
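
As mentioned above, a minimal sketch of setting the User-Agent with jsoup; the target URL and the browser string are only placeholders:

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class UserAgentDemo {
	public static void main(String[] args) throws IOException {
		// Pretend to be a normal desktop browser by sending a browser-like User-Agent header
		Document doc = Jsoup.connect("https://www.baidu.com")
				.header("User-Agent",
						"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36")
				.get();
		// jsoup also offers a shorthand for the same header: .userAgent("Mozilla/5.0 ...")
		System.out.println(doc.title());
	}
}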

Example of a paid dynamic-proxy-IP vendor: http://www.goubanjia.com/buy/dynamic.html

Counter-anti-crawl example code

import java.io.IOException;

import org.jsoup.Jsoup;

/* Fetch a proxy ip:port from a proxy provider */
public class ProxyIPUtils {
	public static ProxyIp getProxyIp() {
		String url = "<address to get ip>";
		ProxyIp p = new ProxyIp();
		try {
			// The response body has the form "ip:port": the part before the colon is the ip,
			// the part after the colon is the port
			String ipStr = Jsoup.connect(url).get().text();
			String ip = ipStr.split(":")[0];
			String port = ipStr.split(":")[1];
			p.setIp(ip);
			p.setPort(Integer.parseInt(port));
		} catch (IOException e) {
			// If there is an exception, e.g. the ip endpoint was requested too frequently,
			// sleep for 2 seconds and then call the method recursively:
			// Thread.sleep(2000);
			// return getProxyIp();
		}
		return p;
	}
}
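
The ProxyIp entity used above is not shown in the original; a minimal sketch of what it presumably looks like, with field and accessor names inferred from the calls above:

/* Simple value object holding one proxy address */
public class ProxyIp {
	private String ip;
	private int port;

	public String getIp() {
		return ip;
	}

	public void setIp(String ip) {
		this.ip = ip;
	}

	public int getPort() {
		return port;
	}

	public void setPort(int port) {
		this.port = port;
	}
}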
import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Fetches a Document with a legitimate User-Agent through a dynamic proxy IP.
 */
public class DocumentUtil {
	/**
	 * @param url the address to be crawled
	 */
	public static Document getDocument(String url) {
		Connection connection = Jsoup.connect(url);
		String userAgent = "";// put a legitimate User-Agent here; different browsers send different values
		connection.userAgent(userAgent);
		// connection.header(...) can set any additional request headers

		// Set the dynamic proxy ip and fetch the page
		Document document = setProxyIp(connection);
		return document;
	}

	public static Document setProxyIp(Connection connection) {
		Document document = null;
		try {
			ProxyIp proxyIp = ProxyIPUtils.getProxyIp();
			connection.proxy(proxyIp.getIp(), proxyIp.getPort());
			document = connection.get();
		} catch (IOException e) {
			// An exception should not stop the crawl: catch it, sleep for a few seconds,
			// then recurse with a fresh proxy ip
			try {
				Thread.sleep(5000);
				return setProxyIp(connection);
			} catch (InterruptedException ie) {
				ie.printStackTrace();
			}
			e.printStackTrace();
		}
		return document;
	}
}
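
A minimal usage sketch of the two utilities above; the target URL is just an example, and it assumes the proxy endpoint in ProxyIPUtils returns a working ip:port pair:

import org.jsoup.nodes.Document;

public class CrawlDemo {
	public static void main(String[] args) {
		// Fetch the page with a spoofed User-Agent and a dynamic proxy IP
		Document doc = DocumentUtil.getDocument("https://www.baidu.com");
		if (doc != null) {
			System.out.println(doc.title());
		}
	}
}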

Tokenizer (word segmentation)



1. Split the recruitment information into individual words
2. Filter out invalid words (stop words); extension words can also be defined
3. Count how often each valid word occurs

Commonly used tokenizer: IKAnalyzer
1. Introduce the dependency (see the sketch below)
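
The dependency itself was shown as an image in the original; a commonly used Maven coordinate is sketched below. The exact groupId/artifactId/version are an assumption and should be verified against your repository:

<!-- IK Analyzer (assumed coordinates; check your Maven repository) -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>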
2. Configure IKAnalyzer.cfg.xml (a typical configuration is sketched below)
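
A typical IKAnalyzer.cfg.xml placed on the classpath looks roughly like this; the dictionary file names (ext.dic, stopword.dic) are our own choice:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
	<comment>IK Analyzer extension configuration</comment>
	<!-- extension dictionary: extra words that should be kept as single tokens -->
	<entry key="ext_dict">ext.dic</entry>
	<!-- stop-word dictionary: words to throw away -->
	<entry key="ext_stopwords">stopword.dic</entry>
</properties>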


3. Set stop words and extension words: the dictionary files referenced from IKAnalyzer.cfg.xml are plain UTF-8 text files with one word per line.

@Controller
@RequestMapping("/ik")
public class IKController{
	@Resource
	private RecruitService rService;

	@Resource
	private KeywordService kService;
	
	@RequestMapping("/ikDataDeal")
	@ResponseBody
	public String ikDataDeal() throws Exception{
		// Get all the recruitment information
		Map<String,Object> map = new HashMap<String,Object>();
		map.put("datatype", 1);// data-type flag of the recruitment records
		List<RecruitVo> list = rService.getReByMap(map);
		// Traverse the recruitment information and run word segmentation on every record
		for(int i = 0; i < list.size(); i++){
			String jobDesc = list.get(i).getJobDescription();
			Integer rid = list.get(i).getId();
			// Use the IKUtils class below to segment the job description
			Set<String> set = IKUtils.getKeyWord(jobDesc);
			
			for(String word : set){
				Keyword keyword = new Keyword();// entity class mapped to the keyword table
				keyword.setRid(rid);
				keyword.setName(word);// one word produced by the segmentation
				keyword.setStatus(0);// 0: not deleted; 1: deleted
				keyword.setDataType(1);
				kService.kdkk(keyword);// save the keyword
			}
		}
		return "ok";
	}
}




import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKUtils {
	/**
	* @param jobDesc: job description
	* @return the words after segmentation; using a Set also de-duplicates them
	*/
	public static Set<String> getKeyWord(String jobDesc) throws IOException{
		// ext.dic: extension dictionary (extra words you want kept as tokens)
		// stopword.dic: stop-word dictionary (words you don't care about)
		Set<String> set = new HashSet<String>();// a Set does not allow duplicates
		String jobStr = jobDesc.trim();
		StringReader reader = new StringReader(jobStr);
		IKSegmenter ikSegmenter = new IKSegmenter(reader, true);// true = smart segmentation mode
		Lexeme lexeme = null;// one word produced by the analyzer
		while((lexeme = ikSegmenter.next()) != null){
			String job = lexeme.getLexemeText().trim();
			set.add(job);
		}
		return set;
	}
}
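
A small usage sketch of IKUtils that also covers step 3 of the workflow, counting how often each valid word occurs; the sample job description is made up:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class IKUtilsDemo {
	public static void main(String[] args) throws Exception {
		String jobDesc = "熟悉 Java、Spring MVC，了解 jsoup 爬虫和 IKAnalyzer 分词";
		Map<String, Integer> counts = new HashMap<String, Integer>();

		// Segment the description; the returned Set is already de-duplicated per description
		Set<String> words = IKUtils.getKeyWord(jobDesc);
		for (String word : words) {
			// Accumulate occurrence counts (across many descriptions in the real job)
			Integer old = counts.get(word);
			counts.put(word, old == null ? 1 : old + 1);
		}
		System.out.println(counts);
	}
}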
