First, write a simple crawler project
package com.kgc;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
    public static void main(String[] args) {
        String url = "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=monline_3_dg&wd=jsoup%E8%A7%A3%E6%9E%90html&oq=httpclient4.4.9&rsv_pq=d7f6243e00006886&rsv_t=1c21FPkhF%2BgQg6I4fQ2ZuApWm%2B5jszdGTEjEmVgQAeQV1%2FQcJwcpl1e9fVIk6IexhrHV&rqlang=cn&rsv_enter=1&inputT=5488&rsv_sug3=34&rsv_sug1=35&rsv_sug7=100&rsv_sug2=1&prefixsug=jsoup&rsp=1&rsv_sug4=6912&rsv_sug=1";
        try {
            Document doc = Jsoup.connect(url).get();
            // System.out.println(doc.html());
            // Baidu search results: each hit is an <h3 class="t"> wrapping a link
            Elements es = doc.select("h3.t a");
            for (Element e : es) {
                System.out.println("h3.t a:\n" + e.attr("href") + "\n" + e.text());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
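The example above assumes jsoup is already on the classpath. A minimal Maven dependency sketch (the version shown is an assumption; use whichever release your project pins):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>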
Anti-Spider (anti-crawling)
Why anti-crawling
Anti-crawling strategies
1. Limit IP
2. Limit User Agent
3. Combine the two (limit IP and User Agent together)
User Agent restrictions
Concept: the server inspects the User-Agent request header and blocks requests that do not look like a real browser.
Anti-anti-crawling strategies (how the crawler gets around the restrictions)
1. Set User Agent
2. Dynamic proxy IP. Concept: send requests from different IPs, combined with a set User Agent
Using Jsoup for this during the crawl:

Set the User Agent
1. Syntax: Jsoup.connect(url).header("User-Agent", "Mozilla/5.0 ....");

Set the proxy IP
1. Set the IP on the connection
2. Ways to get an IP:
(1). Free proxy IP libraries
(2). Paid IP proxies, e.g.:
http://www.goubanjia.com/buy/dynamic.html
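A minimal sketch putting both counter-measures on one request. The class name, proxy host, and port are placeholders, and the User-Agent string is truncated exactly as in these notes:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class AntiAntiSpiderSketch {
    public static void main(String[] args) throws Exception {
        String url = "https://www.baidu.com";
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 ....")   // a legal browser User-Agent
                .proxy("127.0.0.1", 8888)        // placeholder proxy host/port
                .get();
        System.out.println(doc.title());
    }
}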
Anti-anti-crawling in code
import java.io.IOException;

import org.jsoup.Jsoup;

/* Get a proxy ip */
public class ProxyIpUtils {
    public static ProxyIp getProxyIp() {
        String url = "<address to get ip>";
        ProxyIp p = new ProxyIp();
        try {
            // the provider returns "ip:port" as plain text
            String ipStr = Jsoup.connect(url).get().text();
            String ip = ipStr.split(":")[0];   // before the colon: the ip
            String port = ipStr.split(":")[1]; // after the colon: the port
            p.setIp(ip);
            p.setPort(Integer.parseInt(port));
        } catch (IOException e) {
            // If there is an exception, such as requesting an ip too frequently,
            // sleep for 2 seconds and then call the method recursively.
            try {
                Thread.sleep(2000);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
            }
            return getProxyIp();
        }
        return p;
    }
}
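The ProxyIp entity is used throughout but never shown in these notes; a minimal sketch of what it presumably looks like:

public class ProxyIp {
    private String ip; // proxy host
    private int port;  // proxy port

    public String getIp() { return ip; }
    public void setIp(String ip) { this.ip = ip; }
    public int getPort() { return port; }
    public void setPort(int port) { this.port = port; }
}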
import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class DocumentUtil {
    /** url: the address to be crawled */
    public static Document getDocument(String url) {
        Connection connection = Jsoup.connect(url);
        String userAgent = ""; // set a legal User Agent; different browsers have different userAgent values
        connection.userAgent(userAgent);
        // connection.header(...); // other headers could be set the same way
        // set the dynamic proxy ip
        return setProxyIp(connection);
    }

    public static Document setProxyIp(Connection connection) {
        Document document = null;
        try {
            ProxyIp proxyIp = ProxyIpUtils.getProxyIp();
            connection.proxy(proxyIp.getIp(), proxyIp.getPort());
            document = connection.get();
        } catch (IOException e) {
            // If an exception occurs we still want to keep crawling, so catch it,
            // sleep for a few seconds, then recurse with a fresh proxy ip.
            e.printStackTrace();
            try {
                Thread.sleep(5000);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
            }
            return setProxyIp(connection);
        }
        return document;
    }
}
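A hypothetical usage sketch of the two utilities together (the class name CrawlerMain and the URL are made up for illustration):

import org.jsoup.nodes.Document;

public class CrawlerMain {
    public static void main(String[] args) {
        // getDocument sets the User Agent and a dynamic proxy ip internally
        Document doc = DocumentUtil.getDocument("https://www.baidu.com");
        if (doc != null) {
            System.out.println(doc.title());
        }
    }
}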
Tokenizer (word segmentation)
1. Split the text into individual words
2. Filter out invalid words
3. Count the probability of occurrence of the valid words

Concepts: invalid-word filtering (stop words) and expansion-word definition.

Common tokenizer: IKAnalyzer
1. Introduce the dependency
2. Configure IKAnalyzer.cfg.xml
3. Set stop words and expansion words (a setup sketch follows this list)
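A sketch of the IKAnalyzer setup. The Maven coordinate below is one commonly republished artifact for IKAnalyzer, and the dictionary file names (extword.dic, stopword.dic) are assumptions taken from the comments later in these notes; adjust both to whatever your project actually uses.

<!-- assumed coordinate; IKAnalyzer exists under several republished artifacts -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>

IKAnalyzer.cfg.xml (on the classpath):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- expansion word dictionary, one word per line -->
    <entry key="ext_dict">extword.dic;</entry>
    <!-- stop word dictionary, one word per line -->
    <entry key="ext_stopwords">stopword.dic;</entry>
</properties>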
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.annotation.Resource;

import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

@Controller
@RequestMapping("/ik")
public class IKController {

    @Resource
    private RecruitService rService;

    @Resource
    private KeywordService keywordService;

    @RequestMapping("/ikDataDeal")
    @ResponseBody
    public String ikDataDeal() throws Exception {
        // Get all the recruitment (job) information
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("datatype", 1); // data type 1: recruitment data
        List<RecruitVo> list = rService.getReByMap(map);
        // Traverse the recruitment information and run word segmentation on every job description
        for (int i = 0; i < list.size(); i++) {
            String jobDesc = list.get(i).getJobDescription();
            Integer rid = list.get(i).getId();
            // use the IKUtils class written below for the word segmentation
            Set<String> set = IKUtils.getKeyWord(jobDesc);
            for (String word : set) {
                Keyword keyword = new Keyword(); // entity class mapped to the keyword table
                keyword.setRid(rid);
                keyword.setName(word);   // one word from the segmentation
                keyword.setStatus(0);    // 0: not deleted; 1: deleted
                keyword.setDataType(1);
                keywordService.kdkk(keyword); // storage operation (insert the keyword)
            }
        }
        return "success";
    }
}
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKUtils {

    /**
     * @param jobDesc the job description
     * @return the set of words after segmentation; the set also de-duplicates
     *
     * extword.dic: expansion word dictionary
     * stopword.dic: stop word dictionary -- words you do not care about
     */
    public static Set<String> getKeyWord(String jobDesc) throws Exception {
        Set<String> set = new HashSet<String>(); // no duplicates allowed in a set
        String jobStr = jobDesc.trim();
        StringReader reader = new StringReader(jobStr);
        // useSmart = true: smart (coarse-grained) segmentation
        IKSegmenter ikSegmenter = new IKSegmenter(reader, true);
        Lexeme lexeme = null; // one analyzed word
        while ((lexeme = ikSegmenter.next()) != null) {
            String job = lexeme.getLexemeText().trim();
            set.add(job);
        }
        return set;
    }
}
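A quick, hypothetical check of IKUtils (the class name IKUtilsTest and the sample sentence are made up for illustration):

import java.util.Set;

public class IKUtilsTest {
    public static void main(String[] args) throws Exception {
        // segment an arbitrary sample description and print the unique words
        Set<String> words = IKUtils.getKeyWord("熟悉Java和jsoup，负责爬虫程序的开发");
        for (String w : words) {
            System.out.println(w);
        }
    }
}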