使用Jsoup工具,它是一个HTML解析器,可以直接解析某个URL地址或者HTML文件。还可通过DOM、CSS以及类似jQuery的操作方法来操作数据。
Jsoup官方文档地址:https://jsoup.org/cookbook/introduction/parsing-a-document
注意:出现乱码时,需要查看网页的编码方式,并使用该编码方式解码。使用表单传输中文数据时,有些网站需要先进行URL编码才能正常传输中文。
主要代码如下:
package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small Jsoup-based scraper: submits a keyword through the search form found on an index page,
 * caches the returned result page as an HTML file, and writes the extracted listing fields to a
 * text file.
 *
 * <p>NOTE(review): the selectors used for extraction look tailored to one specific site
 * (presumably 51job, judging by the helper name) - confirm against the live markup.
 */
public class Cat5jobs {

    /** Line separator used in the generated text report. */
    private static final String LINE_BREAK = "\r\n";

    /**
     * Loads the index page at {@code url}, then posts {@code keyword} to the search form found on
     * it and returns the parsed response.
     *
     * @param url     address of the index page that contains the search form
     * @param keyword search keyword; it is URL-encoded as GBK before being sent
     * @return the parsed result page, or {@code null} when either HTTP request fails with an
     *         {@link IOException} (the stack trace is printed)
     * @throws UnsupportedEncodingException if the "gbk" charset is not available
     */
    public Document getResultPage(String url, String keyword) throws UnsupportedEncodingException {
        // The keyword is URL-encoded as GBK before it is posted; per the original author,
        // the keyword arrives garbled on the server side without this conversion.
        String encodedKeyword = URLEncoder.encode(keyword, "gbk");

        try {
            // Get the index page.
            Response indexResponse = Jsoup.connect(url).method(Method.GET).execute();
            Document indexPage = indexResponse.parse();

            // Get the link the search form submits to. Prefer the absolute form of the URL so
            // that a relative action still works; fall back to the raw attribute otherwise.
            Elements forms = indexPage.select("form");
            String actionPath = forms.attr("abs:action");
            if (actionPath.length() == 0) {
                actionPath = forms.attr("action");
            }

            Connection search = Jsoup.connect(actionPath)
                    .data("keyword", encodedKeyword)
                    .userAgent("Mozilla")
                    .cookies(indexResponse.cookies())
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(300000);

            // Get the query results page.
            return search.method(Method.POST).execute().parse();
        } catch (IOException e) {
            e.printStackTrace();
            // Never hand back the index page as if it were the result page.
            return null;
        }
    }

    /**
     * Produces the text report for {@code keyword}.
     *
     * <p>The result page is read from {@code dir + fileName + ".htm"} when that file exists and
     * is not empty; otherwise it is fetched through {@link #getResultPage(String, String)} and
     * written to that file. The extracted data is written to {@code dir + fileName + "2.txt"}.
     * I/O failures are reported on the console and do not propagate.
     *
     * @param url      address of the index page that contains the search form
     * @param keyword  search keyword
     * @param dir      directory prefix, concatenated as-is with {@code fileName}
     * @param fileName base name (without extension) of the cache and report files
     */
    public void getResult(String url, String keyword, String dir, String fileName) {
        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";

        Document doc = loadCachedPage(htmlFilePath);
        if (doc == null) {
            System.out.println("file not found");
            doc = fetchAndCachePage(url, keyword, htmlFilePath);
        }
        if (doc == null) {
            // Nothing to extract; the underlying failure has already been reported.
            System.err.println("No result page available for keyword: " + keyword);
            return;
        }

        try {
            saveScreen(screen51Jobs(doc), txtFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses the cached HTML file, or returns {@code null} if it is missing, empty or unreadable. */
    private Document loadCachedPage(String htmlFilePath) {
        File cached = new File(htmlFilePath);
        if (!cached.isFile() || cached.length() == 0) {
            return null;
        }
        try {
            Document doc = Jsoup.parse(cached, "utf-8");
            if (!doc.children().isEmpty()) {
                System.out.println("File not empty");
            }
            return doc;
        } catch (IOException ignored) {
            // An unreadable cache is treated like a missing one: the caller fetches a fresh page.
            return null;
        }
    }

    /** Fetches the result page from the web and, on success, stores it as the HTML cache file. */
    private Document fetchAndCachePage(String url, String keyword, String htmlFilePath) {
        Document doc = null;
        try {
            // Get the query results page from the web address.
            doc = getResultPage(url, keyword);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (doc == null) {
            return null;
        }

        try {
            saveFile(doc.toString(), htmlFilePath);
        } catch (IOException e) {
            // Caching is best effort; the freshly fetched page is still usable.
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Formats the extracted data, prints it to standard output and writes it to {@code path}.
     *
     * @throws IOException if the report file cannot be written
     */
    private void saveScreen(SearchResult screen, String path) throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append(LINE_BREAK).append(" KeyWord:").append(screen.keyword)
                .append(LINE_BREAK).append(LINE_BREAK)
                .append(" Total query data:").append(screen.totalQueryData)
                .append(LINE_BREAK).append(LINE_BREAK)
                .append(" Recruitment info:");

        for (Map<String, String> recruitment : screen.recruitmentList) {
            for (Map.Entry<String, String> entry : recruitment.entrySet()) {
                sb.append(LINE_BREAK).append(entry.getKey()).append(" == ").append(entry.getValue());
            }
            sb.append(LINE_BREAK);
        }

        String report = sb.toString();
        System.out.println(report);
        saveFile(report, path);
    }

    /**
     * Extracts the keyword box text, the total-count text and one field map per listing row from
     * the result page.
     */
    private SearchResult screen51Jobs(Document doc) {
        Elements resultList = doc.select("div[class=dw_table]div[id=resultList]");
        String keyword = resultList.select("div[class=sbox]").text();
        String totalQueryData = resultList.select("div[class=rt]div:matchesOwn(^共)").text();

        List<Map<String, String>> recruitmentList = new ArrayList<Map<String, String>>();
        for (Element row : resultList.select("div[class=el]")) {
            Map<String, String> info = new HashMap<String, String>();
            info.put("position", row.select("p[class~=^t1]").text());
            info.put("href", row.select("a").attr("href"));
            // NOTE(review): this joins the text of every link in the row, not only the
            // company link - confirm against the page markup whether that is intended.
            info.put("corporatename", row.select("a").text());
            info.put("address", row.select("span[class=t3]").text());
            info.put("salary", row.select("span[class=t4]").text());
            info.put("releasedate", row.select("span[class=t5]").text());
            recruitmentList.add(info);
        }
        return new SearchResult(keyword, totalQueryData, recruitmentList);
    }

    /**
     * Writes {@code src} to {@code path} as UTF-8, replacing any existing content.
     *
     * @throws IOException if the file cannot be opened or written
     */
    private void saveFile(String src, String path) throws IOException {
        byte[] bytes = src.getBytes("utf-8");
        OutputStream out = new BufferedOutputStream(new FileOutputStream(path));
        try {
            out.write(bytes, 0, bytes.length);
            out.flush();
        } finally {
            // Closing is required: without it buffered bytes are lost and the handle leaks.
            out.close();
        }
    }

    /** Values extracted from one result page. */
    private static final class SearchResult {
        private final String keyword;
        private final String totalQueryData;
        private final List<Map<String, String>> recruitmentList;

        SearchResult(String keyword, String totalQueryData,
                List<Map<String, String>> recruitmentList) {
            this.keyword = keyword;
            this.totalQueryData = totalQueryData;
            this.recruitmentList = recruitmentList;
        }
    }
}