简单JAVA爬虫51Jobs

使用Jsoup工具，它是一个HTML解析器，可以直接解析某个地址或者HTML文件，还可通过DOM、CSS以及类似jQuery的操作方法操作数据。
Jsoup官方文档地址：https://jsoup.org/cookbook/introduction/parsing-a-document
注意：出现乱码时，需要查看网页的编码方式，并使用该编码方式解码。使用表单传输中文数据时，有些网站需要先进行URL编码才能正常传输中文=。=
主要代码如下：
package com.galoliy.spider.maven_spider.domain;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small Jsoup-based scraper that submits a keyword search through the form
 * found on a start page, caches the returned result page on disk and writes a
 * plain-text summary of the job listings extracted from it.
 */
public class Cat5jobs {

    /** Line separator used in the text report. */
    private static final String LINE_BREAK = "\r\n";

    /** Charset name used when reading the cached result page back from disk. */
    private static final String CACHE_CHARSET = "utf-8";

    /** Extracted content of one result page. */
    private static final class ScreenResult {
        private final String keyword;
        private final String totalQueryData;
        private final List<Map<String, String>> recruitments;

        ScreenResult(String keyword, String totalQueryData, List<Map<String, String>> recruitments) {
            this.keyword = keyword;
            this.totalQueryData = totalQueryData;
            this.recruitments = recruitments;
        }
    }

    /**
     * Loads the page at {@code url}, reads the {@code action} of its form and
     * POSTs {@code keyword} to that address, reusing the cookies of the first
     * response.
     *
     * @param url     address of the page that contains the search form
     * @param keyword search keyword; it is URL-encoded as GBK before being sent
     * @return the parsed search result page, or {@code null} when the form
     *         action is missing or any request fails
     * @throws UnsupportedEncodingException if the GBK charset is not available
     */
    public Document getResultPage(String url, String keyword) throws UnsupportedEncodingException {
        // The keyword is pre-encoded as GBK, as in the original implementation.
        // NOTE(review): Jsoup appears to URL-encode form data itself when sending;
        // confirm the target site really expects this pre-encoded value, or consider
        // Connection.postDataCharset("gbk") with the raw keyword instead.
        String encodedKeyword = URLEncoder.encode(keyword, "gbk");

        try {
            // Fetch the start page; its cookies are forwarded with the search request.
            Response indexResponse = Jsoup.connect(url).method(Method.GET).execute();
            Document indexPage = indexResponse.parse();

            // The search request goes to the address named by the form's action.
            String actionPath = indexPage.select("form").attr("action");
            if (actionPath.isEmpty()) {
                System.err.println("No search form action found on " + url);
                return null;
            }

            Connection search = Jsoup.connect(actionPath)
                    .data("keyword", encodedKeyword)
                    .userAgent("Mozilla")
                    .cookies(indexResponse.cookies())
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(300000);

            return search.method(Method.POST).execute().parse();
        } catch (IOException e) {
            // Report the failure and return null so the start page is never
            // mistaken for a result page by the caller.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Produces the text report for {@code keyword}.
     *
     * <p>The result page is read from {@code dir + fileName + ".htm"} when that
     * file exists and is non-empty; otherwise it is fetched via
     * {@link #getResultPage(String, String)} and written to that file. The
     * extracted listings are printed to standard output and written to
     * {@code dir + fileName + "2.txt"}. Failures are reported on the console;
     * nothing is thrown for I/O errors.
     *
     * @param url      address of the page that contains the search form
     * @param keyword  search keyword
     * @param dir      directory prefix, concatenated as-is with {@code fileName}
     * @param fileName base name of the cache and report files
     */
    public void getResult(String url, String keyword, String dir, String fileName) {
        String htmlFilePath = dir + fileName + ".htm";
        String txtFilePath = dir + fileName + "2.txt";

        Document doc = loadCachedPage(htmlFilePath);
        if (doc == null) {
            doc = fetchAndCachePage(url, keyword, htmlFilePath);
        }
        if (doc == null) {
            // Neither the cache nor the network produced a page; nothing to report.
            System.err.println("No result page available for keyword: " + keyword);
            return;
        }

        try {
            saveScreen(screen51Jobs(doc), txtFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses the cached result page, or returns {@code null} when there is no usable cache file. */
    private Document loadCachedPage(String htmlFilePath) {
        File cached = new File(htmlFilePath);
        if (!cached.isFile()) {
            System.out.println("file not found");
            return null;
        }
        if (cached.length() == 0) {
            // An empty file holds no page; treat it as a cache miss.
            System.out.println("cached file is empty");
            return null;
        }
        try {
            Document doc = Jsoup.parse(cached, CACHE_CHARSET);
            if (!doc.children().isEmpty()) {
                System.out.println("File not empty");
            }
            return doc;
        } catch (IOException e) {
            System.out.println("file not found");
            return null;
        }
    }

    /** Fetches the result page and stores it at {@code htmlFilePath}; returns {@code null} if the fetch failed. */
    private Document fetchAndCachePage(String url, String keyword, String htmlFilePath) {
        Document doc = null;
        try {
            doc = this.getResultPage(url, keyword);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        if (doc == null) {
            return null;
        }

        try {
            saveFile(doc.toString(), htmlFilePath);
        } catch (IOException e) {
            // Caching is best-effort: the fetched page is still usable.
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Formats the extracted data as text, prints it to standard output and
     * writes it to {@code path}.
     *
     * @throws IOException if the report file cannot be written
     */
    private void saveScreen(ScreenResult screen, String path) throws IOException {
        StringBuilder report = new StringBuilder();
        report.append(LINE_BREAK).append(" KeyWord:").append(screen.keyword)
                .append(LINE_BREAK).append(LINE_BREAK)
                .append(" Total query data:").append(screen.totalQueryData)
                .append(LINE_BREAK).append(LINE_BREAK)
                .append(" Recruitment info:");

        for (Map<String, String> recruitment : screen.recruitments) {
            for (Map.Entry<String, String> field : recruitment.entrySet()) {
                report.append(LINE_BREAK)
                        .append(field.getKey()).append(" == ").append(field.getValue());
            }
            report.append(LINE_BREAK);
        }

        String text = report.toString();
        System.out.println(text);
        saveFile(text, path);
    }

    /**
     * Extracts the keyword box text, the total-count text and one field map
     * per listing row from the result page.
     */
    private ScreenResult screen51Jobs(Document doc) {
        Elements resultList = doc.select("div[class=dw_table]div[id=resultList]");
        Elements findKeyword = resultList.select("div[class=sbox]");
        Elements totalQueryData = resultList.select("div[class=rt]div:matchesOwn(^共)");
        Elements recruitmentInfo = resultList.select("div[class=el]");

        List<Map<String, String>> recruitments = new ArrayList<Map<String, String>>();
        for (Element row : recruitmentInfo) {
            Map<String, String> fields = new HashMap<String, String>();
            fields.put("position", row.select("p[class~=^t1]").text());
            fields.put("href", row.select("a").attr("href"));
            // NOTE(review): this takes the text of every anchor in the row, not only
            // the company link — confirm against the page markup.
            fields.put("corporatename", row.select("a").text());
            fields.put("address", row.select("span[class=t3]").text());
            fields.put("salary", row.select("span[class=t4]").text());
            fields.put("releasedate", row.select("span[class=t5]").text());
            recruitments.add(fields);
        }

        return new ScreenResult(findKeyword.text(), totalQueryData.text(), recruitments);
    }

    /**
     * Writes {@code src} to {@code path} as UTF-8, creating or overwriting the file.
     *
     * @throws IOException if the file cannot be opened or written
     */
    private void saveFile(String src, String path) throws IOException {
        // try-with-resources closes the stream, which also flushes the buffer;
        // without it, small payloads would stay in the buffer and never reach disk.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(path))) {
            out.write(src.getBytes(StandardCharsets.UTF_8));
        }
    }
}
猜你喜欢