【Java】基于jsoup爬虫实现(从智联获取工作信息)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/hj7jay/article/details/84381973

这几天在学习用Java解析XML,突然想到DOM能不能用来解析HTML,结果试了半天行不通。于是查了一些资料,发现很多人都在用Jsoup解析HTML文件。研究了一下之后,我写了一个简单的实例,感觉还有很多地方需要完善,在这里分享出来,欢迎交流指教!后续想通过Java把数据导入到Excel或者生成一个报表。

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches job postings from the Zhaopin (智联招聘) search-result pages and prints them to stdout.
 *
 * <p>The search URL prefix is fixed (changing it is not recommended); the city to search in and
 * the search keywords are supplied through the constructor.
 */
public class JsoupHtml {

    /** Zhaopin search endpoint; the city value is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

    /** Number of result pages requested by {@link #getZhiLianWork()}. */
    private static final int PAGE_COUNT = 10;

    /** Separator printed between the columns of one posting. */
    private static final String SEPARATOR = "*****";

    /** City to search in. */
    private final String city;

    /** Search keywords. */
    private final String keywords;

    /**
     * Creates a scraper for the given search.
     *
     * @param city city to search in, must not be {@code null}
     * @param keywords search keywords, must not be {@code null}
     * @throws NullPointerException if either argument is {@code null}
     */
    public JsoupHtml(String city, String keywords) {
        if (city == null) {
            throw new NullPointerException("city");
        }
        if (keywords == null) {
            throw new NullPointerException("keywords");
        }
        this.city = city;
        this.keywords = keywords;
    }

    /**
     * Downloads the first {@value #PAGE_COUNT} result pages and prints every posting found on them.
     *
     * <p>A page that does not contain the expected result table is reported on stderr and skipped.
     * An I/O failure aborts the remaining pages and its stack trace is printed.
     */
    public void getZhiLianWork() {
        try {
            // Query values may contain non-ASCII text (e.g. Chinese city names), so encode them.
            // UnsupportedEncodingException is an IOException and is handled by the catch below.
            String encodedCity = java.net.URLEncoder.encode(city, "UTF-8");
            String encodedKeywords = java.net.URLEncoder.encode(keywords, "UTF-8");

            for (int page = 1; page <= PAGE_COUNT; page++) {
                System.out.println("*********开始遍历第" + page + "页的求职信息*********");
                String pageUrl = SEARCH_URL + encodedCity + "&kw=" + encodedKeywords
                        + "&p=" + page + "&isadv=0";
                Document doc = Jsoup.connect(pageUrl).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    // getElementById returns null when the id is absent (layout change, block page, ...).
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                } else {
                    printPostings(content);
                }
                System.out.println("*********结束遍历第" + page + "页的求职信息*********");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one line per posting found inside {@code content}, followed by a blank line.
     *
     * <p>The five columns are collected by CSS class ({@code zwmc}, {@code gsmc}, {@code zwyx},
     * {@code gzdd}, {@code gxsj}) and matched up by position.
     */
    private static void printPostings(Element content) {
        Elements zwmcEls = content.getElementsByClass("zwmc");
        Elements gsmcEls = content.getElementsByClass("gsmc");
        Elements zwyxEls = content.getElementsByClass("zwyx");
        Elements gzddEls = content.getElementsByClass("gzdd");
        Elements gxsjEls = content.getElementsByClass("gxsj");

        // The lists are indexed in parallel; stop at the shortest one so a missing cell
        // cannot cause an IndexOutOfBoundsException.
        int rows = zwmcEls.size();
        rows = Math.min(rows, gsmcEls.size());
        rows = Math.min(rows, zwyxEls.size());
        rows = Math.min(rows, gzddEls.size());
        rows = Math.min(rows, gxsjEls.size());

        for (int j = 0; j < rows; j++) {
            // text() already returns the combined text of the element and its children;
            // Element.tagName(String) would rename the element instead of selecting a child.
            System.out.println(
                    zwmcEls.get(j).text() + SEPARATOR + gsmcEls.get(j).text()
                    + SEPARATOR + zwyxEls.get(j).text() + SEPARATOR + gzddEls.get(j).text()
                    + SEPARATOR + gxsjEls.get(j).text());
            System.out.println();
        }
    }

    /** Runs a sample search for "java" jobs in Shanghai. */
    public static void main(String[] args) {
        JsoupHtml jHtml = new JsoupHtml("上海", "java");
        jHtml.getZhiLianWork();
    }

}

更新源代码,支持生成html表格:

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
 
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
 * Fetches job postings from the Zhaopin (智联招聘) search-result pages and renders them as rows of
 * an HTML table: the table body with id {@code workinfo} in {@code input.html} is filled in and
 * the resulting document is written to {@code output.html}.
 */
public class JsoupHtml {

    /** Zhaopin search endpoint; the city value is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

    /** Number of result pages to download. */
    private static final int PAGE_COUNT = 10;

    /**
     * Builds the report for a fixed search (city "西安", keywords "java").
     *
     * <p>The generated HTML is printed to stdout and written to {@code output.html} (UTF-8).
     * The output file is only opened once all pages have been fetched, so a failed run leaves an
     * existing {@code output.html} untouched. Any I/O failure prints its stack trace.
     */
    public static void main(String[] args) {
        String city = "西安";
        String keywords = "java";
        try {
            File input = new File("input.html");
            Document report = Jsoup.parse(input, "UTF-8", "");
            Element table = report.getElementById("workinfo");
            if (table == null) {
                throw new IOException("input.html has no element with id \"workinfo\"");
            }
            table.text("");
            appendHeader(table);

            // Query values may contain non-ASCII text (e.g. Chinese city names), so encode them.
            String encodedCity = java.net.URLEncoder.encode(city, "UTF-8");
            String encodedKeywords = java.net.URLEncoder.encode(keywords, "UTF-8");

            // Pages are requested as p=1..PAGE_COUNT so the requested page matches the row label.
            for (int page = 1; page <= PAGE_COUNT; page++) {
                Document doc = Jsoup.connect(
                        SEARCH_URL + encodedCity + "&kw=" + encodedKeywords + "&p=" + page).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    // getElementById returns null when the id is absent; skip such a page.
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                    continue;
                }
                appendRows(table, content, page);
            }

            String html = report.html();
            System.out.println(html);
            try (BufferedWriter bWriter = new BufferedWriter(
                    new OutputStreamWriter(
                            new FileOutputStream("output.html"), "utf-8"))) {
                bWriter.write(html);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Appends the header row (one {@code th} per column) to {@code table}. */
    private static void appendHeader(Element table) {
        Element theader = table.appendElement("tr");
        theader.appendElement("th").text("序号");
        theader.appendElement("th").text("职位名称");
        theader.appendElement("th").text("公司名称");
        theader.appendElement("th").text("职位月薪");
        theader.appendElement("th").text("工作地点");
        theader.appendElement("th").text("发布日期");
    }

    /**
     * Appends one table row per posting found in {@code content}; each row is labelled
     * {@code <page>-<index>}.
     */
    private static void appendRows(Element table, Element content, int page) {
        Elements zwmcEls = content.getElementsByClass("zwmc");
        Elements gsmcEls = content.getElementsByClass("gsmc");
        Elements zwyxEls = content.getElementsByClass("zwyx");
        Elements gzddEls = content.getElementsByClass("gzdd");
        Elements gxsjEls = content.getElementsByClass("gxsj");

        // The lists are indexed in parallel; stop at the shortest one so a missing cell
        // cannot cause an IndexOutOfBoundsException.
        int rows = zwmcEls.size();
        rows = Math.min(rows, gsmcEls.size());
        rows = Math.min(rows, zwyxEls.size());
        rows = Math.min(rows, gzddEls.size());
        rows = Math.min(rows, gxsjEls.size());

        // NOTE(review): index 0 is skipped as in the original code, presumably because the first
        // match is the result table's own header row — confirm against the live page markup.
        for (int i = 1; i < rows; i++) {
            Element tr = table.appendElement("tr");
            tr.appendElement("td").text(page + "-" + i);
            // text() already returns the combined text of the element and its children;
            // Element.tagName(String) would rename the element instead of selecting a child.
            tr.appendElement("td").text(zwmcEls.get(i).text());
            tr.appendElement("td").text(gsmcEls.get(i).text());
            tr.appendElement("td").text(zwyxEls.get(i).text());
            tr.appendElement("td").text(gzddEls.get(i).text());
            tr.appendElement("td").text(gxsjEls.get(i).text());
        }
    }

}

input.html模板(程序读取该模板并填充表格,结果写入output.html):

<!doctype html>
<!-- Report template: the Java program fills the tbody with id "workinfo" with one row per job posting. -->
<html lang="zh-CN">
 <head>
  <meta charset="UTF-8">
  <meta name="Generator" content="EditPlus®">
  <meta name="Author" content="">
  <meta name="Keywords" content="">
  <meta name="Description" content="">
  <title>智联工作信息</title>
  <style>
  body{margin:0;padding:0;}
    .header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
        font-family:"微软雅黑";}
    .body{width:100%;background:#fff;}
    .body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea; border-collapse: collapse; }
    /* Both selectors are scoped to the report table; a bare "td" would match every cell on the page. */
    .body table th,.body table td{min-width:50px;max-width:300px;}
    .footer{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
        font-family:"微软雅黑";}
  </style>
 </head>
 <body>
    <div class="header">智联工作信息</div>
    <div class="body">
        <table class="work" border="1">
            <!-- Looked up by id from the Java code; keep the id unchanged. -->
            <tbody id="workinfo">
            </tbody>
        </table>
    </div>
    <div class="footer">版权所有 翻版必究@2018 Joker</div>
 </body>
</html>

猜你喜欢

转载自blog.csdn.net/hj7jay/article/details/84381973
今日推荐