一个针对 51job、用于过滤培训类招聘信息的 Java 爬虫脚本,供大家参考。

// 使用方法:先在 51job 上完成搜索,再将搜索结果页的 URL 填入 startUrl,用此脚本过滤



package test;


import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.junit.Test;


/**
 * Crawls a 51job search-result listing, drops postings from blacklisted
 * companies or whose detail page mentions a blacklisted keyword, and writes
 * the surviving postings as HTML links to a local file.
 *
 * <p>All state is static and mutable; the class is not thread-safe.
 */
public class demo {

    /** URL of the first 51job search-result page to crawl. */
    public static String startUrl = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=060000%2C00&funtype=0000&industrytype=00&keyword=Java&keywordtype=2&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9";
    /** A listing row whose text contains any of these strings is skipped. */
    public static String[] companys = { "中软","达内" };
    /** A posting whose detail text contains any of these strings is rejected. */
    public static String[] keywords = { "培训", "实训" ,"零基础"};
    /** Accumulates the HTML links of the postings that passed both filters. */
    public static StringBuffer buffer=new StringBuffer();

    /**
     * Entry point: crawls from {@link #startUrl} and saves the result.
     *
     * @param args unused
     * @throws Exception if the result file cannot be written
     */
    public static void main(String[] args) throws Exception {
        test(startUrl);
        saveDetail();
    }

    /**
     * Walks the listing pages starting at {@code url}, filtering every job
     * row and appending the accepted ones to {@link #buffer}.
     *
     * <p>Pages are followed iteratively (not recursively) so that a long
     * result set cannot overflow the call stack. Crawling stops when a page
     * cannot be fetched, when no next-page link is found, or when the
     * next-page link points back at the current page.
     *
     * @param url URL of the first listing page
     * @throws Exception declared for backward compatibility with callers
     */
    public static void test(String url) throws Exception {
        String pageUrl = url;
        while (true) {
            Document listingPage;
            try {
                listingPage = Jsoup.connect(pageUrl).get();
            } catch (Exception e) {
                // Best effort: a page that cannot be fetched ends the crawl,
                // but the reason is reported instead of being swallowed.
                System.err.println("Failed to fetch listing page " + pageUrl + ": " + e);
                return;
            }

            String nextPageUrl = findNextPageUrl(listingPage);
            collectJobs(listingPage);

            // An identical next URL would otherwise loop forever.
            if (nextPageUrl.isEmpty() || nextPageUrl.equals(pageUrl)) {
                System.out.println("浏览结束");
                return;
            }
            pageUrl = nextPageUrl;
        }
    }

    /**
     * Returns the first {@code href} found inside the second element of class
     * {@code bk}, or an empty string when the page has fewer than two such
     * elements or none carries an {@code href}.
     * NOTE(review): presumably the second "bk" element is the "next page"
     * control of the 51job pager - confirm against the live markup.
     */
    private static String findNextPageUrl(Document listingPage) {
        Elements pagerLinks = listingPage.getElementsByClass("bk");
        if (pagerLinks.size() < 2) {
            return "";
        }
        return pagerLinks.get(1).getAllElements().attr("href");
    }

    /** Filters every job row of one listing page and records the accepted ones. */
    private static void collectJobs(Document listingPage) throws Exception {
        Elements jobTables = listingPage.getElementsByClass("dw_table");
        if (jobTables.isEmpty()) {
            return; // layout without a job table: nothing to collect
        }
        Elements jobs = jobTables.get(0).getElementsByClass("el");

        for (Element job : jobs) {
            // The whole row text is matched, so a company name anywhere in
            // the row triggers the company filter.
            String companyName = job.text();
            if (containsAny(companyName, companys)) {
                System.out.println("namepass:" + companyName);
                continue;
            }

            // The job link is the first <a href> inside the first "t1" cell.
            Elements titleCells = job.getElementsByClass("t1");
            if (titleCells.isEmpty()) {
                continue; // row without a title cell cannot be followed
            }
            String jobUrl = titleCells.get(0).getElementsByTag("a").attr("href");

            if (checkDetail(jobUrl)) {
                // Scraped text and URL are untrusted: escape before embedding.
                buffer.append("<a target='_blank' href='")
                        .append(escapeHtml(jobUrl))
                        .append("'>")
                        .append(escapeHtml(companyName))
                        .append("</a><br><br>");
            } else {
                System.out.println("contentpass:" + companyName);
            }
        }
    }

    /**
     * Fetches a job detail page and checks its text against {@link #keywords}.
     *
     * @param jobUrl URL of the job detail page; may be {@code null} or empty
     * @return {@code true} if the page was fetched and the text of its
     *         {@code tCompany_main} elements contains none of the keywords;
     *         {@code false} if the URL is missing, the fetch fails, or a
     *         keyword is found
     * @throws Exception declared for backward compatibility with callers
     */
    public static boolean checkDetail(String jobUrl) throws Exception {
        if (jobUrl == null || jobUrl.isEmpty()) {
            return false;
        }
        Document jobDocument;
        try {
            jobDocument = Jsoup.connect(jobUrl).get();
        } catch (Exception e) {
            // Best effort: an unreachable posting is treated as rejected.
            System.err.println("Failed to fetch job page " + jobUrl + ": " + e);
            return false;
        }
        String detail = jobDocument.getElementsByClass("tCompany_main").text();
        return !containsAny(detail, keywords);
    }

    /**
     * Writes the collected links to {@code d://test.html}.
     *
     * @throws Exception if the file cannot be written
     */
    public static void saveDetail() throws Exception {
        saveDetail("d://test.html");
    }

    /**
     * Writes the collected links to {@code path} as UTF-8, preceded by a
     * charset declaration so browsers decode the Chinese text correctly.
     *
     * @param path destination file; overwritten if it exists
     * @throws Exception if the file cannot be opened or written
     */
    public static void saveDetail(String path) throws Exception {
        // try-with-resources closes the stream even when a write fails.
        try (FileOutputStream fos = new FileOutputStream(path)) {
            fos.write("<meta charset=\"UTF-8\">".getBytes(StandardCharsets.UTF_8));
            fos.write(buffer.toString().getBytes(StandardCharsets.UTF_8));
        }
    }

    /** Returns {@code true} if {@code text} contains at least one of {@code needles}. */
    private static boolean containsAny(String text, String[] needles) {
        for (String needle : needles) {
            if (text.contains(needle)) {
                return true;
            }
        }
        return false;
    }

    /** Escapes the characters that are significant in HTML text and quoted attributes. */
    private static String escapeHtml(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}

猜你喜欢

转载自blog.csdn.net/fw6669998/article/details/54318199