Case 8: Scraping Job Listings with a Web Crawler

Write a crawler that collects data from a source.

Use jsoup to scrape the relevant job postings from a recruitment site (51job).
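The examples below assume jsoup is on the classpath. A minimal Maven dependency; the version shown is an assumption from around the time of writing, and any recent release should work:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>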


Implementation code:

1. JobBean.java

public class JobBean {
	private String jobName;
	private String comName;
	private String addr;
	private String salary;
	private String date;
	public String getJobName() {
		return jobName;
	}
	public void setJobName(String jobName) {
		this.jobName = jobName;
	}
	public String getComName() {
		return comName;
	}
	public void setComName(String comName) {
		this.comName = comName;
	}
	public String getAddr() {
		return addr;
	}
	public void setAddr(String addr) {
		this.addr = addr;
	}
	public String getSalary() {
		return salary;
	}
	public void setSalary(String salary) {
		this.salary = salary;
	}
	public String getDate() {
		return date;
	}
	public void setDate(String date) {
		this.date = date;
	}
	@Override
	public String toString() {
		return "JobBean [jobName=" + jobName + ", comName=" + comName + ", addr=" + addr + ", salary=" + salary
				+ ", date=" + date + "]";
	}
	public void set(String jobName, String comName, String addr, String salary, String date) {
		this.jobName = jobName;
		this.comName = comName;
		this.addr = addr;
		this.salary = salary;
		this.date = date;
	}
}
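JobBean is a plain data holder for one job posting; set(...) fills all five fields in one call, and toString() renders them for output. A quick usage sketch (the field values are made up for illustration):

JobBean job = new JobBean();
// hypothetical values, matching the fields scraped later from each listing row
job.set("Big Data Engineer", "Example Corp", "Beijing", "15-25k/month", "05-10");
System.out.println(job);
// JobBean [jobName=Big Data Engineer, comName=Example Corp, addr=Beijing, salary=15-25k/month, date=05-10]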
2. Page.java
import org.jsoup.nodes.Document;

public class Page {
	private Document document;
	private String nextPageUrl;
	private boolean hasNextPage;
	
	public String getNextPageUrl() {
		return nextPageUrl;
	}
	public void setNextPageUrl(String nextPageUrl) {
		this.nextPageUrl = nextPageUrl;
	}
	public boolean isHasNextPage() {
		return hasNextPage;
	}
	public void setHasNextPage(boolean hasNextPage) {
		this.hasNextPage = hasNextPage;
	}
	public Document getDocument() {
		return document;
	}
	public void setDocument(Document document) {
		this.document = document;
	}
	@Override
	public String toString() {
		return "Page [document=" + document + ", nextPageUrl=" + nextPageUrl + ", hasNextPage=" + hasNextPage + "]";
	}
}
3. TestJsoup.java
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TestJsoup {
	public static void main(String[] args) throws Exception {
		Document document = getDocumentByUrl();
		getValue(document);
		// sample anchor from the results page:
		// <a target="_blank" title="北京拓尔思信息技术股份有限公司"
		// href="https://jobs.51job.com/all/co2706575.html">北京拓尔思信息技术股份有限公司</a>
	}

	/**
	 * Gets specific values: 1) the text between a tag pair, 2) a tag's attribute value.
	 * 
	 * @param document
	 */
	public static void getValue(Document document) {
		Elements select = document.select("#resultList .el a");
		for (Element element : select) {
			// element.text() is the text between the opening and closing tags
			// System.out.println(element.text());
			String attr = element.attr("href"); // the href attribute value
			System.out.println(attr);
		}
	}

	/**
	 * There are three ways to look up data: 1) by id, 2) by class, 3) by tag name.
	 * 
	 * select() takes a CSS selector, which can express all three forms directly.
	 * 
	 * @param document
	 */
	public static void getElements(Document document) {
		// Look up by id: an id is unique, so this returns one specific element
		Element elementById = document.getElementById("resultList");
		// System.out.println(elementById);
		// Look up by class
		Elements elementsByClass = document.getElementsByClass("el");
		/*
		 * for (Element element : elementsByClass) {
		 *     System.out.println(element);
		 *     System.out.println("------------------");
		 * }
		 */
		// Look up by tag name
		Elements elementsByTag = document.getElementsByTag("a");
		/*
		 * for (Element element : elementsByTag) {
		 *     System.out.println(element);
		 *     System.out.println("------------------");
		 * }
		 */
		// CSS selector: covers all three lookup forms in one call
		Elements select = document.select("#resultList .el");
		for (Element element : select) {
			System.out.println(element);
			System.out.println("-----------");
		}
	}

	/**
	 * Gets data through parent/child and sibling elements.
	 * 
	 * @param document
	 */
	public static void getElementsByC(Document document) {
		Elements select = document.select("#resultList .el span");
		for (Element element : select) {
			Elements children = element.children();
			Element parent = element.parent();
			// all sibling elements
			element.siblingElements();
			// the next sibling element
			element.nextElementSibling();
			// the previous sibling element
			element.previousElementSibling();

			for (Element element2 : children) {
				System.out.println(element2);
			}
		}
	}

	/**
	 * Fetches and parses an entire page from a URL.
	 * 
	 * @return
	 * @throws MalformedURLException
	 * @throws IOException
	 */
	private static Document getDocumentByUrl() throws MalformedURLException, IOException {
		// first argument: the URL; second: the timeout in milliseconds
		String urlStr = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
		URL url = new URL(urlStr);
		// the whole fetched page, parsed into a Document
		Document document = Jsoup.parse(url, 4000);
		// System.out.println(document);
		return document;
	}

}
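To experiment with the selection API without touching the network, jsoup can also parse an HTML string directly. A minimal self-contained sketch; the fragment below is made up to mimic one row of the 51job result list:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class TestJsoupOffline {
	public static void main(String[] args) {
		// hypothetical fragment shaped like the markup TestJsoup selects against
		String html = "<div id=\"resultList\">"
				+ "<div class=\"el\"><p class=\"t1\"><a href=\"https://jobs.51job.com/demo/1.html\">Big Data Engineer</a></p></div>"
				+ "</div>";
		Document document = Jsoup.parse(html);
		for (Element a : document.select("#resultList .el a")) {
			System.out.println(a.text());       // text between the tag pair
			System.out.println(a.attr("href")); // the attribute value
		}
	}
}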

4. TestMain.java

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class TestMain {
	public static void main(String[] args) throws Exception {
		// URL of the last results page (page 145), kept for reference; not used below
		String usrEnd = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,145.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
		String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
		Document document = getDocumentByUrl(url);
		Page page = new Page();
		page.setDocument(document);
		int sum = 0;
		BufferedWriter bw = new BufferedWriter(new FileWriter("D:/大数据.txt"));
		while (true) {
			sum++;
			List<JobBean> jobsByPage = getJobsByPage(page);
			getNextPageUrl(page);
			for (JobBean jobBean : jobsByPage) {
				// System.out.println(jobBean);
				bw.write(jobBean.toString());
				bw.newLine();
			}
			bw.flush();
			System.out.println("——————————————" + sum + "——————————————");
			if (page.isHasNextPage()) {
				document = getDocumentByUrl(page.getNextPageUrl());
				page.setDocument(document);
			} else {
				break;
			}
			// Thread.sleep(1000); // optional politeness delay between pages
		}
		bw.close();
	}
	public static void getNextPageUrl(Page page) {
		Document document = page.getDocument();
		// the result pager renders .bk blocks; the code assumes the second one
		// is the "next page" control
		Elements select = document.select(".bk");
		// an anchor inside that block means another results page exists
		Elements select2 = select.get(1).select("a");
		if (select2 != null && select2.size() > 0) {
			Element element = select2.get(0);
			String url = element.attr("href");
			page.setNextPageUrl(url);
			page.setHasNextPage(true);
		} else {
			page.setHasNextPage(false);
		}
	}

	private static Document getDocumentByUrl(String url) {
		URL u;
		try {
			u = new URL(url);
			Document document = Jsoup.parse(u, 4000);
			return document;
		} catch (Exception e) {
			e.printStackTrace();
		}
		return null; // callers should null-check before using the result
	}

	private static List<JobBean> getJobsByPage(Page page) {
		List<JobBean> list = new ArrayList<>();
		Document document = page.getDocument();
		Elements select = document.select("#resultList .el");
		// the first .el row is the table header, not a job posting
		select.remove(0);
		for (Element element : select) {
			String jobName = element.select(".t1 a").get(0).text();
			String comName = element.select(".t2 a").get(0).attr("title");
			String addr = element.select(".t3").get(0).text();
			String salary = element.select(".t4").get(0).text();
			String date = element.select(".t5").get(0).text();
			JobBean jobBean = new JobBean();
			jobBean.set(jobName, comName, addr, salary, date);
			list.add(jobBean);
		}
		return list;
	}
}
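getDocumentByUrl uses Jsoup.parse(URL, timeout), which sends Java's default user agent; some job sites reject such requests. A sketch of a more robust fetch using jsoup's connect API (the user-agent string is only an example):

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class Fetcher {
	// Fetches and parses a page with an explicit user agent and timeout.
	// Returns null on failure, matching the contract of getDocumentByUrl above.
	static Document fetch(String url) {
		try {
			return Jsoup.connect(url)
					.userAgent("Mozilla/5.0") // example UA string; adjust as needed
					.timeout(4000)            // milliseconds
					.get();
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}
}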




Reposted from blog.csdn.net/a331685690/article/details/80281801