编写一个爬虫采集数据源
使用jsoup 爬取招聘网站相应的招聘信息
实现代码:
1. JobBean.java
/**
 * Value object for one scraped job posting. All fields are kept as the raw
 * strings lifted from the page: job title, company name, work location,
 * salary text and posting date.
 */
public class JobBean {

    private String jobName;
    private String comName;
    private String addr;
    private String salary;
    private String date;

    /** Convenience mutator that fills every field in a single call. */
    public void set(String jobName, String comName, String addr, String salary, String date) {
        this.jobName = jobName;
        this.comName = comName;
        this.addr = addr;
        this.salary = salary;
        this.date = date;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getComName() {
        return comName;
    }

    public void setComName(String comName) {
        this.comName = comName;
    }

    public String getAddr() {
        return addr;
    }

    public void setAddr(String addr) {
        this.addr = addr;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    /** One-line rendering of every field; this exact format is what the crawler writes to disk. */
    @Override
    public String toString() {
        return "JobBean [jobName=" + jobName + ", comName=" + comName + ", addr=" + addr
                + ", salary=" + salary + ", date=" + date + "]";
    }
}
// 2. Page.java
import org.jsoup.nodes.Document; public class Page { private Document document; private String nextPageUrl; private boolean hasNextPage; public String getNextPageUrl() { return nextPageUrl; } public void setNextPageUrl(String nextPageUrl) { this.nextPageUrl = nextPageUrl; } public boolean isHasNextPage() { return hasNextPage; } public void setHasNextPage(boolean hasNextPage) { this.hasNextPage = hasNextPage; } public Document getDocument() { return document; } public void setDocument(Document document) { this.document = document; } @Override public String toString() { return "Page [document=" + document + ", nextPageUrl=" + nextPageUrl + ", hasNextPage=" + hasNextPage + "]"; } }3.TestJsoup.java
import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class TestJsoup { public static void main(String[] args) throws Exception { Document document = getDocumentByUrl(); getValue(document); // <a target="_blank" title="北京拓尔思信息技术股份有限公司" // href="https://jobs.51job.com/all/co2706575.html">北京拓尔思信息技术股份有限公司</a> } /** * 获取具体的值 1:标签对之间的值 2:标签的属性值 * * @param document */ public static void getValue(Document document) { Elements select = document.select("#resultList .el a"); for (Element element : select) { // 标签对中间的值 // System.out.println(element.text()); String attr = element.attr("href"); System.out.println(attr); } } /** * 查找具体的数据有三种方式 1:通过id查找数据 2:通过class查找数据 3:通过标签查找数据(tag) * * select里面直接传递就是上面的三种形式 * * @param document */ public static void getElements(Document document) { // 通过id查找所需要的数据 id是唯一的,所找到的数据是单个的,也是具体的 Element elementById = document.getElementById("resultList"); // System.out.println(elementById); // 通过class查找数据 Elements elementsByClass = document.getElementsByClass("el"); /* * for (Element element : elementsByClass) { * System.out.println(element); * System.out.println("------------------"); } */ // 通过标签查找数据 Elements elementsByTag = document.getElementsByTag("a"); /* * for (Element element : elementsByTag) { System.out.println(element); * System.out.println("------------------"); } */ // css选择器 Elements select = document.select("#resultList .el"); for (Element element : select) { System.out.println(element); System.out.println("-----------"); } } /** * 通过父子标签和兄弟标签的到数据 * * @param document */ public static void getElementsByC(Document document) { Elements select = document.select("#resultList .el span"); for (Element element : select) { Elements children = element.children(); Element parent = element.parent(); // 得到所有的兄弟标签 element.siblingElements(); // 得到下面的兄弟标签 element.nextSibling(); // 得到前面的兄弟标签 
element.previousElementSibling(); for (Element element2 : children) { System.out.println(element2); } } } /** * 通过url爬取整个网站的信息 * * @return * @throws MalformedURLException * @throws IOException */ private static Document getDocumentByUrl() throws MalformedURLException, IOException { // 第一个是url 第二个超时时间 String urlStr = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="; URL url = new URL(urlStr); // 返回的整个爬取的数据 Document document = Jsoup.parse(url, 4000); // System.out.println(document); return document; } }
4. TestMain.java
import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileWriter; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class TestMain { public static void main(String[] args) throws Exception { String usrEnd = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,145.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="; String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="; Document document = getDocumentByUrl(url); Page page = new Page(); page.setDocument(document); int sum = 0; BufferedWriter bw = new BufferedWriter(new FileWriter("D:/大数据.txt")); while (true) { sum++; List<JobBean> jobsByPage = getJobsByPage(page); getNextPageUrl(page); for (JobBean jobBean : jobsByPage) { //System.out.println(jobBean); if(jobBean.toString()!=null){ bw.write(jobBean.toString()); bw.newLine(); bw.flush(); }else{ continue; } } System.out.println("——————————————" + sum + "——————————————"); if (page.isHasNextPage()) { document = getDocumentByUrl(page.getNextPageUrl()); page.setDocument(document); } else { break; } // Thread.sleep(1000); } } public static void getNextPageUrl(Page page) { Document document = page.getDocument(); Elements select = document.select(".bk"); Elements select2 = select.get(1).select("a"); if (select2 != null && select2.size() > 0) { Element element = select2.get(0); String url = 
element.attr("href"); page.setNextPageUrl(url); page.setHasNextPage(true); } else { page.setHasNextPage(false); } } private static Document getDocumentByUrl(String url) { URL u; try { u = new URL(url); Document document = Jsoup.parse(u, 4000); return document; } catch (Exception e) { e.printStackTrace(); } return null; } private static List<JobBean> getJobsByPage(Page page) { List<JobBean> list = new ArrayList<>(); Document document = page.getDocument(); Elements select = document.select("#resultList .el"); select.remove(0); for (Element element : select) { String jobName = element.select(".t1 a").get(0).text(); String comName = element.select(".t2 a").get(0).attr("title"); String addr = element.select(".t3").get(0).text(); String salary = element.select(".t4").get(0).text(); String date = element.select(".t5").get(0).text(); JobBean jobBean = new JobBean(); jobBean.set(jobName, comName, addr, salary, date); list.add(jobBean); } return list; } }