idea爬虫爬取招聘信息,大数据

某工厂来学校培训大数据爬虫,先提供了一个网页,页面代码如下:
在这里插入图片描述
在这里插入图片描述

<%@ page language="java" import="java.util.*" pageEncoding="utf-8"%>
<html>
<head>
    <link rel="stylesheet" href="layui/css/layui.css">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>大数据-烟台徐老师</title>

    <script  src="js/jquery-3.2.1.min.js"></script>
    <script  src="layui/layui.js"></script>
    <script  src="js/indexAjax.js"></script>
    <script  src="js/indexAjax2.js"></script>
    <script  src="js/indexAjax3.js"></script>
    <script  src="js/indexAjax4.js"></script>
    <script  src="js/indexAjax5.js"></script>
    <script>
        /**
         * Requests scrapy.do under the application's context path and reports
         * the outcome of the request in a layui "layer" dialog.
         */
        function scrapy() {
            // Load the layer module on demand and show the given message in it.
            function notify(message) {
                layui.use('layer', function(){
                    layui.layer.alert(message);
                });
            }

            $.ajax({
                url:'${pageContext.request.contextPath }/scrapy.do',
                success:function () {
                    notify('爬取成功!');
                },
                // Without this callback a failed request gave the user no feedback.
                error:function () {
                    notify('请求错误!');
                }
            });
        }

    </script>
    <link rel="stylesheet"
          href="layui/css/layui.css">
</head>
<body class="layui-layout-body">
<div class="layui-layout layui-layout-admin">
    <div class="layui-header">
        <div class="layui-logo">
            <!-- The image is the only content of the link, so its alt text names the link. -->
            <a href="index.jsp"><img src="images/logoblack.png" alt="返回首页" /></a>
        </div>
        <!-- Header area (can be combined with layui's built-in horizontal navigation) -->
        <ul class="layui-nav layui-layout-left">
            <li class="layui-nav-item"><a href="">控制台</a></li>
            <li class="layui-nav-item"><a href="">商品管理</a></li>
            <li class="layui-nav-item"><a href="">用户</a></li>
            <li class="layui-nav-item"><a href="javascript:;">BigData</a>
                <dl class="layui-nav-child">
                    <dd>
                        <a href="showCrawlerData.jsp" target="main">查询数据</a>
                    </dd>
                    <dd>
                        <a href="recruit/insertZhiRecruit">智联招聘</a>
                    </dd>
                    <dd>
                        <a href="javascript:scrapy();">前程无忧</a>
                    </dd>
                    <dd>
                        <a href="recruit/insertBossRecruit">Boss直聘</a>
                    </dd>
                    <dd>
                        <!-- NOTE(review): plain GET link; judging by its label and URL this looks like a delete-all action. Consider a confirmation step and a POST request. -->
                        <a href="recruit/delAllRecruit">删库谨慎</a>
                    </dd>
                </dl>
            </li>
        </ul>
        <ul class="layui-nav layui-layout-right">
            <!-- The avatar sits next to the visible user name, so it is marked decorative with an empty alt. -->
            <li class="layui-nav-item"><a href="javascript:;"> <img
                    src="images/logo.png" class="layui-nav-img" alt=""> 某大牛培训老师 </a>
                <dl class="layui-nav-child">
                    <dd>
                        <a href="javascript:ajaxRequest5();">基本资料</a>
                    </dd>
                    <dd>
                        <a href="echarts.jsp">安全设置</a>
                    </dd>
                </dl>
            </li>
            <li class="layui-nav-item"><a href="javascript:;">退了</a></li>
        </ul>
    </div>

    <!-- Left sidebar: a layui tree navigation (id "menu") with four collapsible groups. -->
    <div  class="layui-side layui-bg-black">
        <div class="layui-side-scroll">
            <!-- Left navigation area (can be combined with layui's built-in vertical navigation) -->
            <ul id="menu" class="layui-nav layui-nav-tree" lay-filter="test">
                <!-- Group 1: crawler actions, each invoked through a javascript: URL. -->
                <li class="layui-nav-item"><a class=""
                                              href="javascript:;">爬虫管理</a>
                    <dl class="layui-nav-child">
                        <dd>
                            <a href="javascript:ajaxRequest();">爬取招聘数据</a>
                        </dd>


                        <dd>
                            <a href="javascript:ajaxRequest3();">爬取行政区域</a>
                        </dd>
                        <dd>
                            <a href="javascript:ajaxRequest2();">爬取图片</a>
                        </dd>

                    </dl>
                </li>
                <!-- Group 2: data processing. NOTE(review): both entries link to the same ik/ikData URL; confirm this is intended. -->
                <li class="layui-nav-item"><a href="javascript:;">数据处理</a>
                    <dl class="layui-nav-child">
                        <dd>
                            <a href="ik/ikData">生成分词</a>
                        </dd>
                        <dd>
                            <a href="ik/ikData">招聘信息分析</a>
                        </dd>

                    </dl>
                </li>
                <!-- Group 3: big-data processing. Two of the three entries again link to ik/ikData. -->
                <li class="layui-nav-item"><a href="javascript:;">大数据处理</a>
                    <dl class="layui-nav-child">
                        <dd>
                            <a href="ik/ikData">生成分词</a>
                        </dd>
                        <dd>
                            <!-- NOTE(review): target="main" on a javascript: URL may evaluate the script in the iframe rather than in this page; confirm ajaxRequest4 is reachable there. -->
                            <a href="javascript:ajaxRequest4();" target="main">提交数据</a>
                        </dd>
                        <dd>
                            <a href="ik/ikData">大数据统计</a>
                        </dd>

                    </dl>
                </li>
                <!-- Group 4: system administration; both entries are placeholders with no action. -->
                <li class="layui-nav-item"><a href="javascript:;">系统管理</a>
                    <dl class="layui-nav-child">
                        <dd>
                            <a href="javascript:;">用户管理</a>
                        </dd>
                        <dd>
                            <a href="javascript:;">权限管理</a>
                        </dd>
                    </dl>
                </li>
            </ul>
        </div>
    </div>

    <div class="layui-body" style="padding: 5px">
        <!-- Main content area: links with target="main" load into this frame -->
        <div >
            <!-- The title gives the frame an accessible name for assistive technology. -->
            <iframe name="main" title="主内容区" frameborder="0" width="100%" height="90%" src="welcome.html"></iframe>
        </div>
    </div>

    <div class="layui-footer">
        <!-- 底部固定区域 -->
    </div>
</div>
<script src="${pageContext.request.contextPath }/layui/layui.js"></script>
<script>
    // Page script: when an item inside #menu is clicked, strip the
    // "layui-nav-itemed" class from its sibling items.
    layui.use(['jquery','element'], function(){
        var jq = layui.$;
        // The element module is read here but its value is not used below.
        var element = layui.element;

        function collapseSiblings() {
            jq(this).siblings().removeClass("layui-nav-itemed");
        }

        jq("#menu li").on("click", collapseSiblings);
    });
</script>
</body>
</html>

在这里插入图片描述
在这里插入图片描述

package com.ld.jsoup.servlet;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
//ctrl+o
/**
 * Servlet that fetches ten 51job search-result pages for "java" positions,
 * follows every job link found on them and prints the job titles of the
 * detail pages to standard output.
 *
 * <p>Listing pages are downloaded on the request thread; following the links
 * is handed to a small thread pool. The response body "success" is written as
 * soon as all listing pages have been submitted; it does not wait for the
 * pool to finish.
 */
public class JsoupServlet extends HttpServlet {
    /** Number of worker threads that process listing pages. */
    private static final int WORKER_THREADS = 5;
    /** Last search-result page to fetch; page numbers start at 1. */
    private static final int LAST_PAGE = 10;
    /** Search URL up to (not including) the page number. */
    private static final String SEARCH_URL_PREFIX =
            "https://search.51job.com/list/120400%252C010000,000000,0000,00,9,99,java,2,";
    /** Search URL from the page number onwards. */
    private static final String SEARCH_URL_SUFFIX =
            ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    /**
     * Starts the crawl and answers with the plain text "success".
     *
     * @param request  the incoming request; only its character encoding is set
     * @param response receives the text "success"
     * @throws ServletException declared by the servlet contract
     * @throws IOException      if the response writer cannot be obtained
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // Set the character encoding of the request and of the response.
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=UTF-8");
        // Writer used for the response text.
        PrintWriter out = response.getWriter();
        ExecutorService executorService = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            for (int i = 1; i <= LAST_PAGE; i++) {
                final String url = SEARCH_URL_PREFIX + i + SEARCH_URL_SUFFIX;
                try {
                    // connect() targets the URL, get() downloads and parses it into a Document.
                    final Document document = Jsoup.connect(url).get();
                    executorService.execute(new Runnable() {
                        public void run() {
                            printJobTitles(document);
                        }
                    });
                } catch (IOException e) {
                    // A listing page that cannot be fetched is skipped; the others are still crawled.
                    e.printStackTrace();
                }
                System.out.println("-----------------------------------------------------");
            }
        } finally {
            // Lets the queued tasks finish and then releases the worker threads.
            // Without this every request left five idle threads behind.
            executorService.shutdown();
        }
        out.print("success");
        out.close();
    }

    /**
     * Follows every job link on a search-result page and prints the text of
     * each "div.cn h1" element found on the linked detail page.
     *
     * @param listing a parsed search-result page
     */
    private static void printJobTitles(Document listing) {
        // select() returns the elements matching the CSS selector.
        Elements links = listing.select("p.t1.tg1 span a");
        for (Element link : links) {
            String absUrl = link.absUrl("href");
            if (absUrl.isEmpty()) {
                // absUrl() yields "" when the href is missing or cannot be resolved.
                continue;
            }
            Document detail;
            try {
                detail = Jsoup.connect(absUrl).get();
            } catch (IOException e) {
                // Skip this job instead of dereferencing a null document.
                e.printStackTrace();
                continue;
            }
            for (Element title : detail.select("div.cn h1")) {
                // text() returns the element's text content; print the heading of
                // the detail page rather than the text of the listing link.
                System.out.println("jsoup====" + title.text());
            }
        }
    }

    /**
     * Handles POST exactly like GET.
     */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
        doGet(req,resp);
    }
}

在这里插入图片描述

/**
 * Sends a GET request to the relative URL "jsoup", expecting a plain-text
 * response. The response is passed to ifSuccess; a failed request shows an
 * alert instead.
 */
function ajaxRequest() {
    var settings = {
        url: "jsoup",
        type: "GET",
        dataType: "text",
        success: ifSuccess,
        error: function () {
            alert("请求错误!");
        }
    };
    $.ajax(settings);
}

/**
 * Shows the outcome of a crawl request in an alert box.
 *
 * @param data response text; the value "success" counts as success,
 *             anything else as failure
 */
function ifSuccess(data){
    // Loose equality is kept so the comparison behaves exactly as before.
    var message = (data == "success") ? "爬取成功!" : "爬取失败!";
    alert(message);
}

在这里插入图片描述

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Deployment descriptor (Servlet 3.0 schema). It declares index.jsp as the
  welcome file and registers five servlets from com.ld.jsoup.servlet, each
  with one URL pattern.
  NOTE(review): the page also requests scrapy.do, recruit/... and ik/ikData,
  for which no mapping appears in this descriptor; presumably they are
  handled elsewhere. Confirm.
-->
<web-app xmlns="http://java.sun.com/xml/ns/javaee"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
		  http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd"
         version="3.0">
  <display-name>Archetype Created Web Application</display-name>
  <!-- Page served when the application root is requested. -->
  <welcome-file-list>
    <welcome-file>index.jsp</welcome-file>
  </welcome-file-list>

  <!-- JsoupImgServlet is reachable at /src. -->
  <servlet>
    <servlet-name>JsoupImgServlet</servlet-name>
    <servlet-class>com.ld.jsoup.servlet.JsoupImgServlet</servlet-class>
  </servlet>

  <servlet-mapping>
    <servlet-name>JsoupImgServlet</servlet-name>
    <url-pattern>/src</url-pattern>
  </servlet-mapping>


  <!-- JsoupServlet is reachable at /jsoup; this appears to be the URL that ajaxRequest() calls. -->
  <servlet>
    <servlet-name>JsoupServlet</servlet-name>
    <servlet-class>com.ld.jsoup.servlet.JsoupServlet</servlet-class>
  </servlet>
  <servlet-mapping>
    <servlet-name>JsoupServlet</servlet-name>
    <url-pattern>/jsoup</url-pattern>
  </servlet-mapping>
  
  <!-- JobServlet is reachable at /JobServlet. -->
  <servlet>
    <servlet-name>JobServlet</servlet-name>
    <servlet-class>com.ld.jsoup.servlet.JobServlet</servlet-class>
  </servlet>
  <servlet-mapping>
    <servlet-name>JobServlet</servlet-name>
    <url-pattern>/JobServlet</url-pattern>
  </servlet-mapping>

  <!-- uploadFileServlet is reachable at /uploadFileServlet. -->
  <servlet>
    <servlet-name>uploadFileServlet</servlet-name>
    <servlet-class>com.ld.jsoup.servlet.uploadFileServlet</servlet-class>
  </servlet>
  <servlet-mapping>
    <servlet-name>uploadFileServlet</servlet-name>
    <url-pattern>/uploadFileServlet</url-pattern>
  </servlet-mapping>


  <!-- JobToFileServlet is reachable at /JobToFile. -->
  <servlet>
    <servlet-name>JobToFileServlet</servlet-name>
    <servlet-class>com.ld.jsoup.servlet.JobToFileServlet</servlet-class>
  </servlet>
  <servlet-mapping>
    <servlet-name>JobToFileServlet</servlet-name>
    <url-pattern>/JobToFile</url-pattern>
  </servlet-mapping>
</web-app>

在这里插入图片描述
点击运行,爬取招聘数据
在这里插入图片描述
在这里插入图片描述

还是刚入门,只能做些比较 low 的东西。不过觉得线程池很好用,下次学习把数据存入数据库,那样就比较实用了。
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_43820992/article/details/88052997