java爬虫Demo1 爬取数据到数据库

java爬虫Demo1            爬取数据到数据库

准备工作:

**url地址:**

https://search.51job.com/list/000000,000000,0000,01%252C32,9,99,Java%2B%25E6%259E%25B6%25E6%259E%2584,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

Maven依赖:

<parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>


    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Spring MVC (web starter) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- WebMagic core; its slf4j-log4j12 binding is excluded below -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- WebMagic extension -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava, required for WebMagic's Bloom-filter support -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>

        <!-- Utility libraries -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.3</version>
        </dependency>
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-core</artifactId>
            <version>3.0.5</version>
        </dependency>


    </dependencies>

启动项:

/**
 * Spring Boot entry point for the crawler demo.
 *
 * <p>{@code @EnableScheduling} turns on processing of {@code @Scheduled}
 * methods so that scheduled jobs declared on Spring beans are triggered.
 */
@SpringBootApplication
@EnableScheduling
public class application {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, forwarded unchanged to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(application.class, args);
    }
}

配置文件:

# JDBC connection settings for the MySQL datasource.
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
# No host/port is given in the URL; presumably the driver falls back to its
# default local server - TODO confirm. "test" is the schema name.
spring.datasource.url=jdbc:mysql:///test
spring.datasource.username=root
spring.datasource.password=0000

# Print the SQL statements issued by JPA.
spring.jpa.show-sql=true
spring.jpa.database=MYSQL

实体类:


/**
 * JPA entity stored in the {@code drawlerdemo2} table.
 *
 * <p>Despite its name, one instance holds the data scraped from a single job
 * posting: the company fields, the job fields, a salary range, the page URL
 * and the publish time. It is a plain mutable bean with one getter/setter
 * pair per column.
 */
@Entity
@Table(name = "drawlerdemo2")
public class User {

    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    // Company details
    private String companyName;
    private String companyAddr;
    private String companyInfo;

    // Job details
    private String jobName;
    private String jobAddr;
    private String jobInfo;

    // Salary range
    private Integer salaryMin;
    private Integer salaryMax;

    // Page the record was scraped from, and its publish time as text
    private String url;
    private String time;

    /** @return the database identifier, or {@code null} if not yet assigned */
    public Long getId() {
        return this.id;
    }

    /** @param id the database identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the company name */
    public String getCompanyName() {
        return this.companyName;
    }

    /** @param companyName the company name */
    public void setCompanyName(String companyName) {
        this.companyName = companyName;
    }

    /** @return the company address */
    public String getCompanyAddr() {
        return this.companyAddr;
    }

    /** @param companyAddr the company address */
    public void setCompanyAddr(String companyAddr) {
        this.companyAddr = companyAddr;
    }

    /** @return the company description */
    public String getCompanyInfo() {
        return this.companyInfo;
    }

    /** @param companyInfo the company description */
    public void setCompanyInfo(String companyInfo) {
        this.companyInfo = companyInfo;
    }

    /** @return the job title */
    public String getJobName() {
        return this.jobName;
    }

    /** @param jobName the job title */
    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    /** @return the job address */
    public String getJobAddr() {
        return this.jobAddr;
    }

    /** @param jobAddr the job address */
    public void setJobAddr(String jobAddr) {
        this.jobAddr = jobAddr;
    }

    /** @return the job description */
    public String getJobInfo() {
        return this.jobInfo;
    }

    /** @param jobInfo the job description */
    public void setJobInfo(String jobInfo) {
        this.jobInfo = jobInfo;
    }

    /** @return the lower bound of the salary range */
    public Integer getSalaryMin() {
        return this.salaryMin;
    }

    /** @param salaryMin the lower bound of the salary range */
    public void setSalaryMin(Integer salaryMin) {
        this.salaryMin = salaryMin;
    }

    /** @return the upper bound of the salary range */
    public Integer getSalaryMax() {
        return this.salaryMax;
    }

    /** @param salaryMax the upper bound of the salary range */
    public void setSalaryMax(Integer salaryMax) {
        this.salaryMax = salaryMax;
    }

    /** @return the URL of the scraped page */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL of the scraped page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the publish time, as text */
    public String getTime() {
        return this.time;
    }

    /** @param time the publish time, as text */
    public void setTime(String time) {
        this.time = time;
    }

    /**
     * Renders every field in {@code User{name=value, ...}} form; string
     * fields are wrapped in single quotes.
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("User{");
        out.append("id=").append(id);
        out.append(", companyName='").append(companyName).append('\'');
        out.append(", companyAddr='").append(companyAddr).append('\'');
        out.append(", companyInfo='").append(companyInfo).append('\'');
        out.append(", jobName='").append(jobName).append('\'');
        out.append(", jobAddr='").append(jobAddr).append('\'');
        out.append(", jobInfo='").append(jobInfo).append('\'');
        out.append(", salaryMin=").append(salaryMin);
        out.append(", salaryMax=").append(salaryMax);
        out.append(", url='").append(url).append('\'');
        out.append(", time='").append(time).append('\'');
        out.append('}');
        return out.toString();
    }
}

:Dao层


import com.drawler.domani.User;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Repository;


// Create/read/update/delete access to the database for User entities.
// The interface declares no methods of its own; everything comes from JpaRepository.

@Repository
public interface IuserDao extends JpaRepository<User,Long> {
}

:Service层

接口:

/**
 * Service-layer contract for storing scraped records.
 */
public interface Iservice {

    /**
     * Stores the given record.
     *
     * @param user the scraped record to store
     */
    void save(User user);
}

接口实现类:


/**
 * Saves scraped records, skipping any that are already stored.
 *
 * @author 刚满20就秃顶
 */
@Service
public class serviceImpl implements Iservice {

    @Autowired
    private IuserDao dao;

    /**
     * Persists {@code user} unless a row with the same URL and publish time
     * already exists.
     *
     * <p>Only {@code url} and {@code time} are copied into the probe, so the
     * query-by-example match is restricted to those two columns; other
     * columns repeat too often across postings to be useful for detecting
     * duplicates. Null probe properties are ignored by the matcher, so a
     * record whose URL and time are both null matches any existing row.
     *
     * <p>NOTE(review): check-then-insert is not atomic. If several crawler
     * threads save the same posting concurrently, both may pass the check;
     * a unique constraint on (url, time) would close that gap.
     *
     * @param user the record to store; {@code null} is ignored
     */
    @Override
    public void save(User user) {
        if (user == null) {
            return;
        }

        User probe = new User();
        probe.setUrl(user.getUrl());
        probe.setTime(user.getTime());
        Example<User> example = Example.of(probe);

        // Ask only whether a match exists instead of loading every matching row.
        if (!dao.exists(example)) {
            // No stored row matches: insert and flush immediately.
            dao.saveAndFlush(user);
        }
    }
}

:解析

/**
 * WebMagic page processor for the job search crawl.
 *
 * <p>A page that contains result rows ({@code div.dw_table div.el}) is treated
 * as a list page: the links of every row and the link of the last pager item
 * are queued. Any other page is treated as a job detail page and is parsed
 * into a {@link User} that is published under the result key {@code "user"}.
 */
@Component
public class process implements PageProcessor {

    /** Number of leading characters of the header line kept as the company address. */
    private static final int COMPANY_ADDR_LENGTH = 7;

    /** Seed URL: first page of the search result list. */
    private static final String START_URL = "https://search.51job.com/"
            + "list/000000,000000,0000,01%252C32,9,99,"
            + "Java%2B%25E6%259E%25B6%25E6%259E%2584,2,1.html"
            + "?lang=c&postchannel=0000&workyear=99&cotype=99"
            + "&degreefrom=99&jobterm=99&companysize=99"
            + "&ord_field=0&dibiaoid=0&line=&welfare=";

    private final Site site = Site.me()
            .setTimeOut(1000 * 10)
            .setCharset("gbk")
            .setRetrySleepTime(1000)
            .setRetryTimes(5);

    @Autowired
    private pepline pepline;

    /**
     * Queues follow-up requests for list pages, or extracts a record from a
     * detail page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        List<Selectable> rows = html.css("div.dw_table div.el").nodes();

        if (!rows.isEmpty()) {
            // List page: follow every result row, then the last pager item.
            for (Selectable row : rows) {
                page.addTargetRequest(row.links().toString());
            }
            page.addTargetRequest(html.css("div.p_in   li:nth-last-child(1)").links().toString());
            return;
        }

        // Detail page: hand the parsed record to the pipelines.
        page.putField("user", parseDetail(page, html));
    }

    /**
     * Builds a record from a detail page. Elements that are missing from the
     * page leave the corresponding field {@code null} instead of aborting the
     * whole page with an exception.
     */
    private User parseDetail(Page page, Html html) {
        User user = new User();
        user.setUrl(page.getUrl().toString());
        user.setCompanyName(html.css("div.cn a[target=_blank][title]", "text").toString());
        user.setCompanyAddr(
                leading(html.css("div.cn p.msg[title]", "text").toString(), COMPANY_ADDR_LENGTH));
        user.setCompanyInfo(html.css("div.tBorderTop_box div.tmsg", "text").toString());
        user.setJobName(html.css("div.mt10 a", "text").toString());

        // The job address is taken from the second p.fp paragraph; not every page has one.
        List<Selectable> paragraphs = html.css("div.tBorderTop_box p.fp", "text").nodes();
        user.setJobAddr(paragraphs.size() > 1 ? paragraphs.get(1).toString() : null);

        user.setJobInfo(html.css("div.job_msg p", "text").all().toString());

        String salaryText = html.css("div.in div.cn strong", "text").toString();
        if (salaryText != null && !salaryText.trim().isEmpty()) {
            Integer[] salary = MathSalary.getSalary(salaryText);
            user.setSalaryMin(salary[0]);
            user.setSalaryMax(salary[1]);
        }

        user.setTime(publishTime(html.css("div.in div.cn p.msg", "text").toString()));
        return user;
    }

    /** Returns at most the first {@code maxLength} characters of {@code text}; null-safe. */
    private static String leading(String text, int maxLength) {
        if (text == null || text.length() <= maxLength) {
            return text;
        }
        return text.substring(0, maxLength);
    }

    /**
     * Cuts the publish time out of the header line: the text that starts two
     * characters after the first "人" and ends right before "发布". Returns
     * {@code null} when either marker is missing or they are out of order.
     */
    private static String publishTime(String header) {
        if (header == null) {
            return null;
        }
        int marker = header.indexOf("人");
        int end = header.indexOf("发布");
        if (marker < 0 || end < 0 || marker + 2 > end) {
            return null;
        }
        return header.substring(marker + 2, end);
    }

    /** @return the crawl settings (timeout, charset, retry policy) */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled crawl job: starts a spider on the seed URL and blocks until it
     * finishes. With {@code fixedDelay}, the next run starts one second after
     * the previous one has returned.
     *
     * <p>NOTE(review): the Bloom filter is sized for 1000 expected URLs; a
     * larger crawl presumably raises its false-positive rate, which would
     * make it skip unseen URLs - confirm and size accordingly.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 1000)
    public void poc() {
        // Reuse this Spring-managed bean rather than creating a second, unmanaged instance.
        Spider.create(this)
                .addUrl(START_URL)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000)))
                .addPipeline(pepline)
                .thread(20)
                .run();
    }
}

:保存


/**
 * WebMagic pipeline that forwards each extracted record to the service layer.
 *
 * @author 刚满20就秃顶
 */
@Component
public class pepline implements Pipeline {

    @Autowired
    private Iservice iservice;

    /**
     * Reads the value stored under the key {@code "user"} and saves it.
     * Results without that key are ignored.
     *
     * @param resultItems the values extracted from one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        User extracted = resultItems.get("user");
        if (extracted == null) {
            return;
        }
        iservice.save(extracted);
    }
}

:工具类

package com.drawler.task;

/**
 * Converts the salary text shown on a job page into a yearly
 * {@code [min, max]} range.
 *
 * <p>The Chinese unit and period characters are written as unicode escapes so
 * that the class compiles identically whatever the source-file encoding is.
 */
public class MathSalary {

    /** U+4E07: the "ten thousand" unit. */
    private static final String TEN_THOUSAND = "\u4e07";
    /** U+6708: the "per month" period. */
    private static final String MONTH = "\u6708";
    /** U+5E74: the "per year" period. */
    private static final String YEAR = "\u5e74";
    /** U+5143: the "yuan" currency suffix. */
    private static final String YUAN = "\u5143";
    /** U+4EE5 U+4E0A: the open-ended "or more" suffix. */
    private static final String OR_MORE = "\u4ee5\u4e0a";
    /** U+4EE5 U+4E0B: the open-ended "or less" suffix. */
    private static final String OR_LESS = "\u4ee5\u4e0b";

    /** Working days assumed per year when converting a daily rate. */
    private static final int WORKING_DAYS_PER_YEAR = 240;
    private static final int MONTHS_PER_YEAR = 12;

    /**
     * Parses a salary text into a yearly range.
     *
     * <p>Recognised shapes, where the text always ends with a slash and a
     * one-character period:
     * <ul>
     *   <li>{@code <amount>/<day>}, optionally with a yuan suffix on the
     *       amount: multiplied by 240, min and max are equal;</li>
     *   <li>{@code <min>-<max><unit>/<month|year>}: each bound is multiplied
     *       by 10000 when the unit is ten-thousand and by 1000 otherwise,
     *       then by 12 when the period is a month;</li>
     *   <li>{@code <amount><unit><or more|or less>/<month|year>} and
     *       {@code <amount><unit>/<month|year>}: the single figure is used
     *       for both bounds.</li>
     * </ul>
     * Any period other than month or year is handled as a daily rate.
     *
     * @param salaryStr the salary text; may be {@code null}
     * @return a two-element array {@code [min, max]}; a part that cannot be
     *         parsed is reported as 0, and {@code [0, 0]} is returned for
     *         null, blank or too-short input
     */
    public static Integer[] getSalary(String salaryStr) {
        Integer[] salary = {0, 0};
        if (salaryStr == null) {
            return salary;
        }
        String text = salaryStr.trim();
        if (text.length() < 2) {
            return salary;
        }

        String date = text.substring(text.length() - 1);
        // Drop the trailing slash and period character.
        String amount = text.substring(0, text.length() - 2);

        if (!MONTH.equals(date) && !YEAR.equals(date)) {
            if (amount.endsWith(YUAN)) {
                amount = amount.substring(0, amount.length() - YUAN.length());
            }
            salary[0] = salary[1] = str2Num(amount, WORKING_DAYS_PER_YEAR);
            return salary;
        }

        // Open-ended figures carry a two-character suffix after the unit.
        if (amount.endsWith(OR_MORE) || amount.endsWith(OR_LESS)) {
            amount = amount.substring(0, amount.length() - 2);
        }
        if (amount.isEmpty()) {
            return salary;
        }

        String unit = amount.substring(amount.length() - 1);
        String[] bounds = amount.substring(0, amount.length() - 1).split("-");
        if (bounds.length == 0) {
            return salary;
        }

        salary[0] = mathSalary(date, unit, bounds[0]);
        // A single figure stands for both ends of the range.
        salary[1] = bounds.length > 1 ? mathSalary(date, unit, bounds[1]) : salary[0];
        return salary;
    }

    /** Scales one bound by its unit and, for monthly figures, by twelve months. */
    private static Integer mathSalary(String date, String unit, String salaryStr) {
        int multiplier = TEN_THOUSAND.equals(unit) ? 10000 : 1000;
        int salary = str2Num(salaryStr, multiplier);
        return MONTH.equals(date) ? salary * MONTHS_PER_YEAR : salary;
    }

    /**
     * Multiplies the decimal number in {@code salaryStr} by {@code num} and
     * drops any fraction. Decimal arithmetic is used because binary floating
     * point can land just below the exact product and lose one unit when
     * truncated.
     *
     * @return the product, or 0 when the text is not a number
     */
    private static int str2Num(String salaryStr, int num) {
        try {
            return new java.math.BigDecimal(salaryStr.trim())
                    .multiply(java.math.BigDecimal.valueOf(num))
                    .intValue();
        } catch (NumberFormatException ignored) {
            // Best effort: unparsable salary text counts as 0.
            return 0;
        }
    }
}

在这里插入图片描述
截取写得很LOW

照搬(黑马的视频)
模仿
理解
改写
我的
(手动滑稽)

发布了24 篇原创文章 · 获赞 2 · 访问量 458

猜你喜欢

转载自blog.csdn.net/tiangoua/article/details/103601691