准备工作:
**url地址:**
https://search.51job.com/list/000000,000000,0000,01%252C32,9,99,Java%2B%25E6%259E%25B6%25E6%259E%2584,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
Maven依赖:
<!-- Inherit from the Spring Boot starter parent; dependencies below that omit a <version> rely on it. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.2.RELEASE</version>
</parent>
<properties>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Spring MVC (web starter) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC connector -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- WebMagic core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<!-- Drop the SLF4J-to-log4j12 binding pulled in by WebMagic; presumably to avoid clashing with the logging backend used by Spring Boot - confirm. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- Guava, required for WebMagic's Bloom-filter support -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
<!-- Utility library (Apache Commons Lang) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- Logback logging backend, version pinned explicitly -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency>
<!-- NOTE(review): no MyBatis-Plus usage is visible in the code of this article - confirm this dependency is actually needed. -->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-core</artifactId>
<version>3.0.5</version>
</dependency>
</dependencies>
启动项:
/**
 * Spring Boot entry point.
 *
 * <p>Scheduling is enabled here so that methods annotated with {@code @Scheduled}
 * in the application's beans are triggered.
 */
@SpringBootApplication
@EnableScheduling
public class application {

    /**
     * Boots the Spring context.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(application.class, args);
    }
}
配置文件:
# JDBC driver class used for the MySQL connection.
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
# Database "test"; no host/port given, so the driver's default host is used (presumably localhost:3306 - confirm).
spring.datasource.url=jdbc:mysql:///test
# NOTE(review): credentials are hard-coded here; move them out of the file for anything beyond a local demo.
spring.datasource.username=root
spring.datasource.password=0000
# Print the SQL statements issued by JPA.
spring.jpa.show-sql=true
# Target database platform for JPA.
spring.jpa.database=MYSQL
实体类:
/**
 * JPA entity holding one crawled job posting, stored in table {@code drawlerdemo2}.
 *
 * <p>Plain mutable bean: every field has a getter and a setter, and the primary key
 * is generated by the database (identity strategy).
 */
@Entity
@Table(name = "drawlerdemo2")
public class User {

    /** Database-generated primary key. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    private String companyName;
    private String companyAddr;
    private String companyInfo;
    private String jobName;
    private String jobAddr;
    private String jobInfo;
    private Integer salaryMin;
    private Integer salaryMax;
    private String url;
    private String time;

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCompanyName() {
        return this.companyName;
    }

    public void setCompanyName(String companyName) {
        this.companyName = companyName;
    }

    public String getCompanyAddr() {
        return this.companyAddr;
    }

    public void setCompanyAddr(String companyAddr) {
        this.companyAddr = companyAddr;
    }

    public String getCompanyInfo() {
        return this.companyInfo;
    }

    public void setCompanyInfo(String companyInfo) {
        this.companyInfo = companyInfo;
    }

    public String getJobName() {
        return this.jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getJobAddr() {
        return this.jobAddr;
    }

    public void setJobAddr(String jobAddr) {
        this.jobAddr = jobAddr;
    }

    public String getJobInfo() {
        return this.jobInfo;
    }

    public void setJobInfo(String jobInfo) {
        this.jobInfo = jobInfo;
    }

    public Integer getSalaryMin() {
        return this.salaryMin;
    }

    public void setSalaryMin(Integer salaryMin) {
        this.salaryMin = salaryMin;
    }

    public Integer getSalaryMax() {
        return this.salaryMax;
    }

    public void setSalaryMax(Integer salaryMax) {
        this.salaryMax = salaryMax;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    /** Renders every field as {@code User{name=value, ...}}, quoting the string fields. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("User{");
        text.append("id=").append(id);
        text.append(", companyName='").append(companyName).append('\'');
        text.append(", companyAddr='").append(companyAddr).append('\'');
        text.append(", companyInfo='").append(companyInfo).append('\'');
        text.append(", jobName='").append(jobName).append('\'');
        text.append(", jobAddr='").append(jobAddr).append('\'');
        text.append(", jobInfo='").append(jobInfo).append('\'');
        text.append(", salaryMin=").append(salaryMin);
        text.append(", salaryMax=").append(salaryMax);
        text.append(", url='").append(url).append('\'');
        text.append(", time='").append(time).append('\'');
        text.append('}');
        return text.toString();
    }
}
:Dao层
import com.drawler.domani.User;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Repository;
/**
 * Repository for {@link User} rows (create, read, update, delete).
 *
 * <p>Declares no methods of its own; everything comes from {@code JpaRepository}.
 */
@Repository
public interface IuserDao extends JpaRepository<User, Long> {
}
:Service层
接口:
/** Service-layer contract for storing crawled job postings. */
public interface Iservice {

    /**
     * Stores the given posting.
     *
     * @param user the posting to store
     */
    void save(User user);
}
接口实现类:
/**
 * Saves crawled postings, skipping those that are already stored.
 *
 * <p>A posting counts as already stored when a row with the same URL and the same
 * publish time exists.
 *
 * @author 刚满20就秃顶
 */
@Service
public class serviceImpl implements Iservice {

    @Autowired
    private IuserDao dao;

    /**
     * Persists {@code user} unless an equivalent row already exists.
     *
     * @param user the posting to store; {@code null} is ignored
     */
    @Override
    public void save(User user) {
        if (user == null) {
            return;
        }

        // Probe carrying only the fields used for de-duplication. Other fields could be
        // compared too, but URL + publish time is what identifies a posting here.
        User probe = new User();
        probe.setUrl(user.getUrl());
        probe.setTime(user.getTime());
        Example<User> example = Example.of(probe);

        // Ask the database whether a match exists instead of loading every matching row.
        // NOTE(review): check-then-insert is not atomic; concurrent callers could still
        // insert the same posting twice unless the table has a unique constraint.
        if (!dao.exists(example)) {
            dao.saveAndFlush(user);
        }
    }
}
:解析
/**
 * WebMagic page processor that walks job search result pages and extracts one
 * {@code User} record per job-detail page.
 *
 * <p>A page containing {@code div.dw_table div.el} rows is treated as a list page: the link
 * taken from each row and the pagination link are queued. Every other page is parsed as a
 * detail page.
 */
@Component
public class process implements PageProcessor {

    /** Seed search URL the crawl starts from. */
    private static final String START_URL =
            "https://search.51job.com/"
                    + "list/000000,000000,0000,01%252C32,9,99,"
                    + "Java%2B%25E6%259E%25B6%25E6%259E%2584,2,1.html"
                    + "?lang=c&postchannel=0000&workyear=99&cotype=99"
                    + "&degreefrom=99&jobterm=99&companysize=99"
                    + "&ord_field=0&dibiaoid=0&line=&welfare=";

    /** Number of leading characters of the header line stored as the company address. */
    private static final int COMPANY_ADDR_LENGTH = 7;

    // Download settings: 10 000 timeout, GBK page charset, 5 retries with a 1000 sleep between them.
    Site site = Site.me()
            .setTimeOut(1000 * 10)
            .setCharset("gbk")
            .setRetrySleepTime(1000)
            .setRetryTimes(5);

    @Autowired
    private pepline pepline;

    /**
     * Queues follow-up links for list pages, or extracts a {@code User} from a detail page and
     * publishes it under the result key {@code "user"}.
     *
     * <p>A page that has neither list rows nor the detail header line is marked as skipped, so
     * no partial record is produced for it.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        List<Selectable> rows = html.css("div.dw_table div.el").nodes();

        if (!rows.isEmpty()) {
            // List page: queue the link from each row, then the pagination link.
            for (Selectable row : rows) {
                enqueue(page, row.links().toString());
            }
            // The pagination target is queued like any other URL; once downloaded it is a list
            // page again and takes this same branch.
            enqueue(page, html.css("div.p_in li:nth-last-child(1)").links().toString());
            return;
        }

        String headerTitle = html.css("div.cn p.msg[title]", "text").toString();
        if (headerTitle == null) {
            // Not a parseable detail page (the old code failed with a NullPointerException here).
            page.setSkip(true);
            return;
        }

        User user = new User();
        user.setUrl(page.getUrl().toString());
        user.setCompanyName(html.css("div.cn a[target=_blank][title]", "text").toString());
        user.setCompanyAddr(leading(headerTitle, COMPANY_ADDR_LENGTH));
        user.setCompanyInfo(html.css("div.tBorderTop_box div.tmsg", "text").toString());
        user.setJobName(html.css("div.mt10 a", "text").toString());

        // The job address is the second "p.fp" node; leave it unset when the page has fewer.
        List<Selectable> fpNodes = html.css("div.tBorderTop_box p.fp", "text").nodes();
        user.setJobAddr(fpNodes.size() > 1 ? fpNodes.get(1).toString() : null);

        user.setJobInfo(html.css("div.job_msg p", "text").all().toString());

        // Postings without a salary text keep both salary bounds unset.
        String salaryText = html.css("div.in div.cn strong", "text").toString();
        if (salaryText != null && !salaryText.trim().isEmpty()) {
            Integer[] salary = MathSalary.getSalary(salaryText);
            if (salary != null && salary.length >= 2) {
                user.setSalaryMin(salary[0]);
                user.setSalaryMax(salary[1]);
            }
        }

        user.setTime(publishTime(html.css("div.in div.cn p.msg", "text").toString()));

        // Publish the record in the page's result items for the pipeline.
        page.putField("user", user);
    }

    /** Queues {@code link} as a follow-up request unless no link was found. */
    private static void enqueue(Page page, String link) {
        if (link != null) {
            page.addTargetRequest(link);
        }
    }

    /** Returns at most the first {@code max} characters of {@code text}. */
    private static String leading(String text, int max) {
        return text.length() <= max ? text : text.substring(0, max);
    }

    /**
     * Cuts the publish time out of the header line: the text that starts two characters after
     * the first "人" and ends right before "发布". Returns {@code null} when the line is missing
     * or does not contain both markers in that order.
     */
    private static String publishTime(String headerText) {
        if (headerText == null) {
            return null;
        }
        int marker = headerText.indexOf("人");
        int end = headerText.indexOf("发布");
        // +2 presumably skips the marker itself and the separator that follows it - confirm.
        int start = marker + 2;
        if (marker < 0 || end < 0 || start > end) {
            return null;
        }
        return headerText.substring(start, end);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled crawl: starts a spider on the seed URL with 20 threads and blocks until it
     * finishes. Configured with an initial delay of 1000 and a fixed delay of 1000 between runs.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 1000)
    public void poc() {
        // Reuse this bean as the processor rather than creating a second, unmanaged instance.
        Spider.create(this)
                .addUrl(START_URL)
                // URL de-duplication through a Bloom filter constructed with the argument 1000.
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(1000)))
                .addPipeline(pepline)
                .thread(20)
                .run();
    }
}
:保存
/**
 * WebMagic pipeline that forwards each extracted {@code User} to the service layer.
 *
 * @author 刚满20就秃顶
 */
@Component
public class pepline implements Pipeline {

    @Autowired
    private Iservice iservice;

    /**
     * Reads the {@code "user"} entry from the result items and saves it; result items
     * without that entry are ignored.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        User extracted = resultItems.get("user");
        if (extracted == null) {
            return;
        }
        iservice.save(extracted);
    }
}
:工具类
package com.drawler.task;
/**
 * Converts salary descriptions such as {@code "0.8-1.2万/月"} into yearly amounts.
 */
public class MathSalary {

    /**
     * Parses a salary description into a yearly {@code {min, max}} pair.
     *
     * <p>Recognised shapes (amount, then {@code /}, then period):
     * <ul>
     *   <li>{@code "500/天"} - any period other than 月 or 年 is treated as a daily rate and
     *       multiplied by 240; both bounds get the same value</li>
     *   <li>{@code "0.8-1.2万/月"}, {@code "5-8千/月"} - monthly range, multiplied by 12</li>
     *   <li>{@code "5-6万/年"} - yearly range</li>
     * </ul>
     * The character right before the slash is the unit: 万 means x10000, anything else is
     * treated as x1000. A single value without {@code -} is used for both bounds.
     *
     * @param salaryStr the salary text; may be {@code null}
     * @return a two-element array {@code {min, max}}; a part that cannot be parsed is 0, and
     *         {@code {0, 0}} is returned for null, blank or slash-less input
     */
    public static Integer[] getSalary(String salaryStr) {
        // Array holding the salary range.
        Integer[] salary = {0, 0};
        if (salaryStr == null) {
            return salary;
        }

        String text = salaryStr.trim();
        int slash = text.lastIndexOf('/');
        if (slash <= 0 || slash == text.length() - 1) {
            // No amount or no period around the slash: nothing to parse.
            return salary;
        }
        String date = text.substring(slash + 1);
        String amount = text.substring(0, slash);

        // Daily rate: multiply by 240 directly.
        if (!"月".equals(date) && !"年".equals(date)) {
            int yearly = str2Num(amount, 240);
            salary[0] = yearly;
            salary[1] = yearly;
            return salary;
        }

        String unit = amount.substring(amount.length() - 1);
        String[] bounds = amount.substring(0, amount.length() - 1).split("-");
        if (bounds.length == 0) {
            return salary;
        }
        String upper = bounds.length > 1 ? bounds[1] : bounds[0];
        salary[0] = mathSalary(date, unit, bounds[0]);
        salary[1] = mathSalary(date, unit, upper);
        return salary;
    }

    /** Applies the unit multiplier (万 = 10000, otherwise 1000) and, for monthly pay, x12. */
    private static Integer mathSalary(String date, String unit, String salaryStr) {
        int salary = str2Num(salaryStr, "万".equals(unit) ? 10000 : 1000);
        // Truncation to int happens before the x12 step, as in the previous implementation.
        return "月".equals(date) ? salary * 12 : salary;
    }

    /**
     * Multiplies the decimal number in {@code salaryStr} by {@code num} and truncates to int.
     * Uses exact decimal arithmetic so values like 0.8 do not pick up binary rounding error.
     * Returns 0 when the text is not a plain decimal number.
     */
    private static int str2Num(String salaryStr, int num) {
        if (salaryStr == null) {
            return 0;
        }
        try {
            return new java.math.BigDecimal(salaryStr.trim())
                    .multiply(java.math.BigDecimal.valueOf(num))
                    .intValue();
        } catch (NumberFormatException notANumber) {
            // Unparseable amounts deliberately count as 0 rather than aborting the caller.
            return 0;
        }
    }
}
截取写得很LOW
照搬(黑马的视频)
模仿
理解
改写
我的
(手动滑稽)