Java爬虫初体验:简单抓取IT之家热评(整合Spring Boot+Elasticsearch+Redis+Mybatis)

爬取主程序

使用Jsoup解析网页源代码

/**
 * Crawls the ITHome index pages with a fixed number of worker threads and stores every hot
 * comment it finds in MySQL (via MyBatis) and Elasticsearch; comments with at least 2500
 * up-votes are additionally added to a Redis ranking.
 *
 * <p>Workers take page numbers from a shared counter until one of them calls {@link #stop()},
 * which happens when a page yields no article links or contains a link that is stored in the
 * "ithome:breakpoints" Redis list.
 */
@Component
public class WebCrawler {

    private static final String encoding = "utf-8";

    /** Matches a run of digits wrapped in parentheses, e.g. the 12 in "(12)". Compiled once. */
    private static final Pattern NUMBER_IN_PARENS = Pattern.compile("\\((\\d+)\\)");

    @Autowired
    private HotCommentMapper hotCommentMapper;
    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    // volatile: written by whichever worker calls stop() and polled by all other workers.
    private static volatile boolean done = false;
    private static final int THREAD_NUM = 15;
    private static AtomicInteger page = new AtomicInteger(0);
    // volatile: assigned by the worker that handles page 1 and read inside stop() by any worker.
    private static volatile List<String> breakpoints;

    /**
     * Starts a crawl: resets the shared state and launches {@code THREAD_NUM} worker threads.
     * Each worker keeps requesting the next page number until {@link #stop()} has been called.
     */
    //@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3)
    public void start(){
        done = false;
        // Begin every run at page 1 again; without the reset a later run would continue
        // from wherever the previous run left the counter.
        page.set(0);
        breakpoints = null;
        System.out.println("开始爬取:"+System.currentTimeMillis());
        for (int i = 0;i<THREAD_NUM;++i){
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!done) {
                        int p = page.incrementAndGet();
                        crawl(p);
                    }
                    System.out.println(Thread.currentThread().getName()+":结束:"+System.currentTimeMillis());
                }
            },"Thread--"+i).start();
        }
    }

    /**
     * Signals all workers to finish and, if this run captured the links of page 1, stores them
     * in Redis as the break points for the next run. When no links were captured the
     * previously stored break points are left untouched.
     */
    public synchronized void stop(){
        done = true;
        List<String> latest = breakpoints;
        if (latest == null || latest.isEmpty()){
            return;
        }
        redisService.listRemove("ithome:breakpoints");
        redisService.listAdd("ithome:breakpoints",latest);
    }

    /**
     * Fetches one index page and processes every article linked from it. Stops the whole crawl
     * when the page has no links or contains a stored break point.
     *
     * @param page page number, starting at 1
     */
    public void crawl(int page){
        String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" +
                "page="+page+"&type=indexpage&randnum="+Math.random();
        String src = getHtmlSrc(url,encoding);
        List<String> links = getArticleLinks(src);
        if (links.size()<=0){
            stop();
            return ;
        }
        // A link already recorded as a break point means the previous run got this far.
        if(redisService.containsValue("ithome:breakpoints",links)){
            stop();
            return ;
        }
        // Remember the links of the first page as the break points for the next run.
        if (page == 1){
            breakpoints = links;
            // Another worker may have stopped the crawl before the links were captured;
            // call stop() again so they are still persisted.
            if (done){
                stop();
            }
        }
        for (String link:links){
            parseAndSaveHotComments(link);
        }
    }

    /**
     * Downloads a URL and returns its content as one string. Line terminators are dropped
     * because the lines are appended without a separator.
     *
     * @param url address to download
     * @param encoding character encoding used to decode the response
     * @return the page source, or whatever was read before a failure (possibly empty)
     */
    public String getHtmlSrc(String url,String encoding){
        StringBuilder src = new StringBuilder();
        try {
            URL urlObj = new URL(url);
            URLConnection urlConn = urlObj.openConnection();
            // try-with-resources closes the reader (and the underlying stream) on every path.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConn.getInputStream(),encoding))) {
                String line;
                while ((line = reader.readLine())!=null){
                    src.append(line);
                }
            }
        }catch (Exception e){
            // Best effort: report the failure and return what has been read so far.
            e.printStackTrace();
        }
        return src.toString();
    }

    /**
     * Extracts the article links from an index page.
     *
     * @param srcCode HTML source of the index page
     * @return the {@code href} of every {@code a} element that is a direct child of an {@code h2}
     */
    public List<String> getArticleLinks(String srcCode){
        List<String> links = new ArrayList<String>();
        Document document = Jsoup.parse(srcCode);
        Elements articleEls = document.select("h2>a");

        for (Element el:articleEls){
            links.add(el.attr("href"));
        }
        return links;
    }

    /**
     * Loads an article, follows its comment iframe to find the article id, downloads the hot
     * comments for that id and saves each of them. Returns silently when the article has no
     * comment iframe or the comment page exposes no article id.
     *
     * @param articleHref article URL
     */
    public void parseAndSaveHotComments(String articleHref){
        String articlePage = getHtmlSrc(articleHref,encoding);
        Document document = Jsoup.parse(articlePage);
        Element iframeEl = document.getElementById("ifcomment");
        if(iframeEl == null) {
            return ;
        }
        String commentHref = iframeEl.attr("src");// URL of the comment page
        // Only protocol-relative URLs ("//host/...") need a scheme prepended.
        String commentUrl = commentHref.startsWith("//") ? "http:"+commentHref : commentHref;

        // Read the article id from the comment page.
        document = Jsoup.parse(getHtmlSrc(commentUrl,encoding));
        Element articleIdInput = document.getElementById("newsid");
        if(articleIdInput == null) {
            // The comment page could not be loaded or has an unexpected layout.
            return ;
        }
        String articleId = articleIdInput.attr("value");

        // Download and parse the hot comments.
        String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID="+articleId+"&type=hotcomment";
        String hotCommentPage = getHtmlSrc(link,encoding);
        document = Jsoup.parse(hotCommentPage);
        Elements hotCommentEls = document.select("li.entry");

        for (Element el:hotCommentEls){
            HotComment hotComment = new HotComment();
            hotComment.setCommentId(el.attr("cid"));
            hotComment.setArticleUrl(articleHref);
            hotComment.setUser(el.select("strong.nick a").text());
            hotComment.setComment(el.getElementsByTag("P").text());
            hotComment.setUp(getNumber(el.select("a.s").text()));
            hotComment.setDown(getNumber(el.select("a.a").text()));
            hotComment.setPosandtime(el.select("span.posandtime").text());
            hotComment.setMobile(el.select("span.mobile a").text());

            hotCommentMapper.addHotComment(hotComment);// persist to the database
            esService.addHotComment(hotComment);// index in Elasticsearch
            if(hotComment.getUp()>=2500){
                redisService.rankAdd("ithome:hotrank",hotComment);// rank comments with 2500+ up-votes
            }

            //System.out.println(hotComment.toString());
        }
    }

    /**
     * Parses the first number that is wrapped in parentheses.
     *
     * @param str text such as "(12)"; may be null
     * @return the number, or 0 when there is none or it does not fit in an {@code int}
     */
    public int getNumber(String str){
        if(str == null){
            return 0;
        }
        Matcher matcher = NUMBER_IN_PARENS.matcher(str);
        if(matcher.find()){
            try {
                return Integer.parseInt(matcher.group(1));
            }catch (NumberFormatException ignored){
                // Digits only, so this can only be an overflow; treat it as "no number".
                return 0;
            }
        }
        return 0;
    }

//    public static void main(String [] args){
//        new WebCrawler().start();
//    }
}

整合Elasticsearch

相关依赖:

<!-- Elasticsearch full-text search -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

application.yml配置:

spring:
  data:
  ## Elasticsearch configuration
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300

代码部分:

Repository

/**
 * Spring Data Elasticsearch repository for {@link HotComment} documents.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment,Long>{

    /**
     * Derived query on the {@code user} property.
     *
     * @param user user name to search for
     * @return the hot comments whose {@code user} matches
     */
    List<HotComment> findByUser(String user);
}

实体

/**
 * A single hot comment scraped from an article. The same object is inserted through MyBatis,
 * indexed in the "hotcomments" Elasticsearch index and stored in Redis.
 */
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")
public class HotComment implements Serializable{

    private static final long serialVersionUID = -4249699545233058684L;
    @Id
    private Long id;// record id
    private String commentId;// value of the comment element's "cid" attribute
    private String user;// author of the comment
    private String comment;// comment text
    private int up;// number of up-votes
    private int down;// number of down-votes
    private String posandtime;// location and time, kept as the scraped text
    private String mobile;// device the comment was posted from
    private String articleUrl;// URL of the article the comment belongs to

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCommentId() {
        return commentId;
    }

    public void setCommentId(String commentId) {
        this.commentId = commentId;
    }


    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public int getUp() {
        return up;
    }

    public void setUp(int up) {
        this.up = up;
    }

    public int getDown() {
        return down;
    }

    public void setDown(int down) {
        this.down = down;
    }

    public String getPosandtime() {
        return posandtime;
    }

    public void setPosandtime(String posandtime) {
        this.posandtime = posandtime;
    }

    public String getMobile() {
        return mobile;
    }

    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    public String getArticleUrl() {
        return articleUrl;
    }

    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }

    /**
     * @return all fields in {@code name='value'} form, separated by ", "
     */
    @Override
    public String toString() {
        // The separator before commentId was missing, which fused it with the id field.
        return "HotComment{" +
                "id='" + id + '\'' +
                ", commentId='" + commentId + '\'' +
                ", user='" + user + '\'' +
                ", comment='" + comment + '\'' +
                ", up=" + up +
                ", down=" + down +
                ", posandtime='" + posandtime + '\'' +
                ", mobile='" + mobile + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                '}';
    }
}

Service

/**
 * Thin service over {@link EsRepository}: indexes hot comments and looks them up by user.
 */
@Service
public class EsService {

    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment){
        esRepository.save(hotComment);
    }

    /**
     * Finds the hot comments of a user. The result is cached in the "ithome:hotcomments"
     * cache under the key {@code ithome:user:<user>}.
     *
     * @param user user name to search for
     * @return the comments returned by the repository
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user){
        List<HotComment> matches = esRepository.findByUser(user);
        return matches;
    }
}

整合Redis

相关依赖

<!-- Redis (Spring Data Redis starter) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>

相关配置

application.yml配置

spring:
  ## Redis configuration
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0

缓存相关配置

/**
 * Redis cache configuration: enables Spring's caching support and declares the key generator,
 * the cache manager and the Redis template used by the application.
 */
@Configuration
@EnableCaching
public class RedisConfig {

    /**
     * Key generator that concatenates the target class name, the method name and the string
     * form of every argument.
     *
     * @return the key generator bean
     */
    @Bean
    public KeyGenerator keyGenerator(){
        return new KeyGenerator(){

            @Override
            public Object generate(Object o, Method method, Object... objects) {
                StringBuilder sb = new StringBuilder();
                sb.append(o.getClass().getName());
                sb.append(method.getName());
                for(Object obj : objects){
                    // append(Object) writes "null" for a null argument instead of throwing.
                    sb.append(obj);
                }
                return sb.toString();
            }
        };
    }

    /**
     * Cache manager backed by Redis. Entries of the "ithome:hotcomments" cache expire after
     * 24 hours.
     *
     * @param redisTemplate template used to access Redis
     * @return the cache manager bean
     */
    @Bean
    public CacheManager cacheManager(RedisTemplate redisTemplate){
        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);
//        redisCacheManager.setDefaultExpiration(60*60*24);// default expiry in seconds
        Map<String,Long> expires = new HashMap<>();
        expires.put("ithome:hotcomments",60*60*24L);
        // Hand the per-cache expiry (seconds) to the manager; it was built but never applied.
        redisCacheManager.setExpires(expires);
        return redisCacheManager;
    }

    /**
     * String-keyed Redis template whose values are serialized as JSON by Jackson, with
     * default typing enabled on the mapper.
     *
     * @param factory connection factory provided by Spring Boot
     * @return the template bean
     */
    @Bean
    public RedisTemplate<String,String> redisTemplate(RedisConnectionFactory factory){
        StringRedisTemplate template = new StringRedisTemplate(factory);
        Jackson2JsonRedisSerializer<Object> jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer<>(Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        jackson2JsonRedisSerializer.setObjectMapper(om);
        template.setValueSerializer(jackson2JsonRedisSerializer);
        template.afterPropertiesSet();

        return template;
    }
}

代码部分:

Service

/**
 * Redis helper used by the crawler and the controller: a sorted-set ranking of hot comments
 * and a list of break points marking where the previous crawl ended.
 */
@Service
public class RedisService {

    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the sorted set, scored by its up-vote count.
     *
     * @param key sorted-set key
     * @param hotComment comment to add
     */
    public void rankAdd(String key, HotComment hotComment){
        ZSetOperations<String,HotComment> zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key,hotComment,hotComment.getUp());
    }

    /**
     * Returns the highest-scored comments of the sorted set, best first.
     *
     * @param key sorted-set key
     * @param top maximum number of comments to return
     * @return at most {@code top} comments ordered from highest to lowest score; empty when
     *         {@code top} is not positive
     */
    public Set<HotComment> rankGet(String key,int top){
        if (top <= 0){
            // An end index of -1 would select the whole set, so answer directly.
            return java.util.Collections.<HotComment>emptySet();
        }
        ZSetOperations<String,HotComment> zSetOperations = redisTemplate.opsForZSet();
        // reverseRange orders by descending score; the end index is inclusive, hence top-1.
        return zSetOperations.reverseRange(key,0,top-1);
    }

    /**
     * Appends the values to the list stored at {@code key}; used to save the position of the
     * most recent crawl. Does nothing when there are no values.
     *
     * @param key list key
     * @param values values to append; may be null or empty
     */
    public void listAdd(String key,List<String> values){
        if (values == null || values.isEmpty()){
            return;
        }
        ListOperations<String,String> listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key,values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    public void listRemove(String key){
        redisTemplate.delete(key);
    }

    /**
     * Checks whether any of the values is stored in the list at {@code key}; used to detect
     * that the crawl has reached the position saved by the previous run.
     *
     * @param key list key
     * @param values candidate values; may be null or empty
     * @return {@code true} if at least one value is in the list
     */
    public boolean containsValue(String key,List<String> values){
        if (values == null || values.isEmpty()){
            return false;
        }
        ListOperations<String,String> listOperations = redisTemplate.opsForList();
        List<String> list = listOperations.range(key,0,-1);
        if (list == null || list.isEmpty()){
            return false;
        }
        for (String val : values){
            if(list.contains(val)){
                return true;
            }
        }
        return false;
    }
}

整合Mybatis

依赖配置


        <!-- MyBatis Spring Boot starter -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Data source (Druid) -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>

数据源配置

/**
 * Database configuration: registers the MyBatis mapper package and the Druid data source.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Builds the Druid data source for the local "ithome" MySQL database.
     *
     * <p>NOTE(review): URL and credentials are hard-coded here; consider moving them to
     * external configuration.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource(){
        DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}

代码部分:

Mapper

/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key of column {@code id} is written back to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    @Options(useGeneratedKeys = true,keyProperty = "id",keyColumn = "id")
    void addHotComment(HotComment hotComment);
}

Controller

/**
 * Web endpoints under {@code /ithome}; both render the "search" view with a "comments" model
 * attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /**
     * Home page: shows the comments read from the "ithome:hotrank" ranking in Redis.
     *
     * @return the "search" view populated with the ranked comments
     */
    @GetMapping("/index")
    public ModelAndView index(){
        Set<HotComment> ranked = redisService.rankGet("ithome:hotrank",50);
        ModelAndView view = new ModelAndView("search");
        view.addObject("comments",ranked);
        return view;
    }

    /**
     * Search page: looks up hot comments by user name.
     *
     * @param keyword user name taken from the request path
     * @return the "search" view populated with the matching comments
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword){
        List<HotComment> matches = esService.findByUser(keyword);
        ModelAndView view = new ModelAndView("search");
        view.addObject("comments",matches);
        return view;
    }
}

主程序

/**
 * Spring Boot entry point. Enables the Elasticsearch repositories and scheduling support.
 */
@SpringBootApplication
@EnableElasticsearchRepositories
@EnableScheduling
public class IthomecrawlerApplication {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments passed on to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(IthomecrawlerApplication.class, args);
    }
}

完整application.yml文件

# Server port
server:
  port: 8081

spring:
  data:
  ## Elasticsearch configuration
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  ## Redis configuration
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  ## FreeMarker configuration
  freemarker:
  ## Whether request/session attributes may override model attributes
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
  ## Whether request/session attributes are exposed to the templates
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss

完整pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>comcrazy</groupId>
    <artifactId>ithomecrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ithomecrawler</name>
    <description>ITHome Crawler.</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Elasticsearch full-text search -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <!-- Redis -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <!-- MyBatis Spring Boot starter -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Data source (Druid) -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>
        <!-- jsoup HTML parser -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

        <!-- FreeMarker -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <finalName>ithomecrawler</finalName>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>


</project>

完整代码

猜你喜欢

转载自blog.csdn.net/crazylai1996/article/details/77621772