爬取主程序
使用Jsoup解析网页源代码
/**
 * Multi-threaded crawler for the ithome.com article index.
 *
 * <p>Each worker thread repeatedly claims the next index page number, extracts the
 * article links from it with Jsoup, and stores the hot comments of every article in
 * MySQL (via {@code HotCommentMapper}), Elasticsearch (via {@code EsService}) and,
 * for highly up-voted comments, a Redis ranking (via {@code RedisService}).
 *
 * <p>The crawl state ({@code done}, the page counter and the breakpoints) is static
 * and therefore shared by every instance and every worker thread.
 */
@Component
public class WebCrawler {
    /** Character set used to decode every fetched page. */
    private static final String DEFAULT_ENCODING = "utf-8";
    /** Number of worker threads started by {@link #start()}. */
    private static final int THREAD_NUM = 15;
    /** Connect and read timeout so a stalled server cannot hang a worker forever. */
    private static final int TIMEOUT_MILLIS = 10_000;
    /** Redis list holding the article links of page 1 from the most recent crawl. */
    private static final String BREAKPOINT_KEY = "ithome:breakpoints";
    /** Redis sorted set holding the highly up-voted comments. */
    private static final String HOT_RANK_KEY = "ithome:hotrank";
    /** Minimum number of up votes for a comment to enter the Redis ranking. */
    private static final int HOT_RANK_MIN_UP = 2500;
    /** Matches the text between the first pair of parentheses; compiled once. */
    private static final Pattern PAREN_CONTENT = Pattern.compile("(?<=\\()(.+?)(?=\\))");

    @Autowired
    private HotCommentMapper hotCommentMapper;
    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /** Stop flag polled by the workers; volatile so a write is seen by all threads. */
    private static volatile boolean done = false;
    /** Last page number handed out to a worker. */
    private static final AtomicInteger page = new AtomicInteger(0);
    /** Links found on page 1 of the current crawl; persisted by {@link #stop()}. */
    private static volatile List<String> breakpoints;

    /**
     * Starts one crawl run (the original note describes it as a periodic update).
     *
     * <p>Resets the stop flag and the page counter, then launches {@code THREAD_NUM}
     * worker threads that crawl consecutive pages until {@link #stop()} is called.
     * The method returns immediately; it does not wait for the workers.
     */
    public void start() {
        done = false;
        // Restart from page 1; otherwise a second run would continue from the stale
        // counter and never record fresh breakpoints.
        page.set(0);
        System.out.println("开始爬取:" + System.currentTimeMillis());
        for (int i = 0; i < THREAD_NUM; ++i) {
            new Thread(() -> {
                while (!done) {
                    crawl(page.incrementAndGet());
                }
                System.out.println(Thread.currentThread().getName() + ":结束:" + System.currentTimeMillis());
            }, "Thread--" + i).start();
        }
    }

    /**
     * Signals every worker to finish and persists the breakpoints of this run.
     *
     * <p>The stored breakpoints are only replaced when page 1 was actually recorded;
     * otherwise the previous list in Redis is left untouched instead of being
     * deleted and replaced by {@code null}.
     */
    public synchronized void stop() {
        done = true;
        List<String> latest = breakpoints;
        if (latest == null || latest.isEmpty()) {
            return;
        }
        redisService.listRemove(BREAKPOINT_KEY);
        redisService.listAdd(BREAKPOINT_KEY, latest);
    }

    /**
     * Crawls one index page and stores the hot comments of every article on it.
     *
     * <p>The crawl is stopped when the page has no article links or when any link is
     * already contained in the stored breakpoints.
     * NOTE(review): the breakpoint check runs before page 1 is recorded, so a page 1
     * that mixes new and already-seen articles stops the run without crawling the
     * new ones — confirm this is intended.
     *
     * @param page page number of the index listing
     */
    public void crawl(int page) {
        String url = "https://www.ithome.com/ithome/getajaxdata.aspx?"
                + "page=" + page + "&type=indexpage&randnum=" + Math.random();
        List<String> links = getArticleLinks(getHtmlSrc(url, DEFAULT_ENCODING));
        if (links.isEmpty() || redisService.containsValue(BREAKPOINT_KEY, links)) {
            stop();
            return;
        }
        if (page == 1) {
            breakpoints = links;
        }
        for (String link : links) {
            parseAndSaveHotComments(link);
        }
    }

    /**
     * Downloads a page and returns its text with the line breaks removed.
     *
     * <p>Any failure is printed and whatever was read so far (possibly the empty
     * string) is returned, so callers never receive {@code null}.
     *
     * @param url      address to fetch
     * @param encoding character set used to decode the response
     * @return the page source, or an empty/partial string on failure
     */
    public String getHtmlSrc(String url, String encoding) {
        StringBuilder src = new StringBuilder();
        try {
            URLConnection urlConn = new URL(url).openConnection();
            urlConn.setConnectTimeout(TIMEOUT_MILLIS);
            urlConn.setReadTimeout(TIMEOUT_MILLIS);
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConn.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    src.append(line);
                }
            }
        } catch (Exception e) {
            // Best effort: a failed page must not abort the whole crawl.
            e.printStackTrace();
        }
        return src.toString();
    }

    /**
     * Extracts the article links from the source of an index page.
     *
     * @param srcCode HTML source of the index page
     * @return the {@code href} of every {@code h2>a} element, in document order
     */
    public List<String> getArticleLinks(String srcCode) {
        List<String> links = new ArrayList<String>();
        Document document = Jsoup.parse(srcCode);
        for (Element el : document.select("h2>a")) {
            links.add(el.attr("href"));
        }
        return links;
    }

    /**
     * Parses the hot comments of one article with Jsoup and stores them.
     *
     * <p>The article page is fetched, its {@code ifcomment} iframe is followed to
     * read the {@code newsid} value, and the hot-comment endpoint is queried with
     * that id. Articles without the iframe or without a news id are skipped.
     *
     * @param articleHref link of the article
     */
    public void parseAndSaveHotComments(String articleHref) {
        Document document = Jsoup.parse(getHtmlSrc(articleHref, DEFAULT_ENCODING));
        Element iframeEl = document.getElementById("ifcomment");
        if (iframeEl == null) {
            return;
        }
        String commentHref = iframeEl.attr("src");
        if (commentHref.isEmpty()) {
            return;
        }
        // Only add a scheme when the iframe source does not already carry one.
        String commentUrl = commentHref.startsWith("http") ? commentHref : "http:" + commentHref;
        document = Jsoup.parse(getHtmlSrc(commentUrl, DEFAULT_ENCODING));
        Element articleIdInput = document.getElementById("newsid");
        if (articleIdInput == null) {
            // The comment frame could not be fetched or has an unexpected layout.
            return;
        }
        String articleId = articleIdInput.attr("value");
        String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID=" + articleId + "&type=hotcomment";
        document = Jsoup.parse(getHtmlSrc(link, DEFAULT_ENCODING));
        for (Element el : document.select("li.entry")) {
            HotComment hotComment = new HotComment();
            hotComment.setCommentId(el.attr("cid"));
            hotComment.setArticleUrl(articleHref);
            hotComment.setUser(el.select("strong.nick a").text());
            hotComment.setComment(el.getElementsByTag("P").text());
            hotComment.setUp(getNumber(el.select("a.s").text()));
            hotComment.setDown(getNumber(el.select("a.a").text()));
            hotComment.setPosandtime(el.select("span.posandtime").text());
            hotComment.setMobile(el.select("span.mobile a").text());
            hotCommentMapper.addHotComment(hotComment);
            esService.addHotComment(hotComment);
            if (hotComment.getUp() >= HOT_RANK_MIN_UP) {
                redisService.rankAdd(HOT_RANK_KEY, hotComment);
            }
        }
    }

    /**
     * Parses the integer enclosed in the first pair of parentheses.
     *
     * @param str text such as {@code "label(123)"}; may be {@code null}
     * @return the parsed number, or 0 when there are no parentheses or their
     *         content is not a valid integer
     */
    public int getNumber(String str) {
        if (str == null) {
            return 0;
        }
        Matcher matcher = PAREN_CONTENT.matcher(str);
        if (!matcher.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(matcher.group());
        } catch (NumberFormatException e) {
            // Non-numeric content is treated the same as a missing counter.
            return 0;
        }
    }
}
整合Elasticsearch
相关依赖:
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
application.yml配置:
spring:
  data:
    ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
代码部分:
Repository
/**
 * Spring Data Elasticsearch repository for {@link HotComment} documents keyed by
 * their {@code Long} id.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment, Long> {

    /**
     * Derived query that looks up comments by their {@code user} property.
     *
     * @param user value to match against the {@code user} field
     * @return the matching comments
     */
    List<HotComment> findByUser(String user);
}
实体
/**
 * A single hot comment of an article.
 *
 * <p>The same bean is indexed in Elasticsearch (index {@code hotcomments}, type
 * {@code hotcomment}), inserted into the database and stored in Redis, which is why
 * it is {@link Serializable} and exposes plain getters and setters.
 */
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")
public class HotComment implements Serializable {
    private static final long serialVersionUID = -4249699545233058684L;

    /** Identifier of the record; also the Elasticsearch document id. */
    @Id
    private Long id;
    /** Comment identifier as published by the source site. */
    private String commentId;
    /** Display name of the comment author. */
    private String user;
    /** Text of the comment. */
    private String comment;
    /** Number of up votes. */
    private int up;
    /** Number of down votes. */
    private int down;
    /** Raw "position and time" text attached to the comment. */
    private String posandtime;
    /** Raw device text attached to the comment. */
    private String mobile;
    /** Link of the article the comment belongs to. */
    private String articleUrl;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCommentId() {
        return commentId;
    }

    public void setCommentId(String commentId) {
        this.commentId = commentId;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public int getUp() {
        return up;
    }

    public void setUp(int up) {
        this.up = up;
    }

    public int getDown() {
        return down;
    }

    public void setDown(int down) {
        this.down = down;
    }

    public String getPosandtime() {
        return posandtime;
    }

    public void setPosandtime(String posandtime) {
        this.posandtime = posandtime;
    }

    public String getMobile() {
        return mobile;
    }

    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    public String getArticleUrl() {
        return articleUrl;
    }

    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }

    /**
     * Returns every field as {@code name='value'} pairs separated by {@code ", "}.
     * The separator after {@code id} was previously missing, which glued the id and
     * the comment id together.
     */
    @Override
    public String toString() {
        return "HotComment{" +
                "id='" + id + '\'' +
                ", commentId='" + commentId + '\'' +
                ", user='" + user + '\'' +
                ", comment='" + comment + '\'' +
                ", up=" + up +
                ", down=" + down +
                ", posandtime='" + posandtime + '\'' +
                ", mobile='" + mobile + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                '}';
    }
}
Service
/**
 * Thin service around {@link EsRepository} for indexing and searching hot comments.
 */
@Service
public class EsService {

    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the given comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment) {
        esRepository.save(hotComment);
    }

    /**
     * Finds the comments written by a user.
     *
     * <p>The result is cached in the {@code ithome:hotcomments} cache under the key
     * {@code ithome:user:<user>}.
     *
     * @param user author name to search for
     * @return the comments returned by the repository for that user
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user) {
        List<HotComment> matches = esRepository.findByUser(user);
        return matches;
    }
}
整合Redis
相关依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
相关配置
application.yml配置
spring:
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
缓存相关配置
/**
 * Redis-backed caching configuration: key generator, cache manager and the
 * template used to talk to Redis.
 */
@Configuration
@EnableCaching
public class RedisConfig {

    /**
     * Builds cache keys by concatenating the target class name, the method name and
     * the string form of every argument.
     *
     * @return the key generator bean
     */
    @Bean
    public KeyGenerator keyGenerator() {
        return new KeyGenerator() {
            @Override
            public Object generate(Object o, Method method, Object... objects) {
                StringBuilder sb = new StringBuilder();
                sb.append(o.getClass().getName());
                sb.append(method.getName());
                for (Object obj : objects) {
                    // append(Object) renders null as "null" instead of throwing.
                    sb.append(obj);
                }
                return sb.toString();
            }
        };
    }

    /**
     * Creates the Redis cache manager and gives the {@code ithome:hotcomments}
     * cache a time-to-live of 24 hours.
     *
     * @param redisTemplate template used by the cache manager
     * @return the cache manager bean
     */
    @Bean
    public CacheManager cacheManager(RedisTemplate redisTemplate) {
        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);
        Map<String, Long> expires = new HashMap<>();
        expires.put("ithome:hotcomments", 60 * 60 * 24L);
        // The map was previously built but never handed to the manager, so cached
        // search results never expired.
        redisCacheManager.setExpires(expires);
        return redisCacheManager;
    }

    /**
     * Creates a string-keyed template whose values are serialized as JSON.
     *
     * <p>NOTE(review): default typing embeds class names in the JSON so values can
     * be read back as their original types; this is unsafe if untrusted parties can
     * write to the Redis instance.
     *
     * @param factory connection factory
     * @return the template bean
     */
    @Bean
    public RedisTemplate<String, String> redisTemplate(RedisConnectionFactory factory) {
        StringRedisTemplate template = new StringRedisTemplate(factory);
        Jackson2JsonRedisSerializer<Object> jsonSerializer =
                new Jackson2JsonRedisSerializer<>(Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        jsonSerializer.setObjectMapper(om);
        template.setValueSerializer(jsonSerializer);
        template.afterPropertiesSet();
        return template;
    }
}
代码部分:
Service
/**
 * Redis helper for the hot-comment ranking (sorted set) and for the crawl
 * breakpoints (list).
 */
@Service
public class RedisService {

    // Raw type on purpose: the same template stores both String and HotComment values.
    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the ranking, scored by its number of up votes.
     *
     * @param key        sorted-set key
     * @param hotComment comment to add
     */
    public void rankAdd(String key, HotComment hotComment) {
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key, hotComment, hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored comments of the ranking.
     *
     * <p>Uses a descending range limited to exactly {@code top} entries; the
     * previous ascending {@code range(0, top)} returned the lowest-scored
     * {@code top + 1} entries.
     *
     * @param key sorted-set key
     * @param top number of records to return
     * @return the best-scored comments first; empty when {@code top <= 0}
     */
    public Set<HotComment> rankGet(String key, int top) {
        if (top <= 0) {
            return java.util.Collections.<HotComment>emptySet();
        }
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        return zSetOperations.reverseRange(key, 0, top - 1);
    }

    /**
     * Stores the position markers of the most recent crawl by appending them to a
     * list. A {@code null} or empty list is ignored, because pushing no values is
     * not a valid Redis command.
     *
     * @param key    list key
     * @param values values to append
     */
    public void listAdd(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key, values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    public void listRemove(String key) {
        redisTemplate.delete(key);
    }

    /**
     * Tells whether the crawl reached its end position, i.e. whether any of the
     * given values is already stored in the list.
     *
     * @param key    list key
     * @param values candidates to look for
     * @return {@code true} when at least one value is in the stored list
     */
    public boolean containsValue(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return false;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        List<String> stored = listOperations.range(key, 0, -1);
        if (stored == null || stored.isEmpty()) {
            return false;
        }
        for (String val : values) {
            if (stored.contains(val)) {
                return true;
            }
        }
        return false;
    }
}
整合Mybatis
依赖配置
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.24</version>
</dependency>
数据源配置
/**
 * Declares the Druid data source and enables MyBatis mapper scanning for
 * {@code com.crazy.ithomecrawler.mybatis.mapper}.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Builds the data source bean.
     *
     * <p>NOTE(review): driver, URL and credentials are hard-coded here rather than
     * read from configuration.
     *
     * @return a configured {@code DruidDataSource}
     */
    @Bean
    public DataSource druidDataSource() {
        DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}
代码部分:
Mapper
/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key is written back to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "id")
    void addHotComment(HotComment hotComment);
}
Controller
/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view with a
 * {@code comments} model attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /**
     * Home page: shows the comments read from the {@code ithome:hotrank} ranking.
     *
     * @return the {@code search} view populated with the ranking
     */
    @GetMapping("/index")
    public ModelAndView index() {
        ModelAndView view = new ModelAndView("search");
        Set<HotComment> ranked = redisService.rankGet("ithome:hotrank", 50);
        return view.addObject("comments", ranked);
    }

    /**
     * Search page: shows the comments whose user matches the keyword.
     *
     * @param keyword user name taken from the URL path
     * @return the {@code search} view populated with the matches
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword) {
        ModelAndView view = new ModelAndView("search");
        List<HotComment> matches = esService.findByUser(keyword);
        return view.addObject("comments", matches);
    }
}
主程序
/**
 * Spring Boot entry point; also enables Elasticsearch repositories and scheduling.
 */
@SpringBootApplication
@EnableElasticsearchRepositories
@EnableScheduling
public class IthomecrawlerApplication {

    /**
     * Boots the application context.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        Class<?> primarySource = IthomecrawlerApplication.class;
        SpringApplication.run(primarySource, args);
    }
}
完整application.yml文件
server:
  port: 8081
spring:
  data:
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  freemarker:
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss
完整pom.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>comcrazy</groupId>
<artifactId>ithomecrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>ithomecrawler</name>
<description>ITHome Crawler.</description>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.6.RELEASE</version>
<relativePath/>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.24</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-freemarker</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<finalName>ithomecrawler</finalName>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
完整代码