分类爬取新闻并存入 MySQL 数据库

一、数据源介绍:https://www.jianshu.com/p/c54e25349b77

1、api

网易: https://3g.163.com
新闻:/touch/reconstruct/article/list/BBM54PGAwangning/0-10.html
娱乐:/touch/reconstruct/article/list/BA10TA81wangning/0-10.html
体育:/touch/reconstruct/article/list/BA8E6OEOwangning/0-10.html
财经:/touch/reconstruct/article/list/BA8EE5GMwangning/0-10.html
军事:/touch/reconstruct/article/list/BAI67OGGwangning/0-10.html
科技:/touch/reconstruct/article/list/BA8D4A3Rwangning/0-10.html
手机:/touch/reconstruct/article/list/BAI6I0O5wangning/0-10.html
数码:/touch/reconstruct/article/list/BAI6JOD9wangning/0-10.html
时尚:/touch/reconstruct/article/list/BA8F6ICNwangning/0-10.html
游戏:/touch/reconstruct/article/list/BAI6RHDKwangning/0-10.html
教育:/touch/reconstruct/article/list/BA8FF5PRwangning/0-10.html
健康:/touch/reconstruct/article/list/BDC4QSV3wangning/0-10.html
旅游:/touch/reconstruct/article/list/BEO4GINLwangning/0-10.html
视频:/touch/nc/api/video/recommend/Video_Recom/0-10.do?callback=videoList
2、数据结构:

 {
      "liveInfo": null,
      "docid": "F9R8L9K70001899O",
      "source": "国家卫健委",
      "title": "昨日全国新增确诊42例 其中境外输入38例本土4例",
      "priority": 100,
      "hasImg": 1,
      "url": "https://3g.163.com/news/20/0410/07/F9R8L9K70001899O.html",
      "skipURL": "http://3g.163.com/ntes/special/00340EPA/wapSpecialModule.html?sid=S1578049488158",
      "specialID": "S1578049488158",
      "commentCount": 4117,
      "imgsrc3gtype": "1",
      "stitle": "S1578049488158",
      "digest": "4月9日0—24时,31个省(自治区、直辖市)和新疆生产建设",
      "skipType": "special",
      "imgsrc": "http://cms-bucket.ws.126.net/2020/0410/7c943a79p00q8jnz6009lc000s600e3c.png",
      "ptime": "2020-04-10 07:26:48"
    }

二、实现代码

1、封装发送 URL 请求并以字符串形式返回 JSON 响应的工具类

package com.me.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class HttpUtil {
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Performs an HTTP GET on the given URL and returns the response body as text.
     *
     * <p>The body is decoded as UTF-8 (instead of the platform default charset, which
     * garbles non-ASCII text on hosts whose default is not UTF-8) and its lines are
     * concatenated without line separators. A malformed URL, an I/O error, or a status
     * other than 200 yields an empty string, so callers never receive {@code null};
     * exceptions are printed to standard error.
     *
     * @param setUrl absolute HTTP(S) URL to fetch
     * @return the response body, or {@code ""} if the request did not succeed
     */
    public static String setUrl(String setUrl){
        HttpURLConnection conn = null;
        try {
            URL url = new URL(setUrl);
            conn = (HttpURLConnection)url.openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            // Without a read timeout a stalled server would block the crawler forever.
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setRequestMethod("GET");
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return "";
            }
            // try-with-resources closes the reader (and the underlying stream) on every path.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
                StringBuilder body = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                }
                return body.toString();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
        return "";
    }
}
View Code

2、封装使用 jsoup 解析新闻正文的工具类

package com.me.utils;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class JsoupNewsUtil {

    /**
     *
     * @param data
     * @return News_dg
     */
    public static String zw(String data){
        Document document = Jsoup.parse(data);
        //获取新闻的内容
        Elements content = document.getElementsByClass("content");
        return content.text().trim();
    }
}
View Code

3、新闻的实体类

package com.me.domain;

/**
 * A single news item.
 *
 * <p>Instances are created both from the list API's JSON and from rows of the
 * {@code newslist} table, so the field names and the getter/setter names must stay
 * aligned with the JSON keys and column names.
 */
public class News {
    /** Primary key of the row in {@code newslist}. */
    private int id;
    /** Priority value reported by the list API. */
    private int priority;
    /** Number of comments reported by the list API. */
    private int commentCount;
    /** Publisher of the article. */
    private String source;
    /** Headline. */
    private String title;
    /** Article URL as reported by the list API. */
    private String url;
    /** Short summary of the article. */
    private String digest;
    /** Cover image URL. */
    private String imgsrc;
    /** Publication time as text, e.g. {@code 2020-04-10 07:26:48}. */
    private String ptime;
    /** Article body text extracted from the article page. */
    private String zw;
    /** Category label assigned by the crawler. */
    private String type;

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Renders every field for debugging; {@code type} is included so the output
     * reflects the complete state of the bean.
     */
    @Override
    public String toString() {
        return "News{" +
                "id=" + id +
                ", priority=" + priority +
                ", commentCount=" + commentCount +
                ", source='" + source + '\'' +
                ", title='" + title + '\'' +
                ", url='" + url + '\'' +
                ", digest='" + digest + '\'' +
                ", imgsrc='" + imgsrc + '\'' +
                ", ptime='" + ptime + '\'' +
                ", zw='" + zw + '\'' +
                ", type='" + type + '\'' +
                '}';
    }

    public String getZw() {
        return zw;
    }

    public void setZw(String zw) {
        this.zw = zw;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getPriority() {
        return priority;
    }

    public void setPriority(int priority) {
        this.priority = priority;
    }

    public int getCommentCount() {
        return commentCount;
    }

    public void setCommentCount(int commentCount) {
        this.commentCount = commentCount;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getDigest() {
        return digest;
    }

    public void setDigest(String digest) {
        this.digest = digest;
    }

    public String getImgsrc() {
        return imgsrc;
    }

    public void setImgsrc(String imgsrc) {
        this.imgsrc = imgsrc;
    }

    public String getPtime() {
        return ptime;
    }

    public void setPtime(String ptime) {
        this.ptime = ptime;
    }
}
View Code
package com.me.domain;

import java.util.List;

/**
 * Holder for the payload returned by the article-list API.
 *
 * <p>Each response is a JSON object with a single key, the channel id, whose value is
 * the array of articles for that channel. The field names below therefore have to match
 * the channel ids exactly; only the field belonging to the requested channel is
 * populated for a given response.
 */
public class NewsListData {
    /** Channel "news". */
    private List<News> BBM54PGAwangning;
    /** Channel "entertainment". */
    private List<News> BA10TA81wangning;
    /** Channel "sports". */
    private List<News> BA8E6OEOwangning;
    /** Channel "finance". */
    private List<News> BA8EE5GMwangning;
    /** Channel "military". */
    private List<News> BAI67OGGwangning;
    /** Channel "technology". */
    private List<News> BA8D4A3Rwangning;
    /** Channel "mobile phones". */
    private List<News> BAI6I0O5wangning;
    /** Channel "digital". */
    private List<News> BAI6JOD9wangning;
    /** Channel "fashion". */
    private List<News> BA8F6ICNwangning;
    /** Channel "games". */
    private List<News> BAI6RHDKwangning;
    /** Channel "education". */
    private List<News> BA8FF5PRwangning;
    /** Channel "health". */
    private List<News> BDC4QSV3wangning;
    /** Channel "travel". */
    private List<News> BEO4GINLwangning;

    // Accessors follow the field declaration order.

    public List<News> getBBM54PGAwangning() {
        return this.BBM54PGAwangning;
    }

    public void setBBM54PGAwangning(List<News> items) {
        this.BBM54PGAwangning = items;
    }

    public List<News> getBA10TA81wangning() {
        return this.BA10TA81wangning;
    }

    public void setBA10TA81wangning(List<News> items) {
        this.BA10TA81wangning = items;
    }

    public List<News> getBA8E6OEOwangning() {
        return this.BA8E6OEOwangning;
    }

    public void setBA8E6OEOwangning(List<News> items) {
        this.BA8E6OEOwangning = items;
    }

    public List<News> getBA8EE5GMwangning() {
        return this.BA8EE5GMwangning;
    }

    public void setBA8EE5GMwangning(List<News> items) {
        this.BA8EE5GMwangning = items;
    }

    public List<News> getBAI67OGGwangning() {
        return this.BAI67OGGwangning;
    }

    public void setBAI67OGGwangning(List<News> items) {
        this.BAI67OGGwangning = items;
    }

    public List<News> getBA8D4A3Rwangning() {
        return this.BA8D4A3Rwangning;
    }

    public void setBA8D4A3Rwangning(List<News> items) {
        this.BA8D4A3Rwangning = items;
    }

    public List<News> getBAI6I0O5wangning() {
        return this.BAI6I0O5wangning;
    }

    public void setBAI6I0O5wangning(List<News> items) {
        this.BAI6I0O5wangning = items;
    }

    public List<News> getBAI6JOD9wangning() {
        return this.BAI6JOD9wangning;
    }

    public void setBAI6JOD9wangning(List<News> items) {
        this.BAI6JOD9wangning = items;
    }

    public List<News> getBA8F6ICNwangning() {
        return this.BA8F6ICNwangning;
    }

    public void setBA8F6ICNwangning(List<News> items) {
        this.BA8F6ICNwangning = items;
    }

    public List<News> getBAI6RHDKwangning() {
        return this.BAI6RHDKwangning;
    }

    public void setBAI6RHDKwangning(List<News> items) {
        this.BAI6RHDKwangning = items;
    }

    public List<News> getBA8FF5PRwangning() {
        return this.BA8FF5PRwangning;
    }

    public void setBA8FF5PRwangning(List<News> items) {
        this.BA8FF5PRwangning = items;
    }

    public List<News> getBDC4QSV3wangning() {
        return this.BDC4QSV3wangning;
    }

    public void setBDC4QSV3wangning(List<News> items) {
        this.BDC4QSV3wangning = items;
    }

    public List<News> getBEO4GINLwangning() {
        return this.BEO4GINLwangning;
    }

    public void setBEO4GINLwangning(List<News> items) {
        this.BEO4GINLwangning = items;
    }
}
View Code

4、dao层

package com.me.dao;

import com.me.domain.News;
import com.me.utils.DBUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import java.sql.SQLException;
import java.util.List;

/**
 * Data-access object for the {@code newslist} table.
 *
 * <p>Every method obtains a {@link QueryRunner} bound to the data source supplied by
 * {@code DBUtils} and passes all values as bind parameters.
 */
public class NewsDao {

    /** Creates a runner on the shared data source. */
    private QueryRunner runner() {
        return new QueryRunner(DBUtils.getDataSource());
    }

    /**
     * Deletes every row of {@code newslist}.
     *
     * @return {@code true} if at least one row was deleted
     * @throws SQLException if the statement fails
     */
    public boolean deleteAll() throws SQLException {
        return runner().update("delete from newslist") > 0;
    }

    /**
     * Stores the article body text for one row.
     *
     * @param zw article body text
     * @param id primary key of the row to update
     * @return {@code true} if a row was updated
     * @throws SQLException if the statement fails
     */
    public boolean zw(String zw,int id) throws SQLException {
        String sql = "update newslist set zw = ? where id = ?";
        return runner().update(sql, zw, id) > 0;
    }

    /**
     * Lists the rows whose {@code url} is usable for fetching the article body.
     *
     * <p>A row is skipped when its url is NULL, empty, or the hard-coded identifier
     * {@code 17KK0006|2145432} (presumably a photo-set id rather than a fetchable URL;
     * confirm against the data). The conditions use {@code IS NOT NULL} and {@code AND}:
     * in SQL {@code url != null} is never true, and joining the exclusions with
     * {@code OR} lets every non-null url through.
     *
     * @return the matching rows, possibly empty
     * @throws SQLException if the query fails
     */
    public List<News> newsList() throws SQLException {
        String sql = "select * from newslist"
                + " where url is not null and url != '17KK0006|2145432' and url != ?";
        return runner().query(sql, new BeanListHandler<News>(News.class), "");
    }

    /**
     * Returns up to five rows whose title contains the given keyword and whose url is
     * neither NULL nor empty.
     *
     * <p>The keyword is bound as a parameter, so the {@code LIKE} pattern must not carry
     * its own quote characters; wrapping it in {@code '...'} would only match titles
     * that literally contain apostrophes. {@code %} and {@code _} inside the keyword
     * are not escaped and keep their wildcard meaning.
     *
     * @param world keyword to look for in the title
     * @return at most five matching rows, possibly empty
     * @throws SQLException if the query fails
     */
    public List<News> search(String world) throws SQLException {
        String sql = "select * from newslist"
                + " where url is not null and url != ? and title like ? limit 0 , 5";
        return runner().query(sql, new BeanListHandler<News>(News.class), "", "%" + world + "%");
    }

    /**
     * Inserts one news item; {@code id} and {@code zw} are not written here.
     *
     * @param news item to insert
     * @return {@code true} if a row was inserted
     * @throws SQLException if the statement fails
     */
    public boolean add(News news) throws SQLException {
        String sql = "insert into newslist (source,title,priority,url,commentCount,digest,imgsrc,ptime,type) " +
                "values(?,?,?,?,?,?,?,?,?)";
        int inserted = runner().update(sql,
                news.getSource(), news.getTitle(), news.getPriority(), news.getUrl(),
                news.getCommentCount(), news.getDigest(), news.getImgsrc(), news.getPtime(),
                news.getType());
        return inserted > 0;
    }

}
View Code

5、service层

package com.me.service;

import com.google.gson.Gson;
import com.me.dao.NewsDao;
import com.me.domain.News;
import com.me.domain.NewsListData;
import com.me.utils.HttpUtil;
import com.me.utils.JsoupNewsUtil;

import java.sql.SQLException;
import java.util.List;

/**
 * Crawler service: downloads the article lists of each channel, stores them through
 * {@link NewsDao}, and then fills in the body text of every stored article.
 *
 * <p>The list API answers with a JSONP envelope ({@code callbackName( ...json... )}).
 * The public {@code addXX} methods each handle one channel and share the private
 * helpers {@link #fetch(String)} and {@link #save(List, String)}.
 */
public class NewsList {
    NewsDao dao = new NewsDao();

    /**
     * Clears the table, crawls the first 20 entries of every channel, then downloads the
     * body text of each stored article.
     */
    public static void main(String[] args) throws SQLException {
        NewsList newsList = new NewsList();
        newsList.deleteAll();
        newsList.addXW("https://3g.163.com/touch/reconstruct/article/list/BBM54PGAwangning/0-20.html");
        newsList.addYL("https://3g.163.com/touch/reconstruct/article/list/BA10TA81wangning/0-20.html");
        newsList.addTY("https://3g.163.com/touch/reconstruct/article/list/BA8E6OEOwangning/0-20.html");
        newsList.addCJ("https://3g.163.com/touch/reconstruct/article/list/BA8EE5GMwangning/0-20.html");
        newsList.addJS("https://3g.163.com/touch/reconstruct/article/list/BAI67OGGwangning/0-20.html");
        newsList.addKJ("https://3g.163.com/touch/reconstruct/article/list/BA8D4A3Rwangning/0-20.html");
        newsList.addSJ("https://3g.163.com/touch/reconstruct/article/list/BAI6I0O5wangning/0-20.html");
        newsList.addSM("https://3g.163.com/touch/reconstruct/article/list/BAI6JOD9wangning/0-20.html");
        newsList.addSS("https://3g.163.com/touch/reconstruct/article/list/BA8F6ICNwangning/0-20.html");
        newsList.addYX("https://3g.163.com/touch/reconstruct/article/list/BAI6RHDKwangning/0-20.html");
        newsList.addJY("https://3g.163.com/touch/reconstruct/article/list/BA8FF5PRwangning/0-20.html");
        newsList.addJK("https://3g.163.com/touch/reconstruct/article/list/BDC4QSV3wangning/0-20.html");
        newsList.addLY("https://3g.163.com/touch/reconstruct/article/list/BEO4GINLwangning/0-20.html");
        newsList.zw();
    }

    /**
     * Downloads a list response and converts it into a {@link NewsListData}.
     *
     * <p>The JSON is taken from between the first {@code '('} and the last {@code ')'}
     * of the response, which strips the JSONP envelope without relying on the length of
     * the callback name. A failed download (empty response) or a response without an
     * envelope yields an empty holder instead of an exception.
     *
     * @param url list API URL
     * @return the parsed payload, never {@code null}
     */
    private NewsListData fetch(String url) {
        String data = HttpUtil.setUrl(url);
        int open = data.indexOf('(');
        int close = data.lastIndexOf(')');
        if (open < 0 || close <= open) {
            return new NewsListData();
        }
        NewsListData parsed = new Gson().fromJson(data.substring(open + 1, close), NewsListData.class);
        return parsed != null ? parsed : new NewsListData();
    }

    /**
     * Tags every item with the category label and inserts it.
     *
     * @param items articles of one channel; {@code null} when the response did not
     *              contain that channel, in which case nothing is stored
     * @param type  category label written to the {@code type} column
     * @throws SQLException if an insert fails
     */
    private void save(List<News> items, String type) throws SQLException {
        if (items == null) {
            return;
        }
        for (News item : items) {
            if (item == null) {
                continue;
            }
            item.setType(type);
            dao.add(item);
        }
    }

    /**
     * Crawls the "news" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addXW(String url) throws SQLException {
        save(fetch(url).getBBM54PGAwangning(), "新闻");
    }

    /**
     * Crawls the "entertainment" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addYL(String url) throws SQLException {
        save(fetch(url).getBA10TA81wangning(), "娱乐");
    }

    /**
     * Crawls the "sports" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addTY(String url) throws SQLException {
        save(fetch(url).getBA8E6OEOwangning(), "体育");
    }

    /**
     * Crawls the "finance" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addCJ(String url) throws SQLException {
        save(fetch(url).getBA8EE5GMwangning(), "财经");
    }

    /**
     * Crawls the "military" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addJS(String url) throws SQLException {
        save(fetch(url).getBAI67OGGwangning(), "军事");
    }

    /**
     * Crawls the "technology" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addKJ(String url) throws SQLException {
        save(fetch(url).getBA8D4A3Rwangning(), "科技");
    }

    /**
     * Crawls the "mobile phones" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addSJ(String url) throws SQLException {
        save(fetch(url).getBAI6I0O5wangning(), "手机");
    }

    /**
     * Crawls the "digital" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addSM(String url) throws SQLException {
        save(fetch(url).getBAI6JOD9wangning(), "数码");
    }

    /**
     * Crawls the "fashion" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addSS(String url) throws SQLException {
        save(fetch(url).getBA8F6ICNwangning(), "时尚");
    }

    /**
     * Crawls the "games" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addYX(String url) throws SQLException {
        save(fetch(url).getBAI6RHDKwangning(), "游戏");
    }

    /**
     * Crawls the "education" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addJY(String url) throws SQLException {
        save(fetch(url).getBA8FF5PRwangning(), "教育");
    }

    /**
     * Crawls the "health" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addJK(String url) throws SQLException {
        save(fetch(url).getBDC4QSV3wangning(), "健康");
    }

    /**
     * Crawls the "travel" channel.
     * @param url list API URL of the channel
     * @throws SQLException if an insert fails
     */
    public void addLY(String url) throws SQLException {
        save(fetch(url).getBEO4GINLwangning(), "旅游");
    }

    /**
     * Downloads the page of every article returned by {@link NewsDao#newsList()},
     * extracts its body text, and writes the text back to the article's row.
     *
     * @throws SQLException if reading the list or an update fails
     */
    public void zw() throws SQLException {
        for (News n : dao.newsList()) {
            String html = HttpUtil.setUrl(n.getUrl());
            dao.zw(JsoupNewsUtil.zw(html), n.getId());
        }
    }

    /**
     * Empties the news table.
     * @throws SQLException if the delete fails
     */
    public void deleteAll() throws SQLException {
        dao.deleteAll();
    }

    /**
     * Manual check: parses a sample photo-set entry (whose {@code url} is an identifier,
     * not a web address) into a {@link News} and prints it.
     */
    public void test (){
        Gson gson = new Gson();
        News news = gson.fromJson("{\n" +
                "      \"imgextra\": [\n" +
                "        {\n" +
                "          \"imgsrc\": \"http://cms-bucket.ws.126.net/2020/0410/4ef9af5aj00q8jdsh00bpc000sg00sgc.jpg\"\n" +
                "        },\n" +
                "        {\n" +
                "          \"imgsrc\": \"http://cms-bucket.ws.126.net/2020/0410/6f7bd38bj00q8jdsh003xc000sg00dic.jpg\"\n" +
                "        }\n" +
                "      ],\n" +
                "      \"liveInfo\": null,\n" +
                "      \"docid\": \"17KK0006|2145433\",\n" +
                "      \"source\": \"极客鲜疯队\",\n" +
                "      \"title\": \"宅家赏美丽高原 一生中值得一看的美景\",\n" +
                "      \"priority\": 150,\n" +
                "      \"url\": \"17KK0006|2145433\",\n" +
                "      \"skipURL\": \"http://3g.163.com/touch/photoview.html?channelid=0006&setid=2145433\",\n" +
                "      \"commentCount\": 15,\n" +
                "      \"imgsrc3gtype\": \"2\",\n" +
                "      \"stitle\": \"17KK0006|2145433\",\n" +
                "      \"digest\": \"\",\n" +
                "      \"skipType\": \"photoset\",\n" +
                "      \"photosetID\": \"0006|2145433\",\n" +
                "      \"imgsrc\": \"http://cms-bucket.ws.126.net/2020/0410/1bd79be9j00q8jdsh008tc000sg00izc.jpg\",\n" +
                "      \"ptime\": \"2020-04-10 03:51:04\",\n" +
                "      \"modelmode\": \"u\"\n" +
                "    }", News.class);
        System.out.println(news.toString());

    }
}
View Code

三、结果

猜你喜欢

转载自www.cnblogs.com/20183544-wangzhengshuai/p/12683975.html