BeautifulSoup--爬取CSDN个人数据详情(Py)(Java处理数据)

0x01.描述

  • CSDN自带数据分析功能,但功能毕竟有限。我们可以根据自己的需求定制一些数据分析,而第一步就是获取数据。可获取的数据种类有很多,这里以左侧小栏目中的数据为例,使用BeautifulSoup进行爬取。

在这里插入图片描述

0x02.Py思路

  • 这里的思路是获取这两个div下的所有dl的title值,也就是我们需要的数据。
    在这里插入图片描述

0x03.Py脚本

  • 可以根据需要,每隔一段时间爬取一次。
  • 同时写入了此次爬取的时间。
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the profile page once; the response is closed as soon as it is read.
with urlopen(r"https://blog.csdn.net/ATFWUS") as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, features="html.parser")

# Timestamp of this crawl; it becomes the first line of the output file.
ltime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

# The with-block closes the file even when one of the lookups below raises
# (for example when the page layout changes and a find() returns None).
with open("D://DeskTop/csdn.txt", 'w+', encoding='utf-8') as f:
    f.write(ltime + "\n")
    print(ltime)

    # Text of the first span with class "name" and the first span with
    # class "personal-home-page".
    span1 = soup.find("span", attrs={"class": "name"}).get_text()
    span2 = soup.find("span", attrs={"class": "personal-home-page"}).get_text()
    print(span1.strip())
    print(span2.strip())
    f.write(span1.strip() + "\n")
    f.write(span2.strip() + "\n")

    # The page <title> is stored as the introduction line.
    title = soup.title.string
    f.write(title.strip() + "\n")

    # Each matching div contributes the title attribute of its first five <dl> children.
    tags = soup.find_all("div", attrs={"class": "data-info d-flex item-tiling"})
    for tag in tags:
        dls = tag.select('dl')  # query once per div instead of once per item
        for i in range(5):
            data = dls[i].get('title')
            print(data)
            f.write(data + "\n")



0x04.Java对数据进行处理

实体类:

import java.util.Date;

/**
 * Mutable bean holding one snapshot of the CSDN profile data.
 *
 * <p>Every property has a plain getter and setter; no validation is performed
 * and all fields are {@code null} until set.
 */
public class CSDN_datas {
    /** Time at which this data was updated. */
    private Date date;
    /** Nickname. */
    private String name;
    /** Code age (rendered as a string such as a number followed by "年"). */
    private String codeAge;
    /** Personal introduction. */
    private String introduction;
    /** Number of original articles. */
    private Integer yc;
    /** Number of fans. */
    private Integer fans;
    /** Number of likes received. */
    private Integer zan;
    /** Number of comments. */
    private Integer content;
    /** Total number of visitors. */
    private Integer cust;
    /** Points. */
    private Integer jf;
    /** Number of times articles were bookmarked. */
    private Integer sc;
    /** Weekly rank. */
    private Integer WeekGrade;
    /** Overall rank. */
    private Integer TotalGrade;
    /** Level. */
    private Integer Grade;

    /** @return the time of this data update */
    public Date getDate() {
        return this.date;
    }

    /** @param date the time of this data update */
    public void setDate(Date date) {
        this.date = date;
    }

    /** @return the nickname */
    public String getName() {
        return this.name;
    }

    /** @param name the nickname */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the code age text */
    public String getCodeAge() {
        return this.codeAge;
    }

    /** @param codeAge the code age text */
    public void setCodeAge(String codeAge) {
        this.codeAge = codeAge;
    }

    /** @return the personal introduction */
    public String getIntroduction() {
        return this.introduction;
    }

    /** @param introduction the personal introduction */
    public void setIntroduction(String introduction) {
        this.introduction = introduction;
    }

    /** @return the number of original articles */
    public Integer getYc() {
        return this.yc;
    }

    /** @param yc the number of original articles */
    public void setYc(Integer yc) {
        this.yc = yc;
    }

    /** @return the number of fans */
    public Integer getFans() {
        return this.fans;
    }

    /** @param fans the number of fans */
    public void setFans(Integer fans) {
        this.fans = fans;
    }

    /** @return the number of likes received */
    public Integer getZan() {
        return this.zan;
    }

    /** @param zan the number of likes received */
    public void setZan(Integer zan) {
        this.zan = zan;
    }

    /** @return the number of comments */
    public Integer getContent() {
        return this.content;
    }

    /** @param content the number of comments */
    public void setContent(Integer content) {
        this.content = content;
    }

    /** @return the total number of visitors */
    public Integer getCust() {
        return this.cust;
    }

    /** @param cust the total number of visitors */
    public void setCust(Integer cust) {
        this.cust = cust;
    }

    /** @return the points */
    public Integer getJf() {
        return this.jf;
    }

    /** @param jf the points */
    public void setJf(Integer jf) {
        this.jf = jf;
    }

    /** @return the number of times articles were bookmarked */
    public Integer getSc() {
        return this.sc;
    }

    /** @param sc the number of times articles were bookmarked */
    public void setSc(Integer sc) {
        this.sc = sc;
    }

    /** @return the weekly rank */
    public Integer getWeekGrade() {
        return this.WeekGrade;
    }

    /** @param weekGrade the weekly rank */
    public void setWeekGrade(Integer weekGrade) {
        this.WeekGrade = weekGrade;
    }

    /** @return the overall rank */
    public Integer getTotalGrade() {
        return this.TotalGrade;
    }

    /** @param totalGrade the overall rank */
    public void setTotalGrade(Integer totalGrade) {
        this.TotalGrade = totalGrade;
    }

    /** @return the level */
    public Integer getGrade() {
        return this.Grade;
    }

    /** @param grade the level */
    public void setGrade(Integer grade) {
        this.Grade = grade;
    }

    /**
     * Renders every field as {@code key=value}, with the three text fields
     * wrapped in single quotes.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CSDN_datas{");
        text.append("date=").append(this.date);
        text.append(", name='").append(this.name).append('\'');
        text.append(", codeAge='").append(this.codeAge).append('\'');
        text.append(", introduction='").append(this.introduction).append('\'');
        text.append(", yc=").append(this.yc);
        text.append(", fans=").append(this.fans);
        text.append(", zan=").append(this.zan);
        text.append(", content=").append(this.content);
        text.append(", cust=").append(this.cust);
        text.append(", jf=").append(this.jf);
        text.append(", sc=").append(this.sc);
        text.append(", WeekGrade=").append(this.WeekGrade);
        text.append(", TotalGrade=").append(this.TotalGrade);
        text.append(", Grade=").append(this.Grade);
        text.append('}');
        return text.toString();
    }
}

获取txt并转换:

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Reads the text file produced by the crawler and turns it into a
 * {@link CSDN_datas} object plus a labelled, human-readable report.
 */
public class TxtUtils {
    /** Layout of the timestamp on the first line of the data file. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Parses the data file and returns a labelled report of its contents.
     *
     * <p>The file is read as UTF-8 and must contain, one value per line and in
     * this order: update time, nickname, code age, introduction, original
     * article count, fans, likes, comments, total visitors, points, bookmark
     * count, weekly rank, overall rank, level. The populated
     * {@link CSDN_datas} is printed to standard output as a side effect.
     *
     * @param file the data file to read
     * @return one "label:value" line per field, each terminated by a newline
     * @throws IOException if the file cannot be read, or ends before all
     *         fields were found ({@link EOFException})
     * @throws ParseException if the timestamp is malformed, or the code-age or
     *         level line contains no number
     * @throws NumberFormatException if one of the counter lines is not an integer
     */
    public static String txt2String(File file) throws IOException, ParseException {
        CSDN_datas datas = new CSDN_datas();
        StringBuilder result = new StringBuilder();
        // The crawler writes UTF-8, so decode explicitly instead of relying on the
        // platform default charset; try-with-resources closes the reader on failure too.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String s = requireLine(br, "此次数据更新时间");
            result.append("此次数据更新时间:").append(s).append("\n");
            DateFormat format = new SimpleDateFormat(TIMESTAMP_PATTERN);
            datas.setDate(format.parse(s));

            s = requireLine(br, "昵称");
            result.append("昵称:").append(s).append("\n");
            datas.setName(s);

            // Take the whole number, not just one character, so multi-digit ages survive.
            s = requireLine(br, "码龄");
            String years = firstNumber(s, "码龄");
            result.append("码龄:").append(years).append("年").append("\n");
            datas.setCodeAge(years + "年");

            s = requireLine(br, "个人简介");
            result.append("个人简介:").append(s).append("\n");
            datas.setIntroduction(s);

            datas.setYc(readCount(br, result, "原创文章数"));
            datas.setFans(readCount(br, result, "粉丝数"));
            datas.setZan(readCount(br, result, "获赞数"));
            datas.setContent(readCount(br, result, "评论数"));
            datas.setCust(readCount(br, result, "总访客"));
            datas.setJf(readCount(br, result, "积分数"));
            datas.setSc(readCount(br, result, "文章被收藏数"));
            datas.setWeekGrade(readCount(br, result, "周排名"));
            datas.setTotalGrade(readCount(br, result, "总排名"));

            // Same as code age: keep every digit of the level.
            s = requireLine(br, "等级");
            String level = firstNumber(s, "等级");
            result.append("等级:").append(level).append("\n");
            datas.setGrade(Integer.valueOf(level));
        }
        System.out.println(datas);
        return result.toString();
    }

    /** Reads the next line, failing with EOFException when the file ends early. */
    private static String requireLine(BufferedReader br, String label) throws IOException {
        String line = br.readLine();
        if (line == null) {
            throw new EOFException("Data file ended before field: " + label);
        }
        return line;
    }

    /** Reads the next line as an integer counter and appends "label:line" to the report. */
    private static Integer readCount(BufferedReader br, StringBuilder report, String label)
            throws IOException {
        String line = requireLine(br, label);
        report.append(label).append(":").append(line).append("\n");
        // Surrounding whitespace is tolerated for parsing; the report keeps the raw line.
        return Integer.valueOf(line.trim());
    }

    /** Returns the first run of ASCII digits in text, or throws ParseException if there is none. */
    private static String firstNumber(String text, String label) throws ParseException {
        int start = 0;
        while (start < text.length() && !isAsciiDigit(text.charAt(start))) {
            start++;
        }
        int end = start;
        while (end < text.length() && isAsciiDigit(text.charAt(end))) {
            end++;
        }
        if (start == end) {
            throw new ParseException("No number found for " + label + ": " + text, 0);
        }
        return text.substring(start, end);
    }

    /** Tells whether c is one of '0'..'9'. */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Prints the report for the data file.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     */
    public static void main(String[] args) throws IOException, ParseException {
        File file = new File(args.length > 0 ? args[0] : "D://DeskTop/csdn.txt");
        System.out.println(txt2String(file));
    }
}

接下来就可以自由发挥,将数据存入数据库,或者每隔一段时间对数据进行一次分析等等。

原创文章 249 获赞 289 访问量 4万+

猜你喜欢

转载自blog.csdn.net/ATFWUS/article/details/106172578