[Data Analysis] An analysis based on news text data

Because of sensitive vocabulary and other issues, this post failed review every time, so the analysis itself is published as images. The relevant code is in the appendix at the end of the article.

Data source: 2020 Memory: Reports, Non-Fiction and Personal Narratives (continuously updated)


Appendix:
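
For orientation, the scripts below form one pipeline: t1.py scrapes the article links from the nCovMemory page on GitHub and downloads each article's text according to its source site; t2.py drops empty rows and tokenizes the articles with jieba; dataAnalysis.py counts city mentions, draws a folium heat map and writes the city co-occurrence files; t3.py renders the city flow map with pyecharts; TF-IDF.py extracts keywords and draws a word cloud; LDA_主题模型.py assigns an LDA topic to each article; and Snownlp情感分析.py scores the sentiment of each title.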

t1.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/12 13:02
# @Author  : ystraw
# @Site    : 
# @File    : t1.py
# @Software: PyCharm Community Edition
# @function: fetch data from the specified GitHub page:
#            collect the article links, then scrape the article content from each link's source site

import requests
import time
import datetime
from bs4 import BeautifulSoup
from openpyxl import Workbook
import random
from lxml import etree
from openpyxl import load_workbook
import getIpPool
proxies = getIpPool.getproxies()
MAX_num = 15    # upper bound of the random proxy index
openFlag = 1    # 0 = disable proxy, 1 = enable proxy
outTime = 10    # request timeout in seconds

# Write a string to a file (overwrite):
def writeFile(filename, file):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(file)
    print(filename, 'written!')
# Write a string to a file (append):
def writeFile_add(filename, file):
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(file)
    print(filename, 'written!')
# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content
# Write data to an Excel file
def write_excel_xls(path, sheet_name, value, bHead):
    # number of data rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet and name it
    ws = wb.active
    ws.title = sheet_name
    # header row
    ws.append(bHead)
    # .cell(row=x, column=2, value=z.project)
    for i in range(2, index+2):
        for j in range(1, len(value[i-2]) + 1):
            # ws.append(value[i])
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save the workbook
    wb.save(path)
    print(path + ' written successfully!')

# Collect the article links from the GitHub page
def getUrl(path, url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    html = requests.get(url, headers=headers, verify=False).text
    # writeFile('data/top250.html', html)
    # xpath: extract the titles
    text = etree.HTML(html)
    trs = text.xpath('//div[@class="Box-body"]//tbody/tr/td[2]/text()')
    # bs4: extract the rows of the link table:
    bs = BeautifulSoup(html, 'lxml')
    div = bs.findAll('div', attrs={'class': 'Box-body'})[0]
    # print(div)
    trList = div.findAll('tr')
    # print(len(trList))
    cnt = 0
    # all collected rows
    alldata = []
    for tr in trList:
        tds = tr.findAll('td')
        if tds != []:
            # extract: date, title
            tempList = [tds[0].string, trs[cnt]]
            # extract the links for: original URL, screenshot, translation, Archive
            for i in range(2, 6):
                astring = ''
                aList = tds[i].findAll('a')
                for a in aList:
                    astring += a['href'] + ','
                tempList.append(astring.strip(','))
            print(tempList)
            alldata.append(tempList)
            cnt += 1
    tableHead = ['日期', '标题', '原始URL', '截图', '翻译', 'Archive']
    write_excel_xls(path, 'link', alldata, tableHead)

# Extract WeChat Official Account articles
def getdetailContent_1(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="rich_media_content "])').replace(' ', '').replace('\n', '')
    # print(context.replace(' ', '').replace('\n', ''))
    return context

# Extract Caixin (caixin.com) articles
def getdetailContent_2(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text
    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@id="Main_Content_Val"])')
    # print(context.replace(' ', '').replace('\n', ''))
    # print('===============')
    return context

# Extract articles from The Economic Observer (eeo.com.cn)
def getdetailContent_3(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # re-encode the response to fix garbled characters
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text.encode('iso-8859-1')
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text.encode('iso-8859-1')
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text.encode('iso-8859-1')
    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="xx_boxsing"])')
    # print(context.replace(' ', '').replace('\n', ''))
    # print('===============')
    return context

# Extract posts from Fang Fang's blog (fangfang.blog.caixin.com)
def getdetailContent_4(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # fix garbled characters if needed
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text

    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="blog_content"])')
    # print(context.replace(' ', '').replace('\n', ''))
    # print('===============')
    return context

# Extract articles from China Business Journal (cb.com.cn) special topic pages
def getdetailContent_5(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # fix garbled characters if needed
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text

    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="contentleft auto"])')
    # print(context.replace(' ', '').replace('\n', ''))
    # print('===============')
    return context

# Extract articles from Jiemian News (jiemian.com)
def getdetailContent_6(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Request URL:', url)
    # fix garbled characters if needed
    # html = requests.get(url, timeout=10, headers=headers, verify=False).text
    global proxies
    global openFlag
    ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies)-1)]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
        # print(ip)
    else:
        html = requests.get(url, timeout=outTime, headers = headers, verify=False).text

    # print(html)
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="article-content"])')
    # print(context.replace(' ', '').replace('\n', ''))
    # print('===============')
    return context

# Read the link list from Excel and fetch each article
def getContent(path, savePath):
    # load the workbook of collected links
    wb = load_workbook(path)
    sheet_names = wb.sheetnames
    table = wb[sheet_names[0]]  # index 0 is the first sheet
    nrows = table.max_row     # number of rows
    ncols = table.max_column  # number of columns
    print(nrows, ncols)
    cnt = 0
    alldata = []
    for i in range(2, nrows+1):
        templist = []
        for j in range(1, ncols+1):
            # print(table.cell(i, j).value)
            templist.append(table.cell(i, j).value)
        # get the detail-page link:
        url = table.cell(i, 3).value.split(',')[0]
        try:
            if url[:24] == 'https://mp.weixin.qq.com':
                # WeChat Official Account article
                content = getdetailContent_1(url)
                templist.append('微信公共号')
                templist.append(content)
                # print(content)
                # pass
            elif url[:24] == 'http://china.caixin.com/' or url[:22] == 'http://www.caixin.com/' or url[:25] == 'http://weekly.caixin.com/':
                # Caixin article
                content = getdetailContent_2(url)
                templist.append('财新网')
                templist.append(content)
                # print(content)
                # pass
            elif url[:22] == 'http://www.eeo.com.cn/':
                # The Economic Observer
                # # print('经济观察网', table.cell(i, 3).value)
                content = getdetailContent_3(url)
                templist.append('经济观察网')
                templist.append(content)
                # print(content)
                # pass
            elif url[:32] == 'http://fangfang.blog.caixin.com/':
                # Fang Fang's blog
                content = getdetailContent_4(url)
                templist.append('方方博客')
                templist.append(content)
                # print(content)
                # pass
            elif url[:21] == 'http://www.cb.com.cn/':
                # # China Business Journal special topic
                content = getdetailContent_5(url)
                templist.append('中国经营网')
                templist.append(content)
                # # print(content)
                pass
            elif url[:24] == 'https://www.jiemian.com/':
                # Jiemian News
                content = getdetailContent_6(url)
                templist.append('界面网')
                templist.append(content)
                # print(content)
                # pass
            else:
                # print('else', table.cell(i, 3).value, '===', table.cell(i, 2).value)
                cnt += 1
                # print(table.cell(i, 3).value, table.cell(i, 5).value)
            alldata.append(templist)
        except Exception as ex:
            print('Exception:', ex)
        # if i >= 10:
        #     break
        # time.sleep(random.randint(0, 2))
    print('Links from unhandled sources:', cnt)
    tableHead = ['日期', '标题', '原始URL', '截图', '翻译', 'Archive','文章来源', '文章内容']
    write_excel_xls(savePath, 'link', alldata, tableHead)

if __name__ == '__main__':
    '''
     Step 1: collect the article links
    '''
    # data source page
    # url = 'https://github.com/2019ncovmemory/nCovMemory#%E7%AC%AC%E4%B8%80%E8%B4%A2%E7%BB%8Fyimagazine'
    # # output file path:
    # path = './data/all_text_2.xlsx'
    # getUrl(path, url)
    '''
     Step 2: fetch the article content from each link
    '''
    # url = 'https://web.archive.org/web/20200204084331/http://www.caixin.com/2020-02-04/101511377.html'
    # path of the saved link file:
    path = './data/all_text_link_2.xlsx'
    # output path:
    savePath = './data/text_0.xlsx'
    getContent(path, savePath)
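
Note: t1.py imports a local getIpPool module that is not included in this post. Judging from how it is used above (proxies = getIpPool.getproxies(), then proxies={ip[0]: ip[1]}), getproxies() just needs to return a list of [scheme, proxy_address] pairs. A minimal placeholder sketch, with made-up addresses, could look like this:

# getIpPool.py -- minimal sketch only, not the original module
def getproxies():
    # a real pool would fetch and refresh proxies from a provider;
    # the entries below are placeholders in the shape t1.py expects
    return [
        ['http', 'http://127.0.0.1:8080'],
        ['https', 'https://127.0.0.1:8080'],
    ]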

t2.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/13 13:46
# @Author  : ystraw
# @Site    : 
# @File    : t2.py
# @Software: PyCharm Community Edition
# @function: necessary preprocessing of the alltext.xlsx produced by t1.py
# 1. drop empty rows

import numpy
import pandas as pd
import jieba

# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Drop rows with empty article content
def dealNull(path, savepath):
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    # print(data.head())  # first five rows by default
    # print(data.tail())  # last five rows by default
    print(data.shape)        # data dimensions
    print(data.columns)      # column index
    # basic information (dimensions, column names, dtypes, memory usage, etc.)
    print(data.info())
    # dtype of each column
    print('dtypes:\n', df.dtypes)
    # read a specific cell
    # df['文章内容'].astype('str')
    # df['文章内容'] = df['文章内容'].map(str.strip)
    # print(data['文章内容'].at[123])
    # table values without the header
    # print(data.values)
    # flag rows whose article content is not null
    data_notnull = data['文章内容'].notnull()
    # print(data_notnull)
    # drop the empty rows
    data_new = data[data_notnull]
    # print(data_new)
    print('shape after dropping empty rows:\n', data_new.shape)
    # save the result
    data_new.to_excel(savepath, index=False, header=True)

# Tokenize the text and count word frequencies
def fenci(content):
    # read the stopword file:
    sword = readFile('./data/stopword.txt')
    # build the stopword list (one word per line):
    sword = sword.split('\n')
    worddict = {}
    wordlist = []
    for w in jieba.cut(content, cut_all=False):  # cut_all=False is precise mode, True is full mode
        # print(w)
        if (w not in sword) and w != '' and w != ' ' and w is not None and w != '\n' and len(w) >= 2:
            # print(w + '-')
            wordlist.append(w)
            worddict[w] = worddict.get(w, 0) + 1
    # print(worddict)
    return [worddict, wordlist]

# Data preprocessing: tokenize every article
def preDeal(path, savepath):
    # read the cleaned data
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    # add a column for the tokenized text
    df['文章内容分词'] = None
    for i in range(df.shape[0]):
        # tokenize the article content
        rt = fenci(df['文章内容'].at[i])
        df['文章内容分词'].at[i] = ' '.join(rt[1])
    # save the result
    df.to_excel(savepath, index=False, header=True)

if __name__ == '__main__':
    '''
    Data cleaning
    '''
    # # drop empty rows
    # path = './data/text_0.xlsx'
    # savepath = './data/text_1.xlsx'
    # dealNull(path, savepath)

    '''
    Data preprocessing
    '''
    path = './data/text_1.xlsx'
    savepath = './data/text_2.xlsx'
    preDeal(path, savepath)
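
Note: the stopword file is not included. fenci() above only assumes ./data/stopword.txt is a UTF-8 file with one stopword per line. A quick sanity check of the tokenizer (the sample sentence is arbitrary) might look like:

# run from the same directory as t2.py, with ./data/stopword.txt in place
from t2 import fenci

worddict, wordlist = fenci('这是一段用来测试分词和词频统计的示例文本')
print(wordlist)   # tokens of length >= 2 that are not stopwords
print(worddict)   # token -> frequency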

t3.py:

# Import Geo; note that the import path differs between pyecharts 1.x and 0.x
# upgrade with: pip install --upgrade pyecharts
from pyecharts.charts import Geo
# configuration options
from pyecharts import options as opts
# ChartType: chart type, SymbolType: marker symbol type
from pyecharts.globals import ChartType, SymbolType

# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

geo = Geo()
# Register coordinate points by name, longitude and latitude
# read the city coordinate file:
zb_city = readFile('./data/1-5LineCity_2.txt')
# geo.add_coordinate(name="China", longitude=104.195, latitude=35.675)
cityList = zb_city.split('\n')
for cy in cityList:
    if cy == '' or cy is None:
        continue
    temp = cy.split(',')
    geo.add_coordinate(name=temp[0], longitude=temp[2], latitude=temp[1])

# Map type; use "world" for a world map
geo.add_schema(maptype="china")
# read the node weights:
cityList = readFile('./data/city_node.csv').split('\n')
data = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    data.append((city.split(' ')[0], int(city.split(' ')[2])))
# print(data)
# read the flows (edges)
cityList = readFile('./data/city_edge.csv').split('\n')
data2 = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    # hide city pairs with few co-occurrences:
    if int(city.split(' ')[2]) < 200:
        continue
    data2.append((city.split(' ')[0], city.split(' ')[1]))
# print(data2)
# Add the scatter points
# geo.add("", [("北京", 10), ("上海", 20), ("广州", 30), ("成都", 40), ("哈尔滨", 50)], type_=ChartType.EFFECT_SCATTER)
geo.add("", data, type_=ChartType.EFFECT_SCATTER)
# Add the flow lines: type_ is LINES, the ripple effect is drawn as arrows; available symbols are
# 'circle', 'rect', 'roundRect', 'triangle', 'diamond', 'pin', 'arrow', 'none'
geo.add("geo-lines",
        data2,
        type_=ChartType.LINES,
        effect_opts=opts.EffectOpts(symbol=SymbolType.ARROW, symbol_size=10, color="yellow"),
        linestyle_opts=opts.LineStyleOpts(curve=0.2),
        is_large=True)
# Label options (set is_show=False to hide the point labels)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
# Chart title; visualmap_opts=opts.VisualMapOpts() adds the visual-map legend in the lower left
geo.set_global_opts(visualmap_opts=opts.VisualMapOpts(), title_opts=opts.TitleOpts(title="城市动态流向图"))
# render inline in a notebook
geo.render_notebook()
# render to an HTML file (an output path can be passed in)
geo.render("城市动态流向图.html")
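
A note on the data files read above (their layout is inferred from the parsing code here and in dataAnalysis.py below; the sample values are made up): ./data/1-5LineCity_2.txt is comma-separated as "city name,latitude,longitude" (the city name appears to carry the 市 suffix, which ms_analysis() strips), while ./data/city_node.csv and ./data/city_edge.csv are space-delimited with one header line, e.g.:

# ./data/1-5LineCity_2.txt
北京市,39.904,116.407

# ./data/city_node.csv
Id Label Weight
武汉 武汉 1234

# ./data/city_edge.csv
Source Target Weight
武汉 北京 345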

dataAnalysis.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/17 18:42
# @Author  : ystraw
# @Site    : 
# @File    : dataAnalysis.py
# @Software: PyCharm Community Edition
# @function: data analysis of the collected articles
import folium
import codecs
from folium.plugins import HeatMap
from pyecharts.charts import Geo
from pyecharts.charts import Map
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read a file
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Descriptive analysis
def ms_analysis(filepath):
    # display Chinese characters and minus signs correctly
    plt.rcParams['font.sans-serif'] = ['KaiTi']
    plt.rcParams['font.serif'] = ['KaiTi']
    # read the data
    data = pd.read_excel(filepath)
    '''
    Article counts by source and publication date
    '''
    # plot article volume by date and source:
    # data.groupby('日期')['文章来源'].value_counts().unstack().fillna(value=0).plot(kind='bar', title='文章发布量分日统计')
    # plt.show()
    # return

    '''
    City mention analysis
    '''
    # read the city data and build the city dictionary:
    city = readFile('./data/1-5LineCity_2.txt')
    cityList = city.split('\n')
    # print(cityList)
    # initialise the city frequency counts:
    cityDict = {}
    for cy in cityList:
        if cy == '' or cy is None:
            continue
        temp = cy.split(',')
        cityDict[temp[0][:-1]] = 0   # drop the trailing "市" from the city name
    # print(cityDict)
    print(data.shape[0], data.shape[1])
    # count city mentions in the tokenized articles
    for i in range(data.shape[0]):
        wordList = data['文章内容分词'].at[i].split(' ')
        for word in wordList:
            try:
                cityDict[word] += 1
            except KeyError:
                pass
        # break
    print(cityDict)

    # Draw the heat map:
    # keys/values of the dictionary
    # provice = list(cityDict.keys())
    # values = list(cityDict.values())
    # fill the heat map data as [latitude, longitude, weight]
    data = []
    for cy in cityList:
        if cy == '' or cy is None:
            continue
        temp = cy.split(',')
        data.append([float(temp[1]), float(temp[2]), cityDict[temp[0][:-1]]])
    # data = [[31.235929, 121.480539, 1500]]  # sample point
    print(data)
    map_osm = folium.Map([33., 113.], zoom_start=12)    # create the base map; initial zoom level 12
    HeatMap(data).add_to(map_osm)  # add the heat layer to the map
    map_osm.save('./image/文章提及城市分布.html')  # save the map as an HTML file

# Build the city co-occurrence files
def city_gx_analysis(filepath):
    citys = {}           # city -> mention count
    relationships = {}   # city -> {co-occurring city -> count}
    lineCitys = []       # cities mentioned in each article

    # build the city set:
    cityList = readFile('./data/1-5LineCity.txt').split('\n')
    citySet = set()
    for city in cityList:
        citySet.add(city.replace('市', ''))

    # read the tokenized data
    data = pd.read_excel(filepath)
    # collect the cities mentioned in each article
    for i in range(data.shape[0]):
        wordList = data['文章内容分词'].at[i].split(' ')
        lineCitys.append([])
        for word in wordList:
            if word not in citySet:
                continue
            lineCitys[-1].append(word)
            if citys.get(word) is None:
                citys[word] = 0
                relationships[word] = {}
            # increment the mention count
            citys[word] += 1
    # explore relationships
    for line in lineCitys:                  # for each article
        for city1 in line:
            for city2 in line:              # every ordered pair of cities in the article
                if city1 == city2:
                    continue
                if relationships[city1].get(city2) is None:   # first co-occurrence of this pair: create the entry
                    relationships[city1][city2] = 1
                else:
                    relationships[city1][city2] = relationships[city1][city2] + 1   # one more co-occurrence
    # output
    with codecs.open("./data/city_node.csv", "w", "utf-8") as f:
        f.write("Id Label Weight\r\n")
        for city, times in citys.items():
            f.write(city + " " + city + " " + str(times) + "\r\n")

    with codecs.open("./data/city_edge.csv", "w", "utf-8") as f:
        f.write("Source Target Weight\r\n")
        for city, edges in relationships.items():
            for v, w in edges.items():
                if w > 3:
                    f.write(city + " " + v + " " + str(w) + "\r\n")

if __name__ == '__main__':
    filepath = './data/text_2.xlsx'
    # descriptive analysis
    # ms_analysis(filepath)

    # analyse city co-occurrence relationships
    city_gx_analysis(filepath)
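
The two space-delimited files written by city_gx_analysis() are exactly what t3.py reads back to draw the flow map. A quick way to inspect them with pandas (just a convenience check, not part of the original scripts):

import pandas as pd

nodes = pd.read_csv('./data/city_node.csv', sep=' ')   # columns: Id Label Weight
edges = pd.read_csv('./data/city_edge.csv', sep=' ')   # columns: Source Target Weight
print(nodes.sort_values('Weight', ascending=False).head())
print(edges.sort_values('Weight', ascending=False).head())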

TF-IDF.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/23 22:20
# @Author  : ystraw
# @Site    : 
# @File    : TF-IDF.py
# @Software: PyCharm Community Edition
# @function: extract keywords from the article text

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from openpyxl import Workbook

# Write data to an Excel file
def write_excel_xls(path, sheet_name, value, bHead):
    # number of data rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet and name it
    ws = wb.active
    ws.title = sheet_name
    # header row
    ws.append(bHead)
    # .cell(row=x, column=2, value=z.project)
    for i in range(2, index+2):
        for j in range(1, len(value[i-2]) + 1):
            # ws.append(value[i])
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save the workbook
    wb.save(path)
    print(path + ' written successfully!')

def TQ():
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章内容分词'])
    # print(document)
    # print(len(document))

    # min_df: when building the vocabulary, ignore terms whose document frequency is strictly below this
    #         threshold (a float is a proportion of documents, an integer an absolute count; ignored if a
    #         fixed vocabulary is supplied)
    tfidf_model = TfidfVectorizer(min_df=0.023).fit(document)
    # all distinct terms in the corpus
    feature = tfidf_model.get_feature_names()  # on scikit-learn >= 1.0 use get_feature_names_out()
    # print(feature)
    # print(len(feature))
    # ['一切', '一条', '便是', '全宇宙', '天狗', '日来', '星球']
    # id of each feature, i.e. its index in the array above
    # print(tfidf_model.vocabulary_)
    # {'一条': 1, '天狗': 4, '日来': 5, '一切': 0, '星球': 6, '全宇宙': 3, '便是': 2}

    # tf-idf values of the selected features in each row:
    sparse_result = tfidf_model.transform(document)
    # print(sparse_result)

    # tf-idf matrix: one row per document, one column per feature
    # (zero if the document does not contain the term)
    weight = sparse_result.toarray()

    # build a term -> maximum tf-idf dictionary:
    feature_TFIDF = {}
    for i in range(len(weight)):
        for j in range(len(feature)):
            # print(feature[j], weight[i][j])
            if feature[j] not in feature_TFIDF:
                feature_TFIDF[feature[j]] = weight[i][j]
            else:
                feature_TFIDF[feature[j]] = max(feature_TFIDF[feature[j]], weight[i][j])
    # print(feature_TFIDF)
    # sort by value:
    print('Top-ranked TF-IDF keywords:')
    alldata = []
    featureList = sorted(feature_TFIDF.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    for i in range(1, 600 if len(featureList) > 600 else len(featureList)):
        print(featureList[i][0], featureList[i][1])
        alldata.append([featureList[i][0], featureList[i][1]])
    # write to file:
    tableHead = ['关键词', 'TF-IDF']
    import datetime
    filetime = str(datetime.datetime.now()).replace('-', '').replace(' ', '_').replace(':', '_')[:17]
    write_excel_xls('./data/关键词_' + filetime + '.xlsx', 'link', alldata, tableHead)

def drawWordCloud():
    from wordcloud import WordCloud
    from scipy.misc import imread   # removed in SciPy >= 1.2; see the note after this script
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章内容分词'])
    # assemble the text:
    # words = '一切 一条 便是 全宇宙 天狗 日来 星球'  # sample
    words = ' '.join(document)   # join with spaces so words at article boundaries do not merge
    # print(words)
    # load the mask image:
    b_mask = imread('./image/ciyun.webp')
    # draw the word cloud:
    wc = WordCloud(
        background_color="white",  # background colour
        max_words=2000,            # maximum number of words shown
        font_path="./image/simkai.ttf",  # font to use
        # min_font_size=5,
        # max_font_size=80,
        # width=400,   # image width
        mask=b_mask
    )
    wc.generate(words)
    # write the rendered word cloud to an image file
    wc.to_file("./image/beijing_2.jpg")

if __name__ == '__main__':
    '''
    Extract keywords
    '''
    # TQ()

    '''
    Draw the word cloud
    '''
    drawWordCloud()
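
Note: from scipy.misc import imread no longer works on SciPy 1.2 and later. If drawWordCloud() is run on a newer environment, the mask can be loaded with Pillow instead (already a dependency of wordcloud); a drop-in replacement for the b_mask line, assuming the same ./image/ciyun.webp file:

import numpy as np
from PIL import Image

b_mask = np.array(Image.open('./image/ciyun.webp'))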

LDA_主题模型.py (LDA topic model):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/26 14:10
# @Author  : ystraw
# @Site    : 
# @File    : LDA_主题模型.py
# @Software: PyCharm Community Edition
# @function: LDA topic modelling of the article corpus
import pandas as pd
import numpy as np

def LDA():
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章内容分词'])

    # build the term-frequency vectors:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    corpus = document
    cntVector = CountVectorizer()
    cntTf = cntVector.fit_transform(corpus)
    # the selected vocabulary features
    vocs = cntVector.get_feature_names()  # on scikit-learn >= 1.0 use get_feature_names_out()
    print('vocabulary size:', len(vocs))
    # print(vocs)
    # cntTf holds the term-frequency vector of every document
    # print(cntTf)

    # LDA topic model
    lda = LatentDirichletAllocation(n_components=4,  # number of topics
                                    max_iter=5,      # maximum number of EM iterations
                                    learning_method='online',
                                    learning_offset=20.,  # only used with the online method, must be > 1;
                                                          # down-weights early training batches
                                    random_state=0)
    docres = lda.fit_transform(cntTf)
    # document-topic probabilities
    LDA_corpus = np.array(docres)
    print('document-topic probabilities:\n', LDA_corpus)
    # topic-word weight matrix, one row per topic
    # print('topic-word matrix:\n', lda.components_)
    # the column with the largest probability determines the assigned topic:
    arr = pd.DataFrame(LDA_corpus)
    data['主题类别'] = np.argmax(LDA_corpus, axis=1)  # index of the row-wise maximum
    data['主题出现概率'] = arr.max(axis=1)            # row-wise maximum probability
    print('assigned topics:\n', data.head())
    data.to_excel('./data/LDA_主题分布_类别.xlsx', index=False)
    # return

    # print the word weights of each topic
    tt_matrix = lda.components_
    # topic id
    id = 0
    # collected rows
    datalist = []
    for tt_m in tt_matrix:
        # (word, weight) tuples
        tt_dict = [(name, tt) for name, tt in zip(vocs, tt_m)]
        tt_dict = sorted(tt_dict, key=lambda x: x[1], reverse=True)
        # keep only topic words with weight above 0.6:
        # tt_dict = [tt_threshold for tt_threshold in tt_dict if tt_threshold[1] > 0.6]
        # keep the top 20 words of each topic:
        tt_dict = tt_dict[:20]
        print('Topic %d:' % id, tt_dict)
        # collect for saving:
        datalist += [[tt_dict[i][0], tt_dict[i][1], id] for i in range(len(tt_dict))]
        id += 1
    # save to Excel:
    # df = pd.DataFrame(datalist, columns=['特征词', '权重', '类别'])
    # df.to_excel('./data/LDA_主题分布3.xlsx', index=False)

if __name__ == '__main__':
    '''
        Topic extraction with the LDA topic model:
    '''
    LDA()

Snownlp情感分析.py (SnowNLP sentiment analysis):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/27 12:54
# @Author  : ystraw
# @Site    : 
# @File    : Snownlp情感分析.py
# @Software: PyCharm Community Edition
# @function: sentiment analysis of the article titles

import pandas as pd
from snownlp import SnowNLP

def qgjs():
    # read the topic-labelled data
    data = pd.read_excel('./data/LDA_主题分布_类别.xlsx')
    # print(data.shape)
    # sentiment score for each article title
    score = []
    for i in range(0, data.shape[0]):
        s = SnowNLP(data['标题'].at[i])
        score.append(s.sentiments)
    data['情绪得分'] = score
    print(data.head())
    data.to_excel('./data/情绪得分.xlsx', index=False)

if __name__ == '__main__':
    qgjs()
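
For reading 情绪得分: SnowNLP's sentiments value is a score between 0 and 1, where values close to 1 indicate positive sentiment and values close to 0 negative. The bundled model is trained on shopping-review text, so scores on news titles should be treated as a rough signal only. A quick sanity check:

from snownlp import SnowNLP

print(SnowNLP('这个消息真让人高兴').sentiments)    # expected to be close to 1 (positive)
print(SnowNLP('情况非常糟糕,令人难过').sentiments)  # expected to be close to 0 (negative)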