Python数据分析与挖掘实战 15章

在学习《Python数据分析与挖掘实战》的过程中，我发现书中有些代码无法直接运行，自己因此走了很多弯路。特此分享可直接运行的代码，希望能让有需要的朋友少走弯路。

15-1部分

import pandas as pd
# Source table of collected reviews, and the destination for the Midea-only reviews.
inputfile = '../15.2/huizong.csv'
outfile = '../15.2/meidi_jd.txt'

reviews = pd.read_csv(inputfile)

# Select the rows whose brand column equals Midea and, of those rows,
# only the review-text column (kept as a one-column DataFrame).
is_midea = reviews['品牌'] == '美的'
data = reviews.loc[is_midea, ['评论']]

# Write one review per line: no header row and no index column.
data.to_csv(outfile, header=False, index=False)

15-2部分

import pandas as pd
# Read the Midea-only review file written by the 15-1 step. Reading the raw
# huizong.csv here would de-duplicate that table's first column instead of
# the review texts.
inputfile = '../15.2/meidi_jd.txt'
outfile = '../15.2/meidi_jd_process_1.txt'

# header=None: the file has no header row, so its single column is labelled 0.
data = pd.read_csv(inputfile, encoding='utf-8', header=None)
l1 = len(data)  # number of reviews before de-duplication

# Keep one copy of each distinct review text.
data = pd.DataFrame(data[0].unique())
l2 = len(data)  # number of reviews after de-duplication

data.to_csv(outfile, index=False, header=False, encoding='utf-8')
# Report how many duplicate reviews were dropped.
print(u'删除了%s条评论' % (l1 - l2))

15-3部分

import pandas as pd
outfile1 = '../15.2/meidi_jd_pos.txt'
outfile2 = '../15.2/meidi_jd_neg.txt'

# The inputs are opened with open() and handed to pandas as file objects;
# the with-blocks close the handles as soon as the data has been read.
with open('../15.2/meidi_jd_process_end_正面情感结果.txt', encoding='utf-8') as inputfile1:
    data1 = pd.read_csv(inputfile1, encoding='utf-8', header=None)
with open('../15.2/meidi_jd_process_end_负面情感结果.txt', encoding='utf-8') as inputfile2:
    data2 = pd.read_csv(inputfile2, encoding='utf-8', header=None)

# Regex: a lazily matched run of any characters, then one or more digits,
# then a tab. Every match is removed from each line.
# NOTE(review): presumably this strips a score/label prefix added by the
# sentiment-analysis tool -- confirm against the input files.
# Written as a raw string so that \d and \t reach the regex engine unchanged.
prefix_pattern = r'.*?\d+?\t'

# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the data untouched.
data1 = pd.DataFrame(data1[0].str.replace(prefix_pattern, '', regex=True))
data2 = pd.DataFrame(data2[0].str.replace(prefix_pattern, '', regex=True))

data1.to_csv(outfile1, index=False, header=False, encoding='utf-8')
data2.to_csv(outfile2, index=False, header=False, encoding='utf-8')
 
 

15-4部分

import pandas as pd
import jieba

outfile1 = '../15.2/meidi_jd_pos_cut.txt'
outfile2 = '../15.2/meidi_jd_neg_cut.txt'

# Read each review file through an explicitly opened handle and close the
# handle as soon as pandas has consumed it.
with open('../15.2/meidi_jd_pos.txt', encoding='utf-8') as inputfile1:
    data1 = pd.read_csv(inputfile1, encoding='utf-8', header=None)
with open('../15.2/meidi_jd_neg.txt', encoding='utf-8') as inputfile2:
    data2 = pd.read_csv(inputfile2, encoding='utf-8', header=None)


def mycut(s):
    """Segment *s* with jieba and return the tokens joined by single spaces."""
    return ' '.join(jieba.cut(s))


# Column 0 holds the review text; the result of apply() is a Series of
# space-separated token strings.
data1 = data1[0].apply(mycut)
data2 = data2[0].apply(mycut)

data1.to_csv(outfile1, index=False, header=False, encoding='utf-8')
data2.to_csv(outfile2, index=False, header=False, encoding='utf-8')
 
 

15-5部分

import pandas as pd

# Read the segmented reviews and the stop-word list through explicitly opened
# handles; the with-blocks close each handle once pandas has read it.
with open('../15.2/meidi_jd_pos_cut.txt', encoding='utf-8') as inputfile1:
    pos = pd.read_csv(inputfile1, encoding='utf-8', header=None)
with open('../15.2/meidi_jd_neg_cut.txt', encoding='utf-8') as inputfile2:
    neg = pd.read_csv(inputfile2, encoding='utf-8', header=None)
with open('../15.2/stoplist.txt', encoding='utf-8') as stopfile:
    # NOTE(review): 'tipdm' looks like a separator chosen because it never
    # occurs in the file, so each line stays one field even if it contains a
    # comma -- confirm against stoplist.txt.
    # A multi-character separator is only supported by the python engine, so
    # it is requested explicitly instead of relying on the fallback warning.
    stop = pd.read_csv(stopfile, encoding='utf-8', header=None, sep='tipdm',
                       engine='python')

# A set gives constant-time membership tests in the per-token filters below.
# The blank and empty strings are added because split(' ') can produce them.
stop = {' ', ''} | set(stop[0])

# Column 1: the token list of each review; column 2: the same list with the
# stop words removed.
neg[1] = neg[0].apply(lambda s: s.split(' '))
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop])

pos[1] = pos[0].apply(lambda s: s.split(' '))
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop])

from gensim import corpora,models
# Topic model for the positive reviews, built from the stop-word-filtered
# token lists in pos[2].
pos_dict = corpora.Dictionary(pos[2])  # build the dictionary
pos_corpus = [pos_dict.doc2bow(i) for i in pos[2]]  # build the bag-of-words corpus
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)  # train the LDA model

for i in range(3):
    # print_topic() only returns the topic description; print it explicitly,
    # otherwise nothing is shown when this runs as a script.
    print(pos_lda.print_topic(i))

from gensim import corpora,models
# Topic model for the negative reviews, built from the stop-word-filtered
# token lists in neg[2].
neg_dict = corpora.Dictionary(neg[2])  # build the dictionary
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]]  # build the bag-of-words corpus
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)  # train the LDA model

for i in range(3):
    # print_topic() only returns the topic description; print it explicitly,
    # otherwise nothing is shown when this runs as a script.
    print(neg_lda.print_topic(i))


猜你喜欢

转载自blog.csdn.net/lonely2018/article/details/80188028