在学习《Python数据分析与挖掘实战》的过程中,我发现书中有些代码无法直接运行,自己学习时因此走了很多弯路。特此分享可以直接运行的代码,希望能让有需要的朋友少走弯路。
15-1部分
import pandas as pd

# Listing 15-1: extract the comment column for a single brand from the
# summary CSV and write it out as a header-less, index-less text file.
SUMMARY_CSV = '../15.2/huizong.csv'
BRAND_COMMENTS_TXT = '../15.2/meidi_jd.txt'

reviews = pd.read_csv(SUMMARY_CSV)

# Rows whose brand column ('品牌') equals the target brand ('美的').
is_target_brand = reviews['品牌'] == '美的'

# Keep only the comment column ('评论'); the list selector keeps a DataFrame.
brand_comments = reviews.loc[is_target_brand, ['评论']]

brand_comments.to_csv(BRAND_COMMENTS_TXT, index=False, header=False)
15-2部分
import pandas as pd

# Listing 15-2: remove duplicate comments and report how many were dropped.
#
# NOTE(review): the original read '../15.2/huizong.csv' here. That file has a
# header row and several columns (listing 15-1 selects '品牌' and '评论' from
# it), so reading it with header=None and de-duplicating column 0 does not
# operate on the extracted comments. The input is now the single-column file
# written by listing 15-1, which matches this step's output name.
inputfile = '../15.2/meidi_jd.txt'
outfile = '../15.2/meidi_jd_process_1.txt'

# The comment file has no header row: every line is one comment in column 0.
data = pd.read_csv(inputfile, encoding='utf-8', header=None)
count_before = len(data)

# Keep the first occurrence of every distinct comment.
data = data[[0]].drop_duplicates()
count_after = len(data)

data.to_csv(outfile, index=False, header=False, encoding='utf-8')
print(u'删除了%s条评论' % (count_before - count_after))
15-3部分
import pandas as pd

# Listing 15-3: clean the positive / negative sentiment result files by
# removing the prefix that precedes each comment.

inputfile1 = '../15.2/meidi_jd_process_end_正面情感结果.txt'
inputfile2 = '../15.2/meidi_jd_process_end_负面情感结果.txt'
outfile1 = '../15.2/meidi_jd_pos.txt'
outfile2 = '../15.2/meidi_jd_neg.txt'

# Lazily matches any text followed by one or more digits and a tab.
# Presumably this is the score column emitted by the sentiment tool --
# confirm against the actual input files.
# A raw string avoids the invalid-escape warning the original '\d' triggered.
PREFIX_PATTERN = r'.*?\d+?\t'


def _strip_prefix(src_path, dst_path):
    """Read *src_path*, delete every PREFIX_PATTERN match, write *dst_path*.

    The input is read as a header-less CSV and only column 0 is kept. The
    output is written without index or header, encoded as UTF-8.
    """
    # Open the file ourselves (as the original did) and pass the handle to
    # pandas; the ``with`` block guarantees the handle is closed afterwards.
    with open(src_path, encoding='utf-8') as handle:
        frame = pd.read_csv(handle, header=None)

    # regex=True is required: since pandas 2.0 ``Series.str.replace`` treats
    # the pattern as a literal string by default, which would strip nothing.
    cleaned = frame[0].str.replace(PREFIX_PATTERN, '', regex=True)

    pd.DataFrame(cleaned).to_csv(
        dst_path, index=False, header=False, encoding='utf-8'
    )


_strip_prefix(inputfile1, outfile1)
_strip_prefix(inputfile2, outfile2)
15-4部分
import jieba
import pandas as pd

# Listing 15-4: segment every comment into words with jieba and save the
# result with the words separated by single spaces.

inputfile1 = '../15.2/meidi_jd_pos.txt'
inputfile2 = '../15.2/meidi_jd_neg.txt'
outfile1 = '../15.2/meidi_jd_pos_cut.txt'
outfile2 = '../15.2/meidi_jd_neg_cut.txt'


def mycut(sentence):
    """Return *sentence* segmented by jieba, tokens joined with one space."""
    return ' '.join(jieba.cut(sentence))


def _cut_file(src_path, dst_path):
    """Segment column 0 of the header-less CSV *src_path* into *dst_path*.

    The output is written without index or header, encoded as UTF-8.
    """
    # Open the file ourselves (as the original did) and pass the handle to
    # pandas; the ``with`` block guarantees the handle is closed afterwards.
    with open(src_path, encoding='utf-8') as handle:
        frame = pd.read_csv(handle, header=None)

    segmented = frame[0].apply(mycut)
    segmented.to_csv(dst_path, index=False, header=False, encoding='utf-8')


_cut_file(inputfile1, outfile1)
_cut_file(inputfile2, outfile2)
15-5部分
import pandas as pd
from gensim import corpora, models

# Listing 15-5: remove stop words from the segmented comments and fit one
# LDA topic model per sentiment class, printing every topic.

# Number of topics to fit and print for each model.
NUM_TOPICS = 3


def _read_headerless(path, **read_csv_kwargs):
    """Read the UTF-8 text file *path* as a header-less DataFrame.

    The file is opened here and the handle passed to pandas (as the original
    did); the ``with`` block guarantees the handle is closed afterwards.
    """
    with open(path, encoding='utf-8') as handle:
        return pd.read_csv(handle, header=None, **read_csv_kwargs)


def _tokenize(frame, stop_words):
    """Add token columns to *frame* in place.

    Column 1 holds column 0 split on single spaces; column 2 holds column 1
    with every token found in *stop_words* removed.
    """
    frame[1] = frame[0].apply(lambda text: text.split(' '))
    frame[2] = frame[1].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )


def _fit_and_print_lda(documents):
    """Build dictionary and corpus from *documents*, train LDA, print topics.

    Returns the ``(dictionary, corpus, lda_model)`` triple.
    """
    dictionary = corpora.Dictionary(documents)  # token -> id mapping
    corpus = [dictionary.doc2bow(doc) for doc in documents]  # bag-of-words
    lda = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)
    for topic_id in range(NUM_TOPICS):
        # print_topic() only returns the formatted string; it has to be
        # printed explicitly to be visible when this runs as a script.
        print(lda.print_topic(topic_id))
    return dictionary, corpus, lda


pos = _read_headerless('../15.2/meidi_jd_pos_cut.txt')
neg = _read_headerless('../15.2/meidi_jd_neg_cut.txt')

# 'tipdm' is used as a separator so that each line is read as one field.
# NOTE(review): this assumes the string never occurs in the stop list.
# A multi-character separator is only supported by the python engine, so it
# is requested explicitly instead of relying on a fallback with a warning.
stop_frame = _read_headerless(
    '../15.2/stoplist.txt', sep='tipdm', engine='python'
)

# A set gives O(1) membership tests; the blank and empty tokens produced by
# splitting on spaces are filtered out as well.
stop = {' ', ''} | set(stop_frame[0])

_tokenize(neg, stop)
_tokenize(pos, stop)

# Positive-comment topics first, then negative-comment topics.
pos_dict, pos_corpus, pos_lda = _fit_and_print_lda(pos[2])
neg_dict, neg_corpus, neg_lda = _fit_and_print_lda(neg[2])