Data-mine product reviews to obtain comment tags (product attribute + review opinion) together with user grouping information. A tag pairs an attribute with an opinion, e.g. "屏幕" (screen) + "清晰" (clear) → "屏幕清晰"; the user groups indicate who the phone was bought for, inferred from a lexicon of family and friend terms:

Step 1: Preprocess the text, segment it, and run semantic role labeling

```python
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import heapq
import re
import emoji

class Sentence_Parser:
    def __init__(self):
        LTP_DIR = './ltp_data_v3.4.0'
        # word segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, 'cws.model'))

        # part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, 'pos.model'))

        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, 'parser.model'))

        # named entity recognition (person, place, organization, etc.)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, 'ner.model'))

        # semantic role labeling (agent, patient, time, location)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))


    def format_labelrole(self, words, postags):
        """
        Semantic role labeling.
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict


    def build_parser_child_dict(self, words, postags, arcs):
        """
        Dependency parsing: maintain, for each word in the sentence, a dict of
        its syntactic dependents grouped by dependency relation.
        """
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                # arc.head is 1-based; head == index + 1 means this arc's word depends on words[index]
                if arcs[arc_index].head == index + 1:
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]
        for i in range(len(words)):
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list


    def parser_main(self, sentence):
        """
        Main parsing entry point: segmentation, POS tagging, dependency
        parsing and semantic role labeling for one sentence.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parser_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def select(self, words, postags):
        """
        Filter out nouns and adjectives, keeping only the nouns most related
        to '手机' (phone) according to the Word2Vec model.
        """
        co_model = Word2Vec.load('coseg_text.model')
        n_list0 = []
        a_list = []
        for i in range(len(postags)):
            if postags[i] == 'n' and len(words[i]) >= 2:
                n_list0.append(words[i])
            if postags[i] == 'a':
                a_list.append(words[i])
        n_list0 = list(set(n_list0))
        a_list = list(set(a_list))
        si_p = []
        for n in n_list0:
            try:
                si_p.append(co_model.wv.similarity(n, '手机'))
            except KeyError:
                # word not in the embedding vocabulary
                si_p.append(0)
        # indices of the top 80% of nouns by similarity to '手机'
        index_list = heapq.nlargest(int(0.8 * len(si_p)), range(len(si_p)), key=si_p.__getitem__)
        n_list = [n_list0[index] for index in index_list]
        return n_list, a_list


    def similarity(self, n_list0, a_list):
        """
        Compute similarities and run forward/backward matching to select the
        best noun-adjective pairings.
        """
        co_model = Word2Vec.load('coseg_text.model')
        si_p = []
        for n in n_list0:
            try:
                si_p.append(co_model.wv.similarity(n, '手机'))
            except KeyError:
                si_p.append(0)
        # indices of the top 80% of nouns by similarity to '手机'
        index_list = heapq.nlargest(int(0.8 * len(si_p)), range(len(si_p)), key=si_p.__getitem__)
        n_list = [n_list0[index] for index in index_list]

        # forward matching: for each noun, find its best adjective
        comment1_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        index = 0
        for i in range(len(n_list)):
            f_si = 0
            comment_tag = None
            for j in range(len(a_list)):
                try:
                    si = co_model.wv.similarity(n_list[i], a_list[j])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[i] + a_list[j]
                except KeyError as e:
                    print('word missing from the corpus', e)
            comment1_df.loc[index, ] = [comment_tag, f_si]
            index += 1
        comment1_df = comment1_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment1_df.dropna(subset=['comment_tag'], inplace=True)

        # backward matching: for each adjective, find its best noun
        comment2_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        index = 0
        for i in range(len(a_list)):
            f_si = 0
            comment_tag = None
            for j in range(len(n_list)):
                try:
                    si = co_model.wv.similarity(n_list[j], a_list[i])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[j] + a_list[i]
                except KeyError as e:
                    print('word missing from the corpus', e)
            comment2_df.loc[index, ] = [comment_tag, f_si]
            index += 1
        comment2_df = comment2_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment2_df.dropna(subset=['comment_tag'], inplace=True)

        # keep only pairings that survive both matching directions
        comment_df = pd.merge(comment1_df, comment2_df, on='comment_tag', how='inner')
        comment_df.dropna(subset=['comment_tag'], inplace=True)
        return comment_df

    def cleandata(self, x):
        """
        Clean the raw text: keep only Chinese, English and digits, and turn
        whitespace and other symbols into commas.
        """
        pat = re.compile(u"[^\u4e00-\u9fa5a-zA-Z0-9.]")  # keep Chinese/English/digits, drop other symbols
        x = x.replace(' ', ',')
        x = emoji.demojize(x)  # replace emoji with their plain-text aliases
        x = re.sub(pat, ',', x)
        return x
```
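To see how the pieces fit together, here is a minimal usage sketch. The sample sentence is an illustrative assumption, and it presumes the LTP models sit in `./ltp_data_v3.4.0` and that a Word2Vec model `coseg_text.model` trained on segmented comment text is available:

```python
# Hypothetical smoke test for Sentence_Parser; the sentence and model paths
# are assumptions, not part of the original post.
parser = Sentence_Parser()
sentence = parser.cleandata('这个手机屏幕很清晰,电池也耐用!')
words, postags, child_dict_list, roles_dict, format_parse_list = parser.parser_main(sentence)
n_list, a_list = parser.select(words, postags)   # candidate nouns / adjectives
comment_df = parser.similarity(n_list, a_list)   # best noun-adjective pairings
print(comment_df.head())
```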



Step 2: Extract entities and related entity information


```python
# -*- coding:utf-8 -*-
import heapq
from gensim.models import Word2Vec
from cixing import Sentence_Parser

class Extractor:
    def __init__(self):
        self.co_model = Word2Vec.load('coseg_text.model')
        self.parser = Sentence_Parser()

    def get_seginfo(self, comment_list):
        # for each comment: clean, parse, extract tags and user-group nouns,
        # and append the results to text files
        for c in range(len(comment_list)):
            # truncate overly long comments to 200 characters
            if len(comment_list[c]) <= 200:
                sentence = comment_list[c]
            else:
                sentence = comment_list[c][0:200]
            if sentence != '':
                sentence = self.parser.cleandata(sentence)
                words, postags, child_dict_list, roles_dict, format_parse_list = self.parser.parser_main(sentence)
                n_list, a_list = self.parser.select(words, postags)

                tags = []
                for j in range(len(a_list)):
                    p = words.index(a_list[j])
                    if child_dict_list[p]:
                        # subject-verb (SBV) pattern: the adjective acts as the predicate
                        if 'SBV' in child_dict_list[p]:
                            si_p = []
                            for po in child_dict_list[p]['SBV']:
                                try:
                                    si_p.append(self.co_model.wv.similarity(words[po], '手机'))
                                except KeyError:
                                    si_p.append(0)
                            # index of the SBV child most related to '手机'
                            id = heapq.nlargest(1, range(len(si_p)), key=si_p.__getitem__)

                            s = child_dict_list[p]['SBV'][id[0]]
                            w1 = words[s] + a_list[j]
                            if child_dict_list[s]:
                                # prepend an attributive (ATT) noun if the subject carries one
                                if 'ATT' in child_dict_list[s]:
                                    if postags[child_dict_list[s]['ATT'][0]] == 'n':
                                        w2 = words[child_dict_list[s]['ATT'][0]] + w1
                                        tags.append(w2)
                                    else:
                                        tags.append(w1)
                            else:
                                tags.append(w1)

                        # attributive (ATT) pattern: the adjective modifies a noun
                        if 'ATT' in child_dict_list[p]:
                            s = child_dict_list[p]['ATT'][0]
                            if 'SBV' in child_dict_list[s]:
                                w3 = words[child_dict_list[s]['SBV'][0]]
                                w4 = w3 + a_list[j]
                                id1 = words.index(w3)
                                if child_dict_list[id1]:
                                    if 'ATT' in child_dict_list[id1]:
                                        if postags[child_dict_list[id1]['ATT'][0]] == 'n':
                                            w5 = words[child_dict_list[id1]['ATT'][0]] + w4
                                            tags.append(w5)
                                else:
                                    tags.append(w4)

                with open(r'F:\pycharm project data\taobao\phone\tags.txt', 'a') as t:
                    t.write(' '.join(tags))
                    t.write('\n')
                print(tags)


                # collect candidate nouns and match them against a user-group lexicon
                n_list = list(set(n_list))
                if n_list:
                    with open(r'F:\pycharm project data\taobao\phone\noun.txt', 'a') as f:
                        f.write(' '.join(n_list))
                        f.write('\n')
                si_p = []
                u_list = ['小孩子', '作业', '高中', '初中', '儿童', '学校', '小孩', '老师', '网瘾', '中学生', '小学', '女儿', '小学生', '孩子', '闺女', '儿子', '学生', '网课', '小朋友',
                            '同事', '表弟', '亲戚', '姐妹', '表哥', '邻居', '同学', '朋友', '盆友', '链接',
                            '姥姥', '老太太', '老人', '岳母', '父亲', '老娘', '小姨', '老丈人', '舅舅', '岳父', '亲人', '老妈子', '老头儿', '婆婆', '老太', '老头子', '父母', '家婆', '老父亲', '老爹', '长辈', '大人', '外爷', '爷爷', '我爸', '老头', '老妈', '老爷子', '爸妈', '奶奶', '老伴', '老爸', '母亲', '老人家', '妈妈', '公公', '爸爸', '丈母娘', '姥爷', '家里人', '家人',
                            '老奶奶', '小伙子', '阿姨', '娘娘', '小姑子', '姐姐', '老妹', '婶婶', '大姐', '外孙', '小屁孩', '孙子', '姨妈', '棉袄', '伯母', '孝心',
                            '媳妇', '妹妹', '男朋友', '对象', '生日', '女朋友', '男票', '老婆', '弟弟', '情人节', '爹妈', '麻麻', '老公', '外甥', '老弟'
                ]
                # flat similarity list: entry n * len(u_list) + u compares n_list[n] with u_list[u]
                for n in range(len(n_list)):
                    for u in range(len(u_list)):
                        try:
                            si_p.append(self.co_model.wv.similarity(n_list[n], u_list[u]))
                        except KeyError:
                            si_p.append(0)
                # flat index of the single best (noun, user-group word) pair
                index_list = heapq.nlargest(1, range(len(si_p)), key=si_p.__getitem__)
                user_list = []
                for index in index_list:
                    # integer division recovers the noun index from the flat index
                    index = index // len(u_list)
                    user_list.append(n_list[index])
                with open(r'F:\pycharm project data\taobao\phone\user.txt', 'a') as u:
                    u.write(' '.join(user_list))
                    u.write('\n')
```
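To make the SBV/ATT walk concrete: in a sentence like '手机屏幕很清晰,给儿子买的', the adjective '清晰' typically gets the subject (SBV) child '屏幕', which in turn carries the attributive (ATT) child '手机', so the emitted tag should be '手机屏幕清晰'; '儿子' is close to the user-group lexicon, so it should land in user.txt. A hedged sketch (the sentence is illustrative, and the exact parse depends on the LTP models):

```python
# Illustrative run; parses and output files depend on the local models/paths.
extractor = Extractor()
extractor.get_seginfo(['手机屏幕很清晰,给儿子买的'])
```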

Step 3: Test the data and the model

```python
# -*- coding:utf-8 -*-
import pandas as pd
from extractor import Extractor

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_colwidth', 30)
pd.set_option('display.width', 1000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# 1. Data preparation
# load the scraped comments
df = pd.read_csv(r'F:\pycharm project data\taobao\phone\comment1.csv', encoding='utf-8-sig')
# pull out the review text and drop Taobao's system-default placeholder reviews
co_df = df[['content']]
co_df = co_df.loc[co_df['content'] != '15天内买家未作出评价', ['content']]
co_df = co_df.loc[co_df['content'] != '评价方未及时做出评价,系统默认好评!', ['content']]
comment_list = co_df['content'].tolist()




if __name__ == '__main__':
    myextractor = Extractor()
    myextractor.get_seginfo(comment_list)
```
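All three steps load `coseg_text.model`, but the original post does not show how it was trained. Here is a minimal sketch, under the assumption that it is a gensim Word2Vec model fitted on the segmented comment corpus (the file name `seg_corpus.txt` and the hyperparameters are assumptions):

```python
# Hypothetical training script for coseg_text.model; the corpus path and
# hyperparameters are assumptions, not taken from the original post.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# seg_corpus.txt: one comment per line, tokens separated by spaces
# (e.g. the cleaned comments run through pyltp's Segmentor)
sentences = LineSentence('seg_corpus.txt')
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)  # use size= instead of vector_size= on gensim 3.x
model.save('coseg_text.model')
```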




Reposted from blog.csdn.net/stay_foolish12/article/details/112677591