Twitter Sentiment Analysis with Spark

Reference: using Spark MLlib
Dataset downloads: since accessing Twitter requires getting around the firewall, to make the follow-up experiments easier we have saved the data into tweets.json, donald.json, and hillary.json; the URLs of the JSON files are given in the experiment steps. tweets.json contains tweets about both Trump and Hillary, while donald.json and hillary.json each contain only tweets about the candidate named in the file.
Download tweets.json
Download donald.json
Download hillary.json
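
Judging from the field accesses in the script below (rawTrn_data['results'], obj['text'], obj['polarity']), each JSON file presumably looks like the sketch below; the tweet text and label values here are made up for illustration:

{"results": [
  {"text": "I will vote for Hillary!", "polarity": 2},
  {"text": "...", "polarity": 0}
]}

Here polarity is the sentiment label; the random forest below is trained with numClasses=3, so the label is assumed to take one of three values.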

# -*- coding: utf-8 -*-
from __future__ import print_function
import json
import re
import string
import numpy as np

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.regression import LabeledPoint
# Reference: https://www.cnblogs.com/mrchige/p/6346601.html?utm_source=itdadao&utm_medium=referral
# Determine the sentiment polarity of the tweets
# Tokenize the tweet text
# Remove stopwords, punctuation, URLs, etc.
remove_spl_char_regex = re.compile('[%s]' % re.escape(string.punctuation))  # regex to remove special characters
stopwords = [u'rt', u're', u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your',
             u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers',
             u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what',
             u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were',
             u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a',
             u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by',
             u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after',
             u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under',
             u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all',
             u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not',
             u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don',
             u'should', u'now']
# tokenize() normalizes a tweet's text and splits it into word tokens
def tokenize(text):
    tokens = []
    text = text.encode('ascii', 'ignore')  # drop non-ASCII characters
    text = text.decode('utf-8')  # back to str (Python 3)

    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '',
                  text)  # strip URLs
    text = remove_spl_char_regex.sub(" ", text)  # Remove special characters
    text = text.lower()
    for word in text.split():
        if word not in stopwords \
                and word not in string.punctuation \
                and len(word) > 1 \
                and word != '``':
            tokens.append(word)
    return tokens
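# A hypothetical example of what tokenize() produces:
#   tokenize('RT @user: Vote for Hillary! https://t.co/abc')
# strips the URL, replaces punctuation with spaces, lowercases the text,
# and drops the stopwords 'rt' and 'for', returning
#   ['user', 'vote', 'hillary']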
# Convert a tokenized document into a feature vector by averaging word vectors
def doc2vec(document):
    # 100-dimensional vector, matching the pretrained word2vec model
    doc_vec = np.zeros(100)
    tot_words = 0

    for word in document:
        # look up the word in the broadcast word2vec table
        vec = lookup_bd.value.get(word)
        # if the word is in the pretrained model, add its vector
        if vec is not None:
            doc_vec += np.array(vec)
            tot_words += 1
    # average over the words that were found; if none were found, return
    # the zero vector instead of dividing by zero
    if tot_words == 0:
        return doc_vec
    return doc_vec / float(tot_words)
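# A hypothetical illustration: with a broadcast table such as
#   {'vote': [...], 'hillary': [...]}  # 100-d vectors in practice
# doc2vec(['vote', 'hillary', 'zzz']) averages the two known vectors and
# ignores 'zzz', which has no entry in the table.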
conf = SparkConf().setAppName("sentiment_analysis")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlContext = SQLContext(sc)
# load the pretrained word2vec lookup table (word -> vector) stored as parquet
lookup = sqlContext.read.parquet("word2vecM_simple/data").alias("lookup")
print("------------------------------------------------------")
lookup.printSchema()
# collect the (word, vector) pairs into a dict and broadcast it to the executors
lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
print("------------------------------------------------------")
# read tweets.json as the training set for the classifier
with open('tweets.json', 'r') as f:
    rawTrn_data = json.load(f)

trn_data = []
for obj in rawTrn_data['results']:
    token_text = tokenize(obj['text'])  # normalize and tokenize the tweet text
    tweet_text = doc2vec(token_text)  # convert the tokens to a feature vector
    # LabeledPoint pairs the tweet's sentiment label (polarity) with its
    # feature vector, the input type Spark MLlib classifiers expect
    trn_data.append(LabeledPoint(obj['polarity'], tweet_text))
trnData = sc.parallelize(trn_data)
#print(trnData)
print("------------------------------------------------------")

# read hillary.json as the test set for the classifier
with open('hillary.json', 'r') as f:
    rawTst_data = json.load(f)
tst_data = []
for obj in rawTst_data['results']:
    token_text = tokenize(obj['text'])
    tweet_text = doc2vec(token_text)
    tst_data.append(LabeledPoint(obj['polarity'], tweet_text))

tst_dataRDD = sc.parallelize(tst_data)
# train a random forest classifier
model = RandomForest.trainClassifier(trnData, numClasses=3, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
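# NaiveBayes is imported above but never used; as a sketch, the analogous
# training call would be
#   model = NaiveBayes.train(trnData, 1.0)
# though note that MLlib's NaiveBayes requires non-negative features, which
# averaged word2vec vectors generally are not.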

# evaluate the trained model on the test set
predictions = model.predict(tst_dataRDD.map(lambda x: x.features))
labelsAndPredictions = tst_dataRDD.map(lambda lp: lp.label).zip(predictions)
# compute the classification error rate (Python 3 lambdas cannot unpack tuples)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(tst_dataRDD.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
# print the learned random forest model
print(model.toDebugString())
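
To run the script, submit it to Spark (local mode also works); assuming it is saved as sentiment_analysis.py and the three JSON files plus the word2vecM_simple directory sit in the working directory:

spark-submit sentiment_analysis.py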


Reposted from blog.csdn.net/qq_37355731/article/details/81003130