9. Authorship Attribution

# -*- coding: utf-8 -*-
"""
Created on Sun Oct  7 09:00:32 2018

@author: asus
"""
#9 Authorship attribution

#9.1.3 Getting the data
import os
import sys
data_folder = os.path.join(
        "E:\\books\Python数据挖掘入门与实践\作者归属问题,支持向量机",
        "Data", "books")
#The data could be downloaded with a crawler, but that would take too long

#Skip the Project Gutenberg boilerplate when loading each file
def clean_book(document):
    lines = document.split("\n") 
    #Walk through every line of the document, looking for where the work starts and ends; the text in between is the book's content.
    start = 0
    end = len(lines)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("*** START OF THIS PROJECT GUTENBERG"):
            start = i + 1
        elif line.startswith("*** END OF THIS PROJECT GUTENBERG"):
            end = i - 1
    #Finally, join the remaining lines with newline characters to get the work's content.
    return "\n".join(lines[start:end])
#Create a function that loads all the books and applies the preprocessing above
import numpy as np
#Declare the book-loading function; its parameter is the books directory, which contains a set of
#subfolders named after the authors, and the book files live inside those subfolders
def load_books_data(folder=data_folder):
    documents = []
    authors = []
    #Get all the subfolders under the books directory
    subfolders = [subfolder for subfolder in os.listdir(folder) if 
                  os.path.isdir(os.path.join(folder, subfolder))]
    #Iterate over these subfolders, using enumerate to assign an index to each one
    for author_number, subfolder in enumerate(subfolders):
        #Build the full path of the subfolder and look up all the book files inside it
        full_subfolder_path = os.path.join(folder, subfolder)
        for document_name in os.listdir(full_subfolder_path):
            with open(os.path.join(full_subfolder_path, document_name)) as inf:
                documents.append(clean_book(inf.read()))
                authors.append(author_number)
    return documents, np.array(authors, dtype='int')
documents, classes = load_books_data(data_folder)

#9.2 Function words
#Counting function words
function_words = ["a", "able", "aboard", "about", "above", "absent",
                  "according" , "accordingly", "across", "after", "against",
                  "ahead", "albeit", "all", "along", "alongside", "although",
                  "am", "amid", "amidst", "among", "amongst", "amount", "an",
                    "and", "another", "anti", "any", "anybody", "anyone",
                    "anything", "are", "around", "as", "aside", "astraddle",
                    "astride", "at", "away", "bar", "barring", "be", "because",
                    "been", "before", "behind", "being", "below", "beneath",
                    "beside", "besides", "better", "between", "beyond", "bit",
                    "both", "but", "by", "can", "certain", "circa", "close",
                    "concerning", "consequently", "considering", "could",
                    "couple", "dare", "deal", "despite", "down", "due", "during",
                    "each", "eight", "eighth", "either", "enough", "every",
                    "everybody", "everyone", "everything", "except", "excepting",
                    "excluding", "failing", "few", "fewer", "fifth", "first",
                    "five", "following", "for", "four", "fourth", "from", "front",
                    "given", "good", "great", "had", "half", "have", "he",
                    "heaps", "hence", "her", "hers", "herself", "him", "himself",
                    "his", "however", "i", "if", "in", "including", "inside",
                    "instead", "into", "is", "it", "its", "itself", "keeping",
                    "lack", "less", "like", "little", "loads", "lots", "majority",
                    "many", "masses", "may", "me", "might", "mine", "minority",
                    "minus", "more", "most", "much", "must", "my", "myself",
                    "near", "need", "neither", "nevertheless", "next", "nine",
                    "ninth", "no", "nobody", "none", "nor", "nothing",
                    "notwithstanding", "number", "numbers", "of", "off", "on",
                    "once", "one", "onto", "opposite", "or", "other", "ought",
                    "our", "ours", "ourselves", "out", "outside", "over", "part",
                    "past", "pending", "per", "pertaining", "place", "plenty",
                    "plethora", "plus", "quantities", "quantity", "quarter",
                    "regarding", "remainder", "respecting", "rest", "round",
                    "save", "saving", "second", "seven", "seventh", "several",
                    "shall", "she", "should", "similar", "since", "six", "sixth",
                    "so", "some", "somebody", "someone", "something", "spite",
                    "such", "ten", "tenth", "than", "thanks", "that", "the",
                    "their", "theirs", "them", "themselves", "then", "thence",
                  "therefore", "these", "they", "third", "this", "those",
"though", "three", "through", "throughout", "thru", "thus",
"till", "time", "to", "tons", "top", "toward", "towards",
"two", "under", "underneath", "unless", "unlike", "until",
"unto", "up", "upon", "us", "used", "various", "versus",
"via", "view", "wanting", "was", "we", "were", "what",
"whatever", "when", "whenever", "where", "whereas",
"wherever", "whether", "which", "whichever", "while",
                  "whilst", "who", "whoever", "whole", "whom", "whomever",
"whose", "will", "with", "within", "without", "would", "yet",
"you", "your", "yours", "yourself", "yourselves"]
#With the function word list in place, we create the function-word counter
from sklearn.feature_extraction.text import CountVectorizer
extractor = CountVectorizer(vocabulary=function_words)
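#A quick, illustrative check (not part of the book's code; the sample sentence is made up): with a
#fixed vocabulary, fit_transform only counts the listed function words and ignores everything else.
sample_counts = extractor.fit_transform(["the cat sat on the mat because it was hers"])
print(sample_counts.sum())  #total number of function-word occurrences in the sample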

#9.2.2 Classifying with function words
#Support vector machine classifier (SVC)
from sklearn.svm import SVC
#from sklearn.cross_validation import cross_val_score  (superseded, see below)
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
#from sklearn import grid_search  (superseded)
#That module was deprecated in version 0.18; the refactored classes and functions were moved to the model_selection module
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import train_test_split

#The SVM takes a number of parameters, which we organize in a dictionary. The kernel parameter
#takes linear and rbf, and C takes the value 1 or 10. Grid search is then used to find the best parameters.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
#Gaussian kernels (such as rbf) are only suitable for relatively small datasets, for example with fewer than about 10,000 features.
svr = SVC()
grid = GridSearchCV(svr, parameters)
#Next, create a pipeline with two steps, feature extraction and parameter search: the features
#(function words only) are extracted with the CountVectorizer class, and the parameter search uses the SVM.
pipeline1 = Pipeline([('feature_extraction', extractor),
                      ('clf', grid)])
#Then cross-validate the pipeline's results with cross_val_score
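#Note: this is a multiclass problem; recent scikit-learn versions only accept scoring='f1' for
#binary targets, so an averaged variant such as scoring='f1_macro' may be needed there.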
scores = cross_val_score(pipeline1, documents, classes, scoring='f1')
print(np.mean(scores))

#9.3 Support vector machines
#An SVM is a binary classifier. Suppose we have data from two classes that happen to be separable by
#a line: all points on one side of the line belong to one class and all points on the other side to
#the other. The SVM finds this line, and more precisely the best separating line, the one that
#maximizes the distance between the points and the separating line.

#9.3.1 Classifying with SVMs
#The C parameter relates to how much of the training data the classifier tries to classify correctly,
#at the risk of overfitting. The higher the C, the smaller the margin, meaning the classifier tries
#to classify all the data correctly; the lower the C, the larger the margin, and some data may not be
#classified correctly. A lower C makes overfitting the training data less likely, but classification
#performance may be somewhat worse.
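#A minimal sketch (synthetic data, not from the book) of the effect of C: fit a linear SVC with a
#small and a large C on a toy dataset and compare how many support vectors each one keeps.
from sklearn.datasets import make_blobs
X_demo, y_demo = make_blobs(n_samples=40, centers=2, random_state=14)
for C_value in (0.01, 10):
    demo_svm = SVC(kernel='linear', C=C_value).fit(X_demo, y_demo)
    print(C_value, demo_svm.n_support_)  #a smaller C (larger margin) usually keeps more support vectors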

#9.3.2 Kernels
#If the data is not linearly separable, it needs to be projected into a higher-dimensional space by
#adding pseudo-features until it becomes linearly separable.
#Finding the best separating line usually requires computing the inner product (also called the dot
#product) between individual samples.
#Several kernel functions are in common use: the linear kernel, the Gaussian kernel (rbf) and the
#Sigmoid kernel. These kernels give an effective way of measuring the distance between the two
#classes of data.
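#A small sketch (synthetic make_circles data, not from the book): a linear kernel cannot separate two
#concentric circles, while the rbf kernel handles them by implicitly mapping to a higher-dimensional space.
from sklearn.datasets import make_circles
X_circles, y_circles = make_circles(n_samples=200, noise=0.05, factor=0.5, random_state=14)
for kernel_name in ('linear', 'rbf'):
    kernel_scores = cross_val_score(SVC(kernel=kernel_name), X_circles, y_circles, cv=5)
    print(kernel_name, np.mean(kernel_scores))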

#9.4 Character n-grams
#An n-gram is a sequence of N objects taken as a group, where N is the number of objects per group
#(for text, N usually takes a value between 2 and 6). Word-based n-grams are widely used in research
#that is usually concerned with document topics; character-based n-grams, however, have been shown to
#work well for authorship attribution.

#Extracting character n-grams
#We next use the CountVectorizer class to extract n-grams, setting the analyzer parameter and
#specifying the value of N. scikit-learn's n-gram extractor provides an ngram_range parameter that
#allows n-grams of different lengths to be extracted. We don't need that here; to extract a single
#length, set both values of ngram_range to the same value.
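#To see what character 3-grams look like (illustrative example, sample string made up), fit a
#CountVectorizer on a short string and list the extracted features.
ngram_demo = CountVectorizer(analyzer='char', ngram_range=(3, 3))
ngram_demo.fit(["authorship attribution"])
print(ngram_demo.get_feature_names())  #e.g. 'aut', 'uth', 'tho'; use get_feature_names_out() on newer scikit-learn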

pipeline = Pipeline([
        ('feature_extraction', CountVectorizer(analyzer='char',
                                                ngram_range=(3, 3))),
        ('classifier', grid)])
scores = cross_val_score(pipeline, documents, classes, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))

#9.5 Using the Enron dataset
#https://www.cs.cmu.edu/~./enron/
#On non-Linux systems, use a free tool such as 7-zip (http://www.7-zip.org/) to extract the archive
import os

#Point to where the dataset is stored
enron_data_folder = os.path.join(
        "E:\\books\Python数据挖掘入门与实践\作者归属问题,支持向量机", 
                                 "enron_mail_20150507.tar", "maildir", "blair")

#9.5.2 Creating the dataset loader
#We now create a function that takes a number of senders as a parameter and returns the emails they
#sent. The useful information is the email body rather than the raw message, so we also need an
#email parser.
from email.parser import Parser
p = Parser()
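#A tiny illustration (made-up message, not from the dataset): parsestr turns a raw message string
#into an email.message.Message, and get_payload() returns the body text we are after.
demo_email = p.parsestr("From: someone@example.com\nSubject: demo\n\nThis is the body text.")
print(demo_email.get_payload())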
#Set the random state
from sklearn.utils import check_random_state
#The min_docs_author parameter requires each sender to have sent at least 10 emails, and the
#max_docs_author parameter takes at most 100 emails from any one user. num_authors limits the
#number of authors, with a default of 10.
def get_enron_corpus(num_authors=10, data_folder=enron_data_folder,
                     min_docs_author=10, max_docs_author=100,
                     random_state=None):
    random_state = check_random_state(random_state)
    #Get the Enron employees' mailbox folders and put them in a random order.
    email_addresses = sorted(os.listdir(data_folder))
    random_state.shuffle(email_addresses)
    #Create the document list and class list; author_num is the class number assigned to each new sender.
    documents = []
    classes = []
    author_num = 0
    #We also record which authors we used and their numbers.
    authors = {}
    #Next, walk through each mailbox folder and look for the subfolders whose names contain "sent", i.e. the sent-mail folders.
    for user in email_addresses:
        users_email_folder = os.path.join(data_folder, user)
        mail_folders = [os.path.join(users_email_folder, subfolder)
                        for subfolder in os.listdir(users_email_folder)
                        if "sent" in subfolder]
        #Read every email in those subfolders
        try:
            authored_emails = [open(os.path.join(mail_folder, email_filename),
                                    encoding='cp1252').read()
                               for mail_folder in mail_folders
                               for email_filename in os.listdir(mail_folder)]
        except IsADirectoryError:
            continue
        if len(authored_emails) < min_docs_author:
            continue
        if len(authored_emails) > max_docs_author:
            authored_emails = authored_emails[:max_docs_author]
        #Extract the email bodies and add them to the dataset
        contents = [p.parsestr(email)._payload for email in authored_emails]
        documents.extend(contents)
        #Add this sender to the class list, once for every email.
        classes.extend([author_num] * len(authored_emails))
        #Record this sender's number, then add 1 so the next sender gets a new one
        authors[user] = author_num
        author_num += 1
        #Check whether we have reached the requested number of authors; if so, break out of the loop and return the dataset
        if author_num >= num_authors or author_num >= len(email_addresses):
            break
    return documents, np.array(classes), authors
documents, classes, authors = get_enron_corpus(data_folder=enron_data_folder,
                                               random_state=14)

import quotequail  #used by the helper function below to strip quoted replies
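#quotequail is a third-party library (installable with pip) for detecting quoted replies and
#forwards in email text; unwrap() may return None when it finds no such structure, so the helper
#below falls back to the original content in that case.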
def remove_replies(email_contents):
    r = quotequail.unwrap(email_contents)
    if r is None:
        return email_contents
    if 'text_top' in r:
        return r['text_top']
    elif 'text' in r:
        return r['text']
    return email_contents
documents = [remove_replies(document) for document in documents]

#9.5.3 Putting it all together
scores = cross_val_score(pipeline, documents, classes, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))

#9.5.4 Evaluation
from sklearn.model_selection import train_test_split
training_documents, testing_documents, y_train, y_test = train_test_split(
        documents, classes, random_state=14)
pipeline.fit(training_documents, y_train)
y_pred = pipeline.predict(testing_documents)
#Show the best parameters found by the grid search
print(pipeline.named_steps['classifier'].best_params_)
#Create the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#Normalize each row (actual class) so the cells show proportions instead of raw counts
cm = cm / cm.astype(np.float64).sum(axis=1)[:, np.newaxis]

sorted_authors = sorted(authors.keys(), key=lambda x:authors[x])
%matplotlib inline
from matplotlib import pyplot as plt
plt.figure(figsize=(10,10))
plt.imshow(cm, cmap='Blues')
tick_marks = np.arange(len(sorted_authors))
plt.xticks(tick_marks, sorted_authors)
plt.yticks(tick_marks, sorted_authors)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()



Reposted from blog.csdn.net/qq_39124646/article/details/83147327