Machine Learning Project: Spam Email Classification



I. Data Cleaning
(1) Start with data cleaning; the cleaned result is referred to as "clean data".
    The key step is to decide, based on the business scenario, which features are worth extracting;
    if you are not familiar with the scenario yourself, ask colleagues with more domain experience.

    Example: compare how two (already tokenized) messages differ:
    ① 我司/代开/发票 ("our company issues invoices on others' behalf")································1 (spam)
    ② 月底/了/,/请/将/本月/发票/统一/装订/ ("it's the end of the month, please bind this month's invoices together")················0 (ham)

(2) During cleaning you can also extract every feature that might plausibly influence the result,
    analyse each feature's effect on the result one by one, and then delete the features that turn out to be irrelevant.
(3) First, build a label dictionary from the label values in the index file
       (an index line such as "spam ../data/000/000" becomes the entry {'/000/000': '1'});
       next, extract the previously chosen features from each unformatted email;
       finally, turn the extracted features into one row per email and save all 216*300 rows into one large table.

II. Feature Engineering
 (1) Match the domain names in the from & to fields and look for a relationship between them and the label:
         if the relationship is close and obvious, convert the domains into numeric values;
         if there is no real relationship, drop these columns.
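
    A minimal sketch of this check (the extract_domain helper and the toy DataFrame are illustrative; the feature script below does the same job with 获取邮件收发地址):

import re
import pandas as pd

def extract_domain(addr):
    """Pull the mail-server domain out of an address such as 'xxx@163.com'."""
    m = re.search(r"@([A-Za-z0-9]+\.[A-Za-z0-9.]+)", str(addr))
    return m.group(1) if m else "unknown"

# toy data standing in for the cleaned emails
df = pd.DataFrame({
    "from": ['"yan" <yan@163.com>', "ad@spamhost.cn", float("nan")],
    "label": [0, 1, 1],
})
df["from_address"] = df["from"].map(extract_domain)
# a cross-tab shows whether any sending domain is strongly tied to the label
print(pd.crosstab(df["from_address"], df["label"]))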

  (2) Extract the values of the date field with a regular expression, and split the parsed, formatted times into buckets:
        8~13=0; 13~19=1; 19~24=2; 24~8=3. Then look for a relationship between the time bucket and the label:
        if the relationship is clear and obvious, keep the time bucket as a numeric feature, otherwise drop it
        (a small sketch follows the format examples below).
        Date values of length 16: ['2005-9-2 上午11:04', '2005-9-2 上午10:55', '2005-9-2 上午10:55',
                                   '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55',
                                   '2005-9-2 上午10:55']
        Date values of length 19: ['Sep 23 2005 1:04 AM']
        Date values of length 21: ['August 24 2005 5:00pm', 'August 24 2005 5:00pm', 'August 24 2005 5:00pm']

        The analysis shows that the time bucket cannot separate spam from ham. However, a normal email always carries a date,
        while a spam email may not, so we can construct a new 'has_date' column and use it as a feature for training and prediction.
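
        A small sketch of the hour-to-bucket mapping and of the derived has_date flag (the helper names are illustrative; the parsing is simplified and only handles the 'Tue 30 Aug 2005 10:08:15 +0800' style of date):

import re
import pandas as pd

def hour_to_bucket(hour):
    """Map an hour of day onto the buckets 8~13=0, 13~19=1, 19~24=2, 24~8=3."""
    if hour < 8:
        return 3
    elif hour < 13:
        return 0
    elif hour < 19:
        return 1
    else:
        return 2

def parse_hour(date_str):
    """Very simplified: pull the hour out of strings like 'Tue 30 Aug 2005 10:08:15 +0800'."""
    m = re.search(r"(\d{1,2}):\d{2}:\d{2}", str(date_str))
    return int(m.group(1)) if m else None

dates = pd.Series(["Tue 30 Aug 2005 10:08:15 +0800", None, "unknown"])
hours = dates.map(parse_hour)
time_quantum = hours.map(lambda h: "unknown" if pd.isna(h) else hour_to_bucket(int(h)))
# spam often carries no parseable date at all, so the flag itself becomes the feature
has_date = hours.map(lambda h: 0 if pd.isna(h) else 1)
print(pd.DataFrame({"hour": hours, "time_quantum": time_quantum, "has_date": has_date}))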
        
        Counting email lengths shows that extremely short and extremely long emails are very likely to be spam, so we can fit
        natural-exponential and logarithmic functions to encode the information carried by the email length. Although the resulting
        value can exceed 1, it behaves like a probability: the larger the value, the more likely the email is spam, and the smaller
        the value, the less likely it is.
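
        The feature script below implements this with the 长度信息量计算 function; here is a self-contained, equivalent copy to show how the curve behaves (per the description above, larger values are meant to indicate a higher spam probability):

import numpy as np

def length_signal(x):
    """Hand-tuned length score, equivalent to 长度信息量计算 in fetureExtract.py below."""
    base = 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
    if x > 10000:
        return base - np.log(abs(x - 10000)) + 1
    return base

for length in (10, 500, 5000, 50000):
    print(length, round(length_signal(length), 3))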

        Finally, the email content has to be segmented into words (tokenized).
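
        Tokenization is done with jieba; a one-line example (the exact segmentation depends on jieba's dictionary):

import jieba

print(" ".join(jieba.cut("月底了,请将本月发票统一装订")))
# expected output is roughly: 月底 了 , 请 将 本月 发票 统一 装订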

III. Modeling

    (1) TF-IDF transformation of the tokenized email content (a bag-of-words representation can also be used).
    (2) PCA or SVD dimensionality reduction of the TF-IDF matrix.
    (3) Feed the reduced data into the model for training and testing.
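
    The three steps map directly onto scikit-learn. Below is a compact sketch of the same chain (TF-IDF → TruncatedSVD → BernoulliNB) written as a Pipeline; the scripts further down do the same thing step by step, use n_components=20 on the full corpus (the toy corpus here is too small for that), and additionally append the has_date and content_length_sema columns before fitting:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(norm="l2", use_idf=True)),
    ("svd", TruncatedSVD(n_components=2)),          # the scripts below use 20 components
    ("nb", BernoulliNB(alpha=1.0, binarize=0.0005)),
])
# toy corpus: already-tokenized email bodies and their 0/1 spam labels
texts = ["我司 代开 发票", "月底 了 请 将 本月 发票 统一 装订", "会议 纪要 请 查收 附件"]
labels = [1, 0, 0]
pipeline.fit(texts, labels)
print(pipeline.predict(["代开 发票 优惠"]))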
 

Data processing script: dataProcess.py

#encoding:utf-8
import os
import sys   # only used for debugging (sys.exit)
import time  # used to time the run

print("start")
def 制作标签字典(file_path):
    type_dict = {"spam": "1", "ham": "0"}
    index_file = open(file_path)  # ./data/full/index
    index_dict = {}  # the label dictionary we are building
    try:
        for line in index_file:
            arr = line.split(" ")
            # e.g. ['spam', '../data/000/000']
            if len(arr) == 2:
                key, value = arr
                # add the entry to the dictionary
                value = value.replace("../data", "").replace("\n", "")
                # e.g. '/000/000'
                index_dict[value] = type_dict[key.lower()]
                # e.g. {'/000/000': '1'}
                #      {'/000/001': '0'}
    finally:
        index_file.close()
    return index_dict
# read the content of one email file into a dictionary
def 字典化邮件文本内容(file_path):
    # example file_path: './data/data/000/000'
    file = open(file_path, "r", encoding="gb2312", errors='ignore')
    content_dict = {}
    try:
        is_content = False  # becomes True once the header block ends
        for line in file:
            # strip whitespace from both ends, gradually approaching structured data
            line = line.strip()
            if line.startswith("From:"):
                # From: "yan"<(8月27-28,上海)培训课程>
                content_dict['from'] = line[5:]
                # e.g. {'from': '"yan"<(8月27-28,上海)培训课程>'}
            elif line.startswith("To:"):
                content_dict['to'] = line[3:]
                # [email protected]
            elif line.startswith("Date:"):
                content_dict['date'] = line[5:]
                # Tue, 30 Aug 2005 10:08:15 +0800
            elif not line:
                # the first empty line separates the header from the body
                is_content = True

            # accumulate the body; is_content stays True for the rest of the file
            if is_content:
                if 'content' in content_dict:
                    content_dict['content'] += line
                    # content_dict['content'] = content_dict['content'] + line
                else:
                    content_dict['content'] = line
    finally:
        file.close()
        # the file handle is released here
    return content_dict

# turn one email into a single comma-separated line
def 字典转文本(file_path):
    # first read the raw email into a dictionary
    content_dict = 字典化邮件文本内容(file_path)

    # then flatten the dictionary into one CSV line: from,to,date,content
    result_str = content_dict.get('from', 'unknown').replace(',', '').strip() + ","
    result_str += content_dict.get('to', 'unknown').replace(',', '').strip() + ","
    result_str += content_dict.get('date', 'unknown').replace(',', '').strip() + ","
    result_str += content_dict.get('content', 'unknown').replace(',', ' ').strip()
    return result_str
    
# run the processing
start = time.time()  # start timer
index_dict = 制作标签字典('./data/full/index')
# e.g. {'/000/000': '1',
#       '/000/001': '0', ...}
# index_dict = 制作标签字典('C:\\Users/Administrator/Desktop/index')  # ('./data/full/index')
# print(index_dict)
# sys.exit(0)
list0 = os.listdir('./data/data')  # names of the sub-folders
# e.g. 000, 001, 002, ..., 215

for l1 in list0:
    # loop over the folders; each folder is written to its own intermediate file
    l1_path = './data/data/' + l1
    # l1_path = './data/data/000' etc., the folder being processed
    print('Processing folder ' + l1_path)
    list1 = os.listdir(l1_path)  # all file names inside this folder
    # e.g. [000, 001, ...]

    write_file_path = './data/process01_' + l1
    # ./data/process01_000, ./data/process01_001, ...
    # 216 intermediate files in total, roughly 300 lines each (one line per email)
    with open(write_file_path, "w", encoding= 'utf-8') as writer:
        for l2 in list1:
            # loop over the files in this folder
            l2_path = l1_path + "/" + l2  # full path of the email to process

            index_key = "/" + l1 + "/" + l2

            if index_key in index_dict:
                # e.g. {'/000/000': '1'}
                content_str = 字典转文本(l2_path)
                content_str += "," + index_dict[index_key] + "\n"
                writer.writelines(content_str)
# merge the 216 intermediate files into one big file
with open('./data/result_process01',"w", encoding ='utf-8') as writer:
    for l1 in list0:
        file_path= './data/process01_' + l1
        print("Merging file: " + file_path)

        with open(file_path, encoding = 'utf-8') as file:
            for line in file:
                writer.writelines(line)
# the two nested loops above handle roughly 216 * 300 ≈ 60,000+ emails

end = time.time()
print('Total data processing time: %.2f s' % (end - start))

    
        

Feature engineering script: fetureExtract.py

import pandas as pd
import numpy as np


import matplotlib as mpl
import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
import re
import time
import jieba
import sys

# mpl.rcParams['font.sans-serif'] = [u'simHei']   # use the SimHei font so Chinese labels render correctly
# mpl.rcParams['axes.unicode_minus'] = False      # keep minus signs from being drawn as squares
# plt.title(u'我是中文')
# get_ipython().magic(u'matplotlib tk')
# columns: from, to, date, content, label
df = pd.read_csv('./data/result_process01', sep = ',', header = None, names= ['from','to', 'date', 'content','label'])
# print(df.head(10))
# print(df.tail(10))
# print(df.info())
# sys.exit("line 20")


# analyse how the sending / receiving address relates to the label
def 获取邮件收发地址(strl):  # extract the mail-server domain from an address
    it = re.findall(r"@([A-Za-z0-9]*\.[A-Za-z0-9\.]+)", str(strl))  # regex match on the part after '@'
    result = ''
#     result =[]
#     result = {}
    if len(it)>0:
        result = it[0]
    else:
        result = 'unknown'
    return result
'''result0 = 获取邮件收发地址(df['from'])
print(df['from'])
print(df['from'].shape)
print(result0)
sys.exit(0)'''
df['from_address'] = pd.Series(map(lambda s: 获取邮件收发地址(s), df['from']))  # map over the column and add it as a new feature
'''print(df.from_address.value_counts())
sys.exit(0)'''
df['to_address'] = pd.Series(map(lambda s: 获取邮件收发地址(s), df['to']))
print("="*10 + 'to address' + "="*20)#也可以这样写
print(df.to_address.value_counts().head(5))#
print("总邮件接受服务器类别数量为:" + str(df.to_address.unique().shape))#计算服务器的个数
print("="*10 + 'from address' + "= "*20)
print(df.from_address.value_counts().head(10))
print(df[['from_address', 'label']].groupby(['from_address', 'label'])['label'].count())
print(df[['to_address', 'label']].groupby(['to_address', 'label'])['label'].count())
print("邮件发送服务器类别数量为:" + str(df.from_address.unique().shape))
from_address_df = df.from_address.value_counts().to_frame()#转为结构化的输出,带出列名
len_less_10_from_address_count = from_address_df[from_address_df.from_address<=10].shape
print("发送邮件数量小于10封的服务器数量为:" + str(len_less_10_from_address_count))
# from_address_df[from_address_df.from_address<=10].to_csv('./data/fromToResult.csv')
# df.from_address.value_counts().to_csv('./data/fromToResultNoneFrame.csv')
#结论:from和to这两个特征没有用,最后要删除
#===================================================================================================
# np.unique(list(map(lambda t: len(str(t).strip()), df['date'])))  # convert to a list first, then take the unique lengths
print(np.unique(list(map(lambda t: len(str(t).strip()), df['date']))))
# np.unique(list(filter(lambda t: len(str(t).strip())==30, df['date'])))
print((list(filter(lambda t: len(str(t).strip())==3, df['date']))))
#nan
print((list(filter(lambda t: len(str(t).strip())==7, df['date']))))
#unknown
print((list(filter(lambda t: len(str(t).strip())==16, df['date']))))
#2005-9-2 上午11:04
print((list(filter(lambda t: len(str(t).strip())==19, df['date']))))
#Sep 23 2005 1:04 AM
print((list(filter(lambda t: len(str(t).strip())==21, df['date']))))
#August 24 2005 5:00pm
print((list(filter(lambda t: len(str(t).strip())==23, df['date']))))
#Thu 1 Sep 2005 09:42:01
print((list(filter(lambda t: len(str(t).strip())==24, df['date']))))
#Mon 15 Aug 2005 07:04:08
print((list(filter(lambda t: len(str(t).strip())==26, df['date']))))
#Sat 1 Oct 2005 00:12:07 UT
print((list(filter(lambda t: len(str(t).strip())==27, df['date']))))
#Mon 1 Jan 2001 21:40:47 GMT
print((list(filter(lambda t: len(str(t).strip())==28, df['date']))))
#Sun 14 Aug 2005 11:59:22 GMT
print((list(filter(lambda t: len(str(t).strip())==61, df['date']))))
#[ 3  7 16 19 21 23 24 26 27 28 29 30 31 32 33 34 35 36 45 46 57 58 61 62]
# From the prints above: lengths 3, 7, 16, 19 and 21 carry no weekday, so those formats need special handling
def 根据日期长度提取日期特征(str1):#Tue 30 Aug 2005 10:08:15 +0800
    '''
    8~13=0;13~19=1;19~24=2;24~8=3;
    '''
    if not isinstance(str1, str):  # not a string (e.g. NaN), convert it first
        str1 = str(str1)
         
    str_len = len(str1)
    week = ""
    hour = ""
    time_quantum = ""      
    if str_len < 10:
        week = "unknown"
        hour = "unknown"
        time_quantum = "unknown"
        pass
    elif str_len == 16:#2005-9-2 上午11:04
        rex = r"(\d{2}):\d{2}"#只取冒号前面的
        it = re.findall(rex, str1)
        if len(it) == 1:
            hour = it[0]
        else:
            hour = "unknown"
        week = "Fri"
        time_quantum = "0"
        pass
        #['2005-9-2 上午11:04', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55']
    elif str_len == 19: #['Sep 23 2005 1:04 AM']
        week = "Sep"
        hour = "01"
        time_quantum = "3"
        pass
    elif str_len == 21: #['August 24 2005 5:00pm']
        week ="Wed"
        hour = "17"
        time_quantum = "1"
        pass
    else:               #'Fri 2 Sep 2005 08:17:50'  Wed 31 Aug 2005 15:06:36 
        rex = r"([A-Za-z]+\d?[A-Za-z]*) .*?(\d{2}):\d{2}:\d{2}.*"# 加问号保险些# 'Fri 23 Sep 2005 09:39:39 +0800 X-Priority: 3 X-Mailer: FoxMail'
        it = re.findall(rex, str1)
#         print(it)
#         print(len(it))
#         print(len(it[0]))
#         sys.exit('129')
        if len(it) == 1 and len(it[0]) ==2:
            week = it[0][0][-3:]  # it is a list holding one (weekday, hour) tuple
            hour = it[0][1]
            int_hour = int(hour)
            #24~8=3;8~13=0;13~19=1;19~24=2;
            if int_hour <8:
                time_quantum = "3"
            elif int_hour <13:
                time_quantum = "0"
            elif int_hour <19:
                time_quantum = "1"
            else:
                time_quantum = "2"
            pass
        else:
            week = "unknown"
            hour = "unknown"
            time_quantum = 'unknown'
     
    week = week.lower()
    hour = hour.lower()
    time_quantum = time_quantum.lower()
    return(week, hour, time_quantum)
# turn the extracted (week, hour, time_quantum) tuples into three new columns
date_time_extract_result = list(map(lambda st: 根据日期长度提取日期特征(st), df['date']))
df['date_week'] = pd.Series(map(lambda t: t[0], date_time_extract_result))  # first element of each tuple
df['date_hour'] = pd.Series(map(lambda t: t[1], date_time_extract_result))
df['date_time_quantum'] = pd.Series(map(lambda t: t[2], date_time_extract_result))
print("====== weekday field ==========")
print(df.date_week.value_counts().head(3))
print(df[['date_week', 'label']].groupby(['date_week', 'label'])['label'].count())  # take date_week and label, then count per (week, label) pair
print("====== hour field ==========")
print(df.date_hour.value_counts().head(3))
print(df[['date_hour', 'label']].groupby(['date_hour', 'label'])['label'].count())
print("====== time-bucket field ==========")
print(df.date_time_quantum.value_counts().head(3))
print(df[['date_time_quantum', 'label']].groupby(['date_time_quantum', 'label'])['label'].count())
df['has_date'] = df.apply(lambda c: 0 if c['date_week'] == 'unknown' else 1, axis=1)  # axis=1 applies the lambda row by row
# Conclusion: the date values themselves say little about the label, but spam usually has no parseable date at all
# word segmentation ==============================================
print('='*30 + 'Starting word segmentation, this takes roughly 5 minutes ...' + '='*20)
df['content'] = df['content'].astype('str')  # type conversion
df['jieba_cut_content'] = list(map(lambda st: "  ".join(jieba.cut(st)), df['content']))
df.head(4)    
# feature no. 4: email length
def 邮件长度统计(lg):
    # map the raw length onto coarse buckets
    if lg <= 10:
        return 0
    elif lg <= 100:
        return 1
    elif lg <= 500:
        return 2
    elif lg <= 1000:
        return 3
    elif lg <= 1500:
        return 4
    elif lg <= 2000:
        return 5
    elif lg <= 2500:
        return 6
    elif lg <=  3000:
        return 7
    elif lg <= 4000:
        return 8
    elif lg <= 5000:
        return 9
    elif lg <= 10000:
        return 10
    elif lg <= 20000:
        return 11
    elif lg <= 30000:
        return 12
    elif lg <= 50000:
        return 13
    else:
        return 14
print(df['content'])
df['content_length'] = pd.Series(map(lambda st: len(st), df['jieba_cut_content']))  # length of the tokenized content
df['content_length_type'] = pd.Series(map(lambda st: 邮件长度统计(st), df['content_length']))
# print(df.head(10))
df2 = df.groupby(['content_length_type', 'label'])['label'].agg(['count']).reset_index()  # add a 'count' column per (length bucket, label) pair
print(df2)
df3 = df2[df2.label == 1][['content_length_type', 'count']].rename(columns = {'count' : 'c1'})
df4 = df2[df2.label == 0][['content_length_type', 'count']].rename(columns = {'count' : 'c2'})
print(df3)
print(df4)
df5 = pd.merge(df3, df4)  # note the difference between pandas merge and concat
df5['c1_rage'] = df5.apply(lambda r: r['c1'] / (r['c1'] + r['c2']), axis = 1)  # share of spam in each length bucket
df5['c2_rage'] = df5.apply(lambda r: r['c2'] / (r['c1'] + r['c2']), axis = 1)  # share of ham in each length bucket
print(df5)
# plot the two ratios to decide how to encode the length signal
plt.plot(df5['content_length_type'], df5['c1_rage'], label = 'spam ratio')  # ratio vs. length bucket
plt.plot(df5['content_length_type'], df5['c2_rage'], label = 'ham ratio')
plt.grid(True)
plt.legend(loc = 0)  # add the legend
plt.show()
# add the length signal: a hand-fitted curve that mimics the ratios observed above
def 长度信息量计算(x):
    '''Returns a "spamminess" score for the length: the larger the value, the more likely the email is spam, and vice versa.'''
    if x > 10000:
        return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1) - np.log(abs(x - 10000)) + 1
    else:
        return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
a = np.arange(1, 20000)
plt.plot(a, list(map(lambda t: 长度信息量计算(t), a)), label = 'length signal')
# plt.plot(df['content_length'], list(map(lambda t: 长度信息量计算(t), df['content_length'])), label = 'length signal')
plt.grid(True)
plt.legend(loc = 0)
plt.show()
df['content_length_sema'] = list(map(lambda st: 长度信息量计算(st), df['content_length']))          
# print(df.dtypes)  # shows every column's name and dtype
   
df.drop(['from', 'to', 'date', 'from_address', 'to_address', \
         'date_week','date_hour', 'date_time_quantum', 'content', \
         'content_length', 'content_length_type'], axis = 1, inplace=True) 
# print(df.info())
# print(df.head(10)) 
  
df.to_csv('./data/result_process02', encoding='utf-8', index = False)
df.to_csv('./data/result_process02.csv', encoding='utf-8', index = False)

        
          
    

Naive Bayes classification script: bayes.py

#encoding:utf-8
import pandas as pd
import numpy as np
# import matplotlib as mpl
# import matplotlib.pyplot as plt
import sys
import time
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

# mpl.rcParams['font.sans-serif'] = [u'simHei']
# mpl.rcParams['axes.unicode_minus'] = False

df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)  # drop rows that contain any missing value
# print(df.head(5))
# print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)

# print("训练集实例的个数是%d" % x_train.shape[0])
# print("测试集实例的个数是%d" % x_test.shape[0])
# print(x_train.head(10))
# print(x_test.head(10)) 
#================================================================================================
print('='*30 + '对分词后的油价内容做tf-idf转化' + '='*30)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)#加载tf-idf模型
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)# fit_transform(jieba_cut_content)
# df1 = transformer.fit_transform(jieba_cut_content)
print('='*30 + '对tf-idf后的数值矩阵进行svd降维' + '='*30)
svd = TruncatedSVD(n_components=20)#降成20维
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)

print('='*30 + 'Merge the processed matrices' + '='*30)
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])


print('='*30 + 'Load and train the naive Bayes model' + '='*30)
nb = BernoulliNB(alpha = 1.0, binarize = 0.0005)
model = nb.fit(data, y_train)  # train the model

print('='*30 + 'Merge the test-set data matrix' + '='*30)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])

print('='*30 + 'Predict on the test data' + '='*30)
start = time.time()
y_predict = model.predict(data_test)
end = time.time()
print('Prediction time: %0.2f s' % (end - start))

print('='*30 + 'Evaluate the model (precision / recall / F1)' + '='*30)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('='*30 + 'Results' + '='*30)
print('Model precision: %0.5f' % precision)
print('Model recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)
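
A confusion matrix is a useful complement to the three metrics above; using the y_test and y_predict already computed in this script:

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_predict))  # rows = true class (0 = ham, 1 = spam), columns = predicted class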





========================= Tests with other models ============================

decision_tree.py

import pandas as pd
import numpy as np
import jieba
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True) 
print(df.head(5))
print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)
# print("训练集大小%d" % x_train.shape[0])
# print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10)) 
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())

data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())

tree = DecisionTreeClassifier(criterion='gini', max_depth = 5, random_state = 0)  # criterion='entropy' is the alternative
model = tree.fit(data, y_train)

jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)

precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('Precision: %0.5f' % precision)
print('Recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)

# list01 = list(zip(data[0:5], tree.feature_importances_)) 
# list02 = sorted(list01, key = lambda x: x[1], reverse = True)
#  
# print(list02)





gradient_boost_decision_tree.py

'''
Created on 2018-01-26

@author: Administrator
'''
#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import jieba
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True) 
# print(df.head(5))
# print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)
# print("训练集大小%d" % x_train.shape[0])
# print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10)) 
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())

data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())
gbdt = GradientBoostingClassifier(learning_rate=0.01, n_estimators =100, max_depth=3,\
                                  min_samples_split = 50, loss = 'deviance', random_state = 0)
                                  # loss='deviance' is the log-likelihood loss; 'exponential' gives an AdaBoost-style exponential loss
model = gbdt.fit(data, y_train)

jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)

precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('Precision: %0.5f' % precision)
print('Recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)






k_nearest_neighbor.py

'''
Created on 2018-01-26

@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
print(df.head(5))
print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
print(x_train.head(1000))
print(x_test.head(10)) 
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print(data.head(10))
print(data.info())

data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
print(data.head(10))
print(data.info())

knn = KNeighborsClassifier(n_neighbors=5)
model = knn.fit(data, y_train)

jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
print(data_test.head(10))
print(data_test.info())
# start prediction
y_predict = model.predict(data_test)

precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('Precision: %0.5f' % precision)
print('Recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)






random_forest.py

'''
Created on 2018-01-26

@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True) 
# print(df.head(5))
# print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10)) 
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())

data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())

forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=3, random_state=0)
model = forest.fit(data, y_train)

jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)

precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('Precision: %0.5f' % precision)
print('Recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)






support_vector_machine.py

'''
Created on 2018-01-26

@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
print(df.head(5))
print(df.info())

x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
                                                        'content_length_sema']],df['label'],\
                                                    test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
print(x_train.head(1000))
print(x_test.head(10)) 
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print(data.head(10))
print(data.info())

data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
print(data.head(10))
print(data.info())

svm = SVC(C = 1, kernel='rbf', degree = 3, gamma = 0.001)
model = svm.fit(data, y_train)

jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
print(data_test.head(10))
print(data_test.info())
# start prediction
y_predict = model.predict(data_test)

precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)

print('Precision: %0.5f' % precision)
print('Recall: %0.5f' % recall)
print('F1 score: %0.5f' % f1mean)





