Machine Learning - Classification

Preface

Logistic regression mainly involves the following knowledge points:

  1. the sigmoid function,
  2. how to use gradient descent to find the parameters that minimize the loss function (see the sketch after this list),
  3. how to turn binary classification into multi-class classification (one-vs-rest),
  4. adding a regularization term to prevent overfitting by shrinking the feature weights
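
For point 2, here is a minimal NumPy sketch of batch gradient descent on the logistic-regression log-loss (binary case, no intercept). The toy data and variable names are illustrative only, not from the original post.

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def logistic_gd(X, y, lr=0.1, n_iter=1000):
    # batch gradient descent on the average log-loss
    w = np.zeros(X.shape[1])
    for _ in range(n_iter):
        p = sigmoid(X @ w)                 # predicted probabilities
        grad = X.T @ (p - y) / len(y)      # gradient of the average log-loss
        w -= lr * grad
    return w

# toy, linearly separable example
X = np.array([[0.5, 1.0], [1.5, 2.0], [3.0, 4.0], [4.0, 5.0]])
y = np.array([0, 0, 1, 1])
print(logistic_gd(X, y))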

Implementing the sigmoid function in Python

import matplotlib.pyplot as plt
# FontProperties lets the chart display Chinese characters
from matplotlib.font_manager import FontProperties
font = FontProperties(fname=r"c:\windows\fonts\msyh.ttc", size=10)

The sigmoid formula: σ(x) = 1 / (1 + e^(-x))

# Plot the sigmoid function
import numpy as np
plt.figure()
plt.axis([-6, 6, 0, 1])
plt.grid(True)
X = np.arange(-6, 6, 0.1)
y = 1 / (1 + np.e ** (-X))
plt.plot(X, y, 'b-')
plt.show()

(Figure: the S-shaped sigmoid curve plotted over [-6, 6].)

# Below is a real classification case. **Some of the imports may still need adjusting.**
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Binarizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2, VarianceThreshold, GenericUnivariateSelect
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from jieba.posseg import POSTokenizer
from jieba import enable_parallel
import pkg_resources
import glob
from sklearn.metrics import classification_report
# jieba tokenization by part of speech. Excluded POS tags: x: non-word symbols, uj/uv: auxiliary words, f: locative words, m: numerals, c: conjunctions, p: prepositions
tk = POSTokenizer()
excluded = set(['x', 'uj', 'f', 'm', 'c', 'uv', 'p'])
def pos_tokenizer(string):
    res = list()
    for word, pos in tk.lcut(string):
        if pos not in excluded:
            res.append(word)
    return res
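# A quick sanity check of the POS-based tokenizer (the sample sentence below is
# purely illustrative, not from the original data):
# print(pos_tokenizer('患者出现发热和咳嗽等症状'))
# particles, numerals, prepositions, etc. are dropped, leaving content words
# such as nouns and verbs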
# Load the stop-word list (passed to CountVectorizer below to filter stop words)
def load_stopwords(path):
    if not os.path.isfile(path):
        raise OSError("Fail to find file {}".format(path))
    stopwords = list()
    with open(path, 'r',encoding='utf-8') as f:
        for line in f:
            stopwords.append(line.strip())
    return stopwords
stop_words = load_stopwords(os.environ.get('stopwords', 'stopwords.txt'))
# jieba tokenization with a custom user dictionary
import jieba
jieba.load_userdict('自定义词典.txt')  # load the custom dictionary once, not on every call
def pos_tokenizer1(string):
    return list(jieba.cut(string))
# Prepare the classification texts and labels
path = "十二系统"
files = glob.glob(path + '\\**\\*.txt', recursive=True)
# glob.glob returns a list of all file paths matching the pattern
text, lab = [], []
for file in files:
    with open(file, 'r',encoding='utf-8') as f:
        text.append(f.read())
        lab.append(file.split('\\')[-2])

x_train,x_test,y_train,y_test = train_test_split(text,lab,test_size=0.2,random_state=0)
# Inspect the first 5 rows of texts and labels
sick_df = pd.DataFrame({'desc':text, 'disease':lab})
sick_df.head()
# One way to train logistic regression: a grid search over the pipeline.
from sklearn.model_selection import GridSearchCV
sf = chi2
pipeline = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words)),
    ('norm', TfidfTransformer()),
    ('sk', GenericUnivariateSelect(score_func=sf, mode='k_best', param='all')),
    ('est', LogisticRegression(class_weight='balanced'))])
parameters = {
    'cv__lowercase':(True,False),
    'cv__min_df':(50,100),
    'cv__max_df':(0.25,0.5),
    'cv__ngram_range':((1,1),(1,2)),
}
grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring="accuracy")
grid_search.fit(x_train,y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameter combination:')
best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s:%r'%(param_name,best_parameters[param_name]))
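
GridSearchCV refits the best pipeline on the full training split by default, so it can be evaluated directly on the held-out test set; a short sketch (not part of the original post):

from sklearn.metrics import accuracy_score
best_pipe = grid_search.best_estimator_
print('Test accuracy: %0.3f' % accuracy_score(y_test, best_pipe.predict(x_test)))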
# Feature extraction settings actually used to train logistic regression.
cv = CountVectorizer(lowercase=True, tokenizer=pos_tokenizer, stop_words=stop_words, max_df=0.3, min_df=100)
# lowercase: convert text to lowercase; tokenizer: tokenization function; stop_words: stop words to filter out; max_df: ignore terms that appear in more than this fraction of documents; min_df: ignore terms that appear in fewer than this number of documents

norm = TfidfTransformer()
sf = chi2
sk = GenericUnivariateSelect(score_func=sf, mode='k_best', param=100)
est = LogisticRegression(class_weight='balanced')  

# Logistic regression with class_weight='balanced': the library computes class weights from the training sample counts. Classes with more samples get lower weights; classes with fewer samples get higher weights.
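
For intuition, the 'balanced' weights follow n_samples / (n_classes * np.bincount(y)). A minimal sketch with sklearn's compute_class_weight (the toy labels are hypothetical):

from sklearn.utils.class_weight import compute_class_weight
y_toy = np.array([0, 0, 0, 0, 1, 1])   # imbalanced toy labels
print(compute_class_weight('balanced', classes=np.array([0, 1]), y=y_toy))
# the minority class (1) receives the larger weight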

pipe = Pipeline(steps=[('cv', cv), ('norm', norm), ('sk', sk), ('est', est)])
pipe.fit(x_train, y_train)
# Evaluating the multi-class classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
predictions=pipe.predict(x_test)
print('Accuracy', accuracy_score(y_test, predictions))
print('Confusion matrix', confusion_matrix(y_test, predictions))
print('Classification report', classification_report(y_test, predictions))
# The cross-validation procedure used earlier. It has two parts: an evaluation helper and a stratified k-fold loop.
from sklearn import metrics
from sklearn.base import clone

def accuracy(Pipe, tf, ef, tl, el):
    """Fit a fresh clone of the pipeline on the fold's training data (tf, tl)
    and evaluate it on the held-out fold (ef, el)."""
    fold_pipe = clone(Pipe)
    fold_pipe.fit(tf, tl)
    predictions = fold_pipe.predict(ef)
    precision = metrics.precision_score(el, predictions, average='macro')
    recall = metrics.recall_score(el, predictions, average='macro')
    fScore = metrics.f1_score(el, predictions, average='weighted')
    return precision, recall, fScore
    
from sklearn.model_selection import StratifiedKFold
# Recombine the earlier split so the stratified folds cover the full data set
X = x_train + x_test
y = y_train + y_test
skf = StratifiedKFold(n_splits=10)
precision = []
recall = []
fScore = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = np.array(X)[train_index], np.array(X)[test_index]
    y_train, y_test = np.array(y)[train_index], np.array(y)[test_index]
    p, r, f = accuracy(pipe, X_train, X_test, y_train, y_test)
    precision.append(p)
    recall.append(r)
    fScore.append(f)
print('Precision', np.mean(precision))
print('Recall', np.mean(recall))
print('F-score', np.mean(fScore))
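
Since cross_val_score is already imported above, the same 10-fold evaluation can also be written as a single call; a sketch (an alternative to the explicit loop, not the original author's code):

scores = cross_val_score(pipe, X, y, cv=10, scoring='f1_weighted')
print('F-score (10-fold mean)', scores.mean())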
# Next, inspect the contents of the fitted pipeline
cv = pipe.named_steps['cv']
print(cv.transform(x_test).toarray())      # term-frequency matrix; use transform, since cv was already fitted on x_train
words = pd.Series(cv.get_feature_names())  # the vocabulary
# Output the chi-squared scores below; they can also be written to an Excel file for easier inspection
sk = pipe.named_steps['sk']
score = pd.Series(sk.scores_)
supported = pd.Series(sk.get_support())
df = pd.DataFrame()
df['words'] = words
df['score'] = score
df['selected'] = supported
words_score = df
print(words_score)
excelPath = '词典切词.xlsx'
excelWriter = pd.ExcelWriter(excelPath, engine='openpyxl')
words_score.to_excel(excel_writer=excelWriter, index=True)
excelWriter.save()  # save() writes the workbook to disk; no separate close() is needed here
# Output the model coefficients
sk = pipe.named_steps['sk']
est = pipe.named_steps['est']
coef = est.coef_
classes = pipe.classes_

# Map the coefficients back to the full vocabulary space, then transpose
coef = sk.inverse_transform(coef)
coef = pd.DataFrame(coef, index=classes)
coef = coef.T
coef['words'] = words_score['words']
coef = coef[words_score.selected]

# Write the coefficients and feature values to Excel
excelPath = 'disease2.xlsx'
excelWriter = pd.ExcelWriter(excelPath, engine='openpyxl')
coef.to_excel(excel_writer=excelWriter, sheet_name='features', index=True)
excelWriter.save()  # save() writes the workbook to disk; no separate close() is needed here
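
To get a quick look at the most indicative words for each class without opening Excel, a small sketch on top of the coef DataFrame built above (top_n is an arbitrary choice):

top_n = 10
for cls in classes:
    top_words = coef.sort_values(by=cls, ascending=False)['words'].head(top_n)
    print(cls, list(top_words))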

Other classification methods

Besides logistic regression, we also tried other classifiers. To make sense of the parameters you need to understand how each algorithm works.
Notes on decision trees and random forests:
https://blog.csdn.net/weixin_42791474/article/details/83027016
https://blog.csdn.net/u011026329/article/details/79182505
Parameter references for decision trees and random forests:
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
SGD classifier:
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

est = SVC(probability=True, kernel='rbf', class_weight='balanced')  # SVM
est = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                                  max_features=100, max_leaf_nodes=None,
                                  min_samples_leaf=3,
                                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                                  presort=False, random_state=1, splitter='random')  # decision tree
est = RandomForestClassifier(n_estimators=10, max_depth=None,
                             min_samples_split=2, random_state=0)  # random forest

Below are the boosting methods compiled by 家庆.
http://note.youdao.com/noteshare?id=54318d8fb3274918d8d96279882af33c

XGBoost gave the best results.

est = AdaBoostClassifier(n_estimators=100)
est = GradientBoostingClassifier(random_state=10)
est = XGBClassifier(learning_rate=0.1,
                    n_estimators=19,            # number of boosted trees
                    max_depth=15,               # maximum tree depth
                    min_child_weight=1,         # minimum leaf weight
                    gamma=0.,                   # penalty on the number of leaves
                    subsample=0.8,              # sample 80% of the rows for each tree
                    colsample_bytree=0.8,       # sample 80% of the features for each tree
                    objective='multi:softmax',  # loss function for multi-class classification
                    scale_pos_weight=1,         # helps with class imbalance
                    random_state=27)            # random seed
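
Any of these estimators can be dropped into the same pipeline in place of logistic regression, reusing the cv, norm, and sk steps defined earlier; a minimal sketch:

pipe = Pipeline(steps=[('cv', cv), ('norm', norm), ('sk', sk), ('est', est)])
pipe.fit(x_train, y_train)
print(classification_report(y_test, pipe.predict(x_test)))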

Reposted from blog.csdn.net/weixin_43473864/article/details/83215764