# 机器学习之sklearn回归--01 (Machine learning with sklearn regression, part 01)

# coding=utf-8

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import label

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_squared_error
from sklearn import metrics

'''
每年高中生和大学生都会申请进入到各种各样的高校中去。每个学生都有一组唯一的考试分数,
成绩和背景数据。录取委员会根据这个数据决定是否接受这些申请者。
在这种情况下一个二元分类算法可用于接受或拒绝申请,逻辑回归是个不错的方法。

gre - Graduate Record Exam(研究生入学考试), a generalized test for prospective graduate students(一个通用的测试未来的研究生), continuous between 200 and 800. 
gpa - Cumulative grade point average(累积平均绩点), continuous between 0.0 and 4.0. 
admit - Binary variable, 0 or 1, where 1 means the applicant was admitted to the program.
'''
'''
    线性回归和逻辑回归的区别:
        线性回归是预测值,
        逻辑回归的预测某一类的概率值

'''

def logit(x):
    """Logistic (sigmoid) function: P = 1 / (1 + e^-x).

    Written as 1/(1+exp(-x)) rather than the original exp(x)/(1+exp(x)):
    the latter overflows for large positive x and returns inf/inf = nan.
    This form saturates cleanly to 1.0 for large x and to 0.0 for very
    negative x (NumPy may emit an overflow warning in the latter case,
    but the value is still correct).
    """
    return 1.0 / (1.0 + np.exp(-x))



# Visualize the logistic (sigmoid) curve.
def find_logit_():
    """Plot the logistic function over [-6, 6] to show its S-shape."""
    xs = np.linspace(-6, 6, 50, dtype=float)
    probs = logit(xs)

    plt.plot(xs, probs, label='logistic')
    plt.ylabel("Probability")
    plt.xlabel("t")
    plt.title("Logistic Function")
    plt.show()

    # Extreme inputs saturate the curve:
    #   logit(-10) ~= 4.5397868702434395e-05
    #   logit(10)  ~= 0.99995460213129761
    low_prob = logit(-10)
    high_prob = logit(10)

def test_plt(admissions):
    """Scatter-plot admission outcome (0/1) against GPA."""
    gpa_vals = admissions["gpa"]
    admit_vals = admissions["admit"]
    plt.scatter(gpa_vals, admit_vals)
    plt.show()

# Predict the admit class (0/1) from GPA with logistic regression.
def logit_stand(admissions):
    """Fit LogisticRegression on the 'gpa' column, predict class labels,
    plot them against GPA, and return the mean squared error.

    Parameters
    ----------
    admissions : pandas.DataFrame
        Must contain 'gpa' and 'admit' columns.

    Returns
    -------
    float
        Mean squared error between true labels and predicted labels
        (the original computed this value but discarded it).
    """
    logistic_model = LogisticRegression()
    # Pass y as a 1-D Series: a (n, 1) DataFrame triggers sklearn's
    # DataConversionWarning.
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    pre = logistic_model.predict(admissions[['gpa']])

    mse = mean_squared_error(admissions['admit'], pre)

    plt.scatter(admissions['gpa'], pre)
    plt.show()

    return mse

# predict_proba: plot the predicted probability of admission vs. GPA.
def logit_pro(admissions):
    """Fit logistic regression on GPA and scatter P(admit=1) per student.

    predict_proba returns two columns — P(class 0) and P(class 1) —
    so only the second column is plotted.
    """
    model = LogisticRegression()
    gpa = admissions[['gpa']]
    model.fit(gpa, admissions[['admit']])

    probabilities = model.predict_proba(gpa)

    plt.scatter(gpa, probabilities[:, 1])
    plt.show()

# Evaluate model accuracy and sensitivity on the training data.
def logit_admin(admissions):
    """Fit logistic regression on GPA, then print overall accuracy and
    sensitivity (recall of the positive class).

    Sensitivity = TP / (TP + FN): of all truly admitted students, the
    fraction the model also predicted as admitted. Useful for imbalanced
    classes, where raw accuracy can be misleading.

    Parameters
    ----------
    admissions : pandas.DataFrame with 'gpa' and 'admit' columns.

    NOTE: metrics are computed on the same data the model was fit on;
    in practice they should be measured on a held-out test set.
    """
    # Work on a copy so the caller's DataFrame is not mutated
    # (the original added columns to the argument in place).
    admissions = admissions.copy()

    logistic_model = LogisticRegression()
    # y as a 1-D Series avoids sklearn's DataConversionWarning.
    logistic_model.fit(admissions[['gpa']], admissions['admit'])

    labels = logistic_model.predict(admissions[['gpa']])

    admissions['predicted_label'] = labels
    admissions['actual_label'] = admissions['admit']

    predicted = admissions['predicted_label']
    actual = admissions['actual_label']

    # Accuracy: fraction of rows where prediction matches truth.
    # (Python 3 '/' is already float division; the old '*1.0' was a relic.)
    accuracy = (predicted == actual).sum() / len(admissions)
    print('预测的精度:')
    print(accuracy)

    # Confusion-matrix counts for the positive class (admit == 1).
    true_positives = ((predicted == 1) & (actual == 1)).sum()
    false_negatives = ((predicted == 0) & (actual == 1)).sum()

    # Fixed the original's 'senditivity' typo.
    sensitivity = true_positives / float(true_positives + false_negatives)
    print(sensitivity)



def test_train(admissions, train_size=515, seed=8):
    """Shuffle the data, split into train/test, fit logistic regression
    on GPA, and plot the ROC curve of the held-out predictions.

    Parameters
    ----------
    admissions : pandas.DataFrame with 'gpa' and 'admit' columns.
    train_size : int
        Number of rows used for training (default keeps the original
        hard-coded 515-row split).
    seed : int
        NumPy random seed for a reproducible shuffle (default keeps the
        original seed of 8).

    Returns
    -------
    (accuracy, auc)
        Test-set accuracy and area under the ROC curve — both were
        computed but discarded in the original.
    """
    np.random.seed(seed)

    # Copy so the caller's DataFrame is not mutated in place.
    admissions = admissions.copy()
    admissions['actual_label'] = admissions['admit']
    admissions = admissions.drop('admit', axis=1)

    # np.random.permutation returns a shuffled copy of the index
    # (np.random.shuffle would shuffle in place and return None).
    shuffled_index = np.random.permutation(admissions.index)
    shuffled_admissions = admissions.loc[shuffled_index]

    train = shuffled_admissions.iloc[0:train_size]
    # .copy(): adding 'predicted_label' to a bare iloc slice raises
    # pandas' SettingWithCopyWarning and may not stick.
    test = shuffled_admissions.iloc[train_size:].copy()

    logistic_model = LogisticRegression()
    logistic_model.fit(train[['gpa']], train['actual_label'])

    test['predicted_label'] = logistic_model.predict(test[['gpa']])

    accuracy = (test['predicted_label'] == test['actual_label']).mean()

    # ROC curve: sweep the decision threshold over P(admit=1) and record
    # false-positive rate vs. true-positive rate.
    probabilities = logistic_model.predict_proba(test[['gpa']])
    fpr, tpr, thresholds = metrics.roc_curve(test['actual_label'], probabilities[:, 1])

    # Area under the ROC curve: single-number summary of the model.
    area = metrics.roc_auc_score(test['actual_label'], probabilities[:, 1])

    plt.plot(fpr, tpr)
    plt.show()

    return accuracy, area


if __name__ == '__main__':
    # Load the admissions dataset; expects admissions.csv in the working
    # directory with at least 'gpa' and 'admit' columns.
    admissions = pd.read_csv("admissions.csv")

    #find_logit_()

    #logit_stand(admissions)

    #logit_pro(admissions)

    #logit_admin(admissions)

    # Only the train/test split + ROC demo is currently enabled.
    test_train(admissions)




# coding=utf-8


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy
# Linear regression: predict fuel economy (mpg) from vehicle weight.
# (The original line used a C-style "//" comment, which is a Python
# SyntaxError; replaced with "#".)
columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
# auto-mpg.data is whitespace-delimited with no header row.
cars = pd.read_table('auto-mpg.data',delim_whitespace=True, names=columns)

# Build the linear model; fit_intercept=True also learns the bias term.
lr = LinearRegression(fit_intercept=True)

# Fit the model:
#   first argument  - feature matrix (2-D, hence the double brackets)
#   second argument - target (1-D Series for a single label,
#                     a matrix for multi-label)
lr.fit(cars[['weight']], cars['mpg'])

# Predict mpg for every training row.
predictions = lr.predict(cars[['weight']])

#print(predictions[0:5])
#print(cars['mpg'][0:5])

# Mean squared error = sum((predicted - actual)^2) / n
mse = mean_squared_error(cars['mpg'],predictions)
print(mse)

# Root mean squared error, in the same units as mpg.
rmse = mse**0.5

# Red: actual mpg values; blue: the fitted line's predictions.
plt.scatter(cars[['weight']], cars['mpg'], c='red')
plt.scatter(cars[['weight']], predictions, c='blue')

plt.show()










# coding=utf-8


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

'''
    数据集:
        mpg:跑公里
        cylinders : 气缸数
        displacement:
        horsepower: 马力
        weight: 
        acceleration:加速度
        model year: 生产年
        origin: 生产地
'''


'''
    用逻辑回归解决多分类问题:
        对于三个类别A,B,C,需要进行分解,对问题进行分类,
        首先把A作为正例,把BC放在一起
        其次 B作为正例,把AC放在一起
        再者 C作为正例,把AB放在一起
        分别求三者的概率,求出最大值
        
        
    步骤:
        1.读取数据,并把数据洗牌
        2.进行数据集和测试集的划分
        3.依照类别的个数,对每一中类别进行逻辑回归
        4.求出几种类别的最大值,构建权重向量

'''


# One-vs-rest logistic regression: predict a car's origin (1, 2 or 3)
# from dummy-encoded cylinder count and model year.
columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
cars = pd.read_table('auto-mpg.data',delim_whitespace=True, names=columns)

# get_dummies: one 0/1 indicator column per distinct value, each named
# prefix + '_' + value.
dummy_cylinders = pd.get_dummies(cars['cylinders'], prefix='cyl')
cars = pd.concat([cars, dummy_cylinders], axis=1)

dummy_years = pd.get_dummies(cars['model year'], prefix='year')
cars = pd.concat([cars, dummy_years], axis=1)

# Drop the raw categorical columns now that they are encoded.
for raw_col in ('model year', 'cylinders'):
    cars = cars.drop(raw_col, axis=1)

# Shuffle row positions (permutation returns a shuffled copy; it does not
# modify anything in place) and split 70/30 into train/test.
shuffled_positions = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_positions]
split_at = int(cars.shape[0] * 0.7)

train = shuffled_cars.iloc[0:split_at]
test = shuffled_cars.iloc[split_at:]

# The distinct origin codes, sorted ascending: [1, 2, 3].
unique_origins = cars['origin'].unique()
unique_origins.sort()

# Features: every dummy column produced above.
features = [c for c in train.columns if c.startswith('cyl') or c.startswith('year')]

# One-vs-rest: fit a separate binary classifier per origin, treating that
# origin as the positive class and every other origin as negative.
models = {}
for origin in unique_origins:
    classifier = LogisticRegression()
    classifier.fit(train[features], train['origin'] == origin)
    models[origin] = classifier

# For each per-origin model, record P(positive) on the test rows.
testing_probs = pd.DataFrame(columns=unique_origins)
X_test = test[features]
for origin in unique_origins:
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]

# Final prediction per row: the origin whose model gave the highest
# probability (column label of the row-wise maximum).
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)




















# 猜你喜欢 (blog footer)
#
# Reposted from blog.csdn.net/qq_30638831/article/details/80276642