Machine learning package Scikit-learn (2)

Scikit-learn (formerly scikits.learn, also known as sklearn) is a free and open-source machine learning library for the Python programming language. It features a variety of classification, regression, and clustering algorithms, including support vector machines, random forests, gradient boosting, k-means, and DBSCAN, and is designed to interoperate with the Python numerical and scientific libraries NumPy and SciPy.
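
As a quick illustration of the library's common workflow, the sketch below (an added example, not part of the original post) fits a classifier on the bundled iris data set and scores it on a held-out split; every estimator used in the examples that follow exposes the same fit/predict/score pattern.

# Minimal sketch of the standard scikit-learn workflow on the bundled iris data set
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)                       # features and class labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=200)                  # any estimator exposes the same API
clf.fit(X_train, y_train)                               # train on the training split
print('held-out accuracy:', clf.score(X_test, y_test))  # evaluate on the test split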

Logistic Regression

# Load the data
import pandas as pd
data=pd.read_excel('credit.xlsx')
# Split into training and test samples
'''
The first 600 applicants are used as training data,
the last 90 applicants as test data.
'''
x=data.iloc[:600,:14].values
y=data.iloc[:600,14].values
x1=data.iloc[600:,:14].values
y1=data.iloc[600:,14].values
# Logistic regression analysis
from sklearn.linear_model import LogisticRegression as LR  # import the logistic regression module
lr=LR()  # create the logistic regression object lr
lr.fit(x,y)  # call the fit() method of lr to train the model
r=lr.score(x,y)  # model accuracy (on the training data)
R=lr.predict(x1)  # get the predictions for the test samples
Z=R-y1  # 0 wherever the prediction matches the true label
Rs=len(Z[Z==0])/len(Z)  # prediction accuracy on the test samples
print('Predictions:',R)
print('Prediction accuracy:',Rs)
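
The same evaluation can also be written with scikit-learn's own utilities instead of manual slicing and the R - y1 subtraction trick; the sketch below is an added example assuming the same credit.xlsx layout (14 feature columns, label in column 14).

# Sketch, assuming the same credit.xlsx layout as above
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_excel('credit.xlsx')
X = data.iloc[:, :14].values
y = data.iloc[:, 14].values
# shuffle=False keeps the original order, reproducing the "first 600 / last 90" split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=90, shuffle=False)
lr = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print('test accuracy:', accuracy_score(y_te, lr.predict(X_te)))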

Neural Networks

# Load the data
import pandas as pd
data=pd.read_excel('credit.xlsx')
# Split into training and test samples
'''
The first 600 applicants are used as training data,
the last 90 applicants as test data.
'''
x=data.iloc[:600,:14].values
y=data.iloc[:600,14].values
x1=data.iloc[600:,:14].values
y1=data.iloc[600:,14].values
# Build the neural network classification model
from sklearn.neural_network import MLPClassifier  # import the neural network classifier MLPClassifier
clf=MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=(5,2),random_state=1)
'''
solver: optimization algorithm used to train the network; the options are lbfgs, sgd and adam, with adam as the default
alpha: L2 regularization (penalty) parameter; the default is 0.0001
hidden_layer_sizes: number of neurons per hidden layer; for a single hidden layer just give one number,
    here there are two hidden layers with 5 and 2 neurons respectively, i.e. (5,2)
random_state: random seed; fixed to 1 here so the results are reproducible
'''
clf.fit(x,y)  # call fit() to train the network
rv=clf.score(x,y)  # accuracy of the network (on the training data)
R=clf.predict(x1)  # get the predictions for the test samples
Z=R-y1  # 0 wherever the prediction matches the true label
Rs=len(Z[Z==0])/len(Z)  # prediction accuracy on the test samples
print('Predictions:',R)
print('Prediction accuracy:',Rs)
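
Multilayer perceptrons are sensitive to the scale of the input features, so in practice the inputs are usually standardized first. The sketch below is an added example (assuming the same credit.xlsx layout) that wraps the scaler and the classifier in a pipeline and scores the last 90 applicants.

# Sketch, assuming the same credit.xlsx layout: standardize the inputs before the MLP
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

data = pd.read_excel('credit.xlsx')
x, y = data.iloc[:600, :14].values, data.iloc[:600, 14].values
x1, y1 = data.iloc[600:, :14].values, data.iloc[600:, 14].values
model = make_pipeline(StandardScaler(),
                      MLPClassifier(solver='lbfgs', alpha=1e-5,
                                    hidden_layer_sizes=(5, 2), random_state=1))
model.fit(x, y)                                   # the pipeline scales, then trains
print('test accuracy:', model.score(x1, y1))      # accuracy on the last 90 applicants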

# The training features are stored in x, the target variable in y
import pandas as pd
data=pd.read_excel('发电场数据.xlsx')  # power plant data set
x=data.iloc[:,0:4].values
y=data.iloc[:,4].values
# Build the sample to be predicted
import numpy as np
x1=np.array([28.4,50.6,1011.9,80.54])
x1=x1.reshape(1,4)  # reshape into a single-row 2D array, as predict() expects
# Build the neural network regression model
from sklearn.neural_network import MLPRegressor
clf=MLPRegressor(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=8,random_state=1)
clf.fit(x,y)
rv=clf.score(x,y)  # goodness of fit on the training data
R=clf.predict(x1)
print('Predicted value for the sample:',R)
Predicted value for the sample: [439.27679293]
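
For regressors, score() returns the coefficient of determination R² rather than a classification accuracy. Continuing from the clf, x and y defined above, the added sketch below cross-checks this against sklearn.metrics and also reports the mean absolute error.

# Sketch, continuing from the clf, x and y defined in the block above
from sklearn.metrics import r2_score, mean_absolute_error
y_fit = clf.predict(x)                                   # in-sample predictions
print('R² via score():   ', clf.score(x, y))
print('R² via r2_score():', r2_score(y, y_fit))
print('mean absolute error:', mean_absolute_error(y, y_fit))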

Support Vector Machines

# Load the data
import pandas as pd
data=pd.read_excel('car.xlsx')
# Split into training and test samples
x=data.iloc[:1690,:6]
y=data.iloc[:1690,6]
x1=data.iloc[1690:,:6]
y1=data.iloc[1690:,6]
# Build the support vector machine classification model
from sklearn import svm
clf=svm.SVC(kernel='poly')  # the kernel can be linear, polynomial, Gaussian (RBF) or sigmoid, written 'linear', 'poly', 'rbf', 'sigmoid'; the default is the Gaussian kernel 'rbf'
clf.fit(x,y)  # train the model
rv=clf.score(x,y)  # model accuracy (on the training data)
R=clf.predict(x1)  # get the predictions for the test samples
Z=R-y1  # 0 wherever the prediction matches the true label
Rs=len(Z[Z==0])/len(Z)  # prediction accuracy on the test samples
print('Predictions:',R)
print('Prediction accuracy:',Rs)

Predictions: [2 4 3 1 2 3 1 4 3 2 4 3 3 3 3 3 3 3 3 3 3 3 2 3 1 4 3 2 4 1 1 2 3 1 4 3 2
 4]
Prediction accuracy: 0.868421052631579
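
Since the kernel is the main modelling choice here, the added sketch below (assuming the same car.xlsx layout) fits one SVC per kernel on the same train/test split and prints the held-out accuracy of each.

# Sketch, assuming the same car.xlsx layout: compare the four kernel options
import pandas as pd
from sklearn import svm

data = pd.read_excel('car.xlsx')
x, y = data.iloc[:1690, :6], data.iloc[:1690, 6]
x1, y1 = data.iloc[1690:, :6], data.iloc[1690:, 6]
for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf = svm.SVC(kernel=kernel).fit(x, y)        # fit one SVC per kernel
    print(kernel, 'test accuracy:', clf.score(x1, y1))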

K-Means Clustering

To better understand the K-means clustering algorithm, we can implement it ourselves in Python. A reference implementation is given below:

def K_mean(data,knum):
    # Input: data -- feature data set to cluster; must be a NumPy numeric array
    # Input: knum -- number of clusters
    # Return value: data with an extra final column holding the cluster label
    import pandas as pd
    import numpy as np
    p=len(data[0,:])                     # dimensionality of the clustering data
    cluscenter=np.zeros((knum,p))        # pre-allocate the initial cluster centers with zeros
    lastcluscenter=np.zeros((knum,p))    # pre-allocate the previous cluster centers with zeros
    # Initialize the current and previous cluster centers with the first knum rows of the data
    for i in range(knum):
        cluscenter[i,:]=data[i,:]
        lastcluscenter[i,:]=data[i,:]
    # Pre-allocate a 1-D array holding the cluster assignment of each sample
    clusindex=np.zeros((len(data)))
    while 1:
        for i in range(len(data)):
            # Compute the Euclidean distance from the i-th sample to each cluster center
            # sumsquare holds the distance from the i-th sample to each cluster center
            sumsquare=np.zeros((knum))
            for k in range(knum):
                sumsquare[k]=sum((data[i,:]-cluscenter[k,:])**2)
            sumsquare=np.sqrt(sumsquare)
            # Sort the distances from the i-th sample to the cluster centers in ascending order
            s=pd.Series(sumsquare).sort_values()
            # Assign the i-th sample to the nearest center (the index at position 0 of the sorted series)
            clusindex[i]=s.index[0]
        # Append the cluster assignments as the last column of the data
        clusdata=np.hstack((data,clusindex.reshape((len(data),1))))
        # Update the cluster centers: each new center is the mean of the samples assigned to that cluster
        for i in range(knum):
            cluscenter[i,:]=np.mean(clusdata[clusdata[:,p]==i,:-1],0).reshape(1,p)
        # Compare the new cluster centers with the previous ones
        t=abs(lastcluscenter-cluscenter)
        # If the centers did not change, return the clustering result and stop
        if sum(sum(t))==0:
            return clusdata
        # Otherwise copy the new centers into the previous centers and iterate again
        else:
            for k in range(knum):
                lastcluscenter[k,:]=cluscenter[k,:]
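
Before applying the function to the Excel data below, here is an added sanity check on a small synthetic data set (no file needed). Note that K_mean() seeds the centers with the first knum rows, so one point from each blob is placed at the top.

# Sketch: sanity check of the hand-written K_mean() on two synthetic 2-D blobs
import numpy as np
rng = np.random.default_rng(0)
blob1 = rng.normal(loc=[0, 0], scale=0.5, size=(20, 2))
blob2 = rng.normal(loc=[5, 5], scale=0.5, size=(20, 2))
# put one point of each blob first, since the first knum rows seed the centers
toy = np.vstack((blob1[:1], blob2[:1], blob1[1:], blob2[1:]))
result = K_mean(toy, 2)                                  # last column holds the cluster label
print(np.unique(result[:, 2], return_counts=True))       # expect two clusters of 20 points each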

# Call the algorithm on a data set and plot the clustering result
import pandas as pd
D=pd.read_excel('D.xlsx',header=None)
D=D.values
r=K_mean(D,2)
x0=r[r[:,2]==0,0]
y0=r[r[:,2]==0,1]
x1=r[r[:,2]==1,0]
y1=r[r[:,2]==1,1]
import matplotlib.pyplot as plt
plt.plot(x0,y0,'r*')   # cluster 0 as red stars
plt.plot(x1,y1,'bo')   # cluster 1 as blue circles
plt.show()

# Load the data and standardize it
import pandas as pd
data=pd.read_excel('农村居民人均可支配收入来源2016.xlsx')  # 2016 sources of per-capita disposable income of rural residents
X=data.iloc[:,1:]
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X)
X=scaler.transform(X)
# K-means cluster analysis
from sklearn.cluster import KMeans
model=KMeans(n_clusters=4,random_state=0,max_iter=500)
'''
n_clusters: the number of clusters K
random_state: random seed for the initialization; fixed to 0 here so the results are reproducible
max_iter: maximum number of iterations
'''
model.fit(X)
c=model.labels_  # cluster label of each sample
Fs=pd.Series(c,index=data['地区'])  # index the labels by region ('地区' is the region column)
Fs=Fs.sort_values(ascending=False)  # ascending=True sorts ascending, False descending

The 31 regions are divided into 4 categories with labels 0, 1, 2, and 3. For example, the first category contains Zhejiang, Tianjin, and Jiangsu, and the third category contains Shanghai and Beijing. Note that the numeric value of a category label has no actual meaning; it only serves as a class label.
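
Continuing from the Fs Series built above, the added sketch below lists which regions fall into each cluster, which is how groupings like the ones mentioned above can be read off.

# Sketch, continuing from the Fs Series built above (labels indexed by region)
for label in sorted(Fs.unique()):
    members = Fs[Fs == label].index.tolist()   # regions assigned to this cluster
    print('cluster', label, ':', members)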
