林轩田 机器学习基石 作业3 线性回归(特征变换) 14题 python2.7

林轩田 机器学习基石 作业3 线性回归(特征变换) 14题 python2.7

注意random shuffle 的使用,shuffle洗牌可能洗到重复的
具体问题可查看该链接
所以这里我使用sklearn里面的shuffle,不使用random的shuffle

14题就是将原来的(1,x1,x2)特征转换成了(1, x1, x2, x1*x2, x1^2, x2^2)

希望大家能够提建议,谢谢

草稿代码如下

# coding=utf-8
import numpy as np
import random
from sklearn.utils import shuffle

def create_random_data(num=1000, error_rate=0.1):
    """Generate a noisy binary-classification dataset on [-1, 1]^2.

    Each sample is (1, x1, x2) with label +1 if x1^2 + x2^2 >= 0.6 else -1.
    Exactly int(num * error_rate) labels are then flipped (noise), and the
    rows are shuffled so the noisy rows are not clustered at the front.

    Returns:
        (data_feature, data_label): arrays of shape (num, 3) and (num, 1).
    """
    data_mat = np.zeros([num, 4])  # columns: bias, x1, x2, label
    error_num = int(num * error_rate)
    for i in range(num):
        x1 = random.uniform(-1, 1)
        x2 = random.uniform(-1, 1)
        data_mat[i][0] = 1
        data_mat[i][1] = x1
        data_mat[i][2] = x2
        data_mat[i][3] = 1 if (x1 ** 2 + x2 ** 2 >= 0.6) else -1
    # Flip the first error_num labels, THEN shuffle once.  The original code
    # shuffled inside this loop, so after the first iteration it flipped
    # arbitrary rows (possibly the same row twice), yielding less noise
    # than requested.
    for j in range(error_num):
        data_mat[j][3] = -data_mat[j][3]
    # Index with a random permutation instead of sklearn's shuffle / python's
    # random.shuffle; the latter can duplicate rows on 2-D ndarrays.
    data_mat = data_mat[np.random.permutation(num)]
    data_feature = data_mat[:, :3]
    data_lable = data_mat[:, 3:]
    return data_feature, data_lable
def transform_feature(old_data_feature):
    """Map original features (1, x1, x2) to (1, x1, x2, x1*x2, x1^2, x2^2).

    Column 0 of the input (the bias) is ignored and re-emitted as 1.

    Returns:
        array of shape (m, 6) with the transformed features.
    """
    rows = np.shape(old_data_feature)[0]
    transformed = np.zeros([rows, 6])
    for r in range(rows):
        a = old_data_feature[r, 1]
        b = old_data_feature[r, 2]
        transformed[r] = (1, a, b, a * b, a * a, b * b)
    return transformed
def train_model(data_feature, data_lable):
    """Closed-form linear regression: w = pinv(X) * y.

    Args:
        data_feature: (m, n) design matrix.
        data_lable:   (m, 1) target column.

    Returns:
        np.matrix of shape (1, n): the learned weight row vector.
    """
    X = np.mat(data_feature)
    y = np.mat(data_lable)
    # Use the pseudo-inverse instead of inv(X.T * X): it computes the same
    # least-squares solution when X has full column rank, and stays
    # well-defined when X.T * X is singular or ill-conditioned.
    w = np.linalg.pinv(X).dot(y)
    return np.mat(w.T)
def error_rate(w, data_feature, data_lable):
    """Fraction of samples whose score w.x disagrees in sign with the label.

    A sample counts as an error only when score * label is strictly
    negative (a score of exactly 0 is not an error), matching the 0/1
    classification error used in the assignment.

    Returns:
        float in [0, 1].
    """
    scores = np.asarray(data_feature).dot(np.asarray(w).ravel())
    labels = np.asarray(data_lable).ravel()
    num_error = int(np.count_nonzero(scores * labels < 0))
    return num_error / float(len(labels))
# Driver: run 1000 random trials and keep the weights with the lowest
# in-sample error.  The first trial is folded into the loop (the original
# duplicated it before the loop), and print is called as a function so the
# script runs under both Python 2 and Python 3.
best_error_rate = None
best_w = None
for trial in range(1000):
    data_feature, data_lable = create_random_data()
    new_feature = transform_feature(data_feature)
    w = train_model(new_feature, data_lable)
    new_error_rate = error_rate(w, new_feature, data_lable)
    if best_error_rate is None or new_error_rate < best_error_rate:
        best_error_rate = new_error_rate
        best_w = w
print(best_error_rate)
print(best_w)

猜你喜欢

转载自blog.csdn.net/m0_37534550/article/details/82863916