Machine Learning Algorithms: OneR

The OneR ("One Rule") algorithm: using a single feature to predict the class.

Algorithm idea:
First, go through every value of every feature. For each feature value, count how many times it appears in each class, find the class in which it appears most often, and record how many times it appears in the other classes; those other occurrences are the errors for that value.
After the counts for all feature values are collected, compute each feature's error as the sum of the errors of its values. The feature with the lowest error is chosen as the one and only rule (hence "OneR") and is used for all subsequent classification.
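To make the counting concrete, here is a minimal sketch of the statistics for a single feature value (the class labels below are invented for illustration, not taken from the Iris data):

from collections import Counter

# Class labels of the samples whose binarized feature equals 1 (toy data)
labels_where_value_is_1 = [0, 0, 0, 0, 1, 1, 2]

counts = Counter(labels_where_value_is_1)                # Counter({0: 4, 1: 2, 2: 1})
predicted_class = counts.most_common(1)[0][0]            # class 0 occurs most often
error = sum(counts.values()) - counts[predicted_class]   # 2 + 1 = 3 errors
print(predicted_class, error)                            # 0 3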

Algorithm steps:
1. Binarize the data: for each feature, set values greater than or equal to the feature's mean to 1 and the rest to 0 (see the small sketch after this list).
2. For each feature, count how many times each of its values appears in each class, and count the misclassifications for each value (the samples that do not belong to the most frequent class for that value).
3. Add up the misclassification counts of all values of a feature to get that feature's total error, and keep the value-to-class mapping for each feature.
4. Compare the total errors across features and pick the feature with the smallest one; that feature is the best single predictor.
5. The value-to-class mapping of the chosen feature becomes the final classification rule.
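Step 1 as a standalone sketch (the numbers below are invented for illustration; the full example afterwards applies the same idea to the Iris data):

import numpy as np

toy = np.array([[1.0, 5.0],
                [3.0, 1.0],
                [2.0, 3.0]])
means = toy.mean(axis=0)                    # column means: [2.0, 3.0]
binarized = np.array(toy >= means, dtype='int')
print(binarized)                            # [[0 1]
                                            #  [1 0]
                                            #  [1 1]]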

Example:

import numpy as np
from sklearn.datasets import load_iris
from collections import defaultdict
from operator import itemgetter
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

dataset = load_iris()

X = dataset.data
y = dataset.target

attribute_means = X.mean(axis=0)  # mean of each column, i.e. of each feature

# Binarize: values below the feature's mean become 0, values at or above it become 1
X_d = np.array(X >= attribute_means, dtype='int')


def train_feature_value(X, y_true, feature_index, value):
    # Count how often each class occurs among the samples whose
    # feature `feature_index` equals `value`
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1

    # Predict the most frequent class; every sample of another class is an error
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]

    incorrect_predictions = [class_count for class_value, class_count in class_counts.items()
                             if class_value != most_frequent_class]
    error = sum(incorrect_predictions)

    return most_frequent_class, error
    
def train_on_feature(X, y_true, feature_index):
    # All values this feature takes (0 and 1 after binarization)
    values = set(X[:, feature_index])
    predictors = {}
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature_index, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)

    # The feature's total error is the sum of the errors of its values
    total_error = sum(errors)
    return predictors, total_error
    
    
Xd_train, Xd_test, y_train, y_test = train_test_split(X_d, y, random_state=None)
# random_state controls the split; fixing it to a constant value makes every run
# produce the same result
    
all_predictors = {}
errors = {}
for feature_index in range(Xd_train.shape[1]):
    predictors, total_error = train_on_feature(Xd_train, y_train, feature_index)
    all_predictors[feature_index] = predictors
    errors[feature_index] = total_error

# The best feature is the one with the lowest total error
best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
model = {'feature': best_feature, 'predictor': all_predictors[best_feature]}

def predict(X_test, model):
    variable = model['feature']
    predictor = model['predictor']
    # Look up the predicted class for each sample's value of the chosen feature
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

y_predicted = predict(Xd_test, model)

# Accuracy: fraction of test samples whose prediction matches the true class
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))



Original post: https://blog.csdn.net/d1240673769/article/details/88578411