Python data analysis notes - Chapter X: Predictive analytics and machine learning

2. Preprocessing

import numpy as np
from sklearn import preprocessing
from scipy.stats import anderson
# Load the data
rain = np.load('rain.npy')
# Rainfall is recorded in units of 0.1 mm; convert to mm
rain = .1 * rain
# Negative values mark trace amounts; replace them with a small positive value
rain[rain < 0] = .05 / 2
# Mean, variance, and the Anderson-Darling test for normality
print("Rain mean", rain.mean())
print("Rain Variance", rain.var())
print("Anderson Rain", anderson(rain))
# Scale the data to zero mean and unit variance
scaled = preprocessing.scale(rain)
print("Scaled mean", scaled.mean())
print("Scaled Variance", scaled.var())
print("Anderson Scaled", anderson(scaled))
# Binarize: convert numeric feature values to booleans
# (reshape because binarize expects a 2-D array in newer scikit-learn)
binarized = preprocessing.binarize(rain.reshape(-1, 1))
print("binarized", np.unique(binarized), binarized.sum())
# Label the classes with integers
lb = preprocessing.LabelBinarizer()
lb.fit(rain.astype(int))
print(lb.classes_)
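
For illustration, here is a minimal, self-contained sketch of what a fitted LabelBinarizer does; the toy labels below are made up:

from sklearn import preprocessing

lb = preprocessing.LabelBinarizer()
lb.fit([0, 1, 2, 1])
print(lb.classes_)           # [0 1 2]
print(lb.transform([2, 0]))  # one-hot rows: [[0 0 1], [1 0 0]]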

3. Classification with logistic regression

Logistic regression can be used to predict the probability of an event occurring, or the probability that something belongs to a particular category.

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold  # moved to sklearn.model_selection in later versions
from sklearn import datasets
import numpy as np

def classify(x, y):
    # Classify with logistic regression
    clf = LogisticRegression(random_state=12)
    scores = []
    # k-fold cross-validation (old API; newer versions use KFold(n_splits=10).split(x))
    kf = KFold(len(y), n_folds=10)
    # Measure classification accuracy on each fold
    for train, test in kf:
        clf.fit(x[train], y[train])
        scores.append(clf.score(x[test], y[test]))
    print(np.mean(scores))

# Load the data
rain = np.load('rain.npy')
dates = np.load('doy.npy')
# Build the feature array from the dates and rainfall amounts
x = np.vstack((dates[:-1], rain[:-1]))
# The sign of the rainfall gives three classes: trace amounts, no rain, rain
y = np.sign(rain[1:])
classify(x.T, y)
# The scikit-learn iris sample dataset
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target
classify(x, y)
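
The introduction above mentions predicting probabilities, while classify() only reports accuracy. A minimal sketch of extracting class probabilities with predict_proba, reusing the iris x and y defined above (the slice [:3] is just for display):

clf = LogisticRegression(random_state=12)
clf.fit(x, y)
# Each row holds the predicted probability of each class for one sample
print(clf.predict_proba(x[:3]))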

4. Classification with support vector machines

Support vector machines (SVM) can be used for classification. The related support vector regression (SVR) technique can be used for regression analysis.

from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV  # moved to sklearn.model_selection in later versions
from sklearn import datasets
import numpy as np
from pprint import PrettyPrinter

def classify(x, y):
    # Grid search over the kernel type and the regularization parameter C
    clf = GridSearchCV(SVC(random_state=42, max_iter=100),
                       {'kernel': ['linear', 'poly', 'rbf'], 'C': [1, 10]})
    clf.fit(x, y)
    print("Score", clf.score(x, y))
    PrettyPrinter().pprint(clf.grid_scores_)  # grid_scores_ became cv_results_ in later versions

rain = np.load('rain.npy')
dates = np.load('doy.npy')
x = np.vstack((dates[:-1], rain[:-1]))
y = np.sign(rain[1:])
classify(x.T, y)

iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target
classify(x, y)
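
After the grid search, the winning parameter combination can be inspected via the standard GridSearchCV attributes; a small sketch that could be appended to classify():

print("Best params", clf.best_params_)
print("Best kernel", clf.best_estimator_.kernel)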

5. Regression with ElasticNetCV

Elastic net regularization reduces the risk of overfitting in regression analysis. Elastic net is effectively a linear combination of the LASSO (Least Absolute Shrinkage and Selection Operator) and ridge regression algorithms.
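
In scikit-learn's formulation, ElasticNet minimizes the following objective, where l1_ratio controls the mix between the LASSO (L1) and ridge (L2) penalties:

    1 / (2 * n_samples) * ||y - Xw||^2_2
        + alpha * l1_ratio * ||w||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2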

from sklearn.linear_model import ElasticNetCV
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

def regress(x, y, title):
    clf = ElasticNetCV(max_iter=200,  # maximum number of iterations
                       cv=10,  # number of cross-validation folds
                       # l1_ratio 0 means pure ridge regression, 1 means pure
                       # LASSO, and values in between mix the two penalties
                       l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
    clf.fit(x, y)
    print("Score", clf.score(x, y))

    pred = clf.predict(x)
    plt.title("Scatter plot of prediction and " + title)
    plt.xlabel("Prediction")
    plt.ylabel("Target")
    plt.scatter(y, pred)

    if "Boston" in title:
        plt.plot(y, y, label="Perfect Fit")
        plt.legend()
    plt.grid(True)
    plt.show()

# Convert rainfall to mm and replace trace markers with a small positive value
rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05 / 2
dates = np.load("doy.npy")

x = np.vstack((dates[:-1], rain[:-1]))
y = rain[1:]
regress(x.T, y, "rain data")

boston = datasets.load_boston()  # load_boston was removed in scikit-learn 1.2
x = boston.data
y = boston.target
regress(x, y, "Boston house prices")

6. Support Vector Regression

import numpy as np
from sklearn import datasets
from sklearn.learning_curve import learning_curve  # moved to sklearn.model_selection in later versions
from sklearn.svm import SVR
from sklearn import preprocessing
import multiprocessing
import matplotlib.pyplot as plt

def regress(x, y, ncpus, title):
    X = preprocessing.scale(x)
    Y = preprocessing.scale(y)
    clf = SVR(max_iter=ncpus * 200)
    # Use one training job per CPU
    train_sizes, train_scores, test_scores = learning_curve(clf, X, Y, n_jobs=ncpus)

    # Average the scores across folds and plot them
    plt.figure()
    plt.title(title)
    plt.plot(train_sizes, train_scores.mean(axis=1), label="Train score")
    plt.plot(train_sizes, test_scores.mean(axis=1), '--', label="Test score")
    print("Max test score " + title, test_scores.max())
    plt.legend(loc='best')
    plt.show()

def main():
    rain = .1 * np.load('rain.npy')
    rain[rain < 0] = .05 / 2
    dates = np.load('doy.npy')

    x = np.vstack((dates[:-1], rain[:-1]))
    y = rain[1:]
    ncpus = multiprocessing.cpu_count()
    regress(x.T, y, ncpus, "Rain")

    boston = datasets.load_boston()  # removed in scikit-learn 1.2
    x = boston.data
    y = boston.target
    regress(x, y, ncpus, "Boston")

if __name__ == '__main__':
    main()
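
The learning-curve code above never calls predict directly. Here is a minimal, self-contained sketch of plain SVR usage on synthetic data (all values below are made up for illustration):

import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(42)
X = rng.uniform(0, 5, (100, 1))
y = np.sin(X).ravel() + 0.1 * rng.randn(100)

svr = SVR(C=1.0, epsilon=0.1)  # default RBF kernel
svr.fit(X, y)
print(svr.predict(X[:3]))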

7. Clustering with the affinity propagation algorithm

Cluster analysis puts data into a number of groups, which are known as clusters; it is a form of unsupervised learning. The affinity propagation algorithm clusters data based on a matrix of similarities between samples.

import numpy as np
from sklearn import datasets
from sklearn import cluster
from sklearn.metrics import euclidean_distances
import matplotlib.pyplot as plt

# Generate three blobs of sample data
x, _ = datasets.make_blobs(n_samples=100, centers=3, n_features=2, random_state=10)
# Affinity propagation expects a similarity matrix; negative squared
# Euclidean distances are the usual choice
S = -euclidean_distances(x, squared=True)

# Label each sample with its cluster, based on the precomputed matrix
aff_pro = cluster.AffinityPropagation(affinity='precomputed').fit(S)
labels = aff_pro.labels_

# Plot each cluster with its own marker style
styles = ['o', 'x', '^']
for style, label in zip(styles, np.unique(labels)):
    print(label)
    plt.plot(x[labels == label, 0], x[labels == label, 1], style, label=label)

plt.title("Clustering Blobs")
plt.legend(loc='best')
plt.show()
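
A quick way to check how many clusters affinity propagation settled on (cluster_centers_indices_ is a standard attribute of the fitted model):

print("Number of clusters", len(aff_pro.cluster_centers_indices_))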

8. The mean shift algorithm

Mean shift is a clustering algorithm that does not require the number of clusters to be estimated in advance.

import numpy as np
from sklearn import cluster
import matplotlib.pyplot as plt
import pandas as pd

# Load the data
rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05 / 2
dates = np.load('doy.npy')
x = np.vstack((dates, rain))

# Build a DataFrame and average the rainfall for each day of the year
df = pd.DataFrame.from_records(x.T, columns=['dates', 'rain'])
df = df.groupby('dates').mean()
df.plot()

# Mean shift clustering; df.values gives the underlying NumPy array
x = np.vstack((np.arange(1, len(df) + 1), df.values.ravel()))
x = x.T
ms = cluster.MeanShift()
ms.fit(x)
labels = ms.predict(x)

# Plot the clusters
plt.figure()
grays = ['0', '0.5', '0.75']

for gray, label in zip(grays, np.unique(labels)):
    match = labels == label
    x0 = x[:, 0]
    x1 = x[:, 1]
    plt.plot(x0[match], x1[match], lw=label + 1, label=label)
    plt.fill_between(x0, x1, where=match, color=gray)

plt.legend()
plt.show()
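
MeanShift exposes the estimated cluster centers, so the number of clusters it found can be checked directly:

print("Cluster centers", ms.cluster_centers_)
print("Number of clusters", len(np.unique(labels)))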

9. Genetic Algorithms

This example uses DEAP to evolve individuals of 200 floats whose values look as normally distributed as possible; the fitness of an individual is the p-value of the Shapiro-Wilk normality test.

import array
import random
import numpy as np
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from scipy.stats import shapiro
import matplotlib.pyplot as plt


creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_float", random.random)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, 200)
toolbox.register("populate", tools.initRepeat, list, toolbox.individual)

def eval_fitness(individual):
    # Fitness is the Shapiro-Wilk p-value: higher means more normal-looking
    return shapiro(individual)[1],

toolbox.register("evaluate", eval_fitness)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=4)

random.seed(42)

pop = toolbox.populate(n=400)
hof = tools.HallOfFame(1)
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("max", np.max)

algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=80, stats=stats, halloffame=hof)

print(shapiro(hof[0])[1])
plt.hist(hof[0])
plt.grid(True)
plt.show()
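
eaSimple also returns the final population and a Logbook of the per-generation statistics; the call above could equivalently be written to capture them (verbose=False suppresses the generation-by-generation table):

final_pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                         ngen=80, stats=stats, halloffame=hof,
                                         verbose=False)
print("Max fitness in the last generation:", logbook.select("max")[-1])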

10. Neural Networks

This example trains a small theanets network to predict whether it will rain tomorrow, using the day of the year and whether it rained today as inputs.

import numpy as np
import theanets
import multiprocessing
from sklearn import datasets
from sklearn.metrics import accuracy_score


rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05/2
dates = np.load('doy.npy')
x = np.vstack((dates[:-1], np.sign(rain[:-1])))
x = x.T

# Target: whether it rains the next day, as a column vector
y = np.sign(rain[1:]).reshape(-1, 1)
# Use 90% of the data for training
N = int(.9 * len(x))

# A regressor with 2 inputs, one hidden layer of 3 neurons, and 1 output
e = theanets.Experiment(theanets.Regressor,
                        layers=(2, 3, 1),
                        learning_rate=0.1,
                        momentum=0.5,
                        patience=300,
                        train_batches=multiprocessing.cpu_count(),
                        num_updates=500)

train = [x[:N], y[:N]]
valid = [x[N:], y[N:]]
e.run(train, valid)

pred = e.network(x[N:]).ravel()
print ("Pred Min", pred.min(), "Max", pred.max())
print ("Y Min", y.min(), "Max", y.max())
print ("Accuracy", accuracy_score(y[N:], pred >= .5))

11. Decision Tree

# Note: these modules moved to sklearn.model_selection in later scikit-learn versions
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import RandomizedSearchCV
from sklearn import tree
from scipy.stats import randint as sp_randint
import pydot
# In Python 3, StringIO lives in the io module
from io import StringIO
import numpy as np
from tempfile import NamedTemporaryFile

# Load the data
rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05 / 2

dates = np.load('doy.npy').astype(int)
x = np.vstack((dates[:-1], np.sign(rain[:-1])))
x = x.T

y = np.sign(rain[1:])

# Create the training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=37)

# Randomized search over the parameter ranges
clf = tree.DecisionTreeClassifier(random_state=37)
params = {"max_depth": [2, None],
          "min_samples_leaf": sp_randint(1, 5),
          "criterion": ["gini", "entropy"]}
rscv = RandomizedSearchCV(clf, params)
rscv.fit(x_train, y_train)

# Export the decision tree and render it to a PNG file
sio = StringIO()
tree.export_graphviz(rscv.best_estimator_, out_file=sio, feature_names=['day-of-year', 'yest'])
dec_tree = pydot.graph_from_dot_data(sio.getvalue())
with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as f:
    # graph_from_dot_data returns a list in newer pydot versions
    dec_tree[0].write_png(f.name)
    print("Written figure to", f.name)

print('Best Train Score', rscv.best_score_)
print('Test Score', rscv.score(x_test, y_test))
print("Best params", rscv.best_params_)
