# Python Machine Learning Blueprints

# Standard library
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Prefer the
# current location and fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fetching data (requests)
# r = requests.get(r"https://api.github.com/users/acombs/starred")
# print(r.json())

#Pandas
# PATH = r"/"
# r = requests.get(r"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
# with open(PATH+'iris.data','w') as f:
#     f.write(r.text)
# os.chdir(PATH)
# df = pd.read_csv(PATH + 'iris.data', names=['sepal length', 'sepal width',
#                                             'petal length','petal width','class'])
# print(df.head())
# print(df['sepal length'])
# print(df.ix[:3,:2])
# print(df.ix[:2,[x for x in df.columns if 'width' in x]])
# print(df['class'].unique())
# print(df[df['class']=='Iris-virginica'])
# print(df.count())
# print(df[df['class']=='Iris-virginica'].count())
# virginica = df[df['class']=='Iris-virginica'].reset_index(drop=True)
# print(virginica)
# print(df[(df['class']=='Iris-virginica')&(df['petal width'] > 2.2)])
# print(df.describe())
# print(df.describe(percentiles=[.20,.40,.80,.90,.95]))
# print(df.corr())

#Matplotlib
# Histogram
# fig, ax = plt.subplots(figsize=(6,4))
# ax.hist(df['petal width'], color='black')
# ax.set_ylabel('Count', fontsize = 12)
# ax.set_xlabel('Width',fontsize = 12)
# plt.title('Iris Petal Width', fontsize=14, y=1.01)
# plt.show()

# fig, ax = plt.subplots(2,2, figsize = (8,4))
# ax[0][0].hist(df['petal width'], color='black')
# ax[0][0].set_ylabel('Count',fontsize = 12)
# ax[0][0].set_xlabel('Width', fontsize = 12)
# ax[0][0].set_title('Iris Petal Width', fontsize = 14, y=1.01)
# ax[0][1].hist(df['petal length'], color='black')
# ax[0][1].set_ylabel('Count',fontsize = 12)
# ax[0][1].set_xlabel('Length', fontsize = 12)
# ax[0][1].set_title('Iris Petal Length', fontsize = 14, y=1.01)
# ax[1][0].hist(df['sepal width'], color='black')
# ax[1][0].set_ylabel('Count',fontsize = 12)
# ax[1][0].set_xlabel('Width', fontsize = 12)
# ax[1][0].set_title('Iris Sepal Width', fontsize = 14, y=1.01)
# ax[1][1].hist(df['sepal length'], color='black')
# ax[1][1].set_ylabel('Count',fontsize = 12)
# ax[1][1].set_xlabel('Length', fontsize = 12)
# ax[1][1].set_title('Iris Sepal Length', fontsize = 14, y=1.01)
# plt.show()

# Scatter plot
# fig, ax = plt.subplots(figsize=(6,6))
# ax.scatter(df['petal width'],df['petal length'], color='red')
# ax.set_xlabel('Petal Width')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Scatterplot')
# plt.show()

# Line plot
# fig, ax = plt.subplots(figsize=(6,6))
# ax.plot(df['petal length'], color='blue')
# ax.set_xlabel('Specimen Number')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Length Plot')
# plt.show()

# Bar chart
# fig, ax = plt.subplots(figsize=(6,6))
# bar_width = .8
# labels = [x for x in df.columns if 'length' in x or 'width' in x]
# ver_y = [df[df['class']=='Iris-versicolor'][x].mean() for x in labels]
# vir_y = [df[df['class']=='Iris-virginica'][x].mean() for x in labels]
# set_y = [df[df['class']=='Iris-setosa'][x].mean() for x in labels]
# x = np.arange(len(labels))
# ax.bar(x, vir_y, bar_width, bottom=set_y, color='darkgrey')
# ax.bar(x, set_y, bar_width, bottom=ver_y, color='white')
# ax.bar(x, ver_y, bar_width, color='black')
# ax.set_xticklabels(labels, rotation=-70, fontsize=12)
# ax.set_title('Mean Feature Measurement By Class', y=1.01)
# ax.legend(['Virginica','Setosa','Versicolor'])
# plt.show()

#Seaborn
# sns.pairplot(df, hue='class')
# plt.show()

# Violin plot
# fig,ax = plt.subplots(2,2, figsize=(7,7))
# sns.set(style='white', palette='muted')
# sns.violinplot(x=df['class'], y=df['sepal length'], ax=ax[0,0])
# sns.violinplot(x=df['class'], y=df['sepal width'], ax=ax[0,1])
# sns.violinplot(x=df['class'], y=df['petal length'], ax=ax[1,0])
# sns.violinplot(x=df['class'], y=df['petal width'], ax=ax[1,1])
# fig.suptitle('Violin Plots', fontsize = 16, y=1.03)
# for i in ax.flat:
#     plt.setp(i.get_xticklabels(),rotation=-90)
# fig.tight_layout()
# plt.show()

#Map
# df['class'] = df['class'].map({'Iris-setosa':'SET','Iris-virginica':'VIR','Iris-versicolor':'VER'})
# print(df['class'])

#Apply
# df['wide petal'] = df['petal width'].apply(lambda v:1 if v>=1.3 else 0)
# print(df)

# df['petal area'] = df.apply(lambda r:r['petal length']*r['petal width'], axis=1)
# print(df)

#Applymap
# print(df.applymap(lambda v:np.log(v) if isinstance(v,float) else v))

#Groupby
# print(df.groupby('class').mean())
# print(df.groupby('class').describe())
# print(df.groupby('petal width')['class'].unique())
# print(df.groupby('class')['petal width']
#       .agg({'delta':lambda x:x.max() - x.min(),'max':np.max, 'min':np.min}))

#Statsmodels
# fig, ax = plt.subplots(figsize=(7,7))
# ax.scatter(df['sepal width'][:50], df['sepal length'][:50])
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# plt.show()

# y = df['sepal length'][:50]
# x = df['sepal width'][:50]
# X = sm.add_constant(x)
# results = sm.OLS(y,X).fit()
# print(results.summary())
#
# fig, ax = plt.subplots(figsize=(7,7))
# ax.plot(x, results.fittedvalues, label='regression line')
# ax.scatter(x, y, label='data point', color='r')
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# ax.legend(loc=2)
# plt.show()

#scikit-learn
# clf = RandomForestClassifier(max_depth=5,n_estimators=10)
# X = df.ix[:,:4]
# y = df.ix[:,4]
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3)
# clf.fit(X_train,y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred,y_test)),columns=['predicted','actual'])
# rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
# # print(rf)
# print(rf['correct'].sum()/rf['correct'].count())

#SVM
# clf = OneVsOneClassifier(SVC(kernel='linear'))
# X = df.ix[:,:4]
# y = np.array(df.ix[:,4]).astype(str)
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3)
# clf.fit(X_train,y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred,y_test)),columns=['predicted','actual'])
# rf['correct'] = rf.apply(lambda r:1 if r['predicted'] == r['actual'] else 0,axis=1)
# # print(rf)
# print(rf['correct'].sum()/rf['correct'].count())
# Python Machine Learning Blueprints

# You may also like