目前测试了下列算法,最好的是DTC,也只有0.67左右。SVC计算量太大,结果也不行。
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
但我上传后成绩只有0.07,大跌眼镜,实在太差了。没有做任何特征工程!也不至于这么差吧?!
按着GitHub里wmpscc/DataMiningNotesAndPractice的做法再上传一次看看。
见另一篇博客:sofasofa 地震后建筑修复建议(2)
地震后建筑修复建议
数据集:
训练集中共有652936座建筑,预测集中有400000座建筑。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def read_dataset(fname):
    """Load the earthquake-rebuild CSV and integer-encode its categorical columns.

    Each categorical column is mapped to integers in order of first appearance
    (the same encoding the original per-column boilerplate produced). The target
    column ``y`` is encoded only when present, so the same function also works
    for the prediction set, which has no ``y``.

    Parameters
    ----------
    fname : str
        Path to the CSV file. Its first column is used as the row index.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with categorical columns replaced by integer codes.
    """
    # Use the first column (the building id) as the row index.
    data = pd.read_csv(fname, index_col=0)
    # land condition, foundation type, roof type, ground-floor structure,
    # position relative to other buildings, and the repair-advice target.
    categorical_cols = ['land_condition', 'foundation_type', 'roof_type',
                        'ground_floor_type', 'position', 'y']
    # NOTE(review): encoding by order of first appearance means two different
    # files can map the same category to different integers. Train and predict
    # sets are read separately here, so their encodings need not agree — a
    # likely cause of the poor leaderboard score. Prefer one fixed mapping
    # shared by both files.
    for col in categorical_cols:
        if col not in data.columns:
            continue  # the prediction set has no 'y' column
        labels = data[col].unique().tolist()
        # dict lookup via map() gives the same codes as list.index() per value,
        # but in O(1) per element instead of O(#labels).
        data[col] = data[col].map({label: i for i, label in enumerate(labels)})
    return data
# Load and label-encode the training set (machine-specific absolute path).
train = read_dataset(r'C:\Users\Qiuyi\Desktop\Post-earthquake rebuild advice datas\train.csv')
# Peek at the first rows to sanity-check the encoding (notebook-style display).
train.head()
用DataFrame.sample()随机取200000行数据:
from sklearn.model_selection import train_test_split
train = train.sample(n=200000, axis=0)  # randomly sample 200000 rows (the old comment said 50000, but n=200000)
# Separate the target column from the feature matrix.
y = train['y'].values
X = train.drop(['y'], axis=1).values
# Hold out 20% of the sample for validation; no random_state, so the split differs per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('train dataset: {0}; test dataset: {1}'.format(
X_train.shape, X_test.shape))
train dataset: (160000, 13); test dataset: (40000, 13)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.model_selection import KFold
#from sklearn.model_selection import cross_val_score
import time

start = time.time()

# Candidate models. Earlier experiments (kept in the blog text above) also
# tried KNN, LinearRegression, SVC and MultinomialNB; they were dropped for
# being too slow or too weak, so only the two survivors are evaluated here.
models = []
models.append(("DTC", DecisionTreeClassifier(criterion='entropy', max_depth=16)))
models.append(("AdaBoost", AdaBoostClassifier()))

from sklearn.model_selection import cross_val_score

results = []
for name, model in models:
    # 2-fold cross-validation on the sampled data.
    cv_result = cross_val_score(model, X, y, cv=2)
    results.append((name, cv_result))
    # Report the mean score directly from cv_result instead of the original
    # manual `i` counter re-indexing into `results` — same output, no
    # bookkeeping variable to keep in sync.
    print("name: {}; cross val score: {}".format(
        name, cv_result.mean()))
    # Per-model wall-clock time; reset the timer for the next model.
    print('using time:', time.time() - start, 's')
    start = time.time()
name: DTC; cross val score: 0.671745028768698
using time: 0.6570868492126465 s
name: AdaBoost; cross val score: 0.6651676660908268
using time: 26.656273365020752 s
扫描二维码关注公众号,回复:
4615581 查看本文章
from sklearn.model_selection import ShuffleSplit
import sys

# Make the dataset folder importable: it contains utils.py, which provides
# plot_learning_curve (the blog notes it came from a 'common' folder).
sys.path.append(r"C:\Users\Qiuyi\Desktop\Post-earthquake rebuild advice datas")
from utils import plot_learning_curve

# The splitter is identical for every model, so build it once outside the loop
# (the original re-created it on every iteration).
cv = ShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
for name, model in models:
    plt.figure(figsize=(10, 6))
    # NOTE(review): the title says 'Diabetes' — a leftover from the tutorial
    # this was adapted from; confirm whether it should name this dataset.
    plot_learning_curve(plt, model, 'Learn Curve for '+ name + ' Diabetes',
                        X, y, ylim=(0.0, 1.01), cv=cv)