sofasofa 地震后建筑修复建议(1)

目前测试了下列算法,其中效果最好的是决策树(DTC),但交叉验证得分也只有0.67左右;SVC计算量太大,结果也不理想。

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

但我上传后成绩只有0.07,大跌眼镜,实在太差了。没有做任何特征工程!也不至于这么差吧?!

按着GitHub里wmpscc/DataMiningNotesAndPractice的做法再上传一次看看。

见另一篇博客:sofasofa 地震后建筑修复建议(2)

地震后建筑修复建议

在这里插入图片描述

数据集:

训练集中共有652936座建筑,预测集中有400000座建筑。
在这里插入图片描述

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def read_dataset(fname):
    """Load a building CSV and integer-encode its categorical columns.

    The first CSV column is used as the row index. The categorical feature
    columns ``land_condition``, ``foundation_type``, ``roof_type``,
    ``ground_floor_type`` and ``position`` are replaced by integer codes.
    The target column ``y`` is encoded the same way when it is present; a
    file without ``y`` (e.g. a prediction set) is accepted as well.

    Codes follow the sorted order of each column's distinct values instead
    of their order of first appearance, so a category maps to the same code
    no matter how the rows are ordered. Two files get identical mappings as
    long as they contain the same set of categories. Missing values are
    encoded as -1 (the ``pandas.factorize`` sentinel).

    Args:
        fname: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: the loaded data with the columns above encoded.

    Raises:
        KeyError: If one of the categorical feature columns is missing.
    """
    # Use the first column as the row index.
    data = pd.read_csv(fname, index_col=0)

    # Land condition, foundation type, roof type, ground-floor structure
    # type, and position relative to neighbouring buildings.
    feature_columns = [
        'land_condition',
        'foundation_type',
        'roof_type',
        'ground_floor_type',
        'position',
    ]

    # The repair-advice target only exists in the training data.
    columns_to_encode = list(feature_columns)
    if 'y' in data.columns:
        columns_to_encode.append('y')

    for column in columns_to_encode:
        # sort=True makes the value -> code mapping independent of row order.
        codes, _ = pd.factorize(data[column], sort=True)
        data[column] = codes

    return data

# Location of the training CSV on the author's machine.
train_csv_path = r'C:\Users\Qiuyi\Desktop\Post-earthquake rebuild advice datas\train.csv'

# Load the training data through read_dataset.
train = read_dataset(train_csv_path)

# Notebook cell: preview the first rows of the loaded frame.
train.head()

在这里插入图片描述

用DataFrame.sample()随机取200000行数据:

from sklearn.model_selection import train_test_split

# Fixed seed so that the subsample and the train/test split are reproducible.
RANDOM_STATE = 0
# Number of rows to keep; never more than the data actually available.
SAMPLE_SIZE = min(200000, len(train))

# Randomly draw SAMPLE_SIZE rows to keep the experiments fast.
train = train.sample(n=SAMPLE_SIZE, axis=0, random_state=RANDOM_STATE)

# Separate the target column from the feature matrix.
y = train['y'].values
X = train.drop(['y'], axis=1).values

# Hold out 20% of the sampled rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

print('train dataset: {0}; test dataset: {1}'.format(
    X_train.shape, X_test.shape))

train dataset: (160000, 13); test dataset: (40000, 13)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

#from sklearn.model_selection import KFold
#from sklearn.model_selection import cross_val_score

import time
# Timer reference used by the evaluation loop further down.
start = time.time()

# Candidate classifiers as (label, estimator) pairs. The commented-out
# entries are alternatives that are currently disabled.
models = [
    #("KNN", KNeighborsClassifier(n_neighbors=4)),
    #("LR", LinearRegression()),
    #("SVC", SVC(C=100, degree=2)),
    ("DTC", DecisionTreeClassifier(criterion='entropy', max_depth=16)),
    ("AdaBoost", AdaBoostClassifier()),
    #("GNB", MultinomialNB()),
]

#from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Score every candidate model with 2-fold cross-validation and report the
# mean score together with the wall-clock time spent on that model.
results = []
for name, model in models:
    # Restart the timer for each model so only its own CV run is measured.
    start = time.time()
    cv_result = cross_val_score(model, X, y, cv=2)
    results.append((name, cv_result))
    print("name: {}; cross val score: {}".format(name, cv_result.mean()))
    print('using time:',time.time()-start,'s')

name: DTC; cross val score: 0.671745028768698
using time: 0.6570868492126465 s

name: AdaBoost; cross val score: 0.6651676660908268
using time: 26.656273365020752 s

扫描二维码关注公众号,回复: 4615581 查看本文章
from sklearn.model_selection import ShuffleSplit
import sys
sys.path.append(r"C:\Users\Qiuyi\Desktop\Post-earthquake rebuild advice datas")
from utils import plot_learning_curve
#数据集里有个common文件夹,里面有一个utils.py文件

# The splitter does not depend on the model, so build it once.
# Two random 80/20 splits with a fixed seed.
cv = ShuffleSplit(n_splits=2, test_size=0.2, random_state=0)

# Draw one learning-curve figure per candidate model.
for name, model in models:
    plt.figure(figsize=(10, 6))
    # The title names the model only; the previous " Diabetes" suffix was a
    # leftover from another example and did not describe this dataset.
    plot_learning_curve(plt, model, 'Learn Curve for ' + name,
                        X, y, ylim=(0.0, 1.01), cv=cv)

在这里插入图片描述
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/weixin_34275246/article/details/85209409