DNN used-car price prediction: complete code

Preface

I have been studying deep learning recently, and I tried using a DNN to predict used-car prices in the Tianchi competition. The features are the same ones used for the earlier ensemble (tree) models. After repeatedly tuning the learning rate, the number of hidden layers and neurons, the optimizer, the activation function, the number of iterations, the batch size, and the KFold setup, the DNN reached a score close to that of the earlier ensemble models, while training much faster than CatBoost and LightGBM, since it only needs a handful of passes to get comparable results. Blending it with the ensemble models then improved my previous score from 422 to 406 (a simple blending sketch is given at the end of the post). It was a good exercise in DNNs; in the end, the upper limit of the model still depends on feature engineering. The complete DNN code is attached below; with cross-validation and averaging of the fold predictions it reaches about 428 online. Anyone who needs it is welcome to take it.


import pandas as pd
import numpy as np
import Meancoder  # custom module containing the MeanEncoder implementation
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import torch
import torch.nn as nn

# Load the space-separated Tianchi used-car datasets
df = pd.read_csv('/train.csv', sep=' ')
test = pd.read_csv('/test.csv', sep=' ')

def date_process(x):
    # Parse an 8-digit integer date such as 20160405; some regDate values
    # contain month 00, which is clamped to 1 so datetime() accepts them
    year = int(str(x)[:4])
    month = int(str(x)[4:6])
    day = int(str(x)[6:8])
    if month < 1:
        month = 1
    return datetime(year, month, day)
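# Expand the registration and creation dates into year/month/day features
# and compute the car's age in days and (approximate) years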
df['regDate'] = df['regDate'].apply(date_process)
df['creatDate'] = df['creatDate'].apply(date_process)
df['regDate_year'] = df['regDate'].dt.year
df['regDate_month'] = df['regDate'].dt.month
df['regDate_day'] = df['regDate'].dt.day
df['creatDate_year'] = df['creatDate'].dt.year
df['creatDate_month'] = df['creatDate'].dt.month
df['creatDate_day'] = df['creatDate'].dt.day
df['car_age_day'] = (df['creatDate'] - df['regDate']).dt.days
df['car_age_year'] = round(df['car_age_day'] / 365, 1)

# Clean up anomalies and missing values in the training set
df['notRepairedDamage'] = df['notRepairedDamage'].replace('-', 0.0).astype('float64')
df['power'] = df['power'].clip(1, 600)   # cap extreme power values
df['v_13'] = df['v_13'].clip(upper=6)
df['v_14'] = df['v_14'].clip(upper=4)
df['fuelType'] = df['fuelType'].fillna(0)
df['gearbox'] = df['gearbox'].fillna(0)
df['bodyType'] = df['bodyType'].fillna(0)
df['model'] = df['model'].fillna(0)

# Apply the same date processing to the test set
test['regDate'] = test['regDate'].apply(date_process)
test['creatDate'] = test['creatDate'].apply(date_process)
test['regDate_year'] = test['regDate'].dt.year
test['regDate_month'] = test['regDate'].dt.month
test['regDate_day'] = test['regDate'].dt.day
test['creatDate_year'] = test['creatDate'].dt.year
test['creatDate_month'] = test['creatDate'].dt.month
test['creatDate_day'] = test['creatDate'].dt.day
test['car_age_day'] = (test['creatDate'] - test['regDate']).dt.days
test['car_age_year'] = round(test['car_age_day'] / 365, 1)

test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', 0.0).astype('float64')
test['power'] = test['power'].clip(1, 600)
test['v_13'] = test['v_13'].clip(upper=6)
test['v_14'] = test['v_14'].clip(upper=4)
test['fuelType'] = test['fuelType'].fillna(0)
test['gearbox'] = test['gearbox'].fillna(0)
test['bodyType'] = test['bodyType'].fillna(0)
test['model'] = test['model'].fillna(0)

# Pairwise product/sum/difference crosses of selected anonymous v_ features
num_cols = [0,2,3,6,8,10,12,14]
for index, value in enumerate(num_cols):
    for j in num_cols[index+1:]:
        df['new'+str(value)+'*'+str(j)]=df['v_'+str(value)]*df['v_'+str(j)]
        df['new'+str(value)+'+'+str(j)]=df['v_'+str(value)]+df['v_'+str(j)]
        df['new'+str(value)+'-'+str(j)]=df['v_'+str(value)]-df['v_'+str(j)]
        test['new'+str(value)+'*'+str(j)]=test['v_'+str(value)]*test['v_'+str(j)]
        test['new'+str(value)+'+'+str(j)]=test['v_'+str(value)]+test['v_'+str(j)]
        test['new'+str(value)+'-'+str(j)]=test['v_'+str(value)]-test['v_'+str(j)]
# Interactions between each anonymous feature and the car's age
for i in range(15):
    df['new'+str(i)+'*year']=df['v_'+str(i)] * df['car_age_year']
    test['new'+str(i)+'*year']=test['v_'+str(i)] * test['car_age_year']

# A few extra pairwise differences among v_3, v_5, v_1 and v_11
num_cols1 = [3,5,1,11]
for index, value in enumerate(num_cols1):
    for j in num_cols1[index+1:]:
        df['new'+str(value)+'-'+str(j)]=df['v_'+str(value)]-df['v_'+str(j)]
        test['new'+str(value)+'-'+str(j)]=test['v_'+str(value)]-test['v_'+str(j)]

# Drop identifier and constant-like columns; separate out the target
X = df.drop(columns=['price','SaleID','seller','offerType', 'name','creatDate','regionCode','regDate'])
test = test.drop(columns=['SaleID','seller','offerType', 'name','creatDate','regionCode','regDate'])
Y = df['price']

# Target (mean) encoding of high-cardinality features
class_list = ['model','brand','power','v_0','v_3','v_8','v_12']  # + date_cols, 'v_6','v_10','v_14','v_2'
ME = Meancoder.MeanEncoder(class_list, target_type='regression')
X = ME.fit_transform(X, Y)    # fit the encoder on the training data and transform it
test = ME.transform(test)     # apply the same encoding to the test set
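# Note: the Meancoder module imported above is not included in this post.
# The sketch below is a minimal stand-in with the same
# MeanEncoder(features, target_type=...) / fit_transform / transform interface,
# implementing smoothed out-of-fold target mean encoding. It is only an
# illustration (save it as Meancoder.py if you want to run the code without
# the original module); the author's actual implementation may differ.
import numpy as np
from sklearn.model_selection import KFold

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5,
                 target_type='regression', prior_weight=10):
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.target_type = target_type
        self.prior_weight = prior_weight   # smoothing strength toward the global mean
        self.maps_ = {}                    # per-feature mapping: category -> encoded value
        self.prior_ = None                 # global target mean

    def _smoothed_means(self, frame, col):
        # smoothed mean = (group_mean * group_count + prior * weight) / (group_count + weight)
        stats = frame.groupby(col)['__target__'].agg(['mean', 'count'])
        return (stats['mean'] * stats['count'] + self.prior_ * self.prior_weight) \
               / (stats['count'] + self.prior_weight)

    def fit_transform(self, X, y):
        X = X.copy()
        tmp = X[self.categorical_features].copy()
        tmp['__target__'] = np.asarray(y)
        self.prior_ = tmp['__target__'].mean()
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=0)
        for col in self.categorical_features:
            new_col = col + '_mean_enc'
            X[new_col] = np.nan
            # encode each row using folds it did not belong to, to limit target leakage
            for fit_idx, trans_idx in kf.split(tmp):
                means = self._smoothed_means(tmp.iloc[fit_idx], col)
                X.iloc[trans_idx, X.columns.get_loc(new_col)] = \
                    tmp.iloc[trans_idx][col].map(means).values
            X[new_col] = X[new_col].fillna(self.prior_)
            # full-data mapping, reused when transforming the test set
            self.maps_[col] = self._smoothed_means(tmp, col)
        return X

    def transform(self, X):
        X = X.copy()
        for col in self.categorical_features:
            X[col + '_mean_enc'] = X[col].map(self.maps_[col]).fillna(self.prior_)
        return X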

# Standardize train and test together, then split back (the training set has 150000 rows)
df_concat = pd.concat([X, test], ignore_index=True)
df_concat = StandardScaler().fit_transform(df_concat)
n_train = len(X)
X1 = df_concat[:n_train]
test1 = df_concat[n_train:]

# Model settings
input_size = X1.shape[1]   # number of input features (143 with the features constructed above)
hidden_size = 320
num_classes = 1
batch_size = 2048
learning_rate = 0.05
x = torch.tensor(X1, dtype=torch.float32)
y = torch.FloatTensor(Y.to_numpy()).view(-1, 1)   # target as a column vector
test = torch.tensor(test1, dtype=torch.float32)

# A simple two-layer MLP: Linear -> ReLU -> Linear with a single regression output
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

net = Net(input_size, hidden_size, num_classes)
print(net)

criterion = nn.L1Loss()   # L1 loss = MAE, the competition's evaluation metric
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

result = []
mean_score = 0
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
for train_index, test_index in kf.split(x):
    x_train = x[train_index]
    y_train = y[train_index]
    x_test = x[test_index]
    y_test = y[test_index]
    # Note: the same net keeps training from fold to fold (it is not re-initialized),
    # so the printed validation loss is optimistic rather than a clean CV estimate
    for epoch in range(2000):
        # mini-batch training over this fold's training rows
        for start in range(0, len(x_train), batch_size):
            end = min(start + batch_size, len(x_train))
            xx = x_train[start:end]
            yy = y_train[start:end]
            outputs = net(xx)
            loss = criterion(outputs, yy)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # evaluate on the held-out fold and predict the test set
    with torch.no_grad():
        y_pred = net(x_test)
        loss1 = criterion(y_pred, y_test)
        mean_score += loss1.item() / n_folds
        print('Validation loss (MAE): {}'.format(loss1.item()))
        test_pred = net(test)
        result.append(test_pred)
# Model evaluation and submission file
print('Mean validation loss (MAE): {}'.format(mean_score))
cat_pre = sum(result) / n_folds        # average the per-fold test predictions
cat_pre = cat_pre.detach().numpy()
ret = pd.DataFrame(cat_pre, columns=['price'])
ret.to_csv('/DNN.csv')
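
As mentioned in the preface, the last improvement (422 to 406) came from blending the DNN output with the earlier ensemble-model predictions. That blending code is not part of this post; the sketch below only illustrates the idea with a simple weighted average. The tree-model file name and the 0.5/0.5 weights are assumptions here and would need to be tuned on a validation set.

import pandas as pd

# Hypothetical inputs: the DNN submission written above and a tree-model
# submission produced earlier (file name and column layout are assumed).
dnn_pred = pd.read_csv('/DNN.csv', index_col=0)
tree_pred = pd.read_csv('/tree_model_pred.csv', index_col=0)  # assumed file name

# Weighted average of the two price predictions; the 0.5/0.5 weights are
# placeholders and should be chosen on a hold-out set.
blend = dnn_pred.copy()
blend['price'] = 0.5 * dnn_pred['price'] + 0.5 * tree_pred['price']
blend.to_csv('/blend.csv')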

Origin: blog.csdn.net/weixin_46685991/article/details/129975167