# Competition code for the 东证期货 (Orient Futures) contest

import random
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tarfile
from six.moves import urllib

# frameworks for ML
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

# transformers for category variables
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# transformers for numerical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer

# transformers for combined variables
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures


# user-defined transformers
from sklearn.preprocessing import FunctionTransformer


# classification models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Load the tab-separated training set from a hard-coded local path.
data = pd.read_csv("C:/Users/july/Desktop/contest_basic_train.tsv", sep='\t')
# Feature frame: the raw data minus the listed columns (the target "Y" is among them).
data_se = data.drop(["REPORT_ID","ID_CARD","LOAN_DATE","AGENT","WORK_PROVINCE","Y"],axis=1)
# Target vector: an independent copy of the "Y" column.
data_label = data["Y"].copy()

def set_data(df):
    """Fill missing values in the HAS_FUND and EDU_LEVEL columns.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame containing ``HAS_FUND`` and ``EDU_LEVEL`` columns.

    Returns:
        The same DataFrame, with nulls in ``HAS_FUND`` replaced by 0 and
        nulls in ``EDU_LEVEL`` replaced by "专科".
    """
    # Column -> replacement used wherever that column is null.
    defaults = {"HAS_FUND": 0, "EDU_LEVEL": "专科"}
    for column, fill_value in defaults.items():
        missing = df[column].isnull()
        df.loc[missing, column] = fill_value
    return df
# Fill missing HAS_FUND / EDU_LEVEL values; set_data edits data_se in place and returns it.
data_set = set_data(data_se)

# NOTE(review): this imputer instance is not referenced again in the visible code;
# the numeric pipeline constructs its own Imputer.
imputer = Imputer(strategy="mean")
# Split the features: "SALARY" on its own, and every remaining column.
# NOTE(review): data_num is not referenced again in the visible code.
data_num =data_set["SALARY"].copy()
data_cat =data_set.drop("SALARY",axis=1)


class dataframese(BaseEstimator, TransformerMixin):
    """Pipeline step that selects columns from a DataFrame.

    Args:
        attribute_names: Column label(s) used to index the incoming frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, x, y=None):
        """Do nothing and return the transformer itself; there is no state to learn."""
        return self

    def transform(self, x):
        """Return the ``.values`` array of the configured columns of ``x``."""
        selected = x[self.attribute_names]
        return selected.values

# CategoricalEncoder comes from a local module named `categoricalencoder`
# (not visible here).
from  categoricalencoder import  CategoricalEncoder
# Column groups: "SALARY" goes through the numeric pipeline, every other
# remaining feature column through the categorical pipeline.
num = ["SALARY"]
cat =list(data_cat)
# Numeric branch: select SALARY, replace missing values with the column mean,
# then one-hot encode the result.
# NOTE(review): one-hot encoding an imputed numeric column is unusual —
# confirm this is intended rather than a scaler.
num_pipeline = Pipeline([
        ('selector',dataframese(num)),
        ("imputer",Imputer(strategy="mean")),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
  ])
# Categorical branch: select the remaining columns and one-hot encode them.
cat_pipeline = Pipeline([
        ('selector', dataframese(cat)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
  ])
# Run both branches on the same input and combine their outputs.
full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])


# Fit the transformers on the training features and produce the model input.
data_prepared =full_pipeline.fit_transform(data_set)


from sklearn.metrics import mean_squared_error, confusion_matrix

from sklearn.tree import DecisionTreeRegressor
# Fit a decision-tree regressor on the full prepared training set.
# NOTE(review): the same label is later fed to a classifier; confirm that a
# regressor is intended here.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(data_prepared, data_label)
# Predictions are made on the same rows the model was fitted on, so the
# RMSE below is a training error, not a held-out estimate.
data_predictions = tree_reg.predict(data_prepared)
tree_mse = mean_squared_error(data_label, data_predictions)
tree_rmse = np.sqrt(tree_mse)

# This rebinds cross_val_score, which was imported earlier from
# sklearn.cross_validation.
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation; the scorer returns negated MSE values.
scores = cross_val_score(tree_reg, data_prepared, 
                         data_label, scoring="neg_mean_squared_error", cv=10)
# Flip the sign back before taking the square root to get per-fold RMSE.
rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print a short summary of a set of scores.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as
            the NumPy array returned by ``cross_val_score``.
    """
    print("scores:", scores)
    # mean must be called; the bare attribute prints the method object
    # instead of the average.
    print("mean:", scores.mean())
    print("standard deviation:", scores.std())


from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# Fit a random-forest classifier on the full prepared training set.
forest_cla = RandomForestClassifier()
forest_cla.fit(data_prepared, data_label)
# Predictions on the training rows themselves.
data_predictions2 = forest_cla.predict(data_prepared)
# NOTE(review): despite the name, forest_mse holds a confusion matrix, not a
# mean squared error.
forest_mse = confusion_matrix(data_label, data_predictions2)

# 10-fold cross-validated accuracy of the classifier.
forest_scores = cross_val_score(forest_cla, data_prepared, 
                                data_label, scoring="accuracy", cv=10)

# (Blog page footer: "You may also like")
#
# Reposted from blog.csdn.net/weixin_41908529/article/details/81342336
# (Blog page footer: "Today's recommendations")