Scorecard Model: A Simple Implementation

import numpy as np
import pandas as pd
# from function import *  # helper module from the original post (not shown here);
# hedged sketches of the helpers it supplies (Lagrange fill, variance filter) appear below
# Load the data; its shape is (2500, 627)
data = pd.read_csv("train2500.csv", delimiter=",")
# data = data.iloc[:10000, :]
print(data.shape)
# Remove duplicate rows
data.drop_duplicates(inplace=True)
data["target"] = data["bad_good"]
del data["bad_good"]
# Step 1: rough preprocessing of the data
# Based on the field descriptions, first drop some unneeded fields: customer number, account-opening channel, document type
data.drop(data.columns[:3], axis=1, inplace=True)
# Drop any column in which a single value accounts for more than 90% of the rows
del_index = []
# for col in data.columns[:-1]:
#     if max(data[col].value_counts(normalize=True)) > 0.9:
#         del_index.append(col)
for i in range(len(data.columns[:-1])):
    if max(data[data.columns[i]].value_counts(normalize=True)) > 0.9:
        del_index.append(i)
data.drop(data.columns[del_index], axis=1, inplace=True)
print(data.shape)
# After this rough pass the remaining shape is (2500, 221)
# Step 2: split the columns into numeric and categorical
# Rule: if a column has more than 10 distinct values, treat it as continuous; otherwise treat it as categorical
numerical_feature = []    # list of numeric features (211 here)
categorical_feature = []  # list of categorical features (10 here)
# Loop over the columns and append each one to the appropriate list
for col in data.columns:
    if len(data[col].drop_duplicates()) > 10:
        numerical_feature.append(col)
    else:
        categorical_feature.append(col)
categorical_feature.remove("target")
"" "
handling missing values:
If the deletion of more than 80%, characterized derived constructs (deletion and two categories of non-missing as), and deletes the original feature
If the deletion is between 50% and 80%, the data sub-box process, and the missing samples as a separate category tag
if the deletion is between 30% and 50%, wherein the model is selected using a random padding, numeric fill regression, categorical use classification to fill,
"" "
# Check each column for missing values
notNullCol = []  # columns without missing values (221 here)
isNullCol = []   # columns with missing values (0 here)
for col in data.columns[:-1]:
    if sum(data[col].isnull()) > 0:  # a positive count means the column has missing values
        isNullCol.append(col)
    else:
        notNullCol.append(col)
print(len(notNullCol))
print(len(isNullCol))
# Compute the missing rate of a column
def MissingRate(df, col):
    return df[col].isnull().sum() / df.shape[0]

# When the missing rate exceeds the threshold, build a derived binary feature: 1 = missing, 0 = present
def MissingNewFeature(df, col):
    return df[col].isnull().astype(int)
# If between 50% and 80% is missing, bin the column and mark the missing samples as a separate category; return the processed df
def CategoricalMissingFeature(df, col, n=4):
    notnull_values = df[df[col].notnull()][col]
    df.loc[df[col].notnull(), col] = pd.qcut(notnull_values, n)
    df.loc[df[col].isnull(), col] = "Miss"
    return df
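
# NOTE: the original post pulls a Lagrange-interpolation helper from its own
# "function" module, which is not shown. The sketch below is an assumption, not
# the author's code: it fills each missing position of a column with a Lagrange
# polynomial fitted to up to k valid neighbours on each side (a common pattern
# built on scipy.interpolate.lagrange).
from scipy.interpolate import lagrange

def lagrange_fill(df, col, k=5):
    s = df[col].reset_index(drop=True)
    for n in s[s.isnull()].index:
        window = list(range(max(0, n - k), n)) + list(range(n + 1, min(len(s), n + 1 + k)))
        y = s.iloc[window]
        y = y[y.notnull()]
        if len(y) >= 2:  # need at least two points to fit a polynomial
            s.iloc[n] = lagrange(y.index, list(y))(n)
    return s.values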
# If less than 50% is missing: Lagrange interpolation for numeric features, a classifier for categorical features
def Model_fillna(df, col, feature, method):
    """
    :param df: DataFrame to fill
    :param col: column with missing values
    :param feature: randomly selected complete columns used as predictors
    :param method: "clf" for classification filling, "reg" for regression-style filling
    :return: the DataFrame with col filled
    """
    train_x = df[df[col].notnull()][feature]
    test_x = df[df[col].isnull()][feature]
    train_y = df[df[col].notnull()][col]
    if method == "clf":
        # categorical columns: fill with a random forest classifier
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier()
        clf.fit(train_x, train_y)
        df.loc[df[col].isnull(), col] = clf.predict(test_x)
    elif method == "reg":
        # numeric columns: fill by Lagrange interpolation (helper sketched above)
        df[col] = lagrange_fill(df, col)
    return df
# Start handling the missing values
for col in isNullCol:
    if MissingRate(data, col) > 0.8:
        data[col] = MissingNewFeature(data, col)
    elif 0.5 < MissingRate(data, col) <= 0.8:
        data = CategoricalMissingFeature(data, col)
    elif MissingRate(data, col) <= 0.5:
        feature = list(np.random.choice(notNullCol, 3, replace=False))
        if col in numerical_feature:
            data = Model_fillna(data, col, feature, method="reg")
        else:
            data = Model_fillna(data, col, feature, method="clf")
# The missing-value handling can change which variables are numeric and which are categorical, so reclassify
numerical_feature = []    # list of numeric features (211 here)
categorical_feature = []  # list of categorical features (10 here)
for col in data.columns:
    if len(data[col].drop_duplicates()) > 10:
        numerical_feature.append(col)
    else:
        categorical_feature.append(col)
categorical_feature.remove("target")
"" "
. Up to here, all the data following completion of a coarse pre-start feature selection
" ""
# Processing of the numeric variables
# First z-score standardize the numeric data
print(data.shape)
from sklearn.preprocessing import StandardScaler
for col in numerical_feature:
    data[col] = StandardScaler().fit_transform(np.array(data[col]).reshape(-1, 1)).ravel()
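
# NOTE: the variance/ANOVA filter below also comes from the post's unshown
# "function" module. The sketch below is an assumption, not the author's code:
# it keeps the numeric features whose ANOVA F-test against the target is
# significant at level alpha, using sklearn's f_classif.
from sklearn.feature_selection import f_classif

def variance(df, cols, alpha=0.05):
    F, p = f_classif(df[cols], df["target"])
    drop_cols = [c for c, pv in zip(cols, p) if pv > alpha]
    return df.drop(columns=drop_cols)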
# ANOVA filter on the numeric features
data = variance(data, numerical_feature)  # after this step roughly 154 features remain
# For the categorical variables, compute each feature's WOE and IV and filter on the IV value
for col in categorical_feature[:]:  # iterate over a copy, since the list is modified inside the loop
    bad_total = sum(data["target"])  # total number of bad samples
    bad = data.groupby([col])["target"].sum()
    bad_percent = np.array(bad) * 1.0 / bad_total
    good_total = data.shape[0] - bad_total  # total number of good samples
    good = data.groupby([col])["target"].count() - bad
    good_percent = np.array(good) * 1.0 / good_total
    diff = bad_percent - good_percent
    percent = bad_percent / good_percent
    IV = 0
    for i in range(len(percent)):
        if percent[i] != 0:
            IV += diff[i] * np.log(percent[i])
    if IV < 0.2:
        del data[col]
        categorical_feature.remove(col)
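
# For reference: per category i, WOE_i = ln(bad_percent_i / good_percent_i) and
# IV = sum_i (bad_percent_i - good_percent_i) * WOE_i. The helper below is an
# illustrative rewrite of the loop above, not code from the post:
def woe_iv(df, col, target="target"):
    bad = df.groupby(col)[target].sum()
    good = df.groupby(col)[target].count() - bad
    bad_pct = bad / bad.sum()
    good_pct = good / good.sum()
    woe = np.log(bad_pct / good_pct)
    iv = ((bad_pct - good_pct) * woe).sum()
    return woe, iv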
# Bad-rate encode the remaining categorical features
for col in categorical_feature:
    bad_total = sum(data["target"])  # total number of bad samples
    bad = data.groupby([col])["target"].sum()
    percent = np.array(bad) * 1.0 / bad_total
    dicts = dict(zip(bad.index, percent))
    data[col + "_badEncoding"] = data[col].map(dicts)
    del data[col]
# Drop features by recursive feature elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
estimator = DecisionTreeClassifier()
rfe = RFE(estimator=estimator, n_features_to_select=80)
X = data.drop(columns=["target"])  # the encoded columns were appended after "target", so select it out explicitly
rfe.fit_transform(np.array(X), np.array(data["target"]))
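# To see which columns survived the elimination (rfe.support_ is sklearn's
# boolean mask of selected features):
selected_columns = X.columns[rfe.support_]
print(list(selected_columns))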

Origin: www.cnblogs.com/daguonice/p/11420248.html