# Risk control model demo

# Lagrange polynomial
# Imported under an alias: the function defined below is itself named
# ``lagrange`` and would otherwise shadow SciPy's interpolator.
from scipy.interpolate import lagrange as scipy_lagrange


def lagrange(df, col, k=5):
    """Fill missing values of ``df[col]`` by local Lagrange interpolation.

    Every missing entry is replaced by the value, at that row, of the
    Lagrange polynomial fitted through the non-null entries found among the
    ``k`` rows before and the ``k`` rows after it. Gaps are processed from
    top to bottom, so a value filled earlier can act as a neighbour for a
    later gap.

    Row positions (0, 1, 2, ...) are used as the x-coordinates, so the
    function works for any index, not only a default integer one. The window
    is clipped at both ends of the column instead of reaching outside it.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to repair. Its non-null values must be
            convertible to ``float``.
        k: Number of rows taken on each side of a missing value.

    Returns:
        The repaired column, i.e. ``df[col]`` after filling. An entry stays
        missing when its window contains no known value at all.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def polyinterp_column(s, n, k=k):
        """Interpolate position ``n`` of ``s``; return None without neighbours."""
        first = max(n - k, 0)
        last = min(n + k + 1, len(s))
        positions = [p for p in range(first, last) if p != n]
        window = s.iloc[positions]
        known = window.notnull().tolist()
        xs = [p for p, ok in zip(positions, known) if ok]
        ys = [float(v) for v, ok in zip(window.tolist(), known) if ok]
        if not xs:
            return None
        return float(scipy_lagrange(xs, ys)(n))

    # Work on a copy and assign it back once: this avoids chained assignment
    # (``df[col][i] = ...``), which pandas does not guarantee to write through.
    column = df[col].copy()
    missing = [pos for pos, flag in enumerate(column.isnull().tolist()) if flag]
    for pos in missing:
        value = polyinterp_column(column, pos)
        if value is not None:
            column.iloc[pos] = value

    df[col] = column
    return df[col]
# Drop numerical features whose variance is too low to be informative
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
def variance(df, col, threshold=1):
    """Drop ``col`` from ``df`` when its variance is below ``threshold``.

    The variance is computed with ``numpy.ndarray.var``, i.e. the population
    variance (``ddof=0``). A column containing NaN has a NaN variance; the
    comparison is then false and the column is kept.

    Args:
        df: DataFrame to filter; the column is deleted from it in place.
        col: Label of the numeric column to examine.
        threshold: Columns whose variance is strictly below this value are
            removed. Defaults to 1.

    Returns:
        The same ``df`` object, with ``col`` removed if it was low-variance.
    """
    column_variance = np.array(df[col]).var()
    if column_variance < threshold:
        del df[col]
    return df
def KmeansAbnormal(df, k, spec, random_state=None):
    """Remove outliers from ``df`` based on distance to K-means centres.

    The columns are standardised, the rows are clustered with K-means, and
    each row's distance to its own cluster centre is divided by the mean
    distance of that cluster. Rows whose ratio is below ``spec`` are kept.

    Relies on ``KMeans`` (scikit-learn) and ``np`` (numpy) being imported at
    module level.

    Args:
        df: Incoming data; every column is used as a numeric feature.
            NOTE(review): presumably free of missing values, since KMeans
            rejects NaN input -- confirm against callers.
        k: Number of cluster centres.
        spec: Threshold on the relative distance; rows with a ratio greater
            than or equal to it are treated as outliers.
        random_state: Optional seed forwarded to ``KMeans`` to make the
            clustering reproducible.

    Returns:
        A DataFrame with the rows of ``df`` (original, unscaled values) that
        are not outliers. An empty ``df`` is returned as an empty copy.
    """
    if df.empty:
        return df.copy()

    # Standardise with the standard deviation; a constant column has zero
    # spread and is divided by 1 instead, so it contributes 0 everywhere.
    spread = df.std(ddof=0).replace(0, 1.0)
    points = ((df - df.mean()) / spread).to_numpy(dtype=float)

    km = KMeans(n_clusters=k, max_iter=200, random_state=random_state)
    labels = km.fit(points).labels_

    # Euclidean distance from every row to the centre of its own cluster.
    distance = np.linalg.norm(points - km.cluster_centers_[labels], axis=1)

    # Mean distance per cluster; max(size, 1) guards against an empty cluster.
    cluster_sizes = np.bincount(labels, minlength=k)
    distance_sums = np.bincount(labels, weights=distance, minlength=k)
    average_distance = distance_sums / np.maximum(cluster_sizes, 1)

    # Relative distance. When a cluster's mean distance is 0 every member
    # sits on the centre, so the ratio is defined as 0 rather than 0/0.
    reference = average_distance[labels]
    ratio = np.divide(
        distance,
        reference,
        out=np.zeros_like(distance),
        where=reference > 0,
    )

    # Keep the rows below the threshold.
    return df.loc[ratio < spec]

# Guess you like

# Origin: www.cnblogs.com/daguonice/p/11420240.html