文件202106_10000_drop.tsv数据
DATA_MONTH ITEM_ID BRAND_ID ITEM_PRICE ITEM_SALES_VOLUME ITEM_SALES_AMOUNT ITEM_FAV_NUM TOTAL_EVAL_NUM ITEM_STOCK USER_ID
202106 558805384323 10.0 29 290.0 204.0 174.0 64.0 1049653664
202106 559237486452 3406189.0 29.5 37 1091.5 49404.0 12631.0 564912.0 3085368336
202106 560359123903 325792419.0 18.0 2 36.0 4.0 6.0 14.0 1659743877
202106 562152845686 4357425.0 299.0 69 20631.0 1020.0 296.0 782.0 1015350104
202106 562869175959 51401707.0 6.5 3 19.5 1689945214
202106 562916820034 3326980.0 25.0 20 500.0 1742.0 368.0 18624.0 2258915412
202106 563074665088 115924646.0 4.0 20 80.0 6.0 3.0 120.0 1689945214
202106 563225896544 172882442.0 39.9 4 159.6 36.0 7.0 9.0 3457531540
202106 564120320875 699832083.0 69.0 10 690.0 3816.0 116.0 13821.0 3076303883
202106 564213113614 12141068.0 89.0 3 267.0 120.0 14.0 2240.0 2132910129
202106 564900159316 17.4 12 208.8 114.0 28.0 39.0 1049653664
202106 566444677669 78935831.0 239.0 1 239.0 1430.0 91.0 49.0 793515647
202106 567481931472 18.4 1 18.4 54.0 13.0 2.0 1049653664
202106 567609110867 4632607825.0 69.9 127241 8894145.9 1642308.0 510265.0 59406.0 1773211220
202106 624599883556 13.8 1 13.8 4.0 2.0 1478.0 1709315109
202106 633352837163 1044912147.0 85.0 10 850.0 40129.0 3667813497
202106 633404796991 84105527.0 50.0 1183 59150.0 725677994
202106 633544912008 31840.0 159.0 2 318.0 1372.0 1918545422
202106 633607380699 16757534.0 5.8 298 1728.4 59804.0 2777020216
202106 633669276086 9417664138.0 87.0 2180 189660.0 124031.0 2209054917954
202106 634160098230 358014032.0 599.0 37 22163.0 1713424658
202106 634256348035 141089319.0 6.6 1 6.6 2387.0 2206491393401
202106 634391310219 3307903157.0 2.9 264 765.6 6659.0 2201195930685
202106 634400640726 4402936824.0 68.0 6 408.0 1136.0 2206624915443
202106 634495827445 5302962172.0 68.0 8 544.0 6571.0 2206741768412
202106 634544850800 31557353.0 240.0 28 6720.0 1121.0 2204867871836
202106 634580040933 93368.0 9.9 2 19.8 2483.0 2209561837738
202106 634598728943 954854018.0 22.8 1 22.8 19986.0 2200728852878
202106 605469635919 73387830.0 69.0 240 16560.0 898.0 222.0 21150.0 2077787057
202106 605486212835 235460937.0 198.0 1 198.0 720.0 231.0 65801.0 3461088480
202106 605563723673 17502779.0 54.0 109 5886.0 832978172
202106 605933084083 754254510.0 980.0 8 7840.0 3258.0 549.0 3.0 2318430357
202106 606165305693 1313440085.0 34.8 20 696.0 12.0 7.0 690.0 3438682238
202106 606674410711 15.8 2 31.6 38.0 16.0 1364.0 2200737190895
202106 606864164711 21210.0 299.0 9 2691.0 1132.0 259.0 2449.0 2206500962966
202106 606922881613 1000.0 255 255000.0 4.0 7.0 9999999.0 3469868662
202106 606927511754 1027078381.0 18.0 18 324.0 658.0 220.0 15432.0 2205370590856
202106 609620971975 1650548186.0 6130.0 13 79690.0 692.0 12.0 28.0 3159428248
202106 609636651452 48072016.0 179.9 311 55948.9 725677994
202106 609862526960 1318578578.0 18.0 2 36.0 138.0 12.0 6910.0 2200535469688
202106 609863137975 31342.0 449.0 13 5837.0 2531.0 646266812
202106 609947413467 606252880.0 28.0 20761 581308.0 725677994
202106 635221013280 3033154219.0 26.1 1 26.1 139980.0 2200801414962
202106 635251095064 74881026.0 35.0 7 245.0 312.0 458401971
202106 635461034516 612466455.0 39.8 19 756.2 39329.0 2022561587
202106 635528589886 2387021868.0 129.0 1 129.0 3932.0 4104190852
202106 635530018805 80798.0 269.0 7 1883.0 1627473180
202106 635921562666 730148995.0 198.0 9 1782.0 2264895964
202106 635984964565 3719775.0 2299.0 2 4598.0 8.0 2210020522863
202106 636177822114 1867398035.0 39.9 2 79.8 3472852359
202106 636246811612 390194493.0 196.0 3 588.0 2206408294750
202106 636455837645 56200895.0 58.0 3 174.0 2938385359
202106 637020133801 79000978.0 59.0 9 531.0 686582311
202106 637423690521 3748460229.0 25.0 128 3200.0 4118751143
202106 610537314226 7135186.0 26.9 2 53.8 1108.0 167.0 24979.0 1667650872
202106 610646688399 15091109.0 8.8 1 8.8 16.0 14.0 39926.0 3079354541
202106 610738302193 1952466888.0 322.0 1 322.0 46.0 2.0 52.0 3816993362
202106 610820918839 53363672.0 298.0 1 298.0 4.0 1.0 18.0 497447433
202106 610899947394 54917496.0 43.0 14 602.0 382.0 33.0 13150.0 1994142888
202106 610964323695 8237498.0 12.9 7 90.3 10.0 22.0 569.0 3382215252
202106 610965379746 2221593704.0 9.9 6 59.4 334.0 105.0 149314.0 2206737579137
202106 611118365400 71525148.0 358.0 355 127090.0 725677994
202106 612031140235 27997.0 11.57 2 23.14 18.0 38.0 15765.0 2206389114430
202106 612439840638 2.5 7153 17882.5 1250.0 831.0 93672.0 2207264919687
202106 612486516163 5994766479.0 35.0 8 280.0 154.0 26.0 1861.0 2206941237576
202106 613133031335 7005516.0 1599.0 1 1599.0 2206384590783
202106 613140329736 861348621.0 2.6 52334 136068.4 20356.0 22010.0 13007.0 3937219703
202106 624687702597 1154246537.0 15.8 75 1185.0 10154.0 4889.0 272284.0 2816081886
202106 624701026778 62852543.0 22.55 1 22.55 0.0 1.0 243.0 2206784883994
202106 624715683705 1534945333.0 845.0 10 8450.0 156046.0 2208728183304
202106 624746422943 898646790.0 57.0 4 228.0 460.0 100.0 14754.0 2206419124132
202106 624860134317 112256961.0 3.4 26 88.4 14.0 10.0 341.0 2047979514
202106 624932834746 390194493.0 197.0 4 788.0 450.0 18.0 4183.0 2206408294750
202106 625010844162 51863835.0 148.0 2 296.0 10.0 1.0 1001.0 2087094604
202106 637592516499 31703.0 999.0 1 999.0 2206384590783
202106 637607515618 203418295.0 13.9 156 2168.4 2200664412979
202106 637715678524 40349909.0 11.92 13 154.96 3368844370
202106 637855800332 872004454.0 25.98 3 77.94 2209088959988
202106 637915710388 5537632077.0 3.8 19493 74073.4 3937219703
202106 637952522971 383890508.0 16.8 2 33.6 2200703564947
202106 637956198954 228638946.0 38.8 277 10747.6 2203121766205
202106 637987277927 1626406977.0 49.0 3 147.0 3328346327
202106 638204557269 8758516.0 38.4 5 192.0 3360919541
202106 638299973949 109024051.0 25.0 14 350.0 757924657
202106 638318880776 3856489.0 899.0 3 2697.0 378537552
202106 638522152951 78935831.0 278.0 4 1112.0 793515647
202106 638589131457 142586119.0 168.0 2 336.0 2122193287
202106 638628894161 2502512991.0 198.0 1 198.0 2208416096173
202106 613502291263 106743813.0 58.0 8 464.0 822.0 252.0 19156.0 2959491051
202106 613845437846 21.8 62 1351.6 9999.0 434524724
202106 614190419052 3786707.0 59.9 8811 527778.9 725677994
202106 614290182940 671968979.0 4.22 1 4.22 98427.0 2207522017618
202106 614319628243 4062495.0 22.9 2789 63868.1 725677994
202106 614321315846 1598458854.0 88.0 2 176.0 1690.0 2161.0 5152.0 3070665310
202106 614515294056 107806.0 198.0 1 198.0 104.0 62.0 33180.0 3365249032
202106 614729941830 181942151.0 2590.0 1 2590.0 1734541095
202106 614923855881 104789361.0 49.0 6 294.0 3374.0 272.0 49183.0 1047125087
202106 615181441064 674538951.0 39.0 45 1755.0 3730.0 396.0 1705.0 2205013289
202106 615522665465 141092588.0 25.0 9 225.0 64.0 23.0 2128.0 2369187990
202106 615630292674 60161005.0 29.9 43 1285.7 182.0 49.0 72.0 1707321021
回归填补缺失值
# coding:utf-8
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Matplotlib global configuration so Chinese text renders in figures.
mpl.rcParams['font.sans-serif'] = ['FangSong'] # pick a Chinese font for labels
mpl.rcParams['axes.unicode_minus'] = False # render '-' as a minus sign, not an empty box
# NOTE(review): plt.rcParams is the very same object as mpl.rcParams, so the
# SimHei setting below overrides the FangSong setting above.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False # show the minus sign normally
def check_data_and_process(need_fill_col, need_fill_file):
    """Load the TSV, report missing-value counts, and build a training set.

    Every column that has missing values, other than ``need_fill_col`` itself,
    is dropped (columns are imputed one at a time), along with the columns
    known to be useless for training.

    Parameters:
        need_fill_col: name of the column whose gaps we want to fill.
        need_fill_file: path of the tab-separated input file.

    Returns:
        (data, label, need_fill_col, f, data_all) where
        data/label are the complete rows' features/target for training,
        f is the full original DataFrame, and data_all is f minus the
        dropped columns (target gaps still present).
    """
    f = pd.read_csv(need_fill_file, sep="\t", encoding="utf-8")
    null_col = []  # columns that contain missing values
    no_need_col = ["DATA_MONTH"]  # columns known to be useless for training
    for c in f.columns:
        # single pass per column instead of building a filtered frame twice
        n_missing = f[c].isnull().sum()
        if n_missing != 0:
            null_col.append(c)
            print("字段:", c, "缺失数:", n_missing)
    # Columns to drop: the known-useless ones plus every OTHER incomplete
    # column — missing values are filled one column at a time.
    drop_col = list(no_need_col)
    drop_col.extend(c for c in null_col if c != need_fill_col)
    data_all = f.drop(columns=drop_col)  # target's missing and non-missing rows together
    data_need_to_train = data_all[~data_all[need_fill_col].isnull()]  # complete rows -> training data
    data = data_need_to_train.drop(columns=[need_fill_col])
    label = data_need_to_train[need_fill_col]
    return data, label, need_fill_col, f, data_all
def train_and_chose_model(model, data, label):
    """Fit a regressor on a min-max-scaled 80/20 split and print its scores.

    Prints the train/test R^2 (``model.score``) and the train/test MSE so the
    caller can compare candidate models. Nothing is returned; the model is
    fitted in place.

    Parameters:
        model: an unfitted sklearn-style regressor.
        label: target column; scaled to [0, 1] before fitting.
        data: feature matrix (DataFrame or array-like).
    """
    data = np.array(data)
    label = np.array(label)
    scaler_1 = MinMaxScaler(feature_range=(0, 1))
    scaler_2 = MinMaxScaler(feature_range=(0, 1))
    data = scaler_1.fit_transform(data)  # scale features to [0, 1]
    # .ravel() keeps y 1-D: fitting with an (n, 1) target makes sklearn emit
    # a DataConversionWarning and ravel it internally anyway.
    label = scaler_2.fit_transform(label.reshape(-1, 1)).ravel()
    data_tr, data_te, labels_tr, labels_te = train_test_split(data, label, test_size=0.2, random_state=10)
    model.fit(data_tr, labels_tr)  # train the model
    score_train = model.score(data_tr, labels_tr)
    score_test = model.score(data_te, labels_te)
    print(str(model) + "训练集准确率为:" + str(score_train))
    print(str(model) + "测试集准确率为:" + str(score_test))  # R^2 can be negative on a poor fit
    y_train_pre = model.predict(data_tr)  # predictions on the training split
    y_test_pre = model.predict(data_te)  # predictions on the held-out split
    print("训练集均方误差:", mean_squared_error(labels_tr, y_train_pre))
    print("测试集均方误差:", mean_squared_error(labels_te, y_test_pre))
def fill_data(model, data, label, need_fill_col, f, data_all):
    """Train a regressor on complete rows and impute the missing target values.

    Predictions are inverse-scaled back to the target's original range,
    written into ``f`` row by row, and the patched table is saved to
    ``<need_fill_col>已补202106_10000_drop.tsv``.

    Parameters:
        model: an unfitted sklearn-style regressor.
        data, label: training features/target (complete rows only).
        need_fill_col: name of the column being imputed.
        f: full original DataFrame (modified in place, then saved).
        data_all: ``f`` minus the dropped columns, target gaps included.
    """
    # rows where the target is missing, without the (all-NaN) target column
    data_need_to_pre = data_all[data_all[need_fill_col].isnull()].drop(columns=[need_fill_col])
    data = np.array(data)
    label = np.array(label)
    scaler_1 = MinMaxScaler(feature_range=(0, 1))
    scaler_2 = MinMaxScaler(feature_range=(0, 1))
    # BUGFIX: fit the feature scaler ONCE on all rows, then only transform the
    # subsets. The original re-ran fit_transform on each subset, so prediction
    # rows were scaled with a different min/max than the training rows.
    scaler_1.fit(data_all.drop(columns=[need_fill_col]))
    data_to_pre = scaler_1.transform(data_need_to_pre)
    data = scaler_1.transform(data)
    # scaler_2 is kept separate: it is inverted below to recover real values
    label = scaler_2.fit_transform(label.reshape(-1, 1)).ravel()
    data_tr, data_te, labels_tr, labels_te = train_test_split(data, label, test_size=0.2, random_state=10)
    model.fit(data_tr, labels_tr)  # train the model
    score_train = model.score(data_tr, labels_tr)
    score_test = model.score(data_te, labels_te)
    print(str(model) + "训练集准确率为:" + str(score_train))
    print(str(model) + "测试集准确率为:" + str(score_test))  # R^2, may be negative on a poor fit
    y_train_pre = model.predict(data_tr)
    y_test_pre = model.predict(data_te)
    y_need_pred = model.predict(data_to_pre)  # predictions for the missing rows
    print("训练集均方误差:", mean_squared_error(labels_tr, y_train_pre))
    print("测试集均方误差:", mean_squared_error(labels_te, y_test_pre))
    # map the predictions back to the target column's original scale
    data_need_pred = scaler_2.inverse_transform(y_need_pred.reshape(-1, 1))
    # walk the missing rows' indices and write each prediction back in place
    for i, x in zip(f[f[need_fill_col].isnull()][need_fill_col].index, data_need_pred):
        f.loc[i, need_fill_col] = x
    f.to_csv(need_fill_col + "已补" + "202106_10000_drop.tsv", sep="\t", index=False)
def check(file):
    """Print the missing-value count of every incomplete column in a TSV file."""
    table = pd.read_csv(file, sep="\t", encoding="utf-8")
    incomplete = []  # names of columns that have at least one missing value
    for column in table.columns:
        missing = table[table[column].isnull()][column]
        if len(missing) != 0:
            incomplete.append(column)
            print("字段:", column, "缺失数:", len(table[table[column].isnull()][column]))
def main():
    """Impute the ITEM_STOCK column of 202106_10000_drop.tsv by regression."""
    need_fill_file = "202106_10000_drop.tsv"  # file whose gaps we fill
    need_fill_col = "ITEM_STOCK"  # column to impute
    # check("ITEM_STOCK已补202106_10000_drop.tsv")  # verify the output of a previous run
    # data inspection and preprocessing
    data, label, need_fill_col, f, data_all = check_data_and_process(need_fill_col, need_fill_file)
    # evaluate every candidate model (currently just one random forest)
    for candidate in [RandomForestRegressor()]:
        train_and_chose_model(candidate, data, label)
    # fill the missing values with a freshly constructed model
    fill_data(RandomForestRegressor(), data, label, need_fill_col, f, data_all)


if __name__ == '__main__':
    main()
分类填补缺失值
# coding:utf-8
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Matplotlib global configuration so Chinese text renders in figures.
mpl.rcParams['font.sans-serif'] = ['FangSong'] # pick a Chinese font for labels
mpl.rcParams['axes.unicode_minus'] = False # render '-' as a minus sign, not an empty box
# NOTE(review): plt.rcParams is the very same object as mpl.rcParams, so the
# SimHei setting below overrides the FangSong setting above.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False # show the minus sign normally
def check_data_and_process(need_fill_col, need_fill_file):
    """Load the TSV, report missing-value counts, and build a training set.

    Every column that has missing values, other than ``need_fill_col`` itself,
    is dropped (columns are imputed one at a time), along with the columns
    known to be useless for training.

    Parameters:
        need_fill_col: name of the column whose gaps we want to fill.
        need_fill_file: path of the tab-separated input file.

    Returns:
        (data, label, need_fill_col, f, data_all) where
        data/label are the complete rows' features/target for training,
        f is the full original DataFrame, and data_all is f minus the
        dropped columns (target gaps still present).
    """
    f = pd.read_csv(need_fill_file, sep="\t", encoding="utf-8")
    null_col = []  # columns that contain missing values
    no_need_col = ["DATA_MONTH"]  # columns known to be useless for training
    for c in f.columns:
        # single pass per column instead of building a filtered frame twice
        n_missing = f[c].isnull().sum()
        if n_missing != 0:
            null_col.append(c)
            print("字段:", c, "缺失数:", n_missing)
    # Columns to drop: the known-useless ones plus every OTHER incomplete
    # column — missing values are filled one column at a time.
    drop_col = list(no_need_col)
    drop_col.extend(c for c in null_col if c != need_fill_col)
    data_all = f.drop(columns=drop_col)  # target's missing and non-missing rows together
    data_need_to_train = data_all[~data_all[need_fill_col].isnull()]  # complete rows -> training data
    data = data_need_to_train.drop(columns=[need_fill_col])
    label = data_need_to_train[need_fill_col]
    return data, label, need_fill_col, f, data_all
def train_and_chose_model(model, data, label):
    """Fit a classifier on a min-max-scaled 80/20 split and print accuracies.

    The target is cast to int because the classifier expects discrete class
    labels. Nothing is returned; the model is fitted in place.
    """
    features = np.array(data)
    targets = np.array(label)
    scaler = MinMaxScaler(feature_range=(0, 1))
    features = scaler.fit_transform(features)  # scale features to [0, 1]
    x_tr, x_te, y_tr, y_te = train_test_split(features, targets, test_size=0.2, random_state=10)
    # convert to integer class labels once instead of at every call site
    y_tr = y_tr.astype("int")
    y_te = y_te.astype("int")
    model.fit(x_tr, y_tr)
    train_accuracy = model.score(x_tr, y_tr)
    test_accuracy = model.score(x_te, y_te)
    print(str(model) + "训练集准确率为:" + str(train_accuracy))
    print(str(model) + "测试集准确率为:" + str(test_accuracy))
def fill_data(model, data, label, need_fill_col, f, data_all):
    """Train a classifier on complete rows and impute the missing target values.

    Predicted class labels are written into ``f`` row by row and the patched
    table is saved to ``<need_fill_col>已补202106_10000_drop.tsv``.

    Parameters:
        model: an unfitted sklearn-style classifier.
        data, label: training features/target (complete rows only).
        need_fill_col: name of the column being imputed.
        f: full original DataFrame (modified in place, then saved).
        data_all: ``f`` minus the dropped columns, target gaps included.
    """
    # rows where the target is missing, without the (all-NaN) target column
    data_need_to_pre = data_all[data_all[need_fill_col].isnull()].drop(columns=[need_fill_col])
    data = np.array(data)
    label = np.array(label)
    scaler_1 = MinMaxScaler(feature_range=(0, 1))
    # BUGFIX: fit the feature scaler ONCE on all rows, then only transform the
    # subsets. The original re-ran fit_transform on each subset, so prediction
    # rows were scaled with a different min/max than the training rows.
    scaler_1.fit(data_all.drop(columns=[need_fill_col]))
    data_to_pre = scaler_1.transform(data_need_to_pre)
    data = scaler_1.transform(data)
    data_tr, data_te, labels_tr, labels_te = train_test_split(data, label, test_size=0.2, random_state=10)
    # classifier needs integer class labels; convert once
    labels_tr = labels_tr.astype("int")
    labels_te = labels_te.astype("int")
    model.fit(data_tr, labels_tr)  # train the model
    score_train = model.score(data_tr, labels_tr)
    score_test = model.score(data_te, labels_te)
    print(str(model) + "训练集准确率为:" + str(score_train))
    print(str(model) + "测试集准确率为:" + str(score_test))
    data_pred = model.predict(data_to_pre)  # predicted classes for the missing rows
    # walk the missing rows' indices and write each prediction back in place
    for i, x in zip(f[f[need_fill_col].isnull()][need_fill_col].index, data_pred):
        f.loc[i, need_fill_col] = x
    f.to_csv(need_fill_col + "已补" + "202106_10000_drop.tsv", sep="\t", index=False)
def check(file):
    """Print the missing-value count of every incomplete column in a TSV file."""
    frame = pd.read_csv(file, sep="\t", encoding="utf-8")
    columns_with_gaps = []  # columns that still contain missing values
    for name in frame.columns:
        gap_count = len(frame[frame[name].isnull()][name])
        if gap_count != 0:
            columns_with_gaps.append(name)
            print("字段:", name, "缺失数:", len(frame[frame[name].isnull()][name]))
def main():
    """Impute the BRAND_ID column by classification, starting from the file
    produced by the previous (TOTAL_EVAL_NUM) imputation pass."""
    source_file = "TOTAL_EVAL_NUM已补202106_10000_drop.tsv"  # file whose gaps we fill
    target_column = "BRAND_ID"  # column to impute
    # check("ITEM_STOCK已补202106_10000_drop.tsv")
    # data inspection and preprocessing
    data, label, target_column, frame, full_table = check_data_and_process(target_column, source_file)
    # evaluate the candidate model, then fill with a freshly constructed one
    train_and_chose_model(RandomForestClassifier(), data, label)
    fill_data(RandomForestClassifier(), data, label, target_column, frame, full_table)


if __name__ == '__main__':
    main()