# PySpark version: down-sample negatives to a 1:n positive/negative ratio
def data_sample_(df, n):
    """Down-sample the negative class of a Spark DataFrame to roughly 1:n.

    Rows with ``y == 1`` are treated as positives and rows with ``y == 0`` as
    negatives. Every positive row is kept; negatives are sampled with
    probability ``positives * n / negatives`` (rounded to four decimals, plus
    a 0.0002 margin, capped at 1.0).

    Args:
        df: DataFrame with a label column named ``y``.
        n: Desired number of negative rows per positive row.

    Returns:
        A tuple ``(df_new, rate, pos_rate, neg_rate, pos_rate_sample,
        neg_rate_sample)``:

        * ``df_new`` -- sampled negatives unioned with all positives (cached).
        * ``rate`` -- sampling fraction applied to the negatives.
        * ``pos_rate`` / ``neg_rate`` -- class shares of ``df`` before sampling.
        * ``pos_rate_sample`` / ``neg_rate_sample`` -- class shares of ``df_new``.

        When sampling is not possible (one class is empty, or Spark raises
        while sampling), the input ``df`` is returned unchanged together with
        five ``0.0`` values.
    """
    fallback = (df, 0.0, 0.0, 0.0, 0.0, 0.0)

    # Class counts before sampling.
    df_c = df.count()
    print(' 样本量:', df_c)
    df_pos = df.filter(df.y == 1)
    df_neg = df.filter(df.y == 0)
    df_pos_c = df_pos.select('y').count()
    df_neg_c = df_neg.select('y').count()

    # Every ratio below divides by one of these counts; with an empty class
    # there is nothing to balance, so take the fallback instead of raising
    # ZeroDivisionError.
    if df_pos_c == 0 or df_neg_c == 0:
        print(' 正样本数:{},负样本数:{},某一类样本为空,跳过采样'.format(df_pos_c, df_neg_c))
        return fallback

    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))
    pos_rate = df_pos_c / df_c
    neg_rate = df_neg_c / df_c
    print(' 正样本占比:{},负样本占比:{}'.format(pos_rate, neg_rate))

    try:
        rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
        # A fraction above 1.0 is rejected by Spark when sampling without
        # replacement; if the data already has at most n negatives per
        # positive, keep all of them.
        rate = min(float(rate), 1.0)
        print(' 负样本采样概率:', rate)
        df_neg = df_neg.sample(fraction=rate)
        print(' 采样后正负样本比为:{} : {}'.format(1, np.around(df_neg.count() / df_pos_c, decimals=4)))

        df_new = df_neg.unionAll(df_pos)
        # Cache before the repeated counts below; the first count materializes it.
        df_new.cache()
        sample_c = df_new.count()
        sample_pos_c = df_new.filter(df_new.y == 1).count()
        sample_neg_c = df_new.filter(df_new.y == 0).count()
        pos_rate_sample = sample_pos_c / sample_c
        neg_rate_sample = sample_neg_c / sample_c
        print(' 采样后正样本占比:{},负样本占比:{}'.format(pos_rate_sample, neg_rate_sample))
        return df_new, rate, pos_rate, neg_rate, pos_rate_sample, neg_rate_sample
    except Exception as e:
        # Best-effort by design: report the failure and hand back the
        # original data so the caller can continue.
        print(e)
        return fallback
# pandas version: down-sample negatives to a 1:n positive/negative ratio
def data_sample_pan(data, n):
    """Down-sample the negative class of a pandas DataFrame to roughly 1:n.

    Rows with ``y == 1`` are treated as positives and rows with ``y == 0`` as
    negatives. Every positive row is kept; negatives are sampled without
    replacement using the fraction ``positives * n / negatives`` (rounded to
    four decimals, plus a 0.0002 margin, capped at 1.0).

    Args:
        data: DataFrame with a label column named ``y``.
        n: Desired number of negative rows per positive row.

    Returns:
        The positives concatenated with the sampled negatives, re-indexed
        from 0 and then passed through ``skl_shuffle``. If either class is
        empty, no sampling is applied and all rows with ``y`` in {0, 1} are
        returned in that same form.
    """
    df_pos = data[data['y'] == 1]
    df_neg = data[data['y'] == 0]
    df_pos_c = len(df_pos)
    df_neg_c = len(df_neg)
    df_c = df_pos_c + df_neg_c
    print(' 样本总数:', df_c)

    if df_pos_c == 0 or df_neg_c == 0:
        # Every ratio below divides by one of these counts; with an empty
        # class there is nothing to balance, so skip sampling instead of
        # raising ZeroDivisionError.
        print(' 正样本数:{},负样本数:{},某一类样本为空,跳过采样'.format(df_pos_c, df_neg_c))
    else:
        print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))
        print(' 正样本占比:{},负样本占比:{}'.format(df_pos_c / df_c, df_neg_c / df_c))
        rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
        # DataFrame.sample refuses frac > 1 when replace=False; if the data
        # already has at most n negatives per positive, keep all of them.
        rate = min(float(rate), 1.0)
        print(' 负样本采样概率:', rate)
        df_neg = df_neg.sample(frac=rate, replace=False)
        print(' 采样后正负样本比为:{} : {}'.format(1, np.around(len(df_neg) / df_pos_c, decimals=4)))

    df_new = pd.concat([df_pos, df_neg], axis=0, ignore_index=True)
    # skl_shuffle is defined outside this block (presumably
    # sklearn.utils.shuffle -- confirm against the file's imports).
    df_new = skl_shuffle(df_new)
    return df_new
# See also: splitting a dataset with train_test_split while keeping the
# positive/negative sample ratio consistent:
# https://blog.csdn.net/qq_42363032/article/details/122231322?spm=1001.2014.3001.5501