Article directory
I can't help but say, spark is really troublesome for data processing, very troublesome! ! !
Directly on the code, I hope it can help you
pyspark implementation (unique in the whole network)
The code can be modified according to your needs. If there is no problem with the test, remove the count and run it, which can be faster than half an hour.
''' spark df 采样、分割train、val、test '''
def data_sample_split_distribution(df, ss, n=1, rate_val=0.1, rate_test=0.1, rate_test_with=30, ifflag=True):
'''
首先 1:n 采样,再划分train、val、test
:param df: dataframe
:param ss: sparksession 用于添加自增id
:param n: 1:n采样
:param rate_val: 验证集划分比例
:param rate_test: 测试集划分比例
:param rate_test_with: 测试集分布和实际线上持平,例如这里是1:30
:return: df_train、df_val、df_test
'''
df_c = df.count()
print(' 样本总量:', df_c)
df_pos = df.filter(df.y == 1)
df_neg = df.filter(df.y == 0)
df_pos_c = df_pos.count()
df_neg_c = df_neg.count()
print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))
if ifflag:
# 只对负样本添加自增id tmpid
df_neg = df_neg.withColumn("tmpid", fn.monotonically_increasing_id())
# 1:n 采样
rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
print(' 负样本欠采样概率:', rate)
df_neg_sample = df_neg.sample(fraction=rate)
df_neg_sample_c = df_neg_sample.count()
print(' 1:{} 采样后,样本量:{},正负样本比为:{} : {}'.format(n, df_neg_sample_c + df_pos_c, 1, np.around(df_neg_sample_c / df_pos_c, decimals=4)))
if ifflag:
# 将采样后的负样本tmpid取出,保证测试集负样本和训练、验证独立
rmlst = [c[0] for c in df_neg_sample.select('tmpid').collect()]
df_neg_sample = df_neg_sample.drop('tmpid')
# 划分train、test
df_pos_train, df_pos_test = df_pos.randomSplit(weights=[1 - rate_test, rate_test], seed=20)
df_neg_train, _ = df_neg_sample.randomSplit(weights=[1 - rate_test, rate_test], seed=20)
# 划分test负样本
df_pos_test_c = df_pos_test.count()
df_neg_test_c = df_pos_test_c * rate_test_with # 测试集和实际线上分布持平
df_neg_test = df_neg.filter(~df_neg['tmpid'].isin(rmlst))
# 将自增id删除后再添加,目的是取df的 df_neg_test_c 行
df_neg_test = df_neg_test.drop('tmpid')
df_neg_test = df_neg_test.withColumn("tmpid", fn.monotonically_increasing_id())
# df_neg_test = df_neg_test.filter(df_neg_test['tmpid'].between(0, df_neg_test_c + 1000))
df_neg_test = ss.createDataFrame(df_neg_test.take(df_neg_test_c + 1000))
df_neg_test = df_neg_test.drop('tmpid')
dfTest = df_pos_test.unionAll(df_neg_test)
dfTest.cache()
# 划分train、val
df_pos_train, df_pos_val = df_pos_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)
df_neg_train, df_neg_val = df_neg_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)
dfTrain = df_pos_train.unionAll(df_neg_train)
dfVal = df_pos_val.unionAll(df_neg_val)
dfTrain.cache()
dfVal.cache()
# 将采样后的样本洗牌
dfTrain = df_shuffle(dfTrain)
dfVal = df_shuffle(dfVal)
dfTest = df_shuffle(dfTest)
sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(dfTrain, ifspark=True)
sample_cVal, sample_pos_cValn, sample_neg_cVal = pos_neg_rate2(dfVal, ifspark=True)
sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(dfTest, ifspark=True)
print(' 划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
print(' 划分后,验证集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cVal, sample_pos_cValn, sample_neg_cVal, 1, np.around(sample_neg_cVal / sample_pos_cValn, decimals=4)))
print(' 划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))
return dfTrain, dfVal, dfTest
else:
df_new = df_neg_sample.unionAll(df_pos)
df_new.cache()
return df_new, None, None
def pos_neg_rate2(df, ifspark=True):
if ifspark:
sample_c = df.count()
sample_pos_c = df.filter(df.y == 1).count()
sample_neg_c = df.filter(df.y == 0).count()
else:
sample_c = len(df)
sample_pos_c = len(df[df['y'] == 1])
sample_neg_c = len(df[df['y'] == 0])
return sample_c, sample_pos_c, sample_neg_c
def flat(l):
for k in l:
if not isinstance(k, (list, tuple)):
yield k
else:
yield from flat(k)
# 给df增加一列连续自增id,用于拼接
def mkdf_tojoin(df, ss):
schema = df.schema.add(StructField("tmpid", LongType()))
rdd = df.rdd.zipWithIndex()
rdd = rdd.map(lambda x: list(flat(x)))
df = ss.createDataFrame(rdd, schema)
return df
def df_shuffle(df):
df = df.withColumn('rand', fn.rand() * 10000)
df = df.orderBy(df['rand'])
df = df.drop('rand')
return df
example:
样本总量: 10577260
正样本数:300081,负样本数:10277179,正负样本比为:1 : 34.248016368913724
负样本欠采样概率: 0.0294
1:1 采样后,样本量:600662,正负样本比为:1 : 1.0017
划分后,训练集样本量:485414,正样本数:242513,负样本数:242901,正负样本比:1 : 1.0016
划分后,验证集样本量:54584,正样本数:27272,负样本数:27312,正负样本比:1 : 1.0015
划分后,测试集样本量:940176,正样本数:30296,负样本数:909880,正负样本比:1 : 30.033
sklearn implementation 1: divide training set and test set
''' pandas df 采样,针对直客数据,分割train、test '''
def df_sample_split(df, n=1, rate_test_with=15):
print('样本量:', len(df))
df_pos = skl_shuffle(df[df['y'] == 1])
df_neg = skl_shuffle(df[df['y'] == 0])
df_pos_c = len(df_pos)
df_neg_c = len(df_neg)
print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))
df_neg_train = df_neg.iloc[:df_pos_c * n + 1, :] # epsilon
# 测试集和实际线上分布持平
df_neg_test = df_neg.iloc[df_pos_c * n + 2: df_pos_c * rate_test_with + 1, :] # epsilon
df_train = skl_shuffle(pd.concat([df_pos, df_neg_train], axis=0, ignore_index=True))
df_test = skl_shuffle(pd.concat([df_pos, df_neg_test], axis=0, ignore_index=True))
sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(df_train, ifspark=False)
sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(df_test, ifspark=False)
print(' 划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
print(' 划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))
return df_train, df_test