Splitting a dataset into training, validation, and test sets with pyspark (the most complete guide you'll find)

I have to say it: data processing in Spark is genuinely cumbersome — very cumbersome!

Let's go straight to the code — I hope it helps you.

pyspark implementation (you won't find this elsewhere)

insert image description here

The code can be adapted to your needs. Once a test run looks correct, remove the count() calls before the real run — that can save more than half an hour.

'''  Spark DataFrame sampling and train/val/test splitting  '''
def data_sample_split_distribution(df, ss, n=1, rate_val=0.1, rate_test=0.1, rate_test_with=30, ifflag=True):
    '''
    Down-sample negatives to roughly 1:n, then split into train / val / test.

    Train/val use the 1:n down-sampled negatives; the test set instead keeps a
    negative:positive ratio of 1:rate_test_with so it matches the live (online)
    label distribution, and its negatives are drawn from rows NOT used in the
    1:n sample (independence from train/val).

    :param df: spark DataFrame with a binary label column ``y`` (1 = positive, 0 = negative)
    :param ss: SparkSession, used to rebuild a DataFrame from ``take()`` output
    :param n: target negative:positive ratio after down-sampling (1:n)
    :param rate_val: fraction split off from the training data as the validation set
    :param rate_test: fraction of the data split off as the test set
    :param rate_test_with: negative:positive ratio of the test set, e.g. 30 -> 1:30
    :param ifflag: True -> build and return (train, val, test);
        False -> only down-sample and return (sampled_df, None, None)
    :return: df_train, df_val, df_test (or (df_new, None, None) when ifflag is False)
    '''
    df_c = df.count()
    print(' 样本总量:', df_c)
    df_pos = df.filter(df.y == 1)
    df_neg = df.filter(df.y == 0)
    df_pos_c = df_pos.count()
    df_neg_c = df_neg.count()
    # NOTE(review): raises ZeroDivisionError when there are no positive samples.
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    if ifflag:
        # Tag only the negatives with an id column ``tmpid``.
        # NOTE(review): monotonically_increasing_id() is monotonic but NOT
        # consecutive, so tmpid is usable only as a row identity, not a range.
        df_neg = df_neg.withColumn("tmpid", fn.monotonically_increasing_id())

    # 1:n down-sampling of the negatives; the +0.0002 nudges the sampling
    # fraction up slightly so we do not land under the 1:n target.
    rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
    print(' 负样本欠采样概率:', rate)
    df_neg_sample = df_neg.sample(fraction=rate)
    df_neg_sample_c = df_neg_sample.count()
    print(' 1:{} 采样后,样本量:{},正负样本比为:{} : {}'.format(n, df_neg_sample_c + df_pos_c, 1, np.around(df_neg_sample_c / df_pos_c, decimals=4)))

    if ifflag:
        # Collect the tmpids of the sampled negatives so the test-set negatives
        # can be drawn from the *unsampled* remainder, keeping the test set
        # independent of train/val.
        # NOTE(review): collect() pulls all sampled ids to the driver — fine at
        # this data size, worth confirming for larger runs.
        rmlst = [c[0] for c in df_neg_sample.select('tmpid').collect()]
        df_neg_sample = df_neg_sample.drop('tmpid')

        # Split positives and (sampled) negatives into train / test.
        df_pos_train, df_pos_test = df_pos.randomSplit(weights=[1 - rate_test, rate_test], seed=20)
        df_neg_train, _ = df_neg_sample.randomSplit(weights=[1 - rate_test, rate_test], seed=20)

        # Build the test-set negatives so the test set matches the online
        # 1:rate_test_with distribution.
        df_pos_test_c = df_pos_test.count()
        df_neg_test_c = df_pos_test_c * rate_test_with  # keep test set at the live (online) ratio
        df_neg_test = df_neg.filter(~df_neg['tmpid'].isin(rmlst))
        # Drop and re-add the id, then take the first df_neg_test_c (+ slack)
        # rows of the remaining negatives.
        df_neg_test = df_neg_test.drop('tmpid')
        df_neg_test = df_neg_test.withColumn("tmpid", fn.monotonically_increasing_id())
        # df_neg_test = df_neg_test.filter(df_neg_test['tmpid'].between(0, df_neg_test_c + 1000))
        # NOTE(review): take() materializes these rows on the driver, and its
        # argument must be an int (true for the default integer rate_test_with).
        df_neg_test = ss.createDataFrame(df_neg_test.take(df_neg_test_c + 1000))
        df_neg_test = df_neg_test.drop('tmpid')
        dfTest = df_pos_test.unionAll(df_neg_test)
        dfTest.cache()

        # Split the training part further into train / val.
        df_pos_train, df_pos_val = df_pos_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)
        df_neg_train, df_neg_val = df_neg_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)

        dfTrain = df_pos_train.unionAll(df_neg_train)
        dfVal = df_pos_val.unionAll(df_neg_val)
        dfTrain.cache()
        dfVal.cache()

        # Shuffle each split so positives and negatives are interleaved.
        dfTrain = df_shuffle(dfTrain)
        dfVal = df_shuffle(dfVal)
        dfTest = df_shuffle(dfTest)

        sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(dfTrain, ifspark=True)
        sample_cVal, sample_pos_cValn, sample_neg_cVal = pos_neg_rate2(dfVal, ifspark=True)
        sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(dfTest, ifspark=True)

        print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
        print('  划分后,验证集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cVal, sample_pos_cValn, sample_neg_cVal, 1, np.around(sample_neg_cVal / sample_pos_cValn, decimals=4)))
        print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))
        return dfTrain, dfVal, dfTest

    else:
        # No split requested: just return the down-sampled data.
        df_new = df_neg_sample.unionAll(df_pos)
        df_new.cache()

        return df_new, None, None
def pos_neg_rate2(df, ifspark=True):
    """Return (total, positive, negative) sample counts for a labeled DataFrame.

    :param df: spark DataFrame (ifspark=True) or pandas DataFrame (ifspark=False),
        with a binary label column ``y``
    :param ifspark: selects the spark or pandas counting code path
    :return: tuple (sample_count, positive_count, negative_count)
    """
    if not ifspark:
        # pandas path: boolean-mask selection + len()
        return len(df), len(df[df['y'] == 1]), len(df[df['y'] == 0])
    # spark path: filter + count actions
    total = df.count()
    positives = df.filter(df.y == 1).count()
    negatives = df.filter(df.y == 0).count()
    return total, positives, negatives

def flat(l):
    """Yield the leaf elements of an arbitrarily nested list/tuple, depth-first, in order."""
    for item in l:
        if isinstance(item, (list, tuple)):
            # recurse into nested containers
            yield from flat(item)
        else:
            yield item

# Add a contiguous auto-increment id column to a DataFrame, for joining.
def mkdf_tojoin(df, ss):
    """Append a contiguous LongType id column ``tmpid`` (via zipWithIndex) for joins.

    :param df: spark DataFrame
    :param ss: SparkSession used to rebuild the DataFrame with the widened schema
    :return: DataFrame with an extra ``tmpid`` column of consecutive row indices
    """
    widened_schema = df.schema.add(StructField("tmpid", LongType()))
    # zipWithIndex yields (Row, index) pairs; flatten each pair into one flat row.
    indexed_rows = df.rdd.zipWithIndex().map(lambda pair: list(flat(pair)))
    return ss.createDataFrame(indexed_rows, widened_schema)

def df_shuffle(df):
    """Shuffle a spark DataFrame by ordering on a temporary random column."""
    return (
        df.withColumn('rand', fn.rand() * 10000)
          .orderBy('rand')
          .drop('rand')
    )

example:

 样本总量: 10577260
 正样本数:300081,负样本数:10277179,正负样本比为:1 : 34.248016368913724
 负样本欠采样概率: 0.0294
 1:1 采样后,样本量:600662,正负样本比为:1 : 1.0017
  划分后,训练集样本量:485414,正样本数:242513,负样本数:242901,正负样本比:1 : 1.0016
  划分后,验证集样本量:54584,正样本数:27272,负样本数:27312,正负样本比:1 : 1.0015
  划分后,测试集样本量:940176,正样本数:30296,负样本数:909880,正负样本比:1 : 30.033

sklearn implementation 1: divide training set and test set

'''   pandas DataFrame sampling (for direct-customer data), train/test split   '''
def df_sample_split(df, n=1, rate_test_with=15):
    '''
    Down-sample a pandas DataFrame's negatives to 1:n and split into train / test.

    :param df: pandas DataFrame with a binary label column ``y`` (1 = positive, 0 = negative)
    :param n: target negative:positive ratio of the training set (1:n)
    :param rate_test_with: negative:positive ratio of the test set, matching the
        live (online) distribution, e.g. 15 -> 1:15
    :return: (df_train, df_test)

    NOTE(review): all positive samples appear in BOTH df_train and df_test, so
    the test set is not independent of training — confirm this is intentional.
    NOTE(review): ``skl_shuffle`` is defined elsewhere; presumably
    sklearn.utils.shuffle — verify against the rest of the file.
    '''
    print('样本量:', len(df))
    df_pos = skl_shuffle(df[df['y'] == 1])
    df_neg = skl_shuffle(df[df['y'] == 0])
    df_pos_c = len(df_pos)
    df_neg_c = len(df_neg)
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    # Training negatives: the first df_pos_c*n (+1 "epsilon" slack) shuffled rows.
    df_neg_train = df_neg.iloc[:df_pos_c * n + 1, :]  # epsilon
    # Test negatives: the next rows, sized so the test set matches the online ratio.
    # NOTE(review): starting at +2 skips row index df_pos_c*n + 1 entirely —
    # looks like an off-by-one; confirm the "epsilon" gap is intended.
    df_neg_test = df_neg.iloc[df_pos_c * n + 2: df_pos_c * rate_test_with + 1, :]  # epsilon

    df_train = skl_shuffle(pd.concat([df_pos, df_neg_train], axis=0, ignore_index=True))
    df_test = skl_shuffle(pd.concat([df_pos, df_neg_test], axis=0, ignore_index=True))

    sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(df_train, ifspark=False)
    sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(df_test, ifspark=False)

    print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
    print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))

    return df_train, df_test

sklearn implementation 2: stratified train_test_split, which splits the dataset while keeping the positive/negative class ratio consistent

https://blog.csdn.net/qq_42363032/article/details/122231322

You may also like

Origin blog.csdn.net/qq_42363032/article/details/123324804