pyspark divide el conjunto de entrenamiento, el conjunto de validación y el conjunto de prueba (el mejor de toda la red)

No puedo evitar decir que Spark es realmente problemático para el procesamiento de datos, ¡muy problemático! ! !

Directamente en el código, espero que te pueda ayudar

Implementación de pyspark (única en toda la red)

inserte la descripción de la imagen aquí

El código se puede modificar de acuerdo a sus necesidades, si no hay ningún problema con la prueba, elimine el conteo y ejecútelo, lo que puede ser más rápido que media hora.

'''  spark df 采样、分割train、val、test   '''
def data_sample_split_distribution(df, ss, n=1, rate_val=0.1, rate_test=0.1, rate_test_with=30, ifflag=True):
    '''
     首先 1:n 采样,再划分train、val、test
    :param df: dataframe
    :param ss: sparksession 用于添加自增id
    :param n: 1:n采样
    :param rate_val: 验证集划分比例
    :param rate_test: 测试集划分比例
    :param rate_test_with: 测试集分布和实际线上持平,例如这里是1:30
    :return: df_train、df_val、df_test
    '''
    df_c = df.count()
    print(' 样本总量:', df_c)
    df_pos = df.filter(df.y == 1)
    df_neg = df.filter(df.y == 0)
    df_pos_c = df_pos.count()
    df_neg_c = df_neg.count()
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    if ifflag:
        # 只对负样本添加自增id  tmpid
        df_neg = df_neg.withColumn("tmpid", fn.monotonically_increasing_id())

    # 1:n 采样
    rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
    print(' 负样本欠采样概率:', rate)
    df_neg_sample = df_neg.sample(fraction=rate)
    df_neg_sample_c = df_neg_sample.count()
    print(' 1:{} 采样后,样本量:{},正负样本比为:{} : {}'.format(n, df_neg_sample_c + df_pos_c, 1, np.around(df_neg_sample_c / df_pos_c, decimals=4)))

    if ifflag:
        # 将采样后的负样本tmpid取出,保证测试集负样本和训练、验证独立
        rmlst = [c[0] for c in df_neg_sample.select('tmpid').collect()]
        df_neg_sample = df_neg_sample.drop('tmpid')

        # 划分train、test
        df_pos_train, df_pos_test = df_pos.randomSplit(weights=[1 - rate_test, rate_test], seed=20)
        df_neg_train, _ = df_neg_sample.randomSplit(weights=[1 - rate_test, rate_test], seed=20)

        # 划分test负样本
        df_pos_test_c = df_pos_test.count()
        df_neg_test_c = df_pos_test_c * rate_test_with  # 测试集和实际线上分布持平
        df_neg_test = df_neg.filter(~df_neg['tmpid'].isin(rmlst))
        # 将自增id删除后再添加,目的是取df的 df_neg_test_c 行
        df_neg_test = df_neg_test.drop('tmpid')
        df_neg_test = df_neg_test.withColumn("tmpid", fn.monotonically_increasing_id())
        # df_neg_test = df_neg_test.filter(df_neg_test['tmpid'].between(0, df_neg_test_c + 1000))
        df_neg_test = ss.createDataFrame(df_neg_test.take(df_neg_test_c + 1000))
        df_neg_test = df_neg_test.drop('tmpid')
        dfTest = df_pos_test.unionAll(df_neg_test)
        dfTest.cache()

        # 划分train、val
        df_pos_train, df_pos_val = df_pos_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)
        df_neg_train, df_neg_val = df_neg_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)

        dfTrain = df_pos_train.unionAll(df_neg_train)
        dfVal = df_pos_val.unionAll(df_neg_val)
        dfTrain.cache()
        dfVal.cache()

        # 将采样后的样本洗牌
        dfTrain = df_shuffle(dfTrain)
        dfVal = df_shuffle(dfVal)
        dfTest = df_shuffle(dfTest)

        sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(dfTrain, ifspark=True)
        sample_cVal, sample_pos_cValn, sample_neg_cVal = pos_neg_rate2(dfVal, ifspark=True)
        sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(dfTest, ifspark=True)

        print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
        print('  划分后,验证集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cVal, sample_pos_cValn, sample_neg_cVal, 1, np.around(sample_neg_cVal / sample_pos_cValn, decimals=4)))
        print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))
        return dfTrain, dfVal, dfTest

    else:
        df_new = df_neg_sample.unionAll(df_pos)
        df_new.cache()

        return df_new, None, None
def pos_neg_rate2(df, ifspark=True):
    if ifspark:
        sample_c = df.count()
        sample_pos_c = df.filter(df.y == 1).count()
        sample_neg_c = df.filter(df.y == 0).count()
    else:
        sample_c = len(df)
        sample_pos_c = len(df[df['y'] == 1])
        sample_neg_c = len(df[df['y'] == 0])
    return sample_c, sample_pos_c, sample_neg_c

def flat(l):
    for k in l:
        if not isinstance(k, (list, tuple)):
            yield k
        else:
            yield from flat(k)

# 给df增加一列连续自增id,用于拼接
def mkdf_tojoin(df, ss):
    schema = df.schema.add(StructField("tmpid", LongType()))
    rdd = df.rdd.zipWithIndex()
    rdd = rdd.map(lambda x: list(flat(x)))
    df = ss.createDataFrame(rdd, schema)
    return df

def df_shuffle(df):
    df = df.withColumn('rand', fn.rand() * 10000)
    df = df.orderBy(df['rand'])
    df = df.drop('rand')
    return df

ejemplo:

 样本总量: 10577260
 正样本数:300081,负样本数:10277179,正负样本比为:1 : 34.248016368913724
 负样本欠采样概率: 0.0294
 1:1 采样后,样本量:600662,正负样本比为:1 : 1.0017
  划分后,训练集样本量:485414,正样本数:242513,负样本数:242901,正负样本比:1 : 1.0016
  划分后,验证集样本量:54584,正样本数:27272,负样本数:27312,正负样本比:1 : 1.0015
  划分后,测试集样本量:940176,正样本数:30296,负样本数:909880,正负样本比:1 : 30.033

Implementación sklearn 1: dividir el conjunto de entrenamiento y el conjunto de prueba

'''   pandas df 采样,针对直客数据,分割train、test   '''
def df_sample_split(df, n=1, rate_test_with=15):
    print('样本量:', len(df))
    df_pos = skl_shuffle(df[df['y'] == 1])
    df_neg = skl_shuffle(df[df['y'] == 0])
    df_pos_c = len(df_pos)
    df_neg_c = len(df_neg)
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    df_neg_train = df_neg.iloc[:df_pos_c * n + 1, :]  # epsilon

    # 测试集和实际线上分布持平
    df_neg_test = df_neg.iloc[df_pos_c * n + 2: df_pos_c * rate_test_with + 1, :]  # epsilon

    df_train = skl_shuffle(pd.concat([df_pos, df_neg_train], axis=0, ignore_index=True))
    df_test = skl_shuffle(pd.concat([df_pos, df_neg_test], axis=0, ignore_index=True))

    sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(df_train, ifspark=False)
    sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(df_test, ifspark=False)

    print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
    print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))

    return df_train, df_test

Implementación sklearn 2: train_test_split divide el conjunto de datos y asegura que las proporciones de muestras positivas y negativas sean consistentes

https://blog.csdn.net/qq_42363032/article/details/122231322

Supongo que te gusta

Origin blog.csdn.net/qq_42363032/article/details/123324804
Recomendado
Clasificación