Training Word2Vec on user IDs with PySpark to generate ID embeddings

Requirements

  • Turn user IDs and ad IDs into embeddings

Approach:

  • User ID embedding
    • Group by ad ID
    • For each ad ID, collect the user IDs that clicked it and return them sorted in descending order of click count, forming a sequence
    • Train Word2Vec on these user ID sequences to obtain a word vector for each user ID
    • User ID embedding done

The ad ID embedding follows the same approach; a toy sketch of what one training sequence looks like is shown below.
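As a toy illustration (hypothetical click counts), one Word2Vec "sentence" for the ad ID embedding, i.e. one user's clicked ad IDs sorted by click count, can be built like this:

# clicks observed for one hypothetical user: adid -> click count
clicks = {'p89343': 3, '11159': 2, 'p89544': 1}
# descending click count gives this user's ad ID sequence, i.e. one Word2Vec sentence
sentence = [adid for adid, _ in sorted(clicks.items(), key=lambda kv: kv[1], reverse=True)]
print(sentence)   # ['p89343', '11159', 'p89544']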

Word2vec

For the principles behind word2vec, see this article; they are not repeated here:

https://blog.csdn.net/qq_42363032/article/details/113697460

Word2Vec PySpark API:

http://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.Word2Vec.html#pyspark.ml.feature.Word2Vec
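A minimal, self-contained sketch of this API on toy data (the parameter values here are illustrative, not the ones used later):

from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec

spark = SparkSession.builder.appName('w2v-demo').getOrCreate()

# each row is one "sentence": a sequence of ad IDs
doc = spark.createDataFrame([(['p89544', 'p89343'],), (['11159', 'p89343'],)], ['adDense'])

w2v = Word2Vec(vectorSize=3, minCount=0, seed=42, inputCol='adDense', outputCol='embedding')
model = w2v.fit(doc)

model.getVectors().show(truncate=False)    # one row per ID: columns `word` and `vector`
model.transform(doc).show(truncate=False)  # per-row averaged sentence vector in `embedding`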

Key code

Group by suuid and compute each user's sequence of clicked ad IDs

def row_suuidDense(row):
    # for each user, build the list of clicked ad IDs, sorted by click count in descending order
    suuid, advertisement, y = row[0], row[1], row[2]

    # key: adid, value: click count
    denseDict = {}
    # only data with clicks is embedded
    for i in range(len(y)):
        if y[i] != 0:
            denseDict[advertisement[i]] = denseDict.get(advertisement[i], 0) + 1

    # sort by click count, descending
    denseDict = dict(sorted(denseDict.items(), key=lambda x: x[1], reverse=True))
    adDense = list(denseDict.keys())

    return suuid, adDense

# group by suuid and compute each user's clicked-ad-ID sequence
df_suuid = dftrte.groupBy('suuid').agg(
    fn.collect_list('advertisement').alias('advertisement'),
    fn.collect_list('y').alias('y')
).rdd.map(row_suuidDense).toDF(schema=['suuid', 'adDense'])
df_suuid.printSchema()
df_suuid.show(10, truncate=False)
'''
root
 |-- suuid: string (nullable = true)
 |-- adDense: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------------------------------------+----------------+
|suuid                               |adDense         |
+------------------------------------+----------------+
|00000000-0194-f184-785c-7ca27fa7413d|[p89544, p89343]|
|00000000-1bf0-a0dc-8801-73136c45699f|[p89509, p89343, ...]|
|00000000-217c-8379-032b-f26c741ca303|[11159]         |
|00000000-29d0-0c59-dcfb-57765fd9ab17|[p89509, w09189, ...]|
|00000000-39a5-7555-2642-791e11b7e51a|[]              |
|00000000-4844-dc1e-341a-2b5339dcc7ec|[11295]         |
|00000000-57f6-a5fa-ffff-ffffeccc5621|[]              |
|00000000-5958-4a79-ffff-ffffe94b9817|[11295]         |
|00000000-73b1-f742-4adf-443600bb7359|[]              |
|03E82FE6-AE75-4893-A72C-F8DF88E414FF|[]              |
+------------------------------------+----------------+
only showing top 10 rows
'''

For rows with no clicks, you can either drop them or fill them; a short sketch of both options follows.
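The sketch below mirrors what the full code further down does (fn is pyspark.sql.functions):

# option 1: drop users whose clicked-ad sequence is empty (this is what the full code does)
df_suuid_ = df_suuid.filter(fn.size(fn.col('adDense')) != 0)

# option 2: keep them, and after joining the embeddings back onto the training data,
# fill the missing values instead, e.g. df_train_vector.fillna(0.0)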

Train Word2Vec to get the ad ID embeddings

# train Word2Vec
# ad IDs with no clicks have simply been dropped
word2Vec = Word2Vec(vectorSize=1, minCount=0, seed=42, inputCol="adDense", outputCol="adDense_embedding",
                    stepSize=0.025, windowSize=5, maxSentenceLength=100)
model = word2Vec.fit(df_suuid_)

# df_suuid_ = model.transform(df_suuid_)	# sentence (averaged) vectors
# df_suuid_.select(['suuid', 'adDense', 'adDense_embedding']).show(10)
adembedding = model.getVectors()	# word vectors: the ad ID embedding is done
adembedding = adembedding.withColumnRenamed('vector', 'advector').withColumnRenamed('word', 'advertisement')

adembedding.show(truncate=False)
'''
+-------------+----------------------+
|advertisement|advector              |
+-------------+----------------------+
|p39412       |[0.16868238151073456] |
|11153        |[0.1493111401796341]  |
|11303        |[-0.11455214768648148]|
|11279        |[0.42697083950042725] |
|p41210       |[0.41842252016067505] |
|11309        |[0.4229533076286316]  |
|11341        |[3.1918866634368896]  |
|11149        |[3.9014792442321777]  |
|p91233       |[-0.06415798515081406]|
|11325        |[-0.6188488602638245] |
|11297        |[0.7119559645652771]  |
|11369        |[3.152208089828491]   |
|p41113       |[-0.3910478353500366] |
|11275        |[3.341848850250244]   |
|p40723       |[0.14801627397537231] |
|p41009       |[-0.13066606223583221]|
|p41248       |[-0.3884568512439728] |
|8179         |[0.20165494084358215] |
|p39398       |[-0.07097986340522766]|
|11271        |[-0.45492124557495117]|
+-------------+----------------------+
only showing top 20 rows
'''
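Since vectorSize=1, each embedding is a length-1 ML vector. If a plain float column is more convenient downstream, one option (Spark 3.0+; not part of the original code) is pyspark.ml.functions.vector_to_array:

from pyspark.ml.functions import vector_to_array

# turn the length-1 ML vector into a plain double column (assumes Spark >= 3.0)
adembedding_flat = adembedding.withColumn('advector_f', vector_to_array('advector')[0])
adembedding_flat.select('advertisement', 'advector_f').show(5, truncate=False)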

Full code

import warnings
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
import pyspark.sql.types as T
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.linalg import Vectors
import numpy as np
import time

sc = SparkContext(appName="idEmbedding")
sqlContext = SQLContext(sc)
ss = SparkSession(sc).builder.getOrCreate()

warnings.filterwarnings("ignore", category=DeprecationWarning)
sc.setLogLevel('ERROR')

# train Word2Vec
def word2vecEmbedding(df, inputCol, outputCol, vectorName, wordName, flag=True):
    '''
    Train Word2Vec and return the word vectors
    :param df: dataframe
    :param inputCol: dense ID sequence column
    :param outputCol: sentence vector column
    :param vectorName: name for the word vector column
    :param wordName: name for the dense ID (word) column
    :param flag: whether to show the resulting dataframe
    :return:
    '''
    # IDs with no clicks have already been dropped
    word2Vec = Word2Vec(vectorSize=1, minCount=0, seed=42, inputCol=inputCol, outputCol=outputCol,
                        stepSize=0.025, windowSize=5, maxSentenceLength=100)
    model = word2Vec.fit(df)

    # df = model.transform(df)      # sentence (averaged) vectors in outputCol
    # df.select([inputCol, outputCol]).show(10)
    idembedding = model.getVectors()    # word vectors
    idembedding = idembedding.withColumnRenamed('vector', vectorName).withColumnRenamed('word', wordName)
    print(vectorName + ' count: ', idembedding.count())

    if flag:
        idembedding.show(10, truncate=False)
    return idembedding

# dense ID sequence generation
def row_Dense(row):
    '''
    For the grouping ID, build the sequence of clicked IDs sorted by click count in descending order
    ex: DOMS-000 (suuid): [100, P0291, 2889] (adid)
    :param row: a row produced by grouping on the ID
    :return: the ID and its dense ID sequence
    '''
    id, denseId, y = row[0], row[1], row[2]

    # key: ID, value: click count
    denseDict = {}
    # only data with clicks is embedded
    for i in range(len(y)):
        if y[i] != 0:
            denseDict[denseId[i]] = denseDict.get(denseId[i], 0) + 1

    # sort by click count, descending
    denseDict = dict(sorted(denseDict.items(), key=lambda x: x[1], reverse=True))
    dense = list(denseDict.keys())

    return id, dense

def embedding(df_train, df_test):
    '''
    Embed the user IDs and the ad IDs
    Ad ID embedding: group by user ID, build each user's dense ad ID sequence (ad IDs in descending order of click count),
                     train Word2Vec, and take each ad ID's word vector
    User ID embedding: group by ad ID, build each ad's dense user ID sequence (user IDs in descending order of click count),
                       train Word2Vec, and take each user ID's word vector
    :return: user ID embedding, ad ID embedding
    '''
    # merge the training and test sets before embedding
    dftrte = df_train.unionAll(df_test)

    # # # group by suuid to compute the ad ID embeddings
    # first build, for each suuid, the ad IDs sorted by click count in descending order
    # then train Word2Vec to get the ad ID embeddings
    df_suuid = dftrte.groupBy('suuid').agg(
        fn.collect_list('advertisement').alias('advertisement'),
        fn.collect_list('y').alias('y')
    ).rdd.map(row_Dense).toDF(schema=['suuid', 'adDense'])
    # df_suuid.printSchema()
    # df_suuid.show(1, truncate=False)

    df_suuid_ = df_suuid.filter(fn.size(df_suuid['adDense']) != 0)
    df_suuid_0 = df_suuid.filter(fn.size(df_suuid['adDense']) == 0)
    print('suuids with a non-empty clicked-ad sequence:', df_suuid_.count())
    print('suuids with an empty clicked-ad sequence:', df_suuid_0.count())

    # embed only the sequences with clicks, yielding the ad ID embeddings
    adembedding = word2vecEmbedding(df_suuid_, 'adDense', 'adDense_embedding', 'adVector', 'advertisement', True)

    # # # group by ad ID to compute the user ID embeddings
    # first build, for each ad ID, the user IDs sorted by click count in descending order
    # then train Word2Vec to get the user ID embeddings
    df_adid = dftrte.groupBy('advertisement').agg(
        fn.collect_list('suuid').alias('suuid'),
        fn.collect_list('y').alias('y')
    ).rdd.map(row_Dense).toDF(schema=['advertisement', 'userDense'])
    # df_adid.printSchema()
    # df_adid.show(1, truncate=False)

    df_adid_ = df_adid.filter(fn.size(df_adid['userDense']) != 0)
    df_adid_0 = df_adid.filter(fn.size(df_adid['userDense']) == 0)
    print('ad IDs with a non-empty clicking-user sequence:', df_adid_.count())
    print('ad IDs with an empty clicking-user sequence:', df_adid_0.count())

    # embed only the sequences with clicks, yielding the user ID embeddings
    userembedding = word2vecEmbedding(df_adid_, 'userDense', 'userDense_embedding', 'userVector', 'suuid', True)

    print('embedding done')
    return userembedding, adembedding

def idEmbeddingMain():
    train_path = '/user/renwanxin/ad_ctr_test/tmp/2021-10-23_train.csv'
    test_path = '/user/renwanxin/ad_ctr_test/tmp/2021-10-23_test.csv'
    eval_path = '/user/renwanxin/ad_ctr_test/tmp/2021-10-24_eval.csv'

    df_train = ss.read.options(header='True', inferSchema='True', delimiter=',').csv(train_path)
    df_test = ss.read.options(header='True', inferSchema='True', delimiter=',').csv(test_path)
    print('raw training set size: {}'.format(df_train.count()))
    print('raw test set size: {}'.format(df_test.count()))
    df_train = df_train.drop_duplicates()
    df_test = df_test.drop_duplicates()
    print('deduplicated training set size: {}'.format(df_train.count()))
    print('deduplicated test set size: {}'.format(df_test.count()))

    # embedding
    userembedding, adembedding = embedding(df_train, df_test)
    userembedding.cache()
    adembedding.cache()

    # join the ad ID embeddings onto the training set
    df_train_advector = df_train.join(adembedding, how='left', on='advertisement')
    df_train_advector.select(['suuid', 'advertisement', 'adVector', 'y']).show(10, truncate=False)
    print('training set joined with ad ID vectors, df_train_advector:', df_train_advector.count())
    # drop rows that have no ad ID vector (no clicks)
    df_train_advector = df_train_advector.dropna()
    # # alternatively, fill rows with no clicks with 0.0
    # df_train_advector = df_train_advector.fillna(0.0)
    print('after dropping rows without an ad ID vector ---', df_train_advector.count())

    # join the user ID embeddings onto the training set
    df_train_vector = df_train_advector.join(userembedding, how='left', on='suuid')
    print('training set joined with user ID vectors, df_train_vector:', df_train_vector.count())
    df_train_vector = df_train_vector.dropna()
    # df_train_vector = df_train_vector.fillna(0.0)
    print('after dropping rows without a user ID vector ---', df_train_vector.count())
    df_train_vector.select(['suuid', 'advertisement', 'userVector', 'adVector', 'y']).show(10, truncate=False)
    df_train_vector.printSchema()

    print('after embedding, training set positives: {}, negatives: {}'.format(df_train_vector.filter(df_train_vector['y'] == 1).count(),
                                                                              df_train_vector.filter(df_train_vector['y'] == 0).count()))

if __name__ == '__main__':
    
    idEmbeddingMain()
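The imports at the top also pull in VectorAssembler and GBTClassifier, which this script never uses. A hypothetical sketch (not part of the original pipeline; column names taken from df_train_vector above, parameters purely illustrative) of how the joined vectors could feed a downstream CTR model:

from pyspark.ml.feature import VectorAssembler          # already imported at the top of the script
from pyspark.ml.classification import GBTClassifier     # already imported at the top of the script

# assemble the two ID embeddings into a single feature vector (hypothetical downstream step)
assembler = VectorAssembler(inputCols=['userVector', 'adVector'], outputCol='features')
df_features = assembler.transform(df_train_vector)

# fit a GBT classifier on the click label y (illustrative parameters only)
gbt = GBTClassifier(labelCol='y', featuresCol='features', maxIter=20, seed=42)
gbt_model = gbt.fit(df_features)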


Reposted from blog.csdn.net/qq_42363032/article/details/121018264