需求
- 将用户id、广告id做成embedding
思路:
- 用户id embedding
- 根据广告id分组
- 统计每个广告id下,有点击的用户id,并根据用户id的点击次数,降序将用户id的序列返回,形成一个序列向量
- 将用户id的序列向量训练word2vec,得到每个用户id的词向量
- 用户id embedding完成
广告id embedding思路一样
Word2vec
关于word2vec原理看这篇文章,这里不再阐述
https://blog.csdn.net/qq_42363032/article/details/113697460
word2vec 的 pyspark API 文档
关键代码
对suuid分组,计算广告id的序列向量
def row_suuidDense(row):
    """
    For one user (suuid), build the sequence of clicked ad ids ordered by
    click count, descending (most-clicked ad first).

    :param row: (suuid, list of ad ids, list of click labels y) from groupBy
    :return: (suuid, list of clicked ad ids, most-clicked first)
    """
    suuid, advertisement, y = row[0], row[1], row[2]
    # key: ad id, value: click count — only clicked impressions (y != 0) count
    denseDict = {}
    for ad, label in zip(advertisement, y):
        if label != 0:
            denseDict[ad] = denseDict.get(ad, 0) + 1
    # BUG FIX: sort by click count (x[1]) descending, not by the ad id (x[0]) —
    # the original key=lambda x: x[0] ordered by id string, contradicting the intent
    userDense = [ad for ad, _ in sorted(denseDict.items(), key=lambda x: x[1], reverse=True)]
    return suuid, userDense
# Group by user id (suuid), collecting the per-user ad-id and label lists,
# then map each grouped row to its click-ordered ad-id sequence.
grouped_by_user = dftrte.groupBy('suuid').agg(
    fn.collect_list('advertisement').alias('advertisement'),
    fn.collect_list('y').alias('y')
)
df_suuid = grouped_by_user.rdd.map(row_suuidDense).toDF(schema=['suuid', 'adDense'])
df_suuid.printSchema()
df_suuid.show(10, truncate=False)
'''
root
|-- suuid: string (nullable = true)
|-- adDense: array (nullable = true)
| |-- element: string (containsNull = true)
+------------------------------------+----------------+
|suuid |adDense |
+------------------------------------+----------------+
|00000000-0194-f184-785c-7ca27fa7413d|[p89544, p89343]|
|00000000-1bf0-a0dc-8801-73136c45699f|[p89509, p89343......]|
|00000000-217c-8379-032b-f26c741ca303|[11159] |
|00000000-29d0-0c59-dcfb-57765fd9ab17|[p89509, w09189......]|
|00000000-39a5-7555-2642-791e11b7e51a|[] |
|00000000-4844-dc1e-341a-2b5339dcc7ec|[11295] |
|00000000-57f6-a5fa-ffff-ffffeccc5621|[] |
|00000000-5958-4a79-ffff-ffffe94b9817|[11295] |
|00000000-73b1-f742-4adf-443600bb7359|[] |
|03E82FE6-AE75-4893-A72C-F8DF88E414FF|[] |
+------------------------------------+----------------+
only showing top 10 rows
'''
对于没有点击的数据,可以删掉,也可以填充
训练Word2vec,得到广告id embedding
# Train Word2Vec on the ad-id sequences; rows without any click were dropped
# beforehand, so every sentence is non-empty.
word2Vec = Word2Vec(
    vectorSize=1, minCount=0, seed=42,
    inputCol="adDense", outputCol="adDense_embedding",
    stepSize=0.025, windowSize=5, maxSentenceLength=100,
)
model = word2Vec.fit(df_suuid_)
# model.transform(...) would give sentence vectors; only word vectors are needed here.
adembedding = (
    model.getVectors()  # one word vector per ad id — embedding done
    .withColumnRenamed('vector', 'advector')
    .withColumnRenamed('word', 'advertisement')
)
adembedding.show(truncate=False)
'''
+-------------+----------------------+
|advertisement|advector |
+-------------+----------------------+
|p39412 |[0.16868238151073456] |
|11153 |[0.1493111401796341] |
|11303 |[-0.11455214768648148]|
|11279 |[0.42697083950042725] |
|p41210 |[0.41842252016067505] |
|11309 |[0.4229533076286316] |
|11341 |[3.1918866634368896] |
|11149 |[3.9014792442321777] |
|p91233 |[-0.06415798515081406]|
|11325 |[-0.6188488602638245] |
|11297 |[0.7119559645652771] |
|11369 |[3.152208089828491] |
|p41113 |[-0.3910478353500366] |
|11275 |[3.341848850250244] |
|p40723 |[0.14801627397537231] |
|p41009 |[-0.13066606223583221]|
|p41248 |[-0.3884568512439728] |
|8179 |[0.20165494084358215] |
|p39398 |[-0.07097986340522766]|
|11271 |[-0.45492124557495117]|
+-------------+----------------------+
only showing top 20 rows
'''
全部代码
import warnings
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
import pyspark.sql.types as T
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.linalg import Vectors
import numpy as np
import time
# Spark bootstrap — order matters: the SparkContext must exist before the
# SQLContext / SparkSession wrappers are built on top of it.
sc = SparkContext(appName="idEmbedding")
sqlContext = SQLContext(sc)  # NOTE(review): SQLContext is deprecated in favour of SparkSession — confirm the target Spark version
ss = SparkSession(sc).builder.getOrCreate()
warnings.filterwarnings("ignore", category=DeprecationWarning)
sc.setLogLevel('ERROR')  # keep driver output quiet; only errors are logged
# Train a Word2Vec model and return the per-id word vectors.
def word2vecEmbedding(df, inputCol, outputCol, vectorName, wordName, flag=True):
    '''
    Train Word2Vec on dense id sequences and return the word (id) vectors.

    :param df: dataframe holding one dense id sequence per row (ids with no
               clicks were already filtered out by the caller)
    :param inputCol: column with the dense id sequence
    :param outputCol: column name for the sentence vector (unused downstream)
    :param vectorName: output column name for the word vector
    :param wordName: output column name for the id itself
    :param flag: whether to show a sample of the resulting dataframe
    :return: dataframe with columns [wordName, vectorName]
    '''
    word2Vec = Word2Vec(vectorSize=1, minCount=0, seed=42, inputCol=inputCol, outputCol=outputCol,
                        stepSize=0.025, windowSize=5, maxSentenceLength=100)
    model = word2Vec.fit(df)
    idembedding = model.getVectors()  # one word vector per id — the embedding
    idembedding = idembedding.withColumnRenamed('vector', vectorName).withColumnRenamed('word', wordName)
    # BUG FIX: print the actual column name, not the literal string 'vectorName'
    print('{} count: '.format(vectorName), idembedding.count())
    if flag:
        idembedding.show(10, truncate=False)
    return idembedding
# Build a dense id sequence: clicked ids ordered by click count, descending.
def row_Dense(row):
    '''
    For one grouping id, return the clicked ids ordered by click count desc.
    e.g. DOMS-000 (suuid) -> [100, P0291, 2889] (ad ids)

    :param row: (group id, list of ids, list of click labels) from groupBy
    :return: (group id, dense id list, most-clicked first)
    '''
    # renamed from `id` — the original shadowed the builtin
    group_id, dense_ids, labels = row[0], row[1], row[2]
    # key: id, value: click count — only clicked rows (label != 0) are embedded
    clicks = {}
    for item, label in zip(dense_ids, labels):
        if label != 0:
            clicks[item] = clicks.get(item, 0) + 1
    # BUG FIX: order by click count (x[1]) descending, not by the id (x[0]) —
    # the original key=lambda x: x[0] sorted by id string, contradicting the intent
    dense = [item for item, _ in sorted(clicks.items(), key=lambda x: x[1], reverse=True)]
    return group_id, dense
def embedding(df_train, df_test):
    '''
    Build embeddings for user ids and ad ids.

    Ad id embedding: group by user id (suuid), compute each user's dense ad-id
    sequence (ad ids ordered by click count desc), train Word2Vec -> one
    vector per ad id.
    User id embedding: group by ad id, compute each ad's dense user-id
    sequence, train Word2Vec -> one vector per user id.

    :param df_train: training dataframe
    :param df_test: test dataframe
    :return: (user id embedding dataframe, ad id embedding dataframe)
    '''
    # Train and test sets are embedded together.
    # FIX: unionAll is a deprecated alias of union since Spark 2.0.
    dftrte = df_train.union(df_test)
    # --- ad id embedding: per suuid, click-count-descending ad-id sequence ---
    df_suuid = dftrte.groupBy('suuid').agg(
        fn.collect_list('advertisement').alias('advertisement'),
        fn.collect_list('y').alias('y')
    ).rdd.map(row_Dense).toDF(schema=['suuid', 'adDense'])
    # Split users with / without at least one click.
    df_suuid_ = df_suuid.filter(fn.size(df_suuid['adDense']) != 0)
    df_suuid_0 = df_suuid.filter(fn.size(df_suuid['adDense']) == 0)
    print('suuid中,点击的广告序列不为0的:', df_suuid_.count())
    print('suuid中,点击的广告序列为0的:', df_suuid_0.count())
    # Only users with clicks contribute sentences -> vectors per ad id.
    adembedding = word2vecEmbedding(df_suuid_, 'adDense', 'adDense_embedding', 'adVector', 'advertisement', True)
    # --- user id embedding: per ad id, click-count-descending user-id sequence ---
    df_adid = dftrte.groupBy('advertisement').agg(
        fn.collect_list('suuid').alias('suuid'),
        fn.collect_list('y').alias('y')
    ).rdd.map(row_Dense).toDF(schema=['advertisement', 'userDense'])
    df_adid_ = df_adid.filter(fn.size(df_adid['userDense']) != 0)
    df_adid_0 = df_adid.filter(fn.size(df_adid['userDense']) == 0)
    print('adid中,点击的用户序列不为0的:', df_adid_.count())
    print('adid中,点击的用户序列为0的:', df_adid_0.count())
    # Only ads with clicks contribute sentences -> vectors per user id.
    userembedding = word2vecEmbedding(df_adid_, 'userDense', 'userDense_embedding', 'userVector', 'suuid', True)
    print('embedding完成')
    return userembedding, adembedding
def idEmbeddingMain():
    '''
    Entry point: load train/test CSVs, deduplicate, build the id embeddings,
    and join the ad-id / user-id vectors back onto the training set.
    '''
    # FIX: removed unused local `eval_path` (the eval CSV was never read here).
    train_path = '/user/renwanxin/ad_ctr_test/tmp/2021-10-23_train.csv'
    test_path = '/user/renwanxin/ad_ctr_test/tmp/2021-10-23_test.csv'
    df_train = ss.read.options(header='True', inferSchema='True', delimiter=',').csv(train_path)
    df_test = ss.read.options(header='True', inferSchema='True', delimiter=',').csv(test_path)
    print('原始训练集样本量:{}'.format(df_train.count()))
    print('原始测试集样本量:{}'.format(df_test.count()))
    df_train = df_train.drop_duplicates()
    df_test = df_test.drop_duplicates()
    print('去重后训练集样本量:{}'.format(df_train.count()))
    print('去重后测试集样本量:{}'.format(df_test.count()))
    # Build both embeddings; cache since each is joined (and re-counted) below.
    userembedding, adembedding = embedding(df_train, df_test)
    userembedding.cache()
    adembedding.cache()
    # Join the ad-id vectors onto the training set.
    df_train_advector = df_train.join(adembedding, how='left', on='advertisement')
    df_train_advector.select(['suuid', 'advertisement', 'adVector', 'y']).show(10, truncate=False)
    print('训练集拼接广告id向量 df_train_advector:', df_train_advector.count())
    # Rows whose ad never got a click have no vector (left join -> null): drop
    # them. Alternative: fillna(0.0) to keep them.
    df_train_advector = df_train_advector.dropna()
    # FIX: frame is already dropna()'d — the extra dropna() before count() was
    # a redundant second pass with the same result.
    print('过滤掉没有点击的广告id向量---', df_train_advector.count())
    # Join the user-id vectors.
    df_train_vector = df_train_advector.join(userembedding, how='left', on='suuid')
    print('训练集拼接用户id向量 df_train_vector:', df_train_vector.count())
    df_train_vector = df_train_vector.dropna()
    print('过滤掉没有点击的用户id向量---', df_train_vector.count())
    df_train_vector.select(['suuid', 'advertisement', 'userVector', 'adVector', 'y']).show(10, truncate=False)
    df_train_vector.printSchema()
    print('embedding后,训练集正样本:{},负样本:{}'.format(df_train_vector.filter(df_train_vector['y'] == 1).count(),
                                                          df_train_vector.filter(df_train_vector['y'] == 0).count()))
# Script entry point — run the full embedding pipeline when executed directly.
if __name__ == '__main__':
    idEmbeddingMain()