Implementing DeepFM with tf2.0

Recap of the principle

[Figure: DeepFM architecture, with the FM component on the left, the Deep component on the right, and their outputs combined at the output layer]

  • The left side replaces the Wide part of Wide&Deep with an FM, strengthening the shallow network's ability to model feature interactions
  • The right side is the same as the Deep part of Wide&Deep: a multi-layer neural network that processes all features at depth
  • The output layer combines the FM output with the Deep output to produce the final prediction; this is the DeepFM structure (see the formulas below)
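
In formulas, the two components are trained jointly and combined through a sigmoid:

$$\hat{y} = \sigma\left(y_{\text{FM}} + y_{\text{DNN}}\right)$$

The FM part's second-order interactions can be computed in $O(nk)$ with the standard sum-of-squares identity, which the code below implements literally as "sum then square" minus "square then sum":

$$\sum_{i<j} \langle \mathbf{v}_i, \mathbf{v}_j \rangle x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\Big(\sum_{i} v_{i,f}\, x_i\Big)^{2} - \sum_{i} v_{i,f}^{2}\, x_i^{2}\right]$$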

For more detail, see this article:

https://blog.csdn.net/qq_42363032/article/details/113696907

Building the DeepFM network with tf2.0

import sys, time
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
from tensorflow.python.keras.models import save_model, load_model
from tensorflow.keras.models import model_from_yaml
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, roc_curve, precision_score, recall_score, roc_auc_score

from toolsnn import *

class myDeepFM():
    def __init__(self, data):
        self.cols = data.columns.values
        # define the feature groups (Criteo convention: dense features start with "I", sparse ones with "C")
        self.dense_feats = [f for f in self.cols if f[0] == "I"]
        self.sparse_feats = [f for f in self.cols if f[0] == "C"]

    ''' Process the dense features '''
    def process_dense_feats(self, data, feats):
        d = data.copy()
        d = d[feats].fillna(0.0)
        for f in feats:
            d[f] = d[f].apply(lambda x: np.log(x + 1) if x > -1 else -1)

        return d
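
    # (For intuition: the Criteo integer features are heavy-tailed counts, so
    #  log(x + 1) compresses their range, e.g. x = 0 -> 0.0 and x = 999 -> ~6.9;
    #  values <= -1 keep the sentinel -1.)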

    ''' Process the sparse features '''
    def process_sparse_feats(self, data, feats):
        d = data.copy()
        d = d[feats].fillna("-1")
        for f in feats:
            label_encoder = LabelEncoder()
            d[f] = label_encoder.fit_transform(d[f])

        return d
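
    # (A small illustration of what LabelEncoder does: it maps raw category
    #  strings to consecutive integer ids, which is why nunique() can later
    #  serve as the embedding vocabulary size, e.g.
    #    LabelEncoder().fit_transform(['05db9164', '68fd1e64', '05db9164', '-1'])
    #    -> array([1, 2, 1, 0]))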

    ''' First-order features '''
    def first_order_features(self):
        # build the inputs for the dense features
        dense_inputs = []
        for f in self.dense_feats:
            _input = Input([1], name=f)
            dense_inputs.append(_input)
        # concatenate the inputs so a Dense layer can be attached
        concat_dense_inputs = Concatenate(axis=1)(dense_inputs)  # (?, 13); 13 is the number of dense features
        # a single-unit fully connected layer: the weighted sum of the dense variables
        fst_order_dense_layer = Dense(1)(concat_dense_inputs)  # (?, 1)

        # build one input per sparse feature; keeping them separate makes it easy to build the second-order interactions later
        sparse_inputs = []
        for f in self.sparse_feats:
            _input = Input([1], name=f)
            sparse_inputs.append(_input)

        sparse_1d_embed = []
        for i, _input in enumerate(sparse_inputs):
            f = self.sparse_feats[i]
            voc_size = total_data[f].nunique()  # note: relies on the module-level total_data built in __main__
            # l2 regularization against overfitting
            reg = tf.keras.regularizers.l2(0.5)
            _embed = Embedding(voc_size, 1, embeddings_regularizer=reg)(_input)
            # the Embedding output carries an extra axis (?, 1, 1), so Flatten it before feeding any further layer
            _embed = Flatten()(_embed)
            sparse_1d_embed.append(_embed)
        # sum the per-feature 1-d embedding lookups w_i
        fst_order_sparse_layer = Add()(sparse_1d_embed)

        # combine into the linear (first-order) part
        linear_part = Add()([fst_order_dense_layer, fst_order_sparse_layer])

        return dense_inputs, sparse_inputs, linear_part

    ''' Second-order features '''
    def second_order_features(self, sparse_inputs):
        # embedding size
        k = 8

        # only the sparse features take part in the second-order interactions
        sparse_kd_embed = []
        for i, _input in enumerate(sparse_inputs):
            f = self.sparse_feats[i]
            voc_size = total_data[f].nunique()
            reg = tf.keras.regularizers.l2(0.7)
            _embed = Embedding(voc_size, k, embeddings_regularizer=reg)(_input)
            sparse_kd_embed.append(_embed)

        # 1. concatenate all sparse embeddings into an (n, k) matrix, where n is the number of features and k the embedding size
        concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed)  # (?, n, k)

        # 2. sum first, then square
        sum_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(concat_sparse_kd_embed)  # (?, k)
        square_sum_kd_embed = Multiply()([sum_kd_embed, sum_kd_embed])  # (?, k)

        # 3. square first, then sum
        square_kd_embed = Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed])  # (?, n, k)
        sum_square_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(square_kd_embed)  # (?, k)

        # 4. subtract and halve
        sub = Subtract()([square_sum_kd_embed, sum_square_kd_embed])  # (?, k)
        sub = Lambda(lambda x: x * 0.5)(sub)  # (?, k)
        snd_order_sparse_layer = Lambda(lambda x: K.sum(x, axis=1, keepdims=True))(sub)  # (?, 1)

        return concat_sparse_kd_embed, snd_order_sparse_layer
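
    # (Sanity check of the sum-of-squares identity implemented above; runnable on its own:
    #    V = np.random.randn(5, 8)                                          # 5 embeddings, k = 8
    #    pairwise = sum(V[i] @ V[j] for i in range(5) for j in range(i + 1, 5))
    #    trick = 0.5 * ((V.sum(axis=0) ** 2).sum() - (V ** 2).sum())
    #    np.isclose(pairwise, trick)                                        # -> True
    # )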

    ''' The DNN part '''
    def dnn(self, concat_sparse_kd_embed):
        flatten_sparse_embed = Flatten()(concat_sparse_kd_embed)  # (?, n*k)

        fc_layer = Dropout(0.5)(Dense(256, activation='relu')(flatten_sparse_embed))  # (?, 256)
        fc_layer = Dropout(0.3)(Dense(256, activation='relu')(fc_layer))  # (?, 256)
        fc_layer = Dropout(0.1)(Dense(256, activation='relu')(fc_layer))  # (?, 256)
        fc_layer = Dropout(0.1)(Dense(128, activation='relu')(fc_layer))  # (?, 128)
        fc_layer = Dropout(0.1)(Dense(32, activation='relu')(fc_layer))  # (?, 32)

        fc_layer_output = Dense(1)(fc_layer)  # (?, 1)

        return fc_layer_output

    ''' Combine the outputs '''
    def outRes(self, linear_part, snd_order_sparse_layer, fc_layer_output):
        output_layer = Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
        output_layer = Activation("sigmoid")(output_layer)

        return output_layer

    ''' Compile the model '''
    def compile_model(self, dense_inputs, sparse_inputs, output_layer):
        model = Model(dense_inputs + sparse_inputs, output_layer)
        # model.summary()

        opt = Adam(learning_rate=1e-4, beta_1=0.98, beta_2=0.999)  # lr= is deprecated in tf2; use learning_rate=
        model.compile(optimizer=opt,
                      loss="binary_crossentropy",
                      metrics=['Precision', 'Recall', tf.keras.metrics.AUC(name='auc')])

        return model
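
    # (plot_model is imported above, so the wiring can be inspected visually,
    #  assuming pydot and graphviz are installed:
    #    plot_model(model, to_file='deepfm.png', show_shapes=True)
    # )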


def GeneratorRandomPatchs(train_x, train_y, batch_size):
    totl, col = np.array(train_x).shape  # e.g. (39, 500000): number of features, number of samples
    # loop forever so steps_per_epoch * epochs batches are always available
    while True:
        for index in range(0, col, batch_size):
            xs, ys = [], []
            for t in range(totl):
                xs.append(train_x[t][index: index + batch_size])
            ys.append(train_y[0][index: index + batch_size])
            yield (xs, ys)
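
# (A quick shape check with toy data: 3 features, 10 samples, batch_size 4:
#    toy_x = [np.arange(10) for _ in range(3)]
#    toy_y = [np.ones(10)]
#    xs, ys = next(GeneratorRandomPatchs(toy_x, toy_y, batch_size=4))
#    len(xs), len(xs[0]), len(ys[0])   # -> (3, 4, 4)
# )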



if __name__ == '__main__':

    data = pd.read_csv('../../data/criteo_sampled_data.csv')
    # data = pd.read_csv(sys.argv[1])
    # data = shuffle(data)

    print(data.shape)

    print(list(data.columns))

    deepFMmodel = myDeepFM(data)

    # feature processing
    data_dense = deepFMmodel.process_dense_feats(data, deepFMmodel.dense_feats)
    data_sparse = deepFMmodel.process_sparse_feats(data, deepFMmodel.sparse_feats)
    print('dense count:', len(deepFMmodel.dense_feats))
    print('sparse count:', len(deepFMmodel.sparse_feats))
    total_data = pd.concat([data_dense, data_sparse], axis=1)
    total_data['label'] = data['label']

    print(total_data.head(3))

    # first-order features
    dense_inputs, sparse_inputs, linear_part = deepFMmodel.first_order_features()
    # second-order features
    concat_sparse_kd_embed, snd_order_sparse_layer = deepFMmodel.second_order_features(sparse_inputs)
    # the DNN part
    fc_layer_output = deepFMmodel.dnn(concat_sparse_kd_embed)
    # combine the outputs
    output_layer = deepFMmodel.outRes(linear_part, snd_order_sparse_layer, fc_layer_output)

    # compile the model
    model = deepFMmodel.compile_model(dense_inputs, sparse_inputs, output_layer)

    # split: the first 500,000 rows for training, the rest for validation
    train_data = total_data.loc[:500000 - 1]
    valid_data = total_data.loc[500000:]

    print('train_data len: ', len(train_data))
    print('validation_data len: ', len(valid_data))


    train_dense_x = [train_data[f].values for f in deepFMmodel.dense_feats]
    train_sparse_x = [train_data[f].values for f in deepFMmodel.sparse_feats]
    train_x = train_dense_x + train_sparse_x
    train_y = [train_data['label'].values]

    val_dense_x = [valid_data[f].values for f in deepFMmodel.dense_feats]
    val_sparse_x = [valid_data[f].values for f in deepFMmodel.sparse_feats]
    val_x = val_dense_x + val_sparse_x
    val_y = [valid_data['label'].values]


    print(train_x)
    print(train_y)
    # exit()

    # model.fit(train_x, train_y,
    #           batch_size=64, epochs=5, verbose=2,
    #           validation_data=(val_x, val_y),
    #           use_multiprocessing=True, workers=4)

    batch_size = 2048
    # in tf2, Model.fit accepts Python generators directly (fit_generator is deprecated);
    # shuffle is ignored when training from a generator
    model.fit(
        GeneratorRandomPatchs(train_x, train_y, batch_size),
        validation_data=(val_x, val_y),
        steps_per_epoch=len(train_data) // batch_size,
        epochs=5,
        verbose=2,
    )

    # path = '/ad_ctr/data/deepFMmodel-10-27.h5'
    path = '../../data/my_deepFMmodel-10-27.h5'
    model.save(path)
    print(' model saved', time.strftime("%H:%M:%S", time.localtime(time.time())))


    modelnew = tf.keras.models.load_model(path)
    y_prob = modelnew.predict(x=val_x, batch_size=256)
    print(type(y_prob))
    print(y_prob.shape)

    y = val_y[0]
    y_prob = y_prob.ravel()
    # threshold the predicted probabilities at 0.5 (a bare int() cast would round everything below 1.0 down to 0)
    y_pre = (y_prob > 0.5).astype(int)
    print(' ', set(y_pre))

    f1 = f1_score(y, y_pre)
    auc = roc_auc_score(y, y_prob)  # AUC is computed on the raw probabilities, not the thresholded labels
    acc = accuracy_score(y, y_pre)
    print(' Precision: %.5f' % (precision_score(y, y_pre)))
    print(' Recall: %.5f' % (recall_score(y, y_pre)))
    print(' F1: %.5f' % (f1))
    print(' AUC: %.5f' % (auc))
    print(' Accuracy: %.5f' % (acc))
    accDealWith(y, y_pre)
    print(' full evaluation finished', time.strftime("%H:%M:%S", time.localtime(time.time())))



    print('==============================')
    print()
    print()
    print()

Implementing DeepFM with deepctr

import os, warnings, time, sys
import pickle
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, roc_curve, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from deepctr.models import DeepFM, xDeepFM, MLR, DeepFEFM, DIN, AFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from deepctr.layers import custom_objects
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.keras.models import save_model, load_model
# only tensorflow.keras is imported below; mixing in the standalone keras package is a common source of subtle errors
from tensorflow.keras.callbacks import *
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *

from toolsnn import *


def train_deepFM2():
    print('DeepFM training started ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()

    # pdtrain: 565,485 positives, 1,133,910 negatives, pos:neg = 1 : 2.0052
    # pdtest: 565,485 positives, 1,134,505 negatives, pos:neg = 1 : 2.0063
    # pdeval_full: 46 positives, 8,253 negatives, pos:neg = 1 : 179.413
    pdtrain = pd.read_csv(train_path_ascii)
    pdtest = pd.read_csv(test_path_ascii)
    data = pd.concat([pdtrain, pdtest[pdtest['y'] == 0]], axis=0, ignore_index=True)

    data = data.drop(['WilsonClickRate_all', 'WilsonClickRate_yesterday', 'WilsonAd_clickRate_all',
                      'WilsonAd_clickRate_yesterday'], axis=1)

    # numericalize `user id`, `ad id`, `user device` and `ad position id` so they can be embedded:
    # each string is reduced to the sum of its characters' ASCII codes (a crude hash, so collisions are possible)
    data['suuid'] = data['suuid'].apply(lambda x: sum([ord(i) for i in x]))
    data['advertisement'] = data['advertisement'].apply(lambda x: sum([ord(i) for i in x]))
    # data['position'] = data['position'].apply(lambda x: sum([ord(i) for i in x]))     # position ids are already float, embed them directly
    data['user_modelMake'] = data['user_modelMake'].apply(lambda x: sum([ord(i) for i in x]))
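    # (For example, sum(ord(c) for c in 'ab12') == 294, but 'ba12' also sums to
    #  294, so distinct ids can collide onto the same embedding row.)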

    # double -> float
    data = transformDF(data, ['reserve_price', 'reserve_price_cpc', 'clickRate_all', 'clickRate_yesterday', 'ad_clickRate_yesterday'], float)

    '''   Feature processing   '''
    global sparsecols, densecols
    # sparse features: one-hot encoded
    sparsecols = ['hour', 'advert_place', 'province_id', 'port_type', 'user_osID', 'is_holidays', 'is_being',
                  'is_outflow', 'advertiser', 'ad_from', 'payment']
    # ASCII-hashed ids, embedded directly
    sparse_ascii = ['suuid', 'advertisement', 'position', 'user_modelMake']
    # dense features: min-max normalized
    densecols = ['W', 'H', 'reserve_price', 'reserve_price_cpc', 'is_rest_click', 'clickPerHour_yesterday',
                 'display_nums_all', 'click_nums_all', 'display_nums_yesterday', 'click_nums_yesterday',
                 'ad_display_all', 'ad_click_all', 'ad_display_yesterday', 'ad_click_yesterday']
    # dense click-rate features
    ratecols = ['WHrate', 'clickRate_all', 'clickRate_yesterday', 'ad_clickRate_yesterday']

    global namesoh
    namesoh = {}
    for sparse in sparsecols:
        onehot = OneHotEncoder()
        arrays = onehot.fit_transform(np.array(data[sparse]).reshape(-1, 1))
        # splice the one-hot columns back onto the original dataframe
        arrays = arrays.toarray()
        names = [sparse + '_' + str(n) for n in range(len(arrays[0]))]
        namesoh[sparse] = names
        data = pd.concat([data, pd.DataFrame(arrays, columns=names)], axis=1)
        data = data.drop([sparse], axis=1)
        # persist the fitted encoder so the same mapping can be applied at serving time
        with open(feature_encode_path.format(sparse) + '.pkl', 'wb') as f:
            pickle.dump(onehot, f)
        # print(' {} one-hot done'.format(sparse))
    print(' one-hot encoding done', time.strftime("%H:%M:%S", time.localtime(time.time())))
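
    # (At serving time the pickled encoder can be reloaded and applied to new data.
    #  A sketch, where new_df is a hypothetical incoming dataframe:
    #    with open(feature_encode_path.format('hour') + '.pkl', 'rb') as f:
    #        onehot = pickle.load(f)
    #    new_cols = onehot.transform(np.array(new_df['hour']).reshape(-1, 1)).toarray()
    # )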

    for dense in densecols:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense] = mms.fit_transform(np.array(data[dense]).reshape(-1, 1))
        with open(feature_encode_path.format(dense) + '.pkl', 'wb') as f:
            pickle.dump(mms, f)
        # print(' {} normalized'.format(dense))
    print(' normalization done', time.strftime("%H:%M:%S", time.localtime(time.time())))

    print(' columns: ', len(list(data.columns)))

    '''   Train / test / validation split   '''
    train_data, test_data = getRata2(data, num=1)
    _, val_data = train_test_split(test_data, test_size=0.2, random_state=1, shuffle=True)

    train_data = shuffle(train_data)
    test_data = shuffle(test_data)
    val_data = shuffle(val_data)
    negBpow(train_data, 'train set')
    negBpow(val_data, 'validation set')
    negBpow(test_data, 'test set')

    print(' train_data shape: ', train_data.shape)
    print(' val_data shape: ', val_data.shape)
    print(' test_data shape: ', test_data.shape)

    '''   First-order features   '''
    sparse_features = []
    for value in namesoh.values():
        for v in value:
            sparse_features.append(v)
    dense_features = densecols + ratecols
    '''   Second-order features   '''
    sparse_feature_columns1 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for feat in sparse_features]
    sparse_feature_columns2 = [SparseFeat(feat, vocabulary_size=int(train_data[feat].max() + 1), embedding_dim=4)
                               for feat in sparse_ascii]
    sparse_feature_columns = sparse_feature_columns1 + sparse_feature_columns2
    dense_feature_columns = [DenseFeat(feat, 1)
                             for feat in dense_features]
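
    # (A minimal sketch of the deepctr feature-column API for reference:
    #  SparseFeat declares a categorical input plus its embedding table, and
    #  DenseFeat declares a raw numeric input:
    #    cols = [SparseFeat('hour_0', vocabulary_size=2, embedding_dim=4), DenseFeat('W', 1)]
    #    get_feature_names(cols)   # -> ['hour_0', 'W']
    # )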

    print(' sparse_features count: ', len(sparse_features))
    print(' dense_features count: ', len(dense_features))

    '''   DNN   '''
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    '''   FM   '''
    linear_feature_columns = sparse_feature_columns + dense_feature_columns

    global feature_names
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    print(' feature_names: ', feature_names)

    '''   feed input   '''
    train_x = {name: train_data[name].values for name in feature_names}
    test_x = {name: test_data[name].values for name in feature_names}
    val_x = {name: val_data[name].values for name in feature_names}
    train_y = train_data[['y']].values
    test_y = test_data[['y']].values
    val_y = val_data[['y']].values

    print(' data processing done', time.strftime("%H:%M:%S", time.localtime(time.time())))
    # print(' train_model_input: ', train_x)
    # print(' val_model_input: ', val_x)
    # print('train_y: ', train_y, train_y.shape)

    deep = DeepFM(linear_feature_columns, dnn_feature_columns,
                  dnn_hidden_units=(256, 128, 64, 32, 1),
                  l2_reg_linear=0.01, l2_reg_embedding=0.01,
                  dnn_dropout=0.2,
                  dnn_activation='relu', dnn_use_bn=True, task='binary')

    opt = Adam(learning_rate=1e-4, beta_1=0.95, beta_2=0.96)  # lr= is deprecated in tf2; use learning_rate=
    deep.compile(optimizer=opt, loss='binary_crossentropy',
                 metrics=['AUC', 'Precision', 'Recall'])

    print(' model built', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print(' training started ', time.strftime("%H:%M:%S", time.localtime(time.time())))
    start_time = time.time()

    '''   Training   '''
    # early stopping: stop once validation precision improves by less than min_delta for `patience` epochs
    earlystop_callback = EarlyStopping(
        monitor='val_precision', min_delta=0.001, mode='max',
        verbose=2, patience=3)

    generator_flag = False    # False: plain fit on in-memory arrays
    # generator_flag = True   # True: feed batches from a generator
    if not generator_flag:
        history = deep.fit(
            train_x, train_y, validation_data=(val_x, val_y),
            batch_size=2000,
            epochs=3000,
            verbose=2,
            shuffle=True,
            # callbacks=[earlystop_callback]
        )
    else:
        batch_size = 2000
        train_nums = len(train_data)
        # in tf2, Model.fit accepts Python generators directly (fit_generator is deprecated);
        # shuffle is ignored when training from a generator
        history = deep.fit(
            GeneratorRandomPatchs(train_x, train_y, batch_size, train_nums, feature_names),
            validation_data=(val_x, val_y),
            steps_per_epoch=train_nums // batch_size,
            epochs=3000,
            verbose=2,
            # callbacks=[earlystop_callback]
        )

    end_time = time.time()
    print(' training finished', time.strftime("%H:%M:%S", time.localtime(time.time())))
    print((' training time: {:.0f} min {:.0f} s'.format((end_time - start_time) // 60, (end_time - start_time) % 60)))

    # save the trained model (h5 / SavedModel format)
    save_model(deep, save_path)
    print(' model saved', time.strftime("%H:%M:%S", time.localtime(time.time())))

    # training curves
    visualization(history, saveflag=True, showflag=False, path1=loss_plt_path.format('loss_auc.jpg'), path2=loss_plt_path.format('precision_recall.jpg'))

    # test-set evaluation
    scores = deep.evaluate(test_x, test_y, verbose=0)
    print(' %s: %.4f' % (deep.metrics_names[0], scores[0]))
    print(' %s: %.4f' % (deep.metrics_names[1], scores[1]))
    print(' %s: %.4f' % (deep.metrics_names[2], scores[2]))
    print(' %s: %.4f' % (deep.metrics_names[3], scores[3]))
    # F1 as the harmonic mean of the precision (scores[2]) and recall (scores[3]) reported by evaluate()
    print(' %s: %.4f' % ('F1', (2*scores[2]*scores[3])/(scores[2]+scores[3])))
    print(' test-set evaluation finished', time.strftime("%H:%M:%S", time.localtime(time.time())))

    # full evaluation
    full_evaluate2()

    end_time_end = time.time()
    print(('DeepFM total training time: {:.0f} min {:.0f} s'.format((end_time_end - start_time_start) // 60, (end_time_end - start_time_start) % 60)))
    print(('{:.1f} hours'.format((end_time_end - start_time_start) / 3600)))
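
One follow-up note: deepctr models contain custom layers, so reloading the saved model later requires registering them via the custom_objects imported at the top (this is deepctr's documented loading pattern). A minimal sketch, with save_path being the file written by save_model above:

from deepctr.layers import custom_objects
from tensorflow.python.keras.models import load_model

deep = load_model(save_path, custom_objects)  # register deepctr's custom layers, then predict as usual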


Reposted from blog.csdn.net/qq_42363032/article/details/121423022