Recommendation-data analysis

Step-by-step analysis, with plots of the data from several different indicators

Import related packages

%matplotlib inline
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Use the SimHei font so that Chinese text in figure labels can be rendered.
# (Typographic quotes from the original paste replaced with ASCII quotes.)
plt.rc('font', family='SimHei', size=13)

import os
import gc
import re
import warnings
import sys

# Suppress all warnings for the rest of the session to keep the output clean.
# (Typographic quotes from the original paste replaced with ASCII quotes.)
warnings.filterwarnings("ignore")

Read data

# Directory that holds the raw CSV files.
path = './data_raw/'

# ----- train -----
trn_click = pd.read_csv(path + 'train_click_log.csv')
item_df = pd.read_csv(path + 'articles.csv')
# Rename the article key so it matches the click-log column used in later merges.
item_df = item_df.rename(columns={'article_id': 'click_article_id'})
item_emb_df = pd.read_csv(path + 'articles_emb.csv')

# ----- test -----
tst_click = pd.read_csv(path + 'testA_click_log.csv')

Data preprocessing

Calculate user's click rank and number of clicks

Rank each user's clicks by click timestamp (most recent click first)

# Rank every user's clicks by timestamp in descending order, so rank 1 is the
# user's most recent click.
# NOTE(review): rank() is called with its default tie handling, so clicks of one
# user that share a timestamp receive an averaged rank which astype(int) then
# truncates -- confirm whether method='first' is wanted instead.
trn_click['rank'] = trn_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)
tst_click['rank'] = tst_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)

# Count the number of clicks made by each user and store it in a new column.
trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')
tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')

Data browsing

User click log file: training set

# Attach the article attributes to every training click. A left join keeps
# every click row even if its article is missing from item_df.
trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])
trn_click.head()
user_id click_article_id click_timestamp click_environment click_deviceGroup click_os click_country click_region click_referrer_type rank click_cnts category_id created_at_ts words_count
0 199999 160417 1507029570190 4 1 17 1 13 1 11 11 281 1506942089000 173
1 199999 5408 1507029571478 4 1 17 1 13 1 10 11 4 1506994257000 118
2 199999 50823 1507029601478 4 1 17 1 13 1 9 11 99 1507013614000 213
3 199998 157770 1507029532200 4 1 17 1 25 5 40 40 281 1506983935000 201
4 199998 96613 1507029671831 4 1 17 1 25 5 39 40 209 1506938444000 185
# User click log information (column names, non-null counts, dtypes, memory usage)
trn_click.info()
<class ‘pandas.core.frame.DataFrame’>
Int64Index: 1112623 entries, 0 to 1112622
Data columns (total 14 columns):
user_id 1112623 non-null int64
click_article_id 1112623 non-null int64
click_timestamp 1112623 non-null int64
click_environment 1112623 non-null int64
click_deviceGroup 1112623 non-null int64
click_os 1112623 non-null int64
click_country 1112623 non-null int64
click_region 1112623 non-null int64
click_referrer_type 1112623 non-null int64
rank 1112623 non-null int32
click_cnts 1112623 non-null int64
category_id 1112623 non-null int64
created_at_ts 1112623 non-null int64
words_count 1112623 non-null int64
dtypes: int32(1), int64(13)
memory usage: 123.1 MB

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
trn_click.describe()
user_id click_article_id click_timestamp click_environment click_deviceGroup click_os click_country click_region click_referrer_type rank click_cnts category_id created_at_ts words_count
count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06
mean 1.221198e+05 1.951541e+05 1.507588e+12 3.947786e+00 1.815981e+00 1.301976e+01 1.310776e+00 1.813587e+01 1.910063e+00 7.118518e+00 1.323704e+01 3.056176e+02 1.506598e+12 2.011981e+02
std 5.540349e+04 9.292286e+04 3.363466e+08 3.276715e-01 1.035170e+00 6.967844e+00 1.618264e+00 7.105832e+00 1.220012e+00 1.016095e+01 1.631503e+01 1.155791e+02 8.343066e+09 5.223881e+01
min 0.000000e+00 3.000000e+00 1.507030e+12 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 1.166573e+12 0.000000e+00
25% 7.934700e+04 1.239090e+05 1.507297e+12 4.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 1.300000e+01 1.000000e+00 2.000000e+00 4.000000e+00 2.500000e+02 1.507220e+12 1.700000e+02
50% 1.309670e+05 2.038900e+05 1.507596e+12 4.000000e+00 1.000000e+00 1.700000e+01 1.000000e+00 2.100000e+01 2.000000e+00 4.000000e+00 8.000000e+00 3.280000e+02 1.507553e+12 1.970000e+02
75% 1.704010e+05 2.777120e+05 1.507841e+12 4.000000e+00 3.000000e+00 1.700000e+01 1.000000e+00 2.500000e+01 2.000000e+00 8.000000e+00 1.600000e+01 4.100000e+02 1.507756e+12 2.280000e+02
max 1.999990e+05 3.640460e+05 1.510603e+12 4.000000e+00 5.000000e+00 2.000000e+01 1.100000e+01 2.800000e+01 7.000000e+00 2.410000e+02 2.410000e+02 4.600000e+02 1.510666e+12 6.690000e+03


# The number of distinct users in the training set is 200,000 ("20w")
trn_click.user_id.nunique()
200000
# Smallest per-user click count: each user in the training set has clicked on
# at least two articles
trn_click.groupby('user_id')['click_article_id'].count().min()
# For each click-log column, draw a bar chart of its 10 most frequent values
# on a 5x2 grid of subplots.
# Fixes relative to the pasted original:
#   * restored the loop indentation that was lost in extraction
#     (NOTE(review): nesting reconstructed from context -- title per subplot,
#     tight_layout/show once after the loop);
#   * 'click_region ' had a trailing space and would raise KeyError;
#   * dropped the extra plt.figure() call that only created an empty figure;
#   * the bars are built from the value_counts() Series directly, because the
#     column names produced by value_counts().reset_index() differ between
#     pandas versions ('index'/<col> vs <col>/'count').
plot_cols = [
    'click_article_id', 'click_timestamp', 'click_environment',
    'click_deviceGroup', 'click_os', 'click_country',
    'click_region', 'click_referrer_type', 'rank', 'click_cnts',
]

plt.figure(figsize=(15, 20))
for i, col in enumerate(plot_cols, start=1):
    plt.subplot(5, 2, i)
    # value_counts() sorts by frequency, so head(10) is the 10 most common values.
    top = trn_click[col].value_counts().head(10)
    ax = sns.barplot(x=top.index, y=top.values)
    # Keep the axis labels the original DataFrame-based call produced.
    ax.set_xlabel('index')
    ax.set_ylabel(col)
    # Rotate tick labels so long values (ids, timestamps) do not overlap.
    for label in ax.get_xticklabels():
        label.set_rotation(90)
    plt.title(col)
plt.tight_layout()
plt.show()
Insert picture description here
Insert picture description here
Insert picture description here
# Attach the article attributes to every test-set click. A left join keeps
# every click row even if its article is missing from item_df.
tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])
tst_click.head()

Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here
Insert picture description here

Guess you like

Origin blog.csdn.net/m0_49978528/article/details/110207056