1、两个时间差计算
## 发表时间距离点击时长
import datetime
def combine7(x, default=100):
    """Return the number of days from the publish date to the click date.

    Args:
        x: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) providing the keys
            ``"publishdate"`` and ``"c_imp_date"``. Each value is converted
            with ``str`` and parsed using the ``%Y%m%d`` format.
        default: Value returned when a field is missing or cannot be parsed.
            Defaults to 100, the fallback the function has always used.

    Returns:
        ``(c_imp_date - publishdate).days`` as an int (negative when the
        click date precedes the publish date), or ``default`` on bad input.
    """
    # Imported locally so the function keeps working even when the
    # module-level name ``datetime`` has been rebound to the class by a
    # later ``from datetime import datetime``.
    from datetime import datetime as _datetime

    try:
        published = _datetime.strptime(str(x["publishdate"]), "%Y%m%d")
        clicked = _datetime.strptime(str(x["c_imp_date"]), "%Y%m%d")
    except (KeyError, TypeError, ValueError):
        # Missing field, non-subscriptable row, or unparsable date string.
        return default
    return (clicked - published).days
# Row-wise day gap between publish date and click date, stored as a new column.
df2_1['publishdate_to_clickdate'] = df2_1.apply(combine7, axis=1)
现在时间
# Print the current time.
# NOTE: this import rebinds the name ``datetime`` from the module (imported
# earlier in this file with ``import datetime``) to the class, so any code run
# afterwards that expects the module form, e.g. ``datetime.datetime.strptime``,
# would raise AttributeError.
from datetime import datetime
current_time = datetime.now()
print(current_time)
2、pandas apply函数加速包
参考:https://www.jianshu.com/p/771f3c13752b
a、swifter
pip install swifter
# Example usage of the swifter accessor on pandas objects.
# NOTE(review): relies on `pd` (pandas) and on swifter having been imported
# elsewhere; neither import is visible in this snippet.
df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
# Plain pandas apply on a single column (original note: runs on a single core).
df['x2'] = df['x'].apply(lambda x: x**2)
# Same computation through the swifter accessor
# (original note: runs on multiple cores).
df['x2'] = df['x'].swifter.apply(lambda x: x**2)
# swifter apply on the whole DataFrame.
# NOTE(review): no axis is passed, so presumably the lambda receives one column
# at a time and the result is indexed by column name rather than by row —
# confirm before assigning it to a column.
df['agg'] = df.swifter.apply(lambda x: x.sum() - x.min())
# swifter apply on specific columns.
# NOTE(review): `my_func`, `positional_arg`, `keyword_argval` and the `inCol*`
# columns are placeholders that are not defined in this snippet.
df['outCol'] = df[['inCol1', 'inCol2']].swifter.apply(my_func)
df['outCol'] = df[['inCol1', 'inCol2', 'inCol3']].swifter.apply(my_func,
positional_arg, keyword_arg=keyword_argval)
b、parallelize(使用 pandarallel)
pip install pandarallel
# Example usage of pandarallel's parallel apply.
from pandarallel import pandarallel
# Initialization: must run before parallel_apply is used; progress_bar=True
# asks pandarallel to display progress bars.
pandarallel.initialize(progress_bar=True)
# Parallel apply.
# NOTE(review): `df` and `func` are placeholders defined elsewhere, and the
# result is not assigned to anything here.
df.parallel_apply(func)
c、向量化或者转化为numpy操作
参考:https://vimsky.com/article/4327.html
# Per-row TF-IDF cosine similarity between the query text and the title /
# brief texts, appended to dfs1 as two new columns.
# NOTE(review): relies on `dfs1`, `pd`, `TfidfVectorizer` and
# `cosine_similarity` being defined/imported elsewhere (presumably from
# scikit-learn — confirm).
query_cut = dfs1["query_cut"].tolist()
title_cut = dfs1["title_cut"].tolist()
brief_cut = dfs1["brief_cut"].tolist()
cont_all = zip(query_cut, title_cut, brief_cut)
tfidf_vec = TfidfVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
query_title_tfidf1 = []
query_brief_tfidf1 = []
for query_text, title_text, brief_text in cont_all:
    # The vectorizer is refit on each row, so the vocabulary and IDF weights
    # come only from that row's three texts.
    corpus = [query_text, title_text, brief_text]
    tfidf_matrix = tfidf_vec.fit_transform(corpus)
    # Row 0 of the pairwise matrix: query vs. [query, title, brief].
    query_similarities = cosine_similarity(tfidf_matrix)[0]
    query_title_tfidf1.append(query_similarities[1])
    query_brief_tfidf1.append(query_similarities[2])
# Reuse dfs1's index: concat(axis=1) aligns on index labels, so a default
# RangeIndex here would misalign rows (and introduce NaN) whenever dfs1 has
# been filtered, sorted or otherwise carries a non-default index.
dff = pd.DataFrame(
    {
        "query_title_tfidf1": query_title_tfidf1,
        "query_brief_tfidf1": query_brief_tfidf1,
    },
    index=dfs1.index,
)
dfs2 = pd.concat([dfs1, dff], axis=1)  # axis=1: concatenate column-wise