import numpy as np
from pyspark.sql import functions as fn
from pyspark.sql.functions import udf
@udf(returnType='double')
def ecpm_var_fn(ecpms):
    """Return the population variance of a collected list of eCPM values.

    Registered as a Spark UDF with an explicit ``double`` return type; a
    bare ``@udf`` defaults to a string return type, which would store the
    variance as text instead of a number.

    Args:
        ecpms: Sequence of numeric values for one group (e.g. the output
            of ``collect_list``). May be ``None`` or empty.

    Returns:
        ``None`` (SQL NULL) when there are no values, ``0.0`` for a single
        value, otherwise ``numpy.var`` of the values (population variance,
        i.e. numpy's default ``ddof=0``) as a Python float.
    """
    # No observations: variance is undefined, so emit NULL rather than
    # letting numpy produce NaN (empty list) or len() raise (None input).
    if not ecpms:
        return None
    # One observation has zero spread; skip the numpy call.
    if len(ecpms) == 1:
        return 0.0
    # Convert numpy.float64 to a plain float so Spark can serialize it.
    return float(np.var(ecpms))
# Collect every 'ecpm' value of each 'alpos_id' group into a list column
# that is also named 'ecpm'.
da_gb = (
    source_data
    .groupby('alpos_id')
    .agg(fn.collect_list('ecpm').alias('ecpm'))
)

# Add an 'ecpm_var' column by applying the ecpm_var_fn UDF to each list.
ecpm_var = da_gb.withColumn(
    colName='ecpm_var',
    col=ecpm_var_fn(fn.col('ecpm')),
)

# Display the resulting DataFrame.
ecpm_var.show()
PySpark: group by a column and compute the variance of another column (pyspark 分组对某列取方差)
Guess you like
Origin blog.csdn.net/qq_42363032/article/details/118413812
Ranking