Spark SQL: filling in missing values
Reference:
https://www.jianshu.com/p/56cff9f6e0be
# Fill None (missing) values with replacement values.
# Aggregate the mean of every column except 'gender', then pull the single
# result row back as a plain {column: mean} dict via pandas.
# NOTE(review): `fn` is presumably pyspark.sql.functions, imported elsewhere.
means = (
    df_miss_no_income
    .agg(*[
        fn.mean(col_name).alias(col_name)
        for col_name in df_miss_no_income.columns
        if col_name != 'gender'
    ])
    .toPandas()
    .to_dict('records')[0]
)
# 'gender' was left out of the mean aggregation; give it a placeholder label.
means['gender'] = "missing"
print(means)
# df.fillna(dict) fills the None values in df: each key of the dict is a
# column name and the matching value is what that column's Nones become.
df_miss_no_income.fillna(means).show()
{'age': 40.4, 'height': 5.471428571428571, 'gender': 'missing', 'weight': 140.28333333333333, 'id': 4.0}
+---+------------------+------+---+-------+
| id| weight|height|age| gender|
+---+------------------+------+---+-------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3|140.28333333333333| 5.2| 40|missing|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2| 40| F|
| 7| 129.2| 5.3| 42| M|
+---+------------------+------+---+-------+