使用陌陌案例数据,用sparksql进行需求实现
需求说明
字段说明
代码实现
#conding = utf-8
import os
os.environ['JAVA_HOME']= '/export/server/jdk1.8.0_241'
os.environ['PYSPARK_PYTHON']= '/root/anaconda3/envs/pyspark_env/bin/python'
import pandasas pd
from pyspark.sqlimport SparkSession
from pyspark.sql.typesimport StringType,FloatType,IntegerType,DateType
from pyspark.sqlimport functionsas F
import warnings
warnings.filterwarnings('ignore')
if __name__== '__main__':
spark= SparkSession.builder.\
master("local[*]").\
config('spark.sql.shuffle.partitions','4').\
config('spark.sql.warehouse.dir','hdfs://node1:8020/user/hive/warehouse').\
config('hive.metastore.uris','thrift://node1:9083').\
enableHiveSupport().\
getOrCreate()
print('--------------------')
data= spark.sql('select * from db_msg.tb_msg_source')
# 查看数据情况,软件显示不全,截取部分需要用到字段进行查看
print(data.describe('msg_time','sender_name','sender_account',
'receiver_account','sender_gps','receiver_os',
'receiver_phonetype'
).show())
print(data.columns)
def mm_hour(msg_time):
r= pd.to_datetime(msg_time).hour
return r
udf1= F.udf(mm_hour,IntegerType())
data= data.withColumn('hour_info',udf1('msg_time'))
# 查看是否增加成功
print(data.select(data['msg_time'], data['hour_info']).show(3))
# data.persist() #电脑性能不好会卡住
# 统计今日总消息量persist
today_message_number= data.count()
print('统计今日总消息量{}'.format(today_message_number))
# 统计每小时的消息量,发送和接受的用户数
df_msg_hour= data.groupby('hour_info').\
agg(
F.count('msg_time').alias('消息量'),
F.countDistinct('sender_account').alias('发送用户数'),
F.countDistinct('receiver_account').alias('接受用户数')
)
print(df_msg_hour.show(24))
#各地区发送消息数据量
# 发送消息最多的top10用户
msg_sender_user= data.groupby('sender_account').\
agg(F.count('msg_time').alias('消息量')).\
sort(['消息量'],ascending=False)
print(msg_sender_user.show(10))
#接受消息最多的top10用户
msg_receiver_user= data.groupby('receiver_account').\
agg(F.count('msg_time').alias('消息量')).\
sort(['消息量'],ascending=False)
print(msg_receiver_user.show(10))
# 统计手机型号分布情况
phonetype_sender_user= data.groupby('sender_phonetype').\
agg(F.countDistinct('sender_account').alias('数量')).\
sort(['数量'],ascending=False)
print(phonetype_sender_user.show(truncate=False))
# 统计操作系统分布情况
sender_os_cnt= data.groupby('sender_os').\
agg(F.countDistinct('sender_account').alias('数量')).\
sort(['数量'],ascending=False)
print(sender_os_cnt.show(truncate=False))