Practical Code for a Spark Education Project

# -*- coding: utf-8 -*-
# Program function: read data
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Import data types
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# You can run this Spark code with a local PySpark environment or with the
# PySpark environment inside the virtual machine; both are configured via os.environ
os.environ['SPARK_HOME'] = '/export/server/spark-2.3.0-bin-hadoop2.7'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
# When multiple Python versions exist, not specifying one explicitly is likely to cause errors
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON


if __name__ == '__main__':
    # 1 - set up the Spark environment
    spark = SparkSession.builder \
        .appName('test') \
        .getOrCreate()
    sc = spark.sparkContext
    # 2 - load the data (despite its name, jdbcDF is read from a CSV file)
    jdbcDF = spark.read.format("csv")\
        .option("header", True)\
        .option("sep", "\t")\
        .option("inferSchema", "true")\
        .load("file:///tmp/pycharm_project_553/EduAnalysis/data/eduxxx.csv")

    jdbcDF.printSchema()
    jdbcDF.show(2)
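
    # A minimal sketch of an explicit schema using the StructType imports above
    # (assumption: only the three columns this script references are listed; the
    # real eduxxx.csv likely has more fields).
    exampleSchema = StructType([
        StructField("question_id", StringType(), True),
        StructField("subject_id", StringType(), True),
        StructField("recommendations", StringType(), True),
    ])
    # Passing .schema(exampleSchema) to spark.read instead of inferSchema="true"
    # would avoid the extra pass over the file that schema inference requires.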

    """
    要求: 找到Top50热点题对应的科目,然后统计这些科目中,分别包含这几道热点题的条目数
    热点题
    题号 热度(数量) 学科
    1   100  数学
    2   99   数学
    3   98   语文

    最终结果:
    学科  热点题数量
    数学  2
    语文 1
    """
    allInfoDS = jdbcDF
    # Register the DataFrame as a temp view so it can be queried with Spark SQL
    allInfoDS.createOrReplaceTempView("t_answer")
    def q1():
        spark.sql("""
            select subject_id, count(t_answer.question_id) as hot_question_count
            from (
                select question_id, count(*) as frequency
                from t_answer
                group by question_id
                order by frequency desc
                limit 50
            ) t1
            join t_answer on t1.question_id = t_answer.question_id
            group by subject_id
            order by hot_question_count desc
        """).show()
        # Equivalent DataFrame DSL version
        top50DS = allInfoDS.groupBy("question_id").count().orderBy("count", ascending=False).limit(50)
        top50DS.join(allInfoDS, "question_id").groupBy("subject_id").count().orderBy("count", ascending=False).show()
    # q1()  # uncomment to run

    """
    +----------+------------------+
    |subject_id|hot_question_count|
    +----------+------------------+
    | 科目ID_1_数学|               311|
    | 科目ID_2_语文|               276|
    | 科目ID_3_英语|               267|
    +----------+------------------+
    """

    # Analysis of recommended questions per subject
    def q2():
        spark.sql("""
            select t4.subject_id, count(*) as frequency
            from (
                select distinct t3.question_id, t_answer.subject_id
                from (
                    select explode(split(t2.recommendations, ',')) as question_id
                    from (
                        select recommendations
                        from (
                            select question_id, count(*) as frequency
                            from t_answer
                            group by question_id
                            order by frequency desc
                            limit 20
                        ) t1
                        join t_answer on t1.question_id = t_answer.question_id
                    ) t2
                ) t3
                join t_answer on t3.question_id = t_answer.question_id
            ) t4
            group by t4.subject_id
            order by frequency desc
        """).show()

        """
        +----------+---------+
        |subject_id|frequency|
        +----------+---------+
        | 科目ID_3_英语|      262|
        | 科目ID_2_语文|      240|
        | 科目ID_1_数学|      239|
        +----------+---------+
        """
        # Equivalent DataFrame DSL version
        # Top 20 most-answered questions
        top20DS = allInfoDS.groupBy("question_id").count().orderBy("count", ascending=False).limit(20)
        # Attach each hot question's comma-separated recommendation list
        recommendListDF = top20DS.join(allInfoDS, "question_id")
        # Explode the lists into one row per recommended question id
        questionIdDF = recommendListDF.select(F.explode(F.split("recommendations", ",")).alias("question_id"))
        # Deduplicate, then look up each recommended question's subject
        questionIdAndSubjectIdDF = questionIdDF.distinct().\
            join(allInfoDS.dropDuplicates(["question_id"]), on="question_id").select("question_id", "subject_id")
        questionIdAndSubjectIdDF.groupBy("subject_id").count().orderBy("count", ascending=False).show()
    # q2()  # uncomment to run
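
    # A minimal local smoke test (assumption: synthetic rows, not the real
    # eduxxx.csv data). It builds a tiny DataFrame with the three columns the
    # queries above rely on, so the pipeline shape can be checked without the file.
    testDF = spark.createDataFrame(
        [("101", "math", "102,103"), ("102", "math", "101"), ("103", "chinese", "101,102")],
        ["question_id", "subject_id", "recommendations"])
    testDF.show()

    # Release Spark resources once the analyses are done
    spark.stop()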


Reprinted from: blog.csdn.net/HELLOWORLD2424/article/details/129819234