StructuredStreaming Exercises (Reading Files, Spark SQL)

1. Use Structured Streaming to read socket data, and write each word together with its reversed form as JSON into the file folder under the current directory.

Code:

package com.czxy.StructuredStreaming_0417

import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object Text01 {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text01").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    val frame: DataFrame = spark.readStream
      .option("host", "node01")
      .option("port", 9999)
      .format("socket")
      .load()
    import  spark.implicits._
    val dataDS: Dataset[String] = frame.as[String]
    val wordDF = dataDS.flatMap(_.split(" "))
      .map(x => (x, x.reverse)).toDF("before", "reverse")
    //Write the result as JSON files
    wordDF.writeStream
      .format("json")
      .option("path","D:\\Spark_04_17\\file")
      .option("checkpointLocation","json")//必须指定 checkpoint 目录,否则报错
      .trigger(Trigger.ProcessingTime(0))
      .start()
      .awaitTermination()
  }
}
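
Once the streaming query above has written a few micro-batches, the JSON output can be checked with a plain batch job. A minimal sketch, assuming the same output path used in Text01 (the Text01Check object name is just illustrative):

package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.SparkSession

object Text01Check {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text01Check").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    //Read back the JSON files produced by the streaming file sink (path assumed from Text01)
    val result = spark.read.json("D:\\Spark_04_17\\file")
    //Each record should contain the "before" and "reverse" columns
    result.show(false)
    spark.stop()
  }
}
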
2. Use Structured Streaming to read the CSV files written to the student_info folder.
package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

object Text02 {
  def main(args: Array[String]): Unit = {
    //Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text02").getOrCreate()
    //Prepare the schema
    //student id, name, sex, class id, enrollment date
    val structType: StructType = new StructType()
      .add("id", "string")
      .add("name", "string")
      .add("sex", "string")
      .add("idStudent", "string")
      .add("date", "string")
    //Read the data
    val csvDatas: DataFrame = spark.readStream.schema(structType).csv("D:\\Spark_04_17\\student_info\\")
    //Compute the required statistics
    import spark.implicits._
    //2.1 Count how many male and female students there are in the files
    val sexCount: Dataset[Row] = csvDatas.selectExpr("sex").groupBy("sex").count().sort($"count".desc)
    //2.2 Count how many male and how many female students have the surname "王"
    val wang: Dataset[Row] = csvDatas.select("name", "sex").where("name like '王%'").groupBy("sex").count().sort($"count".desc)

    //Output the data (only the wang query is started here)
    wang.writeStream
      .format("console")
      .outputMode("complete")
      .start()
      .awaitTermination()
  }
}
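
Text02 computes both aggregations but only starts the wang query, so the sexCount result is never written anywhere. If both results are needed at once, each aggregation can be started as its own streaming query and the driver can then block on all of them. A minimal sketch, continuing from the sexCount and wang DataFrames defined above (the query names are illustrative):

    //Start each aggregation as its own streaming query
    sexCount.writeStream
      .format("console")
      .outputMode("complete")
      .queryName("sexCount")
      .start()
    wang.writeStream
      .format("console")
      .outputMode("complete")
      .queryName("wangBySex")
      .start()
    //Block until any active query terminates
    spark.streams.awaitAnyTermination()
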
3. Use Structured Streaming to read the CSV files written to the department_info folder.
package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

object Text03 {
  
  def main(args: Array[String]): Unit = {
    //Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text03").getOrCreate()
    //Prepare the schema
    //department id, department name
    val structType: StructType = new StructType()
      .add("id", "string")
      .add("name", "string")

    //Read the data
    val csvDatas: DataFrame = spark.readStream.schema(structType).csv("D:\\Spark_04_17\\department_info")
    //Compute the required statistics
    import spark.implicits._
    //3.1 Count how many records there are for each department
    val deptCount: Dataset[Row] = csvDatas.selectExpr("name").groupBy("name").count().sort($"count".desc)

    //Output the data
    deptCount.writeStream
      .format("console")
      .outputMode("complete")
      .start()
      .awaitTermination()
  }
}
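
As an alternative to building the StructType field by field, the schema can be derived from a case class that mirrors the CSV layout. A minimal sketch, assuming a hypothetical Department case class with the same two columns:

package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType

//Hypothetical case class mirroring the department_info CSV layout
case class Department(id: String, name: String)

object Text03Alt {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text03Alt").getOrCreate()
    //Derive the schema from the case class instead of adding columns one by one
    val structType: StructType = Encoders.product[Department].schema
    val csvDatas: DataFrame = spark.readStream.schema(structType).csv("D:\\Spark_04_17\\department_info")
    csvDatas.printSchema() //id: string, name: string
  }
}
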
4. Use Spark SQL to read the CSV files written to the student_score folder.
package com.czxy.StructuredStreaming_0417

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Text04 {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    //2. Read the files
    //student id, name, sex, class id, entrance score
    val frame: DataFrame = spark.read.csv("D:\\Spark_04_17\\student_score").toDF("id", "name", "sex", "idStudent", "grade")

    //3. Register the DataFrame as a temporary view
    frame.createOrReplaceTempView("student")
    //4.1 Find the highest score in each class (cast grade to int so the comparison is numeric)
    spark.sql("select idStudent, max(cast(grade as int)) as max_grade from student group by idStudent").show()
    //4.2 Find the highest score among male students
    spark.sql("select sex, max(cast(grade as int)) as max_grade from student where sex = '男' group by sex").show()
    //4.3 Find the highest score among female students
    spark.sql("select sex, max(cast(grade as int)) as max_grade from student where sex = '女' group by sex").show()
    //4.4 Find the top three scores for male and female students respectively (the score column is named grade)
    spark.sql("select * from (select name, sex, grade, row_number() over(partition by sex order by cast(grade as int) desc) rank from student where sex = '男' or sex = '女') a where rank <= 3").show()
    //4.5 Count how many students scored above 500
    spark.sql("select count(grade) from student where grade > 500 ").show()
    //4.7 Among students scoring below 300, count how many are male and how many are female
    spark.sql("select sex,count(sex) from (select * from student where grade<300)  group by sex").show()
  }
}
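
Query 4.4 builds the per-sex top three with row_number() in SQL. The same ranking can be written with the DataFrame window API; a minimal sketch, continuing from the frame DataFrame defined in Text04 (the bySex name is illustrative):

    //Equivalent of query 4.4 using the DataFrame window API
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.{col, row_number}

    //Partition by sex and rank by the numeric grade, highest first
    val bySex = Window.partitionBy("sex").orderBy(col("grade").cast("int").desc)
    frame
      .withColumn("rank", row_number().over(bySex))
      .where(col("rank") <= 3)
      .select("name", "sex", "grade", "rank")
      .show()
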
5. Use Spark SQL to read the CSV files written to the class_info folder.
package com.czxy.StructuredStreaming_0417

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object Text05 {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    //2. Read the files

    val frame: DataFrame = spark.read.csv("D:\\Spark_04_17\\class_info")
    import spark.implicits._
    //Assign column names to the DataFrame
    //class id, class name, enrollment date, department name
    val personDF: DataFrame = frame.toDF("id", "name", "date", "classs")
    //Register the DataFrame as a temporary view
    personDF.createOrReplaceTempView("classI")
    //    5.1 Find the department with the most majors
    spark.sql("select classs, count(name) as majors from classI group by classs order by majors desc limit 1").show()
    //    5.2 Count how many majors there are in 计算机学院
    spark.sql("select count(name) from classI where classs ='计算机学院'").show()
    //    5.3 Count the number of 会计 and 工商管理 classes in 经济管理学院
    spark.sql("select subStr(name,0,2) as major, count(*) as classes from classI where classs = '经济管理学院' and (name like '会计%' or name like '工商管理%') group by subStr(name,0,2)").show()
    //    5.4 For each department, find the major with the most classes
    spark.sql("select max(name),classs from classI group by classs").show()
    //    5.5 List the names of all majors whose class id starts with 2
    spark.sql("select id, name from classI where id like '2%'").show()
  }
}
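
Query 5.1 can also be expressed with the DataFrame API, which makes the "count, sort, take the top one" intent explicit. A minimal sketch, continuing from the personDF DataFrame defined in Text05:

    //DataFrame equivalent of 5.1: the department with the most majors
    import org.apache.spark.sql.functions.{col, count}
    personDF
      .groupBy("classs")
      .agg(count("name").as("majors"))
      .orderBy(col("majors").desc)
      .limit(1)
      .show()
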

Spark SQL

Table 1: Student

Attribute  Data type    Nullable  Meaning
Sno        varchar(20)            student number
Sname      varchar(20)            student name
Ssex       varchar(20)            student sex
Sbirthday  datetime               student date of birth
Class      varchar(20)            student's class

Table 2: Course

Attribute  Data type    Nullable  Meaning
Cno        varchar(20)            course number
Cname      varchar(20)            course name
Tno        varchar(20)            teacher number

Table 3: Score

Attribute  Data type    Nullable  Meaning
Sno        varchar(20)            student number
Cno        varchar(20)            course number
Degree     varchar(20)            score

Table 4: Teacher

Attribute  Data type    Nullable  Meaning
Tno        varchar(20)            teacher number
Tname      varchar(20)            teacher name
Tsex       varchar(20)            teacher sex
Tbirthday  varchar(20)            teacher date of birth
Prof       varchar(20)            professional title
Depart     varchar(20)            teacher's department
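
For typed Dataset work the four tables can be mirrored as Scala case classes. A minimal sketch (field names follow the columns above; fields are kept as String for simplicity, although Sbirthday is declared datetime and could be mapped to java.sql.Timestamp instead):

//Hypothetical case classes mirroring the four tables above
case class Student(Sno: String, Sname: String, Ssex: String, Sbirthday: String, Class: String)
case class Course(Cno: String, Cname: String, Tno: String)
case class Score(Sno: String, Cno: String, Degree: String)
case class Teacher(Tno: String, Tname: String, Tsex: String, Tbirthday: String, Prof: String, Depart: String)
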

Sample data in the database
Table 1: Student

Sno Sname Ssex Sbirthday class
108 丘东 1977-09-01 95033
105 匡明 1975-10-02 95031
107 王丽 1976-01-23 95033
101 李军 1976-02-20 95033
109 王芳 1975-02-10 95031
103 陆君 1974-06-03 95031

Table 2: Course

Cno Cname Tno
3-105 计算机导论 825
3-245 操作系统 804
6-166 数字电路 856
9-888 高等数学 831

Table 3: Score

Sno Cno Degree
103 3-245 86
105 3-245 75
109 3-245 68
103 3-105 92
105 3-105 88
109 3-105 76
101 3-105 64
107 3-105 91
108 3-105 78
101 6-166 85
107 6-166 79
108 6-166 81

Table 4: Teacher

Tno Tname Tsex Tbirthday Prof Depart
804 李诚 1958-12-02 副教授 计算机系
856 张旭 1969-03-12 讲师 电子工程系
825 王萍 1972-05-05 助教 计算机系
831 刘冰 1977-08-14 助教 电子工程系

Code:

package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.{DataFrame, SparkSession}

object Text06 {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder.master("local[*]").appName("Text06").getOrCreate()
    //Connect to the database and load the four tables as temp views
    val jdbcDFStudent = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
      .option("dbtable", "student")
      .option("user", "root")
      .option("password", "root")
      .load()

    val frameStudent: DataFrame = jdbcDFStudent.toDF()
    frameStudent.createOrReplaceTempView("student")


    val jdbcDFCourse = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
      .option("dbtable", "course")
      .option("user", "root")
      .option("password", "root")
      .load()
    val frameCourse: DataFrame = jdbcDFCourse.toDF()
    frameCourse.createOrReplaceTempView("course")


    val jdbcDFScore = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
      .option("dbtable", "score")
      .option("user", "root")
      .option("password", "root")
      .load()
    val frameScore: DataFrame = jdbcDFScore.toDF()
    frameScore.createOrReplaceTempView("score")

    val jdbcDFTeacher = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
      .option("dbtable", "teacher")
      .option("user", "root")
      .option("password", "root")
      .load()
    val frameTeacher: DataFrame = jdbcDFTeacher.toDF()
    frameTeacher.createOrReplaceTempView("teacher")


    //6. Query the records of students in class '95031' or whose sex is '女'
    spark.sql("select * from student where Class = 95031 or Ssex = '女'").show()
    //7. Query all records in the Student table ordered by Class in descending order
    spark.sql("select * from student order by Class desc").show()
    //8. Query all records in the Score table ordered by Sno ascending and Degree descending
    spark.sql("select * from score order by Sno asc, Degree desc").show()
    //9. Query the students in class '95031'
    spark.sql("select * from student where Class = 95031").show()
    //10. Query the student number and course number of the highest score in the Score table (subquery or sort)
    spark.sql("SELECT Sno,Cno FROM score WHERE Degree=(SELECT MAX(Degree) FROM score)").show()
    //11. Query the average score of each course
    spark.sql("SELECT Cno,AVG(Degree) FROM score GROUP BY Cno").show()
    //12. Query the average score of the courses that start with '3' and are taken by at least 5 students
    spark.sql("SELECT Cno,AVG(Degree) FROM score GROUP BY Cno HAVING Cno LIKE '3-%' AND COUNT(*)>=5").show()
    //13. Query the Sno column for scores greater than 70 and less than 90
    spark.sql("SELECT Sno FROM score WHERE Degree>70 AND Degree<90").show()
    //14. Query the Sname, Cno and Degree columns for all students
    spark.sql("SELECT Sname,Cno,Degree FROM student JOIN Score ON student.Sno=Score.Sno").show()
    //15. Query the Sno, Cname and Degree columns for all students
    spark.sql("SELECT Sno,Cname,degree FROM Score JOIN Course ON Course.Cno=Score.Cno").show()
    //16. Query the Sname, Cname and Degree columns for all students
    spark.sql("SELECT student.Sname,Cname,degree FROM student JOIN Score ON student.Sno=Score.Sno JOIN Course ON Course.Cno=Score.Cno").show()
    //17. Query the average score of students in class '95033'
    spark.sql("SELECT AVG(Degree) FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Class='95033')").show()
    //18. Query the scores of all female students who take the course '计算机导论'
    spark.sql("SELECT Sno,Degree FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Ssex='女') AND Cno IN (SELECT Cno FROM Course WHERE Cname='计算机导论')").show()
    //19. Query the records of all students whose score in course '3-105' is higher than that of student '109'
    spark.sql("SELECT * FROM student,Score WHERE Score.Cno='3-105' AND student.Sno=Score.Sno AND Score.Degree>(SELECT Degree FROM Score WHERE Cno='3-105' AND Sno='109')").show()
    //20. Among students taking more than one course, query the score records that are not the highest score for that course
    spark.sql("SELECT * FROM Score a WHERE Degree <(SELECT MAX(degree) FROM Score b WHERE a.Cno=b.Cno) AND Sno IN(SELECT Sno FROM Score GROUP BY Sno HAVING COUNT(*)>1)").show()
    //21. Query all records whose score is higher than the score of student '109' in course '3-105'
    spark.sql("SELECT * FROM student,Score WHERE student.Sno=Score.Sno AND Score.Degree>(SELECT Degree FROM Score WHERE Cno='3-105' AND Sno='109')").show()
    //22. Query the Sno, Sname and Sbirthday of all students born in the same year as student '105'
    spark.sql("SELECT Sno,Sname,Sbirthday FROM student WHERE YEAR(student.Sbirthday)=(SELECT YEAR(Sbirthday) FROM student WHERE Sno='105')").show()
    //23. Query the scores of the students taught by teacher '张旭'
    spark.sql("SELECT Degree FROM Score,Teacher,Course WHERE Teacher.Tname='张旭' AND Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno").show()
    //24. Query the names of teachers whose courses are taken by more than 4 students
    spark.sql("SELECT Tname FROM Teacher WHERE Tno IN (SELECT Tno FROM Course WHERE Cno IN (SELECT Cno FROM Score GROUP BY Cno HAVING COUNT(*)>4) )").show()
    //25. Query the records of all students in classes '95033' and '95031'
    spark.sql("SELECT * FROM student WHERE Class='95033' OR Class='95031'").show()
    //26. Query the Cno of courses that have a score above 85
    spark.sql("SELECT DISTINCT cno FROM Score WHERE Degree>85").show()
    //27. Query the scores of the courses taught by teachers in the '计算机系' department
    spark.sql("SELECT sno,Cno ,Degree FROM Score WHERE Cno IN (SELECT Cno FROM Course WHERE Tno IN (SELECT tno FROM Teacher WHERE Depart='计算机系'))").show()
    //28. Query the Tname and Prof of teachers in '计算机系' and '电子工程系' whose titles do not appear in the other department
    spark.sql("SELECT Tname,Prof FROM Teacher a WHERE Prof NOT IN(SELECT Prof FROM Teacher b WHERE a.Depart!=b.Depart)").show()
    //29. Query the Cno, Sno and Degree of students whose score in course '3-105' is at least as high as their score in course '3-245', ordered by Degree from high to low
    spark.sql("SELECT Cno,Sno,Degree FROM Score a WHERE (SELECT Degree FROM Score b WHERE Cno='3-105' AND b.Sno=a.Sno)>=(SELECT Degree FROM Score c WHERE Cno='3-245' AND c.Sno=a.Sno) ORDER BY Degree DESC").show()
    //30. Query the Cno, Sno and Degree of students whose score in course '3-105' is higher than their score in course '3-245'
    spark.sql("SELECT Cno,Sno,Degree FROM Score a WHERE (SELECT Degree FROM Score b WHERE Cno='3-105' AND b.Sno=a.Sno)>(SELECT Degree FROM Score c WHERE Cno='3-245' AND c.Sno=a.Sno)").show()
    //31. Query the name, sex and birthday of all teachers and students
    spark.sql("SELECT Sname AS name,Ssex AS sex,Sbirthday AS birthday FROM student UNION SELECT Tname,Tsex,Tbirthday FROM teacher").show()
    //32. Query the name, sex and birthday of all female teachers and female students
    spark.sql("SELECT Sname AS name,Ssex AS sex,Sbirthday AS birthday FROM student WHERE Ssex='女' UNION SELECT Tname,Tsex,Tbirthday FROM teacher WHERE Tsex='女'").show()
    //33. Query the score records of students whose score is lower than the average score of that course
    spark.sql("SELECT Sno,Cno,Degree FROM Score a WHERE a.Degree<(SELECT AVG(Degree) FROM Score b WHERE a.Cno=b.Cno)").show()
    //34. Query the Tname and Depart of all teachers who teach a course
    spark.sql("SELECT Tname,Depart FROM Teacher WHERE Tname IN (SELECT DISTINCT Tname FROM Teacher,Course,Score WHERE Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno)").show()
    //35. Query the Tname and Depart of all teachers who do not teach any course
    spark.sql("SELECT Tname,Depart FROM Teacher WHERE Tname NOT IN (SELECT DISTINCT Tname FROM Teacher,Course,Score WHERE Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno)").show()
    //36. Query the class numbers of classes with at least 2 male students
    spark.sql("SELECT Class FROM student WHERE Ssex='男' GROUP BY Class HAVING COUNT(*)>1").show()
    //37. Query the records of students in the Student table whose surname is not '王'
    spark.sql("SELECT * FROM student WHERE Sname NOT LIKE ('王%')").show()
    //38. Query the name and age of each student in the Student table (the age is derived from Sbirthday; Spark SQL applies the functions directly and handles the numeric conversion)
    spark.sql("SELECT Sname, YEAR(CURRENT_DATE()) - YEAR(Sbirthday) AS age FROM student").show()
    //39. Query the maximum and minimum Sbirthday values in the Student table
    spark.sql("SELECT MAX(Sbirthday) AS max_birthday, MIN(Sbirthday) AS min_birthday FROM student").show()


    //40. Query all records in the Student table ordered by class number and age from largest to smallest (older students have earlier birthdays, hence Sbirthday ascending)
    spark.sql("SELECT * FROM student ORDER BY Class DESC,Sbirthday ASC").show()
    //41. Query the male teachers and the courses they teach
    spark.sql("SELECT Tname,Cname FROM Teacher,Course WHERE Tsex='男' AND Teacher.Tno=Course.Tno").show()
    //42. Query the Sno, Cno and Degree of the student(s) with the highest score
    spark.sql("SELECT Sno,Cno,Degree FROM Score WHERE degree=(SELECT MAX(Degree)FROM Score)").show()
    //43. Query the Sname of all students with the same sex as '李军'
    spark.sql("SELECT Sname FROM student WHERE Ssex=(SELECT Ssex FROM student WHERE Sname='李军') AND Sname NOT IN ('李军')").show()
    //44. Query the Sname of all students with the same sex and class as '李军'
    spark.sql("SELECT Sname FROM student WHERE Ssex=(SELECT Ssex FROM student WHERE Sname='李军') AND Sname NOT IN ('李军') AND Class=(SELECT Class FROM student WHERE Sname='李军')").show()
    //45. Query the scores of all male students who take the course '计算机导论'
    spark.sql("SELECT Sno,Degree FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Ssex='男') AND Cno IN (SELECT Cno FROM Course WHERE Cname='计算机导论')").show()
    //46. Query the Sname, Ssex and Class columns of all records in the Student table
    spark.sql("SELECT Sname,Ssex,Class FROM student").show()
    //47. Query all the distinct departments (Depart) of the teachers
    spark.sql("SELECT DISTINCT Depart FROM Teacher").show()
    //48. Query all records in the Student table
    spark.sql("SELECT * FROM student").show()
    //49. Query all records in the Score table with scores between 60 and 80
    spark.sql("SELECT * FROM Score WHERE Degree BETWEEN 60 AND 80").show()
    //50. Query all records in the Score table with scores of 85, 86 or 88
    spark.sql("SELECT * FROM score WHERE Degree IN (85,86,88)").show()
  }
}
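
The four JDBC reads in Text06 differ only in the table name, so the connection settings can be factored into a small helper that loads a table and registers it as a temp view. A minimal sketch, assuming the same local MySQL instance and credentials as above (the Text06Refactored and loadTable names are illustrative):

package com.czxy.StructuredStreaming_0417

import org.apache.spark.sql.{DataFrame, SparkSession}

object Text06Refactored {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("Text06Refactored").getOrCreate()

    //Read one MySQL table over JDBC and register it as a temp view under the same name
    def loadTable(name: String): DataFrame = {
      val df = spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
        .option("dbtable", name)
        .option("user", "root")
        .option("password", "root")
        .load()
      df.createOrReplaceTempView(name)
      df
    }

    Seq("student", "course", "score", "teacher").foreach(loadTable)

    //Any of the queries from Text06 can now run against the registered views
    spark.sql("SELECT * FROM student").show()
  }
}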



Reposted from blog.csdn.net/weixin_45737446/article/details/105627061