1、使用Structured Streaming读取Socket数据,把单词和单词的反转组成 json 格式写入到当前目录中的file文件夹中
代码块:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.SparkContext
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
 * Exercise 1: reads text lines from a socket with Structured Streaming, splits them into
 * words and writes every word together with its reversed form as JSON files.
 */
object Text01 {

  /**
   * Builds the socket -> JSON streaming query and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text01").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // 2. Read raw lines from the socket source
    val frame: DataFrame = spark.readStream
      .format("socket")
      .option("host", "node01")
      .option("port", 9999)
      .load()

    import spark.implicits._
    val dataDS: Dataset[String] = frame.as[String]

    // 3. Split into words and pair each word with its reverse.
    // Consecutive/leading spaces and blank lines make split(" ") produce empty tokens;
    // drop them so no {"before":"","reverse":""} records are written.
    val wordDF = dataDS
      .flatMap(line => line.split(" ").filter(_.nonEmpty))
      .map(word => (word, word.reverse))
      .toDF("before", "reverse")

    // 4. Write the result as JSON files
    // NOTE(review): the exercise text asks for the "file" folder of the current directory,
    // while this path is an absolute Windows path — confirm which one is intended.
    wordDF.writeStream
      .format("json")
      .option("path", "D:\\Spark_04_17\\file")
      .option("checkpointLocation", "json") // a checkpoint directory must be set, otherwise the query fails
      .trigger(Trigger.ProcessingTime(0))
      .start()
      .awaitTermination()
  }
}
2、请使用Structured Streaming读取student_info文件夹中的csv文件,并完成以下统计:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType
/**
 * Exercise 2: streams the CSV files of the student_info folder and prints
 * (2.1) the number of students per sex and
 * (2.2) the number of students per sex whose surname is 王.
 */
object Text02 {

  /**
   * Starts both aggregations as console queries and blocks until one of them terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text02").getOrCreate()

    // Schema of the input: student id, name, sex, class id, enrolment date
    val structType: StructType = new StructType()
      .add("id", "string")
      .add("name", "string")
      .add("sex", "string")
      .add("idStudent", "string")
      .add("date", "string")

    // Streaming file sources need an explicit schema
    val csvDatas: DataFrame = spark.readStream.schema(structType).csv("D:\\Spark_04_17\\student_info\\")

    import spark.implicits._

    // 2.1 Number of male and female students
    val sexCounts: Dataset[Row] = csvDatas
      .selectExpr("sex")
      .groupBy("sex")
      .count()
      .sort($"count".desc)

    // 2.2 Number of male and female students with the surname 王.
    // The pattern is anchored at the start ('王%'): '%王%' would also count students
    // that merely have 王 somewhere in their given name.
    val wangCounts: Dataset[Row] = csvDatas
      .select("name", "sex")
      .where("name like '王%'")
      .groupBy("sex")
      .count()
      .sort($"count".desc)

    // Output: start BOTH queries (previously the 2.1 result was computed but never emitted),
    // then wait on the query manager instead of on a single query.
    sexCounts.writeStream
      .format("console")
      .outputMode("complete")
      .start()

    wangCounts.writeStream
      .format("console")
      .outputMode("complete")
      .start()

    spark.streams.awaitAnyTermination()
  }
}
3、请使用Structured Streaming读取department_info文件夹中的csv文件,并完成以下统计:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType
/**
 * Exercise 3: streams the CSV files of the department_info folder and prints how many
 * records exist for each department name.
 */
object Text03 {

  /**
   * Builds the per-department count as a console query and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("Text03").getOrCreate()

    // Schema of the input: department id, department name
    val structType: StructType = new StructType()
      .add("id", "string")
      .add("name", "string")

    // Read the stream. The path previously contained a doubled backslash after the drive
    // letter ("D:\\\\Spark_04_17"); it now matches the form used by the other exercises.
    val csvDatas: DataFrame = spark.readStream.schema(structType).csv("D:\\Spark_04_17\\department_info")

    import spark.implicits._

    // 3.1 Number of records per department, largest first
    val perDepartment: Dataset[Row] = csvDatas
      .selectExpr("name")
      .groupBy("name")
      .count()
      .sort($"count".desc)

    // Output
    perDepartment.writeStream
      .format("console")
      .outputMode("complete")
      .start()
      .awaitTermination()
  }
}
4、请使用Spark SQL读取student_score文件夹中的csv文件,并完成以下统计:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
 * Exercise 4: loads the CSV files of the student_score folder with Spark SQL and runs
 * several score statistics against the temp view `student`.
 */
object Text04 {

  /**
   * Loads the data, registers the `student` view and prints the result of each query.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // 2. Read the files
    // Columns: student id, name, sex, class id, entrance score
    val frame: DataFrame = spark.read.csv("D:\\Spark_04_17\\student_score").toDF("id", "name", "sex", "idStudent", "grade")

    import spark.implicits._

    // CSV columns read without a schema are strings; cast the score to a number so that
    // max() and "order by grade" compare numerically instead of lexicographically
    // (as strings, "99" would sort above "500").
    val personDF: DataFrame = frame.withColumn("grade", $"grade".cast("double"))

    personDF.createOrReplaceTempView("student")

    // 4.1 Highest score of every class
    spark.sql("select idStudent,max(grade) from student group by idStudent ").show()
    // 4.2 Highest score among male students
    spark.sql("select sex,max(grade) from student where sex = '男' group by sex").show()
    // 4.3 Highest score among female students
    spark.sql("select sex,max(grade) from student where sex = '女' group by sex").show()
    // 4.4 Top three scores for male and for female students.
    // The original query referenced a non-existent column `score`; the column is `grade`.
    spark.sql("select * from (select name,sex,grade,row_number() over(partition by sex order by grade desc) as rn from student where sex ='男' or sex='女') a where rn <=3").show()
    // 4.5 Number of students scoring above 500
    spark.sql("select count(grade) from student where grade > 500 ").show()
    // 4.7 Male/female breakdown of the students scoring below 300
    spark.sql("select sex,count(sex) from (select * from student where grade<300) group by sex").show()
  }
}
5、请使用Spark SQL读取class_info文件夹中的csv文件,并完成以下统计:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Exercise 5: loads the CSV files of the class_info folder with Spark SQL and runs
 * several statistics against the temp view `classI`.
 */
object Text05 {

  /**
   * Loads the data, registers the `classI` view and prints the result of each query.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // 2. Read the files
    val frame: DataFrame = spark.read.csv("D:\\Spark_04_17\\class_info")

    import spark.implicits._

    // Name the columns: class id, class name, enrolment date, department (Chinese name)
    val personDF: DataFrame = frame.toDF("id", "name", "date", "classs")

    personDF.createOrReplaceTempView("classI")

    // 5.1 Department with the most majors.
    // max(name) only returned the alphabetically last name per department; count the
    // entries per department (same counting rule as 5.2) and keep the largest.
    spark.sql("select classs,count(name) as cnt from classI group by classs order by cnt desc limit 1").show()
    // 5.2 Number of majors in 计算机学院
    spark.sql("select count(name) from classI where classs ='计算机学院'").show()
    // 5.3 Number of 会计 and 工商管理 classes in 经济管理学院.
    // The OR branch is parenthesised: AND binds tighter than OR, so without the
    // parentheses 工商管理 classes of every department were counted.
    spark.sql("select subStr(name,0,2),count(*) from classI where classs ='经济管理学院' and (name like '会计%' or name like '工商管理%') group by subStr(name,0,2)").show()
    // 5.4 For every department, the major with the most classes.
    // NOTE(review): max(name) yields the alphabetically last name per department, not the
    // major with the most classes. Deriving the major from `name` depends on the naming
    // format of the data, which is not visible here — confirm and rewrite.
    spark.sql("select max(name),classs from classI group by classs").show()
    // 5.5 All majors whose class id starts with 2.
    // '_2%' tested the SECOND character; '2%' tests the first one.
    spark.sql("select id,name from classI where id like '2%'").show()
  }
}
Spark SQL
表(一)Student (学生表)
属性名 | 数据类型 | 可否为空 | 含 义 |
---|---|---|---|
Sno | varchar (20) | 否 | 学号 |
Sname | varchar (20) | 否 | 学生姓名 |
Ssex | varchar (20) | 否 | 学生性别 |
Sbirthday | datetime | 可 | 学生出生年月 |
Class | varchar (20) | 可 | 学生所在班级 |
表(二)Course(课程表)
属性名 | 数据类型 | 可否为空 | 含 义 |
---|---|---|---|
Cno | varchar (20) | 否 | 课程号 |
Cname | varchar (20) | 否 | 课程名称 |
Tno | varchar (20) | 否 | 教工编号 |
表(三)Score(成绩表)
属性名 | 数据类型 | 可否为空 | 含 义 |
---|---|---|---|
Sno | varchar (20) | 否 | 学号 |
Cno | varchar (20) | 否 | 课程号 |
Degree | varchar (20) | 可 | 成绩 |
表(四)Teacher(教师表)
属性名 | 数据类型 | 可否为空 | 含 义 |
---|---|---|---|
Tno | varchar (20) | 否 | 教工编号 |
Tname | varchar (20) | 否 | 教工姓名 |
Tsex | varchar (20) | 否 | 教工性别 |
Tbirthday | varchar (20) | 可 | 教工出生年月 |
Prof | varchar (20) | 可 | 职称 |
Depart | varchar (20) | 否 | 教工所在部门 |
表1-2数据库中的数据
表(一)Student
Sno | Sname | Ssex | Sbirthday | class |
---|---|---|---|---|
108 | 丘东 | 男 | 1977-09-01 | 95033 |
105 | 匡明 | 男 | 1975-10-02 | 95031 |
107 | 王丽 | 女 | 1976-01-23 | 95033 |
101 | 李军 | 男 | 1976-02-20 | 95033 |
109 | 王芳 | 女 | 1975-02-10 | 95031 |
103 | 陆君 | 男 | 1974-06-03 | 95031 |
表(二)Course
Cno | Cname | Tno |
---|---|---|
3-105 | 计算机导论 | 825 |
3-245 | 操作系统 | 804 |
6-166 | 数字电路 | 856 |
9-888 | 高等数学 | 831 |
表(三)Score
Sno | Cno | Degree |
---|---|---|
103 | 3-245 | 86 |
105 | 3-245 | 75 |
109 | 3-245 | 68 |
103 | 3-105 | 92 |
105 | 3-105 | 88 |
109 | 3-105 | 76 |
101 | 3-105 | 64 |
107 | 3-105 | 91 |
108 | 3-105 | 78 |
101 | 6-166 | 85 |
107 | 6-166 | 79 |
108 | 6-166 | 81 |
表(四)Teacher
Tno | Tname | Tsex | Tbirthday | Prof | Depart |
---|---|---|---|---|---|
804 | 李诚 | 男 | 1958-12-02 | 副教授 | 计算机系 |
856 | 张旭 | 男 | 1969-03-12 | 讲师 | 电子工程系 |
825 | 王萍 | 女 | 1972-05-05 | 助教 | 计算机系 |
831 | 刘冰 | 女 | 1977-08-14 | 助教 | 电子工程系 |
代码:
package com.czxy.StructuredStreaming_0417
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Exercises 6-50: loads the student / course / score / teacher tables from MySQL over JDBC,
 * registers each as a temp view and prints the result of every exercise query.
 */
object Text06 {

  /** JDBC connection string shared by all four tables. */
  private val JdbcUrl = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"

  /**
   * Reads one MySQL table over JDBC and registers it as a temp view with the same name.
   *
   * @param spark active session
   * @param table name of the MySQL table, also used as the view name
   */
  private def registerTable(spark: SparkSession, table: String): Unit = {
    spark.read.format("jdbc")
      .option("url", JdbcUrl)
      .option("dbtable", table)
      .option("user", "root")
      .option("password", "root")
      .load()
      .createOrReplaceTempView(table)
  }

  /**
   * Registers the four views and runs the exercise queries in order.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("Text06").getOrCreate()

    // Connect to the database and expose every table as a temp view
    Seq("student", "course", "score", "teacher").foreach(registerTable(spark, _))

    // NOTE(review): the table description declares Degree as varchar; if it really arrives as
    // a string, MAX(Degree) and ORDER BY Degree compare lexicographically — confirm the type.

    // 6. Students of class 95031 OR with sex 女 (the original used AND).
    spark.sql("select * from student where Class = 95031 or Ssex = '女'").show()
    // 7. All Student records ordered by Class descending.
    spark.sql("select * from student order by Class desc").show()
    // 8. All Score records ordered by Sno ascending, Degree descending
    //    (the original queried the student table and ignored Degree).
    spark.sql("select * from score order by Sno asc, Degree desc").show()
    // 9. Students of class 95031.
    spark.sql("select * from student where Class = 95031").show()
    // 10. Sno and Cno of the highest score in Score (subquery or sorting).
    spark.sql("SELECT Sno,Cno FROM score WHERE Degree=(SELECT MAX(Degree) FROM score)").show()
    // 11. Average score of every course.
    spark.sql("SELECT Cno,AVG(Degree) FROM score GROUP BY Cno").show()
    // 12. Average score of courses starting with 3 that at least 5 students take
    //     ("at least 5" is >= 5; the original used > 5).
    spark.sql("SELECT Cno,AVG(Degree) FROM score GROUP BY Cno HAVING Cno LIKE '3-%' AND COUNT(*)>=5").show()
    // 13. Sno column of scores greater than 70 and less than 90
    //     (the original selected Cno and used the inclusive BETWEEN).
    spark.sql("SELECT Sno FROM score WHERE Degree > 70 AND Degree < 90").show()
    // 14. Sname, Cno and Degree of all students.
    spark.sql("SELECT Sname,Cno,Degree FROM student JOIN Score ON student.Sno=Score.Sno").show()
    // 15. Sno, Cname and Degree of all students.
    spark.sql("SELECT Sno,Cname,degree FROM Score JOIN Course ON Course.Cno=Score.Cno").show()
    // 16. Sname, Cname and Degree of all students.
    spark.sql("SELECT student.Sname,Cname,degree FROM student JOIN Score ON student.Sno=Score.Sno JOIN Course ON Course.Cno=Score.Cno").show()
    // 17. Average score of class 95033.
    spark.sql("SELECT AVG(Degree) FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Class='95033')").show()
    // 18. Scores of all female students taking 计算机导论.
    spark.sql("SELECT Sno,Degree FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Ssex='女') AND Cno IN (SELECT Cno FROM Course WHERE Cname='计算机导论')").show()
    // 19. Students taking course 3-105 who scored higher than student 109 in it.
    spark.sql("SELECT * FROM student,Score WHERE Score.Cno='3-105' AND student.Sno=Score.Sno AND Score.Degree>(SELECT Degree FROM Score WHERE Cno='3-105' AND Sno='109')").show()
    // 20. Non-top scores of students who take more than one course.
    spark.sql("SELECT * FROM Score a WHERE Degree <(SELECT MAX(degree) FROM Score b WHERE a.Cno=b.Cno) AND Sno IN(SELECT Sno FROM Score GROUP BY Sno HAVING COUNT(*)>1)").show()
    // 21. All records scoring higher than student 109's score in course 3-105.
    spark.sql("SELECT * FROM student,Score WHERE student.Sno=Score.Sno AND Score.Degree>(SELECT Degree FROM Score WHERE Cno='3-105' AND Sno='109')").show()
    // 22. Sno, Sname and Sbirthday of students born in the same year as student 105.
    spark.sql("SELECT Sno,Sname,Sbirthday FROM student WHERE YEAR(student.Sbirthday)=(SELECT YEAR(Sbirthday) FROM student WHERE Sno='105')").show()
    // 23. Scores of the students taught by teacher 张旭.
    spark.sql("SELECT Degree FROM Score,Teacher,Course WHERE Teacher.Tname='张旭' AND Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno").show()
    // 24. Names of teachers with a course taken by more than 4 students.
    spark.sql("SELECT Tname FROM Teacher WHERE Tno IN (SELECT Tno FROM Course WHERE Cno IN (SELECT Cno FROM Score GROUP BY Cno HAVING COUNT(*)>4) )").show()
    // 25. All students of classes 95033 and 95031.
    spark.sql("SELECT * FROM student WHERE Class='95033' OR Class='95031'").show()
    // 26. Cno of courses having a score above 85.
    spark.sql("SELECT DISTINCT cno FROM Score WHERE Degree>85").show()
    // 27. Scores of the courses taught by teachers of 计算机系.
    spark.sql("SELECT sno,Cno ,Degree FROM Score WHERE Cno IN (SELECT Cno FROM Course WHERE Tno IN (SELECT tno FROM Teacher WHERE Depart='计算机系'))").show()
    // 28. Tname and Prof of teachers of 计算机系 and 电子工程系 whose title does not occur in the other department.
    spark.sql("SELECT Tname,Prof FROM Teacher a WHERE Prof NOT IN(SELECT Prof FROM Teacher b WHERE a.Depart!=b.Depart)").show()
    // 29. Cno, Sno and Degree of students whose 3-105 score is at least their 3-245 score,
    //     ordered by Degree descending. Spark requires correlated scalar subqueries to be
    //     aggregated, so each lookup is wrapped in MAX (at most one row per Sno/Cno is expected).
    spark.sql("SELECT Cno,Sno,Degree FROM Score a WHERE (SELECT MAX(Degree) FROM Score b WHERE Cno='3-105' AND b.Sno=a.Sno)>=(SELECT MAX(Degree) FROM Score c WHERE Cno='3-245' AND c.Sno=a.Sno) ORDER BY Degree DESC").show()
    // 30. Cno, Sno and Degree of students whose 3-105 score is higher than their 3-245 score
    //     (same MAX wrapping as in 29).
    spark.sql("SELECT Cno,Sno,Degree FROM Score a WHERE (SELECT MAX(Degree) FROM Score b WHERE Cno='3-105' AND b.Sno=a.Sno)>(SELECT MAX(Degree) FROM Score c WHERE Cno='3-245' AND c.Sno=a.Sno)").show()
    // 31. name, sex and birthday of all teachers AND students (the original only read student).
    //     TO_DATE gives both sides of the UNION the same column type.
    spark.sql("SELECT Sname AS name,Ssex AS sex,TO_DATE(Sbirthday) AS birthday FROM student UNION SELECT Tname AS name,Tsex AS sex,TO_DATE(Tbirthday) AS birthday FROM Teacher").show()
    // 32. name, sex and birthday of all female teachers AND female students.
    spark.sql("SELECT Sname AS name,Ssex AS sex,TO_DATE(Sbirthday) AS birthday FROM student WHERE Ssex='女' UNION SELECT Tname AS name,Tsex AS sex,TO_DATE(Tbirthday) AS birthday FROM Teacher WHERE Tsex='女'").show()
    // 33. Scores that are below the average score of their course.
    spark.sql("SELECT Sno,Cno,Degree FROM Score a WHERE a.Degree<(SELECT AVG(Degree) FROM Score b WHERE a.Cno=b.Cno)").show()
    // 34. Tname and Depart of all teachers who teach a course.
    spark.sql("SELECT Tname,Depart FROM Teacher WHERE Tname IN (SELECT DISTINCT Tname FROM Teacher,Course,Score WHERE Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno)").show()
    // 35. Tname and Depart of all teachers who do not teach.
    spark.sql("SELECT Tname,Depart FROM Teacher WHERE Tname NOT IN (SELECT DISTINCT Tname FROM Teacher,Course,Score WHERE Teacher.Tno=Course.Tno AND Course.Cno=Score.Cno)").show()
    // 36. Classes with at least 2 male students.
    spark.sql("SELECT Class FROM student WHERE Ssex='男' GROUP BY Class HAVING COUNT(*)>1").show()
    // 37. Students whose surname is not 王.
    spark.sql("SELECT * FROM student WHERE Sname NOT LIKE ('王%')").show()
    // 38. Name and age of every student (the original executed an empty SQL string,
    //     which fails to parse). Age is computed as the difference of calendar years.
    spark.sql("SELECT Sname,YEAR(CURRENT_DATE())-YEAR(Sbirthday) AS age FROM student").show()
    // 39. Largest and smallest Sbirthday value in Student.
    spark.sql("SELECT MAX(Sbirthday),MIN(Sbirthday) FROM student").show()
    // 40. All Student records ordered by class and by age, both from largest to smallest
    //     (oldest first = earliest birthday first).
    spark.sql("SELECT * FROM student ORDER BY Class DESC,Sbirthday ASC").show()
    // 41. Male teachers and the courses they teach.
    spark.sql("SELECT Tname,Cname FROM Teacher,Course WHERE Tsex='男' AND Teacher.Tno=Course.Tno").show()
    // 42. Sno, Cno and Degree of the highest score.
    spark.sql("SELECT Sno,Cno,Degree FROM Score WHERE degree=(SELECT MAX(Degree)FROM Score)").show()
    // 43. Sname of all students with the same sex as 李军.
    spark.sql("SELECT Sname FROM student WHERE Ssex=(SELECT Ssex FROM student WHERE Sname='李军') AND Sname NOT IN ('李军')").show()
    // 44. Sname of students with the same sex and class as 李军.
    spark.sql("SELECT Sname FROM student WHERE Ssex=(SELECT Ssex FROM student WHERE Sname='李军') AND Sname NOT IN ('李军') AND Class=(SELECT Class FROM student WHERE Sname='李军')").show()
    // 45. Scores of all male students taking 计算机导论.
    spark.sql("SELECT Sno,Degree FROM Score WHERE Sno IN (SELECT Sno FROM student WHERE Ssex='男') AND Cno IN (SELECT Cno FROM Course WHERE Cname='计算机导论')").show()
    // 46. Sname, Ssex and Class of all Student records (the table name was misspelled "studen").
    spark.sql("SELECT Sname,Ssex,Class FROM student").show()
    // 47. Distinct departments of the teachers.
    spark.sql("SELECT DISTINCT Depart FROM Teacher").show()
    // 48. All Student records.
    spark.sql("SELECT * FROM student").show()
    // 49. Score records with a degree between 60 and 80.
    spark.sql("SELECT * FROM Score WHERE Degree BETWEEN 60 AND 80").show()
    // 50. Score records with a degree of 85, 86 or 88.
    spark.sql("SELECT * FROM score WHERE Degree IN (85,86,88)").show()
  }
}