Common Spark SQL Operations

A DataFrame in Spark SQL is similar to a table in a relational database. The query operations you would run against tables in a relational database can all be implemented on a DataFrame by calling the DataFrame API that Spark provides for Scala.
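
Because a DataFrame maps directly onto a table, the same query can usually be written either with the DataFrame API or as plain SQL against a temporary view. A minimal sketch (the SparkSession spark, the DataFrame df, and its id/name/age columns are illustrative assumptions here, not objects from the program below):

import org.apache.spark.sql.functions.col

// DataFrame API: filter rows and project columns
df.filter(col("age") > 20).select("id", "name").show()

// The equivalent SQL against a temporary view
df.createOrReplaceTempView("df_view")
spark.sql("SELECT id, name FROM df_view WHERE age > 20").show()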

1. Database Data Preparation

-- ----------------------------
-- Table structure for persion
-- ----------------------------
DROP TABLE IF EXISTS `persion`;
CREATE TABLE `persion` (
  `id` int(11) NOT NULL,
  `name` text,
  `age` int(11) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of persion
-- ----------------------------
INSERT INTO `persion` VALUES ('2', 'lisi', '29');
INSERT INTO `persion` VALUES ('3', 'wangwu', '25');
INSERT INTO `persion` VALUES ('1', 'zhangsan', '20');
INSERT INTO `persion` VALUES ('6', 'kobe', '40');
INSERT INTO `persion` VALUES ('5', 'tianqi', '35');
INSERT INTO `persion` VALUES ('4', 'zhaoliu', '30');

-- ----------------------------
-- Table structure for student
-- ----------------------------
DROP TABLE IF EXISTS `student`;
CREATE TABLE `student` (
  `id` int(20) NOT NULL,
  `name` varchar(50) DEFAULT NULL,
  `height` varchar(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of student
-- ----------------------------
INSERT INTO `student` VALUES ('1', 'zhangsan', '20');
INSERT INTO `student` VALUES ('2', 'lisi', '12');
INSERT INTO `student` VALUES ('3', 'wangwu', '123');
INSERT INTO `student` VALUES ('4', 'zhaoliu', '34');
INSERT INTO `student` VALUES ('5', 'tianqi', '100');

-- ----------------------------
-- Table structure for student_class_cj
-- ----------------------------
DROP TABLE IF EXISTS `student_class_cj`;
CREATE TABLE `student_class_cj` (
  `id` int(10) NOT NULL,
  `name` varchar(20) DEFAULT NULL,
  `class_no` varchar(10) DEFAULT NULL,
  `class_cj` varchar(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of student_class_cj
-- ----------------------------
INSERT INTO `student_class_cj` VALUES ('1', 'tianqi', '01', '120');
INSERT INTO `student_class_cj` VALUES ('2', 'tianqi', '02', '99');
INSERT INTO `student_class_cj` VALUES ('3', 'zhaoliu', '01', '60');
INSERT INTO `student_class_cj` VALUES ('10', 'lisi', '04', '77');
INSERT INTO `student_class_cj` VALUES ('5', 'lisi', '02', '100');
INSERT INTO `student_class_cj` VALUES ('6', 'lisi', '03', '52');
INSERT INTO `student_class_cj` VALUES ('7', 'tianqi', '03', '150');
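
Note that class_cj is declared as varchar even though it holds numeric scores; as the output later shows, Spark SQL implicitly casts the strings to double when avg is applied to that column.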

2. Hands-On Example

**Main operations:** query, filter, join, aggregation
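
The full program below exercises all four. As a quick warm-up, a filter can also be written with column expressions instead of the string form where("...") used later; a sketch against the mysqlDF (persion) DataFrame loaded below:

import org.apache.spark.sql.functions.col

// Column-expression form of a filter; equivalent to where("age > 25")
mysqlDF.filter(col("age") > 25).select("id", "name").show()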

package com.little_devil.mysql

import java.util.Properties
import org.apache.spark.SparkContext
import org.apache.spark.sql._

object loadDataFromMysql {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().appName("loadDataFromMysql").master("local[2]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read data from the MySQL tables
    // JDBC connection URL for the database
    val url = "jdbc:mysql://192.168.52.140:3306/spark"
    // Connection properties (user and password)
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "root")

    val mysqlDF: DataFrame = spark.read.jdbc(url, "persion", properties)
    val mysqlDF2: DataFrame = spark.read.jdbc(url, "student", properties)
    val mysqlDF3: DataFrame = spark.read.jdbc(url, "student_class_cj", properties)

    // Print the three source tables
    mysqlDF.show()
    mysqlDF2.show()
    mysqlDF3.show()

    // Left join persion with student on the name column
    val join1: DataFrame = mysqlDF.join(mysqlDF2, mysqlDF("name") === mysqlDF2("name"), "left")
    join1.show()
    join1.limit(3).show()
    // Project the desired columns from both sides of the join
    val view1: DataFrame = join1.select(mysqlDF("id"), mysqlDF("name"), mysqlDF("age"), mysqlDF2("height"))
    view1.show()
    // Left join with the score table, then sort by id in descending order
    val view2: DataFrame = view1.join(mysqlDF3, view1("name") === mysqlDF3("name"), "left").select(view1("*"),
      mysqlDF3("class_no"), mysqlDF3("class_cj")).orderBy(view1("id").desc)
    view2.show()
    // Global aggregation: the maximum id over the whole DataFrame
    val view3: DataFrame = view2.agg("id" -> "max")
    view3.show()
    // Group by name and aggregate: max class_no, average class_cj (tuple form)
    val view4: DataFrame = view2.where("class_cj is not null and class_no is not null").groupBy("name").agg("class_no" -> "max", "class_cj" -> "avg")
    view4.show()
    // The same aggregation expressed with a Map of column -> function
    val view5: DataFrame = view2.where("class_cj is not null and class_no is not null").groupBy("name").agg(Map("class_no" -> "max", "class_cj" -> "avg"))
    view5.show()

    spark.stop()

  }
}
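
Reading from MySQL over JDBC requires the MySQL connector on the classpath in addition to spark-sql. A sketch of the sbt dependencies (the version numbers are assumptions; match them to your Spark and MySQL versions):

// build.sbt -- versions are illustrative
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.3.2",
  "mysql" % "mysql-connector-java" % "5.1.47"
)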

Execution output:

18/11/22 18:37:19 INFO SharedState: Warehouse path is 'file:/D:/idea/spark-to-mysql/spark-warehouse'.
+---+--------+---+
| id|    name|age|
+---+--------+---+
|  2|    lisi| 29|
|  3|  wangwu| 25|
|  1|zhangsan| 20|
|  6|    kobe| 40|
|  5|  tianqi| 35|
|  4| zhaoliu| 30|
+---+--------+---+

+---+--------+------+
| id|    name|height|
+---+--------+------+
|  1|zhangsan|    20|
|  2|    lisi|    12|
|  3|  wangwu|   123|
|  4| zhaoliu|    34|
|  5|  tianqi|   100|
+---+--------+------+

+---+-------+--------+--------+
| id|   name|class_no|class_cj|
+---+-------+--------+--------+
|  1| tianqi|      01|     120|
|  2| tianqi|      02|      99|
|  3|zhaoliu|      01|      60|
| 10|   lisi|      04|      77|
|  5|   lisi|      02|     100|
|  6|   lisi|      03|      52|
|  7| tianqi|      03|     150|
+---+-------+--------+--------+

+---+--------+---+----+--------+------+
| id|    name|age|  id|    name|height|
+---+--------+---+----+--------+------+
|  3|  wangwu| 25|   3|  wangwu|   123|
|  4| zhaoliu| 30|   4| zhaoliu|    34|
|  6|    kobe| 40|null|    null|  null|
|  1|zhangsan| 20|   1|zhangsan|    20|
|  2|    lisi| 29|   2|    lisi|    12|
|  5|  tianqi| 35|   5|  tianqi|   100|
+---+--------+---+----+--------+------+
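
Because this is a left join, every row of persion is kept; kobe has no matching row in student, so the student-side columns come back as null.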

+---+--------+---+---+--------+------+
| id|    name|age| id|    name|height|
+---+--------+---+---+--------+------+
|  3|  wangwu| 25|  3|  wangwu|   123|
|  1|zhangsan| 20|  1|zhangsan|    20|
|  2|    lisi| 29|  2|    lisi|    12|
+---+--------+---+---+--------+------+

+---+--------+---+------+
| id|    name|age|height|
+---+--------+---+------+
|  3|  wangwu| 25|   123|
|  4| zhaoliu| 30|    34|
|  6|    kobe| 40|  null|
|  1|zhangsan| 20|    20|
|  2|    lisi| 29|    12|
|  5|  tianqi| 35|   100|
+---+--------+---+------+

+---+--------+---+------+--------+--------+
| id|    name|age|height|class_no|class_cj|
+---+--------+---+------+--------+--------+
|  6|    kobe| 40|  null|    null|    null|
|  5|  tianqi| 35|   100|      02|      99|
|  5|  tianqi| 35|   100|      03|     150|
|  5|  tianqi| 35|   100|      01|     120|
|  4| zhaoliu| 30|    34|      01|      60|
|  3|  wangwu| 25|   123|    null|    null|
|  2|    lisi| 29|    12|      04|      77|
|  2|    lisi| 29|    12|      02|     100|
|  2|    lisi| 29|    12|      03|      52|
|  1|zhangsan| 20|    20|    null|    null|
+---+--------+---+------+--------+--------+
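
The join with student_class_cj is one-to-many, so students with several score records (tianqi, lisi) appear once per record, while students with no score records (kobe, wangwu, zhangsan) keep nulls in the score columns.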

+-------+
|max(id)|
+-------+
|      6|
+-------+

+-------+-------------+-----------------+
|   name|max(class_no)|    avg(class_cj)|
+-------+-------------+-----------------+
|zhaoliu|           01|             60.0|
|   lisi|           04|76.33333333333333|
| tianqi|           03|            123.0|
+-------+-------------+-----------------+

+-------+-------------+-----------------+
|   name|max(class_no)|    avg(class_cj)|
+-------+-------------+-----------------+
|zhaoliu|           01|             60.0|
|   lisi|           04|76.33333333333333|
| tianqi|           03|            123.0|
+-------+-------------+-----------------+


Process finished with exit code 0
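
view4 and view5 print identical tables: agg("col" -> "fn", ...) and agg(Map("col" -> "fn")) are two overloads of the same aggregation API (the Map form can apply only one function per column, since column names are the keys). The same result can also be written with the typed functions from org.apache.spark.sql.functions, which additionally lets you rename the result columns; a sketch:

import org.apache.spark.sql.functions.{avg, max}

view2.where("class_cj is not null and class_no is not null")
  .groupBy("name")
  .agg(max("class_no").as("max_class_no"), avg("class_cj").as("avg_class_cj"))
  .show()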
