Spark SQL外部数据源综合使用(Hive和MySQL进行Join)

前置文章:
Spark SQL External Data Source 产生背景 & 概述 & 目标 & 使用
Spark SQL整合Hive使用

先在MySQL中创建数据库、表:

-- Create a dedicated database for this example and switch to it.
mysql> create database spark;
mysql> use spark;

-- Department table; DEPTNO is the primary key.
mysql> CREATE TABLE DEPT(
DEPTNO int(2) PRIMARY KEY,
DNAME VARCHAR(14) ,
LOC VARCHAR(13) ) ;

-- Insert the four department rows.
mysql> INSERT INTO DEPT VALUES(10,'ACCOUNTING','NEW YORK');
mysql> INSERT INTO DEPT VALUES(20,'RESEARCH','DALLAS');
mysql> INSERT INTO DEPT VALUES(30,'SALES','CHICAGO');
mysql> INSERT INTO DEPT VALUES(40,'OPERATIONS','BOSTON');

代码:

/**
 * Queries a Hive table and a MySQL table together through Spark SQL
 * external data sources.
 *
 * Reads the Hive table `emp` and the MySQL table `spark.DEPT` (over JDBC),
 * joins them on the department number and prints the results.
 */
object HiveMySQLApp {

  /**
   * Application entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // enableHiveSupport() connects the session to the Hive metastore. Without it
    // a standalone application uses the default in-memory catalog and
    // spark.table("emp") cannot resolve the Hive table.
    // NOTE(review): requires the spark-hive module on the classpath — confirm build setup.
    val spark = SparkSession.builder().appName("HiveMySQLApp")
      .master("local[2]")
      .enableHiveSupport()
      .getOrCreate()

    // Stop the session even if loading, joining or showing the data fails.
    try {
      // Load the Hive table.
      val hiveDF = spark.table("emp")

      // Load the MySQL table via the JDBC data source.
      val mysqlDF = spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306")
        .option("dbtable", "spark.DEPT")
        .option("user", "root")
        .option("password", "root")
        .option("driver", "com.mysql.jdbc.Driver")
        .load()

      // Inner join on the department number.
      val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO"))
      resultDF.show()

      // Project a subset of columns. The MySQL columns are referenced with the
      // same upper-case names as in the join condition, so the lookup does not
      // depend on case-insensitive column resolution.
      resultDF.select(hiveDF.col("empno"), hiveDF.col("ename"),
        mysqlDF.col("DEPTNO"), mysqlDF.col("DNAME")).show()
    } finally {
      spark.stop()
    }
  }

}

猜你喜欢

转载自blog.csdn.net/lemonZhaoTao/article/details/80593338