版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lihaogn/article/details/82180032
1 概述
1)产生背景
- every Spark application starts with loading data and ends with saving data.
- loading and saving data is not easy.
- datasets are stored in various formats/systems.
2)目标:easy loading/saving DataFrames
2 操作Parquet文件
package com.lihaogn.spark

import org.apache.spark.sql.SparkSession

/** Demonstrates reading a Parquet file and writing it back out as JSON
  * using the Spark SQL DataFrame API.
  */
object ParquetApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .appName("ParquetApp")
      .master("local[2]")
      .getOrCreate()

    // The sample file is referenced three times below; name it once.
    val parquetPath =
      "/Users/Mac/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet"

    // Load the data with the format stated explicitly.
    val userDF = spark.read.format("parquet").load(parquetPath)

    userDF.printSchema()
    userDF.show()

    userDF.select("name", "favorite_color").show()

    // Write the selected columns out as JSON.
    userDF.select("name", "favorite_color")
      .write.format("json")
      .save("/Users/Mac/testdata/jsonout")

    // Spark SQL's default read format is parquet, so format() can be omitted.
    spark.read.load(parquetPath).show()

    // Alternative loading style: pass the path via option() and call load() with no args.
    spark.read.format("parquet")
      .option("path", parquetPath)
      .load()
      .show()

    spark.stop()
  }
}
3 操作Hive表数据
在spark-shell中操作
spark.sql("select name,age from student").write.saveAsTable("student_1")
4 操作MySQL表数据
// Loading data from a JDBC source
// Approach 1: supply every connection property through the reader's
// options, here grouped into a single options(Map) call.
val jdbcDF = spark.read
  .format("jdbc")
  .options(Map(
    "url"      -> "jdbc:mysql://localhost:3306/hive",
    "dbtable"  -> "hive.TBLS",
    "user"     -> "root",
    "password" -> "rootroot"
  ))
  .load()
// Approach 2: pass the credentials in a java.util.Properties object to
// DataFrameReader.jdbc() instead of individual option() calls.
import java.util.Properties // missing from the original snippet; required for `new Properties()`

val connectionProperties = new Properties()
connectionProperties.put("user", "root")
// FIX: the original had "root" here, but the option()-based example above
// connects to the same database with password "rootroot"; the two examples
// must use matching credentials.
connectionProperties.put("password", "rootroot")
val jdbcDF2 = spark.read
  .jdbc("jdbc:mysql://localhost:3306/hive", "hive.TBLS", connectionProperties)
// Saving data to a JDBC source
// Persist jdbcDF through the option()-based JDBC sink.
// NOTE(review): the target is hive.TBLS — the same table the examples read
// from; without an explicit SaveMode, save() throws if the table already
// exists — confirm this is the intended destination.
jdbcDF.write
.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/hive")
.option("dbtable", "hive.TBLS")
.option("user", "root")
.option("password", "rootroot")
.save()
// Same write expressed through the jdbc() convenience method, taking the
// credentials from connectionProperties instead of individual option() calls.
jdbcDF2.write
.jdbc("jdbc:mysql://localhost:3306/hive", "hive.TBLS", connectionProperties)
// Specifying create table column data types on write
// createTableColumnTypes overrides the default DDL column types used when
// the JDBC sink creates the target table; it only affects the listed columns.
// NOTE(review): `comments` is not a column selected in the earlier examples —
// confirm it exists in jdbcDF's schema.
jdbcDF.write
.option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)")
.jdbc("jdbc:mysql://localhost:3306/hive", "hive.TBLS", connectionProperties)