Load & save functions
- MySQL integration (add the MySQL driver jar)
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://CentOS:3306/test")
  .option("dbtable", "t_user")
  .option("user", "root")
  .option("password", "root")
  .load()
  .createTempView("t_user")
spark.sql("select * from t_user").show()
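This JDBC read goes over a single connection. For a large table, Spark can split the scan across several connections; a minimal sketch, assuming t_user has a numeric id column (the bound values here are illustrative):
spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://CentOS:3306/test")
  .option("dbtable", "t_user")
  .option("user", "root")
  .option("password", "root")
  .option("partitionColumn", "id") // numeric column the reads are split on
  .option("lowerBound", "1")       // bounds only control the stride, they are not a filter
  .option("upperBound", "1000")
  .option("numPartitions", "4")    // at most four concurrent JDBC connections
  .load()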
- Reading CSV data
package demojar

import org.apache.spark.sql.SparkSession

object Testfit {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CSV").master("local[2]").getOrCreate()
    // Raise the log level before running queries, otherwise it has no effect on them
    spark.sparkContext.setLogLevel("FATAL")
    import spark.implicits._

    // Equivalent alternative: spark.read.format("csv").option(...).load(path)
    spark.read
      .option("sep", ",")
      .option("inferSchema", "true")
      .option("header", "true")
      .csv("file:///E:/t_user.csv")
      .createTempView("t_user")

    spark.sql("select * from t_user").show()
    spark.stop()
  }
}
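inferSchema costs an extra pass over the file to guess the column types. For bigger files you can declare the schema yourself; a sketch, assuming the same id/name/age columns used in the MySQL examples below:
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// An explicit schema skips the type-inference pass over the file
val userSchema = StructType(List(
  StructField("id", IntegerType, true),
  StructField("name", StringType, true),
  StructField("age", IntegerType, true)
))

spark.read
  .option("sep", ",")
  .option("header", "true")
  .schema(userSchema)
  .csv("file:///E:/t_user.csv")
  .createTempView("t_user")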
- Reading JSON data
spark.read.format("json")
  .load("file:///D:/Person.json")
  .createTempView("t_user")
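Note that the JSON source expects one JSON object per line (JSON Lines) by default. If Person.json is a single pretty-printed document instead, enable multiLine; a sketch:
spark.read
  .option("multiLine", "true") // parse a multi-line JSON document rather than JSON Lines
  .json("file:///D:/Person.json")
  .createTempView("t_user")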
- Writing data to MySQL
import java.util.Properties
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
  .map(_.split(" "))
  .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Specify each field's schema directly via StructType
val schema = StructType(
  List(
    StructField("id", IntegerType, true),
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)

val props = new Properties()
props.put("user", "root")
props.put("password", "root")

spark.createDataFrame(personRDD, schema)
  .write.mode("append") // append to the existing table
  .jdbc("jdbc:mysql://CentOS:3306/test", "t_user", props)
- Saving as JSON
val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
  .map(_.split(" "))
  .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Specify each field's schema directly via StructType
val schema = StructType(
  List(
    StructField("id", IntegerType, true),
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)

spark.createDataFrame(personRDD, schema)
  .write.mode("append") // append mode
  .format("json")
  .save("file:///D:/aa.json") // the path is created as a directory of part files
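Each row is written as one JSON object per line, so for the rows above a part file would contain:
{"id":14,"name":"tom","age":19}
{"id":15,"name":"jerry","age":18}
{"id":16,"name":"kitty","age":20}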
- Saving as CSV
val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
  .map(_.split(" "))
  .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Specify each field's schema directly via StructType
val schema = StructType(
  List(
    StructField("id", IntegerType, true),
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)

spark.createDataFrame(personRDD, schema)
  .write.mode("append")
  .format("csv")
  .option("header", "true") // write the header row
  .save("file:///D:/csv")
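Each partition writes its own part file under D:/csv. For a small result you can coalesce to a single partition first so only one CSV file comes out; a sketch:
spark.createDataFrame(personRDD, schema)
  .coalesce(1) // one partition => one part file (only sensible for small data)
  .write.mode("append")
  .option("header", "true")
  .csv("file:///D:/csv")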
- Generating partitioned files
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder()
  .appName("hellosql")
  .master("local[10]")
  .getOrCreate()

spark.read.format("csv")
  .option("sep", ",")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("file:///D:/t_user.csv")
  .createTempView("t_user")

spark.sql("select * from t_user")
  .write.format("json")
  .mode(SaveMode.Overwrite)
  .partitionBy("id") // one id=<value> subdirectory per distinct id
  .save("file:///D:/partitions")

spark.stop()
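Reading the directory back, Spark discovers the id=... subdirectories and restores id as a column, and a filter on it skips whole directories (partition pruning). A sketch, with an illustrative id value:
val users = spark.read.format("json").load("file:///D:/partitions")
users.where("id = 1").show() // only the id=1 subdirectory is scanned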
Jar packaging issues
- Use the shade plugin to build a fat jar
<plugin>
    <groupId>net.alchim31.maven</groupId>
    <artifactId>scala-maven-plugin</artifactId>
    <version>4.0.1</version>
    <executions>
        <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
                <goal>add-source</goal>
                <goal>compile</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-shade-plugin</artifactId>
    <version>2.4.3</version>
    <executions>
        <execution>
            <phase>package</phase>
            <goals>
                <goal>shade</goal>
            </goals>
            <configuration>
                <filters>
                    <filter>
                        <artifact>*:*</artifact>
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
            </configuration>
        </execution>
    </executions>
</plugin>
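With both plugins in place, a regular build produces the shaded fat jar; the shade plugin keeps the unshaded artifact under an original- prefix, which is where the original-sparkstream-1.0-SNAPSHOT.jar used below comes from:
mvn clean package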
- Use --packages to resolve jar dependencies (requires network access)
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://CentOS:7077 --deploy-mode client --class com.baizhi.demo10parkStreamWordCounts --total-executor-cores 4 --packages 'org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.3,redis.clients:jedis:2.9.0' /root/original-sparkstream-1.0-SNAPSHOT.jar
Note: neither of the two approaches above solves the MySQL driver dependency problem.
- Using spark.executor.extraClassPath and spark.driver.extraClassPath does solve the MySQL dependency problem (adjust the exact paths to your own environment)
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://CentOS:7077 --deploy-mode client --class com.baizhi.demo10parkStreamWordCounts --total-executor-cores 4 --packages 'org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.3,redis.clients:jedis:2.9.0' --conf spark.executor.extraClassPath=/root/mysql-xxx.jar --conf spark.driver.extraClassPath=/root/mysql-xxx.jar /root/original-sparkstream-1.0-SNAPSHOT.jar
If that feels cumbersome, you can set these parameters in spark-defaults.conf instead (again, adjust the paths to your own environment):
spark.executor.extraClassPath=/root/.ivy2/jars/*
spark.driver.extraClassPath=/root/.ivy2/jars/*