Spark load and save functions, and resolving jar dependency issues

Load & save functions

  • MySQL integration (add the MySQL driver jar)
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
// Register the MySQL table t_user as a temp view via JDBC
spark.read
    .format("jdbc")
    .option("url", "jdbc:mysql://CentOS:3306/test")
    .option("dbtable", "t_user")
    .option("user", "root")
    .option("password", "root")
    .load().createTempView("t_user")

spark.sql("select * from t_user").show()
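
If the driver class is not picked up automatically, it can also be passed explicitly through the driver option. A minimal sketch, assuming the mysql-connector-java 5.1.x driver class com.mysql.jdbc.Driver:

spark.read
    .format("jdbc")
    .option("driver", "com.mysql.jdbc.Driver")   // driver class shipped in mysql-connector-java 5.1.x
    .option("url", "jdbc:mysql://CentOS:3306/test")
    .option("dbtable", "t_user")
    .option("user", "root")
    .option("password", "root")
    .load()
    .show()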
  • Reading CSV data
package demojar

import org.apache.spark.sql.SparkSession

object Testfit {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CSV").master("local[2]").getOrCreate()
    spark.sparkContext.setLogLevel("FATAL")   // silence Spark's INFO/WARN output
    import spark.implicits._

    // Equivalent generic form:
    // spark.read.format("csv")
    //   .option("sep", ",")
    //   .option("inferSchema", "true")
    //   .option("header", "true")
    //   .load("file:///E:/t_user.csv").createTempView("t_user")

    // .csv(path) is shorthand for format("csv").load(path)
    spark.read
      .option("sep", ",")
      .option("inferSchema", "true")   // infer column types from the data
      .option("header", "true")        // first line holds the column names
      .csv("file:///E:/t_user.csv")
      .createTempView("t_user")

    spark.sql("select * from t_user").show()

    spark.stop()
  }
}
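
Instead of inferSchema (which costs an extra pass over the file), the column types can be supplied up front with an explicit schema. A minimal sketch, assuming the CSV has the id, name, age columns used elsewhere in this post:

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val userSchema = StructType(List(
  StructField("id", IntegerType, true),
  StructField("name", StringType, true),
  StructField("age", IntegerType, true)
))

spark.read
  .schema(userSchema)          // skip type inference
  .option("header", "true")
  .csv("file:///E:/t_user.csv")
  .createTempView("t_user")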

  • Reading JSON data
 spark.read.format("json")
     .load("file:///D:/Person.json")
     .createTempView("t_user")
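
By default Spark expects one JSON object per line; for pretty-printed files where a record spans several lines, the multiLine option can be enabled. A minimal sketch using the same file path:

spark.read
    .option("multiLine", "true")   // parse records that span multiple lines
    .json("file:///D:/Person.json")
    .createTempView("t_user")

spark.sql("select * from t_user").show()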
  • Writing data into MySQL
import java.util.Properties

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
    .map(_.split(" "))
    .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Declare the schema of each field explicitly with a StructType
val schema = StructType(
    List(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
    )
)

val props = new Properties()
props.put("user", "root")
props.put("password", "root")

spark.createDataFrame(personRDD, schema)
    .write.mode("append")   // save mode: append
    .jdbc("jdbc:mysql://CentOS:3306/test", "t_user", props)
  • Saving as JSON
val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
    .map(_.split(" "))
    .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Declare the schema of each field explicitly with a StructType
val schema = StructType(
    List(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
    )
)

spark.createDataFrame(personRDD, schema)
    .write.mode("append")   // save mode: append
    .format("json")
    .save("file:///D:/aa.json")   // written as a directory of part files
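
Because the path is written as a directory of part files, re-running in append mode keeps adding files. If the output should be replaced on each run, SaveMode.Overwrite together with the json(...) shortcut is a common alternative; a minimal sketch reusing the DataFrame above:

import org.apache.spark.sql.SaveMode

spark.createDataFrame(personRDD, schema)
    .write
    .mode(SaveMode.Overwrite)   // replace any previous output under the path
    .json("file:///D:/aa.json")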
  • Saving as CSV
val personRDD = spark.sparkContext.parallelize(Array("14 tom 19", "15 jerry 18", "16 kitty 20"))
    .map(_.split(" "))
    .map(tokens => Row(tokens(0).toInt, tokens(1), tokens(2).toInt))

// Declare the schema of each field explicitly with a StructType
val schema = StructType(
    List(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
    )
)

spark.createDataFrame(personRDD, schema)
    .write.mode("append")
    .format("csv")
    .option("header", "true")   // write a header row with the column names
    .save("file:///D:/csv")
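
Each partition of the DataFrame is written as its own part file, so the output directory usually contains several CSV files. For small result sets, the DataFrame can be coalesced to one partition before writing to get a single file. A minimal sketch (the csv_single output path is just an example):

spark.createDataFrame(personRDD, schema)
    .coalesce(1)                 // collapse to a single partition -> single part file
    .write.mode("overwrite")
    .option("header", "true")
    .csv("file:///D:/csv_single")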
  • Producing partitioned output files
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder()
    .appName("hellosql")
    .master("local[10]")
    .getOrCreate()

spark.read.format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("file:///D:/t_user.csv").createTempView("t_user")

spark.sql("select * from t_user")
    .write.format("json")
    .mode(SaveMode.Overwrite)
    .partitionBy("id")   // one sub-directory per distinct id value
    .save("file:///D:/partitions")

spark.stop()
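
partitionBy("id") produces one sub-directory per id value (id=14/, id=15/, ...), and the partition column is encoded in the directory names rather than in the JSON records themselves. Reading the root directory back recovers it automatically; a minimal sketch (run before spark.stop()):

val reloaded = spark.read.json("file:///D:/partitions")
reloaded.printSchema()   // id comes back as a partition column
reloaded.show()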

Jar dependency issues

  • Use the shade plugin to build a fat jar
<plugin>
    <groupId>net.alchim31.maven</groupId>
    <artifactId>scala-maven-plugin</artifactId>
    <version>4.0.1</version>
    <executions>
        <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
                <goal>add-source</goal>
                <goal>compile</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-shade-plugin</artifactId>
    <version>2.4.3</version>
    <executions>
        <execution>
            <phase>package</phase>
            <goals>
                <goal>shade</goal>
            </goals>
            <configuration>
                <filters>
                    <filter>
                        <artifact>*:*</artifact>
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
            </configuration>
        </execution>
    </executions>
</plugin>
  • Use --packages to resolve jar dependencies (requires network access)
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://CentOS:7077 --deploy-mode client  --class com.baizhi.demo10parkStreamWordCounts --total-executor-cores 4 --packages 'org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.3,redis.clients:jedis:2.9.0'  /root/original-sparkstream-1.0-SNAPSHOT.jar

Note: neither of the two approaches above resolves the MySQL driver dependency.

  • Use spark.executor.extraClassPath and spark.driver.extraClassPath to resolve the MySQL driver dependency (adjust the paths to your environment)
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://CentOS:7077 --deploy-mode client  --class com.baizhi.demo10parkStreamWordCounts --total-executor-cores 4 --packages 'org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.3,redis.clients:jedis:2.9.0' --conf spark.executor.extraClassPath=/root/mysql-xxx.jar --conf  spark.driver.extraClassPath=/root/mysql-xxx.jar  /root/original-sparkstream-1.0-SNAPSHOT.jar

If that feels cumbersome, you can also set these parameters in spark-defaults.conf (again, adjust the paths to your environment):

spark.executor.extraClassPath=/root/.ivy2/jars/*
spark.driver.extraClassPath=/root/.ivy2/jars/*

Reposted from blog.csdn.net/weixin_43655644/article/details/95099844