Create a Dataset
import com.twq.dataset.{Cat, Dog}
import org.apache.spark.sql.{Encoders, SparkSession}

object DatasetCreation {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SparkSessionTest")
      .getOrCreate()

    import spark.implicits._

    //1: range
    val ds1 = spark.range(0, 10, 2, 2)
    ds1.show()

    val dogs = Seq(Dog("jitty", "red"), Dog("mytty", "yellow"))
    val cats = Seq(new Cat("jitty", 2), new Cat("mytty", 4))

    //2: create from Seq[T]
    val data = dogs
    val ds = spark.createDataset(data)
    ds.show()

    //3: create from RDD[T]
    val dogRDD = spark.sparkContext.parallelize(dogs)
    val dogDS = spark.createDataset(dogRDD)
    dogDS.show()

    val catRDD = spark.sparkContext.parallelize(cats)
    //val catDSWithoutEncoder = spark.createDataset(catRDD) // does not compile: no implicit Encoder for the Java bean Cat
    val catDS = spark.createDataset(catRDD)(Encoders.bean(classOf[Cat]))
    catDS.show()

    //Encoders handle the conversion between JVM object types and Spark SQL's internal data types
    val intDs = Seq(1, 2, 3).toDS() // implicitly provided (spark.implicits.newIntEncoder)
    val seqIntDs = Seq(Seq(1), Seq(2), Seq(3)).toDS() // implicitly provided (spark.implicits.newIntSeqEncoder)
    val arrayIntDs = Seq(Array(1), Array(2), Array(3)).toDS() // implicitly provided (spark.implicits.newIntArrayEncoder)

    //The supported Encoders include:
    Encoders.product //tuples and case classes
    Encoders.scalaBoolean
    Encoders.scalaByte
    Encoders.scalaDouble
    Encoders.scalaFloat
    Encoders.scalaInt
    Encoders.scalaLong
    Encoders.scalaShort
    Encoders.bean(classOf[Cat])

    spark.stop()
  }
}
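These examples construct Dog and Cat instances whose definitions are not shown; the later transform example imports com.twq.dataset.Dog, so both presumably live in that package. Based purely on how they are used here (Dog("jitty", "red"), new Cat("jitty", 2), Encoders.bean(classOf[Cat])), a minimal sketch might look like this; the field names are assumptions:

package com.twq.dataset

import scala.beans.BeanProperty

// Assumed shape: a case class, so the implicit product Encoder from
// spark.implicits._ can serialize it.
case class Dog(name: String, color: String)

// Assumed shape: a Java-bean-style class, so Encoders.bean(classOf[Cat])
// can introspect it; @BeanProperty generates getName/setName and getAge/setAge.
class Cat(@BeanProperty var name: String, @BeanProperty var age: Int) {
  def this() = this(null, 0) // no-arg constructor, required by the bean Encoder
}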
Create a DataFrame
import com.twq.dataset.{Cat, Dog}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object DataFrameCreation {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SparkSessionTest")
      .getOrCreate()

    //1: create from RDD[A <: Product]; case classes and tuples are both subtypes of Product
    val rdd = spark.sparkContext.textFile("").map(line => { // path left blank in the original example
      val splitData = line.split(",")
      Dog(splitData(0), splitData(1))
    })
    val tupleRDD = spark.sparkContext.parallelize(Seq(("jitty", 2), ("mytty", 4)))
    spark.createDataFrame(rdd)
    spark.createDataFrame(tupleRDD)

    val dogRDD = spark.sparkContext.parallelize(Seq(Dog("jitty", "red"), Dog("mytty", "yellow")))
    val dogDf = spark.createDataFrame(dogRDD)
    dogDf.show()

    //2: create from Seq[A <: Product]
    val dogSeq = Seq(Dog("jitty", "red"), Dog("mytty", "yellow"))
    spark.createDataFrame(dogSeq).show()

    //3: create from RDD[_] plus a class, where the class is a Java bean
    val catRDD = spark.sparkContext.parallelize(Seq(new Cat("jitty", 2), new Cat("mytty", 4)))
    //val catDf = spark.createDataFrame(catRDD) // does not compile: Cat is not a Product
    val catDf = spark.createDataFrame(catRDD, classOf[Cat])
    catDf.show()

    catDf.createOrReplaceTempView("cat")
    spark.sql("select * from cat").show()
    //Note that the order of the cat attributes in the query result is not fixed

    //4: create from RDD[Row] plus a schema
    val rowSeq = Seq("tom, 30", "katy, 46").map(_.split(",")).map(p => Row(p(0), p(1).trim.toInt))
    val rowRDD = spark.sparkContext.parallelize(rowSeq)
    val schema = StructType(
      StructField("name", StringType, false) ::
      StructField("age", IntegerType, true) :: Nil)
    val dataFrame = spark.createDataFrame(rowRDD, schema)
    dataFrame.printSchema
    dataFrame.show()

    //5: create from an external data source
    val df = spark.read.json(s"${BASE_PATH}/IoT_device_info.json")
    df.show()

    spark.stop()
  }
}
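The bean-derived catDf columns come from Java bean introspection, which is why the comment above warns that the attribute order is not fixed. If a stable order matters, select the columns explicitly; a small sketch, run in the same scope as the example above (the column names follow the assumed Cat bean):

// Pin the output order by selecting columns explicitly instead of
// relying on the bean-derived column order.
catDf.select("name", "age").show()

The BASE_PATH value used here and in the later examples is defined elsewhere in the original project; it is assumed to point at the directory holding the sample data files.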
Converting between RDD, Dataset, and DataFrame
package com.twq.dataset.creation

import com.twq.dataset.Dog
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}

object RDDDatasetTransform {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("RDDDatasetTransform")
      .getOrCreate()

    val dogs = Seq(Dog("jitty", "red"), Dog("mytty", "yellow"))
    val dogRDD = spark.sparkContext.parallelize(dogs)

    //1: RDD to DataFrame
    import spark.implicits._
    val dogDF = dogRDD.toDF()
    dogDF.show()

    val renameSchemaDF = dogRDD.toDF("first_name", "lovest_color")
    renameSchemaDF.show()

    //2: DataFrame to RDD; the schema information is lost
    val dogRowRDD: RDD[Row] = dogDF.rdd
    dogRowRDD.collect()
    renameSchemaDF.rdd.collect()

    //3: RDD to Dataset
    val dogDS = dogRDD.toDS()
    dogDS.show()

    //4: Dataset to RDD
    val dogRDDFromDs: RDD[Dog] = dogDS.rdd
    dogRDDFromDs.collect()

    //5: DataFrame to Dataset
    val dogDsFromDf = dogDF.as[Dog]
    dogDsFromDf.show()

    //6: Dataset to DataFrame
    val dogDfFromDs = dogDsFromDf.toDF()
    dogDfFromDs.show()

    spark.stop()
  }
}
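A detail worth noting about step 5: DataFrame.as[Dog] resolves columns by name, so the conversion only succeeds when the frame's column names match the case class's fields. The renamed frame from step 1 would fail the conversion until its columns are renamed back; a minimal sketch, assuming Dog(name, color) as above and the same main scope:

// as[Dog] matches columns by name; renameSchemaDF has columns
// first_name / lovest_color, so restore Dog's field names first.
val dogDsFromRenamed = renameSchemaDF
  .toDF("name", "color")
  .as[Dog]
dogDsFromRenamed.show()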
Defining and using schemas with complex data types
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SaveMode, SparkSession}

object SchemaApiTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("SchemaApiTest")
      .master("local")
      .getOrCreate()

    val iotDeviceDf = spark.read.json(s"${BASE_PATH}/IoT_device_info.json")
    iotDeviceDf.toString()

    //1: displaying a schema
    iotDeviceDf.schema
    iotDeviceDf.printSchema()

    //2: a schema can contain complex data types
    val schema = StructType(
      StructField("name", StringType, false) ::
      StructField("age", IntegerType, true) ::
      StructField("map", MapType(StringType, StringType), true) ::
      StructField("array", ArrayType(StringType), true) ::
      StructField("struct", StructType(Seq(StructField("field1", StringType),
        StructField("field2", StringType)))) :: Nil)

    val people = spark.sparkContext.parallelize(Seq("tom,30", "katy, 46"))
      .map(_.split(","))
      .map(p => Row(p(0), p(1).trim.toInt, Map(p(0) -> p(1)), Seq(p(0), p(1)), Row("value1", "value2")))
    val dataFrame = spark.createDataFrame(people, schema)
    dataFrame.printSchema
    dataFrame.show()

    dataFrame.select("map").collect().map(row => row.getAs[Map[String, String]]("map"))
    dataFrame.select("array").collect().map(row => row.getAs[Seq[String]]("array"))
    dataFrame.select("struct").collect().map(row => row.getAs[Row]("struct"))

    //the usefulness of a schema
    val exampleSchema = new StructType().add("name", StringType).add("age", IntegerType)
    exampleSchema("name") //extract the name field's information
    exampleSchema.fields //information on all fields
    exampleSchema.fieldNames //all field names
    exampleSchema.fieldIndex("name") //the index position of the name field

    //1: view the schema of a parquet file
    val sessionDf = spark.read.parquet(s"${BASE_PATH}/trackerSession")
    sessionDf.schema
    sessionDf.printSchema()

    //2: compare whether two parquet files have the same schema
    val changedSchemaFieldNames = sessionDf.schema.fieldNames.map(fieldName => {
      if (fieldName == "pageview_count") {
        "pv_count"
      } else fieldName
    })
    sessionDf.toDF(changedSchemaFieldNames:_*).write.mode(SaveMode.Overwrite).parquet(s"${BASE_PATH}/trackerSession_changeSchema")

    val schemaChangeSessionDf = spark.read.parquet(s"${BASE_PATH}/trackerSession_changeSchema")
    schemaChangeSessionDf.schema
    schemaChangeSessionDf.printSchema()

    val oldSchema = sessionDf.schema
    val changeSchema = schemaChangeSessionDf.schema

    //3: the two parquet files' schemas are not the same, so they need to be unified before a union
    oldSchema == changeSchema //false

    //reading both directories at once mixes the two schemas
    val allSessionError = spark.read.parquet(s"${BASE_PATH}/trackerSession", s"${BASE_PATH}/trackerSession_changeSchema")
    allSessionError.printSchema()
    allSessionError.show()

    //rename the old columns to the new names, then union
    val allSessionRight = sessionDf.toDF(changeSchema.fieldNames:_*).union(schemaChangeSessionDf)
    allSessionRight.printSchema()
    allSessionRight.show()

    spark.stop()
  }
}
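When two schemas disagree like this, it can help to report exactly which fields differ before deciding how to unify them. A small positional diff, as a sketch that runs in the same main scope as the example above (the approach is illustrative, not part of the original):

// Walk the two schemas in parallel and print any field whose
// name or type does not match.
oldSchema.fields.zip(changeSchema.fields).foreach { case (oldField, newField) =>
  if (oldField.name != newField.name || oldField.dataType != newField.dataType) {
    println(s"mismatch: ${oldField.name}: ${oldField.dataType} vs ${newField.name}: ${newField.dataType}")
  }
}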