Multiple ways to create a DataFrame

1. Create a DataFrame from RDD[Row] and StructType

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
 * Create a DataFrame from RDD[Row] and StructType
 */
object DataFrameDemo {
  def main(args: Array[String]): Unit = {
    // Create the Spark entry point via SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Create an RDD of CSV-like strings
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Map each comma-separated line to a Row
    val rowRdd: RDD[Row] = sparkRdd.map(t => {
      val per: Array[String] = t.split(",")
      Row(per(0), per(1).toInt, per(2))
    })
    // Build a StructType instance: set the field names and types
    val schema: StructType = StructType(
      List(
        StructField("name", StringType),
        StructField("age", IntegerType),
        StructField("sex", StringType)
      )
    )
    // Create the DataFrame from the Row RDD and the schema
    val dataFrame: DataFrame = spark.createDataFrame(rowRdd, schema)
    // Show the data
    dataFrame.show()
    // Release resources
    spark.stop()
  }
}
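With the three sample rows above, the output of dataFrame.show() should look roughly like this (column order follows the schema):

+----+---+---+
|name|age|sex|
+----+---+---+
|   X| 22|  M|
|   y| 21|  W|
|   N| 22|  M|
+----+---+---+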

2. Create a DataFrame from an RDD and a Scala case class (ScalaBean)

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Create a DataFrame from an RDD and a Scala case class
 */
object DataFrameDemo2 {
  def main(args: Array[String]): Unit = {
    // Create the Spark entry point via SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Create an RDD of CSV-like strings
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Map each line to the Per case class
    val beanRdd: RDD[Per] = sparkRdd.map(t => {
      val per: Array[String] = t.split(",")
      Per(per(0), per(1).toInt, per(2))
    })
    // The implicit conversions must be imported before .toDF can be used
    import spark.implicits._
    // Create the DataFrame
    val df: DataFrame = beanRdd.toDF
    // Register a temporary view
    df.createTempView("t_per")
    // Query the data with SQL
    val res: DataFrame = spark.sql("SELECT name, age FROM t_per ORDER BY age")
    // Show the result
    res.show()
    // Release resources
    spark.stop()
  }
}

case class Per(name: String, age: Int, sex: String)
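For reference, the same query can also be written with the DataFrame API instead of SQL. A minimal sketch, assuming the df built above is in scope:

df.select("name", "age").orderBy("age").show()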

3. Create a DataFrame directly through RDD.toDF()

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Create a DataFrame directly via RDD.toDF
 */
object DataFrameDemo3 {
  def main(args: Array[String]): Unit = {
    // Create the Spark entry point via SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Create an RDD of CSV-like strings
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Map each line to a tuple
    val toDFRdd: RDD[(String, Int, String)] = sparkRdd.map(t => {
      val per: Array[String] = t.split(",")
      (per(0), per(1).toInt, per(2))
    })
    import org.apache.spark.sql.functions._
    import spark.implicits._
    // Create the DataFrame, naming the schema fields
    val frame: DataFrame = toDFRdd.toDF("name", "age", "sex")

    // Aggregate: the sum of the age column
    frame.agg(sum("age") as "sum_age").show()
    // Release resources
    spark.stop()
  }
}
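The aggregation above computes a sum of the ages. If an average is wanted instead, avg from org.apache.spark.sql.functions computes it; a minimal sketch, assuming frame is still in scope:

frame.agg(avg("age") as "avg_age").show()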

4. Create a DataFrame from an RDD and a JavaBean

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Create a DataFrame from an RDD and a JavaBean
 */
object DataFrameDemo4 {
  def main(args: Array[String]): Unit = {
    // Create the Spark entry point via SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Create an RDD of CSV-like strings
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Map each line to a Person JavaBean
    val javaBeanRdd: RDD[Person] = sparkRdd.map(t => {
      val per: Array[String] = t.split(",")
      new Person(per(0), per(1).toInt, per(2))
    })
    // Create the DataFrame; the schema is inferred from the bean's getters
    val frame: DataFrame = spark.createDataFrame(javaBeanRdd, classOf[Person])
    // Show the data
    frame.show()
    // Release resources
    spark.stop()
  }
}
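The Person class is referenced above but not defined in the original. spark.createDataFrame(rdd, classOf[Person]) infers the schema from JavaBean-style getters, so Person needs getName, getAge, and getSex. A minimal sketch of what such a class might look like in Scala, using @BeanProperty to generate the getters and setters (the exact definition is an assumption):

import scala.beans.BeanProperty

// Hypothetical JavaBean-style class; made serializable since instances are built inside an RDD map
class Person(@BeanProperty var name: String,
             @BeanProperty var age: Int,
             @BeanProperty var sex: String) extends Serializable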

All four approaches above create a DataFrame; pick whichever fits your needs.

Origin: blog.csdn.net/AnameJL/article/details/107391981