1. Create a DataFrame through RDD[Row] and StructType
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{
IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{
DataFrame, Row, SparkSession, types}
/**
 * Creates a DataFrame by pairing an RDD[Row] with an explicitly built
 * StructType schema, then displays its contents.
 */
object DataFrameDemo {
  def main(args: Array[String]): Unit = {
    // Entry point: build (or reuse) the SparkSession.
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Raw input: one "name,age,sex" CSV string per record.
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Parse each line into a Row(name, age, sex); age is converted to Int.
    val rowRdd: RDD[Row] = sparkRdd.map { line =>
      val fields: Array[String] = line.split(",")
      Row(fields(0), fields(1).toInt, fields(2))
    }
    // Schema describing the three Row fields by name and type.
    val schema: StructType = StructType(
      List(
        StructField("name", StringType),
        StructField("age", IntegerType),
        StructField("sex", StringType)
      )
    )
    // Combine rows and schema into a DataFrame.
    val dataFrame: DataFrame = spark.createDataFrame(rowRdd, schema)
    // Print the contents to stdout.
    dataFrame.show()
    // Release cluster resources.
    spark.stop()
  }
}
2. Create a DataFrame through an RDD and a Scala case class (ScalaBean)
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, SparkSession}
/**
 * Creates a DataFrame from an RDD of case-class (ScalaBean) instances via
 * the implicit `toDF` conversion, then queries it through a temp view.
 */
object DataFrameDemo2 {
  def main(args: Array[String]): Unit = {
    // Entry point: build (or reuse) the SparkSession.
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Raw input: one "name,age,sex" CSV string per record.
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Parse each line into a Per case-class instance (the ScalaBean).
    val beanRdd: RDD[Per] = sparkRdd.map { line =>
      val fields: Array[String] = line.split(",")
      Per(fields(0), fields(1).toInt, fields(2))
    }
    // The implicit conversions are required for `.toDF` on an RDD.
    import spark.implicits._
    // Schema (column names and types) is derived from the Per case class.
    val df: DataFrame = beanRdd.toDF
    // Register a session-scoped view so the data can be queried with SQL.
    df.createTempView("t_per")
    // Select name and age, ordered by age ascending.
    val res: DataFrame = spark.sql("SELECT name,age FROM t_per ORDER BY age")
    // Print the query result to stdout.
    res.show()
    // Release cluster resources.
    spark.stop()
  }
}
// ScalaBean used by DataFrameDemo2: one person record (name, age, sex).
case class Per(name: String, age: Int, sex: String)
3. Create a DataFrame directly through RDD.toDF()
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, SparkSession}
/**
 * Creates a DataFrame directly from an RDD of tuples via `toDF`, supplying
 * column names inline, then shows an aggregation over it.
 */
object DataFrameDemo3 {
  def main(args: Array[String]): Unit = {
    // Entry point: build (or reuse) the SparkSession.
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Raw input: one "name,age,sex" CSV string per record.
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Parse each line into a (name, age, sex) tuple; column types are
    // inferred from the tuple's element types.
    val toDFRdd: RDD[(String, Int, String)] = sparkRdd.map { line =>
      val fields: Array[String] = line.split(",")
      (fields(0), fields(1).toInt, fields(2))
    }
    import org.apache.spark.sql.functions._
    import spark.implicits._
    // Create the DataFrame, naming the schema columns explicitly.
    val frame: DataFrame = toDFRdd.toDF("name", "age", "sex")
    // FIX: the aggregation is a sum, but the original aliased it "avg_age",
    // which mislabels the output column. Name it "sum_age" to match.
    frame.agg(sum("age") as "sum_age").show()
    // Release cluster resources.
    spark.stop()
  }
}
4. Create a DataFrame through an RDD and a JavaBean
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{
DataFrame, SparkSession}
/**
 * Creates a DataFrame from an RDD of JavaBean instances by passing the bean
 * class to `createDataFrame`, which derives the schema via reflection.
 */
object DataFrameDemo4 {
  def main(args: Array[String]): Unit = {
    // Entry point: build (or reuse) the SparkSession.
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // Raw input: one "name,age,sex" CSV string per record.
    val sparkRdd: RDD[String] = spark.sparkContext.parallelize(List("X,22,M", "y,21,W", "N,22,M"))
    // Parse each line into a Person instance.
    // NOTE(review): Person is declared elsewhere in the project; presumably a
    // JavaBean with getters, as `createDataFrame(rdd, Class)` requires — verify.
    val javaBeanRdd: RDD[Person] = sparkRdd.map { line =>
      val fields: Array[String] = line.split(",")
      new Person(fields(0), fields(1).toInt, fields(2))
    }
    // Schema is derived from the bean class through reflection.
    val frame: DataFrame = spark.createDataFrame(javaBeanRdd, classOf[Person])
    // Print the contents to stdout.
    frame.show()
    // Release cluster resources.
    spark.stop()
  }
}
The four approaches above are all ways of creating a DataFrame; choose whichever one best fits your needs.