Spark - Converting an RDD to a DataFrame

1. Two ways to convert an RDD to a DataFrame:

package df

import org.apache.spark.sql.{Row, SparkSession}

object RDD2DataFrame {

  case class Person(name: String, score: Int)

  def main(args: Array[String]): Unit = {
    // Calling enableHiveSupport() on the builder would enable Hive support
    val spark = SparkSession.builder().appName("DF_DEMO").master("local").getOrCreate()

    //--------------- 1. Reflection-based approach (the schema must be known in advance: define it with a case class, and Spark obtains the field names and types from the case class via reflection; in Spark 1.6 a case class supported at most 22 fields, later versions have no such limit) ---------------------
    /**
      * 1. Define a case class
      * 2. Build an RDD => RDD[case class] => .toDF()
      */
    // Import the implicit conversions; toDF() cannot be called without them
    import spark.implicits._

    /**
      * Contents of the test data file:
      * a,100
      * b,90
      * c,80
      */
    val df = spark.sparkContext.textFile("file:///F:\\test\\2.txt")
      .map(x => x.split(","))
      .map(x => Person(x(0), x(1).toInt))
      .toDF()
    df.show()
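    /**
      * df.show() output for the sample data above (the column names come from the
      * fields of the Person case class):
      * +----+-----+
      * |name|score|
      * +----+-----+
      * |   a|  100|
      * |   b|   90|
      * |   c|   80|
      * +----+-----+
      */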

    /**
      * Output of the statement below:
      * +--------+
      * |   value|
      * +--------+
      * |name : a|
      * |name : b|
      * |name : c|
      * +--------+
      */
    df.map(x => "name : " + x.getAs[String]("name")).show()
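    // Note: on a DataFrame, map(...) with a String-valued function returns a Dataset[String]
    // (using the Encoder provided by spark.implicits._), and its single column is named
    // "value" by default, which is why the output above shows a "value" column instead of
    // the original name/score columns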

    //--------------- 2. Programmatic schema specification ---------------------
    /**
      * 1. Build a schema string => StructField => StructType
      * 2. Build an RDD => RDD[Row]
      * 3. spark.createDataFrame(rowRDD, schema)
      */
    // Import the Spark SQL type definitions (otherwise StringType, StructField, etc. cannot be resolved)
    import org.apache.spark.sql.types._

    val schemaString = "name score"
    val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
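    // Note: both fields are declared as StringType here, so score ends up as a string
    // column; see the sketch after this example for one way to keep it as an Int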

    val rowRDD = spark.sparkContext.textFile("file:///F:\\test\\2.txt")
      .map(x => x.split(","))
      .map(x => Row(x(0), x(1)))
    val df2 = spark.createDataFrame(rowRDD, schema)
    df2.show()
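    // df2.show() prints the same three rows as before, but with score held as a string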
    spark.stop()
  }
}
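
The programmatic example above declares both columns as StringType. As a minimal sketch (not part of the original post; the file path and field names simply reuse the example above), the same approach can keep score as an integer by declaring IntegerType in the schema and converting the value when each Row is built:

package df

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

object RDD2DataFrameTyped {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DF_DEMO_TYPED").master("local").getOrCreate()

    // Declare score as IntegerType instead of StringType
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("score", IntegerType, nullable = true)
    ))

    // Convert the score field to Int when building each Row so it matches the schema
    val rowRDD = spark.sparkContext.textFile("file:///F:\\test\\2.txt")
      .map(_.split(","))
      .map(x => Row(x(0), x(1).toInt))

    val df = spark.createDataFrame(rowRDD, schema)
    df.printSchema() // score: integer (nullable = true)
    df.show()
    spark.stop()
  }
}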

Reposted from coderlxl201209164551.iteye.com/blog/2415882