1. Two ways to convert an RDD into a DataFrame:
package df

import org.apache.spark.sql.{Row, SparkSession}

/**
 * Demonstrates the two standard ways of converting an RDD into a DataFrame:
 *
 *   1. Reflection-based: define a case class; Spark derives the schema from
 *      its fields via reflection, and `rdd.toDF()` builds the DataFrame.
 *      (Spark 1.6 limited case classes to 22 fields; later versions do not.)
 *   2. Programmatic: build a `StructType` schema by hand, map the RDD to
 *      `Row` objects, and call `spark.createDataFrame(rowRDD, schema)`.
 *
 * Input file format (comma-separated, one record per line):
 *   a,100
 *   b,90
 *   c,80
 */
object RDD2DataFrame {

  // Schema carrier for the reflection-based approach. Must be declared at
  // object level (not inside main) so Spark's reflection can see it.
  case class Person(name: String, score: Int)

  /**
   * Entry point.
   *
   * @param args optional; args(0) may override the input file path
   *             (defaults to the original hard-coded sample path).
   */
  def main(args: Array[String]): Unit = {
    // Use .enableHiveSupport() here if Hive integration is needed.
    val spark = SparkSession.builder()
      .appName("DF_DEMO")
      .master("local")
      .getOrCreate()

    // Input path is overridable so the demo is not tied to one machine.
    val inputPath = if (args.nonEmpty) args(0) else "file:///F:\\test\\2.txt"

    // Read and split once; both conversion approaches reuse this RDD.
    val fields2 = spark.sparkContext.textFile(inputPath).map(_.split(","))

    // ---------- 1. Reflection-based conversion ----------
    // Steps: case class -> RDD[case class] -> .toDF()
    // The implicits import is required for toDF() and for Dataset encoders.
    import spark.implicits._

    val df = fields2.map(x => Person(x(0), x(1).toInt)).toDF()
    df.show()

    /*
     * Expected output of the map below:
     * +--------+
     * |   value|
     * +--------+
     * |name : a|
     * |name : b|
     * |name : c|
     * +--------+
     */
    df.map(x => "name : " + x.getAs[String]("name")).show()

    // ---------- 2. Programmatically specified schema ----------
    // Steps: schema string -> StructField -> StructType;
    //        RDD -> RDD[Row]; then spark.createDataFrame(rowRDD, schema).
    // Import brings StringType / StructField / StructType into scope.
    import org.apache.spark.sql.types._

    val schemaString = "name score"
    // Both columns are kept as StringType to match the Row values below;
    // this intentionally differs from approach 1, where score is an Int.
    val schemaFields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(schemaFields)

    val rowRDD = fields2.map(x => Row(x(0), x(1)))
    val df2 = spark.createDataFrame(rowRDD, schema)
    df2.show()

    spark.stop()
  }
}