1.DataFrame
1.1 Wrapping the data in a case class (Case class封装数据)
wc.txt
hello spark tom tom jim hello tom spark
package com.wedoctor.sparksql
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameWC {
  // Silence Spark's verbose INFO logging so only errors reach the console.
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val sc: SparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Required for the RDD -> DataFrame toDF() conversion below.
    import sqlContext.implicits._

    // Read the text file, split each line into words, and wrap every word
    // in the Words case class so Spark can infer a one-column schema.
    val wcData: RDD[String] = sc.textFile("wc.txt")
    val words: RDD[Words] = wcData.flatMap(_.split(" ")).map(Words(_))
    val frame: DataFrame = words.toDF()

    // SQL style: expose the DataFrame as a temp view and query it with SQL.
    // createOrReplaceTempView replaces registerTempTable, which has been
    // deprecated since Spark 2.0.
    frame.createOrReplaceTempView("t_word")
    sqlContext.sql("select * from t_word").show()

    // DSL style: column selection through the DataFrame API.
    frame.select("word").show(2)

    sc.stop()
  }
}
/** One word from the input file; Spark derives the DataFrame schema from this.
  * Marked `final`: extending a case class is a well-known Scala pitfall. */
final case class Words(word: String)
1.2 Wrapping the data in Row objects (Row封装数据)
person.txt
zs,20,zs.163.com ls,22,ls.qq.com ww,22,ww.qq.com zl,21,xxx.qq.com zl,26,xxx.qq.com ww,122,ww.qq.com
package com.wedoctor.sparksql
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object DateFrameDemo {
  // Keep console output readable by suppressing Spark's INFO/WARN logs.
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val sparkContext: SparkContext = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sparkContext)

    // Parse each CSV line of person.txt into a generic Row (name, age, email).
    val lines: RDD[String] = sparkContext.textFile("person.txt")
    val rowRDD: RDD[Row] = lines.map { line =>
      val fields: Array[String] = line.split(",")
      Row(fields(0), fields(1).toInt, fields(2))
    }

    // Explicit schema describing the columns carried by each Row.
    val schema: StructType = StructType(
      List(
        StructField("name", StringType, true),
        StructField("age", IntegerType, false),
        StructField("email", StringType, true)
      )
    )

    // Combine the rows with the schema into a DataFrame and query via the DSL.
    val pdf: DataFrame = sqlContext.createDataFrame(rowRDD, schema)
    pdf.select("name").show()

    sparkContext.stop()
  }
}
2.DataSet
person.txt
zs,20,zs.163.com ls,22,ls.qq.com ww,22,ww.qq.com zl,21,xxx.qq.com zl,26,xxx.qq.com ww,122,ww.qq.com
package com.wedoctor.sparksql
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object DataSetDemo {
  // Suppress Spark's verbose logging so example output is readable.
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()

    // Implicit encoders needed for Dataset.map and toDF below.
    import spark.implicits._

    // Each line of person.txt is "name,age,email"; parse into a typed tuple.
    val ds: Dataset[String] = spark.read.textFile("person.txt")
    val ds2: Dataset[(String, Int, String)] = ds.map(str => {
      val arr: Array[String] = str.split(",")
      (arr(0), arr(1).toInt, arr(2))
    })
    /**
     * Inferred schema before renaming:
     * root
     * |-- _1: string (nullable = true)
     * |-- _2: integer (nullable = false)
     * |-- _3: string (nullable = true)
     */

    // DSL style. BUG FIX: column names must follow the tuple order
    // (_1 = name, _2 = age, _3 = email). The original passed
    // ("age", "name", "email"), mislabeling the first two columns so
    // select("age") actually displayed names.
    val ds3: DataFrame = ds2.toDF("name", "age", "email")
    ds3.select("age").show(3)

    // SQL style: register the DataFrame as a temp view and query it.
    ds3.createTempView("t_person_test")
    spark.sql("select * from t_person_test").show()

    spark.close()
  }
}