package Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
* Created by legotime
*/
object dataSetOperation {
case class Person(name: String, age: Long)
val sparkSession = SparkSession.builder().appName("data set example")
.master("local").getOrCreate()
import sparkSession.implicits._
val rdd = sparkSession.sparkContext.textFile("hdfs://master:9000/src/main/resources/people.txt")
val dataSet = rdd.map(_.split(",")).map(p => Person(p(0), p(1).trim.toLong)).toDS()
//---------------------------------------------------------------Actions--------------------------------------
def dataSet_collect() = {
//Returns an array that contains all rows in this Dataset.
dataSet.collect().foreach(println)
/**
* Person(Michael,29)
* Person(Andy,30)
* Person(Justin,19)
*/
}
def dataSet_collectAsList()={
//Returns a Java list that contains all rows in this Dataset.
println(dataSet.collectAsList)
/**
* [Person(Michael,29), Person(Andy,30), Person(Justin,19)]
*/
}
def dataSet_count() = {
//Returns the number of rows in the Dataset.
println(dataSet.count())
/**
* 3
*/
}
def dataSet_describe()={
//Computes statistics for numeric columns, including count, mean, stddev, min, and max
dataSet.describe("name","age").show
/** describe() computes basic summary statistics for the given columns
* +-------+-------+-----------------+
* |summary| name| age|
* +-------+-------+-----------------+
* | count| 3| 3|
* | mean| null| 26.0|
* | stddev| null|6.082762530298219|
* | min| Andy| 19|
* | max|Michael| 30|
* +-------+-------+-----------------+
*/
}
def dataSet_first()={
//Returns the first row.
println(dataSet.first())
/**
* Person(Michael,29)
*/
}
def dataSet_foreachPartition()={
//Applies a function f to each partition of this Dataset.
dataSet.foreachPartition{ part =>
println(part.toList)
}
/**
* List(Person(Michael,29), Person(Andy,30), Person(Justin,19))
*/
}
def dataSet_head()={
//Returns the first n rows.
dataSet.head(2).foreach(println)
/**
* Person(Michael,29)
* Person(Andy,30)
*/
}
def dataSet_reduce()={
//(Scala-specific) Reduces the elements of this Dataset using the specified binary function.
val data: Dataset[String] = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
println(data.reduce(_+_))
/**
* Michael, 29Andy, 30Justin, 19
*/
}
def dataSet_show() ={
println("----------默认全部打印---------------")
dataSet.show()
println("----------only showing top 2 rows---------------")
dataSet.show(2)
println("---------Displays the top 20 rows of Dataset in a tabular form.------------")
dataSet.show(true)
}
def dataSet_toLocalIterator()={
//Returns an iterator that contains all rows in this Dataset.
val tmp = dataSet.toLocalIterator()
while (tmp.hasNext){
println(tmp.next())
}
/**
* Person(Michael,29)
* Person(Andy,30)
* Person(Justin,19)
*/
}
//---------------------------------------------------------------Basic Dataset functions---------------------
def dataSet_as()={
//Returns a new Dataset where each record has been mapped on to the specified type
val data: DataFrame = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt")
val stringDS = data.as[String]
stringDS.show()
/**
* +-----------+
* | value|
* +-----------+
* |Michael, 29|
* | Andy, 30|
* | Justin, 19|
* +-----------+
*/
}
def dataSet_cache()={
dataSet.cache()
/**
* Caches the Dataset (same as persist() with the default storage level)
*/
}
def dataSet_columns() ={
//Returns all column names as an array.
dataSet.columns.foreach(println)
/**
* name
* age
*/
}
def dataSet_createOrReplaceTempView()={
//Creates a temporary view using the given name.
dataSet.createOrReplaceTempView("myPerson")
val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
dataFrame.show()
/**
* +------+---+
| name|age|
+------+---+
|Justin| 19|
+------+---+
*/
}
def dataSet_createTempView() = {
//Creates a temporary view using the given name.
dataSet.createTempView("myPerson")
val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
dataFrame.map(teenager => "Name: " + teenager(0)).show()
/**
* +------------+
| value|
+------------+
|Name: Justin|
+------------+
*/
}
def dataSet_dtypes()={
//Returns all column names and their data types as an array.
dataSet.dtypes.foreach(println)
/**
* (name,StringType)
(age,LongType)
*/
}
def dataSet_explain()={
//Prints the physical plan to the console for debugging purposes.
//dataSet.explain()
/**
* == Physical Plan ==
Scan ExistingRDD[name#2,age#3L]
*/
//Prints the plans (logical and physical) to the console for debugging purposes.
dataSet.explain(true)
/**
* == Parsed Logical Plan ==
LogicalRDD [name#2, age#3L]
== Analyzed Logical Plan ==
name: string, age: bigint
LogicalRDD [name#2, age#3L]
== Optimized Logical Plan ==
LogicalRDD [name#2, age#3L]
== Physical Plan ==
Scan ExistingRDD[name#2,age#3L]
*/
}
def dataSet_inputFiles()={
println(dataSet.inputFiles.toList)
//List()
}
def dataSet_isLocal() ={
//Returns true if the collect and take methods can be run locally (without any Spark executors).
dataSet.isLocal
//false
}
def dataSet_isStreaming() ={
dataSet.isStreaming
}
def dataSet_javaRDD()={
//Returns the content of the Dataset as a JavaRDD of Ts.
println(dataSet.toJavaRDD)
//MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:222
}
def dataSet_persist()={
//Persist this Dataset with the given storage level.
dataSet.persist()
/**
* By default the storage level is MEMORY_AND_DISK.
*/
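// A minimal sketch of requesting an explicit storage level instead of the default
// (left commented out because dataSet is already persisted above):
// dataSet.persist(org.apache.spark.storage.StorageLevel.MEMORY_ONLY)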
}
def dataSet_printSchema()={
//Prints the schema to the console in a nice tree format.
dataSet.printSchema()
/**
* root
|-- name: string (nullable = true)
|-- age: long (nullable = false)
*/
}
def dataSet_rdd()={
//Represents the content of the Dataset as an RDD of T.
println(dataSet.rdd)
/** Returns the content as an RDD
*MapPartitionsRDD[7] at rdd at dataSetOperation.scala:243
*/
}
def dataSet_schema()={
//Returns the schema of this Dataset.
println(dataSet.schema)
/**
* StructType(StructField(name,StringType,true), StructField(age,LongType,false))
*/
}
def dataSet_toDF()={
//Converts this strongly typed collection of data to generic Dataframe.
dataSet.toDF().show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
//Converts this strongly typed collection of data to generic DataFrame with columns renamed
dataSet.toDF("man","ID").show()//列的数目要和原来一样
/**
* +-------+---+
| man| ID|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
}
def dataSet_toJavaRDD()={
//Returns the content of the Dataset as a JavaRDD of Ts.
println(dataSet.toJavaRDD)
//MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:285
}
def dataSet_unpersist()={
//Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.
dataSet.unpersist(true)
//dataSet.unpersist()
}
def dataSet_write()={
//Interface for saving the content of the non-streaming Dataset out into external storage.
dataSet.write
/**
* Experimental (at the time of writing)
*/
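// A minimal sketch of an actual save (the output path is hypothetical, not from the original article):
// dataSet.write.mode("overwrite").parquet("hdfs://master:9000/tmp/people.parquet")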
}
def dataSet_writeStream()={
//Interface for saving the content of the streaming Dataset out into external storage.
dataSet.writeStream
/**
* Experimental (at the time of writing); writeStream applies only to streaming Datasets, so it does not work on the batch dataSet above. A hedged streaming sketch follows.
*/
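// A hedged sketch of a genuinely streaming write (the input path and console sink are hypothetical, not from the original article), left commented out:
// sparkSession.readStream.text("hdfs://master:9000/stream/in")
//   .writeStream.format("console").start()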
}
def dataSet_registerTempTable()={
//Registers this Dataset as a temporary table using the given name.
// The lifetime of this temporary table is tied to the SparkSession that was used to create this Dataset.
dataSet.registerTempTable("myPerson")
val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
dataFrame.map(teenager => "Name: " + teenager(0)).show()
/**Superseded by createTempView/createOrReplaceTempView and will be removed in a later version.
* +------------+
| value|
+------------+
|Name: Justin|
+------------+
*/
}
//---------------------------------------------------------------Typed transformations----------------------
def dataSet_AS()={
val tmpDS: Dataset[Person] = dataSet.as("oldDataSet")
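// A minimal self-join sketch (an assumption, not from the original article) showing how the alias is referenced in column expressions:
dataSet.as("l").join(dataSet.as("r"), $"l.name" === $"r.name").show()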
}
def dataSet_alias()={
/**Internally this just calls as:
* def alias(alias: Symbol): Dataset[T] = as(alias)
* def alias(alias: String): Dataset[T] = as(alias)
*
* [name: string, age: bigint]
*/
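// A minimal call, just to show that alias keeps the element type:
val aliased: Dataset[Person] = dataSet.alias("ds")
println(aliased)
// prints: [name: string, age: bigint]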
}
def dataSet_coalesce()={
//Internally: Repartition(numPartitions, shuffle = false, logicalPlan)
//Resets the number of partitions of the Dataset, just like coalesce on an RDD.
//With very little data the requested partition count has no visible effect: the source has a single partition, and coalesce(shuffle = false) never increases the partition count.
def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}
dataSet.coalesce(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:0, val: Person(Michael,29)]
[partID:0, val: Person(Andy,30)]
[partID:0, val: Person(Justin,19)]
*/
println(dataSet.coalesce(2).toJavaRDD.rdd.partitions.length)
/**
* 1
*/
}
def dataSet_distinct()={
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100)))
val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
unionedDS.show()
/**
* +--------+---+
|    name|age|
+--------+---+
| Michael| 29|
| Andy| 30|
| Justin| 19|
|legotime|100|
|legotime|100|
+--------+---+
*/
unionedDS.distinct().show()
/**
* +--------+---+
| name|age|
+--------+---+
| Andy| 30|
|legotime|100|
| Michael| 29|
| Justin| 19|
+--------+---+
*/
// distinct internally shuffles (and sorts) the data
}
def dataSet_dropDuplicates()={
//def dropDuplicates(): Dataset[T]
//def distinct(): Dataset[T]
//def dropDuplicates(colNames: Array[String]): Dataset[T]
//Returns a new Dataset with duplicate rows removed, considering only the subset of columns.
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
unionedDS.show()
/**
* +--------+---+
| name|age|
+--------+---+
| Michael| 29|
| Andy| 30|
| Justin| 19|
|legotime|100|
| lego| 19|
|legotime|100|
| lego| 19|
+--------+---+
*/
unionedDS.dropDuplicates().show()
/**
* +--------+---+
| name|age|
+--------+---+
| Andy| 30|
| lego| 19|
|legotime|100|
| Michael| 29|
| Justin| 19|
+--------+---+
*/
unionedDS.dropDuplicates("name").show()
/**
* +--------+---+
| name|age|
+--------+---+
| Michael| 29|
| Andy| 30|
| lego| 19|
|legotime|100|
| Justin| 19|
+--------+---+
*/
unionedDS.dropDuplicates("age").show()
/**
* +--------+---+
| name|age|
+--------+---+
| Michael| 29|
| Justin| 19|
|legotime|100|
| Andy| 30|
+--------+---+
*/
unionedDS.dropDuplicates(Array("name","age")).show()
/**
* +--------+---+
| name|age|
+--------+---+
| Andy| 30|
| lego| 19|
|legotime|100|
| Michael| 29|
| Justin| 19|
+--------+---+
*/
}
def dataSet_except()={
//Returns a new Dataset containing rows in this Dataset but not in another Dataset. This is equivalent to EXCEPT in SQL.
val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
dataSet.except(tmpDataSet).show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Justin| 19|
+-------+---+
*/
}
def dataSet_filter()={
//def filter(func: (T) ⇒ Boolean): Dataset[T]
//def filter(conditionExpr: String): Dataset[T]
//def filter(condition: Column): Dataset[T]
dataSet.filter($"age" > 20).show()
dataSet.filter("age > 20").show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
+-------+---+
*/
}
def dataSet_flatMap()={
//def flatMap[U](func: (T) ⇒ TraversableOnce[U])(implicit arg0: Encoder[U]): Dataset[U]
/**This fails with an encoder/serialization error; a compiling variant is sketched right after this comment.
* dataSet.flatMap{ P =>
P.toString
}.show()
*/
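// A sketch (not from the original article) of a flatMap over the typed Dataset that sidesteps the problem above by emitting plain String rows:
dataSet.flatMap(p => Seq(p.name, p.age.toString)).show()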
val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
val words = tmpDataSet.flatMap(line => line.split(","))
words.show()
/**
* +-------+
| value|
+-------+
|Michael|
| 29|
| Andy|
| 30|
| Justin|
| 19|
+-------+
*/
//Whenever you want to work on the individual values inside the rows, flatMap is hard to avoid; after flatMap many requirements become easy to express, for example:
words.map((word) =>(word,1)).show()
/**
* +-------+---+
| _1| _2|
+-------+---+
|Michael| 1|
| 29| 1|
| Andy| 1|
| 30| 1|
| Justin| 1|
| 19| 1|
+-------+---+
*/
//Or:
words.map((word) =>(word,1)).groupByKey(value => value).count().show()
/**
* +-----------+--------+
| key|count(1)|
+-----------+--------+
| [Andy,1]| 1|
|[Michael,1]| 1|
| [ 29,1]| 1|
| [Justin,1]| 1|
| [ 30,1]| 1|
| [ 19,1]| 1|
+-----------+--------+
*/
/**
* Experimental (at the time of writing)
*/
}
def dataSet_groupByKey()={
val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
val words = tmpDataSet.flatMap(line => line.split(","))
words.groupByKey(_.toLowerCase).count().show()
/**
* +-------+--------+
| value|count(1)|
+-------+--------+
| 29| 1|
| andy| 1|
|michael| 1|
| justin| 1|
| 19| 1|
| 30| 1|
+-------+--------+
*/
/**
* Note that groupByKey is one of the least efficient operators in Spark; prefer other operators where possible (an untyped alternative is sketched below).
*/
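// A sketch of an untyped alternative: the same count through the DataFrame API.
words.toDF("word").groupBy(lower($"word")).count().show()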
}
def dataSet_intersect()={
//Returns a new Dataset containing rows only in both this Dataset and another Dataset. This is equivalent to INTERSECT in SQL (set intersection).
val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
dataSet.show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
dataSet.intersect(tmpDataSet).show()
/**
* +----+---+
|name|age|
+----+---+
|Andy| 30|
+----+---+
*/
}
def dataSet_joinWith()={
/**
* Experimental (at the time of writing)
*/
//Using inner equi-join to join this Dataset returning a Tuple2 for each pair where condition evaluates to true.
val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
dataSet.joinWith(tmpDataSet,tmpDataSet("name") === dataSet("name")).show()
/**
* +---------+---------+
| _1| _2|
+---------+---------+
|[Andy,30]|[Andy,30]|
+---------+---------+
*/
dataSet.joinWith(tmpDataSet,tmpDataSet("age") === dataSet("age")).show()
/**
* +-----------+---------+
| _1| _2|
+-----------+---------+
| [Andy,30]|[Andy,30]|
|[Justin,19]|[lego,19]|
+-----------+---------+
*/
dataSet.joinWith(tmpDataSet,tmpDataSet("age") === 19).show()
/**
* +------------+---------+
| _1| _2|
+------------+---------+
|[Michael,29]|[lego,19]|
| [Andy,30]|[lego,19]|
| [Justin,19]|[lego,19]|
+------------+---------+
*/
}
def dataSet_limit()={
/**
* Returns a new Dataset by taking the first n rows.
* The difference between this function and head is that head is an action and returns an array
* (by triggering query execution) while limit returns a new Dataset.
*/
dataSet.show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
dataSet.limit(2).show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
+-------+---+
*/
}
def dataSet_map()={
//Returns a new Dataset that contains the result of applying func to each element.
dataSet.map(p => p.age).show()
/**
* +-----+
|value|
+-----+
| 29|
| 30|
| 19|
+-----+
*/
}
def dataSet_mapPartitions()={
def myfunc(iter: Iterator[Person]) : Iterator[(Person, Person)] = {
var res = List[(Person, Person)]()
var pre = iter.next
while (iter.hasNext)
{
val cur = iter.next
res = (pre, cur) :: res
pre = cur
}
res.iterator
}
dataSet.mapPartitions(myfunc).show()
/**
* +------------+-----------+
| _1| _2|
+------------+-----------+
| [Andy,30]|[Justin,19]|
|[Michael,29]| [Andy,30]|
+------------+-----------+
*/
/**
* Experimental (at the time of writing)
*/
}
def dataSet_orderBy()={
dataSet.show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
dataSet.orderBy($"age").show()
/**
* +-------+---+
| name|age|
+-------+---+
| Justin| 19|
|Michael| 29|
| Andy| 30|
+-------+---+
*/
}
def dataSet_randomSplit()={
dataSet.randomSplit(Array(0.6,0.4),0L).foreach{ds =>
ds.show()
}
/**
* +-------+---+
| name|age|
+-------+---+
| Andy| 30|
|Michael| 29|
+-------+---+
*/
/**
* +------+---+
| name|age|
+------+---+
|Justin| 19|
+------+---+
*/
}
def dataSet_randomSplitAsList()={
//Returns a Java list that contains randomly split Dataset with the provided weights.
println(dataSet.randomSplitAsList(Array(0.6,0.4),0L).size())
//2
}
def dataSet_repartition()={
def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}
dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:0, val: Person(Michael,29)]
[partID:0, val: Person(Justin,19)]
[partID:1, val: Person(Andy,30)]
*/
dataSet.repartition($"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:71, val: Person(Michael,29)]
[partID:164, val: Person(Andy,30)]
[partID:169, val: Person(Justin,19)]
*/
dataSet.repartition(2,$"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:0, val: Person(Andy,30)]
[partID:1, val: Person(Michael,29)]
[partID:1, val: Person(Justin,19)]
*/
}
def dataSet_sample()={
//Returns a new Dataset by sampling a fraction of rows, using a random seed.
dataSet.sample(withReplacement = true,0.6,0L).show()
/**
* +----+---+
|name|age|
+----+---+
|Andy| 30|
+----+---+
*/
}
def dataSet_select()={
dataSet.select($"name").show()
/**
* +-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
}
def dataSet_sort()={
dataSet.show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
dataSet.sort($"name",$"age".desc).show()
/**
* +-------+---+
| name|age|
+-------+---+
| Andy| 30|
| Justin| 19|
|Michael| 29|
+-------+---+
*/
}
def dataSet_sortWithinPartitions()={
def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
}
dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:0, val: Person(Michael,29)]
[partID:0, val: Person(Justin,19)]
[partID:1, val: Person(Andy,30)]
*/
dataSet.repartition(2).sortWithinPartitions($"age").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
/**
* [partID:0, val: Person(Justin,19)]
[partID:0, val: Person(Michael,29)]
[partID:1, val: Person(Andy,30)]
*/
}
def dataSet_transform()={
//Concise syntax for chaining custom transformations.
dataSet.show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
dataSet.transform{ p =>p.sort($"age".desc)}.show()
/**
* +-------+---+
| name|age|
+-------+---+
| Andy| 30|
|Michael| 29|
| Justin| 19|
+-------+---+
*/
}
def dataSet_union()={
//Returns a new Dataset containing union of rows in this Dataset and another Dataset. This is equivalent to UNION ALL in SQL.
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
unionedDS.show()
/**
* +--------+---+
| name|age|
+--------+---+
| Michael| 29|
| Andy| 30|
| Justin| 19|
|legotime|100|
| lego| 19|
|legotime|100|
| lego| 19|
+--------+---+
*/
}
def dataSet_where()={
dataSet.where($"age">20).show()
dataSet.where("age > 20").show()
dataSet.filter($"age">20).show()
dataSet.filter("age >20").show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
+-------+---+
*/
}
def dataSet_unionAll()={
/**
* Annotation @deprecated
Deprecate (Since version 2.0.0) use union()
*/
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
val unionedDS = dataSet.unionAll(tmpDataSet).union(tmpDataSet)
unionedDS.show()
/**
* +--------+---+
| name|age|
+--------+---+
| Michael| 29|
| Andy| 30|
| Justin| 19|
|legotime|100|
| lego| 19|
|legotime|100|
| lego| 19|
+--------+---+
*/
}
//---------------------------------------------------------------Untyped transformations---------------------
def dataSet_agg()={
// import org.apache.spark.sql.functions._
dataSet.groupBy($"age",$"name").agg(max($"name"), avg($"age")).show()
/**
* +---+-------+---------+--------+
|age| name|max(name)|avg(age)|
+---+-------+---------+--------+
| 29|Michael| Michael| 29.0|
| 30| Andy| Andy| 30.0|
| 19| Justin| Justin| 19.0|
+---+-------+---------+--------+
*/
dataSet.groupBy().agg(max($"name"), avg($"age")).show()
dataSet.agg(max($"name"), avg($"age")).show()
// dataSet.agg(...) is a shorthand for dataSet.groupBy().agg(...)
/**
* +---------+--------+
|max(name)|avg(age)|
+---------+--------+
| Michael| 26.0|
+---------+--------+
*/
}
def dataSet_apply()={
//Selects column based on the column name and return it as a Column. Note that the column name can also reference to a nested column like a.b.
println(dataSet.apply("age"))
//age
}
def dataSet_col()={
//Selects column based on the column name and return it as a Column.
dataSet.select(col("age")).show()
/**
* +---+
|age|
+---+
| 29|
| 30|
| 19|
+---+
*/
}
def dataSet_cube()={
//Create a multi-dimensional cube for the current Dataset using the specified columns, so we can run aggregation on them.
dataSet.cube("age","name").agg(max($"age")).show()
/**
* +----+-------+--------+
| age| name|max(age)|
+----+-------+--------+
|null|Michael| 29|
|null| null| 30|
| 29|Michael| 29|
| 19| null| 19|
| 30| Andy| 30|
| 30| null| 30|
|null| Andy| 30|
| 19| Justin| 19|
| 29| null| 29|
|null| Justin| 19|
+----+-------+--------+
*/
}
def dataSet_drop()={
//Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column name(s).
dataSet.drop("age").show()
//Returns a new Dataset with a column dropped. This version of drop accepts a Column rather than a name.
// This is a no-op if the Dataset doesn't have a column with an equivalent expression.
dataSet.drop(col = col("age")).show()
/**
* +-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
}
def dataSet_groupBy()={
dataSet.groupBy(col("age")).agg{Map(
"age"->"avg",
"name"->"max"
)}.show()
dataSet.groupBy($"age").agg{Map(
"age"->"avg",
"name"->"max"
)}.show()
/**
* +---+--------+---------+
|age|avg(age)|max(name)|
+---+--------+---------+
| 29| 29.0| Michael|
| 19| 19.0| Justin|
| 30| 30.0| Andy|
+---+--------+---------+
*/
}
def dataSet_join()={
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
dataSet.join(tmpDataSet).show()
/**
* +-------+---+--------+---+
|   name|age|    name|age|
+-------+---+--------+---+
|Michael| 29|legotime|100|
|Michael| 29| lego| 19|
| Andy| 30|legotime|100|
| Andy| 30| lego| 19|
| Justin| 19|legotime|100|
| Justin| 19| lego| 19|
+-------+---+--------+---+
*/
dataSet.join(tmpDataSet,"age").show()
/**
* +---+------+----+
|age| name|name|
+---+------+----+
| 19|Justin|lego|
+---+------+----+
*/
dataSet.join(tmpDataSet,Seq("age","name")).show()
/**
* +---+----+
|age|name|
+---+----+
+---+----+
*/
}
def dataSet_na()={
//Returns a DataFrameNaFunctions for working with missing data.
dataSet.na.drop("all").show()
/**
* +-------+---+
| name|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
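// A sketch of filling instead of dropping (a no-op on this data, since neither column contains nulls):
dataSet.na.fill(Map("name" -> "unknown", "age" -> 0L)).show()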
}
def dataSet_rollup()={
//Create a multi-dimensional rollup for the current Dataset using the specified columns, so we can run aggregation on them
dataSet.rollup("age", "name").avg().show()
/**
* +----+-------+--------+
| age| name|avg(age)|
+----+-------+--------+
|null| null| 26.0|
| 29|Michael| 29.0|
| 19| null| 19.0|
| 30| Andy| 30.0|
| 30| null| 30.0|
| 19| Justin| 19.0|
| 29| null| 29.0|
+----+-------+--------+
*/
}
def dataSet_select_2()={
dataSet.select("age","name","age").show()
/**
* +---+-------+---+
|age| name|age|
+---+-------+---+
| 29|Michael| 29|
| 30| Andy| 30|
| 19| Justin| 19|
+---+-------+---+
*/
}
def dataSet_selectExpr()={
//Selects a set of SQL expressions. This is a variant of select that accepts SQL expressions.
dataSet.selectExpr("name","age+1","name as NAME","age as AGE").show()
dataSet.select(expr("name"),expr("age+1"), expr("name as NAME"), expr("age as AGE"))
/**
* +-------+---------+-------+---+
| name|(age + 1)| NAME|AGE|
+-------+---------+-------+---+
|Michael| 30|Michael| 29|
| Andy| 31| Andy| 30|
| Justin| 20| Justin| 19|
+-------+---------+-------+---+
*/
}
def dataSet_stat()={
//Returns a DataFrameStatFunctions for working statistic functions support.
//The keys of the fractions map must be actual values of the sampled column; "age" and "name" are not values of the age column, so nothing is selected and the result is empty (a working variant is sketched below).
dataSet.stat.sampleBy("age",Map("age"->0.5,"name"->0.5),0L).show()
/**
* +----+---+
|name|age|
+----+---+
+----+---+
*/
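// A sketch keyed by actual values of the age column (an assumption, not from the original article): keep the rows whose age is 19 or 30.
dataSet.stat.sampleBy("age", Map(19L -> 1.0, 30L -> 1.0), 0L).show()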
}
def dataSet_withColumn()={
//Returns a new Dataset by adding a column or replacing the existing column that has the same name.
dataSet.withColumn("NAME",col("name")).show()
/**
* +-------+---+
| NAME|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
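// A sketch of the other common use: adding a derived column.
dataSet.withColumn("ageNextYear", $"age" + 1).show()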
}
def dataSet_withColumnRenamed()={
dataSet.withColumnRenamed("name","newName").show()
/**
* +-------+---+
|newName|age|
+-------+---+
|Michael| 29|
| Andy| 30|
| Justin| 19|
+-------+---+
*/
}
def dataSet_explode()={
/**
* Annotations @deprecated
Deprecated (Since version 2.0.0) use flatMap() or select() with functions.explode() instead
Since 2.0.0
*/
// explode itself is deprecated; the sketch below uses select() with functions.explode() as the scaladoc suggests.
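// A minimal sketch of that replacement (the inline array data is hypothetical, not from the original article):
val withArray = Seq(("Michael", Seq(29L)), ("Andy", Seq(30L, 31L))).toDF("name", "ages")
withArray.select($"name", explode($"ages").as("age")).show()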
}
def main(args: Array[String]) {
//dataSet_collect()
//dataSet_collectAsList
//dataSet_count
//dataSet_describe
//dataSet_first
//dataSet_foreachPartition
//dataSet_head
//dataSet_reduce
//dataSet_show
//dataSet_toLocalIterator
//dataSet_as
//dataSet_cache
//dataSet_columns
//dataSet_createOrReplaceTempView
//dataSet_createTempView
//dataSet_dtypes
//dataSet_explain
//dataSet_inputFiles
//println(dataSet_isLocal)
//dataSet_isStreaming
//dataSet_javaRDD
//dataSet_persist
//dataSet_printSchema
//dataSet_rdd
//dataSet_schema
//dataSet_toDF
//dataSet_toJavaRDD
//dataSet_unpersist
//dataSet_write
//dataSet_writeStream
//dataSet_registerTempTable
//dataSet_alias
//dataSet_coalesce
//dataSet_distinct
//dataSet_dropDuplicates
//dataSet_except
//dataSet_filter
//dataSet_flatMap
//dataSet_groupByKey
//dataSet_intersect
//dataSet_joinWith
//dataSet_limit
//dataSet_map
dataSet_mapPartitions()
//dataSet_orderBy
//dataSet_randomSplit
//dataSet_randomSplitAsList
//dataSet_repartition
//dataSet_sample
//dataSet_select
//dataSet_sort
//dataSet_sortWithinPartitions
//dataSet_transform
//dataSet_union
//dataSet_where
//dataSet_unionAll
//dataSet_agg
//dataSet_apply
//dataSet_col
//dataSet_cube
//dataSet_drop
//dataSet_groupBy
//dataSet_join
//dataSet_na
//dataSet_rollup
//dataSet_select_2
//dataSet_selectExpr
//dataSet_stat
//dataSet_withColumn
//dataSet_withColumnRenamed
//dataSet_explode
}
}
---------------------
Author: legotime
Source: CSDN
Original: https://blog.csdn.net/legotime/article/details/52562796?utm_source=copy
Copyright notice: this is the author's original article; please include a link to the original when reposting.
Spark source reading notes, Dataset (Part 2): Actions, functions, and transformations on Dataset