spark rdd dataset sql udf udaf

Going forward, for convenience, the business logic will be packaged into UDFs and UDAFs.
The idea is to write a wrapper interface for operating on a dataset:
register the metadata description for the HDFS data in the REPL,
write SQL directly in the REPL, and save the result directly.
This way business logic accumulates over time and can be reused.
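
A minimal sketch of that REPL workflow (the UDF, view name, and HDFS path below are illustrative, not from the original):

spark.udf.register("strLen", (s: String) => s.length)               // package logic as a reusable UDF
val result = spark.sql("SELECT strLen(_1) AS len, _1 FROM dataDF")   // query a registered temp view
result.write.mode("overwrite").parquet("hdfs:///tmp/strlen_result")  // save the result directly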






import org.apache.spark.sql.SparkSession
import yunzhi.utils._


object wordCount {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("Spark SQL Data Sources Example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()




    val data = spark.sparkContext.textFile("testdata/c01/wc.txt") // read the input file
      .flatMap(_.split(" "))
      .map((_, 10)) // pair each word with a weight of 10 (so the sum and the count differ below)

    data.cache()


    // RDD
    data.reduceByKey(_ + _)
      .collect()
      .foreach(println) // word count
    // SQL: converting an RDD to a DataFrame requires the implicits import
    import spark.implicits._

    val dataDF = data.toDF()
    dataDF.createOrReplaceTempView("dataDF")
    spark.sql("SELECT _1, sum(_2) AS cnt FROM dataDF GROUP BY _1").show()


    // register a UDF
    spark.udf.register("strLen", (s: String) => s.length)
    spark.sql("SELECT strLen(_1), sum(_2) AS cnt FROM dataDF GROUP BY _1").show()





    // register a UDAF: wordCountUDAF(String) counts the rows in each group
    spark.udf.register("wordCountUDAF", new wordCountUDAF)

    spark.sql("SELECT strLen(_1), wordCountUDAF(_1) AS cnt FROM dataDF GROUP BY _1").show()

    // register a UDAF: IntSumUDAF sums Int values; since every word carries a weight of 10, sumcnt = 10 * countcnt
    spark.udf.register("IntSumUDAF", new IntSumUDAF)
    spark.sql("SELECT _1, wordCountUDAF(_1) AS countcnt, IntSumUDAF(_2) AS sumcnt FROM dataDF GROUP BY _1").show()



  }
}
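
The title mentions Dataset, but the program above only exercises the RDD and DataFrame APIs. A minimal typed Dataset version of the same aggregation, sketched under the assumption that `spark` and `data` are as defined above:

import spark.implicits._
val ds = data.toDS()     // Dataset[(String, Int)]
ds.groupByKey(_._1)      // key by word
  .mapValues(_._2)       // keep only the weight
  .reduceGroups(_ + _)   // sum weights per word
  .show()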



package yunzhi.utils

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

/**
  * Created by l on 16-10-13.
  */
class UDAFUtil {

}



class wordCountUDAF extends UserDefinedAggregateFunction { // Ctrl+I implements the override methods

  /**
    * Specifies the type of the input data. The Array can declare multiple StructFields,
    * because several columns may be passed in from SQL; inside the UDAF they arrive as an array.
    * Here a single String column is expected, matching the call wordCountUDAF(_1).
    * @return
    */
  override def inputSchema: StructType = StructType(Array(StructField("input", StringType, true)))
  /**
    * The buffer type that holds intermediate results during aggregation; it may also
    * declare multiple StructFields.
    * @return
    */
  override def bufferSchema: StructType = StructType(Array(StructField("count", IntegerType, true)))
  /**
    * The result type the UDAF returns once the computation finishes.
    * @return
    */
  override def dataType: DataType = IntegerType

  override def deterministic: Boolean = true
  /**
    * Initializes the buffer of each group before aggregation starts.
    * @param buffer
    */
  override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }
  /**
    * Defines how a new input value is folded into the group's buffer during aggregation.
    * This is the local, per-partition step, analogous to the Combiner in the Hadoop
    * MapReduce model (the Row here is unrelated to a DataFrame Row); each worker runs it first.
    * @param buffer
    * @param input
    */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Int](0) + 1 // count one more row for this group
  }
  /**
    * After the local reduce completes on each node, a global merge combines the
    * per-worker buffers.
    * @param buffer1
    * @param buffer2
    */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Int](0) + buffer2.getAs[Int](0)
  }
  /**
    * Returns the final result of the UDAF.
    * @param buffer
    * @return
    */
  override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
}






// Behaves much like the arguments of the RDD aggregate function: initialize, update, merge …
// (see the sketch after this class)
class IntSumUDAF extends UserDefinedAggregateFunction { // Ctrl+I implements the override methods
  /**
    * Specifies the type of the input data.
    * @return
    */
  override def inputSchema: StructType = StructType(Array(StructField("input", IntegerType, true)))
  /**
    * The buffer type that holds intermediate results during aggregation.
    * @return
    */
  override def bufferSchema: StructType = StructType(Array(StructField("count", IntegerType, true)))
  /**
    * The result type the UDAF returns once the computation finishes.
    * @return
    */
  override def dataType: DataType = IntegerType

  override def deterministic: Boolean = true
  /**
    * Initializes the buffer of each group before aggregation starts.
    * @param buffer
    */
  override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }
  /**
    * Defines how a new input value is folded into the group's buffer during aggregation.
    * This is the local, per-partition step, analogous to the Combiner in the Hadoop
    * MapReduce model (the Row here is unrelated to a DataFrame Row); each worker runs it first.
    * @param buffer
    * @param input
    */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Int](0) + input.getAs[Int](0) // add the incoming value
  }
  /**
    * After the local reduce completes on each node, a global merge combines the
    * per-worker buffers.
    * @param buffer1
    * @param buffer2
    */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Int](0) + buffer2.getAs[Int](0)
  }
  /**
    * Returns the final result of the UDAF.
    * @param buffer
    * @return
    */
  override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
}
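
As the comment before IntSumUDAF notes, the UDAF hooks map onto the arguments of RDD aggregate: initialize supplies the zero value, update is the per-partition seqOp, and merge is the cross-partition combOp. A rough RDD-level analogue (a sketch, assuming `data` is the (word, weight) RDD from the driver program above; it produces one global sum rather than one per group):

// zeroValue ~ initialize, seqOp ~ update, combOp ~ merge
val total = data.aggregate(0)(
  (acc, kv) => acc + kv._2, // seqOp: fold one record into the local buffer
  (a, b) => a + b           // combOp: merge buffers across partitions
)

For a per-group result, the same pattern appears in aggregateByKey.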



