SparkSQL Series (6): UDF, UDAF, and UDTF in Spark SQL

Note: functions registered this way are used from Spark SQL / DataFrames; there is no equivalent registration for the RDD API.

While working with Spark SQL I found that UDFs come up fairly often, so they are worth recording in a post of their own.

UDF = "a inputs and one output. Equivalent map

UDAF = "a plurality of inputs and one output. Equivalent to reduce

UDTF = "a plurality of input output. Equivalent flatMap. (Required hive environment, not yet tested)

UDF

        This simply registers a function that can then be called inside a SQL statement; it is less complicated than it sounds. The example below implements the equivalent of a CASE WHEN expression.

        import java.util.Arrays

        import org.apache.spark.api.java.JavaSparkContext
        import org.apache.spark.sql.SparkSession

        object WordCount {

                def main(args: Array[String]): Unit = {
                        val sparkSession = SparkSession.builder().master("local").getOrCreate()
                        val javasc = new JavaSparkContext(sparkSession.sparkContext)

                        val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}", "{'id':'8'}",
                                "{'id':'9'}","{'id':'10'}"));
                        val nameRDD1df = sparkSession.read.json(nameRDD1)

                        nameRDD1df.createTempView("idList")
        
                        sparkSession.udf.register("idParse",(str:String)=>{//注册一个函数,实现case when的函数
                                str match{
                                        case "7" => "id7"
                                        case "8" => "id8"
                                        case "9" => "id9"
                                        case _=>"others"
                                }
                        })
                        val data = sparkSession.sql("select idParse(id) from idList").show(100)
                }
        }
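
The same function can also be applied through the DataFrame API instead of a SQL string. A minimal sketch, reusing nameRDD1df from the example above (the column alias idLabel is just illustrative):

        import org.apache.spark.sql.functions.{ col, udf }

        // wrap the same pattern match in a UDF usable from select/withColumn
        val idParseUdf = udf((str: String) => str match {
                case "7" => "id7"
                case "8" => "id8"
                case "9" => "id9"
                case _   => "others"
        })

        nameRDD1df.select(idParseUdf(col("id")).alias("idLabel")).show()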

UDAF

        import java.util.Arrays

        import org.apache.spark.api.java.JavaSparkContext
        import org.apache.spark.sql.{ Row, SparkSession }
        import org.apache.spark.sql.expressions.{ MutableAggregationBuffer, UserDefinedAggregateFunction }
        import org.apache.spark.sql.types.{ DataType, IntegerType, StructField, StructType }

        class MyMax extends UserDefinedAggregateFunction {
                // type of the input data; both of the following forms work
                // override def inputSchema: StructType = StructType(Array(StructField("input", IntegerType, true)))
                override def inputSchema: StructType = StructType(StructField("input", IntegerType) :: Nil)
                // type of the intermediate buffer used during aggregation
                // override def bufferSchema: StructType = StructType(Array(StructField("cache", IntegerType, true)))
                override def bufferSchema: StructType = StructType(StructField("max", IntegerType) :: Nil)
                // type of the returned value
                override def dataType: DataType = IntegerType
                // whether the same input always produces the same result
                override def deterministic: Boolean = true
                // initialize the buffer before aggregation starts
                override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }
                // how the buffer is updated when a new value arrives within a group
                override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
                        if (input.getInt(0) > buffer.getInt(0))
                                buffer(0) = input.getInt(0)
                }
                // merge the partial results of different partitions
                override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
                        if (buffer2.getInt(0) > buffer1.getInt(0)) {
                                buffer1(0) = buffer2.getInt(0)
                        }
                }
                // return the final result
                override def evaluate(buffer: Row): Any = { buffer.getInt(0) }
        }


        class MyAvg extends UserDefinedAggregateFunction {
                // type of the input data
                override def inputSchema: StructType = StructType(StructField("input", IntegerType) :: Nil)
                // type of the intermediate buffer: a running sum and a count
                override def bufferSchema: StructType = StructType(
                        StructField("sum", IntegerType) :: StructField("count", IntegerType) :: Nil)
                // type of the returned value
                override def dataType: DataType = IntegerType
                // whether the same input always produces the same result
                override def deterministic: Boolean = true
                // initialize the buffer
                override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0; buffer(1) = 0 }

                // map side: every input row passes through this method
                override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
                        buffer.update(0, buffer.getInt(0) + input.getInt(0))
                        buffer.update(1, buffer.getInt(1) + 1)
                }
                // reduce side: here the incoming Row is another buffer, so unlike the Row in update
                // it does have a second (count) field
                override def merge(buffer: MutableAggregationBuffer, input: Row): Unit = {
                        buffer.update(0, buffer.getInt(0) + input.getInt(0))
                        buffer.update(1, buffer.getInt(1) + input.getInt(1))
                }
                // return the final result
                override def evaluate(finalValue: Row): Int = { finalValue.getInt(0) / finalValue.getInt(1) }
        }

        object WordCount {

                def main(args: Array[String]): Unit = {
                        val sparkSession = SparkSession.builder().master("local").getOrCreate()
                        val javasc = new JavaSparkContext(sparkSession.sparkContext)

                        val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}"));
                        val nameRDD1df = sparkSession.read.json(nameRDD1)
                        val nameRDD2 = javasc.parallelize(Arrays.asList("{'id':'8'}"));
                        val nameRDD2df = sparkSession.read.json(nameRDD2)
                        val nameRDD3 = javasc.parallelize(Arrays.asList("{'id':'9'}"));
                        val nameRDD3df = sparkSession.read.json(nameRDD3)
                        val nameRDD4 = javasc.parallelize(Arrays.asList("{'id':'10'}"));
                        val nameRDD4df = sparkSession.read.json(nameRDD4)

                        nameRDD1df.union(nameRDD2df).union(nameRDD3df).union(nameRDD4df).createOrReplaceTempView("idList")

                        // sparkSession.udf.register("myMax", new MyMax)
                        sparkSession.udf.register("myAvg", new MyAvg)

                        sparkSession.sql("select myAvg(id) from idList").show(100)
                }
        }
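
For reference, and as an aside not in the original post: in Spark 3.x, UserDefinedAggregateFunction is deprecated in favour of the Aggregator API registered through functions.udaf. A minimal sketch of the same average under that assumption (class names MyAvg3/AvgBuffer are just illustrative):

        import org.apache.spark.sql.{ Encoder, Encoders }
        import org.apache.spark.sql.expressions.Aggregator
        import org.apache.spark.sql.functions

        // running (sum, count) buffer for the average
        case class AvgBuffer(var sum: Long, var count: Long)

        object MyAvg3 extends Aggregator[Long, AvgBuffer, Double] {
                def zero: AvgBuffer = AvgBuffer(0L, 0L)
                def reduce(b: AvgBuffer, x: Long): AvgBuffer = { b.sum += x; b.count += 1; b }
                def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = { b1.sum += b2.sum; b1.count += b2.count; b1 }
                def finish(b: AvgBuffer): Double = b.sum.toDouble / b.count
                def bufferEncoder: Encoder[AvgBuffer] = Encoders.product[AvgBuffer]
                def outputEncoder: Encoder[Double] = Encoders.scalaDouble
        }

        // inside main: register and call it just like the UDAF above
        sparkSession.udf.register("myAvg3", functions.udaf(MyAvg3, Encoders.scalaLong))
        sparkSession.sql("select myAvg3(cast(id as long)) from idList").show()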

UDTF (not yet tested, since I do not have a Hive environment available)

       import java.util.Arrays

       import org.apache.spark.api.java.JavaSparkContext
       import org.apache.spark.sql.SparkSession
       import org.apache.hadoop.hive.ql.exec.{ UDFArgumentException, UDFArgumentLengthException }
       import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
       import org.apache.hadoop.hive.serde2.objectinspector.{ ObjectInspector, ObjectInspectorFactory, StructObjectInspector }
       import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory

       class MyFloatMap extends GenericUDTF {
              override def close(): Unit = {}

              // this method 1. validates the input arguments and 2. defines the output columns;
              // there can be more than one, i.e. one input row can produce several output columns
              override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
                     if (args.length != 1) {
                            throw new UDFArgumentLengthException("UserDefinedUDTF takes only one argument")
                     }
                     if (args(0).getCategory() != ObjectInspector.Category.PRIMITIVE) {
                            throw new UDFArgumentException("UserDefinedUDTF takes a string as a parameter")
                     }

                     val fieldNames = new java.util.ArrayList[String]
                     val fieldOIs = new java.util.ArrayList[ObjectInspector]

                     // field name of the output column
                     fieldNames.add("col1")
                     // field type of the output column
                     fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)

                     ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
              }

              // this method does the actual work; it is called once per input row
              override def process(args: Array[AnyRef]): Unit = {
                     // split the string into an array of single characters
                     val strLst = args(0).toString.split("")
                     for (i <- strLst) {
                            val tmp: Array[String] = new Array[String](1)
                            tmp(0) = i
                            // forward must be passed an array, even if it holds only one element
                            forward(tmp)
                     }
              }
       }

       object WordCount {

              def main(args: Array[String]): Unit = {
                     val sparkSession = SparkSession.builder().master("local").getOrCreate()
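                     // note (assumption): CREATE TEMPORARY FUNCTION for a Hive UDTF class typically requires
                     // a session built with enableHiveSupport() and Hive libraries on the classpath; untested here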
                     val javasc = new JavaSparkContext(sparkSession.sparkContext)

                     val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}"));
                     val nameRDD1df = sparkSession.read.json(nameRDD1)

                     nameRDD1df.createOrReplaceTempView("idList")

                     sparkSession.sql("create temporary function myFloatMap as 'MyFloatMap'")

                     val data = sparkSession.sql("select myFloatMap(id) from idList").show(100)

              }
       }
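
If no Hive setup is available, a similar one-row-to-many-rows effect can often be achieved with built-in functions instead of a UDTF. A minimal sketch using split plus explode on the same nameRDD1df (the column alias ch is just illustrative):

       import org.apache.spark.sql.functions.{ col, explode, split }

       // roughly mirrors the UDTF above: one output row per character of id
       nameRDD1df
              .select(explode(split(col("id"), "")).alias("ch"))
              .filter(col("ch") =!= "")   // drop empty tokens that split("") can produce
              .show()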
