These functions cannot be registered for use on plain RDDs. While working with Spark SQL in the course I found that UDFs are still fairly useful, so I'm writing a separate blog post to record them.
UDF = "a inputs and one output. Equivalent map
UDAF = "a plurality of inputs and one output. Equivalent to reduce
UDTF = "a plurality of input output. Equivalent flatMap. (Required hive environment, not yet tested)
UDF
In fact this just registers a function for use inside SQL statements; don't overthink it. As an example, here is a function that behaves like a case when statement.
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ DataFrame, Row, SparkSession, functions }
import org.apache.spark.sql.functions.{ col, desc, length, row_number, trim, when }
import org.apache.spark.sql.functions.{ countDistinct, sum, count, avg }
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
import java.util.ArrayList
object WordCount {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").getOrCreate()
    val javasc = new JavaSparkContext(sparkSession.sparkContext)

    val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}", "{'id':'8'}",
      "{'id':'9'}", "{'id':'10'}"));
    val nameRDD1df = sparkSession.read.json(nameRDD1)
    nameRDD1df.createTempView("idList")

    // register a function that implements the case when logic
    sparkSession.udf.register("idParse", (str: String) => {
      str match {
        case "7" => "id7"
        case "8" => "id8"
        case "9" => "id9"
        case _   => "others"
      }
    })
    sparkSession.sql("select idParse(id) from idList").show(100)
  }
}
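For reference, a name registered this way is only needed inside SQL strings; the same logic can also be applied through the DataFrame API via functions.udf. A minimal sketch against the nameRDD1df above:

import org.apache.spark.sql.functions.{col, udf}

// the same case-when logic, wrapped as a DataFrame-API UDF
val idParseUdf = udf((str: String) => str match {
  case "7" => "id7"
  case "8" => "id8"
  case "9" => "id9"
  case _   => "others"
})
nameRDD1df.select(idParseUdf(col("id"))).show(100)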
UDAF
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ DataFrame, Row, SparkSession, functions }
import org.apache.spark.sql.functions.{ col, desc, length, row_number, trim, when }
import org.apache.spark.sql.functions.{ countDistinct, sum, count, avg }
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
import java.util.ArrayList
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DataType
class MyMax extends UserDefinedAggregateFunction {
  // the type of the input data; the two ways shown here are equivalent
  // override def inputSchema: StructType = StructType(Array(StructField("input", IntegerType, true)))
  override def inputSchema: StructType = StructType(StructField("input", IntegerType) :: Nil)
  // the type of the data held during aggregation
  // override def bufferSchema: StructType = StructType(Array(StructField("cache", IntegerType, true)))
  override def bufferSchema: StructType = StructType(StructField("max", IntegerType) :: Nil)
  // the type of the returned data
  override def dataType: DataType = IntegerType
  // whether the function always returns the same output for the same input
  override def deterministic: Boolean = true
  // initialize the buffer of each group before aggregation
  override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }
  // within each group: how to fold a newly arrived value into the aggregate
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (input.getInt(0) > buffer.getInt(0))
      buffer(0) = input.getInt(0)
  }
  // merge the partial results of the different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    if (buffer2.getInt(0) > buffer1.getInt(0)) {
      buffer1(0) = buffer2.getInt(0)
    }
  }
  // return the final result
  override def evaluate(buffer: Row): Any = { buffer.getInt(0) }
}
class MyAvg extends UserDefinedAggregateFunction {
  // the type of the input data
  override def inputSchema: StructType = StructType(StructField("input", IntegerType) :: Nil)
  // the type of the intermediate results
  override def bufferSchema: StructType = StructType(
    StructField("sum", IntegerType) :: StructField("count", IntegerType) :: Nil)
  // the type of the returned data
  override def dataType: DataType = IntegerType
  // whether the function always returns the same output for the same input
  override def deterministic: Boolean = true
  // initialization
  override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0; buffer(1) = 0 }
  // the map side: every row of data must pass through this code
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer.update(0, buffer.getInt(0) + input.getInt(0))
    buffer.update(1, buffer.getInt(1) + 1)
  }
  // the reduce side: unlike in update, the input Row here does have a second field
  override def merge(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer.update(0, buffer.getInt(0) + input.getInt(0))
    buffer.update(1, buffer.getInt(1) + input.getInt(1))
  }
  // return the final result
  override def evaluate(finalValue: Row): Int = { finalValue.getInt(0) / finalValue.getInt(1) }
}
object WordCount {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").getOrCreate()
    val javasc = new JavaSparkContext(sparkSession.sparkContext)

    val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}"));
    val nameRDD1df = sparkSession.read.json(nameRDD1)
    val nameRDD2 = javasc.parallelize(Arrays.asList("{'id':'8'}"));
    val nameRDD2df = sparkSession.read.json(nameRDD2)
    val nameRDD3 = javasc.parallelize(Arrays.asList("{'id':'9'}"));
    val nameRDD3df = sparkSession.read.json(nameRDD3)
    val nameRDD4 = javasc.parallelize(Arrays.asList("{'id':'10'}"));
    val nameRDD4df = sparkSession.read.json(nameRDD4)
    nameRDD1df.union(nameRDD2df).union(nameRDD3df).union(nameRDD4df)
      .createOrReplaceTempView("idList")

    // sparkSession.udf.register("myMax", new MyMax)
    sparkSession.udf.register("myAvg", new MyAvg)
    sparkSession.sql("select myAvg(id) from idList").show(100)
  }
}
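Note that UserDefinedAggregateFunction is deprecated since Spark 3.0 in favor of the typed Aggregator registered through functions.udaf. A minimal sketch of the same average under that API (the object name MyAvgAgg is my own; written against the Spark 3 interface):

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions

// typed aggregator: Int input, (sum, count) buffer, Int output
object MyAvgAgg extends Aggregator[Int, (Int, Int), Int] {
  def zero: (Int, Int) = (0, 0)                                     // initial buffer
  def reduce(b: (Int, Int), a: Int): (Int, Int) = (b._1 + a, b._2 + 1)
  def merge(b1: (Int, Int), b2: (Int, Int)): (Int, Int) =
    (b1._1 + b2._1, b1._2 + b2._2)
  def finish(r: (Int, Int)): Int = r._1 / r._2                      // final result
  def bufferEncoder: Encoder[(Int, Int)] =
    Encoders.tuple(Encoders.scalaInt, Encoders.scalaInt)
  def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

// registration then goes through functions.udaf:
// sparkSession.udf.register("myAvg", functions.udaf(MyAvgAgg))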
UDTF (not yet tested, as I have no Hive environment at home)
import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ DataFrame, Row, SparkSession, functions }
import org.apache.spark.sql.functions.{ col, desc, length, row_number, trim, when }
import org.apache.spark.sql.functions.{ countDistinct, sum, count, avg }
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
import java.util.ArrayList
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DataType
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory
import org.apache.hadoop.hive.ql.exec.UDFArgumentException
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
class MyFloatMap extends GenericUDTF {
  override def close(): Unit = {}
  // this method does two things: 1. checks the input parameters; 2. defines the output
  // columns, of which there can be several, since each input row can produce several
  // rows and columns of output
  override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
    if (args.length != 1) {
      throw new UDFArgumentLengthException("UserDefinedUDTF takes only one argument")
    }
    if (args(0).getCategory() != ObjectInspector.Category.PRIMITIVE) {
      throw new UDFArgumentException("UserDefinedUDTF takes a string as a parameter")
    }
    val fieldNames = new java.util.ArrayList[String]
    val fieldOIs = new java.util.ArrayList[ObjectInspector]
    // the default field name of the output column
    fieldNames.add("col1")
    // the field type of the output column
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
  }
  // this method does the actual processing; each call handles exactly one row of data
  override def process(args: Array[AnyRef]): Unit = {
    // split the string into an array of single characters
    val strLst = args(0).toString.split("")
    for (i <- strLst) {
      val tmp: Array[String] = new Array[String](1)
      tmp(0) = i
      // forward must be passed a string array, even if it has only one element
      forward(tmp)
    }
  }
}
object WordCount {
  def main(args: Array[String]): Unit = {
    // create temporary function presumably needs Hive support enabled on the session
    val sparkSession = SparkSession.builder().master("local").enableHiveSupport().getOrCreate()
    val javasc = new JavaSparkContext(sparkSession.sparkContext)

    val nameRDD1 = javasc.parallelize(Arrays.asList("{'id':'7'}"));
    val nameRDD1df = sparkSession.read.json(nameRDD1)
    nameRDD1df.createOrReplaceTempView("idList")

    sparkSession.sql("create temporary function myFloatMap as 'MyFloatMap'")
    sparkSession.sql("select myFloatMap(id) from idList").show(100)
  }
}
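If no Hive environment is at hand, the same one-row-to-many-rows effect can usually be obtained without a UDTF: a plain UDF that returns an array, followed by explode. A sketch reusing the idList view above:

import org.apache.spark.sql.functions.{col, explode, udf}

// split the id string into single characters, then turn the array into rows
val splitChars = udf((s: String) => s.split(""))
sparkSession.table("idList")
  .select(explode(splitChars(col("id"))).as("col1"))
  .show(100)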