With Hadoop installed on Windows 10, create a Maven project with IntelliJ IDEA.
<properties>
    <spark.version>2.2.0</spark.version>
    <scala.version>2.11</scala.version>
    <java.version>1.8</java.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.16</version>
    </dependency>
</dependencies>

<build>
    <finalName>learnspark</finalName>
    <plugins>
        <!-- Compiles Scala sources during the Maven build. -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <!-- Builds a fat jar (jar-with-dependencies) at the package phase. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <archive>
                    <manifest>
                        <!-- Fully-qualified entry point; must match the object
                             that defines main() in the program below. -->
                        <mainClass>com.zouxxyy.spark.sql.UDF</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Data preparation:
{"name":"张3", "age":20}
{"name":"李4", "age":20}
{"name":"王5", "age":20}
{"name":"赵6", "age":20}
Path:
data/input/user/user.json
Program:
package com.zouxxyy.spark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DoubleType, LongType, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder, Encoders, Row, SparkSession, TypedColumn}

/**
 * UDF: demonstrates three kinds of user-defined functions in Spark SQL:
 * a plain scalar UDF, an untyped (weakly typed) UDAF, and a typed
 * (strongly typed) Aggregator used through the Dataset DSL.
 */
object UDF {

  def main(args: Array[String]): Unit = {

    // Point Hadoop at a local winutils install on Windows, but only when the
    // caller has not already configured it — the original code unconditionally
    // overwrote this with a machine-specific path, breaking other machines.
    if (System.getProperty("hadoop.home.dir") == null) {
      System.setProperty("hadoop.home.dir", "D:\\gitworkplace\\winutils\\hadoop-2.7.1")
    }

    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("UDF")

    // Create the SparkSession; implicits are needed for .as[UserBean] below.
    val spark: SparkSession = SparkSession.builder.config(sparkConf).getOrCreate()
    import spark.implicits._

    // Reading JSON yields a DataFrame; register it as a temp view for SQL.
    val frame: DataFrame = spark.read.json("data/input/user/user.json")
    frame.createOrReplaceTempView("user")

    // Case 1: a simple scalar UDF that prefixes each name.
    spark.udf.register("addName", (x: String) => "Name:" + x)
    spark.sql("select addName(name) from user").show()

    // Case 2: a weakly typed (untyped) aggregate function.
    val udaf1 = new MyAgeAvgFunction
    spark.udf.register("avgAge", udaf1)
    spark.sql("select avgAge(age) from user").show()

    // Case 3: a strongly typed Aggregator, converted to a column for the
    // Dataset DSL.
    val udaf2 = new MyAgeAvgClassFunction
    val avgCol: TypedColumn[UserBean, Double] = udaf2.toColumn.name("aveAge")

    // Use the strongly typed Dataset with DSL-style syntax.
    val userDS: Dataset[UserBean] = frame.as[UserBean]
    userDS.select(avgCol).show()

    spark.stop()
  }
}

/**
 * Weakly typed (untyped) user-defined aggregate function computing the
 * average age. Buffer layout: (sum: Long, count: Long).
 */
class MyAgeAvgFunction extends UserDefinedAggregateFunction {

  // Input schema: a single Long column ("age").
  override def inputSchema: StructType = {
    new StructType().add("age", LongType)
  }

  // Intermediate buffer schema used while aggregating.
  override def bufferSchema: StructType = {
    new StructType().add("sum", LongType).add("count", LongType)
  }

  // Result type of the aggregation.
  override def dataType: DataType = DoubleType

  // Same input always produces the same output.
  override def deterministic: Boolean = true

  // Initialize the buffer before aggregation; fields are positional.
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L // sum
    buffer(1) = 0L // count
  }

  // Fold one input row into the buffer.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getLong(0) + input.getLong(0)
    buffer(1) = buffer.getLong(1) + 1
  }

  // Merge partial buffers from multiple partitions/nodes.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  // Produce the final result from the merged buffer.
  override def evaluate(buffer: Row): Any = {
    buffer.getLong(0).toDouble / buffer.getLong(1)
  }
}

// Input row type; spark.read.json infers integral numbers as BigInt.
case class UserBean(name: String, age: BigInt)

// Mutable aggregation buffer for the typed Aggregator.
case class AvgBuffer(var sum: BigInt, var count: Int)

/**
 * Strongly typed aggregate function (Aggregator) computing the average age
 * of UserBean rows.
 */
class MyAgeAvgClassFunction extends Aggregator[UserBean, AvgBuffer, Double] {

  // Zero value: an empty buffer.
  override def zero: AvgBuffer = {
    AvgBuffer(0, 0)
  }

  // Fold one input element into the buffer.
  override def reduce(b: AvgBuffer, a: UserBean): AvgBuffer = {
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }

  // Merge two partial buffers.
  override def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = {
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }

  // Compute the final result from the merged buffer.
  override def finish(reduction: AvgBuffer): Double = {
    reduction.sum.toDouble / reduction.count
  }

  override def bufferEncoder: Encoder[AvgBuffer] = Encoders.product

  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}