First, add the Maven dependencies and the packaging plugins:
<properties>
    <spark.version>2.2.2</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.codehaus.janino</groupId>
        <artifactId>janino</artifactId>
        <version>3.0.8</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.3.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.39</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang.modules</groupId>
        <artifactId>scala-xml_2.11</artifactId>
        <version>1.1.1</version>
    </dependency>
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <plugins>
        <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <scalaVersion>2.11.8</scalaVersion>
                <args>
                    <!-- target the same JVM version as the compiler plugin below -->
                    <arg>-target:jvm-1.8</arg>
                </args>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-eclipse-plugin</artifactId>
            <configuration>
                <downloadSources>true</downloadSources>
                <buildcommands>
                    <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                </buildcommands>
                <additionalProjectnatures>
                    <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                </additionalProjectnatures>
                <classpathContainers>
                    <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                    <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                </classpathContainers>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.8.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <!-- Package third-party dependencies into the jar -->
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <!--<manifest><mainClass></mainClass></manifest>-->
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Next, write a Scala object that implements a UDF mimicking Hive's string-length function:
import org.apache.spark.sql.SparkSession

/**
  * @Author Daniel
  * @Description Spark SQL UDF development
  **/
object SparkSQLUDF {
  def main(args: Array[String]): Unit = {
    // Usage hint
    if (args == null || args.length < 2) {
      println(
        """
          |Parameter Errors! Usage: <basicPath> <infoPath>
        """.stripMargin)
      System.exit(-1)
    }
    // Receive the arguments
    val Array(basicPath, infoPath) = args
    // Configure the SparkSession that connects to Hive
    val spark = SparkSession.builder()
      .appName("SparkSQLUDF")
      .master("local[*]")
      .enableHiveSupport() // enable Hive operations
      .getOrCreate()
    // Call spark.sql to run SQL directly; note that multiple SQL statements cannot be combined into a single spark.sql call
    spark.sql("create database if not exists spark_on_hive")
    // Create teacher_basic
    spark.sql(
      """
        |create table if not exists spark_on_hive.teacher_basic (
        |  name string,
        |  age int,
        |  married boolean,
        |  classes int
        |) row format delimited
        |fields terminated by ','
      """.stripMargin)
    // Create teacher_info
    spark.sql(
      """
        |create table if not exists spark_on_hive.teacher_info (
        |  name string,
        |  height double
        |) row format delimited
        |fields terminated by ','
      """.stripMargin)
    // Load local data
    spark.sql(
      s"""
         |load data local inpath '${basicPath}' into table spark_on_hive.teacher_basic
      """.stripMargin)
    spark.sql(
      s"""
         |load data local inpath '${infoPath}' into table spark_on_hive.teacher_info
      """.stripMargin)
    // Join the two tables
    val joinSQL =
      """
        |select
        |  b.name,
        |  b.age,
        |  b.married,
        |  b.classes,
        |  i.height
        |from spark_on_hive.teacher_basic b
        |left join spark_on_hive.teacher_info i on b.name = i.name
      """.stripMargin
    // Get the result of the SQL
    val joinDF = spark.sql(joinSQL)
    // Save the query result into a table
    joinDF.write.saveAsTable("spark_on_hive.teacher")
    // Register the UDF: the parameter is a String and the return value is an Int
    spark.udf.register[Int, String]("myLen", myStrLength)
    val udfSQL =
      """
        |select
        |mylen(name) namelen
        |from spark_on_hive.teacher
      """.stripMargin
    val udfDF = spark.sql(udfSQL)
    udfDF.write.saveAsTable("spark_on_hive.name")
    // Show the result
    spark.sql(udfSQL).show()
    spark.stop()
  }

  // UDF: returns the length of a string
  def myStrLength(str: String): Int = str.length
}
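The same UDF logic can also be applied through the DataFrame API instead of a registered SQL function. The following is a minimal, untested sketch; it assumes the SparkSession spark and the spark_on_hive.teacher table created above, and wraps the same length logic with org.apache.spark.sql.functions.udf:

import org.apache.spark.sql.functions.{col, udf}

// Alternative to spark.udf.register: wrap the function with functions.udf and
// apply it directly on a DataFrame column (assumes `spark` and spark_on_hive.teacher exist).
val myLenUdf = udf((s: String) => s.length)
spark.table("spark_on_hive.teacher")
  .withColumn("namelen", myLenUdf(col("name"))) // same result as mylen(name) in the SQL above
  .show()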
Then package it into a jar and run it on the cluster.
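With the assembly plugin bound to the package phase as configured above, running mvn clean package should produce a *-jar-with-dependencies.jar under target/; the path /home/hadoop/hive_jar/myudf.jar used in the script below assumes that artifact has been copied there and renamed.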
Data:
teacher_basic.txt
li,18,true,8
liu,19,true,9
yang,20,true,10
peng,21,true,1
huang,22,true,2
teacher_info.txt
li,166
liu,170
yang,181
huang,178
Write the spark-submit script:
vi sparksqludf.sh
#!/bin/sh
SPARK_HOME=/home/hadoop/apps/spark
export HADOOP_CONF_DIR=/home/hadoop/apps/hadoop-2.7.6/etc/hadoop
$SPARK_HOME/bin/spark-submit \
--master local[*] \
--deploy-mode client \
--class SparkSQLUDF \
--executor-memory 600M \
--executor-cores 1 \
--driver-cores 1 \
--num-executors 1 \
--driver-memory 600M \
/home/hadoop/hive_jar/myudf.jar \
/home/hadoop/hive_data/teacher_basic.txt \
/home/hadoop/hive_data/teacher_info.txt
echo 'finished successfully!'
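Note that because the script uses --master local[*], the whole job runs in a single local JVM, so the executor-related flags (--num-executors, --executor-memory, --executor-cores) only take effect when submitting to a cluster manager such as YARN; likewise, load data local inpath reads the two .txt files from the local filesystem of the machine running the job, so the paths passed as arguments must exist there.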
Run it directly:
sh sparksqludf.sh
The error that appears here can be ignored; just look at the results.
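To double-check the output afterwards, an untested query from spark-shell (started against the same Hive metastore, with Hive support enabled) could look like this:

// Assumes the spark-shell's built-in `spark` session sees the same metastore as the job above.
spark.table("spark_on_hive.teacher").show() // the joined basic + info data
spark.table("spark_on_hive.name").show()    // name lengths computed by the myLen UDF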