pom.xml file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>SparkDemo</artifactId>
<version>1.0-SNAPSHOT</version>
<name>scala-demo-project</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.11.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<!-- keep the Scala suffix and Spark version aligned with the other Spark artifacts; mixing in a 2.12 core would break at runtime -->
<artifactId>spark-core_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.0</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.scalikejdbc/scalikejdbc -->
<dependency>
<groupId>org.scalikejdbc</groupId>
<artifactId>scalikejdbc_2.11</artifactId>
<version>3.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.scalikejdbc/scalikejdbc-config -->
<dependency>
<groupId>org.scalikejdbc</groupId>
<artifactId>scalikejdbc-config_2.11</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.69</version>
</dependency>
<dependency>
<groupId>ch.hsr</groupId>
<artifactId>geohash</artifactId>
<version>1.3.0</version>
</dependency>
<dependency>
<groupId>org.mongodb.spark</groupId>
<artifactId>mongo-spark-connector_2.11</artifactId>
<version>2.2.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.1.10</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.3</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api-scala_2.11</artifactId>
<version>11.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>cn.kgc.kafak.demo.ThreadProducer</mainClass>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
TopN (Exercise 1)
Data: test-products.txt
香菜 2.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
大葱 2.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
葱头 1.60 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
大蒜 3.60 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
蒜苔 6.20 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
韭菜 5.60 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
青椒 5.20 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
茄子 5.40 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
西红柿 4.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
黄瓜 3.40 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
青冬 1.60 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
西葫芦 2.80 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
白萝卜 1.20 2018/1/1 山西汾阳市晋阳农副产品批发市场 山西 汾阳
黄豆芽 1.50 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
豌豆苗(尖) 30.00 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
油麦菜 3.00 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
杏鲍菇 10.00 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
大米 4.50 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
绿豆 9.00 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
红小豆 9.20 2018/1/1 北京朝阳区大洋路综合市场 北京 朝阳
胡萝卜 2.50 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
白萝卜 2.30 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
土豆 2.40 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
山药 5.50 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
平菇 7.00 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
黑木耳 35.00 2018/1/1 新疆石河子西部绿珠果蔬菜批发市场 北京 朝阳
西瓜 5.60 2018/1/1 山东济南市堤口路果品批发市场 山东 济南
菠萝 3.60 2018/1/1 山东济南市堤口路果品批发市场 山东 济南
巨峰葡萄 11.00 2018/1/1 山东济南市堤口路果品批发市场 山东 济南
葱头 1.60 2018/1/1 山西临汾尧丰农副产品批发市场 山西 临汾
洋白菜 1.00 2018/1/1 四川南充市桑园坝农产品批发市场 四川 南充
香菜 5.80 2018/1/1 四川南充市桑园坝农产品批发市场 四川 南充
豌豆苗(尖) 5.00 2018/1/1 四川南充市桑园坝农产品批发市场 四川 南充
黄豆芽 1.80 2018/1/1 四川南充市桑园坝农产品批发市场 四川 南充
平菇 2.08 2018/1/1 河北乐亭县冀东果蔬批发市场 河北 乐亭
香菇 8.00 2018/1/1 河北乐亭县冀东果蔬批发市场 河北 乐亭
鸡腿菇 2.90 2018/1/1 河北乐亭县冀东果蔬批发市场 河北 乐亭
蒜苔 6.00 2018/1/1 河北乐亭县冀东果蔬批发市场 河北 乐亭
Method 1:
package SparkTest

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object TopNDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/test-products.txt")
lines.filter(_.split("\t").length >= 5)
.map(line => {
        val fields = line.split("\t"); ((fields(4), fields(3)), 1) // clean the data: key by (province, market)
})
.reduceByKey(_ + _)
.map(x=>(x._1._1,(x._1._2,x._2)))
.groupByKey()
.mapValues(_.toList.sortBy(-_._2).take(3))
.collect().foreach(println)
    // Result: (山西,List((山西汾阳市晋阳农副产品批发市场,13), (山西临汾尧丰农副产品批发市场,1)))
    //         (河北,List((河北乐亭县冀东果蔬批发市场,4)))
    //         (四川,List((四川南充市桑园坝农产品批发市场,4)))
    //         (山东,List((山东济南市堤口路果品批发市场,3)))
    //         (北京,List((北京朝阳区大洋路综合市场,7), (新疆石河子西部绿珠果蔬菜批发市场,6)))
sc.stop()
}
}
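Note that groupByKey materializes every market of a province in memory at once. For a fixed top N per key, an aggregation that only ever keeps the current top 3 scales better. Below is a minimal sketch of that alternative, with the same input and result shape as Method 1 (the object name TopNAggDemo and the helper topN are ours):

package SparkTest

import org.apache.spark.{SparkConf, SparkContext}

object TopNAggDemo {
  // keep only the n largest (market, count) pairs
  def topN(list: List[(String, Int)], n: Int): List[(String, Int)] = list.sortBy(-_._2).take(n)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopNAggDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.textFile("file:///F:/JavaTest/SparkDemo/data/test-products.txt")
      .filter(_.split("\t").length >= 5)
      .map(line => {
        val fields = line.split("\t")
        ((fields(4), fields(3)), 1)
      })
      .reduceByKey(_ + _)
      .map { case ((province, market), cnt) => (province, List((market, cnt))) }
      // merge partial lists per province, never keeping more than 3 entries
      .reduceByKey((a, b) => topN(a ++ b, 3))
      .collect().foreach(println)
    sc.stop()
  }
}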
Method 2:
package SparkTest

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object TopNDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/test-products.txt")
val reduced = lines.filter(_.split("\t").length >= 5)
.map(line => {
val fields = line.split("\t");
((fields(4), fields(3)), 1)
})
.reduceByKey(_ + _)
    val provinces = reduced.map(_._1._1).distinct().collect()
    for (province <- provinces) {
      reduced.filter(x => x._1._1.equals(province)).sortBy(-_._2).take(3).foreach(println)
    }
    //Result: ((山西,山西汾阳市晋阳农副产品批发市场),13)
    //        ((山西,山西临汾尧丰农副产品批发市场),1)
    //        ((河北,河北乐亭县冀东果蔬批发市场),4)
    //        ((四川,四川南充市桑园坝农产品批发市场),4)
    //        ((山东,山东济南市堤口路果品批发市场),3)
    //        ((北京,北京朝阳区大洋路综合市场),7)
    //        ((北京,新疆石河子西部绿珠果蔬菜批发市场),6)
    sc.stop()
  }
}
Note that Method 2 launches a separate Spark job per province (one filter pass plus take for each), so Method 1 scales better as the number of provinces grows.
Writing Data to MySQL (Exercise 2)
Data: user_test.csv
name,number
lisi,123 111 222
wangwu,456 333
zhangsan,789 444 555 666
Create the table in MySQL:
mysql> use testdb;
mysql> create table user_test ( user_name varchar(50), number varchar(10));
Create the MySQL connection utility class
package SparkTest.util

import java.sql.{Connection, DriverManager}

object DBUtil {
  def getConnection(): Connection = {
    Class.forName("com.mysql.jdbc.Driver")
    DriverManager.getConnection("jdbc:mysql://192.168.58.203/testdb", "root", "123")
  }
}
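Opening a fresh connection on every call is fine for these exercises, but commons-dbcp2 is already declared in the pom, so a pooled variant is a small change. A sketch under that assumption (the object name PooledDBUtil and the pool sizes are illustrative):

package SparkTest.util

import java.sql.Connection
import org.apache.commons.dbcp2.BasicDataSource

object PooledDBUtil {
  // one pool per JVM (i.e. per executor); connections are reused across tasks
  private val ds = new BasicDataSource()
  ds.setDriverClassName("com.mysql.jdbc.Driver")
  ds.setUrl("jdbc:mysql://192.168.58.203/testdb")
  ds.setUsername("root")
  ds.setPassword("123")
  ds.setInitialSize(2) // illustrative sizing
  ds.setMaxTotal(8)

  def getConnection(): Connection = ds.getConnection
}

Calling close() on a connection obtained from the pool returns it to the pool rather than tearing it down, so the try/finally blocks below keep working unchanged.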
Method 1: collect the data to the driver with collect() and open the connection on the driver.
package SparkTest

import SparkTest.util.DBUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Write the data into MySQL
object TransformationDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/user_test.csv")
// println(lines.partitions.length)
val result = lines.mapPartitionsWithIndex((index, it) => if (index == 0) it.drop(1) else it)
.flatMap(line => {
val fields = line.split(",", -1)
for (i <- fields(1).split(" ", -1)) yield (fields(0), i)
})//.collect().foreach(println)
val tuples: Array[(String, String)] = result.collect()
val con = DBUtil.getConnection()
val pstmt = con.prepareStatement("insert into user_test values (?,?)")
try {
var counter=0
tuples.foreach(tuple => {
pstmt.setString(1, tuple._1)
pstmt.setString(2, tuple._2)
pstmt.addBatch()
counter+=1
if(counter%2==0){
pstmt.executeBatch()
pstmt.clearBatch()
}
})
pstmt.executeBatch()
} catch {
case e: Exception =>
e.printStackTrace()
    } finally {
      // close the statement before the connection
      if (pstmt != null) {
        pstmt.close()
      }
      if (con != null) {
        con.close()
      }
    }
sc.stop()
}
}
Method 2: run the inserts on the executor side:
foreachPartition writes each partition as one batch
package SparkTest

import SparkTest.util.DBUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Write the data into MySQL
object TransformationDemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/user_test.csv")
// println(lines.partitions.length)
val result = lines.mapPartitionsWithIndex((index, it) => if (index == 0) it.drop(1) else it)
.flatMap(line => {
val fields = line.split(",", -1)
for (i <- fields(1).split(" ", -1)) yield (fields(0), i)
})//.collect().foreach(println)
result.foreachPartition(it => {
val con = DBUtil.getConnection()
val pstmt = con.prepareStatement("insert into user_test values (?,?)")
try {
        // var counter = 0
        it.foreach(tuple => {
          pstmt.setString(1, tuple._1)
          pstmt.setString(2, tuple._2)
          pstmt.addBatch() // required: without it, executeBatch() below would insert nothing
          /* counter += 1
          if (counter % 2 == 0) {
            pstmt.executeBatch()
            pstmt.clearBatch()
          } */
        })
pstmt.executeBatch()
} catch {
case e: Exception =>
e.printStackTrace()
      } finally {
        // close the statement before the connection
        if (pstmt != null) {
          pstmt.close()
        }
        if (con != null) {
          con.close()
        }
      }
})
sc.stop()
}
}
Load the emp table from a file and enrich it via an RDD with department info stored in the database (Exercise 3)
Data: emp_test.csv
1,SMITH,CLERK,13,2000-12-17,800.00,\N,20
2,ALLEN,SALESMAN,6,2000-02-20,1600.00,300.00,40
3,WARD,SALESMAN,6,2000-02-22,1200.00,500.00,30
4,JONES,MANAGER,9,2000-04-02,2975.00,\N,20
5,MARTIN,SALESMAN,6,2000-09-28,1200.00,1400.00,30
6,BLAKE,MANAGER,9,2000-05-01,2800.00,\N,30
7,CLARK,MANAGER,9,2000-06-09,2400.00,\N,10
8,SCOTT,ANALYST,4,2000-07-13,3000.00,\N,20
9,KING,PRESIDENT,\N,2000-11-17,5000.00,\N,10
10,TURNER,SALESMAN,6,2000-09-08,1500.00,0.00,30
11,ADAMS,CLERK,8,2000-07-13,1300.00,\N,20
12,JAMES,CLERK,6,2000-12-03,950.00,\N,30
Create the dept_test table in MySQL and insert the data:
create table dept_test ( DEPTNO int(10),DNAME varchar(50),LOC varchar(50));
insert into dept_test values(10,'ACCOUNTING','NEW YORK');
insert into dept_test values(20,'RESEARCH','DALLAS');
insert into dept_test values(30,'SALES','CHICAGO');
insert into dept_test values(40,'OPERATIONS','BOSTON');
Method 1: open the database connection inside mapPartitions. Because the iterator is lazy, resources such as connections, files, or sockets can only be closed after the iteration has finished.
package SparkTest

import SparkTest.util.DBUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

case class Employee(empNo: String, name: String, job: String, mgr: String,
                    hireDate: String, sal: Double, var deptNo: String, var dname: String = null) // dname starts with a default value
object MapPartitionTest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/emp_test.csv")
    // Read the employee records and enrich them with department info kept in the database
lines.map(line => {
val fields = line.split(",", -1)
new Employee(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5).toDouble, fields(7))
})//.collect().foreach(println)
      .mapPartitions(it => {
        // it: the partition's iterator
val con = DBUtil.getConnection()
val pstmt = con.prepareStatement("select dname from dept_test where deptno=?")
it.map(employee => {
pstmt.setInt(1, employee.deptNo.toInt)
          val rs = pstmt.executeQuery()
          while (rs.next()) {
            employee.dname = rs.getString("dname")
          }
          rs.close() // a new ResultSet is created per element, so close each one
          if (!it.hasNext) { // after the last element, release the shared JDBC resources
            if (pstmt != null) {
              pstmt.close()
            }
            if (con != null) {
              con.close()
            }
          }
employee
})
}).collect().foreach(println)
sc.stop()
}
}
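The close-on-last-element trick above only works when the downstream code fully drains the iterator. A simpler variant is sketched below (the object name MapPartitionEagerTest is ours): it materializes each partition with toList while the connection is still open, so the resources can be closed in a plain finally, at the cost of holding one partition in memory.

package SparkTest

import SparkTest.util.DBUtil
import org.apache.spark.{SparkConf, SparkContext}

object MapPartitionEagerTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapPartitionEagerTest").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.textFile("file:///F:/JavaTest/SparkDemo/data/emp_test.csv")
      .map(line => {
        val fields = line.split(",", -1)
        Employee(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5).toDouble, fields(7))
      })
      .mapPartitions(it => {
        val con = DBUtil.getConnection()
        val pstmt = con.prepareStatement("select dname from dept_test where deptno=?")
        try {
          // force evaluation with toList while the connection is open,
          // then hand Spark an iterator over the materialized results
          it.map(employee => {
            pstmt.setInt(1, employee.deptNo.toInt)
            val rs = pstmt.executeQuery()
            while (rs.next()) employee.dname = rs.getString("dname")
            rs.close()
            employee
          }).toList.iterator
        } finally {
          pstmt.close()
          con.close()
        }
      })
      .collect().foreach(println)
    sc.stop()
  }
}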
(This method reuses the DBUtil connection class created in Exercise 2.)
Method 2: create a static object that encapsulates the data and exposes an accessor for lookups. If the data can change, add a timer that periodically reloads it (in Java this would be done with a singleton); a sketch of that refreshable variant follows the DeptUtil listing below.
package SparkTest

import SparkTest.util.DeptUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Enrich each employee with a department name looked up from a static object
object ObjectTest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/emp_test.csv")
    lines.map(line => {
      val fields = line.split(",", -1)
      Employee(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5).toDouble, fields(7))
    }).map(employee => {
      employee.dname = DeptUtil.getProperty(employee.deptNo)
      employee
    }).collect().foreach(println)
    sc.stop()
  }
}
The wrapper object that holds the department data:
package SparkTest.util

import scala.collection.mutable.ArrayBuffer

// Wrapper object holding the department lookup data
object DeptUtil {
  val maps: ArrayBuffer[(String, String)] = ArrayBuffer(("10", "ACCOUNTING"), ("20", "RESEARCH"), ("30", "SALES"), ("40", "OPERATIONS"))
  def getProperty(key: String): String = {
    maps.toMap.getOrElse(key, "None")
  }
}
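A sketch of the refreshable variant mentioned above, reusing DBUtil and assuming the departments live in the dept_test table; the object name and the 60-second interval are illustrative:

package SparkTest.util

import java.util.{Timer, TimerTask}

object RefreshableDeptUtil {
  // @volatile so reader threads always see the latest snapshot
  @volatile private var depts: Map[String, String] = load()

  private def load(): Map[String, String] = {
    val con = DBUtil.getConnection()
    try {
      val rs = con.createStatement().executeQuery("select deptno, dname from dept_test")
      val buf = scala.collection.mutable.Map[String, String]()
      while (rs.next()) buf += (rs.getString("deptno") -> rs.getString("dname"))
      buf.toMap
    } finally {
      con.close()
    }
  }

  // reload the mapping every 60 seconds on a daemon thread
  new Timer("dept-reload", true).schedule(new TimerTask {
    override def run(): Unit = depts = load()
  }, 60000L, 60000L)

  def getProperty(key: String): String = depts.getOrElse(key, "None")
}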
Method 3: a broadcast variable (the broadcast value is immutable)
package SparkTest

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer

// Broadcast variable
object BCTest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("TransformationDemo").setMaster("local[*]")
val sc = new SparkContext(conf)
val maps: ArrayBuffer[(String, String)] = ArrayBuffer(("10", "ACCOUNTING"), ("20", "RESEARCH"), ("30", "SALES"), ("40", "OPERATIONS"))
    // broadcast the lookup data to every executor
val bcMap = sc.broadcast(maps.toMap)
val lines: RDD[String] = sc.textFile("file:///F:/JavaTest/SparkDemo/data/emp_test.csv")
    lines.map(line => {
      val fields = line.split(",", -1)
      Employee(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5).toDouble, fields(7))
    }).map(employee => {
      // read the broadcast value on the executor
      val map = bcMap.value
      employee.dname = map.getOrElse(employee.deptNo, "None")
      employee
    }).collect().foreach(println)
    sc.stop()
}
}
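Because a broadcast value cannot be modified once shipped, "refreshing" it means unpersisting the old broadcast and creating a new one. A minimal sketch of that pattern, assuming an existing SparkContext sc (the added entry is illustrative):

// a broadcast value is read-only; updating it means releasing the old
// broadcast and shipping a fresh one
var bcMap = sc.broadcast(Map("10" -> "ACCOUNTING", "20" -> "RESEARCH"))

// ... later, when the reference data changes:
val updated = bcMap.value + ("50" -> "MARKETING") // illustrative change
bcMap.unpersist()             // drop the stale copies on the executors
bcMap = sc.broadcast(updated) // jobs submitted afterwards see the new value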