pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.wedoctor.spark</groupId>
<artifactId>spark-0708</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
<hadoop.version>2.8.1</hadoop.version>
<encoding>UTF-8</encoding>
</properties>
<dependencies>
<!-- Scala library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- Spark SQL -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.41</version>
</dependency>
<!-- Typesafe Config (ConfigFactory) -->
<dependency>
<groupId>com.typesafe</groupId>
<artifactId>config</artifactId>
<version>1.3.0</version>
</dependency>
<!-- Spark Streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Jedis, the Redis client -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
<dependency>
<groupId>org.scalikejdbc</groupId>
<artifactId>scalikejdbc_2.11</artifactId>
<version>2.5.0</version>
</dependency>
<!-- scalikejdbc-config_2.11 -->
<dependency>
<groupId>org.scalikejdbc</groupId>
<artifactId>scalikejdbc-config_2.11</artifactId>
<version>2.5.0</version>
</dependency>
<!-- Spark Streaming + Kafka integration -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Kafka client -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.2.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Plugin for compiling Java sources -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
</plugin>
<!-- Plugin for compiling Scala sources -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<!-- Jar packaging plugin (commented out) -->
<!--<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<!– Specify the main class –>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass></mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>-->
</plugins>
</build>
</project>
1. Automatic offset commit (default)
package com.wedoctor.sparkstreaming.kafka_offset_manage

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object StreamingKafkaDemo {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    // Kafka consumer configuration
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> "streaming1",
      "auto.offset.reset" -> "earliest",
      // enable.auto.commit is true by default
      "enable.auto.commit" -> "true"
    )
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      /**
       * Location strategies:
       * PreferBrokers: only useful when the executors run on the same machines as the Kafka brokers
       * PreferConsistent: distributes partitions evenly across the executors; this is the usual choice
       */
      LocationStrategies.PreferConsistent,
      // Subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    // The downstream logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd => {
      rdd.foreach(t => println(t.value(), t.offset(), t.partition()))
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
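Note that with automatic commit the underlying Kafka consumer commits offsets on its own timer (every 5 seconds by default), not once per micro-batch, so on restart a batch may be replayed (or, if the timer fired before the work finished, skipped). If you stay in this mode, the interval can be tuned through the standard consumer setting auto.commit.interval.ms; a one-line sketch to add next to the other entries of kafkaParams above:

// Hypothetical tweak: shorten the automatic commit interval (the consumer default is 5000 ms)
kafkaParams += ("auto.commit.interval.ms" -> "1000")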
2. Manual offset commit
package com.wedoctor.sparkstreaming.kafka_offset_manage

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}

import scala.collection.mutable

object StreamingKafkaOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    // Kafka consumer configuration
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> "streaming1",
      "auto.offset.reset" -> "earliest",
      // disable automatic commit so offsets can be committed explicitly
      "enable.auto.commit" -> "false"
    )
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      /**
       * Location strategies:
       * PreferBrokers: only useful when the executors run on the same machines as the Kafka brokers
       * PreferConsistent: distributes partitions evenly across the executors; this is the usual choice
       */
      LocationStrategies.PreferConsistent,
      // Subscribe to the topics
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    // The downstream logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd => {
      /**
       * Each OffsetRange carries:
       *   val topic: String
       *   val partition: Int
       *   val fromOffset: Long
       *   val untilOffset: Long
       */
      // Get the offsets consumed for each partition of this batch
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(println)
      // Business logic
      rdd.foreach(t => println(t.value(), t.offset(), t.partition()))
      // Commit the offsets manually (asynchronously, back to Kafka)
      kafkaDs.asInstanceOf[CanCommitOffsets].commitAsync(ranges)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
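commitAsync only queues the offsets; they are sent to Kafka during a later batch, and because the call happens after the business logic this gives at-least-once processing. The same method also accepts a callback when you want to know whether a commit succeeded. A minimal sketch that could replace the commitAsync call inside foreachRDD above (the extra imports are assumptions, not part of the original code):

// Sketch: commit with a callback that reports success or failure
import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

kafkaDs.asInstanceOf[CanCommitOffsets].commitAsync(ranges, new OffsetCommitCallback {
  override def onComplete(offsets: java.util.Map[TopicPartition, OffsetAndMetadata],
                          exception: Exception): Unit = {
    if (exception != null) println("offset commit failed: " + exception.getMessage)
    else println("offset commit succeeded: " + offsets)
  }
})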
3. Managing offsets in MySQL
3.1 Table DDL
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `mysqlOffset`
-- ----------------------------
DROP TABLE IF EXISTS `mysqlOffset`;
CREATE TABLE `mysqlOffset` (
`topic` varchar(20) NOT NULL,
`part` int(11) NOT NULL,
`offset` bigint(20) DEFAULT NULL, -- Kafka offsets are 64-bit longs, so use BIGINT
`groupId` varchar(20) NOT NULL,
PRIMARY KEY (`topic`,`part`,`groupId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of mysqlOffset
-- ----------------------------
3.2 Configuration file
db.url="jdbc:mysql://bigdata1:3306/test?characterEncoding=utf-8"
db.driver="com.mysql.jdbc.Driver"
db.password="123456"
db.user="root"
db.tablename="streaming_wc"
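ConfigFactory.load() with no arguments reads application.conf from the classpath, so this file normally lives at src/main/resources/application.conf. A throwaway check (hypothetical object name, not part of the job) to confirm the file is picked up:

import com.typesafe.config.ConfigFactory

object ConfigCheck {
  def main(args: Array[String]): Unit = {
    // Loads application.conf (plus any reference.conf) from the classpath
    val config = ConfigFactory.load()
    println(config.getString("db.url"))
    println(config.getString("db.user"))
  }
}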
3.3 Code
package com.wedoctor.sparkstreaming.kafka_offset_manage

import java.sql.{Connection, DriverManager, ResultSet}

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}

import scala.collection.mutable

object StreamingMysqlOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    val groupId = "streaming124"
    // Kafka consumer configuration
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )
    // Read the last stored offsets from MySQL
    val offsetsMap: mutable.HashMap[TopicPartition, Long] = mutable.HashMap[TopicPartition, Long]()
    val config2: Config = ConfigFactory.load()
    val conn = DriverManager.getConnection(config2.getString("db.url"), "root", "123456")
    val query = conn.prepareStatement("select * from mysqlOffset where topic = ? and groupId = ?")
    query.setString(1, topics(0))
    query.setString(2, groupId)
    val rs: ResultSet = query.executeQuery()
    while (rs.next()) {
      val part: Int = rs.getInt("part")
      val curOffset = rs.getLong("offset")
      // Feed the offsets read from MySQL into the consumer strategy
      offsetsMap += (new TopicPartition(topics(0), part) -> curOffset)
    }
    rs.close()
    query.close()
    conn.close()
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      // Subscribe to the topics, starting from the offsets loaded above
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetsMap))
    // The downstream logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd => {
      // Skip empty batches
      if (!rdd.isEmpty()) {
        // Get the offsets consumed for each partition of this batch
        val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        ranges.foreach(println)
        // Business logic
        rdd.foreach(t => println(t.value(), t.offset(), t.partition()))
        // Write the offsets back to MySQL; the composite primary key is (topic, part, groupId)
        val config = ConfigFactory.load()
        val conn: Connection = DriverManager.getConnection(config.getString("db.url"), "root", "123456")
        val pstm = conn.prepareStatement("replace into mysqlOffset values (?,?,?,?)")
        for (t <- ranges) {
          pstm.setString(1, t.topic)
          pstm.setInt(2, t.partition)
          pstm.setLong(3, t.untilOffset)
          pstm.setString(4, groupId)
          pstm.execute()
        }
        // Release resources
        pstm.close()
        conn.close()
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
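The pom also brings in scalikejdbc and scalikejdbc-config, which can replace the raw JDBC boilerplate above. A minimal sketch of the same offset lookup, assuming the connection settings are duplicated under the db.default.* keys that scalikejdbc-config expects (the db.* keys shown in 3.2 are not picked up automatically):

import scalikejdbc._
import scalikejdbc.config.DBs

// Hypothetical helper, not part of the original job
object MysqlOffsetDao {
  def loadOffsets(topic: String, groupId: String): Map[Int, Long] = {
    // Reads db.default.driver / url / user / password from application.conf
    DBs.setupAll()
    DB.readOnly { implicit session =>
      sql"select part, `offset` from mysqlOffset where topic = ${topic} and groupId = ${groupId}"
        .map(rs => rs.int("part") -> rs.long("offset"))
        .list.apply()
        .toMap
    }
  }
}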
4. Managing offsets in Redis
package com.wedoctor.sparkstreaming

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

import scala.collection.mutable

object JedisUtils {
  // Cache one pool per host:port so a new pool is not created for every connection request
  private val pools = mutable.HashMap[String, JedisPool]()

  def createJedisPool(host: String, port: Int): JedisPool = {
    // Connection pool configuration
    val config = new JedisPoolConfig()
    // Maximum number of idle connections (the default is 8)
    config.setMaxIdle(1000)
    // Maximum number of connections in total (the default is 8)
    config.setMaxTotal(2000)
    // Build the pool
    new JedisPool(config, host, port)
  }

  private def getJedis(host: String, port: Int): Jedis = {
    pools.getOrElseUpdate(host + ":" + port, createJedisPool(host, port)).getResource
  }

  def apply(host: String, port: Int): Jedis = {
    getJedis(host, port)
  }
}
package com.wedoctor.sparkstreaming.kafka_offset_manage

import java.util

import com.wedoctor.sparkstreaming.JedisUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import redis.clients.jedis.Jedis

import scala.collection.mutable

object StreamingRedisOffset {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getSimpleName)
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(2))
    val topics = Array("first")
    val groupId = "streaming125"
    // Kafka consumer configuration
    val kafkaParams = mutable.HashMap[String, Object](
      "bootstrap.servers" -> "hdp-01:9092",
      "key.deserializer" -> classOf[StringDeserializer].getName,
      "value.deserializer" -> classOf[StringDeserializer].getName,
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false"
    )
    // Read the last stored offsets from Redis (hash key: topic-groupId, field: partition, value: offset)
    val offsetsMap = mutable.HashMap[TopicPartition, Long]()
    val jedis: Jedis = JedisUtils("hdp-01", 6379)
    val partAndOffset: util.Map[String, String] = jedis.hgetAll(topics(0) + "-" + groupId)
    import scala.collection.JavaConversions._
    for (part <- partAndOffset) {
      offsetsMap += (new TopicPartition(topics(0), part._1.toInt) -> part._2.toLong)
    }
    jedis.close()
    val kafkaDs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      // Subscribe to the topics, starting from the offsets loaded above
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetsMap))
    // The downstream logic is essentially the same as for data read from a socket
    kafkaDs.foreachRDD(rdd => {
      // Skip empty batches
      if (!rdd.isEmpty()) {
        // Get the offsets consumed for each partition of this batch
        val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        ranges.foreach(println)
        // Business logic
        rdd.foreach(t => {
          println(t.value(), t.offset(), t.partition())
        })
        // Write the offsets to a Redis hash: key topic-groupId, field partition, value untilOffset
        val jedis: Jedis = JedisUtils("hdp-01", 6379)
        for (t <- ranges) {
          jedis.hset(t.topic + "-" + groupId, t.partition + "", t.untilOffset + "")
        }
        jedis.close()
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
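To confirm that the offsets actually land in Redis, a throwaway check (hypothetical, reusing the JedisUtils object and the key layout above) can read the hash back; each field is a partition and each value is the next offset to read:

// Sketch: read back the hash written for topic "first" and consumer group "streaming125"
val jedis = JedisUtils("hdp-01", 6379)
println(jedis.hgetAll("first" + "-" + "streaming125"))
jedis.close()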