MakeDataDemo: producing test data into Kafka
package SparkTest.SparkStreaming

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import scala.collection.mutable.ListBuffer
import scala.util.Random

// Produce random lines of words into the Kafka topic "sparktest"
object MakeDataDemo {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    val words = List("zhangsan", "lisi", "wangwu", "spark", "sql", "Streaming", "xiaowang", "xiaohong")
    val random = new Random()
    while (true) {
      // build a line of one to six random words
      val lines = new ListBuffer[String]
      for (_ <- 1 to (1 + random.nextInt(6))) {
        lines.append(words(random.nextInt(words.length)))
      }
      val data = new ProducerRecord[String, String]("sparktest", lines.mkString(" "))
      producer.send(data)
      Thread.sleep(500)
    }
  }
}
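To watch what the producer writes, a console consumer can be attached to the topic (this assumes a standard Kafka installation on one of the broker hosts):
linux>bin/kafka-console-consumer.sh --bootstrap-server 192.168.58.201:9092 --topic sparktest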
Create the corresponding MySQL tables. The offset table needs a primary key over (app_gid, topic_partition), otherwise the ON DUPLICATE KEY UPDATE upsert used below would insert a new row on every batch; the offset column is bigint because Kafka offsets are 64-bit.
mysql>create table wordcount (word varchar(200) primary key, counts int(11));
mysql>create table offset (app_gid varchar(30), topic_partition varchar(30), offset bigint, primary key (app_gid, topic_partition));
mysql>truncate table wordcount;
mysql>truncate table offset;
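After the jobs below have processed a few batches, the accumulated counts and the saved offsets can be inspected directly:
mysql>select * from wordcount order by counts desc;
mysql>select * from offset;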
CommitOffsetDemo: committing offsets to Kafka manually
package SparkTest.SparkStreaming

import SparkTest.util.DBUtil
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object CommitOffsetDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-demo",
      "auto.offset.reset" -> "latest",
      // disable auto-commit so that offsets are committed manually below
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("sparktest")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD(rdd => {
      // grab the offset ranges of this batch before any transformation
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // collect() pulls the whole result to the driver, which is fine here
      // because a word-count result is small
      val result = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect()
      // save the results to MySQL; the upsert accumulates counts across batches
      val con = DBUtil.getConnection()
      val pstmt = con.prepareStatement("insert into wordcount values(?,?) ON DUPLICATE KEY UPDATE counts=counts+?")
      result.foreach(word => {
        pstmt.setString(1, word._1)
        pstmt.setInt(2, word._2)
        pstmt.setInt(3, word._2)
        pstmt.addBatch()
      })
      pstmt.executeBatch()
      pstmt.close()
      con.close()
      // commit this batch's offsets back to Kafka only after the write succeeded
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
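Note that this gives at-least-once semantics: if the job dies after the MySQL write but before commitAsync completes, the batch is replayed on restart and the upsert adds its counts a second time. The next example avoids this by saving the offsets in MySQL inside the same transaction as the results.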
SparkstreamingKafkaSaveToMysql: reading from Kafka, saving results and offsets to MySQL
package SparkTest.SparkStreaming

import java.sql.PreparedStatement
import SparkTest.util.{DBUtil, OffsetUtils}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}

object SparkstreamingKafkaSaveToMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(conf, Durations.seconds(5))
    // set the log level
    ssc.sparkContext.setLogLevel("ERROR")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "demo3",
      /**
       * What to do when there is no initial offset, or the current offset no longer exists:
       * earliest: reset to the smallest available offset
       * latest: reset to the largest available offset [default]
       * none: throw an exception if no previous offset is found
       */
      "auto.offset.reset" -> "latest",
      /**
       * With enable.auto.commit set to false, consumer offsets are not saved to Kafka
       * automatically; they must be committed manually after the data has been processed.
       * The default is true, which commits every 5 seconds.
       */
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("sparktest")
    // restore the offsets saved by a previous run; Map.empty[TopicPartition, Long]
    // (or OffsetUtils.queryHistoryOffsetFromRedis()) would fall back to auto.offset.reset
    val offset = OffsetUtils.queryHistoryOffsetFromMysql()
    // the original stream still carries the offset information of each batch
    val kafkaDstream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams, offset)
    )
    kafkaDstream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        val reduced = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
        val result = reduced.collect()
        println(result.toBuffer)
        val con = DBUtil.getConnection()
        var pstmt1: PreparedStatement = null
        var pstmt2: PreparedStatement = null
        try {
          // turn off auto-commit; by default JDBC runs each statement in its own
          // transaction, but the results and the offsets must commit (or roll back) together
          con.setAutoCommit(false)
          // prepare once, execute per word (an addBatch/executeBatch variant works too)
          pstmt1 = con.prepareStatement("INSERT INTO wordcount(word,counts) VALUES(?,?) ON DUPLICATE KEY UPDATE counts=counts+?")
          for (tuple <- result) {
            pstmt1.setString(1, tuple._1)
            pstmt1.setInt(2, tuple._2)
            pstmt1.setInt(3, tuple._2)
            pstmt1.executeUpdate()
          }
          // save the end offset of every partition in the same transaction
          pstmt2 = con.prepareStatement("insert into offset (app_gid,topic_partition,offset) values (?,?,?) on DUPLICATE KEY UPDATE offset=?")
          for (offsetRange <- offsetRanges) {
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            pstmt2.setString(1, "test_demo1")
            pstmt2.setString(2, topic + "-" + partition)
            pstmt2.setLong(3, untilOffset)
            pstmt2.setLong(4, untilOffset)
            pstmt2.executeUpdate()
          }
          con.commit()
        } catch {
          case e: Exception =>
            con.rollback()
            e.printStackTrace()
            // stop the application rather than keep consuming past a failed batch
            ssc.stop()
        } finally {
          if (pstmt1 != null) {
            pstmt1.close()
          }
          if (pstmt2 != null) {
            pstmt2.close()
          }
          if (con != null) {
            con.close()
          }
        }
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
DBUtil: MySQL connection helper
package SparkTest.util

import java.sql.{Connection, DriverManager}

object DBUtil {
  def getConnection(): Connection = {
    // Connector/J 5.x driver class; host, database, user and password are environment-specific
    Class.forName("com.mysql.jdbc.Driver")
    DriverManager.getConnection("jdbc:mysql://192.168.58.203/testdb", "root", "123")
  }
}
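OffsetUtils: reading saved offsets (sketch)
The job above calls OffsetUtils.queryHistoryOffsetFromMysql(), which is not listed in the original. Here is a minimal sketch, assuming the offset table created earlier and the app_gid value "test_demo1" that the job writes. The Redis variant is only indicative: the hash name "kafka_offset:demo3" and its field layout are assumptions, not code from the original.
package SparkTest.util

import org.apache.kafka.common.TopicPartition
import scala.collection.JavaConverters._
import scala.collection.mutable

object OffsetUtils {
  // Read the offsets written by SparkstreamingKafkaSaveToMysql.
  // Rows look like: app_gid='test_demo1', topic_partition='sparktest-0', offset=42
  def queryHistoryOffsetFromMysql(): Map[TopicPartition, Long] = {
    val offsets = mutable.Map[TopicPartition, Long]()
    val con = DBUtil.getConnection()
    try {
      val pstmt = con.prepareStatement("select topic_partition, offset from offset where app_gid = ?")
      pstmt.setString(1, "test_demo1")
      val rs = pstmt.executeQuery()
      while (rs.next()) {
        // topic_partition is stored as "<topic>-<partition>"
        val tp = rs.getString("topic_partition")
        val idx = tp.lastIndexOf("-")
        val topicPartition = new TopicPartition(tp.substring(0, idx), tp.substring(idx + 1).toInt)
        offsets += topicPartition -> rs.getLong("offset")
      }
      rs.close()
      pstmt.close()
    } finally {
      con.close()
    }
    offsets.toMap
  }

  // Redis variant referenced by the commented-out line in the job above; the hash
  // name and the "<topic>-<partition>" -> offset field layout are assumed, not given.
  def queryHistoryOffsetFromRedis(): Map[TopicPartition, Long] = {
    val jedis = JedisPoolUtil.getConnection()
    try {
      jedis.hgetAll("kafka_offset:demo3").asScala.map { case (tp, off) =>
        val idx = tp.lastIndexOf("-")
        new TopicPartition(tp.substring(0, idx), tp.substring(idx + 1).toInt) -> off.toLong
      }.toMap
    } finally {
      jedis.close()
    }
  }
}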
JedisDemo: writing to Redis
package SparkTest.SparkStreaming

import SparkTest.util.JedisPoolUtil

object JedisDemo {
  def main(args: Array[String]): Unit = {
    // linux>bin/redis-server redis.conf   (start the Redis server first)
    val jedis = JedisPoolUtil.getConnection()
    val wordcounts = List(("zhangsan", 3), ("lisi", 4), ("wangwu", 5), ("xiaohang", 1), ("spark", 8))
    wordcounts.foreach(word => {
      // HINCRBY accumulates the count per field, so repeated runs add up
      jedis.hincrBy("rediswctest", word._1, word._2)
    })
    // val map = jedis.hgetAll("rediswctest")
    // print(map)
    jedis.close() // return the connection to the pool
    // Check the result from the Redis CLI:
    // linux>bin/redis-cli
    // redis>auth 123
    // redis>keys *
    // redis>HGETALL rediswctest
  }
}
JedisPoolUtil: Redis connection pool helper
package SparkTest.util

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object JedisPoolUtil {
  val config = new JedisPoolConfig()
  config.setMaxTotal(20)       // at most 20 connections in the pool
  config.setMaxIdle(10)        // keep at most 10 idle connections
  config.setTestOnBorrow(true) // validate a connection before handing it out
  // host, port, timeout (ms) and password are environment-specific
  val pool = new JedisPool(config, "192.168.58.203", 6379, 1000, "123")

  def getConnection(): Jedis = {
    pool.getResource
  }
}
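Because the pool lives in a Scala object, each executor JVM builds its own pool the first time the class is loaded, which is what makes this pattern safe to use from inside a Spark job. As a sketch of how the streaming jobs above could write their word counts to Redis instead of MySQL, assuming `reduced` is the RDD[(String, Int)] produced by reduceByKey (an assumption, not code from the original):
// Write each partition's word counts to Redis on the executors (sketch)
reduced.foreachPartition(iter => {
  val jedis = SparkTest.util.JedisPoolUtil.getConnection()
  try {
    iter.foreach { case (word, count) =>
      jedis.hincrBy("rediswctest", word, count)
    }
  } finally {
    jedis.close() // return the connection to this executor's pool
  }
})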