Pipeline Components
Transformers
A Transformer is an abstraction that covers both feature transformers and learned models. Strictly speaking, a Transformer implements a transform method that turns one DataFrame into another, usually by appending one or more columns. For example:
A feature transformer takes a DataFrame, reads one of its columns (e.g. text), maps it to a new column (e.g. a feature vector), and outputs a new DataFrame containing the mapped column;
A learned model takes a DataFrame, reads the column holding the feature vectors, predicts a label for each feature vector, and outputs a new DataFrame with a prediction column.
Pipelines mainly turn feature engineering into a repeatable workflow and also make it easy to plug in your own stages; a minimal sketch follows.
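A minimal sketch of a two-stage feature pipeline (Tokenizer and HashingTF as feature transformers, LogisticRegression as an estimator whose fitted model is itself a Transformer). The column names and toy data are made up for illustration.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
object PipelineSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("PipelineSketch").getOrCreate()
    // Toy training data: (id, text, label)
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")
    // Feature transformer: text column -> words column
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    // Feature transformer: words column -> feature vector column
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    // Estimator: fit() produces a LogisticRegressionModel, which is a Transformer
    val lr = new LogisticRegression().setMaxIter(10)
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    val model = pipeline.fit(training)     // PipelineModel, itself a Transformer
    model.transform(training).show(false)  // adds words, features and prediction columns
    spark.stop()
  }
}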
RedisTransactionDemo: writing data to Redis inside a transaction
package SparkTest.SparkStreaming
import SparkTest.util.JedisPoolUtil
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.{Jedis, Pipeline}
object RedisTransactionDemo {
def main(args: Array[String]): Unit = {
var jedies: Jedis = null
var pipeline: Pipeline = null
try {
jedies = JedisPoolUtil.getConnection()
pipeline = jedies.pipelined()
pipeline.multi()
val result = List(("zhangsan", 3), ("lisi", 4), ("wangwu", 5), ("xiaowang", 1), ("xiaohong", 8))
for (tuple <- result) {
pipeline.hincrBy("wordcounts", tuple._1, tuple._2)
}
val offsetRanges = List(OffsetRange("sparktest", 0, 10, 12), OffsetRange("sparktest", 1, 5, 7), OffsetRange("sparktest", 2, 3, 6))
for (offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val untilOffset = offsetRange.untilOffset
pipeline.hset("redisdemo", "wordcount_" + partition, untilOffset.toString)
}
//redis>HGETALL redisdemo
pipeline.exec()
pipeline.sync()
} catch {
case e: Exception => {
if (pipeline != null) pipeline.discard()
e.printStackTrace()
}
} finally {
if (pipeline != null) pipeline.close()
if (jedies != null) jedies.close()
}
}
}
NoTranscationDemo: writing data to Redis without a transaction
package SparkTest.SparkStreaming
import SparkTest.util.JedisPoolUtil
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.{Jedis, Pipeline}
object NoTranscationDemo {
def main(args: Array[String]): Unit = {
var jedies: Jedis = null
try {
jedies = JedisPoolUtil.getConnection()
val result = List(("zhangsan", 3), ("lisi", 4), ("wangwu", 5), ("xiaowang", 1), ("xiaohong", 8))
for (tuple <- result) {
jedies.hincrBy("wordcounts", tuple._1, tuple._2)
}
val offsetRanges = List(OffsetRange("sparktest", 0, 10, 12), OffsetRange("sparktest", 1, 5, 7), OffsetRange("sparktest", 2, 3, 6))
for (offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val untilOffset = offsetRange.untilOffset
jedies.hset("redisdemo", "wordcount_" + partition, untilOffset.toString)
}
//redis>FLUSHALL
//redis>keys *   //wordcounts
} catch {
case e: Exception => {
e.printStackTrace()
}
}
finally {
if (jedies != null) jedies.close()
}
}
}
NoHADemo
package SparkTest.SparkStreaming
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
object NoHADemo {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
val ssc = new StreamingContext(conf, Seconds(5))
ssc.sparkContext.setLogLevel("error")
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "demonoha",
"auto.offset.reset" -> "earliest"
// "enable.auto.commit" -> (t: java.lang.Boolean)
)
val topics = Array("sparktest")
val stream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val result = stream.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
result.print()
ssc.start()
ssc.awaitTermination()
ssc.stop()
//To re-consume from the beginning on a restart, reset the consumer group's offsets first:
//linux>bin/kafka-consumer-groups.sh --bootstrap-server 192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092 --group demonoha --reset-offsets --all-topics --to-earliest --execute
}
}
DriverHADemo
package SparkTest.SparkStreaming
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
object DriverHADemo {
val checkpointDir = "file:///F:/JavaTest/SparkIPMappingDemo/streamingCheckpoint"
def main(args: Array[String]): Unit = {
//Recover the StreamingContext from the checkpoint directory if one exists, otherwise create a new one
val ssc: StreamingContext = StreamingContext.getOrCreate(checkpointDir, createStreamingContext)
ssc.start()
ssc.awaitTermination()
ssc.stop()
}
def createStreamingContext() = {
println("-----------------Create new StreamingContext ----------------")
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.setAppName("DriverHA")
val ssc: StreamingContext = new StreamingContext(conf, Durations.seconds(5))
ssc.sparkContext.setLogLevel("Error")
//Set the checkpoint directory so the driver can recover its state (DStream graph, offsets) after a restart
ssc.checkpoint(checkpointDir)
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "spark-demo",
"auto.offset.reset" -> "earliest"
// "enable.auto.commit" -> (t: java.lang.Boolean)
)
val topics = Array("sparktest")
val stream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val result = stream.map(_.value).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
result.print()
//linux>bin/kafka-consumer-groups.sh --bootstrap-server 192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092 --group spark-demo --reset-offsets --all-topics --to-earliest --execute
println("============================================================================")
ssc
}
}
Spark Streaming: reading data from Kafka and writing results to Redis
package SparkTest.SparkStreaming
import SparkTest.util.{JedisPoolUtil, OffsetUtils}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import redis.clients.jedis.{Jedis, Pipeline}
object SparkStreamingKafkaSaveToRedis {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local[*]")
conf.setAppName(this.getClass.getSimpleName)
val ssc = new StreamingContext(conf, Durations.seconds(5))
//Set the log level
ssc.sparkContext.setLogLevel("Error")
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "wordcount_test", //
/**
* 当没有初始的offset,或者当前的offset不存在,如何处理数据
* earliest :自动重置偏移量为最小偏移量
* latest:自动重置偏移量为最大偏移量【默认】
* none:没有找到以前的offset,抛出异常
*/
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean)
/**
* 当设置 enable.auto.commit为false时,不会自动向kafka中保存消费者offset.需要异步的处理完数据之后手动提交
*/
// "enable.auto.commit" -> (true: java.lang.Boolean) //默认是true 每5s提交一次
)
val topics = Array("sparktest")
val offset = OffsetUtils.queryHistoryOffsetFromMysql()
// The raw stream's RDDs carry the Kafka offset information (exposed via HasOffsetRanges)
val kafkaDstream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams, offset)
)
kafkaDstream.foreachRDD(rdd => {
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val result = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect()
println(result.toBuffer)
var jedies: Jedis = null
var pipeline: Pipeline = null
try {
jedies = JedisPoolUtil.getConnection()
pipeline = jedies.pipelined()
pipeline.multi()
//Save the computed word counts
for (tuple <- result) {
pipeline.hincrBy("word_count", tuple._1, tuple._2)
}
//Save the offsets
for (offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val untilOffset = offsetRange.untilOffset
pipeline.hset("wordcount_test", "sparktest-" + partition, untilOffset.toString)
}
pipeline.exec()
pipeline.sync()
} catch {
case e: Exception => {
if (pipeline != null) pipeline.discard()
e.printStackTrace()
}
} finally {
if (pipeline != null) pipeline.close()
if (jedies != null) jedies.close()
}
})
ssc.start()
ssc.awaitTermination()
ssc.stop()
}
}
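Note that the demo above writes its offsets to the Redis hash wordcount_test but restores them from MySQL via OffsetUtils. If you would rather restore them from Redis as well, a hypothetical helper could look like the sketch below; the hash key and the "topic-partition" field format match what the demo writes, but queryHistoryOffsetFromRedis itself is not part of the original code.
package SparkTest.util
import org.apache.kafka.common.TopicPartition
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._
import scala.collection.mutable
object RedisOffsetUtils {
  // Reads back offsets written as HSET wordcount_test "sparktest-<partition>" <untilOffset>
  def queryHistoryOffsetFromRedis(): Map[TopicPartition, Long] = {
    val offsets = new mutable.HashMap[TopicPartition, Long]
    var jedis: Jedis = null
    try {
      jedis = JedisPoolUtil.getConnection()
      for ((field, offset) <- jedis.hgetAll("wordcount_test").asScala) {
        // field looks like "sparktest-0": everything before the last '-' is the topic
        val idx = field.lastIndexOf("-")
        val topic = field.substring(0, idx)
        val partition = field.substring(idx + 1).toInt
        offsets(new TopicPartition(topic, partition)) = offset.toLong
      }
    } finally {
      if (jedis != null) jedis.close()
    }
    offsets.toMap
  }
}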
Redis connection utility class (JedisPoolUtil)
package SparkTest.util
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
object JedisPoolUtil {
val config = new JedisPoolConfig()
config.setMaxTotal(20)
config.setMaxIdle(10)
config.setTestOnBorrow(true)
val pool = new JedisPool(config, "192.168.58.203", 6379, 1000, "123")
def getConnection(): Jedis = {
pool.getResource
}
}
OffsetUtils
package SparkTest.util
import org.apache.kafka.common.TopicPartition
import scala.collection.mutable
object OffsetUtils {
def queryHistoryOffsetFromMysql(): Map[TopicPartition, Long] = {
val offsets = new mutable.HashMap[TopicPartition, Long]
val con = DBUtil.getConnection()
val pstmt = con.prepareStatement("select topic_partition,offset from offset where app_gid=?")
pstmt.setString(1, "test_demo3")
val rs = pstmt.executeQuery()
while (rs.next()) {
val topicAndPartition = rs.getString("topic_partition")
val offset = rs.getLong("offset")
val fields = topicAndPartition.split("-")
val topic = fields(0)
val partition = fields(1).toInt
// println(topic)
// println(partition)
// println(offset)
val topicPartition = new TopicPartition(topic, partition)
offsets(topicPartition) = offset
}
offsets.toMap
}
def main(args: Array[String]): Unit = {
println(queryHistoryOffsetFromMysql())
}
}
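OffsetUtils depends on a DBUtil helper that is not shown in these notes. A minimal sketch of what it could look like, assuming a plain JDBC connection to MySQL; the URL, user, and password are placeholders, and mysql-connector-java is assumed to be on the classpath.
package SparkTest.util
import java.sql.{Connection, DriverManager}
object DBUtil {
  // Hypothetical connection settings; adjust to your environment
  private val url = "jdbc:mysql://192.168.58.203:3306/sparktest?useSSL=false&characterEncoding=utf8"
  private val user = "root"
  private val password = "123456"
  def getConnection(): Connection = {
    DriverManager.getConnection(url, user, password)
  }
}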