上篇已经写了一篇flume跟kafka连接的例子,现在聊聊flume直接连接Spark Streaming的2种方式
1.push:flume将数据推到streaming,这种方式的缺点是:如果flume推送过来的数据spark接收不了,那么就会导致数据的丢失
2.poll:flume先将数据缓存在SparkSink中,再由streaming主动去拉取数据
先介绍push方式
创建一个文件 在kafka/data/fulmeToKafka.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source: a netcat source listening on localhost:44444
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe the sink
# Push mode: the Spark job receives events via FlumeUtils.createStream(ssc, "10.8.160.27", 8888),
# so the agent must use an Avro sink whose hostname/port match that address.
#a1.sinks.k1.type = logger
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 10.8.160.27
a1.sinks.k1.port = 8888
# Kafka sink settings from the previous article, kept for reference only; with them
# enabled the events go to Kafka and never reach the Spark receiver.
#a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
#a1.sinks.k1.topic = test01
#a1.sinks.k1.brokerList = es2:9092
#a1.sinks.k1.requiredAcks = 1
#a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
scala代码:
/**
 * Entry point of the push-mode Flume -> Spark Streaming word count.
 *
 * Runs a local streaming job with 3-second batches, receives the events that
 * Flume pushes to 10.8.160.27:8888, splits each event body into words and
 * prints the running count per word.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  // Only show warnings and errors from loggers under "org"
  Logger.getLogger("org").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("flume").setMaster("local[2]")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(3))
  // Push mode: Flume sends data to Spark. This IP is the address of the Windows machine.
  val flume: ReceiverInputDStream[SparkFlumeEvent] = FlumeUtils.createStream(ssc, "10.8.160.27", 8888)
  // Checkpoint directory used by the stateful updateStateByKey below
  sc.setCheckpointDir("e:/check")
  val res = flume.flatMap { t =>
    // Decode explicitly as UTF-8 instead of relying on the JVM's platform default charset
    val line = new String(t.event.getBody.array(), java.nio.charset.StandardCharsets.UTF_8)
    // Split on any run of whitespace and drop empty tokens, so blank bodies, repeated
    // spaces or a trailing "\r" (if the client sends CRLF) are not counted as words
    line.split("\\s+").filter(_.nonEmpty)
  }.map((_, 1)).updateStateByKey(myFunc, new HashPartitioner(sc.defaultParallelism), true)
  res.print()
  ssc.start()
  // Block the main thread until the streaming job is stopped or fails
  ssc.awaitTermination()
}
/**
 * State-update function handed to `updateStateByKey`.
 *
 * For every `(key, newValues, previousState)` triple it yields `(key, total)`,
 * where `total` is the sum of `newValues` plus the previous state
 * (0 when there is no previous state).
 */
def myFunc = (it: Iterator[(String, Seq[Int], Option[Int])]) =>
  it.map { case (word, newCounts, runningTotal) =>
    val previous = runningTotal.getOrElse(0)
    word -> (newCounts.sum + previous)
  }
测试 开启flume,bin/flume-ng agent --conf conf/ --name a1 --conf-file data/fulmeToKafka.conf
开启scala代码,使用telnet localhost 44444模拟生产数据
第二种:poll方式(streaming主动从flume拉取数据)
需要下载一个jar包
但是这种方法必须要引入Spark官方的一个jar包,见官方的文档:点击跳转,将jar下载下来放到flume安装包的lib目录下即可,点击直接下载jar包
创建一个文件 在kafka/data/fulmeKafka2.conf
# Agent name (a1) and the names of its source, channel and sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1
# Source definition
# (a spooldir source is left commented out; the netcat source on localhost:44444 is used)
#a1.sources.r1.type = spooldir
#a1.sources.r1.spoolDir = /home/hadoop/monitor
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Channel definition: in-memory buffer
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 100
# Sink definition: Spark's SparkSink; the hostname/port here are the address the
# Spark job polls with FlumeUtils.createPollingStream
a1.sinks.k1.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname = 192.168.244.161
a1.sinks.k1.port = 8889
# Wire the source and the sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
scala代码:
/**
 * State-update function handed to `updateStateByKey`.
 *
 * Maps each `(key, newValues, previousState)` triple to `(key, total)`:
 * the sum of the new values added to the previous state, with a missing
 * previous state treated as 0.
 */
def myFunc = (it: Iterator[(String, Seq[Int], Option[Int])]) =>
  it.map { case (word, newCounts, runningTotal) =>
    val previous = runningTotal.getOrElse(0)
    word -> (newCounts.sum + previous)
  }
/**
 * Entry point of the poll-mode Flume -> Spark Streaming word count.
 *
 * Runs a local streaming job with 5-second batches, pulls events from the
 * Flume sink at 192.168.244.161:8889, splits each event body into words and
 * prints the running count per word.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  // Only show warnings and errors from loggers under "org"
  Logger.getLogger("org").setLevel(Level.WARN)
  val conf = new SparkConf().setAppName("flume2").setMaster("local[2]")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(5))
  // Address(es) of the Flume sink(s) to poll; must match the sink's hostname/port in the agent config
  val addrs = Seq(new InetSocketAddress("192.168.244.161", 8889))
  val ds = FlumeUtils.createPollingStream(ssc, addrs, StorageLevel.MEMORY_AND_DISK_2)
  // Checkpoint directory used by the stateful updateStateByKey below
  sc.setCheckpointDir("e:/check")
  val res = ds.flatMap { x =>
    // Decode explicitly as UTF-8 instead of relying on the JVM's platform default charset
    val line = new String(x.event.getBody.array(), java.nio.charset.StandardCharsets.UTF_8)
    // Split on any run of whitespace and drop empty tokens, so blank bodies, repeated
    // spaces or a trailing "\r" (if the client sends CRLF) are not counted as words
    line.split("\\s+").filter(_.nonEmpty)
  }.map((_, 1)).updateStateByKey(myFunc, new HashPartitioner(sc.defaultParallelism), true)
  res.print()
  ssc.start()
  // Block the main thread until the streaming job is stopped or fails
  ssc.awaitTermination()
}
测试 启动flume bin/flume-ng agent --conf conf/ --name a1 --conf-file data/fulmeKafka2.conf
开启scala代码,使用telnet localhost 44444
测试结果跟上面一样,在此就不截图了