SparkStreaming: log double-write and direct Flume connection

In a production environment, developers also write test data into the logs, so we either filter out the blacklisted (test) users, or sometimes capture only the blacklisted records.

Two ways to deal with it:

Log double-write ==> log
  a: normal records
  a: test records
  (the same user's normal and test records are written to separate logs;
   a minimal sketch follows below)

Blacklist filtering (done with the DStream transform operation):
  log   -- the click-log stream
  black -- the blacklist it is joined against
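
A minimal sketch of the double-write idea, assuming a "test" marker inside the line identifies test records and using placeholder output paths (the socket source matches the one used in StreamingFilterApp below):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DoubleWriteApp {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf()
      .setAppName("DoubleWriteApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    val lines = ssc.socketTextStream("hadoop000", 9997)

    // Tag each line: true = test record, false = normal record
    // (the "test" marker and the output paths are assumptions for illustration)
    val tagged = lines.map(x => (x.contains("test"), x))

    // Write the two kinds of records to separate logs so that
    // downstream jobs only read the normal log
    tagged.filter(!_._1).map(_._2).saveAsTextFiles("/tmp/log/normal/part")
    tagged.filter(_._1).map(_._2).saveAsTextFiles("/tmp/log/test/part")

    ssc.start()
    ssc.awaitTermination()
  }

}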

======================================================================================================================================================

import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer

object FilterApp {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf()
      .setAppName("FilterApp")
      .setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    // Blacklist as (user, true) pairs
    val blackTuple = new ListBuffer[(String,Boolean)]
    blackTuple.append(("sm",true))
    val blacksRDD = sc.parallelize(blackTuple)

    // Prepare the test data: the log, keyed by user
    val input = new ListBuffer[(String,String)]
    input.append(("su","20180808,su,M,20"))
    input.append(("kk","20180808,kk,M,20"))
    input.append(("sm","20180808,sm,M,20"))
    val inputRDD = sc.parallelize(input)

    // Filter out the blacklisted users: left-join the log against the blacklist,
    // then keep only the rows whose blacklist flag is absent or false
    val joinRDD = inputRDD.leftOuterJoin(blacksRDD)   // (user, (logLine, Option[Boolean]))

    joinRDD.filter(x => {
      x._2._2.getOrElse(false) != true
    }).map(_._2._1).foreach(println)
    sc.stop()
  }

}
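
The intro also mentions grabbing only the blacklisted records; using the same joinRDD as in FilterApp, that is just the inverse predicate:

    // Keep only the records of users that do appear in the blacklist
    // (the inverse of the filter in FilterApp above)
    joinRDD.filter(_._2._2.getOrElse(false)).map(_._2._1).foreach(println)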

import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Pull-based (polling) approach: Flume buffers events in a SparkSink and Spark Streaming pulls them
object FlumePullApp {

  def main(args: Array[String]) {

    val Array(hostname, port) = args

    val sparkConf = new SparkConf()
      .setAppName("FlumePullApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    val lines = FlumeUtils.createPollingStream(ssc, hostname,port.toInt)
    lines.map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).print()

    ssc.start()
    ssc.awaitTermination()
  }

}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Push-based approach: Flume's avro sink pushes events to the Spark Streaming receiver
object FlumePushApp {

  def main(args: Array[String]) {

    val Array(hostname, port) = args

    val sparkConf = new SparkConf()
      .setAppName("FlumePushApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    val lines = FlumeUtils.createStream(ssc,hostname,port.toInt)
    lines.map(x => new String(x.event.getBody.array()).trim)
      .flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).print()

    ssc.start()
    ssc.awaitTermination()
  }

}
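
Both Flume apps assume the spark-streaming-flume integration artifact is on the application classpath; it is not part of Spark core. A minimal build.sbt line as a sketch (the version is an assumption and should match the Spark version in use):

// build.sbt sketch: the Flume integration ships as a separate artifact; the version below is an assumption
libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "2.2.0"

For the pull-based approach, the Flume agent itself also needs the spark-streaming-flume-sink jar (and its Scala library dependency) in its lib directory so that the SparkSink class can be loaded.
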
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingFilterApp {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf()
      .setAppName("StreamingFilterApp")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    val blacks = List("sm","su")
    val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x=>(x,true))

    val lines = ssc.socketTextStream("hadoop000",9997)
    // Key each log line by user: (su, "20180808,su,M,20")
    val clickLogDstream = lines.map(x => (x.split(",")(1), x)).transform(rdd => {
      rdd.leftOuterJoin(blacksRDD)
        .filter(x => {
          // x._2 is (logLine, Option[Boolean]); keep users that are not flagged in the blacklist
          x._2._2.getOrElse(false) != true
      }).map(_._2._1)
    })

    clickLogDstream.print()

    ssc.start()
    ssc.awaitTermination()
  }

}

Flume agent pipeline: netcat source ==> memory channel ==> sink ==> Spark Streaming

Push mode (FlumePushApp) -- the avro sink pushes events to the Spark Streaming receiver:
a1.sinks = k1
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = localhost
a1.sinks.k1.port = 41414

Pull mode (FlumePullApp) -- events are buffered in a SparkSink and Spark Streaming polls them:
a1.sinks = spark
a1.sinks.spark.type = org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.spark.hostname = localhost
a1.sinks.spark.port = 41414


./spark-submit --master local[2] \
--class com.ruozedata.streaming.FlumePullApp \
--name FlumePullApp \
/home/hadoop/lib/train-scala-1.0-jar-with-dependencies.jar \
localhost 41414


Reposted from blog.csdn.net/qq_15300683/article/details/80690137