Real-Time Book Popularity Analysis

Kafka as the Flume channel
spool_kafka:

a1.sources=s1
a1.channels=c1

a1.sources.s1.type=spooldir
a1.sources.s1.spoolDir=/opt/flume_Method/flumeDemo/data/
a1.sources.s1.interceptors=i1
a1.sources.s1.interceptors.i1.type=regex_filter
a1.sources.s1.interceptors.i1.regex=([a-z])|\x20|^\s*\n
a1.sources.s1.interceptors.i1.excludeEvents=true

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
a1.channels.c1.kafka.topic=shopping1

a1.sources.s1.channels=c1
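
Once the configuration is saved, the agent can be started with the standard flume-ng launcher (the file name spool_kafka.conf and the conf directory are assumptions based on this post's layout). Note that by default the KafkaChannel stores records as Avro-wrapped Flume events (parseAsFlumeEvent=true); set it to false if plain Kafka consumers should see the raw lines.

flume-ng agent --name a1 --conf conf --conf-file conf/spool_kafka.conf -Dflume.root.logger=INFO,console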


Simulated log generation (bookratings_log.sh):
shuf -n10 /opt/flume_Method/flumeDemo/BookRating.txt > /opt/flume_Method/flumeDemo/data/bookratings.log
Run the script once per minute (crontab -e):
* * * * * /opt/flume_Method/flumeDemo/conf/bookratings_log.sh
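
For cron to run it, the script needs a shebang and execute permission (chmod +x). A minimal sketch follows; the paths come from the post, while the timestamped file name is an assumption added because the Spooling Directory source expects each file dropped into the directory to be new and immutable:

#!/bin/bash
# Sample 10 random ratings and drop them into the spooling directory.
# A unique name per run avoids clashing with files Flume has already
# processed and renamed to *.COMPLETED.
shuf -n10 /opt/flume_Method/flumeDemo/BookRating.txt \
  > /opt/flume_Method/flumeDemo/data/bookratings_$(date +%s).log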

kafka_file_spark:

a1.sources=s1
a1.channels=c1
a1.sinks=k1

a1.sources.s1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.s1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
a1.sources.s1.kafka.topics=shopping1
a1.sources.s1.kafka.consumer.auto.offset.reset=latest
a1.sources.s1.channels=c1


a1.channels.c1.type=file
a1.channels.c1.checkpointDir=/opt/flume_Method/data/checkpoint
a1.channels.c1.dataDirs=/opt/flume_Method/data/dataDir


a1.sinks.k1.type=org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname=master
a1.sinks.k1.port=16060

a1.sinks.k1.channel=c1
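
The SparkSink class does not ship with Flume: per the Spark streaming-flume integration guide, the spark-streaming-flume-sink jar (plus the scala-library and commons-lang3 jars it depends on) must be copied into Flume's lib directory first. The agent is then launched the same way (the file name kafka_file_spark.conf is an assumption):

flume-ng agent --name a1 --conf conf --conf-file conf/kafka_file_spark.conf -Dflume.root.logger=INFO,console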

Create the Kafka topic:
bin/kafka-topics.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --create --topic shopping1 --partitions 3 --replication-factor 3
Read the data with a console consumer (to verify the pipeline):
bin/kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic shopping1
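
Each message is one rating record. The Spark job below splits on tabs and casts the third field to Int, so lines in BookRating.txt need the shape userId, bookId, rating separated by single tabs, e.g. (illustrative values, not from the source):

1001	2005	8
1002	2017	5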


SparkStreaming_API:


Swap in kafka_2.12-3.0.0.jar and kafka-clients-3.0.0.jar (replacing the older bundled Kafka client jars so the client libraries match the 3.0.0 brokers)

resources: hive-site.xml (placed on the classpath, e.g. src/main/resources, so that enableHiveSupport() can reach the Hive metastore)
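
The build also needs the spark-streaming-flume connector and Hive support. A hedged sbt sketch; the version is an assumption and must match your Spark/Scala build (the Flume connector was removed in Spark 3.x, so a 2.4.x Spark is implied by this code):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"             % "2.4.8",
  "org.apache.spark" %% "spark-hive"            % "2.4.8",
  "org.apache.spark" %% "spark-streaming"       % "2.4.8",
  "org.apache.spark" %% "spark-streaming-flume" % "2.4.8"
)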

Popularity computation code:
package flumeTestDemo

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * DATE:2022/9/23 14:57
 * AUTHOR:GX
 */
object BookRatings {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("book")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    val ssc = new StreamingContext(sc, Seconds(20))
    ssc.checkpoint("./flume")
    val stream = FlumeUtils.createPollingStream(ssc, "master", 16060)
    val data = stream.map(x => new String(x.event.getBody.array()).trim) // .trim removes leading/trailing whitespace
//    data.print()
    val data_split = data.map{
      x => val y = x.split("\t");(y(0),y(1),y(2).toInt)
    }
//      data_split.print()
    data_split.foreachRDD(rdd => {
      import org.apache.spark.sql.functions._
      import spark.implicits._
      val df = rdd.toDF("userId", "bookId", "ratings")

      val ur_count = df
//        .selectExpr("userId")
        .groupBy("userId")
        .count()
        .withColumnRenamed("count","count_user")

//      ur_count.write.mode("append").saveAsTable("book")
      val ur_avg = df.groupBy("userId").avg("ratings")
        .withColumnRenamed("avg(ratings)","avg_ratings_user")
//        .show()

      val br_count = df.groupBy("bookId").count()
        .withColumnRenamed("count","count_book")
      val br_avg = df.groupBy("bookId").avg("ratings")
        .withColumnRenamed("avg(ratings)","avg_ratings_book")

      val user_ratings = df.join(ur_avg,ur_avg("userId")===df("userId")).drop(ur_avg("userId"))
      val user_ratings_book = user_ratings.join(br_avg,br_avg("bookId")===user_ratings("bookId"))
        .drop(br_avg("bookId"))

      val user_ratings_book_user_count = user_ratings_book.join(ur_count,
        ur_count("userId") === user_ratings_book("userId"))
        .drop(ur_count("userId"))
      val total_data = user_ratings_book_user_count.join(br_count,
        br_count("bookId") === user_ratings_book_user_count("bookId"))
        .drop(br_count("bookId"))

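      // Popularity score (the author's weighting): hot =
      //   count_user * avg_ratings_user * 0.3 + count_book * avg_ratings_book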
      val BookHot = total_data.withColumn("hot",col("count_user")
        *col("avg_ratings_user")*0.3+col("count_book")
        *col("avg_ratings_book"))

//      BookHot.show()
      // sort by popularity and append to the Hive table (the database bookhot must already exist)
      BookHot.sort(desc("hot")).coalesce(1).write.mode("append")
        .saveAsTable("bookhot.topBookHot")
//      val time = new Date().getTime
//      val format = new SimpleDateFormat("HH:mm:ss")
//      println("="*20+format.format(time)+"="*20)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
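
To spot-check results as batches land, query the Hive table from a spark-shell started with Hive support (a minimal sketch; the table name comes from the code above):

// show the ten hottest books from the accumulated batches
spark.sql("SELECT bookId, hot FROM bookhot.topBookHot ORDER BY hot DESC LIMIT 10").show()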
 


Reposted from blog.csdn.net/GX_0824/article/details/127075928