Kafka作为Flume的Channels
spool_kafka:
a1.sources=s1
a1.channels=c1
a1.sources.s1.type=spooldir
a1.sources.s1.spoolDir=/opt/flume_Method/flumeDemo/data/
a1.sources.s1.interceptors=i1
a1.sources.s1.interceptors.i1.type=regex_filter
a1.sources.s1.interceptors.i1.regex=([a-z])|\x20|^\s*\n
a1.sources.s1.interceptors.i1.excludeEvents=true
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
a1.channels.c1.kafka.topic=shopping1
a1.sources.s1.channels=c1
模拟日志文件生成(bookratings_log.sh):
shuf -n10 /opt/flume_Method/flumeDemo/BookRating.txt > /opt/flume_Method/flumeDemo/data/bookratings.log
每隔一分钟执行一次脚本文件(crontab -e):
* * * * * /opt/flume_Method/flumeDemo/conf/bookratings_log.sh
kafka_file_spark:
a1.sources=s1
a1.channels=c1
a1.sinks=k1
a1.sources.s1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.s1.kafka.bootstrap.servers=master:9092,slave1:9092,slave2:9092
a1.sources.s1.kafka.topics=shopping1
a1.sources.s1.kafka.consumer.auto.offset.reset=latest
a1.sources.s1.channels=c1
a1.channels.c1.type=file
a1.channels.c1.checkpointDir=/opt/flume_Method/data/checkpoint
a1.channels.c1.dataDirs=/opt/flume_Method/data/dataDir
a1.sinks.k1.type=org.apache.spark.streaming.flume.sink.SparkSink
a1.sinks.k1.hostname=master
a1.sinks.k1.port=16060
a1.sinks.k1.channel=c1
创建主题(注意:以下命令是创建 topic,并非启动生产者):
bin/kafka-topics.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --create --topic shopping1 --partitions 3 --replication-factor 3
消费者读取数据(消费数据)
bin/kafka-console-consumer.sh --bootstrap-server master:9092,slave1:9092,slave2:9092 --topic shopping1
SparkStreaming_API:
更换kafka_2.12-3.0.0.jar和kafka-clients-3.0.0.jar
resource: hive-site.xml
热度计算代码:
package flumeTestDemo
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* DATE:2022/9/23 14:57
* AUTHOR:GX
*/
object BookReatings { // NOTE(review): name is a typo of "BookRatings"; kept unchanged for caller/job-config compatibility
  /**
   * Streaming entry point.
   *
   * Polls book-rating events from the Flume SparkSink at master:16060 in
   * 20-second micro-batches, parses each line as "userId\tbookId\trating",
   * computes per-user and per-book rating counts and averages, derives a
   * "hot" score, and appends the sorted result to the Hive table
   * bookhot.topBookHot.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("book")
      .master("local[*]")
      .enableHiveSupport() // saveAsTable below writes through the Hive metastore
      .getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    // One micro-batch every 20 seconds; checkpoint directory enables driver recovery.
    val ssc = new StreamingContext(sc, Seconds(20))
    ssc.checkpoint("./flume")

    // Pull-based receiver: connects to the Flume SparkSink configured at master:16060.
    val stream = FlumeUtils.createPollingStream(ssc, "master", 16060)

    // Event body -> text line; trim strips leading/trailing whitespace.
    val data = stream.map(x => new String(x.event.getBody.array()).trim)

    // Parse "userId \t bookId \t rating". Malformed or short lines are dropped
    // instead of throwing, so one bad record cannot kill the streaming job
    // (the original called y(2).toInt unconditionally).
    val data_split = data
      .map(_.split("\t"))
      .filter(fields => fields.length >= 3 && fields(2).matches("-?\\d+"))
      .map(fields => (fields(0), fields(1), fields(2).toInt))

    // foreachRDD returns Unit — the original bound it to an unused val (dataubr).
    data_split.foreachRDD(line => {
      import org.apache.spark.sql.functions._
      import spark.implicits._
      val df = line.toDF()
        .withColumnRenamed("_1", "userId")
        .withColumnRenamed("_2", "bookId")
        .withColumnRenamed("_3", "ratings")

      // Per-user rating count and average.
      val ur_count = df.groupBy("userId").count()
        .withColumnRenamed("count", "count_user")
      val ur_avg = df.groupBy("userId").avg("ratings")
        .withColumnRenamed("avg(ratings)", "avg_ratings_user")

      // Per-book rating count and average.
      val br_count = df.groupBy("bookId").count()
        .withColumnRenamed("count", "count_book")
      val br_avg = df.groupBy("bookId").avg("ratings")
        .withColumnRenamed("avg(ratings)", "avg_ratings_book")

      // Join the four aggregates back onto the raw ratings, dropping the
      // duplicate join-key column each time so names stay unambiguous.
      // (The original repeated withColumnRenamed("count", ...) after these
      // joins — no-ops, since the counts were renamed before joining.)
      val user_ratings = df
        .join(ur_avg, ur_avg("userId") === df("userId"))
        .drop(ur_avg("userId"))
      val user_ratings_book = user_ratings
        .join(br_avg, br_avg("bookId") === user_ratings("bookId"))
        .drop(br_avg("bookId"))
      val user_ratings_book_user_count = user_ratings_book
        .join(ur_count, ur_count("userId") === user_ratings_book("userId"))
        .drop(ur_count("userId"))
      val total_data = user_ratings_book_user_count
        .join(br_count, br_count("bookId") === user_ratings_book_user_count("bookId"))
        .drop(user_ratings_book_user_count("bookId"))

      // Hot score: user activity weighted at 0.3 plus book popularity.
      val BookHot = total_data.withColumn("hot",
        col("count_user") * col("avg_ratings_user") * 0.3 +
          col("count_book") * col("avg_ratings_book"))

      // Sort by score descending; coalesce(1) writes a single file per batch.
      BookHot.sort(desc("hot")).coalesce(1).write.mode("append")
        .saveAsTable("bookhot.topBookHot")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}