Requirements
- Simulate the data statistics of a smart IoT system.
- Compute the following metrics in both offline (batch) and real-time fashion (a SQL sketch of the three queries follows this list):
- 1. Devices whose signal strength is greater than 10 (a filter)
- 2. The number of devices of each device type (group-by plus aggregation)
- 3. The average signal strength of each device type (group-by, then average)
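For orientation, here are the three metrics written as Spark SQL. This is only a sketch: it assumes a SparkSession named spark and that the data has been registered as a temp view via df.createOrReplaceTempView("t_device"); the view name t_device is hypothetical.

    // Metric 1: filter on signal strength (t_device is a hypothetical view name)
    spark.sql("SELECT * FROM t_device WHERE signal > 10").show()
    // Metric 2: device count per device type
    spark.sql("SELECT deviceType, COUNT(*) AS cnt FROM t_device GROUP BY deviceType").show()
    // Metric 3: average signal strength per device type
    spark.sql("SELECT deviceType, AVG(signal) AS avg_signal FROM t_device GROUP BY deviceType").show()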
Data
- The data is in JSON format; Spark SQL can read JSON data sources directly (a quick read-and-inspect sketch follows the sample records below).
{"device": "Michael", "deviceType": "people", "signal": 15, "time": "2018-01-02 15:20:00"}
{"device": "jack", "deviceType": "people", "signal": 50, "time": "2018-01-02 15:20:00"}
{"device": "lenovo", "deviceType": "computer", "signal": 100, "time": "2018-01-02 15:20:00"}
{"device": "thunisoft", "deviceType": "soft", "signal": 30, "time": "2018-01-02 15:20:00"}
{"device": "book", "deviceType": "soft", "signal": 18, "time": "2018-01-02 15:20:00"}
{"device": "apache", "deviceType": "soft", "signal": 60, "time": "2018-01-02 15:20:00"}
{"device": "spark", "deviceType": "soft", "signal": 20, "time": "2018-01-02 15:20:00"}
{"device": "redmi", "deviceType": "phone", "signal": 10, "time": "2018-01-02 15:20:00"}
Code Implementation 1: Spark SQL Offline (Batch) Version
package cn.hanjiaxiaozhi.sql

import java.sql.Timestamp
import org.apache.spark.SparkContext
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object IoTAnalysis {
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession and quiet the log output
    val spark: SparkSession = SparkSession.builder().appName("IoTAnalysis").master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // 2. Load the JSON file as a DataFrame, and also view it as a typed Dataset
    val df: DataFrame = spark.read.json("file:///D:\\data\\spark\\device.json")
    val ds: Dataset[DeviceData] = df.as[DeviceData]

    // 3. Metric 1: devices with signal strength > 10 (several equivalent filter styles)
    df.filter($"signal" > 10).show(false)
    df.filter('signal > 10).show(false)
    df.where($"signal" > 10).show(false)
    df.where('signal > 10).show(false)
    ds.filter($"signal" > 10).show(false)
    ds.filter('signal > 10).show(false)
    ds.filter(_.signal > 10).show(false)
    ds.where($"signal" > 10).show(false)
    ds.where('signal > 10).show(false)

    // 4. Metric 2: device count per device type, sorted descending
    df.groupBy("deviceType").count().sort($"count".desc).show()
    df.groupBy("deviceType").count().sort('count.desc).show()
    // In the typed API the count column is named "count(1)"
    ds.groupByKey(_.deviceType).count().sort($"count(1)".desc).show()

    // 5. Metric 3: average signal strength per device type
    df.groupBy("deviceType").avg("signal").show(false)
    df.groupBy("deviceType").agg(avg("signal").as("avg_signal"), sum("signal")).show(false)
    // Typed aggregation via org.apache.spark.sql.expressions.scalalang.typed
    ds.groupByKey(_.deviceType).agg(typed.avg(_.signal)).toDF("deviceType", "avg").show(false)
  }
}
Code Implementation 2: Structured Streaming Real-Time Version
- 1. Notes:
- Spark Streaming could also be used, but it lacks the DataFrame/Dataset API, which makes the processing more cumbersome.
- With plain delimited strings the downstream splitting would look much like earlier exercises; this exercise uses JSON, which needs dedicated parsing.
- So the focus of this real-time version is parsing the JSON data on the fly (an alternative using Spark's built-in from_json is sketched after this list).
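Besides a third-party parser such as fastjson, Spark's built-in from_json function can parse the JSON directly in the DataFrame API. A minimal sketch, assuming kafkaDF is the Kafka source DataFrame created as in the full program below (time is kept as a string here because the sample records do not match Spark's default JSON timestamp format):

    import org.apache.spark.sql.functions.from_json
    import org.apache.spark.sql.types._
    // Schema matching the DeviceData records
    val schema: StructType = new StructType()
      .add("device", StringType)
      .add("deviceType", StringType)
      .add("signal", DoubleType)
      .add("time", StringType)
    // Cast the Kafka value to a string, parse it, then flatten the struct
    val parsedDF = kafkaDF
      .selectExpr("CAST(value AS STRING) AS json")
      .select(from_json($"json", schema).as("data"))
      .select("data.*")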
- 2. Start Kafka and open a Kafka console producer
- Run on all three nodes:
/export/servers/kafka/bin/kafka-server-start.sh /export/servers/kafka/config/server.properties >/dev/null &
- Run on one node:
/export/servers/kafka/bin/kafka-console-producer.sh --broker-list node01:9092 --topic spark_kafka
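- If the spark_kafka topic does not exist yet, it can be created first. The ZooKeeper address, partition count, and replication factor below are assumptions; adjust them to your cluster:
/export/servers/kafka/bin/kafka-topics.sh --create --zookeeper node01:2181 --partitions 3 --replication-factor 2 --topic spark_kafka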
- 3. Manually send the real-time log messages below to Kafka, one JSON record per line:
{"device": "Michael", "deviceType": "people", "signal": 15, "time": "2018-01-02 15:20:00"}
{"device": "jack", "deviceType": "people", "signal": 50, "time": "2018-01-02 15:20:00"}
{"device": "lenovo", "deviceType": "computer", "signal": 100, "time": "2018-01-02 15:20:00"}
{"device": "thunisoft", "deviceType": "soft", "signal": 30, "time": "2018-01-02 15:20:00"}
{"device": "book", "deviceType": "soft", "signal": 18, "time": "2018-01-02 15:20:00"}
{"device": "apache", "deviceType": "soft", "signal": 60, "time": "2018-01-02 15:20:00"}
{"device": "spark", "deviceType": "soft", "signal": 20, "time": "2018-01-02 15:20:00"}
{"device": "redmi", "deviceType": "phone", "signal": 10, "time": "2018-01-02 15:20:00"}
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>
package cn.hanjiaxiaozhi.sql

import java.sql.Timestamp
import com.alibaba.fastjson.JSON
import org.apache.spark.SparkContext
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object IoTAnalysis2 {
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession and quiet the log output
    val spark: SparkSession = SparkSession.builder().appName("IoTAnalysis").master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // 2. Subscribe to the Kafka topic as a streaming DataFrame
    val kafkaDF: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "node01:9092")
      .option("subscribe", "spark_kafka")
      .load()

    // 3. The Kafka value column is binary: cast it to a JSON string,
    //    then parse each record into a DeviceData with fastjson
    val jsonStrDS: Dataset[String] = kafkaDF.selectExpr("CAST(value AS STRING)").as[String]
    val ds: Dataset[DeviceData] = jsonStrDS.map(jsonStr => {
      JSON.parseObject(jsonStr, classOf[DeviceData])
    })

    // 4. The three metrics
    val result1: Dataset[DeviceData] = ds.filter(_.signal > 10)
    val result2: Dataset[(String, Long)] = ds.groupByKey(_.deviceType).count()
    val result3: Dataset[(String, Double)] = ds.groupByKey(_.deviceType).agg(typed.avg(_.signal))

    // 5. Write to the console: the filter can use append mode,
    //    while the aggregations need complete mode
    result1.writeStream
      .format("console")
      .outputMode("append")
      .option("truncate", false)
      .start()
    result2.writeStream
      .format("console")
      .outputMode("complete")
      .option("truncate", false)
      .start()
    result3.writeStream
      .format("console")
      .outputMode("complete")
      .option("truncate", false)
      .start()
      .awaitTermination()
  }
}
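One note on the design: only the last query's awaitTermination is called, and it keeps the two earlier queries alive only because it blocks the main thread. When several queries run side by side, the StreamingQueryManager can wait on all of them instead. A minimal sketch, reusing the result Datasets from the program above:

    result1.writeStream.format("console").outputMode("append").start()
    result2.writeStream.format("console").outputMode("complete").start()
    result3.writeStream.format("console").outputMode("complete").start()
    // Block until any of the started queries terminates
    spark.streams.awaitAnyTermination()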