Spark 2.3 Structured Streaming: stream-stream joins

Copyright notice: This is an original post by the author; do not repost without permission. https://blog.csdn.net/lyzx_in_csdn/article/details/81172299

The pom.xml dependencies are as follows:


<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.5</version>
</dependency>

The following code runs fine as long as the Kafka topics have no ACL authentication enabled.
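If the topics do require authentication (e.g. SASL/PLAIN), the commented-out lines in the code below hint at the setup. As a rough sketch (not tested here): point the JVM at a JAAS config file via java.security.auth.login.config, with placeholder credentials like the following, and remember that the Spark Kafka source only forwards client options that carry the kafka. prefix.

KafkaClient {
    org.apache.kafka.common.security.plain.PlainLoginModule required
    username="your-user"
    password="your-password";
};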

package com.unistack.calc.structstream;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import static org.apache.spark.sql.functions.*;
import static org.apache.spark.sql.types.DataTypes.IntegerType;


public class StructedStreamTest2 {


    private static final String ips = "192.168.1.110:9093,192.168.1.111:9093,192.168.1.112:9093";

    public static void main(String[] args) {
        // When Kafka requires SASL, point the JVM at a JAAS config file:
//        System.setProperty("java.security.auth.login.config","/Users/frank/Desktop/shell/lyh.conf");

        SparkSession spark = SparkSession
                                .builder()
                                .appName("app")
                                .master("local[6]")
                                .getOrCreate();


        Dataset<Row> df1 = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers",ips)
                .option("startingOffsets", "earliest")
                // Kafka client options must carry the "kafka." prefix to reach the consumer:
//                .option("kafka.security.protocol","SASL_PLAINTEXT")
//                .option("kafka.sasl.mechanism","PLAIN")
                .option("subscribe", "yh1")
                .load();


        // Schema of the JSON payload in topic yh1.
        MetadataBuilder b = new MetadataBuilder();
        StructField[] fields = {
                new StructField("id",IntegerType, true,b.build()),
                new StructField("age",IntegerType, true,b.build()),
                new StructField("height",IntegerType, true,b.build())
        };

        StructType type = new StructType(fields);
        // NOTE: the watermark is declared on the Kafka "timestamp" column, but the
        // projections below drop that column, so it has no effect here (same for d2).
        // A plain inner join still runs in append mode without it; see the
        // time-bounded variant after the listing for a join that really uses watermarks.
        Dataset<Row> d1 = df1
                .withWatermark("timestamp","1 hours")
                .selectExpr("CAST(value AS STRING)")
                .select(from_json(col("value"),type).as("v"))
                .selectExpr("v.id","v.age","v.height");


        Dataset<Row> df2 = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers",ips)
                .option("startingOffsets", "earliest")
//                .option("kafka.security.protocol","SASL_PLAINTEXT")
//                .option("kafka.sasl.mechanism","PLAIN")
                .option("subscribe", "yh2")
                .load();

        // Schema of the JSON payload in topic yh2.
        StructField[] fields2={
                new StructField("yh2_id",IntegerType, true,b.build()),
                new StructField("yh2_age",IntegerType, true,b.build()),
                new StructField("yh2_height",IntegerType, true,b.build())
        };

        StructType type2 = new StructType(fields2);
        Dataset<Row> d2 = df2
                .withWatermark("timestamp","1 hours")
                .selectExpr("CAST(value AS STRING)")
                .select(from_json(col("value"),type2).as("v"))
                .selectExpr("v.yh2_id","v.yh2_age","v.yh2_height");

        // Inner stream-stream join on id = yh2_id, printed to the console in append mode.
        StreamingQuery query = d1.join(d2,expr("id = yh2_id"))
                .writeStream()
                .format("console")
                .outputMode(OutputMode.Append())
                .start();

        try{
            query.awaitTermination();
        }catch(StreamingQueryException e) {
            e.printStackTrace();
        }

//  For reference (PySpark, from the Databricks post linked below): writing a stream back to Kafka:
//  query = df \
//  .selectExpr("CAST(userId AS STRING) AS key", "to_json(struct(*)) AS value") \
//  .writeStream \
//  .format("kafka") \
//  .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \
//  .option("topic", "topic1") \
//  .option("checkpointLocation", "/path/to/HDFS/dir") \
//  .start()
    }
}
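As noted in the comments, the watermarks in the listing are dropped by the projections and never take effect. Below is a sketch (not part of the original post) of how the same join could keep the Kafka event-time columns, so that the watermarks plus a time-range condition actually bound the join state; the column names ts1 and ts2 are made up for illustration, and the pattern follows the Spark stream-stream join documentation.

        // Keep the Kafka "timestamp" column through the projections so the watermarks survive.
        Dataset<Row> left = df1
                .selectExpr("CAST(value AS STRING) AS value", "timestamp")
                .select(from_json(col("value"), type).as("v"), col("timestamp").as("ts1"))
                .selectExpr("v.id", "v.age", "v.height", "ts1")
                .withWatermark("ts1", "1 hours");

        Dataset<Row> right = df2
                .selectExpr("CAST(value AS STRING) AS value", "timestamp")
                .select(from_json(col("value"), type2).as("v"), col("timestamp").as("ts2"))
                .selectExpr("v.yh2_id", "v.yh2_age", "v.yh2_height", "ts2")
                .withWatermark("ts2", "1 hours");

        // The time-range predicate lets Spark drop state older than the watermark on both sides.
        Dataset<Row> joined = left.join(right,
                expr("id = yh2_id AND ts2 >= ts1 - interval 1 hour AND ts2 <= ts1 + interval 1 hour"));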

References:

https://sonra.io/2017/11/27/advanced-spark-structured-streaming-aggregations-joins-checkpointing/

https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html

The data in topic yh1 looks like this:

{"id":1,"age":1,"height":1}

The data in topic yh2 looks like this:

{"yh2_id":1,"yh2_age":1,"yh2_height":1}
