版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lyzx_in_csdn/article/details/81172299
pom文件中的依赖如下:
<!-- Spark Streaming (DStream API), Scala 2.11 build. NOTE(review): the Java example below imports only org.apache.spark.sql.*, so this may be unnecessary; confirm before removing. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- Kafka 0.10 integration for the DStream API. NOTE(review): not referenced by the Java example below; confirm whether it is needed. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- Spark SQL: supplies SparkSession, Dataset, Row and the streaming classes imported by the Java example. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- Kafka source/sink for Structured Streaming; required for readStream().format("kafka"). -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- log4j 1.x logging. All Spark artifacts above share version 2.3.0 and the _2.11 Scala suffix; keep them aligned when upgrading. -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- Apache Commons Lang 3. NOTE(review): not imported by the Java example below; confirm whether it is needed. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.5</version>
</dependency>
下面的代码在Kafka的Topic未启用ACL权限认证时运行良好:
package com.unistack.calc.structstream;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import static org.apache.spark.sql.functions.*;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
/**
 * Structured Streaming demo: reads JSON records from the Kafka topics {@code yh1} and
 * {@code yh2}, parses each into three integer columns, inner-joins the two streams on
 * {@code id = yh2_id} and prints the joined rows to the console in append mode.
 *
 * <p>Example records: {@code {"id":1,"age":1,"height":1}} on {@code yh1} and
 * {@code {"yh2_id":1,"yh2_age":1,"yh2_height":1}} on {@code yh2}.
 */
public class StructedStreamTest2 {

    /** Kafka bootstrap servers shared by both source streams. */
    private static final String ips = "192.168.1.110:9093,192.168.1.111:9093,192.168.1.112:9093";

    /**
     * Starts the streaming join and blocks until the query terminates.
     *
     * @param args unused
     * @throws IllegalStateException if the streaming query terminates with an error; the
     *     original {@link StreamingQueryException} is preserved as the cause
     */
    public static void main(String[] args) {
        // For a SASL-protected cluster a JAAS login config is presumably required, e.g.:
        // System.setProperty("java.security.auth.login.config","/Users/frank/Desktop/shell/lyh.conf");
        SparkSession spark = SparkSession
                .builder()
                .appName("app")
                .master("local[6]")
                .getOrCreate();
        try {
            Dataset<Row> d1 = readJsonTopic(spark, "yh1", "id", "age", "height");
            Dataset<Row> d2 = readJsonTopic(spark, "yh2", "yh2_id", "yh2_age", "yh2_height");

            // NOTE(review): the join condition has no event-time range constraint, so the
            // stream-stream join state is not expected to be cleaned up; confirm this is
            // acceptable outside of a demo.
            StreamingQuery query = d1.join(d2, expr("id = yh2_id"))
                    .writeStream()
                    .format("console")
                    .outputMode(OutputMode.Append())
                    .start();

            query.awaitTermination();
        } catch (StreamingQueryException e) {
            // Surface the failure to the caller instead of only printing it.
            throw new IllegalStateException("Streaming join of topics yh1 and yh2 failed", e);
        } finally {
            // Release the local Spark context whether the query ended normally or not.
            spark.stop();
        }
    }

    /**
     * Builds a streaming Dataset that reads {@code topic} from the earliest offset, casts
     * the Kafka {@code value} to a string, parses it as JSON and exposes the given fields
     * as top-level nullable integer columns.
     *
     * @param spark      active session used to create the stream reader
     * @param topic      Kafka topic to subscribe to
     * @param fieldNames names of the integer JSON fields to extract, in output column order
     * @return streaming Dataset with one column per entry of {@code fieldNames}
     */
    private static Dataset<Row> readJsonTopic(SparkSession spark, String topic, String... fieldNames) {
        StructType schema = new StructType();
        String[] projection = new String[fieldNames.length];
        for (int i = 0; i < fieldNames.length; i++) {
            schema = schema.add(fieldNames[i], IntegerType, true);
            projection[i] = "v." + fieldNames[i];
        }

        Dataset<Row> raw = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", ips)
                .option("startingOffsets", "earliest")
                // Kafka client settings must carry the "kafka." prefix to reach the consumer:
                // .option("kafka.security.protocol","SASL_PLAINTEXT")
                // .option("kafka.sasl.mechanism","PLAIN")
                .option("subscribe", topic)
                .load();

        // NOTE(review): the watermark is declared on "timestamp", but the projection right
        // after it keeps only "value", so the event-time column is not part of the result.
        return raw
                .withWatermark("timestamp", "1 hours")
                .selectExpr("CAST(value AS STRING)")
                .select(from_json(col("value"), schema).as("v"))
                .selectExpr(projection);
    }
}
参考文章:
https://sonra.io/2017/11/27/advanced-spark-structured-streaming-aggregations-joins-checkpointing/
其中yh1中的数据如下:
{"id":1,"age":1,"height":1}
yh2中的数据如下:
{"yh2_id":1,"yh2_age":1,"yh2_height":1}