1. Experiment Content
Experiment environment
Hadoop 2.7.5
Spark 2.4.0
ZooKeeper 3.4.8
Kafka 2.11-2.1.0
MySQL
Experiment content
Simulated market data is generated and written to Kafka. Structured Streaming reads the data from Kafka, performs the computation, and writes the results to MySQL; writing to MySQL is done through a custom JdbcSink.
2. Experiment Steps
2.1 Create a Maven project in IDEA
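The lab does not list the project dependencies explicitly; a minimal sketch of the pom.xml dependency section, assuming Spark 2.4.0 built for Scala 2.11, Kafka clients 2.1.0 and a MySQL 5.x JDBC driver (versions should be adjusted to the actual cluster), could look like this:

<dependencies>
    <!-- Spark SQL / Structured Streaming (Scala 2.11 build, matching Spark 2.4.0) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
    <!-- Kafka source/sink for Structured Streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
    <!-- Kafka producer API used by StockDataProducer -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.1.0</version>
    </dependency>
    <!-- MySQL JDBC driver (com.mysql.jdbc.Driver) -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.47</version>
    </dependency>
</dependencies>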
2.2 Generate simulated data and write it to Kafka: StockDataProducer.java
package com.processor.driver;

import java.util.Properties;
import java.util.Random;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class StockDataProducer {

    private KafkaProducer<String, String> kafkaProducer;

    public static void main(String[] args) {
        StockDataProducer driver = new StockDataProducer();
        driver.run();
    }

    public void run() {
        // Kafka producer configuration
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.100.10:9092");
        props.put("acks", "all");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer",
                "org.apache.kafka.common.serialization.StringSerializer");
        this.kafkaProducer = new KafkaProducer<String, String>(props);

        int i = 0;
        Random random = new Random();
        // Every second, send 11 simulated records to the "market-data-input" topic.
        // Each record value is a CSV line: id,timestamp,stockCode,buyer,seller,price,volume
        while (true) {
            for (i = 0; i <= 10; i++) {
                kafkaProducer.send(new ProducerRecord<String, String>(
                        "market-data-input", "" + i,
                        "1" + i + ",2019-01-14 15:35:45,ANZ"
                                + random.nextInt(10) + ",buyer,seller,"
                                + 100.0 * random.nextInt(100) + ","
                                + 10 * random.nextInt(100)));
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
                kafkaProducer.close();
            }
            System.out.println("write success");
        }
    }
}
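Before running the producer, the input topic has to exist (unless automatic topic creation is enabled on the broker). Assuming the broker and ZooKeeper both run on 192.168.100.10 with default ports, and a single-node setup (one partition, replication factor 1), the topic can be created and the produced records checked roughly as follows:

bin/kafka-topics.sh --create --zookeeper 192.168.100.10:2181 \
  --replication-factor 1 --partitions 1 --topic market-data-input

bin/kafka-console-consumer.sh --bootstrap-server 192.168.100.10:9092 \
  --topic market-data-input --from-beginning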
2.3 Process the Kafka data with Structured Streaming: StockDataProcessingDriver.java
The driver initializes itself by reading a configuration file that provides the Kafka connection details and other settings. The run method then builds the streaming queries and processes the result set; when the aggregated results are written to MySQL, a custom JdbcSink is used so that existing rows in the database are updated rather than only appended to.
package com.processor.driver;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import com.processor.function.KafkaOutputMapper;
import com.processor.function.StockMapper;
import com.processor.model.KafkaOutputBO;
import com.processor.model.StockBO;
import com.processor.sink.JdbcSink;
import com.processor.util.Util;

public class StockDataProcessingDriver {

    private SparkSession sparkSession;
    private String outputMode;
    private Map<String, String> config = new HashMap<String, String>();
    private Properties prop;
    private String propFileName;
    private String shutdownHook;

    public StockDataProcessingDriver() {
        this.propFileName = "config.properties";
        init();
    }

    // constructor for integration tests
    public StockDataProcessingDriver(String propertyFile, String shutdownHook) {
        this.propFileName = propertyFile;
        this.shutdownHook = shutdownHook;
        init();
    }

    public void init() {
        // Load settings from the properties file and build the SparkSession
        this.prop = Util.loadProperties(propFileName, this.getClass());
        this.sparkSession = SparkSession.builder()
                .config("spark.master", this.prop.getProperty("sparkMaster"))
                .appName("market-data-app").getOrCreate();
        this.outputMode = this.prop.getProperty("outputMode");
        // Options for the Kafka source
        config.put("kafka.bootstrap.servers", this.prop.getProperty("kafka.bootstrap.servers.sink"));
        config.put("subscribe", this.prop.getProperty("inputTopicName"));
        config.put("maxOffsetsPerTrigger", this.prop.getProperty("maxOffsetsPerTrigger"));
        config.put("startingOffsets", this.prop.getProperty("startingOffsets"));
    }

    public static void main(String[] args) {
        StockDataProcessingDriver driver = new StockDataProcessingDriver();
        driver.run();
    }

    public void run() {
        // Read the raw records from Kafka and parse each CSV value into a StockBO
        Dataset<Row> df = this.sparkSession.readStream().format("kafka").options(this.config).load();
        Dataset<Row> df_value = df.selectExpr("CAST(value AS STRING)");
        Dataset<StockBO> dsStocks = df_value.map(new StockMapper(), Encoders.bean(StockBO.class));

        // raw data storage: archive the parsed records as Parquet, partitioned by stock code
        StreamingQuery rawStorage = dsStocks.writeStream().format("parquet")
                .option("startingOffsets", "earliest")
                .option("checkpointLocation", this.prop.getProperty("checkpointRaw"))
                .option("path", this.prop.getProperty("outputPath"))
                .partitionBy("stockCode").start();

        // hourly aggregation per stock: max/min price and total volume
        Dataset<Row> aggResult = dsStocks
                .groupBy(dsStocks.col("stockCode").alias("Stock"),
                        functions.window(dsStocks.col("timestamp"), "1 hour").alias("Hour"))
                .agg(functions.max(dsStocks.col("price")).alias("Max"),
                        functions.min(dsStocks.col("price")).alias("Min"),
                        functions.sum(dsStocks.col("volume")).alias("Volume"));

        StreamingQuery queryAgg = null;

        // kafka change log, written in complete mode
        if (outputMode.equalsIgnoreCase("kafka")) {
            Dataset<KafkaOutputBO> dsKafka = aggResult.map(
                    new KafkaOutputMapper(),
                    Encoders.bean(KafkaOutputBO.class));
            queryAgg = dsKafka
                    .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) as value")
                    .writeStream().format("kafka").outputMode("complete")
                    .option("kafka.bootstrap.servers", this.prop.getProperty("kafka.bootstrap.servers.sink"))
                    .option("topic", this.prop.getProperty("outputTopicName"))
                    .option("checkpointLocation", this.prop.getProperty("checkPointSink"))
                    .start();
        }

        // writing to mysql with foreach sink in update mode
        if (outputMode.equalsIgnoreCase("jdbc")) {
            JdbcSink sink = new JdbcSink();
            queryAgg = aggResult.writeStream().outputMode("update").foreach(sink).start();
        }

        try {
            if (this.shutdownHook != null) {
                // integration tests: drain whatever data is available, then stop both queries
                rawStorage.processAllAvailable();
                queryAgg.processAllAvailable();
                queryAgg.stop();
                rawStorage.stop();
            } else {
                rawStorage.awaitTermination();
                queryAgg.awaitTermination();
            }
        } catch (StreamingQueryException e) {
            shutdown();
            e.printStackTrace();
        }
    }

    public void shutdown() {
        sparkSession.close();
    }
}
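StockDataProcessingDriver reads config.properties from the classpath (typically src/main/resources in a Maven project), and the JdbcSink in section 2.4 reads the same file for the JDBC settings. The keys below are the ones referenced in the code; the values are only placeholders for this lab environment and need to be adapted (the table name stock_agg and the output topic name are assumptions):

# Spark
sparkMaster=local[2]
outputMode=jdbc

# Kafka source/sink
kafka.bootstrap.servers.sink=192.168.100.10:9092
inputTopicName=market-data-input
outputTopicName=market-data-output
maxOffsetsPerTrigger=1000
startingOffsets=earliest

# checkpoint and raw-data output locations
checkpointRaw=/tmp/checkpoint-raw
checkPointSink=/tmp/checkpoint-sink
outputPath=/tmp/market-data-raw

# MySQL (used by JdbcSink)
url=jdbc:mysql://192.168.100.10:3306/test
user=root
password=******
tableName=stock_agg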
2.4 Connect to the database with JdbcSink and write the data: JdbcSink.java
package com.processor.sink;

import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;

import org.apache.spark.sql.ForeachWriter;
import org.apache.spark.sql.Row;

import com.processor.util.Util;

public class JdbcSink extends ForeachWriter<Row> {

    private static final long serialVersionUID = -873764205562254833L;

    public static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    public static String url;
    public static String user;
    public static String password;

    private PreparedStatement statement;
    private Connection connection;
    private Properties prop;

    public JdbcSink() {
        loadProperties();
    }

    // Read the JDBC connection settings from config.properties on the classpath
    public void loadProperties() {
        this.prop = new Properties();
        String propFileName = "config.properties";
        InputStream inputStream = this.getClass().getClassLoader()
                .getResourceAsStream(propFileName);
        if (inputStream != null) {
            try {
                prop.load(inputStream);
                user = prop.getProperty("user");
                url = prop.getProperty("url");
                password = prop.getProperty("password");
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            System.out.println("property file '" + propFileName
                    + "' not found in the classpath");
        }
    }

    @Override
    public boolean open(long partitionId, long epochId) {
        try {
            // Open one connection per partition/epoch; REPLACE INTO overwrites the
            // existing (Hour, Stock) row whenever the aggregate is updated
            Class.forName(DRIVER_CLASS);
            connection = DriverManager.getConnection(url, user, password);
            statement = connection.prepareStatement(
                    "replace into " + prop.getProperty("tableName")
                            + "(Hour,Stock,Min,Max,Volume) values(?,?,?,?,?)");
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    @Override
    public void process(Row row) {
        // arr is the flattened aggregate row produced by Util.getRowArray;
        // the indices below pick out the window start (Hour), Stock, Min, Max and Volume
        String[] arr = Util.getRowArray(row);
        try {
            statement.setString(1, Util.getHourFromTimeStamp(arr[1]));
            statement.setString(2, arr[0]);
            statement.setString(3, arr[4]);
            statement.setString(4, arr[3]);
            statement.setString(5, arr[5]);
            statement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close(Throwable errorOrNull) {
        try {
            statement.close();
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
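Because the sink writes with REPLACE INTO, the target table needs a primary (or unique) key covering the Hour/Stock pair; only then does the statement behave as an update of the existing aggregate row instead of adding a new row on every trigger. A possible table definition is sketched below; the table name stock_agg and the column types are assumptions (the sink binds every value as a string, so VARCHAR columns are sufficient for this lab):

CREATE TABLE stock_agg (
    Hour   VARCHAR(32) NOT NULL,
    Stock  VARCHAR(16) NOT NULL,
    Min    VARCHAR(32),
    Max    VARCHAR(32),
    Volume VARCHAR(32),
    PRIMARY KEY (Hour, Stock)
);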