1. Spark Streaming operators: a comparison

Operator | Method parameters | Result
reduceByKeyAndWindow | (reduce function for new data, window duration, slide duration) | (zz,1) (zz,1) (zz,1) --> empty empty empty
reduceByKeyAndWindow | (reduce function for new data, inverse function for old data, window duration, slide duration) | (xx,1) (xx,1) (xx,1) --> (xx,0) (xx,0) (xx,0)
updateStateByKey | (update function over historical state) | (a,2) (a,3) (a,4) --> (a,4) (a,4) (a,4)
2. The streaming computation proceeds as follows:

Operation | step1 | step2
Kafka message format | (timestamp, "PERSON_NAME->渠莲,ID_CARD_NO->9999") |
Spark reads the Kafka data | val dstream = KafkaUtils.createStream(…) | dstream.updateStateByKey(…)
Stream computation | save the raw data to HBase | save the aggregated statistics to MySQL
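Assembled, those steps amount to the skeleton below (a sketch only; ssc, zkQuorum, groupId, topicMap and the two utility objects are defined in the full program later in this post):

// wire both tasks onto one Kafka DStream
val dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)
val values = dstream.map(_._2) // drop the timestamp key, keep the payload

// step 1: persist the raw messages to HBase
values.foreachRDD(_.foreach(HbaseUtil.writeToHbase))

// step 2: aggregate with updateStateByKey, then persist the counts to MySQL
values.flatMap(_.split(","))
  .map(kv => (kv.split("->")(0), 1))
  .updateStateByKey((nw: Seq[Int], old: Option[Int]) => Some(nw.sum + old.getOrElse(0)))
  .foreachRDD(_.foreach { case (k, n) => MysqlUtil.insert(k, n) })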
3. How the data format is transformed inside the RDDs:

Data source | Raw format | Transformation 1 | Transformation 2 | Transformation 3 | Transformation 4
socket | "ab" | map ==> ("ab", 1) | updateStateByKey(newData: Seq[Int], oldData: Option[Int]): Option[Int] | |
kafka | (timestamp, "k1->v1,k2->v2") | map ==> "k1->v1,k2->v2" | flatMap ==> [ [k1,v1], [k2,v2] ] | map ==> (k1,v1), (k2,v2) | updateStateByKey(newData: Seq[Int], oldData: Option[Int]): Option[Int]
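The kafka row can be traced with plain Scala collections before moving to DStreams (a sketch; the payload string is the example from the table):

val payload = "k1->v1,k2->v2"                 // value part of the Kafka message
val pairsRaw = payload.split(",")             // Array("k1->v1", "k2->v2")
val kvArrays = pairsRaw.map(_.split("->"))    // Array(Array(k1, v1), Array(k2, v2))
val kvPairs = kvArrays.map(a => (a(0), a(1))) // Array((k1,v1), (k2,v2))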
1. Configure pom.xml
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.5.0-cdh5.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.10</artifactId>
<version>1.5.0-cdh5.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<version>1.5.0-cdh5.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.10.2.2</version>
</dependency>
</dependencies>
2. Write the Spark program in IDEA
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import stream.util.{HbaseUtil, MysqlUtil}
// Fields whose values are too short: keep a running total per field
object StreamLengthCount3 {
def main(args: Array[String]): Unit = {
// Spark configuration
val conf = new SparkConf().setMaster("local[*]").setAppName("test")
// streaming context configuration
val ssc = new StreamingContext(conf, Seconds(1))
ssc.checkpoint("check-data") // stateful streaming requires a checkpoint directory, otherwise it throws an error
// read the Kafka data in Spark
val zkQuorum = "127.0.0.1:2181"
val groupId = "g2"
val topicMap = Map("t2" -> 1)
val dstream = KafkaUtils.createStream(
ssc,
zkQuorum,
groupId,
topicMap,
StorageLevel.MEMORY_AND_DISK
)
println("开始streaming.....")
//任务1: 原数据保存到hbase中
val dataDstream = dstream.map(tup=>tup._2)//(时间,”k1->v1,k2->v2")
dataDstream.foreachRDD(rdd=>{
rdd.foreach(strMesg=>{
//任务1: 原数据保存到hbase中
HbaseUtil.writeToHbase(strMesg)
//返回原数据
strMesg
})
})
//任务2: 统计每个字段:长度不足的总数
val kvArrDstream = dataDstream.flatMap(_.split(","))// [ k1->v1, k2,v2 ]
val kvDstream = kvArrDstream.map(_.split("->").toSeq)//[k1, v1],[k2, v2]
val kvDstream2=kvDstream.map(seq=>{
val key = seq(0)
val value = seq(1)
var lengLowCount = 0
if (value == null || value.length < 9 ) lengLowCount = 1
// emit the new shape: (key, 1 if the length check failed, else 0)
(key, lengLowCount)
})
val kvDstream3 = kvDstream2.updateStateByKey(updateState) // running cumulative sum per key
kvDstream3.foreachRDD(rdd => { // e.g. (COST, 3518357)
  // save the aggregated counts to MySQL
  rdd.foreachPartition(iter => {
    iter.foreach { case (key, lengCount) =>
      MysqlUtil.insert(key, lengCount) // save the count to MySQL
    }
  })
})
// start the streaming job
ssc.start()
ssc.awaitTermination()
}
// total: add the new batch's sum to the previous state
def updateState(newdata:Seq[Int], oldData:Option[Int]): Option[Int] ={
val sumval = newdata.sum
val oldval = oldData.getOrElse(0)
Some(sumval+oldval)
}
}
/**
streaming started.....
==========> received Kafka message: PERSON_NAME->渠莲0,DOC_NAME->入院记录,DOC_CODE->C0034
------------- SparkStreamUtil.sparkWriteHbase ==>rowkey: 045 | gzfnetyy | 8540099 | 80
HBase row saved ok...
[Stage 155:> (0 + 3) / 4]
MySQL row saved ok...
MySQL row saved ok...
*/
/**
mysql> select * from exame_fields;
+---------------+--------+
| field | count |
+---------------+--------+
| DOC_NAME | 151 |
| DOC_CODE | 127823 |
| COST | 127823 |
| SEX | 127823 |
*/
Utility class methods:
// requires: org.apache.hadoop.hbase.HBaseConfiguration,
//           org.apache.hadoop.hbase.client.{HTable, Put},
//           org.apache.hadoop.hbase.util.Bytes
def writeToHbase(strMesg: String): Unit = {
  // task 1: write the raw message to HBase
  println("==========> received Kafka message: " + strMesg)
  val hbaseconf = HBaseConfiguration.create()
  val table = new HTable(hbaseconf, "t")
  var rowKey = ""
  val infos = strMesg.split(",")
  // extract the row key, e.g. from name->lisi,ROWKEY->123456,age->12
  if (strMesg.contains("ROWKEY")) {
    for (prop <- infos) {
      val kvArr = prop.split("->")
      if (kvArr(0).toUpperCase().contains("ROWKEY")) {
        rowKey = kvArr(1)
        println("------------- SparkStreamUtil.sparkWriteHbase ==>rowkey: " + rowKey)
      }
    }
  }
  // 1. build the Put for this row
  val put = new Put(Bytes.toBytes(rowKey))
  try {
    // 2. add every field except the row key itself to column family "f"
    for (prop <- infos) { // each prop looks like name->lisi
      if (!prop.contains("ROWKEY")) {
        val kvArr = prop.split("->")
        val propName = kvArr(0)
        val propValue = kvArr(1)
        put.add(Bytes.toBytes("f"), Bytes.toBytes(propName), Bytes.toBytes(propValue))
      }
    }
    table.put(put)
  } finally {
    table.close()
  }
  println("HBase row saved ok...")
}
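Note that this opens and closes an HTable per message. A cheaper pattern (a sketch under the same assumptions; buildPut is a hypothetical helper that would assemble the Put exactly as writeToHbase does above) opens one connection per partition inside foreachPartition:

dataDstream.foreachRDD(rdd => {
  rdd.foreachPartition(iter => {
    val table = new HTable(HBaseConfiguration.create(), "t") // one connection per partition
    try {
      iter.foreach(msg => table.put(HbaseUtil.buildPut(msg))) // buildPut: hypothetical helper
    } finally {
      table.close()
    }
  })
})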
public class MysqlUtil {
    static Connection conn;
    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "daitoue");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void insert(String field, int count) throws SQLException {
        // create the statistics table on first use ("long" is not a MySQL numeric type; use bigint)
        PreparedStatement stm = conn.prepareStatement("create table if not exists exame_fields(field varchar(20), bad_count bigint, good_count bigint)");
        stm.execute();
        // check whether this field already has a row (parameterized to avoid SQL injection)
        stm = conn.prepareStatement("select count(*) from exame_fields where field=?");
        stm.setString(1, field);
        ResultSet result = stm.executeQuery();
        while (result.next()) {
            int fieldCount = result.getInt(1);
            if (fieldCount == 0) {
                // first time this field is seen: insert a new row
                stm = conn.prepareStatement("insert into exame_fields(field,bad_count,good_count) values(?,?,?)");
                stm.setString(1, field);
                stm.setLong(2, count);
                stm.setLong(3, 0);
            } else {
                // the field exists: overwrite with the latest accumulated count
                stm = conn.prepareStatement("update exame_fields set bad_count=?, good_count=? where field=?");
                stm.setLong(1, count);
                stm.setLong(2, 0);
                stm.setString(3, field);
            }
        }
        stm.executeUpdate();
        stm.close();
        System.out.println("MySQL row saved ok...");
    }
}
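The select-then-insert-or-update above can race when several partitions write the same field at once. With a UNIQUE key on field, a single upsert statement avoids that (a sketch over plain JDBC; the UNIQUE constraint is an assumption, not something the code above creates):

import java.sql.DriverManager

def upsert(field: String, badCount: Long): Unit = {
  // assumes: ALTER TABLE exame_fields ADD UNIQUE KEY uk_field (field)
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "daitoue")
  val stm = conn.prepareStatement(
    "insert into exame_fields(field, bad_count, good_count) values(?, ?, 0) " +
    "on duplicate key update bad_count = values(bad_count)")
  try {
    stm.setString(1, field)
    stm.setLong(2, badCount)
    stm.executeUpdate()
  } finally {
    stm.close()
    conn.close()
  }
}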
3. Kafka producer code
package stream;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
public class Producer {
static KafkaProducer<String, String> producer = null;
public static void main(String[] args) throws InterruptedException, IOException {
Map<String, String> mm = new HashMap<String, String>();
// Kafka producer configuration: broker address and string serializers
Properties kafka_properties = new Properties();
kafka_properties.setProperty("bootstrap.servers", "localhost:9092");
kafka_properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
kafka_properties.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
String topic = "t2"; // must match the topic the Spark consumer subscribes to (Map("t2" -> 1) above)
// produce messages forever
int i = 0;
for (; ; ) {
mm.put("PERSON_NAME", "渠莲" + (long)( i * 1000 * Math.random()));
mm.put("DOC_NAME", "入院记录" + (long) (i * 10 * Math.random()));
mm.put("VISIT_ID", "1");
mm.put("IN_PATIENT_ID", "8540099" + (long) (i * 100 * Math.random()));
mm.put("ROWKEY", "045 | gzfnetyy | 8540099 | 8" + i);
mm.put("DEPT", "眼科");
mm.put("TIME", (long) i * 1000 * Math.random() + " / 11 / 04");
mm.put("DOC_CODE", "C0034");
mm.put("COST", i + "");
mm.put("SEX", (i % 2) + "");
sendMesg(mm,kafka_properties, topic);
if (i % 5000 == 0) // pause briefly every 5000 messages
    Thread.sleep(1000);
i++;
}
}
// method 1: send a message to Kafka
public static void sendMesg(Map<String, String> mm, Properties kafka_properties, String topic) throws IOException {
// create the producer lazily on first use
if (producer == null) {
producer = new KafkaProducer<String, String>(kafka_properties);
}
// serialize the map as k->v pairs joined by commas
StringBuffer sb = new StringBuffer();
for (String s : mm.keySet()) {
String val = mm.get(s);
sb.append(s + "->" + val + ",");
}
StringBuffer sb2 = sb.deleteCharAt(sb.length() - 1); // drop the trailing comma
producer.send(new ProducerRecord<String, String>(topic, new Date().toString(), sb2.toString())); // e.g. id->21,name->lisi
System.out.println("send mesg ok.. " + sb2.toString());
}
}