The Kafka message format is (None, [JSON string]).
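For reference, each of the scripts below reads the 'time' and 'pool' fields out of the JSON payload and concatenates them into the HBase row key. The exact payload is not shown here, so the following is only a hypothetical example of a message value built around those two required fields:

import json

# Hypothetical message payload: only 'time' and 'pool' are actually required by the
# scripts below, which build the HBase row key as str(time) + '-' + pool.
# Any additional fields simply end up as ss:<field> columns.
sample = {
    "time": 1554105600,   # placeholder timestamp
    "pool": "pool-01",    # placeholder pool name
    "level": "INFO"       # placeholder extra field
}
value = json.dumps(sample)  # this JSON string is the Kafka message value
print(value)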
Using Python 3, there are three ways to write Kafka messages into HBase.
1. Consume Kafka messages directly and write them to HBase:
from kafka import KafkaConsumer
import time
import happybase
import json

hbase_ip = '192.168.xxx.xxx'
hbase_port = 9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)

# Insert data into tableName
def hbase_load(tableName, lists):
    with pool.connection() as connection:
        connection.open()
        if tableName not in str(connection.tables()):
            create_table(connection, tableName)
        # print(tableName, str(connection.tables()))
        table = connection.table(tableName)
        b = table.batch(batch_size=1024)
        for li in lists:
            try:
                rowkey = li['info']
                data_dicts = {}
                for d, x in li.items():
                    key = "ss:" + d
                    value = str(x)
                    data_dicts[key] = value
                b.put(row=rowkey, data=data_dicts)
                b.send()
                print("rowkey:" + rowkey + " data append success")
            except Exception as ex:
                print(str(ex) + " failed to insert data")
        connection.close()

# Create the HBase table
def create_table(conn, table):
    try:
        conn.create_table(
            table,
            {
                "ss": dict(max_versions=10)
            }
        )
    except Exception as ex:
        print(str(ex) + " table exists !!!")

# Print a timestamped log line
def log(str):
    t = time.strftime(r"%Y-%m-%d_%H-%M-%S", time.localtime())
    print("[%s]%s" % (t, str))

lst = []
log('start consumer')
# Consume the 'logfile' topic on 192.168.xxx.xxx:9092 with consumer group 'test-consumer-group'
consumer = KafkaConsumer('logfile',
                         group_id='test-consumer-group',
                         bootstrap_servers=['192.168.xxx.xxx:9092'])
for msg in consumer:
    recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
    log(recv)
    dict_data = json.loads(msg.value)
    dict_data['info'] = str(dict_data['time']) + '-' + dict_data['pool']
    lst.append(dict_data)
    hbase_load('logfile_zf', lst)
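After the consumer has been running for a while, you can spot-check what landed in HBase. Below is a minimal verification sketch (not part of the original script), assuming the HBase Thrift server runs on the same host and port as configured above:

import happybase

# Verification sketch: scan the first few rows of the table written by the consumer.
# Assumes the HBase Thrift server configured above (192.168.xxx.xxx:9090).
connection = happybase.Connection('192.168.xxx.xxx', port=9090)
table = connection.table('logfile_zf')

for rowkey, data in table.scan(limit=5):
    # Row keys come back as bytes; columns appear as b'ss:<field>' -> value
    print(rowkey, data)

connection.close()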
2. Use Spark Streaming to write the RDD to HBase directly:
For the HBase write configuration, refer to: http://dblab.xmu.edu.cn/blog/1715-2/
Note: Spark 2.0 does not ship with the JAR that converts HBase data into a format Python can read, so it has to be downloaded separately.
Download the JAR package spark-examples_2.11-1.6.0-typesafe-001.jar.
#!/usr/bin/env python3
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import time

conf = SparkConf().setAppName("logSparkStreaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 5)
# HBase table; it must be created in HBase beforehand
table = 'logfile_stream2'
broker = "192.168.xxx.xxx:9092"
# Kafka topic
topic = "logfile"
# HBase ZooKeeper quorum
hbaseZK = "192.168.xxx.xxx"
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
hbaseConf = {"hbase.zookeeper.quorum": hbaseZK,
             "hbase.mapred.outputtable": table,
             "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
             "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
             "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}

# Print a timestamped log line
def log(str):
    t = time.strftime(r"%Y-%m-%d %H:%M:%S", time.localtime())
    print("[%s]%s" % (t, str))

# Format an RDD element (which must be a dict) into the tuples HBase expects
def fmt_data(msg_dict):
    if msg_dict is not None:
        msg_dict['info'] = str(msg_dict['time']) + '-' + msg_dict['pool']
        rowkey = msg_dict['info']
        lst = []
        for d, x in msg_dict.items():
            col_name = d
            col_value = str(x)
            col_family = 'ss'
            # Each key/value pair of the dict must become a tuple of the form
            # (rowkey, [row key, column family, column name, value]) before writing to HBase
            msg_tuple = (rowkey, [rowkey, col_family, col_name, col_value])
            print("rowkey:" + rowkey + "\ndata " + str(msg_tuple) + " append success")
            lst.append(msg_tuple)
        return lst

# Process the RDD and write it to HBase
def connectAndWrite(data):
    if not data.isEmpty():
        # Received RDD elements have the form (None, [JSON string]), so deserialize
        # the second element of each pair into a dict
        msg_list = data.map(lambda x: json.loads(x[1]))
        # Print the RDD for inspection; it looks like a list whose elements are dicts
        log(msg_list.collect())
        try:
            # Convert each element into the tuple format required for the HBase write
            msg_row = msg_list.map(lambda x: fmt_data(x))
            # print(msg_row.flatMap(lambda x: x).map(lambda x: x).collect())
            # Flatten the per-record tuple lists, then save to HBase
            msg_row.flatMap(lambda x: x).map(lambda x: x).saveAsNewAPIHadoopDataset(conf=hbaseConf,
                                                                                    keyConverter=keyConv,
                                                                                    valueConverter=valueConv)
        except Exception as ex:
            print(str(ex) + " failed to insert data")

kafkaStreams = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams={"metadata.broker.list": broker})
# kafkaStreams.pprint()
kafkaStreams.foreachRDD(connectAndWrite)
log('start consumer')
ssc.start()
ssc.awaitTermination()
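Unlike method 1, this script does not create the table itself, so 'logfile_stream2' has to exist before the job starts. One way to pre-create it is with happybase, as in method 1; the sketch below assumes the Thrift server from the first example:

import happybase

# One-off helper (assumption: Thrift server at 192.168.xxx.xxx:9090) to pre-create
# the table used by the streaming job. The 'ss' column family matches fmt_data() above.
connection = happybase.Connection('192.168.xxx.xxx', port=9090)
if b'logfile_stream2' not in connection.tables():
    connection.create_table('logfile_stream2', {'ss': dict(max_versions=10)})
connection.close()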
The spark-submit command is as follows:
$SPARK_HOME/bin/spark-submit --master local --packages org.apache.spark:spark-streaming-kafka_2.11:1.6.0 --jars spark-examples_2.11-1.6.0-typesafe-001.jar /home/user/spark/sparkstreaming_kafka2.py > /home/user/spark/sparkstreaming_kafka.log
Note: spark-examples_2.11-1.6.0-typesafe-001.jar is the JAR that converts HBase data into a format Python can read.
3. Collect the RDD data out of Spark Streaming and write it to HBase:
#!/usr/bin/env python3
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
import json
import time
import happybase

hbase_ip = '192.168.xxx.xxx'
hbase_port = 9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)

# Create the HBase table
def create_table(conn, table):
    try:
        conn.create_table(
            table,
            {
                "ss": dict(max_versions=10)
            }
        )
    except Exception as ex:
        print(str(ex) + " table exists !!!")

# Print a timestamped log line
def log(str):
    t = time.strftime(r"%Y-%m-%d_%H-%M-%S", time.localtime())
    print("[%s]%s" % (t, str))

def writeHbase(msg):
    with pool.connection() as connection:
        connection.open()
        if table not in str(connection.tables()):
            create_table(connection, table)
        # print(table, str(connection.tables()))
        hbaseTable = connection.table(table)
        b = hbaseTable.batch(batch_size=1024)
        if not msg.isEmpty():
            # print(msg.collect())
            msg_rdd = msg.map(lambda x: json.loads(x[1]))
            # Collect the RDD data into Python variables on the driver, then write to HBase
            msg_list = msg_rdd.collect()
            lst = []
            for msg_dict in msg_list:
                # print(msg_dict)
                msg_dict['info'] = str(msg_dict['time']) + '-' + msg_dict['pool']
                lst.append(msg_dict)
                # print(lst)
                try:
                    rowkey = msg_dict['info']
                    data_dict = {}
                    for d, x in msg_dict.items():
                        key = "ss:" + d
                        value = str(x)
                        data_dict[key] = value
                    b.put(row=rowkey, data=data_dict)
                    b.send()
                    print("rowkey:" + rowkey + "\ndata " + str(data_dict) + " append success")
                except Exception as ex:
                    print(str(ex) + " failed to insert data")
        connection.close()

conf = SparkConf().setAppName("logSparkStreaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)
sqc = SQLContext(sc)
table = 'logfile_stream'
broker = "192.168.xxx.xxx:9092"
topic = "logfile"

kafkaStreams = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams={"metadata.broker.list": broker})
# kafkaStreams.pprint()
kafkaStreams.foreachRDD(writeHbase)
log('start consumer')
ssc.start()
ssc.awaitTermination()
The corresponding spark-submit command:
$SPARK_HOME/bin/spark-submit --master local[3] --packages org.apache.spark:spark-streaming-kafka_2.11:1.6.0 /home/user/spark/sparkstreaming_kafka.py > /home/user/spark/sparkstreaming_kafka.log
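To exercise any of the three pipelines end to end, push a test message into the 'logfile' topic. The sketch below uses kafka-python; the 'time' and 'pool' fields are required because every variant builds the row key from them, everything else is placeholder data:

from kafka import KafkaProducer
import json
import time

# Publish one JSON test message to the 'logfile' topic.
# Assumes the same broker address as in the scripts above; 'time' and 'pool' are the
# only fields the consumers rely on, the rest is placeholder data.
producer = KafkaProducer(bootstrap_servers=['192.168.xxx.xxx:9092'],
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))

producer.send('logfile', {'time': int(time.time()), 'pool': 'pool-01', 'level': 'INFO'})
producer.flush()
producer.close()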