# -*- coding: gbk -*-
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import RuntimeContext, FlatMapFunction, MapFunction
import json
import re
import logging
import sys
from pyflink.datastream.state import ValueStateDescriptor, MapStateDescriptor, StateTtlConfig
from pyflink.datastream.connectors.kafka import FlinkKafkaConsumer
from pyflink.common.typeinfo import Types
from pyflink.datastream.connectors.elasticsearch import Elasticsearch7SinkBuilder, ElasticsearchEmitter, FlushBackoffType
from pyflink.datastream.connectors import DeliveryGuarantee
from pyflink.common.serialization import SimpleStringSchema
from datetime import datetime
from pyflink.common.time import Time
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s-%(levelname)s-%(message)s")
logger = logging.getLogger(__name__)
# Create the StreamExecutionEnvironment
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
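# Note: the Kafka source below calls set_commit_offsets_on_checkpoints(True), which only
# takes effect once checkpointing is enabled; without it, offsets are never committed back
# to Kafka. A minimal sketch (the 60s interval is an assumption, not from the original job):
# env.enable_checkpointing(60000)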
#env.add_jars("file:///root/pyflink/flink-sql-connector-kafka_2.11-1.14.4.jar")
TEST_KAFKA_SERVERS = "1.1.101.39:9092,1.1.101.40:9092,1.1.101.42:9092"
TEST_KAFKA_TOPIC = "elink-midsys-flink-topic"
TEST_GROUP_ID = "pyflink_elink_midsys"
def get_kafka_customer_properties(kafka_servers: str, group_id: str):
    properties = {
        "bootstrap.servers": kafka_servers,
        "fetch.max.bytes": "67108864",
        "key.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        "value.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        # Disable Kafka auto-commit; a bool value is rejected here, so pass the string "false"
        "enable.auto.commit": "false",
        "group.id": group_id,
    }
    return properties

properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)
class LogEvent:
    # global pipeline id
    id = None
    # source ip
    source = None
    # process name
    fileTag = None
    # file name
    fileName = None
    # scene code
    serviceCode = None
    # system name
    appName = None
    # time stamp
    timestamp = None
    # offset volume
    offset = None

    def __init__(self, id, source, fileTag, fileName, serviceCode, appName, timestamp, offset, message, index_name):
        self.id = id
        self.source = source
        self.fileTag = fileTag
        self.fileName = fileName
        self.serviceCode = serviceCode
        self.appName = appName
        self.timestamp = timestamp
        self.offset = offset
        self.message = message
        self.index_name = index_name

    def to_dict(self):
        return {
            "id": str(self.id),
            "source": str(self.source),
            "fileTag": str(self.fileTag),
            "fileName": str(self.fileName),
            "serviceCode": str(self.serviceCode),
            "appName": str(self.appName),
            "timestamp": self.timestamp,
            "offset": str(self.offset),
            "message": self.message,
            "index_name": self.index_name,
        }
class MyMapFunction(FlatMapFunction):
def open(self, runtime_context: RuntimeContext):
ttl_config = StateTtlConfig \
.new_builder(Time.seconds(120)) \
.set_update_type(StateTtlConfig.UpdateType.OnCreateAndWrite) \
.set_state_visibility(StateTtlConfig.StateVisibility.NeverReturnExpired) \
.build()
        descriptor_map = MapStateDescriptor('process_id_map_bus_seq', Types.STRING(), Types.STRING())
        descriptor_map.enable_time_to_live(ttl_config)
        self.process_id_to_bus_seq = runtime_context.get_map_state(descriptor_map)
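        # With OnCreateAndWrite and NeverReturnExpired, each map entry expires 120s after it
        # is created or last written, and an expired entry is never handed back to flat_map,
        # so stale process_id -> bus_seq mappings age out on their own.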
    def flat_map(self, raw_message):
        id = ''
        source = ''
        fileTag = ''
        fileName = ''
        serviceCode = ''
        appName = ''
        timestamp = ''
        process_id = ''
        offset = ''
        message = ''
        unique_key = ''
        try:
            raw_message = raw_message.replace("\n", "")
            # json.loads, not json.load: the input is a string, not a file object
            out = json.loads(raw_message)
            message = out['message']
            source = out['source']
            fileTag = out['file_tag']
            serviceCode = '00000'
            appName = out['app_name']
            timestamp = str(out.get('time_nano'))
            offset = out.get('offset')
            fileName = out.get('file_name')
            pattern = r".*?接收数据.*?\d{26} "
            matchObj = re.match(pattern, message)
        except Exception:
            # malformed record: skip it
            return
        if matchObj:
            try:
                pat = re.compile(r".*?接收数据.*?(\d{26}).*?")
                bus_seq = pat.search(message).group(1)
                process_id = message.split()[1]
                # build the key first, then replace any stale mapping for it
                unique_key = source + '_' + appName + '_' + fileTag + '_' + str(process_id)
                if self.process_id_to_bus_seq.contains(unique_key):
                    self.process_id_to_bus_seq.remove(unique_key)
                self.process_id_to_bus_seq.put(unique_key, bus_seq)
            except Exception:
                return
        try:
            process_id = message.split()[1]
            unique_key = source + '_' + appName + '_' + fileTag + '_' + str(process_id)
        except Exception:
            return
        try:
            bus_seq = self.process_id_to_bus_seq.get(unique_key)
        except Exception:
            return
        if not bus_seq:
            bus_seq = '0'
        id = bus_seq
        date_str = datetime.now().strftime("%Y%m%d")
        index_name = 'flink-log-elink-midsys-' + str(date_str)
        try:
            log_event = LogEvent(id, source, fileTag, fileName, serviceCode, appName, timestamp, offset, message, index_name)
        except Exception:
            return
        # print(log_event.to_dict())
        yield log_event.to_dict()
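# For reference, flat_map above expects Kafka values shaped like the following JSON
# (field names are taken from the parsing code; the values here are made up):
# {"message": "xxx 12345 ... 接收数据 ... 12345678901234567890123456 ", "source": "1.1.101.39",
#  "file_tag": "proc", "app_name": "midsys", "time_nano": 1690000000000000000,
#  "offset": 12345, "file_name": "app.log"}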
data_stream = env.add_source(
    FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
                       properties=properties,
                       deserialization_schema=SimpleStringSchema())
    .set_commit_offsets_on_checkpoints(True)
    .set_start_from_latest()
).name(f"consume topic {TEST_KAFKA_TOPIC}")
#env.add_jars("file:///root/pyflink/flink-sql-connector-elasticsearch7-3.0.1-1.16.jar")
# .set_hosts(['1.1.101.32:9200','1.1.101.33:9200','1.1.101.38:9200']) \
es_sink = Elasticsearch7SinkBuilder() \
    .set_bulk_flush_backoff_strategy(FlushBackoffType.EXPONENTIAL, 5, 1000) \
    .set_emitter(ElasticsearchEmitter.dynamic_index('index_name')) \
    .set_hosts(['1.1.101.32:9200', '1.1.101.33:9200', '1.1.101.38:9200']) \
    .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
    .set_bulk_flush_max_actions(100) \
    .set_bulk_flush_interval(1000) \
    .set_connection_request_timeout(30000) \
    .set_connection_timeout(31000) \
    .set_socket_timeout(32000) \
    .build()
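# ElasticsearchEmitter.dynamic_index('index_name') routes every record to the index named by
# its 'index_name' field, which is why flat_map stamps each event with a per-day index such
# as flink-log-elink-midsys-20230801 (the concrete date is just an example).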
def get_line_key(line):
message = ''
try:
message = line.replace("\n", "")
source = json.loads(message)['source']
    except Exception:
source = '999999'
return source
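# key_by(get_line_key) partitions the stream by 'source', so the MapState in MyMapFunction is
# scoped per source host; the '999999' fallback above keeps unparseable lines in one bucket.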
data_stream.key_by(get_line_key) \
    .flat_map(MyMapFunction(), output_type=Types.MAP(Types.STRING(), Types.STRING())).set_parallelism(2) \
    .sink_to(es_sink).set_parallelism(3)
# data_stream.key_by(get_line_key).flat_map(MyMapFunction(), output_type=Types.MAP(Types.STRING(), Types.STRING())).print()
# Execute the job
env.execute('xxx')