Configuring state TTL in PyFlink
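PyFlink lets you attach a time-to-live (TTL) to keyed state so that entries are cleaned up automatically instead of growing without bound. The job below consumes JSON log lines from a Kafka topic, keys the stream by source, correlates each process id with a 26-digit business sequence number held in keyed MapState with a 120-second TTL, and writes the enriched records to Elasticsearch 7 using a per-record dynamic index name.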

# -*- coding: gbk -*-
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import RuntimeContext, FlatMapFunction, MapFunction
import json
import re
import logging
import sys
from pyflink.datastream.state import ValueStateDescriptor, MapStateDescriptor, StateTtlConfig
from pyflink.datastream.connectors.kafka import FlinkKafkaConsumer
from pyflink.common.typeinfo import Types
from pyflink.datastream.connectors.elasticsearch import Elasticsearch7SinkBuilder, ElasticsearchEmitter, FlushBackoffType
from pyflink.datastream.connectors import DeliveryGuarantee
from pyflink.common.serialization import SimpleStringSchema
from datetime import datetime
from pyflink.common.time import Time

logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s-%(levelname)s-%(message)s")
logger = logging.getLogger(__name__)

# Create the StreamExecutionEnvironment
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
#env.add_jars("file:///root/pyflink/flink-sql-connector-kafka_2.11-1.14.4.jar")

TEST_KAFKA_SERVERS = "1.1.101.39:9092,1.1.101.40:9092,1.1.101.42:9092"
TEST_KAFKA_TOPIC = "elink-midsys-flink-topic"
TEST_GROUP_ID = "pyflink_elink_midsys"


def get_kafka_customer_properties(kafka_servers: str, group_id: str):
    properties = {
        "bootstrap.servers": kafka_servers,
        "fetch.max.bytes": "67108864",
        "key.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        "value.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
        "enable.auto.commit": "false",  # disable Kafka auto-commit; passing a bool here raises an error
        "group.id": group_id,
    }
    return properties








properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)


class LogEvent:
    # id: global pipeline id
    id = None
    # source ip
    source = None
    # process name
    fileTag = None
    # file name
    fileName = None
    # scene code
    serviceCode = None
    # system name
    appName = None
    # time stamp
    timestamp = None
    # offset in the source file
    offset = None

    def __init__(self, id, source, fileTag, fileName, serviceCode, appName, timestamp, offset, message, index_name):
        self.id = id
        self.source = source
        self.fileTag = fileTag
        self.fileName = fileName
        self.serviceCode = serviceCode
        self.appName = appName
        self.timestamp = timestamp
        self.offset = offset
        self.message = message
        self.index_name = index_name

    def to_dict(self):
        return {
            "id": str(self.id),
            "source": str(self.source),
            "fileTag": str(self.fileTag),
            "fileName": str(self.fileName),
            "serviceCode": str(self.serviceCode),
            "appName": str(self.appName),
            "timestamp": self.timestamp,
            "offset": str(self.offset),
            "message": self.message,
            "index_name": self.index_name
        }











class MyMapFunction(FlatMapFunction):
    def open(self, runtime_context: RuntimeContext):
        # State entries expire 120 seconds after creation or last write,
        # and expired entries are never returned.
        ttl_config = StateTtlConfig \
            .new_builder(Time.seconds(120)) \
            .set_update_type(StateTtlConfig.UpdateType.OnCreateAndWrite) \
            .set_state_visibility(StateTtlConfig.StateVisibility.NeverReturnExpired) \
            .build()
        descriptor_map = MapStateDescriptor('process_id_map_bus_seq', Types.STRING(), Types.STRING())
        descriptor_map.enable_time_to_live(ttl_config)
        self.process_id_to_bus_seq = runtime_context.get_map_state(descriptor_map)

    def flat_map(self, raw_message):
        id = ''
        source =''
        fileTag =''
        fileName =''
        serviceCode =''
        appName =''
        timestamp =''
        process_id = ''
        offset =''
        message =''
        unique_key = ''
        try:
           raw_message = raw_message.replace("\n", "")
           #print(raw_message)
           out = json.loads(raw_message)
           message = out['message']
           source = out['source']
           fileTag = out['file_tag']
           serviceCode='00000'
           appName=out['app_name']
           timestamp=str(out.get('time_nano'))
           offset=out.get('offset')
           fileName=out.get('file_name')
           pattern = r".*?接收数据.*?\d{26} "
           matchObj = re.match(pattern, message)
        except:
             #logger.info('11111111111111111111111111111111')
             return
        if matchObj:
            try:
                pat = re.compile(r".*?接收数据.*?(\d{26}).*?")
                bus_seq = pat.search(message).group(1)
                process_id = message.split()[1]
                unique_key = source + '_' + appName + '_' + fileTag + '_' + str(process_id)
                # Drop any stale mapping for this key before writing the new one.
                if self.process_id_to_bus_seq.contains(unique_key):
                    self.process_id_to_bus_seq.remove(unique_key)
                self.process_id_to_bus_seq.put(unique_key, bus_seq)
            except:
                #print('ValueError:', e)
                #logger.info('22222222222222222222222222222222')
                return
        try:         
            process_id = message.split()[1]
            unique_key=source+'_'+ appName +'_'+ fileTag +'_'+str(process_id)
        except:
            #print('ValueError:', e)
            #logger.info('333333333333333333333')
            return
        try:
            bus_seq = self.process_id_to_bus_seq.get(unique_key)
        except:
            return
        if not bus_seq:
            bus_seq = '0'
        id=bus_seq
        # self.r.delete(process_id)
        # log_event = LogEvent(bus_seq.decode('UTF-8'),message)
        # LogEvent['bus_seq']= bus_seq.decode('UTF-8')
        date_str = datetime.now().strftime("%Y%m%d")
        index_name = 'flink-log-elink-midsys-'+ str(date_str)
        try:
            log_event = LogEvent(id, source, fileTag, fileName, serviceCode, appName, timestamp, offset, message, index_name)
        except:
            return
        #print(log_event.to_dict())
   
        yield log_event.to_dict()
     


data_stream = env.add_source(
    FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
        properties=properties,
        deserialization_schema=SimpleStringSchema()) \
        .set_commit_offsets_on_checkpoints(True) \
        .set_start_from_latest()
).name(f"consume data from topic {TEST_KAFKA_TOPIC}")

#env.add_jars("file:///root/pyflink/flink-sql-connector-elasticsearch7-3.0.1-1.16.jar")

# .set_hosts(['1.1.101.32:9200','1.1.101.33:9200','1.1.101.38:9200']) \
es_sink = Elasticsearch7SinkBuilder() \
        .set_bulk_flush_backoff_strategy(FlushBackoffType.EXPONENTIAL, 5, 1000) \
        .set_emitter(ElasticsearchEmitter.dynamic_index('index_name')) \
        .set_hosts(['1.1.101.32:9200','1.1.101.33:9200','1.1.101.38:9200']) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_bulk_flush_max_actions(100) \
        .set_bulk_flush_interval(1000) \
        .set_connection_request_timeout(30000) \
        .set_connection_timeout(31000) \
        .set_socket_timeout(32000) \
        .build()


def get_line_key(line):
    message = ''
    try:
        message = line.replace("\n", "")
        source = json.loads(message)['source']
    except:
        source = '999999'
    return source

data_stream.key_by(get_line_key).flat_map(MyMapFunction(), output_type=Types.MAP(Types.STRING(), Types.STRING())).set_parallelism(2).sink_to(es_sink).set_parallelism(3)
#data_stream.key_by(get_line_key).flat_map(MyMapFunction(), output_type=Types.MAP(Types.STRING(), Types.STRING())).print()

# Execute the job
env.execute('xxx')
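
The core of the TTL setup is the StateTtlConfig attached to the MapStateDescriptor in open(): with UpdateType.OnCreateAndWrite the 120-second timer is reset whenever an entry is created or written, and with StateVisibility.NeverReturnExpired an expired entry is never returned even if it has not yet been physically cleaned up. A minimal, self-contained sketch of the same pattern (the state name and TTL value here are illustrative, not part of the job above):

from pyflink.common.time import Time
from pyflink.common.typeinfo import Types
from pyflink.datastream.state import MapStateDescriptor, StateTtlConfig

# Entries expire 120 seconds after creation or last write.
ttl_config = StateTtlConfig \
    .new_builder(Time.seconds(120)) \
    .set_update_type(StateTtlConfig.UpdateType.OnCreateAndWrite) \
    .set_state_visibility(StateTtlConfig.StateVisibility.NeverReturnExpired) \
    .build()

descriptor = MapStateDescriptor('my_map_state', Types.STRING(), Types.STRING())
descriptor.enable_time_to_live(ttl_config)
# Inside a keyed function's open():
#     self.my_state = runtime_context.get_map_state(descriptor)

Note also that ElasticsearchEmitter.dynamic_index('index_name') routes each emitted dict to the index named by its 'index_name' field, which is why LogEvent.to_dict() carries index_name alongside the payload.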
 


Origin blog.csdn.net/zhaoyangjian724/article/details/131646523