Kafka Source Code Insights



I have seven or eight years of experience with Kafka... That said, this post was written quite a while ago and parts of it are out of date, especially now that Kafka has moved on and the API looks different. Still, the point of studying the Kafka source is mainly to understand how a distributed message streaming system is built, and how it achieves high availability and high concurrency.

Kafka.scala

Kafka.scala is the entry point: its main() builds a KafkaServerStartable, which wraps KafkaServer. KafkaServer.startup() then brings up each module in turn:

// quota managers for client and replication throttling
quotaManagers = QuotaFactory.instantiate(config, metrics, time)
// background scheduler for periodic tasks (log flush, cleanup, ...)
kafkaScheduler.startup()
// log manager: owns the on-disk logs of all local partitions
logManager = createLogManager(zkUtils.zkClient, brokerState)
// socket server: accepts client/broker connections and queues requests
socketServer = new SocketServer(config, metrics, time)
// replica manager: reads/writes local replicas and runs follower fetching
replicaManager = new ReplicaManager(config, metrics, time, zkUtils, kafkaScheduler, logManager,
 isShuttingDown, quotaManagers.follower)
// controller: one per cluster (elected), manages partition leadership and replica state
kafkaController = new KafkaController(config, zkUtils, brokerState, time, metrics, threadNamePrefix)
// group coordinator: consumer group membership and offset management
groupCoordinator = GroupCoordinator(config, zkUtils, replicaManager, Time.SYSTEM)
// KafkaApis: the request dispatching/handling layer
apis = new KafkaApis(socketServer.requestChannel, replicaManager, adminManager, groupCoordinator,
 kafkaController, zkUtils, config.brokerId, config, metadataCache, metrics, authorizer, quotaManagers,
 clusterId, time)

// I/O thread pool that pulls requests off the request channel and calls KafkaApis
requestHandlerPool = new KafkaRequestHandlerPool(config.brokerId, socketServer.requestChannel, apis, time,
 config.numIoThreads)
// listens to ZooKeeper for dynamic config changes
dynamicConfigManager = new DynamicConfigManager(zkUtils, dynamicConfigHandlers)
// registers the broker in ZooKeeper so the cluster considers it alive
kafkaHealthcheck = new KafkaHealthcheck(config.brokerId, listeners, zkUtils, config.rack,
 config.interBrokerProtocolVersion)


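
To see this startup sequence run end to end, the broker can be embedded directly. A minimal sketch, assuming the 0.10-era kafka_2.11 jars on the classpath and a ZooKeeper reachable on localhost; the broker id, listener, and log directory below are placeholders:

import java.util.Properties
import kafka.server.{KafkaConfig, KafkaServer}

object EmbeddedBrokerSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("zookeeper.connect", "localhost:2181")       // assumes a local ZooKeeper
    props.put("broker.id", "0")
    props.put("listeners", "PLAINTEXT://localhost:9092")   // placeholder listener
    props.put("log.dirs", "/tmp/kafka-embedded-logs")      // placeholder log directory

    val server = new KafkaServer(KafkaConfig.fromProps(props))
    server.startup()      // runs the startup sequence shown above
    // ... use the broker ...
    server.shutdown()     // modules are stopped again, roughly in reverse order
    server.awaitShutdown()
  }
}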

KafkaApis

 case ApiKeys.PRODUCE => handleProducerRequest(request)
        case ApiKeys.FETCH => handleFetchRequest(request)
        case ApiKeys.LIST_OFFSETS => handleOffsetRequest(request)
        case ApiKeys.METADATA => handleTopicMetadataRequest(request)
        case ApiKeys.LEADER_AND_ISR => handleLeaderAndIsrRequest(request)
        case ApiKeys.STOP_REPLICA => handleStopReplicaRequest(request)
        case ApiKeys.UPDATE_METADATA_KEY => handleUpdateMetadataRequest(request)
        case ApiKeys.CONTROLLED_SHUTDOWN_KEY => handleControlledShutdownRequest(request)
        case ApiKeys.OFFSET_COMMIT => handleOffsetCommitRequest(request)
        case ApiKeys.OFFSET_FETCH => handleOffsetFetchRequest(request)
        case ApiKeys.GROUP_COORDINATOR => handleGroupCoordinatorRequest(request)
        case ApiKeys.JOIN_GROUP => handleJoinGroupRequest(request)
        case ApiKeys.HEARTBEAT => handleHeartbeatRequest(request)
        case ApiKeys.LEAVE_GROUP => handleLeaveGroupRequest(request)
        case ApiKeys.SYNC_GROUP => handleSyncGroupRequest(request)
        case ApiKeys.DESCRIBE_GROUPS => handleDescribeGroupRequest(request)
        case ApiKeys.LIST_GROUPS => handleListGroupsRequest(request)
        case ApiKeys.SASL_HANDSHAKE => handleSaslHandshakeRequest(request)
        case ApiKeys.API_VERSIONS => handleApiVersionsRequest(request)
        case ApiKeys.CREATE_TOPICS => handleCreateTopicsRequest(request)
        case ApiKeys.DELETE_TOPICS => handleDeleteTopicsRequest(request)
        case requestId => throw new KafkaException("Unknown api code " + requestId)

Before looking at KafkaApis in detail, we need to look at some classes: FileMessageSet, LogSegment, Log, Replica, Partition, ReplicaManager.

class FileMessageSet private[kafka](@volatile var file: File,
                                    private[log] val channel: FileChannel,
                                    private[log] val start: Int,
                                    private[log] val end: Int,
                                    isSlice: Boolean) extends MessageSet {
...
}
class Log

class Replica

class Partition

class ReplicaManager:

@volatile var controllerEpoch: Int = KafkaController.InitialControllerEpoch - 1
private val localBrokerId = config.brokerId
private val allPartitions = new Pool[(String, Int), Partition](valueFactory = Some { case (t, p) =>
  new Partition(t, p, time, this)
})
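
allPartitions is a Pool keyed by (topic, partition); the valueFactory turns every lookup into a get-or-create, so a Partition object is built lazily the first time the broker sees that topic-partition. A small sketch of the same pattern with a plain ConcurrentHashMap (the class and method names here are illustrative, not Kafka's):

import java.util.concurrent.ConcurrentHashMap

object PartitionPoolSketch {
  // toy stand-in for kafka.cluster.Partition
  final case class Partition(topic: String, partitionId: Int)

  private val allPartitions = new ConcurrentHashMap[(String, Int), Partition]()

  // get-or-create: the factory runs only when the key is missing, like Pool's valueFactory
  def getOrCreatePartition(topic: String, partitionId: Int): Partition =
    allPartitions.computeIfAbsent((topic, partitionId),
      new java.util.function.Function[(String, Int), Partition] {
        override def apply(key: (String, Int)): Partition = Partition(key._1, key._2)
      })

  def main(args: Array[String]): Unit = {
    val a = getOrCreatePartition("orders", 0)
    val b = getOrCreatePartition("orders", 0)
    println(a eq b)   // true: both calls return the same cached instance
  }
}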

handleProducerRequest

replicaManager.appendMessages(
produceRequest.timeout.toLong,
produceRequest.acks,
internalTopicsAllowed,
authorizedMessagesPerPartition,
sendResponseCallback)

messagesPerPartition: Map[TopicPartition, MessageSet]

val localProduceResults = appendToLocalLog(internalTopicsAllowed, messagesPerPartition, requiredAcks)
....
case Some(partition) =>
  partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet], requiredAcks)


val info = log.append(messages, assignOffsets = true)
....
// maybe roll the log if this segment is full
val segment = maybeRoll(messagesSize = validMessages.sizeInBytes,
 maxTimestampInMessages = appendInfo.maxTimestamp)

// now append to the log
segment.append(firstOffset = appendInfo.firstOffset, largestTimestamp = appendInfo.maxTimestamp,
 offsetOfLargestTimestamp = appendInfo.offsetOfMaxTimestamp, messages = validMessages)


...
// FileMessageSet.append: an NIO write, the message set goes straight to the underlying FileChannel
def append(messages: ByteBufferMessageSet) {
  val written = messages.writeFullyTo(channel)
  _size.getAndAdd(written)
}

Summary: a produce request flows through ReplicaManager.appendMessages -> Partition.appendMessagesToLeader -> Log.append -> LogSegment.append -> FileMessageSet, and the payload travels as a ByteBufferMessageSet.
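
To make the bottom of that chain concrete, here is a minimal, self-contained sketch of an append-only segment written through NIO, in the spirit of the FileMessageSet.append shown above; ToySegment and the file name are made up for illustration:

import java.io.File
import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.file.StandardOpenOption
import java.util.concurrent.atomic.AtomicInteger

// A toy append-only "segment": bytes go to the end of the file through a FileChannel,
// and an in-memory counter tracks the segment size, much like FileMessageSet does.
class ToySegment(file: File) {
  private val channel = FileChannel.open(file.toPath,
    StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.APPEND)
  private val _size = new AtomicInteger(0)

  def append(payload: Array[Byte]): Int = {
    val buffer = ByteBuffer.wrap(payload)
    var written = 0
    while (buffer.hasRemaining)          // a single write() may be partial, so loop until drained
      written += channel.write(buffer)
    _size.getAndAdd(written)
    written
  }

  def size: Int = _size.get()
  def flush(): Unit = channel.force(true)   // fsync the file, in the spirit of FileMessageSet.flush
  def close(): Unit = channel.close()
}

object ToySegmentDemo {
  def main(args: Array[String]): Unit = {
    val seg = new ToySegment(new File("demo-segment.log"))   // placeholder file name
    seg.append("hello kafka".getBytes("UTF-8"))
    println(s"segment size = ${seg.size} bytes")
    seg.flush(); seg.close()
  }
}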

handleFetchRequest

class FetchRequest:
private final int replicaId;
private final int maxWait;
private final int minBytes;
private final int maxBytes;
private final LinkedHashMap<TopicPartition, PartitionData> fetchData;
val (existingAndAuthorizedForDescribeTopics, nonExistingOrUnauthorizedForDescribeTopics) = fetchRequest.fetchData.asScala.toSeq.partition {
  case (tp, _) => authorize(request.session, Describe, new Resource(auth.Topic, tp.topic)) && metadataCache.contains(tp.topic)
}
 
public FetchResponse(int version, LinkedHashMap<TopicPartition, PartitionData> responseData, int throttleTime) {
    super(new Struct(ProtoUtils.responseSchema(ApiKeys.FETCH.id, version)));
    writeStruct(struct, version, responseData, throttleTime);
    this.responseData = responseData;
    this.throttleTime = throttleTime;
}
public static final class PartitionData {
    public final short errorCode;
    public final long highWatermark;
    public final Records records;

    public PartitionData(short errorCode, long highWatermark, Records records) {
        this.errorCode = errorCode;
        this.highWatermark = highWatermark;
        this.records = records;
    }
}
 
public interface Records extends Iterable<LogEntry> {

    int SIZE_LENGTH = 4;
    int OFFSET_LENGTH = 8;
    int LOG_OVERHEAD = SIZE_LENGTH + OFFSET_LENGTH;
}
 
....
replicaManager.fetchMessages(
  fetchRequest.maxWait.toLong,
  fetchRequest.replicaId,
  fetchRequest.minBytes,
  fetchRequest.maxBytes,
  versionId <= 2,
  authorizedRequestInfo,
  replicationQuota(fetchRequest),
  sendResponseCallback)
val logReadResults = readFromLocalLog(
  replicaId = replicaId,
  fetchOnlyFromLeader = fetchOnlyFromLeader,
  readOnlyCommitted = fetchOnlyCommitted,
  fetchMaxBytes = fetchMaxBytes,
  hardMaxBytesLimit = hardMaxBytesLimit,
  readPartitionInfo = fetchInfos,
  quota = quota)
 
 
...
// if the fetch comes from the follower,
// update its corresponding log end offset
if(Request.isValidBrokerId(replicaId))
  updateFollowerLogReadResults(replicaId, logReadResults)
 
...
def fetchResponseCallback(delayTimeMs: Int) {
  trace(s"Sending fetch response to client $clientId of " +
    s"${convertedPartitionData.map { case (_, v) => v.records.sizeInBytes }.sum} bytes")
  val fetchResponse = if (delayTimeMs > 0) new FetchResponse(versionId, fetchedPartitionData, delayTimeMs) else response
  requestChannel.sendResponse(new RequestChannel.Response(request, fetchResponse))
}
 
 
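
maxWait and minBytes are what make a fetch a long poll: if not enough data is available, the broker parks the request in the fetch purgatory (DelayedFetch) until at least minBytes arrive or maxWait expires. On the consumer side these map to fetch.max.wait.ms and fetch.min.bytes; a minimal sketch, with the broker address, group id, and topic name as placeholders:

import java.util.{Collections, Properties}
import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}

object LongPollFetchSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")   // placeholder broker
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "demo-group")                // placeholder group
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringDeserializer")
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringDeserializer")
    // broker-side long poll: wait up to 500 ms for at least 64 KB before answering the fetch
    props.put(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, "500")
    props.put(ConsumerConfig.FETCH_MIN_BYTES_CONFIG, "65536")

    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("demo-topic"))            // placeholder topic
    val records = consumer.poll(1000L)                                     // old-style poll(long)
    for (r <- records.asScala)
      println(s"${r.partition}/${r.offset}: ${r.value}")
    consumer.close()
  }
}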

handleOffsetRequest

public class ListOffsetRequest extends AbstractRequest {
...
private final Map<TopicPartition, PartitionData> offsetData;
private final Map<TopicPartition, Long> partitionTimestamps;
private final Set<TopicPartition> duplicatePartitions;
...
}

handleTopicMetadataRequest

val responseBody = new MetadataResponse(
  brokers.map(_.getNode(request.securityProtocol)).asJava,
  clusterId,
  metadataCache.getControllerId.getOrElse(MetadataResponse.NO_CONTROLLER_ID),
  completeTopicMetadata.asJava,
  requestVersion
)
public MetadataResponse(List<Node> brokers, String clusterId, int controllerId, List<TopicMetadata> topicMetadata, int version) {
    super(new Struct(ProtoUtils.responseSchema(ApiKeys.METADATA.id, version)));
    this.brokers = brokers;
    this.controller = getControllerNode(controllerId, brokers);
    this.topicMetadata = topicMetadata;
    this.clusterId = clusterId;

handleLeaderAndIsrRequest

class LeaderAndIsrRequest{
...
private final int controllerId;
private final int controllerEpoch;
private final Map<TopicPartition, PartitionState> partitionStates;
private final Set<Node> liveLeaders;
...
}
public class PartitionState {
    public final int controllerEpoch;
    public final int leader;
    public final int leaderEpoch;
    public final List<Integer> isr;
    public final int zkVersion;
    public final Set<Integer> replicas;
...
}
// If the leader epoch is valid record the epoch of the controller that made the leadership decision.
// This is useful while updating the isr to maintain the decision maker controller's epoch in the zookeeper path
if (partitionLeaderEpoch < stateInfo.leaderEpoch) {
    if(stateInfo.replicas.contains(config.brokerId))
      partitionState.put(partition, stateInfo)
...}
 
val partitionsTobeLeader = partitionState.filter { case (_, stateInfo) =>
  stateInfo.leader == config.brokerId
}
val partitionsToBeFollower = partitionState -- partitionsTobeLeader.keys



val partitionsBecomeLeader = if (partitionsTobeLeader.nonEmpty)
  makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, correlationId, responseMap)
/*
 * Make the current broker to become leader for a given set of partitions by:
 *
 * 1. Stop fetchers for these partitions
 * 2. Update the partition metadata in cache
 * 3. Add these partitions to the leader partitions set
 *
 * If an unexpected error is thrown in this function, it will be propagated to KafkaApis where
 * the error message will be set on each partition since we do not know which partition caused it. Otherwise,
 * return the set of partitions that are made leader due to this method
 *
 *  TODO: the above may need to be fixed later
 */
private def makeLeaders(controllerId: Int,
                        epoch: Int,
                        partitionState: Map[Partition, PartitionState],
                        correlationId: Int,
                        responseMap: mutable.Map[TopicPartition, Short]): Set[Partition] = {

Here we insert a few classes to look specifically at authorize:

private def authorize(session: Session, operation: Operation, resource: Resource): Boolean =
  authorizer.forall(_.authorize(session, operation, resource))
 
case class Session(principal: KafkaPrincipal, clientAddress: InetAddress) {
  val sanitizedUser = QuotaId.sanitize(principal.getName)
}
 
public class KafkaPrincipal implements Principal {
    public static final String SEPARATOR = ":";
    public static final String USER_TYPE = "User";
    public final static KafkaPrincipal ANONYMOUS = new KafkaPrincipal(KafkaPrincipal.USER_TYPE, "ANONYMOUS");

    private String principalType;
    private String name;
...
}
sealed trait Operation extends BaseEnum
case object Read extends Operation { val name = "Read" }
case object Write extends Operation { val name = "Write" }
case object Create extends Operation { val name = "Create" }
case object Delete extends Operation { val name = "Delete" }
case object Alter extends Operation { val name = "Alter" }
case object Describe extends Operation { val name = "Describe" }
case object ClusterAction extends Operation { val name = "ClusterAction" }
case object All extends Operation { val name = "All" }
 
 
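
Note that authorizer is an Option[Authorizer], so this check leans on Option.forall: with no authorizer configured the result is vacuously true and every operation is allowed; otherwise the configured authorizer decides. A tiny illustration of that behaviour, using simplified stand-ins rather than Kafka's real Session/Operation/Resource/Authorizer types:

object AuthorizeSketch {
  // simplified stand-ins for Session / Operation / Resource
  final case class Session(user: String)
  final case class Resource(name: String)
  sealed trait Operation; case object Read extends Operation; case object Write extends Operation

  // simplified stand-in for the pluggable Authorizer
  trait Authorizer {
    def authorize(session: Session, operation: Operation, resource: Resource): Boolean
  }

  // same shape as KafkaApis.authorize: None.forall(...) == true, i.e. "no authorizer" means "allow"
  def authorize(authorizer: Option[Authorizer])(session: Session, op: Operation, res: Resource): Boolean =
    authorizer.forall(_.authorize(session, op, res))

  def main(args: Array[String]): Unit = {
    val denyWrites = new Authorizer {
      def authorize(s: Session, op: Operation, r: Resource): Boolean = op != Write
    }
    println(authorize(None)(Session("alice"), Write, Resource("topic-a")))             // true: no authorizer configured
    println(authorize(Some(denyWrites))(Session("alice"), Write, Resource("topic-a"))) // false: denied by authorizer
    println(authorize(Some(denyWrites))(Session("alice"), Read, Resource("topic-a")))  // true
  }
}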

handleStopReplicaRequest

val (result, error) = replicaManager.stopReplicas(stopReplicaRequest)
 
 
// First stop fetchers for all partitions, then stop the corresponding replicas
replicaFetcherManager.removeFetcherForPartitions(partitions)
//then delete partition
def stopReplica(topicPartition: TopicPartition, deletePartition: Boolean): Short  = {

handleUpdateMetadataRequest

Here we insert a look at KafkaController:

class KafkaController(val config : KafkaConfig, zkUtils: ZkUtils, val brokerState: BrokerState, time: Time, metrics: Metrics, threadNamePrefix: Option[String] = None) extends Logging with KafkaMetricsGroup {
  this.logIdent = "[Controller " + config.brokerId + "]: "
  private var isRunning = true
  private val stateChangeLogger = KafkaController.stateChangeLogger
  val controllerContext = new ControllerContext(zkUtils)
  val partitionStateMachine = new PartitionStateMachine(this)
  val replicaStateMachine = new ReplicaStateMachine(this)
  private val controllerElector = new ZookeeperLeaderElector(controllerContext, ZkUtils.ControllerPath, onControllerFailover,
    onControllerResignation, config.brokerId, time)
  // have a separate scheduler for the controller to be able to start and stop independently of the
  // kafka server
  private val autoRebalanceScheduler = new KafkaScheduler(1)
  var deleteTopicManager: TopicDeletionManager = null
  val offlinePartitionSelector = new OfflinePartitionLeaderSelector(controllerContext, config)
  private val reassignedPartitionLeaderSelector = new ReassignedPartitionLeaderSelector(controllerContext)
  private val preferredReplicaPartitionLeaderSelector = new PreferredReplicaPartitionLeaderSelector(controllerContext)
  private val controlledShutdownPartitionLeaderSelector = new ControlledShutdownLeaderSelector(controllerContext)
  private val brokerRequestBatch = new ControllerBrokerRequestBatch(this)

  private val partitionReassignedListener = new PartitionsReassignedListener(this)
  private val preferredReplicaElectionListener = new PreferredReplicaElectionListener(this)
  private val isrChangeNotificationListener = new IsrChangeNotificationListener(this)
...
}
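
controllerElector (the ZookeeperLeaderElector above) decides which broker becomes the controller. The election boils down to: whoever creates the ephemeral znode at ZkUtils.ControllerPath (/controller) wins, and everyone else watches it and retries when it disappears. A minimal sketch of that idea against the raw ZooKeeper client; the connection string and znode payload are placeholders, and Kafka's real elector also handles session expiry and re-election:

import org.apache.zookeeper.{CreateMode, KeeperException, WatchedEvent, Watcher, ZooDefs, ZooKeeper}

object ControllerElectionSketch {
  def main(args: Array[String]): Unit = {
    val brokerId = 1
    // dummy watcher; a real elector watches /controller for deletion and re-elects
    val zk = new ZooKeeper("localhost:2181", 30000, new Watcher {
      override def process(event: WatchedEvent): Unit = ()
    })

    try {
      // ephemeral: the znode disappears when this session dies, triggering a new election
      zk.create("/controller", s"""{"brokerid":$brokerId}""".getBytes("UTF-8"),
        ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL)
      println(s"broker $brokerId is now the controller")
    } catch {
      case _: KeeperException.NodeExistsException =>
        println("another broker is already the controller; watch the znode and wait for the next election")
    }
  }
}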

handleControlledShutdownRequest

  def handleControlledShutdownRequest(request: RequestChannel.Request) {
    // ensureTopicExists is only for client facing requests
    // We can't have the ensureTopicExists check here since the controller sends it as an advisory to all brokers so they
    // stop serving data to clients for the topic being deleted
    val controlledShutdownRequest = request.requestObj.asInstanceOf[ControlledShutdownRequest]

    authorizeClusterAction(request)

    val partitionsRemaining = controller.shutdownBroker(controlledShutdownRequest.brokerId)
    val controlledShutdownResponse = new ControlledShutdownResponse(controlledShutdownRequest.correlationId,
      Errors.NONE.code, partitionsRemaining)
    requestChannel.sendResponse(new Response(request, new RequestOrResponseSend(request.connectionId, controlledShutdownResponse)))
  }

handleOffsetCommitRequest

    // reject the request if not authorized to the group
    if (!authorize(request.session, Read, new Resource(Group, offsetCommitRequest.groupId))) {
// GroupCoordinator
        coordinator.handleCommitOffsets(
          offsetCommitRequest.groupId,
          offsetCommitRequest.memberId,
          offsetCommitRequest.generationId,
          partitionData,
          sendResponseCallback)
/**
 * GroupCoordinator handles general group membership and offset management.
 *
 * Each Kafka server instantiates a coordinator which is responsible for a set of
 * groups. Groups are assigned to coordinators based on their group names.
 */
class GroupCoordinator(val brokerId: Int,
                       val groupConfig: GroupConfig,
                       val offsetConfig: OffsetConfig,
                       val groupManager: GroupMetadataManager,
                       val heartbeatPurgatory: DelayedOperationPurgatory[DelayedHeartbeat],
                       val joinPurgatory: DelayedOperationPurgatory[DelayedJoin],
                       time: Time) extends Logging {
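
"Groups are assigned to coordinators based on their group names": concretely, a group id maps to one partition of the internal __consumer_offsets topic, and the leader of that partition acts as the group's coordinator. A sketch of the mapping, which in GroupMetadataManager.partitionFor is essentially the absolute hash of the group id modulo the offsets-topic partition count (50 below is just the default partition count):

object GroupToOffsetsPartitionSketch {
  // same idea as GroupMetadataManager.partitionFor(groupId); the bit mask keeps the hash non-negative
  def partitionFor(groupId: String, offsetsTopicPartitionCount: Int = 50): Int =
    (groupId.hashCode & 0x7fffffff) % offsetsTopicPartitionCount

  def main(args: Array[String]): Unit = {
    Seq("payments", "clickstream", "audit").foreach { group =>   // placeholder group names
      println(s"group '$group' -> __consumer_offsets partition ${partitionFor(group)}")
    }
  }
}
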
/**
 * Group contains the following metadata:
 *
 *  Membership metadata:
 *  1. Members registered in this group
 *  2. Current protocol assigned to the group (e.g. partition assignment strategy for consumers)
 *  3. Protocol metadata associated with group members
 *
 *  State metadata:
 *  1. group state
 *  2. generation id
 *  3. leader id
 */
@nonthreadsafe
private[coordinator] class GroupMetadata(val groupId: String, initialState: GroupState = Empty) {

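
The group state checked in doCommitOffsets below comes from a small state machine inside GroupMetadata. A simplified sketch of the states and the usual transitions; the real class also validates which transitions are legal:

// Simplified sketch of the group states used by GroupMetadata / GroupCoordinator.
sealed trait GroupState
case object Empty extends GroupState              // no members, but committed offsets may remain
case object PreparingRebalance extends GroupState // members are (re)joining via JoinGroup
case object AwaitingSync extends GroupState       // waiting for the leader's SyncGroup assignment
case object Stable extends GroupState             // assignment handed out; members heartbeat and commit
case object Dead extends GroupState               // group removed; commits get UNKNOWN_MEMBER_ID

// Typical flow: Empty/Stable -> PreparingRebalance -> AwaitingSync -> Stable, and any state -> Dead on removal.
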
 case Some(group) =>
          doCommitOffsets(group, memberId, generationId, offsetMetadata, responseCallback)
def doCommitOffsets(group: GroupMetadata,
                      memberId: String,
                      generationId: Int,
                      offsetMetadata: immutable.Map[TopicPartition, OffsetAndMetadata],
                      responseCallback: immutable.Map[TopicPartition, Short] => Unit) {
    var delayedOffsetStore: Option[DelayedStore] = None

    group synchronized {
      if (group.is(Dead)) {
        responseCallback(offsetMetadata.mapValues(_ => Errors.UNKNOWN_MEMBER_ID.code))
      } else if (generationId < 0 && group.is(Empty)) {
        // the group is only using Kafka to store offsets
        delayedOffsetStore = groupManager.prepareStoreOffsets(group, memberId, generationId,
          offsetMetadata, responseCallback)
      } else if (group.is(AwaitingSync)) {
        responseCallback(offsetMetadata.mapValues(_ => Errors.REBALANCE_IN_PROGRESS.code))
      } else if (!group.has(memberId)) {
        responseCallback(offsetMetadata.mapValues(_ => Errors.UNKNOWN_MEMBER_ID.code))
      } else if (generationId != group.generationId) {
        responseCallback(offsetMetadata.mapValues(_ => Errors.ILLEGAL_GENERATION.code))
      } else {
        val member = group.get(memberId)
        completeAndScheduleNextHeartbeatExpiration(group, member)
        delayedOffsetStore = groupManager.prepareStoreOffsets(group, memberId, generationId,
          offsetMetadata, responseCallback)
      }
    }

    // store the offsets without holding the group lock
    delayedOffsetStore.foreach(groupManager.store)
  }
   /**
   * Store offsets by appending it to the replicated log and then inserting to cache
   */
  def prepareStoreOffsets(group: GroupMetadata,
                          consumerId: String,
                          generationId: Int,
                          offsetMetadata: immutable.Map[TopicPartition, OffsetAndMetadata],
                          responseCallback: immutable.Map[TopicPartition, Short] => Unit): Option[DelayedStore] = {
    // first filter out partitions with offset metadata size exceeding limit
 ...

 // construct the message set to append
    val magicValueAndTimestampOpt = getMessageFormatVersionAndTimestamp(partitionFor(group.groupId))
    magicValueAndTimestampOpt match {
      case Some((magicValue, timestamp)) =>
        val messages = filteredOffsetMetadata.map { case (topicAndPartition, offsetAndMetadata) =>
          new Message(
            key = GroupMetadataManager.offsetCommitKey(group.groupId, topicAndPartition.topic, topicAndPartition.partition),
            bytes = GroupMetadataManager.offsetCommitValue(offsetAndMetadata),
            timestamp = timestamp,
            magicValue = magicValue
          )
        }.toSeq

        val offsetTopicPartition = new TopicPartition(Topic.GroupMetadataTopicName, partitionFor(group.groupId))

        val offsetsAndMetadataMessageSet = Map(offsetTopicPartition ->
          new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*))

        // set the callback function to insert offsets into cache after log append completed
        def putCacheCallback(responseStatus: Map[TopicPartition, PartitionResponse]) {
          // the append response should only contain the topics partition
          if (responseStatus.size != 1 || ! responseStatus.contains(offsetTopicPartition))
            throw new IllegalStateException("Append status %s should only have one partition %s"
              .format(responseStatus, offsetTopicPartition))

          // construct the commit response status and insert
          // the offset and metadata to cache if the append status has no error
          val status = responseStatus(offsetTopicPartition)
          val statusError = Errors.forCode(status.errorCode)

          val responseCode =
            group synchronized {
              if (statusError == Errors.NONE) {
                if (!group.is(Dead)) {
                  filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) =>
                    group.completePendingOffsetWrite(topicAndPartition, offsetAndMetadata)
                  }
                }
                Errors.NONE.code
              } else {
                if (!group.is(Dead)) {
                  filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) =>
                    group.failPendingOffsetWrite(topicAndPartition, offsetAndMetadata)
                  }
                }

                debug(s"Offset commit $filteredOffsetMetadata from group ${group.groupId}, consumer $consumerId " +
                  s"with generation $generationId failed when appending to log due to ${statusError.exceptionName}")

                // transform the log append error code to the corresponding the commit status error code
                val responseError = statusError match {
                  case Errors.UNKNOWN_TOPIC_OR_PARTITION
                       | Errors.NOT_ENOUGH_REPLICAS
                       | Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND =>
                    Errors.GROUP_COORDINATOR_NOT_AVAILABLE

                  case Errors.NOT_LEADER_FOR_PARTITION =>
                    Errors.NOT_COORDINATOR_FOR_GROUP

                  case Errors.MESSAGE_TOO_LARGE
                       | Errors.RECORD_LIST_TOO_LARGE
                       | Errors.INVALID_FETCH_SIZE =>
                    Errors.INVALID_COMMIT_OFFSET_SIZE

                  case other => other
                }

                responseError.code
              }
            }

          // compute the final error codes for the commit response
          val commitStatus = offsetMetadata.map { case (topicAndPartition, offsetAndMetadata) =>
            if (validateOffsetMetadataLength(offsetAndMetadata.metadata))
              (topicAndPartition, responseCode)
            else
              (topicAndPartition, Errors.OFFSET_METADATA_TOO_LARGE.code)
          }

          // finally trigger the callback logic passed from the API layer
          responseCallback(commitStatus)
        }

        group synchronized {
          group.prepareOffsetCommit(offsetMetadata)
        }

        Some(DelayedStore(offsetsAndMetadataMessageSet, putCacheCallback))
...
 }
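
prepareStoreOffsets turns every commit into a keyed message on __consumer_offsets. Because that topic is log-compacted, only the newest value per (group, topic, partition) key survives, which is what lets an ordinary log double as an offset store. A toy illustration of that keep-latest-per-key behaviour; the key/value classes below are illustrative, not Kafka's wire format:

object OffsetCompactionSketch {
  // key/value shapes mirroring the idea of offsetCommitKey / offsetCommitValue (not the real format)
  final case class OffsetKey(group: String, topic: String, partition: Int)
  final case class OffsetValue(offset: Long, metadata: String)

  // "compaction": replay the log in order, keep only the newest value per key
  def compact(log: Seq[(OffsetKey, OffsetValue)]): Map[OffsetKey, OffsetValue] =
    log.foldLeft(Map.empty[OffsetKey, OffsetValue]) { case (acc, (k, v)) => acc.updated(k, v) }

  def main(args: Array[String]): Unit = {
    val log = Seq(
      OffsetKey("demo-group", "orders", 0) -> OffsetValue(10L, ""),
      OffsetKey("demo-group", "orders", 1) -> OffsetValue(4L, ""),
      OffsetKey("demo-group", "orders", 0) -> OffsetValue(42L, "")   // later commit for the same key
    )
    compact(log).foreach { case (k, v) => println(s"$k -> offset ${v.offset}") }
    // partition 0 resolves to offset 42, the latest committed value
  }
}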

handleOffsetFetchRequest

    } else {
        // version 1 reads offsets from Kafka;
        val offsets = coordinator.handleFetchOffsets(offsetFetchRequest.groupId, authorizedTopicPartitions).toMap

        // Note that we do not need to filter the partitions in the
        // metadata cache as the topic partitions will be filtered
        // in coordinator's offset manager through the offset cache
        new OffsetFetchResponse((offsets ++ unauthorizedStatus).asJava)
      }
    groupManager.getOffsets(groupId, partitions)
    trace("Getting offsets %s for group %s.".format(topicPartitions, groupId))
    val group = groupMetadataCache.get(groupId)
    if (group == null) {
      topicPartitions.map { topicPartition =>
        (topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
      }.toMap
    } else {
      group synchronized {
        if (group.is(Dead)) {
          topicPartitions.map { topicPartition =>
            (topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
          }.toMap
        } else {
            if (topicPartitions.isEmpty) {
              // Return offsets for all partitions owned by this consumer group. (this only applies to consumers that commit offsets to Kafka.)
              group.allOffsets.map { case (topicPartition, offsetAndMetadata) =>
                (topicPartition, new OffsetFetchResponse.PartitionData(offsetAndMetadata.offset, offsetAndMetadata.metadata, Errors.NONE.code))
              }
            } else {
              topicPartitions.map { topicPartition =>
                group.offset(topicPartition) match {
                  case None => (topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
                  case Some(offsetAndMetadata) =>
                    (topicPartition, new OffsetFetchResponse.PartitionData(offsetAndMetadata.offset, offsetAndMetadata.metadata, Errors.NONE.code))
                }
              }.toMap
            }
        }
      } 

handleGroupCoordinatorRequest

handleJoinGroupRequest

handleHeartbeatRequest

handleLeaveGroupRequest

handleSyncGroupRequest

handleDescribeGroupRequest

handleListGroupsRequest

handleSaslHandshakeRequest

handleApiVersionsRequest

handleCreateTopicsRequest

public class CreateTopicsRequest extends AbstractRequest {
    private static final Schema CURRENT_SCHEMA = ProtoUtils.currentRequestSchema(ApiKeys.CREATE_TOPICS.id);

    private static final String REQUESTS_KEY_NAME = "create_topic_requests";

    private static final String TIMEOUT_KEY_NAME = "timeout";
    private static final String TOPIC_KEY_NAME = "topic";
    private static final String NUM_PARTITIONS_KEY_NAME = "num_partitions";
    private static final String REPLICATION_FACTOR_KEY_NAME = "replication_factor";
    private static final String REPLICA_ASSIGNMENT_KEY_NAME = "replica_assignment";
    private static final String REPLICA_ASSIGNMENT_PARTITION_ID_KEY_NAME = "partition_id";
    private static final String REPLICA_ASSIGNMENT_REPLICAS_KEY_NAME = "replicas";

    private static final String CONFIG_KEY_KEY_NAME = "config_key";
    private static final String CONFIG_VALUE_KEY_NAME = "config_value";
    private static final String CONFIGS_KEY_NAME = "configs";

    public static final class TopicDetails {
        public final int numPartitions;
        public final short replicationFactor;
        public final Map<Integer, List<Integer>> replicasAssignments;
        public final Map<String, String> configs;
        ...
        }
      adminManager.createTopics(
        createTopicsRequest.timeout.toInt,
        validTopics,
        sendResponseWithDuplicatesCallback
      )
  /**
    * Create topics and wait until the topics have been completely created.
    * The callback function will be triggered either when timeout, error or the topics are created.
    */
  def createTopics(timeout: Int,
                   createInfo: Map[String, TopicDetails],
                   responseCallback: Map[String, Errors] => Unit) {


else {
            AdminUtils.assignReplicasToBrokers(brokers, arguments.numPartitions, arguments.replicationFactor)
          }
/**
   * There are 3 goals of replica assignment:
   *
   * 1. Spread the replicas evenly among brokers.
   * 2. For partitions assigned to a particular broker, their other replicas are spread over the other brokers.
   * 3. If all brokers have rack information, assign the replicas for each partition to different racks if possible
   *
   * To achieve this goal for replica assignment without considering racks, we:
   * 1. Assign the first replica of each partition by round-robin, starting from a random position in the broker list.
   * 2. Assign the remaining replicas of each partition with an increasing shift.
   *
   * Here is an example of assigning
   * broker-0  broker-1  broker-2  broker-3  broker-4
   * p0        p1        p2        p3        p4       (1st replica)
   * p5        p6        p7        p8        p9       (1st replica)
   * p4        p0        p1        p2        p3       (2nd replica)
   * p8        p9        p5        p6        p7       (2nd replica)
   * p3        p4        p0        p1        p2       (3rd replica)
   * p7        p8        p9        p5        p6       (3rd replica)
   *
   * To create rack aware assignment, this API will first create a rack alternated broker list. For example,
   * from this brokerID -> rack mapping:
   *
   * 0 -> "rack1", 1 -> "rack3", 2 -> "rack3", 3 -> "rack2", 4 -> "rack2", 5 -> "rack1"
   *
   * The rack alternated list will be:
   *
   * 0, 3, 1, 5, 4, 2
   *
   * Then an easy round-robin assignment can be applied. Assume 6 partitions with replication factor of 3, the assignment
   * will be:
   *
   * 0 -> 0,3,1
   * 1 -> 3,1,5
   * 2 -> 1,5,4
   * 3 -> 5,4,2
   * 4 -> 4,2,0
   * 5 -> 2,0,3
   *
   * Once it has completed the first round-robin, if there are more partitions to assign, the algorithm will start
   * shifting the followers. This is to ensure we will not always get the same set of sequences.
   * In this case, if there is another partition to assign (partition #6), the assignment will be:
   *
   * 6 -> 0,4,2 (instead of repeating 0,3,1 as partition 0)
   *
   * The rack aware assignment always chooses the 1st replica of the partition using round robin on the rack alternated
   * broker list. For rest of the replicas, it will be biased towards brokers on racks that do not have
   * any replica assignment, until every rack has a replica. Then the assignment will go back to round-robin on
   * the broker list.
   *
   * As the result, if the number of replicas is equal to or greater than the number of racks, it will ensure that
   * each rack will get at least one replica. Otherwise, each rack will get at most one replica. In a perfect
   * situation where the number of replicas is the same as the number of racks and each rack has the same number of
   * brokers, it guarantees that the replica distribution is even across brokers and racks.
   *
   * @return a Map from partition id to replica ids
   * @throws AdminOperationException If rack information is supplied but it is incomplete, or if it is not possible to
   *                                 assign each replica to a unique rack.
   *
   */
  def assignReplicasToBrokers(brokerMetadatas: Seq[BrokerMetadata],
                              nPartitions: Int,
                              replicationFactor: Int,
                              fixedStartIndex: Int = -1,
                              startPartitionId: Int = -1): Map[Int, Seq[Int]] = {
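
To make the round-robin-plus-shift rule concrete, here is a simplified re-implementation of the rack-unaware branch of this algorithm; unlike the real code, the start index and initial shift are fixed at 0 instead of being chosen at random, so the output matches the example table in the comment above:

import scala.collection.mutable

object ReplicaAssignmentSketch {
  /** Rack-unaware assignment: first replica round-robin, remaining replicas with an increasing shift. */
  def assign(brokers: Seq[Int], nPartitions: Int, replicationFactor: Int,
             startIndex: Int = 0, initialShift: Int = 0): Map[Int, Seq[Int]] = {
    require(replicationFactor <= brokers.size, "replication factor cannot exceed broker count")
    val ret = mutable.Map[Int, Seq[Int]]()
    var nextReplicaShift = initialShift
    for (partition <- 0 until nPartitions) {
      // bump the shift every time we wrap around the broker list, so follower patterns vary
      if (partition > 0 && partition % brokers.size == 0) nextReplicaShift += 1
      val firstReplicaIndex = (partition + startIndex) % brokers.size
      val replicas = mutable.ArrayBuffer(brokers(firstReplicaIndex))
      for (j <- 0 until replicationFactor - 1) {
        val shift = 1 + (nextReplicaShift + j) % (brokers.size - 1)
        replicas += brokers((firstReplicaIndex + shift) % brokers.size)
      }
      ret(partition) = replicas
    }
    ret.toMap
  }

  def main(args: Array[String]): Unit = {
    // 10 partitions, replication factor 3, brokers 0..4: the setup used in the table above
    assign(brokers = 0 to 4, nPartitions = 10, replicationFactor = 3).toSeq.sortBy(_._1)
      .foreach { case (p, rs) => println(s"p$p -> ${rs.mkString(",")}") }
    // e.g. p0 -> 0,1,2 and p5 -> 0,2,3, matching the example assignment in the comment
  }
}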

handleDeleteTopicsRequest

adminManager.deleteTopics(
          deleteTopicRequest.timeout.toInt,
          authorizedTopics,
          sendResponseCallback
        )
/**
    * Delete topics and wait until the topics have been completely deleted.
    * The callback function will be triggered either when timeout, error or the topics are deleted.
    */
  def deleteTopics(timeout: Int,
                   topics: Set[String],
                   responseCallback: Map[String, Errors] => Unit) {

    // 1. map over topics calling the asynchronous delete
    val metadata = topics.map { topic =>
        try {
          AdminUtils.deleteTopic(zkUtils, topic)
          DeleteTopicMetadata(topic, Errors.NONE)
        } catch {
          case _: TopicAlreadyMarkedForDeletionException =>
            // swallow the exception, and still track deletion allowing multiple calls to wait for deletion
            DeleteTopicMetadata(topic, Errors.NONE)
          case e: Throwable =>
            error(s"Error processing delete topic request for topic $topic", e)
            DeleteTopicMetadata(topic, Errors.forException(e))
        }
    }

    // 2. if timeout <= 0 or no topics can proceed return immediately
    if (timeout <= 0 || !metadata.exists(_.error == Errors.NONE)) {
      val results = metadata.map { deleteTopicMetadata =>
        // ignore topics that already have errors
        if (deleteTopicMetadata.error == Errors.NONE) {
          (deleteTopicMetadata.topic, Errors.REQUEST_TIMED_OUT)
        } else {
          (deleteTopicMetadata.topic, deleteTopicMetadata.error)
        }
      }.toMap
      responseCallback(results)
    } else {
      // 3. else pass the topics and errors to the delayed operation and set the keys
      val delayedDelete = new DelayedDeleteTopics(timeout, metadata.toSeq, this, responseCallback)
      val delayedDeleteKeys = topics.map(new TopicKey(_)).toSeq
      // try to complete the request immediately, otherwise put it into the purgatory
      topicPurgatory.tryCompleteElseWatch(delayedDelete, delayedDeleteKeys)
    }
  }
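
DelayedDeleteTopics and topicPurgatory.tryCompleteElseWatch are an instance of Kafka's purgatory pattern: an operation either completes immediately, or it is parked and completed later by a state change or by its timeout. A toy sketch of that pattern (not Kafka's DelayedOperation/DelayedOperationPurgatory classes), using a scheduled executor for expiration:

import java.util.concurrent.{Executors, TimeUnit}
import java.util.concurrent.atomic.AtomicBoolean

// Toy version of the delayed-operation pattern: complete exactly once, by an event or by timeout.
abstract class ToyDelayedOperation(timeoutMs: Long) {
  private val completed = new AtomicBoolean(false)
  def tryComplete(): Boolean        // check the condition; call forceComplete() when satisfied
  def onComplete(): Unit            // runs exactly once
  def onExpiration(): Unit          // extra work when completion happened because of the timeout

  final def forceComplete(): Boolean =
    if (completed.compareAndSet(false, true)) { onComplete(); true } else false

  final def expire(): Unit = if (forceComplete()) onExpiration()
  final def timeout: Long = timeoutMs
}

class ToyPurgatory {
  private val timer = Executors.newSingleThreadScheduledExecutor()
  private var watched = List.empty[ToyDelayedOperation]

  def tryCompleteElseWatch(op: ToyDelayedOperation): Unit = synchronized {
    if (!op.tryComplete()) {                       // not satisfied yet: park it and arm the timeout
      watched ::= op
      timer.schedule(new Runnable { def run(): Unit = op.expire() }, op.timeout, TimeUnit.MILLISECONDS)
    }
  }

  def checkAndComplete(): Unit = synchronized {    // called when relevant state changes
    watched = watched.filterNot(_.tryComplete())
  }

  def shutdown(): Unit = timer.shutdownNow()
}

object ToyPurgatoryDemo {
  def main(args: Array[String]): Unit = {
    val purgatory = new ToyPurgatory
    var topicDeleted = false

    val op = new ToyDelayedOperation(timeoutMs = 2000) {
      def tryComplete(): Boolean = if (topicDeleted) forceComplete() else false
      def onComplete(): Unit = println(s"delete completed (deleted=$topicDeleted)")
      def onExpiration(): Unit = println("delete timed out")
    }

    purgatory.tryCompleteElseWatch(op)   // parked: topic not deleted yet
    topicDeleted = true                  // the state change arrives...
    purgatory.checkAndComplete()         // ...and completes the parked operation
    purgatory.shutdown()
  }
}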


Reposted from juejin.im/post/7128204198839058469