前言
- 最近一直在研究Nacos开源框架的源码和执行流程原理
- 本次简单聊下AP集群架构下Nacos心跳设计原理
nacos客户端心跳健康上报源码
- 客户端注册到nacos服务端的同时会开启一个定时任务线程BeatTask,默认每隔5秒发送心跳到服务端进行检测。
public class NacosNamingService implements NamingService {
......
/**
 * Registers the given instance with the Nacos server. For an ephemeral instance,
 * a heartbeat descriptor is built from the instance and handed to the beat reactor
 * before the registration call is made.
 *
 * @param serviceName name of the service the instance belongs to
 * @param groupName   group of the service
 * @param instance    instance to register
 * @throws NacosException declared by the signature; presumably raised by the server proxy call
 */
@Override
public void registerInstance(String serviceName, String groupName, Instance instance) throws NacosException {
    final boolean ephemeral = instance.isEphemeral();
    // Both the heartbeat and the registration are keyed by the grouped service name.
    final String groupedName = NamingUtils.getGroupedName(serviceName, groupName);
    if (ephemeral) {
        final BeatInfo heartbeat = new BeatInfo();
        heartbeat.setServiceName(groupedName);
        heartbeat.setIp(instance.getIp());
        heartbeat.setPort(instance.getPort());
        heartbeat.setCluster(instance.getClusterName());
        heartbeat.setWeight(instance.getWeight());
        heartbeat.setMetadata(instance.getMetadata());
        heartbeat.setScheduled(false);
        heartbeat.setPeriod(instance.getInstanceHeartBeatInterval());
        // Hand the descriptor to the reactor, which schedules the heartbeat task.
        beatReactor.addBeatInfo(groupedName, heartbeat);
    }
    // Register this client's instance information with the Nacos server.
    serverProxy.registerService(groupedName, groupName, instance);
}
......
}
复制代码
- 客户端心跳健康上报类对象BeatReactor,就是通过定时任务发送http请求,定期把心跳数据上报给服务端
- 如果中间心跳健康检查断开连接,超过30秒后再次上报心跳时就会重新走注册上报流程
public class BeatReactor {
......
/**
 * Creates the reactor and the scheduled executor that runs its heartbeat tasks.
 *
 * @param serverProxy proxy kept for use by the heartbeat tasks
 * @param threadCount core pool size of the scheduled executor
 */
public BeatReactor(NamingProxy serverProxy, int threadCount) {
    this.serverProxy = serverProxy;
    // Every worker is a daemon thread carrying the fixed beat-sender name.
    final ThreadFactory beatSenderThreads = new ThreadFactory() {
        @Override
        public Thread newThread(Runnable task) {
            final Thread worker = new Thread(task);
            worker.setDaemon(true);
            worker.setName("com.alibaba.nacos.naming.beat.sender");
            return worker;
        }
    };
    executorService = new ScheduledThreadPoolExecutor(threadCount, beatSenderThreads);
}
/**
 * Stores the heartbeat descriptor under its service/ip/port key and schedules the
 * first heartbeat after the descriptor's period (in milliseconds).
 *
 * @param serviceName service name used to build the map key
 * @param beatInfo    heartbeat descriptor to store and schedule
 */
public void addBeatInfo(String serviceName, BeatInfo beatInfo) {
    NAMING_LOGGER.info("[BEAT] adding beat: {} to beat map.", beatInfo);
    final String beatKey = buildKey(serviceName, beatInfo.getIp(), beatInfo.getPort());
    // fix #1733: a descriptor already stored under this key is flagged as stopped,
    // so a task still holding it returns early instead of sending another beat.
    final BeatInfo previous = dom2Beat.remove(beatKey);
    if (previous != null) {
        previous.setStopped(true);
    }
    dom2Beat.put(beatKey, beatInfo);
    final BeatTask firstBeat = new BeatTask(beatInfo);
    executorService.schedule(firstBeat, beatInfo.getPeriod(), TimeUnit.MILLISECONDS);
    MetricsMonitor.getDom2BeatSizeMonitor().set(dom2Beat.size());
}
class BeatTask implements Runnable {
BeatInfo beatInfo;
public BeatTask(BeatInfo beatInfo) {
this.beatInfo = beatInfo;
}
@Override
public void run() {
if (beatInfo.isStopped()) {
return;
}
long nextTime = beatInfo.getPeriod();
try {
// 发送心跳请求到nacos服务端
JSONObject result = serverProxy.sendBeat(beatInfo, BeatReactor.this.lightBeatEnabled);
long interval = result.getIntValue("clientBeatInterval");
boolean lightBeatEnabled = false;
if (result.containsKey(CommonParams.LIGHT_BEAT_ENABLED)) {
lightBeatEnabled = result.getBooleanValue(CommonParams.LIGHT_BEAT_ENABLED);
}
BeatReactor.this.lightBeatEnabled = lightBeatEnabled;
if (interval > 0) {
nextTime = interval;
}
int code = NamingResponseCode.OK;
if (result.containsKey(CommonParams.CODE)) {
code = result.getIntValue(CommonParams.CODE);
}
if (code == NamingResponseCode.RESOURCE_NOT_FOUND) {
Instance instance = new Instance();
instance.setPort(beatInfo.getPort());
instance.setIp(beatInfo.getIp());
instance.setWeight(beatInfo.getWeight());
instance.setMetadata(beatInfo.getMetadata());
instance.setClusterName(beatInfo.getCluster());
instance.setServiceName(beatInfo.getServiceName());
instance.setInstanceId(instance.getInstanceId());
instance.setEphemeral(true);
try {
// 实例重新注册上报
serverProxy.registerService(beatInfo.getServiceName(),
NamingUtils.getGroupName(beatInfo.getServiceName()), instance);
} catch (Exception ignore) {
}
}
} catch (NacosException ne) {
NAMING_LOGGER.error("[CLIENT-BEAT] failed to send beat: {}, code: {}, msg: {}",
JSON.toJSONString(beatInfo), ne.getErrCode(), ne.getErrMsg());
}
executorService.schedule(new BeatTask(beatInfo), nextTime, TimeUnit.MILLISECONDS);
}
}
}
复制代码
nacos服务端心跳健康检查源码
- nacos服务端接收到客户端注册实例后会开一个定时任务线程ClientBeatCheckTask,每隔5秒检测实例距离上次心跳的时间间隔是否大于15秒,大于就把健康状态设置为false。如果时间间隔超过30秒,就会通过http调用自身的delete接口剔除该实例;实例被剔除后,客户端再次发送心跳时会重新注册。
注:Nacos集群环境下只会有一个nacos节点与客户端进行心跳监控连接,然后这个节点会同步客户端信息到其他nacos节点。
/**
* Check and update statuses of ephemeral instances, remove them if they have been expired.
*
* @author nkorange
*/
public class ClientBeatCheckTask implements Runnable {
private Service service;
/**
 * Creates a check task bound to a single service.
 *
 * @param service service whose ephemeral instances this task inspects when run
 */
public ClientBeatCheckTask(Service service) {
    this.service = service;
}
......
/**
 * Performs one check pass over the service's ephemeral instances: instances whose
 * last beat is older than their heartbeat timeout are marked unhealthy, and, when
 * instance expiry is enabled, instances older than their delete timeout are removed.
 * Any exception raised during the pass is logged and not rethrown.
 */
@Override
public void run() {
    try {
        // Skip the pass unless this node is the one responsible for the service.
        if (!getDistroMapper().responsible(service.getName())) {
            return;
        }
        if (!getSwitchDomain().isHealthCheckEnabled()) {
            return;
        }

        final List<Instance> instances = service.allIPs(true);

        // Pass 1: mark instances whose last beat exceeded the heartbeat timeout as unhealthy.
        for (Instance candidate : instances) {
            if (System.currentTimeMillis() - candidate.getLastBeat() <= candidate.getInstanceHeartBeatTimeOut()) {
                continue;
            }
            if (candidate.isMarked() || !candidate.isHealthy()) {
                continue;
            }
            candidate.setHealthy(false);
            Loggers.EVT_LOG
                    .info("{POS} {IP-DISABLED} valid: {}:{}@{}@{}, region: {}, msg: client timeout after {}, last beat: {}",
                            candidate.getIp(), candidate.getPort(), candidate.getClusterName(),
                            service.getName(), UtilsAndCommons.LOCALHOST_SITE,
                            candidate.getInstanceHeartBeatTimeOut(), candidate.getLastBeat());
            // Notify the push service of the change and publish a timeout event.
            getPushService().serviceChanged(service);
            ApplicationUtils.publishEvent(new InstanceHeartbeatTimeoutEvent(this, candidate));
        }

        if (!getGlobalConfig().isExpireInstance()) {
            return;
        }

        // Pass 2: delete unmarked instances whose last beat exceeded the delete timeout.
        for (Instance candidate : instances) {
            if (!candidate.isMarked()
                    && System.currentTimeMillis() - candidate.getLastBeat() > candidate.getIpDeleteTimeout()) {
                Loggers.SRV_LOG.info("[AUTO-DELETE-IP] service: {}, ip: {}", service.getName(),
                        JacksonUtils.toJson(candidate));
                deleteIp(candidate);
            }
        }
    } catch (Exception e) {
        Loggers.SRV_LOG.warn("Exception while processing client beat time out.", e);
    }
}
......
}
复制代码
- nacos集群环境下只有一个nacos节点开启心跳检测,在DistroMapper.responsible()方法里通过指定hash算法得出哪个节点进行心跳检测
/**
* Distro mapper, judges which server is responsible for the input service.
*
* @author nkorange
*/
@Component("distroMapper")
public class DistroMapper extends MemberChangeListener {
......
/**
 * Decides whether the current server is responsible for the given service.
 *
 * <p>Returns {@code true} outright when distro is disabled, when running in standalone
 * mode, or when the local address is not in the healthy server list. Returns
 * {@code false} while the healthy server list is empty. Otherwise the service name's
 * hash modulo the list size selects a slot, and the result is whether that slot lies
 * between the first and last positions of the local address in the list.
 *
 * @param serviceName service name
 * @return {@code true} if the current server is responsible for the service, otherwise {@code false}
 */
public boolean responsible(String serviceName) {
    // Take the list reference once so every step below works on the same list.
    final List<String> servers = healthyList;
    if (!switchDomain.isDistroEnabled() || EnvUtil.getStandaloneMode()) {
        return true;
    }

    if (CollectionUtils.isEmpty(servers)) {
        // The distro config is not ready yet.
        return false;
    }

    final String localAddress = EnvUtil.getLocalAddress();
    final int firstSlot = servers.indexOf(localAddress);
    final int lastSlot = servers.lastIndexOf(localAddress);
    final boolean localServerListed = firstSlot >= 0 && lastSlot >= 0;
    if (!localServerListed) {
        return true;
    }

    final int targetSlot = distroHash(serviceName) % servers.size();
    return firstSlot <= targetSlot && targetSlot <= lastSlot;
}
/**
 * Maps a service name to a non-negative hash.
 *
 * @param serviceName service name to hash
 * @return absolute value of the name's hash code reduced modulo {@link Integer#MAX_VALUE}
 */
private int distroHash(String serviceName) {
    // The remainder lies strictly between -MAX_VALUE and MAX_VALUE, so negating it cannot overflow.
    final int reduced = serviceName.hashCode() % Integer.MAX_VALUE;
    return reduced < 0 ? -reduced : reduced;
}
......
}
复制代码
- 算法主要是对nacos集群节点取模,然后来判断在哪一个节点进行心跳连接。
- 但这里就会有一个问题,那就是如果nacos节点挂了,那么取模的方式就会有问题,所以nacos集群节点就需要进行同步状态。
- Nacos集群架构下节点状态同步的设计原理
最后
- 虚心学习,共同进步-_-