github.com/polarismesh/polaris@v1.17.8/plugin/healthchecker/leader/checker_leader.go (about) 1 /** 2 * Tencent is pleased to support the open source community by making Polaris available. 3 * 4 * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 5 * 6 * Licensed under the BSD 3-Clause License (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * https://opensource.org/licenses/BSD-3-Clause 11 * 12 * Unless required by applicable law or agreed to in writing, software distributed 13 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 14 * CONDITIONS OF ANY KIND, either express or implied. See the License for the 15 * specific language governing permissions and limitations under the License. 16 */ 17 18 package leader 19 20 import ( 21 "context" 22 "errors" 23 "runtime" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 "go.uber.org/zap" 29 "google.golang.org/grpc/metadata" 30 31 "github.com/polarismesh/polaris/common/batchjob" 32 "github.com/polarismesh/polaris/common/eventhub" 33 commontime "github.com/polarismesh/polaris/common/time" 34 "github.com/polarismesh/polaris/common/utils" 35 "github.com/polarismesh/polaris/plugin" 36 "github.com/polarismesh/polaris/store" 37 ) 38 39 func init() { 40 d := &LeaderHealthChecker{} 41 plugin.RegisterPlugin(d.Name(), d) 42 } 43 44 const ( 45 // PluginName plugin name 46 PluginName = "heartbeatLeader" 47 // Servers key to manage hb servers 48 Servers = "servers" 49 // CountSep separator to divide server and count 50 Split = "|" 51 // optionSoltNum option key of soltNum 52 optionSoltNum = "soltNum" 53 // optionStreamNum option key of batch heartbeat stream num 54 optionStreamNum = "streamNum" 55 // electionKey use election key 56 electionKey = store.ElectionKeySelfServiceChecker 57 // subscriberName eventhub subscriber name 58 subscriberName = PluginName 59 // uninitializeSignal . 60 uninitializeSignal = int32(0) 61 // initializedSignal . 62 initializedSignal = int32(1) 63 // sendResource . 64 sendResource = "leaderchecker" 65 ) 66 67 var ( 68 // DefaultSoltNum default soltNum of LocalBeatRecordCache 69 DefaultSoltNum = int32(runtime.GOMAXPROCS(0) * 16) 70 // streamNum 71 streamNum = runtime.GOMAXPROCS(0) 72 ) 73 74 var ( 75 ErrorRedirectOnlyOnce = errors.New("redirect request only once") 76 ) 77 78 // LeaderHealthChecker Leader~Follower 节点心跳健康检查 79 // 1. 监听 LeaderChangeEvent 事件, 80 // 2. LeaderHealthChecker 启动时先根据 store 层的 LeaderElection 选举能力选出一个 Leader 81 // 3. Leader 和 Follower 之间建立 gRPC 长连接 82 // 4. LeaderHealthChecker 在处理 Report/Query/Check/Delete 先判断自己是否为 Leader 83 // - Leader 节点 84 // a. 心跳数据的读写直接写本地 map 内存 85 // - 非 Leader 节点 86 // a. 心跳写请求通过 gRPC 长连接直接发给 Leader 节点 87 // b. 心跳读请求通过 gRPC 长连接直接发给 Leader 节点,Leader 节点返回心跳时间戳信息 88 type LeaderHealthChecker struct { 89 initialize int32 90 // leaderChangeTimeSec last peer list start refresh occur timestamp 91 leaderChangeTimeSec int64 92 // suspendTimeSec healthcheck last suspend timestamp 93 suspendTimeSec int64 94 // conf leaderChecker config 95 conf *Config 96 // lock keeps safe to change leader info 97 lock sync.RWMutex 98 // leader leader signal 99 leader int32 100 // leaderVersion 自己本地记录的 101 leaderVersion int64 102 // remote remote peer info 103 remote Peer 104 // self self peer info 105 self Peer 106 // s store.Store 107 s store.Store 108 // subCtx 109 subCtx *eventhub.SubscribtionContext 110 } 111 112 // Name . 113 func (c *LeaderHealthChecker) Name() string { 114 return PluginName 115 } 116 117 // Initialize . 118 func (c *LeaderHealthChecker) Initialize(entry *plugin.ConfigEntry) error { 119 conf, err := unmarshal(entry.Option) 120 if err != nil { 121 return err 122 } 123 streamNum = int(conf.StreamNum) 124 c.conf = conf 125 c.self = NewLocalPeerFunc() 126 c.self.Initialize(*conf) 127 if err := c.self.Serve(context.Background(), c, "", 0); err != nil { 128 return err 129 } 130 subCtx, err := eventhub.Subscribe(eventhub.LeaderChangeEventTopic, c) 131 if err != nil { 132 return err 133 } 134 c.subCtx = subCtx 135 if c.s == nil { 136 storage, err := store.GetStore() 137 if err != nil { 138 return err 139 } 140 c.s = storage 141 } 142 if err := c.s.StartLeaderElection(electionKey); err != nil { 143 return err 144 } 145 registerMetrics() 146 return nil 147 } 148 149 // PreProcess do preprocess logic for event 150 func (c *LeaderHealthChecker) PreProcess(ctx context.Context, value any) any { 151 return value 152 } 153 154 // OnEvent event trigger 155 func (c *LeaderHealthChecker) OnEvent(ctx context.Context, i interface{}) error { 156 e, ok := i.(store.LeaderChangeEvent) 157 if !ok || e.Key != electionKey { 158 return nil 159 } 160 161 c.lock.Lock() 162 defer c.lock.Unlock() 163 164 atomic.AddInt64(&c.leaderVersion, 1) 165 atomic.StoreInt32(&c.initialize, uninitializeSignal) 166 curLeaderVersion := atomic.LoadInt64(&c.leaderVersion) 167 if e.Leader { 168 c.becomeLeader() 169 } else { 170 c.becomeFollower(e, curLeaderVersion) 171 c.self.Storage().Clean() 172 } 173 c.refreshLeaderChangeTimeSec() 174 return nil 175 } 176 177 func (c *LeaderHealthChecker) becomeLeader() { 178 if c.remote != nil { 179 plog.Info("[HealthCheck][Leader] become leader and close old leader", 180 zap.String("leader", c.remote.Host())) 181 // 关闭原来自己跟随的 leader 节点信息 182 _ = c.remote.Close() 183 c.remote = nil 184 } 185 if !c.isLeader() { 186 plog.Info("[HealthCheck][Leader] self become leader") 187 } 188 // leader 指向自己 189 atomic.StoreInt32(&c.leader, 1) 190 atomic.StoreInt32(&c.initialize, initializedSignal) 191 } 192 193 func (c *LeaderHealthChecker) becomeFollower(e store.LeaderChangeEvent, leaderVersion int64) { 194 // election.Host == "", 等待下一次通知 195 if e.LeaderHost == "" { 196 return 197 } 198 plog.Info("[HealthCheck][Leader] self become follower") 199 if c.remote != nil { 200 // leader 未发生变化 201 if e.LeaderHost == c.remote.Host() { 202 atomic.StoreInt32(&c.initialize, initializedSignal) 203 return 204 } 205 // leader 出现变更切换 206 if e.LeaderHost != c.remote.Host() { 207 plog.Info("[HealthCheck][Leader] become follower and leader change", 208 zap.String("leader", e.LeaderHost)) 209 // 关闭原来的 leader 节点信息 210 oldLeader := c.remote 211 c.remote = nil 212 _ = oldLeader.Close() 213 } 214 } 215 remoteLeader := NewRemotePeerFunc() 216 remoteLeader.Initialize(*c.conf) 217 if err := remoteLeader.Serve(context.Background(), c, e.LeaderHost, uint32(utils.LocalPort)); err != nil { 218 _ = remoteLeader.Close() 219 plog.Error("[HealthCheck][Leader] follower run serve, do retry", zap.Error(err)) 220 go func(e store.LeaderChangeEvent, leaderVersion int64) { 221 time.Sleep(time.Second) 222 c.lock.Lock() 223 defer c.lock.Unlock() 224 curVersion := atomic.LoadInt64(&c.leaderVersion) 225 if leaderVersion != curVersion { 226 return 227 } 228 c.becomeFollower(e, leaderVersion) 229 }(e, leaderVersion) 230 return 231 } 232 c.remote = remoteLeader 233 atomic.StoreInt32(&c.leader, 0) 234 atomic.StoreInt32(&c.initialize, initializedSignal) 235 return 236 } 237 238 // Destroy . 239 func (c *LeaderHealthChecker) Destroy() error { 240 c.subCtx.Cancel() 241 return nil 242 } 243 244 // Type for health check plugin, only one same type plugin is allowed 245 func (c *LeaderHealthChecker) Type() plugin.HealthCheckType { 246 return plugin.HealthCheckerHeartbeat 247 } 248 249 // Report process heartbeat info report 250 func (c *LeaderHealthChecker) Report(ctx context.Context, request *plugin.ReportRequest) error { 251 if !c.isLeader() && isSendFromPeer(ctx) { 252 plog.Error("[Health Check][Leader] follower checker receive other follower request") 253 return ErrorRedirectOnlyOnce 254 } 255 256 c.lock.RLock() 257 defer c.lock.RUnlock() 258 if !c.isInitialize() { 259 plog.Warn("[Health Check][Leader] leader checker uninitialize, ignore report") 260 return nil 261 } 262 responsible := c.findLeaderPeer() 263 record := WriteBeatRecord{ 264 Record: RecordValue{ 265 Server: responsible.Host(), 266 CurTimeSec: request.CurTimeSec, 267 Count: request.Count, 268 }, 269 Key: request.InstanceId, 270 } 271 if err := responsible.Put(record); err != nil { 272 return err 273 } 274 if log.DebugEnabled() { 275 log.Debugf("[HealthCheck][Leader] add hb record, instanceId %s, record %+v", request.InstanceId, record) 276 } 277 return nil 278 } 279 280 // Check process the instance check 281 func (c *LeaderHealthChecker) Check(request *plugin.CheckRequest) (*plugin.CheckResponse, error) { 282 queryResp, err := c.Query(context.Background(), &request.QueryRequest) 283 if err != nil { 284 return nil, err 285 } 286 lastHeartbeatTime := queryResp.LastHeartbeatSec 287 checkResp := &plugin.CheckResponse{ 288 LastHeartbeatTimeSec: lastHeartbeatTime, 289 } 290 curTimeSec := request.CurTimeSec() 291 if c.skipCheck(request.InstanceId, int64(request.ExpireDurationSec)) { 292 checkResp.StayUnchanged = true 293 return checkResp, nil 294 } 295 if log.DebugEnabled() { 296 log.Debug("[HealthCheck][Leader] check hb record", zap.String("id", request.InstanceId), 297 zap.Int64("curTimeSec", curTimeSec), zap.Int64("lastHeartbeatTime", lastHeartbeatTime)) 298 } 299 if curTimeSec > lastHeartbeatTime { 300 if curTimeSec-lastHeartbeatTime >= int64(request.ExpireDurationSec) { 301 // 心跳超时 302 checkResp.Healthy = false 303 if request.Healthy { 304 log.Infof("[Health Check][Leader] health check expired, "+ 305 "last hb timestamp is %d, curTimeSec is %d, expireDurationSec is %d, instanceId %s", 306 lastHeartbeatTime, curTimeSec, request.ExpireDurationSec, request.InstanceId) 307 } else { 308 checkResp.StayUnchanged = true 309 } 310 return checkResp, nil 311 } 312 } 313 checkResp.Healthy = true 314 if !request.Healthy { 315 log.Infof("[Health Check][Leader] health check resumed, "+ 316 "last hb timestamp is %d, curTimeSec is %d, expireDurationSec is %d instanceId %s", 317 lastHeartbeatTime, curTimeSec, request.ExpireDurationSec, request.InstanceId) 318 } else { 319 checkResp.StayUnchanged = true 320 } 321 322 return checkResp, nil 323 } 324 325 // Query queries the heartbeat time 326 func (c *LeaderHealthChecker) Query(ctx context.Context, request *plugin.QueryRequest) (*plugin.QueryResponse, error) { 327 if isSendFromPeer(ctx) { 328 return nil, ErrorRedirectOnlyOnce 329 } 330 331 c.lock.RLock() 332 defer c.lock.RUnlock() 333 if !c.isInitialize() { 334 plog.Infof("[Health Check][Leader] leader checker uninitialize, ignore query") 335 return &plugin.QueryResponse{ 336 LastHeartbeatSec: 0, 337 }, nil 338 } 339 responsible := c.findLeaderPeer() 340 record, err := responsible.Get(request.InstanceId) 341 if err != nil { 342 return nil, err 343 } 344 if log.DebugEnabled() { 345 log.Debugf("[HealthCheck][Leader] query hb record, instanceId %s, record %+v", request.InstanceId, record) 346 } 347 return &plugin.QueryResponse{ 348 Server: responsible.Host(), 349 LastHeartbeatSec: record.Record.CurTimeSec, 350 Count: record.Record.Count, 351 Exists: record.Exist, 352 }, nil 353 } 354 355 // Delete delete record by key 356 func (c *LeaderHealthChecker) Delete(ctx context.Context, key string) error { 357 if isSendFromPeer(ctx) { 358 return ErrorRedirectOnlyOnce 359 } 360 c.lock.RLock() 361 defer c.lock.RUnlock() 362 responsible := c.findLeaderPeer() 363 return responsible.Del(key) 364 } 365 366 // Suspend checker for an entire expired interval 367 func (c *LeaderHealthChecker) Suspend() { 368 curTimeMilli := commontime.CurrentMillisecond() / 1000 369 log.Infof("[Health Check][Leader] suspend checker, start time %d", curTimeMilli) 370 atomic.StoreInt64(&c.suspendTimeSec, curTimeMilli) 371 } 372 373 // SuspendTimeSec get suspend time in seconds 374 func (c *LeaderHealthChecker) SuspendTimeSec() int64 { 375 return atomic.LoadInt64(&c.suspendTimeSec) 376 } 377 378 func (c *LeaderHealthChecker) findLeaderPeer() Peer { 379 if c.isLeader() { 380 return c.self 381 } 382 return c.remote 383 } 384 385 func (c *LeaderHealthChecker) skipCheck(key string, expireDurationSec int64) bool { 386 // 如果没有初始化,则忽略检查 387 if !c.isInitialize() { 388 log.Infof("[Health Check][Leader] leader checker uninitialize, ensure skip check") 389 return true 390 } 391 392 suspendTimeSec := c.SuspendTimeSec() 393 localCurTimeSec := commontime.CurrentMillisecond() / 1000 394 if suspendTimeSec > 0 && localCurTimeSec >= suspendTimeSec && 395 localCurTimeSec-suspendTimeSec < expireDurationSec { 396 log.Infof("[Health Check][Leader] health check peers suspended, "+ 397 "suspendTimeSec is %d, localCurTimeSec is %d, expireDurationSec is %d, id %s", 398 suspendTimeSec, localCurTimeSec, expireDurationSec, key) 399 return true 400 } 401 402 // 当 T1 时刻出现 Leader 节点切换,到 T2 时刻 Leader 节点切换成,在这期间,可能会出现以下情况 403 // case 1: T1~T2 时刻不存在 Leader 404 // case 2: T1~T2 时刻存在多个 Leader 405 leaderChangeTimeSec := c.LeaderChangeTimeSec() 406 if leaderChangeTimeSec > 0 && localCurTimeSec >= leaderChangeTimeSec && 407 localCurTimeSec-leaderChangeTimeSec < expireDurationSec { 408 log.Infof("[Health Check][Leader] health check peers on refresh, "+ 409 "refreshPeerTimeSec is %d, localCurTimeSec is %d, expireDurationSec is %d, id %s", 410 suspendTimeSec, localCurTimeSec, expireDurationSec, key) 411 return true 412 } 413 return false 414 } 415 416 func (c *LeaderHealthChecker) refreshLeaderChangeTimeSec() { 417 atomic.StoreInt64(&c.leaderChangeTimeSec, commontime.CurrentMillisecond()/1000) 418 } 419 420 func (c *LeaderHealthChecker) LeaderChangeTimeSec() int64 { 421 return atomic.LoadInt64(&c.leaderChangeTimeSec) 422 } 423 424 func (c *LeaderHealthChecker) isInitialize() bool { 425 return atomic.LoadInt32(&c.initialize) == initializedSignal 426 } 427 428 func (c *LeaderHealthChecker) isLeader() bool { 429 return atomic.LoadInt32(&c.leader) == 1 430 } 431 432 func (c *LeaderHealthChecker) DebugHandlers() []plugin.DebugHandler { 433 return []plugin.DebugHandler{ 434 { 435 Path: "/debug/checker/leader/info", 436 Handler: handleDescribeLeaderInfo(c), 437 }, 438 { 439 Path: "/debug/checker/leader/cache", 440 Handler: handleDescribeBeatCache(c), 441 }, 442 } 443 } 444 445 func (c *LeaderHealthChecker) handleSendGetRecords(futures []batchjob.Future) { 446 peers := make(map[string]*PeerReadTask) 447 for i := range futures { 448 taskInfo := futures[i].Param() 449 task := taskInfo.(*PeerTask) 450 peer := task.Peer 451 if peer.isClose() { 452 _ = futures[i].Reply(nil, ErrorPeerClosed) 453 continue 454 } 455 if _, ok := peers[peer.Host()]; !ok { 456 peers[peer.Host()] = &PeerReadTask{ 457 Peer: peer, 458 Keys: make([]string, 0, 16), 459 Futures: make(map[string][]batchjob.Future), 460 } 461 } 462 key := task.Key 463 peers[peer.Host()].Keys = append(peers[peer.Host()].Keys, key) 464 if _, ok := peers[peer.Host()].Futures[key]; !ok { 465 peers[peer.Host()].Futures[key] = make([]batchjob.Future, 0, 4) 466 } 467 peers[peer.Host()].Futures[key] = append(peers[peer.Host()].Futures[key], futures[i]) 468 } 469 470 for i := range peers { 471 peer := peers[i].Peer 472 keys := peers[i].Keys 473 peerfutures := peers[i].Futures 474 resp := peer.Cache.Get(keys...) 475 for key := range resp { 476 fs := peerfutures[key] 477 for index := range fs { 478 _ = fs[index].Reply(map[string]*ReadBeatRecord{ 479 key: resp[key], 480 }, nil) 481 } 482 } 483 } 484 for i := range futures { 485 _ = futures[i].Reply(nil, ErrorRecordNotFound) 486 } 487 } 488 489 func (c *LeaderHealthChecker) handleSendPutRecords(futures []batchjob.Future) { 490 peers := make(map[string]*PeerWriteTask) 491 for i := range futures { 492 taskInfo := futures[i].Param() 493 task := taskInfo.(*PeerTask) 494 peer := task.Peer 495 if peer.isClose() { 496 _ = futures[i].Reply(nil, ErrorPeerClosed) 497 continue 498 } 499 if _, ok := peers[peer.Host()]; !ok { 500 peers[peer.Host()] = &PeerWriteTask{ 501 Peer: peer, 502 Records: make([]WriteBeatRecord, 0, 16), 503 } 504 } 505 peers[peer.Host()].Records = append(peers[peer.Host()].Records, *task.Record) 506 } 507 508 for i := range peers { 509 peer := peers[i].Peer 510 peer.Cache.Put(peers[i].Records...) 511 } 512 for i := range futures { 513 _ = futures[i].Reply(struct{}{}, nil) 514 } 515 } 516 517 func isSendFromPeer(ctx context.Context) bool { 518 if md, ok := metadata.FromIncomingContext(ctx); ok { 519 if _, exist := md[sendResource]; exist { 520 return true 521 } 522 } 523 return false 524 } 525 526 type PeerTask struct { 527 Peer *RemotePeer 528 Key string 529 Record *WriteBeatRecord 530 }