github.com/polarismesh/polaris@v1.17.8/service/batch/instance.go (about) 1 /** 2 * Tencent is pleased to support the open source community by making Polaris available. 3 * 4 * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 5 * 6 * Licensed under the BSD 3-Clause License (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * https://opensource.org/licenses/BSD-3-Clause 11 * 12 * Unless required by applicable law or agreed to in writing, software distributed 13 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 14 * CONDITIONS OF ANY KIND, either express or implied. See the License for the 15 * specific language governing permissions and limitations under the License. 16 */ 17 18 package batch 19 20 import ( 21 "context" 22 "errors" 23 "fmt" 24 "strconv" 25 "time" 26 27 "github.com/golang/protobuf/ptypes/wrappers" 28 apimodel "github.com/polarismesh/specification/source/go/api/v1/model" 29 "go.uber.org/zap" 30 31 "github.com/polarismesh/polaris/cache" 32 api "github.com/polarismesh/polaris/common/api/v1" 33 "github.com/polarismesh/polaris/common/model" 34 commonstore "github.com/polarismesh/polaris/common/store" 35 "github.com/polarismesh/polaris/common/utils" 36 "github.com/polarismesh/polaris/store" 37 ) 38 39 var ( 40 ErrorNotFoundService = errors.New("not found service") 41 ErrorSameRegIsInstanceRequest = errors.New("there is the same instance request") 42 ErrorRegIsInstanceTimeout = errors.New("polaris-sever regis instance busy") 43 ) 44 45 const ( 46 defaultWaitTime = 32 * time.Millisecond 47 defaultTaskLife = 30 * time.Second 48 ) 49 50 // InstanceCtrl 批量操作实例的类 51 type InstanceCtrl struct { 52 config *CtrlConfig 53 storage store.Store 54 cacheMgn *cache.CacheManager 55 56 // store协程,负责写操作 57 storeThreadCh []chan []*InstanceFuture 58 59 // store协程里面调用的instance处理函数,可以是注册和反注册 60 instanceHandler func([]*InstanceFuture) error 61 62 // 空闲的store协程,记录每一个空闲id 63 idleStoreThread chan int 64 waitDuration time.Duration 65 66 // 任务的有效时间 67 taskLife time.Duration 68 69 // 请求接受协程 70 queue chan *InstanceFuture 71 label string 72 73 // 是否开启了心跳上报功能 74 hbOpen bool 75 } 76 77 // NewBatchRegisterCtrl 注册实例批量操作对象 78 func NewBatchRegisterCtrl(storage store.Store, cacheMgn *cache.CacheManager, 79 config *CtrlConfig) (*InstanceCtrl, error) { 80 81 register, err := newBatchInstanceCtrl("register", storage, cacheMgn, config) 82 if err != nil { 83 return nil, err 84 } 85 if register == nil { 86 return nil, nil 87 } 88 89 log.Info("[Batch] open batch register") 90 register.label = "register" 91 register.instanceHandler = register.registerHandler 92 return register, nil 93 } 94 95 // NewBatchDeregisterCtrl 实例反注册的操作对象 96 func NewBatchDeregisterCtrl(storage store.Store, cacheMgn *cache.CacheManager, config *CtrlConfig) ( 97 *InstanceCtrl, error) { 98 deregister, err := newBatchInstanceCtrl("deregister", storage, cacheMgn, config) 99 if err != nil { 100 return nil, err 101 } 102 if deregister == nil { 103 return nil, nil 104 } 105 106 log.Info("[Batch] open batch deregister") 107 deregister.label = "deregister" 108 deregister.instanceHandler = deregister.deregisterHandler 109 110 return deregister, nil 111 } 112 113 // NewBatchHeartbeatCtrl 实例心跳的操作对象 114 func NewBatchHeartbeatCtrl(storage store.Store, cacheMgn *cache.CacheManager, config *CtrlConfig) ( 115 *InstanceCtrl, error) { 116 heartbeat, err := newBatchInstanceCtrl("heartbeat", storage, cacheMgn, config) 117 if err != nil { 118 return nil, err 119 } 120 if heartbeat == nil { 121 return nil, nil 122 } 123 124 log.Info("[Batch] open batch heartbeat") 125 heartbeat.label = "heartbeat" 126 heartbeat.instanceHandler = heartbeat.heartbeatHandler 127 128 return heartbeat, nil 129 } 130 131 // Start 开始启动批量操作实例的相关协程 132 func (ctrl *InstanceCtrl) Start(ctx context.Context) { 133 log.Infof("[Batch] Start batch instance, config: %+v", ctrl.config) 134 135 // 初始化并且启动多个store协程,并发对数据库写 136 for i := 0; i < ctrl.config.Concurrency; i++ { 137 ctrl.storeThreadCh = append(ctrl.storeThreadCh, make(chan []*InstanceFuture)) 138 } 139 for i := 0; i < ctrl.config.Concurrency; i++ { 140 go ctrl.storeWorker(ctx, i) 141 } 142 143 // 进入主循环 144 ctrl.mainLoop(ctx) 145 } 146 147 // newBatchInstanceCtrl 创建批量控制instance的对象 148 func newBatchInstanceCtrl(label string, storage store.Store, cacheMgn *cache.CacheManager, 149 config *CtrlConfig) (*InstanceCtrl, error) { 150 151 if config == nil || !config.Open { 152 return nil, nil 153 } 154 155 duration, err := time.ParseDuration(config.WaitTime) 156 if err != nil { 157 log.Errorf("[Batch] parse waitTime(%s) err: %s", config.WaitTime, err.Error()) 158 return nil, err 159 } 160 if duration == 0 { 161 log.Infof("[Batch] waitTime(%s) is 0, use default %v", config.WaitTime, defaultWaitTime) 162 duration = defaultWaitTime 163 } 164 165 taskLife := defaultTaskLife 166 if config.TaskLife != "" { 167 taskLife, err := time.ParseDuration(config.TaskLife) 168 if err != nil { 169 log.Errorf("[Batch] parse taskLife(%s) err: %s", config.TaskLife, err.Error()) 170 return nil, err 171 } 172 if taskLife == 0 { 173 log.Infof("[Batch] taskLife(%s) is 0, use default %v", config.TaskLife, defaultTaskLife) 174 taskLife = defaultTaskLife 175 } 176 } else { 177 // mean not allow drop expire task 178 taskLife = time.Duration(0) 179 } 180 log.Info("[Batch] drop expire task", zap.String("type", label), zap.Bool("switch-open", taskLife == 0)) 181 182 instance := &InstanceCtrl{ 183 config: config, 184 storage: storage, 185 cacheMgn: cacheMgn, 186 storeThreadCh: make([]chan []*InstanceFuture, 0, config.Concurrency), 187 idleStoreThread: make(chan int, config.Concurrency), 188 queue: make(chan *InstanceFuture, config.QueueSize), 189 waitDuration: duration, 190 taskLife: taskLife, 191 } 192 return instance, nil 193 } 194 195 // mainLoop 注册主协程 196 // 从注册队列中获取注册请求,当达到b.config.MaxBatchCount, 197 // 或当到了一个超时时间b.waitDuration,则发起一个写请求 198 // 写请求发送到store协程,规则:从空闲的管道idleStoreThread中挑选一个 199 func (ctrl *InstanceCtrl) mainLoop(ctx context.Context) { 200 futures := make([]*InstanceFuture, 0, ctrl.config.MaxBatchCount) 201 idx := 0 202 triggerConsume := func(data []*InstanceFuture) { 203 if idx == 0 { 204 return 205 } 206 // 选择一个idle的store协程写数据 TODO 这里需要统计一下 207 idleIdx := <-ctrl.idleStoreThread 208 ctrl.storeThreadCh[idleIdx] <- data 209 futures = make([]*InstanceFuture, 0, ctrl.config.MaxBatchCount) 210 idx = 0 211 } 212 // 启动接受注册请求的协程 213 go func() { 214 ticker := time.NewTicker(ctrl.waitDuration) 215 defer ticker.Stop() 216 for { 217 select { 218 case future := <-ctrl.queue: 219 futures = append(futures, future) 220 idx++ 221 if idx == ctrl.config.MaxBatchCount { 222 triggerConsume(futures[0:idx]) 223 } 224 case <-ticker.C: 225 triggerConsume(futures[0:idx]) 226 case <-ctx.Done(): 227 log.Infof("[Batch] %s main loop exited", ctrl.label) 228 return 229 } 230 } 231 }() 232 } 233 234 // storeWorker store写协程的主循环 235 // 从chan中获取数据,直接写数据库 236 // 每次写完,设置协程为空闲 237 func (ctrl *InstanceCtrl) storeWorker(ctx context.Context, index int) { 238 log.Infof("[Batch] %s worker(%d) running in main loop", ctrl.label, index) 239 // store协程启动,先把自己注册到idle中 240 ctrl.idleStoreThread <- index 241 // 主循环 242 for { 243 select { 244 case futures := <-ctrl.storeThreadCh[index]: 245 if err := ctrl.instanceHandler(futures); err != nil { 246 // 所有的错误都在instanceHandler函数里面进行答复和处理,这里只需记录一条日志 247 log.Errorf("[Batch] %s instances err: %s", ctrl.label, err.Error()) 248 } 249 ctrl.idleStoreThread <- index 250 case <-ctx.Done(): 251 // idle is not ready 252 log.Infof("[Batch] %s worker(%d) exited", ctrl.label, index) 253 return 254 } 255 } 256 } 257 258 // registerHandler 外部应该把鉴权完成 259 // 判断实例是否存在,也可以提前判断,减少batch复杂度 260 // 提前通过token判断,再进入batch操作 261 // batch操作,只是写操作 262 func (ctrl *InstanceCtrl) registerHandler(futures []*InstanceFuture) error { 263 if len(futures) == 0 { 264 log.Warn("[Batch] futures is empty") 265 return nil 266 } 267 268 cur := time.Now() 269 taskLife := ctrl.taskLife 270 dropExpire := taskLife != 0 271 272 log.Infof("[Batch] Start batch creating instances count: %d", len(futures)) 273 remains := make(map[string]*InstanceFuture, len(futures)) 274 for i := range futures { 275 entry := futures[i] 276 277 if _, ok := remains[entry.request.GetId().GetValue()]; ok { 278 entry.Reply(cur, apimodel.Code_SameInstanceRequest, ErrorSameRegIsInstanceRequest) 279 continue 280 } 281 282 if dropExpire && entry.CanDrop() && entry.begin.Add(taskLife).Before(cur) { 283 entry.Reply(cur, apimodel.Code_InstanceRegisTimeout, ErrorRegIsInstanceTimeout) 284 continue 285 } 286 287 remains[entry.request.GetId().GetValue()] = entry 288 } 289 290 // 统一判断实例是否存在,存在则需要更新部分数据 291 if err := ctrl.batchRestoreInstanceIsolate(remains); err != nil { 292 log.Errorf("[Batch] batch check instances existed err: %s", err.Error()) 293 } 294 295 // 判断入参数组是否为0 296 if len(remains) == 0 { 297 log.Info("[Batch] all instances is existed, return create instances process") 298 return nil 299 } 300 // 构造model数据 301 for _, entry := range remains { 302 ins := model.CreateInstanceModel(entry.serviceId, entry.request) 303 entry.SetInstance(ins) 304 } 305 // 调用batch接口,创建实例 306 instances := make([]*model.Instance, 0, len(remains)) 307 for _, entry := range remains { 308 instances = append(instances, entry.instance) 309 } 310 if err := ctrl.storage.BatchAddInstances(instances); err != nil { 311 sendReply(remains, commonstore.StoreCode2APICode(err), err) 312 return err 313 } 314 315 sendReply(remains, apimodel.Code_ExecuteSuccess, nil) 316 return nil 317 } 318 319 // heartbeatHandler 心跳状态变更处理函数 320 func (ctrl *InstanceCtrl) heartbeatHandler(futures []*InstanceFuture) error { 321 if len(futures) == 0 { 322 return nil 323 } 324 log.Infof("[Batch] start batch heartbeat instances count: %d", len(futures)) 325 ids := make(map[string]bool, len(futures)) 326 statusToIds := map[bool]map[string]int64{ 327 true: make(map[string]int64, len(futures)), 328 false: make(map[string]int64, len(futures)), 329 } 330 for _, entry := range futures { 331 // 多个记录,只有后面的一个生效 332 id := entry.request.GetId().GetValue() 333 if _, ok := ids[id]; ok { 334 values := statusToIds[!entry.healthy] 335 delete(values, id) 336 } 337 ids[id] = false 338 statusToIds[entry.healthy][id] = entry.lastHeartbeatTimeSec 339 } 340 341 // 转为不健康的实例,需要添加 metadata 342 appendMetaReqs := make([]*store.InstanceMetadataRequest, 0, len(statusToIds[false])) 343 // 转为健康的实例,需要删除 metadata 344 removeMetaReqs := make([]*store.InstanceMetadataRequest, 0, len(statusToIds[true])) 345 revision := utils.NewUUID() 346 for healthy, values := range statusToIds { 347 if len(values) == 0 { 348 continue 349 } 350 idValues := make([]interface{}, 0, len(values)) 351 for id := range values { 352 if healthy { 353 removeMetaReqs = append(removeMetaReqs, &store.InstanceMetadataRequest{ 354 InstanceID: id, 355 Revision: revision, 356 Keys: []string{model.MetadataInstanceLastHeartbeatTime}, 357 }) 358 } else { 359 appendMetaReqs = append(appendMetaReqs, &store.InstanceMetadataRequest{ 360 InstanceID: id, 361 Revision: revision, 362 Metadata: map[string]string{ 363 model.MetadataInstanceLastHeartbeatTime: strconv.FormatInt(values[id], 10), 364 }, 365 }) 366 } 367 idValues = append(idValues, id) 368 } 369 err := ctrl.storage.BatchSetInstanceHealthStatus(idValues, model.StatusBoolToInt(healthy), utils.NewUUID()) 370 if err != nil { 371 log.Errorf("[Batch] batch healthy check instances err: %s", err.Error()) 372 sendReply(futures, commonstore.StoreCode2APICode(err), err) 373 return err 374 } 375 if err := ctrl.storage.BatchAppendInstanceMetadata(appendMetaReqs); err != nil { 376 log.Errorf("[Batch] batch healthy check instances append metadata err: %s", err.Error()) 377 sendReply(futures, commonstore.StoreCode2APICode(err), err) 378 return err 379 } 380 if err := ctrl.storage.BatchRemoveInstanceMetadata(removeMetaReqs); err != nil { 381 log.Errorf("[Batch] batch healthy check instances remove metadata err: %s", err.Error()) 382 sendReply(futures, commonstore.StoreCode2APICode(err), err) 383 return err 384 } 385 } 386 sendReply(futures, apimodel.Code_ExecuteSuccess, nil) 387 return nil 388 } 389 390 // deregisterHandler 反注册处理函数 391 // 步骤: 392 // - 从数据库中批量读取实例ID对应的实例简要信息: 393 // 包括:ID,host,port,serviceName,serviceNamespace,serviceToken 394 // - 对instance做存在与token的双重校验,较少与数据库的交互 395 // - 对于不存在的token,返回notFoundResource 396 // - 对于token校验失败的,返回校验失败 397 // - 调用批量接口删除实例 398 func (ctrl *InstanceCtrl) deregisterHandler(futures []*InstanceFuture) error { 399 if len(futures) == 0 { 400 return nil 401 } 402 403 cur := time.Now() 404 log.Infof("[Batch] Start batch deregister instances count: %d", len(futures)) 405 remains := make(map[string]*InstanceFuture, len(futures)) 406 ids := make(map[string]bool, len(futures)) 407 for _, entry := range futures { 408 if _, ok := remains[entry.request.GetId().GetValue()]; ok { 409 entry.Reply(cur, apimodel.Code_SameInstanceRequest, ErrorSameRegIsInstanceRequest) 410 continue 411 } 412 413 remains[entry.request.GetId().GetValue()] = entry 414 ids[entry.request.GetId().GetValue()] = false 415 } 416 417 // 统一鉴权与判断是否存在 418 instances, err := ctrl.storage.GetInstancesBrief(ids) 419 if err != nil { 420 log.Errorf("[Batch] get instances service token err: %s", err.Error()) 421 sendReply(remains, commonstore.StoreCode2APICode(err), err) 422 return err 423 } 424 for _, future := range futures { 425 instance, ok := instances[future.request.GetId().GetValue()] 426 if !ok { 427 // 不存在,意味着不需要删除了 428 future.Reply(cur, apimodel.Code_NotFoundResource, fmt.Errorf("%s", api.Code2Info(api.NotFoundResource))) 429 delete(remains, future.request.GetId().GetValue()) 430 continue 431 } 432 433 future.SetInstance(instance) // 这里保存instance的目的:方便上层使用model数据 434 } 435 436 if len(remains) == 0 { 437 log.Infof("[Batch] deregister instances verify failed or instances is not existed, no remain any instances") 438 return nil 439 } 440 441 // 调用storage batch接口,删除实例 442 args := make([]interface{}, 0, len(remains)) 443 for _, entry := range remains { 444 args = append(args, entry.request.GetId().GetValue()) 445 } 446 if err := ctrl.storage.BatchDeleteInstances(args); err != nil { 447 log.Errorf("[Batch] batch delete instances err: %s", err.Error()) 448 sendReply(remains, commonstore.StoreCode2APICode(err), err) 449 return err 450 } 451 452 sendReply(remains, apimodel.Code_ExecuteSuccess, nil) 453 return nil 454 } 455 456 // batchRestoreInstanceIsolate 批量恢复实例的隔离状态,以请求为准,请求如果不存在,就以数据库为准 457 func (ctrl *InstanceCtrl) batchRestoreInstanceIsolate(futures map[string]*InstanceFuture) error { 458 if len(futures) == 0 { 459 return nil 460 } 461 462 // 初始化所有的id都是不存在的 463 ids := make(map[string]bool, len(futures)) 464 for _, entry := range futures { 465 ids[entry.request.GetId().GetValue()] = false 466 } 467 var id2Isolate map[string]bool 468 var err error 469 if id2Isolate, err = ctrl.storage.BatchGetInstanceIsolate(ids); err != nil { 470 log.Errorf("[Batch] check instances existed storage err: %s", err.Error()) 471 sendReply(futures, commonstore.StoreCode2APICode(err), err) 472 return err 473 } 474 475 if len(id2Isolate) == 0 { 476 return nil 477 } 478 479 if len(id2Isolate) > 0 { 480 for id, isolate := range id2Isolate { 481 if future, ok := futures[id]; ok && future.request.Isolate == nil { 482 future.request.Isolate = &wrappers.BoolValue{Value: isolate} 483 } 484 } 485 } 486 return nil 487 }