github.com/polarismesh/polaris@v1.17.8/service/batch/instance.go (about)

     1  /**
     2   * Tencent is pleased to support the open source community by making Polaris available.
     3   *
     4   * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
     5   *
     6   * Licensed under the BSD 3-Clause License (the "License");
     7   * you may not use this file except in compliance with the License.
     8   * You may obtain a copy of the License at
     9   *
    10   * https://opensource.org/licenses/BSD-3-Clause
    11   *
    12   * Unless required by applicable law or agreed to in writing, software distributed
    13   * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
    14   * CONDITIONS OF ANY KIND, either express or implied. See the License for the
    15   * specific language governing permissions and limitations under the License.
    16   */
    17  
    18  package batch
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"strconv"
    25  	"time"
    26  
    27  	"github.com/golang/protobuf/ptypes/wrappers"
    28  	apimodel "github.com/polarismesh/specification/source/go/api/v1/model"
    29  	"go.uber.org/zap"
    30  
    31  	"github.com/polarismesh/polaris/cache"
    32  	api "github.com/polarismesh/polaris/common/api/v1"
    33  	"github.com/polarismesh/polaris/common/model"
    34  	commonstore "github.com/polarismesh/polaris/common/store"
    35  	"github.com/polarismesh/polaris/common/utils"
    36  	"github.com/polarismesh/polaris/store"
    37  )
    38  
// Sentinel errors of the batch package.
var (
	// ErrorNotFoundService is the sentinel for a missing service. It is not
	// referenced by the handlers in this file.
	ErrorNotFoundService          = errors.New("not found service")
	// ErrorSameRegIsInstanceRequest is replied to a request whose instance ID
	// already appears earlier in the same batch.
	ErrorSameRegIsInstanceRequest = errors.New("there is the same instance request")
	// ErrorRegIsInstanceTimeout is replied to a register request that is dropped
	// because it waited longer than the configured task life.
	ErrorRegIsInstanceTimeout     = errors.New("polaris-sever regis instance busy")
)
    44  
const (
	// defaultWaitTime is the batch flush interval newBatchInstanceCtrl falls
	// back to when the configured waitTime parses to 0.
	defaultWaitTime = 32 * time.Millisecond
	// defaultTaskLife is the default task lifetime used by newBatchInstanceCtrl.
	defaultTaskLife = 30 * time.Second
)
    49  
// InstanceCtrl batches instance operations and hands each batch to one of
// several store goroutines. The same type serves register, deregister and
// heartbeat; the constructor decides which handler is installed.
type InstanceCtrl struct {
	config   *CtrlConfig
	storage  store.Store
	cacheMgn *cache.CacheManager

	// One channel per store goroutine; each message is a batch to be written.
	storeThreadCh []chan []*InstanceFuture

	// Handler the store goroutines call for every batch; it may be the
	// register, deregister or heartbeat handler.
	instanceHandler func([]*InstanceFuture) error

	// Indexes of the store goroutines that are currently idle.
	idleStoreThread chan int
	// Interval at which the main loop flushes a partially filled batch.
	waitDuration    time.Duration

	// Lifetime of a task. registerHandler drops expired tasks only when this
	// value is non-zero.
	taskLife time.Duration

	// Queue of incoming requests waiting to be batched.
	queue chan *InstanceFuture
	// Operation name used in log messages.
	label string

	// Whether heartbeat reporting is enabled.
	hbOpen bool
}
    76  
    77  // NewBatchRegisterCtrl 注册实例批量操作对象
    78  func NewBatchRegisterCtrl(storage store.Store, cacheMgn *cache.CacheManager,
    79  	config *CtrlConfig) (*InstanceCtrl, error) {
    80  
    81  	register, err := newBatchInstanceCtrl("register", storage, cacheMgn, config)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  	if register == nil {
    86  		return nil, nil
    87  	}
    88  
    89  	log.Info("[Batch] open batch register")
    90  	register.label = "register"
    91  	register.instanceHandler = register.registerHandler
    92  	return register, nil
    93  }
    94  
    95  // NewBatchDeregisterCtrl 实例反注册的操作对象
    96  func NewBatchDeregisterCtrl(storage store.Store, cacheMgn *cache.CacheManager, config *CtrlConfig) (
    97  	*InstanceCtrl, error) {
    98  	deregister, err := newBatchInstanceCtrl("deregister", storage, cacheMgn, config)
    99  	if err != nil {
   100  		return nil, err
   101  	}
   102  	if deregister == nil {
   103  		return nil, nil
   104  	}
   105  
   106  	log.Info("[Batch] open batch deregister")
   107  	deregister.label = "deregister"
   108  	deregister.instanceHandler = deregister.deregisterHandler
   109  
   110  	return deregister, nil
   111  }
   112  
   113  // NewBatchHeartbeatCtrl 实例心跳的操作对象
   114  func NewBatchHeartbeatCtrl(storage store.Store, cacheMgn *cache.CacheManager, config *CtrlConfig) (
   115  	*InstanceCtrl, error) {
   116  	heartbeat, err := newBatchInstanceCtrl("heartbeat", storage, cacheMgn, config)
   117  	if err != nil {
   118  		return nil, err
   119  	}
   120  	if heartbeat == nil {
   121  		return nil, nil
   122  	}
   123  
   124  	log.Info("[Batch] open batch heartbeat")
   125  	heartbeat.label = "heartbeat"
   126  	heartbeat.instanceHandler = heartbeat.heartbeatHandler
   127  
   128  	return heartbeat, nil
   129  }
   130  
   131  // Start 开始启动批量操作实例的相关协程
   132  func (ctrl *InstanceCtrl) Start(ctx context.Context) {
   133  	log.Infof("[Batch] Start batch instance, config: %+v", ctrl.config)
   134  
   135  	// 初始化并且启动多个store协程,并发对数据库写
   136  	for i := 0; i < ctrl.config.Concurrency; i++ {
   137  		ctrl.storeThreadCh = append(ctrl.storeThreadCh, make(chan []*InstanceFuture))
   138  	}
   139  	for i := 0; i < ctrl.config.Concurrency; i++ {
   140  		go ctrl.storeWorker(ctx, i)
   141  	}
   142  
   143  	// 进入主循环
   144  	ctrl.mainLoop(ctx)
   145  }
   146  
   147  // newBatchInstanceCtrl 创建批量控制instance的对象
   148  func newBatchInstanceCtrl(label string, storage store.Store, cacheMgn *cache.CacheManager,
   149  	config *CtrlConfig) (*InstanceCtrl, error) {
   150  
   151  	if config == nil || !config.Open {
   152  		return nil, nil
   153  	}
   154  
   155  	duration, err := time.ParseDuration(config.WaitTime)
   156  	if err != nil {
   157  		log.Errorf("[Batch] parse waitTime(%s) err: %s", config.WaitTime, err.Error())
   158  		return nil, err
   159  	}
   160  	if duration == 0 {
   161  		log.Infof("[Batch] waitTime(%s) is 0, use default %v", config.WaitTime, defaultWaitTime)
   162  		duration = defaultWaitTime
   163  	}
   164  
   165  	taskLife := defaultTaskLife
   166  	if config.TaskLife != "" {
   167  		taskLife, err := time.ParseDuration(config.TaskLife)
   168  		if err != nil {
   169  			log.Errorf("[Batch] parse taskLife(%s) err: %s", config.TaskLife, err.Error())
   170  			return nil, err
   171  		}
   172  		if taskLife == 0 {
   173  			log.Infof("[Batch] taskLife(%s) is 0, use default %v", config.TaskLife, defaultTaskLife)
   174  			taskLife = defaultTaskLife
   175  		}
   176  	} else {
   177  		// mean not allow drop expire task
   178  		taskLife = time.Duration(0)
   179  	}
   180  	log.Info("[Batch] drop expire task", zap.String("type", label), zap.Bool("switch-open", taskLife == 0))
   181  
   182  	instance := &InstanceCtrl{
   183  		config:          config,
   184  		storage:         storage,
   185  		cacheMgn:        cacheMgn,
   186  		storeThreadCh:   make([]chan []*InstanceFuture, 0, config.Concurrency),
   187  		idleStoreThread: make(chan int, config.Concurrency),
   188  		queue:           make(chan *InstanceFuture, config.QueueSize),
   189  		waitDuration:    duration,
   190  		taskLife:        taskLife,
   191  	}
   192  	return instance, nil
   193  }
   194  
   195  // mainLoop 注册主协程
   196  // 从注册队列中获取注册请求,当达到b.config.MaxBatchCount,
   197  // 或当到了一个超时时间b.waitDuration,则发起一个写请求
   198  // 写请求发送到store协程,规则:从空闲的管道idleStoreThread中挑选一个
   199  func (ctrl *InstanceCtrl) mainLoop(ctx context.Context) {
   200  	futures := make([]*InstanceFuture, 0, ctrl.config.MaxBatchCount)
   201  	idx := 0
   202  	triggerConsume := func(data []*InstanceFuture) {
   203  		if idx == 0 {
   204  			return
   205  		}
   206  		// 选择一个idle的store协程写数据 TODO 这里需要统计一下
   207  		idleIdx := <-ctrl.idleStoreThread
   208  		ctrl.storeThreadCh[idleIdx] <- data
   209  		futures = make([]*InstanceFuture, 0, ctrl.config.MaxBatchCount)
   210  		idx = 0
   211  	}
   212  	// 启动接受注册请求的协程
   213  	go func() {
   214  		ticker := time.NewTicker(ctrl.waitDuration)
   215  		defer ticker.Stop()
   216  		for {
   217  			select {
   218  			case future := <-ctrl.queue:
   219  				futures = append(futures, future)
   220  				idx++
   221  				if idx == ctrl.config.MaxBatchCount {
   222  					triggerConsume(futures[0:idx])
   223  				}
   224  			case <-ticker.C:
   225  				triggerConsume(futures[0:idx])
   226  			case <-ctx.Done():
   227  				log.Infof("[Batch] %s main loop exited", ctrl.label)
   228  				return
   229  			}
   230  		}
   231  	}()
   232  }
   233  
   234  // storeWorker store写协程的主循环
   235  // 从chan中获取数据,直接写数据库
   236  // 每次写完,设置协程为空闲
   237  func (ctrl *InstanceCtrl) storeWorker(ctx context.Context, index int) {
   238  	log.Infof("[Batch] %s worker(%d) running in main loop", ctrl.label, index)
   239  	// store协程启动,先把自己注册到idle中
   240  	ctrl.idleStoreThread <- index
   241  	// 主循环
   242  	for {
   243  		select {
   244  		case futures := <-ctrl.storeThreadCh[index]:
   245  			if err := ctrl.instanceHandler(futures); err != nil {
   246  				// 所有的错误都在instanceHandler函数里面进行答复和处理,这里只需记录一条日志
   247  				log.Errorf("[Batch] %s instances err: %s", ctrl.label, err.Error())
   248  			}
   249  			ctrl.idleStoreThread <- index
   250  		case <-ctx.Done():
   251  			// idle is not ready
   252  			log.Infof("[Batch] %s worker(%d) exited", ctrl.label, index)
   253  			return
   254  		}
   255  	}
   256  }
   257  
   258  // registerHandler 外部应该把鉴权完成
   259  // 判断实例是否存在,也可以提前判断,减少batch复杂度
   260  // 提前通过token判断,再进入batch操作
   261  // batch操作,只是写操作
   262  func (ctrl *InstanceCtrl) registerHandler(futures []*InstanceFuture) error {
   263  	if len(futures) == 0 {
   264  		log.Warn("[Batch] futures is empty")
   265  		return nil
   266  	}
   267  
   268  	cur := time.Now()
   269  	taskLife := ctrl.taskLife
   270  	dropExpire := taskLife != 0
   271  
   272  	log.Infof("[Batch] Start batch creating instances count: %d", len(futures))
   273  	remains := make(map[string]*InstanceFuture, len(futures))
   274  	for i := range futures {
   275  		entry := futures[i]
   276  
   277  		if _, ok := remains[entry.request.GetId().GetValue()]; ok {
   278  			entry.Reply(cur, apimodel.Code_SameInstanceRequest, ErrorSameRegIsInstanceRequest)
   279  			continue
   280  		}
   281  
   282  		if dropExpire && entry.CanDrop() && entry.begin.Add(taskLife).Before(cur) {
   283  			entry.Reply(cur, apimodel.Code_InstanceRegisTimeout, ErrorRegIsInstanceTimeout)
   284  			continue
   285  		}
   286  
   287  		remains[entry.request.GetId().GetValue()] = entry
   288  	}
   289  
   290  	// 统一判断实例是否存在,存在则需要更新部分数据
   291  	if err := ctrl.batchRestoreInstanceIsolate(remains); err != nil {
   292  		log.Errorf("[Batch] batch check instances existed err: %s", err.Error())
   293  	}
   294  
   295  	// 判断入参数组是否为0
   296  	if len(remains) == 0 {
   297  		log.Info("[Batch] all instances is existed, return create instances process")
   298  		return nil
   299  	}
   300  	// 构造model数据
   301  	for _, entry := range remains {
   302  		ins := model.CreateInstanceModel(entry.serviceId, entry.request)
   303  		entry.SetInstance(ins)
   304  	}
   305  	// 调用batch接口,创建实例
   306  	instances := make([]*model.Instance, 0, len(remains))
   307  	for _, entry := range remains {
   308  		instances = append(instances, entry.instance)
   309  	}
   310  	if err := ctrl.storage.BatchAddInstances(instances); err != nil {
   311  		sendReply(remains, commonstore.StoreCode2APICode(err), err)
   312  		return err
   313  	}
   314  
   315  	sendReply(remains, apimodel.Code_ExecuteSuccess, nil)
   316  	return nil
   317  }
   318  
   319  // heartbeatHandler 心跳状态变更处理函数
   320  func (ctrl *InstanceCtrl) heartbeatHandler(futures []*InstanceFuture) error {
   321  	if len(futures) == 0 {
   322  		return nil
   323  	}
   324  	log.Infof("[Batch] start batch heartbeat instances count: %d", len(futures))
   325  	ids := make(map[string]bool, len(futures))
   326  	statusToIds := map[bool]map[string]int64{
   327  		true:  make(map[string]int64, len(futures)),
   328  		false: make(map[string]int64, len(futures)),
   329  	}
   330  	for _, entry := range futures {
   331  		// 多个记录,只有后面的一个生效
   332  		id := entry.request.GetId().GetValue()
   333  		if _, ok := ids[id]; ok {
   334  			values := statusToIds[!entry.healthy]
   335  			delete(values, id)
   336  		}
   337  		ids[id] = false
   338  		statusToIds[entry.healthy][id] = entry.lastHeartbeatTimeSec
   339  	}
   340  
   341  	// 转为不健康的实例,需要添加 metadata
   342  	appendMetaReqs := make([]*store.InstanceMetadataRequest, 0, len(statusToIds[false]))
   343  	// 转为健康的实例,需要删除 metadata
   344  	removeMetaReqs := make([]*store.InstanceMetadataRequest, 0, len(statusToIds[true]))
   345  	revision := utils.NewUUID()
   346  	for healthy, values := range statusToIds {
   347  		if len(values) == 0 {
   348  			continue
   349  		}
   350  		idValues := make([]interface{}, 0, len(values))
   351  		for id := range values {
   352  			if healthy {
   353  				removeMetaReqs = append(removeMetaReqs, &store.InstanceMetadataRequest{
   354  					InstanceID: id,
   355  					Revision:   revision,
   356  					Keys:       []string{model.MetadataInstanceLastHeartbeatTime},
   357  				})
   358  			} else {
   359  				appendMetaReqs = append(appendMetaReqs, &store.InstanceMetadataRequest{
   360  					InstanceID: id,
   361  					Revision:   revision,
   362  					Metadata: map[string]string{
   363  						model.MetadataInstanceLastHeartbeatTime: strconv.FormatInt(values[id], 10),
   364  					},
   365  				})
   366  			}
   367  			idValues = append(idValues, id)
   368  		}
   369  		err := ctrl.storage.BatchSetInstanceHealthStatus(idValues, model.StatusBoolToInt(healthy), utils.NewUUID())
   370  		if err != nil {
   371  			log.Errorf("[Batch] batch healthy check instances err: %s", err.Error())
   372  			sendReply(futures, commonstore.StoreCode2APICode(err), err)
   373  			return err
   374  		}
   375  		if err := ctrl.storage.BatchAppendInstanceMetadata(appendMetaReqs); err != nil {
   376  			log.Errorf("[Batch] batch healthy check instances append metadata err: %s", err.Error())
   377  			sendReply(futures, commonstore.StoreCode2APICode(err), err)
   378  			return err
   379  		}
   380  		if err := ctrl.storage.BatchRemoveInstanceMetadata(removeMetaReqs); err != nil {
   381  			log.Errorf("[Batch] batch healthy check instances remove metadata err: %s", err.Error())
   382  			sendReply(futures, commonstore.StoreCode2APICode(err), err)
   383  			return err
   384  		}
   385  	}
   386  	sendReply(futures, apimodel.Code_ExecuteSuccess, nil)
   387  	return nil
   388  }
   389  
   390  // deregisterHandler 反注册处理函数
   391  // 步骤:
   392  //   - 从数据库中批量读取实例ID对应的实例简要信息:
   393  //     包括:ID,host,port,serviceName,serviceNamespace,serviceToken
   394  //   - 对instance做存在与token的双重校验,较少与数据库的交互
   395  //   - 对于不存在的token,返回notFoundResource
   396  //   - 对于token校验失败的,返回校验失败
   397  //   - 调用批量接口删除实例
   398  func (ctrl *InstanceCtrl) deregisterHandler(futures []*InstanceFuture) error {
   399  	if len(futures) == 0 {
   400  		return nil
   401  	}
   402  
   403  	cur := time.Now()
   404  	log.Infof("[Batch] Start batch deregister instances count: %d", len(futures))
   405  	remains := make(map[string]*InstanceFuture, len(futures))
   406  	ids := make(map[string]bool, len(futures))
   407  	for _, entry := range futures {
   408  		if _, ok := remains[entry.request.GetId().GetValue()]; ok {
   409  			entry.Reply(cur, apimodel.Code_SameInstanceRequest, ErrorSameRegIsInstanceRequest)
   410  			continue
   411  		}
   412  
   413  		remains[entry.request.GetId().GetValue()] = entry
   414  		ids[entry.request.GetId().GetValue()] = false
   415  	}
   416  
   417  	// 统一鉴权与判断是否存在
   418  	instances, err := ctrl.storage.GetInstancesBrief(ids)
   419  	if err != nil {
   420  		log.Errorf("[Batch] get instances service token err: %s", err.Error())
   421  		sendReply(remains, commonstore.StoreCode2APICode(err), err)
   422  		return err
   423  	}
   424  	for _, future := range futures {
   425  		instance, ok := instances[future.request.GetId().GetValue()]
   426  		if !ok {
   427  			// 不存在,意味着不需要删除了
   428  			future.Reply(cur, apimodel.Code_NotFoundResource, fmt.Errorf("%s", api.Code2Info(api.NotFoundResource)))
   429  			delete(remains, future.request.GetId().GetValue())
   430  			continue
   431  		}
   432  
   433  		future.SetInstance(instance) // 这里保存instance的目的:方便上层使用model数据
   434  	}
   435  
   436  	if len(remains) == 0 {
   437  		log.Infof("[Batch] deregister instances verify failed or instances is not existed, no remain any instances")
   438  		return nil
   439  	}
   440  
   441  	// 调用storage batch接口,删除实例
   442  	args := make([]interface{}, 0, len(remains))
   443  	for _, entry := range remains {
   444  		args = append(args, entry.request.GetId().GetValue())
   445  	}
   446  	if err := ctrl.storage.BatchDeleteInstances(args); err != nil {
   447  		log.Errorf("[Batch] batch delete instances err: %s", err.Error())
   448  		sendReply(remains, commonstore.StoreCode2APICode(err), err)
   449  		return err
   450  	}
   451  
   452  	sendReply(remains, apimodel.Code_ExecuteSuccess, nil)
   453  	return nil
   454  }
   455  
   456  // batchRestoreInstanceIsolate 批量恢复实例的隔离状态,以请求为准,请求如果不存在,就以数据库为准
   457  func (ctrl *InstanceCtrl) batchRestoreInstanceIsolate(futures map[string]*InstanceFuture) error {
   458  	if len(futures) == 0 {
   459  		return nil
   460  	}
   461  
   462  	// 初始化所有的id都是不存在的
   463  	ids := make(map[string]bool, len(futures))
   464  	for _, entry := range futures {
   465  		ids[entry.request.GetId().GetValue()] = false
   466  	}
   467  	var id2Isolate map[string]bool
   468  	var err error
   469  	if id2Isolate, err = ctrl.storage.BatchGetInstanceIsolate(ids); err != nil {
   470  		log.Errorf("[Batch] check instances existed storage err: %s", err.Error())
   471  		sendReply(futures, commonstore.StoreCode2APICode(err), err)
   472  		return err
   473  	}
   474  
   475  	if len(id2Isolate) == 0 {
   476  		return nil
   477  	}
   478  
   479  	if len(id2Isolate) > 0 {
   480  		for id, isolate := range id2Isolate {
   481  			if future, ok := futures[id]; ok && future.request.Isolate == nil {
   482  				future.request.Isolate = &wrappers.BoolValue{Value: isolate}
   483  			}
   484  		}
   485  	}
   486  	return nil
   487  }