github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/aggregator/tools/deploy/planner.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package deploy
    22  
    23  import (
    24  	"fmt"
    25  	"sort"
    26  	"sync"
    27  
    28  	"github.com/m3db/m3/src/cluster/services"
    29  	"github.com/m3db/m3/src/x/errors"
    30  	xsync "github.com/m3db/m3/src/x/sync"
    31  )
    32  
var (
	// emptyPlan is the zero-value plan that GeneratePlan returns together
	// with a non-nil error.
	emptyPlan deploymentPlan
	// emptyStep is the zero-value step that GenerateOneStep returns together
	// with a non-nil error.
	emptyStep deploymentStep
)
    37  
// planner generates deployment plans for given instances under constraints.
//
// For both methods, toDeploy lists the instances that still need to be
// deployed and all lists every known instance, including those that do not
// need to be deployed.
type planner interface {
	// GeneratePlan generates a deployment plan for given target instances.
	// The returned plan is an ordered list of deployment steps.
	GeneratePlan(toDeploy, all instanceMetadatas) (deploymentPlan, error)

	// GenerateOneStep generates one deployment step for given target instances.
	// The returned step holds the targets selected for that single step.
	GenerateOneStep(toDeploy, all instanceMetadatas) (deploymentStep, error)
}
    46  
// deploymentPlanner is the planner implementation returned by newPlanner.
type deploymentPlanner struct {
	// leaderService is queried for the current leader of each shard set's
	// election key.
	leaderService    services.LeaderService
	// workers is the pool that the per-shard-set leader lookups are submitted to.
	workers          xsync.WorkerPool
	// electionKeyFmt is a format string that is combined with a shard set id
	// (via fmt.Sprintf) to produce the election key for that shard set.
	electionKeyFmt   string
	// maxStepSize caps the number of targets in a single step; zero means
	// the step size is not capped.
	maxStepSize      int
	// validatorFactory creates the validator attached to each deployment target.
	validatorFactory validatorFactory
}
    54  
    55  // newPlanner creates a new deployment planner.
    56  func newPlanner(client AggregatorClient, opts PlannerOptions) planner {
    57  	workers := opts.WorkerPool()
    58  	validatorFactory := newValidatorFactory(client, workers)
    59  	return deploymentPlanner{
    60  		leaderService:    opts.LeaderService(),
    61  		workers:          opts.WorkerPool(),
    62  		electionKeyFmt:   opts.ElectionKeyFmt(),
    63  		maxStepSize:      opts.MaxStepSize(),
    64  		validatorFactory: validatorFactory,
    65  	}
    66  }
    67  
    68  func (p deploymentPlanner) GeneratePlan(
    69  	toDeploy, all instanceMetadatas,
    70  ) (deploymentPlan, error) {
    71  	grouped, err := p.groupInstancesByShardSetID(toDeploy, all)
    72  	if err != nil {
    73  		return emptyPlan, fmt.Errorf("unable to group instances by shard set id: %v", err)
    74  	}
    75  	return p.generatePlan(grouped, len(toDeploy), p.maxStepSize), nil
    76  }
    77  
    78  func (p deploymentPlanner) GenerateOneStep(
    79  	toDeploy, all instanceMetadatas,
    80  ) (deploymentStep, error) {
    81  	grouped, err := p.groupInstancesByShardSetID(toDeploy, all)
    82  	if err != nil {
    83  		return emptyStep, fmt.Errorf("unable to group instances by shard set id: %v", err)
    84  	}
    85  	return p.generateStep(grouped, p.maxStepSize), nil
    86  }
    87  
    88  func (p deploymentPlanner) generatePlan(
    89  	instances map[uint32]*instanceGroup,
    90  	numInstances int,
    91  	maxStepSize int,
    92  ) deploymentPlan {
    93  	var (
    94  		step  deploymentStep
    95  		plan  deploymentPlan
    96  		total = numInstances
    97  	)
    98  	for total > 0 {
    99  		step = p.generateStep(instances, maxStepSize)
   100  		plan.Steps = append(plan.Steps, step)
   101  		total -= len(step.Targets)
   102  	}
   103  	return plan
   104  }
   105  
   106  func (p deploymentPlanner) generateStep(
   107  	instances map[uint32]*instanceGroup,
   108  	maxStepSize int,
   109  ) deploymentStep {
   110  	// NB(xichen): we always choose instances that are currently in the follower state first,
   111  	// unless there are no more follower instances, in which case we'll deploy the leader instances.
   112  	// This is to reduce the overall deployment time due to reduced number of leader promotions and
   113  	// as such we are less likely to need to wait for the follower instances to be ready to take over
   114  	// the leader role.
   115  	step := p.generateStepFromTargetType(instances, maxStepSize, followerTarget)
   116  
   117  	// If we have found some follower instances to deploy, we don't attempt to deploy leader
   118  	// instances in the same step even if we have not reached the max step size to avoid delaying
   119  	// deploying to the followers due to deploying leader instances.
   120  	if len(step.Targets) > 0 {
   121  		return step
   122  	}
   123  
   124  	// If we have not found any followers, we proceed to deploy leader instances.
   125  	return p.generateStepFromTargetType(instances, maxStepSize, leaderTarget)
   126  }
   127  
   128  func (p deploymentPlanner) generateStepFromTargetType(
   129  	instances map[uint32]*instanceGroup,
   130  	maxStepSize int,
   131  	targetType targetType,
   132  ) deploymentStep {
   133  	step := deploymentStep{Targets: make([]deploymentTarget, 0, maxStepSize)}
   134  	for shardSetID, group := range instances {
   135  		if len(group.ToDeploy) == 0 {
   136  			delete(instances, shardSetID)
   137  			continue
   138  		}
   139  
   140  		done := false
   141  		for i, instance := range group.ToDeploy {
   142  			if !matchTargetType(instance.PlacementInstanceID, group.LeaderID, targetType) {
   143  				continue
   144  			}
   145  			target := deploymentTarget{
   146  				Instance:  instance,
   147  				Validator: p.validatorFactory.ValidatorFor(instance, group, targetType),
   148  			}
   149  			step.Targets = append(step.Targets, target)
   150  			group.removeInstanceToDeploy(i)
   151  			if maxStepSize != 0 && len(step.Targets) >= maxStepSize {
   152  				done = true
   153  			}
   154  			break
   155  		}
   156  		if done {
   157  			break
   158  		}
   159  	}
   160  
   161  	// Sort targets by instance id for deterministic ordering.
   162  	sort.Sort(targetsByInstanceIDAsc(step.Targets))
   163  	return step
   164  }
   165  
// groupInstancesByShardSetID builds one instanceGroup per shard set id that
// has at least one instance in toDeploy.
//
// For every such group it fills in ToDeploy (from toDeploy), All (the
// instances in all that share the shard set id) and LeaderID (looked up via
// the leader service). It returns a nil map and a combined error if the leader
// of any group cannot be determined or is not one of the group's instances.
func (p deploymentPlanner) groupInstancesByShardSetID(
	toDeploy, all instanceMetadatas,
) (map[uint32]*instanceGroup, error) {
	grouped := make(map[uint32]*instanceGroup, len(toDeploy))

	// Group the instances to be deployed by shard set id.
	for _, instance := range toDeploy {
		shardSetID := instance.ShardSetID
		group, exists := grouped[shardSetID]
		if !exists {
			// First instance seen for this shard set: start a new group.
			group = &instanceGroup{
				ToDeploy: make(instanceMetadatas, 0, 2),
				All:      make(instanceMetadatas, 0, 2),
			}
		}
		group.ToDeploy = append(group.ToDeploy, instance)
		grouped[shardSetID] = group
	}

	// Determine the full set of instances in each group.
	for _, instance := range all {
		shardSetID := instance.ShardSetID
		group, exists := grouped[shardSetID]
		if !exists {
			// Shard sets with nothing to deploy are not tracked.
			continue
		}
		group.All = append(group.All, instance)
	}

	// Determine the leader of each group.
	var (
		wg    sync.WaitGroup
		// Buffered to one slot per group: each lookup sends at most one error,
		// so sends never block even though nothing reads until all are done.
		errCh = make(chan error, len(grouped))
	)
	for shardSetID, group := range grouped {
		// Copy the loop variables so the closure below captures this
		// iteration's values.
		shardSetID, group := shardSetID, group
		wg.Add(1)
		p.workers.Go(func() {
			defer wg.Done()

			electionKey := fmt.Sprintf(p.electionKeyFmt, shardSetID)
			leader, err := p.leaderService.Leader(electionKey)
			if err != nil {
				err = fmt.Errorf("unable to determine leader for shard set id %d: %v", shardSetID, err)
				errCh <- err
				return
			}
			// The reported leader must be one of the group's own instances;
			// each closure only ever writes to its own group.
			for _, instance := range group.All {
				if instance.PlacementInstanceID == leader {
					group.LeaderID = instance.PlacementInstanceID
					return
				}
			}
			err = fmt.Errorf("unknown leader %s for shard set id %d", leader, shardSetID)
			errCh <- err
		})
	}

	// Wait for every lookup to finish before closing the channel, so that no
	// send can hit a closed channel and the range below terminates.
	wg.Wait()
	close(errCh)
	multiErr := errors.NewMultiError()
	for err := range errCh {
		multiErr = multiErr.Add(err)
	}
	if err := multiErr.FinalError(); err != nil {
		return nil, err
	}
	return grouped, nil
}
   235  
// deploymentTarget is a deployment target: a single instance selected for
// deployment, paired with the validator created for it.
type deploymentTarget struct {
	// Instance is the metadata of the instance to deploy.
	Instance  instanceMetadata
	// Validator is the validator obtained from the planner's validator factory
	// for this instance. NOTE(review): presumably run before the instance is
	// deployed — confirm against the caller.
	Validator validator
}
   241  
   242  func (t deploymentTarget) String() string { return t.Instance.PlacementInstanceID }
   243  
   244  // deploymentTargets is a list of deployment targets.
   245  type deploymentTargets []deploymentTarget
   246  
   247  func (targets deploymentTargets) DeploymentInstanceIDs() []string {
   248  	deploymentInstanceIDs := make([]string, 0, len(targets))
   249  	for _, target := range targets {
   250  		deploymentInstanceIDs = append(deploymentInstanceIDs, target.Instance.DeploymentInstanceID)
   251  	}
   252  	return deploymentInstanceIDs
   253  }
   254  
   255  type targetsByInstanceIDAsc []deploymentTarget
   256  
   257  func (a targetsByInstanceIDAsc) Len() int      { return len(a) }
   258  func (a targetsByInstanceIDAsc) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
   259  
   260  func (a targetsByInstanceIDAsc) Less(i, j int) bool {
   261  	return a[i].Instance.PlacementInstanceID < a[j].Instance.PlacementInstanceID
   262  }
   263  
// deploymentStep is a deployment step: the set of targets selected together
// in one round of planning.
type deploymentStep struct {
	// Targets are the targets in this step; the planner sorts them by
	// ascending placement instance id.
	Targets deploymentTargets
}
   268  
// deploymentPlan is a deployment plan: an ordered list of deployment steps.
type deploymentPlan struct {
	// Steps are the steps of the plan, in the order they were generated.
	Steps []deploymentStep
}
   273  
   274  type targetType int
   275  
   276  const (
   277  	followerTarget targetType = iota
   278  	leaderTarget
   279  )
   280  
   281  func matchTargetType(
   282  	instanceID string,
   283  	leaderID string,
   284  	targetType targetType,
   285  ) bool {
   286  	if targetType == leaderTarget {
   287  		return instanceID == leaderID
   288  	}
   289  	return instanceID != leaderID
   290  }
   291  
// instanceGroup holds the planner's view of the instances that share one
// shard set id.
type instanceGroup struct {
	// LeaderID is the instance id of the leader in the group.
	LeaderID string

	// ToDeploy are the instances to be deployed in the group. Instances are
	// removed from this list as they are selected for a step.
	ToDeploy instanceMetadatas

	// All include all the instances in the group regardless of whether they need to be deployed.
	All instanceMetadatas
}
   302  
   303  func (group *instanceGroup) removeInstanceToDeploy(i int) {
   304  	lastIdx := len(group.ToDeploy) - 1
   305  	group.ToDeploy[i], group.ToDeploy[lastIdx] = group.ToDeploy[lastIdx], group.ToDeploy[i]
   306  	group.ToDeploy = group.ToDeploy[:lastIdx]
   307  }