github.com/m3db/m3@v1.5.0/src/cluster/placement/algo/sharded_helper.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package algo
    22  
    23  import (
    24  	"container/heap"
    25  	"errors"
    26  	"fmt"
    27  	"math"
    28  
    29  	"github.com/m3db/m3/src/cluster/placement"
    30  	"github.com/m3db/m3/src/cluster/shard"
    31  
    32  	"go.uber.org/zap"
    33  )
    34  
// Sentinel errors returned while validating an instance that is being added
// to a placement.
var (
	// errAddingInstanceAlreadyExist is returned by addInstanceToPlacement when
	// the placement already contains an instance with the same ID.
	errAddingInstanceAlreadyExist         = errors.New("the adding instance is already in the placement")
	// errInstanceContainsNonLeavingShards is returned by newAddInstanceHelper
	// when a leaving instance was required but the existing instance is not leaving.
	errInstanceContainsNonLeavingShards   = errors.New("the adding instance contains non leaving shards")
	// errInstanceContainsInitializingShards is returned by newAddInstanceHelper
	// when the existing instance owns shards that are neither Available nor Leaving.
	errInstanceContainsInitializingShards = errors.New("the adding instance contains initializing shards")
)
    40  
// instanceType selects which validation is applied to an instance that is
// being added to a placement.
type instanceType int

const (
	// anyType accepts the instance without inspecting its shards.
	anyType instanceType = iota
	// withShards only adds the instance to the placement when it owns at
	// least one shard (see addInstanceToPlacement).
	withShards
	// withLeavingShardsOnly requires an instance that already exists in the
	// placement to be leaving (see newAddInstanceHelper).
	withLeavingShardsOnly
	// withAvailableOrLeavingShardsOnly requires every shard on an instance
	// that already exists in the placement to be Available or Leaving
	// (see newAddInstanceHelper).
	withAvailableOrLeavingShardsOnly
)
    49  
// optimizeType selects how aggressively helper.optimize may move shards.
type optimizeType int

const (
	// safe optimizes the load distribution without violating
	// minimal shard movement. Only shards in the Unknown state are moved
	// (see assignLoadToInstanceSafe).
	safe optimizeType = iota
	// unsafe optimizes the load distribution with the potential of violating
	// minimal shard movement in order to reach best shard distribution.
	// Unknown, Initializing and Available shards may all be moved
	// (see assignLoadToInstanceUnsafe).
	unsafe
)
    60  
// assignLoadFn moves shards onto the given instance until it has taken on its
// share of load, returning an error if the attempt cannot be carried out.
type assignLoadFn func(instance placement.Instance) error
    62  
// placementHelper is the package-internal extension of PlacementHelper used by
// the sharded algorithm to mutate a placement.
type placementHelper interface {
	PlacementHelper

	// placeShards distributes the shards across the candidate instances, taking
	// into account the instance the shards are coming from (which may be nil).
	placeShards(shards []shard.Shard, from placement.Instance, candidates []placement.Instance) error

	// addInstance adds an instance to the placement.
	addInstance(addingInstance placement.Instance) error

	// optimize rebalances the load distribution in the cluster.
	optimize(t optimizeType) error

	// generatePlacement generates a placement.
	generatePlacement() placement.Placement

	// reclaimLeavingShards reclaims all the leaving shards on the given instance
	// by pulling them back from the rest of the cluster.
	reclaimLeavingShards(instance placement.Instance)

	// returnInitializingShards returns all the initializing shards on the given instance
	// by returning them back to the original owners.
	returnInitializingShards(instance placement.Instance)
}
    86  
// PlacementHelper helps the algorithm to place shards.
type PlacementHelper interface {
	// Instances returns the list of instances managed by the PlacementHelper.
	Instances() []placement.Instance

	// CanMoveShard checks if the shard can be moved from the instance to the target isolation group.
	// fromInstance may be nil when the shard has no current owner.
	CanMoveShard(shard uint32, fromInstance placement.Instance, toIsolationGroup string) bool
}
    95  
// helper is the placementHelper implementation. It indexes the instances of a
// placement and tracks the current and target shard load while shards are moved.
type helper struct {
	// targetLoad maps instance ID to the number of shards the instance should
	// own; populated by buildTargetLoad for non-leaving instances only.
	targetLoad          map[string]int
	// shardToInstanceMap maps shard ID to the set of instances the helper
	// currently counts as owners of that shard.
	shardToInstanceMap  map[uint32]map[placement.Instance]struct{}
	// groupToInstancesMap maps isolation group to all instances in that group,
	// leaving instances included.
	groupToInstancesMap map[string]map[placement.Instance]struct{}
	// groupToWeightMap maps isolation group to the summed weight of its
	// non-leaving instances.
	groupToWeightMap    map[string]uint32
	// rf is the replica factor the helper is working towards.
	rf                  int
	// uniqueShards holds the shard IDs of the placement.
	uniqueShards        []uint32
	// instances maps instance ID to instance.
	instances           map[string]placement.Instance
	// log is the logger taken from the instrument options.
	log                 *zap.Logger
	// opts are the placement options the helper was created with.
	opts                placement.Options
	// totalWeight is the summed weight of all non-leaving instances.
	totalWeight         uint32
	// maxShardSetID is the max shard set ID read from the source placement.
	maxShardSetID       uint32
}
   109  
   110  // NewPlacementHelper returns a placement helper
   111  func NewPlacementHelper(p placement.Placement, opts placement.Options) PlacementHelper {
   112  	return newHelper(p, p.ReplicaFactor(), opts)
   113  }
   114  
   115  func newInitHelper(instances []placement.Instance, ids []uint32, opts placement.Options) placementHelper {
   116  	emptyPlacement := placement.NewPlacement().
   117  		SetInstances(instances).
   118  		SetShards(ids).
   119  		SetReplicaFactor(0).
   120  		SetIsSharded(true).
   121  		SetCutoverNanos(opts.PlacementCutoverNanosFn()())
   122  	return newHelper(emptyPlacement, emptyPlacement.ReplicaFactor()+1, opts)
   123  }
   124  
   125  func newAddReplicaHelper(p placement.Placement, opts placement.Options) placementHelper {
   126  	return newHelper(p, p.ReplicaFactor()+1, opts)
   127  }
   128  
   129  func newAddInstanceHelper(
   130  	p placement.Placement,
   131  	instance placement.Instance,
   132  	opts placement.Options,
   133  	t instanceType,
   134  ) (placementHelper, placement.Instance, error) {
   135  	instanceInPlacement, exist := p.Instance(instance.ID())
   136  	if !exist {
   137  		return newHelper(p.SetInstances(append(p.Instances(), instance)), p.ReplicaFactor(), opts), instance, nil
   138  	}
   139  
   140  	switch t {
   141  	case withLeavingShardsOnly:
   142  		if !instanceInPlacement.IsLeaving() {
   143  			return nil, nil, errInstanceContainsNonLeavingShards
   144  		}
   145  	case withAvailableOrLeavingShardsOnly:
   146  		shards := instanceInPlacement.Shards()
   147  		if shards.NumShards() != shards.NumShardsForState(shard.Available)+shards.NumShardsForState(shard.Leaving) {
   148  			return nil, nil, errInstanceContainsInitializingShards
   149  		}
   150  	default:
   151  		return nil, nil, fmt.Errorf("unexpected type %v", t)
   152  	}
   153  
   154  	return newHelper(p, p.ReplicaFactor(), opts), instanceInPlacement, nil
   155  }
   156  
   157  func newRemoveInstanceHelper(
   158  	p placement.Placement,
   159  	instanceID string,
   160  	opts placement.Options,
   161  ) (placementHelper, placement.Instance, error) {
   162  	p, leavingInstance, err := removeInstanceFromPlacement(p, instanceID)
   163  	if err != nil {
   164  		return nil, nil, err
   165  	}
   166  	return newHelper(p, p.ReplicaFactor(), opts), leavingInstance, nil
   167  }
   168  
   169  func newReplaceInstanceHelper(
   170  	p placement.Placement,
   171  	instanceIDs []string,
   172  	addingInstances []placement.Instance,
   173  	opts placement.Options,
   174  ) (placementHelper, []placement.Instance, []placement.Instance, error) {
   175  	var (
   176  		leavingInstances = make([]placement.Instance, len(instanceIDs))
   177  		err              error
   178  	)
   179  	for i, instanceID := range instanceIDs {
   180  		p, leavingInstances[i], err = removeInstanceFromPlacement(p, instanceID)
   181  		if err != nil {
   182  			return nil, nil, nil, err
   183  		}
   184  	}
   185  
   186  	newAddingInstances := make([]placement.Instance, len(addingInstances))
   187  	for i, instance := range addingInstances {
   188  		p, newAddingInstances[i], err = addInstanceToPlacement(p, instance, anyType)
   189  		if err != nil {
   190  			return nil, nil, nil, err
   191  		}
   192  	}
   193  	return newHelper(p, p.ReplicaFactor(), opts), leavingInstances, newAddingInstances, nil
   194  }
   195  
   196  func newHelper(p placement.Placement, targetRF int, opts placement.Options) placementHelper {
   197  	ph := &helper{
   198  		rf:            targetRF,
   199  		instances:     make(map[string]placement.Instance, p.NumInstances()),
   200  		uniqueShards:  p.Shards(),
   201  		maxShardSetID: p.MaxShardSetID(),
   202  		log:           opts.InstrumentOptions().Logger(),
   203  		opts:          opts,
   204  	}
   205  
   206  	for _, instance := range p.Instances() {
   207  		ph.instances[instance.ID()] = instance
   208  	}
   209  
   210  	ph.scanCurrentLoad()
   211  	ph.buildTargetLoad()
   212  	return ph
   213  }
   214  
   215  func (ph *helper) scanCurrentLoad() {
   216  	ph.shardToInstanceMap = make(map[uint32]map[placement.Instance]struct{}, len(ph.uniqueShards))
   217  	ph.groupToInstancesMap = make(map[string]map[placement.Instance]struct{})
   218  	ph.groupToWeightMap = make(map[string]uint32)
   219  	totalWeight := uint32(0)
   220  	for _, instance := range ph.instances {
   221  		if _, exist := ph.groupToInstancesMap[instance.IsolationGroup()]; !exist {
   222  			ph.groupToInstancesMap[instance.IsolationGroup()] = make(map[placement.Instance]struct{})
   223  		}
   224  		ph.groupToInstancesMap[instance.IsolationGroup()][instance] = struct{}{}
   225  
   226  		if instance.IsLeaving() {
   227  			// Leaving instances are not counted as usable capacities in the placement.
   228  			continue
   229  		}
   230  
   231  		ph.groupToWeightMap[instance.IsolationGroup()] = ph.groupToWeightMap[instance.IsolationGroup()] + instance.Weight()
   232  		totalWeight += instance.Weight()
   233  
   234  		for _, s := range instance.Shards().All() {
   235  			if s.State() == shard.Leaving {
   236  				continue
   237  			}
   238  			ph.assignShardToInstance(s, instance)
   239  		}
   240  	}
   241  	ph.totalWeight = totalWeight
   242  }
   243  
   244  func (ph *helper) buildTargetLoad() {
   245  	overWeightedGroups := 0
   246  	overWeight := uint32(0)
   247  	for _, weight := range ph.groupToWeightMap {
   248  		if isOverWeighted(weight, ph.totalWeight, ph.rf) {
   249  			overWeightedGroups++
   250  			overWeight += weight
   251  		}
   252  	}
   253  
   254  	targetLoad := make(map[string]int, len(ph.instances))
   255  	for _, instance := range ph.instances {
   256  		if instance.IsLeaving() {
   257  			// We should not set a target load for leaving instances.
   258  			continue
   259  		}
   260  		igWeight := ph.groupToWeightMap[instance.IsolationGroup()]
   261  		if isOverWeighted(igWeight, ph.totalWeight, ph.rf) {
   262  			// If the instance is on a over-sized isolation group, the target load
   263  			// equals (shardLen / capacity of the isolation group).
   264  			targetLoad[instance.ID()] = int(math.Ceil(float64(ph.getShardLen()) * float64(instance.Weight()) / float64(igWeight)))
   265  		} else {
   266  			// If the instance is on a normal isolation group, get the target load
   267  			// with aware of other over-sized isolation group.
   268  			targetLoad[instance.ID()] = ph.getShardLen() * (ph.rf - overWeightedGroups) * int(instance.Weight()) / int(ph.totalWeight-overWeight)
   269  		}
   270  	}
   271  	ph.targetLoad = targetLoad
   272  }
   273  
   274  func (ph *helper) Instances() []placement.Instance {
   275  	res := make([]placement.Instance, 0, len(ph.instances))
   276  	for _, instance := range ph.instances {
   277  		res = append(res, instance)
   278  	}
   279  	return res
   280  }
   281  
   282  func (ph *helper) getShardLen() int {
   283  	return len(ph.uniqueShards)
   284  }
   285  
   286  func (ph *helper) targetLoadForInstance(id string) int {
   287  	return ph.targetLoad[id]
   288  }
   289  
   290  func (ph *helper) moveOneShard(from, to placement.Instance) bool {
   291  	// The order matter here:
   292  	// The Unknown shards were just moved, so free to be moved around.
   293  	// The Initializing shards were still being initialized on the instance,
   294  	// so moving them are cheaper than moving those Available shards.
   295  	return ph.moveOneShardInState(from, to, shard.Unknown) ||
   296  		ph.moveOneShardInState(from, to, shard.Initializing) ||
   297  		ph.moveOneShardInState(from, to, shard.Available)
   298  }
   299  
   300  // nolint: unparam
   301  func (ph *helper) moveOneShardInState(from, to placement.Instance, state shard.State) bool {
   302  	for _, s := range from.Shards().ShardsForState(state) {
   303  		if ph.moveShard(s, from, to) {
   304  			return true
   305  		}
   306  	}
   307  	return false
   308  }
   309  
// moveShard tries to hand candidateShard over from the instance `from` (which
// may be nil when the shard has no current owner) to the instance `to`.
// It returns false, without changing anything, when canAssignInstance rejects
// the move or when the candidate shard is in the Leaving state; otherwise it
// updates both instances and the helper's ownership index and returns true.
func (ph *helper) moveShard(candidateShard shard.Shard, from, to placement.Instance) bool {
	shardID := candidateShard.ID()
	if !ph.canAssignInstance(shardID, from, to) {
		return false
	}

	if candidateShard.State() == shard.Leaving {
		// should not move a Leaving shard,
		// Leaving shard will be removed when the Initializing shard is marked as Available
		return false
	}

	// The shard that will be placed on `to`; its state is whatever
	// shard.NewShard defaults to unless it is replaced below.
	newShard := shard.NewShard(shardID)

	if from != nil {
		switch candidateShard.State() {
		case shard.Unknown, shard.Initializing:
			// The shard never became Available on `from`, so drop it there and
			// keep pointing at the original source.
			from.Shards().Remove(shardID)
			newShard.SetSourceID(candidateShard.SourceID())
		case shard.Available:
			// `from` keeps the shard as Leaving and becomes the source of the
			// new copy.
			candidateShard.
				SetState(shard.Leaving).
				SetCutoffNanos(ph.opts.ShardCutoffNanosFn()())
			newShard.SetSourceID(from.ID())
		}

		// `from` no longer counts as an owner of this shard.
		delete(ph.shardToInstanceMap[shardID], from)
	}

	curShard, ok := to.Shards().Shard(shardID)
	if ok && curShard.State() == shard.Leaving {
		// NB(cw): if the instance already owns the shard in Leaving state,
		// simply mark it as Available
		newShard = shard.NewShard(shardID).SetState(shard.Available)
		// NB(cw): Break the link between new owner of this shard with this Leaving instance
		instances := ph.shardToInstanceMap[shardID]
		for instance := range instances {
			shards := instance.Shards()
			initShard, ok := shards.Shard(shardID)
			if ok && initShard.SourceID() == to.ID() {
				initShard.SetSourceID("")
			}
		}

	}

	ph.assignShardToInstance(newShard, to)
	return true
}
   359  
   360  func (ph *helper) CanMoveShard(shard uint32, from placement.Instance, toIsolationGroup string) bool {
   361  	if from != nil {
   362  		if from.IsolationGroup() == toIsolationGroup {
   363  			return true
   364  		}
   365  	}
   366  	for instance := range ph.shardToInstanceMap[shard] {
   367  		if instance.IsolationGroup() == toIsolationGroup {
   368  			return false
   369  		}
   370  	}
   371  	return true
   372  }
   373  
   374  func (ph *helper) buildInstanceHeap(instances []placement.Instance, availableCapacityAscending bool) (heap.Interface, error) {
   375  	return newHeap(instances, availableCapacityAscending, ph.targetLoad, ph.groupToWeightMap)
   376  }
   377  
   378  func (ph *helper) generatePlacement() placement.Placement {
   379  	var instances = make([]placement.Instance, 0, len(ph.instances))
   380  
   381  	for _, instance := range ph.instances {
   382  		if instance.Shards().NumShards() > 0 {
   383  			instances = append(instances, instance)
   384  		}
   385  	}
   386  
   387  	maxShardSetID := ph.maxShardSetID
   388  	for _, instance := range instances {
   389  		shards := instance.Shards()
   390  		for _, s := range shards.ShardsForState(shard.Unknown) {
   391  			shards.Add(shard.NewShard(s.ID()).
   392  				SetSourceID(s.SourceID()).
   393  				SetState(shard.Initializing).
   394  				SetCutoverNanos(ph.opts.ShardCutoverNanosFn()()))
   395  		}
   396  		if shardSetID := instance.ShardSetID(); shardSetID >= maxShardSetID {
   397  			maxShardSetID = shardSetID
   398  		}
   399  	}
   400  
   401  	return placement.NewPlacement().
   402  		SetInstances(instances).
   403  		SetShards(ph.uniqueShards).
   404  		SetReplicaFactor(ph.rf).
   405  		SetIsSharded(true).
   406  		SetIsMirrored(ph.opts.IsMirrored()).
   407  		SetCutoverNanos(ph.opts.PlacementCutoverNanosFn()()).
   408  		SetMaxShardSetID(maxShardSetID)
   409  }
   410  
   411  func (ph *helper) placeShards(
   412  	shards []shard.Shard,
   413  	from placement.Instance,
   414  	candidates []placement.Instance,
   415  ) error {
   416  	shardSet := getShardMap(shards)
   417  	if from != nil {
   418  		// NB(cw) when removing an adding instance that has not finished bootstrapping its
   419  		// Initializing shards, prefer to return those Initializing shards back to the leaving instance
   420  		// to reduce some bootstrapping work in the cluster.
   421  		ph.returnInitializingShardsToSource(shardSet, from, candidates)
   422  	}
   423  
   424  	instanceHeap, err := ph.buildInstanceHeap(nonLeavingInstances(candidates), true)
   425  	if err != nil {
   426  		return err
   427  	}
   428  	// if there are shards left to be assigned, distribute them evenly
   429  	var triedInstances []placement.Instance
   430  	for _, s := range shardSet {
   431  		if s.State() == shard.Leaving {
   432  			continue
   433  		}
   434  		moved := false
   435  		for instanceHeap.Len() > 0 {
   436  			tryInstance := heap.Pop(instanceHeap).(placement.Instance)
   437  			triedInstances = append(triedInstances, tryInstance)
   438  			if ph.moveShard(s, from, tryInstance) {
   439  				moved = true
   440  				break
   441  			}
   442  		}
   443  		if !moved {
   444  			// This should only happen when RF > number of isolation groups.
   445  			return errNotEnoughIsolationGroups
   446  		}
   447  		for _, triedInstance := range triedInstances {
   448  			heap.Push(instanceHeap, triedInstance)
   449  		}
   450  		triedInstances = triedInstances[:0]
   451  	}
   452  	return nil
   453  }
   454  
   455  func (ph *helper) returnInitializingShards(instance placement.Instance) {
   456  	shardSet := getShardMap(instance.Shards().All())
   457  	ph.returnInitializingShardsToSource(shardSet, instance, ph.Instances())
   458  }
   459  
   460  func (ph *helper) returnInitializingShardsToSource(
   461  	shardSet map[uint32]shard.Shard,
   462  	from placement.Instance,
   463  	candidates []placement.Instance,
   464  ) {
   465  	candidateMap := make(map[string]placement.Instance, len(candidates))
   466  	for _, candidate := range candidates {
   467  		candidateMap[candidate.ID()] = candidate
   468  	}
   469  	for _, s := range shardSet {
   470  		if s.State() != shard.Initializing {
   471  			continue
   472  		}
   473  		sourceID := s.SourceID()
   474  		if sourceID == "" {
   475  			continue
   476  		}
   477  		sourceInstance, ok := candidateMap[sourceID]
   478  		if !ok {
   479  			// NB(cw): This is not an error because the candidates are not
   480  			// necessarily all the instances in the placement.
   481  			continue
   482  		}
   483  		if sourceInstance.IsLeaving() {
   484  			continue
   485  		}
   486  		if ph.moveShard(s, from, sourceInstance) {
   487  			delete(shardSet, s.ID())
   488  		}
   489  	}
   490  }
   491  
   492  func (ph *helper) mostUnderLoadedInstance() (placement.Instance, bool) {
   493  	var (
   494  		res              placement.Instance
   495  		maxLoadGap       int
   496  		totalLoadSurplus int
   497  	)
   498  
   499  	for id, instance := range ph.instances {
   500  		loadGap := ph.targetLoad[id] - loadOnInstance(instance)
   501  		if loadGap > maxLoadGap {
   502  			maxLoadGap = loadGap
   503  			res = instance
   504  		}
   505  		if loadGap == maxLoadGap && res != nil && res.ID() > id {
   506  			res = instance
   507  		}
   508  		if loadGap < 0 {
   509  			totalLoadSurplus -= loadGap
   510  		}
   511  	}
   512  	if maxLoadGap > 0 && totalLoadSurplus != 0 {
   513  		return res, true
   514  	}
   515  	return nil, false
   516  }
   517  
   518  func (ph *helper) optimize(t optimizeType) error {
   519  	var fn assignLoadFn
   520  	switch t {
   521  	case safe:
   522  		fn = ph.assignLoadToInstanceSafe
   523  	case unsafe:
   524  		fn = ph.assignLoadToInstanceUnsafe
   525  	}
   526  	uniq := make(map[string]struct{}, len(ph.instances))
   527  	for {
   528  		ins, ok := ph.mostUnderLoadedInstance()
   529  		if !ok {
   530  			return nil
   531  		}
   532  		if _, exist := uniq[ins.ID()]; exist {
   533  			return nil
   534  		}
   535  
   536  		uniq[ins.ID()] = struct{}{}
   537  		if err := fn(ins); err != nil {
   538  			return err
   539  		}
   540  	}
   541  }
   542  
   543  func (ph *helper) assignLoadToInstanceSafe(addingInstance placement.Instance) error {
   544  	return ph.assignTargetLoad(addingInstance, func(from, to placement.Instance) bool {
   545  		return ph.moveOneShardInState(from, to, shard.Unknown)
   546  	})
   547  }
   548  
   549  func (ph *helper) assignLoadToInstanceUnsafe(addingInstance placement.Instance) error {
   550  	return ph.assignTargetLoad(addingInstance, func(from, to placement.Instance) bool {
   551  		return ph.moveOneShard(from, to)
   552  	})
   553  }
   554  
   555  func (ph *helper) reclaimLeavingShards(instance placement.Instance) {
   556  	if instance.Shards().NumShardsForState(shard.Leaving) == 0 {
   557  		// Shortcut if there is nothing to be reclaimed.
   558  		return
   559  	}
   560  	id := instance.ID()
   561  	for _, i := range ph.instances {
   562  		for _, s := range i.Shards().ShardsForState(shard.Initializing) {
   563  			if s.SourceID() == id {
   564  				// NB(cw) in very rare case, the leaving shards could not be taken back.
   565  				// For example: in a RF=2 case, instance a and b on ig1, instance c on ig2,
   566  				// c took shard1 from instance a, before we tried to assign shard1 back to instance a,
   567  				// b got assigned shard1, now if we try to add instance a back to the topology, a can
   568  				// no longer take shard1 back.
   569  				// But it's fine, the algo will fil up those load with other shards from the cluster
   570  				ph.moveShard(s, i, instance)
   571  			}
   572  		}
   573  	}
   574  }
   575  
   576  func (ph *helper) addInstance(addingInstance placement.Instance) error {
   577  	ph.reclaimLeavingShards(addingInstance)
   578  	return ph.assignLoadToInstanceUnsafe(addingInstance)
   579  }
   580  
   581  func (ph *helper) assignTargetLoad(
   582  	targetInstance placement.Instance,
   583  	moveOneShardFn func(from, to placement.Instance) bool,
   584  ) error {
   585  	targetLoad := ph.targetLoadForInstance(targetInstance.ID())
   586  	// try to take shards from the most loaded instances until the adding instance reaches target load
   587  	instanceHeap, err := ph.buildInstanceHeap(nonLeavingInstances(ph.Instances()), false)
   588  	if err != nil {
   589  		return err
   590  	}
   591  	for targetInstance.Shards().NumShards() < targetLoad && instanceHeap.Len() > 0 {
   592  		fromInstance := heap.Pop(instanceHeap).(placement.Instance)
   593  		if moved := moveOneShardFn(fromInstance, targetInstance); moved {
   594  			heap.Push(instanceHeap, fromInstance)
   595  		}
   596  	}
   597  	return nil
   598  }
   599  
   600  func (ph *helper) canAssignInstance(shardID uint32, from, to placement.Instance) bool {
   601  	s, ok := to.Shards().Shard(shardID)
   602  	if ok && s.State() != shard.Leaving {
   603  		// NB(cw): a Leaving shard is not counted to the load of the instance
   604  		// so the instance should be able to take the ownership back if needed
   605  		// assuming i1 owns shard 1 as Available, this case can be triggered by:
   606  		// 1: add i2, now shard 1 is "Leaving" on i1 and "Initializing" on i2
   607  		// 2: remove i2, now i2 needs to return shard 1 back to i1
   608  		// and i1 should be able to take it and mark it as "Available"
   609  		return false
   610  	}
   611  	return ph.CanMoveShard(shardID, from, to.IsolationGroup())
   612  }
   613  
   614  func (ph *helper) assignShardToInstance(s shard.Shard, to placement.Instance) {
   615  	to.Shards().Add(s)
   616  
   617  	if _, exist := ph.shardToInstanceMap[s.ID()]; !exist {
   618  		ph.shardToInstanceMap[s.ID()] = make(map[placement.Instance]struct{})
   619  	}
   620  	ph.shardToInstanceMap[s.ID()][to] = struct{}{}
   621  }
   622  
// instanceHeap provides an easy way to get best candidate instance to assign/steal a shard.
// Its methods implement the container/heap interface; see Less for the ordering.
type instanceHeap struct {
	// instances is the heap's backing slice.
	instances         []placement.Instance
	// igToWeightMap maps isolation group to its weight.
	igToWeightMap     map[string]uint32
	// targetLoad maps instance ID to the target number of shards.
	targetLoad        map[string]int
	// capacityAscending selects the direction of the capacity comparison in Less.
	capacityAscending bool
}
   630  
   631  func newHeap(
   632  	instances []placement.Instance,
   633  	capacityAscending bool,
   634  	targetLoad map[string]int,
   635  	igToWeightMap map[string]uint32,
   636  ) (*instanceHeap, error) {
   637  	h := &instanceHeap{
   638  		capacityAscending: capacityAscending,
   639  		instances:         instances,
   640  		targetLoad:        targetLoad,
   641  		igToWeightMap:     igToWeightMap,
   642  	}
   643  	heap.Init(h)
   644  	return h, nil
   645  }
   646  
   647  func (h *instanceHeap) targetLoadForInstance(id string) int {
   648  	return h.targetLoad[id]
   649  }
   650  
   651  func (h *instanceHeap) Len() int {
   652  	return len(h.instances)
   653  }
   654  
   655  func (h *instanceHeap) Less(i, j int) bool {
   656  	instanceI := h.instances[i]
   657  	instanceJ := h.instances[j]
   658  	leftLoadOnI := h.targetLoadForInstance(instanceI.ID()) - loadOnInstance(instanceI)
   659  	leftLoadOnJ := h.targetLoadForInstance(instanceJ.ID()) - loadOnInstance(instanceJ)
   660  	// If both instance has tokens to be filled, prefer the one from bigger isolation group
   661  	// since it tends to be more picky in accepting shards
   662  	if leftLoadOnI > 0 && leftLoadOnJ > 0 && instanceI.IsolationGroup() != instanceJ.IsolationGroup() {
   663  		var (
   664  			igWeightI = h.igToWeightMap[instanceI.IsolationGroup()]
   665  			igWeightJ = h.igToWeightMap[instanceJ.IsolationGroup()]
   666  		)
   667  		if igWeightI != igWeightJ {
   668  			return igWeightI > igWeightJ
   669  		}
   670  	}
   671  	// compare left capacity on both instances
   672  	if leftLoadOnI == leftLoadOnJ {
   673  		return instanceI.ID() < instanceJ.ID()
   674  	}
   675  	if h.capacityAscending {
   676  		return leftLoadOnI > leftLoadOnJ
   677  	}
   678  	return leftLoadOnI < leftLoadOnJ
   679  }
   680  
   681  func (h instanceHeap) Swap(i, j int) {
   682  	h.instances[i], h.instances[j] = h.instances[j], h.instances[i]
   683  }
   684  
   685  func (h *instanceHeap) Push(i interface{}) {
   686  	instance := i.(placement.Instance)
   687  	h.instances = append(h.instances, instance)
   688  }
   689  
   690  func (h *instanceHeap) Pop() interface{} {
   691  	n := len(h.instances)
   692  	instance := h.instances[n-1]
   693  	h.instances = h.instances[0 : n-1]
   694  	return instance
   695  }
   696  
   697  func isOverWeighted(igWeight, totalWeight uint32, rf int) bool {
   698  	return float64(igWeight)/float64(totalWeight) >= 1.0/float64(rf)
   699  }
   700  
   701  func addInstanceToPlacement(
   702  	p placement.Placement,
   703  	i placement.Instance,
   704  	t instanceType,
   705  ) (placement.Placement, placement.Instance, error) {
   706  	if _, exist := p.Instance(i.ID()); exist {
   707  		return nil, nil, errAddingInstanceAlreadyExist
   708  	}
   709  
   710  	switch t {
   711  	case anyType:
   712  	case withShards:
   713  		if i.Shards().NumShards() == 0 {
   714  			return p, i, nil
   715  		}
   716  	default:
   717  		return nil, nil, fmt.Errorf("unexpected type %v", t)
   718  	}
   719  
   720  	instance := i.Clone()
   721  	return p.SetInstances(append(p.Instances(), instance)), instance, nil
   722  }
   723  
   724  func removeInstanceFromPlacement(p placement.Placement, id string) (placement.Placement, placement.Instance, error) {
   725  	leavingInstance, exist := p.Instance(id)
   726  	if !exist {
   727  		return nil, nil, fmt.Errorf("instance %s does not exist in placement", id)
   728  	}
   729  	return p.SetInstances(removeInstanceFromList(p.Instances(), id)), leavingInstance, nil
   730  }
   731  
   732  func getShardMap(shards []shard.Shard) map[uint32]shard.Shard {
   733  	r := make(map[uint32]shard.Shard, len(shards))
   734  
   735  	for _, s := range shards {
   736  		r[s.ID()] = s
   737  	}
   738  	return r
   739  }
   740  
   741  func loadOnInstance(instance placement.Instance) int {
   742  	return instance.Shards().NumShards() - instance.Shards().NumShardsForState(shard.Leaving)
   743  }
   744  
   745  func nonLeavingInstances(instances []placement.Instance) []placement.Instance {
   746  	r := make([]placement.Instance, 0, len(instances))
   747  	for _, instance := range instances {
   748  		if instance.IsLeaving() {
   749  			continue
   750  		}
   751  		r = append(r, instance)
   752  	}
   753  
   754  	return r
   755  }
   756  
   757  func newShards(shardIDs []uint32) []shard.Shard {
   758  	r := make([]shard.Shard, len(shardIDs))
   759  	for i, id := range shardIDs {
   760  		r[i] = shard.NewShard(id).SetState(shard.Unknown)
   761  	}
   762  	return r
   763  }
   764  
   765  func removeInstanceFromList(instances []placement.Instance, instanceID string) []placement.Instance {
   766  	for i, instance := range instances {
   767  		if instance.ID() == instanceID {
   768  			last := len(instances) - 1
   769  			instances[i] = instances[last]
   770  			return instances[:last]
   771  		}
   772  	}
   773  	return instances
   774  }
   775  
// markShardsAvailable marks the given Initializing shards on the instance as
// Available and cleans up the matching Leaving shards on their source
// instances. It returns the updated placement.
//
// For every shard ID it requires that the shard exists on the instance, is in
// the Initializing state and passes opts.IsShardCutoverFn (when set). If the
// shard has a source instance, that instance must exist in the placement and
// hold the shard in the Leaving state, which must pass opts.IsShardCutoffFn
// (when set); the Leaving shard is then removed, and the source instance is
// dropped from the placement once it owns no shards at all.
//
// Any failed check returns a nil placement and an error. Shards processed
// before the failure have already been modified on the instance objects.
func markShardsAvailable(p placement.Placement, instanceID string, shardIDs []uint32, opts placement.Options) (placement.Placement, error) {
	instance, exist := p.Instance(instanceID)
	if !exist {
		return nil, fmt.Errorf("instance %s does not exist in placement", instanceID)
	}

	shards := instance.Shards()
	for _, shardID := range shardIDs {
		s, exist := shards.Shard(shardID)
		if !exist {
			return nil, fmt.Errorf("shard %d does not exist in instance %s", shardID, instanceID)
		}

		if s.State() != shard.Initializing {
			return nil, fmt.Errorf("could not mark shard %d as available, it's not in Initializing state", s.ID())
		}

		// Optional user-supplied check that the shard has been cut over.
		isCutoverFn := opts.IsShardCutoverFn()
		if isCutoverFn != nil {
			if err := isCutoverFn(s); err != nil {
				return nil, err
			}
		}

		p = p.SetCutoverNanos(opts.PlacementCutoverNanosFn()())
		// Remember the source before the shard is replaced by a fresh
		// Available shard, which carries no source ID.
		sourceID := s.SourceID()
		shards.Add(shard.NewShard(shardID).SetState(shard.Available))

		// There could be no source for cases like initial placement.
		if sourceID == "" {
			continue
		}

		sourceInstance, exist := p.Instance(sourceID)
		if !exist {
			return nil, fmt.Errorf("source instance %s for shard %d does not exist in placement", sourceID, shardID)
		}

		sourceShards := sourceInstance.Shards()
		leavingShard, exist := sourceShards.Shard(shardID)
		if !exist {
			return nil, fmt.Errorf("shard %d does not exist in source instance %s", shardID, sourceID)
		}

		if leavingShard.State() != shard.Leaving {
			return nil, fmt.Errorf("shard %d is not leaving instance %s", shardID, sourceID)
		}

		// Optional user-supplied check that the leaving shard has been cut off.
		isCutoffFn := opts.IsShardCutoffFn()
		if isCutoffFn != nil {
			if err := isCutoffFn(leavingShard); err != nil {
				return nil, err
			}
		}

		// The source no longer needs its Leaving copy; an instance left with
		// no shards is removed from the placement.
		sourceShards.Remove(shardID)
		if sourceShards.NumShards() == 0 {
			p = p.SetInstances(removeInstanceFromList(p.Instances(), sourceInstance.ID()))
		}
	}

	return p, nil
}
   839  
   840  // tryCleanupShardState cleans up the shard states if the user only
   841  // wants to keep stable shard state in the placement.
   842  func tryCleanupShardState(
   843  	p placement.Placement,
   844  	opts placement.Options,
   845  ) (placement.Placement, error) {
   846  	if opts.ShardStateMode() == placement.StableShardStateOnly {
   847  		p, _, err := markAllShardsAvailable(
   848  			p,
   849  			opts.SetIsShardCutoverFn(nil).SetIsShardCutoffFn(nil),
   850  		)
   851  		return p, err
   852  	}
   853  	return p, nil
   854  }
   855  
// markAllShardsAvailable marks every Initializing shard in the placement as
// Available, working on a clone so the placement passed in is not modified.
// It returns the updated clone and whether any shard was marked. On error it
// returns a nil placement, false and the error from markShardsAvailable.
func markAllShardsAvailable(
	p placement.Placement,
	opts placement.Options,
) (placement.Placement, bool, error) {
	var (
		err     error
		updated = false
	)
	p = p.Clone()
	// NOTE: p is reassigned inside the loop, but the range expression was
	// evaluated once up front, so iteration continues over the instance list
	// obtained before any shard was marked.
	for _, instance := range p.Instances() {
		for _, s := range instance.Shards().All() {
			if s.State() == shard.Initializing {
				// Shards are marked one at a time so each gets validated and
				// its source cleaned up individually.
				p, err = markShardsAvailable(p, instance.ID(), []uint32{s.ID()}, opts)
				if err != nil {
					return nil, false, err
				}
				updated = true
			}
		}
	}
	return p, updated, nil
}
   877  }