github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/aggregator/tools/deploy/helper.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package deploy
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"sync"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	"github.com/m3db/m3/src/cluster/placement"
    31  	xerrors "github.com/m3db/m3/src/x/errors"
    32  	"github.com/m3db/m3/src/x/retry"
    33  	xsync "github.com/m3db/m3/src/x/sync"
    34  
    35  	"go.uber.org/zap"
    36  )
    37  
    38  var (
    39  	errNoDeploymentProgress = errors.New("no deployment progress")
    40  	errInvalidRevision      = errors.New("invalid revision")
    41  )
    42  
// Mode is the deployment mode.
type Mode int

// A list of supported deployment modes.
const (
	// DryRunMode makes Deploy stop once the deployment plan has been
	// generated and logged; the plan is not executed.
	DryRunMode Mode = iota
	// ForceMode makes Deploy go on to execute the generated plan.
	ForceMode
)
    51  
// Helper is a helper class handling deployments.
type Helper interface {
	// Deploy deploys a target revision to the instances in the placement.
	// The implementation in this file rejects an empty revision and, in
	// DryRunMode, only generates and logs the plan without executing it.
	Deploy(revision string, placement placement.Placement, mode Mode) error
}
    57  
// helper is the Helper implementation returned by NewHelper.
//
// Field roles, as used in this file:
//   - planner generates the deployment plan in Deploy.
//   - client issues the per-instance health checks and resign requests.
//   - mgr queries deployment instances and triggers deployments.
//   - retrier wraps the resign and deploy calls; foreverRetrier (built with
//     SetForever(true)) wraps the wait and validation loops.
//   - workers runs per-instance work concurrently.
//   - toPlacementInstanceIDFn and toAPIEndpointFn translate ids and endpoints
//     when instance metadata is computed.
//   - settleBetweenSteps is slept after each successful step when positive.
//
// TODO(xichen): disable deployment while another is ongoing.
type helper struct {
	logger                  *zap.Logger
	planner                 planner
	client                  AggregatorClient
	mgr                     Manager
	retrier                 retry.Retrier
	foreverRetrier          retry.Retrier
	workers                 xsync.WorkerPool
	toPlacementInstanceIDFn ToPlacementInstanceIDFn
	toAPIEndpointFn         ToAPIEndpointFn
	settleBetweenSteps      time.Duration
}
    71  
    72  // NewHelper creates a new deployment helper.
    73  func NewHelper(opts HelperOptions) (Helper, error) {
    74  	client := NewAggregatorClient(opts.HTTPClient())
    75  	planner := newPlanner(client, opts.PlannerOptions())
    76  	retryOpts := opts.RetryOptions()
    77  	retrier := retry.NewRetrier(retryOpts)
    78  	foreverRetrier := retry.NewRetrier(retryOpts.SetForever(true))
    79  	return helper{
    80  		logger:                  opts.InstrumentOptions().Logger(),
    81  		planner:                 planner,
    82  		client:                  client,
    83  		mgr:                     opts.Manager(),
    84  		retrier:                 retrier,
    85  		foreverRetrier:          foreverRetrier,
    86  		workers:                 opts.WorkerPool(),
    87  		toPlacementInstanceIDFn: opts.ToPlacementInstanceIDFn(),
    88  		toAPIEndpointFn:         opts.ToAPIEndpointFn(),
    89  		settleBetweenSteps:      opts.SettleDurationBetweenSteps(),
    90  	}, nil
    91  }
    92  
    93  func (h helper) Deploy(revision string, placement placement.Placement, mode Mode) error {
    94  	if revision == "" {
    95  		return errInvalidRevision
    96  	}
    97  	all, err := h.allInstanceMetadatas(placement)
    98  	if err != nil {
    99  		return fmt.Errorf("unable to get all instance metadatas: %v", err)
   100  	}
   101  	filtered := all.WithoutRevision(revision)
   102  
   103  	plan, err := h.planner.GeneratePlan(filtered, all)
   104  	if err != nil {
   105  		return fmt.Errorf("unable to generate deployment plan: %v", err)
   106  	}
   107  
   108  	h.logger.Sugar().Info("generated deployment plan: %+v", plan)
   109  
   110  	// If in dry run mode, log the generated deployment plan and return.
   111  	if mode == DryRunMode {
   112  		return nil
   113  	}
   114  
   115  	if err = h.execute(plan, revision, all); err != nil {
   116  		return fmt.Errorf("unable to execute deployment plan: %v", err)
   117  	}
   118  
   119  	return nil
   120  }
   121  
   122  func (h helper) execute(
   123  	plan deploymentPlan,
   124  	revision string,
   125  	all instanceMetadatas,
   126  ) error {
   127  	numSteps := len(plan.Steps)
   128  	for i, step := range plan.Steps {
   129  		h.logger.Sugar().Infof("deploying step %d of %d", i+1, numSteps)
   130  		if err := h.executeStep(step, revision, all); err != nil {
   131  			return err
   132  		}
   133  		h.logger.Sugar().Infof("deploying step %d succeeded", i+1)
   134  		if h.settleBetweenSteps > 0 {
   135  			h.logger.Sugar().Infof("waiting settle duration after step: %s", h.settleBetweenSteps.String())
   136  			time.Sleep(h.settleBetweenSteps)
   137  		}
   138  	}
   139  	return nil
   140  }
   141  
   142  func (h helper) executeStep(
   143  	step deploymentStep,
   144  	revision string,
   145  	all instanceMetadatas,
   146  ) error {
   147  	h.logger.Sugar().Infof("waiting until safe to deploy for step %v", step)
   148  	if err := h.waitUntilSafe(all); err != nil {
   149  		return err
   150  	}
   151  
   152  	h.logger.Sugar().Infof("waiting until all targets are validated for step %v", step)
   153  	if err := h.validate(step.Targets); err != nil {
   154  		return err
   155  	}
   156  
   157  	h.logger.Sugar().Infof("waiting until all targets have resigned for step %v", step)
   158  	if err := h.resign(step.Targets); err != nil {
   159  		return err
   160  	}
   161  
   162  	h.logger.Sugar().Infof("beginning to deploy instances for step %v", step)
   163  	targetIDs := step.Targets.DeploymentInstanceIDs()
   164  	if err := h.deploy(targetIDs, revision); err != nil {
   165  		return err
   166  	}
   167  
   168  	h.logger.Sugar().Infof("deployment started, waiting for progress: %v", step)
   169  	if err := h.waitUntilProgressing(targetIDs, revision); err != nil {
   170  		return err
   171  	}
   172  
   173  	h.logger.Sugar().Infof("deployment progressed, waiting for completion: %v", step)
   174  	return h.waitUntilSafe(all)
   175  }
   176  
   177  func (h helper) waitUntilSafe(instances instanceMetadatas) error {
   178  	deploymentInstanceIDs := instances.DeploymentInstanceIDs()
   179  	return h.foreverRetrier.Attempt(func() error {
   180  		deploymentInstances, err := h.mgr.Query(deploymentInstanceIDs)
   181  		if err != nil {
   182  			return fmt.Errorf("error querying instances: %v", err)
   183  		}
   184  
   185  		var (
   186  			wg   sync.WaitGroup
   187  			safe int64
   188  		)
   189  		for i := range deploymentInstances {
   190  			i := i
   191  			wg.Add(1)
   192  			h.workers.Go(func() {
   193  				defer wg.Done()
   194  
   195  				if !deploymentInstances[i].IsHealthy() || deploymentInstances[i].IsDeploying() {
   196  					return
   197  				}
   198  				if err := h.client.IsHealthy(instances[i].APIEndpoint); err != nil {
   199  					return
   200  				}
   201  				atomic.AddInt64(&safe, 1)
   202  			})
   203  		}
   204  		wg.Wait()
   205  
   206  		if safe != int64(len(instances)) {
   207  			return fmt.Errorf("only %d out of %d instances are safe to deploy", safe, len(instances))
   208  		}
   209  		return nil
   210  	})
   211  }
   212  
   213  func (h helper) validate(targets deploymentTargets) error {
   214  	return h.forEachTarget(targets, func(target deploymentTarget) error {
   215  		return h.foreverRetrier.Attempt(func() error {
   216  			validator := target.Validator
   217  			if validator == nil {
   218  				return nil
   219  			}
   220  			if err := validator(); err != nil {
   221  				err = fmt.Errorf("validation error for instance %s: %v", target.Instance.PlacementInstanceID, err)
   222  				return err
   223  			}
   224  			return nil
   225  		})
   226  	})
   227  }
   228  
   229  func (h helper) resign(targets deploymentTargets) error {
   230  	return h.forEachTarget(targets, func(target deploymentTarget) error {
   231  		return h.retrier.Attempt(func() error {
   232  			instance := target.Instance
   233  			if err := h.client.Resign(instance.APIEndpoint); err != nil {
   234  				err = fmt.Errorf("resign error for instance %s: %v", instance.PlacementInstanceID, err)
   235  				return err
   236  			}
   237  			return nil
   238  		})
   239  	})
   240  }
   241  
   242  func (h helper) deploy(targetIDs []string, revision string) error {
   243  	return h.retrier.Attempt(func() error {
   244  		return h.mgr.Deploy(targetIDs, revision)
   245  	})
   246  }
   247  
   248  func (h helper) waitUntilProgressing(targetIDs []string, revision string) error {
   249  	return h.foreverRetrier.Attempt(func() error {
   250  		targetInstances, err := h.mgr.Query(targetIDs)
   251  		if err != nil {
   252  			return fmt.Errorf("error querying instances: %v", err)
   253  		}
   254  
   255  		for _, di := range targetInstances {
   256  			if di.IsDeploying() || di.Revision() == revision {
   257  				return nil
   258  			}
   259  		}
   260  
   261  		return errNoDeploymentProgress
   262  	})
   263  }
   264  
   265  func (h helper) forEachTarget(targets deploymentTargets, workFn targetWorkFn) error {
   266  	var (
   267  		wg    sync.WaitGroup
   268  		errCh = make(chan error, len(targets))
   269  	)
   270  	for i := range targets {
   271  		i := i
   272  		wg.Add(1)
   273  		h.workers.Go(func() {
   274  			defer wg.Done()
   275  
   276  			if err := workFn(targets[i]); err != nil {
   277  				errCh <- err
   278  			}
   279  		})
   280  	}
   281  	wg.Wait()
   282  	close(errCh)
   283  
   284  	multiErr := xerrors.NewMultiError()
   285  	for err := range errCh {
   286  		multiErr = multiErr.Add(err)
   287  	}
   288  	return multiErr.FinalError()
   289  }
   290  
   291  func (h helper) allInstanceMetadatas(placement placement.Placement) (instanceMetadatas, error) {
   292  	placementInstances := placement.Instances()
   293  	deploymentInstances, err := h.mgr.QueryAll()
   294  	if err != nil {
   295  		return nil, fmt.Errorf("unable to query all instances from deployment: %v", err)
   296  	}
   297  	metadatas, err := h.computeInstanceMetadatas(placementInstances, deploymentInstances)
   298  	if err != nil {
   299  		return nil, fmt.Errorf("unable to compute instance metadatas: %v", err)
   300  	}
   301  	return metadatas, nil
   302  }
   303  
   304  // validateInstances validates instances derived from placement against
   305  // instances derived from deployment, ensuring there are no duplicate instances
   306  // and the instances derived from two sources match against each other.
   307  func (h helper) computeInstanceMetadatas(
   308  	placementInstances []placement.Instance,
   309  	deploymentInstances []Instance,
   310  ) (instanceMetadatas, error) {
   311  	if len(placementInstances) != len(deploymentInstances) {
   312  		errMsg := "number of instances is %d in the placement and %d in the deployment"
   313  		return nil, fmt.Errorf(errMsg, len(placementInstances), len(deploymentInstances))
   314  	}
   315  
   316  	// Populate instance metadata from placement information.
   317  	metadatas := make(instanceMetadatas, len(placementInstances))
   318  	unique := make(map[string]int)
   319  	for i, pi := range placementInstances {
   320  		id := pi.ID()
   321  		_, exists := unique[id]
   322  		if exists {
   323  			return nil, fmt.Errorf("instance %s not unique in the placement", id)
   324  		}
   325  		endpoint := pi.Endpoint()
   326  		apiEndpoint, err := h.toAPIEndpointFn(endpoint)
   327  		if err != nil {
   328  			return nil, fmt.Errorf("unable to convert placement endpoint %s to api endpoint: %v", endpoint, err)
   329  		}
   330  		unique[id] = i
   331  		metadatas[i].PlacementInstanceID = id
   332  		metadatas[i].ShardSetID = pi.ShardSetID()
   333  		metadatas[i].APIEndpoint = apiEndpoint
   334  	}
   335  
   336  	// Populate instance metadata from deployment information.
   337  	for _, di := range deploymentInstances {
   338  		id := di.ID()
   339  		placementInstanceID, err := h.toPlacementInstanceIDFn(id)
   340  		if err != nil {
   341  			return nil, fmt.Errorf("unable to convert deployment instance id %s to placement instance id", id)
   342  		}
   343  		idx, exists := unique[placementInstanceID]
   344  		if !exists {
   345  			return nil, fmt.Errorf("instance %s is in deployment but not in placement", id)
   346  		}
   347  		if metadatas[idx].DeploymentInstanceID != "" {
   348  			return nil, fmt.Errorf("instance %s not unique in the deployment", id)
   349  		}
   350  		metadatas[idx].DeploymentInstanceID = id
   351  		metadatas[idx].Revision = di.Revision()
   352  	}
   353  
   354  	return metadatas, nil
   355  }
   356  
// targetWorkFn is the unit of work that forEachTarget applies to a single
// deployment target; a non-nil return value is collected as a failure.
type targetWorkFn func(target deploymentTarget) error
   358  
   359  // instanceMetadata contains instance metadata.
   360  type instanceMetadata struct {
   361  	// PlacementInstanceID is the instance id in the placement.
   362  	PlacementInstanceID string
   363  
   364  	// DeploymentInstanceID is the instance id in the deployment system.
   365  	DeploymentInstanceID string
   366  
   367  	// ShardSetID is the shard set id associated with the instance.
   368  	ShardSetID uint32
   369  
   370  	// APIEndpoint is the api endpoint for the instance.
   371  	APIEndpoint string
   372  
   373  	// Revision is the revision deployed to the instance.
   374  	Revision string
   375  }
   376  
   377  type instanceMetadatas []instanceMetadata
   378  
   379  func (m instanceMetadatas) DeploymentInstanceIDs() []string {
   380  	res := make([]string, 0, len(m))
   381  	for _, metadata := range m {
   382  		res = append(res, metadata.DeploymentInstanceID)
   383  	}
   384  	return res
   385  }
   386  
   387  func (m instanceMetadatas) WithoutRevision(revision string) instanceMetadatas {
   388  	filtered := make(instanceMetadatas, 0, len(m))
   389  	for _, metadata := range m {
   390  		if metadata.Revision == revision {
   391  			continue
   392  		}
   393  		filtered = append(filtered, metadata)
   394  	}
   395  	return filtered
   396  }