// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package deploy

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/m3db/m3/src/cluster/placement"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/retry"
	xsync "github.com/m3db/m3/src/x/sync"

	"go.uber.org/zap"
)

// Sentinel errors used by the deployment helper: errNoDeploymentProgress is
// returned while none of the queried targets is deploying or at the target
// revision, and errInvalidRevision is returned by Deploy for an empty revision.
var (
	errNoDeploymentProgress = errors.New("no deployment progress")
	errInvalidRevision      = errors.New("invalid revision")
)

// Mode is the deployment mode.
type Mode int

// A list of supported deployment modes.
const (
	// DryRunMode makes Deploy generate and log the deployment plan without
	// executing it. It is the zero value of Mode.
	DryRunMode Mode = iota
	// ForceMode makes Deploy execute the generated deployment plan.
	ForceMode
)

// Helper is a helper class handling deployments.
53 type Helper interface { 54 // Deploy deploys a target revision to the instances in the placement. 55 Deploy(revision string, placement placement.Placement, mode Mode) error 56 } 57 58 // TODO(xichen): disable deployment while another is ongoing. 59 type helper struct { 60 logger *zap.Logger 61 planner planner 62 client AggregatorClient 63 mgr Manager 64 retrier retry.Retrier 65 foreverRetrier retry.Retrier 66 workers xsync.WorkerPool 67 toPlacementInstanceIDFn ToPlacementInstanceIDFn 68 toAPIEndpointFn ToAPIEndpointFn 69 settleBetweenSteps time.Duration 70 } 71 72 // NewHelper creates a new deployment helper. 73 func NewHelper(opts HelperOptions) (Helper, error) { 74 client := NewAggregatorClient(opts.HTTPClient()) 75 planner := newPlanner(client, opts.PlannerOptions()) 76 retryOpts := opts.RetryOptions() 77 retrier := retry.NewRetrier(retryOpts) 78 foreverRetrier := retry.NewRetrier(retryOpts.SetForever(true)) 79 return helper{ 80 logger: opts.InstrumentOptions().Logger(), 81 planner: planner, 82 client: client, 83 mgr: opts.Manager(), 84 retrier: retrier, 85 foreverRetrier: foreverRetrier, 86 workers: opts.WorkerPool(), 87 toPlacementInstanceIDFn: opts.ToPlacementInstanceIDFn(), 88 toAPIEndpointFn: opts.ToAPIEndpointFn(), 89 settleBetweenSteps: opts.SettleDurationBetweenSteps(), 90 }, nil 91 } 92 93 func (h helper) Deploy(revision string, placement placement.Placement, mode Mode) error { 94 if revision == "" { 95 return errInvalidRevision 96 } 97 all, err := h.allInstanceMetadatas(placement) 98 if err != nil { 99 return fmt.Errorf("unable to get all instance metadatas: %v", err) 100 } 101 filtered := all.WithoutRevision(revision) 102 103 plan, err := h.planner.GeneratePlan(filtered, all) 104 if err != nil { 105 return fmt.Errorf("unable to generate deployment plan: %v", err) 106 } 107 108 h.logger.Sugar().Info("generated deployment plan: %+v", plan) 109 110 // If in dry run mode, log the generated deployment plan and return. 
111 if mode == DryRunMode { 112 return nil 113 } 114 115 if err = h.execute(plan, revision, all); err != nil { 116 return fmt.Errorf("unable to execute deployment plan: %v", err) 117 } 118 119 return nil 120 } 121 122 func (h helper) execute( 123 plan deploymentPlan, 124 revision string, 125 all instanceMetadatas, 126 ) error { 127 numSteps := len(plan.Steps) 128 for i, step := range plan.Steps { 129 h.logger.Sugar().Infof("deploying step %d of %d", i+1, numSteps) 130 if err := h.executeStep(step, revision, all); err != nil { 131 return err 132 } 133 h.logger.Sugar().Infof("deploying step %d succeeded", i+1) 134 if h.settleBetweenSteps > 0 { 135 h.logger.Sugar().Infof("waiting settle duration after step: %s", h.settleBetweenSteps.String()) 136 time.Sleep(h.settleBetweenSteps) 137 } 138 } 139 return nil 140 } 141 142 func (h helper) executeStep( 143 step deploymentStep, 144 revision string, 145 all instanceMetadatas, 146 ) error { 147 h.logger.Sugar().Infof("waiting until safe to deploy for step %v", step) 148 if err := h.waitUntilSafe(all); err != nil { 149 return err 150 } 151 152 h.logger.Sugar().Infof("waiting until all targets are validated for step %v", step) 153 if err := h.validate(step.Targets); err != nil { 154 return err 155 } 156 157 h.logger.Sugar().Infof("waiting until all targets have resigned for step %v", step) 158 if err := h.resign(step.Targets); err != nil { 159 return err 160 } 161 162 h.logger.Sugar().Infof("beginning to deploy instances for step %v", step) 163 targetIDs := step.Targets.DeploymentInstanceIDs() 164 if err := h.deploy(targetIDs, revision); err != nil { 165 return err 166 } 167 168 h.logger.Sugar().Infof("deployment started, waiting for progress: %v", step) 169 if err := h.waitUntilProgressing(targetIDs, revision); err != nil { 170 return err 171 } 172 173 h.logger.Sugar().Infof("deployment progressed, waiting for completion: %v", step) 174 return h.waitUntilSafe(all) 175 } 176 177 func (h helper) waitUntilSafe(instances 
instanceMetadatas) error { 178 deploymentInstanceIDs := instances.DeploymentInstanceIDs() 179 return h.foreverRetrier.Attempt(func() error { 180 deploymentInstances, err := h.mgr.Query(deploymentInstanceIDs) 181 if err != nil { 182 return fmt.Errorf("error querying instances: %v", err) 183 } 184 185 var ( 186 wg sync.WaitGroup 187 safe int64 188 ) 189 for i := range deploymentInstances { 190 i := i 191 wg.Add(1) 192 h.workers.Go(func() { 193 defer wg.Done() 194 195 if !deploymentInstances[i].IsHealthy() || deploymentInstances[i].IsDeploying() { 196 return 197 } 198 if err := h.client.IsHealthy(instances[i].APIEndpoint); err != nil { 199 return 200 } 201 atomic.AddInt64(&safe, 1) 202 }) 203 } 204 wg.Wait() 205 206 if safe != int64(len(instances)) { 207 return fmt.Errorf("only %d out of %d instances are safe to deploy", safe, len(instances)) 208 } 209 return nil 210 }) 211 } 212 213 func (h helper) validate(targets deploymentTargets) error { 214 return h.forEachTarget(targets, func(target deploymentTarget) error { 215 return h.foreverRetrier.Attempt(func() error { 216 validator := target.Validator 217 if validator == nil { 218 return nil 219 } 220 if err := validator(); err != nil { 221 err = fmt.Errorf("validation error for instance %s: %v", target.Instance.PlacementInstanceID, err) 222 return err 223 } 224 return nil 225 }) 226 }) 227 } 228 229 func (h helper) resign(targets deploymentTargets) error { 230 return h.forEachTarget(targets, func(target deploymentTarget) error { 231 return h.retrier.Attempt(func() error { 232 instance := target.Instance 233 if err := h.client.Resign(instance.APIEndpoint); err != nil { 234 err = fmt.Errorf("resign error for instance %s: %v", instance.PlacementInstanceID, err) 235 return err 236 } 237 return nil 238 }) 239 }) 240 } 241 242 func (h helper) deploy(targetIDs []string, revision string) error { 243 return h.retrier.Attempt(func() error { 244 return h.mgr.Deploy(targetIDs, revision) 245 }) 246 } 247 248 func (h helper) 
waitUntilProgressing(targetIDs []string, revision string) error { 249 return h.foreverRetrier.Attempt(func() error { 250 targetInstances, err := h.mgr.Query(targetIDs) 251 if err != nil { 252 return fmt.Errorf("error querying instances: %v", err) 253 } 254 255 for _, di := range targetInstances { 256 if di.IsDeploying() || di.Revision() == revision { 257 return nil 258 } 259 } 260 261 return errNoDeploymentProgress 262 }) 263 } 264 265 func (h helper) forEachTarget(targets deploymentTargets, workFn targetWorkFn) error { 266 var ( 267 wg sync.WaitGroup 268 errCh = make(chan error, len(targets)) 269 ) 270 for i := range targets { 271 i := i 272 wg.Add(1) 273 h.workers.Go(func() { 274 defer wg.Done() 275 276 if err := workFn(targets[i]); err != nil { 277 errCh <- err 278 } 279 }) 280 } 281 wg.Wait() 282 close(errCh) 283 284 multiErr := xerrors.NewMultiError() 285 for err := range errCh { 286 multiErr = multiErr.Add(err) 287 } 288 return multiErr.FinalError() 289 } 290 291 func (h helper) allInstanceMetadatas(placement placement.Placement) (instanceMetadatas, error) { 292 placementInstances := placement.Instances() 293 deploymentInstances, err := h.mgr.QueryAll() 294 if err != nil { 295 return nil, fmt.Errorf("unable to query all instances from deployment: %v", err) 296 } 297 metadatas, err := h.computeInstanceMetadatas(placementInstances, deploymentInstances) 298 if err != nil { 299 return nil, fmt.Errorf("unable to compute instance metadatas: %v", err) 300 } 301 return metadatas, nil 302 } 303 304 // validateInstances validates instances derived from placement against 305 // instances derived from deployment, ensuring there are no duplicate instances 306 // and the instances derived from two sources match against each other. 
307 func (h helper) computeInstanceMetadatas( 308 placementInstances []placement.Instance, 309 deploymentInstances []Instance, 310 ) (instanceMetadatas, error) { 311 if len(placementInstances) != len(deploymentInstances) { 312 errMsg := "number of instances is %d in the placement and %d in the deployment" 313 return nil, fmt.Errorf(errMsg, len(placementInstances), len(deploymentInstances)) 314 } 315 316 // Populate instance metadata from placement information. 317 metadatas := make(instanceMetadatas, len(placementInstances)) 318 unique := make(map[string]int) 319 for i, pi := range placementInstances { 320 id := pi.ID() 321 _, exists := unique[id] 322 if exists { 323 return nil, fmt.Errorf("instance %s not unique in the placement", id) 324 } 325 endpoint := pi.Endpoint() 326 apiEndpoint, err := h.toAPIEndpointFn(endpoint) 327 if err != nil { 328 return nil, fmt.Errorf("unable to convert placement endpoint %s to api endpoint: %v", endpoint, err) 329 } 330 unique[id] = i 331 metadatas[i].PlacementInstanceID = id 332 metadatas[i].ShardSetID = pi.ShardSetID() 333 metadatas[i].APIEndpoint = apiEndpoint 334 } 335 336 // Populate instance metadata from deployment information. 337 for _, di := range deploymentInstances { 338 id := di.ID() 339 placementInstanceID, err := h.toPlacementInstanceIDFn(id) 340 if err != nil { 341 return nil, fmt.Errorf("unable to convert deployment instance id %s to placement instance id", id) 342 } 343 idx, exists := unique[placementInstanceID] 344 if !exists { 345 return nil, fmt.Errorf("instance %s is in deployment but not in placement", id) 346 } 347 if metadatas[idx].DeploymentInstanceID != "" { 348 return nil, fmt.Errorf("instance %s not unique in the deployment", id) 349 } 350 metadatas[idx].DeploymentInstanceID = id 351 metadatas[idx].Revision = di.Revision() 352 } 353 354 return metadatas, nil 355 } 356 357 type targetWorkFn func(target deploymentTarget) error 358 359 // instanceMetadata contains instance metadata. 
type instanceMetadata struct {
	// PlacementInstanceID is the id of the instance in the placement.
	PlacementInstanceID string

	// DeploymentInstanceID is the id of the instance in the deployment system.
	DeploymentInstanceID string

	// ShardSetID is the id of the shard set associated with the instance.
	ShardSetID uint32

	// APIEndpoint is the api endpoint of the instance.
	APIEndpoint string

	// Revision is the revision deployed to the instance.
	Revision string
}

// instanceMetadatas is a list of instance metadata.
type instanceMetadatas []instanceMetadata

// DeploymentInstanceIDs returns the deployment instance id of every entry,
// in the same order as the entries themselves.
func (m instanceMetadatas) DeploymentInstanceIDs() []string {
	ids := make([]string, len(m))
	for i := range m {
		ids[i] = m[i].DeploymentInstanceID
	}
	return ids
}

// WithoutRevision returns a new list holding only the entries whose revision
// differs from the given one, preserving their relative order.
func (m instanceMetadatas) WithoutRevision(revision string) instanceMetadatas {
	kept := make(instanceMetadatas, 0, len(m))
	for i := range m {
		if m[i].Revision != revision {
			kept = append(kept, m[i])
		}
	}
	return kept
}