github.com/bigcommerce/nomad@v0.9.3-bc/client/allocwatcher/alloc_watcher.go (about) 1 package allocwatcher 2 3 import ( 4 "archive/tar" 5 "context" 6 "fmt" 7 "io" 8 "os" 9 "path/filepath" 10 "sync" 11 "syscall" 12 "time" 13 14 "github.com/hashicorp/consul/lib" 15 hclog "github.com/hashicorp/go-hclog" 16 nomadapi "github.com/hashicorp/nomad/api" 17 "github.com/hashicorp/nomad/client/allocdir" 18 "github.com/hashicorp/nomad/client/config" 19 cstructs "github.com/hashicorp/nomad/client/structs" 20 "github.com/hashicorp/nomad/nomad/structs" 21 ) 22 23 const ( 24 // getRemoteRetryIntv is minimum interval on which we retry 25 // to fetch remote objects. We pick a value between this and 2x this. 26 getRemoteRetryIntv = 30 * time.Second 27 ) 28 29 // RPCer is the interface needed by a prevAllocWatcher to make RPC calls. 30 type RPCer interface { 31 // RPC allows retrieving remote allocs. 32 RPC(method string, args interface{}, reply interface{}) error 33 } 34 35 // terminated is the interface needed by a prevAllocWatcher to check if an 36 // alloc is terminated. 37 type terminated interface { 38 Terminated() bool 39 } 40 41 // AllocRunnerMeta provides metadata about an AllocRunner such as its alloc and 42 // alloc dir. 43 type AllocRunnerMeta interface { 44 GetAllocDir() *allocdir.AllocDir 45 Listener() *cstructs.AllocListener 46 Alloc() *structs.Allocation 47 } 48 49 // PrevAllocWatcher allows AllocRunners to wait for a previous allocation to 50 // terminate whether or not the previous allocation is local or remote. 51 // See `PrevAllocMigrator` for migrating workloads. 52 type PrevAllocWatcher interface { 53 // Wait for previous alloc to terminate 54 Wait(context.Context) error 55 56 // IsWaiting returns true if a concurrent caller is blocked in Wait 57 IsWaiting() bool 58 } 59 60 // PrevAllocMigrator allows AllocRunners to migrate a previous allocation 61 // whether or not the previous allocation is local or remote. 62 type PrevAllocMigrator interface { 63 PrevAllocWatcher 64 65 // IsMigrating returns true if a concurrent caller is in Migrate 66 IsMigrating() bool 67 68 // Migrate data from previous alloc 69 Migrate(ctx context.Context, dest *allocdir.AllocDir) error 70 } 71 72 type Config struct { 73 // Alloc is the current allocation which may need to block on its 74 // previous allocation stopping. 75 Alloc *structs.Allocation 76 77 // PreviousRunner is non-nil if Alloc has a PreviousAllocation and it is 78 // running locally. 79 PreviousRunner AllocRunnerMeta 80 81 // PreemptedRunners is non-nil if Alloc has one or more PreemptedAllocations. 82 PreemptedRunners map[string]AllocRunnerMeta 83 84 // RPC allows the alloc watcher to monitor remote allocations. 85 RPC RPCer 86 87 // Config is necessary for using the RPC. 88 Config *config.Config 89 90 // MigrateToken is used to migrate remote alloc dirs when ACLs are 91 // enabled. 92 MigrateToken string 93 94 Logger hclog.Logger 95 } 96 97 func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string, m AllocRunnerMeta) PrevAllocMigrator { 98 logger := c.Logger.Named("alloc_migrator").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID) 99 100 tasks := tg.Tasks 101 sticky := tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky 102 migrate := tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate 103 104 if m != nil { 105 // Local Allocation because there's no meta 106 return &localPrevAlloc{ 107 allocID: c.Alloc.ID, 108 prevAllocID: watchedAllocID, 109 tasks: tasks, 110 sticky: sticky, 111 prevAllocDir: m.GetAllocDir(), 112 prevListener: m.Listener(), 113 prevStatus: m.Alloc(), 114 logger: logger, 115 } 116 } 117 118 return &remotePrevAlloc{ 119 allocID: c.Alloc.ID, 120 prevAllocID: c.Alloc.PreviousAllocation, 121 tasks: tasks, 122 config: c.Config, 123 migrate: migrate, 124 rpc: c.RPC, 125 migrateToken: c.MigrateToken, 126 logger: logger, 127 } 128 } 129 130 func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher { 131 logger := c.Logger.Named("alloc_watcher").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID) 132 133 if m != nil { 134 // Local Allocation because there's no meta 135 return &localPrevAlloc{ 136 allocID: c.Alloc.ID, 137 prevAllocID: watchedAllocID, 138 prevAllocDir: m.GetAllocDir(), 139 prevListener: m.Listener(), 140 prevStatus: m.Alloc(), 141 logger: logger, 142 } 143 } 144 145 return &remotePrevAlloc{ 146 allocID: c.Alloc.ID, 147 prevAllocID: c.Alloc.PreviousAllocation, 148 config: c.Config, 149 rpc: c.RPC, 150 migrateToken: c.MigrateToken, 151 logger: logger, 152 } 153 } 154 155 // NewAllocWatcher creates a PrevAllocWatcher appropriate for whether this 156 // alloc's previous allocation was local or remote. If this alloc has no 157 // previous alloc then a noop implementation is returned. 158 func NewAllocWatcher(c Config) (PrevAllocWatcher, PrevAllocMigrator) { 159 if c.Alloc.PreviousAllocation == "" && c.PreemptedRunners == nil { 160 return NoopPrevAlloc{}, NoopPrevAlloc{} 161 } 162 163 var prevAllocWatchers []PrevAllocWatcher 164 var prevAllocMigrator PrevAllocMigrator = NoopPrevAlloc{} 165 166 // We have a previous allocation, add its listener to the watchers, and 167 // use a migrator. 168 if c.Alloc.PreviousAllocation != "" { 169 tg := c.Alloc.Job.LookupTaskGroup(c.Alloc.TaskGroup) 170 m := newMigratorForAlloc(c, tg, c.Alloc.PreviousAllocation, c.PreviousRunner) 171 prevAllocWatchers = append(prevAllocWatchers, m) 172 prevAllocMigrator = m 173 } 174 175 // We are preempting allocations, add their listeners to the watchers. 176 if c.PreemptedRunners != nil { 177 for aid, r := range c.PreemptedRunners { 178 w := newWatcherForAlloc(c, aid, r) 179 prevAllocWatchers = append(prevAllocWatchers, w) 180 } 181 } 182 183 groupWatcher := &groupPrevAllocWatcher{ 184 prevAllocs: prevAllocWatchers, 185 } 186 187 return groupWatcher, prevAllocMigrator 188 } 189 190 // localPrevAlloc is a prevAllocWatcher for previous allocations on the same 191 // node as an updated allocation. 192 type localPrevAlloc struct { 193 // allocID is the ID of the alloc being blocked 194 allocID string 195 196 // prevAllocID is the ID of the alloc being replaced 197 prevAllocID string 198 199 // tasks on the new alloc 200 tasks []*structs.Task 201 202 // sticky is true if data should be moved 203 sticky bool 204 205 // prevAllocDir is the alloc dir for the previous alloc 206 prevAllocDir *allocdir.AllocDir 207 208 // prevListener allows blocking for updates to the previous alloc 209 prevListener *cstructs.AllocListener 210 211 // prevStatus allows checking if the previous alloc has already 212 // terminated (and therefore won't send updates to the listener) 213 prevStatus terminated 214 215 // waiting and migrating are true when alloc runner is waiting on the 216 // prevAllocWatcher. Writers must acquire the waitingLock and readers 217 // should use the helper methods IsWaiting and IsMigrating. 218 waiting bool 219 migrating bool 220 waitingLock sync.RWMutex 221 222 logger hclog.Logger 223 } 224 225 // IsWaiting returns true if there's a concurrent call inside Wait 226 func (p *localPrevAlloc) IsWaiting() bool { 227 p.waitingLock.RLock() 228 b := p.waiting 229 p.waitingLock.RUnlock() 230 return b 231 } 232 233 // IsMigrating returns true if there's a concurrent call inside Migrate 234 func (p *localPrevAlloc) IsMigrating() bool { 235 p.waitingLock.RLock() 236 b := p.migrating 237 p.waitingLock.RUnlock() 238 return b 239 } 240 241 // Wait on a local alloc to become terminal, exit, or the context to be done. 242 func (p *localPrevAlloc) Wait(ctx context.Context) error { 243 p.waitingLock.Lock() 244 p.waiting = true 245 p.waitingLock.Unlock() 246 defer func() { 247 p.waitingLock.Lock() 248 p.waiting = false 249 p.waitingLock.Unlock() 250 }() 251 252 defer p.prevListener.Close() 253 254 // Don't bother blocking for updates from the previous alloc if it has 255 // already terminated. 256 if p.prevStatus.Terminated() { 257 p.logger.Trace("previous allocation already terminated") 258 return nil 259 } 260 261 // Block until previous alloc exits 262 p.logger.Debug("waiting for previous alloc to terminate") 263 for { 264 select { 265 case prevAlloc, ok := <-p.prevListener.Ch(): 266 if !ok || prevAlloc.Terminated() { 267 return nil 268 } 269 case <-ctx.Done(): 270 return ctx.Err() 271 } 272 } 273 } 274 275 // Migrate from previous local alloc dir to destination alloc dir. 276 func (p *localPrevAlloc) Migrate(ctx context.Context, dest *allocdir.AllocDir) error { 277 if !p.sticky { 278 // Not a sticky volume, nothing to migrate 279 return nil 280 } 281 282 p.waitingLock.Lock() 283 p.migrating = true 284 p.waitingLock.Unlock() 285 defer func() { 286 p.waitingLock.Lock() 287 p.migrating = false 288 p.waitingLock.Unlock() 289 }() 290 291 p.logger.Debug("copying previous alloc") 292 293 moveErr := dest.Move(p.prevAllocDir, p.tasks) 294 295 // Always cleanup previous alloc 296 if err := p.prevAllocDir.Destroy(); err != nil { 297 p.logger.Error("error destroying alloc dir", 298 "error", err, "previous_alloc_dir", p.prevAllocDir.AllocDir) 299 } 300 301 return moveErr 302 } 303 304 // remotePrevAlloc is a prevAllocWatcher for previous allocations on remote 305 // nodes as an updated allocation. 306 type remotePrevAlloc struct { 307 // allocID is the ID of the alloc being blocked 308 allocID string 309 310 // prevAllocID is the ID of the alloc being replaced 311 prevAllocID string 312 313 // tasks on the new alloc 314 tasks []*structs.Task 315 316 // config for the Client to get AllocDir, Region, and Node.SecretID 317 config *config.Config 318 319 // migrate is true if data should be moved between nodes 320 migrate bool 321 322 // rpc provides an RPC method for watching for updates to the previous 323 // alloc and determining what node it was on. 324 rpc RPCer 325 326 // nodeID is the node the previous alloc. Set by Wait() for use in 327 // Migrate() iff the previous alloc has not already been GC'd. 328 nodeID string 329 330 // waiting and migrating are true when alloc runner is waiting on the 331 // prevAllocWatcher. Writers must acquire the waitingLock and readers 332 // should use the helper methods IsWaiting and IsMigrating. 333 waiting bool 334 migrating bool 335 waitingLock sync.RWMutex 336 337 logger hclog.Logger 338 339 // migrateToken allows a client to migrate data in an ACL-protected remote 340 // volume 341 migrateToken string 342 } 343 344 // IsWaiting returns true if there's a concurrent call inside Wait 345 func (p *remotePrevAlloc) IsWaiting() bool { 346 p.waitingLock.RLock() 347 b := p.waiting 348 p.waitingLock.RUnlock() 349 return b 350 } 351 352 // IsMigrating returns true if there's a concurrent call inside Migrate 353 func (p *remotePrevAlloc) IsMigrating() bool { 354 p.waitingLock.RLock() 355 b := p.migrating 356 p.waitingLock.RUnlock() 357 return b 358 } 359 360 // Wait until the remote previous allocation has terminated. 361 func (p *remotePrevAlloc) Wait(ctx context.Context) error { 362 p.waitingLock.Lock() 363 p.waiting = true 364 p.waitingLock.Unlock() 365 defer func() { 366 p.waitingLock.Lock() 367 p.waiting = false 368 p.waitingLock.Unlock() 369 }() 370 371 p.logger.Debug("waiting for remote previous alloc to terminate") 372 req := structs.AllocSpecificRequest{ 373 AllocID: p.prevAllocID, 374 QueryOptions: structs.QueryOptions{ 375 Region: p.config.Region, 376 AllowStale: true, 377 AuthToken: p.config.Node.SecretID, 378 }, 379 } 380 381 done := func() bool { 382 select { 383 case <-ctx.Done(): 384 return true 385 default: 386 return false 387 } 388 } 389 390 for !done() { 391 resp := structs.SingleAllocResponse{} 392 err := p.rpc.RPC("Alloc.GetAlloc", &req, &resp) 393 if err != nil { 394 p.logger.Error("error querying previous alloc", "error", err) 395 retry := getRemoteRetryIntv + lib.RandomStagger(getRemoteRetryIntv) 396 select { 397 case <-time.After(retry): 398 continue 399 case <-ctx.Done(): 400 return ctx.Err() 401 } 402 } 403 if resp.Alloc == nil { 404 p.logger.Debug("blocking alloc was GC'd") 405 return nil 406 } 407 if resp.Alloc.Terminated() { 408 // Terminated! 409 p.nodeID = resp.Alloc.NodeID 410 return nil 411 } 412 413 // Update the query index and requery. 414 if resp.Index > req.MinQueryIndex { 415 req.MinQueryIndex = resp.Index 416 } 417 } 418 419 return ctx.Err() 420 } 421 422 // Migrate alloc data from a remote node if the new alloc has migration enabled 423 // and the old alloc hasn't been GC'd. 424 func (p *remotePrevAlloc) Migrate(ctx context.Context, dest *allocdir.AllocDir) error { 425 if !p.migrate { 426 // Volume wasn't configured to be migrated, return early 427 return nil 428 } 429 430 p.waitingLock.Lock() 431 p.migrating = true 432 p.waitingLock.Unlock() 433 defer func() { 434 p.waitingLock.Lock() 435 p.migrating = false 436 p.waitingLock.Unlock() 437 }() 438 439 p.logger.Debug("copying from remote previous alloc") 440 441 if p.nodeID == "" { 442 // NodeID couldn't be found; likely alloc was GC'd 443 p.logger.Warn("unable to migrate data from previous alloc; previous alloc may have been GC'd") 444 return nil 445 } 446 447 addr, err := p.getNodeAddr(ctx, p.nodeID) 448 if err != nil { 449 return err 450 } 451 452 prevAllocDir, err := p.migrateAllocDir(ctx, addr) 453 if err != nil { 454 return err 455 } 456 457 if err := dest.Move(prevAllocDir, p.tasks); err != nil { 458 // cleanup on error 459 prevAllocDir.Destroy() 460 return err 461 } 462 463 if err := prevAllocDir.Destroy(); err != nil { 464 p.logger.Error("error destroying alloc dir", 465 "error", err, "previous_alloc_dir", prevAllocDir.AllocDir) 466 } 467 return nil 468 } 469 470 // getNodeAddr gets the node from the server with the given Node ID 471 func (p *remotePrevAlloc) getNodeAddr(ctx context.Context, nodeID string) (string, error) { 472 req := structs.NodeSpecificRequest{ 473 NodeID: nodeID, 474 QueryOptions: structs.QueryOptions{ 475 Region: p.config.Region, 476 AllowStale: true, 477 AuthToken: p.config.Node.SecretID, 478 }, 479 } 480 481 resp := structs.SingleNodeResponse{} 482 for { 483 err := p.rpc.RPC("Node.GetNode", &req, &resp) 484 if err != nil { 485 p.logger.Error("failed to query node", "error", err, "node", nodeID) 486 retry := getRemoteRetryIntv + lib.RandomStagger(getRemoteRetryIntv) 487 select { 488 case <-time.After(retry): 489 continue 490 case <-ctx.Done(): 491 return "", ctx.Err() 492 } 493 } 494 break 495 } 496 497 if resp.Node == nil { 498 return "", fmt.Errorf("node %q not found", nodeID) 499 } 500 501 scheme := "http://" 502 if resp.Node.TLSEnabled { 503 scheme = "https://" 504 } 505 return scheme + resp.Node.HTTPAddr, nil 506 } 507 508 // migrate a remote alloc dir to local node. Caller is responsible for calling 509 // Destroy on the returned allocdir if no error occurs. 510 func (p *remotePrevAlloc) migrateAllocDir(ctx context.Context, nodeAddr string) (*allocdir.AllocDir, error) { 511 // Create the previous alloc dir 512 prevAllocDir := allocdir.NewAllocDir(p.logger, filepath.Join(p.config.AllocDir, p.prevAllocID)) 513 if err := prevAllocDir.Build(); err != nil { 514 return nil, fmt.Errorf("error building alloc dir for previous alloc %q: %v", p.prevAllocID, err) 515 } 516 517 // Create an API client 518 apiConfig := nomadapi.DefaultConfig() 519 apiConfig.Address = nodeAddr 520 apiConfig.TLSConfig = &nomadapi.TLSConfig{ 521 CACert: p.config.TLSConfig.CAFile, 522 ClientCert: p.config.TLSConfig.CertFile, 523 ClientKey: p.config.TLSConfig.KeyFile, 524 TLSServerName: fmt.Sprintf("client.%s.nomad", p.config.Region), 525 } 526 apiClient, err := nomadapi.NewClient(apiConfig) 527 if err != nil { 528 return nil, err 529 } 530 531 url := fmt.Sprintf("/v1/client/allocation/%v/snapshot", p.prevAllocID) 532 qo := &nomadapi.QueryOptions{AuthToken: p.migrateToken} 533 resp, err := apiClient.Raw().Response(url, qo) 534 if err != nil { 535 prevAllocDir.Destroy() 536 return nil, fmt.Errorf("error getting snapshot from previous alloc %q: %v", p.prevAllocID, err) 537 } 538 539 if err := p.streamAllocDir(ctx, resp, prevAllocDir.AllocDir); err != nil { 540 prevAllocDir.Destroy() 541 return nil, err 542 } 543 544 return prevAllocDir, nil 545 } 546 547 // stream remote alloc to dir to a local path. Caller should cleanup dest on 548 // error. 549 func (p *remotePrevAlloc) streamAllocDir(ctx context.Context, resp io.ReadCloser, dest string) error { 550 p.logger.Debug("streaming snapshot of previous alloc", "destination", dest) 551 tr := tar.NewReader(resp) 552 defer resp.Close() 553 554 // Cache effective uid as we only run Chown if we're root 555 euid := syscall.Geteuid() 556 557 canceled := func() bool { 558 select { 559 case <-ctx.Done(): 560 p.logger.Info("migration of previous alloc canceled") 561 return true 562 default: 563 return false 564 } 565 } 566 567 // if we see this file, there was an error on the remote side 568 errorFilename := allocdir.SnapshotErrorFilename(p.prevAllocID) 569 570 buf := make([]byte, 1024) 571 for !canceled() { 572 // Get the next header 573 hdr, err := tr.Next() 574 575 // Snapshot has ended 576 if err == io.EOF { 577 return nil 578 } 579 580 if err != nil { 581 return fmt.Errorf("error streaming previous alloc %q for new alloc %q: %v", 582 p.prevAllocID, p.allocID, err) 583 } 584 585 if hdr.Name == errorFilename { 586 // Error snapshotting on the remote side, try to read 587 // the message out of the file and return it. 588 errBuf := make([]byte, int(hdr.Size)) 589 if _, err := tr.Read(errBuf); err != nil && err != io.EOF { 590 return fmt.Errorf("error streaming previous alloc %q for new alloc %q; failed reading error message: %v", 591 p.prevAllocID, p.allocID, err) 592 } 593 return fmt.Errorf("error streaming previous alloc %q for new alloc %q: %s", 594 p.prevAllocID, p.allocID, string(errBuf)) 595 } 596 597 // If the header is for a directory we create the directory 598 if hdr.Typeflag == tar.TypeDir { 599 name := filepath.Join(dest, hdr.Name) 600 os.MkdirAll(name, os.FileMode(hdr.Mode)) 601 602 // Can't change owner if not root or on Windows. 603 if euid == 0 { 604 if err := os.Chown(name, hdr.Uid, hdr.Gid); err != nil { 605 return fmt.Errorf("error chowning directory %v", err) 606 } 607 } 608 continue 609 } 610 // If the header is for a symlink we create the symlink 611 if hdr.Typeflag == tar.TypeSymlink { 612 if err = os.Symlink(hdr.Linkname, filepath.Join(dest, hdr.Name)); err != nil { 613 return fmt.Errorf("error creating symlink: %v", err) 614 } 615 continue 616 } 617 // If the header is a file, we write to a file 618 if hdr.Typeflag == tar.TypeReg { 619 f, err := os.Create(filepath.Join(dest, hdr.Name)) 620 if err != nil { 621 return fmt.Errorf("error creating file: %v", err) 622 } 623 624 // Setting the permissions of the file as the origin. 625 if err := f.Chmod(os.FileMode(hdr.Mode)); err != nil { 626 f.Close() 627 return fmt.Errorf("error chmoding file %v", err) 628 } 629 630 // Can't change owner if not root or on Windows. 631 if euid == 0 { 632 if err := f.Chown(hdr.Uid, hdr.Gid); err != nil { 633 f.Close() 634 return fmt.Errorf("error chowning file %v", err) 635 } 636 } 637 638 // We write in chunks so that we can test if the client 639 // is still alive 640 for !canceled() { 641 n, err := tr.Read(buf) 642 if n > 0 && (err == nil || err == io.EOF) { 643 if _, err := f.Write(buf[:n]); err != nil { 644 f.Close() 645 return fmt.Errorf("error writing to file %q: %v", f.Name(), err) 646 } 647 } 648 649 if err != nil { 650 f.Close() 651 if err != io.EOF { 652 return fmt.Errorf("error reading snapshot: %v", err) 653 } 654 break 655 } 656 } 657 658 } 659 } 660 661 if canceled() { 662 return ctx.Err() 663 } 664 665 return nil 666 } 667 668 // NoopPrevAlloc does not block or migrate on a previous allocation and never 669 // returns an error. 670 type NoopPrevAlloc struct{} 671 672 // Wait returns nil immediately. 673 func (NoopPrevAlloc) Wait(context.Context) error { return nil } 674 675 // Migrate returns nil immediately. 676 func (NoopPrevAlloc) Migrate(context.Context, *allocdir.AllocDir) error { return nil } 677 678 func (NoopPrevAlloc) IsWaiting() bool { return false } 679 func (NoopPrevAlloc) IsMigrating() bool { return false }