github.com/marwan-at-work/consul@v1.4.5/command/debug/debug.go (about) 1 package debug 2 3 import ( 4 "archive/tar" 5 "compress/gzip" 6 "encoding/json" 7 "errors" 8 "flag" 9 "fmt" 10 "io" 11 "io/ioutil" 12 "os" 13 "path/filepath" 14 "strings" 15 "sync" 16 "time" 17 18 "github.com/hashicorp/consul/api" 19 "github.com/hashicorp/consul/command/flags" 20 multierror "github.com/hashicorp/go-multierror" 21 "github.com/mitchellh/cli" 22 ) 23 24 const ( 25 // debugInterval is the interval in which to capture dynamic information 26 // when running debug 27 debugInterval = 30 * time.Second 28 29 // debugDuration is the total duration that debug runs before being 30 // shut down 31 debugDuration = 2 * time.Minute 32 33 // debugDurationGrace is a period of time added to the specified 34 // duration to allow intervals to capture within that time 35 debugDurationGrace = 2 * time.Second 36 37 // debugMinInterval is the minimum a user can configure the interval 38 // to prevent accidental DOS 39 debugMinInterval = 5 * time.Second 40 41 // debugMinDuration is the minimum a user can configure the duration 42 // to ensure that all information can be collected in time 43 debugMinDuration = 10 * time.Second 44 45 // debugArchiveExtension is the extension for archive files 46 debugArchiveExtension = ".tar.gz" 47 48 // debugProtocolVersion is the version of the package that is 49 // generated. If this format changes interface, this version 50 // can be incremented so clients can selectively support packages 51 debugProtocolVersion = 1 52 ) 53 54 func New(ui cli.Ui, shutdownCh <-chan struct{}) *cmd { 55 ui = &cli.PrefixedUi{ 56 OutputPrefix: "==> ", 57 InfoPrefix: " ", 58 ErrorPrefix: "==> ", 59 Ui: ui, 60 } 61 62 c := &cmd{UI: ui, shutdownCh: shutdownCh} 63 c.init() 64 return c 65 } 66 67 type cmd struct { 68 UI cli.Ui 69 flags *flag.FlagSet 70 http *flags.HTTPFlags 71 help string 72 73 shutdownCh <-chan struct{} 74 75 // flags 76 interval time.Duration 77 duration time.Duration 78 output string 79 archive bool 80 capture []string 81 client *api.Client 82 // validateTiming can be used to skip validation of interval, duration. This 83 // is primarily useful for testing 84 validateTiming bool 85 86 index *debugIndex 87 } 88 89 // debugIndex is used to manage the summary of all data recorded 90 // during the debug, to be written to json at the end of the run 91 // and stored at the root. Each attribute corresponds to a file or files. 92 type debugIndex struct { 93 // Version of the debug package 94 Version int 95 // Version of the target Consul agent 96 AgentVersion string 97 98 Interval string 99 Duration string 100 101 Targets []string 102 } 103 104 func (c *cmd) init() { 105 c.flags = flag.NewFlagSet("", flag.ContinueOnError) 106 107 defaultFilename := fmt.Sprintf("consul-debug-%d", time.Now().Unix()) 108 109 c.flags.Var((*flags.AppendSliceValue)(&c.capture), "capture", 110 fmt.Sprintf("One or more types of information to capture. This can be used "+ 111 "to capture a subset of information, and defaults to capturing "+ 112 "everything available. Possible information for capture: %s. "+ 113 "This can be repeated multiple times.", strings.Join(c.defaultTargets(), ", "))) 114 c.flags.DurationVar(&c.interval, "interval", debugInterval, 115 fmt.Sprintf("The interval in which to capture dynamic information such as "+ 116 "telemetry, and profiling. Defaults to %s.", debugInterval)) 117 c.flags.DurationVar(&c.duration, "duration", debugDuration, 118 fmt.Sprintf("The total time to record information. "+ 119 "Defaults to %s.", debugDuration)) 120 c.flags.BoolVar(&c.archive, "archive", true, "Boolean value for if the files "+ 121 "should be archived and compressed. Setting this to false will skip the "+ 122 "archive step and leave the directory of information on the current path.") 123 c.flags.StringVar(&c.output, "output", defaultFilename, "The path "+ 124 "to the compressed archive that will be created with the "+ 125 "information after collection.") 126 127 c.http = &flags.HTTPFlags{} 128 flags.Merge(c.flags, c.http.ClientFlags()) 129 c.help = flags.Usage(help, c.flags) 130 131 c.validateTiming = true 132 } 133 134 func (c *cmd) Run(args []string) int { 135 if err := c.flags.Parse(args); err != nil { 136 c.UI.Error(fmt.Sprintf("Error parsing flags: %s", err)) 137 return 1 138 } 139 140 if len(c.flags.Args()) > 0 { 141 c.UI.Error("debug: Too many arguments provided, expected 0") 142 return 1 143 } 144 145 // Connect to the agent 146 client, err := c.http.APIClient() 147 if err != nil { 148 c.UI.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err)) 149 return 1 150 } 151 c.client = client 152 153 version, err := c.prepare() 154 if err != nil { 155 c.UI.Error(fmt.Sprintf("Capture validation failed: %v", err)) 156 return 1 157 } 158 159 archiveName := c.output 160 // Show the user the final file path if archiving 161 if c.archive { 162 archiveName = archiveName + debugArchiveExtension 163 } 164 165 c.UI.Output("Starting debugger and capturing static information...") 166 167 // Output metadata about target agent 168 c.UI.Info(fmt.Sprintf(" Agent Version: '%s'", version)) 169 c.UI.Info(fmt.Sprintf(" Interval: '%s'", c.interval)) 170 c.UI.Info(fmt.Sprintf(" Duration: '%s'", c.duration)) 171 c.UI.Info(fmt.Sprintf(" Output: '%s'", archiveName)) 172 c.UI.Info(fmt.Sprintf(" Capture: '%s'", strings.Join(c.capture, ", "))) 173 174 // Record some information for the index at the root of the archive 175 index := &debugIndex{ 176 Version: debugProtocolVersion, 177 AgentVersion: version, 178 Interval: c.interval.String(), 179 Duration: c.duration.String(), 180 Targets: c.capture, 181 } 182 183 // Add the extra grace period to ensure 184 // all intervals will be captured within the time allotted 185 c.duration = c.duration + debugDurationGrace 186 187 // Capture static information from the target agent 188 err = c.captureStatic() 189 if err != nil { 190 c.UI.Warn(fmt.Sprintf("Static capture failed: %v", err)) 191 } 192 193 // Capture dynamic information from the target agent, blocking for duration 194 if c.configuredTarget("metrics") || c.configuredTarget("logs") || c.configuredTarget("pprof") { 195 err = c.captureDynamic() 196 if err != nil { 197 c.UI.Error(fmt.Sprintf("Error encountered during collection: %v", err)) 198 } 199 } 200 201 // Write the index document 202 idxMarshalled, err := json.MarshalIndent(index, "", "\t") 203 if err != nil { 204 c.UI.Error(fmt.Sprintf("Error marshalling index document: %v", err)) 205 return 1 206 } 207 208 err = ioutil.WriteFile(fmt.Sprintf("%s/index.json", c.output), idxMarshalled, 0644) 209 if err != nil { 210 c.UI.Error(fmt.Sprintf("Error creating index document: %v", err)) 211 return 1 212 } 213 214 // Archive the data if configured to 215 if c.archive { 216 err = c.createArchive() 217 218 if err != nil { 219 c.UI.Warn(fmt.Sprintf("Archive creation failed: %v", err)) 220 return 1 221 } 222 } 223 224 c.UI.Info(fmt.Sprintf("Saved debug archive: %s", archiveName)) 225 226 return 0 227 } 228 229 // prepare validates agent settings against targets and prepares the environment for capturing 230 func (c *cmd) prepare() (version string, err error) { 231 // Ensure realistic duration and intervals exists 232 if c.validateTiming { 233 if c.duration < debugMinDuration { 234 return "", fmt.Errorf("duration must be longer than %s", debugMinDuration) 235 } 236 237 if c.interval < debugMinInterval { 238 return "", fmt.Errorf("interval must be longer than %s", debugMinDuration) 239 } 240 241 if c.duration < c.interval { 242 return "", fmt.Errorf("duration (%s) must be longer than interval (%s)", c.duration, c.interval) 243 } 244 } 245 246 // Retrieve and process agent information necessary to validate 247 self, err := c.client.Agent().Self() 248 if err != nil { 249 return "", fmt.Errorf("error querying target agent: %s. verify connectivity and agent address", err) 250 } 251 252 version, ok := self["Config"]["Version"].(string) 253 if !ok { 254 return "", fmt.Errorf("agent response did not contain version key") 255 } 256 257 debugEnabled, ok := self["DebugConfig"]["EnableDebug"].(bool) 258 if !ok { 259 return version, fmt.Errorf("agent response did not contain debug key") 260 } 261 262 // If none are specified we will collect information from 263 // all by default 264 if len(c.capture) == 0 { 265 c.capture = c.defaultTargets() 266 } 267 268 if !debugEnabled && c.configuredTarget("pprof") { 269 cs := c.capture 270 for i := 0; i < len(cs); i++ { 271 if cs[i] == "pprof" { 272 c.capture = append(cs[:i], cs[i+1:]...) 273 i-- 274 } 275 } 276 c.UI.Warn("[WARN] Unable to capture pprof. Set enable_debug to true on target agent to enable profiling.") 277 } 278 279 for _, t := range c.capture { 280 if !c.allowedTarget(t) { 281 return version, fmt.Errorf("target not found: %s", t) 282 } 283 } 284 285 if _, err := os.Stat(c.output); os.IsNotExist(err) { 286 err := os.MkdirAll(c.output, 0755) 287 if err != nil { 288 return version, fmt.Errorf("could not create output directory: %s", err) 289 } 290 } else { 291 return version, fmt.Errorf("output directory already exists: %s", c.output) 292 } 293 294 return version, nil 295 } 296 297 // captureStatic captures static target information and writes it 298 // to the output path 299 func (c *cmd) captureStatic() error { 300 // Collect errors via multierror as we want to gracefully 301 // fail if an API is inacessible 302 var errors error 303 304 // Collect the named outputs here 305 outputs := make(map[string]interface{}, 0) 306 307 // Capture host information 308 if c.configuredTarget("host") { 309 host, err := c.client.Agent().Host() 310 if err != nil { 311 errors = multierror.Append(errors, err) 312 } 313 outputs["host"] = host 314 } 315 316 // Capture agent information 317 if c.configuredTarget("agent") { 318 agent, err := c.client.Agent().Self() 319 if err != nil { 320 errors = multierror.Append(errors, err) 321 } 322 outputs["agent"] = agent 323 } 324 325 // Capture cluster members information, including WAN 326 if c.configuredTarget("cluster") { 327 members, err := c.client.Agent().Members(true) 328 if err != nil { 329 errors = multierror.Append(errors, err) 330 } 331 outputs["cluster"] = members 332 } 333 334 // Write all outputs to disk as JSON 335 for output, v := range outputs { 336 marshaled, err := json.MarshalIndent(v, "", "\t") 337 if err != nil { 338 errors = multierror.Append(errors, err) 339 } 340 341 err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", c.output, output), marshaled, 0644) 342 if err != nil { 343 errors = multierror.Append(errors, err) 344 } 345 } 346 347 return errors 348 } 349 350 // captureDynamic blocks for the duration of the command 351 // specified by the duration flag, capturing the dynamic 352 // targets at the interval specified 353 func (c *cmd) captureDynamic() error { 354 successChan := make(chan int64) 355 errCh := make(chan error) 356 durationChn := time.After(c.duration) 357 intervalCount := 0 358 359 c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local().String(), intervalCount)) 360 361 // We'll wait for all of the targets configured to be 362 // captured before continuing 363 var wg sync.WaitGroup 364 365 capture := func() { 366 timestamp := time.Now().Local().Unix() 367 368 // Make the directory that will store all captured data 369 // for this interval 370 timestampDir := fmt.Sprintf("%s/%d", c.output, timestamp) 371 err := os.MkdirAll(timestampDir, 0755) 372 if err != nil { 373 errCh <- err 374 } 375 376 // Capture metrics 377 if c.configuredTarget("metrics") { 378 wg.Add(1) 379 380 go func() { 381 metrics, err := c.client.Agent().Metrics() 382 if err != nil { 383 errCh <- err 384 } 385 386 marshaled, err := json.MarshalIndent(metrics, "", "\t") 387 if err != nil { 388 errCh <- err 389 } 390 391 err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", timestampDir, "metrics"), marshaled, 0644) 392 if err != nil { 393 errCh <- err 394 } 395 396 // We need to sleep for the configured interval in the case 397 // of metrics being the only target captured. When it is, 398 // the waitgroup would return on Wait() and repeat without 399 // waiting for the interval. 400 time.Sleep(c.interval) 401 402 wg.Done() 403 }() 404 } 405 406 // Capture pprof 407 if c.configuredTarget("pprof") { 408 wg.Add(1) 409 410 go func() { 411 // We need to capture profiles and traces at the same time 412 // and block for both of them 413 var wgProf sync.WaitGroup 414 415 heap, err := c.client.Debug().Heap() 416 if err != nil { 417 errCh <- err 418 } 419 420 err = ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) 421 if err != nil { 422 errCh <- err 423 } 424 425 // Capture a profile/trace with a minimum of 1s 426 s := c.interval.Seconds() 427 if s < 1 { 428 s = 1 429 } 430 431 go func() { 432 wgProf.Add(1) 433 434 prof, err := c.client.Debug().Profile(int(s)) 435 if err != nil { 436 errCh <- err 437 } 438 439 err = ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644) 440 if err != nil { 441 errCh <- err 442 } 443 444 wgProf.Done() 445 }() 446 447 go func() { 448 wgProf.Add(1) 449 450 trace, err := c.client.Debug().Trace(int(s)) 451 if err != nil { 452 errCh <- err 453 } 454 455 err = ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644) 456 if err != nil { 457 errCh <- err 458 } 459 460 wgProf.Done() 461 }() 462 463 gr, err := c.client.Debug().Goroutine() 464 if err != nil { 465 errCh <- err 466 } 467 468 err = ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) 469 if err != nil { 470 errCh <- err 471 } 472 473 wgProf.Wait() 474 475 wg.Done() 476 }() 477 } 478 479 // Capture logs 480 if c.configuredTarget("logs") { 481 wg.Add(1) 482 483 go func() { 484 endLogChn := make(chan struct{}) 485 logCh, err := c.client.Agent().Monitor("DEBUG", endLogChn, nil) 486 if err != nil { 487 errCh <- err 488 } 489 // Close the log stream 490 defer close(endLogChn) 491 492 // Create the log file for writing 493 f, err := os.Create(fmt.Sprintf("%s/%s", timestampDir, "consul.log")) 494 if err != nil { 495 errCh <- err 496 } 497 defer f.Close() 498 499 intervalChn := time.After(c.interval) 500 501 OUTER: 502 503 for { 504 select { 505 case log := <-logCh: 506 // Append the line to the file 507 if _, err = f.WriteString(log + "\n"); err != nil { 508 errCh <- err 509 break OUTER 510 } 511 // Stop collecting the logs after the interval specified 512 case <-intervalChn: 513 break OUTER 514 } 515 } 516 517 wg.Done() 518 }() 519 } 520 521 // Wait for all captures to complete 522 wg.Wait() 523 524 // Send down the timestamp for UI output 525 successChan <- timestamp 526 } 527 528 go capture() 529 530 for { 531 select { 532 case t := <-successChan: 533 intervalCount++ 534 c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Unix(t, 0).Local().String(), intervalCount)) 535 go capture() 536 case e := <-errCh: 537 c.UI.Error(fmt.Sprintf("Capture failure %s", e)) 538 case <-durationChn: 539 return nil 540 case <-c.shutdownCh: 541 return errors.New("stopping collection due to shutdown signal") 542 } 543 } 544 } 545 546 // allowedTarget returns a boolean if the target is able to be captured 547 func (c *cmd) allowedTarget(target string) bool { 548 for _, dt := range c.defaultTargets() { 549 if dt == target { 550 return true 551 } 552 } 553 return false 554 } 555 556 // configuredTarget returns a boolean if the target is configured to be 557 // captured in the command 558 func (c *cmd) configuredTarget(target string) bool { 559 for _, dt := range c.capture { 560 if dt == target { 561 return true 562 } 563 } 564 return false 565 } 566 567 // createArchive walks the files in the temporary directory 568 // and creates a tar file that is gzipped with the contents 569 func (c *cmd) createArchive() error { 570 path := c.output + debugArchiveExtension 571 572 tempName, err := c.createArchiveTemp(path) 573 if err != nil { 574 return err 575 } 576 577 if err := os.Rename(tempName, path); err != nil { 578 return err 579 } 580 // fsync the dir to make the rename stick 581 if err := syncParentDir(path); err != nil { 582 return err 583 } 584 585 // Remove directory that has been archived 586 if err := os.RemoveAll(c.output); err != nil { 587 return fmt.Errorf("failed to remove archived directory: %s", err) 588 } 589 590 return nil 591 } 592 593 func syncParentDir(name string) error { 594 f, err := os.Open(filepath.Dir(name)) 595 if err != nil { 596 return err 597 } 598 defer f.Close() 599 600 return f.Sync() 601 } 602 603 func (c *cmd) createArchiveTemp(path string) (tempName string, err error) { 604 dir := filepath.Dir(path) 605 name := filepath.Base(path) 606 607 f, err := ioutil.TempFile(dir, name+".tmp") 608 if err != nil { 609 return "", fmt.Errorf("failed to create compressed temp archive: %s", err) 610 } 611 612 g := gzip.NewWriter(f) 613 t := tar.NewWriter(g) 614 615 tempName = f.Name() 616 617 cleanup := func(err error) (string, error) { 618 _ = t.Close() 619 _ = g.Close() 620 _ = f.Close() 621 _ = os.Remove(tempName) 622 return "", err 623 } 624 625 err = filepath.Walk(c.output, func(file string, fi os.FileInfo, err error) error { 626 if err != nil { 627 return fmt.Errorf("failed to walk filepath for archive: %s", err) 628 } 629 630 header, err := tar.FileInfoHeader(fi, fi.Name()) 631 if err != nil { 632 return fmt.Errorf("failed to create compressed archive header: %s", err) 633 } 634 635 header.Name = filepath.Join(filepath.Base(c.output), strings.TrimPrefix(file, c.output)) 636 637 if err := t.WriteHeader(header); err != nil { 638 return fmt.Errorf("failed to write compressed archive header: %s", err) 639 } 640 641 // Only copy files 642 if !fi.Mode().IsRegular() { 643 return nil 644 } 645 646 f, err := os.Open(file) 647 if err != nil { 648 return fmt.Errorf("failed to open target files for archive: %s", err) 649 } 650 651 if _, err := io.Copy(t, f); err != nil { 652 return fmt.Errorf("failed to copy files for archive: %s", err) 653 } 654 655 f.Close() 656 657 return nil 658 }) 659 660 if err != nil { 661 return cleanup(fmt.Errorf("failed to walk output path for archive: %s", err)) 662 } 663 664 // Explicitly close things in the correct order (tar then gzip) so we 665 // know if they worked. 666 if err := t.Close(); err != nil { 667 return cleanup(err) 668 } 669 if err := g.Close(); err != nil { 670 return cleanup(err) 671 } 672 673 // Guarantee that the contents of the temp file are flushed to disk. 674 if err := f.Sync(); err != nil { 675 return cleanup(err) 676 } 677 678 // Close the temp file and go back to the wrapper function for the rest. 679 if err := f.Close(); err != nil { 680 return cleanup(err) 681 } 682 683 return tempName, nil 684 } 685 686 // defaultTargets specifies the list of all targets that 687 // will be captured by default 688 func (c *cmd) defaultTargets() []string { 689 return append(c.dynamicTargets(), c.staticTargets()...) 690 } 691 692 // dynamicTargets returns all the supported targets 693 // that are retrieved at the interval specified 694 func (c *cmd) dynamicTargets() []string { 695 return []string{"metrics", "logs", "pprof"} 696 } 697 698 // staticTargets returns all the supported targets 699 // that are retrieved at the start of the command execution 700 func (c *cmd) staticTargets() []string { 701 return []string{"host", "agent", "cluster"} 702 } 703 704 func (c *cmd) Synopsis() string { 705 return synopsis 706 } 707 708 func (c *cmd) Help() string { 709 return c.help 710 } 711 712 const synopsis = "Records a debugging archive for operators" 713 const help = ` 714 Usage: consul debug [options] 715 716 Monitors a Consul agent for the specified period of time, recording 717 information about the agent, cluster, and environment to an archive 718 written to the specified path. 719 720 If ACLs are enabled, an 'operator:read' token must be supplied in order 721 to perform this operation. 722 723 To create a debug archive in the current directory for the default 724 duration and interval, capturing all information available: 725 726 $ consul debug 727 728 The command stores captured data at the configured output path 729 through the duration, and will archive the data at the same 730 path if interrupted. 731 732 Flags can be used to customize the duration and interval of the 733 operation. Duration is the total time to capture data for from the target 734 agent and interval controls how often dynamic data such as metrics 735 are scraped. 736 737 $ consul debug -interval=20s -duration=1m 738 739 The capture flag can be specified multiple times to limit information 740 retrieved. 741 742 $ consul debug -capture metrics -capture agent 743 744 By default, the archive containing the debugging information is 745 saved to the current directory as a .tar.gz file. The 746 output path can be specified, as well as an option to disable 747 archiving, leaving the directory intact. 748 749 $ consul debug -output=/foo/bar/my-debugging -archive=false 750 751 Note: Information collected by this command has the potential 752 to be highly sensitive. Sensitive material such as ACL tokens and 753 other commonly secret material are redacted automatically, but we 754 strongly recommend review of the data within the archive prior to 755 transmitting it. 756 757 For a full list of options and examples, please see the Consul 758 documentation. 759 `