github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/goroutinedumper/goroutinedumper.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package goroutinedumper
    12  
    13  import (
    14  	"compress/gzip"
    15  	"context"
    16  	"fmt"
    17  	"io/ioutil"
    18  	"os"
    19  	"path/filepath"
    20  	"runtime/pprof"
    21  	"strings"
    22  	"time"
    23  
    24  	"github.com/cockroachdb/cockroach/pkg/settings"
    25  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  	"github.com/cockroachdb/errors"
    29  )
    30  
    31  const (
    32  	goroutineDumpPrefix = "goroutine_dump"
    33  	timeFormat          = "2006-01-02T15_04_05.999"
    34  )
    35  
    36  var (
    37  	numGoroutinesThreshold = settings.RegisterIntSetting(
    38  		"server.goroutine_dump.num_goroutines_threshold",
    39  		"a threshold beyond which if number of goroutines increases, "+
    40  			"then goroutine dump can be triggered",
    41  		1000,
    42  	)
    43  	totalDumpSizeLimit = settings.RegisterByteSizeSetting(
    44  		"server.goroutine_dump.total_dump_size_limit",
    45  		"total size of goroutine dumps to be kept. "+
    46  			"Dumps are GC'ed in the order of creation time. The latest dump is "+
    47  			"always kept even if its size exceeds the limit.",
    48  		500<<20, // 500MiB
    49  	)
    50  )
    51  
    52  // heuristic represents whether goroutine dump is triggered. It is true when
    53  // we think a goroutine dump is helpful in debugging OOM issues.
    54  type heuristic struct {
    55  	name   string
    56  	isTrue func(s *GoroutineDumper) bool
    57  }
    58  
    59  var doubleSinceLastDumpHeuristic = heuristic{
    60  	name: "double_since_last_dump",
    61  	isTrue: func(gd *GoroutineDumper) bool {
    62  		return gd.goroutines > gd.goroutinesThreshold &&
    63  			gd.goroutines >= 2*gd.maxGoroutinesDumped
    64  	},
    65  }
    66  
    67  // GoroutineDumper stores relevant functions and stats to take goroutine dumps
    68  // if an abnormal change in number of goroutines is detected.
    69  type GoroutineDumper struct {
    70  	goroutines          int64
    71  	goroutinesThreshold int64
    72  	maxGoroutinesDumped int64
    73  	heuristics          []heuristic
    74  	currentTime         func() time.Time
    75  	takeGoroutineDump   func(dir string, filename string) error
    76  	gc                  func(ctx context.Context, dir string, sizeLimit int64)
    77  	dir                 string
    78  }
    79  
    80  // MaybeDump takes a goroutine dump only when at least one heuristic in
    81  // GoroutineDumper is true.
    82  // At most one dump is taken in a call of this function.
    83  func (gd *GoroutineDumper) MaybeDump(ctx context.Context, st *cluster.Settings, goroutines int64) {
    84  	gd.goroutines = goroutines
    85  	if gd.goroutinesThreshold != numGoroutinesThreshold.Get(&st.SV) {
    86  		gd.goroutinesThreshold = numGoroutinesThreshold.Get(&st.SV)
    87  		gd.maxGoroutinesDumped = 0
    88  	}
    89  	for _, h := range gd.heuristics {
    90  		if h.isTrue(gd) {
    91  			filename := fmt.Sprintf(
    92  				"%s.%s.%s.%09d",
    93  				goroutineDumpPrefix,
    94  				gd.currentTime().Format(timeFormat),
    95  				h.name,
    96  				goroutines,
    97  			)
    98  			if err := gd.takeGoroutineDump(gd.dir, filename); err != nil {
    99  				log.Errorf(ctx, "error dumping goroutines: %s", err)
   100  				continue
   101  			}
   102  			gd.maxGoroutinesDumped = goroutines
   103  			gd.gc(ctx, gd.dir, totalDumpSizeLimit.Get(&st.SV))
   104  			break
   105  		}
   106  	}
   107  }
   108  
   109  // NewGoroutineDumper returns a GoroutineDumper which enables
   110  // doubleSinceLastDumpHeuristic.
   111  // dir is the directory in which dumps are stored.
   112  func NewGoroutineDumper(dir string) (*GoroutineDumper, error) {
   113  	if dir == "" {
   114  		return nil, errors.New("directory to store dumps could not be determined")
   115  	}
   116  	gd := &GoroutineDumper{
   117  		heuristics: []heuristic{
   118  			doubleSinceLastDumpHeuristic,
   119  		},
   120  		goroutinesThreshold: 0,
   121  		maxGoroutinesDumped: 0,
   122  		currentTime:         timeutil.Now,
   123  		takeGoroutineDump:   takeGoroutineDump,
   124  		gc:                  gc,
   125  		dir:                 dir,
   126  	}
   127  	return gd, nil
   128  }
   129  
   130  // gc removes oldest dumps when the total size of all dumps is larger
   131  // than sizeLimit. Requires that the name of the dumps indicates dump time
   132  // such that sorting the filenames corresponds to ordering the dumps
   133  // from oldest to newest.
   134  // Newest dump in the directory is not considered for GC.
   135  func gc(ctx context.Context, dir string, sizeLimit int64) {
   136  	// ReadDir returns a list of directory entries sorted by filename, which means
   137  	// it is sorted by dump time.
   138  	files, err := ioutil.ReadDir(dir)
   139  	if err != nil {
   140  		log.Errorf(ctx, "cannot read directory %s, err: %s", dir, err)
   141  		return
   142  	}
   143  
   144  	var totalSize int64
   145  	isLatestDump := true
   146  	for i := len(files) - 1; i >= 0; i-- {
   147  		f := files[i]
   148  		path := filepath.Join(dir, f.Name())
   149  		if strings.HasPrefix(f.Name(), goroutineDumpPrefix) {
   150  			totalSize += f.Size()
   151  			// Skipping the latest dump in gc
   152  			if isLatestDump {
   153  				isLatestDump = false
   154  				continue
   155  			}
   156  			if totalSize > sizeLimit {
   157  				if err := os.Remove(path); err != nil {
   158  					log.Warningf(ctx, "Cannot remove dump file %s, err: %s", path, err)
   159  				}
   160  			}
   161  		} else {
   162  			log.Infof(ctx, "Removing unknown file %s in goroutine dump dir %s", f.Name(), dir)
   163  			if err := os.Remove(path); err != nil {
   164  				log.Warningf(ctx, "Cannot remove file %s, err: %s", path, err)
   165  			}
   166  		}
   167  	}
   168  }
   169  
   170  func takeGoroutineDump(dir string, filename string) error {
   171  	filename = filename + ".txt.gz"
   172  	path := filepath.Join(dir, filename)
   173  	f, err := os.Create(path)
   174  	if err != nil {
   175  		return errors.Wrapf(err, "error creating file %s for goroutine dump", path)
   176  	}
   177  	defer f.Close()
   178  	w := gzip.NewWriter(f)
   179  	if err = pprof.Lookup("goroutine").WriteTo(w, 2); err != nil {
   180  		return errors.Wrapf(err, "error writing goroutine dump to %s", path)
   181  	}
   182  	// Flush and write the gzip header. It doesn't close the underlying writer.
   183  	if err := w.Close(); err != nil {
   184  		return errors.Wrapf(err, "error closing gzip writer for %s", path)
   185  	}
   186  	// Return f.Close() too so that we don't miss a potential error if everything
   187  	// else succeeded.
   188  	return f.Close()
   189  }