github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/goroutinedumper/goroutinedumper.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package goroutinedumper 12 13 import ( 14 "compress/gzip" 15 "context" 16 "fmt" 17 "io/ioutil" 18 "os" 19 "path/filepath" 20 "runtime/pprof" 21 "strings" 22 "time" 23 24 "github.com/cockroachdb/cockroach/pkg/settings" 25 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 26 "github.com/cockroachdb/cockroach/pkg/util/log" 27 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 28 "github.com/cockroachdb/errors" 29 ) 30 31 const ( 32 goroutineDumpPrefix = "goroutine_dump" 33 timeFormat = "2006-01-02T15_04_05.999" 34 ) 35 36 var ( 37 numGoroutinesThreshold = settings.RegisterIntSetting( 38 "server.goroutine_dump.num_goroutines_threshold", 39 "a threshold beyond which if number of goroutines increases, "+ 40 "then goroutine dump can be triggered", 41 1000, 42 ) 43 totalDumpSizeLimit = settings.RegisterByteSizeSetting( 44 "server.goroutine_dump.total_dump_size_limit", 45 "total size of goroutine dumps to be kept. "+ 46 "Dumps are GC'ed in the order of creation time. The latest dump is "+ 47 "always kept even if its size exceeds the limit.", 48 500<<20, // 500MiB 49 ) 50 ) 51 52 // heuristic represents whether goroutine dump is triggered. It is true when 53 // we think a goroutine dump is helpful in debugging OOM issues. 54 type heuristic struct { 55 name string 56 isTrue func(s *GoroutineDumper) bool 57 } 58 59 var doubleSinceLastDumpHeuristic = heuristic{ 60 name: "double_since_last_dump", 61 isTrue: func(gd *GoroutineDumper) bool { 62 return gd.goroutines > gd.goroutinesThreshold && 63 gd.goroutines >= 2*gd.maxGoroutinesDumped 64 }, 65 } 66 67 // GoroutineDumper stores relevant functions and stats to take goroutine dumps 68 // if an abnormal change in number of goroutines is detected. 69 type GoroutineDumper struct { 70 goroutines int64 71 goroutinesThreshold int64 72 maxGoroutinesDumped int64 73 heuristics []heuristic 74 currentTime func() time.Time 75 takeGoroutineDump func(dir string, filename string) error 76 gc func(ctx context.Context, dir string, sizeLimit int64) 77 dir string 78 } 79 80 // MaybeDump takes a goroutine dump only when at least one heuristic in 81 // GoroutineDumper is true. 82 // At most one dump is taken in a call of this function. 83 func (gd *GoroutineDumper) MaybeDump(ctx context.Context, st *cluster.Settings, goroutines int64) { 84 gd.goroutines = goroutines 85 if gd.goroutinesThreshold != numGoroutinesThreshold.Get(&st.SV) { 86 gd.goroutinesThreshold = numGoroutinesThreshold.Get(&st.SV) 87 gd.maxGoroutinesDumped = 0 88 } 89 for _, h := range gd.heuristics { 90 if h.isTrue(gd) { 91 filename := fmt.Sprintf( 92 "%s.%s.%s.%09d", 93 goroutineDumpPrefix, 94 gd.currentTime().Format(timeFormat), 95 h.name, 96 goroutines, 97 ) 98 if err := gd.takeGoroutineDump(gd.dir, filename); err != nil { 99 log.Errorf(ctx, "error dumping goroutines: %s", err) 100 continue 101 } 102 gd.maxGoroutinesDumped = goroutines 103 gd.gc(ctx, gd.dir, totalDumpSizeLimit.Get(&st.SV)) 104 break 105 } 106 } 107 } 108 109 // NewGoroutineDumper returns a GoroutineDumper which enables 110 // doubleSinceLastDumpHeuristic. 111 // dir is the directory in which dumps are stored. 112 func NewGoroutineDumper(dir string) (*GoroutineDumper, error) { 113 if dir == "" { 114 return nil, errors.New("directory to store dumps could not be determined") 115 } 116 gd := &GoroutineDumper{ 117 heuristics: []heuristic{ 118 doubleSinceLastDumpHeuristic, 119 }, 120 goroutinesThreshold: 0, 121 maxGoroutinesDumped: 0, 122 currentTime: timeutil.Now, 123 takeGoroutineDump: takeGoroutineDump, 124 gc: gc, 125 dir: dir, 126 } 127 return gd, nil 128 } 129 130 // gc removes oldest dumps when the total size of all dumps is larger 131 // than sizeLimit. Requires that the name of the dumps indicates dump time 132 // such that sorting the filenames corresponds to ordering the dumps 133 // from oldest to newest. 134 // Newest dump in the directory is not considered for GC. 135 func gc(ctx context.Context, dir string, sizeLimit int64) { 136 // ReadDir returns a list of directory entries sorted by filename, which means 137 // it is sorted by dump time. 138 files, err := ioutil.ReadDir(dir) 139 if err != nil { 140 log.Errorf(ctx, "cannot read directory %s, err: %s", dir, err) 141 return 142 } 143 144 var totalSize int64 145 isLatestDump := true 146 for i := len(files) - 1; i >= 0; i-- { 147 f := files[i] 148 path := filepath.Join(dir, f.Name()) 149 if strings.HasPrefix(f.Name(), goroutineDumpPrefix) { 150 totalSize += f.Size() 151 // Skipping the latest dump in gc 152 if isLatestDump { 153 isLatestDump = false 154 continue 155 } 156 if totalSize > sizeLimit { 157 if err := os.Remove(path); err != nil { 158 log.Warningf(ctx, "Cannot remove dump file %s, err: %s", path, err) 159 } 160 } 161 } else { 162 log.Infof(ctx, "Removing unknown file %s in goroutine dump dir %s", f.Name(), dir) 163 if err := os.Remove(path); err != nil { 164 log.Warningf(ctx, "Cannot remove file %s, err: %s", path, err) 165 } 166 } 167 } 168 } 169 170 func takeGoroutineDump(dir string, filename string) error { 171 filename = filename + ".txt.gz" 172 path := filepath.Join(dir, filename) 173 f, err := os.Create(path) 174 if err != nil { 175 return errors.Wrapf(err, "error creating file %s for goroutine dump", path) 176 } 177 defer f.Close() 178 w := gzip.NewWriter(f) 179 if err = pprof.Lookup("goroutine").WriteTo(w, 2); err != nil { 180 return errors.Wrapf(err, "error writing goroutine dump to %s", path) 181 } 182 // Flush and write the gzip header. It doesn't close the underlying writer. 183 if err := w.Close(); err != nil { 184 return errors.Wrapf(err, "error closing gzip writer for %s", path) 185 } 186 // Return f.Close() too so that we don't miss a potential error if everything 187 // else succeeded. 188 return f.Close() 189 }