go.temporal.io/server@v1.23.0/common/deadlock/deadlock.go (about) 1 // The MIT License 2 // 3 // Copyright (c) 2020 Temporal Technologies Inc. All rights reserved. 4 // 5 // Copyright (c) 2020 Uber Technologies, Inc. 6 // 7 // Permission is hereby granted, free of charge, to any person obtaining a copy 8 // of this software and associated documentation files (the "Software"), to deal 9 // in the Software without restriction, including without limitation the rights 10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 // copies of the Software, and to permit persons to whom the Software is 12 // furnished to do so, subject to the following conditions: 13 // 14 // The above copyright notice and this permission notice shall be included in 15 // all copies or substantial portions of the Software. 16 // 17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 // THE SOFTWARE. 24 25 package deadlock 26 27 import ( 28 "context" 29 "runtime/pprof" 30 "strings" 31 "sync/atomic" 32 "time" 33 34 "go.uber.org/fx" 35 "google.golang.org/grpc/health" 36 37 "go.temporal.io/server/common" 38 "go.temporal.io/server/common/dynamicconfig" 39 "go.temporal.io/server/common/log" 40 "go.temporal.io/server/common/log/tag" 41 "go.temporal.io/server/common/metrics" 42 "go.temporal.io/server/internal/goro" 43 ) 44 45 type ( 46 params struct { 47 fx.In 48 49 Logger log.SnTaggedLogger 50 Collection *dynamicconfig.Collection 51 HealthServer *health.Server 52 MetricsHandler metrics.Handler 53 54 Roots []common.Pingable `group:"deadlockDetectorRoots"` 55 } 56 57 config struct { 58 DumpGoroutines dynamicconfig.BoolPropertyFn 59 FailHealthCheck dynamicconfig.BoolPropertyFn 60 AbortProcess dynamicconfig.BoolPropertyFn 61 Interval dynamicconfig.DurationPropertyFn 62 MaxWorkersPerRoot dynamicconfig.IntPropertyFn 63 } 64 65 deadlockDetector struct { 66 logger log.Logger 67 healthServer *health.Server 68 metricsHandler metrics.Handler 69 config config 70 roots []common.Pingable 71 loops goro.Group 72 } 73 74 loopContext struct { 75 dd *deadlockDetector 76 root common.Pingable 77 ch chan common.PingCheck 78 workers int32 79 } 80 ) 81 82 func NewDeadlockDetector(params params) *deadlockDetector { 83 return &deadlockDetector{ 84 logger: params.Logger, 85 healthServer: params.HealthServer, 86 metricsHandler: params.MetricsHandler.WithTags(metrics.OperationTag(metrics.DeadlockDetectorScope)), 87 config: config{ 88 DumpGoroutines: params.Collection.GetBoolProperty(dynamicconfig.DeadlockDumpGoroutines, true), 89 FailHealthCheck: params.Collection.GetBoolProperty(dynamicconfig.DeadlockFailHealthCheck, false), 90 AbortProcess: params.Collection.GetBoolProperty(dynamicconfig.DeadlockAbortProcess, false), 91 Interval: params.Collection.GetDurationProperty(dynamicconfig.DeadlockInterval, 30*time.Second), 92 MaxWorkersPerRoot: params.Collection.GetIntProperty(dynamicconfig.DeadlockMaxWorkersPerRoot, 10), 93 }, 94 roots: params.Roots, 95 } 96 } 97 98 func (dd *deadlockDetector) Start() error { 99 for _, root := range dd.roots { 100 loopCtx := &loopContext{ 101 dd: dd, 102 root: root, 103 ch: make(chan common.PingCheck), 104 } 105 dd.loops.Go(loopCtx.run) 106 } 107 return nil 108 } 109 110 func (dd *deadlockDetector) Stop() error { 111 dd.loops.Cancel() 112 // don't wait for workers to exit, they may be blocked 113 return nil 114 } 115 116 func (dd *deadlockDetector) detected(name string) { 117 dd.logger.Error("potential deadlock detected", tag.Name(name)) 118 119 if dd.config.DumpGoroutines() { 120 dd.dumpGoroutines() 121 } 122 123 if dd.config.FailHealthCheck() { 124 dd.logger.Info("marking grpc services unhealthy") 125 dd.healthServer.Shutdown() 126 } 127 128 if dd.config.AbortProcess() { 129 dd.logger.Fatal("deadlock detected", tag.Name(name)) 130 } 131 } 132 133 func (dd *deadlockDetector) dumpGoroutines() { 134 profile := pprof.Lookup("goroutine") 135 if profile == nil { 136 dd.logger.Error("could not find goroutine profile") 137 return 138 } 139 var b strings.Builder 140 err := profile.WriteTo(&b, 1) // 1 is magic value that means "text format" 141 if err != nil { 142 dd.logger.Error("failed to get goroutine profile", tag.Error(err)) 143 return 144 } 145 // write it as a single log line with embedded newlines. 146 // the value starts with "goroutine profile: total ...\n" so it should be clear 147 dd.logger.Info("dumping goroutine profile for suspected deadlock") 148 dd.logger.Info(b.String()) 149 } 150 151 func (lc *loopContext) run(ctx context.Context) error { 152 for { 153 // ping blocks until it has passed all checks to a worker goroutine (using an 154 // unbuffered channel). 155 lc.ping(ctx, []common.Pingable{lc.root}) 156 157 timer := time.NewTimer(lc.dd.config.Interval()) 158 select { 159 case <-timer.C: 160 case <-ctx.Done(): 161 timer.Stop() 162 return ctx.Err() 163 } 164 } 165 } 166 167 func (lc *loopContext) ping(ctx context.Context, pingables []common.Pingable) { 168 for _, pingable := range pingables { 169 for _, check := range pingable.GetPingChecks() { 170 select { 171 case lc.ch <- check: 172 case <-ctx.Done(): 173 return 174 default: 175 // maybe add another worker if blocked 176 w := atomic.LoadInt32(&lc.workers) 177 if w < int32(lc.dd.config.MaxWorkersPerRoot()) && atomic.CompareAndSwapInt32(&lc.workers, w, w+1) { 178 lc.dd.loops.Go(lc.worker) 179 } 180 // blocking send 181 select { 182 case lc.ch <- check: 183 case <-ctx.Done(): 184 return 185 } 186 } 187 } 188 } 189 } 190 191 func (lc *loopContext) worker(ctx context.Context) error { 192 for { 193 var check common.PingCheck 194 select { 195 case check = <-lc.ch: 196 case <-ctx.Done(): 197 return nil 198 } 199 200 lc.dd.logger.Debug("starting ping check", tag.Name(check.Name)) 201 startTime := time.Now().UTC() 202 203 // Using AfterFunc is cheaper than creating another goroutine to be the waiter, since 204 // we expect to always cancel it. If the go runtime is so messed up that it can't 205 // create a goroutine, that's a bigger problem than we can handle. 206 t := time.AfterFunc(check.Timeout, func() { 207 if ctx.Err() != nil { 208 // deadlock detector was stopped 209 return 210 } 211 lc.dd.detected(check.Name) 212 }) 213 newPingables := check.Ping() 214 t.Stop() 215 if len(check.MetricsName) > 0 { 216 lc.dd.metricsHandler.Timer(check.MetricsName).Record(time.Since(startTime)) 217 } 218 219 lc.dd.logger.Debug("ping check succeeded", tag.Name(check.Name)) 220 221 lc.ping(ctx, newPingables) 222 } 223 }