go.temporal.io/server@v1.23.0/common/deadlock/deadlock.go (about)

     1  // The MIT License
     2  //
     3  // Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
     4  //
     5  // Copyright (c) 2020 Uber Technologies, Inc.
     6  //
     7  // Permission is hereby granted, free of charge, to any person obtaining a copy
     8  // of this software and associated documentation files (the "Software"), to deal
     9  // in the Software without restriction, including without limitation the rights
    10  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    11  // copies of the Software, and to permit persons to whom the Software is
    12  // furnished to do so, subject to the following conditions:
    13  //
    14  // The above copyright notice and this permission notice shall be included in
    15  // all copies or substantial portions of the Software.
    16  //
    17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    23  // THE SOFTWARE.
    24  
    25  package deadlock
    26  
    27  import (
    28  	"context"
    29  	"runtime/pprof"
    30  	"strings"
    31  	"sync/atomic"
    32  	"time"
    33  
    34  	"go.uber.org/fx"
    35  	"google.golang.org/grpc/health"
    36  
    37  	"go.temporal.io/server/common"
    38  	"go.temporal.io/server/common/dynamicconfig"
    39  	"go.temporal.io/server/common/log"
    40  	"go.temporal.io/server/common/log/tag"
    41  	"go.temporal.io/server/common/metrics"
    42  	"go.temporal.io/server/internal/goro"
    43  )
    44  
    45  type (
    46  	params struct {
    47  		fx.In
    48  
    49  		Logger         log.SnTaggedLogger
    50  		Collection     *dynamicconfig.Collection
    51  		HealthServer   *health.Server
    52  		MetricsHandler metrics.Handler
    53  
    54  		Roots []common.Pingable `group:"deadlockDetectorRoots"`
    55  	}
    56  
    57  	config struct {
    58  		DumpGoroutines    dynamicconfig.BoolPropertyFn
    59  		FailHealthCheck   dynamicconfig.BoolPropertyFn
    60  		AbortProcess      dynamicconfig.BoolPropertyFn
    61  		Interval          dynamicconfig.DurationPropertyFn
    62  		MaxWorkersPerRoot dynamicconfig.IntPropertyFn
    63  	}
    64  
    65  	deadlockDetector struct {
    66  		logger         log.Logger
    67  		healthServer   *health.Server
    68  		metricsHandler metrics.Handler
    69  		config         config
    70  		roots          []common.Pingable
    71  		loops          goro.Group
    72  	}
    73  
    74  	loopContext struct {
    75  		dd      *deadlockDetector
    76  		root    common.Pingable
    77  		ch      chan common.PingCheck
    78  		workers int32
    79  	}
    80  )
    81  
    82  func NewDeadlockDetector(params params) *deadlockDetector {
    83  	return &deadlockDetector{
    84  		logger:         params.Logger,
    85  		healthServer:   params.HealthServer,
    86  		metricsHandler: params.MetricsHandler.WithTags(metrics.OperationTag(metrics.DeadlockDetectorScope)),
    87  		config: config{
    88  			DumpGoroutines:    params.Collection.GetBoolProperty(dynamicconfig.DeadlockDumpGoroutines, true),
    89  			FailHealthCheck:   params.Collection.GetBoolProperty(dynamicconfig.DeadlockFailHealthCheck, false),
    90  			AbortProcess:      params.Collection.GetBoolProperty(dynamicconfig.DeadlockAbortProcess, false),
    91  			Interval:          params.Collection.GetDurationProperty(dynamicconfig.DeadlockInterval, 30*time.Second),
    92  			MaxWorkersPerRoot: params.Collection.GetIntProperty(dynamicconfig.DeadlockMaxWorkersPerRoot, 10),
    93  		},
    94  		roots: params.Roots,
    95  	}
    96  }
    97  
    98  func (dd *deadlockDetector) Start() error {
    99  	for _, root := range dd.roots {
   100  		loopCtx := &loopContext{
   101  			dd:   dd,
   102  			root: root,
   103  			ch:   make(chan common.PingCheck),
   104  		}
   105  		dd.loops.Go(loopCtx.run)
   106  	}
   107  	return nil
   108  }
   109  
   110  func (dd *deadlockDetector) Stop() error {
   111  	dd.loops.Cancel()
   112  	// don't wait for workers to exit, they may be blocked
   113  	return nil
   114  }
   115  
   116  func (dd *deadlockDetector) detected(name string) {
   117  	dd.logger.Error("potential deadlock detected", tag.Name(name))
   118  
   119  	if dd.config.DumpGoroutines() {
   120  		dd.dumpGoroutines()
   121  	}
   122  
   123  	if dd.config.FailHealthCheck() {
   124  		dd.logger.Info("marking grpc services unhealthy")
   125  		dd.healthServer.Shutdown()
   126  	}
   127  
   128  	if dd.config.AbortProcess() {
   129  		dd.logger.Fatal("deadlock detected", tag.Name(name))
   130  	}
   131  }
   132  
   133  func (dd *deadlockDetector) dumpGoroutines() {
   134  	profile := pprof.Lookup("goroutine")
   135  	if profile == nil {
   136  		dd.logger.Error("could not find goroutine profile")
   137  		return
   138  	}
   139  	var b strings.Builder
   140  	err := profile.WriteTo(&b, 1) // 1 is magic value that means "text format"
   141  	if err != nil {
   142  		dd.logger.Error("failed to get goroutine profile", tag.Error(err))
   143  		return
   144  	}
   145  	// write it as a single log line with embedded newlines.
   146  	// the value starts with "goroutine profile: total ...\n" so it should be clear
   147  	dd.logger.Info("dumping goroutine profile for suspected deadlock")
   148  	dd.logger.Info(b.String())
   149  }
   150  
   151  func (lc *loopContext) run(ctx context.Context) error {
   152  	for {
   153  		// ping blocks until it has passed all checks to a worker goroutine (using an
   154  		// unbuffered channel).
   155  		lc.ping(ctx, []common.Pingable{lc.root})
   156  
   157  		timer := time.NewTimer(lc.dd.config.Interval())
   158  		select {
   159  		case <-timer.C:
   160  		case <-ctx.Done():
   161  			timer.Stop()
   162  			return ctx.Err()
   163  		}
   164  	}
   165  }
   166  
   167  func (lc *loopContext) ping(ctx context.Context, pingables []common.Pingable) {
   168  	for _, pingable := range pingables {
   169  		for _, check := range pingable.GetPingChecks() {
   170  			select {
   171  			case lc.ch <- check:
   172  			case <-ctx.Done():
   173  				return
   174  			default:
   175  				// maybe add another worker if blocked
   176  				w := atomic.LoadInt32(&lc.workers)
   177  				if w < int32(lc.dd.config.MaxWorkersPerRoot()) && atomic.CompareAndSwapInt32(&lc.workers, w, w+1) {
   178  					lc.dd.loops.Go(lc.worker)
   179  				}
   180  				// blocking send
   181  				select {
   182  				case lc.ch <- check:
   183  				case <-ctx.Done():
   184  					return
   185  				}
   186  			}
   187  		}
   188  	}
   189  }
   190  
   191  func (lc *loopContext) worker(ctx context.Context) error {
   192  	for {
   193  		var check common.PingCheck
   194  		select {
   195  		case check = <-lc.ch:
   196  		case <-ctx.Done():
   197  			return nil
   198  		}
   199  
   200  		lc.dd.logger.Debug("starting ping check", tag.Name(check.Name))
   201  		startTime := time.Now().UTC()
   202  
   203  		// Using AfterFunc is cheaper than creating another goroutine to be the waiter, since
   204  		// we expect to always cancel it. If the go runtime is so messed up that it can't
   205  		// create a goroutine, that's a bigger problem than we can handle.
   206  		t := time.AfterFunc(check.Timeout, func() {
   207  			if ctx.Err() != nil {
   208  				// deadlock detector was stopped
   209  				return
   210  			}
   211  			lc.dd.detected(check.Name)
   212  		})
   213  		newPingables := check.Ping()
   214  		t.Stop()
   215  		if len(check.MetricsName) > 0 {
   216  			lc.dd.metricsHandler.Timer(check.MetricsName).Record(time.Since(startTime))
   217  		}
   218  
   219  		lc.dd.logger.Debug("ping check succeeded", tag.Name(check.Name))
   220  
   221  		lc.ping(ctx, newPingables)
   222  	}
   223  }