github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/node_engine_health.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/envutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// maxSyncDuration is the threshold above which an observed engine sync duration
// triggers either a warning or a fatal error.
var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_ENGINE_MAX_SYNC_DURATION", 10*time.Second)

// maxSyncDurationFatalOnExceeded defaults to false due to issues such as
// https://github.com/cockroachdb/cockroach/issues/34860#issuecomment-469262019.
// Similar problems have been known to occur during index backfill and, possibly,
// IMPORT/RESTORE.
var maxSyncDurationFatalOnExceeded = envutil.EnvOrDefaultBool("COCKROACH_ENGINE_MAX_SYNC_DURATION_FATAL", false)
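
// As an illustration only (the values here are hypothetical, not
// recommendations), an operator could raise the sync threshold and opt into
// fatal termination by setting the corresponding environment variables before
// starting the process:
//
//	COCKROACH_ENGINE_MAX_SYNC_DURATION=30s \
//	COCKROACH_ENGINE_MAX_SYNC_DURATION_FATAL=true \
//	cockroach start ...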

// startAssertEngineHealth starts a goroutine that periodically verifies that
// syncing the engines completes within maxSyncDuration. If a sync does not, a
// warning is logged or, when maxSyncDurationFatalOnExceeded is set, the
// process is terminated (with an attempt at a descriptive message).
func (n *Node) startAssertEngineHealth(ctx context.Context, engines []storage.Engine) {
	n.stopper.RunWorker(ctx, func(ctx context.Context) {
		t := timeutil.NewTimer()
		t.Reset(0)

		for {
			select {
			case <-t.C:
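				// timeutil.Timer requires Read to be set to true after its
				// channel fires so that the next Reset call knows the channel
				// has already been drained.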
				t.Read = true
				t.Reset(10 * time.Second)
				n.assertEngineHealth(ctx, engines, maxSyncDuration)
			case <-n.stopper.ShouldQuiesce():
				return
			}
		}
	})
}

func guaranteedExitFatal(ctx context.Context, msg string, args ...interface{}) {
	// NB: log.Shout sets up a timer that guarantees process termination.
	log.Shoutf(ctx, log.Severity_FATAL, msg, args...)
}

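// assertEngineHealth issues a synced no-op write to each engine and logs a
// warning (or, when maxSyncDurationFatalOnExceeded is set, a fatal error) if
// the write has not completed within maxDuration.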
func (n *Node) assertEngineHealth(
	ctx context.Context, engines []storage.Engine, maxDuration time.Duration,
) {
	for _, eng := range engines {
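		// Each engine is checked inside its own closure so that the watchdog
		// timer armed below is stopped (via defer) before moving on to the
		// next engine.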
		func() {
			t := time.AfterFunc(maxDuration, func() {
				n.metrics.DiskStalls.Inc(1)
				stats := "\n" + eng.GetCompactionStats()
				logger := log.Warningf
				if maxSyncDurationFatalOnExceeded {
					logger = guaranteedExitFatal
				}
				// NB: the disk-stall-detected roachtest matches on this message.
				logger(ctx, "disk stall detected: unable to write to %s within %s %s",
					eng, maxSyncDuration, stats,
				)
			})
			defer t.Stop()
			if err := storage.WriteSyncNoop(ctx, eng); err != nil {
				log.Fatalf(ctx, "%v", err)
			}
		}()
	}
}
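
// The pattern above (arm a watchdog with time.AfterFunc, run a blocking
// operation, disarm it once the operation returns) generalizes beyond engine
// syncs. The helper below is an illustrative sketch only, not part of the
// production code path; its name and signature are hypothetical.
func runWithStallWatchdog(op func() error, maxDuration time.Duration, onStall func()) error {
	// Arm the watchdog: onStall fires if op has not returned within maxDuration.
	t := time.AfterFunc(maxDuration, onStall)
	// Disarm the watchdog as soon as op returns.
	defer t.Stop()
	return op()
}

// For example, a single engine sync could hypothetically be wrapped as:
//
//	err := runWithStallWatchdog(
//		func() error { return storage.WriteSyncNoop(ctx, eng) },
//		maxSyncDuration,
//		func() { log.Warningf(ctx, "sync to %s is stalled", eng) },
//	)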