github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/node_engine_health.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package server 12 13 import ( 14 "context" 15 "time" 16 17 "github.com/cockroachdb/cockroach/pkg/storage" 18 "github.com/cockroachdb/cockroach/pkg/util/envutil" 19 "github.com/cockroachdb/cockroach/pkg/util/log" 20 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 21 ) 22 23 // maxSyncDuration is the threshold above which an observed engine sync duration 24 // triggers either a warning or a fatal error. 25 var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_ENGINE_MAX_SYNC_DURATION", 10*time.Second) 26 27 // maxSyncDurationFatalOnExceeded defaults to false due to issues such as 28 // https://github.com/cockroachdb/cockroach/issues/34860#issuecomment-469262019. 29 // Similar problems have been known to occur during index backfill and, possibly, 30 // IMPORT/RESTORE. 31 var maxSyncDurationFatalOnExceeded = envutil.EnvOrDefaultBool("COCKROACH_ENGINE_MAX_SYNC_DURATION_FATAL", false) 32 33 // startAssertEngineHealth starts a goroutine that periodically verifies that 34 // syncing the engines is possible within maxSyncDuration. If not, 35 // the process is terminated (with an attempt at a descriptive message). 36 func (n *Node) startAssertEngineHealth(ctx context.Context, engines []storage.Engine) { 37 n.stopper.RunWorker(ctx, func(ctx context.Context) { 38 t := timeutil.NewTimer() 39 t.Reset(0) 40 41 for { 42 select { 43 case <-t.C: 44 t.Read = true 45 t.Reset(10 * time.Second) 46 n.assertEngineHealth(ctx, engines, maxSyncDuration) 47 case <-n.stopper.ShouldQuiesce(): 48 return 49 } 50 } 51 }) 52 } 53 54 func guaranteedExitFatal(ctx context.Context, msg string, args ...interface{}) { 55 // NB: log.Shout sets up a timer that guarantees process termination. 56 log.Shoutf(ctx, log.Severity_FATAL, msg, args...) 57 } 58 59 func (n *Node) assertEngineHealth( 60 ctx context.Context, engines []storage.Engine, maxDuration time.Duration, 61 ) { 62 for _, eng := range engines { 63 func() { 64 t := time.AfterFunc(maxDuration, func() { 65 n.metrics.DiskStalls.Inc(1) 66 stats := "\n" + eng.GetCompactionStats() 67 logger := log.Warningf 68 if maxSyncDurationFatalOnExceeded { 69 logger = guaranteedExitFatal 70 } 71 // NB: the disk-stall-detected roachtest matches on this message. 72 logger(ctx, "disk stall detected: unable to write to %s within %s %s", 73 eng, maxSyncDuration, stats, 74 ) 75 }) 76 defer t.Stop() 77 if err := storage.WriteSyncNoop(ctx, eng); err != nil { 78 log.Fatalf(ctx, "%v", err) 79 } 80 }() 81 } 82 }