github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/consistency_queue.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/util/envutil"
	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

var consistencyCheckInterval = settings.RegisterNonNegativeDurationSetting(
	"server.consistency_check.interval",
	"the time between range consistency checks; set to 0 to disable consistency checking."+
		" Note that intervals that are too short can negatively impact performance.",
	24*time.Hour,
)

// testingAggressiveConsistencyChecks, driven by the
// COCKROACH_CONSISTENCY_AGGRESSIVE env var, makes tests run consistency
// checks more aggressively.
var testingAggressiveConsistencyChecks = envutil.EnvOrDefaultBool("COCKROACH_CONSISTENCY_AGGRESSIVE", false)

// consistencyQueue periodically runs consistency checks on ranges for which
// this store holds the lease.
type consistencyQueue struct {
	*baseQueue
	interval       func() time.Duration
	replicaCountFn func() int
}

// newConsistencyQueue returns a new instance of consistencyQueue.
func newConsistencyQueue(store *Store, gossip *gossip.Gossip) *consistencyQueue {
	q := &consistencyQueue{
		interval: func() time.Duration {
			return consistencyCheckInterval.Get(&store.ClusterSettings().SV)
		},
		replicaCountFn: store.ReplicaCount,
	}
	q.baseQueue = newBaseQueue(
		"consistencyChecker", q, store, gossip,
		queueConfig{
			maxSize:              defaultQueueMaxSize,
			needsLease:           true,
			needsSystemConfig:    false,
			acceptsUnsplitRanges: true,
			successes:            store.metrics.ConsistencyQueueSuccesses,
			failures:             store.metrics.ConsistencyQueueFailures,
			pending:              store.metrics.ConsistencyQueuePending,
			processingNanos:      store.metrics.ConsistencyQueueProcessingNanos,
		},
	)
	return q
}

func (q *consistencyQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
) (bool, float64) {
	interval := q.interval()
	if interval <= 0 {
		return false, 0
	}

	shouldQ, priority := true, float64(0)
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			return false, 0
		}
		if shouldQ, priority = shouldQueueAgain(now, lpTS, interval); !shouldQ {
			return false, 0
		}
	}
	// Check that all replicas are live. Some tests run without a NodeLiveness
	// configured.
	if repl.store.cfg.NodeLiveness != nil {
		for _, rep := range repl.Desc().Replicas().All() {
			if live, err := repl.store.cfg.NodeLiveness.IsLive(rep.NodeID); err != nil {
				log.VErrEventf(ctx, 3, "node %d liveness failed: %s", rep.NodeID, err)
				return false, 0
			} else if !live {
				return false, 0
			}
		}
	}
	return true, priority
}
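// A minimal sketch of how the elapsed-time gate in shouldQueue above behaves.
// shouldQueueAgain is defined elsewhere in this package, not in this file;
// the timestamps below are illustrative assumptions. The gist: once more than
// the configured interval has elapsed since the last check, the range is
// queued, and ranges that have gone unchecked longest sort first.
//
//	lpTS := hlc.Timestamp{WallTime: (1 * time.Hour).Nanoseconds()}  // last processed at t=1h
//	now := hlc.Timestamp{WallTime: (26 * time.Hour).Nanoseconds()}  // 25h later
//	shouldQ, priority := shouldQueueAgain(now, lpTS, 24*time.Hour)
//	// shouldQ == true; priority grows with the time elapsed beyond the
//	// 24h interval.
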
// process() is called on every range for which this node is a lease holder.
func (q *consistencyQueue) process(
	ctx context.Context, repl *Replica, _ *config.SystemConfig,
) error {
	if q.interval() <= 0 {
		return nil
	}

	// Call setQueueLastProcessed because the consistency checker targets a much
	// longer cycle time than other queues. That it ignores errors is likely a
	// historical accident that should be revisited.
	if err := repl.setQueueLastProcessed(ctx, q.name, repl.store.Clock().Now()); err != nil {
		log.VErrEventf(ctx, 2, "failed to update last processed time: %v", err)
	}

	req := roachpb.CheckConsistencyRequest{
		// Tell CheckConsistency that the caller is the queue. This triggers
		// code to handle inconsistencies by recomputing with a diff and
		// instructing the nodes in the minority to terminate with a fatal
		// error. It also triggers a stats readjustment if there is no
		// inconsistency but the persisted stats are found to disagree with
		// those reflected in the data. All of this really ought to be lifted
		// into the queue in the future.
		Mode: roachpb.ChecksumMode_CHECK_VIA_QUEUE,
	}
	resp, pErr := repl.CheckConsistency(ctx, req)
	if pErr != nil {
		var shouldQuiesce bool
		select {
		case <-repl.store.Stopper().ShouldQuiesce():
			shouldQuiesce = true
		default:
		}

		if shouldQuiesce && grpcutil.IsClosedConnection(pErr.GoError()) {
			// Suppress noisy errors about closed GRPC connections when the
			// server is quiescing.
			return nil
		}
		err := pErr.GoError()
		log.Errorf(ctx, "%v", err)
		return err
	}
	if fn := repl.store.cfg.TestingKnobs.ConsistencyTestingKnobs.ConsistencyQueueResultHook; fn != nil {
		fn(resp)
	}
	return nil
}

func (q *consistencyQueue) timer(duration time.Duration) time.Duration {
	// Space the store's replicas evenly over the check interval, so that each
	// replica is checked roughly once per interval.
	replicaCount := q.replicaCountFn()
	if replicaCount == 0 {
		return 0
	}
	replInterval := q.interval() / time.Duration(replicaCount)
	if replInterval < duration {
		return 0
	}
	return replInterval - duration
}

// purgatoryChan returns nil.
func (*consistencyQueue) purgatoryChan() <-chan time.Time {
	return nil
}
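// A worked example of the pacing in timer() above, with illustrative numbers
// that are not taken from this file: a default 24h check interval and a store
// holding 1,000 replicas yields roughly one consistency check every 86 seconds.
//
//	interval := 24 * time.Hour                              // server.consistency_check.interval
//	replicaCount := 1000                                    // q.replicaCountFn()
//	replInterval := interval / time.Duration(replicaCount)  // 86.4s between checks
//	// If processing the previous replica already took longer than
//	// replInterval, timer returns 0 and the next check starts immediately.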