github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/consistency_queue.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/config"
    18  	"github.com/cockroachdb/cockroach/pkg/gossip"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/settings"
    21  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    22  	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
    23  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  )
    26  
// consistencyCheckInterval is the cluster setting
// "server.consistency_check.interval". It is registered as a non-negative
// duration with a default of 24 hours; per its description, a value of 0
// disables consistency checking.
var consistencyCheckInterval = settings.RegisterNonNegativeDurationSetting(
	"server.consistency_check.interval",
	"the time between range consistency checks; set to 0 to disable consistency checking."+
		" Note that intervals that are too short can negatively impact performance.",
	24*time.Hour,
)
    33  
// testingAggressiveConsistencyChecks is initialized from the
// COCKROACH_CONSISTENCY_AGGRESSIVE environment variable and defaults to false.
// NOTE(review): nothing in the visible part of this file reads it; presumably
// it is consumed elsewhere in the package — confirm before removing.
var testingAggressiveConsistencyChecks = envutil.EnvOrDefaultBool("COCKROACH_CONSISTENCY_AGGRESSIVE", false)
    35  
// consistencyQueue is the queue that issues consistency checks against
// replicas. It embeds a baseQueue and supplies the queue-specific hooks
// (shouldQueue, process, timer, purgatoryChan).
type consistencyQueue struct {
	*baseQueue
	// interval returns the target time between consistency checks. A value
	// <= 0 makes shouldQueue and process do nothing.
	interval       func() time.Duration
	// replicaCountFn returns the replica count that timer uses to spread
	// checks across the interval.
	replicaCountFn func() int
}
    41  
    42  // newConsistencyQueue returns a new instance of consistencyQueue.
    43  func newConsistencyQueue(store *Store, gossip *gossip.Gossip) *consistencyQueue {
    44  	q := &consistencyQueue{
    45  		interval: func() time.Duration {
    46  			return consistencyCheckInterval.Get(&store.ClusterSettings().SV)
    47  		},
    48  		replicaCountFn: store.ReplicaCount,
    49  	}
    50  	q.baseQueue = newBaseQueue(
    51  		"consistencyChecker", q, store, gossip,
    52  		queueConfig{
    53  			maxSize:              defaultQueueMaxSize,
    54  			needsLease:           true,
    55  			needsSystemConfig:    false,
    56  			acceptsUnsplitRanges: true,
    57  			successes:            store.metrics.ConsistencyQueueSuccesses,
    58  			failures:             store.metrics.ConsistencyQueueFailures,
    59  			pending:              store.metrics.ConsistencyQueuePending,
    60  			processingNanos:      store.metrics.ConsistencyQueueProcessingNanos,
    61  		},
    62  	)
    63  	return q
    64  }
    65  
    66  func (q *consistencyQueue) shouldQueue(
    67  	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
    68  ) (bool, float64) {
    69  	interval := q.interval()
    70  	if interval <= 0 {
    71  		return false, 0
    72  	}
    73  
    74  	shouldQ, priority := true, float64(0)
    75  	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
    76  		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
    77  		if err != nil {
    78  			return false, 0
    79  		}
    80  		if shouldQ, priority = shouldQueueAgain(now, lpTS, interval); !shouldQ {
    81  			return false, 0
    82  		}
    83  	}
    84  	// Check if all replicas are live. Some tests run without a NodeLiveness configured.
    85  	if repl.store.cfg.NodeLiveness != nil {
    86  		for _, rep := range repl.Desc().Replicas().All() {
    87  			if live, err := repl.store.cfg.NodeLiveness.IsLive(rep.NodeID); err != nil {
    88  				log.VErrEventf(ctx, 3, "node %d liveness failed: %s", rep.NodeID, err)
    89  				return false, 0
    90  			} else if !live {
    91  				return false, 0
    92  			}
    93  		}
    94  	}
    95  	return true, priority
    96  }
    97  
    98  // process() is called on every range for which this node is a lease holder.
    99  func (q *consistencyQueue) process(
   100  	ctx context.Context, repl *Replica, _ *config.SystemConfig,
   101  ) error {
   102  	if q.interval() <= 0 {
   103  		return nil
   104  	}
   105  
   106  	// Call setQueueLastProcessed because the consistency checker targets a much
   107  	// longer cycle time than other queues. That it ignores errors is likely a
   108  	// historical accident that should be revisited.
   109  	if err := repl.setQueueLastProcessed(ctx, q.name, repl.store.Clock().Now()); err != nil {
   110  		log.VErrEventf(ctx, 2, "failed to update last processed time: %v", err)
   111  	}
   112  
   113  	req := roachpb.CheckConsistencyRequest{
   114  		// Tell CheckConsistency that the caller is the queue. This triggers
   115  		// code to handle inconsistencies by recomputing with a diff and
   116  		// instructing the nodes in the minority to terminate with a fatal
   117  		// error. It also triggers a stats readjustment if there is no
   118  		// inconsistency but the persisted stats are found to disagree with
   119  		// those reflected in the data. All of this really ought to be lifted
   120  		// into the queue in the future.
   121  		Mode: roachpb.ChecksumMode_CHECK_VIA_QUEUE,
   122  	}
   123  	resp, pErr := repl.CheckConsistency(ctx, req)
   124  	if pErr != nil {
   125  		var shouldQuiesce bool
   126  		select {
   127  		case <-repl.store.Stopper().ShouldQuiesce():
   128  			shouldQuiesce = true
   129  		default:
   130  		}
   131  
   132  		if !shouldQuiesce || !grpcutil.IsClosedConnection(pErr.GoError()) {
   133  			// Suppress noisy errors about closed GRPC connections when the
   134  			// server is quiescing.
   135  			err := pErr.GoError()
   136  			log.Errorf(ctx, "%v", err)
   137  			return err
   138  		}
   139  		return nil
   140  	}
   141  	if fn := repl.store.cfg.TestingKnobs.ConsistencyTestingKnobs.ConsistencyQueueResultHook; fn != nil {
   142  		fn(resp)
   143  	}
   144  	return nil
   145  }
   146  
   147  func (q *consistencyQueue) timer(duration time.Duration) time.Duration {
   148  	// An interval between replicas to space consistency checks out over
   149  	// the check interval.
   150  	replicaCount := q.replicaCountFn()
   151  	if replicaCount == 0 {
   152  		return 0
   153  	}
   154  	replInterval := q.interval() / time.Duration(replicaCount)
   155  	if replInterval < duration {
   156  		return 0
   157  	}
   158  	return replInterval - duration
   159  }
   160  
   161  // purgatoryChan returns nil.
   162  func (*consistencyQueue) purgatoryChan() <-chan time.Time {
   163  	return nil
   164  }