github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/physicalplan/replicaoracle/oracle.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // Package replicaoracle provides functionality for physicalplan to choose a
    12  // replica for a range.
    13  package replicaoracle
    14  
    15  import (
    16  	"context"
    17  	"math"
    18  	"math/rand"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/gossip"
    21  	"github.com/cockroachdb/cockroach/pkg/kv"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/rpc"
    25  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/errors"
    29  )
    30  
// Policy determines how an Oracle should select a replica. Policy values are
// handed out by RegisterPolicy and are the keys NewOracleFactory uses to look
// up the matching OracleFactoryFunc.
type Policy byte
    33  
// The built-in policies. Each value is whatever RegisterPolicy returns when
// the registration runs (the number of policies registered so far), so the
// numeric values depend on the order in which these registrations execute.
var (
	// RandomChoice chooses a replica of the range at random.
	RandomChoice = RegisterPolicy(newRandomOracleFactory)
	// BinPackingChoice bin-packs the choices: it prefers replicas on nodes that
	// were already assigned ranges for the same query.
	BinPackingChoice = RegisterPolicy(newBinPackingOracleFactory)
	// ClosestChoice chooses the node closest to the current node.
	ClosestChoice = RegisterPolicy(newClosestOracleFactory)
)
    42  
// Config is used to construct an OracleFactory.
//
// As used by the factories in this file:
//   - NodeDesc is the descriptor of the current node; the closest and
//     bin-packing oracles use it to order replicas by proximity.
//   - Settings is not read by any factory in this file.
//   - Gossip is used to build the replica slice for a range and to map a
//     store ID to its node ID.
//   - RPCContext supplies the latency function used when ordering replicas;
//     it may be nil, in which case no latency function is used.
//   - LeaseHolderCache is consulted by the bin-packing oracle to reuse a
//     cached lease holder.
type Config struct {
	NodeDesc         roachpb.NodeDescriptor
	Settings         *cluster.Settings
	Gossip           gossip.DeprecatedOracleGossip
	RPCContext       *rpc.Context
	LeaseHolderCache *kvcoord.LeaseHolderCache
}
    51  
// Oracle is used to choose the lease holder for ranges. This
// interface was extracted so we can experiment with different choosing
// policies.
// Note that choices that start out random can act as self-fulfilling prophecies
// - if there's no active lease, the node that will be asked to execute part of
// the query (the chosen node) will acquire a new lease.
type Oracle interface {
	// ChoosePreferredReplica returns a choice for one range. Implementors are
	// free to use the queryState param, which has info about the number of
	// ranges already handled by each node for the current SQL query. The state
	// is not updated with the result of this method; the caller is in charge of
	// that.
	//
	// A RangeUnavailableError can be returned if there's no information in gossip
	// about any of the nodes that might be tried.
	ChoosePreferredReplica(
		context.Context, roachpb.RangeDescriptor, QueryState,
	) (roachpb.ReplicaDescriptor, error)
}
    70  
// OracleFactory creates an oracle for a Txn. The implementations in this file
// ignore the transaction and return the factory itself as the Oracle.
type OracleFactory interface {
	// Oracle returns the Oracle to use for the given transaction.
	Oracle(*kv.Txn) Oracle
}
    75  
// OracleFactoryFunc creates an OracleFactory from a Config. Functions of this
// type are registered with RegisterPolicy and invoked by NewOracleFactory.
type OracleFactoryFunc func(Config) OracleFactory
    78  
    79  // NewOracleFactory creates an oracle with the given policy.
    80  func NewOracleFactory(policy Policy, cfg Config) OracleFactory {
    81  	ff, ok := oracleFactoryFuncs[policy]
    82  	if !ok {
    83  		panic(errors.Errorf("unknown Policy %v", policy))
    84  	}
    85  	return ff(cfg)
    86  }
    87  
    88  // RegisterPolicy creates a new policy given a function which constructs an
    89  // OracleFactory. RegisterPolicy is intended to be called only during init and
    90  // is not safe for concurrent use.
    91  func RegisterPolicy(f OracleFactoryFunc) Policy {
    92  	if len(oracleFactoryFuncs) == 255 {
    93  		panic("Can only register 255 Policy instances")
    94  	}
    95  	r := Policy(len(oracleFactoryFuncs))
    96  	oracleFactoryFuncs[r] = f
    97  	return r
    98  }
    99  
// oracleFactoryFuncs maps every registered Policy to the function that builds
// its OracleFactory. It is written by RegisterPolicy and read by
// NewOracleFactory; access is not synchronized.
var oracleFactoryFuncs = map[Policy]OracleFactoryFunc{}
   101  
// QueryState encapsulates the history of assignments of ranges to nodes
// done by an oracle on behalf of one particular query.
//
// RangesPerNode counts the ranges already assigned to each node; the
// bin-packing oracle reads it to decide which replica to prefer.
// AssignedRanges records the replica chosen for each range; no oracle in this
// file reads it. The oracles never modify either map: keeping them up to date
// is the caller's job.
type QueryState struct {
	RangesPerNode  map[roachpb.NodeID]int
	AssignedRanges map[roachpb.RangeID]roachpb.ReplicaDescriptor
}
   108  
   109  // MakeQueryState creates an initialized QueryState.
   110  func MakeQueryState() QueryState {
   111  	return QueryState{
   112  		RangesPerNode:  make(map[roachpb.NodeID]int),
   113  		AssignedRanges: make(map[roachpb.RangeID]roachpb.ReplicaDescriptor),
   114  	}
   115  }
   116  
// randomOracle is an Oracle that chooses the lease holder randomly
// among the replicas in a range descriptor. It also serves as its own
// OracleFactory.
type randomOracle struct {
	// gossip is used to build the slice of candidate replicas for a range.
	gossip gossip.DeprecatedOracleGossip
}

// Compile-time check that *randomOracle implements OracleFactory.
var _ OracleFactory = &randomOracle{}
   124  
   125  func newRandomOracleFactory(cfg Config) OracleFactory {
   126  	return &randomOracle{gossip: cfg.Gossip}
   127  }
   128  
// Oracle implements OracleFactory. The transaction is ignored and the receiver
// itself is returned as the Oracle.
func (o *randomOracle) Oracle(_ *kv.Txn) Oracle {
	return o
}
   132  
   133  func (o *randomOracle) ChoosePreferredReplica(
   134  	ctx context.Context, desc roachpb.RangeDescriptor, _ QueryState,
   135  ) (roachpb.ReplicaDescriptor, error) {
   136  	replicas, err := replicaSliceOrErr(desc, o.gossip)
   137  	if err != nil {
   138  		return roachpb.ReplicaDescriptor{}, err
   139  	}
   140  	return replicas[rand.Intn(len(replicas))].ReplicaDescriptor, nil
   141  }
   142  
   143  type closestOracle struct {
   144  	gossip      gossip.DeprecatedOracleGossip
   145  	latencyFunc kvcoord.LatencyFunc
   146  	// nodeDesc is the descriptor of the current node. It will be used to give
   147  	// preference to the current node and others "close" to it.
   148  	nodeDesc roachpb.NodeDescriptor
   149  }
   150  
   151  func newClosestOracleFactory(cfg Config) OracleFactory {
   152  	return &closestOracle{
   153  		latencyFunc: latencyFunc(cfg.RPCContext),
   154  		gossip:      cfg.Gossip,
   155  		nodeDesc:    cfg.NodeDesc,
   156  	}
   157  }
   158  
// Oracle implements OracleFactory. The transaction is ignored and the receiver
// itself is returned as the Oracle.
func (o *closestOracle) Oracle(_ *kv.Txn) Oracle {
	return o
}
   162  
   163  func (o *closestOracle) ChoosePreferredReplica(
   164  	ctx context.Context, desc roachpb.RangeDescriptor, _ QueryState,
   165  ) (roachpb.ReplicaDescriptor, error) {
   166  	replicas, err := replicaSliceOrErr(desc, o.gossip)
   167  	if err != nil {
   168  		return roachpb.ReplicaDescriptor{}, err
   169  	}
   170  	replicas.OptimizeReplicaOrder(&o.nodeDesc, o.latencyFunc)
   171  	return replicas[0].ReplicaDescriptor, nil
   172  }
   173  
// maxPreferredRangesPerLeaseHolder applies to the binPackingOracle.
// When choosing lease holders, we try to choose the same node for all the
// ranges applicable, until we hit this limit. The rationale is that maybe a
// bunch of those ranges don't have an active lease, so our choice is going to
// be self-fulfilling. If so, we want to collocate the lease holders. But above
// some limit, we prefer to take the parallelism and distribute to multiple
// nodes. The actual number used is based on nothing.
//
// newBinPackingOracleFactory copies this value into every binPackingOracle it
// creates.
const maxPreferredRangesPerLeaseHolder = 10
   182  
// binPackingOracle coalesces choices together, so it gives preference to
// replicas on nodes that are already assumed to be lease holders for some other
// ranges that are going to be part of a single query.
// Secondarily, it gives preference to replicas that are "close" to the current
// node.
// Finally, it tries not to overload any node.
//
// Field roles, as used by ChoosePreferredReplica:
//   - leaseHolderCache is consulted first; a cached lease holder whose store
//     can be mapped to a node is returned directly.
//   - maxPreferredRangesPerLeaseHolder is the per-node count of assigned
//     ranges at which a node stops being preferred for collocation.
//   - gossip maps store IDs to node IDs and is used to build the slice of
//     candidate replicas.
//   - latencyFunc is handed to OptimizeReplicaOrder; it is nil when the oracle
//     was configured without an RPC context.
type binPackingOracle struct {
	leaseHolderCache                 *kvcoord.LeaseHolderCache
	maxPreferredRangesPerLeaseHolder int
	gossip                           gossip.DeprecatedOracleGossip
	latencyFunc                      kvcoord.LatencyFunc
	// nodeDesc is the descriptor of the current node. It will be used to give
	// preference to the current node and others "close" to it.
	nodeDesc roachpb.NodeDescriptor
}
   198  
   199  func newBinPackingOracleFactory(cfg Config) OracleFactory {
   200  	return &binPackingOracle{
   201  		maxPreferredRangesPerLeaseHolder: maxPreferredRangesPerLeaseHolder,
   202  		gossip:                           cfg.Gossip,
   203  		nodeDesc:                         cfg.NodeDesc,
   204  		leaseHolderCache:                 cfg.LeaseHolderCache,
   205  		latencyFunc:                      latencyFunc(cfg.RPCContext),
   206  	}
   207  }
   208  
// Compile-time check that *binPackingOracle implements OracleFactory.
var _ OracleFactory = &binPackingOracle{}
   210  
// Oracle implements OracleFactory. The transaction is ignored and the receiver
// itself is returned as the Oracle.
func (o *binPackingOracle) Oracle(_ *kv.Txn) Oracle {
	return o
}
   214  
   215  func (o *binPackingOracle) ChoosePreferredReplica(
   216  	ctx context.Context, desc roachpb.RangeDescriptor, queryState QueryState,
   217  ) (roachpb.ReplicaDescriptor, error) {
   218  	// Attempt to find a cached lease holder and use it if found.
   219  	// If an error occurs, ignore it and proceed to choose a replica below.
   220  	if storeID, ok := o.leaseHolderCache.Lookup(ctx, desc.RangeID); ok {
   221  		repl := roachpb.ReplicaDescriptor{StoreID: storeID}
   222  		// Fill in the node descriptor.
   223  		nodeID, err := o.gossip.GetNodeIDForStoreID(storeID)
   224  		if err != nil {
   225  			log.VEventf(ctx, 2, "failed to lookup store %d: %s", storeID, err)
   226  		} else {
   227  			repl.NodeID = nodeID
   228  			return repl, nil
   229  		}
   230  	}
   231  
   232  	replicas, err := replicaSliceOrErr(desc, o.gossip)
   233  	if err != nil {
   234  		return roachpb.ReplicaDescriptor{}, err
   235  	}
   236  	replicas.OptimizeReplicaOrder(&o.nodeDesc, o.latencyFunc)
   237  
   238  	// Look for a replica that has been assigned some ranges, but it's not yet full.
   239  	minLoad := int(math.MaxInt32)
   240  	var leastLoadedIdx int
   241  	for i, repl := range replicas {
   242  		assignedRanges := queryState.RangesPerNode[repl.NodeID]
   243  		if assignedRanges != 0 && assignedRanges < o.maxPreferredRangesPerLeaseHolder {
   244  			return repl.ReplicaDescriptor, nil
   245  		}
   246  		if assignedRanges < minLoad {
   247  			leastLoadedIdx = i
   248  			minLoad = assignedRanges
   249  		}
   250  	}
   251  	// Either no replica was assigned any previous ranges, or all replicas are
   252  	// full. Use the least-loaded one (if all the load is 0, then the closest
   253  	// replica is returned).
   254  	return replicas[leastLoadedIdx].ReplicaDescriptor, nil
   255  }
   256  
   257  // replicaSliceOrErr returns a ReplicaSlice for the given range descriptor.
   258  // ReplicaSlices are restricted to replicas on nodes for which a NodeDescriptor
   259  // is available in gossip. If no nodes are available, a RangeUnavailableError is
   260  // returned.
   261  func replicaSliceOrErr(
   262  	desc roachpb.RangeDescriptor, gsp gossip.DeprecatedOracleGossip,
   263  ) (kvcoord.ReplicaSlice, error) {
   264  	// Learner replicas won't serve reads/writes, so send only to the `Voters`
   265  	// replicas. This is just an optimization to save a network hop, everything
   266  	// would still work if we had `All` here.
   267  	voterReplicas := desc.Replicas().Voters()
   268  	replicas := kvcoord.NewReplicaSlice(gsp, voterReplicas)
   269  	if len(replicas) == 0 {
   270  		// We couldn't get node descriptors for any replicas.
   271  		var nodeIDs []roachpb.NodeID
   272  		for _, r := range voterReplicas {
   273  			nodeIDs = append(nodeIDs, r.NodeID)
   274  		}
   275  		return kvcoord.ReplicaSlice{}, sqlbase.NewRangeUnavailableError(
   276  			desc.RangeID, errors.Errorf("node info not available in gossip"), nodeIDs...)
   277  	}
   278  	return replicas, nil
   279  }
   280  
   281  // latencyFunc returns a kv.LatencyFunc for use with
   282  // Replicas.OptimizeReplicaOrder.
   283  func latencyFunc(rpcCtx *rpc.Context) kvcoord.LatencyFunc {
   284  	if rpcCtx != nil {
   285  		return rpcCtx.RemoteClocks.Latency
   286  	}
   287  	return nil
   288  }