github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/physicalplan/replicaoracle/oracle.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 // Package replicaoracle provides functionality for physicalplan to choose a 12 // replica for a range. 13 package replicaoracle 14 15 import ( 16 "context" 17 "math" 18 "math/rand" 19 20 "github.com/cockroachdb/cockroach/pkg/gossip" 21 "github.com/cockroachdb/cockroach/pkg/kv" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" 23 "github.com/cockroachdb/cockroach/pkg/roachpb" 24 "github.com/cockroachdb/cockroach/pkg/rpc" 25 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 26 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/errors" 29 ) 30 31 // Policy determines how an Oracle should select a replica. 32 type Policy byte 33 34 var ( 35 // RandomChoice chooses lease replicas randomly. 36 RandomChoice = RegisterPolicy(newRandomOracleFactory) 37 // BinPackingChoice bin-packs the choices. 38 BinPackingChoice = RegisterPolicy(newBinPackingOracleFactory) 39 // ClosestChoice chooses the node closest to the current node. 40 ClosestChoice = RegisterPolicy(newClosestOracleFactory) 41 ) 42 43 // Config is used to construct an OracleFactory. 44 type Config struct { 45 NodeDesc roachpb.NodeDescriptor 46 Settings *cluster.Settings 47 Gossip gossip.DeprecatedOracleGossip 48 RPCContext *rpc.Context 49 LeaseHolderCache *kvcoord.LeaseHolderCache 50 } 51 52 // Oracle is used to choose the lease holder for ranges. This 53 // interface was extracted so we can experiment with different choosing 54 // policies. 55 // Note that choices that start out random can act as self-fulfilling prophecies 56 // - if there's no active lease, the node that will be asked to execute part of 57 // the query (the chosen node) will acquire a new lease. 58 type Oracle interface { 59 // ChoosePreferredReplica returns a choice for one range. Implementors are 60 // free to use the queryState param, which has info about the number of ranges 61 // already handled by each node for the current SQL query. The state is not 62 // updated with the result of this method; the caller is in charge of that. 63 // 64 // A RangeUnavailableError can be returned if there's no information in gossip 65 // about any of the nodes that might be tried. 66 ChoosePreferredReplica( 67 context.Context, roachpb.RangeDescriptor, QueryState, 68 ) (roachpb.ReplicaDescriptor, error) 69 } 70 71 // OracleFactory creates an oracle for a Txn. 72 type OracleFactory interface { 73 Oracle(*kv.Txn) Oracle 74 } 75 76 // OracleFactoryFunc creates an OracleFactory from a Config. 77 type OracleFactoryFunc func(Config) OracleFactory 78 79 // NewOracleFactory creates an oracle with the given policy. 80 func NewOracleFactory(policy Policy, cfg Config) OracleFactory { 81 ff, ok := oracleFactoryFuncs[policy] 82 if !ok { 83 panic(errors.Errorf("unknown Policy %v", policy)) 84 } 85 return ff(cfg) 86 } 87 88 // RegisterPolicy creates a new policy given a function which constructs an 89 // OracleFactory. RegisterPolicy is intended to be called only during init and 90 // is not safe for concurrent use. 91 func RegisterPolicy(f OracleFactoryFunc) Policy { 92 if len(oracleFactoryFuncs) == 255 { 93 panic("Can only register 255 Policy instances") 94 } 95 r := Policy(len(oracleFactoryFuncs)) 96 oracleFactoryFuncs[r] = f 97 return r 98 } 99 100 var oracleFactoryFuncs = map[Policy]OracleFactoryFunc{} 101 102 // QueryState encapsulates the history of assignments of ranges to nodes 103 // done by an oracle on behalf of one particular query. 104 type QueryState struct { 105 RangesPerNode map[roachpb.NodeID]int 106 AssignedRanges map[roachpb.RangeID]roachpb.ReplicaDescriptor 107 } 108 109 // MakeQueryState creates an initialized QueryState. 110 func MakeQueryState() QueryState { 111 return QueryState{ 112 RangesPerNode: make(map[roachpb.NodeID]int), 113 AssignedRanges: make(map[roachpb.RangeID]roachpb.ReplicaDescriptor), 114 } 115 } 116 117 // randomOracle is a Oracle that chooses the lease holder randomly 118 // among the replicas in a range descriptor. 119 type randomOracle struct { 120 gossip gossip.DeprecatedOracleGossip 121 } 122 123 var _ OracleFactory = &randomOracle{} 124 125 func newRandomOracleFactory(cfg Config) OracleFactory { 126 return &randomOracle{gossip: cfg.Gossip} 127 } 128 129 func (o *randomOracle) Oracle(_ *kv.Txn) Oracle { 130 return o 131 } 132 133 func (o *randomOracle) ChoosePreferredReplica( 134 ctx context.Context, desc roachpb.RangeDescriptor, _ QueryState, 135 ) (roachpb.ReplicaDescriptor, error) { 136 replicas, err := replicaSliceOrErr(desc, o.gossip) 137 if err != nil { 138 return roachpb.ReplicaDescriptor{}, err 139 } 140 return replicas[rand.Intn(len(replicas))].ReplicaDescriptor, nil 141 } 142 143 type closestOracle struct { 144 gossip gossip.DeprecatedOracleGossip 145 latencyFunc kvcoord.LatencyFunc 146 // nodeDesc is the descriptor of the current node. It will be used to give 147 // preference to the current node and others "close" to it. 148 nodeDesc roachpb.NodeDescriptor 149 } 150 151 func newClosestOracleFactory(cfg Config) OracleFactory { 152 return &closestOracle{ 153 latencyFunc: latencyFunc(cfg.RPCContext), 154 gossip: cfg.Gossip, 155 nodeDesc: cfg.NodeDesc, 156 } 157 } 158 159 func (o *closestOracle) Oracle(_ *kv.Txn) Oracle { 160 return o 161 } 162 163 func (o *closestOracle) ChoosePreferredReplica( 164 ctx context.Context, desc roachpb.RangeDescriptor, _ QueryState, 165 ) (roachpb.ReplicaDescriptor, error) { 166 replicas, err := replicaSliceOrErr(desc, o.gossip) 167 if err != nil { 168 return roachpb.ReplicaDescriptor{}, err 169 } 170 replicas.OptimizeReplicaOrder(&o.nodeDesc, o.latencyFunc) 171 return replicas[0].ReplicaDescriptor, nil 172 } 173 174 // maxPreferredRangesPerLeaseHolder applies to the binPackingOracle. 175 // When choosing lease holders, we try to choose the same node for all the 176 // ranges applicable, until we hit this limit. The rationale is that maybe a 177 // bunch of those ranges don't have an active lease, so our choice is going to 178 // be self-fulfilling. If so, we want to collocate the lease holders. But above 179 // some limit, we prefer to take the parallelism and distribute to multiple 180 // nodes. The actual number used is based on nothing. 181 const maxPreferredRangesPerLeaseHolder = 10 182 183 // binPackingOracle coalesces choices together, so it gives preference to 184 // replicas on nodes that are already assumed to be lease holders for some other 185 // ranges that are going to be part of a single query. 186 // Secondarily, it gives preference to replicas that are "close" to the current 187 // node. 188 // Finally, it tries not to overload any node. 189 type binPackingOracle struct { 190 leaseHolderCache *kvcoord.LeaseHolderCache 191 maxPreferredRangesPerLeaseHolder int 192 gossip gossip.DeprecatedOracleGossip 193 latencyFunc kvcoord.LatencyFunc 194 // nodeDesc is the descriptor of the current node. It will be used to give 195 // preference to the current node and others "close" to it. 196 nodeDesc roachpb.NodeDescriptor 197 } 198 199 func newBinPackingOracleFactory(cfg Config) OracleFactory { 200 return &binPackingOracle{ 201 maxPreferredRangesPerLeaseHolder: maxPreferredRangesPerLeaseHolder, 202 gossip: cfg.Gossip, 203 nodeDesc: cfg.NodeDesc, 204 leaseHolderCache: cfg.LeaseHolderCache, 205 latencyFunc: latencyFunc(cfg.RPCContext), 206 } 207 } 208 209 var _ OracleFactory = &binPackingOracle{} 210 211 func (o *binPackingOracle) Oracle(_ *kv.Txn) Oracle { 212 return o 213 } 214 215 func (o *binPackingOracle) ChoosePreferredReplica( 216 ctx context.Context, desc roachpb.RangeDescriptor, queryState QueryState, 217 ) (roachpb.ReplicaDescriptor, error) { 218 // Attempt to find a cached lease holder and use it if found. 219 // If an error occurs, ignore it and proceed to choose a replica below. 220 if storeID, ok := o.leaseHolderCache.Lookup(ctx, desc.RangeID); ok { 221 repl := roachpb.ReplicaDescriptor{StoreID: storeID} 222 // Fill in the node descriptor. 223 nodeID, err := o.gossip.GetNodeIDForStoreID(storeID) 224 if err != nil { 225 log.VEventf(ctx, 2, "failed to lookup store %d: %s", storeID, err) 226 } else { 227 repl.NodeID = nodeID 228 return repl, nil 229 } 230 } 231 232 replicas, err := replicaSliceOrErr(desc, o.gossip) 233 if err != nil { 234 return roachpb.ReplicaDescriptor{}, err 235 } 236 replicas.OptimizeReplicaOrder(&o.nodeDesc, o.latencyFunc) 237 238 // Look for a replica that has been assigned some ranges, but it's not yet full. 239 minLoad := int(math.MaxInt32) 240 var leastLoadedIdx int 241 for i, repl := range replicas { 242 assignedRanges := queryState.RangesPerNode[repl.NodeID] 243 if assignedRanges != 0 && assignedRanges < o.maxPreferredRangesPerLeaseHolder { 244 return repl.ReplicaDescriptor, nil 245 } 246 if assignedRanges < minLoad { 247 leastLoadedIdx = i 248 minLoad = assignedRanges 249 } 250 } 251 // Either no replica was assigned any previous ranges, or all replicas are 252 // full. Use the least-loaded one (if all the load is 0, then the closest 253 // replica is returned). 254 return replicas[leastLoadedIdx].ReplicaDescriptor, nil 255 } 256 257 // replicaSliceOrErr returns a ReplicaSlice for the given range descriptor. 258 // ReplicaSlices are restricted to replicas on nodes for which a NodeDescriptor 259 // is available in gossip. If no nodes are available, a RangeUnavailableError is 260 // returned. 261 func replicaSliceOrErr( 262 desc roachpb.RangeDescriptor, gsp gossip.DeprecatedOracleGossip, 263 ) (kvcoord.ReplicaSlice, error) { 264 // Learner replicas won't serve reads/writes, so send only to the `Voters` 265 // replicas. This is just an optimization to save a network hop, everything 266 // would still work if we had `All` here. 267 voterReplicas := desc.Replicas().Voters() 268 replicas := kvcoord.NewReplicaSlice(gsp, voterReplicas) 269 if len(replicas) == 0 { 270 // We couldn't get node descriptors for any replicas. 271 var nodeIDs []roachpb.NodeID 272 for _, r := range voterReplicas { 273 nodeIDs = append(nodeIDs, r.NodeID) 274 } 275 return kvcoord.ReplicaSlice{}, sqlbase.NewRangeUnavailableError( 276 desc.RangeID, errors.Errorf("node info not available in gossip"), nodeIDs...) 277 } 278 return replicas, nil 279 } 280 281 // latencyFunc returns a kv.LatencyFunc for use with 282 // Replicas.OptimizeReplicaOrder. 283 func latencyFunc(rpcCtx *rpc.Context) kvcoord.LatencyFunc { 284 if rpcCtx != nil { 285 return rpcCtx.RemoteClocks.Latency 286 } 287 return nil 288 }