go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/internal/splitter/split.go (about) 1 // Copyright 2018 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package splitter implements SplitIntoRanges function useful when splitting 16 // large datastore queries into a bunch of smaller queries with approximately 17 // evenly-sized result sets. 18 // 19 // It is based on __scatter__ magical property. For more info see: 20 // https://github.com/GoogleCloudPlatform/appengine-mapreduce/wiki/ScatterPropertyImplementation 21 package splitter 22 23 import ( 24 "context" 25 "math" 26 "sort" 27 28 "go.chromium.org/luci/gae/service/datastore" 29 ) 30 31 // Params are passed to SplitIntoRanges. 32 // 33 // See the doc for SplitIntoRanges for more info. 34 type Params struct { 35 // Shards is maximum number of key ranges to return. 36 // 37 // Should be >=1. The function may return fewer key ranges if the query has 38 // very few results. In the most extreme case it can return one shard that 39 // covers the entirety of the key space. 40 Shards int 41 42 // Samples tells how many random entities to sample when deciding where to 43 // split the query. 44 // 45 // Higher number of samples means better accuracy of the split in exchange for 46 // slower execution of SplitIntoRanges. For large number of shards (hundreds), 47 // number of samples can be set to number of shards. For small number of 48 // shards (tens), it makes sense to sample 16x or even 32x more entities. 49 // 50 // If Samples is 0, default of 512 will be used. If Shards >= Samples, Shards 51 // will be used instead. 52 Samples int 53 } 54 55 // Range represents a range of datastore keys (Start, End]. 56 type Range struct { 57 Start *datastore.Key // if nil, then the range represents (0x000..., End] 58 End *datastore.Key // if nil, then the range represents (Start, 0xfff...) 59 } 60 61 // Apply adds >Start and <=End filters to the query and returns the resulting 62 // query. 63 func (r Range) Apply(q *datastore.Query) *datastore.Query { 64 if r.Start != nil { 65 q = q.Gt("__key__", r.Start) 66 } 67 if r.End != nil { 68 q = q.Lte("__key__", r.End) 69 } 70 return q 71 } 72 73 // IsEmpty is true if the range represents an empty set. 74 func (r Range) IsEmpty() bool { 75 if r.Start == nil || r.End == nil { 76 return false 77 } 78 return !r.Start.Less(r.End) 79 } 80 81 // SplitIntoRanges returns a list of key ranges (up to 'Shards') that together 82 // cover the results of the provided query. 83 // 84 // When all query results are fetched and split between returned ranges, sizes 85 // of resulting buckets are approximately even. 86 // 87 // Internally uses magical entity property __scatter__. It is set on ~0.8% of 88 // datastore entities. Querying a bunch of entities ordered by __scatter__ 89 // returns a pseudorandom sample of entities that match the query. To improve 90 // chances of a more even split, we query 'Samples' entities, and then pick the 91 // split points evenly among them. 92 // 93 // If the given query has filters, SplitIntoRanges may need a corresponding 94 // composite index that includes __scatter__ field. 95 // 96 // May return fewer ranges than requested if it detects there are too few 97 // entities. In extreme case may return a single range (000..., fff...) 98 // represented by Range struct with 'Start' and 'End' both set to nil. 99 func SplitIntoRanges(ctx context.Context, q *datastore.Query, p Params) ([]Range, error) { 100 if p.Shards < 1 { 101 panic("number of shards should be >=1") 102 } 103 if p.Samples == 0 { 104 p.Samples = 512 105 } 106 if p.Samples < p.Shards { 107 p.Samples = p.Shards 108 } 109 110 // Don't even bother if requested 1 shard. Return (-inf, +inf). 111 if p.Shards == 1 { 112 return []Range{{}}, nil 113 } 114 115 keys := make([]*datastore.Key, 0, p.Samples) 116 117 byScat := q.ClearOrder(). 118 Order("__scatter__"). 119 Limit(int32(p.Samples)). 120 KeysOnly(true) 121 if err := datastore.GetAll(ctx, byScat, &keys); err != nil { 122 return nil, err 123 } 124 125 // Here keys are ordered by __scatter__ (which is basically random). Reorder 126 // keys by, well, key: smallest first. 127 sort.Slice(keys, func(i, j int) bool { return keys[i].Less(keys[j]) }) 128 129 var splitPoints []*datastore.Key 130 if len(keys) < p.Shards { 131 // If number of results is less than number of shards, just use one entity 132 // per shard (and returns fewer than 'shards' results). In extreme case of 133 // empty query, this will return one (-inf, +inf) shard. 134 splitPoints = keys 135 } else { 136 // Otherwise evenly pick the split points among 'keys'. For N shards, there 137 // will be N-1 split points. For example, for 6 keys, and 3 shards: 138 // 139 // * * | * * | * * 140 // 141 // Since ranges include right boundaries, the chosen split points would be: 142 // 143 // * [*] * [*] * * 144 // 145 // Thus we'll pick a split point residing left to the (float) location of 146 // the split line. 147 // 148 // When calculating 'stride' we use len(keys)-1/shards because we want the 149 // split location to be "between" points. E.g for the case of 6 points and 150 // 2 shards, the split location should be 2.5: 151 // 152 // * * * | * * * 153 // 0 1 2 2.5 3 4 5 154 splitPoints = make([]*datastore.Key, p.Shards-1) 155 stride := float64(len(keys)-1) / float64(p.Shards) 156 for i := 0; i < len(splitPoints); i++ { 157 idx := int(math.Floor(stride*float64(i) + stride)) 158 splitPoints[i] = keys[idx] 159 } 160 } 161 162 // Use the calculated points to divides 'keys' into non-intersecting ranges 163 // that also cover (-inf, ...) and (..., +inf). In the extreme case of 0 split 164 // points, the result would be single (-inf, +inf) range. 165 ranges := make([]Range, 0, len(splitPoints)+1) 166 var prev *datastore.Key 167 for _, k := range splitPoints { 168 ranges = append(ranges, Range{ 169 Start: prev, 170 End: k, 171 }) 172 prev = k 173 } 174 ranges = append(ranges, Range{ 175 Start: prev, 176 }) 177 return ranges, nil 178 }