go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/internal/splitter/split.go (about)

     1  // Copyright 2018 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package splitter implements SplitIntoRanges function useful when splitting
    16  // large datastore queries into a bunch of smaller queries with approximately
    17  // evenly-sized result sets.
    18  //
    19  // It is based on __scatter__ magical property. For more info see:
    20  // https://github.com/GoogleCloudPlatform/appengine-mapreduce/wiki/ScatterPropertyImplementation
    21  package splitter
    22  
    23  import (
    24  	"context"
    25  	"math"
    26  	"sort"
    27  
    28  	"go.chromium.org/luci/gae/service/datastore"
    29  )
    30  
// Params are passed to SplitIntoRanges.
//
// See the doc for SplitIntoRanges for more info.
type Params struct {
	// Shards is the maximum number of key ranges to return.
	//
	// Should be >=1. The function may return fewer key ranges if the query has
	// very few results. In the most extreme case it can return one shard that
	// covers the entirety of the key space.
	Shards int

	// Samples tells how many random entities to sample when deciding where to
	// split the query.
	//
	// Higher number of samples means better accuracy of the split in exchange for
	// slower execution of SplitIntoRanges. For large number of shards (hundreds),
	// number of samples can be set to number of shards. For small number of
	// shards (tens), it makes sense to sample 16x or even 32x more entities.
	//
	// If Samples is 0, default of 512 will be used. If Shards >= Samples, Shards
	// will be used instead.
	Samples int
}
    54  
// Range represents a half-open range of datastore keys (Start, End]: Start is
// excluded, End is included.
type Range struct {
	Start *datastore.Key // if nil, then the range represents (0x000..., End]
	End   *datastore.Key // if nil, then the range represents (Start, 0xfff...)
}
    60  
    61  // Apply adds >Start and <=End filters to the query and returns the resulting
    62  // query.
    63  func (r Range) Apply(q *datastore.Query) *datastore.Query {
    64  	if r.Start != nil {
    65  		q = q.Gt("__key__", r.Start)
    66  	}
    67  	if r.End != nil {
    68  		q = q.Lte("__key__", r.End)
    69  	}
    70  	return q
    71  }
    72  
    73  // IsEmpty is true if the range represents an empty set.
    74  func (r Range) IsEmpty() bool {
    75  	if r.Start == nil || r.End == nil {
    76  		return false
    77  	}
    78  	return !r.Start.Less(r.End)
    79  }
    80  
    81  // SplitIntoRanges returns a list of key ranges (up to 'Shards') that together
    82  // cover the results of the provided query.
    83  //
    84  // When all query results are fetched and split between returned ranges, sizes
    85  // of resulting buckets are approximately even.
    86  //
    87  // Internally uses magical entity property __scatter__. It is set on ~0.8% of
    88  // datastore entities. Querying a bunch of entities ordered by __scatter__
    89  // returns a pseudorandom sample of entities that match the query. To improve
    90  // chances of a more even split, we query 'Samples' entities, and then pick the
    91  // split points evenly among them.
    92  //
    93  // If the given query has filters, SplitIntoRanges may need a corresponding
    94  // composite index that includes __scatter__ field.
    95  //
    96  // May return fewer ranges than requested if it detects there are too few
    97  // entities. In extreme case may return a single range (000..., fff...)
    98  // represented by Range struct with 'Start' and 'End' both set to nil.
    99  func SplitIntoRanges(ctx context.Context, q *datastore.Query, p Params) ([]Range, error) {
   100  	if p.Shards < 1 {
   101  		panic("number of shards should be >=1")
   102  	}
   103  	if p.Samples == 0 {
   104  		p.Samples = 512
   105  	}
   106  	if p.Samples < p.Shards {
   107  		p.Samples = p.Shards
   108  	}
   109  
   110  	// Don't even bother if requested 1 shard. Return (-inf, +inf).
   111  	if p.Shards == 1 {
   112  		return []Range{{}}, nil
   113  	}
   114  
   115  	keys := make([]*datastore.Key, 0, p.Samples)
   116  
   117  	byScat := q.ClearOrder().
   118  		Order("__scatter__").
   119  		Limit(int32(p.Samples)).
   120  		KeysOnly(true)
   121  	if err := datastore.GetAll(ctx, byScat, &keys); err != nil {
   122  		return nil, err
   123  	}
   124  
   125  	// Here keys are ordered by __scatter__ (which is basically random). Reorder
   126  	// keys by, well, key: smallest first.
   127  	sort.Slice(keys, func(i, j int) bool { return keys[i].Less(keys[j]) })
   128  
   129  	var splitPoints []*datastore.Key
   130  	if len(keys) < p.Shards {
   131  		// If number of results is less than number of shards, just use one entity
   132  		// per shard (and returns fewer than 'shards' results). In extreme case of
   133  		// empty query, this will return one (-inf, +inf) shard.
   134  		splitPoints = keys
   135  	} else {
   136  		// Otherwise evenly pick the split points among 'keys'. For N shards, there
   137  		// will be N-1 split points. For example, for 6 keys, and 3 shards:
   138  		//
   139  		// * * | * * | * *
   140  		//
   141  		// Since ranges include right boundaries, the chosen split points would be:
   142  		//
   143  		// * [*] * [*] * *
   144  		//
   145  		// Thus we'll pick a split point residing left to the (float) location of
   146  		// the split line.
   147  		//
   148  		// When calculating 'stride' we use len(keys)-1/shards because we want the
   149  		// split location to be "between" points. E.g for the case of 6 points and
   150  		// 2 shards, the split location should be 2.5:
   151  		//
   152  		// *   *   *   |   *   *   *
   153  		// 0   1   2  2.5  3   4   5
   154  		splitPoints = make([]*datastore.Key, p.Shards-1)
   155  		stride := float64(len(keys)-1) / float64(p.Shards)
   156  		for i := 0; i < len(splitPoints); i++ {
   157  			idx := int(math.Floor(stride*float64(i) + stride))
   158  			splitPoints[i] = keys[idx]
   159  		}
   160  	}
   161  
   162  	// Use the calculated points to divides 'keys' into non-intersecting ranges
   163  	// that also cover (-inf, ...) and (..., +inf). In the extreme case of 0 split
   164  	// points, the result would be single (-inf, +inf) range.
   165  	ranges := make([]Range, 0, len(splitPoints)+1)
   166  	var prev *datastore.Key
   167  	for _, k := range splitPoints {
   168  		ranges = append(ranges, Range{
   169  			Start: prev,
   170  			End:   k,
   171  		})
   172  		prev = k
   173  	}
   174  	ranges = append(ranges, Range{
   175  		Start: prev,
   176  	})
   177  	return ranges, nil
   178  }