go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/internal/splitter/split_test.go

// Copyright 2018 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package splitter

import (
	"context"
	"math/rand"
	"testing"

	"go.chromium.org/luci/gae/impl/memory"
	"go.chromium.org/luci/gae/service/datastore"

	. "github.com/smartystreets/goconvey/convey"
)

type RootEntity struct {
	ID int `gae:"$id"`
}

type IntEntity struct {
	ID     int            `gae:"$id"`
	Parent *datastore.Key `gae:"$parent"`
}

type StringEntity struct {
	ID     string         `gae:"$id"`
	Parent *datastore.Key `gae:"$parent"`
}

func putIntEntities(ctx context.Context, parent *datastore.Key, ids []int) {
	e := make([]*IntEntity, len(ids))
	for i, id := range ids {
		e[i] = &IntEntity{
			ID:     id,
			Parent: parent,
		}
	}
	if err := datastore.Put(ctx, e); err != nil {
		panic(err)
	}
}
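
// Usage note: putIntEntities(ctx, nil, []int{1, 2}) stores two root-level
// IntEntity rows with IDs 1 and 2; a non-nil parent makes them children of
// that key (exercised by the "Handles ancestor filter" test below).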

type intRange struct {
	start int
	end   int
	size  int
}

// extractRanges converts []Range into a simple int form for assertions, using
// -1 to mark an unbounded side. `max` (the upper bound of the key space, or 0
// if unknown) is used to estimate the size of half-open ranges.
func extractRanges(ranges []Range, max int) []intRange {
	out := make([]intRange, len(ranges))
	for i, rng := range ranges {
		r := intRange{-1, -1, 0}
		if rng.Start != nil {
			r.start = int(rng.Start.IntID())
		}
		if rng.End != nil {
			r.end = int(rng.End.IntID())
		}
		switch {
		case r.start != -1 && r.end != -1:
			r.size = r.end - r.start
		case max != 0 && r.start == -1 && r.end == -1:
			r.size = max
		case r.start == -1:
			r.size = r.end
		case max != 0 && r.end == -1:
			r.size = max - r.start
		}
		out[i] = r
	}
	return out
}
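
// Worked example (matching the "mostly empty query" case below): two ranges
// splitting the key space at ID 4, with max == 6, come out as
//
//	{start: -1, end: 4, size: 4}  // unbounded start: size = end
//	{start: 4, end: -1, size: 2}  // unbounded end: size = max - start
//
// i.e. size is an arithmetic estimate derived from the key IDs, not an
// actual entity count.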

// countRanges counts entities actually matched by each range's narrowed query.
func countRanges(ctx context.Context, q *datastore.Query, ranges []Range) []int {
	out := make([]int, len(ranges))
	for i, r := range ranges {
		count, err := datastore.Count(ctx, r.Apply(q))
		if err != nil {
			panic(err)
		}
		out[i] = int(count)
	}
	return out
}
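
// exampleShardedRun is a hedged sketch (not exercised by the tests) of how a
// caller might consume SplitIntoRanges: split the query once, then run each
// shard's narrowed sub-query. The oversampling factor and the per-entity
// callback are illustrative assumptions, not part of this package's contract.
func exampleShardedRun(ctx context.Context, q *datastore.Query, shards int) error {
	ranges, err := SplitIntoRanges(ctx, q, Params{
		Shards:  shards,
		Samples: shards * 32, // oversample: the tests below show this evens out shards
	})
	if err != nil {
		return err
	}
	for _, r := range ranges {
		// Each narrowed query is independent, so shards could also be
		// processed concurrently.
		err := datastore.Run(ctx, r.Apply(q), func(e *IntEntity) {
			_ = e // visit the entity here
		})
		if err != nil {
			return err
		}
	}
	return nil
}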

func TestSplitIntoRanges(t *testing.T) {
	Convey("Works", t, func() {
		ctx := memory.Use(context.Background())
		rnd := rand.New(rand.NewSource(1))

		datastore.GetTestable(ctx).AutoIndex(true)

		getRanges := func(q *datastore.Query, shards, samples, max int) []intRange {
			ranges, err := SplitIntoRanges(ctx, q, Params{
				Shards:  shards,
				Samples: samples,
			})
			So(err, ShouldBeNil)
			return extractRanges(ranges, max)
		}

		getShardSizes := func(q *datastore.Query, shards, samples int) []int {
			ranges, err := SplitIntoRanges(ctx, q, Params{
				Shards:  shards,
				Samples: samples,
			})
			So(err, ShouldBeNil)
			return countRanges(ctx, q, ranges)
		}

		Convey("With mostly empty query", func() {
			putIntEntities(ctx, nil, []int{1, 2, 3, 4, 5, 6})
			maxID := 6
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// Special case: 1 shard is always a single unbounded range.
			So(getRanges(q, 1, 1, maxID), ShouldResemble, []intRange{
				{-1, -1, 6},
			})

			// Not enough split points for 4 shards; we get only 2.
			So(getRanges(q, 4, 1, maxID), ShouldResemble, []intRange{
				{-1, 4, 4},
				{4, -1, 2},
			})
		})

		Convey("With evenly distributed int keys", func() {
			ints := []int{}
			maxID := 1024
			for i := 0; i < maxID; i++ {
				ints = append(ints, i)
			}
			putIntEntities(ctx, nil, ints)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// 2 shards. With oversampling we get better accuracy (the middle point is
			// closer to 512).
			So(getRanges(q, 2, 2, maxID), ShouldResemble, []intRange{
				{-1, 399, 399},
				{399, -1, 625},
			})
			So(getRanges(q, 2, 64, maxID), ShouldResemble, []intRange{
				{-1, 476, 476},
				{476, -1, 548},
			})
			So(getRanges(q, 2, 256, maxID), ShouldResemble, []intRange{
				{-1, 489, 489},
				{489, -1, 535},
			})

			// 3 shards. With oversampling we get better accuracy (shard size is close
			// to 340, shards are more even).
			So(getRanges(q, 3, 3, maxID), ShouldResemble, []intRange{
				{-1, 265, 265},
				{265, 399, 134},
				{399, -1, 625},
			})
			So(getRanges(q, 3, 96, maxID), ShouldResemble, []intRange{
				{-1, 285, 285},
				{285, 696, 411},
				{696, -1, 328},
			})
			So(getRanges(q, 3, 384, maxID), ShouldResemble, []intRange{
				{-1, 327, 327},
				{327, 658, 331},
				{658, -1, 366},
			})
		})

		Convey("With normally distributed int keys", func() {
			ints := []int{}
			seen := map[int]bool{}
			for i := 0; i < 1000; i++ {
				key := int(rnd.NormFloat64()*2000 + 50000)
				if key > 0 && !seen[key] {
					ints = append(ints, key)
					seen[key] = true
				}
			}
			putIntEntities(ctx, nil, ints)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// Fewer than 1000 distinct points were generated (rnd produced
			// duplicates). With a single shard (no real sharding), it holds
			// all the points.
			So(len(seen), ShouldEqual, 937)
			So(getShardSizes(q, 1, 1), ShouldResemble, []int{len(seen)})

			// Shards have roughly even sizes (in terms of the number of points in
			// each).
			shards := getShardSizes(q, 10, 320)
			So(shards, ShouldResemble, []int{
				80, 97, 105, 80, 92, 95, 106, 103, 87, 92,
			})

			// But the key ranges differ a lot, reflecting the distribution of the
			// points.
			So(getRanges(q, 10, 320, 0), ShouldResemble, []intRange{
				{-1, 47124, 47124},
				{47124, 48158, 1034},
				{48158, 48925, 767},
				{48925, 49363, 438},
				{49363, 49871, 508},
				{49871, 50352, 481},
				{50352, 50973, 621},
				{50973, 51677, 704},
				{51677, 52573, 896},
				{52573, -1, 0},
			})

			// All points are counted.
			total := 0
			for _, s := range shards {
				total += s
			}
			So(total, ShouldEqual, len(seen))
		})

		Convey("Handles ancestor filter", func() {
			root1 := datastore.KeyForObj(ctx, &RootEntity{ID: 1})
			root2 := datastore.KeyForObj(ctx, &RootEntity{ID: 2})

			putIntEntities(ctx, root1, []int{1, 2, 3, 4})
			putIntEntities(ctx, root2, []int{1, 2, 3, 4, 5, 6, 7, 8})
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// A non-ancestor query discovers all 12 entities.
			So(getShardSizes(q, 4, 128), ShouldResemble, []int{
				2, 2, 4, 4,
			})

			// With an ancestor filter, only entities under the given root are
			// discovered.
			So(getShardSizes(q.Ancestor(root1), 4, 128), ShouldResemble, []int{
				1, 1, 2, 0, // 4 total
			})
			So(getShardSizes(q.Ancestor(root2), 4, 128), ShouldResemble, []int{
				4, 1, 2, 1, // 8 total
			})
		})

		Convey("Handles arbitrary keys", func() {
			entities := make([]any, 1000)
			for i := 0; i < len(entities); i++ {
				blob := make([]byte, 10)
				_, err := rnd.Read(blob)
				So(err, ShouldBeNil)
				entities[i] = &StringEntity{
					ID:     string(blob),
					Parent: datastore.KeyForObj(ctx, &RootEntity{ID: rnd.Intn(10) + 1}),
				}
			}
			So(datastore.Put(ctx, entities), ShouldBeNil)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("StringEntity")

			// Discovers all 1000 entities, split roughly evenly between shards.
			So(getShardSizes(q, 8, 256), ShouldResemble, []int{
				115, 114, 113, 110, 133, 133, 148, 134,
			})
		})
	})
}