go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/internal/splitter/split_test.go

// Copyright 2018 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package splitter

import (
	"context"
	"math/rand"
	"testing"

	"go.chromium.org/luci/gae/impl/memory"
	"go.chromium.org/luci/gae/service/datastore"

	. "github.com/smartystreets/goconvey/convey"
)

type RootEntity struct {
	ID int `gae:"$id"`
}

type IntEntity struct {
	ID     int            `gae:"$id"`
	Parent *datastore.Key `gae:"$parent"`
}

type StringEntity struct {
	ID     string         `gae:"$id"`
	Parent *datastore.Key `gae:"$parent"`
}

// putIntEntities stores a batch of IntEntity with the given integer IDs under
// the given parent key, panicking on errors.
func putIntEntities(ctx context.Context, parent *datastore.Key, ids []int) {
	e := make([]*IntEntity, len(ids))
	for i, id := range ids {
		e[i] = &IntEntity{
			ID:     id,
			Parent: parent,
		}
	}
	if err := datastore.Put(ctx, e); err != nil {
		panic(err)
	}
}

// intRange describes a range of integer key IDs, with -1 standing for an
// unbounded side, along with its approximate size.
type intRange struct {
	start int
	end   int
	size  int
}

// extractRanges converts []Range into []intRange for assertions, using max as
// the upper bound of the key space (when known) to compute the size of
// open-ended ranges.
func extractRanges(ranges []Range, max int) []intRange {
	out := make([]intRange, len(ranges))
	for i, rng := range ranges {
		r := intRange{-1, -1, 0}
		if rng.Start != nil {
			r.start = int(rng.Start.IntID())
		}
		if rng.End != nil {
			r.end = int(rng.End.IntID())
		}
		switch {
		case r.start != -1 && r.end != -1:
			r.size = r.end - r.start
		case max != 0 && r.start == -1 && r.end == -1:
			r.size = max
		case r.start == -1:
			r.size = r.end
		case max != 0 && r.end == -1:
			r.size = max - r.start
		}
		out[i] = r
	}
	return out
}

// countRanges counts how many entities matching q fall into each range.
func countRanges(ctx context.Context, q *datastore.Query, ranges []Range) []int {
	out := make([]int, len(ranges))
	for i, r := range ranges {
		count, err := datastore.Count(ctx, r.Apply(q))
		if err != nil {
			panic(err)
		}
		out[i] = int(count)
	}
	return out
}

func TestSplitIntoRanges(t *testing.T) {
	Convey("Works", t, func() {
		ctx := memory.Use(context.Background())
		rnd := rand.New(rand.NewSource(1))

		datastore.GetTestable(ctx).AutoIndex(true)

		getRanges := func(q *datastore.Query, shards, samples, max int) []intRange {
			ranges, err := SplitIntoRanges(ctx, q, Params{
				Shards:  shards,
				Samples: samples,
			})
			So(err, ShouldBeNil)
			return extractRanges(ranges, max)
		}

		getShardSizes := func(q *datastore.Query, shards, samples int) []int {
			ranges, err := SplitIntoRanges(ctx, q, Params{
				Shards:  shards,
				Samples: samples,
			})
			So(err, ShouldBeNil)
			return countRanges(ctx, q, ranges)
		}

		Convey("With mostly empty query", func() {
			putIntEntities(ctx, nil, []int{1, 2, 3, 4, 5, 6})
			maxID := 6
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// Special case for 1 shard.
			So(getRanges(q, 1, 1, maxID), ShouldResemble, []intRange{
				{-1, -1, 6},
			})

			// Not enough split points for 4 shards. Got only 2.
			So(getRanges(q, 4, 1, maxID), ShouldResemble, []intRange{
				{-1, 4, 4},
				{4, -1, 2},
			})
		})

		Convey("With evenly distributed int keys", func() {
			ints := []int{}
			maxID := 1024
			for i := 0; i < maxID; i++ {
				ints = append(ints, i)
			}
			putIntEntities(ctx, nil, ints)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// 2 shards. With oversampling we get better accuracy (the middle
			// point is closer to 512).
			So(getRanges(q, 2, 2, maxID), ShouldResemble, []intRange{
				{-1, 399, 399},
				{399, -1, 625},
			})
			So(getRanges(q, 2, 64, maxID), ShouldResemble, []intRange{
				{-1, 476, 476},
				{476, -1, 548},
			})
			So(getRanges(q, 2, 256, maxID), ShouldResemble, []intRange{
				{-1, 489, 489},
				{489, -1, 535},
			})

			// 3 shards. With oversampling we get better accuracy (shard sizes
			// are closer to ~341, i.e. more even).
			So(getRanges(q, 3, 3, maxID), ShouldResemble, []intRange{
				{-1, 265, 265},
				{265, 399, 134},
				{399, -1, 625},
			})
			So(getRanges(q, 3, 96, maxID), ShouldResemble, []intRange{
				{-1, 285, 285},
				{285, 696, 411},
				{696, -1, 328},
			})
			So(getRanges(q, 3, 384, maxID), ShouldResemble, []intRange{
				{-1, 327, 327},
				{327, 658, 331},
				{658, -1, 366},
			})
		})

		Convey("With normally distributed int keys", func() {
			ints := []int{}
			seen := map[int]bool{}
			for i := 0; i < 1000; i++ {
				key := int(rnd.NormFloat64()*2000 + 50000)
				if key > 0 && !seen[key] {
					ints = append(ints, key)
					seen[key] = true
				}
			}
			putIntEntities(ctx, nil, ints)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// Fewer than 1000 distinct points were generated (rnd produced
			// duplicates). With a single shard, that shard contains all of them.
			So(len(seen), ShouldEqual, 937)
			So(getShardSizes(q, 1, 1), ShouldResemble, []int{len(seen)})

			// Shards have ~even sizes (in terms of the number of points in them).
			shards := getShardSizes(q, 10, 320)
			So(shards, ShouldResemble, []int{
				80, 97, 105, 80, 92, 95, 106, 103, 87, 92,
			})

			// But key ranges are very different, reflecting the distribution of
			// the points.
			So(getRanges(q, 10, 320, 0), ShouldResemble, []intRange{
				{-1, 47124, 47124},
				{47124, 48158, 1034},
				{48158, 48925, 767},
				{48925, 49363, 438},
				{49363, 49871, 508},
				{49871, 50352, 481},
				{50352, 50973, 621},
				{50973, 51677, 704},
				{51677, 52573, 896},
				{52573, -1, 0},
			})

			// All points are counted.
			total := 0
			for _, s := range shards {
				total += s
			}
			So(total, ShouldEqual, len(seen))
		})

		Convey("Handles ancestor filter", func() {
			root1 := datastore.KeyForObj(ctx, &RootEntity{ID: 1})
			root2 := datastore.KeyForObj(ctx, &RootEntity{ID: 2})

			putIntEntities(ctx, root1, []int{1, 2, 3, 4})
			putIntEntities(ctx, root2, []int{1, 2, 3, 4, 5, 6, 7, 8})
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("IntEntity")

			// A non-ancestor query discovers all 12 entities.
			So(getShardSizes(q, 4, 128), ShouldResemble, []int{
				2, 2, 4, 4,
			})

			// An ancestor query discovers only the entities under that ancestor.
			So(getShardSizes(q.Ancestor(root1), 4, 128), ShouldResemble, []int{
				1, 1, 2, 0, // 4 total
			})
			So(getShardSizes(q.Ancestor(root2), 4, 128), ShouldResemble, []int{
				4, 1, 2, 1, // 8 total
			})
		})

		Convey("Handles arbitrary keys", func() {
			entities := make([]any, 1000)
			for i := 0; i < len(entities); i++ {
				blob := make([]byte, 10)
				_, err := rnd.Read(blob)
				So(err, ShouldBeNil)
				entities[i] = &StringEntity{
					ID:     string(blob),
					Parent: datastore.KeyForObj(ctx, &RootEntity{ID: rnd.Intn(10) + 1}),
				}
			}
			So(datastore.Put(ctx, entities), ShouldBeNil)
			datastore.GetTestable(ctx).CatchupIndexes()

			q := datastore.NewQuery("StringEntity")

			// Discovers all 1000 entities, split ~evenly across shards.
			So(getShardSizes(q, 8, 256), ShouldResemble, []int{
				115, 114, 113, 110, 133, 133, 148, 134,
			})
		})
	})
}
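// exampleShardedCount is an illustrative sketch, not part of the original
// tests: it shows how a caller might drive the API exercised above
// (SplitIntoRanges, Params, Range.Apply) to process a query in key-range
// shards. The "IntEntity" kind, the shard/sample counts, and the choice of
// datastore.Count as the per-shard work are placeholders; a real caller
// (e.g. dsmapper itself) would likely process the shards in parallel.
func exampleShardedCount(ctx context.Context) (int64, error) {
	q := datastore.NewQuery("IntEntity")
	ranges, err := SplitIntoRanges(ctx, q, Params{
		Shards:  8,   // desired number of key ranges
		Samples: 256, // per the tests above, more samples => more even shards
	})
	if err != nil {
		return 0, err
	}
	var total int64
	for _, r := range ranges {
		// Each shard is the same query restricted to one key range.
		n, err := datastore.Count(ctx, r.Apply(q))
		if err != nil {
			return 0, err
		}
		total += n
	}
	return total, nil
}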