github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/sort/external_test.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sort

import (
	"context"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/require"

	"github.com/dolthub/dolt/go/store/prolly/tree"
	"github.com/dolthub/dolt/go/store/util/tempfiles"
	"github.com/dolthub/dolt/go/store/val"
)

func TestFlush(t *testing.T) {
	tests := []struct {
		td  val.TupleDesc
		cnt int
	}{
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			cnt: 100,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Int64Enc, Nullable: false},
			),
			cnt: 100,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			cnt: 100,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Int64Enc, Nullable: false},
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			cnt: 100,
		},
	}

	name := func(td val.TupleDesc, cnt int) string {
		b := strings.Builder{}
		sep := ""
		for _, t := range td.Types {
			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
			sep = ", "
		}
		sep = "_"
		fmt.Fprintf(&b, "%s%d", sep, cnt)
		return b.String()
	}

	tmpProv := newProv(t)
	defer tmpProv.Clean()

	ns := tree.NewTestNodeStore()

	keySize := 100

	for _, tt := range tests {
		t.Run(name(tt.td, tt.cnt), func(t *testing.T) {
			km := newKeyMem(tt.cnt * keySize)

			keys := testTuples(ns, tt.td, tt.cnt)
			expSize := 0
			for _, k := range keys {
				expSize += len(k)
				require.True(t, km.insert(k))
			}

			keyCmp := func(l, r val.Tuple) bool {
				return tt.td.Compare(l, r) <= 0
			}

			t.Run("sorting", func(t *testing.T) {
				km.sort(keyCmp)
				ensureSorted(t, km.keys, keyCmp)
			})

			t.Run("mem iter", func(t *testing.T) {
				cnt, size := drainIterCntSize(t, km)
				require.Equal(t, tt.cnt, cnt)
				require.Equal(t, expSize, size)
			})

			t.Run("file iter", func(t *testing.T) {
				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
				require.NoError(t, err)
				cnt, size := drainIterCntSize(t, kf)
				require.Equal(t, tt.cnt, cnt)
				require.Equal(t, expSize, size)
			})
		})
	}
}

func TestMerge(t *testing.T) {
	tests := []struct {
		td     val.TupleDesc
		counts []int
	}{
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			counts: []int{100},
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			counts: []int{100, 100},
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			counts: []int{100, 100, 100, 100},
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			counts: []int{1000, 10000, 10, 100000, 100000},
		},
	}
	name := func(td val.TupleDesc, counts []int) string {
		b := strings.Builder{}
		sep := ""
		for _, t := range td.Types {
			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
			sep = ", "
		}
		sep = "_"
		for _, c := range counts {
			fmt.Fprintf(&b, "%s%d", sep, c)
		}
		return b.String()
	}

	tmpProv := newProv(t)
	defer tmpProv.Clean()

	ns := tree.NewTestNodeStore()

	batchSize := 4096
	keySize := 100

	for _, tt := range tests {
		t.Run(name(tt.td, tt.counts), func(t *testing.T) {
			keyCmp := func(l, r val.Tuple) bool {
				return tt.td.Compare(l, r) <= 0
			}

			var keyMems []keyIterable
			var keyFiles []keyIterable
			expSize := 0
			expCnt := 0
			for _, cnt := range tt.counts {
				km := newKeyMem(cnt * keySize)
				keys := testTuples(ns, tt.td, cnt)
				for _, k := range keys {
					expSize += len(k)
					expCnt++
					require.True(t, km.insert(k))
				}
				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
				require.NoError(t, err)
				keyFiles = append(keyFiles, kf)
				keyMems = append(keyMems, km)
			}

			t.Run("mem merge", func(t *testing.T) {
				target := newKeyFile(mustNewFile(t, tmpProv), batchSize)

				ctx := context.Background()
				m, _ := newFileMerger(ctx, keyCmp, target, keyMems...)
				m.run(ctx)

				cnt, size := drainIterCntSize(t, target)
				require.Equal(t, expCnt, cnt)
				require.Equal(t, expSize, size)
			})

			t.Run("file merge", func(t *testing.T) {
				target := newKeyFile(mustNewFile(t, tmpProv), batchSize)

				ctx := context.Background()
				m, _ := newFileMerger(ctx, keyCmp, target, keyFiles...)
				m.run(ctx)

				cnt, size := drainIterCntSize(t, target)
				require.Equal(t, expCnt, cnt)
				require.Equal(t, expSize, size)
			})
		})
	}
}

func TestCompact(t *testing.T) {
	// run compact until there's only 1 file;
	// check at each iteration that we halved the file count and that cnt and size are still the same
	tests := []struct {
		td      val.TupleDesc
		fileCnt int
	}{
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			fileCnt: 16,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			fileCnt: 64,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			fileCnt: 128,
		},
		{
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			fileCnt: 128,
		},
	}

	name := func(td val.TupleDesc, fileCnt int) string {
		b := strings.Builder{}
		sep := ""
		for _, t := range td.Types {
			fmt.Fprintf(&b, "%s%s", sep, string(t.Enc))
			sep = ", "
		}
		sep = "_"
		fmt.Fprintf(&b, "%s%d", sep, fileCnt)

		return b.String()
	}

	tmpProv := newProv(t)
	defer tmpProv.Clean()

	ns := tree.NewTestNodeStore()

	batchSize := 10
	keySize := 100

	for _, tt := range tests {
		t.Run(name(tt.td, tt.fileCnt), func(t *testing.T) {
			keyCmp := func(l, r val.Tuple) bool {
				return tt.td.Compare(l, r) <= 0
			}

			var keyFiles []keyIterable
			expSize := 0
			expCnt := 0
			for i := 0; i < tt.fileCnt; i++ {
				km := newKeyMem(batchSize * keySize)
				keys := testTuples(ns, tt.td, batchSize)
				for _, k := range keys {
					expSize += len(k)
					expCnt++
					require.True(t, km.insert(k))
				}
				kf, err := km.flush(mustNewFile(t, tmpProv), keyCmp)
				require.NoError(t, err)
				keyFiles = append(keyFiles, kf)
			}

			ctx := context.Background()

			t.Run("file compact", func(t *testing.T) {
				s := NewTupleSorter(batchSize, tt.fileCnt, keyCmp, tmpProv)
				defer s.Close()
				s.files = append(s.files, keyFiles)
				err := s.compact(ctx, 0)

				require.NoError(t, err)
				require.Equal(t, 0, len(s.files[0]))
				require.Equal(t, 1, len(s.files[1]))
				require.Equal(t, 2, len(s.files))

				cnt, size := drainIterCntSize(t, s.files[1][0])
				require.Equal(t, expCnt, cnt)
				require.Equal(t, expSize, size)
			})
		})
	}
}

func TestFileE2E(t *testing.T) {
	// simulate the full lifecycle:
	// vary batch size and file count so that multiple compacts/merges occur;
	// make the batch size and fileMax small enough that
	// we have to spill to disk and compact several times
	tests := []struct {
		name      string
		rows      int
		batchSize int
		fileMax   int
		td        val.TupleDesc
	}{
		{
			name: "uint32",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			rows:      10_000,
			batchSize: 10_000,
			fileMax:   4,
		},
		{
			name: "uint32",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			rows:      10_000,
			batchSize: 1000,
			fileMax:   4,
		},
		{
			name: "uint32",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Uint32Enc, Nullable: false},
			),
			rows:      20_000,
			batchSize: 500,
			fileMax:   16,
		},
		{
			name: "int64",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.Int64Enc, Nullable: false},
			),
			rows:      7_777,
			batchSize: 1000,
			fileMax:   4,
		},
		{
			name: "(string)",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			rows:      10_000,
			batchSize: 100,
			fileMax:   32,
		},
		{
			name: "(string)",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			rows:      10_000,
			batchSize: 483,
			fileMax:   31,
		},
		{
			name: "(string)",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			rows:      1,
			batchSize: 100,
			fileMax:   30,
		},
		{
			name: "(string)",
			td: val.NewTupleDescriptor(
				val.Type{Enc: val.StringEnc, Nullable: false},
			),
			rows:      0,
			batchSize: 100,
			fileMax:   30,
		},
	}

	tmpProv := newProv(t)
	defer tmpProv.Clean()

	ns := tree.NewTestNodeStore()

	for _, tt := range tests {
		t.Run(fmt.Sprintf("%s %d-rows %d-batch %d-files", tt.name, tt.rows, tt.batchSize, tt.fileMax), func(t *testing.T) {
			keyCmp := func(l, r val.Tuple) bool {
				return tt.td.Compare(l, r) <= 0
			}

			ctx := context.Background()
			keys := testTuples(ns, tt.td, tt.rows)
			s := NewTupleSorter(tt.batchSize, tt.fileMax, keyCmp, tmpProv)
			defer s.Close()
			expSize := 0
			for _, k := range keys {
				err := s.Insert(ctx, k)
				require.NoError(t, err)
				expSize += len(k)
			}

			iterable, err := s.Flush(ctx)
			require.NoError(t, err)
			var cnt, size int
			iter, err := iterable.IterAll(ctx)
			require.NoError(t, err)
			defer iter.Close()
			var lastKey val.Tuple
			for {
				k, err := iter.Next(ctx)
				if err != nil {
					break
				}
				if lastKey != nil {
					// keys must come back in keyCmp order
					require.True(t, keyCmp(lastKey, k))
				}
				cnt++
				size += len(k)
				lastKey = k
			}

			require.Equal(t, tt.rows, cnt)
			require.Equal(t, expSize, size)
		})
	}
}

func testTuples(ns tree.NodeStore,
	kd val.TupleDesc, cnt int) []val.Tuple {
	keyBuilder := val.NewTupleBuilder(kd)

	var keys []val.Tuple
	for i := 0; i < cnt; i++ {
		keys = append(keys, tree.RandomTuple(keyBuilder, ns))
	}

	return keys
}

// ensureSorted verifies that every adjacent pair of keys satisfies cmp.
func ensureSorted(t *testing.T, keys []val.Tuple, cmp func(val.Tuple, val.Tuple) bool) {
	for i := 0; i+1 < len(keys); i++ {
		require.True(t, cmp(keys[i], keys[i+1]))
	}
}

func newProv(t *testing.T) *tempfiles.TempFileProviderAt {
	tmpDir := t.TempDir()
	return tempfiles.NewTempFileProviderAt(tmpDir)
}

func mustNewFile(t *testing.T, prov tempfiles.TempFileProvider) *os.File {
	f, err := prov.NewFile("", "external_sort_test_*")
	require.NoError(t, err)
	return f
}

// drainIterCntSize drains the iterator and returns the number of keys seen
// and their total byte size.
func drainIterCntSize(t *testing.T, ki keyIterable) (cnt int, size int) {
	ctx := context.Background()
	iter, err := ki.IterAll(ctx)
	require.NoError(t, err)
	defer iter.Close()
	for {
		k, err := iter.Next(ctx)
		if err != nil {
			break
		}
		cnt++
		size += len(k)
	}
	return cnt, size
}
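
// TestLifecycleSketch is a minimal end-to-end sketch of the sorter lifecycle
// exercised above (insert tuples, flush, iterate in keyCmp order). It only
// reuses APIs already shown in TestFileE2E; the batch size, file limit, and
// row count below are arbitrary illustrative choices, not values from the
// original suite.
func TestLifecycleSketch(t *testing.T) {
	ctx := context.Background()

	tmpProv := newProv(t)
	defer tmpProv.Clean()

	td := val.NewTupleDescriptor(val.Type{Enc: val.Uint32Enc, Nullable: false})
	keyCmp := func(l, r val.Tuple) bool {
		return td.Compare(l, r) <= 0
	}

	// a small batch size and file limit force spills to disk and compactions
	s := NewTupleSorter(64, 4, keyCmp, tmpProv)
	defer s.Close()

	ns := tree.NewTestNodeStore()
	keys := testTuples(ns, td, 1_000)
	for _, k := range keys {
		require.NoError(t, s.Insert(ctx, k))
	}

	// Flush merges the in-memory batch and any spilled files into one sorted iterable.
	iterable, err := s.Flush(ctx)
	require.NoError(t, err)
	iter, err := iterable.IterAll(ctx)
	require.NoError(t, err)
	defer iter.Close()

	var lastKey val.Tuple
	cnt := 0
	for {
		k, err := iter.Next(ctx)
		if err != nil {
			break
		}
		if lastKey != nil {
			require.True(t, keyCmp(lastKey, k))
		}
		lastKey = k
		cnt++
	}
	require.Equal(t, len(keys), cnt)
}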