github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/prop_length_tracker_test.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "fmt" 16 "path" 17 "testing" 18 19 "github.com/sirupsen/logrus" 20 "github.com/stretchr/testify/assert" 21 "github.com/stretchr/testify/require" 22 ) 23 24 func Test_PropertyLengthTracker(t *testing.T) { 25 dirName := t.TempDir() 26 trackerPath := path.Join(dirName, "my_test_shard") 27 l := logrus.New() 28 29 // This test suite doesn't actually test persistence, there is a separate 30 // one. However, we still need to supply a valid path. Since nothing is ever 31 // written, we can use the same one for each sub-test without them 32 // accidentally sharing state. 33 34 t.Run("single prop", func(t *testing.T) { 35 type test struct { 36 values []float32 37 name string 38 floatCompare bool 39 } 40 41 tests := []test{ 42 { 43 values: []float32{2, 2, 3, 100, 100, 500, 7}, 44 name: "mixed_values", 45 floatCompare: true, 46 }, 47 { 48 values: []float32{ 49 1000, 1200, 1000, 1300, 800, 2000, 2050, 50 2070, 900, 51 }, 52 name: "high_values", 53 floatCompare: true, 54 }, 55 { 56 values: []float32{ 57 60000, 50000, 65000, 58 }, 59 name: "very_high_values", 60 floatCompare: true, 61 }, 62 { 63 values: []float32{ 64 1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5, 65 6, 3, 5, 9, 3, 4, 8, 66 }, 67 name: "very_low_values", 68 floatCompare: true, 69 }, 70 { 71 values: []float32{0, 0}, 72 name: "zeros", 73 floatCompare: false, 74 }, 75 } 76 77 for _, test := range tests { 78 t.Run(test.name, func(t *testing.T) { 79 tracker, err := NewJsonPropertyLengthTracker(trackerPath+test.name, l) 80 require.Nil(t, err) 81 82 actualMean := float32(0) 83 for _, v := range test.values { 84 tracker.TrackProperty("my-very-first-prop", v) 85 actualMean += v 86 } 87 actualMean = actualMean / float32(len(test.values)) 88 89 res, err := tracker.PropertyMean("my-very-first-prop") 90 require.Nil(t, err) 91 92 if test.floatCompare { 93 assert.InEpsilon(t, actualMean, res, 0.1) 94 } else { 95 assert.Equal(t, actualMean, res) 96 } 97 require.Nil(t, tracker.Close()) 98 }) 99 } 100 }) 101 102 t.Run("test untrack", func(t *testing.T) { 103 tracker, err := NewJsonPropertyLengthTracker(trackerPath, l) 104 require.Nil(t, err) 105 106 tracker.TrackProperty("test-prop", 1) 107 tracker.TrackProperty("test-prop", 2) 108 tracker.TrackProperty("test-prop", 3) 109 tracker.Flush(false) 110 111 sum, count, mean, err := tracker.PropertyTally("test-prop") 112 require.Nil(t, err) 113 assert.Equal(t, 6, sum) 114 assert.Equal(t, 3, count) 115 assert.InEpsilon(t, 2, mean, 0.1) 116 117 tracker.UnTrackProperty("test-prop", 2) 118 sum, count, mean, err = tracker.PropertyTally("test-prop") 119 require.Nil(t, err) 120 assert.Equal(t, 4, sum) 121 assert.Equal(t, 2, count) 122 assert.InEpsilon(t, 2, mean, 0.1) 123 124 tracker.UnTrackProperty("test-prop", 1) 125 sum, count, mean, err = tracker.PropertyTally("test-prop") 126 require.Nil(t, err) 127 assert.Equal(t, 3, sum) 128 assert.Equal(t, 1, count) 129 assert.InEpsilon(t, 3, mean, 0.1) 130 131 require.Nil(t, tracker.Close()) 132 }) 133 134 t.Run("multiple properties (can all fit on one page)", func(t *testing.T) { 135 type prop struct { 136 values []float32 137 propName string 138 } 139 140 props := []prop{ 141 { 142 values: []float32{2, 2, 3, 100, 100, 500, 7}, 143 propName: "property-numero-uno", 144 }, { 145 values: []float32{ 146 1000, 1200, 1000, 1300, 800, 2000, 2050, 147 2070, 900, 148 }, 149 propName: "the-second-of-the-properties", 150 }, { 151 values: []float32{ 152 60000, 50000, 65000, 153 }, 154 propName: "property_nummer_DREI", 155 }, 156 } 157 158 // This time we use a single tracker 159 tracker, err := NewJsonPropertyLengthTracker(trackerPath, l) 160 require.Nil(t, err) 161 162 for _, prop := range props { 163 for _, v := range prop.values { 164 tracker.TrackProperty(prop.propName, v) 165 } 166 } 167 168 for _, prop := range props { 169 actualMean := float32(0) 170 for _, v := range prop.values { 171 actualMean += v 172 } 173 actualMean = actualMean / float32(len(prop.values)) 174 175 res, err := tracker.PropertyMean(prop.propName) 176 require.Nil(t, err) 177 178 assert.InEpsilon(t, actualMean, res, 0.1) 179 } 180 181 require.Nil(t, tracker.Close()) 182 }) 183 184 t.Run("with more properties that can fit on one page", func(t *testing.T) { 185 // This time we use a single tracker 186 tracker, err := NewJsonPropertyLengthTracker(trackerPath, l) 187 require.Nil(t, err) 188 189 create20PropsAndVerify(t, tracker) 190 191 require.Nil(t, tracker.Close()) 192 }) 193 } 194 195 func create20PropsAndVerify(t *testing.T, tracker *JsonPropertyLengthTracker) { 196 type prop struct { 197 values []float32 198 propName string 199 } 200 201 // the most props we could ever fit on a single page is 16 if there was no 202 // index, which is impossible. This means the practical max is 15, so at 203 // least 5 props should overflow to the second page. 204 propCount := 20 205 props := make([]prop, propCount) 206 207 for i := range props { 208 props[i] = prop{ 209 values: []float32{1, 4, 3, 17}, 210 propName: fmt.Sprintf("prop_%d", i), 211 } 212 } 213 214 for _, prop := range props { 215 for _, v := range prop.values { 216 tracker.TrackProperty(prop.propName, v) 217 } 218 } 219 220 for _, prop := range props { 221 actualMean := float32(0) 222 for _, v := range prop.values { 223 actualMean += v 224 } 225 actualMean = actualMean / float32(len(prop.values)) 226 227 res, err := tracker.PropertyMean(prop.propName) 228 require.Nil(t, err) 229 230 assert.InEpsilon(t, actualMean, res, 0.1) 231 } 232 233 // modify a prop on page 2 and verify 234 tracker.TrackProperty("prop_19", 24) 235 actualMeanForProp20 := float32(1+4+3+17+25) / 5.0 236 res, err := tracker.PropertyMean("prop_19") 237 require.Nil(t, err) 238 239 assert.InEpsilon(t, actualMeanForProp20, res, 0.1) 240 } 241 242 func Test_PropertyLengthTracker_Persistence(t *testing.T) { 243 dirName := t.TempDir() 244 245 path := path.Join(dirName, "my_test_shard") 246 247 var tracker *JsonPropertyLengthTracker 248 l := logrus.New() 249 250 t.Run("initializing an empty tracker, no file present", func(t *testing.T) { 251 tr, err := NewJsonPropertyLengthTracker(path, l) 252 require.Nil(t, err) 253 tracker = tr 254 }) 255 256 t.Run("importing multi-page data and verifying", func(t *testing.T) { 257 create20PropsAndVerify(t, tracker) 258 }) 259 260 t.Run("commit the state to disk", func(t *testing.T) { 261 require.Nil(t, tracker.Flush(false)) 262 }) 263 264 t.Run("shut down the tracker", func(t *testing.T) { 265 require.Nil(t, tracker.Close()) 266 }) 267 268 var secondTracker *JsonPropertyLengthTracker 269 t.Run("initializing a new tracker from the same file", func(t *testing.T) { 270 tr, err := NewJsonPropertyLengthTracker(path, l) 271 require.Nil(t, err) 272 secondTracker = tr 273 }) 274 275 t.Run("verify data is correct after read from disk", func(t *testing.T) { 276 // root page 277 actualMeanForProp0 := float32(1+4+3+17) / 4.0 278 res, err := secondTracker.PropertyMean("prop_0") 279 require.Nil(t, err) 280 assert.InEpsilon(t, actualMeanForProp0, res, 0.1) 281 282 // later page 283 actualMeanForProp20 := float32(1+4+3+17+25) / 5.0 284 res, err = secondTracker.PropertyMean("prop_19") 285 require.Nil(t, err) 286 assert.InEpsilon(t, actualMeanForProp20, res, 0.1) 287 }) 288 } 289 290 // Testing the switch from the old property length tracker to the new one 291 func TestFormatConversion(t *testing.T) { 292 dirName := t.TempDir() 293 294 path := path.Join(dirName, "my_test_shard") 295 296 var tracker *PropertyLengthTracker 297 298 t.Run("initializing an empty tracker, no file present", func(t *testing.T) { 299 tr, err := NewPropertyLengthTracker(path) 300 require.Nil(t, err) 301 tracker = tr 302 }) 303 304 t.Run("importing multi-page data and verifying", func(t *testing.T) { 305 create20PropsAndVerify_old(t, tracker) 306 }) 307 308 t.Run("commit the state to disk", func(t *testing.T) { 309 require.Nil(t, tracker.Flush()) 310 }) 311 312 t.Run("shut down the tracker", func(t *testing.T) { 313 require.Nil(t, tracker.Close()) 314 }) 315 316 var newTracker *JsonPropertyLengthTracker 317 l := logrus.New() 318 319 t.Run("initializing a new tracker from the same file", func(t *testing.T) { 320 tr, err := NewJsonPropertyLengthTracker(path, l) 321 require.Nil(t, err) 322 newTracker = tr 323 }) 324 325 t.Run("verify data is correct after read from disk", func(t *testing.T) { 326 // root page 327 actualMeanForProp0 := float32(1+4+3+17) / 4.0 328 res, err := newTracker.PropertyMean("prop_0") 329 require.Nil(t, err) 330 assert.InEpsilon(t, actualMeanForProp0, res, 0.1) 331 332 // later page 333 actualMeanForProp20 := float32(1+4+3+17+25) / 5.0 334 res, err = newTracker.PropertyMean("prop_19") 335 require.Nil(t, err) 336 assert.InEpsilon(t, actualMeanForProp20, res, 0.1) 337 338 res, err = newTracker.PropertyMean("prop_22") 339 require.Nil(t, err) 340 assert.EqualValues(t, res, 0) 341 sum, count, average, _ := newTracker.PropertyTally("prop_22") 342 assert.EqualValues(t, 0, sum) 343 assert.EqualValues(t, 3, count) 344 assert.EqualValues(t, 0, average) 345 }) 346 } 347 348 func create20PropsAndVerify_old(t *testing.T, tracker *PropertyLengthTracker) { 349 type prop struct { 350 values []float32 351 propName string 352 } 353 354 // the most props we could ever fit on a single page is 16 if there was no 355 // index, which is impossible. This means the practical max is 15, so at 356 // least 5 props should overflow to the second page. 357 propCount := 20 358 props := make([]prop, propCount) 359 360 for i := range props { 361 props[i] = prop{ 362 values: []float32{1, 4, 3, 17}, 363 propName: fmt.Sprintf("prop_%d", i), 364 } 365 } 366 367 for _, prop := range props { 368 for _, v := range prop.values { 369 tracker.TrackProperty(prop.propName, v) 370 } 371 } 372 373 tracker.TrackProperty("prop_22", 0) 374 tracker.TrackProperty("prop_22", 0) 375 tracker.TrackProperty("prop_22", 0) 376 377 for _, prop := range props { 378 actualMean := float32(0) 379 for _, v := range prop.values { 380 actualMean += v 381 } 382 actualMean = actualMean / float32(len(prop.values)) 383 384 res, err := tracker.PropertyMean(prop.propName) 385 require.Nil(t, err) 386 387 assert.InEpsilon(t, actualMean, res, 0.1) 388 } 389 390 // modify a prop on page 2 and verify 391 tracker.TrackProperty("prop_19", 24) 392 actualMeanForProp20 := float32(1+4+3+17+25) / 5.0 393 res, err := tracker.PropertyMean("prop_19") 394 require.Nil(t, err) 395 396 assert.InEpsilon(t, actualMeanForProp20, res, 0.1) 397 398 res, err = tracker.PropertyMean("prop_22") 399 require.Nil(t, err) 400 assert.EqualValues(t, res, 0) 401 402 sum, _, average, _ := tracker.PropertyTally("prop_22") 403 assert.EqualValues(t, 0, sum) 404 // assert.EqualValues(t, 3, count) 405 assert.EqualValues(t, 0, average) 406 } 407 408 // Test the old property length tracker 409 410 func TestOldPropertyLengthTracker(t *testing.T) { 411 dirName := t.TempDir() 412 trackerPath := path.Join(dirName, "my_test_shard") 413 414 // This test suite doesn't actually test persistence, there is a separate 415 // one. However, we still need to supply a valid path. Since nothing is ever 416 // written, we can use the same one for each sub-test without them 417 // accidentally sharing state. 418 419 t.Run("single prop", func(t *testing.T) { 420 type test struct { 421 values []float32 422 name string 423 floatCompare bool 424 } 425 426 tests := []test{ 427 { 428 values: []float32{2, 2, 3, 100, 100, 500, 7}, 429 name: "mixed_values", 430 floatCompare: true, 431 }, { 432 values: []float32{ 433 1000, 1200, 1000, 1300, 800, 2000, 2050, 434 2070, 900, 435 }, 436 name: "high_values", 437 floatCompare: true, 438 }, { 439 values: []float32{ 440 60000, 50000, 65000, 441 }, 442 name: "very_high_values", 443 floatCompare: true, 444 }, { 445 values: []float32{ 446 1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5, 447 6, 3, 5, 9, 3, 4, 8, 448 }, 449 name: "very_low_values", 450 floatCompare: true, 451 }, { 452 values: []float32{0, 0}, 453 name: "zeros", 454 floatCompare: false, 455 }, 456 } 457 458 for _, test := range tests { 459 t.Run(test.name, func(t *testing.T) { 460 tracker, err := NewPropertyLengthTracker(trackerPath + test.name) 461 require.Nil(t, err) 462 463 actualMean := float32(0) 464 for _, v := range test.values { 465 tracker.TrackProperty("my-very-first-prop", v) 466 actualMean += v 467 } 468 actualMean = actualMean / float32(len(test.values)) 469 470 res, err := tracker.PropertyMean("my-very-first-prop") 471 require.Nil(t, err) 472 473 if test.floatCompare { 474 assert.InEpsilon(t, actualMean, res, 0.1) 475 } else { 476 assert.Equal(t, actualMean, res) 477 } 478 require.Nil(t, tracker.Close()) 479 }) 480 } 481 }) 482 483 t.Run("test untrack", func(t *testing.T) { 484 tracker, err := NewPropertyLengthTracker(trackerPath) 485 require.Nil(t, err) 486 487 tracker.TrackProperty("test-prop", 1) 488 tracker.TrackProperty("test-prop", 2) 489 tracker.TrackProperty("test-prop", 3) 490 tracker.Flush() 491 492 sum, count, mean, err := tracker.PropertyTally("test-prop") 493 require.Nil(t, err) 494 assert.Equal(t, 6, sum) 495 assert.Equal(t, 3, count) 496 assert.InEpsilon(t, 2, mean, 0.1) 497 498 tracker.UnTrackProperty("test-prop", 2) 499 sum, count, mean, err = tracker.PropertyTally("test-prop") 500 require.Nil(t, err) 501 assert.Equal(t, 4, sum) 502 assert.Equal(t, 2, count) 503 assert.InEpsilon(t, 2, mean, 0.1) 504 505 tracker.UnTrackProperty("test-prop", 1) 506 sum, count, mean, err = tracker.PropertyTally("test-prop") 507 require.Nil(t, err) 508 assert.Equal(t, 3, sum) 509 assert.Equal(t, 1, count) 510 assert.InEpsilon(t, 3, mean, 0.1) 511 512 require.Nil(t, tracker.Close()) 513 }) 514 515 t.Run("multiple properties (can all fit on one page)", func(t *testing.T) { 516 type prop struct { 517 values []float32 518 propName string 519 } 520 521 props := []prop{ 522 { 523 values: []float32{2, 2, 3, 100, 100, 500, 7}, 524 propName: "property-numero-uno", 525 }, { 526 values: []float32{ 527 1000, 1200, 1000, 1300, 800, 2000, 2050, 528 2070, 900, 529 }, 530 propName: "the-second-of-the-properties", 531 }, { 532 values: []float32{ 533 60000, 50000, 65000, 534 }, 535 propName: "property_nummer_DREI", 536 }, 537 } 538 539 // This time we use a single tracker 540 tracker, err := NewPropertyLengthTracker(trackerPath) 541 require.Nil(t, err) 542 543 for _, prop := range props { 544 for _, v := range prop.values { 545 tracker.TrackProperty(prop.propName, v) 546 } 547 } 548 549 for _, prop := range props { 550 actualMean := float32(0) 551 for _, v := range prop.values { 552 actualMean += v 553 } 554 actualMean = actualMean / float32(len(prop.values)) 555 556 res, err := tracker.PropertyMean(prop.propName) 557 require.Nil(t, err) 558 559 assert.InEpsilon(t, actualMean, res, 0.1) 560 } 561 562 require.Nil(t, tracker.Close()) 563 }) 564 565 t.Run("with more properties that can fit on one page", func(t *testing.T) { 566 // This time we use a single tracker 567 tracker, err := NewPropertyLengthTracker(trackerPath) 568 require.Nil(t, err) 569 570 create20PropsAndVerify_old(t, tracker) 571 572 require.Nil(t, tracker.Close()) 573 }) 574 } 575 576 func TestOldPropertyLengthTracker_Persistence(t *testing.T) { 577 dirName := t.TempDir() 578 579 path := path.Join(dirName, "my_test_shard") 580 581 var tracker *PropertyLengthTracker 582 583 t.Run("initializing an empty tracker, no file present", func(t *testing.T) { 584 tr, err := NewPropertyLengthTracker(path) 585 require.Nil(t, err) 586 tracker = tr 587 }) 588 589 t.Run("importing multi-page data and verifying", func(t *testing.T) { 590 create20PropsAndVerify_old(t, tracker) 591 }) 592 593 t.Run("commit the state to disk", func(t *testing.T) { 594 require.Nil(t, tracker.Flush()) 595 }) 596 597 t.Run("shut down the tracker", func(t *testing.T) { 598 require.Nil(t, tracker.Close()) 599 }) 600 601 var secondTracker *PropertyLengthTracker 602 t.Run("initializing a new tracker from the same file", func(t *testing.T) { 603 tr, err := NewPropertyLengthTracker(path) 604 require.Nil(t, err) 605 secondTracker = tr 606 }) 607 608 t.Run("verify data is correct after read from disk", func(t *testing.T) { 609 // root page 610 actualMeanForProp0 := float32(1+4+3+17) / 4.0 611 res, err := secondTracker.PropertyMean("prop_0") 612 require.Nil(t, err) 613 assert.InEpsilon(t, actualMeanForProp0, res, 0.1) 614 615 // later page 616 actualMeanForProp20 := float32(1+4+3+17+25) / 5.0 617 res, err = secondTracker.PropertyMean("prop_19") 618 require.Nil(t, err) 619 assert.InEpsilon(t, actualMeanForProp20, res, 0.1) 620 }) 621 622 t.Run("shut down the second tracker", func(t *testing.T) { 623 require.Nil(t, secondTracker.Close()) 624 }) 625 } 626 627 func Test_PropertyLengthTracker_Overflow(t *testing.T) { 628 dirName := t.TempDir() 629 path := path.Join(dirName, "my_test_shard") 630 631 tracker, err := NewPropertyLengthTracker(path) 632 require.Nil(t, err) 633 634 for i := 0; i < 16*15; i++ { 635 err := tracker.TrackProperty(fmt.Sprintf("prop_%v", i), float32(i)) 636 require.Nil(t, err) 637 } 638 639 // Check that property that would cause the internal counter to overflow is not added 640 err = tracker.TrackProperty("OVERFLOW", float32(123)) 641 require.NotNil(t, err) 642 643 require.Nil(t, tracker.Close()) 644 }