github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/new_prop_length_tracker.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"encoding/json"
    16  	"math"
    17  	"os"
    18  	"sync"
    19  
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  )
    23  
// MAX_BUCKETS is the upper bound used when mapping property lengths to bucket
// ids: bucketFromValue reports any computed bucket id above MAX_BUCKETS-1 as
// MAX_BUCKETS. The old-format conversion in NewJsonPropertyLengthTracker reads
// bucket ids 0..MAX_BUCKETS-1 plus the special -1 bucket. As noted in the
// tracker documentation in this file, altering the number of buckets breaks
// compatibility with the old format.
var MAX_BUCKETS = 64
    25  
// PropLenData is the state of the tracker that is marshalled to JSON and
// written to disk.
//
// NOTE: this file builds PropLenData with positional composite literals
// (BucketedData, SumData, CountData), so the field order must not change.
type PropLenData struct {
	// BucketedData maps property name -> bucket id -> number of tracked values
	// that fell into that bucket.
	BucketedData map[string]map[int]int
	// SumData maps property name -> sum of all tracked values (each value is
	// truncated to int before it is added).
	SumData map[string]int
	// CountData maps property name -> number of tracked values.
	CountData map[string]int
}
    31  
// JsonPropertyLengthTracker tracks property lengths per property name and
// persists them as JSON. The embedded mutex is taken by the methods that read
// or modify data.
type JsonPropertyLengthTracker struct {
	// path is the location of the tracker file; Flush also writes path+".bak".
	path string
	// data holds the tracked statistics that are marshalled to disk.
	data *PropLenData
	sync.Mutex
	// UnlimitedBuckets makes bucketFromValue use int(value) as the bucket id
	// instead of one of the limited, exponentially spaced buckets.
	UnlimitedBuckets bool
	// logger receives warnings, e.g. when data is unexpectedly nil.
	logger logrus.FieldLogger
}
    39  
// This type replaces the old PropertyLengthTracker.  It fixes a bug and provides a
// simpler, easier-to-maintain implementation.  The format is future-proofed: new
// data can be added to the file without breaking old versions of Weaviate.
    43  //
    44  // * We need to know the mean length of all properties for BM25 calculations
    45  // * The prop length tracker is an approximate tracker that uses buckets and simply counts the entries in the buckets
    46  // * There is a precise global counter for the sum of all lengths and a precise global counter for the number of entries
    47  // * It only exists for string/text (and their array forms) because these are the only prop types that can be used with BM25
    48  // * It should probably always exist when indexSearchable is set on a text prop going forward
    49  //
    50  // Property lengths are put into one of 64 buckets.  The value of a bucket is given by the formula:
    51  //
    52  // float32(4 * math.Pow(1.25, float64(bucket)-3.5))
    53  //
    54  // Which as implemented gives bucket values of 0,1,2,3,4,5,6,8,10,13,17,21,26,33,41,52,65,81,101,127,158,198,248,310,387,484,606,757,947,1183,1479,1849,2312,2890,3612,4515,5644,7055,8819,11024,13780,17226,21532,26915,33644,42055,52569,65712,82140,102675,128344,160430,200537,250671,313339,391674,489593,611991,764989,956237,1195296,1494120,1867651,2334564
    55  //
    56  // These buckets are then recorded to disk.  The original implementation was a binary format where all the data was tracked using manual pointer arithmetic.  The new version tracks the statistics in a go map, and marshals that into JSON before writing it to disk.  There is no measurable difference in speed between these two implementations while importing data, however it appears to slow the queries by about 15% (while improving recall by ~25%).
    57  //
    58  // The new tracker is exactly compatible with the old format to enable migration, which is why there is a -1 bucket.  Altering the number of buckets or their values will break compatibility.
    59  //
    60  // Set UnlimitedBuckets to true for precise length tracking
    61  //
    62  // Note that some of the code in this file is forced by the need to be backwards-compatible with the old format.  Once we are confident that all users have migrated to the new format, we can remove the old format code and simplify this file.
    63  
    64  // NewJsonPropertyLengthTracker creates a new tracker and loads the data from the given path.  If the file is in the old format, it will be converted to the new format.
    65  func NewJsonPropertyLengthTracker(path string, logger logrus.FieldLogger) (t *JsonPropertyLengthTracker, err error) {
    66  	// Recover and return empty tracker on panic
    67  	defer func() {
    68  		if r := recover(); r != nil {
    69  			t.logger.Printf("Recovered from panic in NewJsonPropertyLengthTracker, original error: %v", r)
    70  			t = &JsonPropertyLengthTracker{
    71  				data:             &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)},
    72  				path:             path,
    73  				UnlimitedBuckets: false,
    74  			}
    75  			err = errors.Errorf("Recovered from panic in NewJsonPropertyLengthTracker, original error: %v", r)
    76  		}
    77  	}()
    78  
    79  	t = &JsonPropertyLengthTracker{
    80  		data:             &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)},
    81  		path:             path,
    82  		UnlimitedBuckets: false,
    83  		logger:           logger,
    84  	}
    85  
    86  	// read the file into memory
    87  	bytes, err := os.ReadFile(path)
    88  	if err != nil {
    89  		if os.IsNotExist(err) { // File doesn't exist, probably a new class(or a recount), return empty tracker
    90  			t.Flush(false)
    91  			return t, nil
    92  		}
    93  		return nil, errors.Wrap(err, "read property length tracker file:"+path)
    94  	}
    95  
    96  	if len(bytes) == 0 {
    97  		return nil, errors.Errorf("failed sanity check, empty prop len tracker file %s has length 0.  Delete file and set environment variable RECOUNT_PROPERTIES_AT_STARTUP to true", path)
    98  	}
    99  
   100  	// We don't have data file versioning, so we try to parse it as json.  If the parse fails, it is probably the old format file, so we call the old format loader and copy everything across.
   101  	if err = json.Unmarshal(bytes, &t.data); err != nil {
   102  		// It's probably the old format file, load the old format and convert it to the new format
   103  		plt, err := NewPropertyLengthTracker(path)
   104  		if err != nil {
   105  			return nil, errors.Wrap(err, "convert old property length tracker")
   106  		}
   107  
   108  		propertyNames := plt.PropertyNames()
   109  		data := &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}
   110  		// Loop over every page and bucket in the old tracker and add it to the new tracker
   111  		for _, name := range propertyNames {
   112  			data.BucketedData[name] = make(map[int]int, MAX_BUCKETS)
   113  			data.CountData[name] = 0
   114  			data.SumData[name] = 0
   115  			for i := 0; i <= MAX_BUCKETS; i++ {
   116  				fromBucket := i
   117  				if i == MAX_BUCKETS {
   118  					fromBucket = -1
   119  				}
   120  				count, err := plt.BucketCount(name, uint16(fromBucket))
   121  				if err != nil {
   122  					return nil, errors.Wrap(err, "convert old property length tracker")
   123  				}
   124  				data.BucketedData[name][fromBucket] = int(count)
   125  				value := float32(0)
   126  				if fromBucket == -1 {
   127  					value = 0
   128  				} else {
   129  					value = plt.valueFromBucket(uint16(fromBucket))
   130  				}
   131  
   132  				data.SumData[name] = data.SumData[name] + int(value)*int(count)
   133  				data.CountData[name] = data.CountData[name] + int(count)
   134  			}
   135  		}
   136  		t.data = data
   137  		t.Flush(true)
   138  		plt.Close()
   139  		plt.Drop()
   140  		t.Flush(false)
   141  	}
   142  	t.path = path
   143  
   144  	// Make really sure we aren't going to crash on a nil pointer
   145  	if t.data == nil {
   146  		return nil, errors.Errorf("failed sanity check, prop len tracker file %s has nil data.  Delete file and set environment variable RECOUNT_PROPERTIES_AT_STARTUP to true", path)
   147  	}
   148  	return t, nil
   149  }
   150  
   151  func (t *JsonPropertyLengthTracker) Clear() {
   152  	t.Lock()
   153  	defer t.Unlock()
   154  
   155  	t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}
   156  }
   157  
// FileName returns the path of the tracker file on disk. It returns the stored
// path as-is and does not check that the file exists.
func (t *JsonPropertyLengthTracker) FileName() string {
	return t.path
}
   162  
   163  // Adds a new value to the tracker
   164  func (t *JsonPropertyLengthTracker) TrackProperty(propName string, value float32) error {
   165  	t.Lock()
   166  	defer t.Unlock()
   167  
   168  	// Remove this check once we are confident that all users have migrated to the new format
   169  	if t.data == nil {
   170  		t.logger.Print("WARNING: t.data is nil in TrackProperty, initializing to empty tracker")
   171  		t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}
   172  	}
   173  	t.data.SumData[propName] = t.data.SumData[propName] + int(value)
   174  	t.data.CountData[propName] = t.data.CountData[propName] + 1
   175  
   176  	bucketId := t.bucketFromValue(value)
   177  	if _, ok := t.data.BucketedData[propName]; ok {
   178  		t.data.BucketedData[propName][int(bucketId)] = t.data.BucketedData[propName][int(bucketId)] + 1
   179  	} else {
   180  
   181  		t.data.BucketedData[propName] = make(map[int]int, 64+1)
   182  		t.data.BucketedData[propName][int(bucketId)] = 1
   183  	}
   184  
   185  	return nil
   186  }
   187  
   188  // Removes a value from the tracker
   189  func (t *JsonPropertyLengthTracker) UnTrackProperty(propName string, value float32) error {
   190  	t.Lock()
   191  	defer t.Unlock()
   192  
   193  	// Remove this check once we are confident that all users have migrated to the new format
   194  	if t.data == nil {
   195  		t.logger.Print("WARNING: t.data is nil in TrackProperty, initializing to empty tracker")
   196  		t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}
   197  	}
   198  	t.data.SumData[propName] = t.data.SumData[propName] - int(value)
   199  	t.data.CountData[propName] = t.data.CountData[propName] - 1
   200  
   201  	bucketId := t.bucketFromValue(value)
   202  	if _, ok := t.data.BucketedData[propName]; ok {
   203  		t.data.BucketedData[propName][int(bucketId)] = t.data.BucketedData[propName][int(bucketId)] - 1
   204  	} else {
   205  		return errors.New("property not found")
   206  	}
   207  
   208  	return nil
   209  }
   210  
   211  // Returns the bucket that the given value belongs to
   212  func (t *JsonPropertyLengthTracker) bucketFromValue(value float32) int {
   213  	if t.UnlimitedBuckets {
   214  		return int(value)
   215  	}
   216  	if value <= 5.00 {
   217  		return int(value) - 1
   218  	}
   219  
   220  	bucket := int(math.Log(float64(value)/4.0)/math.Log(1.25) + 4)
   221  	if bucket > MAX_BUCKETS-1 {
   222  		return MAX_BUCKETS
   223  	}
   224  	return int(bucket)
   225  }
   226  
   227  // Returns the average length of the given property
   228  func (t *JsonPropertyLengthTracker) PropertyMean(propName string) (float32, error) {
   229  	t.Lock()
   230  	defer t.Unlock()
   231  
   232  	sum, ok := t.data.SumData[propName]
   233  	if !ok {
   234  		return 0, nil
   235  	}
   236  	count, ok := t.data.CountData[propName]
   237  	if !ok {
   238  		return 0, nil
   239  	}
   240  
   241  	return float32(sum) / float32(count), nil
   242  }
   243  
   244  // returns totalPropertyLength, totalCount, average propertyLength = sum / totalCount, total propertylength, totalCount, error
   245  func (t *JsonPropertyLengthTracker) PropertyTally(propName string) (int, int, float64, error) {
   246  	t.Lock()
   247  	defer t.Unlock()
   248  	sum, ok := t.data.SumData[propName]
   249  	if !ok {
   250  		return 0, 0, 0, nil // Required to match the old prop tracker (for now)
   251  	}
   252  	count, ok := t.data.CountData[propName]
   253  	if !ok {
   254  		return 0, 0, 0, nil // Required to match the old prop tracker (for now)
   255  	}
   256  	return sum, count, float64(sum) / float64(count), nil
   257  }
   258  
   259  // Writes the current state of the tracker to disk.  (flushBackup = true) will only write the backup file
   260  func (t *JsonPropertyLengthTracker) Flush(flushBackup bool) error {
   261  	if !flushBackup { // Write the backup file first
   262  		t.Flush(true)
   263  	}
   264  
   265  	t.Lock()
   266  	defer t.Unlock()
   267  
   268  	bytes, err := json.Marshal(t.data)
   269  	if err != nil {
   270  		return err
   271  	}
   272  
   273  	filename := t.path
   274  	if flushBackup {
   275  		filename = t.path + ".bak"
   276  	}
   277  
   278  	// Do a write+rename to avoid corrupting the file if we crash while writing
   279  	tempfile := filename + ".tmp"
   280  
   281  	err = WriteFile(tempfile, bytes, 0o666)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	err = os.Rename(tempfile, filename)
   287  	if err != nil {
   288  		return err
   289  	}
   290  
   291  	return nil
   292  }
   293  
   294  func WriteFile(name string, data []byte, perm os.FileMode) error {
   295  	f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
   296  	if err != nil {
   297  		return err
   298  	}
   299  	defer f.Close()
   300  
   301  	_, err = f.Write(data)
   302  	if err != nil {
   303  		return err
   304  	}
   305  
   306  	// TODO: f.Sync() is introducing performance penalization at this point
   307  	// it will be addressed as part of another PR
   308  
   309  	return nil
   310  }
   311  
   312  // Closes the tracker and removes the backup file
   313  func (t *JsonPropertyLengthTracker) Close() error {
   314  	if err := t.Flush(false); err != nil {
   315  		return errors.Wrap(err, "flush before closing")
   316  	}
   317  
   318  	t.Lock()
   319  	defer t.Unlock()
   320  
   321  	t.data.BucketedData = nil
   322  
   323  	return nil
   324  }
   325  
   326  // Drop removes the tracker from disk
   327  func (t *JsonPropertyLengthTracker) Drop() error {
   328  	t.Close()
   329  
   330  	t.Lock()
   331  	defer t.Unlock()
   332  
   333  	t.data.BucketedData = nil
   334  
   335  	if err := os.Remove(t.path); err != nil {
   336  		return errors.Wrap(err, "remove prop length tracker state from disk:"+t.path)
   337  	}
   338  	if err := os.Remove(t.path + ".bak"); err != nil {
   339  		return errors.Wrap(err, "remove prop length tracker state from disk:"+t.path+".bak")
   340  	}
   341  
   342  	return nil
   343  }