github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/prop_length_tracker.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"encoding/binary"
    16  	"fmt"
    17  	"io"
    18  	"math"
    19  	"os"
    20  	"sync"
    21  
    22  	"github.com/pkg/errors"
    23  )
    24  
    25  // Page Design
    26  // | Bytes     | Description                                      |
    27  // | --------- | ------------------------------------------------ |
    28  // | start     | page is now 0
    29  // | 0-1       | uint16 pointer to last index byte
    30  // | 2-3       | uint16 pointer for property name length
    31  // | 4-n       | property name
    32  // | ...       | repeat length+pointer pattern
    33  // | 3584-3840 | second property buckets (64 buckets of float32)
    34  // | 3840-4096 | first property buckets
    35  // | repeat    | page is now 1, repeat all of above
    36  //
    37  // Fixed Assumptions:
    38  //   - First two bytes always used to indicate end of index, minimal value is 02,
    39  //     as the first possible value with index length=0 is after the two bytes
    40  //     themselves.
    41  //   - 64 buckets of float32 per property (=256B per prop), excluding the index
    42  //   - One index row is always 4+len(propName), consisting of a uint16 prop name
    43  //     length pointer, the name itself and an offset pointer pointing to the start
    44  //     (first byte) of the buckets
    45  //
    46  // The counter to the last index byte is only an uint16, so it can at maximum address 65535. This will overflow when the
    47  // 16th page is added (eg at page=15). To avoid a crash an error is returned in this case, but we will need to change
    48  // the byteformat to fix this.
    49  type PropertyLengthTracker struct {
    50  	file  *os.File
    51  	path  string
    52  	pages []byte
    53  	sync.Mutex
    54  }
    55  
    56  func NewPropertyLengthTracker(path string) (*PropertyLengthTracker, error) {
    57  	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o666)
    58  	if err != nil {
    59  		return nil, err
    60  	}
    61  
    62  	stat, err := f.Stat()
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  
    67  	t := &PropertyLengthTracker{
    68  		pages: nil,
    69  		file:  f,
    70  		path:  path,
    71  	}
    72  
    73  	if stat.Size() > 0 {
    74  		// the file has existed before, we need to initialize with its content, we
    75  		// can read the entire contents into memory
    76  		existingPages, err := io.ReadAll(f)
    77  		if err != nil {
    78  			return nil, errors.Wrap(err, "read initial count from file")
    79  		}
    80  
    81  		if len(existingPages)%4096 != 0 {
    82  			return nil, errors.Errorf(
    83  				"failed sanity check, prop len tracker file %s has length %d", path,
    84  				len(existingPages))
    85  		}
    86  
    87  		t.pages = existingPages
    88  	} else {
    89  		// this is the first time this is being created, initialize with an empty
    90  		// page
    91  		t.pages = make([]byte, 4096)
    92  		// set initial end-of-index offset to 2
    93  		binary.LittleEndian.PutUint16(t.pages[0:2], 2)
    94  	}
    95  
    96  	return t, nil
    97  }
    98  
    99  func (t *PropertyLengthTracker) BucketCount(propName string, bucket uint16) (uint16, error) {
   100  	t.Lock()
   101  	defer t.Unlock()
   102  
   103  	page, offset, ok := t.propExists(propName)
   104  	if !ok {
   105  		return 0, fmt.Errorf("property %v does not exist in OldPropertyLengthTracker", propName)
   106  	}
   107  
   108  	offset = offset + page*4096
   109  
   110  	o := offset + (bucket * 4)
   111  	v := binary.LittleEndian.Uint32(t.pages[o : o+4])
   112  	count := math.Float32frombits(v)
   113  
   114  	return uint16(count), nil
   115  }
   116  
   117  func (t *PropertyLengthTracker) PropertyNames() []string {
   118  	var names []string
   119  	pages := len(t.pages) / int(4096)
   120  	for page := 0; page < pages; page++ {
   121  		pageStart := page * int(4096)
   122  
   123  		relativeEOI := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2]) // t.uint16At(pageStart)
   124  		EOI := pageStart + int(relativeEOI)
   125  
   126  		offset := int(pageStart) + 2
   127  		for offset < EOI {
   128  			propNameLength := int(binary.LittleEndian.Uint16(t.pages[offset : offset+2])) // int(t.uint16At(offset))
   129  			offset += 2
   130  
   131  			propName := t.pages[offset : offset+propNameLength]
   132  			offset += propNameLength
   133  
   134  			offset += 2
   135  
   136  			names = append(names, string(propName))
   137  		}
   138  	}
   139  	return names
   140  }
   141  
   142  func (t *PropertyLengthTracker) TrackProperty(propName string, value float32) error {
   143  	t.Lock()
   144  	defer t.Unlock()
   145  
   146  	var page uint16
   147  	var relBucketOffset uint16
   148  	if p, o, ok := t.propExists(propName); ok {
   149  		page = p
   150  		relBucketOffset = o
   151  	} else {
   152  		var err error
   153  		page, relBucketOffset, err = t.addProperty(propName)
   154  		if err != nil {
   155  			return err
   156  		}
   157  	}
   158  
   159  	bucketOffset := page*4096 + relBucketOffset + t.bucketFromValue(value)*4
   160  
   161  	v := binary.LittleEndian.Uint32(t.pages[bucketOffset : bucketOffset+4])
   162  	currentValue := math.Float32frombits(v)
   163  	currentValue += 1
   164  	v = math.Float32bits(currentValue)
   165  	binary.LittleEndian.PutUint32(t.pages[bucketOffset:bucketOffset+4], v)
   166  	return nil
   167  }
   168  
   169  func (t *PropertyLengthTracker) UnTrackProperty(propName string, value float32) error {
   170  	t.Lock()
   171  	defer t.Unlock()
   172  
   173  	var page uint16
   174  	var relBucketOffset uint16
   175  	if p, o, ok := t.propExists(propName); ok {
   176  		page = p
   177  		relBucketOffset = o
   178  	} else {
   179  		return fmt.Errorf("property %v does not exist in OldPropertyLengthTracker", propName)
   180  	}
   181  
   182  	bucketOffset := page*4096 + relBucketOffset + t.bucketFromValue(value)*4
   183  
   184  	v := binary.LittleEndian.Uint32(t.pages[bucketOffset : bucketOffset+4])
   185  	currentValue := math.Float32frombits(v)
   186  	currentValue -= 1
   187  	v = math.Float32bits(currentValue)
   188  	binary.LittleEndian.PutUint32(t.pages[bucketOffset:bucketOffset+4], v)
   189  	return nil
   190  }
   191  
   192  // propExists returns page number, relative offset on page, and a bool whether
   193  // the prop existed at all. The first to values have no meaning if the latter
   194  // is false
   195  func (t *PropertyLengthTracker) propExists(needle string) (uint16, uint16, bool) {
   196  	pages := len(t.pages) / 4096
   197  	for page := 0; page < pages; page++ {
   198  		pageStart := page * 4096
   199  
   200  		relativeEOI := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2])
   201  		EOI := pageStart + int(relativeEOI)
   202  
   203  		offset := int(pageStart) + 2
   204  		for offset < EOI {
   205  			propNameLength := int(binary.LittleEndian.Uint16(
   206  				t.pages[offset : offset+2]))
   207  			offset += 2
   208  
   209  			propName := t.pages[offset : offset+propNameLength]
   210  			offset += propNameLength
   211  			bucketPointer := binary.LittleEndian.Uint16(
   212  				t.pages[offset : offset+2])
   213  			offset += 2
   214  
   215  			if string(propName) == needle {
   216  				return uint16(page), bucketPointer, true
   217  			}
   218  
   219  		}
   220  	}
   221  	return 0, 0, false
   222  }
   223  
   224  func (t *PropertyLengthTracker) addProperty(propName string) (uint16, uint16, error) {
   225  	page := uint16(0)
   226  
   227  	for {
   228  		propNameBytes := []byte(propName)
   229  		t.createPageIfNotExists(page)
   230  		pageStart := page * 4096
   231  		lastBucketOffset := pageStart + 4096
   232  
   233  		relativeOffset := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2])
   234  		offset := pageStart + relativeOffset
   235  		if relativeOffset != 2 {
   236  			// relative offset is other than 2, so there are also props in. This
   237  			// means we can take the value of offset-2 to read the bucket offset
   238  			lastBucketOffset = pageStart + binary.LittleEndian.
   239  				Uint16(t.pages[offset-2:offset])
   240  		}
   241  
   242  		if !t.canPageFit(propNameBytes, offset, lastBucketOffset) {
   243  			page++
   244  			// overflow of uint16 variable that tracks the size of the tracker
   245  			if page > 15 {
   246  				return 0, 0, fmt.Errorf("could not add property %v, to PropertyLengthTracker, because the total"+
   247  					"length of all properties is too long", propName)
   248  			}
   249  			continue
   250  		}
   251  
   252  		propNameLength := uint16(len(propNameBytes))
   253  		binary.LittleEndian.PutUint16(t.pages[offset:offset+2], propNameLength)
   254  		offset += 2
   255  		copy(t.pages[offset:offset+propNameLength], propNameBytes)
   256  		offset += propNameLength
   257  
   258  		newBucketOffset := lastBucketOffset - 256 - pageStart
   259  		binary.LittleEndian.PutUint16(t.pages[offset:offset+2], newBucketOffset)
   260  		offset += 2
   261  
   262  		// update end of index offset for page, since the prop name index has
   263  		// now grown
   264  		binary.LittleEndian.PutUint16(t.pages[pageStart:pageStart+2], offset-pageStart)
   265  		return page, newBucketOffset, nil
   266  	}
   267  }
   268  
   269  func (t *PropertyLengthTracker) canPageFit(propName []byte,
   270  	offset uint16, lastBucketOffset uint16,
   271  ) bool {
   272  	// lastBucketOffset represents the end of the writable area, offset
   273  	// represents the start, which means we can take the delta to see // how
   274  	// much space is left on this page
   275  	spaceLeft := lastBucketOffset - offset
   276  
   277  	// we need to write 256 bytes for the buckets, plus two pointers of uint16
   278  	spaceNeeded := uint16(len(propName)+4) + 256
   279  
   280  	return spaceLeft >= spaceNeeded
   281  }
   282  
   283  func (t *PropertyLengthTracker) bucketFromValue(value float32) uint16 {
   284  	if value <= 5.00 {
   285  		return uint16(value) - 1
   286  	}
   287  
   288  	bucket := int(math.Log(float64(value)/4.0)/math.Log(1.25) + 4)
   289  	if bucket > 63 {
   290  		return 64
   291  	}
   292  	return uint16(bucket)
   293  }
   294  
   295  func (t *PropertyLengthTracker) valueFromBucket(bucket uint16) float32 {
   296  	if bucket <= 5 {
   297  		return float32(bucket + 1)
   298  	}
   299  
   300  	return float32(4 * math.Pow(1.25, float64(bucket)-3.5))
   301  }
   302  
   303  func (t *PropertyLengthTracker) PropertyMean(propName string) (float32, error) {
   304  	t.Lock()
   305  	defer t.Unlock()
   306  
   307  	page, offset, ok := t.propExists(propName)
   308  	if !ok {
   309  		return 0, nil
   310  	}
   311  
   312  	sum := float32(0)
   313  	totalCount := float32(0)
   314  	bucket := uint16(0)
   315  
   316  	offset = offset + page*4096
   317  	for o := offset; o < offset+256; o += 4 {
   318  		v := binary.LittleEndian.Uint32(t.pages[o : o+4])
   319  		count := math.Float32frombits(v)
   320  		sum += float32(t.valueFromBucket(bucket)) * count
   321  		totalCount += count
   322  
   323  		bucket++
   324  	}
   325  
   326  	if totalCount == 0 {
   327  		return 0, nil
   328  	}
   329  
   330  	return sum / totalCount, nil
   331  }
   332  
   333  func (t *PropertyLengthTracker) PropertyTally(propName string) (int, int, float32, error) {
   334  	t.Lock()
   335  	defer t.Unlock()
   336  
   337  	page, offset, ok := t.propExists(propName)
   338  	if !ok {
   339  		return 0, 0, 0, nil
   340  	}
   341  
   342  	sum := float32(0)
   343  	totalCount := float32(0)
   344  	bucket := uint16(0)
   345  
   346  	offset = offset + page*4096
   347  	for o := offset; o < offset+256; o += 4 {
   348  		v := binary.LittleEndian.Uint32(t.pages[o : o+4])
   349  		count := math.Float32frombits(v)
   350  		sum += float32(t.valueFromBucket(bucket)) * count
   351  		totalCount += count
   352  
   353  		bucket++
   354  	}
   355  
   356  	if totalCount == 0 {
   357  		return 0, 0, 0, nil
   358  	}
   359  
   360  	return int(sum), int(totalCount), sum / totalCount, nil
   361  }
   362  
   363  func (t *PropertyLengthTracker) createPageIfNotExists(page uint16) {
   364  	if uint16(len(t.pages))/4096-1 < page {
   365  		// we need to grow the page buffer
   366  		newPages := make([]byte, uint64(page)*4096+4096)
   367  		copy(newPages[:len(t.pages)], t.pages)
   368  
   369  		// the new page must have the correct offset initialized
   370  		binary.LittleEndian.PutUint16(newPages[page*4096:page*4096+2], 2)
   371  		t.pages = newPages
   372  	}
   373  }
   374  
   375  func (t *PropertyLengthTracker) Flush() error {
   376  	t.Lock()
   377  	defer t.Unlock()
   378  
   379  	if err := t.file.Truncate(int64(len(t.pages))); err != nil {
   380  		return errors.Wrap(err, "truncate prop tracker file to correct length")
   381  	}
   382  
   383  	if _, err := t.file.Seek(0, io.SeekStart); err != nil {
   384  		return errors.Wrap(err, "seek to beginning of prop tracker file")
   385  	}
   386  
   387  	if _, err := t.file.Write(t.pages); err != nil {
   388  		return errors.Wrap(err, "flush page content to disk")
   389  	}
   390  
   391  	return nil
   392  }
   393  
   394  func (t *PropertyLengthTracker) Close() error {
   395  	if err := t.Flush(); err != nil {
   396  		return errors.Wrap(err, "flush before closing")
   397  	}
   398  
   399  	t.Lock()
   400  	defer t.Unlock()
   401  
   402  	if err := t.file.Close(); err != nil {
   403  		return errors.Wrap(err, "close prop length tracker file")
   404  	}
   405  
   406  	t.pages = nil
   407  
   408  	return nil
   409  }
   410  
   411  func (t *PropertyLengthTracker) Drop() error {
   412  	t.Lock()
   413  	defer t.Unlock()
   414  
   415  	if err := t.file.Close(); err != nil {
   416  		_ = err
   417  		// explicitly ignore error
   418  	}
   419  
   420  	t.pages = nil
   421  
   422  	if err := os.Remove(t.path); err != nil {
   423  		return errors.Wrap(err, "remove prop length tracker state from disk")
   424  	}
   425  
   426  	return nil
   427  }
   428  
   429  func (t *PropertyLengthTracker) FileName() string {
   430  	return t.file.Name()
   431  }