github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/row_reader_frequency.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  import (
    15  	"bytes"
    16  	"context"
    17  	"encoding/binary"
    18  	"fmt"
    19  
    20  	"github.com/weaviate/sroar"
    21  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv"
    22  	"github.com/weaviate/weaviate/adapters/repos/db/roaringset"
    23  	"github.com/weaviate/weaviate/entities/filters"
    24  )
    25  
// RowReaderFrequency reads one or many row(s) depending on the specified
// operator. Rows are read from a map-type lsmkv bucket; for every row handed
// to the caller the keys of its map pairs are decoded into a bitmap (see
// transformToBitmap).
type RowReaderFrequency struct {
	// value is the search value: the row key for Equal/NotEqual, the boundary
	// key for the range operators, and the pattern source for Like.
	value         []byte
	// bucket is the lsmkv bucket that rows and cursors are obtained from.
	bucket        *lsmkv.Bucket
	// operator selects which read strategy Read dispatches to.
	operator      filters.Operator
	// keyOnly makes newCursor return a key-only cursor instead of a regular
	// map cursor.
	keyOnly       bool
	// shardVersion below 2 switches on legacy sorting for bucket reads and
	// little-endian (instead of big-endian) decoding of map pair keys.
	shardVersion  uint16
	// bitmapFactory supplies the base bitmap from which NotEqual removes the
	// ids of the equal row.
	bitmapFactory *roaringset.BitmapFactory
}
    35  
    36  func NewRowReaderFrequency(bucket *lsmkv.Bucket, value []byte,
    37  	operator filters.Operator, keyOnly bool, shardVersion uint16,
    38  	bitmapFactory *roaringset.BitmapFactory,
    39  ) *RowReaderFrequency {
    40  	return &RowReaderFrequency{
    41  		bucket:        bucket,
    42  		value:         value,
    43  		operator:      operator,
    44  		keyOnly:       keyOnly,
    45  		shardVersion:  shardVersion,
    46  		bitmapFactory: bitmapFactory,
    47  	}
    48  }
    49  
    50  func (rr *RowReaderFrequency) Read(ctx context.Context, readFn ReadFn) error {
    51  	switch rr.operator {
    52  	case filters.OperatorEqual:
    53  		return rr.equal(ctx, readFn)
    54  	case filters.OperatorNotEqual:
    55  		return rr.notEqual(ctx, readFn)
    56  	case filters.OperatorGreaterThan:
    57  		return rr.greaterThan(ctx, readFn, false)
    58  	case filters.OperatorGreaterThanEqual:
    59  		return rr.greaterThan(ctx, readFn, true)
    60  	case filters.OperatorLessThan:
    61  		return rr.lessThan(ctx, readFn, false)
    62  	case filters.OperatorLessThanEqual:
    63  		return rr.lessThan(ctx, readFn, true)
    64  	case filters.OperatorLike:
    65  		return rr.like(ctx, readFn)
    66  	default:
    67  		return fmt.Errorf("operator %v supported", rr.operator)
    68  	}
    69  }
    70  
    71  // equal is a special case, as we don't need to iterate, but just read a single
    72  // row
    73  func (rr *RowReaderFrequency) equal(ctx context.Context, readFn ReadFn) error {
    74  	v, err := rr.equalHelper(ctx)
    75  	if err != nil {
    76  		return err
    77  	}
    78  
    79  	_, err = readFn(rr.value, rr.transformToBitmap(v))
    80  	return err
    81  }
    82  
    83  func (rr *RowReaderFrequency) notEqual(ctx context.Context, readFn ReadFn) error {
    84  	v, err := rr.equalHelper(ctx)
    85  	if err != nil {
    86  		return err
    87  	}
    88  
    89  	// Invert the Equal results for an efficient NotEqual
    90  	inverted := rr.bitmapFactory.GetBitmap()
    91  	inverted.AndNot(rr.transformToBitmap(v))
    92  	_, err = readFn(rr.value, inverted)
    93  	return err
    94  }
    95  
    96  // greaterThan reads from the specified value to the end. The first row is only
    97  // included if allowEqual==true, otherwise it starts with the next one
    98  func (rr *RowReaderFrequency) greaterThan(ctx context.Context, readFn ReadFn,
    99  	allowEqual bool,
   100  ) error {
   101  	c := rr.newCursor()
   102  	defer c.Close()
   103  
   104  	for k, v := c.Seek(rr.value); k != nil; k, v = c.Next() {
   105  		if err := ctx.Err(); err != nil {
   106  			return err
   107  		}
   108  
   109  		if bytes.Equal(k, rr.value) && !allowEqual {
   110  			continue
   111  		}
   112  
   113  		continueReading, err := readFn(k, rr.transformToBitmap(v))
   114  		if err != nil {
   115  			return err
   116  		}
   117  
   118  		if !continueReading {
   119  			break
   120  		}
   121  	}
   122  
   123  	return nil
   124  }
   125  
   126  // lessThan reads from the very begging to the specified  value. The last
   127  // matching row is only included if allowEqual==true, otherwise it ends one
   128  // prior to that.
   129  func (rr *RowReaderFrequency) lessThan(ctx context.Context, readFn ReadFn,
   130  	allowEqual bool,
   131  ) error {
   132  	c := rr.newCursor()
   133  	defer c.Close()
   134  
   135  	for k, v := c.First(); k != nil && bytes.Compare(k, rr.value) != 1; k, v = c.Next() {
   136  		if err := ctx.Err(); err != nil {
   137  			return err
   138  		}
   139  
   140  		if bytes.Equal(k, rr.value) && !allowEqual {
   141  			continue
   142  		}
   143  
   144  		continueReading, err := readFn(k, rr.transformToBitmap(v))
   145  		if err != nil {
   146  			return err
   147  		}
   148  
   149  		if !continueReading {
   150  			break
   151  		}
   152  	}
   153  
   154  	return nil
   155  }
   156  
   157  func (rr *RowReaderFrequency) like(ctx context.Context, readFn ReadFn) error {
   158  	like, err := parseLikeRegexp(rr.value)
   159  	if err != nil {
   160  		return fmt.Errorf("parse like value: %w", err)
   161  	}
   162  
   163  	// TODO: don't we need to check here if this is a doc id vs a object search?
   164  	// Or is this not a problem because the latter removes duplicates anyway?
   165  	c := rr.newCursor(lsmkv.MapListAcceptDuplicates())
   166  	defer c.Close()
   167  
   168  	var (
   169  		initialK []byte
   170  		initialV []lsmkv.MapPair
   171  	)
   172  
   173  	if like.optimizable {
   174  		initialK, initialV = c.Seek(like.min)
   175  	} else {
   176  		initialK, initialV = c.First()
   177  	}
   178  
   179  	for k, v := initialK, initialV; k != nil; k, v = c.Next() {
   180  		if err := ctx.Err(); err != nil {
   181  			return err
   182  		}
   183  
   184  		if like.optimizable {
   185  			// if the query is optimizable, i.e. it doesn't start with a wildcard, we
   186  			// can abort once we've moved past the point where the fixed characters
   187  			// no longer match
   188  			if len(k) < len(like.min) {
   189  				break
   190  			}
   191  
   192  			if bytes.Compare(like.min, k[:len(like.min)]) == -1 {
   193  				break
   194  			}
   195  		}
   196  
   197  		if !like.regexp.Match(k) {
   198  			continue
   199  		}
   200  
   201  		continueReading, err := readFn(k, rr.transformToBitmap(v))
   202  		if err != nil {
   203  			return err
   204  		}
   205  
   206  		if !continueReading {
   207  			break
   208  		}
   209  	}
   210  
   211  	return nil
   212  }
   213  
   214  // newCursor will either return a regular cursor - or a key-only cursor if
   215  // keyOnly==true
   216  func (rr *RowReaderFrequency) newCursor(
   217  	opts ...lsmkv.MapListOption,
   218  ) *lsmkv.CursorMap {
   219  	if rr.shardVersion < 2 {
   220  		opts = append(opts, lsmkv.MapListLegacySortingRequired())
   221  	}
   222  
   223  	if rr.keyOnly {
   224  		return rr.bucket.MapCursorKeyOnly(opts...)
   225  	}
   226  
   227  	return rr.bucket.MapCursor(opts...)
   228  }
   229  
   230  func (rr *RowReaderFrequency) transformToBitmap(pairs []lsmkv.MapPair) *sroar.Bitmap {
   231  	out := sroar.NewBitmap()
   232  	for _, pair := range pairs {
   233  		// this entry has a frequency, but that's only used for bm25, not for
   234  		// pure filtering, so we can ignore it here
   235  		if rr.shardVersion < 2 {
   236  			out.Set(binary.LittleEndian.Uint64(pair.Key))
   237  		} else {
   238  			out.Set(binary.BigEndian.Uint64(pair.Key))
   239  		}
   240  	}
   241  	return out
   242  }
   243  
   244  // equalHelper exists, because the Equal and NotEqual operators share this functionality
   245  func (rr *RowReaderFrequency) equalHelper(ctx context.Context) (v []lsmkv.MapPair, err error) {
   246  	if err = ctx.Err(); err != nil {
   247  		return
   248  	}
   249  
   250  	if rr.shardVersion < 2 {
   251  		v, err = rr.bucket.MapList(rr.value, lsmkv.MapListAcceptDuplicates(),
   252  			lsmkv.MapListLegacySortingRequired())
   253  		if err != nil {
   254  			return
   255  		}
   256  	} else {
   257  		v, err = rr.bucket.MapList(rr.value, lsmkv.MapListAcceptDuplicates())
   258  		if err != nil {
   259  			return
   260  		}
   261  	}
   262  	return
   263  }