github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/compressionhelpers/compression.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package compressionhelpers
    13  
    14  import (
    15  	"context"
    16  	"encoding/binary"
    17  	"fmt"
    18  
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  	"github.com/weaviate/weaviate/adapters/repos/db/helpers"
    22  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv"
    23  	"github.com/weaviate/weaviate/adapters/repos/db/vector/cache"
    24  	"github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer"
    25  	"github.com/weaviate/weaviate/entities/storobj"
    26  	"github.com/weaviate/weaviate/entities/vectorindex/hnsw"
    27  )
    28  
// CompressorDistancer computes distances from a fixed reference vector (the
// one the distancer was created for) to other vectors.
//
// Both methods return a distance, a boolean flag and an error. In this file
// the flag is passed through unchanged from the underlying quantizer
// distancer. NOTE(review): it presumably signals whether the distance is
// usable; confirm against the quantizer implementations.
type CompressorDistancer interface {
	// DistanceToNode returns the distance to the compressed vector stored
	// under id.
	DistanceToNode(id uint64) (float32, bool, error)
	// DistanceToFloat returns the distance to the uncompressed vector vec.
	DistanceToFloat(vec []float32) (float32, bool, error)
}
    33  
// ReturnDistancerFn is the callback handed out together with a distancer by
// VectorCompressor.NewDistancer. The implementation in this file uses it to
// give the distancer back to the quantizer (ReturnQuantizerDistancer) once
// the caller no longer needs it.
type ReturnDistancerFn func()
    35  
// VectorCompressor is the interface through which callers work with
// compressed vectors: managing the cache of compressed vectors, adding and
// removing vectors, and computing distances that involve compressed vectors.
//
// The per-method notes below describe what the quantizedVectorsCompressor
// implementation in this file does.
type VectorCompressor interface {
	// Drop drops the cache of compressed vectors.
	Drop() error
	// GrowCache forwards size to the cache's Grow method.
	GrowCache(size uint64)
	// SetCacheMaxSize updates the maximum size of the cache.
	SetCacheMaxSize(size int64)
	// GetCacheMaxSize returns the cache's current maximum size.
	GetCacheMaxSize() int64
	// Delete removes the vector stored under id from both the cache and the
	// compressed-vectors bucket.
	Delete(ctx context.Context, id uint64)
	// Preload encodes vector, writes the encoded form to the
	// compressed-vectors bucket and places it in the cache under id.
	Preload(id uint64, vector []float32)
	// Prefetch forwards id to the cache's Prefetch method.
	Prefetch(id uint64)
	// PrefillCache loads every entry of the compressed-vectors bucket into
	// the cache.
	PrefillCache()

	// DistanceBetweenCompressedVectorsFromIDs returns the distance between
	// the compressed vectors stored under x and y.
	DistanceBetweenCompressedVectorsFromIDs(ctx context.Context, x, y uint64) (float32, error)
	// DistanceBetweenCompressedAndUncompressedVectorsFromID returns the
	// distance between the compressed vector stored under x and the
	// uncompressed vector y.
	DistanceBetweenCompressedAndUncompressedVectorsFromID(ctx context.Context, x uint64, y []float32) (float32, error)
	// NewDistancer creates a distancer for an uncompressed reference vector
	// together with the callback that returns it after use.
	NewDistancer(vector []float32) (CompressorDistancer, ReturnDistancerFn)
	// NewDistancerFromID creates a distancer whose reference vector is the
	// compressed vector stored under id.
	NewDistancerFromID(id uint64) (CompressorDistancer, error)
	// NewBag creates an empty CompressionDistanceBag bound to this
	// compressor.
	NewBag() CompressionDistanceBag

	// ExposeFields returns the quantizer's PQData.
	ExposeFields() PQData
}
    54  
// quantizedVectorsCompressor is the generic VectorCompressor implementation
// of this file. T is the element type of a compressed vector: the PQ
// constructors instantiate it with byte, the BQ constructor with uint64.
//
// Fields:
//   - cache: in-memory cache of compressed vectors, addressed by id.
//   - compressedStore: LSM store whose helpers.VectorsCompressedBucketLSM
//     bucket holds the compressed vectors.
//   - quantizer: encodes vectors and computes distances on encoded vectors.
//   - storeId / loadId: write an id into, and read an id back from, the
//     8-byte bucket key. The constructors choose little-endian for PQ and
//     big-endian for BQ.
type quantizedVectorsCompressor[T byte | uint64] struct {
	cache           cache.Cache[T]
	compressedStore *lsmkv.Store
	quantizer       quantizer[T]
	storeId         func([]byte, uint64)
	loadId          func([]byte) uint64
}
    62  
    63  func (compressor *quantizedVectorsCompressor[T]) Drop() error {
    64  	compressor.cache.Drop()
    65  	return nil
    66  }
    67  
    68  func (compressor *quantizedVectorsCompressor[T]) GrowCache(size uint64) {
    69  	compressor.cache.Grow(size)
    70  }
    71  
    72  func (compressor *quantizedVectorsCompressor[T]) SetCacheMaxSize(size int64) {
    73  	compressor.cache.UpdateMaxSize(size)
    74  }
    75  
    76  func (compressor *quantizedVectorsCompressor[T]) GetCacheMaxSize() int64 {
    77  	return compressor.cache.CopyMaxSize()
    78  }
    79  
    80  func (compressor *quantizedVectorsCompressor[T]) Delete(ctx context.Context, id uint64) {
    81  	compressor.cache.Delete(ctx, id)
    82  	idBytes := make([]byte, 8)
    83  	compressor.storeId(idBytes, id)
    84  	compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Delete(idBytes)
    85  }
    86  
    87  func (compressor *quantizedVectorsCompressor[T]) Preload(id uint64, vector []float32) {
    88  	compressedVector := compressor.quantizer.Encode(vector)
    89  	idBytes := make([]byte, 8)
    90  	compressor.storeId(idBytes, id)
    91  	compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Put(idBytes, compressor.quantizer.CompressedBytes(compressedVector))
    92  	compressor.cache.Grow(id)
    93  	compressor.cache.Preload(id, compressedVector)
    94  }
    95  
    96  func (compressor *quantizedVectorsCompressor[T]) Prefetch(id uint64) {
    97  	compressor.cache.Prefetch(id)
    98  }
    99  
   100  func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedVectors(x, y []T) (float32, error) {
   101  	return compressor.quantizer.DistanceBetweenCompressedVectors(x, y)
   102  }
   103  
   104  func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedAndUncompressedVectors(x []T, y []float32) (float32, error) {
   105  	return compressor.quantizer.DistanceBetweenCompressedAndUncompressedVectors(y, x)
   106  }
   107  
   108  func (compressor *quantizedVectorsCompressor[T]) compressedVectorFromID(ctx context.Context, id uint64) ([]T, error) {
   109  	compressedVector, err := compressor.cache.Get(ctx, id)
   110  	if err != nil {
   111  		return nil, err
   112  	}
   113  	if len(compressedVector) == 0 {
   114  		return nil, fmt.Errorf("got a nil or zero-length vector at docID %d", id)
   115  	}
   116  	return compressedVector, nil
   117  }
   118  
   119  func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedVectorsFromIDs(ctx context.Context, id1, id2 uint64) (float32, error) {
   120  	compressedVector1, err := compressor.compressedVectorFromID(ctx, id1)
   121  	if err != nil {
   122  		return 0, err
   123  	}
   124  
   125  	compressedVector2, err := compressor.compressedVectorFromID(ctx, id2)
   126  	if err != nil {
   127  		return 0, err
   128  	}
   129  
   130  	dist, err := compressor.DistanceBetweenCompressedVectors(compressedVector1, compressedVector2)
   131  	return dist, err
   132  }
   133  
   134  func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedAndUncompressedVectorsFromID(ctx context.Context, id uint64, vector []float32) (float32, error) {
   135  	compressedVector, err := compressor.compressedVectorFromID(ctx, id)
   136  	if err != nil {
   137  		return 0, err
   138  	}
   139  
   140  	dist, err := compressor.DistanceBetweenCompressedAndUncompressedVectors(compressedVector, vector)
   141  	return dist, err
   142  }
   143  
   144  func (compressor *quantizedVectorsCompressor[T]) getCompressedVectorForID(ctx context.Context, id uint64) ([]T, error) {
   145  	idBytes := make([]byte, 8)
   146  	compressor.storeId(idBytes, id)
   147  	compressedVector, err := compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Get(idBytes)
   148  	if err != nil {
   149  		return nil, errors.Wrap(err, "Getting vector for id")
   150  	}
   151  	if len(compressedVector) == 0 {
   152  		return nil, storobj.NewErrNotFoundf(id, "getCompressedVectorForID")
   153  	}
   154  
   155  	return compressor.quantizer.FromCompressedBytes(compressedVector), nil
   156  }
   157  
   158  func (compressor *quantizedVectorsCompressor[T]) NewDistancer(vector []float32) (CompressorDistancer, ReturnDistancerFn) {
   159  	d := &quantizedCompressorDistancer[T]{
   160  		compressor: compressor,
   161  		distancer:  compressor.quantizer.NewQuantizerDistancer(vector),
   162  	}
   163  	return d, func() {
   164  		compressor.returnDistancer(d)
   165  	}
   166  }
   167  
   168  func (compressor *quantizedVectorsCompressor[T]) NewDistancerFromID(id uint64) (CompressorDistancer, error) {
   169  	compressedVector, err := compressor.compressedVectorFromID(context.Background(), id)
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	if compressedVector == nil {
   174  		return nil, storobj.ErrNotFound{
   175  			DocID: id,
   176  		}
   177  	}
   178  	d := &quantizedCompressorDistancer[T]{
   179  		compressor: compressor,
   180  		distancer:  compressor.quantizer.NewCompressedQuantizerDistancer(compressedVector),
   181  	}
   182  	return d, nil
   183  }
   184  
   185  func (compressor *quantizedVectorsCompressor[T]) returnDistancer(distancer CompressorDistancer) {
   186  	dst := distancer.(*quantizedCompressorDistancer[T]).distancer
   187  	if dst == nil {
   188  		return
   189  	}
   190  	compressor.quantizer.ReturnQuantizerDistancer(dst)
   191  }
   192  
   193  func (compressor *quantizedVectorsCompressor[T]) NewBag() CompressionDistanceBag {
   194  	return &quantizedDistanceBag[T]{
   195  		compressor: compressor,
   196  		elements:   make(map[uint64][]T),
   197  	}
   198  }
   199  
   200  func (compressor *quantizedVectorsCompressor[T]) initCompressedStore() error {
   201  	err := compressor.compressedStore.CreateOrLoadBucket(context.Background(), helpers.VectorsCompressedBucketLSM)
   202  	if err != nil {
   203  		return errors.Wrapf(err, "Create or load bucket (compressed vectors store)")
   204  	}
   205  	return nil
   206  }
   207  
   208  func (compressor *quantizedVectorsCompressor[T]) PrefillCache() {
   209  	cursor := compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Cursor()
   210  	for k, v := cursor.First(); k != nil; k, v = cursor.Next() {
   211  		id := compressor.loadId(k)
   212  		compressor.cache.Grow(id)
   213  
   214  		vc := make([]byte, len(v))
   215  		copy(vc, v)
   216  		compressor.cache.Preload(id, compressor.quantizer.FromCompressedBytes(vc))
   217  	}
   218  	cursor.Close()
   219  }
   220  
   221  func (compressor *quantizedVectorsCompressor[T]) ExposeFields() PQData {
   222  	return compressor.quantizer.ExposeFields()
   223  }
   224  
   225  func NewHNSWPQCompressor(
   226  	cfg hnsw.PQConfig,
   227  	distance distancer.Provider,
   228  	dimensions int,
   229  	vectorCacheMaxObjects int,
   230  	logger logrus.FieldLogger,
   231  	data [][]float32,
   232  	store *lsmkv.Store,
   233  ) (VectorCompressor, error) {
   234  	quantizer, err := NewProductQuantizer(cfg, distance, dimensions)
   235  	if err != nil {
   236  		return nil, err
   237  	}
   238  	pqVectorsCompressor := &quantizedVectorsCompressor[byte]{
   239  		quantizer:       quantizer,
   240  		compressedStore: store,
   241  		storeId:         binary.LittleEndian.PutUint64,
   242  		loadId:          binary.LittleEndian.Uint64,
   243  	}
   244  	pqVectorsCompressor.initCompressedStore()
   245  	pqVectorsCompressor.cache = cache.NewShardedByteLockCache(pqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0)
   246  	pqVectorsCompressor.cache.Grow(uint64(len(data)))
   247  	err = quantizer.Fit(data)
   248  	if err != nil {
   249  		return nil, err
   250  	}
   251  	return pqVectorsCompressor, nil
   252  }
   253  
   254  func RestoreHNSWPQCompressor(
   255  	cfg hnsw.PQConfig,
   256  	distance distancer.Provider,
   257  	dimensions int,
   258  	vectorCacheMaxObjects int,
   259  	logger logrus.FieldLogger,
   260  	encoders []PQEncoder,
   261  	store *lsmkv.Store,
   262  ) (VectorCompressor, error) {
   263  	quantizer, err := NewProductQuantizerWithEncoders(cfg, distance, dimensions, encoders)
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	pqVectorsCompressor := &quantizedVectorsCompressor[byte]{
   268  		quantizer:       quantizer,
   269  		compressedStore: store,
   270  		storeId:         binary.LittleEndian.PutUint64,
   271  		loadId:          binary.LittleEndian.Uint64,
   272  	}
   273  	pqVectorsCompressor.initCompressedStore()
   274  	pqVectorsCompressor.cache = cache.NewShardedByteLockCache(pqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0)
   275  	return pqVectorsCompressor, nil
   276  }
   277  
   278  func NewBQCompressor(
   279  	distance distancer.Provider,
   280  	vectorCacheMaxObjects int,
   281  	logger logrus.FieldLogger,
   282  	store *lsmkv.Store,
   283  ) (VectorCompressor, error) {
   284  	quantizer := NewBinaryQuantizer(distance)
   285  	bqVectorsCompressor := &quantizedVectorsCompressor[uint64]{
   286  		quantizer:       &quantizer,
   287  		compressedStore: store,
   288  		storeId:         binary.BigEndian.PutUint64,
   289  		loadId:          binary.BigEndian.Uint64,
   290  	}
   291  	bqVectorsCompressor.initCompressedStore()
   292  	bqVectorsCompressor.cache = cache.NewShardedUInt64LockCache(bqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0)
   293  	return bqVectorsCompressor, nil
   294  }
   295  
// quantizedCompressorDistancer is the CompressorDistancer implementation of
// this file. It pairs the compressor (used to fetch compressed vectors from
// its cache by id) with the quantizer distancer that performs the actual
// distance computation against the reference vector.
type quantizedCompressorDistancer[T byte | uint64] struct {
	compressor *quantizedVectorsCompressor[T]
	distancer  quantizerDistancer[T]
}
   300  
   301  func (distancer *quantizedCompressorDistancer[T]) DistanceToNode(id uint64) (float32, bool, error) {
   302  	compressedVector, err := distancer.compressor.cache.Get(context.Background(), id)
   303  	if err != nil {
   304  		return 0, false, err
   305  	}
   306  	if len(compressedVector) == 0 {
   307  		return 0, false, fmt.Errorf(
   308  			"got a nil or zero-length vector at docID %d", id)
   309  	}
   310  	return distancer.distancer.Distance(compressedVector)
   311  }
   312  
   313  func (distancer *quantizedCompressorDistancer[T]) DistanceToFloat(vector []float32) (float32, bool, error) {
   314  	return distancer.distancer.DistanceToFloat(vector)
   315  }