github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/vector/compressionhelpers/compression.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package compressionhelpers 13 14 import ( 15 "context" 16 "encoding/binary" 17 "fmt" 18 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 "github.com/weaviate/weaviate/adapters/repos/db/helpers" 22 "github.com/weaviate/weaviate/adapters/repos/db/lsmkv" 23 "github.com/weaviate/weaviate/adapters/repos/db/vector/cache" 24 "github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer" 25 "github.com/weaviate/weaviate/entities/storobj" 26 "github.com/weaviate/weaviate/entities/vectorindex/hnsw" 27 ) 28 29 type CompressorDistancer interface { 30 DistanceToNode(id uint64) (float32, bool, error) 31 DistanceToFloat(vec []float32) (float32, bool, error) 32 } 33 34 type ReturnDistancerFn func() 35 36 type VectorCompressor interface { 37 Drop() error 38 GrowCache(size uint64) 39 SetCacheMaxSize(size int64) 40 GetCacheMaxSize() int64 41 Delete(ctx context.Context, id uint64) 42 Preload(id uint64, vector []float32) 43 Prefetch(id uint64) 44 PrefillCache() 45 46 DistanceBetweenCompressedVectorsFromIDs(ctx context.Context, x, y uint64) (float32, error) 47 DistanceBetweenCompressedAndUncompressedVectorsFromID(ctx context.Context, x uint64, y []float32) (float32, error) 48 NewDistancer(vector []float32) (CompressorDistancer, ReturnDistancerFn) 49 NewDistancerFromID(id uint64) (CompressorDistancer, error) 50 NewBag() CompressionDistanceBag 51 52 ExposeFields() PQData 53 } 54 55 type quantizedVectorsCompressor[T byte | uint64] struct { 56 cache cache.Cache[T] 57 compressedStore *lsmkv.Store 58 quantizer quantizer[T] 59 storeId func([]byte, uint64) 60 loadId func([]byte) uint64 61 } 62 63 func (compressor *quantizedVectorsCompressor[T]) Drop() error { 64 compressor.cache.Drop() 65 return nil 66 } 67 68 func (compressor *quantizedVectorsCompressor[T]) GrowCache(size uint64) { 69 compressor.cache.Grow(size) 70 } 71 72 func (compressor *quantizedVectorsCompressor[T]) SetCacheMaxSize(size int64) { 73 compressor.cache.UpdateMaxSize(size) 74 } 75 76 func (compressor *quantizedVectorsCompressor[T]) GetCacheMaxSize() int64 { 77 return compressor.cache.CopyMaxSize() 78 } 79 80 func (compressor *quantizedVectorsCompressor[T]) Delete(ctx context.Context, id uint64) { 81 compressor.cache.Delete(ctx, id) 82 idBytes := make([]byte, 8) 83 compressor.storeId(idBytes, id) 84 compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Delete(idBytes) 85 } 86 87 func (compressor *quantizedVectorsCompressor[T]) Preload(id uint64, vector []float32) { 88 compressedVector := compressor.quantizer.Encode(vector) 89 idBytes := make([]byte, 8) 90 compressor.storeId(idBytes, id) 91 compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Put(idBytes, compressor.quantizer.CompressedBytes(compressedVector)) 92 compressor.cache.Grow(id) 93 compressor.cache.Preload(id, compressedVector) 94 } 95 96 func (compressor *quantizedVectorsCompressor[T]) Prefetch(id uint64) { 97 compressor.cache.Prefetch(id) 98 } 99 100 func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedVectors(x, y []T) (float32, error) { 101 return compressor.quantizer.DistanceBetweenCompressedVectors(x, y) 102 } 103 104 func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedAndUncompressedVectors(x []T, y []float32) (float32, error) { 105 return compressor.quantizer.DistanceBetweenCompressedAndUncompressedVectors(y, x) 106 } 107 108 func (compressor *quantizedVectorsCompressor[T]) compressedVectorFromID(ctx context.Context, id uint64) ([]T, error) { 109 compressedVector, err := compressor.cache.Get(ctx, id) 110 if err != nil { 111 return nil, err 112 } 113 if len(compressedVector) == 0 { 114 return nil, fmt.Errorf("got a nil or zero-length vector at docID %d", id) 115 } 116 return compressedVector, nil 117 } 118 119 func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedVectorsFromIDs(ctx context.Context, id1, id2 uint64) (float32, error) { 120 compressedVector1, err := compressor.compressedVectorFromID(ctx, id1) 121 if err != nil { 122 return 0, err 123 } 124 125 compressedVector2, err := compressor.compressedVectorFromID(ctx, id2) 126 if err != nil { 127 return 0, err 128 } 129 130 dist, err := compressor.DistanceBetweenCompressedVectors(compressedVector1, compressedVector2) 131 return dist, err 132 } 133 134 func (compressor *quantizedVectorsCompressor[T]) DistanceBetweenCompressedAndUncompressedVectorsFromID(ctx context.Context, id uint64, vector []float32) (float32, error) { 135 compressedVector, err := compressor.compressedVectorFromID(ctx, id) 136 if err != nil { 137 return 0, err 138 } 139 140 dist, err := compressor.DistanceBetweenCompressedAndUncompressedVectors(compressedVector, vector) 141 return dist, err 142 } 143 144 func (compressor *quantizedVectorsCompressor[T]) getCompressedVectorForID(ctx context.Context, id uint64) ([]T, error) { 145 idBytes := make([]byte, 8) 146 compressor.storeId(idBytes, id) 147 compressedVector, err := compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Get(idBytes) 148 if err != nil { 149 return nil, errors.Wrap(err, "Getting vector for id") 150 } 151 if len(compressedVector) == 0 { 152 return nil, storobj.NewErrNotFoundf(id, "getCompressedVectorForID") 153 } 154 155 return compressor.quantizer.FromCompressedBytes(compressedVector), nil 156 } 157 158 func (compressor *quantizedVectorsCompressor[T]) NewDistancer(vector []float32) (CompressorDistancer, ReturnDistancerFn) { 159 d := &quantizedCompressorDistancer[T]{ 160 compressor: compressor, 161 distancer: compressor.quantizer.NewQuantizerDistancer(vector), 162 } 163 return d, func() { 164 compressor.returnDistancer(d) 165 } 166 } 167 168 func (compressor *quantizedVectorsCompressor[T]) NewDistancerFromID(id uint64) (CompressorDistancer, error) { 169 compressedVector, err := compressor.compressedVectorFromID(context.Background(), id) 170 if err != nil { 171 return nil, err 172 } 173 if compressedVector == nil { 174 return nil, storobj.ErrNotFound{ 175 DocID: id, 176 } 177 } 178 d := &quantizedCompressorDistancer[T]{ 179 compressor: compressor, 180 distancer: compressor.quantizer.NewCompressedQuantizerDistancer(compressedVector), 181 } 182 return d, nil 183 } 184 185 func (compressor *quantizedVectorsCompressor[T]) returnDistancer(distancer CompressorDistancer) { 186 dst := distancer.(*quantizedCompressorDistancer[T]).distancer 187 if dst == nil { 188 return 189 } 190 compressor.quantizer.ReturnQuantizerDistancer(dst) 191 } 192 193 func (compressor *quantizedVectorsCompressor[T]) NewBag() CompressionDistanceBag { 194 return &quantizedDistanceBag[T]{ 195 compressor: compressor, 196 elements: make(map[uint64][]T), 197 } 198 } 199 200 func (compressor *quantizedVectorsCompressor[T]) initCompressedStore() error { 201 err := compressor.compressedStore.CreateOrLoadBucket(context.Background(), helpers.VectorsCompressedBucketLSM) 202 if err != nil { 203 return errors.Wrapf(err, "Create or load bucket (compressed vectors store)") 204 } 205 return nil 206 } 207 208 func (compressor *quantizedVectorsCompressor[T]) PrefillCache() { 209 cursor := compressor.compressedStore.Bucket(helpers.VectorsCompressedBucketLSM).Cursor() 210 for k, v := cursor.First(); k != nil; k, v = cursor.Next() { 211 id := compressor.loadId(k) 212 compressor.cache.Grow(id) 213 214 vc := make([]byte, len(v)) 215 copy(vc, v) 216 compressor.cache.Preload(id, compressor.quantizer.FromCompressedBytes(vc)) 217 } 218 cursor.Close() 219 } 220 221 func (compressor *quantizedVectorsCompressor[T]) ExposeFields() PQData { 222 return compressor.quantizer.ExposeFields() 223 } 224 225 func NewHNSWPQCompressor( 226 cfg hnsw.PQConfig, 227 distance distancer.Provider, 228 dimensions int, 229 vectorCacheMaxObjects int, 230 logger logrus.FieldLogger, 231 data [][]float32, 232 store *lsmkv.Store, 233 ) (VectorCompressor, error) { 234 quantizer, err := NewProductQuantizer(cfg, distance, dimensions) 235 if err != nil { 236 return nil, err 237 } 238 pqVectorsCompressor := &quantizedVectorsCompressor[byte]{ 239 quantizer: quantizer, 240 compressedStore: store, 241 storeId: binary.LittleEndian.PutUint64, 242 loadId: binary.LittleEndian.Uint64, 243 } 244 pqVectorsCompressor.initCompressedStore() 245 pqVectorsCompressor.cache = cache.NewShardedByteLockCache(pqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0) 246 pqVectorsCompressor.cache.Grow(uint64(len(data))) 247 err = quantizer.Fit(data) 248 if err != nil { 249 return nil, err 250 } 251 return pqVectorsCompressor, nil 252 } 253 254 func RestoreHNSWPQCompressor( 255 cfg hnsw.PQConfig, 256 distance distancer.Provider, 257 dimensions int, 258 vectorCacheMaxObjects int, 259 logger logrus.FieldLogger, 260 encoders []PQEncoder, 261 store *lsmkv.Store, 262 ) (VectorCompressor, error) { 263 quantizer, err := NewProductQuantizerWithEncoders(cfg, distance, dimensions, encoders) 264 if err != nil { 265 return nil, err 266 } 267 pqVectorsCompressor := &quantizedVectorsCompressor[byte]{ 268 quantizer: quantizer, 269 compressedStore: store, 270 storeId: binary.LittleEndian.PutUint64, 271 loadId: binary.LittleEndian.Uint64, 272 } 273 pqVectorsCompressor.initCompressedStore() 274 pqVectorsCompressor.cache = cache.NewShardedByteLockCache(pqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0) 275 return pqVectorsCompressor, nil 276 } 277 278 func NewBQCompressor( 279 distance distancer.Provider, 280 vectorCacheMaxObjects int, 281 logger logrus.FieldLogger, 282 store *lsmkv.Store, 283 ) (VectorCompressor, error) { 284 quantizer := NewBinaryQuantizer(distance) 285 bqVectorsCompressor := &quantizedVectorsCompressor[uint64]{ 286 quantizer: &quantizer, 287 compressedStore: store, 288 storeId: binary.BigEndian.PutUint64, 289 loadId: binary.BigEndian.Uint64, 290 } 291 bqVectorsCompressor.initCompressedStore() 292 bqVectorsCompressor.cache = cache.NewShardedUInt64LockCache(bqVectorsCompressor.getCompressedVectorForID, vectorCacheMaxObjects, logger, 0) 293 return bqVectorsCompressor, nil 294 } 295 296 type quantizedCompressorDistancer[T byte | uint64] struct { 297 compressor *quantizedVectorsCompressor[T] 298 distancer quantizerDistancer[T] 299 } 300 301 func (distancer *quantizedCompressorDistancer[T]) DistanceToNode(id uint64) (float32, bool, error) { 302 compressedVector, err := distancer.compressor.cache.Get(context.Background(), id) 303 if err != nil { 304 return 0, false, err 305 } 306 if len(compressedVector) == 0 { 307 return 0, false, fmt.Errorf( 308 "got a nil or zero-length vector at docID %d", id) 309 } 310 return distancer.distancer.Distance(compressedVector) 311 } 312 313 func (distancer *quantizedCompressorDistancer[T]) DistanceToFloat(vector []float32) (float32, bool, error) { 314 return distancer.distancer.DistanceToFloat(vector) 315 }