github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/prop_length_tracker.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "encoding/binary" 16 "fmt" 17 "io" 18 "math" 19 "os" 20 "sync" 21 22 "github.com/pkg/errors" 23 ) 24 25 // Page Design 26 // | Bytes | Description | 27 // | --------- | ------------------------------------------------ | 28 // | start | page is now 0 29 // | 0-1 | uint16 pointer to last index byte 30 // | 2-3 | uint16 pointer for property name length 31 // | 4-n | property name 32 // | ... | repeat length+pointer pattern 33 // | 3584-3840 | second property buckets (64 buckets of float32) 34 // | 3840-4096 | first property buckets 35 // | repeat | page is now 1, repeat all of above 36 // 37 // Fixed Assumptions: 38 // - First two bytes always used to indicate end of index, minimal value is 02, 39 // as the first possible value with index length=0 is after the two bytes 40 // themselves. 41 // - 64 buckets of float32 per property (=256B per prop), excluding the index 42 // - One index row is always 4+len(propName), consisting of a uint16 prop name 43 // length pointer, the name itself and an offset pointer pointing to the start 44 // (first byte) of the buckets 45 // 46 // The counter to the last index byte is only an uint16, so it can at maximum address 65535. This will overflow when the 47 // 16th page is added (eg at page=15). To avoid a crash an error is returned in this case, but we will need to change 48 // the byteformat to fix this. 49 type PropertyLengthTracker struct { 50 file *os.File 51 path string 52 pages []byte 53 sync.Mutex 54 } 55 56 func NewPropertyLengthTracker(path string) (*PropertyLengthTracker, error) { 57 f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o666) 58 if err != nil { 59 return nil, err 60 } 61 62 stat, err := f.Stat() 63 if err != nil { 64 return nil, err 65 } 66 67 t := &PropertyLengthTracker{ 68 pages: nil, 69 file: f, 70 path: path, 71 } 72 73 if stat.Size() > 0 { 74 // the file has existed before, we need to initialize with its content, we 75 // can read the entire contents into memory 76 existingPages, err := io.ReadAll(f) 77 if err != nil { 78 return nil, errors.Wrap(err, "read initial count from file") 79 } 80 81 if len(existingPages)%4096 != 0 { 82 return nil, errors.Errorf( 83 "failed sanity check, prop len tracker file %s has length %d", path, 84 len(existingPages)) 85 } 86 87 t.pages = existingPages 88 } else { 89 // this is the first time this is being created, initialize with an empty 90 // page 91 t.pages = make([]byte, 4096) 92 // set initial end-of-index offset to 2 93 binary.LittleEndian.PutUint16(t.pages[0:2], 2) 94 } 95 96 return t, nil 97 } 98 99 func (t *PropertyLengthTracker) BucketCount(propName string, bucket uint16) (uint16, error) { 100 t.Lock() 101 defer t.Unlock() 102 103 page, offset, ok := t.propExists(propName) 104 if !ok { 105 return 0, fmt.Errorf("property %v does not exist in OldPropertyLengthTracker", propName) 106 } 107 108 offset = offset + page*4096 109 110 o := offset + (bucket * 4) 111 v := binary.LittleEndian.Uint32(t.pages[o : o+4]) 112 count := math.Float32frombits(v) 113 114 return uint16(count), nil 115 } 116 117 func (t *PropertyLengthTracker) PropertyNames() []string { 118 var names []string 119 pages := len(t.pages) / int(4096) 120 for page := 0; page < pages; page++ { 121 pageStart := page * int(4096) 122 123 relativeEOI := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2]) // t.uint16At(pageStart) 124 EOI := pageStart + int(relativeEOI) 125 126 offset := int(pageStart) + 2 127 for offset < EOI { 128 propNameLength := int(binary.LittleEndian.Uint16(t.pages[offset : offset+2])) // int(t.uint16At(offset)) 129 offset += 2 130 131 propName := t.pages[offset : offset+propNameLength] 132 offset += propNameLength 133 134 offset += 2 135 136 names = append(names, string(propName)) 137 } 138 } 139 return names 140 } 141 142 func (t *PropertyLengthTracker) TrackProperty(propName string, value float32) error { 143 t.Lock() 144 defer t.Unlock() 145 146 var page uint16 147 var relBucketOffset uint16 148 if p, o, ok := t.propExists(propName); ok { 149 page = p 150 relBucketOffset = o 151 } else { 152 var err error 153 page, relBucketOffset, err = t.addProperty(propName) 154 if err != nil { 155 return err 156 } 157 } 158 159 bucketOffset := page*4096 + relBucketOffset + t.bucketFromValue(value)*4 160 161 v := binary.LittleEndian.Uint32(t.pages[bucketOffset : bucketOffset+4]) 162 currentValue := math.Float32frombits(v) 163 currentValue += 1 164 v = math.Float32bits(currentValue) 165 binary.LittleEndian.PutUint32(t.pages[bucketOffset:bucketOffset+4], v) 166 return nil 167 } 168 169 func (t *PropertyLengthTracker) UnTrackProperty(propName string, value float32) error { 170 t.Lock() 171 defer t.Unlock() 172 173 var page uint16 174 var relBucketOffset uint16 175 if p, o, ok := t.propExists(propName); ok { 176 page = p 177 relBucketOffset = o 178 } else { 179 return fmt.Errorf("property %v does not exist in OldPropertyLengthTracker", propName) 180 } 181 182 bucketOffset := page*4096 + relBucketOffset + t.bucketFromValue(value)*4 183 184 v := binary.LittleEndian.Uint32(t.pages[bucketOffset : bucketOffset+4]) 185 currentValue := math.Float32frombits(v) 186 currentValue -= 1 187 v = math.Float32bits(currentValue) 188 binary.LittleEndian.PutUint32(t.pages[bucketOffset:bucketOffset+4], v) 189 return nil 190 } 191 192 // propExists returns page number, relative offset on page, and a bool whether 193 // the prop existed at all. The first to values have no meaning if the latter 194 // is false 195 func (t *PropertyLengthTracker) propExists(needle string) (uint16, uint16, bool) { 196 pages := len(t.pages) / 4096 197 for page := 0; page < pages; page++ { 198 pageStart := page * 4096 199 200 relativeEOI := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2]) 201 EOI := pageStart + int(relativeEOI) 202 203 offset := int(pageStart) + 2 204 for offset < EOI { 205 propNameLength := int(binary.LittleEndian.Uint16( 206 t.pages[offset : offset+2])) 207 offset += 2 208 209 propName := t.pages[offset : offset+propNameLength] 210 offset += propNameLength 211 bucketPointer := binary.LittleEndian.Uint16( 212 t.pages[offset : offset+2]) 213 offset += 2 214 215 if string(propName) == needle { 216 return uint16(page), bucketPointer, true 217 } 218 219 } 220 } 221 return 0, 0, false 222 } 223 224 func (t *PropertyLengthTracker) addProperty(propName string) (uint16, uint16, error) { 225 page := uint16(0) 226 227 for { 228 propNameBytes := []byte(propName) 229 t.createPageIfNotExists(page) 230 pageStart := page * 4096 231 lastBucketOffset := pageStart + 4096 232 233 relativeOffset := binary.LittleEndian.Uint16(t.pages[pageStart : pageStart+2]) 234 offset := pageStart + relativeOffset 235 if relativeOffset != 2 { 236 // relative offset is other than 2, so there are also props in. This 237 // means we can take the value of offset-2 to read the bucket offset 238 lastBucketOffset = pageStart + binary.LittleEndian. 239 Uint16(t.pages[offset-2:offset]) 240 } 241 242 if !t.canPageFit(propNameBytes, offset, lastBucketOffset) { 243 page++ 244 // overflow of uint16 variable that tracks the size of the tracker 245 if page > 15 { 246 return 0, 0, fmt.Errorf("could not add property %v, to PropertyLengthTracker, because the total"+ 247 "length of all properties is too long", propName) 248 } 249 continue 250 } 251 252 propNameLength := uint16(len(propNameBytes)) 253 binary.LittleEndian.PutUint16(t.pages[offset:offset+2], propNameLength) 254 offset += 2 255 copy(t.pages[offset:offset+propNameLength], propNameBytes) 256 offset += propNameLength 257 258 newBucketOffset := lastBucketOffset - 256 - pageStart 259 binary.LittleEndian.PutUint16(t.pages[offset:offset+2], newBucketOffset) 260 offset += 2 261 262 // update end of index offset for page, since the prop name index has 263 // now grown 264 binary.LittleEndian.PutUint16(t.pages[pageStart:pageStart+2], offset-pageStart) 265 return page, newBucketOffset, nil 266 } 267 } 268 269 func (t *PropertyLengthTracker) canPageFit(propName []byte, 270 offset uint16, lastBucketOffset uint16, 271 ) bool { 272 // lastBucketOffset represents the end of the writable area, offset 273 // represents the start, which means we can take the delta to see // how 274 // much space is left on this page 275 spaceLeft := lastBucketOffset - offset 276 277 // we need to write 256 bytes for the buckets, plus two pointers of uint16 278 spaceNeeded := uint16(len(propName)+4) + 256 279 280 return spaceLeft >= spaceNeeded 281 } 282 283 func (t *PropertyLengthTracker) bucketFromValue(value float32) uint16 { 284 if value <= 5.00 { 285 return uint16(value) - 1 286 } 287 288 bucket := int(math.Log(float64(value)/4.0)/math.Log(1.25) + 4) 289 if bucket > 63 { 290 return 64 291 } 292 return uint16(bucket) 293 } 294 295 func (t *PropertyLengthTracker) valueFromBucket(bucket uint16) float32 { 296 if bucket <= 5 { 297 return float32(bucket + 1) 298 } 299 300 return float32(4 * math.Pow(1.25, float64(bucket)-3.5)) 301 } 302 303 func (t *PropertyLengthTracker) PropertyMean(propName string) (float32, error) { 304 t.Lock() 305 defer t.Unlock() 306 307 page, offset, ok := t.propExists(propName) 308 if !ok { 309 return 0, nil 310 } 311 312 sum := float32(0) 313 totalCount := float32(0) 314 bucket := uint16(0) 315 316 offset = offset + page*4096 317 for o := offset; o < offset+256; o += 4 { 318 v := binary.LittleEndian.Uint32(t.pages[o : o+4]) 319 count := math.Float32frombits(v) 320 sum += float32(t.valueFromBucket(bucket)) * count 321 totalCount += count 322 323 bucket++ 324 } 325 326 if totalCount == 0 { 327 return 0, nil 328 } 329 330 return sum / totalCount, nil 331 } 332 333 func (t *PropertyLengthTracker) PropertyTally(propName string) (int, int, float32, error) { 334 t.Lock() 335 defer t.Unlock() 336 337 page, offset, ok := t.propExists(propName) 338 if !ok { 339 return 0, 0, 0, nil 340 } 341 342 sum := float32(0) 343 totalCount := float32(0) 344 bucket := uint16(0) 345 346 offset = offset + page*4096 347 for o := offset; o < offset+256; o += 4 { 348 v := binary.LittleEndian.Uint32(t.pages[o : o+4]) 349 count := math.Float32frombits(v) 350 sum += float32(t.valueFromBucket(bucket)) * count 351 totalCount += count 352 353 bucket++ 354 } 355 356 if totalCount == 0 { 357 return 0, 0, 0, nil 358 } 359 360 return int(sum), int(totalCount), sum / totalCount, nil 361 } 362 363 func (t *PropertyLengthTracker) createPageIfNotExists(page uint16) { 364 if uint16(len(t.pages))/4096-1 < page { 365 // we need to grow the page buffer 366 newPages := make([]byte, uint64(page)*4096+4096) 367 copy(newPages[:len(t.pages)], t.pages) 368 369 // the new page must have the correct offset initialized 370 binary.LittleEndian.PutUint16(newPages[page*4096:page*4096+2], 2) 371 t.pages = newPages 372 } 373 } 374 375 func (t *PropertyLengthTracker) Flush() error { 376 t.Lock() 377 defer t.Unlock() 378 379 if err := t.file.Truncate(int64(len(t.pages))); err != nil { 380 return errors.Wrap(err, "truncate prop tracker file to correct length") 381 } 382 383 if _, err := t.file.Seek(0, io.SeekStart); err != nil { 384 return errors.Wrap(err, "seek to beginning of prop tracker file") 385 } 386 387 if _, err := t.file.Write(t.pages); err != nil { 388 return errors.Wrap(err, "flush page content to disk") 389 } 390 391 return nil 392 } 393 394 func (t *PropertyLengthTracker) Close() error { 395 if err := t.Flush(); err != nil { 396 return errors.Wrap(err, "flush before closing") 397 } 398 399 t.Lock() 400 defer t.Unlock() 401 402 if err := t.file.Close(); err != nil { 403 return errors.Wrap(err, "close prop length tracker file") 404 } 405 406 t.pages = nil 407 408 return nil 409 } 410 411 func (t *PropertyLengthTracker) Drop() error { 412 t.Lock() 413 defer t.Unlock() 414 415 if err := t.file.Close(); err != nil { 416 _ = err 417 // explicitly ignore error 418 } 419 420 t.pages = nil 421 422 if err := os.Remove(t.path); err != nil { 423 return errors.Wrap(err, "remove prop length tracker state from disk") 424 } 425 426 return nil 427 } 428 429 func (t *PropertyLengthTracker) FileName() string { 430 return t.file.Name() 431 }