github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/new_prop_length_tracker.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 import ( 15 "encoding/json" 16 "math" 17 "os" 18 "sync" 19 20 "github.com/pkg/errors" 21 "github.com/sirupsen/logrus" 22 ) 23 24 var MAX_BUCKETS = 64 25 26 type PropLenData struct { 27 BucketedData map[string]map[int]int 28 SumData map[string]int 29 CountData map[string]int 30 } 31 32 type JsonPropertyLengthTracker struct { 33 path string 34 data *PropLenData 35 sync.Mutex 36 UnlimitedBuckets bool 37 logger logrus.FieldLogger 38 } 39 40 // This class replaces the old PropertyLengthTracker. It fixes a bug and provides a 41 // simpler, easier to maintain implementation. The format is future-proofed, new 42 // data can be added to the file without breaking old versions of Weaviate. 43 // 44 // * We need to know the mean length of all properties for BM25 calculations 45 // * The prop length tracker is an approximate tracker that uses buckets and simply counts the entries in the buckets 46 // * There is a precise global counter for the sum of all lengths and a precise global counter for the number of entries 47 // * It only exists for string/text (and their array forms) because these are the only prop types that can be used with BM25 48 // * It should probably always exist when indexSearchable is set on a text prop going forward 49 // 50 // Property lengths are put into one of 64 buckets. The value of a bucket is given by the formula: 51 // 52 // float32(4 * math.Pow(1.25, float64(bucket)-3.5)) 53 // 54 // Which as implemented gives bucket values of 0,1,2,3,4,5,6,8,10,13,17,21,26,33,41,52,65,81,101,127,158,198,248,310,387,484,606,757,947,1183,1479,1849,2312,2890,3612,4515,5644,7055,8819,11024,13780,17226,21532,26915,33644,42055,52569,65712,82140,102675,128344,160430,200537,250671,313339,391674,489593,611991,764989,956237,1195296,1494120,1867651,2334564 55 // 56 // These buckets are then recorded to disk. The original implementation was a binary format where all the data was tracked using manual pointer arithmetic. The new version tracks the statistics in a go map, and marshals that into JSON before writing it to disk. There is no measurable difference in speed between these two implementations while importing data, however it appears to slow the queries by about 15% (while improving recall by ~25%). 57 // 58 // The new tracker is exactly compatible with the old format to enable migration, which is why there is a -1 bucket. Altering the number of buckets or their values will break compatibility. 59 // 60 // Set UnlimitedBuckets to true for precise length tracking 61 // 62 // Note that some of the code in this file is forced by the need to be backwards-compatible with the old format. Once we are confident that all users have migrated to the new format, we can remove the old format code and simplify this file. 63 64 // NewJsonPropertyLengthTracker creates a new tracker and loads the data from the given path. If the file is in the old format, it will be converted to the new format. 65 func NewJsonPropertyLengthTracker(path string, logger logrus.FieldLogger) (t *JsonPropertyLengthTracker, err error) { 66 // Recover and return empty tracker on panic 67 defer func() { 68 if r := recover(); r != nil { 69 t.logger.Printf("Recovered from panic in NewJsonPropertyLengthTracker, original error: %v", r) 70 t = &JsonPropertyLengthTracker{ 71 data: &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}, 72 path: path, 73 UnlimitedBuckets: false, 74 } 75 err = errors.Errorf("Recovered from panic in NewJsonPropertyLengthTracker, original error: %v", r) 76 } 77 }() 78 79 t = &JsonPropertyLengthTracker{ 80 data: &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)}, 81 path: path, 82 UnlimitedBuckets: false, 83 logger: logger, 84 } 85 86 // read the file into memory 87 bytes, err := os.ReadFile(path) 88 if err != nil { 89 if os.IsNotExist(err) { // File doesn't exist, probably a new class(or a recount), return empty tracker 90 t.Flush(false) 91 return t, nil 92 } 93 return nil, errors.Wrap(err, "read property length tracker file:"+path) 94 } 95 96 if len(bytes) == 0 { 97 return nil, errors.Errorf("failed sanity check, empty prop len tracker file %s has length 0. Delete file and set environment variable RECOUNT_PROPERTIES_AT_STARTUP to true", path) 98 } 99 100 // We don't have data file versioning, so we try to parse it as json. If the parse fails, it is probably the old format file, so we call the old format loader and copy everything across. 101 if err = json.Unmarshal(bytes, &t.data); err != nil { 102 // It's probably the old format file, load the old format and convert it to the new format 103 plt, err := NewPropertyLengthTracker(path) 104 if err != nil { 105 return nil, errors.Wrap(err, "convert old property length tracker") 106 } 107 108 propertyNames := plt.PropertyNames() 109 data := &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)} 110 // Loop over every page and bucket in the old tracker and add it to the new tracker 111 for _, name := range propertyNames { 112 data.BucketedData[name] = make(map[int]int, MAX_BUCKETS) 113 data.CountData[name] = 0 114 data.SumData[name] = 0 115 for i := 0; i <= MAX_BUCKETS; i++ { 116 fromBucket := i 117 if i == MAX_BUCKETS { 118 fromBucket = -1 119 } 120 count, err := plt.BucketCount(name, uint16(fromBucket)) 121 if err != nil { 122 return nil, errors.Wrap(err, "convert old property length tracker") 123 } 124 data.BucketedData[name][fromBucket] = int(count) 125 value := float32(0) 126 if fromBucket == -1 { 127 value = 0 128 } else { 129 value = plt.valueFromBucket(uint16(fromBucket)) 130 } 131 132 data.SumData[name] = data.SumData[name] + int(value)*int(count) 133 data.CountData[name] = data.CountData[name] + int(count) 134 } 135 } 136 t.data = data 137 t.Flush(true) 138 plt.Close() 139 plt.Drop() 140 t.Flush(false) 141 } 142 t.path = path 143 144 // Make really sure we aren't going to crash on a nil pointer 145 if t.data == nil { 146 return nil, errors.Errorf("failed sanity check, prop len tracker file %s has nil data. Delete file and set environment variable RECOUNT_PROPERTIES_AT_STARTUP to true", path) 147 } 148 return t, nil 149 } 150 151 func (t *JsonPropertyLengthTracker) Clear() { 152 t.Lock() 153 defer t.Unlock() 154 155 t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)} 156 } 157 158 // Path to the file on disk 159 func (t *JsonPropertyLengthTracker) FileName() string { 160 return t.path 161 } 162 163 // Adds a new value to the tracker 164 func (t *JsonPropertyLengthTracker) TrackProperty(propName string, value float32) error { 165 t.Lock() 166 defer t.Unlock() 167 168 // Remove this check once we are confident that all users have migrated to the new format 169 if t.data == nil { 170 t.logger.Print("WARNING: t.data is nil in TrackProperty, initializing to empty tracker") 171 t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)} 172 } 173 t.data.SumData[propName] = t.data.SumData[propName] + int(value) 174 t.data.CountData[propName] = t.data.CountData[propName] + 1 175 176 bucketId := t.bucketFromValue(value) 177 if _, ok := t.data.BucketedData[propName]; ok { 178 t.data.BucketedData[propName][int(bucketId)] = t.data.BucketedData[propName][int(bucketId)] + 1 179 } else { 180 181 t.data.BucketedData[propName] = make(map[int]int, 64+1) 182 t.data.BucketedData[propName][int(bucketId)] = 1 183 } 184 185 return nil 186 } 187 188 // Removes a value from the tracker 189 func (t *JsonPropertyLengthTracker) UnTrackProperty(propName string, value float32) error { 190 t.Lock() 191 defer t.Unlock() 192 193 // Remove this check once we are confident that all users have migrated to the new format 194 if t.data == nil { 195 t.logger.Print("WARNING: t.data is nil in TrackProperty, initializing to empty tracker") 196 t.data = &PropLenData{make(map[string]map[int]int), make(map[string]int), make(map[string]int)} 197 } 198 t.data.SumData[propName] = t.data.SumData[propName] - int(value) 199 t.data.CountData[propName] = t.data.CountData[propName] - 1 200 201 bucketId := t.bucketFromValue(value) 202 if _, ok := t.data.BucketedData[propName]; ok { 203 t.data.BucketedData[propName][int(bucketId)] = t.data.BucketedData[propName][int(bucketId)] - 1 204 } else { 205 return errors.New("property not found") 206 } 207 208 return nil 209 } 210 211 // Returns the bucket that the given value belongs to 212 func (t *JsonPropertyLengthTracker) bucketFromValue(value float32) int { 213 if t.UnlimitedBuckets { 214 return int(value) 215 } 216 if value <= 5.00 { 217 return int(value) - 1 218 } 219 220 bucket := int(math.Log(float64(value)/4.0)/math.Log(1.25) + 4) 221 if bucket > MAX_BUCKETS-1 { 222 return MAX_BUCKETS 223 } 224 return int(bucket) 225 } 226 227 // Returns the average length of the given property 228 func (t *JsonPropertyLengthTracker) PropertyMean(propName string) (float32, error) { 229 t.Lock() 230 defer t.Unlock() 231 232 sum, ok := t.data.SumData[propName] 233 if !ok { 234 return 0, nil 235 } 236 count, ok := t.data.CountData[propName] 237 if !ok { 238 return 0, nil 239 } 240 241 return float32(sum) / float32(count), nil 242 } 243 244 // returns totalPropertyLength, totalCount, average propertyLength = sum / totalCount, total propertylength, totalCount, error 245 func (t *JsonPropertyLengthTracker) PropertyTally(propName string) (int, int, float64, error) { 246 t.Lock() 247 defer t.Unlock() 248 sum, ok := t.data.SumData[propName] 249 if !ok { 250 return 0, 0, 0, nil // Required to match the old prop tracker (for now) 251 } 252 count, ok := t.data.CountData[propName] 253 if !ok { 254 return 0, 0, 0, nil // Required to match the old prop tracker (for now) 255 } 256 return sum, count, float64(sum) / float64(count), nil 257 } 258 259 // Writes the current state of the tracker to disk. (flushBackup = true) will only write the backup file 260 func (t *JsonPropertyLengthTracker) Flush(flushBackup bool) error { 261 if !flushBackup { // Write the backup file first 262 t.Flush(true) 263 } 264 265 t.Lock() 266 defer t.Unlock() 267 268 bytes, err := json.Marshal(t.data) 269 if err != nil { 270 return err 271 } 272 273 filename := t.path 274 if flushBackup { 275 filename = t.path + ".bak" 276 } 277 278 // Do a write+rename to avoid corrupting the file if we crash while writing 279 tempfile := filename + ".tmp" 280 281 err = WriteFile(tempfile, bytes, 0o666) 282 if err != nil { 283 return err 284 } 285 286 err = os.Rename(tempfile, filename) 287 if err != nil { 288 return err 289 } 290 291 return nil 292 } 293 294 func WriteFile(name string, data []byte, perm os.FileMode) error { 295 f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm) 296 if err != nil { 297 return err 298 } 299 defer f.Close() 300 301 _, err = f.Write(data) 302 if err != nil { 303 return err 304 } 305 306 // TODO: f.Sync() is introducing performance penalization at this point 307 // it will be addressed as part of another PR 308 309 return nil 310 } 311 312 // Closes the tracker and removes the backup file 313 func (t *JsonPropertyLengthTracker) Close() error { 314 if err := t.Flush(false); err != nil { 315 return errors.Wrap(err, "flush before closing") 316 } 317 318 t.Lock() 319 defer t.Unlock() 320 321 t.data.BucketedData = nil 322 323 return nil 324 } 325 326 // Drop removes the tracker from disk 327 func (t *JsonPropertyLengthTracker) Drop() error { 328 t.Close() 329 330 t.Lock() 331 defer t.Unlock() 332 333 t.data.BucketedData = nil 334 335 if err := os.Remove(t.path); err != nil { 336 return errors.Wrap(err, "remove prop length tracker state from disk:"+t.path) 337 } 338 if err := os.Remove(t.path + ".bak"); err != nil { 339 return errors.Wrap(err, "remove prop length tracker state from disk:"+t.path+".bak") 340 } 341 342 return nil 343 }