github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/refcache/cacher.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package refcache 13 14 import ( 15 "context" 16 "fmt" 17 "sync" 18 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 "github.com/weaviate/weaviate/entities/additional" 22 "github.com/weaviate/weaviate/entities/models" 23 "github.com/weaviate/weaviate/entities/multi" 24 "github.com/weaviate/weaviate/entities/schema/crossref" 25 "github.com/weaviate/weaviate/entities/search" 26 ) 27 28 type repo interface { 29 MultiGet(ctx context.Context, query []multi.Identifier, 30 additional additional.Properties, tenant string) ([]search.Result, error) 31 } 32 33 func NewCacher(repo repo, logger logrus.FieldLogger, tenant string) *Cacher { 34 return &Cacher{ 35 logger: logger, 36 repo: repo, 37 store: map[multi.Identifier]search.Result{}, 38 withGroup: false, 39 tenant: tenant, 40 } 41 } 42 43 func NewCacherWithGroup(repo repo, logger logrus.FieldLogger, tenant string) *Cacher { 44 return &Cacher{ 45 logger: logger, 46 repo: repo, 47 store: map[multi.Identifier]search.Result{}, 48 // for groupBy feature 49 withGroup: true, 50 getGroupSelectProperties: getGroupSelectProperties, 51 tenant: tenant, 52 } 53 } 54 55 type cacherJob struct { 56 si multi.Identifier 57 props search.SelectProperties 58 complete bool 59 } 60 61 type Cacher struct { 62 sync.Mutex 63 jobs []cacherJob 64 logger logrus.FieldLogger 65 repo repo 66 store map[multi.Identifier]search.Result 67 additional additional.Properties // meta is immutable for the lifetime of the request cacher, so we can safely store it 68 // for groupBy feature 69 withGroup bool 70 getGroupSelectProperties func(properties search.SelectProperties) search.SelectProperties 71 tenant string 72 } 73 74 func (c *Cacher) Get(si multi.Identifier) (search.Result, bool) { 75 sr, ok := c.store[si] 76 return sr, ok 77 } 78 79 // Build builds the lookup cache recursively and tries to be smart about it. This 80 // means that it aims to use only a single (multiget) transaction per layer. 81 // The recursion exit condition is jobs marked as done. At some point 82 // the cacher will realise that for every nested prop there is already a 83 // complete job, so it it stop the recursion. 84 // 85 // build is called on a "level" i.e. the search result. After working 86 // on the job list for the first time if the resolved items still contain 87 // references and the user set the SelectProperty to indicate they want to 88 // resolve them, build is called again on all the results (plural!) from the 89 // previous run. We thus end up with one request to the backend per level 90 // regardless of the amount of lookups per level. 91 // 92 // This keeps request times to a minimum even on deeply nested requests. 93 func (c *Cacher) Build(ctx context.Context, objects []search.Result, 94 properties search.SelectProperties, additional additional.Properties, 95 ) error { 96 c.additional = additional 97 err := c.findJobsFromResponse(objects, properties) 98 if err != nil { 99 return fmt.Errorf("build request cache: %v", err) 100 } 101 102 c.dedupJobList() 103 err = c.fetchJobs(ctx) 104 if err != nil { 105 return fmt.Errorf("build request cache: %v", err) 106 } 107 108 return nil 109 } 110 111 // A response is a []search.Result which has all primitive props parsed (and 112 // even ref-beacons parsed into their respective types, but not resolved!) 113 // findJobsFromResponse will traverse through it and check if there are 114 // references. In a recursive lookup this can both be done on the rootlevel to 115 // start the first lookup as well as recursively on the results of a lookup to 116 // further look if a next-level call is required. 117 func (c *Cacher) findJobsFromResponse(objects []search.Result, properties search.SelectProperties) error { 118 for _, obj := range objects { 119 var err error 120 121 // we can only set SelectProperties on the rootlevel since this is the only 122 // place where we have a single root class. In nested lookups we need to 123 // first identify the correct path in the SelectProperties graph which 124 // correspends with the path we're currently traversing through. Thus we 125 // always cache the original SelectProps with the job. This call goes 126 // through the job history and looks up the correct SelectProperties 127 // subpath to use in this place. 128 // tl;dr: On root level (root=base) take props from the outside, on a 129 // nested level lookup the SelectProps matching the current base element 130 propertiesReplaced, err := c.ReplaceInitialPropertiesWithSpecific(obj, properties) 131 if err != nil { 132 return err 133 } 134 135 if obj.Schema == nil { 136 return nil 137 } 138 139 schemaMap, ok := obj.Schema.(map[string]interface{}) 140 if !ok { 141 return fmt.Errorf("object schema is present, but not a map: %T", obj) 142 } 143 144 if err := c.parseSchemaMap(schemaMap, propertiesReplaced); err != nil { 145 return err 146 } 147 148 if c.withGroup { 149 if err := c.parseAdditionalGroup(obj, properties); err != nil { 150 return err 151 } 152 } 153 } 154 155 return nil 156 } 157 158 func (c *Cacher) parseAdditionalGroup(obj search.Result, properties search.SelectProperties) error { 159 if obj.AdditionalProperties != nil && obj.AdditionalProperties["group"] != nil { 160 if group, ok := obj.AdditionalProperties["group"].(*additional.Group); ok { 161 for _, hitMap := range group.Hits { 162 if err := c.parseSchemaMap(hitMap, c.getGroupSelectProperties(properties)); err != nil { 163 return err 164 } 165 } 166 } 167 } 168 return nil 169 } 170 171 func (c *Cacher) parseSchemaMap(schemaMap map[string]interface{}, propertiesReplaced search.SelectProperties) error { 172 for key, value := range schemaMap { 173 selectProp := propertiesReplaced.FindProperty(key) 174 skip, unresolved := c.skipProperty(key, value, selectProp) 175 if skip { 176 continue 177 } 178 179 for _, selectPropRef := range selectProp.Refs { 180 innerProperties := selectPropRef.RefProperties 181 182 for _, item := range unresolved { 183 ref, err := c.extractAndParseBeacon(item) 184 if err != nil { 185 return err 186 } 187 c.addJob(multi.Identifier{ 188 ID: ref.TargetID.String(), 189 ClassName: selectPropRef.ClassName, 190 }, innerProperties) 191 } 192 } 193 } 194 return nil 195 } 196 197 func (c *Cacher) skipProperty(key string, value interface{}, selectProp *search.SelectProperty) (bool, models.MultipleRef) { 198 // the cacher runs at a point where primitive props have already been 199 // parsed, so we can simply look for parsed, but not resolved refenereces 200 parsed, ok := value.(models.MultipleRef) 201 if !ok { 202 // must be another kind of prop, not interesting for us 203 return true, nil 204 } 205 206 if selectProp == nil { 207 // while we did hit a ref propr, the user is not interested in resolving 208 // this prop 209 return true, nil 210 } 211 212 return false, parsed 213 } 214 215 func (c *Cacher) extractAndParseBeacon(item *models.SingleRef) (*crossref.Ref, error) { 216 return crossref.Parse(item.Beacon.String()) 217 } 218 219 func (c *Cacher) ReplaceInitialPropertiesWithSpecific(obj search.Result, 220 properties search.SelectProperties, 221 ) (search.SelectProperties, error) { 222 if properties != nil { 223 // don't overwrite the properties if the caller has explicitly set them, 224 // this can only mean they're at the root level 225 return properties, nil 226 } 227 228 // this is a nested level, we cannot rely on global initialSelectProperties 229 // anymore, instead we need to find the selectProperties for exactly this 230 // ID 231 job, ok := c.findJob(multi.Identifier{ 232 ID: obj.ID.String(), 233 ClassName: obj.ClassName, 234 }) 235 if ok { 236 return job.props, nil 237 } 238 239 return properties, nil 240 } 241 242 func (c *Cacher) addJob(si multi.Identifier, props search.SelectProperties) { 243 c.jobs = append(c.jobs, cacherJob{si, props, false}) 244 } 245 246 func (c *Cacher) findJob(si multi.Identifier) (cacherJob, bool) { 247 for _, job := range c.jobs { 248 if job.si == si { 249 return job, true 250 } 251 } 252 253 return cacherJob{}, false 254 } 255 256 // finds incompleteJobs without altering the original job list 257 func (c *Cacher) incompleteJobs() []cacherJob { 258 out := make([]cacherJob, len(c.jobs)) 259 n := 0 260 for _, job := range c.jobs { 261 if !job.complete { 262 out[n] = job 263 n++ 264 } 265 } 266 267 return out[:n] 268 } 269 270 // finds complete jobs without altering the original job list 271 func (c *Cacher) completeJobs() []cacherJob { 272 out := make([]cacherJob, len(c.jobs)) 273 n := 0 274 for _, job := range c.jobs { 275 if job.complete { 276 out[n] = job 277 n++ 278 } 279 } 280 281 return out[:n] 282 } 283 284 // alters the list, removes duplicates. 285 func (c *Cacher) dedupJobList() { 286 incompleteJobs := c.incompleteJobs() 287 before := len(incompleteJobs) 288 if before == 0 { 289 // nothing to do 290 return 291 } 292 293 c.logger. 294 WithFields(logrus.Fields{ 295 "action": "request_cacher_dedup_joblist_start", 296 "jobs": before, 297 }). 298 Debug("starting job list deduplication") 299 deduped := make([]cacherJob, len(incompleteJobs)) 300 found := map[multi.Identifier]struct{}{} 301 302 // don't look up refs that are already completed - this can for example happen with cyclic refs 303 for _, job := range c.completeJobs() { 304 found[job.si] = struct{}{} 305 } 306 307 n := 0 308 for _, job := range incompleteJobs { 309 if _, ok := found[job.si]; ok { 310 continue 311 } 312 313 found[job.si] = struct{}{} 314 deduped[n] = job 315 n++ 316 } 317 318 c.jobs = append(c.completeJobs(), deduped[:n]...) 319 320 c.logger. 321 WithFields(logrus.Fields{ 322 "action": "request_cacher_dedup_joblist_complete", 323 "jobs": n, 324 "removedJobs": before - n, 325 }). 326 Debug("completed job list deduplication") 327 } 328 329 func (c *Cacher) fetchJobs(ctx context.Context) error { 330 jobs := c.incompleteJobs() 331 if len(jobs) == 0 { 332 c.logSkipFetchJobs() 333 return nil 334 } 335 336 query := jobListToMultiGetQuery(jobs) 337 res, err := c.repo.MultiGet(ctx, query, c.additional, c.tenant) 338 if err != nil { 339 return errors.Wrap(err, "fetch job list") 340 } 341 342 return c.parseAndStore(ctx, res) 343 } 344 345 func (c *Cacher) logSkipFetchJobs() { 346 c.logger. 347 WithFields( 348 logrus.Fields{ 349 "action": "request_cacher_fetch_jobs_skip", 350 }). 351 Trace("skip fetch jobs, have no incomplete jobs") 352 } 353 354 // parseAndStore parses the results for nested refs. Since it is already a 355 // []search.Result no other parsing is required, as we can expect this type to 356 // have all primitive props parsed correctly 357 // 358 // If nested refs are found, the recursion is started. 359 // 360 // Once no more nested refs can be found, the recursion triggers its exit 361 // condition and all jobs are stored. 362 func (c *Cacher) parseAndStore(ctx context.Context, res []search.Result) error { 363 // mark all current jobs as done, as we use the amount of incomplete jobs as 364 // the exit condition for the recursion. Next up, we will start a nested 365 // Build() call. If the Build call returns no new jobs, we are done and the 366 // recursion stops. If it does return more jobs, we will enter a nested 367 // iteration which will eventually come to this place again 368 c.markAllJobsAsDone() 369 370 err := c.Build(ctx, removeEmptyResults(res), nil, c.additional) 371 if err != nil { 372 return errors.Wrap(err, "build nested cache") 373 } 374 375 err = c.storeResults(res) 376 if err != nil { 377 return err 378 } 379 380 return nil 381 } 382 383 func removeEmptyResults(in []search.Result) []search.Result { 384 out := make([]search.Result, len(in)) 385 n := 0 386 for _, obj := range in { 387 if obj.ID != "" { 388 out[n] = obj 389 n++ 390 } 391 } 392 393 return out[0:n] 394 } 395 396 func (c *Cacher) storeResults(res search.Results) error { 397 for _, item := range res { 398 c.store[multi.Identifier{ 399 ID: item.ID.String(), 400 ClassName: item.ClassName, 401 }] = item 402 } 403 404 return nil 405 } 406 407 func (c *Cacher) markAllJobsAsDone() { 408 for i := range c.jobs { 409 c.jobs[i].complete = true 410 } 411 } 412 413 func jobListToMultiGetQuery(jobs []cacherJob) []multi.Identifier { 414 query := make([]multi.Identifier, len(jobs)) 415 for i, job := range jobs { 416 query[i] = job.si 417 } 418 419 return query 420 }