github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/refcache/cacher.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package refcache
    13  
    14  import (
    15  	"context"
    16  	"fmt"
    17  	"sync"
    18  
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  	"github.com/weaviate/weaviate/entities/additional"
    22  	"github.com/weaviate/weaviate/entities/models"
    23  	"github.com/weaviate/weaviate/entities/multi"
    24  	"github.com/weaviate/weaviate/entities/schema/crossref"
    25  	"github.com/weaviate/weaviate/entities/search"
    26  )
    27  
// repo is the minimal persistence dependency of the Cacher: batch retrieval
// of objects by (ID, class) identifiers for a given tenant. Defined at the
// consumer so any store implementing MultiGet can back the cache.
type repo interface {
	MultiGet(ctx context.Context, query []multi.Identifier,
		additional additional.Properties, tenant string) ([]search.Result, error)
}
    32  
    33  func NewCacher(repo repo, logger logrus.FieldLogger, tenant string) *Cacher {
    34  	return &Cacher{
    35  		logger:    logger,
    36  		repo:      repo,
    37  		store:     map[multi.Identifier]search.Result{},
    38  		withGroup: false,
    39  		tenant:    tenant,
    40  	}
    41  }
    42  
    43  func NewCacherWithGroup(repo repo, logger logrus.FieldLogger, tenant string) *Cacher {
    44  	return &Cacher{
    45  		logger: logger,
    46  		repo:   repo,
    47  		store:  map[multi.Identifier]search.Result{},
    48  		// for groupBy feature
    49  		withGroup:                true,
    50  		getGroupSelectProperties: getGroupSelectProperties,
    51  		tenant:                   tenant,
    52  	}
    53  }
    54  
// cacherJob is one pending (or already completed) lookup of a referenced
// object, identified by ID+class, together with the SelectProperties that
// apply at that nesting level.
type cacherJob struct {
	// si identifies the object to fetch (ID + class name)
	si       multi.Identifier
	// props are the select properties valid at this nesting level; used to
	// decide which nested refs of the fetched object must be resolved next
	props    search.SelectProperties
	// complete marks the job as fetched; completed jobs are the recursion
	// exit condition and also guard against re-fetching cyclic refs
	complete bool
}
    60  
// Cacher is a request-scoped lookup cache for resolving cross-references.
// It collects lookup jobs, batches them into a single MultiGet per nesting
// level (see Build), and stores the fetched results for retrieval via Get.
type Cacher struct {
	// NOTE(review): no method visible in this file takes this embedded
	// mutex — confirm whether concurrent use is intended at all
	sync.Mutex
	// jobs accumulates ref lookups; completed jobs stay in the list
	jobs       []cacherJob
	logger     logrus.FieldLogger
	repo       repo
	// store holds fetched results keyed by (ID, class)
	store      map[multi.Identifier]search.Result
	additional additional.Properties // meta is immutable for the lifetime of the request cacher, so we can safely store it
	// for groupBy feature
	withGroup                bool
	getGroupSelectProperties func(properties search.SelectProperties) search.SelectProperties
	// tenant is passed through to every repo.MultiGet call
	tenant                   string
}
    73  
    74  func (c *Cacher) Get(si multi.Identifier) (search.Result, bool) {
    75  	sr, ok := c.store[si]
    76  	return sr, ok
    77  }
    78  
    79  // Build builds the lookup cache recursively and tries to be smart about it. This
    80  // means that it aims to use only a single (multiget) transaction per layer.
    81  // The recursion exit condition is jobs marked as done. At some point
    82  // the cacher will realise that for every nested prop there is already a
    83  // complete job, so it it stop the recursion.
    84  //
    85  // build is called on a "level" i.e. the search result. After working
    86  // on the job list for the first time if the resolved items still contain
    87  // references and the user set the SelectProperty to indicate they want to
    88  // resolve them, build is called again on all the results (plural!) from the
    89  // previous run. We thus end up with one request to the backend per level
    90  // regardless of the amount of lookups per level.
    91  //
    92  // This keeps request times to a minimum even on deeply nested requests.
    93  func (c *Cacher) Build(ctx context.Context, objects []search.Result,
    94  	properties search.SelectProperties, additional additional.Properties,
    95  ) error {
    96  	c.additional = additional
    97  	err := c.findJobsFromResponse(objects, properties)
    98  	if err != nil {
    99  		return fmt.Errorf("build request cache: %v", err)
   100  	}
   101  
   102  	c.dedupJobList()
   103  	err = c.fetchJobs(ctx)
   104  	if err != nil {
   105  		return fmt.Errorf("build request cache: %v", err)
   106  	}
   107  
   108  	return nil
   109  }
   110  
   111  // A response is a []search.Result which has all primitive props parsed (and
   112  // even ref-beacons parsed into their respective types, but not resolved!)
   113  // findJobsFromResponse will traverse through it and  check if there are
   114  // references. In a recursive lookup this can both be done on the rootlevel to
   115  // start the first lookup as well as recursively on the results of a lookup to
   116  // further look if a next-level call is required.
   117  func (c *Cacher) findJobsFromResponse(objects []search.Result, properties search.SelectProperties) error {
   118  	for _, obj := range objects {
   119  		var err error
   120  
   121  		// we can only set SelectProperties on the rootlevel since this is the only
   122  		// place where we have a single root class. In nested lookups we need to
   123  		// first identify the correct path in the SelectProperties graph which
   124  		// correspends with the path we're currently traversing through. Thus we
   125  		// always cache the original SelectProps with the job. This call goes
   126  		// through the job history and looks up the correct SelectProperties
   127  		// subpath to use in this place.
   128  		// tl;dr: On root level (root=base) take props from the outside, on a
   129  		// nested level lookup the SelectProps matching the current base element
   130  		propertiesReplaced, err := c.ReplaceInitialPropertiesWithSpecific(obj, properties)
   131  		if err != nil {
   132  			return err
   133  		}
   134  
   135  		if obj.Schema == nil {
   136  			return nil
   137  		}
   138  
   139  		schemaMap, ok := obj.Schema.(map[string]interface{})
   140  		if !ok {
   141  			return fmt.Errorf("object schema is present, but not a map: %T", obj)
   142  		}
   143  
   144  		if err := c.parseSchemaMap(schemaMap, propertiesReplaced); err != nil {
   145  			return err
   146  		}
   147  
   148  		if c.withGroup {
   149  			if err := c.parseAdditionalGroup(obj, properties); err != nil {
   150  				return err
   151  			}
   152  		}
   153  	}
   154  
   155  	return nil
   156  }
   157  
   158  func (c *Cacher) parseAdditionalGroup(obj search.Result, properties search.SelectProperties) error {
   159  	if obj.AdditionalProperties != nil && obj.AdditionalProperties["group"] != nil {
   160  		if group, ok := obj.AdditionalProperties["group"].(*additional.Group); ok {
   161  			for _, hitMap := range group.Hits {
   162  				if err := c.parseSchemaMap(hitMap, c.getGroupSelectProperties(properties)); err != nil {
   163  					return err
   164  				}
   165  			}
   166  		}
   167  	}
   168  	return nil
   169  }
   170  
   171  func (c *Cacher) parseSchemaMap(schemaMap map[string]interface{}, propertiesReplaced search.SelectProperties) error {
   172  	for key, value := range schemaMap {
   173  		selectProp := propertiesReplaced.FindProperty(key)
   174  		skip, unresolved := c.skipProperty(key, value, selectProp)
   175  		if skip {
   176  			continue
   177  		}
   178  
   179  		for _, selectPropRef := range selectProp.Refs {
   180  			innerProperties := selectPropRef.RefProperties
   181  
   182  			for _, item := range unresolved {
   183  				ref, err := c.extractAndParseBeacon(item)
   184  				if err != nil {
   185  					return err
   186  				}
   187  				c.addJob(multi.Identifier{
   188  					ID:        ref.TargetID.String(),
   189  					ClassName: selectPropRef.ClassName,
   190  				}, innerProperties)
   191  			}
   192  		}
   193  	}
   194  	return nil
   195  }
   196  
   197  func (c *Cacher) skipProperty(key string, value interface{}, selectProp *search.SelectProperty) (bool, models.MultipleRef) {
   198  	// the cacher runs at a point where primitive props have already been
   199  	// parsed, so we can simply look for parsed, but not resolved refenereces
   200  	parsed, ok := value.(models.MultipleRef)
   201  	if !ok {
   202  		// must be another kind of prop, not interesting for us
   203  		return true, nil
   204  	}
   205  
   206  	if selectProp == nil {
   207  		// while we did hit a ref propr, the user is not interested in resolving
   208  		// this prop
   209  		return true, nil
   210  	}
   211  
   212  	return false, parsed
   213  }
   214  
// extractAndParseBeacon parses a single ref's beacon string into a
// structured cross-reference via crossref.Parse.
func (c *Cacher) extractAndParseBeacon(item *models.SingleRef) (*crossref.Ref, error) {
	return crossref.Parse(item.Beacon.String())
}
   218  
   219  func (c *Cacher) ReplaceInitialPropertiesWithSpecific(obj search.Result,
   220  	properties search.SelectProperties,
   221  ) (search.SelectProperties, error) {
   222  	if properties != nil {
   223  		// don't overwrite the properties if the caller has explicitly set them,
   224  		// this can only mean they're at the root level
   225  		return properties, nil
   226  	}
   227  
   228  	// this is a nested level, we cannot rely on global initialSelectProperties
   229  	// anymore, instead we need to find the selectProperties for exactly this
   230  	// ID
   231  	job, ok := c.findJob(multi.Identifier{
   232  		ID:        obj.ID.String(),
   233  		ClassName: obj.ClassName,
   234  	})
   235  	if ok {
   236  		return job.props, nil
   237  	}
   238  
   239  	return properties, nil
   240  }
   241  
   242  func (c *Cacher) addJob(si multi.Identifier, props search.SelectProperties) {
   243  	c.jobs = append(c.jobs, cacherJob{si, props, false})
   244  }
   245  
   246  func (c *Cacher) findJob(si multi.Identifier) (cacherJob, bool) {
   247  	for _, job := range c.jobs {
   248  		if job.si == si {
   249  			return job, true
   250  		}
   251  	}
   252  
   253  	return cacherJob{}, false
   254  }
   255  
   256  // finds incompleteJobs without altering the original job list
   257  func (c *Cacher) incompleteJobs() []cacherJob {
   258  	out := make([]cacherJob, len(c.jobs))
   259  	n := 0
   260  	for _, job := range c.jobs {
   261  		if !job.complete {
   262  			out[n] = job
   263  			n++
   264  		}
   265  	}
   266  
   267  	return out[:n]
   268  }
   269  
   270  // finds complete jobs  without altering the original job list
   271  func (c *Cacher) completeJobs() []cacherJob {
   272  	out := make([]cacherJob, len(c.jobs))
   273  	n := 0
   274  	for _, job := range c.jobs {
   275  		if job.complete {
   276  			out[n] = job
   277  			n++
   278  		}
   279  	}
   280  
   281  	return out[:n]
   282  }
   283  
   284  // alters the list, removes duplicates.
   285  func (c *Cacher) dedupJobList() {
   286  	incompleteJobs := c.incompleteJobs()
   287  	before := len(incompleteJobs)
   288  	if before == 0 {
   289  		// nothing to do
   290  		return
   291  	}
   292  
   293  	c.logger.
   294  		WithFields(logrus.Fields{
   295  			"action": "request_cacher_dedup_joblist_start",
   296  			"jobs":   before,
   297  		}).
   298  		Debug("starting job list deduplication")
   299  	deduped := make([]cacherJob, len(incompleteJobs))
   300  	found := map[multi.Identifier]struct{}{}
   301  
   302  	// don't look up refs that are already completed - this can for example happen with cyclic refs
   303  	for _, job := range c.completeJobs() {
   304  		found[job.si] = struct{}{}
   305  	}
   306  
   307  	n := 0
   308  	for _, job := range incompleteJobs {
   309  		if _, ok := found[job.si]; ok {
   310  			continue
   311  		}
   312  
   313  		found[job.si] = struct{}{}
   314  		deduped[n] = job
   315  		n++
   316  	}
   317  
   318  	c.jobs = append(c.completeJobs(), deduped[:n]...)
   319  
   320  	c.logger.
   321  		WithFields(logrus.Fields{
   322  			"action":      "request_cacher_dedup_joblist_complete",
   323  			"jobs":        n,
   324  			"removedJobs": before - n,
   325  		}).
   326  		Debug("completed job list deduplication")
   327  }
   328  
   329  func (c *Cacher) fetchJobs(ctx context.Context) error {
   330  	jobs := c.incompleteJobs()
   331  	if len(jobs) == 0 {
   332  		c.logSkipFetchJobs()
   333  		return nil
   334  	}
   335  
   336  	query := jobListToMultiGetQuery(jobs)
   337  	res, err := c.repo.MultiGet(ctx, query, c.additional, c.tenant)
   338  	if err != nil {
   339  		return errors.Wrap(err, "fetch job list")
   340  	}
   341  
   342  	return c.parseAndStore(ctx, res)
   343  }
   344  
   345  func (c *Cacher) logSkipFetchJobs() {
   346  	c.logger.
   347  		WithFields(
   348  			logrus.Fields{
   349  				"action": "request_cacher_fetch_jobs_skip",
   350  			}).
   351  		Trace("skip fetch jobs, have no incomplete jobs")
   352  }
   353  
   354  // parseAndStore parses the results for nested refs. Since it is already a
   355  // []search.Result no other parsing is required, as we can expect this type to
   356  // have all primitive props parsed correctly
   357  //
   358  // If nested refs are found, the recursion is started.
   359  //
   360  // Once no more nested refs can be found, the recursion triggers its exit
   361  // condition and all jobs are stored.
   362  func (c *Cacher) parseAndStore(ctx context.Context, res []search.Result) error {
   363  	// mark all current jobs as done, as we use the amount of incomplete jobs as
   364  	// the exit condition for the recursion. Next up, we will start a nested
   365  	// Build() call. If the Build call returns no new jobs, we are done and the
   366  	// recursion stops. If it does return more jobs, we will enter a nested
   367  	// iteration which will eventually come to this place again
   368  	c.markAllJobsAsDone()
   369  
   370  	err := c.Build(ctx, removeEmptyResults(res), nil, c.additional)
   371  	if err != nil {
   372  		return errors.Wrap(err, "build nested cache")
   373  	}
   374  
   375  	err = c.storeResults(res)
   376  	if err != nil {
   377  		return err
   378  	}
   379  
   380  	return nil
   381  }
   382  
   383  func removeEmptyResults(in []search.Result) []search.Result {
   384  	out := make([]search.Result, len(in))
   385  	n := 0
   386  	for _, obj := range in {
   387  		if obj.ID != "" {
   388  			out[n] = obj
   389  			n++
   390  		}
   391  	}
   392  
   393  	return out[0:n]
   394  }
   395  
   396  func (c *Cacher) storeResults(res search.Results) error {
   397  	for _, item := range res {
   398  		c.store[multi.Identifier{
   399  			ID:        item.ID.String(),
   400  			ClassName: item.ClassName,
   401  		}] = item
   402  	}
   403  
   404  	return nil
   405  }
   406  
   407  func (c *Cacher) markAllJobsAsDone() {
   408  	for i := range c.jobs {
   409  		c.jobs[i].complete = true
   410  	}
   411  }
   412  
   413  func jobListToMultiGetQuery(jobs []cacherJob) []multi.Identifier {
   414  	query := make([]multi.Identifier, len(jobs))
   415  	for i, job := range jobs {
   416  		query[i] = job.si
   417  	}
   418  
   419  	return query
   420  }