go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gae/impl/memory/datastore_index_selection.go (about)

     1  // Copyright 2015 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package memory
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  	"strings"
    22  
    23  	"go.chromium.org/luci/common/data/cmpbin"
    24  	"go.chromium.org/luci/common/data/stringset"
    25  
    26  	ds "go.chromium.org/luci/gae/service/datastore"
    27  )
    28  
    29  // ErrMissingIndex is returned when the current indexes are not sufficient
    30  // for the current query.
    31  type ErrMissingIndex struct {
    32  	ns      string
    33  	Missing *ds.IndexDefinition
    34  }
    35  
    36  func (e *ErrMissingIndex) Error() string {
    37  	yaml, err := e.Missing.YAMLString()
    38  	if err != nil {
    39  		panic(err)
    40  	}
    41  	return fmt.Sprintf(
    42  		"Insufficient indexes. Consider adding:\n%s", yaml)
    43  }
    44  
// reducedQuery contains only the pieces of the query necessary to iterate for
// results.
//
//	deduplication is applied externally
//	projection / keysonly / entity retrieval is done externally
type reducedQuery struct {
	// kc is the key context of the query; its Namespace is used to build the
	// names of the collections that are looked up in the memStore.
	kc ds.KeyContext
	// kind is the entity kind being queried. An empty kind denotes a kindless
	// query, which is served from the "ents:" collection instead of an index.
	kind string

	// eqFilters indicate the set of all prefix constraints which need to be
	// fulfilled in the composite query. All of these will translate into prefix
	// bytes for SOME index.
	//
	// The map is keyed by property name; the special "__ancestor__" entry (if
	// present) must be satisfied by ALL selected indexes rather than by any
	// single one.
	eqFilters map[string]stringset.Set

	// suffixFormat is the PRECISE listing of the suffix columns that ALL indexes
	//   in the multi query will have.
	//
	// suffixFormat ALWAYS includes the inequality filter (if any) as the 0th
	//   element
	// suffixFormat ALWAYS includes any additional projections (in ascending
	//   order) after all user defined sort orders
	// suffixFormat ALWAYS has __key__ as the last column
	suffixFormat []ds.IndexColumn

	// limits of the inequality and/or full sort order. This is ONLY a suffix,
	// and it will be appended to the prefix during iteration.
	start []byte
	end   []byte

	// metadata describing the total number of columns that this query requires to
	// execute perfectly. An index whose full sort order has exactly this many
	// columns is a candidate for being a "perfect" match.
	numCols int
}
    78  
    79  type indexDefinitionSortable struct {
    80  	// eqFilts is the list of ACTUAL prefix columns. Note that it may contain
    81  	// redundant columns! (e.g. (tag, tag) is a perfectly valid prefix, becuase
    82  	// (tag=1, tag=2) is a perfectly valid query).
    83  	eqFilts []ds.IndexColumn
    84  	coll    memCollection
    85  }
    86  
    87  func (i *indexDefinitionSortable) hasAncestor() bool {
    88  	return len(i.eqFilts) > 0 && i.eqFilts[0].Property == "__ancestor__"
    89  }
    90  
    91  func (i *indexDefinitionSortable) numEqHits(c *constraints) int {
    92  	ret := 0
    93  	for _, filt := range i.eqFilts {
    94  		if _, ok := c.constraints[filt.Property]; ok {
    95  			ret++
    96  		}
    97  	}
    98  	return ret
    99  }
   100  
   101  type indexDefinitionSortableSlice []indexDefinitionSortable
   102  
   103  func (idxs indexDefinitionSortableSlice) Len() int      { return len(idxs) }
   104  func (idxs indexDefinitionSortableSlice) Swap(i, j int) { idxs[i], idxs[j] = idxs[j], idxs[i] }
   105  func (idxs indexDefinitionSortableSlice) Less(i, j int) bool {
   106  	a, b := idxs[i], idxs[j]
   107  	if a.coll == nil && b.coll != nil {
   108  		return true
   109  	} else if a.coll != nil && b.coll == nil {
   110  		return false
   111  	}
   112  
   113  	cmp := len(a.eqFilts) - len(b.eqFilts)
   114  	if cmp < 0 {
   115  		return true
   116  	} else if cmp > 0 {
   117  		return false
   118  	}
   119  	for k, col := range a.eqFilts {
   120  		ocol := b.eqFilts[k]
   121  		if !col.Descending && ocol.Descending {
   122  			return true
   123  		} else if col.Descending && !ocol.Descending {
   124  			return false
   125  		}
   126  		if col.Property < ocol.Property {
   127  			return true
   128  		} else if col.Property > ocol.Property {
   129  			return false
   130  		}
   131  	}
   132  	return false
   133  }
   134  
   135  // maybeAddDefinition possibly adds a new indexDefinitionSortable to this slice.
   136  // It's only added if it could be useful in servicing q, otherwise this function
   137  // is a noop.
   138  //
   139  // This returns true iff the proposed index is OK and depletes missingTerms to
   140  // empty.
   141  //
   142  // If the proposed index is PERFECT (e.g. contains enough columns to cover all
   143  // equality filters, and also has the correct suffix), idxs will be replaced
   144  // with JUST that index, and this will return true.
   145  func (idxs *indexDefinitionSortableSlice) maybeAddDefinition(q *reducedQuery, s memStore, missingTerms stringset.Set, id *ds.IndexDefinition) bool {
   146  	// Kindless queries are handled elsewhere.
   147  	if id.Kind != q.kind {
   148  		impossible(
   149  			fmt.Errorf("maybeAddDefinition given index with wrong kind %q v %q", id.Kind, q.kind))
   150  	}
   151  
   152  	// If we're an ancestor query, and the index is compound, but doesn't include
   153  	// an Ancestor field, it doesn't work. Builtin indexes can be used for
   154  	// ancestor queries (and have !Ancestor), assuming that it's only equality
   155  	// filters (plus inequality on __key__), or a single inequality.
   156  	if q.eqFilters["__ancestor__"] != nil && !id.Ancestor && !id.Builtin() {
   157  		impossible(
   158  			fmt.Errorf("maybeAddDefinition given compound index with wrong ancestor info: %s %#v", id, q))
   159  	}
   160  
   161  	// add __ancestor__ if necessary
   162  	sortBy := id.GetFullSortOrder()
   163  
   164  	// If the index has fewer fields than we need for the suffix, it can't
   165  	// possibly help.
   166  	if len(sortBy) < len(q.suffixFormat) {
   167  		return false
   168  	}
   169  
   170  	numEqFilts := len(sortBy) - len(q.suffixFormat)
   171  	// make sure the orders are precisely the same
   172  	for i, sb := range sortBy[numEqFilts:] {
   173  		if q.suffixFormat[i] != sb {
   174  			return false
   175  		}
   176  	}
   177  
   178  	if id.Builtin() && numEqFilts == 0 {
   179  		if len(q.eqFilters) > 1 || (len(q.eqFilters) == 1 && q.eqFilters["__ancestor__"] == nil) {
   180  			return false
   181  		}
   182  		if len(sortBy) > 1 && q.eqFilters["__ancestor__"] != nil {
   183  			return false
   184  		}
   185  	}
   186  
   187  	// Make sure the equalities section doesn't contain any properties we don't
   188  	// want in our query.
   189  	//
   190  	// numByProp && totalEqFilts will be used to see if this is a perfect match
   191  	// later.
   192  	numByProp := make(map[string]int, len(q.eqFilters))
   193  	totalEqFilts := 0
   194  
   195  	eqFilts := sortBy[:numEqFilts]
   196  	for _, p := range eqFilts {
   197  		if _, ok := q.eqFilters[p.Property]; !ok {
   198  			return false
   199  		}
   200  		numByProp[p.Property]++
   201  		totalEqFilts++
   202  	}
   203  
   204  	// ok, we can actually use this
   205  
   206  	// Grab the collection for convenience later. We don't want to invalidate this
   207  	// index's potential just because the collection doesn't exist. If it's
   208  	// a builtin and it doesn't exist, it still needs to be one of the 'possible'
   209  	// indexes... it just means that the user's query will end up with no results.
   210  	coll := s.GetCollection(
   211  		fmt.Sprintf("idx:%s:%s", q.kc.Namespace, ds.Serialize.ToBytes(*id.PrepForIdxTable())))
   212  
   213  	// First, see if it's a perfect match. If it is, then our search is over.
   214  	//
   215  	// A perfect match contains ALL the equality filter columns (or more, since
   216  	// we can use residuals to fill in the extras).
   217  	for _, sb := range eqFilts {
   218  		missingTerms.Del(sb.Property)
   219  	}
   220  
   221  	perfect := false
   222  	if len(sortBy) == q.numCols {
   223  		perfect = true
   224  		for k, num := range numByProp {
   225  			if num < q.eqFilters[k].Len() {
   226  				perfect = false
   227  				break
   228  			}
   229  		}
   230  	}
   231  	toAdd := indexDefinitionSortable{coll: coll, eqFilts: eqFilts}
   232  	if perfect {
   233  		*idxs = indexDefinitionSortableSlice{toAdd}
   234  	} else {
   235  		*idxs = append(*idxs, toAdd)
   236  	}
   237  	return missingTerms.Len() == 0
   238  }
   239  
   240  // getRelevantIndexes retrieves the relevant indexes which could be used to
   241  // service q. It returns nil if it's not possible to service q with the current
   242  // indexes.
   243  func getRelevantIndexes(q *reducedQuery, s memStore) (indexDefinitionSortableSlice, error) {
   244  	missingTerms := stringset.New(len(q.eqFilters))
   245  	for k := range q.eqFilters {
   246  		if k == "__ancestor__" {
   247  			// ancestor is not a prefix which can be satisfied by a single index. It
   248  			// must be satisfied by ALL indexes (and has special logic for this in
   249  			// the addDefinition logic)
   250  			continue
   251  		}
   252  		missingTerms.Add(k)
   253  	}
   254  	idxs := indexDefinitionSortableSlice{}
   255  
   256  	// First we add builtins
   257  	// add
   258  	//   idx:KIND
   259  	if idxs.maybeAddDefinition(q, s, missingTerms, &ds.IndexDefinition{
   260  		Kind: q.kind,
   261  	}) {
   262  		return idxs, nil
   263  	}
   264  
   265  	// add
   266  	//   idx:KIND:prop
   267  	//   idx:KIND:-prop
   268  	props := stringset.New(len(q.eqFilters) + len(q.suffixFormat))
   269  	for prop := range q.eqFilters {
   270  		props.Add(prop)
   271  	}
   272  	for _, col := range q.suffixFormat[:len(q.suffixFormat)-1] {
   273  		props.Add(col.Property)
   274  	}
   275  	for _, prop := range props.ToSlice() {
   276  		if !isSpecialProp(prop) && (strings.HasPrefix(prop, "__") && strings.HasSuffix(prop, "__")) {
   277  			continue
   278  		}
   279  		if idxs.maybeAddDefinition(q, s, missingTerms, &ds.IndexDefinition{
   280  			Kind: q.kind,
   281  			SortBy: []ds.IndexColumn{
   282  				{Property: prop},
   283  			},
   284  		}) {
   285  			return idxs, nil
   286  		}
   287  		if idxs.maybeAddDefinition(q, s, missingTerms, &ds.IndexDefinition{
   288  			Kind: q.kind,
   289  			SortBy: []ds.IndexColumn{
   290  				{Property: prop, Descending: true},
   291  			},
   292  		}) {
   293  			return idxs, nil
   294  		}
   295  	}
   296  
   297  	// Try adding all compound indexes whose suffix matches.
   298  	suffix := &ds.IndexDefinition{
   299  		Kind:     q.kind,
   300  		Ancestor: q.eqFilters["__ancestor__"] != nil,
   301  		SortBy:   q.suffixFormat,
   302  	}
   303  	walkCompIdxs(s, suffix, func(def *ds.IndexDefinition) bool {
   304  		// keep walking until we find a perfect index.
   305  		return !idxs.maybeAddDefinition(q, s, missingTerms, def)
   306  	})
   307  
   308  	// this query is impossible to fulfill with the current indexes. Not all the
   309  	// terms (equality + projection) are satisfied.
   310  	if missingTerms.Len() > 0 || len(idxs) == 0 {
   311  		remains := &ds.IndexDefinition{
   312  			Kind:     q.kind,
   313  			Ancestor: q.eqFilters["__ancestor__"] != nil,
   314  		}
   315  		terms := missingTerms.ToSlice()
   316  		if serializationDeterministic {
   317  			sort.Strings(terms)
   318  		}
   319  		for _, term := range terms {
   320  			remains.SortBy = append(remains.SortBy, ds.IndexColumn{Property: term})
   321  		}
   322  		remains.SortBy = append(remains.SortBy, q.suffixFormat...)
   323  		last := remains.SortBy[len(remains.SortBy)-1]
   324  		if !last.Descending {
   325  			// this removes the __key__ column, since it's implicit.
   326  			remains.SortBy = remains.SortBy[:len(remains.SortBy)-1]
   327  		}
   328  		if remains.Builtin() {
   329  			impossible(
   330  				fmt.Errorf("recommended missing index would be a builtin: %s", remains))
   331  		}
   332  		return nil, &ErrMissingIndex{q.kc.Namespace, remains}
   333  	}
   334  
   335  	return idxs, nil
   336  }
   337  
   338  // generate generates a single iterDefinition for the given index.
   339  func generate(q *reducedQuery, idx *indexDefinitionSortable, c *constraints) *iterDefinition {
   340  	def := &iterDefinition{
   341  		c:     idx.coll,
   342  		start: q.start,
   343  		end:   q.end,
   344  	}
   345  	toJoin := make([][]byte, len(idx.eqFilts))
   346  	for _, sb := range idx.eqFilts {
   347  		val := c.peel(sb.Property)
   348  		if sb.Descending {
   349  			val = cmpbin.InvertBytes(val)
   350  		}
   351  		toJoin = append(toJoin, val)
   352  	}
   353  	def.prefix = bytes.Join(toJoin, nil)
   354  	def.prefixLen = len(def.prefix)
   355  
   356  	if q.eqFilters["__ancestor__"] != nil && !idx.hasAncestor() {
   357  		// The query requires an ancestor, but the index doesn't explicitly have it
   358  		// as part of the prefix (otherwise it would have been the first eqFilt
   359  		// above). This happens when it's a builtin index, or if it's the primary
   360  		// index (for a kindless query), or if it's the Kind index (for a filterless
   361  		// query).
   362  		//
   363  		// builtin indexes are:
   364  		//   Kind/__key__
   365  		//   Kind/Prop/__key__
   366  		//   Kind/Prop/-__key__
   367  		if len(q.suffixFormat) > 2 || q.suffixFormat[len(q.suffixFormat)-1].Property != "__key__" {
   368  			// This should never happen. One of the previous validators would have
   369  			// selected a different index. But just in case.
   370  			impossible(fmt.Errorf("cannot supply an implicit ancestor for %#v", idx))
   371  		}
   372  
   373  		// get the only value out of __ancestor__
   374  		anc, _ := q.eqFilters["__ancestor__"].Peek()
   375  
   376  		// Intentionally do NOT update prefixLen. This allows multiIterator to
   377  		// correctly include the entire key in the shared iterator suffix, instead
   378  		// of just the remainder.
   379  
   380  		// chop the terminal null byte off the q.ancestor key... we can accept
   381  		// anything which is a descendant or an exact match.  Removing the last byte
   382  		// from the key (the terminating null) allows this trick to work. Otherwise
   383  		// it would be a closed range of EXACTLY this key.
   384  		chopped := []byte(anc[:len(anc)-1])
   385  		if q.suffixFormat[0].Descending {
   386  			chopped = cmpbin.InvertBytes(chopped)
   387  		}
   388  		def.prefix = cmpbin.ConcatBytes(def.prefix, chopped)
   389  
   390  		// Update start and end, since we know that if they contain anything, they
   391  		// contain values for the __key__ field. This is necessary because bytes
   392  		// are shifting from the suffix to the prefix, and start/end should only
   393  		// contain suffix (variable) bytes.
   394  		if def.start != nil {
   395  			if !bytes.HasPrefix(def.start, chopped) {
   396  				// again, shouldn't happen, but if it does, we want to know about it.
   397  				impossible(fmt.Errorf(
   398  					"start suffix for implied ancestor doesn't start with ancestor! start:%v ancestor:%v",
   399  					def.start, chopped))
   400  			}
   401  			def.start = def.start[len(chopped):]
   402  		}
   403  		if def.end != nil {
   404  			if !bytes.HasPrefix(def.end, chopped) {
   405  				impossible(fmt.Errorf(
   406  					"end suffix for implied ancestor doesn't start with ancestor! end:%v ancestor:%v",
   407  					def.end, chopped))
   408  			}
   409  			def.end = def.end[len(chopped):]
   410  		}
   411  	}
   412  
   413  	return def
   414  }
   415  
   416  type constraints struct {
   417  	constraints     map[string][][]byte
   418  	original        map[string][][]byte
   419  	residualMapping map[string]int
   420  }
   421  
   422  // peel picks a constraint value for the property. It then removes this value
   423  // from constraints (possibly removing the entire row from constraints if it
   424  // was the last value). If the value wasn't available in constraints, it picks
   425  // the value from residuals.
   426  func (c *constraints) peel(prop string) []byte {
   427  	ret := []byte(nil)
   428  	if vals, ok := c.constraints[prop]; ok {
   429  		ret = vals[0]
   430  		if len(vals) == 1 {
   431  			delete(c.constraints, prop)
   432  		} else {
   433  			c.constraints[prop] = vals[1:]
   434  		}
   435  	} else {
   436  		row := c.original[prop]
   437  		idx := c.residualMapping[prop]
   438  		c.residualMapping[prop]++
   439  		ret = row[idx%len(row)]
   440  	}
   441  	return ret
   442  }
   443  
   444  func (c *constraints) empty() bool {
   445  	return len(c.constraints) == 0
   446  }
   447  
   448  // calculateConstraints produces a mapping of all equality filters to the values
   449  // that they're constrained to. It also calculates residuals, which are an
   450  // arbitrary value for filling index prefixes which have more equality fields
   451  // than are necessary. The value doesn't matter, as long as its an equality
   452  // constraint in the original query.
   453  func calculateConstraints(q *reducedQuery) *constraints {
   454  	ret := &constraints{
   455  		original:        make(map[string][][]byte, len(q.eqFilters)),
   456  		constraints:     make(map[string][][]byte, len(q.eqFilters)),
   457  		residualMapping: make(map[string]int),
   458  	}
   459  	for prop, vals := range q.eqFilters {
   460  		bvals := make([][]byte, 0, vals.Len())
   461  		vals.Iter(func(val string) bool {
   462  			bvals = append(bvals, []byte(val))
   463  			return true
   464  		})
   465  		ret.original[prop] = bvals
   466  		if prop == "__ancestor__" {
   467  			// exclude __ancestor__ from the constraints.
   468  			//
   469  			// This is because it's handled specially during index proposal and
   470  			// generation. Ancestor is used by ALL indexes, and so its residual value
   471  			// in ret.original above will be sufficient.
   472  			continue
   473  		}
   474  		ret.constraints[prop] = bvals
   475  	}
   476  	return ret
   477  }
   478  
   479  // getIndexes returns a set of iterator definitions. Iterating over these
   480  // will result in matching suffixes.
   481  func getIndexes(q *reducedQuery, s memStore) ([]*iterDefinition, error) {
   482  	relevantIdxs := indexDefinitionSortableSlice(nil)
   483  	if q.kind == "" {
   484  		if coll := s.GetCollection("ents:" + q.kc.Namespace); coll != nil {
   485  			relevantIdxs = indexDefinitionSortableSlice{{coll: coll}}
   486  		}
   487  	} else {
   488  		err := error(nil)
   489  		relevantIdxs, err = getRelevantIndexes(q, s)
   490  		if err != nil {
   491  			return nil, err
   492  		}
   493  	}
   494  	if len(relevantIdxs) == 0 {
   495  		return nil, ds.ErrNullQuery
   496  	}
   497  
   498  	// This sorts it so that relevantIdxs goes less filters -> more filters. We
   499  	// traverse this list backwards, however, so we traverse it in more filters ->
   500  	// less filters order.
   501  	sort.Sort(relevantIdxs)
   502  
   503  	constraints := calculateConstraints(q)
   504  
   505  	ret := []*iterDefinition{}
   506  	for !constraints.empty() || len(ret) == 0 {
   507  		bestIdx := (*indexDefinitionSortable)(nil)
   508  		if len(ret) == 0 {
   509  			// if ret is empty, take the biggest relevantIdx. It's guaranteed to have
   510  			// the greatest number of equality filters of any index in the list, and
   511  			// we know that every equality filter will be pulled from constraints and
   512  			// not residual.
   513  			//
   514  			// This also takes care of the case when the query has no equality filters,
   515  			// in which case relevantIdxs will actually only contain one index anyway
   516  			// :)
   517  			bestIdx = &relevantIdxs[len(relevantIdxs)-1]
   518  			if bestIdx.coll == nil {
   519  				return nil, ds.ErrNullQuery
   520  			}
   521  		} else {
   522  			// If ret's not empty, then we need to find the best index we can. The
   523  			// best index will be the one with the most matching equality columns.
   524  			// Since relevantIdxs is sorted primarially by the number of equality
   525  			// columns, we walk down the list until the number of possible columns is
   526  			// worse than our best-so-far.
   527  			//
   528  			// Traversing the list backwards goes from more filters -> less filters,
   529  			// but also allows us to remove items from the list as we iterate over it.
   530  			bestNumEqHits := 0
   531  			for i := len(relevantIdxs) - 1; i >= 0; i-- {
   532  				idx := &relevantIdxs[i]
   533  				if len(idx.eqFilts) < bestNumEqHits {
   534  					// if the number of filters drops below our best hit, it's never going
   535  					// to get better than that. This index might be helpful on a later
   536  					// loop though, so don't remove it.
   537  					break
   538  				}
   539  				numHits := 0
   540  				if idx.coll != nil {
   541  					numHits = idx.numEqHits(constraints)
   542  				}
   543  				if numHits > bestNumEqHits {
   544  					bestNumEqHits = numHits
   545  					bestIdx = idx
   546  				} else if numHits == 0 {
   547  					// This index will never become useful again, so remove it.
   548  					relevantIdxs = append(relevantIdxs[:i], relevantIdxs[i+1:]...)
   549  				}
   550  			}
   551  		}
   552  		if bestIdx == nil {
   553  			// something is really wrong here... if relevantIdxs is !nil, then we
   554  			// should always be able to make progress in this loop.
   555  			impossible(fmt.Errorf("deadlock: cannot fulfil query?"))
   556  		}
   557  		ret = append(ret, generate(q, bestIdx, constraints))
   558  	}
   559  
   560  	return ret, nil
   561  }