go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/gae/service/datastore/queryiterator.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package datastore
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"sort"
    21  
    22  	"golang.org/x/sync/errgroup"
    23  
    24  	"go.chromium.org/luci/common/data/cmpbin"
    25  )
    26  
    27  // queryIterator is an iterator for datastore query results.
    28  //
    29  // DANGER: This implementation assumes instances of CursorCB carry complete
    30  // snapshots of the query iteration state, e.g. "remembering" a cursor callback
    31  // at some iteration cycle `X`, and calling it a later cycle `Y` will produce
    32  // a cursor for the cycle `X`. This property is not guaranteed by RawInterface
    33  // API and it (accidentally) only holds for `impl/memory` implementation using
    34  // only in unit tests.
    35  //
    36  // In all production implementations cursor callbacks just hold a pointer to
    37  // the internal datastore iterator: calling such callback always returns
    38  // the **current position** of the internal iterator (in the above example, it
    39  // will produce the position representing `Y`, not `X`). As a consequence,
    40  // RunMutli skips entities when resuming from a cursor.
    41  //
    42  // Additionally, RawInterface doesn't guarantee it is safe to call the cursor
    43  // callback from another goroutine (which queryIterator does). This results in
    44  // data races when using cursors with production implementations. Again,
    45  // accidentally, `impl/memory` works "fine", so this problem is obscured in
    46  // unit tests.
    47  //
    48  // One potential fix is to call the cursor callback for every fetched entity and
    49  // pass around real Cursor objects (that are complete "snapshots") instead of
    50  // CursorCB functions (that are just pointers to an internal non-thread safe
    51  // iterator with the most recent cursor). The problem is that CursorCB is a
    52  // potentially slow operations that can make RPCs (and it does in `impl/prod`
    53  // implementation). That's the reason it is a callback (to be called lazily only
    54  // when needed). It is not supposed to be used on every loop cycle. Using it
    55  // this way may severely degrade performance of the query.
    56  type queryIterator struct {
    57  	query                 *Query
    58  	order                 []IndexColumn
    59  	currentQueryResult    *rawQueryResult
    60  	itemCh                chan *rawQueryResult
    61  	done                  bool
    62  	currentItemOrderCache string   // lazy loading (loaded when `CurrentItemOrder()` is called).
    63  	cursorCB              CursorCB // for the *current* item
    64  }
    65  
    66  // startQueryIterator starts to run the given query and return the iterator for
    67  // query results.
    68  func startQueryIterator(ctx context.Context, eg *errgroup.Group, fq *FinalizedQuery) *queryIterator {
    69  	qi := &queryIterator{
    70  		query:  fq.Original(),
    71  		order:  fq.Orders(),
    72  		itemCh: make(chan *rawQueryResult),
    73  		// This will be used as CurrentCursor after the first Next() call. To get
    74  		// the first query result, we just need to restart the query from its
    75  		// initial starting cursor.
    76  		cursorCB: func() (Cursor, error) {
    77  			start, _ := fq.Bounds()
    78  			return start, nil
    79  		},
    80  	}
    81  
    82  	eg.Go(func() (err error) {
    83  		defer func() { qi.itemCh <- &rawQueryResult{err: err} }()
    84  		return Raw(ctx).Run(fq, func(k *Key, pm PropertyMap, cursorCB CursorCB) error {
    85  			if k == nil { // we use `key == nil` as an indicator of the last message
    86  				panic("impossible per Run contract")
    87  			}
    88  			// Do not even attempt to write to `qi.itemCh` if the context is already
    89  			// done. Note that if multiple cases of select {...} are ready at the same
    90  			// time, Go chooses one randomly to proceed. We don't want that if the
    91  			// context is already done.
    92  			if ctx.Err() != nil {
    93  				return ctx.Err()
    94  			}
    95  			select {
    96  			case <-ctx.Done():
    97  				return ctx.Err()
    98  			case qi.itemCh <- &rawQueryResult{
    99  				key:      k,
   100  				data:     pm,
   101  				cursorCB: cursorCB,
   102  			}:
   103  				return nil
   104  			}
   105  		})
   106  	})
   107  
   108  	return qi
   109  }
   110  
   111  // Query is the original query this iterator was started with.
   112  func (qi *queryIterator) Query() *Query {
   113  	return qi.query
   114  }
   115  
   116  // CurrentItem returns the current query result.
   117  //
   118  // Returns nil key if the iterator has reached its end.
   119  func (qi *queryIterator) CurrentItem() (*Key, PropertyMap) {
   120  	if qi.currentQueryResult == nil {
   121  		return nil, PropertyMap{}
   122  	}
   123  	return qi.currentQueryResult.key, qi.currentQueryResult.data
   124  }
   125  
   126  // CurrentItemKey returns a serialized current item key.
   127  //
   128  // Returns "" if the iterator has reached its end.
   129  func (qi *queryIterator) CurrentItemKey() string {
   130  	if qi.currentQueryResult == nil || qi.currentQueryResult.key == nil {
   131  		return ""
   132  	}
   133  	return string(Serialize.ToBytes(qi.currentQueryResult.key))
   134  }
   135  
   136  // CurrentCursor returns a cursor pointing to the current item (if any).
   137  //
   138  // The defining property of this cursor is that if a queryIterator is recreated
   139  // with it, its first Next() call will return the current item again (if any).
   140  // This is useful for repopulating the heap when restarting the query from
   141  // a cursor.
   142  //
   143  // Note that if the iterator is exhausted already, i.e. Next() returns
   144  // done == true, CurrentCursor() still returns some non-nil cursor. This cursor
   145  // points to a position right after the last fetched item. When resuming from
   146  // such cursor, we'll either immediately discover the iterator is still
   147  // exhausted, or (if the datastore state changed between calls), we'll discover
   148  // new items that can be fetched now.
   149  //
   150  // Returns nil only if the query produced no results whatsoever and this query
   151  // didn't have a cursor set. In that case we'll need to restart the query from
   152  // scratch when restarting the iteration and this is precisely what `nil` cursor
   153  // does.
   154  func (qi *queryIterator) CurrentCursor() (Cursor, error) {
   155  	return qi.cursorCB()
   156  }
   157  
   158  // CurrentItemOrder returns a serialized representation of properties used for
   159  // ordering the results.
   160  //
   161  // Such strings are directly comparable to one another.
   162  func (qi *queryIterator) CurrentItemOrder() string {
   163  	if qi.currentItemOrderCache != "" {
   164  		return qi.currentItemOrderCache
   165  	}
   166  
   167  	if qi.currentQueryResult == nil {
   168  		return ""
   169  	}
   170  
   171  	invBuf := cmpbin.Invertible(&bytes.Buffer{})
   172  	for _, column := range qi.order {
   173  		invBuf.SetInvert(column.Descending)
   174  		if column.Property == "__key__" {
   175  			panicIf(Serialize.Key(invBuf, qi.currentQueryResult.key))
   176  			continue
   177  		}
   178  		columnData := qi.currentQueryResult.data[column.Property].Slice()
   179  		sort.Sort(columnData)
   180  		if column.Descending {
   181  			panicIf(Serialize.Property(invBuf, columnData[columnData.Len()-1]))
   182  		} else {
   183  			panicIf(Serialize.Property(invBuf, columnData[0]))
   184  		}
   185  	}
   186  	qi.currentItemOrderCache = invBuf.String()
   187  	return qi.currentItemOrderCache
   188  }
   189  
   190  // Next iterates to the next item and makes it current.
   191  //
   192  // Note: call Next() before calling any CurrentItemXXX functions to get the
   193  // right results.
   194  //
   195  // If the iterator has finished running returns done == true and an error
   196  // (if the iterator finished due to an error). The error may be a context error
   197  // if the root context was canceled or has expired.
   198  func (qi *queryIterator) Next() (done bool, err error) {
   199  	if qi.itemCh == nil {
   200  		panic("item channel for queryIterator is not properly initiated")
   201  	}
   202  	if !qi.done {
   203  		// Let's assume currentQueryResult index among the full list of query
   204  		// results is `T`. It means `currentQueryResult.cursorCB` is pointing to
   205  		// `T+1`. Also `<-qi.itemCh` will return the next result, i.e. `T+1` as
   206  		// well. This new `T+1` result will become the CurrentItem(). We need to
   207  		// make CurrentCursor() return `T+1` cursor as well. And this is precisely
   208  		// what `currentQueryResult.cursorCB` does.
   209  		//
   210  		// The nil check is for the very first Next() call. We already populated
   211  		// cursorCB correctly for this situation in `startQueryIterator`.
   212  		//
   213  		// See also DANGER warning in queryIterator doc. This is not how cursors
   214  		// actually behave.
   215  		if qi.currentQueryResult != nil {
   216  			qi.cursorCB = qi.currentQueryResult.cursorCB
   217  		}
   218  		qi.currentQueryResult = <-qi.itemCh
   219  		qi.currentItemOrderCache = ""
   220  		qi.done = qi.currentQueryResult.key == nil
   221  	}
   222  	return qi.done, qi.currentQueryResult.err
   223  }
   224  
   225  // rawQueryResult captures the result from raw datastore query snapshot.
   226  type rawQueryResult struct {
   227  	key      *Key
   228  	data     PropertyMap
   229  	err      error
   230  	cursorCB CursorCB // points to the entry right after `key`
   231  }