go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/eventbox/dsset/dsset.go (about)

     1  // Copyright 2017 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package dsset implements a particular flavor of Datastore-on-Firestore backed
    16  // set.
    17  //
// Due to its internal structure, it requires some maintenance on behalf of the
// caller to periodically clean up removed items (aka tombstones).
    20  //
    21  // Items added to the set should have unique IDs, at least for the duration of
    22  // some configurable time interval, as defined by TombstonesDelay property.
    23  // It means removed items can't be added back to the set right away (the set
    24  // will think they are already there). This is required to make 'Add' operation
    25  // idempotent.
    26  //
    27  // TombstonesDelay is assumed to be much larger than time scale of all "fast"
    28  // processes in the system, in particular all List+Pop processes. For example,
    29  // if List+Pop is expected to take 1 min, TombstonesDelay should be >> 1 min
    30  // (e.g. 5 min). Setting TombstonesDelay to very large value is harmful though,
    31  // since it may slow down 'List' and 'Pop' (by allowing more garbage that will
    32  // have to be filtered out).
    33  //
    34  // Properties (where N is current size of the set):
    35  //   - Batch 'Add', O(1) performance.
    36  //   - Transactional consistent 'Pop' (1 QPS limit), O(N) performance.
    37  //   - Non-transactional consistent 'List', O(N) performance.
    38  //   - Popped items can't be re-added until their tombstones expire.
    39  //
    40  // These properties make dsset suitable for multiple producers, single consumer
    41  // queues, where order of items is not important, each item has a unique
    42  // identifier, and the queue size is small.
    43  //
    44  // Structurally dsset places 2 kinds of entities under provided Set's parent
    45  // entity:
    46  //   - items of the set.
    47  //   - tombstones, recording deleted items.
    48  //
    49  // This code is a fork of dsset for classic Datastore, which had to work around
    50  // 1 write per second per entity group limit using shards. See
    51  // go.chromium.org/luci/scheduler/appengine/engine/dsset.
    52  package dsset
    53  
    54  import (
    55  	"context"
    56  	"fmt"
    57  	"time"
    58  
    59  	"golang.org/x/sync/errgroup"
    60  
    61  	"go.chromium.org/luci/cv/internal/tracing"
    62  	"go.chromium.org/luci/gae/filter/txndefer"
    63  	"go.chromium.org/luci/gae/service/datastore"
    64  
    65  	"go.chromium.org/luci/common/clock"
    66  	"go.chromium.org/luci/common/data/stringset"
    67  	"go.chromium.org/luci/common/retry/transient"
    68  )
    69  
// Set holds a set of Items and uses tombstones to achieve idempotency of Add.
//
// Producers just call Add(...).
//
// The consumer must run a more elaborate algorithm that ensures atomicity of
// 'Pop' and takes care of cleaning up the garbage. This requires a mix of
// transactional and non-transactional actions:
//
//	listing, err := set.List(ctx, maxEvents)
//	if err != nil {
//	  return err
//	}
//
//	if err := dsset.CleanupGarbage(ctx, listing.Garbage); err != nil {
//	  return err
//	}
//
//	... Fetch any additional info associated with 'listing.Items' ...
//
//	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
//	  op, err := set.BeginPop(ctx, listing)
//	  if err != nil {
//	    return err
//	  }
//	  for _, item := range listing.Items {
//	    if op.Pop(item.ID) {
//	      // The item was indeed in the set and we've just removed it!
//	    } else {
//	      // Some other transaction has popped it already.
//	    }
//	  }
//	  return dsset.FinishPop(ctx, op)
//	}, nil)
//	return err
type Set struct {
	// Parent is the key of the datastore entity owning the set.
	//
	// Set's Datastore entities (items and the tombstones entity) are stored
	// with this key as their parent.
	Parent *datastore.Key
	// TombstonesDelay is how long to keep tombstones in the set.
	//
	// 'List' marks tombstones older than this as removable, and 'BeginPop'
	// rejects listings older than this as stale.
	TombstonesDelay time.Duration
}
   112  
// Item is what's stored in the set.
//
// Producers pass Items to 'Add'; the consumer receives them back in
// Listing.Items.
type Item struct {
	ID    string // unique in time identifier of the item
	Value []byte // arbitrary value (<1 MB, but preferably much smaller)
}
   118  
// Garbage is a list of tombstones to cleanup.
//
// It is produced by 'List' (as Listing.Garbage) and consumed by
// 'CleanupGarbage', which deletes the item entities the tombstones refer to.
type Garbage []*tombstone
   121  
// Listing is returned by 'List' call.
//
// It contains actual listing of items in the set, as well as a bunch of service
// information used by other operations ('CleanupGarbage' and 'Pop') to keep
// the set in a garbage-free and consistent state.
//
// The only way to construct a correct Listing is to call 'List' method.
//
// See comments for Set struct and List method for more info.
type Listing struct {
	Items   []Item  // all items in the set, in arbitrary order
	Garbage Garbage // tombstones that can be cleaned up now

	parent     *datastore.Key            // set's parent; BeginPop checks it against Set.Parent
	producedAt time.Time                 // when 'List' call was initiated; BeginPop uses it to reject stale listings
	idToKey    map[string]*datastore.Key // ID -> datastore key of every fetched item entity (tombstoned ones included)
}
   139  
// tombstone is a reference to a deleted item that still lingers in the set.
//
// Tombstones exist to make sure recently popped items do not reappear in the
// set if producers attempt to re-add them.
type tombstone struct {
	id        string         // deleted item ID
	storage   *datastore.Key // itemEntity to delete in 'CleanupGarbage'; nil if there is nothing to delete
	old       bool           // true if 'BeginPop' should erase this tombstone from the tombstones entity
	cleanedUp bool           // true if 'CleanupGarbage' processed the tombstone
}
   150  
   151  // Add idempotently adds a bunch of items to the set.
   152  //
   153  // If items with given keys are already in the set, or have been deleted
   154  // recently, they won't be re-added. No error is returned in this case. When
   155  // retrying the call like that, the caller is responsible to pass exact same
   156  // Item.Value, otherwise 'List' may return random variant of the added item.
   157  //
   158  // If called outside of a transaction and the call fails, may add only some
   159  // subset of items. Running inside a transaction makes this operation atomic.
   160  //
   161  // Returns only transient errors.
   162  func (s *Set) Add(c context.Context, items []Item) error {
   163  	// If added items have been popped already (they have tombstones), 'List' will
   164  	// omit them as well.
   165  	entities := make([]itemEntity, len(items))
   166  	for i, itm := range items {
   167  		entities[i] = itemEntity{
   168  			ID:     itm.ID,
   169  			Parent: s.Parent,
   170  			Value:  itm.Value,
   171  		}
   172  	}
   173  	return transient.Tag.Apply(datastore.Put(c, entities))
   174  }
   175  
   176  // List returns all items that are currently in the set (in arbitrary order),
   177  // as well as a set of tombstones that points to items that were previously
   178  // popped and can be cleaned up now.
   179  //
   180  // Must be called outside of transactions (panics otherwise).
   181  //
   182  // The set of tombstones to cleanup should be passed to 'CleanupGarbage', and
   183  // later to 'BeginPop' (via Listing), in that order. Not doing so will lead to
   184  // accumulation of a garbage in the set that will slow down 'List' and 'Pop'.
   185  //
   186  // Returns only transient errors.
   187  func (s *Set) List(ctx context.Context, maxEvents int) (l *Listing, err error) {
   188  	switch {
   189  	case datastore.CurrentTransaction(ctx) != nil:
   190  		panic(fmt.Errorf("dsset.Set.List must be called outside of a transaction"))
   191  	case maxEvents <= 0:
   192  		panic(fmt.Errorf("maxEvents must be >0, but %d given", maxEvents))
   193  	}
   194  	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/eventbox/dsset/List")
   195  	defer func() { tracing.End(span, err) }()
   196  
   197  	now := clock.Now(ctx).UTC()
   198  
   199  	// Fetch all items and all tombstones.
   200  	tombsEntity := tombstonesEntity{Parent: s.Parent}
   201  
   202  	eg, ctx := errgroup.WithContext(ctx)
   203  	eg.Go(func() error {
   204  		err := datastore.Get(ctx, &tombsEntity)
   205  		if err != nil && err != datastore.ErrNoSuchEntity {
   206  			return err
   207  		}
   208  		return nil
   209  	})
   210  
   211  	var entities []*itemEntity
   212  	eg.Go(func() error {
   213  		q := datastore.NewQuery("dsset.Item").Ancestor(s.Parent).Limit(int32(maxEvents))
   214  		return datastore.GetAll(ctx, q, &entities)
   215  	})
   216  	if err := eg.Wait(); err != nil {
   217  		return nil, transient.Tag.Apply(err)
   218  	}
   219  
   220  	// Mapping "item ID" => "entity to delete to remove it". This is eventually
   221  	// used by 'CleanupGarbage'.
   222  	idToKey := map[string]*datastore.Key{}
   223  	for _, e := range entities {
   224  		idToKey[e.ID] = datastore.KeyForObj(ctx, e)
   225  	}
   226  
   227  	// A set of items we pretend not to see. Initially all tombstoned ones.
   228  	//
   229  	// Since we are iterating over tombstone list anyway, find all sufficiently
   230  	// old tombstones or tombstones that still have storage associated with them.
   231  	// We return them to the caller, so they can be cleaned up:
   232  	//   * 'CleanupGarbage' makes sure 'storage' entities are deleted.
   233  	//   * 'BeginPop' completely erases old tombstones.
   234  	var tombs Garbage
   235  	ignore := stringset.New(len(tombsEntity.Tombstones))
   236  	for _, t := range tombsEntity.Tombstones {
   237  		ignore.Add(t.ID)
   238  		old := now.Sub(t.Tombstoned) > s.TombstonesDelay
   239  		if storage, ok := idToKey[t.ID]; ok || old {
   240  			tombs = append(tombs, &tombstone{
   241  				id:      t.ID,
   242  				storage: storage,
   243  				old:     old, // if true, BeginPop will delete this tombstone
   244  			})
   245  		}
   246  	}
   247  
   248  	// Throw away tombstoned items.
   249  	var items []Item
   250  	for _, e := range entities {
   251  		if !ignore.Has(e.ID) {
   252  			items = append(items, Item{
   253  				ID:    e.ID,
   254  				Value: e.Value,
   255  			})
   256  			ignore.Add(e.ID)
   257  		}
   258  	}
   259  
   260  	return &Listing{
   261  		Items:      items,
   262  		Garbage:    tombs,
   263  		parent:     s.Parent,
   264  		producedAt: now,
   265  		idToKey:    idToKey,
   266  	}, nil
   267  }
   268  
   269  // Delete deletes items from the set non-transactionally.
   270  //
   271  // Use at your own risk. If in doubt, use expected BeginPop() instead.
   272  //
   273  // Calls nextID() to get next ID to delete until nextID() returns "".
   274  func (s *Set) Delete(ctx context.Context, nextID func() string) (err error) {
   275  	if datastore.CurrentTransaction(ctx) != nil {
   276  		panic("dsset.Set.Delete must be called outside of a transaction")
   277  	}
   278  	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/eventbox/dsset/Delete")
   279  	defer func() { tracing.End(span, err) }()
   280  
   281  	keys := []*datastore.Key{}
   282  	for {
   283  		id := nextID()
   284  		if id == "" {
   285  			break
   286  		}
   287  		keys = append(keys, datastore.NewKey(ctx, "dsset.Item", id, 0, s.Parent))
   288  	}
   289  	return transient.Tag.Apply(datastore.Delete(ctx, keys))
   290  }
   291  
// PopOp is an in-progress 'Pop' operation.
//
// It is created by BeginPop, mutated by Pop and submitted to datastore by
// FinishPop.
//
// See BeginPop.
type PopOp struct {
	ctx      context.Context           // datastore context to use for this op (the one passed to BeginPop)
	txn      datastore.Transaction     // a transaction that started BeginPop; FinishPop verifies it runs in the same one
	now      time.Time                 // popping time for all popped items
	dirty    bool                      // true if the tombstone map was modified and thus needs to be stored
	finished bool                      // true if finished already
	entity   *tombstonesEntity         // entity with tombstones
	tombs    map[string]time.Time      // entity.Tombstones in a map form
	idToKey  map[string]*datastore.Key // ID -> datastore key to cleanup (taken from the Listing)
	popped   Garbage                   // new tombstones for popped items
}
   306  
   307  // BeginPop initiates 'Pop' operation.
   308  //
   309  // Pop operation is used to transactionally remove items from the set, as well
   310  // as cleanup old tombstones. It must be finished with 'dsset.FinishPop', even
   311  // if no items have been popped: the internal state still can change in this
   312  // case, since 'BeginPop' cleans up old tombstones. Even more, it is necessary
   313  // to do 'Pop' if listing contains non-empty set of tombstones (regardless of
   314  // whether the caller wants to actually pop any items from the set). This is
   315  // part of the required set maintenance.
   316  //
   317  // Requires a transaction. Modifies Tombstone entity.
   318  // Requires a txndefer to be installed in context. This is already done by
   319  // default in luci/server.
   320  //
   321  // Returns only transient errors. Such errors usually mean that the entire pop
   322  // sequence ('List' + 'Pop') should be retried.
   323  func (s *Set) BeginPop(c context.Context, listing *Listing) (*PopOp, error) {
   324  	if listing.parent != s.Parent {
   325  		panic("passed Listing from another set")
   326  	}
   327  	txn := datastore.CurrentTransaction(c)
   328  	if txn == nil {
   329  		panic("dsset.Set.BeginPop must be called inside a transaction")
   330  	}
   331  
   332  	now := clock.Now(c).UTC()
   333  	if age := now.Sub(listing.producedAt); age > s.TombstonesDelay {
   334  		return nil, transient.Tag.Apply(fmt.Errorf("the listing is stale (%s > %s)", age, s.TombstonesDelay))
   335  	}
   336  
   337  	entity := &tombstonesEntity{Parent: s.Parent}
   338  	if err := datastore.Get(c, entity); err != nil && err != datastore.ErrNoSuchEntity {
   339  		return nil, transient.Tag.Apply(err)
   340  	}
   341  
   342  	// The data in tombstonesEntity, in map form.
   343  	tombs := make(map[string]time.Time, len(entity.Tombstones))
   344  	for _, t := range entity.Tombstones {
   345  		tombs[t.ID] = t.Tombstoned
   346  	}
   347  
   348  	// Throw away old tombstones right away.
   349  	dirty := false
   350  	for _, tomb := range listing.Garbage {
   351  		if tomb.old {
   352  			if !tomb.cleanedUp {
   353  				panic("trying to remove Tombstone that wasn't cleaned up")
   354  			}
   355  			if _, hasTomb := tombs[tomb.id]; hasTomb {
   356  				delete(tombs, tomb.id)
   357  				dirty = true
   358  			}
   359  		}
   360  	}
   361  
   362  	return &PopOp{
   363  		ctx:     c,
   364  		txn:     txn,
   365  		now:     now,
   366  		dirty:   dirty,
   367  		entity:  entity,
   368  		tombs:   tombs,
   369  		idToKey: listing.idToKey,
   370  	}, nil
   371  }
   372  
   373  // CanPop returns true if the given item can be popped from the set.
   374  //
   375  // Returns false if this item has been popped before (perhaps in another
   376  // transaction), or it's not in the listing passed to BeginPop.
   377  func (p *PopOp) CanPop(id string) bool {
   378  	if _, hasTomb := p.tombs[id]; hasTomb {
   379  		return false // already popped by someone else
   380  	}
   381  	if _, present := p.idToKey[id]; present {
   382  		return true // listed in the set
   383  	}
   384  	return false
   385  }
   386  
   387  // Pop removed the item from the set and returns true if it was there.
   388  //
   389  // Returns false if this item has been popped before (perhaps in another
   390  // transaction), or it's not in the listing passed to BeginPop.
   391  func (p *PopOp) Pop(id string) bool {
   392  	if p.finished {
   393  		panic("the operation has already been finished")
   394  	}
   395  	if !p.CanPop(id) {
   396  		return false
   397  	}
   398  	p.tombs[id] = p.now
   399  	p.popped = append(p.popped, &tombstone{
   400  		id:      id,
   401  		storage: p.idToKey[id],
   402  	})
   403  	p.dirty = true
   404  	return true
   405  }
   406  
   407  // makeTombstonesEntity is used internally by FinishPop.
   408  func (p *PopOp) makeTombstonesEntity() *tombstonesEntity {
   409  	p.entity.Tombstones = p.entity.Tombstones[:0]
   410  	for id, ts := range p.tombs {
   411  		p.entity.Tombstones = append(p.entity.Tombstones, struct {
   412  			ID         string
   413  			Tombstoned time.Time
   414  		}{id, ts})
   415  	}
   416  	return p.entity
   417  }
   418  
   419  ////////////////////////////////////////////////////////////////////////////////
   420  
   421  // FinishPop completes one or more pop operations (for different sets) by
   422  // submitting changes to datastore.
   423  //
   424  // Must be called within the same transaction that called BeginPop.
   425  //
   426  // Returns only transient errors.
   427  func FinishPop(ctx context.Context, ops ...*PopOp) error {
   428  	txn := datastore.CurrentTransaction(ctx)
   429  
   430  	entities := []*tombstonesEntity{}
   431  	tombsCount := 0
   432  	for _, op := range ops {
   433  		if op.finished {
   434  			panic("the operation has already been finished")
   435  		}
   436  		if op.txn != txn {
   437  			panic("wrong transaction")
   438  		}
   439  		if op.dirty {
   440  			entities = append(entities, op.makeTombstonesEntity())
   441  			tombsCount += len(op.popped)
   442  		}
   443  	}
   444  
   445  	if err := datastore.Put(ctx, entities); err != nil {
   446  		return transient.Tag.Apply(err)
   447  	}
   448  
   449  	var tombs Garbage
   450  	if tombsCount != 0 {
   451  		tombs = make(Garbage, 0, tombsCount)
   452  	}
   453  	for _, op := range ops {
   454  		tombs = append(tombs, op.popped...)
   455  		op.finished = true
   456  	}
   457  	txndefer.Defer(ctx, func(ctx context.Context) {
   458  		CleanupGarbage(ctx, tombs) // best-effort cleanup
   459  	})
   460  	return nil
   461  }
   462  
   463  // CleanupGarbage deletes entities used to store items under given tombstones.
   464  //
   465  // This is datastore's MultiDelete RPC in disguise.
   466  // Must be called outside of transactions. Idempotent.
   467  //
   468  // Can handle tombstones from multiple different sets at once. This is preferred
   469  // over calling 'CleanupGarbage' multiple times (once per set), since it
   470  // collapses multiple datastore RPCs into one.
   471  //
   472  // This MUST be called before tombstones returned by 'List' are removed in
   473  // 'Pop'. Failure to do so will make items reappear in the set.
   474  //
   475  // Returns only transient errors. There's no way to know which items were
   476  // removed and which weren't in case of an error.
   477  func CleanupGarbage(ctx context.Context, cleanup ...Garbage) (err error) {
   478  	if datastore.CurrentTransaction(ctx) != nil {
   479  		panic("dsset.CleanupGarbage must be called outside of a transaction")
   480  	}
   481  	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/eventbox/dsset/CleanupGarbage")
   482  	defer func() { tracing.End(span, err) }()
   483  
   484  	keys := []*datastore.Key{}
   485  	for _, tombs := range cleanup {
   486  		for _, tomb := range tombs {
   487  			if tomb.storage != nil {
   488  				keys = append(keys, tomb.storage)
   489  			}
   490  		}
   491  	}
   492  
   493  	if err := datastore.Delete(ctx, keys); err != nil {
   494  		return transient.Tag.Apply(err)
   495  	}
   496  
   497  	for _, tombs := range cleanup {
   498  		for _, tomb := range tombs {
   499  			tomb.cleanedUp = true
   500  			tomb.storage = nil
   501  		}
   502  	}
   503  	return nil
   504  }
   505  
   506  ////////////////////////////////////////////////////////////////////////////////
   507  
// itemEntity is a datastore entity of kind "dsset.Item" storing a single item
// of the set.
//
// Its key is the item ID under the set's parent key.
type itemEntity struct {
	_kind string `gae:"$kind,dsset.Item"`

	ID     string         `gae:"$id"`      // Item.ID
	Parent *datastore.Key `gae:"$parent"`  // Set.Parent
	Value  []byte         `gae:",noindex"` // Item.Value
}
   515  
// tombstonesEntity is a datastore entity of kind "dsset.Tombstones" holding
// all tombstones of a set.
//
// It is read by 'List' and 'BeginPop' and written by 'FinishPop'.
type tombstonesEntity struct {
	_kind string `gae:"$kind,dsset.Tombstones"`

	ID     string         `gae:"$id,const"` // Always the same ID.
	Parent *datastore.Key `gae:"$parent"`   // Set.Parent

	// Tombstones is unordered list of pairs <item ID, when it was popped>.
	Tombstones []struct {
		ID         string
		Tombstoned time.Time
	} `gae:",noindex"`
}
   527  }