go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/eventbox/box.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package eventbox
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strconv"
    21  	"strings"
    22  	"time"
    23  
    24  	"github.com/google/uuid"
    25  	"go.opentelemetry.io/otel/attribute"
    26  	"golang.org/x/sync/errgroup"
    27  
    28  	"go.chromium.org/luci/common/clock"
    29  	"go.chromium.org/luci/common/errors"
    30  	"go.chromium.org/luci/common/logging"
    31  	"go.chromium.org/luci/common/retry/transient"
    32  	"go.chromium.org/luci/gae/service/datastore"
    33  
    34  	"go.chromium.org/luci/cv/internal/common"
    35  	"go.chromium.org/luci/cv/internal/common/eventbox/dsset"
    36  	"go.chromium.org/luci/cv/internal/tracing"
    37  )
    38  
// Recipient is the recipient of the events.
//
// It identifies both where events are stored (Key is used as the parent of
// the underlying dsset.Set) and how the recipient is labelled in metrics.
type Recipient struct {
	// Key is the Datastore key of the recipient.
	//
	// The corresponding entity doesn't have to exist.
	Key *datastore.Key
	// MonitoringString is the value for the metric field "recipient".
	//
	// There should be very few distinct values.
	MonitoringString string
}
    50  
    51  // Emit emits a new event with provided value and auto-generated unique ID.
    52  func Emit(ctx context.Context, value []byte, to Recipient) error {
    53  	// TombstonesDelay doesn't matter for Add.
    54  	d := dsset.Set{Parent: to.Key}
    55  	// Keep IDs well distributed, but record creation time in it.
    56  	// See also oldestEventAge().
    57  	id := fmt.Sprintf("%s/%d", uuid.New().String(), clock.Now(ctx).UnixNano())
    58  	if err := d.Add(ctx, []dsset.Item{{ID: id, Value: value}}); err != nil {
    59  		return errors.Annotate(err, "failed to send event").Err()
    60  	}
    61  	metricSent.Add(ctx, 1, to.MonitoringString)
    62  	return nil
    63  }
    64  
// TombstonesDelay is exposed to mitigate frequent errors in CV e2e tests when
// tasks are run in parallel with fake clock.
//
// Its value is passed as dsset.Set.TombstonesDelay whenever events are listed
// or processed.
var TombstonesDelay = 5 * time.Minute
    68  
    69  // List returns unprocessed events. For use in tests only.
    70  func List(ctx context.Context, r Recipient) (Events, error) {
    71  	d := dsset.Set{
    72  		Parent:          r.Key,
    73  		TombstonesDelay: TombstonesDelay,
    74  	}
    75  	const effectivelyUnlimited = 1000000
    76  	switch l, err := d.List(ctx, effectivelyUnlimited); {
    77  	case err != nil:
    78  		return nil, err
    79  	case len(l.Items) == effectivelyUnlimited:
    80  		panic(fmt.Errorf("fetched possibly not all events (limit: %d)", effectivelyUnlimited))
    81  	default:
    82  		return toEvents(l.Items), nil
    83  	}
    84  }
    85  
    86  // ProcessBatch reliably processes outstanding events, while transactionally modifying state
    87  // and performing arbitrary side effects.
    88  //
    89  // Returns:
    90  //   - a slice of non-nil post process functions which SHOULD be executed
    91  //     immediately after calling this function. Those are generally extra work
    92  //     that needs to be done as the result of state modification.
    93  //   - error while processing events. Tags the error with common.DSContentionTag
    94  //     if entity's EVersion has changed or there is contention on Datastore
    95  //     entities involved in a transaction.
    96  func ProcessBatch(ctx context.Context, r Recipient, p Processor, maxEvents int) (_ []PostProcessFn, err error) {
    97  	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/eventbox/ProcessBatch",
    98  		attribute.String("recipient", r.MonitoringString),
    99  	)
   100  	defer func() { tracing.End(span, err) }()
   101  	postProcessFn, err := processBatch(ctx, r, p, maxEvents)
   102  	if common.IsDatastoreContention(err) {
   103  		err = common.DSContentionTag.Apply(err)
   104  	}
   105  	return postProcessFn, err
   106  }
   107  
// processBatch does the actual work of ProcessBatch, without tracing and
// without the final tagging of Datastore contention errors.
//
// It proceeds in three phases:
//  1. concurrently loads the state via p.LoadState and lists events (passing
//     maxEvents as the listing limit), cleaning up the garbage reported by the
//     listing;
//  2. outside of a transaction, asks p.PrepareMutation for transitions,
//     deletes the semantic garbage it reports and drops noop transitions;
//  3. inside a Datastore transaction, verifies that EVersion is still the one
//     loaded in phase 1, applies the transitions, saves the state if it has
//     changed and pops the consumed events.
//
// Returns the PostProcessFns of the applied transitions, or (nil, nil) if
// there are no actionable transitions.
func processBatch(ctx context.Context, r Recipient, p Processor, maxEvents int) ([]PostProcessFn, error) {
	// Phase 1: load the state and list the events concurrently.
	var state State
	var expectedEV EVersion
	eg, ectx := errgroup.WithContext(ctx)
	eg.Go(func() (err error) {
		state, expectedEV, err = p.LoadState(ectx)
		return
	})
	d := dsset.Set{
		Parent:          r.Key,
		TombstonesDelay: TombstonesDelay,
	}
	var listing *dsset.Listing
	eg.Go(func() (err error) {
		listing, err = listAndCleanup(ectx, r, &d, maxEvents)
		return
	})
	if err := eg.Wait(); err != nil {
		return nil, err
	}

	// Phase 2: compute resulting state before transaction.
	transitions, garbage, err := p.PrepareMutation(ctx, toEvents(listing.Items), state)
	// The garbage is deleted before checking err: per Processor.PrepareMutation
	// doc, it must be deleted even if PrepareMutation returns an error.
	if gErr := deleteSemanticGarbage(ctx, r, &d, garbage); gErr != nil {
		return nil, gErr
	}
	if err != nil {
		return nil, err
	}
	transitions = withoutNoops(transitions, state)
	if len(transitions) == 0 {
		return nil, nil // nothing to do.
	}

	// Phase 3: apply the transitions transactionally.
	//
	// innerErr holds the error returned by the most recent invocation of the
	// transaction callback, so that it can be returned to the caller as is.
	var innerErr error
	var postProcessFns []PostProcessFn
	// eventsRemoved counts the events of the applied transitions; it is
	// reported to metricRemoved only after a successful commit.
	var eventsRemoved int
	err = datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) {
		defer func() { innerErr = err }()
		// Reset the accumulators, since this func can be retried.
		postProcessFns = nil
		eventsRemoved = 0

		// Abort if the recipient's EVersion differs from the one loaded in
		// phase 1. The error is tagged as transient and as Datastore contention.
		switch latestEV, err := p.FetchEVersion(ctx); {
		case err != nil:
			return err
		case latestEV != expectedEV:
			return errors.Reason(
				"Datastore contention: EVersion read %d, but expected %d", latestEV, expectedEV,
			).Tag(transient.Tag).Tag(common.DSContentionTag).Err()
		}

		popOp, err := d.BeginPop(ctx, listing)
		if err != nil {
			return errors.Annotate(err, "failed to BeginPop").Err()
		}

		// Apply transitions in order. The state of the last transition is the
		// resulting state; transitions is non-empty here, so newState is
		// always assigned.
		var newState State
		for _, t := range transitions {
			if err := t.apply(ctx, popOp); err != nil {
				return err
			}
			newState = t.TransitionTo
			if t.PostProcessFn != nil {
				postProcessFns = append(postProcessFns, t.PostProcessFn)
			}
			eventsRemoved += len(t.Events)
		}

		// Save only if the resulting state differs from the loaded one; the
		// saved EVersion is the loaded one incremented by 1.
		if newState != state {
			if err := p.SaveState(ctx, newState, expectedEV+1); err != nil {
				return err
			}
		}
		return dsset.FinishPop(ctx, popOp)
	}, nil)

	switch {
	case innerErr != nil:
		// The callback itself failed: return its error without extra annotation.
		return nil, innerErr
	case err != nil:
		// The callback succeeded, but RunInTransaction still failed.
		return nil, errors.Annotate(err, "failed to commit mutation").Tag(transient.Tag).Err()
	default:
		metricRemoved.Add(ctx, int64(eventsRemoved), r.MonitoringString)
		return postProcessFns, nil
	}
}
   195  
// Processor defines safe way to process events in a batch.
type Processor interface {
	// LoadState is called to load the state before a transaction.
	//
	// It also returns the EVersion associated with the loaded state.
	LoadState(context.Context) (State, EVersion, error)
	// PrepareMutation is called before a transaction to compute transitions based
	// on a batch of events.
	//
	// The events in a batch are an arbitrary subset of all outstanding events.
	// Because loading of events isn't synchronized with event senders,
	// a recipient of events may see them in different order than the origination
	// order, even if events were produced by a single sender.
	//
	// All actions that must be done atomically with updating state must be
	// encapsulated inside Transition.SideEffectFn callback.
	//
	// Garbage events will be deleted non-transactionally before executing
	// transactional transitions. These events may still be processed by a
	// concurrent invocation of a Processor. The garbage events slice may re-use
	// the given events slice. The garbage will be deleted even if PrepareMutation returns
	// non-nil error.
	//
	// For correctness, two concurrent invocations of a Processor must choose the
	// same events to be deleted as garbage. Consider scenario of 2 events A and B
	// deemed semantically the same and 2 concurrent Processor invocations:
	//   P1: let me delete A and hope to transactionally process B.
	//   P2:  ............ B and ............................... A.
	// Then, it's a real possibility that A and B are both deleted AND neither
	// P1 nor P2 commits a transaction, thus forever forgetting about A and B.
	PrepareMutation(context.Context, Events, State) (transitions []Transition, garbage Events, err error)
	// FetchEVersion is called at the beginning of a transaction.
	//
	// The returned EVersion is compared against the one associated with a state
	// loaded via LoadState. If different, the transaction is aborted and new state
	// isn't saved.
	FetchEVersion(ctx context.Context) (EVersion, error)
	// SaveState is called in a transaction to save the state if it has changed.
	//
	// The passed eversion is incremented value of eversion of what LoadState
	// returned before.
	SaveState(context.Context, State, EVersion) error
}
   237  
// Event is an incoming event.
//
// It is declared with dsset.Item as its underlying type, so the two convert
// to each other directly (see toEvents).
type Event dsset.Item

// Events are incoming events.
type Events []Event
   243  
   244  // toEvents is an annoying redundant malloc to avoid exposing dsset.Item :(
   245  func toEvents(items []dsset.Item) Events {
   246  	es := make(Events, len(items))
   247  	for i, item := range items {
   248  		es[i] = Event(item)
   249  	}
   250  	return es
   251  }
   252  
   253  func listAndCleanup(ctx context.Context, r Recipient, d *dsset.Set, maxEvents int) (*dsset.Listing, error) {
   254  	tStart := clock.Now(ctx)
   255  	listing, err := d.List(ctx, maxEvents)
   256  	metricListDurationsS.Add(ctx, float64(clock.Since(ctx, tStart).Milliseconds()), r.MonitoringString, monitoringResult(err))
   257  	if err != nil {
   258  		return nil, err
   259  	}
   260  	metricSize.Set(ctx, int64(len(listing.Items)), r.MonitoringString)
   261  	metricOldestAgeS.Set(ctx, oldestEventAge(ctx, listing.Items).Seconds(), r.MonitoringString)
   262  
   263  	if err := dsset.CleanupGarbage(ctx, listing.Garbage); err != nil {
   264  		return nil, err
   265  	}
   266  	metricRemoved.Add(ctx, int64(len(listing.Garbage)), r.MonitoringString)
   267  	return listing, nil
   268  }
   269  
   270  func oldestEventAge(ctx context.Context, items []dsset.Item) time.Duration {
   271  	var oldest time.Time
   272  	for _, item := range items {
   273  		// NOTE: there can be some events with old IDs, which didn't record
   274  		// timestamps.
   275  		if parts := strings.SplitN(item.ID, "/", 2); len(parts) == 2 {
   276  			if unixNano, err := strconv.ParseInt(parts[1], 10, 64); err == nil {
   277  				if t := time.Unix(0, unixNano); oldest.IsZero() || oldest.After(t) {
   278  					oldest = t
   279  				}
   280  			}
   281  		}
   282  	}
   283  	if oldest.IsZero() {
   284  		return 0
   285  	}
   286  	age := clock.Since(ctx, oldest)
   287  	if age < 0 {
   288  		// Clocks aren't perfectly synchronized, so round age up to 0.
   289  		age = 0
   290  	}
   291  	return age
   292  }
   293  
   294  func deleteSemanticGarbage(ctx context.Context, r Recipient, d *dsset.Set, events Events) error {
   295  	l := len(events)
   296  	if l == 0 {
   297  		return nil
   298  	}
   299  	logging.Debugf(ctx, "eventbox deleting %d semantic garbage events before transaction", l)
   300  	i := -1
   301  	err := d.Delete(ctx, func() string {
   302  		i++
   303  		if i < l {
   304  			return events[i].ID
   305  		}
   306  		return ""
   307  	})
   308  	if err != nil {
   309  		return errors.Annotate(err, "failed to delete %d semantic garbage events before transaction", l).Err()
   310  	}
   311  	metricRemoved.Add(ctx, int64(l), r.MonitoringString)
   312  	return nil
   313  }
   314  
// State is an arbitrary object.
//
// Use a pointer to an actual state.
//
// States are compared with == and != to detect changes (see processBatch and
// Transition.isNoop).
type State any

// EVersion is recipient entity version.
//
// It is loaded together with the state, re-checked at the beginning of the
// transaction, and incremented by 1 when a changed state is saved.
type EVersion int64

// PostProcessFn should be executed after event processing completes.
//
// See Transition.PostProcessFn doc.
type PostProcessFn func(context.Context) error
   325  
   326  // SideEffectFn performs side effects with a Datastore transaction context.
   327  // See Transition.SideEffectFn doc.
   328  type SideEffectFn func(context.Context) error
   329  
   330  // Chain combines several SideEffectFn.
   331  //
   332  // NOTE: modifies incoming ... slice.
   333  func Chain(fs ...SideEffectFn) SideEffectFn {
   334  	nonNil := fs[:0]
   335  	for _, f := range fs {
   336  		if f != nil {
   337  			nonNil = append(nonNil, f)
   338  		}
   339  	}
   340  	if len(nonNil) == 0 {
   341  		return nil
   342  	}
   343  	return func(ctx context.Context) error {
   344  		for _, f := range nonNil {
   345  			if err := f(ctx); err != nil {
   346  				return err
   347  			}
   348  		}
   349  		return nil
   350  	}
   351  }
   352  
// Transition is a state transition.
//
// A Transition with no SideEffectFn, no Events, no PostProcessFn and with
// TransitionTo equal to the current state is a noop and is skipped entirely.
type Transition struct {
	// SideEffectFn is called in a transaction to execute any side effects of a
	// state transition atomically with the state change.
	//
	// Typical use is notifying other CV components via TQ tasks.
	// Can be nil, meaning there are no side effects to execute.
	//
	// TODO(tandrii): introduce error tag to indicate that failure was clean and
	// should be treated as if Transition wasn't started, s.t. progress of all
	// transitions before can be saved.
	SideEffectFn SideEffectFn
	// Events to consume with this transition.
	Events Events
	// TransitionTo is a state to transition to.
	//
	// It's allowed to transition to the exact same state.
	TransitionTo State
	// PostProcessFn is the function to be called by the eventbox user after
	// event processing completes.
	//
	// Note that it will be called outside of the transaction of all state
	// transitions, so the operation inside this function is not expected
	// to be atomic with this state transition.
	PostProcessFn PostProcessFn
}
   379  
   380  func (t *Transition) apply(ctx context.Context, p *dsset.PopOp) error {
   381  	if t.SideEffectFn != nil {
   382  		if err := t.SideEffectFn(ctx); err != nil {
   383  			return err
   384  		}
   385  	}
   386  	for _, e := range t.Events {
   387  		_ = p.Pop(e.ID) // Silently ignore if event has already been consumed.
   388  	}
   389  	return nil
   390  }
   391  
   392  // isNoop returns true if the Transition can be skipped entirely.
   393  func (t *Transition) isNoop(oldState State) bool {
   394  	return t.SideEffectFn == nil && len(t.Events) == 0 && t.TransitionTo == oldState && t.PostProcessFn == nil
   395  }
   396  
   397  // withoutNoops returns only actionable transitions in the original order.
   398  //
   399  // Modifies incoming slice.
   400  func withoutNoops(all []Transition, s State) []Transition {
   401  	ret := all[:0]
   402  	for _, t := range all {
   403  		if t.isNoop(s) {
   404  			continue
   405  		}
   406  		ret = append(ret, t)
   407  		s = t.TransitionTo
   408  	}
   409  	return ret
   410  }