go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/common/eventbox/box.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package eventbox 16 17 import ( 18 "context" 19 "fmt" 20 "strconv" 21 "strings" 22 "time" 23 24 "github.com/google/uuid" 25 "go.opentelemetry.io/otel/attribute" 26 "golang.org/x/sync/errgroup" 27 28 "go.chromium.org/luci/common/clock" 29 "go.chromium.org/luci/common/errors" 30 "go.chromium.org/luci/common/logging" 31 "go.chromium.org/luci/common/retry/transient" 32 "go.chromium.org/luci/gae/service/datastore" 33 34 "go.chromium.org/luci/cv/internal/common" 35 "go.chromium.org/luci/cv/internal/common/eventbox/dsset" 36 "go.chromium.org/luci/cv/internal/tracing" 37 ) 38 39 // Recipient is the recipient of the events. 40 type Recipient struct { 41 // Key is the Datastore key of the recipient. 42 // 43 // The corresponding entity doesn't have to exist. 44 Key *datastore.Key 45 // MonitoringString is the value for the metric field "recipient". 46 // 47 // There should be very few distinct values. 48 MonitoringString string 49 } 50 51 // Emit emits a new event with provided value and auto-generated unique ID. 52 func Emit(ctx context.Context, value []byte, to Recipient) error { 53 // TombstonesDelay doesn't matter for Add. 54 d := dsset.Set{Parent: to.Key} 55 // Keep IDs well distributed, but record creation time in it. 56 // See also oldestEventAge(). 57 id := fmt.Sprintf("%s/%d", uuid.New().String(), clock.Now(ctx).UnixNano()) 58 if err := d.Add(ctx, []dsset.Item{{ID: id, Value: value}}); err != nil { 59 return errors.Annotate(err, "failed to send event").Err() 60 } 61 metricSent.Add(ctx, 1, to.MonitoringString) 62 return nil 63 } 64 65 // TombstonesDelay is exposed to mitigate frequent errors in CV e2e tests when 66 // tasks are run in parallel with fake clock. 67 var TombstonesDelay = 5 * time.Minute 68 69 // List returns unprocessed events. For use in tests only. 70 func List(ctx context.Context, r Recipient) (Events, error) { 71 d := dsset.Set{ 72 Parent: r.Key, 73 TombstonesDelay: TombstonesDelay, 74 } 75 const effectivelyUnlimited = 1000000 76 switch l, err := d.List(ctx, effectivelyUnlimited); { 77 case err != nil: 78 return nil, err 79 case len(l.Items) == effectivelyUnlimited: 80 panic(fmt.Errorf("fetched possibly not all events (limit: %d)", effectivelyUnlimited)) 81 default: 82 return toEvents(l.Items), nil 83 } 84 } 85 86 // ProcessBatch reliably processes outstanding events, while transactionally modifying state 87 // and performing arbitrary side effects. 88 // 89 // Returns: 90 // - a slice of non-nil post process functions which SHOULD be executed 91 // immediately after calling this function. Those are generally extra work 92 // that needs to be done as the result of state modification. 93 // - error while processing events. Tags the error with common.DSContentionTag 94 // if entity's EVersion has changed or there is contention on Datastore 95 // entities involved in a transaction. 96 func ProcessBatch(ctx context.Context, r Recipient, p Processor, maxEvents int) (_ []PostProcessFn, err error) { 97 ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/eventbox/ProcessBatch", 98 attribute.String("recipient", r.MonitoringString), 99 ) 100 defer func() { tracing.End(span, err) }() 101 postProcessFn, err := processBatch(ctx, r, p, maxEvents) 102 if common.IsDatastoreContention(err) { 103 err = common.DSContentionTag.Apply(err) 104 } 105 return postProcessFn, err 106 } 107 108 func processBatch(ctx context.Context, r Recipient, p Processor, maxEvents int) ([]PostProcessFn, error) { 109 var state State 110 var expectedEV EVersion 111 eg, ectx := errgroup.WithContext(ctx) 112 eg.Go(func() (err error) { 113 state, expectedEV, err = p.LoadState(ectx) 114 return 115 }) 116 d := dsset.Set{ 117 Parent: r.Key, 118 TombstonesDelay: TombstonesDelay, 119 } 120 var listing *dsset.Listing 121 eg.Go(func() (err error) { 122 listing, err = listAndCleanup(ectx, r, &d, maxEvents) 123 return 124 }) 125 if err := eg.Wait(); err != nil { 126 return nil, err 127 } 128 129 // Compute resulting state before transaction. 130 transitions, garbage, err := p.PrepareMutation(ctx, toEvents(listing.Items), state) 131 if gErr := deleteSemanticGarbage(ctx, r, &d, garbage); gErr != nil { 132 return nil, gErr 133 } 134 if err != nil { 135 return nil, err 136 } 137 transitions = withoutNoops(transitions, state) 138 if len(transitions) == 0 { 139 return nil, nil // nothing to do. 140 } 141 142 var innerErr error 143 var postProcessFns []PostProcessFn 144 var eventsRemoved int 145 err = datastore.RunInTransaction(ctx, func(ctx context.Context) (err error) { 146 defer func() { innerErr = err }() 147 // reset, since this func can be retried 148 postProcessFns = nil 149 eventsRemoved = 0 150 151 switch latestEV, err := p.FetchEVersion(ctx); { 152 case err != nil: 153 return err 154 case latestEV != expectedEV: 155 return errors.Reason( 156 "Datastore contention: EVersion read %d, but expected %d", latestEV, expectedEV, 157 ).Tag(transient.Tag).Tag(common.DSContentionTag).Err() 158 } 159 160 popOp, err := d.BeginPop(ctx, listing) 161 if err != nil { 162 return errors.Annotate(err, "failed to BeginPop").Err() 163 } 164 165 var newState State 166 for _, t := range transitions { 167 if err := t.apply(ctx, popOp); err != nil { 168 return err 169 } 170 newState = t.TransitionTo 171 if t.PostProcessFn != nil { 172 postProcessFns = append(postProcessFns, t.PostProcessFn) 173 } 174 eventsRemoved += len(t.Events) 175 } 176 177 if newState != state { 178 if err := p.SaveState(ctx, newState, expectedEV+1); err != nil { 179 return err 180 } 181 } 182 return dsset.FinishPop(ctx, popOp) 183 }, nil) 184 185 switch { 186 case innerErr != nil: 187 return nil, innerErr 188 case err != nil: 189 return nil, errors.Annotate(err, "failed to commit mutation").Tag(transient.Tag).Err() 190 default: 191 metricRemoved.Add(ctx, int64(eventsRemoved), r.MonitoringString) 192 return postProcessFns, nil 193 } 194 } 195 196 // Processor defines safe way to process events in a batch. 197 type Processor interface { 198 // LoadState is called to load the state before a transaction. 199 LoadState(context.Context) (State, EVersion, error) 200 // PrepareMutation is called before a transaction to compute transitions based 201 // on a batch of events. 202 // 203 // The events in a batch are an arbitrary subset of all outstanding events. 204 // Because loading of events isn't synchronized with event senders, 205 // a recipient of events may see them in different order than the origination 206 // order, even if events were produced by a single sender. 207 // 208 // All actions that must be done atomically with updating state must be 209 // encapsulated inside Transition.SideEffectFn callback. 210 // 211 // Garbage events will be deleted non-transactionally before executing 212 // transactional transitions. These events may still be processed by a 213 // concurrent invocation of a Processor. The garbage events slice may re-use 214 // the given events slice. The garbage will be deleted even if PrepareMutation returns 215 // non-nil error. 216 // 217 // For correctness, two concurrent invocation of a Processor must choose the 218 // same events to be deleted as garbage. Consider scenario of 2 events A and B 219 // deemed semantically the same and 2 concurrent Processor invocations: 220 // P1: let me delete A and hope to transactionally process B. 221 // P2: ............ B and ............................... A. 222 // Then, it's a real possibility that A and B are both deleted AND no neither 223 // P1 nor P2 commits a transaction, thus forever forgetting about A and B. 224 PrepareMutation(context.Context, Events, State) (transitions []Transition, garbage Events, err error) 225 // FetchEVersion is called at the beginning of a transaction. 226 // 227 // The returned EVersion is compared against the one associated with a state 228 // loaded via GetState. If different, the transaction is aborted and new state 229 // isn't saved. 230 FetchEVersion(ctx context.Context) (EVersion, error) 231 // SaveState is called in a transaction to save the state if it has changed. 232 // 233 // The passed eversion is incremented value of eversion of what GetState 234 // returned before. 235 SaveState(context.Context, State, EVersion) error 236 } 237 238 // Event is an incoming event. 239 type Event dsset.Item 240 241 // Events are incoming events. 242 type Events []Event 243 244 // toEvents is an annoying redundant malloc to avoid exposing dsset.Item :( 245 func toEvents(items []dsset.Item) Events { 246 es := make(Events, len(items)) 247 for i, item := range items { 248 es[i] = Event(item) 249 } 250 return es 251 } 252 253 func listAndCleanup(ctx context.Context, r Recipient, d *dsset.Set, maxEvents int) (*dsset.Listing, error) { 254 tStart := clock.Now(ctx) 255 listing, err := d.List(ctx, maxEvents) 256 metricListDurationsS.Add(ctx, float64(clock.Since(ctx, tStart).Milliseconds()), r.MonitoringString, monitoringResult(err)) 257 if err != nil { 258 return nil, err 259 } 260 metricSize.Set(ctx, int64(len(listing.Items)), r.MonitoringString) 261 metricOldestAgeS.Set(ctx, oldestEventAge(ctx, listing.Items).Seconds(), r.MonitoringString) 262 263 if err := dsset.CleanupGarbage(ctx, listing.Garbage); err != nil { 264 return nil, err 265 } 266 metricRemoved.Add(ctx, int64(len(listing.Garbage)), r.MonitoringString) 267 return listing, nil 268 } 269 270 func oldestEventAge(ctx context.Context, items []dsset.Item) time.Duration { 271 var oldest time.Time 272 for _, item := range items { 273 // NOTE: there can be some events with old IDs, which didn't record 274 // timestamps. 275 if parts := strings.SplitN(item.ID, "/", 2); len(parts) == 2 { 276 if unixNano, err := strconv.ParseInt(parts[1], 10, 64); err == nil { 277 if t := time.Unix(0, unixNano); oldest.IsZero() || oldest.After(t) { 278 oldest = t 279 } 280 } 281 } 282 } 283 if oldest.IsZero() { 284 return 0 285 } 286 age := clock.Since(ctx, oldest) 287 if age < 0 { 288 // Clocks aren't perfectly synchronized, so round age up to 0. 289 age = 0 290 } 291 return age 292 } 293 294 func deleteSemanticGarbage(ctx context.Context, r Recipient, d *dsset.Set, events Events) error { 295 l := len(events) 296 if l == 0 { 297 return nil 298 } 299 logging.Debugf(ctx, "eventbox deleting %d semantic garbage events before transaction", l) 300 i := -1 301 err := d.Delete(ctx, func() string { 302 i++ 303 if i < l { 304 return events[i].ID 305 } 306 return "" 307 }) 308 if err != nil { 309 return errors.Annotate(err, "failed to delete %d semantic garbage events before transaction", l).Err() 310 } 311 metricRemoved.Add(ctx, int64(l), r.MonitoringString) 312 return nil 313 } 314 315 // State is an arbitrary object. 316 // 317 // Use a pointer to an actual state. 318 type State any 319 320 // EVersion is recipient entity version. 321 type EVersion int64 322 323 // PostProcessFn should be executed after event processing completes. 324 type PostProcessFn func(context.Context) error 325 326 // SideEffectFn performs side effects with a Datastore transaction context. 327 // See Transition.SideEffectFn doc. 328 type SideEffectFn func(context.Context) error 329 330 // Chain combines several SideEffectFn. 331 // 332 // NOTE: modifies incoming ... slice. 333 func Chain(fs ...SideEffectFn) SideEffectFn { 334 nonNil := fs[:0] 335 for _, f := range fs { 336 if f != nil { 337 nonNil = append(nonNil, f) 338 } 339 } 340 if len(nonNil) == 0 { 341 return nil 342 } 343 return func(ctx context.Context) error { 344 for _, f := range nonNil { 345 if err := f(ctx); err != nil { 346 return err 347 } 348 } 349 return nil 350 } 351 } 352 353 // Transition is a state transition. 354 type Transition struct { 355 // SideEffectFn is called in a transaction to atomically with the state change 356 // execute any side effects of a state transition. 357 // 358 // Typical use is notifying other CV components via TQ tasks. 359 // Can be nil, meaning there no side effects to execute. 360 // 361 // TODO(tandrii): introduce error tag to indicate that failure was clean and 362 // should be treated as if Transition wasn't started, s.t. progress of all 363 // transitions before can be saved. 364 SideEffectFn SideEffectFn 365 // Events to consume with this transition. 366 Events Events 367 // TransitionTo is a state to transition to. 368 // 369 // It's allowed to transition to the exact same state. 370 TransitionTo State 371 // PostProcessFn is the function to be called by the eventbox user after 372 // event processing completes. 373 // 374 // Note that it will be called outside of the transaction of all state 375 // transitions, so the operation inside this function is not expected 376 // to be atomic with this state transition. 377 PostProcessFn PostProcessFn 378 } 379 380 func (t *Transition) apply(ctx context.Context, p *dsset.PopOp) error { 381 if t.SideEffectFn != nil { 382 if err := t.SideEffectFn(ctx); err != nil { 383 return err 384 } 385 } 386 for _, e := range t.Events { 387 _ = p.Pop(e.ID) // Silently ignore if event has already been consumed. 388 } 389 return nil 390 } 391 392 // isNoop returns true if the Transition can be skipped entirely. 393 func (t *Transition) isNoop(oldState State) bool { 394 return t.SideEffectFn == nil && len(t.Events) == 0 && t.TransitionTo == oldState && t.PostProcessFn == nil 395 } 396 397 // withoutNoops returns only actionable transitions in the original order. 398 // 399 // Modifies incoming slice. 400 func withoutNoops(all []Transition, s State) []Transition { 401 ret := all[:0] 402 for _, t := range all { 403 if t.isNoop(s) { 404 continue 405 } 406 ret = append(ret, t) 407 s = t.TransitionTo 408 } 409 return ret 410 }