go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/manager/manager.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package manager 16 17 import ( 18 "context" 19 "fmt" 20 "time" 21 22 "google.golang.org/protobuf/proto" 23 24 "go.chromium.org/luci/common/clock" 25 "go.chromium.org/luci/common/data/stringset" 26 "go.chromium.org/luci/common/errors" 27 "go.chromium.org/luci/common/logging" 28 "go.chromium.org/luci/common/retry/transient" 29 "go.chromium.org/luci/gae/filter/txndefer" 30 "go.chromium.org/luci/gae/service/datastore" 31 32 "go.chromium.org/luci/cv/internal/changelist" 33 "go.chromium.org/luci/cv/internal/common" 34 "go.chromium.org/luci/cv/internal/common/eventbox" 35 "go.chromium.org/luci/cv/internal/gerrit" 36 "go.chromium.org/luci/cv/internal/gerrit/poller" 37 "go.chromium.org/luci/cv/internal/prjmanager" 38 "go.chromium.org/luci/cv/internal/prjmanager/clpurger" 39 "go.chromium.org/luci/cv/internal/prjmanager/cltriggerer" 40 "go.chromium.org/luci/cv/internal/prjmanager/prjpb" 41 "go.chromium.org/luci/cv/internal/prjmanager/state" 42 "go.chromium.org/luci/cv/internal/prjmanager/triager" 43 "go.chromium.org/luci/cv/internal/run" 44 "go.chromium.org/luci/cv/internal/run/runcreator" 45 "go.chromium.org/luci/cv/internal/tracing" 46 ) 47 48 const ( 49 // maxEventsPerBatch limits the number of incoming events the PM will process at 50 // once. 51 // 52 // This shouldn't be hit in practice under normal operation. This is chosen such 53 // that PM can read these events and make some progress in 1 minute. 54 maxEventsPerBatch = 10000 55 56 // logProjectStateFrequency forces saving ProjectLog entity iff 57 // Project.EVersion is divisible by logProjectStateFrequency. 58 // 59 // In practice, the busiest projects sustain at most ~1 QPS of updates. 60 // Thus, value of 60 limits ProjectLog to at most 1/minute or 1.5k/day. 61 logProjectStateFrequency = 60 62 ) 63 64 var errTaskArrivedTooLate = errors.New("task arrived too late") 65 66 // ProjectManager implements managing projects. 67 type ProjectManager struct { 68 tasksBinding prjpb.TasksBinding 69 handler state.Handler 70 } 71 72 // New creates a new ProjectManager and registers it for handling tasks created 73 // by the given TQ Notifier. 74 func New(n *prjmanager.Notifier, rn state.RunNotifier, c *changelist.Mutator, g gerrit.Factory, u *changelist.Updater) *ProjectManager { 75 pm := &ProjectManager{ 76 tasksBinding: n.TasksBinding, 77 handler: state.Handler{ 78 CLMutator: c, 79 PMNotifier: n, 80 RunNotifier: rn, 81 CLPurger: clpurger.New(n, g, u, c), 82 CLTriggerer: cltriggerer.New(n, g, u, c), 83 CLPoller: poller.New(n.TasksBinding.TQDispatcher, g, u, n), 84 ComponentTriage: triager.Triage, 85 }, 86 } 87 n.TasksBinding.ManageProject.AttachHandler( 88 func(ctx context.Context, payload proto.Message) error { 89 task := payload.(*prjpb.ManageProjectTask) 90 ctx = logging.SetField(ctx, "project", task.GetLuciProject()) 91 err := pm.manageProject(ctx, task.GetLuciProject(), task.GetEta().AsTime()) 92 return common.TQIfy{ 93 KnownIgnore: []error{errTaskArrivedTooLate}, 94 KnownIgnoreTags: []errors.BoolTag{common.DSContentionTag}, 95 KnownRetryTags: []errors.BoolTag{runcreator.StateChangedTag}, 96 }.Error(ctx, err) 97 }, 98 ) 99 100 n.TasksBinding.KickManageProject.AttachHandler( 101 func(ctx context.Context, payload proto.Message) error { 102 task := payload.(*prjpb.KickManageProjectTask) 103 var eta time.Time 104 if t := task.GetEta(); t != nil { 105 eta = t.AsTime() 106 } 107 err := n.TasksBinding.Dispatch(ctx, task.GetLuciProject(), eta) 108 return common.TQifyError(ctx, err) 109 }, 110 ) 111 return pm 112 } 113 114 func (pm *ProjectManager) manageProject(ctx context.Context, luciProject string, taskETA time.Time) error { 115 retryViaNewTask := false 116 var processErr error 117 if delay := clock.Now(ctx).Sub(taskETA); delay > prjpb.MaxAcceptableDelay { 118 logging.Warningf(ctx, "task %s arrived %s late; scheduling next task instead", taskETA, delay) 119 retryViaNewTask = true 120 processErr = errTaskArrivedTooLate 121 } else { 122 processErr = pm.processBatch(ctx, luciProject) 123 if common.DSContentionTag.In(processErr) { 124 logging.Warningf(ctx, "Datastore contention; scheduling next task instead") 125 retryViaNewTask = true 126 } 127 } 128 129 if retryViaNewTask { 130 // Scheduling new task reduces probability of concurrent tasks in extreme 131 // events. 132 if err := pm.tasksBinding.Dispatch(ctx, luciProject, time.Time{}); err != nil { 133 // This should be rare and retry is the best we can do. 134 return err 135 } 136 } 137 return processErr 138 } 139 140 func (pm *ProjectManager) processBatch(ctx context.Context, luciProject string) error { 141 proc := &pmProcessor{ 142 luciProject: luciProject, 143 handler: &pm.handler, 144 } 145 recipient := prjmanager.EventboxRecipient(ctx, luciProject) 146 postProcessFns, err := eventbox.ProcessBatch(ctx, recipient, proc, maxEventsPerBatch) 147 if err != nil { 148 return err 149 } 150 if l := len(postProcessFns); l > 0 { 151 panic(fmt.Errorf("postProcessFns is not supported in PM; got %d", l)) 152 } 153 return nil 154 } 155 156 // pmProcessor implements eventbox.Processor. 157 type pmProcessor struct { 158 luciProject string 159 handler *state.Handler 160 // loadedPState is set by LoadState and read by SaveState. 161 loadedPState *prjpb.PState 162 } 163 164 // LoadState is called to load the state before a transaction. 165 func (proc *pmProcessor) LoadState(ctx context.Context) (eventbox.State, eventbox.EVersion, error) { 166 s := &state.State{} 167 switch p, err := prjmanager.Load(ctx, proc.luciProject); { 168 case err != nil: 169 return nil, 0, err 170 case p == nil: 171 s.PB = &prjpb.PState{LuciProject: proc.luciProject} 172 return s, 0, nil 173 default: 174 p.State.LuciProject = proc.luciProject 175 proc.loadedPState = p.State 176 s.PB = p.State 177 return s, eventbox.EVersion(p.EVersion), nil 178 } 179 } 180 181 // PrepareMutation is called before a transaction to compute transitions. 182 // 183 // All actions that must be done atomically with updating state must be 184 // encapsulated inside Transition.SideEffectFn callback. 185 func (proc *pmProcessor) PrepareMutation(ctx context.Context, events eventbox.Events, s eventbox.State) (ts []eventbox.Transition, noops eventbox.Events, err error) { 186 ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/Mutate") 187 defer func() { tracing.End(span, err) }() 188 189 tr := &triageResult{} 190 for _, e := range events { 191 tr.triage(ctx, e) 192 } 193 tr.removeCLUpdateNoops() 194 195 ts, err = proc.mutate(ctx, tr, s.(*state.State)) 196 return ts, tr.noops, err 197 } 198 199 // FetchEVersion is called at the beginning of a transaction. 200 // 201 // The returned EVersion is compared against the one associated with a state 202 // loaded via GetState. If different, the transaction is aborted and new state 203 // isn't saved. 204 func (proc *pmProcessor) FetchEVersion(ctx context.Context) (eventbox.EVersion, error) { 205 p := &prjmanager.Project{ID: proc.luciProject} 206 switch err := datastore.Get(ctx, p); { 207 case err == datastore.ErrNoSuchEntity: 208 return 0, nil 209 case err != nil: 210 return 0, errors.Annotate(err, "failed to get %q", proc.luciProject).Tag(transient.Tag).Err() 211 default: 212 return eventbox.EVersion(p.EVersion), nil 213 } 214 } 215 216 // SaveState is called in a transaction to save the state if it has changed. 217 // 218 // The passed EVersion is the incremented value of EVersion of what GetState 219 // returned before. 220 func (proc *pmProcessor) SaveState(ctx context.Context, st eventbox.State, ev eventbox.EVersion) error { 221 s := st.(*state.State) 222 // Erase PB.LuciProject as it's already stored as Project{ID:...}. 223 s.PB.LuciProject = "" 224 225 new := &prjmanager.Project{ 226 ID: proc.luciProject, 227 EVersion: int64(ev), 228 UpdateTime: datastore.RoundTime(clock.Now(ctx).UTC()), 229 State: s.PB, 230 } 231 entities := make([]any, 1, 3) 232 entities[0] = new 233 234 old := proc.loadedPState 235 if s.PB.GetConfigHash() != old.GetConfigHash() || s.PB.GetStatus() != old.GetStatus() { 236 entities = append(entities, &prjmanager.ProjectStateOffload{ 237 Project: datastore.MakeKey(ctx, prjmanager.ProjectKind, proc.luciProject), 238 Status: s.PB.GetStatus(), 239 ConfigHash: s.PB.GetConfigHash(), 240 UpdateTime: clock.Now(ctx).UTC(), 241 }) 242 } 243 244 switch reasons := s.LogReasons; { 245 case new.EVersion%logProjectStateFrequency == 0: 246 reasons = append(s.LogReasons, prjpb.LogReason_FYI_PERIODIC) 247 fallthrough 248 case len(reasons) > 0: 249 deduped := prjpb.SortAndDedupeLogReasons(reasons) 250 txndefer.Defer(ctx, func(ctx context.Context) { 251 logging.Debugf(ctx, "Saved ProjectLog @ %d due to %s", new.EVersion, prjpb.FormatLogReasons(deduped)) 252 }) 253 entities = append(entities, &prjmanager.ProjectLog{ 254 Project: datastore.MakeKey(ctx, prjmanager.ProjectKind, proc.luciProject), 255 EVersion: new.EVersion, 256 Status: s.PB.GetStatus(), 257 ConfigHash: s.PB.GetConfigHash(), 258 State: new.State, 259 UpdateTime: new.UpdateTime, 260 Reasons: deduped, 261 }) 262 } 263 264 if err := datastore.Put(ctx, entities...); err != nil { 265 return errors.Annotate(err, "failed to put Project").Tag(transient.Tag).Err() 266 } 267 return nil 268 } 269 270 // triageResult is the result of the triage of the incoming events. 271 type triageResult struct { 272 // Noops are events that can be safely deleted before a transaction 273 // because another semantically **superseding** event will remain in 274 // eventbox. 275 // 276 // Safety note: semantically the same event isn't sufficient, since 277 // concurrent invocations of a PM must agree on which events can be deleted 278 // and which must be kept. 279 noops eventbox.Events 280 281 // newConfig stores newConfig event with the largest ID if any. 282 newConfig eventbox.Events 283 // poke stores Poke event with the largest ID if any. 284 poke eventbox.Events 285 286 clsUpdated struct { 287 // maps CLID to latest EVersion. 288 clEVersions map[int64]int64 289 // maps CLID to event ID of CLUpdated or CLsUpdated events. 290 clEvents map[int64]string 291 // initially, all events. removeCLUpdateNoops() leaves only referenced ones. 292 events eventbox.Events 293 } 294 runsCreated struct { 295 // events and runs are in random order. 296 events eventbox.Events 297 runs common.RunIDs 298 } 299 runsFinished struct { 300 events eventbox.Events 301 runs map[common.RunID]run.Status 302 } 303 purgesCompleted struct { 304 events eventbox.Events 305 purges []*prjpb.PurgeCompleted 306 } 307 triggeringCLDepsCompleted struct { 308 events eventbox.Events 309 triggers []*prjpb.TriggeringCLDepsCompleted 310 } 311 } 312 313 func (tr *triageResult) triage(ctx context.Context, item eventbox.Event) { 314 e := &prjpb.Event{} 315 if err := proto.Unmarshal(item.Value, e); err != nil { 316 // This is a bug in code or data corruption. 317 // There is no way to recover on its own. 318 logging.Errorf(ctx, "CRITICAL: failed to deserialize event %q: %s", item.ID, err) 319 panic(err) 320 } 321 switch v := e.GetEvent().(type) { 322 case *prjpb.Event_NewConfig: 323 tr.highestIDWins(item, &tr.newConfig) 324 case *prjpb.Event_Poke: 325 tr.highestIDWins(item, &tr.poke) 326 327 case *prjpb.Event_ClsUpdated: 328 tr.clsUpdated.events = append(tr.clsUpdated.events, item) 329 for _, cl := range v.ClsUpdated.GetEvents() { 330 tr.triageCLUpdated(cl, item.ID) 331 } 332 333 case *prjpb.Event_RunCreated: 334 tr.runsCreated.events = append(tr.runsCreated.events, item) 335 tr.runsCreated.runs = append(tr.runsCreated.runs, common.RunID(v.RunCreated.GetRunId())) 336 case *prjpb.Event_RunFinished: 337 tr.runsFinished.events = append(tr.runsFinished.events, item) 338 if tr.runsFinished.runs == nil { 339 tr.runsFinished.runs = make(map[common.RunID]run.Status) 340 } 341 tr.runsFinished.runs[common.RunID(v.RunFinished.GetRunId())] = v.RunFinished.GetStatus() 342 case *prjpb.Event_PurgeCompleted: 343 tr.purgesCompleted.events = append(tr.purgesCompleted.events, item) 344 tr.purgesCompleted.purges = append(tr.purgesCompleted.purges, v.PurgeCompleted) 345 case *prjpb.Event_TriggeringClDepsCompleted: 346 tr.triggeringCLDepsCompleted.events = append(tr.triggeringCLDepsCompleted.events, item) 347 tr.triggeringCLDepsCompleted.triggers = append(tr.triggeringCLDepsCompleted.triggers, v.TriggeringClDepsCompleted) 348 default: 349 panic(fmt.Errorf("unknown event: %T [id=%q]", e.GetEvent(), item.ID)) 350 } 351 } 352 353 func (tr *triageResult) highestIDWins(item eventbox.Event, target *eventbox.Events) { 354 if len(*target) == 0 { 355 *target = eventbox.Events{item} 356 return 357 } 358 if i := (*target)[0]; i.ID < item.ID { 359 tr.noops = append(tr.noops, i) 360 (*target)[0] = item 361 } else { 362 tr.noops = append(tr.noops, item) 363 } 364 } 365 366 func (tr *triageResult) triageCLUpdated(v *changelist.CLUpdatedEvent, id string) { 367 clid := v.GetClid() 368 ev := v.GetEversion() 369 370 cu := &tr.clsUpdated 371 if curEV, exists := cu.clEVersions[v.GetClid()]; !exists || curEV < ev { 372 if cu.clEVersions == nil { 373 cu.clEVersions = make(map[int64]int64, 1) 374 cu.clEvents = make(map[int64]string, 1) 375 } 376 cu.clEVersions[clid] = ev 377 cu.clEvents[clid] = id 378 } 379 } 380 381 func (tr *triageResult) removeCLUpdateNoops() { 382 cu := &tr.clsUpdated 383 eventIDs := stringset.New(len(cu.clEvents)) 384 for _, id := range cu.clEvents { 385 eventIDs.Add(id) 386 } 387 remaining := cu.events[:0] 388 for _, e := range cu.events { 389 if eventIDs.Has(e.ID) { 390 remaining = append(remaining, e) 391 } else { 392 tr.noops = append(tr.noops, e) 393 } 394 } 395 cu.events = remaining 396 cu.clEvents = nil // free memory 397 } 398 399 func (proc *pmProcessor) mutate(ctx context.Context, tr *triageResult, s *state.State) ([]eventbox.Transition, error) { 400 var err error 401 var se state.SideEffect 402 ret := make([]eventbox.Transition, 0, 7) 403 var evIndexesToConsume []int 404 405 if upgraded := s.UpgradeIfNecessary(); upgraded != s { 406 ret = append(ret, eventbox.Transition{TransitionTo: upgraded}) 407 s = upgraded 408 } 409 410 // Visit all non-empty fields of triageResult and emit Transitions. 411 // The order of visits matters. 412 413 // Even though OnRunCreated event is sent before OnRunFinished event, 414 // under rare conditions it's possible that OnRunsFinished will be read first, 415 // and OnRunsCreated will be read only in the next PM invocation 416 // (see https://crbug.com/1218681 for a concrete example). 417 if len(tr.runsCreated.runs) > 0 { 418 if s, se, err = proc.handler.OnRunsCreated(ctx, s, tr.runsCreated.runs); err != nil { 419 return nil, err 420 } 421 ret = append(ret, eventbox.Transition{ 422 Events: tr.runsCreated.events, 423 SideEffectFn: state.SideEffectFn(se), 424 TransitionTo: s, 425 }) 426 } 427 428 if len(tr.runsFinished.runs) > 0 { 429 if s, se, err = proc.handler.OnRunsFinished(ctx, s, tr.runsFinished.runs); err != nil { 430 return nil, err 431 } 432 ret = append(ret, eventbox.Transition{ 433 Events: tr.runsFinished.events, 434 SideEffectFn: state.SideEffectFn(se), 435 TransitionTo: s, 436 }) 437 } 438 439 // UpdateConfig event may result in stopping the PM, which requires notifying 440 // each of the incomplete Runs to stop. Thus, runsCreated must be processed 441 // before to ensure no Run will be missed. 442 if len(tr.newConfig) > 0 { 443 if s, se, err = proc.handler.UpdateConfig(ctx, s); err != nil { 444 return nil, err 445 } 446 ret = append(ret, eventbox.Transition{ 447 Events: tr.newConfig, 448 SideEffectFn: state.SideEffectFn(se), 449 TransitionTo: s, 450 }) 451 } 452 453 if len(tr.poke) > 0 { 454 if s, se, err = proc.handler.Poke(ctx, s); err != nil { 455 return nil, err 456 } 457 ret = append(ret, eventbox.Transition{ 458 Events: tr.poke, 459 SideEffectFn: state.SideEffectFn(se), 460 TransitionTo: s, 461 }) 462 } 463 464 if len(tr.clsUpdated.clEVersions) > 0 { 465 if s, se, err = proc.handler.OnCLsUpdated(ctx, s, tr.clsUpdated.clEVersions); err != nil { 466 return nil, err 467 } 468 ret = append(ret, eventbox.Transition{ 469 Events: tr.clsUpdated.events, 470 SideEffectFn: state.SideEffectFn(se), 471 TransitionTo: s, 472 }) 473 } 474 475 // OnPurgesCompleted may expire purges even without incoming event. 476 if s, se, evIndexesToConsume, err = proc.handler.OnPurgesCompleted(ctx, s, tr.purgesCompleted.purges); err != nil { 477 return nil, err 478 } 479 ret = append(ret, eventbox.Transition{ 480 Events: shallowCopyEvents(tr.purgesCompleted.events, evIndexesToConsume), 481 SideEffectFn: state.SideEffectFn(se), 482 TransitionTo: s, 483 }) 484 485 // OnTriggeringCLDepsCompleted may expire triggers even without incoming event. 486 s, se, evIndexesToConsume, err = proc.handler.OnTriggeringCLDepsCompleted(ctx, s, 487 tr.triggeringCLDepsCompleted.triggers, 488 ) 489 if err != nil { 490 return nil, err 491 } 492 ret = append(ret, eventbox.Transition{ 493 Events: shallowCopyEvents(tr.triggeringCLDepsCompleted.events, evIndexesToConsume), 494 SideEffectFn: state.SideEffectFn(se), 495 TransitionTo: s, 496 }) 497 498 if s, se, err = proc.handler.ExecDeferred(ctx, s); err != nil { 499 return nil, err 500 } 501 return append(ret, eventbox.Transition{ 502 SideEffectFn: state.SideEffectFn(se), 503 TransitionTo: s, 504 }), nil 505 } 506 507 func shallowCopyEvents(events []eventbox.Event, indexesToCopy []int) []eventbox.Event { 508 if len(events) == len(indexesToCopy) { 509 return events 510 } 511 ret := make([]eventbox.Event, len(events)) 512 for i, index := range indexesToCopy { 513 ret[i] = events[index] 514 } 515 return ret 516 }