go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/state/components.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package state 16 17 import ( 18 "context" 19 "fmt" 20 "sync" 21 "sync/atomic" 22 "time" 23 24 "google.golang.org/protobuf/encoding/protojson" 25 "google.golang.org/protobuf/types/known/timestamppb" 26 27 "go.chromium.org/luci/common/clock" 28 "go.chromium.org/luci/common/errors" 29 "go.chromium.org/luci/common/logging" 30 "go.chromium.org/luci/common/retry/transient" 31 "go.chromium.org/luci/common/runtime/paniccatcher" 32 "go.chromium.org/luci/common/sync/parallel" 33 34 "go.chromium.org/luci/cv/internal/common" 35 "go.chromium.org/luci/cv/internal/configs/prjcfg" 36 "go.chromium.org/luci/cv/internal/prjmanager/itriager" 37 "go.chromium.org/luci/cv/internal/prjmanager/prjpb" 38 "go.chromium.org/luci/cv/internal/run/runcreator" 39 ) 40 41 const concurrentComponentProcessing = 16 42 43 var errCaughtPanic = errors.New("caught panic") 44 45 // earliestDecisionTime returns the earliest decision time of all components. 46 // 47 // Returns the same time as time.Time and as proto, and boolean indicating that 48 // earliestDecisionTime is as soon as possible. 49 // 50 // Re-uses DecisionTime of one of the components, assuming that components are 51 // modified copy-on-write. 52 func earliestDecisionTime(cs []*prjpb.Component) (time.Time, *timestamppb.Timestamp, bool) { 53 var ret time.Time 54 var retPB *timestamppb.Timestamp 55 for _, c := range cs { 56 if c.GetTriageRequired() { 57 return time.Time{}, nil, true 58 } 59 if dt := c.GetDecisionTime(); dt != nil { 60 if t := dt.AsTime(); ret.IsZero() || ret.After(t) { 61 ret = t 62 retPB = dt 63 } 64 } 65 } 66 return ret, retPB, false 67 } 68 69 // cAction is a component action to be taken during current PM mutation. 70 // 71 // An action may involve creating one or more new Runs, 72 // or removing CQ votes from CL(s) which can't form a Run for some reason. 73 type cAction struct { 74 itriager.Result 75 componentIndex int 76 77 // runsFailed is modified during actOnComponents. 78 runsFailed int32 79 } 80 81 // triageComponents triages components. 82 // 83 // Doesn't modify the state itself. 84 // 85 // Returns: 86 // - an action per each component that needs acting upon; 87 // - indication whether the state should be stored for debugging purpose; 88 // - error, if any. 89 func (h *Handler) triageComponents(ctx context.Context, s *State) ([]*cAction, bool, error) { 90 var sup itriager.PMState 91 sup, err := s.makeTriageSupporter(ctx) 92 if err != nil { 93 return nil, false, err 94 } 95 96 poolSize := min(concurrentComponentProcessing, len(s.PB.GetComponents())) 97 now := clock.Now(ctx) 98 99 var mutex sync.Mutex 100 var actions []*cAction 101 poolErr := parallel.WorkPool(poolSize, func(work chan<- func() error) { 102 for i, oldC := range s.PB.GetComponents() { 103 i, oldC := i, oldC 104 if !needsTriage(oldC, now) { 105 continue 106 } 107 work <- func() error { 108 switch res, err := h.triageOneComponent(ctx, s, oldC, sup); { 109 case itriager.IsErrOutdatedPMState(err): 110 return nil 111 case err != nil: 112 // Log error here since only total errs count will be propagated up 113 // the stack. 114 level := logging.Error 115 if transient.Tag.In(err) { 116 level = logging.Warning 117 } 118 logging.Logf(ctx, level, "%s while processing component: %s", err, protojson.Format(oldC)) 119 return err 120 default: 121 mutex.Lock() 122 actions = append(actions, &cAction{componentIndex: i, Result: res}) 123 mutex.Unlock() 124 return nil 125 } 126 } 127 } 128 }) 129 switch merrs, ok := poolErr.(errors.MultiError); { 130 case poolErr == nil || (ok && len(merrs) == 0): 131 return actions, false, nil 132 case !ok: 133 panic(fmt.Errorf("unexpected return from parallel.WorkPool: %s", poolErr)) 134 case len(actions) > 0: 135 // Components are independent, so proceed despite errors on some components 136 // since partial progress is better than none. 137 logging.Warningf(ctx, "triageComponents: %d errors, but proceeding to act on %d components", len(merrs), len(actions)) 138 return actions, errors.Contains(merrs, errCaughtPanic), nil 139 default: 140 err := common.MostSevereError(merrs) 141 return nil, false, errors.Annotate(err, "failed to triage %d components, keeping the most severe error", len(merrs)).Err() 142 } 143 } 144 145 func needsTriage(c *prjpb.Component, now time.Time) bool { 146 if c.GetTriageRequired() { 147 return true // external event arrived 148 } 149 next := c.GetDecisionTime() 150 if next == nil { 151 return false // wait for an external event 152 } 153 if next.AsTime().After(now) { 154 return false // too early 155 } 156 return true 157 } 158 159 func (h *Handler) triageOneComponent(ctx context.Context, s *State, oldC *prjpb.Component, sup itriager.PMState) (res itriager.Result, err error) { 160 defer paniccatcher.Catch(func(p *paniccatcher.Panic) { 161 logging.Errorf(ctx, "caught panic %s:\n\n%s", p.Reason, p.Stack) 162 // Log as a separate message under debug level to avoid sending it to Cloud 163 // Error. 164 logging.Debugf(ctx, "caught panic current state:\n%s", protojson.Format(s.PB)) 165 err = errCaughtPanic 166 }) 167 res, err = h.ComponentTriage(ctx, oldC, sup) 168 if err != nil { 169 return 170 } 171 if res.NewValue != nil && res.NewValue == oldC { 172 panic(fmt.Errorf("new value re-uses prior component object, must use copy-on-write instead")) 173 } 174 if len(res.CLsToPurge) > 0 { 175 s.validatePurgeCLTasks(oldC, res.CLsToPurge) 176 } 177 return 178 } 179 180 // actOnComponents executes actions on components produced by triageComponents. 181 // 182 // Expects the state to already be shallow cloned. 183 func (h *Handler) actOnComponents(ctx context.Context, s *State, actions []*cAction) (SideEffect, error) { 184 // First, create Runs in parallel. 185 // As Run creation may take considerable time, use an earlier deadline to have 186 // enough time to save state for other components. 187 ctxRunCreation, cancel := earlierDeadline(ctx, 5*time.Second) 188 defer cancel() 189 poolSize := min(concurrentComponentProcessing, len(actions)) 190 runsErr := parallel.WorkPool(poolSize, func(work chan<- func() error) { 191 for _, action := range actions { 192 for _, rc := range action.RunsToCreate { 193 action, rc := action, rc 194 c := s.PB.GetComponents()[action.componentIndex] 195 work <- func() error { 196 err := h.createOneRun(ctxRunCreation, rc, c) 197 if err != nil { 198 atomic.AddInt32(&action.runsFailed, 1) 199 // Log error here since only total errs count will be propagated up 200 // the stack. 201 level := logging.Error 202 if transient.Tag.In(err) { 203 level = logging.Warning 204 } 205 logging.Logf(ctx, level, "Failed to create a Run in component\n%s\ndue to error: %s", protojson.Format(c), err) 206 } 207 return err 208 } 209 } 210 } 211 }) 212 213 // Keep runsErr for now and try to make progress on actions/components 214 // without errors. 215 216 // Shallow-copy the components slice, as one or more components are highly 217 // likely to be modified in a loop below. 218 s.PB.Components = append(([]*prjpb.Component)(nil), s.PB.GetComponents()...) 219 runsCreated, componentsUpdated := 0, 0 220 var purgeTasks []*prjpb.PurgeCLTask 221 var triggerTasks []*prjpb.TriggeringCLDepsTask 222 for _, action := range actions { 223 if action.runsFailed > 0 { 224 continue 225 } 226 runsCreated += len(action.RunsToCreate) 227 purgeTasks = append(purgeTasks, action.CLsToPurge...) 228 // CLsToTrigger in each action is a series of the deps are chained 229 // together, and the TQ handler will vote them sequentially. 230 // 231 // A separate task should be triggered for each action. Otherwise, 232 // The deps of unrelated CLs will be handled by a single task, and 233 // the CQ vote processes will be blocked by unrelated chains. 234 // 235 // Note that there can be more than one inflight TriggeringCLsTask 236 // for the same component. 237 // 238 // Let's say that 239 // - there are CL1,2,3,4,5 and CL1 is the bottommost CL. 240 // - CQ+2 is triggered on CL3. 241 // - CV processed the vote and enqueued a TriggeringCLsTask for CL1/2. 242 // - Before or while the TriggeringCLsTask is being processed, 243 // * CQ+2 is triggered on CL5 244 // * CV processed the CQ vote on CL5 and 245 // enqueued another TriggeringCLsTask for CL4 246 // 247 // There is a very small chance that CL4 may get voted earlier than 248 // CL1/2/3. However, it's not a big deal, even if it happens. 249 if len(action.CLsToTriggerDeps) > 0 { 250 for _, item := range action.CLsToTriggerDeps { 251 triggerTasks = append(triggerTasks, &prjpb.TriggeringCLDepsTask{ 252 TriggeringClDeps: item, 253 }) 254 } 255 } 256 if action.NewValue != nil { 257 s.PB.Components[action.componentIndex] = action.NewValue 258 componentsUpdated++ 259 } 260 } 261 var purgeSE SideEffect 262 curPurgeCLCount := len(s.PB.PurgingCls) 263 if len(purgeTasks) > 0 { 264 purgeSE = h.addCLsToPurge(ctx, s, purgeTasks) 265 } 266 var triggerSE SideEffect 267 curTriggerCLDepsCount := len(s.PB.TriggeringClDeps) 268 if len(triggerTasks) > 0 { 269 triggerSE = h.addCLsToTriggerDeps(ctx, s, triggerTasks) 270 } 271 sideEffect := NewSideEffects(purgeSE, triggerSE) 272 proceedMsg := fmt.Sprintf( 273 // report # of events processed and the new CLs that will be triggered 274 // or purged. i.e., the existing CLs that are in the process of purge 275 // or trigger are not counted. 276 "proceeding to save %d components, purge %d CLs, and trigger %d CL Deps", 277 componentsUpdated, 278 len(s.PB.PurgingCls)-curPurgeCLCount, 279 len(s.PB.TriggeringClDeps)-curTriggerCLDepsCount, 280 ) 281 // Finally, decide the final result. 282 switch merrs, ok := runsErr.(errors.MultiError); { 283 case runsErr == nil || (ok && len(merrs) == 0): 284 logging.Infof(ctx, "actOnComponents: created %d Runs, %s", runsCreated, proceedMsg) 285 return sideEffect, nil 286 case !ok: 287 panic(fmt.Errorf("unexpected return from parallel.WorkPool")) 288 default: 289 logging.Warningf(ctx, "actOnComponents: created %d Runs, failed to create %d Runs", runsCreated, len(merrs)) 290 if componentsUpdated+len(purgeTasks) == 0 { 291 err := common.MostSevereError(merrs) 292 return nil, errors.Annotate(err, "failed to actOnComponents, most severe error").Err() 293 } 294 // All actions are independent, so proceed despite the errors since partial 295 // progress is better than none. 296 logging.Debugf(ctx, "actOnComponents: %s", proceedMsg) 297 return sideEffect, nil 298 } 299 } 300 301 func (h *Handler) createOneRun(ctx context.Context, rc *runcreator.Creator, c *prjpb.Component) (err error) { 302 defer paniccatcher.Catch(func(p *paniccatcher.Panic) { 303 logging.Errorf(ctx, "caught panic while creating a Run %s\n\n%s", p.Reason, p.Stack) 304 err = errCaughtPanic 305 }) 306 307 switch _, err = rc.Create(ctx, h.CLMutator, h.PMNotifier, h.RunNotifier); { 308 case err == nil: 309 return nil 310 case runcreator.StateChangedTag.In(err): 311 // This is a transient error at component action level: on retry, the Triage() 312 // function will re-evaulate the state. 313 return transient.Tag.Apply(err) 314 default: 315 return err 316 } 317 } 318 319 // validatePurgeCLTasks verifies correctness of tasks from Triage. 320 // 321 // Modifies given tasks in place. 322 // Panics in case of problems. 323 func (s *State) validatePurgeCLTasks(c *prjpb.Component, ts []*prjpb.PurgeCLTask) { 324 // First, verify individual tasks have expected fields set. 325 m := make(common.CLIDsSet, len(ts)) 326 for _, t := range ts { 327 id := t.GetPurgingCl().GetClid() 328 switch { 329 case id == 0: 330 panic(fmt.Errorf("clid must be set")) 331 case m.HasI64(id): 332 panic(fmt.Errorf("duplicated clid %d", id)) 333 case len(t.GetPurgeReasons()) == 0: 334 panic(fmt.Errorf("at least 1 reason must be given")) 335 } 336 for i, r := range t.GetPurgeReasons() { 337 if r.GetClError().GetKind() == nil { 338 panic(fmt.Errorf("PurgeReason #%d is nil", i)) 339 } 340 if r.GetApplyTo() == nil { 341 panic(fmt.Errorf("which trigger(s) the PurgeReason applies to must be specified")) 342 } 343 } 344 345 m.AddI64(id) 346 } 347 // Verify only CLs not yet purged are being purged. 348 // NOTE: This iterates all CLs currently being purged, but there should be 349 // very few such CLs compared to the total number of tracked CLs. 350 for _, p := range s.PB.GetPurgingCls() { 351 if m.HasI64(p.GetClid()) { 352 panic(fmt.Errorf("can't purge %d CL which is already being purged", p.GetClid())) 353 } 354 } 355 // Verify only CLs from the component are being purged. 356 for _, clid := range c.GetClids() { 357 m.DelI64(clid) 358 } 359 if len(m) > 0 { 360 panic(fmt.Errorf("purging %v CLs outside the component", m)) 361 } 362 } 363 364 // addCLsToPurge changes PB.PurgingCLs and prepares for atomic creation of TQ 365 // tasks to do actual purge. 366 // 367 // Expects given tasks to be correct (see validatePurgeCLTasks). 368 func (h *Handler) addCLsToPurge(ctx context.Context, s *State, ts []*prjpb.PurgeCLTask) SideEffect { 369 if len(ts) == 0 { 370 return nil 371 } 372 s.populatePurgeCLTasks(ctx, ts) 373 purgingCLs := make([]*prjpb.PurgingCL, len(ts)) 374 for i, t := range ts { 375 purgingCLs[i] = t.GetPurgingCl() 376 } 377 s.PB.PurgingCls, _ = s.PB.COWPurgingCLs(nil, purgingCLs) 378 return &TriggerPurgeCLTasks{payloads: ts, clPurger: h.CLPurger} 379 } 380 381 // addCLsTriggerDeps updates PB.TriggerCLDeps and prepares TQ tasks for the CLs 382 // to trigger. 383 func (h *Handler) addCLsToTriggerDeps(ctx context.Context, s *State, ts []*prjpb.TriggeringCLDepsTask) SideEffect { 384 if len(ts) == 0 { 385 return nil 386 } 387 deadline := timestamppb.New(clock.Now(ctx).Add(prjpb.MaxTriggeringCLDepsDuration)) 388 opInt := deadline.AsTime().Unix() 389 payloads := make([]*prjpb.TriggeringCLDeps, len(ts)) 390 for i, t := range ts { 391 payload := t.GetTriggeringClDeps() 392 payload.OperationId = fmt.Sprintf("%d-%d", opInt, payload.GetOriginClid()) 393 payload.Deadline = deadline 394 payloads[i] = payload 395 t.LuciProject = s.PB.GetLuciProject() 396 } 397 s.PB.TriggeringClDeps, _ = s.PB.COWTriggeringCLDeps(nil, payloads) 398 return &ScheduleTriggeringCLDepsTasks{payloads: ts, clTriggerer: h.CLTriggerer} 399 } 400 401 // maxPurgingCLDuration limits the time that a TQ task has to execute 402 // PurgeCLTask. 403 const maxPurgingCLDuration = 10 * time.Minute 404 405 // populatePurgeCLTasks populates all remaining fields in PurgeCLsTasks created 406 // by Triage. 407 // 408 // Modifies given tasks in place. 409 func (s *State) populatePurgeCLTasks(ctx context.Context, ts []*prjpb.PurgeCLTask) { 410 deadline := timestamppb.New(clock.Now(ctx).Add(maxPurgingCLDuration)) 411 opInt := deadline.AsTime().Unix() 412 for _, t := range ts { 413 id := t.GetPurgingCl().GetClid() 414 pcl := s.PB.GetPcls()[s.pclIndex[common.CLID(id)]] 415 416 t.LuciProject = s.PB.GetLuciProject() 417 t.PurgingCl.Deadline = deadline 418 t.PurgingCl.OperationId = fmt.Sprintf("%d-%d", opInt, id) 419 t.ConfigGroups = make([]string, len(pcl.GetConfigGroupIndexes())) 420 for i, idx := range pcl.GetConfigGroupIndexes() { 421 id := prjcfg.MakeConfigGroupID(s.PB.GetConfigHash(), s.PB.ConfigGroupNames[idx]) 422 t.ConfigGroups[i] = string(id) 423 } 424 } 425 } 426 427 func (s *State) makeTriageSupporter(ctx context.Context) (*triageSupporter, error) { 428 if s.configGroups == nil { 429 meta, err := prjcfg.GetHashMeta(ctx, s.PB.GetLuciProject(), s.PB.GetConfigHash()) 430 if err != nil { 431 return nil, err 432 } 433 if s.configGroups, err = meta.GetConfigGroups(ctx); err != nil { 434 return nil, err 435 } 436 } 437 s.ensurePCLIndex() 438 purging := make(map[int64]*prjpb.PurgingCL, len(s.PB.GetPurgingCls())) 439 for _, p := range s.PB.GetPurgingCls() { 440 purging[p.GetClid()] = p 441 } 442 triggering := make(map[int64]*prjpb.TriggeringCLDeps, len(s.PB.GetTriggeringClDeps())) 443 for _, t := range s.PB.GetTriggeringClDeps() { 444 triggering[t.GetOriginClid()] = t 445 } 446 447 return &triageSupporter{ 448 pcls: s.PB.GetPcls(), 449 pclIndex: s.pclIndex, 450 purging: purging, 451 triggeringCLDeps: triggering, 452 configGroups: s.configGroups, 453 }, nil 454 } 455 456 // triageSupporter provides limited access to resources of PM state. 457 // 458 // Implements itriager.PMState. 459 type triageSupporter struct { 460 pcls []*prjpb.PCL 461 pclIndex map[common.CLID]int 462 purging map[int64]*prjpb.PurgingCL 463 triggeringCLDeps map[int64]*prjpb.TriggeringCLDeps 464 configGroups []*prjcfg.ConfigGroup 465 } 466 467 var _ itriager.PMState = (*triageSupporter)(nil) 468 469 func (a *triageSupporter) PCL(clid int64) *prjpb.PCL { 470 i, ok := a.pclIndex[common.CLID(clid)] 471 if !ok { 472 return nil 473 } 474 return a.pcls[i] 475 } 476 477 func (a *triageSupporter) PurgingCL(clid int64) *prjpb.PurgingCL { 478 return a.purging[clid] 479 } 480 481 func (a *triageSupporter) ConfigGroup(index int32) *prjcfg.ConfigGroup { 482 return a.configGroups[index] 483 } 484 485 func (a *triageSupporter) TriggeringCLDeps(clid int64) *prjpb.TriggeringCLDeps { 486 return a.triggeringCLDeps[clid] 487 } 488 489 func markForTriage(in []*prjpb.Component) []*prjpb.Component { 490 out := make([]*prjpb.Component, len(in)) 491 for i, c := range in { 492 if !c.GetTriageRequired() { 493 c = c.CloneShallow() 494 c.TriageRequired = true 495 } 496 out[i] = c 497 } 498 return out 499 } 500 501 func markForTriageOnChangedPCLs(in []*prjpb.Component, pcls []*prjpb.PCL, changed common.CLIDsSet) []*prjpb.Component { 502 // For each changed CL `A`, expand changed set to include all CLs `B` such 503 // that B depends on A. 504 reverseDeps := make(map[int64][]int64, len(pcls)) // `A` -> all such `B` CLs 505 for _, p := range pcls { 506 for _, dep := range p.GetDeps() { 507 reverseDeps[dep.GetClid()] = append(reverseDeps[dep.GetClid()], p.GetClid()) 508 } 509 } 510 expanded := make(common.CLIDsSet, len(changed)) 511 var expand func(int64) 512 expand = func(clid int64) { 513 if expanded.HasI64(clid) { 514 return 515 } 516 expanded.AddI64(clid) 517 for _, revDep := range reverseDeps[clid] { 518 expand(revDep) 519 } 520 } 521 for clid := range changed { 522 expand(int64(clid)) 523 } 524 525 out := make([]*prjpb.Component, len(in)) 526 for i, c := range in { 527 if !c.GetTriageRequired() { 528 for _, clid := range c.GetClids() { 529 if expanded.HasI64(clid) { 530 c = c.CloneShallow() 531 c.TriageRequired = true 532 break 533 } 534 } 535 } 536 out[i] = c 537 } 538 return out 539 } 540 541 func earlierDeadline(ctx context.Context, reserve time.Duration) (context.Context, context.CancelFunc) { 542 deadline, ok := ctx.Deadline() 543 if !ok { 544 return ctx, func() {} // no deadline 545 } 546 return clock.WithDeadline(ctx, deadline.Add(-reserve)) 547 }