go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/state/components.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package state
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sync"
    21  	"sync/atomic"
    22  	"time"
    23  
    24  	"google.golang.org/protobuf/encoding/protojson"
    25  	"google.golang.org/protobuf/types/known/timestamppb"
    26  
    27  	"go.chromium.org/luci/common/clock"
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/logging"
    30  	"go.chromium.org/luci/common/retry/transient"
    31  	"go.chromium.org/luci/common/runtime/paniccatcher"
    32  	"go.chromium.org/luci/common/sync/parallel"
    33  
    34  	"go.chromium.org/luci/cv/internal/common"
    35  	"go.chromium.org/luci/cv/internal/configs/prjcfg"
    36  	"go.chromium.org/luci/cv/internal/prjmanager/itriager"
    37  	"go.chromium.org/luci/cv/internal/prjmanager/prjpb"
    38  	"go.chromium.org/luci/cv/internal/run/runcreator"
    39  )
    40  
    41  const concurrentComponentProcessing = 16
    42  
    43  var errCaughtPanic = errors.New("caught panic")
    44  
    45  // earliestDecisionTime returns the earliest decision time of all components.
    46  //
    47  // Returns the same time as time.Time and as proto, and boolean indicating that
    48  // earliestDecisionTime is as soon as possible.
    49  //
    50  // Re-uses DecisionTime of one of the components, assuming that components are
    51  // modified copy-on-write.
    52  func earliestDecisionTime(cs []*prjpb.Component) (time.Time, *timestamppb.Timestamp, bool) {
    53  	var ret time.Time
    54  	var retPB *timestamppb.Timestamp
    55  	for _, c := range cs {
    56  		if c.GetTriageRequired() {
    57  			return time.Time{}, nil, true
    58  		}
    59  		if dt := c.GetDecisionTime(); dt != nil {
    60  			if t := dt.AsTime(); ret.IsZero() || ret.After(t) {
    61  				ret = t
    62  				retPB = dt
    63  			}
    64  		}
    65  	}
    66  	return ret, retPB, false
    67  }
    68  
    69  // cAction is a component action to be taken during current PM mutation.
    70  //
    71  // An action may involve creating one or more new Runs,
    72  // or removing CQ votes from CL(s) which can't form a Run for some reason.
    73  type cAction struct {
    74  	itriager.Result
    75  	componentIndex int
    76  
    77  	// runsFailed is modified during actOnComponents.
    78  	runsFailed int32
    79  }
    80  
    81  // triageComponents triages components.
    82  //
    83  // Doesn't modify the state itself.
    84  //
    85  // Returns:
    86  //   - an action per each component that needs acting upon;
    87  //   - indication whether the state should be stored for debugging purpose;
    88  //   - error, if any.
    89  func (h *Handler) triageComponents(ctx context.Context, s *State) ([]*cAction, bool, error) {
    90  	var sup itriager.PMState
    91  	sup, err := s.makeTriageSupporter(ctx)
    92  	if err != nil {
    93  		return nil, false, err
    94  	}
    95  
    96  	poolSize := min(concurrentComponentProcessing, len(s.PB.GetComponents()))
    97  	now := clock.Now(ctx)
    98  
    99  	var mutex sync.Mutex
   100  	var actions []*cAction
   101  	poolErr := parallel.WorkPool(poolSize, func(work chan<- func() error) {
   102  		for i, oldC := range s.PB.GetComponents() {
   103  			i, oldC := i, oldC
   104  			if !needsTriage(oldC, now) {
   105  				continue
   106  			}
   107  			work <- func() error {
   108  				switch res, err := h.triageOneComponent(ctx, s, oldC, sup); {
   109  				case itriager.IsErrOutdatedPMState(err):
   110  					return nil
   111  				case err != nil:
   112  					// Log error here since only total errs count will be propagated up
   113  					// the stack.
   114  					level := logging.Error
   115  					if transient.Tag.In(err) {
   116  						level = logging.Warning
   117  					}
   118  					logging.Logf(ctx, level, "%s while processing component: %s", err, protojson.Format(oldC))
   119  					return err
   120  				default:
   121  					mutex.Lock()
   122  					actions = append(actions, &cAction{componentIndex: i, Result: res})
   123  					mutex.Unlock()
   124  					return nil
   125  				}
   126  			}
   127  		}
   128  	})
   129  	switch merrs, ok := poolErr.(errors.MultiError); {
   130  	case poolErr == nil || (ok && len(merrs) == 0):
   131  		return actions, false, nil
   132  	case !ok:
   133  		panic(fmt.Errorf("unexpected return from parallel.WorkPool: %s", poolErr))
   134  	case len(actions) > 0:
   135  		// Components are independent, so proceed despite errors on some components
   136  		// since partial progress is better than none.
   137  		logging.Warningf(ctx, "triageComponents: %d errors, but proceeding to act on %d components", len(merrs), len(actions))
   138  		return actions, errors.Contains(merrs, errCaughtPanic), nil
   139  	default:
   140  		err := common.MostSevereError(merrs)
   141  		return nil, false, errors.Annotate(err, "failed to triage %d components, keeping the most severe error", len(merrs)).Err()
   142  	}
   143  }
   144  
   145  func needsTriage(c *prjpb.Component, now time.Time) bool {
   146  	if c.GetTriageRequired() {
   147  		return true // external event arrived
   148  	}
   149  	next := c.GetDecisionTime()
   150  	if next == nil {
   151  		return false // wait for an external event
   152  	}
   153  	if next.AsTime().After(now) {
   154  		return false // too early
   155  	}
   156  	return true
   157  }
   158  
   159  func (h *Handler) triageOneComponent(ctx context.Context, s *State, oldC *prjpb.Component, sup itriager.PMState) (res itriager.Result, err error) {
   160  	defer paniccatcher.Catch(func(p *paniccatcher.Panic) {
   161  		logging.Errorf(ctx, "caught panic %s:\n\n%s", p.Reason, p.Stack)
   162  		// Log as a separate message under debug level to avoid sending it to Cloud
   163  		// Error.
   164  		logging.Debugf(ctx, "caught panic current state:\n%s", protojson.Format(s.PB))
   165  		err = errCaughtPanic
   166  	})
   167  	res, err = h.ComponentTriage(ctx, oldC, sup)
   168  	if err != nil {
   169  		return
   170  	}
   171  	if res.NewValue != nil && res.NewValue == oldC {
   172  		panic(fmt.Errorf("new value re-uses prior component object, must use copy-on-write instead"))
   173  	}
   174  	if len(res.CLsToPurge) > 0 {
   175  		s.validatePurgeCLTasks(oldC, res.CLsToPurge)
   176  	}
   177  	return
   178  }
   179  
   180  // actOnComponents executes actions on components produced by triageComponents.
   181  //
   182  // Expects the state to already be shallow cloned.
   183  func (h *Handler) actOnComponents(ctx context.Context, s *State, actions []*cAction) (SideEffect, error) {
   184  	// First, create Runs in parallel.
   185  	// As Run creation may take considerable time, use an earlier deadline to have
   186  	// enough time to save state for other components.
   187  	ctxRunCreation, cancel := earlierDeadline(ctx, 5*time.Second)
   188  	defer cancel()
   189  	poolSize := min(concurrentComponentProcessing, len(actions))
   190  	runsErr := parallel.WorkPool(poolSize, func(work chan<- func() error) {
   191  		for _, action := range actions {
   192  			for _, rc := range action.RunsToCreate {
   193  				action, rc := action, rc
   194  				c := s.PB.GetComponents()[action.componentIndex]
   195  				work <- func() error {
   196  					err := h.createOneRun(ctxRunCreation, rc, c)
   197  					if err != nil {
   198  						atomic.AddInt32(&action.runsFailed, 1)
   199  						// Log error here since only total errs count will be propagated up
   200  						// the stack.
   201  						level := logging.Error
   202  						if transient.Tag.In(err) {
   203  							level = logging.Warning
   204  						}
   205  						logging.Logf(ctx, level, "Failed to create a Run in component\n%s\ndue to error: %s", protojson.Format(c), err)
   206  					}
   207  					return err
   208  				}
   209  			}
   210  		}
   211  	})
   212  
   213  	// Keep runsErr for now and try to make progress on actions/components
   214  	// without errors.
   215  
   216  	// Shallow-copy the components slice, as one or more components are highly
   217  	// likely to be modified in a loop below.
   218  	s.PB.Components = append(([]*prjpb.Component)(nil), s.PB.GetComponents()...)
   219  	runsCreated, componentsUpdated := 0, 0
   220  	var purgeTasks []*prjpb.PurgeCLTask
   221  	var triggerTasks []*prjpb.TriggeringCLDepsTask
   222  	for _, action := range actions {
   223  		if action.runsFailed > 0 {
   224  			continue
   225  		}
   226  		runsCreated += len(action.RunsToCreate)
   227  		purgeTasks = append(purgeTasks, action.CLsToPurge...)
   228  		// CLsToTrigger in each action is a series of the deps are chained
   229  		// together, and the TQ handler will vote them sequentially.
   230  		//
   231  		// A separate task should be triggered for each action. Otherwise,
   232  		// The deps of unrelated CLs will be handled by a single task, and
   233  		// the CQ vote processes will be blocked by unrelated chains.
   234  		//
   235  		// Note that there can be more than one inflight TriggeringCLsTask
   236  		// for the same component.
   237  		//
   238  		// Let's say that
   239  		// - there are CL1,2,3,4,5 and CL1 is the bottommost CL.
   240  		// - CQ+2 is triggered on CL3.
   241  		// - CV processed the vote and enqueued a TriggeringCLsTask for CL1/2.
   242  		// - Before or while the TriggeringCLsTask is being processed,
   243  		//   * CQ+2 is triggered on CL5
   244  		//   * CV processed the CQ vote on CL5 and
   245  		//     enqueued another TriggeringCLsTask for CL4
   246  		//
   247  		// There is a very small chance that CL4 may get voted earlier than
   248  		// CL1/2/3. However, it's not a big deal, even if it happens.
   249  		if len(action.CLsToTriggerDeps) > 0 {
   250  			for _, item := range action.CLsToTriggerDeps {
   251  				triggerTasks = append(triggerTasks, &prjpb.TriggeringCLDepsTask{
   252  					TriggeringClDeps: item,
   253  				})
   254  			}
   255  		}
   256  		if action.NewValue != nil {
   257  			s.PB.Components[action.componentIndex] = action.NewValue
   258  			componentsUpdated++
   259  		}
   260  	}
   261  	var purgeSE SideEffect
   262  	curPurgeCLCount := len(s.PB.PurgingCls)
   263  	if len(purgeTasks) > 0 {
   264  		purgeSE = h.addCLsToPurge(ctx, s, purgeTasks)
   265  	}
   266  	var triggerSE SideEffect
   267  	curTriggerCLDepsCount := len(s.PB.TriggeringClDeps)
   268  	if len(triggerTasks) > 0 {
   269  		triggerSE = h.addCLsToTriggerDeps(ctx, s, triggerTasks)
   270  	}
   271  	sideEffect := NewSideEffects(purgeSE, triggerSE)
   272  	proceedMsg := fmt.Sprintf(
   273  		// report # of events processed and the new CLs that will be triggered
   274  		// or purged. i.e., the existing CLs that are in the process of purge
   275  		// or trigger are not counted.
   276  		"proceeding to save %d components, purge %d CLs, and trigger %d CL Deps",
   277  		componentsUpdated,
   278  		len(s.PB.PurgingCls)-curPurgeCLCount,
   279  		len(s.PB.TriggeringClDeps)-curTriggerCLDepsCount,
   280  	)
   281  	// Finally, decide the final result.
   282  	switch merrs, ok := runsErr.(errors.MultiError); {
   283  	case runsErr == nil || (ok && len(merrs) == 0):
   284  		logging.Infof(ctx, "actOnComponents: created %d Runs, %s", runsCreated, proceedMsg)
   285  		return sideEffect, nil
   286  	case !ok:
   287  		panic(fmt.Errorf("unexpected return from parallel.WorkPool"))
   288  	default:
   289  		logging.Warningf(ctx, "actOnComponents: created %d Runs, failed to create %d Runs", runsCreated, len(merrs))
   290  		if componentsUpdated+len(purgeTasks) == 0 {
   291  			err := common.MostSevereError(merrs)
   292  			return nil, errors.Annotate(err, "failed to actOnComponents, most severe error").Err()
   293  		}
   294  		// All actions are independent, so proceed despite the errors since partial
   295  		// progress is better than none.
   296  		logging.Debugf(ctx, "actOnComponents: %s", proceedMsg)
   297  		return sideEffect, nil
   298  	}
   299  }
   300  
   301  func (h *Handler) createOneRun(ctx context.Context, rc *runcreator.Creator, c *prjpb.Component) (err error) {
   302  	defer paniccatcher.Catch(func(p *paniccatcher.Panic) {
   303  		logging.Errorf(ctx, "caught panic while creating a Run %s\n\n%s", p.Reason, p.Stack)
   304  		err = errCaughtPanic
   305  	})
   306  
   307  	switch _, err = rc.Create(ctx, h.CLMutator, h.PMNotifier, h.RunNotifier); {
   308  	case err == nil:
   309  		return nil
   310  	case runcreator.StateChangedTag.In(err):
   311  		// This is a transient error at component action level: on retry, the Triage()
   312  		// function will re-evaulate the state.
   313  		return transient.Tag.Apply(err)
   314  	default:
   315  		return err
   316  	}
   317  }
   318  
   319  // validatePurgeCLTasks verifies correctness of tasks from Triage.
   320  //
   321  // Modifies given tasks in place.
   322  // Panics in case of problems.
   323  func (s *State) validatePurgeCLTasks(c *prjpb.Component, ts []*prjpb.PurgeCLTask) {
   324  	// First, verify individual tasks have expected fields set.
   325  	m := make(common.CLIDsSet, len(ts))
   326  	for _, t := range ts {
   327  		id := t.GetPurgingCl().GetClid()
   328  		switch {
   329  		case id == 0:
   330  			panic(fmt.Errorf("clid must be set"))
   331  		case m.HasI64(id):
   332  			panic(fmt.Errorf("duplicated clid %d", id))
   333  		case len(t.GetPurgeReasons()) == 0:
   334  			panic(fmt.Errorf("at least 1 reason must be given"))
   335  		}
   336  		for i, r := range t.GetPurgeReasons() {
   337  			if r.GetClError().GetKind() == nil {
   338  				panic(fmt.Errorf("PurgeReason #%d is nil", i))
   339  			}
   340  			if r.GetApplyTo() == nil {
   341  				panic(fmt.Errorf("which trigger(s) the PurgeReason applies to must be specified"))
   342  			}
   343  		}
   344  
   345  		m.AddI64(id)
   346  	}
   347  	// Verify only CLs not yet purged are being purged.
   348  	// NOTE: This iterates all CLs currently being purged, but there should be
   349  	// very few such CLs compared to the total number of tracked CLs.
   350  	for _, p := range s.PB.GetPurgingCls() {
   351  		if m.HasI64(p.GetClid()) {
   352  			panic(fmt.Errorf("can't purge %d CL which is already being purged", p.GetClid()))
   353  		}
   354  	}
   355  	// Verify only CLs from the component are being purged.
   356  	for _, clid := range c.GetClids() {
   357  		m.DelI64(clid)
   358  	}
   359  	if len(m) > 0 {
   360  		panic(fmt.Errorf("purging %v CLs outside the component", m))
   361  	}
   362  }
   363  
   364  // addCLsToPurge changes PB.PurgingCLs and prepares for atomic creation of TQ
   365  // tasks to do actual purge.
   366  //
   367  // Expects given tasks to be correct (see validatePurgeCLTasks).
   368  func (h *Handler) addCLsToPurge(ctx context.Context, s *State, ts []*prjpb.PurgeCLTask) SideEffect {
   369  	if len(ts) == 0 {
   370  		return nil
   371  	}
   372  	s.populatePurgeCLTasks(ctx, ts)
   373  	purgingCLs := make([]*prjpb.PurgingCL, len(ts))
   374  	for i, t := range ts {
   375  		purgingCLs[i] = t.GetPurgingCl()
   376  	}
   377  	s.PB.PurgingCls, _ = s.PB.COWPurgingCLs(nil, purgingCLs)
   378  	return &TriggerPurgeCLTasks{payloads: ts, clPurger: h.CLPurger}
   379  }
   380  
   381  // addCLsTriggerDeps updates PB.TriggerCLDeps and prepares TQ tasks for the CLs
   382  // to trigger.
   383  func (h *Handler) addCLsToTriggerDeps(ctx context.Context, s *State, ts []*prjpb.TriggeringCLDepsTask) SideEffect {
   384  	if len(ts) == 0 {
   385  		return nil
   386  	}
   387  	deadline := timestamppb.New(clock.Now(ctx).Add(prjpb.MaxTriggeringCLDepsDuration))
   388  	opInt := deadline.AsTime().Unix()
   389  	payloads := make([]*prjpb.TriggeringCLDeps, len(ts))
   390  	for i, t := range ts {
   391  		payload := t.GetTriggeringClDeps()
   392  		payload.OperationId = fmt.Sprintf("%d-%d", opInt, payload.GetOriginClid())
   393  		payload.Deadline = deadline
   394  		payloads[i] = payload
   395  		t.LuciProject = s.PB.GetLuciProject()
   396  	}
   397  	s.PB.TriggeringClDeps, _ = s.PB.COWTriggeringCLDeps(nil, payloads)
   398  	return &ScheduleTriggeringCLDepsTasks{payloads: ts, clTriggerer: h.CLTriggerer}
   399  }
   400  
   401  // maxPurgingCLDuration limits the time that a TQ task has to execute
   402  // PurgeCLTask.
   403  const maxPurgingCLDuration = 10 * time.Minute
   404  
   405  // populatePurgeCLTasks populates all remaining fields in PurgeCLsTasks created
   406  // by Triage.
   407  //
   408  // Modifies given tasks in place.
   409  func (s *State) populatePurgeCLTasks(ctx context.Context, ts []*prjpb.PurgeCLTask) {
   410  	deadline := timestamppb.New(clock.Now(ctx).Add(maxPurgingCLDuration))
   411  	opInt := deadline.AsTime().Unix()
   412  	for _, t := range ts {
   413  		id := t.GetPurgingCl().GetClid()
   414  		pcl := s.PB.GetPcls()[s.pclIndex[common.CLID(id)]]
   415  
   416  		t.LuciProject = s.PB.GetLuciProject()
   417  		t.PurgingCl.Deadline = deadline
   418  		t.PurgingCl.OperationId = fmt.Sprintf("%d-%d", opInt, id)
   419  		t.ConfigGroups = make([]string, len(pcl.GetConfigGroupIndexes()))
   420  		for i, idx := range pcl.GetConfigGroupIndexes() {
   421  			id := prjcfg.MakeConfigGroupID(s.PB.GetConfigHash(), s.PB.ConfigGroupNames[idx])
   422  			t.ConfigGroups[i] = string(id)
   423  		}
   424  	}
   425  }
   426  
   427  func (s *State) makeTriageSupporter(ctx context.Context) (*triageSupporter, error) {
   428  	if s.configGroups == nil {
   429  		meta, err := prjcfg.GetHashMeta(ctx, s.PB.GetLuciProject(), s.PB.GetConfigHash())
   430  		if err != nil {
   431  			return nil, err
   432  		}
   433  		if s.configGroups, err = meta.GetConfigGroups(ctx); err != nil {
   434  			return nil, err
   435  		}
   436  	}
   437  	s.ensurePCLIndex()
   438  	purging := make(map[int64]*prjpb.PurgingCL, len(s.PB.GetPurgingCls()))
   439  	for _, p := range s.PB.GetPurgingCls() {
   440  		purging[p.GetClid()] = p
   441  	}
   442  	triggering := make(map[int64]*prjpb.TriggeringCLDeps, len(s.PB.GetTriggeringClDeps()))
   443  	for _, t := range s.PB.GetTriggeringClDeps() {
   444  		triggering[t.GetOriginClid()] = t
   445  	}
   446  
   447  	return &triageSupporter{
   448  		pcls:             s.PB.GetPcls(),
   449  		pclIndex:         s.pclIndex,
   450  		purging:          purging,
   451  		triggeringCLDeps: triggering,
   452  		configGroups:     s.configGroups,
   453  	}, nil
   454  }
   455  
   456  // triageSupporter provides limited access to resources of PM state.
   457  //
   458  // Implements itriager.PMState.
   459  type triageSupporter struct {
   460  	pcls             []*prjpb.PCL
   461  	pclIndex         map[common.CLID]int
   462  	purging          map[int64]*prjpb.PurgingCL
   463  	triggeringCLDeps map[int64]*prjpb.TriggeringCLDeps
   464  	configGroups     []*prjcfg.ConfigGroup
   465  }
   466  
   467  var _ itriager.PMState = (*triageSupporter)(nil)
   468  
   469  func (a *triageSupporter) PCL(clid int64) *prjpb.PCL {
   470  	i, ok := a.pclIndex[common.CLID(clid)]
   471  	if !ok {
   472  		return nil
   473  	}
   474  	return a.pcls[i]
   475  }
   476  
   477  func (a *triageSupporter) PurgingCL(clid int64) *prjpb.PurgingCL {
   478  	return a.purging[clid]
   479  }
   480  
   481  func (a *triageSupporter) ConfigGroup(index int32) *prjcfg.ConfigGroup {
   482  	return a.configGroups[index]
   483  }
   484  
   485  func (a *triageSupporter) TriggeringCLDeps(clid int64) *prjpb.TriggeringCLDeps {
   486  	return a.triggeringCLDeps[clid]
   487  }
   488  
   489  func markForTriage(in []*prjpb.Component) []*prjpb.Component {
   490  	out := make([]*prjpb.Component, len(in))
   491  	for i, c := range in {
   492  		if !c.GetTriageRequired() {
   493  			c = c.CloneShallow()
   494  			c.TriageRequired = true
   495  		}
   496  		out[i] = c
   497  	}
   498  	return out
   499  }
   500  
   501  func markForTriageOnChangedPCLs(in []*prjpb.Component, pcls []*prjpb.PCL, changed common.CLIDsSet) []*prjpb.Component {
   502  	// For each changed CL `A`, expand changed set to include all CLs `B` such
   503  	// that B depends on A.
   504  	reverseDeps := make(map[int64][]int64, len(pcls)) // `A` -> all such `B` CLs
   505  	for _, p := range pcls {
   506  		for _, dep := range p.GetDeps() {
   507  			reverseDeps[dep.GetClid()] = append(reverseDeps[dep.GetClid()], p.GetClid())
   508  		}
   509  	}
   510  	expanded := make(common.CLIDsSet, len(changed))
   511  	var expand func(int64)
   512  	expand = func(clid int64) {
   513  		if expanded.HasI64(clid) {
   514  			return
   515  		}
   516  		expanded.AddI64(clid)
   517  		for _, revDep := range reverseDeps[clid] {
   518  			expand(revDep)
   519  		}
   520  	}
   521  	for clid := range changed {
   522  		expand(int64(clid))
   523  	}
   524  
   525  	out := make([]*prjpb.Component, len(in))
   526  	for i, c := range in {
   527  		if !c.GetTriageRequired() {
   528  			for _, clid := range c.GetClids() {
   529  				if expanded.HasI64(clid) {
   530  					c = c.CloneShallow()
   531  					c.TriageRequired = true
   532  					break
   533  				}
   534  			}
   535  		}
   536  		out[i] = c
   537  	}
   538  	return out
   539  }
   540  
   541  func earlierDeadline(ctx context.Context, reserve time.Duration) (context.Context, context.CancelFunc) {
   542  	deadline, ok := ctx.Deadline()
   543  	if !ok {
   544  		return ctx, func() {} // no deadline
   545  	}
   546  	return clock.WithDeadline(ctx, deadline.Add(-reserve))
   547  }