go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/prjmanager/state/handler.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package state

import (
	"context"
	"fmt"
	"sort"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/data/stringset"
	"go.chromium.org/luci/common/logging"

	"go.chromium.org/luci/cv/internal/changelist"
	"go.chromium.org/luci/cv/internal/common"
	"go.chromium.org/luci/cv/internal/configs/prjcfg"
	"go.chromium.org/luci/cv/internal/gerrit/cfgmatcher"
	"go.chromium.org/luci/cv/internal/gerrit/poller"
	"go.chromium.org/luci/cv/internal/prjmanager"
	"go.chromium.org/luci/cv/internal/prjmanager/clpurger"
	"go.chromium.org/luci/cv/internal/prjmanager/cltriggerer"
	"go.chromium.org/luci/cv/internal/prjmanager/itriager"
	"go.chromium.org/luci/cv/internal/prjmanager/prjpb"
	"go.chromium.org/luci/cv/internal/run"
	"go.chromium.org/luci/cv/internal/tracing"
)

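// RunNotifier abstracts the notifications that the Project Manager sends to
// Runs: starting a new Run, poking or canceling an existing one, and telling
// it about a config change.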
type RunNotifier interface {
	Start(ctx context.Context, id common.RunID) error
	PokeNow(ctx context.Context, id common.RunID) error
	Cancel(ctx context.Context, id common.RunID, reason string) error
	UpdateConfig(ctx context.Context, id common.RunID, hash string, eversion int64) error
}

// Handler handles state transitions of a project.
type Handler struct {
	CLMutator       *changelist.Mutator
	PMNotifier      *prjmanager.Notifier
	RunNotifier     RunNotifier
	CLPurger        *clpurger.Purger
	CLTriggerer     *cltriggerer.Triggerer
	CLPoller        *poller.Poller
	ComponentTriage itriager.Triage
}

// UpdateConfig updates PM to the latest config version.
func (h *Handler) UpdateConfig(ctx context.Context, s *State) (*State, SideEffect, error) {
	s.ensureNotYetCloned()

	meta, err := prjcfg.GetLatestMeta(ctx, s.PB.GetLuciProject())
	if err != nil {
		return nil, nil, err
	}

	switch meta.Status {
	case prjcfg.StatusEnabled:
		if s.PB.GetStatus() == prjpb.Status_STARTED && meta.Hash() == s.PB.GetConfigHash() {
			return s, nil, nil // already up-to-date.
		}

		// Tell poller to update ASAP. It doesn't need to wait for a transaction as
		// it's OK for poller to be temporarily more up-to-date than PM.
		if err := h.CLPoller.Poke(ctx, s.PB.GetLuciProject()); err != nil {
			return nil, nil, err
		}

		if s.PB.Status == prjpb.Status_STARTED {
			s = s.cloneShallow(prjpb.LogReason_CONFIG_CHANGED)
		} else {
			s = s.cloneShallow(prjpb.LogReason_CONFIG_CHANGED, prjpb.LogReason_STATUS_CHANGED)
			s.PB.Status = prjpb.Status_STARTED
		}
		s.PB.ConfigHash = meta.Hash()
		s.PB.ConfigGroupNames = meta.ConfigGroupNames

		if s.configGroups, err = meta.GetConfigGroups(ctx); err != nil {
			return nil, nil, err
		}
		s.cfgMatcher = cfgmatcher.LoadMatcherFromConfigGroups(ctx, s.configGroups, &meta)

		if err = s.reevalPCLs(ctx); err != nil {
			return nil, nil, err
		}
		// New config may mean new conditions for Run creation. Re-triaging all
		// components is required.
		s.PB.Components = markForTriage(s.PB.GetComponents())

		// We may have been in STOPPING phase, in which case incomplete runs may
		// still be finalizing themselves after receiving Cancel event from us.
		// It's harmless to send them UpdateConfig message, too. Eventually, they'll
		// complete finalization, send us OnRunFinished event and then we'll remove
		// them from the state anyway.
		return s, &UpdateIncompleteRunsConfig{
			RunNotifier: h.RunNotifier,
			EVersion:    meta.EVersion,
			Hash:        meta.Hash(),
			RunIDs:      s.PB.IncompleteRuns(),
		}, err

	case prjcfg.StatusDisabled, prjcfg.StatusNotExists:
		// Intentionally not catching up with new ConfigHash (if any),
		// since it's not actionable and also simpler.
		switch s.PB.GetStatus() {
		case prjpb.Status_STATUS_UNSPECIFIED:
			// Project entity doesn't exist. No need to create it.
			return s, nil, nil
		case prjpb.Status_STOPPED:
			return s, nil, nil
		case prjpb.Status_STARTED:
			s = s.cloneShallow(prjpb.LogReason_STATUS_CHANGED)
			s.PB.Status = prjpb.Status_STOPPING
			fallthrough
		case prjpb.Status_STOPPING:
			if err := h.CLPoller.Poke(ctx, s.PB.GetLuciProject()); err != nil {
				return nil, nil, err
			}
			runs := s.PB.IncompleteRuns()
			if len(runs) == 0 {
				s = s.cloneShallow(prjpb.LogReason_STATUS_CHANGED)
				s.PB.Status = prjpb.Status_STOPPED
				return s, nil, nil
			}
			return s, &CancelIncompleteRuns{
				RunNotifier: h.RunNotifier,
				RunIDs:      s.PB.IncompleteRuns(),
			}, nil
		default:
			panic(fmt.Errorf("unexpected project status: %d", s.PB.GetStatus()))
		}
	default:
		panic(fmt.Errorf("unexpected config status: %d", meta.Status))
	}
}

// Poke propagates "the poke" downstream to Poller & Runs.
func (h *Handler) Poke(ctx context.Context, s *State) (*State, SideEffect, error) {
	s.ensureNotYetCloned()

	// First, check if UpdateConfig is necessary.
	switch newState, sideEffect, err := h.UpdateConfig(ctx, s); {
	case err != nil:
		return nil, nil, err
	case newState != s:
		// UpdateConfig noticed a change and its SideEffectFn will propagate it
		// downstream.
		return newState, sideEffect, nil
	}

	// Propagate downstream directly.
	if err := h.CLPoller.Poke(ctx, s.PB.GetLuciProject()); err != nil {
		return nil, nil, err
	}
	if err := h.pokeRuns(ctx, s); err != nil {
		return nil, nil, err
	}
	// Force re-triage of all components.
	s = s.cloneShallow()
	s.PB.Components = markForTriage(s.PB.GetComponents())
	return s, nil, nil
}

// OnRunsCreated updates state after new Runs were created.
func (h *Handler) OnRunsCreated(ctx context.Context, s *State, created common.RunIDs) (_ *State, __ SideEffect, err error) {
	s.ensureNotYetCloned()

	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/OnRunsCreated")
	defer func() { tracing.End(span, err) }()

	// Check if PM is already aware of these Runs.
	remaining := created.Set()
	s.PB.IterIncompleteRuns(func(r *prjpb.PRun, _ *prjpb.Component) (stop bool) {
		delete(remaining, common.RunID(r.GetId()))
		return len(remaining) == 0 // stop if nothing left
	})
	if len(remaining) == 0 {
		return s, nil, nil
	}

	switch s.PB.GetStatus() {
	case prjpb.Status_STARTED:
		s = s.cloneShallow()
		if err := s.addCreatedRuns(ctx, remaining); err != nil {
			return nil, nil, err
		}
		return s, nil, nil
	case prjpb.Status_STOPPED, prjpb.Status_STOPPING:
		// This should not normally happen, but may under rare conditions.
		switch incomplete, err := incompleteRuns(ctx, remaining); {
		case err != nil:
			return nil, nil, err
		case len(incomplete) == 0:
			// All the Runs have actually already finished. Nothing to do, and
			// this is fine.
			return s, nil, nil
		default:
			logging.Errorf(ctx, "RunCreated events for %s on %s Project Manager", incomplete, s.PB.GetStatus())
			return s, &CancelIncompleteRuns{RunNotifier: h.RunNotifier, RunIDs: incomplete}, nil
		}
	default:
		panic(fmt.Errorf("unexpected project status: %d", s.PB.GetStatus()))
	}
}

// OnRunsFinished updates state after Runs were finished.
func (h *Handler) OnRunsFinished(ctx context.Context, s *State, finished map[common.RunID]run.Status) (_ *State, __ SideEffect, err error) {
	s.ensureNotYetCloned()

	_, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/OnRunsFinished")
	defer func() { tracing.End(span, err) }()

	// This is rarely a noop, so assume state is modified for simplicity.
	s = s.cloneShallow()
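	// Collect the failed Runs that may be MCE Runs, so that the CQ votes on
	// their dependent CLs can be purged below.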
	var failedMaybeMCERuns []*prjpb.PRun
	incompleteRunsCount := s.removeFinishedRuns(
		finished, func(r *prjpb.PRun) {
			rid := common.RunID(r.GetId())
			if st, ok := finished[rid]; ok && st == run.Status_FAILED && maybeMCERun(ctx, s, r) {
				failedMaybeMCERuns = append(failedMaybeMCERuns, r)
			}
		},
	)
	if s.PB.GetStatus() == prjpb.Status_STOPPING && incompleteRunsCount == 0 {
		s.LogReasons = append(s.LogReasons, prjpb.LogReason_STATUS_CHANGED)
		s.PB.Status = prjpb.Status_STOPPED
	}
	se := h.addCLsToPurge(ctx, s, makePurgeCLTasksForFailedMCERuns(ctx, s, failedMaybeMCERuns))
	return s, se, nil
}

// OnCLsUpdated updates state as a result of new changes to CLs.
//
// clEVersions must map CL's ID to CL's EVersion.
// clEVersions is mutated.
func (h *Handler) OnCLsUpdated(ctx context.Context, s *State, clEVersions map[int64]int64) (_ *State, __ SideEffect, err error) {
	s.ensureNotYetCloned()

	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/OnCLsUpdated")
	defer func() { tracing.End(span, err) }()

	if s.PB.GetStatus() != prjpb.Status_STARTED {
		// Ignore all incoming CL events. If PM is re-enabled, then the first
		// full poll will force re-sending of OnCLsUpdated event for all still
		// interesting CLs.
		return s, nil, nil
	}

	// Most likely there will be changes to state.
	s = s.cloneShallow()
	if err := s.evalUpdatedCLs(ctx, clEVersions); err != nil {
		return nil, nil, err
	}
	return s, nil, nil
}

// OnPurgesCompleted updates state as a result of completed purge operations.
func (h *Handler) OnPurgesCompleted(ctx context.Context, s *State, events []*prjpb.PurgeCompleted) (_ *State, __ SideEffect, evsToConsume []int, err error) {
	s.ensureNotYetCloned()

	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/OnPurgesCompleted")
	defer func() { tracing.End(span, err) }()

	opIDs := stringset.New(len(events))
	clids := make([]int64, len(events))
	for i, e := range events {
		clids[i] = e.GetClid()
		opIDs.Add(e.GetOperationId())
	}
	if len(clids) > 0 {
		s = s.cloneShallow()
		if err := s.evalCLs(ctx, clids); err != nil {
			return s, nil, nil, err
		}
		for i, clid := range clids {
			switch pcl := s.PB.GetPCL(clid); {
			case pcl.GetOutdated() == nil:
				// Consume the event only if the snapshot is fresh.
				evsToConsume = append(evsToConsume, i)
			default:
				opIDs.Del(events[i].GetOperationId())
			}
		}
	}

	// Give 1 minute grace before expiring purging tasks. This doesn't change
	// correctness, but decreases probability of starting another purge before
	// PM observes CLUpdated event with results of prior purge.
	expireCutOff := clock.Now(ctx).Add(-time.Minute)

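	// Drop the PurgingCLs whose purge operations have completed or expired,
	// remembering their CLIDs so that the owning components can be re-triaged
	// below.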
	deleted := map[int64]struct{}{}
	out, mutated := s.PB.COWPurgingCLs(func(p *prjpb.PurgingCL) *prjpb.PurgingCL {
		if opIDs.Has(p.GetOperationId()) {
			deleted[p.GetClid()] = struct{}{}
			return nil // delete
		}
		if p.GetDeadline().AsTime().Before(expireCutOff) {
			logging.Debugf(ctx, "PurgingCL %d %q expired", p.GetClid(), p.GetOperationId())
			deleted[p.GetClid()] = struct{}{}
			return nil // delete
		}
		return p // keep as is
	}, nil)
	if !mutated {
		return s, nil, evsToConsume, nil
	}

	if !s.alreadyCloned {
		s = s.cloneShallow()
	}
	s.PB.PurgingCls = out

	switch {
	case s.PB.GetRepartitionRequired():
		// All the components will be re-triaged during the repartition process.
	default:
		cs, mutatedComponents := s.PB.COWComponents(func(c *prjpb.Component) *prjpb.Component {
			if c.GetTriageRequired() {
				return c
			}
			for _, id := range c.GetClids() {
				if _, yes := deleted[id]; yes {
					c = c.CloneShallow()
					c.TriageRequired = true
					return c
				}
			}
			return c
		}, nil)
		if mutatedComponents {
			s.PB.Components = cs
		}
	}
	return s, nil, evsToConsume, nil
}

// ExecDeferred performs previously postponed actions, notably creating Runs.
func (h *Handler) ExecDeferred(ctx context.Context, s *State) (_ *State, __ SideEffect, err error) {
	s.ensureNotYetCloned()

	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/ExecDeferred")
	defer func() { tracing.End(span, err) }()

	if s.PB.GetStatus() != prjpb.Status_STARTED {
		return s, nil, nil
	}

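	// Repartition the CLs into components if a repartition was previously
	// requested, or if newly created Runs need to be incorporated first.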
	mutated := false
	if s.PB.GetRepartitionRequired() || len(s.PB.GetCreatedPruns()) > 0 {
		s = s.cloneShallow()
		mutated = true
		cat := s.categorizeCLs(ctx)
		if err := s.loadActiveIntoPCLs(ctx, cat); err != nil {
			return nil, nil, err
		}
		s.repartition(cat)
	}

	var sideEffect SideEffect
	switch actions, saveForDebug, err := h.triageComponents(ctx, s); {
	case err != nil:
		if !mutated {
			return nil, nil, err
		}
		// Don't lose progress made so far.
		logging.Warningf(ctx, "Failed to triageComponents %s, but proceeding to save repartitioned state", err)
	case len(actions) > 0 || saveForDebug:
		if !mutated {
			if saveForDebug {
				s = s.cloneShallow(prjpb.LogReason_DEBUG)
			} else {
				s = s.cloneShallow()
			}
			mutated = true
		}
		sideEffect, err = h.actOnComponents(ctx, s, actions)
		if err != nil {
			return nil, nil, err
		}
	}

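	// The cases below deliberately cascade via fallthrough: compute the next
	// evaluation time, persist it in the state if it changed, and always
	// dispatch a PM task when one is set.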
	switch t, tPB, asap := earliestDecisionTime(s.PB.GetComponents()); {
	case asap:
		t = clock.Now(ctx)
		tPB = timestamppb.New(t)
		fallthrough
	case tPB != nil && !proto.Equal(tPB, s.PB.GetNextEvalTime()):
		if !mutated {
			s = s.cloneShallow()
		}
		s.PB.NextEvalTime = tPB
		fallthrough
	case tPB != nil:
		// Always create a new task if there is NextEvalTime. If it is in the
		// future, it'll be deduplicated as needed.
		if err := h.PMNotifier.TasksBinding.Dispatch(ctx, s.PB.GetLuciProject(), t); err != nil {
			return nil, nil, err
		}
	}
	return s, sideEffect, nil
}

// OnTriggeringCLDepsCompleted manages TriggeringCLDeps completion events.
func (h *Handler) OnTriggeringCLDepsCompleted(ctx context.Context, s *State, events []*prjpb.TriggeringCLDepsCompleted) (_ *State, __ SideEffect, evIndexesToConsume []int, err error) {
	s.ensureNotYetCloned()

	ctx, span := tracing.Start(ctx, "go.chromium.org/luci/cv/internal/prjmanager/impl/state/OnTriggeringCLDepsCompleted")
	defer func() { tracing.End(span, err) }()

	// Give one extra minute before processing an expired op.
	expireCutOff := clock.Now(ctx).Add(-time.Minute)
	opsToRemove := make(map[string]int, len(events))
	var clidsToEval []int64
	var purgeTasks []*prjpb.PurgeCLTask
	for i, evt := range events {
		ctx := logging.SetField(ctx, "origin_cl", evt.GetOrigin())
		switch op := s.PB.GetTriggeringCLDeps(evt.GetOrigin()); {
		case op == nil:
			logging.Warningf(ctx, "OnTriggeringCLDepsCompleted: event arrived but the op(%s) doesn't exist", evt.GetOperationId())
		default:
			if len(evt.GetFailed()) > 0 {
				// If any vote failed, schedule Purge tasks for the origin and all
				// the CLs whose votes succeeded.
				if tasks := purgeFailedTriggeringCLDeps(s, op.GetTrigger(), evt); len(tasks) > 0 {
					logging.Debugf(ctx, "purging votes for %v due to vote failures on %v",
						evt.GetSucceeded(), evt.GetFailed())
					purgeTasks = append(purgeTasks, tasks...)
				}
			}
			for _, clid := range evt.GetSucceeded() {
				if pcl := s.PB.GetPCL(clid); pcl != nil {
					clidsToEval = append(clidsToEval, clid)
				}
			}
		}
		// The event should still be added into opsToRemove, even if
		// there is no matching op in s.PB. Otherwise, the event will be
		// preserved forever.
		opsToRemove[evt.GetOperationId()] = i
	}

	s = s.cloneShallow()
	if len(clidsToEval) > 0 {
		if err := s.evalCLs(ctx, clidsToEval); err != nil {
			return s, nil, nil, err
		}
	}
	for opID, evIndex := range opsToRemove {
		consume := true
		// Ensure that all the succeeded deps are fresh to remove the Op.
		for _, depCLID := range events[evIndex].GetSucceeded() {
			if pcl := s.PB.GetPCL(depCLID); pcl.GetOutdated() != nil {
				delete(opsToRemove, opID)
				consume = false
				break
			}
		}
		if consume {
			evIndexesToConsume = append(evIndexesToConsume, evIndex)
		}
	}
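	// Drop the TriggeringCLDeps ops that have completed or expired, remembering
	// their origin CLs so that the owning components can be re-triaged below.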
	deleted := map[int64]struct{}{}
	out, mutated := s.PB.COWTriggeringCLDeps(func(op *prjpb.TriggeringCLDeps) *prjpb.TriggeringCLDeps {
		if op.GetDeadline().AsTime().Before(expireCutOff) {
			ctx := logging.SetField(ctx, "origin_cl", op.GetOriginClid())
			logging.Warningf(ctx, "TriggeringCLDeps(%s): deadline exceeded", op.GetOperationId())
			deleted[op.GetOriginClid()] = struct{}{}
			return nil // delete
		}
		if _, ok := opsToRemove[op.GetOperationId()]; ok {
			deleted[op.GetOriginClid()] = struct{}{}
			return nil // delete
		}
		return op
	}, nil)
	if !mutated {
		// If there is a CL to purge, there must be an op to remove.
		if len(purgeTasks) > 0 {
			panic(fmt.Errorf("OnTriggeringCLDepsCompleted: BUG"))
		}
		return s, nil, evIndexesToConsume, nil
	}
	s.PB.TriggeringClDeps = out

	switch {
	case s.PB.GetRepartitionRequired():
		// All the components will be re-triaged during the repartition process.
	default:
		cs, mutatedComponents := s.PB.COWComponents(func(c *prjpb.Component) *prjpb.Component {
			if c.GetTriageRequired() {
				return c
			}
			for _, id := range c.GetClids() {
				if _, yes := deleted[id]; yes {
					c = c.CloneShallow()
					c.TriageRequired = true
					return c
				}
			}
			return c
		}, nil)
		if mutatedComponents {
			s.PB.Components = cs
		}
	}
	var se SideEffect
	if len(purgeTasks) > 0 {
		se = h.addCLsToPurge(ctx, s, purgeTasks)
	}
	return s, se, evIndexesToConsume, nil
}

// purgeFailedTriggeringCLDeps creates PurgeCLTasks for the successfully
// voted deps of a given failed TriggeringCLDeps, plus its origin CL.
func purgeFailedTriggeringCLDeps(s *State, tr *run.Trigger, evt *prjpb.TriggeringCLDepsCompleted) []*prjpb.PurgeCLTask {
	depErr := &changelist.CLError_TriggerDeps{}
	for _, err := range evt.GetFailed() {
		proto.Merge(depErr, err)
	}
	reasons := []*prjpb.PurgeReason{{
		ClError: &changelist.CLError{
			Kind: &changelist.CLError_TriggerDeps_{
				TriggerDeps: depErr,
			},
		},
		ApplyTo: &prjpb.PurgeReason_Triggers{
			Triggers: &run.Triggers{
				CqVoteTrigger: tr,
			},
		},
	}}
	ret := make([]*prjpb.PurgeCLTask, 0, len(evt.GetSucceeded())+1)
	for _, clid := range evt.GetSucceeded() {
		if s.PB.GetPurgingCL(clid) != nil {
			continue
		}
		ret = append(ret, &prjpb.PurgeCLTask{
			PurgeReasons: reasons,
			PurgingCl: &prjpb.PurgingCL{
				// No email for purging the CQ vote from deps.
				// The purge operations on the originating CL will send out
				// an email. That should be enough.
				Notification: clpurger.NoNotification,
				Clid:         clid,
				ApplyTo: &prjpb.PurgingCL_Triggers{
					Triggers: &run.Triggers{
						CqVoteTrigger: tr,
					},
				},
			},
		})
	}
	// and the origin CL
	ret = append(ret, &prjpb.PurgeCLTask{
		PurgeReasons: reasons,
		PurgingCl: &prjpb.PurgingCL{
			Clid: evt.GetOrigin(),
			// Nil to send the default notifications.
			Notification: nil,
			ApplyTo: &prjpb.PurgingCL_Triggers{
				Triggers: &run.Triggers{
					CqVoteTrigger: tr,
				},
			},
		},
	})
	return ret
}

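// makePurgeCLTasksForFailedMCERuns creates PurgeCLTasks that remove the CQ
// votes from the CLs that hard-depend on the CL of a failed Run and carry the
// same CQ vote mode, skipping any CL that still has an incomplete Run or is
// already being purged.
//
// To avoid spamming a large stack, at most one of the resulting tasks sends
// the default notifications (see shouldPurgeNotify); the rest are silent.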
func makePurgeCLTasksForFailedMCERuns(ctx context.Context, s *State, failed []*prjpb.PRun) []*prjpb.PurgeCLTask {
	if len(failed) == 0 {
		return nil
	}
	reverseDeps := make(map[int64][]*prjpb.PCL, len(s.PB.GetPcls()))
	for _, p := range s.PB.GetPcls() {
		for _, dep := range p.GetDeps() {
			if dep.GetKind() == changelist.DepKind_HARD {
				reverseDeps[dep.GetClid()] = append(reverseDeps[dep.GetClid()], p)
			}
		}
	}
	incompleteRuns := make(map[int64]struct{})
	s.PB.IterIncompleteRuns(func(r *prjpb.PRun, _ *prjpb.Component) bool {
		if clids := r.GetClids(); len(clids) == 1 {
			incompleteRuns[clids[0]] = struct{}{}
		}
		return false
	})
	tasks := make(map[int64]*prjpb.PurgeCLTask)
	for _, r := range failed {
		for _, child := range reverseDeps[r.GetClids()[0]] {
			// Skip if any of the following is true.
			trigger := child.GetTriggers().GetCqVoteTrigger()
			if trigger.GetMode() != r.GetMode() {
				continue
			}
			if _, ok := incompleteRuns[child.GetClid()]; ok {
				continue
			}
			if s.PB.GetPurgingCL(child.GetClid()) != nil {
				continue
			}
			// At this stage, the current CL
			// - depends on the failed MCE run
			// - has no incomplete Run
			// - has the same CQ vote as the CQ vote of the failed MCE Run.
			tasks[child.GetClid()] = &prjpb.PurgeCLTask{
				PurgeReasons: []*prjpb.PurgeReason{{
					ClError: &changelist.CLError{Kind: &changelist.CLError_DepRunFailed{
						DepRunFailed: r.GetClids()[0],
					}},
					ApplyTo: &prjpb.PurgeReason_Triggers{
						Triggers: &run.Triggers{
							CqVoteTrigger: trigger,
						},
					},
				}},
				PurgingCl: &prjpb.PurgingCL{
					Clid: child.GetClid(),
					// In case a parent Run fails in a huge stack, we want to
					// minimize # of emails sent out by the Purge operations.
					// One mail for the probably-top CL should be enough.
					Notification: clpurger.NoNotification,
					ApplyTo: &prjpb.PurgingCL_Triggers{
						Triggers: &run.Triggers{
							CqVoteTrigger: trigger,
						},
					},
				},
			}
		}
	}
	if len(tasks) == 0 {
		return nil
	}
	var foundCLToNotify bool
	ret := make([]*prjpb.PurgeCLTask, 0, len(tasks))
	for _, t := range tasks {
		clid := t.GetPurgingCl().GetClid()
		if !foundCLToNotify && shouldPurgeNotify(clid, reverseDeps[clid], tasks) {
			// Set nil to let clpurger decide the notification targets, based
			// on the Run mode.
			t.GetPurgingCl().Notification = nil
			foundCLToNotify = true
		}
		ret = append(ret, t)
	}
	sort.Slice(ret, func(i, j int) bool {
		return ret[i].GetPurgingCl().GetClid() < ret[j].GetPurgingCl().GetClid()
	})
	return ret
}

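// shouldPurgeNotify reports whether the purge of the given CL should send
// notifications: it returns false if any of the CL's children is also being
// purged, so that only the topmost purged CL in a stack gets the email.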
func shouldPurgeNotify(clid int64, children []*prjpb.PCL, tasks map[int64]*prjpb.PurgeCLTask) bool {
	for _, child := range children {
		// Don't send an email if the CL has a child whose trigger is also
		// being purged.
		if _, ok := tasks[child.GetClid()]; ok {
			return false
		}
	}
	return true
}