go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/submit.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package handler
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sort"
    21  	"strconv"
    22  	"strings"
    23  	"time"
    24  
    25  	"google.golang.org/protobuf/proto"
    26  	"google.golang.org/protobuf/types/known/timestamppb"
    27  
    28  	"go.chromium.org/luci/common/clock"
    29  	"go.chromium.org/luci/common/errors"
    30  	"go.chromium.org/luci/common/logging"
    31  	gerritpb "go.chromium.org/luci/common/proto/gerrit"
    32  	"go.chromium.org/luci/common/retry/transient"
    33  	"go.chromium.org/luci/common/sync/parallel"
    34  	"go.chromium.org/luci/gae/service/datastore"
    35  	"go.chromium.org/luci/server/tq"
    36  
    37  	cfgpb "go.chromium.org/luci/cv/api/config/v2"
    38  
    39  	"go.chromium.org/luci/cv/internal/common"
    40  	"go.chromium.org/luci/cv/internal/common/eventbox"
    41  	"go.chromium.org/luci/cv/internal/configs/prjcfg"
    42  	"go.chromium.org/luci/cv/internal/gerrit"
    43  	"go.chromium.org/luci/cv/internal/run"
    44  	"go.chromium.org/luci/cv/internal/run/eventpb"
    45  	"go.chromium.org/luci/cv/internal/run/impl/state"
    46  	"go.chromium.org/luci/cv/internal/run/impl/submit"
    47  	"go.chromium.org/luci/cv/internal/run/impl/util"
    48  )
    49  
    50  const submissionPostponedMessage = "Posting Submittion Postponed Message"
    51  
    52  // OnReadyForSubmission implements Handler interface.
    53  func (impl *Impl) OnReadyForSubmission(ctx context.Context, rs *state.RunState) (*Result, error) {
    54  	switch status := rs.Status; {
    55  	case run.IsEnded(status):
    56  		// It is safe to discard this event because this event arrives after Run
    57  		// gets cancelled while waiting for submission.
    58  		logging.Debugf(ctx, "received ReadyForSubmission event when Run is %s", status)
    59  		rs = rs.ShallowCopy()
    60  		// Under certain race conditions, this Run may still occupy the submit
    61  		// queue. So, check first without a transaction and then initiate a
    62  		// transaction to release if this Run currently occupies the submit queue.
    63  		if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil {
    64  			return nil, err
    65  		}
    66  		return &Result{State: rs}, nil
    67  	case status == run.Status_SUBMITTING:
    68  		// Discard this event if this Run is currently submitting. If submission
    69  		// is stopped and should be resumed (e.g. transient failure, app crashing),
    70  		// it should be handled in `OnSubmissionCompleted` or `TryResumeSubmission`.
    71  		logging.Debugf(ctx, "received ReadyForSubmission event when Run is submitting")
    72  		return &Result{State: rs}, nil
    73  	case status == run.Status_RUNNING:
    74  		// This may happen when this Run transitioned from RUNNING status to
    75  		// WAITING_FOR_SUBMISSION, prepared for submission but failed to
    76  		// save the state transition. This Run is receiving this event because
    77  		// of the fail-safe task sent while acquiring the Submit Queue. CV should
    78  		// treat this Run as WAITING_FOR_SUBMISSION status.
    79  		rs = rs.ShallowCopy()
    80  		rs.Status = run.Status_WAITING_FOR_SUBMISSION
    81  	case status != run.Status_WAITING_FOR_SUBMISSION:
    82  		panic(fmt.Errorf("impossible status %s", status))
    83  	}
    84  	// Check DepRuns to see if we can start submitting this run.
    85  	if len(rs.DepRuns) > 0 {
    86  		runs, errs := run.LoadRunsFromIDs(rs.DepRuns...).Do(ctx)
    87  		for i, depr := range runs {
    88  			switch err := errs[i]; {
    89  			case err == datastore.ErrNoSuchEntity:
    90  				panic(fmt.Errorf("run %s has a non-existent DepRuns Run %s", rs.ID, depr.ID))
    91  			case err != nil:
    92  				return nil, errors.Annotate(err, "failed to load run %s", depr.ID).Tag(transient.Tag).Err()
    93  			case !run.IsEnded(depr.Status):
    94  				// If a parent run has not finished we cannot submit this run.
    95  				rs = rs.ShallowCopy()
    96  				runCLs, err := run.LoadRunCLs(ctx, depr.ID, depr.CLs)
    97  				if err != nil {
    98  					return nil, err
    99  				}
   100  				for _, runCL := range runCLs {
   101  					rs.LogInfof(ctx, submissionPostponedMessage, "Submission postponed, waiting for upstream CL %s to be submitted", runCL.ExternalID.MustURL())
   102  				}
   103  				return &Result{State: rs}, nil
   104  			case depr.Status != run.Status_SUCCEEDED:
   105  				// If a parent run has ended but was not successful, this run should not be submitted and
   106  				// the parent run will trigger OnParentRunCompleted to handle cancelling this run.
   107  				return &Result{State: rs}, nil
   108  			}
   109  		}
   110  	}
   111  
   112  	if !run.ShouldSubmit(&rs.Run) {
   113  		panic(fmt.Errorf("impossible, %s runs cannot submit CLs", rs.Mode))
   114  	}
   115  	if len(rs.Submission.GetSubmittedCls()) > 0 {
   116  		panic(fmt.Errorf("impossible; Run %q is in Status_WAITING_FOR_SUBMISSION status but has submitted CLs ", rs.ID))
   117  	}
   118  	rs = rs.ShallowCopy()
   119  	cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  	switch waitlisted, err := acquireSubmitQueue(ctx, rs, impl.RM, cg.SubmitOptions); {
   124  	case err != nil:
   125  		return nil, err
   126  	case waitlisted:
   127  		// This Run will be notified by Submit Queue once its turn comes.
   128  		return &Result{State: rs}, nil
   129  	}
   130  	rs.CloneSubmission()
   131  	switch treeOpen, treeErr := rs.CheckTree(ctx, impl.TreeClient); {
   132  	case treeErr != nil && clock.Since(ctx, rs.Submission.TreeErrorSince.AsTime()) > treeStatusFailureTimeLimit:
   133  		// Failed to fetch status for a long time. Fail the Run.
   134  		if err := releaseSubmitQueue(ctx, rs, impl.RM); err != nil {
   135  			return nil, err
   136  		}
   137  		rims := make(map[common.CLID]reviewInputMeta, len(rs.CLs))
   138  		whoms := rs.Mode.GerritNotifyTargets()
   139  		meta := reviewInputMeta{
   140  			notify: whoms,
   141  			// Add the same set of group/people to the attention set.
   142  			addToAttention: whoms,
   143  			reason:         treeStatusCheckFailedReason,
   144  			message:        fmt.Sprintf(persistentTreeStatusAppFailureTemplate, cg.Content.GetVerifiers().GetTreeStatus().GetUrl()),
   145  		}
   146  		for _, cl := range rs.CLs {
   147  			if !rs.HasRootCL() || rs.RootCL == cl {
   148  				rims[cl] = meta
   149  			}
   150  		}
   151  		scheduleTriggersReset(ctx, rs, rims, run.Status_FAILED)
   152  		return &Result{
   153  			State: rs,
   154  		}, nil
   155  	case treeErr != nil:
   156  		logging.Warningf(ctx, "tree-status check failed with: %s, retrying in %s", treeErr, treeCheckInterval)
   157  		fallthrough
   158  	case !treeOpen:
   159  		err := parallel.WorkPool(2, func(work chan<- func() error) {
   160  			work <- func() error {
   161  				// Tree is closed or status unknown, revisit after 1 minute.
   162  				return impl.RM.PokeAfter(ctx, rs.ID, treeCheckInterval)
   163  			}
   164  			work <- func() error {
   165  				// Give up the Submit Queue while waiting for tree to open.
   166  				return releaseSubmitQueue(ctx, rs, impl.RM)
   167  			}
   168  		})
   169  		if err != nil {
   170  			return nil, common.MostSevereError(err)
   171  		}
   172  		return &Result{State: rs}, nil
   173  	default:
   174  		if err := markSubmitting(ctx, rs); err != nil {
   175  			return nil, err
   176  		}
   177  		s := submit.NewSubmitter(ctx, rs.ID, rs.Submission, impl.RM, impl.GFactory)
   178  		rs.SubmissionScheduled = true
   179  		return &Result{
   180  			State:         rs,
   181  			PostProcessFn: s.Submit,
   182  		}, nil
   183  	}
   184  }
   185  
   186  // OnCLsSubmitted implements Handler interface.
   187  func (*Impl) OnCLsSubmitted(ctx context.Context, rs *state.RunState, clids common.CLIDs) (*Result, error) {
   188  	switch status := rs.Status; {
   189  	case run.IsEnded(status):
   190  		logging.Warningf(ctx, "received CLsSubmitted event when Run is %s", status)
   191  		return &Result{State: rs}, nil
   192  	case status != run.Status_SUBMITTING:
   193  		return nil, errors.Reason("expected SUBMITTING status; got %s", status).Err()
   194  	}
   195  	rs = rs.ShallowCopy()
   196  	rs.Submission = proto.Clone(rs.Submission).(*run.Submission)
   197  	submitted := clids.Set()
   198  	for _, cl := range rs.Submission.GetSubmittedCls() {
   199  		submitted.AddI64(cl)
   200  	}
   201  	if rs.Submission.GetSubmittedCls() != nil {
   202  		rs.Submission.SubmittedCls = rs.Submission.SubmittedCls[:0]
   203  	}
   204  	for _, cl := range rs.Submission.GetCls() {
   205  		if submitted.HasI64(cl) {
   206  			rs.Submission.SubmittedCls = append(rs.Submission.SubmittedCls, cl)
   207  			submitted.DelI64(cl)
   208  		}
   209  	}
   210  	rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   211  		Time: timestamppb.New(clock.Now(ctx)),
   212  		Kind: &run.LogEntry_ClSubmitted{
   213  			ClSubmitted: &run.LogEntry_CLSubmitted{
   214  				NewlySubmittedCls: common.CLIDsAsInt64s(clids),
   215  				TotalSubmitted:    int64(len(rs.Submission.SubmittedCls)),
   216  			},
   217  		},
   218  	})
   219  	if len(submitted) > 0 {
   220  		unexpected := make(sort.IntSlice, 0, len(submitted))
   221  		for clid := range submitted {
   222  			unexpected = append(unexpected, int(clid))
   223  		}
   224  		unexpected.Sort()
   225  		return nil, errors.Reason("received CLsSubmitted event for cls not belonging to this Run: %v", unexpected).Err()
   226  	}
   227  	return &Result{State: rs}, nil
   228  }
   229  
   230  // OnSubmissionCompleted implements Handler interface.
   231  func (impl *Impl) OnSubmissionCompleted(ctx context.Context, rs *state.RunState, sc *eventpb.SubmissionCompleted) (*Result, error) {
   232  	switch status := rs.Status; {
   233  	case run.IsEnded(status):
   234  		logging.Warningf(ctx, "received SubmissionCompleted event when Run is %s", status)
   235  		rs = rs.ShallowCopy()
   236  		if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil {
   237  			return nil, err
   238  		}
   239  		return &Result{State: rs}, nil
   240  	case status != run.Status_SUBMITTING:
   241  		return nil, errors.Reason("expected SUBMITTING status; got %s", status).Err()
   242  	}
   243  
   244  	rs = rs.ShallowCopy()
   245  	if sc.GetQueueReleaseTimestamp() != nil {
   246  		rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   247  			Time: sc.GetQueueReleaseTimestamp(),
   248  			Kind: &run.LogEntry_ReleasedSubmitQueue_{
   249  				ReleasedSubmitQueue: &run.LogEntry_ReleasedSubmitQueue{},
   250  			},
   251  		})
   252  	}
   253  
   254  	cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID)
   255  	if err != nil {
   256  		return nil, errors.Annotate(err, "prjcfg.GetConfigGroup").Err()
   257  	}
   258  	switch sc.GetResult() {
   259  	case eventpb.SubmissionResult_SUCCEEDED:
   260  		childRuns, err := run.LoadChildRuns(ctx, rs.ID)
   261  		if err != nil {
   262  			return nil, errors.Annotate(err, "failed to load child runs").Err()
   263  		}
   264  		se := impl.endRun(ctx, rs, run.Status_SUCCEEDED, cg, childRuns)
   265  		// Only mark run submitted to ResultDB when submission run is successful.
   266  		se = eventbox.Chain(
   267  			se,
   268  			func(ctx context.Context) error {
   269  				return impl.RdbNotifier.Schedule(ctx, rs.ID)
   270  			},
   271  		)
   272  		return &Result{
   273  			State:        rs,
   274  			SideEffectFn: se,
   275  		}, nil
   276  	case eventpb.SubmissionResult_FAILED_TRANSIENT:
   277  		rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   278  			Time: timestamppb.New(clock.Now(ctx)),
   279  			Kind: &run.LogEntry_SubmissionFailure_{
   280  				SubmissionFailure: &run.LogEntry_SubmissionFailure{
   281  					Event: sc,
   282  				},
   283  			},
   284  		})
   285  		return impl.tryResumeSubmission(ctx, rs, sc)
   286  	case eventpb.SubmissionResult_FAILED_PERMANENT:
   287  		if clFailures := sc.GetClFailures(); clFailures != nil {
   288  			failedCLs := make([]int64, len(clFailures.GetFailures()))
   289  			for i, f := range clFailures.GetFailures() {
   290  				failedCLs[i] = f.GetClid()
   291  			}
   292  			rs.Submission = proto.Clone(rs.Submission).(*run.Submission)
   293  			rs.Submission.FailedCls = failedCLs
   294  			rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   295  				Time: timestamppb.New(clock.Now(ctx)),
   296  				Kind: &run.LogEntry_SubmissionFailure_{
   297  					SubmissionFailure: &run.LogEntry_SubmissionFailure{
   298  						Event: sc,
   299  					},
   300  				},
   301  			})
   302  		}
   303  		allRunCLs, err := run.LoadRunCLs(ctx, rs.ID, common.MakeCLIDs(rs.Submission.GetCls()...))
   304  		if err != nil {
   305  			return nil, err
   306  		}
   307  		impl.scheduleResetTriggersForNotSubmittedCLs(ctx, rs, allRunCLs, sc)
   308  		return &Result{State: rs}, nil
   309  	default:
   310  		panic(fmt.Errorf("impossible submission result %s", sc.GetResult()))
   311  	}
   312  }
   313  
   314  // TryResumeSubmission implements Handler interface.
   315  func (impl *Impl) TryResumeSubmission(ctx context.Context, rs *state.RunState) (*Result, error) {
   316  	return impl.tryResumeSubmission(ctx, rs, nil)
   317  }
   318  
   319  func (impl *Impl) tryResumeSubmission(ctx context.Context, rs *state.RunState, sc *eventpb.SubmissionCompleted) (*Result, error) {
   320  	switch {
   321  	case rs.Status != run.Status_SUBMITTING || rs.SubmissionScheduled:
   322  		return &Result{State: rs}, nil
   323  	case sc != nil && sc.Result != eventpb.SubmissionResult_FAILED_TRANSIENT:
   324  		panic(fmt.Errorf("submission can only be resumed on nil submission completed event or event reporting transient failure; got %s", sc))
   325  	}
   326  
   327  	deadline := rs.Submission.GetDeadline()
   328  	taskID := rs.Submission.GetTaskId()
   329  	switch {
   330  	case deadline == nil:
   331  		panic(fmt.Errorf("impossible: run %q is submitting but Run.Submission.Deadline is not set", rs.ID))
   332  	case taskID == "":
   333  		panic(fmt.Errorf("impossible: run %q is submitting but Run.Submission.TaskId is not set", rs.ID))
   334  	}
   335  
   336  	switch expired := clock.Now(ctx).After(deadline.AsTime()); {
   337  	case expired:
   338  		cg, err := prjcfg.GetConfigGroup(ctx, rs.ID.LUCIProject(), rs.ConfigGroupID)
   339  		if err != nil {
   340  			return nil, errors.Annotate(err, "prjcfg.GetConfigGroup").Err()
   341  		}
   342  
   343  		rs = rs.ShallowCopy()
   344  		switch submittedCnt := len(rs.Submission.GetSubmittedCls()); {
   345  		case submittedCnt > 0 && submittedCnt == len(rs.Submission.GetCls()):
   346  			// Fully submitted
   347  			if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil {
   348  				return nil, err
   349  			}
   350  			childRuns, err := run.LoadChildRuns(ctx, rs.ID)
   351  			if err != nil {
   352  				return nil, errors.Annotate(err, "failed to load child runs").Err()
   353  			}
   354  			return &Result{
   355  				State:        rs,
   356  				SideEffectFn: impl.endRun(ctx, rs, run.Status_SUCCEEDED, cg, childRuns),
   357  			}, nil
   358  		default: // None submitted or partially submitted
   359  			// Synthesize a submission completed event with permanent failure.
   360  			if clFailures := sc.GetClFailures(); clFailures != nil {
   361  				rs.Submission = proto.Clone(rs.Submission).(*run.Submission)
   362  				rs.Submission.FailedCls = make([]int64, len(clFailures.GetFailures()))
   363  				sc = &eventpb.SubmissionCompleted{
   364  					Result: eventpb.SubmissionResult_FAILED_PERMANENT,
   365  					FailureReason: &eventpb.SubmissionCompleted_ClFailures{
   366  						ClFailures: &eventpb.SubmissionCompleted_CLSubmissionFailures{
   367  							Failures: make([]*eventpb.SubmissionCompleted_CLSubmissionFailure, len(clFailures.GetFailures())),
   368  						},
   369  					},
   370  				}
   371  				for i, f := range clFailures.GetFailures() {
   372  					rs.Submission.FailedCls[i] = f.GetClid()
   373  					sc.GetClFailures().Failures[i] = &eventpb.SubmissionCompleted_CLSubmissionFailure{
   374  						Clid:    f.GetClid(),
   375  						Message: fmt.Sprintf("CL failed to submit because of transient failure: %s. However, submission is running out of time to retry.", f.GetMessage()),
   376  					}
   377  				}
   378  			} else {
   379  				sc = &eventpb.SubmissionCompleted{
   380  					Result: eventpb.SubmissionResult_FAILED_PERMANENT,
   381  					FailureReason: &eventpb.SubmissionCompleted_Timeout{
   382  						Timeout: true,
   383  					},
   384  				}
   385  			}
   386  			allRunCLs, err := run.LoadRunCLs(ctx, rs.ID, common.MakeCLIDs(rs.Submission.GetCls()...))
   387  			if err != nil {
   388  				return nil, err
   389  			}
   390  			impl.scheduleResetTriggersForNotSubmittedCLs(ctx, rs, allRunCLs, sc)
   391  			if err := releaseSubmitQueueIfTaken(ctx, rs, impl.RM); err != nil {
   392  				return nil, err
   393  			}
   394  			return &Result{State: rs}, nil
   395  		}
   396  	case taskID == mustTaskIDFromContext(ctx):
   397  		// Matching taskID indicates current task is the retry of a previous
   398  		// submitting task that has failed transiently. Continue the submission.
   399  		rs = rs.ShallowCopy()
   400  		s := submit.NewSubmitter(ctx, rs.ID, rs.Submission, impl.RM, impl.GFactory)
   401  		rs.SubmissionScheduled = true
   402  		return &Result{
   403  			State:         rs,
   404  			PostProcessFn: s.Submit,
   405  		}, nil
   406  	default:
   407  		// Presumably another task is working on the submission at this time.
   408  		// So, wake up RM as soon as the submission expires. Meanwhile, don't
   409  		// consume the event as the retries of submission task will process
   410  		// this event. It's probably a race condition that this task sees this
   411  		// event first.
   412  		if err := impl.RM.Invoke(ctx, rs.ID, deadline.AsTime()); err != nil {
   413  			return nil, err
   414  		}
   415  		return &Result{
   416  			State:          rs,
   417  			PreserveEvents: true,
   418  		}, nil
   419  	}
   420  }
   421  
   422  func acquireSubmitQueue(ctx context.Context, rs *state.RunState, rm RM, opts *cfgpb.SubmitOptions) (waitlisted bool, err error) {
   423  	now := clock.Now(ctx).UTC()
   424  	var innerErr error
   425  	err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   426  		waitlisted, innerErr = submit.TryAcquire(ctx, rm.NotifyReadyForSubmission, rs.ID, opts)
   427  		switch {
   428  		case innerErr != nil:
   429  			return innerErr
   430  		case !waitlisted:
   431  			// It is possible that RM fails before successfully completing the state
   432  			// transition. In that case, this Run will block Submit Queue infinitely.
   433  			// Sending a ReadyForSubmission event after 10s as a fail-safe to ensure
   434  			// Run keeps making progress.
   435  			return rm.NotifyReadyForSubmission(ctx, rs.ID, now.Add(10*time.Second))
   436  		default:
   437  			return nil
   438  		}
   439  	}, nil)
   440  	switch {
   441  	case innerErr != nil:
   442  		return false, innerErr
   443  	case err != nil:
   444  		return false, errors.Annotate(err, "failed to run the transaction to acquire submit queue").Tag(transient.Tag).Err()
   445  	case waitlisted:
   446  		rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   447  			Time: timestamppb.New(clock.Now(ctx)),
   448  			Kind: &run.LogEntry_Waitlisted_{
   449  				Waitlisted: &run.LogEntry_Waitlisted{},
   450  			},
   451  		})
   452  		logging.Debugf(ctx, "Waitlisted in Submit Queue")
   453  		return true, nil
   454  	default:
   455  		rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   456  			Time: timestamppb.New(clock.Now(ctx)),
   457  			Kind: &run.LogEntry_AcquiredSubmitQueue_{
   458  				AcquiredSubmitQueue: &run.LogEntry_AcquiredSubmitQueue{},
   459  			},
   460  		})
   461  		logging.Debugf(ctx, "Acquired Submit Queue")
   462  		return false, nil
   463  	}
   464  }
   465  
   466  // releaseSubmitQueueIfTaken checks if submit queue is occupied by the given
   467  // Run before trying to release.
   468  func releaseSubmitQueueIfTaken(ctx context.Context, rs *state.RunState, rm RM) error {
   469  	switch current, waitlist, err := submit.LoadCurrentAndWaitlist(ctx, rs.ID); {
   470  	case err != nil:
   471  		return err
   472  	case current == rs.ID:
   473  		return releaseSubmitQueue(ctx, rs, rm)
   474  	default:
   475  		for _, w := range waitlist {
   476  			if w == rs.ID {
   477  				return releaseSubmitQueue(ctx, rs, rm)
   478  			}
   479  		}
   480  	}
   481  	return nil
   482  }
   483  
   484  func releaseSubmitQueue(ctx context.Context, rs *state.RunState, rm RM) error {
   485  	var innerErr error
   486  	err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   487  		innerErr = submit.Release(ctx, rm.NotifyReadyForSubmission, rs.ID)
   488  		return innerErr
   489  	}, nil)
   490  	switch {
   491  	case innerErr != nil:
   492  		return innerErr
   493  	case err != nil:
   494  		return errors.Annotate(err, "failed to release submit queue").Tag(transient.Tag).Err()
   495  	}
   496  	rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   497  		Time: timestamppb.New(clock.Now(ctx)),
   498  		Kind: &run.LogEntry_ReleasedSubmitQueue_{
   499  			ReleasedSubmitQueue: &run.LogEntry_ReleasedSubmitQueue{},
   500  		},
   501  	})
   502  	logging.Debugf(ctx, "Released Submit Queue")
   503  	return nil
   504  }
   505  
   506  const defaultSubmissionDuration = 20 * time.Minute
   507  
   508  func markSubmitting(ctx context.Context, rs *state.RunState) error {
   509  	rs.Status = run.Status_SUBMITTING
   510  	cls, err := run.LoadRunCLs(ctx, rs.ID, rs.CLs)
   511  	if err != nil {
   512  		return err
   513  	}
   514  	if rs.Submission.Cls, err = orderCLsInSubmissionOrder(ctx, cls, rs.ID, rs.Submission); err != nil {
   515  		return err
   516  	}
   517  	// TODO(b/267966142): Find a less hacky way to set the timeout for CLs
   518  	// that are known to slow to land.
   519  	submissionDuration := defaultSubmissionDuration
   520  	if hasCrOSKernelChange(rs.ID.LUCIProject(), cls) {
   521  		submissionDuration = 40 * time.Minute
   522  	}
   523  	rs.Submission.Deadline = timestamppb.New(clock.Now(ctx).UTC().Add(submissionDuration))
   524  	rs.Submission.TaskId = mustTaskIDFromContext(ctx)
   525  	return nil
   526  }
   527  
   528  func (impl *Impl) scheduleResetTriggersForNotSubmittedCLs(ctx context.Context, rs *state.RunState, allRunCLs []*run.RunCL, sc *eventpb.SubmissionCompleted) {
   529  	whoms := rs.Mode.GerritNotifyTargets()
   530  	// Single-CL Run
   531  	if len(allRunCLs) == 1 {
   532  		meta := reviewInputMeta{
   533  			notify:         whoms,
   534  			addToAttention: whoms,
   535  			reason:         submissionFailureAttentionReason,
   536  		}
   537  		switch {
   538  		case sc.GetClFailures() != nil:
   539  			failures := sc.GetClFailures().GetFailures()
   540  			if len(failures) != 1 {
   541  				panic(fmt.Errorf("expected exactly 1 failed CL, got %v", failures))
   542  			}
   543  			meta.message = failures[0].GetMessage()
   544  		case sc.GetTimeout():
   545  			meta.message = timeoutMsg
   546  		default:
   547  			meta.message = defaultMsg
   548  		}
   549  		scheduleTriggersReset(ctx, rs, map[common.CLID]reviewInputMeta{
   550  			allRunCLs[0].ID: meta,
   551  		}, run.Status_FAILED)
   552  		return
   553  	}
   554  
   555  	// Multi-CL Run
   556  	submitted, failed, pending := splitRunCLs(allRunCLs, rs.Submission, sc)
   557  	msgSuffix := makeSubmissionMsgSuffix(submitted, failed, pending)
   558  	// Multi-CL Run with root CL (i.e. instantly triggered multi-CL Run)
   559  	if rs.HasRootCL() {
   560  		meta := reviewInputMeta{
   561  			notify:         whoms,
   562  			addToAttention: whoms,
   563  			reason:         submissionFailureAttentionReason,
   564  		}
   565  		switch clFailures := sc.GetClFailures().GetFailures(); {
   566  		case len(clFailures) == 1 && clFailures[0].GetClid() == int64(rs.RootCL):
   567  			// this is the case where the only failed CL is the root CL.
   568  			meta.message = fmt.Sprintf("%s\n\n%s", sc.GetClFailures().GetFailures()[0].GetMessage(), msgSuffix)
   569  		case len(clFailures) > 0:
   570  			// Summarize a list of the failed CLs and post it onto the root CL.
   571  			var sb strings.Builder
   572  			sb.WriteString(partiallySubmittedMsgForRootCL)
   573  			failureMsgByCLID := map[common.CLID]string{}
   574  			for _, f := range sc.GetClFailures().GetFailures() {
   575  				failureMsgByCLID[common.CLID(f.GetClid())] = f.GetMessage()
   576  			}
   577  			for _, f := range failed {
   578  				fmt.Fprintf(&sb, "\n* %s: %s", f.ExternalID.MustURL(), failureMsgByCLID[f.ID])
   579  			}
   580  			fmt.Fprint(&sb, "\n\n")
   581  			fmt.Fprint(&sb, msgSuffix)
   582  			meta.message = sb.String()
   583  		case sc.GetTimeout():
   584  			meta.message = fmt.Sprintf("%s\n\n%s", timeoutMsg, msgSuffix)
   585  		default:
   586  			meta.message = fmt.Sprintf("%s\n\n%s", defaultMsg, msgSuffix)
   587  		}
   588  		scheduleTriggersReset(ctx, rs, map[common.CLID]reviewInputMeta{
   589  			rs.RootCL: meta,
   590  		}, run.Status_FAILED)
   591  		return
   592  	}
   593  	// Multi-CL Run without root CL
   594  	metas := make(map[common.CLID]reviewInputMeta, len(failed)+len(pending))
   595  	switch {
   596  	case sc.GetClFailures() != nil:
   597  		// Compute metas for CLs that fail to submit
   598  		for _, f := range sc.GetClFailures().GetFailures() {
   599  			metas[common.CLID(f.GetClid())] = reviewInputMeta{
   600  				notify:         whoms,
   601  				message:        fmt.Sprintf("%s\n\n%s", f.GetMessage(), msgSuffix),
   602  				addToAttention: whoms,
   603  				reason:         submissionFailureAttentionReason,
   604  			}
   605  		}
   606  
   607  		// Compute the metas for CLs that CV didn't attempt to submit.
   608  		var sb strings.Builder
   609  		// TODO(yiwzhang): Once CV learns how to submit multiple CLs in parallel,
   610  		// this should be optimized to print out failed CLs that each pending CL
   611  		// depends on instead of printing out all failed CLs.
   612  		// Example: considering a CL group where CL B and CL C are submitted in
   613  		// parallel and neither of them succeeds:
   614  		//   A (submitted)
   615  		//   |
   616  		//   |--> B (failed) --> D (pending)
   617  		//   |
   618  		//   |--> C (failed) --> E (pending)
   619  		// the message CV posts on CL D should only include the fact that CV fails
   620  		// to submit CL B.
   621  		for _, f := range failed {
   622  			fmt.Fprintf(&sb, "\n* %s", f.ExternalID.MustURL())
   623  		}
   624  		fmt.Fprint(&sb, "\n\n")
   625  		fmt.Fprint(&sb, msgSuffix)
   626  		pendingMeta := reviewInputMeta{
   627  			notify:         whoms,
   628  			message:        fmt.Sprintf("%s%s", partiallySubmittedMsgForPendingCLs, sb.String()),
   629  			addToAttention: whoms,
   630  			reason:         submissionFailureAttentionReason,
   631  		}
   632  		for _, pendingCL := range pending {
   633  			metas[pendingCL.ID] = pendingMeta
   634  		}
   635  		scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED)
   636  
   637  		// Post messages to submitted CLs.
   638  		// TODO(yiwzhang): model message posting as a long op.
   639  		if len(submitted) > 0 {
   640  			msg := fmt.Sprintf("%s%s", partiallySubmittedMsgForSubmittedCLs, sb.String())
   641  			err := parallel.FanOutIn(func(workCh chan<- func() error) {
   642  				for _, submittedCL := range submitted {
   643  					submittedCL := submittedCL
   644  					workCh <- func() error {
   645  						// Swallow the error because these messages are not critical and
   646  						// it's okay to not posting them during a Gerrit hiccup.
   647  						if err := postMsgForDependentFailures(ctx, impl.GFactory, submittedCL, msg); err != nil {
   648  							logging.Warningf(ctx, "failed to post messages for dependent submission failure on submitted changes: %s", err)
   649  						}
   650  						return nil
   651  					}
   652  				}
   653  			})
   654  			if err != nil {
   655  				panic(fmt.Errorf("impossible parallel error: %w", err))
   656  			}
   657  		}
   658  	case sc.GetTimeout():
   659  		meta := reviewInputMeta{
   660  			notify:         whoms,
   661  			message:        fmt.Sprintf("%s\n\n%s", timeoutMsg, msgSuffix),
   662  			addToAttention: whoms,
   663  			reason:         submissionFailureAttentionReason,
   664  		}
   665  		for _, cl := range pending {
   666  			metas[cl.ID] = meta
   667  		}
   668  	default:
   669  		meta := reviewInputMeta{
   670  			notify:         whoms,
   671  			message:        fmt.Sprintf("%s\n\n%s", defaultMsg, msgSuffix),
   672  			addToAttention: whoms,
   673  			reason:         submissionFailureAttentionReason,
   674  		}
   675  		for _, cl := range pending {
   676  			metas[cl.ID] = meta
   677  		}
   678  	}
   679  	scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED)
   680  }
   681  
   682  func makeSubmissionMsgSuffix(submitted, failed, pending []*run.RunCL) string {
   683  	submittedURLs := make([]string, len(submitted))
   684  	for i, cl := range submitted {
   685  		submittedURLs[i] = cl.ExternalID.MustURL()
   686  	}
   687  	notSubmittedURLs := make([]string, len(failed)+len(pending))
   688  	for i, cl := range failed {
   689  		notSubmittedURLs[i] = cl.ExternalID.MustURL()
   690  	}
   691  	for i, cl := range pending {
   692  		notSubmittedURLs[len(failed)+i] = cl.ExternalID.MustURL()
   693  	}
   694  	if len(submittedURLs) > 0 { // partial submission
   695  		return fmt.Sprintf(partiallySubmittedMsgSuffixFmt,
   696  			strings.Join(notSubmittedURLs, "\n* "),
   697  			strings.Join(submittedURLs, "\n* "),
   698  		)
   699  	}
   700  	return fmt.Sprintf(noneSubmittedMsgSuffixFmt, strings.Join(notSubmittedURLs, "\n* "))
   701  }
   702  
   703  ////////////////////////////////////////////////////////////////////////////////
   704  // Helper methods
   705  
   706  var fakeTaskIDKey = "used in handler tests only for setting the mock taskID"
   707  
   708  func mustTaskIDFromContext(ctx context.Context) string {
   709  	if taskID, ok := ctx.Value(&fakeTaskIDKey).(string); ok {
   710  		return taskID
   711  	}
   712  	switch executionInfo := tq.TaskExecutionInfo(ctx); {
   713  	case executionInfo == nil:
   714  		panic("must be called within a task handler")
   715  	case executionInfo.TaskID == "":
   716  		panic("taskID in task executionInfo is empty")
   717  	default:
   718  		return executionInfo.TaskID
   719  	}
   720  }
   721  func orderCLsInSubmissionOrder(ctx context.Context, cls []*run.RunCL, runID common.RunID, sub *run.Submission) ([]int64, error) {
   722  	cls, err := submit.ComputeOrder(cls)
   723  	if err != nil {
   724  		return nil, err
   725  	}
   726  	ret := make([]int64, len(cls))
   727  	for i, cl := range cls {
   728  		ret[i] = int64(cl.ID)
   729  	}
   730  	return ret, nil
   731  }
   732  
   733  func hasCrOSKernelChange(luciProject string, cls []*run.RunCL) bool {
   734  	if luciProject != "chromeos" {
   735  		return false
   736  	}
   737  	for _, cl := range cls {
   738  		if cl.Detail.GetGerrit().GetInfo().GetProject() == "chromiumos/third_party/kernel" {
   739  			return true
   740  		}
   741  	}
   742  	return false
   743  }
   744  func splitRunCLs(cls []*run.RunCL, submission *run.Submission, sc *eventpb.SubmissionCompleted) (submitted, failed, pending []*run.RunCL) {
   745  	submittedSet := common.MakeCLIDsSet(submission.GetSubmittedCls()...)
   746  	failedSet := make(common.CLIDsSet, len(sc.GetClFailures().GetFailures()))
   747  	for _, f := range sc.GetClFailures().GetFailures() {
   748  		if submittedSet.HasI64(f.GetClid()) {
   749  			panic(fmt.Errorf("impossible; cl %d is marked both submitted and failed", f.GetClid()))
   750  		}
   751  		failedSet.AddI64(f.GetClid())
   752  	}
   753  
   754  	submitted = make([]*run.RunCL, 0, len(submittedSet))
   755  	failed = make([]*run.RunCL, 0, len(failedSet))
   756  	pending = make([]*run.RunCL, 0, len(cls)-len(submittedSet)-len(failedSet))
   757  	for _, cl := range cls {
   758  		switch {
   759  		case submittedSet.Has(cl.ID):
   760  			submitted = append(submitted, cl)
   761  		case failedSet.Has(cl.ID):
   762  			failed = append(failed, cl)
   763  		default:
   764  			pending = append(pending, cl)
   765  		}
   766  	}
   767  	return submitted, failed, pending
   768  }
   769  
   770  // TODO(crbug/1302119): Replace terms like "Project admin" with dedicated
   771  // contact sourced from Project Config.
   772  const (
   773  	cvBugLink  = "https://issues.chromium.org/issues/new?component=1456237"
   774  	defaultMsg = "Submission of this CL failed due to unexpected internal " +
   775  		"error. Please contact LUCI team.\n\n" + cvBugLink
   776  	noneSubmittedMsgSuffixFmt = "None of the CLs in the Run has been " +
   777  		"submitted. CLs:\n* %s"
   778  	partiallySubmittedMsgForRootCL     = "Failed to submit the following CL(s):"
   779  	partiallySubmittedMsgForPendingCLs = "This CL is not submitted because " +
   780  		"submission has failed for the following CL(s) which this CL depends on."
   781  	partiallySubmittedMsgForSubmittedCLs = "This CL is submitted. However, " +
   782  		"submission has failed for the following CL(s) which depend on this CL."
   783  	partiallySubmittedMsgSuffixFmt = "CLs in the Run have been submitted " +
   784  		"partially.\nNot submitted:\n* %s\nSubmitted:\n* %s\n" +
   785  		"Please, use your judgement to determine if already submitted CLs have " +
   786  		"to be reverted, or if the remaining CLs could be manually submitted. " +
   787  		"If you think the partially submitted CLs may have broken the " +
   788  		"tip-of-tree of your project, consider notifying your infrastructure " +
   789  		"team/gardeners/sheriffs."
   790  	timeoutMsg = "Ran out of time to submit this CL. " +
   791  		// TODO(yiwzhang): Generally, time out means CV is doing something
   792  		// wrong and looping over internally, However, timeout could also
   793  		// happen when submitting large CL stack and Gerrit is slow. In that
   794  		// case, CV can't do anything about it. After launching m1, gather data
   795  		// to see under what circumstance it may happen and revise this message
   796  		// so that CV doesn't get blamed for timeout it isn't responsible for.
   797  		"Please contact LUCI team.\n\n" + cvBugLink
   798  	persistentTreeStatusAppFailureTemplate = "Could not submit this CL " +
   799  		"because the tree status app at %s repeatedly returned failures. "
   800  	treeStatusCheckFailedReason      = "Tree status check failed."
   801  	submissionFailureAttentionReason = "Submission failed."
   802  )
   803  
   804  // postMsgForDependentFailures posts a review message to
   805  // a given CL to notify submission failures of the dependent CLs.
   806  func postMsgForDependentFailures(ctx context.Context, gf gerrit.Factory, rcl *run.RunCL, msg string) error {
   807  	queryOpts := []gerritpb.QueryOption{gerritpb.QueryOption_MESSAGES}
   808  	posted, err := util.IsActionTakenOnGerritCL(ctx, gf, rcl, queryOpts, func(rcl *run.RunCL, ci *gerritpb.ChangeInfo) time.Time {
   809  		// In practice, Gerrit generally orders the messages from earliest to
   810  		// latest. So iterating in the reverse order because it's more likely the
   811  		// desired message is posted recently. Also don't visit any messages before
   812  		// the run trigger as those messages should belong to previous Runs.
   813  		clTriggeredAt := rcl.Trigger.Time.AsTime()
   814  		for i := len(ci.GetMessages()) - 1; i >= 0; i-- {
   815  			m := ci.GetMessages()[i]
   816  			switch t := m.GetDate().AsTime(); {
   817  			case t.Before(clTriggeredAt):
   818  				// i-th message is too old, no need to check even older ones.
   819  				return time.Time{}
   820  			case strings.Contains(m.GetMessage(), msg):
   821  				return t
   822  			}
   823  		}
   824  		return time.Time{}
   825  	})
   826  
   827  	switch {
   828  	case err != nil:
   829  		return err
   830  	case !posted.IsZero():
   831  		return nil
   832  	}
   833  
   834  	ci := rcl.Detail.GetGerrit().GetInfo()
   835  	ownerAndVotersAccounts := gerrit.Whoms{gerrit.Whom_OWNER, gerrit.Whom_CQ_VOTERS}.ToAccountIDsSorted(ci)
   836  	req := &gerritpb.SetReviewRequest{
   837  		Number:     ci.GetNumber(),
   838  		Project:    ci.GetProject(),
   839  		RevisionId: ci.GetCurrentRevision(),
   840  		Message:    msg,
   841  		Tag:        run.FullRun.GerritMessageTag(),
   842  		Notify:     gerritpb.Notify_NOTIFY_NONE,
   843  		NotifyDetails: &gerritpb.NotifyDetails{
   844  			Recipients: []*gerritpb.NotifyDetails_Recipient{
   845  				{
   846  					RecipientType: gerritpb.NotifyDetails_RECIPIENT_TYPE_TO,
   847  					Info: &gerritpb.NotifyDetails_Info{
   848  						Accounts: ownerAndVotersAccounts,
   849  					},
   850  				},
   851  			},
   852  		},
   853  		AddToAttentionSet: make([]*gerritpb.AttentionSetInput, len(ownerAndVotersAccounts)),
   854  	}
   855  	reason := fmt.Sprintf("ps#%d: failed to submit dependent CLs",
   856  		ci.GetRevisions()[ci.GetCurrentRevision()].GetNumber())
   857  	for i, acct := range ownerAndVotersAccounts {
   858  		req.AddToAttentionSet[i] = &gerritpb.AttentionSetInput{
   859  			User:   strconv.Itoa(int(acct)),
   860  			Reason: reason,
   861  		}
   862  	}
   863  	return util.MutateGerritCL(ctx, gf, rcl, req, 1*time.Minute, "post-msg-for-dependent-failure")
   864  }