go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/tryjobs.go (about)

     1  // Copyright 2021 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package handler
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"slices"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  	"time"
    25  
    26  	"google.golang.org/protobuf/proto"
    27  	"google.golang.org/protobuf/types/known/timestamppb"
    28  
    29  	bbutil "go.chromium.org/luci/buildbucket/protoutil"
    30  	"go.chromium.org/luci/common/clock"
    31  	"go.chromium.org/luci/common/errors"
    32  	"go.chromium.org/luci/common/logging"
    33  
    34  	cfgpb "go.chromium.org/luci/cv/api/config/v2"
    35  	"go.chromium.org/luci/cv/internal/common"
    36  	"go.chromium.org/luci/cv/internal/run"
    37  	"go.chromium.org/luci/cv/internal/run/eventpb"
    38  	"go.chromium.org/luci/cv/internal/run/impl/state"
    39  	"go.chromium.org/luci/cv/internal/tryjob"
    40  )
    41  
    42  const (
    43  	// maxTryjobExecutorDuration is the max time that the tryjob executor
    44  	// can process.
    45  	maxTryjobExecutorDuration = 8 * time.Minute
    46  )
    47  
    48  // OnTryjobsUpdated implements Handler interface.
    49  func (impl *Impl) OnTryjobsUpdated(ctx context.Context, rs *state.RunState, tryjobs common.TryjobIDs) (*Result, error) {
    50  	switch status := rs.Status; {
    51  	case run.IsEnded(status):
    52  		fallthrough
    53  	case status == run.Status_WAITING_FOR_SUBMISSION || status == run.Status_SUBMITTING:
    54  		logging.Debugf(ctx, "Ignoring Tryjobs event because Run is in status %s", status)
    55  		return &Result{State: rs}, nil
    56  	case status != run.Status_RUNNING:
    57  		return nil, errors.Reason("expected RUNNING status, got %s", status).Err()
    58  	case hasExecuteTryjobLongOp(rs):
    59  		// Process this event after the current tryjob executor finishes running.
    60  		return &Result{State: rs, PreserveEvents: true}, nil
    61  	default:
    62  		tryjobs.Dedupe()
    63  		slices.Sort(tryjobs)
    64  		rs = rs.ShallowCopy()
    65  		enqueueTryjobsUpdatedTask(ctx, rs, tryjobs)
    66  		return &Result{State: rs}, nil
    67  	}
    68  }
    69  
    70  func (impl *Impl) onCompletedExecuteTryjobs(ctx context.Context, rs *state.RunState, _ *run.OngoingLongOps_Op, opCompleted *eventpb.LongOpCompleted) (*Result, error) {
    71  	opID := opCompleted.GetOperationId()
    72  	rs = rs.ShallowCopy()
    73  	rs.RemoveCompletedLongOp(opID)
    74  	if rs.Status != run.Status_RUNNING {
    75  		logging.Warningf(ctx, "long operation to execute Tryjobs has completed but Run is %s.", rs.Status)
    76  		return &Result{State: rs}, nil
    77  	}
    78  	var runStatus run.Status
    79  	switch opCompleted.GetStatus() {
    80  	case eventpb.LongOpCompleted_EXPIRED:
    81  		// Tryjob executor timeout.
    82  		fallthrough
    83  	case eventpb.LongOpCompleted_FAILED:
    84  		// normally indicates tryjob executor itself encounters error (e.g. failed
    85  		// to read from datastore).
    86  		runStatus = run.Status_FAILED
    87  	case eventpb.LongOpCompleted_SUCCEEDED:
    88  		switch es, _, err := tryjob.LoadExecutionState(ctx, rs.ID); {
    89  		case err != nil:
    90  			return nil, err
    91  		case es == nil:
    92  			panic(fmt.Errorf("impossible; Execute Tryjobs task succeeded but ExecutionState was missing"))
    93  		default:
    94  			if rs.Tryjobs == nil {
    95  				rs.Tryjobs = &run.Tryjobs{}
    96  			} else {
    97  				rs.Tryjobs = proto.Clone(rs.Tryjobs).(*run.Tryjobs)
    98  			}
    99  			rs.Tryjobs.State = es // Copy the execution state to Run entity
   100  			switch executionStatus := es.GetStatus(); {
   101  			case executionStatus == tryjob.ExecutionState_SUCCEEDED && run.ShouldSubmit(&rs.Run):
   102  				rs.Status = run.Status_WAITING_FOR_SUBMISSION
   103  				return impl.OnReadyForSubmission(ctx, rs)
   104  			case executionStatus == tryjob.ExecutionState_SUCCEEDED:
   105  				runStatus = run.Status_SUCCEEDED
   106  			case executionStatus == tryjob.ExecutionState_FAILED:
   107  				runStatus = run.Status_FAILED
   108  			case executionStatus == tryjob.ExecutionState_RUNNING:
   109  				// Tryjobs are still running. No change to run status.
   110  			case executionStatus == tryjob.ExecutionState_STATUS_UNSPECIFIED:
   111  				panic(fmt.Errorf("execution status is not specified"))
   112  			default:
   113  				panic(fmt.Errorf("unknown tryjob execution status %s", executionStatus))
   114  			}
   115  		}
   116  	default:
   117  		panic(fmt.Errorf("unknown LongOpCompleted status: %s", opCompleted.GetStatus()))
   118  	}
   119  
   120  	if run.IsEnded(runStatus) {
   121  		cls, err := run.LoadRunCLs(ctx, rs.ID, rs.CLs)
   122  		if err != nil {
   123  			return nil, err
   124  		}
   125  		executionFailures := rs.Tryjobs.GetState().GetFailures()
   126  		var failedTryjobs []*tryjob.Tryjob
   127  		if len(executionFailures.GetUnsuccessfulResults()) > 0 {
   128  			ids := make(common.TryjobIDs, len(executionFailures.GetUnsuccessfulResults()))
   129  			for i, r := range executionFailures.GetUnsuccessfulResults() {
   130  				ids[i] = common.TryjobID(r.GetTryjobId())
   131  			}
   132  			var err error
   133  			failedTryjobs, err = tryjob.LoadTryjobsByIDs(ctx, ids)
   134  			if err != nil {
   135  				return nil, err
   136  			}
   137  		}
   138  
   139  		metas := make(map[common.CLID]reviewInputMeta, len(rs.CLs))
   140  		for _, cl := range cls {
   141  			if rs.HasRootCL() && cl.ID != rs.RootCL {
   142  				continue
   143  			}
   144  			var meta reviewInputMeta
   145  			switch {
   146  			case runStatus == run.Status_SUCCEEDED && rs.Mode == run.NewPatchsetRun:
   147  				meta = reviewInputMeta{} // silent
   148  			case runStatus == run.Status_SUCCEEDED:
   149  				meta = reviewInputMeta{
   150  					message: "This CL has passed the run",
   151  					notify:  rs.Mode.GerritNotifyTargets(),
   152  				}
   153  			case runStatus == run.Status_FAILED && executionFailures != nil:
   154  				meta = reviewInputMeta{
   155  					notify:         rs.Mode.GerritNotifyTargets(),
   156  					addToAttention: rs.Mode.GerritNotifyTargets(),
   157  					reason:         "Tryjobs failed",
   158  				}
   159  				switch {
   160  				case len(executionFailures.GetUnsuccessfulResults()) > 0:
   161  					meta.message = "This CL has failed the run. Reason:\n\n" + composeTryjobsResultFailureReason(cl, failedTryjobs)
   162  				case len(executionFailures.GetLaunchFailures()) > 0:
   163  					meta.message = composeLaunchFailureReason(executionFailures.GetLaunchFailures())
   164  				default:
   165  					return nil, errors.New("Bug: execution state reports failure, but no detailed failure specified")
   166  				}
   167  			default:
   168  				meta = reviewInputMeta{
   169  					message:        "Unexpected error when processing Tryjobs. Please retry. If retry continues to fail, please contact LUCI team.\n\n" + cvBugLink,
   170  					notify:         rs.Mode.GerritNotifyTargets(),
   171  					addToAttention: rs.Mode.GerritNotifyTargets(),
   172  					reason:         "Run failed",
   173  				}
   174  			}
   175  			metas[cl.ID] = meta
   176  		}
   177  		scheduleTriggersReset(ctx, rs, metas, runStatus)
   178  	}
   179  
   180  	return &Result{
   181  		State: rs,
   182  	}, nil
   183  }
   184  
   185  func hasExecuteTryjobLongOp(rs *state.RunState) bool {
   186  	for _, op := range rs.OngoingLongOps.GetOps() {
   187  		if op.GetExecuteTryjobs() != nil {
   188  			return true
   189  		}
   190  	}
   191  	return false
   192  }
   193  
   194  func enqueueTryjobsUpdatedTask(ctx context.Context, rs *state.RunState, tryjobs common.TryjobIDs) {
   195  	rs.EnqueueLongOp(&run.OngoingLongOps_Op{
   196  		Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxTryjobExecutorDuration)),
   197  		Work: &run.OngoingLongOps_Op_ExecuteTryjobs{
   198  			ExecuteTryjobs: &tryjob.ExecuteTryjobsPayload{
   199  				TryjobsUpdated: tryjobs.ToInt64(),
   200  			},
   201  		},
   202  	})
   203  }
   204  
   205  func enqueueRequirementChangedTask(ctx context.Context, rs *state.RunState) {
   206  	rs.EnqueueLongOp(&run.OngoingLongOps_Op{
   207  		Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxTryjobExecutorDuration)),
   208  		Work: &run.OngoingLongOps_Op_ExecuteTryjobs{
   209  			ExecuteTryjobs: &tryjob.ExecuteTryjobsPayload{
   210  				RequirementChanged: true,
   211  			},
   212  		},
   213  	})
   214  }
   215  
   216  // composeTryjobsResultFailureReason make a human-readable string explaining
   217  // the failed Tryjob result for the given CL.
   218  func composeTryjobsResultFailureReason(cl *run.RunCL, tryjobs []*tryjob.Tryjob) string {
   219  	switch len(tryjobs) {
   220  	case 0:
   221  		panic(fmt.Errorf("composeReason called without tryjobs"))
   222  	case 1: // Optimize for most common case: one failed tryjob.
   223  		tj := tryjobs[0]
   224  		restricted := tj.Definition.GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED
   225  		var sb strings.Builder
   226  		if restricted {
   227  			writeMDLink(&sb, "Tryjob", tj.ExternalID.MustURL())
   228  			sb.WriteString(" has failed")
   229  		} else {
   230  			sb.WriteString("Tryjob ")
   231  			writeMDLink(&sb, getBuilderName(tj.Definition, tj.Result), tj.ExternalID.MustURL())
   232  			sb.WriteString(" has failed")
   233  			if sm := tj.Result.GetBuildbucket().GetSummaryMarkdown(); sm != "" {
   234  				sb.WriteString(" with summary")
   235  				if cl.Detail.GetGerrit() != nil {
   236  					fmt.Fprintf(&sb, " ([view all results](%s?checksPatchset=%d&tab=checks))", cl.ExternalID.MustURL(), cl.Detail.GetPatchset())
   237  				}
   238  				sb.WriteString(":\n\n---\n")
   239  				sb.WriteString(sm)
   240  			}
   241  		}
   242  		return sb.String()
   243  	default:
   244  		var sb strings.Builder
   245  		sb.WriteString("Failed Tryjobs:")
   246  		// restrict the result visibility if any tryjob has restricted result
   247  		// visibility
   248  		var restricted bool
   249  		for _, tj := range tryjobs {
   250  			if tj.Definition.GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED {
   251  				restricted = true
   252  				break
   253  			}
   254  		}
   255  		for _, tj := range tryjobs {
   256  			sb.WriteString("\n* ")
   257  			if restricted {
   258  				sb.WriteString(tj.ExternalID.MustURL())
   259  			} else {
   260  				writeMDLink(&sb, getBuilderName(tj.Definition, tj.Result), tj.ExternalID.MustURL())
   261  				if sm := tj.Result.GetBuildbucket().GetSummaryMarkdown(); sm != "" {
   262  					sb.WriteString(". Summary")
   263  					if cl.Detail.GetGerrit() != nil {
   264  						fmt.Fprintf(&sb, " ([view all results](%s?checksPatchset=%d&tab=checks))", cl.ExternalID.MustURL(), cl.Detail.GetPatchset())
   265  					}
   266  					sb.WriteString(":\n\n---\n")
   267  					sb.WriteString(sm)
   268  					sb.WriteString("\n\n---")
   269  				}
   270  			}
   271  		}
   272  		return sb.String()
   273  	}
   274  }
   275  
   276  // composeLaunchFailureReason makes a string explaining tryjob launch failures.
   277  func composeLaunchFailureReason(launchFailures []*tryjob.ExecutionState_Failures_LaunchFailure) string {
   278  	if len(launchFailures) == 0 {
   279  		panic(fmt.Errorf("expected non-empty launch failures"))
   280  	}
   281  	if len(launchFailures) == 1 { // optimize for most common case
   282  		for _, failure := range launchFailures {
   283  			switch {
   284  			case failure.GetDefinition().GetBuildbucket() == nil:
   285  				panic(fmt.Errorf("non Buildbucket backend is not supported. got %T", failure.GetDefinition().GetBackend()))
   286  			case failure.GetDefinition().GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED:
   287  				// TODO(crbug/1302119): Replace terms like "Project admin" with
   288  				// dedicated contact sourced from Project Config.
   289  				return "Failed to launch one tryjob. The tryjob name can't be shown due to configuration. Please contact your Project admin for help."
   290  			default:
   291  				builderName := bbutil.FormatBuilderID(failure.GetDefinition().GetBuildbucket().GetBuilder())
   292  				return fmt.Sprintf("Failed to launch tryjob `%s`. Reason: %s", builderName, failure.GetReason())
   293  			}
   294  		}
   295  	}
   296  
   297  	var sb strings.Builder
   298  	sb.WriteString("Failed to launch the following tryjobs:")
   299  	var restrictedCnt int
   300  	lines := make([]string, 0, len(launchFailures))
   301  	for _, failure := range launchFailures {
   302  		if failure.GetDefinition().GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED {
   303  			restrictedCnt++
   304  			continue
   305  		}
   306  		lines = append(lines, fmt.Sprintf("* `%s`; Failure reason: %s", bbutil.FormatBuilderID(failure.GetDefinition().GetBuildbucket().GetBuilder()), failure.GetReason()))
   307  	}
   308  	sort.Strings(lines) // for determinism
   309  	for _, l := range lines {
   310  		sb.WriteRune('\n')
   311  		sb.WriteString(l)
   312  	}
   313  
   314  	switch {
   315  	case restrictedCnt == len(launchFailures):
   316  		// TODO(crbug/1302119): Replace terms like "Project admin" with
   317  		// dedicated contact sourced from Project Config.
   318  		return fmt.Sprintf("Failed to launch %d tryjobs. The tryjob names can't be shown due to configuration. Please contact your Project admin for help.", restrictedCnt)
   319  	case restrictedCnt > 0:
   320  		sb.WriteString("\n\nIn addition to the tryjobs above, failed to launch ")
   321  		sb.WriteString(strconv.Itoa(restrictedCnt))
   322  		sb.WriteString(" tryjob")
   323  		if restrictedCnt > 1 {
   324  			sb.WriteString("s")
   325  		}
   326  		sb.WriteString(". But the tryjob names can't be shown due to configuration. Please contact your Project admin for help.")
   327  	}
   328  	return sb.String()
   329  }
   330  
   331  // getBuilderName gets the Buildbucket builder name from Tryjob result or
   332  // Tryjob definition.
   333  //
   334  // Tries to get builder name from the result first as it reflects actual
   335  // builder launched which may or may not be the main builder in the tryjob
   336  // definition.
   337  func getBuilderName(def *tryjob.Definition, result *tryjob.Result) string {
   338  	if result != nil && result.GetBackend() != nil {
   339  		switch result.GetBackend().(type) {
   340  		case *tryjob.Result_Buildbucket_:
   341  			if builder := result.GetBuildbucket().GetBuilder(); builder != nil {
   342  				return bbutil.FormatBuilderID(builder)
   343  			}
   344  		default:
   345  			panic(fmt.Errorf("non Buildbucket tryjob backend is not supported. got %T", result.GetBackend()))
   346  		}
   347  	}
   348  	if def != nil && def.GetBackend() != nil {
   349  		switch def.GetBackend().(type) {
   350  		case *tryjob.Definition_Buildbucket_:
   351  			if builder := def.GetBuildbucket().GetBuilder(); builder != nil {
   352  				return bbutil.FormatBuilderID(builder)
   353  			}
   354  		default:
   355  			panic(fmt.Errorf("non Buildbucket tryjob backend is not supported. got %T", def.GetBackend()))
   356  		}
   357  	}
   358  	panic(fmt.Errorf("impossible; can't get builder name from definition and result. Definition: %s; Result: %s", def, result))
   359  }
   360  
   361  func writeMDLink(sb *strings.Builder, text, url string) {
   362  	sb.WriteString("[")
   363  	sb.WriteString(text)
   364  	sb.WriteString("](")
   365  	sb.WriteString(url)
   366  	sb.WriteString(")")
   367  }