go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/start.go (about)

     1  // Copyright 2020 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package handler
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"regexp"
    21  	"strings"
    22  	"time"
    23  
    24  	"google.golang.org/protobuf/types/known/timestamppb"
    25  
    26  	"go.chromium.org/luci/common/clock"
    27  	"go.chromium.org/luci/common/errors"
    28  	"go.chromium.org/luci/common/logging"
    29  	"go.chromium.org/luci/common/retry/transient"
    30  	"go.chromium.org/luci/gae/filter/txndefer"
    31  	"go.chromium.org/luci/gae/service/datastore"
    32  	"go.chromium.org/luci/server/quota"
    33  	"go.chromium.org/luci/server/quota/quotapb"
    34  
    35  	"go.chromium.org/luci/cv/internal/common"
    36  	"go.chromium.org/luci/cv/internal/common/eventbox"
    37  	"go.chromium.org/luci/cv/internal/configs/prjcfg"
    38  	"go.chromium.org/luci/cv/internal/metrics"
    39  	"go.chromium.org/luci/cv/internal/run"
    40  	"go.chromium.org/luci/cv/internal/run/eventpb"
    41  	"go.chromium.org/luci/cv/internal/run/impl/state"
    42  	"go.chromium.org/luci/cv/internal/tryjob/requirement"
    43  )
    44  
    45  const (
    46  	logEntryLabelPostStartMessage       = "Posting Starting Message"
    47  	logEntryLabelRunQuotaBalanceMessage = "Run Quota Balance"
    48  )
    49  
    50  // Start implements Handler interface.
    51  func (impl *Impl) Start(ctx context.Context, rs *state.RunState) (*Result, error) {
    52  	switch status := rs.Status; {
    53  	case status == run.Status_STATUS_UNSPECIFIED:
    54  		err := errors.Reason("CRITICAL: can't start a Run %q with unspecified status", rs.ID).Err()
    55  		common.LogError(ctx, err)
    56  		panic(err)
    57  	case status != run.Status_PENDING:
    58  		logging.Debugf(ctx, "Skip starting Run because this Run is %s", status)
    59  		return &Result{State: rs}, nil
    60  	}
    61  	if len(rs.DepRuns) > 0 {
    62  		runs, errs := run.LoadRunsFromIDs(rs.DepRuns...).Do(ctx)
    63  		for i, r := range runs {
    64  			switch err := errs[i]; {
    65  			case err == datastore.ErrNoSuchEntity:
    66  				panic(err)
    67  			case err != nil:
    68  				return nil, errors.Annotate(err, "failed to load run %s", r.ID).Tag(transient.Tag).Err()
    69  			default:
    70  				switch r.Status {
    71  				case run.Status_STATUS_UNSPECIFIED:
    72  					err := errors.Reason("CRITICAL: can't start a Run %q, parent Run %s has unspecified status", rs.ID, r.ID).Err()
    73  					common.LogError(ctx, err)
    74  					panic(err)
    75  				case run.Status_FAILED, run.Status_CANCELLED, run.Status_PENDING:
    76  					// If a parent run has FAILED or been CANCELLED, this run should not start.
    77  					//The parent run  will emit OnParentCompleted to handle cancelling this run.
    78  					// If the parent run is still PENDING, this run should not start. Once the
    79  					// parent run has started, it will emit a Start event for this run.
    80  					return &Result{State: rs}, nil
    81  				}
    82  			}
    83  		}
    84  	}
    85  
    86  	if reasons, isValid := validateRunOptions(rs); !isValid {
    87  		rs = rs.ShallowCopy()
    88  		var msgBuilder strings.Builder
    89  		msgBuilder.WriteString("Failed to start the Run. Reason:")
    90  		switch len(reasons) {
    91  		case 0:
    92  			panic(fmt.Errorf("must provide reason"))
    93  		case 1:
    94  			msgBuilder.WriteRune(' ')
    95  			msgBuilder.WriteString(reasons[0])
    96  		default:
    97  			msgBuilder.WriteString("\n")
    98  			for _, reason := range reasons {
    99  				msgBuilder.WriteString("\n* ")
   100  				msgBuilder.WriteString(reason)
   101  			}
   102  		}
   103  		whoms := rs.Mode.GerritNotifyTargets()
   104  		meta := reviewInputMeta{
   105  			message:        msgBuilder.String(),
   106  			notify:         whoms,
   107  			addToAttention: whoms,
   108  			reason:         "Failed to start the run",
   109  		}
   110  		metas := make(map[common.CLID]reviewInputMeta, len(rs.CLs))
   111  		for _, cl := range rs.CLs {
   112  			if !rs.HasRootCL() || rs.RootCL == cl {
   113  				metas[cl] = meta
   114  			}
   115  		}
   116  		scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED)
   117  		return &Result{State: rs}, nil
   118  	}
   119  
   120  	rs = rs.ShallowCopy()
   121  	cg, runCLs, cls, err := loadCLsAndConfig(ctx, rs, rs.CLs)
   122  	if err != nil {
   123  		return nil, err
   124  	}
   125  	rs = rs.ShallowCopy()
   126  	switch ok, err := checkRunCreate(ctx, rs, cg, runCLs, cls); {
   127  	case err != nil:
   128  		return nil, err
   129  	case !ok:
   130  		return &Result{State: rs}, nil
   131  	}
   132  
   133  	// Run quota should be debited from the creator before the run is started.
   134  	// When run quota isn't available, the run is left in the pending state.
   135  	pendingMsg := fmt.Sprintf("User %s has exhausted their run quota. This run will start once the quota balance has recovered.", rs.Run.BilledTo.Email())
   136  	switch quotaOp, userLimit, err := impl.QM.DebitRunQuota(ctx, &rs.Run); {
   137  	case err == nil && quotaOp != nil:
   138  		rs.LogInfof(ctx, logEntryLabelRunQuotaBalanceMessage, "Run quota debited from %s; balance: %d", rs.Run.BilledTo.Email(), quotaOp.GetNewBalance())
   139  	case errors.Unwrap(err) == quota.ErrQuotaApply && quotaOp.GetStatus() == quotapb.OpResult_ERR_UNDERFLOW:
   140  		// run quota isn't currently available for the user; leave the run in pending.
   141  		logging.Debugf(ctx, "Run quota underflow for %s; leaving the run %s pending", rs.Run.BilledTo.Email(), rs.Run.ID)
   142  
   143  		// Post pending message to all gerrit CLs for this run if not posted already.
   144  		if !rs.QuotaExhaustionMsgLongOpRequested {
   145  			rs.QuotaExhaustionMsgLongOpRequested = true // Only enqueue once.
   146  
   147  			if userLimit.GetRun().GetReachLimitMsg() != "" {
   148  				pendingMsg = fmt.Sprintf("%s\n\n%s", pendingMsg, userLimit.GetRun().GetReachLimitMsg())
   149  			}
   150  
   151  			rs.EnqueueLongOp(&run.OngoingLongOps_Op{
   152  				Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxPostMessageDuration)),
   153  				Work: &run.OngoingLongOps_Op_PostGerritMessage_{
   154  					PostGerritMessage: &run.OngoingLongOps_Op_PostGerritMessage{
   155  						Message: pendingMsg,
   156  					},
   157  				},
   158  			})
   159  
   160  			switch host, _, err := runCLs[0].ExternalID.ParseGobID(); {
   161  			case err != nil:
   162  				logging.Errorf(ctx, "ParseGobID failed; skipping RunQuotaRejection metric %v", err)
   163  			default:
   164  				hostID := strings.TrimSuffix(host, "-review.googlesource.com")
   165  				gerritAccountID := fmt.Sprintf("%s/%d", hostID, runCLs[0].Trigger.GerritAccountId)
   166  				metrics.Public.RunQuotaRejection.Add(
   167  					ctx,
   168  					1,
   169  					rs.Run.ID.LUCIProject(),
   170  					rs.Run.ConfigGroupID.Name(),
   171  					gerritAccountID,
   172  				)
   173  			}
   174  		}
   175  
   176  		return &Result{State: rs, PreserveEvents: true}, nil
   177  	case errors.Unwrap(err) == quota.ErrQuotaApply:
   178  		return nil, errors.Annotate(err, "QM.DebitRunQuota: unexpected quotaOp Status %s", quotaOp.GetStatus()).Tag(transient.Tag).Err()
   179  	case err != nil:
   180  		return nil, errors.Annotate(err, "QM.DebitRunQuota").Tag(transient.Tag).Err()
   181  	}
   182  
   183  	switch result, err := requirement.Compute(ctx, requirement.Input{
   184  		ConfigGroup: cg.Content,
   185  		RunOwner:    rs.Owner,
   186  		CLs:         runCLs,
   187  		RunOptions:  rs.Options,
   188  		RunMode:     rs.Mode,
   189  	}); {
   190  	case err != nil:
   191  		return nil, err
   192  	case result.OK():
   193  		rs.Tryjobs = &run.Tryjobs{
   194  			Requirement:           result.Requirement,
   195  			RequirementVersion:    1,
   196  			RequirementComputedAt: timestamppb.New(clock.Now(ctx).UTC()),
   197  		}
   198  		enqueueRequirementChangedTask(ctx, rs)
   199  	default:
   200  		whoms := rs.Mode.GerritNotifyTargets()
   201  		meta := reviewInputMeta{
   202  			message:        fmt.Sprintf("Failed to compute tryjob requirement. Reason:\n\n%s", result.ComputationFailure.Reason()),
   203  			notify:         whoms,
   204  			addToAttention: whoms,
   205  			reason:         "Computing tryjob requirement failed",
   206  		}
   207  		metas := make(map[common.CLID]reviewInputMeta, len(cls))
   208  		for _, cl := range rs.CLs {
   209  			if !rs.HasRootCL() || rs.RootCL == cl {
   210  				metas[cl] = meta
   211  			}
   212  		}
   213  		scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED)
   214  		rs.LogInfof(ctx, "Tryjob Requirement Computation", "Failed to compute tryjob requirement. Reason: %s", result.ComputationFailure.Reason())
   215  		return &Result{State: rs}, nil
   216  	}
   217  
   218  	// New patchset runs should be quiet.
   219  	if rs.Mode != run.NewPatchsetRun {
   220  		rs.EnqueueLongOp(&run.OngoingLongOps_Op{
   221  			Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxPostMessageDuration)),
   222  			Work: &run.OngoingLongOps_Op_PostStartMessage{
   223  				PostStartMessage: true,
   224  			},
   225  		})
   226  	}
   227  	// Note that it is inevitable that duplicate pickup latency metric maybe
   228  	// emitted for the same Run if the state transition fails later that
   229  	// causes a retry.
   230  	rs.Status = run.Status_RUNNING
   231  	rs.StartTime = datastore.RoundTime(clock.Now(ctx).UTC())
   232  	rs.LogEntries = append(rs.LogEntries, &run.LogEntry{
   233  		Time: timestamppb.New(rs.StartTime),
   234  		Kind: &run.LogEntry_Started_{
   235  			Started: &run.LogEntry_Started{},
   236  		},
   237  	})
   238  
   239  	childRuns, err := run.LoadChildRuns(ctx, rs.ID)
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  	se := eventbox.Chain(
   244  		func(ctx context.Context) error {
   245  			txndefer.Defer(ctx, func(ctx context.Context) {
   246  				reportStartMetrics(ctx, rs, cg)
   247  			})
   248  			return nil
   249  		},
   250  		func(ctx context.Context) error {
   251  			for _, child := range childRuns {
   252  				if child.Status == run.Status_PENDING {
   253  					if err := impl.RM.Start(ctx, child.ID); err != nil {
   254  						return err
   255  					}
   256  				}
   257  			}
   258  			return nil
   259  		},
   260  	)
   261  	return &Result{
   262  		State:        rs,
   263  		SideEffectFn: se,
   264  	}, nil
   265  }
   266  
   267  func (impl *Impl) onCompletedPostStartMessage(ctx context.Context, rs *state.RunState, op *run.OngoingLongOps_Op, result *eventpb.LongOpCompleted) (*Result, error) {
   268  	opID := result.GetOperationId()
   269  	rs = rs.ShallowCopy()
   270  	rs.RemoveCompletedLongOp(opID)
   271  	if rs.Status != run.Status_RUNNING {
   272  		logging.Warningf(ctx, "Long operation to post start message completed but Run is %s: %s", rs.Status, result)
   273  		return &Result{State: rs}, nil
   274  	}
   275  
   276  	switch result.GetStatus() {
   277  	case eventpb.LongOpCompleted_FAILED:
   278  		// NOTE: if there are several CLs across different projects, detailed
   279  		// failure may have sensitive info which shouldn't be shared elsewhere.
   280  		// Future improvement: if posting start message is a common problem,
   281  		// record `result` message and render links to CLs on which CV failed to
   282  		// post starting message.
   283  		rs.LogInfo(ctx, logEntryLabelPostStartMessage, "Failed to post the starting message")
   284  	case eventpb.LongOpCompleted_EXPIRED:
   285  		rs.LogInfo(ctx, logEntryLabelPostStartMessage, fmt.Sprintf("Failed to post the starting message within the %s deadline", maxPostMessageDuration))
   286  	case eventpb.LongOpCompleted_SUCCEEDED:
   287  		// TODO(tandrii): simplify once all such events have timestamp.
   288  		if t := result.GetPostStartMessage().GetTime(); t != nil {
   289  			rs.LogInfoAt(t.AsTime(), logEntryLabelPostStartMessage, "posted start message on each CL")
   290  		} else {
   291  			rs.LogInfo(ctx, logEntryLabelPostStartMessage, "posted start message on each CL")
   292  		}
   293  	case eventpb.LongOpCompleted_CANCELLED:
   294  		rs.LogInfo(ctx, logEntryLabelPostStartMessage, "cancelled posting start message on CL(s)")
   295  	default:
   296  		panic(fmt.Errorf("unexpected LongOpCompleted status: %s", result.GetStatus()))
   297  	}
   298  
   299  	return &Result{State: rs}, nil
   300  }
   301  
   302  const customTryjobTagRe = `^[a-z0-9_\-]+:.+$`
   303  
   304  var customTryjobTagReCompiled = regexp.MustCompile(customTryjobTagRe)
   305  
   306  func validateRunOptions(rs *state.RunState) (reasons []string, isValid bool) {
   307  	for _, tag := range rs.Options.GetCustomTryjobTags() {
   308  		if !customTryjobTagReCompiled.Match([]byte(strings.TrimSpace(tag))) {
   309  			reasons = append(reasons, fmt.Sprintf("malformed tag: %q; expecting format %q", tag, customTryjobTagRe))
   310  		}
   311  	}
   312  	if len(reasons) == 0 {
   313  		return nil, true
   314  	}
   315  	return reasons, false
   316  }
   317  
   318  func reportStartMetrics(ctx context.Context, rs *state.RunState, cg *prjcfg.ConfigGroup) {
   319  	project := rs.ID.LUCIProject()
   320  	metrics.Public.RunStarted.Add(ctx, 1, project, rs.ConfigGroupID.Name(), string(rs.Mode))
   321  	delay := rs.StartTime.Sub(rs.CreateTime)
   322  	metricPickupLatencyS.Add(ctx, delay.Seconds(), project)
   323  
   324  	if d := cg.Content.GetCombineCls().GetStabilizationDelay(); d != nil {
   325  		delay -= d.AsDuration()
   326  	}
   327  	metricPickupLatencyAdjustedS.Add(ctx, delay.Seconds(), project)
   328  
   329  	if delay >= 1*time.Minute {
   330  		logging.Warningf(ctx, "Too large adjusted pickup delay: %s", delay)
   331  	}
   332  }