go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/start.go (about) 1 // Copyright 2020 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package handler 16 17 import ( 18 "context" 19 "fmt" 20 "regexp" 21 "strings" 22 "time" 23 24 "google.golang.org/protobuf/types/known/timestamppb" 25 26 "go.chromium.org/luci/common/clock" 27 "go.chromium.org/luci/common/errors" 28 "go.chromium.org/luci/common/logging" 29 "go.chromium.org/luci/common/retry/transient" 30 "go.chromium.org/luci/gae/filter/txndefer" 31 "go.chromium.org/luci/gae/service/datastore" 32 "go.chromium.org/luci/server/quota" 33 "go.chromium.org/luci/server/quota/quotapb" 34 35 "go.chromium.org/luci/cv/internal/common" 36 "go.chromium.org/luci/cv/internal/common/eventbox" 37 "go.chromium.org/luci/cv/internal/configs/prjcfg" 38 "go.chromium.org/luci/cv/internal/metrics" 39 "go.chromium.org/luci/cv/internal/run" 40 "go.chromium.org/luci/cv/internal/run/eventpb" 41 "go.chromium.org/luci/cv/internal/run/impl/state" 42 "go.chromium.org/luci/cv/internal/tryjob/requirement" 43 ) 44 45 const ( 46 logEntryLabelPostStartMessage = "Posting Starting Message" 47 logEntryLabelRunQuotaBalanceMessage = "Run Quota Balance" 48 ) 49 50 // Start implements Handler interface. 51 func (impl *Impl) Start(ctx context.Context, rs *state.RunState) (*Result, error) { 52 switch status := rs.Status; { 53 case status == run.Status_STATUS_UNSPECIFIED: 54 err := errors.Reason("CRITICAL: can't start a Run %q with unspecified status", rs.ID).Err() 55 common.LogError(ctx, err) 56 panic(err) 57 case status != run.Status_PENDING: 58 logging.Debugf(ctx, "Skip starting Run because this Run is %s", status) 59 return &Result{State: rs}, nil 60 } 61 if len(rs.DepRuns) > 0 { 62 runs, errs := run.LoadRunsFromIDs(rs.DepRuns...).Do(ctx) 63 for i, r := range runs { 64 switch err := errs[i]; { 65 case err == datastore.ErrNoSuchEntity: 66 panic(err) 67 case err != nil: 68 return nil, errors.Annotate(err, "failed to load run %s", r.ID).Tag(transient.Tag).Err() 69 default: 70 switch r.Status { 71 case run.Status_STATUS_UNSPECIFIED: 72 err := errors.Reason("CRITICAL: can't start a Run %q, parent Run %s has unspecified status", rs.ID, r.ID).Err() 73 common.LogError(ctx, err) 74 panic(err) 75 case run.Status_FAILED, run.Status_CANCELLED, run.Status_PENDING: 76 // If a parent run has FAILED or been CANCELLED, this run should not start. 77 //The parent run will emit OnParentCompleted to handle cancelling this run. 78 // If the parent run is still PENDING, this run should not start. Once the 79 // parent run has started, it will emit a Start event for this run. 80 return &Result{State: rs}, nil 81 } 82 } 83 } 84 } 85 86 if reasons, isValid := validateRunOptions(rs); !isValid { 87 rs = rs.ShallowCopy() 88 var msgBuilder strings.Builder 89 msgBuilder.WriteString("Failed to start the Run. Reason:") 90 switch len(reasons) { 91 case 0: 92 panic(fmt.Errorf("must provide reason")) 93 case 1: 94 msgBuilder.WriteRune(' ') 95 msgBuilder.WriteString(reasons[0]) 96 default: 97 msgBuilder.WriteString("\n") 98 for _, reason := range reasons { 99 msgBuilder.WriteString("\n* ") 100 msgBuilder.WriteString(reason) 101 } 102 } 103 whoms := rs.Mode.GerritNotifyTargets() 104 meta := reviewInputMeta{ 105 message: msgBuilder.String(), 106 notify: whoms, 107 addToAttention: whoms, 108 reason: "Failed to start the run", 109 } 110 metas := make(map[common.CLID]reviewInputMeta, len(rs.CLs)) 111 for _, cl := range rs.CLs { 112 if !rs.HasRootCL() || rs.RootCL == cl { 113 metas[cl] = meta 114 } 115 } 116 scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED) 117 return &Result{State: rs}, nil 118 } 119 120 rs = rs.ShallowCopy() 121 cg, runCLs, cls, err := loadCLsAndConfig(ctx, rs, rs.CLs) 122 if err != nil { 123 return nil, err 124 } 125 rs = rs.ShallowCopy() 126 switch ok, err := checkRunCreate(ctx, rs, cg, runCLs, cls); { 127 case err != nil: 128 return nil, err 129 case !ok: 130 return &Result{State: rs}, nil 131 } 132 133 // Run quota should be debited from the creator before the run is started. 134 // When run quota isn't available, the run is left in the pending state. 135 pendingMsg := fmt.Sprintf("User %s has exhausted their run quota. This run will start once the quota balance has recovered.", rs.Run.BilledTo.Email()) 136 switch quotaOp, userLimit, err := impl.QM.DebitRunQuota(ctx, &rs.Run); { 137 case err == nil && quotaOp != nil: 138 rs.LogInfof(ctx, logEntryLabelRunQuotaBalanceMessage, "Run quota debited from %s; balance: %d", rs.Run.BilledTo.Email(), quotaOp.GetNewBalance()) 139 case errors.Unwrap(err) == quota.ErrQuotaApply && quotaOp.GetStatus() == quotapb.OpResult_ERR_UNDERFLOW: 140 // run quota isn't currently available for the user; leave the run in pending. 141 logging.Debugf(ctx, "Run quota underflow for %s; leaving the run %s pending", rs.Run.BilledTo.Email(), rs.Run.ID) 142 143 // Post pending message to all gerrit CLs for this run if not posted already. 144 if !rs.QuotaExhaustionMsgLongOpRequested { 145 rs.QuotaExhaustionMsgLongOpRequested = true // Only enqueue once. 146 147 if userLimit.GetRun().GetReachLimitMsg() != "" { 148 pendingMsg = fmt.Sprintf("%s\n\n%s", pendingMsg, userLimit.GetRun().GetReachLimitMsg()) 149 } 150 151 rs.EnqueueLongOp(&run.OngoingLongOps_Op{ 152 Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxPostMessageDuration)), 153 Work: &run.OngoingLongOps_Op_PostGerritMessage_{ 154 PostGerritMessage: &run.OngoingLongOps_Op_PostGerritMessage{ 155 Message: pendingMsg, 156 }, 157 }, 158 }) 159 160 switch host, _, err := runCLs[0].ExternalID.ParseGobID(); { 161 case err != nil: 162 logging.Errorf(ctx, "ParseGobID failed; skipping RunQuotaRejection metric %v", err) 163 default: 164 hostID := strings.TrimSuffix(host, "-review.googlesource.com") 165 gerritAccountID := fmt.Sprintf("%s/%d", hostID, runCLs[0].Trigger.GerritAccountId) 166 metrics.Public.RunQuotaRejection.Add( 167 ctx, 168 1, 169 rs.Run.ID.LUCIProject(), 170 rs.Run.ConfigGroupID.Name(), 171 gerritAccountID, 172 ) 173 } 174 } 175 176 return &Result{State: rs, PreserveEvents: true}, nil 177 case errors.Unwrap(err) == quota.ErrQuotaApply: 178 return nil, errors.Annotate(err, "QM.DebitRunQuota: unexpected quotaOp Status %s", quotaOp.GetStatus()).Tag(transient.Tag).Err() 179 case err != nil: 180 return nil, errors.Annotate(err, "QM.DebitRunQuota").Tag(transient.Tag).Err() 181 } 182 183 switch result, err := requirement.Compute(ctx, requirement.Input{ 184 ConfigGroup: cg.Content, 185 RunOwner: rs.Owner, 186 CLs: runCLs, 187 RunOptions: rs.Options, 188 RunMode: rs.Mode, 189 }); { 190 case err != nil: 191 return nil, err 192 case result.OK(): 193 rs.Tryjobs = &run.Tryjobs{ 194 Requirement: result.Requirement, 195 RequirementVersion: 1, 196 RequirementComputedAt: timestamppb.New(clock.Now(ctx).UTC()), 197 } 198 enqueueRequirementChangedTask(ctx, rs) 199 default: 200 whoms := rs.Mode.GerritNotifyTargets() 201 meta := reviewInputMeta{ 202 message: fmt.Sprintf("Failed to compute tryjob requirement. Reason:\n\n%s", result.ComputationFailure.Reason()), 203 notify: whoms, 204 addToAttention: whoms, 205 reason: "Computing tryjob requirement failed", 206 } 207 metas := make(map[common.CLID]reviewInputMeta, len(cls)) 208 for _, cl := range rs.CLs { 209 if !rs.HasRootCL() || rs.RootCL == cl { 210 metas[cl] = meta 211 } 212 } 213 scheduleTriggersReset(ctx, rs, metas, run.Status_FAILED) 214 rs.LogInfof(ctx, "Tryjob Requirement Computation", "Failed to compute tryjob requirement. Reason: %s", result.ComputationFailure.Reason()) 215 return &Result{State: rs}, nil 216 } 217 218 // New patchset runs should be quiet. 219 if rs.Mode != run.NewPatchsetRun { 220 rs.EnqueueLongOp(&run.OngoingLongOps_Op{ 221 Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxPostMessageDuration)), 222 Work: &run.OngoingLongOps_Op_PostStartMessage{ 223 PostStartMessage: true, 224 }, 225 }) 226 } 227 // Note that it is inevitable that duplicate pickup latency metric maybe 228 // emitted for the same Run if the state transition fails later that 229 // causes a retry. 230 rs.Status = run.Status_RUNNING 231 rs.StartTime = datastore.RoundTime(clock.Now(ctx).UTC()) 232 rs.LogEntries = append(rs.LogEntries, &run.LogEntry{ 233 Time: timestamppb.New(rs.StartTime), 234 Kind: &run.LogEntry_Started_{ 235 Started: &run.LogEntry_Started{}, 236 }, 237 }) 238 239 childRuns, err := run.LoadChildRuns(ctx, rs.ID) 240 if err != nil { 241 return nil, err 242 } 243 se := eventbox.Chain( 244 func(ctx context.Context) error { 245 txndefer.Defer(ctx, func(ctx context.Context) { 246 reportStartMetrics(ctx, rs, cg) 247 }) 248 return nil 249 }, 250 func(ctx context.Context) error { 251 for _, child := range childRuns { 252 if child.Status == run.Status_PENDING { 253 if err := impl.RM.Start(ctx, child.ID); err != nil { 254 return err 255 } 256 } 257 } 258 return nil 259 }, 260 ) 261 return &Result{ 262 State: rs, 263 SideEffectFn: se, 264 }, nil 265 } 266 267 func (impl *Impl) onCompletedPostStartMessage(ctx context.Context, rs *state.RunState, op *run.OngoingLongOps_Op, result *eventpb.LongOpCompleted) (*Result, error) { 268 opID := result.GetOperationId() 269 rs = rs.ShallowCopy() 270 rs.RemoveCompletedLongOp(opID) 271 if rs.Status != run.Status_RUNNING { 272 logging.Warningf(ctx, "Long operation to post start message completed but Run is %s: %s", rs.Status, result) 273 return &Result{State: rs}, nil 274 } 275 276 switch result.GetStatus() { 277 case eventpb.LongOpCompleted_FAILED: 278 // NOTE: if there are several CLs across different projects, detailed 279 // failure may have sensitive info which shouldn't be shared elsewhere. 280 // Future improvement: if posting start message is a common problem, 281 // record `result` message and render links to CLs on which CV failed to 282 // post starting message. 283 rs.LogInfo(ctx, logEntryLabelPostStartMessage, "Failed to post the starting message") 284 case eventpb.LongOpCompleted_EXPIRED: 285 rs.LogInfo(ctx, logEntryLabelPostStartMessage, fmt.Sprintf("Failed to post the starting message within the %s deadline", maxPostMessageDuration)) 286 case eventpb.LongOpCompleted_SUCCEEDED: 287 // TODO(tandrii): simplify once all such events have timestamp. 288 if t := result.GetPostStartMessage().GetTime(); t != nil { 289 rs.LogInfoAt(t.AsTime(), logEntryLabelPostStartMessage, "posted start message on each CL") 290 } else { 291 rs.LogInfo(ctx, logEntryLabelPostStartMessage, "posted start message on each CL") 292 } 293 case eventpb.LongOpCompleted_CANCELLED: 294 rs.LogInfo(ctx, logEntryLabelPostStartMessage, "cancelled posting start message on CL(s)") 295 default: 296 panic(fmt.Errorf("unexpected LongOpCompleted status: %s", result.GetStatus())) 297 } 298 299 return &Result{State: rs}, nil 300 } 301 302 const customTryjobTagRe = `^[a-z0-9_\-]+:.+$` 303 304 var customTryjobTagReCompiled = regexp.MustCompile(customTryjobTagRe) 305 306 func validateRunOptions(rs *state.RunState) (reasons []string, isValid bool) { 307 for _, tag := range rs.Options.GetCustomTryjobTags() { 308 if !customTryjobTagReCompiled.Match([]byte(strings.TrimSpace(tag))) { 309 reasons = append(reasons, fmt.Sprintf("malformed tag: %q; expecting format %q", tag, customTryjobTagRe)) 310 } 311 } 312 if len(reasons) == 0 { 313 return nil, true 314 } 315 return reasons, false 316 } 317 318 func reportStartMetrics(ctx context.Context, rs *state.RunState, cg *prjcfg.ConfigGroup) { 319 project := rs.ID.LUCIProject() 320 metrics.Public.RunStarted.Add(ctx, 1, project, rs.ConfigGroupID.Name(), string(rs.Mode)) 321 delay := rs.StartTime.Sub(rs.CreateTime) 322 metricPickupLatencyS.Add(ctx, delay.Seconds(), project) 323 324 if d := cg.Content.GetCombineCls().GetStabilizationDelay(); d != nil { 325 delay -= d.AsDuration() 326 } 327 metricPickupLatencyAdjustedS.Add(ctx, delay.Seconds(), project) 328 329 if delay >= 1*time.Minute { 330 logging.Warningf(ctx, "Too large adjusted pickup delay: %s", delay) 331 } 332 }