go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/cv/internal/run/impl/handler/tryjobs.go (about) 1 // Copyright 2021 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package handler 16 17 import ( 18 "context" 19 "fmt" 20 "slices" 21 "sort" 22 "strconv" 23 "strings" 24 "time" 25 26 "google.golang.org/protobuf/proto" 27 "google.golang.org/protobuf/types/known/timestamppb" 28 29 bbutil "go.chromium.org/luci/buildbucket/protoutil" 30 "go.chromium.org/luci/common/clock" 31 "go.chromium.org/luci/common/errors" 32 "go.chromium.org/luci/common/logging" 33 34 cfgpb "go.chromium.org/luci/cv/api/config/v2" 35 "go.chromium.org/luci/cv/internal/common" 36 "go.chromium.org/luci/cv/internal/run" 37 "go.chromium.org/luci/cv/internal/run/eventpb" 38 "go.chromium.org/luci/cv/internal/run/impl/state" 39 "go.chromium.org/luci/cv/internal/tryjob" 40 ) 41 42 const ( 43 // maxTryjobExecutorDuration is the max time that the tryjob executor 44 // can process. 45 maxTryjobExecutorDuration = 8 * time.Minute 46 ) 47 48 // OnTryjobsUpdated implements Handler interface. 49 func (impl *Impl) OnTryjobsUpdated(ctx context.Context, rs *state.RunState, tryjobs common.TryjobIDs) (*Result, error) { 50 switch status := rs.Status; { 51 case run.IsEnded(status): 52 fallthrough 53 case status == run.Status_WAITING_FOR_SUBMISSION || status == run.Status_SUBMITTING: 54 logging.Debugf(ctx, "Ignoring Tryjobs event because Run is in status %s", status) 55 return &Result{State: rs}, nil 56 case status != run.Status_RUNNING: 57 return nil, errors.Reason("expected RUNNING status, got %s", status).Err() 58 case hasExecuteTryjobLongOp(rs): 59 // Process this event after the current tryjob executor finishes running. 60 return &Result{State: rs, PreserveEvents: true}, nil 61 default: 62 tryjobs.Dedupe() 63 slices.Sort(tryjobs) 64 rs = rs.ShallowCopy() 65 enqueueTryjobsUpdatedTask(ctx, rs, tryjobs) 66 return &Result{State: rs}, nil 67 } 68 } 69 70 func (impl *Impl) onCompletedExecuteTryjobs(ctx context.Context, rs *state.RunState, _ *run.OngoingLongOps_Op, opCompleted *eventpb.LongOpCompleted) (*Result, error) { 71 opID := opCompleted.GetOperationId() 72 rs = rs.ShallowCopy() 73 rs.RemoveCompletedLongOp(opID) 74 if rs.Status != run.Status_RUNNING { 75 logging.Warningf(ctx, "long operation to execute Tryjobs has completed but Run is %s.", rs.Status) 76 return &Result{State: rs}, nil 77 } 78 var runStatus run.Status 79 switch opCompleted.GetStatus() { 80 case eventpb.LongOpCompleted_EXPIRED: 81 // Tryjob executor timeout. 82 fallthrough 83 case eventpb.LongOpCompleted_FAILED: 84 // normally indicates tryjob executor itself encounters error (e.g. failed 85 // to read from datastore). 86 runStatus = run.Status_FAILED 87 case eventpb.LongOpCompleted_SUCCEEDED: 88 switch es, _, err := tryjob.LoadExecutionState(ctx, rs.ID); { 89 case err != nil: 90 return nil, err 91 case es == nil: 92 panic(fmt.Errorf("impossible; Execute Tryjobs task succeeded but ExecutionState was missing")) 93 default: 94 if rs.Tryjobs == nil { 95 rs.Tryjobs = &run.Tryjobs{} 96 } else { 97 rs.Tryjobs = proto.Clone(rs.Tryjobs).(*run.Tryjobs) 98 } 99 rs.Tryjobs.State = es // Copy the execution state to Run entity 100 switch executionStatus := es.GetStatus(); { 101 case executionStatus == tryjob.ExecutionState_SUCCEEDED && run.ShouldSubmit(&rs.Run): 102 rs.Status = run.Status_WAITING_FOR_SUBMISSION 103 return impl.OnReadyForSubmission(ctx, rs) 104 case executionStatus == tryjob.ExecutionState_SUCCEEDED: 105 runStatus = run.Status_SUCCEEDED 106 case executionStatus == tryjob.ExecutionState_FAILED: 107 runStatus = run.Status_FAILED 108 case executionStatus == tryjob.ExecutionState_RUNNING: 109 // Tryjobs are still running. No change to run status. 110 case executionStatus == tryjob.ExecutionState_STATUS_UNSPECIFIED: 111 panic(fmt.Errorf("execution status is not specified")) 112 default: 113 panic(fmt.Errorf("unknown tryjob execution status %s", executionStatus)) 114 } 115 } 116 default: 117 panic(fmt.Errorf("unknown LongOpCompleted status: %s", opCompleted.GetStatus())) 118 } 119 120 if run.IsEnded(runStatus) { 121 cls, err := run.LoadRunCLs(ctx, rs.ID, rs.CLs) 122 if err != nil { 123 return nil, err 124 } 125 executionFailures := rs.Tryjobs.GetState().GetFailures() 126 var failedTryjobs []*tryjob.Tryjob 127 if len(executionFailures.GetUnsuccessfulResults()) > 0 { 128 ids := make(common.TryjobIDs, len(executionFailures.GetUnsuccessfulResults())) 129 for i, r := range executionFailures.GetUnsuccessfulResults() { 130 ids[i] = common.TryjobID(r.GetTryjobId()) 131 } 132 var err error 133 failedTryjobs, err = tryjob.LoadTryjobsByIDs(ctx, ids) 134 if err != nil { 135 return nil, err 136 } 137 } 138 139 metas := make(map[common.CLID]reviewInputMeta, len(rs.CLs)) 140 for _, cl := range cls { 141 if rs.HasRootCL() && cl.ID != rs.RootCL { 142 continue 143 } 144 var meta reviewInputMeta 145 switch { 146 case runStatus == run.Status_SUCCEEDED && rs.Mode == run.NewPatchsetRun: 147 meta = reviewInputMeta{} // silent 148 case runStatus == run.Status_SUCCEEDED: 149 meta = reviewInputMeta{ 150 message: "This CL has passed the run", 151 notify: rs.Mode.GerritNotifyTargets(), 152 } 153 case runStatus == run.Status_FAILED && executionFailures != nil: 154 meta = reviewInputMeta{ 155 notify: rs.Mode.GerritNotifyTargets(), 156 addToAttention: rs.Mode.GerritNotifyTargets(), 157 reason: "Tryjobs failed", 158 } 159 switch { 160 case len(executionFailures.GetUnsuccessfulResults()) > 0: 161 meta.message = "This CL has failed the run. Reason:\n\n" + composeTryjobsResultFailureReason(cl, failedTryjobs) 162 case len(executionFailures.GetLaunchFailures()) > 0: 163 meta.message = composeLaunchFailureReason(executionFailures.GetLaunchFailures()) 164 default: 165 return nil, errors.New("Bug: execution state reports failure, but no detailed failure specified") 166 } 167 default: 168 meta = reviewInputMeta{ 169 message: "Unexpected error when processing Tryjobs. Please retry. If retry continues to fail, please contact LUCI team.\n\n" + cvBugLink, 170 notify: rs.Mode.GerritNotifyTargets(), 171 addToAttention: rs.Mode.GerritNotifyTargets(), 172 reason: "Run failed", 173 } 174 } 175 metas[cl.ID] = meta 176 } 177 scheduleTriggersReset(ctx, rs, metas, runStatus) 178 } 179 180 return &Result{ 181 State: rs, 182 }, nil 183 } 184 185 func hasExecuteTryjobLongOp(rs *state.RunState) bool { 186 for _, op := range rs.OngoingLongOps.GetOps() { 187 if op.GetExecuteTryjobs() != nil { 188 return true 189 } 190 } 191 return false 192 } 193 194 func enqueueTryjobsUpdatedTask(ctx context.Context, rs *state.RunState, tryjobs common.TryjobIDs) { 195 rs.EnqueueLongOp(&run.OngoingLongOps_Op{ 196 Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxTryjobExecutorDuration)), 197 Work: &run.OngoingLongOps_Op_ExecuteTryjobs{ 198 ExecuteTryjobs: &tryjob.ExecuteTryjobsPayload{ 199 TryjobsUpdated: tryjobs.ToInt64(), 200 }, 201 }, 202 }) 203 } 204 205 func enqueueRequirementChangedTask(ctx context.Context, rs *state.RunState) { 206 rs.EnqueueLongOp(&run.OngoingLongOps_Op{ 207 Deadline: timestamppb.New(clock.Now(ctx).UTC().Add(maxTryjobExecutorDuration)), 208 Work: &run.OngoingLongOps_Op_ExecuteTryjobs{ 209 ExecuteTryjobs: &tryjob.ExecuteTryjobsPayload{ 210 RequirementChanged: true, 211 }, 212 }, 213 }) 214 } 215 216 // composeTryjobsResultFailureReason make a human-readable string explaining 217 // the failed Tryjob result for the given CL. 218 func composeTryjobsResultFailureReason(cl *run.RunCL, tryjobs []*tryjob.Tryjob) string { 219 switch len(tryjobs) { 220 case 0: 221 panic(fmt.Errorf("composeReason called without tryjobs")) 222 case 1: // Optimize for most common case: one failed tryjob. 223 tj := tryjobs[0] 224 restricted := tj.Definition.GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED 225 var sb strings.Builder 226 if restricted { 227 writeMDLink(&sb, "Tryjob", tj.ExternalID.MustURL()) 228 sb.WriteString(" has failed") 229 } else { 230 sb.WriteString("Tryjob ") 231 writeMDLink(&sb, getBuilderName(tj.Definition, tj.Result), tj.ExternalID.MustURL()) 232 sb.WriteString(" has failed") 233 if sm := tj.Result.GetBuildbucket().GetSummaryMarkdown(); sm != "" { 234 sb.WriteString(" with summary") 235 if cl.Detail.GetGerrit() != nil { 236 fmt.Fprintf(&sb, " ([view all results](%s?checksPatchset=%d&tab=checks))", cl.ExternalID.MustURL(), cl.Detail.GetPatchset()) 237 } 238 sb.WriteString(":\n\n---\n") 239 sb.WriteString(sm) 240 } 241 } 242 return sb.String() 243 default: 244 var sb strings.Builder 245 sb.WriteString("Failed Tryjobs:") 246 // restrict the result visibility if any tryjob has restricted result 247 // visibility 248 var restricted bool 249 for _, tj := range tryjobs { 250 if tj.Definition.GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED { 251 restricted = true 252 break 253 } 254 } 255 for _, tj := range tryjobs { 256 sb.WriteString("\n* ") 257 if restricted { 258 sb.WriteString(tj.ExternalID.MustURL()) 259 } else { 260 writeMDLink(&sb, getBuilderName(tj.Definition, tj.Result), tj.ExternalID.MustURL()) 261 if sm := tj.Result.GetBuildbucket().GetSummaryMarkdown(); sm != "" { 262 sb.WriteString(". Summary") 263 if cl.Detail.GetGerrit() != nil { 264 fmt.Fprintf(&sb, " ([view all results](%s?checksPatchset=%d&tab=checks))", cl.ExternalID.MustURL(), cl.Detail.GetPatchset()) 265 } 266 sb.WriteString(":\n\n---\n") 267 sb.WriteString(sm) 268 sb.WriteString("\n\n---") 269 } 270 } 271 } 272 return sb.String() 273 } 274 } 275 276 // composeLaunchFailureReason makes a string explaining tryjob launch failures. 277 func composeLaunchFailureReason(launchFailures []*tryjob.ExecutionState_Failures_LaunchFailure) string { 278 if len(launchFailures) == 0 { 279 panic(fmt.Errorf("expected non-empty launch failures")) 280 } 281 if len(launchFailures) == 1 { // optimize for most common case 282 for _, failure := range launchFailures { 283 switch { 284 case failure.GetDefinition().GetBuildbucket() == nil: 285 panic(fmt.Errorf("non Buildbucket backend is not supported. got %T", failure.GetDefinition().GetBackend())) 286 case failure.GetDefinition().GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED: 287 // TODO(crbug/1302119): Replace terms like "Project admin" with 288 // dedicated contact sourced from Project Config. 289 return "Failed to launch one tryjob. The tryjob name can't be shown due to configuration. Please contact your Project admin for help." 290 default: 291 builderName := bbutil.FormatBuilderID(failure.GetDefinition().GetBuildbucket().GetBuilder()) 292 return fmt.Sprintf("Failed to launch tryjob `%s`. Reason: %s", builderName, failure.GetReason()) 293 } 294 } 295 } 296 297 var sb strings.Builder 298 sb.WriteString("Failed to launch the following tryjobs:") 299 var restrictedCnt int 300 lines := make([]string, 0, len(launchFailures)) 301 for _, failure := range launchFailures { 302 if failure.GetDefinition().GetResultVisibility() == cfgpb.CommentLevel_COMMENT_LEVEL_RESTRICTED { 303 restrictedCnt++ 304 continue 305 } 306 lines = append(lines, fmt.Sprintf("* `%s`; Failure reason: %s", bbutil.FormatBuilderID(failure.GetDefinition().GetBuildbucket().GetBuilder()), failure.GetReason())) 307 } 308 sort.Strings(lines) // for determinism 309 for _, l := range lines { 310 sb.WriteRune('\n') 311 sb.WriteString(l) 312 } 313 314 switch { 315 case restrictedCnt == len(launchFailures): 316 // TODO(crbug/1302119): Replace terms like "Project admin" with 317 // dedicated contact sourced from Project Config. 318 return fmt.Sprintf("Failed to launch %d tryjobs. The tryjob names can't be shown due to configuration. Please contact your Project admin for help.", restrictedCnt) 319 case restrictedCnt > 0: 320 sb.WriteString("\n\nIn addition to the tryjobs above, failed to launch ") 321 sb.WriteString(strconv.Itoa(restrictedCnt)) 322 sb.WriteString(" tryjob") 323 if restrictedCnt > 1 { 324 sb.WriteString("s") 325 } 326 sb.WriteString(". But the tryjob names can't be shown due to configuration. Please contact your Project admin for help.") 327 } 328 return sb.String() 329 } 330 331 // getBuilderName gets the Buildbucket builder name from Tryjob result or 332 // Tryjob definition. 333 // 334 // Tries to get builder name from the result first as it reflects actual 335 // builder launched which may or may not be the main builder in the tryjob 336 // definition. 337 func getBuilderName(def *tryjob.Definition, result *tryjob.Result) string { 338 if result != nil && result.GetBackend() != nil { 339 switch result.GetBackend().(type) { 340 case *tryjob.Result_Buildbucket_: 341 if builder := result.GetBuildbucket().GetBuilder(); builder != nil { 342 return bbutil.FormatBuilderID(builder) 343 } 344 default: 345 panic(fmt.Errorf("non Buildbucket tryjob backend is not supported. got %T", result.GetBackend())) 346 } 347 } 348 if def != nil && def.GetBackend() != nil { 349 switch def.GetBackend().(type) { 350 case *tryjob.Definition_Buildbucket_: 351 if builder := def.GetBuildbucket().GetBuilder(); builder != nil { 352 return bbutil.FormatBuilderID(builder) 353 } 354 default: 355 panic(fmt.Errorf("non Buildbucket tryjob backend is not supported. got %T", def.GetBackend())) 356 } 357 } 358 panic(fmt.Errorf("impossible; can't get builder name from definition and result. Definition: %s; Result: %s", def, result)) 359 } 360 361 func writeMDLink(sb *strings.Builder, text, url string) { 362 sb.WriteString("[") 363 sb.WriteString(text) 364 sb.WriteString("](") 365 sb.WriteString(url) 366 sb.WriteString(")") 367 }