go.fuchsia.dev/infra@v0.0.0-20240507153436-9b593402251b/cmd/autogardener/culprit.go (about) 1 // Copyright 2022 The Fuchsia Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 package main 6 7 import ( 8 "context" 9 "encoding/json" 10 "fmt" 11 "io" 12 "net/url" 13 "os" 14 "os/signal" 15 "runtime/pprof" 16 "strconv" 17 "strings" 18 "sync" 19 "syscall" 20 21 "cloud.google.com/go/bigquery" 22 "github.com/maruel/subcommands" 23 "go.chromium.org/luci/auth" 24 buildbucketpb "go.chromium.org/luci/buildbucket/proto" 25 "go.chromium.org/luci/common/logging" 26 "go.chromium.org/luci/common/logging/gologger" 27 gerritpb "go.chromium.org/luci/common/proto/gerrit" 28 "go.chromium.org/luci/common/proto/git" 29 "go.chromium.org/luci/grpc/prpc" 30 "go.chromium.org/luci/luciexe/exe" 31 resultpb "go.chromium.org/luci/resultdb/proto/v1" 32 "golang.org/x/exp/maps" 33 "golang.org/x/sync/errgroup" 34 "google.golang.org/api/option" 35 "google.golang.org/protobuf/types/known/fieldmaskpb" 36 "google.golang.org/protobuf/types/known/structpb" 37 38 "go.fuchsia.dev/infra/functools" 39 "go.fuchsia.dev/infra/gerrit" 40 "go.fuchsia.dev/infra/gitiles" 41 ) 42 43 // maxClusterCount is the maximum number of clusters that will be analyzed. If 44 // there are a large number of clusters to analyze then they probably have 45 // similar underlying causes so it's not useful to analyze each one 46 // individually, which would take a really long time. 47 const maxClusterCount = 10 48 49 // Only builds that used a manifest project with this name will be considered. 50 const expectedCheckoutProject = "integration" 51 52 func cmdCulprit(authOpts auth.Options) *subcommands.Command { 53 return &subcommands.Command{ 54 UsageLine: "culprit [flags] BUILD_ID [BUILD_ID]...", 55 ShortDesc: "Find culprit changes for CI breakages", 56 LongDesc: "Find culprit changes for CI breakages", 57 CommandRun: func() subcommands.CommandRun { 58 c := &culpritCmd{} 59 c.Init(authOpts) 60 return c 61 }, 62 } 63 } 64 65 type culpritCmd struct { 66 commonFlags 67 jsonOutputFile string 68 69 buildIDs []int64 70 bqClient *bigquery.Client 71 bbClient buildbucketpb.BuildsClient 72 rdbClient resultpb.ResultDBClient 73 gitilesClient *gitiles.Client 74 gerritClient *gerritMultiClient 75 } 76 77 func (c *culpritCmd) Init(defaultAuthOpts auth.Options) { 78 c.commonFlags.Init(defaultAuthOpts) 79 c.Flags.StringVar( 80 &c.jsonOutputFile, 81 "json-output", 82 "", 83 "Path to which to write output JSON. Use '-' for stdout Use '-' for stdout.") 84 } 85 86 func (c *culpritCmd) parseArgs(args []string) error { 87 if err := c.commonFlags.Parse(); err != nil { 88 return err 89 } 90 for _, rawBuildID := range args { 91 buildID, err := strconv.ParseInt(strings.TrimPrefix(rawBuildID, "b"), 10, 64) 92 if err != nil { 93 return fmt.Errorf("positional argument %q is not an integer or a valid build URL", rawBuildID) 94 } 95 c.buildIDs = append(c.buildIDs, buildID) 96 } 97 return nil 98 } 99 100 func (c *culpritCmd) Run(a subcommands.Application, args []string, _ subcommands.Env) int { 101 if err := c.parseArgs(args); err != nil { 102 fmt.Fprintf(a.GetErr(), "%s: %s\n", a.GetName(), err) 103 return 1 104 } 105 106 if err := c.main(); err != nil { 107 fmt.Fprintf(a.GetErr(), "%s: %s\n", a.GetName(), err) 108 return 1 109 } 110 return 0 111 } 112 113 func (c *culpritCmd) initClients(ctx context.Context) error { 114 authenticator := auth.NewAuthenticator(ctx, auth.OptionalLogin, c.parsedAuthOpts) 115 tokenSource, err := authenticator.TokenSource() 116 if err != nil { 117 if err == auth.ErrLoginRequired { 118 fmt.Fprintf(os.Stderr, "You need to login first by running:\n") 119 fmt.Fprintf(os.Stderr, " luci-auth login -scopes %q\n", strings.Join(c.parsedAuthOpts.Scopes, " ")) 120 } 121 return err 122 } 123 124 c.bqClient, err = bigquery.NewClient(ctx, "fuchsia-infra", option.WithTokenSource(tokenSource)) 125 if err != nil { 126 return err 127 } 128 129 authClient, err := authenticator.Client() 130 if err != nil { 131 return fmt.Errorf("failed to initialize auth client: %w", err) 132 } 133 134 c.gitilesClient, err = gitiles.NewClient( 135 // TODO(olivernewman): Generalize the autogardener to other 136 // repositories. 137 "turquoise-internal.googlesource.com", "integration", authClient) 138 if err != nil { 139 return err 140 } 141 142 c.bbClient = buildbucketpb.NewBuildsPRPCClient(&prpc.Client{ 143 C: authClient, 144 Host: "cr-buildbucket.appspot.com", 145 }) 146 147 c.rdbClient = resultpb.NewResultDBPRPCClient(&prpc.Client{ 148 C: authClient, 149 // TODO(olivernewman): Get the ResultDB host name from the build proto. 150 Host: "results.api.cr.dev", 151 }) 152 153 c.gerritClient = &gerritMultiClient{ 154 authClient: authClient, 155 clients: make(map[gerritClientKey]*gerrit.Client), 156 } 157 158 return nil 159 } 160 161 func (c *culpritCmd) main() error { 162 ctx := gologger.StdConfig.Use(context.Background()) 163 logging.SetLevel(ctx, logging.Debug) 164 165 // Print goroutine stack trace to stderr after a Ctrl-C. This is helpful for 166 // debugging deadlocks and slow operations. 167 go func() { 168 ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT) 169 defer cancel() 170 <-ctx.Done() 171 pprof.Lookup("goroutine").WriteTo(os.Stderr, 1) 172 }() 173 174 if err := c.initClients(ctx); err != nil { 175 return err 176 } 177 178 buildResults, err := c.fetchBuildResults(ctx) 179 if err != nil { 180 return err 181 } 182 183 clusters := clusterFailureModes(buildResults) 184 if len(clusters) > maxClusterCount { 185 // Choose a random maxClusterCount clusters. 186 newClusters := make(map[failureSignature][]buildResult) 187 for k, v := range clusters { 188 newClusters[k] = v 189 if len(newClusters) == maxClusterCount { 190 break 191 } 192 } 193 clusters = newClusters 194 } 195 196 noun := "cluster" 197 if len(clusters) != 1 { 198 noun += "s" 199 } 200 fmt.Printf("Found %d %s\n", len(clusters), noun) 201 202 eg, _ := errgroup.WithContext(ctx) 203 results := make(map[failureSignature][]suspectCommit) 204 for signature, buildResults := range clusters { 205 // Make copies of the variables that can be safely used in the closure. 206 signature, buildResults := signature, buildResults 207 eg.Go(func() error { 208 suspects, err := c.diagnoseCluster(ctx, signature, buildResults) 209 results[signature] = suspects 210 return err 211 }) 212 } 213 214 if err := eg.Wait(); err != nil { 215 return err 216 } 217 218 var outputLines []string 219 220 var jsonOutput culpritOutput 221 222 // Sort results by confidence in the first suspect, so that failure modes 223 // with lower culprit confidence are less prominent, since they're more 224 // likely to not be actionable. 225 sortedSignatures := maps.Keys(results) 226 functools.SortBy(sortedSignatures, func(sig failureSignature) int { 227 if len(results[sig]) == 0 { 228 return 0 229 } 230 return -results[sig][0].score() 231 }) 232 233 for _, signature := range sortedSignatures { 234 suspects := results[signature] 235 // Don't bother emitting markdown output for failure modes with no 236 // suspects. It just adds noise to the output. 237 if len(suspects) == 0 { 238 continue 239 } 240 241 outputLines = append(outputLines, "### "+signature.FailedTest, "") 242 243 var outputSuspects []outputSuspect 244 245 // Only show the top N suspects. 246 for i, suspect := range suspects { 247 outputLines = append(outputLines, fmt.Sprintf("%2d. (%3d%%) %s", i+1, suspect.score(), suspect.gerritURL())) 248 outputLines = append(outputLines, " "+suspect.commitSummary()) 249 250 outputSuspects = append(outputSuspects, outputSuspect{ 251 suspectCommit: suspect, 252 GerritURL: suspect.gerritURL(), 253 ChangedFiles: suspect.changedFiles(), 254 CommitSummary: suspect.commitSummary(), 255 Score: suspect.score(), 256 Features: suspect.features(), 257 }) 258 } 259 260 outputLines = append(outputLines, "") 261 262 jsonOutput.Clusters = append(jsonOutput.Clusters, outputCluster{ 263 Signature: signature, 264 Suspects: outputSuspects, 265 }) 266 } 267 268 markdownOutput := strings.Join(outputLines, "\n") 269 if c.jsonOutputFile == "" { 270 fmt.Println(markdownOutput) 271 return nil 272 } 273 jsonOutput.MarkdownOutput = markdownOutput 274 275 rawJSON, err := json.MarshalIndent(jsonOutput, "", " ") 276 rawJSON = append(rawJSON, '\n') 277 if err != nil { 278 return err 279 } 280 281 var outputWriter io.Writer 282 if c.jsonOutputFile == "-" { 283 outputWriter = os.Stdout 284 } else { 285 f, err := os.Create(c.jsonOutputFile) 286 if err != nil { 287 return err 288 } 289 defer f.Close() 290 outputWriter = f 291 } 292 293 if _, err := outputWriter.Write(rawJSON); err != nil { 294 return err 295 } 296 297 return nil 298 } 299 300 // fetchBuildResults downloads the build proto and failed test names for each 301 // build. 302 func (c *culpritCmd) fetchBuildResults(ctx context.Context) ([]buildResult, error) { 303 var res []buildResult 304 for _, buildID := range c.buildIDs { 305 build, err := c.bbClient.GetBuild(ctx, &buildbucketpb.GetBuildRequest{ 306 Id: buildID, 307 Mask: &buildbucketpb.BuildMask{ 308 Fields: &fieldmaskpb.FieldMask{ 309 Paths: []string{ 310 "id", 311 "builder", 312 "status", 313 "start_time", 314 "summary_markdown", 315 "input", 316 "output.properties", 317 "infra.resultdb", 318 }, 319 }, 320 }, 321 }) 322 if err != nil { 323 return nil, err 324 } 325 326 resp, err := c.rdbClient.QueryTestResults(ctx, &resultpb.QueryTestResultsRequest{ 327 Invocations: []string{build.Infra.Resultdb.Invocation}, 328 Predicate: &resultpb.TestResultPredicate{ 329 // Only include non-exonerated failed tests. 330 Expectancy: resultpb.TestResultPredicate_VARIANTS_WITH_UNEXPECTED_RESULTS, 331 }, 332 ReadMask: &fieldmaskpb.FieldMask{ 333 Paths: []string{"test_id", "status", "tags", "failure_reason"}, 334 }, 335 }) 336 if err != nil { 337 return nil, err 338 } 339 340 failedTests := functools.Filter(resp.TestResults, func(t *resultpb.TestResult) bool { 341 for _, tag := range t.Tags { 342 // Only consider top-level test suites. Test cases don't yet 343 // have all the metadata we care about. 344 if tag.Key == "test_case_count" { 345 return true 346 } 347 } 348 return false 349 }) 350 351 proj, err := integrationProjectName(build) 352 if err != nil { 353 return nil, err 354 } 355 // Skip builds that ran against unknown repos - see 356 // https://fxbug.dev/323227115 for context. 357 if proj != expectedCheckoutProject { 358 logging.Warningf( 359 ctx, 360 "skipping build %d because its checkout manifest project was %q, not %q", 361 build.Id, proj, expectedCheckoutProject) 362 continue 363 } 364 365 res = append(res, buildResult{ 366 Build: build, 367 FailedTests: failedTests, 368 }) 369 } 370 return res, nil 371 } 372 373 func (c *culpritCmd) diagnoseCluster(ctx context.Context, sig failureSignature, buildResults []buildResult) ([]suspectCommit, error) { 374 // Sort in descending order by revision count. 375 functools.SortBy(buildResults, func(b buildResult) int { 376 irc, _ := b.integrationRevisionCount() 377 return -1 * irc 378 }) 379 mostRecentBuild := buildResults[0].Build 380 381 windowEnd := mostRecentBuild.StartTime.AsTime() 382 383 commits, err := c.gitilesClient.Log( 384 ctx, mostRecentBuild.Input.GitilesCommit.Id, 300) 385 if err != nil { 386 return nil, err 387 } 388 389 baseCommitPosition, ok := buildResults[0].integrationRevisionCount() 390 if !ok { 391 baseCommitPosition = 0 392 } 393 394 var suspects []suspectCommit 395 for i, commit := range commits { 396 gerritChange, err := commitToGerritChange(commit) 397 if err != nil { 398 return nil, err 399 } 400 suspect := suspectCommit{ 401 signature: sig, 402 CommitInfo: commit, 403 GerritChange: gerritChange, 404 BlamelistDistances: make(map[string]int), 405 TagMatchesTest: hasMatchingTag(commit.Message, sig.FailedTest), 406 } 407 if baseCommitPosition > 0 { 408 suspect.CommitPosition = baseCommitPosition - i 409 } 410 411 suspects = append(suspects, suspect) 412 } 413 414 // Used to control write access to `suspects`. 415 var mu sync.Mutex 416 417 eg, _ := errgroup.WithContext(ctx) 418 eg.Go(func() error { 419 affectingChanges, err := getChangesAffectingTest(ctx, c.bqClient, sig, windowEnd) 420 if err != nil { 421 return err 422 } 423 mu.Lock() 424 defer mu.Unlock() 425 for _, ac := range affectingChanges { 426 for i, suspect := range suspects { 427 if gerritChangesEqual(ac.Change, suspect.GerritChange) { 428 suspects[i].AffectedTest = true 429 } 430 } 431 } 432 return err 433 }) 434 435 eg.Go(func() error { 436 results, err := getNearbyTestResults(ctx, c.bqClient, sig, windowEnd) 437 if err != nil { 438 return err 439 } 440 441 mu.Lock() 442 defer mu.Unlock() 443 444 return calculateBlamelistDistances(results, suspects) 445 }) 446 447 if err := eg.Wait(); err != nil { 448 return nil, err 449 } 450 451 // Filter out suspects with a score of zero. 452 suspects = functools.Filter(suspects, func(c suspectCommit) bool { 453 return c.score() > 0 454 }) 455 456 functools.SortBy(suspects, func(c suspectCommit) int { 457 return -1 * c.score() 458 }) 459 if len(suspects) > 10 { 460 suspects = suspects[:10] 461 } 462 463 // Some data sources are expensive to query, so we only use them as 464 // additional data after the initial filtering has been done using cheaper data sources. 465 eg, _ = errgroup.WithContext(ctx) 466 eg.Go(func() error { 467 // The Gerrit API doesn't include changed files for the most recent 468 // revision by default, so we must explicitly request them. 469 opts := []gerritpb.QueryOption{ 470 gerritpb.QueryOption_ALL_FILES, 471 gerritpb.QueryOption_CURRENT_REVISION, 472 } 473 gerritEG, _ := errgroup.WithContext(ctx) 474 for i, suspect := range suspects { 475 // Make copies of the variables that can be safely used in the closure. 476 i, suspect := i, suspect 477 gerritEG.Go(func() error { 478 changeInfo, err := c.gerritClient.getChange(ctx, suspect.GerritChange, opts...) 479 if err != nil { 480 return err 481 } 482 mu.Lock() 483 suspects[i].ChangeInfo = changeInfo 484 mu.Unlock() 485 return nil 486 }) 487 } 488 return gerritEG.Wait() 489 }) 490 491 if err := eg.Wait(); err != nil { 492 return nil, err 493 } 494 495 // If all the remaining suspects have the same score, that indicates low 496 // confidence in each individual suspect since we can't distinguish 497 // between suspects. It's also just not very helpful to print a ranking 498 // of suspects where each one has the same score. 499 uniqueScores := make(map[int]struct{}) 500 for _, suspect := range suspects { 501 uniqueScores[suspect.score()] = struct{}{} 502 } 503 if len(uniqueScores) <= 1 { 504 suspects = nil 505 } 506 507 functools.SortBy(suspects, func(c suspectCommit) int { 508 return -1 * c.score() 509 }) 510 511 return suspects, nil 512 } 513 514 // clusterFailureModes takes a set of failed builds and attempts to cluster them 515 // into groups by common failure reason. A single build might be included in 516 // multiple clusters if it had multiple failure reasons. 517 // 518 // If *any* cluster has more than one build, then we'll only return clusters 519 // that appear with more than one build. Any failure mode that appears only once 520 // is much less likely to be significant than a failure mode that appears 521 // multiple times. 522 func clusterFailureModes(buildResults []buildResult) map[failureSignature][]buildResult { 523 clusters := make(map[failureSignature][]buildResult) 524 for _, br := range buildResults { 525 if len(br.FailedTests) == 0 { 526 continue 527 } 528 529 for _, test := range br.FailedTests { 530 sig := failureSignature{ 531 FailedTest: test.TestId, 532 } 533 for _, tag := range test.Tags { 534 if tag.Key == "gn_label" { 535 sig.TestGNLabel = tag.Value 536 } 537 } 538 if test.FailureReason != nil { 539 sig.FailureReason = test.FailureReason.PrimaryErrorMessage 540 } 541 clusters[sig] = append(clusters[sig], br) 542 } 543 } 544 545 minRequiredSize := 1 546 for _, cluster := range clusters { 547 if len(cluster) > 1 { 548 minRequiredSize = 2 549 break 550 } 551 } 552 553 for sig, buildResults := range clusters { 554 if len(buildResults) < minRequiredSize { 555 delete(clusters, sig) 556 } 557 } 558 559 return clusters 560 } 561 562 // integrationProjectName returns the name of the integration project that was 563 // used for the build's checkout, e.g. "integration". 564 func integrationProjectName(b *buildbucketpb.Build) (string, error) { 565 var checkoutInfo *structpb.Struct 566 if err := exe.ParseProperties(b.Output.Properties, map[string]any{ 567 "checkout_info": &checkoutInfo, 568 }); err != nil { 569 return "", err 570 } else if checkoutInfo == nil { 571 return "", fmt.Errorf("build %d has no `checkout_info` property", b.Id) 572 } 573 var projectName string 574 if err := exe.ParseProperties(checkoutInfo, map[string]any{ 575 "manifest_project": &projectName, 576 }); err != nil { 577 return "", err 578 } else if projectName == "" { 579 return "", fmt.Errorf("build %d has no `checkout_info.manifest_project` property", b.Id) 580 } 581 return projectName, nil 582 } 583 584 func gerritChangesEqual(c1, c2 *buildbucketpb.GerritChange) bool { 585 if c1 == nil || c2 == nil { 586 return false 587 } 588 return c1.Host == c2.Host && 589 c1.Project == c2.Project && 590 c1.Change == c2.Change 591 } 592 593 func commitToGerritChange(commit *git.Commit) (*buildbucketpb.GerritChange, error) { 594 lines := strings.Split(commit.Message, "\n") 595 changeURL := parseFooter(lines, "Original-Reviewed-on") 596 if changeURL == "" { 597 changeURL = parseFooter(lines, "Reviewed-on") 598 } 599 if changeURL == "" { 600 return nil, fmt.Errorf("no reviewed-on footer for commit %s", commit.Id) 601 } 602 return parseGerritChangeURL(changeURL) 603 } 604 605 func parseFooter(msgLines []string, footer string) string { 606 prefix := fmt.Sprintf("%s: ", footer) 607 for _, line := range msgLines { 608 if strings.HasPrefix(line, prefix) { 609 return strings.TrimPrefix(line, prefix) 610 } 611 } 612 return "" 613 } 614 615 func parseGerritChangeURL(changeURL string) (*buildbucketpb.GerritChange, error) { 616 u, err := url.Parse(changeURL) 617 if err != nil { 618 return nil, err 619 } 620 path := strings.TrimPrefix(u.Path, "/c/") 621 project, changeNumStr, ok := strings.Cut(path, "/+/") 622 if !ok { 623 return nil, fmt.Errorf("malformed gerrit URL: %q", changeURL) 624 } 625 626 var patchset int 627 changeNumStr, patchsetStr, ok := strings.Cut(changeNumStr, "/") 628 if ok { 629 patchset, err = strconv.Atoi(patchsetStr) 630 if err != nil { 631 return nil, err 632 } 633 } 634 changeNum, err := strconv.Atoi(changeNumStr) 635 if err != nil { 636 return nil, err 637 } 638 639 return &buildbucketpb.GerritChange{ 640 Host: u.Host, 641 Project: project, 642 Change: int64(changeNum), 643 Patchset: int64(patchset), 644 }, nil 645 } 646 647 type culpritOutput struct { 648 // Human-readable markdown output. 649 MarkdownOutput string `json:"markdown_output"` 650 651 Clusters []outputCluster 652 } 653 654 type outputCluster struct { 655 Signature failureSignature `json:"signature"` 656 Suspects []outputSuspect `json:"suspects"` 657 } 658 659 type outputSuspect struct { 660 suspectCommit 661 662 GerritURL string `json:"gerrit_url"` 663 ChangedFiles []string `json:"changed_files,omitempty"` 664 CommitSummary string `json:"commit_summary"` 665 Score int `json:"score"` 666 Features []culpritFeature `json:"features"` 667 }