github.com/GoogleCloudPlatform/testgrid@v0.0.174/pkg/summarizer/summary.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package summarizer provides a method to read state protos defined in a config an output summary protos. 18 package summarizer 19 20 import ( 21 "compress/zlib" 22 "context" 23 "errors" 24 "fmt" 25 "io" 26 "io/ioutil" 27 "net/url" 28 "path" 29 "regexp" 30 "sort" 31 "strconv" 32 "strings" 33 "sync" 34 "time" 35 36 "bitbucket.org/creachadair/stringset" 37 "cloud.google.com/go/storage" 38 "github.com/GoogleCloudPlatform/testgrid/config" 39 "github.com/GoogleCloudPlatform/testgrid/config/snapshot" 40 "github.com/GoogleCloudPlatform/testgrid/internal/result" 41 configpb "github.com/GoogleCloudPlatform/testgrid/pb/config" 42 statepb "github.com/GoogleCloudPlatform/testgrid/pb/state" 43 summarypb "github.com/GoogleCloudPlatform/testgrid/pb/summary" 44 statuspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status" 45 "github.com/GoogleCloudPlatform/testgrid/pkg/tabulator" 46 "github.com/GoogleCloudPlatform/testgrid/util" 47 "github.com/GoogleCloudPlatform/testgrid/util/gcs" 48 "github.com/GoogleCloudPlatform/testgrid/util/metrics" 49 "github.com/golang/protobuf/proto" 50 "github.com/sirupsen/logrus" 51 ) 52 53 // Metrics holds metrics relevant to the Updater. 
type Metrics struct {
	// Summarize tracks start/success/failure cycles of summarization passes.
	Summarize metrics.Cyclic
}

// CreateMetrics creates all the metrics that the Summarizer will use
// This should be called once
func CreateMetrics(factory metrics.Factory) *Metrics {
	return &Metrics{
		Summarize: factory.NewCyclic("summarizer"),
	}
}

// FeatureFlags aggregates the knobs to enable/disable certain features.
type FeatureFlags struct {
	// controls the acceptable flakiness calculation logic for dashboard tab
	AllowFuzzyFlakiness bool

	// allows ignoring columns with specific test statuses during summarization
	AllowIgnoredColumns bool

	// allows enforcing minimum number of runs for a dashboard tab
	AllowMinNumberOfRuns bool
}

// gridReader returns the grid content and metadata (last updated time, generation id)
type gridReader func(ctx context.Context) (io.ReadCloser, time.Time, int64, error)

// groupFinder returns the named group as well as reader for the grid state
type groupFinder func(dashboardName string, tab *configpb.DashboardTab) (*gcs.Path, *configpb.TestGroup, gridReader, error)

// lockDashboard touches the summary object at path at the given generation.
// When generation is 0 (object does not exist yet) it uploads a valid, empty
// DashboardSummary proto instead of zero bytes, so readers never see garbage.
// gcs.Touch fails with a precondition error if someone else won the race.
func lockDashboard(ctx context.Context, client gcs.ConditionalClient, path gcs.Path, generation int64) (*storage.ObjectAttrs, error) {
	var buf []byte
	if generation == 0 {
		// First write: serialize an empty summary as the initial content.
		var sum summarypb.DashboardSummary
		var err error
		buf, err = proto.Marshal(&sum)
		if err != nil {
			return nil, fmt.Errorf("marshal: %w", err)
		}
	}

	return gcs.Touch(ctx, client, path, generation, buf)
}

// Fixer should adjust the dashboard queue until the context expires.
type Fixer func(context.Context, *config.DashboardQueue) error
102 type UpdateOptions struct { 103 ConfigPath gcs.Path 104 Concurrency int 105 TabPathPrefix string 106 SummaryPathPrefix string 107 AllowedDashboards []string 108 Confirm bool 109 Features FeatureFlags 110 Freq time.Duration 111 } 112 113 // Update summary protos by reading the state protos defined in the config. 114 // 115 // Will use concurrency go routines to update dashboards in parallel. 116 // Setting dashboard will limit update to this dashboard. 117 // Will write summary proto when confirm is set. 118 func Update(ctx context.Context, client gcs.ConditionalClient, mets *Metrics, opts *UpdateOptions, fixers ...Fixer) error { 119 ctx, cancel := context.WithCancel(ctx) 120 defer cancel() 121 if opts.Concurrency < 1 { 122 return fmt.Errorf("concurrency must be positive, got: %d", opts.Concurrency) 123 } 124 log := logrus.WithField("config", opts.ConfigPath) 125 126 var q config.DashboardQueue 127 var cfg *snapshot.Config 128 129 allowed := stringset.New(opts.AllowedDashboards...) 130 fixSnapshot := func(newConfig *snapshot.Config) error { 131 baseLog := log 132 log := log.WithField("fixSnapshot()", true) 133 newConfig.Dashboards = filterDashboards(newConfig.Dashboards, allowed) 134 cfg = newConfig 135 136 dashCap := len(cfg.Dashboards) 137 paths := make([]gcs.Path, 0, dashCap) 138 dashboards := make([]*configpb.Dashboard, 0, dashCap) 139 for _, d := range cfg.Dashboards { 140 path, err := SummaryPath(opts.ConfigPath, opts.SummaryPathPrefix, d.Name) 141 if err != nil { 142 log.WithError(err).WithField("dashboard", d.Name).Error("Bad dashboard path") 143 } 144 paths = append(paths, *path) 145 dashboards = append(dashboards, d) 146 } 147 148 stats := gcs.Stat(ctx, client, 10, paths...) 
149 whens := make(map[string]time.Time, len(stats)) 150 var wg sync.WaitGroup 151 for i, stat := range stats { 152 name := dashboards[i].Name 153 path := paths[i] 154 log := log.WithField("path", path) 155 switch { 156 case stat.Attrs != nil: 157 whens[name] = stat.Attrs.Updated.Add(opts.Freq) 158 default: 159 if errors.Is(stat.Err, storage.ErrObjectNotExist) { 160 wg.Add(1) 161 go func() { 162 defer wg.Done() 163 _, err := lockDashboard(ctx, client, path, 0) 164 switch { 165 case gcs.IsPreconditionFailed(err): 166 log.WithError(err).Debug("Lost race to create initial summary") 167 case err != nil: 168 log.WithError(err).Error("Failed to lock initial summary") 169 default: 170 log.Info("Created initial summary") 171 } 172 }() 173 } else { 174 log.WithError(stat.Err).Info("Failed to stat") 175 } 176 whens[name] = time.Now() 177 } 178 } 179 180 wg.Wait() 181 182 q.Init(baseLog, dashboards, time.Now().Add(opts.Freq)) 183 if err := q.FixAll(whens, false); err != nil { 184 log.WithError(err).Error("Failed to fix all dashboards based on last update time") 185 } 186 return nil 187 } 188 189 log.Debug("Observing config...") 190 cfgChanged, err := snapshot.Observe(ctx, log, client, opts.ConfigPath, time.NewTicker(time.Minute).C) 191 if err != nil { 192 return fmt.Errorf("observe config: %w", err) 193 } 194 fixSnapshot(<-cfgChanged) // Bootstrap queue before use 195 196 var active stringset.Set 197 var waiting stringset.Set 198 var lock sync.Mutex 199 200 go func() { 201 fixCtx, fixCancel := context.WithCancel(ctx) 202 var fixWg sync.WaitGroup 203 fixAll := func() { 204 n := len(fixers) 205 log.WithField("fixers", n).Trace("Starting fixers on current dashboards...") 206 fixWg.Add(n) 207 for i, fix := range fixers { 208 go func(i int, fix Fixer) { 209 defer fixWg.Done() 210 if err := fix(fixCtx, &q); err != nil && !errors.Is(err, context.Canceled) { 211 log.WithError(err).WithField("fixer", i).Warning("Fixer failed") 212 } 213 }(i, fix) 214 } 215 log.Debug("Started fixers on 
current dashboards") 216 } 217 218 ticker := time.NewTicker(time.Minute) // TODO(fejta): subscribe to notifications 219 fixAll() 220 for { 221 lock.Lock() 222 activeDashboards := active.Elements() 223 lock.Unlock() 224 225 depth, next, when := q.Status() 226 log := log.WithFields(logrus.Fields{ 227 "depth": depth, 228 "active": activeDashboards, 229 }) 230 if next != nil { 231 log = log.WithField("next", *next) 232 } 233 delay := time.Since(when) 234 if delay < 0 { 235 delay = 0 236 log = log.WithField("sleep", -delay) 237 } 238 log = log.WithField("delay", delay.Round(time.Second)) 239 log.Info("Updating dashboards") 240 select { 241 case <-ctx.Done(): 242 ticker.Stop() 243 fixCancel() 244 fixWg.Wait() 245 return 246 case newConfig := <-cfgChanged: 247 log.Info("Configuration changed") 248 fixCancel() 249 fixWg.Wait() 250 fixCtx, fixCancel = context.WithCancel(ctx) 251 fixSnapshot(newConfig) 252 fixAll() 253 case <-ticker.C: 254 } 255 256 } 257 }() 258 259 dashboardNames := make(chan string) 260 261 // TODO(fejta): cache downloaded group? 
262 findGroup := func(dash string, tab *configpb.DashboardTab) (*gcs.Path, *configpb.TestGroup, gridReader, error) { 263 name := tab.TestGroupName 264 group := cfg.Groups[name] 265 if group == nil { 266 return nil, nil, nil, nil 267 } 268 groupPath, err := tabulator.TabStatePath(opts.ConfigPath, opts.TabPathPrefix, dash, tab.Name) 269 if err != nil { 270 return nil, group, nil, err 271 } 272 reader := func(ctx context.Context) (io.ReadCloser, time.Time, int64, error) { 273 return pathReader(ctx, client, *groupPath) 274 } 275 return groupPath, group, reader, nil 276 } 277 278 tabUpdater := tabUpdatePool(ctx, log, opts.Concurrency, opts.Features) 279 280 updateName := func(log *logrus.Entry, dashName string) (logrus.FieldLogger, bool, error) { 281 ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) 282 defer cancel() 283 dash := cfg.Dashboards[dashName] 284 if dash == nil { 285 return log, false, errors.New("dashboard not found") 286 } 287 log.Debug("Summarizing dashboard") 288 summaryPath, err := SummaryPath(opts.ConfigPath, opts.SummaryPathPrefix, dashName) 289 if err != nil { 290 return log, false, fmt.Errorf("summary path: %v", err) 291 } 292 sum, _, _, err := ReadSummary(ctx, client, *summaryPath) 293 if err != nil { 294 return log, false, fmt.Errorf("read %q: %v", *summaryPath, err) 295 } 296 297 if sum == nil { 298 sum = &summarypb.DashboardSummary{} 299 } 300 301 // TODO(fejta): refactor to note whether there is more work 302 more := updateDashboard(ctx, client, dash, sum, findGroup, tabUpdater) 303 304 var healthyTests int 305 var failures int 306 for _, tab := range sum.TabSummaries { 307 failures += len(tab.FailingTestSummaries) 308 if h := tab.Healthiness; h != nil { 309 healthyTests += len(h.Tests) 310 } 311 } 312 313 log = log.WithFields(logrus.Fields{ 314 "path": summaryPath, 315 "tabs": len(sum.TabSummaries), 316 "failures": failures, 317 "healthy-tests": healthyTests, 318 }) 319 if !opts.Confirm { 320 return log, more, nil 321 } 322 size, err := 
writeSummary(ctx, client, *summaryPath, sum) 323 log = log.WithField("bytes", size) 324 if err != nil { 325 return log, more, fmt.Errorf("write: %w", err) 326 } 327 return log, more, nil 328 } 329 330 var wg sync.WaitGroup 331 wg.Add(opts.Concurrency) 332 for i := 0; i < opts.Concurrency; i++ { 333 go func() { 334 defer wg.Done() 335 for dashName := range dashboardNames { 336 lock.Lock() 337 start := active.Add(dashName) 338 if !start { 339 waiting.Add(dashName) 340 } 341 lock.Unlock() 342 if !start { 343 continue 344 } 345 346 log := log.WithField("dashboard", dashName) 347 finish := mets.Summarize.Start() 348 if log, more, err := updateName(log, dashName); err != nil { 349 finish.Fail() 350 q.Fix(dashName, time.Now().Add(opts.Freq/2), false) 351 log.WithError(err).Error("Failed to summarize dashboard") 352 } else { 353 finish.Success() 354 if more { 355 q.Fix(dashName, time.Now(), false) 356 log = log.WithField("more", more) 357 } 358 log.Info("Summarized dashboard") 359 } 360 361 lock.Lock() 362 active.Discard(dashName) 363 restart := waiting.Discard(dashName) 364 lock.Unlock() 365 if restart { 366 q.Fix(dashName, time.Now(), false) 367 } 368 369 } 370 }() 371 } 372 defer wg.Wait() 373 defer close(dashboardNames) 374 375 return q.Send(ctx, dashboardNames, opts.Freq) 376 } 377 378 func filterDashboards(dashboards map[string]*configpb.Dashboard, allowed stringset.Set) map[string]*configpb.Dashboard { 379 if allowed.Len() == 0 { 380 return dashboards 381 } 382 383 for key, d := range dashboards { 384 if allowed.Contains(d.Name) { 385 continue 386 } 387 delete(dashboards, key) 388 } 389 return dashboards 390 } 391 392 var ( 393 normalizer = regexp.MustCompile(`[^a-z0-9]+`) 394 ) 395 396 // SummaryPath generates a summary GCS path for a given dashboard 397 func SummaryPath(g gcs.Path, prefix, dashboard string) (*gcs.Path, error) { 398 // ''.join(c for c in n.lower() if c is alphanumeric 399 name := "summary-" + normalizer.ReplaceAllString(strings.ToLower(dashboard), 
"") 400 fullName := path.Join(prefix, name) 401 u, err := url.Parse(fullName) 402 if err != nil { 403 return nil, fmt.Errorf("parse url: %w", err) 404 } 405 np, err := g.ResolveReference(u) 406 if err != nil { 407 return nil, fmt.Errorf("resolve reference: %w", err) 408 } 409 if np.Bucket() != g.Bucket() { 410 return nil, fmt.Errorf("dashboard %s should not change bucket", fullName) 411 } 412 return np, nil 413 } 414 415 // ReadSummary provides the dashboard summary as defined in summary.proto. 416 // IMPORTANT: Returns nil if the object doesn't exist. 417 // Returns an error iff wasn't read or serialized properly. 418 func ReadSummary(ctx context.Context, client gcs.Client, path gcs.Path) (*summarypb.DashboardSummary, time.Time, int64, error) { 419 r, modified, gen, err := pathReader(ctx, client, path) 420 if errors.Is(err, storage.ErrObjectNotExist) { 421 return nil, time.Time{}, 0, nil 422 } else if err != nil { 423 return nil, time.Time{}, 0, fmt.Errorf("open: %w", err) 424 } 425 buf, err := ioutil.ReadAll(r) 426 if err != nil { 427 return nil, time.Time{}, 0, fmt.Errorf("read: %w", err) 428 } 429 var sum summarypb.DashboardSummary 430 431 if err := proto.Unmarshal(buf, &sum); err != nil { 432 return nil, time.Time{}, 0, fmt.Errorf("unmarhsal: %v", err) 433 } 434 435 return &sum, modified, gen, nil 436 } 437 438 func writeSummary(ctx context.Context, client gcs.Client, path gcs.Path, sum *summarypb.DashboardSummary) (int, error) { 439 buf, err := proto.Marshal(sum) 440 if err != nil { 441 return 0, fmt.Errorf("marshal: %v", err) 442 } 443 _, err = client.Upload(ctx, path, buf, gcs.DefaultACL, gcs.NoCache) 444 return len(buf), err 445 } 446 447 func statPaths(ctx context.Context, log logrus.FieldLogger, client gcs.Stater, paths ...gcs.Path) []*storage.ObjectAttrs { 448 return gcs.StatExisting(ctx, log, client, paths...) 449 } 450 451 // pathReader returns a reader for the specified path and last modified, generation metadata. 
// pathReader returns a reader for the specified path and last modified, generation metadata.
func pathReader(ctx context.Context, client gcs.Client, path gcs.Path) (io.ReadCloser, time.Time, int64, error) {
	r, attrs, err := client.Open(ctx, path)
	if err != nil {
		return nil, time.Time{}, 0, fmt.Errorf("client.Open(): %w", err)
	}
	if attrs == nil {
		// No metadata available; the caller still gets a usable reader
		// with zero-value modified time and generation.
		return r, time.Time{}, 0, nil
	}
	return r, attrs.LastModified, attrs.Generation, nil
}

// tabStatus creates a minimal UNKNOWN tab summary carrying msg as both the
// alert and the status text.
func tabStatus(dashName, tabName, msg string) *summarypb.DashboardTabSummary {
	return &summarypb.DashboardTabSummary{
		DashboardName:    dashName,
		DashboardTabName: tabName,
		OverallStatus:    summarypb.DashboardTabSummary_UNKNOWN,
		Alert:            msg,
		Status:           msg,
	}
}

// updateDashboard will summarize all the tabs.
//
// Errors summarizing tabs are displayed on the summary for the dashboard.
//
// Returns true when there is more work to do.
func updateDashboard(ctx context.Context, client gcs.Stater, dash *configpb.Dashboard, sum *summarypb.DashboardSummary, findGroup groupFinder, tabUpdater *tabUpdater) bool {
	log := logrus.WithField("dashboard", dash.Name)

	// Reserve only half of the remaining deadline for summarization so the
	// caller has time left to write results back.
	var graceCtx context.Context
	if when, ok := ctx.Deadline(); ok {
		dur := time.Until(when) / 2
		var cancel func()
		graceCtx, cancel = context.WithTimeout(ctx, dur)
		defer cancel()
	} else {
		graceCtx = ctx
	}

	// First collect the previously summarized tabs.
	tabSummaries := make(map[string]*summarypb.DashboardTabSummary, len(sum.TabSummaries))
	for _, tabSum := range sum.TabSummaries {
		tabSummaries[tabSum.DashboardTabName] = tabSum
	}

	// Now create info about which tabs we need to summarize and where the grid state lives.
	type groupInfo struct {
		group  *configpb.TestGroup
		reader gridReader
		tabs   []*configpb.DashboardTab // every tab backed by this grid state
	}
	groupInfos := make(map[gcs.Path]*groupInfo, len(dash.DashboardTab))

	var paths []gcs.Path
	for _, tab := range dash.DashboardTab {
		groupPath, group, groupReader, err := findGroup(dash.Name, tab)
		if err != nil {
			tabSummaries[tab.Name] = tabStatus(dash.Name, tab.Name, fmt.Sprintf("Error reading group info: %v", err))
			continue
		}
		if group == nil {
			tabSummaries[tab.Name] = tabStatus(dash.Name, tab.Name, fmt.Sprintf("Test group does not exist: %q", tab.TestGroupName))
			continue
		}
		info := groupInfos[*groupPath]
		if info == nil {
			info = &groupInfo{
				group:  group,
				reader: groupReader, // TODO(fejta): optimize (only read once)
			}
			paths = append(paths, *groupPath)
			groupInfos[*groupPath] = info
		}
		info.tabs = append(info.tabs, tab)
	}

	// Check the attributes of the grid states.
	attrs := gcs.StatExisting(ctx, log, client, paths...)

	delays := make(map[gcs.Path]float64, len(paths))

	// determine how much behind each summary is
	for i, path := range paths {
		a := attrs[i]
		for _, tab := range groupInfos[path].tabs {
			// TODO(fejta): optimize (only read once)
			name := tab.Name
			sum := tabSummaries[name]
			if a == nil {
				// Grid state object is missing entirely.
				tabSummaries[name] = tabStatus(dash.Name, name, noRuns)
				delays[path] = -1
			} else if sum == nil {
				// Never summarized before: score it a day behind so new
				// tabs sort toward the front of the work list.
				tabSummaries[name] = tabStatus(dash.Name, name, "Newly created tab")
				delays[path] = float64(24 * time.Hour / time.Second)
				log.WithField("tab", name).Debug("Found new tab")
			} else {
				// Seconds between the grid's last write and the summary's.
				delays[path] = float64(attrs[i].Updated.Unix()) - tabSummaries[name].LastUpdateTimestamp
			}
		}
	}

	// sort by delay (most delayed first)
	sort.SliceStable(paths, func(i, j int) bool {
		return delays[paths[i]] > delays[paths[j]]
	})

	// Now let's update the tab summaries in parallel, starting with most delayed

	// future pairs a tab name with a blocking accessor for its new summary.
	type future struct {
		log    *logrus.Entry
		name   string
		result func() (*summarypb.DashboardTabSummary, error)
	}

	// channel to receive updated tabs
	ch := make(chan future)

	// request an update for each tab, starting with the least recently modified one.
	go func() {
		defer close(ch)
		// Hold the pool's lock so one dashboard's requests are enqueued
		// contiguously rather than interleaved with other dashboards'.
		tabUpdater.lock.Lock()
		defer tabUpdater.lock.Unlock()
		for _, path := range paths {
			info := groupInfos[path]
			log := log.WithField("group", path)
			for _, tab := range info.tabs {
				log := log.WithField("tab", tab.Name)
				delay := delays[path]
				if delay == 0 {
					log.Debug("Already up to date")
					continue
				} else if delay == -1 {
					// NOTE(review): this still falls through and requests an
					// update even though no grid state exists — confirm that
					// is intentional.
					log.Debug("No grid state to process")
				}
				log = log.WithField("delay", delay)
				if err := graceCtx.Err(); err != nil {
					// Out of grace time; leave remaining tabs for next cycle.
					log.WithError(err).Info("Interrupted")
					return
				}
				log.Debug("Requesting tab summary update")
				f := tabUpdater.update(ctx, tab, info.group, info.reader)
				select {
				case <-ctx.Done():
					return
				case ch <- future{log, tab.Name, f}:
				}
			}
		}
	}()

	// Update the summary for any tabs that give a response
	for fut := range ch {
		tabName := fut.name
		log := fut.log
		log.Trace("Waiting for updated tab summary response")
		s, err := fut.result()
		if err != nil {
			s = tabStatus(dash.Name, tabName, fmt.Sprintf("Error attempting to summarize tab: %v", err))
			log = log.WithError(err)
		} else {
			s.DashboardName = dash.Name
		}
		tabSummaries[tabName] = s
		log.Trace("Updated tab summary")
	}

	// assemble them back into the dashboard summary.
	sum.TabSummaries = make([]*summarypb.DashboardTabSummary, len(dash.DashboardTab))
	for idx, tab := range dash.DashboardTab {
		sum.TabSummaries[idx] = tabSummaries[tab.Name]
	}

	// More work remains iff we ran out of grace time above.
	return graceCtx.Err() != nil
}

// tabUpdater serializes access to a worker pool that computes tab summaries.
type tabUpdater struct {
	lock   sync.Mutex
	update func(context.Context, *configpb.DashboardTab, *configpb.TestGroup, gridReader) func() (*summarypb.DashboardTabSummary, error)
}

// tabUpdatePool starts concurrency workers that process tab-summary requests
// until poolCtx is done, returning a tabUpdater whose update function enqueues
// a request and returns a future-style accessor that blocks for the result.
func tabUpdatePool(poolCtx context.Context, log *logrus.Entry, concurrency int, features FeatureFlags) *tabUpdater {
	// request carries one tab-summary computation; wg is done once sum/err
	// have been populated by a worker.
	type request struct {
		ctx   context.Context
		tab   *configpb.DashboardTab
		group *configpb.TestGroup
		read  gridReader
		sum   *summarypb.DashboardTabSummary
		err   error
		wg    sync.WaitGroup
	}

	ch := make(chan *request, concurrency)

	var wg sync.WaitGroup
	wg.Add(concurrency)
	log = log.WithField("concurrency", concurrency)
	log.Info("Starting up worker pool")

	for i := 0; i < concurrency; i++ {
		go func() {
			defer wg.Done()
			for req := range ch {
				req.sum, req.err = updateTab(req.ctx, req.tab, req.group, req.read, features)
				req.wg.Done()
			}
		}()
	}

	// Closer: when the pool context ends, close the channel and wait for
	// workers to drain.
	go func() {
		<-poolCtx.Done()
		log.Info("Shutting down worker pool")
		close(ch)
		wg.Wait()
		log.Info("Worker pool stopped")
	}()

	updateTabViaPool := func(ctx context.Context, tab *configpb.DashboardTab, group *configpb.TestGroup, groupReader gridReader) func() (*summarypb.DashboardTabSummary, error) {
		req := &request{
			ctx:   ctx,
			tab:   tab,
			group: group,
			read:  groupReader,
		}
		req.wg.Add(1)
		select {
		case <-ctx.Done():
			return func() (*summarypb.DashboardTabSummary, error) { return nil, ctx.Err() }
		case ch <- req:
			return func() (*summarypb.DashboardTabSummary, error) {
				req.wg.Wait()
				return req.sum, req.err
			}
		}
	}

	return &tabUpdater{
		update: updateTabViaPool,
	}
}
// staleHours returns the configured number of stale hours for the tab.
func staleHours(tab *configpb.DashboardTab) time.Duration {
	if tab.AlertOptions == nil {
		return 0
	}
	return time.Duration(tab.AlertOptions.AlertStaleResultsHours) * time.Hour
}

// updateTab reads the latest grid state for the tab and summarizes it.
func updateTab(ctx context.Context, tab *configpb.DashboardTab, group *configpb.TestGroup, groupReader gridReader, features FeatureFlags) (*summarypb.DashboardTabSummary, error) {
	groupName := tab.TestGroupName
	grid, mod, _, err := readGrid(ctx, groupReader) // TODO(fejta): track gen
	if err != nil {
		return nil, fmt.Errorf("load %s: %v", groupName, err)
	}

	var healthiness *summarypb.HealthinessInfo
	if shouldRunHealthiness(tab) {
		// TODO (itsazhuhere@): Change to rely on YAML defaults rather than consts
		interval := int(tab.HealthAnalysisOptions.DaysOfAnalysis)
		if interval <= 0 {
			interval = DefaultInterval
		}
		healthiness = getHealthinessForInterval(grid, tab.Name, time.Now(), interval)
	}

	// Restrict analysis to rows with recent results and drop method-level rows.
	recent := recentColumns(tab, group)
	grid.Rows = recentRows(grid.Rows, recent)

	grid.Rows = filterMethods(grid.Rows)

	latest, latestSeconds := latestRun(grid.Columns)
	alert := staleAlert(mod, latest, staleHours(tab), len(grid.Rows))
	failures := failingTestSummaries(grid.Rows, tab.GetOpenTestTemplate(), group.GetGcsPrefix(), group.GetColumnHeader())
	colsCells, brokenState := gridMetrics(len(grid.Columns), grid.Rows, recent, tab.BrokenColumnThreshold, features, tab.GetStatusCustomizationOptions())
	metrics := tabMetrics(colsCells)
	tabStatus := overallStatus(grid, recent, alert, brokenState, failures, features, colsCells, tab.GetStatusCustomizationOptions())
	return &summarypb.DashboardTabSummary{
		DashboardTabName:     tab.Name,
		LastUpdateTimestamp:  float64(mod.Unix()),
		LastRunTimestamp:     float64(latestSeconds),
		Alert:                alert,
		FailingTestSummaries: failures,
		OverallStatus:        tabStatus,
		Status:               statusMessage(colsCells, tabStatus, tab.GetStatusCustomizationOptions()),
		LatestGreen:          latestGreen(grid, group.UseKubernetesClient),
		BugUrl:               tab.GetOpenBugTemplate().GetUrl(),
		Healthiness:          healthiness,
		LinkedIssues:         allLinkedIssues(grid.Rows),
		SummaryMetrics:       metrics,
	}, nil
}

// readGrid downloads and deserializes the current test group state.
func readGrid(ctx context.Context, reader gridReader) (*statepb.Grid, time.Time, int64, error) {
	var t time.Time
	r, mod, gen, err := reader(ctx)
	if err != nil {
		return nil, t, 0, fmt.Errorf("open: %w", err)
	}
	defer r.Close()
	// Grid state is stored zlib-compressed.
	zlibReader, err := zlib.NewReader(r)
	if err != nil {
		return nil, t, 0, fmt.Errorf("decompress: %v", err)
	}
	buf, err := ioutil.ReadAll(zlibReader)
	if err != nil {
		return nil, t, 0, fmt.Errorf("read: %v", err)
	}
	var g statepb.Grid
	if err = proto.Unmarshal(buf, &g); err != nil {
		return nil, t, 0, fmt.Errorf("parse: %v", err)
	}
	return &g, mod, gen, nil
}

// recentColumns returns the configured number of recent columns to summarize, or 5.
func recentColumns(tab *configpb.DashboardTab, group *configpb.TestGroup) int {
	return firstFilled(tab.NumColumnsRecent, group.NumColumnsRecent, 5)
}
774 func firstFilled(values ...int32) int { 775 for _, v := range values { 776 if v != 0 { 777 return int(v) 778 } 779 } 780 return 0 781 } 782 783 // recentRows returns the subset of rows with at least one recent result 784 func recentRows(in []*statepb.Row, recent int) []*statepb.Row { 785 var rows []*statepb.Row 786 for _, r := range in { 787 if r.Results == nil { 788 continue 789 } 790 if statuspb.TestStatus(r.Results[0]) == statuspb.TestStatus_NO_RESULT && int(r.Results[1]) >= recent { 791 continue 792 } 793 rows = append(rows, r) 794 } 795 return rows 796 } 797 798 // filterMethods returns the subset of rows that do not have test method names 799 func filterMethods(rows []*statepb.Row) []*statepb.Row { 800 var filtered []*statepb.Row 801 for _, r := range rows { 802 if !isValidTestName(r.Id) || !isValidTestName(r.Name) { 803 continue 804 } 805 filtered = append(filtered, r) 806 } 807 return filtered 808 } 809 810 // latestRun returns the Time (and seconds-since-epoch) of the most recent run. 811 func latestRun(columns []*statepb.Column) (time.Time, int64) { 812 if len(columns) > 0 { 813 if start := int64(columns[0].Started); start > 0 { 814 second := start / 1000 815 mills := start % 1000 816 return time.Unix(second, mills*1e6), second 817 } 818 } 819 return time.Time{}, 0 820 } 821 822 const noRuns = "no completed results" 823 824 // staleAlert returns an explanatory message if the latest results are stale. 825 func staleAlert(mod, ran time.Time, stale time.Duration, rows int) string { 826 if mod.IsZero() { 827 return "no stored results" 828 } 829 if stale == 0 { 830 return "" 831 } 832 if ran.IsZero() || rows == 0 { // Has no columns and/or no rows. 
833 return noRuns 834 } 835 now := time.Now() 836 if dur := now.Sub(mod); dur > stale { 837 return fmt.Sprintf("data has not changed since %s (%s old)", mod, dur.Truncate(15*time.Minute)) 838 } 839 if dur := now.Sub(ran); dur > stale { 840 return fmt.Sprintf("latest column from %s (%s old)", ran, dur.Truncate(15*time.Minute)) 841 } 842 return "" 843 } 844 845 // failingTestSummaries returns details for every row with an active alert. 846 func failingTestSummaries(rows []*statepb.Row, template *configpb.LinkTemplate, gcsPrefix string, columnHeader []*configpb.TestGroup_ColumnHeader) []*summarypb.FailingTestSummary { 847 var failures []*summarypb.FailingTestSummary 848 for _, row := range rows { 849 if row.AlertInfo == nil { 850 continue 851 } 852 alert := row.AlertInfo 853 sum := summarypb.FailingTestSummary{ 854 DisplayName: row.Name, 855 TestName: row.Id, 856 FailBuildId: alert.FailBuildId, 857 LatestFailBuildId: alert.LatestFailBuildId, 858 FailCount: alert.FailCount, 859 FailureMessage: alert.FailureMessage, 860 PassBuildId: alert.PassBuildId, 861 // TODO(fejta): better build info 862 BuildLink: alert.BuildLink, 863 BuildLinkText: alert.BuildLinkText, 864 BuildUrlText: alert.BuildUrlText, 865 LinkedBugs: row.Issues, 866 FailTestLink: buildFailLink(alert.FailTestId, row.Id), 867 LatestFailTestLink: buildFailLink(alert.LatestFailTestId, row.Id), 868 Properties: alert.Properties, 869 CustomColumnHeaders: alert.CustomColumnHeaders, 870 HotlistIds: alert.HotlistIds, 871 EmailAddresses: alert.EmailAddresses, 872 } 873 if alert.PassTime != nil { 874 sum.PassTimestamp = float64(alert.PassTime.Seconds) 875 } 876 if alert.FailTime != nil { 877 sum.FailTimestamp = float64(alert.FailTime.Seconds) 878 } 879 880 propertyToColumnHeader := make(map[string]string) 881 for i := 0; i < len(columnHeader); i++ { 882 if columnHeader[i].Label != "" { 883 propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].Label 884 } else if columnHeader[i].Property != "" { 885 
				propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].Property
			} else {
				propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].ConfigurationValue
			}
		}

		// Verify what the links for alerts would be with the new method.
		// NOTE(review): the links computed here are only logged for
		// verification; `sum` is appended below regardless of the outcome.
		failLink := testResultLink(template, alert.GetProperties(), alert.GetFailTestId(), row.GetId(), alert.GetFailBuildId(), gcsPrefix, propertyToColumnHeader, alert.CustomColumnHeaders)
		latestFailLink := testResultLink(template, alert.GetProperties(), alert.GetLatestFailTestId(), row.GetId(), alert.GetLatestFailBuildId(), gcsPrefix, propertyToColumnHeader, alert.CustomColumnHeaders)
		log := logrus.WithField("failLink", failLink).WithField("latestFailLink", latestFailLink)
		if failLink == "" || latestFailLink == "" {
			log.Warning("Failed to create failure link.")
		} else if !strings.HasPrefix(failLink, "http") || !strings.HasPrefix(latestFailLink, "http") {
			log.Warning("Failure link does not include scheme.")
		} else {
			log.Info("Created failure links.")
		}

		failures = append(failures, &sum)
	}
	return failures
}

// buildFailLink creates a search link: a space-separated pair of the
// path-escaped test ID and target name.
// TODO(#134): Build proper url for both internal and external jobs
func buildFailLink(testID, target string) string {
	return fmt.Sprintf("%s %s", url.PathEscape(testID), url.PathEscape(target))
}

// testResultLink expands the tab's open_test_template into a concrete link for
// a single test result, filling in every token the template references from
// the alert properties, the row, and the custom column headers.
// Returns "" when template expansion fails (the error is logged).
func testResultLink(template *configpb.LinkTemplate, properties map[string]string, testID, target, buildID, gcsPrefix string, propertyToColumnHeader map[string]string, customColumnHeaders map[string]string) string {
	// Return the result of open_test_template for the tab.
	// This assumes that open_test_template uses a limited set of tokens (since it's not in the context of a browser).
	// Assume that the following are valid: <gcs_prefix>, <test-name>, <workflow-id>, <workflow-name>, <test-id>, <build ID>
	// TODO: Ensure workflow-id, workflow-name are added in alerts.
	tokens := util.Tokens(template)
	parameters := map[string]string{}
	for _, token := range tokens {
		switch token {
		case util.GcsPrefix:
			parameters[util.GcsPrefix] = gcsPrefix
		case util.TestName:
			parameters[util.TestName] = target
		case util.WorkflowID:
			// Workflow values come from the alert's properties; if absent the
			// token is simply left out of the parameter map.
			if workflowID, ok := properties["workflow-id"]; ok {
				parameters[util.WorkflowID] = workflowID
			}
		case util.WorkflowName:
			if WorkflowName, ok := properties["workflow-name"]; ok {
				parameters[util.WorkflowName] = WorkflowName
			}
		case util.TestID:
			parameters[util.TestID] = testID
		case util.BuildID:
			parameters[util.BuildID] = buildID
		case util.CustomColumnRe.FindString(token):
			// This case fires when the token itself matches the custom-column
			// pattern: FindString returns the matched text, which equals the
			// token on a (full) match. Map it through the column-header name
			// built by the caller to look up the alert's header value.
			if v, ok := customColumnHeaders[propertyToColumnHeader[token]]; ok {
				parameters[token] = v
			}
		default:
			// Didn't match any simple tokens, check if it's a property.
			// e.g. "<foo>" looks up properties["foo"].
			trimmedToken := strings.NewReplacer("<", "", ">", "").Replace(token)
			if v, ok := properties[trimmedToken]; ok {
				parameters[token] = v
			}
		}
	}
	link, err := util.ExpandTemplate(template, parameters)
	if err != nil {
		logrus.WithError(err).WithField("template", template).WithField("parameters", parameters).Error("Error expanding link template.")
		return ""
	}
	return link
}

// overallStatus determines whether the tab is stale, failing, flaky or healthy.
//
// Tabs are:
//   BROKEN - called with brokenState (typically when most rows are red)
//   STALE - called with a stale message string (typically when most recent column is old)
//   FAIL - there is at least one alert
//   ACCEPTABLE - the ratio of (valid) failing to total columns is less than configured threshold
//   FLAKY - at least one recent column has failing cells
//   PENDING - number of valid columns is less than minimum # of runs required
//   PASS - all recent columns are entirely green
func overallStatus(grid *statepb.Grid, recent int, stale string, brokenState bool, alerts []*summarypb.FailingTestSummary, features FeatureFlags, colCells gridStats, opts *configpb.DashboardTabStatusCustomizationOptions) summarypb.DashboardTabSummary_TabStatus {
	if brokenState {
		return summarypb.DashboardTabSummary_BROKEN
	}
	if stale != "" {
		return summarypb.DashboardTabSummary_STALE
	}
	if len(alerts) > 0 {
		return summarypb.DashboardTabSummary_FAIL
	}
	// safeguard PENDING status behind a flag
	if features.AllowMinNumberOfRuns && opts.GetMinAcceptableRuns() > int32(colCells.completedCols-colCells.ignoredCols) {
		return summarypb.DashboardTabSummary_PENDING
	}

	results := result.Map(grid.Rows)
	moreCols := true
	var passing bool
	var flaky bool
	// We want to look at recent columns, skipping over any that are still running.
	for moreCols && recent > 0 {
		moreCols = false
		var foundCol bool
		var running bool
		var ignored bool
		// One result off each column since we don't know which
		// cells are running ahead of time.
		for _, resultF := range results {
			r, ok := resultF()
			if !ok {
				continue
			}
			moreCols = true
			if r == statuspb.TestStatus_RUNNING {
				running = true
				// not break because we need to pull this column's
				// result off every row's channel.
				continue
			}
			if features.AllowIgnoredColumns && result.Ignored(r, opts) {
				ignored = true
				continue
			}
			r = coalesceResult(r, result.IgnoreRunning)
			if r == statuspb.TestStatus_NO_RESULT {
				continue
			}
			// any failure in a recent column results in flaky
			if r != statuspb.TestStatus_PASS {
				flaky = true
				continue
			}
			foundCol = true
		}

		// Running columns are unfinished and therefore should
		// not count as "recent" until they finish.
		if running {
			continue
		}

		// Ignored columns are ignored from tab status but they do count as recent.
		// Failures in this col are ignored too (flaky is reset).
		if ignored {
			recent--
			flaky = false
			continue
		}

		if flaky {
			if isAcceptable(colCells, opts, features) {
				return summarypb.DashboardTabSummary_ACCEPTABLE
			}
			return summarypb.DashboardTabSummary_FLAKY
		}

		if foundCol {
			passing = true
			recent--
		}
	}

	if passing {
		return summarypb.DashboardTabSummary_PASS
	}
	return summarypb.DashboardTabSummary_UNKNOWN
}

// isAcceptable determines if the flakiness is within acceptable range.
// Return true iff the feature is enabled, `max_acceptable_flakiness` is set,
// and the observed flakiness does not exceed the configured maximum.
func isAcceptable(colCells gridStats, opts *configpb.DashboardTabStatusCustomizationOptions, features FeatureFlags) bool {
	// Acceptable iff the per-column pass rate over non-ignored completed
	// columns is at least (100 - max acceptable flakiness) percent.
	// NOTE(review): if completedCols == ignoredCols the division is over zero
	// (float Inf/NaN) — the comparison then fails and we return false; confirm
	// that is the intended behavior for all-ignored grids.
	if features.AllowFuzzyFlakiness && opts.GetMaxAcceptableFlakiness() > 0 &&
		100*float64(colCells.passingCols)/float64(colCells.completedCols-colCells.ignoredCols) >= float64(100-opts.GetMaxAcceptableFlakiness()) {
		return true
	}

	return false
}

// allLinkedIssues returns the deduplicated set of issue IDs referenced by any
// of the given rows (order is unspecified).
func allLinkedIssues(rows []*statepb.Row) []string {
	issueSet := make(map[string]bool)
	for _, row := range rows {
		for _, issueID := range row.Issues {
			issueSet[issueID] = true
		}
	}
	linkedIssues := []string{}
	for issueID := range issueSet {
		linkedIssues = append(linkedIssues, issueID)
	}
	return linkedIssues
}

// gridStats aggregates columnar and cellular metrics as a struct.
type gridStats struct {
	passingCols   int // completed, non-ignored columns with passes and no failures
	completedCols int // columns with at least one pass/fail/other cell
	ignoredCols   int // columns containing at least one ignored result
	passingCells  int // individual passing cells
	filledCells   int // individual cells with any pass/fail/other result
}

// gridMetrics culminates the set of metrics related to a section of the Grid:
// it tallies cell- and column-level pass/fail counts over up to `recent` of
// the first `cols` columns, and reports whether the grid looks "broken"
// (failure ratio in some column exceeds brokenThreshold).
func gridMetrics(cols int, rows []*statepb.Row, recent int, brokenThreshold float32, features FeatureFlags, opts *configpb.DashboardTabStatusCustomizationOptions) (gridStats, bool) {
	results := result.Map(rows)
	var passingCells int
	var filledCells int
	var passingCols int
	var completedCols int
	var ignoredCols int
	var brokenState bool

	for idx := 0; idx < cols; idx++ {
		if idx >= recent {
			break
		}
		var passes int
		var failures int
		var ignores int
		var other int
		for _, iter := range results {
			// TODO(fejta): fail old running cols
			r, _ := iter()
			// check for ignores first
			if features.AllowIgnoredColumns && result.Ignored(r, opts) {
				ignores++
			}
			// proceed with the rest of calculations
			status := coalesceResult(r, result.IgnoreRunning)
			if result.Passing(status) {
				passes++
				passingCells++
				filledCells++
			} else if result.Failing(status) {
				failures++
				filledCells++
			} else if status != statuspb.TestStatus_NO_RESULT {
				other++
				filledCells++
			}
		}

		if passes+failures+other > 0 {
			completedCols++
		}
		// only one of those can be true
		if ignores > 0 {
			ignoredCols++
		} else if failures == 0 && passes > 0 {
			passingCols++
		}

		// A column is "broken" when its failure ratio exceeds the threshold.
		if passes+failures > 0 && brokenThreshold > 0 {
			if float32(failures)/float32(passes+failures+other) > brokenThreshold {
				brokenState = true
			}
		}
	}

	metrics := gridStats{
		passingCols:   passingCols,
		completedCols: completedCols,
		ignoredCols:   ignoredCols,
		passingCells:  passingCells,
		filledCells:   filledCells,
	}

	return metrics, brokenState
}

// tabMetrics adds a subset of the column metrics to the summary proto.
func tabMetrics(colCells gridStats) *summarypb.DashboardTabSummaryMetrics {
	return &summarypb.DashboardTabSummaryMetrics{
		PassingColumns:   int32(colCells.passingCols),
		CompletedColumns: int32(colCells.completedCols),
		IgnoredColumns:   int32(colCells.ignoredCols),
	}
}

// fmtStatus renders the tab statistics into a human-readable message.
// NOTE(review): callers must guarantee filledCells > 0 (statusMessage does);
// flakyCent may still be NaN when every completed column is ignored — confirm
// that case cannot reach the ACCEPTABLE branch.
func fmtStatus(colCells gridStats, tabStatus summarypb.DashboardTabSummary_TabStatus, opts *configpb.DashboardTabStatusCustomizationOptions) string {
	colCent := 100 * float64(colCells.passingCols) / float64(colCells.completedCols)
	cellCent := 100 * float64(colCells.passingCells) / float64(colCells.filledCells)
	flakyCent := 100 * float64(colCells.completedCols-colCells.ignoredCols-colCells.passingCols) / float64(colCells.completedCols-colCells.ignoredCols)
	// put tab stats on a single line and additional status info on the next
	statusMsg := fmt.Sprintf("Tab stats: %d of %d (%.1f%%) recent columns passed (%d of %d or %.1f%% cells)", colCells.passingCols, colCells.completedCols, colCent, colCells.passingCells, colCells.filledCells, cellCent)
	if colCells.ignoredCols > 0 {
		statusMsg += fmt.Sprintf(". %d columns ignored", colCells.ignoredCols)
	}
	// add status info message for certain cases
	if tabStatus == summarypb.DashboardTabSummary_PENDING {
		statusMsg += "\nStatus info: Not enough runs"
	} else if tabStatus == summarypb.DashboardTabSummary_ACCEPTABLE {
		statusMsg += fmt.Sprintf("\nStatus info: Recent flakiness (%.1f%%) over valid columns is within configured acceptable level of %.1f%%.", flakyCent, opts.GetMaxAcceptableFlakiness())
	}
	return statusMsg
}

// statusMessage produces lines like:
//   Tab stats: 3 out of 5 (60.0%) recent columns passed (35 of 50 or 70.0% cells). 1 columns ignored.
//   (OPTIONAL) Status info: Recent flakiness (40.0%) flakiness is within configured acceptable level of X
//   OR Status info: Not enough runs
func statusMessage(colCells gridStats, tabStatus summarypb.DashboardTabSummary_TabStatus, opts *configpb.DashboardTabStatusCustomizationOptions) string {
	if colCells.filledCells == 0 {
		return noRuns
	}
	return fmtStatus(colCells, tabStatus, opts)
}

// noGreens is returned by latestGreen when no recent column is fully passing.
const noGreens = "no recent greens"

// latestGreen finds the ID for the most recent column with all passing rows.
//
// Returns the build, first extra column header and/or a no recent greens message.
1201 func latestGreen(grid *statepb.Grid, useFirstExtra bool) string { 1202 results := result.Map(grid.Rows) 1203 for _, col := range grid.Columns { 1204 var failures bool 1205 var passes bool 1206 for _, resultF := range results { 1207 r, _ := resultF() 1208 result := coalesceResult(r, result.ShowRunning) 1209 if result == statuspb.TestStatus_PASS { 1210 passes = true 1211 } 1212 if result == statuspb.TestStatus_FLAKY || result == statuspb.TestStatus_FAIL || result == statuspb.TestStatus_UNKNOWN { 1213 failures = true 1214 } 1215 } 1216 if failures || !passes { 1217 continue 1218 } 1219 if useFirstExtra && len(col.Extra) > 0 { 1220 return col.Extra[0] 1221 } 1222 return col.Build 1223 } 1224 return noGreens 1225 } 1226 1227 func getHealthinessForInterval(grid *statepb.Grid, tabName string, currentTime time.Time, interval int) *summarypb.HealthinessInfo { 1228 now := goBackDays(0, currentTime) 1229 oneInterval := goBackDays(interval, currentTime) 1230 twoIntervals := goBackDays(2*interval, currentTime) 1231 1232 healthiness := CalculateHealthiness(grid, oneInterval, now, tabName) 1233 pastHealthiness := CalculateHealthiness(grid, twoIntervals, oneInterval, tabName) 1234 CalculateTrend(healthiness, pastHealthiness) 1235 1236 healthiness.PreviousFlakiness = []float32{pastHealthiness.AverageFlakiness} 1237 return healthiness 1238 } 1239 1240 func goBackDays(days int, currentTime time.Time) int { 1241 // goBackDays gets the time intervals for our flakiness report. 1242 // The old version of this function would round to the 12am of the given day. 1243 // Since the new flakiness report will be run with Summarizer and therefore more often 1244 // than the once-a-week of the old flakiness report, we will not round to 12am anymore. 
1245 date := currentTime.AddDate(0, 0, -1*days) 1246 intDate := int(date.Unix()) 1247 return intDate 1248 } 1249 1250 func shouldRunHealthiness(tab *configpb.DashboardTab) bool { 1251 if tab.HealthAnalysisOptions == nil { 1252 return false 1253 } 1254 return tab.HealthAnalysisOptions.Enable 1255 } 1256 1257 // coalesceResult reduces the result to PASS, NO_RESULT, FAIL or FLAKY. 1258 func coalesceResult(rowResult statuspb.TestStatus, ignoreRunning bool) statuspb.TestStatus { 1259 return result.Coalesce(rowResult, ignoreRunning) 1260 }