github.com/GoogleCloudPlatform/testgrid@v0.0.174/pkg/summarizer/summary.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package summarizer provides a method to read state protos defined in a config an output summary protos.
    18  package summarizer
    19  
    20  import (
    21  	"compress/zlib"
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	"net/url"
    28  	"path"
    29  	"regexp"
    30  	"sort"
    31  	"strconv"
    32  	"strings"
    33  	"sync"
    34  	"time"
    35  
    36  	"bitbucket.org/creachadair/stringset"
    37  	"cloud.google.com/go/storage"
    38  	"github.com/GoogleCloudPlatform/testgrid/config"
    39  	"github.com/GoogleCloudPlatform/testgrid/config/snapshot"
    40  	"github.com/GoogleCloudPlatform/testgrid/internal/result"
    41  	configpb "github.com/GoogleCloudPlatform/testgrid/pb/config"
    42  	statepb "github.com/GoogleCloudPlatform/testgrid/pb/state"
    43  	summarypb "github.com/GoogleCloudPlatform/testgrid/pb/summary"
    44  	statuspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status"
    45  	"github.com/GoogleCloudPlatform/testgrid/pkg/tabulator"
    46  	"github.com/GoogleCloudPlatform/testgrid/util"
    47  	"github.com/GoogleCloudPlatform/testgrid/util/gcs"
    48  	"github.com/GoogleCloudPlatform/testgrid/util/metrics"
    49  	"github.com/golang/protobuf/proto"
    50  	"github.com/sirupsen/logrus"
    51  )
    52  
// Metrics holds metrics relevant to the Summarizer.
type Metrics struct {
	// Summarize tracks success/failure cycles of summarization passes.
	Summarize metrics.Cyclic
}
    57  
    58  // CreateMetrics creates all the metrics that the Summarizer will use
    59  // This should be called once
    60  func CreateMetrics(factory metrics.Factory) *Metrics {
    61  	return &Metrics{
    62  		Summarize: factory.NewCyclic("summarizer"),
    63  	}
    64  }
    65  
// FeatureFlags aggregates the knobs to enable/disable certain features.
type FeatureFlags struct {
	// AllowFuzzyFlakiness controls the acceptable flakiness calculation logic for a dashboard tab.
	AllowFuzzyFlakiness bool

	// AllowIgnoredColumns allows ignoring columns with specific test statuses during summarization.
	AllowIgnoredColumns bool

	// AllowMinNumberOfRuns allows enforcing a minimum number of runs for a dashboard tab.
	AllowMinNumberOfRuns bool
}
    77  
// gridReader returns the grid content as a ReadCloser along with its
// metadata (last updated time, generation id).
type gridReader func(ctx context.Context) (io.ReadCloser, time.Time, int64, error)
    80  
// groupFinder returns the path to the tab's grid state, the named test
// group, and a reader for the grid state.
type groupFinder func(dashboardName string, tab *configpb.DashboardTab) (*gcs.Path, *configpb.TestGroup, gridReader, error)
    83  
    84  func lockDashboard(ctx context.Context, client gcs.ConditionalClient, path gcs.Path, generation int64) (*storage.ObjectAttrs, error) {
    85  	var buf []byte
    86  	if generation == 0 {
    87  		var sum summarypb.DashboardSummary
    88  		var err error
    89  		buf, err = proto.Marshal(&sum)
    90  		if err != nil {
    91  			return nil, fmt.Errorf("marshal: %w", err)
    92  		}
    93  	}
    94  
    95  	return gcs.Touch(ctx, client, path, generation, buf)
    96  }
    97  
// Fixer should adjust the dashboard queue until the context expires.
// Errors other than context.Canceled are logged (not fatal) by Update.
type Fixer func(context.Context, *config.DashboardQueue) error
   100  
// UpdateOptions aggregates the Update function parameter into a single structure.
type UpdateOptions struct {
	// ConfigPath is the GCS path of the TestGrid configuration.
	ConfigPath gcs.Path
	// Concurrency is the number of goroutines updating dashboards in parallel; must be positive.
	Concurrency int
	// TabPathPrefix is used when resolving tab grid-state paths.
	TabPathPrefix string
	// SummaryPathPrefix is used when resolving dashboard summary paths.
	SummaryPathPrefix string
	// AllowedDashboards, when non-empty, limits updates to these dashboard names.
	AllowedDashboards []string
	// Confirm controls whether summaries are actually written out.
	Confirm bool
	// Features toggles optional summarization behaviors.
	Features FeatureFlags
	// Freq is the target update frequency per dashboard.
	Freq time.Duration
}
   112  
   113  // Update summary protos by reading the state protos defined in the config.
   114  //
   115  // Will use concurrency go routines to update dashboards in parallel.
   116  // Setting dashboard will limit update to this dashboard.
   117  // Will write summary proto when confirm is set.
   118  func Update(ctx context.Context, client gcs.ConditionalClient, mets *Metrics, opts *UpdateOptions, fixers ...Fixer) error {
   119  	ctx, cancel := context.WithCancel(ctx)
   120  	defer cancel()
   121  	if opts.Concurrency < 1 {
   122  		return fmt.Errorf("concurrency must be positive, got: %d", opts.Concurrency)
   123  	}
   124  	log := logrus.WithField("config", opts.ConfigPath)
   125  
   126  	var q config.DashboardQueue
   127  	var cfg *snapshot.Config
   128  
   129  	allowed := stringset.New(opts.AllowedDashboards...)
   130  	fixSnapshot := func(newConfig *snapshot.Config) error {
   131  		baseLog := log
   132  		log := log.WithField("fixSnapshot()", true)
   133  		newConfig.Dashboards = filterDashboards(newConfig.Dashboards, allowed)
   134  		cfg = newConfig
   135  
   136  		dashCap := len(cfg.Dashboards)
   137  		paths := make([]gcs.Path, 0, dashCap)
   138  		dashboards := make([]*configpb.Dashboard, 0, dashCap)
   139  		for _, d := range cfg.Dashboards {
   140  			path, err := SummaryPath(opts.ConfigPath, opts.SummaryPathPrefix, d.Name)
   141  			if err != nil {
   142  				log.WithError(err).WithField("dashboard", d.Name).Error("Bad dashboard path")
   143  			}
   144  			paths = append(paths, *path)
   145  			dashboards = append(dashboards, d)
   146  		}
   147  
   148  		stats := gcs.Stat(ctx, client, 10, paths...)
   149  		whens := make(map[string]time.Time, len(stats))
   150  		var wg sync.WaitGroup
   151  		for i, stat := range stats {
   152  			name := dashboards[i].Name
   153  			path := paths[i]
   154  			log := log.WithField("path", path)
   155  			switch {
   156  			case stat.Attrs != nil:
   157  				whens[name] = stat.Attrs.Updated.Add(opts.Freq)
   158  			default:
   159  				if errors.Is(stat.Err, storage.ErrObjectNotExist) {
   160  					wg.Add(1)
   161  					go func() {
   162  						defer wg.Done()
   163  						_, err := lockDashboard(ctx, client, path, 0)
   164  						switch {
   165  						case gcs.IsPreconditionFailed(err):
   166  							log.WithError(err).Debug("Lost race to create initial summary")
   167  						case err != nil:
   168  							log.WithError(err).Error("Failed to lock initial summary")
   169  						default:
   170  							log.Info("Created initial summary")
   171  						}
   172  					}()
   173  				} else {
   174  					log.WithError(stat.Err).Info("Failed to stat")
   175  				}
   176  				whens[name] = time.Now()
   177  			}
   178  		}
   179  
   180  		wg.Wait()
   181  
   182  		q.Init(baseLog, dashboards, time.Now().Add(opts.Freq))
   183  		if err := q.FixAll(whens, false); err != nil {
   184  			log.WithError(err).Error("Failed to fix all dashboards based on last update time")
   185  		}
   186  		return nil
   187  	}
   188  
   189  	log.Debug("Observing config...")
   190  	cfgChanged, err := snapshot.Observe(ctx, log, client, opts.ConfigPath, time.NewTicker(time.Minute).C)
   191  	if err != nil {
   192  		return fmt.Errorf("observe config: %w", err)
   193  	}
   194  	fixSnapshot(<-cfgChanged) // Bootstrap queue before use
   195  
   196  	var active stringset.Set
   197  	var waiting stringset.Set
   198  	var lock sync.Mutex
   199  
   200  	go func() {
   201  		fixCtx, fixCancel := context.WithCancel(ctx)
   202  		var fixWg sync.WaitGroup
   203  		fixAll := func() {
   204  			n := len(fixers)
   205  			log.WithField("fixers", n).Trace("Starting fixers on current dashboards...")
   206  			fixWg.Add(n)
   207  			for i, fix := range fixers {
   208  				go func(i int, fix Fixer) {
   209  					defer fixWg.Done()
   210  					if err := fix(fixCtx, &q); err != nil && !errors.Is(err, context.Canceled) {
   211  						log.WithError(err).WithField("fixer", i).Warning("Fixer failed")
   212  					}
   213  				}(i, fix)
   214  			}
   215  			log.Debug("Started fixers on current dashboards")
   216  		}
   217  
   218  		ticker := time.NewTicker(time.Minute) // TODO(fejta): subscribe to notifications
   219  		fixAll()
   220  		for {
   221  			lock.Lock()
   222  			activeDashboards := active.Elements()
   223  			lock.Unlock()
   224  
   225  			depth, next, when := q.Status()
   226  			log := log.WithFields(logrus.Fields{
   227  				"depth":  depth,
   228  				"active": activeDashboards,
   229  			})
   230  			if next != nil {
   231  				log = log.WithField("next", *next)
   232  			}
   233  			delay := time.Since(when)
   234  			if delay < 0 {
   235  				delay = 0
   236  				log = log.WithField("sleep", -delay)
   237  			}
   238  			log = log.WithField("delay", delay.Round(time.Second))
   239  			log.Info("Updating dashboards")
   240  			select {
   241  			case <-ctx.Done():
   242  				ticker.Stop()
   243  				fixCancel()
   244  				fixWg.Wait()
   245  				return
   246  			case newConfig := <-cfgChanged:
   247  				log.Info("Configuration changed")
   248  				fixCancel()
   249  				fixWg.Wait()
   250  				fixCtx, fixCancel = context.WithCancel(ctx)
   251  				fixSnapshot(newConfig)
   252  				fixAll()
   253  			case <-ticker.C:
   254  			}
   255  
   256  		}
   257  	}()
   258  
   259  	dashboardNames := make(chan string)
   260  
   261  	// TODO(fejta): cache downloaded group?
   262  	findGroup := func(dash string, tab *configpb.DashboardTab) (*gcs.Path, *configpb.TestGroup, gridReader, error) {
   263  		name := tab.TestGroupName
   264  		group := cfg.Groups[name]
   265  		if group == nil {
   266  			return nil, nil, nil, nil
   267  		}
   268  		groupPath, err := tabulator.TabStatePath(opts.ConfigPath, opts.TabPathPrefix, dash, tab.Name)
   269  		if err != nil {
   270  			return nil, group, nil, err
   271  		}
   272  		reader := func(ctx context.Context) (io.ReadCloser, time.Time, int64, error) {
   273  			return pathReader(ctx, client, *groupPath)
   274  		}
   275  		return groupPath, group, reader, nil
   276  	}
   277  
   278  	tabUpdater := tabUpdatePool(ctx, log, opts.Concurrency, opts.Features)
   279  
   280  	updateName := func(log *logrus.Entry, dashName string) (logrus.FieldLogger, bool, error) {
   281  		ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
   282  		defer cancel()
   283  		dash := cfg.Dashboards[dashName]
   284  		if dash == nil {
   285  			return log, false, errors.New("dashboard not found")
   286  		}
   287  		log.Debug("Summarizing dashboard")
   288  		summaryPath, err := SummaryPath(opts.ConfigPath, opts.SummaryPathPrefix, dashName)
   289  		if err != nil {
   290  			return log, false, fmt.Errorf("summary path: %v", err)
   291  		}
   292  		sum, _, _, err := ReadSummary(ctx, client, *summaryPath)
   293  		if err != nil {
   294  			return log, false, fmt.Errorf("read %q: %v", *summaryPath, err)
   295  		}
   296  
   297  		if sum == nil {
   298  			sum = &summarypb.DashboardSummary{}
   299  		}
   300  
   301  		// TODO(fejta): refactor to note whether there is more work
   302  		more := updateDashboard(ctx, client, dash, sum, findGroup, tabUpdater)
   303  
   304  		var healthyTests int
   305  		var failures int
   306  		for _, tab := range sum.TabSummaries {
   307  			failures += len(tab.FailingTestSummaries)
   308  			if h := tab.Healthiness; h != nil {
   309  				healthyTests += len(h.Tests)
   310  			}
   311  		}
   312  
   313  		log = log.WithFields(logrus.Fields{
   314  			"path":          summaryPath,
   315  			"tabs":          len(sum.TabSummaries),
   316  			"failures":      failures,
   317  			"healthy-tests": healthyTests,
   318  		})
   319  		if !opts.Confirm {
   320  			return log, more, nil
   321  		}
   322  		size, err := writeSummary(ctx, client, *summaryPath, sum)
   323  		log = log.WithField("bytes", size)
   324  		if err != nil {
   325  			return log, more, fmt.Errorf("write: %w", err)
   326  		}
   327  		return log, more, nil
   328  	}
   329  
   330  	var wg sync.WaitGroup
   331  	wg.Add(opts.Concurrency)
   332  	for i := 0; i < opts.Concurrency; i++ {
   333  		go func() {
   334  			defer wg.Done()
   335  			for dashName := range dashboardNames {
   336  				lock.Lock()
   337  				start := active.Add(dashName)
   338  				if !start {
   339  					waiting.Add(dashName)
   340  				}
   341  				lock.Unlock()
   342  				if !start {
   343  					continue
   344  				}
   345  
   346  				log := log.WithField("dashboard", dashName)
   347  				finish := mets.Summarize.Start()
   348  				if log, more, err := updateName(log, dashName); err != nil {
   349  					finish.Fail()
   350  					q.Fix(dashName, time.Now().Add(opts.Freq/2), false)
   351  					log.WithError(err).Error("Failed to summarize dashboard")
   352  				} else {
   353  					finish.Success()
   354  					if more {
   355  						q.Fix(dashName, time.Now(), false)
   356  						log = log.WithField("more", more)
   357  					}
   358  					log.Info("Summarized dashboard")
   359  				}
   360  
   361  				lock.Lock()
   362  				active.Discard(dashName)
   363  				restart := waiting.Discard(dashName)
   364  				lock.Unlock()
   365  				if restart {
   366  					q.Fix(dashName, time.Now(), false)
   367  				}
   368  
   369  			}
   370  		}()
   371  	}
   372  	defer wg.Wait()
   373  	defer close(dashboardNames)
   374  
   375  	return q.Send(ctx, dashboardNames, opts.Freq)
   376  }
   377  
   378  func filterDashboards(dashboards map[string]*configpb.Dashboard, allowed stringset.Set) map[string]*configpb.Dashboard {
   379  	if allowed.Len() == 0 {
   380  		return dashboards
   381  	}
   382  
   383  	for key, d := range dashboards {
   384  		if allowed.Contains(d.Name) {
   385  			continue
   386  		}
   387  		delete(dashboards, key)
   388  	}
   389  	return dashboards
   390  }
   391  
var (
	// normalizer matches runs of characters that are not lowercase
	// alphanumerics; used to sanitize dashboard names into object names.
	normalizer = regexp.MustCompile(`[^a-z0-9]+`)
)
   395  
   396  // SummaryPath generates a summary GCS path for a given dashboard
   397  func SummaryPath(g gcs.Path, prefix, dashboard string) (*gcs.Path, error) {
   398  	// ''.join(c for c in n.lower() if c is alphanumeric
   399  	name := "summary-" + normalizer.ReplaceAllString(strings.ToLower(dashboard), "")
   400  	fullName := path.Join(prefix, name)
   401  	u, err := url.Parse(fullName)
   402  	if err != nil {
   403  		return nil, fmt.Errorf("parse url: %w", err)
   404  	}
   405  	np, err := g.ResolveReference(u)
   406  	if err != nil {
   407  		return nil, fmt.Errorf("resolve reference: %w", err)
   408  	}
   409  	if np.Bucket() != g.Bucket() {
   410  		return nil, fmt.Errorf("dashboard %s should not change bucket", fullName)
   411  	}
   412  	return np, nil
   413  }
   414  
   415  // ReadSummary provides the dashboard summary as defined in summary.proto.
   416  // IMPORTANT: Returns nil if the object doesn't exist.
   417  // Returns an error iff wasn't read or serialized properly.
   418  func ReadSummary(ctx context.Context, client gcs.Client, path gcs.Path) (*summarypb.DashboardSummary, time.Time, int64, error) {
   419  	r, modified, gen, err := pathReader(ctx, client, path)
   420  	if errors.Is(err, storage.ErrObjectNotExist) {
   421  		return nil, time.Time{}, 0, nil
   422  	} else if err != nil {
   423  		return nil, time.Time{}, 0, fmt.Errorf("open: %w", err)
   424  	}
   425  	buf, err := ioutil.ReadAll(r)
   426  	if err != nil {
   427  		return nil, time.Time{}, 0, fmt.Errorf("read: %w", err)
   428  	}
   429  	var sum summarypb.DashboardSummary
   430  
   431  	if err := proto.Unmarshal(buf, &sum); err != nil {
   432  		return nil, time.Time{}, 0, fmt.Errorf("unmarhsal: %v", err)
   433  	}
   434  
   435  	return &sum, modified, gen, nil
   436  }
   437  
   438  func writeSummary(ctx context.Context, client gcs.Client, path gcs.Path, sum *summarypb.DashboardSummary) (int, error) {
   439  	buf, err := proto.Marshal(sum)
   440  	if err != nil {
   441  		return 0, fmt.Errorf("marshal: %v", err)
   442  	}
   443  	_, err = client.Upload(ctx, path, buf, gcs.DefaultACL, gcs.NoCache)
   444  	return len(buf), err
   445  }
   446  
   447  func statPaths(ctx context.Context, log logrus.FieldLogger, client gcs.Stater, paths ...gcs.Path) []*storage.ObjectAttrs {
   448  	return gcs.StatExisting(ctx, log, client, paths...)
   449  }
   450  
   451  // pathReader returns a reader for the specified path and last modified, generation metadata.
   452  func pathReader(ctx context.Context, client gcs.Client, path gcs.Path) (io.ReadCloser, time.Time, int64, error) {
   453  	r, attrs, err := client.Open(ctx, path)
   454  	if err != nil {
   455  		return nil, time.Time{}, 0, fmt.Errorf("client.Open(): %w", err)
   456  	}
   457  	if attrs == nil {
   458  		return r, time.Time{}, 0, nil
   459  	}
   460  	return r, attrs.LastModified, attrs.Generation, nil
   461  }
   462  
   463  func tabStatus(dashName, tabName, msg string) *summarypb.DashboardTabSummary {
   464  	return &summarypb.DashboardTabSummary{
   465  		DashboardName:    dashName,
   466  		DashboardTabName: tabName,
   467  		OverallStatus:    summarypb.DashboardTabSummary_UNKNOWN,
   468  		Alert:            msg,
   469  		Status:           msg,
   470  	}
   471  }
   472  
// updateDashboard will summarize all the tabs.
//
// Errors summarizing tabs are displayed on the summary for the dashboard.
//
// Returns true when there is more work to do (the grace period expired
// before every tab could be summarized).
func updateDashboard(ctx context.Context, client gcs.Stater, dash *configpb.Dashboard, sum *summarypb.DashboardSummary, findGroup groupFinder, tabUpdater *tabUpdater) bool {
	log := logrus.WithField("dashboard", dash.Name)

	// Reserve half the remaining deadline as a grace period: stop requesting
	// new tab updates once it expires so in-flight work can still complete.
	var graceCtx context.Context
	if when, ok := ctx.Deadline(); ok {
		dur := time.Until(when) / 2
		var cancel func()
		graceCtx, cancel = context.WithTimeout(ctx, dur)
		defer cancel()
	} else {
		graceCtx = ctx
	}

	// First collect the previously summarized tabs.
	tabSummaries := make(map[string]*summarypb.DashboardTabSummary, len(sum.TabSummaries))
	for _, tabSum := range sum.TabSummaries {
		tabSummaries[tabSum.DashboardTabName] = tabSum
	}

	// Now create info about which tabs we need to summarize and where the grid state lives.
	type groupInfo struct {
		group  *configpb.TestGroup
		reader gridReader
		tabs   []*configpb.DashboardTab
	}
	groupInfos := make(map[gcs.Path]*groupInfo, len(dash.DashboardTab))

	var paths []gcs.Path
	for _, tab := range dash.DashboardTab {
		groupPath, group, groupReader, err := findGroup(dash.Name, tab)
		if err != nil {
			tabSummaries[tab.Name] = tabStatus(dash.Name, tab.Name, fmt.Sprintf("Error reading group info: %v", err))
			continue
		}
		if group == nil {
			tabSummaries[tab.Name] = tabStatus(dash.Name, tab.Name, fmt.Sprintf("Test group does not exist: %q", tab.TestGroupName))
			continue
		}
		// Multiple tabs may share the same grid state; group them by path.
		info := groupInfos[*groupPath]
		if info == nil {
			info = &groupInfo{
				group:  group,
				reader: groupReader, // TODO(fejta): optimize (only read once)
			}
			paths = append(paths, *groupPath)
			groupInfos[*groupPath] = info
		}
		info.tabs = append(info.tabs, tab)
	}

	// Check the attributes of the grid states.
	attrs := gcs.StatExisting(ctx, log, client, paths...)

	delays := make(map[gcs.Path]float64, len(paths))

	// determine how much behind each summary is
	// (seconds; -1 marks paths with no grid state at all)
	for i, path := range paths {
		a := attrs[i]
		for _, tab := range groupInfos[path].tabs {
			// TODO(fejta): optimize (only read once)
			name := tab.Name
			sum := tabSummaries[name]
			if a == nil {
				tabSummaries[name] = tabStatus(dash.Name, name, noRuns)
				delays[path] = -1
			} else if sum == nil {
				// Never summarized before: pretend it is a day behind so
				// new tabs sort near the front of the queue.
				tabSummaries[name] = tabStatus(dash.Name, name, "Newly created tab")
				delays[path] = float64(24 * time.Hour / time.Second)
				log.WithField("tab", name).Debug("Found new tab")
			} else {
				// NOTE(review): when several tabs share a path, the last
				// tab's delay wins — confirm this is intended.
				delays[path] = float64(attrs[i].Updated.Unix()) - tabSummaries[name].LastUpdateTimestamp
			}
		}
	}

	// sort by delay (most delayed first)
	sort.SliceStable(paths, func(i, j int) bool {
		return delays[paths[i]] > delays[paths[j]]
	})

	// Now let's update the tab summaries in parallel, starting with most delayed

	// future pairs a tab name with a function that blocks until its updated
	// summary (or error) is available.
	type future struct {
		log    *logrus.Entry
		name   string
		result func() (*summarypb.DashboardTabSummary, error)
	}

	// channel to receive updated tabs
	ch := make(chan future)

	// request an update for each tab, starting with the least recently modified one.
	go func() {
		defer close(ch)
		// Holding tabUpdater.lock serializes this dashboard's submissions
		// to the shared worker pool.
		tabUpdater.lock.Lock()
		defer tabUpdater.lock.Unlock()
		for _, path := range paths {
			info := groupInfos[path]
			log := log.WithField("group", path)
			for _, tab := range info.tabs {
				log := log.WithField("tab", tab.Name)
				delay := delays[path]
				if delay == 0 {
					log.Debug("Already up to date")
					continue
				} else if delay == -1 {
					log.Debug("No grid state to process")
				}
				log = log.WithField("delay", delay)
				if err := graceCtx.Err(); err != nil {
					// Grace period expired: leave remaining tabs for the
					// next cycle (the caller reports "more work" below).
					log.WithError(err).Info("Interrupted")
					return
				}
				log.Debug("Requesting tab summary update")
				f := tabUpdater.update(ctx, tab, info.group, info.reader)
				select {
				case <-ctx.Done():
					return
				case ch <- future{log, tab.Name, f}:
				}
			}
		}
	}()

	// Update the summary for any tabs that give a response
	for fut := range ch {
		tabName := fut.name
		log := fut.log
		log.Trace("Waiting for updated tab summary response")
		s, err := fut.result()
		if err != nil {
			s = tabStatus(dash.Name, tabName, fmt.Sprintf("Error attempting to summarize tab: %v", err))
			log = log.WithError(err)
		} else {
			s.DashboardName = dash.Name
		}
		tabSummaries[tabName] = s
		log.Trace("Updated tab summary")
	}

	// assemble them back into the dashboard summary (in config tab order).
	sum.TabSummaries = make([]*summarypb.DashboardTabSummary, len(dash.DashboardTab))
	for idx, tab := range dash.DashboardTab {
		sum.TabSummaries[idx] = tabSummaries[tab.Name]
	}

	return graceCtx.Err() != nil
}
   626  
// tabUpdater submits tab-summary work to a shared worker pool.
type tabUpdater struct {
	// lock serializes request submission; updateDashboard holds it while
	// enqueueing a whole dashboard's tabs.
	lock sync.Mutex
	// update enqueues a tab-summary request and returns a future that
	// blocks until the pool produces the result (or the context expires).
	update func(context.Context, *configpb.DashboardTab, *configpb.TestGroup, gridReader) func() (*summarypb.DashboardTabSummary, error)
}
   631  
// tabUpdatePool starts a fixed pool of concurrency workers that compute tab
// summaries, returning a tabUpdater whose update function enqueues requests.
//
// The pool shuts down (request channel closed, workers drained) once
// poolCtx is canceled.
func tabUpdatePool(poolCtx context.Context, log *logrus.Entry, concurrency int, features FeatureFlags) *tabUpdater {
	// request carries one tab-summary job; wg is signaled when sum/err are
	// filled in by a worker.
	type request struct {
		ctx   context.Context
		tab   *configpb.DashboardTab
		group *configpb.TestGroup
		read  gridReader
		sum   *summarypb.DashboardTabSummary
		err   error
		wg    sync.WaitGroup
	}

	ch := make(chan *request, concurrency)

	var wg sync.WaitGroup
	wg.Add(concurrency)
	log = log.WithField("concurrency", concurrency)
	log.Info("Starting up worker pool")

	// Workers: pull requests until ch closes, compute the summary, and
	// signal the per-request WaitGroup.
	for i := 0; i < concurrency; i++ {
		go func() {
			defer wg.Done()
			for req := range ch {
				req.sum, req.err = updateTab(req.ctx, req.tab, req.group, req.read, features)
				req.wg.Done()
			}
		}()
	}

	// Shut the pool down once poolCtx expires.
	// NOTE(review): updateTabViaPool below may send on ch after this close
	// if a caller's ctx outlives poolCtx — confirm callers always derive
	// their ctx from poolCtx.
	go func() {
		<-poolCtx.Done()
		log.Info("Shutting down worker pool")
		close(ch)
		wg.Wait()
		log.Info("Worker pool stopped")
	}()

	// updateTabViaPool enqueues a request and returns a future that blocks
	// until a worker completes it; when ctx is already done, the future
	// immediately reports ctx.Err().
	updateTabViaPool := func(ctx context.Context, tab *configpb.DashboardTab, group *configpb.TestGroup, groupReader gridReader) func() (*summarypb.DashboardTabSummary, error) {
		req := &request{
			ctx:   ctx,
			tab:   tab,
			group: group,
			read:  groupReader,
		}
		req.wg.Add(1)
		select {
		case <-ctx.Done():
			return func() (*summarypb.DashboardTabSummary, error) { return nil, ctx.Err() }
		case ch <- req:
			return func() (*summarypb.DashboardTabSummary, error) {
				req.wg.Wait()
				return req.sum, req.err
			}
		}
	}

	return &tabUpdater{
		update: updateTabViaPool,
	}
}
   691  
   692  // staleHours returns the configured number of stale hours for the tab.
   693  func staleHours(tab *configpb.DashboardTab) time.Duration {
   694  	if tab.AlertOptions == nil {
   695  		return 0
   696  	}
   697  	return time.Duration(tab.AlertOptions.AlertStaleResultsHours) * time.Hour
   698  }
   699  
   700  // updateTab reads the latest grid state for the tab and summarizes it.
   701  func updateTab(ctx context.Context, tab *configpb.DashboardTab, group *configpb.TestGroup, groupReader gridReader, features FeatureFlags) (*summarypb.DashboardTabSummary, error) {
   702  	groupName := tab.TestGroupName
   703  	grid, mod, _, err := readGrid(ctx, groupReader) // TODO(fejta): track gen
   704  	if err != nil {
   705  		return nil, fmt.Errorf("load %s: %v", groupName, err)
   706  	}
   707  
   708  	var healthiness *summarypb.HealthinessInfo
   709  	if shouldRunHealthiness(tab) {
   710  		// TODO (itsazhuhere@): Change to rely on YAML defaults rather than consts
   711  		interval := int(tab.HealthAnalysisOptions.DaysOfAnalysis)
   712  		if interval <= 0 {
   713  			interval = DefaultInterval
   714  		}
   715  		healthiness = getHealthinessForInterval(grid, tab.Name, time.Now(), interval)
   716  	}
   717  
   718  	recent := recentColumns(tab, group)
   719  	grid.Rows = recentRows(grid.Rows, recent)
   720  
   721  	grid.Rows = filterMethods(grid.Rows)
   722  
   723  	latest, latestSeconds := latestRun(grid.Columns)
   724  	alert := staleAlert(mod, latest, staleHours(tab), len(grid.Rows))
   725  	failures := failingTestSummaries(grid.Rows, tab.GetOpenTestTemplate(), group.GetGcsPrefix(), group.GetColumnHeader())
   726  	colsCells, brokenState := gridMetrics(len(grid.Columns), grid.Rows, recent, tab.BrokenColumnThreshold, features, tab.GetStatusCustomizationOptions())
   727  	metrics := tabMetrics(colsCells)
   728  	tabStatus := overallStatus(grid, recent, alert, brokenState, failures, features, colsCells, tab.GetStatusCustomizationOptions())
   729  	return &summarypb.DashboardTabSummary{
   730  		DashboardTabName:     tab.Name,
   731  		LastUpdateTimestamp:  float64(mod.Unix()),
   732  		LastRunTimestamp:     float64(latestSeconds),
   733  		Alert:                alert,
   734  		FailingTestSummaries: failures,
   735  		OverallStatus:        tabStatus,
   736  		Status:               statusMessage(colsCells, tabStatus, tab.GetStatusCustomizationOptions()),
   737  		LatestGreen:          latestGreen(grid, group.UseKubernetesClient),
   738  		BugUrl:               tab.GetOpenBugTemplate().GetUrl(),
   739  		Healthiness:          healthiness,
   740  		LinkedIssues:         allLinkedIssues(grid.Rows),
   741  		SummaryMetrics:       metrics,
   742  	}, nil
   743  }
   744  
   745  // readGrid downloads and deserializes the current test group state.
   746  func readGrid(ctx context.Context, reader gridReader) (*statepb.Grid, time.Time, int64, error) {
   747  	var t time.Time
   748  	r, mod, gen, err := reader(ctx)
   749  	if err != nil {
   750  		return nil, t, 0, fmt.Errorf("open: %w", err)
   751  	}
   752  	defer r.Close()
   753  	zlibReader, err := zlib.NewReader(r)
   754  	if err != nil {
   755  		return nil, t, 0, fmt.Errorf("decompress: %v", err)
   756  	}
   757  	buf, err := ioutil.ReadAll(zlibReader)
   758  	if err != nil {
   759  		return nil, t, 0, fmt.Errorf("read: %v", err)
   760  	}
   761  	var g statepb.Grid
   762  	if err = proto.Unmarshal(buf, &g); err != nil {
   763  		return nil, t, 0, fmt.Errorf("parse: %v", err)
   764  	}
   765  	return &g, mod, gen, nil
   766  }
   767  
   768  // recentColumns returns the configured number of recent columns to summarize, or 5.
   769  func recentColumns(tab *configpb.DashboardTab, group *configpb.TestGroup) int {
   770  	return firstFilled(tab.NumColumnsRecent, group.NumColumnsRecent, 5)
   771  }
   772  
   773  // firstFilled returns the first non-empty value, or zero.
   774  func firstFilled(values ...int32) int {
   775  	for _, v := range values {
   776  		if v != 0 {
   777  			return int(v)
   778  		}
   779  	}
   780  	return 0
   781  }
   782  
   783  // recentRows returns the subset of rows with at least one recent result
   784  func recentRows(in []*statepb.Row, recent int) []*statepb.Row {
   785  	var rows []*statepb.Row
   786  	for _, r := range in {
   787  		if r.Results == nil {
   788  			continue
   789  		}
   790  		if statuspb.TestStatus(r.Results[0]) == statuspb.TestStatus_NO_RESULT && int(r.Results[1]) >= recent {
   791  			continue
   792  		}
   793  		rows = append(rows, r)
   794  	}
   795  	return rows
   796  }
   797  
   798  // filterMethods returns the subset of rows that do not have test method names
   799  func filterMethods(rows []*statepb.Row) []*statepb.Row {
   800  	var filtered []*statepb.Row
   801  	for _, r := range rows {
   802  		if !isValidTestName(r.Id) || !isValidTestName(r.Name) {
   803  			continue
   804  		}
   805  		filtered = append(filtered, r)
   806  	}
   807  	return filtered
   808  }
   809  
   810  // latestRun returns the Time (and seconds-since-epoch) of the most recent run.
   811  func latestRun(columns []*statepb.Column) (time.Time, int64) {
   812  	if len(columns) > 0 {
   813  		if start := int64(columns[0].Started); start > 0 {
   814  			second := start / 1000
   815  			mills := start % 1000
   816  			return time.Unix(second, mills*1e6), second
   817  		}
   818  	}
   819  	return time.Time{}, 0
   820  }
   821  
// noRuns is the status/alert message used when a tab has no completed results.
const noRuns = "no completed results"
   823  
   824  // staleAlert returns an explanatory message if the latest results are stale.
   825  func staleAlert(mod, ran time.Time, stale time.Duration, rows int) string {
   826  	if mod.IsZero() {
   827  		return "no stored results"
   828  	}
   829  	if stale == 0 {
   830  		return ""
   831  	}
   832  	if ran.IsZero() || rows == 0 { // Has no columns and/or no rows.
   833  		return noRuns
   834  	}
   835  	now := time.Now()
   836  	if dur := now.Sub(mod); dur > stale {
   837  		return fmt.Sprintf("data has not changed since %s (%s old)", mod, dur.Truncate(15*time.Minute))
   838  	}
   839  	if dur := now.Sub(ran); dur > stale {
   840  		return fmt.Sprintf("latest column from %s (%s old)", ran, dur.Truncate(15*time.Minute))
   841  	}
   842  	return ""
   843  }
   844  
   845  // failingTestSummaries returns details for every row with an active alert.
   846  func failingTestSummaries(rows []*statepb.Row, template *configpb.LinkTemplate, gcsPrefix string, columnHeader []*configpb.TestGroup_ColumnHeader) []*summarypb.FailingTestSummary {
   847  	var failures []*summarypb.FailingTestSummary
   848  	for _, row := range rows {
   849  		if row.AlertInfo == nil {
   850  			continue
   851  		}
   852  		alert := row.AlertInfo
   853  		sum := summarypb.FailingTestSummary{
   854  			DisplayName:       row.Name,
   855  			TestName:          row.Id,
   856  			FailBuildId:       alert.FailBuildId,
   857  			LatestFailBuildId: alert.LatestFailBuildId,
   858  			FailCount:         alert.FailCount,
   859  			FailureMessage:    alert.FailureMessage,
   860  			PassBuildId:       alert.PassBuildId,
   861  			// TODO(fejta): better build info
   862  			BuildLink:           alert.BuildLink,
   863  			BuildLinkText:       alert.BuildLinkText,
   864  			BuildUrlText:        alert.BuildUrlText,
   865  			LinkedBugs:          row.Issues,
   866  			FailTestLink:        buildFailLink(alert.FailTestId, row.Id),
   867  			LatestFailTestLink:  buildFailLink(alert.LatestFailTestId, row.Id),
   868  			Properties:          alert.Properties,
   869  			CustomColumnHeaders: alert.CustomColumnHeaders,
   870  			HotlistIds:          alert.HotlistIds,
   871  			EmailAddresses:      alert.EmailAddresses,
   872  		}
   873  		if alert.PassTime != nil {
   874  			sum.PassTimestamp = float64(alert.PassTime.Seconds)
   875  		}
   876  		if alert.FailTime != nil {
   877  			sum.FailTimestamp = float64(alert.FailTime.Seconds)
   878  		}
   879  
   880  		propertyToColumnHeader := make(map[string]string)
   881  		for i := 0; i < len(columnHeader); i++ {
   882  			if columnHeader[i].Label != "" {
   883  				propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].Label
   884  			} else if columnHeader[i].Property != "" {
   885  				propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].Property
   886  			} else {
   887  				propertyToColumnHeader["<custom-"+strconv.Itoa(i)+">"] = columnHeader[i].ConfigurationValue
   888  			}
   889  		}
   890  
   891  		// Verify what the links for alerts would be with the new method.
   892  		failLink := testResultLink(template, alert.GetProperties(), alert.GetFailTestId(), row.GetId(), alert.GetFailBuildId(), gcsPrefix, propertyToColumnHeader, alert.CustomColumnHeaders)
   893  		latestFailLink := testResultLink(template, alert.GetProperties(), alert.GetLatestFailTestId(), row.GetId(), alert.GetLatestFailBuildId(), gcsPrefix, propertyToColumnHeader, alert.CustomColumnHeaders)
   894  		log := logrus.WithField("failLink", failLink).WithField("latestFailLink", latestFailLink)
   895  		if failLink == "" || latestFailLink == "" {
   896  			log.Warning("Failed to create failure link.")
   897  		} else if !strings.HasPrefix(failLink, "http") || !strings.HasPrefix(latestFailLink, "http") {
   898  			log.Warning("Failure link does not include scheme.")
   899  		} else {
   900  			log.Info("Created failure links.")
   901  		}
   902  
   903  		failures = append(failures, &sum)
   904  	}
   905  	return failures
   906  }
   907  
   908  // buildFailLink creates a search link
   909  // TODO(#134): Build proper url for both internal and external jobs
   910  func buildFailLink(testID, target string) string {
   911  	return fmt.Sprintf("%s %s", url.PathEscape(testID), url.PathEscape(target))
   912  }
   913  
// testResultLink expands the tab's open_test_template into a concrete URL for
// a single test result, returning "" if expansion fails.
//
// Token substitution rules:
//   - util.GcsPrefix/TestName/TestID/BuildID tokens are filled from the
//     corresponding arguments.
//   - workflow tokens are filled from the "workflow-id"/"workflow-name" keys
//     of the properties map, when present.
//   - custom-column tokens are resolved via propertyToColumnHeader to a
//     header name, then looked up in customColumnHeaders.
//   - any other token is treated as a property name (angle brackets stripped)
//     and looked up in properties.
func testResultLink(template *configpb.LinkTemplate, properties map[string]string, testID, target, buildID, gcsPrefix string, propertyToColumnHeader map[string]string, customColumnHeaders map[string]string) string {
	// Return the result of open_test_template for the tab.
	// This assumes that open_test_template uses a limited set of tokens (since it's not in the context of a browser).
	// Assume that the following are valid: <gcs_prefix>, <test-name>, <workflow-id>, <workflow-name>, <test-id>, <build ID>
	// TODO: Ensure workflow-id, workflow-name are added in alerts.
	tokens := util.Tokens(template)
	parameters := map[string]string{}
	for _, token := range tokens {
		switch token {
		case util.GcsPrefix:
			parameters[util.GcsPrefix] = gcsPrefix
		case util.TestName:
			parameters[util.TestName] = target
		case util.WorkflowID:
			// Workflow tokens are only filled when the property exists;
			// otherwise the token is left out of parameters entirely.
			if workflowID, ok := properties["workflow-id"]; ok {
				parameters[util.WorkflowID] = workflowID
			}
		case util.WorkflowName:
			if WorkflowName, ok := properties["workflow-name"]; ok {
				parameters[util.WorkflowName] = WorkflowName
			}
		case util.TestID:
			parameters[util.TestID] = testID
		case util.BuildID:
			parameters[util.BuildID] = buildID
		// This case matches when FindString's result equals the token —
		// i.e. (presumably) when the whole token is a custom-column token;
		// a non-matching token yields "" and falls through.
		case util.CustomColumnRe.FindString(token):
			if v, ok := customColumnHeaders[propertyToColumnHeader[token]]; ok {
				parameters[token] = v
			}
		default:
			// Didn't match any simple tokens, check if it's a property.
			trimmedToken := strings.NewReplacer("<", "", ">", "").Replace(token)
			if v, ok := properties[trimmedToken]; ok {
				parameters[token] = v
			}
		}
	}
	link, err := util.ExpandTemplate(template, parameters)
	if err != nil {
		logrus.WithError(err).WithField("template", template).WithField("parameters", parameters).Error("Error expanding link template.")
		return ""
	}
	return link
}
   958  
// overallStatus determines whether the tab is stale, failing, flaky or healthy.
//
// Tabs are:
// BROKEN - called with brokenState (typically when most rows are red)
// STALE - called with a stale message (typically when most recent column is old)
// FAIL - there is at least one alert
// ACCEPTABLE - the ratio of (valid) failing to total columns is less than configured threshold
// FLAKY - at least one recent column has failing cells
// PENDING - number of valid columns is less than minimum # of runs required
// PASS - all recent columns are entirely green
// UNKNOWN - none of the above (e.g. no completed, non-running columns found)
func overallStatus(grid *statepb.Grid, recent int, stale string, brokenState bool, alerts []*summarypb.FailingTestSummary, features FeatureFlags, colCells gridStats, opts *configpb.DashboardTabStatusCustomizationOptions) summarypb.DashboardTabSummary_TabStatus {
	if brokenState {
		return summarypb.DashboardTabSummary_BROKEN
	}
	if stale != "" {
		return summarypb.DashboardTabSummary_STALE
	}
	if len(alerts) > 0 {
		return summarypb.DashboardTabSummary_FAIL
	}
	// safeguard PENDING status behind a flag
	if features.AllowMinNumberOfRuns && opts.GetMinAcceptableRuns() > int32(colCells.completedCols-colCells.ignoredCols) {
		return summarypb.DashboardTabSummary_PENDING
	}

	// results maps each row to an iterator yielding one result per column,
	// so each pass of the outer loop below consumes exactly one column.
	results := result.Map(grid.Rows)
	moreCols := true
	var passing bool
	var flaky bool
	// We want to look at recent columns, skipping over any that are still running.
	for moreCols && recent > 0 {
		moreCols = false
		var foundCol bool
		var running bool
		var ignored bool
		// One result off each column since we don't know which
		// cells are running ahead of time.
		for _, resultF := range results {
			r, ok := resultF()
			if !ok {
				// This row's iterator is exhausted.
				continue
			}
			moreCols = true
			if r == statuspb.TestStatus_RUNNING {
				running = true
				// not break because we need to pull this column's
				// result off every row's channel.
				continue
			}
			if features.AllowIgnoredColumns && result.Ignored(r, opts) {
				ignored = true
				continue
			}
			r = coalesceResult(r, result.IgnoreRunning)
			if r == statuspb.TestStatus_NO_RESULT {
				continue
			}
			// any failure in a recent column results in flaky
			if r != statuspb.TestStatus_PASS {
				flaky = true
				continue
			}
			foundCol = true
		}

		// Running columns are unfinished and therefore should
		// not count as "recent" until they finish.
		if running {
			continue
		}

		// Ignored columns are ignored from tab status but they do count as recent
		// Failures in this col are ignored too
		// (flaky is reset so failures seen in this same column don't leak
		// into the verdict for later columns).
		if ignored {
			recent--
			flaky = false
			continue
		}

		if flaky {
			if isAcceptable(colCells, opts, features) {
				return summarypb.DashboardTabSummary_ACCEPTABLE
			}
			return summarypb.DashboardTabSummary_FLAKY
		}

		if foundCol {
			passing = true
			recent--
		}
	}

	if passing {
		return summarypb.DashboardTabSummary_PASS
	}
	return summarypb.DashboardTabSummary_UNKNOWN
}
  1056  
  1057  // isAcceptable determines if the flakiness is within acceptable range.
  1058  // Return true iff the feature is enabled, `max_acceptable_flakiness` is set and flakiness is < than configured.
  1059  func isAcceptable(colCells gridStats, opts *configpb.DashboardTabStatusCustomizationOptions, features FeatureFlags) bool {
  1060  	if features.AllowFuzzyFlakiness && opts.GetMaxAcceptableFlakiness() > 0 &&
  1061  		100*float64(colCells.passingCols)/float64(colCells.completedCols-colCells.ignoredCols) >= float64(100-opts.GetMaxAcceptableFlakiness()) {
  1062  		return true
  1063  	}
  1064  
  1065  	return false
  1066  }
  1067  
  1068  func allLinkedIssues(rows []*statepb.Row) []string {
  1069  	issueSet := make(map[string]bool)
  1070  	for _, row := range rows {
  1071  		for _, issueID := range row.Issues {
  1072  			issueSet[issueID] = true
  1073  		}
  1074  	}
  1075  	linkedIssues := []string{}
  1076  	for issueID := range issueSet {
  1077  		linkedIssues = append(linkedIssues, issueID)
  1078  	}
  1079  	return linkedIssues
  1080  }
  1081  
// gridStats aggregates columnar and cellular metrics as a struct
type gridStats struct {
	passingCols   int // columns with at least one pass, no failures, and no ignored results
	completedCols int // columns containing at least one completed (non-empty) result
	ignoredCols   int // columns containing at least one ignored result
	passingCells  int // individual cells that passed
	filledCells   int // individual cells with any result (pass, fail, or other)
}
  1090  
  1091  // Culminate set of metrics related to a section of the Grid
  1092  func gridMetrics(cols int, rows []*statepb.Row, recent int, brokenThreshold float32, features FeatureFlags, opts *configpb.DashboardTabStatusCustomizationOptions) (gridStats, bool) {
  1093  	results := result.Map(rows)
  1094  	var passingCells int
  1095  	var filledCells int
  1096  	var passingCols int
  1097  	var completedCols int
  1098  	var ignoredCols int
  1099  	var brokenState bool
  1100  
  1101  	for idx := 0; idx < cols; idx++ {
  1102  		if idx >= recent {
  1103  			break
  1104  		}
  1105  		var passes int
  1106  		var failures int
  1107  		var ignores int
  1108  		var other int
  1109  		for _, iter := range results {
  1110  			// TODO(fejta): fail old running cols
  1111  			r, _ := iter()
  1112  			// check for ignores first
  1113  			if features.AllowIgnoredColumns && result.Ignored(r, opts) {
  1114  				ignores++
  1115  			}
  1116  			// proceed with the rest of calculations
  1117  			status := coalesceResult(r, result.IgnoreRunning)
  1118  			if result.Passing(status) {
  1119  				passes++
  1120  				passingCells++
  1121  				filledCells++
  1122  			} else if result.Failing(status) {
  1123  				failures++
  1124  				filledCells++
  1125  			} else if status != statuspb.TestStatus_NO_RESULT {
  1126  				other++
  1127  				filledCells++
  1128  			}
  1129  		}
  1130  
  1131  		if passes+failures+other > 0 {
  1132  			completedCols++
  1133  		}
  1134  		// only one of those can be true
  1135  		if ignores > 0 {
  1136  			ignoredCols++
  1137  		} else if failures == 0 && passes > 0 {
  1138  			passingCols++
  1139  		}
  1140  
  1141  		if passes+failures > 0 && brokenThreshold > 0 {
  1142  			if float32(failures)/float32(passes+failures+other) > brokenThreshold {
  1143  				brokenState = true
  1144  			}
  1145  		}
  1146  	}
  1147  
  1148  	metrics := gridStats{
  1149  		passingCols:   passingCols,
  1150  		completedCols: completedCols,
  1151  		ignoredCols:   ignoredCols,
  1152  		passingCells:  passingCells,
  1153  		filledCells:   filledCells,
  1154  	}
  1155  
  1156  	return metrics, brokenState
  1157  }
  1158  
  1159  // Add a subset of colCellMetrics to summary proto
  1160  func tabMetrics(colCells gridStats) *summarypb.DashboardTabSummaryMetrics {
  1161  	return &summarypb.DashboardTabSummaryMetrics{
  1162  		PassingColumns:   int32(colCells.passingCols),
  1163  		CompletedColumns: int32(colCells.completedCols),
  1164  		IgnoredColumns:   int32(colCells.ignoredCols),
  1165  	}
  1166  }
  1167  
  1168  func fmtStatus(colCells gridStats, tabStatus summarypb.DashboardTabSummary_TabStatus, opts *configpb.DashboardTabStatusCustomizationOptions) string {
  1169  	colCent := 100 * float64(colCells.passingCols) / float64(colCells.completedCols)
  1170  	cellCent := 100 * float64(colCells.passingCells) / float64(colCells.filledCells)
  1171  	flakyCent := 100 * float64(colCells.completedCols-colCells.ignoredCols-colCells.passingCols) / float64(colCells.completedCols-colCells.ignoredCols)
  1172  	// put tab stats on a single line and additional status info on the next
  1173  	statusMsg := fmt.Sprintf("Tab stats: %d of %d (%.1f%%) recent columns passed (%d of %d or %.1f%% cells)", colCells.passingCols, colCells.completedCols, colCent, colCells.passingCells, colCells.filledCells, cellCent)
  1174  	if colCells.ignoredCols > 0 {
  1175  		statusMsg += fmt.Sprintf(". %d columns ignored", colCells.ignoredCols)
  1176  	}
  1177  	// add status info message for certain cases
  1178  	if tabStatus == summarypb.DashboardTabSummary_PENDING {
  1179  		statusMsg += "\nStatus info: Not enough runs"
  1180  	} else if tabStatus == summarypb.DashboardTabSummary_ACCEPTABLE {
  1181  		statusMsg += fmt.Sprintf("\nStatus info: Recent flakiness (%.1f%%) over valid columns is within configured acceptable level of %.1f%%.", flakyCent, opts.GetMaxAcceptableFlakiness())
  1182  	}
  1183  	return statusMsg
  1184  }
  1185  
  1186  // Tab stats: 3 out of 5 (60.0%) recent columns passed (35 of 50 or 70.0% cells). 1 columns ignored.
  1187  // (OPTIONAL) Status info: Recent flakiness (40.0%) flakiness is within configured acceptable level of X
  1188  // OR Status info: Not enough runs
  1189  func statusMessage(colCells gridStats, tabStatus summarypb.DashboardTabSummary_TabStatus, opts *configpb.DashboardTabStatusCustomizationOptions) string {
  1190  	if colCells.filledCells == 0 {
  1191  		return noRuns
  1192  	}
  1193  	return fmtStatus(colCells, tabStatus, opts)
  1194  }
  1195  
  1196  const noGreens = "no recent greens"
  1197  
  1198  // latestGreen finds the ID for the most recent column with all passing rows.
  1199  //
  1200  // Returns the build, first extra column header and/or a no recent greens message.
  1201  func latestGreen(grid *statepb.Grid, useFirstExtra bool) string {
  1202  	results := result.Map(grid.Rows)
  1203  	for _, col := range grid.Columns {
  1204  		var failures bool
  1205  		var passes bool
  1206  		for _, resultF := range results {
  1207  			r, _ := resultF()
  1208  			result := coalesceResult(r, result.ShowRunning)
  1209  			if result == statuspb.TestStatus_PASS {
  1210  				passes = true
  1211  			}
  1212  			if result == statuspb.TestStatus_FLAKY || result == statuspb.TestStatus_FAIL || result == statuspb.TestStatus_UNKNOWN {
  1213  				failures = true
  1214  			}
  1215  		}
  1216  		if failures || !passes {
  1217  			continue
  1218  		}
  1219  		if useFirstExtra && len(col.Extra) > 0 {
  1220  			return col.Extra[0]
  1221  		}
  1222  		return col.Build
  1223  	}
  1224  	return noGreens
  1225  }
  1226  
  1227  func getHealthinessForInterval(grid *statepb.Grid, tabName string, currentTime time.Time, interval int) *summarypb.HealthinessInfo {
  1228  	now := goBackDays(0, currentTime)
  1229  	oneInterval := goBackDays(interval, currentTime)
  1230  	twoIntervals := goBackDays(2*interval, currentTime)
  1231  
  1232  	healthiness := CalculateHealthiness(grid, oneInterval, now, tabName)
  1233  	pastHealthiness := CalculateHealthiness(grid, twoIntervals, oneInterval, tabName)
  1234  	CalculateTrend(healthiness, pastHealthiness)
  1235  
  1236  	healthiness.PreviousFlakiness = []float32{pastHealthiness.AverageFlakiness}
  1237  	return healthiness
  1238  }
  1239  
  1240  func goBackDays(days int, currentTime time.Time) int {
  1241  	// goBackDays gets the time intervals for our flakiness report.
  1242  	// The old version of this function would round to the 12am of the given day.
  1243  	// Since the new flakiness report will be run with Summarizer and therefore more often
  1244  	// than the once-a-week of the old flakiness report, we will not round to 12am anymore.
  1245  	date := currentTime.AddDate(0, 0, -1*days)
  1246  	intDate := int(date.Unix())
  1247  	return intDate
  1248  }
  1249  
  1250  func shouldRunHealthiness(tab *configpb.DashboardTab) bool {
  1251  	if tab.HealthAnalysisOptions == nil {
  1252  		return false
  1253  	}
  1254  	return tab.HealthAnalysisOptions.Enable
  1255  }
  1256  
// coalesceResult reduces the result to PASS, NO_RESULT, FAIL or FLAKY.
//
// Thin wrapper around result.Coalesce; callers in this file pass
// result.IgnoreRunning or result.ShowRunning, which presumably controls
// whether a still-running cell counts as having no result — confirm against
// the result package.
func coalesceResult(rowResult statuspb.TestStatus, ignoreRunning bool) statuspb.TestStatus {
	return result.Coalesce(rowResult, ignoreRunning)
}