github.com/GoogleCloudPlatform/testgrid@v0.0.174/pkg/updater/updater.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package updater reads the latest test results and saves updated state.
    18  package updater
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"math"
    25  	"math/rand"
    26  	"net/url"
    27  	"path"
    28  	"sort"
    29  	"strings"
    30  	"sync"
    31  	"time"
    32  	"unicode/utf8"
    33  
    34  	"cloud.google.com/go/storage"
    35  	"github.com/GoogleCloudPlatform/testgrid/config"
    36  	"github.com/GoogleCloudPlatform/testgrid/config/snapshot"
    37  	"github.com/GoogleCloudPlatform/testgrid/internal/result"
    38  	configpb "github.com/GoogleCloudPlatform/testgrid/pb/config"
    39  	statepb "github.com/GoogleCloudPlatform/testgrid/pb/state"
    40  	statuspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status"
    41  	"github.com/GoogleCloudPlatform/testgrid/util/gcs"
    42  	"github.com/GoogleCloudPlatform/testgrid/util/metrics"
    43  	"github.com/fvbommel/sortorder"
    44  	"github.com/golang/protobuf/ptypes/timestamp"
    45  	"github.com/sirupsen/logrus"
    46  )
    47  
const componentName = "updater"

// Metrics holds metrics relevant to the Updater.
type Metrics struct {
	UpdateState  metrics.Cyclic   // success/skip/failure accounting for each update cycle
	DelaySeconds metrics.Duration // how far behind schedule the updater is, in seconds
}
    55  
    56  // CreateMetrics creates metrics for this controller
    57  func CreateMetrics(factory metrics.Factory) *Metrics {
    58  	return &Metrics{
    59  		UpdateState:  factory.NewCyclic(componentName),
    60  		DelaySeconds: factory.NewDuration("delay", "Seconds updater is behind schedule", "component"),
    61  	}
    62  }
    63  
    64  func (mets *Metrics) delay(dur time.Duration) {
    65  	if mets == nil {
    66  		return
    67  	}
    68  	mets.DelaySeconds.Set(dur, componentName)
    69  }
    70  
    71  func (mets *Metrics) start() *metrics.CycleReporter {
    72  	if mets == nil {
    73  		return nil
    74  	}
    75  	return mets.UpdateState.Start()
    76  }
    77  
// GroupUpdater will compile the grid state proto for the specified group and upload it.
//
// This typically involves downloading the existing state, dropping old columns,
// compiling any new columns and inserting them into the front and then uploading
// the proto to GCS.
//
// NOTE(review): the claim that a nil poolCtx disables pooled downloads is stale —
// GCS() now panics on a nil poolCtx; at most concurrency builds are downloaded
// at the same time.
//
// Return true if there are more results to process.
type GroupUpdater func(parent context.Context, log logrus.FieldLogger, client gcs.Client, tg *configpb.TestGroup, gridPath gcs.Path) (bool, error)
    89  
    90  // GCS returns a GCS-based GroupUpdater, which knows how to process result data stored in GCS.
    91  func GCS(poolCtx context.Context, colClient gcs.Client, groupTimeout, buildTimeout time.Duration, concurrency int, write bool, enableIgnoreSkip bool) GroupUpdater {
    92  	var readResult *resultReader
    93  	if poolCtx == nil {
    94  		// TODO(fejta): remove check soon
    95  		panic("Context must be non-nil")
    96  	}
    97  	readResult = resultReaderPool(poolCtx, logrus.WithField("pool", "readResult"), concurrency)
    98  
    99  	return func(parent context.Context, log logrus.FieldLogger, client gcs.Client, tg *configpb.TestGroup, gridPath gcs.Path) (bool, error) {
   100  		if !tg.UseKubernetesClient && (tg.ResultSource == nil || tg.ResultSource.GetGcsConfig() == nil) {
   101  			log.Debug("Skipping non-kubernetes client group")
   102  			return false, nil
   103  		}
   104  		ctx, cancel := context.WithTimeout(parent, groupTimeout)
   105  		defer cancel()
   106  		gcsColReader := gcsColumnReader(colClient, buildTimeout, readResult, enableIgnoreSkip)
   107  		reprocess := 20 * time.Minute // allow 20m for prow to finish uploading artifacts
   108  		return InflateDropAppend(ctx, log, client, tg, gridPath, write, gcsColReader, reprocess)
   109  	}
   110  }
   111  
   112  func gridPaths(configPath gcs.Path, gridPrefix string, groups []*configpb.TestGroup) ([]gcs.Path, error) {
   113  	paths := make([]gcs.Path, 0, len(groups))
   114  	for _, tg := range groups {
   115  		tgp, err := TestGroupPath(configPath, gridPrefix, tg.Name)
   116  		if err != nil {
   117  			return nil, fmt.Errorf("%s bad group path: %w", tg.Name, err)
   118  		}
   119  		paths = append(paths, *tgp)
   120  	}
   121  	return paths, nil
   122  }
   123  
   124  // lockGroup makes a conditional GCS write operation to ensure it has authority to update this object.
   125  //
   126  // This allows multiple decentralized updaters to collaborate on updating groups:
   127  // Regardless of how many updaters are trying to concurrently update an object foo at generation X, GCS
   128  // will only allow one of them to "win". The others receive a PreconditionFailed error and can
   129  // move onto the next group.
   130  func lockGroup(ctx context.Context, client gcs.ConditionalClient, path gcs.Path, generation int64) (*storage.ObjectAttrs, error) {
   131  	var buf []byte
   132  	if generation == 0 {
   133  		var grid statepb.Grid
   134  		var err error
   135  		if buf, err = gcs.MarshalGrid(&grid); err != nil {
   136  			return nil, fmt.Errorf("marshal: %w", err)
   137  		}
   138  	}
   139  
   140  	return gcs.Touch(ctx, client, path, generation, buf)
   141  }
   142  
   143  func testGroups(cfg *snapshot.Config, groupNames ...string) ([]*configpb.TestGroup, error) {
   144  	var groups []*configpb.TestGroup
   145  
   146  	if len(groupNames) == 0 {
   147  		groups = make([]*configpb.TestGroup, 0, len(groupNames))
   148  		for _, testConfig := range cfg.Groups {
   149  			groups = append(groups, testConfig)
   150  		}
   151  		return groups, nil
   152  	}
   153  
   154  	groups = make([]*configpb.TestGroup, 0, len(groupNames))
   155  	for _, groupName := range groupNames {
   156  		tg := cfg.Groups[groupName]
   157  		if tg == nil {
   158  			return nil, fmt.Errorf("group %q not found", groupName)
   159  		}
   160  		groups = append(groups, tg)
   161  	}
   162  	return groups, nil
   163  }
   164  
// lastUpdated schedules queue updates based on when each grid object was last written.
type lastUpdated struct {
	client     gcs.ConditionalClient // stats grids and conditionally creates missing ones
	gridPrefix string                // prefix under configPath where grid state lives
	configPath gcs.Path              // base path the grid paths are resolved against
	freq       time.Duration         // desired update frequency; 0 disables Fix
}
   171  
// fixOnce stats every group's grid object once and feeds the results into q.
//
// Groups whose grid object exists are scheduled freq after the object's last
// write. Objects reported at generation 0 are created empty via a conditional
// write (losing that race to another updater is fine) and scheduled immediately.
func (fixer lastUpdated) fixOnce(ctx context.Context, log logrus.FieldLogger, q *config.TestGroupQueue, groups []*configpb.TestGroup) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	paths, err := gridPaths(fixer.configPath, fixer.gridPrefix, groups)
	if err != nil {
		return err
	}
	// The code assumes attrs[i] corresponds to paths[i] (and thus groups[i]);
	// nil entries are skipped below.
	attrs := gcs.StatExisting(ctx, log, fixer.client, paths...)
	updates := make(map[string]time.Time, len(attrs))
	var wg sync.WaitGroup
	for i, attr := range attrs {
		if attr == nil {
			continue
		}
		name := groups[i].Name
		if attr.Generation > 0 {
			// Existing object: next update is freq after its last write.
			updates[name] = attr.Updated.Add(fixer.freq)
		} else if attr.Generation == 0 {
			// Missing object: create an empty grid in the background and
			// schedule this group for an immediate update.
			wg.Add(1)
			go func(i int) { // pass i explicitly; loop variable is shared pre-Go 1.22
				defer wg.Done()
				if _, err := lockGroup(ctx, fixer.client, paths[i], 0); err != nil && !gcs.IsPreconditionFailed(err) {
					log.WithError(err).Error("Failed to create empty group state")
				}
			}(i)
			updates[name] = time.Now()
		}
	}
	wg.Wait()
	q.Init(log, groups, time.Now().Add(fixer.freq))
	q.FixAll(updates, false)
	return nil
}
   205  
   206  func (fixer lastUpdated) Fix(ctx context.Context, log logrus.FieldLogger, q *config.TestGroupQueue, groups []*configpb.TestGroup) error {
   207  	if fixer.freq == 0 {
   208  		return nil
   209  	}
   210  	ticker := time.NewTicker(fixer.freq)
   211  	fix := func() {
   212  		if err := fixer.fixOnce(ctx, log, q, groups); err != nil {
   213  			log.WithError(err).Warning("Failed to fix groups based on last update time")
   214  		}
   215  	}
   216  	fix()
   217  
   218  	for {
   219  		select {
   220  		case <-ctx.Done():
   221  			ticker.Stop()
   222  			return ctx.Err()
   223  		case <-ticker.C:
   224  			fix()
   225  		}
   226  	}
   227  }
   228  
// Fixer will fix the TestGroupQueue's next time for TestGroups.
//
// Fixer should:
// * work continually and not return until the context expires.
// * expect to be called multiple times with different contexts and test groups.
//
// For example, it might use the last updated time of the test group to
// specify the next update time. Or it might watch the data backing these groups and
// request an immediate update whenever the data changes.
//
// Update treats a returned context.Canceled as a normal shutdown rather than a failure.
type Fixer func(context.Context, logrus.FieldLogger, *config.TestGroupQueue, []*configpb.TestGroup) error
   239  
// UpdateOptions aggregates the Update function parameter into a single structure.
type UpdateOptions struct {
	ConfigPath       gcs.Path      // location of the config proto to observe
	GridPrefix       string        // prefix under ConfigPath where grid state is stored
	GroupConcurrency int           // number of groups updated concurrently
	GroupNames       []string      // restrict updates to these groups when non-empty
	Write            bool          // upload results when true; otherwise dry-run
	Freq             time.Duration // per-group update frequency; zero means update once
}
   249  
   250  // Update test groups with the specified freq.
   251  //
   252  // Retries errors at double and unfinished groups as soon as possible.
   253  //
   254  // Filters down to a single group when set.
   255  // Returns after all groups updated once if freq is zero.
   256  func Update(parent context.Context, client gcs.ConditionalClient, mets *Metrics, updateGroup GroupUpdater, opts *UpdateOptions, fixers ...Fixer) error {
   257  	ctx, cancel := context.WithCancel(parent)
   258  	defer cancel()
   259  	log := logrus.WithField("config", opts.ConfigPath)
   260  
   261  	var q config.TestGroupQueue
   262  
   263  	log.Debug("Observing config...")
   264  	newConfig, err := snapshot.Observe(ctx, log, client, opts.ConfigPath, time.NewTicker(time.Minute).C)
   265  	if err != nil {
   266  		return fmt.Errorf("observe config: %w", err)
   267  
   268  	}
   269  	cfg := <-newConfig
   270  	groups, err := testGroups(cfg, opts.GroupNames...)
   271  	if err != nil {
   272  		return fmt.Errorf("filter test groups: %w", err)
   273  	}
   274  
   275  	q.Init(log, groups, time.Now().Add(opts.Freq))
   276  
   277  	log.Debug("Fetching initial start times...")
   278  	fixLastUpdated := lastUpdated{
   279  		client:     client,
   280  		gridPrefix: opts.GridPrefix,
   281  		configPath: opts.ConfigPath,
   282  		freq:       opts.Freq,
   283  	}
   284  	if err := fixLastUpdated.fixOnce(ctx, log, &q, groups); err != nil {
   285  		return fmt.Errorf("get generations: %v", err)
   286  	}
   287  	log.Info("Fetched initial start times")
   288  
   289  	fixers = append(fixers, fixLastUpdated.Fix)
   290  
   291  	go func() {
   292  		ticker := time.NewTicker(time.Minute)
   293  		log := log
   294  		for {
   295  			depth, next, when := q.Status()
   296  			log := log.WithField("depth", depth)
   297  			if next != nil {
   298  				log = log.WithField("next", next.Name)
   299  			}
   300  			delay := time.Since(when)
   301  			if delay < 0 {
   302  				delay = 0
   303  				log = log.WithField("sleep", -delay)
   304  			}
   305  			if max := opts.Freq * 2; max > 0 && delay > max {
   306  				delay = max
   307  			}
   308  			log = log.WithField("delay", delay.Round(time.Second))
   309  			mets.delay(delay)
   310  			select {
   311  			case <-ctx.Done():
   312  				return
   313  			case <-ticker.C:
   314  				log.Info("Queue Status")
   315  			}
   316  		}
   317  	}()
   318  
   319  	go func() {
   320  		fixCtx, fixCancel := context.WithCancel(ctx)
   321  		var fixWg sync.WaitGroup
   322  		fixAll := func() {
   323  			n := len(fixers)
   324  			log.WithField("fixers", n).Trace("Starting fixers on current test groups...")
   325  			fixWg.Add(n)
   326  			for i, fix := range fixers {
   327  				go func(i int, fix Fixer) {
   328  					defer fixWg.Done()
   329  					if err := fix(fixCtx, log, &q, groups); err != nil && !errors.Is(err, context.Canceled) {
   330  						log.WithError(err).WithField("fixer", i).Warning("Fixer failed")
   331  					}
   332  				}(i, fix)
   333  			}
   334  			log.Debug("Started fixers on current test groups")
   335  		}
   336  		fixAll()
   337  		for {
   338  			select {
   339  			case <-ctx.Done():
   340  				fixCancel()
   341  				return
   342  			case cfg, ok := <-newConfig:
   343  				if !ok {
   344  					fixCancel()
   345  					return
   346  				}
   347  				log.Info("Updating config")
   348  				groups, err = testGroups(cfg, opts.GroupNames...)
   349  				if err != nil {
   350  					log.Errorf("Error during config update: %v", err)
   351  				}
   352  				log.Debug("Cancelling fixers on old test groups...")
   353  				fixCancel()
   354  				fixWg.Wait()
   355  				q.Init(log, groups, time.Now().Add(opts.Freq))
   356  				log.Debug("Canceled fixers on old test groups")
   357  				fixCtx, fixCancel = context.WithCancel(ctx)
   358  				fixAll()
   359  			}
   360  		}
   361  	}()
   362  
   363  	active := map[string]bool{}
   364  	var lock sync.RWMutex
   365  	var wg sync.WaitGroup
   366  	wg.Add(opts.GroupConcurrency)
   367  	defer wg.Wait()
   368  	channel := make(chan *configpb.TestGroup)
   369  	defer close(channel)
   370  
   371  	updateTestGroup := func(tg *configpb.TestGroup) {
   372  		name := tg.Name
   373  		log := log.WithField("group", name)
   374  		lock.RLock()
   375  		on := active[name]
   376  		lock.RUnlock()
   377  		if on {
   378  			log.Debug("Already updating...")
   379  			return
   380  		}
   381  		fin := mets.start()
   382  		tgp, err := TestGroupPath(opts.ConfigPath, opts.GridPrefix, name)
   383  		if err != nil {
   384  			fin.Fail()
   385  			log.WithError(err).Error("Bad path")
   386  			return
   387  		}
   388  		lock.Lock()
   389  		if active[name] {
   390  			lock.Unlock()
   391  			log.Debug("Another routine started updating...")
   392  			return
   393  		}
   394  		active[name] = true
   395  		lock.Unlock()
   396  		defer func() {
   397  			lock.Lock()
   398  			active[name] = false
   399  			lock.Unlock()
   400  		}()
   401  		start := time.Now()
   402  		unprocessed, err := updateGroup(ctx, log, client, tg, *tgp)
   403  		log.WithField("duration", time.Since(start)).Info("Finished processing group.")
   404  		if err != nil {
   405  			log := log.WithError(err)
   406  			if gcs.IsPreconditionFailed(err) {
   407  				fin.Skip()
   408  				log.Info("Group was modified while updating")
   409  			} else {
   410  				fin.Fail()
   411  				log.Error("Failed to update group")
   412  			}
   413  			var delay time.Duration
   414  			if opts.Freq > 0 {
   415  				delay = opts.Freq/4 + time.Duration(rand.Int63n(int64(opts.Freq/4))) // Int63n() panics if freq <= 0
   416  				log = log.WithField("delay", delay.Seconds())
   417  				q.Fix(tg.Name, time.Now().Add(delay), true)
   418  			}
   419  			return
   420  		}
   421  		fin.Success()
   422  		if unprocessed { // process another chunk ASAP
   423  			q.Fix(name, time.Now(), false)
   424  		}
   425  	}
   426  
   427  	for i := 0; i < opts.GroupConcurrency; i++ {
   428  		go func() {
   429  			defer wg.Done()
   430  			for tg := range channel {
   431  				updateTestGroup(tg)
   432  			}
   433  		}()
   434  	}
   435  
   436  	log.Info("Starting to process test groups...")
   437  	return q.Send(ctx, channel, opts.Freq)
   438  }
   439  
   440  // TestGroupPath returns the path to a test_group proto given this proto
   441  func TestGroupPath(g gcs.Path, gridPrefix, groupName string) (*gcs.Path, error) {
   442  	name := path.Join(gridPrefix, groupName)
   443  	u, err := url.Parse(name)
   444  	if err != nil {
   445  		return nil, fmt.Errorf("invalid url %s: %w", name, err)
   446  	}
   447  	np, err := g.ResolveReference(u)
   448  	if err != nil {
   449  		return nil, fmt.Errorf("resolve reference: %w", err)
   450  	}
   451  	if np.Bucket() != g.Bucket() {
   452  		return nil, fmt.Errorf("testGroup %s should not change bucket", name)
   453  	}
   454  	return np, nil
   455  }
   456  
   457  func gcsPrefix(tg *configpb.TestGroup) string {
   458  	if tg.ResultSource == nil {
   459  		return tg.GcsPrefix
   460  	}
   461  	if gcsCfg := tg.ResultSource.GetGcsConfig(); gcsCfg != nil {
   462  		return gcsCfg.GcsPrefix
   463  	}
   464  	return tg.GcsPrefix
   465  }
   466  
   467  func groupPaths(tg *configpb.TestGroup) ([]gcs.Path, error) {
   468  	var out []gcs.Path
   469  	prefixes := strings.Split(gcsPrefix(tg), ",")
   470  	for idx, prefix := range prefixes {
   471  		prefix := strings.TrimSpace(prefix)
   472  		if prefix == "" {
   473  			continue
   474  		}
   475  		u, err := url.Parse("gs://" + prefix)
   476  		if err != nil {
   477  			return nil, fmt.Errorf("parse: %w", err)
   478  		}
   479  		if u.Path != "" && u.Path[len(u.Path)-1] != '/' {
   480  			u.Path += "/"
   481  		}
   482  
   483  		var p gcs.Path
   484  		if err := p.SetURL(u); err != nil {
   485  			if idx > 0 {
   486  				return nil, fmt.Errorf("%d: %s: %w", idx, prefix, err)
   487  			}
   488  			return nil, err
   489  		}
   490  		out = append(out, p)
   491  	}
   492  	return out, nil
   493  }
   494  
   495  // truncateRunning filters out all columns until the oldest still running column.
   496  //
   497  // If there are 20 columns where all are complete except the 3rd and 7th, this will
   498  // return the 8th and later columns.
   499  //
   500  // Running columns more than 3 days old are not considered.
   501  func truncateRunning(cols []InflatedColumn, floorTime time.Time) []InflatedColumn {
   502  	if len(cols) == 0 {
   503  		return cols
   504  	}
   505  
   506  	floor := float64(floorTime.UTC().Unix() * 1000)
   507  
   508  	for i := len(cols) - 1; i >= 0; i-- {
   509  		if cols[i].Column.Started < floor {
   510  			continue
   511  		}
   512  		for _, cell := range cols[i].Cells {
   513  			if cell.Result == statuspb.TestStatus_RUNNING {
   514  				return cols[i+1:]
   515  			}
   516  		}
   517  	}
   518  	// No cells are found to be running; do not truncate
   519  	return cols
   520  }
   521  
   522  func listBuilds(ctx context.Context, client gcs.Lister, since string, paths ...gcs.Path) ([]gcs.Build, error) {
   523  	var out []gcs.Build
   524  
   525  	for idx, tgPath := range paths {
   526  		var offset *gcs.Path
   527  		var err error
   528  		if since != "" {
   529  			if !strings.HasSuffix(since, "/") {
   530  				since = since + "/"
   531  			}
   532  			if offset, err = tgPath.ResolveReference(&url.URL{Path: since}); err != nil {
   533  				return nil, fmt.Errorf("resolve since: %w", err)
   534  			}
   535  		}
   536  		builds, err := gcs.ListBuilds(ctx, client, tgPath, offset)
   537  		if err != nil {
   538  			return nil, fmt.Errorf("%d: %s: %w", idx, tgPath, err)
   539  		}
   540  		out = append(out, builds...)
   541  	}
   542  
   543  	if len(paths) > 1 {
   544  		gcs.Sort(out)
   545  	}
   546  
   547  	return out, nil
   548  }
   549  
// ColumnReader finds and processes new columns to send to the receivers.
//
// * Columns with the same Name and Build will get merged together.
// * Readers must be reentrant.
//   - Processing must expect every sent column to be the final column this cycle.
//     AKA calling this method once and reading two columns should be equivalent to
//     calling the method once, reading one column and then calling it a second time
//     and reading a second column.
type ColumnReader func(ctx context.Context, log logrus.FieldLogger, tg *configpb.TestGroup, oldCols []InflatedColumn, stop time.Time, receivers chan<- InflatedColumn) error
   559  
   560  // SortStarted sorts InflatedColumns by column start time.
   561  func SortStarted(cols []InflatedColumn) {
   562  	sort.SliceStable(cols, func(i, j int) bool {
   563  		return cols[i].Column.Started > cols[j].Column.Started
   564  	})
   565  }
   566  
const byteCeiling = 2e6 // 2 megabytes

// InflateDropAppend updates groups by downloading the existing grid, dropping old rows and appending new ones.
//
// Returns true when unread columns remain (the caller should reschedule soon).
func InflateDropAppend(ctx context.Context, alog logrus.FieldLogger, client gcs.Client, tg *configpb.TestGroup, gridPath gcs.Path, write bool, readCols ColumnReader, reprocess time.Duration) (bool, error) {
	log := alog.(logrus.Ext1FieldLogger) // Add trace method
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Grace period to read additional column.
	// Half the remaining time; parented to Background so it outlives ctx cancellation.
	var grace context.Context
	if deadline, present := ctx.Deadline(); present {
		var cancel context.CancelFunc
		dur := time.Until(deadline) / 2
		grace, cancel = context.WithTimeout(context.Background(), dur)
		defer cancel()
	} else {
		grace = context.Background()
	}

	// Separate budget (3/4 of remaining time) for shrinking an oversized grid.
	var shrinkGrace context.Context
	if deadline, present := ctx.Deadline(); present {
		var cancel context.CancelFunc
		dur := 3 * time.Until(deadline) / 4
		shrinkGrace, cancel = context.WithTimeout(context.Background(), dur)
		defer cancel()
	} else {
		shrinkGrace = context.Background()
	}

	// Retain DaysOfResults days of columns, defaulting to a week.
	var dur time.Duration
	if tg.DaysOfResults > 0 {
		dur = days(float64(tg.DaysOfResults))
	} else {
		dur = days(7)
	}

	stop := time.Now().Add(-dur) // columns started before this are dropped
	log = log.WithField("stop", stop)

	var oldCols []InflatedColumn
	var issues map[string][]string

	log.Trace("Downloading existing grid...")
	old, attrs, err := gcs.DownloadGrid(ctx, client, gridPath)
	if err != nil {
		// Non-fatal: continue as though no prior state exists.
		log.WithField("path", gridPath).WithError(err).Error("Failed to download existing grid")
	}
	inflateStart := time.Now()
	if old != nil {
		var cols []InflatedColumn
		var err error
		log.Trace("Inflating grid...")
		// The -reprocess cutoff presumably marks recent columns for re-reading;
		// confirm against InflateGrid.
		if cols, issues, err = InflateGrid(ctx, old, stop, time.Now().Add(-reprocess)); err != nil {
			return false, fmt.Errorf("inflate: %w", err)
		}
		// If the group's config changed, inject a synthetic RUNNING column so
		// truncateRunning below discards (and thereby reprocesses) the last week.
		var floor time.Time
		when := time.Now().Add(-7 * 24 * time.Hour)
		if col := reprocessColumn(log, old, tg, when); col != nil {
			cols = append(cols, *col)
			floor = when
		}
		SortStarted(cols) // Our processing requires descending start time.
		oldCols = truncateRunning(cols, floor)
	}
	inflateDur := time.Since(inflateStart)
	readColsStart := time.Now()
	var cols []InflatedColumn
	var unreadColumns bool
	if attrs != nil && attrs.Size >= int64(byteCeiling) {
		// Stored grid already at/over the cap: skip reading new columns and just
		// re-shrink the existing ones.
		log.WithField("size", attrs.Size).Info("Grid too large, compressing...")
		unreadColumns = true
		cols = oldCols
	} else {
		// Make the eventual upload conditional on the generation we just read so a
		// concurrent updater yields PreconditionFailed instead of a silent clobber.
		if condClient, ok := client.(gcs.ConditionalClient); ok {
			var cond storage.Conditions
			if attrs == nil {
				cond.DoesNotExist = true
			} else {
				cond.GenerationMatch = attrs.Generation
			}
			client = condClient.If(&cond, &cond)
		}

		newCols := make(chan InflatedColumn)
		ec := make(chan error)

		log.Trace("Reading first column...")
		go func() {
			err := readCols(ctx, log, tg, oldCols, stop, newCols)
			// Deliver the final error unless ctx already expired.
			select {
			case <-ctx.Done():
			case ec <- err:
			}
		}()

		// Must read at least one column every cycle to ensure we make forward progress.
		more := true
		select {
		case <-ctx.Done():
			return false, fmt.Errorf("first column: %w", ctx.Err())
		case col := <-newCols:
			if len(col.Cells) == 0 {
				// Group all empty columns together by setting build/name empty.
				col.Column.Build = ""
				col.Column.Name = ""
			}
			cols = append(cols, col)
		case err := <-ec:
			if err != nil {
				return false, fmt.Errorf("read first column: %w", err)
			}
			more = false
		}

		// Read as many additional columns as we can within the allocated time.
		log.Trace("Reading additional columns...")
		for more {
			select {
			case <-grace.Done():
				// Out of budget: note that columns remain so the caller retries soon.
				unreadColumns = true
				more = false
			case <-ctx.Done():
				return false, ctx.Err()
			case col := <-newCols:
				if len(col.Cells) == 0 {
					// Group all empty columns together by setting build/name empty.
					col.Column.Build = ""
					col.Column.Name = ""
				}
				cols = append(cols, col)
			case err := <-ec:
				if err != nil {
					return false, fmt.Errorf("read columns: %w", err)
				}
				more = false
			}
		}

		log = log.WithField("appended", len(cols))

		overrideBuild(tg, cols) // so we group correctly
		cols = append(cols, oldCols...)
		cols = groupColumns(tg, cols)
	}
	readColsDur := time.Since(readColsStart)

	SortStarted(cols)

	shrinkStart := time.Now()
	cols = truncateGrid(cols, byteCeiling) // Assume each cell is at least 1 byte
	var grid *statepb.Grid
	var buf []byte
	grid, buf, err = shrinkGridInline(shrinkGrace, log, tg, cols, issues, byteCeiling)
	if err != nil {
		return false, fmt.Errorf("shrink grid inline: %v", err)
	}
	shrinkDur := time.Since(shrinkStart)

	// Store the config used, so the next cycle can detect config changes.
	grid.Config = tg

	log = log.WithField("url", gridPath).WithField("bytes", len(buf))
	if !write {
		log = log.WithField("dryrun", true)
	} else {
		log.Debug("Writing grid...")
		// TODO(fejta): configurable cache value
		if _, err := client.Upload(ctx, gridPath, buf, gcs.DefaultACL, gcs.NoCache); err != nil {
			return false, fmt.Errorf("upload %d bytes: %w", len(buf), err)
		}
	}
	if unreadColumns {
		log = log.WithField("more", true)
	}
	log.WithFields(logrus.Fields{
		"cols":     len(grid.Columns),
		"rows":     len(grid.Rows),
		"inflate":  inflateDur,
		"readCols": readColsDur,
		"shrink":   shrinkDur,
	}).Info("Wrote grid")
	return unreadColumns, nil
}
   749  
   750  // truncateGrid cuts grid down to 'cellCeiling' or fewer cells
   751  // Used as a cheap way to truncate before the finer-tuned shrinkGridInline.
   752  func truncateGrid(cols []InflatedColumn, cellCeiling int) []InflatedColumn {
   753  	var cells int
   754  	for i := 0; i < len(cols); i++ {
   755  		nc := len(cols[i].Cells)
   756  		cells += nc
   757  		if i < 2 || cells <= cellCeiling {
   758  			continue
   759  		}
   760  		return cols[:i]
   761  	}
   762  	return cols
   763  }
   764  
   765  // reprocessColumn returns a column with a running result if the previous config differs from the current one
   766  func reprocessColumn(log logrus.FieldLogger, old *statepb.Grid, currentCfg *configpb.TestGroup, when time.Time) *InflatedColumn {
   767  	if old.Config == nil || old.Config.String() == currentCfg.String() {
   768  		return nil
   769  	}
   770  
   771  	log.WithField("since", when.Round(time.Minute)).Info("Reprocessing results after changed config")
   772  
   773  	return &InflatedColumn{
   774  		Column: &statepb.Column{
   775  			Started: float64(when.UTC().Unix() * 1000),
   776  		},
   777  		Cells: map[string]Cell{
   778  			"reprocess": {
   779  				Result: statuspb.TestStatus_RUNNING,
   780  			},
   781  		},
   782  	}
   783  }
   784  
// shrinkGridInline marshals cols into a grid proto, repeatedly halving the
// column count until the serialized form fits the byte budget (or ctx expires).
//
// Returns the grid and its serialized bytes. As a last resort, stores a single
// metadata-only column (see deletedColumn).
func shrinkGridInline(ctx context.Context, log logrus.FieldLogger, tg *configpb.TestGroup, cols []InflatedColumn, issues map[string][]string, byteCeiling int) (*statepb.Grid, []byte, error) {
	// Hopefully the grid is small enough...
	grid := constructGridFromGroupConfig(log, tg, cols, issues)
	buf, err := gcs.MarshalGrid(grid)
	if err != nil {
		return nil, nil, fmt.Errorf("marshal grid: %w", err)
	}
	orig := len(buf)
	if byteCeiling == 0 || orig < byteCeiling {
		return grid, buf, nil
	}

	// Nope, let's drop old data...
	// Aim well below the ceiling rather than just barely under it.
	newCeiling := byteCeiling / 2

	log = log.WithField("originally", orig)
	for i := len(cols) / 2; i > 0; i = i / 2 {
		// Out of time: give up and return the (still oversized) grid as-is.
		select {
		case <-ctx.Done():
			log.WithField("size", len(buf)).Info("Timeout shrinking row data")
			return grid, buf, nil
		default:
		}

		log.WithField("size", len(buf)).Debug("Shrinking row data")

		// shrink cols to half and cap
		// (mark what is now the final kept column as truncated)
		truncateLastColumn(cols[0:i], orig, byteCeiling, "byte")

		grid = constructGridFromGroupConfig(log, tg, cols[0:i], issues)
		buf, err = gcs.MarshalGrid(grid)
		if err != nil {
			return nil, nil, fmt.Errorf("marshal grid: %w", err)
		}

		if len(buf) < newCeiling {
			log.WithField("size", len(buf)).Info("Shrunk row data")
			return grid, buf, nil
		}

	}

	// One column isn't small enough. Return a single-cell grid.
	grid = constructGridFromGroupConfig(log, tg, deletedColumn(cols[0]), nil)
	buf, err = gcs.MarshalGrid(grid)
	log.WithField("size", len(buf)).Info("Shrunk to minimum; storing metadata only")
	return grid, buf, err
}
   833  
   834  // Legacy row name to report data truncation
   835  const truncatedRowName = "Truncated"
   836  
   837  func truncateLastColumn(grid []InflatedColumn, orig, max int, entity string) {
   838  	if len(grid) == 0 {
   839  		return
   840  	}
   841  	last := len(grid) - 1
   842  	for name, cell := range grid[last].Cells {
   843  		if name == truncatedRowName {
   844  			delete(grid[last].Cells, truncatedRowName)
   845  			continue
   846  		}
   847  		if cell.Result == statuspb.TestStatus_NO_RESULT {
   848  			continue
   849  		}
   850  		cell.Result = statuspb.TestStatus_UNKNOWN
   851  		cell.Message = fmt.Sprintf("%d %s grid exceeds maximum size of %d %ss", orig, entity, max, entity)
   852  		cell.Icon = "..." // Overwritten by the UI
   853  		grid[last].Cells[name] = cell
   854  	}
   855  }
   856  
   857  // A column with the same header data, but all the rows deleted.
   858  func deletedColumn(latestColumn InflatedColumn) []InflatedColumn {
   859  	return []InflatedColumn{
   860  		{
   861  			Column: latestColumn.Column,
   862  			Cells: map[string]Cell{
   863  				truncatedRowName: {
   864  					Result:  statuspb.TestStatus_UNKNOWN,
   865  					ID:      truncatedRowName,
   866  					Message: fmt.Sprintf("The grid is too large to update. Split this testgroup into multiple testgroups."),
   867  				},
   868  			},
   869  		},
   870  	}
   871  }
   872  
   873  // FormatStrftime replaces python codes with what go expects.
   874  //
   875  // aka %Y-%m-%d becomes 2006-01-02
   876  func FormatStrftime(in string) string {
   877  	replacements := map[string]string{
   878  		"%p": "PM",
   879  		"%Y": "2006",
   880  		"%y": "06",
   881  		"%m": "01",
   882  		"%d": "02",
   883  		"%H": "15",
   884  		"%M": "04",
   885  		"%S": "05",
   886  	}
   887  
   888  	out := in
   889  
   890  	for bad, good := range replacements {
   891  		out = strings.ReplaceAll(out, bad, good)
   892  	}
   893  	return out
   894  }
   895  
   896  func overrideBuild(tg *configpb.TestGroup, cols []InflatedColumn) {
   897  	fmt := tg.BuildOverrideStrftime
   898  	if fmt == "" {
   899  		return
   900  	}
   901  	fmt = FormatStrftime(fmt)
   902  	for _, col := range cols {
   903  		started := int64(col.Column.Started)
   904  		when := time.Unix(started/1000, (started%1000)*int64(time.Millisecond/time.Nanosecond))
   905  		col.Column.Build = when.Format(fmt)
   906  	}
   907  }
   908  
// columnIDSeparator joins a column's Name and Build into a grouping key.
// A private-use rune, unlikely to appear in either field.
const columnIDSeparator = "\ue000"

// groupColumns merges columns with the same Name and Build.
//
// Cells are joined together, splitting those with the same name.
// Started is the smallest value.
// Extra is the most recent filled value.
func groupColumns(tg *configpb.TestGroup, cols []InflatedColumn) []InflatedColumn {
	groups := map[string][]InflatedColumn{}
	var ids []string // group keys, in first-seen order (preserves column order)
	for _, c := range cols {
		id := c.Column.Name + columnIDSeparator + c.Column.Build
		groups[id] = append(groups[id], c)
		ids = append(ids, id)
	}

	if len(groups) == 0 {
		return nil
	}

	out := make([]InflatedColumn, 0, len(groups))

	seen := make(map[string]bool, len(groups))

	for _, id := range ids {
		if seen[id] {
			continue // already merged this group.
		}
		seen[id] = true
		var col InflatedColumn

		groupedCells := groups[id]
		if len(groupedCells) == 1 {
			// Nothing to merge; keep the column as-is.
			out = append(out, groupedCells[0])
			continue
		}

		// All cells from the group, keyed by row name; a row may
		// collect multiple cells (one per merged column).
		cells := map[string][]Cell{}

		var count int // total cells across the whole group
		for i, c := range groupedCells {
			if i == 0 {
				// Seed the merged column from the first member.
				col.Column = c.Column
			} else {
				// Keep the earliest start time.
				if c.Column.Started < col.Column.Started {
					col.Column.Started = c.Column.Started
				}
				// Keep the naturally-largest hint.
				if !sortorder.NaturalLess(c.Column.Hint, col.Column.Hint) {
					col.Column.Hint = c.Column.Hint
				}
				// Merge header values position by position.
				// (This i shadows the outer loop's i.)
				for i, val := range c.Column.Extra {
					if i == len(col.Column.Extra) {
						col.Column.Extra = append(col.Column.Extra, val)
						continue
					}
					if val == "" || val == col.Column.Extra[i] {
						continue
					}
					if col.Column.Extra[i] == "" {
						col.Column.Extra[i] = val
					} else if i < len(tg.GetColumnHeader()) && tg.GetColumnHeader()[i].ListAllValues {
						// Header configured to list every distinct value.
						col.Column.Extra[i] = joinHeaders(col.Column.Extra[i], val)
					} else {
						col.Column.Extra[i] = "*" // values differ
					}
				}
			}
			for key, cell := range c.Cells {
				cells[key] = append(cells[key], cell)
				count++
			}
		}
		if tg.IgnoreOldResults {
			// Only the first cell per row survives.
			col.Cells = make(map[string]Cell, len(cells))
		} else {
			col.Cells = make(map[string]Cell, count)
		}
		for name, duplicateCells := range cells {
			if tg.IgnoreOldResults {
				col.Cells[name] = duplicateCells[0]
				continue
			}
			// Keep every duplicate, letting SplitCells disambiguate
			// rows that share a name. (Inner name shadows outer.)
			for name, cell := range SplitCells(name, duplicateCells...) {
				col.Cells[name] = cell
			}
		}
		out = append(out, col)
	}
	return out
}
   999  
  1000  func joinHeaders(headers ...string) string {
  1001  	headerSet := make(map[string]bool)
  1002  	for _, header := range headers {
  1003  		vals := strings.Split(header, "||")
  1004  		for _, val := range vals {
  1005  			if val == "" {
  1006  				continue
  1007  			}
  1008  			headerSet[val] = true
  1009  		}
  1010  	}
  1011  	keys := []string{}
  1012  	for k := range headerSet {
  1013  		keys = append(keys, k)
  1014  	}
  1015  	sort.Strings(keys)
  1016  	return strings.Join(keys, "||")
  1017  }
  1018  
  1019  // days converts days float into a time.Duration, assuming a 24 hour day.
  1020  //
  1021  // A day is not always 24 hours due to things like leap-seconds.
  1022  // We do not need this level of precision though, so ignore the complexity.
  1023  func days(d float64) time.Duration {
  1024  	return time.Duration(24*d) * time.Hour // Close enough
  1025  }
  1026  
// ConstructGrid will append all the inflatedColumns into the returned Grid.
//
// The returned Grid has correctly compressed row values.
func ConstructGrid(log logrus.FieldLogger, cols []InflatedColumn, issues map[string][]string, failuresToAlert, passesToDisableAlert int, useCommitAsBuildID bool, userProperty string, brokenThreshold float32, columnHeader []*configpb.TestGroup_ColumnHeader) *statepb.Grid {
	// Add the columns into a grid message
	var grid statepb.Grid
	rows := map[string]*statepb.Row{} // For fast target => row lookup
	// When alerting is on, closing an alert requires at least one pass.
	if failuresToAlert > 0 && passesToDisableAlert == 0 {
		passesToDisableAlert = 1
	}

	for _, col := range cols {
		if brokenThreshold > 0.0 && col.Column != nil {
			col.Column.Stats = columnStats(col.Cells, brokenThreshold)
		}
		AppendColumn(&grid, rows, col)
	}

	dropEmptyRows(log, &grid, rows)

	// Merge caller-supplied issues into each row, de-duplicate, and
	// sort them so the largest issue numbers come first.
	for name, row := range rows {
		row.Issues = append(row.Issues, issues[name]...)
		issueSet := make(map[string]bool, len(row.Issues))
		for _, i := range row.Issues {
			issueSet[i] = true
		}
		row.Issues = make([]string, 0, len(issueSet))
		for i := range issueSet {
			row.Issues = append(row.Issues, i)
		}
		sort.SliceStable(row.Issues, func(i, j int) bool {
			// Largest issues at the front of the list
			return !sortorder.NaturalLess(row.Issues[i], row.Issues[j])
		})
	}

	alertRows(grid.Columns, grid.Rows, failuresToAlert, passesToDisableAlert, useCommitAsBuildID, userProperty, columnHeader)
	// Rows sort by natural name order.
	sort.SliceStable(grid.Rows, func(i, j int) bool {
		return sortorder.NaturalLess(grid.Rows[i].Name, grid.Rows[j].Name)
	})

	for _, row := range grid.Rows {
		// Drop UserProperty entirely when every entry is empty.
		del := true
		for _, up := range row.UserProperty {
			if up != "" {
				del = false
				break
			}
		}
		if del {
			row.UserProperty = nil
		}
		// Keep metric names and messages in deterministic order.
		sort.SliceStable(row.Metric, func(i, j int) bool {
			return sortorder.NaturalLess(row.Metric[i], row.Metric[j])
		})
		sort.SliceStable(row.Metrics, func(i, j int) bool {
			return sortorder.NaturalLess(row.Metrics[i].Name, row.Metrics[j].Name)
		})
	}
	return &grid
}
  1088  
  1089  // constructGridFromGroupConfig will append all the inflatedColumns into the returned Grid.
  1090  //
  1091  // The returned Grid has correctly compressed row values.
  1092  func constructGridFromGroupConfig(log logrus.FieldLogger, group *configpb.TestGroup, cols []InflatedColumn, issues map[string][]string) *statepb.Grid {
  1093  	usesK8sClient := group.UseKubernetesClient || (group.GetResultSource().GetGcsConfig() != nil)
  1094  	return ConstructGrid(log, cols, issues, int(group.GetNumFailuresToAlert()), int(group.GetNumPassesToDisableAlert()), usesK8sClient, group.GetUserProperty(), 0.0, group.GetColumnHeader())
  1095  }
  1096  
  1097  func dropEmptyRows(log logrus.FieldLogger, grid *statepb.Grid, rows map[string]*statepb.Row) {
  1098  	filled := make([]*statepb.Row, 0, len(rows))
  1099  	var dropped int
  1100  	for _, r := range grid.Rows {
  1101  		var found bool
  1102  		f := result.Iter(r.Results)
  1103  		for {
  1104  			res, more := f()
  1105  			if !more {
  1106  				break
  1107  			}
  1108  			if res == statuspb.TestStatus_NO_RESULT {
  1109  				continue
  1110  			}
  1111  			found = true
  1112  			break
  1113  		}
  1114  		if !found {
  1115  			dropped++
  1116  			delete(rows, r.Name)
  1117  			continue
  1118  		}
  1119  		filled = append(filled, r)
  1120  	}
  1121  
  1122  	if dropped == 0 {
  1123  		return
  1124  	}
  1125  
  1126  	grid.Rows = filled
  1127  	log.WithField("dropped", dropped).Info("Dropped old rows")
  1128  }
  1129  
  1130  // appendMetric adds the value at index to metric.
  1131  //
  1132  // Handles the details of sparse-encoding the results.
  1133  // Indices must be monotonically increasing for the same metric.
  1134  func appendMetric(metric *statepb.Metric, idx int32, value float64) {
  1135  	if l := int32(len(metric.Indices)); l == 0 || metric.Indices[l-2]+metric.Indices[l-1] != idx {
  1136  		// If we append V to idx 9 and metric.Indices = [3, 4] then the last filled index is 3+4-1=7
  1137  		// So that means we have holes in idx 7 and 8, so start a new group.
  1138  		metric.Indices = append(metric.Indices, idx, 1)
  1139  	} else {
  1140  		metric.Indices[l-1]++ // Expand the length of the current filled list
  1141  	}
  1142  	metric.Values = append(metric.Values, value)
  1143  }
  1144  
// emptyCell pads rows for columns in which they have no result.
var emptyCell = Cell{Result: statuspb.TestStatus_NO_RESULT}
  1146  
  1147  func hasCellID(name string) bool {
  1148  	return !strings.Contains(name, "@TESTGRID@")
  1149  }
  1150  
  1151  // truncate truncates a message to max runes (and ellipses). Max = 0 returns the original message.
  1152  func truncate(msg string, max int) string {
  1153  	if max == 0 || len(msg) <= max {
  1154  		return msg
  1155  	}
  1156  	convert := func(s string) string {
  1157  		if utf8.ValidString(s) {
  1158  			return s
  1159  		}
  1160  		return strings.ToValidUTF8(s, "")
  1161  		// return s
  1162  	}
  1163  	start := convert(msg[:max/2])
  1164  	end := convert(msg[len(msg)-max/2:])
  1165  	return start + "..." + end
  1166  }
  1167  
// appendCell adds the rowResult column to the row.
//
// Handles the details like missing fields and run-length-encoding the result.
func appendCell(row *statepb.Row, cell Cell, start, count int) {
	latest := int32(cell.Result)
	n := len(row.Results)
	// Results are run-length encoded as (status, repetitions) pairs:
	// start a new pair when empty or the status changed, otherwise
	// extend the current run. (When n == 0 the second case expression
	// is never evaluated, so the index is safe.)
	switch {
	case n == 0, row.Results[n-2] != latest:
		row.Results = append(row.Results, latest, int32(count))
	default:
		row.Results[n-1] += int32(count)
	}

	addCellID := hasCellID(row.Name)

	for i := 0; i < count; i++ {
		columnIdx := int32(start + i)
		for metricName, measurement := range cell.Metrics {
			var metric *statepb.Metric
			var ok bool
			// Record the metric name on the row if it is new.
			for _, name := range row.Metric {
				if name == metricName {
					ok = true
					break
				}
			}
			if !ok {
				row.Metric = append(row.Metric, metricName)
			}
			// Find the row's existing Metric message for this name;
			// metric ends up nil when no element matched.
			for _, metric = range row.Metrics {
				if metric.Name == metricName {
					break
				}
				metric = nil
			}
			if metric == nil {
				metric = &statepb.Metric{Name: metricName}
				row.Metrics = append(row.Metrics, metric)
			}
			// Record this measurement at the current column index.
			appendMetric(metric, columnIdx, measurement)
		}
		if cell.Result == statuspb.TestStatus_NO_RESULT {
			continue
		}
		if addCellID {
			// Names with the @TESTGRID@ sentinel skip these; presumably
			// their values derive from the row name — TODO confirm.
			row.CellIds = append(row.CellIds, cell.CellID)
			row.Properties = append(row.Properties, &statepb.Property{
				Property: cell.Properties,
			})
		}
		// Javascript client expects no result cells to skip icons/messages
		row.Messages = append(row.Messages, truncate(cell.Message, 140))
		row.Icons = append(row.Icons, cell.Icon)
		row.UserProperty = append(row.UserProperty, cell.UserProperty)
	}

	row.Issues = append(row.Issues, cell.Issues...)
}
  1228  
  1229  // AppendColumn adds the build column to the grid.
  1230  //
  1231  // This handles details like:
  1232  // * rows appearing/disappearing in the middle of the run.
  1233  // * adding auto metadata like duration, commit as well as any user-added metadata
  1234  // * extracting build metadata into the appropriate column header
  1235  // * Ensuring row names are unique and formatted with metadata
  1236  func AppendColumn(grid *statepb.Grid, rows map[string]*statepb.Row, inflated InflatedColumn) {
  1237  	grid.Columns = append(grid.Columns, inflated.Column)
  1238  	colIdx := len(grid.Columns) - 1
  1239  
  1240  	missing := map[string]*statepb.Row{}
  1241  	for name, row := range rows {
  1242  		missing[name] = row
  1243  	}
  1244  
  1245  	for name, cell := range inflated.Cells {
  1246  		delete(missing, name)
  1247  
  1248  		row, ok := rows[name]
  1249  		if !ok {
  1250  			id := cell.ID
  1251  			if id == "" {
  1252  				id = name
  1253  			}
  1254  			row = &statepb.Row{
  1255  				Name:    name,
  1256  				Id:      id,
  1257  				CellIds: []string{}, // TODO(fejta): try and leave this nil
  1258  			}
  1259  			rows[name] = row
  1260  			grid.Rows = append(grid.Rows, row)
  1261  			if colIdx > 0 {
  1262  				appendCell(row, emptyCell, 0, colIdx)
  1263  			}
  1264  		}
  1265  		appendCell(row, cell, colIdx, 1)
  1266  	}
  1267  
  1268  	for _, row := range missing {
  1269  		appendCell(row, emptyCell, colIdx, 1)
  1270  	}
  1271  }
  1272  
  1273  // alertRows configures the alert for every row that has one.
  1274  func alertRows(cols []*statepb.Column, rows []*statepb.Row, openFailures, closePasses int, useCommitAsBuildID bool, userProperty string, columnHeader []*configpb.TestGroup_ColumnHeader) {
  1275  	for _, r := range rows {
  1276  		r.AlertInfo = alertRow(cols, r, openFailures, closePasses, useCommitAsBuildID, userProperty, columnHeader)
  1277  	}
  1278  }
  1279  
// alertRow returns an AlertInfo proto if there have been failuresToOpen consecutive failures more recently than passesToClose.
func alertRow(cols []*statepb.Column, row *statepb.Row, failuresToOpen, passesToClose int, useCommitAsBuildID bool, userPropertyName string, columnHeader []*configpb.TestGroup_ColumnHeader) *statepb.AlertInfo {
	if failuresToOpen == 0 {
		return nil // alerting disabled for this group
	}
	var concurrentFailures int
	var totalFailures int32
	var passes int
	// compressedIdx indexes into CellIds/Messages/UserProperty, which
	// only have entries for cells with results (including RUNNING).
	var compressedIdx int
	f := result.Iter(row.Results)
	var firstFail *statepb.Column  // oldest failure seen in the outage
	var latestFail *statepb.Column // most recent failure in the outage
	var latestPass *statepb.Column // most recent pass before the outage
	var failIdx int                // compressed index of firstFail
	var latestFailIdx int          // compressed index of latestFail
	customColumnHeaders := make(map[string]string)
	// find the first number of consecutive passesToClose (no alert)
	// or else failuresToOpen (alert).
	for _, col := range cols {
		// TODO(fejta): ignore old running
		rawRes, _ := f()
		res := result.Coalesce(rawRes, result.IgnoreRunning)
		if res == statuspb.TestStatus_NO_RESULT {
			// RUNNING coalesces to NO_RESULT but still occupies a
			// compressed slot (it has a message/cell id).
			if rawRes == statuspb.TestStatus_RUNNING {
				compressedIdx++
			}
			continue
		}
		if res == statuspb.TestStatus_PASS {
			passes++
			if concurrentFailures >= failuresToOpen {
				if latestPass == nil {
					latestPass = col // most recent pass before outage
				}
				if passes >= passesToClose {
					break // enough failures and enough passes, definitely past the start of the failure
				}
			} else if passes >= passesToClose {
				return nil // enough passes but not enough failures, there is no outage
			} else {
				concurrentFailures = 0
			}
		}
		if res == statuspb.TestStatus_FAIL {
			passes = 0
			latestPass = nil
			concurrentFailures++
			totalFailures++
			if totalFailures == 1 { // note most recent failure for this outage
				latestFailIdx = compressedIdx
				latestFail = col
			}
			// Keep moving firstFail back as older failures are scanned.
			failIdx = compressedIdx
			firstFail = col
		}
		if res == statuspb.TestStatus_FLAKY {
			passes = 0
			if concurrentFailures >= failuresToOpen {
				break // cannot definitively say which commit is at fault
			}
			concurrentFailures = 0
		}
		compressedIdx++

		// Capture custom column-header values (label, property, or
		// configuration value); later scanned columns overwrite earlier.
		for i := 0; i < len(columnHeader); i++ {
			if i >= len(col.Extra) {
				logrus.WithFields(logrus.Fields{
					"started":                 time.Unix(0, int64(col.GetStarted()*float64(time.Millisecond))),
					"additionalColumnHeaders": col.GetExtra(),
				}).Trace("Insufficient column header values to record.")
				break
			}
			if columnHeader[i].Label != "" {
				customColumnHeaders[columnHeader[i].Label] = col.Extra[i]
			} else if columnHeader[i].Property != "" {
				customColumnHeaders[columnHeader[i].Property] = col.Extra[i]
			} else {
				customColumnHeaders[columnHeader[i].ConfigurationValue] = col.Extra[i]
			}
		}
	}
	if concurrentFailures < failuresToOpen {
		return nil // not enough consecutive failures to open an alert
	}
	var id string
	var latestID string
	if len(row.CellIds) > 0 { // not all rows have cell ids
		id = row.CellIds[failIdx]
		latestID = row.CellIds[latestFailIdx]
	}
	msg := row.Messages[latestFailIdx]
	var userProperties map[string]string
	if row.UserProperty != nil && latestFailIdx < len(row.UserProperty) && row.UserProperty[latestFailIdx] != "" {
		userProperties = map[string]string{
			userPropertyName: row.UserProperty[latestFailIdx],
		}
	}

	return alertInfo(totalFailures, msg, id, latestID, userProperties, firstFail, latestFail, latestPass, useCommitAsBuildID, customColumnHeaders)
}
  1380  
  1381  // alertInfo returns an alert proto with the configured fields
  1382  func alertInfo(failures int32, msg, cellID, latestCellID string, userProperties map[string]string, fail, latestFail, pass *statepb.Column, useCommitAsBuildID bool, customColumnHeaders map[string]string) *statepb.AlertInfo {
  1383  	return &statepb.AlertInfo{
  1384  		FailCount:           failures,
  1385  		FailBuildId:         buildID(fail, useCommitAsBuildID),
  1386  		LatestFailBuildId:   buildID(latestFail, useCommitAsBuildID),
  1387  		FailTime:            stamp(fail),
  1388  		FailTestId:          cellID,
  1389  		LatestFailTestId:    latestCellID,
  1390  		FailureMessage:      msg,
  1391  		PassTime:            stamp(pass),
  1392  		PassBuildId:         buildID(pass, useCommitAsBuildID),
  1393  		EmailAddresses:      emailAddresses(fail),
  1394  		HotlistIds:          hotlistIDs(fail),
  1395  		Properties:          userProperties,
  1396  		CustomColumnHeaders: customColumnHeaders,
  1397  	}
  1398  }
  1399  
  1400  func columnStats(cells map[string]Cell, brokenThreshold float32) *statepb.Stats {
  1401  	var passes, fails, total int32
  1402  	var pending bool
  1403  	if brokenThreshold <= 0.0 {
  1404  		return nil
  1405  	}
  1406  	if cells == nil {
  1407  		return nil
  1408  	}
  1409  	for _, cell := range cells {
  1410  		if cell.Result == statuspb.TestStatus_RUNNING {
  1411  			pending = true
  1412  		}
  1413  		status := result.Coalesce(cell.Result, false)
  1414  		switch status {
  1415  		case statuspb.TestStatus_PASS:
  1416  			passes++
  1417  			total++
  1418  		case statuspb.TestStatus_FAIL:
  1419  			fails++
  1420  			total++
  1421  		case statuspb.TestStatus_FLAKY, statuspb.TestStatus_UNKNOWN:
  1422  			total++
  1423  		default:
  1424  			// blank cell or unrecognized status, do nothing
  1425  		}
  1426  	}
  1427  	var failRatio float32
  1428  	if total != 0.0 {
  1429  		failRatio = float32(fails) / float32(total)
  1430  	}
  1431  	return &statepb.Stats{
  1432  		FailCount:  fails,
  1433  		PassCount:  passes,
  1434  		TotalCount: total,
  1435  		Pending:    pending,
  1436  		Broken:     failRatio > brokenThreshold,
  1437  	}
  1438  }
  1439  
  1440  func hotlistIDs(col *statepb.Column) []string {
  1441  	var ids []string
  1442  	for _, hotlistID := range strings.Split(col.HotlistIds, ",") {
  1443  		if id := strings.TrimSpace(hotlistID); id != "" {
  1444  			ids = append(ids, strings.TrimSpace(hotlistID))
  1445  		}
  1446  	}
  1447  	return ids
  1448  }
  1449  
  1450  func emailAddresses(col *statepb.Column) []string {
  1451  	if col == nil {
  1452  		return []string{}
  1453  	}
  1454  	return col.GetEmailAddresses()
  1455  }
  1456  
  1457  // buildID extracts the ID from the first extra row (where commit data is) or else the Build field.
  1458  func buildID(col *statepb.Column, getCommitHeader bool) string {
  1459  	if col == nil {
  1460  		return ""
  1461  	}
  1462  	if getCommitHeader && len(col.Extra) > 0 {
  1463  		return col.Extra[0]
  1464  	}
  1465  	return col.Build
  1466  }
  1467  
  1468  const billion = 1e9
  1469  
  1470  // stamp converts seconds into a timestamp proto
  1471  // TODO(#683): col.Started should be a timestamp instead of a float
  1472  func stamp(col *statepb.Column) *timestamp.Timestamp {
  1473  	if col == nil {
  1474  		return nil
  1475  	}
  1476  	seconds := col.Started / 1000
  1477  	floor := math.Floor(seconds)
  1478  	remain := seconds - floor
  1479  	return &timestamp.Timestamp{
  1480  		Seconds: int64(floor),
  1481  		Nanos:   int32(remain * billion),
  1482  	}
  1483  }