github.com/GoogleCloudPlatform/testgrid@v0.0.174/pkg/tabulator/tabstate.go (about)

     1  /*
     2  Copyright 2022 The TestGrid Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package tabulator processes test group state into tab state.
    18  package tabulator
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"net/url"
    25  	"path"
    26  	"sync"
    27  	"time"
    28  
    29  	"github.com/sirupsen/logrus"
    30  	"google.golang.org/protobuf/proto"
    31  
    32  	"github.com/GoogleCloudPlatform/testgrid/config"
    33  	"github.com/GoogleCloudPlatform/testgrid/config/snapshot"
    34  	configpb "github.com/GoogleCloudPlatform/testgrid/pb/config"
    35  	statepb "github.com/GoogleCloudPlatform/testgrid/pb/state"
    36  	tspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status"
    37  	"github.com/GoogleCloudPlatform/testgrid/pkg/updater"
    38  	"github.com/GoogleCloudPlatform/testgrid/util/gcs"
    39  	"github.com/GoogleCloudPlatform/testgrid/util/metrics"
    40  )
    41  
// componentName identifies this controller in metrics and log fields.
const componentName = "tabulator"

// writeTimeout bounds how long a single tab-state write may take.
const writeTimeout = 10 * time.Minute
    44  
// Metrics holds metrics relevant to this controller.
type Metrics struct {
	// UpdateState counts success/failure cycles of tab-state writes.
	UpdateState metrics.Cyclic
	// DelaySeconds reports how far behind schedule the tabulator is.
	DelaySeconds metrics.Duration
}
    50  
    51  // CreateMetrics creates metrics for this controller
    52  func CreateMetrics(factory metrics.Factory) *Metrics {
    53  	return &Metrics{
    54  		UpdateState:  factory.NewCyclic(componentName),
    55  		DelaySeconds: factory.NewDuration("delay", "Seconds tabulator is behind schedule", "component"),
    56  	}
    57  }
    58  
// writeTask pairs one dashboard tab with the test-group grid it renders.
type writeTask struct {
	dashboard *configpb.Dashboard
	tab       *configpb.DashboardTab
	group     *configpb.TestGroup
	// data is a deep copy of the group's grid, filled in just before the
	// task is handed to a write worker.
	data *statepb.Grid //TODO(chases2): change to inflatedColumns (and additional data) now that "filter-columns" is used everywhere
}
    65  
    66  func mapTasks(cfg *snapshot.Config) map[string][]writeTask {
    67  	groupToTabs := make(map[string][]writeTask, len(cfg.Groups))
    68  
    69  	for _, dashboard := range cfg.Dashboards {
    70  		for _, tab := range dashboard.DashboardTab {
    71  			g := tab.TestGroupName
    72  			groupToTabs[g] = append(groupToTabs[g], writeTask{
    73  				dashboard: dashboard,
    74  				tab:       tab,
    75  				group:     cfg.Groups[g],
    76  			})
    77  		}
    78  	}
    79  
    80  	return groupToTabs
    81  }
    82  
// Fixer should adjust the queue until the context expires.
// Implementations typically re-prioritize or re-time queue entries and
// return once the context is canceled; context.Canceled is not treated
// as a failure by the caller.
type Fixer func(context.Context, *config.TestGroupQueue) error
    85  
// UpdateOptions aggregates the Update function parameter into a single structure.
type UpdateOptions struct {
	// ConfigPath is the GCS path of the TestGrid configuration.
	ConfigPath gcs.Path
	// ReadConcurrency and WriteConcurrency size the reader/writer worker
	// pools; both must be at least 1.
	ReadConcurrency  int
	WriteConcurrency int
	// GridPathPrefix locates test-group state; TabsPathPrefix locates tab state.
	GridPathPrefix string
	TabsPathPrefix string
	// AllowedGroups, if non-empty, restricts processing to these groups.
	AllowedGroups []string
	// Confirm actually uploads results; when false this is a dry run.
	Confirm             bool
	CalculateStats      bool
	UseTabAlertSettings bool
	// ExtendState grafts new results onto previously written tab state.
	ExtendState bool
	// Freq is the cycle frequency; 0 means run once.
	Freq time.Duration
}
   100  
   101  // Update tab state with the given frequency continuously. If freq == 0, runs only once.
   102  //
   103  // Copies the grid into the tab state, removing unneeded data.
   104  // Observes each test group in allowedGroups, or all of them in the config if not specified
   105  func Update(ctx context.Context, client gcs.ConditionalClient, mets *Metrics, opts *UpdateOptions, fixers ...Fixer) error {
   106  	ctx, cancel := context.WithCancel(ctx)
   107  	defer cancel()
   108  
   109  	if opts.ReadConcurrency < 1 || opts.WriteConcurrency < 1 {
   110  		return fmt.Errorf("concurrency must be positive, got read %d and write %d", opts.ReadConcurrency, opts.WriteConcurrency)
   111  	}
   112  	log := logrus.WithField("config", opts.ConfigPath)
   113  
   114  	var q config.TestGroupQueue
   115  
   116  	log.Debug("Observing config...")
   117  	cfgChanged, err := snapshot.Observe(ctx, log, client, opts.ConfigPath, time.NewTicker(time.Minute).C)
   118  	if err != nil {
   119  		return fmt.Errorf("error while observing config %q: %w", opts.ConfigPath.String(), err)
   120  	}
   121  
   122  	var cfg *snapshot.Config
   123  	var tasksPerGroup map[string][]writeTask
   124  	fixSnapshot := func(newConfig *snapshot.Config) {
   125  		cfg = newConfig
   126  		tasksPerGroup = mapTasks(cfg)
   127  
   128  		if len(opts.AllowedGroups) != 0 {
   129  			groups := make([]*configpb.TestGroup, 0, len(opts.AllowedGroups))
   130  			for _, group := range opts.AllowedGroups {
   131  				c, ok := cfg.Groups[group]
   132  				if !ok {
   133  					log.Errorf("Could not find requested group %q in config", c)
   134  					continue
   135  				}
   136  				groups = append(groups, c)
   137  			}
   138  
   139  			q.Init(log, groups, time.Now())
   140  			return
   141  
   142  		}
   143  
   144  		groups := make([]*configpb.TestGroup, 0, len(cfg.Groups))
   145  		for _, group := range cfg.Groups {
   146  			groups = append(groups, group)
   147  		}
   148  
   149  		q.Init(log, groups, time.Now())
   150  	}
   151  
   152  	fixSnapshot(<-cfgChanged)
   153  
   154  	go func(ctx context.Context) {
   155  		fixCtx, fixCancel := context.WithCancel(ctx)
   156  		var fixWg sync.WaitGroup
   157  		fixAll := func() {
   158  			n := len(fixers)
   159  			log.WithField("fixers", n).Debug("Starting fixers on current groups...")
   160  			fixWg.Add(n)
   161  			for i, fix := range fixers {
   162  				go func(i int, fix Fixer) {
   163  					defer fixWg.Done()
   164  					if err := fix(fixCtx, &q); err != nil && !errors.Is(err, context.Canceled) {
   165  						log.WithError(err).WithField("fixer", i).Warning("Fixer failed")
   166  					}
   167  				}(i, fix)
   168  			}
   169  			log.WithField("fixers", n).Info("Started fixers on current groups.")
   170  		}
   171  
   172  		ticker := time.NewTicker(time.Minute)
   173  		fixAll()
   174  		defer ticker.Stop()
   175  		for {
   176  			depth, next, when := q.Status()
   177  			log := log.WithField("depth", depth)
   178  			if next != nil {
   179  				log = log.WithField("next", &next)
   180  			}
   181  			delay := time.Since(when)
   182  			if delay < 0 {
   183  				delay = 0
   184  				log = log.WithField("sleep", -delay)
   185  			}
   186  			mets.DelaySeconds.Set(delay, componentName)
   187  			log.Debug("Calculated metrics")
   188  
   189  			select {
   190  			case <-ctx.Done():
   191  				ticker.Stop()
   192  				fixCancel()
   193  				fixWg.Wait()
   194  				return
   195  			case newConfig, ok := <-cfgChanged:
   196  				if !ok {
   197  					log.Info("Configuration channel closed")
   198  					cfgChanged = nil
   199  					continue
   200  				}
   201  				log.Info("Configuration changed")
   202  				fixCancel()
   203  				fixWg.Wait()
   204  				fixCtx, fixCancel = context.WithCancel(ctx)
   205  				fixSnapshot(newConfig)
   206  				fixAll()
   207  			case <-ticker.C:
   208  			}
   209  		}
   210  	}(ctx)
   211  
   212  	// Set up worker pools
   213  	groups := make(chan *configpb.TestGroup)
   214  	tasks := make(chan writeTask)
   215  	var tabLock sync.Mutex
   216  
   217  	read := func(ctx context.Context, log *logrus.Entry, group *configpb.TestGroup) error {
   218  		if group == nil {
   219  			return errors.New("nil group to read")
   220  		}
   221  
   222  		fromPath, err := updater.TestGroupPath(opts.ConfigPath, opts.GridPathPrefix, group.Name)
   223  		if err != nil {
   224  			return fmt.Errorf("can't make tg path %q: %w", group.Name, err)
   225  		}
   226  
   227  		log.WithField("from", fromPath.String()).Info("Reading state")
   228  
   229  		grid, _, err := gcs.DownloadGrid(ctx, client, *fromPath)
   230  		if err != nil {
   231  			return fmt.Errorf("downloadGrid(%s): %w", fromPath, err)
   232  		}
   233  
   234  		tabLock.Lock()
   235  		defer tabLock.Unlock()
   236  		// lock out all other readers so that all these tabs get handled as soon as possible
   237  		for _, task := range tasksPerGroup[group.Name] {
   238  			log := log.WithFields(logrus.Fields{
   239  				"group":     task.group.GetName(),
   240  				"dashboard": task.dashboard.GetName(),
   241  				"tab":       task.tab.GetName(),
   242  			})
   243  			select {
   244  			case <-ctx.Done():
   245  				log.Debug("Skipping irrelevant task")
   246  				continue
   247  			default:
   248  				out := task
   249  				out.data = proto.Clone(grid).(*statepb.Grid)
   250  				log.Debug("Requesting write task")
   251  				tasks <- out
   252  			}
   253  		}
   254  		return nil
   255  	}
   256  
   257  	// Run threads continuously
   258  	var readWg, writeWg sync.WaitGroup
   259  	readWg.Add(opts.ReadConcurrency)
   260  	for i := 0; i < opts.ReadConcurrency; i++ {
   261  		go func() {
   262  			defer readWg.Done()
   263  			for group := range groups {
   264  				readCtx, cancel := context.WithCancel(ctx)
   265  				log = log.WithField("group", group.Name)
   266  				err := read(readCtx, log, group)
   267  				cancel()
   268  				if err != nil {
   269  					next := time.Now().Add(opts.Freq / 10)
   270  					q.Fix(group.Name, next, false)
   271  					log.WithError(err).WithField("retry-at", next).Error("failed to read, retry later")
   272  				}
   273  			}
   274  		}()
   275  	}
   276  	writeWg.Add(opts.WriteConcurrency)
   277  	for i := 0; i < opts.WriteConcurrency; i++ {
   278  		go func() {
   279  			defer writeWg.Done()
   280  			for task := range tasks {
   281  				writeCtx, cancel := context.WithTimeout(ctx, writeTimeout)
   282  				finish := mets.UpdateState.Start()
   283  				log = log.WithField("dashboard", task.dashboard.Name).WithField("tab", task.tab.Name)
   284  				err := createTabState(writeCtx, log, client, task, opts.ConfigPath, opts.TabsPathPrefix, opts.Confirm, opts.CalculateStats, opts.UseTabAlertSettings, opts.ExtendState)
   285  				cancel()
   286  				if err != nil {
   287  					finish.Fail()
   288  					log.Errorf("write: %v", err)
   289  					continue
   290  				}
   291  				finish.Success()
   292  			}
   293  		}()
   294  	}
   295  
   296  	defer writeWg.Wait()
   297  	defer close(tasks)
   298  	defer readWg.Wait()
   299  	defer close(groups)
   300  
   301  	return q.Send(ctx, groups, opts.Freq)
   302  }
   303  
   304  // createTabState creates the tab state from the group state
   305  func createTabState(ctx context.Context, log logrus.FieldLogger, client gcs.Client, task writeTask, configPath gcs.Path, tabsPathPrefix string, confirm, calculateStats, useTabAlerts, extendState bool) error {
   306  	location, err := TabStatePath(configPath, tabsPathPrefix, task.dashboard.Name, task.tab.Name)
   307  	if err != nil {
   308  		return fmt.Errorf("can't make dashtab path %s/%s: %w", task.dashboard.Name, task.tab.Name, err)
   309  	}
   310  
   311  	log.WithFields(logrus.Fields{
   312  		"to": location.String(),
   313  	}).Info("Calculating state")
   314  
   315  	var existingGrid *statepb.Grid
   316  	if extendState {
   317  		// TODO(chases2): Download grid only if task.Data was truncated (last column is UNKNOWN)
   318  		existingGrid, _, err = gcs.DownloadGrid(ctx, client, *location)
   319  		if err != nil {
   320  			return fmt.Errorf("downloadGrid: %w", err)
   321  		}
   322  	}
   323  
   324  	grid, err := tabulate(ctx, log, task.data, task.tab, task.group, calculateStats, useTabAlerts, existingGrid)
   325  	if err != nil {
   326  		return fmt.Errorf("tabulate: %w", err)
   327  	}
   328  
   329  	if !confirm {
   330  		log.Debug("Successfully created tab state; discarding")
   331  		return nil
   332  	}
   333  
   334  	buf, err := gcs.MarshalGrid(grid)
   335  	if err != nil {
   336  		return fmt.Errorf("marshalGrid: %w", err)
   337  	}
   338  
   339  	_, err = client.Upload(ctx, *location, buf, gcs.DefaultACL, gcs.NoCache)
   340  	if err != nil {
   341  		return fmt.Errorf("client.Upload(%s): %w", location, err)
   342  	}
   343  	return nil
   344  }
   345  
   346  // TabStatePath returns the path for a given tab.
   347  func TabStatePath(configPath gcs.Path, tabPrefix, dashboardName, tabName string) (*gcs.Path, error) {
   348  	name := path.Join(tabPrefix, dashboardName, tabName)
   349  	u, err := url.Parse(name)
   350  	if err != nil {
   351  		return nil, fmt.Errorf("invalid url %s: %w", name, err)
   352  	}
   353  	np, err := configPath.ResolveReference(u)
   354  	if err != nil {
   355  		return nil, fmt.Errorf("resolve reference: %w", err)
   356  	}
   357  	if np.Bucket() != configPath.Bucket() {
   358  		return nil, fmt.Errorf("tabState %s should not change bucket", name)
   359  	}
   360  	return np, nil
   361  }
   362  
   363  // tabulate transforms "grid" to only the part that needs to be displayed by the UI.
   364  // If an existingGrid is passed in, new results from "grid" will be grafted onto it.
   365  func tabulate(ctx context.Context, log logrus.FieldLogger, grid *statepb.Grid, tabCfg *configpb.DashboardTab, groupCfg *configpb.TestGroup, calculateStats, useTabAlertSettings bool, existingGrid *statepb.Grid) (*statepb.Grid, error) {
   366  	if grid == nil {
   367  		return nil, errors.New("no grid")
   368  	}
   369  	if tabCfg == nil || groupCfg == nil {
   370  		return nil, errors.New("no config")
   371  	}
   372  	filterRows, err := filterGrid(tabCfg.BaseOptions, grid.Rows)
   373  	if err != nil {
   374  		return nil, fmt.Errorf("filterGrid: %w", err)
   375  	}
   376  	grid.Rows = filterRows
   377  
   378  	inflatedGrid, issues, err := updater.InflateGrid(ctx, grid, time.Time{}, time.Now())
   379  	if err != nil {
   380  		return nil, fmt.Errorf("inflateGrid: %w", err)
   381  	}
   382  
   383  	inflatedGrid = dropEmptyColumns(inflatedGrid)
   384  
   385  	usesK8sClient := groupCfg.UseKubernetesClient || (groupCfg.GetResultSource().GetGcsConfig() != nil)
   386  	var brokenThreshold float32
   387  	if calculateStats {
   388  		brokenThreshold = tabCfg.BrokenColumnThreshold
   389  	}
   390  	var alert, unalert int
   391  	if useTabAlertSettings {
   392  		alert = int(tabCfg.GetAlertOptions().GetNumFailuresToAlert())
   393  		unalert = int(tabCfg.GetAlertOptions().GetNumPassesToDisableAlert())
   394  	} else {
   395  		alert = int(groupCfg.NumFailuresToAlert)
   396  		unalert = int(groupCfg.NumPassesToDisableAlert)
   397  	}
   398  	if existingGrid != nil {
   399  		existingInflatedGrid, _, err := updater.InflateGrid(ctx, existingGrid, time.Time{}, time.Now())
   400  		if err != nil {
   401  			return nil, fmt.Errorf("inflate existing grid: %w", err)
   402  		}
   403  		inflatedGrid = mergeGrids(existingInflatedGrid, inflatedGrid)
   404  	}
   405  	grid = updater.ConstructGrid(log, inflatedGrid, issues, alert, unalert, usesK8sClient, groupCfg.GetUserProperty(), brokenThreshold, groupCfg.GetColumnHeader())
   406  	return grid, nil
   407  }
   408  
   409  // mergeGrids merges two sorted, inflated grids together.
   410  // Precondition: "addition" is an output of an Updater with an "unknown" column last.
   411  // This final column will be dropped and replaced with existing results.
   412  func mergeGrids(existing, addition []updater.InflatedColumn) []updater.InflatedColumn {
   413  	if len(addition) == 0 {
   414  		return existing
   415  	}
   416  	seam := addition[len(addition)-1].Column.Started
   417  	min := 0
   418  	max := len(existing)
   419  	for min != max {
   420  		check := (min + max) / 2
   421  		if existing[check].Column.Started <= seam {
   422  			max = check
   423  		} else {
   424  			min = check + 1
   425  		}
   426  	}
   427  	if max == len(existing) {
   428  		return addition
   429  	}
   430  	return append(addition[:len(addition)-1], existing[max:]...)
   431  }
   432  
   433  // dropEmptyColumns drops every column in-place that has no results
   434  func dropEmptyColumns(grid []updater.InflatedColumn) []updater.InflatedColumn {
   435  	result := make([]updater.InflatedColumn, 0, len(grid))
   436  	for i, col := range grid {
   437  		for _, cell := range col.Cells {
   438  			if cell.Result != tspb.TestStatus_NO_RESULT {
   439  				result = append(result, grid[i])
   440  				break
   441  			}
   442  		}
   443  	}
   444  	if len(result) == 0 && len(grid) != 0 {
   445  		// If everything would be dropped, keep the first column so there's something left
   446  		result = grid[0:1]
   447  	}
   448  	return result
   449  }