github.com/GoogleCloudPlatform/testgrid@v0.0.174/pkg/tabulator/tabstate.go (about) 1 /* 2 Copyright 2022 The TestGrid Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package tabulator processes test group state into tab state. 18 package tabulator 19 20 import ( 21 "context" 22 "errors" 23 "fmt" 24 "net/url" 25 "path" 26 "sync" 27 "time" 28 29 "github.com/sirupsen/logrus" 30 "google.golang.org/protobuf/proto" 31 32 "github.com/GoogleCloudPlatform/testgrid/config" 33 "github.com/GoogleCloudPlatform/testgrid/config/snapshot" 34 configpb "github.com/GoogleCloudPlatform/testgrid/pb/config" 35 statepb "github.com/GoogleCloudPlatform/testgrid/pb/state" 36 tspb "github.com/GoogleCloudPlatform/testgrid/pb/test_status" 37 "github.com/GoogleCloudPlatform/testgrid/pkg/updater" 38 "github.com/GoogleCloudPlatform/testgrid/util/gcs" 39 "github.com/GoogleCloudPlatform/testgrid/util/metrics" 40 ) 41 42 const componentName = "tabulator" 43 const writeTimeout = 10 * time.Minute 44 45 // Metrics holds metrics relevant to this controller. 46 type Metrics struct { 47 UpdateState metrics.Cyclic 48 DelaySeconds metrics.Duration 49 } 50 51 // CreateMetrics creates metrics for this controller 52 func CreateMetrics(factory metrics.Factory) *Metrics { 53 return &Metrics{ 54 UpdateState: factory.NewCyclic(componentName), 55 DelaySeconds: factory.NewDuration("delay", "Seconds tabulator is behind schedule", "component"), 56 } 57 } 58 59 type writeTask struct { 60 dashboard *configpb.Dashboard 61 tab *configpb.DashboardTab 62 group *configpb.TestGroup 63 data *statepb.Grid //TODO(chases2): change to inflatedColumns (and additional data) now that "filter-columns" is used everywhere 64 } 65 66 func mapTasks(cfg *snapshot.Config) map[string][]writeTask { 67 groupToTabs := make(map[string][]writeTask, len(cfg.Groups)) 68 69 for _, dashboard := range cfg.Dashboards { 70 for _, tab := range dashboard.DashboardTab { 71 g := tab.TestGroupName 72 groupToTabs[g] = append(groupToTabs[g], writeTask{ 73 dashboard: dashboard, 74 tab: tab, 75 group: cfg.Groups[g], 76 }) 77 } 78 } 79 80 return groupToTabs 81 } 82 83 // Fixer should adjust the queue until the context expires. 84 type Fixer func(context.Context, *config.TestGroupQueue) error 85 86 // UpdateOptions aggregates the Update function parameter into a single structure. 87 type UpdateOptions struct { 88 ConfigPath gcs.Path 89 ReadConcurrency int 90 WriteConcurrency int 91 GridPathPrefix string 92 TabsPathPrefix string 93 AllowedGroups []string 94 Confirm bool 95 CalculateStats bool 96 UseTabAlertSettings bool 97 ExtendState bool 98 Freq time.Duration 99 } 100 101 // Update tab state with the given frequency continuously. If freq == 0, runs only once. 102 // 103 // Copies the grid into the tab state, removing unneeded data. 104 // Observes each test group in allowedGroups, or all of them in the config if not specified 105 func Update(ctx context.Context, client gcs.ConditionalClient, mets *Metrics, opts *UpdateOptions, fixers ...Fixer) error { 106 ctx, cancel := context.WithCancel(ctx) 107 defer cancel() 108 109 if opts.ReadConcurrency < 1 || opts.WriteConcurrency < 1 { 110 return fmt.Errorf("concurrency must be positive, got read %d and write %d", opts.ReadConcurrency, opts.WriteConcurrency) 111 } 112 log := logrus.WithField("config", opts.ConfigPath) 113 114 var q config.TestGroupQueue 115 116 log.Debug("Observing config...") 117 cfgChanged, err := snapshot.Observe(ctx, log, client, opts.ConfigPath, time.NewTicker(time.Minute).C) 118 if err != nil { 119 return fmt.Errorf("error while observing config %q: %w", opts.ConfigPath.String(), err) 120 } 121 122 var cfg *snapshot.Config 123 var tasksPerGroup map[string][]writeTask 124 fixSnapshot := func(newConfig *snapshot.Config) { 125 cfg = newConfig 126 tasksPerGroup = mapTasks(cfg) 127 128 if len(opts.AllowedGroups) != 0 { 129 groups := make([]*configpb.TestGroup, 0, len(opts.AllowedGroups)) 130 for _, group := range opts.AllowedGroups { 131 c, ok := cfg.Groups[group] 132 if !ok { 133 log.Errorf("Could not find requested group %q in config", c) 134 continue 135 } 136 groups = append(groups, c) 137 } 138 139 q.Init(log, groups, time.Now()) 140 return 141 142 } 143 144 groups := make([]*configpb.TestGroup, 0, len(cfg.Groups)) 145 for _, group := range cfg.Groups { 146 groups = append(groups, group) 147 } 148 149 q.Init(log, groups, time.Now()) 150 } 151 152 fixSnapshot(<-cfgChanged) 153 154 go func(ctx context.Context) { 155 fixCtx, fixCancel := context.WithCancel(ctx) 156 var fixWg sync.WaitGroup 157 fixAll := func() { 158 n := len(fixers) 159 log.WithField("fixers", n).Debug("Starting fixers on current groups...") 160 fixWg.Add(n) 161 for i, fix := range fixers { 162 go func(i int, fix Fixer) { 163 defer fixWg.Done() 164 if err := fix(fixCtx, &q); err != nil && !errors.Is(err, context.Canceled) { 165 log.WithError(err).WithField("fixer", i).Warning("Fixer failed") 166 } 167 }(i, fix) 168 } 169 log.WithField("fixers", n).Info("Started fixers on current groups.") 170 } 171 172 ticker := time.NewTicker(time.Minute) 173 fixAll() 174 defer ticker.Stop() 175 for { 176 depth, next, when := q.Status() 177 log := log.WithField("depth", depth) 178 if next != nil { 179 log = log.WithField("next", &next) 180 } 181 delay := time.Since(when) 182 if delay < 0 { 183 delay = 0 184 log = log.WithField("sleep", -delay) 185 } 186 mets.DelaySeconds.Set(delay, componentName) 187 log.Debug("Calculated metrics") 188 189 select { 190 case <-ctx.Done(): 191 ticker.Stop() 192 fixCancel() 193 fixWg.Wait() 194 return 195 case newConfig, ok := <-cfgChanged: 196 if !ok { 197 log.Info("Configuration channel closed") 198 cfgChanged = nil 199 continue 200 } 201 log.Info("Configuration changed") 202 fixCancel() 203 fixWg.Wait() 204 fixCtx, fixCancel = context.WithCancel(ctx) 205 fixSnapshot(newConfig) 206 fixAll() 207 case <-ticker.C: 208 } 209 } 210 }(ctx) 211 212 // Set up worker pools 213 groups := make(chan *configpb.TestGroup) 214 tasks := make(chan writeTask) 215 var tabLock sync.Mutex 216 217 read := func(ctx context.Context, log *logrus.Entry, group *configpb.TestGroup) error { 218 if group == nil { 219 return errors.New("nil group to read") 220 } 221 222 fromPath, err := updater.TestGroupPath(opts.ConfigPath, opts.GridPathPrefix, group.Name) 223 if err != nil { 224 return fmt.Errorf("can't make tg path %q: %w", group.Name, err) 225 } 226 227 log.WithField("from", fromPath.String()).Info("Reading state") 228 229 grid, _, err := gcs.DownloadGrid(ctx, client, *fromPath) 230 if err != nil { 231 return fmt.Errorf("downloadGrid(%s): %w", fromPath, err) 232 } 233 234 tabLock.Lock() 235 defer tabLock.Unlock() 236 // lock out all other readers so that all these tabs get handled as soon as possible 237 for _, task := range tasksPerGroup[group.Name] { 238 log := log.WithFields(logrus.Fields{ 239 "group": task.group.GetName(), 240 "dashboard": task.dashboard.GetName(), 241 "tab": task.tab.GetName(), 242 }) 243 select { 244 case <-ctx.Done(): 245 log.Debug("Skipping irrelevant task") 246 continue 247 default: 248 out := task 249 out.data = proto.Clone(grid).(*statepb.Grid) 250 log.Debug("Requesting write task") 251 tasks <- out 252 } 253 } 254 return nil 255 } 256 257 // Run threads continuously 258 var readWg, writeWg sync.WaitGroup 259 readWg.Add(opts.ReadConcurrency) 260 for i := 0; i < opts.ReadConcurrency; i++ { 261 go func() { 262 defer readWg.Done() 263 for group := range groups { 264 readCtx, cancel := context.WithCancel(ctx) 265 log = log.WithField("group", group.Name) 266 err := read(readCtx, log, group) 267 cancel() 268 if err != nil { 269 next := time.Now().Add(opts.Freq / 10) 270 q.Fix(group.Name, next, false) 271 log.WithError(err).WithField("retry-at", next).Error("failed to read, retry later") 272 } 273 } 274 }() 275 } 276 writeWg.Add(opts.WriteConcurrency) 277 for i := 0; i < opts.WriteConcurrency; i++ { 278 go func() { 279 defer writeWg.Done() 280 for task := range tasks { 281 writeCtx, cancel := context.WithTimeout(ctx, writeTimeout) 282 finish := mets.UpdateState.Start() 283 log = log.WithField("dashboard", task.dashboard.Name).WithField("tab", task.tab.Name) 284 err := createTabState(writeCtx, log, client, task, opts.ConfigPath, opts.TabsPathPrefix, opts.Confirm, opts.CalculateStats, opts.UseTabAlertSettings, opts.ExtendState) 285 cancel() 286 if err != nil { 287 finish.Fail() 288 log.Errorf("write: %v", err) 289 continue 290 } 291 finish.Success() 292 } 293 }() 294 } 295 296 defer writeWg.Wait() 297 defer close(tasks) 298 defer readWg.Wait() 299 defer close(groups) 300 301 return q.Send(ctx, groups, opts.Freq) 302 } 303 304 // createTabState creates the tab state from the group state 305 func createTabState(ctx context.Context, log logrus.FieldLogger, client gcs.Client, task writeTask, configPath gcs.Path, tabsPathPrefix string, confirm, calculateStats, useTabAlerts, extendState bool) error { 306 location, err := TabStatePath(configPath, tabsPathPrefix, task.dashboard.Name, task.tab.Name) 307 if err != nil { 308 return fmt.Errorf("can't make dashtab path %s/%s: %w", task.dashboard.Name, task.tab.Name, err) 309 } 310 311 log.WithFields(logrus.Fields{ 312 "to": location.String(), 313 }).Info("Calculating state") 314 315 var existingGrid *statepb.Grid 316 if extendState { 317 // TODO(chases2): Download grid only if task.Data was truncated (last column is UNKNOWN) 318 existingGrid, _, err = gcs.DownloadGrid(ctx, client, *location) 319 if err != nil { 320 return fmt.Errorf("downloadGrid: %w", err) 321 } 322 } 323 324 grid, err := tabulate(ctx, log, task.data, task.tab, task.group, calculateStats, useTabAlerts, existingGrid) 325 if err != nil { 326 return fmt.Errorf("tabulate: %w", err) 327 } 328 329 if !confirm { 330 log.Debug("Successfully created tab state; discarding") 331 return nil 332 } 333 334 buf, err := gcs.MarshalGrid(grid) 335 if err != nil { 336 return fmt.Errorf("marshalGrid: %w", err) 337 } 338 339 _, err = client.Upload(ctx, *location, buf, gcs.DefaultACL, gcs.NoCache) 340 if err != nil { 341 return fmt.Errorf("client.Upload(%s): %w", location, err) 342 } 343 return nil 344 } 345 346 // TabStatePath returns the path for a given tab. 347 func TabStatePath(configPath gcs.Path, tabPrefix, dashboardName, tabName string) (*gcs.Path, error) { 348 name := path.Join(tabPrefix, dashboardName, tabName) 349 u, err := url.Parse(name) 350 if err != nil { 351 return nil, fmt.Errorf("invalid url %s: %w", name, err) 352 } 353 np, err := configPath.ResolveReference(u) 354 if err != nil { 355 return nil, fmt.Errorf("resolve reference: %w", err) 356 } 357 if np.Bucket() != configPath.Bucket() { 358 return nil, fmt.Errorf("tabState %s should not change bucket", name) 359 } 360 return np, nil 361 } 362 363 // tabulate transforms "grid" to only the part that needs to be displayed by the UI. 364 // If an existingGrid is passed in, new results from "grid" will be grafted onto it. 365 func tabulate(ctx context.Context, log logrus.FieldLogger, grid *statepb.Grid, tabCfg *configpb.DashboardTab, groupCfg *configpb.TestGroup, calculateStats, useTabAlertSettings bool, existingGrid *statepb.Grid) (*statepb.Grid, error) { 366 if grid == nil { 367 return nil, errors.New("no grid") 368 } 369 if tabCfg == nil || groupCfg == nil { 370 return nil, errors.New("no config") 371 } 372 filterRows, err := filterGrid(tabCfg.BaseOptions, grid.Rows) 373 if err != nil { 374 return nil, fmt.Errorf("filterGrid: %w", err) 375 } 376 grid.Rows = filterRows 377 378 inflatedGrid, issues, err := updater.InflateGrid(ctx, grid, time.Time{}, time.Now()) 379 if err != nil { 380 return nil, fmt.Errorf("inflateGrid: %w", err) 381 } 382 383 inflatedGrid = dropEmptyColumns(inflatedGrid) 384 385 usesK8sClient := groupCfg.UseKubernetesClient || (groupCfg.GetResultSource().GetGcsConfig() != nil) 386 var brokenThreshold float32 387 if calculateStats { 388 brokenThreshold = tabCfg.BrokenColumnThreshold 389 } 390 var alert, unalert int 391 if useTabAlertSettings { 392 alert = int(tabCfg.GetAlertOptions().GetNumFailuresToAlert()) 393 unalert = int(tabCfg.GetAlertOptions().GetNumPassesToDisableAlert()) 394 } else { 395 alert = int(groupCfg.NumFailuresToAlert) 396 unalert = int(groupCfg.NumPassesToDisableAlert) 397 } 398 if existingGrid != nil { 399 existingInflatedGrid, _, err := updater.InflateGrid(ctx, existingGrid, time.Time{}, time.Now()) 400 if err != nil { 401 return nil, fmt.Errorf("inflate existing grid: %w", err) 402 } 403 inflatedGrid = mergeGrids(existingInflatedGrid, inflatedGrid) 404 } 405 grid = updater.ConstructGrid(log, inflatedGrid, issues, alert, unalert, usesK8sClient, groupCfg.GetUserProperty(), brokenThreshold, groupCfg.GetColumnHeader()) 406 return grid, nil 407 } 408 409 // mergeGrids merges two sorted, inflated grids together. 410 // Precondition: "addition" is an output of an Updater with an "unknown" column last. 411 // This final column will be dropped and replaced with existing results. 412 func mergeGrids(existing, addition []updater.InflatedColumn) []updater.InflatedColumn { 413 if len(addition) == 0 { 414 return existing 415 } 416 seam := addition[len(addition)-1].Column.Started 417 min := 0 418 max := len(existing) 419 for min != max { 420 check := (min + max) / 2 421 if existing[check].Column.Started <= seam { 422 max = check 423 } else { 424 min = check + 1 425 } 426 } 427 if max == len(existing) { 428 return addition 429 } 430 return append(addition[:len(addition)-1], existing[max:]...) 431 } 432 433 // dropEmptyColumns drops every column in-place that has no results 434 func dropEmptyColumns(grid []updater.InflatedColumn) []updater.InflatedColumn { 435 result := make([]updater.InflatedColumn, 0, len(grid)) 436 for i, col := range grid { 437 for _, cell := range col.Cells { 438 if cell.Result != tspb.TestStatus_NO_RESULT { 439 result = append(result, grid[i]) 440 break 441 } 442 } 443 } 444 if len(result) == 0 && len(grid) != 0 { 445 // If everything would be dropped, keep the first column so there's something left 446 result = grid[0:1] 447 } 448 return result 449 }