go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/reclustering/orchestrator/orchestrator_test.go

// Copyright 2022 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package orchestrator

import (
	"context"
	"sort"
	"strings"
	"testing"
	"time"

	"cloud.google.com/go/spanner"
	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock/testclock"
	"go.chromium.org/luci/gae/impl/memory"
	"go.chromium.org/luci/server/span"
	"go.chromium.org/luci/server/tq"
	"go.chromium.org/luci/server/tq/tqtesting"

	"go.chromium.org/luci/analysis/internal/clustering/algorithms"
	"go.chromium.org/luci/analysis/internal/clustering/rules"
	"go.chromium.org/luci/analysis/internal/clustering/runs"
	"go.chromium.org/luci/analysis/internal/clustering/shards"
	"go.chromium.org/luci/analysis/internal/clustering/state"
	"go.chromium.org/luci/analysis/internal/config"
	"go.chromium.org/luci/analysis/internal/tasks/taskspb"
	"go.chromium.org/luci/analysis/internal/testutil"
	configpb "go.chromium.org/luci/analysis/proto/config"

	_ "go.chromium.org/luci/server/tq/txn/spanner"

	. "github.com/smartystreets/goconvey/convey"
	. "go.chromium.org/luci/common/testing/assertions"
)

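// TestOrchestrator exercises the orchestrator cron handler, which splits
// each project's clustering state into shards, enqueues a reclustering
// task per shard and records the attempt in the reclustering runs and
// shards tables.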
func TestOrchestrator(t *testing.T) {
	Convey(`With Spanner Test Database`, t, func() {
		ctx := testutil.IntegrationTestContext(t)

		// Simulate the Orchestrator job running one second past the hour.
		startTime := testclock.TestRecentTimeUTC.Truncate(time.Hour).Add(time.Second)
		ctx, tc := testclock.UseTime(ctx, startTime)

		ctx = memory.Use(ctx) // For config cache.
		ctx, skdr := tq.TestingContext(ctx, nil)

		cfg := &configpb.Config{
			ReclusteringWorkers: 5,
		}
		config.SetTestConfig(ctx, cfg)

		testProjects := []string{"project-a", "project-b", "project-c"}

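		// testOrchestratorDoesNothing runs the orchestrator and verifies
		// that it neither enqueues new reclustering tasks nor creates or
		// modifies any reclustering runs.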
		testOrchestratorDoesNothing := func() {
			beforeTasks := tasks(skdr)
			beforeRuns := readRuns(ctx, testProjects)

			err := CronHandler(ctx)
			So(err, ShouldBeNil)

			afterTasks := tasks(skdr)
			afterRuns := readRuns(ctx, testProjects)
			So(afterTasks, ShouldResembleProto, beforeTasks)
			So(afterRuns, ShouldResemble, beforeRuns)
		}

		Convey("Without Projects", func() {
			testutil.MustApply(ctx,
				spanner.Delete("ClusteringState", spanner.AllKeys()))

			testOrchestratorDoesNothing()
		})
		Convey("With Projects", func() {
			// The orchestrator only looks at the projects in the ClusteringState table.
			var projectEntries []*state.Entry
			for _, p := range testProjects {
				projectEntries = append(projectEntries, state.NewEntry(0).WithProject(p).Build())
			}
			_, err := state.CreateEntriesForTesting(ctx, projectEntries)
			So(err, ShouldBeNil)

			// Some projects have config.
			configVersionA := time.Date(2029, time.April, 1, 0, 0, 0, 1, time.UTC)
			configVersionB := time.Date(2029, time.May, 1, 0, 0, 0, 1, time.UTC)
			projectCfg := make(map[string]*configpb.ProjectConfig)
			projectCfg["project-a"] = &configpb.ProjectConfig{
				LastUpdated: timestamppb.New(configVersionA),
			}
			projectCfg["project-b"] = &configpb.ProjectConfig{
				LastUpdated: timestamppb.New(configVersionB),
			}
			config.SetTestProjectConfig(ctx, projectCfg)

			// Create chunks in project-b. After this, the row estimates
			// for the projects should be:
			// project-a: ~100
			// project-b: ~450
			// project-c: ~100
			var entries []*state.Entry
			for i := 1; i < 450; i++ {
				entries = append(entries, state.NewEntry(i).WithProject("project-b").Build())
			}
			_, err = state.CreateEntriesForTesting(ctx, entries)
			So(err, ShouldBeNil)

			rulesVersionB := time.Date(2020, time.January, 10, 9, 8, 7, 0, time.UTC)
			rule := rules.NewRule(1).WithProject("project-b").WithPredicateLastUpdateTime(rulesVersionB).Build()
			err = rules.SetForTesting(ctx, []*rules.Entry{rule})
			So(err, ShouldBeNil)

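			// The orchestrator appears to schedule reclustering in one-minute
			// attempts: the run and its tasks are stamped with the end of the
			// minute in which the orchestrator runs.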
			expectedRunStartTime := tc.Now().Truncate(time.Minute)
			expectedRunEndTime := expectedRunStartTime.Add(time.Minute)
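			// Expect one task per shard: one shard each for project-a and
			// project-c (~100 rows each) and three shards for project-b
			// (~450 rows). The chunk ID boundaries for project-b's shards
			// sit at roughly 1/3 (0x55...54) and 2/3 (0xaa...a9) of the
			// chunk ID keyspace, and progress report times appear to be
			// staggered across the project's shards.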
			expectedTasks := []*taskspb.ReclusterChunks{
				{
					Project:      "project-a",
					AttemptTime:  timestamppb.New(expectedRunEndTime),
					StartChunkId: "",
					EndChunkId:   state.EndOfTable,
					State: &taskspb.ReclusterChunkState{
						CurrentChunkId: "",
						NextReportDue:  timestamppb.New(expectedRunStartTime),
					},
					ShardNumber: 1,
				},
				{
					Project:      "project-b",
					AttemptTime:  timestamppb.New(expectedRunEndTime),
					StartChunkId: "",
					EndChunkId:   strings.Repeat("55", 15) + "54",
					State: &taskspb.ReclusterChunkState{
						CurrentChunkId: "",
						NextReportDue:  timestamppb.New(expectedRunStartTime),
					},
					ShardNumber: 2,
				},
				{
					Project:      "project-b",
					AttemptTime:  timestamppb.New(expectedRunEndTime),
					StartChunkId: strings.Repeat("55", 15) + "54",
					EndChunkId:   strings.Repeat("aa", 15) + "a9",
					State: &taskspb.ReclusterChunkState{
						CurrentChunkId: strings.Repeat("55", 15) + "54",
						NextReportDue:  timestamppb.New(expectedRunStartTime.Add(5 * time.Second / 3)),
					},
					ShardNumber: 3,
				},
				{
					Project:      "project-b",
					AttemptTime:  timestamppb.New(expectedRunEndTime),
					StartChunkId: strings.Repeat("aa", 15) + "a9",
					EndChunkId:   state.EndOfTable,
					State: &taskspb.ReclusterChunkState{
						CurrentChunkId: strings.Repeat("aa", 15) + "a9",
						NextReportDue:  timestamppb.New(expectedRunStartTime.Add((5 * time.Second * 2) / 3)),
					},
					ShardNumber: 4,
				},
				{
					Project:      "project-c",
					AttemptTime:  timestamppb.New(expectedRunEndTime),
					StartChunkId: "",
					EndChunkId:   state.EndOfTable,
					State: &taskspb.ReclusterChunkState{
						CurrentChunkId: "",
						NextReportDue:  timestamppb.New(expectedRunStartTime),
					},
					ShardNumber: 5,
				},
			}

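			// Expect one ReclusteringShards row per task, used by the workers
			// to report progress; progress is initially unset (NULL).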
			expectedShards := []shards.ReclusteringShard{
				{
					ShardNumber:      1,
					AttemptTimestamp: expectedRunEndTime,
					Project:          "project-a",
					Progress:         spanner.NullInt64{},
				},
				{
					ShardNumber:      2,
					AttemptTimestamp: expectedRunEndTime,
					Project:          "project-b",
					Progress:         spanner.NullInt64{},
				},
				{
					ShardNumber:      3,
					AttemptTimestamp: expectedRunEndTime,
					Project:          "project-b",
					Progress:         spanner.NullInt64{},
				},
				{
					ShardNumber:      4,
					AttemptTimestamp: expectedRunEndTime,
					Project:          "project-b",
					Progress:         spanner.NullInt64{},
				},
				{
					ShardNumber:      5,
					AttemptTimestamp: expectedRunEndTime,
					Project:          "project-c",
					Progress:         spanner.NullInt64{},
				},
			}

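			// Expect one ReclusteringRun per project. Projects with no
			// project config or no rules fall back to the corresponding
			// starting epoch for that version.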
			expectedRunA := &runs.ReclusteringRun{
				Project:           "project-a",
				AttemptTimestamp:  expectedRunEndTime,
				AlgorithmsVersion: algorithms.AlgorithmsVersion,
				ConfigVersion:     configVersionA,
				RulesVersion:      rules.StartingEpoch,
				ShardCount:        1,
				ShardsReported:    0,
				Progress:          0,
			}
			expectedRunB := &runs.ReclusteringRun{
				Project:           "project-b",
				AttemptTimestamp:  expectedRunEndTime,
				AlgorithmsVersion: algorithms.AlgorithmsVersion,
				ConfigVersion:     configVersionB,
				RulesVersion:      rulesVersionB,
				ShardCount:        3,
				ShardsReported:    0,
				Progress:          0,
			}
			expectedRunC := &runs.ReclusteringRun{
				Project:           "project-c",
				AttemptTimestamp:  expectedRunEndTime,
				AlgorithmsVersion: algorithms.AlgorithmsVersion,
				ConfigVersion:     config.StartingEpoch,
				RulesVersion:      rules.StartingEpoch,
				ShardCount:        1,
				ShardsReported:    0,
				Progress:          0,
			}
			expectedRuns := make(map[string]*runs.ReclusteringRun)
			expectedRuns["project-a"] = expectedRunA
			expectedRuns["project-b"] = expectedRunB
			expectedRuns["project-c"] = expectedRunC

			// updateExpectedTasks sets the Algorithms Version,
			// Rules Version and Config Version of expected tasks
			// to match those of the expected runs.
			updateExpectedTasks := func() {
				for _, t := range expectedTasks {
					run := expectedRuns[t.Project]
					t.AlgorithmsVersion = run.AlgorithmsVersion
					t.RulesVersion = timestamppb.New(run.RulesVersion)
					t.ConfigVersion = timestamppb.New(run.ConfigVersion)
				}
			}
			updateExpectedTasks()

			Convey("Disabled orchestrator does nothing", func() {
				Convey("Workers is zero", func() {
					cfg.ReclusteringWorkers = 0
					config.SetTestConfig(ctx, cfg)

					testOrchestratorDoesNothing()
				})
			})
			Convey("Schedules successfully without existing runs", func() {
				err := CronHandler(ctx)
				So(err, ShouldBeNil)

				actualTasks := tasks(skdr)
				So(actualTasks, ShouldResembleProto, expectedTasks)

				actualRuns := readRuns(ctx, testProjects)
				So(actualRuns, ShouldResemble, expectedRuns)

				actualShards, err := shards.ReadAll(span.Single(ctx))
				So(err, ShouldBeNil)
				So(actualShards, ShouldResemble, expectedShards)
			})
			Convey("Schedules successfully with a previous run", func() {
				previousRunB := &runs.ReclusteringRun{
					Project:           "project-b",
					AttemptTimestamp:  expectedRunEndTime.Add(-1 * time.Minute),
					AlgorithmsVersion: 1,
					ConfigVersion:     configVersionB.Add(-1 * time.Hour),
					RulesVersion:      rulesVersionB.Add(-1 * time.Hour),
					ShardCount:        10,
				}
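				// Simulate ten shards from the previous run, each reporting
				// a progress of 1000 (which appears to be the per-shard
				// "complete" value).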
				var previousShards []shards.ReclusteringShard
				for i := 0; i < 10; i++ {
					previousShards = append(previousShards, shards.ReclusteringShard{
						ShardNumber:      int64(50 + i),
						AttemptTimestamp: expectedRunEndTime.Add(-1 * time.Minute),
						Project:          "project-b",
						Progress:         spanner.NullInt64{Valid: true, Int64: 1000},
					})
				}

				expectedProgress := 10 * 1000
				expectedShardsReported := 10
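				// test runs the orchestrator and verifies that the previous
				// run's progress is finalised from its shards, that old shard
				// rows are replaced by the new attempt's shards, and that the
				// expected tasks and runs are created.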
				test := func() {
					err = CronHandler(ctx)
					So(err, ShouldBeNil)

					// Verify that the previous run had its progress set correctly.
					updatedPreviousRun, err := runs.Read(span.Single(ctx), previousRunB.Project, previousRunB.AttemptTimestamp)
					So(err, ShouldBeNil)
					So(updatedPreviousRun.Progress, ShouldEqual, expectedProgress)
					So(updatedPreviousRun.ShardsReported, ShouldEqual, expectedShardsReported)

					// Verify that correct shards were created and that shards
					// from previous runs were deleted.
					actualShards, err := shards.ReadAll(span.Single(ctx))
					So(err, ShouldBeNil)
					So(actualShards, ShouldResemble, expectedShards)

					actualTasks := tasks(skdr)
					So(actualTasks, ShouldResembleProto, expectedTasks)

					actualRuns := readRuns(ctx, testProjects)
					So(actualRuns, ShouldResemble, expectedRuns)
				}

				Convey("existing complete run", func() {
					err := runs.SetRunsForTesting(ctx, []*runs.ReclusteringRun{previousRunB})
					So(err, ShouldBeNil)

					err = shards.SetShardsForTesting(ctx, previousShards)
					So(err, ShouldBeNil)

					// A run scheduled after an existing complete run should
					// use the latest algorithms, config and rules available. So
					// our expectations are unchanged.
					test()
				})
				Convey("existing incomplete run", func() {
					for i := range previousShards {
						previousShards[i].Progress = spanner.NullInt64{Valid: true, Int64: 500}
					}
					expectedProgress = 10 * 500
					expectedShardsReported = 10

					err := runs.SetRunsForTesting(ctx, []*runs.ReclusteringRun{previousRunB})
					So(err, ShouldBeNil)

					err = shards.SetShardsForTesting(ctx, previousShards)
					So(err, ShouldBeNil)

					sds, err := shards.ReadAll(span.Single(ctx))
					So(err, ShouldBeNil)
					So(sds, ShouldResemble, previousShards)

					// Expect the same algorithms, config and rules versions to
					// be used as in the previous run, to ensure forward progress
					// (if new rules are being constantly created, we don't want
					// to be reclustering only the beginning of the workers'
					// keyspaces).
					expectedRunB.AlgorithmsVersion = previousRunB.AlgorithmsVersion
					expectedRunB.ConfigVersion = previousRunB.ConfigVersion
					expectedRunB.RulesVersion = previousRunB.RulesVersion
					updateExpectedTasks()
					test()
				})
				Convey("existing unreported run", func() {
					for i := range previousShards {
						// Assume the shards did not report progress at all.
						previousShards[i].Progress = spanner.NullInt64{}
					}
					expectedProgress = 0
					expectedShardsReported = 0

					err := runs.SetRunsForTesting(ctx, []*runs.ReclusteringRun{previousRunB})
					So(err, ShouldBeNil)

					err = shards.SetShardsForTesting(ctx, previousShards)
					So(err, ShouldBeNil)

					// Expect the same algorithms, config and rules versions to
					// be used as in the previous run, to ensure forward progress
					// (if new rules are being constantly created, we don't want
					// to be reclustering only the beginning of the workers'
					// keyspaces).
					expectedRunB.AlgorithmsVersion = previousRunB.AlgorithmsVersion
					expectedRunB.ConfigVersion = previousRunB.ConfigVersion
					expectedRunB.RulesVersion = previousRunB.RulesVersion
					updateExpectedTasks()
					test()
				})
				Convey("existing complete run with later algorithms version", func() {
					previousRunB.AlgorithmsVersion = algorithms.AlgorithmsVersion + 5

					err := runs.SetRunsForTesting(ctx, []*runs.ReclusteringRun{previousRunB})
					So(err, ShouldBeNil)

					err = shards.SetShardsForTesting(ctx, previousShards)
					So(err, ShouldBeNil)

					// If new algorithms are being rolled out, some GAE instances
					// may be running old code. This includes the instance that
					// runs the orchestrator.
					// To simplify reasoning about re-clustering runs, and ensure
					// correctness of re-clustering progress logic, we require
					// the algorithms version of subsequent runs to always be
					// non-decreasing.
					expectedRunB.AlgorithmsVersion = previousRunB.AlgorithmsVersion
					updateExpectedTasks()
					test()
				})
				Convey("existing complete run with later config version", func() {
					previousRunB.ConfigVersion = configVersionB.Add(time.Hour)

					err := runs.SetRunsForTesting(ctx, []*runs.ReclusteringRun{previousRunB})
					So(err, ShouldBeNil)

					err = shards.SetShardsForTesting(ctx, previousShards)
					So(err, ShouldBeNil)

					// If new config is being rolled out, some GAE instances
					// may still have old config cached. This includes the instance
					// that runs the orchestrator.
					// To simplify reasoning about re-clustering runs, and ensure
					// correctness of re-clustering progress logic, we require
					// the config version of subsequent runs to always be
					// non-decreasing.
					expectedRunB.ConfigVersion = previousRunB.ConfigVersion
					updateExpectedTasks()
					test()
				})
			})
		})
	})
}

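// tasks returns the reclustering task payloads enqueued on the given
// scheduler, sorted by shard number.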
func tasks(s *tqtesting.Scheduler) []*taskspb.ReclusterChunks {
	var tasks []*taskspb.ReclusterChunks
	for _, pl := range s.Tasks().Payloads() {
		task := pl.(*taskspb.ReclusterChunks)
		tasks = append(tasks, task)
	}
	sort.Slice(tasks, func(i, j int) bool {
		return tasks[i].ShardNumber < tasks[j].ShardNumber
	})
	return tasks
}

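// readRuns reads the most recent reclustering run for each of the given
// projects.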
func readRuns(ctx context.Context, projects []string) map[string]*runs.ReclusteringRun {
	txn, cancel := span.ReadOnlyTransaction(ctx)
	defer cancel()

	result := make(map[string]*runs.ReclusteringRun)
	for _, project := range projects {
		run, err := runs.ReadLastUpTo(txn, project, runs.MaxAttemptTimestamp)
		So(err, ShouldBeNil)
		result[project] = run
	}
	return result
}