// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datastore

import (
	"context"
	"math/rand"
	"sync"
	"testing"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/durationpb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/clock/testclock"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/logging/gologger"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/filter/featureBreaker"
	"go.chromium.org/luci/gae/filter/featureBreaker/flaky"
	"go.chromium.org/luci/gae/filter/txndefer"
	"go.chromium.org/luci/gae/impl/memory"
	"go.chromium.org/luci/gae/service/datastore"

	"go.chromium.org/luci/server/tq"
	"go.chromium.org/luci/server/tq/internal/reminder"
	"go.chromium.org/luci/server/tq/tqtesting"

	. "github.com/smartystreets/goconvey/convey"
)

func TestDistributedSweeping(t *testing.T) {
	t.Parallel()

	Convey("Distributed sweeping", t, func() {
		RunTest(t, func(disp *tq.Dispatcher) tq.Sweeper {
			// Use smaller sweep tasks to hit more edge cases.
			return tq.NewDistributedSweeper(disp, tq.DistributedSweeperOptions{
				SweepShards:         4,
				TasksPerScan:        10,
				SecondaryScanShards: 4,
			})
		})
	})
}

func TestInProcSweeping(t *testing.T) {
	t.Parallel()

	Convey("Inproc sweeping", t, func() {
		RunTest(t, func(disp *tq.Dispatcher) tq.Sweeper {
			// Use smaller sweep tasks to hit more edge cases.
			return tq.NewInProcSweeper(tq.InProcSweeperOptions{
				SweepShards:             4,
				TasksPerScan:            10,
				SecondaryScanShards:     4,
				SubmitBatchSize:         4,
				SubmitConcurrentBatches: 2,
			})
		})
	})
}

// RunTest ensures that transactionally submitted tasks eventually execute,
// and only once, even if the database and Cloud Tasks RPCs fail with high
// chance.
func RunTest(t *testing.T, sweeper func(*tq.Dispatcher) tq.Sweeper) {
	var epoch = testclock.TestRecentTimeUTC
	const sweepSleep = "sweep sleep"

	ctx, tc := testclock.UseTime(context.Background(), epoch)
	tc.SetTimerCallback(func(d time.Duration, t clock.Timer) {
		if testclock.HasTags(t, tqtesting.ClockTag) {
			panic("there should be no task retries, they all should fail fatally")
		}
		if testclock.HasTags(t, sweepSleep) {
			tc.Add(d)
		}
	})

	ctx = txndefer.FilterRDS(memory.Use(ctx))
	ctx, fb := featureBreaker.FilterRDS(ctx, nil)
	datastore.GetTestable(ctx).Consistent(true)

	// Note: must install after memory.Use, since it overrides the logger.
	ctx = gologger.StdConfig.Use(ctx)
	ctx = logging.SetLevel(ctx, logging.Debug)

	// Use a single RND for all flaky.Errors(...) instances. Otherwise they
	// repeat the same random pattern each time withBrokenDS is called.
	rnd := rand.NewSource(0)

	withBrokenDS := func(cb func()) {
		// Makes datastore very faulty.
		fb.BreakFeaturesWithCallback(
			flaky.Errors(flaky.Params{
				Rand:                             rnd,
				DeadlineProbability:              0.3,
				ConcurrentTransactionProbability: 0.3,
			}),
			featureBreaker.DatastoreFeatures...,
		)

		cb()

		// "Fixes" datastore, letting us examine it.
		fb.BreakFeaturesWithCallback(
			func(context.Context, string) error { return nil },
			featureBreaker.DatastoreFeatures...,
		)
	}

	disp := &tq.Dispatcher{}
	ctx, sched := tq.TestingContext(ctx, disp)
	disp.Sweeper = sweeper(disp)

	// "Buganize" the submitter.
	ctx = tq.UseSubmitter(ctx, &flakySubmitter{
		Submitter:              sched,
		InternalErrProbability: 0.3,
		Rand:                   rand.New(rand.NewSource(123)),
	})

	// This will collect which tasks were executed and how many times.
	mu := sync.Mutex{}
	execed := map[int]int{}

	disp.RegisterTaskClass(tq.TaskClass{
		ID:        "work",
		Prototype: &durationpb.Duration{}, // use it just as an int container
		Kind:      tq.Transactional,
		Queue:     "default",
		Handler: func(ctx context.Context, msg proto.Message) error {
			d := msg.(*durationpb.Duration)
			mu.Lock()
			execed[int(d.Seconds)]++
			mu.Unlock()
			return nil
		},
	})

	type testEntity struct {
		_kind string `gae:"$kind,testEntity"`
		ID    int    `gae:"$id"`
	}

	// Run a bunch of transactions that each add an entity and submit a task.
	// Some of them will fail; this is fine. But eventually each submitted
	// task must execute, and only once.
	withBrokenDS(func() {
		parallel.WorkPool(16, func(work chan<- func() error) {
			for i := 1; i <= 500; i++ {
				i := i
				work <- func() error {
					return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
						task := &tq.Task{Payload: &durationpb.Duration{Seconds: int64(i)}}
						if err := disp.AddTask(ctx, task); err != nil {
							return err
						}
						return datastore.Put(ctx, &testEntity{ID: i})
					}, nil)
				}
			}
		})
	})

	// See how many transactions really landed.
	var landed []*testEntity
	So(datastore.GetAll(ctx, datastore.NewQuery("testEntity"), &landed), ShouldBeNil)
	So(len(landed), ShouldBeGreaterThan, 5) // it is usually much larger (but still random)

	// Run rounds of sweeps until there are no reminders left or we appear to
	// be stuck. Use panics instead of So(...) to avoid spamming goconvey with
	// lots and lots of assertion dots.
	for {
		reminders, err := datastore.Count(ctx, datastore.NewQuery(reminderKind))
		if err != nil {
			panic(err)
		}
		logging.Infof(ctx, "%s: %d reminders left, %d tasks executed", clock.Now(ctx).Sub(epoch), reminders, len(execed))
		if reminders == 0 && len(sched.Tasks()) == 0 {
			break // no pending tasks and no reminders in the datastore, we are done
		}

		// Blow up if it takes too much time to converge. Note that this is fake
		// time. Also the limit is much, much higher than the expected mean time,
		// to make sure this test doesn't flake.
		if clock.Now(ctx).Sub(epoch) >= 360*time.Minute { // 360 sweeps.
			panic("Looks like the test is stuck")
		}

		// Submit a bunch of sweep tasks and wait until they (and all their
		// follow ups) are done.
		withBrokenDS(func() {
			disp.Sweep(ctx)
			sched.Run(ctx, tqtesting.StopWhenDrained())
		})

		// Launch the next sweep a bit later. This is the only ticking clock in
		// the simulation. It is needed because we use "real" time when checking
		// freshness of reminders.
		clock.Sleep(clock.Tag(ctx, sweepSleep), time.Minute)
	}

	// All transactionally submitted tasks should have been executed.
	So(len(execed), ShouldEqual, len(landed))
	// And at most once.
	for k, v := range execed {
		if v != 1 {
			t.Errorf("task %d executed %d times", k, v)
		}
	}
}

// flakySubmitter wraps a real Submitter, simulating a gRPC Internal error
// with the given probability.
type flakySubmitter struct {
	Submitter              tq.Submitter
	Rand                   *rand.Rand
	InternalErrProbability float64

	m sync.Mutex // protects Rand, which is not safe for concurrent use
}

// Submit either fails with a simulated internal error or delegates to the
// wrapped Submitter.
func (f *flakySubmitter) Submit(ctx context.Context, req *reminder.Payload) error {
	f.m.Lock()
	fail := f.Rand.Float64() < f.InternalErrProbability
	f.m.Unlock()
	if fail {
		return status.Errorf(codes.Internal, "Simulated internal error")
	}
	return f.Submitter.Submit(ctx, req)
}