go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/tq/txn/datastore/integration_test.go

// Copyright 2020 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datastore

import (
	"context"
	"math/rand"
	"sync"
	"testing"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/durationpb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/clock/testclock"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/logging/gologger"
	"go.chromium.org/luci/common/sync/parallel"
	"go.chromium.org/luci/gae/filter/featureBreaker"
	"go.chromium.org/luci/gae/filter/featureBreaker/flaky"
	"go.chromium.org/luci/gae/filter/txndefer"
	"go.chromium.org/luci/gae/impl/memory"
	"go.chromium.org/luci/gae/service/datastore"

	"go.chromium.org/luci/server/tq"
	"go.chromium.org/luci/server/tq/internal/reminder"
	"go.chromium.org/luci/server/tq/tqtesting"

	. "github.com/smartystreets/goconvey/convey"
)

func TestDistributedSweeping(t *testing.T) {
	t.Parallel()

	Convey("Distributed sweeping", t, func() {
		RunTest(t, func(disp *tq.Dispatcher) tq.Sweeper {
			// Use smaller sweep tasks to hit more edge cases.
			return tq.NewDistributedSweeper(disp, tq.DistributedSweeperOptions{
				SweepShards:         4,
				TasksPerScan:        10,
				SecondaryScanShards: 4,
			})
		})
	})
}

func TestInProcSweeping(t *testing.T) {
	t.Parallel()

	Convey("Inproc sweeping", t, func() {
		RunTest(t, func(disp *tq.Dispatcher) tq.Sweeper {
			// Use smaller sweep tasks to hit more edge cases.
			return tq.NewInProcSweeper(tq.InProcSweeperOptions{
				SweepShards:             4,
				TasksPerScan:            10,
				SecondaryScanShards:     4,
				SubmitBatchSize:         4,
				SubmitConcurrentBatches: 2,
			})
		})
	})
}

// RunTest ensures that transactionally submitted tasks eventually execute,
// and only once, even if the database and Cloud Tasks RPCs fail with high
// probability.
func RunTest(t *testing.T, sweeper func(*tq.Dispatcher) tq.Sweeper) {
	var epoch = testclock.TestRecentTimeUTC
	const sweepSleep = "sweep sleep"

	ctx, tc := testclock.UseTime(context.Background(), epoch)
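	// Auto-advance the fake clock whenever the test sleeps between sweep
	// rounds. A timer tagged with tqtesting.ClockTag would mean a task is
	// being retried, which should never happen here: all failures are fatal.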
	tc.SetTimerCallback(func(d time.Duration, t clock.Timer) {
		if testclock.HasTags(t, tqtesting.ClockTag) {
			panic("there should be no task retries, they all should fail fatally")
		}
		if testclock.HasTags(t, sweepSleep) {
			tc.Add(d)
		}
	})

	ctx = txndefer.FilterRDS(memory.Use(ctx))
	ctx, fb := featureBreaker.FilterRDS(ctx, nil)
	datastore.GetTestable(ctx).Consistent(true)

	// Note: must install after memory.Use, since it overrides the logger.
	ctx = gologger.StdConfig.Use(ctx)
	ctx = logging.SetLevel(ctx, logging.Debug)

	// Use a single RND for all flaky.Errors(...) instances. Otherwise they
	// repeat the same random pattern each time withBrokenDS is called.
	rnd := rand.NewSource(0)

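	// withBrokenDS runs cb with a datastore that randomly returns deadline
	// and concurrent-transaction errors, then restores a healthy datastore so
	// the state it left behind can be examined.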
	withBrokenDS := func(cb func()) {
		// Makes datastore very faulty.
		fb.BreakFeaturesWithCallback(
			flaky.Errors(flaky.Params{
				Rand:                             rnd,
				DeadlineProbability:              0.3,
				ConcurrentTransactionProbability: 0.3,
			}),
			featureBreaker.DatastoreFeatures...,
		)

		cb()

		// "Fixes" datastore, letting us examine it.
		fb.BreakFeaturesWithCallback(
			func(context.Context, string) error { return nil },
			featureBreaker.DatastoreFeatures...,
		)
	}

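	// A dispatcher wired to an in-memory test scheduler, using whichever
	// sweeper implementation is under test.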
	disp := &tq.Dispatcher{}
	ctx, sched := tq.TestingContext(ctx, disp)
	disp.Sweeper = sweeper(disp)

	// "Buganize" the submitter.
	ctx = tq.UseSubmitter(ctx, &flakySubmitter{
		Submitter:              sched,
		InternalErrProbability: 0.3,
		Rand:                   rand.New(rand.NewSource(123)),
	})

	// This will collect which tasks were executed and how many times.
	mu := sync.Mutex{}
	execed := map[int]int{}

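	// A single transactional task class whose handler just records which
	// payload it has seen (and how many times).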
	disp.RegisterTaskClass(tq.TaskClass{
		ID:        "work",
		Prototype: &durationpb.Duration{}, // use it just as an int container
		Kind:      tq.Transactional,
		Queue:     "default",
		Handler: func(ctx context.Context, msg proto.Message) error {
			d := msg.(*durationpb.Duration)
			mu.Lock()
			execed[int(d.Seconds)]++
			mu.Unlock()
			return nil
		},
	})

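	// testEntity is written in the same transaction that submits a task, so
	// counting these entities afterwards tells how many transactions actually
	// committed.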
	type testEntity struct {
		_kind string `gae:"$kind,testEntity"`
		ID    int    `gae:"$id"`
	}

	// Run a bunch of transactions that each add an entity and submit a task.
	// Some of them will fail; this is fine. But eventually each submitted
	// task must execute, and only once.
	withBrokenDS(func() {
		parallel.WorkPool(16, func(work chan<- func() error) {
			for i := 1; i <= 500; i++ {
				i := i
				work <- func() error {
					return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
						task := &tq.Task{Payload: &durationpb.Duration{Seconds: int64(i)}}
						if err := disp.AddTask(ctx, task); err != nil {
							return err
						}
						return datastore.Put(ctx, &testEntity{ID: i})
					}, nil)
				}
			}
		})
	})

	// See how many transactions really landed.
	var landed []*testEntity
	So(datastore.GetAll(ctx, datastore.NewQuery("testEntity"), &landed), ShouldBeNil)
	So(len(landed), ShouldBeGreaterThan, 5) // it is usually much larger (but still random)

	// Run rounds of sweeps until there are no reminders left or we appear to
	// be stuck. Use panics instead of So(...) to avoid spamming goconvey with
	// lots and lots of assertion dots.
	for {
		reminders, err := datastore.Count(ctx, datastore.NewQuery(reminderKind))
		if err != nil {
			panic(err)
		}
		logging.Infof(ctx, "%s: %d reminders left, %d tasks executed", clock.Now(ctx).Sub(epoch), reminders, len(execed))
		if reminders == 0 && len(sched.Tasks()) == 0 {
			break // no pending tasks and no reminders in the datastore, we are done
		}

		// Blow up if it takes too much time to converge. Note that this is fake
		// time. The limit is also much higher than the expected mean convergence
		// time, to make sure this test doesn't flake.
		if clock.Now(ctx).Sub(epoch) >= 360*time.Minute { // 360 Sweeps.
			panic("Looks like the test is stuck")
		}

		// Submit a bunch of sweep tasks and wait until they (and all their
		// follow-ups) are done.
		withBrokenDS(func() {
			disp.Sweep(ctx)
			sched.Run(ctx, tqtesting.StopWhenDrained())
		})

		// Launch the next sweep a bit later. This is the only ticking clock in
		// the simulation. It is needed because we use "real" time when checking
		// freshness of reminders.
		clock.Sleep(clock.Tag(ctx, sweepSleep), time.Minute)
	}

	// All transactionally submitted tasks should have been executed.
	So(len(execed), ShouldEqual, len(landed))
	// And at most once.
	for k, v := range execed {
		if v != 1 {
			t.Errorf("task %d executed %d times", k, v)
		}
	}
}

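// flakySubmitter wraps another tq.Submitter and fails a random fraction of
// Submit calls with a gRPC Internal error, simulating flaky Cloud Tasks RPCs.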
type flakySubmitter struct {
	Submitter              tq.Submitter
	Rand                   *rand.Rand
	InternalErrProbability float64

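	// m serializes access to Rand, which is not safe for concurrent use.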
	m sync.Mutex
}

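// Submit implements tq.Submitter, randomly injecting errors before delegating
// to the wrapped submitter.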
func (f *flakySubmitter) Submit(ctx context.Context, req *reminder.Payload) error {
	f.m.Lock()
	fail := f.Rand.Float64() < f.InternalErrProbability
	f.m.Unlock()
	if fail {
		return status.Errorf(codes.Internal, "Simulated internal error")
	}
	return f.Submitter.Submit(ctx, req)
}