vitess.io/vitess@v0.16.2/go/vt/throttler/max_replication_lag_module_test.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package throttler
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"testing"
    23  	"time"
    24  
    25  	"vitess.io/vitess/go/vt/log"
    26  
    27  	"vitess.io/vitess/go/vt/discovery"
    28  
    29  	querypb "vitess.io/vitess/go/vt/proto/query"
    30  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    31  )
    32  
// The tests below test the heuristic of the MaxReplicationLagModule.
// Depending on a replica's replication lag and the historic throttler rate,
// the module increases or decreases the throttler rate.
    36  
    37  // Most of the tests assume that there are two REPLICA tablets r1 and r2 which
    38  // both broadcast their replication lag every 20s.
    39  // r1 starts to broadcast at 0s, r2 at 10s.
    40  // Additionally, there are two RDONLY tablets rdonly1 and rdonly2 which are only
    41  // used by the "IgnoreNSlowestReplicas" tests.
const (
	// Automatic enumeration starts at 0. But there's no r0. Ignore it.
	_ = iota
	// r1 and r2 are the UIDs of the two REPLICA tablets used by most tests.
	r1
	r2
	// rdonly1 and rdonly2 are the UIDs of the two RDONLY tablets which are
	// only used by the "IgnoreNSlowestReplicas" tests.
	rdonly1
	rdonly2
)
    50  
// testFixture bundles a MaxReplicationLagModule under test together with the
// fake rates history which feeds it the observed "actual" throttler rates.
type testFixture struct {
	// m is the module whose rate-change heuristic is being exercised.
	m *MaxReplicationLagModule
	// ratesHistory records the fake throttler rates which m reads back when
	// it calculates a new rate.
	ratesHistory *fakeRatesHistory
}
    55  
    56  func newTestFixtureWithMaxReplicationLag(maxReplicationLag int64) (*testFixture, error) {
    57  	config := NewMaxReplicationLagModuleConfig(maxReplicationLag)
    58  	return newTestFixture(config)
    59  }
    60  
// newTestFixture creates a MaxReplicationLagModule which is driven by a fake
// clock and a fake rates history so that tests fully control both the time
// and the observed throttler rates.
func newTestFixture(config MaxReplicationLagModuleConfig) (*testFixture, error) {
	ratesHistory := newFakeRatesHistory()
	fc := &fakeClock{}
	// Do not start at 0*time.Second because then the code cannot distinguish
	// between a legitimate value and a zero time.Time value.
	fc.setNow(1 * time.Second)
	m, err := NewMaxReplicationLagModule(config, ratesHistory.aggregatedIntervalHistory, fc.now)
	if err != nil {
		return nil, err
	}
	// Updates for the throttler go into a big channel and will be ignored.
	m.rateUpdateChan = make(chan<- struct{}, 1000)

	return &testFixture{
		m:            m,
		ratesHistory: ratesHistory,
	}, nil
}
    79  
// process does the same thing as MaxReplicationLagModule.ProcessRecords() does
// for a new "lagRecord". It calls processRecord directly, i.e. the record is
// handled synchronously instead of going through the module's channel.
func (tf *testFixture) process(lagRecord replicationLagRecord) {
	tf.m.processRecord(lagRecord)
}
    85  
    86  func (tf *testFixture) checkState(state state, rate int64, lastRateChange time.Time) error {
    87  	if got, want := tf.m.currentState, state; got != want {
    88  		return fmt.Errorf("module in wrong state. got = %v, want = %v", got, want)
    89  	}
    90  	if got, want := tf.m.MaxRate(), rate; got != want {
    91  		return fmt.Errorf("module has wrong MaxRate(). got = %v, want = %v", got, want)
    92  	}
    93  	if got, want := tf.m.lastRateChange, lastRateChange; got != want {
    94  		return fmt.Errorf("module has wrong lastRateChange time. got = %v, want = %v", got, want)
    95  	}
    96  	return nil
    97  }
    98  
    99  func TestMaxReplicationLagModule_RateNotZeroWhenDisabled(t *testing.T) {
   100  	tf, err := newTestFixtureWithMaxReplicationLag(ReplicationLagModuleDisabled)
   101  	if err != nil {
   102  		t.Fatal(err)
   103  	}
   104  
   105  	// Initial rate must not be zero. It's ReplicationLagModuleDisabled instead.
   106  	if err := tf.checkState(stateIncreaseRate, ReplicationLagModuleDisabled, sinceZero(1*time.Second)); err != nil {
   107  		t.Fatal(err)
   108  	}
   109  }
   110  
   111  func TestMaxReplicationLagModule_InitialStateAndWait(t *testing.T) {
   112  	config := NewMaxReplicationLagModuleConfig(5)
   113  	// Overwrite the default config to make sure we test a non-default value.
   114  	config.InitialRate = 123
   115  	config.MaxDurationBetweenIncreasesSec = 23
   116  	tf, err := newTestFixture(config)
   117  	if err != nil {
   118  		t.Fatal(err)
   119  	}
   120  
   121  	// Initial rate must be config.InitialRate.
   122  	if err := tf.checkState(stateIncreaseRate, config.InitialRate, sinceZero(1*time.Second)); err != nil {
   123  		t.Fatal(err)
   124  	}
   125  	// After startup, the next increment won't happen until
   126  	// config.MaxDurationBetweenIncreasesSec elapsed.
   127  	if got, want := tf.m.nextAllowedChangeAfterInit, sinceZero(config.MaxDurationBetweenIncreases()+1*time.Second); got != want {
   128  		t.Fatalf("got = %v, want = %v", got, want)
   129  	}
   130  }
   131  
   132  // TestMaxReplicationLagModule_Increase tests only the continuous increase of the
   133  // rate and assumes that we are well below the replica capacity.
   134  func TestMaxReplicationLagModule_Increase(t *testing.T) {
   135  	tf, err := newTestFixtureWithMaxReplicationLag(5)
   136  	if err != nil {
   137  		t.Fatal(err)
   138  	}
   139  
   140  	// We start at config.InitialRate.
   141  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   142  		t.Fatal(err)
   143  	}
   144  	// After the initial wait period of 62s
   145  	// (config.MaxDurationBetweenIncreasesSec), regular increments start.
   146  
   147  	// r2 @  70s, 0s lag
   148  	// NOTE: We don't add a record for 70s itself because the throttler only
   149  	// publishes it at the "end" of that second i.e. when the time at 71s started.
   150  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
   151  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
   152  	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
   153  	// r2 becomes the "replica under test".
   154  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   155  		t.Fatal(err)
   156  	}
   157  	// We have to wait at least config.MinDurationBetweenIncreasesSec (40s) before
   158  	// the next increase.
   159  	if got, want := tf.m.replicaUnderTest.nextAllowedChange, sinceZero(70*time.Second).Add(tf.m.config.MinDurationBetweenIncreases()); got != want {
   160  		t.Fatalf("got = %v, want = %v", got, want)
   161  	}
   162  	// r2 @  75s, 0s lag
   163  	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
   164  	tf.ratesHistory.add(sinceZero(74*time.Second), 200)
   165  	tf.process(lagRecord(sinceZero(75*time.Second), r2, 0))
   166  	// Lag record was ignored because it's within the wait period.
   167  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   168  		t.Fatal(err)
   169  	}
   170  
   171  	// r1 @  80s, 0s lag
   172  	tf.ratesHistory.add(sinceZero(79*time.Second), 200)
   173  	tf.process(lagRecord(sinceZero(80*time.Second), r1, 0))
   174  	// The r1 lag update was ignored because an increment "under test" is always
   175  	// locked in with the replica which triggered the increase (r2 this time).
   176  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   177  		t.Fatal(err)
   178  	}
   179  
   180  	// No increase is possible for the next 20 seconds.
   181  
   182  	// r2 @  90s, 0s lag
   183  	tf.ratesHistory.add(sinceZero(80*time.Second), 200)
   184  	tf.ratesHistory.add(sinceZero(89*time.Second), 200)
   185  	tf.process(lagRecord(sinceZero(90*time.Second), r2, 0))
   186  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   187  		t.Fatal(err)
   188  	}
   189  
   190  	// r1 @ 100s, 0s lag
   191  	tf.ratesHistory.add(sinceZero(99*time.Second), 200)
   192  	tf.process(lagRecord(sinceZero(100*time.Second), r1, 0))
   193  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   194  		t.Fatal(err)
   195  	}
   196  
   197  	// Next rate increase is possible after testing the rate for 40s.
   198  
   199  	// r2 @ 110s, 0s lag
   200  	tf.ratesHistory.add(sinceZero(109*time.Second), 200)
   201  	tf.process(lagRecord(sinceZero(110*time.Second), r2, 0))
   202  	if err := tf.checkState(stateIncreaseRate, 400, sinceZero(110*time.Second)); err != nil {
   203  		t.Fatal(err)
   204  	}
   205  }
   206  
// TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp is
// almost identical to TestMaxReplicationLagModule_Increase but this time we
// test that the system makes progress if the currently tracked replica has
// LastError set or is no longer tracked.
func TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp(t *testing.T) {
	tf, err := newTestFixtureWithMaxReplicationLag(5)
	if err != nil {
		t.Fatal(err)
	}

	// r2 @  70s, 0s lag
	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
	// r2 becomes the "replica under test".
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 @  75s, 0s lag, LastError set
	// NOTE: The record is inserted into the lag cache directly (not run
	// through process()) so only the cached bookkeeping is updated.
	rError := lagRecord(sinceZero(75*time.Second), r2, 0)
	rError.LastError = errors.New("HealthCheck reporting broken")
	tf.m.replicaLagCache.add(rError)

	// r1 @ 110s, 0s lag
	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
	tf.ratesHistory.add(sinceZero(109*time.Second), 200)
	tf.process(lagRecord(sinceZero(110*time.Second), r1, 0))
	// We ignore r2 as "replica under test" because it has LastError set.
	// Instead, we act on r1.
	// r1 becomes the "replica under test".
	if err := tf.checkState(stateIncreaseRate, 400, sinceZero(110*time.Second)); err != nil {
		t.Fatal(err)
	}

	// We'll simulate a shutdown of r1 i.e. we're no longer tracking it.
	// r1 @ 115s, 0s lag, !Up
	tf.ratesHistory.add(sinceZero(110*time.Second), 200)
	tf.ratesHistory.add(sinceZero(114*time.Second), 400)
	rNotUp := lagRecord(sinceZero(115*time.Second), r1, 0)
	rNotUp.Serving = false
	tf.m.replicaLagCache.add(rNotUp)

	// r2 @ 150s, 0s lag (lastError no longer set)
	tf.ratesHistory.add(sinceZero(149*time.Second), 400)
	tf.process(lagRecord(sinceZero(150*time.Second), r2, 0))
	// We ignore r1 as "replica under test" because it has !Up set.
	// Instead, we act on r2.
	// r2 becomes the "replica under test".
	if err := tf.checkState(stateIncreaseRate, 800, sinceZero(150*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   260  
   261  // TestMaxReplicationLagModule_ReplicaUnderTest_Timeout tests the safe guard
   262  // that a "replica under test" which didn't report its lag for a while will be
   263  // cleared such that any other replica can become the new "replica under test".
   264  func TestMaxReplicationLagModule_ReplicaUnderTest_Timeout(t *testing.T) {
   265  	tf, err := newTestFixtureWithMaxReplicationLag(5)
   266  	if err != nil {
   267  		t.Fatal(err)
   268  	}
   269  
   270  	// r2 @  70s, 0s lag
   271  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
   272  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
   273  	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
   274  	// r2 becomes the "replica under test".
   275  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   276  		t.Fatal(err)
   277  	}
   278  
   279  	// r1 @  80s, 0s lag (ignored because r2 is the "replica under test")
   280  	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
   281  	tf.ratesHistory.add(sinceZero(79*time.Second), 200)
   282  	tf.process(lagRecord(sinceZero(80*time.Second), r1, 0))
   283  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   284  		t.Fatal(err)
   285  	}
   286  
   287  	// r2 as "replica under test" did not report its lag for too long.
   288  	// We'll ignore it from now and let other replicas trigger rate changes.
   289  	// r1 @  173s, 0s lag
   290  	// time for r1 must be > 172s (70s + 40s + 62s) which is
   291  	// (last rate change + test duration + max duration between increases).
   292  	tf.ratesHistory.add(sinceZero(172*time.Second), 200)
   293  	tf.process(lagRecord(sinceZero(173*time.Second), r1, 0))
   294  	if err := tf.checkState(stateIncreaseRate, 400, sinceZero(173*time.Second)); err != nil {
   295  		t.Fatal(err)
   296  	}
   297  }
   298  
// TestMaxReplicationLagModule_ReplicaUnderTest_IncreaseToDecrease verifies that
// the current "replica under test" is ignored when our state changes from
// "stateIncreaseRate" to "stateDecreaseAndGuessRate".
func TestMaxReplicationLagModule_ReplicaUnderTest_IncreaseToDecrease(t *testing.T) {
	tf, err := newTestFixtureWithMaxReplicationLag(5)
	if err != nil {
		t.Fatal(err)
	}

	// r2 @  70s, 0s lag (triggers the increase state)
	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
	// r2 becomes the "replica under test".
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 @  80s, 0s lag
	// This lag record is required in the next step to correctly calculate how
	// much r1 lags behind due to the rate increase.
	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
	tf.ratesHistory.add(sinceZero(79*time.Second), 200)
	tf.process(lagRecord(sinceZero(80*time.Second), r1, 0))
	// r2 remains the "replica under test"; r1's record is only retained.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 @  90s, 0s lag (ignored because the test duration is not up yet)
	tf.ratesHistory.add(sinceZero(89*time.Second), 200)
	tf.process(lagRecord(sinceZero(90*time.Second), r2, 0))
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 @ 100s, 3s lag (above target, provokes a decrease)
	tf.ratesHistory.add(sinceZero(99*time.Second), 200)
	tf.process(lagRecord(sinceZero(100*time.Second), r1, 3))
	// r1 becomes the "replica under test".
	// r1's high lag triggered the decrease state and therefore we did not wait
	// for the pending increase of "replica under test" r2.
	if err := tf.checkState(stateDecreaseAndGuessRate, 140, sinceZero(100*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 lag records are ignored while r1 is the "replica under test".
	// r2 @ 110s, 0s lag
	tf.ratesHistory.add(sinceZero(100*time.Second), 200)
	tf.ratesHistory.add(sinceZero(109*time.Second), 140)
	tf.process(lagRecord(sinceZero(110*time.Second), r2, 0))
	if err := tf.checkState(stateDecreaseAndGuessRate, 140, sinceZero(100*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 leaves the "replica under test" as soon as the test duration is up
	// or its lag improved to a better state.
	// Here we simulate that its lag improved and the resulting state goes from
	// "decrease" to "increase".
	// r1 @ 119s, 0s lag (triggers the increase state)
	tf.ratesHistory.add(sinceZero(118*time.Second), 140)
	tf.process(lagRecord(sinceZero(119*time.Second), r1, 0))
	// Rate increases to 170, the middle of: [good, bad] = [140, 200].
	if err := tf.checkState(stateIncreaseRate, 170, sinceZero(119*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   366  
// TestMaxReplicationLagModule_ReplicaUnderTest_DecreaseToEmergency verifies
// that the current "replica under test" is ignored when our state changes from
// "stateDecreaseAndGuessRate" to "stateEmergency".
func TestMaxReplicationLagModule_ReplicaUnderTest_DecreaseToEmergency(t *testing.T) {
	tf, err := newTestFixtureWithMaxReplicationLag(5)
	if err != nil {
		t.Fatal(err)
	}

	// INCREASE

	// r1 @  20s, 0s lag
	// This lag record is required in the next step to correctly calculate how
	// much r1 lags behind due to the rate increase.
	tf.ratesHistory.add(sinceZero(19*time.Second), 100)
	tf.process(lagRecord(sinceZero(20*time.Second), r1, 0))
	// State and rate are unchanged: the initial wait period before the first
	// increase has not elapsed yet.
	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
		t.Fatal(err)
	}

	// DECREASE

	// r1 @  40s, 3s lag (above target, provokes a decrease)
	tf.ratesHistory.add(sinceZero(39*time.Second), 100)
	tf.process(lagRecord(sinceZero(40*time.Second), r1, 3))
	// r1 becomes the "replica under test".
	if err := tf.checkState(stateDecreaseAndGuessRate, 70, sinceZero(40*time.Second)); err != nil {
		t.Fatal(err)
	}

	// EMERGENCY

	// r2 lag is even worse and triggers the "emergency" state.
	// r2 @  50s, 10s lag
	tf.ratesHistory.add(sinceZero(40*time.Second), 100)
	tf.ratesHistory.add(sinceZero(49*time.Second), 70)
	tf.process(lagRecord(sinceZero(50*time.Second), r2, 10))
	// r2 overrides r1 as new "replica under test".
	if err := tf.checkState(stateEmergency, 35, sinceZero(50*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 lag becomes worse than the r2 lag now. We don't care and keep r2 as
	// "replica under test" for now.
	// r1 @  60s, 15s lag
	tf.ratesHistory.add(sinceZero(50*time.Second), 70)
	tf.ratesHistory.add(sinceZero(59*time.Second), 35)
	tf.process(lagRecord(sinceZero(60*time.Second), r1, 15))
	// State and rate are expected to stay at the values set @ 50s.
	if err := tf.checkState(stateEmergency, 35, sinceZero(50*time.Second)); err != nil {
		t.Fatal(err)
	}

	// INCREASE

	// r2 fully recovers and triggers an increase.
	// r2 @  70s, 0s lag
	tf.ratesHistory.add(sinceZero(69*time.Second), 35)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	// r2 becomes the new "replica under test".
	if err := tf.checkState(stateIncreaseRate, 70, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// EMERGENCY

	// r1 hasn't recovered yet and forces us to go back to the "emergency" state.
	// r1 @  80s, 15s lag
	tf.ratesHistory.add(sinceZero(70*time.Second), 35)
	tf.ratesHistory.add(sinceZero(79*time.Second), 70)
	tf.process(lagRecord(sinceZero(80*time.Second), r1, 15))
	// r1 becomes the new "replica under test".
	if err := tf.checkState(stateEmergency, 35, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   442  
   443  // TestMaxReplicationLagModule_Increase_BadRateUpperBound verifies that a
   444  // known bad rate is always the upper bound for any rate increase.
   445  func TestMaxReplicationLagModule_Increase_BadRateUpperBound(t *testing.T) {
   446  	tf, err := newTestFixtureWithMaxReplicationLag(5)
   447  	if err != nil {
   448  		t.Fatal(err)
   449  	}
   450  
   451  	// We start at config.InitialRate.
   452  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   453  		t.Fatal(err)
   454  	}
   455  
   456  	// Assume that a bad value of 150 was set @ 30s and log error
   457  	if err := tf.m.memory.markBad(150, sinceZero(30*time.Second)); err != nil {
   458  		log.Errorf("tf.m.memory.markBad(150, sinceZero(30*time.Second)) falied : %v", err)
   459  	}
   460  
   461  	// r2 @  70s, 0s lag
   462  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
   463  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
   464  	// Rate should get increased to 200 based on actual rate of 100 within
   465  	// [0s, 69s].
   466  	// However, this would go over the bad rate. Therefore, the new rate will be
   467  	// the middle of [100, 150] ([actual rate, bad rate]).
   468  	if err := tf.checkState(stateIncreaseRate, 125, sinceZero(70*time.Second)); err != nil {
   469  		t.Fatal(err)
   470  	}
   471  }
   472  
   473  // TestMaxReplicationLagModule_Increase_MinimumProgress verifies that the
   474  // calculated new rate is never identical to the current rate and at least by
   475  // "memoryGranularity" higher.
   476  func TestMaxReplicationLagModule_Increase_MinimumProgress(t *testing.T) {
   477  	tf, err := newTestFixtureWithMaxReplicationLag(5)
   478  	if err != nil {
   479  		t.Fatal(err)
   480  	}
   481  
   482  	// We start at config.InitialRate.
   483  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   484  		t.Fatal(err)
   485  	}
   486  
   487  	// Assume that a bad value of 105 was set @ 30s.
   488  	tf.m.memory.markBad(105, sinceZero(30*time.Second))
   489  
   490  	// r2 @  70s, 0s lag
   491  	// Assume that the actual rate was below the limit (95 instead of 100).
   492  	tf.ratesHistory.add(sinceZero(69*time.Second), 95)
   493  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
   494  	// Rate should get increased to 190 based on an actual rate of 95 within
   495  	// [0s, 69s].
   496  	// However, this would go over the bad rate. Therefore, the new rate should
   497  	// be the middle of [95, 105] ([actual rate, bad rate]).
   498  	// But then the new rate is identical to the old set rate of 100.
   499  	// In such a case, we always advance the new rate by "memoryGranularity"
   500  	// (which is currently 5).
   501  	if err := tf.checkState(stateIncreaseRate, 105, sinceZero(70*time.Second)); err != nil {
   502  		t.Fatal(err)
   503  	}
   504  }
   505  
   506  // TestMaxReplicationLagModule_Decrease verifies that we correctly calculate the
   507  // replication rate in the decreaseAndGuessRate state.
   508  func TestMaxReplicationLagModule_Decrease(t *testing.T) {
   509  	tf, err := newTestFixtureWithMaxReplicationLag(5)
   510  	if err != nil {
   511  		t.Fatal(err)
   512  	}
   513  
   514  	// r2 @  70s, 0s lag
   515  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
   516  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
   517  	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
   518  	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
   519  		t.Fatal(err)
   520  	}
   521  
   522  	// r2 @  90s, 3s lag (above target, provokes a decrease)
   523  	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
   524  	tf.ratesHistory.add(sinceZero(89*time.Second), 200)
   525  	tf.process(lagRecord(sinceZero(90*time.Second), r2, 3))
   526  	// The guessed replication rate is 140 because of the 3s lag increase
   527  	// within the elapsed 20s.
   528  	// The replica processed only 17s worth of work (20s duration - 3s lag increase)
   529  	// 17s / 20s * 200 QPS actual rate => 170 QPS replica rate
   530  	//
   531  	// This results in a backlog of 3s * 200 QPS = 600 queries.
   532  	// Since this backlog is spread across SpreadBacklogAcrossSec (20s),
   533  	// the guessed rate gets further reduced by 30 QPS (600 queries / 20s).
   534  	// Hence, the rate is set to 140 QPS (170 - 30).
   535  	if err := tf.checkState(stateDecreaseAndGuessRate, 140, sinceZero(90*time.Second)); err != nil {
   536  		t.Fatal(err)
   537  	}
   538  }
   539  
// TestMaxReplicationLagModule_Decrease_NoReplicaHistory skips decreasing the
// rate when the lag of r2 goes above the target value because we don't have a
// replication lag value since the last rate change for r2. Therefore, we cannot
// reliably guess its rate and wait for the next available record.
func TestMaxReplicationLagModule_Decrease_NoReplicaHistory(t *testing.T) {
	tf, err := newTestFixtureWithMaxReplicationLag(10)
	if err != nil {
		t.Fatal(err)
	}

	// r2 @  70s, 0s lag
	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 @  80s, 3s lag (above target, but no decrease triggered)
	// There is no lag record for r1 since the rate change @ 70s, so the
	// guessed-rate calculation cannot run for it.
	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
	tf.ratesHistory.add(sinceZero(79*time.Second), 200)
	tf.process(lagRecord(sinceZero(80*time.Second), r1, 3))
	// Rate was decreased by 25% (half the emergency decrease) as safety measure.
	if err := tf.checkState(stateDecreaseAndGuessRate, 150, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 @  90s, 0s lag
	tf.ratesHistory.add(sinceZero(80*time.Second), 200)
	tf.ratesHistory.add(sinceZero(89*time.Second), 150)
	tf.process(lagRecord(sinceZero(90*time.Second), r2, 0))
	// r2 is ignored because r1 is the "replica under test".
	if err := tf.checkState(stateDecreaseAndGuessRate, 150, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 recovers after the rate decrease and triggers a new increase.
	// r1 @ 100s, 0s lag
	tf.ratesHistory.add(sinceZero(99*time.Second), 150)
	tf.process(lagRecord(sinceZero(100*time.Second), r1, 0))
	// Rate doubles to 300 based on the actual rate of 150.
	if err := tf.checkState(stateIncreaseRate, 300, sinceZero(100*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   584  
// TestMaxReplicationLagModule_IgnoreNSlowestReplicas_REPLICA runs the
// "ignore the N slowest" scenario with the REPLICA tablet pair.
func TestMaxReplicationLagModule_IgnoreNSlowestReplicas_REPLICA(t *testing.T) {
	testIgnoreNSlowestReplicas(t, r1, r2)
}
   588  
// TestMaxReplicationLagModule_IgnoreNSlowestReplicas_RDONLY runs the
// "ignore the N slowest" scenario with the RDONLY tablet pair.
func TestMaxReplicationLagModule_IgnoreNSlowestReplicas_RDONLY(t *testing.T) {
	testIgnoreNSlowestReplicas(t, rdonly1, rdonly2)
}
   592  
// testIgnoreNSlowestReplicas drives the "ignore the N slowest replicas"
// heuristic for one tablet type. r1UID/r2UID select the pair under test:
// (r1, r2) exercises the REPLICA config, any other pair the RDONLY config.
func testIgnoreNSlowestReplicas(t *testing.T, r1UID, r2UID uint32) {
	config := NewMaxReplicationLagModuleConfig(5)
	typ := "REPLICA"
	if r1UID == r1 {
		config.IgnoreNSlowestReplicas = 1
	} else {
		config.IgnoreNSlowestRdonlys = 1
		typ = "RDONLY"
	}
	tf, err := newTestFixture(config)
	if err != nil {
		t.Fatal(err)
	}

	// r1 @  80s, 0s lag
	tf.ratesHistory.add(sinceZero(79*time.Second), 100)
	tf.process(lagRecord(sinceZero(80*time.Second), r1UID, 0))
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 @  90s, 10s lag
	// NOTE(review): other tests record the new rate at t-1s (here: 89s);
	// 90s is used here — confirm this is intentional.
	tf.ratesHistory.add(sinceZero(80*time.Second), 100)
	tf.ratesHistory.add(sinceZero(90*time.Second), 200)
	tf.process(lagRecord(sinceZero(90*time.Second), r2UID, 10))
	// Although r2's lag is high, it's ignored because it's the 1 slowest replica.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}
	// The skip decision must still show up on the results page.
	results := tf.m.results.latestValues()
	if got, want := len(results), 2; got != want {
		t.Fatalf("skipped replica should have been recorded on the results page. got = %v, want = %v", got, want)
	}
	if got, want := results[0].Reason, fmt.Sprintf("skipping this replica because it's among the 1 slowest %v tablets", typ); got != want {
		t.Fatalf("skipped replica should have been recorded as skipped on the results page. got = %v, want = %v", got, want)
	}

	// r1 @ 100s, 20s lag
	tf.ratesHistory.add(sinceZero(99*time.Second), 200)
	tf.process(lagRecord(sinceZero(100*time.Second), r1UID, 20))
	// r1 would become the new 1 slowest replica. However, we do not ignore it
	// because then we would ignore all known replicas in a row.
	// => react to the high lag and reduce the rate by 50% from 200 to 100.
	if err := tf.checkState(stateEmergency, 100, sinceZero(100*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   640  
   641  func TestMaxReplicationLagModule_IgnoreNSlowestReplicas_NotEnoughReplicas(t *testing.T) {
   642  	config := NewMaxReplicationLagModuleConfig(5)
   643  	config.IgnoreNSlowestReplicas = 1
   644  	tf, err := newTestFixture(config)
   645  	if err != nil {
   646  		t.Fatal(err)
   647  	}
   648  
   649  	// r2 @  70s, 10s lag
   650  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
   651  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 10))
   652  	// r2 is the 1 slowest replica. However, it's not ignored because then we
   653  	// would ignore all replicas. Therefore, we react to its lag increase.
   654  	if err := tf.checkState(stateEmergency, 50, sinceZero(70*time.Second)); err != nil {
   655  		t.Fatal(err)
   656  	}
   657  }
   658  
// TestMaxReplicationLagModule_IgnoreNSlowestReplicas_IsIgnoredDuringIncrease
// is almost identical to
// TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp.
// r2 triggers an increase and we wait for its next update after the increase
// waiting period. However, it becomes the slowest replica in the meantime and
// will be completely ignored. In consequence, we no longer wait for r2 and r1
// can trigger another increase instead.
func TestMaxReplicationLagModule_IgnoreNSlowestReplicas_IsIgnoredDuringIncrease(t *testing.T) {
	config := NewMaxReplicationLagModuleConfig(5)
	config.IgnoreNSlowestReplicas = 1
	tf, err := newTestFixture(config)
	if err != nil {
		t.Fatal(err)
	}

	// r2 @  70s, 0s lag
	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	// Rate was increased to 200 based on actual rate of 100 within [0s, 69s].
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 @  80s, 0s lag
	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
	tf.ratesHistory.add(sinceZero(79*time.Second), 200)
	tf.process(lagRecord(sinceZero(80*time.Second), r1, 0))
	// Lag record was ignored because it's within the wait period.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 becomes slow and will be ignored now.
	// r2 @  90s, 10s lag
	// NOTE: The record goes directly into the lag cache (not through
	// process()), so the state machine does not run for it.
	tf.ratesHistory.add(sinceZero(89*time.Second), 200)
	tf.m.replicaLagCache.add(lagRecord(sinceZero(90*time.Second), r2, 10))
	// We ignore the 1 slowest replica and do not decrease despite r2's high lag.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r1 @ 110s, 0s lag
	tf.ratesHistory.add(sinceZero(109*time.Second), 200)
	tf.process(lagRecord(sinceZero(110*time.Second), r1, 0))
	// Meanwhile, r1 is doing fine and will trigger the next increase because
	// we're no longer waiting for the ignored r2.
	if err := tf.checkState(stateIncreaseRate, 400, sinceZero(110*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   709  
// TestMaxReplicationLagModule_IgnoreNSlowestReplicas_IncludeRdonly is the same
// as TestMaxReplicationLagModule_IgnoreNSlowestReplicas but includes
// RDONLY tablets as well. REPLICA and RDONLY tablets are counted separately:
// each tablet type has its own "ignore N slowest" budget.
func TestMaxReplicationLagModule_IgnoreNSlowestReplicas_IncludeRdonly(t *testing.T) {
	config := NewMaxReplicationLagModuleConfig(5)
	// We ignore up to 1 REPLICA and 1 RDONLY tablet.
	config.IgnoreNSlowestReplicas = 1
	config.IgnoreNSlowestRdonlys = 1
	tf, err := newTestFixture(config)
	if err != nil {
		t.Fatal(err)
	}

	// r1     @  80s, 0s lag
	tf.ratesHistory.add(sinceZero(79*time.Second), 100)
	tf.process(lagRecord(sinceZero(80*time.Second), r1, 0))
	// Rate was increased to 200 based on the actual rate of 100 within [0s, 79s].
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}

	// rdonly1 @  85s, 0s lag
	tf.ratesHistory.add(sinceZero(80*time.Second), 100)
	tf.ratesHistory.add(sinceZero(84*time.Second), 200)
	tf.process(lagRecord(sinceZero(85*time.Second), rdonly1, 0))
	// State is unchanged (still the increase from 80s).
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2     @  90s, 10s lag
	tf.ratesHistory.add(sinceZero(89*time.Second), 200)
	tf.process(lagRecord(sinceZero(90*time.Second), r2, 10))
	// Although r2's lag is high, it's ignored because it's the 1 slowest REPLICA tablet.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}
	// The skip must still be visible on the results page.
	results := tf.m.results.latestValues()
	if got, want := len(results), 3; got != want {
		t.Fatalf("skipped replica should have been recorded on the results page. got = %v, want = %v", got, want)
	}
	if got, want := results[0].Reason, "skipping this replica because it's among the 1 slowest REPLICA tablets"; got != want {
		t.Fatalf("skipped replica should have been recorded as skipped on the results page. got = %v, want = %v", got, want)
	}

	// rdonly2 @  95s, 10s lag
	tf.ratesHistory.add(sinceZero(94*time.Second), 200)
	tf.process(lagRecord(sinceZero(95*time.Second), rdonly2, 10))
	// Although rdonly2's lag is high, it's ignored because it's the 1 slowest RDONLY tablet.
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(80*time.Second)); err != nil {
		t.Fatal(err)
	}
	results = tf.m.results.latestValues()
	if got, want := len(results), 4; got != want {
		t.Fatalf("skipped replica should have been recorded on the results page. got = %v, want = %v", got, want)
	}
	if got, want := results[0].Reason, "skipping this replica because it's among the 1 slowest RDONLY tablets"; got != want {
		t.Fatalf("skipped replica should have been recorded as skipped on the results page. got = %v, want = %v", got, want)
	}

	// r1     @ 100s, 10s lag
	tf.ratesHistory.add(sinceZero(99*time.Second), 200)
	tf.process(lagRecord(sinceZero(100*time.Second), r1, 10))
	// r1 would become the new 1 slowest REPLICA tablet. However, we do not ignore
	// it because then we would ignore all known replicas in a row.
	// => react to the high lag and reduce the rate by 50% from 200 to 100.
	if err := tf.checkState(stateEmergency, 100, sinceZero(100*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 and rdonly are omitted here for brevity.

	// r1 recovers, but then rdonly1 becomes slow as well.

	// r1     @ 120s, 0s lag
	tf.ratesHistory.add(sinceZero(119*time.Second), 100)
	tf.process(lagRecord(sinceZero(120*time.Second), r1, 0))
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(120*time.Second)); err != nil {
		t.Fatal(err)
	}

	// rdonly1 @ 125s, 11s lag
	tf.ratesHistory.add(sinceZero(120*time.Second), 100)
	tf.ratesHistory.add(sinceZero(124*time.Second), 200)
	tf.process(lagRecord(sinceZero(125*time.Second), rdonly1, 11))
	// rdonly1 would become the new 1 slowest RDONLY tablet. However, we do not
	// ignore it because then we would ignore all known replicas in a row.
	// => react to the high lag and reduce the rate by 50% from 200 to 100.
	if err := tf.checkState(stateEmergency, 100, sinceZero(125*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   800  
// TestMaxReplicationLagModule_EmergencyDoesNotChangeBadValues verifies that a
// replica, which triggers the emergency state and drags the rate all the way
// down to the minimum, does not influence the stored bad rate in the "memory".
// This situation occurs when a tablet gets restarted and restores from a
// backup. During that time, the reported replication lag is very high and won't
// go down until the restore finished (which may take hours). Once the restore
// is done, the throttler should immediately continue from the last good rate
// before the emergency and not have to test rates starting from the minimum.
// In particular, the stored bad rate must not be close the minimum rate or it
// may take hours until e.g. a bad rate of 2 ages out to the actual bad rate of
// e.g. 201.
func TestMaxReplicationLagModule_EmergencyDoesNotChangeBadValues(t *testing.T) {
	config := NewMaxReplicationLagModuleConfig(5)
	// Use a very aggressive aging rate to verify that bad rates do not age while
	// we're in the "emergency" state.
	config.AgeBadRateAfterSec = 21
	tf, err := newTestFixture(config)
	if err != nil {
		t.Fatal(err)
	}

	// INCREASE (necessary to set a "good" rate in the memory)

	// r2 @  70s, 0s lag
	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
	if err := tf.checkState(stateIncreaseRate, 200, sinceZero(70*time.Second)); err != nil {
		t.Fatal(err)
	}

	// r2 @ 110s, 0s lag
	tf.ratesHistory.add(sinceZero(70*time.Second), 100)
	tf.ratesHistory.add(sinceZero(109*time.Second), 200)
	tf.process(lagRecord(sinceZero(110*time.Second), r2, 0))
	if err := tf.checkState(stateIncreaseRate, 400, sinceZero(110*time.Second)); err != nil {
		t.Fatal(err)
	}
	// 200 was sustained without lag, so it's now the highest known good rate.
	if got, want := tf.m.memory.highestGood(), int64(200); got != want {
		t.Fatalf("wrong good rate: got = %v, want = %v", got, want)
	}

	// DECREASE (necessary to set a "bad" rate in the memory)

	// r2 @ 130s, 3s lag (above target, provokes a decrease)
	tf.ratesHistory.add(sinceZero(110*time.Second), 200)
	tf.ratesHistory.add(sinceZero(129*time.Second), 400)
	tf.process(lagRecord(sinceZero(130*time.Second), r2, 3))
	if err := tf.checkState(stateDecreaseAndGuessRate, 280, sinceZero(130*time.Second)); err != nil {
		t.Fatal(err)
	}
	// 400 caused lag above the target, so it's recorded as the lowest bad rate.
	if got, want := tf.m.memory.lowestBad(), int64(400); got != want {
		t.Fatalf("wrong bad rate: got = %v, want = %v", got, want)
	}

	// memory: [good, bad] now is [200, 400].

	// EMERGENCY

	// Assume that "r1" was just restored and has a very high replication lag.
	// r1 @ 140s, 3600s lag
	tf.ratesHistory.add(sinceZero(130*time.Second), 400)
	tf.ratesHistory.add(sinceZero(139*time.Second), 280)
	tf.process(lagRecord(sinceZero(140*time.Second), r1, 3600))
	if err := tf.checkState(stateEmergency, 140, sinceZero(140*time.Second)); err != nil {
		t.Fatal(err)
	}
	// The transition into the emergency state may update the bad rate once
	// (here: from 400 down to the last actual rate of 280) ...
	if got, want := tf.m.memory.lowestBad(), int64(280); got != want {
		t.Fatalf("bad rate should change when we transition to the emergency state: got = %v, want = %v", got, want)
	}

	// memory: [good, bad] now is [200, 280].

	// r2 @ 150s, 0s lag (ignored because r1 is the "replica under test")
	tf.ratesHistory.add(sinceZero(140*time.Second), 280)
	tf.ratesHistory.add(sinceZero(149*time.Second), 140)
	tf.process(lagRecord(sinceZero(150*time.Second), r2, 0))
	if err := tf.checkState(stateEmergency, 140, sinceZero(140*time.Second)); err != nil {
		t.Fatal(err)
	}
	tf.ratesHistory.add(sinceZero(160*time.Second), 140)

	// r1 keeps to drive the throttler rate down, but not the bad rate.

	// Expected rates while the emergency lasts: each reaction halves the
	// previous rate (integer division) until it bottoms out at 1.
	times := []int{160, 180, 200, 220, 240, 260, 280, 300, 320}
	rates := []int{70, 35, 17, 8, 4, 2, 1, 1, 1}

	for i, tm := range times {
		// r1 @ <tt>s, 3600s lag
		r1Time := sinceZero(time.Duration(tm) * time.Second)
		if i > 0 {
			tf.ratesHistory.add(r1Time, int64(rates[i-1]))
		}
		tf.process(lagRecord(r1Time, r1, 3600))
		if err := tf.checkState(stateEmergency, int64(rates[i]), r1Time); err != nil {
			t.Fatalf("time=%d: %v", tm, err)
		}
		// ... but while we stay in the emergency state, the bad rate must not
		// change (and in particular must not age down to the minimum).
		if got, want := tf.m.memory.lowestBad(), int64(280); got != want {
			t.Fatalf("time=%d: bad rate must not change when the old state is the emergency state: got = %v, want = %v", tm, got, want)
		}

		// r2 @ <tt+10>s, 0s lag (ignored because r1 is the "replica under test")
		r2Time := sinceZero(time.Duration(tm+10) * time.Second)
		tf.ratesHistory.add(r2Time, int64(rates[i]))
		tf.process(lagRecord(r2Time, r2, 0))
		if err := tf.checkState(stateEmergency, int64(rates[i]), r1Time); err != nil {
			t.Fatalf("time=%d: %v", tm, err)
		}
	}

	// INCREASE

	// r1 is fully restored now and its lag is zero.
	// We'll leave the emergency state and increase the rate based of the last
	// stored "good" rate.

	// r1 @ 340s, 0s lag
	tf.ratesHistory.add(sinceZero(339*time.Second), 1)
	tf.process(lagRecord(sinceZero(340*time.Second), r1, 0))
	// New rate is 240, the middle of [good, bad] = [200, 280].
	if err := tf.checkState(stateIncreaseRate, 240, sinceZero(340*time.Second)); err != nil {
		t.Fatal(err)
	}
}
   924  
   925  func TestMaxReplicationLagModule_NoIncreaseIfMaxRateWasNotApproached(t *testing.T) {
   926  	config := NewMaxReplicationLagModuleConfig(5)
   927  	tf, err := newTestFixture(config)
   928  	if err != nil {
   929  		t.Fatal(err)
   930  	}
   931  
   932  	// r1 @  20s, 0s lag
   933  	// This lag record is required in the next step to correctly calculate how
   934  	// much r1 lags behind due to the rate increase.
   935  	tf.process(lagRecord(sinceZero(20*time.Second), r1, 0))
   936  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   937  		t.Fatal(err)
   938  	}
   939  
   940  	// Master gets 10 QPS in second 69.
   941  	// r1 @  70s, 0s lag.
   942  	// Rate does not double to 200 as it does in the other tests because
   943  	// the actual rate is much smaller than the current rate of 100.
   944  	tf.ratesHistory.add(sinceZero(69*time.Second), 10)
   945  	tf.process(lagRecord(sinceZero(70*time.Second), r1, 0))
   946  	// r1 becomes the "replica under test".
   947  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   948  		t.Fatal(err)
   949  	}
   950  }
   951  
   952  // lagRecord creates a fake record using a fake LegacyTabletStats object.
   953  func lagRecord(t time.Time, uid, lag uint32) replicationLagRecord {
   954  	return replicationLagRecord{t, tabletStats(uid, lag)}
   955  }
   956  
   957  // tabletStats creates fake tablet health data.
   958  func tabletStats(uid, lag uint32) discovery.TabletHealth {
   959  	typ := topodatapb.TabletType_REPLICA
   960  	if uid == rdonly1 || uid == rdonly2 {
   961  		typ = topodatapb.TabletType_RDONLY
   962  	}
   963  	tablet := &topodatapb.Tablet{
   964  		Alias:    &topodatapb.TabletAlias{Cell: "cell1", Uid: uid},
   965  		Keyspace: "ks1",
   966  		Shard:    "-80",
   967  		Type:     typ,
   968  		PortMap:  map[string]int32{"vt": int32(uid)},
   969  	}
   970  	return discovery.TabletHealth{
   971  		Tablet: tablet,
   972  		Target: &querypb.Target{
   973  			Keyspace:   "ks1",
   974  			Shard:      "-80",
   975  			TabletType: typ,
   976  		},
   977  		Serving: true,
   978  		Stats: &querypb.RealtimeStats{
   979  			ReplicationLagSeconds: lag,
   980  		},
   981  		PrimaryTermStartTime: 22,
   982  		LastError:            nil,
   983  	}
   984  }
   985  
   986  func TestApplyLatestConfig(t *testing.T) {
   987  	config := NewMaxReplicationLagModuleConfig(5)
   988  	tf, err := newTestFixture(config)
   989  	if err != nil {
   990  		t.Fatal(err)
   991  	}
   992  
   993  	// We start at config.InitialRate.
   994  	if err := tf.checkState(stateIncreaseRate, 100, sinceZero(1*time.Second)); err != nil {
   995  		t.Fatal(err)
   996  	}
   997  	// Change the default MaxIncrease from 100% to 200% and test that it's
   998  	// correctly propagated.
   999  	config.MaxIncrease = 2
  1000  	tf.m.updateConfiguration(config.Configuration, true /* copyZeroValues */)
  1001  
  1002  	// r2 @  70s, 0s lag
  1003  	tf.ratesHistory.add(sinceZero(69*time.Second), 100)
  1004  	tf.process(lagRecord(sinceZero(70*time.Second), r2, 0))
  1005  	// Rate was increased to 300 based on an actual rate of 100 within [0s, 69s].
  1006  	// That's a 200% increase.
  1007  	if err := tf.checkState(stateIncreaseRate, 300, sinceZero(70*time.Second)); err != nil {
  1008  		t.Fatal(err)
  1009  	}
  1010  
  1011  	// Now reset the config to its default values.
  1012  	tf.m.resetConfiguration()
  1013  	// Let's assume that the current rate of 300 was actually fine.
  1014  	// After the reset, we'll increase it only by 100% to 600.
  1015  
  1016  	// r2 @ 110s, 0s lag
  1017  	tf.ratesHistory.add(sinceZero(80*time.Second), 300)
  1018  	tf.ratesHistory.add(sinceZero(109*time.Second), 300)
  1019  	tf.process(lagRecord(sinceZero(110*time.Second), r2, 0))
  1020  	if err := tf.checkState(stateIncreaseRate, 600, sinceZero(110*time.Second)); err != nil {
  1021  		t.Fatal(err)
  1022  	}
  1023  }