github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/container/container_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package container_test // intentionally test from external package

import (
	"context"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/container"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	providertestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider/testutils"
	transporttestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/transport/testutils"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/require"
)

type LateBoundDialer struct {
	Wrapped *transporttestutils.ChanDialer
}

func (d *LateBoundDialer) Dial(ctx context.Context, nodeID roachpb.NodeID) (ctpb.Client, error) {
	return d.Wrapped.Dial(ctx, nodeID)
}

func (d *LateBoundDialer) Ready(nodeID roachpb.NodeID) bool {
	return d.Wrapped.Ready(nodeID)
}

type TestContainer struct {
	*container.Container
	NodeID    roachpb.NodeID
	Refreshed struct {
		syncutil.Mutex
		RangeIDs []roachpb.RangeID
	}
	Dialer    *LateBoundDialer
	TestClock *providertestutils.TestClock
}

func prepareContainer() *TestContainer {
	stopper := stop.NewStopper()

	tc := &TestContainer{}

	tc.TestClock = providertestutils.NewTestClock(stopper)

	var wg sync.WaitGroup
	wg.Add(1)
	refresh := func(requested ...roachpb.RangeID) {
		tc.Refreshed.Lock()
		tc.Refreshed.RangeIDs = append(tc.Refreshed.RangeIDs, requested...)
		tc.Refreshed.Unlock()
	}

	st := cluster.MakeTestingClusterSettings()

	// Set the target duration to a second and the close fraction so small
	// that the Provider will essentially close in a hot loop. In this test
	// we'll block in the clock to pace the Provider's closer loop.
	closedts.TargetDuration.Override(&st.SV, time.Second)
	closedts.CloseFraction.Override(&st.SV, 1e-9)

	// We perform a little dance with the Dialer. It needs to be hooked up to the
	// Server, but that's only created in NewContainer. The Dialer isn't used until
	// that point, so we just create it a little later.
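	// (The wrapped ChanDialer is filled in by setupTwoNodeTest once both
	// containers' Servers exist; until then the LateBoundDialer stays empty.)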
	tc.Dialer = &LateBoundDialer{}

	cfg := container.Config{
		Settings: st,
		Stopper:  stopper,
		Clock:    tc.TestClock.LiveNow,
		Refresh:  refresh,
		Dialer:   tc.Dialer,
	}

	tc.Container = container.NewContainer(cfg)
	return tc
}

func setupTwoNodeTest() (_ *TestContainer, _ *TestContainer, shutdown func()) {
	c1 := prepareContainer()
	c2 := prepareContainer()

	c1.NodeID = roachpb.NodeID(1)
	c2.NodeID = roachpb.NodeID(2)

	c1.Start(c1.NodeID)
	c2.Start(c2.NodeID)

	// Link the containers.
	c1.Dialer.Wrapped = transporttestutils.NewChanDialer(c1.Stopper, c2.Server)
	c2.Dialer.Wrapped = transporttestutils.NewChanDialer(c2.Stopper, c1.Server)

	return c1, c2, func() {
		// Oh, the joy of multiple stoppers.
		var wg sync.WaitGroup
		wg.Add(2)
		go func() {
			defer wg.Done()
			c1.Stopper.Stop(context.Background())
		}()
		go func() {
			defer wg.Done()
			c2.Stopper.Stop(context.Background())
		}()
		wg.Wait()
	}
}

func TestTwoNodes(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()

	c1, c2, shutdown := setupTwoNodeTest()
	defer shutdown()
	defer func() {
		t.Logf("n1 -> n2: %s", pretty.Sprint(c1.Dialer.Wrapped.Transcript(c2.NodeID)))
		t.Logf("n2 -> n1: %s", pretty.Sprint(c2.Dialer.Wrapped.Transcript(c1.NodeID)))
	}()
	const (
		ep0 ctpb.Epoch = iota
		ep1
		ep2
	)
	// Initially, can't serve random things for either n1 or n2.
	require.True(t, c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(5), ep0, ctpb.LAI(0)).IsEmpty(),
	)
	require.True(t, c1.Container.Provider.MaxClosed(
		c2.NodeID, roachpb.RangeID(5), ep0, ctpb.LAI(0)).IsEmpty(),
	)

	// Track and release a command.
	ts, release := c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(17), ctpb.LAI(12))

	// The command is forced above ts=0.2. This is just an artifact of how the
	// Tracker is implemented - it closes out 0.1 first, so it begins by forcing
	// commands just above that.
	require.Equal(t, hlc.Timestamp{Logical: 2}, ts)

	// The clock gives a timestamp to the Provider, which should close out the
	// current timestamp and set up 2E9-1E9=1E9 as the next one it wants to close.
	// We do this twice (for the same timestamp) to make sure that the Provider
	// not only read the tick, but also processed it. Otherwise, it becomes hard
	// to write the remainder of the test because the commands we track below may
	// fall into either case, and may be forced above the old or new timestamp.
	for i := 0; i < 2; i++ {
		c1.TestClock.Tick(hlc.Timestamp{WallTime: 2e9}, ep1, nil)
	}

	// The Tracker still won't let us serve anything, even though it has closed out
	// 0.1 - this is because it has no information about any ranges at that timestamp.
	// (Note that the Tracker may not have processed the closing yet, so if there were
	// a bug here, this test would fail flakily - that's ok).
	require.True(t, c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12)).IsEmpty(),
	)

	// Two more commands come in.
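	// The 2E9 tick above set 1E9 as the next timestamp to close, so the Tracker
	// now forces new commands just above 1E9; the two Equal assertions below
	// check exactly that.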
	ts, release = c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(17), ctpb.LAI(16))
	require.Equal(t, hlc.Timestamp{WallTime: 1e9, Logical: 1}, ts)

	ts, release = c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(8), ctpb.LAI(88))
	require.Equal(t, hlc.Timestamp{WallTime: 1e9, Logical: 1}, ts)

	// Now another tick. Shortly after it, we should be able to serve below 1E9, and 2E9 should
	// be the next planned closed timestamp (though we can only verify the former).
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 3e9}, ep1, nil)

	testutils.SucceedsSoon(t, func() error {
		if c1.Container.Provider.MaxClosed(
			c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
		).Less(hlc.Timestamp{WallTime: 1e9}) {
			return errors.New("still can't serve")
		}
		return nil
	})

	// Shouldn't be able to serve the same thing if we haven't caught up yet.
	require.False(t, !c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(11),
	).Less(hlc.Timestamp{WallTime: 1e9}))

	// Shouldn't be able to serve at a higher timestamp.
	require.False(t, !c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
	).Less(hlc.Timestamp{WallTime: 1e9, Logical: 1}))

	// Now things get a little more interesting. Tell node2 to get a stream of
	// information from node1. We do this via Request, which as a side effect lets
	// us ascertain that this request makes it to n1.
	c2.Clients.Request(roachpb.NodeID(1), roachpb.RangeID(18))
	testutils.SucceedsSoon(t, func() error {
		exp := []roachpb.RangeID{18}
		c1.Refreshed.Lock()
		defer c1.Refreshed.Unlock()
		if !reflect.DeepEqual(exp, c1.Refreshed.RangeIDs) {
			return errors.Errorf("still waiting for %v: currently %v", exp, c1.Refreshed.RangeIDs)
		}
		return nil
	})

	// And n2 should soon also be able to serve follower reads for a range led by
	// n1 when it has caught up.
	testutils.SucceedsSoon(t, func() error {
		if c2.Container.Provider.MaxClosed(
			c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
		).Less(hlc.Timestamp{WallTime: 1e9}) {
			return errors.New("n2 still can't serve")
		}
		return nil
	})

	// Remember the other proposals we tracked above on n1: (r17, 16) and (r8, 88). Feeding another
	// timestamp to n1, we should see them closed out at t=2E9, and both n1 and n2 should automatically
	// be able to serve them soon thereafter.
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 4e9}, ep1, nil)

	checkEpoch1Reads := func(ts hlc.Timestamp) {
		t.Helper()
		for i, c := range []*TestContainer{c1, c2} {
			for _, tuple := range []struct {
				roachpb.RangeID
				ctpb.LAI
			}{
				{17, 16},
				{8, 88},
			} {
				testutils.SucceedsSoon(t, func() error {
					t.Helper()
					if c.Container.Provider.MaxClosed(
						c1.NodeID, tuple.RangeID, ep1, tuple.LAI,
					).Less(ts) {
						return errors.Errorf("n%d still can't serve (r%d,%d) @ %s", i+1, tuple.RangeID, tuple.LAI, ts)
					}
					return nil
				})
				// Still can't serve when not caught up.
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep1, tuple.LAI-1,
				).Less(ts))
				// Can serve when more than caught up.
				require.True(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep1, tuple.LAI+1,
				).Less(ts))
				// Can't serve when in a different epoch, no matter whether it is
				// larger or smaller.
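				// (Closed timestamps only apply to the epoch they were closed
				// under, so queries at ep0 or ep2 must come back unservable here.)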
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep0, tuple.LAI,
				).Less(ts))
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep2, tuple.LAI,
				).Less(ts))
			}
		}
	}
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})

	// Tick again in epoch 1 and ensure that reads at t=3E9 can be safely served.
	// 3E9 gets closed out under the first epoch in this tick with 4E9 as the
	// timestamp to be closed next due to the 1s target interval.
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 5e9}, ep1, nil)
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// Uh-oh! n1 must've missed a heartbeat. The epoch goes up by one. This means
	// that soon (after the next tick) timestamps should be closed out under the
	// new epoch. The timestamp at which this happens is doctored to make sure the
	// Storage holds on to the past information, because we want to end-to-end test
	// that this all works out. Consequently we tick at the rotation interval plus
	// the target duration next (so that the next closed timestamp is the rotation
	// interval).
	c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 5e9}, ep2, nil)

	// Previously valid reads should remain valid.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// After the above tick makes it to the tracker, commands get forced above
	// the next closed timestamp, i.e. the above tick minus the target duration.
	// The SucceedsSoon is there to ensure that the tick in ep2 has actually made
	// it to the tracker.
	testutils.SucceedsSoon(t, func() error {
		ts, release = c1.Tracker.Track(ctx)
		release(ctx, ep2, roachpb.RangeID(123), ctpb.LAI(456))
		if !(&hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 4e9, Logical: 1}).Equal(ts) {
			return errors.Errorf("command still not forced above %v", ts)
		}
		return nil
	})

	// Previously valid reads should remain valid.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// With the next tick, epoch two fully goes into effect (as the first epoch two
	// timestamp gets closed out). We do this twice to make sure it's processed before
	// the test proceeds.
	c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 6e9}, ep2, nil)

	// Previously valid reads should remain valid. Note that this is because the
	// storage keeps historical data, and we've fine-tuned the epoch flip so that
	// it happens after the epoch 1 information rotates into another bucket and
	// thus is preserved. If the epoch changed at a smaller timestamp, that
	// would've wiped out the first epoch's information.
	//
	// TODO(tschottdorf): we could make the storage smarter so that it forces a
	// rotation when the epoch changes, at the expense of pushing out historical
	// information earlier. Frequent epoch changes could lead to very little
	// historical information in the storage. Probably better not to risk that.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// Another second, another tick. Now the proposal tracked during epoch 2 should
	// be readable from followers (as `scale+5E9` gets closed out).
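	// (The scale+6E9 tick above left scale+5E9 as the next timestamp to close;
	// the tick below closes it out, and scale+5E9 lies above the timestamp the
	// (r123, 456) proposal was forced to, so that read becomes servable.)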
	c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 7e9}, ep2, nil)
	for i, c := range []*TestContainer{c1, c2} {
		rangeID := roachpb.RangeID(123)
		lai := ctpb.LAI(456)
		epoch := ep2
		ts := hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 5e9}

		testutils.SucceedsSoon(t, func() error {
			if c.Container.Provider.MaxClosed(
				c1.NodeID, rangeID, epoch, lai,
			).Less(ts) {
				return errors.Errorf("n%d still can't serve (r%d,%d) @ %s", i+1, rangeID, lai, ts)
			}
			return nil
		})

		// Still can't serve when not caught up.
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch, lai-1,
		).Less(ts))

		// Can serve when more than caught up.
		require.True(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch, lai+1,
		).Less(ts))

		// Can't serve when in a different epoch, no matter whether it is larger
		// or smaller.
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch-1, lai,
		).Less(ts))
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch+1, lai,
		).Less(ts))
	}
}