github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/provider/provider_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package provider_test

import (
	"context"
	"reflect"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider"
	providertestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider/testutils"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/stretchr/testify/require"
	"golang.org/x/sync/errgroup"
)

func TestProviderSubscribeNotify(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	stopper := stop.NewStopper()
	defer stopper.Stop(ctx)

	st := cluster.MakeTestingClusterSettings()
	// We'll only unleash the closer loop when the test is basically done, and
	// once we do that we want it to run aggressively.
	// Testing that the closer loop works as advertised is left to another test.
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	storage := &providertestutils.TestStorage{}
	unblockClockCh := make(chan struct{})
	cfg := &provider.Config{
		NodeID:   2, // note that we're not using 1, just for kicks
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			select {
			case <-stopper.ShouldQuiesce():
			case <-unblockClockCh:
			}
			return hlc.Timestamp{}, ctpb.Epoch(1), errors.New("injected clock error")
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			panic("should never be called")
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	// We won't touch n1 in this test, so this entry should never pop up.
	unseenEntry := ctpb.Entry{
		ClosedTimestamp: hlc.Timestamp{WallTime: 456},
		Epoch:           17,
	}
	cfg.Storage.Add(1, unseenEntry)

	entryAt := func(i int) ctpb.Entry {
		return ctpb.Entry{
			ClosedTimestamp: hlc.Timestamp{WallTime: int64(i) * 1e9},
			Epoch:           ctpb.Epoch(i),
			MLAI: map[roachpb.RangeID]ctpb.LAI{
				roachpb.RangeID(i): ctpb.LAI(10 * i),
			},
		}
	}

	const numEntries = 10 // must be even
	var entries []ctpb.Entry
	for i := 0; i < numEntries; i++ {
		entries = append(entries, entryAt(i))
	}

	var readerSeq int32 // atomically
	reader := func() error {
		i := atomic.AddInt32(&readerSeq, 1)
		ctx, cancel := context.WithCancel(context.Background())
		defer cancel()
		ctx = logtags.AddTag(ctx, "reader", int(i))

		log.Infof(ctx, "starting")
		defer log.Infof(ctx, "done")

		ch := make(chan ctpb.Entry)
		stopper.RunWorker(ctx, func(ctx context.Context) {
			p.Subscribe(ctx, ch)
		})

		var sl []ctpb.Entry // for debug purposes only
		// Read entries off the subscription. We check two invariants:
		// 1. We see each Entry (identified via its Epoch) at least twice
		//    (morally exactly twice, but the Provider gives a weaker guarantee).
		// 2. An Entry can only be observed after the previous Entry has been seen
		//    at least once. That is, to see Epoch X, we need to have seen Epoch X-1.
		//
		// These could be sharpened somewhat, but only at a distinct loss of clarity
		// in this part of the test.
		//
		// Examples, writing only the Epoch (which uniquely identifies the Entry in
		// this test):
		// OK:
		// - 1 2 3 1 2 3
		//   First writer sends everything before the second writer.
		// - 1 1 2 3 1 2 3
		//   First writer sends everything before the second, but the first 1 gets
		//   duplicated by the Provider.
		// - 1 2 3 1 2 3 1 2 3
		//   Same as the last, but the whole thing gets duplicated.
		// - 1 2 3 2 3 1 2 3
		//   Only 2 3 gets duplicated.
		// Not OK:
		// - 1 1 2 3 3
		//   Two seen only once.
		// - 1 3 2 1 2 3
		//   Three observed before two.
		m := map[ctpb.Epoch]int{-1: 2} // pretend we've seen Epoch -1 twice; this streamlines the code below
		expM := map[ctpb.Epoch]int{-1: 2}
		for _, entry := range entries {
			expM[entry.Epoch] = 2
		}
		for {
			select {
			case <-time.After(10 * time.Second):
				return errors.Errorf("nothing emitted after %v", sl)
			case entry, ok := <-ch: // implies runtime.Gosched
				if !ok {
					if ctx.Err() != nil {
						// Expected, we must've canceled the context below earlier, which
						// means the checks were successful.
						return nil
					}
					return errors.New("sender closed channel before reader canceled their context")
				}
				sl = append(sl, entry)
				log.Infof(ctx, "got %d entries now, latest: %+v", len(sl), entry)
				diagErr := errors.Errorf("saw: %v", sl)
				prevEpo := entry.Epoch - 1
				if m[prevEpo] < 1 {
					return errors.Wrapf(
						diagErr,
						"entry for epoch %d received before a matching entry for immediately preceding epoch %d",
						entry.Epoch, prevEpo,
					)
				}
				m[entry.Epoch]++
				if m[entry.Epoch] > 2 {
					m[entry.Epoch] = 2
				}

				if reflect.DeepEqual(expM, m) && ctx.Err() == nil {
					log.Info(ctx, "canceling subscription")
					cancel()
					// As a little gotcha, we need to work around the implementation a
					// tiny bit. The provider uses a sync.Cond to notify clients and it
					// is likely waiting for new activity for our subscription.
					// Thus, it's not going to notice that this client is going away; it
					// would notice if the Provider's closer did its job (we've blocked
					// it so far) because that periodically wakes up all clients, rain or
					// shine. So we unblock it now; the Clock is set up to return errors,
					// so as a nice little benefit we verify that even in that case the
					// subscription does get woken up.
					close(unblockClockCh)
				}
			}
		}
	}

	// Add some entries via Notify, and race them with various subscriptions. Note
	// that in reality, we have only a single notification going on for the local
	// node (run by a Provider goroutine). But the data that comes in from other
	// nodes uses the same mechanism, and it's nice to get coverage for it. In
	// particular, during reconnections, you could imagine two notification
	// streams for the same NodeID being active in parallel.
	var g errgroup.Group
	for i := range []struct{}{{}, {}} { // twice
		i := i // goroutine copy
		g.Go(func() error {
			ctx := logtags.AddTag(context.Background(), "writer", i)
			log.Info(ctx, "starting")
			defer log.Info(ctx, "done")
			nCh := p.Notify(roachpb.NodeID(2))
			defer close(nCh)
			for _, entry := range entries {
				nCh <- entry // implies runtime.Gosched
				log.Infof(ctx, "wrote %s", entry)
			}
			return nil
		})
	}

	for i := 0; i < 1; i++ { // HACK
		g.Go(reader)
	}
	if err := g.Wait(); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		snap := storage.Snapshot()
		require.Equal(t, 2, len(snap))    // definitely should have records about two nodes
		require.Equal(t, 1, len(snap[1])) // one persisted entry for n1
		// Morally this is true immediately, but consider that the goroutine
		// consuming from the writer threads above may have read the entry but not
		// put it into the Storage yet. The reader threads would usually remove
		// this race, but they can be satisfied early by a duplicate that is
		// emitted during the switchover from storage to subscription.
		if exp, act := 2*numEntries, len(snap[2]); exp < act {
			t.Fatalf("got %d entries in storage, expected no more than %d", act, exp)
		} else if exp > act {
			return errors.Errorf("storage has %d entries, need %d", act, exp)
		}
		return nil
	})
}

// TestProviderSubscribeConcurrent prevents regression of a bug that improperly
// handled concurrent subscriptions.
func TestProviderSubscribeConcurrent(t *testing.T) {
	defer leaktest.AfterTest(t)()

	st := cluster.MakeTestingClusterSettings()
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	stopper := stop.NewStopper()
	storage := &providertestutils.TestStorage{}

	var ts int64 // atomic
	cfg := &provider.Config{
		NodeID:   1,
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			return hlc.Timestamp{}, 1, nil
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			return hlc.Timestamp{
					WallTime: atomic.AddInt64(&ts, 1),
				}, map[roachpb.RangeID]ctpb.LAI{
					1: ctpb.LAI(atomic.LoadInt64(&ts)),
				}, true
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer cancel()
	const n = 10
	var wg sync.WaitGroup
	wg.Add(n)
	for i := 0; i < n; i++ {
		go func() {
			defer wg.Done()
			ch := make(chan ctpb.Entry, 3)
			p.Subscribe(ctx, ch)
			// Read from the channel until the stopper stops the Provider (which in
			// turn closes the channel).
			for range ch {
			}
		}()
	}
	stopper.Stop(context.Background())
	wg.Wait()
}

// TestProviderTargetDurationSetting ensures that setting the target duration to
// zero disables closing the timestamp and that setting it back to a positive
// value re-enables it.
func TestProviderTargetDurationSetting(t *testing.T) {
	defer leaktest.AfterTest(t)()

	st := cluster.MakeTestingClusterSettings()
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	stopper := stop.NewStopper()
	storage := &providertestutils.TestStorage{}
	defer stopper.Stop(context.Background())

	var ts int64 // atomic
	var called int
	calledCh := make(chan struct{})
	cfg := &provider.Config{
		NodeID:   1,
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			return hlc.Timestamp{}, 1, nil
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			if called++; called == 1 {
				closedts.TargetDuration.Override(&st.SV, 0)
			}
			select {
			case calledCh <- struct{}{}:
			case <-stopper.ShouldQuiesce():
			}
			return hlc.Timestamp{
					WallTime: atomic.AddInt64(&ts, 1),
				}, map[roachpb.RangeID]ctpb.LAI{
					1: ctpb.LAI(atomic.LoadInt64(&ts)),
				}, true
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	// Get called once. While it's being called, we set the target duration to 0,
	// disabling the updates. We wait for someTime and ensure we don't get called
	// again. Then we re-enable the setting and ensure we do get called.
	<-calledCh
	const someTime = 10 * time.Millisecond
	select {
	case <-calledCh:
		t.Fatal("expected no updates to be sent")
	case <-time.After(someTime):
	}
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	<-calledCh
}
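
// validateEpochSequence is an illustrative sketch, not used by the tests
// above: it restates the two invariants that the reader in
// TestProviderSubscribeNotify checks (every observed epoch shows up at least
// twice, and epoch X appears only after epoch X-1 has been seen at least once)
// as a standalone validation over a recorded sequence of entries. The helper
// name is hypothetical and it relies only on packages this file already
// imports.
func validateEpochSequence(entries []ctpb.Entry) error {
	// Count how often each epoch has been observed so far.
	seen := map[ctpb.Epoch]int{}
	for i, entry := range entries {
		// Invariant 2: to observe epoch X, epoch X-1 must already have been seen
		// at least once. The first epoch emitted in the test above is 0, so it has
		// no predecessor and is exempt.
		if prev := entry.Epoch - 1; entry.Epoch > 0 && seen[prev] == 0 {
			return errors.Errorf(
				"entry %d: epoch %d observed before preceding epoch %d", i, entry.Epoch, prev)
		}
		seen[entry.Epoch]++
	}
	// Invariant 1: every epoch that appears must appear at least twice (morally
	// exactly twice, but the Provider may emit additional duplicates).
	for epoch, count := range seen {
		if count < 2 {
			return errors.Errorf("epoch %d observed only %d time(s)", epoch, count)
		}
	}
	return nil
}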