get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/norace_test.go (about) 1 // Copyright 2018-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 //go:build !race && !skip_no_race_tests 15 // +build !race,!skip_no_race_tests 16 17 package server 18 19 import ( 20 "bufio" 21 "bytes" 22 "compress/gzip" 23 "context" 24 "encoding/binary" 25 "encoding/json" 26 "errors" 27 "fmt" 28 "io" 29 "math" 30 "math/rand" 31 "net" 32 "net/http" 33 "net/url" 34 "path/filepath" 35 "reflect" 36 "runtime" 37 "runtime/debug" 38 "strconv" 39 "strings" 40 "sync" 41 "sync/atomic" 42 "testing" 43 "time" 44 45 "crypto/hmac" 46 crand "crypto/rand" 47 "crypto/sha256" 48 49 "get.pme.sh/pnats/server/avl" 50 "github.com/klauspost/compress/s2" 51 "github.com/nats-io/jwt/v2" 52 "github.com/nats-io/nats.go" 53 "github.com/nats-io/nkeys" 54 "github.com/nats-io/nuid" 55 ) 56 57 // IMPORTANT: Tests in this file are not executed when running with the -race flag. 58 // The test name should be prefixed with TestNoRace so we can run only 59 // those tests: go test -run=TestNoRace ... 60 61 func TestNoRaceAvoidSlowConsumerBigMessages(t *testing.T) { 62 opts := DefaultOptions() // Use defaults to make sure they avoid pending slow consumer. 63 opts.NoSystemAccount = true 64 s := RunServer(opts) 65 defer s.Shutdown() 66 67 nc1, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port)) 68 if err != nil { 69 t.Fatalf("Error on connect: %v", err) 70 } 71 defer nc1.Close() 72 73 nc2, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port)) 74 if err != nil { 75 t.Fatalf("Error on connect: %v", err) 76 } 77 defer nc2.Close() 78 79 data := make([]byte, 1024*1024) // 1MB payload 80 crand.Read(data) 81 82 expected := int32(500) 83 received := int32(0) 84 85 done := make(chan bool) 86 87 // Create Subscription. 88 nc1.Subscribe("slow.consumer", func(m *nats.Msg) { 89 // Just eat it so that we are not measuring 90 // code time, just delivery. 
		atomic.AddInt32(&received, 1)
		if received >= expected {
			done <- true
		}
	})

	// Create Error handler
	nc1.SetErrorHandler(func(c *nats.Conn, s *nats.Subscription, err error) {
		t.Fatalf("Received an error on the subscription's connection: %v\n", err)
	})

	nc1.Flush()

	for i := 0; i < int(expected); i++ {
		nc2.Publish("slow.consumer", data)
	}
	nc2.Flush()

	select {
	case <-done:
		return
	case <-time.After(10 * time.Second):
		r := atomic.LoadInt32(&received)
		if s.NumSlowConsumers() > 0 {
			t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", r, expected)
		}
		t.Fatalf("Failed to receive all large messages: %d of %d\n", r, expected)
	}
}

func TestNoRaceRoutedQueueAutoUnsubscribe(t *testing.T) {
	optsA, err := ProcessConfigFile("./configs/seed.conf")
	require_NoError(t, err)
	optsA.NoSigs, optsA.NoLog = true, true
	optsA.NoSystemAccount = true
	srvA := RunServer(optsA)
	defer srvA.Shutdown()

	srvARouteURL := fmt.Sprintf("nats://%s:%d", optsA.Cluster.Host, srvA.ClusterAddr().Port)
	optsB := nextServerOpts(optsA)
	optsB.Routes = RoutesFromStr(srvARouteURL)

	srvB := RunServer(optsB)
	defer srvB.Shutdown()

	// Wait for these 2 to connect to each other
	checkClusterFormed(t, srvA, srvB)

	// Have a client connection to each server
	ncA, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsA.Host, optsA.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer ncA.Close()

	ncB, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsB.Host, optsB.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer ncB.Close()

	rbar := int32(0)
	barCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbar, 1)
	}
	rbaz := int32(0)
	bazCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbaz, 1)
	}

	// Create 100 queue subs with auto-unsubscribe to each server for
	// group bar and group baz. So 200 total per queue group.
	cons := []*nats.Conn{ncA, ncB}
	for _, c := range cons {
		for i := 0; i < 100; i++ {
			qsub, err := c.QueueSubscribe("foo", "bar", barCb)
			if err != nil {
				t.Fatalf("Error on subscribe: %v", err)
			}
			if err := qsub.AutoUnsubscribe(1); err != nil {
				t.Fatalf("Error on auto-unsubscribe: %v", err)
			}
			qsub, err = c.QueueSubscribe("foo", "baz", bazCb)
			if err != nil {
				t.Fatalf("Error on subscribe: %v", err)
			}
			if err := qsub.AutoUnsubscribe(1); err != nil {
				t.Fatalf("Error on auto-unsubscribe: %v", err)
			}
		}
		c.Subscribe("TEST.COMPLETE", func(m *nats.Msg) {})
	}

	// We coalesce now, so for each server we will have all local subs (200) plus
	// two from the remote side, one for each queue group. We also create one more
	// and will wait until each server has 204 subscriptions, which makes sure
	// that everything is set up.
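	// (That is, per server: 200 local queue subs + 1 local TEST.COMPLETE sub,
	// plus 2 coalesced remote queue subs and 1 remote TEST.COMPLETE sub = 204.)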
188 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 189 subsA := srvA.NumSubscriptions() 190 subsB := srvB.NumSubscriptions() 191 if subsA != 204 || subsB != 204 { 192 return fmt.Errorf("Not all subs processed yet: %d and %d", subsA, subsB) 193 } 194 return nil 195 }) 196 197 expected := int32(200) 198 // Now send messages from each server 199 for i := int32(0); i < expected; i++ { 200 c := cons[i%2] 201 c.Publish("foo", []byte("Don't Drop Me!")) 202 } 203 for _, c := range cons { 204 c.Flush() 205 } 206 207 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 208 nbar := atomic.LoadInt32(&rbar) 209 nbaz := atomic.LoadInt32(&rbaz) 210 if nbar == expected && nbaz == expected { 211 return nil 212 } 213 return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'", 214 expected, atomic.LoadInt32(&rbar), atomic.LoadInt32(&rbaz)) 215 }) 216 } 217 218 func TestNoRaceClosedSlowConsumerWriteDeadline(t *testing.T) { 219 opts := DefaultOptions() 220 opts.NoSystemAccount = true 221 opts.WriteDeadline = 10 * time.Millisecond // Make very small to trip. 222 opts.MaxPending = 500 * 1024 * 1024 // Set high so it will not trip here. 223 s := RunServer(opts) 224 defer s.Shutdown() 225 226 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 227 if err != nil { 228 t.Fatalf("Error on connect: %v", err) 229 } 230 defer c.Close() 231 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 232 t.Fatalf("Error sending protocols to server: %v", err) 233 } 234 // Reduce socket buffer to increase reliability of data backing up in the server destined 235 // for our subscribed client. 236 c.(*net.TCPConn).SetReadBuffer(128) 237 238 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 239 sender, err := nats.Connect(url) 240 if err != nil { 241 t.Fatalf("Error on connect: %v", err) 242 } 243 defer sender.Close() 244 245 payload := make([]byte, 1024*1024) 246 for i := 0; i < 100; i++ { 247 if err := sender.Publish("foo", payload); err != nil { 248 t.Fatalf("Error on publish: %v", err) 249 } 250 } 251 252 // Flush sender connection to ensure that all data has been sent. 253 if err := sender.Flush(); err != nil { 254 t.Fatalf("Error on flush: %v", err) 255 } 256 257 // At this point server should have closed connection c. 258 checkClosedConns(t, s, 1, 2*time.Second) 259 conns := s.closedClients() 260 if lc := len(conns); lc != 1 { 261 t.Fatalf("len(conns) expected to be %d, got %d\n", 1, lc) 262 } 263 checkReason(t, conns[0].Reason, SlowConsumerWriteDeadline) 264 } 265 266 func TestNoRaceClosedSlowConsumerPendingBytes(t *testing.T) { 267 opts := DefaultOptions() 268 opts.NoSystemAccount = true 269 opts.WriteDeadline = 30 * time.Second // Wait for long time so write deadline does not trigger slow consumer. 270 opts.MaxPending = 1 * 1024 * 1024 // Set to low value (1MB) to allow SC to trip. 271 s := RunServer(opts) 272 defer s.Shutdown() 273 274 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 275 if err != nil { 276 t.Fatalf("Error on connect: %v", err) 277 } 278 defer c.Close() 279 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 280 t.Fatalf("Error sending protocols to server: %v", err) 281 } 282 // Reduce socket buffer to increase reliability of data backing up in the server destined 283 // for our subscribed client. 
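	// (With a tiny receive buffer the client applies TCP backpressure, so the
	// server's outbound pending for this connection grows past MaxPending (1MB here)
	// and the connection is closed with a slow consumer condition.)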
284 c.(*net.TCPConn).SetReadBuffer(128) 285 286 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 287 sender, err := nats.Connect(url) 288 if err != nil { 289 t.Fatalf("Error on connect: %v", err) 290 } 291 defer sender.Close() 292 293 payload := make([]byte, 1024*1024) 294 for i := 0; i < 100; i++ { 295 if err := sender.Publish("foo", payload); err != nil { 296 t.Fatalf("Error on publish: %v", err) 297 } 298 } 299 300 // Flush sender connection to ensure that all data has been sent. 301 if err := sender.Flush(); err != nil { 302 t.Fatalf("Error on flush: %v", err) 303 } 304 305 // At this point server should have closed connection c. 306 checkClosedConns(t, s, 1, 2*time.Second) 307 conns := s.closedClients() 308 if lc := len(conns); lc != 1 { 309 t.Fatalf("len(conns) expected to be %d, got %d\n", 1, lc) 310 } 311 checkReason(t, conns[0].Reason, SlowConsumerPendingBytes) 312 } 313 314 func TestNoRaceSlowConsumerPendingBytes(t *testing.T) { 315 opts := DefaultOptions() 316 opts.NoSystemAccount = true 317 opts.WriteDeadline = 30 * time.Second // Wait for long time so write deadline does not trigger slow consumer. 318 opts.MaxPending = 1 * 1024 * 1024 // Set to low value (1MB) to allow SC to trip. 319 s := RunServer(opts) 320 defer s.Shutdown() 321 322 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 323 if err != nil { 324 t.Fatalf("Error on connect: %v", err) 325 } 326 defer c.Close() 327 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 328 t.Fatalf("Error sending protocols to server: %v", err) 329 } 330 // Reduce socket buffer to increase reliability of data backing up in the server destined 331 // for our subscribed client. 332 c.(*net.TCPConn).SetReadBuffer(128) 333 334 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 335 sender, err := nats.Connect(url) 336 if err != nil { 337 t.Fatalf("Error on connect: %v", err) 338 } 339 defer sender.Close() 340 341 payload := make([]byte, 1024*1024) 342 for i := 0; i < 100; i++ { 343 if err := sender.Publish("foo", payload); err != nil { 344 t.Fatalf("Error on publish: %v", err) 345 } 346 } 347 348 // Flush sender connection to ensure that all data has been sent. 349 if err := sender.Flush(); err != nil { 350 t.Fatalf("Error on flush: %v", err) 351 } 352 353 // At this point server should have closed connection c. 354 355 // On certain platforms, it may take more than one call before 356 // getting the error. 357 for i := 0; i < 100; i++ { 358 if _, err := c.Write([]byte("PUB bar 5\r\nhello\r\n")); err != nil { 359 // ok 360 return 361 } 362 } 363 t.Fatal("Connection should have been closed") 364 } 365 366 func TestNoRaceGatewayNoMissingReplies(t *testing.T) { 367 // This test will have following setup: 368 // 369 // responder1 requestor 370 // | | 371 // v v 372 // [A1]<-------gw------------[B1] 373 // | \ | 374 // | \______gw__________ | route 375 // | _\| | 376 // [ ]--------gw----------->[ ] 377 // [A2]<-------gw------------[B2] 378 // [ ] [ ] 379 // ^ 380 // | 381 // responder2 382 // 383 // There is a possible race that when the requestor creates 384 // a subscription on the reply subject, the subject interest 385 // being sent from the inbound gateway, and B1 having none, 386 // the SUB first goes to B2 before being sent to A1 from 387 // B2's inbound GW. But the request can go from B1 to A1 388 // right away and the responder1 connecting to A1 may send 389 // back the reply before the interest on the reply makes it 390 // to A1 (from B2). 
391 // This test will also verify that if the responder is instead 392 // connected to A2, the reply is properly received by requestor 393 // on B1. 394 395 // For this test we want to be in interestOnly mode, so 396 // make it happen quickly 397 gatewayMaxRUnsubBeforeSwitch = 1 398 defer func() { gatewayMaxRUnsubBeforeSwitch = defaultGatewayMaxRUnsubBeforeSwitch }() 399 400 // Start with setting up A2 and B2. 401 ob2 := testDefaultOptionsForGateway("B") 402 sb2 := runGatewayServer(ob2) 403 defer sb2.Shutdown() 404 405 oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2) 406 sa2 := runGatewayServer(oa2) 407 defer sa2.Shutdown() 408 409 waitForOutboundGateways(t, sa2, 1, time.Second) 410 waitForInboundGateways(t, sa2, 1, time.Second) 411 waitForOutboundGateways(t, sb2, 1, time.Second) 412 waitForInboundGateways(t, sb2, 1, time.Second) 413 414 // Now start A1 which will connect to B2 415 oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2) 416 oa1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa2.Cluster.Host, oa2.Cluster.Port)) 417 sa1 := runGatewayServer(oa1) 418 defer sa1.Shutdown() 419 420 waitForOutboundGateways(t, sa1, 1, time.Second) 421 waitForInboundGateways(t, sb2, 2, time.Second) 422 423 checkClusterFormed(t, sa1, sa2) 424 425 // Finally, start B1 that will connect to A1. 426 ob1 := testGatewayOptionsFromToWithServers(t, "B", "A", sa1) 427 ob1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", ob2.Cluster.Host, ob2.Cluster.Port)) 428 sb1 := runGatewayServer(ob1) 429 defer sb1.Shutdown() 430 431 // Check that we have the outbound gateway from B1 to A1 432 checkFor(t, 3*time.Second, 15*time.Millisecond, func() error { 433 c := sb1.getOutboundGatewayConnection("A") 434 if c == nil { 435 return fmt.Errorf("Outbound connection to A not created yet") 436 } 437 c.mu.Lock() 438 name := c.opts.Name 439 nc := c.nc 440 c.mu.Unlock() 441 if name != sa1.ID() { 442 // Force a disconnect 443 nc.Close() 444 return fmt.Errorf("Was unable to have B1 connect to A1") 445 } 446 return nil 447 }) 448 449 waitForInboundGateways(t, sa1, 1, time.Second) 450 checkClusterFormed(t, sb1, sb2) 451 452 a1URL := fmt.Sprintf("nats://%s:%d", oa1.Host, oa1.Port) 453 a2URL := fmt.Sprintf("nats://%s:%d", oa2.Host, oa2.Port) 454 b1URL := fmt.Sprintf("nats://%s:%d", ob1.Host, ob1.Port) 455 b2URL := fmt.Sprintf("nats://%s:%d", ob2.Host, ob2.Port) 456 457 ncb1 := natsConnect(t, b1URL) 458 defer ncb1.Close() 459 460 ncb2 := natsConnect(t, b2URL) 461 defer ncb2.Close() 462 463 natsSubSync(t, ncb1, "just.a.sub") 464 natsSubSync(t, ncb2, "just.a.sub") 465 checkExpectedSubs(t, 2, sb1, sb2) 466 467 // For this test, we want A to be checking B's interest in order 468 // to send messages (which would cause replies to be dropped if 469 // there is no interest registered on A). So from A servers, 470 // send to various subjects and cause B's to switch to interestOnly 471 // mode. 
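	// (Rough mechanism: gateways start out optimistic, so A sends regardless of
	// interest and B responds with per-subject no-interest protocols; once more
	// than gatewayMaxRUnsubBeforeSwitch of those accumulate for an account --
	// just 1 with the override above -- that account is switched to interest-only.)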
472 nca1 := natsConnect(t, a1URL) 473 defer nca1.Close() 474 for i := 0; i < 10; i++ { 475 natsPub(t, nca1, fmt.Sprintf("reject.%d", i), []byte("hello")) 476 } 477 nca2 := natsConnect(t, a2URL) 478 defer nca2.Close() 479 for i := 0; i < 10; i++ { 480 natsPub(t, nca2, fmt.Sprintf("reject.%d", i), []byte("hello")) 481 } 482 483 checkSwitchedMode := func(t *testing.T, s *Server) { 484 t.Helper() 485 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 486 var switchedMode bool 487 c := s.getOutboundGatewayConnection("B") 488 ei, _ := c.gw.outsim.Load(globalAccountName) 489 if ei != nil { 490 e := ei.(*outsie) 491 e.RLock() 492 switchedMode = e.ni == nil && e.mode == InterestOnly 493 e.RUnlock() 494 } 495 if !switchedMode { 496 return fmt.Errorf("Still not switched mode") 497 } 498 return nil 499 }) 500 } 501 checkSwitchedMode(t, sa1) 502 checkSwitchedMode(t, sa2) 503 504 // Setup a subscriber on _INBOX.> on each of A's servers. 505 total := 1000 506 expected := int32(total) 507 rcvOnA := int32(0) 508 qrcvOnA := int32(0) 509 natsSub(t, nca1, "myreply.>", func(_ *nats.Msg) { 510 atomic.AddInt32(&rcvOnA, 1) 511 }) 512 natsQueueSub(t, nca2, "myreply.>", "bar", func(_ *nats.Msg) { 513 atomic.AddInt32(&qrcvOnA, 1) 514 }) 515 checkExpectedSubs(t, 2, sa1, sa2) 516 517 // Ok.. so now we will run the actual test where we 518 // create a responder on A1 and make sure that every 519 // single request from B1 gets the reply. Will repeat 520 // test with responder connected to A2. 521 sendReqs := func(t *testing.T, subConn *nats.Conn) { 522 t.Helper() 523 responder := natsSub(t, subConn, "foo", func(m *nats.Msg) { 524 m.Respond([]byte("reply")) 525 }) 526 natsFlush(t, subConn) 527 checkExpectedSubs(t, 3, sa1, sa2) 528 529 // We are not going to use Request() because this sets 530 // a wildcard subscription on an INBOX and less likely 531 // to produce the race. Instead we will explicitly set 532 // the subscription on the reply subject and create one 533 // per request. 534 for i := 0; i < total/2; i++ { 535 reply := fmt.Sprintf("myreply.%d", i) 536 replySub := natsQueueSubSync(t, ncb1, reply, "bar") 537 natsFlush(t, ncb1) 538 539 // Let's make sure we have interest on B2. 540 if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 { 541 checkFor(t, time.Second, time.Millisecond, func() error { 542 if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 { 543 return fmt.Errorf("B still not registered interest on %s", reply) 544 } 545 return nil 546 }) 547 } 548 natsPubReq(t, ncb1, "foo", reply, []byte("request")) 549 if _, err := replySub.NextMsg(time.Second); err != nil { 550 t.Fatalf("Did not receive reply: %v", err) 551 } 552 natsUnsub(t, replySub) 553 } 554 555 responder.Unsubscribe() 556 natsFlush(t, subConn) 557 checkExpectedSubs(t, 2, sa1, sa2) 558 } 559 sendReqs(t, nca1) 560 sendReqs(t, nca2) 561 562 checkFor(t, time.Second, 15*time.Millisecond, func() error { 563 if n := atomic.LoadInt32(&rcvOnA); n != expected { 564 return fmt.Errorf("Subs on A expected to get %v replies, got %v", expected, n) 565 } 566 return nil 567 }) 568 569 // We should not have received a single message on the queue sub 570 // on cluster A because messages will have been delivered to 571 // the member on cluster B. 
572 if n := atomic.LoadInt32(&qrcvOnA); n != 0 { 573 t.Fatalf("Queue sub on A should not have received message, got %v", n) 574 } 575 } 576 577 func TestNoRaceRouteMemUsage(t *testing.T) { 578 oa := DefaultOptions() 579 sa := RunServer(oa) 580 defer sa.Shutdown() 581 582 ob := DefaultOptions() 583 ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port)) 584 sb := RunServer(ob) 585 defer sb.Shutdown() 586 587 checkClusterFormed(t, sa, sb) 588 589 responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port)) 590 defer responder.Close() 591 for i := 0; i < 10; i++ { 592 natsSub(t, responder, "foo", func(m *nats.Msg) { 593 m.Respond(m.Data) 594 }) 595 } 596 natsFlush(t, responder) 597 598 payload := make([]byte, 50*1024) 599 600 bURL := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 601 602 // Capture mem usage 603 mem := runtime.MemStats{} 604 runtime.ReadMemStats(&mem) 605 inUseBefore := mem.HeapInuse 606 607 for i := 0; i < 100; i++ { 608 requestor := natsConnect(t, bURL) 609 // Don't use a defer here otherwise that will make the memory check fail! 610 // We are closing the connection just after these few instructions that 611 // are not calling t.Fatal() anyway. 612 inbox := nats.NewInbox() 613 sub := natsSubSync(t, requestor, inbox) 614 natsPubReq(t, requestor, "foo", inbox, payload) 615 for j := 0; j < 10; j++ { 616 natsNexMsg(t, sub, time.Second) 617 } 618 requestor.Close() 619 } 620 621 runtime.GC() 622 debug.FreeOSMemory() 623 runtime.ReadMemStats(&mem) 624 inUseNow := mem.HeapInuse 625 if inUseNow > 3*inUseBefore { 626 t.Fatalf("Heap in-use before was %v, now %v: too high", inUseBefore, inUseNow) 627 } 628 } 629 630 func TestNoRaceRouteCache(t *testing.T) { 631 maxPerAccountCacheSize = 20 632 prunePerAccountCacheSize = 5 633 closedSubsCheckInterval = 250 * time.Millisecond 634 635 defer func() { 636 maxPerAccountCacheSize = defaultMaxPerAccountCacheSize 637 prunePerAccountCacheSize = defaultPrunePerAccountCacheSize 638 closedSubsCheckInterval = defaultClosedSubsCheckInterval 639 }() 640 641 for _, test := range []struct { 642 name string 643 useQueue bool 644 }{ 645 {"plain_sub", false}, 646 {"queue_sub", true}, 647 } { 648 t.Run(test.name, func(t *testing.T) { 649 650 oa := DefaultOptions() 651 oa.NoSystemAccount = true 652 oa.Cluster.PoolSize = -1 653 sa := RunServer(oa) 654 defer sa.Shutdown() 655 656 ob := DefaultOptions() 657 ob.NoSystemAccount = true 658 ob.Cluster.PoolSize = -1 659 ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port)) 660 sb := RunServer(ob) 661 defer sb.Shutdown() 662 663 checkClusterFormed(t, sa, sb) 664 665 responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port)) 666 defer responder.Close() 667 natsSub(t, responder, "foo", func(m *nats.Msg) { 668 m.Respond(m.Data) 669 }) 670 natsFlush(t, responder) 671 672 checkExpectedSubs(t, 1, sa) 673 checkExpectedSubs(t, 1, sb) 674 675 bURL := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 676 requestor := natsConnect(t, bURL) 677 defer requestor.Close() 678 679 ch := make(chan struct{}, 1) 680 cb := func(_ *nats.Msg) { 681 select { 682 case ch <- struct{}{}: 683 default: 684 } 685 } 686 687 sendReqs := func(t *testing.T, nc *nats.Conn, count int, unsub bool) { 688 t.Helper() 689 for i := 0; i < count; i++ { 690 inbox := nats.NewInbox() 691 var sub *nats.Subscription 692 if test.useQueue { 693 sub = natsQueueSub(t, nc, inbox, "queue", cb) 694 } else { 695 sub = natsSub(t, nc, inbox, cb) 696 } 697 natsPubReq(t, nc, "foo", 
inbox, []byte("hello")) 698 select { 699 case <-ch: 700 case <-time.After(time.Second): 701 t.Fatalf("Failed to get reply") 702 } 703 if unsub { 704 natsUnsub(t, sub) 705 } 706 } 707 } 708 sendReqs(t, requestor, maxPerAccountCacheSize+1, true) 709 710 var route *client 711 sb.mu.Lock() 712 route = getFirstRoute(sb) 713 sb.mu.Unlock() 714 715 checkExpected := func(t *testing.T, expected int) { 716 t.Helper() 717 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 718 route.mu.Lock() 719 n := len(route.in.pacache) 720 route.mu.Unlock() 721 if n != expected { 722 return fmt.Errorf("Expected %v subs in the cache, got %v", expected, n) 723 } 724 return nil 725 }) 726 } 727 checkExpected(t, (maxPerAccountCacheSize+1)-(prunePerAccountCacheSize+1)) 728 729 // Wait for more than the orphan check 730 time.Sleep(2 * closedSubsCheckInterval) 731 732 // Add a new subs up to point where new prune would occur 733 sendReqs(t, requestor, prunePerAccountCacheSize+1, false) 734 735 // Now closed subs should have been removed, so expected 736 // subs in the cache should be the new ones. 737 checkExpected(t, prunePerAccountCacheSize+1) 738 739 // Now try wil implicit unsubscribe (due to connection close) 740 sendReqs(t, requestor, maxPerAccountCacheSize+1, false) 741 requestor.Close() 742 743 checkExpected(t, maxPerAccountCacheSize-prunePerAccountCacheSize) 744 745 // Wait for more than the orphan check 746 time.Sleep(2 * closedSubsCheckInterval) 747 748 // Now create new connection and send prunePerAccountCacheSize+1 749 // and that should cause all subs from previous connection to be 750 // removed from cache 751 requestor = natsConnect(t, bURL) 752 defer requestor.Close() 753 754 sendReqs(t, requestor, prunePerAccountCacheSize+1, false) 755 checkExpected(t, prunePerAccountCacheSize+1) 756 }) 757 } 758 } 759 760 func TestNoRaceFetchAccountDoesNotRegisterAccountTwice(t *testing.T) { 761 sa, oa, sb, ob, _ := runTrustedGateways(t) 762 defer sa.Shutdown() 763 defer sb.Shutdown() 764 765 // Let's create a user account. 766 okp, _ := nkeys.FromSeed(oSeed) 767 akp, _ := nkeys.CreateAccount() 768 pub, _ := akp.PublicKey() 769 nac := jwt.NewAccountClaims(pub) 770 jwt, _ := nac.Encode(okp) 771 userAcc := pub 772 773 // Replace B's account resolver with one that introduces 774 // delay during the Fetch() 775 sac := &slowAccResolver{AccountResolver: sb.AccountResolver()} 776 sb.SetAccountResolver(sac) 777 778 // Add the account in sa and sb 779 addAccountToMemResolver(sa, userAcc, jwt) 780 addAccountToMemResolver(sb, userAcc, jwt) 781 782 // Tell the slow account resolver which account to slow down 783 sac.Lock() 784 sac.acc = userAcc 785 sac.Unlock() 786 787 urlA := fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port) 788 urlB := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 789 790 nca, err := nats.Connect(urlA, createUserCreds(t, sa, akp)) 791 if err != nil { 792 t.Fatalf("Error connecting to A: %v", err) 793 } 794 defer nca.Close() 795 796 // Since there is an optimistic send, this message will go to B 797 // and on processing this message, B will lookup/fetch this 798 // account, which can produce race with the fetch of this 799 // account from A's system account that sent a notification 800 // about this account, or with the client connect just after 801 // that. 
802 nca.Publish("foo", []byte("hello")) 803 804 // Now connect and create a subscription on B 805 ncb, err := nats.Connect(urlB, createUserCreds(t, sb, akp)) 806 if err != nil { 807 t.Fatalf("Error connecting to A: %v", err) 808 } 809 defer ncb.Close() 810 sub, err := ncb.SubscribeSync("foo") 811 if err != nil { 812 t.Fatalf("Error on subscribe: %v", err) 813 } 814 ncb.Flush() 815 816 // Now send messages from A and B should ultimately start to receive 817 // them (once the subscription has been correctly registered) 818 ok := false 819 for i := 0; i < 10; i++ { 820 nca.Publish("foo", []byte("hello")) 821 if _, err := sub.NextMsg(100 * time.Millisecond); err != nil { 822 continue 823 } 824 ok = true 825 break 826 } 827 if !ok { 828 t.Fatalf("B should be able to receive messages") 829 } 830 831 checkTmpAccounts := func(t *testing.T, s *Server) { 832 t.Helper() 833 empty := true 834 s.tmpAccounts.Range(func(_, _ interface{}) bool { 835 empty = false 836 return false 837 }) 838 if !empty { 839 t.Fatalf("tmpAccounts is not empty") 840 } 841 } 842 checkTmpAccounts(t, sa) 843 checkTmpAccounts(t, sb) 844 } 845 846 func TestNoRaceWriteDeadline(t *testing.T) { 847 opts := DefaultOptions() 848 opts.NoSystemAccount = true 849 opts.WriteDeadline = 30 * time.Millisecond 850 s := RunServer(opts) 851 defer s.Shutdown() 852 853 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 854 if err != nil { 855 t.Fatalf("Error on connect: %v", err) 856 } 857 defer c.Close() 858 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 859 t.Fatalf("Error sending protocols to server: %v", err) 860 } 861 // Reduce socket buffer to increase reliability of getting 862 // write deadline errors. 863 c.(*net.TCPConn).SetReadBuffer(4) 864 865 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 866 sender, err := nats.Connect(url) 867 if err != nil { 868 t.Fatalf("Error on connect: %v", err) 869 } 870 defer sender.Close() 871 872 payload := make([]byte, 1000000) 873 total := 1000 874 for i := 0; i < total; i++ { 875 if err := sender.Publish("foo", payload); err != nil { 876 t.Fatalf("Error on publish: %v", err) 877 } 878 } 879 // Flush sender connection to ensure that all data has been sent. 880 if err := sender.Flush(); err != nil { 881 t.Fatalf("Error on flush: %v", err) 882 } 883 884 // At this point server should have closed connection c. 885 886 // On certain platforms, it may take more than one call before 887 // getting the error. 
	for i := 0; i < 100; i++ {
		if _, err := c.Write([]byte("PUB bar 5\r\nhello\r\n")); err != nil {
			// ok
			return
		}
	}
	t.Fatal("Connection should have been closed")
}

func TestNoRaceLeafNodeClusterNameConflictDeadlock(t *testing.T) {
	o := DefaultOptions()
	o.LeafNode.Port = -1
	s := RunServer(o)
	defer s.Shutdown()

	u, err := url.Parse(fmt.Sprintf("nats://127.0.0.1:%d", o.LeafNode.Port))
	if err != nil {
		t.Fatalf("Error parsing url: %v", err)
	}

	o1 := DefaultOptions()
	o1.ServerName = "A1"
	o1.Cluster.Name = "clusterA"
	o1.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
	s1 := RunServer(o1)
	defer s1.Shutdown()

	checkLeafNodeConnected(t, s1)

	o2 := DefaultOptions()
	o2.ServerName = "A2"
	o2.Cluster.Name = "clusterA"
	o2.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", o1.Cluster.Port))
	o2.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
	s2 := RunServer(o2)
	defer s2.Shutdown()

	checkLeafNodeConnected(t, s2)
	checkClusterFormed(t, s1, s2)

	o3 := DefaultOptions()
	o3.ServerName = "A3"
	o3.Cluster.Name = "" // intentionally not set
	o3.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", o1.Cluster.Port))
	o3.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
	s3 := RunServer(o3)
	defer s3.Shutdown()

	checkLeafNodeConnected(t, s3)
	checkClusterFormed(t, s1, s2, s3)
}

// This test is the same as TestAccountAddServiceImportRace but, running
// without the -race flag, it can more easily catch a possible duplicate
// sid, which would result in fewer than expected subscriptions in the
// account's internal subscriptions map.
func TestNoRaceAccountAddServiceImportRace(t *testing.T) {
	TestAccountAddServiceImportRace(t)
}

// Similar to the routed version. Make sure we receive all of the
// messages with auto-unsubscribe enabled.
func TestNoRaceQueueAutoUnsubscribe(t *testing.T) {
	opts := DefaultOptions()
	s := RunServer(opts)
	defer s.Shutdown()

	nc, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port))
	if err != nil {
		t.Fatalf("Error on connect: %v", err)
	}
	defer nc.Close()

	rbar := int32(0)
	barCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbar, 1)
	}
	rbaz := int32(0)
	bazCb := func(m *nats.Msg) {
		atomic.AddInt32(&rbaz, 1)
	}

	// Create 1000 subscriptions with auto-unsubscribe of 1.
	// Do two groups, one bar and one baz.
972 total := 1000 973 for i := 0; i < total; i++ { 974 qsub, err := nc.QueueSubscribe("foo", "bar", barCb) 975 if err != nil { 976 t.Fatalf("Error on subscribe: %v", err) 977 } 978 if err := qsub.AutoUnsubscribe(1); err != nil { 979 t.Fatalf("Error on auto-unsubscribe: %v", err) 980 } 981 qsub, err = nc.QueueSubscribe("foo", "baz", bazCb) 982 if err != nil { 983 t.Fatalf("Error on subscribe: %v", err) 984 } 985 if err := qsub.AutoUnsubscribe(1); err != nil { 986 t.Fatalf("Error on auto-unsubscribe: %v", err) 987 } 988 } 989 nc.Flush() 990 991 expected := int32(total) 992 for i := int32(0); i < expected; i++ { 993 nc.Publish("foo", []byte("Don't Drop Me!")) 994 } 995 nc.Flush() 996 997 checkFor(t, 5*time.Second, 10*time.Millisecond, func() error { 998 nbar := atomic.LoadInt32(&rbar) 999 nbaz := atomic.LoadInt32(&rbaz) 1000 if nbar == expected && nbaz == expected { 1001 return nil 1002 } 1003 return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'", 1004 expected, atomic.LoadInt32(&rbar), atomic.LoadInt32(&rbaz)) 1005 }) 1006 } 1007 1008 func TestNoRaceAcceptLoopsDoNotLeaveOpenedConn(t *testing.T) { 1009 for _, test := range []struct { 1010 name string 1011 url func(o *Options) (string, int) 1012 }{ 1013 {"client", func(o *Options) (string, int) { return o.Host, o.Port }}, 1014 {"route", func(o *Options) (string, int) { return o.Cluster.Host, o.Cluster.Port }}, 1015 {"gateway", func(o *Options) (string, int) { return o.Gateway.Host, o.Gateway.Port }}, 1016 {"leafnode", func(o *Options) (string, int) { return o.LeafNode.Host, o.LeafNode.Port }}, 1017 {"websocket", func(o *Options) (string, int) { return o.Websocket.Host, o.Websocket.Port }}, 1018 } { 1019 t.Run(test.name, func(t *testing.T) { 1020 o := DefaultOptions() 1021 o.DisableShortFirstPing = true 1022 o.Accounts = []*Account{NewAccount("$SYS")} 1023 o.SystemAccount = "$SYS" 1024 o.Cluster.Name = "abc" 1025 o.Cluster.Host = "127.0.0.1" 1026 o.Cluster.Port = -1 1027 o.Gateway.Name = "abc" 1028 o.Gateway.Host = "127.0.0.1" 1029 o.Gateway.Port = -1 1030 o.LeafNode.Host = "127.0.0.1" 1031 o.LeafNode.Port = -1 1032 o.Websocket.Host = "127.0.0.1" 1033 o.Websocket.Port = -1 1034 o.Websocket.HandshakeTimeout = 1 1035 o.Websocket.NoTLS = true 1036 s := RunServer(o) 1037 defer s.Shutdown() 1038 1039 host, port := test.url(o) 1040 url := fmt.Sprintf("%s:%d", host, port) 1041 var conns []net.Conn 1042 1043 wg := sync.WaitGroup{} 1044 wg.Add(1) 1045 done := make(chan struct{}, 1) 1046 go func() { 1047 defer wg.Done() 1048 // Have an upper limit 1049 for i := 0; i < 200; i++ { 1050 c, err := net.Dial("tcp", url) 1051 if err != nil { 1052 return 1053 } 1054 conns = append(conns, c) 1055 select { 1056 case <-done: 1057 return 1058 default: 1059 } 1060 } 1061 }() 1062 time.Sleep(15 * time.Millisecond) 1063 s.Shutdown() 1064 close(done) 1065 wg.Wait() 1066 for _, c := range conns { 1067 c.SetReadDeadline(time.Now().Add(2 * time.Second)) 1068 br := bufio.NewReader(c) 1069 // Read INFO for connections that were accepted 1070 _, _, err := br.ReadLine() 1071 if err == nil { 1072 // After that, the connection should be closed, 1073 // so we should get an error here. 1074 _, _, err = br.ReadLine() 1075 } 1076 // We expect an io.EOF or any other error indicating the use of a closed 1077 // connection, but we should not get the timeout error. 
				if ne, ok := err.(net.Error); ok && ne.Timeout() {
					err = nil
				}
				if err == nil {
					var buf [10]byte
					c.SetDeadline(time.Now().Add(2 * time.Second))
					c.Write([]byte("C"))
					_, err = c.Read(buf[:])
					if ne, ok := err.(net.Error); ok && ne.Timeout() {
						err = nil
					}
				}
				if err == nil {
					t.Fatalf("Connection should have been closed")
				}
				c.Close()
			}
		})
	}
}

func TestNoRaceJetStreamDeleteStreamManyConsumers(t *testing.T) {
	s := RunBasicJetStreamServer(t)
	defer s.Shutdown()

	mname := "MYS"
	mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: mname, Storage: FileStorage})
	if err != nil {
		t.Fatalf("Unexpected error adding stream: %v", err)
	}

	// This number needs to be higher than the internal sendq size to trigger what this test is testing.
	for i := 0; i < 2000; i++ {
		_, err := mset.addConsumer(&ConsumerConfig{
			Durable:        fmt.Sprintf("D-%d", i),
			DeliverSubject: fmt.Sprintf("deliver.%d", i),
		})
		if err != nil {
			t.Fatalf("Error creating consumer: %v", err)
		}
	}
	// With the bug this would not return and would hang.
	mset.delete()
}

// We used to swap accounts on an inbound message when processing service imports.
// Until JetStream this was kind of OK, but with JetStream we can have pull consumers
// trying to access the client's account from another goroutine, which causes issues.
// This is not limited to the case above, it's just the one that exposed it.
// This test shows that issue and that the fix works, meaning we no longer swap c.acc.
func TestNoRaceJetStreamServiceImportAccountSwapIssue(t *testing.T) {
	s := RunBasicJetStreamServer(t)
	defer s.Shutdown()

	// Client based API
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo", "bar"},
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	sub, err := js.PullSubscribe("foo", "dlc")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	beforeSubs := s.NumSubscriptions()

	// How long we want both sides to run.
	timeout := time.Now().Add(3 * time.Second)
	errs := make(chan error, 1)

	// Publishing side, which will signal the consumer that is waiting and which will access c.acc.
	// If the publish operation runs concurrently we will catch c.acc being $SYS some of the time.
	go func() {
		time.Sleep(100 * time.Millisecond)
		for time.Now().Before(timeout) {
			// This will signal the delivery of the pull messages.
			js.Publish("foo", []byte("Hello"))
			// This will swap the account because of the JetStream service import.
			// We can get an error here with the bug or not.
			if _, err := js.StreamInfo("TEST"); err != nil {
				errs <- err
				return
			}
		}
		errs <- nil
	}()

	// Pull messages flow.
	var received int
	for time.Now().Before(timeout.Add(2 * time.Second)) {
		if msgs, err := sub.Fetch(1, nats.MaxWait(200*time.Millisecond)); err == nil {
			for _, m := range msgs {
				received++
				m.AckSync()
			}
		} else {
			break
		}
	}
	// Wait on the publisher goroutine and check for errors.
	if err := <-errs; err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	// Double check all received.
1189 si, err := js.StreamInfo("TEST") 1190 if err != nil { 1191 t.Fatalf("Unexpected error: %v", err) 1192 } 1193 if int(si.State.Msgs) != received { 1194 t.Fatalf("Expected to receive %d msgs, only got %d", si.State.Msgs, received) 1195 } 1196 // Now check for leaked subs from the fetch call above. That is what we first saw from the bug. 1197 if afterSubs := s.NumSubscriptions(); afterSubs != beforeSubs { 1198 t.Fatalf("Leaked subscriptions: %d before, %d after", beforeSubs, afterSubs) 1199 } 1200 } 1201 1202 func TestNoRaceJetStreamAPIStreamListPaging(t *testing.T) { 1203 s := RunBasicJetStreamServer(t) 1204 defer s.Shutdown() 1205 1206 // Create 2X limit 1207 streamsNum := 2 * JSApiNamesLimit 1208 for i := 1; i <= streamsNum; i++ { 1209 name := fmt.Sprintf("STREAM-%06d", i) 1210 cfg := StreamConfig{Name: name, Storage: MemoryStorage} 1211 _, err := s.GlobalAccount().addStream(&cfg) 1212 if err != nil { 1213 t.Fatalf("Unexpected error adding stream: %v", err) 1214 } 1215 } 1216 1217 // Client for API requests. 1218 nc := clientConnectToServer(t, s) 1219 defer nc.Close() 1220 1221 reqList := func(offset int) []byte { 1222 t.Helper() 1223 var req []byte 1224 if offset > 0 { 1225 req, _ = json.Marshal(&ApiPagedRequest{Offset: offset}) 1226 } 1227 resp, err := nc.Request(JSApiStreams, req, time.Second) 1228 if err != nil { 1229 t.Fatalf("Unexpected error getting stream list: %v", err) 1230 } 1231 return resp.Data 1232 } 1233 1234 checkResp := func(resp []byte, expectedLen, expectedOffset int) { 1235 t.Helper() 1236 var listResponse JSApiStreamNamesResponse 1237 if err := json.Unmarshal(resp, &listResponse); err != nil { 1238 t.Fatalf("Unexpected error: %v", err) 1239 } 1240 if len(listResponse.Streams) != expectedLen { 1241 t.Fatalf("Expected only %d streams but got %d", expectedLen, len(listResponse.Streams)) 1242 } 1243 if listResponse.Total != streamsNum { 1244 t.Fatalf("Expected total to be %d but got %d", streamsNum, listResponse.Total) 1245 } 1246 if listResponse.Offset != expectedOffset { 1247 t.Fatalf("Expected offset to be %d but got %d", expectedOffset, listResponse.Offset) 1248 } 1249 if expectedLen < 1 { 1250 return 1251 } 1252 // Make sure we get the right stream. 1253 sname := fmt.Sprintf("STREAM-%06d", expectedOffset+1) 1254 if listResponse.Streams[0] != sname { 1255 t.Fatalf("Expected stream %q to be first, got %q", sname, listResponse.Streams[0]) 1256 } 1257 } 1258 1259 checkResp(reqList(0), JSApiNamesLimit, 0) 1260 checkResp(reqList(JSApiNamesLimit), JSApiNamesLimit, JSApiNamesLimit) 1261 checkResp(reqList(streamsNum), 0, streamsNum) 1262 checkResp(reqList(streamsNum-22), 22, streamsNum-22) 1263 checkResp(reqList(streamsNum+22), 0, streamsNum) 1264 } 1265 1266 func TestNoRaceJetStreamAPIConsumerListPaging(t *testing.T) { 1267 s := RunBasicJetStreamServer(t) 1268 defer s.Shutdown() 1269 1270 sname := "MYSTREAM" 1271 mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: sname}) 1272 if err != nil { 1273 t.Fatalf("Unexpected error adding stream: %v", err) 1274 } 1275 1276 // Client for API requests. 
1277 nc := clientConnectToServer(t, s) 1278 defer nc.Close() 1279 1280 consumersNum := JSApiNamesLimit 1281 for i := 1; i <= consumersNum; i++ { 1282 dsubj := fmt.Sprintf("d.%d", i) 1283 sub, _ := nc.SubscribeSync(dsubj) 1284 defer sub.Unsubscribe() 1285 nc.Flush() 1286 1287 _, err := mset.addConsumer(&ConsumerConfig{DeliverSubject: dsubj}) 1288 if err != nil { 1289 t.Fatalf("Unexpected error: %v", err) 1290 } 1291 } 1292 1293 reqListSubject := fmt.Sprintf(JSApiConsumersT, sname) 1294 reqList := func(offset int) []byte { 1295 t.Helper() 1296 var req []byte 1297 if offset > 0 { 1298 req, _ = json.Marshal(&JSApiConsumersRequest{ApiPagedRequest: ApiPagedRequest{Offset: offset}}) 1299 } 1300 resp, err := nc.Request(reqListSubject, req, time.Second) 1301 if err != nil { 1302 t.Fatalf("Unexpected error getting stream list: %v", err) 1303 } 1304 return resp.Data 1305 } 1306 1307 checkResp := func(resp []byte, expectedLen, expectedOffset int) { 1308 t.Helper() 1309 var listResponse JSApiConsumerNamesResponse 1310 if err := json.Unmarshal(resp, &listResponse); err != nil { 1311 t.Fatalf("Unexpected error: %v", err) 1312 } 1313 if len(listResponse.Consumers) != expectedLen { 1314 t.Fatalf("Expected only %d streams but got %d", expectedLen, len(listResponse.Consumers)) 1315 } 1316 if listResponse.Total != consumersNum { 1317 t.Fatalf("Expected total to be %d but got %d", consumersNum, listResponse.Total) 1318 } 1319 if listResponse.Offset != expectedOffset { 1320 t.Fatalf("Expected offset to be %d but got %d", expectedOffset, listResponse.Offset) 1321 } 1322 } 1323 1324 checkResp(reqList(0), JSApiNamesLimit, 0) 1325 checkResp(reqList(consumersNum-22), 22, consumersNum-22) 1326 checkResp(reqList(consumersNum+22), 0, consumersNum) 1327 } 1328 1329 func TestNoRaceJetStreamWorkQueueLoadBalance(t *testing.T) { 1330 s := RunBasicJetStreamServer(t) 1331 defer s.Shutdown() 1332 1333 mname := "MY_MSG_SET" 1334 mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: mname, Subjects: []string{"foo", "bar"}}) 1335 if err != nil { 1336 t.Fatalf("Unexpected error adding message set: %v", err) 1337 } 1338 defer mset.delete() 1339 1340 // Create basic work queue mode consumer. 1341 oname := "WQ" 1342 o, err := mset.addConsumer(&ConsumerConfig{Durable: oname, AckPolicy: AckExplicit}) 1343 if err != nil { 1344 t.Fatalf("Expected no error with durable, got %v", err) 1345 } 1346 defer o.delete() 1347 1348 // To send messages. 1349 nc := clientConnectToServer(t, s) 1350 defer nc.Close() 1351 1352 // For normal work queue semantics, you send requests to the subject with stream and consumer name. 
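	// (requestNextMsgSubject returns the JS API "next message" subject for this
	// consumer, of the form $JS.API.CONSUMER.MSG.NEXT.<stream>.<consumer>.)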
1353 reqMsgSubj := o.requestNextMsgSubject() 1354 1355 numWorkers := 25 1356 counts := make([]int32, numWorkers) 1357 var received int32 1358 1359 rwg := &sync.WaitGroup{} 1360 rwg.Add(numWorkers) 1361 1362 wg := &sync.WaitGroup{} 1363 wg.Add(numWorkers) 1364 ch := make(chan bool) 1365 1366 toSend := 1000 1367 1368 for i := 0; i < numWorkers; i++ { 1369 nc := clientConnectToServer(t, s) 1370 defer nc.Close() 1371 1372 go func(index int32) { 1373 rwg.Done() 1374 defer wg.Done() 1375 <-ch 1376 1377 for counter := &counts[index]; ; { 1378 m, err := nc.Request(reqMsgSubj, nil, 100*time.Millisecond) 1379 if err != nil { 1380 return 1381 } 1382 m.Respond(nil) 1383 atomic.AddInt32(counter, 1) 1384 if total := atomic.AddInt32(&received, 1); total >= int32(toSend) { 1385 return 1386 } 1387 } 1388 }(int32(i)) 1389 } 1390 1391 // Wait for requestors to be ready 1392 rwg.Wait() 1393 close(ch) 1394 1395 sendSubj := "bar" 1396 for i := 0; i < toSend; i++ { 1397 sendStreamMsg(t, nc, sendSubj, "Hello World!") 1398 } 1399 1400 // Wait for test to complete. 1401 wg.Wait() 1402 1403 target := toSend / numWorkers 1404 delta := target/2 + 5 1405 low, high := int32(target-delta), int32(target+delta) 1406 1407 for i := 0; i < numWorkers; i++ { 1408 if msgs := atomic.LoadInt32(&counts[i]); msgs < low || msgs > high { 1409 t.Fatalf("Messages received for worker [%d] too far off from target of %d, got %d", i, target, msgs) 1410 } 1411 } 1412 } 1413 1414 func TestNoRaceJetStreamClusterLargeStreamInlineCatchup(t *testing.T) { 1415 c := createJetStreamClusterExplicit(t, "LSS", 3) 1416 defer c.shutdown() 1417 1418 // Client based API 1419 s := c.randomServer() 1420 nc, js := jsClientConnect(t, s) 1421 defer nc.Close() 1422 1423 _, err := js.AddStream(&nats.StreamConfig{ 1424 Name: "TEST", 1425 Subjects: []string{"foo"}, 1426 Replicas: 3, 1427 }) 1428 if err != nil { 1429 t.Fatalf("Unexpected error: %v", err) 1430 } 1431 1432 sr := c.randomNonStreamLeader("$G", "TEST") 1433 sr.Shutdown() 1434 1435 // In case sr was meta leader. 1436 c.waitOnLeader() 1437 1438 msg, toSend := []byte("Hello JS Clustering"), 5000 1439 1440 // Now fill up stream. 1441 for i := 0; i < toSend; i++ { 1442 if _, err = js.Publish("foo", msg); err != nil { 1443 t.Fatalf("Unexpected publish error: %v", err) 1444 } 1445 } 1446 si, err := js.StreamInfo("TEST") 1447 if err != nil { 1448 t.Fatalf("Unexpected error: %v", err) 1449 } 1450 // Check active state as well, shows that the owner answered. 1451 if si.State.Msgs != uint64(toSend) { 1452 t.Fatalf("Expected %d msgs, got bad state: %+v", toSend, si.State) 1453 } 1454 1455 // Kill our current leader to make just 2. 1456 c.streamLeader("$G", "TEST").Shutdown() 1457 1458 // Now restart the shutdown peer and wait for it to be current. 1459 sr = c.restartServer(sr) 1460 c.waitOnStreamCurrent(sr, "$G", "TEST") 1461 1462 // Ask other servers to stepdown as leader so that sr becomes the leader. 1463 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 1464 c.waitOnStreamLeader("$G", "TEST") 1465 if sl := c.streamLeader("$G", "TEST"); sl != sr { 1466 sl.JetStreamStepdownStream("$G", "TEST") 1467 return fmt.Errorf("Server %s is not leader yet", sr) 1468 } 1469 return nil 1470 }) 1471 1472 si, err = js.StreamInfo("TEST") 1473 if err != nil { 1474 t.Fatalf("Unexpected error: %v", err) 1475 } 1476 // Check that we have all of our messsages stored. 1477 // Wait for a bit for upper layers to process. 
1478 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 1479 if si.State.Msgs != uint64(toSend) { 1480 return fmt.Errorf("Expected %d msgs, got %d", toSend, si.State.Msgs) 1481 } 1482 return nil 1483 }) 1484 } 1485 1486 func TestNoRaceJetStreamClusterStreamCreateAndLostQuorum(t *testing.T) { 1487 c := createJetStreamClusterExplicit(t, "R5S", 3) 1488 defer c.shutdown() 1489 1490 // Client based API 1491 s := c.randomServer() 1492 nc, js := jsClientConnect(t, s) 1493 defer nc.Close() 1494 1495 sub, err := nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*") 1496 if err != nil { 1497 t.Fatalf("Unexpected error: %v", err) 1498 } 1499 1500 if _, err := js.AddStream(&nats.StreamConfig{Name: "NO-LQ-START", Replicas: 3}); err != nil { 1501 t.Fatalf("Unexpected error: %v", err) 1502 } 1503 c.waitOnStreamLeader("$G", "NO-LQ-START") 1504 checkSubsPending(t, sub, 0) 1505 1506 c.stopAll() 1507 // Start up the one we were connected to first and wait for it to be connected. 1508 s = c.restartServer(s) 1509 nc, err = nats.Connect(s.ClientURL()) 1510 if err != nil { 1511 t.Fatalf("Failed to create client: %v", err) 1512 } 1513 defer nc.Close() 1514 1515 sub, err = nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*") 1516 if err != nil { 1517 t.Fatalf("Unexpected error: %v", err) 1518 } 1519 nc.Flush() 1520 1521 c.restartAll() 1522 c.waitOnStreamLeader("$G", "NO-LQ-START") 1523 1524 checkSubsPending(t, sub, 0) 1525 } 1526 1527 func TestNoRaceJetStreamSuperClusterMirrors(t *testing.T) { 1528 sc := createJetStreamSuperCluster(t, 3, 3) 1529 defer sc.shutdown() 1530 1531 // Client based API 1532 s := sc.clusterForName("C2").randomServer() 1533 nc, js := jsClientConnect(t, s) 1534 defer nc.Close() 1535 1536 // Create source stream. 1537 _, err := js.AddStream(&nats.StreamConfig{Name: "S1", Subjects: []string{"foo", "bar"}, Replicas: 3, Placement: &nats.Placement{Cluster: "C2"}}) 1538 if err != nil { 1539 t.Fatalf("Unexpected error: %v", err) 1540 } 1541 1542 // Needed while Go client does not have mirror support. 1543 createStream := func(cfg *nats.StreamConfig) { 1544 t.Helper() 1545 if _, err := js.AddStream(cfg); err != nil { 1546 t.Fatalf("Unexpected error: %+v", err) 1547 } 1548 } 1549 1550 // Send 100 messages. 1551 for i := 0; i < 100; i++ { 1552 if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil { 1553 t.Fatalf("Unexpected publish error: %v", err) 1554 } 1555 } 1556 1557 createStream(&nats.StreamConfig{ 1558 Name: "M1", 1559 Mirror: &nats.StreamSource{Name: "S1"}, 1560 Placement: &nats.Placement{Cluster: "C1"}, 1561 }) 1562 1563 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 1564 si, err := js.StreamInfo("M1") 1565 if err != nil { 1566 t.Fatalf("Unexpected error: %v", err) 1567 } 1568 if si.State.Msgs != 100 { 1569 return fmt.Errorf("Expected 100 msgs, got state: %+v", si.State) 1570 } 1571 return nil 1572 }) 1573 1574 // Purge the source stream. 1575 if err := js.PurgeStream("S1"); err != nil { 1576 t.Fatalf("Unexpected purge error: %v", err) 1577 } 1578 // Send 50 more msgs now. 
1579 for i := 0; i < 50; i++ { 1580 if _, err := js.Publish("bar", []byte("OK")); err != nil { 1581 t.Fatalf("Unexpected publish error: %v", err) 1582 } 1583 } 1584 1585 createStream(&nats.StreamConfig{ 1586 Name: "M2", 1587 Mirror: &nats.StreamSource{Name: "S1"}, 1588 Replicas: 3, 1589 Placement: &nats.Placement{Cluster: "C3"}, 1590 }) 1591 1592 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1593 si, err := js.StreamInfo("M2") 1594 if err != nil { 1595 t.Fatalf("Unexpected error: %v", err) 1596 } 1597 if si.State.Msgs != 50 { 1598 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1599 } 1600 if si.State.FirstSeq != 101 { 1601 return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State) 1602 } 1603 return nil 1604 }) 1605 1606 sl := sc.clusterForName("C3").streamLeader("$G", "M2") 1607 doneCh := make(chan bool) 1608 1609 // Now test that if the mirror get's interrupted that it picks up where it left off etc. 1610 go func() { 1611 // Send 100 more messages. 1612 for i := 0; i < 100; i++ { 1613 if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil { 1614 t.Errorf("Unexpected publish on %d error: %v", i, err) 1615 } 1616 time.Sleep(2 * time.Millisecond) 1617 } 1618 doneCh <- true 1619 }() 1620 1621 time.Sleep(20 * time.Millisecond) 1622 sl.Shutdown() 1623 1624 <-doneCh 1625 sc.clusterForName("C3").waitOnStreamLeader("$G", "M2") 1626 1627 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1628 si, err := js.StreamInfo("M2") 1629 if err != nil { 1630 t.Fatalf("Unexpected error: %v", err) 1631 } 1632 if si.State.Msgs != 150 { 1633 return fmt.Errorf("Expected 150 msgs, got state: %+v", si.State) 1634 } 1635 if si.State.FirstSeq != 101 { 1636 return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State) 1637 } 1638 return nil 1639 }) 1640 } 1641 1642 func TestNoRaceJetStreamSuperClusterMixedModeMirrors(t *testing.T) { 1643 // Unlike the similar sources test, this test is not reliably catching the bug 1644 // that would cause mirrors to not have the expected messages count. 1645 // Still, adding this test in case we have a regression and we are lucky in 1646 // getting the failure while running this. 1647 1648 tmpl := ` 1649 listen: 127.0.0.1:-1 1650 server_name: %s 1651 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 1652 leaf: { listen: 127.0.0.1:-1 } 1653 1654 cluster { 1655 name: %s 1656 listen: 127.0.0.1:%d 1657 routes = [%s] 1658 } 1659 1660 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" 
} ] } } 1661 ` 1662 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 4, 1663 func(serverName, clusterName, storeDir, conf string) string { 1664 sname := serverName[strings.Index(serverName, "-")+1:] 1665 switch sname { 1666 case "S5", "S6", "S7": 1667 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 1668 default: 1669 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 1670 } 1671 return conf 1672 }, nil) 1673 defer sc.shutdown() 1674 1675 // Connect our client to a non JS server 1676 c := sc.randomCluster() 1677 var s *Server 1678 for s == nil { 1679 if as := c.randomServer(); !as.JetStreamEnabled() { 1680 s = as 1681 break 1682 } 1683 } 1684 nc, js := jsClientConnect(t, s) 1685 defer nc.Close() 1686 1687 toSend := 1000 1688 // Create 10 origin streams 1689 for i := 0; i < 10; i++ { 1690 name := fmt.Sprintf("S%d", i+1) 1691 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 1692 t.Fatalf("Unexpected error: %v", err) 1693 } 1694 c.waitOnStreamLeader(globalAccountName, name) 1695 // Load them up with a bunch of messages. 1696 for n := 0; n < toSend; n++ { 1697 m := nats.NewMsg(name) 1698 m.Header.Set("stream", name) 1699 m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10)) 1700 if err := nc.PublishMsg(m); err != nil { 1701 t.Fatalf("Unexpected publish error: %v", err) 1702 } 1703 } 1704 } 1705 1706 for i := 0; i < 3; i++ { 1707 // Now create our mirrors 1708 wg := sync.WaitGroup{} 1709 mirrorsCount := 10 1710 wg.Add(mirrorsCount) 1711 errCh := make(chan error, 1) 1712 for m := 0; m < mirrorsCount; m++ { 1713 sname := fmt.Sprintf("S%d", rand.Intn(10)+1) 1714 go func(sname string, mirrorIdx int) { 1715 defer wg.Done() 1716 if _, err := js.AddStream(&nats.StreamConfig{ 1717 Name: fmt.Sprintf("M%d", mirrorIdx), 1718 Mirror: &nats.StreamSource{Name: sname}, 1719 Replicas: 3, 1720 }); err != nil { 1721 select { 1722 case errCh <- err: 1723 default: 1724 } 1725 } 1726 }(sname, m+1) 1727 } 1728 wg.Wait() 1729 select { 1730 case err := <-errCh: 1731 t.Fatalf("Error creating mirrors: %v", err) 1732 default: 1733 } 1734 // Now check the mirrors have all expected messages 1735 for m := 0; m < mirrorsCount; m++ { 1736 name := fmt.Sprintf("M%d", m+1) 1737 checkFor(t, 15*time.Second, 500*time.Millisecond, func() error { 1738 si, err := js.StreamInfo(name) 1739 if err != nil { 1740 t.Fatalf("Could not retrieve stream info") 1741 } 1742 if si.State.Msgs != uint64(toSend) { 1743 return fmt.Errorf("Expected %d msgs, got state: %+v", toSend, si.State) 1744 } 1745 return nil 1746 }) 1747 err := js.DeleteStream(name) 1748 require_NoError(t, err) 1749 } 1750 } 1751 } 1752 1753 func TestNoRaceJetStreamSuperClusterSources(t *testing.T) { 1754 sc := createJetStreamSuperCluster(t, 3, 3) 1755 defer sc.shutdown() 1756 1757 // Client based API 1758 s := sc.clusterForName("C1").randomServer() 1759 nc, js := jsClientConnect(t, s) 1760 defer nc.Close() 1761 1762 // Create our source streams. 1763 for _, sname := range []string{"foo", "bar", "baz"} { 1764 if _, err := js.AddStream(&nats.StreamConfig{Name: sname, Replicas: 1}); err != nil { 1765 t.Fatalf("Unexpected error: %v", err) 1766 } 1767 } 1768 1769 sendBatch := func(subject string, n int) { 1770 for i := 0; i < n; i++ { 1771 msg := fmt.Sprintf("MSG-%d", i+1) 1772 if _, err := js.Publish(subject, []byte(msg)); err != nil { 1773 t.Fatalf("Unexpected publish error: %v", err) 1774 } 1775 } 1776 } 1777 // Populate each one. 
1778 sendBatch("foo", 10) 1779 sendBatch("bar", 15) 1780 sendBatch("baz", 25) 1781 1782 // Needed while Go client does not have mirror support for creating mirror or source streams. 1783 createStream := func(cfg *nats.StreamConfig) { 1784 t.Helper() 1785 if _, err := js.AddStream(cfg); err != nil { 1786 t.Fatalf("Unexpected error: %+v", err) 1787 } 1788 } 1789 1790 cfg := &nats.StreamConfig{ 1791 Name: "MS", 1792 Sources: []*nats.StreamSource{ 1793 {Name: "foo"}, 1794 {Name: "bar"}, 1795 {Name: "baz"}, 1796 }, 1797 } 1798 1799 createStream(cfg) 1800 time.Sleep(time.Second) 1801 1802 // Faster timeout since we loop below checking for condition. 1803 js2, err := nc.JetStream(nats.MaxWait(50 * time.Millisecond)) 1804 if err != nil { 1805 t.Fatalf("Unexpected error: %v", err) 1806 } 1807 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1808 si, err := js2.StreamInfo("MS") 1809 if err != nil { 1810 return err 1811 } 1812 if si.State.Msgs != 50 { 1813 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1814 } 1815 return nil 1816 }) 1817 1818 // Purge the source streams. 1819 for _, sname := range []string{"foo", "bar", "baz"} { 1820 if err := js.PurgeStream(sname); err != nil { 1821 t.Fatalf("Unexpected purge error: %v", err) 1822 } 1823 } 1824 1825 if err := js.DeleteStream("MS"); err != nil { 1826 t.Fatalf("Unexpected delete error: %v", err) 1827 } 1828 1829 // Send more msgs now. 1830 sendBatch("foo", 10) 1831 sendBatch("bar", 15) 1832 sendBatch("baz", 25) 1833 1834 cfg = &nats.StreamConfig{ 1835 Name: "MS2", 1836 Sources: []*nats.StreamSource{ 1837 {Name: "foo"}, 1838 {Name: "bar"}, 1839 {Name: "baz"}, 1840 }, 1841 Replicas: 3, 1842 Placement: &nats.Placement{Cluster: "C3"}, 1843 } 1844 1845 createStream(cfg) 1846 1847 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 1848 si, err := js2.StreamInfo("MS2") 1849 if err != nil { 1850 t.Fatalf("Unexpected error: %v", err) 1851 } 1852 if si.State.Msgs != 50 { 1853 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1854 } 1855 if si.State.FirstSeq != 1 { 1856 return fmt.Errorf("Expected start seq of 1, got state: %+v", si.State) 1857 } 1858 return nil 1859 }) 1860 1861 sl := sc.clusterForName("C3").streamLeader("$G", "MS2") 1862 doneCh := make(chan bool) 1863 1864 if sl == sc.leader() { 1865 nc.Request(JSApiLeaderStepDown, nil, time.Second) 1866 sc.waitOnLeader() 1867 } 1868 1869 // Now test that if the mirror get's interrupted that it picks up where it left off etc. 1870 go func() { 1871 // Send 50 more messages each. 1872 for i := 0; i < 50; i++ { 1873 msg := fmt.Sprintf("R-MSG-%d", i+1) 1874 for _, sname := range []string{"foo", "bar", "baz"} { 1875 m := nats.NewMsg(sname) 1876 m.Data = []byte(msg) 1877 if _, err := js.PublishMsg(m); err != nil { 1878 t.Errorf("Unexpected publish error: %v", err) 1879 } 1880 } 1881 time.Sleep(2 * time.Millisecond) 1882 } 1883 doneCh <- true 1884 }() 1885 1886 time.Sleep(20 * time.Millisecond) 1887 sl.Shutdown() 1888 1889 sc.clusterForName("C3").waitOnStreamLeader("$G", "MS2") 1890 <-doneCh 1891 1892 checkFor(t, 20*time.Second, time.Second, func() error { 1893 si, err := js2.StreamInfo("MS2") 1894 if err != nil { 1895 return err 1896 } 1897 if si.State.Msgs != 200 { 1898 return fmt.Errorf("Expected 200 msgs, got state: %+v", si.State) 1899 } 1900 return nil 1901 }) 1902 } 1903 1904 func TestNoRaceJetStreamClusterSourcesMuxd(t *testing.T) { 1905 c := createJetStreamClusterExplicit(t, "SMUX", 3) 1906 defer c.shutdown() 1907 1908 // Client for API requests. 
1909 nc, js := jsClientConnect(t, c.randomServer()) 1910 defer nc.Close() 1911 1912 // Send in 10000 messages. 1913 msg, toSend := make([]byte, 1024), 10000 1914 crand.Read(msg) 1915 1916 var sources []*nats.StreamSource 1917 // Create 10 origin streams. 1918 for i := 1; i <= 10; i++ { 1919 name := fmt.Sprintf("O-%d", i) 1920 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 1921 t.Fatalf("Unexpected error: %v", err) 1922 } 1923 // Make sure we have a leader before publishing, especially since we use 1924 // non JS publisher, we would not know if the messages made it to those 1925 // streams or not. 1926 c.waitOnStreamLeader(globalAccountName, name) 1927 // Load them up with a bunch of messages. 1928 for n := 0; n < toSend; n++ { 1929 if err := nc.Publish(name, msg); err != nil { 1930 t.Fatalf("Unexpected publish error: %v", err) 1931 } 1932 } 1933 sources = append(sources, &nats.StreamSource{Name: name}) 1934 } 1935 1936 // Now create our downstream stream that sources from all of them. 1937 if _, err := js.AddStream(&nats.StreamConfig{Name: "S", Replicas: 2, Sources: sources}); err != nil { 1938 t.Fatalf("Unexpected error: %v", err) 1939 } 1940 1941 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 1942 si, err := js.StreamInfo("S") 1943 if err != nil { 1944 t.Fatalf("Could not retrieve stream info") 1945 } 1946 if si.State.Msgs != uint64(10*toSend) { 1947 return fmt.Errorf("Expected %d msgs, got state: %+v", toSend*10, si.State) 1948 } 1949 return nil 1950 }) 1951 1952 } 1953 1954 func TestNoRaceJetStreamSuperClusterMixedModeSources(t *testing.T) { 1955 tmpl := ` 1956 listen: 127.0.0.1:-1 1957 server_name: %s 1958 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 1959 leaf: { listen: 127.0.0.1:-1 } 1960 1961 cluster { 1962 name: %s 1963 listen: 127.0.0.1:%d 1964 routes = [%s] 1965 } 1966 1967 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } } 1968 ` 1969 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 2, 1970 func(serverName, clusterName, storeDir, conf string) string { 1971 sname := serverName[strings.Index(serverName, "-")+1:] 1972 switch sname { 1973 case "S5", "S6", "S7": 1974 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 1975 default: 1976 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 1977 } 1978 return conf 1979 }, nil) 1980 defer sc.shutdown() 1981 1982 // Connect our client to a non JS server 1983 c := sc.randomCluster() 1984 var s *Server 1985 for s == nil { 1986 if as := c.randomServer(); !as.JetStreamEnabled() { 1987 s = as 1988 break 1989 } 1990 } 1991 nc, js := jsClientConnect(t, s) 1992 defer nc.Close() 1993 1994 toSend := 1000 1995 var sources []*nats.StreamSource 1996 // Create 100 origin streams. 1997 for i := 1; i <= 100; i++ { 1998 name := fmt.Sprintf("O-%d", i) 1999 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 2000 t.Fatalf("Unexpected error: %v", err) 2001 } 2002 c.waitOnStreamLeader(globalAccountName, name) 2003 // Load them up with a bunch of messages. 2004 for n := 0; n < toSend; n++ { 2005 m := nats.NewMsg(name) 2006 m.Header.Set("stream", name) 2007 m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10)) 2008 if err := nc.PublishMsg(m); err != nil { 2009 t.Fatalf("Unexpected publish error: %v", err) 2010 } 2011 } 2012 sources = append(sources, &nats.StreamSource{Name: name}) 2013 } 2014 2015 for i := 0; i < 3; i++ { 2016 // Now create our downstream stream that sources from all of them. 
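// The sources collected above carry only a stream name. For reference, a source can also be
// scoped with the optional nats.StreamSource fields; a sketch (the stream names and values here
// are illustrative only):
//
//	src := &nats.StreamSource{
//		Name:          "O-1",
//		OptStartSeq:   500,   // skip everything before sequence 500 in the origin
//		FilterSubject: "O-1", // only pull messages on matching subjects
//	}
//	_, _ = js.AddStream(&nats.StreamConfig{Name: "S-PARTIAL", Sources: []*nats.StreamSource{src}})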
2017 if _, err := js.AddStream(&nats.StreamConfig{Name: "S", Replicas: 3, Sources: sources}); err != nil { 2018 t.Fatalf("Unexpected error: %v", err) 2019 } 2020 2021 checkFor(t, 15*time.Second, 1000*time.Millisecond, func() error { 2022 si, err := js.StreamInfo("S") 2023 if err != nil { 2024 t.Fatalf("Could not retrieve stream info") 2025 } 2026 if si.State.Msgs != uint64(100*toSend) { 2027 return fmt.Errorf("Expected %d msgs, got state: %+v", toSend*100, si.State) 2028 } 2029 return nil 2030 }) 2031 2032 err := js.DeleteStream("S") 2033 require_NoError(t, err) 2034 } 2035 } 2036 2037 func TestNoRaceJetStreamClusterExtendedStreamPurgeStall(t *testing.T) { 2038 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 2039 skip(t) 2040 2041 cerr := func(t *testing.T, err error) { 2042 t.Helper() 2043 if err != nil { 2044 t.Fatalf("unexepected err: %s", err) 2045 } 2046 } 2047 2048 s := RunBasicJetStreamServer(t) 2049 defer s.Shutdown() 2050 2051 nc, js := jsClientConnect(t, s) 2052 defer nc.Close() 2053 2054 si, err := js.AddStream(&nats.StreamConfig{ 2055 Name: "KV", 2056 Subjects: []string{"kv.>"}, 2057 Storage: nats.FileStorage, 2058 }) 2059 cerr(t, err) 2060 2061 // 100kb messages spread over 1000 different subjects 2062 body := make([]byte, 100*1024) 2063 for i := 0; i < 50000; i++ { 2064 if _, err := js.PublishAsync(fmt.Sprintf("kv.%d", i%1000), body); err != nil { 2065 cerr(t, err) 2066 } 2067 } 2068 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 2069 if si, err = js.StreamInfo("KV"); err != nil { 2070 return err 2071 } 2072 if si.State.Msgs == 50000 { 2073 return nil 2074 } 2075 return fmt.Errorf("waiting for more") 2076 }) 2077 2078 jp, _ := json.Marshal(&JSApiStreamPurgeRequest{Subject: "kv.20"}) 2079 start := time.Now() 2080 res, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), jp, time.Minute) 2081 elapsed := time.Since(start) 2082 cerr(t, err) 2083 pres := JSApiStreamPurgeResponse{} 2084 err = json.Unmarshal(res.Data, &pres) 2085 cerr(t, err) 2086 if !pres.Success { 2087 t.Fatalf("purge failed: %#v", pres) 2088 } 2089 if elapsed > time.Second { 2090 t.Fatalf("Purge took too long %s", elapsed) 2091 } 2092 v, _ := s.Varz(nil) 2093 if v.Mem > 100*1024*1024 { // 100MB limit but in practice < 100MB -> Was ~7GB when failing. 2094 t.Fatalf("Used too much memory: %v", friendlyBytes(v.Mem)) 2095 } 2096 } 2097 2098 func TestNoRaceJetStreamClusterMirrorExpirationAndMissingSequences(t *testing.T) { 2099 c := createJetStreamClusterExplicit(t, "MMS", 9) 2100 defer c.shutdown() 2101 2102 // Client for API requests. 2103 nc, js := jsClientConnect(t, c.randomServer()) 2104 defer nc.Close() 2105 2106 sendBatch := func(n int) { 2107 t.Helper() 2108 // Send a batch to a given subject. 
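// This helper publishes synchronously, so each message is acknowledged before the next send.
// A faster variant used elsewhere in this file publishes asynchronously and then waits once for
// completion; a sketch (sendBatchAsync is an illustrative name):
//
//	sendBatchAsync := func(n int) {
//		t.Helper()
//		for i := 0; i < n; i++ {
//			if _, err := js.PublishAsync("TEST", []byte("OK")); err != nil {
//				t.Fatalf("Unexpected publish error: %v", err)
//			}
//		}
//		select {
//		case <-js.PublishAsyncComplete():
//		case <-time.After(5 * time.Second):
//			t.Fatalf("Did not receive completion signal")
//		}
//	}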
2109 for i := 0; i < n; i++ { 2110 if _, err := js.Publish("TEST", []byte("OK")); err != nil { 2111 t.Fatalf("Unexpected publish error: %v", err) 2112 } 2113 } 2114 } 2115 2116 checkStream := func(stream string, num uint64) { 2117 t.Helper() 2118 checkFor(t, 20*time.Second, 20*time.Millisecond, func() error { 2119 si, err := js.StreamInfo(stream) 2120 if err != nil { 2121 return err 2122 } 2123 if si.State.Msgs != num { 2124 return fmt.Errorf("Expected %d msgs, got %d", num, si.State.Msgs) 2125 } 2126 return nil 2127 }) 2128 } 2129 2130 checkMirror := func(num uint64) { t.Helper(); checkStream("M", num) } 2131 checkTest := func(num uint64) { t.Helper(); checkStream("TEST", num) } 2132 2133 // Origin 2134 _, err := js.AddStream(&nats.StreamConfig{ 2135 Name: "TEST", 2136 MaxAge: 500 * time.Millisecond, 2137 }) 2138 if err != nil { 2139 t.Fatalf("Unexpected error: %v", err) 2140 } 2141 2142 ts := c.streamLeader("$G", "TEST") 2143 ml := c.leader() 2144 2145 // Create mirror now. 2146 for ms := ts; ms == ts || ms == ml; { 2147 _, err = js.AddStream(&nats.StreamConfig{ 2148 Name: "M", 2149 Mirror: &nats.StreamSource{Name: "TEST"}, 2150 Replicas: 2, 2151 }) 2152 if err != nil { 2153 t.Fatalf("Unexpected error: %v", err) 2154 } 2155 ms = c.streamLeader("$G", "M") 2156 if ts == ms || ms == ml { 2157 // Delete and retry. 2158 js.DeleteStream("M") 2159 } 2160 } 2161 2162 sendBatch(10) 2163 checkMirror(10) 2164 2165 // Now shutdown the server with the mirror. 2166 ms := c.streamLeader("$G", "M") 2167 ms.Shutdown() 2168 c.waitOnLeader() 2169 2170 // Send more messages but let them expire. 2171 sendBatch(10) 2172 checkTest(0) 2173 2174 c.restartServer(ms) 2175 c.checkClusterFormed() 2176 c.waitOnStreamLeader("$G", "M") 2177 2178 sendBatch(10) 2179 checkMirror(20) 2180 } 2181 2182 func TestNoRaceJetStreamClusterLargeActiveOnReplica(t *testing.T) { 2183 // Uncomment to run. 2184 skip(t) 2185 2186 c := createJetStreamClusterExplicit(t, "LAG", 3) 2187 defer c.shutdown() 2188 2189 // Client for API requests. 2190 nc, js := jsClientConnect(t, c.randomServer()) 2191 defer nc.Close() 2192 2193 timeout := time.Now().Add(60 * time.Second) 2194 for time.Now().Before(timeout) { 2195 si, err := js.AddStream(&nats.StreamConfig{ 2196 Name: "TEST", 2197 Subjects: []string{"foo", "bar"}, 2198 Replicas: 3, 2199 }) 2200 if err != nil { 2201 t.Fatalf("Unexpected error: %v", err) 2202 } 2203 for _, r := range si.Cluster.Replicas { 2204 if r.Active > 5*time.Second { 2205 t.Fatalf("Bad Active value: %+v", r) 2206 } 2207 } 2208 if err := js.DeleteStream("TEST"); err != nil { 2209 t.Fatalf("Unexpected delete error: %v", err) 2210 } 2211 } 2212 } 2213 2214 func TestNoRaceJetStreamSuperClusterRIPStress(t *testing.T) { 2215 // Uncomment to run. Needs to be on a big machine. 2216 skip(t) 2217 2218 sc := createJetStreamSuperCluster(t, 3, 3) 2219 defer sc.shutdown() 2220 2221 // Client based API 2222 s := sc.clusterForName("C2").randomServer() 2223 nc, js := jsClientConnect(t, s) 2224 defer nc.Close() 2225 2226 scm := make(map[string][]string) 2227 2228 // Create 50 streams per cluster. 
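// The streams below are pinned to a cluster with nats.Placement{Cluster: cn}. Placement can also
// be expressed with tags that match server_tags configured on the servers; a sketch (the stream
// name and tag value are hypothetical):
//
//	_, err := js.AddStream(&nats.StreamConfig{
//		Name:      "TAGGED",
//		Replicas:  3,
//		Placement: &nats.Placement{Tags: []string{"cloud:aws"}},
//	})
//	if err != nil {
//		t.Fatalf("Unexpected error: %v", err)
//	}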
2229 for _, cn := range []string{"C1", "C2", "C3"} { 2230 var streams []string 2231 for i := 0; i < 50; i++ { 2232 sn := fmt.Sprintf("%s-S%d", cn, i+1) 2233 streams = append(streams, sn) 2234 _, err := js.AddStream(&nats.StreamConfig{ 2235 Name: sn, 2236 Replicas: 3, 2237 Placement: &nats.Placement{Cluster: cn}, 2238 MaxAge: 2 * time.Minute, 2239 MaxMsgs: 50_000, 2240 }) 2241 if err != nil { 2242 t.Fatalf("Unexpected error: %v", err) 2243 } 2244 } 2245 scm[cn] = streams 2246 } 2247 2248 sourceForCluster := func(cn string) []*nats.StreamSource { 2249 var sns []string 2250 switch cn { 2251 case "C1": 2252 sns = scm["C2"] 2253 case "C2": 2254 sns = scm["C3"] 2255 case "C3": 2256 sns = scm["C1"] 2257 default: 2258 t.Fatalf("Unknown cluster %q", cn) 2259 } 2260 var ss []*nats.StreamSource 2261 for _, sn := range sns { 2262 ss = append(ss, &nats.StreamSource{Name: sn}) 2263 } 2264 return ss 2265 } 2266 2267 // Mux all 50 streams from one cluster to a single stream across a GW connection to another cluster. 2268 _, err := js.AddStream(&nats.StreamConfig{ 2269 Name: "C1-S-MUX", 2270 Replicas: 2, 2271 Placement: &nats.Placement{Cluster: "C1"}, 2272 Sources: sourceForCluster("C2"), 2273 MaxAge: time.Minute, 2274 MaxMsgs: 20_000, 2275 }) 2276 if err != nil { 2277 t.Fatalf("Unexpected error: %v", err) 2278 } 2279 2280 _, err = js.AddStream(&nats.StreamConfig{ 2281 Name: "C2-S-MUX", 2282 Replicas: 2, 2283 Placement: &nats.Placement{Cluster: "C2"}, 2284 Sources: sourceForCluster("C3"), 2285 MaxAge: time.Minute, 2286 MaxMsgs: 20_000, 2287 }) 2288 if err != nil { 2289 t.Fatalf("Unexpected error: %v", err) 2290 } 2291 2292 _, err = js.AddStream(&nats.StreamConfig{ 2293 Name: "C3-S-MUX", 2294 Replicas: 2, 2295 Placement: &nats.Placement{Cluster: "C3"}, 2296 Sources: sourceForCluster("C1"), 2297 MaxAge: time.Minute, 2298 MaxMsgs: 20_000, 2299 }) 2300 if err != nil { 2301 t.Fatalf("Unexpected error: %v", err) 2302 } 2303 2304 // Now create mirrors for our mux'd streams. 2305 _, err = js.AddStream(&nats.StreamConfig{ 2306 Name: "C1-MIRROR", 2307 Replicas: 3, 2308 Placement: &nats.Placement{Cluster: "C1"}, 2309 Mirror: &nats.StreamSource{Name: "C3-S-MUX"}, 2310 MaxAge: 5 * time.Minute, 2311 MaxMsgs: 10_000, 2312 }) 2313 if err != nil { 2314 t.Fatalf("Unexpected error: %v", err) 2315 } 2316 2317 _, err = js.AddStream(&nats.StreamConfig{ 2318 Name: "C2-MIRROR", 2319 Replicas: 3, 2320 Placement: &nats.Placement{Cluster: "C2"}, 2321 Mirror: &nats.StreamSource{Name: "C2-S-MUX"}, 2322 MaxAge: 5 * time.Minute, 2323 MaxMsgs: 10_000, 2324 }) 2325 if err != nil { 2326 t.Fatalf("Unexpected error: %v", err) 2327 } 2328 2329 _, err = js.AddStream(&nats.StreamConfig{ 2330 Name: "C3-MIRROR", 2331 Replicas: 3, 2332 Placement: &nats.Placement{Cluster: "C3"}, 2333 Mirror: &nats.StreamSource{Name: "C1-S-MUX"}, 2334 MaxAge: 5 * time.Minute, 2335 MaxMsgs: 10_000, 2336 }) 2337 if err != nil { 2338 t.Fatalf("Unexpected error: %v", err) 2339 } 2340 2341 var jsc []nats.JetStream 2342 2343 // Create 64 clients. 
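// Each client below caps in-flight async publishes with nats.PublishAsyncMaxPending(8*1024).
// When a single publish needs to be tracked individually, the returned PubAckFuture can be
// consumed directly; a sketch (the subject is one of the streams created above, the payload is
// illustrative):
//
//	paf, err := js.PublishAsync("C1-S1", []byte("payload"))
//	if err != nil {
//		t.Fatalf("Unexpected publish error: %v", err)
//	}
//	select {
//	case <-paf.Ok():
//		// Ack received from the stream leader.
//	case err := <-paf.Err():
//		t.Fatalf("Async publish failed: %v", err)
//	case <-time.After(5 * time.Second):
//		t.Fatalf("Timed out waiting for publish ack")
//	}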
2344 for i := 0; i < 64; i++ { 2345 s := sc.randomCluster().randomServer() 2346 nc, _ := jsClientConnect(t, s) 2347 defer nc.Close() 2348 js, err := nc.JetStream(nats.PublishAsyncMaxPending(8 * 1024)) 2349 if err != nil { 2350 t.Fatalf("Unexpected error: %v", err) 2351 } 2352 jsc = append(jsc, js) 2353 } 2354 2355 msg := make([]byte, 1024) 2356 crand.Read(msg) 2357 2358 // 8 minutes 2359 expires := time.Now().Add(480 * time.Second) 2360 for time.Now().Before(expires) { 2361 for _, sns := range scm { 2362 rand.Shuffle(len(sns), func(i, j int) { sns[i], sns[j] = sns[j], sns[i] }) 2363 for _, sn := range sns { 2364 js := jsc[rand.Intn(len(jsc))] 2365 if _, err = js.PublishAsync(sn, msg); err != nil { 2366 t.Fatalf("Unexpected publish error: %v", err) 2367 } 2368 } 2369 } 2370 time.Sleep(10 * time.Millisecond) 2371 } 2372 } 2373 2374 func TestNoRaceJetStreamSlowFilteredInititalPendingAndFirstMsg(t *testing.T) { 2375 s := RunBasicJetStreamServer(t) 2376 defer s.Shutdown() 2377 2378 // Create directly here to force multiple blocks, etc. 2379 a, err := s.LookupAccount("$G") 2380 if err != nil { 2381 t.Fatalf("Unexpected error: %v", err) 2382 } 2383 mset, err := a.addStreamWithStore( 2384 &StreamConfig{ 2385 Name: "S", 2386 Subjects: []string{"foo", "bar", "baz", "foo.bar.baz", "foo.*"}, 2387 }, 2388 &FileStoreConfig{ 2389 BlockSize: 4 * 1024 * 1024, 2390 AsyncFlush: true, 2391 }, 2392 ) 2393 if err != nil { 2394 t.Fatalf("Unexpected error: %v", err) 2395 } 2396 2397 nc, js := jsClientConnect(t, s) 2398 defer nc.Close() 2399 2400 toSend := 100_000 // 500k total though. 2401 2402 // Messages will be 'foo' 'bar' 'baz' repeated 100k times. 2403 // Then 'foo.bar.baz' all contiguous for 100k. 2404 // Then foo.N for 1-100000 2405 for i := 0; i < toSend; i++ { 2406 js.PublishAsync("foo", []byte("HELLO")) 2407 js.PublishAsync("bar", []byte("WORLD")) 2408 js.PublishAsync("baz", []byte("AGAIN")) 2409 } 2410 // Make contiguous block of same subject. 2411 for i := 0; i < toSend; i++ { 2412 js.PublishAsync("foo.bar.baz", []byte("ALL-TOGETHER")) 2413 } 2414 // Now add some more at the end. 2415 for i := 0; i < toSend; i++ { 2416 js.PublishAsync(fmt.Sprintf("foo.%d", i+1), []byte("LATER")) 2417 } 2418 2419 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 2420 si, err := js.StreamInfo("S") 2421 if err != nil { 2422 return err 2423 } 2424 if si.State.Msgs != uint64(5*toSend) { 2425 return fmt.Errorf("Expected %d msgs, got %d", 5*toSend, si.State.Msgs) 2426 } 2427 return nil 2428 }) 2429 2430 // Threshold for taking too long. 
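// A note on the NumPending expectations checked below: the stream now holds 500,000 messages
// laid out exactly as described above, so the counts follow from the layout. For example, "foo"
// occupies sequences 1, 4, 7, ... within the first 300,000 messages, so starting at sequence 4
// skips exactly one "foo" message (100_000-1 pending), while ">" starting at sequence 50_000
// leaves 500_000-50_000+1 = 450_001 pending. The "*" filter matches only the three single-token
// subjects, never "foo.bar.baz" or the "foo.N" subjects.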
2431 const thresh = 150 * time.Millisecond 2432 2433 var dindex int 2434 testConsumerCreate := func(subj string, startSeq, expectedNumPending uint64) { 2435 t.Helper() 2436 dindex++ 2437 dname := fmt.Sprintf("dur-%d", dindex) 2438 cfg := ConsumerConfig{FilterSubject: subj, Durable: dname, AckPolicy: AckExplicit} 2439 if startSeq > 1 { 2440 cfg.OptStartSeq, cfg.DeliverPolicy = startSeq, DeliverByStartSequence 2441 } 2442 start := time.Now() 2443 o, err := mset.addConsumer(&cfg) 2444 if err != nil { 2445 t.Fatalf("Unexpected error: %v", err) 2446 } 2447 if delta := time.Since(start); delta > thresh { 2448 t.Fatalf("Creating consumer for %q and start: %d took too long: %v", subj, startSeq, delta) 2449 } 2450 if ci := o.info(); ci.NumPending != expectedNumPending { 2451 t.Fatalf("Expected NumPending of %d, got %d", expectedNumPending, ci.NumPending) 2452 } 2453 } 2454 2455 testConsumerCreate("foo.100000", 1, 1) 2456 testConsumerCreate("foo.100000", 222_000, 1) 2457 testConsumerCreate("foo", 1, 100_000) 2458 testConsumerCreate("foo", 4, 100_000-1) 2459 testConsumerCreate("foo.bar.baz", 1, 100_000) 2460 testConsumerCreate("foo.bar.baz", 350_001, 50_000) 2461 testConsumerCreate("*", 1, 300_000) 2462 testConsumerCreate("*", 4, 300_000-3) 2463 testConsumerCreate(">", 1, 500_000) 2464 testConsumerCreate(">", 50_000, 500_000-50_000+1) 2465 testConsumerCreate("foo.10", 1, 1) 2466 2467 // Also test that we do not take long if the start sequence is later in the stream. 2468 sub, err := js.PullSubscribe("foo.100000", "dlc") 2469 if err != nil { 2470 t.Fatalf("Unexpected error: %v", err) 2471 } 2472 start := time.Now() 2473 fetchMsgs(t, sub, 1, time.Second) 2474 if delta := time.Since(start); delta > thresh { 2475 t.Fatalf("Took too long for pull subscriber to fetch the message: %v", delta) 2476 } 2477 2478 // Now do some deletes and make sure these are handled correctly. 2479 // Delete 3 foo messages. 2480 mset.removeMsg(1) 2481 mset.removeMsg(4) 2482 mset.removeMsg(7) 2483 testConsumerCreate("foo", 1, 100_000-3) 2484 2485 // Make sure wider scoped subjects do the right thing from a pending perspective. 2486 o, err := mset.addConsumer(&ConsumerConfig{FilterSubject: ">", Durable: "cat", AckPolicy: AckExplicit}) 2487 if err != nil { 2488 t.Fatalf("Unexpected error: %v", err) 2489 } 2490 ci, expected := o.info(), uint64(500_000-3) 2491 if ci.NumPending != expected { 2492 t.Fatalf("Expected NumPending of %d, got %d", expected, ci.NumPending) 2493 } 2494 // Send another and make sure its captured by our wide scope consumer. 2495 js.Publish("foo", []byte("HELLO AGAIN")) 2496 if ci = o.info(); ci.NumPending != expected+1 { 2497 t.Fatalf("Expected the consumer to recognize the wide scoped consumer, wanted pending of %d, got %d", expected+1, ci.NumPending) 2498 } 2499 2500 // Stop current server and test restart.. 2501 sd := s.JetStreamConfig().StoreDir 2502 s.Shutdown() 2503 // Restart. 2504 s = RunJetStreamServerOnPort(-1, sd) 2505 defer s.Shutdown() 2506 2507 a, err = s.LookupAccount("$G") 2508 if err != nil { 2509 t.Fatalf("Unexpected error: %v", err) 2510 } 2511 mset, err = a.lookupStream("S") 2512 if err != nil { 2513 t.Fatalf("Unexpected error: %v", err) 2514 } 2515 2516 // Make sure we recovered our per subject state on restart. 2517 testConsumerCreate("foo.100000", 1, 1) 2518 testConsumerCreate("foo", 1, 100_000-2) 2519 } 2520 2521 func TestNoRaceJetStreamFileStoreBufferReuse(t *testing.T) { 2522 // Uncomment to run. Needs to be on a big machine. 
2523 skip(t) 2524 2525 s := RunBasicJetStreamServer(t) 2526 defer s.Shutdown() 2527 2528 cfg := &StreamConfig{Name: "TEST", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage} 2529 if _, err := s.GlobalAccount().addStreamWithStore(cfg, nil); err != nil { 2530 t.Fatalf("Unexpected error adding stream: %v", err) 2531 } 2532 2533 // Client for API requests. 2534 nc, js := jsClientConnect(t, s) 2535 defer nc.Close() 2536 2537 toSend := 200_000 2538 2539 m := nats.NewMsg("foo") 2540 m.Data = make([]byte, 8*1024) 2541 crand.Read(m.Data) 2542 2543 start := time.Now() 2544 for i := 0; i < toSend; i++ { 2545 m.Reply = _EMPTY_ 2546 switch i % 3 { 2547 case 0: 2548 m.Subject = "foo" 2549 case 1: 2550 m.Subject = "bar" 2551 case 2: 2552 m.Subject = "baz" 2553 } 2554 m.Header.Set("X-ID2", fmt.Sprintf("XXXXX-%d", i)) 2555 if _, err := js.PublishMsgAsync(m); err != nil { 2556 t.Fatalf("Err on publish: %v", err) 2557 } 2558 } 2559 <-js.PublishAsyncComplete() 2560 fmt.Printf("TOOK %v to publish\n", time.Since(start)) 2561 2562 v, err := s.Varz(nil) 2563 if err != nil { 2564 t.Fatalf("Unexpected error: %v", err) 2565 } 2566 fmt.Printf("MEM AFTER PUBLISH is %v\n", friendlyBytes(v.Mem)) 2567 2568 si, _ := js.StreamInfo("TEST") 2569 fmt.Printf("si is %+v\n", si.State) 2570 2571 received := 0 2572 done := make(chan bool) 2573 2574 cb := func(m *nats.Msg) { 2575 received++ 2576 if received >= toSend { 2577 done <- true 2578 } 2579 } 2580 2581 start = time.Now() 2582 sub, err := js.Subscribe("*", cb, nats.EnableFlowControl(), nats.IdleHeartbeat(time.Second), nats.AckNone()) 2583 if err != nil { 2584 t.Fatalf("Unexpected error: %v", err) 2585 } 2586 defer sub.Unsubscribe() 2587 <-done 2588 fmt.Printf("TOOK %v to consume\n", time.Since(start)) 2589 2590 v, err = s.Varz(nil) 2591 if err != nil { 2592 t.Fatalf("Unexpected error: %v", err) 2593 } 2594 fmt.Printf("MEM AFTER SUBSCRIBE is %v\n", friendlyBytes(v.Mem)) 2595 } 2596 2597 // Report of slow restart for a server that has many messages that have expired while it was not running. 2598 func TestNoRaceJetStreamSlowRestartWithManyExpiredMsgs(t *testing.T) { 2599 opts := DefaultTestOptions 2600 opts.Port = -1 2601 opts.JetStream = true 2602 s := RunServer(&opts) 2603 if config := s.JetStreamConfig(); config != nil { 2604 defer removeDir(t, config.StoreDir) 2605 } 2606 defer s.Shutdown() 2607 2608 // Client for API requests. 2609 nc, js := jsClientConnect(t, s) 2610 defer nc.Close() 2611 2612 ttl := 2 * time.Second 2613 _, err := js.AddStream(&nats.StreamConfig{ 2614 Name: "ORDERS", 2615 Subjects: []string{"orders.*"}, 2616 MaxAge: ttl, 2617 }) 2618 if err != nil { 2619 t.Fatalf("Unexpected error: %v", err) 2620 } 2621 2622 // Attach a consumer who is filtering on a wildcard subject as well. 2623 // This does not affect it like I thought originally but will keep it here. 2624 _, err = js.AddConsumer("ORDERS", &nats.ConsumerConfig{ 2625 Durable: "c22", 2626 FilterSubject: "orders.*", 2627 AckPolicy: nats.AckExplicitPolicy, 2628 }) 2629 if err != nil { 2630 t.Fatalf("Unexpected error: %v", err) 2631 } 2632 2633 // Now fill up with messages. 2634 toSend := 100_000 2635 for i := 1; i <= toSend; i++ { 2636 js.PublishAsync(fmt.Sprintf("orders.%d", i), []byte("OK")) 2637 } 2638 <-js.PublishAsyncComplete() 2639 2640 sdir := strings.TrimSuffix(s.JetStreamConfig().StoreDir, JetStreamStoreDir) 2641 s.Shutdown() 2642 2643 // Let them expire while not running. 
2644 time.Sleep(ttl + 500*time.Millisecond) 2645 2646 start := time.Now() 2647 opts.Port = -1 2648 opts.StoreDir = sdir 2649 s = RunServer(&opts) 2650 elapsed := time.Since(start) 2651 defer s.Shutdown() 2652 2653 if elapsed > 2*time.Second { 2654 t.Fatalf("Took %v for restart which is too long", elapsed) 2655 } 2656 2657 // Check everything is correct. 2658 nc, js = jsClientConnect(t, s) 2659 defer nc.Close() 2660 2661 si, err := js.StreamInfo("ORDERS") 2662 if err != nil { 2663 t.Fatalf("Unexpected error: %v", err) 2664 } 2665 if si.State.Msgs != 0 { 2666 t.Fatalf("Expected no msgs after restart, got %d", si.State.Msgs) 2667 } 2668 } 2669 2670 func TestNoRaceJetStreamStalledMirrorsAfterExpire(t *testing.T) { 2671 c := createJetStreamClusterExplicit(t, "JSC", 3) 2672 defer c.shutdown() 2673 2674 nc, js := jsClientConnect(t, c.randomServer()) 2675 defer nc.Close() 2676 2677 cfg := &nats.StreamConfig{ 2678 Name: "TEST", 2679 Subjects: []string{"foo.*"}, 2680 Replicas: 1, 2681 MaxAge: 100 * time.Millisecond, 2682 } 2683 2684 if _, err := js.AddStream(cfg); err != nil { 2685 t.Fatalf("Error creating stream: %v", err) 2686 } 2687 2688 if _, err := js.AddStream(&nats.StreamConfig{ 2689 Name: "M", 2690 Replicas: 2, 2691 Mirror: &nats.StreamSource{Name: "TEST"}, 2692 }); err != nil { 2693 t.Fatalf("Unexpected error: %v", err) 2694 } 2695 2696 sendBatch := func(batch int) { 2697 t.Helper() 2698 for i := 0; i < batch; i++ { 2699 js.PublishAsync("foo.bar", []byte("Hello")) 2700 } 2701 select { 2702 case <-js.PublishAsyncComplete(): 2703 case <-time.After(5 * time.Second): 2704 t.Fatalf("Did not receive completion signal") 2705 } 2706 } 2707 2708 numMsgs := 10_000 2709 sendBatch(numMsgs) 2710 2711 // Turn off expiration so we can test we did not stall. 2712 cfg.MaxAge = 0 2713 if _, err := js.UpdateStream(cfg); err != nil { 2714 t.Fatalf("Unexpected error: %v", err) 2715 } 2716 2717 sendBatch(numMsgs) 2718 2719 // Wait for mirror to be caught up. 2720 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 2721 si, err := js.StreamInfo("M") 2722 if err != nil { 2723 t.Fatalf("Unexpected error: %v", err) 2724 } 2725 if si.State.LastSeq != uint64(2*numMsgs) { 2726 return fmt.Errorf("Expected %d as last sequence, got state: %+v", 2*numMsgs, si.State) 2727 } 2728 return nil 2729 }) 2730 } 2731 2732 // We will use JetStream helpers to create supercluster but this test is about exposing the ability to access 2733 // account scoped connz with subject interest filtering. 2734 func TestNoRaceJetStreamSuperClusterAccountConnz(t *testing.T) { 2735 // This has 4 different account, 3 general and system. 2736 sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 3, 3) 2737 defer sc.shutdown() 2738 2739 // Create 20 connections on account one and two 2740 // Create JetStream assets for each as well to make sure by default we do not report them. 
2741 num := 20 2742 for i := 0; i < num; i++ { 2743 nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo("one", "p"), nats.Name("one")) 2744 defer nc.Close() 2745 2746 if i%2 == 0 { 2747 nc.SubscribeSync("foo") 2748 } else { 2749 nc.SubscribeSync("bar") 2750 } 2751 2752 nc, js := jsClientConnect(t, sc.randomServer(), nats.UserInfo("two", "p"), nats.Name("two")) 2753 defer nc.Close() 2754 nc.SubscribeSync("baz") 2755 nc.SubscribeSync("foo.bar.*") 2756 nc.SubscribeSync(fmt.Sprintf("id.%d", i+1)) 2757 2758 js.AddStream(&nats.StreamConfig{Name: fmt.Sprintf("TEST:%d", i+1)}) 2759 } 2760 2761 type czapi struct { 2762 Server *ServerInfo 2763 Data *Connz 2764 Error *ApiError 2765 } 2766 2767 parseConnz := func(buf []byte) *Connz { 2768 t.Helper() 2769 var cz czapi 2770 if err := json.Unmarshal(buf, &cz); err != nil { 2771 t.Fatalf("Unexpected error: %v", err) 2772 } 2773 if cz.Error != nil { 2774 t.Fatalf("Unexpected error: %+v", cz.Error) 2775 } 2776 return cz.Data 2777 } 2778 2779 doRequest := func(reqSubj, acc, filter string, expected int) { 2780 t.Helper() 2781 nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo(acc, "p"), nats.Name(acc)) 2782 defer nc.Close() 2783 2784 mch := make(chan *nats.Msg, 9) 2785 sub, _ := nc.ChanSubscribe(nats.NewInbox(), mch) 2786 2787 var req []byte 2788 if filter != _EMPTY_ { 2789 req, _ = json.Marshal(&ConnzOptions{FilterSubject: filter}) 2790 } 2791 2792 if err := nc.PublishRequest(reqSubj, sub.Subject, req); err != nil { 2793 t.Fatalf("Unexpected error: %v", err) 2794 } 2795 2796 // So we can ignore ourselves. 2797 cid, _ := nc.GetClientID() 2798 sid := nc.ConnectedServerId() 2799 2800 wt := time.NewTimer(200 * time.Millisecond) 2801 var conns []*ConnInfo 2802 LOOP: 2803 for { 2804 select { 2805 case m := <-mch: 2806 if len(m.Data) == 0 { 2807 t.Fatalf("No responders") 2808 } 2809 cr := parseConnz(m.Data) 2810 // For account scoped, NumConns and Total should be the same (sans limits and offsets). 2811 // Its Total should not include other accounts since that would leak information about the system. 2812 if filter == _EMPTY_ && cr.NumConns != cr.Total { 2813 t.Fatalf("NumConns and Total should be same with account scoped connz, got %+v", cr) 2814 } 2815 for _, c := range cr.Conns { 2816 if c.Name != acc { 2817 t.Fatalf("Got wrong account: %q vs %q for %+v", acc, c.Account, c) 2818 } 2819 if !(c.Cid == cid && cr.ID == sid) { 2820 conns = append(conns, c) 2821 } 2822 } 2823 wt.Reset(200 * time.Millisecond) 2824 case <-wt.C: 2825 break LOOP 2826 } 2827 } 2828 if len(conns) != expected { 2829 t.Fatalf("Expected to see %d conns but got %d", expected, len(conns)) 2830 } 2831 } 2832 2833 doSysRequest := func(acc string, expected int) { 2834 t.Helper() 2835 doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, _EMPTY_, expected) 2836 } 2837 doAccRequest := func(acc string, expected int) { 2838 t.Helper() 2839 doRequest("$SYS.REQ.ACCOUNT.PING.CONNZ", acc, _EMPTY_, expected) 2840 } 2841 doFiltered := func(acc, filter string, expected int) { 2842 t.Helper() 2843 doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, filter, expected) 2844 } 2845 2846 doSysRequest("one", 20) 2847 doAccRequest("one", 20) 2848 2849 doSysRequest("two", 20) 2850 doAccRequest("two", 20) 2851 2852 // Now check filtering. 
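// The filtered checks below go through the doRequest helper above, which fans in replies from
// every server on a wildcard inbox. For reference, a single filtered connz request can also be
// issued directly; a sketch, given a connection nc signed into the account (note that nc.Request
// only captures the first server's reply):
//
//	req, _ := json.Marshal(&ConnzOptions{FilterSubject: "bar"})
//	resp, err := nc.Request("$SYS.REQ.SERVER.PING.CONNZ", req, time.Second)
//	if err != nil {
//		t.Fatalf("Unexpected error: %v", err)
//	}
//	cr := parseConnz(resp.Data)
//	// cr.Conns now holds only this account's connections with interest matching "bar".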
2853 doFiltered("one", _EMPTY_, 20) 2854 doFiltered("one", ">", 20) 2855 doFiltered("one", "bar", 10) 2856 doFiltered("two", "bar", 0) 2857 doFiltered("two", "id.1", 1) 2858 doFiltered("two", "id.*", 20) 2859 doFiltered("two", "foo.bar.*", 20) 2860 doFiltered("two", "foo.>", 20) 2861 } 2862 2863 func TestNoRaceCompressedConnz(t *testing.T) { 2864 s := RunBasicJetStreamServer(t) 2865 defer s.Shutdown() 2866 2867 nc, _ := jsClientConnect(t, s) 2868 defer nc.Close() 2869 2870 doRequest := func(compress string) { 2871 t.Helper() 2872 m := nats.NewMsg("$SYS.REQ.ACCOUNT.PING.CONNZ") 2873 m.Header.Add("Accept-Encoding", compress) 2874 resp, err := nc.RequestMsg(m, time.Second) 2875 if err != nil { 2876 t.Fatalf("Unexpected error: %v", err) 2877 } 2878 buf := resp.Data 2879 2880 // Make sure we have an encoding header. 2881 ce := resp.Header.Get("Content-Encoding") 2882 switch strings.ToLower(ce) { 2883 case "gzip": 2884 zr, err := gzip.NewReader(bytes.NewReader(buf)) 2885 if err != nil { 2886 t.Fatalf("Unexpected error: %v", err) 2887 } 2888 defer zr.Close() 2889 buf, err = io.ReadAll(zr) 2890 if err != nil && err != io.ErrUnexpectedEOF { 2891 t.Fatalf("Unexpected error: %v", err) 2892 } 2893 case "snappy", "s2": 2894 sr := s2.NewReader(bytes.NewReader(buf)) 2895 buf, err = io.ReadAll(sr) 2896 if err != nil && err != io.ErrUnexpectedEOF { 2897 t.Fatalf("Unexpected error: %v", err) 2898 } 2899 default: 2900 t.Fatalf("Unknown content-encoding of %q", ce) 2901 } 2902 2903 var cz ServerAPIConnzResponse 2904 if err := json.Unmarshal(buf, &cz); err != nil { 2905 t.Fatalf("Unexpected error: %v", err) 2906 } 2907 if cz.Error != nil { 2908 t.Fatalf("Unexpected error: %+v", cz.Error) 2909 } 2910 } 2911 2912 doRequest("gzip") 2913 doRequest("snappy") 2914 doRequest("s2") 2915 } 2916 2917 func TestNoRaceJetStreamClusterExtendedStreamPurge(t *testing.T) { 2918 for _, st := range []StorageType{FileStorage, MemoryStorage} { 2919 t.Run(st.String(), func(t *testing.T) { 2920 c := createJetStreamClusterExplicit(t, "JSC", 3) 2921 defer c.shutdown() 2922 2923 nc, js := jsClientConnect(t, c.randomServer()) 2924 defer nc.Close() 2925 2926 cfg := StreamConfig{ 2927 Name: "KV", 2928 Subjects: []string{"kv.>"}, 2929 Storage: st, 2930 Replicas: 2, 2931 MaxMsgsPer: 100, 2932 } 2933 req, err := json.Marshal(cfg) 2934 if err != nil { 2935 t.Fatalf("Unexpected error: %v", err) 2936 } 2937 // Do manually for now. 
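// The raw create below fires the API request without inspecting the reply (done "manually"
// presumably because the client-side StreamConfig did not yet cover this configuration). A
// sketch of checking the response, assuming the JSApiStreamCreateResponse type from this
// package:
//
//	resp, err := nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second)
//	require_NoError(t, err)
//	var scResp JSApiStreamCreateResponse
//	require_NoError(t, json.Unmarshal(resp.Data, &scResp))
//	if scResp.Error != nil {
//		t.Fatalf("Error creating stream: %+v", scResp.Error)
//	}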
2938 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 2939 c.waitOnStreamLeader("$G", "KV") 2940 2941 si, err := js.StreamInfo("KV") 2942 if err != nil { 2943 t.Fatalf("Unexpected error: %v", err) 2944 } 2945 if si == nil || si.Config.Name != "KV" { 2946 t.Fatalf("StreamInfo is not correct %+v", si) 2947 } 2948 2949 for i := 0; i < 1000; i++ { 2950 js.PublishAsync("kv.foo", []byte("OK")) // 1 * i 2951 js.PublishAsync("kv.bar", []byte("OK")) // 2 * i 2952 js.PublishAsync("kv.baz", []byte("OK")) // 3 * i 2953 } 2954 // First is 2700, last is 3000 2955 for i := 0; i < 700; i++ { 2956 js.PublishAsync(fmt.Sprintf("kv.%d", i+1), []byte("OK")) 2957 } 2958 // Now first is 2700, last is 3700 2959 select { 2960 case <-js.PublishAsyncComplete(): 2961 case <-time.After(10 * time.Second): 2962 t.Fatalf("Did not receive completion signal") 2963 } 2964 2965 si, err = js.StreamInfo("KV") 2966 if err != nil { 2967 t.Fatalf("Unexpected error: %v", err) 2968 } 2969 if si.State.Msgs != 1000 { 2970 t.Fatalf("Expected %d msgs, got %d", 1000, si.State.Msgs) 2971 } 2972 2973 shouldFail := func(preq *JSApiStreamPurgeRequest) { 2974 req, _ := json.Marshal(preq) 2975 resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second) 2976 if err != nil { 2977 t.Fatalf("Unexpected error: %v", err) 2978 } 2979 var pResp JSApiStreamPurgeResponse 2980 if err = json.Unmarshal(resp.Data, &pResp); err != nil { 2981 t.Fatalf("Unexpected error: %v", err) 2982 } 2983 if pResp.Success || pResp.Error == nil { 2984 t.Fatalf("Expected an error response but got none") 2985 } 2986 } 2987 2988 // Sequence and Keep should be mutually exclusive. 2989 shouldFail(&JSApiStreamPurgeRequest{Sequence: 10, Keep: 10}) 2990 2991 purge := func(preq *JSApiStreamPurgeRequest, newTotal uint64) { 2992 t.Helper() 2993 req, _ := json.Marshal(preq) 2994 resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second) 2995 if err != nil { 2996 t.Fatalf("Unexpected error: %v", err) 2997 } 2998 var pResp JSApiStreamPurgeResponse 2999 if err = json.Unmarshal(resp.Data, &pResp); err != nil { 3000 t.Fatalf("Unexpected error: %v", err) 3001 } 3002 if !pResp.Success || pResp.Error != nil { 3003 t.Fatalf("Got a bad response %+v", pResp) 3004 } 3005 si, err = js.StreamInfo("KV") 3006 if err != nil { 3007 t.Fatalf("Unexpected error: %v", err) 3008 } 3009 if si.State.Msgs != newTotal { 3010 t.Fatalf("Expected total after purge to be %d but got %d", newTotal, si.State.Msgs) 3011 } 3012 } 3013 expectLeft := func(subject string, expected uint64) { 3014 t.Helper() 3015 ci, err := js.AddConsumer("KV", &nats.ConsumerConfig{Durable: "dlc", FilterSubject: subject, AckPolicy: nats.AckExplicitPolicy}) 3016 if err != nil { 3017 t.Fatalf("Unexpected error: %v", err) 3018 } 3019 defer js.DeleteConsumer("KV", "dlc") 3020 if ci.NumPending != expected { 3021 t.Fatalf("Expected %d remaining but got %d", expected, ci.NumPending) 3022 } 3023 } 3024 3025 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo"}, 900) 3026 expectLeft("kv.foo", 0) 3027 3028 purge(&JSApiStreamPurgeRequest{Subject: "kv.bar", Keep: 1}, 801) 3029 expectLeft("kv.bar", 1) 3030 3031 purge(&JSApiStreamPurgeRequest{Subject: "kv.baz", Sequence: 2851}, 751) 3032 expectLeft("kv.baz", 50) 3033 3034 purge(&JSApiStreamPurgeRequest{Subject: "kv.*"}, 0) 3035 3036 // RESET 3037 js.DeleteStream("KV") 3038 // Do manually for now. 
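// A note on the purge arithmetic above: with MaxMsgsPer set to 100, each of kv.foo, kv.bar and
// kv.baz retains only its newest 100 messages, interleaved over roughly sequences 2701-3000 in
// steps of three. Purging kv.foo drops its 100 (900 left); Keep:1 on kv.bar retains only the
// newest matching message (801 left); Sequence:2851 on kv.baz removes matching messages strictly
// below sequence 2851, i.e. the 50 at 2703..2850, leaving 50 baz messages and 751 total.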
3039 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3040 c.waitOnStreamLeader("$G", "KV") 3041 3042 if _, err := js.StreamInfo("KV"); err != nil { 3043 t.Fatalf("Unexpected error: %v", err) 3044 } 3045 // Put in 100. 3046 for i := 0; i < 100; i++ { 3047 js.PublishAsync("kv.foo", []byte("OK")) 3048 } 3049 select { 3050 case <-js.PublishAsyncComplete(): 3051 case <-time.After(time.Second): 3052 t.Fatalf("Did not receive completion signal") 3053 } 3054 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10) 3055 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10) 3056 expectLeft("kv.foo", 10) 3057 3058 // RESET AGAIN 3059 js.DeleteStream("KV") 3060 // Do manually for now. 3061 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3062 c.waitOnStreamLeader("$G", "KV") 3063 3064 if _, err := js.StreamInfo("KV"); err != nil { 3065 t.Fatalf("Unexpected error: %v", err) 3066 } 3067 // Put in 100. 3068 for i := 0; i < 100; i++ { 3069 js.Publish("kv.foo", []byte("OK")) 3070 } 3071 purge(&JSApiStreamPurgeRequest{Keep: 10}, 10) 3072 expectLeft(">", 10) 3073 3074 // RESET AGAIN 3075 js.DeleteStream("KV") 3076 // Do manually for now. 3077 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3078 if _, err := js.StreamInfo("KV"); err != nil { 3079 t.Fatalf("Unexpected error: %v", err) 3080 } 3081 // Put in 100. 3082 for i := 0; i < 100; i++ { 3083 js.Publish("kv.foo", []byte("OK")) 3084 } 3085 purge(&JSApiStreamPurgeRequest{Sequence: 90}, 11) // Up to 90 so we keep that, hence the 11. 3086 expectLeft(">", 11) 3087 }) 3088 } 3089 } 3090 3091 func TestNoRaceJetStreamFileStoreCompaction(t *testing.T) { 3092 s := RunBasicJetStreamServer(t) 3093 defer s.Shutdown() 3094 3095 nc, js := jsClientConnect(t, s) 3096 defer nc.Close() 3097 3098 cfg := &nats.StreamConfig{ 3099 Name: "KV", 3100 Subjects: []string{"KV.>"}, 3101 MaxMsgsPerSubject: 1, 3102 } 3103 if _, err := js.AddStream(cfg); err != nil { 3104 t.Fatalf("Unexpected error: %v", err) 3105 } 3106 3107 toSend := 10_000 3108 data := make([]byte, 4*1024) 3109 crand.Read(data) 3110 3111 // First one. 3112 js.PublishAsync("KV.FM", data) 3113 3114 for i := 0; i < toSend; i++ { 3115 js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) 3116 } 3117 // Do again and overwrite the previous batch. 3118 for i := 0; i < toSend; i++ { 3119 js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) 3120 } 3121 select { 3122 case <-js.PublishAsyncComplete(): 3123 case <-time.After(10 * time.Second): 3124 t.Fatalf("Did not receive completion signal") 3125 } 3126 3127 // Now check by hand the utilization level. 
3128 mset, err := s.GlobalAccount().lookupStream("KV") 3129 if err != nil { 3130 t.Fatalf("Unexpected error: %v", err) 3131 } 3132 total, used, _ := mset.Store().Utilization() 3133 if pu := 100.0 * float32(used) / float32(total); pu < 80.0 { 3134 t.Fatalf("Utilization is less than 80%%, got %.2f", pu) 3135 } 3136 } 3137 3138 func TestNoRaceJetStreamEncryptionEnabledOnRestartWithExpire(t *testing.T) { 3139 conf := createConfFile(t, []byte(fmt.Sprintf(` 3140 listen: 127.0.0.1:-1 3141 jetstream { 3142 store_dir = %q 3143 } 3144 `, t.TempDir()))) 3145 3146 s, _ := RunServerWithConfig(conf) 3147 defer s.Shutdown() 3148 3149 config := s.JetStreamConfig() 3150 if config == nil { 3151 t.Fatalf("Expected config but got none") 3152 } 3153 defer removeDir(t, config.StoreDir) 3154 3155 nc, js := jsClientConnect(t, s) 3156 defer nc.Close() 3157 3158 toSend := 10_000 3159 3160 cfg := &nats.StreamConfig{ 3161 Name: "TEST", 3162 Subjects: []string{"foo", "bar"}, 3163 MaxMsgs: int64(toSend), 3164 } 3165 if _, err := js.AddStream(cfg); err != nil { 3166 t.Fatalf("Unexpected error: %v", err) 3167 } 3168 3169 data := make([]byte, 4*1024) // 4K payload 3170 crand.Read(data) 3171 3172 for i := 0; i < toSend; i++ { 3173 js.PublishAsync("foo", data) 3174 js.PublishAsync("bar", data) 3175 } 3176 select { 3177 case <-js.PublishAsyncComplete(): 3178 case <-time.After(5 * time.Second): 3179 t.Fatalf("Did not receive completion signal") 3180 } 3181 3182 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "dlc", AckPolicy: nats.AckExplicitPolicy}) 3183 if err != nil { 3184 t.Fatalf("Unexpected error: %v", err) 3185 } 3186 3187 // Restart 3188 nc.Close() 3189 s.Shutdown() 3190 3191 ncs := fmt.Sprintf("\nlisten: 127.0.0.1:-1\njetstream: {key: %q, store_dir: %q}\n", "s3cr3t!", config.StoreDir) 3192 conf = createConfFile(t, []byte(ncs)) 3193 3194 // Try to drain entropy to see if effects startup time. 3195 drain := make([]byte, 32*1024*1024) // Pull 32Mb of crypto rand. 3196 crand.Read(drain) 3197 3198 start := time.Now() 3199 s, _ = RunServerWithConfig(conf) 3200 defer s.Shutdown() 3201 dd := time.Since(start) 3202 if dd > 5*time.Second { 3203 t.Fatalf("Restart took longer than expected: %v", dd) 3204 } 3205 } 3206 3207 // This test was from Ivan K. and showed a bug in the filestore implementation. 3208 // This is skipped by default since it takes >40s to run. 3209 func TestNoRaceJetStreamOrderedConsumerMissingMsg(t *testing.T) { 3210 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
3211 skip(t) 3212 3213 s := RunBasicJetStreamServer(t) 3214 defer s.Shutdown() 3215 3216 nc, js := jsClientConnect(t, s) 3217 defer nc.Close() 3218 3219 if _, err := js.AddStream(&nats.StreamConfig{ 3220 Name: "benchstream", 3221 Subjects: []string{"testsubject"}, 3222 Replicas: 1, 3223 }); err != nil { 3224 t.Fatalf("add stream failed: %s", err) 3225 } 3226 3227 total := 1_000_000 3228 3229 numSubs := 10 3230 ch := make(chan struct{}, numSubs) 3231 wg := sync.WaitGroup{} 3232 wg.Add(numSubs) 3233 errCh := make(chan error, 1) 3234 for i := 0; i < numSubs; i++ { 3235 nc, js := jsClientConnect(t, s) 3236 defer nc.Close() 3237 go func(nc *nats.Conn, js nats.JetStreamContext) { 3238 defer wg.Done() 3239 received := 0 3240 _, err := js.Subscribe("testsubject", func(m *nats.Msg) { 3241 meta, _ := m.Metadata() 3242 if meta.Sequence.Consumer != meta.Sequence.Stream { 3243 nc.Close() 3244 errCh <- fmt.Errorf("Bad meta: %+v", meta) 3245 } 3246 received++ 3247 if received == total { 3248 ch <- struct{}{} 3249 } 3250 }, nats.OrderedConsumer()) 3251 if err != nil { 3252 select { 3253 case errCh <- fmt.Errorf("Error creating sub: %v", err): 3254 default: 3255 } 3256 3257 } 3258 }(nc, js) 3259 } 3260 wg.Wait() 3261 select { 3262 case e := <-errCh: 3263 t.Fatal(e) 3264 default: 3265 } 3266 3267 payload := make([]byte, 500) 3268 for i := 1; i <= total; i++ { 3269 js.PublishAsync("testsubject", payload) 3270 } 3271 select { 3272 case <-js.PublishAsyncComplete(): 3273 case <-time.After(10 * time.Second): 3274 t.Fatalf("Did not send all messages") 3275 } 3276 3277 // Now wait for consumers to be done: 3278 for i := 0; i < numSubs; i++ { 3279 select { 3280 case <-ch: 3281 case <-time.After(10 * time.Second): 3282 t.Fatal("Did not receive all messages for all consumers in time") 3283 } 3284 } 3285 3286 } 3287 3288 // Issue #2488 - Bad accounting, can not reproduce the stalled consumers after last several PRs. 3289 // Issue did show bug in ack logic for no-ack and interest based retention. 3290 func TestNoRaceJetStreamClusterInterestPolicyAckNone(t *testing.T) { 3291 for _, test := range []struct { 3292 name string 3293 durable string 3294 }{ 3295 {"durable", "dlc"}, 3296 {"ephemeral", _EMPTY_}, 3297 } { 3298 t.Run(test.name, func(t *testing.T) { 3299 c := createJetStreamClusterExplicit(t, "R3S", 3) 3300 defer c.shutdown() 3301 3302 // Client based API 3303 nc, js := jsClientConnect(t, c.randomServer()) 3304 defer nc.Close() 3305 3306 _, err := js.AddStream(&nats.StreamConfig{ 3307 Name: "cluster", 3308 Subjects: []string{"cluster.*"}, 3309 Retention: nats.InterestPolicy, 3310 Discard: nats.DiscardOld, 3311 Replicas: 3, 3312 }) 3313 if err != nil { 3314 t.Fatalf("Unexpected error: %v", err) 3315 } 3316 3317 var received uint32 3318 mh := func(m *nats.Msg) { 3319 atomic.AddUint32(&received, 1) 3320 } 3321 3322 opts := []nats.SubOpt{nats.DeliverNew(), nats.AckNone()} 3323 if test.durable != _EMPTY_ { 3324 opts = append(opts, nats.Durable(test.durable)) 3325 } 3326 _, err = js.Subscribe("cluster.created", mh, opts...) 3327 if err != nil { 3328 t.Fatalf("Unexpected error: %v", err) 3329 } 3330 3331 msg := []byte("ACK ME") 3332 const total = uint32(1_000) 3333 for i := 0; i < int(total); i++ { 3334 if _, err := js.Publish("cluster.created", msg); err != nil { 3335 t.Fatalf("Unexpected error: %v", err) 3336 } 3337 //time.Sleep(100 * time.Microsecond) 3338 } 3339 3340 // Wait for all messages to be received. 
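// checkFor is a polling helper from the shared test support code and is not defined in this
// file. A minimal sketch of the retry loop such a helper presumably implements (checkForSketch
// is an illustrative name, not the real function):
//
//	func checkForSketch(t *testing.T, totalWait, sleepDur time.Duration, f func() error) {
//		t.Helper()
//		var err error
//		deadline := time.Now().Add(totalWait)
//		for time.Now().Before(deadline) {
//			if err = f(); err == nil {
//				return
//			}
//			time.Sleep(sleepDur)
//		}
//		t.Fatalf("Condition not met after %v: %v", totalWait, err)
//	}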
3341 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 3342 r := atomic.LoadUint32(&received) 3343 if r == total { 3344 return nil 3345 } 3346 return fmt.Errorf("Received only %d out of %d", r, total) 3347 }) 3348 3349 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3350 si, err := js.StreamInfo("cluster") 3351 if err != nil { 3352 t.Fatalf("Error getting stream info: %v", err) 3353 } 3354 if si.State.Msgs != 0 { 3355 return fmt.Errorf("Expected no messages, got %d", si.State.Msgs) 3356 } 3357 return nil 3358 }) 3359 }) 3360 } 3361 } 3362 3363 // There was a bug in the filestore compact code that would cause a store 3364 // with JSExpectedLastSubjSeq to fail with "wrong last sequence: 0" 3365 func TestNoRaceJetStreamLastSubjSeqAndFilestoreCompact(t *testing.T) { 3366 s := RunBasicJetStreamServer(t) 3367 defer s.Shutdown() 3368 3369 // Client based API 3370 nc, js := jsClientConnect(t, s) 3371 defer nc.Close() 3372 3373 _, err := js.AddStream(&nats.StreamConfig{ 3374 Name: "MQTT_sess", 3375 Subjects: []string{"MQTT.sess.>"}, 3376 Storage: nats.FileStorage, 3377 Retention: nats.LimitsPolicy, 3378 Replicas: 1, 3379 MaxMsgsPerSubject: 1, 3380 }) 3381 if err != nil { 3382 t.Fatalf("Unexpected error: %v", err) 3383 } 3384 3385 firstPayload := make([]byte, 40) 3386 secondPayload := make([]byte, 380) 3387 for iter := 0; iter < 2; iter++ { 3388 for i := 0; i < 4000; i++ { 3389 subj := "MQTT.sess." + getHash(fmt.Sprintf("client_%d", i)) 3390 pa, err := js.Publish(subj, firstPayload) 3391 if err != nil { 3392 t.Fatalf("Error on publish: %v", err) 3393 } 3394 m := nats.NewMsg(subj) 3395 m.Data = secondPayload 3396 eseq := strconv.FormatInt(int64(pa.Sequence), 10) 3397 m.Header.Set(JSExpectedLastSubjSeq, eseq) 3398 if _, err := js.PublishMsg(m); err != nil { 3399 t.Fatalf("Error on publish (iter=%v seq=%v): %v", iter+1, pa.Sequence, err) 3400 } 3401 } 3402 } 3403 } 3404 3405 // Issue #2548 3406 func TestNoRaceJetStreamClusterMemoryStreamConsumerRaftGrowth(t *testing.T) { 3407 c := createJetStreamClusterExplicit(t, "R3S", 3) 3408 defer c.shutdown() 3409 3410 nc, js := jsClientConnect(t, c.randomServer()) 3411 defer nc.Close() 3412 3413 _, err := js.AddStream(&nats.StreamConfig{ 3414 Name: "memory-leak", 3415 Subjects: []string{"memory-leak"}, 3416 Retention: nats.LimitsPolicy, 3417 MaxMsgs: 1000, 3418 Discard: nats.DiscardOld, 3419 MaxAge: time.Minute, 3420 Storage: nats.MemoryStorage, 3421 Replicas: 3, 3422 }) 3423 if err != nil { 3424 t.Fatalf("Unexpected error: %v", err) 3425 } 3426 3427 _, err = js.QueueSubscribe("memory-leak", "q1", func(msg *nats.Msg) { 3428 time.Sleep(1 * time.Second) 3429 msg.AckSync() 3430 }) 3431 if err != nil { 3432 t.Fatalf("Unexpected error: %v", err) 3433 } 3434 3435 // Send 10k (Must be > 8192 which is compactNumMin from monitorConsumer. 
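// The 8192 figure referenced above is presumably compactNumMin, the minimum number of applied
// entries the consumer monitor requires before compacting its raft WAL; sending 10,000 messages
// therefore guarantees at least one compaction, which is what the memStore check below asserts
// (the in-memory WAL held to at most 8192 entries).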
3436 msg := []byte("NATS is a connective technology that powers modern distributed systems.") 3437 for i := 0; i < 10_000; i++ { 3438 if _, err := js.Publish("memory-leak", msg); err != nil { 3439 t.Fatalf("Unexpected error: %v", err) 3440 } 3441 } 3442 3443 // We will verify here that the underlying raft layer for the leader is not > 8192 3444 cl := c.consumerLeader("$G", "memory-leak", "q1") 3445 mset, err := cl.GlobalAccount().lookupStream("memory-leak") 3446 if err != nil { 3447 t.Fatalf("Unexpected error: %v", err) 3448 } 3449 o := mset.lookupConsumer("q1") 3450 if o == nil { 3451 t.Fatalf("Error looking up consumer %q", "q1") 3452 } 3453 node := o.raftNode().(*raft) 3454 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 3455 if ms := node.wal.(*memStore); ms.State().Msgs > 8192 { 3456 return fmt.Errorf("Did not compact the raft memory WAL") 3457 } 3458 return nil 3459 }) 3460 } 3461 3462 func TestNoRaceJetStreamClusterCorruptWAL(t *testing.T) { 3463 c := createJetStreamClusterExplicit(t, "R3S", 3) 3464 defer c.shutdown() 3465 3466 nc, js := jsClientConnect(t, c.randomServer()) 3467 defer nc.Close() 3468 3469 if _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Subjects: []string{"foo"}, Replicas: 3}); err != nil { 3470 t.Fatalf("Unexpected error: %v", err) 3471 } 3472 3473 sub, err := js.PullSubscribe("foo", "dlc") 3474 if err != nil { 3475 t.Fatalf("Unexpected error: %v", err) 3476 } 3477 3478 numMsgs := 1000 3479 for i := 0; i < numMsgs; i++ { 3480 js.PublishAsync("foo", []byte("WAL")) 3481 } 3482 select { 3483 case <-js.PublishAsyncComplete(): 3484 case <-time.After(5 * time.Second): 3485 t.Fatalf("Did not receive completion signal") 3486 } 3487 3488 for i, m := range fetchMsgs(t, sub, 200, 5*time.Second) { 3489 // Ack first 50 and every other even on after that.. 3490 if i < 50 || i%2 == 1 { 3491 m.AckSync() 3492 } 3493 } 3494 // Make sure acks processed. 3495 time.Sleep(200 * time.Millisecond) 3496 nc.Close() 3497 3498 // Check consumer consistency. 3499 checkConsumerWith := func(delivered, ackFloor uint64, ackPending int) { 3500 t.Helper() 3501 nc, js := jsClientConnect(t, c.randomServer()) 3502 defer nc.Close() 3503 3504 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3505 ci, err := js.ConsumerInfo("TEST", "dlc") 3506 if err != nil { 3507 return fmt.Errorf("Unexpected error: %v", err) 3508 } 3509 if ci.Delivered.Consumer != ci.Delivered.Stream || ci.Delivered.Consumer != delivered { 3510 return fmt.Errorf("Expected %d for delivered, got %+v", delivered, ci.Delivered) 3511 } 3512 if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != ackFloor { 3513 return fmt.Errorf("Expected %d for ack floor, got %+v", ackFloor, ci.AckFloor) 3514 } 3515 nm := uint64(numMsgs) 3516 if ci.NumPending != nm-delivered { 3517 return fmt.Errorf("Expected num pending to be %d, got %d", nm-delivered, ci.NumPending) 3518 } 3519 if ci.NumAckPending != ackPending { 3520 return fmt.Errorf("Expected num ack pending to be %d, got %d", ackPending, ci.NumAckPending) 3521 } 3522 return nil 3523 }) 3524 } 3525 3526 checkConsumer := func() { 3527 t.Helper() 3528 checkConsumerWith(200, 50, 75) 3529 } 3530 3531 checkConsumer() 3532 3533 // Grab the consumer leader. 
3534 cl := c.consumerLeader("$G", "TEST", "dlc") 3535 mset, err := cl.GlobalAccount().lookupStream("TEST") 3536 if err != nil { 3537 t.Fatalf("Unexpected error: %v", err) 3538 } 3539 o := mset.lookupConsumer("dlc") 3540 if o == nil { 3541 t.Fatalf("Error looking up consumer %q", "dlc") 3542 } 3543 // Grab underlying raft node and the WAL (filestore) and we will attempt to "corrupt" it. 3544 node := o.raftNode().(*raft) 3545 // We are doing a stop here to prevent the internal consumer snapshot from happening on exit 3546 node.Stop() 3547 fs := node.wal.(*fileStore) 3548 fcfg, cfg := fs.fcfg, fs.cfg.StreamConfig 3549 // Stop all the servers. 3550 c.stopAll() 3551 3552 // Manipulate directly with cluster down. 3553 fs, err = newFileStore(fcfg, cfg) 3554 if err != nil { 3555 t.Fatalf("Unexpected error: %v", err) 3556 } 3557 state := fs.State() 3558 sm, err := fs.LoadMsg(state.LastSeq, nil) 3559 if err != nil { 3560 t.Fatalf("Unexpected error: %v", err) 3561 } 3562 ae, err := node.decodeAppendEntry(sm.msg, nil, _EMPTY_) 3563 if err != nil { 3564 t.Fatalf("Unexpected error: %v", err) 3565 } 3566 3567 dentry := func(dseq, sseq, dc uint64, ts int64) []byte { 3568 b := make([]byte, 4*binary.MaxVarintLen64+1) 3569 b[0] = byte(updateDeliveredOp) 3570 n := 1 3571 n += binary.PutUvarint(b[n:], dseq) 3572 n += binary.PutUvarint(b[n:], sseq) 3573 n += binary.PutUvarint(b[n:], dc) 3574 n += binary.PutVarint(b[n:], ts) 3575 return b[:n] 3576 } 3577 3578 // Let's put a non-contigous AppendEntry into the system. 3579 ae.pindex += 10 3580 // Add in delivered record. 3581 ae.entries = []*Entry{{EntryNormal, dentry(1000, 1000, 1, time.Now().UnixNano())}} 3582 encoded, err := ae.encode(nil) 3583 if err != nil { 3584 t.Fatalf("Unexpected error: %v", err) 3585 } 3586 if _, _, err := fs.StoreMsg(_EMPTY_, nil, encoded); err != nil { 3587 t.Fatalf("Unexpected error: %v", err) 3588 } 3589 fs.Stop() 3590 3591 c.restartAllSamePorts() 3592 c.waitOnStreamLeader("$G", "TEST") 3593 c.waitOnConsumerLeader("$G", "TEST", "dlc") 3594 3595 checkConsumer() 3596 3597 // Now we will truncate out the WAL out from underneath the leader. 3598 // Grab the consumer leader. 3599 3600 nc, js = jsClientConnect(t, c.randomServer()) 3601 defer nc.Close() 3602 3603 cl = c.consumerLeader("$G", "TEST", "dlc") 3604 mset, err = cl.GlobalAccount().lookupStream("TEST") 3605 require_NoError(t, err) 3606 o = mset.lookupConsumer("dlc") 3607 require_NoError(t, err) 3608 3609 // Grab underlying raft node and the WAL (filestore) and truncate it. 3610 // This will simulate the WAL losing state due to truncate and we want to make sure it recovers. 3611 3612 fs = o.raftNode().(*raft).wal.(*fileStore) 3613 state = fs.State() 3614 err = fs.Truncate(state.FirstSeq) 3615 require_True(t, err == nil || err == ErrInvalidSequence) 3616 state = fs.State() 3617 3618 sub, err = js.PullSubscribe("foo", "dlc") 3619 require_NoError(t, err) 3620 3621 // This will cause us to stepdown and truncate our WAL. 3622 sub.Fetch(100) 3623 c.waitOnConsumerLeader("$G", "TEST", "dlc") 3624 // We can't trust the results sans that we have a leader back in place and the ackFloor. 
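// For reference, the dentry helper above packs an updateDeliveredOp record as a single op byte
// followed by three uvarints (delivered seq, stream seq, delivery count) and a varint timestamp.
// A sketch of the inverse decode (decodeDelivered is illustrative, not an existing server
// function):
//
//	decodeDelivered := func(b []byte) (dseq, sseq, dc uint64, ts int64) {
//		n := 1 // skip the leading op byte (updateDeliveredOp)
//		var k int
//		dseq, k = binary.Uvarint(b[n:])
//		n += k
//		sseq, k = binary.Uvarint(b[n:])
//		n += k
//		dc, k = binary.Uvarint(b[n:])
//		n += k
//		ts, _ = binary.Varint(b[n:])
//		return dseq, sseq, dc, ts
//	}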
3625 ci, err := js.ConsumerInfo("TEST", "dlc") 3626 require_NoError(t, err) 3627 if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != 50 { 3628 t.Fatalf("Expected %d for ack floor, got %+v", 50, ci.AckFloor) 3629 } 3630 } 3631 3632 func TestNoRaceJetStreamClusterInterestRetentionDeadlock(t *testing.T) { 3633 c := createJetStreamClusterExplicit(t, "R3S", 3) 3634 defer c.shutdown() 3635 3636 // Client based API 3637 s := c.randomServer() 3638 nc, js := jsClientConnect(t, s) 3639 defer nc.Close() 3640 3641 // This can trigger deadlock with current architecture. 3642 // If stream is !limitsRetention and consumer is DIRECT and ack none we will try to place the msg seq 3643 // onto a chan for the stream to consider removing. All conditions above must hold to trigger. 3644 3645 // We will attempt to trigger here with a stream mirror setup which uses an R=1 DIRECT consumer to replicate msgs. 3646 _, err := js.AddStream(&nats.StreamConfig{Name: "S", Retention: nats.InterestPolicy, Storage: nats.MemoryStorage}) 3647 if err != nil { 3648 t.Fatalf("Unexpected error: %v", err) 3649 } 3650 3651 // Create a mirror which will create the consumer profile to trigger. 3652 _, err = js.AddStream(&nats.StreamConfig{Name: "M", Mirror: &nats.StreamSource{Name: "S"}}) 3653 if err != nil { 3654 t.Fatalf("Unexpected error: %v", err) 3655 } 3656 3657 // Queue up a lot of messages. 3658 numRequests := 20_000 3659 for i := 0; i < numRequests; i++ { 3660 js.PublishAsync("S", []byte("Q")) 3661 } 3662 select { 3663 case <-js.PublishAsyncComplete(): 3664 case <-time.After(5 * time.Second): 3665 t.Fatalf("Did not receive completion signal") 3666 } 3667 3668 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3669 si, err := js.StreamInfo("S") 3670 if err != nil { 3671 t.Fatalf("Unexpected error: %v", err) 3672 } 3673 if si.State.Msgs != 0 { 3674 return fmt.Errorf("Expected 0 msgs, got state: %+v", si.State) 3675 } 3676 return nil 3677 }) 3678 } 3679 3680 func TestNoRaceJetStreamClusterMaxConsumersAndDirect(t *testing.T) { 3681 c := createJetStreamClusterExplicit(t, "R3S", 3) 3682 defer c.shutdown() 3683 3684 // Client based API 3685 s := c.randomServer() 3686 nc, js := jsClientConnect(t, s) 3687 defer nc.Close() 3688 3689 // We want to make sure max consumer limits do not affect mirrors or sources, etc. 3690 _, err := js.AddStream(&nats.StreamConfig{Name: "S", Storage: nats.MemoryStorage, MaxConsumers: 1}) 3691 if err != nil { 3692 t.Fatalf("Unexpected error: %v", err) 3693 } 3694 3695 var mirrors []string 3696 for i := 0; i < 10; i++ { 3697 // Create a mirror. 3698 mname := fmt.Sprintf("M-%d", i+1) 3699 mirrors = append(mirrors, mname) 3700 _, err = js.AddStream(&nats.StreamConfig{Name: mname, Mirror: &nats.StreamSource{Name: "S"}}) 3701 if err != nil { 3702 t.Fatalf("Unexpected error: %v", err) 3703 } 3704 } 3705 3706 // Queue up messages. 3707 numRequests := 20 3708 for i := 0; i < numRequests; i++ { 3709 js.Publish("S", []byte("Q")) 3710 } 3711 3712 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3713 for _, mname := range mirrors { 3714 si, err := js.StreamInfo(mname) 3715 if err != nil { 3716 t.Fatalf("Unexpected error: %v", err) 3717 } 3718 if si.State.Msgs != uint64(numRequests) { 3719 return fmt.Errorf("Expected %d msgs for %q, got state: %+v", numRequests, mname, si.State) 3720 } 3721 } 3722 return nil 3723 }) 3724 } 3725 3726 // Make sure when we try to hard reset a stream state in a cluster that we also re-create the consumers. 
3727 func TestNoRaceJetStreamClusterStreamReset(t *testing.T) { 3728 // Speed up raft 3729 omin, omax, ohb := minElectionTimeout, maxElectionTimeout, hbInterval 3730 minElectionTimeout = 250 * time.Millisecond 3731 maxElectionTimeout = time.Second 3732 hbInterval = 50 * time.Millisecond 3733 defer func() { 3734 minElectionTimeout = omin 3735 maxElectionTimeout = omax 3736 hbInterval = ohb 3737 }() 3738 3739 c := createJetStreamClusterExplicit(t, "R3S", 3) 3740 defer c.shutdown() 3741 3742 // Client based API 3743 s := c.randomServer() 3744 nc, js := jsClientConnect(t, s) 3745 defer nc.Close() 3746 3747 _, err := js.AddStream(&nats.StreamConfig{ 3748 Name: "TEST", 3749 Subjects: []string{"foo.*"}, 3750 Replicas: 2, 3751 Retention: nats.WorkQueuePolicy, 3752 }) 3753 if err != nil { 3754 t.Fatalf("Unexpected error: %v", err) 3755 } 3756 3757 numRequests := 20 3758 for i := 0; i < numRequests; i++ { 3759 js.Publish("foo.created", []byte("REQ")) 3760 } 3761 3762 // Durable. 3763 sub, err := js.SubscribeSync("foo.created", nats.Durable("d1")) 3764 if err != nil { 3765 t.Fatalf("Unexpected error: %v", err) 3766 } 3767 defer sub.Unsubscribe() 3768 3769 si, err := js.StreamInfo("TEST") 3770 require_NoError(t, err) 3771 require_True(t, si.State.Msgs == uint64(numRequests)) 3772 3773 // Let settle a bit for Go routine checks. 3774 time.Sleep(500 * time.Millisecond) 3775 3776 // Grab number go routines. 3777 base := runtime.NumGoroutine() 3778 3779 // Make the consumer busy here by async sending a bunch of messages. 3780 for i := 0; i < numRequests*10; i++ { 3781 js.PublishAsync("foo.created", []byte("REQ")) 3782 } 3783 3784 // Grab a server that is the consumer leader for the durable. 3785 cl := c.consumerLeader("$G", "TEST", "d1") 3786 mset, err := cl.GlobalAccount().lookupStream("TEST") 3787 if err != nil { 3788 t.Fatalf("Unexpected error: %v", err) 3789 } 3790 // Do a hard reset here by hand. 3791 mset.resetClusteredState(nil) 3792 3793 // Wait til we have the consumer leader re-elected. 3794 c.waitOnConsumerLeader("$G", "TEST", "d1") 3795 3796 // Make sure we can get the consumer info eventually. 3797 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 3798 _, err := js.ConsumerInfo("TEST", "d1", nats.MaxWait(250*time.Millisecond)) 3799 return err 3800 }) 3801 3802 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 3803 if after := runtime.NumGoroutine(); base > after { 3804 return fmt.Errorf("Expected %d go routines, got %d", base, after) 3805 } 3806 return nil 3807 }) 3808 3809 // Simulate a low level write error on our consumer and make sure we can recover etc. 3810 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 3811 if cl = c.consumerLeader("$G", "TEST", "d1"); cl != nil { 3812 return nil 3813 } 3814 return errors.New("waiting on consumer leader") 3815 }) 3816 3817 mset, err = cl.GlobalAccount().lookupStream("TEST") 3818 if err != nil { 3819 t.Fatalf("Unexpected error: %v", err) 3820 } 3821 o := mset.lookupConsumer("d1") 3822 if o == nil { 3823 t.Fatalf("Did not retrieve consumer") 3824 } 3825 node := o.raftNode().(*raft) 3826 if node == nil { 3827 t.Fatalf("could not retrieve the raft node for consumer") 3828 } 3829 3830 nc.Close() 3831 node.setWriteErr(io.ErrShortWrite) 3832 3833 c.stopAll() 3834 c.restartAll() 3835 3836 c.waitOnStreamLeader("$G", "TEST") 3837 c.waitOnConsumerLeader("$G", "TEST", "d1") 3838 } 3839 3840 // Reports of high cpu on compaction for a KV store. 
3841 func TestNoRaceJetStreamKeyValueCompaction(t *testing.T) { 3842 c := createJetStreamClusterExplicit(t, "R3S", 3) 3843 defer c.shutdown() 3844 3845 // Client based API 3846 nc, js := jsClientConnect(t, c.randomServer()) 3847 defer nc.Close() 3848 3849 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 3850 Bucket: "COMPACT", 3851 Replicas: 3, 3852 }) 3853 if err != nil { 3854 t.Fatalf("Unexpected error: %v", err) 3855 } 3856 3857 value := strings.Repeat("A", 128*1024) 3858 for i := 0; i < 5_000; i++ { 3859 key := fmt.Sprintf("K-%d", rand.Intn(256)+1) 3860 if _, err := kv.PutString(key, value); err != nil { 3861 t.Fatalf("Unexpected error: %v", err) 3862 } 3863 } 3864 } 3865 3866 // Trying to recreate an issue rip saw with KV and server restarts complaining about 3867 // mismatch for a few minutes and growing memory. 3868 func TestNoRaceJetStreamClusterStreamSeqMismatchIssue(t *testing.T) { 3869 c := createJetStreamClusterExplicit(t, "R3S", 3) 3870 defer c.shutdown() 3871 3872 // Client based API 3873 nc, js := jsClientConnect(t, c.randomServer()) 3874 defer nc.Close() 3875 3876 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 3877 Bucket: "MM", 3878 Replicas: 3, 3879 TTL: 500 * time.Millisecond, 3880 }) 3881 require_NoError(t, err) 3882 3883 for i := 1; i <= 10; i++ { 3884 if _, err := kv.PutString("k", "1"); err != nil { 3885 t.Fatalf("Unexpected error: %v", err) 3886 } 3887 } 3888 // Close in case we are connected here. Will recreate. 3889 nc.Close() 3890 3891 // Shutdown a non-leader. 3892 s := c.randomNonStreamLeader("$G", "KV_MM") 3893 s.Shutdown() 3894 3895 nc, js = jsClientConnect(t, c.randomServer()) 3896 defer nc.Close() 3897 3898 kv, err = js.KeyValue("MM") 3899 require_NoError(t, err) 3900 3901 // Now change the state of the stream such that we have to do a compact upon restart 3902 // of the downed server. 3903 for i := 1; i <= 10; i++ { 3904 if _, err := kv.PutString("k", "2"); err != nil { 3905 t.Fatalf("Unexpected error: %v", err) 3906 } 3907 } 3908 3909 // Raft could save us here so need to run a compact on the leader. 3910 snapshotLeader := func() { 3911 sl := c.streamLeader("$G", "KV_MM") 3912 if sl == nil { 3913 t.Fatalf("Did not get the leader") 3914 } 3915 mset, err := sl.GlobalAccount().lookupStream("KV_MM") 3916 require_NoError(t, err) 3917 node := mset.raftNode() 3918 if node == nil { 3919 t.Fatalf("Could not get stream group") 3920 } 3921 if err := node.InstallSnapshot(mset.stateSnapshot()); err != nil { 3922 t.Fatalf("Error installing snapshot: %v", err) 3923 } 3924 } 3925 3926 // Now wait for expiration 3927 time.Sleep(time.Second) 3928 3929 snapshotLeader() 3930 3931 s = c.restartServer(s) 3932 c.waitOnServerCurrent(s) 3933 3934 // We want to make sure we do not reset the raft state on a catchup due to no request yield. 3935 // Bug was if we did not actually request any help from snapshot we did not set mset.lseq properly. 3936 // So when we send next batch that would cause raft reset due to cluster reset for our stream. 3937 mset, err := s.GlobalAccount().lookupStream("KV_MM") 3938 require_NoError(t, err) 3939 3940 for i := 1; i <= 10; i++ { 3941 if _, err := kv.PutString("k1", "X"); err != nil { 3942 t.Fatalf("Unexpected error: %v", err) 3943 } 3944 } 3945 3946 c.waitOnStreamCurrent(s, "$G", "KV_MM") 3947 3948 // Make sure we did not reset our stream. 
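// The pointer comparison below is the actual assertion: a reset would have replaced the
// stream with a freshly created struct, so lookupStream would return something different
// from the value captured before the last batch of writes.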
3949 msetNew, err := s.GlobalAccount().lookupStream("KV_MM") 3950 require_NoError(t, err) 3951 if msetNew != mset { 3952 t.Fatalf("Stream was reset") 3953 } 3954 } 3955 3956 func TestNoRaceJetStreamClusterStreamDropCLFS(t *testing.T) { 3957 c := createJetStreamClusterExplicit(t, "R3S", 3) 3958 defer c.shutdown() 3959 3960 // Client based API 3961 nc, js := jsClientConnect(t, c.randomServer()) 3962 defer nc.Close() 3963 3964 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 3965 Bucket: "CLFS", 3966 Replicas: 3, 3967 }) 3968 require_NoError(t, err) 3969 3970 // Will work 3971 _, err = kv.Create("k.1", []byte("X")) 3972 require_NoError(t, err) 3973 // Drive up CLFS state on leader. 3974 for i := 0; i < 10; i++ { 3975 _, err = kv.Create("k.1", []byte("X")) 3976 require_Error(t, err) 3977 } 3978 // Bookend with new key success. 3979 _, err = kv.Create("k.2", []byte("Z")) 3980 require_NoError(t, err) 3981 3982 // Close in case we are connected here. Will recreate. 3983 nc.Close() 3984 3985 // Shutdown, which will also clear clfs. 3986 s := c.randomNonStreamLeader("$G", "KV_CLFS") 3987 s.Shutdown() 3988 3989 nc, js = jsClientConnect(t, c.randomServer()) 3990 defer nc.Close() 3991 3992 kv, err = js.KeyValue("CLFS") 3993 require_NoError(t, err) 3994 3995 // Drive up CLFS state on leader. 3996 for i := 0; i < 10; i++ { 3997 _, err = kv.Create("k.1", []byte("X")) 3998 require_Error(t, err) 3999 } 4000 4001 sl := c.streamLeader("$G", "KV_CLFS") 4002 if sl == nil { 4003 t.Fatalf("Did not get the leader") 4004 } 4005 mset, err := sl.GlobalAccount().lookupStream("KV_CLFS") 4006 require_NoError(t, err) 4007 node := mset.raftNode() 4008 if node == nil { 4009 t.Fatalf("Could not get stream group") 4010 } 4011 if err := node.InstallSnapshot(mset.stateSnapshot()); err != nil { 4012 t.Fatalf("Error installing snapshot: %v", err) 4013 } 4014 4015 _, err = kv.Create("k.3", []byte("ZZZ")) 4016 require_NoError(t, err) 4017 4018 s = c.restartServer(s) 4019 c.waitOnServerCurrent(s) 4020 4021 mset, err = s.GlobalAccount().lookupStream("KV_CLFS") 4022 require_NoError(t, err) 4023 4024 _, err = kv.Create("k.4", []byte("YYY")) 4025 require_NoError(t, err) 4026 4027 c.waitOnStreamCurrent(s, "$G", "KV_CLFS") 4028 4029 // Make sure we did not reset our stream. 4030 msetNew, err := s.GlobalAccount().lookupStream("KV_CLFS") 4031 require_NoError(t, err) 4032 if msetNew != mset { 4033 t.Fatalf("Stream was reset") 4034 } 4035 } 4036 4037 func TestNoRaceJetStreamMemstoreWithLargeInteriorDeletes(t *testing.T) { 4038 s := RunBasicJetStreamServer(t) 4039 defer s.Shutdown() 4040 4041 // Client for API requests. 
4042 nc, js := jsClientConnect(t, s) 4043 defer nc.Close() 4044 4045 _, err := js.AddStream(&nats.StreamConfig{ 4046 Name: "TEST", 4047 Subjects: []string{"foo", "bar"}, 4048 MaxMsgsPerSubject: 1, 4049 Storage: nats.MemoryStorage, 4050 }) 4051 require_NoError(t, err) 4052 4053 acc, err := s.lookupAccount("$G") 4054 require_NoError(t, err) 4055 mset, err := acc.lookupStream("TEST") 4056 require_NoError(t, err) 4057 4058 msg := []byte("Hello World!") 4059 if _, err := js.PublishAsync("foo", msg); err != nil { 4060 t.Fatalf("Unexpected publish error: %v", err) 4061 } 4062 for i := 1; i <= 1_000_000; i++ { 4063 if _, err := js.PublishAsync("bar", msg); err != nil { 4064 t.Fatalf("Unexpected publish error: %v", err) 4065 } 4066 } 4067 select { 4068 case <-js.PublishAsyncComplete(): 4069 case <-time.After(5 * time.Second): 4070 t.Fatalf("Did not receive completion signal") 4071 } 4072 4073 now := time.Now() 4074 ss := mset.stateWithDetail(true) 4075 // Before the fix the snapshot for this test would be > 200ms on my setup. 4076 if elapsed := time.Since(now); elapsed > 100*time.Millisecond { 4077 t.Fatalf("Took too long to snapshot: %v", elapsed) 4078 } else if elapsed > 50*time.Millisecond { 4079 t.Logf("WRN: Took longer than usual to snapshot: %v", elapsed) 4080 } 4081 4082 if ss.Msgs != 2 || ss.FirstSeq != 1 || ss.LastSeq != 1_000_001 || ss.NumDeleted != 999999 { 4083 // To not print out on error. 4084 ss.Deleted = nil 4085 t.Fatalf("Bad State: %+v", ss) 4086 } 4087 } 4088 4089 // This is related to an issue reported where we were exhausting threads by trying to 4090 // cleanup too many consumers at the same time. 4091 // https://github.com/nats-io/nats-server/issues/2742 4092 func TestNoRaceJetStreamConsumerFileStoreConcurrentDiskIO(t *testing.T) { 4093 storeDir := t.TempDir() 4094 4095 // Artificially adjust our environment for this test. 4096 gmp := runtime.GOMAXPROCS(32) 4097 defer runtime.GOMAXPROCS(gmp) 4098 4099 maxT := debug.SetMaxThreads(1050) // 1024 now 4100 defer debug.SetMaxThreads(maxT) 4101 4102 fs, err := newFileStore(FileStoreConfig{StoreDir: storeDir}, StreamConfig{Name: "MT", Storage: FileStorage}) 4103 require_NoError(t, err) 4104 defer fs.Stop() 4105 4106 startCh := make(chan bool) 4107 var wg sync.WaitGroup 4108 var swg sync.WaitGroup 4109 4110 ts := time.Now().UnixNano() 4111 4112 // Create 1000 consumerStores 4113 n := 1000 4114 swg.Add(n) 4115 4116 for i := 1; i <= n; i++ { 4117 name := fmt.Sprintf("o%d", i) 4118 o, err := fs.ConsumerStore(name, &ConsumerConfig{AckPolicy: AckExplicit}) 4119 require_NoError(t, err) 4120 wg.Add(1) 4121 swg.Done() 4122 4123 go func() { 4124 defer wg.Done() 4125 // Will make everyone run concurrently. 
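// All 1000 goroutines park here until startCh is closed; swg guarantees they have all been
// created first, so the UpdateDelivered/writeState/Delete calls below hit the consumer file
// stores at the same time, which is the pattern that previously exhausted OS threads
// (hence the SetMaxThreads(1050) guard above).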
4126 <-startCh 4127 o.UpdateDelivered(22, 22, 1, ts) 4128 buf, _ := o.(*consumerFileStore).encodeState() 4129 o.(*consumerFileStore).writeState(buf) 4130 o.Delete() 4131 }() 4132 } 4133 4134 swg.Wait() 4135 close(startCh) 4136 wg.Wait() 4137 } 4138 4139 func TestNoRaceJetStreamClusterHealthz(t *testing.T) { 4140 c := createJetStreamCluster(t, jsClusterAccountsTempl, "HZ", _EMPTY_, 3, 23033, true) 4141 defer c.shutdown() 4142 4143 nc1, js1 := jsClientConnect(t, c.randomServer(), nats.UserInfo("one", "p")) 4144 defer nc1.Close() 4145 4146 nc2, js2 := jsClientConnect(t, c.randomServer(), nats.UserInfo("two", "p")) 4147 defer nc2.Close() 4148 4149 var err error 4150 for _, sname := range []string{"foo", "bar", "baz"} { 4151 _, err = js1.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 4152 require_NoError(t, err) 4153 _, err = js2.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 4154 require_NoError(t, err) 4155 } 4156 // R1 4157 _, err = js1.AddStream(&nats.StreamConfig{Name: "r1", Replicas: 1}) 4158 require_NoError(t, err) 4159 4160 // Now shutdown then send a bunch of data. 4161 s := c.servers[0] 4162 s.Shutdown() 4163 4164 for i := 0; i < 5_000; i++ { 4165 _, err = js1.PublishAsync("foo", []byte("OK")) 4166 require_NoError(t, err) 4167 _, err = js2.PublishAsync("bar", []byte("OK")) 4168 require_NoError(t, err) 4169 } 4170 select { 4171 case <-js1.PublishAsyncComplete(): 4172 case <-time.After(5 * time.Second): 4173 t.Fatalf("Did not receive completion signal") 4174 } 4175 select { 4176 case <-js2.PublishAsyncComplete(): 4177 case <-time.After(5 * time.Second): 4178 t.Fatalf("Did not receive completion signal") 4179 } 4180 4181 s = c.restartServer(s) 4182 opts := s.getOpts() 4183 opts.HTTPHost = "127.0.0.1" 4184 opts.HTTPPort = 11222 4185 err = s.StartMonitoring() 4186 require_NoError(t, err) 4187 url := fmt.Sprintf("http://127.0.0.1:%d/healthz", opts.HTTPPort) 4188 4189 getHealth := func() (int, *HealthStatus) { 4190 resp, err := http.Get(url) 4191 require_NoError(t, err) 4192 defer resp.Body.Close() 4193 body, err := io.ReadAll(resp.Body) 4194 require_NoError(t, err) 4195 var hs HealthStatus 4196 err = json.Unmarshal(body, &hs) 4197 require_NoError(t, err) 4198 return resp.StatusCode, &hs 4199 } 4200 4201 errors := 0 4202 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 4203 code, hs := getHealth() 4204 if code >= 200 && code < 300 { 4205 return nil 4206 } 4207 errors++ 4208 return fmt.Errorf("Got %d status with %+v", code, hs) 4209 }) 4210 if errors == 0 { 4211 t.Fatalf("Expected to have some errors until we became current, got none") 4212 } 4213 } 4214 4215 // Test that we can receive larger messages with stream subject details. 4216 // Also test that we will fail at some point and the user can fall back to 4217 // an orderedconsumer like we do with watch for KV Keys() call. 4218 func TestNoRaceJetStreamStreamInfoSubjectDetailsLimits(t *testing.T) { 4219 conf := createConfFile(t, []byte(fmt.Sprintf(` 4220 listen: 127.0.0.1:-1 4221 jetstream { 4222 store_dir = %q 4223 } 4224 accounts: { 4225 default: { 4226 jetstream: true 4227 users: [ {user: me, password: pwd} ] 4228 limits { max_payload: 512 } 4229 } 4230 } 4231 `, t.TempDir()))) 4232 4233 s, _ := RunServerWithConfig(conf) 4234 if config := s.JetStreamConfig(); config != nil { 4235 defer removeDir(t, config.StoreDir) 4236 } 4237 defer s.Shutdown() 4238 4239 nc, js := jsClientConnect(t, s, nats.UserInfo("me", "pwd")) 4240 defer nc.Close() 4241 4242 // Make sure to flush so we process the 2nd INFO. 
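// The flush (presumably) ensures the client has processed the follow-up INFO carrying the
// account's max_payload of 512, so the oversized publish below fails locally with
// nats.ErrMaxPayload rather than being dropped by the server.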
4243 nc.Flush() 4244 4245 // Make sure we cannot send larger than 512 bytes. 4246 // But we can receive larger. 4247 sub, err := nc.SubscribeSync("foo") 4248 require_NoError(t, err) 4249 err = nc.Publish("foo", []byte(strings.Repeat("A", 600))) 4250 require_Error(t, err, nats.ErrMaxPayload) 4251 sub.Unsubscribe() 4252 4253 _, err = js.AddStream(&nats.StreamConfig{ 4254 Name: "TEST", 4255 Subjects: []string{"*", "X.*"}, 4256 }) 4257 require_NoError(t, err) 4258 4259 n := JSMaxSubjectDetails 4260 for i := 0; i < n; i++ { 4261 _, err := js.PublishAsync(fmt.Sprintf("X.%d", i), []byte("OK")) 4262 require_NoError(t, err) 4263 } 4264 select { 4265 case <-js.PublishAsyncComplete(): 4266 case <-time.After(5 * time.Second): 4267 t.Fatalf("Did not receive completion signal") 4268 } 4269 4270 // Need to grab StreamInfo by hand for now. 4271 req, err := json.Marshal(&JSApiStreamInfoRequest{SubjectsFilter: "X.*"}) 4272 require_NoError(t, err) 4273 resp, err := nc.Request(fmt.Sprintf(JSApiStreamInfoT, "TEST"), req, 5*time.Second) 4274 require_NoError(t, err) 4275 var si StreamInfo 4276 err = json.Unmarshal(resp.Data, &si) 4277 require_NoError(t, err) 4278 if len(si.State.Subjects) != n { 4279 t.Fatalf("Expected to get %d subject details, got %d", n, len(si.State.Subjects)) 4280 } 4281 4282 // Now add one more message to check pagination 4283 _, err = js.Publish("foo", []byte("TOO MUCH")) 4284 require_NoError(t, err) 4285 4286 req, err = json.Marshal(&JSApiStreamInfoRequest{ApiPagedRequest: ApiPagedRequest{Offset: n}, SubjectsFilter: nats.AllKeys}) 4287 require_NoError(t, err) 4288 resp, err = nc.Request(fmt.Sprintf(JSApiStreamInfoT, "TEST"), req, 5*time.Second) 4289 require_NoError(t, err) 4290 var sir JSApiStreamInfoResponse 4291 err = json.Unmarshal(resp.Data, &sir) 4292 require_NoError(t, err) 4293 if len(sir.State.Subjects) != 1 { 4294 t.Fatalf("Expected to get 1 extra subject detail, got %d", len(sir.State.Subjects)) 4295 } 4296 } 4297 4298 func TestNoRaceJetStreamSparseConsumers(t *testing.T) { 4299 s := RunBasicJetStreamServer(t) 4300 defer s.Shutdown() 4301 4302 nc, js := jsClientConnect(t, s) 4303 defer nc.Close() 4304 4305 msg := []byte("ok") 4306 4307 cases := []struct { 4308 name string 4309 mconfig *nats.StreamConfig 4310 }{ 4311 {"MemoryStore", &nats.StreamConfig{Name: "TEST", Storage: nats.MemoryStorage, MaxMsgsPerSubject: 25_000_000, 4312 Subjects: []string{"*"}}}, 4313 {"FileStore", &nats.StreamConfig{Name: "TEST", Storage: nats.FileStorage, MaxMsgsPerSubject: 25_000_000, 4314 Subjects: []string{"*"}}}, 4315 } 4316 for _, c := range cases { 4317 t.Run(c.name, func(t *testing.T) { 4318 js.DeleteStream("TEST") 4319 _, err := js.AddStream(c.mconfig) 4320 require_NoError(t, err) 4321 4322 // We will purposely place foo msgs near the beginning, then in middle, then at the end. 4323 for n := 0; n < 2; n++ { 4324 _, err = js.PublishAsync("foo", msg, nats.StallWait(800*time.Millisecond)) 4325 require_NoError(t, err) 4326 4327 for i := 0; i < 1_000_000; i++ { 4328 _, err = js.PublishAsync("bar", msg, nats.StallWait(800*time.Millisecond)) 4329 require_NoError(t, err) 4330 } 4331 _, err = js.PublishAsync("foo", msg, nats.StallWait(800*time.Millisecond)) 4332 require_NoError(t, err) 4333 } 4334 select { 4335 case <-js.PublishAsyncComplete(): 4336 case <-time.After(5 * time.Second): 4337 t.Fatalf("Did not receive completion signal") 4338 } 4339 4340 // Now create a consumer on foo. 
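// Only four "foo" messages exist among roughly two million "bar" messages, so the filtered
// consumer has to skip huge interior ranges of non-matching sequences; the assertion below
// expects delivery of those few messages to finish within 500ms.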
4341 ci, err := js.AddConsumer("TEST", &nats.ConsumerConfig{DeliverSubject: "x.x", FilterSubject: "foo", AckPolicy: nats.AckNonePolicy}) 4342 require_NoError(t, err) 4343 4344 done, received := make(chan bool), uint64(0) 4345 4346 cb := func(m *nats.Msg) { 4347 received++ 4348 if received >= ci.NumPending { 4349 done <- true 4350 } 4351 } 4352 4353 sub, err := nc.Subscribe("x.x", cb) 4354 require_NoError(t, err) 4355 defer sub.Unsubscribe() 4356 start := time.Now() 4357 var elapsed time.Duration 4358 4359 select { 4360 case <-done: 4361 elapsed = time.Since(start) 4362 case <-time.After(10 * time.Second): 4363 t.Fatal("Did not receive all messages for all consumers in time") 4364 } 4365 4366 if elapsed > 500*time.Millisecond { 4367 t.Fatalf("Getting all messages took longer than expected: %v", elapsed) 4368 } 4369 }) 4370 } 4371 } 4372 4373 func TestNoRaceJetStreamConsumerFilterPerfDegradation(t *testing.T) { 4374 s := RunBasicJetStreamServer(t) 4375 defer s.Shutdown() 4376 4377 nc, _ := jsClientConnect(t, s) 4378 defer nc.Close() 4379 4380 js, err := nc.JetStream(nats.PublishAsyncMaxPending(256)) 4381 require_NoError(t, err) 4382 4383 _, err = js.AddStream(&nats.StreamConfig{ 4384 Name: "test", 4385 Subjects: []string{"test.*.subj"}, 4386 Replicas: 1, 4387 }) 4388 require_NoError(t, err) 4389 4390 toSend := 50_000 4391 count := 0 4392 ch := make(chan struct{}, 6) 4393 _, err = js.Subscribe("test.*.subj", func(m *nats.Msg) { 4394 m.Ack() 4395 if count++; count == toSend { 4396 ch <- struct{}{} 4397 } 4398 }, nats.DeliverNew(), nats.ManualAck()) 4399 require_NoError(t, err) 4400 4401 msg := make([]byte, 1024) 4402 sent := int32(0) 4403 send := func() { 4404 defer func() { ch <- struct{}{} }() 4405 for i := 0; i < toSend/5; i++ { 4406 msgID := atomic.AddInt32(&sent, 1) 4407 _, err := js.Publish(fmt.Sprintf("test.%d.subj", msgID), msg) 4408 if err != nil { 4409 t.Error(err) 4410 return 4411 } 4412 } 4413 } 4414 for i := 0; i < 5; i++ { 4415 go send() 4416 } 4417 timeout := time.NewTimer(10 * time.Second) 4418 for i := 0; i < 6; i++ { 4419 select { 4420 case <-ch: 4421 case <-timeout.C: 4422 t.Fatal("Took too long") 4423 } 4424 } 4425 } 4426 4427 func TestNoRaceJetStreamFileStoreKeyFileCleanup(t *testing.T) { 4428 storeDir := t.TempDir() 4429 4430 prf := func(context []byte) ([]byte, error) { 4431 h := hmac.New(sha256.New, []byte("dlc22")) 4432 if _, err := h.Write(context); err != nil { 4433 return nil, err 4434 } 4435 return h.Sum(nil), nil 4436 } 4437 4438 fs, err := newFileStoreWithCreated( 4439 FileStoreConfig{StoreDir: storeDir, BlockSize: 1024 * 1024}, 4440 StreamConfig{Name: "TEST", Storage: FileStorage}, 4441 time.Now(), 4442 prf, nil) 4443 require_NoError(t, err) 4444 defer fs.Stop() 4445 4446 n, msg := 10_000, []byte(strings.Repeat("Z", 1024)) 4447 for i := 0; i < n; i++ { 4448 _, _, err := fs.StoreMsg(fmt.Sprintf("X.%d", i), nil, msg) 4449 require_NoError(t, err) 4450 } 4451 4452 var seqs []uint64 4453 for i := 1; i <= n; i++ { 4454 seqs = append(seqs, uint64(i)) 4455 } 4456 // Randomly delete msgs, make sure we cleanup as we empty the message blocks. 4457 rand.Shuffle(len(seqs), func(i, j int) { seqs[i], seqs[j] = seqs[j], seqs[i] }) 4458 4459 for _, seq := range seqs { 4460 _, err := fs.RemoveMsg(seq) 4461 require_NoError(t, err) 4462 } 4463 4464 // We will have cleanup the main .blk and .idx sans the lmb, but we should not have any *.fss files. 
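// The glob below counts the per-block encryption key files left in the message directory
// (the store was created with a prf above); after randomly removing every message, at most
// the key file for the last, still-active block should remain.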
4465 kms, err := filepath.Glob(filepath.Join(storeDir, msgDir, keyScanAll)) 4466 require_NoError(t, err) 4467 4468 if len(kms) > 1 { 4469 t.Fatalf("Expected to find only 1 key file, found %d", len(kms)) 4470 } 4471 } 4472 4473 func TestNoRaceJetStreamMsgIdPerfDuringCatchup(t *testing.T) { 4474 // Uncomment to run. Needs to be on a bigger machine. Do not want as part of Travis tests atm. 4475 skip(t) 4476 4477 c := createJetStreamClusterExplicit(t, "JSC", 3) 4478 defer c.shutdown() 4479 4480 nc, js := jsClientConnect(t, c.serverByName("S-1")) 4481 defer nc.Close() 4482 4483 _, err := js.AddStream(&nats.StreamConfig{ 4484 Name: "TEST", 4485 Replicas: 3, 4486 }) 4487 require_NoError(t, err) 4488 4489 // This will be the one we restart. 4490 sl := c.streamLeader("$G", "TEST") 4491 // Now move leader. 4492 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 4493 require_NoError(t, err) 4494 c.waitOnStreamLeader("$G", "TEST") 4495 4496 // Connect to new leader. 4497 nc, _ = jsClientConnect(t, c.streamLeader("$G", "TEST")) 4498 defer nc.Close() 4499 4500 js, err = nc.JetStream(nats.PublishAsyncMaxPending(1024)) 4501 require_NoError(t, err) 4502 4503 n, ss, sr := 1_000_000, 250_000, 800_000 4504 m := nats.NewMsg("TEST") 4505 m.Data = []byte(strings.Repeat("Z", 2048)) 4506 4507 // Target rate 10k msgs/sec 4508 start := time.Now() 4509 4510 for i := 0; i < n; i++ { 4511 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4512 _, err := js.PublishMsgAsync(m) 4513 require_NoError(t, err) 4514 //time.Sleep(42 * time.Microsecond) 4515 if i == ss { 4516 fmt.Printf("SD") 4517 sl.Shutdown() 4518 } else if i == sr { 4519 nc.Flush() 4520 select { 4521 case <-js.PublishAsyncComplete(): 4522 case <-time.After(10 * time.Second): 4523 } 4524 fmt.Printf("RS") 4525 sl = c.restartServer(sl) 4526 } 4527 if i%10_000 == 0 { 4528 fmt.Print("#") 4529 } 4530 } 4531 fmt.Println() 4532 4533 // Wait to receive all messages. 
4534 select { 4535 case <-js.PublishAsyncComplete(): 4536 case <-time.After(20 * time.Second): 4537 t.Fatalf("Did not receive completion signal") 4538 } 4539 4540 tt := time.Since(start) 4541 si, err := js.StreamInfo("TEST") 4542 require_NoError(t, err) 4543 4544 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4545 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4546 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4547 4548 c.waitOnStreamCurrent(sl, "$G", "TEST") 4549 for _, s := range c.servers { 4550 mset, _ := s.GlobalAccount().lookupStream("TEST") 4551 if state := mset.store.State(); state.Msgs != uint64(n) { 4552 t.Fatalf("Expected server %v to have correct number of msgs %d but got %d", s, n, state.Msgs) 4553 } 4554 } 4555 } 4556 4557 func TestNoRaceJetStreamRebuildDeDupeAndMemoryPerf(t *testing.T) { 4558 skip(t) 4559 4560 s := RunBasicJetStreamServer(t) 4561 defer s.Shutdown() 4562 4563 nc, js := jsClientConnect(t, s) 4564 defer nc.Close() 4565 4566 _, err := js.AddStream(&nats.StreamConfig{Name: "DD"}) 4567 require_NoError(t, err) 4568 4569 m := nats.NewMsg("DD") 4570 m.Data = []byte(strings.Repeat("Z", 2048)) 4571 4572 start := time.Now() 4573 4574 n := 1_000_000 4575 for i := 0; i < n; i++ { 4576 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4577 _, err := js.PublishMsgAsync(m) 4578 require_NoError(t, err) 4579 } 4580 4581 select { 4582 case <-js.PublishAsyncComplete(): 4583 case <-time.After(20 * time.Second): 4584 t.Fatalf("Did not receive completion signal") 4585 } 4586 4587 tt := time.Since(start) 4588 si, err := js.StreamInfo("DD") 4589 require_NoError(t, err) 4590 4591 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4592 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4593 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4594 4595 v, _ := s.Varz(nil) 4596 fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(v.Mem)) 4597 4598 mset, err := s.GlobalAccount().lookupStream("DD") 4599 require_NoError(t, err) 4600 4601 mset.mu.Lock() 4602 mset.ddloaded = false 4603 start = time.Now() 4604 mset.rebuildDedupe() 4605 fmt.Printf("TOOK %v to rebuild dd\n", time.Since(start)) 4606 mset.mu.Unlock() 4607 4608 v, _ = s.Varz(nil) 4609 fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem)) 4610 4611 // Now do an ephemeral consumer and whip through every message. Doing same calculations. 
4612 start = time.Now() 4613 received, done := 0, make(chan bool) 4614 sub, err := js.Subscribe("DD", func(m *nats.Msg) { 4615 received++ 4616 if received >= n { 4617 done <- true 4618 } 4619 }, nats.OrderedConsumer()) 4620 require_NoError(t, err) 4621 4622 select { 4623 case <-done: 4624 case <-time.After(10 * time.Second): 4625 if s.NumSlowConsumers() > 0 { 4626 t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", received, n) 4627 } 4628 t.Fatalf("Failed to receive all large messages: %d of %d\n", received, n) 4629 } 4630 4631 fmt.Printf("TOOK %v to receive all %d msgs\n", time.Since(start), n) 4632 sub.Unsubscribe() 4633 4634 v, _ = s.Varz(nil) 4635 fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem)) 4636 } 4637 4638 func TestNoRaceJetStreamMemoryUsageOnLimitedStreamWithMirror(t *testing.T) { 4639 skip(t) 4640 4641 s := RunBasicJetStreamServer(t) 4642 defer s.Shutdown() 4643 4644 nc, js := jsClientConnect(t, s) 4645 defer nc.Close() 4646 4647 _, err := js.AddStream(&nats.StreamConfig{Name: "DD", Subjects: []string{"ORDERS.*"}, MaxMsgs: 10_000}) 4648 require_NoError(t, err) 4649 4650 _, err = js.AddStream(&nats.StreamConfig{ 4651 Name: "M", 4652 Mirror: &nats.StreamSource{Name: "DD"}, 4653 MaxMsgs: 10_000, 4654 }) 4655 require_NoError(t, err) 4656 4657 m := nats.NewMsg("ORDERS.0") 4658 m.Data = []byte(strings.Repeat("Z", 2048)) 4659 4660 start := time.Now() 4661 4662 n := 1_000_000 4663 for i := 0; i < n; i++ { 4664 m.Subject = fmt.Sprintf("ORDERS.%d", i) 4665 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4666 _, err := js.PublishMsgAsync(m) 4667 require_NoError(t, err) 4668 } 4669 4670 select { 4671 case <-js.PublishAsyncComplete(): 4672 case <-time.After(20 * time.Second): 4673 t.Fatalf("Did not receive completion signal") 4674 } 4675 4676 tt := time.Since(start) 4677 si, err := js.StreamInfo("DD") 4678 require_NoError(t, err) 4679 4680 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4681 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4682 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4683 4684 v, _ := s.Varz(nil) 4685 fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(v.Mem)) 4686 } 4687 4688 func TestNoRaceJetStreamOrderedConsumerLongRTTPerformance(t *testing.T) { 4689 skip(t) 4690 4691 s := RunBasicJetStreamServer(t) 4692 defer s.Shutdown() 4693 4694 nc, _ := jsClientConnect(t, s) 4695 defer nc.Close() 4696 4697 js, err := nc.JetStream(nats.PublishAsyncMaxPending(1000)) 4698 require_NoError(t, err) 4699 4700 _, err = js.AddStream(&nats.StreamConfig{Name: "OCP"}) 4701 require_NoError(t, err) 4702 4703 n, msg := 100_000, []byte(strings.Repeat("D", 30_000)) 4704 4705 for i := 0; i < n; i++ { 4706 _, err := js.PublishAsync("OCP", msg) 4707 require_NoError(t, err) 4708 } 4709 select { 4710 case <-js.PublishAsyncComplete(): 4711 case <-time.After(5 * time.Second): 4712 t.Fatalf("Did not receive completion signal") 4713 } 4714 4715 // Approximately 3GB 4716 si, err := js.StreamInfo("OCP") 4717 require_NoError(t, err) 4718 4719 start := time.Now() 4720 received, done := 0, make(chan bool) 4721 sub, err := js.Subscribe("OCP", func(m *nats.Msg) { 4722 received++ 4723 if received >= n { 4724 done <- true 4725 } 4726 }, nats.OrderedConsumer()) 4727 require_NoError(t, err) 4728 defer sub.Unsubscribe() 4729 4730 // Wait to receive all messages. 
4731 select { 4732 case <-done: 4733 case <-time.After(30 * time.Second): 4734 t.Fatalf("Did not receive all of our messages") 4735 } 4736 4737 tt := time.Since(start) 4738 fmt.Printf("Took %v to receive %d msgs\n", tt, n) 4739 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4740 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4741 4742 sub.Unsubscribe() 4743 4744 rtt := 10 * time.Millisecond 4745 bw := 10 * 1024 * 1024 * 1024 4746 proxy := newNetProxy(rtt, bw, bw, s.ClientURL()) 4747 defer proxy.stop() 4748 4749 nc, err = nats.Connect(proxy.clientURL()) 4750 require_NoError(t, err) 4751 defer nc.Close() 4752 js, err = nc.JetStream() 4753 require_NoError(t, err) 4754 4755 start, received = time.Now(), 0 4756 sub, err = js.Subscribe("OCP", func(m *nats.Msg) { 4757 received++ 4758 if received >= n { 4759 done <- true 4760 } 4761 }, nats.OrderedConsumer()) 4762 require_NoError(t, err) 4763 defer sub.Unsubscribe() 4764 4765 // Wait to receive all messages. 4766 select { 4767 case <-done: 4768 case <-time.After(60 * time.Second): 4769 t.Fatalf("Did not receive all of our messages") 4770 } 4771 4772 tt = time.Since(start) 4773 fmt.Printf("Proxy RTT: %v, UP: %d, DOWN: %d\n", rtt, bw, bw) 4774 fmt.Printf("Took %v to receive %d msgs\n", tt, n) 4775 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4776 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4777 } 4778 4779 var jsClusterStallCatchupTempl = ` 4780 listen: 127.0.0.1:-1 4781 server_name: %s 4782 jetstream: {max_mem_store: 256MB, max_file_store: 32GB, store_dir: '%s'} 4783 4784 leaf { 4785 listen: 127.0.0.1:-1 4786 } 4787 4788 cluster { 4789 name: %s 4790 listen: 127.0.0.1:%d 4791 routes = [%s] 4792 } 4793 4794 # For access to system account. 4795 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } } 4796 ` 4797 4798 // Test our global stall gate for outstanding catchup bytes. 4799 func TestNoRaceJetStreamClusterCatchupStallGate(t *testing.T) { 4800 skip(t) 4801 4802 c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3) 4803 defer c.shutdown() 4804 4805 nc, js := jsClientConnect(t, c.randomServer()) 4806 defer nc.Close() 4807 4808 // ~100k per message. 4809 msg := []byte(strings.Repeat("A", 99_960)) 4810 4811 // Create 200 streams with 100MB. 4812 // Each server has ~2GB 4813 var wg sync.WaitGroup 4814 for i := 0; i < 20; i++ { 4815 wg.Add(1) 4816 go func(x int) { 4817 defer wg.Done() 4818 for n := 1; n <= 10; n++ { 4819 sn := fmt.Sprintf("S-%d", n+x) 4820 _, err := js.AddStream(&nats.StreamConfig{ 4821 Name: sn, 4822 Replicas: 3, 4823 }) 4824 require_NoError(t, err) 4825 for i := 0; i < 100; i++ { 4826 _, err := js.Publish(sn, msg) 4827 require_NoError(t, err) 4828 } 4829 } 4830 }(i * 20) 4831 } 4832 wg.Wait() 4833 4834 info, err := js.AccountInfo() 4835 require_NoError(t, err) 4836 require_True(t, info.Streams == 200) 4837 4838 runtime.GC() 4839 debug.FreeOSMemory() 4840 4841 // Now bring a server down and wipe its storage. 
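// Wiping the store dir below forces a full catchup of all 200 streams when the server comes
// back; the MEM BEFORE/AFTER prints give a rough view of whether the global stall gate keeps
// the outstanding catchup bytes (and therefore memory) bounded.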
4842 s := c.servers[0] 4843 vz, err := s.Varz(nil) 4844 require_NoError(t, err) 4845 fmt.Printf("MEM BEFORE is %v\n", friendlyBytes(vz.Mem)) 4846 4847 sd := s.JetStreamConfig().StoreDir 4848 s.Shutdown() 4849 removeDir(t, sd) 4850 s = c.restartServer(s) 4851 4852 c.waitOnServerHealthz(s) 4853 4854 runtime.GC() 4855 debug.FreeOSMemory() 4856 4857 vz, err = s.Varz(nil) 4858 require_NoError(t, err) 4859 fmt.Printf("MEM AFTER is %v\n", friendlyBytes(vz.Mem)) 4860 } 4861 4862 func TestNoRaceJetStreamClusterCatchupBailMidway(t *testing.T) { 4863 skip(t) 4864 4865 c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3) 4866 defer c.shutdown() 4867 4868 ml := c.leader() 4869 nc, js := jsClientConnect(t, ml) 4870 defer nc.Close() 4871 4872 msg := []byte(strings.Repeat("A", 480)) 4873 4874 for i := 0; i < maxConcurrentSyncRequests*2; i++ { 4875 sn := fmt.Sprintf("CUP-%d", i+1) 4876 _, err := js.AddStream(&nats.StreamConfig{ 4877 Name: sn, 4878 Replicas: 3, 4879 }) 4880 require_NoError(t, err) 4881 4882 for i := 0; i < 10_000; i++ { 4883 _, err := js.PublishAsync(sn, msg) 4884 require_NoError(t, err) 4885 } 4886 select { 4887 case <-js.PublishAsyncComplete(): 4888 case <-time.After(10 * time.Second): 4889 t.Fatalf("Did not receive completion signal") 4890 } 4891 } 4892 4893 jsz, _ := ml.Jsz(nil) 4894 expectedMsgs := jsz.Messages 4895 4896 // Now select a server and shut it down, removing the storage directory. 4897 s := c.randomNonLeader() 4898 sd := s.JetStreamConfig().StoreDir 4899 s.Shutdown() 4900 removeDir(t, sd) 4901 4902 // Now restart the server. 4903 s = c.restartServer(s) 4904 4905 // We want to force the follower to bail before the catchup through the 4906 // upper level catchup logic completes. 4907 checkFor(t, 5*time.Second, 10*time.Millisecond, func() error { 4908 jsz, _ := s.Jsz(nil) 4909 if jsz.Messages > expectedMsgs/2 { 4910 s.Shutdown() 4911 return nil 4912 } 4913 return fmt.Errorf("Not enough yet") 4914 }) 4915 4916 // Now restart the server. 
4917 s = c.restartServer(s) 4918 4919 checkFor(t, 5*time.Second, 500*time.Millisecond, func() error { 4920 jsz, _ := s.Jsz(nil) 4921 if jsz.Messages == expectedMsgs { 4922 return nil 4923 } 4924 return fmt.Errorf("Not enough yet") 4925 }) 4926 } 4927 4928 func TestNoRaceJetStreamAccountLimitsAndRestart(t *testing.T) { 4929 c := createJetStreamClusterWithTemplate(t, jsClusterAccountLimitsTempl, "A3S", 3) 4930 defer c.shutdown() 4931 4932 nc, js := jsClientConnect(t, c.randomServer()) 4933 defer nc.Close() 4934 4935 if _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Replicas: 3}); err != nil { 4936 t.Fatalf("Unexpected error: %v", err) 4937 } 4938 4939 for i := 0; i < 20_000; i++ { 4940 if _, err := js.Publish("TEST", []byte("A")); err != nil { 4941 break 4942 } 4943 if i == 5_000 { 4944 snl := c.randomNonStreamLeader("$JS", "TEST") 4945 snl.Shutdown() 4946 } 4947 } 4948 4949 c.stopAll() 4950 c.restartAll() 4951 c.waitOnLeader() 4952 c.waitOnStreamLeader("$JS", "TEST") 4953 4954 for _, cs := range c.servers { 4955 c.waitOnStreamCurrent(cs, "$JS", "TEST") 4956 } 4957 } 4958 4959 func TestNoRaceJetStreamPullConsumersAndInteriorDeletes(t *testing.T) { 4960 c := createJetStreamClusterExplicit(t, "ID", 3) 4961 defer c.shutdown() 4962 4963 nc, js := jsClientConnect(t, c.randomServer()) 4964 defer nc.Close() 4965 4966 _, err := js.AddStream(&nats.StreamConfig{ 4967 Name: "foo", 4968 Replicas: 3, 4969 MaxMsgs: 50000, 4970 Retention: nats.InterestPolicy, 4971 }) 4972 require_NoError(t, err) 4973 4974 c.waitOnStreamLeader(globalAccountName, "foo") 4975 4976 _, err = js.AddConsumer("foo", &nats.ConsumerConfig{ 4977 Durable: "foo", 4978 FilterSubject: "foo", 4979 MaxAckPending: 20000, 4980 AckWait: time.Minute, 4981 AckPolicy: nats.AckExplicitPolicy, 4982 }) 4983 require_NoError(t, err) 4984 4985 c.waitOnConsumerLeader(globalAccountName, "foo", "foo") 4986 4987 rcv := int32(0) 4988 prods := 5 4989 cons := 5 4990 wg := sync.WaitGroup{} 4991 wg.Add(prods + cons) 4992 toSend := 100000 4993 4994 for i := 0; i < cons; i++ { 4995 go func() { 4996 defer wg.Done() 4997 4998 sub, err := js.PullSubscribe("foo", "foo") 4999 if err != nil { 5000 return 5001 } 5002 for { 5003 msgs, err := sub.Fetch(200, nats.MaxWait(250*time.Millisecond)) 5004 if err != nil { 5005 if n := int(atomic.LoadInt32(&rcv)); n >= toSend { 5006 return 5007 } 5008 continue 5009 } 5010 for _, m := range msgs { 5011 m.Ack() 5012 atomic.AddInt32(&rcv, 1) 5013 } 5014 } 5015 }() 5016 } 5017 5018 for i := 0; i < prods; i++ { 5019 go func() { 5020 defer wg.Done() 5021 5022 for i := 0; i < toSend/prods; i++ { 5023 js.Publish("foo", []byte("hello")) 5024 } 5025 }() 5026 } 5027 5028 time.Sleep(time.Second) 5029 resp, err := nc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "foo", "foo"), nil, time.Second) 5030 if err != nil { 5031 t.Fatalf("Unexpected error: %v", err) 5032 } 5033 var cdResp JSApiConsumerLeaderStepDownResponse 5034 if err := json.Unmarshal(resp.Data, &cdResp); err != nil { 5035 t.Fatalf("Unexpected error: %v", err) 5036 } 5037 if cdResp.Error != nil { 5038 t.Fatalf("Unexpected error: %+v", cdResp.Error) 5039 } 5040 ch := make(chan struct{}) 5041 go func() { 5042 wg.Wait() 5043 close(ch) 5044 }() 5045 select { 5046 case <-ch: 5047 // OK 5048 case <-time.After(30 * time.Second): 5049 t.Fatalf("Consumers took too long to consumer all messages") 5050 } 5051 } 5052 5053 func TestNoRaceJetStreamClusterInterestPullConsumerStreamLimitBug(t *testing.T) { 5054 c := createJetStreamClusterExplicit(t, "JSC", 3) 5055 defer c.shutdown() 
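// Scenario: an interest-retention stream capped by MaxMsgs, one durable pull consumer shared
// by 100 fetchers that ack after random delays, plus a slow publisher. Once the limit has been
// hit and everything is stopped, NumPending plus NumAckPending must line up with the number of
// messages still held by the stream.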
5056 5057 nc, js := jsClientConnect(t, c.randomServer()) 5058 defer nc.Close() 5059 5060 limit := uint64(1000) 5061 5062 _, err := js.AddStream(&nats.StreamConfig{ 5063 Name: "TEST", 5064 Subjects: []string{"foo"}, 5065 Retention: nats.InterestPolicy, 5066 MaxMsgs: int64(limit), 5067 Replicas: 3, 5068 }) 5069 require_NoError(t, err) 5070 5071 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "dur", AckPolicy: nats.AckExplicitPolicy}) 5072 require_NoError(t, err) 5073 5074 qch := make(chan bool) 5075 var wg sync.WaitGroup 5076 5077 // Publisher 5078 wg.Add(1) 5079 go func() { 5080 defer wg.Done() 5081 for { 5082 pt := time.NewTimer(time.Duration(rand.Intn(2)) * time.Millisecond) 5083 select { 5084 case <-pt.C: 5085 _, err := js.Publish("foo", []byte("BUG!")) 5086 require_NoError(t, err) 5087 case <-qch: 5088 pt.Stop() 5089 return 5090 } 5091 } 5092 }() 5093 5094 time.Sleep(time.Second) 5095 5096 // Pull Consumers 5097 wg.Add(100) 5098 for i := 0; i < 100; i++ { 5099 go func() { 5100 defer wg.Done() 5101 nc := natsConnect(t, c.randomServer().ClientURL()) 5102 defer nc.Close() 5103 5104 js, err := nc.JetStream(nats.MaxWait(time.Second)) 5105 require_NoError(t, err) 5106 5107 var sub *nats.Subscription 5108 for j := 0; j < 5; j++ { 5109 sub, err = js.PullSubscribe("foo", "dur") 5110 if err == nil { 5111 break 5112 } 5113 } 5114 require_NoError(t, err) 5115 5116 for { 5117 pt := time.NewTimer(time.Duration(rand.Intn(300)) * time.Millisecond) 5118 select { 5119 case <-pt.C: 5120 msgs, err := sub.Fetch(1) 5121 if err != nil { 5122 t.Logf("Got a Fetch error: %v", err) 5123 return 5124 } 5125 if len(msgs) > 0 { 5126 go func() { 5127 ackDelay := time.Duration(rand.Intn(375)+15) * time.Millisecond 5128 m := msgs[0] 5129 time.AfterFunc(ackDelay, func() { m.AckSync() }) 5130 }() 5131 } 5132 case <-qch: 5133 return 5134 } 5135 } 5136 }() 5137 } 5138 5139 // Make sure we have hit the limit for the number of messages we expected. 5140 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5141 si, err := js.StreamInfo("TEST") 5142 require_NoError(t, err) 5143 if si.State.Msgs < limit { 5144 return fmt.Errorf("Not hit limit yet") 5145 } 5146 return nil 5147 }) 5148 5149 close(qch) 5150 wg.Wait() 5151 5152 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5153 si, err := js.StreamInfo("TEST") 5154 require_NoError(t, err) 5155 ci, err := js.ConsumerInfo("TEST", "dur") 5156 require_NoError(t, err) 5157 5158 np := ci.NumPending + uint64(ci.NumAckPending) 5159 if np != si.State.Msgs { 5160 return fmt.Errorf("Expected NumPending to be %d got %d", si.State.Msgs-uint64(ci.NumAckPending), ci.NumPending) 5161 } 5162 return nil 5163 }) 5164 } 5165 5166 // Test that all peers have the direct access subs that participate in a queue group, 5167 // but only when they are current and ready. So we will start with R1, add in messages 5168 // then scale up while also still adding messages. 5169 func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { 5170 c := createJetStreamClusterExplicit(t, "JSC", 3) 5171 defer c.shutdown() 5172 5173 nc, js := jsClientConnect(t, c.randomServer()) 5174 defer nc.Close() 5175 5176 // Start as R1 5177 cfg := &StreamConfig{ 5178 Name: "TEST", 5179 Subjects: []string{"kv.>"}, 5180 MaxMsgsPer: 10, 5181 AllowDirect: true, 5182 Replicas: 1, 5183 Storage: FileStorage, 5184 } 5185 addStream(t, nc, cfg) 5186 5187 // Seed with enough messages to start then we will scale up while still adding more messages. 
5188 num, msg := 1000, bytes.Repeat([]byte("XYZ"), 64) 5189 for i := 0; i < num; i++ { 5190 js.PublishAsync(fmt.Sprintf("kv.%d", i), msg) 5191 } 5192 select { 5193 case <-js.PublishAsyncComplete(): 5194 case <-time.After(5 * time.Second): 5195 t.Fatalf("Did not receive completion signal") 5196 } 5197 5198 getSubj := fmt.Sprintf(JSDirectMsgGetT, "TEST") 5199 getMsg := func(key string) *nats.Msg { 5200 t.Helper() 5201 req := []byte(fmt.Sprintf(`{"last_by_subj":%q}`, key)) 5202 m, err := nc.Request(getSubj, req, time.Second) 5203 require_NoError(t, err) 5204 require_True(t, m.Header.Get(JSSubject) == key) 5205 return m 5206 } 5207 5208 // Just make sure we can succeed here. 5209 getMsg("kv.22") 5210 5211 // Now crank up a go routine to continue sending more messages. 5212 qch := make(chan bool) 5213 var wg sync.WaitGroup 5214 5215 for i := 0; i < 5; i++ { 5216 wg.Add(1) 5217 go func() { 5218 defer wg.Done() 5219 nc, js := jsClientConnect(t, c.randomServer()) 5220 defer nc.Close() 5221 for { 5222 select { 5223 case <-qch: 5224 select { 5225 case <-js.PublishAsyncComplete(): 5226 case <-time.After(10 * time.Second): 5227 } 5228 return 5229 default: 5230 // Send as fast as we can. 5231 js.Publish(fmt.Sprintf("kv.%d", rand.Intn(1000)), msg) 5232 } 5233 } 5234 }() 5235 } 5236 5237 time.Sleep(200 * time.Millisecond) 5238 5239 // Now let's scale up to an R3. 5240 cfg.Replicas = 3 5241 updateStream(t, nc, cfg) 5242 5243 // Wait for the stream to register the new replicas and have a leader. 5244 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5245 si, err := js.StreamInfo("TEST") 5246 if err != nil { 5247 return err 5248 } 5249 if si.Cluster == nil { 5250 return fmt.Errorf("No cluster yet") 5251 } 5252 if si.Cluster.Leader == _EMPTY_ || len(si.Cluster.Replicas) != 2 { 5253 return fmt.Errorf("Cluster not ready yet") 5254 } 5255 return nil 5256 }) 5257 5258 close(qch) 5259 wg.Wait() 5260 5261 // Just make sure we can succeed here. 5262 getMsg("kv.22") 5263 5264 // For each non-leader check that the direct sub fires up. 5265 // We just test all, the leader will already have a directSub. 5266 for _, s := range c.servers { 5267 mset, err := s.GlobalAccount().lookupStream("TEST") 5268 require_NoError(t, err) 5269 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5270 mset.mu.RLock() 5271 ok := mset.directSub != nil 5272 mset.mu.RUnlock() 5273 if ok { 5274 return nil 5275 } 5276 return fmt.Errorf("No directSub yet") 5277 }) 5278 } 5279 5280 si, err := js.StreamInfo("TEST") 5281 require_NoError(t, err) 5282 5283 if si.State.Msgs == uint64(num) { 5284 t.Fatalf("Expected to see messages increase, got %d", si.State.Msgs) 5285 } 5286 5287 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 5288 // Make sure they are all the same from a state perspective. 5289 // Leader will have the expected state. 
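// The leader's in-memory state is the reference; comparing every server against it with
// reflect.DeepEqual confirms the new replicas fully caught up after the scale-up to R3,
// not merely that their direct subs are installed.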
5290 lmset, err := c.streamLeader("$G", "TEST").GlobalAccount().lookupStream("TEST") 5291 require_NoError(t, err) 5292 expected := lmset.state() 5293 5294 for _, s := range c.servers { 5295 mset, err := s.GlobalAccount().lookupStream("TEST") 5296 require_NoError(t, err) 5297 if state := mset.state(); !reflect.DeepEqual(expected, state) { 5298 return fmt.Errorf("Expected %+v, got %+v", expected, state) 5299 } 5300 } 5301 return nil 5302 }) 5303 5304 } 5305 5306 func TestNoRaceJetStreamClusterStreamNamesAndInfosMoreThanAPILimit(t *testing.T) { 5307 c := createJetStreamClusterExplicit(t, "R3S", 3) 5308 defer c.shutdown() 5309 5310 s := c.randomServer() 5311 nc, js := jsClientConnect(t, s) 5312 defer nc.Close() 5313 5314 createStream := func(name string) { 5315 t.Helper() 5316 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 5317 t.Fatalf("Unexpected error: %v", err) 5318 } 5319 } 5320 5321 max := JSApiListLimit 5322 if JSApiNamesLimit > max { 5323 max = JSApiNamesLimit 5324 } 5325 max += 10 5326 5327 for i := 0; i < max; i++ { 5328 name := fmt.Sprintf("foo_%d", i) 5329 createStream(name) 5330 } 5331 5332 // Not using the JS API here beacause we want to make sure that the 5333 // server returns the proper Total count, but also that it does not 5334 // send more than when the API limit is in one go. 5335 check := func(subj string, limit int) { 5336 t.Helper() 5337 5338 nreq := JSApiStreamNamesRequest{} 5339 b, _ := json.Marshal(nreq) 5340 msg, err := nc.Request(subj, b, 2*time.Second) 5341 require_NoError(t, err) 5342 5343 nresp := JSApiStreamNamesResponse{} 5344 json.Unmarshal(msg.Data, &nresp) 5345 if n := nresp.ApiPaged.Total; n != max { 5346 t.Fatalf("Expected total to be %v, got %v", max, n) 5347 } 5348 if n := nresp.ApiPaged.Limit; n != limit { 5349 t.Fatalf("Expected limit to be %v, got %v", limit, n) 5350 } 5351 if n := len(nresp.Streams); n != limit { 5352 t.Fatalf("Expected number of streams to be %v, got %v", limit, n) 5353 } 5354 } 5355 5356 check(JSApiStreams, JSApiNamesLimit) 5357 check(JSApiStreamList, JSApiListLimit) 5358 } 5359 5360 func TestNoRaceJetStreamClusterConsumerListPaging(t *testing.T) { 5361 c := createJetStreamClusterExplicit(t, "R3S", 3) 5362 defer c.shutdown() 5363 5364 s := c.randomNonLeader() 5365 nc, js := jsClientConnect(t, s) 5366 defer nc.Close() 5367 5368 _, err := js.AddStream(&nats.StreamConfig{ 5369 Name: "TEST", 5370 Subjects: []string{"foo"}, 5371 Replicas: 3, 5372 }) 5373 require_NoError(t, err) 5374 c.waitOnStreamLeader(globalAccountName, "TEST") 5375 5376 cfg := &nats.ConsumerConfig{ 5377 Replicas: 1, 5378 MemoryStorage: true, 5379 AckPolicy: nats.AckExplicitPolicy, 5380 } 5381 5382 // create 3000 consumers. 5383 numConsumers := 3000 5384 for i := 1; i <= numConsumers; i++ { 5385 cfg.Durable = fmt.Sprintf("d-%.4d", i) 5386 _, err := js.AddConsumer("TEST", cfg) 5387 require_NoError(t, err) 5388 } 5389 5390 // Test both names and list operations. 5391 5392 // Names 5393 reqSubj := fmt.Sprintf(JSApiConsumersT, "TEST") 5394 grabConsumerNames := func(offset int) []string { 5395 req := fmt.Sprintf(`{"offset":%d}`, offset) 5396 respMsg, err := nc.Request(reqSubj, []byte(req), time.Second) 5397 require_NoError(t, err) 5398 var resp JSApiConsumerNamesResponse 5399 err = json.Unmarshal(respMsg.Data, &resp) 5400 require_NoError(t, err) 5401 // Sanity check that we are actually paging properly around limits. 
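// Each response may carry at most resp.Limit names; the loop further down advances the offset
// by however many were returned and collects them into a set, so duplicates across pages are
// caught and the loop only ends once every consumer has been seen.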
5402 if resp.Limit < len(resp.Consumers) { 5403 t.Fatalf("Expected total limited to %d but got %d", resp.Limit, len(resp.Consumers)) 5404 } 5405 if resp.Total != numConsumers { 5406 t.Fatalf("Invalid total response: expected %d got %d", numConsumers, resp.Total) 5407 } 5408 return resp.Consumers 5409 } 5410 5411 results := make(map[string]bool) 5412 5413 for offset := 0; len(results) < numConsumers; { 5414 consumers := grabConsumerNames(offset) 5415 offset += len(consumers) 5416 for _, name := range consumers { 5417 if results[name] { 5418 t.Fatalf("Found duplicate %q", name) 5419 } 5420 results[name] = true 5421 } 5422 } 5423 5424 // List 5425 reqSubj = fmt.Sprintf(JSApiConsumerListT, "TEST") 5426 grabConsumerList := func(offset int) []*ConsumerInfo { 5427 req := fmt.Sprintf(`{"offset":%d}`, offset) 5428 respMsg, err := nc.Request(reqSubj, []byte(req), time.Second) 5429 require_NoError(t, err) 5430 var resp JSApiConsumerListResponse 5431 err = json.Unmarshal(respMsg.Data, &resp) 5432 require_NoError(t, err) 5433 // Sanity check that we are actually paging properly around limits. 5434 if resp.Limit < len(resp.Consumers) { 5435 t.Fatalf("Expected total limited to %d but got %d", resp.Limit, len(resp.Consumers)) 5436 } 5437 if resp.Total != numConsumers { 5438 t.Fatalf("Invalid total response: expected %d got %d", numConsumers, resp.Total) 5439 } 5440 return resp.Consumers 5441 } 5442 5443 results = make(map[string]bool) 5444 5445 for offset := 0; len(results) < numConsumers; { 5446 consumers := grabConsumerList(offset) 5447 offset += len(consumers) 5448 for _, ci := range consumers { 5449 name := ci.Config.Durable 5450 if results[name] { 5451 t.Fatalf("Found duplicate %q", name) 5452 } 5453 results[name] = true 5454 } 5455 } 5456 5457 if len(results) != numConsumers { 5458 t.Fatalf("Received %d / %d consumers", len(results), numConsumers) 5459 } 5460 } 5461 5462 func TestNoRaceJetStreamFileStoreLargeKVAccessTiming(t *testing.T) { 5463 storeDir := t.TempDir() 5464 5465 blkSize := uint64(4 * 1024) 5466 // Compensate for slower IO on MacOSX 5467 if runtime.GOOS == "darwin" { 5468 blkSize *= 4 5469 } 5470 5471 fs, err := newFileStore( 5472 FileStoreConfig{StoreDir: storeDir, BlockSize: blkSize, CacheExpire: 30 * time.Second}, 5473 StreamConfig{Name: "zzz", Subjects: []string{"KV.STREAM_NAME.*"}, Storage: FileStorage, MaxMsgsPer: 1}, 5474 ) 5475 require_NoError(t, err) 5476 defer fs.Stop() 5477 5478 tmpl := "KV.STREAM_NAME.%d" 5479 nkeys, val := 100_000, bytes.Repeat([]byte("Z"), 1024) 5480 5481 for i := 1; i <= nkeys; i++ { 5482 subj := fmt.Sprintf(tmpl, i) 5483 _, _, err := fs.StoreMsg(subj, nil, val) 5484 require_NoError(t, err) 5485 } 5486 5487 first := fmt.Sprintf(tmpl, 1) 5488 last := fmt.Sprintf(tmpl, nkeys) 5489 5490 start := time.Now() 5491 sm, err := fs.LoadLastMsg(last, nil) 5492 require_NoError(t, err) 5493 base := time.Since(start) 5494 5495 if !bytes.Equal(sm.msg, val) { 5496 t.Fatalf("Retrieved value did not match") 5497 } 5498 5499 start = time.Now() 5500 _, err = fs.LoadLastMsg(first, nil) 5501 require_NoError(t, err) 5502 slow := time.Since(start) 5503 5504 if slow > 4*base || slow > time.Millisecond { 5505 t.Fatalf("Took too long to look up first key vs last: %v vs %v", base, slow) 5506 } 5507 5508 // time first seq lookup for both as well. 5509 // Base will be first in this case. 
5510 fs.mu.RLock() 5511 start = time.Now() 5512 fs.firstSeqForSubj(first) 5513 base = time.Since(start) 5514 start = time.Now() 5515 fs.firstSeqForSubj(last) 5516 slow = time.Since(start) 5517 fs.mu.RUnlock() 5518 5519 if slow > 4*base || slow > time.Millisecond { 5520 t.Fatalf("Took too long to look up last key by subject vs first: %v vs %v", base, slow) 5521 } 5522 } 5523 5524 func TestNoRaceJetStreamKVLock(t *testing.T) { 5525 s := RunBasicJetStreamServer(t) 5526 defer s.Shutdown() 5527 5528 nc, js := jsClientConnect(t, s) 5529 defer nc.Close() 5530 5531 _, err := js.CreateKeyValue(&nats.KeyValueConfig{Bucket: "LOCKS"}) 5532 require_NoError(t, err) 5533 5534 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 5535 defer cancel() 5536 5537 var wg sync.WaitGroup 5538 start := make(chan bool) 5539 5540 var tracker int64 5541 5542 for i := 0; i < 100; i++ { 5543 wg.Add(1) 5544 go func() { 5545 defer wg.Done() 5546 5547 nc, js := jsClientConnect(t, s) 5548 defer nc.Close() 5549 kv, err := js.KeyValue("LOCKS") 5550 require_NoError(t, err) 5551 5552 <-start 5553 5554 for { 5555 last, err := kv.Create("MY_LOCK", []byte("Z")) 5556 if err != nil { 5557 select { 5558 case <-time.After(10 * time.Millisecond): 5559 continue 5560 case <-ctx.Done(): 5561 return 5562 } 5563 } 5564 5565 if v := atomic.AddInt64(&tracker, 1); v != 1 { 5566 t.Logf("TRACKER NOT 1 -> %d\n", v) 5567 cancel() 5568 } 5569 5570 time.Sleep(10 * time.Millisecond) 5571 if v := atomic.AddInt64(&tracker, -1); v != 0 { 5572 t.Logf("TRACKER NOT 0 AFTER RELEASE -> %d\n", v) 5573 cancel() 5574 } 5575 5576 err = kv.Delete("MY_LOCK", nats.LastRevision(last)) 5577 if err != nil { 5578 t.Logf("Could not unlock for last %d: %v", last, err) 5579 } 5580 5581 if ctx.Err() != nil { 5582 return 5583 } 5584 } 5585 }() 5586 } 5587 5588 close(start) 5589 wg.Wait() 5590 } 5591 5592 func TestNoRaceJetStreamSuperClusterStreamMoveLongRTT(t *testing.T) { 5593 // Make C2 far away. 5594 gwm := gwProxyMap{ 5595 "C2": &gwProxy{ 5596 rtt: 20 * time.Millisecond, 5597 up: 1 * 1024 * 1024 * 1024, // 1gbit 5598 down: 1 * 1024 * 1024 * 1024, // 1gbit 5599 }, 5600 } 5601 sc := createJetStreamTaggedSuperClusterWithGWProxy(t, gwm) 5602 defer sc.shutdown() 5603 5604 nc, js := jsClientConnect(t, sc.randomServer()) 5605 defer nc.Close() 5606 5607 cfg := &nats.StreamConfig{ 5608 Name: "TEST", 5609 Subjects: []string{"chunk.*"}, 5610 Placement: &nats.Placement{Tags: []string{"cloud:aws", "country:us"}}, 5611 Replicas: 3, 5612 } 5613 5614 // Place a stream in C1. 5615 _, err := js.AddStream(cfg, nats.MaxWait(10*time.Second)) 5616 require_NoError(t, err) 5617 5618 chunk := bytes.Repeat([]byte("Z"), 1000*1024) // ~1MB 5619 // 256 MB 5620 for i := 0; i < 256; i++ { 5621 subj := fmt.Sprintf("chunk.%d", i) 5622 js.PublishAsync(subj, chunk) 5623 } 5624 select { 5625 case <-js.PublishAsyncComplete(): 5626 case <-time.After(10 * time.Second): 5627 t.Fatalf("Did not receive completion signal") 5628 } 5629 5630 // C2, slow RTT. 
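// Re-tagging the placement below moves the stream to C2, which sits behind the 20ms/1gbit
// gateway proxy configured at the top of the test; the check that follows waits for a C2
// leader and for the peer set to shrink back to the configured R3 (leader plus two replicas).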
5631 cfg.Placement = &nats.Placement{Tags: []string{"cloud:gcp", "country:uk"}} 5632 _, err = js.UpdateStream(cfg) 5633 require_NoError(t, err) 5634 5635 checkFor(t, 20*time.Second, time.Second, func() error { 5636 si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)) 5637 if err != nil { 5638 return err 5639 } 5640 if si.Cluster.Name != "C2" { 5641 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 5642 } 5643 if si.Cluster.Leader == _EMPTY_ { 5644 return fmt.Errorf("No leader yet") 5645 } else if !strings.HasPrefix(si.Cluster.Leader, "C2-") { 5646 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 5647 } 5648 // Now we want to see that we shrink back to original. 5649 if len(si.Cluster.Replicas) != cfg.Replicas-1 { 5650 return fmt.Errorf("Expected %d replicas, got %d", cfg.Replicas-1, len(si.Cluster.Replicas)) 5651 } 5652 return nil 5653 }) 5654 } 5655 5656 // https://github.com/nats-io/nats-server/issues/3455 5657 func TestNoRaceJetStreamConcurrentPullConsumerBatch(t *testing.T) { 5658 s := RunBasicJetStreamServer(t) 5659 defer s.Shutdown() 5660 5661 nc, js := jsClientConnect(t, s) 5662 defer nc.Close() 5663 5664 _, err := js.AddStream(&nats.StreamConfig{ 5665 Name: "TEST", 5666 Subjects: []string{"ORDERS.*"}, 5667 Storage: nats.MemoryStorage, 5668 Retention: nats.WorkQueuePolicy, 5669 }) 5670 require_NoError(t, err) 5671 5672 toSend := int32(100_000) 5673 5674 for i := 0; i < 100_000; i++ { 5675 subj := fmt.Sprintf("ORDERS.%d", i+1) 5676 js.PublishAsync(subj, []byte("BUY")) 5677 } 5678 select { 5679 case <-js.PublishAsyncComplete(): 5680 case <-time.After(5 * time.Second): 5681 t.Fatalf("Did not receive completion signal") 5682 } 5683 5684 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5685 Durable: "PROCESSOR", 5686 AckPolicy: nats.AckExplicitPolicy, 5687 MaxAckPending: 5000, 5688 }) 5689 require_NoError(t, err) 5690 5691 nc, js = jsClientConnect(t, s) 5692 defer nc.Close() 5693 5694 sub1, err := js.PullSubscribe(_EMPTY_, _EMPTY_, nats.Bind("TEST", "PROCESSOR")) 5695 require_NoError(t, err) 5696 5697 nc, js = jsClientConnect(t, s) 5698 defer nc.Close() 5699 5700 sub2, err := js.PullSubscribe(_EMPTY_, _EMPTY_, nats.Bind("TEST", "PROCESSOR")) 5701 require_NoError(t, err) 5702 5703 startCh := make(chan bool) 5704 5705 var received int32 5706 5707 wg := sync.WaitGroup{} 5708 5709 fetchSize := 1000 5710 fetch := func(sub *nats.Subscription) { 5711 <-startCh 5712 defer wg.Done() 5713 5714 for { 5715 msgs, err := sub.Fetch(fetchSize, nats.MaxWait(time.Second)) 5716 if atomic.AddInt32(&received, int32(len(msgs))) >= toSend { 5717 break 5718 } 5719 // We should always receive a full batch here if not last competing fetch. 5720 if err != nil || len(msgs) != fetchSize { 5721 break 5722 } 5723 for _, m := range msgs { 5724 m.Ack() 5725 } 5726 } 5727 } 5728 5729 wg.Add(2) 5730 5731 go fetch(sub1) 5732 go fetch(sub2) 5733 5734 close(startCh) 5735 5736 wg.Wait() 5737 require_True(t, received == toSend) 5738 } 5739 5740 func TestNoRaceJetStreamManyPullConsumersNeedAckOptimization(t *testing.T) { 5741 // Uncomment to run. Do not want as part of Travis tests atm. 5742 // Run with cpu and memory profiling to make sure we have improved. 
5743 skip(t) 5744 5745 s := RunBasicJetStreamServer(t) 5746 defer s.Shutdown() 5747 5748 nc, js := jsClientConnect(t, s) 5749 defer nc.Close() 5750 5751 _, err := js.AddStream(&nats.StreamConfig{ 5752 Name: "ORDERS", 5753 Subjects: []string{"ORDERS.*"}, 5754 Storage: nats.MemoryStorage, 5755 Retention: nats.InterestPolicy, 5756 }) 5757 require_NoError(t, err) 5758 5759 toSend := 100_000 5760 numConsumers := 500 5761 5762 // Create 500 consumers 5763 for i := 1; i <= numConsumers; i++ { 5764 _, err := js.AddConsumer("ORDERS", &nats.ConsumerConfig{ 5765 Durable: fmt.Sprintf("ORDERS_%d", i), 5766 FilterSubject: fmt.Sprintf("ORDERS.%d", i), 5767 AckPolicy: nats.AckAllPolicy, 5768 }) 5769 require_NoError(t, err) 5770 } 5771 5772 for i := 1; i <= toSend; i++ { 5773 subj := fmt.Sprintf("ORDERS.%d", i%numConsumers+1) 5774 js.PublishAsync(subj, []byte("HELLO")) 5775 } 5776 select { 5777 case <-js.PublishAsyncComplete(): 5778 case <-time.After(5 * time.Second): 5779 t.Fatalf("Did not receive completion signal") 5780 } 5781 5782 sub, err := js.PullSubscribe("ORDERS.500", "ORDERS_500") 5783 require_NoError(t, err) 5784 5785 fetchSize := toSend / numConsumers 5786 msgs, err := sub.Fetch(fetchSize, nats.MaxWait(time.Second)) 5787 require_NoError(t, err) 5788 5789 last := msgs[len(msgs)-1] 5790 last.AckSync() 5791 } 5792 5793 // https://github.com/nats-io/nats-server/issues/3499 5794 func TestNoRaceJetStreamDeleteConsumerWithInterestStreamAndHighSeqs(t *testing.T) { 5795 s := RunBasicJetStreamServer(t) 5796 defer s.Shutdown() 5797 5798 // Client for API requests. 5799 nc, js := jsClientConnect(t, s) 5800 defer nc.Close() 5801 5802 _, err := js.AddStream(&nats.StreamConfig{ 5803 Name: "TEST", 5804 Subjects: []string{"log.>"}, 5805 Retention: nats.InterestPolicy, 5806 }) 5807 require_NoError(t, err) 5808 5809 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5810 Durable: "c", 5811 AckPolicy: nats.AckExplicitPolicy, 5812 }) 5813 require_NoError(t, err) 5814 5815 // Set baseline for time to delete so we can see linear increase as sequence numbers increase. 5816 start := time.Now() 5817 err = js.DeleteConsumer("TEST", "c") 5818 require_NoError(t, err) 5819 elapsed := time.Since(start) 5820 5821 // Crank up sequence numbers. 5822 msg := []byte(strings.Repeat("ZZZ", 128)) 5823 for i := 0; i < 5_000_000; i++ { 5824 nc.Publish("log.Z", msg) 5825 } 5826 nc.Flush() 5827 5828 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5829 Durable: "c", 5830 AckPolicy: nats.AckExplicitPolicy, 5831 }) 5832 require_NoError(t, err) 5833 5834 // We have a bug that spins unecessarily through all the sequences from this consumer's 5835 // ackfloor(0) and the last sequence for the stream. We will detect by looking for the time 5836 // to delete being 100x more. Should be the same since both times no messages exist in the stream. 5837 start = time.Now() 5838 err = js.DeleteConsumer("TEST", "c") 5839 require_NoError(t, err) 5840 5841 if e := time.Since(start); e > 100*elapsed { 5842 t.Fatalf("Consumer delete took too long: %v vs baseline %v", e, elapsed) 5843 } 5844 } 5845 5846 // Bug when we encode a timestamp that upon decode causes an error which causes server to panic. 5847 // This can happen on consumer redelivery since they adjusted timstamps can be in the future, and result 5848 // in a negative encoding. If that encoding was exactly -1 seconds, would cause decodeConsumerState to fail 5849 // and the server to panic. 
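// The loop below round-trips encode/decode 200,000 times with a pending timestamp one second
// in the future; repeating it that many times makes it likely that at least one iteration hits
// the exact -1 second rounding described above, which previously made decodeConsumerState fail.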
5850 func TestNoRaceEncodeConsumerStateBug(t *testing.T) { 5851 for i := 0; i < 200_000; i++ { 5852 // Pretend we redelivered and updated the timestamp to reflect the new start time for expiration. 5853 // The bug will trip when time.Now() rounded to seconds in encode is 1 second below the truncated version 5854 // of pending. 5855 pending := Pending{Sequence: 1, Timestamp: time.Now().Add(time.Second).UnixNano()} 5856 state := ConsumerState{ 5857 Delivered: SequencePair{Consumer: 1, Stream: 1}, 5858 Pending: map[uint64]*Pending{1: &pending}, 5859 } 5860 buf := encodeConsumerState(&state) 5861 _, err := decodeConsumerState(buf) 5862 require_NoError(t, err) 5863 } 5864 } 5865 5866 // Performance impact on stream ingress with large number of consumers. 5867 func TestNoRaceJetStreamLargeNumConsumersPerfImpact(t *testing.T) { 5868 skip(t) 5869 5870 s := RunBasicJetStreamServer(t) 5871 defer s.Shutdown() 5872 5873 // Client for API requests. 5874 nc, js := jsClientConnect(t, s) 5875 defer nc.Close() 5876 5877 _, err := js.AddStream(&nats.StreamConfig{ 5878 Name: "TEST", 5879 Subjects: []string{"foo"}, 5880 }) 5881 require_NoError(t, err) 5882 5883 // Baseline with no consumers. 5884 toSend := 1_000_000 5885 start := time.Now() 5886 for i := 0; i < toSend; i++ { 5887 js.PublishAsync("foo", []byte("OK")) 5888 } 5889 <-js.PublishAsyncComplete() 5890 tt := time.Since(start) 5891 fmt.Printf("Base time is %v\n", tt) 5892 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5893 5894 err = js.PurgeStream("TEST") 5895 require_NoError(t, err) 5896 5897 // Now add in 10 idle consumers. 5898 for i := 1; i <= 10; i++ { 5899 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5900 Durable: fmt.Sprintf("d-%d", i), 5901 AckPolicy: nats.AckExplicitPolicy, 5902 }) 5903 require_NoError(t, err) 5904 } 5905 5906 start = time.Now() 5907 for i := 0; i < toSend; i++ { 5908 js.PublishAsync("foo", []byte("OK")) 5909 } 5910 <-js.PublishAsyncComplete() 5911 tt = time.Since(start) 5912 fmt.Printf("\n10 consumers time is %v\n", tt) 5913 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5914 5915 err = js.PurgeStream("TEST") 5916 require_NoError(t, err) 5917 5918 // Now add in 90 more idle consumers. 5919 for i := 11; i <= 100; i++ { 5920 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5921 Durable: fmt.Sprintf("d-%d", i), 5922 AckPolicy: nats.AckExplicitPolicy, 5923 }) 5924 require_NoError(t, err) 5925 } 5926 5927 start = time.Now() 5928 for i := 0; i < toSend; i++ { 5929 js.PublishAsync("foo", []byte("OK")) 5930 } 5931 <-js.PublishAsyncComplete() 5932 tt = time.Since(start) 5933 fmt.Printf("\n100 consumers time is %v\n", tt) 5934 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5935 5936 err = js.PurgeStream("TEST") 5937 require_NoError(t, err) 5938 5939 // Now add in 900 more 5940 for i := 101; i <= 1000; i++ { 5941 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5942 Durable: fmt.Sprintf("d-%d", i), 5943 AckPolicy: nats.AckExplicitPolicy, 5944 }) 5945 require_NoError(t, err) 5946 } 5947 5948 start = time.Now() 5949 for i := 0; i < toSend; i++ { 5950 js.PublishAsync("foo", []byte("OK")) 5951 } 5952 <-js.PublishAsyncComplete() 5953 tt = time.Since(start) 5954 fmt.Printf("\n1000 consumers time is %v\n", tt) 5955 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5956 } 5957 5958 // Performance impact on large number of consumers but sparse delivery. 
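// "Sparse delivery" here means almost none of the ~10k filtered consumers ever match a
// published subject, so what is being measured is the per-message signaling overhead on
// ingest rather than delivery throughput. The single publish to "ID.1" at the end acts
// as a latency probe for that signaling path.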
5959 func TestNoRaceJetStreamLargeNumConsumersSparseDelivery(t *testing.T) { 5960 skip(t) 5961 5962 s := RunBasicJetStreamServer(t) 5963 defer s.Shutdown() 5964 5965 // Client for API requests. 5966 nc, js := jsClientConnect(t, s) 5967 defer nc.Close() 5968 5969 _, err := js.AddStream(&nats.StreamConfig{ 5970 Name: "TEST", 5971 Subjects: []string{"ID.*"}, 5972 }) 5973 require_NoError(t, err) 5974 5975 // Now add in ~10k consumers on different subjects. 5976 for i := 3; i <= 10_000; i++ { 5977 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5978 Durable: fmt.Sprintf("d-%d", i), 5979 FilterSubject: fmt.Sprintf("ID.%d", i), 5980 AckPolicy: nats.AckNonePolicy, 5981 }) 5982 require_NoError(t, err) 5983 } 5984 5985 toSend := 100_000 5986 5987 // Bind a consumer to ID.2. 5988 var received int 5989 done := make(chan bool) 5990 5991 nc, js = jsClientConnect(t, s) 5992 defer nc.Close() 5993 5994 mh := func(m *nats.Msg) { 5995 received++ 5996 if received >= toSend { 5997 close(done) 5998 } 5999 } 6000 _, err = js.Subscribe("ID.2", mh) 6001 require_NoError(t, err) 6002 6003 last := make(chan bool) 6004 _, err = js.Subscribe("ID.1", func(_ *nats.Msg) { close(last) }) 6005 require_NoError(t, err) 6006 6007 nc, _ = jsClientConnect(t, s) 6008 defer nc.Close() 6009 js, err = nc.JetStream(nats.PublishAsyncMaxPending(8 * 1024)) 6010 require_NoError(t, err) 6011 6012 start := time.Now() 6013 for i := 0; i < toSend; i++ { 6014 js.PublishAsync("ID.2", []byte("ok")) 6015 } 6016 // Check latency for this one message. 6017 // This will show the issue better than throughput which can bypass signal processing. 6018 js.PublishAsync("ID.1", []byte("ok")) 6019 6020 select { 6021 case <-done: 6022 break 6023 case <-time.After(10 * time.Second): 6024 t.Fatalf("Failed to receive all messages: %d of %d\n", received, toSend) 6025 } 6026 6027 tt := time.Since(start) 6028 fmt.Printf("Took %v to receive %d msgs\n", tt, toSend) 6029 fmt.Printf("%.0f msgs/s\n", float64(toSend)/tt.Seconds()) 6030 6031 select { 6032 case <-last: 6033 break 6034 case <-time.After(30 * time.Second): 6035 t.Fatalf("Failed to receive last message\n") 6036 } 6037 lt := time.Since(start) 6038 6039 fmt.Printf("Took %v to receive last msg\n", lt) 6040 } 6041 6042 func TestNoRaceJetStreamEndToEndLatency(t *testing.T) { 6043 s := RunBasicJetStreamServer(t) 6044 defer s.Shutdown() 6045 6046 // Client for API requests. 
6047 nc, js := jsClientConnect(t, s) 6048 defer nc.Close() 6049 6050 _, err := js.AddStream(&nats.StreamConfig{ 6051 Name: "TEST", 6052 Subjects: []string{"foo"}, 6053 }) 6054 require_NoError(t, err) 6055 6056 nc, js = jsClientConnect(t, s) 6057 defer nc.Close() 6058 6059 var sent time.Time 6060 var max time.Duration 6061 next := make(chan struct{}) 6062 6063 mh := func(m *nats.Msg) { 6064 received := time.Now() 6065 tt := received.Sub(sent) 6066 if max == 0 || tt > max { 6067 max = tt 6068 } 6069 next <- struct{}{} 6070 } 6071 sub, err := js.Subscribe("foo", mh) 6072 require_NoError(t, err) 6073 6074 nc, js = jsClientConnect(t, s) 6075 defer nc.Close() 6076 6077 toSend := 50_000 6078 for i := 0; i < toSend; i++ { 6079 sent = time.Now() 6080 js.Publish("foo", []byte("ok")) 6081 <-next 6082 } 6083 sub.Unsubscribe() 6084 6085 if max > 250*time.Millisecond { 6086 t.Fatalf("Expected max latency to be < 250ms, got %v", max) 6087 } 6088 } 6089 6090 func TestNoRaceJetStreamClusterEnsureWALCompact(t *testing.T) { 6091 c := createJetStreamClusterExplicit(t, "R3S", 3) 6092 defer c.shutdown() 6093 6094 nc, js := jsClientConnect(t, c.randomServer()) 6095 defer nc.Close() 6096 6097 _, err := js.AddStream(&nats.StreamConfig{ 6098 Name: "TEST", 6099 Subjects: []string{"foo"}, 6100 Replicas: 3, 6101 }) 6102 require_NoError(t, err) 6103 6104 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 6105 Durable: "dlc", 6106 DeliverSubject: "zz", 6107 Replicas: 3, 6108 }) 6109 require_NoError(t, err) 6110 6111 // Force snapshot on stream leader. 6112 sl := c.streamLeader(globalAccountName, "TEST") 6113 mset, err := sl.GlobalAccount().lookupStream("TEST") 6114 require_NoError(t, err) 6115 node := mset.raftNode() 6116 require_True(t, node != nil) 6117 6118 err = node.InstallSnapshot(mset.stateSnapshot()) 6119 require_NoError(t, err) 6120 6121 // Now publish more than should be needed to cause an additional snapshot. 6122 ns := 75_000 6123 for i := 0; i <= ns; i++ { 6124 _, err := js.Publish("foo", []byte("bar")) 6125 require_NoError(t, err) 6126 } 6127 6128 // Grab progress and use that to look into WAL entries. 6129 _, _, applied := node.Progress() 6130 // If ne == ns that means snapshots and compacts were not happening when 6131 // they should have been. 6132 if ne, _ := node.Applied(applied); ne >= uint64(ns) { 6133 t.Fatalf("Did not snapshot and compact the raft WAL, entries == %d", ne) 6134 } 6135 6136 // Now check consumer. 6137 // Force snapshot on consumerleader. 6138 cl := c.consumerLeader(globalAccountName, "TEST", "dlc") 6139 mset, err = cl.GlobalAccount().lookupStream("TEST") 6140 require_NoError(t, err) 6141 o := mset.lookupConsumer("dlc") 6142 require_True(t, o != nil) 6143 6144 node = o.raftNode() 6145 require_True(t, node != nil) 6146 6147 snap, err := o.store.EncodedState() 6148 require_NoError(t, err) 6149 err = node.InstallSnapshot(snap) 6150 require_NoError(t, err) 6151 6152 received, done := 0, make(chan bool, 1) 6153 6154 nc.Subscribe("zz", func(m *nats.Msg) { 6155 received++ 6156 if received >= ns { 6157 select { 6158 case done <- true: 6159 default: 6160 } 6161 } 6162 m.Ack() 6163 }) 6164 6165 select { 6166 case <-done: 6167 return 6168 case <-time.After(10 * time.Second): 6169 t.Fatalf("Did not received all %d msgs, only %d", ns, received) 6170 } 6171 6172 // Do same trick and check that WAL was compacted. 6173 // Grab progress and use that to look into WAL entries. 
6174 _, _, applied = node.Progress() 6175 // If ne == ns that means snapshots and compacts were not happening when 6176 // they should have been. 6177 if ne, _ := node.Applied(applied); ne >= uint64(ns) { 6178 t.Fatalf("Did not snapshot and compact the raft WAL, entries == %d", ne) 6179 } 6180 } 6181 6182 func TestNoRaceFileStoreStreamMaxAgePerformance(t *testing.T) { 6183 // Uncomment to run. 6184 skip(t) 6185 6186 storeDir := t.TempDir() 6187 maxAge := 5 * time.Second 6188 6189 fs, err := newFileStore( 6190 FileStoreConfig{StoreDir: storeDir}, 6191 StreamConfig{Name: "MA", 6192 Subjects: []string{"foo.*"}, 6193 MaxAge: maxAge, 6194 Storage: FileStorage}, 6195 ) 6196 require_NoError(t, err) 6197 defer fs.Stop() 6198 6199 // Simulate a callback similar to consumers decrementing. 6200 var mu sync.RWMutex 6201 var pending int64 6202 6203 fs.RegisterStorageUpdates(func(md, bd int64, seq uint64, subj string) { 6204 mu.Lock() 6205 defer mu.Unlock() 6206 pending += md 6207 }) 6208 6209 start, num, subj := time.Now(), 0, "foo.foo" 6210 6211 timeout := start.Add(maxAge) 6212 for time.Now().Before(timeout) { 6213 // We will store in blocks of 100. 6214 for i := 0; i < 100; i++ { 6215 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 6216 require_NoError(t, err) 6217 num++ 6218 } 6219 } 6220 elapsed := time.Since(start) 6221 fmt.Printf("Took %v to store %d\n", elapsed, num) 6222 fmt.Printf("%.0f msgs/sec\n", float64(num)/elapsed.Seconds()) 6223 6224 // Now keep running for 2x longer knowing we are expiring messages in the background. 6225 // We want to see the effect on performance. 6226 6227 start = time.Now() 6228 timeout = start.Add(maxAge * 2) 6229 6230 for time.Now().Before(timeout) { 6231 // We will store in blocks of 100. 6232 for i := 0; i < 100; i++ { 6233 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 6234 require_NoError(t, err) 6235 num++ 6236 } 6237 } 6238 elapsed = time.Since(start) 6239 fmt.Printf("Took %v to store %d\n", elapsed, num) 6240 fmt.Printf("%.0f msgs/sec\n", float64(num)/elapsed.Seconds()) 6241 } 6242 6243 // SequenceSet memory tests vs dmaps. 6244 func TestNoRaceSeqSetSizeComparison(t *testing.T) { 6245 // Create 5M random entries (dupes possible but ok for this test) out of 8M range. 6246 num := 5_000_000 6247 max := 7_000_000 6248 6249 seqs := make([]uint64, 0, num) 6250 for i := 0; i < num; i++ { 6251 n := uint64(rand.Int63n(int64(max + 1))) 6252 seqs = append(seqs, n) 6253 } 6254 6255 runtime.GC() 6256 // Disable to get stable results. 6257 gcp := debug.SetGCPercent(-1) 6258 defer debug.SetGCPercent(gcp) 6259 6260 mem := runtime.MemStats{} 6261 runtime.ReadMemStats(&mem) 6262 inUseBefore := mem.HeapInuse 6263 6264 dmap := make(map[uint64]struct{}, num) 6265 for _, n := range seqs { 6266 dmap[n] = struct{}{} 6267 } 6268 runtime.ReadMemStats(&mem) 6269 dmapUse := mem.HeapInuse - inUseBefore 6270 inUseBefore = mem.HeapInuse 6271 6272 // Now do SequenceSet on same dataset. 
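// avl.SequenceSet is, roughly, an ordered set of sequences backed by fixed-size chunks,
// so for dense sequence ranges it should be far more compact than a map[uint64]struct{}.
// The assertions below only check the relative sizes; usage mirrors the map:
//
//	var ss avl.SequenceSet
//	ss.Insert(22)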
6273 var sset avl.SequenceSet
6274 for _, n := range seqs {
6275 sset.Insert(n)
6276 }
6277
6278 runtime.ReadMemStats(&mem)
6279 seqSetUse := mem.HeapInuse - inUseBefore
6280
6281 if seqSetUse > 2*1024*1024 {
6282 t.Fatalf("Expected SequenceSet size to be < 2M, got %v", friendlyBytes(int64(seqSetUse)))
6283 }
6284 if seqSetUse*50 > dmapUse {
6285 t.Fatalf("Expected SequenceSet to be at least 50x better than the dmap approach: %v vs %v",
6286 friendlyBytes(int64(seqSetUse)),
6287 friendlyBytes(int64(dmapUse)),
6288 )
6289 }
6290 }
6291
6292 // FilteredState for ">" with large interior deletes was very slow.
6293 func TestNoRaceFileStoreFilteredStateWithLargeDeletes(t *testing.T) {
6294 storeDir := t.TempDir()
6295
6296 fs, err := newFileStore(
6297 FileStoreConfig{StoreDir: storeDir, BlockSize: 4096},
6298 StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage},
6299 )
6300 require_NoError(t, err)
6301 defer fs.Stop()
6302
6303 subj, msg := "foo", []byte("Hello World")
6304
6305 toStore := 500_000
6306 for i := 0; i < toStore; i++ {
6307 _, _, err := fs.StoreMsg(subj, nil, msg)
6308 require_NoError(t, err)
6309 }
6310
6311 // Now delete every other one.
6312 for seq := 2; seq <= toStore; seq += 2 {
6313 _, err := fs.RemoveMsg(uint64(seq))
6314 require_NoError(t, err)
6315 }
6316
6317 runtime.GC()
6318 // Disable to get stable results.
6319 gcp := debug.SetGCPercent(-1)
6320 defer debug.SetGCPercent(gcp)
6321
6322 start := time.Now()
6323 fss := fs.FilteredState(1, _EMPTY_)
6324 elapsed := time.Since(start)
6325
6326 require_True(t, fss.Msgs == uint64(toStore/2))
6327 require_True(t, elapsed < 500*time.Microsecond)
6328 }
6329
6330 // ConsumerInfo seems to be called quite a bit more than we had anticipated.
6331 // Under certain circumstances, since we reset num pending, this can be very costly.
6332 // We will use the fast path to alleviate that performance bottleneck but also make
6333 // sure we are still being accurate.
6334 func TestNoRaceJetStreamClusterConsumerInfoSpeed(t *testing.T) {
6335 c := createJetStreamClusterExplicit(t, "R3S", 3)
6336 defer c.shutdown()
6337
6338 c.waitOnLeader()
6339 server := c.randomNonLeader()
6340
6341 nc, js := jsClientConnect(t, server)
6342 defer nc.Close()
6343
6344 _, err := js.AddStream(&nats.StreamConfig{
6345 Name: "TEST",
6346 Subjects: []string{"events.>"},
6347 Replicas: 3,
6348 })
6349 require_NoError(t, err)
6350
6351 // The issue is compounded when we have lots of different subjects captured
6352 // by a terminal fwc. The consumer will have a terminal pwc.
6353 // Here make all subjects unique.
6354
6355 sub, err := js.PullSubscribe("events.*", "DLC")
6356 require_NoError(t, err)
6357
6358 toSend := 250_000
6359 for i := 0; i < toSend; i++ {
6360 subj := fmt.Sprintf("events.%d", i+1)
6361 js.PublishAsync(subj, []byte("ok"))
6362 }
6363 select {
6364 case <-js.PublishAsyncComplete():
6365 case <-time.After(5 * time.Second):
6366 t.Fatalf("Did not receive completion signal")
6367 }
6368
6369 checkNumPending := func(expected int) {
6370 t.Helper()
6371 start := time.Now()
6372 ci, err := js.ConsumerInfo("TEST", "DLC")
6373 require_NoError(t, err)
6374 // Make sure these are fast now.
6375 if elapsed := time.Since(start); elapsed > 5*time.Millisecond {
6376 t.Fatalf("ConsumerInfo took too long: %v", elapsed)
6377 }
6378 // Make sure pending == expected.
6379 if ci.NumPending != uint64(expected) { 6380 t.Fatalf("Expected %d NumPending, got %d", expected, ci.NumPending) 6381 } 6382 } 6383 // Make sure in simple case it is correct. 6384 checkNumPending(toSend) 6385 6386 // Do a few acks. 6387 toAck := 25 6388 for _, m := range fetchMsgs(t, sub, 25, time.Second) { 6389 err = m.AckSync() 6390 require_NoError(t, err) 6391 } 6392 checkNumPending(toSend - toAck) 6393 6394 // Now do a purge such that we only keep so many. 6395 // We want to make sure we do the right thing here and have correct calculations. 6396 toKeep := 100_000 6397 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Keep: uint64(toKeep)}) 6398 require_NoError(t, err) 6399 6400 checkNumPending(toKeep) 6401 } 6402 6403 func TestNoRaceJetStreamKVAccountWithServerRestarts(t *testing.T) { 6404 // Uncomment to run. Needs fast machine to not time out on KeyValue lookup. 6405 skip(t) 6406 6407 c := createJetStreamClusterExplicit(t, "R3S", 3) 6408 defer c.shutdown() 6409 6410 nc, js := jsClientConnect(t, c.randomServer()) 6411 defer nc.Close() 6412 6413 _, err := js.CreateKeyValue(&nats.KeyValueConfig{ 6414 Bucket: "TEST", 6415 Replicas: 3, 6416 }) 6417 require_NoError(t, err) 6418 6419 npubs := 10_000 6420 par := 8 6421 iter := 2 6422 nsubjs := 250 6423 6424 wg := sync.WaitGroup{} 6425 putKeys := func() { 6426 wg.Add(1) 6427 go func() { 6428 defer wg.Done() 6429 nc, js := jsClientConnect(t, c.randomServer()) 6430 defer nc.Close() 6431 kv, err := js.KeyValue("TEST") 6432 require_NoError(t, err) 6433 6434 for i := 0; i < npubs; i++ { 6435 subj := fmt.Sprintf("KEY-%d", rand.Intn(nsubjs)) 6436 if _, err := kv.PutString(subj, "hello"); err != nil { 6437 nc, js := jsClientConnect(t, c.randomServer()) 6438 defer nc.Close() 6439 kv, err = js.KeyValue("TEST") 6440 require_NoError(t, err) 6441 } 6442 } 6443 }() 6444 } 6445 6446 restartServers := func() { 6447 time.Sleep(2 * time.Second) 6448 // Rotate through and restart the servers. 6449 for _, server := range c.servers { 6450 server.Shutdown() 6451 restarted := c.restartServer(server) 6452 checkFor(t, time.Second, 200*time.Millisecond, func() error { 6453 hs := restarted.healthz(&HealthzOptions{ 6454 JSEnabled: true, 6455 JSServerOnly: true, 6456 }) 6457 if hs.Error != _EMPTY_ { 6458 return errors.New(hs.Error) 6459 } 6460 return nil 6461 }) 6462 } 6463 c.waitOnLeader() 6464 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 6465 } 6466 6467 for n := 0; n < iter; n++ { 6468 for i := 0; i < par; i++ { 6469 putKeys() 6470 } 6471 restartServers() 6472 } 6473 wg.Wait() 6474 6475 nc, js = jsClientConnect(t, c.randomServer()) 6476 defer nc.Close() 6477 6478 si, err := js.StreamInfo("KV_TEST") 6479 require_NoError(t, err) 6480 require_True(t, si.State.NumSubjects == uint64(nsubjs)) 6481 } 6482 6483 // Test for consumer create when the subject cardinality is high and the 6484 // consumer is filtered with a wildcard that forces linear scans. 6485 // We have an optimization to use in memory structures in filestore to speed up. 6486 // Only if asking to scan all (DeliverAll). 
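// In other words, creating a consumer needs an initial NumPending for its filter
// ("events.*", "events.99999" and ">" below), and without that in-memory subject state
// the server would have to walk a large portion of the 500k stored messages. The timing
// assertions below are the observable proxy for that optimization.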
6487 func TestNoRaceJetStreamConsumerCreateTimeNumPending(t *testing.T) { 6488 s := RunBasicJetStreamServer(t) 6489 defer s.Shutdown() 6490 6491 nc, js := jsClientConnect(t, s) 6492 defer nc.Close() 6493 6494 _, err := js.AddStream(&nats.StreamConfig{ 6495 Name: "TEST", 6496 Subjects: []string{"events.>"}, 6497 }) 6498 require_NoError(t, err) 6499 6500 n := 500_000 6501 msg := bytes.Repeat([]byte("X"), 8*1024) 6502 6503 for i := 0; i < n; i++ { 6504 subj := fmt.Sprintf("events.%d", rand.Intn(100_000)) 6505 js.PublishAsync(subj, msg) 6506 } 6507 select { 6508 case <-js.PublishAsyncComplete(): 6509 case <-time.After(5 * time.Second): 6510 } 6511 6512 // Should stay under 5ms now, but for Travis variability say 50ms. 6513 threshold := 50 * time.Millisecond 6514 6515 start := time.Now() 6516 _, err = js.PullSubscribe("events.*", "dlc") 6517 require_NoError(t, err) 6518 if elapsed := time.Since(start); elapsed > threshold { 6519 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6520 } 6521 6522 start = time.Now() 6523 _, err = js.PullSubscribe("events.99999", "xxx") 6524 require_NoError(t, err) 6525 if elapsed := time.Since(start); elapsed > threshold { 6526 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6527 } 6528 6529 start = time.Now() 6530 _, err = js.PullSubscribe(">", "zzz") 6531 require_NoError(t, err) 6532 if elapsed := time.Since(start); elapsed > threshold { 6533 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6534 } 6535 } 6536 6537 func TestNoRaceJetStreamClusterGhostConsumers(t *testing.T) { 6538 c := createJetStreamClusterExplicit(t, "GHOST", 3) 6539 defer c.shutdown() 6540 6541 nc, js := jsClientConnect(t, c.randomServer()) 6542 defer nc.Close() 6543 6544 _, err := js.AddStream(&nats.StreamConfig{ 6545 Name: "TEST", 6546 Subjects: []string{"events.>"}, 6547 Replicas: 3, 6548 }) 6549 require_NoError(t, err) 6550 6551 for i := 0; i < 10; i++ { 6552 for j := 0; j < 10; j++ { 6553 require_NoError(t, nc.Publish(fmt.Sprintf("events.%d.%d", i, j), []byte(`test`))) 6554 } 6555 } 6556 6557 fetch := func(id int) { 6558 subject := fmt.Sprintf("events.%d.*", id) 6559 subscription, err := js.PullSubscribe(subject, 6560 _EMPTY_, // ephemeral consumer 6561 nats.DeliverAll(), 6562 nats.ReplayInstant(), 6563 nats.BindStream("TEST"), 6564 nats.ConsumerReplicas(1), 6565 nats.ConsumerMemoryStorage(), 6566 ) 6567 if err != nil { 6568 return 6569 } 6570 defer subscription.Unsubscribe() 6571 6572 info, err := subscription.ConsumerInfo() 6573 if err != nil { 6574 return 6575 } 6576 6577 subscription.Fetch(int(info.NumPending)) 6578 } 6579 6580 replay := func(ctx context.Context, id int) { 6581 for { 6582 select { 6583 case <-ctx.Done(): 6584 return 6585 default: 6586 fetch(id) 6587 } 6588 } 6589 } 6590 6591 ctx, cancel := context.WithCancel(context.Background()) 6592 6593 go replay(ctx, 0) 6594 go replay(ctx, 1) 6595 go replay(ctx, 2) 6596 go replay(ctx, 3) 6597 go replay(ctx, 4) 6598 go replay(ctx, 5) 6599 go replay(ctx, 6) 6600 go replay(ctx, 7) 6601 go replay(ctx, 8) 6602 go replay(ctx, 9) 6603 6604 time.Sleep(5 * time.Second) 6605 6606 for _, server := range c.servers { 6607 server.Shutdown() 6608 restarted := c.restartServer(server) 6609 checkFor(t, time.Second, 200*time.Millisecond, func() error { 6610 hs := restarted.healthz(&HealthzOptions{ 6611 JSEnabled: true, 6612 JSServerOnly: true, 6613 }) 6614 if hs.Error != _EMPTY_ { 6615 return errors.New(hs.Error) 6616 } 6617 return nil 6618 }) 
6619 c.waitOnStreamLeader(globalAccountName, "TEST") 6620 time.Sleep(time.Second * 2) 6621 go replay(ctx, 5) 6622 go replay(ctx, 6) 6623 go replay(ctx, 7) 6624 go replay(ctx, 8) 6625 go replay(ctx, 9) 6626 } 6627 6628 time.Sleep(5 * time.Second) 6629 cancel() 6630 6631 getMissing := func() []string { 6632 m, err := nc.Request("$JS.API.CONSUMER.LIST.TEST", nil, time.Second*10) 6633 require_NoError(t, err) 6634 6635 var resp JSApiConsumerListResponse 6636 err = json.Unmarshal(m.Data, &resp) 6637 require_NoError(t, err) 6638 return resp.Missing 6639 } 6640 6641 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 6642 missing := getMissing() 6643 if len(missing) == 0 { 6644 return nil 6645 } 6646 return fmt.Errorf("Still have missing: %+v", missing) 6647 }) 6648 } 6649 6650 // This is to test a publish slowdown and general instability experienced in a setup similar to this. 6651 // We have feeder streams that are all sourced to an aggregate stream. All streams are interest retention. 6652 // We want to monitor the avg publish time for the sync publishers to the feeder streams, the ingest rate to 6653 // the aggregate stream, and general health of the consumers on the aggregate stream. 6654 // Target publish rate is ~2k/s with publish time being ~40-60ms but remaining stable. 6655 // We can also simulate max redeliveries that create interior deletes in streams. 6656 func TestNoRaceJetStreamClusterF3Setup(t *testing.T) { 6657 // Uncomment to run. Needs to be on a pretty big machine. Do not want as part of Travis tests atm. 6658 skip(t) 6659 6660 // These and the settings below achieve ~60ms pub time on avg and ~2k msgs per sec inbound to the aggregate stream. 6661 // On my machine though. 6662 np := clusterProxy{ 6663 rtt: 2 * time.Millisecond, 6664 up: 1 * 1024 * 1024 * 1024, // 1gbit 6665 down: 1 * 1024 * 1024 * 1024, // 1gbit 6666 } 6667 6668 // Test params. 6669 numSourceStreams := 20 6670 numConsumersPerSource := 1 6671 numPullersPerConsumer := 50 6672 numPublishers := 100 6673 setHighStartSequence := false 6674 simulateMaxRedeliveries := false 6675 maxBadPubTimes := uint32(20) 6676 badPubThresh := 500 * time.Millisecond 6677 testTime := 5 * time.Minute // make sure to do --timeout=65m 6678 6679 t.Logf("Starting Test: Total Test Time %v", testTime) 6680 6681 c := createJetStreamClusterWithNetProxy(t, "R3S", 3, &np) 6682 defer c.shutdown() 6683 6684 // Do some quick sanity checking for latency stuff. 
6685 { 6686 nc, js := jsClientConnect(t, c.randomServer()) 6687 defer nc.Close() 6688 6689 _, err := js.AddStream(&nats.StreamConfig{ 6690 Name: "TEST", 6691 Replicas: 3, 6692 Subjects: []string{"foo"}, 6693 Retention: nats.InterestPolicy, 6694 }) 6695 require_NoError(t, err) 6696 defer js.DeleteStream("TEST") 6697 6698 sl := c.streamLeader(globalAccountName, "TEST") 6699 nc, js = jsClientConnect(t, sl) 6700 defer nc.Close() 6701 start := time.Now() 6702 _, err = js.Publish("foo", []byte("hello")) 6703 require_NoError(t, err) 6704 // This is best case, and with client connection being close to free, this should be at least > rtt 6705 if elapsed := time.Since(start); elapsed < np.rtt { 6706 t.Fatalf("Expected publish time to be > %v, got %v", np.rtt, elapsed) 6707 } 6708 6709 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 6710 nc, js = jsClientConnect(t, nl) 6711 defer nc.Close() 6712 start = time.Now() 6713 _, err = js.Publish("foo", []byte("hello")) 6714 require_NoError(t, err) 6715 // This is worst case, meaning message has to travel to leader, then to fastest replica, then back. 6716 // So should be at 3x rtt, so check at least > 2x rtt. 6717 if elapsed := time.Since(start); elapsed < 2*np.rtt { 6718 t.Fatalf("Expected publish time to be > %v, got %v", 2*np.rtt, elapsed) 6719 } 6720 } 6721 6722 // Setup source streams. 6723 nc, js := jsClientConnect(t, c.randomServer()) 6724 defer nc.Close() 6725 6726 t.Logf("Creating %d Source Streams", numSourceStreams) 6727 6728 var sources []string 6729 wg := sync.WaitGroup{} 6730 for i := 0; i < numSourceStreams; i++ { 6731 sname := fmt.Sprintf("EVENT-%s", nuid.Next()) 6732 sources = append(sources, sname) 6733 wg.Add(1) 6734 go func(stream string) { 6735 defer wg.Done() 6736 t.Logf(" %q", stream) 6737 subj := fmt.Sprintf("%s.>", stream) 6738 _, err := js.AddStream(&nats.StreamConfig{ 6739 Name: stream, 6740 Subjects: []string{subj}, 6741 Replicas: 3, 6742 Retention: nats.InterestPolicy, 6743 }) 6744 require_NoError(t, err) 6745 for j := 0; j < numConsumersPerSource; j++ { 6746 consumer := fmt.Sprintf("C%d", j) 6747 _, err := js.Subscribe(_EMPTY_, func(msg *nats.Msg) { 6748 msg.Ack() 6749 }, nats.BindStream(stream), nats.Durable(consumer), nats.ManualAck()) 6750 require_NoError(t, err) 6751 } 6752 }(sname) 6753 } 6754 wg.Wait() 6755 6756 var streamSources []*nats.StreamSource 6757 for _, src := range sources { 6758 streamSources = append(streamSources, &nats.StreamSource{Name: src}) 6759 6760 } 6761 6762 t.Log("Creating Aggregate Stream") 6763 6764 // Now create the aggregate stream. 6765 _, err := js.AddStream(&nats.StreamConfig{ 6766 Name: "EVENTS", 6767 Replicas: 3, 6768 Retention: nats.InterestPolicy, 6769 Sources: streamSources, 6770 }) 6771 require_NoError(t, err) 6772 6773 // Set first sequence to a high number. 6774 if setHighStartSequence { 6775 require_NoError(t, js.PurgeStream("EVENTS", &nats.StreamPurgeRequest{Sequence: 32_000_001})) 6776 } 6777 6778 // Now create 2 pull consumers. 6779 _, err = js.PullSubscribe(_EMPTY_, "C1", 6780 nats.BindStream("EVENTS"), 6781 nats.MaxDeliver(1), 6782 nats.AckWait(10*time.Second), 6783 nats.ManualAck(), 6784 ) 6785 require_NoError(t, err) 6786 6787 _, err = js.PullSubscribe(_EMPTY_, "C2", 6788 nats.BindStream("EVENTS"), 6789 nats.MaxDeliver(1), 6790 nats.AckWait(10*time.Second), 6791 nats.ManualAck(), 6792 ) 6793 require_NoError(t, err) 6794 6795 t.Logf("Creating %d x 2 Pull Subscribers", numPullersPerConsumer) 6796 6797 // Now create the pullers. 
6798 for _, subName := range []string{"C1", "C2"} { 6799 for i := 0; i < numPullersPerConsumer; i++ { 6800 go func(subName string) { 6801 nc, js := jsClientConnect(t, c.randomServer()) 6802 defer nc.Close() 6803 6804 sub, err := js.PullSubscribe(_EMPTY_, subName, 6805 nats.BindStream("EVENTS"), 6806 nats.MaxDeliver(1), 6807 nats.AckWait(10*time.Second), 6808 nats.ManualAck(), 6809 ) 6810 require_NoError(t, err) 6811 6812 for { 6813 msgs, err := sub.Fetch(25, nats.MaxWait(2*time.Second)) 6814 if err != nil && err != nats.ErrTimeout { 6815 t.Logf("Exiting pull subscriber %q: %v", subName, err) 6816 return 6817 } 6818 // Shuffle 6819 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 6820 6821 // Wait for a random interval up to 100ms. 6822 time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond) 6823 6824 for _, m := range msgs { 6825 // If we want to simulate max redeliveries being hit, since not acking 6826 // once will cause it due to subscriber setup. 6827 // 100_000 == 0.01% 6828 if simulateMaxRedeliveries && rand.Intn(100_000) == 0 { 6829 md, err := m.Metadata() 6830 require_NoError(t, err) 6831 t.Logf("** Skipping Ack: %d **", md.Sequence.Stream) 6832 } else { 6833 m.Ack() 6834 } 6835 } 6836 } 6837 }(subName) 6838 } 6839 } 6840 6841 // Now create feeder publishers. 6842 eventTypes := []string{"PAYMENT", "SUBMISSION", "CANCEL"} 6843 6844 msg := make([]byte, 2*1024) // 2k payload 6845 crand.Read(msg) 6846 6847 // For tracking pub times. 6848 var pubs int 6849 var totalPubTime time.Duration 6850 var pmu sync.Mutex 6851 last := time.Now() 6852 6853 updatePubStats := func(elapsed time.Duration) { 6854 pmu.Lock() 6855 defer pmu.Unlock() 6856 // Reset every 5s 6857 if time.Since(last) > 5*time.Second { 6858 pubs = 0 6859 totalPubTime = 0 6860 last = time.Now() 6861 } 6862 pubs++ 6863 totalPubTime += elapsed 6864 } 6865 avgPubTime := func() time.Duration { 6866 pmu.Lock() 6867 np := pubs 6868 tpt := totalPubTime 6869 pmu.Unlock() 6870 return tpt / time.Duration(np) 6871 } 6872 6873 t.Logf("Creating %d Publishers", numPublishers) 6874 6875 var numLimitsExceeded atomic.Uint32 6876 errCh := make(chan error, 100) 6877 6878 for i := 0; i < numPublishers; i++ { 6879 go func() { 6880 nc, js := jsClientConnect(t, c.randomServer()) 6881 defer nc.Close() 6882 6883 for { 6884 // Grab a random source stream 6885 stream := sources[rand.Intn(len(sources))] 6886 // Grab random event type. 6887 evt := eventTypes[rand.Intn(len(eventTypes))] 6888 subj := fmt.Sprintf("%s.%s", stream, evt) 6889 start := time.Now() 6890 _, err := js.Publish(subj, msg) 6891 if err != nil { 6892 t.Logf("Exiting publisher: %v", err) 6893 return 6894 } 6895 elapsed := time.Since(start) 6896 if elapsed > badPubThresh { 6897 t.Logf("Publish time took more than expected: %v", elapsed) 6898 numLimitsExceeded.Add(1) 6899 if ne := numLimitsExceeded.Load(); ne > maxBadPubTimes { 6900 errCh <- fmt.Errorf("Too many exceeded times on publish: %d", ne) 6901 return 6902 } 6903 } 6904 updatePubStats(elapsed) 6905 } 6906 }() 6907 } 6908 6909 t.Log("Creating Monitoring Routine - Data in ~10s") 6910 6911 // Create monitoring routine. 
6912 go func() { 6913 nc, js := jsClientConnect(t, c.randomServer()) 6914 defer nc.Close() 6915 6916 fseq, lseq := uint64(0), uint64(0) 6917 for { 6918 // Grab consumers 6919 var minAckFloor uint64 = math.MaxUint64 6920 for _, consumer := range []string{"C1", "C2"} { 6921 ci, err := js.ConsumerInfo("EVENTS", consumer) 6922 if err != nil { 6923 t.Logf("Exiting Monitor: %v", err) 6924 return 6925 } 6926 if lseq > 0 { 6927 t.Logf("%s:\n Delivered:\t%d\n AckFloor:\t%d\n AckPending:\t%d\n NumPending:\t%d", 6928 consumer, ci.Delivered.Stream, ci.AckFloor.Stream, ci.NumAckPending, ci.NumPending) 6929 } 6930 if ci.AckFloor.Stream < minAckFloor { 6931 minAckFloor = ci.AckFloor.Stream 6932 } 6933 } 6934 // Now grab aggregate stream state. 6935 si, err := js.StreamInfo("EVENTS") 6936 if err != nil { 6937 t.Logf("Exiting Monitor: %v", err) 6938 return 6939 } 6940 state := si.State 6941 if lseq != 0 { 6942 t.Logf("Stream:\n Msgs: \t%d\n First:\t%d\n Last: \t%d\n Deletes:\t%d\n", 6943 state.Msgs, state.FirstSeq, state.LastSeq, state.NumDeleted) 6944 t.Logf("Publish Stats:\n Msgs/s:\t%0.2f\n Avg Pub:\t%v\n\n", float64(si.State.LastSeq-lseq)/5.0, avgPubTime()) 6945 if si.State.FirstSeq < minAckFloor && si.State.FirstSeq == fseq { 6946 t.Log("Stream first seq < minimum ack floor") 6947 } 6948 } 6949 fseq, lseq = si.State.FirstSeq, si.State.LastSeq 6950 time.Sleep(5 * time.Second) 6951 } 6952 6953 }() 6954 6955 select { 6956 case e := <-errCh: 6957 t.Fatal(e) 6958 case <-time.After(testTime): 6959 t.Fatalf("Did not receive completion signal") 6960 } 6961 } 6962 6963 // Unbalanced stretch cluster. 6964 // S2 (stream leader) will have a slow path to S1 (via proxy) and S3 (consumer leader) will have a fast path. 6965 // 6966 // Route Ports 6967 // "S1": 14622 6968 // "S2": 15622 6969 // "S3": 16622 6970 func createStretchUnbalancedCluster(t testing.TB) (c *cluster, np *netProxy) { 6971 t.Helper() 6972 6973 tmpl := ` 6974 listen: 127.0.0.1:-1 6975 server_name: %s 6976 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 6977 6978 cluster { 6979 name: "F3" 6980 listen: 127.0.0.1:%d 6981 routes = [%s] 6982 } 6983 6984 accounts { 6985 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 6986 } 6987 ` 6988 // Do these in order, S1, S2 (proxy) then S3. 6989 c = &cluster{t: t, servers: make([]*Server, 3), opts: make([]*Options, 3), name: "F3"} 6990 6991 // S1 6992 conf := fmt.Sprintf(tmpl, "S1", t.TempDir(), 14622, "route://127.0.0.1:15622, route://127.0.0.1:16622") 6993 c.servers[0], c.opts[0] = RunServerWithConfig(createConfFile(t, []byte(conf))) 6994 6995 // S2 6996 // Create the proxy first. Connect this to S1. Make it slow, e.g. 5ms RTT. 
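// (Assumed argument mapping, mirroring the clusterProxy fields used earlier in this file:
// the leading arguments to createNetProxy below are the injected RTT and the up/down
// bandwidth limits, followed by the route URL being proxied.)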
6997 np = createNetProxy(1*time.Millisecond, 1024*1024*1024, 1024*1024*1024, "route://127.0.0.1:14622", true)
6998 routes := fmt.Sprintf("%s, route://127.0.0.1:16622", np.routeURL())
6999 conf = fmt.Sprintf(tmpl, "S2", t.TempDir(), 15622, routes)
7000 c.servers[1], c.opts[1] = RunServerWithConfig(createConfFile(t, []byte(conf)))
7001
7002 // S3
7003 conf = fmt.Sprintf(tmpl, "S3", t.TempDir(), 16622, "route://127.0.0.1:14622, route://127.0.0.1:15622")
7004 c.servers[2], c.opts[2] = RunServerWithConfig(createConfFile(t, []byte(conf)))
7005
7006 c.checkClusterFormed()
7007 c.waitOnClusterReady()
7008
7009 return c, np
7010 }
7011
7012 // We test an interest based stream that has a cluster with a node with asymmetric paths from
7013 // the stream leader and the consumer leader such that the consumer leader path is fast and
7014 // replicated acks arrive sooner than the actual message. This path was considered, but also
7015 // categorized as very rare and was expensive as it tried to forward a new stream msg delete
7016 // proposal to the original stream leader. It will now deal with the issue locally and not
7017 // slow down the ingest rate to the stream's publishers.
7018 func TestNoRaceJetStreamClusterDifferentRTTInterestBasedStreamSetup(t *testing.T) {
7019 // Uncomment to run. Do not want as part of Travis tests atm.
7020 skip(t)
7021
7022 c, np := createStretchUnbalancedCluster(t)
7023 defer c.shutdown()
7024 defer np.stop()
7025
7026 nc, js := jsClientConnect(t, c.randomServer())
7027 defer nc.Close()
7028
7029 // Now create the stream.
7030 _, err := js.AddStream(&nats.StreamConfig{
7031 Name: "EVENTS",
7032 Subjects: []string{"EV.>"},
7033 Replicas: 3,
7034 Retention: nats.InterestPolicy,
7035 })
7036 require_NoError(t, err)
7037
7038 // Make sure its leader is on S2.
7039 sl := c.servers[1]
7040 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7041 c.waitOnStreamLeader(globalAccountName, "EVENTS")
7042 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl {
7043 s.JetStreamStepdownStream(globalAccountName, "EVENTS")
7044 return fmt.Errorf("Server %s is not stream leader yet", sl)
7045 }
7046 return nil
7047 })
7048
7049 // Now create the consumer.
7050 _, err = js.PullSubscribe(_EMPTY_, "C", nats.BindStream("EVENTS"), nats.ManualAck())
7051 require_NoError(t, err)
7052
7053 // Make sure the consumer leader is on S3.
7054 cl := c.servers[2]
7055 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7056 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C")
7057 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl {
7058 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C")
7059 return fmt.Errorf("Server %s is not consumer leader yet", cl)
7060 }
7061 return nil
7062 })
7063
7064 go func(js nats.JetStream) {
7065 sub, err := js.PullSubscribe(_EMPTY_, "C", nats.BindStream("EVENTS"), nats.ManualAck())
7066 require_NoError(t, err)
7067
7068 for {
7069 msgs, err := sub.Fetch(100, nats.MaxWait(2*time.Second))
7070 if err != nil && err != nats.ErrTimeout {
7071 return
7072 }
7073 // Shuffle
7074 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] })
7075 for _, m := range msgs {
7076 m.Ack()
7077 }
7078 }
7079 }(js)
7080
7081 numPublishers := 25
7082 pubThresh := 2 * time.Second
7083 var maxExceeded atomic.Int64
7084 errCh := make(chan error, numPublishers)
7085 wg := sync.WaitGroup{}
7086
7087 msg := make([]byte, 2*1024) // 2k payload
7088 crand.Read(msg)
7089
7090 // Publishers.
7091 for i := 0; i < numPublishers; i++ { 7092 wg.Add(1) 7093 go func(iter int) { 7094 defer wg.Done() 7095 7096 // Connect to random, the slow ones will be connected to the slow node. 7097 // But if you connect them all there it will pass. 7098 s := c.randomServer() 7099 nc, js := jsClientConnect(t, s) 7100 defer nc.Close() 7101 7102 for i := 0; i < 1_000; i++ { 7103 start := time.Now() 7104 _, err := js.Publish("EV.PAID", msg) 7105 if err != nil { 7106 errCh <- fmt.Errorf("Publish error: %v", err) 7107 return 7108 } 7109 if elapsed := time.Since(start); elapsed > pubThresh { 7110 errCh <- fmt.Errorf("Publish time exceeded") 7111 if int64(elapsed) > maxExceeded.Load() { 7112 maxExceeded.Store(int64(elapsed)) 7113 } 7114 return 7115 } 7116 } 7117 }(i) 7118 } 7119 7120 wg.Wait() 7121 7122 select { 7123 case e := <-errCh: 7124 t.Fatalf("%v: threshold is %v, maximum seen: %v", e, pubThresh, time.Duration(maxExceeded.Load())) 7125 default: 7126 } 7127 } 7128 7129 func TestNoRaceJetStreamInterestStreamCheckInterestRaceBug(t *testing.T) { 7130 c := createJetStreamClusterExplicit(t, "R3S", 3) 7131 defer c.shutdown() 7132 7133 nc, js := jsClientConnect(t, c.randomServer()) 7134 defer nc.Close() 7135 7136 _, err := js.AddStream(&nats.StreamConfig{ 7137 Name: "TEST", 7138 Subjects: []string{"foo"}, 7139 Replicas: 3, 7140 Retention: nats.InterestPolicy, 7141 }) 7142 require_NoError(t, err) 7143 7144 numConsumers := 10 7145 for i := 0; i < numConsumers; i++ { 7146 nc, js := jsClientConnect(t, c.randomServer()) 7147 defer nc.Close() 7148 7149 _, err = js.Subscribe("foo", func(m *nats.Msg) { 7150 m.Ack() 7151 }, nats.Durable(fmt.Sprintf("C%d", i)), nats.ManualAck()) 7152 require_NoError(t, err) 7153 } 7154 7155 numToSend := 10_000 7156 for i := 0; i < numToSend; i++ { 7157 _, err := js.PublishAsync("foo", nil, nats.StallWait(800*time.Millisecond)) 7158 require_NoError(t, err) 7159 } 7160 select { 7161 case <-js.PublishAsyncComplete(): 7162 case <-time.After(20 * time.Second): 7163 t.Fatalf("Did not receive completion signal") 7164 } 7165 7166 // Wait til ackfloor is correct for all consumers. 7167 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 7168 for _, s := range c.servers { 7169 mset, err := s.GlobalAccount().lookupStream("TEST") 7170 require_NoError(t, err) 7171 7172 mset.mu.RLock() 7173 defer mset.mu.RUnlock() 7174 7175 require_True(t, len(mset.consumers) == numConsumers) 7176 7177 for _, o := range mset.consumers { 7178 state, err := o.store.State() 7179 require_NoError(t, err) 7180 if state.AckFloor.Stream != uint64(numToSend) { 7181 return fmt.Errorf("Ackfloor not correct yet") 7182 } 7183 } 7184 } 7185 return nil 7186 }) 7187 7188 for _, s := range c.servers { 7189 mset, err := s.GlobalAccount().lookupStream("TEST") 7190 require_NoError(t, err) 7191 7192 mset.mu.RLock() 7193 defer mset.mu.RUnlock() 7194 7195 state := mset.state() 7196 require_True(t, state.Msgs == 0) 7197 require_True(t, state.FirstSeq == uint64(numToSend+1)) 7198 } 7199 } 7200 7201 func TestNoRaceJetStreamClusterInterestStreamConsistencyAfterRollingRestart(t *testing.T) { 7202 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
7203 skip(t) 7204 7205 c := createJetStreamClusterExplicit(t, "R3S", 3) 7206 defer c.shutdown() 7207 7208 numStreams := 200 7209 numConsumersPer := 5 7210 numPublishers := 10 7211 7212 nc, js := jsClientConnect(t, c.randomServer()) 7213 defer nc.Close() 7214 7215 qch := make(chan bool) 7216 7217 var mm sync.Mutex 7218 ackMap := make(map[string]map[uint64][]string) 7219 7220 addAckTracking := func(seq uint64, stream, consumer string) { 7221 mm.Lock() 7222 defer mm.Unlock() 7223 sam := ackMap[stream] 7224 if sam == nil { 7225 sam = make(map[uint64][]string) 7226 ackMap[stream] = sam 7227 } 7228 sam[seq] = append(sam[seq], consumer) 7229 } 7230 7231 doPullSubscriber := func(stream, consumer, filter string) { 7232 nc, js := jsClientConnect(t, c.randomServer()) 7233 defer nc.Close() 7234 7235 var err error 7236 var sub *nats.Subscription 7237 timeout := time.Now().Add(5 * time.Second) 7238 for time.Now().Before(timeout) { 7239 sub, err = js.PullSubscribe(filter, consumer, nats.BindStream(stream), nats.ManualAck()) 7240 if err == nil { 7241 break 7242 } 7243 } 7244 if err != nil { 7245 t.Logf("Error on pull subscriber: %v", err) 7246 return 7247 } 7248 7249 for { 7250 select { 7251 case <-time.After(500 * time.Millisecond): 7252 msgs, err := sub.Fetch(100, nats.MaxWait(time.Second)) 7253 if err != nil { 7254 continue 7255 } 7256 // Shuffle 7257 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 7258 for _, m := range msgs { 7259 meta, err := m.Metadata() 7260 require_NoError(t, err) 7261 m.Ack() 7262 addAckTracking(meta.Sequence.Stream, stream, consumer) 7263 if meta.NumDelivered > 1 { 7264 t.Logf("Got a msg redelivered %d for sequence %d on %q %q\n", meta.NumDelivered, meta.Sequence.Stream, stream, consumer) 7265 } 7266 } 7267 case <-qch: 7268 nc.Flush() 7269 return 7270 } 7271 } 7272 } 7273 7274 // Setup 7275 wg := sync.WaitGroup{} 7276 for i := 0; i < numStreams; i++ { 7277 wg.Add(1) 7278 go func(stream string) { 7279 defer wg.Done() 7280 subj := fmt.Sprintf("%s.>", stream) 7281 _, err := js.AddStream(&nats.StreamConfig{ 7282 Name: stream, 7283 Subjects: []string{subj}, 7284 Replicas: 3, 7285 Retention: nats.InterestPolicy, 7286 }) 7287 require_NoError(t, err) 7288 for i := 0; i < numConsumersPer; i++ { 7289 consumer := fmt.Sprintf("C%d", i) 7290 filter := fmt.Sprintf("%s.%d", stream, i) 7291 _, err = js.AddConsumer(stream, &nats.ConsumerConfig{ 7292 Durable: consumer, 7293 FilterSubject: filter, 7294 AckPolicy: nats.AckExplicitPolicy, 7295 AckWait: 2 * time.Second, 7296 }) 7297 require_NoError(t, err) 7298 c.waitOnConsumerLeader(globalAccountName, stream, consumer) 7299 go doPullSubscriber(stream, consumer, filter) 7300 } 7301 }(fmt.Sprintf("A-%d", i)) 7302 } 7303 wg.Wait() 7304 7305 msg := make([]byte, 2*1024) // 2k payload 7306 crand.Read(msg) 7307 7308 // Controls if publishing is on or off. 7309 var pubActive atomic.Bool 7310 7311 doPublish := func() { 7312 nc, js := jsClientConnect(t, c.randomServer()) 7313 defer nc.Close() 7314 7315 for { 7316 select { 7317 case <-time.After(100 * time.Millisecond): 7318 if pubActive.Load() { 7319 for i := 0; i < numStreams; i++ { 7320 for j := 0; j < numConsumersPer; j++ { 7321 subj := fmt.Sprintf("A-%d.%d", i, j) 7322 // Don't care about errors here for this test. 7323 js.Publish(subj, msg) 7324 } 7325 } 7326 } 7327 case <-qch: 7328 return 7329 } 7330 } 7331 } 7332 7333 pubActive.Store(true) 7334 7335 for i := 0; i < numPublishers; i++ { 7336 go doPublish() 7337 } 7338 7339 // Let run for a bit. 
7340 time.Sleep(20 * time.Second) 7341 7342 // Do a rolling restart. 7343 for _, s := range c.servers { 7344 t.Logf("Shutdown %v\n", s) 7345 s.Shutdown() 7346 s.WaitForShutdown() 7347 time.Sleep(20 * time.Second) 7348 t.Logf("Restarting %v\n", s) 7349 s = c.restartServer(s) 7350 c.waitOnServerHealthz(s) 7351 } 7352 7353 // Let run for a bit longer. 7354 time.Sleep(10 * time.Second) 7355 7356 // Stop pubs. 7357 pubActive.Store(false) 7358 7359 // Let settle. 7360 time.Sleep(10 * time.Second) 7361 close(qch) 7362 time.Sleep(20 * time.Second) 7363 7364 nc, js = jsClientConnect(t, c.randomServer()) 7365 defer nc.Close() 7366 7367 minAckFloor := func(stream string) (uint64, string) { 7368 var maf uint64 = math.MaxUint64 7369 var consumer string 7370 for i := 0; i < numConsumersPer; i++ { 7371 cname := fmt.Sprintf("C%d", i) 7372 ci, err := js.ConsumerInfo(stream, cname) 7373 require_NoError(t, err) 7374 if ci.AckFloor.Stream < maf { 7375 maf = ci.AckFloor.Stream 7376 consumer = cname 7377 } 7378 } 7379 return maf, consumer 7380 } 7381 7382 checkStreamAcks := func(stream string) { 7383 mm.Lock() 7384 defer mm.Unlock() 7385 if sam := ackMap[stream]; sam != nil { 7386 for seq := 1; ; seq++ { 7387 acks := sam[uint64(seq)] 7388 if acks == nil { 7389 if sam[uint64(seq+1)] != nil { 7390 t.Logf("Missing an ack on stream %q for sequence %d\n", stream, seq) 7391 } else { 7392 break 7393 } 7394 } 7395 if len(acks) > 1 { 7396 t.Logf("Multiple acks for %d which is not expected: %+v", seq, acks) 7397 } 7398 } 7399 } 7400 } 7401 7402 // Now check all streams such that their first sequence is equal to the minimum of all consumers. 7403 for i := 0; i < numStreams; i++ { 7404 stream := fmt.Sprintf("A-%d", i) 7405 si, err := js.StreamInfo(stream) 7406 require_NoError(t, err) 7407 7408 if maf, consumer := minAckFloor(stream); maf > si.State.FirstSeq { 7409 t.Logf("\nBAD STATE DETECTED FOR %q, CHECKING OTHER SERVERS! ACK %d vs %+v LEADER %v, CL FOR %q %v\n", 7410 stream, maf, si.State, c.streamLeader(globalAccountName, stream), consumer, c.consumerLeader(globalAccountName, stream, consumer)) 7411 7412 t.Logf("TEST ACKS %+v\n", ackMap) 7413 7414 checkStreamAcks(stream) 7415 7416 for _, s := range c.servers { 7417 mset, err := s.GlobalAccount().lookupStream(stream) 7418 require_NoError(t, err) 7419 state := mset.state() 7420 t.Logf("Server %v Stream STATE %+v\n", s, state) 7421 7422 var smv StoreMsg 7423 if sm, err := mset.store.LoadMsg(state.FirstSeq, &smv); err == nil { 7424 t.Logf("Subject for msg %d is %q", state.FirstSeq, sm.subj) 7425 } else { 7426 t.Logf("Could not retrieve msg for %d: %v", state.FirstSeq, err) 7427 } 7428 7429 if len(mset.preAcks) > 0 { 7430 t.Logf("%v preAcks %+v\n", s, mset.preAcks) 7431 } 7432 7433 for _, o := range mset.consumers { 7434 ostate, err := o.store.State() 7435 require_NoError(t, err) 7436 t.Logf("Consumer STATE for %q is %+v\n", o.name, ostate) 7437 } 7438 } 7439 t.Fatalf("BAD STATE: ACKFLOOR > FIRST %d vs %d\n", maf, si.State.FirstSeq) 7440 } 7441 } 7442 } 7443 7444 func TestNoRaceFileStoreNumPending(t *testing.T) { 7445 // No need for all permutations here. 7446 storeDir := t.TempDir() 7447 fcfg := FileStoreConfig{ 7448 StoreDir: storeDir, 7449 BlockSize: 2 * 1024, // Create many blocks on purpose. 
7450 } 7451 fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*.*.*.*"}, Storage: FileStorage}) 7452 require_NoError(t, err) 7453 defer fs.Stop() 7454 7455 tokens := []string{"foo", "bar", "baz"} 7456 genSubj := func() string { 7457 return fmt.Sprintf("%s.%s.%s.%s", 7458 tokens[rand.Intn(len(tokens))], 7459 tokens[rand.Intn(len(tokens))], 7460 tokens[rand.Intn(len(tokens))], 7461 tokens[rand.Intn(len(tokens))], 7462 ) 7463 } 7464 7465 for i := 0; i < 50_000; i++ { 7466 subj := genSubj() 7467 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 7468 require_NoError(t, err) 7469 } 7470 7471 state := fs.State() 7472 7473 // Scan one by one for sanity check against other calculations. 7474 sanityCheck := func(sseq uint64, filter string) SimpleState { 7475 t.Helper() 7476 var ss SimpleState 7477 var smv StoreMsg 7478 // For here we know 0 is invalid, set to 1. 7479 if sseq == 0 { 7480 sseq = 1 7481 } 7482 for seq := sseq; seq <= state.LastSeq; seq++ { 7483 sm, err := fs.LoadMsg(seq, &smv) 7484 if err != nil { 7485 t.Logf("Encountered error %v loading sequence: %d", err, seq) 7486 continue 7487 } 7488 if subjectIsSubsetMatch(sm.subj, filter) { 7489 ss.Msgs++ 7490 ss.Last = seq 7491 if ss.First == 0 || seq < ss.First { 7492 ss.First = seq 7493 } 7494 } 7495 } 7496 return ss 7497 } 7498 7499 check := func(sseq uint64, filter string) { 7500 t.Helper() 7501 np, lvs := fs.NumPending(sseq, filter, false) 7502 ss := fs.FilteredState(sseq, filter) 7503 sss := sanityCheck(sseq, filter) 7504 if lvs != state.LastSeq { 7505 t.Fatalf("Expected NumPending to return valid through last of %d but got %d", state.LastSeq, lvs) 7506 } 7507 if ss.Msgs != np { 7508 t.Fatalf("NumPending of %d did not match ss.Msgs of %d", np, ss.Msgs) 7509 } 7510 if ss != sss { 7511 t.Fatalf("Failed sanity check, expected %+v got %+v", sss, ss) 7512 } 7513 } 7514 7515 sanityCheckLastOnly := func(sseq uint64, filter string) SimpleState { 7516 t.Helper() 7517 var ss SimpleState 7518 var smv StoreMsg 7519 // For here we know 0 is invalid, set to 1. 
7520 if sseq == 0 { 7521 sseq = 1 7522 } 7523 seen := make(map[string]bool) 7524 for seq := state.LastSeq; seq >= sseq; seq-- { 7525 sm, err := fs.LoadMsg(seq, &smv) 7526 if err != nil { 7527 t.Logf("Encountered error %v loading sequence: %d", err, seq) 7528 continue 7529 } 7530 if !seen[sm.subj] && subjectIsSubsetMatch(sm.subj, filter) { 7531 ss.Msgs++ 7532 if ss.Last == 0 { 7533 ss.Last = seq 7534 } 7535 if ss.First == 0 || seq < ss.First { 7536 ss.First = seq 7537 } 7538 seen[sm.subj] = true 7539 } 7540 } 7541 return ss 7542 } 7543 7544 checkLastOnly := func(sseq uint64, filter string) { 7545 t.Helper() 7546 np, lvs := fs.NumPending(sseq, filter, true) 7547 ss := sanityCheckLastOnly(sseq, filter) 7548 if lvs != state.LastSeq { 7549 t.Fatalf("Expected NumPending to return valid through last of %d but got %d", state.LastSeq, lvs) 7550 } 7551 if ss.Msgs != np { 7552 t.Fatalf("NumPending of %d did not match ss.Msgs of %d", np, ss.Msgs) 7553 } 7554 } 7555 7556 startSeqs := []uint64{0, 1, 2, 200, 444, 555, 2222, 8888, 12_345, 28_222, 33_456, 44_400, 49_999} 7557 checkSubs := []string{"foo.>", "*.bar.>", "foo.bar.*.baz", "*.bar.>", "*.foo.bar.*", "foo.foo.bar.baz"} 7558 7559 for _, filter := range checkSubs { 7560 for _, start := range startSeqs { 7561 check(start, filter) 7562 checkLastOnly(start, filter) 7563 } 7564 } 7565 } 7566 7567 func TestNoRaceJetStreamClusterUnbalancedInterestMultipleConsumers(t *testing.T) { 7568 c, np := createStretchUnbalancedCluster(t) 7569 defer c.shutdown() 7570 defer np.stop() 7571 7572 nc, js := jsClientConnect(t, c.randomServer()) 7573 defer nc.Close() 7574 7575 // Now create the stream. 7576 _, err := js.AddStream(&nats.StreamConfig{ 7577 Name: "EVENTS", 7578 Subjects: []string{"EV.>"}, 7579 Replicas: 3, 7580 Retention: nats.InterestPolicy, 7581 }) 7582 require_NoError(t, err) 7583 7584 // Make sure it's leader is on S2. 7585 sl := c.servers[1] 7586 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7587 c.waitOnStreamLeader(globalAccountName, "EVENTS") 7588 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl { 7589 s.JetStreamStepdownStream(globalAccountName, "EVENTS") 7590 return fmt.Errorf("Server %s is not stream leader yet", sl) 7591 } 7592 return nil 7593 }) 7594 7595 // Create a fast ack consumer. 7596 _, err = js.Subscribe("EV.NEW", func(m *nats.Msg) { 7597 m.Ack() 7598 }, nats.Durable("C"), nats.ManualAck()) 7599 require_NoError(t, err) 7600 7601 // Make sure the consumer leader is on S3. 7602 cl := c.servers[2] 7603 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7604 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 7605 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 7606 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 7607 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7608 } 7609 return nil 7610 }) 7611 7612 // Connect a client directly to the stream leader. 7613 nc, js = jsClientConnect(t, sl) 7614 defer nc.Close() 7615 7616 // Now create a pull subscriber. 7617 sub, err := js.PullSubscribe("EV.NEW", "D", nats.ManualAck()) 7618 require_NoError(t, err) 7619 7620 // Make sure this consumer leader is on S1. 
7621 cl = c.servers[0]
7622 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7623 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "D")
7624 if s := c.consumerLeader(globalAccountName, "EVENTS", "D"); s != cl {
7625 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "D")
7626 return fmt.Errorf("Server %s is not consumer leader yet", cl)
7627 }
7628 return nil
7629 })
7630
7631 numToSend := 1000
7632 for i := 0; i < numToSend; i++ {
7633 _, err := js.PublishAsync("EV.NEW", nil)
7634 require_NoError(t, err)
7635 }
7636 select {
7637 case <-js.PublishAsyncComplete():
7638 case <-time.After(20 * time.Second):
7639 t.Fatalf("Did not receive completion signal")
7640 }
7641
7642 // Now make sure we can pull messages since we have not acked.
7643 // The bug is that the acks arrive on S1 faster than the messages, but we want to
7644 // make sure we do not remove prematurely.
7645 msgs, err := sub.Fetch(100, nats.MaxWait(time.Second))
7646 require_NoError(t, err)
7647 require_True(t, len(msgs) == 100)
7648 for _, m := range msgs {
7649 m.AckSync()
7650 }
7651
7652 ci, err := js.ConsumerInfo("EVENTS", "D")
7653 require_NoError(t, err)
7654 require_True(t, ci.NumPending == uint64(numToSend-100))
7655 require_True(t, ci.NumAckPending == 0)
7656 require_True(t, ci.Delivered.Stream == 100)
7657 require_True(t, ci.AckFloor.Stream == 100)
7658
7659 // Check stream state on all servers.
7660 for _, s := range c.servers {
7661 mset, err := s.GlobalAccount().lookupStream("EVENTS")
7662 require_NoError(t, err)
7663 state := mset.state()
7664 require_True(t, state.Msgs == 900)
7665 require_True(t, state.FirstSeq == 101)
7666 require_True(t, state.LastSeq == 1000)
7667 require_True(t, state.Consumers == 2)
7668 }
7669
7670 msgs, err = sub.Fetch(900, nats.MaxWait(time.Second))
7671 require_NoError(t, err)
7672 require_True(t, len(msgs) == 900)
7673 for _, m := range msgs {
7674 m.AckSync()
7675 }
7676
7677 // Let acks propagate.
7678 time.Sleep(250 * time.Millisecond)
7679
7680 // Check final stream state on all servers.
7681 for _, s := range c.servers {
7682 mset, err := s.GlobalAccount().lookupStream("EVENTS")
7683 require_NoError(t, err)
7684 state := mset.state()
7685 require_True(t, state.Msgs == 0)
7686 require_True(t, state.FirstSeq == 1001)
7687 require_True(t, state.LastSeq == 1000)
7688 require_True(t, state.Consumers == 2)
7689 // Now check preAcks
7690 mset.mu.RLock()
7691 numPreAcks := len(mset.preAcks)
7692 mset.mu.RUnlock()
7693 require_True(t, numPreAcks == 0)
7694 }
7695 }
7696
7697 func TestNoRaceJetStreamClusterUnbalancedInterestMultipleFilteredConsumers(t *testing.T) {
7698 c, np := createStretchUnbalancedCluster(t)
7699 defer c.shutdown()
7700 defer np.stop()
7701
7702 nc, js := jsClientConnect(t, c.randomServer())
7703 defer nc.Close()
7704
7705 // Now create the stream.
7706 _, err := js.AddStream(&nats.StreamConfig{
7707 Name: "EVENTS",
7708 Subjects: []string{"EV.>"},
7709 Replicas: 3,
7710 Retention: nats.InterestPolicy,
7711 })
7712 require_NoError(t, err)
7713
7714 // Make sure its leader is on S2.
7715 sl := c.servers[1]
7716 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7717 c.waitOnStreamLeader(globalAccountName, "EVENTS")
7718 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl {
7719 s.JetStreamStepdownStream(globalAccountName, "EVENTS")
7720 return fmt.Errorf("Server %s is not stream leader yet", sl)
7721 }
7722 return nil
7723 })
7724
7725 // Create a fast ack consumer.
7726 _, err = js.Subscribe("EV.NEW", func(m *nats.Msg) { 7727 m.Ack() 7728 }, nats.Durable("C"), nats.ManualAck()) 7729 require_NoError(t, err) 7730 7731 // Make sure the consumer leader is on S3. 7732 cl := c.servers[2] 7733 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7734 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 7735 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 7736 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 7737 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7738 } 7739 return nil 7740 }) 7741 7742 // Connect a client directly to the stream leader. 7743 nc, js = jsClientConnect(t, sl) 7744 defer nc.Close() 7745 7746 // Now create another fast ack consumer. 7747 _, err = js.Subscribe("EV.UPDATED", func(m *nats.Msg) { 7748 m.Ack() 7749 }, nats.Durable("D"), nats.ManualAck()) 7750 require_NoError(t, err) 7751 7752 // Make sure this consumer leader is on S1. 7753 cl = c.servers[0] 7754 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7755 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "D") 7756 if s := c.consumerLeader(globalAccountName, "EVENTS", "D"); s != cl { 7757 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "D") 7758 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7759 } 7760 return nil 7761 }) 7762 7763 numToSend := 500 7764 for i := 0; i < numToSend; i++ { 7765 _, err := js.PublishAsync("EV.NEW", nil) 7766 require_NoError(t, err) 7767 _, err = js.PublishAsync("EV.UPDATED", nil) 7768 require_NoError(t, err) 7769 } 7770 select { 7771 case <-js.PublishAsyncComplete(): 7772 case <-time.After(20 * time.Second): 7773 t.Fatalf("Did not receive completion signal") 7774 } 7775 7776 // Let acks propagate. 7777 time.Sleep(250 * time.Millisecond) 7778 7779 ci, err := js.ConsumerInfo("EVENTS", "D") 7780 require_NoError(t, err) 7781 require_True(t, ci.NumPending == 0) 7782 require_True(t, ci.NumAckPending == 0) 7783 require_True(t, ci.Delivered.Consumer == 500) 7784 require_True(t, ci.Delivered.Stream == 1000) 7785 require_True(t, ci.AckFloor.Consumer == 500) 7786 require_True(t, ci.AckFloor.Stream == 1000) 7787 7788 // Check final stream state on all servers. 7789 for _, s := range c.servers { 7790 mset, err := s.GlobalAccount().lookupStream("EVENTS") 7791 require_NoError(t, err) 7792 state := mset.state() 7793 require_True(t, state.Msgs == 0) 7794 require_True(t, state.FirstSeq == 1001) 7795 require_True(t, state.LastSeq == 1000) 7796 require_True(t, state.Consumers == 2) 7797 // Now check preAcks 7798 mset.mu.RLock() 7799 numPreAcks := len(mset.preAcks) 7800 mset.mu.RUnlock() 7801 require_True(t, numPreAcks == 0) 7802 } 7803 } 7804 7805 func TestNoRaceParallelStreamAndConsumerCreation(t *testing.T) { 7806 s := RunBasicJetStreamServer(t) 7807 defer s.Shutdown() 7808 7809 // stream config. 7810 scfg := &StreamConfig{ 7811 Name: "TEST", 7812 Subjects: []string{"foo", "bar"}, 7813 MaxMsgs: 10, 7814 Storage: FileStorage, 7815 Replicas: 1, 7816 } 7817 7818 // Will do these direct against the low level API to really make 7819 // sure parallel creation ok. 7820 np := 1000 7821 startCh := make(chan bool) 7822 errCh := make(chan error, np) 7823 wg := sync.WaitGroup{} 7824 wg.Add(np) 7825 7826 var streams sync.Map 7827 7828 for i := 0; i < np; i++ { 7829 go func() { 7830 defer wg.Done() 7831 7832 // Make them all fire at once. 
7833 <-startCh 7834 7835 if mset, err := s.GlobalAccount().addStream(scfg); err != nil { 7836 t.Logf("Stream create got an error: %v", err) 7837 errCh <- err 7838 } else { 7839 streams.Store(mset, true) 7840 } 7841 }() 7842 } 7843 time.Sleep(100 * time.Millisecond) 7844 close(startCh) 7845 wg.Wait() 7846 7847 // Check for no errors. 7848 if len(errCh) > 0 { 7849 t.Fatalf("Expected no errors, got %d", len(errCh)) 7850 } 7851 7852 // Now make sure we really only created one stream. 7853 var numStreams int 7854 streams.Range(func(k, v any) bool { 7855 numStreams++ 7856 return true 7857 }) 7858 if numStreams > 1 { 7859 t.Fatalf("Expected only one stream to be really created, got %d out of %d attempts", numStreams, np) 7860 } 7861 7862 // Also make sure we cleanup the inflight entries for streams. 7863 gacc := s.GlobalAccount() 7864 _, jsa, err := gacc.checkForJetStream() 7865 require_NoError(t, err) 7866 var numEntries int 7867 jsa.inflight.Range(func(k, v any) bool { 7868 numEntries++ 7869 return true 7870 }) 7871 if numEntries > 0 { 7872 t.Fatalf("Expected no inflight entries to be left over, got %d", numEntries) 7873 } 7874 7875 // Now do consumers. 7876 mset, err := gacc.lookupStream("TEST") 7877 require_NoError(t, err) 7878 7879 cfg := &ConsumerConfig{ 7880 DeliverSubject: "to", 7881 Name: "DLC", 7882 AckPolicy: AckExplicit, 7883 } 7884 7885 startCh = make(chan bool) 7886 errCh = make(chan error, np) 7887 wg.Add(np) 7888 7889 var consumers sync.Map 7890 7891 for i := 0; i < np; i++ { 7892 go func() { 7893 defer wg.Done() 7894 7895 // Make them all fire at once. 7896 <-startCh 7897 7898 if _, err = mset.addConsumer(cfg); err != nil { 7899 t.Logf("Consumer create got an error: %v", err) 7900 errCh <- err 7901 } else { 7902 consumers.Store(mset, true) 7903 } 7904 }() 7905 } 7906 time.Sleep(100 * time.Millisecond) 7907 close(startCh) 7908 wg.Wait() 7909 7910 // Check for no errors. 7911 if len(errCh) > 0 { 7912 t.Fatalf("Expected no errors, got %d", len(errCh)) 7913 } 7914 7915 // Now make sure we really only created one stream. 
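// Both the stream map earlier and the consumer map counted just below are
// measured by walking them with Range, since sync.Map has no length method.
// A tiny helper sketch (the name is illustrative):
func syncMapLen(m *sync.Map) int {
	n := 0
	m.Range(func(_, _ any) bool {
		n++
		return true
	})
	return n
}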
7916 var numConsumers int 7917 consumers.Range(func(k, v any) bool { 7918 numConsumers++ 7919 return true 7920 }) 7921 if numConsumers > 1 { 7922 t.Fatalf("Expected only one consumer to be really created, got %d out of %d attempts", numConsumers, np) 7923 } 7924 } 7925 7926 func TestNoRaceRoutePool(t *testing.T) { 7927 var dur1 time.Duration 7928 var dur2 time.Duration 7929 7930 total := 1_000_000 7931 7932 for _, test := range []struct { 7933 name string 7934 poolSize int 7935 }{ 7936 {"no pooling", 0}, 7937 {"pooling", 5}, 7938 } { 7939 t.Run(test.name, func(t *testing.T) { 7940 tmpl := ` 7941 port: -1 7942 accounts { 7943 A { users: [{user: "A", password: "A"}] } 7944 B { users: [{user: "B", password: "B"}] } 7945 C { users: [{user: "C", password: "C"}] } 7946 D { users: [{user: "D", password: "D"}] } 7947 E { users: [{user: "E", password: "E"}] } 7948 } 7949 cluster { 7950 port: -1 7951 name: "local" 7952 %s 7953 pool_size: %d 7954 } 7955 ` 7956 conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_, test.poolSize))) 7957 s1, o1 := RunServerWithConfig(conf1) 7958 defer s1.Shutdown() 7959 7960 conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, 7961 fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), 7962 test.poolSize))) 7963 s2, _ := RunServerWithConfig(conf2) 7964 defer s2.Shutdown() 7965 7966 checkClusterFormed(t, s1, s2) 7967 7968 wg := sync.WaitGroup{} 7969 wg.Add(5) 7970 7971 sendAndRecv := func(acc string) (*nats.Conn, *nats.Conn) { 7972 t.Helper() 7973 7974 s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(acc, acc)) 7975 count := 0 7976 natsSub(t, s2nc, "foo", func(_ *nats.Msg) { 7977 if count++; count == total { 7978 wg.Done() 7979 } 7980 }) 7981 natsFlush(t, s2nc) 7982 7983 s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(acc, acc)) 7984 7985 checkSubInterest(t, s1, acc, "foo", time.Second) 7986 return s2nc, s1nc 7987 } 7988 7989 var rcv = [5]*nats.Conn{} 7990 var snd = [5]*nats.Conn{} 7991 accs := []string{"A", "B", "C", "D", "E"} 7992 7993 for i := 0; i < 5; i++ { 7994 rcv[i], snd[i] = sendAndRecv(accs[i]) 7995 defer rcv[i].Close() 7996 defer snd[i].Close() 7997 } 7998 7999 payload := []byte("some message") 8000 start := time.Now() 8001 for i := 0; i < 5; i++ { 8002 go func(idx int) { 8003 for i := 0; i < total; i++ { 8004 snd[idx].Publish("foo", payload) 8005 } 8006 }(i) 8007 } 8008 8009 wg.Wait() 8010 dur := time.Since(start) 8011 if test.poolSize == 0 { 8012 dur1 = dur 8013 } else { 8014 dur2 = dur 8015 } 8016 }) 8017 } 8018 perf1 := float64(total*5) / dur1.Seconds() 8019 t.Logf("No pooling: %.0f msgs/sec", perf1) 8020 perf2 := float64(total*5) / dur2.Seconds() 8021 t.Logf("Pooling : %.0f msgs/sec", perf2) 8022 t.Logf("Gain : %.2fx", perf2/perf1) 8023 } 8024 8025 func testNoRaceRoutePerAccount(t *testing.T, useWildCard bool) { 8026 var dur1 time.Duration 8027 var dur2 time.Duration 8028 8029 accounts := make([]string, 5) 8030 for i := 0; i < 5; i++ { 8031 akp, _ := nkeys.CreateAccount() 8032 pub, _ := akp.PublicKey() 8033 accounts[i] = pub 8034 } 8035 routeAccs := fmt.Sprintf("accounts: [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"]", 8036 accounts[0], accounts[1], accounts[2], accounts[3], accounts[4]) 8037 8038 total := 1_000_000 8039 8040 for _, test := range []struct { 8041 name string 8042 dedicated bool 8043 }{ 8044 {"route for all accounts", false}, 8045 {"route per account", true}, 8046 } { 8047 t.Run(test.name, func(t *testing.T) { 8048 tmpl := ` 8049 server_name: "%s" 8050 port: -1 8051 accounts { 8052 %s { users: [{user: "0", password: 
"0"}] } 8053 %s { users: [{user: "1", password: "1"}] } 8054 %s { users: [{user: "2", password: "2"}] } 8055 %s { users: [{user: "3", password: "3"}] } 8056 %s { users: [{user: "4", password: "4"}] } 8057 } 8058 cluster { 8059 port: -1 8060 name: "local" 8061 %s 8062 %s 8063 } 8064 ` 8065 var racc string 8066 if test.dedicated { 8067 racc = routeAccs 8068 } else { 8069 racc = _EMPTY_ 8070 } 8071 conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "A", 8072 accounts[0], accounts[1], accounts[2], accounts[3], 8073 accounts[4], _EMPTY_, racc))) 8074 s1, o1 := RunServerWithConfig(conf1) 8075 defer s1.Shutdown() 8076 8077 conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "B", 8078 accounts[0], accounts[1], accounts[2], accounts[3], accounts[4], 8079 fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), 8080 racc))) 8081 s2, _ := RunServerWithConfig(conf2) 8082 defer s2.Shutdown() 8083 8084 checkClusterFormed(t, s1, s2) 8085 8086 wg := sync.WaitGroup{} 8087 wg.Add(5) 8088 8089 sendAndRecv := func(acc string, user string) (*nats.Conn, *nats.Conn) { 8090 t.Helper() 8091 8092 s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(user, user)) 8093 count := 0 8094 var subj string 8095 var checkSubj string 8096 if useWildCard { 8097 subj, checkSubj = "foo.*", "foo.0" 8098 } else { 8099 subj, checkSubj = "foo", "foo" 8100 } 8101 natsSub(t, s2nc, subj, func(_ *nats.Msg) { 8102 if count++; count == total { 8103 wg.Done() 8104 } 8105 }) 8106 natsFlush(t, s2nc) 8107 8108 s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(user, user)) 8109 8110 checkSubInterest(t, s1, acc, checkSubj, time.Second) 8111 return s2nc, s1nc 8112 } 8113 8114 var rcv = [5]*nats.Conn{} 8115 var snd = [5]*nats.Conn{} 8116 users := []string{"0", "1", "2", "3", "4"} 8117 8118 for i := 0; i < 5; i++ { 8119 rcv[i], snd[i] = sendAndRecv(accounts[i], users[i]) 8120 defer rcv[i].Close() 8121 defer snd[i].Close() 8122 } 8123 8124 payload := []byte("some message") 8125 start := time.Now() 8126 for i := 0; i < 5; i++ { 8127 go func(idx int) { 8128 for i := 0; i < total; i++ { 8129 var subj string 8130 if useWildCard { 8131 subj = fmt.Sprintf("foo.%d", i) 8132 } else { 8133 subj = "foo" 8134 } 8135 snd[idx].Publish(subj, payload) 8136 } 8137 }(i) 8138 } 8139 8140 wg.Wait() 8141 dur := time.Since(start) 8142 if !test.dedicated { 8143 dur1 = dur 8144 } else { 8145 dur2 = dur 8146 } 8147 }) 8148 } 8149 perf1 := float64(total*5) / dur1.Seconds() 8150 t.Logf("Route for all accounts: %.0f msgs/sec", perf1) 8151 perf2 := float64(total*5) / dur2.Seconds() 8152 t.Logf("Route per account : %.0f msgs/sec", perf2) 8153 t.Logf("Gain : %.2fx", perf2/perf1) 8154 } 8155 8156 func TestNoRaceRoutePerAccount(t *testing.T) { 8157 testNoRaceRoutePerAccount(t, false) 8158 } 8159 8160 func TestNoRaceRoutePerAccountSubWithWildcard(t *testing.T) { 8161 testNoRaceRoutePerAccount(t, true) 8162 } 8163 8164 // This test, which checks that messages are not duplicated when pooling or 8165 // per-account routes are reloaded, would cause a DATA RACE that is not 8166 // specific to the changes for pooling/per_account. For this reason, this 8167 // test is located in the norace_test.go file. 
8168 func TestNoRaceRoutePoolAndPerAccountConfigReload(t *testing.T) { 8169 for _, test := range []struct { 8170 name string 8171 poolSizeBefore string 8172 poolSizeAfter string 8173 accountsBefore string 8174 accountsAfter string 8175 }{ 8176 {"from no pool to pool", _EMPTY_, "pool_size: 2", _EMPTY_, _EMPTY_}, 8177 {"increase pool size", "pool_size: 2", "pool_size: 5", _EMPTY_, _EMPTY_}, 8178 {"decrease pool size", "pool_size: 5", "pool_size: 2", _EMPTY_, _EMPTY_}, 8179 {"from pool to no pool", "pool_size: 5", _EMPTY_, _EMPTY_, _EMPTY_}, 8180 {"from no account to account", _EMPTY_, _EMPTY_, _EMPTY_, "accounts: [\"A\"]"}, 8181 {"add account", _EMPTY_, _EMPTY_, "accounts: [\"B\"]", "accounts: [\"A\",\"B\"]"}, 8182 {"remove account", _EMPTY_, _EMPTY_, "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, 8183 {"from account to no account", _EMPTY_, _EMPTY_, "accounts: [\"A\"]", _EMPTY_}, 8184 {"increase pool size and add account", "pool_size: 2", "pool_size: 3", "accounts: [\"B\"]", "accounts: [\"B\",\"A\"]"}, 8185 {"decrease pool size and remove account", "pool_size: 3", "pool_size: 2", "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, 8186 } { 8187 t.Run(test.name, func(t *testing.T) { 8188 tmplA := ` 8189 port: -1 8190 server_name: "A" 8191 accounts { 8192 A { users: [{user: a, password: pwd}] } 8193 B { users: [{user: b, password: pwd}] } 8194 } 8195 cluster: { 8196 port: -1 8197 name: "local" 8198 %s 8199 %s 8200 } 8201 ` 8202 confA := createConfFile(t, []byte(fmt.Sprintf(tmplA, test.poolSizeBefore, test.accountsBefore))) 8203 srva, optsA := RunServerWithConfig(confA) 8204 defer srva.Shutdown() 8205 8206 tmplB := ` 8207 port: -1 8208 server_name: "B" 8209 accounts { 8210 A { users: [{user: a, password: pwd}] } 8211 B { users: [{user: b, password: pwd}] } 8212 } 8213 cluster: { 8214 port: -1 8215 name: "local" 8216 %s 8217 %s 8218 routes: ["nats://127.0.0.1:%d"] 8219 } 8220 ` 8221 confB := createConfFile(t, []byte(fmt.Sprintf(tmplB, test.poolSizeBefore, test.accountsBefore, optsA.Cluster.Port))) 8222 srvb, _ := RunServerWithConfig(confB) 8223 defer srvb.Shutdown() 8224 8225 checkClusterFormed(t, srva, srvb) 8226 8227 ncA := natsConnect(t, srva.ClientURL(), nats.UserInfo("a", "pwd")) 8228 defer ncA.Close() 8229 8230 sub := natsSubSync(t, ncA, "foo") 8231 sub.SetPendingLimits(-1, -1) 8232 checkSubInterest(t, srvb, "A", "foo", time.Second) 8233 8234 ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("a", "pwd")) 8235 defer ncB.Close() 8236 8237 wg := sync.WaitGroup{} 8238 wg.Add(1) 8239 ch := make(chan struct{}) 8240 go func() { 8241 defer wg.Done() 8242 8243 for i := 0; ; i++ { 8244 ncB.Publish("foo", []byte(fmt.Sprintf("%d", i))) 8245 select { 8246 case <-ch: 8247 return 8248 default: 8249 } 8250 if i%300 == 0 { 8251 time.Sleep(time.Duration(rand.Intn(5)) * time.Millisecond) 8252 } 8253 } 8254 }() 8255 8256 var l *captureErrorLogger 8257 if test.accountsBefore != _EMPTY_ && test.accountsAfter == _EMPTY_ { 8258 l = &captureErrorLogger{errCh: make(chan string, 100)} 8259 srva.SetLogger(l, false, false) 8260 } 8261 8262 time.Sleep(250 * time.Millisecond) 8263 reloadUpdateConfig(t, srva, confA, fmt.Sprintf(tmplA, test.poolSizeAfter, test.accountsAfter)) 8264 time.Sleep(125 * time.Millisecond) 8265 reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(tmplB, test.poolSizeAfter, test.accountsAfter, optsA.Cluster.Port)) 8266 8267 checkClusterFormed(t, srva, srvb) 8268 checkSubInterest(t, srvb, "A", "foo", time.Second) 8269 8270 if l != nil { 8271 // Errors regarding "No route for account" should stop 8272 
var ok bool 8273 for numErrs := 0; !ok && numErrs < 10; { 8274 select { 8275 case e := <-l.errCh: 8276 if strings.Contains(e, "No route for account") { 8277 numErrs++ 8278 } 8279 case <-time.After(DEFAULT_ROUTE_RECONNECT + 250*time.Millisecond): 8280 ok = true 8281 } 8282 } 8283 if !ok { 8284 t.Fatalf("Still report of no route for account") 8285 } 8286 } 8287 8288 close(ch) 8289 wg.Wait() 8290 8291 for prev := -1; ; { 8292 msg, err := sub.NextMsg(50 * time.Millisecond) 8293 if err != nil { 8294 break 8295 } 8296 cur, _ := strconv.Atoi(string(msg.Data)) 8297 if cur <= prev { 8298 t.Fatalf("Previous was %d, got %d", prev, cur) 8299 } 8300 prev = cur 8301 } 8302 }) 8303 } 8304 } 8305 8306 // This test ensures that outbound queues don't cause a run on 8307 // memory when sending something to lots of clients. 8308 func TestNoRaceClientOutboundQueueMemory(t *testing.T) { 8309 opts := DefaultOptions() 8310 s := RunServer(opts) 8311 defer s.Shutdown() 8312 8313 var before runtime.MemStats 8314 var after runtime.MemStats 8315 8316 var err error 8317 clients := make([]*nats.Conn, 50000) 8318 wait := &sync.WaitGroup{} 8319 wait.Add(len(clients)) 8320 8321 for i := 0; i < len(clients); i++ { 8322 clients[i], err = nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port), nats.InProcessServer(s)) 8323 if err != nil { 8324 t.Fatalf("Error on connect: %v", err) 8325 } 8326 defer clients[i].Close() 8327 8328 clients[i].Subscribe("test", func(m *nats.Msg) { 8329 wait.Done() 8330 }) 8331 } 8332 8333 runtime.GC() 8334 runtime.ReadMemStats(&before) 8335 8336 nc, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port), nats.InProcessServer(s)) 8337 if err != nil { 8338 t.Fatalf("Error on connect: %v", err) 8339 } 8340 defer nc.Close() 8341 8342 var m [48000]byte 8343 if err = nc.Publish("test", m[:]); err != nil { 8344 t.Fatal(err) 8345 } 8346 8347 wait.Wait() 8348 8349 runtime.GC() 8350 runtime.ReadMemStats(&after) 8351 8352 hb, ha := float64(before.HeapAlloc), float64(after.HeapAlloc) 8353 ms := float64(len(m)) 8354 diff := float64(ha) - float64(hb) 8355 inc := (diff / float64(hb)) * 100 8356 8357 if inc > 10 { 8358 t.Logf("Message size: %.1fKB\n", ms/1024) 8359 t.Logf("Subscribed clients: %d\n", len(clients)) 8360 t.Logf("Heap allocs before: %.1fMB\n", hb/1024/1024) 8361 t.Logf("Heap allocs after: %.1fMB\n", ha/1024/1024) 8362 t.Logf("Heap allocs delta: %.1f%%\n", inc) 8363 8364 t.Fatalf("memory increase was %.1f%% (should be <= 10%%)", inc) 8365 } 8366 } 8367 8368 func TestNoRaceJetStreamClusterLeafnodeConnectPerf(t *testing.T) { 8369 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
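// The outbound-queue test above measures heap growth by snapshotting
// runtime.MemStats around the broadcast and comparing HeapAlloc as a
// percentage. A condensed sketch of that measurement, assuming only the
// standard library; doWork stands in for the publish-and-wait step:
func heapGrowthPercent(doWork func()) float64 {
	var before, after runtime.MemStats
	runtime.GC()
	runtime.ReadMemStats(&before)
	doWork()
	runtime.GC()
	runtime.ReadMemStats(&after)
	diff := float64(after.HeapAlloc) - float64(before.HeapAlloc)
	return diff / float64(before.HeapAlloc) * 100
}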
8370 skip(t) 8371 8372 tmpl := strings.Replace(jsClusterAccountsTempl, "store_dir:", "domain: cloud, store_dir:", 1) 8373 c := createJetStreamCluster(t, tmpl, "CLOUD", _EMPTY_, 3, 18033, true) 8374 defer c.shutdown() 8375 8376 nc, js := jsClientConnect(t, c.randomServer()) 8377 defer nc.Close() 8378 8379 _, err := js.AddStream(&nats.StreamConfig{ 8380 Name: "STATE", 8381 Subjects: []string{"STATE.GLOBAL.CELL1.*.>"}, 8382 Replicas: 3, 8383 }) 8384 require_NoError(t, err) 8385 8386 tmpl = strings.Replace(jsClusterTemplWithSingleFleetLeafNode, "store_dir:", "domain: vehicle, store_dir:", 1) 8387 8388 var vinSerial int 8389 genVIN := func() string { 8390 vinSerial++ 8391 return fmt.Sprintf("7PDSGAALXNN%06d", vinSerial) 8392 } 8393 8394 numVehicles := 500 8395 for i := 0; i < numVehicles; i++ { 8396 start := time.Now() 8397 vin := genVIN() 8398 ln := c.createLeafNodeWithTemplateNoSystemWithProto(vin, tmpl, "ws") 8399 nc, js := jsClientConnect(t, ln) 8400 _, err := js.AddStream(&nats.StreamConfig{ 8401 Name: "VEHICLE", 8402 Subjects: []string{"STATE.GLOBAL.LOCAL.>"}, 8403 Sources: []*nats.StreamSource{{ 8404 Name: "STATE", 8405 FilterSubject: fmt.Sprintf("STATE.GLOBAL.CELL1.%s.>", vin), 8406 External: &nats.ExternalStream{ 8407 APIPrefix: "$JS.cloud.API", 8408 DeliverPrefix: fmt.Sprintf("DELIVER.STATE.GLOBAL.CELL1.%s", vin), 8409 }, 8410 }}, 8411 }) 8412 require_NoError(t, err) 8413 // Create the sourced stream. 8414 checkLeafNodeConnectedCount(t, ln, 1) 8415 if elapsed := time.Since(start); elapsed > 2*time.Second { 8416 t.Fatalf("Took too long to create leafnode %d connection: %v", i+1, elapsed) 8417 } 8418 nc.Close() 8419 } 8420 } 8421 8422 func TestNoRaceJetStreamClusterDifferentRTTInterestBasedStreamPreAck(t *testing.T) { 8423 tmpl := ` 8424 listen: 127.0.0.1:-1 8425 server_name: %s 8426 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 8427 8428 cluster { 8429 name: "F3" 8430 listen: 127.0.0.1:%d 8431 routes = [%s] 8432 } 8433 8434 accounts { 8435 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 8436 } 8437 ` 8438 8439 // Route Ports 8440 // "S1": 14622, 8441 // "S2": 15622, 8442 // "S3": 16622, 8443 8444 // S2 (stream leader) will have a slow path to S1 (via proxy) and S3 (consumer leader) will have a fast path. 8445 8446 // Do these in order, S1, S2 (proxy) then S3. 8447 c := &cluster{t: t, servers: make([]*Server, 3), opts: make([]*Options, 3), name: "F3"} 8448 8449 // S1 8450 conf := fmt.Sprintf(tmpl, "S1", t.TempDir(), 14622, "route://127.0.0.1:15622, route://127.0.0.1:16622") 8451 c.servers[0], c.opts[0] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8452 8453 // S2 8454 // Create the proxy first. Connect this to S1. Make it slow, e.g. 5ms RTT. 8455 np := createNetProxy(1*time.Millisecond, 1024*1024*1024, 1024*1024*1024, "route://127.0.0.1:14622", true) 8456 routes := fmt.Sprintf("%s, route://127.0.0.1:16622", np.routeURL()) 8457 conf = fmt.Sprintf(tmpl, "S2", t.TempDir(), 15622, routes) 8458 c.servers[1], c.opts[1] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8459 8460 // S3 8461 conf = fmt.Sprintf(tmpl, "S3", t.TempDir(), 16622, "route://127.0.0.1:14622, route://127.0.0.1:15622") 8462 c.servers[2], c.opts[2] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8463 8464 c.checkClusterFormed() 8465 c.waitOnClusterReady() 8466 defer c.shutdown() 8467 defer np.stop() 8468 8469 nc, js := jsClientConnect(t, c.randomServer()) 8470 defer nc.Close() 8471 8472 // Now create the stream. 
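// Each leafnode in the loop above must come up within two seconds; the guard is
// simply time.Since around the setup work. A generic sketch of that style of
// assertion (requireUnder and setup are illustrative names):
func requireUnder(t *testing.T, limit time.Duration, setup func()) {
	t.Helper()
	start := time.Now()
	setup()
	if elapsed := time.Since(start); elapsed > limit {
		t.Fatalf("Took too long: %v > %v", elapsed, limit)
	}
}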
8473 _, err := js.AddStream(&nats.StreamConfig{ 8474 Name: "EVENTS", 8475 Subjects: []string{"EV.>"}, 8476 Replicas: 3, 8477 Retention: nats.InterestPolicy, 8478 }) 8479 require_NoError(t, err) 8480 8481 // Make sure it's leader is on S2. 8482 sl := c.servers[1] 8483 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 8484 c.waitOnStreamLeader(globalAccountName, "EVENTS") 8485 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl { 8486 s.JetStreamStepdownStream(globalAccountName, "EVENTS") 8487 return fmt.Errorf("Server %s is not stream leader yet", sl) 8488 } 8489 return nil 8490 }) 8491 8492 // Now create the consumer. 8493 _, err = js.AddConsumer("EVENTS", &nats.ConsumerConfig{ 8494 Durable: "C", 8495 AckPolicy: nats.AckExplicitPolicy, 8496 DeliverSubject: "dx", 8497 }) 8498 require_NoError(t, err) 8499 8500 // Make sure the consumer leader is on S3. 8501 cl := c.servers[2] 8502 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 8503 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 8504 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 8505 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 8506 return fmt.Errorf("Server %s is not consumer leader yet", sl) 8507 } 8508 return nil 8509 }) 8510 8511 // Create the real consumer on the consumer leader to make it efficient. 8512 nc, js = jsClientConnect(t, cl) 8513 defer nc.Close() 8514 8515 _, err = js.Subscribe(_EMPTY_, func(msg *nats.Msg) { 8516 msg.Ack() 8517 }, nats.BindStream("EVENTS"), nats.Durable("C"), nats.ManualAck()) 8518 require_NoError(t, err) 8519 8520 for i := 0; i < 1_000; i++ { 8521 _, err := js.PublishAsync("EVENTS.PAID", []byte("ok")) 8522 require_NoError(t, err) 8523 } 8524 select { 8525 case <-js.PublishAsyncComplete(): 8526 case <-time.After(5 * time.Second): 8527 t.Fatalf("Did not receive completion signal") 8528 } 8529 8530 slow := c.servers[0] 8531 mset, err := slow.GlobalAccount().lookupStream("EVENTS") 8532 require_NoError(t, err) 8533 8534 // Make sure preAck is non-nil, so we know the logic has kicked in. 8535 mset.mu.RLock() 8536 preAcks := mset.preAcks 8537 mset.mu.RUnlock() 8538 require_NotNil(t, preAcks) 8539 8540 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 8541 state := mset.state() 8542 if state.Msgs == 0 { 8543 mset.mu.RLock() 8544 lp := len(mset.preAcks) 8545 mset.mu.RUnlock() 8546 if lp == 0 { 8547 return nil 8548 } else { 8549 t.Fatalf("Expected no preAcks with no msgs, but got %d", lp) 8550 } 8551 } 8552 return fmt.Errorf("Still have %d msgs left", state.Msgs) 8553 }) 8554 8555 } 8556 8557 func TestNoRaceCheckAckFloorWithVeryLargeFirstSeqAndNewConsumers(t *testing.T) { 8558 s := RunBasicJetStreamServer(t) 8559 defer s.Shutdown() 8560 8561 nc, _ := jsClientConnect(t, s) 8562 defer nc.Close() 8563 8564 // Make sure to time bound here for the acksync call below. 8565 js, err := nc.JetStream(nats.MaxWait(200 * time.Millisecond)) 8566 require_NoError(t, err) 8567 8568 _, err = js.AddStream(&nats.StreamConfig{ 8569 Name: "TEST", 8570 Subjects: []string{"wq-req"}, 8571 Retention: nats.WorkQueuePolicy, 8572 }) 8573 require_NoError(t, err) 8574 8575 largeFirstSeq := uint64(1_200_000_000) 8576 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: largeFirstSeq}) 8577 require_NoError(t, err) 8578 si, err := js.StreamInfo("TEST") 8579 require_NoError(t, err) 8580 require_True(t, si.State.FirstSeq == largeFirstSeq) 8581 8582 // Add a simple request to the stream. 
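// Purging with an explicit Sequence is how the test above manufactures a very
// large FirstSeq without storing any data. A small sketch of that step, reusing
// the PurgeStream call exactly as it appears above (purgeTo is an illustrative
// name):
func purgeTo(t *testing.T, js nats.JetStreamContext, stream string, seq uint64) {
	t.Helper()
	err := js.PurgeStream(stream, &nats.StreamPurgeRequest{Sequence: seq})
	require_NoError(t, err)
	si, err := js.StreamInfo(stream)
	require_NoError(t, err)
	// After the purge the first sequence should sit exactly at the requested value.
	require_True(t, si.State.FirstSeq == seq)
}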
8583 sendStreamMsg(t, nc, "wq-req", "HELP") 8584 8585 sub, err := js.PullSubscribe("wq-req", "dlc") 8586 require_NoError(t, err) 8587 8588 msgs, err := sub.Fetch(1) 8589 require_NoError(t, err) 8590 require_True(t, len(msgs) == 1) 8591 8592 // The bug is around the checkAckFloor walking the sequences from current ackfloor 8593 // to the first sequence of the stream. We time bound the max wait with the js context 8594 // to 200ms. Since checkAckFloor is spinning and holding up processing of acks this will fail. 8595 // We will short circuit new consumers to fix this one. 8596 require_NoError(t, msgs[0].AckSync()) 8597 8598 // Now do again so we move past the new consumer with no ack floor situation. 8599 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: 2 * largeFirstSeq}) 8600 require_NoError(t, err) 8601 si, err = js.StreamInfo("TEST") 8602 require_NoError(t, err) 8603 require_True(t, si.State.FirstSeq == 2*largeFirstSeq) 8604 8605 sendStreamMsg(t, nc, "wq-req", "MORE HELP") 8606 8607 // We check this one directly for this use case. 8608 mset, err := s.GlobalAccount().lookupStream("TEST") 8609 require_NoError(t, err) 8610 o := mset.lookupConsumer("dlc") 8611 require_True(t, o != nil) 8612 8613 // Purge will move the stream floor by default, so force into the situation where it is back to largeFirstSeq. 8614 // This will not trigger the new consumer logic, but will trigger a walk of the sequence space. 8615 // Fix will be to walk the lesser of the two linear spaces. 8616 o.mu.Lock() 8617 o.asflr = largeFirstSeq 8618 o.mu.Unlock() 8619 8620 done := make(chan bool) 8621 go func() { 8622 o.checkAckFloor() 8623 done <- true 8624 }() 8625 8626 select { 8627 case <-done: 8628 return 8629 case <-time.After(time.Second): 8630 t.Fatalf("Check ack floor taking too long!") 8631 } 8632 } 8633 8634 func TestNoRaceReplicatedMirrorWithLargeStartingSequenceOverLeafnode(t *testing.T) { 8635 // Cluster B 8636 tmpl := strings.Replace(jsClusterTempl, "store_dir:", "domain: B, store_dir:", 1) 8637 c := createJetStreamCluster(t, tmpl, "B", _EMPTY_, 3, 22020, true) 8638 defer c.shutdown() 8639 8640 // Cluster A 8641 // Domain is "A' 8642 lc := c.createLeafNodesWithStartPortAndDomain("A", 3, 22110, "A") 8643 defer lc.shutdown() 8644 8645 lc.waitOnClusterReady() 8646 8647 // Create a stream on B (HUB/CLOUD) and set its starting sequence very high. 8648 nc, js := jsClientConnect(t, c.randomServer()) 8649 defer nc.Close() 8650 8651 _, err := js.AddStream(&nats.StreamConfig{ 8652 Name: "TEST", 8653 Subjects: []string{"foo"}, 8654 Replicas: 3, 8655 }) 8656 require_NoError(t, err) 8657 8658 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: 1_000_000_000}) 8659 require_NoError(t, err) 8660 8661 // Send in a small amount of messages. 8662 for i := 0; i < 1000; i++ { 8663 sendStreamMsg(t, nc, "foo", "Hello") 8664 } 8665 8666 si, err := js.StreamInfo("TEST") 8667 require_NoError(t, err) 8668 require_True(t, si.State.FirstSeq == 1_000_000_000) 8669 8670 // Now try to create a replicated mirror on the leaf cluster. 8671 lnc, ljs := jsClientConnect(t, lc.randomServer()) 8672 defer lnc.Close() 8673 8674 _, err = ljs.AddStream(&nats.StreamConfig{ 8675 Name: "TEST", 8676 Mirror: &nats.StreamSource{ 8677 Name: "TEST", 8678 Domain: "B", 8679 }, 8680 }) 8681 require_NoError(t, err) 8682 8683 // Make sure we sync quickly. 
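// checkAckFloor is exercised above under a watchdog: run it in a goroutine and
// fail if it does not return within a second, which is what catches the
// sequence-space walk regression. The pattern generalizes; a minimal sketch
// (mustFinishWithin and fn are illustrative names):
func mustFinishWithin(t *testing.T, limit time.Duration, fn func()) {
	t.Helper()
	done := make(chan struct{})
	go func() {
		fn()
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(limit):
		t.Fatalf("Call did not finish within %v", limit)
	}
}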
8684 checkFor(t, time.Second, 200*time.Millisecond, func() error { 8685 si, err = ljs.StreamInfo("TEST") 8686 require_NoError(t, err) 8687 if si.State.Msgs == 1000 && si.State.FirstSeq == 1_000_000_000 { 8688 return nil 8689 } 8690 return fmt.Errorf("Mirror state not correct: %+v", si.State) 8691 }) 8692 } 8693 8694 func TestNoRaceBinaryStreamSnapshotEncodingBasic(t *testing.T) { 8695 s := RunBasicJetStreamServer(t) 8696 defer s.Shutdown() 8697 8698 nc, js := jsClientConnect(t, s) 8699 defer nc.Close() 8700 8701 _, err := js.AddStream(&nats.StreamConfig{ 8702 Name: "TEST", 8703 Subjects: []string{"*"}, 8704 MaxMsgsPerSubject: 1, 8705 }) 8706 require_NoError(t, err) 8707 8708 // Set first key 8709 sendStreamMsg(t, nc, "key:1", "hello") 8710 8711 // Set Second key but keep updating it, causing a laggard pattern. 8712 value := bytes.Repeat([]byte("Z"), 8*1024) 8713 8714 for i := 0; i <= 1000; i++ { 8715 _, err := js.PublishAsync("key:2", value) 8716 require_NoError(t, err) 8717 } 8718 select { 8719 case <-js.PublishAsyncComplete(): 8720 case <-time.After(5 * time.Second): 8721 t.Fatalf("Did not receive completion signal") 8722 } 8723 8724 // Now do more of swiss cheese style. 8725 for i := 3; i <= 1000; i++ { 8726 key := fmt.Sprintf("key:%d", i) 8727 _, err := js.PublishAsync(key, value) 8728 require_NoError(t, err) 8729 // Send it twice to create hole right behind it, like swiss cheese. 8730 _, err = js.PublishAsync(key, value) 8731 require_NoError(t, err) 8732 } 8733 select { 8734 case <-js.PublishAsyncComplete(): 8735 case <-time.After(5 * time.Second): 8736 t.Fatalf("Did not receive completion signal") 8737 } 8738 8739 // Make for round numbers for stream state. 8740 sendStreamMsg(t, nc, "key:2", "hello") 8741 sendStreamMsg(t, nc, "key:2", "world") 8742 8743 si, err := js.StreamInfo("TEST") 8744 require_NoError(t, err) 8745 require_True(t, si.State.FirstSeq == 1) 8746 require_True(t, si.State.LastSeq == 3000) 8747 require_True(t, si.State.Msgs == 1000) 8748 require_True(t, si.State.NumDeleted == 2000) 8749 8750 mset, err := s.GlobalAccount().lookupStream("TEST") 8751 require_NoError(t, err) 8752 8753 snap, err := mset.store.EncodedStreamState(0) 8754 require_NoError(t, err) 8755 8756 // Now decode the snapshot. 8757 ss, err := DecodeStreamState(snap) 8758 require_NoError(t, err) 8759 8760 require_Equal(t, ss.FirstSeq, 1) 8761 require_Equal(t, ss.LastSeq, 3000) 8762 require_Equal(t, ss.Msgs, 1000) 8763 require_Equal(t, ss.Deleted.NumDeleted(), 2000) 8764 } 8765 8766 func TestNoRaceFilestoreBinaryStreamSnapshotEncodingLargeGaps(t *testing.T) { 8767 storeDir := t.TempDir() 8768 fcfg := FileStoreConfig{ 8769 StoreDir: storeDir, 8770 BlockSize: 512, // Small on purpose to create alot of blks. 8771 } 8772 fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) 8773 require_NoError(t, err) 8774 defer fs.Stop() 8775 8776 subj, msg := "zzz", bytes.Repeat([]byte("X"), 128) 8777 numMsgs := 20_000 8778 8779 fs.StoreMsg(subj, nil, msg) 8780 for i := 2; i < numMsgs; i++ { 8781 seq, _, err := fs.StoreMsg(subj, nil, nil) 8782 require_NoError(t, err) 8783 fs.RemoveMsg(seq) 8784 } 8785 fs.StoreMsg(subj, nil, msg) 8786 8787 snap, err := fs.EncodedStreamState(0) 8788 require_NoError(t, err) 8789 require_True(t, len(snap) < 512) 8790 8791 // Now decode the snapshot. 
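// The binary snapshot checks above always follow the same round-trip:
// EncodedStreamState on the store, DecodeStreamState on the bytes, then compare
// against the expected counts. A compact sketch of that round-trip for any
// StreamStore, using only calls that appear in this file (the helper name is
// illustrative):
func checkSnapshotRoundTrip(t *testing.T, store StreamStore, wantMsgs, wantFirst, wantLast uint64) {
	t.Helper()
	snap, err := store.EncodedStreamState(0)
	require_NoError(t, err)
	ss, err := DecodeStreamState(snap)
	require_NoError(t, err)
	require_Equal(t, ss.Msgs, wantMsgs)
	require_Equal(t, ss.FirstSeq, wantFirst)
	require_Equal(t, ss.LastSeq, wantLast)
}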
8792 ss, err := DecodeStreamState(snap) 8793 require_NoError(t, err) 8794 8795 require_True(t, ss.FirstSeq == 1) 8796 require_True(t, ss.LastSeq == 20_000) 8797 require_True(t, ss.Msgs == 2) 8798 require_True(t, len(ss.Deleted) <= 2) 8799 require_True(t, ss.Deleted.NumDeleted() == 19_998) 8800 } 8801 8802 func TestNoRaceJetStreamClusterStreamSnapshotCatchup(t *testing.T) { 8803 c := createJetStreamClusterExplicit(t, "R3S", 3) 8804 defer c.shutdown() 8805 8806 // Client based API 8807 nc, js := jsClientConnect(t, c.randomServer()) 8808 defer nc.Close() 8809 8810 _, err := js.AddStream(&nats.StreamConfig{ 8811 Name: "TEST", 8812 Subjects: []string{"*"}, 8813 MaxMsgsPerSubject: 1, 8814 Replicas: 3, 8815 }) 8816 require_NoError(t, err) 8817 8818 msg := []byte("Hello World") 8819 _, err = js.Publish("foo", msg) 8820 require_NoError(t, err) 8821 8822 for i := 1; i < 1000; i++ { 8823 _, err := js.PublishAsync("bar", msg) 8824 require_NoError(t, err) 8825 } 8826 select { 8827 case <-js.PublishAsyncComplete(): 8828 case <-time.After(5 * time.Second): 8829 t.Fatalf("Did not receive completion signal") 8830 } 8831 8832 sr := c.randomNonStreamLeader(globalAccountName, "TEST") 8833 sr.Shutdown() 8834 8835 // In case we were connected to sr. 8836 nc, js = jsClientConnect(t, c.randomServer()) 8837 defer nc.Close() 8838 8839 // Now create a large gap. 8840 for i := 0; i < 50_000; i++ { 8841 _, err := js.PublishAsync("bar", msg) 8842 require_NoError(t, err) 8843 } 8844 select { 8845 case <-js.PublishAsyncComplete(): 8846 case <-time.After(10 * time.Second): 8847 t.Fatalf("Did not receive completion signal") 8848 } 8849 8850 sl := c.streamLeader(globalAccountName, "TEST") 8851 sl.JetStreamSnapshotStream(globalAccountName, "TEST") 8852 8853 sr = c.restartServer(sr) 8854 c.checkClusterFormed() 8855 c.waitOnServerCurrent(sr) 8856 c.waitOnStreamCurrent(sr, globalAccountName, "TEST") 8857 8858 mset, err := sr.GlobalAccount().lookupStream("TEST") 8859 require_NoError(t, err) 8860 8861 // Make sure it's caught up 8862 var state StreamState 8863 mset.store.FastState(&state) 8864 require_Equal(t, state.Msgs, 2) 8865 require_Equal(t, state.FirstSeq, 1) 8866 require_Equal(t, state.LastSeq, 51_000) 8867 require_Equal(t, state.NumDeleted, 51_000-2) 8868 8869 sr.Shutdown() 8870 8871 _, err = js.Publish("baz", msg) 8872 require_NoError(t, err) 8873 8874 sl.JetStreamSnapshotStream(globalAccountName, "TEST") 8875 8876 sr = c.restartServer(sr) 8877 c.checkClusterFormed() 8878 c.waitOnServerCurrent(sr) 8879 c.waitOnStreamCurrent(sr, globalAccountName, "TEST") 8880 8881 mset, err = sr.GlobalAccount().lookupStream("TEST") 8882 require_NoError(t, err) 8883 mset.store.FastState(&state) 8884 8885 require_Equal(t, state.Msgs, 3) 8886 require_Equal(t, state.FirstSeq, 1) 8887 require_Equal(t, state.LastSeq, 51_001) 8888 require_Equal(t, state.NumDeleted, 51_001-3) 8889 } 8890 8891 func TestNoRaceStoreStreamEncoderDecoder(t *testing.T) { 8892 cfg := &StreamConfig{ 8893 Name: "zzz", 8894 Subjects: []string{"*"}, 8895 MaxMsgsPer: 1, 8896 Storage: MemoryStorage, 8897 } 8898 ms, err := newMemStore(cfg) 8899 require_NoError(t, err) 8900 8901 fs, err := newFileStore( 8902 FileStoreConfig{StoreDir: t.TempDir()}, 8903 StreamConfig{Name: "zzz", Subjects: []string{"*"}, MaxMsgsPer: 1, Storage: FileStorage}, 8904 ) 8905 require_NoError(t, err) 8906 defer fs.Stop() 8907 8908 const seed = 2222222 8909 msg := bytes.Repeat([]byte("ABC"), 33) // ~100bytes 8910 8911 maxEncodeTime := 2 * time.Second 8912 maxEncodeSize := 700 * 1024 8913 8914 test := 
func(t *testing.T, gs StreamStore) { 8915 t.Parallel() 8916 prand := rand.New(rand.NewSource(seed)) 8917 tick := time.NewTicker(time.Second) 8918 defer tick.Stop() 8919 done := time.NewTimer(10 * time.Second) 8920 8921 for running := true; running; { 8922 select { 8923 case <-tick.C: 8924 var state StreamState 8925 gs.FastState(&state) 8926 if state.NumDeleted == 0 { 8927 continue 8928 } 8929 start := time.Now() 8930 snap, err := gs.EncodedStreamState(0) 8931 require_NoError(t, err) 8932 elapsed := time.Since(start) 8933 // Should take <1ms without race but if CI/CD is slow we will give it a bit of room. 8934 if elapsed > maxEncodeTime { 8935 t.Logf("Encode took longer then expected: %v", elapsed) 8936 } 8937 if len(snap) > maxEncodeSize { 8938 t.Fatalf("Expected snapshot size < %v got %v", friendlyBytes(maxEncodeSize), friendlyBytes(len(snap))) 8939 } 8940 ss, err := DecodeStreamState(snap) 8941 require_True(t, len(ss.Deleted) > 0) 8942 require_NoError(t, err) 8943 case <-done.C: 8944 running = false 8945 default: 8946 key := strconv.Itoa(prand.Intn(256_000)) 8947 gs.StoreMsg(key, nil, msg) 8948 } 8949 } 8950 } 8951 8952 for _, gs := range []StreamStore{ms, fs} { 8953 switch gs.(type) { 8954 case *memStore: 8955 t.Run("MemStore", func(t *testing.T) { 8956 test(t, gs) 8957 }) 8958 case *fileStore: 8959 t.Run("FileStore", func(t *testing.T) { 8960 test(t, gs) 8961 }) 8962 } 8963 } 8964 } 8965 8966 func TestNoRaceJetStreamClusterKVWithServerKill(t *testing.T) { 8967 c := createJetStreamClusterExplicit(t, "R3S", 3) 8968 defer c.shutdown() 8969 8970 // Setup the KV bucket and use for making assertions. 8971 nc, js := jsClientConnect(t, c.randomServer()) 8972 defer nc.Close() 8973 _, err := js.CreateKeyValue(&nats.KeyValueConfig{ 8974 Bucket: "TEST", 8975 Replicas: 3, 8976 History: 10, 8977 }) 8978 require_NoError(t, err) 8979 8980 // Total number of keys to range over. 8981 numKeys := 50 8982 8983 // ID is the server id to explicitly connect to. 8984 work := func(ctx context.Context, wg *sync.WaitGroup, id int) { 8985 defer wg.Done() 8986 8987 nc, js := jsClientConnect(t, c.servers[id]) 8988 defer nc.Close() 8989 8990 kv, err := js.KeyValue("TEST") 8991 require_NoError(t, err) 8992 8993 // 100 messages a second for each single client. 8994 tk := time.NewTicker(10 * time.Millisecond) 8995 defer tk.Stop() 8996 8997 for { 8998 select { 8999 case <-ctx.Done(): 9000 return 9001 9002 case <-tk.C: 9003 // Pick a random key within the range. 9004 k := fmt.Sprintf("key.%d", rand.Intn(numKeys)) 9005 // Attempt to get a key. 9006 e, err := kv.Get(k) 9007 // If found, attempt to update or delete. 9008 if err == nil { 9009 if rand.Intn(10) < 3 { 9010 kv.Delete(k, nats.LastRevision(e.Revision())) 9011 } else { 9012 kv.Update(k, nil, e.Revision()) 9013 } 9014 } else if errors.Is(err, nats.ErrKeyNotFound) { 9015 kv.Create(k, nil) 9016 } 9017 } 9018 } 9019 } 9020 9021 ctx, cancel := context.WithCancel(context.Background()) 9022 defer cancel() 9023 9024 var wg sync.WaitGroup 9025 wg.Add(3) 9026 9027 go work(ctx, &wg, 0) 9028 go work(ctx, &wg, 1) 9029 go work(ctx, &wg, 2) 9030 9031 time.Sleep(time.Second) 9032 9033 // Simulate server stop and restart. 9034 for i := 0; i < 10; i++ { 9035 s := c.randomServer() 9036 s.Shutdown() 9037 c.waitOnLeader() 9038 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 9039 9040 // Wait for a bit and then start the server again. 
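// The KV workload above is an optimistic-concurrency loop over the bucket: read
// a key, then Update with the revision just observed, or Create when the key is
// missing, so a stale writer can never silently clobber newer data. A minimal
// sketch of one such write using the nats.go KV API (casWrite is an
// illustrative name):
func casWrite(kv nats.KeyValue, key string, value []byte) error {
	e, err := kv.Get(key)
	if errors.Is(err, nats.ErrKeyNotFound) {
		_, err = kv.Create(key, value)
		return err
	}
	if err != nil {
		return err
	}
	// Update only succeeds if the revision still matches what we just read.
	_, err = kv.Update(key, value, e.Revision())
	return err
}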
9041 time.Sleep(time.Duration(rand.Intn(1500)) * time.Millisecond) 9042 s = c.restartServer(s) 9043 c.waitOnServerCurrent(s) 9044 c.waitOnLeader() 9045 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 9046 c.waitOnPeerCount(3) 9047 } 9048 9049 // Stop the workload. 9050 cancel() 9051 wg.Wait() 9052 9053 type fullState struct { 9054 state StreamState 9055 lseq uint64 9056 clfs uint64 9057 } 9058 9059 grabState := func(mset *stream) *fullState { 9060 mset.mu.RLock() 9061 defer mset.mu.RUnlock() 9062 var state StreamState 9063 mset.store.FastState(&state) 9064 return &fullState{state, mset.lseq, mset.clfs} 9065 } 9066 9067 grabStore := func(mset *stream) map[string][]uint64 { 9068 mset.mu.RLock() 9069 store := mset.store 9070 mset.mu.RUnlock() 9071 var state StreamState 9072 store.FastState(&state) 9073 storeMap := make(map[string][]uint64) 9074 for seq := state.FirstSeq; seq <= state.LastSeq; seq++ { 9075 if sm, err := store.LoadMsg(seq, nil); err == nil { 9076 storeMap[sm.subj] = append(storeMap[sm.subj], sm.seq) 9077 } 9078 } 9079 return storeMap 9080 } 9081 9082 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 9083 // Current stream leader. 9084 sl := c.streamLeader(globalAccountName, "KV_TEST") 9085 mset, err := sl.GlobalAccount().lookupStream("KV_TEST") 9086 require_NoError(t, err) 9087 lstate := grabState(mset) 9088 golden := grabStore(mset) 9089 9090 // Report messages per server. 9091 for _, s := range c.servers { 9092 if s == sl { 9093 continue 9094 } 9095 mset, err := s.GlobalAccount().lookupStream("KV_TEST") 9096 require_NoError(t, err) 9097 state := grabState(mset) 9098 if !reflect.DeepEqual(state, lstate) { 9099 return fmt.Errorf("Expected follower state\n%+v\nto match leader's\n %+v", state, lstate) 9100 } 9101 sm := grabStore(mset) 9102 if !reflect.DeepEqual(sm, golden) { 9103 t.Fatalf("Expected follower store for %v\n%+v\nto match leader's %v\n %+v", s, sm, sl, golden) 9104 } 9105 } 9106 return nil 9107 }) 9108 } 9109 9110 func TestNoRaceFileStoreLargeMsgsAndFirstMatching(t *testing.T) { 9111 sd := t.TempDir() 9112 fs, err := newFileStore( 9113 FileStoreConfig{StoreDir: sd, BlockSize: 8 * 1024 * 1024}, 9114 StreamConfig{Name: "zzz", Subjects: []string{">"}, Storage: FileStorage}) 9115 require_NoError(t, err) 9116 defer fs.Stop() 9117 9118 for i := 0; i < 150_000; i++ { 9119 fs.StoreMsg(fmt.Sprintf("foo.bar.%d", i), nil, nil) 9120 } 9121 for i := 0; i < 150_000; i++ { 9122 fs.StoreMsg(fmt.Sprintf("foo.baz.%d", i), nil, nil) 9123 } 9124 require_Equal(t, fs.numMsgBlocks(), 2) 9125 fs.mu.RLock() 9126 mb := fs.blks[1] 9127 fs.mu.RUnlock() 9128 fseq := atomic.LoadUint64(&mb.first.seq) 9129 // The -40 leaves enough mb.fss entries to kick in linear scan. 9130 for seq := fseq; seq < 300_000-40; seq++ { 9131 fs.RemoveMsg(uint64(seq)) 9132 } 9133 start := time.Now() 9134 fs.LoadNextMsg("*.baz.*", true, fseq, nil) 9135 require_True(t, time.Since(start) < 200*time.Microsecond) 9136 // Now remove more to kick into non-linear logic. 
9137 for seq := 300_000 - 40; seq < 300_000; seq++ { 9138 fs.RemoveMsg(uint64(seq)) 9139 } 9140 start = time.Now() 9141 fs.LoadNextMsg("*.baz.*", true, fseq, nil) 9142 require_True(t, time.Since(start) < 200*time.Microsecond) 9143 } 9144 9145 func TestNoRaceWSNoCorruptionWithFrameSizeLimit(t *testing.T) { 9146 testWSNoCorruptionWithFrameSizeLimit(t, 50000) 9147 } 9148 9149 func TestNoRaceJetStreamAPIDispatchQueuePending(t *testing.T) { 9150 c := createJetStreamClusterExplicit(t, "R3S", 3) 9151 defer c.shutdown() 9152 9153 // Setup the KV bucket and use for making assertions. 9154 nc, js := jsClientConnect(t, c.randomServer()) 9155 defer nc.Close() 9156 9157 _, err := js.AddStream(&nats.StreamConfig{ 9158 Name: "TEST", 9159 Subjects: []string{"foo.*.*"}, 9160 }) 9161 require_NoError(t, err) 9162 9163 // Queue up 500k messages all with different subjects. 9164 // We want to make num pending for a consumer expensive, so a large subject 9165 // space and wildcards for now does the trick. 9166 toks := []string{"foo", "bar", "baz"} // for second token. 9167 for i := 1; i <= 500_000; i++ { 9168 subj := fmt.Sprintf("foo.%s.%d", toks[rand.Intn(len(toks))], i) 9169 _, err := js.PublishAsync(subj, nil, nats.StallWait(time.Second)) 9170 require_NoError(t, err) 9171 } 9172 select { 9173 case <-js.PublishAsyncComplete(): 9174 case <-time.After(20 * time.Second): 9175 t.Fatalf("Did not receive completion signal") 9176 } 9177 9178 // To back up our pending queue we will create lots of filtered, with wildcards, R1 consumers 9179 // from a different server then the one hosting the stream. 9180 // ok to share this connection here. 9181 sldr := c.streamLeader(globalAccountName, "TEST") 9182 for _, s := range c.servers { 9183 if s != sldr { 9184 nc, js = jsClientConnect(t, s) 9185 defer nc.Close() 9186 break 9187 } 9188 } 9189 9190 ngr, ncons := 100, 10 9191 startCh, errCh := make(chan bool), make(chan error, ngr) 9192 var wg, swg sync.WaitGroup 9193 wg.Add(ngr) 9194 swg.Add(ngr) 9195 9196 // The wildcard in the filter subject is the key. 9197 cfg := &nats.ConsumerConfig{FilterSubject: "foo.*.22"} 9198 var tt atomic.Int64 9199 9200 for i := 0; i < ngr; i++ { 9201 go func() { 9202 defer wg.Done() 9203 swg.Done() 9204 // Make them all fire at once. 9205 <-startCh 9206 9207 for i := 0; i < ncons; i++ { 9208 start := time.Now() 9209 if _, err := js.AddConsumer("TEST", cfg); err != nil { 9210 errCh <- err 9211 t.Logf("Got err creating consumer: %v", err) 9212 } 9213 elapsed := time.Since(start) 9214 tt.Add(int64(elapsed)) 9215 } 9216 }() 9217 } 9218 swg.Wait() 9219 close(startCh) 9220 time.Sleep(time.Millisecond) 9221 jsz, _ := sldr.Jsz(nil) 9222 // This could be 0 legit, so just log, don't fail. 9223 if jsz.JetStreamStats.API.Inflight == 0 { 9224 t.Log("Expected a non-zero inflight") 9225 } 9226 wg.Wait() 9227 9228 if len(errCh) > 0 { 9229 t.Fatalf("Expected no errors, got %d", len(errCh)) 9230 } 9231 } 9232 9233 func TestNoRaceJetStreamMirrorAndSourceConsumerFailBackoff(t *testing.T) { 9234 // Check calculations first. 9235 for i := 1; i <= 20; i++ { 9236 backoff := calculateRetryBackoff(i) 9237 if i < 12 { 9238 require_Equal(t, backoff, time.Duration(i)*10*time.Second) 9239 } else { 9240 require_Equal(t, backoff, retryMaximum) 9241 } 9242 } 9243 9244 c := createJetStreamClusterExplicit(t, "R3S", 3) 9245 defer c.shutdown() 9246 9247 // Setup the KV bucket and use for making assertions. 
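// The loop above pins down the retry schedule for failed mirror and source
// consumer creates: linear growth of ten seconds per attempt, capped at
// retryMaximum from roughly the twelfth attempt on. A sketch that mirrors the
// assertions above (not the calculateRetryBackoff implementation itself):
func expectedRetryBackoff(attempt int) time.Duration {
	if attempt < 12 {
		return time.Duration(attempt) * 10 * time.Second
	}
	return retryMaximum
}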
9248 nc, js := jsClientConnect(t, c.randomServer()) 9249 defer nc.Close() 9250 9251 _, err := js.AddStream(&nats.StreamConfig{ 9252 Name: "TEST", 9253 Subjects: []string{"foo.*.*"}, 9254 }) 9255 require_NoError(t, err) 9256 sl := c.streamLeader(globalAccountName, "TEST") 9257 9258 // Create a mirror. 9259 ml := sl 9260 // Make sure not on the same server. Should not happened in general but possible. 9261 for ml == sl { 9262 js.DeleteStream("MIRROR") 9263 _, err = js.AddStream(&nats.StreamConfig{ 9264 Name: "MIRROR", 9265 Mirror: &nats.StreamSource{Name: "TEST"}, 9266 }) 9267 require_NoError(t, err) 9268 ml = c.streamLeader(globalAccountName, "MIRROR") 9269 } 9270 9271 // Create sub to watch for the consumer create requests. 9272 nc, _ = jsClientConnect(t, ml) 9273 defer nc.Close() 9274 sub, err := nc.SubscribeSync("$JS.API.CONSUMER.CREATE.>") 9275 require_NoError(t, err) 9276 9277 // Kill the server where the source is.. 9278 sldr := c.streamLeader(globalAccountName, "TEST") 9279 sldr.Shutdown() 9280 9281 // Wait for just greater than 10s. We should only see 1 request during this time. 9282 time.Sleep(11 * time.Second) 9283 n, _, _ := sub.Pending() 9284 require_Equal(t, n, 1) 9285 9286 // Now make sure that the fails is set properly. 9287 mset, err := ml.GlobalAccount().lookupStream("MIRROR") 9288 require_NoError(t, err) 9289 mset.mu.RLock() 9290 fails := mset.mirror.fails 9291 mset.mu.RUnlock() 9292 require_Equal(t, fails, 1) 9293 9294 js.DeleteStream("MIRROR") 9295 // Clear sub 9296 sub.NextMsg(time.Second) 9297 // Make sure sources behave similarly. 9298 _, err = js.AddStream(&nats.StreamConfig{ 9299 Name: "SOURCE", 9300 Sources: []*nats.StreamSource{{Name: "TEST"}}, 9301 }) 9302 require_NoError(t, err) 9303 9304 // Wait for just greater than 10s. We should only see 1 request during this time. 9305 time.Sleep(11 * time.Second) 9306 n, _, _ = sub.Pending() 9307 require_Equal(t, n, 1) 9308 9309 mset, err = c.streamLeader(globalAccountName, "SOURCE").GlobalAccount().lookupStream("SOURCE") 9310 require_NoError(t, err) 9311 mset.mu.RLock() 9312 si := mset.sources["TEST > >"] 9313 mset.mu.RUnlock() 9314 require_True(t, si != nil) 9315 require_Equal(t, si.fails, 1) 9316 } 9317 9318 func TestNoRaceJetStreamClusterStreamCatchupLargeInteriorDeletes(t *testing.T) { 9319 c := createJetStreamClusterExplicit(t, "R3S", 3) 9320 defer c.shutdown() 9321 9322 nc, js := jsClientConnect(t, c.randomServer()) 9323 defer nc.Close() 9324 9325 cfg := &nats.StreamConfig{ 9326 Name: "TEST", 9327 Subjects: []string{"foo.*"}, 9328 MaxMsgsPerSubject: 100, 9329 Replicas: 1, 9330 } 9331 9332 _, err := js.AddStream(cfg) 9333 require_NoError(t, err) 9334 9335 msg := bytes.Repeat([]byte("Z"), 2*1024) 9336 // We will create lots of interior deletes on our R1 then scale up. 9337 _, err = js.Publish("foo.0", msg) 9338 require_NoError(t, err) 9339 9340 // Create 50k messages randomly from 1-100 9341 for i := 0; i < 50_000; i++ { 9342 subj := fmt.Sprintf("foo.%d", rand.Intn(100)+1) 9343 js.PublishAsync(subj, msg) 9344 } 9345 select { 9346 case <-js.PublishAsyncComplete(): 9347 case <-time.After(5 * time.Second): 9348 t.Fatalf("Did not receive completion signal") 9349 } 9350 // Now create a large gap. 9351 for i := 0; i < 100_000; i++ { 9352 js.PublishAsync("foo.2", msg) 9353 } 9354 select { 9355 case <-js.PublishAsyncComplete(): 9356 case <-time.After(5 * time.Second): 9357 t.Fatalf("Did not receive completion signal") 9358 } 9359 // Do 50k random again at end. 
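// Above, the number of consumer-create attempts is observed from the outside:
// a plain sync subscription on the JS API create subject, then Pending() after
// the wait. A small sketch of that probe (countConsumerCreates is an
// illustrative name; the subject is the one used above):
func countConsumerCreates(t *testing.T, nc *nats.Conn, wait time.Duration) int {
	t.Helper()
	sub, err := nc.SubscribeSync("$JS.API.CONSUMER.CREATE.>")
	require_NoError(t, err)
	defer sub.Unsubscribe()
	time.Sleep(wait)
	n, _, err := sub.Pending()
	require_NoError(t, err)
	return n
}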
9360 for i := 0; i < 50_000; i++ { 9361 subj := fmt.Sprintf("foo.%d", rand.Intn(100)+1) 9362 js.PublishAsync(subj, msg) 9363 } 9364 select { 9365 case <-js.PublishAsyncComplete(): 9366 case <-time.After(5 * time.Second): 9367 t.Fatalf("Did not receive completion signal") 9368 } 9369 9370 si, err := js.StreamInfo("TEST") 9371 require_NoError(t, err) 9372 9373 cfg.Replicas = 2 9374 _, err = js.UpdateStream(cfg) 9375 require_NoError(t, err) 9376 9377 // Let catchup start. 9378 c.waitOnStreamLeader(globalAccountName, "TEST") 9379 9380 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 9381 require_True(t, nl != nil) 9382 mset, err := nl.GlobalAccount().lookupStream("TEST") 9383 require_NoError(t, err) 9384 9385 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 9386 state := mset.state() 9387 if state.Msgs == si.State.Msgs { 9388 return nil 9389 } 9390 return fmt.Errorf("Msgs not equal %d vs %d", state.Msgs, si.State.Msgs) 9391 }) 9392 } 9393 9394 func TestNoRaceJetStreamClusterBadRestartsWithHealthzPolling(t *testing.T) { 9395 c := createJetStreamClusterExplicit(t, "R3S", 3) 9396 defer c.shutdown() 9397 9398 nc, js := jsClientConnect(t, c.randomServer()) 9399 defer nc.Close() 9400 9401 cfg := &nats.StreamConfig{ 9402 Name: "TEST", 9403 Subjects: []string{"foo.>"}, 9404 Replicas: 3, 9405 } 9406 _, err := js.AddStream(cfg) 9407 require_NoError(t, err) 9408 9409 // We will poll healthz at a decent clip and make sure any restart logic works 9410 // correctly with assets coming and going. 9411 ch := make(chan struct{}) 9412 defer close(ch) 9413 9414 go func() { 9415 for { 9416 select { 9417 case <-ch: 9418 return 9419 case <-time.After(50 * time.Millisecond): 9420 for _, s := range c.servers { 9421 s.healthz(nil) 9422 } 9423 } 9424 } 9425 }() 9426 9427 numConsumers := 500 9428 consumers := make([]string, 0, numConsumers) 9429 9430 var wg sync.WaitGroup 9431 9432 for i := 0; i < numConsumers; i++ { 9433 cname := fmt.Sprintf("CONS-%d", i+1) 9434 consumers = append(consumers, cname) 9435 wg.Add(1) 9436 go func() { 9437 defer wg.Done() 9438 _, err := js.PullSubscribe("foo.>", cname, nats.BindStream("TEST")) 9439 require_NoError(t, err) 9440 }() 9441 } 9442 wg.Wait() 9443 9444 // Make sure all are reported. 9445 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9446 for _, s := range c.servers { 9447 jsz, _ := s.Jsz(nil) 9448 if jsz.Consumers != numConsumers { 9449 return fmt.Errorf("%v wrong number of consumers: %d vs %d", s, jsz.Consumers, numConsumers) 9450 } 9451 } 9452 return nil 9453 }) 9454 9455 // Now do same for streams. 9456 numStreams := 200 9457 streams := make([]string, 0, numStreams) 9458 9459 for i := 0; i < numStreams; i++ { 9460 sname := fmt.Sprintf("TEST-%d", i+1) 9461 streams = append(streams, sname) 9462 wg.Add(1) 9463 go func() { 9464 defer wg.Done() 9465 _, err := js.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 9466 require_NoError(t, err) 9467 }() 9468 } 9469 wg.Wait() 9470 9471 // Make sure all are reported. 9472 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9473 for _, s := range c.servers { 9474 jsz, _ := s.Jsz(nil) 9475 if jsz.Streams != numStreams+1 { 9476 return fmt.Errorf("%v wrong number of streams: %d vs %d", s, jsz.Streams, numStreams+1) 9477 } 9478 } 9479 return nil 9480 }) 9481 9482 // Delete consumers. 9483 for _, cname := range consumers { 9484 err := js.DeleteConsumer("TEST", cname) 9485 require_NoError(t, err) 9486 } 9487 // Make sure reporting goes to zero. 
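// The healthz poller above is a stop-channel loop: every 50ms it calls healthz
// on each server until the channel is closed, which keeps restart logic busy
// while assets come and go. A standalone sketch of that loop (pollHealthz and
// every are illustrative names):
func pollHealthz(servers []*Server, stop <-chan struct{}, every time.Duration) {
	for {
		select {
		case <-stop:
			return
		case <-time.After(every):
			for _, s := range servers {
				s.healthz(nil)
			}
		}
	}
}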
9488 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9489 for _, s := range c.servers { 9490 jsz, _ := s.Jsz(nil) 9491 if jsz.Consumers != 0 { 9492 return fmt.Errorf("%v still has %d consumers", s, jsz.Consumers) 9493 } 9494 } 9495 return nil 9496 }) 9497 9498 // Delete streams 9499 for _, sname := range streams { 9500 err := js.DeleteStream(sname) 9501 require_NoError(t, err) 9502 } 9503 err = js.DeleteStream("TEST") 9504 require_NoError(t, err) 9505 9506 // Make sure reporting goes to zero. 9507 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9508 for _, s := range c.servers { 9509 jsz, _ := s.Jsz(nil) 9510 if jsz.Streams != 0 { 9511 return fmt.Errorf("%v still has %d streams", s, jsz.Streams) 9512 } 9513 } 9514 return nil 9515 }) 9516 } 9517 9518 func TestNoRaceJetStreamKVReplaceWithServerRestart(t *testing.T) { 9519 c := createJetStreamClusterExplicit(t, "R3S", 3) 9520 defer c.shutdown() 9521 9522 nc, _ := jsClientConnect(t, c.randomServer()) 9523 defer nc.Close() 9524 // Shorten wait time for disconnects. 9525 js, err := nc.JetStream(nats.MaxWait(time.Second)) 9526 require_NoError(t, err) 9527 9528 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 9529 Bucket: "TEST", 9530 Replicas: 3, 9531 }) 9532 require_NoError(t, err) 9533 9534 createData := func(n int) []byte { 9535 const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 9536 b := make([]byte, n) 9537 for i := range b { 9538 b[i] = letterBytes[rand.Intn(len(letterBytes))] 9539 } 9540 return b 9541 } 9542 9543 _, err = kv.Create("foo", createData(160)) 9544 require_NoError(t, err) 9545 9546 ch := make(chan struct{}) 9547 wg := sync.WaitGroup{} 9548 9549 // For counting errors that should not happen. 9550 errCh := make(chan error, 1024) 9551 9552 wg.Add(1) 9553 go func() { 9554 defer wg.Done() 9555 9556 var lastData []byte 9557 var revision uint64 9558 9559 for { 9560 select { 9561 case <-ch: 9562 return 9563 default: 9564 k, err := kv.Get("foo") 9565 if err == nats.ErrKeyNotFound { 9566 errCh <- err 9567 } else if k != nil { 9568 if lastData != nil && k.Revision() == revision && !bytes.Equal(lastData, k.Value()) { 9569 errCh <- fmt.Errorf("data loss [%s][rev:%d] expected:[%q] is:[%q]\n", "foo", revision, lastData, k.Value()) 9570 } 9571 newData := createData(160) 9572 if revision, err = kv.Update("foo", newData, k.Revision()); err == nil { 9573 lastData = newData 9574 } 9575 } 9576 } 9577 } 9578 }() 9579 9580 // Wait a short bit. 9581 time.Sleep(2 * time.Second) 9582 for _, s := range c.servers { 9583 s.Shutdown() 9584 // Need to leave servers down for awhile to trigger bug properly. 9585 time.Sleep(5 * time.Second) 9586 s = c.restartServer(s) 9587 c.waitOnServerHealthz(s) 9588 } 9589 9590 // Shutdown the go routine above. 9591 close(ch) 9592 // Wait for it to finish. 
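// The restart sequence above is deliberately rolling: one server goes down,
// stays down long enough to fall behind, comes back, and must pass healthz
// before the next one is touched, so the bucket keeps quorum throughout. A
// condensed sketch of that loop (rollingRestart and downFor are illustrative
// names):
func rollingRestart(c *cluster, downFor time.Duration) {
	for _, s := range c.servers {
		s.Shutdown()
		time.Sleep(downFor)
		s = c.restartServer(s)
		c.waitOnServerHealthz(s)
	}
}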
9593 wg.Wait() 9594 9595 if len(errCh) != 0 { 9596 for err := range errCh { 9597 t.Logf("Received err %v during test", err) 9598 } 9599 t.Fatalf("Encountered errors") 9600 } 9601 } 9602 9603 func TestNoRaceMemStoreCompactPerformance(t *testing.T) { 9604 //Load MemStore so that it is full 9605 subj, msg := "foo", make([]byte, 1000) 9606 storedMsgSize := memStoreMsgSize(subj, nil, msg) 9607 9608 toStore := uint64(10_000) 9609 toStoreOnTop := uint64(1_000) 9610 setSeqNo := uint64(10_000_000_000) 9611 9612 expectedPurge := toStore - 1 9613 maxBytes := storedMsgSize * toStore 9614 9615 ms, err := newMemStore(&StreamConfig{Storage: MemoryStorage, MaxBytes: int64(maxBytes)}) 9616 require_NoError(t, err) 9617 defer ms.Stop() 9618 9619 for i := uint64(0); i < toStore; i++ { 9620 ms.StoreMsg(subj, nil, msg) 9621 } 9622 state := ms.State() 9623 require_Equal(t, toStore, state.Msgs) 9624 require_Equal(t, state.Bytes, storedMsgSize*toStore) 9625 9626 //1st run: Load additional messages then compact 9627 for i := uint64(0); i < toStoreOnTop; i++ { 9628 ms.StoreMsg(subj, nil, msg) 9629 } 9630 startFirstRun := time.Now() 9631 purgedFirstRun, _ := ms.Compact(toStore + toStoreOnTop) 9632 elapsedFirstRun := time.Since(startFirstRun) 9633 require_Equal(t, expectedPurge, purgedFirstRun) 9634 9635 //set the seq number to a very high value by compacting with a too high seq number 9636 purgedFull, _ := ms.Compact(setSeqNo) 9637 require_Equal(t, 1, purgedFull) 9638 9639 //2nd run: Compact again 9640 for i := uint64(0); i < toStore; i++ { 9641 ms.StoreMsg(subj, nil, msg) 9642 } 9643 startSecondRun := time.Now() 9644 purgedSecondRun, _ := ms.Compact(setSeqNo + toStore - 1) 9645 elapsedSecondRun := time.Since(startSecondRun) 9646 require_Equal(t, expectedPurge, purgedSecondRun) 9647 9648 //Calculate delta between runs and fail if it is too high 9649 require_LessThan(t, elapsedSecondRun-elapsedFirstRun, time.Duration(1)*time.Second) 9650 } 9651 9652 func TestNoRaceJetStreamSnapshotsWithSlowAckDontSlowConsumer(t *testing.T) { 9653 s := RunBasicJetStreamServer(t) 9654 defer s.Shutdown() 9655 9656 ech := make(chan error) 9657 ecb := func(_ *nats.Conn, _ *nats.Subscription, err error) { 9658 if err != nil { 9659 ech <- err 9660 } 9661 } 9662 nc, js := jsClientConnect(t, s, nats.ErrorHandler(ecb)) 9663 defer nc.Close() 9664 9665 _, err := js.AddStream(&nats.StreamConfig{ 9666 Name: "TEST", 9667 Subjects: []string{"foo"}, 9668 }) 9669 require_NoError(t, err) 9670 9671 // Put in over 64MB. 9672 msg, toSend := make([]byte, 1024*1024), 80 9673 crand.Read(msg) 9674 9675 for i := 0; i < toSend; i++ { 9676 _, err := js.Publish("foo", msg) 9677 require_NoError(t, err) 9678 } 9679 9680 sreq := &JSApiStreamSnapshotRequest{ 9681 DeliverSubject: nats.NewInbox(), 9682 ChunkSize: 1024 * 1024, 9683 } 9684 req, _ := json.Marshal(sreq) 9685 rmsg, err := nc.Request(fmt.Sprintf(JSApiStreamSnapshotT, "TEST"), req, time.Second) 9686 require_NoError(t, err) 9687 9688 var resp JSApiStreamSnapshotResponse 9689 json.Unmarshal(rmsg.Data, &resp) 9690 require_True(t, resp.Error == nil) 9691 9692 done := make(chan *nats.Msg) 9693 sub, _ := nc.Subscribe(sreq.DeliverSubject, func(m *nats.Msg) { 9694 // EOF 9695 if len(m.Data) == 0 { 9696 done <- m 9697 return 9698 } 9699 }) 9700 defer sub.Unsubscribe() 9701 9702 // Check that we do not get disconnected due to slow consumer. 
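// The error drain near the top of this block ranges over errCh, which only
// terminates once the channel is closed; draining a buffered channel that is
// never closed is safer with a select/default so the test cannot hang after the
// last buffered error. A small sketch of a non-blocking drain (drainErrs is an
// illustrative name):
func drainErrs(t *testing.T, errCh chan error) (n int) {
	t.Helper()
	for {
		select {
		case err := <-errCh:
			t.Logf("Received err %v during test", err)
			n++
		default:
			return n
		}
	}
}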
9703 select { 9704 case msg := <-done: 9705 require_Equal(t, msg.Header.Get("Status"), "408") 9706 require_Equal(t, msg.Header.Get("Description"), "No Flow Response") 9707 case e := <-ech: 9708 t.Fatalf("Got disconnected: %v", e) 9709 case <-time.After(5 * time.Second): 9710 t.Fatalf("Should have received EOF with error status") 9711 } 9712 }
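// The final check above relies on the server reporting the aborted snapshot via
// NATS status headers (408 / "No Flow Response") rather than via a slow
// consumer disconnect. A tiny helper sketch for that kind of header assertion
// (requireStatus is an illustrative name):
func requireStatus(t *testing.T, m *nats.Msg, code, desc string) {
	t.Helper()
	require_Equal(t, m.Header.Get("Status"), code)
	require_Equal(t, m.Header.Get("Description"), desc)
}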