github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/norace_test.go (about) 1 // Copyright 2018-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 //go:build !race && !skip_no_race_tests 15 // +build !race,!skip_no_race_tests 16 17 package server 18 19 import ( 20 "bufio" 21 "bytes" 22 "compress/gzip" 23 "context" 24 "encoding/binary" 25 "encoding/json" 26 "errors" 27 "fmt" 28 "io" 29 "math" 30 "math/rand" 31 "net" 32 "net/http" 33 "net/url" 34 "os" 35 "path/filepath" 36 "reflect" 37 "runtime" 38 "runtime/debug" 39 "sort" 40 "strconv" 41 "strings" 42 "sync" 43 "sync/atomic" 44 "testing" 45 "time" 46 47 "crypto/hmac" 48 crand "crypto/rand" 49 "crypto/sha256" 50 51 "github.com/klauspost/compress/s2" 52 "github.com/nats-io/jwt/v2" 53 "github.com/nats-io/nats-server/v2/server/avl" 54 "github.com/nats-io/nats.go" 55 "github.com/nats-io/nkeys" 56 "github.com/nats-io/nuid" 57 ) 58 59 // IMPORTANT: Tests in this file are not executed when running with the -race flag. 60 // The test name should be prefixed with TestNoRace so we can run only 61 // those tests: go test -run=TestNoRace ... 62 63 func TestNoRaceAvoidSlowConsumerBigMessages(t *testing.T) { 64 opts := DefaultOptions() // Use defaults to make sure they avoid pending slow consumer. 65 opts.NoSystemAccount = true 66 s := RunServer(opts) 67 defer s.Shutdown() 68 69 nc1, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port)) 70 if err != nil { 71 t.Fatalf("Error on connect: %v", err) 72 } 73 defer nc1.Close() 74 75 nc2, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port)) 76 if err != nil { 77 t.Fatalf("Error on connect: %v", err) 78 } 79 defer nc2.Close() 80 81 data := make([]byte, 1024*1024) // 1MB payload 82 crand.Read(data) 83 84 expected := int32(500) 85 received := int32(0) 86 87 done := make(chan bool) 88 89 // Create Subscription. 90 nc1.Subscribe("slow.consumer", func(m *nats.Msg) { 91 // Just eat it so that we are not measuring 92 // code time, just delivery. 
93			atomic.AddInt32(&received, 1)
94			if received >= expected {
95				done <- true
96			}
97		})
98	
99		// Create Error handler
100		nc1.SetErrorHandler(func(c *nats.Conn, s *nats.Subscription, err error) {
101			t.Fatalf("Received an error on the subscription's connection: %v\n", err)
102		})
103	
104		nc1.Flush()
105	
106		for i := 0; i < int(expected); i++ {
107			nc2.Publish("slow.consumer", data)
108		}
109		nc2.Flush()
110	
111		select {
112		case <-done:
113			return
114		case <-time.After(10 * time.Second):
115			r := atomic.LoadInt32(&received)
116			if s.NumSlowConsumers() > 0 {
117				t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", r, expected)
118			}
119			t.Fatalf("Failed to receive all large messages: %d of %d\n", r, expected)
120		}
121	}
122	
123	func TestNoRaceRoutedQueueAutoUnsubscribe(t *testing.T) {
124		optsA, err := ProcessConfigFile("./configs/seed.conf")
125		require_NoError(t, err)
126		optsA.NoSigs, optsA.NoLog = true, true
127		optsA.NoSystemAccount = true
128		srvA := RunServer(optsA)
129		defer srvA.Shutdown()
130	
131		srvARouteURL := fmt.Sprintf("nats://%s:%d", optsA.Cluster.Host, srvA.ClusterAddr().Port)
132		optsB := nextServerOpts(optsA)
133		optsB.Routes = RoutesFromStr(srvARouteURL)
134	
135		srvB := RunServer(optsB)
136		defer srvB.Shutdown()
137	
138		// Wait for these 2 to connect to each other
139		checkClusterFormed(t, srvA, srvB)
140	
141		// Have a client connection to each server
142		ncA, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsA.Host, optsA.Port))
143		if err != nil {
144			t.Fatalf("Error on connect: %v", err)
145		}
146		defer ncA.Close()
147	
148		ncB, err := nats.Connect(fmt.Sprintf("nats://%s:%d", optsB.Host, optsB.Port))
149		if err != nil {
150			t.Fatalf("Error on connect: %v", err)
151		}
152		defer ncB.Close()
153	
154		rbar := int32(0)
155		barCb := func(m *nats.Msg) {
156			atomic.AddInt32(&rbar, 1)
157		}
158		rbaz := int32(0)
159		bazCb := func(m *nats.Msg) {
160			atomic.AddInt32(&rbaz, 1)
161		}
162	
163		// Create 100 queue subs with auto-unsubscribe to each server for
164		// group bar and group baz. So 200 total per queue group.
165		cons := []*nats.Conn{ncA, ncB}
166		for _, c := range cons {
167			for i := 0; i < 100; i++ {
168				qsub, err := c.QueueSubscribe("foo", "bar", barCb)
169				if err != nil {
170					t.Fatalf("Error on subscribe: %v", err)
171				}
172				if err := qsub.AutoUnsubscribe(1); err != nil {
173					t.Fatalf("Error on auto-unsubscribe: %v", err)
174				}
175				qsub, err = c.QueueSubscribe("foo", "baz", bazCb)
176				if err != nil {
177					t.Fatalf("Error on subscribe: %v", err)
178				}
179				if err := qsub.AutoUnsubscribe(1); err != nil {
180					t.Fatalf("Error on auto-unsubscribe: %v", err)
181				}
182			}
183			c.Subscribe("TEST.COMPLETE", func(m *nats.Msg) {})
184		}
185	
186		// We coalesce now so for each server we will have all local (200) plus
187		// two from the remote side for each queue group. We also create one more
188		// and will wait until each server has 204 subscriptions, which will make sure
189		// that we have everything set up.
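	// (Spelling out the 204 above: 200 local queue subs plus the local
	// TEST.COMPLETE sub gives 201 per server, and the route from the other
	// server should contribute one coalesced sub per queue group plus one
	// for TEST.COMPLETE, which is the remaining 3.)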
190 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 191 subsA := srvA.NumSubscriptions() 192 subsB := srvB.NumSubscriptions() 193 if subsA != 204 || subsB != 204 { 194 return fmt.Errorf("Not all subs processed yet: %d and %d", subsA, subsB) 195 } 196 return nil 197 }) 198 199 expected := int32(200) 200 // Now send messages from each server 201 for i := int32(0); i < expected; i++ { 202 c := cons[i%2] 203 c.Publish("foo", []byte("Don't Drop Me!")) 204 } 205 for _, c := range cons { 206 c.Flush() 207 } 208 209 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 210 nbar := atomic.LoadInt32(&rbar) 211 nbaz := atomic.LoadInt32(&rbaz) 212 if nbar == expected && nbaz == expected { 213 return nil 214 } 215 return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'", 216 expected, atomic.LoadInt32(&rbar), atomic.LoadInt32(&rbaz)) 217 }) 218 } 219 220 func TestNoRaceClosedSlowConsumerWriteDeadline(t *testing.T) { 221 opts := DefaultOptions() 222 opts.NoSystemAccount = true 223 opts.WriteDeadline = 10 * time.Millisecond // Make very small to trip. 224 opts.MaxPending = 500 * 1024 * 1024 // Set high so it will not trip here. 225 s := RunServer(opts) 226 defer s.Shutdown() 227 228 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 229 if err != nil { 230 t.Fatalf("Error on connect: %v", err) 231 } 232 defer c.Close() 233 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 234 t.Fatalf("Error sending protocols to server: %v", err) 235 } 236 // Reduce socket buffer to increase reliability of data backing up in the server destined 237 // for our subscribed client. 238 c.(*net.TCPConn).SetReadBuffer(128) 239 240 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 241 sender, err := nats.Connect(url) 242 if err != nil { 243 t.Fatalf("Error on connect: %v", err) 244 } 245 defer sender.Close() 246 247 payload := make([]byte, 1024*1024) 248 for i := 0; i < 100; i++ { 249 if err := sender.Publish("foo", payload); err != nil { 250 t.Fatalf("Error on publish: %v", err) 251 } 252 } 253 254 // Flush sender connection to ensure that all data has been sent. 255 if err := sender.Flush(); err != nil { 256 t.Fatalf("Error on flush: %v", err) 257 } 258 259 // At this point server should have closed connection c. 260 checkClosedConns(t, s, 1, 2*time.Second) 261 conns := s.closedClients() 262 if lc := len(conns); lc != 1 { 263 t.Fatalf("len(conns) expected to be %d, got %d\n", 1, lc) 264 } 265 checkReason(t, conns[0].Reason, SlowConsumerWriteDeadline) 266 } 267 268 func TestNoRaceClosedSlowConsumerPendingBytes(t *testing.T) { 269 opts := DefaultOptions() 270 opts.NoSystemAccount = true 271 opts.WriteDeadline = 30 * time.Second // Wait for long time so write deadline does not trigger slow consumer. 272 opts.MaxPending = 1 * 1024 * 1024 // Set to low value (1MB) to allow SC to trip. 273 s := RunServer(opts) 274 defer s.Shutdown() 275 276 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 277 if err != nil { 278 t.Fatalf("Error on connect: %v", err) 279 } 280 defer c.Close() 281 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 282 t.Fatalf("Error sending protocols to server: %v", err) 283 } 284 // Reduce socket buffer to increase reliability of data backing up in the server destined 285 // for our subscribed client. 
286 c.(*net.TCPConn).SetReadBuffer(128) 287 288 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 289 sender, err := nats.Connect(url) 290 if err != nil { 291 t.Fatalf("Error on connect: %v", err) 292 } 293 defer sender.Close() 294 295 payload := make([]byte, 1024*1024) 296 for i := 0; i < 100; i++ { 297 if err := sender.Publish("foo", payload); err != nil { 298 t.Fatalf("Error on publish: %v", err) 299 } 300 } 301 302 // Flush sender connection to ensure that all data has been sent. 303 if err := sender.Flush(); err != nil { 304 t.Fatalf("Error on flush: %v", err) 305 } 306 307 // At this point server should have closed connection c. 308 checkClosedConns(t, s, 1, 2*time.Second) 309 conns := s.closedClients() 310 if lc := len(conns); lc != 1 { 311 t.Fatalf("len(conns) expected to be %d, got %d\n", 1, lc) 312 } 313 checkReason(t, conns[0].Reason, SlowConsumerPendingBytes) 314 } 315 316 func TestNoRaceSlowConsumerPendingBytes(t *testing.T) { 317 opts := DefaultOptions() 318 opts.NoSystemAccount = true 319 opts.WriteDeadline = 30 * time.Second // Wait for long time so write deadline does not trigger slow consumer. 320 opts.MaxPending = 1 * 1024 * 1024 // Set to low value (1MB) to allow SC to trip. 321 s := RunServer(opts) 322 defer s.Shutdown() 323 324 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 325 if err != nil { 326 t.Fatalf("Error on connect: %v", err) 327 } 328 defer c.Close() 329 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 330 t.Fatalf("Error sending protocols to server: %v", err) 331 } 332 // Reduce socket buffer to increase reliability of data backing up in the server destined 333 // for our subscribed client. 334 c.(*net.TCPConn).SetReadBuffer(128) 335 336 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 337 sender, err := nats.Connect(url) 338 if err != nil { 339 t.Fatalf("Error on connect: %v", err) 340 } 341 defer sender.Close() 342 343 payload := make([]byte, 1024*1024) 344 for i := 0; i < 100; i++ { 345 if err := sender.Publish("foo", payload); err != nil { 346 t.Fatalf("Error on publish: %v", err) 347 } 348 } 349 350 // Flush sender connection to ensure that all data has been sent. 351 if err := sender.Flush(); err != nil { 352 t.Fatalf("Error on flush: %v", err) 353 } 354 355 // At this point server should have closed connection c. 356 357 // On certain platforms, it may take more than one call before 358 // getting the error. 359 for i := 0; i < 100; i++ { 360 if _, err := c.Write([]byte("PUB bar 5\r\nhello\r\n")); err != nil { 361 // ok 362 return 363 } 364 } 365 t.Fatal("Connection should have been closed") 366 } 367 368 func TestNoRaceGatewayNoMissingReplies(t *testing.T) { 369 // This test will have following setup: 370 // 371 // responder1 requestor 372 // | | 373 // v v 374 // [A1]<-------gw------------[B1] 375 // | \ | 376 // | \______gw__________ | route 377 // | _\| | 378 // [ ]--------gw----------->[ ] 379 // [A2]<-------gw------------[B2] 380 // [ ] [ ] 381 // ^ 382 // | 383 // responder2 384 // 385 // There is a possible race that when the requestor creates 386 // a subscription on the reply subject, the subject interest 387 // being sent from the inbound gateway, and B1 having none, 388 // the SUB first goes to B2 before being sent to A1 from 389 // B2's inbound GW. But the request can go from B1 to A1 390 // right away and the responder1 connecting to A1 may send 391 // back the reply before the interest on the reply makes it 392 // to A1 (from B2). 
393 // This test will also verify that if the responder is instead 394 // connected to A2, the reply is properly received by requestor 395 // on B1. 396 397 // For this test we want to be in interestOnly mode, so 398 // make it happen quickly 399 gatewayMaxRUnsubBeforeSwitch = 1 400 defer func() { gatewayMaxRUnsubBeforeSwitch = defaultGatewayMaxRUnsubBeforeSwitch }() 401 402 // Start with setting up A2 and B2. 403 ob2 := testDefaultOptionsForGateway("B") 404 sb2 := runGatewayServer(ob2) 405 defer sb2.Shutdown() 406 407 oa2 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2) 408 sa2 := runGatewayServer(oa2) 409 defer sa2.Shutdown() 410 411 waitForOutboundGateways(t, sa2, 1, time.Second) 412 waitForInboundGateways(t, sa2, 1, time.Second) 413 waitForOutboundGateways(t, sb2, 1, time.Second) 414 waitForInboundGateways(t, sb2, 1, time.Second) 415 416 // Now start A1 which will connect to B2 417 oa1 := testGatewayOptionsFromToWithServers(t, "A", "B", sb2) 418 oa1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa2.Cluster.Host, oa2.Cluster.Port)) 419 sa1 := runGatewayServer(oa1) 420 defer sa1.Shutdown() 421 422 waitForOutboundGateways(t, sa1, 1, time.Second) 423 waitForInboundGateways(t, sb2, 2, time.Second) 424 425 checkClusterFormed(t, sa1, sa2) 426 427 // Finally, start B1 that will connect to A1. 428 ob1 := testGatewayOptionsFromToWithServers(t, "B", "A", sa1) 429 ob1.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", ob2.Cluster.Host, ob2.Cluster.Port)) 430 sb1 := runGatewayServer(ob1) 431 defer sb1.Shutdown() 432 433 // Check that we have the outbound gateway from B1 to A1 434 checkFor(t, 3*time.Second, 15*time.Millisecond, func() error { 435 c := sb1.getOutboundGatewayConnection("A") 436 if c == nil { 437 return fmt.Errorf("Outbound connection to A not created yet") 438 } 439 c.mu.Lock() 440 name := c.opts.Name 441 nc := c.nc 442 c.mu.Unlock() 443 if name != sa1.ID() { 444 // Force a disconnect 445 nc.Close() 446 return fmt.Errorf("Was unable to have B1 connect to A1") 447 } 448 return nil 449 }) 450 451 waitForInboundGateways(t, sa1, 1, time.Second) 452 checkClusterFormed(t, sb1, sb2) 453 454 a1URL := fmt.Sprintf("nats://%s:%d", oa1.Host, oa1.Port) 455 a2URL := fmt.Sprintf("nats://%s:%d", oa2.Host, oa2.Port) 456 b1URL := fmt.Sprintf("nats://%s:%d", ob1.Host, ob1.Port) 457 b2URL := fmt.Sprintf("nats://%s:%d", ob2.Host, ob2.Port) 458 459 ncb1 := natsConnect(t, b1URL) 460 defer ncb1.Close() 461 462 ncb2 := natsConnect(t, b2URL) 463 defer ncb2.Close() 464 465 natsSubSync(t, ncb1, "just.a.sub") 466 natsSubSync(t, ncb2, "just.a.sub") 467 checkExpectedSubs(t, 2, sb1, sb2) 468 469 // For this test, we want A to be checking B's interest in order 470 // to send messages (which would cause replies to be dropped if 471 // there is no interest registered on A). So from A servers, 472 // send to various subjects and cause B's to switch to interestOnly 473 // mode. 
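	// (gatewayMaxRUnsubBeforeSwitch was lowered to 1 above, so publishing to
	// just a couple of subjects with no interest on B should be enough to
	// make A's outbound gateways flip the account to InterestOnly; the ten
	// "reject.*" publishes per server below are simply extra margin.)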
474 nca1 := natsConnect(t, a1URL) 475 defer nca1.Close() 476 for i := 0; i < 10; i++ { 477 natsPub(t, nca1, fmt.Sprintf("reject.%d", i), []byte("hello")) 478 } 479 nca2 := natsConnect(t, a2URL) 480 defer nca2.Close() 481 for i := 0; i < 10; i++ { 482 natsPub(t, nca2, fmt.Sprintf("reject.%d", i), []byte("hello")) 483 } 484 485 checkSwitchedMode := func(t *testing.T, s *Server) { 486 t.Helper() 487 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 488 var switchedMode bool 489 c := s.getOutboundGatewayConnection("B") 490 ei, _ := c.gw.outsim.Load(globalAccountName) 491 if ei != nil { 492 e := ei.(*outsie) 493 e.RLock() 494 switchedMode = e.ni == nil && e.mode == InterestOnly 495 e.RUnlock() 496 } 497 if !switchedMode { 498 return fmt.Errorf("Still not switched mode") 499 } 500 return nil 501 }) 502 } 503 checkSwitchedMode(t, sa1) 504 checkSwitchedMode(t, sa2) 505 506 // Setup a subscriber on _INBOX.> on each of A's servers. 507 total := 1000 508 expected := int32(total) 509 rcvOnA := int32(0) 510 qrcvOnA := int32(0) 511 natsSub(t, nca1, "myreply.>", func(_ *nats.Msg) { 512 atomic.AddInt32(&rcvOnA, 1) 513 }) 514 natsQueueSub(t, nca2, "myreply.>", "bar", func(_ *nats.Msg) { 515 atomic.AddInt32(&qrcvOnA, 1) 516 }) 517 checkExpectedSubs(t, 2, sa1, sa2) 518 519 // Ok.. so now we will run the actual test where we 520 // create a responder on A1 and make sure that every 521 // single request from B1 gets the reply. Will repeat 522 // test with responder connected to A2. 523 sendReqs := func(t *testing.T, subConn *nats.Conn) { 524 t.Helper() 525 responder := natsSub(t, subConn, "foo", func(m *nats.Msg) { 526 m.Respond([]byte("reply")) 527 }) 528 natsFlush(t, subConn) 529 checkExpectedSubs(t, 3, sa1, sa2) 530 531 // We are not going to use Request() because this sets 532 // a wildcard subscription on an INBOX and less likely 533 // to produce the race. Instead we will explicitly set 534 // the subscription on the reply subject and create one 535 // per request. 536 for i := 0; i < total/2; i++ { 537 reply := fmt.Sprintf("myreply.%d", i) 538 replySub := natsQueueSubSync(t, ncb1, reply, "bar") 539 natsFlush(t, ncb1) 540 541 // Let's make sure we have interest on B2. 542 if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 { 543 checkFor(t, time.Second, time.Millisecond, func() error { 544 if r := sb2.globalAccount().sl.Match(reply); len(r.qsubs) == 0 { 545 return fmt.Errorf("B still not registered interest on %s", reply) 546 } 547 return nil 548 }) 549 } 550 natsPubReq(t, ncb1, "foo", reply, []byte("request")) 551 if _, err := replySub.NextMsg(time.Second); err != nil { 552 t.Fatalf("Did not receive reply: %v", err) 553 } 554 natsUnsub(t, replySub) 555 } 556 557 responder.Unsubscribe() 558 natsFlush(t, subConn) 559 checkExpectedSubs(t, 2, sa1, sa2) 560 } 561 sendReqs(t, nca1) 562 sendReqs(t, nca2) 563 564 checkFor(t, time.Second, 15*time.Millisecond, func() error { 565 if n := atomic.LoadInt32(&rcvOnA); n != expected { 566 return fmt.Errorf("Subs on A expected to get %v replies, got %v", expected, n) 567 } 568 return nil 569 }) 570 571 // We should not have received a single message on the queue sub 572 // on cluster A because messages will have been delivered to 573 // the member on cluster B. 
574 if n := atomic.LoadInt32(&qrcvOnA); n != 0 { 575 t.Fatalf("Queue sub on A should not have received message, got %v", n) 576 } 577 } 578 579 func TestNoRaceRouteMemUsage(t *testing.T) { 580 oa := DefaultOptions() 581 sa := RunServer(oa) 582 defer sa.Shutdown() 583 584 ob := DefaultOptions() 585 ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port)) 586 sb := RunServer(ob) 587 defer sb.Shutdown() 588 589 checkClusterFormed(t, sa, sb) 590 591 responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port)) 592 defer responder.Close() 593 for i := 0; i < 10; i++ { 594 natsSub(t, responder, "foo", func(m *nats.Msg) { 595 m.Respond(m.Data) 596 }) 597 } 598 natsFlush(t, responder) 599 600 payload := make([]byte, 50*1024) 601 602 bURL := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 603 604 // Capture mem usage 605 mem := runtime.MemStats{} 606 runtime.ReadMemStats(&mem) 607 inUseBefore := mem.HeapInuse 608 609 for i := 0; i < 100; i++ { 610 requestor := natsConnect(t, bURL) 611 // Don't use a defer here otherwise that will make the memory check fail! 612 // We are closing the connection just after these few instructions that 613 // are not calling t.Fatal() anyway. 614 inbox := nats.NewInbox() 615 sub := natsSubSync(t, requestor, inbox) 616 natsPubReq(t, requestor, "foo", inbox, payload) 617 for j := 0; j < 10; j++ { 618 natsNexMsg(t, sub, time.Second) 619 } 620 requestor.Close() 621 } 622 623 runtime.GC() 624 debug.FreeOSMemory() 625 runtime.ReadMemStats(&mem) 626 inUseNow := mem.HeapInuse 627 if inUseNow > 3*inUseBefore { 628 t.Fatalf("Heap in-use before was %v, now %v: too high", inUseBefore, inUseNow) 629 } 630 } 631 632 func TestNoRaceRouteCache(t *testing.T) { 633 maxPerAccountCacheSize = 20 634 prunePerAccountCacheSize = 5 635 closedSubsCheckInterval = 250 * time.Millisecond 636 637 defer func() { 638 maxPerAccountCacheSize = defaultMaxPerAccountCacheSize 639 prunePerAccountCacheSize = defaultPrunePerAccountCacheSize 640 closedSubsCheckInterval = defaultClosedSubsCheckInterval 641 }() 642 643 for _, test := range []struct { 644 name string 645 useQueue bool 646 }{ 647 {"plain_sub", false}, 648 {"queue_sub", true}, 649 } { 650 t.Run(test.name, func(t *testing.T) { 651 652 oa := DefaultOptions() 653 oa.NoSystemAccount = true 654 oa.Cluster.PoolSize = -1 655 sa := RunServer(oa) 656 defer sa.Shutdown() 657 658 ob := DefaultOptions() 659 ob.NoSystemAccount = true 660 ob.Cluster.PoolSize = -1 661 ob.Routes = RoutesFromStr(fmt.Sprintf("nats://%s:%d", oa.Cluster.Host, oa.Cluster.Port)) 662 sb := RunServer(ob) 663 defer sb.Shutdown() 664 665 checkClusterFormed(t, sa, sb) 666 667 responder := natsConnect(t, fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port)) 668 defer responder.Close() 669 natsSub(t, responder, "foo", func(m *nats.Msg) { 670 m.Respond(m.Data) 671 }) 672 natsFlush(t, responder) 673 674 checkExpectedSubs(t, 1, sa) 675 checkExpectedSubs(t, 1, sb) 676 677 bURL := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 678 requestor := natsConnect(t, bURL) 679 defer requestor.Close() 680 681 ch := make(chan struct{}, 1) 682 cb := func(_ *nats.Msg) { 683 select { 684 case ch <- struct{}{}: 685 default: 686 } 687 } 688 689 sendReqs := func(t *testing.T, nc *nats.Conn, count int, unsub bool) { 690 t.Helper() 691 for i := 0; i < count; i++ { 692 inbox := nats.NewInbox() 693 var sub *nats.Subscription 694 if test.useQueue { 695 sub = natsQueueSub(t, nc, inbox, "queue", cb) 696 } else { 697 sub = natsSub(t, nc, inbox, cb) 698 } 699 natsPubReq(t, nc, "foo", 
inbox, []byte("hello")) 700 select { 701 case <-ch: 702 case <-time.After(time.Second): 703 t.Fatalf("Failed to get reply") 704 } 705 if unsub { 706 natsUnsub(t, sub) 707 } 708 } 709 } 710 sendReqs(t, requestor, maxPerAccountCacheSize+1, true) 711 712 var route *client 713 sb.mu.Lock() 714 route = getFirstRoute(sb) 715 sb.mu.Unlock() 716 717 checkExpected := func(t *testing.T, expected int) { 718 t.Helper() 719 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 720 route.mu.Lock() 721 n := len(route.in.pacache) 722 route.mu.Unlock() 723 if n != expected { 724 return fmt.Errorf("Expected %v subs in the cache, got %v", expected, n) 725 } 726 return nil 727 }) 728 } 729 checkExpected(t, (maxPerAccountCacheSize+1)-(prunePerAccountCacheSize+1)) 730 731 // Wait for more than the orphan check 732 time.Sleep(2 * closedSubsCheckInterval) 733 734 // Add a new subs up to point where new prune would occur 735 sendReqs(t, requestor, prunePerAccountCacheSize+1, false) 736 737 // Now closed subs should have been removed, so expected 738 // subs in the cache should be the new ones. 739 checkExpected(t, prunePerAccountCacheSize+1) 740 741 // Now try wil implicit unsubscribe (due to connection close) 742 sendReqs(t, requestor, maxPerAccountCacheSize+1, false) 743 requestor.Close() 744 745 checkExpected(t, maxPerAccountCacheSize-prunePerAccountCacheSize) 746 747 // Wait for more than the orphan check 748 time.Sleep(2 * closedSubsCheckInterval) 749 750 // Now create new connection and send prunePerAccountCacheSize+1 751 // and that should cause all subs from previous connection to be 752 // removed from cache 753 requestor = natsConnect(t, bURL) 754 defer requestor.Close() 755 756 sendReqs(t, requestor, prunePerAccountCacheSize+1, false) 757 checkExpected(t, prunePerAccountCacheSize+1) 758 }) 759 } 760 } 761 762 func TestNoRaceFetchAccountDoesNotRegisterAccountTwice(t *testing.T) { 763 sa, oa, sb, ob, _ := runTrustedGateways(t) 764 defer sa.Shutdown() 765 defer sb.Shutdown() 766 767 // Let's create a user account. 768 okp, _ := nkeys.FromSeed(oSeed) 769 akp, _ := nkeys.CreateAccount() 770 pub, _ := akp.PublicKey() 771 nac := jwt.NewAccountClaims(pub) 772 jwt, _ := nac.Encode(okp) 773 userAcc := pub 774 775 // Replace B's account resolver with one that introduces 776 // delay during the Fetch() 777 sac := &slowAccResolver{AccountResolver: sb.AccountResolver()} 778 sb.SetAccountResolver(sac) 779 780 // Add the account in sa and sb 781 addAccountToMemResolver(sa, userAcc, jwt) 782 addAccountToMemResolver(sb, userAcc, jwt) 783 784 // Tell the slow account resolver which account to slow down 785 sac.Lock() 786 sac.acc = userAcc 787 sac.Unlock() 788 789 urlA := fmt.Sprintf("nats://%s:%d", oa.Host, oa.Port) 790 urlB := fmt.Sprintf("nats://%s:%d", ob.Host, ob.Port) 791 792 nca, err := nats.Connect(urlA, createUserCreds(t, sa, akp)) 793 if err != nil { 794 t.Fatalf("Error connecting to A: %v", err) 795 } 796 defer nca.Close() 797 798 // Since there is an optimistic send, this message will go to B 799 // and on processing this message, B will lookup/fetch this 800 // account, which can produce race with the fetch of this 801 // account from A's system account that sent a notification 802 // about this account, or with the client connect just after 803 // that. 
804 nca.Publish("foo", []byte("hello")) 805 806 // Now connect and create a subscription on B 807 ncb, err := nats.Connect(urlB, createUserCreds(t, sb, akp)) 808 if err != nil { 809 t.Fatalf("Error connecting to A: %v", err) 810 } 811 defer ncb.Close() 812 sub, err := ncb.SubscribeSync("foo") 813 if err != nil { 814 t.Fatalf("Error on subscribe: %v", err) 815 } 816 ncb.Flush() 817 818 // Now send messages from A and B should ultimately start to receive 819 // them (once the subscription has been correctly registered) 820 ok := false 821 for i := 0; i < 10; i++ { 822 nca.Publish("foo", []byte("hello")) 823 if _, err := sub.NextMsg(100 * time.Millisecond); err != nil { 824 continue 825 } 826 ok = true 827 break 828 } 829 if !ok { 830 t.Fatalf("B should be able to receive messages") 831 } 832 833 checkTmpAccounts := func(t *testing.T, s *Server) { 834 t.Helper() 835 empty := true 836 s.tmpAccounts.Range(func(_, _ any) bool { 837 empty = false 838 return false 839 }) 840 if !empty { 841 t.Fatalf("tmpAccounts is not empty") 842 } 843 } 844 checkTmpAccounts(t, sa) 845 checkTmpAccounts(t, sb) 846 } 847 848 func TestNoRaceWriteDeadline(t *testing.T) { 849 opts := DefaultOptions() 850 opts.NoSystemAccount = true 851 opts.WriteDeadline = 30 * time.Millisecond 852 s := RunServer(opts) 853 defer s.Shutdown() 854 855 c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%d", opts.Host, opts.Port), 3*time.Second) 856 if err != nil { 857 t.Fatalf("Error on connect: %v", err) 858 } 859 defer c.Close() 860 if _, err := c.Write([]byte("CONNECT {}\r\nPING\r\nSUB foo 1\r\n")); err != nil { 861 t.Fatalf("Error sending protocols to server: %v", err) 862 } 863 // Reduce socket buffer to increase reliability of getting 864 // write deadline errors. 865 c.(*net.TCPConn).SetReadBuffer(4) 866 867 url := fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port) 868 sender, err := nats.Connect(url) 869 if err != nil { 870 t.Fatalf("Error on connect: %v", err) 871 } 872 defer sender.Close() 873 874 payload := make([]byte, 1000000) 875 total := 1000 876 for i := 0; i < total; i++ { 877 if err := sender.Publish("foo", payload); err != nil { 878 t.Fatalf("Error on publish: %v", err) 879 } 880 } 881 // Flush sender connection to ensure that all data has been sent. 882 if err := sender.Flush(); err != nil { 883 t.Fatalf("Error on flush: %v", err) 884 } 885 886 // At this point server should have closed connection c. 887 888 // On certain platforms, it may take more than one call before 889 // getting the error. 
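	// (The first writes may still appear to succeed because the OS buffers
	// them locally; the error typically only surfaces on a later write, once
	// TCP has reported the peer's close/reset, hence the loop below.)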
890		for i := 0; i < 100; i++ {
891			if _, err := c.Write([]byte("PUB bar 5\r\nhello\r\n")); err != nil {
892				// ok
893				return
894			}
895		}
896		t.Fatal("Connection should have been closed")
897	}
898	
899	func TestNoRaceLeafNodeClusterNameConflictDeadlock(t *testing.T) {
900		o := DefaultOptions()
901		o.LeafNode.Port = -1
902		s := RunServer(o)
903		defer s.Shutdown()
904	
905		u, err := url.Parse(fmt.Sprintf("nats://127.0.0.1:%d", o.LeafNode.Port))
906		if err != nil {
907			t.Fatalf("Error parsing url: %v", err)
908		}
909	
910		o1 := DefaultOptions()
911		o1.ServerName = "A1"
912		o1.Cluster.Name = "clusterA"
913		o1.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
914		s1 := RunServer(o1)
915		defer s1.Shutdown()
916	
917		checkLeafNodeConnected(t, s1)
918	
919		o2 := DefaultOptions()
920		o2.ServerName = "A2"
921		o2.Cluster.Name = "clusterA"
922		o2.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", o1.Cluster.Port))
923		o2.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
924		s2 := RunServer(o2)
925		defer s2.Shutdown()
926	
927		checkLeafNodeConnected(t, s2)
928		checkClusterFormed(t, s1, s2)
929	
930		o3 := DefaultOptions()
931		o3.ServerName = "A3"
932		o3.Cluster.Name = "" // intentionally not set
933		o3.Routes = RoutesFromStr(fmt.Sprintf("nats://127.0.0.1:%d", o1.Cluster.Port))
934		o3.LeafNode.Remotes = []*RemoteLeafOpts{{URLs: []*url.URL{u}}}
935		s3 := RunServer(o3)
936		defer s3.Shutdown()
937	
938		checkLeafNodeConnected(t, s3)
939		checkClusterFormed(t, s1, s2, s3)
940	}
941	
942	// This test is the same as TestAccountAddServiceImportRace but, running
943	// without the -race flag, it can more easily catch the possible
944	// duplicate sid, which results in fewer subscriptions than expected
945	// in the account's internal subscriptions map.
946	func TestNoRaceAccountAddServiceImportRace(t *testing.T) {
947		TestAccountAddServiceImportRace(t)
948	}
949	
950	// Similar to the routed version. Make sure we receive all of the
951	// messages with auto-unsubscribe enabled.
952	func TestNoRaceQueueAutoUnsubscribe(t *testing.T) {
953		opts := DefaultOptions()
954		s := RunServer(opts)
955		defer s.Shutdown()
956	
957		nc, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port))
958		if err != nil {
959			t.Fatalf("Error on connect: %v", err)
960		}
961		defer nc.Close()
962	
963		rbar := int32(0)
964		barCb := func(m *nats.Msg) {
965			atomic.AddInt32(&rbar, 1)
966		}
967		rbaz := int32(0)
968		bazCb := func(m *nats.Msg) {
969			atomic.AddInt32(&rbaz, 1)
970		}
971	
972		// Create 1000 queue subscriptions per group with auto-unsubscribe of 1.
973		// Do two groups, one bar and one baz.
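	// (Each queue sub auto-unsubscribes after one message, so every publish
	// below should be consumed by exactly one member of "bar" and one member
	// of "baz", and both counters are expected to reach the full total.)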
974 total := 1000 975 for i := 0; i < total; i++ { 976 qsub, err := nc.QueueSubscribe("foo", "bar", barCb) 977 if err != nil { 978 t.Fatalf("Error on subscribe: %v", err) 979 } 980 if err := qsub.AutoUnsubscribe(1); err != nil { 981 t.Fatalf("Error on auto-unsubscribe: %v", err) 982 } 983 qsub, err = nc.QueueSubscribe("foo", "baz", bazCb) 984 if err != nil { 985 t.Fatalf("Error on subscribe: %v", err) 986 } 987 if err := qsub.AutoUnsubscribe(1); err != nil { 988 t.Fatalf("Error on auto-unsubscribe: %v", err) 989 } 990 } 991 nc.Flush() 992 993 expected := int32(total) 994 for i := int32(0); i < expected; i++ { 995 nc.Publish("foo", []byte("Don't Drop Me!")) 996 } 997 nc.Flush() 998 999 checkFor(t, 5*time.Second, 10*time.Millisecond, func() error { 1000 nbar := atomic.LoadInt32(&rbar) 1001 nbaz := atomic.LoadInt32(&rbaz) 1002 if nbar == expected && nbaz == expected { 1003 return nil 1004 } 1005 return fmt.Errorf("Did not receive all %d queue messages, received %d for 'bar' and %d for 'baz'", 1006 expected, atomic.LoadInt32(&rbar), atomic.LoadInt32(&rbaz)) 1007 }) 1008 } 1009 1010 func TestNoRaceAcceptLoopsDoNotLeaveOpenedConn(t *testing.T) { 1011 for _, test := range []struct { 1012 name string 1013 url func(o *Options) (string, int) 1014 }{ 1015 {"client", func(o *Options) (string, int) { return o.Host, o.Port }}, 1016 {"route", func(o *Options) (string, int) { return o.Cluster.Host, o.Cluster.Port }}, 1017 {"gateway", func(o *Options) (string, int) { return o.Gateway.Host, o.Gateway.Port }}, 1018 {"leafnode", func(o *Options) (string, int) { return o.LeafNode.Host, o.LeafNode.Port }}, 1019 {"websocket", func(o *Options) (string, int) { return o.Websocket.Host, o.Websocket.Port }}, 1020 } { 1021 t.Run(test.name, func(t *testing.T) { 1022 o := DefaultOptions() 1023 o.DisableShortFirstPing = true 1024 o.Accounts = []*Account{NewAccount("$SYS")} 1025 o.SystemAccount = "$SYS" 1026 o.Cluster.Name = "abc" 1027 o.Cluster.Host = "127.0.0.1" 1028 o.Cluster.Port = -1 1029 o.Gateway.Name = "abc" 1030 o.Gateway.Host = "127.0.0.1" 1031 o.Gateway.Port = -1 1032 o.LeafNode.Host = "127.0.0.1" 1033 o.LeafNode.Port = -1 1034 o.Websocket.Host = "127.0.0.1" 1035 o.Websocket.Port = -1 1036 o.Websocket.HandshakeTimeout = 1 1037 o.Websocket.NoTLS = true 1038 s := RunServer(o) 1039 defer s.Shutdown() 1040 1041 host, port := test.url(o) 1042 url := fmt.Sprintf("%s:%d", host, port) 1043 var conns []net.Conn 1044 1045 wg := sync.WaitGroup{} 1046 wg.Add(1) 1047 done := make(chan struct{}, 1) 1048 go func() { 1049 defer wg.Done() 1050 // Have an upper limit 1051 for i := 0; i < 200; i++ { 1052 c, err := net.Dial("tcp", url) 1053 if err != nil { 1054 return 1055 } 1056 conns = append(conns, c) 1057 select { 1058 case <-done: 1059 return 1060 default: 1061 } 1062 } 1063 }() 1064 time.Sleep(15 * time.Millisecond) 1065 s.Shutdown() 1066 close(done) 1067 wg.Wait() 1068 for _, c := range conns { 1069 c.SetReadDeadline(time.Now().Add(2 * time.Second)) 1070 br := bufio.NewReader(c) 1071 // Read INFO for connections that were accepted 1072 _, _, err := br.ReadLine() 1073 if err == nil { 1074 // After that, the connection should be closed, 1075 // so we should get an error here. 1076 _, _, err = br.ReadLine() 1077 } 1078 // We expect an io.EOF or any other error indicating the use of a closed 1079 // connection, but we should not get the timeout error. 
1080 if ne, ok := err.(net.Error); ok && ne.Timeout() { 1081 err = nil 1082 } 1083 if err == nil { 1084 var buf [10]byte 1085 c.SetDeadline(time.Now().Add(2 * time.Second)) 1086 c.Write([]byte("C")) 1087 _, err = c.Read(buf[:]) 1088 if ne, ok := err.(net.Error); ok && ne.Timeout() { 1089 err = nil 1090 } 1091 } 1092 if err == nil { 1093 t.Fatalf("Connection should have been closed") 1094 } 1095 c.Close() 1096 } 1097 }) 1098 } 1099 } 1100 1101 func TestNoRaceJetStreamDeleteStreamManyConsumers(t *testing.T) { 1102 s := RunBasicJetStreamServer(t) 1103 defer s.Shutdown() 1104 1105 mname := "MYS" 1106 mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: mname, Storage: FileStorage}) 1107 if err != nil { 1108 t.Fatalf("Unexpected error adding stream: %v", err) 1109 } 1110 1111 // This number needs to be higher than the internal sendq size to trigger what this test is testing. 1112 for i := 0; i < 2000; i++ { 1113 _, err := mset.addConsumer(&ConsumerConfig{ 1114 Durable: fmt.Sprintf("D-%d", i), 1115 DeliverSubject: fmt.Sprintf("deliver.%d", i), 1116 }) 1117 if err != nil { 1118 t.Fatalf("Error creating consumer: %v", err) 1119 } 1120 } 1121 // With bug this would not return and would hang. 1122 mset.delete() 1123 } 1124 1125 // We used to swap accounts on an inbound message when processing service imports. 1126 // Until JetStream this was kinda ok, but with JetStream we can have pull consumers 1127 // trying to access the clients account in another Go routine now which causes issues. 1128 // This is not limited to the case above, its just the one that exposed it. 1129 // This test is to show that issue and that the fix works, meaning we no longer swap c.acc. 1130 func TestNoRaceJetStreamServiceImportAccountSwapIssue(t *testing.T) { 1131 s := RunBasicJetStreamServer(t) 1132 defer s.Shutdown() 1133 1134 // Client based API 1135 nc, js := jsClientConnect(t, s) 1136 defer nc.Close() 1137 1138 _, err := js.AddStream(&nats.StreamConfig{ 1139 Name: "TEST", 1140 Subjects: []string{"foo", "bar"}, 1141 }) 1142 if err != nil { 1143 t.Fatalf("Unexpected error: %v", err) 1144 } 1145 1146 sub, err := js.PullSubscribe("foo", "dlc") 1147 if err != nil { 1148 t.Fatalf("Unexpected error: %v", err) 1149 } 1150 1151 beforeSubs := s.NumSubscriptions() 1152 1153 // How long we want both sides to run. 1154 timeout := time.Now().Add(3 * time.Second) 1155 errs := make(chan error, 1) 1156 1157 // Publishing side, which will signal the consumer that is waiting and which will access c.acc. If publish 1158 // operation runs concurrently we will catch c.acc being $SYS some of the time. 1159 go func() { 1160 time.Sleep(100 * time.Millisecond) 1161 for time.Now().Before(timeout) { 1162 // This will signal the delivery of the pull messages. 1163 js.Publish("foo", []byte("Hello")) 1164 // This will swap the account because of JetStream service import. 1165 // We can get an error here with the bug or not. 1166 if _, err := js.StreamInfo("TEST"); err != nil { 1167 errs <- err 1168 return 1169 } 1170 } 1171 errs <- nil 1172 }() 1173 1174 // Pull messages flow. 1175 var received int 1176 for time.Now().Before(timeout.Add(2 * time.Second)) { 1177 if msgs, err := sub.Fetch(1, nats.MaxWait(200*time.Millisecond)); err == nil { 1178 for _, m := range msgs { 1179 received++ 1180 m.AckSync() 1181 } 1182 } else { 1183 break 1184 } 1185 } 1186 // Wait on publisher Go routine and check for errors. 1187 if err := <-errs; err != nil { 1188 t.Fatalf("Unexpected error: %v", err) 1189 } 1190 // Double check all received. 
1191 si, err := js.StreamInfo("TEST") 1192 if err != nil { 1193 t.Fatalf("Unexpected error: %v", err) 1194 } 1195 if int(si.State.Msgs) != received { 1196 t.Fatalf("Expected to receive %d msgs, only got %d", si.State.Msgs, received) 1197 } 1198 // Now check for leaked subs from the fetch call above. That is what we first saw from the bug. 1199 if afterSubs := s.NumSubscriptions(); afterSubs != beforeSubs { 1200 t.Fatalf("Leaked subscriptions: %d before, %d after", beforeSubs, afterSubs) 1201 } 1202 } 1203 1204 func TestNoRaceJetStreamAPIStreamListPaging(t *testing.T) { 1205 s := RunBasicJetStreamServer(t) 1206 defer s.Shutdown() 1207 1208 // Create 2X limit 1209 streamsNum := 2 * JSApiNamesLimit 1210 for i := 1; i <= streamsNum; i++ { 1211 name := fmt.Sprintf("STREAM-%06d", i) 1212 cfg := StreamConfig{Name: name, Storage: MemoryStorage} 1213 _, err := s.GlobalAccount().addStream(&cfg) 1214 if err != nil { 1215 t.Fatalf("Unexpected error adding stream: %v", err) 1216 } 1217 } 1218 1219 // Client for API requests. 1220 nc := clientConnectToServer(t, s) 1221 defer nc.Close() 1222 1223 reqList := func(offset int) []byte { 1224 t.Helper() 1225 var req []byte 1226 if offset > 0 { 1227 req, _ = json.Marshal(&ApiPagedRequest{Offset: offset}) 1228 } 1229 resp, err := nc.Request(JSApiStreams, req, time.Second) 1230 if err != nil { 1231 t.Fatalf("Unexpected error getting stream list: %v", err) 1232 } 1233 return resp.Data 1234 } 1235 1236 checkResp := func(resp []byte, expectedLen, expectedOffset int) { 1237 t.Helper() 1238 var listResponse JSApiStreamNamesResponse 1239 if err := json.Unmarshal(resp, &listResponse); err != nil { 1240 t.Fatalf("Unexpected error: %v", err) 1241 } 1242 if len(listResponse.Streams) != expectedLen { 1243 t.Fatalf("Expected only %d streams but got %d", expectedLen, len(listResponse.Streams)) 1244 } 1245 if listResponse.Total != streamsNum { 1246 t.Fatalf("Expected total to be %d but got %d", streamsNum, listResponse.Total) 1247 } 1248 if listResponse.Offset != expectedOffset { 1249 t.Fatalf("Expected offset to be %d but got %d", expectedOffset, listResponse.Offset) 1250 } 1251 if expectedLen < 1 { 1252 return 1253 } 1254 // Make sure we get the right stream. 1255 sname := fmt.Sprintf("STREAM-%06d", expectedOffset+1) 1256 if listResponse.Streams[0] != sname { 1257 t.Fatalf("Expected stream %q to be first, got %q", sname, listResponse.Streams[0]) 1258 } 1259 } 1260 1261 checkResp(reqList(0), JSApiNamesLimit, 0) 1262 checkResp(reqList(JSApiNamesLimit), JSApiNamesLimit, JSApiNamesLimit) 1263 checkResp(reqList(streamsNum), 0, streamsNum) 1264 checkResp(reqList(streamsNum-22), 22, streamsNum-22) 1265 checkResp(reqList(streamsNum+22), 0, streamsNum) 1266 } 1267 1268 func TestNoRaceJetStreamAPIConsumerListPaging(t *testing.T) { 1269 s := RunBasicJetStreamServer(t) 1270 defer s.Shutdown() 1271 1272 sname := "MYSTREAM" 1273 mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: sname}) 1274 if err != nil { 1275 t.Fatalf("Unexpected error adding stream: %v", err) 1276 } 1277 1278 // Client for API requests. 
1279 nc := clientConnectToServer(t, s) 1280 defer nc.Close() 1281 1282 consumersNum := JSApiNamesLimit 1283 for i := 1; i <= consumersNum; i++ { 1284 dsubj := fmt.Sprintf("d.%d", i) 1285 sub, _ := nc.SubscribeSync(dsubj) 1286 defer sub.Unsubscribe() 1287 nc.Flush() 1288 1289 _, err := mset.addConsumer(&ConsumerConfig{DeliverSubject: dsubj}) 1290 if err != nil { 1291 t.Fatalf("Unexpected error: %v", err) 1292 } 1293 } 1294 1295 reqListSubject := fmt.Sprintf(JSApiConsumersT, sname) 1296 reqList := func(offset int) []byte { 1297 t.Helper() 1298 var req []byte 1299 if offset > 0 { 1300 req, _ = json.Marshal(&JSApiConsumersRequest{ApiPagedRequest: ApiPagedRequest{Offset: offset}}) 1301 } 1302 resp, err := nc.Request(reqListSubject, req, time.Second) 1303 if err != nil { 1304 t.Fatalf("Unexpected error getting stream list: %v", err) 1305 } 1306 return resp.Data 1307 } 1308 1309 checkResp := func(resp []byte, expectedLen, expectedOffset int) { 1310 t.Helper() 1311 var listResponse JSApiConsumerNamesResponse 1312 if err := json.Unmarshal(resp, &listResponse); err != nil { 1313 t.Fatalf("Unexpected error: %v", err) 1314 } 1315 if len(listResponse.Consumers) != expectedLen { 1316 t.Fatalf("Expected only %d streams but got %d", expectedLen, len(listResponse.Consumers)) 1317 } 1318 if listResponse.Total != consumersNum { 1319 t.Fatalf("Expected total to be %d but got %d", consumersNum, listResponse.Total) 1320 } 1321 if listResponse.Offset != expectedOffset { 1322 t.Fatalf("Expected offset to be %d but got %d", expectedOffset, listResponse.Offset) 1323 } 1324 } 1325 1326 checkResp(reqList(0), JSApiNamesLimit, 0) 1327 checkResp(reqList(consumersNum-22), 22, consumersNum-22) 1328 checkResp(reqList(consumersNum+22), 0, consumersNum) 1329 } 1330 1331 func TestNoRaceJetStreamWorkQueueLoadBalance(t *testing.T) { 1332 s := RunBasicJetStreamServer(t) 1333 defer s.Shutdown() 1334 1335 mname := "MY_MSG_SET" 1336 mset, err := s.GlobalAccount().addStream(&StreamConfig{Name: mname, Subjects: []string{"foo", "bar"}}) 1337 if err != nil { 1338 t.Fatalf("Unexpected error adding message set: %v", err) 1339 } 1340 defer mset.delete() 1341 1342 // Create basic work queue mode consumer. 1343 oname := "WQ" 1344 o, err := mset.addConsumer(&ConsumerConfig{Durable: oname, AckPolicy: AckExplicit}) 1345 if err != nil { 1346 t.Fatalf("Expected no error with durable, got %v", err) 1347 } 1348 defer o.delete() 1349 1350 // To send messages. 1351 nc := clientConnectToServer(t, s) 1352 defer nc.Close() 1353 1354 // For normal work queue semantics, you send requests to the subject with stream and consumer name. 
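	// (For reference, requestNextMsgSubject() builds the JS API "next message"
	// subject, of the form "$JS.API.CONSUMER.MSG.NEXT.<stream>.<consumer>",
	// so here it should look something like
	// "$JS.API.CONSUMER.MSG.NEXT.MY_MSG_SET.WQ".)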
1355 reqMsgSubj := o.requestNextMsgSubject() 1356 1357 numWorkers := 25 1358 counts := make([]int32, numWorkers) 1359 var received int32 1360 1361 rwg := &sync.WaitGroup{} 1362 rwg.Add(numWorkers) 1363 1364 wg := &sync.WaitGroup{} 1365 wg.Add(numWorkers) 1366 ch := make(chan bool) 1367 1368 toSend := 1000 1369 1370 for i := 0; i < numWorkers; i++ { 1371 nc := clientConnectToServer(t, s) 1372 defer nc.Close() 1373 1374 go func(index int32) { 1375 rwg.Done() 1376 defer wg.Done() 1377 <-ch 1378 1379 for counter := &counts[index]; ; { 1380 m, err := nc.Request(reqMsgSubj, nil, 100*time.Millisecond) 1381 if err != nil { 1382 return 1383 } 1384 m.Respond(nil) 1385 atomic.AddInt32(counter, 1) 1386 if total := atomic.AddInt32(&received, 1); total >= int32(toSend) { 1387 return 1388 } 1389 } 1390 }(int32(i)) 1391 } 1392 1393 // Wait for requestors to be ready 1394 rwg.Wait() 1395 close(ch) 1396 1397 sendSubj := "bar" 1398 for i := 0; i < toSend; i++ { 1399 sendStreamMsg(t, nc, sendSubj, "Hello World!") 1400 } 1401 1402 // Wait for test to complete. 1403 wg.Wait() 1404 1405 target := toSend / numWorkers 1406 delta := target/2 + 5 1407 low, high := int32(target-delta), int32(target+delta) 1408 1409 for i := 0; i < numWorkers; i++ { 1410 if msgs := atomic.LoadInt32(&counts[i]); msgs < low || msgs > high { 1411 t.Fatalf("Messages received for worker [%d] too far off from target of %d, got %d", i, target, msgs) 1412 } 1413 } 1414 } 1415 1416 func TestNoRaceJetStreamClusterLargeStreamInlineCatchup(t *testing.T) { 1417 c := createJetStreamClusterExplicit(t, "LSS", 3) 1418 defer c.shutdown() 1419 1420 // Client based API 1421 s := c.randomServer() 1422 nc, js := jsClientConnect(t, s) 1423 defer nc.Close() 1424 1425 _, err := js.AddStream(&nats.StreamConfig{ 1426 Name: "TEST", 1427 Subjects: []string{"foo"}, 1428 Replicas: 3, 1429 }) 1430 if err != nil { 1431 t.Fatalf("Unexpected error: %v", err) 1432 } 1433 1434 sr := c.randomNonStreamLeader("$G", "TEST") 1435 sr.Shutdown() 1436 1437 // In case sr was meta leader. 1438 c.waitOnLeader() 1439 1440 msg, toSend := []byte("Hello JS Clustering"), 5000 1441 1442 // Now fill up stream. 1443 for i := 0; i < toSend; i++ { 1444 if _, err = js.Publish("foo", msg); err != nil { 1445 t.Fatalf("Unexpected publish error: %v", err) 1446 } 1447 } 1448 si, err := js.StreamInfo("TEST") 1449 if err != nil { 1450 t.Fatalf("Unexpected error: %v", err) 1451 } 1452 // Check active state as well, shows that the owner answered. 1453 if si.State.Msgs != uint64(toSend) { 1454 t.Fatalf("Expected %d msgs, got bad state: %+v", toSend, si.State) 1455 } 1456 1457 // Kill our current leader to make just 2. 1458 c.streamLeader("$G", "TEST").Shutdown() 1459 1460 // Now restart the shutdown peer and wait for it to be current. 1461 sr = c.restartServer(sr) 1462 c.waitOnStreamCurrent(sr, "$G", "TEST") 1463 1464 // Ask other servers to stepdown as leader so that sr becomes the leader. 1465 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 1466 c.waitOnStreamLeader("$G", "TEST") 1467 if sl := c.streamLeader("$G", "TEST"); sl != sr { 1468 sl.JetStreamStepdownStream("$G", "TEST") 1469 return fmt.Errorf("Server %s is not leader yet", sr) 1470 } 1471 return nil 1472 }) 1473 1474 si, err = js.StreamInfo("TEST") 1475 if err != nil { 1476 t.Fatalf("Unexpected error: %v", err) 1477 } 1478 // Check that we have all of our messsages stored. 1479 // Wait for a bit for upper layers to process. 
1480 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 1481 if si.State.Msgs != uint64(toSend) { 1482 return fmt.Errorf("Expected %d msgs, got %d", toSend, si.State.Msgs) 1483 } 1484 return nil 1485 }) 1486 } 1487 1488 func TestNoRaceJetStreamClusterStreamCreateAndLostQuorum(t *testing.T) { 1489 c := createJetStreamClusterExplicit(t, "R5S", 3) 1490 defer c.shutdown() 1491 1492 // Client based API 1493 s := c.randomServer() 1494 nc, js := jsClientConnect(t, s) 1495 defer nc.Close() 1496 1497 sub, err := nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*") 1498 if err != nil { 1499 t.Fatalf("Unexpected error: %v", err) 1500 } 1501 1502 if _, err := js.AddStream(&nats.StreamConfig{Name: "NO-LQ-START", Replicas: 3}); err != nil { 1503 t.Fatalf("Unexpected error: %v", err) 1504 } 1505 c.waitOnStreamLeader("$G", "NO-LQ-START") 1506 checkSubsPending(t, sub, 0) 1507 1508 c.stopAll() 1509 // Start up the one we were connected to first and wait for it to be connected. 1510 s = c.restartServer(s) 1511 nc, err = nats.Connect(s.ClientURL()) 1512 if err != nil { 1513 t.Fatalf("Failed to create client: %v", err) 1514 } 1515 defer nc.Close() 1516 1517 sub, err = nc.SubscribeSync(JSAdvisoryStreamQuorumLostPre + ".*") 1518 if err != nil { 1519 t.Fatalf("Unexpected error: %v", err) 1520 } 1521 nc.Flush() 1522 1523 c.restartAll() 1524 c.waitOnStreamLeader("$G", "NO-LQ-START") 1525 1526 checkSubsPending(t, sub, 0) 1527 } 1528 1529 func TestNoRaceJetStreamSuperClusterMirrors(t *testing.T) { 1530 sc := createJetStreamSuperCluster(t, 3, 3) 1531 defer sc.shutdown() 1532 1533 // Client based API 1534 s := sc.clusterForName("C2").randomServer() 1535 nc, js := jsClientConnect(t, s) 1536 defer nc.Close() 1537 1538 // Create source stream. 1539 _, err := js.AddStream(&nats.StreamConfig{Name: "S1", Subjects: []string{"foo", "bar"}, Replicas: 3, Placement: &nats.Placement{Cluster: "C2"}}) 1540 if err != nil { 1541 t.Fatalf("Unexpected error: %v", err) 1542 } 1543 1544 // Needed while Go client does not have mirror support. 1545 createStream := func(cfg *nats.StreamConfig) { 1546 t.Helper() 1547 if _, err := js.AddStream(cfg); err != nil { 1548 t.Fatalf("Unexpected error: %+v", err) 1549 } 1550 } 1551 1552 // Send 100 messages. 1553 for i := 0; i < 100; i++ { 1554 if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil { 1555 t.Fatalf("Unexpected publish error: %v", err) 1556 } 1557 } 1558 1559 createStream(&nats.StreamConfig{ 1560 Name: "M1", 1561 Mirror: &nats.StreamSource{Name: "S1"}, 1562 Placement: &nats.Placement{Cluster: "C1"}, 1563 }) 1564 1565 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 1566 si, err := js.StreamInfo("M1") 1567 if err != nil { 1568 t.Fatalf("Unexpected error: %v", err) 1569 } 1570 if si.State.Msgs != 100 { 1571 return fmt.Errorf("Expected 100 msgs, got state: %+v", si.State) 1572 } 1573 return nil 1574 }) 1575 1576 // Purge the source stream. 1577 if err := js.PurgeStream("S1"); err != nil { 1578 t.Fatalf("Unexpected purge error: %v", err) 1579 } 1580 // Send 50 more msgs now. 
1581 for i := 0; i < 50; i++ { 1582 if _, err := js.Publish("bar", []byte("OK")); err != nil { 1583 t.Fatalf("Unexpected publish error: %v", err) 1584 } 1585 } 1586 1587 createStream(&nats.StreamConfig{ 1588 Name: "M2", 1589 Mirror: &nats.StreamSource{Name: "S1"}, 1590 Replicas: 3, 1591 Placement: &nats.Placement{Cluster: "C3"}, 1592 }) 1593 1594 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1595 si, err := js.StreamInfo("M2") 1596 if err != nil { 1597 t.Fatalf("Unexpected error: %v", err) 1598 } 1599 if si.State.Msgs != 50 { 1600 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1601 } 1602 if si.State.FirstSeq != 101 { 1603 return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State) 1604 } 1605 return nil 1606 }) 1607 1608 sl := sc.clusterForName("C3").streamLeader("$G", "M2") 1609 doneCh := make(chan bool) 1610 1611 // Now test that if the mirror get's interrupted that it picks up where it left off etc. 1612 go func() { 1613 // Send 100 more messages. 1614 for i := 0; i < 100; i++ { 1615 if _, err := js.Publish("foo", []byte("MIRRORS!")); err != nil { 1616 t.Errorf("Unexpected publish on %d error: %v", i, err) 1617 } 1618 time.Sleep(2 * time.Millisecond) 1619 } 1620 doneCh <- true 1621 }() 1622 1623 time.Sleep(20 * time.Millisecond) 1624 sl.Shutdown() 1625 1626 <-doneCh 1627 sc.clusterForName("C3").waitOnStreamLeader("$G", "M2") 1628 1629 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1630 si, err := js.StreamInfo("M2") 1631 if err != nil { 1632 t.Fatalf("Unexpected error: %v", err) 1633 } 1634 if si.State.Msgs != 150 { 1635 return fmt.Errorf("Expected 150 msgs, got state: %+v", si.State) 1636 } 1637 if si.State.FirstSeq != 101 { 1638 return fmt.Errorf("Expected start seq of 101, got state: %+v", si.State) 1639 } 1640 return nil 1641 }) 1642 } 1643 1644 func TestNoRaceJetStreamSuperClusterMixedModeMirrors(t *testing.T) { 1645 // Unlike the similar sources test, this test is not reliably catching the bug 1646 // that would cause mirrors to not have the expected messages count. 1647 // Still, adding this test in case we have a regression and we are lucky in 1648 // getting the failure while running this. 1649 1650 tmpl := ` 1651 listen: 127.0.0.1:-1 1652 server_name: %s 1653 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 1654 leaf: { listen: 127.0.0.1:-1 } 1655 1656 cluster { 1657 name: %s 1658 listen: 127.0.0.1:%d 1659 routes = [%s] 1660 } 1661 1662 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" 
} ] } } 1663 ` 1664 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 4, 1665 func(serverName, clusterName, storeDir, conf string) string { 1666 sname := serverName[strings.Index(serverName, "-")+1:] 1667 switch sname { 1668 case "S5", "S6", "S7": 1669 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 1670 default: 1671 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 1672 } 1673 return conf 1674 }, nil) 1675 defer sc.shutdown() 1676 1677 // Connect our client to a non JS server 1678 c := sc.randomCluster() 1679 var s *Server 1680 for { 1681 if as := c.randomServer(); !as.JetStreamEnabled() { 1682 s = as 1683 break 1684 } 1685 } 1686 nc, js := jsClientConnect(t, s) 1687 defer nc.Close() 1688 1689 numStreams := 10 1690 toSend := 1000 1691 errCh := make(chan error, numStreams) 1692 wg := sync.WaitGroup{} 1693 wg.Add(numStreams) 1694 // Create 10 origin streams 1695 for i := 0; i < 10; i++ { 1696 go func(idx int) { 1697 defer wg.Done() 1698 name := fmt.Sprintf("S%d", idx+1) 1699 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 1700 errCh <- fmt.Errorf("unexpected error: %v", err) 1701 return 1702 } 1703 c.waitOnStreamLeader(globalAccountName, name) 1704 // Load them up with a bunch of messages. 1705 for n := 0; n < toSend; n++ { 1706 m := nats.NewMsg(name) 1707 m.Header.Set("stream", name) 1708 m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10)) 1709 if err := nc.PublishMsg(m); err != nil { 1710 errCh <- fmt.Errorf("unexpected publish error: %v", err) 1711 } 1712 } 1713 }(i) 1714 } 1715 wg.Wait() 1716 select { 1717 case err := <-errCh: 1718 t.Fatal(err) 1719 default: 1720 } 1721 1722 for i := 0; i < 3; i++ { 1723 // Now create our mirrors 1724 wg := sync.WaitGroup{} 1725 mirrorsCount := 10 1726 wg.Add(mirrorsCount) 1727 errCh := make(chan error, 1) 1728 for m := 0; m < mirrorsCount; m++ { 1729 sname := fmt.Sprintf("S%d", rand.Intn(10)+1) 1730 go func(sname string, mirrorIdx int) { 1731 defer wg.Done() 1732 if _, err := js.AddStream(&nats.StreamConfig{ 1733 Name: fmt.Sprintf("M%d", mirrorIdx), 1734 Mirror: &nats.StreamSource{Name: sname}, 1735 Replicas: 3, 1736 }); err != nil { 1737 select { 1738 case errCh <- err: 1739 default: 1740 } 1741 } 1742 }(sname, m+1) 1743 } 1744 wg.Wait() 1745 select { 1746 case err := <-errCh: 1747 t.Fatalf("Error creating mirrors: %v", err) 1748 default: 1749 } 1750 // Now check the mirrors have all expected messages 1751 for m := 0; m < mirrorsCount; m++ { 1752 name := fmt.Sprintf("M%d", m+1) 1753 checkFor(t, 15*time.Second, 500*time.Millisecond, func() error { 1754 si, err := js.StreamInfo(name) 1755 if err != nil { 1756 t.Fatalf("Could not retrieve stream info") 1757 } 1758 if si.State.Msgs != uint64(toSend) { 1759 return fmt.Errorf("Expected %d msgs, got state: %+v", toSend, si.State) 1760 } 1761 return nil 1762 }) 1763 err := js.DeleteStream(name) 1764 require_NoError(t, err) 1765 } 1766 } 1767 } 1768 1769 func TestNoRaceJetStreamSuperClusterSources(t *testing.T) { 1770 sc := createJetStreamSuperCluster(t, 3, 3) 1771 defer sc.shutdown() 1772 1773 // Client based API 1774 s := sc.clusterForName("C1").randomServer() 1775 nc, js := jsClientConnect(t, s) 1776 defer nc.Close() 1777 1778 // Create our source streams. 
1779 for _, sname := range []string{"foo", "bar", "baz"} { 1780 if _, err := js.AddStream(&nats.StreamConfig{Name: sname, Replicas: 1}); err != nil { 1781 t.Fatalf("Unexpected error: %v", err) 1782 } 1783 } 1784 1785 sendBatch := func(subject string, n int) { 1786 for i := 0; i < n; i++ { 1787 msg := fmt.Sprintf("MSG-%d", i+1) 1788 if _, err := js.Publish(subject, []byte(msg)); err != nil { 1789 t.Fatalf("Unexpected publish error: %v", err) 1790 } 1791 } 1792 } 1793 // Populate each one. 1794 sendBatch("foo", 10) 1795 sendBatch("bar", 15) 1796 sendBatch("baz", 25) 1797 1798 // Needed while Go client does not have mirror support for creating mirror or source streams. 1799 createStream := func(cfg *nats.StreamConfig) { 1800 t.Helper() 1801 if _, err := js.AddStream(cfg); err != nil { 1802 t.Fatalf("Unexpected error: %+v", err) 1803 } 1804 } 1805 1806 cfg := &nats.StreamConfig{ 1807 Name: "MS", 1808 Sources: []*nats.StreamSource{ 1809 {Name: "foo"}, 1810 {Name: "bar"}, 1811 {Name: "baz"}, 1812 }, 1813 } 1814 1815 createStream(cfg) 1816 time.Sleep(time.Second) 1817 1818 // Faster timeout since we loop below checking for condition. 1819 js2, err := nc.JetStream(nats.MaxWait(50 * time.Millisecond)) 1820 if err != nil { 1821 t.Fatalf("Unexpected error: %v", err) 1822 } 1823 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1824 si, err := js2.StreamInfo("MS") 1825 if err != nil { 1826 return err 1827 } 1828 if si.State.Msgs != 50 { 1829 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1830 } 1831 return nil 1832 }) 1833 1834 // Purge the source streams. 1835 for _, sname := range []string{"foo", "bar", "baz"} { 1836 if err := js.PurgeStream(sname); err != nil { 1837 t.Fatalf("Unexpected purge error: %v", err) 1838 } 1839 } 1840 1841 if err := js.DeleteStream("MS"); err != nil { 1842 t.Fatalf("Unexpected delete error: %v", err) 1843 } 1844 1845 // Send more msgs now. 1846 sendBatch("foo", 10) 1847 sendBatch("bar", 15) 1848 sendBatch("baz", 25) 1849 1850 cfg = &nats.StreamConfig{ 1851 Name: "MS2", 1852 Sources: []*nats.StreamSource{ 1853 {Name: "foo"}, 1854 {Name: "bar"}, 1855 {Name: "baz"}, 1856 }, 1857 Replicas: 3, 1858 Placement: &nats.Placement{Cluster: "C3"}, 1859 } 1860 1861 createStream(cfg) 1862 1863 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 1864 si, err := js2.StreamInfo("MS2") 1865 if err != nil { 1866 t.Fatalf("Unexpected error: %v", err) 1867 } 1868 if si.State.Msgs != 50 { 1869 return fmt.Errorf("Expected 50 msgs, got state: %+v", si.State) 1870 } 1871 if si.State.FirstSeq != 1 { 1872 return fmt.Errorf("Expected start seq of 1, got state: %+v", si.State) 1873 } 1874 return nil 1875 }) 1876 1877 sl := sc.clusterForName("C3").streamLeader("$G", "MS2") 1878 doneCh := make(chan bool) 1879 1880 if sl == sc.leader() { 1881 nc.Request(JSApiLeaderStepDown, nil, time.Second) 1882 sc.waitOnLeader() 1883 } 1884 1885 // Now test that if the mirror get's interrupted that it picks up where it left off etc. 1886 go func() { 1887 // Send 50 more messages each. 
1888 for i := 0; i < 50; i++ { 1889 msg := fmt.Sprintf("R-MSG-%d", i+1) 1890 for _, sname := range []string{"foo", "bar", "baz"} { 1891 m := nats.NewMsg(sname) 1892 m.Data = []byte(msg) 1893 if _, err := js.PublishMsg(m); err != nil { 1894 t.Errorf("Unexpected publish error: %v", err) 1895 } 1896 } 1897 time.Sleep(2 * time.Millisecond) 1898 } 1899 doneCh <- true 1900 }() 1901 1902 time.Sleep(20 * time.Millisecond) 1903 sl.Shutdown() 1904 1905 sc.clusterForName("C3").waitOnStreamLeader("$G", "MS2") 1906 <-doneCh 1907 1908 checkFor(t, 20*time.Second, time.Second, func() error { 1909 si, err := js2.StreamInfo("MS2") 1910 if err != nil { 1911 return err 1912 } 1913 if si.State.Msgs != 200 { 1914 return fmt.Errorf("Expected 200 msgs, got state: %+v", si.State) 1915 } 1916 return nil 1917 }) 1918 } 1919 1920 func TestNoRaceJetStreamClusterSourcesMuxd(t *testing.T) { 1921 c := createJetStreamClusterExplicit(t, "SMUX", 3) 1922 defer c.shutdown() 1923 1924 // Client for API requests. 1925 nc, js := jsClientConnect(t, c.randomServer()) 1926 defer nc.Close() 1927 1928 // Send in 10000 messages. 1929 msg, toSend := make([]byte, 1024), 10000 1930 crand.Read(msg) 1931 1932 var sources []*nats.StreamSource 1933 // Create 10 origin streams. 1934 for i := 1; i <= 10; i++ { 1935 name := fmt.Sprintf("O-%d", i) 1936 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 1937 t.Fatalf("Unexpected error: %v", err) 1938 } 1939 // Make sure we have a leader before publishing, especially since we use 1940 // non JS publisher, we would not know if the messages made it to those 1941 // streams or not. 1942 c.waitOnStreamLeader(globalAccountName, name) 1943 // Load them up with a bunch of messages. 1944 for n := 0; n < toSend; n++ { 1945 if err := nc.Publish(name, msg); err != nil { 1946 t.Fatalf("Unexpected publish error: %v", err) 1947 } 1948 } 1949 sources = append(sources, &nats.StreamSource{Name: name}) 1950 } 1951 1952 // Now create our downstream stream that sources from all of them. 1953 if _, err := js.AddStream(&nats.StreamConfig{Name: "S", Replicas: 2, Sources: sources}); err != nil { 1954 t.Fatalf("Unexpected error: %v", err) 1955 } 1956 1957 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 1958 si, err := js.StreamInfo("S") 1959 if err != nil { 1960 t.Fatalf("Could not retrieve stream info") 1961 } 1962 if si.State.Msgs != uint64(10*toSend) { 1963 return fmt.Errorf("Expected %d msgs, got state: %+v", toSend*10, si.State) 1964 } 1965 return nil 1966 }) 1967 1968 } 1969 1970 func TestNoRaceJetStreamSuperClusterMixedModeSources(t *testing.T) { 1971 tmpl := ` 1972 listen: 127.0.0.1:-1 1973 server_name: %s 1974 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 1975 leaf: { listen: 127.0.0.1:-1 } 1976 1977 cluster { 1978 name: %s 1979 listen: 127.0.0.1:%d 1980 routes = [%s] 1981 } 1982 1983 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" 
} ] } } 1984 ` 1985 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 7, 2, 1986 func(serverName, clusterName, storeDir, conf string) string { 1987 sname := serverName[strings.Index(serverName, "-")+1:] 1988 switch sname { 1989 case "S5", "S6", "S7": 1990 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 1991 default: 1992 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 1993 } 1994 return conf 1995 }, nil) 1996 defer sc.shutdown() 1997 // Connect our client to a non JS server 1998 c := sc.randomCluster() 1999 var s *Server 2000 for { 2001 if as := c.randomServer(); !as.JetStreamEnabled() { 2002 s = as 2003 break 2004 } 2005 } 2006 nc, js := jsClientConnect(t, s) 2007 defer nc.Close() 2008 2009 numStreams := 100 2010 toSend := 1000 2011 var sources []*nats.StreamSource 2012 errCh := make(chan error, numStreams) 2013 srcCh := make(chan *nats.StreamSource, numStreams) 2014 wg := sync.WaitGroup{} 2015 wg.Add(numStreams) 2016 // Create 100 origin streams. 2017 for i := 1; i <= numStreams; i++ { 2018 go func(idx int) { 2019 defer wg.Done() 2020 2021 name := fmt.Sprintf("O-%d", idx) 2022 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 2023 errCh <- fmt.Errorf("unexpected error: %v", err) 2024 return 2025 } 2026 c.waitOnStreamLeader(globalAccountName, name) 2027 // Load them up with a bunch of messages. 2028 for n := 0; n < toSend; n++ { 2029 m := nats.NewMsg(name) 2030 m.Header.Set("stream", name) 2031 m.Header.Set("idx", strconv.FormatInt(int64(n+1), 10)) 2032 if err := nc.PublishMsg(m); err != nil { 2033 errCh <- fmt.Errorf("unexpected publish error: %v", err) 2034 return 2035 } 2036 } 2037 srcCh <- &nats.StreamSource{Name: name} 2038 }(i) 2039 } 2040 wg.Wait() 2041 select { 2042 case err := <-errCh: 2043 t.Fatal(err) 2044 default: 2045 } 2046 for i := 0; i < numStreams; i++ { 2047 sources = append(sources, <-srcCh) 2048 } 2049 2050 for i := 0; i < 3; i++ { 2051 // Now create our downstream stream that sources from all of them. 2052 if _, err := js.AddStream(&nats.StreamConfig{Name: "S", Replicas: 3, Sources: sources}); err != nil { 2053 t.Fatalf("Unexpected error: %v", err) 2054 } 2055 2056 checkFor(t, 15*time.Second, 1000*time.Millisecond, func() error { 2057 si, err := js.StreamInfo("S") 2058 if err != nil { 2059 t.Fatalf("Could not retrieve stream info") 2060 } 2061 if si.State.Msgs != uint64(numStreams*toSend) { 2062 return fmt.Errorf("Expected %d msgs, got state: %+v", numStreams*toSend, si.State) 2063 } 2064 return nil 2065 }) 2066 2067 err := js.DeleteStream("S") 2068 require_NoError(t, err) 2069 } 2070 } 2071 2072 func TestNoRaceJetStreamClusterExtendedStreamPurgeStall(t *testing.T) { 2073 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
2074 skip(t) 2075 2076 cerr := func(t *testing.T, err error) { 2077 t.Helper() 2078 if err != nil { 2079 t.Fatalf("unexpected err: %s", err) 2080 } 2081 } 2082 2083 s := RunBasicJetStreamServer(t) 2084 defer s.Shutdown() 2085 2086 nc, js := jsClientConnect(t, s) 2087 defer nc.Close() 2088 2089 si, err := js.AddStream(&nats.StreamConfig{ 2090 Name: "KV", 2091 Subjects: []string{"kv.>"}, 2092 Storage: nats.FileStorage, 2093 }) 2094 cerr(t, err) 2095 2096 // 100kb messages spread over 1000 different subjects 2097 body := make([]byte, 100*1024) 2098 for i := 0; i < 50000; i++ { 2099 if _, err := js.PublishAsync(fmt.Sprintf("kv.%d", i%1000), body); err != nil { 2100 cerr(t, err) 2101 } 2102 } 2103 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 2104 if si, err = js.StreamInfo("KV"); err != nil { 2105 return err 2106 } 2107 if si.State.Msgs == 50000 { 2108 return nil 2109 } 2110 return fmt.Errorf("waiting for more") 2111 }) 2112 2113 jp, _ := json.Marshal(&JSApiStreamPurgeRequest{Subject: "kv.20"}) 2114 start := time.Now() 2115 res, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), jp, time.Minute) 2116 elapsed := time.Since(start) 2117 cerr(t, err) 2118 pres := JSApiStreamPurgeResponse{} 2119 err = json.Unmarshal(res.Data, &pres) 2120 cerr(t, err) 2121 if !pres.Success { 2122 t.Fatalf("purge failed: %#v", pres) 2123 } 2124 if elapsed > time.Second { 2125 t.Fatalf("Purge took too long %s", elapsed) 2126 } 2127 v, _ := s.Varz(nil) 2128 if v.Mem > 100*1024*1024 { // 100MB limit but in practice < 100MB -> Was ~7GB when failing. 2129 t.Fatalf("Used too much memory: %v", friendlyBytes(v.Mem)) 2130 } 2131 } 2132 2133 func TestNoRaceJetStreamClusterMirrorExpirationAndMissingSequences(t *testing.T) { 2134 c := createJetStreamClusterExplicit(t, "MMS", 9) 2135 defer c.shutdown() 2136 2137 // Client for API requests. 2138 nc, js := jsClientConnect(t, c.randomServer()) 2139 defer nc.Close() 2140 2141 sendBatch := func(n int) { 2142 t.Helper() 2143 // Send a batch to a given subject. 2144 for i := 0; i < n; i++ { 2145 if _, err := js.Publish("TEST", []byte("OK")); err != nil { 2146 t.Fatalf("Unexpected publish error: %v", err) 2147 } 2148 } 2149 } 2150 2151 checkStream := func(stream string, num uint64) { 2152 t.Helper() 2153 checkFor(t, 20*time.Second, 20*time.Millisecond, func() error { 2154 si, err := js.StreamInfo(stream) 2155 if err != nil { 2156 return err 2157 } 2158 if si.State.Msgs != num { 2159 return fmt.Errorf("Expected %d msgs, got %d", num, si.State.Msgs) 2160 } 2161 return nil 2162 }) 2163 } 2164 2165 checkMirror := func(num uint64) { t.Helper(); checkStream("M", num) } 2166 checkTest := func(num uint64) { t.Helper(); checkStream("TEST", num) } 2167 2168 // Origin 2169 _, err := js.AddStream(&nats.StreamConfig{ 2170 Name: "TEST", 2171 MaxAge: 500 * time.Millisecond, 2172 }) 2173 if err != nil { 2174 t.Fatalf("Unexpected error: %v", err) 2175 } 2176 2177 ts := c.streamLeader("$G", "TEST") 2178 ml := c.leader() 2179 2180 // Create mirror now. 2181 for ms := ts; ms == ts || ms == ml; { 2182 _, err = js.AddStream(&nats.StreamConfig{ 2183 Name: "M", 2184 Mirror: &nats.StreamSource{Name: "TEST"}, 2185 Replicas: 2, 2186 }) 2187 if err != nil { 2188 t.Fatalf("Unexpected error: %v", err) 2189 } 2190 ms = c.streamLeader("$G", "M") 2191 if ts == ms || ms == ml { 2192 // Delete and retry. 2193 js.DeleteStream("M") 2194 } 2195 } 2196 2197 sendBatch(10) 2198 checkMirror(10) 2199 2200 // Now shutdown the server with the mirror. 
2201 ms := c.streamLeader("$G", "M") 2202 ms.Shutdown() 2203 c.waitOnLeader() 2204 2205 // Send more messages but let them expire. 2206 sendBatch(10) 2207 checkTest(0) 2208 2209 c.restartServer(ms) 2210 c.checkClusterFormed() 2211 c.waitOnStreamLeader("$G", "M") 2212 2213 sendBatch(10) 2214 checkMirror(20) 2215 } 2216 2217 func TestNoRaceJetStreamClusterLargeActiveOnReplica(t *testing.T) { 2218 // Uncomment to run. 2219 skip(t) 2220 2221 c := createJetStreamClusterExplicit(t, "LAG", 3) 2222 defer c.shutdown() 2223 2224 // Client for API requests. 2225 nc, js := jsClientConnect(t, c.randomServer()) 2226 defer nc.Close() 2227 2228 timeout := time.Now().Add(60 * time.Second) 2229 for time.Now().Before(timeout) { 2230 si, err := js.AddStream(&nats.StreamConfig{ 2231 Name: "TEST", 2232 Subjects: []string{"foo", "bar"}, 2233 Replicas: 3, 2234 }) 2235 if err != nil { 2236 t.Fatalf("Unexpected error: %v", err) 2237 } 2238 for _, r := range si.Cluster.Replicas { 2239 if r.Active > 5*time.Second { 2240 t.Fatalf("Bad Active value: %+v", r) 2241 } 2242 } 2243 if err := js.DeleteStream("TEST"); err != nil { 2244 t.Fatalf("Unexpected delete error: %v", err) 2245 } 2246 } 2247 } 2248 2249 func TestNoRaceJetStreamSuperClusterRIPStress(t *testing.T) { 2250 // Uncomment to run. Needs to be on a big machine. 2251 skip(t) 2252 2253 sc := createJetStreamSuperCluster(t, 3, 3) 2254 defer sc.shutdown() 2255 2256 // Client based API 2257 s := sc.clusterForName("C2").randomServer() 2258 nc, js := jsClientConnect(t, s) 2259 defer nc.Close() 2260 2261 scm := make(map[string][]string) 2262 2263 // Create 50 streams per cluster. 2264 for _, cn := range []string{"C1", "C2", "C3"} { 2265 var streams []string 2266 for i := 0; i < 50; i++ { 2267 sn := fmt.Sprintf("%s-S%d", cn, i+1) 2268 streams = append(streams, sn) 2269 _, err := js.AddStream(&nats.StreamConfig{ 2270 Name: sn, 2271 Replicas: 3, 2272 Placement: &nats.Placement{Cluster: cn}, 2273 MaxAge: 2 * time.Minute, 2274 MaxMsgs: 50_000, 2275 }) 2276 if err != nil { 2277 t.Fatalf("Unexpected error: %v", err) 2278 } 2279 } 2280 scm[cn] = streams 2281 } 2282 2283 sourceForCluster := func(cn string) []*nats.StreamSource { 2284 var sns []string 2285 switch cn { 2286 case "C1": 2287 sns = scm["C2"] 2288 case "C2": 2289 sns = scm["C3"] 2290 case "C3": 2291 sns = scm["C1"] 2292 default: 2293 t.Fatalf("Unknown cluster %q", cn) 2294 } 2295 var ss []*nats.StreamSource 2296 for _, sn := range sns { 2297 ss = append(ss, &nats.StreamSource{Name: sn}) 2298 } 2299 return ss 2300 } 2301 2302 // Mux all 50 streams from one cluster to a single stream across a GW connection to another cluster. 
2303 _, err := js.AddStream(&nats.StreamConfig{ 2304 Name: "C1-S-MUX", 2305 Replicas: 2, 2306 Placement: &nats.Placement{Cluster: "C1"}, 2307 Sources: sourceForCluster("C2"), 2308 MaxAge: time.Minute, 2309 MaxMsgs: 20_000, 2310 }) 2311 if err != nil { 2312 t.Fatalf("Unexpected error: %v", err) 2313 } 2314 2315 _, err = js.AddStream(&nats.StreamConfig{ 2316 Name: "C2-S-MUX", 2317 Replicas: 2, 2318 Placement: &nats.Placement{Cluster: "C2"}, 2319 Sources: sourceForCluster("C3"), 2320 MaxAge: time.Minute, 2321 MaxMsgs: 20_000, 2322 }) 2323 if err != nil { 2324 t.Fatalf("Unexpected error: %v", err) 2325 } 2326 2327 _, err = js.AddStream(&nats.StreamConfig{ 2328 Name: "C3-S-MUX", 2329 Replicas: 2, 2330 Placement: &nats.Placement{Cluster: "C3"}, 2331 Sources: sourceForCluster("C1"), 2332 MaxAge: time.Minute, 2333 MaxMsgs: 20_000, 2334 }) 2335 if err != nil { 2336 t.Fatalf("Unexpected error: %v", err) 2337 } 2338 2339 // Now create mirrors for our mux'd streams. 2340 _, err = js.AddStream(&nats.StreamConfig{ 2341 Name: "C1-MIRROR", 2342 Replicas: 3, 2343 Placement: &nats.Placement{Cluster: "C1"}, 2344 Mirror: &nats.StreamSource{Name: "C3-S-MUX"}, 2345 MaxAge: 5 * time.Minute, 2346 MaxMsgs: 10_000, 2347 }) 2348 if err != nil { 2349 t.Fatalf("Unexpected error: %v", err) 2350 } 2351 2352 _, err = js.AddStream(&nats.StreamConfig{ 2353 Name: "C2-MIRROR", 2354 Replicas: 3, 2355 Placement: &nats.Placement{Cluster: "C2"}, 2356 Mirror: &nats.StreamSource{Name: "C2-S-MUX"}, 2357 MaxAge: 5 * time.Minute, 2358 MaxMsgs: 10_000, 2359 }) 2360 if err != nil { 2361 t.Fatalf("Unexpected error: %v", err) 2362 } 2363 2364 _, err = js.AddStream(&nats.StreamConfig{ 2365 Name: "C3-MIRROR", 2366 Replicas: 3, 2367 Placement: &nats.Placement{Cluster: "C3"}, 2368 Mirror: &nats.StreamSource{Name: "C1-S-MUX"}, 2369 MaxAge: 5 * time.Minute, 2370 MaxMsgs: 10_000, 2371 }) 2372 if err != nil { 2373 t.Fatalf("Unexpected error: %v", err) 2374 } 2375 2376 var jsc []nats.JetStream 2377 2378 // Create 64 clients. 2379 for i := 0; i < 64; i++ { 2380 s := sc.randomCluster().randomServer() 2381 nc, _ := jsClientConnect(t, s) 2382 defer nc.Close() 2383 js, err := nc.JetStream(nats.PublishAsyncMaxPending(8 * 1024)) 2384 if err != nil { 2385 t.Fatalf("Unexpected error: %v", err) 2386 } 2387 jsc = append(jsc, js) 2388 } 2389 2390 msg := make([]byte, 1024) 2391 crand.Read(msg) 2392 2393 // Publish for 8 minutes. 2394 expires := time.Now().Add(480 * time.Second) 2395 for time.Now().Before(expires) { 2396 for _, sns := range scm { 2397 rand.Shuffle(len(sns), func(i, j int) { sns[i], sns[j] = sns[j], sns[i] }) 2398 for _, sn := range sns { 2399 js := jsc[rand.Intn(len(jsc))] 2400 if _, err = js.PublishAsync(sn, msg); err != nil { 2401 t.Fatalf("Unexpected publish error: %v", err) 2402 } 2403 } 2404 } 2405 time.Sleep(10 * time.Millisecond) 2406 } 2407 } 2408 2409 func TestNoRaceJetStreamSlowFilteredInititalPendingAndFirstMsg(t *testing.T) { 2410 s := RunBasicJetStreamServer(t) 2411 defer s.Shutdown() 2412 2413 // Create directly here to force multiple blocks, etc. 
2414 a, err := s.LookupAccount("$G") 2415 if err != nil { 2416 t.Fatalf("Unexpected error: %v", err) 2417 } 2418 mset, err := a.addStreamWithStore( 2419 &StreamConfig{ 2420 Name: "S", 2421 Subjects: []string{"foo", "bar", "baz", "foo.bar.baz", "foo.*"}, 2422 }, 2423 &FileStoreConfig{ 2424 BlockSize: 4 * 1024 * 1024, 2425 AsyncFlush: true, 2426 }, 2427 ) 2428 if err != nil { 2429 t.Fatalf("Unexpected error: %v", err) 2430 } 2431 2432 nc, js := jsClientConnect(t, s) 2433 defer nc.Close() 2434 2435 toSend := 100_000 // 500k total though. 2436 2437 // Messages will be 'foo' 'bar' 'baz' repeated 100k times. 2438 // Then 'foo.bar.baz' all contiguous for 100k. 2439 // Then foo.N for 1-100000 2440 for i := 0; i < toSend; i++ { 2441 js.PublishAsync("foo", []byte("HELLO")) 2442 js.PublishAsync("bar", []byte("WORLD")) 2443 js.PublishAsync("baz", []byte("AGAIN")) 2444 } 2445 // Make contiguous block of same subject. 2446 for i := 0; i < toSend; i++ { 2447 js.PublishAsync("foo.bar.baz", []byte("ALL-TOGETHER")) 2448 } 2449 // Now add some more at the end. 2450 for i := 0; i < toSend; i++ { 2451 js.PublishAsync(fmt.Sprintf("foo.%d", i+1), []byte("LATER")) 2452 } 2453 2454 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 2455 si, err := js.StreamInfo("S") 2456 if err != nil { 2457 return err 2458 } 2459 if si.State.Msgs != uint64(5*toSend) { 2460 return fmt.Errorf("Expected %d msgs, got %d", 5*toSend, si.State.Msgs) 2461 } 2462 return nil 2463 }) 2464 2465 // Threshold for taking too long. 2466 const thresh = 150 * time.Millisecond 2467 2468 var dindex int 2469 testConsumerCreate := func(subj string, startSeq, expectedNumPending uint64) { 2470 t.Helper() 2471 dindex++ 2472 dname := fmt.Sprintf("dur-%d", dindex) 2473 cfg := ConsumerConfig{FilterSubject: subj, Durable: dname, AckPolicy: AckExplicit} 2474 if startSeq > 1 { 2475 cfg.OptStartSeq, cfg.DeliverPolicy = startSeq, DeliverByStartSequence 2476 } 2477 start := time.Now() 2478 o, err := mset.addConsumer(&cfg) 2479 if err != nil { 2480 t.Fatalf("Unexpected error: %v", err) 2481 } 2482 if delta := time.Since(start); delta > thresh { 2483 t.Fatalf("Creating consumer for %q and start: %d took too long: %v", subj, startSeq, delta) 2484 } 2485 if ci := o.info(); ci.NumPending != expectedNumPending { 2486 t.Fatalf("Expected NumPending of %d, got %d", expectedNumPending, ci.NumPending) 2487 } 2488 } 2489 2490 testConsumerCreate("foo.100000", 1, 1) 2491 testConsumerCreate("foo.100000", 222_000, 1) 2492 testConsumerCreate("foo", 1, 100_000) 2493 testConsumerCreate("foo", 4, 100_000-1) 2494 testConsumerCreate("foo.bar.baz", 1, 100_000) 2495 testConsumerCreate("foo.bar.baz", 350_001, 50_000) 2496 testConsumerCreate("*", 1, 300_000) 2497 testConsumerCreate("*", 4, 300_000-3) 2498 testConsumerCreate(">", 1, 500_000) 2499 testConsumerCreate(">", 50_000, 500_000-50_000+1) 2500 testConsumerCreate("foo.10", 1, 1) 2501 2502 // Also test that we do not take long if the start sequence is later in the stream. 2503 sub, err := js.PullSubscribe("foo.100000", "dlc") 2504 if err != nil { 2505 t.Fatalf("Unexpected error: %v", err) 2506 } 2507 start := time.Now() 2508 fetchMsgs(t, sub, 1, time.Second) 2509 if delta := time.Since(start); delta > thresh { 2510 t.Fatalf("Took too long for pull subscriber to fetch the message: %v", delta) 2511 } 2512 2513 // Now do some deletes and make sure these are handled correctly. 2514 // Delete 3 foo messages. 
2515 mset.removeMsg(1) 2516 mset.removeMsg(4) 2517 mset.removeMsg(7) 2518 testConsumerCreate("foo", 1, 100_000-3) 2519 2520 // Make sure wider scoped subjects do the right thing from a pending perspective. 2521 o, err := mset.addConsumer(&ConsumerConfig{FilterSubject: ">", Durable: "cat", AckPolicy: AckExplicit}) 2522 if err != nil { 2523 t.Fatalf("Unexpected error: %v", err) 2524 } 2525 ci, expected := o.info(), uint64(500_000-3) 2526 if ci.NumPending != expected { 2527 t.Fatalf("Expected NumPending of %d, got %d", expected, ci.NumPending) 2528 } 2529 // Send another and make sure its captured by our wide scope consumer. 2530 js.Publish("foo", []byte("HELLO AGAIN")) 2531 if ci = o.info(); ci.NumPending != expected+1 { 2532 t.Fatalf("Expected the consumer to recognize the wide scoped consumer, wanted pending of %d, got %d", expected+1, ci.NumPending) 2533 } 2534 2535 // Stop current server and test restart.. 2536 sd := s.JetStreamConfig().StoreDir 2537 s.Shutdown() 2538 // Restart. 2539 s = RunJetStreamServerOnPort(-1, sd) 2540 defer s.Shutdown() 2541 2542 a, err = s.LookupAccount("$G") 2543 if err != nil { 2544 t.Fatalf("Unexpected error: %v", err) 2545 } 2546 mset, err = a.lookupStream("S") 2547 if err != nil { 2548 t.Fatalf("Unexpected error: %v", err) 2549 } 2550 2551 // Make sure we recovered our per subject state on restart. 2552 testConsumerCreate("foo.100000", 1, 1) 2553 testConsumerCreate("foo", 1, 100_000-2) 2554 } 2555 2556 func TestNoRaceJetStreamFileStoreBufferReuse(t *testing.T) { 2557 // Uncomment to run. Needs to be on a big machine. 2558 skip(t) 2559 2560 s := RunBasicJetStreamServer(t) 2561 defer s.Shutdown() 2562 2563 cfg := &StreamConfig{Name: "TEST", Subjects: []string{"foo", "bar", "baz"}, Storage: FileStorage} 2564 if _, err := s.GlobalAccount().addStreamWithStore(cfg, nil); err != nil { 2565 t.Fatalf("Unexpected error adding stream: %v", err) 2566 } 2567 2568 // Client for API requests. 
2569 nc, js := jsClientConnect(t, s) 2570 defer nc.Close() 2571 2572 toSend := 200_000 2573 2574 m := nats.NewMsg("foo") 2575 m.Data = make([]byte, 8*1024) 2576 crand.Read(m.Data) 2577 2578 start := time.Now() 2579 for i := 0; i < toSend; i++ { 2580 m.Reply = _EMPTY_ 2581 switch i % 3 { 2582 case 0: 2583 m.Subject = "foo" 2584 case 1: 2585 m.Subject = "bar" 2586 case 2: 2587 m.Subject = "baz" 2588 } 2589 m.Header.Set("X-ID2", fmt.Sprintf("XXXXX-%d", i)) 2590 if _, err := js.PublishMsgAsync(m); err != nil { 2591 t.Fatalf("Err on publish: %v", err) 2592 } 2593 } 2594 <-js.PublishAsyncComplete() 2595 fmt.Printf("TOOK %v to publish\n", time.Since(start)) 2596 2597 v, err := s.Varz(nil) 2598 if err != nil { 2599 t.Fatalf("Unexpected error: %v", err) 2600 } 2601 fmt.Printf("MEM AFTER PUBLISH is %v\n", friendlyBytes(v.Mem)) 2602 2603 si, _ := js.StreamInfo("TEST") 2604 fmt.Printf("si is %+v\n", si.State) 2605 2606 received := 0 2607 done := make(chan bool) 2608 2609 cb := func(m *nats.Msg) { 2610 received++ 2611 if received >= toSend { 2612 done <- true 2613 } 2614 } 2615 2616 start = time.Now() 2617 sub, err := js.Subscribe("*", cb, nats.EnableFlowControl(), nats.IdleHeartbeat(time.Second), nats.AckNone()) 2618 if err != nil { 2619 t.Fatalf("Unexpected error: %v", err) 2620 } 2621 defer sub.Unsubscribe() 2622 <-done 2623 fmt.Printf("TOOK %v to consume\n", time.Since(start)) 2624 2625 v, err = s.Varz(nil) 2626 if err != nil { 2627 t.Fatalf("Unexpected error: %v", err) 2628 } 2629 fmt.Printf("MEM AFTER SUBSCRIBE is %v\n", friendlyBytes(v.Mem)) 2630 } 2631 2632 // Report of slow restart for a server that has many messages that have expired while it was not running. 2633 func TestNoRaceJetStreamSlowRestartWithManyExpiredMsgs(t *testing.T) { 2634 opts := DefaultTestOptions 2635 opts.Port = -1 2636 opts.JetStream = true 2637 s := RunServer(&opts) 2638 if config := s.JetStreamConfig(); config != nil { 2639 defer removeDir(t, config.StoreDir) 2640 } 2641 defer s.Shutdown() 2642 2643 // Client for API requests. 2644 nc, js := jsClientConnect(t, s) 2645 defer nc.Close() 2646 2647 ttl := 2 * time.Second 2648 _, err := js.AddStream(&nats.StreamConfig{ 2649 Name: "ORDERS", 2650 Subjects: []string{"orders.*"}, 2651 MaxAge: ttl, 2652 }) 2653 if err != nil { 2654 t.Fatalf("Unexpected error: %v", err) 2655 } 2656 2657 // Attach a consumer who is filtering on a wildcard subject as well. 2658 // This does not affect it like I thought originally but will keep it here. 2659 _, err = js.AddConsumer("ORDERS", &nats.ConsumerConfig{ 2660 Durable: "c22", 2661 FilterSubject: "orders.*", 2662 AckPolicy: nats.AckExplicitPolicy, 2663 }) 2664 if err != nil { 2665 t.Fatalf("Unexpected error: %v", err) 2666 } 2667 2668 // Now fill up with messages. 2669 toSend := 100_000 2670 for i := 1; i <= toSend; i++ { 2671 js.PublishAsync(fmt.Sprintf("orders.%d", i), []byte("OK")) 2672 } 2673 <-js.PublishAsyncComplete() 2674 2675 sdir := strings.TrimSuffix(s.JetStreamConfig().StoreDir, JetStreamStoreDir) 2676 s.Shutdown() 2677 2678 // Let them expire while not running. 2679 time.Sleep(ttl + 500*time.Millisecond) 2680 2681 start := time.Now() 2682 opts.Port = -1 2683 opts.StoreDir = sdir 2684 s = RunServer(&opts) 2685 elapsed := time.Since(start) 2686 defer s.Shutdown() 2687 2688 if elapsed > 2*time.Second { 2689 t.Fatalf("Took %v for restart which is too long", elapsed) 2690 } 2691 2692 // Check everything is correct. 
2693 nc, js = jsClientConnect(t, s) 2694 defer nc.Close() 2695 2696 si, err := js.StreamInfo("ORDERS") 2697 if err != nil { 2698 t.Fatalf("Unexpected error: %v", err) 2699 } 2700 if si.State.Msgs != 0 { 2701 t.Fatalf("Expected no msgs after restart, got %d", si.State.Msgs) 2702 } 2703 } 2704 2705 func TestNoRaceJetStreamStalledMirrorsAfterExpire(t *testing.T) { 2706 c := createJetStreamClusterExplicit(t, "JSC", 3) 2707 defer c.shutdown() 2708 2709 nc, js := jsClientConnect(t, c.randomServer()) 2710 defer nc.Close() 2711 2712 cfg := &nats.StreamConfig{ 2713 Name: "TEST", 2714 Subjects: []string{"foo.*"}, 2715 Replicas: 1, 2716 MaxAge: 100 * time.Millisecond, 2717 } 2718 2719 if _, err := js.AddStream(cfg); err != nil { 2720 t.Fatalf("Error creating stream: %v", err) 2721 } 2722 2723 if _, err := js.AddStream(&nats.StreamConfig{ 2724 Name: "M", 2725 Replicas: 2, 2726 Mirror: &nats.StreamSource{Name: "TEST"}, 2727 }); err != nil { 2728 t.Fatalf("Unexpected error: %v", err) 2729 } 2730 2731 sendBatch := func(batch int) { 2732 t.Helper() 2733 for i := 0; i < batch; i++ { 2734 js.PublishAsync("foo.bar", []byte("Hello")) 2735 } 2736 select { 2737 case <-js.PublishAsyncComplete(): 2738 case <-time.After(5 * time.Second): 2739 t.Fatalf("Did not receive completion signal") 2740 } 2741 } 2742 2743 numMsgs := 10_000 2744 sendBatch(numMsgs) 2745 2746 // Turn off expiration so we can test we did not stall. 2747 cfg.MaxAge = 0 2748 if _, err := js.UpdateStream(cfg); err != nil { 2749 t.Fatalf("Unexpected error: %v", err) 2750 } 2751 2752 sendBatch(numMsgs) 2753 2754 // Wait for mirror to be caught up. 2755 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 2756 si, err := js.StreamInfo("M") 2757 if err != nil { 2758 t.Fatalf("Unexpected error: %v", err) 2759 } 2760 if si.State.LastSeq != uint64(2*numMsgs) { 2761 return fmt.Errorf("Expected %d as last sequence, got state: %+v", 2*numMsgs, si.State) 2762 } 2763 return nil 2764 }) 2765 } 2766 2767 // We will use JetStream helpers to create supercluster but this test is about exposing the ability to access 2768 // account scoped connz with subject interest filtering. 2769 func TestNoRaceJetStreamSuperClusterAccountConnz(t *testing.T) { 2770 // This has 4 different account, 3 general and system. 2771 sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 3, 3) 2772 defer sc.shutdown() 2773 2774 // Create 20 connections on account one and two 2775 // Create JetStream assets for each as well to make sure by default we do not report them. 
2776 num := 20 2777 for i := 0; i < num; i++ { 2778 nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo("one", "p"), nats.Name("one")) 2779 defer nc.Close() 2780 2781 if i%2 == 0 { 2782 nc.SubscribeSync("foo") 2783 } else { 2784 nc.SubscribeSync("bar") 2785 } 2786 2787 nc, js := jsClientConnect(t, sc.randomServer(), nats.UserInfo("two", "p"), nats.Name("two")) 2788 defer nc.Close() 2789 nc.SubscribeSync("baz") 2790 nc.SubscribeSync("foo.bar.*") 2791 nc.SubscribeSync(fmt.Sprintf("id.%d", i+1)) 2792 2793 js.AddStream(&nats.StreamConfig{Name: fmt.Sprintf("TEST:%d", i+1)}) 2794 } 2795 2796 type czapi struct { 2797 Server *ServerInfo 2798 Data *Connz 2799 Error *ApiError 2800 } 2801 2802 parseConnz := func(buf []byte) *Connz { 2803 t.Helper() 2804 var cz czapi 2805 if err := json.Unmarshal(buf, &cz); err != nil { 2806 t.Fatalf("Unexpected error: %v", err) 2807 } 2808 if cz.Error != nil { 2809 t.Fatalf("Unexpected error: %+v", cz.Error) 2810 } 2811 return cz.Data 2812 } 2813 2814 doRequest := func(reqSubj, acc, filter string, expected int) { 2815 t.Helper() 2816 nc, _ := jsClientConnect(t, sc.randomServer(), nats.UserInfo(acc, "p"), nats.Name(acc)) 2817 defer nc.Close() 2818 2819 mch := make(chan *nats.Msg, 9) 2820 sub, _ := nc.ChanSubscribe(nats.NewInbox(), mch) 2821 2822 var req []byte 2823 if filter != _EMPTY_ { 2824 req, _ = json.Marshal(&ConnzOptions{FilterSubject: filter}) 2825 } 2826 2827 if err := nc.PublishRequest(reqSubj, sub.Subject, req); err != nil { 2828 t.Fatalf("Unexpected error: %v", err) 2829 } 2830 2831 // So we can ignore ourselves. 2832 cid, _ := nc.GetClientID() 2833 sid := nc.ConnectedServerId() 2834 2835 wt := time.NewTimer(200 * time.Millisecond) 2836 var conns []*ConnInfo 2837 LOOP: 2838 for { 2839 select { 2840 case m := <-mch: 2841 if len(m.Data) == 0 { 2842 t.Fatalf("No responders") 2843 } 2844 cr := parseConnz(m.Data) 2845 // For account scoped, NumConns and Total should be the same (sans limits and offsets). 2846 // The Total should not include other accounts since that would leak information about the system. 2847 if filter == _EMPTY_ && cr.NumConns != cr.Total { 2848 t.Fatalf("NumConns and Total should be same with account scoped connz, got %+v", cr) 2849 } 2850 for _, c := range cr.Conns { 2851 if c.Name != acc { 2852 t.Fatalf("Got wrong account: %q vs %q for %+v", acc, c.Account, c) 2853 } 2854 if !(c.Cid == cid && cr.ID == sid) { 2855 conns = append(conns, c) 2856 } 2857 } 2858 wt.Reset(200 * time.Millisecond) 2859 case <-wt.C: 2860 break LOOP 2861 } 2862 } 2863 if len(conns) != expected { 2864 t.Fatalf("Expected to see %d conns but got %d", expected, len(conns)) 2865 } 2866 } 2867 2868 doSysRequest := func(acc string, expected int) { 2869 t.Helper() 2870 doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, _EMPTY_, expected) 2871 } 2872 doAccRequest := func(acc string, expected int) { 2873 t.Helper() 2874 doRequest("$SYS.REQ.ACCOUNT.PING.CONNZ", acc, _EMPTY_, expected) 2875 } 2876 doFiltered := func(acc, filter string, expected int) { 2877 t.Helper() 2878 doRequest("$SYS.REQ.SERVER.PING.CONNZ", acc, filter, expected) 2879 } 2880 2881 doSysRequest("one", 20) 2882 doAccRequest("one", 20) 2883 2884 doSysRequest("two", 20) 2885 doAccRequest("two", 20) 2886 2887 // Now check filtering. 
2888 doFiltered("one", _EMPTY_, 20) 2889 doFiltered("one", ">", 20) 2890 doFiltered("one", "bar", 10) 2891 doFiltered("two", "bar", 0) 2892 doFiltered("two", "id.1", 1) 2893 doFiltered("two", "id.*", 20) 2894 doFiltered("two", "foo.bar.*", 20) 2895 doFiltered("two", "foo.>", 20) 2896 } 2897 2898 func TestNoRaceCompressedConnz(t *testing.T) { 2899 s := RunBasicJetStreamServer(t) 2900 defer s.Shutdown() 2901 2902 nc, _ := jsClientConnect(t, s) 2903 defer nc.Close() 2904 2905 doRequest := func(compress string) { 2906 t.Helper() 2907 m := nats.NewMsg("$SYS.REQ.ACCOUNT.PING.CONNZ") 2908 m.Header.Add("Accept-Encoding", compress) 2909 resp, err := nc.RequestMsg(m, time.Second) 2910 if err != nil { 2911 t.Fatalf("Unexpected error: %v", err) 2912 } 2913 buf := resp.Data 2914 2915 // Make sure we have an encoding header. 2916 ce := resp.Header.Get("Content-Encoding") 2917 switch strings.ToLower(ce) { 2918 case "gzip": 2919 zr, err := gzip.NewReader(bytes.NewReader(buf)) 2920 if err != nil { 2921 t.Fatalf("Unexpected error: %v", err) 2922 } 2923 defer zr.Close() 2924 buf, err = io.ReadAll(zr) 2925 if err != nil && err != io.ErrUnexpectedEOF { 2926 t.Fatalf("Unexpected error: %v", err) 2927 } 2928 case "snappy", "s2": 2929 sr := s2.NewReader(bytes.NewReader(buf)) 2930 buf, err = io.ReadAll(sr) 2931 if err != nil && err != io.ErrUnexpectedEOF { 2932 t.Fatalf("Unexpected error: %v", err) 2933 } 2934 default: 2935 t.Fatalf("Unknown content-encoding of %q", ce) 2936 } 2937 2938 var cz ServerAPIConnzResponse 2939 if err := json.Unmarshal(buf, &cz); err != nil { 2940 t.Fatalf("Unexpected error: %v", err) 2941 } 2942 if cz.Error != nil { 2943 t.Fatalf("Unexpected error: %+v", cz.Error) 2944 } 2945 } 2946 2947 doRequest("gzip") 2948 doRequest("snappy") 2949 doRequest("s2") 2950 } 2951 2952 func TestNoRaceJetStreamClusterExtendedStreamPurge(t *testing.T) { 2953 for _, st := range []StorageType{FileStorage, MemoryStorage} { 2954 t.Run(st.String(), func(t *testing.T) { 2955 c := createJetStreamClusterExplicit(t, "JSC", 3) 2956 defer c.shutdown() 2957 2958 nc, js := jsClientConnect(t, c.randomServer()) 2959 defer nc.Close() 2960 2961 cfg := StreamConfig{ 2962 Name: "KV", 2963 Subjects: []string{"kv.>"}, 2964 Storage: st, 2965 Replicas: 2, 2966 MaxMsgsPer: 100, 2967 } 2968 req, err := json.Marshal(cfg) 2969 if err != nil { 2970 t.Fatalf("Unexpected error: %v", err) 2971 } 2972 // Do manually for now. 
2973 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 2974 c.waitOnStreamLeader("$G", "KV") 2975 2976 si, err := js.StreamInfo("KV") 2977 if err != nil { 2978 t.Fatalf("Unexpected error: %v", err) 2979 } 2980 if si == nil || si.Config.Name != "KV" { 2981 t.Fatalf("StreamInfo is not correct %+v", si) 2982 } 2983 2984 for i := 0; i < 1000; i++ { 2985 js.PublishAsync("kv.foo", []byte("OK")) // 1 * i 2986 js.PublishAsync("kv.bar", []byte("OK")) // 2 * i 2987 js.PublishAsync("kv.baz", []byte("OK")) // 3 * i 2988 } 2989 // First is 2700, last is 3000 2990 for i := 0; i < 700; i++ { 2991 js.PublishAsync(fmt.Sprintf("kv.%d", i+1), []byte("OK")) 2992 } 2993 // Now first is 2700, last is 3700 2994 select { 2995 case <-js.PublishAsyncComplete(): 2996 case <-time.After(10 * time.Second): 2997 t.Fatalf("Did not receive completion signal") 2998 } 2999 3000 si, err = js.StreamInfo("KV") 3001 if err != nil { 3002 t.Fatalf("Unexpected error: %v", err) 3003 } 3004 if si.State.Msgs != 1000 { 3005 t.Fatalf("Expected %d msgs, got %d", 1000, si.State.Msgs) 3006 } 3007 3008 shouldFail := func(preq *JSApiStreamPurgeRequest) { 3009 req, _ := json.Marshal(preq) 3010 resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second) 3011 if err != nil { 3012 t.Fatalf("Unexpected error: %v", err) 3013 } 3014 var pResp JSApiStreamPurgeResponse 3015 if err = json.Unmarshal(resp.Data, &pResp); err != nil { 3016 t.Fatalf("Unexpected error: %v", err) 3017 } 3018 if pResp.Success || pResp.Error == nil { 3019 t.Fatalf("Expected an error response but got none") 3020 } 3021 } 3022 3023 // Sequence and Keep should be mutually exclusive. 3024 shouldFail(&JSApiStreamPurgeRequest{Sequence: 10, Keep: 10}) 3025 3026 purge := func(preq *JSApiStreamPurgeRequest, newTotal uint64) { 3027 t.Helper() 3028 req, _ := json.Marshal(preq) 3029 resp, err := nc.Request(fmt.Sprintf(JSApiStreamPurgeT, "KV"), req, time.Second) 3030 if err != nil { 3031 t.Fatalf("Unexpected error: %v", err) 3032 } 3033 var pResp JSApiStreamPurgeResponse 3034 if err = json.Unmarshal(resp.Data, &pResp); err != nil { 3035 t.Fatalf("Unexpected error: %v", err) 3036 } 3037 if !pResp.Success || pResp.Error != nil { 3038 t.Fatalf("Got a bad response %+v", pResp) 3039 } 3040 si, err = js.StreamInfo("KV") 3041 if err != nil { 3042 t.Fatalf("Unexpected error: %v", err) 3043 } 3044 if si.State.Msgs != newTotal { 3045 t.Fatalf("Expected total after purge to be %d but got %d", newTotal, si.State.Msgs) 3046 } 3047 } 3048 expectLeft := func(subject string, expected uint64) { 3049 t.Helper() 3050 ci, err := js.AddConsumer("KV", &nats.ConsumerConfig{Durable: "dlc", FilterSubject: subject, AckPolicy: nats.AckExplicitPolicy}) 3051 if err != nil { 3052 t.Fatalf("Unexpected error: %v", err) 3053 } 3054 defer js.DeleteConsumer("KV", "dlc") 3055 if ci.NumPending != expected { 3056 t.Fatalf("Expected %d remaining but got %d", expected, ci.NumPending) 3057 } 3058 } 3059 3060 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo"}, 900) 3061 expectLeft("kv.foo", 0) 3062 3063 purge(&JSApiStreamPurgeRequest{Subject: "kv.bar", Keep: 1}, 801) 3064 expectLeft("kv.bar", 1) 3065 3066 purge(&JSApiStreamPurgeRequest{Subject: "kv.baz", Sequence: 2851}, 751) 3067 expectLeft("kv.baz", 50) 3068 3069 purge(&JSApiStreamPurgeRequest{Subject: "kv.*"}, 0) 3070 3071 // RESET 3072 js.DeleteStream("KV") 3073 // Do manually for now. 
3074 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3075 c.waitOnStreamLeader("$G", "KV") 3076 3077 if _, err := js.StreamInfo("KV"); err != nil { 3078 t.Fatalf("Unexpected error: %v", err) 3079 } 3080 // Put in 100. 3081 for i := 0; i < 100; i++ { 3082 js.PublishAsync("kv.foo", []byte("OK")) 3083 } 3084 select { 3085 case <-js.PublishAsyncComplete(): 3086 case <-time.After(time.Second): 3087 t.Fatalf("Did not receive completion signal") 3088 } 3089 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10) 3090 purge(&JSApiStreamPurgeRequest{Subject: "kv.foo", Keep: 10}, 10) 3091 expectLeft("kv.foo", 10) 3092 3093 // RESET AGAIN 3094 js.DeleteStream("KV") 3095 // Do manually for now. 3096 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3097 c.waitOnStreamLeader("$G", "KV") 3098 3099 if _, err := js.StreamInfo("KV"); err != nil { 3100 t.Fatalf("Unexpected error: %v", err) 3101 } 3102 // Put in 100. 3103 for i := 0; i < 100; i++ { 3104 js.Publish("kv.foo", []byte("OK")) 3105 } 3106 purge(&JSApiStreamPurgeRequest{Keep: 10}, 10) 3107 expectLeft(">", 10) 3108 3109 // RESET AGAIN 3110 js.DeleteStream("KV") 3111 // Do manually for now. 3112 nc.Request(fmt.Sprintf(JSApiStreamCreateT, cfg.Name), req, time.Second) 3113 if _, err := js.StreamInfo("KV"); err != nil { 3114 t.Fatalf("Unexpected error: %v", err) 3115 } 3116 // Put in 100. 3117 for i := 0; i < 100; i++ { 3118 js.Publish("kv.foo", []byte("OK")) 3119 } 3120 purge(&JSApiStreamPurgeRequest{Sequence: 90}, 11) // Up to 90 so we keep that, hence the 11. 3121 expectLeft(">", 11) 3122 }) 3123 } 3124 } 3125 3126 func TestNoRaceJetStreamFileStoreCompaction(t *testing.T) { 3127 s := RunBasicJetStreamServer(t) 3128 defer s.Shutdown() 3129 3130 nc, js := jsClientConnect(t, s) 3131 defer nc.Close() 3132 3133 cfg := &nats.StreamConfig{ 3134 Name: "KV", 3135 Subjects: []string{"KV.>"}, 3136 MaxMsgsPerSubject: 1, 3137 } 3138 if _, err := js.AddStream(cfg); err != nil { 3139 t.Fatalf("Unexpected error: %v", err) 3140 } 3141 3142 toSend := 10_000 3143 data := make([]byte, 4*1024) 3144 crand.Read(data) 3145 3146 // First one. 3147 js.PublishAsync("KV.FM", data) 3148 3149 for i := 0; i < toSend; i++ { 3150 js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) 3151 } 3152 // Do again and overwrite the previous batch. 3153 for i := 0; i < toSend; i++ { 3154 js.PublishAsync(fmt.Sprintf("KV.%d", i+1), data) 3155 } 3156 select { 3157 case <-js.PublishAsyncComplete(): 3158 case <-time.After(10 * time.Second): 3159 t.Fatalf("Did not receive completion signal") 3160 } 3161 3162 // Now check by hand the utilization level. 
3163 mset, err := s.GlobalAccount().lookupStream("KV") 3164 if err != nil { 3165 t.Fatalf("Unexpected error: %v", err) 3166 } 3167 total, used, _ := mset.Store().Utilization() 3168 if pu := 100.0 * float32(used) / float32(total); pu < 80.0 { 3169 t.Fatalf("Utilization is less than 80%%, got %.2f", pu) 3170 } 3171 } 3172 3173 func TestNoRaceJetStreamEncryptionEnabledOnRestartWithExpire(t *testing.T) { 3174 conf := createConfFile(t, []byte(fmt.Sprintf(` 3175 listen: 127.0.0.1:-1 3176 jetstream { 3177 store_dir = %q 3178 } 3179 `, t.TempDir()))) 3180 3181 s, _ := RunServerWithConfig(conf) 3182 defer s.Shutdown() 3183 3184 config := s.JetStreamConfig() 3185 if config == nil { 3186 t.Fatalf("Expected config but got none") 3187 } 3188 defer removeDir(t, config.StoreDir) 3189 3190 nc, js := jsClientConnect(t, s) 3191 defer nc.Close() 3192 3193 toSend := 10_000 3194 3195 cfg := &nats.StreamConfig{ 3196 Name: "TEST", 3197 Subjects: []string{"foo", "bar"}, 3198 MaxMsgs: int64(toSend), 3199 } 3200 if _, err := js.AddStream(cfg); err != nil { 3201 t.Fatalf("Unexpected error: %v", err) 3202 } 3203 3204 data := make([]byte, 4*1024) // 4K payload 3205 crand.Read(data) 3206 3207 for i := 0; i < toSend; i++ { 3208 js.PublishAsync("foo", data) 3209 js.PublishAsync("bar", data) 3210 } 3211 select { 3212 case <-js.PublishAsyncComplete(): 3213 case <-time.After(5 * time.Second): 3214 t.Fatalf("Did not receive completion signal") 3215 } 3216 3217 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "dlc", AckPolicy: nats.AckExplicitPolicy}) 3218 if err != nil { 3219 t.Fatalf("Unexpected error: %v", err) 3220 } 3221 3222 // Restart 3223 nc.Close() 3224 s.Shutdown() 3225 3226 ncs := fmt.Sprintf("\nlisten: 127.0.0.1:-1\njetstream: {key: %q, store_dir: %q}\n", "s3cr3t!", config.StoreDir) 3227 conf = createConfFile(t, []byte(ncs)) 3228 3229 // Try to drain entropy to see if it affects startup time. 3230 drain := make([]byte, 32*1024*1024) // Pull 32MB of crypto rand. 3231 crand.Read(drain) 3232 3233 start := time.Now() 3234 s, _ = RunServerWithConfig(conf) 3235 defer s.Shutdown() 3236 dd := time.Since(start) 3237 if dd > 5*time.Second { 3238 t.Fatalf("Restart took longer than expected: %v", dd) 3239 } 3240 } 3241 3242 // This test was from Ivan K. and showed a bug in the filestore implementation. 3243 // This is skipped by default since it takes >40s to run. 3244 func TestNoRaceJetStreamOrderedConsumerMissingMsg(t *testing.T) { 3245 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
3246 skip(t) 3247 3248 s := RunBasicJetStreamServer(t) 3249 defer s.Shutdown() 3250 3251 nc, js := jsClientConnect(t, s) 3252 defer nc.Close() 3253 3254 if _, err := js.AddStream(&nats.StreamConfig{ 3255 Name: "benchstream", 3256 Subjects: []string{"testsubject"}, 3257 Replicas: 1, 3258 }); err != nil { 3259 t.Fatalf("add stream failed: %s", err) 3260 } 3261 3262 total := 1_000_000 3263 3264 numSubs := 10 3265 ch := make(chan struct{}, numSubs) 3266 wg := sync.WaitGroup{} 3267 wg.Add(numSubs) 3268 errCh := make(chan error, 1) 3269 for i := 0; i < numSubs; i++ { 3270 nc, js := jsClientConnect(t, s) 3271 defer nc.Close() 3272 go func(nc *nats.Conn, js nats.JetStreamContext) { 3273 defer wg.Done() 3274 received := 0 3275 _, err := js.Subscribe("testsubject", func(m *nats.Msg) { 3276 meta, _ := m.Metadata() 3277 if meta.Sequence.Consumer != meta.Sequence.Stream { 3278 nc.Close() 3279 errCh <- fmt.Errorf("Bad meta: %+v", meta) 3280 } 3281 received++ 3282 if received == total { 3283 ch <- struct{}{} 3284 } 3285 }, nats.OrderedConsumer()) 3286 if err != nil { 3287 select { 3288 case errCh <- fmt.Errorf("Error creating sub: %v", err): 3289 default: 3290 } 3291 3292 } 3293 }(nc, js) 3294 } 3295 wg.Wait() 3296 select { 3297 case e := <-errCh: 3298 t.Fatal(e) 3299 default: 3300 } 3301 3302 payload := make([]byte, 500) 3303 for i := 1; i <= total; i++ { 3304 js.PublishAsync("testsubject", payload) 3305 } 3306 select { 3307 case <-js.PublishAsyncComplete(): 3308 case <-time.After(10 * time.Second): 3309 t.Fatalf("Did not send all messages") 3310 } 3311 3312 // Now wait for consumers to be done: 3313 for i := 0; i < numSubs; i++ { 3314 select { 3315 case <-ch: 3316 case <-time.After(10 * time.Second): 3317 t.Fatal("Did not receive all messages for all consumers in time") 3318 } 3319 } 3320 3321 } 3322 3323 // Issue #2488 - Bad accounting, can not reproduce the stalled consumers after last several PRs. 3324 // Issue did show bug in ack logic for no-ack and interest based retention. 3325 func TestNoRaceJetStreamClusterInterestPolicyAckNone(t *testing.T) { 3326 for _, test := range []struct { 3327 name string 3328 durable string 3329 }{ 3330 {"durable", "dlc"}, 3331 {"ephemeral", _EMPTY_}, 3332 } { 3333 t.Run(test.name, func(t *testing.T) { 3334 c := createJetStreamClusterExplicit(t, "R3S", 3) 3335 defer c.shutdown() 3336 3337 // Client based API 3338 nc, js := jsClientConnect(t, c.randomServer()) 3339 defer nc.Close() 3340 3341 _, err := js.AddStream(&nats.StreamConfig{ 3342 Name: "cluster", 3343 Subjects: []string{"cluster.*"}, 3344 Retention: nats.InterestPolicy, 3345 Discard: nats.DiscardOld, 3346 Replicas: 3, 3347 }) 3348 if err != nil { 3349 t.Fatalf("Unexpected error: %v", err) 3350 } 3351 3352 var received uint32 3353 mh := func(m *nats.Msg) { 3354 atomic.AddUint32(&received, 1) 3355 } 3356 3357 opts := []nats.SubOpt{nats.DeliverNew(), nats.AckNone()} 3358 if test.durable != _EMPTY_ { 3359 opts = append(opts, nats.Durable(test.durable)) 3360 } 3361 _, err = js.Subscribe("cluster.created", mh, opts...) 3362 if err != nil { 3363 t.Fatalf("Unexpected error: %v", err) 3364 } 3365 3366 msg := []byte("ACK ME") 3367 const total = uint32(1_000) 3368 for i := 0; i < int(total); i++ { 3369 if _, err := js.Publish("cluster.created", msg); err != nil { 3370 t.Fatalf("Unexpected error: %v", err) 3371 } 3372 //time.Sleep(100 * time.Microsecond) 3373 } 3374 3375 // Wait for all messages to be received. 
3376 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 3377 r := atomic.LoadUint32(&received) 3378 if r == total { 3379 return nil 3380 } 3381 return fmt.Errorf("Received only %d out of %d", r, total) 3382 }) 3383 3384 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3385 si, err := js.StreamInfo("cluster") 3386 if err != nil { 3387 t.Fatalf("Error getting stream info: %v", err) 3388 } 3389 if si.State.Msgs != 0 { 3390 return fmt.Errorf("Expected no messages, got %d", si.State.Msgs) 3391 } 3392 return nil 3393 }) 3394 }) 3395 } 3396 } 3397 3398 // There was a bug in the filestore compact code that would cause a store 3399 // with JSExpectedLastSubjSeq to fail with "wrong last sequence: 0" 3400 func TestNoRaceJetStreamLastSubjSeqAndFilestoreCompact(t *testing.T) { 3401 s := RunBasicJetStreamServer(t) 3402 defer s.Shutdown() 3403 3404 // Client based API 3405 nc, js := jsClientConnect(t, s) 3406 defer nc.Close() 3407 3408 _, err := js.AddStream(&nats.StreamConfig{ 3409 Name: "MQTT_sess", 3410 Subjects: []string{"MQTT.sess.>"}, 3411 Storage: nats.FileStorage, 3412 Retention: nats.LimitsPolicy, 3413 Replicas: 1, 3414 MaxMsgsPerSubject: 1, 3415 }) 3416 if err != nil { 3417 t.Fatalf("Unexpected error: %v", err) 3418 } 3419 3420 firstPayload := make([]byte, 40) 3421 secondPayload := make([]byte, 380) 3422 for iter := 0; iter < 2; iter++ { 3423 for i := 0; i < 4000; i++ { 3424 subj := "MQTT.sess." + getHash(fmt.Sprintf("client_%d", i)) 3425 pa, err := js.Publish(subj, firstPayload) 3426 if err != nil { 3427 t.Fatalf("Error on publish: %v", err) 3428 } 3429 m := nats.NewMsg(subj) 3430 m.Data = secondPayload 3431 eseq := strconv.FormatInt(int64(pa.Sequence), 10) 3432 m.Header.Set(JSExpectedLastSubjSeq, eseq) 3433 if _, err := js.PublishMsg(m); err != nil { 3434 t.Fatalf("Error on publish (iter=%v seq=%v): %v", iter+1, pa.Sequence, err) 3435 } 3436 } 3437 } 3438 } 3439 3440 // Issue #2548 3441 func TestNoRaceJetStreamClusterMemoryStreamConsumerRaftGrowth(t *testing.T) { 3442 c := createJetStreamClusterExplicit(t, "R3S", 3) 3443 defer c.shutdown() 3444 3445 nc, js := jsClientConnect(t, c.randomServer()) 3446 defer nc.Close() 3447 3448 _, err := js.AddStream(&nats.StreamConfig{ 3449 Name: "memory-leak", 3450 Subjects: []string{"memory-leak"}, 3451 Retention: nats.LimitsPolicy, 3452 MaxMsgs: 1000, 3453 Discard: nats.DiscardOld, 3454 MaxAge: time.Minute, 3455 Storage: nats.MemoryStorage, 3456 Replicas: 3, 3457 }) 3458 if err != nil { 3459 t.Fatalf("Unexpected error: %v", err) 3460 } 3461 3462 _, err = js.QueueSubscribe("memory-leak", "q1", func(msg *nats.Msg) { 3463 time.Sleep(1 * time.Second) 3464 msg.AckSync() 3465 }) 3466 if err != nil { 3467 t.Fatalf("Unexpected error: %v", err) 3468 } 3469 3470 // Send 10k (Must be > 8192 which is compactNumMin from monitorConsumer. 
3471 msg := []byte("NATS is a connective technology that powers modern distributed systems.") 3472 for i := 0; i < 10_000; i++ { 3473 if _, err := js.Publish("memory-leak", msg); err != nil { 3474 t.Fatalf("Unexpected error: %v", err) 3475 } 3476 } 3477 3478 // We will verify here that the underlying raft layer for the leader is not > 8192 3479 cl := c.consumerLeader("$G", "memory-leak", "q1") 3480 mset, err := cl.GlobalAccount().lookupStream("memory-leak") 3481 if err != nil { 3482 t.Fatalf("Unexpected error: %v", err) 3483 } 3484 o := mset.lookupConsumer("q1") 3485 if o == nil { 3486 t.Fatalf("Error looking up consumer %q", "q1") 3487 } 3488 node := o.raftNode().(*raft) 3489 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 3490 if ms := node.wal.(*memStore); ms.State().Msgs > 8192 { 3491 return fmt.Errorf("Did not compact the raft memory WAL") 3492 } 3493 return nil 3494 }) 3495 } 3496 3497 func TestNoRaceJetStreamClusterCorruptWAL(t *testing.T) { 3498 c := createJetStreamClusterExplicit(t, "R3S", 3) 3499 defer c.shutdown() 3500 3501 nc, js := jsClientConnect(t, c.randomServer()) 3502 defer nc.Close() 3503 3504 if _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Subjects: []string{"foo"}, Replicas: 3}); err != nil { 3505 t.Fatalf("Unexpected error: %v", err) 3506 } 3507 3508 sub, err := js.PullSubscribe("foo", "dlc") 3509 if err != nil { 3510 t.Fatalf("Unexpected error: %v", err) 3511 } 3512 3513 numMsgs := 1000 3514 for i := 0; i < numMsgs; i++ { 3515 js.PublishAsync("foo", []byte("WAL")) 3516 } 3517 select { 3518 case <-js.PublishAsyncComplete(): 3519 case <-time.After(5 * time.Second): 3520 t.Fatalf("Did not receive completion signal") 3521 } 3522 3523 for i, m := range fetchMsgs(t, sub, 200, 5*time.Second) { 3524 // Ack first 50 and every other even on after that.. 3525 if i < 50 || i%2 == 1 { 3526 m.AckSync() 3527 } 3528 } 3529 // Make sure acks processed. 3530 time.Sleep(200 * time.Millisecond) 3531 nc.Close() 3532 3533 // Check consumer consistency. 3534 checkConsumerWith := func(delivered, ackFloor uint64, ackPending int) { 3535 t.Helper() 3536 nc, js := jsClientConnect(t, c.randomServer()) 3537 defer nc.Close() 3538 3539 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3540 ci, err := js.ConsumerInfo("TEST", "dlc") 3541 if err != nil { 3542 return fmt.Errorf("Unexpected error: %v", err) 3543 } 3544 if ci.Delivered.Consumer != ci.Delivered.Stream || ci.Delivered.Consumer != delivered { 3545 return fmt.Errorf("Expected %d for delivered, got %+v", delivered, ci.Delivered) 3546 } 3547 if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != ackFloor { 3548 return fmt.Errorf("Expected %d for ack floor, got %+v", ackFloor, ci.AckFloor) 3549 } 3550 nm := uint64(numMsgs) 3551 if ci.NumPending != nm-delivered { 3552 return fmt.Errorf("Expected num pending to be %d, got %d", nm-delivered, ci.NumPending) 3553 } 3554 if ci.NumAckPending != ackPending { 3555 return fmt.Errorf("Expected num ack pending to be %d, got %d", ackPending, ci.NumAckPending) 3556 } 3557 return nil 3558 }) 3559 } 3560 3561 checkConsumer := func() { 3562 t.Helper() 3563 checkConsumerWith(200, 50, 75) 3564 } 3565 3566 checkConsumer() 3567 3568 // Grab the consumer leader. 
3569 cl := c.consumerLeader("$G", "TEST", "dlc") 3570 mset, err := cl.GlobalAccount().lookupStream("TEST") 3571 if err != nil { 3572 t.Fatalf("Unexpected error: %v", err) 3573 } 3574 o := mset.lookupConsumer("dlc") 3575 if o == nil { 3576 t.Fatalf("Error looking up consumer %q", "dlc") 3577 } 3578 // Grab underlying raft node and the WAL (filestore) and we will attempt to "corrupt" it. 3579 node := o.raftNode().(*raft) 3580 // We are doing a stop here to prevent the internal consumer snapshot from happening on exit 3581 node.Stop() 3582 fs := node.wal.(*fileStore) 3583 fcfg, cfg := fs.fcfg, fs.cfg.StreamConfig 3584 // Stop all the servers. 3585 c.stopAll() 3586 3587 // Manipulate directly with cluster down. 3588 fs, err = newFileStore(fcfg, cfg) 3589 if err != nil { 3590 t.Fatalf("Unexpected error: %v", err) 3591 } 3592 state := fs.State() 3593 sm, err := fs.LoadMsg(state.LastSeq, nil) 3594 if err != nil { 3595 t.Fatalf("Unexpected error: %v", err) 3596 } 3597 ae, err := node.decodeAppendEntry(sm.msg, nil, _EMPTY_) 3598 if err != nil { 3599 t.Fatalf("Unexpected error: %v", err) 3600 } 3601 3602 dentry := func(dseq, sseq, dc uint64, ts int64) []byte { 3603 b := make([]byte, 4*binary.MaxVarintLen64+1) 3604 b[0] = byte(updateDeliveredOp) 3605 n := 1 3606 n += binary.PutUvarint(b[n:], dseq) 3607 n += binary.PutUvarint(b[n:], sseq) 3608 n += binary.PutUvarint(b[n:], dc) 3609 n += binary.PutVarint(b[n:], ts) 3610 return b[:n] 3611 } 3612 3613 // Let's put a non-contiguous AppendEntry into the system. 3614 ae.pindex += 10 3615 // Add in delivered record. 3616 ae.entries = []*Entry{{EntryNormal, dentry(1000, 1000, 1, time.Now().UnixNano())}} 3617 encoded, err := ae.encode(nil) 3618 if err != nil { 3619 t.Fatalf("Unexpected error: %v", err) 3620 } 3621 if _, _, err := fs.StoreMsg(_EMPTY_, nil, encoded); err != nil { 3622 t.Fatalf("Unexpected error: %v", err) 3623 } 3624 fs.Stop() 3625 3626 c.restartAllSamePorts() 3627 c.waitOnStreamLeader("$G", "TEST") 3628 c.waitOnConsumerLeader("$G", "TEST", "dlc") 3629 3630 checkConsumer() 3631 3632 // Now we will truncate the WAL out from underneath the leader. 3633 // Grab the consumer leader. 3634 3635 nc, js = jsClientConnect(t, c.randomServer()) 3636 defer nc.Close() 3637 3638 cl = c.consumerLeader("$G", "TEST", "dlc") 3639 mset, err = cl.GlobalAccount().lookupStream("TEST") 3640 require_NoError(t, err) 3641 o = mset.lookupConsumer("dlc") 3642 require_NoError(t, err) 3643 3644 // Grab underlying raft node and the WAL (filestore) and truncate it. 3645 // This will simulate the WAL losing state due to truncate and we want to make sure it recovers. 3646 3647 fs = o.raftNode().(*raft).wal.(*fileStore) 3648 state = fs.State() 3649 err = fs.Truncate(state.FirstSeq) 3650 require_True(t, err == nil || err == ErrInvalidSequence) 3651 state = fs.State() 3652 3653 sub, err = js.PullSubscribe("foo", "dlc") 3654 require_NoError(t, err) 3655 3656 // This will cause us to stepdown and truncate our WAL. 3657 sub.Fetch(100) 3658 c.waitOnConsumerLeader("$G", "TEST", "dlc") 3659 // We can't trust the results here beyond having a leader back in place and the ackFloor. 
3660 ci, err := js.ConsumerInfo("TEST", "dlc") 3661 require_NoError(t, err) 3662 if ci.AckFloor.Consumer != ci.AckFloor.Stream || ci.AckFloor.Consumer != 50 { 3663 t.Fatalf("Expected %d for ack floor, got %+v", 50, ci.AckFloor) 3664 } 3665 } 3666 3667 func TestNoRaceJetStreamClusterInterestRetentionDeadlock(t *testing.T) { 3668 c := createJetStreamClusterExplicit(t, "R3S", 3) 3669 defer c.shutdown() 3670 3671 // Client based API 3672 s := c.randomServer() 3673 nc, js := jsClientConnect(t, s) 3674 defer nc.Close() 3675 3676 // This can trigger deadlock with the current architecture. 3677 // If the stream is !limitsRetention and the consumer is DIRECT and ack none we will try to place the msg seq 3678 // onto a chan for the stream to consider removing. All conditions above must hold to trigger. 3679 3680 // We will attempt to trigger here with a stream mirror setup which uses an R=1 DIRECT consumer to replicate msgs. 3681 _, err := js.AddStream(&nats.StreamConfig{Name: "S", Retention: nats.InterestPolicy, Storage: nats.MemoryStorage}) 3682 if err != nil { 3683 t.Fatalf("Unexpected error: %v", err) 3684 } 3685 3686 // Create a mirror which will create the consumer profile to trigger. 3687 _, err = js.AddStream(&nats.StreamConfig{Name: "M", Mirror: &nats.StreamSource{Name: "S"}}) 3688 if err != nil { 3689 t.Fatalf("Unexpected error: %v", err) 3690 } 3691 3692 // Queue up a lot of messages. 3693 numRequests := 20_000 3694 for i := 0; i < numRequests; i++ { 3695 js.PublishAsync("S", []byte("Q")) 3696 } 3697 select { 3698 case <-js.PublishAsyncComplete(): 3699 case <-time.After(5 * time.Second): 3700 t.Fatalf("Did not receive completion signal") 3701 } 3702 3703 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3704 si, err := js.StreamInfo("S") 3705 if err != nil { 3706 t.Fatalf("Unexpected error: %v", err) 3707 } 3708 if si.State.Msgs != 0 { 3709 return fmt.Errorf("Expected 0 msgs, got state: %+v", si.State) 3710 } 3711 return nil 3712 }) 3713 } 3714 3715 func TestNoRaceJetStreamClusterMaxConsumersAndDirect(t *testing.T) { 3716 c := createJetStreamClusterExplicit(t, "R3S", 3) 3717 defer c.shutdown() 3718 3719 // Client based API 3720 s := c.randomServer() 3721 nc, js := jsClientConnect(t, s) 3722 defer nc.Close() 3723 3724 // We want to make sure max consumer limits do not affect mirrors or sources etc. 3725 _, err := js.AddStream(&nats.StreamConfig{Name: "S", Storage: nats.MemoryStorage, MaxConsumers: 1}) 3726 if err != nil { 3727 t.Fatalf("Unexpected error: %v", err) 3728 } 3729 3730 var mirrors []string 3731 for i := 0; i < 10; i++ { 3732 // Create a mirror. 3733 mname := fmt.Sprintf("M-%d", i+1) 3734 mirrors = append(mirrors, mname) 3735 _, err = js.AddStream(&nats.StreamConfig{Name: mname, Mirror: &nats.StreamSource{Name: "S"}}) 3736 if err != nil { 3737 t.Fatalf("Unexpected error: %v", err) 3738 } 3739 } 3740 3741 // Queue up messages. 3742 numRequests := 20 3743 for i := 0; i < numRequests; i++ { 3744 js.Publish("S", []byte("Q")) 3745 } 3746 3747 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3748 for _, mname := range mirrors { 3749 si, err := js.StreamInfo(mname) 3750 if err != nil { 3751 t.Fatalf("Unexpected error: %v", err) 3752 } 3753 if si.State.Msgs != uint64(numRequests) { 3754 return fmt.Errorf("Expected %d msgs for %q, got state: %+v", numRequests, mname, si.State) 3755 } 3756 } 3757 return nil 3758 }) 3759 } 3760 3761 // Make sure when we try to hard reset a stream state in a cluster that we also re-create the consumers. 
3762 func TestNoRaceJetStreamClusterStreamReset(t *testing.T) { 3763 // Speed up raft 3764 omin, omax, ohb := minElectionTimeout, maxElectionTimeout, hbInterval 3765 minElectionTimeout = 250 * time.Millisecond 3766 maxElectionTimeout = time.Second 3767 hbInterval = 50 * time.Millisecond 3768 defer func() { 3769 minElectionTimeout = omin 3770 maxElectionTimeout = omax 3771 hbInterval = ohb 3772 }() 3773 3774 c := createJetStreamClusterExplicit(t, "R3S", 3) 3775 defer c.shutdown() 3776 3777 // Client based API 3778 s := c.randomServer() 3779 nc, js := jsClientConnect(t, s) 3780 defer nc.Close() 3781 3782 _, err := js.AddStream(&nats.StreamConfig{ 3783 Name: "TEST", 3784 Subjects: []string{"foo.*"}, 3785 Replicas: 2, 3786 Retention: nats.WorkQueuePolicy, 3787 }) 3788 if err != nil { 3789 t.Fatalf("Unexpected error: %v", err) 3790 } 3791 3792 numRequests := 20 3793 for i := 0; i < numRequests; i++ { 3794 js.Publish("foo.created", []byte("REQ")) 3795 } 3796 3797 // Durable. 3798 sub, err := js.SubscribeSync("foo.created", nats.Durable("d1")) 3799 if err != nil { 3800 t.Fatalf("Unexpected error: %v", err) 3801 } 3802 defer sub.Unsubscribe() 3803 3804 si, err := js.StreamInfo("TEST") 3805 require_NoError(t, err) 3806 require_True(t, si.State.Msgs == uint64(numRequests)) 3807 3808 // Let settle a bit for Go routine checks. 3809 time.Sleep(500 * time.Millisecond) 3810 3811 // Grab number go routines. 3812 base := runtime.NumGoroutine() 3813 3814 // Make the consumer busy here by async sending a bunch of messages. 3815 for i := 0; i < numRequests*10; i++ { 3816 js.PublishAsync("foo.created", []byte("REQ")) 3817 } 3818 3819 // Grab a server that is the consumer leader for the durable. 3820 cl := c.consumerLeader("$G", "TEST", "d1") 3821 mset, err := cl.GlobalAccount().lookupStream("TEST") 3822 if err != nil { 3823 t.Fatalf("Unexpected error: %v", err) 3824 } 3825 // Do a hard reset here by hand. 3826 mset.resetClusteredState(nil) 3827 3828 // Wait til we have the consumer leader re-elected. 3829 c.waitOnConsumerLeader("$G", "TEST", "d1") 3830 3831 // Make sure we can get the consumer info eventually. 3832 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 3833 _, err := js.ConsumerInfo("TEST", "d1", nats.MaxWait(250*time.Millisecond)) 3834 return err 3835 }) 3836 3837 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 3838 if after := runtime.NumGoroutine(); base > after { 3839 return fmt.Errorf("Expected %d go routines, got %d", base, after) 3840 } 3841 return nil 3842 }) 3843 3844 // Simulate a low level write error on our consumer and make sure we can recover etc. 3845 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 3846 if cl = c.consumerLeader("$G", "TEST", "d1"); cl != nil { 3847 return nil 3848 } 3849 return errors.New("waiting on consumer leader") 3850 }) 3851 3852 mset, err = cl.GlobalAccount().lookupStream("TEST") 3853 if err != nil { 3854 t.Fatalf("Unexpected error: %v", err) 3855 } 3856 o := mset.lookupConsumer("d1") 3857 if o == nil { 3858 t.Fatalf("Did not retrieve consumer") 3859 } 3860 node := o.raftNode().(*raft) 3861 if node == nil { 3862 t.Fatalf("could not retrieve the raft node for consumer") 3863 } 3864 3865 nc.Close() 3866 node.setWriteErr(io.ErrShortWrite) 3867 3868 c.stopAll() 3869 c.restartAll() 3870 3871 c.waitOnStreamLeader("$G", "TEST") 3872 c.waitOnConsumerLeader("$G", "TEST", "d1") 3873 } 3874 3875 // Reports of high cpu on compaction for a KV store. 
3876 func TestNoRaceJetStreamKeyValueCompaction(t *testing.T) { 3877 c := createJetStreamClusterExplicit(t, "R3S", 3) 3878 defer c.shutdown() 3879 3880 // Client based API 3881 nc, js := jsClientConnect(t, c.randomServer()) 3882 defer nc.Close() 3883 3884 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 3885 Bucket: "COMPACT", 3886 Replicas: 3, 3887 }) 3888 if err != nil { 3889 t.Fatalf("Unexpected error: %v", err) 3890 } 3891 3892 value := strings.Repeat("A", 128*1024) 3893 for i := 0; i < 5_000; i++ { 3894 key := fmt.Sprintf("K-%d", rand.Intn(256)+1) 3895 if _, err := kv.PutString(key, value); err != nil { 3896 t.Fatalf("Unexpected error: %v", err) 3897 } 3898 } 3899 } 3900 3901 // Trying to recreate an issue rip saw with KV and server restarts complaining about 3902 // mismatch for a few minutes and growing memory. 3903 func TestNoRaceJetStreamClusterStreamSeqMismatchIssue(t *testing.T) { 3904 c := createJetStreamClusterExplicit(t, "R3S", 3) 3905 defer c.shutdown() 3906 3907 // Client based API 3908 nc, js := jsClientConnect(t, c.randomServer()) 3909 defer nc.Close() 3910 3911 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 3912 Bucket: "MM", 3913 Replicas: 3, 3914 TTL: 500 * time.Millisecond, 3915 }) 3916 require_NoError(t, err) 3917 3918 for i := 1; i <= 10; i++ { 3919 if _, err := kv.PutString("k", "1"); err != nil { 3920 t.Fatalf("Unexpected error: %v", err) 3921 } 3922 } 3923 // Close in case we are connected here. Will recreate. 3924 nc.Close() 3925 3926 // Shutdown a non-leader. 3927 s := c.randomNonStreamLeader("$G", "KV_MM") 3928 s.Shutdown() 3929 3930 nc, js = jsClientConnect(t, c.randomServer()) 3931 defer nc.Close() 3932 3933 kv, err = js.KeyValue("MM") 3934 require_NoError(t, err) 3935 3936 // Now change the state of the stream such that we have to do a compact upon restart 3937 // of the downed server. 3938 for i := 1; i <= 10; i++ { 3939 if _, err := kv.PutString("k", "2"); err != nil { 3940 t.Fatalf("Unexpected error: %v", err) 3941 } 3942 } 3943 3944 // Raft could save us here so need to run a compact on the leader. 3945 snapshotLeader := func() { 3946 sl := c.streamLeader("$G", "KV_MM") 3947 if sl == nil { 3948 t.Fatalf("Did not get the leader") 3949 } 3950 mset, err := sl.GlobalAccount().lookupStream("KV_MM") 3951 require_NoError(t, err) 3952 node := mset.raftNode() 3953 if node == nil { 3954 t.Fatalf("Could not get stream group") 3955 } 3956 if err := node.InstallSnapshot(mset.stateSnapshot()); err != nil { 3957 t.Fatalf("Error installing snapshot: %v", err) 3958 } 3959 } 3960 3961 // Now wait for expiration 3962 time.Sleep(time.Second) 3963 3964 snapshotLeader() 3965 3966 s = c.restartServer(s) 3967 c.waitOnServerCurrent(s) 3968 3969 // We want to make sure we do not reset the raft state on a catchup due to no request yield. 3970 // Bug was if we did not actually request any help from snapshot we did not set mset.lseq properly. 3971 // So when we send next batch that would cause raft reset due to cluster reset for our stream. 3972 mset, err := s.GlobalAccount().lookupStream("KV_MM") 3973 require_NoError(t, err) 3974 3975 for i := 1; i <= 10; i++ { 3976 if _, err := kv.PutString("k1", "X"); err != nil { 3977 t.Fatalf("Unexpected error: %v", err) 3978 } 3979 } 3980 3981 c.waitOnStreamCurrent(s, "$G", "KV_MM") 3982 3983 // Make sure we did not reset our stream. 
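// A cluster reset would drop and re-create the in-memory stream, so looking it up again and
// comparing the pointer against the earlier *stream value is a cheap way to confirm the catchup
// path above did not trigger one.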
3984 msetNew, err := s.GlobalAccount().lookupStream("KV_MM") 3985 require_NoError(t, err) 3986 if msetNew != mset { 3987 t.Fatalf("Stream was reset") 3988 } 3989 } 3990 3991 func TestNoRaceJetStreamClusterStreamDropCLFS(t *testing.T) { 3992 c := createJetStreamClusterExplicit(t, "R3S", 3) 3993 defer c.shutdown() 3994 3995 // Client based API 3996 nc, js := jsClientConnect(t, c.randomServer()) 3997 defer nc.Close() 3998 3999 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 4000 Bucket: "CLFS", 4001 Replicas: 3, 4002 }) 4003 require_NoError(t, err) 4004 4005 // Will work 4006 _, err = kv.Create("k.1", []byte("X")) 4007 require_NoError(t, err) 4008 // Drive up CLFS state on leader. 4009 for i := 0; i < 10; i++ { 4010 _, err = kv.Create("k.1", []byte("X")) 4011 require_Error(t, err) 4012 } 4013 // Bookend with new key success. 4014 _, err = kv.Create("k.2", []byte("Z")) 4015 require_NoError(t, err) 4016 4017 // Close in case we are connected here. Will recreate. 4018 nc.Close() 4019 4020 // Shutdown, which will also clear clfs. 4021 s := c.randomNonStreamLeader("$G", "KV_CLFS") 4022 s.Shutdown() 4023 4024 nc, js = jsClientConnect(t, c.randomServer()) 4025 defer nc.Close() 4026 4027 kv, err = js.KeyValue("CLFS") 4028 require_NoError(t, err) 4029 4030 // Drive up CLFS state on leader. 4031 for i := 0; i < 10; i++ { 4032 _, err = kv.Create("k.1", []byte("X")) 4033 require_Error(t, err) 4034 } 4035 4036 sl := c.streamLeader("$G", "KV_CLFS") 4037 if sl == nil { 4038 t.Fatalf("Did not get the leader") 4039 } 4040 mset, err := sl.GlobalAccount().lookupStream("KV_CLFS") 4041 require_NoError(t, err) 4042 node := mset.raftNode() 4043 if node == nil { 4044 t.Fatalf("Could not get stream group") 4045 } 4046 if err := node.InstallSnapshot(mset.stateSnapshot()); err != nil { 4047 t.Fatalf("Error installing snapshot: %v", err) 4048 } 4049 4050 _, err = kv.Create("k.3", []byte("ZZZ")) 4051 require_NoError(t, err) 4052 4053 s = c.restartServer(s) 4054 c.waitOnServerCurrent(s) 4055 4056 mset, err = s.GlobalAccount().lookupStream("KV_CLFS") 4057 require_NoError(t, err) 4058 4059 _, err = kv.Create("k.4", []byte("YYY")) 4060 require_NoError(t, err) 4061 4062 c.waitOnStreamCurrent(s, "$G", "KV_CLFS") 4063 4064 // Make sure we did not reset our stream. 4065 msetNew, err := s.GlobalAccount().lookupStream("KV_CLFS") 4066 require_NoError(t, err) 4067 if msetNew != mset { 4068 t.Fatalf("Stream was reset") 4069 } 4070 } 4071 4072 func TestNoRaceJetStreamMemstoreWithLargeInteriorDeletes(t *testing.T) { 4073 s := RunBasicJetStreamServer(t) 4074 defer s.Shutdown() 4075 4076 // Client for API requests. 
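// The memory stream created below keeps at most one message per subject, so a single "foo"
// publish followed by a million "bar" publishes leaves just two messages behind with ~999,999
// interior deletes; the timed stateWithDetail(true) call later guards against that delete range
// making detailed state computation slow.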
4077 nc, js := jsClientConnect(t, s) 4078 defer nc.Close() 4079 4080 _, err := js.AddStream(&nats.StreamConfig{ 4081 Name: "TEST", 4082 Subjects: []string{"foo", "bar"}, 4083 MaxMsgsPerSubject: 1, 4084 Storage: nats.MemoryStorage, 4085 }) 4086 require_NoError(t, err) 4087 4088 acc, err := s.lookupAccount("$G") 4089 require_NoError(t, err) 4090 mset, err := acc.lookupStream("TEST") 4091 require_NoError(t, err) 4092 4093 msg := []byte("Hello World!") 4094 if _, err := js.PublishAsync("foo", msg); err != nil { 4095 t.Fatalf("Unexpected publish error: %v", err) 4096 } 4097 for i := 1; i <= 1_000_000; i++ { 4098 if _, err := js.PublishAsync("bar", msg); err != nil { 4099 t.Fatalf("Unexpected publish error: %v", err) 4100 } 4101 } 4102 select { 4103 case <-js.PublishAsyncComplete(): 4104 case <-time.After(5 * time.Second): 4105 t.Fatalf("Did not receive completion signal") 4106 } 4107 4108 now := time.Now() 4109 ss := mset.stateWithDetail(true) 4110 // Before the fix the snapshot for this test would be > 200ms on my setup. 4111 if elapsed := time.Since(now); elapsed > 100*time.Millisecond { 4112 t.Fatalf("Took too long to snapshot: %v", elapsed) 4113 } else if elapsed > 50*time.Millisecond { 4114 t.Logf("WRN: Took longer than usual to snapshot: %v", elapsed) 4115 } 4116 4117 if ss.Msgs != 2 || ss.FirstSeq != 1 || ss.LastSeq != 1_000_001 || ss.NumDeleted != 999999 { 4118 // To not print out on error. 4119 ss.Deleted = nil 4120 t.Fatalf("Bad State: %+v", ss) 4121 } 4122 } 4123 4124 // This is related to an issue reported where we were exhausting threads by trying to 4125 // cleanup too many consumers at the same time. 4126 // https://github.com/nats-io/nats-server/issues/2742 4127 func TestNoRaceJetStreamConsumerFileStoreConcurrentDiskIO(t *testing.T) { 4128 storeDir := t.TempDir() 4129 4130 // Artificially adjust our environment for this test. 4131 gmp := runtime.GOMAXPROCS(32) 4132 defer runtime.GOMAXPROCS(gmp) 4133 4134 maxT := debug.SetMaxThreads(1050) // 1024 now 4135 defer debug.SetMaxThreads(maxT) 4136 4137 fs, err := newFileStore(FileStoreConfig{StoreDir: storeDir}, StreamConfig{Name: "MT", Storage: FileStorage}) 4138 require_NoError(t, err) 4139 defer fs.Stop() 4140 4141 startCh := make(chan bool) 4142 var wg sync.WaitGroup 4143 var swg sync.WaitGroup 4144 4145 ts := time.Now().UnixNano() 4146 4147 // Create 1000 consumerStores 4148 n := 1000 4149 swg.Add(n) 4150 4151 for i := 1; i <= n; i++ { 4152 name := fmt.Sprintf("o%d", i) 4153 o, err := fs.ConsumerStore(name, &ConsumerConfig{AckPolicy: AckExplicit}) 4154 require_NoError(t, err) 4155 wg.Add(1) 4156 swg.Done() 4157 4158 go func() { 4159 defer wg.Done() 4160 // Will make everyone run concurrently. 
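// Each goroutine parks here until close(startCh) releases them all at once, so roughly 1000
// consumer store writes and deletes hit the disk concurrently, which is the thread-exhaustion
// scenario from the linked issue.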
4161 <-startCh 4162 o.UpdateDelivered(22, 22, 1, ts) 4163 buf, _ := o.(*consumerFileStore).encodeState() 4164 o.(*consumerFileStore).writeState(buf) 4165 o.Delete() 4166 }() 4167 } 4168 4169 swg.Wait() 4170 close(startCh) 4171 wg.Wait() 4172 } 4173 4174 func TestNoRaceJetStreamClusterHealthz(t *testing.T) { 4175 c := createJetStreamCluster(t, jsClusterAccountsTempl, "HZ", _EMPTY_, 3, 23033, true) 4176 defer c.shutdown() 4177 4178 nc1, js1 := jsClientConnect(t, c.randomServer(), nats.UserInfo("one", "p")) 4179 defer nc1.Close() 4180 4181 nc2, js2 := jsClientConnect(t, c.randomServer(), nats.UserInfo("two", "p")) 4182 defer nc2.Close() 4183 4184 var err error 4185 for _, sname := range []string{"foo", "bar", "baz"} { 4186 _, err = js1.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 4187 require_NoError(t, err) 4188 _, err = js2.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 4189 require_NoError(t, err) 4190 } 4191 // R1 4192 _, err = js1.AddStream(&nats.StreamConfig{Name: "r1", Replicas: 1}) 4193 require_NoError(t, err) 4194 4195 // Now shutdown then send a bunch of data. 4196 s := c.servers[0] 4197 s.Shutdown() 4198 4199 for i := 0; i < 5_000; i++ { 4200 _, err = js1.PublishAsync("foo", []byte("OK")) 4201 require_NoError(t, err) 4202 _, err = js2.PublishAsync("bar", []byte("OK")) 4203 require_NoError(t, err) 4204 } 4205 select { 4206 case <-js1.PublishAsyncComplete(): 4207 case <-time.After(5 * time.Second): 4208 t.Fatalf("Did not receive completion signal") 4209 } 4210 select { 4211 case <-js2.PublishAsyncComplete(): 4212 case <-time.After(5 * time.Second): 4213 t.Fatalf("Did not receive completion signal") 4214 } 4215 4216 s = c.restartServer(s) 4217 opts := s.getOpts() 4218 opts.HTTPHost = "127.0.0.1" 4219 opts.HTTPPort = 11222 4220 err = s.StartMonitoring() 4221 require_NoError(t, err) 4222 url := fmt.Sprintf("http://127.0.0.1:%d/healthz", opts.HTTPPort) 4223 4224 getHealth := func() (int, *HealthStatus) { 4225 resp, err := http.Get(url) 4226 require_NoError(t, err) 4227 defer resp.Body.Close() 4228 body, err := io.ReadAll(resp.Body) 4229 require_NoError(t, err) 4230 var hs HealthStatus 4231 err = json.Unmarshal(body, &hs) 4232 require_NoError(t, err) 4233 return resp.StatusCode, &hs 4234 } 4235 4236 errors := 0 4237 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 4238 code, hs := getHealth() 4239 if code >= 200 && code < 300 { 4240 return nil 4241 } 4242 errors++ 4243 return fmt.Errorf("Got %d status with %+v", code, hs) 4244 }) 4245 if errors == 0 { 4246 t.Fatalf("Expected to have some errors until we became current, got none") 4247 } 4248 } 4249 4250 // Test that we can receive larger messages with stream subject details. 4251 // Also test that we will fail at some point and the user can fall back to 4252 // an orderedconsumer like we do with watch for KV Keys() call. 4253 func TestNoRaceJetStreamStreamInfoSubjectDetailsLimits(t *testing.T) { 4254 conf := createConfFile(t, []byte(fmt.Sprintf(` 4255 listen: 127.0.0.1:-1 4256 jetstream { 4257 store_dir = %q 4258 } 4259 accounts: { 4260 default: { 4261 jetstream: true 4262 users: [ {user: me, password: pwd} ] 4263 limits { max_payload: 512 } 4264 } 4265 } 4266 `, t.TempDir()))) 4267 4268 s, _ := RunServerWithConfig(conf) 4269 if config := s.JetStreamConfig(); config != nil { 4270 defer removeDir(t, config.StoreDir) 4271 } 4272 defer s.Shutdown() 4273 4274 nc, js := jsClientConnect(t, s, nats.UserInfo("me", "pwd")) 4275 defer nc.Close() 4276 4277 // Make sure to flush so we process the 2nd INFO. 
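// The account configured above lowers max_payload to 512, and the server advertises that in a
// follow-up INFO once the connection has authenticated; the round trip forced by Flush should
// ensure the client has applied it, so the oversized publish below fails client-side with
// nats.ErrMaxPayload instead of being rejected by the server.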
4278 nc.Flush() 4279 4280 // Make sure we cannot send larger than 512 bytes. 4281 // But we can receive larger. 4282 sub, err := nc.SubscribeSync("foo") 4283 require_NoError(t, err) 4284 err = nc.Publish("foo", []byte(strings.Repeat("A", 600))) 4285 require_Error(t, err, nats.ErrMaxPayload) 4286 sub.Unsubscribe() 4287 4288 _, err = js.AddStream(&nats.StreamConfig{ 4289 Name: "TEST", 4290 Subjects: []string{"*", "X.*"}, 4291 }) 4292 require_NoError(t, err) 4293 4294 n := JSMaxSubjectDetails 4295 for i := 0; i < n; i++ { 4296 _, err := js.PublishAsync(fmt.Sprintf("X.%d", i), []byte("OK")) 4297 require_NoError(t, err) 4298 } 4299 select { 4300 case <-js.PublishAsyncComplete(): 4301 case <-time.After(5 * time.Second): 4302 t.Fatalf("Did not receive completion signal") 4303 } 4304 4305 // Need to grab StreamInfo by hand for now. 4306 req, err := json.Marshal(&JSApiStreamInfoRequest{SubjectsFilter: "X.*"}) 4307 require_NoError(t, err) 4308 resp, err := nc.Request(fmt.Sprintf(JSApiStreamInfoT, "TEST"), req, 5*time.Second) 4309 require_NoError(t, err) 4310 var si StreamInfo 4311 err = json.Unmarshal(resp.Data, &si) 4312 require_NoError(t, err) 4313 if len(si.State.Subjects) != n { 4314 t.Fatalf("Expected to get %d subject details, got %d", n, len(si.State.Subjects)) 4315 } 4316 4317 // Now add one more message to check pagination 4318 _, err = js.Publish("foo", []byte("TOO MUCH")) 4319 require_NoError(t, err) 4320 4321 req, err = json.Marshal(&JSApiStreamInfoRequest{ApiPagedRequest: ApiPagedRequest{Offset: n}, SubjectsFilter: nats.AllKeys}) 4322 require_NoError(t, err) 4323 resp, err = nc.Request(fmt.Sprintf(JSApiStreamInfoT, "TEST"), req, 5*time.Second) 4324 require_NoError(t, err) 4325 var sir JSApiStreamInfoResponse 4326 err = json.Unmarshal(resp.Data, &sir) 4327 require_NoError(t, err) 4328 if len(sir.State.Subjects) != 1 { 4329 t.Fatalf("Expected to get 1 extra subject detail, got %d", len(sir.State.Subjects)) 4330 } 4331 } 4332 4333 func TestNoRaceJetStreamSparseConsumers(t *testing.T) { 4334 s := RunBasicJetStreamServer(t) 4335 defer s.Shutdown() 4336 4337 nc, js := jsClientConnect(t, s) 4338 defer nc.Close() 4339 4340 msg := []byte("ok") 4341 4342 cases := []struct { 4343 name string 4344 mconfig *nats.StreamConfig 4345 }{ 4346 {"MemoryStore", &nats.StreamConfig{Name: "TEST", Storage: nats.MemoryStorage, MaxMsgsPerSubject: 25_000_000, 4347 Subjects: []string{"*"}}}, 4348 {"FileStore", &nats.StreamConfig{Name: "TEST", Storage: nats.FileStorage, MaxMsgsPerSubject: 25_000_000, 4349 Subjects: []string{"*"}}}, 4350 } 4351 for _, c := range cases { 4352 t.Run(c.name, func(t *testing.T) { 4353 js.DeleteStream("TEST") 4354 _, err := js.AddStream(c.mconfig) 4355 require_NoError(t, err) 4356 4357 // We will purposely place foo msgs near the beginning, then in middle, then at the end. 4358 for n := 0; n < 2; n++ { 4359 _, err = js.PublishAsync("foo", msg, nats.StallWait(800*time.Millisecond)) 4360 require_NoError(t, err) 4361 4362 for i := 0; i < 1_000_000; i++ { 4363 _, err = js.PublishAsync("bar", msg, nats.StallWait(800*time.Millisecond)) 4364 require_NoError(t, err) 4365 } 4366 _, err = js.PublishAsync("foo", msg, nats.StallWait(800*time.Millisecond)) 4367 require_NoError(t, err) 4368 } 4369 select { 4370 case <-js.PublishAsyncComplete(): 4371 case <-time.After(5 * time.Second): 4372 t.Fatalf("Did not receive completion signal") 4373 } 4374 4375 // Now create a consumer on foo. 
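// Only a handful of "foo" messages exist, bookending roughly two million "bar" messages, so this
// filtered, ack-none push consumer forces the server to skip very large interior ranges; the
// elapsed-time check below is what catches a regression back to linear scanning.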
4376 ci, err := js.AddConsumer("TEST", &nats.ConsumerConfig{DeliverSubject: "x.x", FilterSubject: "foo", AckPolicy: nats.AckNonePolicy}) 4377 require_NoError(t, err) 4378 4379 done, received := make(chan bool), uint64(0) 4380 4381 cb := func(m *nats.Msg) { 4382 received++ 4383 if received >= ci.NumPending { 4384 done <- true 4385 } 4386 } 4387 4388 sub, err := nc.Subscribe("x.x", cb) 4389 require_NoError(t, err) 4390 defer sub.Unsubscribe() 4391 start := time.Now() 4392 var elapsed time.Duration 4393 4394 select { 4395 case <-done: 4396 elapsed = time.Since(start) 4397 case <-time.After(10 * time.Second): 4398 t.Fatal("Did not receive all messages for all consumers in time") 4399 } 4400 4401 if elapsed > 500*time.Millisecond { 4402 t.Fatalf("Getting all messages took longer than expected: %v", elapsed) 4403 } 4404 }) 4405 } 4406 } 4407 4408 func TestNoRaceJetStreamConsumerFilterPerfDegradation(t *testing.T) { 4409 s := RunBasicJetStreamServer(t) 4410 defer s.Shutdown() 4411 4412 nc, _ := jsClientConnect(t, s) 4413 defer nc.Close() 4414 4415 js, err := nc.JetStream(nats.PublishAsyncMaxPending(256)) 4416 require_NoError(t, err) 4417 4418 _, err = js.AddStream(&nats.StreamConfig{ 4419 Name: "test", 4420 Subjects: []string{"test.*.subj"}, 4421 Replicas: 1, 4422 }) 4423 require_NoError(t, err) 4424 4425 toSend := 50_000 4426 count := 0 4427 ch := make(chan struct{}, 6) 4428 _, err = js.Subscribe("test.*.subj", func(m *nats.Msg) { 4429 m.Ack() 4430 if count++; count == toSend { 4431 ch <- struct{}{} 4432 } 4433 }, nats.DeliverNew(), nats.ManualAck()) 4434 require_NoError(t, err) 4435 4436 msg := make([]byte, 1024) 4437 sent := int32(0) 4438 send := func() { 4439 defer func() { ch <- struct{}{} }() 4440 for i := 0; i < toSend/5; i++ { 4441 msgID := atomic.AddInt32(&sent, 1) 4442 _, err := js.Publish(fmt.Sprintf("test.%d.subj", msgID), msg) 4443 if err != nil { 4444 t.Error(err) 4445 return 4446 } 4447 } 4448 } 4449 for i := 0; i < 5; i++ { 4450 go send() 4451 } 4452 timeout := time.NewTimer(10 * time.Second) 4453 for i := 0; i < 6; i++ { 4454 select { 4455 case <-ch: 4456 case <-timeout.C: 4457 t.Fatal("Took too long") 4458 } 4459 } 4460 } 4461 4462 func TestNoRaceJetStreamFileStoreKeyFileCleanup(t *testing.T) { 4463 storeDir := t.TempDir() 4464 4465 prf := func(context []byte) ([]byte, error) { 4466 h := hmac.New(sha256.New, []byte("dlc22")) 4467 if _, err := h.Write(context); err != nil { 4468 return nil, err 4469 } 4470 return h.Sum(nil), nil 4471 } 4472 4473 fs, err := newFileStoreWithCreated( 4474 FileStoreConfig{StoreDir: storeDir, BlockSize: 1024 * 1024}, 4475 StreamConfig{Name: "TEST", Storage: FileStorage}, 4476 time.Now(), 4477 prf, nil) 4478 require_NoError(t, err) 4479 defer fs.Stop() 4480 4481 n, msg := 10_000, []byte(strings.Repeat("Z", 1024)) 4482 for i := 0; i < n; i++ { 4483 _, _, err := fs.StoreMsg(fmt.Sprintf("X.%d", i), nil, msg) 4484 require_NoError(t, err) 4485 } 4486 4487 var seqs []uint64 4488 for i := 1; i <= n; i++ { 4489 seqs = append(seqs, uint64(i)) 4490 } 4491 // Randomly delete msgs, make sure we cleanup as we empty the message blocks. 4492 rand.Shuffle(len(seqs), func(i, j int) { seqs[i], seqs[j] = seqs[j], seqs[i] }) 4493 4494 for _, seq := range seqs { 4495 _, err := fs.RemoveMsg(seq) 4496 require_NoError(t, err) 4497 } 4498 4499 // We will have cleanup the main .blk and .idx sans the lmb, but we should not have any *.fss files. 
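// Because the store was created with a prf it runs encrypted, and each message block carries an
// associated key file; once every message has been removed, only the still-active last block
// should retain one, which is what the glob below asserts (at most one match).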
4500 kms, err := filepath.Glob(filepath.Join(storeDir, msgDir, keyScanAll)) 4501 require_NoError(t, err) 4502 4503 if len(kms) > 1 { 4504 t.Fatalf("Expected to find only 1 key file, found %d", len(kms)) 4505 } 4506 } 4507 4508 func TestNoRaceJetStreamMsgIdPerfDuringCatchup(t *testing.T) { 4509 // Uncomment to run. Needs to be on a bigger machine. Do not want as part of Travis tests atm. 4510 skip(t) 4511 4512 c := createJetStreamClusterExplicit(t, "JSC", 3) 4513 defer c.shutdown() 4514 4515 nc, js := jsClientConnect(t, c.serverByName("S-1")) 4516 defer nc.Close() 4517 4518 _, err := js.AddStream(&nats.StreamConfig{ 4519 Name: "TEST", 4520 Replicas: 3, 4521 }) 4522 require_NoError(t, err) 4523 4524 // This will be the one we restart. 4525 sl := c.streamLeader("$G", "TEST") 4526 // Now move leader. 4527 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 4528 require_NoError(t, err) 4529 c.waitOnStreamLeader("$G", "TEST") 4530 4531 // Connect to new leader. 4532 nc, _ = jsClientConnect(t, c.streamLeader("$G", "TEST")) 4533 defer nc.Close() 4534 4535 js, err = nc.JetStream(nats.PublishAsyncMaxPending(1024)) 4536 require_NoError(t, err) 4537 4538 n, ss, sr := 1_000_000, 250_000, 800_000 4539 m := nats.NewMsg("TEST") 4540 m.Data = []byte(strings.Repeat("Z", 2048)) 4541 4542 // Target rate 10k msgs/sec 4543 start := time.Now() 4544 4545 for i := 0; i < n; i++ { 4546 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4547 _, err := js.PublishMsgAsync(m) 4548 require_NoError(t, err) 4549 //time.Sleep(42 * time.Microsecond) 4550 if i == ss { 4551 fmt.Printf("SD") 4552 sl.Shutdown() 4553 } else if i == sr { 4554 nc.Flush() 4555 select { 4556 case <-js.PublishAsyncComplete(): 4557 case <-time.After(10 * time.Second): 4558 } 4559 fmt.Printf("RS") 4560 sl = c.restartServer(sl) 4561 } 4562 if i%10_000 == 0 { 4563 fmt.Print("#") 4564 } 4565 } 4566 fmt.Println() 4567 4568 // Wait to receive all messages. 
4569 select { 4570 case <-js.PublishAsyncComplete(): 4571 case <-time.After(20 * time.Second): 4572 t.Fatalf("Did not receive completion signal") 4573 } 4574 4575 tt := time.Since(start) 4576 si, err := js.StreamInfo("TEST") 4577 require_NoError(t, err) 4578 4579 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4580 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4581 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4582 4583 c.waitOnStreamCurrent(sl, "$G", "TEST") 4584 for _, s := range c.servers { 4585 mset, _ := s.GlobalAccount().lookupStream("TEST") 4586 if state := mset.store.State(); state.Msgs != uint64(n) { 4587 t.Fatalf("Expected server %v to have correct number of msgs %d but got %d", s, n, state.Msgs) 4588 } 4589 } 4590 } 4591 4592 func TestNoRaceJetStreamRebuildDeDupeAndMemoryPerf(t *testing.T) { 4593 skip(t) 4594 4595 s := RunBasicJetStreamServer(t) 4596 defer s.Shutdown() 4597 4598 nc, js := jsClientConnect(t, s) 4599 defer nc.Close() 4600 4601 _, err := js.AddStream(&nats.StreamConfig{Name: "DD"}) 4602 require_NoError(t, err) 4603 4604 m := nats.NewMsg("DD") 4605 m.Data = []byte(strings.Repeat("Z", 2048)) 4606 4607 start := time.Now() 4608 4609 n := 1_000_000 4610 for i := 0; i < n; i++ { 4611 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4612 _, err := js.PublishMsgAsync(m) 4613 require_NoError(t, err) 4614 } 4615 4616 select { 4617 case <-js.PublishAsyncComplete(): 4618 case <-time.After(20 * time.Second): 4619 t.Fatalf("Did not receive completion signal") 4620 } 4621 4622 tt := time.Since(start) 4623 si, err := js.StreamInfo("DD") 4624 require_NoError(t, err) 4625 4626 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4627 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4628 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4629 4630 v, _ := s.Varz(nil) 4631 fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(v.Mem)) 4632 4633 mset, err := s.GlobalAccount().lookupStream("DD") 4634 require_NoError(t, err) 4635 4636 mset.mu.Lock() 4637 mset.ddloaded = false 4638 start = time.Now() 4639 mset.rebuildDedupe() 4640 fmt.Printf("TOOK %v to rebuild dd\n", time.Since(start)) 4641 mset.mu.Unlock() 4642 4643 v, _ = s.Varz(nil) 4644 fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem)) 4645 4646 // Now do an ephemeral consumer and whip through every message. Doing same calculations. 
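// nats.OrderedConsumer() sets up an ephemeral, ack-none push consumer with flow control under
// the hood, so this pass measures raw delivery throughput and memory without any ack traffic.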
4647 start = time.Now() 4648 received, done := 0, make(chan bool) 4649 sub, err := js.Subscribe("DD", func(m *nats.Msg) { 4650 received++ 4651 if received >= n { 4652 done <- true 4653 } 4654 }, nats.OrderedConsumer()) 4655 require_NoError(t, err) 4656 4657 select { 4658 case <-done: 4659 case <-time.After(10 * time.Second): 4660 if s.NumSlowConsumers() > 0 { 4661 t.Fatalf("Did not receive all large messages due to slow consumer status: %d of %d", received, n) 4662 } 4663 t.Fatalf("Failed to receive all large messages: %d of %d\n", received, n) 4664 } 4665 4666 fmt.Printf("TOOK %v to receive all %d msgs\n", time.Since(start), n) 4667 sub.Unsubscribe() 4668 4669 v, _ = s.Varz(nil) 4670 fmt.Printf("Memory: %v\n", friendlyBytes(v.Mem)) 4671 } 4672 4673 func TestNoRaceJetStreamMemoryUsageOnLimitedStreamWithMirror(t *testing.T) { 4674 skip(t) 4675 4676 s := RunBasicJetStreamServer(t) 4677 defer s.Shutdown() 4678 4679 nc, js := jsClientConnect(t, s) 4680 defer nc.Close() 4681 4682 _, err := js.AddStream(&nats.StreamConfig{Name: "DD", Subjects: []string{"ORDERS.*"}, MaxMsgs: 10_000}) 4683 require_NoError(t, err) 4684 4685 _, err = js.AddStream(&nats.StreamConfig{ 4686 Name: "M", 4687 Mirror: &nats.StreamSource{Name: "DD"}, 4688 MaxMsgs: 10_000, 4689 }) 4690 require_NoError(t, err) 4691 4692 m := nats.NewMsg("ORDERS.0") 4693 m.Data = []byte(strings.Repeat("Z", 2048)) 4694 4695 start := time.Now() 4696 4697 n := 1_000_000 4698 for i := 0; i < n; i++ { 4699 m.Subject = fmt.Sprintf("ORDERS.%d", i) 4700 m.Header.Set(JSMsgId, strconv.Itoa(i)) 4701 _, err := js.PublishMsgAsync(m) 4702 require_NoError(t, err) 4703 } 4704 4705 select { 4706 case <-js.PublishAsyncComplete(): 4707 case <-time.After(20 * time.Second): 4708 t.Fatalf("Did not receive completion signal") 4709 } 4710 4711 tt := time.Since(start) 4712 si, err := js.StreamInfo("DD") 4713 require_NoError(t, err) 4714 4715 fmt.Printf("Took %v to send %d msgs\n", tt, n) 4716 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4717 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4718 4719 v, _ := s.Varz(nil) 4720 fmt.Printf("Memory AFTER SEND: %v\n", friendlyBytes(v.Mem)) 4721 } 4722 4723 func TestNoRaceJetStreamOrderedConsumerLongRTTPerformance(t *testing.T) { 4724 skip(t) 4725 4726 s := RunBasicJetStreamServer(t) 4727 defer s.Shutdown() 4728 4729 nc, _ := jsClientConnect(t, s) 4730 defer nc.Close() 4731 4732 js, err := nc.JetStream(nats.PublishAsyncMaxPending(1000)) 4733 require_NoError(t, err) 4734 4735 _, err = js.AddStream(&nats.StreamConfig{Name: "OCP"}) 4736 require_NoError(t, err) 4737 4738 n, msg := 100_000, []byte(strings.Repeat("D", 30_000)) 4739 4740 for i := 0; i < n; i++ { 4741 _, err := js.PublishAsync("OCP", msg) 4742 require_NoError(t, err) 4743 } 4744 select { 4745 case <-js.PublishAsyncComplete(): 4746 case <-time.After(5 * time.Second): 4747 t.Fatalf("Did not receive completion signal") 4748 } 4749 4750 // Approximately 3GB 4751 si, err := js.StreamInfo("OCP") 4752 require_NoError(t, err) 4753 4754 start := time.Now() 4755 received, done := 0, make(chan bool) 4756 sub, err := js.Subscribe("OCP", func(m *nats.Msg) { 4757 received++ 4758 if received >= n { 4759 done <- true 4760 } 4761 }, nats.OrderedConsumer()) 4762 require_NoError(t, err) 4763 defer sub.Unsubscribe() 4764 4765 // Wait to receive all messages. 
4766 select { 4767 case <-done: 4768 case <-time.After(30 * time.Second): 4769 t.Fatalf("Did not receive all of our messages") 4770 } 4771 4772 tt := time.Since(start) 4773 fmt.Printf("Took %v to receive %d msgs\n", tt, n) 4774 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4775 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4776 4777 sub.Unsubscribe() 4778 4779 rtt := 10 * time.Millisecond 4780 bw := 10 * 1024 * 1024 * 1024 4781 proxy := newNetProxy(rtt, bw, bw, s.ClientURL()) 4782 defer proxy.stop() 4783 4784 nc, err = nats.Connect(proxy.clientURL()) 4785 require_NoError(t, err) 4786 defer nc.Close() 4787 js, err = nc.JetStream() 4788 require_NoError(t, err) 4789 4790 start, received = time.Now(), 0 4791 sub, err = js.Subscribe("OCP", func(m *nats.Msg) { 4792 received++ 4793 if received >= n { 4794 done <- true 4795 } 4796 }, nats.OrderedConsumer()) 4797 require_NoError(t, err) 4798 defer sub.Unsubscribe() 4799 4800 // Wait to receive all messages. 4801 select { 4802 case <-done: 4803 case <-time.After(60 * time.Second): 4804 t.Fatalf("Did not receive all of our messages") 4805 } 4806 4807 tt = time.Since(start) 4808 fmt.Printf("Proxy RTT: %v, UP: %d, DOWN: %d\n", rtt, bw, bw) 4809 fmt.Printf("Took %v to receive %d msgs\n", tt, n) 4810 fmt.Printf("%.0f msgs/s\n", float64(n)/tt.Seconds()) 4811 fmt.Printf("%.0f mb/s\n\n", float64(si.State.Bytes/(1024*1024))/tt.Seconds()) 4812 } 4813 4814 var jsClusterStallCatchupTempl = ` 4815 listen: 127.0.0.1:-1 4816 server_name: %s 4817 jetstream: {max_mem_store: 256MB, max_file_store: 32GB, store_dir: '%s'} 4818 4819 leaf { 4820 listen: 127.0.0.1:-1 4821 } 4822 4823 cluster { 4824 name: %s 4825 listen: 127.0.0.1:%d 4826 routes = [%s] 4827 } 4828 4829 # For access to system account. 4830 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } } 4831 ` 4832 4833 // Test our global stall gate for outstanding catchup bytes. 4834 func TestNoRaceJetStreamClusterCatchupStallGate(t *testing.T) { 4835 skip(t) 4836 4837 c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3) 4838 defer c.shutdown() 4839 4840 nc, js := jsClientConnect(t, c.randomServer()) 4841 defer nc.Close() 4842 4843 // ~100k per message. 4844 msg := []byte(strings.Repeat("A", 99_960)) 4845 4846 // Create 200 streams with 100MB. 4847 // Each server has ~2GB 4848 var wg sync.WaitGroup 4849 for i := 0; i < 20; i++ { 4850 wg.Add(1) 4851 go func(x int) { 4852 defer wg.Done() 4853 for n := 1; n <= 10; n++ { 4854 sn := fmt.Sprintf("S-%d", n+x) 4855 _, err := js.AddStream(&nats.StreamConfig{ 4856 Name: sn, 4857 Replicas: 3, 4858 }) 4859 require_NoError(t, err) 4860 for i := 0; i < 100; i++ { 4861 _, err := js.Publish(sn, msg) 4862 require_NoError(t, err) 4863 } 4864 } 4865 }(i * 20) 4866 } 4867 wg.Wait() 4868 4869 info, err := js.AccountInfo() 4870 require_NoError(t, err) 4871 require_True(t, info.Streams == 200) 4872 4873 runtime.GC() 4874 debug.FreeOSMemory() 4875 4876 // Now bring a server down and wipe its storage. 
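// Wiping the store directory forces this server to rebuild all 200 R3 streams via catchup when
// it restarts; with the stall gate in place the outstanding catchup bytes, and therefore memory,
// should stay bounded, which is what the before/after memory prints are meant to show.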
4877 s := c.servers[0] 4878 vz, err := s.Varz(nil) 4879 require_NoError(t, err) 4880 fmt.Printf("MEM BEFORE is %v\n", friendlyBytes(vz.Mem)) 4881 4882 sd := s.JetStreamConfig().StoreDir 4883 s.Shutdown() 4884 removeDir(t, sd) 4885 s = c.restartServer(s) 4886 4887 c.waitOnServerHealthz(s) 4888 4889 runtime.GC() 4890 debug.FreeOSMemory() 4891 4892 vz, err = s.Varz(nil) 4893 require_NoError(t, err) 4894 fmt.Printf("MEM AFTER is %v\n", friendlyBytes(vz.Mem)) 4895 } 4896 4897 func TestNoRaceJetStreamClusterCatchupBailMidway(t *testing.T) { 4898 skip(t) 4899 4900 c := createJetStreamClusterWithTemplate(t, jsClusterStallCatchupTempl, "GSG", 3) 4901 defer c.shutdown() 4902 4903 ml := c.leader() 4904 nc, js := jsClientConnect(t, ml) 4905 defer nc.Close() 4906 4907 msg := []byte(strings.Repeat("A", 480)) 4908 4909 for i := 0; i < maxConcurrentSyncRequests*2; i++ { 4910 sn := fmt.Sprintf("CUP-%d", i+1) 4911 _, err := js.AddStream(&nats.StreamConfig{ 4912 Name: sn, 4913 Replicas: 3, 4914 }) 4915 require_NoError(t, err) 4916 4917 for i := 0; i < 10_000; i++ { 4918 _, err := js.PublishAsync(sn, msg) 4919 require_NoError(t, err) 4920 } 4921 select { 4922 case <-js.PublishAsyncComplete(): 4923 case <-time.After(10 * time.Second): 4924 t.Fatalf("Did not receive completion signal") 4925 } 4926 } 4927 4928 jsz, _ := ml.Jsz(nil) 4929 expectedMsgs := jsz.Messages 4930 4931 // Now select a server and shut it down, removing the storage directory. 4932 s := c.randomNonLeader() 4933 sd := s.JetStreamConfig().StoreDir 4934 s.Shutdown() 4935 removeDir(t, sd) 4936 4937 // Now restart the server. 4938 s = c.restartServer(s) 4939 4940 // We want to force the follower to bail before the catchup through the 4941 // upper level catchup logic completes. 4942 checkFor(t, 5*time.Second, 10*time.Millisecond, func() error { 4943 jsz, _ := s.Jsz(nil) 4944 if jsz.Messages > expectedMsgs/2 { 4945 s.Shutdown() 4946 return nil 4947 } 4948 return fmt.Errorf("Not enough yet") 4949 }) 4950 4951 // Now restart the server. 
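// Second restart: the follower was shut down roughly halfway through its catchup above, so this
// run verifies it can resume from that partial state and still converge on the full message
// count in the check below.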
4952 s = c.restartServer(s) 4953 4954 checkFor(t, 5*time.Second, 500*time.Millisecond, func() error { 4955 jsz, _ := s.Jsz(nil) 4956 if jsz.Messages == expectedMsgs { 4957 return nil 4958 } 4959 return fmt.Errorf("Not enough yet") 4960 }) 4961 } 4962 4963 func TestNoRaceJetStreamAccountLimitsAndRestart(t *testing.T) { 4964 c := createJetStreamClusterWithTemplate(t, jsClusterAccountLimitsTempl, "A3S", 3) 4965 defer c.shutdown() 4966 4967 nc, js := jsClientConnect(t, c.randomServer()) 4968 defer nc.Close() 4969 4970 if _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Replicas: 3}); err != nil { 4971 t.Fatalf("Unexpected error: %v", err) 4972 } 4973 4974 for i := 0; i < 20_000; i++ { 4975 if _, err := js.Publish("TEST", []byte("A")); err != nil { 4976 break 4977 } 4978 if i == 5_000 { 4979 snl := c.randomNonStreamLeader("$JS", "TEST") 4980 snl.Shutdown() 4981 } 4982 } 4983 4984 c.stopAll() 4985 c.restartAll() 4986 c.waitOnLeader() 4987 c.waitOnStreamLeader("$JS", "TEST") 4988 4989 for _, cs := range c.servers { 4990 c.waitOnStreamCurrent(cs, "$JS", "TEST") 4991 } 4992 } 4993 4994 func TestNoRaceJetStreamPullConsumersAndInteriorDeletes(t *testing.T) { 4995 c := createJetStreamClusterExplicit(t, "ID", 3) 4996 defer c.shutdown() 4997 4998 nc, js := jsClientConnect(t, c.randomServer()) 4999 defer nc.Close() 5000 5001 _, err := js.AddStream(&nats.StreamConfig{ 5002 Name: "foo", 5003 Replicas: 3, 5004 MaxMsgs: 50000, 5005 Retention: nats.InterestPolicy, 5006 }) 5007 require_NoError(t, err) 5008 5009 c.waitOnStreamLeader(globalAccountName, "foo") 5010 5011 _, err = js.AddConsumer("foo", &nats.ConsumerConfig{ 5012 Durable: "foo", 5013 FilterSubject: "foo", 5014 MaxAckPending: 20000, 5015 AckWait: time.Minute, 5016 AckPolicy: nats.AckExplicitPolicy, 5017 }) 5018 require_NoError(t, err) 5019 5020 c.waitOnConsumerLeader(globalAccountName, "foo", "foo") 5021 5022 rcv := int32(0) 5023 prods := 5 5024 cons := 5 5025 wg := sync.WaitGroup{} 5026 wg.Add(prods + cons) 5027 toSend := 100000 5028 5029 for i := 0; i < cons; i++ { 5030 go func() { 5031 defer wg.Done() 5032 5033 sub, err := js.PullSubscribe("foo", "foo") 5034 if err != nil { 5035 return 5036 } 5037 for { 5038 msgs, err := sub.Fetch(200, nats.MaxWait(250*time.Millisecond)) 5039 if err != nil { 5040 if n := int(atomic.LoadInt32(&rcv)); n >= toSend { 5041 return 5042 } 5043 continue 5044 } 5045 for _, m := range msgs { 5046 m.Ack() 5047 atomic.AddInt32(&rcv, 1) 5048 } 5049 } 5050 }() 5051 } 5052 5053 for i := 0; i < prods; i++ { 5054 go func() { 5055 defer wg.Done() 5056 5057 for i := 0; i < toSend/prods; i++ { 5058 js.Publish("foo", []byte("hello")) 5059 } 5060 }() 5061 } 5062 5063 time.Sleep(time.Second) 5064 resp, err := nc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "foo", "foo"), nil, time.Second) 5065 if err != nil { 5066 t.Fatalf("Unexpected error: %v", err) 5067 } 5068 var cdResp JSApiConsumerLeaderStepDownResponse 5069 if err := json.Unmarshal(resp.Data, &cdResp); err != nil { 5070 t.Fatalf("Unexpected error: %v", err) 5071 } 5072 if cdResp.Error != nil { 5073 t.Fatalf("Unexpected error: %+v", cdResp.Error) 5074 } 5075 ch := make(chan struct{}) 5076 go func() { 5077 wg.Wait() 5078 close(ch) 5079 }() 5080 select { 5081 case <-ch: 5082 // OK 5083 case <-time.After(30 * time.Second): 5084 t.Fatalf("Consumers took too long to consumer all messages") 5085 } 5086 } 5087 5088 func TestNoRaceJetStreamClusterInterestPullConsumerStreamLimitBug(t *testing.T) { 5089 c := createJetStreamClusterExplicit(t, "JSC", 3) 5090 defer c.shutdown() 
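// This test combines interest retention with a MaxMsgs limit, a steady publisher, and 100 pull
// consumers that ack slowly and at random. The closing checks assert that the stream settles at
// the limit and that the consumer's NumPending plus NumAckPending still matches the stream's
// message count, i.e. accounting stays consistent even as the limit discards messages that were
// never acked.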
5091 5092 nc, js := jsClientConnect(t, c.randomServer()) 5093 defer nc.Close() 5094 5095 limit := uint64(1000) 5096 5097 _, err := js.AddStream(&nats.StreamConfig{ 5098 Name: "TEST", 5099 Subjects: []string{"foo"}, 5100 Retention: nats.InterestPolicy, 5101 MaxMsgs: int64(limit), 5102 Replicas: 3, 5103 }) 5104 require_NoError(t, err) 5105 5106 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "dur", AckPolicy: nats.AckExplicitPolicy}) 5107 require_NoError(t, err) 5108 5109 qch := make(chan bool) 5110 var wg sync.WaitGroup 5111 5112 // Publisher 5113 wg.Add(1) 5114 go func() { 5115 defer wg.Done() 5116 for { 5117 pt := time.NewTimer(time.Duration(rand.Intn(2)) * time.Millisecond) 5118 select { 5119 case <-pt.C: 5120 _, err := js.Publish("foo", []byte("BUG!")) 5121 require_NoError(t, err) 5122 case <-qch: 5123 pt.Stop() 5124 return 5125 } 5126 } 5127 }() 5128 5129 time.Sleep(time.Second) 5130 5131 // Pull Consumers 5132 wg.Add(100) 5133 for i := 0; i < 100; i++ { 5134 go func() { 5135 defer wg.Done() 5136 nc := natsConnect(t, c.randomServer().ClientURL()) 5137 defer nc.Close() 5138 5139 js, err := nc.JetStream(nats.MaxWait(time.Second)) 5140 require_NoError(t, err) 5141 5142 var sub *nats.Subscription 5143 for j := 0; j < 5; j++ { 5144 sub, err = js.PullSubscribe("foo", "dur") 5145 if err == nil { 5146 break 5147 } 5148 } 5149 require_NoError(t, err) 5150 5151 for { 5152 pt := time.NewTimer(time.Duration(rand.Intn(300)) * time.Millisecond) 5153 select { 5154 case <-pt.C: 5155 msgs, err := sub.Fetch(1) 5156 if err != nil { 5157 t.Logf("Got a Fetch error: %v", err) 5158 return 5159 } 5160 if len(msgs) > 0 { 5161 go func() { 5162 ackDelay := time.Duration(rand.Intn(375)+15) * time.Millisecond 5163 m := msgs[0] 5164 time.AfterFunc(ackDelay, func() { m.AckSync() }) 5165 }() 5166 } 5167 case <-qch: 5168 return 5169 } 5170 } 5171 }() 5172 } 5173 5174 // Make sure we have hit the limit for the number of messages we expected. 5175 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5176 si, err := js.StreamInfo("TEST") 5177 require_NoError(t, err) 5178 if si.State.Msgs < limit { 5179 return fmt.Errorf("Not hit limit yet") 5180 } 5181 return nil 5182 }) 5183 5184 close(qch) 5185 wg.Wait() 5186 5187 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5188 si, err := js.StreamInfo("TEST") 5189 require_NoError(t, err) 5190 ci, err := js.ConsumerInfo("TEST", "dur") 5191 require_NoError(t, err) 5192 5193 np := ci.NumPending + uint64(ci.NumAckPending) 5194 if np != si.State.Msgs { 5195 return fmt.Errorf("Expected NumPending to be %d got %d", si.State.Msgs-uint64(ci.NumAckPending), ci.NumPending) 5196 } 5197 return nil 5198 }) 5199 } 5200 5201 // Test that all peers have the direct access subs that participate in a queue group, 5202 // but only when they are current and ready. So we will start with R1, add in messages 5203 // then scale up while also still adding messages. 5204 func TestNoRaceJetStreamClusterDirectAccessAllPeersSubs(t *testing.T) { 5205 c := createJetStreamClusterExplicit(t, "JSC", 3) 5206 defer c.shutdown() 5207 5208 nc, js := jsClientConnect(t, c.randomServer()) 5209 defer nc.Close() 5210 5211 // Start as R1 5212 cfg := &StreamConfig{ 5213 Name: "TEST", 5214 Subjects: []string{"kv.>"}, 5215 MaxMsgsPer: 10, 5216 AllowDirect: true, 5217 Replicas: 1, 5218 Storage: FileStorage, 5219 } 5220 addStream(t, nc, cfg) 5221 5222 // Seed with enough messages to start then we will scale up while still adding more messages. 
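// While the R1 stream is scaled up to R3 below, direct gets must keep working: the getMsg helper
// further down issues a last_by_subj direct get and checks the Nats-Subject header on the raw
// reply, and the final per-server check confirms every peer eventually registers its own
// directSub once it is current.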
5223 num, msg := 1000, bytes.Repeat([]byte("XYZ"), 64) 5224 for i := 0; i < num; i++ { 5225 js.PublishAsync(fmt.Sprintf("kv.%d", i), msg) 5226 } 5227 select { 5228 case <-js.PublishAsyncComplete(): 5229 case <-time.After(5 * time.Second): 5230 t.Fatalf("Did not receive completion signal") 5231 } 5232 5233 getSubj := fmt.Sprintf(JSDirectMsgGetT, "TEST") 5234 getMsg := func(key string) *nats.Msg { 5235 t.Helper() 5236 req := []byte(fmt.Sprintf(`{"last_by_subj":%q}`, key)) 5237 m, err := nc.Request(getSubj, req, time.Second) 5238 require_NoError(t, err) 5239 require_True(t, m.Header.Get(JSSubject) == key) 5240 return m 5241 } 5242 5243 // Just make sure we can succeed here. 5244 getMsg("kv.22") 5245 5246 // Now crank up a go routine to continue sending more messages. 5247 qch := make(chan bool) 5248 var wg sync.WaitGroup 5249 5250 for i := 0; i < 5; i++ { 5251 wg.Add(1) 5252 go func() { 5253 defer wg.Done() 5254 nc, js := jsClientConnect(t, c.randomServer()) 5255 defer nc.Close() 5256 for { 5257 select { 5258 case <-qch: 5259 select { 5260 case <-js.PublishAsyncComplete(): 5261 case <-time.After(10 * time.Second): 5262 } 5263 return 5264 default: 5265 // Send as fast as we can. 5266 js.Publish(fmt.Sprintf("kv.%d", rand.Intn(1000)), msg) 5267 } 5268 } 5269 }() 5270 } 5271 5272 time.Sleep(200 * time.Millisecond) 5273 5274 // Now let's scale up to an R3. 5275 cfg.Replicas = 3 5276 updateStream(t, nc, cfg) 5277 5278 // Wait for the stream to register the new replicas and have a leader. 5279 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5280 si, err := js.StreamInfo("TEST") 5281 if err != nil { 5282 return err 5283 } 5284 if si.Cluster == nil { 5285 return fmt.Errorf("No cluster yet") 5286 } 5287 if si.Cluster.Leader == _EMPTY_ || len(si.Cluster.Replicas) != 2 { 5288 return fmt.Errorf("Cluster not ready yet") 5289 } 5290 return nil 5291 }) 5292 5293 close(qch) 5294 wg.Wait() 5295 5296 // Just make sure we can succeed here. 5297 getMsg("kv.22") 5298 5299 // For each non-leader check that the direct sub fires up. 5300 // We just test all, the leader will already have a directSub. 5301 for _, s := range c.servers { 5302 mset, err := s.GlobalAccount().lookupStream("TEST") 5303 require_NoError(t, err) 5304 checkFor(t, 20*time.Second, 500*time.Millisecond, func() error { 5305 mset.mu.RLock() 5306 ok := mset.directSub != nil 5307 mset.mu.RUnlock() 5308 if ok { 5309 return nil 5310 } 5311 return fmt.Errorf("No directSub yet") 5312 }) 5313 } 5314 5315 si, err := js.StreamInfo("TEST") 5316 require_NoError(t, err) 5317 5318 if si.State.Msgs == uint64(num) { 5319 t.Fatalf("Expected to see messages increase, got %d", si.State.Msgs) 5320 } 5321 5322 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 5323 // Make sure they are all the same from a state perspective. 5324 // Leader will have the expected state. 
5325 lmset, err := c.streamLeader("$G", "TEST").GlobalAccount().lookupStream("TEST") 5326 require_NoError(t, err) 5327 expected := lmset.state() 5328 5329 for _, s := range c.servers { 5330 mset, err := s.GlobalAccount().lookupStream("TEST") 5331 require_NoError(t, err) 5332 if state := mset.state(); !reflect.DeepEqual(expected, state) { 5333 return fmt.Errorf("Expected %+v, got %+v", expected, state) 5334 } 5335 } 5336 return nil 5337 }) 5338 5339 } 5340 5341 func TestNoRaceJetStreamClusterStreamNamesAndInfosMoreThanAPILimit(t *testing.T) { 5342 c := createJetStreamClusterExplicit(t, "R3S", 3) 5343 defer c.shutdown() 5344 5345 s := c.randomServer() 5346 nc, js := jsClientConnect(t, s) 5347 defer nc.Close() 5348 5349 createStream := func(name string) { 5350 t.Helper() 5351 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 5352 t.Fatalf("Unexpected error: %v", err) 5353 } 5354 } 5355 5356 max := JSApiListLimit 5357 if JSApiNamesLimit > max { 5358 max = JSApiNamesLimit 5359 } 5360 max += 10 5361 5362 for i := 0; i < max; i++ { 5363 name := fmt.Sprintf("foo_%d", i) 5364 createStream(name) 5365 } 5366 5367 // Not using the JS API here beacause we want to make sure that the 5368 // server returns the proper Total count, but also that it does not 5369 // send more than when the API limit is in one go. 5370 check := func(subj string, limit int) { 5371 t.Helper() 5372 5373 nreq := JSApiStreamNamesRequest{} 5374 b, _ := json.Marshal(nreq) 5375 msg, err := nc.Request(subj, b, 2*time.Second) 5376 require_NoError(t, err) 5377 5378 nresp := JSApiStreamNamesResponse{} 5379 json.Unmarshal(msg.Data, &nresp) 5380 if n := nresp.ApiPaged.Total; n != max { 5381 t.Fatalf("Expected total to be %v, got %v", max, n) 5382 } 5383 if n := nresp.ApiPaged.Limit; n != limit { 5384 t.Fatalf("Expected limit to be %v, got %v", limit, n) 5385 } 5386 if n := len(nresp.Streams); n != limit { 5387 t.Fatalf("Expected number of streams to be %v, got %v", limit, n) 5388 } 5389 } 5390 5391 check(JSApiStreams, JSApiNamesLimit) 5392 check(JSApiStreamList, JSApiListLimit) 5393 } 5394 5395 func TestNoRaceJetStreamClusterConsumerListPaging(t *testing.T) { 5396 c := createJetStreamClusterExplicit(t, "R3S", 3) 5397 defer c.shutdown() 5398 5399 s := c.randomNonLeader() 5400 nc, js := jsClientConnect(t, s) 5401 defer nc.Close() 5402 5403 _, err := js.AddStream(&nats.StreamConfig{ 5404 Name: "TEST", 5405 Subjects: []string{"foo"}, 5406 Replicas: 3, 5407 }) 5408 require_NoError(t, err) 5409 c.waitOnStreamLeader(globalAccountName, "TEST") 5410 5411 cfg := &nats.ConsumerConfig{ 5412 Replicas: 1, 5413 MemoryStorage: true, 5414 AckPolicy: nats.AckExplicitPolicy, 5415 } 5416 5417 // create 3000 consumers. 5418 numConsumers := 3000 5419 for i := 1; i <= numConsumers; i++ { 5420 cfg.Durable = fmt.Sprintf("d-%.4d", i) 5421 _, err := js.AddConsumer("TEST", cfg) 5422 require_NoError(t, err) 5423 } 5424 5425 // Test both names and list operations. 5426 5427 // Names 5428 reqSubj := fmt.Sprintf(JSApiConsumersT, "TEST") 5429 grabConsumerNames := func(offset int) []string { 5430 req := fmt.Sprintf(`{"offset":%d}`, offset) 5431 respMsg, err := nc.Request(reqSubj, []byte(req), time.Second) 5432 require_NoError(t, err) 5433 var resp JSApiConsumerNamesResponse 5434 err = json.Unmarshal(respMsg.Data, &resp) 5435 require_NoError(t, err) 5436 // Sanity check that we are actually paging properly around limits. 
5437 if resp.Limit < len(resp.Consumers) { 5438 t.Fatalf("Expected total limited to %d but got %d", resp.Limit, len(resp.Consumers)) 5439 } 5440 if resp.Total != numConsumers { 5441 t.Fatalf("Invalid total response: expected %d got %d", numConsumers, resp.Total) 5442 } 5443 return resp.Consumers 5444 } 5445 5446 results := make(map[string]bool) 5447 5448 for offset := 0; len(results) < numConsumers; { 5449 consumers := grabConsumerNames(offset) 5450 offset += len(consumers) 5451 for _, name := range consumers { 5452 if results[name] { 5453 t.Fatalf("Found duplicate %q", name) 5454 } 5455 results[name] = true 5456 } 5457 } 5458 5459 // List 5460 reqSubj = fmt.Sprintf(JSApiConsumerListT, "TEST") 5461 grabConsumerList := func(offset int) []*ConsumerInfo { 5462 req := fmt.Sprintf(`{"offset":%d}`, offset) 5463 respMsg, err := nc.Request(reqSubj, []byte(req), time.Second) 5464 require_NoError(t, err) 5465 var resp JSApiConsumerListResponse 5466 err = json.Unmarshal(respMsg.Data, &resp) 5467 require_NoError(t, err) 5468 // Sanity check that we are actually paging properly around limits. 5469 if resp.Limit < len(resp.Consumers) { 5470 t.Fatalf("Expected total limited to %d but got %d", resp.Limit, len(resp.Consumers)) 5471 } 5472 if resp.Total != numConsumers { 5473 t.Fatalf("Invalid total response: expected %d got %d", numConsumers, resp.Total) 5474 } 5475 return resp.Consumers 5476 } 5477 5478 results = make(map[string]bool) 5479 5480 for offset := 0; len(results) < numConsumers; { 5481 consumers := grabConsumerList(offset) 5482 offset += len(consumers) 5483 for _, ci := range consumers { 5484 name := ci.Config.Durable 5485 if results[name] { 5486 t.Fatalf("Found duplicate %q", name) 5487 } 5488 results[name] = true 5489 } 5490 } 5491 5492 if len(results) != numConsumers { 5493 t.Fatalf("Received %d / %d consumers", len(results), numConsumers) 5494 } 5495 } 5496 5497 func TestNoRaceJetStreamFileStoreLargeKVAccessTiming(t *testing.T) { 5498 storeDir := t.TempDir() 5499 5500 blkSize := uint64(4 * 1024) 5501 // Compensate for slower IO on MacOSX 5502 if runtime.GOOS == "darwin" { 5503 blkSize *= 4 5504 } 5505 5506 fs, err := newFileStore( 5507 FileStoreConfig{StoreDir: storeDir, BlockSize: blkSize, CacheExpire: 30 * time.Second}, 5508 StreamConfig{Name: "zzz", Subjects: []string{"KV.STREAM_NAME.*"}, Storage: FileStorage, MaxMsgsPer: 1}, 5509 ) 5510 require_NoError(t, err) 5511 defer fs.Stop() 5512 5513 tmpl := "KV.STREAM_NAME.%d" 5514 nkeys, val := 100_000, bytes.Repeat([]byte("Z"), 1024) 5515 5516 for i := 1; i <= nkeys; i++ { 5517 subj := fmt.Sprintf(tmpl, i) 5518 _, _, err := fs.StoreMsg(subj, nil, val) 5519 require_NoError(t, err) 5520 } 5521 5522 first := fmt.Sprintf(tmpl, 1) 5523 last := fmt.Sprintf(tmpl, nkeys) 5524 5525 start := time.Now() 5526 sm, err := fs.LoadLastMsg(last, nil) 5527 require_NoError(t, err) 5528 base := time.Since(start) 5529 5530 if !bytes.Equal(sm.msg, val) { 5531 t.Fatalf("Retrieved value did not match") 5532 } 5533 5534 start = time.Now() 5535 _, err = fs.LoadLastMsg(first, nil) 5536 require_NoError(t, err) 5537 slow := time.Since(start) 5538 5539 if base > 100*time.Microsecond || slow > 200*time.Microsecond { 5540 t.Fatalf("Took too long to look up first key vs last: %v vs %v", base, slow) 5541 } 5542 5543 // time first seq lookup for both as well. 5544 // Base will be first in this case. 
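// firstSeqForSubj is an internal helper, so the store's read lock is taken explicitly here,
// presumably because the method expects its caller to hold it; the same sort of latency bound is
// then applied as for the LoadLastMsg timings above.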
5545 fs.mu.RLock() 5546 start = time.Now() 5547 fs.firstSeqForSubj(first) 5548 base = time.Since(start) 5549 start = time.Now() 5550 fs.firstSeqForSubj(last) 5551 slow = time.Since(start) 5552 fs.mu.RUnlock() 5553 5554 if base > 100*time.Microsecond || slow > 200*time.Microsecond { 5555 t.Fatalf("Took too long to look up last key by subject vs first: %v vs %v", base, slow) 5556 } 5557 } 5558 5559 func TestNoRaceJetStreamKVLock(t *testing.T) { 5560 s := RunBasicJetStreamServer(t) 5561 defer s.Shutdown() 5562 5563 nc, js := jsClientConnect(t, s) 5564 defer nc.Close() 5565 5566 _, err := js.CreateKeyValue(&nats.KeyValueConfig{Bucket: "LOCKS"}) 5567 require_NoError(t, err) 5568 5569 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 5570 defer cancel() 5571 5572 var wg sync.WaitGroup 5573 start := make(chan bool) 5574 5575 var tracker int64 5576 5577 for i := 0; i < 100; i++ { 5578 wg.Add(1) 5579 go func() { 5580 defer wg.Done() 5581 5582 nc, js := jsClientConnect(t, s) 5583 defer nc.Close() 5584 kv, err := js.KeyValue("LOCKS") 5585 require_NoError(t, err) 5586 5587 <-start 5588 5589 for { 5590 last, err := kv.Create("MY_LOCK", []byte("Z")) 5591 if err != nil { 5592 select { 5593 case <-time.After(10 * time.Millisecond): 5594 continue 5595 case <-ctx.Done(): 5596 return 5597 } 5598 } 5599 5600 if v := atomic.AddInt64(&tracker, 1); v != 1 { 5601 t.Logf("TRACKER NOT 1 -> %d\n", v) 5602 cancel() 5603 } 5604 5605 time.Sleep(10 * time.Millisecond) 5606 if v := atomic.AddInt64(&tracker, -1); v != 0 { 5607 t.Logf("TRACKER NOT 0 AFTER RELEASE -> %d\n", v) 5608 cancel() 5609 } 5610 5611 err = kv.Delete("MY_LOCK", nats.LastRevision(last)) 5612 if err != nil { 5613 t.Logf("Could not unlock for last %d: %v", last, err) 5614 } 5615 5616 if ctx.Err() != nil { 5617 return 5618 } 5619 } 5620 }() 5621 } 5622 5623 close(start) 5624 wg.Wait() 5625 } 5626 5627 func TestNoRaceJetStreamSuperClusterStreamMoveLongRTT(t *testing.T) { 5628 // Make C2 far away. 5629 gwm := gwProxyMap{ 5630 "C2": &gwProxy{ 5631 rtt: 20 * time.Millisecond, 5632 up: 1 * 1024 * 1024 * 1024, // 1gbit 5633 down: 1 * 1024 * 1024 * 1024, // 1gbit 5634 }, 5635 } 5636 sc := createJetStreamTaggedSuperClusterWithGWProxy(t, gwm) 5637 defer sc.shutdown() 5638 5639 nc, js := jsClientConnect(t, sc.randomServer()) 5640 defer nc.Close() 5641 5642 cfg := &nats.StreamConfig{ 5643 Name: "TEST", 5644 Subjects: []string{"chunk.*"}, 5645 Placement: &nats.Placement{Tags: []string{"cloud:aws", "country:us"}}, 5646 Replicas: 3, 5647 } 5648 5649 // Place a stream in C1. 5650 _, err := js.AddStream(cfg, nats.MaxWait(10*time.Second)) 5651 require_NoError(t, err) 5652 5653 chunk := bytes.Repeat([]byte("Z"), 1000*1024) // ~1MB 5654 // 256 MB 5655 for i := 0; i < 256; i++ { 5656 subj := fmt.Sprintf("chunk.%d", i) 5657 js.PublishAsync(subj, chunk) 5658 } 5659 select { 5660 case <-js.PublishAsyncComplete(): 5661 case <-time.After(10 * time.Second): 5662 t.Fatalf("Did not receive completion signal") 5663 } 5664 5665 // C2, slow RTT. 
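// Changing the placement tags initiates a stream move: peers are added in C2, the ~256MB of data
// is caught up across the 20ms-RTT gateway proxy configured above, and the peer set then shrinks
// back to the configured three replicas; the checkFor below waits for a C2 leader with exactly
// two remaining replicas.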
5666 cfg.Placement = &nats.Placement{Tags: []string{"cloud:gcp", "country:uk"}} 5667 _, err = js.UpdateStream(cfg) 5668 require_NoError(t, err) 5669 5670 checkFor(t, 20*time.Second, time.Second, func() error { 5671 si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)) 5672 if err != nil { 5673 return err 5674 } 5675 if si.Cluster.Name != "C2" { 5676 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 5677 } 5678 if si.Cluster.Leader == _EMPTY_ { 5679 return fmt.Errorf("No leader yet") 5680 } else if !strings.HasPrefix(si.Cluster.Leader, "C2-") { 5681 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 5682 } 5683 // Now we want to see that we shrink back to original. 5684 if len(si.Cluster.Replicas) != cfg.Replicas-1 { 5685 return fmt.Errorf("Expected %d replicas, got %d", cfg.Replicas-1, len(si.Cluster.Replicas)) 5686 } 5687 return nil 5688 }) 5689 } 5690 5691 // https://github.com/nats-io/nats-server/issues/3455 5692 func TestNoRaceJetStreamConcurrentPullConsumerBatch(t *testing.T) { 5693 s := RunBasicJetStreamServer(t) 5694 defer s.Shutdown() 5695 5696 nc, js := jsClientConnect(t, s) 5697 defer nc.Close() 5698 5699 _, err := js.AddStream(&nats.StreamConfig{ 5700 Name: "TEST", 5701 Subjects: []string{"ORDERS.*"}, 5702 Storage: nats.MemoryStorage, 5703 Retention: nats.WorkQueuePolicy, 5704 }) 5705 require_NoError(t, err) 5706 5707 toSend := int32(100_000) 5708 5709 for i := 0; i < 100_000; i++ { 5710 subj := fmt.Sprintf("ORDERS.%d", i+1) 5711 js.PublishAsync(subj, []byte("BUY")) 5712 } 5713 select { 5714 case <-js.PublishAsyncComplete(): 5715 case <-time.After(5 * time.Second): 5716 t.Fatalf("Did not receive completion signal") 5717 } 5718 5719 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5720 Durable: "PROCESSOR", 5721 AckPolicy: nats.AckExplicitPolicy, 5722 MaxAckPending: 5000, 5723 }) 5724 require_NoError(t, err) 5725 5726 nc, js = jsClientConnect(t, s) 5727 defer nc.Close() 5728 5729 sub1, err := js.PullSubscribe(_EMPTY_, _EMPTY_, nats.Bind("TEST", "PROCESSOR")) 5730 require_NoError(t, err) 5731 5732 nc, js = jsClientConnect(t, s) 5733 defer nc.Close() 5734 5735 sub2, err := js.PullSubscribe(_EMPTY_, _EMPTY_, nats.Bind("TEST", "PROCESSOR")) 5736 require_NoError(t, err) 5737 5738 startCh := make(chan bool) 5739 5740 var received int32 5741 5742 wg := sync.WaitGroup{} 5743 5744 fetchSize := 1000 5745 fetch := func(sub *nats.Subscription) { 5746 <-startCh 5747 defer wg.Done() 5748 5749 for { 5750 msgs, err := sub.Fetch(fetchSize, nats.MaxWait(time.Second)) 5751 if atomic.AddInt32(&received, int32(len(msgs))) >= toSend { 5752 break 5753 } 5754 // We should always receive a full batch here if not last competing fetch. 5755 if err != nil || len(msgs) != fetchSize { 5756 break 5757 } 5758 for _, m := range msgs { 5759 m.Ack() 5760 } 5761 } 5762 } 5763 5764 wg.Add(2) 5765 5766 go fetch(sub1) 5767 go fetch(sub2) 5768 5769 close(startCh) 5770 5771 wg.Wait() 5772 require_True(t, received == toSend) 5773 } 5774 5775 func TestNoRaceJetStreamManyPullConsumersNeedAckOptimization(t *testing.T) { 5776 // Uncomment to run. Do not want as part of Travis tests atm. 5777 // Run with cpu and memory profiling to make sure we have improved. 
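// The expensive part is the single AckSync at the end: with AckPolicy AckAll, acking one message
// acknowledges everything up to that sequence, and doing so on a stream carrying 500 filtered
// consumers is the path whose cost this profiling run is meant to capture.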
5778 skip(t) 5779 5780 s := RunBasicJetStreamServer(t) 5781 defer s.Shutdown() 5782 5783 nc, js := jsClientConnect(t, s) 5784 defer nc.Close() 5785 5786 _, err := js.AddStream(&nats.StreamConfig{ 5787 Name: "ORDERS", 5788 Subjects: []string{"ORDERS.*"}, 5789 Storage: nats.MemoryStorage, 5790 Retention: nats.InterestPolicy, 5791 }) 5792 require_NoError(t, err) 5793 5794 toSend := 100_000 5795 numConsumers := 500 5796 5797 // Create 500 consumers 5798 for i := 1; i <= numConsumers; i++ { 5799 _, err := js.AddConsumer("ORDERS", &nats.ConsumerConfig{ 5800 Durable: fmt.Sprintf("ORDERS_%d", i), 5801 FilterSubject: fmt.Sprintf("ORDERS.%d", i), 5802 AckPolicy: nats.AckAllPolicy, 5803 }) 5804 require_NoError(t, err) 5805 } 5806 5807 for i := 1; i <= toSend; i++ { 5808 subj := fmt.Sprintf("ORDERS.%d", i%numConsumers+1) 5809 js.PublishAsync(subj, []byte("HELLO")) 5810 } 5811 select { 5812 case <-js.PublishAsyncComplete(): 5813 case <-time.After(5 * time.Second): 5814 t.Fatalf("Did not receive completion signal") 5815 } 5816 5817 sub, err := js.PullSubscribe("ORDERS.500", "ORDERS_500") 5818 require_NoError(t, err) 5819 5820 fetchSize := toSend / numConsumers 5821 msgs, err := sub.Fetch(fetchSize, nats.MaxWait(time.Second)) 5822 require_NoError(t, err) 5823 5824 last := msgs[len(msgs)-1] 5825 last.AckSync() 5826 } 5827 5828 // https://github.com/nats-io/nats-server/issues/3499 5829 func TestNoRaceJetStreamDeleteConsumerWithInterestStreamAndHighSeqs(t *testing.T) { 5830 s := RunBasicJetStreamServer(t) 5831 defer s.Shutdown() 5832 5833 // Client for API requests. 5834 nc, js := jsClientConnect(t, s) 5835 defer nc.Close() 5836 5837 _, err := js.AddStream(&nats.StreamConfig{ 5838 Name: "TEST", 5839 Subjects: []string{"log.>"}, 5840 Retention: nats.InterestPolicy, 5841 }) 5842 require_NoError(t, err) 5843 5844 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5845 Durable: "c", 5846 AckPolicy: nats.AckExplicitPolicy, 5847 }) 5848 require_NoError(t, err) 5849 5850 // Set baseline for time to delete so we can see linear increase as sequence numbers increase. 5851 start := time.Now() 5852 err = js.DeleteConsumer("TEST", "c") 5853 require_NoError(t, err) 5854 elapsed := time.Since(start) 5855 5856 // Crank up sequence numbers. 5857 msg := []byte(strings.Repeat("ZZZ", 128)) 5858 for i := 0; i < 5_000_000; i++ { 5859 nc.Publish("log.Z", msg) 5860 } 5861 nc.Flush() 5862 5863 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5864 Durable: "c", 5865 AckPolicy: nats.AckExplicitPolicy, 5866 }) 5867 require_NoError(t, err) 5868 5869 // We have a bug that spins unecessarily through all the sequences from this consumer's 5870 // ackfloor(0) and the last sequence for the stream. We will detect by looking for the time 5871 // to delete being 100x more. Should be the same since both times no messages exist in the stream. 5872 start = time.Now() 5873 err = js.DeleteConsumer("TEST", "c") 5874 require_NoError(t, err) 5875 5876 if e := time.Since(start); e > 100*elapsed { 5877 t.Fatalf("Consumer delete took too long: %v vs baseline %v", e, elapsed) 5878 } 5879 } 5880 5881 // Bug when we encode a timestamp that upon decode causes an error which causes server to panic. 5882 // This can happen on consumer redelivery since they adjusted timstamps can be in the future, and result 5883 // in a negative encoding. If that encoding was exactly -1 seconds, would cause decodeConsumerState to fail 5884 // and the server to panic. 
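// The loop below reproduces this by repeatedly encoding a consumer state whose single pending
// entry has a timestamp about one second in the future and verifying that decodeConsumerState
// round-trips it without error; iterating 200k times makes it likely to land on the exact
// boundary that used to fail.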
5885 func TestNoRaceEncodeConsumerStateBug(t *testing.T) { 5886 for i := 0; i < 200_000; i++ { 5887 // Pretend we redelivered and updated the timestamp to reflect the new start time for expiration. 5888 // The bug will trip when time.Now() rounded to seconds in encode is 1 second below the truncated version 5889 // of pending. 5890 pending := Pending{Sequence: 1, Timestamp: time.Now().Add(time.Second).UnixNano()} 5891 state := ConsumerState{ 5892 Delivered: SequencePair{Consumer: 1, Stream: 1}, 5893 Pending: map[uint64]*Pending{1: &pending}, 5894 } 5895 buf := encodeConsumerState(&state) 5896 _, err := decodeConsumerState(buf) 5897 require_NoError(t, err) 5898 } 5899 } 5900 5901 // Performance impact on stream ingress with large number of consumers. 5902 func TestNoRaceJetStreamLargeNumConsumersPerfImpact(t *testing.T) { 5903 skip(t) 5904 5905 s := RunBasicJetStreamServer(t) 5906 defer s.Shutdown() 5907 5908 // Client for API requests. 5909 nc, js := jsClientConnect(t, s) 5910 defer nc.Close() 5911 5912 _, err := js.AddStream(&nats.StreamConfig{ 5913 Name: "TEST", 5914 Subjects: []string{"foo"}, 5915 }) 5916 require_NoError(t, err) 5917 5918 // Baseline with no consumers. 5919 toSend := 1_000_000 5920 start := time.Now() 5921 for i := 0; i < toSend; i++ { 5922 js.PublishAsync("foo", []byte("OK")) 5923 } 5924 <-js.PublishAsyncComplete() 5925 tt := time.Since(start) 5926 fmt.Printf("Base time is %v\n", tt) 5927 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5928 5929 err = js.PurgeStream("TEST") 5930 require_NoError(t, err) 5931 5932 // Now add in 10 idle consumers. 5933 for i := 1; i <= 10; i++ { 5934 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5935 Durable: fmt.Sprintf("d-%d", i), 5936 AckPolicy: nats.AckExplicitPolicy, 5937 }) 5938 require_NoError(t, err) 5939 } 5940 5941 start = time.Now() 5942 for i := 0; i < toSend; i++ { 5943 js.PublishAsync("foo", []byte("OK")) 5944 } 5945 <-js.PublishAsyncComplete() 5946 tt = time.Since(start) 5947 fmt.Printf("\n10 consumers time is %v\n", tt) 5948 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5949 5950 err = js.PurgeStream("TEST") 5951 require_NoError(t, err) 5952 5953 // Now add in 90 more idle consumers. 5954 for i := 11; i <= 100; i++ { 5955 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5956 Durable: fmt.Sprintf("d-%d", i), 5957 AckPolicy: nats.AckExplicitPolicy, 5958 }) 5959 require_NoError(t, err) 5960 } 5961 5962 start = time.Now() 5963 for i := 0; i < toSend; i++ { 5964 js.PublishAsync("foo", []byte("OK")) 5965 } 5966 <-js.PublishAsyncComplete() 5967 tt = time.Since(start) 5968 fmt.Printf("\n100 consumers time is %v\n", tt) 5969 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5970 5971 err = js.PurgeStream("TEST") 5972 require_NoError(t, err) 5973 5974 // Now add in 900 more 5975 for i := 101; i <= 1000; i++ { 5976 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 5977 Durable: fmt.Sprintf("d-%d", i), 5978 AckPolicy: nats.AckExplicitPolicy, 5979 }) 5980 require_NoError(t, err) 5981 } 5982 5983 start = time.Now() 5984 for i := 0; i < toSend; i++ { 5985 js.PublishAsync("foo", []byte("OK")) 5986 } 5987 <-js.PublishAsyncComplete() 5988 tt = time.Since(start) 5989 fmt.Printf("\n1000 consumers time is %v\n", tt) 5990 fmt.Printf("%.0f msgs/sec\n", float64(toSend)/tt.Seconds()) 5991 } 5992 5993 // Performance impact on large number of consumers but sparse delivery. 
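// "Sparse" here means that of the ~10k filtered consumers created below only two subjects (ID.1
// and ID.2) ever see traffic, so what is being measured is the per-message cost of signaling all
// of those consumers on ingest, not actual delivery fan-out.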
5994 func TestNoRaceJetStreamLargeNumConsumersSparseDelivery(t *testing.T) { 5995 skip(t) 5996 5997 s := RunBasicJetStreamServer(t) 5998 defer s.Shutdown() 5999 6000 // Client for API requests. 6001 nc, js := jsClientConnect(t, s) 6002 defer nc.Close() 6003 6004 _, err := js.AddStream(&nats.StreamConfig{ 6005 Name: "TEST", 6006 Subjects: []string{"ID.*"}, 6007 }) 6008 require_NoError(t, err) 6009 6010 // Now add in ~10k consumers on different subjects. 6011 for i := 3; i <= 10_000; i++ { 6012 _, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 6013 Durable: fmt.Sprintf("d-%d", i), 6014 FilterSubject: fmt.Sprintf("ID.%d", i), 6015 AckPolicy: nats.AckNonePolicy, 6016 }) 6017 require_NoError(t, err) 6018 } 6019 6020 toSend := 100_000 6021 6022 // Bind a consumer to ID.2. 6023 var received int 6024 done := make(chan bool) 6025 6026 nc, js = jsClientConnect(t, s) 6027 defer nc.Close() 6028 6029 mh := func(m *nats.Msg) { 6030 received++ 6031 if received >= toSend { 6032 close(done) 6033 } 6034 } 6035 _, err = js.Subscribe("ID.2", mh) 6036 require_NoError(t, err) 6037 6038 last := make(chan bool) 6039 _, err = js.Subscribe("ID.1", func(_ *nats.Msg) { close(last) }) 6040 require_NoError(t, err) 6041 6042 nc, _ = jsClientConnect(t, s) 6043 defer nc.Close() 6044 js, err = nc.JetStream(nats.PublishAsyncMaxPending(8 * 1024)) 6045 require_NoError(t, err) 6046 6047 start := time.Now() 6048 for i := 0; i < toSend; i++ { 6049 js.PublishAsync("ID.2", []byte("ok")) 6050 } 6051 // Check latency for this one message. 6052 // This will show the issue better than throughput which can bypass signal processing. 6053 js.PublishAsync("ID.1", []byte("ok")) 6054 6055 select { 6056 case <-done: 6057 break 6058 case <-time.After(10 * time.Second): 6059 t.Fatalf("Failed to receive all messages: %d of %d\n", received, toSend) 6060 } 6061 6062 tt := time.Since(start) 6063 fmt.Printf("Took %v to receive %d msgs\n", tt, toSend) 6064 fmt.Printf("%.0f msgs/s\n", float64(toSend)/tt.Seconds()) 6065 6066 select { 6067 case <-last: 6068 break 6069 case <-time.After(30 * time.Second): 6070 t.Fatalf("Failed to receive last message\n") 6071 } 6072 lt := time.Since(start) 6073 6074 fmt.Printf("Took %v to receive last msg\n", lt) 6075 } 6076 6077 func TestNoRaceJetStreamEndToEndLatency(t *testing.T) { 6078 s := RunBasicJetStreamServer(t) 6079 defer s.Shutdown() 6080 6081 // Client for API requests. 
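// Note that this keeps exactly one message in flight at a time: the publisher waits on the next
// channel before sending again, so max tracks worst-case end-to-end (publish to delivery) latency
// rather than queueing delay under load.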
6082 nc, js := jsClientConnect(t, s) 6083 defer nc.Close() 6084 6085 _, err := js.AddStream(&nats.StreamConfig{ 6086 Name: "TEST", 6087 Subjects: []string{"foo"}, 6088 }) 6089 require_NoError(t, err) 6090 6091 nc, js = jsClientConnect(t, s) 6092 defer nc.Close() 6093 6094 var sent time.Time 6095 var max time.Duration 6096 next := make(chan struct{}) 6097 6098 mh := func(m *nats.Msg) { 6099 received := time.Now() 6100 tt := received.Sub(sent) 6101 if max == 0 || tt > max { 6102 max = tt 6103 } 6104 next <- struct{}{} 6105 } 6106 sub, err := js.Subscribe("foo", mh) 6107 require_NoError(t, err) 6108 6109 nc, js = jsClientConnect(t, s) 6110 defer nc.Close() 6111 6112 toSend := 50_000 6113 for i := 0; i < toSend; i++ { 6114 sent = time.Now() 6115 js.Publish("foo", []byte("ok")) 6116 <-next 6117 } 6118 sub.Unsubscribe() 6119 6120 if max > 250*time.Millisecond { 6121 t.Fatalf("Expected max latency to be < 250ms, got %v", max) 6122 } 6123 } 6124 6125 func TestNoRaceJetStreamClusterEnsureWALCompact(t *testing.T) { 6126 c := createJetStreamClusterExplicit(t, "R3S", 3) 6127 defer c.shutdown() 6128 6129 nc, js := jsClientConnect(t, c.randomServer()) 6130 defer nc.Close() 6131 6132 _, err := js.AddStream(&nats.StreamConfig{ 6133 Name: "TEST", 6134 Subjects: []string{"foo"}, 6135 Replicas: 3, 6136 }) 6137 require_NoError(t, err) 6138 6139 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 6140 Durable: "dlc", 6141 DeliverSubject: "zz", 6142 Replicas: 3, 6143 }) 6144 require_NoError(t, err) 6145 6146 // Force snapshot on stream leader. 6147 sl := c.streamLeader(globalAccountName, "TEST") 6148 mset, err := sl.GlobalAccount().lookupStream("TEST") 6149 require_NoError(t, err) 6150 node := mset.raftNode() 6151 require_True(t, node != nil) 6152 6153 err = node.InstallSnapshot(mset.stateSnapshot()) 6154 require_NoError(t, err) 6155 6156 // Now publish more than should be needed to cause an additional snapshot. 6157 ns := 75_000 6158 for i := 0; i <= ns; i++ { 6159 _, err := js.Publish("foo", []byte("bar")) 6160 require_NoError(t, err) 6161 } 6162 6163 // Grab progress and use that to look into WAL entries. 6164 _, _, applied := node.Progress() 6165 // If ne == ns that means snapshots and compacts were not happening when 6166 // they should have been. 6167 if ne, _ := node.Applied(applied); ne >= uint64(ns) { 6168 t.Fatalf("Did not snapshot and compact the raft WAL, entries == %d", ne) 6169 } 6170 6171 // Now check consumer. 6172 // Force snapshot on consumerleader. 6173 cl := c.consumerLeader(globalAccountName, "TEST", "dlc") 6174 mset, err = cl.GlobalAccount().lookupStream("TEST") 6175 require_NoError(t, err) 6176 o := mset.lookupConsumer("dlc") 6177 require_True(t, o != nil) 6178 6179 node = o.raftNode() 6180 require_True(t, node != nil) 6181 6182 snap, err := o.store.EncodedState() 6183 require_NoError(t, err) 6184 err = node.InstallSnapshot(snap) 6185 require_NoError(t, err) 6186 6187 received, done := 0, make(chan bool, 1) 6188 6189 nc.Subscribe("zz", func(m *nats.Msg) { 6190 received++ 6191 if received >= ns { 6192 select { 6193 case done <- true: 6194 default: 6195 } 6196 } 6197 m.Ack() 6198 }) 6199 6200 select { 6201 case <-done: 6202 return 6203 case <-time.After(10 * time.Second): 6204 t.Fatalf("Did not received all %d msgs, only %d", ns, received) 6205 } 6206 6207 // Do same trick and check that WAL was compacted. 6208 // Grab progress and use that to look into WAL entries. 
6209 _, _, applied = node.Progress() 6210 // If ne == ns that means snapshots and compacts were not happening when 6211 // they should have been. 6212 if ne, _ := node.Applied(applied); ne >= uint64(ns) { 6213 t.Fatalf("Did not snapshot and compact the raft WAL, entries == %d", ne) 6214 } 6215 } 6216 6217 func TestNoRaceFileStoreStreamMaxAgePerformance(t *testing.T) { 6218 // Uncomment to run. 6219 skip(t) 6220 6221 storeDir := t.TempDir() 6222 maxAge := 5 * time.Second 6223 6224 fs, err := newFileStore( 6225 FileStoreConfig{StoreDir: storeDir}, 6226 StreamConfig{Name: "MA", 6227 Subjects: []string{"foo.*"}, 6228 MaxAge: maxAge, 6229 Storage: FileStorage}, 6230 ) 6231 require_NoError(t, err) 6232 defer fs.Stop() 6233 6234 // Simulate a callback similar to consumers decrementing. 6235 var mu sync.RWMutex 6236 var pending int64 6237 6238 fs.RegisterStorageUpdates(func(md, bd int64, seq uint64, subj string) { 6239 mu.Lock() 6240 defer mu.Unlock() 6241 pending += md 6242 }) 6243 6244 start, num, subj := time.Now(), 0, "foo.foo" 6245 6246 timeout := start.Add(maxAge) 6247 for time.Now().Before(timeout) { 6248 // We will store in blocks of 100. 6249 for i := 0; i < 100; i++ { 6250 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 6251 require_NoError(t, err) 6252 num++ 6253 } 6254 } 6255 elapsed := time.Since(start) 6256 fmt.Printf("Took %v to store %d\n", elapsed, num) 6257 fmt.Printf("%.0f msgs/sec\n", float64(num)/elapsed.Seconds()) 6258 6259 // Now keep running for 2x longer knowing we are expiring messages in the background. 6260 // We want to see the effect on performance. 6261 6262 start = time.Now() 6263 timeout = start.Add(maxAge * 2) 6264 6265 for time.Now().Before(timeout) { 6266 // We will store in blocks of 100. 6267 for i := 0; i < 100; i++ { 6268 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 6269 require_NoError(t, err) 6270 num++ 6271 } 6272 } 6273 elapsed = time.Since(start) 6274 fmt.Printf("Took %v to store %d\n", elapsed, num) 6275 fmt.Printf("%.0f msgs/sec\n", float64(num)/elapsed.Seconds()) 6276 } 6277 6278 // SequenceSet memory tests vs dmaps. 6279 func TestNoRaceSeqSetSizeComparison(t *testing.T) { 6280 // Create 5M random entries (dupes possible but ok for this test) out of 8M range. 6281 num := 5_000_000 6282 max := 7_000_000 6283 6284 seqs := make([]uint64, 0, num) 6285 for i := 0; i < num; i++ { 6286 n := uint64(rand.Int63n(int64(max + 1))) 6287 seqs = append(seqs, n) 6288 } 6289 6290 runtime.GC() 6291 // Disable to get stable results. 6292 gcp := debug.SetGCPercent(-1) 6293 defer debug.SetGCPercent(gcp) 6294 6295 mem := runtime.MemStats{} 6296 runtime.ReadMemStats(&mem) 6297 inUseBefore := mem.HeapInuse 6298 6299 dmap := make(map[uint64]struct{}, num) 6300 for _, n := range seqs { 6301 dmap[n] = struct{}{} 6302 } 6303 runtime.ReadMemStats(&mem) 6304 dmapUse := mem.HeapInuse - inUseBefore 6305 inUseBefore = mem.HeapInuse 6306 6307 // Now do SequenceSet on same dataset. 
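// A map[uint64]struct{} pays per-entry hashing and bucket overhead for every sequence, whereas
// avl.SequenceSet packs nearby sequences into compact nodes (roughly, fixed-size bitmaps kept in a
// balanced tree), which is where the large heap savings checked below come from.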
6308 var sset avl.SequenceSet 6309 for _, n := range seqs { 6310 sset.Insert(n) 6311 } 6312 6313 runtime.ReadMemStats(&mem) 6314 seqSetUse := mem.HeapInuse - inUseBefore 6315 6316 if seqSetUse > 2*1024*1024 { 6317 t.Fatalf("Expected SequenceSet size to be < 2M, got %v", friendlyBytes(int64(seqSetUse))) 6318 } 6319 if seqSetUse*50 > dmapUse { 6320 t.Fatalf("Expected SequenceSet to be at least 50x better then dmap approach: %v vs %v", 6321 friendlyBytes(int64(seqSetUse)), 6322 friendlyBytes(int64(dmapUse)), 6323 ) 6324 } 6325 } 6326 6327 // FilteredState for ">" with large interior deletes was very slow. 6328 func TestNoRaceFileStoreFilteredStateWithLargeDeletes(t *testing.T) { 6329 storeDir := t.TempDir() 6330 6331 fs, err := newFileStore( 6332 FileStoreConfig{StoreDir: storeDir, BlockSize: 4096}, 6333 StreamConfig{Name: "zzz", Subjects: []string{"foo"}, Storage: FileStorage}, 6334 ) 6335 require_NoError(t, err) 6336 defer fs.Stop() 6337 6338 subj, msg := "foo", []byte("Hello World") 6339 6340 toStore := 500_000 6341 for i := 0; i < toStore; i++ { 6342 _, _, err := fs.StoreMsg(subj, nil, msg) 6343 require_NoError(t, err) 6344 } 6345 6346 // Now delete every other one. 6347 for seq := 2; seq <= toStore; seq += 2 { 6348 _, err := fs.RemoveMsg(uint64(seq)) 6349 require_NoError(t, err) 6350 } 6351 6352 runtime.GC() 6353 // Disable to get stable results. 6354 gcp := debug.SetGCPercent(-1) 6355 defer debug.SetGCPercent(gcp) 6356 6357 start := time.Now() 6358 fss := fs.FilteredState(1, _EMPTY_) 6359 elapsed := time.Since(start) 6360 6361 require_True(t, fss.Msgs == uint64(toStore/2)) 6362 require_True(t, elapsed < 500*time.Microsecond) 6363 } 6364 6365 // ConsumerInfo seems to being called quite a bit more than we had anticipated. 6366 // Under certain circumstances, since we reset num pending, this can be very costly. 6367 // We will use the fast path to alleviate that performance bottleneck but also make 6368 // sure we are still being accurate. 6369 func TestNoRaceJetStreamClusterConsumerInfoSpeed(t *testing.T) { 6370 c := createJetStreamClusterExplicit(t, "R3S", 3) 6371 defer c.shutdown() 6372 6373 c.waitOnLeader() 6374 server := c.randomNonLeader() 6375 6376 nc, js := jsClientConnect(t, server) 6377 defer nc.Close() 6378 6379 _, err := js.AddStream(&nats.StreamConfig{ 6380 Name: "TEST", 6381 Subjects: []string{"events.>"}, 6382 Replicas: 3, 6383 }) 6384 require_NoError(t, err) 6385 6386 // The issue is compounded when we have lots of different subjects captured 6387 // by a terminal fwc. The consumer will have a terminal pwc. 6388 // Here make all subjects unique. 6389 6390 sub, err := js.PullSubscribe("events.*", "DLC") 6391 require_NoError(t, err) 6392 6393 toSend := 250_000 6394 for i := 0; i < toSend; i++ { 6395 subj := fmt.Sprintf("events.%d", i+1) 6396 js.PublishAsync(subj, []byte("ok")) 6397 } 6398 select { 6399 case <-js.PublishAsyncComplete(): 6400 case <-time.After(5 * time.Second): 6401 t.Fatalf("Did not receive completion signal") 6402 } 6403 6404 checkNumPending := func(expected int) { 6405 t.Helper() 6406 start := time.Now() 6407 ci, err := js.ConsumerInfo("TEST", "DLC") 6408 require_NoError(t, err) 6409 // Make sure these are fast now. 6410 if elapsed := time.Since(start); elapsed > 5*time.Millisecond { 6411 t.Fatalf("ConsumerInfo took too long: %v", elapsed) 6412 } 6413 // Make sure pending == expected. 
6414 if ci.NumPending != uint64(expected) { 6415 t.Fatalf("Expected %d NumPending, got %d", expected, ci.NumPending) 6416 } 6417 } 6418 // Make sure in simple case it is correct. 6419 checkNumPending(toSend) 6420 6421 // Do a few acks. 6422 toAck := 25 6423 for _, m := range fetchMsgs(t, sub, 25, time.Second) { 6424 err = m.AckSync() 6425 require_NoError(t, err) 6426 } 6427 checkNumPending(toSend - toAck) 6428 6429 // Now do a purge such that we only keep so many. 6430 // We want to make sure we do the right thing here and have correct calculations. 6431 toKeep := 100_000 6432 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Keep: uint64(toKeep)}) 6433 require_NoError(t, err) 6434 6435 checkNumPending(toKeep) 6436 } 6437 6438 func TestNoRaceJetStreamKVAccountWithServerRestarts(t *testing.T) { 6439 // Uncomment to run. Needs fast machine to not time out on KeyValue lookup. 6440 skip(t) 6441 6442 c := createJetStreamClusterExplicit(t, "R3S", 3) 6443 defer c.shutdown() 6444 6445 nc, js := jsClientConnect(t, c.randomServer()) 6446 defer nc.Close() 6447 6448 _, err := js.CreateKeyValue(&nats.KeyValueConfig{ 6449 Bucket: "TEST", 6450 Replicas: 3, 6451 }) 6452 require_NoError(t, err) 6453 6454 npubs := 10_000 6455 par := 8 6456 iter := 2 6457 nsubjs := 250 6458 6459 wg := sync.WaitGroup{} 6460 putKeys := func() { 6461 wg.Add(1) 6462 go func() { 6463 defer wg.Done() 6464 nc, js := jsClientConnect(t, c.randomServer()) 6465 defer nc.Close() 6466 kv, err := js.KeyValue("TEST") 6467 require_NoError(t, err) 6468 6469 for i := 0; i < npubs; i++ { 6470 subj := fmt.Sprintf("KEY-%d", rand.Intn(nsubjs)) 6471 if _, err := kv.PutString(subj, "hello"); err != nil { 6472 nc, js := jsClientConnect(t, c.randomServer()) 6473 defer nc.Close() 6474 kv, err = js.KeyValue("TEST") 6475 require_NoError(t, err) 6476 } 6477 } 6478 }() 6479 } 6480 6481 restartServers := func() { 6482 time.Sleep(2 * time.Second) 6483 // Rotate through and restart the servers. 6484 for _, server := range c.servers { 6485 server.Shutdown() 6486 restarted := c.restartServer(server) 6487 checkFor(t, time.Second, 200*time.Millisecond, func() error { 6488 hs := restarted.healthz(&HealthzOptions{ 6489 JSEnabled: true, 6490 JSServerOnly: true, 6491 }) 6492 if hs.Error != _EMPTY_ { 6493 return errors.New(hs.Error) 6494 } 6495 return nil 6496 }) 6497 } 6498 c.waitOnLeader() 6499 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 6500 } 6501 6502 for n := 0; n < iter; n++ { 6503 for i := 0; i < par; i++ { 6504 putKeys() 6505 } 6506 restartServers() 6507 } 6508 wg.Wait() 6509 6510 nc, js = jsClientConnect(t, c.randomServer()) 6511 defer nc.Close() 6512 6513 si, err := js.StreamInfo("KV_TEST") 6514 require_NoError(t, err) 6515 require_True(t, si.State.NumSubjects == uint64(nsubjs)) 6516 } 6517 6518 // Test for consumer create when the subject cardinality is high and the 6519 // consumer is filtered with a wildcard that forces linear scans. 6520 // We have an optimization to use in memory structures in filestore to speed up. 6521 // Only if asking to scan all (DeliverAll). 
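// In other words: consumer creation has to compute the initial NumPending, and with ~500k messages
// spread across ~100k subjects a naive linear scan would blow well past the 50ms threshold used
// below, while the in-memory per-subject state keeps the DeliverAll cases fast.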
6522 func TestNoRaceJetStreamConsumerCreateTimeNumPending(t *testing.T) { 6523 s := RunBasicJetStreamServer(t) 6524 defer s.Shutdown() 6525 6526 nc, js := jsClientConnect(t, s) 6527 defer nc.Close() 6528 6529 _, err := js.AddStream(&nats.StreamConfig{ 6530 Name: "TEST", 6531 Subjects: []string{"events.>"}, 6532 }) 6533 require_NoError(t, err) 6534 6535 n := 500_000 6536 msg := bytes.Repeat([]byte("X"), 8*1024) 6537 6538 for i := 0; i < n; i++ { 6539 subj := fmt.Sprintf("events.%d", rand.Intn(100_000)) 6540 js.PublishAsync(subj, msg) 6541 } 6542 select { 6543 case <-js.PublishAsyncComplete(): 6544 case <-time.After(5 * time.Second): 6545 } 6546 6547 // Should stay under 5ms now, but for Travis variability say 50ms. 6548 threshold := 50 * time.Millisecond 6549 6550 start := time.Now() 6551 _, err = js.PullSubscribe("events.*", "dlc") 6552 require_NoError(t, err) 6553 if elapsed := time.Since(start); elapsed > threshold { 6554 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6555 } 6556 6557 start = time.Now() 6558 _, err = js.PullSubscribe("events.99999", "xxx") 6559 require_NoError(t, err) 6560 if elapsed := time.Since(start); elapsed > threshold { 6561 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6562 } 6563 6564 start = time.Now() 6565 _, err = js.PullSubscribe(">", "zzz") 6566 require_NoError(t, err) 6567 if elapsed := time.Since(start); elapsed > threshold { 6568 t.Fatalf("Consumer create took longer than expected, %v vs %v", elapsed, threshold) 6569 } 6570 } 6571 6572 func TestNoRaceJetStreamClusterGhostConsumers(t *testing.T) { 6573 c := createJetStreamClusterExplicit(t, "GHOST", 3) 6574 defer c.shutdown() 6575 6576 nc, js := jsClientConnect(t, c.randomServer()) 6577 defer nc.Close() 6578 6579 _, err := js.AddStream(&nats.StreamConfig{ 6580 Name: "TEST", 6581 Subjects: []string{"events.>"}, 6582 Replicas: 3, 6583 }) 6584 require_NoError(t, err) 6585 6586 for i := 0; i < 10; i++ { 6587 for j := 0; j < 10; j++ { 6588 require_NoError(t, nc.Publish(fmt.Sprintf("events.%d.%d", i, j), []byte(`test`))) 6589 } 6590 } 6591 6592 fetch := func(id int) { 6593 subject := fmt.Sprintf("events.%d.*", id) 6594 subscription, err := js.PullSubscribe(subject, 6595 _EMPTY_, // ephemeral consumer 6596 nats.DeliverAll(), 6597 nats.ReplayInstant(), 6598 nats.BindStream("TEST"), 6599 nats.ConsumerReplicas(1), 6600 nats.ConsumerMemoryStorage(), 6601 ) 6602 if err != nil { 6603 return 6604 } 6605 defer subscription.Unsubscribe() 6606 6607 info, err := subscription.ConsumerInfo() 6608 if err != nil { 6609 return 6610 } 6611 6612 subscription.Fetch(int(info.NumPending)) 6613 } 6614 6615 replay := func(ctx context.Context, id int) { 6616 for { 6617 select { 6618 case <-ctx.Done(): 6619 return 6620 default: 6621 fetch(id) 6622 } 6623 } 6624 } 6625 6626 ctx, cancel := context.WithCancel(context.Background()) 6627 6628 go replay(ctx, 0) 6629 go replay(ctx, 1) 6630 go replay(ctx, 2) 6631 go replay(ctx, 3) 6632 go replay(ctx, 4) 6633 go replay(ctx, 5) 6634 go replay(ctx, 6) 6635 go replay(ctx, 7) 6636 go replay(ctx, 8) 6637 go replay(ctx, 9) 6638 6639 time.Sleep(5 * time.Second) 6640 6641 for _, server := range c.servers { 6642 server.Shutdown() 6643 restarted := c.restartServer(server) 6644 checkFor(t, time.Second, 200*time.Millisecond, func() error { 6645 hs := restarted.healthz(&HealthzOptions{ 6646 JSEnabled: true, 6647 JSServerOnly: true, 6648 }) 6649 if hs.Error != _EMPTY_ { 6650 return errors.New(hs.Error) 6651 } 6652 return nil 6653 }) 
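// After each restart, wait for the stream leader to settle and then spin up a few more replay
// goroutines so that ephemeral consumer churn continues across restarts; the point of the test is
// that no orphaned ("ghost") consumer assignments are left behind once everything quiesces.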
6654 c.waitOnStreamLeader(globalAccountName, "TEST") 6655 time.Sleep(time.Second * 2) 6656 go replay(ctx, 5) 6657 go replay(ctx, 6) 6658 go replay(ctx, 7) 6659 go replay(ctx, 8) 6660 go replay(ctx, 9) 6661 } 6662 6663 time.Sleep(5 * time.Second) 6664 cancel() 6665 6666 getMissing := func() []string { 6667 m, err := nc.Request("$JS.API.CONSUMER.LIST.TEST", nil, time.Second*10) 6668 require_NoError(t, err) 6669 6670 var resp JSApiConsumerListResponse 6671 err = json.Unmarshal(m.Data, &resp) 6672 require_NoError(t, err) 6673 return resp.Missing 6674 } 6675 6676 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 6677 missing := getMissing() 6678 if len(missing) == 0 { 6679 return nil 6680 } 6681 return fmt.Errorf("Still have missing: %+v", missing) 6682 }) 6683 } 6684 6685 // This is to test a publish slowdown and general instability experienced in a setup similar to this. 6686 // We have feeder streams that are all sourced to an aggregate stream. All streams are interest retention. 6687 // We want to monitor the avg publish time for the sync publishers to the feeder streams, the ingest rate to 6688 // the aggregate stream, and general health of the consumers on the aggregate stream. 6689 // Target publish rate is ~2k/s with publish time being ~40-60ms but remaining stable. 6690 // We can also simulate max redeliveries that create interior deletes in streams. 6691 func TestNoRaceJetStreamClusterF3Setup(t *testing.T) { 6692 // Uncomment to run. Needs to be on a pretty big machine. Do not want as part of Travis tests atm. 6693 skip(t) 6694 6695 // These and the settings below achieve ~60ms pub time on avg and ~2k msgs per sec inbound to the aggregate stream. 6696 // On my machine though. 6697 np := clusterProxy{ 6698 rtt: 2 * time.Millisecond, 6699 up: 1 * 1024 * 1024 * 1024, // 1gbit 6700 down: 1 * 1024 * 1024 * 1024, // 1gbit 6701 } 6702 6703 // Test params. 6704 numSourceStreams := 20 6705 numConsumersPerSource := 1 6706 numPullersPerConsumer := 50 6707 numPublishers := 100 6708 setHighStartSequence := false 6709 simulateMaxRedeliveries := false 6710 maxBadPubTimes := uint32(20) 6711 badPubThresh := 500 * time.Millisecond 6712 testTime := 5 * time.Minute // make sure to do --timeout=65m 6713 6714 t.Logf("Starting Test: Total Test Time %v", testTime) 6715 6716 c := createJetStreamClusterWithNetProxy(t, "R3S", 3, &np) 6717 defer c.shutdown() 6718 6719 // Do some quick sanity checking for latency stuff. 
6720 { 6721 nc, js := jsClientConnect(t, c.randomServer()) 6722 defer nc.Close() 6723 6724 _, err := js.AddStream(&nats.StreamConfig{ 6725 Name: "TEST", 6726 Replicas: 3, 6727 Subjects: []string{"foo"}, 6728 Retention: nats.InterestPolicy, 6729 }) 6730 require_NoError(t, err) 6731 defer js.DeleteStream("TEST") 6732 6733 sl := c.streamLeader(globalAccountName, "TEST") 6734 nc, js = jsClientConnect(t, sl) 6735 defer nc.Close() 6736 start := time.Now() 6737 _, err = js.Publish("foo", []byte("hello")) 6738 require_NoError(t, err) 6739 // This is best case, and with client connection being close to free, this should be at least > rtt 6740 if elapsed := time.Since(start); elapsed < np.rtt { 6741 t.Fatalf("Expected publish time to be > %v, got %v", np.rtt, elapsed) 6742 } 6743 6744 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 6745 nc, js = jsClientConnect(t, nl) 6746 defer nc.Close() 6747 start = time.Now() 6748 _, err = js.Publish("foo", []byte("hello")) 6749 require_NoError(t, err) 6750 // This is worst case, meaning message has to travel to leader, then to fastest replica, then back. 6751 // So should be at 3x rtt, so check at least > 2x rtt. 6752 if elapsed := time.Since(start); elapsed < 2*np.rtt { 6753 t.Fatalf("Expected publish time to be > %v, got %v", 2*np.rtt, elapsed) 6754 } 6755 } 6756 6757 // Setup source streams. 6758 nc, js := jsClientConnect(t, c.randomServer()) 6759 defer nc.Close() 6760 6761 t.Logf("Creating %d Source Streams", numSourceStreams) 6762 6763 var sources []string 6764 wg := sync.WaitGroup{} 6765 for i := 0; i < numSourceStreams; i++ { 6766 sname := fmt.Sprintf("EVENT-%s", nuid.Next()) 6767 sources = append(sources, sname) 6768 wg.Add(1) 6769 go func(stream string) { 6770 defer wg.Done() 6771 t.Logf(" %q", stream) 6772 subj := fmt.Sprintf("%s.>", stream) 6773 _, err := js.AddStream(&nats.StreamConfig{ 6774 Name: stream, 6775 Subjects: []string{subj}, 6776 Replicas: 3, 6777 Retention: nats.InterestPolicy, 6778 }) 6779 require_NoError(t, err) 6780 for j := 0; j < numConsumersPerSource; j++ { 6781 consumer := fmt.Sprintf("C%d", j) 6782 _, err := js.Subscribe(_EMPTY_, func(msg *nats.Msg) { 6783 msg.Ack() 6784 }, nats.BindStream(stream), nats.Durable(consumer), nats.ManualAck()) 6785 require_NoError(t, err) 6786 } 6787 }(sname) 6788 } 6789 wg.Wait() 6790 6791 var streamSources []*nats.StreamSource 6792 for _, src := range sources { 6793 streamSources = append(streamSources, &nats.StreamSource{Name: src}) 6794 6795 } 6796 6797 t.Log("Creating Aggregate Stream") 6798 6799 // Now create the aggregate stream. 6800 _, err := js.AddStream(&nats.StreamConfig{ 6801 Name: "EVENTS", 6802 Replicas: 3, 6803 Retention: nats.InterestPolicy, 6804 Sources: streamSources, 6805 }) 6806 require_NoError(t, err) 6807 6808 // Set first sequence to a high number. 6809 if setHighStartSequence { 6810 require_NoError(t, js.PurgeStream("EVENTS", &nats.StreamPurgeRequest{Sequence: 32_000_001})) 6811 } 6812 6813 // Now create 2 pull consumers. 6814 _, err = js.PullSubscribe(_EMPTY_, "C1", 6815 nats.BindStream("EVENTS"), 6816 nats.MaxDeliver(1), 6817 nats.AckWait(10*time.Second), 6818 nats.ManualAck(), 6819 ) 6820 require_NoError(t, err) 6821 6822 _, err = js.PullSubscribe(_EMPTY_, "C2", 6823 nats.BindStream("EVENTS"), 6824 nats.MaxDeliver(1), 6825 nats.AckWait(10*time.Second), 6826 nats.ManualAck(), 6827 ) 6828 require_NoError(t, err) 6829 6830 t.Logf("Creating %d x 2 Pull Subscribers", numPullersPerConsumer) 6831 6832 // Now create the pullers. 
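// Each puller binds to one of the two durables (C1/C2), fetches in batches of 25, shuffles the
// batch, sleeps a random interval up to 100ms, and then acks, optionally skipping a rare ack when
// simulating max redeliveries so the interest streams pick up interior deletes.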
6833 for _, subName := range []string{"C1", "C2"} { 6834 for i := 0; i < numPullersPerConsumer; i++ { 6835 go func(subName string) { 6836 nc, js := jsClientConnect(t, c.randomServer()) 6837 defer nc.Close() 6838 6839 sub, err := js.PullSubscribe(_EMPTY_, subName, 6840 nats.BindStream("EVENTS"), 6841 nats.MaxDeliver(1), 6842 nats.AckWait(10*time.Second), 6843 nats.ManualAck(), 6844 ) 6845 require_NoError(t, err) 6846 6847 for { 6848 msgs, err := sub.Fetch(25, nats.MaxWait(2*time.Second)) 6849 if err != nil && err != nats.ErrTimeout { 6850 t.Logf("Exiting pull subscriber %q: %v", subName, err) 6851 return 6852 } 6853 // Shuffle 6854 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 6855 6856 // Wait for a random interval up to 100ms. 6857 time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond) 6858 6859 for _, m := range msgs { 6860 // If we want to simulate max redeliveries being hit, since not acking 6861 // once will cause it due to subscriber setup. 6862 // 100_000 == 0.01% 6863 if simulateMaxRedeliveries && rand.Intn(100_000) == 0 { 6864 md, err := m.Metadata() 6865 require_NoError(t, err) 6866 t.Logf("** Skipping Ack: %d **", md.Sequence.Stream) 6867 } else { 6868 m.Ack() 6869 } 6870 } 6871 } 6872 }(subName) 6873 } 6874 } 6875 6876 // Now create feeder publishers. 6877 eventTypes := []string{"PAYMENT", "SUBMISSION", "CANCEL"} 6878 6879 msg := make([]byte, 2*1024) // 2k payload 6880 crand.Read(msg) 6881 6882 // For tracking pub times. 6883 var pubs int 6884 var totalPubTime time.Duration 6885 var pmu sync.Mutex 6886 last := time.Now() 6887 6888 updatePubStats := func(elapsed time.Duration) { 6889 pmu.Lock() 6890 defer pmu.Unlock() 6891 // Reset every 5s 6892 if time.Since(last) > 5*time.Second { 6893 pubs = 0 6894 totalPubTime = 0 6895 last = time.Now() 6896 } 6897 pubs++ 6898 totalPubTime += elapsed 6899 } 6900 avgPubTime := func() time.Duration { 6901 pmu.Lock() 6902 np := pubs 6903 tpt := totalPubTime 6904 pmu.Unlock() 6905 return tpt / time.Duration(np) 6906 } 6907 6908 t.Logf("Creating %d Publishers", numPublishers) 6909 6910 var numLimitsExceeded atomic.Uint32 6911 errCh := make(chan error, 100) 6912 6913 for i := 0; i < numPublishers; i++ { 6914 go func() { 6915 nc, js := jsClientConnect(t, c.randomServer()) 6916 defer nc.Close() 6917 6918 for { 6919 // Grab a random source stream 6920 stream := sources[rand.Intn(len(sources))] 6921 // Grab random event type. 6922 evt := eventTypes[rand.Intn(len(eventTypes))] 6923 subj := fmt.Sprintf("%s.%s", stream, evt) 6924 start := time.Now() 6925 _, err := js.Publish(subj, msg) 6926 if err != nil { 6927 t.Logf("Exiting publisher: %v", err) 6928 return 6929 } 6930 elapsed := time.Since(start) 6931 if elapsed > badPubThresh { 6932 t.Logf("Publish time took more than expected: %v", elapsed) 6933 numLimitsExceeded.Add(1) 6934 if ne := numLimitsExceeded.Load(); ne > maxBadPubTimes { 6935 errCh <- fmt.Errorf("Too many exceeded times on publish: %d", ne) 6936 return 6937 } 6938 } 6939 updatePubStats(elapsed) 6940 } 6941 }() 6942 } 6943 6944 t.Log("Creating Monitoring Routine - Data in ~10s") 6945 6946 // Create monitoring routine. 
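// The monitor below wakes every 5s to log delivered/ack-floor/pending counters for C1 and C2, the
// aggregate stream state, and the rolling publish rate. It also calls out the case where the
// stream's first sequence sits below the minimum consumer ack floor without moving, which for an
// interest-retention stream suggests acked messages are not being cleaned up.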
6947 go func() { 6948 nc, js := jsClientConnect(t, c.randomServer()) 6949 defer nc.Close() 6950 6951 fseq, lseq := uint64(0), uint64(0) 6952 for { 6953 // Grab consumers 6954 var minAckFloor uint64 = math.MaxUint64 6955 for _, consumer := range []string{"C1", "C2"} { 6956 ci, err := js.ConsumerInfo("EVENTS", consumer) 6957 if err != nil { 6958 t.Logf("Exiting Monitor: %v", err) 6959 return 6960 } 6961 if lseq > 0 { 6962 t.Logf("%s:\n Delivered:\t%d\n AckFloor:\t%d\n AckPending:\t%d\n NumPending:\t%d", 6963 consumer, ci.Delivered.Stream, ci.AckFloor.Stream, ci.NumAckPending, ci.NumPending) 6964 } 6965 if ci.AckFloor.Stream < minAckFloor { 6966 minAckFloor = ci.AckFloor.Stream 6967 } 6968 } 6969 // Now grab aggregate stream state. 6970 si, err := js.StreamInfo("EVENTS") 6971 if err != nil { 6972 t.Logf("Exiting Monitor: %v", err) 6973 return 6974 } 6975 state := si.State 6976 if lseq != 0 { 6977 t.Logf("Stream:\n Msgs: \t%d\n First:\t%d\n Last: \t%d\n Deletes:\t%d\n", 6978 state.Msgs, state.FirstSeq, state.LastSeq, state.NumDeleted) 6979 t.Logf("Publish Stats:\n Msgs/s:\t%0.2f\n Avg Pub:\t%v\n\n", float64(si.State.LastSeq-lseq)/5.0, avgPubTime()) 6980 if si.State.FirstSeq < minAckFloor && si.State.FirstSeq == fseq { 6981 t.Log("Stream first seq < minimum ack floor") 6982 } 6983 } 6984 fseq, lseq = si.State.FirstSeq, si.State.LastSeq 6985 time.Sleep(5 * time.Second) 6986 } 6987 6988 }() 6989 6990 select { 6991 case e := <-errCh: 6992 t.Fatal(e) 6993 case <-time.After(testTime): 6994 t.Fatalf("Did not receive completion signal") 6995 } 6996 } 6997 6998 // Unbalanced stretch cluster. 6999 // S2 (stream leader) will have a slow path to S1 (via proxy) and S3 (consumer leader) will have a fast path. 7000 // 7001 // Route Ports 7002 // "S1": 14622 7003 // "S2": 15622 7004 // "S3": 16622 7005 func createStretchUnbalancedCluster(t testing.TB) (c *cluster, np *netProxy) { 7006 t.Helper() 7007 7008 tmpl := ` 7009 listen: 127.0.0.1:-1 7010 server_name: %s 7011 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 7012 7013 cluster { 7014 name: "F3" 7015 listen: 127.0.0.1:%d 7016 routes = [%s] 7017 } 7018 7019 accounts { 7020 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 7021 } 7022 ` 7023 // Do these in order, S1, S2 (proxy) then S3. 7024 c = &cluster{t: t, servers: make([]*Server, 3), opts: make([]*Options, 3), name: "F3"} 7025 7026 // S1 7027 conf := fmt.Sprintf(tmpl, "S1", t.TempDir(), 14622, "route://127.0.0.1:15622, route://127.0.0.1:16622") 7028 c.servers[0], c.opts[0] = RunServerWithConfig(createConfFile(t, []byte(conf))) 7029 7030 // S2 7031 // Create the proxy first. Connect this to S1. Make it slow, e.g. 5ms RTT. 
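// (Note: as configured below the proxy actually injects a 1ms delay with ~1gbit up/down; raise the
// first argument to createNetProxy for a slower path.)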
7032 np = createNetProxy(1*time.Millisecond, 1024*1024*1024, 1024*1024*1024, "route://127.0.0.1:14622", true)
7033 routes := fmt.Sprintf("%s, route://127.0.0.1:16622", np.routeURL())
7034 conf = fmt.Sprintf(tmpl, "S2", t.TempDir(), 15622, routes)
7035 c.servers[1], c.opts[1] = RunServerWithConfig(createConfFile(t, []byte(conf)))
7036 
7037 // S3
7038 conf = fmt.Sprintf(tmpl, "S3", t.TempDir(), 16622, "route://127.0.0.1:14622, route://127.0.0.1:15622")
7039 c.servers[2], c.opts[2] = RunServerWithConfig(createConfFile(t, []byte(conf)))
7040 
7041 c.checkClusterFormed()
7042 c.waitOnClusterReady()
7043 
7044 return c, np
7045 }
7046 
7047 // We test an interest-based stream in a cluster that has a node with asymmetric paths to
7048 // the stream leader and the consumer leader, such that the consumer leader path is fast and
7049 // replicated acks arrive sooner than the actual message. This path was considered, but was
7050 // categorized as very rare and was expensive since it tried to forward a new stream msg delete
7051 // proposal to the original stream leader. It now deals with the issue locally and does not
7052 // slow down the ingest rate to the stream's publishers.
7053 func TestNoRaceJetStreamClusterDifferentRTTInterestBasedStreamSetup(t *testing.T) {
7054 // Uncomment to run. Do not want as part of Travis tests atm.
7055 skip(t)
7056 
7057 c, np := createStretchUnbalancedCluster(t)
7058 defer c.shutdown()
7059 defer np.stop()
7060 
7061 nc, js := jsClientConnect(t, c.randomServer())
7062 defer nc.Close()
7063 
7064 // Now create the stream.
7065 _, err := js.AddStream(&nats.StreamConfig{
7066 Name: "EVENTS",
7067 Subjects: []string{"EV.>"},
7068 Replicas: 3,
7069 Retention: nats.InterestPolicy,
7070 })
7071 require_NoError(t, err)
7072 
7073 // Make sure its leader is on S2.
7074 sl := c.servers[1]
7075 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7076 c.waitOnStreamLeader(globalAccountName, "EVENTS")
7077 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl {
7078 s.JetStreamStepdownStream(globalAccountName, "EVENTS")
7079 return fmt.Errorf("Server %s is not stream leader yet", sl)
7080 }
7081 return nil
7082 })
7083 
7084 // Now create the consumer.
7085 _, err = js.PullSubscribe(_EMPTY_, "C", nats.BindStream("EVENTS"), nats.ManualAck())
7086 require_NoError(t, err)
7087 
7088 // Make sure the consumer leader is on S3.
7089 cl := c.servers[2]
7090 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error {
7091 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C")
7092 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl {
7093 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C")
7094 return fmt.Errorf("Server %s is not consumer leader yet", cl)
7095 }
7096 return nil
7097 })
7098 
7099 go func(js nats.JetStream) {
7100 sub, err := js.PullSubscribe(_EMPTY_, "C", nats.BindStream("EVENTS"), nats.ManualAck())
7101 require_NoError(t, err)
7102 
7103 for {
7104 msgs, err := sub.Fetch(100, nats.MaxWait(2*time.Second))
7105 if err != nil && err != nats.ErrTimeout {
7106 return
7107 }
7108 // Shuffle
7109 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] })
7110 for _, m := range msgs {
7111 m.Ack()
7112 }
7113 }
7114 }(js)
7115 
7116 numPublishers := 25
7117 pubThresh := 2 * time.Second
7118 var maxExceeded atomic.Int64
7119 errCh := make(chan error, numPublishers)
7120 wg := sync.WaitGroup{}
7121 
7122 msg := make([]byte, 2*1024) // 2k payload
7123 crand.Read(msg)
7124 
7125 // Publishers.
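// 25 publishers, each connected to a randomly chosen server, do 1,000 synchronous publishes apiece.
// Any single publish taking longer than pubThresh (2s) fails the test, and the worst publish time
// observed is reported along with the failure.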
7126 for i := 0; i < numPublishers; i++ { 7127 wg.Add(1) 7128 go func(iter int) { 7129 defer wg.Done() 7130 7131 // Connect to random, the slow ones will be connected to the slow node. 7132 // But if you connect them all there it will pass. 7133 s := c.randomServer() 7134 nc, js := jsClientConnect(t, s) 7135 defer nc.Close() 7136 7137 for i := 0; i < 1_000; i++ { 7138 start := time.Now() 7139 _, err := js.Publish("EV.PAID", msg) 7140 if err != nil { 7141 errCh <- fmt.Errorf("Publish error: %v", err) 7142 return 7143 } 7144 if elapsed := time.Since(start); elapsed > pubThresh { 7145 errCh <- fmt.Errorf("Publish time exceeded") 7146 if int64(elapsed) > maxExceeded.Load() { 7147 maxExceeded.Store(int64(elapsed)) 7148 } 7149 return 7150 } 7151 } 7152 }(i) 7153 } 7154 7155 wg.Wait() 7156 7157 select { 7158 case e := <-errCh: 7159 t.Fatalf("%v: threshold is %v, maximum seen: %v", e, pubThresh, time.Duration(maxExceeded.Load())) 7160 default: 7161 } 7162 } 7163 7164 func TestNoRaceJetStreamInterestStreamCheckInterestRaceBug(t *testing.T) { 7165 c := createJetStreamClusterExplicit(t, "R3S", 3) 7166 defer c.shutdown() 7167 7168 nc, js := jsClientConnect(t, c.randomServer()) 7169 defer nc.Close() 7170 7171 _, err := js.AddStream(&nats.StreamConfig{ 7172 Name: "TEST", 7173 Subjects: []string{"foo"}, 7174 Replicas: 3, 7175 Retention: nats.InterestPolicy, 7176 }) 7177 require_NoError(t, err) 7178 7179 numConsumers := 10 7180 for i := 0; i < numConsumers; i++ { 7181 nc, js := jsClientConnect(t, c.randomServer()) 7182 defer nc.Close() 7183 7184 _, err = js.Subscribe("foo", func(m *nats.Msg) { 7185 m.Ack() 7186 }, nats.Durable(fmt.Sprintf("C%d", i)), nats.ManualAck()) 7187 require_NoError(t, err) 7188 } 7189 7190 numToSend := 10_000 7191 for i := 0; i < numToSend; i++ { 7192 _, err := js.PublishAsync("foo", nil, nats.StallWait(800*time.Millisecond)) 7193 require_NoError(t, err) 7194 } 7195 select { 7196 case <-js.PublishAsyncComplete(): 7197 case <-time.After(20 * time.Second): 7198 t.Fatalf("Did not receive completion signal") 7199 } 7200 7201 // Wait til ackfloor is correct for all consumers. 7202 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 7203 for _, s := range c.servers { 7204 mset, err := s.GlobalAccount().lookupStream("TEST") 7205 require_NoError(t, err) 7206 7207 mset.mu.RLock() 7208 defer mset.mu.RUnlock() 7209 7210 require_True(t, len(mset.consumers) == numConsumers) 7211 7212 for _, o := range mset.consumers { 7213 state, err := o.store.State() 7214 require_NoError(t, err) 7215 if state.AckFloor.Stream != uint64(numToSend) { 7216 return fmt.Errorf("Ackfloor not correct yet") 7217 } 7218 } 7219 } 7220 return nil 7221 }) 7222 7223 for _, s := range c.servers { 7224 mset, err := s.GlobalAccount().lookupStream("TEST") 7225 require_NoError(t, err) 7226 7227 mset.mu.RLock() 7228 defer mset.mu.RUnlock() 7229 7230 state := mset.state() 7231 require_True(t, state.Msgs == 0) 7232 require_True(t, state.FirstSeq == uint64(numToSend+1)) 7233 } 7234 } 7235 7236 func TestNoRaceJetStreamClusterInterestStreamConsistencyAfterRollingRestart(t *testing.T) { 7237 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
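// With the sleeps and the rolling restart this takes several minutes end to end, so an invocation
// along these lines (illustrative only) is needed to stay under the test timeout:
//
//	go test -run=TestNoRaceJetStreamClusterInterestStreamConsistencyAfterRollingRestart -v -timeout=30m ./server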
7238 skip(t) 7239 7240 c := createJetStreamClusterExplicit(t, "R3S", 3) 7241 defer c.shutdown() 7242 7243 numStreams := 200 7244 numConsumersPer := 5 7245 numPublishers := 10 7246 7247 nc, js := jsClientConnect(t, c.randomServer()) 7248 defer nc.Close() 7249 7250 qch := make(chan bool) 7251 7252 var mm sync.Mutex 7253 ackMap := make(map[string]map[uint64][]string) 7254 7255 addAckTracking := func(seq uint64, stream, consumer string) { 7256 mm.Lock() 7257 defer mm.Unlock() 7258 sam := ackMap[stream] 7259 if sam == nil { 7260 sam = make(map[uint64][]string) 7261 ackMap[stream] = sam 7262 } 7263 sam[seq] = append(sam[seq], consumer) 7264 } 7265 7266 doPullSubscriber := func(stream, consumer, filter string) { 7267 nc, js := jsClientConnect(t, c.randomServer()) 7268 defer nc.Close() 7269 7270 var err error 7271 var sub *nats.Subscription 7272 timeout := time.Now().Add(5 * time.Second) 7273 for time.Now().Before(timeout) { 7274 sub, err = js.PullSubscribe(filter, consumer, nats.BindStream(stream), nats.ManualAck()) 7275 if err == nil { 7276 break 7277 } 7278 } 7279 if err != nil { 7280 t.Logf("Error on pull subscriber: %v", err) 7281 return 7282 } 7283 7284 for { 7285 select { 7286 case <-time.After(500 * time.Millisecond): 7287 msgs, err := sub.Fetch(100, nats.MaxWait(time.Second)) 7288 if err != nil { 7289 continue 7290 } 7291 // Shuffle 7292 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 7293 for _, m := range msgs { 7294 meta, err := m.Metadata() 7295 require_NoError(t, err) 7296 m.Ack() 7297 addAckTracking(meta.Sequence.Stream, stream, consumer) 7298 if meta.NumDelivered > 1 { 7299 t.Logf("Got a msg redelivered %d for sequence %d on %q %q\n", meta.NumDelivered, meta.Sequence.Stream, stream, consumer) 7300 } 7301 } 7302 case <-qch: 7303 nc.Flush() 7304 return 7305 } 7306 } 7307 } 7308 7309 // Setup 7310 wg := sync.WaitGroup{} 7311 for i := 0; i < numStreams; i++ { 7312 wg.Add(1) 7313 go func(stream string) { 7314 defer wg.Done() 7315 subj := fmt.Sprintf("%s.>", stream) 7316 _, err := js.AddStream(&nats.StreamConfig{ 7317 Name: stream, 7318 Subjects: []string{subj}, 7319 Replicas: 3, 7320 Retention: nats.InterestPolicy, 7321 }) 7322 require_NoError(t, err) 7323 for i := 0; i < numConsumersPer; i++ { 7324 consumer := fmt.Sprintf("C%d", i) 7325 filter := fmt.Sprintf("%s.%d", stream, i) 7326 _, err = js.AddConsumer(stream, &nats.ConsumerConfig{ 7327 Durable: consumer, 7328 FilterSubject: filter, 7329 AckPolicy: nats.AckExplicitPolicy, 7330 AckWait: 2 * time.Second, 7331 }) 7332 require_NoError(t, err) 7333 c.waitOnConsumerLeader(globalAccountName, stream, consumer) 7334 go doPullSubscriber(stream, consumer, filter) 7335 } 7336 }(fmt.Sprintf("A-%d", i)) 7337 } 7338 wg.Wait() 7339 7340 msg := make([]byte, 2*1024) // 2k payload 7341 crand.Read(msg) 7342 7343 // Controls if publishing is on or off. 7344 var pubActive atomic.Bool 7345 7346 doPublish := func() { 7347 nc, js := jsClientConnect(t, c.randomServer()) 7348 defer nc.Close() 7349 7350 for { 7351 select { 7352 case <-time.After(100 * time.Millisecond): 7353 if pubActive.Load() { 7354 for i := 0; i < numStreams; i++ { 7355 for j := 0; j < numConsumersPer; j++ { 7356 subj := fmt.Sprintf("A-%d.%d", i, j) 7357 // Don't care about errors here for this test. 7358 js.Publish(subj, msg) 7359 } 7360 } 7361 } 7362 case <-qch: 7363 return 7364 } 7365 } 7366 } 7367 7368 pubActive.Store(true) 7369 7370 for i := 0; i < numPublishers; i++ { 7371 go doPublish() 7372 } 7373 7374 // Let run for a bit. 
7375 time.Sleep(20 * time.Second) 7376 7377 // Do a rolling restart. 7378 for _, s := range c.servers { 7379 t.Logf("Shutdown %v\n", s) 7380 s.Shutdown() 7381 s.WaitForShutdown() 7382 time.Sleep(20 * time.Second) 7383 t.Logf("Restarting %v\n", s) 7384 s = c.restartServer(s) 7385 c.waitOnServerHealthz(s) 7386 } 7387 7388 // Let run for a bit longer. 7389 time.Sleep(10 * time.Second) 7390 7391 // Stop pubs. 7392 pubActive.Store(false) 7393 7394 // Let settle. 7395 time.Sleep(10 * time.Second) 7396 close(qch) 7397 time.Sleep(20 * time.Second) 7398 7399 nc, js = jsClientConnect(t, c.randomServer()) 7400 defer nc.Close() 7401 7402 minAckFloor := func(stream string) (uint64, string) { 7403 var maf uint64 = math.MaxUint64 7404 var consumer string 7405 for i := 0; i < numConsumersPer; i++ { 7406 cname := fmt.Sprintf("C%d", i) 7407 ci, err := js.ConsumerInfo(stream, cname) 7408 require_NoError(t, err) 7409 if ci.AckFloor.Stream < maf { 7410 maf = ci.AckFloor.Stream 7411 consumer = cname 7412 } 7413 } 7414 return maf, consumer 7415 } 7416 7417 checkStreamAcks := func(stream string) { 7418 mm.Lock() 7419 defer mm.Unlock() 7420 if sam := ackMap[stream]; sam != nil { 7421 for seq := 1; ; seq++ { 7422 acks := sam[uint64(seq)] 7423 if acks == nil { 7424 if sam[uint64(seq+1)] != nil { 7425 t.Logf("Missing an ack on stream %q for sequence %d\n", stream, seq) 7426 } else { 7427 break 7428 } 7429 } 7430 if len(acks) > 1 { 7431 t.Logf("Multiple acks for %d which is not expected: %+v", seq, acks) 7432 } 7433 } 7434 } 7435 } 7436 7437 // Now check all streams such that their first sequence is equal to the minimum of all consumers. 7438 for i := 0; i < numStreams; i++ { 7439 stream := fmt.Sprintf("A-%d", i) 7440 si, err := js.StreamInfo(stream) 7441 require_NoError(t, err) 7442 7443 if maf, consumer := minAckFloor(stream); maf > si.State.FirstSeq { 7444 t.Logf("\nBAD STATE DETECTED FOR %q, CHECKING OTHER SERVERS! ACK %d vs %+v LEADER %v, CL FOR %q %v\n", 7445 stream, maf, si.State, c.streamLeader(globalAccountName, stream), consumer, c.consumerLeader(globalAccountName, stream, consumer)) 7446 7447 t.Logf("TEST ACKS %+v\n", ackMap) 7448 7449 checkStreamAcks(stream) 7450 7451 for _, s := range c.servers { 7452 mset, err := s.GlobalAccount().lookupStream(stream) 7453 require_NoError(t, err) 7454 state := mset.state() 7455 t.Logf("Server %v Stream STATE %+v\n", s, state) 7456 7457 var smv StoreMsg 7458 if sm, err := mset.store.LoadMsg(state.FirstSeq, &smv); err == nil { 7459 t.Logf("Subject for msg %d is %q", state.FirstSeq, sm.subj) 7460 } else { 7461 t.Logf("Could not retrieve msg for %d: %v", state.FirstSeq, err) 7462 } 7463 7464 if len(mset.preAcks) > 0 { 7465 t.Logf("%v preAcks %+v\n", s, mset.preAcks) 7466 } 7467 7468 for _, o := range mset.consumers { 7469 ostate, err := o.store.State() 7470 require_NoError(t, err) 7471 t.Logf("Consumer STATE for %q is %+v\n", o.name, ostate) 7472 } 7473 } 7474 t.Fatalf("BAD STATE: ACKFLOOR > FIRST %d vs %d\n", maf, si.State.FirstSeq) 7475 } 7476 } 7477 } 7478 7479 func TestNoRaceFileStoreNumPending(t *testing.T) { 7480 // No need for all permutations here. 7481 storeDir := t.TempDir() 7482 fcfg := FileStoreConfig{ 7483 StoreDir: storeDir, 7484 BlockSize: 2 * 1024, // Create many blocks on purpose. 
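// With ~50k small messages and 2KB blocks the data ends up spread over a large number of message
// blocks, which is what makes the per-block paths in NumPending/FilteredState below worth exercising.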
7485 } 7486 fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"*.*.*.*"}, Storage: FileStorage}) 7487 require_NoError(t, err) 7488 defer fs.Stop() 7489 7490 tokens := []string{"foo", "bar", "baz"} 7491 genSubj := func() string { 7492 return fmt.Sprintf("%s.%s.%s.%s", 7493 tokens[rand.Intn(len(tokens))], 7494 tokens[rand.Intn(len(tokens))], 7495 tokens[rand.Intn(len(tokens))], 7496 tokens[rand.Intn(len(tokens))], 7497 ) 7498 } 7499 7500 for i := 0; i < 50_000; i++ { 7501 subj := genSubj() 7502 _, _, err := fs.StoreMsg(subj, nil, []byte("Hello World")) 7503 require_NoError(t, err) 7504 } 7505 7506 state := fs.State() 7507 7508 // Scan one by one for sanity check against other calculations. 7509 sanityCheck := func(sseq uint64, filter string) SimpleState { 7510 t.Helper() 7511 var ss SimpleState 7512 var smv StoreMsg 7513 // For here we know 0 is invalid, set to 1. 7514 if sseq == 0 { 7515 sseq = 1 7516 } 7517 for seq := sseq; seq <= state.LastSeq; seq++ { 7518 sm, err := fs.LoadMsg(seq, &smv) 7519 if err != nil { 7520 t.Logf("Encountered error %v loading sequence: %d", err, seq) 7521 continue 7522 } 7523 if subjectIsSubsetMatch(sm.subj, filter) { 7524 ss.Msgs++ 7525 ss.Last = seq 7526 if ss.First == 0 || seq < ss.First { 7527 ss.First = seq 7528 } 7529 } 7530 } 7531 return ss 7532 } 7533 7534 check := func(sseq uint64, filter string) { 7535 t.Helper() 7536 np, lvs := fs.NumPending(sseq, filter, false) 7537 ss := fs.FilteredState(sseq, filter) 7538 sss := sanityCheck(sseq, filter) 7539 if lvs != state.LastSeq { 7540 t.Fatalf("Expected NumPending to return valid through last of %d but got %d", state.LastSeq, lvs) 7541 } 7542 if ss.Msgs != np { 7543 t.Fatalf("NumPending of %d did not match ss.Msgs of %d", np, ss.Msgs) 7544 } 7545 if ss != sss { 7546 t.Fatalf("Failed sanity check, expected %+v got %+v", sss, ss) 7547 } 7548 } 7549 7550 sanityCheckLastOnly := func(sseq uint64, filter string) SimpleState { 7551 t.Helper() 7552 var ss SimpleState 7553 var smv StoreMsg 7554 // For here we know 0 is invalid, set to 1. 
7555 if sseq == 0 { 7556 sseq = 1 7557 } 7558 seen := make(map[string]bool) 7559 for seq := state.LastSeq; seq >= sseq; seq-- { 7560 sm, err := fs.LoadMsg(seq, &smv) 7561 if err != nil { 7562 t.Logf("Encountered error %v loading sequence: %d", err, seq) 7563 continue 7564 } 7565 if !seen[sm.subj] && subjectIsSubsetMatch(sm.subj, filter) { 7566 ss.Msgs++ 7567 if ss.Last == 0 { 7568 ss.Last = seq 7569 } 7570 if ss.First == 0 || seq < ss.First { 7571 ss.First = seq 7572 } 7573 seen[sm.subj] = true 7574 } 7575 } 7576 return ss 7577 } 7578 7579 checkLastOnly := func(sseq uint64, filter string) { 7580 t.Helper() 7581 np, lvs := fs.NumPending(sseq, filter, true) 7582 ss := sanityCheckLastOnly(sseq, filter) 7583 if lvs != state.LastSeq { 7584 t.Fatalf("Expected NumPending to return valid through last of %d but got %d", state.LastSeq, lvs) 7585 } 7586 if ss.Msgs != np { 7587 t.Fatalf("NumPending of %d did not match ss.Msgs of %d", np, ss.Msgs) 7588 } 7589 } 7590 7591 startSeqs := []uint64{0, 1, 2, 200, 444, 555, 2222, 8888, 12_345, 28_222, 33_456, 44_400, 49_999} 7592 checkSubs := []string{"foo.>", "*.bar.>", "foo.bar.*.baz", "*.bar.>", "*.foo.bar.*", "foo.foo.bar.baz"} 7593 7594 for _, filter := range checkSubs { 7595 for _, start := range startSeqs { 7596 check(start, filter) 7597 checkLastOnly(start, filter) 7598 } 7599 } 7600 } 7601 7602 func TestNoRaceJetStreamClusterUnbalancedInterestMultipleConsumers(t *testing.T) { 7603 c, np := createStretchUnbalancedCluster(t) 7604 defer c.shutdown() 7605 defer np.stop() 7606 7607 nc, js := jsClientConnect(t, c.randomServer()) 7608 defer nc.Close() 7609 7610 // Now create the stream. 7611 _, err := js.AddStream(&nats.StreamConfig{ 7612 Name: "EVENTS", 7613 Subjects: []string{"EV.>"}, 7614 Replicas: 3, 7615 Retention: nats.InterestPolicy, 7616 }) 7617 require_NoError(t, err) 7618 7619 // Make sure it's leader is on S2. 7620 sl := c.servers[1] 7621 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7622 c.waitOnStreamLeader(globalAccountName, "EVENTS") 7623 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl { 7624 s.JetStreamStepdownStream(globalAccountName, "EVENTS") 7625 return fmt.Errorf("Server %s is not stream leader yet", sl) 7626 } 7627 return nil 7628 }) 7629 7630 // Create a fast ack consumer. 7631 _, err = js.Subscribe("EV.NEW", func(m *nats.Msg) { 7632 m.Ack() 7633 }, nats.Durable("C"), nats.ManualAck()) 7634 require_NoError(t, err) 7635 7636 // Make sure the consumer leader is on S3. 7637 cl := c.servers[2] 7638 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7639 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 7640 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 7641 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 7642 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7643 } 7644 return nil 7645 }) 7646 7647 // Connect a client directly to the stream leader. 7648 nc, js = jsClientConnect(t, sl) 7649 defer nc.Close() 7650 7651 // Now create a pull subscriber. 7652 sub, err := js.PullSubscribe("EV.NEW", "D", nats.ManualAck()) 7653 require_NoError(t, err) 7654 7655 // Make sure this consumer leader is on S1. 
7656 cl = c.servers[0] 7657 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7658 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "D") 7659 if s := c.consumerLeader(globalAccountName, "EVENTS", "D"); s != cl { 7660 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "D") 7661 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7662 } 7663 return nil 7664 }) 7665 7666 numToSend := 1000 7667 for i := 0; i < numToSend; i++ { 7668 _, err := js.PublishAsync("EV.NEW", nil) 7669 require_NoError(t, err) 7670 } 7671 select { 7672 case <-js.PublishAsyncComplete(): 7673 case <-time.After(20 * time.Second): 7674 t.Fatalf("Did not receive completion signal") 7675 } 7676 7677 // Now make sure we can pull messages since we have not acked. 7678 // The bug is that the acks arrive on S1 faster then the messages but we want to 7679 // make sure we do not remove prematurely. 7680 msgs, err := sub.Fetch(100, nats.MaxWait(time.Second)) 7681 require_NoError(t, err) 7682 require_True(t, len(msgs) == 100) 7683 for _, m := range msgs { 7684 m.AckSync() 7685 } 7686 7687 ci, err := js.ConsumerInfo("EVENTS", "D") 7688 require_NoError(t, err) 7689 require_True(t, ci.NumPending == uint64(numToSend-100)) 7690 require_True(t, ci.NumAckPending == 0) 7691 require_True(t, ci.Delivered.Stream == 100) 7692 require_True(t, ci.AckFloor.Stream == 100) 7693 7694 // Check stream state on all servers. 7695 for _, s := range c.servers { 7696 mset, err := s.GlobalAccount().lookupStream("EVENTS") 7697 require_NoError(t, err) 7698 state := mset.state() 7699 require_True(t, state.Msgs == 900) 7700 require_True(t, state.FirstSeq == 101) 7701 require_True(t, state.LastSeq == 1000) 7702 require_True(t, state.Consumers == 2) 7703 } 7704 7705 msgs, err = sub.Fetch(900, nats.MaxWait(time.Second)) 7706 require_NoError(t, err) 7707 require_True(t, len(msgs) == 900) 7708 for _, m := range msgs { 7709 m.AckSync() 7710 } 7711 7712 // Let acks propagate. 7713 time.Sleep(250 * time.Millisecond) 7714 7715 // Check final stream state on all servers. 7716 for _, s := range c.servers { 7717 mset, err := s.GlobalAccount().lookupStream("EVENTS") 7718 require_NoError(t, err) 7719 state := mset.state() 7720 require_True(t, state.Msgs == 0) 7721 require_True(t, state.FirstSeq == 1001) 7722 require_True(t, state.LastSeq == 1000) 7723 require_True(t, state.Consumers == 2) 7724 // Now check preAcks 7725 mset.mu.RLock() 7726 numPreAcks := len(mset.preAcks) 7727 mset.mu.RUnlock() 7728 require_True(t, numPreAcks == 0) 7729 } 7730 } 7731 7732 func TestNoRaceJetStreamClusterUnbalancedInterestMultipleFilteredConsumers(t *testing.T) { 7733 c, np := createStretchUnbalancedCluster(t) 7734 defer c.shutdown() 7735 defer np.stop() 7736 7737 nc, js := jsClientConnect(t, c.randomServer()) 7738 defer nc.Close() 7739 7740 // Now create the stream. 7741 _, err := js.AddStream(&nats.StreamConfig{ 7742 Name: "EVENTS", 7743 Subjects: []string{"EV.>"}, 7744 Replicas: 3, 7745 Retention: nats.InterestPolicy, 7746 }) 7747 require_NoError(t, err) 7748 7749 // Make sure it's leader is on S2. 7750 sl := c.servers[1] 7751 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7752 c.waitOnStreamLeader(globalAccountName, "EVENTS") 7753 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl { 7754 s.JetStreamStepdownStream(globalAccountName, "EVENTS") 7755 return fmt.Errorf("Server %s is not stream leader yet", sl) 7756 } 7757 return nil 7758 }) 7759 7760 // Create a fast ack consumer. 
7761 _, err = js.Subscribe("EV.NEW", func(m *nats.Msg) { 7762 m.Ack() 7763 }, nats.Durable("C"), nats.ManualAck()) 7764 require_NoError(t, err) 7765 7766 // Make sure the consumer leader is on S3. 7767 cl := c.servers[2] 7768 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7769 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 7770 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 7771 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 7772 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7773 } 7774 return nil 7775 }) 7776 7777 // Connect a client directly to the stream leader. 7778 nc, js = jsClientConnect(t, sl) 7779 defer nc.Close() 7780 7781 // Now create another fast ack consumer. 7782 _, err = js.Subscribe("EV.UPDATED", func(m *nats.Msg) { 7783 m.Ack() 7784 }, nats.Durable("D"), nats.ManualAck()) 7785 require_NoError(t, err) 7786 7787 // Make sure this consumer leader is on S1. 7788 cl = c.servers[0] 7789 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 7790 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "D") 7791 if s := c.consumerLeader(globalAccountName, "EVENTS", "D"); s != cl { 7792 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "D") 7793 return fmt.Errorf("Server %s is not consumer leader yet", cl) 7794 } 7795 return nil 7796 }) 7797 7798 numToSend := 500 7799 for i := 0; i < numToSend; i++ { 7800 _, err := js.PublishAsync("EV.NEW", nil) 7801 require_NoError(t, err) 7802 _, err = js.PublishAsync("EV.UPDATED", nil) 7803 require_NoError(t, err) 7804 } 7805 select { 7806 case <-js.PublishAsyncComplete(): 7807 case <-time.After(20 * time.Second): 7808 t.Fatalf("Did not receive completion signal") 7809 } 7810 7811 // Let acks propagate. 7812 time.Sleep(250 * time.Millisecond) 7813 7814 ci, err := js.ConsumerInfo("EVENTS", "D") 7815 require_NoError(t, err) 7816 require_True(t, ci.NumPending == 0) 7817 require_True(t, ci.NumAckPending == 0) 7818 require_True(t, ci.Delivered.Consumer == 500) 7819 require_True(t, ci.Delivered.Stream == 1000) 7820 require_True(t, ci.AckFloor.Consumer == 500) 7821 require_True(t, ci.AckFloor.Stream == 1000) 7822 7823 // Check final stream state on all servers. 7824 for _, s := range c.servers { 7825 mset, err := s.GlobalAccount().lookupStream("EVENTS") 7826 require_NoError(t, err) 7827 state := mset.state() 7828 require_True(t, state.Msgs == 0) 7829 require_True(t, state.FirstSeq == 1001) 7830 require_True(t, state.LastSeq == 1000) 7831 require_True(t, state.Consumers == 2) 7832 // Now check preAcks 7833 mset.mu.RLock() 7834 numPreAcks := len(mset.preAcks) 7835 mset.mu.RUnlock() 7836 require_True(t, numPreAcks == 0) 7837 } 7838 } 7839 7840 func TestNoRaceParallelStreamAndConsumerCreation(t *testing.T) { 7841 s := RunBasicJetStreamServer(t) 7842 defer s.Shutdown() 7843 7844 // stream config. 7845 scfg := &StreamConfig{ 7846 Name: "TEST", 7847 Subjects: []string{"foo", "bar"}, 7848 MaxMsgs: 10, 7849 Storage: FileStorage, 7850 Replicas: 1, 7851 } 7852 7853 // Will do these direct against the low level API to really make 7854 // sure parallel creation ok. 7855 np := 1000 7856 startCh := make(chan bool) 7857 errCh := make(chan error, np) 7858 wg := sync.WaitGroup{} 7859 wg.Add(np) 7860 7861 var streams sync.Map 7862 7863 for i := 0; i < np; i++ { 7864 go func() { 7865 defer wg.Done() 7866 7867 // Make them all fire at once. 
7868 <-startCh 7869 7870 if mset, err := s.GlobalAccount().addStream(scfg); err != nil { 7871 t.Logf("Stream create got an error: %v", err) 7872 errCh <- err 7873 } else { 7874 streams.Store(mset, true) 7875 } 7876 }() 7877 } 7878 time.Sleep(100 * time.Millisecond) 7879 close(startCh) 7880 wg.Wait() 7881 7882 // Check for no errors. 7883 if len(errCh) > 0 { 7884 t.Fatalf("Expected no errors, got %d", len(errCh)) 7885 } 7886 7887 // Now make sure we really only created one stream. 7888 var numStreams int 7889 streams.Range(func(k, v any) bool { 7890 numStreams++ 7891 return true 7892 }) 7893 if numStreams > 1 { 7894 t.Fatalf("Expected only one stream to be really created, got %d out of %d attempts", numStreams, np) 7895 } 7896 7897 // Also make sure we cleanup the inflight entries for streams. 7898 gacc := s.GlobalAccount() 7899 _, jsa, err := gacc.checkForJetStream() 7900 require_NoError(t, err) 7901 var numEntries int 7902 jsa.inflight.Range(func(k, v any) bool { 7903 numEntries++ 7904 return true 7905 }) 7906 if numEntries > 0 { 7907 t.Fatalf("Expected no inflight entries to be left over, got %d", numEntries) 7908 } 7909 7910 // Now do consumers. 7911 mset, err := gacc.lookupStream("TEST") 7912 require_NoError(t, err) 7913 7914 cfg := &ConsumerConfig{ 7915 DeliverSubject: "to", 7916 Name: "DLC", 7917 AckPolicy: AckExplicit, 7918 } 7919 7920 startCh = make(chan bool) 7921 errCh = make(chan error, np) 7922 wg.Add(np) 7923 7924 var consumers sync.Map 7925 7926 for i := 0; i < np; i++ { 7927 go func() { 7928 defer wg.Done() 7929 7930 // Make them all fire at once. 7931 <-startCh 7932 7933 if _, err = mset.addConsumer(cfg); err != nil { 7934 t.Logf("Consumer create got an error: %v", err) 7935 errCh <- err 7936 } else { 7937 consumers.Store(mset, true) 7938 } 7939 }() 7940 } 7941 time.Sleep(100 * time.Millisecond) 7942 close(startCh) 7943 wg.Wait() 7944 7945 // Check for no errors. 7946 if len(errCh) > 0 { 7947 t.Fatalf("Expected no errors, got %d", len(errCh)) 7948 } 7949 7950 // Now make sure we really only created one stream. 
7951 var numConsumers int 7952 consumers.Range(func(k, v any) bool { 7953 numConsumers++ 7954 return true 7955 }) 7956 if numConsumers > 1 { 7957 t.Fatalf("Expected only one consumer to be really created, got %d out of %d attempts", numConsumers, np) 7958 } 7959 } 7960 7961 func TestNoRaceRoutePool(t *testing.T) { 7962 var dur1 time.Duration 7963 var dur2 time.Duration 7964 7965 total := 1_000_000 7966 7967 for _, test := range []struct { 7968 name string 7969 poolSize int 7970 }{ 7971 {"no pooling", 0}, 7972 {"pooling", 5}, 7973 } { 7974 t.Run(test.name, func(t *testing.T) { 7975 tmpl := ` 7976 port: -1 7977 accounts { 7978 A { users: [{user: "A", password: "A"}] } 7979 B { users: [{user: "B", password: "B"}] } 7980 C { users: [{user: "C", password: "C"}] } 7981 D { users: [{user: "D", password: "D"}] } 7982 E { users: [{user: "E", password: "E"}] } 7983 } 7984 cluster { 7985 port: -1 7986 name: "local" 7987 %s 7988 pool_size: %d 7989 } 7990 ` 7991 conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, _EMPTY_, test.poolSize))) 7992 s1, o1 := RunServerWithConfig(conf1) 7993 defer s1.Shutdown() 7994 7995 conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, 7996 fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), 7997 test.poolSize))) 7998 s2, _ := RunServerWithConfig(conf2) 7999 defer s2.Shutdown() 8000 8001 checkClusterFormed(t, s1, s2) 8002 8003 wg := sync.WaitGroup{} 8004 wg.Add(5) 8005 8006 sendAndRecv := func(acc string) (*nats.Conn, *nats.Conn) { 8007 t.Helper() 8008 8009 s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(acc, acc)) 8010 count := 0 8011 natsSub(t, s2nc, "foo", func(_ *nats.Msg) { 8012 if count++; count == total { 8013 wg.Done() 8014 } 8015 }) 8016 natsFlush(t, s2nc) 8017 8018 s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(acc, acc)) 8019 8020 checkSubInterest(t, s1, acc, "foo", time.Second) 8021 return s2nc, s1nc 8022 } 8023 8024 var rcv = [5]*nats.Conn{} 8025 var snd = [5]*nats.Conn{} 8026 accs := []string{"A", "B", "C", "D", "E"} 8027 8028 for i := 0; i < 5; i++ { 8029 rcv[i], snd[i] = sendAndRecv(accs[i]) 8030 defer rcv[i].Close() 8031 defer snd[i].Close() 8032 } 8033 8034 payload := []byte("some message") 8035 start := time.Now() 8036 for i := 0; i < 5; i++ { 8037 go func(idx int) { 8038 for i := 0; i < total; i++ { 8039 snd[idx].Publish("foo", payload) 8040 } 8041 }(i) 8042 } 8043 8044 wg.Wait() 8045 dur := time.Since(start) 8046 if test.poolSize == 0 { 8047 dur1 = dur 8048 } else { 8049 dur2 = dur 8050 } 8051 }) 8052 } 8053 perf1 := float64(total*5) / dur1.Seconds() 8054 t.Logf("No pooling: %.0f msgs/sec", perf1) 8055 perf2 := float64(total*5) / dur2.Seconds() 8056 t.Logf("Pooling : %.0f msgs/sec", perf2) 8057 t.Logf("Gain : %.2fx", perf2/perf1) 8058 } 8059 8060 func testNoRaceRoutePerAccount(t *testing.T, useWildCard bool) { 8061 var dur1 time.Duration 8062 var dur2 time.Duration 8063 8064 accounts := make([]string, 5) 8065 for i := 0; i < 5; i++ { 8066 akp, _ := nkeys.CreateAccount() 8067 pub, _ := akp.PublicKey() 8068 accounts[i] = pub 8069 } 8070 routeAccs := fmt.Sprintf("accounts: [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"]", 8071 accounts[0], accounts[1], accounts[2], accounts[3], accounts[4]) 8072 8073 total := 1_000_000 8074 8075 for _, test := range []struct { 8076 name string 8077 dedicated bool 8078 }{ 8079 {"route for all accounts", false}, 8080 {"route per account", true}, 8081 } { 8082 t.Run(test.name, func(t *testing.T) { 8083 tmpl := ` 8084 server_name: "%s" 8085 port: -1 8086 accounts { 8087 %s { users: [{user: "0", password: 
"0"}] } 8088 %s { users: [{user: "1", password: "1"}] } 8089 %s { users: [{user: "2", password: "2"}] } 8090 %s { users: [{user: "3", password: "3"}] } 8091 %s { users: [{user: "4", password: "4"}] } 8092 } 8093 cluster { 8094 port: -1 8095 name: "local" 8096 %s 8097 %s 8098 } 8099 ` 8100 var racc string 8101 if test.dedicated { 8102 racc = routeAccs 8103 } else { 8104 racc = _EMPTY_ 8105 } 8106 conf1 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "A", 8107 accounts[0], accounts[1], accounts[2], accounts[3], 8108 accounts[4], _EMPTY_, racc))) 8109 s1, o1 := RunServerWithConfig(conf1) 8110 defer s1.Shutdown() 8111 8112 conf2 := createConfFile(t, []byte(fmt.Sprintf(tmpl, "B", 8113 accounts[0], accounts[1], accounts[2], accounts[3], accounts[4], 8114 fmt.Sprintf("routes: [\"nats://127.0.0.1:%d\"]", o1.Cluster.Port), 8115 racc))) 8116 s2, _ := RunServerWithConfig(conf2) 8117 defer s2.Shutdown() 8118 8119 checkClusterFormed(t, s1, s2) 8120 8121 wg := sync.WaitGroup{} 8122 wg.Add(5) 8123 8124 sendAndRecv := func(acc string, user string) (*nats.Conn, *nats.Conn) { 8125 t.Helper() 8126 8127 s2nc := natsConnect(t, s2.ClientURL(), nats.UserInfo(user, user)) 8128 count := 0 8129 var subj string 8130 var checkSubj string 8131 if useWildCard { 8132 subj, checkSubj = "foo.*", "foo.0" 8133 } else { 8134 subj, checkSubj = "foo", "foo" 8135 } 8136 natsSub(t, s2nc, subj, func(_ *nats.Msg) { 8137 if count++; count == total { 8138 wg.Done() 8139 } 8140 }) 8141 natsFlush(t, s2nc) 8142 8143 s1nc := natsConnect(t, s1.ClientURL(), nats.UserInfo(user, user)) 8144 8145 checkSubInterest(t, s1, acc, checkSubj, time.Second) 8146 return s2nc, s1nc 8147 } 8148 8149 var rcv = [5]*nats.Conn{} 8150 var snd = [5]*nats.Conn{} 8151 users := []string{"0", "1", "2", "3", "4"} 8152 8153 for i := 0; i < 5; i++ { 8154 rcv[i], snd[i] = sendAndRecv(accounts[i], users[i]) 8155 defer rcv[i].Close() 8156 defer snd[i].Close() 8157 } 8158 8159 payload := []byte("some message") 8160 start := time.Now() 8161 for i := 0; i < 5; i++ { 8162 go func(idx int) { 8163 for i := 0; i < total; i++ { 8164 var subj string 8165 if useWildCard { 8166 subj = fmt.Sprintf("foo.%d", i) 8167 } else { 8168 subj = "foo" 8169 } 8170 snd[idx].Publish(subj, payload) 8171 } 8172 }(i) 8173 } 8174 8175 wg.Wait() 8176 dur := time.Since(start) 8177 if !test.dedicated { 8178 dur1 = dur 8179 } else { 8180 dur2 = dur 8181 } 8182 }) 8183 } 8184 perf1 := float64(total*5) / dur1.Seconds() 8185 t.Logf("Route for all accounts: %.0f msgs/sec", perf1) 8186 perf2 := float64(total*5) / dur2.Seconds() 8187 t.Logf("Route per account : %.0f msgs/sec", perf2) 8188 t.Logf("Gain : %.2fx", perf2/perf1) 8189 } 8190 8191 func TestNoRaceRoutePerAccount(t *testing.T) { 8192 testNoRaceRoutePerAccount(t, false) 8193 } 8194 8195 func TestNoRaceRoutePerAccountSubWithWildcard(t *testing.T) { 8196 testNoRaceRoutePerAccount(t, true) 8197 } 8198 8199 // This test, which checks that messages are not duplicated when pooling or 8200 // per-account routes are reloaded, would cause a DATA RACE that is not 8201 // specific to the changes for pooling/per_account. For this reason, this 8202 // test is located in the norace_test.go file. 
8203 func TestNoRaceRoutePoolAndPerAccountConfigReload(t *testing.T) { 8204 for _, test := range []struct { 8205 name string 8206 poolSizeBefore string 8207 poolSizeAfter string 8208 accountsBefore string 8209 accountsAfter string 8210 }{ 8211 {"from no pool to pool", _EMPTY_, "pool_size: 2", _EMPTY_, _EMPTY_}, 8212 {"increase pool size", "pool_size: 2", "pool_size: 5", _EMPTY_, _EMPTY_}, 8213 {"decrease pool size", "pool_size: 5", "pool_size: 2", _EMPTY_, _EMPTY_}, 8214 {"from pool to no pool", "pool_size: 5", _EMPTY_, _EMPTY_, _EMPTY_}, 8215 {"from no account to account", _EMPTY_, _EMPTY_, _EMPTY_, "accounts: [\"A\"]"}, 8216 {"add account", _EMPTY_, _EMPTY_, "accounts: [\"B\"]", "accounts: [\"A\",\"B\"]"}, 8217 {"remove account", _EMPTY_, _EMPTY_, "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, 8218 {"from account to no account", _EMPTY_, _EMPTY_, "accounts: [\"A\"]", _EMPTY_}, 8219 {"increase pool size and add account", "pool_size: 2", "pool_size: 3", "accounts: [\"B\"]", "accounts: [\"B\",\"A\"]"}, 8220 {"decrease pool size and remove account", "pool_size: 3", "pool_size: 2", "accounts: [\"A\",\"B\"]", "accounts: [\"B\"]"}, 8221 } { 8222 t.Run(test.name, func(t *testing.T) { 8223 tmplA := ` 8224 port: -1 8225 server_name: "A" 8226 accounts { 8227 A { users: [{user: a, password: pwd}] } 8228 B { users: [{user: b, password: pwd}] } 8229 } 8230 cluster: { 8231 port: -1 8232 name: "local" 8233 %s 8234 %s 8235 } 8236 ` 8237 confA := createConfFile(t, []byte(fmt.Sprintf(tmplA, test.poolSizeBefore, test.accountsBefore))) 8238 srva, optsA := RunServerWithConfig(confA) 8239 defer srva.Shutdown() 8240 8241 tmplB := ` 8242 port: -1 8243 server_name: "B" 8244 accounts { 8245 A { users: [{user: a, password: pwd}] } 8246 B { users: [{user: b, password: pwd}] } 8247 } 8248 cluster: { 8249 port: -1 8250 name: "local" 8251 %s 8252 %s 8253 routes: ["nats://127.0.0.1:%d"] 8254 } 8255 ` 8256 confB := createConfFile(t, []byte(fmt.Sprintf(tmplB, test.poolSizeBefore, test.accountsBefore, optsA.Cluster.Port))) 8257 srvb, _ := RunServerWithConfig(confB) 8258 defer srvb.Shutdown() 8259 8260 checkClusterFormed(t, srva, srvb) 8261 8262 ncA := natsConnect(t, srva.ClientURL(), nats.UserInfo("a", "pwd")) 8263 defer ncA.Close() 8264 8265 sub := natsSubSync(t, ncA, "foo") 8266 sub.SetPendingLimits(-1, -1) 8267 checkSubInterest(t, srvb, "A", "foo", time.Second) 8268 8269 ncB := natsConnect(t, srvb.ClientURL(), nats.UserInfo("a", "pwd")) 8270 defer ncB.Close() 8271 8272 wg := sync.WaitGroup{} 8273 wg.Add(1) 8274 ch := make(chan struct{}) 8275 go func() { 8276 defer wg.Done() 8277 8278 for i := 0; ; i++ { 8279 ncB.Publish("foo", []byte(fmt.Sprintf("%d", i))) 8280 select { 8281 case <-ch: 8282 return 8283 default: 8284 } 8285 if i%300 == 0 { 8286 time.Sleep(time.Duration(rand.Intn(5)) * time.Millisecond) 8287 } 8288 } 8289 }() 8290 8291 var l *captureErrorLogger 8292 if test.accountsBefore != _EMPTY_ && test.accountsAfter == _EMPTY_ { 8293 l = &captureErrorLogger{errCh: make(chan string, 100)} 8294 srva.SetLogger(l, false, false) 8295 } 8296 8297 time.Sleep(250 * time.Millisecond) 8298 reloadUpdateConfig(t, srva, confA, fmt.Sprintf(tmplA, test.poolSizeAfter, test.accountsAfter)) 8299 time.Sleep(125 * time.Millisecond) 8300 reloadUpdateConfig(t, srvb, confB, fmt.Sprintf(tmplB, test.poolSizeAfter, test.accountsAfter, optsA.Cluster.Port)) 8301 8302 checkClusterFormed(t, srva, srvb) 8303 checkSubInterest(t, srvb, "A", "foo", time.Second) 8304 8305 if l != nil { 8306 // Errors regarding "No route for account" should stop 8307 
var ok bool 8308 for numErrs := 0; !ok && numErrs < 10; { 8309 select { 8310 case e := <-l.errCh: 8311 if strings.Contains(e, "No route for account") { 8312 numErrs++ 8313 } 8314 case <-time.After(DEFAULT_ROUTE_RECONNECT + 250*time.Millisecond): 8315 ok = true 8316 } 8317 } 8318 if !ok { 8319 t.Fatalf("Still report of no route for account") 8320 } 8321 } 8322 8323 close(ch) 8324 wg.Wait() 8325 8326 for prev := -1; ; { 8327 msg, err := sub.NextMsg(50 * time.Millisecond) 8328 if err != nil { 8329 break 8330 } 8331 cur, _ := strconv.Atoi(string(msg.Data)) 8332 if cur <= prev { 8333 t.Fatalf("Previous was %d, got %d", prev, cur) 8334 } 8335 prev = cur 8336 } 8337 }) 8338 } 8339 } 8340 8341 // This test ensures that outbound queues don't cause a run on 8342 // memory when sending something to lots of clients. 8343 func TestNoRaceClientOutboundQueueMemory(t *testing.T) { 8344 opts := DefaultOptions() 8345 s := RunServer(opts) 8346 defer s.Shutdown() 8347 8348 var before runtime.MemStats 8349 var after runtime.MemStats 8350 8351 var err error 8352 clients := make([]*nats.Conn, 50000) 8353 wait := &sync.WaitGroup{} 8354 wait.Add(len(clients)) 8355 8356 for i := 0; i < len(clients); i++ { 8357 clients[i], err = nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port), nats.InProcessServer(s)) 8358 if err != nil { 8359 t.Fatalf("Error on connect: %v", err) 8360 } 8361 defer clients[i].Close() 8362 8363 clients[i].Subscribe("test", func(m *nats.Msg) { 8364 wait.Done() 8365 }) 8366 } 8367 8368 runtime.GC() 8369 runtime.ReadMemStats(&before) 8370 8371 nc, err := nats.Connect(fmt.Sprintf("nats://%s:%d", opts.Host, opts.Port), nats.InProcessServer(s)) 8372 if err != nil { 8373 t.Fatalf("Error on connect: %v", err) 8374 } 8375 defer nc.Close() 8376 8377 var m [48000]byte 8378 if err = nc.Publish("test", m[:]); err != nil { 8379 t.Fatal(err) 8380 } 8381 8382 wait.Wait() 8383 8384 runtime.GC() 8385 runtime.ReadMemStats(&after) 8386 8387 hb, ha := float64(before.HeapAlloc), float64(after.HeapAlloc) 8388 ms := float64(len(m)) 8389 diff := float64(ha) - float64(hb) 8390 inc := (diff / float64(hb)) * 100 8391 8392 if inc > 10 { 8393 t.Logf("Message size: %.1fKB\n", ms/1024) 8394 t.Logf("Subscribed clients: %d\n", len(clients)) 8395 t.Logf("Heap allocs before: %.1fMB\n", hb/1024/1024) 8396 t.Logf("Heap allocs after: %.1fMB\n", ha/1024/1024) 8397 t.Logf("Heap allocs delta: %.1f%%\n", inc) 8398 8399 t.Fatalf("memory increase was %.1f%% (should be <= 10%%)", inc) 8400 } 8401 } 8402 8403 func TestNoRaceJetStreamClusterLeafnodeConnectPerf(t *testing.T) { 8404 // Uncomment to run. Needs to be on a big machine. Do not want as part of Travis tests atm. 
8405 skip(t) 8406 8407 tmpl := strings.Replace(jsClusterAccountsTempl, "store_dir:", "domain: cloud, store_dir:", 1) 8408 c := createJetStreamCluster(t, tmpl, "CLOUD", _EMPTY_, 3, 18033, true) 8409 defer c.shutdown() 8410 8411 nc, js := jsClientConnect(t, c.randomServer()) 8412 defer nc.Close() 8413 8414 _, err := js.AddStream(&nats.StreamConfig{ 8415 Name: "STATE", 8416 Subjects: []string{"STATE.GLOBAL.CELL1.*.>"}, 8417 Replicas: 3, 8418 }) 8419 require_NoError(t, err) 8420 8421 tmpl = strings.Replace(jsClusterTemplWithSingleFleetLeafNode, "store_dir:", "domain: vehicle, store_dir:", 1) 8422 8423 var vinSerial int 8424 genVIN := func() string { 8425 vinSerial++ 8426 return fmt.Sprintf("7PDSGAALXNN%06d", vinSerial) 8427 } 8428 8429 numVehicles := 500 8430 for i := 0; i < numVehicles; i++ { 8431 start := time.Now() 8432 vin := genVIN() 8433 ln := c.createLeafNodeWithTemplateNoSystemWithProto(vin, tmpl, "ws") 8434 nc, js := jsClientConnect(t, ln) 8435 _, err := js.AddStream(&nats.StreamConfig{ 8436 Name: "VEHICLE", 8437 Subjects: []string{"STATE.GLOBAL.LOCAL.>"}, 8438 Sources: []*nats.StreamSource{{ 8439 Name: "STATE", 8440 FilterSubject: fmt.Sprintf("STATE.GLOBAL.CELL1.%s.>", vin), 8441 External: &nats.ExternalStream{ 8442 APIPrefix: "$JS.cloud.API", 8443 DeliverPrefix: fmt.Sprintf("DELIVER.STATE.GLOBAL.CELL1.%s", vin), 8444 }, 8445 }}, 8446 }) 8447 require_NoError(t, err) 8448 // Create the sourced stream. 8449 checkLeafNodeConnectedCount(t, ln, 1) 8450 if elapsed := time.Since(start); elapsed > 2*time.Second { 8451 t.Fatalf("Took too long to create leafnode %d connection: %v", i+1, elapsed) 8452 } 8453 nc.Close() 8454 } 8455 } 8456 8457 func TestNoRaceJetStreamClusterDifferentRTTInterestBasedStreamPreAck(t *testing.T) { 8458 tmpl := ` 8459 listen: 127.0.0.1:-1 8460 server_name: %s 8461 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 8462 8463 cluster { 8464 name: "F3" 8465 listen: 127.0.0.1:%d 8466 routes = [%s] 8467 } 8468 8469 accounts { 8470 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 8471 } 8472 ` 8473 8474 // Route Ports 8475 // "S1": 14622, 8476 // "S2": 15622, 8477 // "S3": 16622, 8478 8479 // S2 (stream leader) will have a slow path to S1 (via proxy) and S3 (consumer leader) will have a fast path. 8480 8481 // Do these in order, S1, S2 (proxy) then S3. 8482 c := &cluster{t: t, servers: make([]*Server, 3), opts: make([]*Options, 3), name: "F3"} 8483 8484 // S1 8485 conf := fmt.Sprintf(tmpl, "S1", t.TempDir(), 14622, "route://127.0.0.1:15622, route://127.0.0.1:16622") 8486 c.servers[0], c.opts[0] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8487 8488 // S2 8489 // Create the proxy first. Connect this to S1. Make it slow, e.g. 5ms RTT. 8490 np := createNetProxy(1*time.Millisecond, 1024*1024*1024, 1024*1024*1024, "route://127.0.0.1:14622", true) 8491 routes := fmt.Sprintf("%s, route://127.0.0.1:16622", np.routeURL()) 8492 conf = fmt.Sprintf(tmpl, "S2", t.TempDir(), 15622, routes) 8493 c.servers[1], c.opts[1] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8494 8495 // S3 8496 conf = fmt.Sprintf(tmpl, "S3", t.TempDir(), 16622, "route://127.0.0.1:14622, route://127.0.0.1:15622") 8497 c.servers[2], c.opts[2] = RunServerWithConfig(createConfFile(t, []byte(conf))) 8498 8499 c.checkClusterFormed() 8500 c.waitOnClusterReady() 8501 defer c.shutdown() 8502 defer np.stop() 8503 8504 nc, js := jsClientConnect(t, c.randomServer()) 8505 defer nc.Close() 8506 8507 // Now create the stream. 
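// (Recap of the topology wired up above before the assets are created: S2 will host
// the stream leader and can only reach S1 through the delaying proxy, while S3 will
// host the consumer leader and reaches S1 directly.
//
//	S2 (stream leader)   --netProxy (slow)-->  S1
//	S3 (consumer leader) --direct   (fast)-->  S1
//
// Acks from the consumer leader can therefore land on S1 before the replicated
// messages they acknowledge, which is exactly the ordering the stream's preAcks
// bookkeeping, checked at the end of this test, has to absorb.)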
8508 _, err := js.AddStream(&nats.StreamConfig{ 8509 Name: "EVENTS", 8510 Subjects: []string{"EV.>"}, 8511 Replicas: 3, 8512 Retention: nats.InterestPolicy, 8513 }) 8514 require_NoError(t, err) 8515 8516 // Make sure it's leader is on S2. 8517 sl := c.servers[1] 8518 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 8519 c.waitOnStreamLeader(globalAccountName, "EVENTS") 8520 if s := c.streamLeader(globalAccountName, "EVENTS"); s != sl { 8521 s.JetStreamStepdownStream(globalAccountName, "EVENTS") 8522 return fmt.Errorf("Server %s is not stream leader yet", sl) 8523 } 8524 return nil 8525 }) 8526 8527 // Now create the consumer. 8528 _, err = js.AddConsumer("EVENTS", &nats.ConsumerConfig{ 8529 Durable: "C", 8530 AckPolicy: nats.AckExplicitPolicy, 8531 DeliverSubject: "dx", 8532 }) 8533 require_NoError(t, err) 8534 8535 // Make sure the consumer leader is on S3. 8536 cl := c.servers[2] 8537 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 8538 c.waitOnConsumerLeader(globalAccountName, "EVENTS", "C") 8539 if s := c.consumerLeader(globalAccountName, "EVENTS", "C"); s != cl { 8540 s.JetStreamStepdownConsumer(globalAccountName, "EVENTS", "C") 8541 return fmt.Errorf("Server %s is not consumer leader yet", sl) 8542 } 8543 return nil 8544 }) 8545 8546 // Create the real consumer on the consumer leader to make it efficient. 8547 nc, js = jsClientConnect(t, cl) 8548 defer nc.Close() 8549 8550 _, err = js.Subscribe(_EMPTY_, func(msg *nats.Msg) { 8551 msg.Ack() 8552 }, nats.BindStream("EVENTS"), nats.Durable("C"), nats.ManualAck()) 8553 require_NoError(t, err) 8554 8555 for i := 0; i < 1_000; i++ { 8556 _, err := js.PublishAsync("EVENTS.PAID", []byte("ok")) 8557 require_NoError(t, err) 8558 } 8559 select { 8560 case <-js.PublishAsyncComplete(): 8561 case <-time.After(5 * time.Second): 8562 t.Fatalf("Did not receive completion signal") 8563 } 8564 8565 slow := c.servers[0] 8566 mset, err := slow.GlobalAccount().lookupStream("EVENTS") 8567 require_NoError(t, err) 8568 8569 // Make sure preAck is non-nil, so we know the logic has kicked in. 8570 mset.mu.RLock() 8571 preAcks := mset.preAcks 8572 mset.mu.RUnlock() 8573 require_NotNil(t, preAcks) 8574 8575 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 8576 state := mset.state() 8577 if state.Msgs == 0 { 8578 mset.mu.RLock() 8579 lp := len(mset.preAcks) 8580 mset.mu.RUnlock() 8581 if lp == 0 { 8582 return nil 8583 } else { 8584 t.Fatalf("Expected no preAcks with no msgs, but got %d", lp) 8585 } 8586 } 8587 return fmt.Errorf("Still have %d msgs left", state.Msgs) 8588 }) 8589 8590 } 8591 8592 func TestNoRaceCheckAckFloorWithVeryLargeFirstSeqAndNewConsumers(t *testing.T) { 8593 s := RunBasicJetStreamServer(t) 8594 defer s.Shutdown() 8595 8596 nc, _ := jsClientConnect(t, s) 8597 defer nc.Close() 8598 8599 // Make sure to time bound here for the acksync call below. 8600 js, err := nc.JetStream(nats.MaxWait(200 * time.Millisecond)) 8601 require_NoError(t, err) 8602 8603 _, err = js.AddStream(&nats.StreamConfig{ 8604 Name: "TEST", 8605 Subjects: []string{"wq-req"}, 8606 Retention: nats.WorkQueuePolicy, 8607 }) 8608 require_NoError(t, err) 8609 8610 largeFirstSeq := uint64(1_200_000_000) 8611 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: largeFirstSeq}) 8612 require_NoError(t, err) 8613 si, err := js.StreamInfo("TEST") 8614 require_NoError(t, err) 8615 require_True(t, si.State.FirstSeq == largeFirstSeq) 8616 8617 // Add a simple request to the stream. 
8618 sendStreamMsg(t, nc, "wq-req", "HELP") 8619 8620 sub, err := js.PullSubscribe("wq-req", "dlc") 8621 require_NoError(t, err) 8622 8623 msgs, err := sub.Fetch(1) 8624 require_NoError(t, err) 8625 require_True(t, len(msgs) == 1) 8626 8627 // The bug is around the checkAckFloor walking the sequences from current ackfloor 8628 // to the first sequence of the stream. We time bound the max wait with the js context 8629 // to 200ms. Since checkAckFloor is spinning and holding up processing of acks this will fail. 8630 // We will short circuit new consumers to fix this one. 8631 require_NoError(t, msgs[0].AckSync()) 8632 8633 // Now do again so we move past the new consumer with no ack floor situation. 8634 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: 2 * largeFirstSeq}) 8635 require_NoError(t, err) 8636 si, err = js.StreamInfo("TEST") 8637 require_NoError(t, err) 8638 require_True(t, si.State.FirstSeq == 2*largeFirstSeq) 8639 8640 sendStreamMsg(t, nc, "wq-req", "MORE HELP") 8641 8642 // We check this one directly for this use case. 8643 mset, err := s.GlobalAccount().lookupStream("TEST") 8644 require_NoError(t, err) 8645 o := mset.lookupConsumer("dlc") 8646 require_True(t, o != nil) 8647 8648 // Purge will move the stream floor by default, so force into the situation where it is back to largeFirstSeq. 8649 // This will not trigger the new consumer logic, but will trigger a walk of the sequence space. 8650 // Fix will be to walk the lesser of the two linear spaces. 8651 o.mu.Lock() 8652 o.asflr = largeFirstSeq 8653 o.mu.Unlock() 8654 8655 done := make(chan bool) 8656 go func() { 8657 o.checkAckFloor() 8658 done <- true 8659 }() 8660 8661 select { 8662 case <-done: 8663 return 8664 case <-time.After(time.Second): 8665 t.Fatalf("Check ack floor taking too long!") 8666 } 8667 } 8668 8669 func TestNoRaceReplicatedMirrorWithLargeStartingSequenceOverLeafnode(t *testing.T) { 8670 // Cluster B 8671 tmpl := strings.Replace(jsClusterTempl, "store_dir:", "domain: B, store_dir:", 1) 8672 c := createJetStreamCluster(t, tmpl, "B", _EMPTY_, 3, 22020, true) 8673 defer c.shutdown() 8674 8675 // Cluster A 8676 // Domain is "A' 8677 lc := c.createLeafNodesWithStartPortAndDomain("A", 3, 22110, "A") 8678 defer lc.shutdown() 8679 8680 lc.waitOnClusterReady() 8681 8682 // Create a stream on B (HUB/CLOUD) and set its starting sequence very high. 8683 nc, js := jsClientConnect(t, c.randomServer()) 8684 defer nc.Close() 8685 8686 _, err := js.AddStream(&nats.StreamConfig{ 8687 Name: "TEST", 8688 Subjects: []string{"foo"}, 8689 Replicas: 3, 8690 }) 8691 require_NoError(t, err) 8692 8693 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: 1_000_000_000}) 8694 require_NoError(t, err) 8695 8696 // Send in a small amount of messages. 8697 for i := 0; i < 1000; i++ { 8698 sendStreamMsg(t, nc, "foo", "Hello") 8699 } 8700 8701 si, err := js.StreamInfo("TEST") 8702 require_NoError(t, err) 8703 require_True(t, si.State.FirstSeq == 1_000_000_000) 8704 8705 // Now try to create a replicated mirror on the leaf cluster. 8706 lnc, ljs := jsClientConnect(t, lc.randomServer()) 8707 defer lnc.Close() 8708 8709 _, err = ljs.AddStream(&nats.StreamConfig{ 8710 Name: "TEST", 8711 Mirror: &nats.StreamSource{ 8712 Name: "TEST", 8713 Domain: "B", 8714 }, 8715 }) 8716 require_NoError(t, err) 8717 8718 // Make sure we sync quickly. 
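// (Note on the mirror definition above: setting Domain on the StreamSource is what
// lets the mirror reach back across the leafnode boundary into the hub's JetStream
// domain. As a hedged sketch, it behaves roughly like spelling out the hub's API
// prefix explicitly, similar to the External form used elsewhere in this file:
//
//	Mirror: &nats.StreamSource{
//	    Name:     "TEST",
//	    External: &nats.ExternalStream{APIPrefix: "$JS.B.API"},
//	}
//
// The check below then only has to confirm that the mirror picked up the high
// starting sequence rather than restarting from 1.)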
8719 checkFor(t, time.Second, 200*time.Millisecond, func() error { 8720 si, err = ljs.StreamInfo("TEST") 8721 require_NoError(t, err) 8722 if si.State.Msgs == 1000 && si.State.FirstSeq == 1_000_000_000 { 8723 return nil 8724 } 8725 return fmt.Errorf("Mirror state not correct: %+v", si.State) 8726 }) 8727 } 8728 8729 func TestNoRaceBinaryStreamSnapshotEncodingBasic(t *testing.T) { 8730 s := RunBasicJetStreamServer(t) 8731 defer s.Shutdown() 8732 8733 nc, js := jsClientConnect(t, s) 8734 defer nc.Close() 8735 8736 _, err := js.AddStream(&nats.StreamConfig{ 8737 Name: "TEST", 8738 Subjects: []string{"*"}, 8739 MaxMsgsPerSubject: 1, 8740 }) 8741 require_NoError(t, err) 8742 8743 // Set first key 8744 sendStreamMsg(t, nc, "key:1", "hello") 8745 8746 // Set Second key but keep updating it, causing a laggard pattern. 8747 value := bytes.Repeat([]byte("Z"), 8*1024) 8748 8749 for i := 0; i <= 1000; i++ { 8750 _, err := js.PublishAsync("key:2", value) 8751 require_NoError(t, err) 8752 } 8753 select { 8754 case <-js.PublishAsyncComplete(): 8755 case <-time.After(5 * time.Second): 8756 t.Fatalf("Did not receive completion signal") 8757 } 8758 8759 // Now do more of swiss cheese style. 8760 for i := 3; i <= 1000; i++ { 8761 key := fmt.Sprintf("key:%d", i) 8762 _, err := js.PublishAsync(key, value) 8763 require_NoError(t, err) 8764 // Send it twice to create hole right behind it, like swiss cheese. 8765 _, err = js.PublishAsync(key, value) 8766 require_NoError(t, err) 8767 } 8768 select { 8769 case <-js.PublishAsyncComplete(): 8770 case <-time.After(5 * time.Second): 8771 t.Fatalf("Did not receive completion signal") 8772 } 8773 8774 // Make for round numbers for stream state. 8775 sendStreamMsg(t, nc, "key:2", "hello") 8776 sendStreamMsg(t, nc, "key:2", "world") 8777 8778 si, err := js.StreamInfo("TEST") 8779 require_NoError(t, err) 8780 require_True(t, si.State.FirstSeq == 1) 8781 require_True(t, si.State.LastSeq == 3000) 8782 require_True(t, si.State.Msgs == 1000) 8783 require_True(t, si.State.NumDeleted == 2000) 8784 8785 mset, err := s.GlobalAccount().lookupStream("TEST") 8786 require_NoError(t, err) 8787 8788 snap, err := mset.store.EncodedStreamState(0) 8789 require_NoError(t, err) 8790 8791 // Now decode the snapshot. 8792 ss, err := DecodeStreamState(snap) 8793 require_NoError(t, err) 8794 8795 require_Equal(t, ss.FirstSeq, 1) 8796 require_Equal(t, ss.LastSeq, 3000) 8797 require_Equal(t, ss.Msgs, 1000) 8798 require_Equal(t, ss.Deleted.NumDeleted(), 2000) 8799 } 8800 8801 func TestNoRaceFilestoreBinaryStreamSnapshotEncodingLargeGaps(t *testing.T) { 8802 storeDir := t.TempDir() 8803 fcfg := FileStoreConfig{ 8804 StoreDir: storeDir, 8805 BlockSize: 512, // Small on purpose to create alot of blks. 8806 } 8807 fs, err := newFileStore(fcfg, StreamConfig{Name: "zzz", Subjects: []string{"zzz"}, Storage: FileStorage}) 8808 require_NoError(t, err) 8809 defer fs.Stop() 8810 8811 subj, msg := "zzz", bytes.Repeat([]byte("X"), 128) 8812 numMsgs := 20_000 8813 8814 fs.StoreMsg(subj, nil, msg) 8815 for i := 2; i < numMsgs; i++ { 8816 seq, _, err := fs.StoreMsg(subj, nil, nil) 8817 require_NoError(t, err) 8818 fs.RemoveMsg(seq) 8819 } 8820 fs.StoreMsg(subj, nil, msg) 8821 8822 snap, err := fs.EncodedStreamState(0) 8823 require_NoError(t, err) 8824 require_True(t, len(snap) < 512) 8825 8826 // Now decode the snapshot. 
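// (The < 512 bytes bound above only holds because the encoded state stores the
// interior deletes as a couple of contiguous ranges rather than one entry per
// sequence: here sequences 2 through 19,999 form a single hole. The assertions
// below, len(ss.Deleted) <= 2 and NumDeleted() == 19,998, lean on exactly that.)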
8827 ss, err := DecodeStreamState(snap) 8828 require_NoError(t, err) 8829 8830 require_True(t, ss.FirstSeq == 1) 8831 require_True(t, ss.LastSeq == 20_000) 8832 require_True(t, ss.Msgs == 2) 8833 require_True(t, len(ss.Deleted) <= 2) 8834 require_True(t, ss.Deleted.NumDeleted() == 19_998) 8835 } 8836 8837 func TestNoRaceJetStreamClusterStreamSnapshotCatchup(t *testing.T) { 8838 c := createJetStreamClusterExplicit(t, "R3S", 3) 8839 defer c.shutdown() 8840 8841 // Client based API 8842 nc, js := jsClientConnect(t, c.randomServer()) 8843 defer nc.Close() 8844 8845 _, err := js.AddStream(&nats.StreamConfig{ 8846 Name: "TEST", 8847 Subjects: []string{"*"}, 8848 MaxMsgsPerSubject: 1, 8849 Replicas: 3, 8850 }) 8851 require_NoError(t, err) 8852 8853 msg := []byte("Hello World") 8854 _, err = js.Publish("foo", msg) 8855 require_NoError(t, err) 8856 8857 for i := 1; i < 1000; i++ { 8858 _, err := js.PublishAsync("bar", msg) 8859 require_NoError(t, err) 8860 } 8861 select { 8862 case <-js.PublishAsyncComplete(): 8863 case <-time.After(5 * time.Second): 8864 t.Fatalf("Did not receive completion signal") 8865 } 8866 8867 sr := c.randomNonStreamLeader(globalAccountName, "TEST") 8868 sr.Shutdown() 8869 8870 // In case we were connected to sr. 8871 nc, js = jsClientConnect(t, c.randomServer()) 8872 defer nc.Close() 8873 8874 // Now create a large gap. 8875 for i := 0; i < 50_000; i++ { 8876 _, err := js.PublishAsync("bar", msg) 8877 require_NoError(t, err) 8878 } 8879 select { 8880 case <-js.PublishAsyncComplete(): 8881 case <-time.After(10 * time.Second): 8882 t.Fatalf("Did not receive completion signal") 8883 } 8884 8885 sl := c.streamLeader(globalAccountName, "TEST") 8886 sl.JetStreamSnapshotStream(globalAccountName, "TEST") 8887 8888 sr = c.restartServer(sr) 8889 c.checkClusterFormed() 8890 c.waitOnServerCurrent(sr) 8891 c.waitOnStreamCurrent(sr, globalAccountName, "TEST") 8892 8893 mset, err := sr.GlobalAccount().lookupStream("TEST") 8894 require_NoError(t, err) 8895 8896 // Make sure it's caught up 8897 var state StreamState 8898 mset.store.FastState(&state) 8899 require_Equal(t, state.Msgs, 2) 8900 require_Equal(t, state.FirstSeq, 1) 8901 require_Equal(t, state.LastSeq, 51_000) 8902 require_Equal(t, state.NumDeleted, 51_000-2) 8903 8904 sr.Shutdown() 8905 8906 _, err = js.Publish("baz", msg) 8907 require_NoError(t, err) 8908 8909 sl.JetStreamSnapshotStream(globalAccountName, "TEST") 8910 8911 sr = c.restartServer(sr) 8912 c.checkClusterFormed() 8913 c.waitOnServerCurrent(sr) 8914 c.waitOnStreamCurrent(sr, globalAccountName, "TEST") 8915 8916 mset, err = sr.GlobalAccount().lookupStream("TEST") 8917 require_NoError(t, err) 8918 mset.store.FastState(&state) 8919 8920 require_Equal(t, state.Msgs, 3) 8921 require_Equal(t, state.FirstSeq, 1) 8922 require_Equal(t, state.LastSeq, 51_001) 8923 require_Equal(t, state.NumDeleted, 51_001-3) 8924 } 8925 8926 func TestNoRaceStoreStreamEncoderDecoder(t *testing.T) { 8927 cfg := &StreamConfig{ 8928 Name: "zzz", 8929 Subjects: []string{"*"}, 8930 MaxMsgsPer: 1, 8931 Storage: MemoryStorage, 8932 } 8933 ms, err := newMemStore(cfg) 8934 require_NoError(t, err) 8935 8936 fs, err := newFileStore( 8937 FileStoreConfig{StoreDir: t.TempDir()}, 8938 StreamConfig{Name: "zzz", Subjects: []string{"*"}, MaxMsgsPer: 1, Storage: FileStorage}, 8939 ) 8940 require_NoError(t, err) 8941 defer fs.Stop() 8942 8943 const seed = 2222222 8944 msg := bytes.Repeat([]byte("ABC"), 33) // ~100bytes 8945 8946 maxEncodeTime := 2 * time.Second 8947 maxEncodeSize := 700 * 1024 8948 8949 test := 
func(t *testing.T, gs StreamStore) { 8950 t.Parallel() 8951 prand := rand.New(rand.NewSource(seed)) 8952 tick := time.NewTicker(time.Second) 8953 defer tick.Stop() 8954 done := time.NewTimer(10 * time.Second) 8955 8956 for running := true; running; { 8957 select { 8958 case <-tick.C: 8959 var state StreamState 8960 gs.FastState(&state) 8961 if state.NumDeleted == 0 { 8962 continue 8963 } 8964 start := time.Now() 8965 snap, err := gs.EncodedStreamState(0) 8966 require_NoError(t, err) 8967 elapsed := time.Since(start) 8968 // Should take <1ms without race but if CI/CD is slow we will give it a bit of room. 8969 if elapsed > maxEncodeTime { 8970 t.Logf("Encode took longer then expected: %v", elapsed) 8971 } 8972 if len(snap) > maxEncodeSize { 8973 t.Fatalf("Expected snapshot size < %v got %v", friendlyBytes(maxEncodeSize), friendlyBytes(len(snap))) 8974 } 8975 ss, err := DecodeStreamState(snap) 8976 require_True(t, len(ss.Deleted) > 0) 8977 require_NoError(t, err) 8978 case <-done.C: 8979 running = false 8980 default: 8981 key := strconv.Itoa(prand.Intn(256_000)) 8982 gs.StoreMsg(key, nil, msg) 8983 } 8984 } 8985 } 8986 8987 for _, gs := range []StreamStore{ms, fs} { 8988 switch gs.(type) { 8989 case *memStore: 8990 t.Run("MemStore", func(t *testing.T) { 8991 test(t, gs) 8992 }) 8993 case *fileStore: 8994 t.Run("FileStore", func(t *testing.T) { 8995 test(t, gs) 8996 }) 8997 } 8998 } 8999 } 9000 9001 func TestNoRaceJetStreamClusterKVWithServerKill(t *testing.T) { 9002 c := createJetStreamClusterExplicit(t, "R3S", 3) 9003 defer c.shutdown() 9004 9005 // Setup the KV bucket and use for making assertions. 9006 nc, js := jsClientConnect(t, c.randomServer()) 9007 defer nc.Close() 9008 _, err := js.CreateKeyValue(&nats.KeyValueConfig{ 9009 Bucket: "TEST", 9010 Replicas: 3, 9011 History: 10, 9012 }) 9013 require_NoError(t, err) 9014 9015 // Total number of keys to range over. 9016 numKeys := 50 9017 9018 // ID is the server id to explicitly connect to. 9019 work := func(ctx context.Context, wg *sync.WaitGroup, id int) { 9020 defer wg.Done() 9021 9022 nc, js := jsClientConnectEx(t, c.servers[id], []nats.JSOpt{nats.Context(ctx)}) 9023 defer nc.Close() 9024 9025 kv, err := js.KeyValue("TEST") 9026 require_NoError(t, err) 9027 9028 // 100 messages a second for each single client. 9029 tk := time.NewTicker(10 * time.Millisecond) 9030 defer tk.Stop() 9031 9032 for { 9033 select { 9034 case <-ctx.Done(): 9035 return 9036 9037 case <-tk.C: 9038 // Pick a random key within the range. 9039 k := fmt.Sprintf("key.%d", rand.Intn(numKeys)) 9040 // Attempt to get a key. 9041 e, err := kv.Get(k) 9042 // If found, attempt to update or delete. 9043 if err == nil { 9044 if rand.Intn(10) < 3 { 9045 kv.Delete(k, nats.LastRevision(e.Revision())) 9046 } else { 9047 kv.Update(k, nil, e.Revision()) 9048 } 9049 } else if errors.Is(err, nats.ErrKeyNotFound) { 9050 kv.Create(k, nil) 9051 } 9052 } 9053 } 9054 } 9055 9056 ctx, cancel := context.WithCancel(context.Background()) 9057 defer cancel() 9058 9059 var wg sync.WaitGroup 9060 wg.Add(3) 9061 9062 go work(ctx, &wg, 0) 9063 go work(ctx, &wg, 1) 9064 go work(ctx, &wg, 2) 9065 9066 time.Sleep(time.Second) 9067 9068 // Simulate server stop and restart. 9069 for i := 0; i < 7; i++ { 9070 s := c.randomServer() 9071 s.Shutdown() 9072 c.waitOnLeader() 9073 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 9074 9075 // Wait for a bit and then start the server again. 
9076 time.Sleep(time.Duration(rand.Intn(1250)) * time.Millisecond) 9077 s = c.restartServer(s) 9078 c.waitOnServerCurrent(s) 9079 c.waitOnLeader() 9080 c.waitOnStreamLeader(globalAccountName, "KV_TEST") 9081 c.waitOnPeerCount(3) 9082 } 9083 9084 // Stop the workload. 9085 cancel() 9086 wg.Wait() 9087 9088 type fullState struct { 9089 state StreamState 9090 lseq uint64 9091 clfs uint64 9092 } 9093 9094 grabState := func(mset *stream) *fullState { 9095 mset.mu.RLock() 9096 defer mset.mu.RUnlock() 9097 var state StreamState 9098 mset.store.FastState(&state) 9099 return &fullState{state, mset.lseq, mset.clfs} 9100 } 9101 9102 grabStore := func(mset *stream) map[string][]uint64 { 9103 mset.mu.RLock() 9104 store := mset.store 9105 mset.mu.RUnlock() 9106 var state StreamState 9107 store.FastState(&state) 9108 storeMap := make(map[string][]uint64) 9109 for seq := state.FirstSeq; seq <= state.LastSeq; seq++ { 9110 if sm, err := store.LoadMsg(seq, nil); err == nil { 9111 storeMap[sm.subj] = append(storeMap[sm.subj], sm.seq) 9112 } 9113 } 9114 return storeMap 9115 } 9116 9117 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 9118 // Current stream leader. 9119 sl := c.streamLeader(globalAccountName, "KV_TEST") 9120 mset, err := sl.GlobalAccount().lookupStream("KV_TEST") 9121 require_NoError(t, err) 9122 lstate := grabState(mset) 9123 golden := grabStore(mset) 9124 9125 // Report messages per server. 9126 for _, s := range c.servers { 9127 if s == sl { 9128 continue 9129 } 9130 mset, err := s.GlobalAccount().lookupStream("KV_TEST") 9131 require_NoError(t, err) 9132 state := grabState(mset) 9133 if !reflect.DeepEqual(state, lstate) { 9134 return fmt.Errorf("Expected follower state\n%+v\nto match leader's\n %+v", state, lstate) 9135 } 9136 sm := grabStore(mset) 9137 if !reflect.DeepEqual(sm, golden) { 9138 t.Fatalf("Expected follower store for %v\n%+v\nto match leader's %v\n %+v", s, sm, sl, golden) 9139 } 9140 } 9141 return nil 9142 }) 9143 } 9144 9145 func TestNoRaceFileStoreLargeMsgsAndFirstMatching(t *testing.T) { 9146 sd := t.TempDir() 9147 fs, err := newFileStore( 9148 FileStoreConfig{StoreDir: sd, BlockSize: 8 * 1024 * 1024}, 9149 StreamConfig{Name: "zzz", Subjects: []string{">"}, Storage: FileStorage}) 9150 require_NoError(t, err) 9151 defer fs.Stop() 9152 9153 for i := 0; i < 150_000; i++ { 9154 fs.StoreMsg(fmt.Sprintf("foo.bar.%d", i), nil, nil) 9155 } 9156 for i := 0; i < 150_000; i++ { 9157 fs.StoreMsg(fmt.Sprintf("foo.baz.%d", i), nil, nil) 9158 } 9159 require_Equal(t, fs.numMsgBlocks(), 2) 9160 fs.mu.RLock() 9161 mb := fs.blks[1] 9162 fs.mu.RUnlock() 9163 fseq := atomic.LoadUint64(&mb.first.seq) 9164 // The -40 leaves enough mb.fss entries to kick in linear scan. 9165 for seq := fseq; seq < 300_000-40; seq++ { 9166 fs.RemoveMsg(uint64(seq)) 9167 } 9168 start := time.Now() 9169 fs.LoadNextMsg("*.baz.*", true, fseq, nil) 9170 require_True(t, time.Since(start) < 200*time.Microsecond) 9171 // Now remove more to kick into non-linear logic. 
9172 for seq := 300_000 - 40; seq < 300_000; seq++ { 9173 fs.RemoveMsg(uint64(seq)) 9174 } 9175 start = time.Now() 9176 fs.LoadNextMsg("*.baz.*", true, fseq, nil) 9177 require_True(t, time.Since(start) < 200*time.Microsecond) 9178 } 9179 9180 func TestNoRaceWSNoCorruptionWithFrameSizeLimit(t *testing.T) { 9181 testWSNoCorruptionWithFrameSizeLimit(t, 50000) 9182 } 9183 9184 func TestNoRaceJetStreamAPIDispatchQueuePending(t *testing.T) { 9185 c := createJetStreamClusterExplicit(t, "R3S", 3) 9186 defer c.shutdown() 9187 9188 // Setup the KV bucket and use for making assertions. 9189 nc, js := jsClientConnect(t, c.randomServer()) 9190 defer nc.Close() 9191 9192 _, err := js.AddStream(&nats.StreamConfig{ 9193 Name: "TEST", 9194 Subjects: []string{"foo.*.*"}, 9195 }) 9196 require_NoError(t, err) 9197 9198 // Queue up 500k messages all with different subjects. 9199 // We want to make num pending for a consumer expensive, so a large subject 9200 // space and wildcards for now does the trick. 9201 toks := []string{"foo", "bar", "baz"} // for second token. 9202 for i := 1; i <= 500_000; i++ { 9203 subj := fmt.Sprintf("foo.%s.%d", toks[rand.Intn(len(toks))], i) 9204 _, err := js.PublishAsync(subj, nil, nats.StallWait(time.Second)) 9205 require_NoError(t, err) 9206 } 9207 select { 9208 case <-js.PublishAsyncComplete(): 9209 case <-time.After(20 * time.Second): 9210 t.Fatalf("Did not receive completion signal") 9211 } 9212 9213 // To back up our pending queue we will create lots of filtered, with wildcards, R1 consumers 9214 // from a different server then the one hosting the stream. 9215 // ok to share this connection here. 9216 sldr := c.streamLeader(globalAccountName, "TEST") 9217 for _, s := range c.servers { 9218 if s != sldr { 9219 nc, js = jsClientConnect(t, s) 9220 defer nc.Close() 9221 break 9222 } 9223 } 9224 9225 ngr, ncons := 100, 10 9226 startCh, errCh := make(chan bool), make(chan error, ngr) 9227 var wg, swg sync.WaitGroup 9228 wg.Add(ngr) 9229 swg.Add(ngr) 9230 9231 // The wildcard in the filter subject is the key. 9232 cfg := &nats.ConsumerConfig{FilterSubject: "foo.*.22"} 9233 var tt atomic.Int64 9234 9235 for i := 0; i < ngr; i++ { 9236 go func() { 9237 defer wg.Done() 9238 swg.Done() 9239 // Make them all fire at once. 9240 <-startCh 9241 9242 for i := 0; i < ncons; i++ { 9243 start := time.Now() 9244 if _, err := js.AddConsumer("TEST", cfg); err != nil { 9245 errCh <- err 9246 t.Logf("Got err creating consumer: %v", err) 9247 } 9248 elapsed := time.Since(start) 9249 tt.Add(int64(elapsed)) 9250 } 9251 }() 9252 } 9253 swg.Wait() 9254 close(startCh) 9255 time.Sleep(time.Millisecond) 9256 jsz, _ := sldr.Jsz(nil) 9257 // This could be 0 legit, so just log, don't fail. 9258 if jsz.JetStreamStats.API.Inflight == 0 { 9259 t.Log("Expected a non-zero inflight") 9260 } 9261 wg.Wait() 9262 9263 if len(errCh) > 0 { 9264 t.Fatalf("Expected no errors, got %d", len(errCh)) 9265 } 9266 } 9267 9268 func TestNoRaceJetStreamMirrorAndSourceConsumerFailBackoff(t *testing.T) { 9269 // Check calculations first. 
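// From the assertions in the loop below, the retry backoff is expected to grow
// linearly at 10 seconds per failed attempt and then clamp at retryMaximum from
// the 12th attempt onward. A hedged reconstruction of that shape (not the server's
// actual implementation):
//
//	func expectedBackoff(fails int) time.Duration {
//	    backoff := time.Duration(fails) * 10 * time.Second
//	    if backoff > retryMaximum {
//	        return retryMaximum // clamp kicks in at or after the 12th failure
//	    }
//	    return backoff
//	}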
9270 for i := 1; i <= 20; i++ { 9271 backoff := calculateRetryBackoff(i) 9272 if i < 12 { 9273 require_Equal(t, backoff, time.Duration(i)*10*time.Second) 9274 } else { 9275 require_Equal(t, backoff, retryMaximum) 9276 } 9277 } 9278 9279 c := createJetStreamClusterExplicit(t, "R3S", 3) 9280 defer c.shutdown() 9281 9282 nc, js := jsClientConnect(t, c.randomServer()) 9283 defer nc.Close() 9284 9285 _, err := js.AddStream(&nats.StreamConfig{ 9286 Name: "TEST", 9287 Subjects: []string{"foo.*.*"}, 9288 }) 9289 require_NoError(t, err) 9290 sl := c.streamLeader(globalAccountName, "TEST") 9291 9292 // Create a mirror. 9293 ml := sl 9294 // Make sure not on the same server. Should not happened in general but possible. 9295 for ml == sl { 9296 js.DeleteStream("MIRROR") 9297 _, err = js.AddStream(&nats.StreamConfig{ 9298 Name: "MIRROR", 9299 Mirror: &nats.StreamSource{Name: "TEST"}, 9300 }) 9301 require_NoError(t, err) 9302 ml = c.streamLeader(globalAccountName, "MIRROR") 9303 } 9304 // Create a source. 9305 srcl := sl 9306 for srcl == sl { 9307 js.DeleteStream("SOURCE") 9308 _, err = js.AddStream(&nats.StreamConfig{ 9309 Name: "SOURCE", 9310 Sources: []*nats.StreamSource{{Name: "TEST"}}, 9311 }) 9312 require_NoError(t, err) 9313 srcl = c.streamLeader(globalAccountName, "MIRROR") 9314 } 9315 9316 // Create sub to watch for the consumer create requests. 9317 nc, _ = jsClientConnect(t, ml) 9318 defer nc.Close() 9319 sub := natsSubSync(t, nc, "$JS.API.CONSUMER.CREATE.>") 9320 9321 // Kill the server where the source is.. 9322 sldr := c.streamLeader(globalAccountName, "TEST") 9323 sldr.Shutdown() 9324 9325 // Wait for just greater than 10s. We should only see 1 request during this time. 9326 time.Sleep(11 * time.Second) 9327 // There should have been 2 requests, one for mirror, one for source 9328 n, _, _ := sub.Pending() 9329 require_Equal(t, n, 2) 9330 var mreq, sreq int 9331 for i := 0; i < 2; i++ { 9332 msg := natsNexMsg(t, sub, time.Second) 9333 if bytes.Contains(msg.Data, []byte("$JS.M.")) { 9334 mreq++ 9335 } else if bytes.Contains(msg.Data, []byte("$JS.S.")) { 9336 sreq++ 9337 } 9338 } 9339 if mreq != 1 || sreq != 1 { 9340 t.Fatalf("Consumer create captures invalid: mreq=%v sreq=%v", mreq, sreq) 9341 } 9342 9343 // Now make sure that the fails is set properly. 9344 mset, err := c.streamLeader(globalAccountName, "MIRROR").GlobalAccount().lookupStream("MIRROR") 9345 require_NoError(t, err) 9346 mset.mu.RLock() 9347 fails := mset.mirror.fails 9348 mset.mu.RUnlock() 9349 require_Equal(t, fails, 1) 9350 9351 mset, err = c.streamLeader(globalAccountName, "SOURCE").GlobalAccount().lookupStream("SOURCE") 9352 require_NoError(t, err) 9353 mset.mu.RLock() 9354 si := mset.sources["TEST > >"] 9355 mset.mu.RUnlock() 9356 require_True(t, si != nil) 9357 require_Equal(t, si.fails, 1) 9358 } 9359 9360 func TestNoRaceJetStreamClusterStreamCatchupLargeInteriorDeletes(t *testing.T) { 9361 c := createJetStreamClusterExplicit(t, "R3S", 3) 9362 defer c.shutdown() 9363 9364 nc, js := jsClientConnect(t, c.randomServer()) 9365 defer nc.Close() 9366 9367 cfg := &nats.StreamConfig{ 9368 Name: "TEST", 9369 Subjects: []string{"foo.*"}, 9370 MaxMsgsPerSubject: 100, 9371 Replicas: 1, 9372 } 9373 9374 _, err := js.AddStream(cfg) 9375 require_NoError(t, err) 9376 9377 msg := bytes.Repeat([]byte("Z"), 2*1024) 9378 // We will create lots of interior deletes on our R1 then scale up. 
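// Rough arithmetic for why this creates so many interior deletes: about 200k
// publishes land on only ~101 distinct subjects (foo.0 plus foo.1 through foo.100),
// and MaxMsgsPerSubject is 100, so at most ~10.1k messages survive. Every older
// per-subject message is removed in place, leaving large holes that the catch-up
// after the scale-up to R2 has to cope with.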
9379 _, err = js.Publish("foo.0", msg) 9380 require_NoError(t, err) 9381 9382 // Create 50k messages randomly from 1-100 9383 for i := 0; i < 50_000; i++ { 9384 subj := fmt.Sprintf("foo.%d", rand.Intn(100)+1) 9385 js.PublishAsync(subj, msg) 9386 } 9387 select { 9388 case <-js.PublishAsyncComplete(): 9389 case <-time.After(5 * time.Second): 9390 t.Fatalf("Did not receive completion signal") 9391 } 9392 // Now create a large gap. 9393 for i := 0; i < 100_000; i++ { 9394 js.PublishAsync("foo.2", msg) 9395 } 9396 select { 9397 case <-js.PublishAsyncComplete(): 9398 case <-time.After(5 * time.Second): 9399 t.Fatalf("Did not receive completion signal") 9400 } 9401 // Do 50k random again at end. 9402 for i := 0; i < 50_000; i++ { 9403 subj := fmt.Sprintf("foo.%d", rand.Intn(100)+1) 9404 js.PublishAsync(subj, msg) 9405 } 9406 select { 9407 case <-js.PublishAsyncComplete(): 9408 case <-time.After(5 * time.Second): 9409 t.Fatalf("Did not receive completion signal") 9410 } 9411 9412 si, err := js.StreamInfo("TEST") 9413 require_NoError(t, err) 9414 9415 cfg.Replicas = 2 9416 _, err = js.UpdateStream(cfg) 9417 require_NoError(t, err) 9418 9419 // Let catchup start. 9420 c.waitOnStreamLeader(globalAccountName, "TEST") 9421 9422 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 9423 require_True(t, nl != nil) 9424 mset, err := nl.GlobalAccount().lookupStream("TEST") 9425 require_NoError(t, err) 9426 9427 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 9428 state := mset.state() 9429 if state.Msgs == si.State.Msgs { 9430 return nil 9431 } 9432 return fmt.Errorf("Msgs not equal %d vs %d", state.Msgs, si.State.Msgs) 9433 }) 9434 } 9435 9436 func TestNoRaceJetStreamClusterBadRestartsWithHealthzPolling(t *testing.T) { 9437 c := createJetStreamClusterExplicit(t, "R3S", 3) 9438 defer c.shutdown() 9439 9440 nc, js := jsClientConnect(t, c.randomServer()) 9441 defer nc.Close() 9442 9443 cfg := &nats.StreamConfig{ 9444 Name: "TEST", 9445 Subjects: []string{"foo.>"}, 9446 Replicas: 3, 9447 } 9448 _, err := js.AddStream(cfg) 9449 require_NoError(t, err) 9450 9451 // We will poll healthz at a decent clip and make sure any restart logic works 9452 // correctly with assets coming and going. 9453 ch := make(chan struct{}) 9454 defer close(ch) 9455 9456 go func() { 9457 for { 9458 select { 9459 case <-ch: 9460 return 9461 case <-time.After(50 * time.Millisecond): 9462 for _, s := range c.servers { 9463 s.healthz(nil) 9464 } 9465 } 9466 } 9467 }() 9468 9469 numConsumers := 500 9470 consumers := make([]string, 0, numConsumers) 9471 9472 var wg sync.WaitGroup 9473 9474 for i := 0; i < numConsumers; i++ { 9475 cname := fmt.Sprintf("CONS-%d", i+1) 9476 consumers = append(consumers, cname) 9477 wg.Add(1) 9478 go func() { 9479 defer wg.Done() 9480 _, err := js.PullSubscribe("foo.>", cname, nats.BindStream("TEST")) 9481 require_NoError(t, err) 9482 }() 9483 } 9484 wg.Wait() 9485 9486 // Make sure all are reported. 9487 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9488 for _, s := range c.servers { 9489 jsz, _ := s.Jsz(nil) 9490 if jsz.Consumers != numConsumers { 9491 return fmt.Errorf("%v wrong number of consumers: %d vs %d", s, jsz.Consumers, numConsumers) 9492 } 9493 } 9494 return nil 9495 }) 9496 9497 // Now do same for streams. 
9498 numStreams := 200 9499 streams := make([]string, 0, numStreams) 9500 9501 for i := 0; i < numStreams; i++ { 9502 sname := fmt.Sprintf("TEST-%d", i+1) 9503 streams = append(streams, sname) 9504 wg.Add(1) 9505 go func() { 9506 defer wg.Done() 9507 _, err := js.AddStream(&nats.StreamConfig{Name: sname, Replicas: 3}) 9508 require_NoError(t, err) 9509 }() 9510 } 9511 wg.Wait() 9512 9513 // Make sure all are reported. 9514 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9515 for _, s := range c.servers { 9516 jsz, _ := s.Jsz(nil) 9517 if jsz.Streams != numStreams+1 { 9518 return fmt.Errorf("%v wrong number of streams: %d vs %d", s, jsz.Streams, numStreams+1) 9519 } 9520 } 9521 return nil 9522 }) 9523 9524 // Delete consumers. 9525 for _, cname := range consumers { 9526 err := js.DeleteConsumer("TEST", cname) 9527 require_NoError(t, err) 9528 } 9529 // Make sure reporting goes to zero. 9530 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9531 for _, s := range c.servers { 9532 jsz, _ := s.Jsz(nil) 9533 if jsz.Consumers != 0 { 9534 return fmt.Errorf("%v still has %d consumers", s, jsz.Consumers) 9535 } 9536 } 9537 return nil 9538 }) 9539 9540 // Delete streams 9541 for _, sname := range streams { 9542 err := js.DeleteStream(sname) 9543 require_NoError(t, err) 9544 } 9545 err = js.DeleteStream("TEST") 9546 require_NoError(t, err) 9547 9548 // Make sure reporting goes to zero. 9549 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 9550 for _, s := range c.servers { 9551 jsz, _ := s.Jsz(nil) 9552 if jsz.Streams != 0 { 9553 return fmt.Errorf("%v still has %d streams", s, jsz.Streams) 9554 } 9555 } 9556 return nil 9557 }) 9558 } 9559 9560 func TestNoRaceJetStreamKVReplaceWithServerRestart(t *testing.T) { 9561 c := createJetStreamClusterExplicit(t, "R3S", 3) 9562 defer c.shutdown() 9563 9564 nc, _ := jsClientConnect(t, c.randomServer()) 9565 defer nc.Close() 9566 // Shorten wait time for disconnects. 9567 js, err := nc.JetStream(nats.MaxWait(time.Second)) 9568 require_NoError(t, err) 9569 9570 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 9571 Bucket: "TEST", 9572 Replicas: 3, 9573 }) 9574 require_NoError(t, err) 9575 9576 createData := func(n int) []byte { 9577 const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 9578 b := make([]byte, n) 9579 for i := range b { 9580 b[i] = letterBytes[rand.Intn(len(letterBytes))] 9581 } 9582 return b 9583 } 9584 9585 _, err = kv.Create("foo", createData(160)) 9586 require_NoError(t, err) 9587 9588 ch := make(chan struct{}) 9589 wg := sync.WaitGroup{} 9590 9591 // For counting errors that should not happen. 9592 errCh := make(chan error, 1024) 9593 9594 wg.Add(1) 9595 go func() { 9596 defer wg.Done() 9597 9598 var lastData []byte 9599 var revision uint64 9600 9601 for { 9602 select { 9603 case <-ch: 9604 return 9605 default: 9606 k, err := kv.Get("foo") 9607 if err == nats.ErrKeyNotFound { 9608 errCh <- err 9609 } else if k != nil { 9610 if lastData != nil && k.Revision() == revision && !bytes.Equal(lastData, k.Value()) { 9611 errCh <- fmt.Errorf("data loss [%s][rev:%d] expected:[%q] is:[%q]\n", "foo", revision, lastData, k.Value()) 9612 } 9613 newData := createData(160) 9614 if revision, err = kv.Update("foo", newData, k.Revision()); err == nil { 9615 lastData = newData 9616 } 9617 } 9618 } 9619 } 9620 }() 9621 9622 // Wait a short bit. 
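// (What the writer goroutine above is really checking: once kv.Update succeeds and
// reports revision r for a value, any later Get that still returns revision r must
// return those same bytes. Seeing revision r with different data, or the key missing
// entirely, after the rolling restarts below would mean an acknowledged replace was
// lost.)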
9623 time.Sleep(2 * time.Second) 9624 for _, s := range c.servers { 9625 s.Shutdown() 9626 // Need to leave servers down for awhile to trigger bug properly. 9627 time.Sleep(5 * time.Second) 9628 s = c.restartServer(s) 9629 c.waitOnServerHealthz(s) 9630 } 9631 9632 // Shutdown the go routine above. 9633 close(ch) 9634 // Wait for it to finish. 9635 wg.Wait() 9636 9637 if len(errCh) != 0 { 9638 for err := range errCh { 9639 t.Logf("Received err %v during test", err) 9640 } 9641 t.Fatalf("Encountered errors") 9642 } 9643 } 9644 9645 func TestNoRaceMemStoreCompactPerformance(t *testing.T) { 9646 //Load MemStore so that it is full 9647 subj, msg := "foo", make([]byte, 1000) 9648 storedMsgSize := memStoreMsgSize(subj, nil, msg) 9649 9650 toStore := uint64(10_000) 9651 toStoreOnTop := uint64(1_000) 9652 setSeqNo := uint64(10_000_000_000) 9653 9654 expectedPurge := toStore - 1 9655 maxBytes := storedMsgSize * toStore 9656 9657 ms, err := newMemStore(&StreamConfig{Storage: MemoryStorage, MaxBytes: int64(maxBytes)}) 9658 require_NoError(t, err) 9659 defer ms.Stop() 9660 9661 for i := uint64(0); i < toStore; i++ { 9662 ms.StoreMsg(subj, nil, msg) 9663 } 9664 state := ms.State() 9665 require_Equal(t, toStore, state.Msgs) 9666 require_Equal(t, state.Bytes, storedMsgSize*toStore) 9667 9668 //1st run: Load additional messages then compact 9669 for i := uint64(0); i < toStoreOnTop; i++ { 9670 ms.StoreMsg(subj, nil, msg) 9671 } 9672 startFirstRun := time.Now() 9673 purgedFirstRun, _ := ms.Compact(toStore + toStoreOnTop) 9674 elapsedFirstRun := time.Since(startFirstRun) 9675 require_Equal(t, expectedPurge, purgedFirstRun) 9676 9677 //set the seq number to a very high value by compacting with a too high seq number 9678 purgedFull, _ := ms.Compact(setSeqNo) 9679 require_Equal(t, 1, purgedFull) 9680 9681 //2nd run: Compact again 9682 for i := uint64(0); i < toStore; i++ { 9683 ms.StoreMsg(subj, nil, msg) 9684 } 9685 startSecondRun := time.Now() 9686 purgedSecondRun, _ := ms.Compact(setSeqNo + toStore - 1) 9687 elapsedSecondRun := time.Since(startSecondRun) 9688 require_Equal(t, expectedPurge, purgedSecondRun) 9689 9690 //Calculate delta between runs and fail if it is too high 9691 require_LessThan(t, elapsedSecondRun-elapsedFirstRun, time.Duration(1)*time.Second) 9692 } 9693 9694 func TestNoRaceJetStreamSnapshotsWithSlowAckDontSlowConsumer(t *testing.T) { 9695 s := RunBasicJetStreamServer(t) 9696 defer s.Shutdown() 9697 9698 ech := make(chan error) 9699 ecb := func(_ *nats.Conn, _ *nats.Subscription, err error) { 9700 if err != nil { 9701 ech <- err 9702 } 9703 } 9704 nc, js := jsClientConnect(t, s, nats.ErrorHandler(ecb)) 9705 defer nc.Close() 9706 9707 _, err := js.AddStream(&nats.StreamConfig{ 9708 Name: "TEST", 9709 Subjects: []string{"foo"}, 9710 }) 9711 require_NoError(t, err) 9712 9713 // Put in over 64MB. 
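// 80 chunks of 1MB comfortably exceeds the ~64MB alluded to above, and the
// subscriber below deliberately never answers the snapshot's flow-control messages.
// The expected outcome, asserted at the end of the test, is that the server gives up
// and terminates the snapshot with an EOF carrying a 408 "No Flow Response" status,
// rather than letting the unserviced snapshot trip slow-consumer handling on the
// client connection.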
9714 msg, toSend := make([]byte, 1024*1024), 80 9715 crand.Read(msg) 9716 9717 for i := 0; i < toSend; i++ { 9718 _, err := js.Publish("foo", msg) 9719 require_NoError(t, err) 9720 } 9721 9722 sreq := &JSApiStreamSnapshotRequest{ 9723 DeliverSubject: nats.NewInbox(), 9724 ChunkSize: 1024 * 1024, 9725 } 9726 req, _ := json.Marshal(sreq) 9727 rmsg, err := nc.Request(fmt.Sprintf(JSApiStreamSnapshotT, "TEST"), req, time.Second) 9728 require_NoError(t, err) 9729 9730 var resp JSApiStreamSnapshotResponse 9731 json.Unmarshal(rmsg.Data, &resp) 9732 require_True(t, resp.Error == nil) 9733 9734 done := make(chan *nats.Msg) 9735 sub, _ := nc.Subscribe(sreq.DeliverSubject, func(m *nats.Msg) { 9736 // EOF 9737 if len(m.Data) == 0 { 9738 done <- m 9739 return 9740 } 9741 }) 9742 defer sub.Unsubscribe() 9743 9744 // Check that we do not get disconnected due to slow consumer. 9745 select { 9746 case msg := <-done: 9747 require_Equal(t, msg.Header.Get("Status"), "408") 9748 require_Equal(t, msg.Header.Get("Description"), "No Flow Response") 9749 case <-ech: 9750 t.Fatalf("Got disconnected: %v", err) 9751 case <-time.After(5 * time.Second): 9752 t.Fatalf("Should have received EOF with error status") 9753 } 9754 } 9755 9756 func TestNoRaceJetStreamWQSkippedMsgsOnScaleUp(t *testing.T) { 9757 c := createJetStreamClusterExplicit(t, "R3S", 3) 9758 defer c.shutdown() 9759 9760 nc, js := jsClientConnect(t, c.randomServer()) 9761 defer nc.Close() 9762 9763 const pre = "CORE_ENT_DR_OTP_22." 9764 wcSubj := pre + ">" 9765 9766 _, err := js.AddStream(&nats.StreamConfig{ 9767 Name: "TEST", 9768 Subjects: []string{wcSubj}, 9769 Retention: nats.WorkQueuePolicy, 9770 AllowDirect: true, 9771 Replicas: 3, 9772 }) 9773 require_NoError(t, err) 9774 9775 cfg := &nats.ConsumerConfig{ 9776 Durable: "dlc", 9777 FilterSubject: wcSubj, 9778 DeliverPolicy: nats.DeliverAllPolicy, 9779 AckPolicy: nats.AckExplicitPolicy, 9780 MaxAckPending: 10_000, 9781 AckWait: 500 * time.Millisecond, 9782 MaxWaiting: 100, 9783 MaxRequestExpires: 1050 * time.Millisecond, 9784 } 9785 _, err = js.AddConsumer("TEST", cfg) 9786 require_NoError(t, err) 9787 9788 pdone := make(chan bool) 9789 cdone := make(chan bool) 9790 9791 // We will have 51 consumer apps and a producer app. Make sure to wait for 9792 // all go routines to end at the end of the test. 
	wg := sync.WaitGroup{}
	wg.Add(52)

	// Publish routine.
	go func() {
		defer wg.Done()

		publishSubjects := []string{
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.918886682066",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.918886682067",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543211",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543212",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543213",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543214",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543215",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543216",
			"CORE_ENT_DR_OTP_22.P.H.TC.10011.1010.916596543217",
		}
		// ~1.7KB payload.
		msg := bytes.Repeat([]byte("Z"), 1750)

		// 200 msgs/s
		st := time.NewTicker(5 * time.Millisecond)
		defer st.Stop()

		nc, js := jsClientConnect(t, c.randomServer())
		defer nc.Close()

		for {
			select {
			case <-st.C:
				subj := publishSubjects[rand.Intn(len(publishSubjects))]
				_, err = js.Publish(subj, msg)
				require_NoError(t, err)
			case <-pdone:
				return
			}
		}
	}()

	consumerApp := func() {
		defer wg.Done()

		nc, js := jsClientConnect(t, c.randomServer())
		defer nc.Close()

		_, err := js.ConsumerInfo("TEST", "dlc")
		require_NoError(t, err)
		_, err = js.UpdateConsumer("TEST", cfg)
		require_NoError(t, err)

		sub, err := js.PullSubscribe(wcSubj, "dlc")
		require_NoError(t, err)

		st := time.NewTicker(100 * time.Millisecond)
		defer st.Stop()

		for {
			select {
			case <-st.C:
				msgs, err := sub.Fetch(1, nats.MaxWait(100*time.Millisecond))
				if err != nil {
					continue
				}
				require_Equal(t, len(msgs), 1)
				m := msgs[0]
				if rand.Intn(10) == 1 {
					m.Nak()
				} else {
					// Wait up to 20ms to ack.
					time.Sleep(time.Duration(rand.Intn(20)) * time.Millisecond)
					// This could fail and that is ok; the system should recover due to the low ack wait.
					m.Ack()
				}
			case <-cdone:
				return
			}
		}
	}

	// Start with a single consumer app.
	go consumerApp()

	// Wait for 2s.
	time.Sleep(2 * time.Second)

	// Now spin up 50 more.
	for i := 1; i <= 50; i++ {
		if i%5 == 0 {
			time.Sleep(200 * time.Millisecond)
		}
		go consumerApp()
	}

	timeout := time.Now().Add(8 * time.Second)
	for time.Now().Before(timeout) {
		time.Sleep(750 * time.Millisecond)
		if s := c.consumerLeader(globalAccountName, "TEST", "dlc"); s != nil {
			s.JetStreamStepdownConsumer(globalAccountName, "TEST", "dlc")
		}
	}

	// Stop the publisher and defer stopping the consumers.
	close(pdone)
	defer func() {
		close(cdone)
		wg.Wait()
	}()

	checkFor(t, 30*time.Second, 50*time.Millisecond, func() error {
		si, err := js.StreamInfo("TEST")
		require_NoError(t, err)
		if si.State.NumDeleted > 0 || si.State.Msgs > 0 {
			return fmt.Errorf("State not correct: %+v", si.State)
		}
		return nil
	})
}

func TestNoRaceConnectionObjectReleased(t *testing.T) {
	ob1Conf := createConfFile(t, []byte(`
		listen: "127.0.0.1:-1"
		server_name: "B1"
		accounts {
			A { users: [{user: a, password: pwd}] }
			SYS { users: [{user: sys, password: pwd}] }
		}
		cluster {
			name: "B"
			listen: "127.0.0.1:-1"
		}
		gateway {
			name: "B"
			listen: "127.0.0.1:-1"
		}
		leaf {
			listen: "127.0.0.1:-1"
		}
		system_account: "SYS"
	`))
	sb1, ob1 := RunServerWithConfig(ob1Conf)
	defer sb1.Shutdown()

	oaConf := createConfFile(t, []byte(fmt.Sprintf(`
		listen: "127.0.0.1:-1"
		server_name: "A"
		accounts {
			A { users: [{user: a, password: pwd}] }
			SYS { users: [{user: sys, password: pwd}] }
		}
		gateway {
			name: "A"
			listen: "127.0.0.1:-1"
			gateways [
				{
					name: "B"
					url: "nats://a:pwd@127.0.0.1:%d"
				}
			]
		}
		websocket {
			listen: "127.0.0.1:-1"
			no_tls: true
		}
		system_account: "SYS"
	`, ob1.Gateway.Port)))
	sa, oa := RunServerWithConfig(oaConf)
	defer sa.Shutdown()

	waitForOutboundGateways(t, sa, 1, 2*time.Second)
	waitForOutboundGateways(t, sb1, 1, 2*time.Second)

	ob2Conf := createConfFile(t, []byte(fmt.Sprintf(`
		listen: "127.0.0.1:-1"
		server_name: "B2"
		accounts {
			A { users: [{user: a, password: pwd}] }
			SYS { users: [{user: sys, password: pwd}] }
		}
		cluster {
			name: "B"
			listen: "127.0.0.1:-1"
			routes: ["nats://127.0.0.1:%d"]
		}
		gateway {
			name: "B"
			listen: "127.0.0.1:-1"
		}
		system_account: "SYS"
	`, ob1.Cluster.Port)))
	sb2, _ := RunServerWithConfig(ob2Conf)
	defer sb2.Shutdown()

	checkClusterFormed(t, sb1, sb2)
	waitForOutboundGateways(t, sb2, 1, 2*time.Second)
	waitForInboundGateways(t, sa, 2, 2*time.Second)

	leafConf := createConfFile(t, []byte(fmt.Sprintf(`
		listen: "127.0.0.1:-1"
		server_name: "C"
		accounts {
			A { users: [{user: a, password: pwd}] }
			SYS { users: [{user: sys, password: pwd}] }
		}
		leafnodes {
			remotes [
				{ url: "nats://a:pwd@127.0.0.1:%d" }
			]
		}
		system_account: "SYS"
	`, ob1.LeafNode.Port)))
	leaf, _ := RunServerWithConfig(leafConf)
	defer leaf.Shutdown()

	checkLeafNodeConnected(t, leaf)

	// Start an independent MQTT server to check MQTT client connection.
	mo := testMQTTDefaultOptions()
	sm := testMQTTRunServer(t, mo)
	defer testMQTTShutdownServer(sm)

	mc, mr := testMQTTConnect(t, &mqttConnInfo{cleanSess: true}, mo.MQTT.Host, mo.MQTT.Port)
	defer mc.Close()
	testMQTTCheckConnAck(t, mr, mqttConnAckRCConnectionAccepted, false)

	nc := natsConnect(t, sb1.ClientURL(), nats.UserInfo("a", "pwd"))
	defer nc.Close()
	cid, err := nc.GetClientID()
	require_NoError(t, err)
	natsSubSync(t, nc, "foo")

	ncWS := natsConnect(t, fmt.Sprintf("ws://a:pwd@127.0.0.1:%d", oa.Websocket.Port))
	defer ncWS.Close()
	cidWS, err := ncWS.GetClientID()
	require_NoError(t, err)

	var conns []net.Conn
	var total int
	var ch chan string

	track := func(c *client) {
		total++
		c.mu.Lock()
		conns = append(conns, c.nc)
		c.mu.Unlock()
		runtime.SetFinalizer(c, func(c *client) {
			ch <- fmt.Sprintf("Server=%s - Kind=%s - Conn=%v", c.srv, c.kindString(), c)
		})
	}
	// Track the connection for the MQTT client
	sm.mu.RLock()
	for _, c := range sm.clients {
		track(c)
	}
	sm.mu.RUnlock()

	// Track the connection from the NATS client
	track(sb1.getClient(cid))
	// The outbound connection to GW "A"
	track(sb1.getOutboundGatewayConnection("A"))
	// The inbound connection from GW "A"
	var inGW []*client
	sb1.getInboundGatewayConnections(&inGW)
	track(inGW[0])
	// The routes from sb2
	sb1.forEachRoute(func(r *client) {
		track(r)
	})
	// The leaf connection from the leaf server "C"
	sb1.mu.RLock()
	for _, l := range sb1.leafs {
		track(l)
	}
	sb1.mu.RUnlock()

	// Now from sb2, the routes to sb1
	sb2.forEachRoute(func(r *client) {
		track(r)
	})
	// The outbound connection to GW "A"
	track(sb2.getOutboundGatewayConnection("A"))

	// From server "A", track the outbound GW
	track(sa.getOutboundGatewayConnection("B"))
	inGW = inGW[:0]
	// Track the inbound GW connections
	sa.getInboundGatewayConnections(&inGW)
	for _, ig := range inGW {
		track(ig)
	}
	// Track the websocket client
	track(sa.getClient(cidWS))

	// From the LEAF server, the connection to sb1
	leaf.mu.RLock()
	for _, l := range leaf.leafs {
		track(l)
	}
	leaf.mu.RUnlock()

	// Now close all connections and wait to see if all the finalizers
	// we set are invoked.
	ch = make(chan string, total)
	// Close the clients and then all other connections to create a disconnect.
	nc.Close()
	mc.Close()
	ncWS.Close()
	for _, conn := range conns {
		conn.Close()
	}
	// Wait and see if we get them all.
	tm := time.NewTimer(10 * time.Second)
	defer tm.Stop()
	tk := time.NewTicker(10 * time.Millisecond)
	for clients := make([]string, 0, total); len(clients) < total; {
		select {
		case <-tk.C:
			runtime.GC()
		case cs := <-ch:
			clients = append(clients, cs)
		case <-tm.C:
			// Don't fail the test since there is no guarantee that
			// finalizers are invoked.
			t.Logf("Got %v out of %v finalizers", len(clients), total)
			sort.Strings(clients)
			for _, cs := range clients {
				t.Logf(" => %s", cs)
			}
			return
		}
	}
}

func TestNoRaceFileStoreMsgLoadNextMsgMultiPerf(t *testing.T) {
	fs, err := newFileStore(
		FileStoreConfig{StoreDir: t.TempDir()},
		StreamConfig{Name: "zzz", Subjects: []string{"foo.*"}, Storage: FileStorage})
	require_NoError(t, err)
	defer fs.Stop()

	// Put 1k msgs in.
	for i := 0; i < 1000; i++ {
		subj := fmt.Sprintf("foo.%d", i)
		fs.StoreMsg(subj, nil, []byte("ZZZ"))
	}

	var smv StoreMsg

	// Now do normal load next with no filter.
	// This is the baseline.
	start := time.Now()
	for i, seq := 0, uint64(1); i < 1000; i++ {
		sm, nseq, err := fs.LoadNextMsg(_EMPTY_, false, seq, &smv)
		require_NoError(t, err)
		require_True(t, sm.subj == fmt.Sprintf("foo.%d", i))
		require_Equal(t, nseq, seq)
		seq++
	}
	baseline := time.Since(start)
	t.Logf("Single - No filter %v", baseline)

	// Now do normal load next with wc filter.
	start = time.Now()
	for i, seq := 0, uint64(1); i < 1000; i++ {
		sm, nseq, err := fs.LoadNextMsg("foo.>", true, seq, &smv)
		require_NoError(t, err)
		require_True(t, sm.subj == fmt.Sprintf("foo.%d", i))
		require_Equal(t, nseq, seq)
		seq++
	}
	elapsed := time.Since(start)
	require_True(t, elapsed < 2*baseline)
	t.Logf("Single - WC filter %v", elapsed)

	// Now do multi load next with 1 wc entry.
	sl := NewSublistWithCache()
	sl.Insert(&subscription{subject: []byte("foo.>")})
	start = time.Now()
	for i, seq := 0, uint64(1); i < 1000; i++ {
		sm, nseq, err := fs.LoadNextMsgMulti(sl, seq, &smv)
		require_NoError(t, err)
		require_True(t, sm.subj == fmt.Sprintf("foo.%d", i))
		require_Equal(t, nseq, seq)
		seq++
	}
	elapsed = time.Since(start)
	require_True(t, elapsed < 2*baseline)
	t.Logf("Multi - Single WC filter %v", elapsed)

	// Now do multi load next with 1000 literal subjects.
	sl = NewSublistWithCache()
	for i := 0; i < 1000; i++ {
		subj := fmt.Sprintf("foo.%d", i)
		sl.Insert(&subscription{subject: []byte(subj)})
	}
	start = time.Now()
	for i, seq := 0, uint64(1); i < 1000; i++ {
		sm, nseq, err := fs.LoadNextMsgMulti(sl, seq, &smv)
		require_NoError(t, err)
		require_True(t, sm.subj == fmt.Sprintf("foo.%d", i))
		require_Equal(t, nseq, seq)
		seq++
	}
	elapsed = time.Since(start)
	require_True(t, elapsed < 2*baseline)
	t.Logf("Multi - 1000 filters %v", elapsed)
}

func TestNoRaceWQAndMultiSubjectFilters(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Subjects:  []string{"Z.>"},
		Retention: nats.WorkQueuePolicy,
	})
	require_NoError(t, err)

	stopPubs := make(chan bool)

	publish := func(subject string) {
		nc, js := jsClientConnect(t, c.randomServer())
		defer nc.Close()

		for {
			select {
			case <-stopPubs:
				return
			default:
				_, _ = js.Publish(subject, []byte("hello"))
			}
		}
	}

	go publish("Z.foo")
	go publish("Z.bar")
	go publish("Z.baz")

	// Cancel pubs after 10s.
	time.AfterFunc(10*time.Second, func() { close(stopPubs) })

	// Create a consumer.
	_, err = js.AddConsumer("TEST", &nats.ConsumerConfig{
		Durable:        "zzz",
		AckPolicy:      nats.AckExplicitPolicy,
		AckWait:        5 * time.Second,
		FilterSubjects: []string{"Z.foo", "Z.bar", "Z.baz"},
	})
	require_NoError(t, err)

	sub, err := js.PullSubscribe(_EMPTY_, "zzz", nats.Bind("TEST", "zzz"))
	require_NoError(t, err)

	received := make([]uint64, 0, 256_000)
	batchSize := 10

	for running := true; running; {
		msgs, err := sub.Fetch(batchSize, nats.MaxWait(2*time.Second))
		if err == nats.ErrTimeout {
			running = false
		}
		for _, m := range msgs {
			meta, err := m.Metadata()
			require_NoError(t, err)
			received = append(received, meta.Sequence.Stream)
			m.Ack()
		}
	}

	sort.Slice(received, func(i, j int) bool { return received[i] < received[j] })

	var pseq, gaps uint64
	for _, seq := range received {
		if pseq != 0 && pseq != seq-1 {
			gaps += seq - pseq + 1
		}
		pseq = seq
	}
	si, err := js.StreamInfo("TEST")
	require_NoError(t, err)

	if si.State.Msgs != 0 || gaps > 0 {
		t.Fatalf("Orphaned msgs %d with %d gaps detected", si.State.Msgs, gaps)
	}
}

// https://github.com/nats-io/nats-server/issues/4957
func TestNoRaceWQAndMultiSubjectFiltersRace(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Subjects:  []string{"Z.>"},
		Retention: nats.WorkQueuePolicy,
		Replicas:  1,
	})
	require_NoError(t, err)

	// The bug would happen when the stream was on the same server as the meta-leader,
	// so make that so.
	// Keep stepping the meta-leader down until the stream leader and the
	// meta-leader end up on the same server.
	sl := c.streamLeader(globalAccountName, "TEST")
	checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
		if sl == c.leader() {
			return nil
		}
		// Move the meta-leader since the stream can be R1.
		nc.Request(JSApiLeaderStepDown, nil, time.Second)
		return fmt.Errorf("stream leader not yet on meta-leader")
	})

	start := make(chan struct{})
	var done, ready sync.WaitGroup

	// Create num goroutines that will all race to create a consumer with the
	// same filter subject but a different name.
	num := 10
	ready.Add(num)
	done.Add(num)

	for i := 0; i < num; i++ {
		go func(n int) {
			// Connect directly to the meta-leader but with our own connection.
			s := c.leader()
			nc, js := jsClientConnect(t, s)
			defer nc.Close()

			ready.Done()
			defer done.Done()
			<-start

			js.AddConsumer("TEST", &nats.ConsumerConfig{
				Name:          fmt.Sprintf("C-%d", n),
				FilterSubject: "Z.foo",
				AckPolicy:     nats.AckExplicitPolicy,
			})
		}(i)
	}

	// Wait for the requestors to be ready.
	ready.Wait()
	close(start)
	done.Wait()

	checkFor(t, 5*time.Second, 100*time.Millisecond, func() error {
		si, err := js.StreamInfo("TEST")
		require_NoError(t, err)
		if si.State.Consumers != 1 {
			return fmt.Errorf("Consumer count not correct: %d vs 1", si.State.Consumers)
		}
		return nil
	})
}

func TestNoRaceFileStoreWriteFullStateUniqueSubjects(t *testing.T) {
	fcfg := FileStoreConfig{StoreDir: t.TempDir()}
	fs, err := newFileStore(fcfg,
		StreamConfig{Name: "zzz", Subjects: []string{"records.>"}, Storage: FileStorage, MaxMsgsPer: 1, MaxBytes: 15 * 1024 * 1024 * 1024})
	require_NoError(t, err)
	defer fs.Stop()

	qch := make(chan struct{})
	defer close(qch)

	go func() {
		const numThreshold = 1_000_000
		tick := time.NewTicker(1 * time.Second)
		for {
			select {
			case <-qch:
				return
			case <-tick.C:
				err := fs.writeFullState()
				var state StreamState
				fs.FastState(&state)
				if state.Msgs > numThreshold && err != nil {
					require_Error(t, err, errStateTooBig)
				}
			}
		}
	}()

	labels := []string{"AAAAA", "BBBB", "CCCC", "DD", "EEEEE"}
	msg := []byte(strings.Repeat("Z", 128))

	for i := 0; i < 100; i++ {
		partA := nuid.Next()
		for j := 0; j < 100; j++ {
			partB := nuid.Next()
			for k := 0; k < 500; k++ {
				partC := nuid.Next()
				partD := labels[rand.Intn(len(labels))]
				subject := fmt.Sprintf("records.%s.%s.%s.%s.%s", partA, partB, partC, partD, nuid.Next())
				start := time.Now()
				fs.StoreMsg(subject, nil, msg)
				elapsed := time.Since(start)
				if elapsed > 500*time.Millisecond {
					t.Fatalf("Slow store for %q: %v\n", subject, elapsed)
				}
			}
		}
	}
	// Make sure we do write the full state on stop.
	fs.Stop()
	fi, err := os.Stat(filepath.Join(fcfg.StoreDir, msgDir, streamStreamStateFile))
	require_NoError(t, err)
	// ~500MB, could change if we tweak encodings.
	require_True(t, fi.Size() > 500*1024*1024)
}
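
// NOTE: The perf-style tests above, such as TestNoRaceFileStoreMsgLoadNextMsgMultiPerf,
// repeat a simple pattern: time a baseline loop, then require that an alternate code
// path completes within a small multiple of that baseline. The helper below is a
// minimal, hypothetical sketch of that pattern; it is not used by the tests in this
// file and is shown for illustration only.
func requireWithinFactor(t *testing.T, baseline, elapsed time.Duration, factor int) {
	t.Helper()
	// Fail when the measured time exceeds factor * baseline.
	if limit := baseline * time.Duration(factor); elapsed > limit {
		t.Fatalf("elapsed %v exceeded %dx the baseline of %v", elapsed, factor, baseline)
	}
}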