// Copyright 2020-2022 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !skip_js_tests && !skip_js_cluster_tests && !skip_js_cluster_tests_2 && !skip_js_super_cluster_tests
// +build !skip_js_tests,!skip_js_cluster_tests,!skip_js_cluster_tests_2,!skip_js_super_cluster_tests

package server

import (
	"encoding/json"
	"errors"
	"fmt"
	"math/rand"
	"net/http"
	"net/http/httptest"
	"reflect"
	"strings"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/nats-io/jwt/v2"
	"github.com/nats-io/nats.go"
	"github.com/nats-io/nkeys"
)

func TestJetStreamSuperClusterMetaPlacement(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 3)
	defer sc.shutdown()

	// We want to influence where the meta leader will place itself when we ask the
	// current leader to stepdown.
	ml := sc.leader()
	cn := ml.ClusterName()
	var pcn string
	for _, c := range sc.clusters {
		if c.name != cn {
			pcn = c.name
			break
		}
	}

	// Client based API
	s := sc.randomServer()
	nc, err := nats.Connect(s.ClientURL(), nats.UserInfo("admin", "s3cr3t!"))
	if err != nil {
		t.Fatalf("Failed to create system client: %v", err)
	}
	defer nc.Close()

	stepdown := func(cn string) *JSApiLeaderStepDownResponse {
		req := &JSApiLeaderStepdownRequest{Placement: &Placement{Cluster: cn}}
		jreq, err := json.Marshal(req)
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}

		resp, err := nc.Request(JSApiLeaderStepDown, jreq, time.Second)
		if err != nil {
			t.Fatalf("Error on stepdown request: %v", err)
		}
		var sdr JSApiLeaderStepDownResponse
		if err := json.Unmarshal(resp.Data, &sdr); err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		return &sdr
	}

	// Make sure we get correct errors for tags and bad or unavailable cluster placement.
	sdr := stepdown("C22")
	if sdr.Error == nil || !strings.Contains(sdr.Error.Description, "no replacement peer connected") {
		t.Fatalf("Got incorrect error result: %+v", sdr.Error)
	}
	// Should work.
	sdr = stepdown(pcn)
	if sdr.Error != nil {
		t.Fatalf("Got an error on stepdown: %+v", sdr.Error)
	}

	sc.waitOnLeader()
	ml = sc.leader()
	cn = ml.ClusterName()

	if cn != pcn {
		t.Fatalf("Expected new metaleader to be in cluster %q, got %q", pcn, cn)
	}
}

func TestJetStreamSuperClusterUniquePlacementTag(t *testing.T) {
	tmlp := `
	listen: 127.0.0.1:-1
	server_name: %s
	jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s', unique_tag: az}
	leaf {listen: 127.0.0.1:-1}
	cluster {
		name: %s
		listen: 127.0.0.1:%d
		routes = [%s]
	}
	# For access to system account.
	accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!"
} ] } } 113 ` 114 s := createJetStreamSuperClusterWithTemplateAndModHook(t, tmlp, 5, 2, 115 func(serverName, clustername, storeDir, conf string) string { 116 azTag := map[string]string{ 117 "C1-S1": "az:same", 118 "C1-S2": "az:same", 119 "C1-S3": "az:same", 120 "C1-S4": "az:same", 121 "C1-S5": "az:same", 122 "C2-S1": "az:1", 123 "C2-S2": "az:2", 124 "C2-S3": "az:1", 125 "C2-S4": "az:2", 126 "C2-S5": "az:1", 127 } 128 return conf + fmt.Sprintf("\nserver_tags: [cloud:%s-tag, %s]\n", clustername, azTag[serverName]) 129 }, nil) 130 defer s.shutdown() 131 132 inDifferentAz := func(ci *nats.ClusterInfo) (bool, error) { 133 t.Helper() 134 if len(ci.Replicas) == 0 { 135 return true, nil 136 } 137 // if R2 (has replica, this setup does not support R3), test if the server in a cluster picked the same az, 138 // as determined by modulo2 of server number which aligns with az 139 dummy := 0 140 srvnum1 := 0 141 srvnum2 := 0 142 if n, _ := fmt.Sscanf(ci.Leader, "C%d-S%d", &dummy, &srvnum1); n != 2 { 143 return false, fmt.Errorf("couldn't parse leader") 144 } 145 if n, _ := fmt.Sscanf(ci.Replicas[0].Name, "C%d-S%d", &dummy, &srvnum2); n != 2 { 146 return false, fmt.Errorf("couldn't parse replica") 147 } 148 return srvnum1%2 != srvnum2%2, nil 149 } 150 151 nc := natsConnect(t, s.randomServer().ClientURL()) 152 defer nc.Close() 153 154 js, err := nc.JetStream() 155 require_NoError(t, err) 156 157 for i, test := range []struct { 158 placement *nats.Placement 159 replicas int 160 fail bool 161 cluster string 162 }{ 163 // these pass because replica count is 1 164 {&nats.Placement{Tags: []string{"az:same"}}, 1, false, "C1"}, 165 {&nats.Placement{Tags: []string{"cloud:C1-tag", "az:same"}}, 1, false, "C1"}, 166 {&nats.Placement{Tags: []string{"cloud:C1-tag"}}, 1, false, "C1"}, 167 // pass because az is set, which disables the filter 168 {&nats.Placement{Tags: []string{"az:same"}}, 2, false, "C1"}, 169 {&nats.Placement{Tags: []string{"cloud:C1-tag", "az:same"}}, 2, false, "C1"}, 170 // fails because this cluster only has the same az 171 {&nats.Placement{Tags: []string{"cloud:C1-tag"}}, 2, true, ""}, 172 // fails because no 3 unique tags exist 173 {&nats.Placement{Tags: []string{"cloud:C2-tag"}}, 3, true, ""}, 174 {nil, 3, true, ""}, 175 // pass because replica count is low enough 176 {nil, 2, false, "C2"}, 177 {&nats.Placement{Tags: []string{"cloud:C2-tag"}}, 2, false, "C2"}, 178 // pass because az is provided 179 {&nats.Placement{Tags: []string{"az:1"}}, 3, false, "C2"}, 180 {&nats.Placement{Tags: []string{"az:2"}}, 2, false, "C2"}, 181 } { 182 name := fmt.Sprintf("test-%d", i) 183 t.Run(name, func(t *testing.T) { 184 si, err := js.AddStream(&nats.StreamConfig{Name: name, Replicas: test.replicas, Placement: test.placement}) 185 if test.fail { 186 require_Error(t, err) 187 require_Contains(t, err.Error(), "no suitable peers for placement", "server tag not unique") 188 return 189 } 190 require_NoError(t, err) 191 if test.cluster != _EMPTY_ { 192 require_Equal(t, si.Cluster.Name, test.cluster) 193 } 194 // skip placement test if tags call for a particular az 195 if test.placement != nil && len(test.placement.Tags) > 0 { 196 for _, tag := range test.placement.Tags { 197 if strings.HasPrefix(tag, "az:") { 198 return 199 } 200 } 201 } 202 diff, err := inDifferentAz(si.Cluster) 203 require_NoError(t, err) 204 require_True(t, diff) 205 }) 206 } 207 208 t.Run("scale-up-test", func(t *testing.T) { 209 // create enough streams so we hit it eventually 210 for i := 0; i < 10; i++ { 211 cfg := &nats.StreamConfig{Name: 
fmt.Sprintf("scale-up-%d", i), Replicas: 1, 212 Placement: &nats.Placement{Tags: []string{"cloud:C2-tag"}}} 213 si, err := js.AddStream(cfg) 214 require_NoError(t, err) 215 require_Equal(t, si.Cluster.Name, "C2") 216 cfg.Replicas = 2 217 si, err = js.UpdateStream(cfg) 218 require_NoError(t, err) 219 require_Equal(t, si.Cluster.Name, "C2") 220 checkFor(t, 10, 250*time.Millisecond, func() error { 221 if si, err := js.StreamInfo(cfg.Name); err != nil { 222 return err 223 } else if diff, err := inDifferentAz(si.Cluster); err != nil { 224 return err 225 } else if !diff { 226 return fmt.Errorf("not in different AZ") 227 } 228 return nil 229 }) 230 } 231 }) 232 } 233 234 func TestJetStreamSuperClusterBasics(t *testing.T) { 235 sc := createJetStreamSuperCluster(t, 3, 3) 236 defer sc.shutdown() 237 238 // Client based API 239 s := sc.randomServer() 240 nc, js := jsClientConnect(t, s) 241 defer nc.Close() 242 243 _, err := js.AddStream(&nats.StreamConfig{Name: "TEST", Replicas: 3}) 244 if err != nil { 245 t.Fatalf("Unexpected error: %v", err) 246 } 247 248 // Send in 10 messages. 249 msg, toSend := []byte("Hello JS Clustering"), 10 250 for i := 0; i < toSend; i++ { 251 if _, err = js.Publish("TEST", msg); err != nil { 252 t.Fatalf("Unexpected publish error: %v", err) 253 } 254 } 255 // Now grab info for this stream. 256 si, err := js.StreamInfo("TEST") 257 if err != nil { 258 t.Fatalf("Unexpected error: %v", err) 259 } 260 if si == nil || si.Config.Name != "TEST" { 261 t.Fatalf("StreamInfo is not correct %+v", si) 262 } 263 // Check active state as well, shows that the owner answered. 264 if si.State.Msgs != uint64(toSend) { 265 t.Fatalf("Expected %d msgs, got bad state: %+v", toSend, si.State) 266 } 267 // Check request origin placement. 268 if si.Cluster.Name != s.ClusterName() { 269 t.Fatalf("Expected stream to be placed in %q, but got %q", s.ClusterName(), si.Cluster.Name) 270 } 271 272 // Check consumers. 273 sub, err := js.SubscribeSync("TEST") 274 if err != nil { 275 t.Fatalf("Unexpected error: %v", err) 276 } 277 checkSubsPending(t, sub, toSend) 278 ci, err := sub.ConsumerInfo() 279 if err != nil { 280 t.Fatalf("Unexpected error: %v", err) 281 } 282 if ci.Delivered.Consumer != uint64(toSend) || ci.NumAckPending != toSend { 283 t.Fatalf("ConsumerInfo is not correct: %+v", ci) 284 } 285 286 // Now check we can place a stream. 287 pcn := "C3" 288 scResp, err := js.AddStream(&nats.StreamConfig{ 289 Name: "TEST2", 290 Placement: &nats.Placement{Cluster: pcn}, 291 }) 292 if err != nil { 293 t.Fatalf("Unexpected error: %v", err) 294 } 295 296 if scResp.Cluster.Name != pcn { 297 t.Fatalf("Expected the stream to be placed in %q, got %q", pcn, scResp.Cluster.Name) 298 } 299 } 300 301 // Test that consumer interest across gateways and superclusters is properly identitifed in a remote cluster. 302 func TestJetStreamSuperClusterCrossClusterConsumerInterest(t *testing.T) { 303 sc := createJetStreamSuperCluster(t, 3, 3) 304 defer sc.shutdown() 305 306 // Since we need all of the peers accounted for to add the stream wait for all to be present. 307 sc.waitOnPeerCount(9) 308 309 // Client based API - Connect to Cluster C1. Stream and consumer will live in C2. 310 s := sc.clusterForName("C1").randomServer() 311 nc, js := jsClientConnect(t, s) 312 defer nc.Close() 313 314 pcn := "C2" 315 _, err := js.AddStream(&nats.StreamConfig{Name: "foo", Replicas: 3, Placement: &nats.Placement{Cluster: pcn}}) 316 if err != nil { 317 t.Fatalf("Unexpected error: %v", err) 318 } 319 320 // Pull based first. 
	sub, err := js.PullSubscribe("foo", "dlc")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Send a message.
	if _, err = js.Publish("foo", []byte("CCI")); err != nil {
		t.Fatalf("Unexpected publish error: %v", err)
	}

	fetchMsgs(t, sub, 1, 5*time.Second)

	// Now check push based delivery.
	sub, err = js.SubscribeSync("foo", nats.Durable("rip"))
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	checkSubsPending(t, sub, 1)

	// Send another message.
	if _, err = js.Publish("foo", []byte("CCI")); err != nil {
		t.Fatalf("Unexpected publish error: %v", err)
	}
	checkSubsPending(t, sub, 2)
}

func TestJetStreamSuperClusterPeerReassign(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 3)
	defer sc.shutdown()

	// Client based API
	s := sc.randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	pcn := "C2"

	// Create a replicated stream in C2.
	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Placement: &nats.Placement{Cluster: pcn},
		Replicas:  3,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	// Send in 10 messages.
	msg, toSend := []byte("Hello JS Clustering"), 10
	for i := 0; i < toSend; i++ {
		if _, err = js.Publish("TEST", msg); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}
	// Now grab info for this stream.
	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si == nil || si.Config.Name != "TEST" {
		t.Fatalf("StreamInfo is not correct %+v", si)
	}
	// Check active state as well, shows that the owner answered.
	if si.State.Msgs != uint64(toSend) {
		t.Fatalf("Expected %d msgs, got bad state: %+v", toSend, si.State)
	}
	// Check requested placement.
	if si.Cluster.Name != pcn {
		t.Fatalf("Expected stream to be placed in %q, but got %q", pcn, si.Cluster.Name)
	}

	// Now remove a peer that is assigned to the stream.
	rc := sc.clusterForName(pcn)
	rs := rc.randomNonStreamLeader("$G", "TEST")
	rc.removeJetStream(rs)

	// Check the stream info is eventually correct.
398 checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { 399 si, err := js.StreamInfo("TEST") 400 if err != nil { 401 return fmt.Errorf("Could not fetch stream info: %v", err) 402 } 403 if len(si.Cluster.Replicas) != 2 { 404 return fmt.Errorf("Expected 2 replicas, got %d", len(si.Cluster.Replicas)) 405 } 406 for _, peer := range si.Cluster.Replicas { 407 if !peer.Current { 408 return fmt.Errorf("Expected replica to be current: %+v", peer) 409 } 410 if !strings.HasPrefix(peer.Name, pcn) { 411 t.Fatalf("Stream peer reassigned to wrong cluster: %q", peer.Name) 412 } 413 } 414 return nil 415 }) 416 } 417 418 func TestJetStreamSuperClusterInterestOnlyMode(t *testing.T) { 419 GatewayDoNotForceInterestOnlyMode(true) 420 defer GatewayDoNotForceInterestOnlyMode(false) 421 422 template := ` 423 listen: 127.0.0.1:-1 424 server_name: %s 425 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 426 accounts { 427 one { 428 jetstream: enabled 429 users [{user: one, password: password}] 430 } 431 two { 432 %s 433 users [{user: two, password: password}] 434 } 435 } 436 cluster { 437 listen: 127.0.0.1:%d 438 name: %s 439 routes = ["nats://127.0.0.1:%d"] 440 } 441 gateway { 442 name: %s 443 listen: 127.0.0.1:%d 444 gateways = [{name: %s, urls: ["nats://127.0.0.1:%d"]}] 445 } 446 ` 447 storeDir1 := t.TempDir() 448 conf1 := createConfFile(t, []byte(fmt.Sprintf(template, 449 "S1", storeDir1, "", 23222, "A", 23222, "A", 11222, "B", 11223))) 450 s1, o1 := RunServerWithConfig(conf1) 451 defer s1.Shutdown() 452 453 storeDir2 := t.TempDir() 454 conf2 := createConfFile(t, []byte(fmt.Sprintf(template, 455 "S2", storeDir2, "", 23223, "B", 23223, "B", 11223, "A", 11222))) 456 s2, o2 := RunServerWithConfig(conf2) 457 defer s2.Shutdown() 458 459 waitForInboundGateways(t, s1, 1, 2*time.Second) 460 waitForInboundGateways(t, s2, 1, 2*time.Second) 461 waitForOutboundGateways(t, s1, 1, 2*time.Second) 462 waitForOutboundGateways(t, s2, 1, 2*time.Second) 463 464 nc1 := natsConnect(t, fmt.Sprintf("nats://two:password@127.0.0.1:%d", o1.Port)) 465 defer nc1.Close() 466 nc1.Publish("foo", []byte("some message")) 467 nc1.Flush() 468 469 nc2 := natsConnect(t, fmt.Sprintf("nats://two:password@127.0.0.1:%d", o2.Port)) 470 defer nc2.Close() 471 nc2.Publish("bar", []byte("some message")) 472 nc2.Flush() 473 474 checkMode := func(accName string, expectedMode GatewayInterestMode) { 475 t.Helper() 476 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 477 servers := []*Server{s1, s2} 478 for _, s := range servers { 479 var gws []*client 480 s.getInboundGatewayConnections(&gws) 481 for _, gw := range gws { 482 var mode GatewayInterestMode 483 gw.mu.Lock() 484 ie := gw.gw.insim[accName] 485 if ie != nil { 486 mode = ie.mode 487 } 488 gw.mu.Unlock() 489 if ie == nil { 490 return fmt.Errorf("Account %q not in map", accName) 491 } 492 if mode != expectedMode { 493 return fmt.Errorf("Expected account %q mode to be %v, got: %v", accName, expectedMode, mode) 494 } 495 } 496 } 497 return nil 498 }) 499 } 500 501 checkMode("one", InterestOnly) 502 checkMode("two", Optimistic) 503 504 // Now change account "two" to enable JS 505 changeCurrentConfigContentWithNewContent(t, conf1, []byte(fmt.Sprintf(template, 506 "S1", storeDir1, "jetstream: enabled", 23222, "A", 23222, "A", 11222, "B", 11223))) 507 changeCurrentConfigContentWithNewContent(t, conf2, []byte(fmt.Sprintf(template, 508 "S2", storeDir2, "jetstream: enabled", 23223, "B", 23223, "B", 11223, "A", 11222))) 509 510 if err := s1.Reload(); err != nil { 511 
t.Fatalf("Error on s1 reload: %v", err) 512 } 513 if err := s2.Reload(); err != nil { 514 t.Fatalf("Error on s2 reload: %v", err) 515 } 516 517 checkMode("one", InterestOnly) 518 checkMode("two", InterestOnly) 519 } 520 521 func TestJetStreamSuperClusterConnectionCount(t *testing.T) { 522 sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 3, 2) 523 defer sc.shutdown() 524 525 sysNc := natsConnect(t, sc.randomServer().ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 526 defer sysNc.Close() 527 _, err := sysNc.Request(fmt.Sprintf(accDirectReqSubj, "ONE", "CONNS"), nil, 100*time.Millisecond) 528 // this is a timeout as the server only responds when it has connections.... 529 // not convinced this should be that way, but also not the issue to investigate. 530 require_True(t, err == nats.ErrTimeout) 531 532 for i := 1; i <= 2; i++ { 533 func() { 534 nc := natsConnect(t, sc.clusterForName(fmt.Sprintf("C%d", i)).randomServer().ClientURL()) 535 defer nc.Close() 536 js, err := nc.JetStream() 537 require_NoError(t, err) 538 name := fmt.Sprintf("foo%d", 1) 539 _, err = js.AddStream(&nats.StreamConfig{ 540 Name: name, 541 Subjects: []string{name}, 542 Replicas: 3}) 543 require_NoError(t, err) 544 }() 545 } 546 func() { 547 nc := natsConnect(t, sc.clusterForName("C1").randomServer().ClientURL()) 548 defer nc.Close() 549 js, err := nc.JetStream() 550 require_NoError(t, err) 551 _, err = js.AddStream(&nats.StreamConfig{ 552 Name: "src", 553 Sources: []*nats.StreamSource{{Name: "foo1"}, {Name: "foo2"}}, 554 Replicas: 3}) 555 require_NoError(t, err) 556 }() 557 func() { 558 nc := natsConnect(t, sc.clusterForName("C2").randomServer().ClientURL()) 559 defer nc.Close() 560 js, err := nc.JetStream() 561 require_NoError(t, err) 562 _, err = js.AddStream(&nats.StreamConfig{ 563 Name: "mir", 564 Mirror: &nats.StreamSource{Name: "foo2"}, 565 Replicas: 3}) 566 require_NoError(t, err) 567 }() 568 569 // There should be no active NATS CLIENT connections, but we still need 570 // to wait a little bit... 571 checkFor(t, 2*time.Second, 15*time.Millisecond, func() error { 572 _, err := sysNc.Request(fmt.Sprintf(accDirectReqSubj, "ONE", "CONNS"), nil, 100*time.Millisecond) 573 if err != nats.ErrTimeout { 574 return fmt.Errorf("Expected timeout, got %v", err) 575 } 576 return nil 577 }) 578 sysNc.Close() 579 580 s := sc.randomServer() 581 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 582 acc, err := s.lookupAccount("ONE") 583 if err != nil { 584 t.Fatalf("Could not look up account: %v", err) 585 } 586 if n := acc.NumConnections(); n != 0 { 587 return fmt.Errorf("Expected no connections, got %d", n) 588 } 589 return nil 590 }) 591 } 592 593 func TestJetStreamSuperClusterConsumersBrokenGateways(t *testing.T) { 594 sc := createJetStreamSuperCluster(t, 1, 2) 595 defer sc.shutdown() 596 597 // Client based API 598 s := sc.clusterForName("C1").randomServer() 599 nc, js := jsClientConnect(t, s) 600 defer nc.Close() 601 602 // This will be in C1. 603 _, err := js.AddStream(&nats.StreamConfig{Name: "TEST"}) 604 if err != nil { 605 t.Fatalf("Unexpected error: %v", err) 606 } 607 608 // Create a stream in C2 that sources TEST 609 _, err = js.AddStream(&nats.StreamConfig{ 610 Name: "S", 611 Placement: &nats.Placement{Cluster: "C2"}, 612 Sources: []*nats.StreamSource{{Name: "TEST"}}, 613 }) 614 if err != nil { 615 t.Fatalf("Unexpected error: %v", err) 616 } 617 618 // Wait for direct consumer to get registered and detect interest across GW. 
	time.Sleep(time.Second)

	// Send 100 msgs over 100ms in separate Go routine.
	msg, toSend, done := []byte("Hello"), 100, make(chan bool)
	go func() {
		// Send in the 100 messages.
		for i := 0; i < toSend; i++ {
			if _, err = js.Publish("TEST", msg); err != nil {
				t.Errorf("Unexpected publish error: %v", err)
			}
			time.Sleep(500 * time.Microsecond)
		}
		done <- true
	}()

	breakGW := func() {
		s.gateway.Lock()
		gw := s.gateway.out["C2"]
		s.gateway.Unlock()
		if gw != nil {
			gw.closeConnection(ClientClosed)
		}
	}

	// Wait until about half way through.
	time.Sleep(20 * time.Millisecond)
	// Now break GW connection.
	breakGW()

	// Wait for GW to reform.
	for _, c := range sc.clusters {
		for _, s := range c.servers {
			waitForOutboundGateways(t, s, 1, 2*time.Second)
		}
	}

	select {
	case <-done:
	case <-time.After(2 * time.Second):
		t.Fatalf("Did not complete sending first batch of messages")
	}

	// Make sure we can deal with data loss at the end.
	checkFor(t, 20*time.Second, 250*time.Millisecond, func() error {
		si, err := js.StreamInfo("S")
		if err != nil {
			t.Fatalf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 100 {
			return fmt.Errorf("Expected to have %d messages, got %d", 100, si.State.Msgs)
		}
		return nil
	})

	// Now send 100 more. We will also break the GW here in the middle.
	for i := 0; i < toSend; i++ {
		if _, err = js.Publish("TEST", msg); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
		if i == 50 {
			breakGW()
		}
	}

	// Wait for GW to reform.
	for _, c := range sc.clusters {
		for _, s := range c.servers {
			waitForOutboundGateways(t, s, 1, 2*time.Second)
		}
	}

	si, err := js.StreamInfo("TEST")
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si.State.Msgs != 200 {
		t.Fatalf("Expected to have %d messages, got %d", 200, si.State.Msgs)
	}

	checkFor(t, 10*time.Second, 250*time.Millisecond, func() error {
		si, err := js.StreamInfo("S")
		if err != nil {
			return fmt.Errorf("Unexpected error: %v", err)
		}
		if si.State.Msgs != 200 {
			return fmt.Errorf("Expected to have %d messages, got %d", 200, si.State.Msgs)
		}
		return nil
	})
}

func TestJetStreamSuperClusterLeafNodesWithSharedSystemAccountAndSameDomain(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 2)
	defer sc.shutdown()

	lnc := sc.createLeafNodes("LNC", 2)
	defer lnc.shutdown()

	// We want to make sure there is only one leader and it's always in the supercluster.
	sc.waitOnLeader()

	if ml := lnc.leader(); ml != nil {
		t.Fatalf("Detected a meta-leader in the leafnode cluster: %s", ml)
	}

	// Leafnodes should have been added into the overall peer count.
	sc.waitOnPeerCount(8)

	// Check here that we auto detect sharing system account as well and auto place the correct
	// deny imports and exports.
729 ls := lnc.randomServer() 730 if ls == nil { 731 t.Fatalf("Expected a leafnode server, got none") 732 } 733 gacc := ls.globalAccount().GetName() 734 735 ls.mu.Lock() 736 var hasDE, hasDI bool 737 for _, ln := range ls.leafs { 738 ln.mu.Lock() 739 if ln.leaf.remote.RemoteLeafOpts.LocalAccount == gacc { 740 re := ln.perms.pub.deny.Match(jsAllAPI) 741 hasDE = len(re.psubs)+len(re.qsubs) > 0 742 rs := ln.perms.sub.deny.Match(jsAllAPI) 743 hasDI = len(rs.psubs)+len(rs.qsubs) > 0 744 } 745 ln.mu.Unlock() 746 } 747 ls.mu.Unlock() 748 749 if !hasDE { 750 t.Fatalf("No deny export on global account") 751 } 752 if !hasDI { 753 t.Fatalf("No deny import on global account") 754 } 755 756 // Make a stream by connecting to the leafnode cluster. Make sure placement is correct. 757 // Client based API 758 nc, js := jsClientConnect(t, lnc.randomServer()) 759 defer nc.Close() 760 761 si, err := js.AddStream(&nats.StreamConfig{ 762 Name: "TEST", 763 Subjects: []string{"foo", "bar"}, 764 Replicas: 2, 765 }) 766 if err != nil { 767 t.Fatalf("Unexpected error: %v", err) 768 } 769 if si.Cluster.Name != "LNC" { 770 t.Fatalf("Expected default placement to be %q, got %q", "LNC", si.Cluster.Name) 771 } 772 773 // Now make sure placement also works if we want to place in a cluster in the supercluster. 774 pcn := "C2" 775 si, err = js.AddStream(&nats.StreamConfig{ 776 Name: "TEST2", 777 Subjects: []string{"baz"}, 778 Replicas: 2, 779 Placement: &nats.Placement{Cluster: pcn}, 780 }) 781 if err != nil { 782 t.Fatalf("Unexpected error: %v", err) 783 } 784 if si.Cluster.Name != pcn { 785 t.Fatalf("Expected default placement to be %q, got %q", pcn, si.Cluster.Name) 786 } 787 } 788 789 func TestJetStreamSuperClusterLeafNodesWithSharedSystemAccountAndDifferentDomain(t *testing.T) { 790 sc := createJetStreamSuperCluster(t, 3, 2) 791 defer sc.shutdown() 792 793 lnc := sc.createLeafNodesWithDomain("LNC", 2, "LEAFDOMAIN") 794 defer lnc.shutdown() 795 796 // We want to make sure there is only one leader and its always in the supercluster. 797 sc.waitOnLeader() 798 lnc.waitOnLeader() 799 800 // even though system account is shared, because domains differ, 801 sc.waitOnPeerCount(6) 802 lnc.waitOnPeerCount(2) 803 804 // Check here that we auto detect sharing system account as well and auto place the correct 805 // deny imports and exports. 806 ls := lnc.randomServer() 807 if ls == nil { 808 t.Fatalf("Expected a leafnode server, got none") 809 } 810 gacc := ls.globalAccount().GetName() 811 812 ls.mu.Lock() 813 var hasDE, hasDI bool 814 for _, ln := range ls.leafs { 815 ln.mu.Lock() 816 if ln.leaf.remote.RemoteLeafOpts.LocalAccount == gacc { 817 re := ln.perms.pub.deny.Match(jsAllAPI) 818 hasDE = len(re.psubs)+len(re.qsubs) > 0 819 rs := ln.perms.sub.deny.Match(jsAllAPI) 820 hasDI = len(rs.psubs)+len(rs.qsubs) > 0 821 } 822 ln.mu.Unlock() 823 } 824 ls.mu.Unlock() 825 826 if !hasDE { 827 t.Fatalf("No deny export on global account") 828 } 829 if !hasDI { 830 t.Fatalf("No deny import on global account") 831 } 832 833 // Make a stream by connecting to the leafnode cluster. Make sure placement is correct. 
	// Client based API
	nc, js := jsClientConnect(t, lnc.randomServer())
	defer nc.Close()

	si, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo", "bar"},
		Replicas: 2,
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si.Cluster.Name != "LNC" {
		t.Fatalf("Expected default placement to be %q, got %q", "LNC", si.Cluster.Name)
	}

	// Now make sure placement does not work for a cluster in a different domain.
	pcn := "C2"
	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "TEST2",
		Subjects:  []string{"baz"},
		Replicas:  2,
		Placement: &nats.Placement{Cluster: pcn},
	})
	if err == nil || !strings.Contains(err.Error(), "no suitable peers for placement") {
		t.Fatalf("Expected no suitable peers for placement, got: %v", err)
	}
}

func TestJetStreamSuperClusterSingleLeafNodeWithSharedSystemAccount(t *testing.T) {
	sc := createJetStreamSuperCluster(t, 3, 2)
	defer sc.shutdown()

	ln := sc.createSingleLeafNode(true)
	defer ln.Shutdown()

	// We want to make sure there is only one leader and it's always in the supercluster.
	sc.waitOnLeader()

	// Leafnodes should have been added into the overall peer count.
	sc.waitOnPeerCount(7)

	// Now make sure we can place a stream in the leaf node.
	// First connect to the leafnode server itself.
	nc, js := jsClientConnect(t, ln)
	defer nc.Close()

	si, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST1",
		Subjects: []string{"foo"},
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si.Cluster.Name != "LNS" {
		t.Fatalf("Expected to be placed in leafnode with %q as cluster name, got %q", "LNS", si.Cluster.Name)
	}
	// Now check we can also place a stream here while connected to the hub.
	nc, js = jsClientConnect(t, sc.randomServer())
	defer nc.Close()

	si, err = js.AddStream(&nats.StreamConfig{
		Name:      "TEST2",
		Subjects:  []string{"bar"},
		Placement: &nats.Placement{Cluster: "LNS"},
	})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	if si.Cluster.Name != "LNS" {
		t.Fatalf("Expected to be placed in leafnode with %q as cluster name, got %q", "LNS", si.Cluster.Name)
	}
}

// Issue reported with superclusters and leafnodes where the first few get next requests for pull subscribers
// have the wrong subject.
func TestJetStreamSuperClusterGetNextRewrite(t *testing.T) {
	sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 2, 2)
	defer sc.shutdown()

	// Will connect the leafnode to cluster C1. We will then connect the "client" to cluster C2 to cross gateways.
	ln := sc.clusterForName("C1").createSingleLeafNodeNoSystemAccountAndEnablesJetStreamWithDomain("C", "nojs")
	defer ln.Shutdown()

	c2 := sc.clusterForName("C2")
	nc, js := jsClientConnectEx(t, c2.randomServer(), "C", nats.UserInfo("nojs", "p"))
	defer nc.Close()

	// Create a stream and add messages.
	if _, err := js.AddStream(&nats.StreamConfig{Name: "foo"}); err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	for i := 0; i < 10; i++ {
		if _, err := js.Publish("foo", []byte("ok")); err != nil {
			t.Fatalf("Unexpected publish error: %v", err)
		}
	}

	// Pull messages and make sure subject rewrite works.
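	// (The client uses the "C" domain API prefix, so fetched messages should still be
	// delivered with their original subject "foo" after the get-next subject rewrite.)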
933 sub, err := js.PullSubscribe("foo", "dlc") 934 if err != nil { 935 t.Fatalf("Unexpected error: %v", err) 936 } 937 938 for _, m := range fetchMsgs(t, sub, 5, time.Second) { 939 if m.Subject != "foo" { 940 t.Fatalf("Expected %q as subject but got %q", "foo", m.Subject) 941 } 942 } 943 } 944 945 func TestJetStreamSuperClusterEphemeralCleanup(t *testing.T) { 946 sc := createJetStreamSuperCluster(t, 3, 2) 947 defer sc.shutdown() 948 949 // Create a stream in cluster 0 950 s := sc.clusters[0].randomServer() 951 nc, js := jsClientConnect(t, s) 952 defer nc.Close() 953 954 for _, test := range []struct { 955 name string 956 sourceInCluster int 957 streamName string 958 sourceName string 959 }{ 960 {"local", 0, "TEST1", "S1"}, 961 {"remote", 1, "TEST2", "S2"}, 962 } { 963 t.Run(test.name, func(t *testing.T) { 964 if _, err := js.AddStream(&nats.StreamConfig{Name: test.streamName, Replicas: 3}); err != nil { 965 t.Fatalf("Error adding %q stream: %v", test.streamName, err) 966 } 967 if _, err := js.Publish(test.streamName, []byte("hello")); err != nil { 968 t.Fatalf("Unexpected publish error: %v", err) 969 } 970 971 // Now create a source for that stream, either in same or remote cluster. 972 s2 := sc.clusters[test.sourceInCluster].randomServer() 973 nc2, js2 := jsClientConnect(t, s2) 974 defer nc2.Close() 975 976 if _, err := js2.AddStream(&nats.StreamConfig{ 977 Name: test.sourceName, 978 Storage: nats.FileStorage, 979 Sources: []*nats.StreamSource{{Name: test.streamName}}, 980 Replicas: 1, 981 }); err != nil { 982 t.Fatalf("Error adding source stream: %v", err) 983 } 984 985 // Check that TEST(n) has 1 consumer and that S(n) is created and has 1 message. 986 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 987 si, err := js2.StreamInfo(test.sourceName) 988 if err != nil { 989 return fmt.Errorf("Could not get stream info: %v", err) 990 } 991 if si.State.Msgs != 1 { 992 return fmt.Errorf("Expected 1 msg, got state: %+v", si.State) 993 } 994 return nil 995 }) 996 997 // Get the consumer because we will want to artificially reduce 998 // the delete threshold. 999 leader := sc.clusters[0].streamLeader("$G", test.streamName) 1000 mset, err := leader.GlobalAccount().lookupStream(test.streamName) 1001 if err != nil { 1002 t.Fatalf("Expected to find a stream for %q, got %v", test.streamName, err) 1003 } 1004 cons := mset.getConsumers()[0] 1005 cons.mu.Lock() 1006 cons.dthresh = 1250 * time.Millisecond 1007 active := cons.active 1008 dtimerSet := cons.dtmr != nil 1009 deliver := cons.cfg.DeliverSubject 1010 cons.mu.Unlock() 1011 1012 if !active || dtimerSet { 1013 t.Fatalf("Invalid values for active=%v dtimerSet=%v", active, dtimerSet) 1014 } 1015 // To add to the mix, let's create a local interest on the delivery subject 1016 // and stop it. This is to ensure that this does not stop timers that should 1017 // still be running and monitor the GW interest. 1018 sub := natsSubSync(t, nc, deliver) 1019 natsFlush(t, nc) 1020 natsUnsub(t, sub) 1021 natsFlush(t, nc) 1022 1023 // Now remove the "S(n)" stream... 1024 if err := js2.DeleteStream(test.sourceName); err != nil { 1025 t.Fatalf("Error deleting stream: %v", err) 1026 } 1027 1028 // Now check that the stream S(n) is really removed and that 1029 // the consumer is gone for stream TEST(n). 1030 checkFor(t, 5*time.Second, 25*time.Millisecond, func() error { 1031 // First, make sure that stream S(n) has disappeared. 
1032 if _, err := js2.StreamInfo(test.sourceName); err == nil { 1033 return fmt.Errorf("Stream %q should no longer exist", test.sourceName) 1034 } 1035 if ndc := mset.numDirectConsumers(); ndc != 0 { 1036 return fmt.Errorf("Expected %q stream to have 0 consumers, got %v", test.streamName, ndc) 1037 } 1038 return nil 1039 }) 1040 }) 1041 } 1042 } 1043 1044 func TestJetStreamSuperClusterGetNextSubRace(t *testing.T) { 1045 sc := createJetStreamSuperClusterWithTemplate(t, jsClusterAccountsTempl, 2, 2) 1046 defer sc.shutdown() 1047 1048 // Will connect the leafnode to cluster C1. We will then connect the "client" to cluster C2 to cross gateways. 1049 ln := sc.clusterForName("C1").createSingleLeafNodeNoSystemAccountAndEnablesJetStreamWithDomain("C", "nojs") 1050 defer ln.Shutdown() 1051 1052 // Shutdown 1 of the server from C1, (the one LN is not connected to) 1053 for _, s := range sc.clusterForName("C1").servers { 1054 s.mu.Lock() 1055 if len(s.leafs) == 0 { 1056 s.mu.Unlock() 1057 s.Shutdown() 1058 break 1059 } 1060 s.mu.Unlock() 1061 } 1062 1063 // Wait on meta leader in case shutdown of server above caused an election. 1064 sc.waitOnLeader() 1065 1066 var c2Srv *Server 1067 // Take the server from C2 that has no inbound from C1. 1068 c2 := sc.clusterForName("C2") 1069 for _, s := range c2.servers { 1070 var gwsa [2]*client 1071 gws := gwsa[:0] 1072 s.getInboundGatewayConnections(&gws) 1073 if len(gws) == 0 { 1074 c2Srv = s 1075 break 1076 } 1077 } 1078 if c2Srv == nil { 1079 t.Fatalf("Both servers in C2 had an inbound GW connection!") 1080 } 1081 1082 nc, js := jsClientConnectEx(t, c2Srv, "C", nats.UserInfo("nojs", "p")) 1083 defer nc.Close() 1084 1085 _, err := js.AddStream(&nats.StreamConfig{Name: "foo"}) 1086 require_NoError(t, err) 1087 1088 _, err = js.AddConsumer("foo", &nats.ConsumerConfig{Durable: "dur", AckPolicy: nats.AckExplicitPolicy}) 1089 require_NoError(t, err) 1090 1091 for i := 0; i < 100; i++ { 1092 sendStreamMsg(t, nc, "foo", "ok") 1093 } 1094 1095 // Wait for all messages to appear in the consumer 1096 checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { 1097 ci, err := js.ConsumerInfo("foo", "dur") 1098 if err != nil { 1099 return err 1100 } 1101 if n := ci.NumPending; n != 100 { 1102 return fmt.Errorf("Expected 100 msgs, got %v", n) 1103 } 1104 return nil 1105 }) 1106 1107 req := &JSApiConsumerGetNextRequest{Batch: 1, Expires: 5 * time.Second} 1108 jreq, err := json.Marshal(req) 1109 require_NoError(t, err) 1110 // Create this by hand here to make sure we create the subscription 1111 // on the reply subject for every single request 1112 nextSubj := fmt.Sprintf(JSApiRequestNextT, "foo", "dur") 1113 nextSubj = "$JS.C.API" + strings.TrimPrefix(nextSubj, "$JS.API") 1114 for i := 0; i < 100; i++ { 1115 inbox := nats.NewInbox() 1116 sub := natsSubSync(t, nc, inbox) 1117 natsPubReq(t, nc, nextSubj, inbox, jreq) 1118 msg := natsNexMsg(t, sub, time.Second) 1119 if len(msg.Header) != 0 && string(msg.Data) != "ok" { 1120 t.Fatalf("Unexpected message: header=%+v data=%s", msg.Header, msg.Data) 1121 } 1122 sub.Unsubscribe() 1123 } 1124 } 1125 1126 func TestJetStreamSuperClusterPullConsumerAndHeaders(t *testing.T) { 1127 sc := createJetStreamSuperCluster(t, 3, 2) 1128 defer sc.shutdown() 1129 1130 c1 := sc.clusterForName("C1") 1131 c2 := sc.clusterForName("C2") 1132 1133 nc, js := jsClientConnect(t, c1.randomServer()) 1134 defer nc.Close() 1135 1136 if _, err := js.AddStream(&nats.StreamConfig{Name: "ORIGIN"}); err != nil { 1137 t.Fatalf("Unexpected error: %v", err) 1138 } 
1139 toSend := 50 1140 for i := 0; i < toSend; i++ { 1141 if _, err := js.Publish("ORIGIN", []byte("ok")); err != nil { 1142 t.Fatalf("Unexpected publish error: %v", err) 1143 } 1144 } 1145 1146 nc2, js2 := jsClientConnect(t, c2.randomServer()) 1147 defer nc2.Close() 1148 1149 _, err := js2.AddStream(&nats.StreamConfig{ 1150 Name: "S", 1151 Sources: []*nats.StreamSource{{Name: "ORIGIN"}}, 1152 }) 1153 if err != nil { 1154 t.Fatalf("Unexpected error: %v", err) 1155 } 1156 // Wait for them to be in the sourced stream. 1157 checkFor(t, 5*time.Second, 250*time.Millisecond, func() error { 1158 if si, _ := js2.StreamInfo("S"); si.State.Msgs != uint64(toSend) { 1159 return fmt.Errorf("Expected %d msgs for %q, got %d", toSend, "S", si.State.Msgs) 1160 } 1161 return nil 1162 }) 1163 1164 // Now create a pull consumer for the sourced stream. 1165 _, err = js2.AddConsumer("S", &nats.ConsumerConfig{Durable: "dlc", AckPolicy: nats.AckExplicitPolicy}) 1166 if err != nil { 1167 t.Fatalf("Unexpected error: %v", err) 1168 } 1169 1170 // Now we will connect and request the next message from each server in C1 cluster and check that headers remain in place. 1171 for _, s := range c1.servers { 1172 nc, err := nats.Connect(s.ClientURL()) 1173 if err != nil { 1174 t.Fatalf("Unexpected error: %v", err) 1175 } 1176 defer nc.Close() 1177 m, err := nc.Request("$JS.API.CONSUMER.MSG.NEXT.S.dlc", nil, 2*time.Second) 1178 if err != nil { 1179 t.Fatalf("Unexpected error: %v", err) 1180 } 1181 if len(m.Header) != 1 { 1182 t.Fatalf("Expected 1 header element, got %+v", m.Header) 1183 } 1184 } 1185 } 1186 1187 func TestJetStreamSuperClusterStatszActiveServers(t *testing.T) { 1188 sc := createJetStreamSuperCluster(t, 2, 2) 1189 defer sc.shutdown() 1190 1191 checkActive := func(expected int) { 1192 t.Helper() 1193 checkFor(t, 10*time.Second, 500*time.Millisecond, func() error { 1194 s := sc.randomServer() 1195 nc, err := nats.Connect(s.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 1196 if err != nil { 1197 t.Fatalf("Failed to create system client: %v", err) 1198 } 1199 defer nc.Close() 1200 1201 resp, err := nc.Request(serverStatsPingReqSubj, nil, time.Second) 1202 if err != nil { 1203 t.Fatalf("Unexpected error: %v", err) 1204 } 1205 var ssm ServerStatsMsg 1206 if err := json.Unmarshal(resp.Data, &ssm); err != nil { 1207 t.Fatalf("Unexpected error: %v", err) 1208 } 1209 if ssm.Stats.ActiveServers != expected { 1210 return fmt.Errorf("Wanted %d, got %d", expected, ssm.Stats.ActiveServers) 1211 } 1212 return nil 1213 }) 1214 } 1215 1216 checkActive(4) 1217 c := sc.randomCluster() 1218 ss := c.randomServer() 1219 ss.Shutdown() 1220 checkActive(3) 1221 c.restartServer(ss) 1222 checkActive(4) 1223 } 1224 1225 func TestJetStreamSuperClusterSourceAndMirrorConsumersLeaderChange(t *testing.T) { 1226 sc := createJetStreamSuperCluster(t, 3, 2) 1227 defer sc.shutdown() 1228 1229 c1 := sc.clusterForName("C1") 1230 c2 := sc.clusterForName("C2") 1231 1232 nc, js := jsClientConnect(t, c1.randomServer()) 1233 defer nc.Close() 1234 1235 var sources []*nats.StreamSource 1236 numStreams := 10 1237 1238 for i := 1; i <= numStreams; i++ { 1239 name := fmt.Sprintf("O%d", i) 1240 sources = append(sources, &nats.StreamSource{Name: name}) 1241 if _, err := js.AddStream(&nats.StreamConfig{Name: name}); err != nil { 1242 t.Fatalf("Unexpected error: %v", err) 1243 } 1244 } 1245 1246 // Place our new stream that will source all the others in different cluster. 
1247 nc, js = jsClientConnect(t, c2.randomServer()) 1248 defer nc.Close() 1249 1250 _, err := js.AddStream(&nats.StreamConfig{ 1251 Name: "S", 1252 Replicas: 2, 1253 Sources: sources, 1254 }) 1255 if err != nil { 1256 t.Fatalf("Unexpected error: %v", err) 1257 } 1258 1259 // Force leader change twice. 1260 nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "S"), nil, time.Second) 1261 c2.waitOnStreamLeader("$G", "S") 1262 nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "S"), nil, time.Second) 1263 c2.waitOnStreamLeader("$G", "S") 1264 1265 // Now make sure we only have a single direct consumer on our origin streams. 1266 // Pick one at random. 1267 name := fmt.Sprintf("O%d", rand.Intn(numStreams-1)+1) 1268 c1.waitOnStreamLeader("$G", name) 1269 s := c1.streamLeader("$G", name) 1270 a, err := s.lookupAccount("$G") 1271 if err != nil { 1272 t.Fatalf("Unexpected error: %v", err) 1273 } 1274 mset, err := a.lookupStream(name) 1275 if err != nil { 1276 t.Fatalf("Unexpected error: %v", err) 1277 } 1278 1279 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 1280 if ndc := mset.numDirectConsumers(); ndc != 1 { 1281 return fmt.Errorf("Stream %q wanted 1 direct consumer, got %d", name, ndc) 1282 } 1283 return nil 1284 }) 1285 1286 // Now create a mirror of selected from above. Will test same scenario. 1287 _, err = js.AddStream(&nats.StreamConfig{ 1288 Name: "M", 1289 Replicas: 2, 1290 Mirror: &nats.StreamSource{Name: name}, 1291 }) 1292 if err != nil { 1293 t.Fatalf("Unexpected error: %v", err) 1294 } 1295 // Force leader change twice. 1296 nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "M"), nil, time.Second) 1297 c2.waitOnStreamLeader("$G", "M") 1298 nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "M"), nil, time.Second) 1299 c2.waitOnStreamLeader("$G", "M") 1300 1301 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 1302 if ndc := mset.numDirectConsumers(); ndc != 2 { 1303 return fmt.Errorf("Stream %q wanted 2 direct consumers, got %d", name, ndc) 1304 } 1305 return nil 1306 }) 1307 } 1308 1309 func TestJetStreamSuperClusterPushConsumerInterest(t *testing.T) { 1310 sc := createJetStreamSuperCluster(t, 3, 2) 1311 defer sc.shutdown() 1312 1313 for _, test := range []struct { 1314 name string 1315 queue string 1316 }{ 1317 {"non queue", _EMPTY_}, 1318 {"queue", "queue"}, 1319 } { 1320 t.Run(test.name, func(t *testing.T) { 1321 testInterest := func(s *Server) { 1322 t.Helper() 1323 nc, js := jsClientConnect(t, s) 1324 defer nc.Close() 1325 1326 _, err := js.AddStream(&nats.StreamConfig{ 1327 Name: "TEST", 1328 Subjects: []string{"foo"}, 1329 Replicas: 3, 1330 }) 1331 require_NoError(t, err) 1332 1333 var sub *nats.Subscription 1334 if test.queue != _EMPTY_ { 1335 sub, err = js.QueueSubscribeSync("foo", test.queue) 1336 } else { 1337 sub, err = js.SubscribeSync("foo", nats.Durable("dur")) 1338 } 1339 require_NoError(t, err) 1340 1341 js.Publish("foo", []byte("msg1")) 1342 // Since the GW watcher is checking every 1sec, make sure we are 1343 // giving it enough time for the delivery to start. 
				_, err = sub.NextMsg(2 * time.Second)
				require_NoError(t, err)
			}

			// Create the durable push consumer from cluster "0".
			testInterest(sc.clusters[0].servers[0])

			// Now "move" to a server in cluster "1".
			testInterest(sc.clusters[1].servers[0])
		})
	}
}

func TestJetStreamSuperClusterOverflowPlacement(t *testing.T) {
	sc := createJetStreamSuperClusterWithTemplate(t, jsClusterMaxBytesTempl, 3, 3)
	defer sc.shutdown()

	pcn := "C2"
	s := sc.clusterForName(pcn).randomServer()
	nc, js := jsClientConnect(t, s)
	defer nc.Close()

	// With this setup, we opted in for requiring MaxBytes, so this should error.
	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "foo",
		Replicas: 3,
	})
	require_Error(t, err, NewJSStreamMaxBytesRequiredError())

	// R=2 on purpose to leave one server empty.
	_, err = js.AddStream(&nats.StreamConfig{
		Name:     "foo",
		Replicas: 2,
		MaxBytes: 2 * 1024 * 1024 * 1024,
	})
	require_NoError(t, err)

	// Now try to add another that will overflow the current cluster's reservation.
	// Since we asked explicitly for the same cluster this should fail.
	// Note this will not be testing the peer picker since the update has probably not made it to the meta leader.
	_, err = js.AddStream(&nats.StreamConfig{
		Name:      "bar",
		Replicas:  3,
		MaxBytes:  2 * 1024 * 1024 * 1024,
		Placement: &nats.Placement{Cluster: pcn},
	})
	require_Contains(t, err.Error(), "nats: no suitable peers for placement")
	// Now test actual overflow placement. So try again with no placement designation.
	// This will test the peer picker's logic since they are updated at this point and the meta leader
	// knows it can not place it in C2.
	si, err := js.AddStream(&nats.StreamConfig{
		Name:     "bar",
		Replicas: 3,
		MaxBytes: 2 * 1024 * 1024 * 1024,
	})
	require_NoError(t, err)

	// Make sure we did not get placed into C2.
	falt := si.Cluster.Name
	if falt == pcn {
		t.Fatalf("Expected to be placed in another cluster besides %q, but got %q", pcn, falt)
	}

	// One more time, which should spill over again to our last cluster.
	si, err = js.AddStream(&nats.StreamConfig{
		Name:     "baz",
		Replicas: 3,
		MaxBytes: 2 * 1024 * 1024 * 1024,
	})
	require_NoError(t, err)

	// Make sure we did not get placed into C2 or the first alternative.
	if salt := si.Cluster.Name; salt == pcn || salt == falt {
		t.Fatalf("Expected to be placed in the last cluster besides %q or %q, but got %q", pcn, falt, salt)
	}

	// Now place an R1 stream into C2, which should still have space.
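	// (An R1 stream only reserves MaxBytes on a single peer, and the server left empty
	// by the earlier R=2 placement still has capacity.)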
	si, err = js.AddStream(&nats.StreamConfig{
		Name:     "dlc",
		MaxBytes: 2 * 1024 * 1024 * 1024,
	})
	require_NoError(t, err)

	if si.Cluster.Name != pcn {
		t.Fatalf("Expected to be placed in our origin cluster %q, but got %q", pcn, si.Cluster.Name)
	}
}

func TestJetStreamSuperClusterConcurrentOverflow(t *testing.T) {
	sc := createJetStreamSuperClusterWithTemplate(t, jsClusterMaxBytesTempl, 3, 3)
	defer sc.shutdown()

	pcn := "C2"

	startCh := make(chan bool)
	var wg sync.WaitGroup
	var swg sync.WaitGroup

	start := func(name string) {
		defer wg.Done()

		s := sc.clusterForName(pcn).randomServer()
		nc, js := jsClientConnect(t, s)
		defer nc.Close()

		swg.Done()
		<-startCh

		_, err := js.AddStream(&nats.StreamConfig{
			Name:     name,
			Replicas: 3,
			MaxBytes: 2 * 1024 * 1024 * 1024,
		})
		require_NoError(t, err)
	}
	wg.Add(2)
	swg.Add(2)
	go start("foo")
	go start("bar")
	swg.Wait()
	// Now start both at the same time.
	close(startCh)
	wg.Wait()
}

func TestJetStreamSuperClusterStreamTagPlacement(t *testing.T) {
	sc := createJetStreamTaggedSuperCluster(t)
	defer sc.shutdown()

	placeOK := func(connectCluster string, tags []string, expectedCluster string) {
		t.Helper()
		nc, js := jsClientConnect(t, sc.clusterForName(connectCluster).randomServer())
		defer nc.Close()
		si, err := js.AddStream(&nats.StreamConfig{
			Name:      "TEST",
			Subjects:  []string{"foo"},
			Placement: &nats.Placement{Tags: tags},
		})
		require_NoError(t, err)
		if si.Cluster.Name != expectedCluster {
			t.Fatalf("Failed to place properly in %q, got %q", expectedCluster, si.Cluster.Name)
		}
		js.DeleteStream("TEST")
	}

	placeOK("C2", []string{"cloud:aws"}, "C1")
	placeOK("C2", []string{"country:jp"}, "C3")
	placeOK("C1", []string{"cloud:gcp", "country:uk"}, "C2")

	// Case should not matter.
	placeOK("C1", []string{"cloud:GCP", "country:UK"}, "C2")
	placeOK("C2", []string{"Cloud:AwS", "Country:uS"}, "C1")

	placeErr := func(connectCluster string, tags []string) {
		t.Helper()
		nc, js := jsClientConnect(t, sc.clusterForName(connectCluster).randomServer())
		defer nc.Close()
		_, err := js.AddStream(&nats.StreamConfig{
			Name:      "TEST",
			Subjects:  []string{"foo"},
			Placement: &nats.Placement{Tags: tags},
		})
		require_Contains(t, err.Error(), "no suitable peers for placement", "tags not matched")
		require_Contains(t, err.Error(), tags...)
1508 } 1509 1510 placeErr("C1", []string{"cloud:GCP", "country:US"}) 1511 placeErr("C1", []string{"country:DN"}) 1512 placeErr("C1", []string{"cloud:DO"}) 1513 } 1514 1515 func TestJetStreamSuperClusterRemovedPeersAndStreamsListAndDelete(t *testing.T) { 1516 sc := createJetStreamSuperCluster(t, 3, 3) 1517 defer sc.shutdown() 1518 1519 pcn := "C2" 1520 sc.waitOnLeader() 1521 ml := sc.leader() 1522 if ml.ClusterName() == pcn { 1523 pcn = "C1" 1524 } 1525 1526 // Client based API 1527 nc, js := jsClientConnect(t, ml) 1528 defer nc.Close() 1529 1530 _, err := js.AddStream(&nats.StreamConfig{ 1531 Name: "GONE", 1532 Replicas: 3, 1533 Placement: &nats.Placement{Cluster: pcn}, 1534 }) 1535 require_NoError(t, err) 1536 1537 _, err = js.AddConsumer("GONE", &nats.ConsumerConfig{Durable: "dlc", AckPolicy: nats.AckExplicitPolicy}) 1538 require_NoError(t, err) 1539 1540 _, err = js.AddStream(&nats.StreamConfig{ 1541 Name: "TEST", 1542 Replicas: 3, 1543 Placement: &nats.Placement{Cluster: ml.ClusterName()}, 1544 }) 1545 require_NoError(t, err) 1546 1547 // Put messages in.. 1548 num := 100 1549 for i := 0; i < num; i++ { 1550 js.PublishAsync("GONE", []byte("SLS")) 1551 js.PublishAsync("TEST", []byte("SLS")) 1552 } 1553 select { 1554 case <-js.PublishAsyncComplete(): 1555 case <-time.After(5 * time.Second): 1556 t.Fatalf("Did not receive completion signal") 1557 } 1558 1559 c := sc.clusterForName(pcn) 1560 c.shutdown() 1561 1562 // Grab Stream List.. 1563 start := time.Now() 1564 resp, err := nc.Request(JSApiStreamList, nil, 2*time.Second) 1565 require_NoError(t, err) 1566 if delta := time.Since(start); delta > 100*time.Millisecond { 1567 t.Fatalf("Stream list call took too long to return: %v", delta) 1568 } 1569 var list JSApiStreamListResponse 1570 err = json.Unmarshal(resp.Data, &list) 1571 require_NoError(t, err) 1572 1573 if len(list.Missing) != 1 || list.Missing[0] != "GONE" { 1574 t.Fatalf("Wrong Missing: %+v", list) 1575 } 1576 1577 // Check behavior of stream info as well. We want it to return the stream is offline and not just timeout. 1578 _, err = js.StreamInfo("GONE") 1579 // FIXME(dlc) - Go client not putting nats: prefix on for stream but does for consumer. 1580 require_Error(t, err, NewJSStreamOfflineError(), errors.New("nats: stream is offline")) 1581 1582 // Same for Consumer 1583 start = time.Now() 1584 resp, err = nc.Request("$JS.API.CONSUMER.LIST.GONE", nil, 2*time.Second) 1585 require_NoError(t, err) 1586 if delta := time.Since(start); delta > 100*time.Millisecond { 1587 t.Fatalf("Consumer list call took too long to return: %v", delta) 1588 } 1589 var clist JSApiConsumerListResponse 1590 err = json.Unmarshal(resp.Data, &clist) 1591 require_NoError(t, err) 1592 1593 if len(clist.Missing) != 1 || clist.Missing[0] != "dlc" { 1594 t.Fatalf("Wrong Missing: %+v", clist) 1595 } 1596 1597 _, err = js.ConsumerInfo("GONE", "dlc") 1598 require_Error(t, err, NewJSConsumerOfflineError(), errors.New("nats: consumer is offline")) 1599 1600 // Make sure delete works. 1601 err = js.DeleteConsumer("GONE", "dlc") 1602 require_NoError(t, err) 1603 1604 err = js.DeleteStream("GONE") 1605 require_NoError(t, err) 1606 1607 // Test it is really gone. 
1608 _, err = js.StreamInfo("GONE") 1609 require_Error(t, err, nats.ErrStreamNotFound) 1610 } 1611 1612 func TestJetStreamSuperClusterConsumerDeliverNewBug(t *testing.T) { 1613 sc := createJetStreamSuperCluster(t, 3, 3) 1614 defer sc.shutdown() 1615 1616 pcn := "C2" 1617 sc.waitOnLeader() 1618 ml := sc.leader() 1619 if ml.ClusterName() == pcn { 1620 pcn = "C1" 1621 } 1622 1623 // Client based API 1624 nc, js := jsClientConnect(t, ml) 1625 defer nc.Close() 1626 1627 _, err := js.AddStream(&nats.StreamConfig{ 1628 Name: "T", 1629 Replicas: 3, 1630 Placement: &nats.Placement{Cluster: pcn}, 1631 }) 1632 require_NoError(t, err) 1633 1634 // Put messages in.. 1635 num := 100 1636 for i := 0; i < num; i++ { 1637 js.PublishAsync("T", []byte("OK")) 1638 } 1639 select { 1640 case <-js.PublishAsyncComplete(): 1641 case <-time.After(5 * time.Second): 1642 t.Fatalf("Did not receive completion signal") 1643 } 1644 1645 ci, err := js.AddConsumer("T", &nats.ConsumerConfig{ 1646 Durable: "d", 1647 AckPolicy: nats.AckExplicitPolicy, 1648 DeliverPolicy: nats.DeliverNewPolicy, 1649 }) 1650 require_NoError(t, err) 1651 1652 if ci.Delivered.Consumer != 0 || ci.Delivered.Stream != 100 { 1653 t.Fatalf("Incorrect consumer delivered info: %+v", ci.Delivered) 1654 } 1655 1656 c := sc.clusterForName(pcn) 1657 for _, s := range c.servers { 1658 sd := s.JetStreamConfig().StoreDir 1659 s.Shutdown() 1660 removeDir(t, sd) 1661 s = c.restartServer(s) 1662 c.waitOnServerHealthz(s) 1663 c.waitOnConsumerLeader("$G", "T", "d") 1664 } 1665 1666 c.waitOnConsumerLeader("$G", "T", "d") 1667 ci, err = js.ConsumerInfo("T", "d") 1668 require_NoError(t, err) 1669 1670 if ci.Delivered.Consumer != 0 || ci.Delivered.Stream != 100 { 1671 t.Fatalf("Incorrect consumer delivered info: %+v", ci.Delivered) 1672 } 1673 if ci.NumPending != 0 { 1674 t.Fatalf("Did not expect NumPending, got %d", ci.NumPending) 1675 } 1676 } 1677 1678 // This will test our ability to move streams and consumers between clusters. 1679 func TestJetStreamSuperClusterMovingStreamsAndConsumers(t *testing.T) { 1680 sc := createJetStreamTaggedSuperCluster(t) 1681 defer sc.shutdown() 1682 1683 nc, js := jsClientConnect(t, sc.randomServer()) 1684 defer nc.Close() 1685 1686 for _, test := range []struct { 1687 name string 1688 replicas int 1689 }{ 1690 {"R1", 1}, 1691 {"R3", 3}, 1692 } { 1693 t.Run(test.name, func(t *testing.T) { 1694 replicas := test.replicas 1695 1696 si, err := js.AddStream(&nats.StreamConfig{ 1697 Name: "MOVE", 1698 Replicas: replicas, 1699 Placement: &nats.Placement{Tags: []string{"cloud:aws"}}, 1700 }) 1701 require_NoError(t, err) 1702 defer js.DeleteStream("MOVE") 1703 1704 if si.Cluster.Name != "C1" { 1705 t.Fatalf("Failed to place properly in %q, got %q", "C1", si.Cluster.Name) 1706 } 1707 1708 for i := 0; i < 1000; i++ { 1709 _, err := js.PublishAsync("MOVE", []byte("Moving on up")) 1710 require_NoError(t, err) 1711 } 1712 select { 1713 case <-js.PublishAsyncComplete(): 1714 case <-time.After(5 * time.Second): 1715 t.Fatalf("Did not receive completion signal") 1716 } 1717 1718 // Durable Push Consumer, so same R. 1719 dpushSub, err := js.SubscribeSync("MOVE", nats.Durable("dlc")) 1720 require_NoError(t, err) 1721 defer dpushSub.Unsubscribe() 1722 1723 // Ephemeral Push Consumer, R1. 1724 epushSub, err := js.SubscribeSync("MOVE") 1725 require_NoError(t, err) 1726 defer epushSub.Unsubscribe() 1727 1728 // Durable Pull Consumer, so same R. 
1729 dpullSub, err := js.PullSubscribe("MOVE", "dlc-pull") 1730 require_NoError(t, err) 1731 defer dpullSub.Unsubscribe() 1732 1733 // TODO(dlc) - Server supports ephemeral pulls but Go client does not yet. 1734 1735 si, err = js.StreamInfo("MOVE") 1736 require_NoError(t, err) 1737 if si.State.Consumers != 3 { 1738 t.Fatalf("Expected 3 attached consumers, got %d", si.State.Consumers) 1739 } 1740 1741 initialState := si.State 1742 1743 checkSubsPending(t, dpushSub, int(initialState.Msgs)) 1744 checkSubsPending(t, epushSub, int(initialState.Msgs)) 1745 1746 // Ack 100 1747 toAck := 100 1748 for i := 0; i < toAck; i++ { 1749 m, err := dpushSub.NextMsg(time.Second) 1750 require_NoError(t, err) 1751 m.AckSync() 1752 // Ephemeral 1753 m, err = epushSub.NextMsg(time.Second) 1754 require_NoError(t, err) 1755 m.AckSync() 1756 } 1757 1758 // Do same with pull subscriber. 1759 for _, m := range fetchMsgs(t, dpullSub, toAck, 5*time.Second) { 1760 m.AckSync() 1761 } 1762 1763 // First make sure we disallow move and replica changes in same update. 1764 _, err = js.UpdateStream(&nats.StreamConfig{ 1765 Name: "MOVE", 1766 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 1767 Replicas: replicas + 1, 1768 }) 1769 require_Error(t, err, NewJSStreamMoveAndScaleError()) 1770 1771 // Now move to new cluster. 1772 si, err = js.UpdateStream(&nats.StreamConfig{ 1773 Name: "MOVE", 1774 Replicas: replicas, 1775 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 1776 }) 1777 require_NoError(t, err) 1778 1779 if si.Cluster.Name != "C1" { 1780 t.Fatalf("Expected cluster of %q but got %q", "C1", si.Cluster.Name) 1781 } 1782 1783 // Make sure we can not move an inflight stream and consumers, should error. 1784 _, err = js.UpdateStream(&nats.StreamConfig{ 1785 Name: "MOVE", 1786 Replicas: replicas, 1787 Placement: &nats.Placement{Tags: []string{"cloud:aws"}}, 1788 }) 1789 require_Contains(t, err.Error(), "stream move already in progress") 1790 1791 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 1792 si, err := js.StreamInfo("MOVE", nats.MaxWait(500*time.Millisecond)) 1793 if err != nil { 1794 return err 1795 } 1796 // We should see 2X peers. 1797 numPeers := len(si.Cluster.Replicas) 1798 if si.Cluster.Leader != _EMPTY_ { 1799 numPeers++ 1800 } 1801 if numPeers != 2*replicas { 1802 // The move can happen very quick now, so we might already be done. 1803 if si.Cluster.Name == "C2" { 1804 return nil 1805 } 1806 return fmt.Errorf("Expected to see %d replicas, got %d", 2*replicas, numPeers) 1807 } 1808 return nil 1809 }) 1810 1811 // Expect a new leader to emerge and replicas to drop as a leader is elected. 1812 // We have to check fast or it might complete and we will not see intermediate steps. 1813 sc.waitOnStreamLeader("$G", "MOVE") 1814 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 1815 si, err := js.StreamInfo("MOVE", nats.MaxWait(500*time.Millisecond)) 1816 if err != nil { 1817 return err 1818 } 1819 if len(si.Cluster.Replicas) >= 2*replicas { 1820 return fmt.Errorf("Expected <%d replicas, got %d", 2*replicas, len(si.Cluster.Replicas)) 1821 } 1822 return nil 1823 }) 1824 1825 // Should see the cluster designation and leader switch to C2. 1826 // We should also shrink back down to original replica count. 
1827 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 1828 si, err := js.StreamInfo("MOVE", nats.MaxWait(500*time.Millisecond)) 1829 if err != nil { 1830 return err 1831 } 1832 if si.Cluster.Name != "C2" { 1833 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 1834 } 1835 if si.Cluster.Leader == _EMPTY_ { 1836 return fmt.Errorf("No leader yet") 1837 } else if !strings.HasPrefix(si.Cluster.Leader, "C2-") { 1838 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 1839 } 1840 // Now we want to see that we shrink back to original. 1841 if len(si.Cluster.Replicas) != replicas-1 { 1842 return fmt.Errorf("Expected %d replicas, got %d", replicas-1, len(si.Cluster.Replicas)) 1843 } 1844 return nil 1845 }) 1846 1847 // Check moved state is same as initial state. 1848 si, err = js.StreamInfo("MOVE") 1849 require_NoError(t, err) 1850 1851 if !reflect.DeepEqual(si.State, initialState) { 1852 t.Fatalf("States do not match after migration:\n%+v\nvs\n%+v", si.State, initialState) 1853 } 1854 1855 // Make sure we can still send messages. 1856 addN := toAck 1857 for i := 0; i < addN; i++ { 1858 _, err := js.Publish("MOVE", []byte("Done Moved")) 1859 require_NoError(t, err) 1860 } 1861 1862 si, err = js.StreamInfo("MOVE") 1863 require_NoError(t, err) 1864 1865 expectedPushMsgs := initialState.Msgs + uint64(addN) 1866 expectedPullMsgs := uint64(addN) 1867 1868 if si.State.Msgs != expectedPushMsgs { 1869 t.Fatalf("Expected to be able to send new messages") 1870 } 1871 1872 // Now check consumers, make sure the state is correct and that they transferred state and reflect the new messages. 1873 // We Ack'd 100 and sent another 100, so should be same. 1874 checkConsumer := func(sub *nats.Subscription, isPull bool) { 1875 t.Helper() 1876 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1877 ci, err := sub.ConsumerInfo() 1878 if err != nil { 1879 return err 1880 } 1881 var expectedMsgs uint64 1882 if isPull { 1883 expectedMsgs = expectedPullMsgs 1884 } else { 1885 expectedMsgs = expectedPushMsgs 1886 } 1887 1888 if ci.Delivered.Consumer != expectedMsgs || ci.Delivered.Stream != expectedMsgs { 1889 return fmt.Errorf("Delivered for %q is not correct: %+v", ci.Name, ci.Delivered) 1890 } 1891 if ci.AckFloor.Consumer != uint64(toAck) || ci.AckFloor.Stream != uint64(toAck) { 1892 return fmt.Errorf("AckFloor for %q is not correct: %+v", ci.Name, ci.AckFloor) 1893 } 1894 if isPull && ci.NumAckPending != 0 { 1895 return fmt.Errorf("NumAckPending for %q is not correct: %v", ci.Name, ci.NumAckPending) 1896 } else if !isPull && ci.NumAckPending != int(initialState.Msgs) { 1897 return fmt.Errorf("NumAckPending for %q is not correct: %v", ci.Name, ci.NumAckPending) 1898 } 1899 // Make sure the replicas etc are back to what is expected. 1900 si, err := js.StreamInfo("MOVE") 1901 if err != nil { 1902 return err 1903 } 1904 numExpected := si.Config.Replicas 1905 if ci.Config.Durable == _EMPTY_ { 1906 numExpected = 1 1907 } 1908 numPeers := len(ci.Cluster.Replicas) 1909 if ci.Cluster.Leader != _EMPTY_ { 1910 numPeers++ 1911 } 1912 if numPeers != numExpected { 1913 return fmt.Errorf("Expected %d peers, got %d", numExpected, numPeers) 1914 } 1915 // If we are push check sub pending. 
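// For push consumers, every message that was delivered but not acked should
// still be pending on the subscription: everything published so far minus
// the 100 we acked before the move.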
1916 if !isPull { 1917 checkSubsPending(t, sub, int(expectedPushMsgs)-toAck) 1918 } 1919 return nil 1920 }) 1921 } 1922 1923 checkPushConsumer := func(sub *nats.Subscription) { 1924 t.Helper() 1925 checkConsumer(sub, false) 1926 } 1927 checkPullConsumer := func(sub *nats.Subscription) { 1928 t.Helper() 1929 checkConsumer(sub, true) 1930 } 1931 1932 checkPushConsumer(dpushSub) 1933 checkPushConsumer(epushSub) 1934 checkPullConsumer(dpullSub) 1935 1936 // Cleanup 1937 err = js.DeleteStream("MOVE") 1938 require_NoError(t, err) 1939 }) 1940 } 1941 } 1942 1943 func TestJetStreamSuperClusterMovingStreamsWithMirror(t *testing.T) { 1944 sc := createJetStreamTaggedSuperCluster(t) 1945 defer sc.shutdown() 1946 1947 nc, js := jsClientConnect(t, sc.randomServer()) 1948 defer nc.Close() 1949 1950 _, err := js.AddStream(&nats.StreamConfig{ 1951 Name: "SOURCE", 1952 Subjects: []string{"foo", "bar"}, 1953 Replicas: 3, 1954 Placement: &nats.Placement{Tags: []string{"cloud:aws"}}, 1955 }) 1956 require_NoError(t, err) 1957 1958 _, err = js.AddStream(&nats.StreamConfig{ 1959 Name: "MIRROR", 1960 Replicas: 1, 1961 Mirror: &nats.StreamSource{Name: "SOURCE"}, 1962 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 1963 }) 1964 require_NoError(t, err) 1965 1966 done := make(chan struct{}) 1967 exited := make(chan struct{}) 1968 errors := make(chan error, 1) 1969 1970 numNoResp := uint64(0) 1971 1972 // We will run a separate routine and send at 100hz 1973 go func() { 1974 nc, js := jsClientConnect(t, sc.randomServer()) 1975 defer nc.Close() 1976 1977 defer close(exited) 1978 1979 for { 1980 select { 1981 case <-done: 1982 return 1983 case <-time.After(10 * time.Millisecond): 1984 _, err := js.Publish("foo", []byte("100HZ")) 1985 if err == nil { 1986 } else if err == nats.ErrNoStreamResponse { 1987 atomic.AddUint64(&numNoResp, 1) 1988 continue 1989 } 1990 if err != nil { 1991 errors <- err 1992 return 1993 } 1994 } 1995 } 1996 }() 1997 1998 // Let it get going. 1999 time.Sleep(500 * time.Millisecond) 2000 2001 // Now move the source to a new cluster. 2002 _, err = js.UpdateStream(&nats.StreamConfig{ 2003 Name: "SOURCE", 2004 Subjects: []string{"foo", "bar"}, 2005 Replicas: 3, 2006 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 2007 }) 2008 require_NoError(t, err) 2009 2010 checkFor(t, 30*time.Second, 100*time.Millisecond, func() error { 2011 si, err := js.StreamInfo("SOURCE") 2012 if err != nil { 2013 return err 2014 } 2015 if si.Cluster.Name != "C2" { 2016 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 2017 } 2018 if si.Cluster.Leader == _EMPTY_ { 2019 return fmt.Errorf("No leader yet") 2020 } else if !strings.HasPrefix(si.Cluster.Leader, "C2-") { 2021 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 2022 } 2023 // Now we want to see that we shrink back to original. 2024 if len(si.Cluster.Replicas) != 2 { 2025 return fmt.Errorf("Expected %d replicas, got %d", 2, len(si.Cluster.Replicas)) 2026 } 2027 // Let's get to 50+ msgs. 
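// With the background publisher running at roughly 100 msgs/sec, the stream
// should comfortably pass 50 messages while the move completes, showing that
// publishes kept flowing during the migration.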
2028 if si.State.Msgs < 50 { 2029 return fmt.Errorf("Only see %d msgs", si.State.Msgs) 2030 } 2031 return nil 2032 }) 2033 2034 close(done) 2035 <-exited 2036 2037 if nnr := atomic.LoadUint64(&numNoResp); nnr > 0 { 2038 if nnr > 5 { 2039 t.Fatalf("Expected no or very few failed message publishes, got %d", nnr) 2040 } else { 2041 t.Logf("Got a few failed publishes: %d", nnr) 2042 } 2043 } 2044 2045 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 2046 si, err := js.StreamInfo("SOURCE") 2047 require_NoError(t, err) 2048 mi, err := js.StreamInfo("MIRROR") 2049 require_NoError(t, err) 2050 2051 if !reflect.DeepEqual(si.State, mi.State) { 2052 return fmt.Errorf("Expected mirror to be the same, got %+v vs %+v", mi.State, si.State) 2053 } 2054 return nil 2055 }) 2056 2057 } 2058 2059 func TestJetStreamSuperClusterMovingStreamAndMoveBack(t *testing.T) { 2060 sc := createJetStreamTaggedSuperCluster(t) 2061 defer sc.shutdown() 2062 2063 nc, js := jsClientConnect(t, sc.randomServer()) 2064 defer nc.Close() 2065 2066 for _, test := range []struct { 2067 name string 2068 replicas int 2069 }{ 2070 {"R1", 1}, 2071 {"R3", 3}, 2072 } { 2073 t.Run(test.name, func(t *testing.T) { 2074 js.DeleteStream("TEST") 2075 2076 _, err := js.AddStream(&nats.StreamConfig{ 2077 Name: "TEST", 2078 Replicas: test.replicas, 2079 Placement: &nats.Placement{Tags: []string{"cloud:aws"}}, 2080 }) 2081 require_NoError(t, err) 2082 2083 toSend := 10_000 2084 for i := 0; i < toSend; i++ { 2085 _, err := js.Publish("TEST", []byte("HELLO WORLD")) 2086 require_NoError(t, err) 2087 } 2088 2089 _, err = js.UpdateStream(&nats.StreamConfig{ 2090 Name: "TEST", 2091 Replicas: test.replicas, 2092 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 2093 }) 2094 require_NoError(t, err) 2095 2096 checkMove := func(cluster string) { 2097 t.Helper() 2098 sc.waitOnStreamLeader("$G", "TEST") 2099 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 2100 si, err := js.StreamInfo("TEST") 2101 if err != nil { 2102 return err 2103 } 2104 if si.Cluster.Name != cluster { 2105 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 2106 } 2107 if si.Cluster.Leader == _EMPTY_ { 2108 return fmt.Errorf("No leader yet") 2109 } else if !strings.HasPrefix(si.Cluster.Leader, cluster) { 2110 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 2111 } 2112 // Now we want to see that we shrink back to original. 2113 if len(si.Cluster.Replicas) != test.replicas-1 { 2114 return fmt.Errorf("Expected %d replicas, got %d", test.replicas-1, len(si.Cluster.Replicas)) 2115 } 2116 if si.State.Msgs != uint64(toSend) { 2117 return fmt.Errorf("Only see %d msgs", si.State.Msgs) 2118 } 2119 return nil 2120 }) 2121 } 2122 2123 checkMove("C2") 2124 2125 _, err = js.UpdateStream(&nats.StreamConfig{ 2126 Name: "TEST", 2127 Replicas: test.replicas, 2128 Placement: &nats.Placement{Tags: []string{"cloud:aws"}}, 2129 }) 2130 require_NoError(t, err) 2131 2132 checkMove("C1") 2133 }) 2134 } 2135 } 2136 2137 func TestJetStreamSuperClusterImportConsumerStreamSubjectRemap(t *testing.T) { 2138 template := ` 2139 listen: 127.0.0.1:-1 2140 server_name: %s 2141 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, domain: HUB, store_dir: '%s'} 2142 2143 cluster { 2144 name: %s 2145 listen: 127.0.0.1:%d 2146 routes = [%s] 2147 } 2148 2149 accounts: { 2150 JS: { 2151 jetstream: enabled 2152 users: [ {user: js, password: pwd} ] 2153 exports [ 2154 # This is streaming to a delivery subject for a push based consumer. 
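# The importing account subscribes on these deliver subjects; the test below
# verifies the delivered message still carries the original stream subject
# after crossing the account boundary.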
2155 { stream: "deliver.ORDERS.*" } 2156 # This is to ack received messages. This is a service to support sync ack. 2157 { service: "$JS.ACK.ORDERS.*.>" } 2158 # To support ordered consumers, flow control. 2159 { service: "$JS.FC.>" } 2160 ] 2161 }, 2162 IM: { 2163 users: [ {user: im, password: pwd} ] 2164 imports [ 2165 { stream: { account: JS, subject: "deliver.ORDERS.route" }} 2166 { stream: { account: JS, subject: "deliver.ORDERS.gateway" }} 2167 { stream: { account: JS, subject: "deliver.ORDERS.leaf1" }} 2168 { stream: { account: JS, subject: "deliver.ORDERS.leaf2" }} 2169 { service: {account: JS, subject: "$JS.FC.>" }} 2170 ] 2171 }, 2172 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] }, 2173 } 2174 leaf { 2175 listen: 127.0.0.1:-1 2176 }` 2177 2178 test := func(t *testing.T, queue bool) { 2179 c := createJetStreamSuperClusterWithTemplate(t, template, 3, 2) 2180 defer c.shutdown() 2181 2182 s := c.randomServer() 2183 nc, js := jsClientConnect(t, s, nats.UserInfo("js", "pwd")) 2184 defer nc.Close() 2185 2186 _, err := js.AddStream(&nats.StreamConfig{ 2187 Name: "ORDERS", 2188 Subjects: []string{"foo"}, // The JS subject. 2189 Replicas: 3, 2190 Placement: &nats.Placement{Cluster: "C1"}, 2191 }) 2192 require_NoError(t, err) 2193 2194 _, err = js.Publish("foo", []byte("OK")) 2195 require_NoError(t, err) 2196 2197 for dur, deliver := range map[string]string{ 2198 "dur-route": "deliver.ORDERS.route", 2199 "dur-gateway": "deliver.ORDERS.gateway", 2200 "dur-leaf-1": "deliver.ORDERS.leaf1", 2201 "dur-leaf-2": "deliver.ORDERS.leaf2", 2202 } { 2203 cfg := &nats.ConsumerConfig{ 2204 Durable: dur, 2205 DeliverSubject: deliver, 2206 AckPolicy: nats.AckExplicitPolicy, 2207 } 2208 if queue { 2209 cfg.DeliverGroup = "queue" 2210 } 2211 _, err = js.AddConsumer("ORDERS", cfg) 2212 require_NoError(t, err) 2213 } 2214 2215 testCase := func(t *testing.T, s *Server, dSubj string) { 2216 nc2, err := nats.Connect(s.ClientURL(), nats.UserInfo("im", "pwd")) 2217 require_NoError(t, err) 2218 defer nc2.Close() 2219 2220 var sub *nats.Subscription 2221 if queue { 2222 sub, err = nc2.QueueSubscribeSync(dSubj, "queue") 2223 } else { 2224 sub, err = nc2.SubscribeSync(dSubj) 2225 } 2226 require_NoError(t, err) 2227 2228 m, err := sub.NextMsg(time.Second) 2229 require_NoError(t, err) 2230 2231 if m.Subject != "foo" { 2232 t.Fatalf("Subject not mapped correctly across account boundary, expected %q got %q", "foo", m.Subject) 2233 } 2234 require_False(t, strings.Contains(m.Reply, "@")) 2235 } 2236 2237 t.Run("route", func(t *testing.T) { 2238 // pick random non consumer leader so we receive via route 2239 s := c.clusterForName("C1").randomNonConsumerLeader("JS", "ORDERS", "dur-route") 2240 testCase(t, s, "deliver.ORDERS.route") 2241 }) 2242 t.Run("gateway", func(t *testing.T) { 2243 // pick server with inbound gateway from consumer leader, so we receive from gateway and have no route in between 2244 scl := c.clusterForName("C1").consumerLeader("JS", "ORDERS", "dur-gateway") 2245 var sfound *Server 2246 for _, s := range c.clusterForName("C2").servers { 2247 s.mu.Lock() 2248 for _, c := range s.gateway.in { 2249 if c.GetName() == scl.info.ID { 2250 sfound = s 2251 break 2252 } 2253 } 2254 s.mu.Unlock() 2255 if sfound != nil { 2256 break 2257 } 2258 } 2259 testCase(t, sfound, "deliver.ORDERS.gateway") 2260 }) 2261 t.Run("leaf-post-export", func(t *testing.T) { 2262 // create leaf node server connected post export/import 2263 scl := c.clusterForName("C1").consumerLeader("JS", "ORDERS", "dur-leaf-1") 2264 cf := 
createConfFile(t, []byte(fmt.Sprintf(` 2265 port: -1 2266 leafnodes { 2267 remotes [ { url: "nats://im:pwd@127.0.0.1:%d" } ] 2268 } 2269 authorization: { 2270 user: im, 2271 password: pwd 2272 } 2273 `, scl.getOpts().LeafNode.Port))) 2274 s, _ := RunServerWithConfig(cf) 2275 defer s.Shutdown() 2276 checkLeafNodeConnected(t, scl) 2277 testCase(t, s, "deliver.ORDERS.leaf1") 2278 }) 2279 t.Run("leaf-pre-export", func(t *testing.T) { 2280 // create leaf node server connected pre export, perform export/import on leaf node server 2281 scl := c.clusterForName("C1").consumerLeader("JS", "ORDERS", "dur-leaf-2") 2282 cf := createConfFile(t, []byte(fmt.Sprintf(` 2283 port: -1 2284 leafnodes { 2285 remotes [ { url: "nats://js:pwd@127.0.0.1:%d", account: JS2 } ] 2286 } 2287 accounts: { 2288 JS2: { 2289 users: [ {user: js, password: pwd} ] 2290 exports [ 2291 # This is streaming to a delivery subject for a push based consumer. 2292 { stream: "deliver.ORDERS.leaf2" } 2293 # This is to ack received messages. This is a service to support sync ack. 2294 { service: "$JS.ACK.ORDERS.*.>" } 2295 # To support ordered consumers, flow control. 2296 { service: "$JS.FC.>" } 2297 ] 2298 }, 2299 IM2: { 2300 users: [ {user: im, password: pwd} ] 2301 imports [ 2302 { stream: { account: JS2, subject: "deliver.ORDERS.leaf2" }} 2303 { service: {account: JS2, subject: "$JS.FC.>" }} 2304 ] 2305 }, 2306 } 2307 `, scl.getOpts().LeafNode.Port))) 2308 s, _ := RunServerWithConfig(cf) 2309 defer s.Shutdown() 2310 checkLeafNodeConnected(t, scl) 2311 testCase(t, s, "deliver.ORDERS.leaf2") 2312 }) 2313 } 2314 2315 t.Run("noQueue", func(t *testing.T) { 2316 test(t, false) 2317 }) 2318 t.Run("queue", func(t *testing.T) { 2319 test(t, true) 2320 }) 2321 } 2322 2323 func TestJetStreamSuperClusterMaxHaAssets(t *testing.T) { 2324 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, ` 2325 listen: 127.0.0.1:-1 2326 server_name: %s 2327 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s', limits: {max_ha_assets: 2}} 2328 cluster { 2329 name: %s 2330 listen: 127.0.0.1:%d 2331 routes = [%s] 2332 } 2333 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" 
} ] } } 2334 `, 3, 2, 2335 func(serverName, clusterName, storeDir, conf string) string { 2336 return conf 2337 }, nil) 2338 defer sc.shutdown() 2339 2340 // speed up statsz reporting 2341 for _, c := range sc.clusters { 2342 for _, s := range c.servers { 2343 s.mu.Lock() 2344 s.sys.statsz = 10 * time.Millisecond 2345 s.sys.cstatsz = s.sys.statsz 2346 s.sys.stmr.Reset(s.sys.statsz) 2347 s.mu.Unlock() 2348 } 2349 } 2350 2351 nc, js := jsClientConnect(t, sc.randomServer()) 2352 defer nc.Close() 2353 2354 ncSys := natsConnect(t, sc.randomServer().ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 2355 defer ncSys.Close() 2356 statszSub, err := ncSys.SubscribeSync(fmt.Sprintf(serverStatsSubj, "*")) 2357 require_NoError(t, err) 2358 require_NoError(t, ncSys.Flush()) 2359 2360 waitStatsz := func(peers, haassets int) { 2361 t.Helper() 2362 for peersWithExactHaAssets := 0; peersWithExactHaAssets < peers; { 2363 m, err := statszSub.NextMsg(time.Second) 2364 require_NoError(t, err) 2365 var statsz ServerStatsMsg 2366 err = json.Unmarshal(m.Data, &statsz) 2367 require_NoError(t, err) 2368 if statsz.Stats.JetStream == nil { 2369 continue 2370 } 2371 if haassets == statsz.Stats.JetStream.Stats.HAAssets { 2372 peersWithExactHaAssets++ 2373 } 2374 } 2375 } 2376 waitStatsz(6, 1) // counts _meta_ 2377 _, err = js.AddStream(&nats.StreamConfig{Name: "S0", Replicas: 1, Placement: &nats.Placement{Cluster: "C1"}}) 2378 require_NoError(t, err) 2379 waitStatsz(6, 1) 2380 _, err = js.AddStream(&nats.StreamConfig{Name: "S1", Replicas: 3, Placement: &nats.Placement{Cluster: "C1"}}) 2381 require_NoError(t, err) 2382 waitStatsz(3, 2) 2383 waitStatsz(3, 1) 2384 _, err = js.AddStream(&nats.StreamConfig{Name: "S2", Replicas: 3, Placement: &nats.Placement{Cluster: "C1"}}) 2385 require_NoError(t, err) 2386 waitStatsz(3, 3) 2387 waitStatsz(3, 1) 2388 _, err = js.AddStream(&nats.StreamConfig{Name: "S3", Replicas: 3, Placement: &nats.Placement{Cluster: "C1"}}) 2389 require_Error(t, err) 2390 require_Contains(t, err.Error(), "nats: no suitable peers for placement") 2391 require_Contains(t, err.Error(), "miscellaneous issue") 2392 require_NoError(t, js.DeleteStream("S1")) 2393 waitStatsz(3, 2) 2394 waitStatsz(3, 1) 2395 _, err = js.AddConsumer("S2", &nats.ConsumerConfig{Durable: "DUR1", AckPolicy: nats.AckExplicitPolicy}) 2396 require_NoError(t, err) 2397 waitStatsz(3, 3) 2398 waitStatsz(3, 1) 2399 _, err = js.AddConsumer("S2", &nats.ConsumerConfig{Durable: "DUR2", AckPolicy: nats.AckExplicitPolicy}) 2400 require_Error(t, err) 2401 require_Equal(t, err.Error(), "nats: insufficient resources") 2402 _, err = js.AddConsumer("S2", &nats.ConsumerConfig{AckPolicy: nats.AckExplicitPolicy}) 2403 require_NoError(t, err) 2404 waitStatsz(3, 3) 2405 waitStatsz(3, 1) 2406 _, err = js.UpdateStream(&nats.StreamConfig{Name: "S2", Replicas: 3, Description: "foobar"}) 2407 require_NoError(t, err) 2408 waitStatsz(3, 3) 2409 waitStatsz(3, 1) 2410 si, err := js.AddStream(&nats.StreamConfig{Name: "S4", Replicas: 3}) 2411 require_NoError(t, err) 2412 require_Equal(t, si.Cluster.Name, "C2") 2413 waitStatsz(3, 3) 2414 waitStatsz(3, 2) 2415 si, err = js.AddStream(&nats.StreamConfig{Name: "S5", Replicas: 3}) 2416 require_NoError(t, err) 2417 require_Equal(t, si.Cluster.Name, "C2") 2418 waitStatsz(6, 3) 2419 _, err = js.AddConsumer("S4", &nats.ConsumerConfig{Durable: "DUR2", AckPolicy: nats.AckExplicitPolicy}) 2420 require_Error(t, err) 2421 require_Equal(t, err.Error(), "nats: insufficient resources") 2422 _, err = js.UpdateStream(&nats.StreamConfig{Name: 
"S2", Replicas: 3, Placement: &nats.Placement{Cluster: "C2"}}) 2423 require_Error(t, err) 2424 require_Contains(t, err.Error(), "nats: no suitable peers for placement") 2425 require_Contains(t, err.Error(), "miscellaneous issue") 2426 } 2427 2428 func TestJetStreamSuperClusterStreamAlternates(t *testing.T) { 2429 sc := createJetStreamTaggedSuperCluster(t) 2430 defer sc.shutdown() 2431 2432 nc, js := jsClientConnect(t, sc.randomServer()) 2433 defer nc.Close() 2434 2435 // C1 2436 _, err := js.AddStream(&nats.StreamConfig{ 2437 Name: "SOURCE", 2438 Subjects: []string{"foo", "bar", "baz"}, 2439 Replicas: 3, 2440 Placement: &nats.Placement{Tags: []string{"cloud:aws", "country:us"}}, 2441 }) 2442 require_NoError(t, err) 2443 2444 // C2 2445 _, err = js.AddStream(&nats.StreamConfig{ 2446 Name: "MIRROR-1", 2447 Replicas: 1, 2448 Mirror: &nats.StreamSource{Name: "SOURCE"}, 2449 Placement: &nats.Placement{Tags: []string{"cloud:gcp", "country:uk"}}, 2450 }) 2451 require_NoError(t, err) 2452 2453 // C3 2454 _, err = js.AddStream(&nats.StreamConfig{ 2455 Name: "MIRROR-2", 2456 Replicas: 2, 2457 Mirror: &nats.StreamSource{Name: "SOURCE"}, 2458 Placement: &nats.Placement{Tags: []string{"cloud:az", "country:jp"}}, 2459 }) 2460 require_NoError(t, err) 2461 2462 // No client support yet, so do by hand. 2463 getStreamInfo := func(nc *nats.Conn, expected string) { 2464 t.Helper() 2465 resp, err := nc.Request(fmt.Sprintf(JSApiStreamInfoT, "SOURCE"), nil, time.Second) 2466 require_NoError(t, err) 2467 var si StreamInfo 2468 err = json.Unmarshal(resp.Data, &si) 2469 require_NoError(t, err) 2470 require_True(t, len(si.Alternates) == 3) 2471 require_True(t, si.Alternates[0].Cluster == expected) 2472 seen := make(map[string]struct{}) 2473 for _, alt := range si.Alternates { 2474 seen[alt.Cluster] = struct{}{} 2475 } 2476 require_True(t, len(seen) == 3) 2477 } 2478 2479 // Connect to different clusters to check ordering. 2480 nc, _ = jsClientConnect(t, sc.clusterForName("C1").randomServer()) 2481 defer nc.Close() 2482 getStreamInfo(nc, "C1") 2483 nc, _ = jsClientConnect(t, sc.clusterForName("C2").randomServer()) 2484 defer nc.Close() 2485 getStreamInfo(nc, "C2") 2486 nc, _ = jsClientConnect(t, sc.clusterForName("C3").randomServer()) 2487 defer nc.Close() 2488 getStreamInfo(nc, "C3") 2489 } 2490 2491 // We had a scenario where a consumer would not recover properly on restart due to 2492 // the cluster state not being set properly when checking source subjects. 2493 func TestJetStreamSuperClusterStateOnRestartPreventsConsumerRecovery(t *testing.T) { 2494 sc := createJetStreamTaggedSuperCluster(t) 2495 defer sc.shutdown() 2496 2497 nc, js := jsClientConnect(t, sc.randomServer()) 2498 defer nc.Close() 2499 2500 // C1 2501 _, err := js.AddStream(&nats.StreamConfig{ 2502 Name: "SOURCE", 2503 Subjects: []string{"foo", "bar"}, 2504 Replicas: 3, 2505 Placement: &nats.Placement{Tags: []string{"cloud:aws", "country:us"}}, 2506 }) 2507 require_NoError(t, err) 2508 2509 // C2 2510 _, err = js.AddStream(&nats.StreamConfig{ 2511 Name: "DS", 2512 Subjects: []string{"baz"}, 2513 Replicas: 3, 2514 Sources: []*nats.StreamSource{{Name: "SOURCE"}}, 2515 Placement: &nats.Placement{Tags: []string{"cloud:gcp", "country:uk"}}, 2516 }) 2517 require_NoError(t, err) 2518 2519 // Bind to DS and match filter subject of SOURCE. 
2520 _, err = js.AddConsumer("DS", &nats.ConsumerConfig{ 2521 Durable: "dlc", 2522 AckPolicy: nats.AckExplicitPolicy, 2523 FilterSubject: "foo", 2524 DeliverSubject: "d", 2525 }) 2526 require_NoError(t, err) 2527 2528 // Send a few messages. 2529 for i := 0; i < 100; i++ { 2530 _, err := js.Publish("foo", []byte("HELLO")) 2531 require_NoError(t, err) 2532 } 2533 sub := natsSubSync(t, nc, "d") 2534 natsNexMsg(t, sub, time.Second) 2535 2536 c := sc.clusterForName("C2") 2537 cl := c.consumerLeader("$G", "DS", "dlc") 2538 2539 // Pull source out from underneath the downstream stream. 2540 err = js.DeleteStream("SOURCE") 2541 require_NoError(t, err) 2542 2543 cl.Shutdown() 2544 cl = c.restartServer(cl) 2545 c.waitOnServerHealthz(cl) 2546 2547 // Now make sure the consumer is still on this server and has restarted properly. 2548 mset, err := cl.GlobalAccount().lookupStream("DS") 2549 require_NoError(t, err) 2550 if o := mset.lookupConsumer("dlc"); o == nil { 2551 t.Fatalf("Consumer was not properly restarted") 2552 } 2553 } 2554 2555 // We allow mirrors to opt-in to direct get in a distributed queue group. 2556 func TestJetStreamSuperClusterStreamDirectGetMirrorQueueGroup(t *testing.T) { 2557 sc := createJetStreamTaggedSuperCluster(t) 2558 defer sc.shutdown() 2559 2560 nc, js := jsClientConnect(t, sc.randomServer()) 2561 defer nc.Close() 2562 2563 // C1 2564 // Do by hand for now. 2565 cfg := &StreamConfig{ 2566 Name: "SOURCE", 2567 Subjects: []string{"kv.>"}, 2568 MaxMsgsPer: 1, 2569 Placement: &Placement{Tags: []string{"cloud:aws", "country:us"}}, 2570 AllowDirect: true, 2571 Replicas: 3, 2572 Storage: MemoryStorage, 2573 } 2574 addStream(t, nc, cfg) 2575 2576 num := 100 2577 for i := 0; i < num; i++ { 2578 js.PublishAsync(fmt.Sprintf("kv.%d", i), []byte("VAL")) 2579 } 2580 select { 2581 case <-js.PublishAsyncComplete(): 2582 case <-time.After(5 * time.Second): 2583 t.Fatalf("Did not receive completion signal") 2584 } 2585 2586 // C2 2587 cfg = &StreamConfig{ 2588 Name: "M1", 2589 Mirror: &StreamSource{Name: "SOURCE"}, 2590 Placement: &Placement{Tags: []string{"cloud:gcp", "country:uk"}}, 2591 MirrorDirect: true, 2592 Storage: MemoryStorage, 2593 } 2594 addStream(t, nc, cfg) 2595 2596 // C3 (clustered) 2597 cfg = &StreamConfig{ 2598 Name: "M2", 2599 Mirror: &StreamSource{Name: "SOURCE"}, 2600 Replicas: 3, 2601 Placement: &Placement{Tags: []string{"country:jp"}}, 2602 MirrorDirect: true, 2603 Storage: MemoryStorage, 2604 } 2605 addStream(t, nc, cfg) 2606 2607 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 2608 si, err := js.StreamInfo("M2") 2609 require_NoError(t, err) 2610 if si.State.Msgs != uint64(num) { 2611 return fmt.Errorf("Expected %d msgs, got state: %d", num, si.State.Msgs) 2612 } 2613 return nil 2614 }) 2615 2616 // Since last one was an R3, check and wait for the direct subscription. 2617 checkFor(t, 2*time.Second, 100*time.Millisecond, func() error { 2618 sl := sc.clusterForName("C3").streamLeader("$G", "M2") 2619 if mset, err := sl.GlobalAccount().lookupStream("M2"); err == nil { 2620 mset.mu.RLock() 2621 ok := mset.mirror.dsub != nil 2622 mset.mu.RUnlock() 2623 if ok { 2624 return nil 2625 } 2626 } 2627 return fmt.Errorf("No dsub yet") 2628 }) 2629 2630 // Always do a direct get to the source, but check that we are getting answers from the mirrors when connected to their cluster. 
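// The JSStream header in the reply tells us which stream actually served the
// direct get (SOURCE, M1 or M2), so we can confirm the co-located mirror
// answered when connected to its cluster.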
2631 getSubj := fmt.Sprintf(JSDirectMsgGetT, "SOURCE") 2632 req := []byte(`{"last_by_subj":"kv.22"}`) 2633 getMsg := func(c *nats.Conn) *nats.Msg { 2634 m, err := c.Request(getSubj, req, time.Second) 2635 require_NoError(t, err) 2636 require_True(t, string(m.Data) == "VAL") 2637 require_True(t, m.Header.Get(JSSequence) == "23") 2638 require_True(t, m.Header.Get(JSSubject) == "kv.22") 2639 return m 2640 } 2641 2642 // C1 -> SOURCE 2643 nc, _ = jsClientConnect(t, sc.clusterForName("C1").randomServer()) 2644 defer nc.Close() 2645 2646 m := getMsg(nc) 2647 require_True(t, m.Header.Get(JSStream) == "SOURCE") 2648 2649 // C2 -> M1 2650 nc, _ = jsClientConnect(t, sc.clusterForName("C2").randomServer()) 2651 defer nc.Close() 2652 2653 m = getMsg(nc) 2654 require_True(t, m.Header.Get(JSStream) == "M1") 2655 2656 // C3 -> M2 2657 nc, _ = jsClientConnect(t, sc.clusterForName("C3").randomServer()) 2658 defer nc.Close() 2659 2660 m = getMsg(nc) 2661 require_True(t, m.Header.Get(JSStream) == "M2") 2662 } 2663 2664 func TestJetStreamSuperClusterTagInducedMoveCancel(t *testing.T) { 2665 server := map[string]struct{}{} 2666 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, jsClusterTempl, 4, 2, 2667 func(serverName, clusterName, storeDir, conf string) string { 2668 server[serverName] = struct{}{} 2669 return fmt.Sprintf("%s\nserver_tags: [%s]", conf, clusterName) 2670 }, nil) 2671 defer sc.shutdown() 2672 2673 // Client based API 2674 c := sc.randomCluster() 2675 srv := c.randomNonLeader() 2676 nc, js := jsClientConnect(t, srv) 2677 defer nc.Close() 2678 2679 cfg := &nats.StreamConfig{ 2680 Name: "TEST", 2681 Subjects: []string{"foo"}, 2682 Placement: &nats.Placement{Tags: []string{"C1"}}, 2683 Replicas: 3, 2684 } 2685 siCreate, err := js.AddStream(cfg) 2686 require_NoError(t, err) 2687 require_Equal(t, siCreate.Cluster.Name, "C1") 2688 2689 toSend := uint64(1_000) 2690 for i := uint64(0); i < toSend; i++ { 2691 _, err = js.Publish("foo", nil) 2692 require_NoError(t, err) 2693 } 2694 2695 ncsys, err := nats.Connect(srv.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 2696 require_NoError(t, err) 2697 defer ncsys.Close() 2698 2699 // cause a move by altering placement tags 2700 cfg.Placement.Tags = []string{"C2"} 2701 _, err = js.UpdateStream(cfg) 2702 require_NoError(t, err) 2703 2704 rmsg, err := ncsys.Request(fmt.Sprintf(JSApiServerStreamCancelMoveT, "$G", "TEST"), nil, 5*time.Second) 2705 require_NoError(t, err) 2706 var cancelResp JSApiStreamUpdateResponse 2707 require_NoError(t, json.Unmarshal(rmsg.Data, &cancelResp)) 2708 if cancelResp.Error != nil && ErrorIdentifier(cancelResp.Error.ErrCode) == JSStreamMoveNotInProgress { 2709 t.Skip("This can happen with delays, when Move completed before Cancel", cancelResp.Error) 2710 return 2711 } 2712 require_True(t, cancelResp.Error == nil) 2713 2714 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 2715 si, err := js.StreamInfo("TEST") 2716 require_NoError(t, err) 2717 if si.Config.Placement != nil { 2718 return fmt.Errorf("expected placement to be cleared got: %+v", si.Config.Placement) 2719 } 2720 return nil 2721 }) 2722 } 2723 2724 func TestJetStreamSuperClusterMoveCancel(t *testing.T) { 2725 usageTickOld := usageTick 2726 usageTick = 250 * time.Millisecond 2727 defer func() { 2728 usageTick = usageTickOld 2729 }() 2730 2731 server := map[string]struct{}{} 2732 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, jsClusterTempl, 4, 2, 2733 func(serverName, clusterName, storeDir, conf string) string { 2734 server[serverName] = 
struct{}{} 2735 return fmt.Sprintf("%s\nserver_tags: [%s]", conf, serverName) 2736 }, nil) 2737 defer sc.shutdown() 2738 2739 // Client based API 2740 c := sc.randomCluster() 2741 srv := c.randomNonLeader() 2742 nc, js := jsClientConnect(t, srv) 2743 defer nc.Close() 2744 2745 siCreate, err := js.AddStream(&nats.StreamConfig{ 2746 Name: "TEST", 2747 Subjects: []string{"foo"}, 2748 Replicas: 3, 2749 }) 2750 require_NoError(t, err) 2751 streamPeerSrv := []string{siCreate.Cluster.Leader, siCreate.Cluster.Replicas[0].Name, siCreate.Cluster.Replicas[1].Name} 2752 // determine empty server 2753 for _, s := range streamPeerSrv { 2754 delete(server, s) 2755 } 2756 // pick left over server in same cluster as other server 2757 emptySrv := _EMPTY_ 2758 for s := range server { 2759 // server name is prefixed with cluster name 2760 if strings.HasPrefix(s, c.name) { 2761 emptySrv = s 2762 break 2763 } 2764 } 2765 2766 expectedPeers := map[string]struct{}{ 2767 getHash(streamPeerSrv[0]): {}, 2768 getHash(streamPeerSrv[1]): {}, 2769 getHash(streamPeerSrv[2]): {}, 2770 } 2771 2772 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "DUR", AckPolicy: nats.AckExplicitPolicy}) 2773 require_NoError(t, err) 2774 ci, err := js.AddConsumer("TEST", &nats.ConsumerConfig{InactiveThreshold: time.Hour, AckPolicy: nats.AckExplicitPolicy}) 2775 require_NoError(t, err) 2776 ephName := ci.Name 2777 2778 toSend := uint64(1_000) 2779 for i := uint64(0); i < toSend; i++ { 2780 _, err = js.Publish("foo", nil) 2781 require_NoError(t, err) 2782 } 2783 2784 serverEmpty := func(fromSrv string) error { 2785 if jszAfter, err := c.serverByName(fromSrv).Jsz(nil); err != nil { 2786 return fmt.Errorf("could not fetch JS info for server: %v", err) 2787 } else if jszAfter.Streams != 0 { 2788 return fmt.Errorf("empty server still has %d streams", jszAfter.Streams) 2789 } else if jszAfter.Consumers != 0 { 2790 return fmt.Errorf("empty server still has %d consumers", jszAfter.Consumers) 2791 } else if jszAfter.Bytes != 0 { 2792 return fmt.Errorf("empty server still has %d storage", jszAfter.Store) 2793 } 2794 return nil 2795 } 2796 2797 checkSrvInvariant := func(s *Server, expectedPeers map[string]struct{}) error { 2798 js, cc := s.getJetStreamCluster() 2799 js.mu.Lock() 2800 defer js.mu.Unlock() 2801 if sa, ok := cc.streams["$G"]["TEST"]; !ok { 2802 return fmt.Errorf("stream not found") 2803 } else if len(sa.Group.Peers) != len(expectedPeers) { 2804 return fmt.Errorf("stream peer group size not %d, but %d", len(expectedPeers), len(sa.Group.Peers)) 2805 } else if da, ok := sa.consumers["DUR"]; !ok { 2806 return fmt.Errorf("durable not found") 2807 } else if len(da.Group.Peers) != len(expectedPeers) { 2808 return fmt.Errorf("durable peer group size not %d, but %d", len(expectedPeers), len(da.Group.Peers)) 2809 } else if ea, ok := sa.consumers[ephName]; !ok { 2810 return fmt.Errorf("ephemeral not found") 2811 } else if len(ea.Group.Peers) != 1 { 2812 return fmt.Errorf("ephemeral peer group size not 1, but %d", len(ea.Group.Peers)) 2813 } else if _, ok := expectedPeers[ea.Group.Peers[0]]; !ok { 2814 return fmt.Errorf("ephemeral peer not an expected peer") 2815 } else { 2816 for _, p := range sa.Group.Peers { 2817 if _, ok := expectedPeers[p]; !ok { 2818 return fmt.Errorf("peer not expected") 2819 } 2820 found := false 2821 for _, dp := range da.Group.Peers { 2822 if p == dp { 2823 found = true 2824 break 2825 } 2826 } 2827 if !found { 2828 t.Logf("durable peer group does not match stream peer group") 2829 } 2830 } 2831 } 2832 return 
nil 2833 } 2834 2835 ncsys, err := nats.Connect(srv.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 2836 require_NoError(t, err) 2837 defer ncsys.Close() 2838 2839 time.Sleep(2 * usageTick) 2840 aiBefore, err := js.AccountInfo() 2841 require_NoError(t, err) 2842 2843 for _, moveFromSrv := range streamPeerSrv { 2844 moveReq, err := json.Marshal(&JSApiMetaServerStreamMoveRequest{Server: moveFromSrv, Tags: []string{emptySrv}}) 2845 require_NoError(t, err) 2846 rmsg, err := ncsys.Request(fmt.Sprintf(JSApiServerStreamMoveT, "$G", "TEST"), moveReq, 5*time.Second) 2847 require_NoError(t, err) 2848 var moveResp JSApiStreamUpdateResponse 2849 require_NoError(t, json.Unmarshal(rmsg.Data, &moveResp)) 2850 require_True(t, moveResp.Error == nil) 2851 2852 rmsg, err = ncsys.Request(fmt.Sprintf(JSApiServerStreamCancelMoveT, "$G", "TEST"), nil, 5*time.Second) 2853 require_NoError(t, err) 2854 var cancelResp JSApiStreamUpdateResponse 2855 require_NoError(t, json.Unmarshal(rmsg.Data, &cancelResp)) 2856 if cancelResp.Error != nil && ErrorIdentifier(cancelResp.Error.ErrCode) == JSStreamMoveNotInProgress { 2857 t.Skip("This can happen with delays, when Move completed before Cancel", cancelResp.Error) 2858 return 2859 } 2860 require_True(t, cancelResp.Error == nil) 2861 2862 for _, sExpected := range streamPeerSrv { 2863 s := c.serverByName(sExpected) 2864 require_True(t, s.JetStreamIsStreamAssigned("$G", "TEST")) 2865 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { return checkSrvInvariant(s, expectedPeers) }) 2866 } 2867 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { return serverEmpty(emptySrv) }) 2868 checkFor(t, 3*usageTick, 100*time.Millisecond, func() error { 2869 if aiAfter, err := js.AccountInfo(); err != nil { 2870 return err 2871 } else if aiAfter.Store != aiBefore.Store { 2872 return fmt.Errorf("store before %d and after %d don't match", aiBefore.Store, aiAfter.Store) 2873 } else { 2874 return nil 2875 } 2876 }) 2877 } 2878 } 2879 2880 func TestJetStreamSuperClusterDoubleStreamMove(t *testing.T) { 2881 server := map[string]struct{}{} 2882 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, jsClusterTempl, 4, 2, 2883 func(serverName, clusterName, storeDir, conf string) string { 2884 server[serverName] = struct{}{} 2885 return fmt.Sprintf("%s\nserver_tags: [%s]", conf, serverName) 2886 }, nil) 2887 defer sc.shutdown() 2888 2889 // Client based API 2890 c := sc.randomCluster() 2891 srv := c.randomNonLeader() 2892 nc, js := jsClientConnect(t, srv) 2893 defer nc.Close() 2894 2895 siCreate, err := js.AddStream(&nats.StreamConfig{ 2896 Name: "TEST", 2897 Subjects: []string{"foo"}, 2898 Replicas: 3, 2899 }) 2900 require_NoError(t, err) 2901 srvMoveList := []string{siCreate.Cluster.Leader, siCreate.Cluster.Replicas[0].Name, siCreate.Cluster.Replicas[1].Name} 2902 // determine empty server 2903 for _, s := range srvMoveList { 2904 delete(server, s) 2905 } 2906 // pick left over server in same cluster as other server 2907 for s := range server { 2908 // server name is prefixed with cluster name 2909 if strings.HasPrefix(s, c.name) { 2910 srvMoveList = append(srvMoveList, s) 2911 break 2912 } 2913 } 2914 2915 servers := []*Server{ 2916 c.serverByName(srvMoveList[0]), 2917 c.serverByName(srvMoveList[1]), 2918 c.serverByName(srvMoveList[2]), 2919 c.serverByName(srvMoveList[3]), // starts out empty 2920 } 2921 2922 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "DUR", AckPolicy: nats.AckExplicitPolicy}) 2923 require_NoError(t, err) 2924 ci, err := 
js.AddConsumer("TEST", &nats.ConsumerConfig{InactiveThreshold: time.Hour, AckPolicy: nats.AckExplicitPolicy}) 2925 require_NoError(t, err) 2926 ephName := ci.Name 2927 2928 toSend := uint64(100) 2929 for i := uint64(0); i < toSend; i++ { 2930 _, err = js.Publish("foo", nil) 2931 require_NoError(t, err) 2932 } 2933 2934 ncsys, err := nats.Connect(srv.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 2935 require_NoError(t, err) 2936 defer ncsys.Close() 2937 2938 move := func(fromSrv string, toTags ...string) { 2939 sEmpty := c.serverByName(fromSrv) 2940 jszBefore, err := sEmpty.Jsz(nil) 2941 require_NoError(t, err) 2942 require_True(t, jszBefore.Streams == 1) 2943 2944 moveReq, err := json.Marshal(&JSApiMetaServerStreamMoveRequest{ 2945 Server: fromSrv, Tags: toTags}) 2946 require_NoError(t, err) 2947 rmsg, err := ncsys.Request(fmt.Sprintf(JSApiServerStreamMoveT, "$G", "TEST"), moveReq, 100*time.Second) 2948 require_NoError(t, err) 2949 var moveResp JSApiStreamUpdateResponse 2950 require_NoError(t, json.Unmarshal(rmsg.Data, &moveResp)) 2951 require_True(t, moveResp.Error == nil) 2952 } 2953 2954 serverEmpty := func(fromSrv string) error { 2955 if jszAfter, err := c.serverByName(fromSrv).Jsz(nil); err != nil { 2956 return fmt.Errorf("could not fetch JS info for server: %v", err) 2957 } else if jszAfter.Streams != 0 { 2958 return fmt.Errorf("empty server still has %d streams", jszAfter.Streams) 2959 } else if jszAfter.Consumers != 0 { 2960 return fmt.Errorf("empty server still has %d consumers", jszAfter.Consumers) 2961 } else if jszAfter.Store != 0 { 2962 return fmt.Errorf("empty server still has %d storage", jszAfter.Store) 2963 } 2964 return nil 2965 } 2966 2967 moveComplete := func(toSrv string, expectedSet ...string) error { 2968 eSet := map[string]int{} 2969 foundInExpected := false 2970 for i, sExpected := range expectedSet { 2971 eSet[sExpected] = i 2972 s := c.serverByName(sExpected) 2973 if !s.JetStreamIsStreamAssigned("$G", "TEST") { 2974 return fmt.Errorf("expected stream to be assigned to %s", sExpected) 2975 } 2976 // test list order invariant 2977 js, cc := s.getJetStreamCluster() 2978 sExpHash := getHash(sExpected) 2979 js.mu.Lock() 2980 if sa, ok := cc.streams["$G"]["TEST"]; !ok { 2981 js.mu.Unlock() 2982 return fmt.Errorf("stream not found in cluster") 2983 } else if len(sa.Group.Peers) != 3 { 2984 js.mu.Unlock() 2985 return fmt.Errorf("peers not reset") 2986 } else if sa.Group.Peers[i] != sExpHash { 2987 js.mu.Unlock() 2988 return fmt.Errorf("stream: expected peer %s on index %d, got %s/%s", 2989 sa.Group.Peers[i], i, sExpHash, sExpected) 2990 } else if ca, ok := sa.consumers["DUR"]; !ok { 2991 js.mu.Unlock() 2992 return fmt.Errorf("durable not found in stream") 2993 } else { 2994 found := false 2995 for _, peer := range ca.Group.Peers { 2996 if peer == sExpHash { 2997 found = true 2998 break 2999 } 3000 } 3001 if !found { 3002 js.mu.Unlock() 3003 return fmt.Errorf("consumer expected peer %s/%s bud didn't find in %+v", 3004 sExpHash, sExpected, ca.Group.Peers) 3005 } 3006 if ephA, ok := sa.consumers[ephName]; ok { 3007 if len(ephA.Group.Peers) != 1 { 3008 return fmt.Errorf("ephemeral peers not reset") 3009 } 3010 foundInExpected = foundInExpected || (ephA.Group.Peers[0] == cc.meta.ID()) 3011 } 3012 } 3013 js.mu.Unlock() 3014 } 3015 if len(expectedSet) > 0 && !foundInExpected { 3016 return fmt.Errorf("ephemeral peer not expected") 3017 } 3018 for _, s := range servers { 3019 if jszAfter, err := c.serverByName(toSrv).Jsz(nil); err != nil { 3020 return fmt.Errorf("could not 
fetch JS info for server: %v", err) 3021 } else if jszAfter.Messages != toSend { 3022 return fmt.Errorf("messages not yet copied, got %d, expected %d", jszAfter.Messages, toSend) 3023 } 3024 nc, js := jsClientConnect(t, s) 3025 defer nc.Close() 3026 if si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)); err != nil { 3027 return fmt.Errorf("could not fetch stream info: %v", err) 3028 } else if len(si.Cluster.Replicas)+1 != si.Config.Replicas { 3029 return fmt.Errorf("not yet downsized replica should be empty has: %d %s", 3030 len(si.Cluster.Replicas), si.Cluster.Leader) 3031 } else if si.Cluster.Leader == _EMPTY_ { 3032 return fmt.Errorf("leader not found") 3033 } else if len(expectedSet) > 0 { 3034 if _, ok := eSet[si.Cluster.Leader]; !ok { 3035 return fmt.Errorf("leader %s not in expected set %+v", si.Cluster.Leader, eSet) 3036 } else if _, ok := eSet[si.Cluster.Replicas[0].Name]; !ok { 3037 return fmt.Errorf("leader %s not in expected set %+v", si.Cluster.Replicas[0].Name, eSet) 3038 } else if _, ok := eSet[si.Cluster.Replicas[1].Name]; !ok { 3039 return fmt.Errorf("leader %s not in expected set %+v", si.Cluster.Replicas[1].Name, eSet) 3040 } 3041 } 3042 nc.Close() 3043 } 3044 return nil 3045 } 3046 3047 moveAndCheck := func(from, to string, expectedSet ...string) { 3048 t.Helper() 3049 move(from, to) 3050 checkFor(t, 40*time.Second, 100*time.Millisecond, func() error { return moveComplete(to, expectedSet...) }) 3051 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { return serverEmpty(from) }) 3052 } 3053 3054 checkFor(t, 20*time.Second, 1000*time.Millisecond, func() error { return serverEmpty(srvMoveList[3]) }) 3055 // first iteration establishes order of server 0-2 (the internal order in the server could be 1,0,2) 3056 moveAndCheck(srvMoveList[0], srvMoveList[3]) 3057 moveAndCheck(srvMoveList[1], srvMoveList[0]) 3058 moveAndCheck(srvMoveList[2], srvMoveList[1]) 3059 moveAndCheck(srvMoveList[3], srvMoveList[2], srvMoveList[0], srvMoveList[1], srvMoveList[2]) 3060 // second iteration iterates in order 3061 moveAndCheck(srvMoveList[0], srvMoveList[3], srvMoveList[1], srvMoveList[2], srvMoveList[3]) 3062 moveAndCheck(srvMoveList[1], srvMoveList[0], srvMoveList[2], srvMoveList[3], srvMoveList[0]) 3063 moveAndCheck(srvMoveList[2], srvMoveList[1], srvMoveList[3], srvMoveList[0], srvMoveList[1]) 3064 moveAndCheck(srvMoveList[3], srvMoveList[2], srvMoveList[0], srvMoveList[1], srvMoveList[2]) 3065 // iterate in the opposite direction and establish order 2-0 3066 moveAndCheck(srvMoveList[2], srvMoveList[3], srvMoveList[0], srvMoveList[1], srvMoveList[3]) 3067 moveAndCheck(srvMoveList[1], srvMoveList[2], srvMoveList[0], srvMoveList[3], srvMoveList[2]) 3068 moveAndCheck(srvMoveList[0], srvMoveList[1], srvMoveList[3], srvMoveList[2], srvMoveList[1]) 3069 moveAndCheck(srvMoveList[3], srvMoveList[0], srvMoveList[2], srvMoveList[1], srvMoveList[0]) 3070 // move server in the middle of list 3071 moveAndCheck(srvMoveList[1], srvMoveList[3], srvMoveList[2], srvMoveList[0], srvMoveList[3]) 3072 moveAndCheck(srvMoveList[0], srvMoveList[1], srvMoveList[2], srvMoveList[3], srvMoveList[1]) 3073 moveAndCheck(srvMoveList[3], srvMoveList[0], srvMoveList[2], srvMoveList[1], srvMoveList[0]) 3074 // repeatedly use end 3075 moveAndCheck(srvMoveList[0], srvMoveList[3], srvMoveList[2], srvMoveList[1], srvMoveList[3]) 3076 moveAndCheck(srvMoveList[3], srvMoveList[0], srvMoveList[2], srvMoveList[1], srvMoveList[0]) 3077 moveAndCheck(srvMoveList[0], srvMoveList[3], srvMoveList[2], srvMoveList[1], 
srvMoveList[3]) 3078 moveAndCheck(srvMoveList[3], srvMoveList[0], srvMoveList[2], srvMoveList[1], srvMoveList[0]) 3079 } 3080 3081 func TestJetStreamSuperClusterPeerEvacuationAndStreamReassignment(t *testing.T) { 3082 s := createJetStreamSuperClusterWithTemplateAndModHook(t, jsClusterTempl, 4, 2, 3083 func(serverName, clusterName, storeDir, conf string) string { 3084 return fmt.Sprintf("%s\nserver_tags: [cluster:%s, server:%s]", conf, clusterName, serverName) 3085 }, nil) 3086 defer s.shutdown() 3087 3088 c := s.clusterForName("C1") 3089 3090 // Client based API 3091 srv := c.randomNonLeader() 3092 nc, js := jsClientConnect(t, srv) 3093 defer nc.Close() 3094 3095 test := func(t *testing.T, r int, moveTags []string, targetCluster string, testMigrateTo bool, listFrom bool) { 3096 si, err := js.AddStream(&nats.StreamConfig{ 3097 Name: "TEST", 3098 Subjects: []string{"foo"}, 3099 Replicas: r, 3100 }) 3101 require_NoError(t, err) 3102 defer js.DeleteStream("TEST") 3103 startSet := map[string]struct{}{ 3104 si.Cluster.Leader: {}, 3105 } 3106 for _, p := range si.Cluster.Replicas { 3107 startSet[p.Name] = struct{}{} 3108 } 3109 3110 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{Durable: "DUR", AckPolicy: nats.AckExplicitPolicy}) 3111 require_NoError(t, err) 3112 3113 sub, err := js.SubscribeSync("foo") 3114 require_NoError(t, err) 3115 3116 for i := 0; i < 100; i++ { 3117 _, err = js.Publish("foo", nil) 3118 require_NoError(t, err) 3119 } 3120 3121 toMoveFrom := si.Cluster.Leader 3122 if !listFrom { 3123 toMoveFrom = _EMPTY_ 3124 } 3125 sLdr := c.serverByName(si.Cluster.Leader) 3126 jszBefore, err := sLdr.Jsz(nil) 3127 require_NoError(t, err) 3128 require_True(t, jszBefore.Streams == 1) 3129 require_True(t, jszBefore.Consumers >= 1) 3130 require_True(t, jszBefore.Store != 0) 3131 3132 migrateToServer := _EMPTY_ 3133 if testMigrateTo { 3134 // find an empty server 3135 for _, s := range c.servers { 3136 name := s.Name() 3137 found := si.Cluster.Leader == name 3138 if !found { 3139 for _, r := range si.Cluster.Replicas { 3140 if r.Name == name { 3141 found = true 3142 break 3143 } 3144 } 3145 } 3146 if !found { 3147 migrateToServer = name 3148 break 3149 } 3150 } 3151 jszAfter, err := c.serverByName(migrateToServer).Jsz(nil) 3152 require_NoError(t, err) 3153 require_True(t, jszAfter.Streams == 0) 3154 3155 moveTags = append(moveTags, fmt.Sprintf("server:%s", migrateToServer)) 3156 } 3157 3158 ncsys, err := nats.Connect(srv.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 3159 require_NoError(t, err) 3160 defer ncsys.Close() 3161 3162 moveReq, err := json.Marshal(&JSApiMetaServerStreamMoveRequest{Server: toMoveFrom, Tags: moveTags}) 3163 require_NoError(t, err) 3164 rmsg, err := ncsys.Request(fmt.Sprintf(JSApiServerStreamMoveT, "$G", "TEST"), moveReq, 100*time.Second) 3165 require_NoError(t, err) 3166 var moveResp JSApiStreamUpdateResponse 3167 require_NoError(t, json.Unmarshal(rmsg.Data, &moveResp)) 3168 require_True(t, moveResp.Error == nil) 3169 3170 // test move to particular server 3171 if testMigrateTo { 3172 toSrv := c.serverByName(migrateToServer) 3173 checkFor(t, 20*time.Second, 1000*time.Millisecond, func() error { 3174 jszAfter, err := toSrv.Jsz(nil) 3175 if err != nil { 3176 return fmt.Errorf("could not fetch JS info for server: %v", err) 3177 } 3178 if jszAfter.Streams != 1 { 3179 return fmt.Errorf("server expected to have one stream, has %d", jszAfter.Streams) 3180 } 3181 return nil 3182 }) 3183 } 3184 // Now wait until the stream is now current. 
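// "Current" here means the peer we moved away from no longer appears, the
// replica list has shrunk back to r-1, the stream sits in the target cluster,
// and the new leader reports the same stored bytes as before the move.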
3185 checkFor(t, 20*time.Second, 100*time.Millisecond, func() error { 3186 si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)) 3187 if err != nil { 3188 return fmt.Errorf("could not fetch stream info: %v", err) 3189 } 3190 if si.Cluster.Leader == toMoveFrom { 3191 return fmt.Errorf("peer not removed yet: %+v", toMoveFrom) 3192 } 3193 if si.Cluster.Leader == _EMPTY_ { 3194 return fmt.Errorf("no leader yet") 3195 } 3196 if len(si.Cluster.Replicas) != r-1 { 3197 return fmt.Errorf("not yet downsized replica should be %d has: %d", r-1, len(si.Cluster.Replicas)) 3198 } 3199 if si.Config.Replicas != r { 3200 return fmt.Errorf("bad replica count %d", si.Config.Replicas) 3201 } 3202 if si.Cluster.Name != targetCluster { 3203 return fmt.Errorf("stream expected in %s but found in %s", si.Cluster.Name, targetCluster) 3204 } 3205 sNew := s.serverByName(si.Cluster.Leader) 3206 if jszNew, err := sNew.Jsz(nil); err != nil { 3207 return err 3208 } else if jszNew.Streams != 1 { 3209 return fmt.Errorf("new leader has %d streams, not one", jszNew.Streams) 3210 } else if jszNew.Store != jszBefore.Store { 3211 return fmt.Errorf("new leader has %d storage, should have %d", jszNew.Store, jszBefore.Store) 3212 } 3213 return nil 3214 }) 3215 // test draining 3216 checkFor(t, 20*time.Second, time.Second, func() error { 3217 if !listFrom { 3218 // when needed determine which server move moved away from 3219 si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)) 3220 if err != nil { 3221 return fmt.Errorf("could not fetch stream info: %v", err) 3222 } 3223 for n := range startSet { 3224 if n != si.Cluster.Leader { 3225 var found bool 3226 for _, p := range si.Cluster.Replicas { 3227 if p.Name == n { 3228 found = true 3229 break 3230 } 3231 } 3232 if !found { 3233 toMoveFrom = n 3234 } 3235 } 3236 } 3237 } 3238 if toMoveFrom == _EMPTY_ { 3239 return fmt.Errorf("server to move away from not found") 3240 } 3241 sEmpty := c.serverByName(toMoveFrom) 3242 jszAfter, err := sEmpty.Jsz(nil) 3243 if err != nil { 3244 return fmt.Errorf("could not fetch JS info for server: %v", err) 3245 } 3246 if jszAfter.Streams != 0 { 3247 return fmt.Errorf("empty server still has %d streams", jszAfter.Streams) 3248 } 3249 if jszAfter.Consumers != 0 { 3250 return fmt.Errorf("empty server still has %d consumers", jszAfter.Consumers) 3251 } 3252 if jszAfter.Store != 0 { 3253 return fmt.Errorf("empty server still has %d storage", jszAfter.Store) 3254 } 3255 return nil 3256 }) 3257 // consume messages from ephemeral consumer 3258 for i := 0; i < 100; i++ { 3259 _, err := sub.NextMsg(time.Second) 3260 require_NoError(t, err) 3261 } 3262 } 3263 3264 for i := 1; i <= 3; i++ { 3265 t.Run(fmt.Sprintf("r%d", i), func(t *testing.T) { 3266 test(t, i, nil, "C1", false, true) 3267 }) 3268 t.Run(fmt.Sprintf("r%d-explicit", i), func(t *testing.T) { 3269 test(t, i, nil, "C1", true, true) 3270 }) 3271 t.Run(fmt.Sprintf("r%d-nosrc", i), func(t *testing.T) { 3272 test(t, i, nil, "C1", false, false) 3273 }) 3274 } 3275 3276 t.Run("r3-cluster-move", func(t *testing.T) { 3277 test(t, 3, []string{"cluster:C2"}, "C2", false, false) 3278 }) 3279 t.Run("r3-cluster-move-nosrc", func(t *testing.T) { 3280 test(t, 3, []string{"cluster:C2"}, "C2", false, true) 3281 }) 3282 } 3283 3284 func TestJetStreamSuperClusterMirrorInheritsAllowDirect(t *testing.T) { 3285 sc := createJetStreamTaggedSuperCluster(t) 3286 defer sc.shutdown() 3287 3288 nc, js := jsClientConnect(t, sc.randomServer()) 3289 defer nc.Close() 3290 3291 _, err := js.AddStream(&nats.StreamConfig{ 3292 
Name: "KV", 3293 Subjects: []string{"key.*"}, 3294 Placement: &nats.Placement{Tags: []string{"cloud:aws", "country:us"}}, 3295 MaxMsgsPerSubject: 1, 3296 AllowDirect: true, 3297 }) 3298 require_NoError(t, err) 3299 3300 _, err = js.AddStream(&nats.StreamConfig{ 3301 Name: "M", 3302 Mirror: &nats.StreamSource{Name: "KV"}, 3303 Placement: &nats.Placement{Tags: []string{"cloud:gcp", "country:uk"}}, 3304 }) 3305 require_NoError(t, err) 3306 3307 // Do direct grab for now. 3308 resp, err := nc.Request(fmt.Sprintf(JSApiStreamInfoT, "M"), nil, time.Second) 3309 require_NoError(t, err) 3310 var si StreamInfo 3311 err = json.Unmarshal(resp.Data, &si) 3312 require_NoError(t, err) 3313 3314 if !si.Config.MirrorDirect { 3315 t.Fatalf("Expected MirrorDirect to be inherited as true") 3316 } 3317 } 3318 3319 func TestJetStreamSuperClusterSystemLimitsPlacement(t *testing.T) { 3320 const largeSystemLimit = 1024 3321 const smallSystemLimit = 512 3322 3323 tmpl := ` 3324 listen: 127.0.0.1:-1 3325 server_name: %s 3326 jetstream: { 3327 max_mem_store: _MAXMEM_ 3328 max_file_store: _MAXFILE_ 3329 store_dir: '%s', 3330 } 3331 server_tags: [ 3332 _TAG_ 3333 ] 3334 leaf { 3335 listen: 127.0.0.1:-1 3336 } 3337 cluster { 3338 name: %s 3339 listen: 127.0.0.1:%d 3340 routes = [%s] 3341 } 3342 3343 accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } } 3344 ` 3345 storeCnf := func(serverName, clusterName, storeDir, conf string) string { 3346 switch { 3347 case strings.HasPrefix(serverName, "C1"): 3348 conf = strings.Replace(conf, "_MAXMEM_", fmt.Sprint(largeSystemLimit), 1) 3349 conf = strings.Replace(conf, "_MAXFILE_", fmt.Sprint(largeSystemLimit), 1) 3350 return strings.Replace(conf, "_TAG_", serverName, 1) 3351 case strings.HasPrefix(serverName, "C2"): 3352 conf = strings.Replace(conf, "_MAXMEM_", fmt.Sprint(smallSystemLimit), 1) 3353 conf = strings.Replace(conf, "_MAXFILE_", fmt.Sprint(smallSystemLimit), 1) 3354 return strings.Replace(conf, "_TAG_", serverName, 1) 3355 default: 3356 return conf 3357 } 3358 } 3359 3360 sCluster := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 3, 2, storeCnf, nil) 3361 defer sCluster.shutdown() 3362 3363 requestLeaderStepDown := func(clientURL string) error { 3364 nc, err := nats.Connect(clientURL) 3365 if err != nil { 3366 return err 3367 } 3368 defer nc.Close() 3369 3370 ncResp, err := nc.Request(JSApiLeaderStepDown, nil, 3*time.Second) 3371 if err != nil { 3372 return err 3373 } 3374 3375 var resp JSApiLeaderStepDownResponse 3376 if err := json.Unmarshal(ncResp.Data, &resp); err != nil { 3377 return err 3378 } 3379 if resp.Error != nil { 3380 return resp.Error 3381 } 3382 if !resp.Success { 3383 return fmt.Errorf("leader step down request not successful") 3384 } 3385 3386 return nil 3387 } 3388 3389 // Force large cluster to be leader 3390 var largeLeader *Server 3391 err := checkForErr(15*time.Second, 500*time.Millisecond, func() error { 3392 // Range over cluster A, which is the large cluster. 
3393 servers := sCluster.clusters[0].servers 3394 for _, s := range servers { 3395 if s.JetStreamIsLeader() { 3396 largeLeader = s 3397 return nil 3398 } 3399 } 3400 3401 if err := requestLeaderStepDown(servers[0].ClientURL()); err != nil { 3402 return fmt.Errorf("failed to request leader step down: %s", err) 3403 } 3404 return fmt.Errorf("leader is not in large cluster") 3405 }) 3406 if err != nil { 3407 t.Skipf("failed to get desired layout: %s", err) 3408 } 3409 3410 getStreams := func(jsm nats.JetStreamManager) []string { 3411 var streams []string 3412 for s := range jsm.StreamNames() { 3413 streams = append(streams, s) 3414 } 3415 return streams 3416 } 3417 nc, js := jsClientConnect(t, largeLeader) 3418 defer nc.Close() 3419 3420 cases := []struct { 3421 name string 3422 storage nats.StorageType 3423 createMaxBytes int64 3424 serverTag string 3425 wantErr bool 3426 }{ 3427 { 3428 name: "file create large stream on small cluster b0", 3429 storage: nats.FileStorage, 3430 createMaxBytes: smallSystemLimit + 1, 3431 serverTag: "C2-S1", 3432 wantErr: true, 3433 }, 3434 { 3435 name: "memory create large stream on small cluster b0", 3436 storage: nats.MemoryStorage, 3437 createMaxBytes: smallSystemLimit + 1, 3438 serverTag: "C2-S1", 3439 wantErr: true, 3440 }, 3441 { 3442 name: "file create large stream on small cluster b1", 3443 storage: nats.FileStorage, 3444 createMaxBytes: smallSystemLimit + 1, 3445 serverTag: "C2-S2", 3446 wantErr: true, 3447 }, 3448 { 3449 name: "memory create large stream on small cluster b1", 3450 storage: nats.MemoryStorage, 3451 createMaxBytes: smallSystemLimit + 1, 3452 serverTag: "C2-S2", 3453 wantErr: true, 3454 }, 3455 { 3456 name: "file create large stream on small cluster b2", 3457 storage: nats.FileStorage, 3458 createMaxBytes: smallSystemLimit + 1, 3459 serverTag: "C2-S3", 3460 wantErr: true, 3461 }, 3462 { 3463 name: "memory create large stream on small cluster b2", 3464 storage: nats.MemoryStorage, 3465 createMaxBytes: smallSystemLimit + 1, 3466 serverTag: "C2-S3", 3467 wantErr: true, 3468 }, 3469 { 3470 name: "file create large stream on large cluster a0", 3471 storage: nats.FileStorage, 3472 createMaxBytes: smallSystemLimit + 1, 3473 serverTag: "C1-S1", 3474 }, 3475 { 3476 name: "memory create large stream on large cluster a0", 3477 storage: nats.MemoryStorage, 3478 createMaxBytes: smallSystemLimit + 1, 3479 serverTag: "C1-S1", 3480 }, 3481 { 3482 name: "file create large stream on large cluster a1", 3483 storage: nats.FileStorage, 3484 createMaxBytes: smallSystemLimit + 1, 3485 serverTag: "C1-S2", 3486 }, 3487 { 3488 name: "memory create large stream on large cluster a1", 3489 storage: nats.MemoryStorage, 3490 createMaxBytes: smallSystemLimit + 1, 3491 serverTag: "C1-S2", 3492 }, 3493 { 3494 name: "file create large stream on large cluster a2", 3495 storage: nats.FileStorage, 3496 createMaxBytes: smallSystemLimit + 1, 3497 serverTag: "C1-S3", 3498 }, 3499 { 3500 name: "memory create large stream on large cluster a2", 3501 storage: nats.MemoryStorage, 3502 createMaxBytes: smallSystemLimit + 1, 3503 serverTag: "C1-S3", 3504 }, 3505 } 3506 for i := 0; i < len(cases) && !t.Failed(); i++ { 3507 c := cases[i] 3508 t.Run(c.name, func(st *testing.T) { 3509 var clusterName string 3510 if strings.HasPrefix(c.serverTag, "a") { 3511 clusterName = "cluster-a" 3512 } else if strings.HasPrefix(c.serverTag, "b") { 3513 clusterName = "cluster-b" 3514 } 3515 3516 if s := getStreams(js); len(s) != 0 { 3517 st.Fatalf("unexpected stream count, got=%d, want=0", len(s)) 3518 
} 3519 3520 streamName := fmt.Sprintf("TEST-%s", c.serverTag) 3521 si, err := js.AddStream(&nats.StreamConfig{ 3522 Name: streamName, 3523 Subjects: []string{"foo"}, 3524 Storage: c.storage, 3525 MaxBytes: c.createMaxBytes, 3526 Placement: &nats.Placement{ 3527 Cluster: clusterName, 3528 Tags: []string{c.serverTag}, 3529 }, 3530 }) 3531 if c.wantErr && err == nil { 3532 if s := getStreams(js); len(s) != 1 { 3533 st.Logf("unexpected stream count, got=%d, want=1, streams=%v", len(s), s) 3534 } 3535 3536 cfg := si.Config 3537 st.Fatalf("unexpected success, maxBytes=%d, cluster=%s, tags=%v", 3538 cfg.MaxBytes, cfg.Placement.Cluster, cfg.Placement.Tags) 3539 } else if !c.wantErr && err != nil { 3540 if s := getStreams(js); len(s) != 0 { 3541 st.Logf("unexpected stream count, got=%d, want=0, streams=%v", len(s), s) 3542 } 3543 3544 require_NoError(st, err) 3545 } 3546 3547 if err == nil { 3548 if s := getStreams(js); len(s) != 1 { 3549 st.Fatalf("unexpected stream count, got=%d, want=1", len(s)) 3550 } 3551 } 3552 // Delete regardless. 3553 js.DeleteStream(streamName) 3554 }) 3555 } 3556 } 3557 3558 func TestJetStreamSuperClusterMixedModeSwitchToInterestOnlyStaticConfig(t *testing.T) { 3559 tmpl := ` 3560 listen: 127.0.0.1:-1 3561 server_name: %s 3562 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 3563 leaf: { listen: 127.0.0.1:-1 } 3564 cluster { 3565 name: %s 3566 listen: 127.0.0.1:%d 3567 routes = [%s] 3568 } 3569 accounts { 3570 ONE { 3571 users = [ { user: "one", pass: "pwd" } ] 3572 jetstream: enabled 3573 } 3574 TWO { users = [ { user: "two", pass: "pwd" } ] } 3575 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 3576 } 3577 ` 3578 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 5, 3, 3579 func(serverName, clusterName, storeDir, conf string) string { 3580 sname := serverName[strings.Index(serverName, "-")+1:] 3581 switch sname { 3582 case "S4", "S5": 3583 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 3584 default: 3585 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 3586 } 3587 return conf 3588 }, nil) 3589 defer sc.shutdown() 3590 3591 // Connect our client to a non JS server 3592 c := sc.randomCluster() 3593 var s *Server 3594 for _, as := range c.servers { 3595 if !as.JetStreamEnabled() { 3596 s = as 3597 break 3598 } 3599 } 3600 if s == nil { 3601 t.Fatal("Did not find a non JS server!") 3602 } 3603 nc, js := jsClientConnect(t, s, nats.UserInfo("one", "pwd")) 3604 defer nc.Close() 3605 3606 // Just create a stream and then make sure that all gateways have switched 3607 // to interest-only mode. 
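	// Gateway connections start out in optimistic mode for an account and only
	// switch to interest-only when the remote side requests it. With JetStream
	// in play the expectation (checked below for both ONE and TWO) is that the
	// servers force that switch for every account once a JS asset exists, so a
	// message only crosses a gateway when the remote cluster has registered
	// interest in its subject.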
3608 si, err := js.AddStream(&nats.StreamConfig{Name: "interest", Replicas: 3}) 3609 require_NoError(t, err) 3610 3611 sc.waitOnStreamLeader("ONE", "interest") 3612 3613 check := func(accName string) { 3614 t.Helper() 3615 for _, c := range sc.clusters { 3616 for _, s := range c.servers { 3617 // Check only JS servers outbound GW connections 3618 if !s.JetStreamEnabled() { 3619 continue 3620 } 3621 opts := s.getOpts() 3622 for _, gw := range opts.Gateway.Gateways { 3623 if gw.Name == opts.Gateway.Name { 3624 continue 3625 } 3626 checkGWInterestOnlyMode(t, s, gw.Name, accName) 3627 } 3628 } 3629 } 3630 } 3631 // Starting v2.9.0, all accounts should be switched to interest-only mode 3632 check("ONE") 3633 check("TWO") 3634 3635 var gwsa [16]*client 3636 gws := gwsa[:0] 3637 3638 s = sc.serverByName(si.Cluster.Leader) 3639 // Get the GW outbound connections 3640 s.getOutboundGatewayConnections(&gws) 3641 for _, gwc := range gws { 3642 gwc.mu.Lock() 3643 gwc.nc.Close() 3644 gwc.mu.Unlock() 3645 } 3646 waitForOutboundGateways(t, s, 2, 5*time.Second) 3647 check("ONE") 3648 check("TWO") 3649 } 3650 3651 func TestJetStreamSuperClusterMixedModeSwitchToInterestOnlyOperatorConfig(t *testing.T) { 3652 kp, _ := nkeys.FromSeed(oSeed) 3653 3654 skp, _ := nkeys.CreateAccount() 3655 spub, _ := skp.PublicKey() 3656 nac := jwt.NewAccountClaims(spub) 3657 sjwt, err := nac.Encode(kp) 3658 require_NoError(t, err) 3659 3660 akp, _ := nkeys.CreateAccount() 3661 apub, _ := akp.PublicKey() 3662 nac = jwt.NewAccountClaims(apub) 3663 // Set some limits to enable JS. 3664 nac.Limits.JetStreamLimits.DiskStorage = 1024 * 1024 3665 nac.Limits.JetStreamLimits.Streams = 10 3666 ajwt, err := nac.Encode(kp) 3667 require_NoError(t, err) 3668 3669 ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 3670 if strings.HasSuffix(r.URL.Path, spub) { 3671 w.Write([]byte(sjwt)) 3672 } else { 3673 w.Write([]byte(ajwt)) 3674 } 3675 })) 3676 defer ts.Close() 3677 3678 operator := fmt.Sprintf(` 3679 operator: %s 3680 resolver: URL("%s/ngs/v1/accounts/jwt/") 3681 `, ojwt, ts.URL) 3682 3683 tmpl := ` 3684 listen: 127.0.0.1:-1 3685 server_name: %s 3686 jetstream: { domain: ngs, max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 3687 leaf: { listen: 127.0.0.1:-1 } 3688 cluster { 3689 name: %s 3690 listen: 127.0.0.1:%d 3691 routes = [%s] 3692 } 3693 ` + operator 3694 sc := createJetStreamSuperClusterWithTemplateAndModHook(t, tmpl, 5, 3, 3695 func(serverName, clusterName, storeDir, conf string) string { 3696 conf = strings.ReplaceAll(conf, "system_account: \"$SYS\"", fmt.Sprintf("system_account: \"%s\"", spub)) 3697 sname := serverName[strings.Index(serverName, "-")+1:] 3698 switch sname { 3699 case "S4", "S5": 3700 conf = strings.ReplaceAll(conf, "jetstream: { ", "#jetstream: { ") 3701 default: 3702 conf = strings.ReplaceAll(conf, "leaf: { ", "#leaf: { ") 3703 } 3704 return conf 3705 }, nil) 3706 defer sc.shutdown() 3707 3708 // Connect our client to a non JS server 3709 c := sc.randomCluster() 3710 var s *Server 3711 for _, as := range c.servers { 3712 if !as.JetStreamEnabled() { 3713 s = as 3714 break 3715 } 3716 } 3717 if s == nil { 3718 t.Fatal("Did not find a non JS server!") 3719 } 3720 nc, js := jsClientConnect(t, s, createUserCreds(t, nil, akp)) 3721 defer nc.Close() 3722 3723 // Just create a stream and then make sure that all gateways have switched 3724 // to interest-only mode. 
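	// Same expectation as the static-config variant above, but under operator
	// mode accounts are addressed by their public nkey, so the interest-only
	// checks below pass apub (and the stream-leader waits use it as the account
	// name) instead of a configured account name.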
3725 si, err := js.AddStream(&nats.StreamConfig{Name: "interest", Replicas: 3}) 3726 require_NoError(t, err) 3727 3728 sc.waitOnStreamLeader(apub, "interest") 3729 3730 check := func(s *Server) { 3731 opts := s.getOpts() 3732 for _, gw := range opts.Gateway.Gateways { 3733 if gw.Name == opts.Gateway.Name { 3734 continue 3735 } 3736 checkGWInterestOnlyMode(t, s, gw.Name, apub) 3737 } 3738 } 3739 s = sc.serverByName(si.Cluster.Leader) 3740 check(s) 3741 3742 // Let's cause a leadership change and verify that it still works. 3743 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "interest"), nil, time.Second) 3744 require_NoError(t, err) 3745 sc.waitOnStreamLeader(apub, "interest") 3746 3747 si, err = js.StreamInfo("interest") 3748 require_NoError(t, err) 3749 s = sc.serverByName(si.Cluster.Leader) 3750 check(s) 3751 3752 var gwsa [16]*client 3753 gws := gwsa[:0] 3754 // Get the GW outbound connections 3755 s.getOutboundGatewayConnections(&gws) 3756 for _, gwc := range gws { 3757 gwc.mu.Lock() 3758 gwc.nc.Close() 3759 gwc.mu.Unlock() 3760 } 3761 waitForOutboundGateways(t, s, 2, 5*time.Second) 3762 check(s) 3763 } 3764 3765 type captureGWRewriteLogger struct { 3766 DummyLogger 3767 ch chan string 3768 } 3769 3770 func (l *captureGWRewriteLogger) Tracef(format string, args ...interface{}) { 3771 msg := fmt.Sprintf(format, args...) 3772 if strings.Contains(msg, "$JS.SNAPSHOT.ACK.TEST") && strings.Contains(msg, gwReplyPrefix) { 3773 select { 3774 case l.ch <- msg: 3775 default: 3776 } 3777 } 3778 } 3779 3780 func TestJetStreamSuperClusterGWReplyRewrite(t *testing.T) { 3781 sc := createJetStreamSuperCluster(t, 3, 2) 3782 defer sc.shutdown() 3783 3784 nc, js := jsClientConnect(t, sc.serverByName("C1-S1")) 3785 defer nc.Close() 3786 3787 _, err := js.AddStream(&nats.StreamConfig{ 3788 Name: "TEST", 3789 Subjects: []string{"foo"}, 3790 Replicas: 3, 3791 }) 3792 require_NoError(t, err) 3793 sc.waitOnStreamLeader(globalAccountName, "TEST") 3794 3795 for i := 0; i < 10; i++ { 3796 sendStreamMsg(t, nc, "foo", "msg") 3797 } 3798 3799 nc2, _ := jsClientConnect(t, sc.serverByName("C2-S2")) 3800 defer nc2.Close() 3801 3802 s := sc.clusters[0].streamLeader(globalAccountName, "TEST") 3803 var gws []*client 3804 s.getOutboundGatewayConnections(&gws) 3805 for _, gw := range gws { 3806 gw.mu.Lock() 3807 gw.trace = true 3808 gw.mu.Unlock() 3809 } 3810 l := &captureGWRewriteLogger{ch: make(chan string, 1)} 3811 s.SetLogger(l, false, true) 3812 3813 // Send a request through the gateway 3814 sreq := &JSApiStreamSnapshotRequest{ 3815 DeliverSubject: nats.NewInbox(), 3816 ChunkSize: 512, 3817 } 3818 natsSub(t, nc2, sreq.DeliverSubject, func(m *nats.Msg) { 3819 m.Respond(nil) 3820 }) 3821 natsFlush(t, nc2) 3822 req, _ := json.Marshal(sreq) 3823 rmsg, err := nc2.Request(fmt.Sprintf(JSApiStreamSnapshotT, "TEST"), req, time.Second) 3824 require_NoError(t, err) 3825 var resp JSApiStreamSnapshotResponse 3826 err = json.Unmarshal(rmsg.Data, &resp) 3827 require_NoError(t, err) 3828 if resp.Error != nil { 3829 t.Fatalf("Did not get correct error response: %+v", resp.Error) 3830 } 3831 3832 // Now we just want to make sure that the reply has the gateway prefix 3833 select { 3834 case <-l.ch: 3835 case <-time.After(10 * time.Second): 3836 } 3837 } 3838 3839 func TestJetStreamSuperClusterGWOfflineSatus(t *testing.T) { 3840 orgEventsHBInterval := eventsHBInterval 3841 eventsHBInterval = 500 * time.Millisecond //time.Second 3842 defer func() { eventsHBInterval = orgEventsHBInterval }() 3843 3844 tmpl := ` 3845 listen: 
127.0.0.1:-1 3846 server_name: %s 3847 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 3848 3849 gateway { 3850 name: "local" 3851 listen: 127.0.0.1:-1 3852 } 3853 3854 cluster { 3855 name: %s 3856 listen: 127.0.0.1:%d 3857 routes = [%s] 3858 } 3859 3860 accounts { 3861 SYS { 3862 users [{user: sys, password: pwd}] 3863 } 3864 ONE { 3865 jetstream: enabled 3866 users [{user: one, password: pwd}] 3867 } 3868 } 3869 system_account=SYS 3870 ` 3871 c := createJetStreamClusterWithTemplate(t, tmpl, "local", 3) 3872 defer c.shutdown() 3873 3874 var gwURLs string 3875 for i, s := range c.servers { 3876 if i > 0 { 3877 gwURLs += "," 3878 } 3879 gwURLs += `"nats://` + s.GatewayAddr().String() + `"` 3880 } 3881 3882 tmpl2 := ` 3883 listen: 127.0.0.1:-1 3884 server_name: %s 3885 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 3886 3887 gateway { 3888 name: "remote" 3889 listen: 127.0.0.1:-1 3890 __remote__ 3891 } 3892 3893 cluster { 3894 name: %s 3895 listen: 127.0.0.1:%d 3896 routes = [%s] 3897 } 3898 3899 accounts { 3900 SYS { 3901 users [{user: sys, password: pwd}] 3902 } 3903 ONE { 3904 jetstream: enabled 3905 users [{user: one, password: pwd}] 3906 } 3907 } 3908 system_account=SYS 3909 ` 3910 c2 := createJetStreamClusterAndModHook(t, tmpl2, "remote", "R", 2, 16022, false, 3911 func(serverName, clusterName, storeDir, conf string) string { 3912 conf = strings.Replace(conf, "__remote__", fmt.Sprintf("gateways [ { name: 'local', urls: [%s] } ]", gwURLs), 1) 3913 return conf 3914 }) 3915 defer c2.shutdown() 3916 3917 for _, s := range c.servers { 3918 waitForOutboundGateways(t, s, 1, 2*time.Second) 3919 } 3920 for _, s := range c2.servers { 3921 waitForOutboundGateways(t, s, 1, 2*time.Second) 3922 } 3923 c.waitOnPeerCount(5) 3924 3925 // Simulate going offline without sending shutdown protocol 3926 for _, s := range c2.servers { 3927 c := s.getOutboundGatewayConnection("local") 3928 c.setNoReconnect() 3929 c.mu.Lock() 3930 c.nc.Close() 3931 c.mu.Unlock() 3932 } 3933 c2.shutdown() 3934 3935 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 3936 var ok int 3937 for _, s := range c.servers { 3938 jsz, err := s.Jsz(nil) 3939 if err != nil { 3940 return err 3941 } 3942 for _, r := range jsz.Meta.Replicas { 3943 if r.Name == "RS-1" && r.Offline { 3944 ok++ 3945 } else if r.Name == "RS-2" && r.Offline { 3946 ok++ 3947 } 3948 } 3949 } 3950 if ok != 2 { 3951 return fmt.Errorf("RS-1 or RS-2 still marked as online") 3952 } 3953 return nil 3954 }) 3955 } 3956 3957 func TestJetStreamSuperClusterMovingR1Stream(t *testing.T) { 3958 // Make C2 have some latency. 3959 gwm := gwProxyMap{ 3960 "C2": &gwProxy{ 3961 rtt: 10 * time.Millisecond, 3962 up: 1 * 1024 * 1024 * 1024, // 1gbit 3963 down: 1 * 1024 * 1024 * 1024, // 1gbit 3964 }, 3965 } 3966 sc := createJetStreamTaggedSuperClusterWithGWProxy(t, gwm) 3967 defer sc.shutdown() 3968 3969 nc, js := jsClientConnect(t, sc.clusterForName("C1").randomServer()) 3970 defer nc.Close() 3971 3972 _, err := js.AddStream(&nats.StreamConfig{ 3973 Name: "TEST", 3974 }) 3975 require_NoError(t, err) 3976 3977 toSend := 10_000 3978 for i := 0; i < toSend; i++ { 3979 _, err := js.PublishAsync("TEST", []byte("HELLO WORLD")) 3980 require_NoError(t, err) 3981 } 3982 select { 3983 case <-js.PublishAsyncComplete(): 3984 case <-time.After(5 * time.Second): 3985 t.Fatalf("Did not receive completion signal") 3986 } 3987 3988 // Have it move to GCP. 
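	// In this tagged super cluster the clusters carry per-cloud server tags, and
	// "cloud:gcp" maps to C2 here (that is what the checks below assert). Only
	// the placement tags change, so the update should move the R1 stream across
	// the gateway: extra peers are added in the target cluster, the data is
	// transferred over the latency/bandwidth-constrained proxy set up above, and
	// the stream finally shrinks back to a single replica in C2 with all 10k
	// messages intact.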
3989 _, err = js.UpdateStream(&nats.StreamConfig{ 3990 Name: "TEST", 3991 Placement: &nats.Placement{Tags: []string{"cloud:gcp"}}, 3992 }) 3993 require_NoError(t, err) 3994 3995 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3996 sc.waitOnStreamLeader(globalAccountName, "TEST") 3997 si, err := js.StreamInfo("TEST") 3998 if err != nil { 3999 return err 4000 } 4001 if si.Cluster.Name != "C2" { 4002 return fmt.Errorf("Wrong cluster: %q", si.Cluster.Name) 4003 } 4004 if si.Cluster.Leader == _EMPTY_ { 4005 return fmt.Errorf("No leader yet") 4006 } else if !strings.HasPrefix(si.Cluster.Leader, "C2") { 4007 return fmt.Errorf("Wrong leader: %q", si.Cluster.Leader) 4008 } 4009 // Now we want to see that we shrink back to the original R1. 4010 if len(si.Cluster.Replicas) != 0 { 4011 return fmt.Errorf("Expected 0 replicas, got %d", len(si.Cluster.Replicas)) 4012 } 4013 if si.State.Msgs != uint64(toSend) { 4014 return fmt.Errorf("Only seeing %d msgs", si.State.Msgs) 4015 } 4016 return nil 4017 }) 4018 } 4019 4020 // https://github.com/nats-io/nats-server/issues/4396 4021 func TestJetStreamSuperClusterR1StreamPeerRemove(t *testing.T) { 4022 sc := createJetStreamSuperCluster(t, 1, 3) 4023 defer sc.shutdown() 4024 4025 nc, js := jsClientConnect(t, sc.serverByName("C1-S1")) 4026 defer nc.Close() 4027 4028 _, err := js.AddStream(&nats.StreamConfig{ 4029 Name: "TEST", 4030 Subjects: []string{"foo"}, 4031 Replicas: 1, 4032 }) 4033 require_NoError(t, err) 4034 4035 si, err := js.StreamInfo("TEST") 4036 require_NoError(t, err) 4037 4038 // Call peer remove on the only peer, the leader. 4039 resp, err := nc.Request(fmt.Sprintf(JSApiStreamRemovePeerT, "TEST"), []byte(`{"peer":"`+si.Cluster.Leader+`"}`), time.Second) 4040 require_NoError(t, err) 4041 var rpr JSApiStreamRemovePeerResponse 4042 require_NoError(t, json.Unmarshal(resp.Data, &rpr)) 4043 require_False(t, rpr.Success) 4044 require_True(t, rpr.Error.ErrCode == 10075) 4045 4046 // Stream should still be in place and usable.
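	// A plain StreamInfo lookup doubles as that check: had the rejected
	// peer-remove torn down the lone replica, the request below would fail.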
4047 _, err = js.StreamInfo("TEST") 4048 require_NoError(t, err) 4049 } 4050 4051 func TestJetStreamSuperClusterConsumerPauseAdvisories(t *testing.T) { 4052 sc := createJetStreamSuperCluster(t, 3, 3) 4053 defer sc.shutdown() 4054 4055 nc, js := jsClientConnect(t, sc.randomServer()) 4056 defer nc.Close() 4057 4058 pauseReq := func(consumer string, deadline time.Time) time.Time { 4059 j, err := json.Marshal(JSApiConsumerPauseRequest{ 4060 PauseUntil: deadline, 4061 }) 4062 require_NoError(t, err) 4063 msg, err := nc.Request(fmt.Sprintf(JSApiConsumerPauseT, "TEST", consumer), j, time.Second) 4064 require_NoError(t, err) 4065 var res JSApiConsumerPauseResponse 4066 err = json.Unmarshal(msg.Data, &res) 4067 require_NoError(t, err) 4068 return res.PauseUntil 4069 } 4070 4071 checkAdvisory := func(msg *nats.Msg, shouldBePaused bool, deadline time.Time) { 4072 t.Helper() 4073 var advisory JSConsumerPauseAdvisory 4074 require_NoError(t, json.Unmarshal(msg.Data, &advisory)) 4075 require_Equal(t, advisory.Stream, "TEST") 4076 require_Equal(t, advisory.Consumer, "my_consumer") 4077 require_Equal(t, advisory.Paused, shouldBePaused) 4078 require_True(t, advisory.PauseUntil.Equal(deadline)) 4079 } 4080 4081 _, err := js.AddStream(&nats.StreamConfig{ 4082 Name: "TEST", 4083 Subjects: []string{"foo"}, 4084 Replicas: 3, 4085 }) 4086 require_NoError(t, err) 4087 4088 ch := make(chan *nats.Msg, 10) 4089 _, err = nc.ChanSubscribe(JSAdvisoryConsumerPausePre+".TEST.my_consumer", ch) 4090 require_NoError(t, err) 4091 4092 deadline := time.Now().Add(time.Second) 4093 jsTestPause_CreateOrUpdateConsumer(t, nc, ActionCreate, "TEST", ConsumerConfig{ 4094 Name: "my_consumer", 4095 PauseUntil: &deadline, 4096 Replicas: 3, 4097 }) 4098 4099 // First advisory should tell us that the consumer was paused 4100 // on creation. 4101 msg := require_ChanRead(t, ch, time.Second*2) 4102 checkAdvisory(msg, true, deadline) 4103 require_Len(t, len(ch), 0) // Should only receive one advisory. 4104 4105 // The second one for the unpause. 4106 msg = require_ChanRead(t, ch, time.Second*2) 4107 checkAdvisory(msg, false, deadline) 4108 require_Len(t, len(ch), 0) // Should only receive one advisory. 4109 4110 // Now we'll pause the consumer for a second using the API. 4111 deadline = time.Now().Add(time.Second) 4112 require_True(t, pauseReq("my_consumer", deadline).Equal(deadline)) 4113 4114 // Third advisory should tell us about the pause via the API. 4115 msg = require_ChanRead(t, ch, time.Second*2) 4116 checkAdvisory(msg, true, deadline) 4117 require_Len(t, len(ch), 0) // Should only receive one advisory. 4118 4119 // Finally that should unpause. 4120 msg = require_ChanRead(t, ch, time.Second*2) 4121 checkAdvisory(msg, false, deadline) 4122 require_Len(t, len(ch), 0) // Should only receive one advisory. 4123 4124 // Now we're going to set the deadline into the future so we can 4125 // see what happens when we kick leaders or restart. 4126 deadline = time.Now().Add(time.Hour) 4127 require_True(t, pauseReq("my_consumer", deadline).Equal(deadline)) 4128 4129 // Setting the deadline should have generated an advisory. 4130 msg = require_ChanRead(t, ch, time.Second*2) 4131 checkAdvisory(msg, true, deadline) 4132 require_Len(t, len(ch), 0) // Should only receive one advisory. 4133 }
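
// examplePauseConsumer is a minimal, illustrative sketch and not part of the
// original test file. It shows the consumer pause round trip exercised above,
// assuming the same package-level helpers and API types the tests use, a
// connection to a JetStream-enabled server, and an existing "TEST" stream with
// a "my_consumer" consumer; the helper name itself is hypothetical. A caller
// could invoke it right after creating the consumer, e.g.
// examplePauseConsumer(t, nc, time.Now().Add(time.Second)).
func examplePauseConsumer(t *testing.T, nc *nats.Conn, deadline time.Time) {
	t.Helper()

	// Watch for pause/resume advisories for this consumer.
	ch := make(chan *nats.Msg, 2)
	sub, err := nc.ChanSubscribe(JSAdvisoryConsumerPausePre+".TEST.my_consumer", ch)
	require_NoError(t, err)
	defer sub.Unsubscribe()

	// Ask the server to pause the consumer until the given deadline.
	req, err := json.Marshal(JSApiConsumerPauseRequest{PauseUntil: deadline})
	require_NoError(t, err)
	msg, err := nc.Request(fmt.Sprintf(JSApiConsumerPauseT, "TEST", "my_consumer"), req, time.Second)
	require_NoError(t, err)

	// The response echoes the effective deadline.
	var res JSApiConsumerPauseResponse
	require_NoError(t, json.Unmarshal(msg.Data, &res))
	require_True(t, res.PauseUntil.Equal(deadline))

	// Pausing generates one advisory with Paused set; once the deadline passes,
	// the unpause generates a second one with Paused cleared.
	var adv JSConsumerPauseAdvisory
	require_NoError(t, json.Unmarshal(require_ChanRead(t, ch, 2*time.Second).Data, &adv))
	require_Equal(t, adv.Stream, "TEST")
	require_Equal(t, adv.Consumer, "my_consumer")
	require_True(t, adv.Paused)
	require_True(t, adv.PauseUntil.Equal(deadline))
}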