get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/test/client_cluster_test.go

     1  // Copyright 2013-2019 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package test
    15  
    16  import (
    17  	"fmt"
    18  	"math/rand"
    19  	"sync"
    20  	"sync/atomic"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/nats-io/nats.go"
    25  )
    26  
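         // Exercises many clients reconnecting across a two-server cluster while one
         // server restarts; the name refers to a historical re-slice issue in the
         // client's server pool handling that this scenario is meant to cover.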
    27  func TestServerRestartReSliceIssue(t *testing.T) {
    28  	srvA, srvB, optsA, optsB := runServers(t)
    29  	defer srvA.Shutdown()
    30  
    31  	urlA := fmt.Sprintf("nats://%s:%d/", optsA.Host, optsA.Port)
    32  	urlB := fmt.Sprintf("nats://%s:%d/", optsB.Host, optsB.Port)
    33  
    34  	// msg to send..
    35  	msg := []byte("Hello World")
    36  
    37  	servers := []string{urlA, urlB}
    38  
    39  	opts := nats.GetDefaultOptions()
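         	// Aggressive reconnect settings so clients recover quickly after the restart below.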
     40  	opts.Timeout = 5 * time.Second
     41  	opts.ReconnectWait = 50 * time.Millisecond
    42  	opts.MaxReconnect = 1000
    43  
    44  	numClients := 20
    45  
    46  	reconnects := int32(0)
    47  	reconnectsDone := make(chan bool, numClients)
    48  	opts.ReconnectedCB = func(nc *nats.Conn) {
    49  		atomic.AddInt32(&reconnects, 1)
    50  		reconnectsDone <- true
    51  	}
    52  
    53  	clients := make([]*nats.Conn, numClients)
    54  
    55  	// Create 20 random clients.
    56  	// Half connected to A and half to B..
    57  	for i := 0; i < numClients; i++ {
    58  		opts.Url = servers[i%2]
    59  		nc, err := opts.Connect()
    60  		if err != nil {
    61  			t.Fatalf("Failed to create connection: %v\n", err)
    62  		}
    63  		clients[i] = nc
    64  		defer nc.Close()
    65  
    66  		// Create 10 subscriptions each..
    67  		for x := 0; x < 10; x++ {
    68  			subject := fmt.Sprintf("foo.%d", (rand.Int()%50)+1)
    69  			nc.Subscribe(subject, func(m *nats.Msg) {
    70  				// Just eat it..
    71  			})
    72  		}
    73  		// Pick one subject to send to..
    74  		subject := fmt.Sprintf("foo.%d", (rand.Int()%50)+1)
    75  		go func() {
    76  			time.Sleep(10 * time.Millisecond)
    77  			for i := 1; i <= 100; i++ {
    78  				if err := nc.Publish(subject, msg); err != nil {
    79  					return
    80  				}
    81  				if i%10 == 0 {
    82  					time.Sleep(time.Millisecond)
    83  				}
    84  			}
    85  		}()
    86  	}
    87  
    88  	// Wait for a short bit..
    89  	time.Sleep(20 * time.Millisecond)
    90  
    91  	// Restart SrvB
    92  	srvB.Shutdown()
    93  	srvB = RunServer(optsB)
    94  	defer srvB.Shutdown()
    95  
    96  	// Check that all of the expected clients (those that were connected
    97  	// to srvB) have reconnected before proceeding.
    98  	for i := 0; i < numClients/2; i++ {
    99  		select {
   100  		case <-reconnectsDone:
   101  		case <-time.After(3 * time.Second):
   102  			t.Fatalf("Expected %d reconnects, got %d\n", numClients/2, atomic.LoadInt32(&reconnects))
   103  		}
   104  	}
   106  
   107  	// Since srvB was restarted, its defer Shutdown() was registered last, so
   108  	// it will execute first, which would cause clients that have reconnected
   109  	// to it to try to reconnect again (causing delays on Windows). So let's
   110  	// explicitly close them here.
   111  	// NOTE: With the fix in the NATS Go client (reconnect loop yields to Close()),
   112  	//       this change would not be required; however, it still speeds up
   113  	//       the test from more than 7s to less than one.
   114  	for i := 0; i < numClients; i++ {
   115  		nc := clients[i]
   116  		nc.Close()
   117  	}
   118  }
   119  
   120  // This will test queue subscriber semantics across a cluster in the presence
   121  // of server restarts.
   122  func TestServerRestartAndQueueSubs(t *testing.T) {
   123  	srvA, srvB, optsA, optsB := runServers(t)
   124  	defer srvA.Shutdown()
   125  	defer srvB.Shutdown()
   126  
   127  	urlA := fmt.Sprintf("nats://%s:%d/", optsA.Host, optsA.Port)
   128  	urlB := fmt.Sprintf("nats://%s:%d/", optsB.Host, optsB.Port)
   129  
   130  	// Client options
   131  	opts := nats.GetDefaultOptions()
   132  	opts.Timeout = 5 * time.Second
   133  	opts.ReconnectWait = 20 * time.Millisecond
   134  	opts.MaxReconnect = 1000
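         	// Keep the server list order so nc1 (A first) and nc2 (B first) start on different servers.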
   135  	opts.NoRandomize = true
   136  
   137  	// Allow us to block on a reconnect completion.
   138  	reconnectsDone := make(chan bool)
   139  	opts.ReconnectedCB = func(nc *nats.Conn) {
   140  		reconnectsDone <- true
   141  	}
   142  
   143  	// Helper to wait on a reconnect.
   144  	waitOnReconnect := func() {
   145  		t.Helper()
   146  		select {
   147  		case <-reconnectsDone:
   148  		case <-time.After(2 * time.Second):
   149  			t.Fatalf("Expected a reconnect, timed out!\n")
   150  		}
   151  	}
   152  
   153  	// Create two clients..
   154  	opts.Servers = []string{urlA, urlB}
   155  	nc1, err := opts.Connect()
   156  	if err != nil {
   157  		t.Fatalf("Failed to create connection for nc1: %v\n", err)
   158  	}
   159  
   160  	opts.Servers = []string{urlB, urlA}
   161  	nc2, err := opts.Connect()
   162  	if err != nil {
   163  		t.Fatalf("Failed to create connection for nc2: %v\n", err)
   164  	}
   165  
   166  	c1, _ := nats.NewEncodedConn(nc1, nats.JSON_ENCODER)
   167  	defer c1.Close()
   168  	c2, _ := nats.NewEncodedConn(nc2, nats.JSON_ENCODER)
   169  	defer c2.Close()
   170  
   171  	// Flusher helper function.
   172  	flush := func() {
   173  		// Wait for processing.
   174  		c1.Flush()
   175  		c2.Flush()
   176  		// Wait for a short bit for cluster propagation.
   177  		time.Sleep(50 * time.Millisecond)
   178  	}
   179  
   180  	// To hold queue results.
   181  	results := make(map[int]int)
   182  	var mu sync.Mutex
   183  
   184  	// This corresponds to the subscriptions below.
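         	// Each message is delivered once to the "workers" queue group (either c1 or c2)
         	// and once to each of the two wildcard subscriptions, hence 3 per seqno.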
   185  	const ExpectedMsgCount = 3
   186  
   187  	// Make sure we got what we needed: all seqnos accounted for, each delivered exactly ExpectedMsgCount times.
   188  	checkResults := func(numSent int) {
   189  		mu.Lock()
   190  		defer mu.Unlock()
   191  
   192  		for i := 0; i < numSent; i++ {
   193  			if results[i] != ExpectedMsgCount {
   194  				t.Fatalf("Received incorrect number of messages, [%d] vs [%d] for seq: %d\n", results[i], ExpectedMsgCount, i)
   195  			}
   196  		}
   197  
   198  		// Reset the results map for the next round.
   199  		results = make(map[int]int)
   200  	}
   201  
   202  	subj := "foo.bar"
   203  	qgroup := "workers"
   204  
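         	// cb tallies how many times each sequence number has been delivered.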
   205  	cb := func(seqno int) {
   206  		mu.Lock()
   207  		defer mu.Unlock()
   208  		results[seqno] = results[seqno] + 1
   209  	}
   210  
   211  	// Create queue subscribers
   212  	c1.QueueSubscribe(subj, qgroup, cb)
   213  	c2.QueueSubscribe(subj, qgroup, cb)
   214  
   215  	// Do a wildcard subscription.
   216  	c1.Subscribe("foo.*", cb)
   217  	c2.Subscribe("foo.*", cb)
   218  
   219  	// Wait for processing.
   220  	flush()
   221  
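         	// sendAndCheckMsgs alternates publishes between c1 and c2, flushes, and then
         	// verifies each seqno was delivered exactly ExpectedMsgCount times.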
   222  	sendAndCheckMsgs := func(numToSend int) {
   223  		for i := 0; i < numToSend; i++ {
   224  			if i%2 == 0 {
   225  				c1.Publish(subj, i)
   226  			} else {
   227  				c2.Publish(subj, i)
   228  			}
   229  		}
   230  		// Wait for processing.
   231  		flush()
   232  		// Check Results
   233  		checkResults(numToSend)
   234  	}
   235  
   236  	////////////////////////////////////////////////////////////////////////////
   237  	// Base Test
   238  	////////////////////////////////////////////////////////////////////////////
   239  
   240  	// Make sure subscriptions are propagated in the cluster
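         	// Each client holds 1 queue sub and 1 wildcard sub; with the routed interest
         	// from the other server, each server should report 4.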
   241  	if err := checkExpectedSubs(4, srvA, srvB); err != nil {
   242  		t.Fatalf("%v", err)
   243  	}
   244  
   245  	// Now send 10 messages, alternating between the two clients..
   246  	sendAndCheckMsgs(10)
   247  
   248  	////////////////////////////////////////////////////////////////////////////
   249  	// Now restart SrvA and srvB, re-run test
   250  	////////////////////////////////////////////////////////////////////////////
   251  
   252  	srvA.Shutdown()
   253  	// Wait for client on A to reconnect to B.
   254  	waitOnReconnect()
   255  
   256  	srvA = RunServer(optsA)
   257  	defer srvA.Shutdown()
   258  
   259  	srvB.Shutdown()
   260  	// Now both clients should reconnect to A.
   261  	waitOnReconnect()
   262  	waitOnReconnect()
   263  
   264  	srvB = RunServer(optsB)
   265  	defer srvB.Shutdown()
   266  
   267  	// Make sure the cluster is reformed
   268  	checkClusterFormed(t, srvA, srvB)
   269  
   270  	// Make sure subscriptions are propagated in the cluster
   271  	// Clients will be connected to srvA, so that will be 4,
   272  	// but srvB will only have 2 now since we coalesce.
   273  	if err := checkExpectedSubs(4, srvA); err != nil {
   274  		t.Fatalf("%v", err)
   275  	}
   276  	if err := checkExpectedSubs(2, srvB); err != nil {
   277  		t.Fatalf("%v", err)
   278  	}
   279  
   280  	// Now send another 10 messages, alternating between the two clients..
   281  	sendAndCheckMsgs(10)
   282  
   283  	// Since the servers are restarted after the clients' deferred Close() calls
   284  	// were registered, their defer Shutdown() calls run first on exit, which
   285  	// would cause the clients to try to reconnect, causing delays on Windows.
   286  	// So let's explicitly close the clients here.
   287  	c1.Close()
   288  	c2.Close()
   289  }
   290  
   291  // This will test request semantics across a route
   292  func TestRequestsAcrossRoutes(t *testing.T) {
   293  	srvA, srvB, optsA, optsB := runServers(t)
   294  	defer srvA.Shutdown()
   295  	defer srvB.Shutdown()
   296  
   297  	urlA := fmt.Sprintf("nats://%s:%d/", optsA.Host, optsA.Port)
   298  	urlB := fmt.Sprintf("nats://%s:%d/", optsB.Host, optsB.Port)
   299  
   300  	nc1, err := nats.Connect(urlA)
   301  	if err != nil {
   302  		t.Fatalf("Failed to create connection for nc1: %v\n", err)
   303  	}
   304  	defer nc1.Close()
   305  
   306  	nc2, err := nats.Connect(urlB)
   307  	if err != nil {
   308  		t.Fatalf("Failed to create connection for nc2: %v\n", err)
   309  	}
   310  	defer nc2.Close()
   311  
   312  	ec2, _ := nats.NewEncodedConn(nc2, nats.JSON_ENCODER)
   313  
   314  	response := []byte("I will help you")
   315  
   316  	// Connect responder to srvA
   317  	nc1.Subscribe("foo-req", func(m *nats.Msg) {
   318  		nc1.Publish(m.Reply, response)
   319  	})
   320  	// Make sure the route and the subscription are propagated.
   321  	nc1.Flush()
   322  
   323  	if err := checkExpectedSubs(1, srvA, srvB); err != nil {
   324  		t.Fatalf("%v", err)
   325  	}
   326  
   327  	var resp string
   328  
   329  	for i := 0; i < 100; i++ {
   330  		if err := ec2.Request("foo-req", i, &resp, 250*time.Millisecond); err != nil {
   331  			t.Fatalf("Received an error on Request test [%d]: %s", i, err)
   332  		}
   333  	}
   334  }
   335  
   336  // This will test request semantics across a route to queues
   337  func TestRequestsAcrossRoutesToQueues(t *testing.T) {
   338  	srvA, srvB, optsA, optsB := runServers(t)
   339  	defer srvA.Shutdown()
   340  	defer srvB.Shutdown()
   341  
   342  	urlA := fmt.Sprintf("nats://%s:%d/", optsA.Host, optsA.Port)
   343  	urlB := fmt.Sprintf("nats://%s:%d/", optsB.Host, optsB.Port)
   344  
   345  	nc1, err := nats.Connect(urlA)
   346  	if err != nil {
   347  		t.Fatalf("Failed to create connection for nc1: %v\n", err)
   348  	}
   349  	defer nc1.Close()
   350  
   351  	nc2, err := nats.Connect(urlB)
   352  	if err != nil {
   353  		t.Fatalf("Failed to create connection for nc2: %v\n", err)
   354  	}
   355  	defer nc2.Close()
   356  
   357  	ec1, _ := nats.NewEncodedConn(nc1, nats.JSON_ENCODER)
   358  	ec2, _ := nats.NewEncodedConn(nc2, nats.JSON_ENCODER)
   359  
   360  	response := []byte("I will help you")
   361  
   362  	// Connect one responder to srvA
   363  	nc1.QueueSubscribe("foo-req", "booboo", func(m *nats.Msg) {
   364  		nc1.Publish(m.Reply, response)
   365  	})
   366  	// Make sure the route and the subscription are propagated.
   367  	nc1.Flush()
   368  
   369  	// Connect the other responder to srvB
   370  	nc2.QueueSubscribe("foo-req", "booboo", func(m *nats.Msg) {
   371  		nc2.Publish(m.Reply, response)
   372  	})
   373  
   374  	if err := checkExpectedSubs(2, srvA, srvB); err != nil {
   375  		t.Fatalf("%v", err)
   376  	}
   377  
   378  	var resp string
   379  
   380  	for i := 0; i < 100; i++ {
   381  		if err := ec2.Request("foo-req", i, &resp, 500*time.Millisecond); err != nil {
   382  			t.Fatalf("Received an error on Request test [%d]: %s", i, err)
   383  		}
   384  	}
   385  
   386  	for i := 0; i < 100; i++ {
   387  		if err := ec1.Request("foo-req", i, &resp, 500*time.Millisecond); err != nil {
   388  			t.Fatalf("Received an error on Request test [%d]: %s", i, err)
   389  		}
   390  	}
   391  }
   392  
   393  // This is in response to Issue #1144
   394  // https://github.com/nats-io/nats-server/issues/1144
   395  func TestQueueDistributionAcrossRoutes(t *testing.T) {
   396  	srvA, srvB, _, _ := runServers(t)
   397  	defer srvA.Shutdown()
   398  	defer srvB.Shutdown()
   399  
   400  	checkClusterFormed(t, srvA, srvB)
   401  
   402  	urlA := srvA.ClientURL()
   403  	urlB := srvB.ClientURL()
   404  
   405  	nc1, err := nats.Connect(urlA)
   406  	if err != nil {
   407  		t.Fatalf("Failed to create connection for nc1: %v\n", err)
   408  	}
   409  	defer nc1.Close()
   410  
   411  	nc2, err := nats.Connect(urlB)
   412  	if err != nil {
   413  		t.Fatalf("Failed to create connection for nc2: %v\n", err)
   414  	}
   415  	defer nc2.Close()
   416  
   417  	var qsubs []*nats.Subscription
   418  
   419  	// Connect queue subscriptions as mentioned in the issue. 2(A) - 6(B) - 4(A)
   420  	for i := 0; i < 2; i++ {
   421  		sub, _ := nc1.QueueSubscribeSync("foo", "bar")
   422  		qsubs = append(qsubs, sub)
   423  	}
   424  	nc1.Flush()
   425  	for i := 0; i < 6; i++ {
   426  		sub, _ := nc2.QueueSubscribeSync("foo", "bar")
   427  		qsubs = append(qsubs, sub)
   428  	}
   429  	nc2.Flush()
   430  	for i := 0; i < 4; i++ {
   431  		sub, _ := nc1.QueueSubscribeSync("foo", "bar")
   432  		qsubs = append(qsubs, sub)
   433  	}
   434  	nc1.Flush()
   435  
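         	// 6 local qsubs on each server plus the coalesced routed interest from the
         	// other server should yield 7 subscriptions per server.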
   436  	if err := checkExpectedSubs(7, srvA, srvB); err != nil {
   437  		t.Fatalf("%v", err)
   438  	}
   439  
   440  	send := 10000
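         	// Publish everything from nc2 (connected to srvB) so delivery to the qsubs
         	// on srvA has to cross the route.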
   441  	for i := 0; i < send; i++ {
   442  		nc2.Publish("foo", nil)
   443  	}
   444  	nc2.Flush()
   445  
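         	// tp returns the total number of pending messages across all queue subs.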
   446  	tp := func() int {
   447  		var total int
   448  		for i := 0; i < len(qsubs); i++ {
   449  			pending, _, _ := qsubs[i].Pending()
   450  			total += pending
   451  		}
   452  		return total
   453  	}
   454  
   455  	checkFor(t, time.Second, 10*time.Millisecond, func() error {
   456  		if total := tp(); total != send {
   457  			return fmt.Errorf("Expected %d total pending messages, got %d", send, total)
   458  		}
   459  		return nil
   460  	})
   461  
   462  	// The bug is essentially that when we deliver across a route, we
   463  	// prefer locals, but if we randomize to a block of bounce backs, then
   464  	// we walk to the end and find the same local for all the remote options.
   465  	// So what you will see in this case is a large value at #9 (2+6, next one local).
   466  
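         	// With an even split each qsub would hold about send/len(qsubs) pending
         	// messages; allow up to 30% above that average.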
   467  	avg := send / len(qsubs)
   468  	for i := 0; i < len(qsubs); i++ {
   469  		total, _, _ := qsubs[i].Pending()
   470  		if total > avg+(avg*3/10) {
   471  			if i == 8 {
   472  				t.Fatalf("Qsub at index 8 received the majority of the messages (the 6 prior remote positions bounced back to this local sub)")
   473  			}
   474  			t.Fatalf("Qsub %d received too many messages: %d vs average %d", i, total, avg)
   475  		}
   476  	}
   477  }