github.com/swiftstack/ProxyFS@v0.0.0-20210203235616-4017c267d62f/retryrpc/stress_test.go

// Copyright (c) 2015-2021, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0

package retryrpc

import (
	"fmt"
	"math/rand"
	"sync"
	"testing"
	"time"

	/* DEBUG for pprof
	_ "net/http/pprof"
	*/

	"github.com/stretchr/testify/assert"
	"github.com/swiftstack/ProxyFS/retryrpc/rpctest"
)

func TestStress(t *testing.T) {

	/*
	 * DEBUG - used to debug memory leaks
	 *
	 * Uncomment the net/http/pprof import above and start the web server
	 * that listens for pprof requests:
	 *
	 *	go http.ListenAndServe("localhost:12123", nil)
	 *
	 * Then run "go tool pprof http://localhost:12123/debug/pprof/heap"
	 * to look at memory in use.
	 */

	testLoop(t)
	testLoopClientAckTrim(t)
	testLoopTTLTrim(t)
	testSendLargeRPC(t)
}
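
// The four sub-tests below share the same boot sequence. For reference, here
// is a minimal sketch of that sequence factored into one helper. The helper
// name and its parameter names are hypothetical (inferred from how the
// sub-tests call getNewServer()); the sub-tests do not call it.
func startStressServer(t *testing.T, ttl time.Duration, trimEnabled bool) (rrSvr *Server, ipAddr string, port int) {
	assert := assert.New(t)

	// rpctest.NewServer() supplies the RPC methods (e.g. RpcPing, RpcPingLarge)
	myJrpcfs := rpctest.NewServer()

	// Build the retryrpc server; the arguments mirror the sub-tests' usage
	rrSvr, ipAddr, port = getNewServer(ttl, trimEnabled)
	assert.NotNil(rrSvr)

	// Register the methods, open the listener, then accept requests
	assert.Nil(rrSvr.Register(myJrpcfs))
	assert.Nil(rrSvr.Start())
	rrSvr.Run()
	return
}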

func testLoop(t *testing.T) {
	var (
		agentCount = 15
		sendCount  = 250
	)
	assert := assert.New(t)
	zero := 0
	assert.Equal(0, zero)

	// Create new rpctest server - needed for calling RPCs
	myJrpcfs := rpctest.NewServer()

	rrSvr, ipAddr, port := getNewServer(65*time.Second, false)
	assert.NotNil(rrSvr)

	// Register the server - sets up the methods it supports
	err := rrSvr.Register(myJrpcfs)
	assert.Nil(err)

	// Start listening for requests on the ipAddr/port
	startErr := rrSvr.Start()
	assert.Nil(startErr, "startErr is not nil")

	// Tell the server to start accepting and processing requests
	rrSvr.Run()

	// Start up the agents
	parallelAgentSenders(t, rrSvr, ipAddr, port, agentCount, "RpcPing", sendCount, rrSvr.Creds.RootCAx509CertificatePEM)

	rrSvr.Close()
}

// testLoopClientAckTrim tests that we are correctly trimming messages based
// on the shorter-term trimmer. The shorter-term trimmer relies on the client
// saying "this is the highest consecutive sequence number (sqn) we have
// seen". The server can then throw away messages up to and including that
// highest consecutive sqn.
func testLoopClientAckTrim(t *testing.T) {
	var (
		agentCount = 15
		sendCount  = 250
	)
	assert := assert.New(t)
	zero := 0
	assert.Equal(0, zero)

	// Create new rpctest server - needed for calling RPCs
	myJrpcfs := rpctest.NewServer()

	whenTTL := 10 * time.Millisecond
	rrSvr, ipAddr, port := getNewServer(whenTTL, true)
	assert.NotNil(rrSvr)

	// Register the server - sets up the methods it supports
	err := rrSvr.Register(myJrpcfs)
	assert.Nil(err)

	// Start listening for requests on the ipAddr/port
	startErr := rrSvr.Start()
	assert.Nil(startErr, "startErr is not nil")

	// Tell the server to start accepting and processing requests
	rrSvr.Run()

	// Start up the agents
	parallelAgentSenders(t, rrSvr, ipAddr, port, agentCount, "RpcPing", sendCount, rrSvr.Creds.RootCAx509CertificatePEM)

	// Now run both trimmers
	tm := time.Now()

	// First the 100ms trimmer - this will leave one entry on the completed
	// request queue for each agent, since no remaining client request marks
	// it as completed.
	//
	// We need the TTL trimmer to clean up that last entry.
	rrSvr.trimCompleted(tm, false)
	assert.Equal(agentCount, cntNotTrimmed(rrSvr), "Should have agentCount messages remaining")

	// Make sure the queued messages are old enough to be trimmed
	time.Sleep(whenTTL)

	// Now run the TTL trimmer to clean up the last entry per agent
	tmTTL := time.Now()
	rrSvr.trimCompleted(tmTTL, true)

	// All messages should be trimmed at this point
	assert.Equal(0, cntNotTrimmed(rrSvr), "Still have incomplete messages")

	/*
	 * DEBUG - allows use of pprof to check for memory leaks. The test will
	 * block here so the heap can be inspected:
	 *
	 *	fmt.Printf("\n=========== SLEEP 5 minutes ===================\n")
	 *	time.Sleep(5 * time.Minute)
	 */

	rrSvr.Close()
}
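
// To make "highest consecutive sqn" concrete: if the client has seen replies
// for sqns {1, 2, 3, 5}, the server may discard 1-3 but must keep 5. Below is
// a minimal, self-contained sketch of that rule. It is illustrative only -
// this is not the server's real bookkeeping, and it assumes sqns start at 1.
func highestConsecutiveSqn(seen map[uint64]bool) (highest uint64) {
	// Walk upward from 1; stop at the first gap
	for sqn := uint64(1); seen[sqn]; sqn++ {
		highest = sqn
	}
	return
}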

func testLoopTTLTrim(t *testing.T) {
	var (
		agentCount = 15
		sendCount  = 250
	)
	assert := assert.New(t)
	zero := 0
	assert.Equal(0, zero)

	// Create new rpctest server - needed for calling RPCs
	myJrpcfs := rpctest.NewServer()

	whenTTL := 10 * time.Millisecond
	rrSvr, ipAddr, port := getNewServer(whenTTL, true)
	assert.NotNil(rrSvr)

	// Register the server - sets up the methods it supports
	err := rrSvr.Register(myJrpcfs)
	assert.Nil(err)

	// Start listening for requests on the ipAddr/port
	startErr := rrSvr.Start()
	assert.Nil(startErr, "startErr is not nil")

	// Tell the server to start accepting and processing requests
	rrSvr.Run()

	// Start up the agents
	parallelAgentSenders(t, rrSvr, ipAddr, port, agentCount, "RpcPing", sendCount, rrSvr.Creds.RootCAx509CertificatePEM)

	// Use the TTL trimmer to remove all messages after guaranteeing we are
	// past the time when they should be removed
	time.Sleep(whenTTL)
	tmTTL := time.Now()
	rrSvr.trimCompleted(tmTTL, true)

	assert.Equal(0, cntNotTrimmed(rrSvr), "Still have incomplete messages")

	/*
	 * DEBUG - allow time for the pprof tool to be used to track down
	 * memory leaks:
	 *
	 *	fmt.Printf("\n=========== SLEEP 5 minutes ===================\n")
	 *	time.Sleep(5 * time.Minute)
	 */

	rrSvr.Close()
}
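
// The TTL rule exercised above, stated as a predicate. Illustrative only -
// the real bookkeeping lives in trimCompleted(): a completed request becomes
// trimmable once at least whenTTL has elapsed since it completed.
func ttlExpired(completedAt time.Time, now time.Time, ttl time.Duration) bool {
	// Equivalent to: now >= completedAt + ttl
	return !now.Before(completedAt.Add(ttl))
}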

func testSendLargeRPC(t *testing.T) {
	var (
		agentCount = 15
		sendCount  = 250
	)
	assert := assert.New(t)
	zero := 0
	assert.Equal(0, zero)

	// Create new rpctest server - needed for calling RPCs
	myJrpcfs := rpctest.NewServer()

	whenTTL := 10 * time.Millisecond
	rrSvr, ipAddr, port := getNewServer(whenTTL, true)
	assert.NotNil(rrSvr)

	// Register the server - sets up the methods it supports
	err := rrSvr.Register(myJrpcfs)
	assert.Nil(err)

	// Start listening for requests on the ipAddr/port
	startErr := rrSvr.Start()
	assert.Nil(startErr, "startErr is not nil")

	// Tell the server to start accepting and processing requests
	rrSvr.Run()

	// Start up the agents
	parallelAgentSenders(t, rrSvr, ipAddr, port, agentCount, "RpcPingLarge", sendCount, rrSvr.Creds.RootCAx509CertificatePEM)

	// Now run both trimmers
	tm := time.Now()

	// First the 100ms trimmer - this will leave one entry on the completed
	// request queue for each agent, since no remaining client request marks
	// it as completed.
	//
	// We need the TTL trimmer to clean up that last entry.
	rrSvr.trimCompleted(tm, false)
	assert.Equal(agentCount, cntNotTrimmed(rrSvr), "Should have agentCount messages remaining")

	// Make sure the queued messages are old enough to be trimmed
	time.Sleep(whenTTL)

	// Now run the TTL trimmer to clean up the last entry per agent
	tmTTL := time.Now()
	rrSvr.trimCompleted(tmTTL, true)

	/*
	 * DEBUG - sleep for a while so the pprof tool can be used to track
	 * down memory leaks:
	 *
	 *	fmt.Printf("\n=========== SLEEP 5 minutes ===================\n")
	 *	time.Sleep(5 * time.Minute)
	 */

	// All messages should be trimmed at this point
	assert.Equal(0, cntNotTrimmed(rrSvr), "Still have incomplete messages")

	rrSvr.Close()
}

// cntNotTrimmed returns the number of completed requests, across all
// clients, that have not yet been trimmed
func cntNotTrimmed(server *Server) (numItems int) {
	server.Lock()
	for _, ci := range server.perClientInfo {
		ci.Lock()
		if len(ci.completedRequest) != 0 {
			numItems += len(ci.completedRequest)
		} else {
			numItems += ci.completedRequestLRU.Len()
		}
		ci.Unlock()
	}
	server.Unlock()

	return
}

func ping(t *testing.T, client *Client, i int, agentID uint64, assert *assert.Assertions) {
	// Send a ping RPC and verify the results
	msg := fmt.Sprintf("Ping Me - %v", i)
	pingRequest := &rpctest.PingReq{Message: msg}
	pingReply := &rpctest.PingReply{}
	expectedReply := fmt.Sprintf("pong %d bytes", len(msg))
	err := client.Send("RpcPing", pingRequest, pingReply)
	assert.Nil(err, "client.Send() returned an error")
	if expectedReply != pingReply.Message {
		fmt.Printf("         client - AGENTID: %v\n", agentID)
		fmt.Printf("         client.Send(RpcPing) reply '%+v'\n", pingReply)
		fmt.Printf("         client.Send(RpcPing) expected '%s' but received '%s'\n", expectedReply, pingReply.Message)
		fmt.Printf("         client.Send(RpcPing) SENT: msg '%v' but received '%s'\n", msg, pingReply.Message)
		fmt.Printf("         client.Send(RpcPing) len(pingRequest.Message): '%d' i: %v\n", len(pingRequest.Message), i)
	}
	assert.Equal(expectedReply, pingReply.Message, "Received different output than expected")
}

// pingLarge sends an RPC whose reply is a large packet
func pingLarge(t *testing.T, client *Client, i int, agentID uint64, assert *assert.Assertions) {
	// Send a ping RPC; the server responds with a large payload
	msg := fmt.Sprintf("Ping Me - %v", i)
	pingRequest := &rpctest.PingReq{Message: msg}
	pingReply := &rpctest.PingReply{}
	err := client.Send("RpcPingLarge", pingRequest, pingReply)
	assert.Nil(err, "client.Send() returned an error")
}

func sendIt(t *testing.T, client *Client, z int, sendCnt int, sendWg *sync.WaitGroup, prevWg *sync.WaitGroup, agentID uint64, method string, i int) {

	assert := assert.New(t)
	defer sendWg.Done()

	switch method {
	case "RpcPing":
		ping(t, client, z, agentID, assert)
	case "RpcPingLarge":
		pingLarge(t, client, z, agentID, assert)
	}

	// Every send except the last decrements prevWg; the last send (in
	// pfsagent) is blocked until all earlier sends have completed. This
	// is how we test the short trimmer.
	if i <= (sendCnt - 2) {
		prevWg.Done()
	}
}
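
// The prevWg handshake in isolation: n-1 workers each call Done() and the
// final actor Wait()s for them before proceeding, which is what lets the
// last send observe every earlier send as complete. A self-contained sketch
// (not called by the test):
func lastAfterOthers(n int) {
	var prev sync.WaitGroup
	prev.Add(n - 1)
	for i := 0; i < n-1; i++ {
		go prev.Done() // an earlier "send" completing
	}
	// The final "send" proceeds only after all earlier ones have finished
	prev.Wait()
}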

type stressMyClient struct {
	sync.Mutex
	cond         *sync.Cond // Signals receipt of an Interrupt() callback
	sawCallback  bool       // True if Interrupt() was called
	interruptCnt int        // Count of Interrupt() calls received (best effort)
}

func (cb *stressMyClient) Interrupt(payload []byte) {
	cb.Lock()
	cb.sawCallback = true
	cb.interruptCnt++
	cb.cond.Broadcast()
	cb.Unlock()
}
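
// The stress test never blocks on the callback, but the cond makes that
// possible. A sketch of such a waiter (hypothetical method; nothing below
// calls it):
func (cb *stressMyClient) waitForInterrupt() {
	cb.Lock()
	for !cb.sawCallback {
		// Wait() releases the mutex while blocked and reacquires it on wakeup
		cb.cond.Wait()
	}
	cb.Unlock()
}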

// pfsagent represents a pfsagent - each agent is a separate client
func pfsagent(t *testing.T, rrSvr *Server, ipAddr string, port int, agentID uint64, method string,
	agentWg *sync.WaitGroup, sendCnt int, rootCAx509CertificatePEM []byte) {
	defer agentWg.Done()

	cb := &stressMyClient{}
	cb.cond = sync.NewCond(&cb.Mutex)
	clientID := fmt.Sprintf("client - %v", agentID)
	clientConfig := &ClientConfig{MyUniqueID: clientID, IPAddr: ipAddr, Port: port,
		RootCAx509CertificatePEM: rootCAx509CertificatePEM, Callbacks: cb, DeadlineIO: 5 * time.Second}
	client, err := NewClient(clientConfig)
	if err != nil {
		fmt.Printf("Dial() failed with err: %v\n", err)
		return
	}
	defer client.Close()

	// WG to verify that all messages have been sent
	var sendWg sync.WaitGroup

	// WG to verify that all but the last send() have been sent and
	// received. This is needed to verify that the consecutive-sequence
	// trimmer is working.
	var prevWg sync.WaitGroup

	var z, r int
	msg1 := []byte("server msg back to client")
	for i := 0; i < sendCnt; i++ {

		z = (z + i) * 10

		if i == (sendCnt - 1) {
			// Give the server time to process messages. This last
			// call gets us closer to highestConsecutive being set
			// to sendCnt - 1.
			prevWg.Wait()

			// The highest consecutive number is updated in the background
			// by a goroutine when send() returns.
			//
			// Therefore, we loop waiting for it to hit (sendCnt - 1).
			for {
				var currentHighest requestID
				client.Lock()
				currentHighest = client.highestConsecutive
				client.Unlock()

				if int(currentHighest) == (sendCnt - 1) {
					break
				}
				time.Sleep(10 * time.Millisecond)
			}
		} else {
			prevWg.Add(1)
		}

		sendWg.Add(1)
		go func(z int, i int) {
			sendIt(t, client, z, sendCnt, &sendWg, &prevWg, agentID, method, i)
			rrSvr.SendCallback(clientID, msg1)
		}(z, i)

		// Occasionally drop the connection to the server to
		// simulate retransmits
		r = i % 10
		if r == 0 && (i != 0) {
			rrSvr.CloseClientConn()
		}
	}
	sendWg.Wait()
}
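
// The poll loop inside pfsagent, factored out for readability (hypothetical
// helper; pfsagent inlines this logic rather than calling it):
func waitForHighestConsecutive(client *Client, target int) {
	for {
		client.Lock()
		currentHighest := int(client.highestConsecutive)
		client.Unlock()

		if currentHighest == target {
			return
		}
		// Not there yet - the background goroutine is still acking sends
		time.Sleep(10 * time.Millisecond)
	}
}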

// parallelAgentSenders starts a bunch of "pfsagents" in parallel
func parallelAgentSenders(t *testing.T, rrSrv *Server, ipAddr string, port int, agentCnt int,
	method string, sendCnt int, rootCAx509CertificatePEM []byte) {

	var agentWg sync.WaitGroup

	// Derive a base for the agent IDs; the source uses a fixed seed, so
	// runs are deterministic
	r := rand.New(rand.NewSource(99))
	clientSeed := r.Uint64()

	// Start parallel pfsagents - each agent doing sendCnt parallel sends
	var agentID uint64
	for i := 0; i < agentCnt; i++ {
		agentID = clientSeed + uint64(i)

		agentWg.Add(1)
		go pfsagent(t, rrSrv, ipAddr, port, agentID, method, &agentWg, sendCnt, rootCAx509CertificatePEM)
	}
	agentWg.Wait()
}