github.com/hernad/nomad@v1.6.112/nomad/heartbeat_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"fmt"
     8  	"testing"
     9  	"time"
    10  
    11  	memdb "github.com/hashicorp/go-memdb"
    12  	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
    13  	"github.com/hernad/nomad/ci"
    14  	"github.com/hernad/nomad/helper/pointer"
    15  	"github.com/hernad/nomad/nomad/mock"
    16  	"github.com/hernad/nomad/nomad/structs"
    17  	"github.com/hernad/nomad/testutil"
    18  	"github.com/stretchr/testify/require"
    19  )
    20  
    21  func TestHeartbeat_InitializeHeartbeatTimers(t *testing.T) {
    22  	ci.Parallel(t)
    23  
    24  	s1, cleanupS1 := TestServer(t, nil)
    25  	defer cleanupS1()
    26  	testutil.WaitForLeader(t, s1.RPC)
    27  
    28  	node := mock.Node()
    29  	state := s1.fsm.State()
    30  	err := state.UpsertNode(structs.MsgTypeTestSetup, 1, node)
    31  	if err != nil {
    32  		t.Fatalf("err: %v", err)
    33  	}
    34  
    35  	// Reset the heartbeat timers
    36  	err = s1.initializeHeartbeatTimers()
    37  	if err != nil {
    38  		t.Fatalf("err: %v", err)
    39  	}
    40  
    41  	// Check that we have a timer
    42  	_, ok := s1.heartbeatTimers[node.ID]
    43  	if !ok {
    44  		t.Fatalf("missing heartbeat timer")
    45  	}
    46  }
    47  
    48  func TestHeartbeat_ResetHeartbeatTimer(t *testing.T) {
    49  	ci.Parallel(t)
    50  
    51  	s1, cleanupS1 := TestServer(t, nil)
    52  	defer cleanupS1()
    53  	testutil.WaitForLeader(t, s1.RPC)
    54  
    55  	// Create a new timer
    56  	ttl, err := s1.resetHeartbeatTimer("test")
    57  	if err != nil {
    58  		t.Fatalf("err: %v", err)
    59  	}
    60  	if ttl < s1.config.MinHeartbeatTTL || ttl > 2*s1.config.MinHeartbeatTTL {
    61  		t.Fatalf("bad: %#v", ttl)
    62  	}
    63  
    64  	// Check that we have a timer
    65  	_, ok := s1.heartbeatTimers["test"]
    66  	if !ok {
    67  		t.Fatalf("missing heartbeat timer")
    68  	}
    69  }
    70  
    71  func TestHeartbeat_ResetHeartbeatTimer_Nonleader(t *testing.T) {
    72  	ci.Parallel(t)
    73  	require := require.New(t)
    74  
    75  	s1, cleanupS1 := TestServer(t, func(c *Config) {
    76  		c.BootstrapExpect = 3 // Won't become leader
    77  	})
    78  	defer cleanupS1()
    79  
    80  	require.False(s1.IsLeader())
    81  
    82  	// Create a new timer
    83  	_, err := s1.resetHeartbeatTimer("test")
    84  	require.NotNil(err)
    85  	require.EqualError(err, heartbeatNotLeader)
    86  }
    87  
    88  func TestHeartbeat_ResetHeartbeatTimerLocked(t *testing.T) {
    89  	ci.Parallel(t)
    90  
    91  	s1, cleanupS1 := TestServer(t, nil)
    92  	defer cleanupS1()
    93  	testutil.WaitForLeader(t, s1.RPC)
    94  
    95  	s1.heartbeatTimersLock.Lock()
    96  	s1.resetHeartbeatTimerLocked("foo", 5*time.Millisecond)
    97  	s1.heartbeatTimersLock.Unlock()
    98  
    99  	if _, ok := s1.heartbeatTimers["foo"]; !ok {
   100  		t.Fatalf("missing timer")
   101  	}
   102  
   103  	time.Sleep(time.Duration(testutil.TestMultiplier()*10) * time.Millisecond)
   104  
   105  	if _, ok := s1.heartbeatTimers["foo"]; ok {
   106  		t.Fatalf("timer should be gone")
   107  	}
   108  }
   109  
   110  func TestHeartbeat_ResetHeartbeatTimerLocked_Renew(t *testing.T) {
   111  	ci.Parallel(t)
   112  
   113  	s1, cleanupS1 := TestServer(t, nil)
   114  	defer cleanupS1()
   115  	testutil.WaitForLeader(t, s1.RPC)
   116  
   117  	s1.heartbeatTimersLock.Lock()
   118  	s1.resetHeartbeatTimerLocked("foo", 30*time.Millisecond)
   119  	s1.heartbeatTimersLock.Unlock()
   120  
   121  	if _, ok := s1.heartbeatTimers["foo"]; !ok {
   122  		t.Fatalf("missing timer")
   123  	}
   124  
   125  	time.Sleep(2 * time.Millisecond)
   126  
   127  	// Renew the heartbeat
   128  	s1.heartbeatTimersLock.Lock()
   129  	s1.resetHeartbeatTimerLocked("foo", 30*time.Millisecond)
   130  	s1.heartbeatTimersLock.Unlock()
   131  	renew := time.Now()
   132  
   133  	// Watch for invalidation
   134  	for time.Now().Sub(renew) < time.Duration(testutil.TestMultiplier()*100)*time.Millisecond {
   135  		s1.heartbeatTimersLock.Lock()
   136  		_, ok := s1.heartbeatTimers["foo"]
   137  		s1.heartbeatTimersLock.Unlock()
   138  		if !ok {
   139  			end := time.Now()
   140  			if diff := end.Sub(renew); diff < 30*time.Millisecond {
   141  				t.Fatalf("early invalidate %v", diff)
   142  			}
   143  			return
   144  		}
   145  		time.Sleep(2 * time.Millisecond)
   146  	}
   147  	t.Fatalf("should have expired")
   148  }
   149  
   150  func TestHeartbeat_InvalidateHeartbeat(t *testing.T) {
   151  	ci.Parallel(t)
   152  	require := require.New(t)
   153  
   154  	s1, cleanupS1 := TestServer(t, nil)
   155  	defer cleanupS1()
   156  	testutil.WaitForLeader(t, s1.RPC)
   157  
   158  	// Create a node
   159  	node := mock.Node()
   160  	state := s1.fsm.State()
   161  	require.NoError(state.UpsertNode(structs.MsgTypeTestSetup, 1, node))
   162  
   163  	// This should cause a status update
   164  	s1.invalidateHeartbeat(node.ID)
   165  
   166  	// Check it is updated
   167  	ws := memdb.NewWatchSet()
   168  	out, err := state.NodeByID(ws, node.ID)
   169  	require.NoError(err)
   170  	require.True(out.TerminalStatus())
   171  	require.Len(out.Events, 2)
   172  	require.Equal(NodeHeartbeatEventMissed, out.Events[1].Message)
   173  }
   174  
   175  func TestHeartbeat_ClearHeartbeatTimer(t *testing.T) {
   176  	ci.Parallel(t)
   177  
   178  	s1, cleanupS1 := TestServer(t, nil)
   179  	defer cleanupS1()
   180  	testutil.WaitForLeader(t, s1.RPC)
   181  
   182  	s1.heartbeatTimersLock.Lock()
   183  	s1.resetHeartbeatTimerLocked("foo", 5*time.Millisecond)
   184  	s1.heartbeatTimersLock.Unlock()
   185  
   186  	err := s1.clearHeartbeatTimer("foo")
   187  	if err != nil {
   188  		t.Fatalf("err: %v", err)
   189  	}
   190  
   191  	if _, ok := s1.heartbeatTimers["foo"]; ok {
   192  		t.Fatalf("timer should be gone")
   193  	}
   194  }
   195  
   196  func TestHeartbeat_ClearAllHeartbeatTimers(t *testing.T) {
   197  	ci.Parallel(t)
   198  
   199  	s1, cleanupS1 := TestServer(t, nil)
   200  	defer cleanupS1()
   201  	testutil.WaitForLeader(t, s1.RPC)
   202  
   203  	s1.heartbeatTimersLock.Lock()
   204  	s1.resetHeartbeatTimerLocked("foo", 10*time.Millisecond)
   205  	s1.resetHeartbeatTimerLocked("bar", 10*time.Millisecond)
   206  	s1.resetHeartbeatTimerLocked("baz", 10*time.Millisecond)
   207  	s1.heartbeatTimersLock.Unlock()
   208  
   209  	err := s1.clearAllHeartbeatTimers()
   210  	if err != nil {
   211  		t.Fatalf("err: %v", err)
   212  	}
   213  
   214  	if len(s1.heartbeatTimers) != 0 {
   215  		t.Fatalf("timers should be gone")
   216  	}
   217  }
   218  
// TestHeartbeat_Server_HeartbeatTTL_Failover verifies that when the leader of
// a 3-server cluster shuts down, its heartbeat timers are cleared, and the
// newly elected leader restores a timer for each registered node.
//
// NOTE(review): the reads of leader.heartbeatTimers below are done without
// taking heartbeatTimersLock — presumably safe enough for a test, but worth
// confirming against the race detector.
func TestHeartbeat_Server_HeartbeatTTL_Failover(t *testing.T) {
	ci.Parallel(t)

	// Stand up a 3-server cluster; none bootstraps alone.
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	// Wait for a stable leader and open an RPC codec against it.
	leader := waitForStableLeadership(t, servers)
	codec := rpcClient(t, leader)

	// Create the register request
	node := mock.Node()
	req := &structs.NodeRegisterRequest{
		Node:         node,
		WriteRequest: structs.WriteRequest{Region: "global"},
	}

	// Fetch the response
	var resp structs.GenericResponse
	if err := msgpackrpc.CallWithCodec(codec, "Node.Register", req, &resp); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Check that heartbeatTimers has the heartbeat ID
	if _, ok := leader.heartbeatTimers[node.ID]; !ok {
		t.Fatalf("missing heartbeat timer")
	}

	// Shutdown the leader!
	leader.Shutdown()

	// heartbeatTimers should be cleared on leader shutdown
	testutil.WaitForResult(func() (bool, error) {
		return len(leader.heartbeatTimers) == 0, nil
	}, func(err error) {
		t.Fatalf("heartbeat timers should be empty on the shutdown leader")
	})

	// Find the new leader
	testutil.WaitForResult(func() (bool, error) {
		leader = nil
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
			}
		}
		if leader == nil {
			return false, fmt.Errorf("Should have a new leader")
		}

		// Ensure heartbeat timer is restored
		if _, ok := leader.heartbeatTimers[node.ID]; !ok {
			return false, fmt.Errorf("missing heartbeat timer")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %s", err)
	})
}
   292  
// TestHeartbeat_InvalidateHeartbeat_DisconnectedClient verifies the node
// status chosen when a heartbeat is invalidated for a node that has an
// allocation in the "unknown" client state: with an unexpired
// MaxClientDisconnect window the node becomes "disconnected"; with an expired
// (or absent) window it becomes "down".
func TestHeartbeat_InvalidateHeartbeat_DisconnectedClient(t *testing.T) {
	ci.Parallel(t)

	type testCase struct {
		name string
		// now is the timestamp recorded on the alloc's "unknown" state
		// transition; shifting it into the past simulates an expired
		// disconnect window.
		now                 time.Time
		maxClientDisconnect *time.Duration
		expectedNodeStatus  string
	}

	testCases := []testCase{
		{
			name:                "has-pending-reconnects",
			now:                 time.Now().UTC(),
			maxClientDisconnect: pointer.Of(5 * time.Second),
			expectedNodeStatus:  structs.NodeStatusDisconnected,
		},
		{
			name:                "has-expired-reconnects",
			maxClientDisconnect: pointer.Of(5 * time.Second),
			now:                 time.Now().UTC().Add(-10 * time.Second),
			expectedNodeStatus:  structs.NodeStatusDown,
		},
		{
			// Boundary case: disconnect window exactly elapsed counts
			// as expired.
			name:                "has-expired-reconnects-equal-timestamp",
			maxClientDisconnect: pointer.Of(5 * time.Second),
			now:                 time.Now().UTC().Add(-5 * time.Second),
			expectedNodeStatus:  structs.NodeStatusDown,
		},
		{
			name:                "has-no-reconnects",
			now:                 time.Now().UTC(),
			maxClientDisconnect: nil,
			expectedNodeStatus:  structs.NodeStatusDown,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			s1, cleanupS1 := TestServer(t, nil)
			defer cleanupS1()
			testutil.WaitForLeader(t, s1.RPC)

			// Create a node
			node := mock.Node()
			state := s1.fsm.State()
			require.NoError(t, state.UpsertNode(structs.MsgTypeTestSetup, 1, node))

			// Place an alloc on the node in the "unknown" client state,
			// with the per-case disconnect window and transition time.
			alloc := mock.Alloc()
			alloc.NodeID = node.ID
			alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxClientDisconnect
			alloc.ClientStatus = structs.AllocClientStatusUnknown
			alloc.AllocStates = []*structs.AllocState{{
				Field: structs.AllocStateFieldClientStatus,
				Value: structs.AllocClientStatusUnknown,
				Time:  tc.now,
			}}
			require.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 2, []*structs.Allocation{alloc}))

			// Trigger status update
			s1.invalidateHeartbeat(node.ID)
			out, err := state.NodeByID(nil, node.ID)
			require.NoError(t, err)
			require.Equal(t, tc.expectedNodeStatus, out.Status)
		})
	}
}