github.com/hernad/nomad@v1.6.112/nomad/heartbeat_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "fmt" 8 "testing" 9 "time" 10 11 memdb "github.com/hashicorp/go-memdb" 12 msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" 13 "github.com/hernad/nomad/ci" 14 "github.com/hernad/nomad/helper/pointer" 15 "github.com/hernad/nomad/nomad/mock" 16 "github.com/hernad/nomad/nomad/structs" 17 "github.com/hernad/nomad/testutil" 18 "github.com/stretchr/testify/require" 19 ) 20 21 func TestHeartbeat_InitializeHeartbeatTimers(t *testing.T) { 22 ci.Parallel(t) 23 24 s1, cleanupS1 := TestServer(t, nil) 25 defer cleanupS1() 26 testutil.WaitForLeader(t, s1.RPC) 27 28 node := mock.Node() 29 state := s1.fsm.State() 30 err := state.UpsertNode(structs.MsgTypeTestSetup, 1, node) 31 if err != nil { 32 t.Fatalf("err: %v", err) 33 } 34 35 // Reset the heartbeat timers 36 err = s1.initializeHeartbeatTimers() 37 if err != nil { 38 t.Fatalf("err: %v", err) 39 } 40 41 // Check that we have a timer 42 _, ok := s1.heartbeatTimers[node.ID] 43 if !ok { 44 t.Fatalf("missing heartbeat timer") 45 } 46 } 47 48 func TestHeartbeat_ResetHeartbeatTimer(t *testing.T) { 49 ci.Parallel(t) 50 51 s1, cleanupS1 := TestServer(t, nil) 52 defer cleanupS1() 53 testutil.WaitForLeader(t, s1.RPC) 54 55 // Create a new timer 56 ttl, err := s1.resetHeartbeatTimer("test") 57 if err != nil { 58 t.Fatalf("err: %v", err) 59 } 60 if ttl < s1.config.MinHeartbeatTTL || ttl > 2*s1.config.MinHeartbeatTTL { 61 t.Fatalf("bad: %#v", ttl) 62 } 63 64 // Check that we have a timer 65 _, ok := s1.heartbeatTimers["test"] 66 if !ok { 67 t.Fatalf("missing heartbeat timer") 68 } 69 } 70 71 func TestHeartbeat_ResetHeartbeatTimer_Nonleader(t *testing.T) { 72 ci.Parallel(t) 73 require := require.New(t) 74 75 s1, cleanupS1 := TestServer(t, func(c *Config) { 76 c.BootstrapExpect = 3 // Won't become leader 77 }) 78 defer cleanupS1() 79 80 require.False(s1.IsLeader()) 81 82 // Create a new timer 83 _, err := s1.resetHeartbeatTimer("test") 84 require.NotNil(err) 85 require.EqualError(err, heartbeatNotLeader) 86 } 87 88 func TestHeartbeat_ResetHeartbeatTimerLocked(t *testing.T) { 89 ci.Parallel(t) 90 91 s1, cleanupS1 := TestServer(t, nil) 92 defer cleanupS1() 93 testutil.WaitForLeader(t, s1.RPC) 94 95 s1.heartbeatTimersLock.Lock() 96 s1.resetHeartbeatTimerLocked("foo", 5*time.Millisecond) 97 s1.heartbeatTimersLock.Unlock() 98 99 if _, ok := s1.heartbeatTimers["foo"]; !ok { 100 t.Fatalf("missing timer") 101 } 102 103 time.Sleep(time.Duration(testutil.TestMultiplier()*10) * time.Millisecond) 104 105 if _, ok := s1.heartbeatTimers["foo"]; ok { 106 t.Fatalf("timer should be gone") 107 } 108 } 109 110 func TestHeartbeat_ResetHeartbeatTimerLocked_Renew(t *testing.T) { 111 ci.Parallel(t) 112 113 s1, cleanupS1 := TestServer(t, nil) 114 defer cleanupS1() 115 testutil.WaitForLeader(t, s1.RPC) 116 117 s1.heartbeatTimersLock.Lock() 118 s1.resetHeartbeatTimerLocked("foo", 30*time.Millisecond) 119 s1.heartbeatTimersLock.Unlock() 120 121 if _, ok := s1.heartbeatTimers["foo"]; !ok { 122 t.Fatalf("missing timer") 123 } 124 125 time.Sleep(2 * time.Millisecond) 126 127 // Renew the heartbeat 128 s1.heartbeatTimersLock.Lock() 129 s1.resetHeartbeatTimerLocked("foo", 30*time.Millisecond) 130 s1.heartbeatTimersLock.Unlock() 131 renew := time.Now() 132 133 // Watch for invalidation 134 for time.Now().Sub(renew) < time.Duration(testutil.TestMultiplier()*100)*time.Millisecond { 135 s1.heartbeatTimersLock.Lock() 136 _, ok := s1.heartbeatTimers["foo"] 137 s1.heartbeatTimersLock.Unlock() 138 if !ok { 139 end := time.Now() 140 if diff := end.Sub(renew); diff < 30*time.Millisecond { 141 t.Fatalf("early invalidate %v", diff) 142 } 143 return 144 } 145 time.Sleep(2 * time.Millisecond) 146 } 147 t.Fatalf("should have expired") 148 } 149 150 func TestHeartbeat_InvalidateHeartbeat(t *testing.T) { 151 ci.Parallel(t) 152 require := require.New(t) 153 154 s1, cleanupS1 := TestServer(t, nil) 155 defer cleanupS1() 156 testutil.WaitForLeader(t, s1.RPC) 157 158 // Create a node 159 node := mock.Node() 160 state := s1.fsm.State() 161 require.NoError(state.UpsertNode(structs.MsgTypeTestSetup, 1, node)) 162 163 // This should cause a status update 164 s1.invalidateHeartbeat(node.ID) 165 166 // Check it is updated 167 ws := memdb.NewWatchSet() 168 out, err := state.NodeByID(ws, node.ID) 169 require.NoError(err) 170 require.True(out.TerminalStatus()) 171 require.Len(out.Events, 2) 172 require.Equal(NodeHeartbeatEventMissed, out.Events[1].Message) 173 } 174 175 func TestHeartbeat_ClearHeartbeatTimer(t *testing.T) { 176 ci.Parallel(t) 177 178 s1, cleanupS1 := TestServer(t, nil) 179 defer cleanupS1() 180 testutil.WaitForLeader(t, s1.RPC) 181 182 s1.heartbeatTimersLock.Lock() 183 s1.resetHeartbeatTimerLocked("foo", 5*time.Millisecond) 184 s1.heartbeatTimersLock.Unlock() 185 186 err := s1.clearHeartbeatTimer("foo") 187 if err != nil { 188 t.Fatalf("err: %v", err) 189 } 190 191 if _, ok := s1.heartbeatTimers["foo"]; ok { 192 t.Fatalf("timer should be gone") 193 } 194 } 195 196 func TestHeartbeat_ClearAllHeartbeatTimers(t *testing.T) { 197 ci.Parallel(t) 198 199 s1, cleanupS1 := TestServer(t, nil) 200 defer cleanupS1() 201 testutil.WaitForLeader(t, s1.RPC) 202 203 s1.heartbeatTimersLock.Lock() 204 s1.resetHeartbeatTimerLocked("foo", 10*time.Millisecond) 205 s1.resetHeartbeatTimerLocked("bar", 10*time.Millisecond) 206 s1.resetHeartbeatTimerLocked("baz", 10*time.Millisecond) 207 s1.heartbeatTimersLock.Unlock() 208 209 err := s1.clearAllHeartbeatTimers() 210 if err != nil { 211 t.Fatalf("err: %v", err) 212 } 213 214 if len(s1.heartbeatTimers) != 0 { 215 t.Fatalf("timers should be gone") 216 } 217 } 218 219 func TestHeartbeat_Server_HeartbeatTTL_Failover(t *testing.T) { 220 ci.Parallel(t) 221 222 s1, cleanupS1 := TestServer(t, func(c *Config) { 223 c.BootstrapExpect = 3 224 }) 225 defer cleanupS1() 226 227 s2, cleanupS2 := TestServer(t, func(c *Config) { 228 c.BootstrapExpect = 3 229 }) 230 defer cleanupS2() 231 232 s3, cleanupS3 := TestServer(t, func(c *Config) { 233 c.BootstrapExpect = 3 234 }) 235 defer cleanupS3() 236 servers := []*Server{s1, s2, s3} 237 TestJoin(t, s1, s2, s3) 238 239 leader := waitForStableLeadership(t, servers) 240 codec := rpcClient(t, leader) 241 242 // Create the register request 243 node := mock.Node() 244 req := &structs.NodeRegisterRequest{ 245 Node: node, 246 WriteRequest: structs.WriteRequest{Region: "global"}, 247 } 248 249 // Fetch the response 250 var resp structs.GenericResponse 251 if err := msgpackrpc.CallWithCodec(codec, "Node.Register", req, &resp); err != nil { 252 t.Fatalf("err: %v", err) 253 } 254 255 // Check that heartbeatTimers has the heartbeat ID 256 if _, ok := leader.heartbeatTimers[node.ID]; !ok { 257 t.Fatalf("missing heartbeat timer") 258 } 259 260 // Shutdown the leader! 261 leader.Shutdown() 262 263 // heartbeatTimers should be cleared on leader shutdown 264 testutil.WaitForResult(func() (bool, error) { 265 return len(leader.heartbeatTimers) == 0, nil 266 }, func(err error) { 267 t.Fatalf("heartbeat timers should be empty on the shutdown leader") 268 }) 269 270 // Find the new leader 271 testutil.WaitForResult(func() (bool, error) { 272 leader = nil 273 for _, s := range servers { 274 if s.IsLeader() { 275 leader = s 276 } 277 } 278 if leader == nil { 279 return false, fmt.Errorf("Should have a new leader") 280 } 281 282 // Ensure heartbeat timer is restored 283 if _, ok := leader.heartbeatTimers[node.ID]; !ok { 284 return false, fmt.Errorf("missing heartbeat timer") 285 } 286 287 return true, nil 288 }, func(err error) { 289 t.Fatalf("err: %s", err) 290 }) 291 } 292 293 func TestHeartbeat_InvalidateHeartbeat_DisconnectedClient(t *testing.T) { 294 ci.Parallel(t) 295 296 type testCase struct { 297 name string 298 now time.Time 299 maxClientDisconnect *time.Duration 300 expectedNodeStatus string 301 } 302 303 testCases := []testCase{ 304 { 305 name: "has-pending-reconnects", 306 now: time.Now().UTC(), 307 maxClientDisconnect: pointer.Of(5 * time.Second), 308 expectedNodeStatus: structs.NodeStatusDisconnected, 309 }, 310 { 311 name: "has-expired-reconnects", 312 maxClientDisconnect: pointer.Of(5 * time.Second), 313 now: time.Now().UTC().Add(-10 * time.Second), 314 expectedNodeStatus: structs.NodeStatusDown, 315 }, 316 { 317 name: "has-expired-reconnects-equal-timestamp", 318 maxClientDisconnect: pointer.Of(5 * time.Second), 319 now: time.Now().UTC().Add(-5 * time.Second), 320 expectedNodeStatus: structs.NodeStatusDown, 321 }, 322 { 323 name: "has-no-reconnects", 324 now: time.Now().UTC(), 325 maxClientDisconnect: nil, 326 expectedNodeStatus: structs.NodeStatusDown, 327 }, 328 } 329 330 for _, tc := range testCases { 331 t.Run(tc.name, func(t *testing.T) { 332 s1, cleanupS1 := TestServer(t, nil) 333 defer cleanupS1() 334 testutil.WaitForLeader(t, s1.RPC) 335 336 // Create a node 337 node := mock.Node() 338 state := s1.fsm.State() 339 require.NoError(t, state.UpsertNode(structs.MsgTypeTestSetup, 1, node)) 340 341 alloc := mock.Alloc() 342 alloc.NodeID = node.ID 343 alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxClientDisconnect 344 alloc.ClientStatus = structs.AllocClientStatusUnknown 345 alloc.AllocStates = []*structs.AllocState{{ 346 Field: structs.AllocStateFieldClientStatus, 347 Value: structs.AllocClientStatusUnknown, 348 Time: tc.now, 349 }} 350 require.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 2, []*structs.Allocation{alloc})) 351 352 // Trigger status update 353 s1.invalidateHeartbeat(node.ID) 354 out, err := state.NodeByID(nil, node.ID) 355 require.NoError(t, err) 356 require.Equal(t, tc.expectedNodeStatus, out.Status) 357 }) 358 } 359 }