github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/agent/agent_test.go

package agent

import (
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"net"
	"os"
	"sync"
	"testing"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"

	events "github.com/docker/go-events"
	agentutils "github.com/docker/swarmkit/agent/testutils"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/ca"
	cautils "github.com/docker/swarmkit/ca/testutils"
	"github.com/docker/swarmkit/connectionbroker"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/remotes"
	"github.com/docker/swarmkit/testutils"
	"github.com/docker/swarmkit/xnet"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

var localDispatcher = false

// TestMain runs every test in this file twice - once with a remote dispatcher,
// and once again with a local dispatcher.
func TestMain(m *testing.M) {
	localDispatcher = false
	dispatcherRPCTimeout = 500 * time.Millisecond
	if status := m.Run(); status != 0 {
		os.Exit(status)
	}

	localDispatcher = true
	os.Exit(m.Run())
}

func TestAgent(t *testing.T) {
	// TODO(stevvooe): The current agent is fairly monolithic, making it hard
	// to test without implementing or mocking an entire master. We'd like to
	// avoid this, as these kinds of tests are expensive to maintain.
	//
	// To support a proper testing program, the plan is to decouple the agent
	// into the following components:
	//
	// Connection: Manages the RPC connection and the available managers. Must
	// follow lazy grpc style but also expose primitives to force reset, which
	// is currently exposed through remotes.
	//
	// Session: Manages the lifecycle of an agent from Register to a failure.
	// Currently, this is implemented as Agent.session, but we'd prefer to
	// encapsulate it to keep the agent simple.
	//
	// Agent: With the above scaffolding, the agent reduces to Agent.Assign
	// and Agent.Watch. Testing becomes as simple as assigning task sets and
	// checking that the appropriate events come up on the watch queue.
	//
	// We may also move the Assign/Watch to a Driver type and have the agent
	// oversee everything.
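	//
	// As a rough sketch only (these interfaces are hypothetical and not part
	// of the current codebase), the decoupling described above might look
	// something like:
	//
	//	// Connection hides the lazily-dialed gRPC connection and exposes a
	//	// primitive to force a reset to another manager.
	//	type Connection interface {
	//		Client() api.DispatcherClient
	//		Reset() error
	//	}
	//
	//	// Session owns the Register-to-failure lifecycle currently embedded
	//	// in Agent.session.
	//	type Session interface {
	//		Register(ctx context.Context) error
	//		Close() error
	//	}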
}

func TestAgentStartStop(t *testing.T) {
	tc := cautils.NewTestCA(t)
	defer tc.Stop()

	agentSecurityConfig, err := tc.NewNodeConfig(ca.WorkerRole)
	require.NoError(t, err)

	addr := "localhost:4949"
	remotes := remotes.NewRemotes(api.Peer{Addr: addr})

	db, cleanup := storageTestEnv(t)
	defer cleanup()

	agent, err := New(&Config{
		Executor:    &agentutils.TestExecutor{},
		ConnBroker:  connectionbroker.New(remotes),
		Credentials: agentSecurityConfig.ClientTLSCreds,
		DB:          db,
		NodeTLSInfo: &api.NodeTLSInfo{},
	})
	require.NoError(t, err)
	assert.NotNil(t, agent)

	ctx, cancel := context.WithTimeout(tc.Context, 5000*time.Millisecond)
	defer cancel()

	assert.Equal(t, errAgentNotStarted, agent.Stop(ctx))
	assert.NoError(t, agent.Start(ctx))

	if err := agent.Start(ctx); err != errAgentStarted {
		t.Fatalf("expected agent started error: %v", err)
	}

	assert.NoError(t, agent.Stop(ctx))
}

func TestHandleSessionMessageNetworkManagerChanges(t *testing.T) {
	nodeChangeCh := make(chan *NodeChanges, 1)
	defer close(nodeChangeCh)
	tester := agentTestEnv(t, nodeChangeCh, nil)
	defer tester.cleanup()
	defer tester.StartAgent(t)()

	currSession, closedSessions := tester.dispatcher.GetSessions()
	require.NotNil(t, currSession)
	require.NotNil(t, currSession.Description)
	require.Empty(t, closedSessions)

	var messages = []*api.SessionMessage{
		{
			Managers: []*api.WeightedPeer{
				{Peer: &api.Peer{NodeID: "node1", Addr: "10.0.0.1"}, Weight: 1.0}},
			NetworkBootstrapKeys: []*api.EncryptionKey{{}},
		},
		{
			Managers: []*api.WeightedPeer{
				{Peer: &api.Peer{NodeID: "node1", Addr: ""}, Weight: 1.0}},
			NetworkBootstrapKeys: []*api.EncryptionKey{{}},
		},
		{
			Managers: []*api.WeightedPeer{
				{Peer: &api.Peer{NodeID: "node1", Addr: "10.0.0.1"}, Weight: 1.0}},
			NetworkBootstrapKeys: nil,
		},
		{
			Managers: []*api.WeightedPeer{
				{Peer: &api.Peer{NodeID: "", Addr: "10.0.0.1"}, Weight: 1.0}},
			NetworkBootstrapKeys: []*api.EncryptionKey{{}},
		},
		{
			Managers: []*api.WeightedPeer{
				{Peer: &api.Peer{NodeID: "node1", Addr: "10.0.0.1"}, Weight: 0.0}},
			NetworkBootstrapKeys: []*api.EncryptionKey{{}},
		},
	}

	for _, m := range messages {
		m.SessionID = currSession.SessionID
		tester.dispatcher.SessionMessageChannel() <- m
		select {
		case nodeChange := <-nodeChangeCh:
			require.FailNow(t, "there should be no node changes with these messages", "%v", nodeChange)
		case <-time.After(100 * time.Millisecond):
		}
	}

	currSession, closedSessions = tester.dispatcher.GetSessions()
	require.NotEmpty(t, currSession)
	require.Empty(t, closedSessions)
}

func TestHandleSessionMessageNodeChanges(t *testing.T) {
	nodeChangeCh := make(chan *NodeChanges, 1)
	defer close(nodeChangeCh)
	tester := agentTestEnv(t, nodeChangeCh, nil)
	defer tester.cleanup()
	defer tester.StartAgent(t)()

	currSession, closedSessions := tester.dispatcher.GetSessions()
	require.NotNil(t, currSession)
	require.NotNil(t, currSession.Description)
	require.Empty(t, closedSessions)

	var testcases = []struct {
		msg      *api.SessionMessage
		change   *NodeChanges
		errorMsg string
	}{
		{
			msg: &api.SessionMessage{
				Node: &api.Node{},
			},
			change:   &NodeChanges{Node: &api.Node{}},
			errorMsg: "the node changed, but no notification of node change",
		},
		{
			msg: &api.SessionMessage{
				RootCA: []byte("new root CA"),
			},
			change:   &NodeChanges{RootCert: []byte("new root CA")},
			errorMsg: "the root cert changed, but no notification of node change",
		},
		{
			msg: &api.SessionMessage{
				Node:   &api.Node{ID: "something"},
				RootCA: []byte("new root CA"),
			},
			change: &NodeChanges{
				Node:     &api.Node{ID: "something"},
				RootCert: []byte("new root CA"),
			},
			errorMsg: "the root cert and node both changed, but no notification of node change",
		},
		{
			msg: &api.SessionMessage{
				Node:   &api.Node{ID: "something"},
				RootCA: tester.testCA.RootCA.Certs,
			},
			errorMsg: "a node and root cert were provided, but nothing has changed, so there should be no node change",
		},
	}

	for _, tc := range testcases {
		tc.msg.SessionID = currSession.SessionID
		tester.dispatcher.SessionMessageChannel() <- tc.msg
		if tc.change != nil {
			select {
			case nodeChange := <-nodeChangeCh:
				require.Equal(t, tc.change, nodeChange, tc.errorMsg)
			case <-time.After(100 * time.Millisecond):
				require.FailNow(t, tc.errorMsg)
			}
		} else {
			select {
			case nodeChange := <-nodeChangeCh:
				require.FailNow(t, "unexpected node change", "%s: but got change: %v", tc.errorMsg, nodeChange)
			case <-time.After(100 * time.Millisecond):
			}
		}
	}

	currSession, closedSessions = tester.dispatcher.GetSessions()
	require.NotEmpty(t, currSession)
	require.Empty(t, closedSessions)
}

// When the node description changes, the session is restarted and the change is
// propagated up to the dispatcher. The node description includes whether the
// agent is running in FIPS mode.
func TestSessionRestartedOnNodeDescriptionChange(t *testing.T) {
	tlsCh := make(chan events.Event, 1)
	defer close(tlsCh)
	tester := agentTestEnv(t, nil, tlsCh)
	tester.agent.config.FIPS = true // start out with the agent in FIPS-enabled mode
	defer tester.cleanup()
	defer tester.StartAgent(t)()

	currSession, closedSessions := tester.dispatcher.GetSessions()
	require.NotNil(t, currSession)
	require.NotNil(t, currSession.Description)
	require.True(t, currSession.Description.FIPS)
	require.Empty(t, closedSessions)

	tester.executor.UpdateNodeDescription(&api.NodeDescription{
		Hostname: "testAgent",
	})
	var gotSession *api.SessionRequest
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		gotSession, closedSessions = tester.dispatcher.GetSessions()
		if gotSession == nil {
			return errors.New("no current session")
		}
		if len(closedSessions) != 1 {
			return fmt.Errorf("expecting 1 closed session, got %d", len(closedSessions))
		}
		return nil
	}, 2*time.Second))
	require.NotEqual(t, currSession, gotSession)
	require.NotNil(t, gotSession.Description)
	require.Equal(t, "testAgent", gotSession.Description.Hostname)
	require.True(t, gotSession.Description.FIPS)
	currSession = gotSession

	// If nothing changes, the session is not re-established.
	tlsCh <- gotSession.Description.TLSInfo
	time.Sleep(1 * time.Second)
	gotSession, closedSessions = tester.dispatcher.GetSessions()
	require.Equal(t, currSession, gotSession)
	require.Len(t, closedSessions, 1)

	newTLSInfo := &api.NodeTLSInfo{
		TrustRoot:           cautils.ECDSA256SHA256Cert,
		CertIssuerPublicKey: []byte("public key"),
		CertIssuerSubject:   []byte("subject"),
	}
	tlsCh <- newTLSInfo
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		gotSession, closedSessions = tester.dispatcher.GetSessions()
		if gotSession == nil {
			return errors.New("no current session")
		}
		if len(closedSessions) != 2 {
			return fmt.Errorf("expecting 2 closed sessions, got %d", len(closedSessions))
		}
		return nil
	}, 2*time.Second))
	require.NotEqual(t, currSession, gotSession)
	require.NotNil(t, gotSession.Description)
	require.Equal(t, "testAgent", gotSession.Description.Hostname)
	require.Equal(t, newTLSInfo, gotSession.Description.TLSInfo)
	require.True(t, gotSession.Description.FIPS)
}

// If the dispatcher returns an error, times out, or is unreachable, the agent
// always attempts to reconnect and build a new session.
func TestSessionReconnectsIfDispatcherErrors(t *testing.T) {
	tlsCh := make(chan events.Event, 1)
	defer close(tlsCh)

	tester := agentTestEnv(t, nil, tlsCh)
	defer tester.cleanup()
	defer tester.StartAgent(t)()

	// create a second dispatcher we can fall back on
	anotherConfig, err := tester.testCA.NewNodeConfig(ca.ManagerRole)
	require.NoError(t, err)
	anotherDispatcher, stop := agentutils.NewMockDispatcher(t, anotherConfig, false) // this one is not local, because the other one may be
	defer stop()

	var counter int
	anotherDispatcher.SetSessionHandler(func(r *api.SessionRequest, stream api.Dispatcher_SessionServer) error {
		if counter == 0 {
			counter++
			return errors.New("terminate immediately")
		}
		// hang until the other side cancels, and then set the session handler
		// to nil so we use the default one
		defer anotherDispatcher.SetSessionHandler(nil)
		<-stream.Context().Done()
		return stream.Context().Err()
	})

	// The agent should have connected to the first dispatcher by now - if it
	// has, kill the first dispatcher and ensure the agent connects to the
	// second one.
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		gotSession, closedSessions := tester.dispatcher.GetSessions()
		if gotSession == nil {
			return errors.New("no current session")
		}
		if len(closedSessions) != 0 {
			return fmt.Errorf("expecting 0 closed sessions, got %d", len(closedSessions))
		}
		return nil
	}, 2*time.Second))
	tester.stopDispatcher()
	tester.remotes.setPeer(api.Peer{Addr: anotherDispatcher.Addr})
	tester.agent.config.ConnBroker.SetLocalConn(nil)

	// The agent should connect to the second dispatcher 3 times: first because
	// the first dispatcher died, second because the second dispatcher returned
	// an error, and third because the session timed out. So there should be
	// 2 closed sessions.
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		gotSession, closedSessions := anotherDispatcher.GetSessions()
		if gotSession == nil {
			return errors.New("no current session")
		}
		if len(closedSessions) != 2 {
			return fmt.Errorf("expecting 2 closed sessions, got %d", len(closedSessions))
		}
		return nil
	}, 10*time.Second))
}

type testSessionTracker struct {
	closeCounter, errCounter, establishedSessions int
	err                                           error
	mu                                            sync.Mutex
}

func (t *testSessionTracker) SessionError(err error) {
	t.mu.Lock()
	t.err = err
	t.errCounter++
	t.mu.Unlock()
}

func (t *testSessionTracker) SessionClosed() error {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.closeCounter++
	if t.closeCounter >= 3 {
		return t.err
	}
	return nil
}

func (t *testSessionTracker) SessionEstablished() {
	t.mu.Lock()
	t.establishedSessions++
	t.mu.Unlock()
}

func (t *testSessionTracker) Stats() (int, int, int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.establishedSessions, t.errCounter, t.closeCounter
}

// If we pass a session tracker and SessionClosed returns an error, the agent
// should exit with that error instead of rebuilding the session.
func TestAgentExitsBasedOnSessionTracker(t *testing.T) {
	tlsCh := make(chan events.Event, 1)
	defer close(tlsCh)
	tester := agentTestEnv(t, nil, tlsCh)
	defer tester.cleanup()

	// set the dispatcher to always error
	tester.dispatcher.SetSessionHandler(func(r *api.SessionRequest, stream api.Dispatcher_SessionServer) error {
		return errors.New("I always error")
	})

	// add a hook to the agent to exit after 3 session rebuilds
	tracker := testSessionTracker{}
	tester.agent.config.SessionTracker = &tracker

	go tester.agent.Start(tester.testCA.Context)
	defer tester.agent.Stop(tester.testCA.Context)

	getErr := make(chan error)
	go func() {
		getErr <- tester.agent.Err(tester.testCA.Context)
	}()

	select {
	case err := <-getErr:
		require.Error(t, err)
		require.Contains(t, err.Error(), "I always error")
	case <-tester.agent.Ready():
		require.FailNow(t, "agent should have failed to connect")
	case <-time.After(5 * time.Second):
		require.FailNow(t, "agent didn't fail within 5 seconds")
	}

	establishedSessions, errCounter, closeCounter := tracker.Stats()
	require.Equal(t, 0, establishedSessions)
	require.Equal(t, 3, errCounter)
	require.Equal(t, 3, closeCounter)
	currSession, closedSessions := tester.dispatcher.GetSessions()
	require.Nil(t, currSession)
	require.Len(t, closedSessions, 3)
}

// If we pass a session tracker, established sessions get tracked.
func TestAgentRegistersSessionsWithSessionTracker(t *testing.T) {
	tlsCh := make(chan events.Event, 1)
	defer close(tlsCh)
	tester := agentTestEnv(t, nil, tlsCh)
	defer tester.cleanup()

	// add a session tracker to the agent so we can verify that established
	// sessions are tracked
	tracker := testSessionTracker{}
	tester.agent.config.SessionTracker = &tracker

	defer tester.StartAgent(t)()

	var establishedSessions, errCounter, closeCounter int
	// poll because the session tracker gets called after the ready channel is
	// closed (so there may be edge cases where the stats are read before the
	// session tracker is called)
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		establishedSessions, errCounter, closeCounter = tracker.Stats()
		if establishedSessions != 1 {
			return errors.New("session tracker hasn't been called yet")
		}
		return nil
	}, 3*time.Second))
	require.Equal(t, 0, errCounter)
	require.Equal(t, 0, closeCounter)
	currSession, closedSessions := tester.dispatcher.GetSessions()
	require.NotNil(t, currSession)
	require.Len(t, closedSessions, 0)
}

type agentTester struct {
	agent                   *Agent
	dispatcher              *agentutils.MockDispatcher
	executor                *agentutils.TestExecutor
	stopDispatcher, cleanup func()
	testCA                  *cautils.TestCA
	remotes                 *fakeRemotes
}

func (a *agentTester) StartAgent(t *testing.T) func() {
	go a.agent.Start(a.testCA.Context)

	getErr := make(chan error)
	go func() {
		getErr <- a.agent.Err(a.testCA.Context)
	}()
	select {
	case err := <-getErr:
		require.FailNow(t, "starting agent errored", "%v", err)
	case <-a.agent.Ready():
	case <-time.After(5 * time.Second):
		require.FailNow(t, "agent not ready within 5 seconds")
	}

	return func() {
		a.agent.Stop(a.testCA.Context)
	}
}

// agentTestEnv builds an agent wired up to a mock dispatcher (local or remote,
// depending on the localDispatcher flag) and returns a tester bundling the
// agent, dispatcher, executor, test CA, and cleanup machinery.
func agentTestEnv(t *testing.T, nodeChangeCh chan *NodeChanges, tlsChangeCh chan events.Event) *agentTester {
	var cleanup []func()
	tc := cautils.NewTestCA(t)
	cleanup = append(cleanup, tc.Stop)
	tc.Context = log.WithLogger(tc.Context, log.G(tc.Context).WithField("localDispatcher", localDispatcher))

	agentSecurityConfig, err := tc.NewNodeConfig(ca.WorkerRole)
	require.NoError(t, err)
	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
	require.NoError(t, err)

	mockDispatcher, mockDispatcherStop := agentutils.NewMockDispatcher(t, managerSecurityConfig, localDispatcher)
	cleanup = append(cleanup, mockDispatcherStop)

	fr := &fakeRemotes{}
	broker := connectionbroker.New(fr)
	if localDispatcher {
		insecureCreds := credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})
		conn, err := grpc.Dial(
			mockDispatcher.Addr,
			grpc.WithTransportCredentials(insecureCreds),
			grpc.WithDialer(
				func(addr string, timeout time.Duration) (net.Conn, error) {
					return xnet.DialTimeoutLocal(addr, timeout)
				}),
		)
		require.NoError(t, err)
		cleanup = append(cleanup, func() { conn.Close() })

		broker.SetLocalConn(conn)
	} else {
		fr.setPeer(api.Peer{Addr: mockDispatcher.Addr})
	}

	db, cleanupStorage := storageTestEnv(t)
	cleanup = append(cleanup, func() { cleanupStorage() })

	executor := &agentutils.TestExecutor{}

	agent, err := New(&Config{
		Executor:         executor,
		ConnBroker:       broker,
		Credentials:      agentSecurityConfig.ClientTLSCreds,
		DB:               db,
		NotifyNodeChange: nodeChangeCh,
		NotifyTLSChange:  tlsChangeCh,
		NodeTLSInfo: &api.NodeTLSInfo{
			TrustRoot:           tc.RootCA.Certs,
			CertIssuerPublicKey: agentSecurityConfig.IssuerInfo().PublicKey,
			CertIssuerSubject:   agentSecurityConfig.IssuerInfo().Subject,
		},
	})
	require.NoError(t, err)
	agent.nodeUpdatePeriod = 200 * time.Millisecond

	return &agentTester{
		agent:          agent,
		dispatcher:     mockDispatcher,
		stopDispatcher: mockDispatcherStop,
		executor:       executor,
		testCA:         tc,
		cleanup: func() {
			// run the cleanup functions in reverse order
			for i := len(cleanup) - 1; i >= 0; i-- {
				cleanup[i]()
			}
		},
		remotes: fr,
	}
}

// fakeRemotes implements the remotes.Remotes interface and always selects the
// current peer until it is switched out via setPeer.
type fakeRemotes struct {
	mu   sync.Mutex
	peer api.Peer
}

func (f *fakeRemotes) Weights() map[api.Peer]int {
	f.mu.Lock()
	defer f.mu.Unlock()
	return map[api.Peer]int{f.peer: 1}
}

func (f *fakeRemotes) Select(...string) (api.Peer, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.peer, nil
}

// these methods do nothing
func (f *fakeRemotes) Observe(peer api.Peer, weight int)         {}
func (f *fakeRemotes) ObserveIfExists(peer api.Peer, weight int) {}
func (f *fakeRemotes) Remove(addrs ...api.Peer)                  {}

func (f *fakeRemotes) setPeer(p api.Peer) {
	f.mu.Lock()
	f.peer = p
	f.mu.Unlock()
}

var _ remotes.Remotes = &fakeRemotes{}
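
// failFastTracker is an illustrative sketch of the SessionTracker hook that
// testSessionTracker exercises above; it is hypothetical and not used by any
// test. It makes the agent give up after several consecutive failed sessions,
// resetting the failure count whenever a session is successfully established.
type failFastTracker struct {
	mu       sync.Mutex
	lastErr  error
	failures int
}

// SessionError records the most recent session error.
func (f *failFastTracker) SessionError(err error) {
	f.mu.Lock()
	f.lastErr = err
	f.failures++
	f.mu.Unlock()
}

// SessionClosed returns a non-nil error once too many sessions have failed
// in a row, which makes the agent exit instead of rebuilding the session.
func (f *failFastTracker) SessionClosed() error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.failures >= 5 {
		return f.lastErr
	}
	return nil
}

// SessionEstablished resets the consecutive-failure count.
func (f *failFastTracker) SessionEstablished() {
	f.mu.Lock()
	f.failures = 0
	f.mu.Unlock()
}

// assumes the package's SessionTracker interface consists of the three
// methods implemented above
var _ SessionTracker = (*failFastTracker)(nil)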