github.com/kaisenlinux/docker@v0.0.0-20230510090727-ea55db55fac7/swarmkit/agent/agent.go

package agent

import (
	"bytes"
	"context"
	"math/rand"
	"reflect"
	"sync"
	"time"

	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/pkg/errors"
)

const (
	initialSessionFailureBackoff = 100 * time.Millisecond
	maxSessionFailureBackoff     = 8 * time.Second
	nodeUpdatePeriod             = 20 * time.Second
)

// Agent implements the primary node functionality for a member of a swarm
// cluster. The primary functionality is to run and report on the status of
// tasks assigned to the node.
type Agent struct {
	config *Config

	// The latest node object state from the manager for this node,
	// as known to the agent.
	node *api.Node

	keys []*api.EncryptionKey

	sessionq chan sessionOperation
	worker   Worker

	started   chan struct{}
	startOnce sync.Once // start only once
	ready     chan struct{}
	leaving   chan struct{}
	leaveOnce sync.Once
	left      chan struct{} // closed after "run" processes "leaving" and will no longer accept new assignments
	stopped   chan struct{} // requests shutdown
	stopOnce  sync.Once     // only allow stop to be called once
	closed    chan struct{} // only closed in run
	err       error         // read only after closed is closed

	nodeUpdatePeriod time.Duration
}

// New returns a new agent, ready for task dispatch.
func New(config *Config) (*Agent, error) {
	if err := config.validate(); err != nil {
		return nil, err
	}

	a := &Agent{
		config:           config,
		sessionq:         make(chan sessionOperation),
		started:          make(chan struct{}),
		leaving:          make(chan struct{}),
		left:             make(chan struct{}),
		stopped:          make(chan struct{}),
		closed:           make(chan struct{}),
		ready:            make(chan struct{}),
		nodeUpdatePeriod: nodeUpdatePeriod,
	}

	a.worker = newWorker(config.DB, config.Executor, a)
	return a, nil
}

// Start begins execution of the agent in the provided context, if not already
// started.
//
// Start returns an error if the agent has already started.
func (a *Agent) Start(ctx context.Context) error {
	err := errAgentStarted

	a.startOnce.Do(func() {
		close(a.started)
		go a.run(ctx)
		err = nil // clear error above, only once.
	})

	return err
}

// Leave instructs the agent to leave the cluster. This method will shut down
// assignment processing and remove all assignments from the node.
// Leave blocks until the worker has finished closing all task managers or the
// agent is closed.
func (a *Agent) Leave(ctx context.Context) error {
	select {
	case <-a.started:
	default:
		return errAgentNotStarted
	}

	a.leaveOnce.Do(func() {
		close(a.leaving)
	})

	// Do not call Wait until we have confirmed that the agent is no longer
	// accepting assignments. Starting a worker might race with Wait.
	select {
	case <-a.left:
	case <-a.closed:
		return ErrClosed
	case <-ctx.Done():
		return ctx.Err()
	}

	// agent could be closed while Leave is in progress
	var err error
	ch := make(chan struct{})
	go func() {
		err = a.worker.Wait(ctx)
		close(ch)
	}()

	select {
	case <-ch:
		return err
	case <-a.closed:
		return ErrClosed
	}
}

// Stop shuts down the agent, blocking until full shutdown. If the agent is
// not started, Stop returns errAgentNotStarted rather than blocking.
func (a *Agent) Stop(ctx context.Context) error {
	select {
	case <-a.started:
	default:
		return errAgentNotStarted
	}

	a.stop()

	// wait till closed or context cancelled
	select {
	case <-a.closed:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// stop signals the agent shutdown process, returning true if this call was
// the first to actually shut down the agent.
func (a *Agent) stop() bool {
	var stopped bool
	a.stopOnce.Do(func() {
		close(a.stopped)
		stopped = true
	})

	return stopped
}
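// runAgentLifecycle is an illustrative sketch, not part of the original file:
// it shows the intended lifecycle of an Agent (New -> Start -> Ready -> Stop),
// assuming the caller supplies a fully populated Config.
func runAgentLifecycle(ctx context.Context, config *Config) error {
	agent, err := New(config)
	if err != nil {
		return err
	}
	if err := agent.Start(ctx); err != nil {
		return err
	}
	// Wait for the first session to register before relying on the agent.
	select {
	case <-agent.Ready():
	case <-ctx.Done():
		return ctx.Err()
	}
	// Stop blocks until the agent has fully shut down.
	return agent.Stop(ctx)
}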
// Err returns the error that caused the agent to shut down, or nil. Err
// blocks until the agent is fully shut down.
func (a *Agent) Err(ctx context.Context) error {
	select {
	case <-a.closed:
		return a.err
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Ready returns a channel that will be closed when the agent first becomes
// ready.
func (a *Agent) Ready() <-chan struct{} {
	return a.ready
}

func (a *Agent) run(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	defer close(a.closed) // full shutdown.

	ctx = log.WithModule(ctx, "agent")

	log.G(ctx).Debug("(*Agent).run")
	defer log.G(ctx).Debug("(*Agent).run exited")

	nodeTLSInfo := a.config.NodeTLSInfo

	// get the node description
	nodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
	if err != nil {
		log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: node description unavailable")
	}
	// nodeUpdateTicker is used to periodically check for updates to the node
	// description
	nodeUpdateTicker := time.NewTicker(a.nodeUpdatePeriod)
	defer nodeUpdateTicker.Stop()

	var (
		backoff       time.Duration
		session       = newSession(ctx, a, backoff, "", nodeDescription) // start the initial session
		registered    = session.registered
		ready         = a.ready // first session ready
		sessionq      chan sessionOperation
		leaving       = a.leaving
		subscriptions = map[string]context.CancelFunc{}
		// subscriptionDone is a channel that allows us to notify ourselves
		// that a log subscription should be finished. this channel is
		// unbuffered, because it is only written to in a goroutine, and
		// therefore cannot block the main execution path.
		subscriptionDone = make(chan string)
	)
	defer func() {
		session.close()
	}()

	if err := a.worker.Init(ctx); err != nil {
		log.G(ctx).WithError(err).Error("worker initialization failed")
		a.err = err
		return // fatal?
	}
	defer a.worker.Close()

	// set up a reliable reporter to call back to us.
	reporter := newStatusReporter(ctx, a)
	defer reporter.Close()

	a.worker.Listen(ctx, reporter)

	updateNode := func() {
		// skip updating if the registration isn't finished
		if registered != nil {
			return
		}
		// get the current node description
		newNodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
		if err != nil {
			log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: updated node description unavailable")
		}

		// if newNodeDescription is nil, it will cause a panic when trying to
		// create a session. Typically this can happen if the engine goes down
		if newNodeDescription == nil {
			return
		}

		// if the node description has changed, update it to the new one
		// and close the session. The old session will be stopped and a
		// new one will be created with the updated description
		if !reflect.DeepEqual(nodeDescription, newNodeDescription) {
			nodeDescription = newNodeDescription
			// close the session
			log.G(ctx).Info("agent: found node update")

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			registered = nil
		}
	}

	for {
		select {
		case operation := <-sessionq:
			operation.response <- operation.fn(session)
		case <-leaving:
			leaving = nil

			// TODO(stevvooe): Signal to the manager that the node is leaving.

			// when leaving we remove all assignments.
			if err := a.worker.Assign(ctx, nil); err != nil {
				log.G(ctx).WithError(err).Error("failed removing all assignments")
			}

			close(a.left)
		case msg := <-session.assignments:
			// if we have left, accept no more assignments
			if leaving == nil {
				continue
			}

			switch msg.Type {
			case api.AssignmentsMessage_COMPLETE:
				// Need to assign secrets and configs before tasks,
				// because tasks might depend on new secrets or configs
				if err := a.worker.Assign(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to synchronize worker assignments")
				}
			case api.AssignmentsMessage_INCREMENTAL:
				if err := a.worker.Update(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to update worker assignments")
				}
			}
		case msg := <-session.messages:
			if err := a.handleSessionMessage(ctx, msg, nodeTLSInfo); err != nil {
				log.G(ctx).WithError(err).Error("session message handler failed")
			}
		case sub := <-session.subscriptions:
			if sub.Close {
				if cancel, ok := subscriptions[sub.ID]; ok {
					cancel()
				}
				delete(subscriptions, sub.ID)
				continue
			}

			if _, ok := subscriptions[sub.ID]; ok {
				// Duplicate subscription
				continue
			}

			subCtx, subCancel := context.WithCancel(ctx)
			subscriptions[sub.ID] = subCancel
			// NOTE(dperny): for like 3 years, there has been a to do saying
			// "we're tossing the error here, that seems wrong". this is not a
			// to do anymore. 9/10 of these errors are going to be "context
			// deadline exceeded", and the remaining 1/10 obviously doesn't
			// matter or we'd have missed it by now.
			go func() {
				a.worker.Subscribe(subCtx, sub)
				// when the worker finishes the subscription, we should notify
				// ourselves that this has occurred. We cannot rely on getting
				// a Close message from the manager, as any number of things
				// could go wrong (see github.com/moby/moby/issues/39916).
				subscriptionDone <- sub.ID
			}()
		case subID := <-subscriptionDone:
			// subscription may already have been removed. If so, no need to
			// take any action.
			if cancel, ok := subscriptions[subID]; ok {
				cancel()
				delete(subscriptions, subID)
			}
		case <-registered:
			log.G(ctx).Debugln("agent: registered")
			if ready != nil {
				close(ready)
			}
			if a.config.SessionTracker != nil {
				a.config.SessionTracker.SessionEstablished()
			}
			ready = nil
			registered = nil // we only care about this once per session
			backoff = 0      // reset backoff
			sessionq = a.sessionq
			// re-report all task statuses when re-establishing a session
			go a.worker.Report(ctx, reporter)
		case err := <-session.errs:
			// TODO(stevvooe): This may actually block if a session is closed
			// but no error was sent. This must be the only place
			// session.close is called in response to errors, for this to work.
			if err != nil {
				if a.config.SessionTracker != nil {
					a.config.SessionTracker.SessionError(err)
				}

				backoff = initialSessionFailureBackoff + 2*backoff
				if backoff > maxSessionFailureBackoff {
					backoff = maxSessionFailureBackoff
				}
				log.G(ctx).WithError(err).WithField("backoff", backoff).Errorf("agent: session failed")
			}

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			// if we're here before <-registered, do nothing for that event
			registered = nil
		case <-session.closed:
			if a.config.SessionTracker != nil {
				if err := a.config.SessionTracker.SessionClosed(); err != nil {
					log.G(ctx).WithError(err).Error("agent: exiting")
					a.err = err
					return
				}
			}

			log.G(ctx).Debugf("agent: rebuild session")

			// select a session registration delay from the backoff range.
			delay := time.Duration(0)
			if backoff > 0 {
				delay = time.Duration(rand.Int63n(int64(backoff)))
			}
			session = newSession(ctx, a, delay, session.sessionID, nodeDescription)
			registered = session.registered
		case ev := <-a.config.NotifyTLSChange:
			// the TLS info has changed, so force a check to see if we need to
			// restart the session
			if tlsInfo, ok := ev.(*api.NodeTLSInfo); ok {
				nodeTLSInfo = tlsInfo
				updateNode()
				nodeUpdateTicker.Stop()
				nodeUpdateTicker = time.NewTicker(a.nodeUpdatePeriod)
			}
		case <-nodeUpdateTicker.C:
			// periodically check to see whether the node information has
			// changed, and if so, restart the session
			updateNode()
		case <-a.stopped:
			// TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
			// this loop a few times.
			return
		case <-ctx.Done():
			if a.err == nil {
				a.err = ctx.Err()
			}
			return
		}
	}
}
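// Worked example, derived from the retry arithmetic in run() above: with
// initialSessionFailureBackoff = 100ms and the update
// backoff = initialSessionFailureBackoff + 2*backoff, consecutive session
// failures produce backoff ceilings of roughly
//
//	100ms, 300ms, 700ms, 1.5s, 3.1s, 6.3s, then 8s (the
//	maxSessionFailureBackoff cap)
//
// and the actual reconnect delay is drawn uniformly from [0, backoff).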
func (a *Agent) handleSessionMessage(ctx context.Context, message *api.SessionMessage, nti *api.NodeTLSInfo) error {
	seen := map[api.Peer]struct{}{}
	for _, manager := range message.Managers {
		if manager.Peer.Addr == "" {
			continue
		}

		a.config.ConnBroker.Remotes().Observe(*manager.Peer, int(manager.Weight))
		seen[*manager.Peer] = struct{}{}
	}

	var changes *NodeChanges
	if message.Node != nil && (a.node == nil || !nodesEqual(a.node, message.Node)) {
		if a.config.NotifyNodeChange != nil {
			changes = &NodeChanges{Node: message.Node.Copy()}
		}
		a.node = message.Node.Copy()
		if err := a.config.Executor.Configure(ctx, a.node); err != nil {
			log.G(ctx).WithError(err).Error("node configure failed")
		}
	}
	if len(message.RootCA) > 0 && !bytes.Equal(message.RootCA, nti.TrustRoot) {
		if changes == nil {
			changes = &NodeChanges{RootCert: message.RootCA}
		} else {
			changes.RootCert = message.RootCA
		}
	}

	if changes != nil {
		a.config.NotifyNodeChange <- changes
	}

	// prune managers not in list.
	for peer := range a.config.ConnBroker.Remotes().Weights() {
		if _, ok := seen[peer]; !ok {
			a.config.ConnBroker.Remotes().Remove(peer)
		}
	}

	if message.NetworkBootstrapKeys == nil {
		return nil
	}

	for _, key := range message.NetworkBootstrapKeys {
		same := false
		for _, agentKey := range a.keys {
			if agentKey.LamportTime == key.LamportTime {
				same = true
			}
		}
		if !same {
			a.keys = message.NetworkBootstrapKeys
			if err := a.config.Executor.SetNetworkBootstrapKeys(a.keys); err != nil {
				return errors.Wrap(err, "configuring network key failed")
			}
		}
	}

	return nil
}

type sessionOperation struct {
	fn       func(session *session) error
	response chan error
}

// withSession runs fn with the current session.
func (a *Agent) withSession(ctx context.Context, fn func(session *session) error) error {
	response := make(chan error, 1)
	select {
	case a.sessionq <- sessionOperation{
		fn:       fn,
		response: response,
	}:
		select {
		case err := <-response:
			return err
		case <-a.closed:
			return ErrClosed
		case <-ctx.Done():
			return ctx.Err()
		}
	case <-a.closed:
		return ErrClosed
	case <-ctx.Done():
		return ctx.Err()
	}
}
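// withSessionExample is an illustrative sketch, not part of the original
// file: it shows how withSession hands a callback to the run loop via
// sessionq, where it executes against the live session.
func withSessionExample(ctx context.Context, a *Agent) error {
	return a.withSession(ctx, func(session *session) error {
		// This callback runs inline in the run loop, which blocks session
		// and assignment processing until it returns; keep it short.
		_ = session
		return nil
	})
}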
// UpdateTaskStatus attempts to send a task status update over the current
// session, blocking until the operation is completed.
//
// If an error is returned, the operation should be retried.
func (a *Agent) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
	log.G(ctx).WithField("task.id", taskID).Debug("(*Agent).UpdateTaskStatus")
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	errs := make(chan error, 1)
	if err := a.withSession(ctx, func(session *session) error {
		go func() {
			err := session.sendTaskStatus(ctx, taskID, status)
			if err != nil {
				if err == errTaskUnknown {
					err = nil // dispatcher no longer cares about this task.
				} else {
					log.G(ctx).WithError(err).Error("closing session after fatal error")
					session.sendError(err)
				}
			} else {
				log.G(ctx).Debug("task status reported")
			}

			errs <- err
		}()

		return nil
	}); err != nil {
		return err
	}

	select {
	case err := <-errs:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}
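// reportTaskStatusExample is an illustrative sketch, not part of the original
// file: per UpdateTaskStatus's contract, any returned error means the caller
// should retry. The one-second pause between attempts is an arbitrary choice
// for this sketch.
func reportTaskStatusExample(ctx context.Context, a *Agent, taskID string, status *api.TaskStatus) error {
	for {
		err := a.UpdateTaskStatus(ctx, taskID, status)
		if err == nil {
			return nil
		}
		// Give the agent a moment to rebuild its session before retrying.
		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}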
// Publisher returns a LogPublisher for the given subscription as well as a
// cancel function that should be called when the log stream is completed.
func (a *Agent) Publisher(ctx context.Context, subscriptionID string) (exec.LogPublisher, func(), error) {
	// TODO(stevvooe): The level of coordination here is WAY too much for
	// logs. These should only be best effort and really just buffer until a
	// session is ready. Ideally, they would use a separate connection
	// completely.

	var (
		err       error
		publisher api.LogBroker_PublishLogsClient
	)

	err = a.withSession(ctx, func(session *session) error {
		publisher, err = api.NewLogBrokerClient(session.conn.ClientConn).PublishLogs(ctx)
		return err
	})
	if err != nil {
		return nil, nil, err
	}

	// make a little closure for ending the log stream
	sendCloseMsg := func() {
		// send a close message, to tell the manager our logs are done
		publisher.Send(&api.PublishLogsMessage{
			SubscriptionID: subscriptionID,
			Close:          true,
		})
		// close the stream for real. ignore the return value and the error,
		// because we don't care.
		publisher.CloseAndRecv()
	}

	return exec.LogPublisherFunc(func(ctx context.Context, message api.LogMessage) error {
		select {
		case <-ctx.Done():
			sendCloseMsg()
			return ctx.Err()
		default:
		}

		return publisher.Send(&api.PublishLogsMessage{
			SubscriptionID: subscriptionID,
			Messages:       []api.LogMessage{message},
		})
	}), func() {
		sendCloseMsg()
	}, nil
}
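// publishExample is an illustrative sketch, not part of the original file:
// it obtains a publisher for a subscription, sends one log message, and
// invokes the returned cancel func, which sends the Close message and tears
// down the stream.
func publishExample(ctx context.Context, a *Agent, subscriptionID string, msg api.LogMessage) error {
	publisher, cancel, err := a.Publisher(ctx, subscriptionID)
	if err != nil {
		return err
	}
	defer cancel()
	return publisher.Publish(ctx, msg)
}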
// nodeDescriptionWithHostname retrieves the node description, and overrides
// the hostname if one is configured.
func (a *Agent) nodeDescriptionWithHostname(ctx context.Context, tlsInfo *api.NodeTLSInfo) (*api.NodeDescription, error) {
	desc, err := a.config.Executor.Describe(ctx)

	// Override hostname and TLS info
	if desc != nil {
		if a.config.Hostname != "" {
			desc.Hostname = a.config.Hostname
		}
		desc.TLSInfo = tlsInfo
		desc.FIPS = a.config.FIPS
	}
	return desc, err
}

// nodesEqual returns true if the node states are functionally equal, ignoring
// status, version and other superfluous fields.
//
// This is used to decide whether or not to propagate a node update to the
// executor.
func nodesEqual(a, b *api.Node) bool {
	a, b = a.Copy(), b.Copy()

	a.Status, b.Status = api.NodeStatus{}, api.NodeStatus{}
	a.Meta, b.Meta = api.Meta{}, api.Meta{}

	return reflect.DeepEqual(a, b)
}
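// nodesEqualExample is an illustrative sketch, not part of the original file:
// two nodes that differ only in Status (or Meta) compare as equal, so
// heartbeat-driven status changes alone do not trigger executor
// reconfiguration in handleSessionMessage.
func nodesEqualExample(n *api.Node) bool {
	m := n.Copy()
	m.Status = api.NodeStatus{State: api.NodeStatus_DOWN}
	return nodesEqual(n, m) // true: Status is ignored by nodesEqual
}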