github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/node/node.go

package node

import (
	"bytes"
	"context"
	"crypto/tls"
	"encoding/json"
	"io/ioutil"
	"math"
	"net"
	"os"
	"path/filepath"
	"reflect"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/docker/swarmkit/ca/keyutils"
	"github.com/docker/swarmkit/identity"

	"github.com/docker/docker/pkg/plugingetter"
	"github.com/docker/go-metrics"
	"github.com/docker/libnetwork/drivers/overlay/overlayutils"
	"github.com/docker/swarmkit/agent"
	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/ca"
	"github.com/docker/swarmkit/connectionbroker"
	"github.com/docker/swarmkit/ioutils"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager"
	"github.com/docker/swarmkit/manager/allocator/cnmallocator"
	"github.com/docker/swarmkit/manager/encryption"
	"github.com/docker/swarmkit/remotes"
	"github.com/docker/swarmkit/xnet"
	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	bolt "go.etcd.io/bbolt"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/status"
)

const (
	stateFilename     = "state.json"
	roleChangeTimeout = 16 * time.Second
)

var (
	nodeInfo    metrics.LabeledGauge
	nodeManager metrics.Gauge

	errNodeStarted    = errors.New("node: already started")
	errNodeNotStarted = errors.New("node: not started")
	certDirectory     = "certificates"

	// ErrInvalidUnlockKey is returned when we can't decrypt the TLS certificate
	ErrInvalidUnlockKey = errors.New("node is locked, and needs a valid unlock key")

	// ErrMandatoryFIPS is returned when the cluster we are joining mandates FIPS, but we are running in non-FIPS mode
	ErrMandatoryFIPS = errors.New("node is not FIPS-enabled but cluster requires FIPS")
)

func init() {
	ns := metrics.NewNamespace("swarm", "node", nil)
	nodeInfo = ns.NewLabeledGauge("info", "Information related to the swarm", "",
		"swarm_id",
		"node_id",
	)
	nodeManager = ns.NewGauge("manager", "Whether this node is a manager or not", "")
	metrics.Register(ns)
}

// Config provides values for a Node.
type Config struct {
	// Hostname is the name of the host for the agent instance.
	Hostname string

	// JoinAddr specifies the address of a node to use for the initial
	// connection to another manager in the cluster. It is optional and should
	// be a single address; the actual remotes come from the stored state.
	JoinAddr string

	// StateDir specifies the directory the node uses to keep the state of the
	// remote managers and certificates.
	StateDir string

	// JoinToken is the token to be used on the first certificate request.
	JoinToken string

	// ExternalCAs is a list of CAs to which a manager node
	// will make certificate signing requests for node certificates.
	ExternalCAs []*api.ExternalCA

	// ForceNewCluster creates a new cluster from current raft state.
	ForceNewCluster bool

	// ListenControlAPI specifies the address the control API should listen on.
	ListenControlAPI string

	// ListenRemoteAPI specifies the address for the remote API that agents
	// and raft members connect to.
	ListenRemoteAPI string

	// AdvertiseRemoteAPI specifies the address that should be advertised
	// for connections to the remote API (including the raft service).
	AdvertiseRemoteAPI string

	// NetworkConfig stores network related config for the cluster
	NetworkConfig *cnmallocator.NetworkConfig

	// Executor specifies the executor to use for the agent.
	Executor exec.Executor

	// ElectionTick defines the number of ticks without a leader that are
	// needed to trigger a new election
	ElectionTick uint32

	// HeartbeatTick defines the number of ticks between each
	// heartbeat sent to other members for health-check purposes
	HeartbeatTick uint32

	// AutoLockManagers determines whether or not an unlock key will be generated
	// when bootstrapping a new cluster for the first time
	AutoLockManagers bool

	// UnlockKey is the key to unlock a node - used for decrypting at rest. This
	// only applies to nodes that have already joined a cluster.
	UnlockKey []byte

	// Availability allows a user to control the current scheduling status of a node
	Availability api.NodeSpec_Availability

	// PluginGetter provides access to docker's plugin inventory.
	PluginGetter plugingetter.PluginGetter

	// FIPS is a boolean stating whether the node is FIPS enabled
	FIPS bool
}

// Node implements the primary node functionality for a member of a swarm
// cluster. Node handles workloads and may also run as a manager.
type Node struct {
	sync.RWMutex
	config           *Config
	remotes          *persistentRemotes
	connBroker       *connectionbroker.Broker
	role             string
	roleCond         *sync.Cond
	conn             *grpc.ClientConn
	connCond         *sync.Cond
	nodeID           string
	started          chan struct{}
	startOnce        sync.Once
	stopped          chan struct{}
	stopOnce         sync.Once
	ready            chan struct{} // closed when agent has completed registration and manager (if enabled) is ready to receive control requests
	closed           chan struct{}
	err              error
	agent            *agent.Agent
	manager          *manager.Manager
	notifyNodeChange chan *agent.NodeChanges // used by the agent to relay node updates from the dispatcher Session stream to (*Node).run
	unlockKey        []byte
	vxlanUDPPort     uint32
}

type lastSeenRole struct {
	role api.NodeRole
}

// observe notes the latest value of this node role, and returns true if it
// is the first seen value, or is different from the most recently seen value.
func (l *lastSeenRole) observe(newRole api.NodeRole) bool {
	changed := l.role != newRole
	l.role = newRole
	return changed
}

// RemoteAPIAddr returns the address on which the remote manager API listens.
// It returns an error if the node is not a manager.
func (n *Node) RemoteAPIAddr() (string, error) {
	n.RLock()
	defer n.RUnlock()
	if n.manager == nil {
		return "", errors.New("manager is not running")
	}
	addr := n.manager.Addr()
	if addr == "" {
		return "", errors.New("manager addr is not set")
	}
	return addr, nil
}
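
// For callers outside this package, the lifecycle is roughly: build a Config,
// create the Node with New, Start it, wait on Ready, and eventually Stop it.
// A minimal sketch (the state directory, control socket path, and executor are
// placeholders a caller would supply) looks something like:
//
//	n, err := node.New(&node.Config{
//		StateDir:         "/var/lib/example/swarm",       // hypothetical path
//		ListenControlAPI: "/var/run/example/control.sock", // hypothetical socket
//		Executor:         myExecutor,                      // caller-provided exec.Executor
//	})
//	if err != nil {
//		return err
//	}
//	if err := n.Start(ctx); err != nil {
//		return err
//	}
//	defer n.Stop(ctx)
//	select {
//	case <-n.Ready():
//		// the agent (and manager, if applicable) is up
//	case <-ctx.Done():
//		return ctx.Err()
//	}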

// New returns a new Node instance.
func New(c *Config) (*Node, error) {
	if err := os.MkdirAll(c.StateDir, 0700); err != nil {
		return nil, err
	}
	stateFile := filepath.Join(c.StateDir, stateFilename)
	dt, err := ioutil.ReadFile(stateFile)
	var p []api.Peer
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	if err == nil {
		if err := json.Unmarshal(dt, &p); err != nil {
			return nil, err
		}
	}
	n := &Node{
		remotes:          newPersistentRemotes(stateFile, p...),
		role:             ca.WorkerRole,
		config:           c,
		started:          make(chan struct{}),
		stopped:          make(chan struct{}),
		closed:           make(chan struct{}),
		ready:            make(chan struct{}),
		notifyNodeChange: make(chan *agent.NodeChanges, 1),
		unlockKey:        c.UnlockKey,
	}

	if n.config.JoinAddr != "" || n.config.ForceNewCluster {
		n.remotes = newPersistentRemotes(filepath.Join(n.config.StateDir, stateFilename))
		if n.config.JoinAddr != "" {
			n.remotes.Observe(api.Peer{Addr: n.config.JoinAddr}, remotes.DefaultObservationWeight)
		}
	}

	n.connBroker = connectionbroker.New(n.remotes)

	n.roleCond = sync.NewCond(n.RLocker())
	n.connCond = sync.NewCond(n.RLocker())
	return n, nil
}

// BindRemote starts a listener that exposes the remote API.
func (n *Node) BindRemote(ctx context.Context, listenAddr string, advertiseAddr string) error {
	n.RLock()
	defer n.RUnlock()

	if n.manager == nil {
		return errors.New("manager is not running")
	}

	return n.manager.BindRemote(ctx, manager.RemoteAddrs{
		ListenAddr:    listenAddr,
		AdvertiseAddr: advertiseAddr,
	})
}

// Start starts a node instance.
func (n *Node) Start(ctx context.Context) error {
	err := errNodeStarted

	n.startOnce.Do(func() {
		close(n.started)
		go n.run(ctx)
		err = nil // clear error above, only once.
	})
	return err
}

func (n *Node) currentRole() api.NodeRole {
	n.Lock()
	currentRole := api.NodeRoleWorker
	if n.role == ca.ManagerRole {
		currentRole = api.NodeRoleManager
	}
	n.Unlock()
	return currentRole
}

// configVXLANUDPPort sets the VXLAN port in libnetwork
func configVXLANUDPPort(ctx context.Context, vxlanUDPPort uint32) {
	if err := overlayutils.ConfigVXLANUDPPort(vxlanUDPPort); err != nil {
		log.G(ctx).WithError(err).Error("failed to configure VXLAN UDP port")
		return
	}
	logrus.Infof("initialized VXLAN UDP port to %d", vxlanUDPPort)
}
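
// The run loop below is what Stop and Err (defined further down) observe: Stop
// closes n.stopped and waits for n.closed, while Err reports why run exited.
// A shutdown sketch with a bounded wait (the 30-second timeout is arbitrary)
// might look like:
//
//	stopCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	if err := n.Stop(stopCtx); err != nil {
//		return err // the node didn't shut down in time, or was never started
//	}
//	if err := n.Err(context.Background()); err != nil {
//		// run exited with an error rather than a clean shutdown
//		log.G(stopCtx).WithError(err).Warn("node exited with error")
//	}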

func (n *Node) run(ctx context.Context) (err error) {
	defer func() {
		n.err = err
		// close the n.closed channel to indicate that the Node has completely
		// terminated
		close(n.closed)
	}()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	ctx = log.WithModule(ctx, "node")

	// set up a goroutine to monitor the stop channel, and cancel the run
	// context when the node is stopped
	go func(ctx context.Context) {
		select {
		case <-ctx.Done():
		case <-n.stopped:
			cancel()
		}
	}(ctx)

	// First things first: get the SecurityConfig for this node. This includes
	// the certificate information, and the root CA. It also returns a cancel
	// function. This is needed because the SecurityConfig is a live object,
	// and provides a watch queue so that callers can observe changes to the
	// security config. This watch queue has to be closed, which is done by the
	// secConfigCancel function.
	//
	// It's also noteworthy that loading the security config with the node's
	// loadSecurityConfig method has the side effect of setting the node's ID
	// and role fields, meaning it isn't until after that point that the node
	// knows its ID.
	paths := ca.NewConfigPaths(filepath.Join(n.config.StateDir, certDirectory))
	securityConfig, secConfigCancel, err := n.loadSecurityConfig(ctx, paths)
	if err != nil {
		return err
	}
	defer secConfigCancel()

	// Now that we have the security config, we can get a TLSRenewer, which is
	// a live component handling certificate rotation.
	renewer := ca.NewTLSRenewer(securityConfig, n.connBroker, paths.RootCA)

	// Now that we have the security goop all loaded, we know the Node's ID and
	// can add that to our logging context.
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("node.id", n.NodeID()))

	// Next, set up the task database. The task database is used by the agent
	// to keep a persistent local record of its tasks. Since every manager also
	// has an agent, every node needs a task database, so we do this regardless
	// of role.
	taskDBPath := filepath.Join(n.config.StateDir, "worker", "tasks.db")
	// Doing os.MkdirAll will create the necessary directory path for the task
	// database if it doesn't already exist, and if it does already exist, no
	// error will be returned, so we use this regardless of whether this node
	// is new or not.
	if err := os.MkdirAll(filepath.Dir(taskDBPath), 0777); err != nil {
		return err
	}

	db, err := bolt.Open(taskDBPath, 0666, nil)
	if err != nil {
		return err
	}
	defer db.Close()

	// agentDone is a channel that represents the agent having exited. We start
	// the agent in a goroutine a few blocks down, and before that goroutine
	// exits, it closes this channel to signal to the goroutine just below to
	// terminate.
	agentDone := make(chan struct{})

	// This goroutine is the node changes loop. The n.notifyNodeChange
	// channel is passed to the agent. When a new node object gets sent down
	// to the agent, it gets passed back up to this node object, so that we can
	// check if a role update or a root certificate rotation is required. This
	// handles root rotation, but the renewer handles regular certificate
	// rotation.
	go func() {
		// lastNodeDesiredRole is the last-seen value of Node.Spec.DesiredRole,
		// used to make role changes "edge triggered" and avoid renewal loops.
		lastNodeDesiredRole := lastSeenRole{role: n.currentRole()}

		for {
			select {
			case <-agentDone:
				return
			case nodeChanges := <-n.notifyNodeChange:
				if nodeChanges.Node != nil {
					if nodeChanges.Node.VXLANUDPPort != 0 {
						n.vxlanUDPPort = nodeChanges.Node.VXLANUDPPort
						configVXLANUDPPort(ctx, n.vxlanUDPPort)
					}
					// This is a bit complex to be backward compatible with older CAs that
					// don't support the Node.Role field. They only use what's presently
					// called DesiredRole.
					// 1) If DesiredRole changes, kick off a certificate renewal. The renewal
					//    is delayed slightly to give Role time to change as well if this is
					//    a newer CA. If the certificate we get back doesn't have the expected
					//    role, we continue renewing with exponential backoff.
					// 2) If the server is sending us IssuanceStateRotate, renew the cert as
					//    requested by the CA.
					desiredRoleChanged := lastNodeDesiredRole.observe(nodeChanges.Node.Spec.DesiredRole)
					if desiredRoleChanged {
						switch nodeChanges.Node.Spec.DesiredRole {
						case api.NodeRoleManager:
							renewer.SetExpectedRole(ca.ManagerRole)
						case api.NodeRoleWorker:
							renewer.SetExpectedRole(ca.WorkerRole)
						}
					}
					if desiredRoleChanged || nodeChanges.Node.Certificate.Status.State == api.IssuanceStateRotate {
						renewer.Renew()
					}
				}

				if nodeChanges.RootCert != nil {
					if bytes.Equal(nodeChanges.RootCert, securityConfig.RootCA().Certs) {
						continue
					}
					newRootCA, err := ca.NewRootCA(nodeChanges.RootCert, nil, nil, ca.DefaultNodeCertExpiration, nil)
					if err != nil {
						log.G(ctx).WithError(err).Error("invalid new root certificate from the dispatcher")
						continue
					}
					if err := securityConfig.UpdateRootCA(&newRootCA); err != nil {
						log.G(ctx).WithError(err).Error("could not use new root CA from dispatcher")
						continue
					}
					if err := ca.SaveRootCA(newRootCA, paths.RootCA); err != nil {
						log.G(ctx).WithError(err).Error("could not save new root certificate from the dispatcher")
						continue
					}
				}
			}
		}
	}()

	// Now we're going to launch the main component goroutines: the Agent, the
	// Manager (maybe), and the certificate updates loop. We shouldn't exit
	// the node object until all 3 of these components have terminated, so we
	// create a waitgroup to block termination of the node until then.
	var wg sync.WaitGroup
	wg.Add(3)

	// These two blocks update some of the metrics settings.
	nodeInfo.WithValues(
		securityConfig.ClientTLSCreds.Organization(),
		securityConfig.ClientTLSCreds.NodeID(),
	).Set(1)

	if n.currentRole() == api.NodeRoleManager {
		nodeManager.Set(1)
	} else {
		nodeManager.Set(0)
	}

	// We created the renewer way up when we were creating the SecurityConfig
	// at the beginning of run, but now we're ready to start receiving
	// CertificateUpdates, so we launch a goroutine to handle this. updates is
	// a channel we iterate over, containing the results of certificate renewals.
	updates := renewer.Start(ctx)
	go func() {
		for certUpdate := range updates {
			if certUpdate.Err != nil {
				logrus.Warnf("error renewing TLS certificate: %v", certUpdate.Err)
				continue
			}
			// Set the new role, and notify our waiting role changing logic
			// that the role has changed.
			n.Lock()
			n.role = certUpdate.Role
			n.roleCond.Broadcast()
			n.Unlock()

			// Export the new role for metrics
			if n.currentRole() == api.NodeRoleManager {
				nodeManager.Set(1)
			} else {
				nodeManager.Set(0)
			}
		}

		wg.Done()
	}()

	// and, finally, start the two main components: the manager and the agent
	role := n.role

	// Channels to signal when these respective components are up and ready to
	// go.
	managerReady := make(chan struct{})
	agentReady := make(chan struct{})
	// these variables are defined in this scope so that they're closed over by
	// the respective goroutines below.
	var managerErr error
	var agentErr error
	go func() {
		// superviseManager is a routine that watches our manager role
		managerErr = n.superviseManager(ctx, securityConfig, paths.RootCA, managerReady, renewer) // store err and loop
		wg.Done()
		cancel()
	}()
	go func() {
		agentErr = n.runAgent(ctx, db, securityConfig, agentReady)
		wg.Done()
		cancel()
		close(agentDone)
	}()

	// This goroutine is what signals that the node has fully started by
	// closing the n.ready channel. First, it waits for the agent to start.
	// Then, if this node is a manager, it will wait on either the manager
	// starting, or the node role changing. This ensures that if the node is
	// demoted before the manager starts, it doesn't get stuck.
	go func() {
		<-agentReady
		if role == ca.ManagerRole {
			workerRole := make(chan struct{})
			waitRoleCtx, waitRoleCancel := context.WithCancel(ctx)
			go func() {
				if n.waitRole(waitRoleCtx, ca.WorkerRole) == nil {
					close(workerRole)
				}
			}()
			select {
			case <-managerReady:
			case <-workerRole:
			}
			waitRoleCancel()
		}
		close(n.ready)
	}()

	// And, finally, we park and wait for the node to close up. If we get any
	// error other than context canceled, we return it.
	wg.Wait()
	if managerErr != nil && errors.Cause(managerErr) != context.Canceled {
		return managerErr
	}
	if agentErr != nil && errors.Cause(agentErr) != context.Canceled {
		return agentErr
	}
	// NOTE(dperny): we return err here, but the last time I can see err being
	// set is when we open the boltdb way up in this method, so I don't know
	// what returning err is supposed to do.
	return err
}

// Stop stops node execution
func (n *Node) Stop(ctx context.Context) error {
	select {
	case <-n.started:
	default:
		return errNodeNotStarted
	}
	// ask agent to clean up assignments
	n.Lock()
	if n.agent != nil {
		if err := n.agent.Leave(ctx); err != nil {
			log.G(ctx).WithError(err).Error("agent failed to clean up assignments")
		}
	}
	n.Unlock()

	n.stopOnce.Do(func() {
		close(n.stopped)
	})

	select {
	case <-n.closed:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Err returns the error that caused the node to shut down, or nil. Err blocks
// until the node has fully shut down.
func (n *Node) Err(ctx context.Context) error {
	select {
	case <-n.closed:
		return n.err
	case <-ctx.Done():
		return ctx.Err()
	}
}

// runAgent starts the node's agent. When the agent has started, the provided
// ready channel is closed. When the agent exits, this will return the error
// that caused it.
func (n *Node) runAgent(ctx context.Context, db *bolt.DB, securityConfig *ca.SecurityConfig, ready chan<- struct{}) error {
	// First, get a channel for knowing when a remote peer has been selected.
	// The value returned from remotesCh is ignored; we just need to know
	// when the peer is selected.
	remotesCh := n.remotes.WaitSelect(ctx)
	// Then, we set up a new context to pass specifically to
	// ListenControlSocket, and start that method to wait on a connection on
	// the cluster control API.
	waitCtx, waitCancel := context.WithCancel(ctx)
	controlCh := n.ListenControlSocket(waitCtx)

	// The goal here is to wait until either we have a remote peer selected or
	// a connection to the control socket. These are both ways to connect the
	// agent to a manager, and we need to wait until one or the other is
	// available to start the agent.
waitPeer:
	for {
		select {
		case <-ctx.Done():
			break waitPeer
		case <-remotesCh:
			break waitPeer
		case conn := <-controlCh:
			// conn will probably be nil the first time we receive on this
			// channel, but only a non-nil conn represents an actual connection.
			if conn != nil {
				break waitPeer
			}
		}
	}

	// We can stop listening for new control socket connections once we're
	// ready
	waitCancel()

	// NOTE(dperny): not sure why we need to recheck the context here. I guess
	// it avoids a race if the context was canceled at the same time that a
	// connection or peer was available. I think it's just an optimization.
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	// Now we can go ahead and configure, create, and start the agent.
	secChangesCh, secChangesCancel := securityConfig.Watch()
	defer secChangesCancel()

	rootCA := securityConfig.RootCA()
	issuer := securityConfig.IssuerInfo()

	agentConfig := &agent.Config{
		Hostname:         n.config.Hostname,
		ConnBroker:       n.connBroker,
		Executor:         n.config.Executor,
		DB:               db,
		NotifyNodeChange: n.notifyNodeChange,
		NotifyTLSChange:  secChangesCh,
		Credentials:      securityConfig.ClientTLSCreds,
		NodeTLSInfo: &api.NodeTLSInfo{
			TrustRoot:           rootCA.Certs,
			CertIssuerPublicKey: issuer.PublicKey,
			CertIssuerSubject:   issuer.Subject,
		},
		FIPS: n.config.FIPS,
	}
	// if a join address has been specified, then if the agent fails to connect
	// due to a TLS error, fail fast - don't keep re-trying to join
	if n.config.JoinAddr != "" {
		agentConfig.SessionTracker = &firstSessionErrorTracker{}
	}

	a, err := agent.New(agentConfig)
	if err != nil {
		return err
	}
	if err := a.Start(ctx); err != nil {
		return err
	}

	n.Lock()
	n.agent = a
	n.Unlock()

	defer func() {
		n.Lock()
		n.agent = nil
		n.Unlock()
	}()

	// when the agent indicates that it is ready, we close the ready channel.
	go func() {
		<-a.Ready()
		close(ready)
	}()

	// todo: manually call stop on context cancellation?

	return a.Err(context.Background())
}

// Ready returns a channel that is closed after the node's initialization has
// completed for the first time.
func (n *Node) Ready() <-chan struct{} {
	return n.ready
}

func (n *Node) setControlSocket(conn *grpc.ClientConn) {
	n.Lock()
	if n.conn != nil {
		n.conn.Close()
	}
	n.conn = conn
	n.connBroker.SetLocalConn(conn)
	n.connCond.Broadcast()
	n.Unlock()
}

// ListenControlSocket listens for changes to the connection used for managing
// the cluster control API.
func (n *Node) ListenControlSocket(ctx context.Context) <-chan *grpc.ClientConn {
	c := make(chan *grpc.ClientConn, 1)
	n.RLock()
	conn := n.conn
	c <- conn
	done := make(chan struct{})
	go func() {
		select {
		case <-ctx.Done():
			n.connCond.Broadcast()
		case <-done:
		}
	}()
	go func() {
		defer close(c)
		defer close(done)
		defer n.RUnlock()
		for {
			select {
			case <-ctx.Done():
				return
			default:
			}
			if conn == n.conn {
				n.connCond.Wait()
				continue
			}
			conn = n.conn
			select {
			case c <- conn:
			case <-ctx.Done():
				return
			}
		}
	}()
	return c
}
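
// ListenControlSocket yields the current control connection immediately
// (possibly nil) and then every subsequent change until the context ends. A
// consumer would typically just keep the latest non-nil value, roughly:
//
//	for conn := range n.ListenControlSocket(ctx) {
//		if conn == nil {
//			continue // no local manager connection right now
//		}
//		client := api.NewControlClient(conn)
//		_ = client // use the control client until the next change arrives
//	}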

// NodeID returns the current node's ID. May be empty if not set.
func (n *Node) NodeID() string {
	n.RLock()
	defer n.RUnlock()
	return n.nodeID
}

// Manager returns the manager instance started by the node. May be nil.
func (n *Node) Manager() *manager.Manager {
	n.RLock()
	defer n.RUnlock()
	return n.manager
}

// Agent returns the agent instance started by the node. May be nil.
func (n *Node) Agent() *agent.Agent {
	n.RLock()
	defer n.RUnlock()
	return n.agent
}

// IsStateDirty returns true if any objects have been added to raft which make
// the state "dirty". Currently, the existence of any object other than the
// default cluster or the local node implies a dirty state.
func (n *Node) IsStateDirty() (bool, error) {
	n.RLock()
	defer n.RUnlock()

	if n.manager == nil {
		return false, errors.New("node is not a manager")
	}

	return n.manager.IsStateDirty()
}

// Remotes returns the list of peers known to the node.
func (n *Node) Remotes() []api.Peer {
	weights := n.remotes.Weights()
	remotes := make([]api.Peer, 0, len(weights))
	for p := range weights {
		remotes = append(remotes, p)
	}
	return remotes
}

// isMandatoryFIPSClusterID returns whether the cluster ID in the given
// security config indicates that the cluster mandates FIPS mode. Such
// cluster IDs start with "FIPS." as a prefix.
func isMandatoryFIPSClusterID(securityConfig *ca.SecurityConfig) bool {
	return strings.HasPrefix(securityConfig.ClientTLSCreds.Organization(), "FIPS.")
}

// isMandatoryFIPSClusterJoinToken returns whether the given join token
// indicates that the cluster mandates FIPS mode.
func isMandatoryFIPSClusterJoinToken(joinToken string) bool {
	if parsed, err := ca.ParseJoinToken(joinToken); err == nil {
		return parsed.FIPS
	}
	return false
}

func generateFIPSClusterID() string {
	return "FIPS." + identity.NewID()
}

func (n *Node) loadSecurityConfig(ctx context.Context, paths *ca.SecurityConfigPaths) (*ca.SecurityConfig, func() error, error) {
	var (
		securityConfig *ca.SecurityConfig
		cancel         func() error
	)

	krw := ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{FIPS: n.config.FIPS})
	// if FIPS is required, we want to make sure our key is stored in PKCS8 format
	if n.config.FIPS {
		krw.SetKeyFormatter(keyutils.FIPS)
	}
	if err := krw.Migrate(); err != nil {
		return nil, nil, err
	}

	// Check if we already have valid certificates on disk.
	rootCA, err := ca.GetLocalRootCA(paths.RootCA)
	if err != nil && err != ca.ErrNoLocalRootCA {
		return nil, nil, err
	}
	if err == nil {
		// if forcing a new cluster, we allow the certificates to be expired - a new set will be generated
		securityConfig, cancel, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster)
		if err != nil {
			_, isInvalidKEK := errors.Cause(err).(ca.ErrInvalidKEK)
			if isInvalidKEK {
				return nil, nil, ErrInvalidUnlockKey
			} else if !os.IsNotExist(err) {
				return nil, nil, errors.Wrapf(err, "error while loading TLS certificate in %s", paths.Node.Cert)
			}
		}
	}

	if securityConfig == nil {
		if n.config.JoinAddr == "" {
			// if we're not joining a cluster, bootstrap a new one - and we have to set the unlock key
			n.unlockKey = nil
			if n.config.AutoLockManagers {
				n.unlockKey = encryption.GenerateSecretKey()
			}
			krw = ca.NewKeyReadWriter(paths.Node, n.unlockKey, &manager.RaftDEKData{FIPS: n.config.FIPS})
			rootCA, err = ca.CreateRootCA(ca.DefaultRootCN)
			if err != nil {
				return nil, nil, err
			}
			if err := ca.SaveRootCA(rootCA, paths.RootCA); err != nil {
				return nil, nil, err
			}
			log.G(ctx).Debug("generated CA key and certificate")
		} else if err == ca.ErrNoLocalRootCA { // from previous error loading the root CA from disk
			// if we are attempting to join another cluster, which has a FIPS join token, and we are not FIPS, error
			if n.config.JoinAddr != "" && isMandatoryFIPSClusterJoinToken(n.config.JoinToken) && !n.config.FIPS {
				return nil, nil, ErrMandatoryFIPS
			}
			rootCA, err = ca.DownloadRootCA(ctx, paths.RootCA, n.config.JoinToken, n.connBroker)
			if err != nil {
				return nil, nil, err
			}
			log.G(ctx).Debug("downloaded CA certificate")
		}

		// Obtain new certs and set up TLS certificate renewal for this node:
		// - If certificates weren't present on disk, we call CreateSecurityConfig, which blocks
		//   until a valid certificate has been issued.
		// - We wait for CreateSecurityConfig to finish since we need a certificate to operate.

		// Attempt to load certificate from disk
		securityConfig, cancel, err = ca.LoadSecurityConfig(ctx, rootCA, krw, n.config.ForceNewCluster)
		if err == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id": securityConfig.ClientTLSCreds.NodeID(),
			}).Debugf("loaded TLS certificate")
		} else {
			if _, ok := errors.Cause(err).(ca.ErrInvalidKEK); ok {
				return nil, nil, ErrInvalidUnlockKey
			}
			log.G(ctx).WithError(err).Debugf("no node credentials found in: %s", krw.Target())

			// if we are attempting to join another cluster, which has a FIPS join token, and we are not FIPS, error
			if n.config.JoinAddr != "" && isMandatoryFIPSClusterJoinToken(n.config.JoinToken) && !n.config.FIPS {
				return nil, nil, ErrMandatoryFIPS
			}

			requestConfig := ca.CertificateRequestConfig{
				Token:        n.config.JoinToken,
				Availability: n.config.Availability,
				ConnBroker:   n.connBroker,
			}
			// If this is a new cluster, we want to name the cluster ID "FIPS-something"
			if n.config.FIPS {
				requestConfig.Organization = generateFIPSClusterID()
			}
			securityConfig, cancel, err = rootCA.CreateSecurityConfig(ctx, krw, requestConfig)

			if err != nil {
				return nil, nil, err
			}
		}
	}

	if isMandatoryFIPSClusterID(securityConfig) && !n.config.FIPS {
		return nil, nil, ErrMandatoryFIPS
	}

	n.Lock()
	n.role = securityConfig.ClientTLSCreds.Role()
	n.nodeID = securityConfig.ClientTLSCreds.NodeID()
	n.roleCond.Broadcast()
	n.Unlock()

	return securityConfig, cancel, nil
}

func (n *Node) initManagerConnection(ctx context.Context, ready chan<- struct{}) error {
	opts := []grpc.DialOption{
		grpc.WithUnaryInterceptor(grpc_prometheus.UnaryClientInterceptor),
		grpc.WithStreamInterceptor(grpc_prometheus.StreamClientInterceptor),
		grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(math.MaxInt32)),
	}
	insecureCreds := credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})
	opts = append(opts, grpc.WithTransportCredentials(insecureCreds))
	addr := n.config.ListenControlAPI
	opts = append(opts, grpc.WithDialer(
		func(addr string, timeout time.Duration) (net.Conn, error) {
			return xnet.DialTimeoutLocal(addr, timeout)
		}))
	conn, err := grpc.Dial(addr, opts...)
	if err != nil {
		return err
	}
	client := api.NewHealthClient(conn)
	for {
		resp, err := client.Check(ctx, &api.HealthCheckRequest{Service: "ControlAPI"})
		if err != nil {
			return err
		}
		if resp.Status == api.HealthCheckResponse_SERVING {
			break
		}
		time.Sleep(500 * time.Millisecond)
	}
	n.setControlSocket(conn)
	if ready != nil {
		close(ready)
	}
	return nil
}
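
// waitRole below, like ListenControlSocket and persistentRemotes.WaitSelect,
// uses a variation of the same pattern to make a sync.Cond wait cancelable: a
// helper goroutine broadcasts on the condition when the context is done, so
// the waiter wakes up and can re-check the context. A standalone sketch of
// that pattern, with a hypothetical predicate over state guarded by cond.L:
//
//	func waitFor(ctx context.Context, cond *sync.Cond, pred func() bool) error {
//		done := make(chan struct{})
//		defer close(done)
//		go func() {
//			select {
//			case <-ctx.Done():
//				cond.Broadcast() // wake the waiter so it notices cancellation
//			case <-done:
//			}
//		}()
//		cond.L.Lock()
//		defer cond.L.Unlock()
//		for !pred() {
//			if ctx.Err() != nil {
//				return ctx.Err()
//			}
//			cond.Wait() // releases cond.L while blocked, reacquires on wakeup
//		}
//		return nil
//	}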

// waitRole takes a context and a role. It blocks until the context is
// canceled or the node's role updates to the provided role. Returns nil when
// the node has acquired the provided role, or ctx.Err() if the context is
// canceled.
func (n *Node) waitRole(ctx context.Context, role string) error {
	n.roleCond.L.Lock()
	if role == n.role {
		n.roleCond.L.Unlock()
		return nil
	}
	finishCh := make(chan struct{})
	defer close(finishCh)
	go func() {
		select {
		case <-finishCh:
		case <-ctx.Done():
			// call broadcast to shutdown this function
			n.roleCond.Broadcast()
		}
	}()
	defer n.roleCond.L.Unlock()
	for role != n.role {
		n.roleCond.Wait()
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
	}

	return nil
}

// runManager runs the manager on this node. It returns a boolean indicating
// whether the stoppage was due to a role change, and an error indicating why
// the manager stopped.
func (n *Node) runManager(ctx context.Context, securityConfig *ca.SecurityConfig, rootPaths ca.CertPaths, ready chan struct{}, workerRole <-chan struct{}) (bool, error) {
	// First, set up this manager's advertise and listen addresses, if
	// provided. They might not be provided if this node is joining the cluster
	// instead of creating a new one.
	var remoteAPI *manager.RemoteAddrs
	if n.config.ListenRemoteAPI != "" {
		remoteAPI = &manager.RemoteAddrs{
			ListenAddr:    n.config.ListenRemoteAPI,
			AdvertiseAddr: n.config.AdvertiseRemoteAPI,
		}
	}

	joinAddr := n.config.JoinAddr
	if joinAddr == "" {
		remoteAddr, err := n.remotes.Select(n.NodeID())
		if err == nil {
			joinAddr = remoteAddr.Addr
		}
	}

	m, err := manager.New(&manager.Config{
		ForceNewCluster:  n.config.ForceNewCluster,
		RemoteAPI:        remoteAPI,
		ControlAPI:       n.config.ListenControlAPI,
		SecurityConfig:   securityConfig,
		ExternalCAs:      n.config.ExternalCAs,
		JoinRaft:         joinAddr,
		ForceJoin:        n.config.JoinAddr != "",
		StateDir:         n.config.StateDir,
		HeartbeatTick:    n.config.HeartbeatTick,
		ElectionTick:     n.config.ElectionTick,
		AutoLockManagers: n.config.AutoLockManagers,
		UnlockKey:        n.unlockKey,
		Availability:     n.config.Availability,
		PluginGetter:     n.config.PluginGetter,
		RootCAPaths:      rootPaths,
		FIPS:             n.config.FIPS,
		NetworkConfig:    n.config.NetworkConfig,
	})
	if err != nil {
		return false, err
	}
	// The done channel is used to signal that the manager has exited.
	done := make(chan struct{})
	// runErr is an error value set by the goroutine that runs the manager
	var runErr error

	// The context used to start this might have a logger associated with it
	// that we'd like to reuse, but we don't want to use that context, so we
	// pass to the goroutine only the logger, and create a new context with
	// that logger.
	go func(logger *logrus.Entry) {
		if err := m.Run(log.WithLogger(context.Background(), logger)); err != nil {
			runErr = err
		}
		close(done)
	}(log.G(ctx))

	// clearData is set in the select below, and is used to signal why the
	// manager is stopping, and indicate whether or not to delete raft data and
	// keys when stopping the manager.
	var clearData bool
	defer func() {
		n.Lock()
		n.manager = nil
		n.Unlock()
		m.Stop(ctx, clearData)
		<-done
		n.setControlSocket(nil)
	}()

	n.Lock()
	n.manager = m
	n.Unlock()

	connCtx, connCancel := context.WithCancel(ctx)
	defer connCancel()

	// launch a goroutine that will manage our local connection to the manager
	// from the agent. Remember the managerReady channel created way back in
	// run? This is actually where we close it. Not when the manager starts,
	// but when a connection to the control socket has been established.
	go n.initManagerConnection(connCtx, ready)

	// wait for manager stop or for role change
	// The manager can be stopped one of 4 ways:
	// 1. The manager may have errored out and returned an error, closing the
	//    done channel in the process
	// 2. The node may have been demoted to a worker. In this case, we're gonna
	//    have to stop the manager ourselves, setting clearData to true so the
	//    local raft data, certs, keys, etc, are nuked.
	// 3. The manager may have been booted from raft. This could happen if it's
	//    removed from the raft quorum but the role update hasn't registered
	//    yet. The fact that there is more than 1 code path to cause the
	//    manager to exit is a possible source of bugs.
	// 4. The context may have been canceled from above, in which case we
	//    should stop the manager ourselves, but indicate that this is NOT a
	//    demotion.
	select {
	case <-done:
		return false, runErr
	case <-workerRole:
		log.G(ctx).Info("role changed to worker, stopping manager")
		clearData = true
	case <-m.RemovedFromRaft():
		log.G(ctx).Info("manager removed from raft cluster, stopping manager")
		clearData = true
	case <-ctx.Done():
		return false, ctx.Err()
	}
	return clearData, nil
}

// superviseManager controls whether or not we are running a manager on this
// node
func (n *Node) superviseManager(ctx context.Context, securityConfig *ca.SecurityConfig, rootPaths ca.CertPaths, ready chan struct{}, renewer *ca.TLSRenewer) error {
	// superviseManager is a loop, because we can come in and out of being a
	// manager, and need to appropriately handle that without disrupting the
	// node functionality.
	for {
		// if we're not a manager, we're just gonna park here and wait until we
		// are. For normal agent nodes, we'll stay here forever, as intended.
		if err := n.waitRole(ctx, ca.ManagerRole); err != nil {
			return err
		}

		// Once we know we are a manager, we get ourselves ready for when we
		// lose that role. We create a channel to signal that we've become a
		// worker, and close it when n.waitRole completes.
		workerRole := make(chan struct{})
		waitRoleCtx, waitRoleCancel := context.WithCancel(ctx)
		go func() {
			if n.waitRole(waitRoleCtx, ca.WorkerRole) == nil {
				close(workerRole)
			}
		}()

		// the ready channel passed to superviseManager is in turn passed down
		// to the runManager function. It's used to signal to the caller that
		// the manager has started.
		wasRemoved, err := n.runManager(ctx, securityConfig, rootPaths, ready, workerRole)
		if err != nil {
			waitRoleCancel()
			return errors.Wrap(err, "manager stopped")
		}

		// If the manager stopped running and our role is still
		// "manager", it's possible that the manager was demoted and
		// the agent hasn't realized this yet. We should wait for the
		// role to change instead of restarting the manager immediately.
		err = func() error {
			timer := time.NewTimer(roleChangeTimeout)
			defer timer.Stop()
			defer waitRoleCancel()

			select {
			case <-timer.C:
			case <-workerRole:
				return nil
			case <-ctx.Done():
				return ctx.Err()
			}

			if !wasRemoved {
				log.G(ctx).Warn("failed to get worker role after manager stop, restarting manager")
				return nil
			}
			// We need to be extra careful about restarting the
			// manager. It may cause the node to wrongly join under
			// a new Raft ID. Since we didn't see a role change
			// yet, force a certificate renewal. If the certificate
			// comes back with a worker role, we know we shouldn't
			// restart the manager. However, if we don't see
			// workerRole get closed, it means we didn't switch to
			// a worker certificate, either because we couldn't
			// contact a working CA, or because we've been
			// re-promoted. In this case, we must assume we were
			// re-promoted, and restart the manager.
			log.G(ctx).Warn("failed to get worker role after manager stop, forcing certificate renewal")

			// We can safely reset this timer without stopping/draining the timer
			// first because the only way the code has reached this point is if the timer
			// has already expired - if the role changed or the context were canceled,
			// then we would have returned already.
			timer.Reset(roleChangeTimeout)

			renewer.Renew()

			// Now that the renewal request has been sent to the
			// renewal goroutine, wait for a change in role.
			select {
			case <-timer.C:
				log.G(ctx).Warn("failed to get worker role after manager stop, restarting manager")
			case <-workerRole:
			case <-ctx.Done():
				return ctx.Err()
			}
			return nil
		}()
		if err != nil {
			return err
		}

		// set ready to nil after the first time we've gone through this, as we
		// don't need to signal after the first time that the manager is ready.
		ready = nil
	}
}

// DowngradeKey reverts the node key to the older format so that it can
// run on older versions of swarmkit.
func (n *Node) DowngradeKey() error {
	paths := ca.NewConfigPaths(filepath.Join(n.config.StateDir, certDirectory))
	krw := ca.NewKeyReadWriter(paths.Node, n.config.UnlockKey, nil)

	return krw.DowngradeKey()
}

type persistentRemotes struct {
	sync.RWMutex
	c *sync.Cond
	remotes.Remotes
	storePath      string
	lastSavedState []api.Peer
}

func newPersistentRemotes(f string, peers ...api.Peer) *persistentRemotes {
	pr := &persistentRemotes{
		storePath: f,
		Remotes:   remotes.NewRemotes(peers...),
	}
	pr.c = sync.NewCond(pr.RLocker())
	return pr
}

func (s *persistentRemotes) Observe(peer api.Peer, weight int) {
	s.Lock()
	defer s.Unlock()
	s.Remotes.Observe(peer, weight)
	s.c.Broadcast()
	if err := s.save(); err != nil {
		logrus.Errorf("error writing cluster state file: %v", err)
	}
}

func (s *persistentRemotes) Remove(peers ...api.Peer) {
	s.Lock()
	defer s.Unlock()
	s.Remotes.Remove(peers...)
	if err := s.save(); err != nil {
		logrus.Errorf("error writing cluster state file: %v", err)
	}
}

func (s *persistentRemotes) save() error {
	weights := s.Weights()
	remotes := make([]api.Peer, 0, len(weights))
	for r := range weights {
		remotes = append(remotes, r)
	}
	sort.Sort(sortablePeers(remotes))
	if reflect.DeepEqual(remotes, s.lastSavedState) {
		return nil
	}
	dt, err := json.Marshal(remotes)
	if err != nil {
		return err
	}
	s.lastSavedState = remotes
	return ioutils.AtomicWriteFile(s.storePath, dt, 0600)
}

// WaitSelect waits until at least one remote becomes available and then selects one.
func (s *persistentRemotes) WaitSelect(ctx context.Context) <-chan api.Peer {
	c := make(chan api.Peer, 1)
	s.RLock()
	done := make(chan struct{})
	go func() {
		select {
		case <-ctx.Done():
			s.c.Broadcast()
		case <-done:
		}
	}()
	go func() {
		defer s.RUnlock()
		defer close(c)
		defer close(done)
		for {
			if ctx.Err() != nil {
				return
			}
			p, err := s.Select()
			if err == nil {
				c <- p
				return
			}
			s.c.Wait()
		}
	}()
	return c
}

// sortablePeers is a sort wrapper for []api.Peer
type sortablePeers []api.Peer

func (sp sortablePeers) Less(i, j int) bool { return sp[i].NodeID < sp[j].NodeID }

func (sp sortablePeers) Len() int { return len(sp) }

func (sp sortablePeers) Swap(i, j int) { sp[i], sp[j] = sp[j], sp[i] }
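
// persistentRemotes.save above canonicalizes the peer list by sorting it,
// skips the write when nothing changed (reflect.DeepEqual against the last
// saved state), and writes atomically so a crash can't leave a half-written
// state file. A standalone sketch of the same save-if-changed idea, with a
// hypothetical string-backed store, looks roughly like:
//
//	type stringStore struct {
//		path string
//		last []string
//	}
//
//	func (st *stringStore) save(items []string) error {
//		sorted := append([]string(nil), items...)
//		sort.Strings(sorted) // canonical order so the comparison is meaningful
//		if reflect.DeepEqual(sorted, st.last) {
//			return nil // nothing changed; skip the disk write
//		}
//		dt, err := json.Marshal(sorted)
//		if err != nil {
//			return err
//		}
//		st.last = sorted
//		return ioutils.AtomicWriteFile(st.path, dt, 0600)
//	}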

// firstSessionErrorTracker is a utility that helps determine whether the agent should exit after
// a TLS failure on establishing the first session. This should only happen if a join address
// is specified. If establishing the first session succeeds, but later on some session fails
// because of a TLS error, we don't want to exit the agent because a previously successful
// session indicates that the TLS error may be a transient issue.
type firstSessionErrorTracker struct {
	mu               sync.Mutex
	pastFirstSession bool
	err              error
}

func (fs *firstSessionErrorTracker) SessionEstablished() {
	fs.mu.Lock()
	fs.pastFirstSession = true
	fs.mu.Unlock()
}

func (fs *firstSessionErrorTracker) SessionError(err error) {
	fs.mu.Lock()
	fs.err = err
	fs.mu.Unlock()
}

// SessionClosed returns an error if we haven't yet established a session, and
// we get a gRPC error as a result of an X509 failure.
func (fs *firstSessionErrorTracker) SessionClosed() error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// if we've successfully established at least 1 session, never return
	// errors
	if fs.pastFirstSession {
		return nil
	}

	// get the GRPC status from the error, because we only care about GRPC
	// errors
	grpcStatus, ok := status.FromError(fs.err)
	// if this isn't a GRPC error, it's not an error we return from this method
	if !ok {
		return nil
	}

	// NOTE(dperny, cyli): grpc does not expose the error type, which means we
	// have to do string matching to figure out if it's an x509 error.
	//
	// The error we're looking for has "connection error:", then says
	// "transport:" and finally has "x509:".
	// Specifically, the connection error description reads:
	//
	//   transport: authentication handshake failed: x509: certificate signed by unknown authority
	//
	// This string matching has caused trouble in the past. Specifically, at
	// some point between grpc versions 1.3.0 and 1.7.5, the string we were
	// matching changed from "transport: x509" to "transport: authentication
	// handshake failed: x509", which was an issue because we were matching for
	// the string "transport: x509:".
	//
	// In GRPC >= 1.10.x, transient errors like TLS errors became hidden by the
	// load balancing that GRPC does. In GRPC 1.11.x, they were exposed again
	// (usually) in RPC calls, but the error string then became:
	//
	//   rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = "transport: authentication handshake failed: x509: certificate signed by unknown authority"
	//
	// It also went from an Internal error to an Unavailable error. So we're just going
	// to search for the string: "transport: authentication handshake failed: x509:" since
	// we want to fail for ALL x509 failures, not just unknown authority errors.

	if !strings.Contains(grpcStatus.Message(), "connection error") ||
		!strings.Contains(grpcStatus.Message(), "transport: authentication handshake failed: x509:") {
		return nil
	}
	return fs.err
}