github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/vendor_skip/go.mongodb.org/mongo-driver/x/mongo/driver/topology/server.go

// Copyright (C) MongoDB, Inc. 2017-present.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

package topology

import (
	"context"
	"errors"
	"fmt"
	"net"
	"sync"
	"sync/atomic"
	"time"

	"go.mongodb.org/mongo-driver/bson/primitive"
	"go.mongodb.org/mongo-driver/event"
	"go.mongodb.org/mongo-driver/mongo/address"
	"go.mongodb.org/mongo-driver/mongo/description"
	"go.mongodb.org/mongo-driver/x/mongo/driver"
	"go.mongodb.org/mongo-driver/x/mongo/driver/operation"
)

const minHeartbeatInterval = 500 * time.Millisecond
const wireVersion42 = 8 // Wire version for MongoDB 4.2

// Server state constants.
const (
	serverDisconnected int64 = iota
	serverDisconnecting
	serverConnected
)

func serverStateString(state int64) string {
	switch state {
	case serverDisconnected:
		return "Disconnected"
	case serverDisconnecting:
		return "Disconnecting"
	case serverConnected:
		return "Connected"
	}

	return ""
}

var (
	// ErrServerClosed occurs when an attempt to Get a connection is made after
	// the server has been closed.
	ErrServerClosed = errors.New("server is closed")
	// ErrServerConnected occurs when an attempt to Connect is made after a server
	// has already been connected.
	ErrServerConnected = errors.New("server is connected")

	errCheckCancelled = errors.New("server check cancelled")
	emptyDescription  = description.NewDefaultServer("")
)

// SelectedServer represents a specific server that was selected during server selection.
// It contains the kind of the topology it was selected from.
type SelectedServer struct {
	*Server

	Kind description.TopologyKind
}

// Description returns a description of the server as of the last heartbeat.
func (ss *SelectedServer) Description() description.SelectedServer {
	sdesc := ss.Server.Description()
	return description.SelectedServer{
		Server: sdesc,
		Kind:   ss.Kind,
	}
}

// Server is a single server within a topology.
type Server struct {
	// The following integer fields must be accessed using the atomic package and should be at the
	// beginning of the struct.
	// - atomic bug: https://pkg.go.dev/sync/atomic#pkg-note-BUG
	// - suggested layout: https://go101.org/article/memory-layout.html

	state          int64
	operationCount int64

	cfg     *serverConfig
	address address.Address

	// connection related fields
	pool *pool

	// goroutine management fields
	done          chan struct{}
	checkNow      chan struct{}
	disconnecting chan struct{}
	closewg       sync.WaitGroup

	// description related fields
	desc                   atomic.Value // holds a description.Server
	updateTopologyCallback atomic.Value
	topologyID             primitive.ObjectID

	// subscriber related fields
	subLock             sync.Mutex
	subscribers         map[uint64]chan description.Server
	currentSubscriberID uint64
	subscriptionsClosed bool

	// heartbeat and cancellation related fields
	// globalCtx should be created in NewServer and cancelled in Disconnect to signal that the server is shutting down.
	// heartbeatCtx should be used for individual heartbeats and should be a child of globalCtx so that it will be
	// cancelled automatically during shutdown.
	heartbeatLock      sync.Mutex
	conn               *connection
	globalCtx          context.Context
	globalCtxCancel    context.CancelFunc
	heartbeatCtx       context.Context
	heartbeatCtxCancel context.CancelFunc

	processErrorLock sync.Mutex
	rttMonitor       *rttMonitor
}

// updateTopologyCallback is a callback provided when creating a server. It is called when the parent Topology
// instance should be updated based on a new server description. The callback must return the server description
// that should be stored by the server.
type updateTopologyCallback func(description.Server) description.Server

// ConnectServer creates a new Server and then initializes it using the
// Connect method.
func ConnectServer(addr address.Address, updateCallback updateTopologyCallback, topologyID primitive.ObjectID, opts ...ServerOption) (*Server, error) {
	srvr := NewServer(addr, topologyID, opts...)
	err := srvr.Connect(updateCallback)
	if err != nil {
		return nil, err
	}
	return srvr, nil
}

// NewServer creates a new server. The mongodb server at the address will be monitored
// by an internal monitoring goroutine.
func NewServer(addr address.Address, topologyID primitive.ObjectID, opts ...ServerOption) *Server {
	cfg := newServerConfig(opts...)
	globalCtx, globalCtxCancel := context.WithCancel(context.Background())
	s := &Server{
		state: serverDisconnected,

		cfg:     cfg,
		address: addr,

		done:          make(chan struct{}),
		checkNow:      make(chan struct{}, 1),
		disconnecting: make(chan struct{}),

		topologyID: topologyID,

		subscribers:     make(map[uint64]chan description.Server),
		globalCtx:       globalCtx,
		globalCtxCancel: globalCtxCancel,
	}
	s.desc.Store(description.NewDefaultServer(addr))
	rttCfg := &rttConfig{
		interval:           cfg.heartbeatInterval,
		minRTTWindow:       5 * time.Minute,
		createConnectionFn: s.createConnection,
		createOperationFn:  s.createBaseOperation,
	}
	s.rttMonitor = newRTTMonitor(rttCfg)

	pc := poolConfig{
		Address:          addr,
		MinPoolSize:      cfg.minConns,
		MaxPoolSize:      cfg.maxConns,
		MaxConnecting:    cfg.maxConnecting,
		MaxIdleTime:      cfg.poolMaxIdleTime,
		MaintainInterval: cfg.poolMaintainInterval,
		PoolMonitor:      cfg.poolMonitor,
		Logger:           cfg.logger,
		handshakeErrFn:   s.ProcessHandshakeError,
	}

	connectionOpts := copyConnectionOpts(cfg.connectionOpts)
	s.pool = newPool(pc, connectionOpts...)
	s.publishServerOpeningEvent(s.address)

	return s
}

// Connect initializes the Server by starting background monitoring goroutines.
// This method must be called before a Server can be used.
func (s *Server) Connect(updateCallback updateTopologyCallback) error {
	if !atomic.CompareAndSwapInt64(&s.state, serverDisconnected, serverConnected) {
		return ErrServerConnected
	}

	desc := description.NewDefaultServer(s.address)
	if s.cfg.loadBalanced {
		// LBs automatically start off with kind LoadBalancer because there is no monitoring routine for state changes.
		desc.Kind = description.LoadBalancer
	}
	s.desc.Store(desc)
	s.updateTopologyCallback.Store(updateCallback)

	if !s.cfg.monitoringDisabled && !s.cfg.loadBalanced {
		s.rttMonitor.connect()
		s.closewg.Add(1)
		go s.update()
	}

	// The CMAP spec describes that pools should only be marked "ready" when the server description
	// is updated to something other than "Unknown". However, we maintain the previous Server
	// behavior here and immediately mark the pool as ready during Connect() to simplify and speed
	// up the Client startup behavior. The risk of marking a pool as ready proactively during
	// Connect() is that we could attempt to create connections to a server that was configured
	// erroneously until the first server check or checkOut() failure occurs, when the SDAM error
	// handler would transition the Server back to "Unknown" and set the pool to "paused".
	return s.pool.ready()
}

// Disconnect closes sockets to the server referenced by this Server.
// Subscriptions to this Server will be closed. Disconnect will shut down
// any monitoring goroutines, close the idle connection pool, and will
// wait until all the in-use connections have been returned to the connection
// pool and are closed before returning. If the context expires via
// cancellation, deadline, or timeout before the in-use connections have been
// returned, the in-use connections will be closed, resulting in the failure of
// any in-flight read or write operations. If this method returns with no
// errors, all connections associated with this Server have been closed.
func (s *Server) Disconnect(ctx context.Context) error {
	if !atomic.CompareAndSwapInt64(&s.state, serverConnected, serverDisconnecting) {
		return ErrServerClosed
	}

	s.updateTopologyCallback.Store((updateTopologyCallback)(nil))

	// Cancel the global context so any new contexts created from it will be automatically cancelled. Close the done
	// channel so the update() routine will know that it can stop. Cancel any in-progress monitoring checks at the end.
	// The done channel is closed before cancelling the check so the update() routine will immediately detect that it
	// can stop rather than trying to create new connections until the read from done succeeds.
	s.globalCtxCancel()
	close(s.done)
	s.cancelCheck()

	s.rttMonitor.disconnect()
	s.pool.close(ctx)

	s.closewg.Wait()
	atomic.StoreInt64(&s.state, serverDisconnected)

	return nil
}

// Connection gets a connection to the server.
func (s *Server) Connection(ctx context.Context) (driver.Connection, error) {
	if atomic.LoadInt64(&s.state) != serverConnected {
		return nil, ErrServerClosed
	}

	// Increment the operation count before calling checkOut to make sure that all connection
	// requests are included in the operation count, including those in the wait queue. If we got an
	// error instead of a connection, immediately decrement the operation count.
	atomic.AddInt64(&s.operationCount, 1)
	conn, err := s.pool.checkOut(ctx)
	if err != nil {
		atomic.AddInt64(&s.operationCount, -1)
		return nil, err
	}

	return &Connection{
		connection: conn,
		cleanupServerFn: func() {
			// Decrement the operation count whenever the caller is done with the connection. Note
			// that cleanupServerFn() is not called while the connection is pinned to a cursor or
			// transaction, so the operation count is not decremented until the cursor is closed or
			// the transaction is committed or aborted. Use an int64 instead of a uint64 to mitigate
			// the impact of any possible bugs that could cause the uint64 to underflow, which would
			// make the server much less selectable.
			atomic.AddInt64(&s.operationCount, -1)
		},
	}, nil
}

// ProcessHandshakeError implements SDAM error handling for errors that occur before a connection
// finishes handshaking.
func (s *Server) ProcessHandshakeError(err error, startingGenerationNumber uint64, serviceID *primitive.ObjectID) {
	// Ignore the error if the server is behind a load balancer but the service ID is unknown. This indicates that the
	// error happened when dialing the connection or during the MongoDB handshake, so we don't know the service ID to
	// use for clearing the pool.
	if err == nil || s.cfg.loadBalanced && serviceID == nil {
		return
	}
	// Ignore the error if the connection is stale.
	if startingGenerationNumber < s.pool.generation.getGeneration(serviceID) {
		return
	}

	// Unwrap any connection errors. If there is no wrapped connection error, then the error should
	// not result in any Server state change (e.g. a command error from the database).
	wrappedConnErr := unwrapConnectionError(err)
	if wrappedConnErr == nil {
		return
	}

	// Must hold the processErrorLock while updating the server description and clearing the pool.
	// Not holding the lock leads to possible out-of-order processing of pool.clear() and
	// pool.ready() calls from concurrent server description updates.
	s.processErrorLock.Lock()
	defer s.processErrorLock.Unlock()

	// Since the only kind of ConnectionError we receive from pool.Get will be an initialization error, we should set
	// the description.Server appropriately. The description should not have a TopologyVersion because the staleness
	// checking logic above has already determined that this description is not stale.
	s.updateDescription(description.NewServerFromError(s.address, wrappedConnErr, nil))
	s.pool.clear(err, serviceID)
	s.cancelCheck()
}

// Description returns a description of the server as of the last heartbeat.
func (s *Server) Description() description.Server {
	return s.desc.Load().(description.Server)
}

// SelectedDescription returns a description.SelectedServer with a Kind of
// Single. This can be used when performing tasks like monitoring a batch
// of servers and you want to run one-off commands against those servers.
func (s *Server) SelectedDescription() description.SelectedServer {
	sdesc := s.Description()
	return description.SelectedServer{
		Server: sdesc,
		Kind:   description.Single,
	}
}

// Subscribe returns a ServerSubscription which has a channel on which all
// updated server descriptions will be sent. The channel will have a buffer
// size of one, and will be pre-populated with the current description.
func (s *Server) Subscribe() (*ServerSubscription, error) {
	if atomic.LoadInt64(&s.state) != serverConnected {
		return nil, ErrSubscribeAfterClosed
	}
	ch := make(chan description.Server, 1)
	ch <- s.desc.Load().(description.Server)

	s.subLock.Lock()
	defer s.subLock.Unlock()
	if s.subscriptionsClosed {
		return nil, ErrSubscribeAfterClosed
	}
	id := s.currentSubscriberID
	s.subscribers[id] = ch
	s.currentSubscriberID++

	ss := &ServerSubscription{
		C:  ch,
		s:  s,
		id: id,
	}

	return ss, nil
}

// RequestImmediateCheck will cause the server to send a heartbeat immediately
// instead of waiting for the heartbeat timeout.
func (s *Server) RequestImmediateCheck() {
	select {
	case s.checkNow <- struct{}{}:
	default:
	}
}

// getWriteConcernErrorForProcessing extracts a driver.WriteConcernError from the provided error. This function returns
// (error, true) if the error is a WriteConcernError and it falls under the requirements for SDAM error
// handling and (nil, false) otherwise.
func getWriteConcernErrorForProcessing(err error) (*driver.WriteConcernError, bool) {
	writeCmdErr, ok := err.(driver.WriteCommandError)
	if !ok {
		return nil, false
	}

	wcerr := writeCmdErr.WriteConcernError
	if wcerr != nil && (wcerr.NodeIsRecovering() || wcerr.NotPrimary()) {
		return wcerr, true
	}
	return nil, false
}

// ProcessError handles SDAM error handling and implements driver.ErrorProcessor.
func (s *Server) ProcessError(err error, conn driver.Connection) driver.ProcessErrorResult {
	// Ignore nil errors.
	if err == nil {
		return driver.NoChange
	}

	// Ignore errors from stale connections because the error came from a previous generation of the
	// connection pool. The root cause of the error has already been handled, which is what caused
	// the pool generation to increment. Processing errors for stale connections could result in
	// handling the same error root cause multiple times (e.g. a temporary network interrupt causing
	// all connections to the same server to return errors).
	if conn.Stale() {
		return driver.NoChange
	}

	// Must hold the processErrorLock while updating the server description and clearing the pool.
	// Not holding the lock leads to possible out-of-order processing of pool.clear() and
	// pool.ready() calls from concurrent server description updates.
	s.processErrorLock.Lock()
	defer s.processErrorLock.Unlock()

	// Get the wire version and service ID from the connection description because they will never
	// change for the lifetime of a connection and can possibly be different between connections to
	// the same server.
	connDesc := conn.Description()
	wireVersion := connDesc.WireVersion
	serviceID := connDesc.ServiceID

	// Get the topology version from the Server description because the Server description is
	// updated by heartbeats and errors, so typically has a more up-to-date topology version.
	serverDesc := s.desc.Load().(description.Server)
	topologyVersion := serverDesc.TopologyVersion

	// We don't currently update the Server topology version when we create new application
	// connections, so it's possible for a connection's topology version to be newer than the
	// Server's topology version. Pick the "newest" of the two topology versions.
	// Technically a nil topology version on a new database response should be considered a new
	// topology version and replace the Server's topology version. However, we don't know if the
	// connection's topology version is based on a new or old database response, so we ignore a nil
	// topology version on the connection for now.
	//
	// TODO(GODRIVER-2841): Remove this logic once we set the Server description when we create
	// application connections because then the Server's topology version will always be the
	// latest known.
	if tv := connDesc.TopologyVersion; tv != nil && topologyVersion.CompareToIncoming(tv) < 0 {
		topologyVersion = tv
	}

	// Invalidate the server description if a "not primary" or "node is recovering" error occurs.
	// These errors can be reported as a command error or a write concern error.
	if cerr, ok := err.(driver.Error); ok && (cerr.NodeIsRecovering() || cerr.NotPrimary()) {
		// Ignore errors that came from when the database was on a previous topology version.
		if topologyVersion.CompareToIncoming(cerr.TopologyVersion) >= 0 {
			return driver.NoChange
		}

		// updates description to unknown
		s.updateDescription(description.NewServerFromError(s.address, err, cerr.TopologyVersion))
		s.RequestImmediateCheck()

		res := driver.ServerMarkedUnknown
		// If the node is shutting down or is older than 4.2, we synchronously clear the pool.
		if cerr.NodeIsShuttingDown() || wireVersion == nil || wireVersion.Max < wireVersion42 {
			res = driver.ConnectionPoolCleared
			s.pool.clear(err, serviceID)
		}

		return res
	}
	if wcerr, ok := getWriteConcernErrorForProcessing(err); ok {
		// Ignore errors that came from when the database was on a previous topology version.
		if topologyVersion.CompareToIncoming(wcerr.TopologyVersion) >= 0 {
			return driver.NoChange
		}

		// updates description to unknown
		s.updateDescription(description.NewServerFromError(s.address, err, wcerr.TopologyVersion))
		s.RequestImmediateCheck()

		res := driver.ServerMarkedUnknown
		// If the node is shutting down or is older than 4.2, we synchronously clear the pool.
		if wcerr.NodeIsShuttingDown() || wireVersion == nil || wireVersion.Max < wireVersion42 {
			res = driver.ConnectionPoolCleared
			s.pool.clear(err, serviceID)
		}
		return res
	}

	wrappedConnErr := unwrapConnectionError(err)
	if wrappedConnErr == nil {
		return driver.NoChange
	}

	// Ignore transient timeout errors.
	if netErr, ok := wrappedConnErr.(net.Error); ok && netErr.Timeout() {
		return driver.NoChange
	}
	if wrappedConnErr == context.Canceled || wrappedConnErr == context.DeadlineExceeded {
		return driver.NoChange
	}

	// For a non-timeout network error, we clear the pool, set the description to Unknown, and cancel the in-progress
	// monitoring check. The check is cancelled last to avoid a post-cancellation reconnect racing with
	// updateDescription.
	s.updateDescription(description.NewServerFromError(s.address, err, nil))
	s.pool.clear(err, serviceID)
	s.cancelCheck()
	return driver.ConnectionPoolCleared
}

// update handles performing heartbeats and updating any subscribers of the
// newest description.Server retrieved.
func (s *Server) update() {
	defer s.closewg.Done()
	heartbeatTicker := time.NewTicker(s.cfg.heartbeatInterval)
	rateLimiter := time.NewTicker(minHeartbeatInterval)
	defer heartbeatTicker.Stop()
	defer rateLimiter.Stop()
	checkNow := s.checkNow
	done := s.done

	defer func() {
		_ = recover()
	}()

	closeServer := func() {
		s.subLock.Lock()
		for id, c := range s.subscribers {
			close(c)
			delete(s.subscribers, id)
		}
		s.subscriptionsClosed = true
		s.subLock.Unlock()

		// We don't need to take s.heartbeatLock here because closeServer is called synchronously when the select checks
		// below detect that the server is being closed, so we can be sure that the connection isn't being used.
		if s.conn != nil {
			_ = s.conn.close()
		}
	}

	waitUntilNextCheck := func() {
		// Wait until heartbeatFrequency elapses, an application operation requests an immediate check, or the server
		// is disconnecting.
		select {
		case <-heartbeatTicker.C:
		case <-checkNow:
		case <-done:
			// Return because the next update iteration will check the done channel again and clean up.
			return
		}

		// Ensure we only return if minHeartbeatFrequency has elapsed or the server is disconnecting.
		select {
		case <-rateLimiter.C:
		case <-done:
			return
		}
	}

	timeoutCnt := 0
	for {
		// Check if the server is disconnecting. Even if waitUntilNextCheck has already read from the done channel, we
		// can safely read from it again because Disconnect closes the channel.
		select {
		case <-done:
			closeServer()
			return
		default:
		}

		previousDescription := s.Description()

		// Perform the next check.
		desc, err := s.check()
		if err == errCheckCancelled {
			if atomic.LoadInt64(&s.state) != serverConnected {
				continue
			}

			// If the server is not disconnecting, the check was cancelled by an application operation after an error.
			// Wait before running the next check.
			waitUntilNextCheck()
			continue
		}

		if isShortcut := func() bool {
			// Must hold the processErrorLock while updating the server description and clearing the
			// pool. Not holding the lock leads to possible out-of-order processing of pool.clear() and
			// pool.ready() calls from concurrent server description updates.
			s.processErrorLock.Lock()
			defer s.processErrorLock.Unlock()

			s.updateDescription(desc)
			// Retry after the first timeout before clearing the pool in case of a FAAS pause as
			// described in GODRIVER-2577.
			if err := unwrapConnectionError(desc.LastError); err != nil && timeoutCnt < 1 {
				if err == context.Canceled || err == context.DeadlineExceeded {
					timeoutCnt++
					// We want to immediately retry on timeout error. Continue to next loop.
					return true
				}
				if err, ok := err.(net.Error); ok && err.Timeout() {
					timeoutCnt++
					// We want to immediately retry on timeout error. Continue to next loop.
					return true
				}
			}
			if err := desc.LastError; err != nil {
				// Clear the pool once the description has been updated to Unknown. Pass in a nil service ID to clear
				// because the monitoring routine only runs for non-load balanced deployments in which servers don't return
				// IDs.
				s.pool.clear(err, nil)
			}
			// We're either not handling a timeout error, or we just handled the 2nd consecutive
			// timeout error.
			// In either case, reset the timeout count to 0 and return false to
			// continue the normal check process.
			timeoutCnt = 0
			return false
		}(); isShortcut {
			continue
		}

		// If the server supports streaming or we're already streaming, we want to move to streaming the next response
		// without waiting. If the server has transitioned to Unknown from a network error, we want to do another
		// check without waiting in case it was a transient error and the server isn't actually down.
		serverSupportsStreaming := desc.Kind != description.Unknown && desc.TopologyVersion != nil
		connectionIsStreaming := s.conn != nil && s.conn.getCurrentlyStreaming()
		transitionedFromNetworkError := desc.LastError != nil && unwrapConnectionError(desc.LastError) != nil &&
			previousDescription.Kind != description.Unknown

		if serverSupportsStreaming || connectionIsStreaming || transitionedFromNetworkError {
			continue
		}

		// The server either does not support the streamable protocol or is not in a healthy state, so we wait until
		// the next check.
		waitUntilNextCheck()
	}
}

// updateDescription handles updating the description on the Server, notifying
// subscribers, and potentially draining the connection pool.
func (s *Server) updateDescription(desc description.Server) {
	if s.cfg.loadBalanced {
		// In load balanced mode, there are no updates from the monitoring routine. For errors encountered in pooled
		// connections, the server should not be marked Unknown to ensure that the LB remains selectable.
		return
	}

	defer func() {
		// ¯\_(ツ)_/¯
		_ = recover()
	}()

	// Anytime we update the server description to something other than "unknown", set the pool to
	// "ready". Do this before updating the description so that connections can be checked out as
	// soon as the server is selectable. If the pool is already ready, this operation is a no-op.
	// Note that this behavior is roughly consistent with the current Go driver behavior (connects
	// to all servers, even non-data-bearing nodes) but deviates slightly from the CMAP spec, which
	// specifies a more restricted set of server descriptions and topologies that should mark the
	// pool ready. We don't have access to the topology here, so prefer the current Go driver
	// behavior for simplicity.
	if desc.Kind != description.Unknown {
		_ = s.pool.ready()
	}

	// Use the updateTopologyCallback to update the parent Topology and get the description that should be stored.
	callback, ok := s.updateTopologyCallback.Load().(updateTopologyCallback)
	if ok && callback != nil {
		desc = callback(desc)
	}
	s.desc.Store(desc)

	s.subLock.Lock()
	for _, c := range s.subscribers {
		select {
		// Drain the channel if it isn't empty.
		case <-c:
		default:
		}
		c <- desc
	}
	s.subLock.Unlock()
}

// createConnection creates a new connection instance but does not call connect on it. The caller must call connect
// before the connection can be used for network operations.
func (s *Server) createConnection() *connection {
	opts := copyConnectionOpts(s.cfg.connectionOpts)
	opts = append(opts,
		WithConnectTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		WithReadTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		WithWriteTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		// We override whatever handshaker is currently attached to the options with a basic
		// one because we need to make sure we don't do auth.
		WithHandshaker(func(h Handshaker) Handshaker {
			return operation.NewHello().AppName(s.cfg.appname).Compressors(s.cfg.compressionOpts).
				ServerAPI(s.cfg.serverAPI)
		}),
		// Override any monitors specified in options with nil to avoid monitoring heartbeats.
		WithMonitor(func(*event.CommandMonitor) *event.CommandMonitor { return nil }),
	)

	return newConnection(s.address, opts...)
}

func copyConnectionOpts(opts []ConnectionOption) []ConnectionOption {
	optsCopy := make([]ConnectionOption, len(opts))
	copy(optsCopy, opts)
	return optsCopy
}

func (s *Server) setupHeartbeatConnection() error {
	conn := s.createConnection()

	// Take the lock when assigning the context and connection because they're accessed by cancelCheck.
	s.heartbeatLock.Lock()
	if s.heartbeatCtxCancel != nil {
		// Ensure the previous context is cancelled to avoid a leak.
		s.heartbeatCtxCancel()
	}
	s.heartbeatCtx, s.heartbeatCtxCancel = context.WithCancel(s.globalCtx)
	s.conn = conn
	s.heartbeatLock.Unlock()

	return s.conn.connect(s.heartbeatCtx)
}

// cancelCheck cancels in-progress connection dials and reads. It does not set any fields on the server.
func (s *Server) cancelCheck() {
	var conn *connection

	// Take heartbeatLock for mutual exclusion with the checks in the update function.
	s.heartbeatLock.Lock()
	if s.heartbeatCtx != nil {
		s.heartbeatCtxCancel()
	}
	conn = s.conn
	s.heartbeatLock.Unlock()

	if conn == nil {
		return
	}

	// If the connection exists, we need to wait for it to be connected because conn.connect() and
	// conn.close() cannot be called concurrently. If the connection wasn't successfully opened, its
	// state was set back to disconnected, so calling conn.close() will be a no-op.
	conn.closeConnectContext()
	conn.wait()
	_ = conn.close()
}

func (s *Server) checkWasCancelled() bool {
	return s.heartbeatCtx.Err() != nil
}

func (s *Server) createBaseOperation(conn driver.Connection) *operation.Hello {
	return operation.
		NewHello().
		ClusterClock(s.cfg.clock).
		Deployment(driver.SingleConnectionDeployment{conn}).
		ServerAPI(s.cfg.serverAPI)
}

func (s *Server) check() (description.Server, error) {
	var descPtr *description.Server
	var err error
	var duration time.Duration

	start := time.Now()
	if s.conn == nil || s.conn.closed() || s.checkWasCancelled() {
		// Create a new connection if this is the first check, the connection was closed after an error during the previous
		// check, or the previous check was cancelled.
		if s.conn != nil {
			s.publishServerHeartbeatStartedEvent(s.conn.ID(), false)
		}
		// Create a new connection and add its handshake RTT as a sample.
		err = s.setupHeartbeatConnection()
		duration = time.Since(start)
		if err == nil {
			// Use the description from the connection handshake as the value for this check.
			s.rttMonitor.addSample(s.conn.helloRTT)
			descPtr = &s.conn.desc
			if s.conn != nil {
				s.publishServerHeartbeatSucceededEvent(s.conn.ID(), duration, s.conn.desc, false)
			}
		} else {
			err = unwrapConnectionError(err)
			if s.conn != nil {
				s.publishServerHeartbeatFailedEvent(s.conn.ID(), duration, err, false)
			}
		}
	} else {
		// An existing connection is being used. Use the server description properties to execute the right heartbeat.

		// Wrap conn in a type that implements driver.StreamerConnection.
		heartbeatConn := initConnection{s.conn}
		baseOperation := s.createBaseOperation(heartbeatConn)
		previousDescription := s.Description()
		streamable := previousDescription.TopologyVersion != nil

		s.publishServerHeartbeatStartedEvent(s.conn.ID(), s.conn.getCurrentlyStreaming() || streamable)
		switch {
		case s.conn.getCurrentlyStreaming():
			// The connection is already in a streaming state, so we stream the next response.
			err = baseOperation.StreamResponse(s.heartbeatCtx, heartbeatConn)
		case streamable:
			// The server supports the streamable protocol. Set the socket timeout to
			// connectTimeoutMS+heartbeatFrequencyMS and execute an awaitable hello request. Set conn.canStream so
			// the wire message will advertise streaming support to the server.

			// Calculation for maxAwaitTimeMS is taken from time.Duration.Milliseconds (added in Go 1.13).
			maxAwaitTimeMS := int64(s.cfg.heartbeatInterval) / 1e6
			// If connectTimeoutMS=0, the socket timeout should be infinite. Otherwise, it is connectTimeoutMS +
			// heartbeatFrequencyMS to account for the fact that the query will block for heartbeatFrequencyMS
			// server-side.
			socketTimeout := s.cfg.heartbeatTimeout
			if socketTimeout != 0 {
				socketTimeout += s.cfg.heartbeatInterval
			}
			s.conn.setSocketTimeout(socketTimeout)
			baseOperation = baseOperation.TopologyVersion(previousDescription.TopologyVersion).
				MaxAwaitTimeMS(maxAwaitTimeMS)
			s.conn.setCanStream(true)
			err = baseOperation.Execute(s.heartbeatCtx)
		default:
			// The server doesn't support the awaitable protocol. Set the socket timeout to connectTimeoutMS and
			// execute a regular heartbeat without any additional parameters.

			s.conn.setSocketTimeout(s.cfg.heartbeatTimeout)
			err = baseOperation.Execute(s.heartbeatCtx)
		}
		duration = time.Since(start)

		if err == nil {
			tempDesc := baseOperation.Result(s.address)
			descPtr = &tempDesc
			s.publishServerHeartbeatSucceededEvent(s.conn.ID(), duration, tempDesc, s.conn.getCurrentlyStreaming() || streamable)
		} else {
			// Close the connection here rather than below so we ensure we're not closing a connection that wasn't
			// successfully created.
			if s.conn != nil {
				_ = s.conn.close()
			}
			s.publishServerHeartbeatFailedEvent(s.conn.ID(), duration, err, s.conn.getCurrentlyStreaming() || streamable)
		}
	}

	if descPtr != nil {
		// The check was successful. Set the average RTT and return.
		desc := *descPtr
		desc = desc.SetAverageRTT(s.rttMonitor.EWMA())
		desc.HeartbeatInterval = s.cfg.heartbeatInterval
		return desc, nil
	}

	if s.checkWasCancelled() {
		// If the previous check was cancelled, we don't want to clear the pool.
		// Return a sentinel error so the caller will know that an actual error didn't occur.
		return emptyDescription, errCheckCancelled
	}

	// An error occurred. We reset the RTT monitor for all errors and return an Unknown description. The pool must also
	// be cleared, but only after the description has already been updated, so that is handled by the caller.
	topologyVersion := extractTopologyVersion(err)
	s.rttMonitor.reset()
	return description.NewServerFromError(s.address, err, topologyVersion), nil
}

func extractTopologyVersion(err error) *description.TopologyVersion {
	if ce, ok := err.(ConnectionError); ok {
		err = ce.Wrapped
	}

	switch converted := err.(type) {
	case driver.Error:
		return converted.TopologyVersion
	case driver.WriteCommandError:
		if converted.WriteConcernError != nil {
			return converted.WriteConcernError.TopologyVersion
		}
	}

	return nil
}

// RTTMonitor returns this server's round-trip-time monitor.
func (s *Server) RTTMonitor() driver.RTTMonitor {
	return s.rttMonitor
}

// OperationCount returns the current number of in-progress operations for this server.
func (s *Server) OperationCount() int64 {
	return atomic.LoadInt64(&s.operationCount)
}

// String implements the Stringer interface.
func (s *Server) String() string {
	desc := s.Description()
	state := atomic.LoadInt64(&s.state)
	str := fmt.Sprintf("Addr: %s, Type: %s, State: %s",
		s.address, desc.Kind, serverStateString(state))
	if len(desc.Tags) != 0 {
		str += fmt.Sprintf(", Tag sets: %s", desc.Tags)
	}
	if state == serverConnected {
		str += fmt.Sprintf(", Average RTT: %s, Min RTT: %s", desc.AverageRTT, s.RTTMonitor().Min())
	}
	if desc.LastError != nil {
		str += fmt.Sprintf(", Last error: %s", desc.LastError)
	}

	return str
}

// ServerSubscription represents a subscription to the description.Server updates for
// a specific server.
type ServerSubscription struct {
	C  <-chan description.Server
	s  *Server
	id uint64
}

// Unsubscribe unsubscribes this ServerSubscription from updates and closes the
// subscription channel.
func (ss *ServerSubscription) Unsubscribe() error {
	ss.s.subLock.Lock()
	defer ss.s.subLock.Unlock()
	if ss.s.subscriptionsClosed {
		return nil
	}

	ch, ok := ss.s.subscribers[ss.id]
	if !ok {
		return nil
	}

	close(ch)
	delete(ss.s.subscribers, ss.id)

	return nil
}

// publishes a ServerOpeningEvent to indicate the server is being initialized
func (s *Server) publishServerOpeningEvent(addr address.Address) {
	if s == nil {
		return
	}

	serverOpening := &event.ServerOpeningEvent{
		Address:    addr,
		TopologyID: s.topologyID,
	}

	if s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerOpening != nil {
		s.cfg.serverMonitor.ServerOpening(serverOpening)
	}
}

// publishes a ServerHeartbeatStartedEvent to indicate a hello command has started
func (s *Server) publishServerHeartbeatStartedEvent(connectionID string, await bool) {
	serverHeartbeatStarted := &event.ServerHeartbeatStartedEvent{
		ConnectionID: connectionID,
		Awaited:      await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatStarted != nil {
		s.cfg.serverMonitor.ServerHeartbeatStarted(serverHeartbeatStarted)
	}
}

// publishes a ServerHeartbeatSucceededEvent to indicate hello has succeeded
func (s *Server) publishServerHeartbeatSucceededEvent(connectionID string,
	duration time.Duration,
	desc description.Server,
	await bool,
) {
	serverHeartbeatSucceeded := &event.ServerHeartbeatSucceededEvent{
		DurationNanos: duration.Nanoseconds(),
		Duration:      duration,
		Reply:         desc,
		ConnectionID:  connectionID,
		Awaited:       await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatSucceeded != nil {
		s.cfg.serverMonitor.ServerHeartbeatSucceeded(serverHeartbeatSucceeded)
	}
}

// publishes a ServerHeartbeatFailedEvent to indicate hello has failed
func (s *Server) publishServerHeartbeatFailedEvent(connectionID string,
	duration time.Duration,
	err error,
	await bool,
) {
	serverHeartbeatFailed := &event.ServerHeartbeatFailedEvent{
		DurationNanos: duration.Nanoseconds(),
		Duration:      duration,
		Failure:       err,
		ConnectionID:  connectionID,
		Awaited:       await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatFailed != nil {
		s.cfg.serverMonitor.ServerHeartbeatFailed(serverHeartbeatFailed)
	}
}

// unwrapConnectionError returns the connection error wrapped by err, or nil if err does not wrap a connection error.
func unwrapConnectionError(err error) error {
	// This is essentially an implementation of errors.As to unwrap this error until we get a ConnectionError and then
	// return ConnectionError.Wrapped.

	connErr, ok := err.(ConnectionError)
	if ok {
		return connErr.Wrapped
	}

	driverErr, ok := err.(driver.Error)
	if !ok || !driverErr.NetworkError() {
		return nil
	}

	connErr, ok = driverErr.Wrapped.(ConnectionError)
	if ok {
		return connErr.Wrapped
	}

	return nil
}
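
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original driver source): a minimal
// example of how a parent Topology is expected to drive this type -- create a
// Server with ConnectServer, check a connection out for an operation, report
// any operation error back through ProcessError, and shut the Server down.
// The function name exampleUseServer and the no-op update callback are
// hypothetical.
// ---------------------------------------------------------------------------
func exampleUseServer(ctx context.Context, addr address.Address, topologyID primitive.ObjectID) error {
	// A real Topology passes a callback that merges the new description into its
	// own state; this sketch just returns the description unchanged.
	noopCallback := func(desc description.Server) description.Server { return desc }

	srv, err := ConnectServer(addr, noopCallback, topologyID)
	if err != nil {
		return err
	}
	// Disconnect closes the pool and stops the monitoring goroutine.
	defer func() { _ = srv.Disconnect(ctx) }()

	// Connection increments the operation count and checks a connection out of the pool.
	conn, err := srv.Connection(ctx)
	if err != nil {
		return err
	}
	defer func() { _ = conn.Close() }()

	// ... run an operation on conn; on failure, feed the error back into SDAM:
	// _ = srv.ProcessError(opErr, conn)

	return nil
}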
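
// Illustrative sketch (not part of the original driver source): consuming
// server description updates via Subscribe. The subscription channel has a
// buffer of one and is pre-populated with the current description, so the
// first receive never blocks. The function name exampleWatchDescriptions is
// hypothetical.
func exampleWatchDescriptions(ctx context.Context, srv *Server) error {
	sub, err := srv.Subscribe()
	if err != nil {
		return err
	}
	defer func() { _ = sub.Unsubscribe() }()

	for {
		select {
		case desc, ok := <-sub.C:
			if !ok {
				// The Server closed all subscriptions during Disconnect.
				return nil
			}
			_ = desc // e.g. feed the new description into server selection
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}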
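
// Illustrative sketch (not part of the original driver source): the read-only
// accessors a server selector might consult when ranking this server. The
// function name exampleInspectServer is hypothetical.
func exampleInspectServer(srv *Server) string {
	desc := srv.Description()        // description as of the last heartbeat
	inFlight := srv.OperationCount() // in-progress operations, used for least-loaded selection
	minRTT := srv.RTTMonitor().Min() // minimum observed round-trip time

	return fmt.Sprintf("%s kind=%s ops=%d minRTT=%s", srv, desc.Kind, inFlight, minRTT)
}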