github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/apicaller/connect.go (about)

     1  // Copyright 2012-2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package apicaller
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/clock"
    10  	"github.com/juju/errors"
    11  	"github.com/juju/names/v5"
    12  	"github.com/juju/retry"
    13  	"github.com/juju/utils/v3"
    14  
    15  	"github.com/juju/juju/agent"
    16  	"github.com/juju/juju/api"
    17  	apiagent "github.com/juju/juju/api/agent/agent"
    18  	apiservererrors "github.com/juju/juju/apiserver/errors"
    19  	"github.com/juju/juju/rpc/params"
    20  )
    21  
    22  var (
    23  	// checkProvisionedStrategy defines the evil uninterruptible
    24  	// retry strategy for "handling" ErrNotProvisioned. It exists
    25  	// in the name of stability; as the code evolves, it would be
    26  	// great to see its function moved up a level or two.
    27  	checkProvisionedStrategy = retry.CallArgs{
    28  		Clock:       clock.WallClock,
    29  		MaxDuration: 10 * time.Minute,
    30  		Delay:       5 * time.Second,
    31  	}
    32  
    33  	// newConnFacade should similarly move up a level so it can
    34  	// be explicitly configured without export_test hackery
    35  	newConnFacade = apiagent.NewConnFacade
    36  
    37  	// errAgentEntityDead is an internal error returned by getEntity.
    38  	errAgentEntityDead = errors.New("agent entity is dead")
    39  
    40  	// ErrConnectImpossible indicates that we can contact an apiserver
    41  	// but have no hope of authenticating a connection with it.
    42  	ErrConnectImpossible = errors.New("connection permanently impossible")
    43  
    44  	// ErrChangedPassword indicates that the agent config used to connect
    45  	// has been updated with a new password, and you should try again.
    46  	ErrChangedPassword = errors.New("insecure password replaced; retry")
    47  )
    48  
    49  // OnlyConnect logs into the API using the supplied agent's credentials.
    50  func OnlyConnect(a agent.Agent, apiOpen api.OpenFunc, logger Logger) (api.Connection, error) {
    51  	agentConfig := a.CurrentConfig()
    52  	info, ok := agentConfig.APIInfo()
    53  	if !ok {
    54  		return nil, errors.New("API info not available")
    55  	}
    56  	conn, _, err := connectFallback(apiOpen, info, agentConfig.OldPassword(), logger)
    57  	if err != nil {
    58  		return nil, errors.Trace(err)
    59  	}
    60  	return conn, nil
    61  }
    62  
    63  // connectFallback opens an API connection using the supplied info,
    64  // or a copy using the fallbackPassword; blocks for up to 5 minutes
    65  // if it encounters a CodeNotProvisioned error, periodically retrying;
    66  // and eventually, having either succeeded, failed, or timed out, returns:
    67  //
    68  //   - (if successful) the connection, and whether the fallback was used
    69  //   - (otherwise) whatever error it most recently encountered
    70  //
    71  // It's clear that it still has machine-agent concerns still baked in,
    72  // but there's no obvious practical path to separating those entirely at
    73  // the moment.
    74  //
    75  // (The right answer is probably to treat CodeNotProvisioned as a normal
    76  // error and depend on (currently nonexistent) exponential backoff in
    77  // the framework: either it'll work soon enough, or the controller will
    78  // spot the error and nuke the machine anyway. No harm leaving the local
    79  // agent running and occasionally polling for changes -- it won't do much
    80  // until it's managed to log in, and any suicide-cutoff point we pick here
    81  // will be objectively bad in some circumstances.)
    82  func connectFallback(
    83  	apiOpen api.OpenFunc, info *api.Info, fallbackPassword string, logger Logger,
    84  ) (
    85  	conn api.Connection, didFallback bool, err error,
    86  ) {
    87  	// We expect to assign to `conn`, `err`, *and* `info` in
    88  	// the course of this operation: wrapping this repeated
    89  	// atom in a func currently seems to be less treacherous
    90  	// than the alternatives.
    91  	var tryConnect = func() {
    92  		conn, err = apiOpen(info, api.DialOpts{
    93  			// The DialTimeout is for connecting to the underlying
    94  			// socket. We use three seconds because it should be fast
    95  			// but it is possible to add a manual machine to a distant
    96  			// controller such that the round trip time could be as high
    97  			// as 500ms.
    98  			DialTimeout: 3 * time.Second,
    99  			// The delay between connecting to a different controller. Setting this to 0 means we try all controllers
   100  			// simultaneously. We set it to approximately how long the TLS handshake takes, to avoid doing TLS
   101  			// handshakes to a controller that we are going to end up ignoring.
   102  			DialAddressInterval: 200 * time.Millisecond,
   103  			// The timeout is for the complete login handshake.
   104  			// If the server is rate limiting, it will normally pause
   105  			// before responding to the login request, but the pause is
   106  			// in the realm of five to ten seconds.
   107  			Timeout: time.Minute,
   108  		})
   109  	}
   110  
   111  	didFallback = info.Password == ""
   112  	// Try to connect, trying both the primary and fallback
   113  	// passwords if necessary; and update info, and remember
   114  	// which password we used.
   115  	if !didFallback {
   116  		logger.Debugf("connecting with current password")
   117  		tryConnect()
   118  		if params.IsCodeUnauthorized(err) || errors.Cause(err) == apiservererrors.ErrBadCreds {
   119  			didFallback = true
   120  
   121  		}
   122  	}
   123  	if didFallback {
   124  		// We've perhaps used the wrong password, so
   125  		// try again with the fallback password.
   126  		infoCopy := *info
   127  		info = &infoCopy
   128  		info.Password = fallbackPassword
   129  		logger.Debugf("connecting with old password")
   130  		tryConnect()
   131  	}
   132  
   133  	// We might be a machine agent that's started before its
   134  	// provisioner has had a chance to report instance data
   135  	// to the machine; wait a fair while to ensure we really
   136  	// are in the (expected rare) provisioner-crash situation
   137  	// that would cause permanent CodeNotProvisioned (which
   138  	// indicates that the controller has forgotten about us,
   139  	// and is provisioning a new instance, so we really should
   140  	// uninstall).
   141  	//
   142  	// Yes, it's dumb that this can't be interrupted, and that
   143  	// it's not configurable without patching.
   144  	if params.IsCodeNotProvisioned(err) {
   145  		retryStrategy := checkProvisionedStrategy
   146  		retryStrategy.IsFatalError = func(err error) bool { return !params.IsCodeNotProvisioned(err) }
   147  		retryStrategy.Func = func() error {
   148  			tryConnect()
   149  			return err
   150  		}
   151  		err = retry.Call(retryStrategy)
   152  		if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
   153  			err = retry.LastError(err)
   154  		}
   155  	}
   156  
   157  	// At this point we've run out of reasons to retry connecting,
   158  	// and just go with whatever error we last saw (if any).
   159  	if err != nil {
   160  		logger.Debugf("[%s] failed to connect", shortModelUUID(info.ModelTag))
   161  		return nil, false, errors.Trace(err)
   162  	}
   163  	logger.Infof("[%s] %q successfully connected to %q",
   164  		shortModelUUID(info.ModelTag),
   165  		info.Tag.String(),
   166  		conn.Addr())
   167  	return conn, didFallback, nil
   168  }
   169  
   170  func shortModelUUID(model names.ModelTag) string {
   171  	uuid := model.Id()
   172  	if names.IsValidModel(uuid) {
   173  		return model.ShortId()
   174  	}
   175  	return uuid
   176  }
   177  
   178  // ScaryConnect logs into the API using the supplied agent's credentials,
   179  // like OnlyConnect; and then:
   180  //
   181  //   - returns ErrConnectImpossible if the agent entity is dead or
   182  //     unauthorized for all known passwords;
   183  //   - replaces insecure credentials with freshly (locally) generated ones
   184  //     (and returns ErrPasswordChanged, expecting to be reinvoked);
   185  //   - unconditionally resets the remote-state password to its current value
   186  //     (for what seems like a bad reason).
   187  //
   188  // This is clearly a mess but at least now it's a documented and localized
   189  // mess; it should be used only when making the primary API connection for
   190  // a machine or unit agent running in its own process.
   191  func ScaryConnect(a agent.Agent, apiOpen api.OpenFunc, logger Logger) (_ api.Connection, err error) {
   192  	agentConfig := a.CurrentConfig()
   193  	info, ok := agentConfig.APIInfo()
   194  	if !ok {
   195  		return nil, errors.New("API info not available")
   196  	}
   197  	oldPassword := agentConfig.OldPassword()
   198  
   199  	defer func() {
   200  		cause := errors.Cause(err)
   201  		switch {
   202  		case cause == apiagent.ErrDenied:
   203  		case cause == errAgentEntityDead:
   204  		case params.IsCodeUnauthorized(cause):
   205  		case params.IsCodeNotProvisioned(cause):
   206  		default:
   207  			return
   208  		}
   209  		logger.Errorf("Failed to connect to controller: %v", err)
   210  		err = ErrConnectImpossible
   211  	}()
   212  
   213  	// Start connection...
   214  	conn, usedOldPassword, err := connectFallback(apiOpen, info, oldPassword, logger)
   215  	if err != nil {
   216  		return nil, errors.Trace(err)
   217  	}
   218  
   219  	// ...and make sure we close it if anything goes wrong.
   220  	defer func() {
   221  		if err != nil {
   222  			if err := conn.Close(); err != nil {
   223  				logger.Errorf("while closing API connection: %v", err)
   224  			}
   225  		}
   226  	}()
   227  
   228  	// newConnFacade is patched out in export_test, because exhaustion.
   229  	// proper config/params struct would be better.
   230  	facade, err := newConnFacade(conn)
   231  	if err != nil {
   232  		return nil, errors.Trace(err)
   233  	}
   234  
   235  	// First of all, see if we're dead or removed, which will render
   236  	// any further work pointless.
   237  	entity := agentConfig.Tag()
   238  	life, err := facade.Life(entity)
   239  	if err != nil {
   240  		return nil, errors.Trace(err)
   241  	}
   242  	switch life {
   243  	case apiagent.Alive, apiagent.Dying:
   244  	case apiagent.Dead:
   245  		return nil, errAgentEntityDead
   246  	default:
   247  		return nil, errors.Errorf("unknown life value %q", life)
   248  	}
   249  
   250  	// If we need to change the password, it's far cleaner to
   251  	// exit with ErrChangedPassword and depend on the framework
   252  	// for expeditious retry than it is to mess around with those
   253  	// responsibilities in here.
   254  	if usedOldPassword {
   255  		logger.Debugf("changing password...")
   256  		err := changePassword(oldPassword, a, facade)
   257  		if err != nil {
   258  			return nil, errors.Trace(err)
   259  		}
   260  		logger.Infof("[%s] password changed for %q",
   261  			shortModelUUID(agentConfig.Model()), entity.String())
   262  		return nil, ErrChangedPassword
   263  	}
   264  
   265  	// If we *didn't* need to change the password, we apparently need
   266  	// to reset our password to its current value anyway. Reportedly,
   267  	// a machine agent promoted to controller status might have bad
   268  	// auth data in mongodb, and this "fixes" it... but this is scary,
   269  	// wrong, coincidental duct tape. The RTTD is to make controller-
   270  	// promotion work correctly in the first place.
   271  	//
   272  	// Still, can't fix everything at once.
   273  	if err := facade.SetPassword(entity, info.Password); err != nil {
   274  		return nil, errors.Annotate(err, "can't reset agent password")
   275  	}
   276  	return conn, nil
   277  }
   278  
   279  // changePassword generates a new random password and records it in
   280  // local agent configuration and on the remote state server. The supplied
   281  // oldPassword -- which must be the current valid password -- is set as a
   282  // fallback in local config, in case we fail to update the remote password.
   283  func changePassword(oldPassword string, a agent.Agent, facade apiagent.ConnFacade) error {
   284  	newPassword, err := utils.RandomPassword()
   285  	if err != nil {
   286  		return errors.Trace(err)
   287  	}
   288  	if err := a.ChangeConfig(func(c agent.ConfigSetter) error {
   289  		c.SetPassword(newPassword)
   290  		c.SetOldPassword(oldPassword)
   291  		return nil
   292  	}); err != nil {
   293  		return errors.Trace(err)
   294  	}
   295  	// This has to happen *after* we record the old/new passwords
   296  	// locally, lest we change it remotely, crash suddenly, and
   297  	// end up locked out forever.
   298  	return facade.SetPassword(a.CurrentConfig().Tag(), newPassword)
   299  }
   300  
// NewExternalControllerConnectionFunc is the type of a function that
// returns an api connection to a controller with the specified api info.
// NewExternalControllerConnection in this file has this signature.
type NewExternalControllerConnectionFunc func(*api.Info) (api.Connection, error)
   304  
   305  // NewExternalControllerConnection returns an api connection to a controller
   306  // with the specified api info.
   307  func NewExternalControllerConnection(apiInfo *api.Info) (api.Connection, error) {
   308  	return api.Open(apiInfo, api.DialOpts{
   309  		Timeout:    2 * time.Second,
   310  		RetryDelay: 500 * time.Millisecond,
   311  	})
   312  }