github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/apicaller/connect.go (about)

     1  // Copyright 2012-2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package apicaller
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"github.com/juju/utils"
    11  	"gopkg.in/juju/names.v2"
    12  
    13  	"github.com/juju/juju/agent"
    14  	"github.com/juju/juju/api"
    15  	apiagent "github.com/juju/juju/api/agent"
    16  	"github.com/juju/juju/apiserver/common"
    17  	"github.com/juju/juju/apiserver/params"
    18  )
    19  
    20  var (
    21  	// checkProvisionedStrategy defines the evil uninterruptible
    22  	// retry strategy for "handling" ErrNotProvisioned. It exists
    23  	// in the name of stability; as the code evolves, it would be
    24  	// great to see its function moved up a level or two.
    25  	//
    26  	// TODO(katco): 2016-08-09: lp:1611427
    27  	checkProvisionedStrategy = utils.AttemptStrategy{
    28  		Total: 10 * time.Minute,
    29  		Delay: 5 * time.Second,
    30  	}
    31  
    32  	// newConnFacade should similarly move up a level so it can
    33  	// be explicitly configured without export_test hackery
    34  	newConnFacade = apiagent.NewConnFacade
    35  
    36  	// errAgentEntityDead is an internal error returned by getEntity.
    37  	errAgentEntityDead = errors.New("agent entity is dead")
    38  
    39  	// ErrConnectImpossible indicates that we can contact an apiserver
    40  	// but have no hope of authenticating a connection with it.
    41  	ErrConnectImpossible = errors.New("connection permanently impossible")
    42  
    43  	// ErrChangedPassword indicates that the agent config used to connect
    44  	// has been updated with a new password, and you should try again.
    45  	ErrChangedPassword = errors.New("insecure password replaced; retry")
    46  )
    47  
    48  // OnlyConnect logs into the API using the supplied agent's credentials.
    49  func OnlyConnect(a agent.Agent, apiOpen api.OpenFunc) (api.Connection, error) {
    50  	agentConfig := a.CurrentConfig()
    51  	info, ok := agentConfig.APIInfo()
    52  	if !ok {
    53  		return nil, errors.New("API info not available")
    54  	}
    55  	conn, _, err := connectFallback(apiOpen, info, agentConfig.OldPassword())
    56  	if err != nil {
    57  		return nil, errors.Trace(err)
    58  	}
    59  	return conn, nil
    60  }
    61  
    62  // connectFallback opens an API connection using the supplied info,
    63  // or a copy using the fallbackPassword; blocks for up to 5 minutes
    64  // if it encounters a CodeNotProvisioned error, periodically retrying;
    65  // and eventually, having either succeeded, failed, or timed out, returns:
    66  //
    67  //   * (if successful) the connection, and whether the fallback was used
    68  //   * (otherwise) whatever error it most recently encountered
    69  //
    70  // It's clear that it still has machine-agent concerns still baked in,
    71  // but there's no obvious practical path to separating those entirely at
    72  // the moment.
    73  //
    74  // (The right answer is probably to treat CodeNotProvisioned as a normal
    75  // error and depend on (currently nonexistent) exponential backoff in
    76  // the framework: either it'll work soon enough, or the controller will
    77  // spot the error and nuke the machine anyway. No harm leaving the local
    78  // agent running and occasionally polling for changes -- it won't do much
    79  // until it's managed to log in, and any suicide-cutoff point we pick here
    80  // will be objectively bad in some circumstances.)
    81  func connectFallback(
    82  	apiOpen api.OpenFunc, info *api.Info, fallbackPassword string,
    83  ) (
    84  	conn api.Connection, didFallback bool, err error,
    85  ) {
    86  	// We expect to assign to `conn`, `err`, *and* `info` in
    87  	// the course of this operation: wrapping this repeated
    88  	// atom in a func currently seems to be less treacherous
    89  	// than the alternatives.
    90  	var tryConnect = func() {
    91  		conn, err = apiOpen(info, api.DialOpts{
    92  			// The DialTimeout is for connecting to the underlying
    93  			// socket. We use three seconds because it should be fast
    94  			// but it is possible to add a manual machine to a distant
    95  			// controller such that the round trip time could be as high
    96  			// as 500ms.
    97  			DialTimeout: 3 * time.Second,
    98  			// The delay between connecting to a different controller. Setting this to 0 means we try all controllers
    99  			// simultaneously. We set it to approximately how long the TLS handshake takes, to avoid doing TLS
   100  			// handshakes to a controller that we are going to end up ignoring.
   101  			DialAddressInterval: 200 * time.Millisecond,
   102  			// The timeout is for the complete login handshake.
   103  			// If the server is rate limiting, it will normally pause
   104  			// before responding to the login request, but the pause is
   105  			// in the realm of five to ten seconds.
   106  			Timeout: time.Minute,
   107  		})
   108  	}
   109  
   110  	didFallback = info.Password == ""
   111  	// Try to connect, trying both the primary and fallback
   112  	// passwords if necessary; and update info, and remember
   113  	// which password we used.
   114  	if !didFallback {
   115  		logger.Debugf("connecting with current password")
   116  		tryConnect()
   117  		if params.IsCodeUnauthorized(err) || errors.Cause(err) == common.ErrBadCreds {
   118  			didFallback = true
   119  
   120  		}
   121  	}
   122  	if didFallback {
   123  		// We've perhaps used the wrong password, so
   124  		// try again with the fallback password.
   125  		infoCopy := *info
   126  		info = &infoCopy
   127  		info.Password = fallbackPassword
   128  		logger.Debugf("connecting with old password")
   129  		tryConnect()
   130  	}
   131  
   132  	// We might be a machine agent that's started before its
   133  	// provisioner has had a chance to report instance data
   134  	// to the machine; wait a fair while to ensure we really
   135  	// are in the (expected rare) provisioner-crash situation
   136  	// that would cause permanent CodeNotProvisioned (which
   137  	// indicates that the controller has forgotten about us,
   138  	// and is provisioning a new instance, so we really should
   139  	// uninstall).
   140  	//
   141  	// Yes, it's dumb that this can't be interrupted, and that
   142  	// it's not configurable without patching.
   143  	if params.IsCodeNotProvisioned(err) {
   144  		for a := checkProvisionedStrategy.Start(); a.Next(); {
   145  			tryConnect()
   146  			if !params.IsCodeNotProvisioned(err) {
   147  				break
   148  			}
   149  		}
   150  	}
   151  
   152  	// At this point we've run out of reasons to retry connecting,
   153  	// and just go with whatever error we last saw (if any).
   154  	if err != nil {
   155  		logger.Debugf("[%s] failed to connect", shortModelUUID(info.ModelTag))
   156  		return nil, false, errors.Trace(err)
   157  	}
   158  	logger.Infof("[%s] %q successfully connected to %q",
   159  		shortModelUUID(info.ModelTag),
   160  		info.Tag.String(),
   161  		conn.Addr())
   162  	return conn, didFallback, nil
   163  }
   164  
   165  func shortModelUUID(model names.ModelTag) string {
   166  	uuid := model.Id()
   167  	if len(uuid) > 6 {
   168  		return uuid[:6]
   169  	}
   170  	return uuid
   171  }
   172  
   173  // ScaryConnect logs into the API using the supplied agent's credentials,
   174  // like OnlyConnect; and then:
   175  //
   176  //   * returns ErrConnectImpossible if the agent entity is dead or
   177  //     unauthorized for all known passwords;
   178  //   * replaces insecure credentials with freshly (locally) generated ones
   179  //     (and returns ErrPasswordChanged, expecting to be reinvoked);
   180  //   * unconditionally resets the remote-state password to its current value
   181  //     (for what seems like a bad reason).
   182  //
   183  // This is clearly a mess but at least now it's a documented and localized
   184  // mess; it should be used only when making the primary API connection for
   185  // a machine or unit agent running in its own process.
   186  func ScaryConnect(a agent.Agent, apiOpen api.OpenFunc) (_ api.Connection, err error) {
   187  	agentConfig := a.CurrentConfig()
   188  	info, ok := agentConfig.APIInfo()
   189  	if !ok {
   190  		return nil, errors.New("API info not available")
   191  	}
   192  	oldPassword := agentConfig.OldPassword()
   193  
   194  	defer func() {
   195  		cause := errors.Cause(err)
   196  		switch {
   197  		case cause == apiagent.ErrDenied:
   198  		case cause == errAgentEntityDead:
   199  		case params.IsCodeUnauthorized(cause):
   200  		case params.IsCodeNotProvisioned(cause):
   201  		default:
   202  			return
   203  		}
   204  		err = ErrConnectImpossible
   205  	}()
   206  
   207  	// Start connection...
   208  	conn, usedOldPassword, err := connectFallback(apiOpen, info, oldPassword)
   209  	if err != nil {
   210  		return nil, errors.Trace(err)
   211  	}
   212  
   213  	// ...and make sure we close it if anything goes wrong.
   214  	defer func() {
   215  		if err != nil {
   216  			if err := conn.Close(); err != nil {
   217  				logger.Errorf("while closing API connection: %v", err)
   218  			}
   219  		}
   220  	}()
   221  
   222  	// newConnFacade is patched out in export_test, because exhaustion.
   223  	// proper config/params struct would be better.
   224  	facade, err := newConnFacade(conn)
   225  	if err != nil {
   226  		return nil, errors.Trace(err)
   227  	}
   228  
   229  	// First of all, see if we're dead or removed, which will render
   230  	// any further work pointless.
   231  	entity := agentConfig.Tag()
   232  	life, err := facade.Life(entity)
   233  	if err != nil {
   234  		return nil, errors.Trace(err)
   235  	}
   236  	switch life {
   237  	case apiagent.Alive, apiagent.Dying:
   238  	case apiagent.Dead:
   239  		return nil, errAgentEntityDead
   240  	default:
   241  		return nil, errors.Errorf("unknown life value %q", life)
   242  	}
   243  
   244  	// If we need to change the password, it's far cleaner to
   245  	// exit with ErrChangedPassword and depend on the framework
   246  	// for expeditious retry than it is to mess around with those
   247  	// responsibilities in here.
   248  	if usedOldPassword {
   249  		logger.Debugf("changing password...")
   250  		err := changePassword(oldPassword, a, facade)
   251  		if err != nil {
   252  			return nil, errors.Trace(err)
   253  		}
   254  		logger.Infof("[%s] password changed for %q",
   255  			shortModelUUID(agentConfig.Model()), entity.String())
   256  		return nil, ErrChangedPassword
   257  	}
   258  
   259  	// If we *didn't* need to change the password, we apparently need
   260  	// to reset our password to its current value anyway. Reportedly,
   261  	// a machine agent promoted to controller status might have bad
   262  	// auth data in mongodb, and this "fixes" it... but this is scary,
   263  	// wrong, coincidental duct tape. The RTTD is to make controller-
   264  	// promotion work correctly in the first place.
   265  	//
   266  	// Still, can't fix everything at once.
   267  	if err := facade.SetPassword(entity, info.Password); err != nil {
   268  		return nil, errors.Annotate(err, "can't reset agent password")
   269  	}
   270  	return conn, nil
   271  }
   272  
   273  // changePassword generates a new random password and records it in
   274  // local agent configuration and on the remote state server. The supplied
   275  // oldPassword -- which must be the current valid password -- is set as a
   276  // fallback in local config, in case we fail to update the remote password.
   277  func changePassword(oldPassword string, a agent.Agent, facade apiagent.ConnFacade) error {
   278  	newPassword, err := utils.RandomPassword()
   279  	if err != nil {
   280  		return errors.Trace(err)
   281  	}
   282  	if err := a.ChangeConfig(func(c agent.ConfigSetter) error {
   283  		c.SetPassword(newPassword)
   284  		c.SetOldPassword(oldPassword)
   285  		return nil
   286  	}); err != nil {
   287  		return err
   288  	}
   289  	// This has to happen *after* we record the old/new passwords
   290  	// locally, lest we change it remotely, crash suddenly, and
   291  	// end up locked out forever.
   292  	return facade.SetPassword(a.CurrentConfig().Tag(), newPassword)
   293  }
   294  
// NewExternalControllerConnectionFunc is the type of a function that
// returns an api connection to a controller with the specified api info.
type NewExternalControllerConnectionFunc func(*api.Info) (api.Connection, error)
   298  
   299  // NewExternalControllerConnection returns an api connection to a controller
   300  // with the specified api info.
   301  func NewExternalControllerConnection(apiInfo *api.Info) (api.Connection, error) {
   302  	return api.Open(apiInfo, api.DialOpts{
   303  		Timeout:    2 * time.Second,
   304  		RetryDelay: 500 * time.Millisecond,
   305  	})
   306  }