github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/apicaller/connect.go (about) 1 // Copyright 2012-2015 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package apicaller 5 6 import ( 7 "time" 8 9 "github.com/juju/clock" 10 "github.com/juju/errors" 11 "github.com/juju/names/v5" 12 "github.com/juju/retry" 13 "github.com/juju/utils/v3" 14 15 "github.com/juju/juju/agent" 16 "github.com/juju/juju/api" 17 apiagent "github.com/juju/juju/api/agent/agent" 18 apiservererrors "github.com/juju/juju/apiserver/errors" 19 "github.com/juju/juju/rpc/params" 20 ) 21 22 var ( 23 // checkProvisionedStrategy defines the evil uninterruptible 24 // retry strategy for "handling" ErrNotProvisioned. It exists 25 // in the name of stability; as the code evolves, it would be 26 // great to see its function moved up a level or two. 27 checkProvisionedStrategy = retry.CallArgs{ 28 Clock: clock.WallClock, 29 MaxDuration: 10 * time.Minute, 30 Delay: 5 * time.Second, 31 } 32 33 // newConnFacade should similarly move up a level so it can 34 // be explicitly configured without export_test hackery 35 newConnFacade = apiagent.NewConnFacade 36 37 // errAgentEntityDead is an internal error returned by getEntity. 38 errAgentEntityDead = errors.New("agent entity is dead") 39 40 // ErrConnectImpossible indicates that we can contact an apiserver 41 // but have no hope of authenticating a connection with it. 42 ErrConnectImpossible = errors.New("connection permanently impossible") 43 44 // ErrChangedPassword indicates that the agent config used to connect 45 // has been updated with a new password, and you should try again. 46 ErrChangedPassword = errors.New("insecure password replaced; retry") 47 ) 48 49 // OnlyConnect logs into the API using the supplied agent's credentials. 50 func OnlyConnect(a agent.Agent, apiOpen api.OpenFunc, logger Logger) (api.Connection, error) { 51 agentConfig := a.CurrentConfig() 52 info, ok := agentConfig.APIInfo() 53 if !ok { 54 return nil, errors.New("API info not available") 55 } 56 conn, _, err := connectFallback(apiOpen, info, agentConfig.OldPassword(), logger) 57 if err != nil { 58 return nil, errors.Trace(err) 59 } 60 return conn, nil 61 } 62 63 // connectFallback opens an API connection using the supplied info, 64 // or a copy using the fallbackPassword; blocks for up to 5 minutes 65 // if it encounters a CodeNotProvisioned error, periodically retrying; 66 // and eventually, having either succeeded, failed, or timed out, returns: 67 // 68 // - (if successful) the connection, and whether the fallback was used 69 // - (otherwise) whatever error it most recently encountered 70 // 71 // It's clear that it still has machine-agent concerns still baked in, 72 // but there's no obvious practical path to separating those entirely at 73 // the moment. 74 // 75 // (The right answer is probably to treat CodeNotProvisioned as a normal 76 // error and depend on (currently nonexistent) exponential backoff in 77 // the framework: either it'll work soon enough, or the controller will 78 // spot the error and nuke the machine anyway. No harm leaving the local 79 // agent running and occasionally polling for changes -- it won't do much 80 // until it's managed to log in, and any suicide-cutoff point we pick here 81 // will be objectively bad in some circumstances.) 82 func connectFallback( 83 apiOpen api.OpenFunc, info *api.Info, fallbackPassword string, logger Logger, 84 ) ( 85 conn api.Connection, didFallback bool, err error, 86 ) { 87 // We expect to assign to `conn`, `err`, *and* `info` in 88 // the course of this operation: wrapping this repeated 89 // atom in a func currently seems to be less treacherous 90 // than the alternatives. 91 var tryConnect = func() { 92 conn, err = apiOpen(info, api.DialOpts{ 93 // The DialTimeout is for connecting to the underlying 94 // socket. We use three seconds because it should be fast 95 // but it is possible to add a manual machine to a distant 96 // controller such that the round trip time could be as high 97 // as 500ms. 98 DialTimeout: 3 * time.Second, 99 // The delay between connecting to a different controller. Setting this to 0 means we try all controllers 100 // simultaneously. We set it to approximately how long the TLS handshake takes, to avoid doing TLS 101 // handshakes to a controller that we are going to end up ignoring. 102 DialAddressInterval: 200 * time.Millisecond, 103 // The timeout is for the complete login handshake. 104 // If the server is rate limiting, it will normally pause 105 // before responding to the login request, but the pause is 106 // in the realm of five to ten seconds. 107 Timeout: time.Minute, 108 }) 109 } 110 111 didFallback = info.Password == "" 112 // Try to connect, trying both the primary and fallback 113 // passwords if necessary; and update info, and remember 114 // which password we used. 115 if !didFallback { 116 logger.Debugf("connecting with current password") 117 tryConnect() 118 if params.IsCodeUnauthorized(err) || errors.Cause(err) == apiservererrors.ErrBadCreds { 119 didFallback = true 120 121 } 122 } 123 if didFallback { 124 // We've perhaps used the wrong password, so 125 // try again with the fallback password. 126 infoCopy := *info 127 info = &infoCopy 128 info.Password = fallbackPassword 129 logger.Debugf("connecting with old password") 130 tryConnect() 131 } 132 133 // We might be a machine agent that's started before its 134 // provisioner has had a chance to report instance data 135 // to the machine; wait a fair while to ensure we really 136 // are in the (expected rare) provisioner-crash situation 137 // that would cause permanent CodeNotProvisioned (which 138 // indicates that the controller has forgotten about us, 139 // and is provisioning a new instance, so we really should 140 // uninstall). 141 // 142 // Yes, it's dumb that this can't be interrupted, and that 143 // it's not configurable without patching. 144 if params.IsCodeNotProvisioned(err) { 145 retryStrategy := checkProvisionedStrategy 146 retryStrategy.IsFatalError = func(err error) bool { return !params.IsCodeNotProvisioned(err) } 147 retryStrategy.Func = func() error { 148 tryConnect() 149 return err 150 } 151 err = retry.Call(retryStrategy) 152 if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) { 153 err = retry.LastError(err) 154 } 155 } 156 157 // At this point we've run out of reasons to retry connecting, 158 // and just go with whatever error we last saw (if any). 159 if err != nil { 160 logger.Debugf("[%s] failed to connect", shortModelUUID(info.ModelTag)) 161 return nil, false, errors.Trace(err) 162 } 163 logger.Infof("[%s] %q successfully connected to %q", 164 shortModelUUID(info.ModelTag), 165 info.Tag.String(), 166 conn.Addr()) 167 return conn, didFallback, nil 168 } 169 170 func shortModelUUID(model names.ModelTag) string { 171 uuid := model.Id() 172 if names.IsValidModel(uuid) { 173 return model.ShortId() 174 } 175 return uuid 176 } 177 178 // ScaryConnect logs into the API using the supplied agent's credentials, 179 // like OnlyConnect; and then: 180 // 181 // - returns ErrConnectImpossible if the agent entity is dead or 182 // unauthorized for all known passwords; 183 // - replaces insecure credentials with freshly (locally) generated ones 184 // (and returns ErrPasswordChanged, expecting to be reinvoked); 185 // - unconditionally resets the remote-state password to its current value 186 // (for what seems like a bad reason). 187 // 188 // This is clearly a mess but at least now it's a documented and localized 189 // mess; it should be used only when making the primary API connection for 190 // a machine or unit agent running in its own process. 191 func ScaryConnect(a agent.Agent, apiOpen api.OpenFunc, logger Logger) (_ api.Connection, err error) { 192 agentConfig := a.CurrentConfig() 193 info, ok := agentConfig.APIInfo() 194 if !ok { 195 return nil, errors.New("API info not available") 196 } 197 oldPassword := agentConfig.OldPassword() 198 199 defer func() { 200 cause := errors.Cause(err) 201 switch { 202 case cause == apiagent.ErrDenied: 203 case cause == errAgentEntityDead: 204 case params.IsCodeUnauthorized(cause): 205 case params.IsCodeNotProvisioned(cause): 206 default: 207 return 208 } 209 logger.Errorf("Failed to connect to controller: %v", err) 210 err = ErrConnectImpossible 211 }() 212 213 // Start connection... 214 conn, usedOldPassword, err := connectFallback(apiOpen, info, oldPassword, logger) 215 if err != nil { 216 return nil, errors.Trace(err) 217 } 218 219 // ...and make sure we close it if anything goes wrong. 220 defer func() { 221 if err != nil { 222 if err := conn.Close(); err != nil { 223 logger.Errorf("while closing API connection: %v", err) 224 } 225 } 226 }() 227 228 // newConnFacade is patched out in export_test, because exhaustion. 229 // proper config/params struct would be better. 230 facade, err := newConnFacade(conn) 231 if err != nil { 232 return nil, errors.Trace(err) 233 } 234 235 // First of all, see if we're dead or removed, which will render 236 // any further work pointless. 237 entity := agentConfig.Tag() 238 life, err := facade.Life(entity) 239 if err != nil { 240 return nil, errors.Trace(err) 241 } 242 switch life { 243 case apiagent.Alive, apiagent.Dying: 244 case apiagent.Dead: 245 return nil, errAgentEntityDead 246 default: 247 return nil, errors.Errorf("unknown life value %q", life) 248 } 249 250 // If we need to change the password, it's far cleaner to 251 // exit with ErrChangedPassword and depend on the framework 252 // for expeditious retry than it is to mess around with those 253 // responsibilities in here. 254 if usedOldPassword { 255 logger.Debugf("changing password...") 256 err := changePassword(oldPassword, a, facade) 257 if err != nil { 258 return nil, errors.Trace(err) 259 } 260 logger.Infof("[%s] password changed for %q", 261 shortModelUUID(agentConfig.Model()), entity.String()) 262 return nil, ErrChangedPassword 263 } 264 265 // If we *didn't* need to change the password, we apparently need 266 // to reset our password to its current value anyway. Reportedly, 267 // a machine agent promoted to controller status might have bad 268 // auth data in mongodb, and this "fixes" it... but this is scary, 269 // wrong, coincidental duct tape. The RTTD is to make controller- 270 // promotion work correctly in the first place. 271 // 272 // Still, can't fix everything at once. 273 if err := facade.SetPassword(entity, info.Password); err != nil { 274 return nil, errors.Annotate(err, "can't reset agent password") 275 } 276 return conn, nil 277 } 278 279 // changePassword generates a new random password and records it in 280 // local agent configuration and on the remote state server. The supplied 281 // oldPassword -- which must be the current valid password -- is set as a 282 // fallback in local config, in case we fail to update the remote password. 283 func changePassword(oldPassword string, a agent.Agent, facade apiagent.ConnFacade) error { 284 newPassword, err := utils.RandomPassword() 285 if err != nil { 286 return errors.Trace(err) 287 } 288 if err := a.ChangeConfig(func(c agent.ConfigSetter) error { 289 c.SetPassword(newPassword) 290 c.SetOldPassword(oldPassword) 291 return nil 292 }); err != nil { 293 return errors.Trace(err) 294 } 295 // This has to happen *after* we record the old/new passwords 296 // locally, lest we change it remotely, crash suddenly, and 297 // end up locked out forever. 298 return facade.SetPassword(a.CurrentConfig().Tag(), newPassword) 299 } 300 301 // NewExternalControllerConnectionFunc returns a function returning an 302 // api connection to a controller with the specified api info. 303 type NewExternalControllerConnectionFunc func(*api.Info) (api.Connection, error) 304 305 // NewExternalControllerConnection returns an api connection to a controller 306 // with the specified api info. 307 func NewExternalControllerConnection(apiInfo *api.Info) (api.Connection, error) { 308 return api.Open(apiInfo, api.DialOpts{ 309 Timeout: 2 * time.Second, 310 RetryDelay: 500 * time.Millisecond, 311 }) 312 }