github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/plugins/juju-restore/restore.go (about) 1 // Copyright 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package main 5 6 import ( 7 "archive/tar" 8 "bytes" 9 "compress/gzip" 10 "fmt" 11 "io" 12 "io/ioutil" 13 "os" 14 "path" 15 "strconv" 16 "text/template" 17 "time" 18 19 "github.com/juju/cmd" 20 "github.com/juju/errors" 21 "github.com/juju/loggo" 22 "github.com/juju/names" 23 "github.com/juju/utils" 24 goyaml "gopkg.in/yaml.v1" 25 "launchpad.net/gnuflag" 26 27 "github.com/juju/juju/api" 28 "github.com/juju/juju/cmd/envcmd" 29 "github.com/juju/juju/constraints" 30 "github.com/juju/juju/environs" 31 "github.com/juju/juju/environs/bootstrap" 32 "github.com/juju/juju/environs/config" 33 "github.com/juju/juju/environs/configstore" 34 "github.com/juju/juju/instance" 35 "github.com/juju/juju/juju" 36 _ "github.com/juju/juju/provider/all" 37 "github.com/juju/juju/provider/common" 38 "github.com/juju/juju/state/backups" 39 "github.com/juju/juju/utils/ssh" 40 ) 41 42 func main() { 43 Main(os.Args) 44 } 45 46 func Main(args []string) { 47 ctx, err := cmd.DefaultContext() 48 if err != nil { 49 fmt.Fprintf(os.Stderr, "error: %v\n", err) 50 os.Exit(2) 51 } 52 if err := juju.InitJujuHome(); err != nil { 53 fmt.Fprintf(os.Stderr, "error: %s\n", err) 54 os.Exit(2) 55 } 56 os.Exit(cmd.Main(envcmd.Wrap(&restoreCommand{}), ctx, args[1:])) 57 } 58 59 var logger = loggo.GetLogger("juju.plugins.restore") 60 61 const restoreDoc = ` 62 Restore restores a backup created with juju backup 63 by creating a new juju bootstrap instance and arranging 64 it so that the existing instances in the environment 65 talk to it. 66 67 It verifies that the existing bootstrap instance is 68 not running. The given constraints will be used 69 to choose the new instance. 70 ` 71 72 type restoreCommand struct { 73 envcmd.EnvCommandBase 74 Log cmd.Log 75 Constraints constraints.Value 76 backupFile string 77 showDescription bool 78 } 79 80 func (c *restoreCommand) Info() *cmd.Info { 81 return &cmd.Info{ 82 Name: "juju-restore", 83 Purpose: "Restore a backup made with juju backup", 84 Args: "<backupfile.tar.gz>", 85 Doc: restoreDoc, 86 } 87 } 88 89 func (c *restoreCommand) SetFlags(f *gnuflag.FlagSet) { 90 f.Var(constraints.ConstraintsValue{Target: &c.Constraints}, "constraints", "set environment constraints") 91 f.BoolVar(&c.showDescription, "description", false, "show the purpose of this plugin") 92 c.Log.AddFlags(f) 93 } 94 95 func (c *restoreCommand) Init(args []string) error { 96 if c.showDescription { 97 return cmd.CheckEmpty(args) 98 } 99 if len(args) == 0 { 100 return fmt.Errorf("no backup file specified") 101 } 102 c.backupFile = args[0] 103 return cmd.CheckEmpty(args[1:]) 104 } 105 106 var updateBootstrapMachineTemplate = mustParseTemplate(` 107 set -exu 108 109 export LC_ALL=C 110 tar xzf juju-backup.tgz 111 test -d juju-backup 112 113 114 initctl stop jujud-machine-0 115 116 #The code apt-get throws when lock is taken 117 APTOUTPUT=100 118 while [ $APTOUTPUT -gt 0 ] 119 do 120 # We will try to run apt-get and it can fail if other dpkg is in use 121 # the subshell call is not reached by -e so we can have apt-get fail 122 APTOUTPUT=$(apt-get --option=Dpkg::Options::=--force-confold --option=Dpkg::options::=--force-unsafe-io --assume-yes --quiet install mongodb-clients &> /dev/null; echo $?) 123 if [ $APTOUTPUT -gt 0 ] && [ $APTOUTPUT -ne 100 ]; then 124 echo "apt-get failed with an irrecoverable error $APTOUTPUT"; 125 exit 1 126 fi 127 done 128 129 130 131 initctl stop juju-db 132 rm -r /var/lib/juju 133 rm -r /var/log/juju 134 135 tar -C / -xvp -f juju-backup/root.tar 136 mkdir -p /var/lib/juju/db 137 138 # Prefer jujud-mongodb binaries if available 139 export MONGORESTORE=mongorestore 140 if [ -f /usr/lib/juju/bin/mongorestore ]; then 141 export MONGORESTORE=/usr/lib/juju/bin/mongorestore; 142 fi 143 $MONGORESTORE --drop --dbpath /var/lib/juju/db juju-backup/dump 144 145 initctl start juju-db 146 147 mongoAdminEval() { 148 mongo --ssl -u admin -p {{.AgentConfig.Credentials.OldPassword | shquote}} localhost:{{.AgentConfig.StatePort}}/admin --eval "$1" 149 } 150 151 # wait for mongo to come up after starting the juju-db init service. 152 for i in $(seq 1 100) 153 do 154 mongoAdminEval ' ' && break 155 sleep 5 156 done 157 158 # Create a new replicaSet conf and re initiate it 159 mongoAdminEval ' 160 conf = { "_id" : "juju", "version" : 1, "members" : [ { "_id" : 1, "host" : "{{ .PrivateAddress | printf "%s:"}}{{.AgentConfig.StatePort}}" , "tags" : { "juju-machine-id" : "0" } }]} 161 rs.initiate(conf) 162 ' 163 # This looks arbitrary but there is no clear way to determine when replicaset is initiated 164 # and rs.initiate message is "this will take about a minute" so we honour that estimation 165 sleep 60 166 167 # Remove all state machines but 0, to restore HA 168 mongoAdminEval ' 169 db = db.getSiblingDB("juju") 170 db.machines.update({machineid: "0"}, {$set: {instanceid: {{.NewInstanceId | printf "%q" }} } }) 171 db.instanceData.update({_id: "0"}, {$set: {instanceid: {{.NewInstanceId | printf "%q" }} } }) 172 db.machines.remove({machineid: {$ne:"0"}, hasvote: true}) 173 db.stateServers.update({"_id":"e"}, {$set:{"machineids" : [0]}}) 174 db.stateServers.update({"_id":"e"}, {$set:{"votingmachineids" : [0]}}) 175 ' 176 177 # Give time to replset to initiate 178 for i in $(seq 1 20) 179 do 180 mongoAdminEval ' ' && break 181 sleep 5 182 done 183 184 initctl stop juju-db 185 186 # Update the agent.conf for machine-0 with the new addresses 187 cd /var/lib/juju/agents 188 189 # Remove extra state machines from conf 190 REMOVECOUNT=$(grep -Ec "^-.*{{.AgentConfig.ApiPort}}$" /var/lib/juju/agents/machine-0/agent.conf ) 191 awk '/\-.*{{.AgentConfig.ApiPort}}$/{i++}i<1' machine-0/agent.conf > machine-0/agent.conf.new 192 awk -v removecount=$REMOVECOUNT '/\-.*{{.AgentConfig.ApiPort}}$/{i++}i==removecount' machine-0/agent.conf >> machine-0/agent.conf.new 193 mv machine-0/agent.conf.new machine-0/agent.conf 194 195 sed -i.old -r -e "/^(stateaddresses):/{ 196 n 197 s/- .*(:[0-9]+)/- {{.Address}}\1/ 198 }" -e "/^(apiaddresses):/{ 199 n 200 s/- .*(:[0-9]+)/- {{.PrivateAddress}}\1/ 201 }" machine-0/agent.conf 202 203 204 initctl start juju-db 205 initctl start jujud-machine-0 206 `) 207 208 func updateBootstrapMachineScript(instanceId instance.Id, agentConf agentConfig, addr, paddr string) string { 209 return execTemplate(updateBootstrapMachineTemplate, struct { 210 NewInstanceId instance.Id 211 AgentConfig agentConfig 212 Address string 213 PrivateAddress string 214 }{instanceId, agentConf, addr, paddr}) 215 } 216 217 func (c *restoreCommand) Run(ctx *cmd.Context) error { 218 if c.showDescription { 219 fmt.Fprintf(ctx.Stdout, "%s\n", c.Info().Purpose) 220 return nil 221 } 222 if err := c.Log.Start(ctx); err != nil { 223 return err 224 } 225 agentConf, err := extractConfig(c.backupFile) 226 if err != nil { 227 return errors.Annotate(err, "cannot extract configuration from backup file") 228 } 229 progress("extracted credentials from backup file") 230 store, err := configstore.Default() 231 if err != nil { 232 return err 233 } 234 cfg, err := c.Config(store) 235 if err != nil { 236 return err 237 } 238 env, err := rebootstrap(cfg, ctx, c.Constraints) 239 if err != nil { 240 return errors.Annotate(err, "cannot re-bootstrap environment") 241 } 242 progress("connecting to newly bootstrapped instance") 243 var apiState *api.State 244 // The state server backend may not be ready to accept logins so we retry. 245 // We'll do up to 8 retries over 2 minutes to give the server time to come up. 246 // Typically we expect only 1 retry will be needed. 247 attempt := utils.AttemptStrategy{Delay: 15 * time.Second, Min: 8} 248 // While specifying the admin user will work for now, as soon as we allow 249 // the users to have a different initial user name, or they have changed 250 // the password for the admin user, this will fail. 251 owner := names.NewUserTag("admin") 252 for a := attempt.Start(); a.Next(); { 253 apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) 254 if err == nil || errors.Cause(err).Error() != "EOF" { 255 break 256 } 257 progress("bootstrapped instance not ready - attempting to redial") 258 } 259 if err != nil { 260 return errors.Annotate(err, "cannot connect to bootstrap instance") 261 } 262 progress("restoring bootstrap machine") 263 machine0Addr, err := restoreBootstrapMachine(apiState, c.backupFile, agentConf) 264 if err != nil { 265 return errors.Annotate(err, "cannot restore bootstrap machine") 266 } 267 progress("restored bootstrap machine") 268 269 apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) 270 progress("opening state") 271 if err != nil { 272 return errors.Annotate(err, "cannot connect to api server") 273 } 274 progress("updating all machines") 275 if err := updateAllMachines(apiState, machine0Addr); err != nil { 276 return errors.Annotate(err, "cannot update machines") 277 } 278 return nil 279 } 280 281 func progress(f string, a ...interface{}) { 282 fmt.Printf("%s\n", fmt.Sprintf(f, a...)) 283 } 284 285 func rebootstrap(cfg *config.Config, ctx *cmd.Context, cons constraints.Value) (environs.Environ, error) { 286 progress("re-bootstrapping environment") 287 // Turn on safe mode so that the newly bootstrapped instance 288 // will not destroy all the instances it does not know about. 289 cfg, err := cfg.Apply(map[string]interface{}{ 290 "provisioner-safe-mode": true, 291 }) 292 if err != nil { 293 return nil, errors.Annotate(err, "cannot enable provisioner-safe-mode") 294 } 295 env, err := environs.New(cfg) 296 if err != nil { 297 return nil, err 298 } 299 instanceIds, err := env.StateServerInstances() 300 switch errors.Cause(err) { 301 case nil, environs.ErrNoInstances: 302 // Some providers will return a nil error even 303 // if there are no live state server instances. 304 break 305 case environs.ErrNotBootstrapped: 306 return nil, errors.Trace(err) 307 default: 308 return nil, errors.Annotate(err, "cannot determine state server instances") 309 } 310 if len(instanceIds) > 0 { 311 instances, err := env.Instances(instanceIds) 312 switch errors.Cause(err) { 313 case nil, environs.ErrPartialInstances: 314 return nil, fmt.Errorf("old bootstrap instances %q still seems to exist; will not replace", instances) 315 case environs.ErrNoInstances: 316 // No state server instances, so keep running. 317 break 318 default: 319 return nil, errors.Annotate(err, "cannot detect whether old instance is still running") 320 } 321 } 322 // Remove the storage so that we can bootstrap without the provider complaining. 323 if env, ok := env.(environs.EnvironStorage); ok { 324 if err := env.Storage().Remove(common.StateFile); err != nil { 325 return nil, errors.Annotate(err, fmt.Sprintf("cannot remove %q from storage", common.StateFile)) 326 } 327 } 328 329 // TODO If we fail beyond here, then we won't have a state file and 330 // we won't be able to re-run this script because it fails without it. 331 // We could either try to recreate the file if we fail (which is itself 332 // error-prone) or we could provide a --no-check flag to make 333 // it go ahead anyway without the check. 334 335 args := bootstrap.BootstrapParams{Constraints: cons} 336 if err := bootstrap.Bootstrap(envcmd.BootstrapContextNoVerify(ctx), env, args); err != nil { 337 return nil, errors.Annotate(err, "cannot bootstrap new instance") 338 } 339 return env, nil 340 } 341 342 func restoreBootstrapMachine(st *api.State, backupFile string, agentConf agentConfig) (addr string, err error) { 343 client := st.Client() 344 addr, err = client.PublicAddress("0") 345 if err != nil { 346 return "", errors.Annotate(err, "cannot get public address of bootstrap machine") 347 } 348 paddr, err := client.PrivateAddress("0") 349 if err != nil { 350 return "", errors.Annotate(err, "cannot get private address of bootstrap machine") 351 } 352 status, err := client.Status(nil) 353 if err != nil { 354 return "", errors.Annotate(err, "cannot get environment status") 355 } 356 info, ok := status.Machines["0"] 357 if !ok { 358 return "", fmt.Errorf("cannot find bootstrap machine in status") 359 } 360 newInstId := instance.Id(info.InstanceId) 361 362 progress("copying backup file to bootstrap host") 363 if err := sendViaScp(backupFile, addr, "~/juju-backup.tgz"); err != nil { 364 return "", errors.Annotate(err, "cannot copy backup file to bootstrap instance") 365 } 366 progress("updating bootstrap machine") 367 if err := runViaSsh(addr, updateBootstrapMachineScript(newInstId, agentConf, addr, paddr)); err != nil { 368 return "", errors.Annotate(err, "update script failed") 369 } 370 return addr, nil 371 } 372 373 type credentials struct { 374 Tag string 375 Password string 376 OldPassword string 377 } 378 379 type agentConfig struct { 380 Credentials credentials 381 ApiPort string 382 StatePort string 383 } 384 385 func extractMachineID(archive *os.File) (string, error) { 386 paths := backups.NewCanonicalArchivePaths() 387 388 gzr, err := gzip.NewReader(archive) 389 if err != nil { 390 return "", errors.Annotate(err, fmt.Sprintf("cannot unzip %q", archive.Name())) 391 } 392 defer gzr.Close() 393 394 metaFile, err := findFileInTar(gzr, paths.MetadataFile) 395 if errors.IsNotFound(err) { 396 // Older archives don't have a metadata file and always have machine-0. 397 return "0", nil 398 } 399 if err != nil { 400 return "", errors.Trace(err) 401 } 402 meta, err := backups.NewMetadataJSONReader(metaFile) 403 if err != nil { 404 return "", errors.Trace(err) 405 } 406 return meta.Origin.Machine, nil 407 } 408 409 func extractConfig(backupFile string) (agentConfig, error) { 410 f, err := os.Open(backupFile) 411 if err != nil { 412 return agentConfig{}, err 413 } 414 defer f.Close() 415 416 // Extract the machine tag. 417 machineID, err := extractMachineID(f) 418 if err != nil { 419 return agentConfig{}, err 420 } 421 _, err = f.Seek(0, os.SEEK_SET) 422 if err != nil { 423 return agentConfig{}, err 424 } 425 tag := names.NewMachineTag(machineID) 426 427 // Extract the config file. 428 gzr, err := gzip.NewReader(f) 429 if err != nil { 430 return agentConfig{}, errors.Annotate(err, fmt.Sprintf("cannot unzip %q", backupFile)) 431 } 432 defer gzr.Close() 433 outerTar, err := findFileInTar(gzr, "juju-backup/root.tar") 434 if err != nil { 435 return agentConfig{}, err 436 } 437 // TODO(ericsnowcurrently) This should come from an authoritative source. 438 const confFilename = "var/lib/juju/agents/%s/agent.conf" 439 agentConf, err := findFileInTar(outerTar, fmt.Sprintf(confFilename, tag)) 440 if err != nil { 441 return agentConfig{}, err 442 } 443 444 // Extract the config data. 445 data, err := ioutil.ReadAll(agentConf) 446 if err != nil { 447 return agentConfig{}, errors.Annotate(err, "failed to read agent config file") 448 } 449 var conf interface{} 450 if err := goyaml.Unmarshal(data, &conf); err != nil { 451 return agentConfig{}, errors.Annotate(err, "cannot unmarshal agent config file") 452 } 453 m, ok := conf.(map[interface{}]interface{}) 454 if !ok { 455 return agentConfig{}, fmt.Errorf("config file unmarshalled to %T not %T", conf, m) 456 } 457 password, ok := m["statepassword"].(string) 458 if !ok || password == "" { 459 return agentConfig{}, fmt.Errorf("agent password not found in configuration") 460 } 461 oldPassword, ok := m["oldpassword"].(string) 462 if !ok || oldPassword == "" { 463 return agentConfig{}, fmt.Errorf("agent old password not found in configuration") 464 } 465 statePortNum, ok := m["stateport"].(int) 466 if !ok { 467 return agentConfig{}, fmt.Errorf("state port not found in configuration") 468 } 469 470 statePort := strconv.Itoa(statePortNum) 471 apiPortNum, ok := m["apiport"].(int) 472 if !ok { 473 return agentConfig{}, fmt.Errorf("api port not found in configuration") 474 } 475 apiPort := strconv.Itoa(apiPortNum) 476 477 return agentConfig{ 478 Credentials: credentials{ 479 Tag: "machine-0", 480 Password: password, 481 OldPassword: oldPassword, 482 }, 483 StatePort: statePort, 484 ApiPort: apiPort, 485 }, nil 486 } 487 488 func findFileInTar(r io.Reader, name string) (io.Reader, error) { 489 tarr := tar.NewReader(r) 490 for { 491 hdr, err := tarr.Next() 492 if err == io.EOF { 493 return nil, errors.NotFoundf(name) 494 } 495 if err != nil { 496 return nil, errors.Annotatef(err, "while looking for %q", name) 497 } 498 if path.Clean(hdr.Name) == name { 499 return tarr, nil 500 } 501 } 502 } 503 504 var agentAddressTemplate = mustParseTemplate(` 505 set -exu 506 cd /var/lib/juju/agents 507 for agent in * 508 do 509 initctl stop jujud-$agent 510 sed -i.old -r "/^(stateaddresses|apiaddresses):/{ 511 n 512 s/- .*(:[0-9]+)/- {{.Address}}\1/ 513 }" $agent/agent.conf 514 515 # If we're processing a unit agent's directly 516 # and it has some relations, reset 517 # the stored version of all of them to 518 # ensure that any relation hooks will 519 # fire. 520 if [[ $agent = unit-* ]] 521 then 522 find $agent/state/relations -type f -exec sed -i -r 's/change-version: [0-9]+$/change-version: 0/' {} \; 523 fi 524 initctl start jujud-$agent 525 done 526 `) 527 528 // setAgentAddressScript generates an ssh script argument to update state addresses 529 func setAgentAddressScript(stateAddr string) string { 530 return execTemplate(agentAddressTemplate, struct { 531 Address string 532 }{stateAddr}) 533 } 534 535 // updateAllMachines finds all machines and resets the stored state address 536 // in each of them. The address does not include the port. 537 func updateAllMachines(apiState *api.State, stateAddr string) error { 538 client := apiState.Client() 539 status, err := client.Status(nil) 540 if err != nil { 541 return errors.Annotate(err, "cannot get status") 542 } 543 pendingMachineCount := 0 544 done := make(chan error) 545 for _, machineStatus := range status.Machines { 546 // A newly resumed state server requires no updating, and more 547 // than one state server is not yet support by this plugin. 548 if machineStatus.HasVote || machineStatus.WantsVote || machineStatus.Life == "dead" { 549 continue 550 } 551 pendingMachineCount++ 552 machine := machineStatus 553 go func() { 554 err := runMachineUpdate(client, machine.Id, setAgentAddressScript(stateAddr)) 555 if err != nil { 556 logger.Errorf("failed to update machine %s: %v", machine.Id, err) 557 } else { 558 progress("updated machine %s", machine.Id) 559 } 560 done <- err 561 }() 562 } 563 err = nil 564 for ; pendingMachineCount > 0; pendingMachineCount-- { 565 if updateErr := <-done; updateErr != nil && err == nil { 566 err = errors.Annotate(updateErr, "machine update failed") 567 } 568 } 569 return err 570 } 571 572 // runMachineUpdate connects via ssh to the machine and runs the update script 573 func runMachineUpdate(client *api.Client, id string, sshArg string) error { 574 progress("updating machine: %v\n", id) 575 addr, err := client.PublicAddress(id) 576 if err != nil { 577 return errors.Annotate(err, "no public address found") 578 } 579 return runViaSsh(addr, sshArg) 580 } 581 582 func runViaSsh(addr string, script string) error { 583 // This is taken from cmd/juju/ssh.go there is no other clear way to set user 584 userAddr := "ubuntu@" + addr 585 userCmd := ssh.Command(userAddr, []string{"sudo", "-n", "bash", "-c " + utils.ShQuote(script)}, nil) 586 var stderrBuf bytes.Buffer 587 var stdoutBuf bytes.Buffer 588 userCmd.Stderr = &stderrBuf 589 userCmd.Stdout = &stdoutBuf 590 err := userCmd.Run() 591 if err != nil { 592 return errors.Annotate(err, fmt.Sprintf("ssh command failed: (%q)", stderrBuf.String())) 593 } 594 progress("ssh command succedded: %q", stdoutBuf.String()) 595 return nil 596 } 597 598 func sendViaScp(file, host, destFile string) error { 599 err := ssh.Copy([]string{file, "ubuntu@" + host + ":" + destFile}, nil) 600 if err != nil { 601 return errors.Annotate(err, "scp command failed") 602 } 603 return nil 604 } 605 606 func mustParseTemplate(templ string) *template.Template { 607 t := template.New("").Funcs(template.FuncMap{ 608 "shquote": utils.ShQuote, 609 }) 610 return template.Must(t.Parse(templ)) 611 } 612 613 func execTemplate(tmpl *template.Template, data interface{}) string { 614 var buf bytes.Buffer 615 err := tmpl.Execute(&buf, data) 616 if err != nil { 617 panic(errors.Annotate(err, "template error")) 618 } 619 return buf.String() 620 }