github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/cmd/plugins/juju-restore/restore.go (about) 1 // Copyright 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package main 5 6 import ( 7 "archive/tar" 8 "bytes" 9 "compress/gzip" 10 "fmt" 11 "io" 12 "io/ioutil" 13 "os" 14 "path" 15 "strconv" 16 "text/template" 17 "time" 18 19 "github.com/juju/cmd" 20 "github.com/juju/errors" 21 "github.com/juju/loggo" 22 "github.com/juju/names" 23 "github.com/juju/utils" 24 goyaml "gopkg.in/yaml.v1" 25 "launchpad.net/gnuflag" 26 27 "github.com/juju/juju/api" 28 "github.com/juju/juju/cmd/envcmd" 29 "github.com/juju/juju/constraints" 30 "github.com/juju/juju/environs" 31 "github.com/juju/juju/environs/bootstrap" 32 "github.com/juju/juju/environs/config" 33 "github.com/juju/juju/environs/configstore" 34 "github.com/juju/juju/instance" 35 "github.com/juju/juju/juju" 36 _ "github.com/juju/juju/provider/all" 37 "github.com/juju/juju/provider/common" 38 "github.com/juju/juju/state/backups" 39 "github.com/juju/juju/utils/ssh" 40 ) 41 42 func main() { 43 Main(os.Args) 44 } 45 46 func Main(args []string) { 47 ctx, err := cmd.DefaultContext() 48 if err != nil { 49 fmt.Fprintf(os.Stderr, "error: %v\n", err) 50 os.Exit(2) 51 } 52 if err := juju.InitJujuHome(); err != nil { 53 fmt.Fprintf(os.Stderr, "error: %s\n", err) 54 os.Exit(2) 55 } 56 os.Exit(cmd.Main(envcmd.Wrap(&restoreCommand{}), ctx, args[1:])) 57 } 58 59 var logger = loggo.GetLogger("juju.plugins.restore") 60 61 const restoreDoc = ` 62 Restore restores a backup created with juju backup 63 by creating a new juju bootstrap instance and arranging 64 it so that the existing instances in the environment 65 talk to it. 66 67 It verifies that the existing bootstrap instance is 68 not running. The given constraints will be used 69 to choose the new instance. 70 ` 71 72 type restoreCommand struct { 73 envcmd.EnvCommandBase 74 Log cmd.Log 75 Constraints constraints.Value 76 backupFile string 77 showDescription bool 78 } 79 80 func (c *restoreCommand) Info() *cmd.Info { 81 return &cmd.Info{ 82 Name: "juju-restore", 83 Purpose: "Restore a backup made with juju backup", 84 Args: "<backupfile.tar.gz>", 85 Doc: restoreDoc, 86 } 87 } 88 89 func (c *restoreCommand) SetFlags(f *gnuflag.FlagSet) { 90 f.Var(constraints.ConstraintsValue{Target: &c.Constraints}, "constraints", "set environment constraints") 91 f.BoolVar(&c.showDescription, "description", false, "show the purpose of this plugin") 92 c.Log.AddFlags(f) 93 } 94 95 func (c *restoreCommand) Init(args []string) error { 96 if c.showDescription { 97 return cmd.CheckEmpty(args) 98 } 99 if len(args) == 0 { 100 return fmt.Errorf("no backup file specified") 101 } 102 c.backupFile = args[0] 103 return cmd.CheckEmpty(args[1:]) 104 } 105 106 var updateBootstrapMachineTemplate = mustParseTemplate(` 107 set -exu 108 109 export LC_ALL=C 110 tar xzf juju-backup.tgz 111 test -d juju-backup 112 113 114 initctl stop jujud-machine-0 115 116 #The code apt-get throws when lock is taken 117 APTOUTPUT=100 118 while [ $APTOUTPUT -gt 0 ] 119 do 120 # We will try to run apt-get and it can fail if other dpkg is in use 121 # the subshell call is not reached by -e so we can have apt-get fail 122 APTOUTPUT=$(apt-get --option=Dpkg::Options::=--force-confold --option=Dpkg::options::=--force-unsafe-io --assume-yes --quiet install mongodb-clients &> /dev/null; echo $?) 123 if [ $APTOUTPUT -gt 0 ] && [ $APTOUTPUT -ne 100 ]; then 124 echo "apt-get failed with an irrecoverable error $APTOUTPUT"; 125 exit 1 126 fi 127 done 128 129 130 131 initctl stop juju-db 132 rm -r /var/lib/juju 133 rm -r /var/log/juju 134 135 tar -C / -xvp -f juju-backup/root.tar 136 mkdir -p /var/lib/juju/db 137 138 # Prefer jujud-mongodb binaries if available 139 export MONGORESTORE=mongorestore 140 if [ -f /usr/lib/juju/bin/mongorestore ]; then 141 export MONGORESTORE=/usr/lib/juju/bin/mongorestore; 142 fi 143 $MONGORESTORE --drop --dbpath /var/lib/juju/db juju-backup/dump 144 145 initctl start juju-db 146 147 mongoAdminEval() { 148 mongo --ssl -u admin -p {{.AgentConfig.Credentials.OldPassword | shquote}} localhost:{{.AgentConfig.StatePort}}/admin --eval "$1" 149 } 150 151 # wait for mongo to come up after starting the juju-db init service. 152 for i in $(seq 1 100) 153 do 154 mongoAdminEval ' ' && break 155 sleep 5 156 done 157 158 # Create a new replicaSet conf and re initiate it 159 mongoAdminEval ' 160 conf = { "_id" : "juju", "version" : 1, "members" : [ { "_id" : 1, "host" : "{{ .PrivateAddress | printf "%s:"}}{{.AgentConfig.StatePort}}" , "tags" : { "juju-machine-id" : "0" } }]} 161 rs.initiate(conf) 162 ' 163 # This looks arbitrary but there is no clear way to determine when replicaset is initiated 164 # and rs.initiate message is "this will take about a minute" so we honour that estimation 165 sleep 60 166 167 # Remove all state machines but 0, to restore HA 168 mongoAdminEval ' 169 db = db.getSiblingDB("juju") 170 db.machines.update({machineid: "0"}, {$set: {instanceid: {{.NewInstanceId | printf "%q" }} } }) 171 db.machines.update({machineid: "0"}, {$set: {"addresses": ["{{.Address}}"] } }) 172 db.instanceData.update({_id: "0"}, {$set: {instanceid: {{.NewInstanceId | printf "%q" }} } }) 173 db.machines.remove({machineid: {$ne:"0"}, hasvote: true}) 174 db.stateServers.update({"_id":"e"}, {$set:{"machineids" : ["0"]}}) 175 db.stateServers.update({"_id":"e"}, {$set:{"votingmachineids" : ["0"]}}) 176 ' 177 178 # Give time to replset to initiate 179 for i in $(seq 1 20) 180 do 181 mongoAdminEval ' ' && break 182 sleep 5 183 done 184 185 initctl stop juju-db 186 187 # Update the agent.conf for machine-0 with the new addresses 188 cd /var/lib/juju/agents 189 190 # Remove extra state machines from conf 191 REMOVECOUNT=$(grep -Ec "^-.*{{.AgentConfig.ApiPort}}$" /var/lib/juju/agents/machine-0/agent.conf ) 192 awk '/\-.*{{.AgentConfig.ApiPort}}$/{i++}i<1' machine-0/agent.conf > machine-0/agent.conf.new 193 awk -v removecount=$REMOVECOUNT '/\-.*{{.AgentConfig.ApiPort}}$/{i++}i==removecount' machine-0/agent.conf >> machine-0/agent.conf.new 194 mv machine-0/agent.conf.new machine-0/agent.conf 195 196 sed -i.old -r -e "/^(stateaddresses):/{ 197 n 198 s/- .*(:[0-9]+)/- {{.Address}}\1/ 199 }" -e "/^(apiaddresses):/{ 200 n 201 s/- .*(:[0-9]+)/- {{.PrivateAddress}}\1/ 202 }" machine-0/agent.conf 203 204 205 initctl start juju-db 206 initctl start jujud-machine-0 207 `) 208 209 func updateBootstrapMachineScript(instanceId instance.Id, agentConf agentConfig, addr, paddr string) string { 210 return execTemplate(updateBootstrapMachineTemplate, struct { 211 NewInstanceId instance.Id 212 AgentConfig agentConfig 213 Address string 214 PrivateAddress string 215 }{instanceId, agentConf, addr, paddr}) 216 } 217 218 func (c *restoreCommand) Run(ctx *cmd.Context) error { 219 if c.showDescription { 220 fmt.Fprintf(ctx.Stdout, "%s\n", c.Info().Purpose) 221 return nil 222 } 223 if err := c.Log.Start(ctx); err != nil { 224 return err 225 } 226 agentConf, err := extractConfig(c.backupFile) 227 if err != nil { 228 return errors.Annotate(err, "cannot extract configuration from backup file") 229 } 230 progress("extracted credentials from backup file") 231 store, err := configstore.Default() 232 if err != nil { 233 return err 234 } 235 cfg, err := c.Config(store, nil) 236 if err != nil { 237 return err 238 } 239 env, err := rebootstrap(cfg, ctx, c.Constraints) 240 if err != nil { 241 return errors.Annotate(err, "cannot re-bootstrap environment") 242 } 243 progress("connecting to newly bootstrapped instance") 244 var apiState api.Connection 245 // The state server backend may not be ready to accept logins so we retry. 246 // We'll do up to 8 retries over 2 minutes to give the server time to come up. 247 // Typically we expect only 1 retry will be needed. 248 attempt := utils.AttemptStrategy{Delay: 15 * time.Second, Min: 8} 249 // While specifying the admin user will work for now, as soon as we allow 250 // the users to have a different initial user name, or they have changed 251 // the password for the admin user, this will fail. 252 owner := names.NewUserTag("admin") 253 for a := attempt.Start(); a.Next(); { 254 apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) 255 if err == nil || errors.Cause(err).Error() != "EOF" { 256 break 257 } 258 progress("bootstrapped instance not ready - attempting to redial") 259 } 260 if err != nil { 261 return errors.Annotate(err, "cannot connect to bootstrap instance") 262 } 263 progress("restoring bootstrap machine") 264 machine0Addr, err := restoreBootstrapMachine(apiState, c.backupFile, agentConf) 265 if err != nil { 266 return errors.Annotate(err, "cannot restore bootstrap machine") 267 } 268 progress("restored bootstrap machine") 269 270 apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) 271 progress("opening state") 272 if err != nil { 273 return errors.Annotate(err, "cannot connect to api server") 274 } 275 progress("updating all machines") 276 results, err := updateAllMachines(apiState, machine0Addr) 277 if err != nil { 278 return errors.Annotate(err, "cannot update machines") 279 } 280 var message string 281 for _, result := range results { 282 if result.err != nil { 283 message = fmt.Sprintf("Update of machine %q failed: %v", result.machineName, result.err) 284 } else { 285 message = fmt.Sprintf("Succesful update of machine %q", result.machineName) 286 } 287 progress(message) 288 } 289 return nil 290 } 291 292 func progress(f string, a ...interface{}) { 293 fmt.Printf("%s\n", fmt.Sprintf(f, a...)) 294 } 295 296 func rebootstrap(cfg *config.Config, ctx *cmd.Context, cons constraints.Value) (environs.Environ, error) { 297 progress("re-bootstrapping environment") 298 // Turn on safe mode so that the newly bootstrapped instance 299 // will not destroy all the instances it does not know about. 300 cfg, err := cfg.Apply(map[string]interface{}{ 301 "provisioner-safe-mode": true, 302 }) 303 if err != nil { 304 return nil, errors.Annotate(err, "cannot enable provisioner-safe-mode") 305 } 306 env, err := environs.New(cfg) 307 if err != nil { 308 return nil, err 309 } 310 instanceIds, err := env.StateServerInstances() 311 switch errors.Cause(err) { 312 case nil, environs.ErrNoInstances: 313 // Some providers will return a nil error even 314 // if there are no live state server instances. 315 break 316 case environs.ErrNotBootstrapped: 317 return nil, errors.Trace(err) 318 default: 319 return nil, errors.Annotate(err, "cannot determine state server instances") 320 } 321 if len(instanceIds) > 0 { 322 instances, err := env.Instances(instanceIds) 323 switch errors.Cause(err) { 324 case nil, environs.ErrPartialInstances: 325 return nil, fmt.Errorf("old bootstrap instances %q still seems to exist; will not replace", instances) 326 case environs.ErrNoInstances: 327 // No state server instances, so keep running. 328 break 329 default: 330 return nil, errors.Annotate(err, "cannot detect whether old instance is still running") 331 } 332 } 333 // Remove the storage so that we can bootstrap without the provider complaining. 334 if env, ok := env.(environs.EnvironStorage); ok { 335 if err := env.Storage().Remove(common.StateFile); err != nil { 336 return nil, errors.Annotate(err, fmt.Sprintf("cannot remove %q from storage", common.StateFile)) 337 } 338 } 339 340 // TODO If we fail beyond here, then we won't have a state file and 341 // we won't be able to re-run this script because it fails without it. 342 // We could either try to recreate the file if we fail (which is itself 343 // error-prone) or we could provide a --no-check flag to make 344 // it go ahead anyway without the check. 345 346 args := bootstrap.BootstrapParams{Constraints: cons} 347 if err := bootstrap.Bootstrap(envcmd.BootstrapContextNoVerify(ctx), env, args); err != nil { 348 return nil, errors.Annotate(err, "cannot bootstrap new instance") 349 } 350 return env, nil 351 } 352 353 func restoreBootstrapMachine(st api.Connection, backupFile string, agentConf agentConfig) (addr string, err error) { 354 client := st.Client() 355 addr, err = client.PublicAddress("0") 356 if err != nil { 357 return "", errors.Annotate(err, "cannot get public address of bootstrap machine") 358 } 359 paddr, err := client.PrivateAddress("0") 360 if err != nil { 361 return "", errors.Annotate(err, "cannot get private address of bootstrap machine") 362 } 363 status, err := client.Status(nil) 364 if err != nil { 365 return "", errors.Annotate(err, "cannot get environment status") 366 } 367 info, ok := status.Machines["0"] 368 if !ok { 369 return "", fmt.Errorf("cannot find bootstrap machine in status") 370 } 371 newInstId := instance.Id(info.InstanceId) 372 373 progress("copying backup file to bootstrap host") 374 if err := sendViaScp(backupFile, addr, "~/juju-backup.tgz"); err != nil { 375 return "", errors.Annotate(err, "cannot copy backup file to bootstrap instance") 376 } 377 progress("updating bootstrap machine") 378 if err := runViaSsh(addr, updateBootstrapMachineScript(newInstId, agentConf, addr, paddr)); err != nil { 379 return "", errors.Annotate(err, "update script failed") 380 } 381 return addr, nil 382 } 383 384 type credentials struct { 385 Tag string 386 Password string 387 OldPassword string 388 } 389 390 type agentConfig struct { 391 Credentials credentials 392 ApiPort string 393 StatePort string 394 } 395 396 func extractMachineID(archive *os.File) (string, error) { 397 paths := backups.NewCanonicalArchivePaths() 398 399 gzr, err := gzip.NewReader(archive) 400 if err != nil { 401 return "", errors.Annotate(err, fmt.Sprintf("cannot unzip %q", archive.Name())) 402 } 403 defer gzr.Close() 404 405 metaFile, err := findFileInTar(gzr, paths.MetadataFile) 406 if errors.IsNotFound(err) { 407 // Older archives don't have a metadata file and always have machine-0. 408 return "0", nil 409 } 410 if err != nil { 411 return "", errors.Trace(err) 412 } 413 meta, err := backups.NewMetadataJSONReader(metaFile) 414 if err != nil { 415 return "", errors.Trace(err) 416 } 417 return meta.Origin.Machine, nil 418 } 419 420 func extractConfig(backupFile string) (agentConfig, error) { 421 f, err := os.Open(backupFile) 422 if err != nil { 423 return agentConfig{}, err 424 } 425 defer f.Close() 426 427 // Extract the machine tag. 428 machineID, err := extractMachineID(f) 429 if err != nil { 430 return agentConfig{}, err 431 } 432 _, err = f.Seek(0, os.SEEK_SET) 433 if err != nil { 434 return agentConfig{}, err 435 } 436 tag := names.NewMachineTag(machineID) 437 438 // Extract the config file. 439 gzr, err := gzip.NewReader(f) 440 if err != nil { 441 return agentConfig{}, errors.Annotate(err, fmt.Sprintf("cannot unzip %q", backupFile)) 442 } 443 defer gzr.Close() 444 outerTar, err := findFileInTar(gzr, "juju-backup/root.tar") 445 if err != nil { 446 return agentConfig{}, err 447 } 448 // TODO(ericsnowcurrently) This should come from an authoritative source. 449 const confFilename = "var/lib/juju/agents/%s/agent.conf" 450 agentConf, err := findFileInTar(outerTar, fmt.Sprintf(confFilename, tag)) 451 if err != nil { 452 return agentConfig{}, err 453 } 454 455 // Extract the config data. 456 data, err := ioutil.ReadAll(agentConf) 457 if err != nil { 458 return agentConfig{}, errors.Annotate(err, "failed to read agent config file") 459 } 460 var conf interface{} 461 if err := goyaml.Unmarshal(data, &conf); err != nil { 462 return agentConfig{}, errors.Annotate(err, "cannot unmarshal agent config file") 463 } 464 m, ok := conf.(map[interface{}]interface{}) 465 if !ok { 466 return agentConfig{}, fmt.Errorf("config file unmarshalled to %T not %T", conf, m) 467 } 468 password, ok := m["statepassword"].(string) 469 if !ok || password == "" { 470 return agentConfig{}, fmt.Errorf("agent password not found in configuration") 471 } 472 oldPassword, ok := m["oldpassword"].(string) 473 if !ok || oldPassword == "" { 474 return agentConfig{}, fmt.Errorf("agent old password not found in configuration") 475 } 476 statePortNum, ok := m["stateport"].(int) 477 if !ok { 478 return agentConfig{}, fmt.Errorf("state port not found in configuration") 479 } 480 481 statePort := strconv.Itoa(statePortNum) 482 apiPortNum, ok := m["apiport"].(int) 483 if !ok { 484 return agentConfig{}, fmt.Errorf("api port not found in configuration") 485 } 486 apiPort := strconv.Itoa(apiPortNum) 487 488 return agentConfig{ 489 Credentials: credentials{ 490 Tag: "machine-0", 491 Password: password, 492 OldPassword: oldPassword, 493 }, 494 StatePort: statePort, 495 ApiPort: apiPort, 496 }, nil 497 } 498 499 func findFileInTar(r io.Reader, name string) (io.Reader, error) { 500 tarr := tar.NewReader(r) 501 for { 502 hdr, err := tarr.Next() 503 if err == io.EOF { 504 return nil, errors.NotFoundf(name) 505 } 506 if err != nil { 507 return nil, errors.Annotatef(err, "while looking for %q", name) 508 } 509 if path.Clean(hdr.Name) == name { 510 return tarr, nil 511 } 512 } 513 } 514 515 var agentAddressTemplate = mustParseTemplate(` 516 set -exu 517 cd /var/lib/juju/agents 518 for agent in * 519 do 520 initctl stop jujud-$agent 521 sed -i.old -r "/^(stateaddresses|apiaddresses):/{ 522 n 523 s/- .*(:[0-9]+)/- {{.Address}}\1/ 524 }" $agent/agent.conf 525 526 # If we're processing a unit agent's directly 527 # and it has some relations, reset 528 # the stored version of all of them to 529 # ensure that any relation hooks will 530 # fire. 531 if [[ $agent = unit-* ]] 532 then 533 find $agent/state/relations -type f -exec sed -i -r 's/change-version: [0-9]+$/change-version: 0/' {} \; 534 fi 535 initctl start jujud-$agent 536 done 537 `) 538 539 // setAgentAddressScript generates an ssh script argument to update state addresses 540 func setAgentAddressScript(stateAddr string) string { 541 return execTemplate(agentAddressTemplate, struct { 542 Address string 543 }{stateAddr}) 544 } 545 546 type restoreResult struct { 547 machineName string 548 err error 549 } 550 551 // updateAllMachines finds all machines and resets the stored state address 552 // in each of them. The address does not include the port. 553 func updateAllMachines(apiState api.Connection, stateAddr string) ([]restoreResult, error) { 554 client := apiState.Client() 555 status, err := client.Status(nil) 556 if err != nil { 557 return nil, errors.Annotate(err, "cannot get status") 558 } 559 pendingMachineCount := 0 560 done := make(chan restoreResult) 561 562 for _, machineStatus := range status.Machines { 563 // A newly resumed state server requires no updating, and more 564 // than one state server is not yet support by this plugin. 565 if machineStatus.HasVote || machineStatus.WantsVote || machineStatus.Life == "dead" { 566 continue 567 } 568 pendingMachineCount++ 569 machine := machineStatus 570 go func() { 571 err := runMachineUpdate(client, machine.Id, setAgentAddressScript(stateAddr)) 572 if err != nil { 573 574 logger.Errorf("failed to update machine %s: %v", machine.Id, err) 575 } else { 576 progress("updated machine %s", machine.Id) 577 } 578 r := restoreResult{machineName: machine.Id, err: err} 579 done <- r 580 }() 581 } 582 results := make([]restoreResult, pendingMachineCount) 583 for ; pendingMachineCount > 0; pendingMachineCount-- { 584 results[pendingMachineCount-1] = <-done 585 } 586 return results, nil 587 } 588 589 // runMachineUpdate connects via ssh to the machine and runs the update script 590 func runMachineUpdate(client *api.Client, id string, sshArg string) error { 591 progress("updating machine: %v\n", id) 592 addr, err := client.PublicAddress(id) 593 if err != nil { 594 return errors.Annotate(err, "no public address found") 595 } 596 return runViaSsh(addr, sshArg) 597 } 598 599 func runViaSsh(addr string, script string) error { 600 // This is taken from cmd/juju/ssh.go there is no other clear way to set user 601 userAddr := "ubuntu@" + addr 602 userCmd := ssh.Command(userAddr, []string{"sudo", "-n", "bash", "-c " + utils.ShQuote(script)}, nil) 603 var stderrBuf bytes.Buffer 604 var stdoutBuf bytes.Buffer 605 userCmd.Stderr = &stderrBuf 606 userCmd.Stdout = &stdoutBuf 607 err := userCmd.Run() 608 if err != nil { 609 return errors.Annotate(err, fmt.Sprintf("ssh command failed: (%q)", stderrBuf.String())) 610 } 611 progress("ssh command succedded: %q", stdoutBuf.String()) 612 return nil 613 } 614 615 func sendViaScp(file, host, destFile string) error { 616 err := ssh.Copy([]string{file, "ubuntu@" + host + ":" + destFile}, nil) 617 if err != nil { 618 return errors.Annotate(err, "scp command failed") 619 } 620 return nil 621 } 622 623 func mustParseTemplate(templ string) *template.Template { 624 t := template.New("").Funcs(template.FuncMap{ 625 "shquote": utils.ShQuote, 626 }) 627 return template.Must(t.Parse(templ)) 628 } 629 630 func execTemplate(tmpl *template.Template, data interface{}) string { 631 var buf bytes.Buffer 632 err := tmpl.Execute(&buf, data) 633 if err != nil { 634 panic(errors.Annotate(err, "template error")) 635 } 636 return buf.String() 637 }