github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/cmd/jujud/reboot/reboot.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Copyright 2014 Cloudbase Solutions SRL
     3  // Licensed under the AGPLv3, see LICENCE file for details.
     4  
     5  package reboot
     6  
     7  import (
     8  	"os"
     9  	"os/exec"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/juju/errors"
    14  	"github.com/juju/loggo"
    15  	"github.com/juju/names/v5"
    16  
    17  	"github.com/juju/juju/agent"
    18  	"github.com/juju/juju/container"
    19  	"github.com/juju/juju/core/instance"
    20  	"github.com/juju/juju/environs/instances"
    21  	"github.com/juju/juju/rpc/params"
    22  )
    23  
    24  var logger = loggo.GetLogger("juju.cmd.jujud.reboot")
    25  var timeout = 10 * time.Minute
    26  var rebootAfter = 15
    27  
    28  func runCommand(args []string) error {
    29  	err := exec.Command(args[0], args[1:]...).Run()
    30  	return errors.Trace(err)
    31  }
    32  
    33  var tmpFile = func() (*os.File, error) {
    34  	f, err := os.CreateTemp(os.TempDir(), "juju-reboot")
    35  	return f, errors.Trace(err)
    36  }
    37  
// Reboot implements the ExecuteReboot command, which schedules a reboot or
// shutdown of the machine once no containers are listed any more. If
// containers are still listed when the timeout expires, ExecuteReboot returns
// an error and no action is scheduled.
type Reboot struct {
	// acfg exposes the agent configuration; it is used to look up the
	// model UUID when building container manager configs.
	acfg AgentConfig
	// reboot provides the operations the command relies on: listing and
	// stopping services, creating container managers and scheduling the
	// reboot/shutdown action.
	reboot RebootWaiter
}
    44  
    45  func NewRebootWaiter(acfg agent.Config) (*Reboot, error) {
    46  	// ensure we're only running on a machine agent.
    47  	if _, ok := acfg.Tag().(names.MachineTag); !ok {
    48  		return nil, errors.Errorf("Expected names.MachineTag, got: %T --> %v", acfg.Tag(), acfg.Tag())
    49  	}
    50  	return &Reboot{
    51  		acfg:   &agentConfigShim{aCfg: acfg},
    52  		reboot: rebootWaiterShim{},
    53  	}, nil
    54  }
    55  
    56  // ExecuteReboot will wait for all running containers to stop, and then execute
    57  // a shutdown or a reboot (based on the action param)
    58  func (r *Reboot) ExecuteReboot(action params.RebootAction) error {
    59  	if err := r.waitForContainersOrTimeout(); err != nil {
    60  		return errors.Trace(err)
    61  	}
    62  
    63  	// Stop all units before issuing a reboot. During a reboot, the machine agent
    64  	// will attempt to hold the execution lock until the reboot happens. However,
    65  	// since the old file based locking method has been replaced with sockets, if
    66  	// the machine agent is killed by the init system during shutdown, before the
    67  	// unit agents, the lock is released and unit agents start running hooks.
    68  	// When they in turn are killed, the hook is thrown into error state. If
    69  	// automatic retries are disabled, the hook remains in error state.
    70  	if err := r.stopDeployedUnits(); err != nil {
    71  		return errors.Trace(err)
    72  	}
    73  
    74  	if err := r.reboot.ScheduleAction(action, rebootAfter); err != nil {
    75  		return errors.Trace(err)
    76  	}
    77  
    78  	return nil
    79  }
    80  
    81  func (r *Reboot) stopDeployedUnits() error {
    82  	services, err := r.reboot.ListServices()
    83  	if err != nil {
    84  		return err
    85  	}
    86  	for _, svcName := range services {
    87  		if strings.HasPrefix(svcName, `jujud-unit-`) {
    88  			svc, err := r.reboot.NewServiceReference(svcName)
    89  			if err != nil {
    90  				return err
    91  			}
    92  			logger.Debugf("Stopping unit agent: %q", svcName)
    93  			if err = svc.Stop(); err != nil {
    94  				return err
    95  			}
    96  		}
    97  	}
    98  	return nil
    99  }
   100  
   101  func (r *Reboot) runningContainers() ([]instances.Instance, error) {
   102  	var runningInstances []instances.Instance
   103  	modelUUID := r.acfg.Model().Id()
   104  	for _, val := range instance.ContainerTypes {
   105  		managerConfig := container.ManagerConfig{
   106  			container.ConfigModelUUID: modelUUID,
   107  		}
   108  		cfg := managerConfig
   109  		manager, err := r.reboot.NewContainerManager(val, cfg)
   110  		if err != nil {
   111  			return nil, errors.Annotatef(err, "failed to get manager for container type %v", val)
   112  		}
   113  		if !manager.IsInitialized() {
   114  			logger.Infof("container type %q not supported", val)
   115  			continue
   116  		}
   117  		containers, err := manager.ListContainers()
   118  		if err != nil {
   119  			return nil, errors.Annotate(err, "failed to list containers")
   120  		}
   121  		runningInstances = append(runningInstances, containers...)
   122  	}
   123  	return runningInstances, nil
   124  }
   125  
   126  func (r *Reboot) waitForContainersOrTimeout() error {
   127  	c := make(chan error, 1)
   128  	quit := make(chan bool, 1)
   129  	go func() {
   130  		for {
   131  			select {
   132  			case <-quit:
   133  				c <- nil
   134  				return
   135  			default:
   136  				containers, err := r.runningContainers()
   137  				if err != nil {
   138  					c <- err
   139  					return
   140  				}
   141  				if len(containers) == 0 {
   142  					c <- nil
   143  					return
   144  				}
   145  				logger.Warningf("Waiting for containers to shutdown: %v", containers)
   146  				time.Sleep(1 * time.Second)
   147  			}
   148  		}
   149  	}()
   150  
   151  	select {
   152  	case <-time.After(timeout):
   153  		// TODO(fwereade): 2016-03-17 lp:1558657
   154  		// Containers are still up after timeout. C'est la vie
   155  		quit <- true
   156  		return errors.New("Timeout reached waiting for containers to shutdown")
   157  	case err := <-c:
   158  		return errors.Trace(err)
   159  	}
   160  }