github.com/Cloud-Foundations/Dominator@v0.3.4/sub/rpcd/update.go (about)

     1  package rpcd
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/json"
     6  	"errors"
     7  	"flag"
     8  	"os"
     9  	"os/exec"
    10  	"os/signal"
    11  	"syscall"
    12  	"time"
    13  
    14  	jsonlib "github.com/Cloud-Foundations/Dominator/lib/json"
    15  	"github.com/Cloud-Foundations/Dominator/lib/log"
    16  	"github.com/Cloud-Foundations/Dominator/lib/srpc"
    17  	"github.com/Cloud-Foundations/Dominator/lib/triggers"
    18  	"github.com/Cloud-Foundations/Dominator/lib/wsyscall"
    19  	"github.com/Cloud-Foundations/Dominator/proto/sub"
    20  	"github.com/Cloud-Foundations/Dominator/sub/lib"
    21  )
    22  
    23  var (
    24  	readOnly = flag.Bool("readOnly", false,
    25  		"If true, refuse all Fetch and Update requests. For debugging only")
    26  	disableUpdates = flag.Bool("disableUpdates", false,
    27  		"If true, refuse all Update requests. For debugging only")
    28  	disableTriggers = flag.Bool("disableTriggers", false,
    29  		"If true, do not run any triggers. For debugging only")
    30  )
    31  
    32  type flusher interface {
    33  	Flush() error
    34  }
    35  
    36  func (t *rpcType) Update(conn *srpc.Conn, request sub.UpdateRequest,
    37  	reply *sub.UpdateResponse) error {
    38  	if err := t.getUpdateLock(conn); err != nil {
    39  		t.params.Logger.Println(err)
    40  		return err
    41  	}
    42  	t.params.Logger.Printf("Update(%s)\n", conn.Username())
    43  	fs := t.params.FileSystemHistory.FileSystem()
    44  	if request.Wait {
    45  		return t.updateAndUnlock(request, fs.RootDirectoryName())
    46  	}
    47  	go t.updateAndUnlock(request, fs.RootDirectoryName())
    48  	return nil
    49  }
    50  
    51  func (t *rpcType) getUpdateLock(conn *srpc.Conn) error {
    52  	if *readOnly || *disableUpdates {
    53  		return errors.New("Update() rejected due to read-only mode")
    54  	}
    55  	fs := t.params.FileSystemHistory.FileSystem()
    56  	if fs == nil {
    57  		return errors.New("no file-system history yet")
    58  	}
    59  	t.rwLock.Lock()
    60  	defer t.rwLock.Unlock()
    61  	if err := t.checkIfLockedByAnotherClient(conn); err != nil {
    62  		t.params.Logger.Printf("Error: %s\n", err)
    63  		return err
    64  	}
    65  	if t.fetchInProgress {
    66  		return errors.New("Fetch() in progress")
    67  	}
    68  	if t.updateInProgress {
    69  		return errors.New("Update() already in progress")
    70  	}
    71  	t.updateInProgress = true
    72  	t.lastUpdateError = nil
    73  	return nil
    74  }
    75  
    76  func (t *rpcType) updateAndUnlock(request sub.UpdateRequest,
    77  	rootDirectoryName string) error {
    78  	defer t.clearUpdateInProgress()
    79  	defer t.params.ScannerConfiguration.BoostCpuLimit(t.params.Logger)
    80  	t.params.DisableScannerFunction(true)
    81  	defer t.params.DisableScannerFunction(false)
    82  	startTime := time.Now()
    83  	oldTriggers := &triggers.MergeableTriggers{}
    84  	file, err := os.Open(t.config.OldTriggersFilename)
    85  	if err == nil {
    86  		decoder := json.NewDecoder(file)
    87  		var trig triggers.Triggers
    88  		err = decoder.Decode(&trig.Triggers)
    89  		file.Close()
    90  		if err == nil {
    91  			oldTriggers.Merge(&trig)
    92  		} else {
    93  			t.params.Logger.Printf(
    94  				"Error decoding old triggers: %s", err.Error())
    95  		}
    96  	}
    97  	if request.Triggers != nil {
    98  		// Merge new triggers into old triggers. This supports initial
    99  		// Domination of a machine and when the old triggers are incomplete.
   100  		oldTriggers.Merge(request.Triggers)
   101  		file, err = os.Create(t.config.OldTriggersFilename)
   102  		if err == nil {
   103  			writer := bufio.NewWriter(file)
   104  			if err := jsonlib.WriteWithIndent(writer, "    ",
   105  				request.Triggers.Triggers); err != nil {
   106  				t.params.Logger.Printf("Error marshaling triggers: %s", err)
   107  			}
   108  			writer.Flush()
   109  			file.Close()
   110  		}
   111  	}
   112  	var hadTriggerFailures bool
   113  	var fsChangeDuration time.Duration
   114  	var lastUpdateError error
   115  	options := lib.UpdateOptions{
   116  		Logger:            t.params.Logger,
   117  		ObjectsDir:        t.config.ObjectsDirectoryName,
   118  		OldTriggers:       oldTriggers.ExportTriggers(),
   119  		RootDirectoryName: rootDirectoryName,
   120  		RunTriggers:       t.runTriggers,
   121  		SkipFilter:        t.params.ScannerConfiguration.ScanFilter,
   122  	}
   123  	if t.config.DisruptionManager != "" {
   124  		options.DisruptionCancel = t.disruptionCancel
   125  		options.DisruptionRequest = t.disruptionRequest
   126  	}
   127  	t.params.WorkdirGoroutine.Run(func() {
   128  		hadTriggerFailures, fsChangeDuration, lastUpdateError =
   129  			lib.UpdateWithOptions(request, options)
   130  	})
   131  	t.lastUpdateHadTriggerFailures = hadTriggerFailures
   132  	t.lastUpdateError = lastUpdateError
   133  	timeTaken := time.Since(startTime)
   134  	if t.lastUpdateError != nil {
   135  		t.params.Logger.Printf("Update(): last error: %s\n", t.lastUpdateError)
   136  	} else {
   137  		note, err := t.generateNote()
   138  		if err != nil {
   139  			t.params.Logger.Println(err)
   140  		}
   141  		t.rwLock.Lock()
   142  		t.lastSuccessfulImageName = request.ImageName
   143  		if err == nil {
   144  			t.lastNote = note
   145  		}
   146  		t.rwLock.Unlock()
   147  	}
   148  	t.params.Logger.Printf("Update() completed in %s (change window: %s)\n",
   149  		timeTaken, fsChangeDuration)
   150  	return t.lastUpdateError
   151  }
   152  
   153  func (t *rpcType) clearUpdateInProgress() {
   154  	t.rwLock.Lock()
   155  	defer t.rwLock.Unlock()
   156  	t.updateInProgress = false
   157  }
   158  
   159  // Returns true if there were failures.
   160  func (t *rpcType) runTriggers(triggers []*triggers.Trigger, action string,
   161  	logger log.Logger) bool {
   162  	var retval bool
   163  	t.systemGoroutine.Run(func() {
   164  		retval = runTriggers(triggers, action, logger)
   165  	})
   166  	return retval
   167  }
   168  
   169  func handleSignals(signals <-chan os.Signal, logger log.Logger) {
   170  	for sig := range signals {
   171  		logger.Printf("Caught %s: ignoring\n", sig)
   172  		if logger, ok := logger.(flusher); ok {
   173  			logger.Flush()
   174  		}
   175  	}
   176  }
   177  
   178  // hardReboot will try to sync file-system data and then issues a reboot system
   179  // call. It doesn't depend on a working "reboot" programme.
   180  func hardReboot(logger log.Logger) error {
   181  	syncAndWait(logger)
   182  	syncAndWait(logger)
   183  	syncAndWait(logger)
   184  	logger.Println("Calling reboot() system call and wait")
   185  	if logger, ok := logger.(flusher); ok {
   186  		logger.Flush()
   187  	}
   188  	time.Sleep(time.Second)
   189  	return wsyscall.Reboot()
   190  }
   191  
   192  // Returns true on success, else false.
   193  func runCommand(logger log.Logger, name string, args ...string) bool {
   194  	cmd := exec.Command(name, args...)
   195  	if logs, err := cmd.CombinedOutput(); err != nil {
   196  		errMsg := "error running: " + name
   197  		for _, arg := range args {
   198  			errMsg += " " + arg
   199  		}
   200  		errMsg += ": " + err.Error()
   201  		logger.Println(errMsg)
   202  		logger.Println(string(logs))
   203  		return false
   204  	}
   205  	return true
   206  }
   207  
   208  // runCommandBackground returns a channel that receives a message if the command
   209  // fails.
   210  func runCommandBackground(logger log.Logger, name string,
   211  	args ...string) <-chan struct{} {
   212  	failureChannel := make(chan struct{}, 1)
   213  	go func() {
   214  		if !runCommand(logger, name, args...) {
   215  			failureChannel <- struct{}{}
   216  		}
   217  	}()
   218  	return failureChannel
   219  }
   220  
   221  // Returns true if there were failures.
   222  func runTriggers(triggerList []*triggers.Trigger, action string,
   223  	logger log.Logger) bool {
   224  	hadFailures := false
   225  	needRestart := false
   226  	logPrefix := ""
   227  	var rebootingTriggers []*triggers.Trigger
   228  	if *disableTriggers {
   229  		logPrefix = "Disabled: "
   230  	}
   231  	for _, trigger := range triggerList {
   232  		if trigger.DoReboot {
   233  			rebootingTriggers = append(rebootingTriggers, trigger)
   234  		}
   235  	}
   236  	if len(rebootingTriggers) > 0 {
   237  		if action == "start" {
   238  			triggerList = rebootingTriggers
   239  		} else {
   240  			logger.Printf("%sWill reboot on start, skipping %s actions\n",
   241  				logPrefix, action)
   242  			return hadFailures
   243  		}
   244  	}
   245  	for _, trigger := range triggerList {
   246  		if trigger.Service == "subd" {
   247  			// Never kill myself, just restart. Must do it last, so that other
   248  			// triggers are started.
   249  			if action == "start" {
   250  				needRestart = true
   251  			}
   252  			continue
   253  		}
   254  		logger.Printf("%sAction: service %s %s\n",
   255  			logPrefix, trigger.Service, action)
   256  		if *disableTriggers {
   257  			continue
   258  		}
   259  		if !runCommand(logger, "service", trigger.Service, action) {
   260  			// Ignore failure for the "reboot" service: try later.
   261  			if action != "start" ||
   262  				!trigger.DoReboot ||
   263  				trigger.Service != "reboot" {
   264  				hadFailures = true
   265  			}
   266  		}
   267  	}
   268  	if len(rebootingTriggers) > 0 {
   269  		if hadFailures {
   270  			logger.Printf("%sSome triggers failed, will not reboot\n",
   271  				logPrefix)
   272  			return hadFailures
   273  		}
   274  		logger.Printf("%sRebooting\n", logPrefix)
   275  		if *disableTriggers {
   276  			return hadFailures
   277  		}
   278  		if logger, ok := logger.(flusher); ok {
   279  			logger.Flush()
   280  		}
   281  		// Catch and log some signals to try and handle cases where the init
   282  		// system signals subd but doesn't reboot, so we want to reach the hard
   283  		// reboot fallback.
   284  		signal.Reset(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
   285  		signals := make(chan os.Signal, 1)
   286  		go handleSignals(signals, logger)
   287  		signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
   288  		time.Sleep(time.Second)
   289  		failureChannel := runCommandBackground(logger, "reboot", "-f")
   290  		timer := time.NewTimer(30 * time.Second)
   291  		select {
   292  		case <-failureChannel:
   293  			logger.Printf("%sReboot failed, trying harder\n", logPrefix)
   294  		case <-timer.C:
   295  			logger.Printf("%sStill alive after 30 seconds, rebooting harder\n",
   296  				logPrefix)
   297  		}
   298  		if logger, ok := logger.(flusher); ok {
   299  			logger.Flush()
   300  		}
   301  		time.Sleep(time.Second)
   302  		if err := hardReboot(logger); err != nil {
   303  			logger.Printf("%sHard reboot failed: %s\n", logPrefix, err)
   304  		} else {
   305  			time.Sleep(time.Second)
   306  			logger.Printf("%sStill alive after hard reboot. I'm at a loss\n",
   307  				logPrefix)
   308  		}
   309  		return true
   310  	}
   311  	if needRestart {
   312  		logger.Printf("%sAction: service subd restart\n", logPrefix)
   313  		if !runCommand(logger, "service", "subd", "restart") {
   314  			hadFailures = true
   315  		}
   316  	}
   317  	return hadFailures
   318  }
   319  
   320  // syncAndWait will try to sync file-system data and then waits 5 seconds.
   321  func syncAndWait(logger log.Logger) {
   322  	logger.Println("Calling sync() system call and wait")
   323  	go wsyscall.Sync()
   324  	time.Sleep(5 * time.Second)
   325  }