github.com/Cloud-Foundations/Dominator@v0.3.4/sub/rpcd/update.go

package rpcd

import (
	"bufio"
	"encoding/json"
	"errors"
	"flag"
	"os"
	"os/exec"
	"os/signal"
	"syscall"
	"time"

	jsonlib "github.com/Cloud-Foundations/Dominator/lib/json"
	"github.com/Cloud-Foundations/Dominator/lib/log"
	"github.com/Cloud-Foundations/Dominator/lib/srpc"
	"github.com/Cloud-Foundations/Dominator/lib/triggers"
	"github.com/Cloud-Foundations/Dominator/lib/wsyscall"
	"github.com/Cloud-Foundations/Dominator/proto/sub"
	"github.com/Cloud-Foundations/Dominator/sub/lib"
)

var (
	readOnly = flag.Bool("readOnly", false,
		"If true, refuse all Fetch and Update requests. For debugging only")
	disableUpdates = flag.Bool("disableUpdates", false,
		"If true, refuse all Update requests. For debugging only")
	disableTriggers = flag.Bool("disableTriggers", false,
		"If true, do not run any triggers. For debugging only")
)

// flusher is implemented by loggers whose buffered output can be flushed.
type flusher interface {
	Flush() error
}

// Update grabs the update lock and applies the requested update. If
// request.Wait is true the update runs synchronously and its error is
// returned, otherwise the update runs in the background.
func (t *rpcType) Update(conn *srpc.Conn, request sub.UpdateRequest,
	reply *sub.UpdateResponse) error {
	if err := t.getUpdateLock(conn); err != nil {
		t.params.Logger.Println(err)
		return err
	}
	t.params.Logger.Printf("Update(%s)\n", conn.Username())
	fs := t.params.FileSystemHistory.FileSystem()
	if request.Wait {
		return t.updateAndUnlock(request, fs.RootDirectoryName())
	}
	go t.updateAndUnlock(request, fs.RootDirectoryName())
	return nil
}

// getUpdateLock checks whether an update may proceed and, if so, marks an
// update as being in progress. It returns an error if updates are disabled,
// if there is no file-system history yet, if another client holds the lock,
// or if a Fetch() or Update() is already in progress.
func (t *rpcType) getUpdateLock(conn *srpc.Conn) error {
	if *readOnly || *disableUpdates {
		return errors.New(
			"Update() rejected due to read-only or disabled-updates mode")
	}
	fs := t.params.FileSystemHistory.FileSystem()
	if fs == nil {
		return errors.New("no file-system history yet")
	}
	t.rwLock.Lock()
	defer t.rwLock.Unlock()
	if err := t.checkIfLockedByAnotherClient(conn); err != nil {
		t.params.Logger.Printf("Error: %s\n", err)
		return err
	}
	if t.fetchInProgress {
		return errors.New("Fetch() in progress")
	}
	if t.updateInProgress {
		return errors.New("Update() already in progress")
	}
	t.updateInProgress = true
	t.lastUpdateError = nil
	return nil
}

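// updateAndUnlock applies the update and clears the update-in-progress flag
// when done. The scanner is disabled while the update runs. The previously
// saved triggers are merged with the triggers in the request and passed to
// the updater as the old triggers; the triggers from the request are saved
// for the next update.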
func (t *rpcType) updateAndUnlock(request sub.UpdateRequest,
	rootDirectoryName string) error {
	defer t.clearUpdateInProgress()
	defer t.params.ScannerConfiguration.BoostCpuLimit(t.params.Logger)
	t.params.DisableScannerFunction(true)
	defer t.params.DisableScannerFunction(false)
	startTime := time.Now()
	oldTriggers := &triggers.MergeableTriggers{}
	file, err := os.Open(t.config.OldTriggersFilename)
	if err == nil {
		decoder := json.NewDecoder(file)
		var trig triggers.Triggers
		err = decoder.Decode(&trig.Triggers)
		file.Close()
		if err == nil {
			oldTriggers.Merge(&trig)
		} else {
			t.params.Logger.Printf(
				"Error decoding old triggers: %s\n", err)
		}
	}
	if request.Triggers != nil {
		// Merge the new triggers into the old triggers. This supports the
		// initial Domination of a machine and the case where the old triggers
		// are incomplete.
		oldTriggers.Merge(request.Triggers)
		file, err = os.Create(t.config.OldTriggersFilename)
		if err == nil {
			writer := bufio.NewWriter(file)
			if err := jsonlib.WriteWithIndent(writer, "    ",
				request.Triggers.Triggers); err != nil {
				t.params.Logger.Printf("Error marshaling triggers: %s\n", err)
			}
			writer.Flush()
			file.Close()
		}
	}
	var hadTriggerFailures bool
	var fsChangeDuration time.Duration
	var lastUpdateError error
	options := lib.UpdateOptions{
		Logger:            t.params.Logger,
		ObjectsDir:        t.config.ObjectsDirectoryName,
		OldTriggers:       oldTriggers.ExportTriggers(),
		RootDirectoryName: rootDirectoryName,
		RunTriggers:       t.runTriggers,
		SkipFilter:        t.params.ScannerConfiguration.ScanFilter,
	}
	if t.config.DisruptionManager != "" {
		options.DisruptionCancel = t.disruptionCancel
		options.DisruptionRequest = t.disruptionRequest
	}
	t.params.WorkdirGoroutine.Run(func() {
		hadTriggerFailures, fsChangeDuration, lastUpdateError =
			lib.UpdateWithOptions(request, options)
	})
	t.lastUpdateHadTriggerFailures = hadTriggerFailures
	t.lastUpdateError = lastUpdateError
	timeTaken := time.Since(startTime)
	if t.lastUpdateError != nil {
		t.params.Logger.Printf("Update(): last error: %s\n", t.lastUpdateError)
	} else {
		note, err := t.generateNote()
		if err != nil {
			t.params.Logger.Println(err)
		}
		t.rwLock.Lock()
		t.lastSuccessfulImageName = request.ImageName
		if err == nil {
			t.lastNote = note
		}
		t.rwLock.Unlock()
	}
	t.params.Logger.Printf("Update() completed in %s (change window: %s)\n",
		timeTaken, fsChangeDuration)
	return t.lastUpdateError
}

// clearUpdateInProgress clears the update-in-progress flag.
func (t *rpcType) clearUpdateInProgress() {
	t.rwLock.Lock()
	defer t.rwLock.Unlock()
	t.updateInProgress = false
}

// runTriggers runs the triggers on the system goroutine.
// Returns true if there were failures.
func (t *rpcType) runTriggers(triggers []*triggers.Trigger, action string,
	logger log.Logger) bool {
	var retval bool
	t.systemGoroutine.Run(func() {
		retval = runTriggers(triggers, action, logger)
	})
	return retval
}

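// handleSignals logs and otherwise ignores the signals received on the
// channel, flushing the logger if it supports flushing. It is used while
// rebooting, so that a signal from the init system does not kill subd before
// the hard-reboot fallback can run.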
func handleSignals(signals <-chan os.Signal, logger log.Logger) {
	for sig := range signals {
		logger.Printf("Caught %s: ignoring\n", sig)
		if logger, ok := logger.(flusher); ok {
			logger.Flush()
		}
	}
}

// hardReboot will try to sync file-system data and then issue a reboot system
// call. It doesn't depend on a working "reboot" programme.
func hardReboot(logger log.Logger) error {
	syncAndWait(logger)
	syncAndWait(logger)
	syncAndWait(logger)
	logger.Println("Calling reboot() system call and wait")
	if logger, ok := logger.(flusher); ok {
		logger.Flush()
	}
	time.Sleep(time.Second)
	return wsyscall.Reboot()
}

// runCommand runs the specified command, logging any failure along with the
// combined output. Returns true on success, else false.
func runCommand(logger log.Logger, name string, args ...string) bool {
	cmd := exec.Command(name, args...)
	if logs, err := cmd.CombinedOutput(); err != nil {
		errMsg := "error running: " + name
		for _, arg := range args {
			errMsg += " " + arg
		}
		errMsg += ": " + err.Error()
		logger.Println(errMsg)
		logger.Println(string(logs))
		return false
	}
	return true
}

// runCommandBackground runs the command in the background and returns a
// channel which receives a message if the command fails.
func runCommandBackground(logger log.Logger, name string,
	args ...string) <-chan struct{} {
	failureChannel := make(chan struct{}, 1)
	go func() {
		if !runCommand(logger, name, args...) {
			failureChannel <- struct{}{}
		}
	}()
	return failureChannel
}

// runTriggers runs the specified triggers for the given action.
// Returns true if there were failures.
func runTriggers(triggerList []*triggers.Trigger, action string,
	logger log.Logger) bool {
	hadFailures := false
	needRestart := false
	logPrefix := ""
	var rebootingTriggers []*triggers.Trigger
	if *disableTriggers {
		logPrefix = "Disabled: "
	}
	for _, trigger := range triggerList {
		if trigger.DoReboot {
			rebootingTriggers = append(rebootingTriggers, trigger)
		}
	}
	if len(rebootingTriggers) > 0 {
		if action == "start" {
			// Since we will reboot, only the rebooting triggers need to run.
			triggerList = rebootingTriggers
		} else {
			logger.Printf("%sWill reboot on start, skipping %s actions\n",
				logPrefix, action)
			return hadFailures
		}
	}
	for _, trigger := range triggerList {
		if trigger.Service == "subd" {
			// Never kill myself, just restart. Must do it last, so that the
			// other triggers are started first.
			if action == "start" {
				needRestart = true
			}
			continue
		}
		logger.Printf("%sAction: service %s %s\n",
			logPrefix, trigger.Service, action)
		if *disableTriggers {
			continue
		}
		if !runCommand(logger, "service", trigger.Service, action) {
			// Ignore a failed "reboot" start action: the reboot is retried
			// below.
			if action != "start" ||
				!trigger.DoReboot ||
				trigger.Service != "reboot" {
				hadFailures = true
			}
		}
	}
	if len(rebootingTriggers) > 0 {
		if hadFailures {
			logger.Printf("%sSome triggers failed, will not reboot\n",
				logPrefix)
			return hadFailures
		}
		logger.Printf("%sRebooting\n", logPrefix)
		if *disableTriggers {
			return hadFailures
		}
		if logger, ok := logger.(flusher); ok {
			logger.Flush()
		}
		// Catch and log (rather than die from) signals, to handle the case
		// where the init system signals subd but doesn't actually reboot: we
		// still want to reach the hard-reboot fallback below.
		signal.Reset(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
		signals := make(chan os.Signal, 1)
		go handleSignals(signals, logger)
		signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM)
		time.Sleep(time.Second)
		failureChannel := runCommandBackground(logger, "reboot", "-f")
		timer := time.NewTimer(30 * time.Second)
		select {
		case <-failureChannel:
			logger.Printf("%sReboot failed, trying harder\n", logPrefix)
		case <-timer.C:
			logger.Printf("%sStill alive after 30 seconds, rebooting harder\n",
				logPrefix)
		}
		if logger, ok := logger.(flusher); ok {
			logger.Flush()
		}
		time.Sleep(time.Second)
		if err := hardReboot(logger); err != nil {
			logger.Printf("%sHard reboot failed: %s\n", logPrefix, err)
		} else {
			time.Sleep(time.Second)
			logger.Printf("%sStill alive after hard reboot. I'm at a loss\n",
				logPrefix)
		}
		return true
	}
	if needRestart {
		logger.Printf("%sAction: service subd restart\n", logPrefix)
		if !runCommand(logger, "service", "subd", "restart") {
			hadFailures = true
		}
	}
	return hadFailures
}

// syncAndWait will try to sync file-system data and then wait 5 seconds.
func syncAndWait(logger log.Logger) {
	logger.Println("Calling sync() system call and wait")
	go wsyscall.Sync()
	time.Sleep(5 * time.Second)
}