github.com/mvdan/u-root-coreutils@v0.0.0-20230122170626-c2eef2898555/pkg/watchdogd/watchdogd.go (about) 1 // Copyright 2021 the u-root Authors. All rights reserved 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // watchdogd implements a background process which periodically issues a 6 // keepalive. 7 // 8 // It starts in the running+armed state: 9 // 10 // | watchdogd Running | watchdogd Stopped 11 // ---------+-----------------------+-------------------------- 12 // Watchdog | watchdogd is actively | machine will soon reboot 13 // Armed | keeping machine alive | 14 // ---------+-----------------------+-------------------------- 15 // Watchdog | a hang will not | a hang will not reboot 16 // Disarmed | reboot the machine | the machine 17 // 18 19 package watchdogd 20 21 import ( 22 "context" 23 "errors" 24 "fmt" 25 "io" 26 "log" 27 "net" 28 "os" 29 "strings" 30 "time" 31 32 "github.com/mvdan/u-root-coreutils/pkg/watchdog" 33 "golang.org/x/sys/unix" 34 ) 35 36 const defaultUDS = "/tmp/watchdogd" 37 38 const ( 39 OpStop = 'S' // Stop the watchdogd petting. 40 OpContinue = 'C' // Continue the watchdogd petting. 41 OpDisarm = 'D' // Disarm the watchdog. 42 OpArm = 'A' // Arm the watchdog. 43 ) 44 45 const ( 46 OpResultOk = 'O' // Ok. 47 OpResultError = 'E' // Error. 48 OpResultInvalidOp = 'I' // Invalid Op. 49 ) 50 51 const ( 52 opStopPettingTimeoutSeconds = 10 53 ) 54 55 // Daemon contains running states of an instance of the daemon. 56 type Daemon struct { 57 // CurrentOpts is current operating parameters for the daemon. 58 // 59 // It is assigned at the first call of Run and updated on each subsequent call of it. 60 CurrentOpts *DaemonOpts 61 62 // CurrentWd is an open file descriptor to the watchdog device specified in the daemon options. 63 CurrentWd *watchdog.Watchdog 64 65 // PettingOp syncs the signal to continue or stop petting the watchdog. 66 PettingOp chan int 67 68 // PettingOn indicate if there is an active petting session. 69 PettingOn bool 70 } 71 72 // DaemonOpts contain operating parameters for bootstrapping a watchdog daemon. 73 type DaemonOpts struct { 74 // Dev is the watchdog device. Ex: /dev/watchdog 75 Dev string 76 77 // nil uses the preset values. 0 disables the timeout. 78 Timeout, PreTimeout *time.Duration 79 80 // KeepAlive is the length of the keep alive interval. 81 KeepAlive time.Duration 82 83 // Monitors are called before each keepalive interval. If any monitor 84 // function returns an error, the . 85 Monitors []func() error 86 87 // UDS is the name of daemon's unix domain socket. 88 UDS string 89 } 90 91 // MonitorOops return an error if the kernel logs contain an oops. 92 func MonitorOops() error { 93 dmesg := make([]byte, 256*1024) 94 n, err := unix.Klogctl(unix.SYSLOG_ACTION_READ_ALL, dmesg) 95 if err != nil { 96 return fmt.Errorf("syslog failed: %v", err) 97 } 98 if strings.Contains(string(dmesg[:n]), "Oops:") { 99 return fmt.Errorf("founds Oops in dmesg") 100 } 101 return nil 102 } 103 104 // StartServing enters a loop of accepting and processing next incoming watchdogd operation call. 105 func (d *Daemon) StartServing(l *net.UnixListener) { 106 for { // All requests are processed sequentially. 107 c, err := l.AcceptUnix() 108 if err != nil { 109 log.Printf("Failed to accept new request: %v", err) 110 continue 111 } 112 b := make([]byte, 1) // Expect single byte operation instruction. 113 if _, err := io.ReadAtLeast(c, b, 1); err != nil { 114 log.Printf("Failed to read operation bit, err: %v", err) 115 } 116 op := int(b[0]) 117 log.Printf("New op received: %c", op) 118 var r rune 119 switch op { 120 case OpStop: 121 r = d.StopPetting() 122 case OpContinue: 123 r = d.StartPetting() 124 case OpArm: 125 r = d.ArmWatchdog() 126 case OpDisarm: 127 r = d.DisarmWatchdog() 128 default: 129 r = OpResultInvalidOp 130 } 131 c.Write([]byte{byte(r)}) 132 c.Close() 133 } 134 } 135 136 // setupListener sets up a new "unix" network listener for the daemon. 137 func setupListener(uds string) (*net.UnixListener, func(), error) { 138 os.Remove(uds) 139 140 l, err := net.ListenUnix("unix", &net.UnixAddr{uds, "unix"}) 141 if err != nil { 142 return nil, nil, err 143 } 144 cleanup := func() { 145 os.Remove(uds) 146 } 147 return l, cleanup, nil 148 } 149 150 // armWatchdog starts watchdog timer. 151 func (d *Daemon) ArmWatchdog() rune { 152 if d.CurrentOpts == nil { 153 log.Printf("Current daemon opts is nil, don't know how to arm Watchdog") 154 return OpResultError 155 } 156 wd, err := watchdog.Open(d.CurrentOpts.Dev) 157 if err != nil { 158 // Most likely cause is /dev/watchdog does not exist. 159 // Second most likely cause is another process (perhaps 160 // another watchdogd?) has the file open. 161 log.Printf("Failed to arm: %v", err) 162 return OpResultError 163 } 164 if d.CurrentOpts.Timeout != nil { 165 if err := wd.SetTimeout(*d.CurrentOpts.Timeout); err != nil { 166 d.CurrentWd.Close() 167 log.Printf("Failed to set timeout: %v", err) 168 return OpResultError 169 } 170 } 171 if d.CurrentOpts.PreTimeout != nil { 172 if err := wd.SetPreTimeout(*d.CurrentOpts.PreTimeout); err != nil { 173 d.CurrentWd.Close() 174 log.Printf("Failed to set pretimeout: %v", err) 175 return OpResultError 176 } 177 } 178 d.CurrentWd = wd 179 log.Printf("Watchdog armed") 180 return OpResultOk 181 } 182 183 // disarmWatchdog disarm the watchdog if already armed. 184 func (d *Daemon) DisarmWatchdog() rune { 185 if d.CurrentWd == nil { 186 log.Printf("No armed Watchdog") 187 return OpResultOk 188 } 189 if err := d.CurrentWd.MagicClose(); err != nil { 190 log.Printf("Failed to disarm watchdog: %v", err) 191 return OpResultError 192 } 193 log.Println("Watchdog disarming request went through (Watchdog will not be disabled if CONFIG_WATCHDOG_NOWAYOUT is enabled).") 194 return OpResultOk 195 } 196 197 // doPetting sends keepalive signal to Watchdog when necessary. 198 // 199 // If at least one of the custom monitors failed check(s), it won't send a keepalive 200 // signal. 201 func (d *Daemon) DoPetting() error { 202 if d.CurrentWd == nil { 203 return fmt.Errorf("no reference to any Watchdog") 204 } 205 if err := doMonitors(d.CurrentOpts.Monitors); err != nil { 206 return fmt.Errorf("won't keepalive since at least one of the custom monitors failed: %v", err) 207 } 208 if err := d.CurrentWd.KeepAlive(); err != nil { 209 return err 210 } 211 return nil 212 } 213 214 // startPetting starts Watchdog petting in a new goroutine. 215 func (d *Daemon) StartPetting() rune { 216 if d.PettingOn { 217 log.Printf("Petting ongoing") 218 return OpResultError 219 } 220 221 go func() { 222 d.PettingOn = true 223 defer func() { d.PettingOn = false }() 224 for { 225 select { 226 case op := <-d.PettingOp: 227 if op == OpStop { 228 log.Println("Petting stopped.") 229 return 230 } 231 case <-time.After(d.CurrentOpts.KeepAlive): 232 if err := d.DoPetting(); err != nil { 233 log.Printf("Failed to keeplive: %v", err) 234 // Keep trying to pet until the watchdog times out. 235 } 236 } 237 } 238 }() 239 240 log.Println("Start petting watchdog.") 241 return OpResultOk 242 } 243 244 // stopPetting stops an ongoing petting process if there is. 245 func (d *Daemon) StopPetting() rune { 246 if !d.PettingOn { 247 return OpResultOk 248 } // No petting on, simply return. 249 r := OpResultOk 250 erredOut := func() { 251 <-d.PettingOp 252 log.Printf("Stop petting times out after %d seconds", opStopPettingTimeoutSeconds) 253 r = OpResultError 254 } 255 // It will time out when there is no active petting. 256 t := time.AfterFunc(opStopPettingTimeoutSeconds*time.Second, erredOut) 257 defer t.Stop() 258 d.PettingOp <- OpStop 259 return r 260 } 261 262 // Run starts up the daemon. 263 // 264 // That includes: 265 // 1) Starts listening for watchdog(d) operation requests over unix network. 266 // 2) Arms the watchdog timer if it is not already armed. 267 // 3) Starts petting the watchdog timer. 268 func Run(ctx context.Context, opts *DaemonOpts) error { 269 log.SetPrefix("watchdogd: ") 270 defer log.Printf("Daemon quit") 271 d := New(opts) 272 l, cleanup, err := setupListener(d.CurrentOpts.UDS) 273 if err != nil { 274 return fmt.Errorf("Failed to setup server: %v", err) 275 } 276 go func() { 277 log.Println("Start serving.") 278 d.StartServing(l) 279 }() 280 281 log.Println("Start arming watchdog initially.") 282 if r := d.ArmWatchdog(); r != OpResultOk { 283 return fmt.Errorf("Initial arm failed") 284 } 285 286 if r := d.StartPetting(); r != OpResultOk { 287 return fmt.Errorf("Start petting failed") 288 } 289 290 for { 291 select { 292 case <-ctx.Done(): 293 cleanup() 294 } 295 } 296 } 297 298 // doMonitors is a helper function to run the monitors. 299 // 300 // If there is anything wrong identified, it serves as a signal to stop 301 // petting Watchdog. 302 func doMonitors(monitors []func() error) error { 303 for _, m := range monitors { 304 if err := m(); err != nil { 305 return err 306 } 307 } 308 // All monitors return normal. 309 return nil 310 } 311 312 func New(opts *DaemonOpts) *Daemon { 313 d := &Daemon{ 314 CurrentOpts: opts, 315 PettingOp: make(chan int), 316 PettingOn: false, 317 } 318 return d 319 } 320 321 type client struct { 322 Conn *net.UnixConn 323 } 324 325 func (c *client) Stop() error { 326 return sendAndCheckResult(c.Conn, OpStop) 327 } 328 329 func (c *client) Continue() error { 330 return sendAndCheckResult(c.Conn, OpContinue) 331 } 332 333 func (c *client) Disarm() error { 334 return sendAndCheckResult(c.Conn, OpDisarm) 335 } 336 337 func (c *client) Arm() error { 338 return sendAndCheckResult(c.Conn, OpArm) 339 } 340 341 // sendAndCheckResult sends operation bit and evaluates result. 342 func sendAndCheckResult(c *net.UnixConn, op int) error { 343 n, err := c.Write([]byte{byte(op)}) 344 if err != nil { 345 return err 346 } 347 if n != 1 { 348 return errors.New("no error; but message not delivered neither") 349 } 350 b := make([]byte, 1) 351 if _, err := io.ReadAtLeast(c, b, 1); err != nil { 352 log.Printf("Failed to read operation bit from server: %v", err) 353 } 354 r := int(b[0]) 355 if r != OpResultOk { 356 return fmt.Errorf("non-Ok op result: %c", r) 357 } 358 return nil 359 } 360 361 func NewClientFromUDS(uds string) (*client, error) { 362 conn, err := net.DialUnix("unix", nil, &net.UnixAddr{uds, "unix"}) 363 if err != nil { 364 return nil, err 365 } 366 return &client{Conn: conn}, nil 367 } 368 369 func NewClient() (*client, error) { 370 return NewClientFromUDS(defaultUDS) 371 }