github.com/mvdan/u-root-coreutils@v0.0.0-20230122170626-c2eef2898555/pkg/watchdogd/watchdogd.go (about)

     1  // Copyright 2021 the u-root Authors. All rights reserved
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // watchdogd implements a background process which periodically issues a
     6  // keepalive.
     7  //
     8  // It starts in the running+armed state:
     9  //
    10  //              | watchdogd Running     | watchdogd Stopped
    11  //     ---------+-----------------------+--------------------------
    12  //     Watchdog | watchdogd is actively | machine will soon reboot
    13  //     Armed    | keeping machine alive |
    14  //     ---------+-----------------------+--------------------------
    15  //     Watchdog | a hang will not       | a hang will not reboot
    16  //     Disarmed | reboot the machine    | the machine
    17  //
    18  
    19  package watchdogd
    20  
    21  import (
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"log"
    27  	"net"
    28  	"os"
    29  	"strings"
    30  	"time"
    31  
    32  	"github.com/mvdan/u-root-coreutils/pkg/watchdog"
    33  	"golang.org/x/sys/unix"
    34  )
    35  
// defaultUDS is the unix domain socket path that NewClient dials.
const defaultUDS = "/tmp/watchdogd"

// Operation codes. A client sends exactly one of these as a single byte per
// connection; StartServing dispatches on it.
const (
	OpStop     = 'S' // Stop the watchdogd petting.
	OpContinue = 'C' // Continue the watchdogd petting.
	OpDisarm   = 'D' // Disarm the watchdog.
	OpArm      = 'A' // Arm the watchdog.
)

// Result codes. The daemon answers each operation with one of these as a
// single byte.
const (
	OpResultOk        = 'O' // Ok.
	OpResultError     = 'E' // Error.
	OpResultInvalidOp = 'I' // Invalid Op.
)

const (
	// opStopPettingTimeoutSeconds is how long StopPetting waits for the
	// petting goroutine to take the stop signal before giving up.
	opStopPettingTimeoutSeconds = 10
)
    54  
// Daemon contains the running state of an instance of the daemon.
//
// The fields are read and written from the serving goroutine and the petting
// goroutine without any synchronization visible in this file.
type Daemon struct {
	// CurrentOpts is the current operating parameters for the daemon.
	//
	// New stores the options it is given here.
	CurrentOpts *DaemonOpts

	// CurrentWd is the watchdog device opened by ArmWatchdog from
	// CurrentOpts.Dev. It is nil until the first successful arm.
	CurrentWd *watchdog.Watchdog

	// PettingOp carries the signal to continue or stop petting the watchdog.
	// StopPetting sends OpStop on it; the petting goroutine receives from it.
	PettingOp chan int

	// PettingOn indicates whether there is an active petting session.
	PettingOn bool
}
    71  
// DaemonOpts contains the operating parameters for bootstrapping a watchdog
// daemon.
type DaemonOpts struct {
	// Dev is the watchdog device. Ex: /dev/watchdog
	Dev string

	// nil uses the preset values. 0 disables the timeout.
	//
	// ArmWatchdog only applies a value to the device when its pointer is
	// non-nil.
	Timeout, PreTimeout *time.Duration

	// KeepAlive is the length of the keep alive interval.
	KeepAlive time.Duration

	// Monitors are called before each keepalive is sent. If any monitor
	// function returns an error, the keepalive for that interval is not
	// sent; petting is attempted again on the next interval.
	Monitors []func() error

	// UDS is the name of daemon's unix domain socket.
	UDS string
}
    90  
    91  // MonitorOops return an error if the kernel logs contain an oops.
    92  func MonitorOops() error {
    93  	dmesg := make([]byte, 256*1024)
    94  	n, err := unix.Klogctl(unix.SYSLOG_ACTION_READ_ALL, dmesg)
    95  	if err != nil {
    96  		return fmt.Errorf("syslog failed: %v", err)
    97  	}
    98  	if strings.Contains(string(dmesg[:n]), "Oops:") {
    99  		return fmt.Errorf("founds Oops in dmesg")
   100  	}
   101  	return nil
   102  }
   103  
   104  // StartServing enters a loop of accepting and processing next incoming watchdogd operation call.
   105  func (d *Daemon) StartServing(l *net.UnixListener) {
   106  	for { // All requests are processed sequentially.
   107  		c, err := l.AcceptUnix()
   108  		if err != nil {
   109  			log.Printf("Failed to accept new request: %v", err)
   110  			continue
   111  		}
   112  		b := make([]byte, 1) // Expect single byte operation instruction.
   113  		if _, err := io.ReadAtLeast(c, b, 1); err != nil {
   114  			log.Printf("Failed to read operation bit, err: %v", err)
   115  		}
   116  		op := int(b[0])
   117  		log.Printf("New op received: %c", op)
   118  		var r rune
   119  		switch op {
   120  		case OpStop:
   121  			r = d.StopPetting()
   122  		case OpContinue:
   123  			r = d.StartPetting()
   124  		case OpArm:
   125  			r = d.ArmWatchdog()
   126  		case OpDisarm:
   127  			r = d.DisarmWatchdog()
   128  		default:
   129  			r = OpResultInvalidOp
   130  		}
   131  		c.Write([]byte{byte(r)})
   132  		c.Close()
   133  	}
   134  }
   135  
   136  // setupListener sets up a new "unix" network listener for the daemon.
   137  func setupListener(uds string) (*net.UnixListener, func(), error) {
   138  	os.Remove(uds)
   139  
   140  	l, err := net.ListenUnix("unix", &net.UnixAddr{uds, "unix"})
   141  	if err != nil {
   142  		return nil, nil, err
   143  	}
   144  	cleanup := func() {
   145  		os.Remove(uds)
   146  	}
   147  	return l, cleanup, nil
   148  }
   149  
   150  // armWatchdog starts watchdog timer.
   151  func (d *Daemon) ArmWatchdog() rune {
   152  	if d.CurrentOpts == nil {
   153  		log.Printf("Current daemon opts is nil, don't know how to arm Watchdog")
   154  		return OpResultError
   155  	}
   156  	wd, err := watchdog.Open(d.CurrentOpts.Dev)
   157  	if err != nil {
   158  		// Most likely cause is /dev/watchdog does not exist.
   159  		// Second most likely cause is another process (perhaps
   160  		// another watchdogd?) has the file open.
   161  		log.Printf("Failed to arm: %v", err)
   162  		return OpResultError
   163  	}
   164  	if d.CurrentOpts.Timeout != nil {
   165  		if err := wd.SetTimeout(*d.CurrentOpts.Timeout); err != nil {
   166  			d.CurrentWd.Close()
   167  			log.Printf("Failed to set timeout: %v", err)
   168  			return OpResultError
   169  		}
   170  	}
   171  	if d.CurrentOpts.PreTimeout != nil {
   172  		if err := wd.SetPreTimeout(*d.CurrentOpts.PreTimeout); err != nil {
   173  			d.CurrentWd.Close()
   174  			log.Printf("Failed to set pretimeout: %v", err)
   175  			return OpResultError
   176  		}
   177  	}
   178  	d.CurrentWd = wd
   179  	log.Printf("Watchdog armed")
   180  	return OpResultOk
   181  }
   182  
   183  // disarmWatchdog disarm the watchdog if already armed.
   184  func (d *Daemon) DisarmWatchdog() rune {
   185  	if d.CurrentWd == nil {
   186  		log.Printf("No armed Watchdog")
   187  		return OpResultOk
   188  	}
   189  	if err := d.CurrentWd.MagicClose(); err != nil {
   190  		log.Printf("Failed to disarm watchdog: %v", err)
   191  		return OpResultError
   192  	}
   193  	log.Println("Watchdog disarming request went through (Watchdog will not be disabled if CONFIG_WATCHDOG_NOWAYOUT is enabled).")
   194  	return OpResultOk
   195  }
   196  
   197  // doPetting sends keepalive signal to Watchdog when necessary.
   198  //
   199  // If at least one of the custom monitors failed check(s), it won't send a keepalive
   200  // signal.
   201  func (d *Daemon) DoPetting() error {
   202  	if d.CurrentWd == nil {
   203  		return fmt.Errorf("no reference to any Watchdog")
   204  	}
   205  	if err := doMonitors(d.CurrentOpts.Monitors); err != nil {
   206  		return fmt.Errorf("won't keepalive since at least one of the custom monitors failed: %v", err)
   207  	}
   208  	if err := d.CurrentWd.KeepAlive(); err != nil {
   209  		return err
   210  	}
   211  	return nil
   212  }
   213  
   214  // startPetting starts Watchdog petting in a new goroutine.
   215  func (d *Daemon) StartPetting() rune {
   216  	if d.PettingOn {
   217  		log.Printf("Petting ongoing")
   218  		return OpResultError
   219  	}
   220  
   221  	go func() {
   222  		d.PettingOn = true
   223  		defer func() { d.PettingOn = false }()
   224  		for {
   225  			select {
   226  			case op := <-d.PettingOp:
   227  				if op == OpStop {
   228  					log.Println("Petting stopped.")
   229  					return
   230  				}
   231  			case <-time.After(d.CurrentOpts.KeepAlive):
   232  				if err := d.DoPetting(); err != nil {
   233  					log.Printf("Failed to keeplive: %v", err)
   234  					// Keep trying to pet until the watchdog times out.
   235  				}
   236  			}
   237  		}
   238  	}()
   239  
   240  	log.Println("Start petting watchdog.")
   241  	return OpResultOk
   242  }
   243  
   244  // stopPetting stops an ongoing petting process if there is.
   245  func (d *Daemon) StopPetting() rune {
   246  	if !d.PettingOn {
   247  		return OpResultOk
   248  	} // No petting on, simply return.
   249  	r := OpResultOk
   250  	erredOut := func() {
   251  		<-d.PettingOp
   252  		log.Printf("Stop petting times out after %d seconds", opStopPettingTimeoutSeconds)
   253  		r = OpResultError
   254  	}
   255  	// It will time out when there is no active petting.
   256  	t := time.AfterFunc(opStopPettingTimeoutSeconds*time.Second, erredOut)
   257  	defer t.Stop()
   258  	d.PettingOp <- OpStop
   259  	return r
   260  }
   261  
   262  // Run starts up the daemon.
   263  //
   264  // That includes:
   265  // 1) Starts listening for watchdog(d) operation requests over unix network.
   266  // 2) Arms the watchdog timer if it is not already armed.
   267  // 3) Starts petting the watchdog timer.
   268  func Run(ctx context.Context, opts *DaemonOpts) error {
   269  	log.SetPrefix("watchdogd: ")
   270  	defer log.Printf("Daemon quit")
   271  	d := New(opts)
   272  	l, cleanup, err := setupListener(d.CurrentOpts.UDS)
   273  	if err != nil {
   274  		return fmt.Errorf("Failed to setup server: %v", err)
   275  	}
   276  	go func() {
   277  		log.Println("Start serving.")
   278  		d.StartServing(l)
   279  	}()
   280  
   281  	log.Println("Start arming watchdog initially.")
   282  	if r := d.ArmWatchdog(); r != OpResultOk {
   283  		return fmt.Errorf("Initial arm failed")
   284  	}
   285  
   286  	if r := d.StartPetting(); r != OpResultOk {
   287  		return fmt.Errorf("Start petting failed")
   288  	}
   289  
   290  	for {
   291  		select {
   292  		case <-ctx.Done():
   293  			cleanup()
   294  		}
   295  	}
   296  }
   297  
   298  // doMonitors is a helper function to run the monitors.
   299  //
   300  // If there is anything wrong identified, it serves as a signal to stop
   301  // petting Watchdog.
   302  func doMonitors(monitors []func() error) error {
   303  	for _, m := range monitors {
   304  		if err := m(); err != nil {
   305  			return err
   306  		}
   307  	}
   308  	// All monitors return normal.
   309  	return nil
   310  }
   311  
   312  func New(opts *DaemonOpts) *Daemon {
   313  	d := &Daemon{
   314  		CurrentOpts: opts,
   315  		PettingOp:   make(chan int),
   316  		PettingOn:   false,
   317  	}
   318  	return d
   319  }
   320  
// client sends operations to a running watchdogd over its unix domain socket.
type client struct {
	// Conn is the connection to the daemon's socket.
	Conn *net.UnixConn
}

// Stop sends OpStop to the daemon and returns an error unless the daemon
// answers with OpResultOk.
func (c *client) Stop() error {
	return sendAndCheckResult(c.Conn, OpStop)
}

// Continue sends OpContinue to the daemon and returns an error unless the
// daemon answers with OpResultOk.
func (c *client) Continue() error {
	return sendAndCheckResult(c.Conn, OpContinue)
}

// Disarm sends OpDisarm to the daemon and returns an error unless the daemon
// answers with OpResultOk.
func (c *client) Disarm() error {
	return sendAndCheckResult(c.Conn, OpDisarm)
}

// Arm sends OpArm to the daemon and returns an error unless the daemon
// answers with OpResultOk.
func (c *client) Arm() error {
	return sendAndCheckResult(c.Conn, OpArm)
}
   340  
   341  // sendAndCheckResult sends operation bit and evaluates result.
   342  func sendAndCheckResult(c *net.UnixConn, op int) error {
   343  	n, err := c.Write([]byte{byte(op)})
   344  	if err != nil {
   345  		return err
   346  	}
   347  	if n != 1 {
   348  		return errors.New("no error; but message not delivered neither")
   349  	}
   350  	b := make([]byte, 1)
   351  	if _, err := io.ReadAtLeast(c, b, 1); err != nil {
   352  		log.Printf("Failed to read operation bit from server: %v", err)
   353  	}
   354  	r := int(b[0])
   355  	if r != OpResultOk {
   356  		return fmt.Errorf("non-Ok op result: %c", r)
   357  	}
   358  	return nil
   359  }
   360  
   361  func NewClientFromUDS(uds string) (*client, error) {
   362  	conn, err := net.DialUnix("unix", nil, &net.UnixAddr{uds, "unix"})
   363  	if err != nil {
   364  		return nil, err
   365  	}
   366  	return &client{Conn: conn}, nil
   367  }
   368  
// NewClient returns a client connected to the daemon's default unix domain
// socket (defaultUDS).
func NewClient() (*client, error) {
	return NewClientFromUDS(defaultUDS)
}