github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/core/machinelock/machinelock.go (about)

     1  // Copyright 2018 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package machinelock
     5  
     6  import (
     7  	"fmt"
     8  	"runtime/debug"
     9  	"sort"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/juju/collections/deque"
    14  	"github.com/juju/errors"
    15  	"github.com/juju/lumberjack/v2"
    16  	"github.com/juju/mutex/v2"
    17  	"gopkg.in/yaml.v2"
    18  
    19  	"github.com/juju/juju/core/paths"
    20  )
    21  
    22  // Filename represents the name of the logfile that is created in the LOG_DIR.
    23  const Filename = "machine-lock.log"
    24  
    25  // Lock is used to give external packages something to refer to.
    26  type Lock interface {
    27  	Acquire(spec Spec) (func(), error)
    28  	Report(opts ...ReportOption) (string, error)
    29  }
    30  
    31  // Clock provides an interface for dealing with clocks.
    32  type Clock interface {
    33  	// After waits for the duration to elapse and then sends the
    34  	// current time on the returned channel.
    35  	After(time.Duration) <-chan time.Time
    36  
    37  	// Now returns the current clock time.
    38  	Now() time.Time
    39  }
    40  
    41  // Logger defines the logging methods used by the machine lock.
    42  type Logger interface {
    43  	Debugf(string, ...interface{})
    44  	Warningf(string, ...interface{})
    45  }
    46  
    47  // Config defines the attributes needed to correctly construct
    48  // a machine lock.
    49  type Config struct {
    50  	AgentName   string
    51  	Clock       Clock
    52  	Logger      Logger
    53  	LogFilename string
    54  }
    55  
    56  // Validate ensures that all the required config values are set.
    57  func (c Config) Validate() error {
    58  	if c.AgentName == "" {
    59  		return errors.NotValidf("missing AgentName")
    60  	}
    61  	if c.Clock == nil {
    62  		return errors.NotValidf("missing Clock")
    63  	}
    64  	if c.Logger == nil {
    65  		return errors.NotValidf("missing Logger")
    66  	}
    67  	if c.LogFilename == "" {
    68  		return errors.NotValidf("missing LogFilename")
    69  	}
    70  	return nil
    71  }
    72  
    73  // New creates a new machine lock.
    74  func New(config Config) (*lock, error) {
    75  	if err := config.Validate(); err != nil {
    76  		return nil, errors.Trace(err)
    77  	}
    78  	if err := paths.PrimeLogFile(config.LogFilename); err != nil {
    79  		// This isn't a fatal error so  continue if priming fails.
    80  		_ = fmt.Sprintf("failed to create prime logfile in %s, because: %v", config.LogFilename, err)
    81  	}
    82  	lock := &lock{
    83  		agent:       config.AgentName,
    84  		clock:       config.Clock,
    85  		logger:      config.Logger,
    86  		logFilename: config.LogFilename,
    87  		acquire:     mutex.Acquire,
    88  		spec: mutex.Spec{
    89  			Name:  "machine-lock",
    90  			Clock: config.Clock,
    91  			Delay: 250 * time.Millisecond,
    92  			// Cancel is added in Acquire.
    93  		},
    94  		waiting: make(map[int]*info),
    95  		history: deque.NewWithMaxLen(1000),
    96  	}
    97  	lock.setStartMessage()
    98  	return lock, nil
    99  }
   100  
   101  func (c *lock) setStartMessage() {
   102  	now := c.clock.Now().Format(timeFormat)
   103  	// The reason that we don't attempt to write the start message out immediately
   104  	// is that there may be multiple agents on the machine writing to the same log file.
   105  	// The way that we control this is to open and close the file while we hold the
   106  	// machine lock. We don't want to acquire the lock just to write out the agent
   107  	// has started as this would potentially hold up the agent starting while another
   108  	// agent holds the lock. However it is very useful to have the agent start time
   109  	// recorded in the central file, so we save it and write it out the first time.
   110  	c.startMessage = fmt.Sprintf("%s === agent %s started ===", now, c.agent)
   111  }
   112  
   113  // Spec is an argument struct for the `Acquire` method.
   114  // It must have a Cancel channel and a Worker name defined.
   115  type Spec struct {
   116  	Cancel <-chan struct{}
   117  	// The purpose of the NoCancel is to ensure that there isn't
   118  	// an accidental forgetting of the cancel channel. The primary
   119  	// use case for this is the reboot worker that doesn't want to
   120  	// pass in a cancel channel because it really wants to reboot.
   121  	NoCancel bool
   122  	Worker   string
   123  	Comment  string
   124  	Group    string
   125  }
   126  
   127  // Validate ensures that a Cancel channel and a Worker name are defined.
   128  func (s Spec) Validate() error {
   129  	if s.Cancel == nil {
   130  		if !s.NoCancel {
   131  			return errors.NotValidf("missing Cancel")
   132  		}
   133  	}
   134  	if s.Worker == "" {
   135  		return errors.NotValidf("missing Worker")
   136  	}
   137  	return nil
   138  }
   139  
   140  // Acquire will attempt to acquire the machine hook execution lock.
   141  // The method returns an error if the spec is invalid, or if the Cancel
   142  // channel is signalled before the lock is acquired.
   143  func (c *lock) Acquire(spec Spec) (func(), error) {
   144  	if err := spec.Validate(); err != nil {
   145  		return nil, errors.Trace(err)
   146  	}
   147  	current := &info{
   148  		worker:    spec.Worker,
   149  		comment:   spec.Comment,
   150  		stack:     string(debug.Stack()),
   151  		requested: c.clock.Now(),
   152  	}
   153  	c.mu.Lock()
   154  
   155  	id := c.next
   156  	c.next++
   157  	c.waiting[id] = current
   158  
   159  	mSpec := c.spec
   160  	mSpec.Cancel = spec.Cancel
   161  	if spec.Group != "" {
   162  		mSpec.Name = fmt.Sprintf("%s-%s", mSpec.Name, spec.Group)
   163  	}
   164  
   165  	c.mu.Unlock()
   166  	c.logger.Debugf("acquire machine lock %q for %s (%s)", mSpec.Name, spec.Worker, spec.Comment)
   167  	releaser, err := c.acquire(mSpec)
   168  	c.mu.Lock()
   169  	defer c.mu.Unlock()
   170  	// Remove from the waiting map.
   171  	delete(c.waiting, id)
   172  
   173  	if err != nil {
   174  		return nil, errors.Trace(err)
   175  	}
   176  	c.logger.Debugf("machine lock %q acquired for %s (%s)", mSpec.Name, spec.Worker, spec.Comment)
   177  	c.holder = current
   178  	current.acquired = c.clock.Now()
   179  	return func() {
   180  		// We need to acquire the mutex before we call the releaser
   181  		// to ensure that we move the current to the history before
   182  		// another pending acquisition overwrites the current value.
   183  		c.mu.Lock()
   184  		defer c.mu.Unlock()
   185  		// We write the log file entry before we release the execution
   186  		// lock to ensure that no other agent is attempting to write to the
   187  		// log file.
   188  		current.released = c.clock.Now()
   189  		c.writeLogEntry()
   190  		c.logger.Debugf("machine lock %q released for %s (%s)", mSpec.Name, spec.Worker, spec.Comment)
   191  		releaser.Release()
   192  		c.history.PushFront(current)
   193  		c.holder = nil
   194  	}, nil
   195  }
   196  
   197  func (c *lock) writeLogEntry() {
   198  	// At the time this method is called, the holder is still set and the lock's
   199  	// mutex is held.
   200  	writer := &lumberjack.Logger{
   201  		Filename:   c.logFilename,
   202  		MaxSize:    10, // megabytes
   203  		MaxBackups: 5,
   204  		Compress:   true,
   205  	}
   206  	c.logger.Debugf("created rotating log file %q with max size %d MB and max backups %d",
   207  		writer.Filename, writer.MaxSize, writer.MaxBackups)
   208  	defer func() { _ = writer.Close() }()
   209  
   210  	if c.startMessage != "" {
   211  		_, err := fmt.Fprintln(writer, c.startMessage)
   212  		if err != nil {
   213  			c.logger.Warningf("unable to write startMessage: %s", err.Error())
   214  		}
   215  		c.startMessage = ""
   216  	}
   217  
   218  	_, err := fmt.Fprintln(writer, simpleInfo(c.agent, c.holder, c.clock.Now()))
   219  	if err != nil {
   220  		c.logger.Warningf("unable to release message: %s", err.Error())
   221  	}
   222  }
   223  
   224  type info struct {
   225  	// worker is the worker that wants or has the lock.
   226  	worker string
   227  	// comment is provided by the worker to say what they are doing.
   228  	comment string
   229  	// stack trace for additional debugging
   230  	stack string
   231  
   232  	requested time.Time
   233  	acquired  time.Time
   234  	released  time.Time
   235  }
   236  
   237  type lock struct {
   238  	agent        string
   239  	clock        Clock
   240  	logger       Logger
   241  	logFilename  string
   242  	startMessage string
   243  
   244  	acquire func(mutex.Spec) (mutex.Releaser, error)
   245  
   246  	spec mutex.Spec
   247  
   248  	mu      sync.Mutex
   249  	next    int
   250  	holder  *info
   251  	waiting map[int]*info
   252  	history *deque.Deque
   253  }
   254  
   255  type ReportOption int
   256  
   257  const (
   258  	ShowHistory ReportOption = iota
   259  	ShowStack
   260  	ShowDetailsYAML
   261  )
   262  
   263  func contains(opts []ReportOption, opt ReportOption) bool {
   264  	for _, value := range opts {
   265  		if value == opt {
   266  			return true
   267  		}
   268  	}
   269  	return false
   270  }
   271  
   272  type reportInfo struct {
   273  	Worker  string `yaml:"worker"`
   274  	Comment string `yaml:"comment,omitempty"`
   275  
   276  	Requested string `yaml:"requested,omitempty"`
   277  	Acquired  string `yaml:"acquired,omitempty"`
   278  	Released  string `yaml:"released,omitempty"`
   279  
   280  	WaitTime time.Duration `yaml:"wait-time,omitempty"`
   281  	HoldTime time.Duration `yaml:"hold-time,omitempty"`
   282  
   283  	Stack string `yaml:"stack,omitempty"`
   284  }
   285  
   286  type report struct {
   287  	Holder  interface{}   `yaml:"holder"`
   288  	Waiting []interface{} `yaml:"waiting,omitempty"`
   289  	History []interface{} `yaml:"history,omitempty"`
   290  }
   291  
   292  func (c *lock) Report(opts ...ReportOption) (string, error) {
   293  	includeStack := contains(opts, ShowStack)
   294  	detailsYAML := contains(opts, ShowDetailsYAML)
   295  
   296  	c.mu.Lock()
   297  	defer c.mu.Unlock()
   298  	now := c.clock.Now()
   299  
   300  	r := report{
   301  		Holder: displayInfo(c.holder, includeStack, detailsYAML, now),
   302  	}
   303  	// Show the waiting with oldest first, which will have the smallest
   304  	// map key.
   305  	for _, key := range sortedKeys(c.waiting) {
   306  		r.Waiting = append(r.Waiting, displayInfo(c.waiting[key], includeStack, detailsYAML, now))
   307  	}
   308  	if contains(opts, ShowHistory) {
   309  		iter := c.history.Iterator()
   310  		var v *info
   311  		for iter.Next(&v) {
   312  			r.History = append(r.History, displayInfo(v, includeStack, detailsYAML, now))
   313  		}
   314  	}
   315  
   316  	output := map[string]report{c.agent: r}
   317  	out, err := yaml.Marshal(output)
   318  	if err != nil {
   319  		return "", errors.Trace(err)
   320  	}
   321  	return string(out), nil
   322  }
   323  
   324  func sortedKeys(m map[int]*info) []int {
   325  	values := make([]int, 0, len(m))
   326  	for key := range m {
   327  		values = append(values, key)
   328  	}
   329  	sort.Ints(values)
   330  	return values
   331  }
   332  
   333  func timeOutput(t time.Time) string {
   334  	if t.IsZero() {
   335  		return ""
   336  	}
   337  	return t.String()
   338  }
   339  
   340  func displayInfo(info *info, includeStack, detailsYAML bool, now time.Time) interface{} {
   341  	if !detailsYAML {
   342  		return simpleInfo("", info, now)
   343  	}
   344  	if info == nil {
   345  		return nil
   346  	}
   347  	output := reportInfo{
   348  		Worker:    info.worker,
   349  		Comment:   info.comment,
   350  		Requested: timeOutput(info.requested),
   351  		Acquired:  timeOutput(info.acquired),
   352  		Released:  timeOutput(info.released),
   353  	}
   354  	var other time.Time
   355  	if info.acquired.IsZero() {
   356  		other = now
   357  	} else {
   358  		if info.released.IsZero() {
   359  			other = now
   360  		} else {
   361  			other = info.released
   362  		}
   363  		output.HoldTime = other.Sub(info.acquired).Round(time.Second)
   364  		// Now set other for the wait time.
   365  		other = info.acquired
   366  	}
   367  	output.WaitTime = other.Sub(info.requested).Round(time.Second)
   368  	return &output
   369  }
   370  
   371  func simpleInfo(agent string, info *info, now time.Time) string {
   372  	if info == nil {
   373  		return "none"
   374  	}
   375  	msg := info.worker
   376  	if info.comment != "" {
   377  		msg += " (" + info.comment + ")"
   378  	}
   379  	// We pass in agent when writing to the file, but not for the report.
   380  	// This allows us to have the agent in the file but keep the first column
   381  	// aligned for timestamps.
   382  	if agent != "" {
   383  		msg = agent + ": " + msg
   384  	}
   385  	if info.acquired.IsZero() {
   386  		waiting := now.Sub(info.requested).Round(time.Second)
   387  		return fmt.Sprintf("%s, waiting %s", msg, waiting)
   388  	}
   389  	if info.released.IsZero() {
   390  		holding := now.Sub(info.acquired).Round(time.Second)
   391  		return fmt.Sprintf("%s, holding %s", msg, holding)
   392  	}
   393  	ts := info.released.Format(timeFormat)
   394  	waited := info.acquired.Sub(info.requested).Round(time.Second)
   395  	held := info.released.Sub(info.acquired).Round(time.Second)
   396  	return fmt.Sprintf("%s %s, waited %s, held %s", ts, msg, waited, held)
   397  }
   398  
   399  const timeFormat = "2006-01-02 15:04:05"