github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/core/machinelock/machinelock.go (about) 1 // Copyright 2018 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package machinelock 5 6 import ( 7 "fmt" 8 "runtime/debug" 9 "sort" 10 "sync" 11 "time" 12 13 "github.com/juju/collections/deque" 14 "github.com/juju/errors" 15 "github.com/juju/lumberjack/v2" 16 "github.com/juju/mutex/v2" 17 "gopkg.in/yaml.v2" 18 19 "github.com/juju/juju/core/paths" 20 ) 21 22 // Filename represents the name of the logfile that is created in the LOG_DIR. 23 const Filename = "machine-lock.log" 24 25 // Lock is used to give external packages something to refer to. 26 type Lock interface { 27 Acquire(spec Spec) (func(), error) 28 Report(opts ...ReportOption) (string, error) 29 } 30 31 // Clock provides an interface for dealing with clocks. 32 type Clock interface { 33 // After waits for the duration to elapse and then sends the 34 // current time on the returned channel. 35 After(time.Duration) <-chan time.Time 36 37 // Now returns the current clock time. 38 Now() time.Time 39 } 40 41 // Logger defines the logging methods used by the machine lock. 42 type Logger interface { 43 Debugf(string, ...interface{}) 44 Warningf(string, ...interface{}) 45 } 46 47 // Config defines the attributes needed to correctly construct 48 // a machine lock. 49 type Config struct { 50 AgentName string 51 Clock Clock 52 Logger Logger 53 LogFilename string 54 } 55 56 // Validate ensures that all the required config values are set. 57 func (c Config) Validate() error { 58 if c.AgentName == "" { 59 return errors.NotValidf("missing AgentName") 60 } 61 if c.Clock == nil { 62 return errors.NotValidf("missing Clock") 63 } 64 if c.Logger == nil { 65 return errors.NotValidf("missing Logger") 66 } 67 if c.LogFilename == "" { 68 return errors.NotValidf("missing LogFilename") 69 } 70 return nil 71 } 72 73 // New creates a new machine lock. 74 func New(config Config) (*lock, error) { 75 if err := config.Validate(); err != nil { 76 return nil, errors.Trace(err) 77 } 78 if err := paths.PrimeLogFile(config.LogFilename); err != nil { 79 // This isn't a fatal error so continue if priming fails. 80 _ = fmt.Sprintf("failed to create prime logfile in %s, because: %v", config.LogFilename, err) 81 } 82 lock := &lock{ 83 agent: config.AgentName, 84 clock: config.Clock, 85 logger: config.Logger, 86 logFilename: config.LogFilename, 87 acquire: mutex.Acquire, 88 spec: mutex.Spec{ 89 Name: "machine-lock", 90 Clock: config.Clock, 91 Delay: 250 * time.Millisecond, 92 // Cancel is added in Acquire. 93 }, 94 waiting: make(map[int]*info), 95 history: deque.NewWithMaxLen(1000), 96 } 97 lock.setStartMessage() 98 return lock, nil 99 } 100 101 func (c *lock) setStartMessage() { 102 now := c.clock.Now().Format(timeFormat) 103 // The reason that we don't attempt to write the start message out immediately 104 // is that there may be multiple agents on the machine writing to the same log file. 105 // The way that we control this is to open and close the file while we hold the 106 // machine lock. We don't want to acquire the lock just to write out the agent 107 // has started as this would potentially hold up the agent starting while another 108 // agent holds the lock. However it is very useful to have the agent start time 109 // recorded in the central file, so we save it and write it out the first time. 110 c.startMessage = fmt.Sprintf("%s === agent %s started ===", now, c.agent) 111 } 112 113 // Spec is an argument struct for the `Acquire` method. 114 // It must have a Cancel channel and a Worker name defined. 115 type Spec struct { 116 Cancel <-chan struct{} 117 // The purpose of the NoCancel is to ensure that there isn't 118 // an accidental forgetting of the cancel channel. The primary 119 // use case for this is the reboot worker that doesn't want to 120 // pass in a cancel channel because it really wants to reboot. 121 NoCancel bool 122 Worker string 123 Comment string 124 Group string 125 } 126 127 // Validate ensures that a Cancel channel and a Worker name are defined. 128 func (s Spec) Validate() error { 129 if s.Cancel == nil { 130 if !s.NoCancel { 131 return errors.NotValidf("missing Cancel") 132 } 133 } 134 if s.Worker == "" { 135 return errors.NotValidf("missing Worker") 136 } 137 return nil 138 } 139 140 // Acquire will attempt to acquire the machine hook execution lock. 141 // The method returns an error if the spec is invalid, or if the Cancel 142 // channel is signalled before the lock is acquired. 143 func (c *lock) Acquire(spec Spec) (func(), error) { 144 if err := spec.Validate(); err != nil { 145 return nil, errors.Trace(err) 146 } 147 current := &info{ 148 worker: spec.Worker, 149 comment: spec.Comment, 150 stack: string(debug.Stack()), 151 requested: c.clock.Now(), 152 } 153 c.mu.Lock() 154 155 id := c.next 156 c.next++ 157 c.waiting[id] = current 158 159 mSpec := c.spec 160 mSpec.Cancel = spec.Cancel 161 if spec.Group != "" { 162 mSpec.Name = fmt.Sprintf("%s-%s", mSpec.Name, spec.Group) 163 } 164 165 c.mu.Unlock() 166 c.logger.Debugf("acquire machine lock %q for %s (%s)", mSpec.Name, spec.Worker, spec.Comment) 167 releaser, err := c.acquire(mSpec) 168 c.mu.Lock() 169 defer c.mu.Unlock() 170 // Remove from the waiting map. 171 delete(c.waiting, id) 172 173 if err != nil { 174 return nil, errors.Trace(err) 175 } 176 c.logger.Debugf("machine lock %q acquired for %s (%s)", mSpec.Name, spec.Worker, spec.Comment) 177 c.holder = current 178 current.acquired = c.clock.Now() 179 return func() { 180 // We need to acquire the mutex before we call the releaser 181 // to ensure that we move the current to the history before 182 // another pending acquisition overwrites the current value. 183 c.mu.Lock() 184 defer c.mu.Unlock() 185 // We write the log file entry before we release the execution 186 // lock to ensure that no other agent is attempting to write to the 187 // log file. 188 current.released = c.clock.Now() 189 c.writeLogEntry() 190 c.logger.Debugf("machine lock %q released for %s (%s)", mSpec.Name, spec.Worker, spec.Comment) 191 releaser.Release() 192 c.history.PushFront(current) 193 c.holder = nil 194 }, nil 195 } 196 197 func (c *lock) writeLogEntry() { 198 // At the time this method is called, the holder is still set and the lock's 199 // mutex is held. 200 writer := &lumberjack.Logger{ 201 Filename: c.logFilename, 202 MaxSize: 10, // megabytes 203 MaxBackups: 5, 204 Compress: true, 205 } 206 c.logger.Debugf("created rotating log file %q with max size %d MB and max backups %d", 207 writer.Filename, writer.MaxSize, writer.MaxBackups) 208 defer func() { _ = writer.Close() }() 209 210 if c.startMessage != "" { 211 _, err := fmt.Fprintln(writer, c.startMessage) 212 if err != nil { 213 c.logger.Warningf("unable to write startMessage: %s", err.Error()) 214 } 215 c.startMessage = "" 216 } 217 218 _, err := fmt.Fprintln(writer, simpleInfo(c.agent, c.holder, c.clock.Now())) 219 if err != nil { 220 c.logger.Warningf("unable to release message: %s", err.Error()) 221 } 222 } 223 224 type info struct { 225 // worker is the worker that wants or has the lock. 226 worker string 227 // comment is provided by the worker to say what they are doing. 228 comment string 229 // stack trace for additional debugging 230 stack string 231 232 requested time.Time 233 acquired time.Time 234 released time.Time 235 } 236 237 type lock struct { 238 agent string 239 clock Clock 240 logger Logger 241 logFilename string 242 startMessage string 243 244 acquire func(mutex.Spec) (mutex.Releaser, error) 245 246 spec mutex.Spec 247 248 mu sync.Mutex 249 next int 250 holder *info 251 waiting map[int]*info 252 history *deque.Deque 253 } 254 255 type ReportOption int 256 257 const ( 258 ShowHistory ReportOption = iota 259 ShowStack 260 ShowDetailsYAML 261 ) 262 263 func contains(opts []ReportOption, opt ReportOption) bool { 264 for _, value := range opts { 265 if value == opt { 266 return true 267 } 268 } 269 return false 270 } 271 272 type reportInfo struct { 273 Worker string `yaml:"worker"` 274 Comment string `yaml:"comment,omitempty"` 275 276 Requested string `yaml:"requested,omitempty"` 277 Acquired string `yaml:"acquired,omitempty"` 278 Released string `yaml:"released,omitempty"` 279 280 WaitTime time.Duration `yaml:"wait-time,omitempty"` 281 HoldTime time.Duration `yaml:"hold-time,omitempty"` 282 283 Stack string `yaml:"stack,omitempty"` 284 } 285 286 type report struct { 287 Holder interface{} `yaml:"holder"` 288 Waiting []interface{} `yaml:"waiting,omitempty"` 289 History []interface{} `yaml:"history,omitempty"` 290 } 291 292 func (c *lock) Report(opts ...ReportOption) (string, error) { 293 includeStack := contains(opts, ShowStack) 294 detailsYAML := contains(opts, ShowDetailsYAML) 295 296 c.mu.Lock() 297 defer c.mu.Unlock() 298 now := c.clock.Now() 299 300 r := report{ 301 Holder: displayInfo(c.holder, includeStack, detailsYAML, now), 302 } 303 // Show the waiting with oldest first, which will have the smallest 304 // map key. 305 for _, key := range sortedKeys(c.waiting) { 306 r.Waiting = append(r.Waiting, displayInfo(c.waiting[key], includeStack, detailsYAML, now)) 307 } 308 if contains(opts, ShowHistory) { 309 iter := c.history.Iterator() 310 var v *info 311 for iter.Next(&v) { 312 r.History = append(r.History, displayInfo(v, includeStack, detailsYAML, now)) 313 } 314 } 315 316 output := map[string]report{c.agent: r} 317 out, err := yaml.Marshal(output) 318 if err != nil { 319 return "", errors.Trace(err) 320 } 321 return string(out), nil 322 } 323 324 func sortedKeys(m map[int]*info) []int { 325 values := make([]int, 0, len(m)) 326 for key := range m { 327 values = append(values, key) 328 } 329 sort.Ints(values) 330 return values 331 } 332 333 func timeOutput(t time.Time) string { 334 if t.IsZero() { 335 return "" 336 } 337 return t.String() 338 } 339 340 func displayInfo(info *info, includeStack, detailsYAML bool, now time.Time) interface{} { 341 if !detailsYAML { 342 return simpleInfo("", info, now) 343 } 344 if info == nil { 345 return nil 346 } 347 output := reportInfo{ 348 Worker: info.worker, 349 Comment: info.comment, 350 Requested: timeOutput(info.requested), 351 Acquired: timeOutput(info.acquired), 352 Released: timeOutput(info.released), 353 } 354 var other time.Time 355 if info.acquired.IsZero() { 356 other = now 357 } else { 358 if info.released.IsZero() { 359 other = now 360 } else { 361 other = info.released 362 } 363 output.HoldTime = other.Sub(info.acquired).Round(time.Second) 364 // Now set other for the wait time. 365 other = info.acquired 366 } 367 output.WaitTime = other.Sub(info.requested).Round(time.Second) 368 return &output 369 } 370 371 func simpleInfo(agent string, info *info, now time.Time) string { 372 if info == nil { 373 return "none" 374 } 375 msg := info.worker 376 if info.comment != "" { 377 msg += " (" + info.comment + ")" 378 } 379 // We pass in agent when writing to the file, but not for the report. 380 // This allows us to have the agent in the file but keep the first column 381 // aligned for timestamps. 382 if agent != "" { 383 msg = agent + ": " + msg 384 } 385 if info.acquired.IsZero() { 386 waiting := now.Sub(info.requested).Round(time.Second) 387 return fmt.Sprintf("%s, waiting %s", msg, waiting) 388 } 389 if info.released.IsZero() { 390 holding := now.Sub(info.acquired).Round(time.Second) 391 return fmt.Sprintf("%s, holding %s", msg, holding) 392 } 393 ts := info.released.Format(timeFormat) 394 waited := info.acquired.Sub(info.requested).Round(time.Second) 395 held := info.released.Sub(info.acquired).Round(time.Second) 396 return fmt.Sprintf("%s %s, waited %s, held %s", ts, msg, waited, held) 397 } 398 399 const timeFormat = "2006-01-02 15:04:05"