github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/machineswatcher/machines.go (about) 1 // Copyright (c) 2021-2022, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package machines 6 7 import ( 8 "bytes" 9 "context" 10 "crypto/ed25519" 11 "encoding/base64" 12 "encoding/hex" 13 "encoding/json" 14 "fmt" 15 "os" 16 "path/filepath" 17 "strings" 18 "sync" 19 "text/template" 20 "time" 21 22 "github.com/choria-io/go-choria/aagent/model" 23 "github.com/choria-io/go-choria/aagent/util" 24 "github.com/choria-io/go-choria/aagent/watchers/event" 25 "github.com/choria-io/go-choria/aagent/watchers/watcher" 26 iu "github.com/choria-io/go-choria/internal/util" 27 "github.com/mitchellh/mapstructure" 28 ) 29 30 type State int 31 32 var ( 33 // PublicKey allows a public key to be compiled in to the binary during CI while using a standard 34 // compiled in machine.yaml, effectively this is equivalent to setting the public_key property 35 PublicKey = "" 36 ) 37 38 const ( 39 Unknown State = iota 40 Skipped 41 Error 42 Updated 43 Unchanged 44 45 wtype = "machines" 46 version = "v1" 47 ) 48 49 var stateNames = map[State]string{ 50 Unknown: "unknown", 51 Skipped: "skipped", 52 Error: "error", 53 Updated: "updated", 54 Unchanged: "unchanged", 55 } 56 57 type Specification struct { 58 Machines []byte `json:"machines"` 59 Signature string `json:"signature,omitempty"` 60 } 61 62 type ManagedMachine struct { 63 Name string `json:"name" yaml:"name"` 64 Source string `json:"source" yaml:"source"` 65 Username string `json:"username" yaml:"username"` 66 Password string `json:"password" yaml:"password"` 67 ContentChecksumsChecksum string `json:"verify_checksum" yaml:"verify_checksum" mapstructure:"verify_checksum"` 68 ArchiveChecksum string `json:"checksum" yaml:"checksum" mapstructure:"checksum"` 69 Matcher string `json:"match" yaml:"match" mapstructure:"match"` 70 Governor string `json:"governor" yaml:"governor" mapstructure:"governor"` 71 72 Interval string `json:"-"` 73 Target string `json:"-"` 74 } 75 76 type Properties struct { 77 // DataItem is the data item key to get ManagedMachines from, typically sourced from Key-Value store 78 DataItem string `mapstructure:"data_item"` 79 // PurgeUnknown will remove machines not declared in DataItem 80 PurgeUnknown bool `mapstructure:"purge_unknown"` 81 // MachineManageInterval is the interval that created machines will use to manage their archives 82 MachineManageInterval time.Duration 83 // PublicKey is the optional ed25519 public key used to sign the specification, when set 84 // the specification received will be validated and any invalid specification will be discarded 85 PublicKey string `mapstructure:"public_key"` 86 } 87 88 type Watcher struct { 89 *watcher.Watcher 90 91 name string 92 machine model.Machine 93 previous State 94 interval time.Duration 95 previousRunTime time.Duration 96 previousManaged []*ManagedMachine 97 properties *Properties 98 99 lastWatch time.Time 100 101 wmu *sync.Mutex 102 mu *sync.Mutex 103 } 104 105 func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, rawprop map[string]any) (any, error) { 106 var err error 107 108 machines := &Watcher{ 109 name: name, 110 machine: machine, 111 properties: &Properties{}, 112 lastWatch: time.Time{}, 113 wmu: &sync.Mutex{}, 114 mu: &sync.Mutex{}, 115 } 116 117 machines.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent) 118 if err != nil { 119 return nil, err 120 } 121 122 err = machines.setProperties(rawprop) 123 if err != nil { 124 return nil, fmt.Errorf("could not set properties: %v", err) 125 } 126 127 if interval != "" { 128 machines.interval, err = iu.ParseDuration(interval) 129 if err != nil { 130 return nil, fmt.Errorf("invalid interval: %v", err) 131 } 132 133 if machines.interval < 2*time.Second { 134 return nil, fmt.Errorf("interval %v is too small", machines.interval) 135 } 136 } 137 138 // Loads the public key from plugin.choria.machine.signing_key when set, overriding the value set here 139 if pk := machine.SignerKey(); pk != "" { 140 machines.properties.PublicKey = pk 141 } 142 143 return machines, nil 144 } 145 146 func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) { 147 defer wg.Done() 148 149 w.Infof("machines watcher %s starting", w.name) 150 151 if w.interval != 0 { 152 wg.Add(1) 153 go w.intervalWatcher(ctx, wg) 154 } 155 156 w.performWatch(ctx, false) 157 158 for { 159 select { 160 case <-w.Watcher.StateChangeC(): 161 w.performWatch(ctx, true) 162 163 case <-ctx.Done(): 164 w.Infof("Stopping on context interrupt") 165 w.CancelGovernor() 166 return 167 } 168 } 169 } 170 171 func (w *Watcher) watch(ctx context.Context) (state State, err error) { 172 if !w.ShouldWatch() { 173 return Skipped, nil 174 } 175 176 start := time.Now() 177 defer func() { 178 w.mu.Lock() 179 w.previousRunTime = time.Since(start) 180 w.mu.Unlock() 181 }() 182 183 desired, err := w.desiredState() 184 if err != nil { 185 return Error, err 186 } 187 188 w.mu.Lock() 189 w.previousManaged = desired 190 w.mu.Unlock() 191 192 purged := false 193 updated := false 194 195 if w.properties.PurgeUnknown { 196 purged, err = w.purgeUnknownMachines(ctx, desired) 197 if err != nil { 198 return Error, err 199 } 200 } 201 202 for _, m := range desired { 203 if m == nil || m.Name == "" { 204 continue 205 } 206 207 match, err := w.isNodeMatch(m) 208 if err != nil { 209 w.Debugf("Could not match machine %s to node: %s", m.Name, err) 210 continue 211 } 212 if !match { 213 continue 214 } 215 216 targetDir := w.targetDirForManagerMachine(m.Name) 217 target := filepath.Join(targetDir, "machine.yaml") 218 spec, err := w.renderMachine(m) 219 if err != nil { 220 w.Errorf("Failed to render machine %s: %v", m.Name, err) 221 continue 222 } 223 224 if iu.FileExist(target) { 225 specHash, err := iu.Sha256HashBytes(spec) 226 if err != nil { 227 w.Errorf("Could not determine hash for spec for %s: %s", m.Name, err) 228 continue 229 } 230 231 ok, _, err := iu.FileHasSha256Sum(target, specHash) 232 if err != nil { 233 w.Errorf("Could not compare spec with target %s: %s", target, err) 234 continue 235 } 236 237 if ok { 238 w.Debugf("Machine in %s has the correct content, continuing", target) 239 continue 240 } else { 241 w.Warnf("Machine in %s has incorrect content, updating", target) 242 } 243 244 err = os.RemoveAll(targetDir) 245 if err != nil { 246 w.Errorf("Could not remove unmatched machine in %s: %s", targetDir, err) 247 return Error, err 248 } 249 } 250 251 w.Warnf("Deploying Choria Autonomous Agent %s from %s", m.Name, m.Source) 252 253 err = os.MkdirAll(targetDir, 0700) 254 if err != nil { 255 w.Errorf("Could not create directory for %s: %s", m.Name, err) 256 continue 257 } 258 259 err = os.WriteFile(target, spec, 0600) 260 if err != nil { 261 w.Errorf("Could not write machine spec for %s: %s", m.Name, err) 262 os.RemoveAll(targetDir) 263 continue 264 } 265 266 updated = true 267 } 268 269 if purged || updated { 270 return Updated, nil 271 } 272 273 return Unchanged, nil 274 } 275 276 func (w *Watcher) handleCheck(s State, err error) error { 277 w.Debugf("handling state for %s %v", stateNames[s], err) 278 279 w.mu.Lock() 280 w.previous = s 281 w.mu.Unlock() 282 283 switch s { 284 case Error: 285 if err != nil { 286 w.Errorf("Managing machines failed: %s", err) 287 } 288 289 w.NotifyWatcherState(w.CurrentState()) 290 return w.FailureTransition() 291 292 case Updated: 293 w.NotifyWatcherState(w.CurrentState()) 294 return w.SuccessTransition() 295 296 } 297 298 return nil 299 } 300 301 func (w *Watcher) renderMachine(m *ManagedMachine) ([]byte, error) { 302 buf := bytes.NewBuffer([]byte{}) 303 t := template.New("machine") 304 305 p, err := t.Parse(string(mdat)) 306 if err != nil { 307 return nil, err 308 } 309 310 err = p.Execute(buf, m) 311 if err != nil { 312 return nil, err 313 } 314 315 return buf.Bytes(), nil 316 } 317 318 func (w *Watcher) targetDirForManagerMachine(m string) string { 319 return filepath.Join(filepath.Dir(w.machine.Directory()), fmt.Sprintf("mm_%s", m)) 320 } 321 322 func (w *Watcher) targetDirForManagedMachine(m string) string { 323 return filepath.Join(filepath.Dir(w.machine.Directory()), m) 324 } 325 326 func (w *Watcher) purgeUnknownMachines(ctx context.Context, desired []*ManagedMachine) (bool, error) { 327 current, err := w.currentMachines() 328 if err != nil { 329 return false, err 330 } 331 332 w.Debugf("Purging unknown machines from current list %v", current) 333 334 purged := false 335 for _, m := range current { 336 keep := false 337 for _, d := range desired { 338 if d == nil || d.Name == "" { 339 continue 340 } 341 342 if m == d.Name { 343 if ok, _ := w.isNodeMatch(d); ok { 344 keep = true 345 break 346 } 347 } 348 } 349 350 if !keep { 351 w.Warnf("Removing existing managed machine %s that is not in new desired set", m) 352 target := w.targetDirForManagerMachine(m) 353 err = os.RemoveAll(target) 354 if err != nil { 355 w.Errorf("Could not remove %s: %s", target, err) 356 continue 357 } 358 359 w.Debugf("Sleeping for 2 seconds to allow manager to exit") 360 iu.InterruptibleSleep(ctx, 2*time.Second) 361 362 target = w.targetDirForManagedMachine(m) 363 err = os.RemoveAll(target) 364 if err != nil { 365 w.Errorf("Could not remove %s: %s", target, err) 366 continue 367 } 368 369 purged = true 370 } 371 } 372 373 return purged, nil 374 } 375 376 func (w *Watcher) currentMachines() ([]string, error) { 377 dirs, err := os.ReadDir(filepath.Dir(w.machine.Directory())) 378 if err != nil { 379 return nil, err 380 } 381 382 var found []string 383 384 for _, e := range dirs { 385 if !e.IsDir() { 386 continue 387 } 388 389 parts := strings.SplitN(e.Name(), "_", 2) 390 if len(parts) != 2 { 391 continue 392 } 393 394 if parts[0] == "mm" { 395 found = append(found, parts[1]) 396 } 397 } 398 399 return found, nil 400 } 401 402 func (w *Watcher) loadAndValidateData() ([]byte, error) { 403 dat, ok := w.machine.DataGet(w.properties.DataItem) 404 if !ok { 405 return nil, fmt.Errorf("data item %s not present", w.properties.DataItem) 406 } 407 408 spec := &Specification{} 409 decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ 410 DecodeHook: mapstructure.ComposeDecodeHookFunc(mapstructure.StringToTimeDurationHookFunc()), 411 Result: &spec, 412 WeaklyTypedInput: true, 413 }) 414 if err != nil { 415 return nil, err 416 } 417 418 err = decoder.Decode(dat) 419 if err != nil { 420 return nil, err 421 } 422 423 payload, err := base64.StdEncoding.DecodeString(string(spec.Machines)) 424 if err != nil { 425 w.Errorf("Invalid base64 encoded machines specification, removing data: %s", err) 426 w.machine.DataDelete(w.properties.DataItem) 427 return nil, fmt.Errorf("invalid data_item") 428 } 429 430 if w.properties.PublicKey != "" { 431 if len(spec.Signature) == 0 { 432 w.Errorf("No signature found in specification, removing data") 433 w.machine.DataDelete(w.properties.DataItem) 434 return nil, fmt.Errorf("invalid data_item") 435 } 436 437 pk, err := hex.DecodeString(w.properties.PublicKey) 438 if err != nil { 439 w.Errorf("invalid public key: %s", err) 440 return nil, fmt.Errorf("invalid data_item") 441 } 442 443 sig, err := hex.DecodeString(spec.Signature) 444 if err != nil { 445 w.Errorf("invalid signature string, removing data %s: %s", w.properties.DataItem, err) 446 w.machine.DataDelete(w.properties.DataItem) 447 return nil, fmt.Errorf("invalid data_item") 448 } 449 450 if !ed25519.Verify(pk, payload, sig) { 451 w.Errorf("Signature in data_item %s did not verify using configured public key '%s', removing data", w.properties.DataItem, w.properties.PublicKey) 452 w.machine.DataDelete(w.properties.DataItem) 453 return nil, fmt.Errorf("invalid data_item") 454 } 455 } 456 457 return payload, nil 458 } 459 460 func (w *Watcher) desiredState() ([]*ManagedMachine, error) { 461 data, err := w.loadAndValidateData() 462 if err != nil { 463 return nil, err 464 } 465 466 desired := []*ManagedMachine{} 467 err = json.Unmarshal(data, &desired) 468 if err != nil { 469 return nil, fmt.Errorf("invalid machines specification: %s", err) 470 } 471 472 for _, m := range desired { 473 m.Interval = w.properties.MachineManageInterval.String() 474 m.Target = filepath.Dir(w.machine.Directory()) 475 476 if m.Name == "" { 477 return nil, fmt.Errorf("name is required") 478 } 479 480 if m.Source == "" { 481 return nil, fmt.Errorf("source is required for %s", m.Name) 482 } 483 484 if m.ArchiveChecksum == "" { 485 return nil, fmt.Errorf("checksum is required for %s", m.Name) 486 } 487 488 if m.Target == "" { 489 return nil, fmt.Errorf("could not determine target for managed machine for %s", m.Name) 490 } 491 492 if m.ContentChecksumsChecksum == "" { 493 return nil, fmt.Errorf("verify_checksum is required for %s", m.Name) 494 } 495 } 496 497 return desired, nil 498 } 499 500 func (w *Watcher) performWatch(ctx context.Context, force bool) { 501 w.wmu.Lock() 502 defer w.wmu.Unlock() 503 504 if !force && time.Since(w.lastWatch) < w.interval { 505 return 506 } 507 508 err := w.handleCheck(w.watch(ctx)) 509 if err != nil { 510 w.Errorf("could not handle watcher event: %s", err) 511 } 512 } 513 514 func (w *Watcher) intervalWatcher(ctx context.Context, wg *sync.WaitGroup) { 515 defer wg.Done() 516 517 tick := time.NewTicker(w.interval) 518 519 for { 520 select { 521 case <-tick.C: 522 w.performWatch(ctx, false) 523 524 case <-ctx.Done(): 525 tick.Stop() 526 return 527 } 528 } 529 } 530 531 func (w *Watcher) setProperties(props map[string]any) error { 532 if w.properties == nil { 533 w.properties = &Properties{} 534 } 535 536 err := util.ParseMapStructure(props, w.properties) 537 if err != nil { 538 return err 539 } 540 541 if PublicKey != "" { 542 w.properties.PublicKey = PublicKey 543 } 544 545 return w.validate() 546 } 547 548 func (w *Watcher) validate() error { 549 if w.properties.DataItem == "" { 550 return fmt.Errorf("data_item is required") 551 } 552 if w.machine.Directory() == "" { 553 return fmt.Errorf("machine store is not configured") 554 } 555 556 if w.properties.MachineManageInterval == 0 { 557 w.properties.MachineManageInterval = 2 * time.Minute 558 } 559 560 return nil 561 } 562 563 func (w *Watcher) CurrentState() any { 564 w.mu.Lock() 565 defer w.mu.Unlock() 566 567 s := &StateNotification{ 568 Event: event.New(w.name, wtype, version, w.machine), 569 PreviousManagedMachines: []string{}, 570 PreviousOutcome: stateNames[w.previous], 571 PreviousRunTime: w.previousRunTime.Nanoseconds(), 572 } 573 574 for _, m := range w.previousManaged { 575 s.PreviousManagedMachines = append(s.PreviousManagedMachines, m.Name) 576 } 577 578 return s 579 }