github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/machine/machine.go (about) 1 // Copyright (c) 2019-2024, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package machine 6 7 import ( 8 "context" 9 "encoding/json" 10 "fmt" 11 "math/rand" 12 "os" 13 "path/filepath" 14 "sync" 15 "time" 16 17 "github.com/choria-io/go-choria/aagent/model" 18 "github.com/choria-io/go-choria/backoff" 19 "github.com/choria-io/go-choria/inter" 20 "github.com/choria-io/go-choria/lifecycle" 21 "github.com/ghodss/yaml" 22 "github.com/nats-io/jsm.go" 23 "github.com/sirupsen/logrus" 24 25 "github.com/choria-io/go-choria/aagent/watchers" 26 "github.com/choria-io/go-choria/internal/util" 27 28 "github.com/looplab/fsm" 29 ) 30 31 const dataFileName = "machine_data.json" 32 33 // ActivationChecker allows embedded machines to determine if they should activate or not 34 type ActivationChecker func(*logrus.Entry) bool 35 36 // Machine is a autonomous agent implemented as a Finite State Machine and hosted within Choria Server 37 type Machine struct { 38 // MachineName is the unique name for this machine 39 MachineName string `json:"name" yaml:"name"` 40 41 // MachineVersion is the semver compliant version for the running machine 42 MachineVersion string `json:"version" yaml:"version"` 43 44 // InitialState is the state this machine starts in when it first starts 45 InitialState string `json:"initial_state" yaml:"initial_state"` 46 47 // Transitions contain a list of valid events of transitions this machine can move through 48 Transitions []*Transition `json:"transitions" yaml:"transitions"` 49 50 // WatcherDefs contains all the watchers that can interact with the system 51 WatcherDefs []*watchers.WatcherDef `json:"watchers" yaml:"watchers"` 52 53 // SplayStart causes a random sleep of maximum this many seconds before the machine starts 54 SplayStart int `json:"splay_start" yaml:"splay_start"` 55 56 // ActivationCheck when set this can be called to avoid activating a plugin 57 // typically this would be used when compiling machines into the binary 58 ActivationCheck ActivationChecker `json:"-" yaml:"-"` 59 60 instanceID string 61 identity string 62 directory string 63 manifest string 64 txtfileDir string 65 overridesFile string 66 choriaStatusFile string 67 mainCollective string 68 signerKey string 69 choriaStatusFreq int 70 startTime time.Time 71 72 embedded bool 73 data map[string]any 74 facts func() json.RawMessage 75 jsm *jsm.Manager 76 conn inter.Connector 77 manager WatcherManager 78 fsm *fsm.FSM 79 notifiers []NotificationService 80 knownStates map[string]bool 81 82 // we use a 5 second backoff to limit fast transitions 83 // this when this timer fires it will reset the try counter 84 // to 0, but we reset this timer on every transition meaning 85 // it will only fire once there has been no transitions for 86 // its duration. 87 // 88 // so effectively this means a fast transition loop will slow 89 // down to 1 transition every 5 seconds max but reset to fast 90 // once there have not been a storm of transitions for a while 91 backoffTimer *time.Timer 92 transitionCounter int 93 94 ctx context.Context 95 cancel context.CancelFunc 96 dataMu sync.Mutex 97 sync.Mutex 98 } 99 100 // Transition describes a transition event within the Finite State Machine 101 type Transition struct { 102 // Name is the name for the transition shown in logs and graphs 103 Name string `json:"name" yaml:"name"` 104 105 // From is a list of valid state names from where this transition event is valid 106 From []string `json:"from" yaml:"from"` 107 108 // Destination is the name of the target state this event will move the machine into 109 Destination string `json:"destination" yaml:"destination"` 110 111 // Description is a human friendly description of the purpose of this transition 112 Description string `json:"description" yaml:"description"` 113 } 114 115 // WatcherManager manages watchers 116 type WatcherManager interface { 117 Run(context.Context, *sync.WaitGroup) error 118 NotifyStateChance() 119 SetMachine(any) error 120 WatcherState(watcher string) (any, bool) 121 Delete() 122 } 123 124 func yamlPath(dir string) string { 125 return filepath.Join(dir, "machine.yaml") 126 } 127 128 func FromPlugin(p model.MachineConstructor, manager WatcherManager, log *logrus.Entry) (*Machine, error) { 129 m, ok := p.Machine().(*Machine) 130 if !ok { 131 return nil, fmt.Errorf("%s is not a valid machine plugin", p.PluginName()) 132 } 133 134 if m.ActivationCheck != nil { 135 if !m.ActivationCheck(log) { 136 return nil, fmt.Errorf("%s activation skipped by plugin activation checks", p.PluginName()) 137 } 138 } 139 140 m.embedded = true 141 142 err := initializeMachine(m, "", "", manager) 143 if err != nil { 144 return nil, err 145 } 146 147 return m, nil 148 } 149 150 func FromDir(dir string, manager WatcherManager) (m *Machine, err error) { 151 mpath := yamlPath(dir) 152 153 if !util.FileExist(mpath) { 154 return nil, fmt.Errorf("cannot read %s", mpath) 155 } 156 157 m, err = FromYAML(mpath, manager) 158 if err != nil { 159 return nil, fmt.Errorf("could not load machine.yaml: %s", err) 160 } 161 162 m.directory, err = filepath.Abs(dir) 163 164 return m, err 165 } 166 167 func initializeMachine(m *Machine, dir string, afile string, manager WatcherManager) (err error) { 168 m.notifiers = []NotificationService{} 169 m.manager = manager 170 m.instanceID = m.UniqueID() 171 m.knownStates = make(map[string]bool) 172 m.data = make(map[string]any) 173 174 if dir != "" { 175 m.SetDirectory(dir, afile) 176 } 177 178 err = manager.SetMachine(m) 179 if err != nil { 180 return fmt.Errorf("could not register with manager: %s", err) 181 } 182 183 err = m.Setup() 184 if err != nil { 185 return err 186 } 187 188 return nil 189 } 190 191 // FromYAML loads a machine from a YAML definition 192 func FromYAML(file string, manager WatcherManager) (m *Machine, err error) { 193 afile, err := filepath.Abs(file) 194 if err != nil { 195 return nil, fmt.Errorf("could not determine absolute path for %s: %s", file, err) 196 } 197 198 f, err := os.ReadFile(afile) 199 if err != nil { 200 return nil, err 201 } 202 203 m = &Machine{} 204 err = yaml.Unmarshal(f, m) 205 if err != nil { 206 return nil, err 207 } 208 209 err = initializeMachine(m, filepath.Dir(afile), afile, manager) 210 if err != nil { 211 return nil, err 212 } 213 214 return m, nil 215 } 216 217 // ValidateDir validates a machine.yaml against the v1 schema 218 func ValidateDir(dir string) (validationErrors []string, err error) { 219 mpath := yamlPath(dir) 220 yml, err := os.ReadFile(mpath) 221 if err != nil { 222 return nil, err 223 } 224 225 var dat any 226 err = yaml.Unmarshal(yml, &dat) 227 if err != nil { 228 return nil, err 229 } 230 231 return util.ValidateSchemaFromFS("schemas/choria/machine/v1/manifest.json", dat) 232 } 233 234 func (m *Machine) SetDirectory(dir string, manifest string) error { 235 m.directory = dir 236 if manifest != "" { 237 m.manifest = manifest 238 } 239 240 err := m.loadData() 241 if err != nil { 242 // warning only, we dont want a corrupt data file from stopping the whole world, generally data should 243 // be ephemeral and recreate from other sources like kv or exec watchers, new computers need to be able to 244 // survive without data so should a machine recovering from a bad state 245 m.Warnf("machine", "Could not load data file, discarding: %s", err) 246 } 247 248 return nil 249 } 250 251 func (m *Machine) IsEmbedded() bool { 252 return m.embedded 253 } 254 255 // Facts is the active facts for the node 256 func (m *Machine) Facts() json.RawMessage { 257 m.Lock() 258 fs := m.facts 259 m.Unlock() 260 261 if fs != nil { 262 return fs() 263 } 264 265 return json.RawMessage("{}") 266 } 267 268 // SetFactSource sets a function that return current machine facts 269 func (m *Machine) SetFactSource(facts func() json.RawMessage) { 270 m.Lock() 271 defer m.Unlock() 272 273 m.facts = facts 274 } 275 276 // MainCollective is the main collective this choria belongs to 277 func (m *Machine) MainCollective() string { 278 m.Lock() 279 defer m.Unlock() 280 281 return m.mainCollective 282 } 283 284 // SetMainCollective sets the collective name this machine lives in 285 func (m *Machine) SetMainCollective(collective string) { 286 m.Lock() 287 defer m.Unlock() 288 289 m.mainCollective = collective 290 } 291 292 // SetSignerKey sets the signer key configured in config file that can override the compiled in one 293 func (m *Machine) SetSignerKey(pk string) { 294 m.Lock() 295 defer m.Unlock() 296 297 m.signerKey = pk 298 } 299 300 // SignerKey is a config setable signer key that will override the one that is compiled in 301 func (m *Machine) SignerKey() string { 302 m.Lock() 303 defer m.Unlock() 304 305 return m.signerKey 306 } 307 308 // SetChoriaStatusFile sets the path and write frequency of the choria status file 309 func (m *Machine) SetChoriaStatusFile(f string, freq int) { 310 m.Lock() 311 defer m.Unlock() 312 313 m.choriaStatusFile = f 314 m.choriaStatusFreq = freq 315 } 316 317 // ChoriaStatusFile is the path to and write frequency of the choria status file, empty when not set 318 func (m *Machine) ChoriaStatusFile() (string, int) { 319 m.Lock() 320 defer m.Unlock() 321 322 return m.choriaStatusFile, m.choriaStatusFreq 323 } 324 325 // SetIdentity sets the identity of the node hosting this machine 326 func (m *Machine) SetIdentity(id string) { 327 m.Lock() 328 defer m.Unlock() 329 330 m.identity = id 331 } 332 333 func (m *Machine) SetTextFileDirectory(d string) { 334 m.Lock() 335 defer m.Unlock() 336 337 m.txtfileDir = d 338 } 339 340 func (m *Machine) TextFileDirectory() string { 341 m.Lock() 342 defer m.Unlock() 343 344 return m.txtfileDir 345 } 346 347 func (m *Machine) SetConnection(conn inter.Connector) error { 348 m.Lock() 349 defer m.Unlock() 350 351 mgr, err := jsm.New(conn.Nats()) 352 if err != nil { 353 return err 354 } 355 356 m.conn = conn 357 m.jsm = mgr 358 359 return nil 360 } 361 362 func (m *Machine) PublishLifecycleEvent(t lifecycle.Type, opts ...lifecycle.Option) { 363 m.Lock() 364 conn := m.conn 365 m.Unlock() 366 367 if conn == nil { 368 m.Warnf("machine", "Lifecycle event not published without network connection") 369 return 370 } 371 372 event, err := lifecycle.New(t, opts...) 373 if err != nil { 374 m.Warnf("machine", "Lifecycle event not published: %v", err) 375 return 376 } 377 378 lifecycle.PublishEvent(event, conn) 379 } 380 381 func (m *Machine) JetStreamConnection() (*jsm.Manager, error) { 382 m.Lock() 383 defer m.Unlock() 384 385 var err error 386 if m.jsm == nil { 387 if m.conn != nil { 388 m.jsm, err = jsm.New(m.conn.Nats()) 389 if err != nil { 390 return nil, err 391 } 392 } else { 393 return nil, fmt.Errorf("not supplied") 394 } 395 } 396 397 return m.jsm, nil 398 } 399 400 func (m *Machine) SetOverridesFile(f string) { 401 m.Lock() 402 defer m.Unlock() 403 404 m.overridesFile = f 405 } 406 407 func (m *Machine) OverrideData() ([]byte, error) { 408 m.Lock() 409 source := m.overridesFile 410 m.Unlock() 411 412 if source == "" { 413 return []byte{}, nil 414 } 415 416 // todo: maybe some caching here 417 return os.ReadFile(source) 418 } 419 420 // Watchers retrieves the watcher definitions 421 func (m *Machine) Watchers() []*watchers.WatcherDef { 422 return m.WatcherDefs 423 } 424 425 // Graph produce a dot graph of the fsm 426 func (m *Machine) Graph() string { 427 return fsm.Visualize(m.fsm) 428 } 429 430 func (m *Machine) backoffFunc() { 431 m.Lock() 432 defer m.Unlock() 433 434 m.transitionCounter = 0 435 436 if m.backoffTimer == nil { 437 return 438 } 439 440 m.backoffTimer.Reset(time.Minute) 441 } 442 443 func (m *Machine) buildFSM() error { 444 events := fsm.Events{} 445 446 for _, t := range m.Transitions { 447 events = append(events, fsm.EventDesc{ 448 Dst: t.Destination, 449 Src: t.From, 450 Name: t.Name, 451 }) 452 } 453 454 if len(events) == 0 { 455 return fmt.Errorf("no transitions found") 456 } 457 458 f := fsm.NewFSM(m.InitialState, events, fsm.Callbacks{ 459 "enter_state": func(ctx context.Context, e *fsm.Event) { 460 for i, notifier := range m.notifiers { 461 if i == 0 { 462 m.manager.NotifyStateChance() 463 } 464 465 err := notifier.NotifyPostTransition(&TransitionNotification{ 466 Protocol: "io.choria.machine.v1.transition", 467 Identity: m.Identity(), 468 ID: m.InstanceID(), 469 Version: m.Version(), 470 Timestamp: m.TimeStampSeconds(), 471 Machine: m.MachineName, 472 Transition: e.Event, 473 FromState: e.Src, 474 ToState: e.Dst, 475 Info: m, 476 }) 477 if err != nil { 478 m.Errorf("machine", "Could not publish event notification for %s: %s", e.Event, err) 479 } 480 } 481 }, 482 }) 483 484 m.fsm = f 485 486 return nil 487 } 488 489 // Validate performs basic validation on the machine settings 490 func (m *Machine) Validate() error { 491 if m.MachineName == "" { 492 return fmt.Errorf("a machine name is required") 493 } 494 495 if m.MachineVersion == "" { 496 return fmt.Errorf("a machine version is required") 497 } 498 499 if m.InitialState == "" { 500 return fmt.Errorf("an initial state is required") 501 } 502 503 if len(m.Transitions) == 0 { 504 return fmt.Errorf("no transitions defined") 505 } 506 507 if len(m.WatcherDefs) == 0 { 508 return fmt.Errorf("no watchers defined") 509 } 510 511 for _, w := range m.Watchers() { 512 err := w.ParseAnnounceInterval() 513 if err != nil { 514 return err 515 } 516 517 err = w.ValidateStates(m.KnownStates()) 518 if err != nil { 519 return err 520 } 521 522 err = w.ValidateTransitions(m.KnownTransitions()) 523 if err != nil { 524 return err 525 } 526 } 527 528 return nil 529 } 530 531 // Setup validates and prepares the machine for execution 532 func (m *Machine) Setup() error { 533 err := m.Validate() 534 if err != nil { 535 return fmt.Errorf("validation failed: %s", err) 536 } 537 538 return m.buildFSM() 539 } 540 541 // Start runs the machine in the background 542 func (m *Machine) Start(ctx context.Context, wg *sync.WaitGroup) (started chan struct{}) { 543 m.ctx, m.cancel = context.WithCancel(ctx) 544 545 started = make(chan struct{}, 1) 546 547 runf := func() { 548 if m.SplayStart > 0 { 549 sleep := time.Duration(rand.Intn(m.SplayStart)) * time.Second 550 m.Infof(m.MachineName, "Sleeping %v before starting Autonomous Agent", sleep) 551 552 t := time.NewTimer(sleep) 553 554 select { 555 case <-t.C: 556 case <-m.ctx.Done(): 557 m.startTime = time.Time{} 558 m.Infof(m.MachineName, "Exiting on context interrupt") 559 started <- struct{}{} 560 return 561 } 562 } 563 564 m.Infof(m.MachineName, "Starting Choria Machine %s version %s from %s in state %s", m.MachineName, m.MachineVersion, m.directory, m.InitialState) 565 566 err := m.manager.Run(m.ctx, wg) 567 if err != nil { 568 m.Errorf(m.MachineName, "Could not start manager: %s", err) 569 } else { 570 m.startTime = time.Now().UTC() 571 } 572 573 started <- struct{}{} 574 } 575 576 go runf() 577 578 return started 579 } 580 581 // IsStarted determines if the machine is currently running 582 func (m *Machine) IsStarted() bool { 583 m.Lock() 584 defer m.Unlock() 585 586 return !m.startTime.IsZero() 587 } 588 589 // Delete deletes a running machine by canceling its context and giving its manager 590 // a change to do clean up before final termination 591 func (m *Machine) Delete() { 592 m.Lock() 593 defer m.Unlock() 594 595 m.manager.Delete() 596 597 if m.backoffTimer != nil { 598 m.backoffTimer.Stop() 599 } 600 601 if m.cancel != nil { 602 m.Infof("runner", "Stopping") 603 m.cancel() 604 } 605 606 m.startTime = time.Time{} 607 } 608 609 // Stop stops a running machine by canceling its context 610 func (m *Machine) Stop() { 611 m.Lock() 612 defer m.Unlock() 613 614 if m.backoffTimer != nil { 615 m.backoffTimer.Stop() 616 } 617 618 if m.cancel != nil { 619 m.Infof("runner", "Stopping") 620 m.cancel() 621 } 622 623 m.startTime = time.Time{} 624 } 625 626 func (m *Machine) backoffTransition(t string) error { 627 if m.backoffTimer == nil { 628 m.backoffTimer = time.AfterFunc(time.Minute, m.backoffFunc) 629 } 630 631 if m.transitionCounter > 0 { 632 m.Infof("machine", "Rate limiting fast transition %s after %d transitions without a quiet period for %s", t, m.transitionCounter, backoff.FiveSecStartGrace.Duration(m.transitionCounter)) 633 err := backoff.FiveSecStartGrace.TrySleep(m.ctx, m.transitionCounter) 634 if err != nil { 635 return err 636 } 637 638 m.backoffTimer.Reset(time.Minute) 639 } 640 641 m.transitionCounter++ 642 643 return nil 644 } 645 646 // Transition performs the machine transition as defined by event t 647 func (m *Machine) Transition(t string, args ...any) error { 648 m.Lock() 649 defer m.Unlock() 650 651 if t == "" { 652 return nil 653 } 654 655 if m.Can(t) { 656 err := m.backoffTransition(t) 657 if err != nil { 658 return err 659 } 660 661 m.fsm.Event(m.ctx, t, args...) 662 } else { 663 m.Warnf("machine", "Could not fire '%s' event while in %s", t, m.fsm.Current()) 664 } 665 666 return nil 667 } 668 669 // Can determines if a transition could be performed 670 func (m *Machine) Can(t string) bool { 671 return m.fsm.Can(t) 672 } 673 674 // KnownTransitions is a list of known transition names 675 func (m *Machine) KnownTransitions() []string { 676 transitions := make([]string, len(m.Transitions)) 677 678 for i, t := range m.Transitions { 679 transitions[i] = t.Name 680 } 681 682 return transitions 683 } 684 685 // KnownStates is a list of all the known states in the Machine gathered by looking at initial state and all the states mentioned in transitions 686 func (m *Machine) KnownStates() []string { 687 m.Lock() 688 defer m.Unlock() 689 690 lister := func() []string { 691 var states []string 692 693 for k := range m.knownStates { 694 states = append(states, k) 695 } 696 697 return states 698 } 699 700 if len(m.knownStates) > 0 { 701 return lister() 702 } 703 704 m.knownStates = make(map[string]bool) 705 706 m.knownStates[m.InitialState] = true 707 708 for _, t := range m.Transitions { 709 m.knownStates[t.Destination] = true 710 711 for _, e := range t.From { 712 m.knownStates[e] = true 713 } 714 } 715 716 return lister() 717 } 718 719 // DataGet gets the value for a key, empty string and false when no value is stored 720 func (m *Machine) DataGet(key string) (any, bool) { 721 m.dataMu.Lock() 722 defer m.dataMu.Unlock() 723 724 v, ok := m.data[key] 725 726 return v, ok 727 } 728 729 // DataPut stores a value in a key 730 func (m *Machine) DataPut(key string, val any) error { 731 m.dataMu.Lock() 732 defer m.dataMu.Unlock() 733 734 m.data[key] = val 735 736 err := m.saveData() 737 if err != nil { 738 m.Errorf("machine", "Could not save data to %s: %s", dataFileName, err) 739 return err 740 } 741 742 return nil 743 } 744 745 // DataDelete deletes a value from the store 746 func (m *Machine) DataDelete(key string) error { 747 m.dataMu.Lock() 748 defer m.dataMu.Unlock() 749 750 _, ok := m.data[key] 751 if !ok { 752 return nil 753 } 754 755 delete(m.data, key) 756 757 err := m.saveData() 758 if err != nil { 759 m.Errorf("machine", "Could not save data to %s: %s", dataFileName, err) 760 return err 761 } 762 763 return nil 764 } 765 766 func (m *Machine) loadData() error { 767 path := filepath.Join(m.Directory(), dataFileName) 768 if !util.FileExist(path) { 769 return nil 770 } 771 772 j, err := os.ReadFile(path) 773 if err != nil { 774 return err 775 } 776 777 m.dataMu.Lock() 778 defer m.dataMu.Unlock() 779 780 return json.Unmarshal(j, &m.data) 781 } 782 783 // lock should be held by caller 784 func (m *Machine) saveData() error { 785 j, err := json.Marshal(m.data) 786 if err != nil { 787 return err 788 } 789 790 tf, err := os.CreateTemp(m.Directory(), "") 791 if err != nil { 792 return err 793 } 794 defer os.Remove(tf.Name()) 795 796 _, err = tf.Write(j) 797 tf.Close() 798 if err != nil { 799 return err 800 } 801 802 return os.Rename(tf.Name(), filepath.Join(m.Directory(), dataFileName)) 803 } 804 805 // Data retrieves a copy of the current data stored by the machine, changes will not be reflected in the machine 806 func (m *Machine) Data() map[string]any { 807 m.dataMu.Lock() 808 defer m.dataMu.Unlock() 809 810 res := make(map[string]any, len(m.data)) 811 for k, v := range m.data { 812 res[k] = v 813 } 814 815 return res 816 }