github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/machineswatcher/machines.go (about)

     1  // Copyright (c) 2021-2022, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package machines
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"crypto/ed25519"
    11  	"encoding/base64"
    12  	"encoding/hex"
    13  	"encoding/json"
    14  	"fmt"
    15  	"os"
    16  	"path/filepath"
    17  	"strings"
    18  	"sync"
    19  	"text/template"
    20  	"time"
    21  
    22  	"github.com/choria-io/go-choria/aagent/model"
    23  	"github.com/choria-io/go-choria/aagent/util"
    24  	"github.com/choria-io/go-choria/aagent/watchers/event"
    25  	"github.com/choria-io/go-choria/aagent/watchers/watcher"
    26  	iu "github.com/choria-io/go-choria/internal/util"
    27  	"github.com/mitchellh/mapstructure"
    28  )
    29  
// State describes the outcome of a single watch run performed by the machines watcher.
type State int
    31  
var (
	// PublicKey allows a public key to be compiled in to the binary during CI while using a standard
	// compiled in machine.yaml, effectively this is equivalent to setting the public_key property.
	//
	// When not empty it replaces any public_key property given to the watcher. The value is
	// decoded as a hex string before it is used to verify a specification signature.
	PublicKey = ""
)
    37  
const (
	// Unknown is the zero value of State, held before any watch run has recorded an outcome.
	Unknown State = iota
	// Skipped is reported when ShouldWatch indicates the watcher should not run.
	Skipped
	// Error is reported when the desired state could not be loaded or a purge or removal failed.
	Error
	// Updated is reported when at least one machine was purged or written.
	Updated
	// Unchanged is reported when a run completed without purging or writing anything.
	Unchanged

	// wtype is the watcher type and version the string used when creating the base
	// watcher and when building state notification events.
	wtype   = "machines"
	version = "v1"
)
    48  
// stateNames maps every State to the lower case name used in debug logs and
// in the PreviousOutcome field of state notifications.
var stateNames = map[State]string{
	Unknown:   "unknown",
	Skipped:   "skipped",
	Error:     "error",
	Updated:   "updated",
	Unchanged: "unchanged",
}
    56  
// Specification is the envelope read from the configured data item.
//
// Machines holds the base64 encoded JSON list of ManagedMachine entries. Signature
// is the hex encoded ed25519 signature of the base64-decoded Machines payload; it is
// only checked, and then required, when a public key is configured.
type Specification struct {
	Machines  []byte `json:"machines"`
	Signature string `json:"signature,omitempty"`
}
    61  
// ManagedMachine describes one machine that should be deployed on matching nodes.
//
// Name, Source, ArchiveChecksum (checksum) and ContentChecksumsChecksum
// (verify_checksum) are required; a specification missing any of them is rejected.
// Name also determines the mm_<name> directory the rendered machine.yaml is written to.
//
// Matcher is handed to isNodeMatch to decide whether the machine applies to this node.
// Username, Password and Governor are not read by the Go code visible here; presumably
// they are consumed by the machine.yaml template — TODO confirm.
//
// Interval and Target are never read from JSON. They are filled in when the desired
// state is loaded: Interval from the MachineManageInterval property and Target from
// the parent of this watcher's machine directory.
type ManagedMachine struct {
	Name                     string `json:"name" yaml:"name"`
	Source                   string `json:"source" yaml:"source"`
	Username                 string `json:"username" yaml:"username"`
	Password                 string `json:"password" yaml:"password"`
	ContentChecksumsChecksum string `json:"verify_checksum" yaml:"verify_checksum" mapstructure:"verify_checksum"`
	ArchiveChecksum          string `json:"checksum" yaml:"checksum" mapstructure:"checksum"`
	Matcher                  string `json:"match" yaml:"match" mapstructure:"match"`
	Governor                 string `json:"governor" yaml:"governor" mapstructure:"governor"`

	Interval string `json:"-"`
	Target   string `json:"-"`
}
    75  
// Properties are the watcher properties parsed from the machine definition.
type Properties struct {
	// DataItem is the data item key to get ManagedMachines from, typically sourced from Key-Value store.
	// It is required.
	DataItem string `mapstructure:"data_item"`
	// PurgeUnknown will remove machines not declared in DataItem
	PurgeUnknown bool `mapstructure:"purge_unknown"`
	// MachineManageInterval is the interval that created machines will use to manage their archives,
	// it defaults to 2 minutes when left at zero
	MachineManageInterval time.Duration
	// PublicKey is the optional ed25519 public key used to sign the specification, when set
	// the specification received will be validated and any invalid specification will be discarded.
	// The key is given as a hex encoded string.
	PublicKey string `mapstructure:"public_key"`
}
    87  
// Watcher is the machines watcher. It embeds the generic *watcher.Watcher and writes or
// removes mm_<name> machine directories next to its own machine directory, based on a
// specification read from the machine data.
//
// previous, previousRunTime and previousManaged record the outcome, duration and desired
// machines of the most recent run and are guarded by mu. wmu serialises performWatch so
// that only one watch run is active at a time. interval is the optional polling interval;
// when zero no interval poller is started.
//
// NOTE(review): lastWatch is set to the zero time in New and is not updated anywhere in the
// code visible here, so the interval check in performWatch presumably never skips a run —
// confirm whether it should be refreshed after each run.
type Watcher struct {
	*watcher.Watcher

	name            string
	machine         model.Machine
	previous        State
	interval        time.Duration
	previousRunTime time.Duration
	previousManaged []*ManagedMachine
	properties      *Properties

	lastWatch time.Time

	wmu *sync.Mutex
	mu  *sync.Mutex
}
   104  
   105  func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, rawprop map[string]any) (any, error) {
   106  	var err error
   107  
   108  	machines := &Watcher{
   109  		name:       name,
   110  		machine:    machine,
   111  		properties: &Properties{},
   112  		lastWatch:  time.Time{},
   113  		wmu:        &sync.Mutex{},
   114  		mu:         &sync.Mutex{},
   115  	}
   116  
   117  	machines.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent)
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	err = machines.setProperties(rawprop)
   123  	if err != nil {
   124  		return nil, fmt.Errorf("could not set properties: %v", err)
   125  	}
   126  
   127  	if interval != "" {
   128  		machines.interval, err = iu.ParseDuration(interval)
   129  		if err != nil {
   130  			return nil, fmt.Errorf("invalid interval: %v", err)
   131  		}
   132  
   133  		if machines.interval < 2*time.Second {
   134  			return nil, fmt.Errorf("interval %v is too small", machines.interval)
   135  		}
   136  	}
   137  
   138  	// Loads the public key from plugin.choria.machine.signing_key when set, overriding the value set here
   139  	if pk := machine.SignerKey(); pk != "" {
   140  		machines.properties.PublicKey = pk
   141  	}
   142  
   143  	return machines, nil
   144  }
   145  
   146  func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) {
   147  	defer wg.Done()
   148  
   149  	w.Infof("machines watcher %s starting", w.name)
   150  
   151  	if w.interval != 0 {
   152  		wg.Add(1)
   153  		go w.intervalWatcher(ctx, wg)
   154  	}
   155  
   156  	w.performWatch(ctx, false)
   157  
   158  	for {
   159  		select {
   160  		case <-w.Watcher.StateChangeC():
   161  			w.performWatch(ctx, true)
   162  
   163  		case <-ctx.Done():
   164  			w.Infof("Stopping on context interrupt")
   165  			w.CancelGovernor()
   166  			return
   167  		}
   168  	}
   169  }
   170  
   171  func (w *Watcher) watch(ctx context.Context) (state State, err error) {
   172  	if !w.ShouldWatch() {
   173  		return Skipped, nil
   174  	}
   175  
   176  	start := time.Now()
   177  	defer func() {
   178  		w.mu.Lock()
   179  		w.previousRunTime = time.Since(start)
   180  		w.mu.Unlock()
   181  	}()
   182  
   183  	desired, err := w.desiredState()
   184  	if err != nil {
   185  		return Error, err
   186  	}
   187  
   188  	w.mu.Lock()
   189  	w.previousManaged = desired
   190  	w.mu.Unlock()
   191  
   192  	purged := false
   193  	updated := false
   194  
   195  	if w.properties.PurgeUnknown {
   196  		purged, err = w.purgeUnknownMachines(ctx, desired)
   197  		if err != nil {
   198  			return Error, err
   199  		}
   200  	}
   201  
   202  	for _, m := range desired {
   203  		if m == nil || m.Name == "" {
   204  			continue
   205  		}
   206  
   207  		match, err := w.isNodeMatch(m)
   208  		if err != nil {
   209  			w.Debugf("Could not match machine %s to node: %s", m.Name, err)
   210  			continue
   211  		}
   212  		if !match {
   213  			continue
   214  		}
   215  
   216  		targetDir := w.targetDirForManagerMachine(m.Name)
   217  		target := filepath.Join(targetDir, "machine.yaml")
   218  		spec, err := w.renderMachine(m)
   219  		if err != nil {
   220  			w.Errorf("Failed to render machine %s: %v", m.Name, err)
   221  			continue
   222  		}
   223  
   224  		if iu.FileExist(target) {
   225  			specHash, err := iu.Sha256HashBytes(spec)
   226  			if err != nil {
   227  				w.Errorf("Could not determine hash for spec for %s: %s", m.Name, err)
   228  				continue
   229  			}
   230  
   231  			ok, _, err := iu.FileHasSha256Sum(target, specHash)
   232  			if err != nil {
   233  				w.Errorf("Could not compare spec with target %s: %s", target, err)
   234  				continue
   235  			}
   236  
   237  			if ok {
   238  				w.Debugf("Machine in %s has the correct content, continuing", target)
   239  				continue
   240  			} else {
   241  				w.Warnf("Machine in %s has incorrect content, updating", target)
   242  			}
   243  
   244  			err = os.RemoveAll(targetDir)
   245  			if err != nil {
   246  				w.Errorf("Could not remove unmatched machine in %s: %s", targetDir, err)
   247  				return Error, err
   248  			}
   249  		}
   250  
   251  		w.Warnf("Deploying Choria Autonomous Agent %s from %s", m.Name, m.Source)
   252  
   253  		err = os.MkdirAll(targetDir, 0700)
   254  		if err != nil {
   255  			w.Errorf("Could not create directory for %s: %s", m.Name, err)
   256  			continue
   257  		}
   258  
   259  		err = os.WriteFile(target, spec, 0600)
   260  		if err != nil {
   261  			w.Errorf("Could not write machine spec for %s: %s", m.Name, err)
   262  			os.RemoveAll(targetDir)
   263  			continue
   264  		}
   265  
   266  		updated = true
   267  	}
   268  
   269  	if purged || updated {
   270  		return Updated, nil
   271  	}
   272  
   273  	return Unchanged, nil
   274  }
   275  
   276  func (w *Watcher) handleCheck(s State, err error) error {
   277  	w.Debugf("handling state for %s %v", stateNames[s], err)
   278  
   279  	w.mu.Lock()
   280  	w.previous = s
   281  	w.mu.Unlock()
   282  
   283  	switch s {
   284  	case Error:
   285  		if err != nil {
   286  			w.Errorf("Managing machines failed: %s", err)
   287  		}
   288  
   289  		w.NotifyWatcherState(w.CurrentState())
   290  		return w.FailureTransition()
   291  
   292  	case Updated:
   293  		w.NotifyWatcherState(w.CurrentState())
   294  		return w.SuccessTransition()
   295  
   296  	}
   297  
   298  	return nil
   299  }
   300  
   301  func (w *Watcher) renderMachine(m *ManagedMachine) ([]byte, error) {
   302  	buf := bytes.NewBuffer([]byte{})
   303  	t := template.New("machine")
   304  
   305  	p, err := t.Parse(string(mdat))
   306  	if err != nil {
   307  		return nil, err
   308  	}
   309  
   310  	err = p.Execute(buf, m)
   311  	if err != nil {
   312  		return nil, err
   313  	}
   314  
   315  	return buf.Bytes(), nil
   316  }
   317  
   318  func (w *Watcher) targetDirForManagerMachine(m string) string {
   319  	return filepath.Join(filepath.Dir(w.machine.Directory()), fmt.Sprintf("mm_%s", m))
   320  }
   321  
   322  func (w *Watcher) targetDirForManagedMachine(m string) string {
   323  	return filepath.Join(filepath.Dir(w.machine.Directory()), m)
   324  }
   325  
   326  func (w *Watcher) purgeUnknownMachines(ctx context.Context, desired []*ManagedMachine) (bool, error) {
   327  	current, err := w.currentMachines()
   328  	if err != nil {
   329  		return false, err
   330  	}
   331  
   332  	w.Debugf("Purging unknown machines from current list %v", current)
   333  
   334  	purged := false
   335  	for _, m := range current {
   336  		keep := false
   337  		for _, d := range desired {
   338  			if d == nil || d.Name == "" {
   339  				continue
   340  			}
   341  
   342  			if m == d.Name {
   343  				if ok, _ := w.isNodeMatch(d); ok {
   344  					keep = true
   345  					break
   346  				}
   347  			}
   348  		}
   349  
   350  		if !keep {
   351  			w.Warnf("Removing existing managed machine %s that is not in new desired set", m)
   352  			target := w.targetDirForManagerMachine(m)
   353  			err = os.RemoveAll(target)
   354  			if err != nil {
   355  				w.Errorf("Could not remove %s: %s", target, err)
   356  				continue
   357  			}
   358  
   359  			w.Debugf("Sleeping for 2 seconds to allow manager to exit")
   360  			iu.InterruptibleSleep(ctx, 2*time.Second)
   361  
   362  			target = w.targetDirForManagedMachine(m)
   363  			err = os.RemoveAll(target)
   364  			if err != nil {
   365  				w.Errorf("Could not remove %s: %s", target, err)
   366  				continue
   367  			}
   368  
   369  			purged = true
   370  		}
   371  	}
   372  
   373  	return purged, nil
   374  }
   375  
   376  func (w *Watcher) currentMachines() ([]string, error) {
   377  	dirs, err := os.ReadDir(filepath.Dir(w.machine.Directory()))
   378  	if err != nil {
   379  		return nil, err
   380  	}
   381  
   382  	var found []string
   383  
   384  	for _, e := range dirs {
   385  		if !e.IsDir() {
   386  			continue
   387  		}
   388  
   389  		parts := strings.SplitN(e.Name(), "_", 2)
   390  		if len(parts) != 2 {
   391  			continue
   392  		}
   393  
   394  		if parts[0] == "mm" {
   395  			found = append(found, parts[1])
   396  		}
   397  	}
   398  
   399  	return found, nil
   400  }
   401  
   402  func (w *Watcher) loadAndValidateData() ([]byte, error) {
   403  	dat, ok := w.machine.DataGet(w.properties.DataItem)
   404  	if !ok {
   405  		return nil, fmt.Errorf("data item %s not present", w.properties.DataItem)
   406  	}
   407  
   408  	spec := &Specification{}
   409  	decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
   410  		DecodeHook:       mapstructure.ComposeDecodeHookFunc(mapstructure.StringToTimeDurationHookFunc()),
   411  		Result:           &spec,
   412  		WeaklyTypedInput: true,
   413  	})
   414  	if err != nil {
   415  		return nil, err
   416  	}
   417  
   418  	err = decoder.Decode(dat)
   419  	if err != nil {
   420  		return nil, err
   421  	}
   422  
   423  	payload, err := base64.StdEncoding.DecodeString(string(spec.Machines))
   424  	if err != nil {
   425  		w.Errorf("Invalid base64 encoded machines specification, removing data: %s", err)
   426  		w.machine.DataDelete(w.properties.DataItem)
   427  		return nil, fmt.Errorf("invalid data_item")
   428  	}
   429  
   430  	if w.properties.PublicKey != "" {
   431  		if len(spec.Signature) == 0 {
   432  			w.Errorf("No signature found in specification, removing data")
   433  			w.machine.DataDelete(w.properties.DataItem)
   434  			return nil, fmt.Errorf("invalid data_item")
   435  		}
   436  
   437  		pk, err := hex.DecodeString(w.properties.PublicKey)
   438  		if err != nil {
   439  			w.Errorf("invalid public key: %s", err)
   440  			return nil, fmt.Errorf("invalid data_item")
   441  		}
   442  
   443  		sig, err := hex.DecodeString(spec.Signature)
   444  		if err != nil {
   445  			w.Errorf("invalid signature string, removing data %s: %s", w.properties.DataItem, err)
   446  			w.machine.DataDelete(w.properties.DataItem)
   447  			return nil, fmt.Errorf("invalid data_item")
   448  		}
   449  
   450  		if !ed25519.Verify(pk, payload, sig) {
   451  			w.Errorf("Signature in data_item %s did not verify using configured public key '%s', removing data", w.properties.DataItem, w.properties.PublicKey)
   452  			w.machine.DataDelete(w.properties.DataItem)
   453  			return nil, fmt.Errorf("invalid data_item")
   454  		}
   455  	}
   456  
   457  	return payload, nil
   458  }
   459  
   460  func (w *Watcher) desiredState() ([]*ManagedMachine, error) {
   461  	data, err := w.loadAndValidateData()
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  
   466  	desired := []*ManagedMachine{}
   467  	err = json.Unmarshal(data, &desired)
   468  	if err != nil {
   469  		return nil, fmt.Errorf("invalid machines specification: %s", err)
   470  	}
   471  
   472  	for _, m := range desired {
   473  		m.Interval = w.properties.MachineManageInterval.String()
   474  		m.Target = filepath.Dir(w.machine.Directory())
   475  
   476  		if m.Name == "" {
   477  			return nil, fmt.Errorf("name is required")
   478  		}
   479  
   480  		if m.Source == "" {
   481  			return nil, fmt.Errorf("source is required for %s", m.Name)
   482  		}
   483  
   484  		if m.ArchiveChecksum == "" {
   485  			return nil, fmt.Errorf("checksum is required for %s", m.Name)
   486  		}
   487  
   488  		if m.Target == "" {
   489  			return nil, fmt.Errorf("could not determine target for managed machine for %s", m.Name)
   490  		}
   491  
   492  		if m.ContentChecksumsChecksum == "" {
   493  			return nil, fmt.Errorf("verify_checksum is required for %s", m.Name)
   494  		}
   495  	}
   496  
   497  	return desired, nil
   498  }
   499  
   500  func (w *Watcher) performWatch(ctx context.Context, force bool) {
   501  	w.wmu.Lock()
   502  	defer w.wmu.Unlock()
   503  
   504  	if !force && time.Since(w.lastWatch) < w.interval {
   505  		return
   506  	}
   507  
   508  	err := w.handleCheck(w.watch(ctx))
   509  	if err != nil {
   510  		w.Errorf("could not handle watcher event: %s", err)
   511  	}
   512  }
   513  
   514  func (w *Watcher) intervalWatcher(ctx context.Context, wg *sync.WaitGroup) {
   515  	defer wg.Done()
   516  
   517  	tick := time.NewTicker(w.interval)
   518  
   519  	for {
   520  		select {
   521  		case <-tick.C:
   522  			w.performWatch(ctx, false)
   523  
   524  		case <-ctx.Done():
   525  			tick.Stop()
   526  			return
   527  		}
   528  	}
   529  }
   530  
   531  func (w *Watcher) setProperties(props map[string]any) error {
   532  	if w.properties == nil {
   533  		w.properties = &Properties{}
   534  	}
   535  
   536  	err := util.ParseMapStructure(props, w.properties)
   537  	if err != nil {
   538  		return err
   539  	}
   540  
   541  	if PublicKey != "" {
   542  		w.properties.PublicKey = PublicKey
   543  	}
   544  
   545  	return w.validate()
   546  }
   547  
   548  func (w *Watcher) validate() error {
   549  	if w.properties.DataItem == "" {
   550  		return fmt.Errorf("data_item is required")
   551  	}
   552  	if w.machine.Directory() == "" {
   553  		return fmt.Errorf("machine store is not configured")
   554  	}
   555  
   556  	if w.properties.MachineManageInterval == 0 {
   557  		w.properties.MachineManageInterval = 2 * time.Minute
   558  	}
   559  
   560  	return nil
   561  }
   562  
   563  func (w *Watcher) CurrentState() any {
   564  	w.mu.Lock()
   565  	defer w.mu.Unlock()
   566  
   567  	s := &StateNotification{
   568  		Event:                   event.New(w.name, wtype, version, w.machine),
   569  		PreviousManagedMachines: []string{},
   570  		PreviousOutcome:         stateNames[w.previous],
   571  		PreviousRunTime:         w.previousRunTime.Nanoseconds(),
   572  	}
   573  
   574  	for _, m := range w.previousManaged {
   575  		s.PreviousManagedMachines = append(s.PreviousManagedMachines, m.Name)
   576  	}
   577  
   578  	return s
   579  }