github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/gossipwatcher/gossip.go (about)

     1  // Copyright (c) 2022-2024, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package gossipwatcher
     6  
     7  import (
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"net"
    12  	"regexp"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/choria-io/go-choria/aagent/model"
    18  	"github.com/choria-io/go-choria/aagent/util"
    19  	"github.com/choria-io/go-choria/aagent/watchers/event"
    20  	"github.com/choria-io/go-choria/aagent/watchers/watcher"
    21  	iu "github.com/choria-io/go-choria/internal/util"
    22  	"github.com/nats-io/nats.go"
    23  )
    24  
    25  type State int
    26  
    27  const (
    28  	Stopped State = iota
    29  	Running
    30  
    31  	wtype   = "gossip"
    32  	version = "v1"
    33  )
    34  
    35  var (
    36  	validBasicName    = `[a-zA-Z][a-zA-Z\d_-]*`
    37  	validServiceRegex = regexp.MustCompile(`^` + validBasicName + `$`)
    38  )
    39  
    40  type Registration struct {
    41  	Cluster     string            `json:"cluster"`
    42  	Service     string            `json:"service"`
    43  	Protocol    string            `json:"protocol"`
    44  	IP          string            `json:"address"`
    45  	Port        uint              `json:"port"`
    46  	Priority    uint              `json:"priority"`
    47  	Annotations map[string]string `json:"annotations,omitempty"`
    48  	Prefix      string            `json:"-"`
    49  }
    50  
    51  type properties struct {
    52  	Subject      string
    53  	Payload      string
    54  	Registration *Registration
    55  }
    56  
    57  type Watcher struct {
    58  	*watcher.Watcher
    59  	properties *properties
    60  
    61  	name         string
    62  	machine      model.Machine
    63  	nc           *nats.Conn
    64  	interval     time.Duration
    65  	gossipCancel context.CancelFunc
    66  	runCtx       context.Context
    67  	state        State
    68  	lastSubject  string
    69  	lastPayload  string
    70  	lastGossip   time.Time
    71  
    72  	terminate chan struct{}
    73  	mu        *sync.Mutex
    74  }
    75  
    76  func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, properties map[string]any) (any, error) {
    77  	var err error
    78  
    79  	tw := &Watcher{
    80  		name:      name,
    81  		machine:   machine,
    82  		terminate: make(chan struct{}),
    83  		mu:        &sync.Mutex{},
    84  	}
    85  
    86  	tw.interval, err = iu.ParseDuration(interval)
    87  	if err != nil {
    88  		return nil, err
    89  	}
    90  
    91  	tw.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  
    96  	err = tw.setProperties(properties)
    97  	if err != nil {
    98  		return nil, fmt.Errorf("could not set properties: %s", err)
    99  	}
   100  
   101  	return tw, nil
   102  }
   103  
   104  func (w *Watcher) getConn() (*nats.Conn, error) {
   105  	w.mu.Lock()
   106  	defer w.mu.Unlock()
   107  
   108  	if w.nc != nil {
   109  		return w.nc, nil
   110  	}
   111  
   112  	mgr, err := w.machine.JetStreamConnection()
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  
   117  	w.nc = mgr.NatsConn()
   118  
   119  	return w.nc, nil
   120  }
   121  
   122  func (w *Watcher) stopGossip() {
   123  	w.mu.Lock()
   124  	cancel := w.gossipCancel
   125  	w.state = Stopped
   126  	w.mu.Unlock()
   127  
   128  	if cancel != nil {
   129  		w.Infof("Stopping gossip on transition to %s", w.machine.State())
   130  		cancel()
   131  	}
   132  }
   133  
   134  func (w *Watcher) startGossip() {
   135  	w.mu.Lock()
   136  	cancel := w.gossipCancel
   137  	ctx := w.runCtx
   138  	w.mu.Unlock()
   139  
   140  	if cancel != nil {
   141  		return
   142  	}
   143  
   144  	go func() {
   145  		tick := time.NewTicker(w.interval)
   146  		gCtx, cancel := context.WithCancel(ctx)
   147  
   148  		var err error
   149  
   150  		w.mu.Lock()
   151  		w.state = Running
   152  		w.gossipCancel = cancel
   153  		w.mu.Unlock()
   154  
   155  		if err != nil {
   156  			w.Errorf("Could not get a NATS connection to publish Gossip")
   157  		}
   158  
   159  		stop := func() {
   160  			w.mu.Lock()
   161  			w.gossipCancel = nil
   162  			w.state = Stopped
   163  			tick.Stop()
   164  			w.mu.Unlock()
   165  		}
   166  
   167  		publish := func() {
   168  			if !w.ShouldWatch() {
   169  				return
   170  			}
   171  
   172  			w.Infof("Gossiping while in state %v", w.machine.State())
   173  			nc, err := w.getConn()
   174  			if err != nil {
   175  				w.Errorf("Could not get NATS connection: %v", err)
   176  				return
   177  			}
   178  
   179  			subject, err := w.ProcessTemplate(w.properties.Subject)
   180  			if err != nil {
   181  				w.Errorf("Could not template parse subject: %v", err)
   182  				return
   183  			}
   184  
   185  			payload, err := w.ProcessTemplate(w.properties.Payload)
   186  			if err != nil {
   187  				w.Errorf("Could not template parse payload: %v", err)
   188  				return
   189  			}
   190  
   191  			w.Debugf("Publishing gossip to %s", subject)
   192  			nc.Publish(subject, []byte(payload))
   193  
   194  			w.mu.Lock()
   195  			w.lastGossip = time.Now()
   196  			w.lastSubject = subject
   197  			w.lastPayload = payload
   198  			w.mu.Unlock()
   199  		}
   200  
   201  		publish()
   202  
   203  		for {
   204  			select {
   205  			case <-tick.C:
   206  				publish()
   207  			case <-gCtx.Done():
   208  				stop()
   209  				return
   210  			case <-w.terminate:
   211  				stop()
   212  				return
   213  			}
   214  		}
   215  	}()
   216  }
   217  
   218  func (w *Watcher) watch() {
   219  	if !w.ShouldWatch() {
   220  		w.stopGossip()
   221  		return
   222  	}
   223  
   224  	w.Infof("Starting gossip timer")
   225  	w.startGossip()
   226  }
   227  
   228  func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) {
   229  	defer wg.Done()
   230  
   231  	w.mu.Lock()
   232  	w.runCtx = ctx
   233  	w.mu.Unlock()
   234  
   235  	w.Infof("Gossip watcher starting with subject %q on interval %v", w.properties.Subject, w.interval)
   236  
   237  	w.watch()
   238  
   239  	for {
   240  		select {
   241  		case <-w.StateChangeC():
   242  			w.watch()
   243  
   244  		case <-w.terminate:
   245  			w.Infof("Handling terminate notification")
   246  			return
   247  		case <-ctx.Done():
   248  			w.Infof("Stopping on context interrupt")
   249  			return
   250  		}
   251  	}
   252  }
   253  
   254  func (w *Watcher) setProperties(props map[string]any) error {
   255  	if w.properties == nil {
   256  		w.properties = &properties{}
   257  	}
   258  
   259  	err := util.ParseMapStructure(props, w.properties)
   260  	if err != nil {
   261  		return err
   262  	}
   263  
   264  	return w.validate()
   265  }
   266  
   267  func (w *Watcher) validate() error {
   268  	switch {
   269  	case w.properties.Registration == nil:
   270  		if w.properties.Subject == "" {
   271  			return fmt.Errorf("subject is required")
   272  		}
   273  		if w.properties.Payload == "" {
   274  			return fmt.Errorf("payload is required")
   275  		}
   276  	default:
   277  		if w.properties.Subject != "" {
   278  			return fmt.Errorf("subject cannot be set with registration")
   279  		}
   280  		if w.properties.Payload != "" {
   281  			return fmt.Errorf("payload cannot be set with registration")
   282  		}
   283  		reg := w.properties.Registration
   284  		if reg.Cluster == "" {
   285  			return fmt.Errorf("cluster is required")
   286  		}
   287  		if !validServiceRegex.MatchString(reg.Cluster) {
   288  			return fmt.Errorf("invalid cluster")
   289  		}
   290  		if reg.Service == "" {
   291  			return fmt.Errorf("service is required")
   292  		}
   293  		if !validServiceRegex.MatchString(reg.Service) {
   294  			return fmt.Errorf("invalid service")
   295  		}
   296  		if reg.Protocol == "" {
   297  			return fmt.Errorf("protocol is required")
   298  		}
   299  		if !validServiceRegex.MatchString(reg.Protocol) {
   300  			return fmt.Errorf("invalid protocol")
   301  		}
   302  		if reg.IP == "" {
   303  			return fmt.Errorf("ip is required")
   304  		}
   305  		if net.ParseIP(reg.IP) == nil {
   306  			return fmt.Errorf("invalid ip")
   307  		}
   308  		if reg.Port == 0 {
   309  			return fmt.Errorf("port is required")
   310  		}
   311  
   312  		subj := fmt.Sprintf("%s.%s.%s.%s", reg.Cluster, reg.Protocol, reg.Service, w.machine.InstanceID())
   313  		if reg.Prefix == "" {
   314  			w.properties.Subject = fmt.Sprintf("$KV.CHORIA_SERVICES.%s", subj)
   315  		} else {
   316  			w.properties.Subject = fmt.Sprintf("%s.%s", reg.Prefix, subj)
   317  		}
   318  
   319  		if strings.ContainsAny(w.properties.Subject, " ^*") || strings.Contains(w.properties.Subject, "..") {
   320  			return fmt.Errorf("invalid registration properties")
   321  		}
   322  
   323  		pj, err := json.Marshal(w.properties.Registration)
   324  		if err != nil {
   325  			return err
   326  		}
   327  		w.properties.Payload = string(pj)
   328  	}
   329  
   330  	if w.interval == 0 {
   331  		w.interval = 15 * time.Second
   332  	}
   333  
   334  	return nil
   335  }
   336  
   337  func (w *Watcher) Delete() {
   338  	close(w.terminate)
   339  }
   340  
   341  func (w *Watcher) CurrentState() any {
   342  	w.mu.Lock()
   343  	defer w.mu.Unlock()
   344  
   345  	s := &StateNotification{
   346  		Event:     event.New(w.name, wtype, version, w.machine),
   347  		Published: w.lastGossip.Unix(),
   348  		Payload:   w.lastPayload,
   349  		Subject:   w.lastSubject,
   350  	}
   351  
   352  	return s
   353  }