github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/server/agents/agents.go (about)

     1  // Copyright (c) 2017-2021, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package agents
     6  
     7  import (
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"sort"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/choria-io/go-choria/build"
    16  	"github.com/choria-io/go-choria/inter"
    17  	"github.com/choria-io/go-choria/lifecycle"
    18  	"github.com/choria-io/go-choria/protocol"
    19  	"github.com/choria-io/go-choria/providers/data/ddl"
    20  	"github.com/choria-io/go-choria/statistics"
    21  	"github.com/sirupsen/logrus"
    22  
    23  	"github.com/choria-io/go-choria/aagent"
    24  )
    25  
// Agent is a generic choria agent
//
// It is the contract the Manager relies on to validate, register, replace and
// dispatch requests to an agent.
type Agent interface {
	// Metadata describes the agent. The Manager reads Timeout, Name, Version
	// and Service from it.
	Metadata() *Metadata
	// Name is the agent name; the Manager uses it when logging a rejected replacement.
	Name() string
	// HandleMessage processes one request. Dispatch runs it in its own goroutine
	// with a context bounded by Metadata().Timeout seconds and waits for a reply
	// on the supplied channel.
	HandleMessage(context.Context, inter.Message, protocol.Request, inter.ConnectorInfo, chan *AgentReply)
	// SetServerInfo hands the agent the server information source. The Manager
	// calls it when an agent is registered or replaced.
	SetServerInfo(ServerInfoSource)
	// ServerInfo presumably returns the source given to SetServerInfo - confirm against implementations.
	ServerInfo() ServerInfoSource
	// ShouldActivate reports whether the agent wants to be activated; agents
	// returning false are not registered and cannot replace a running agent.
	ShouldActivate() bool
}
    35  
// ServerInfoSource provides data about a running server instance
//
// The Manager passes an implementation to every agent through SetServerInfo.
// NOTE(review): the implementation is not part of this file, the exact semantics
// of each method should be confirmed against it.
type ServerInfoSource interface {
	AgentMetadata(string) (Metadata, bool)
	BuildInfo() *build.Info
	Classes() []string
	ConfigFile() string
	ConnectedServer() string
	DataFuncMap() (ddl.FuncMap, error)
	Facts() json.RawMessage
	Identity() string
	KnownAgents() []string
	LastProcessedMessage() time.Time
	MachineTransition(name string, version string, path string, id string, transition string) error
	MachinesStatus() ([]aagent.MachineState, error)
	NewEvent(t lifecycle.Type, opts ...lifecycle.Option) error
	PrepareForShutdown() error
	Provisioning() bool
	StartTime() time.Time
	Stats() statistics.ServerStats
	UpTime() int64
}
    57  
// AgentReply is a generic reply from an agent
//
// Dispatch forwards replies produced by agents and creates its own, with only
// Message, Request and Error set, when a request is interrupted or times out.
type AgentReply struct {
	// Body is presumably the raw reply payload produced by the agent - confirm with agent implementations.
	Body []byte
	// Request is the request this reply relates to.
	Request protocol.Request
	// Message is the message this reply relates to.
	Message inter.Message
	// Error is set when handling failed, including dispatcher timeouts and interrupts.
	Error error
}
    65  
// Metadata describes an agent at a high level and is required for any agent
type Metadata struct {
	License string `json:"license"`
	Author  string `json:"author"`
	// Timeout is how long, in seconds, a request may run before the dispatcher
	// gives up; agents with a timeout below 1 are rejected.
	Timeout int `json:"timeout"`
	// Name must not be empty or the agent is rejected.
	Name        string `json:"name"`
	Version     string `json:"version"`
	URL         string `json:"url"`
	Description string `json:"description"`
	// Provider presumably names the system providing the agent - confirm with producers of this data.
	Provider string `json:"provider,omitempty"`
	// Service marks a service agent: it is subscribed to the service broadcast
	// target in a queue group named after the agent, and this property cannot
	// change when an agent is replaced.
	Service bool `json:"service,omitempty"`
}
    78  
// Manager manages agents, handles registration, dispatches requests etc
type Manager struct {
	// agents holds the registered agents keyed by the name they were registered under.
	agents map[string]Agent
	// subs holds, per agent name, the subscription names created on the connector.
	subs map[string][]string
	// fw is the framework, used here to find the configured collectives.
	fw inter.Framework
	// log is the manager logger, tagged with the agents subsystem.
	log *logrus.Entry
	// mu is held by the exported methods that read or modify agents and subs.
	mu *sync.Mutex
	// conn is handed to agents with every dispatched message.
	conn inter.ConnectorInfo
	// serverInfo is given to agents when they are registered or replaced.
	serverInfo ServerInfoSource
	// denylist lists agent names that RegisterAgent will refuse to start.
	denylist []string
	// requests is the channel agent subscriptions deliver incoming messages to.
	requests chan inter.ConnectorMessage
	// servicesOnly makes RegisterAgent skip agents that are not service agents.
	servicesOnly bool
}
    92  
    93  // NewServices creates an agent manager restricted to service agents
    94  func NewServices(requests chan inter.ConnectorMessage, fw inter.Framework, conn inter.ConnectorInfo, srv ServerInfoSource, log *logrus.Entry) *Manager {
    95  	m := New(requests, fw, conn, srv, log)
    96  	m.servicesOnly = true
    97  	m.log = m.log.WithField("service_host", true)
    98  
    99  	return m
   100  }
   101  
   102  // New creates a new Agent Manager
   103  func New(requests chan inter.ConnectorMessage, fw inter.Framework, conn inter.ConnectorInfo, srv ServerInfoSource, log *logrus.Entry) *Manager {
   104  	return &Manager{
   105  		agents:     make(map[string]Agent),
   106  		subs:       make(map[string][]string),
   107  		fw:         fw,
   108  		log:        log.WithFields(logrus.Fields{"subsystem": "agents"}),
   109  		mu:         &sync.Mutex{},
   110  		requests:   requests,
   111  		conn:       conn,
   112  		serverInfo: srv,
   113  	}
   114  }
   115  
   116  // DenyAgent adds an agent to the list of agent names not allowed to start
   117  func (a *Manager) DenyAgent(agent string) {
   118  	a.denylist = append(a.denylist, agent)
   119  }
   120  
   121  // ReplaceAgent allows an agent manager to replace an agent that is already known, and subsscribed, with another instance to facilitate in-place upgrades
   122  func (a *Manager) ReplaceAgent(name string, agent Agent) error {
   123  	if name == "" {
   124  		return fmt.Errorf("agent name is required")
   125  	}
   126  
   127  	err := a.validateAgent(agent)
   128  	if err != nil {
   129  		a.log.Warnf("Denying agent %q update: %v", agent.Name(), err)
   130  		return fmt.Errorf("invalid agent: %w", err)
   131  	}
   132  
   133  	if !agent.ShouldActivate() {
   134  		return fmt.Errorf("replacement agent is not activating due to activation checks")
   135  	}
   136  
   137  	md := agent.Metadata()
   138  
   139  	a.mu.Lock()
   140  	defer a.mu.Unlock()
   141  
   142  	ca, found := a.agents[name]
   143  	if !found {
   144  		return fmt.Errorf("agent %q is not currently known", name)
   145  	}
   146  
   147  	if ca.Metadata().Service != md.Service {
   148  		return fmt.Errorf("replacement agent cannot change service property")
   149  	}
   150  
   151  	a.log.Infof("Replacing agent %s of type %s with a new instance moving from version %s to %s", name, md.Name, ca.Metadata().Version, md.Version)
   152  
   153  	agent.SetServerInfo(a.serverInfo)
   154  
   155  	a.agents[name] = agent
   156  
   157  	return nil
   158  }
   159  
   160  func (a *Manager) validateAgent(agent Agent) error {
   161  	md := agent.Metadata()
   162  
   163  	if md.Timeout < 1 {
   164  		return fmt.Errorf("timeout < 1")
   165  	}
   166  
   167  	if md.Name == "" {
   168  		return fmt.Errorf("invalid metadata")
   169  	}
   170  
   171  	return nil
   172  }
   173  
   174  // UnregisterAgent attempts to remove interest in messages for an agent
   175  //
   176  // Each agent has a number of subscriptions (one per collective) so this can fail for some
   177  // while working for others, in this case the agent is essentially in an unrecoverable state
   178  // however the cases where unsubscribe will error are quite few in the nats client as its
   179  // not being-connected dependant and we handle most errors correctly.
   180  //
   181  // So this function will try to unsubscribe but if it fails, it will continue and finally unload
   182  // the agent, any stale subscriptions then will be dropped by the handlers so its ok. We will treat
   183  // unsbuscribe errors as non terminal, only logging errors.
   184  func (a *Manager) UnregisterAgent(name string, conn inter.AgentConnector) error {
   185  	if name == "" {
   186  		return fmt.Errorf("agent name is required")
   187  	}
   188  
   189  	a.mu.Lock()
   190  	defer a.mu.Unlock()
   191  
   192  	_, found := a.agents[name]
   193  	if !found {
   194  		return fmt.Errorf("unknown agent")
   195  	}
   196  
   197  	a.log.Debugf("Unregistering agent %v", name)
   198  
   199  	err := a.unSubscribeAgent(name, conn)
   200  	if err != nil {
   201  		a.log.Errorf("Could not unsubscribe all interest for agent %v: %v", name, err)
   202  	}
   203  
   204  	delete(a.agents, name)
   205  	delete(a.subs, name)
   206  
   207  	return nil
   208  }
   209  
   210  // RegisterAgent connects a new agent to the server instance, subscribe to all its targets etc
   211  func (a *Manager) RegisterAgent(ctx context.Context, name string, agent Agent, conn inter.AgentConnector) error {
   212  	if name == "" {
   213  		return fmt.Errorf("agent name is required")
   214  	}
   215  
   216  	err := a.validateAgent(agent)
   217  	if err != nil {
   218  		a.log.Warnf("Denying agent %q: %v", name, err)
   219  		return fmt.Errorf("invalid agent: %w", err)
   220  	}
   221  
   222  	a.mu.Lock()
   223  	defer a.mu.Unlock()
   224  
   225  	if a.servicesOnly && !agent.Metadata().Service {
   226  		a.log.Infof("Denying non Service Agent %s", name)
   227  		return nil
   228  	}
   229  
   230  	if !agent.ShouldActivate() {
   231  		a.log.Infof("Agent %s not activating due to ShouldActivate checks", name)
   232  		return nil
   233  	}
   234  
   235  	if a.agentDenied(name) {
   236  		a.log.Infof("Denying agent %s based on agent deny list", name)
   237  		return nil
   238  	}
   239  
   240  	a.log.Infof("Registering new agent %s of type %s", name, agent.Metadata().Name)
   241  
   242  	agent.SetServerInfo(a.serverInfo)
   243  
   244  	if _, found := a.agents[name]; found {
   245  		return fmt.Errorf("agent %s is already registered", name)
   246  	}
   247  
   248  	err = a.subscribeAgent(ctx, name, agent, conn)
   249  	if err != nil {
   250  		return fmt.Errorf("could not register agent %s: %s", name, err)
   251  	}
   252  
   253  	a.agents[name] = agent
   254  
   255  	return nil
   256  }
   257  
   258  // KnownAgents retrieves a list of known agents
   259  func (a *Manager) KnownAgents() []string {
   260  	a.mu.Lock()
   261  	defer a.mu.Unlock()
   262  
   263  	known := make([]string, 0, len(a.agents))
   264  
   265  	for agent := range a.agents {
   266  		known = append(known, agent)
   267  	}
   268  
   269  	sort.Strings(known)
   270  
   271  	return known
   272  }
   273  
   274  func (a *Manager) agentDenied(name string) bool {
   275  	for _, n := range a.denylist {
   276  		if n == name {
   277  			return true
   278  		}
   279  	}
   280  
   281  	return false
   282  }
   283  
   284  func (a *Manager) unSubscribeAgent(name string, conn inter.AgentConnector) error {
   285  	subs, ok := a.subs[name]
   286  	if !ok {
   287  		return nil
   288  	}
   289  
   290  	for _, sub := range subs {
   291  		err := conn.Unsubscribe(sub)
   292  		if err != nil {
   293  			return err
   294  		}
   295  	}
   296  
   297  	delete(a.subs, name)
   298  
   299  	return nil
   300  }
   301  
   302  // Subscribes an agent to all its targets on the connector.  Should any subscription fail
   303  // all the preceding subscriptions for this agents is unsubscribed and an error returned.
   304  // Errors during the unsub is just ignored because it's quite possible that they would fail
   305  // too but this avoids problems of messages arriving we did not expect.
   306  //
   307  // In practice though this is something done during bootstrap and failure here should exit
   308  // the whole instance, so it's probably not needed
   309  func (a *Manager) subscribeAgent(ctx context.Context, name string, agent Agent, conn inter.AgentConnector) error {
   310  	if _, found := a.subs[name]; found {
   311  		return fmt.Errorf("could not subscribe agent %s, it's already subscribed", name)
   312  	}
   313  
   314  	a.subs[name] = []string{}
   315  
   316  	for _, collective := range a.fw.Configuration().Collectives {
   317  		var target string
   318  		group := ""
   319  
   320  		if agent.Metadata().Service {
   321  			target = conn.ServiceBroadcastTarget(collective, name)
   322  			group = name
   323  			a.log.Infof("Subscribing service agent %s to %s in group %s", name, target, group)
   324  		} else {
   325  			target = conn.AgentBroadcastTarget(collective, name)
   326  			a.log.Infof("Subscribing agent %s to %s", name, target)
   327  		}
   328  
   329  		subname := fmt.Sprintf("%s.%s", collective, name)
   330  
   331  		err := conn.QueueSubscribe(ctx, subname, target, group, a.requests)
   332  		if err != nil {
   333  			a.log.Errorf("could not subscribe agent %s to %s, rewinding all subscriptions for this agent", name, target)
   334  			for _, sub := range a.subs[name] {
   335  				conn.Unsubscribe(sub)
   336  			}
   337  
   338  			return fmt.Errorf("subscription failed: %s", err)
   339  		}
   340  
   341  		a.subs[name] = append(a.subs[name], subname)
   342  	}
   343  
   344  	return nil
   345  }
   346  
   347  // Get retrieves an agent by name
   348  func (a *Manager) Get(name string) (Agent, bool) {
   349  	a.mu.Lock()
   350  	defer a.mu.Unlock()
   351  
   352  	agent, found := a.agents[name]
   353  
   354  	return agent, found
   355  }
   356  
   357  // Dispatch sends a request to a agent and wait for a reply
   358  func (a *Manager) Dispatch(ctx context.Context, wg *sync.WaitGroup, replies chan *AgentReply, msg inter.Message, request protocol.Request) {
   359  	defer wg.Done()
   360  
   361  	agent, found := a.Get(msg.Agent())
   362  	if !found {
   363  		a.log.Errorf("Received a message for agent %s that does not exist, discarding", msg.Agent())
   364  		return
   365  	}
   366  
   367  	result := make(chan *AgentReply)
   368  
   369  	td := time.Duration(agent.Metadata().Timeout) * time.Second
   370  	a.log.Debugf("Handling message %s with timeout %s", msg.RequestID(), td)
   371  
   372  	timeout, cancel := context.WithTimeout(context.Background(), td)
   373  	defer cancel()
   374  
   375  	go agent.HandleMessage(timeout, msg, request, a.conn, result)
   376  
   377  	select {
   378  	case reply := <-result:
   379  		replies <- reply
   380  	case <-ctx.Done():
   381  		replies <- &AgentReply{
   382  			Message: msg,
   383  			Request: request,
   384  			Error:   fmt.Errorf("agent dispatcher for request %s exiting on interrupt", msg.RequestID()),
   385  		}
   386  	case <-timeout.Done():
   387  		replies <- &AgentReply{
   388  			Message: msg,
   389  			Request: request,
   390  			Error:   fmt.Errorf("agent dispatcher for request %s exiting on %ds timeout", msg.RequestID(), agent.Metadata().Timeout),
   391  		}
   392  	}
   393  }
   394  
   395  // Logger is the logger the manager prefers new agents derive from
   396  func (a *Manager) Logger() *logrus.Entry {
   397  	return a.log
   398  }
   399  
   400  // Choria provides an instance of the choria framework
   401  func (a *Manager) Choria() inter.Framework {
   402  	return a.fw
   403  }