github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/server/agents/agents.go (about) 1 // Copyright (c) 2017-2021, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package agents 6 7 import ( 8 "context" 9 "encoding/json" 10 "fmt" 11 "sort" 12 "sync" 13 "time" 14 15 "github.com/choria-io/go-choria/build" 16 "github.com/choria-io/go-choria/inter" 17 "github.com/choria-io/go-choria/lifecycle" 18 "github.com/choria-io/go-choria/protocol" 19 "github.com/choria-io/go-choria/providers/data/ddl" 20 "github.com/choria-io/go-choria/statistics" 21 "github.com/sirupsen/logrus" 22 23 "github.com/choria-io/go-choria/aagent" 24 ) 25 26 // Agent is a generic choria agent 27 type Agent interface { 28 Metadata() *Metadata 29 Name() string 30 HandleMessage(context.Context, inter.Message, protocol.Request, inter.ConnectorInfo, chan *AgentReply) 31 SetServerInfo(ServerInfoSource) 32 ServerInfo() ServerInfoSource 33 ShouldActivate() bool 34 } 35 36 // ServerInfoSource provides data about a running server instance 37 type ServerInfoSource interface { 38 AgentMetadata(string) (Metadata, bool) 39 BuildInfo() *build.Info 40 Classes() []string 41 ConfigFile() string 42 ConnectedServer() string 43 DataFuncMap() (ddl.FuncMap, error) 44 Facts() json.RawMessage 45 Identity() string 46 KnownAgents() []string 47 LastProcessedMessage() time.Time 48 MachineTransition(name string, version string, path string, id string, transition string) error 49 MachinesStatus() ([]aagent.MachineState, error) 50 NewEvent(t lifecycle.Type, opts ...lifecycle.Option) error 51 PrepareForShutdown() error 52 Provisioning() bool 53 StartTime() time.Time 54 Stats() statistics.ServerStats 55 UpTime() int64 56 } 57 58 // AgentReply is a generic reply from an agent 59 type AgentReply struct { 60 Body []byte 61 Request protocol.Request 62 Message inter.Message 63 Error error 64 } 65 66 // Metadata describes an agent at a high level and is required for any agent 67 type Metadata struct { 68 License string `json:"license"` 69 Author string `json:"author"` 70 Timeout int `json:"timeout"` 71 Name string `json:"name"` 72 Version string `json:"version"` 73 URL string `json:"url"` 74 Description string `json:"description"` 75 Provider string `json:"provider,omitempty"` 76 Service bool `json:"service,omitempty"` 77 } 78 79 // Manager manages agents, handles registration, dispatches requests etc 80 type Manager struct { 81 agents map[string]Agent 82 subs map[string][]string 83 fw inter.Framework 84 log *logrus.Entry 85 mu *sync.Mutex 86 conn inter.ConnectorInfo 87 serverInfo ServerInfoSource 88 denylist []string 89 requests chan inter.ConnectorMessage 90 servicesOnly bool 91 } 92 93 // NewServices creates an agent manager restricted to service agents 94 func NewServices(requests chan inter.ConnectorMessage, fw inter.Framework, conn inter.ConnectorInfo, srv ServerInfoSource, log *logrus.Entry) *Manager { 95 m := New(requests, fw, conn, srv, log) 96 m.servicesOnly = true 97 m.log = m.log.WithField("service_host", true) 98 99 return m 100 } 101 102 // New creates a new Agent Manager 103 func New(requests chan inter.ConnectorMessage, fw inter.Framework, conn inter.ConnectorInfo, srv ServerInfoSource, log *logrus.Entry) *Manager { 104 return &Manager{ 105 agents: make(map[string]Agent), 106 subs: make(map[string][]string), 107 fw: fw, 108 log: log.WithFields(logrus.Fields{"subsystem": "agents"}), 109 mu: &sync.Mutex{}, 110 requests: requests, 111 conn: conn, 112 serverInfo: srv, 113 } 114 } 115 116 // DenyAgent adds an agent to the list of agent names not allowed to start 117 func (a *Manager) DenyAgent(agent string) { 118 a.denylist = append(a.denylist, agent) 119 } 120 121 // ReplaceAgent allows an agent manager to replace an agent that is already known, and subsscribed, with another instance to facilitate in-place upgrades 122 func (a *Manager) ReplaceAgent(name string, agent Agent) error { 123 if name == "" { 124 return fmt.Errorf("agent name is required") 125 } 126 127 err := a.validateAgent(agent) 128 if err != nil { 129 a.log.Warnf("Denying agent %q update: %v", agent.Name(), err) 130 return fmt.Errorf("invalid agent: %w", err) 131 } 132 133 if !agent.ShouldActivate() { 134 return fmt.Errorf("replacement agent is not activating due to activation checks") 135 } 136 137 md := agent.Metadata() 138 139 a.mu.Lock() 140 defer a.mu.Unlock() 141 142 ca, found := a.agents[name] 143 if !found { 144 return fmt.Errorf("agent %q is not currently known", name) 145 } 146 147 if ca.Metadata().Service != md.Service { 148 return fmt.Errorf("replacement agent cannot change service property") 149 } 150 151 a.log.Infof("Replacing agent %s of type %s with a new instance moving from version %s to %s", name, md.Name, ca.Metadata().Version, md.Version) 152 153 agent.SetServerInfo(a.serverInfo) 154 155 a.agents[name] = agent 156 157 return nil 158 } 159 160 func (a *Manager) validateAgent(agent Agent) error { 161 md := agent.Metadata() 162 163 if md.Timeout < 1 { 164 return fmt.Errorf("timeout < 1") 165 } 166 167 if md.Name == "" { 168 return fmt.Errorf("invalid metadata") 169 } 170 171 return nil 172 } 173 174 // UnregisterAgent attempts to remove interest in messages for an agent 175 // 176 // Each agent has a number of subscriptions (one per collective) so this can fail for some 177 // while working for others, in this case the agent is essentially in an unrecoverable state 178 // however the cases where unsubscribe will error are quite few in the nats client as its 179 // not being-connected dependant and we handle most errors correctly. 180 // 181 // So this function will try to unsubscribe but if it fails, it will continue and finally unload 182 // the agent, any stale subscriptions then will be dropped by the handlers so its ok. We will treat 183 // unsbuscribe errors as non terminal, only logging errors. 184 func (a *Manager) UnregisterAgent(name string, conn inter.AgentConnector) error { 185 if name == "" { 186 return fmt.Errorf("agent name is required") 187 } 188 189 a.mu.Lock() 190 defer a.mu.Unlock() 191 192 _, found := a.agents[name] 193 if !found { 194 return fmt.Errorf("unknown agent") 195 } 196 197 a.log.Debugf("Unregistering agent %v", name) 198 199 err := a.unSubscribeAgent(name, conn) 200 if err != nil { 201 a.log.Errorf("Could not unsubscribe all interest for agent %v: %v", name, err) 202 } 203 204 delete(a.agents, name) 205 delete(a.subs, name) 206 207 return nil 208 } 209 210 // RegisterAgent connects a new agent to the server instance, subscribe to all its targets etc 211 func (a *Manager) RegisterAgent(ctx context.Context, name string, agent Agent, conn inter.AgentConnector) error { 212 if name == "" { 213 return fmt.Errorf("agent name is required") 214 } 215 216 err := a.validateAgent(agent) 217 if err != nil { 218 a.log.Warnf("Denying agent %q: %v", name, err) 219 return fmt.Errorf("invalid agent: %w", err) 220 } 221 222 a.mu.Lock() 223 defer a.mu.Unlock() 224 225 if a.servicesOnly && !agent.Metadata().Service { 226 a.log.Infof("Denying non Service Agent %s", name) 227 return nil 228 } 229 230 if !agent.ShouldActivate() { 231 a.log.Infof("Agent %s not activating due to ShouldActivate checks", name) 232 return nil 233 } 234 235 if a.agentDenied(name) { 236 a.log.Infof("Denying agent %s based on agent deny list", name) 237 return nil 238 } 239 240 a.log.Infof("Registering new agent %s of type %s", name, agent.Metadata().Name) 241 242 agent.SetServerInfo(a.serverInfo) 243 244 if _, found := a.agents[name]; found { 245 return fmt.Errorf("agent %s is already registered", name) 246 } 247 248 err = a.subscribeAgent(ctx, name, agent, conn) 249 if err != nil { 250 return fmt.Errorf("could not register agent %s: %s", name, err) 251 } 252 253 a.agents[name] = agent 254 255 return nil 256 } 257 258 // KnownAgents retrieves a list of known agents 259 func (a *Manager) KnownAgents() []string { 260 a.mu.Lock() 261 defer a.mu.Unlock() 262 263 known := make([]string, 0, len(a.agents)) 264 265 for agent := range a.agents { 266 known = append(known, agent) 267 } 268 269 sort.Strings(known) 270 271 return known 272 } 273 274 func (a *Manager) agentDenied(name string) bool { 275 for _, n := range a.denylist { 276 if n == name { 277 return true 278 } 279 } 280 281 return false 282 } 283 284 func (a *Manager) unSubscribeAgent(name string, conn inter.AgentConnector) error { 285 subs, ok := a.subs[name] 286 if !ok { 287 return nil 288 } 289 290 for _, sub := range subs { 291 err := conn.Unsubscribe(sub) 292 if err != nil { 293 return err 294 } 295 } 296 297 delete(a.subs, name) 298 299 return nil 300 } 301 302 // Subscribes an agent to all its targets on the connector. Should any subscription fail 303 // all the preceding subscriptions for this agents is unsubscribed and an error returned. 304 // Errors during the unsub is just ignored because it's quite possible that they would fail 305 // too but this avoids problems of messages arriving we did not expect. 306 // 307 // In practice though this is something done during bootstrap and failure here should exit 308 // the whole instance, so it's probably not needed 309 func (a *Manager) subscribeAgent(ctx context.Context, name string, agent Agent, conn inter.AgentConnector) error { 310 if _, found := a.subs[name]; found { 311 return fmt.Errorf("could not subscribe agent %s, it's already subscribed", name) 312 } 313 314 a.subs[name] = []string{} 315 316 for _, collective := range a.fw.Configuration().Collectives { 317 var target string 318 group := "" 319 320 if agent.Metadata().Service { 321 target = conn.ServiceBroadcastTarget(collective, name) 322 group = name 323 a.log.Infof("Subscribing service agent %s to %s in group %s", name, target, group) 324 } else { 325 target = conn.AgentBroadcastTarget(collective, name) 326 a.log.Infof("Subscribing agent %s to %s", name, target) 327 } 328 329 subname := fmt.Sprintf("%s.%s", collective, name) 330 331 err := conn.QueueSubscribe(ctx, subname, target, group, a.requests) 332 if err != nil { 333 a.log.Errorf("could not subscribe agent %s to %s, rewinding all subscriptions for this agent", name, target) 334 for _, sub := range a.subs[name] { 335 conn.Unsubscribe(sub) 336 } 337 338 return fmt.Errorf("subscription failed: %s", err) 339 } 340 341 a.subs[name] = append(a.subs[name], subname) 342 } 343 344 return nil 345 } 346 347 // Get retrieves an agent by name 348 func (a *Manager) Get(name string) (Agent, bool) { 349 a.mu.Lock() 350 defer a.mu.Unlock() 351 352 agent, found := a.agents[name] 353 354 return agent, found 355 } 356 357 // Dispatch sends a request to a agent and wait for a reply 358 func (a *Manager) Dispatch(ctx context.Context, wg *sync.WaitGroup, replies chan *AgentReply, msg inter.Message, request protocol.Request) { 359 defer wg.Done() 360 361 agent, found := a.Get(msg.Agent()) 362 if !found { 363 a.log.Errorf("Received a message for agent %s that does not exist, discarding", msg.Agent()) 364 return 365 } 366 367 result := make(chan *AgentReply) 368 369 td := time.Duration(agent.Metadata().Timeout) * time.Second 370 a.log.Debugf("Handling message %s with timeout %s", msg.RequestID(), td) 371 372 timeout, cancel := context.WithTimeout(context.Background(), td) 373 defer cancel() 374 375 go agent.HandleMessage(timeout, msg, request, a.conn, result) 376 377 select { 378 case reply := <-result: 379 replies <- reply 380 case <-ctx.Done(): 381 replies <- &AgentReply{ 382 Message: msg, 383 Request: request, 384 Error: fmt.Errorf("agent dispatcher for request %s exiting on interrupt", msg.RequestID()), 385 } 386 case <-timeout.Done(): 387 replies <- &AgentReply{ 388 Message: msg, 389 Request: request, 390 Error: fmt.Errorf("agent dispatcher for request %s exiting on %ds timeout", msg.RequestID(), agent.Metadata().Timeout), 391 } 392 } 393 } 394 395 // Logger is the logger the manager prefers new agents derive from 396 func (a *Manager) Logger() *logrus.Entry { 397 return a.log 398 } 399 400 // Choria provides an instance of the choria framework 401 func (a *Manager) Choria() inter.Framework { 402 return a.fw 403 }