github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/server/internal/services/manager.go (about) 1 // Copyright 2017 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package services defines internal fleetspeak components relating to services. 16 package services 17 18 import ( 19 "context" 20 "encoding/hex" 21 "fmt" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 log "github.com/golang/glog" 27 "golang.org/x/time/rate" 28 "google.golang.org/protobuf/proto" 29 30 "github.com/google/fleetspeak/fleetspeak/src/common" 31 "github.com/google/fleetspeak/fleetspeak/src/server/db" 32 "github.com/google/fleetspeak/fleetspeak/src/server/internal/cache" 33 "github.com/google/fleetspeak/fleetspeak/src/server/internal/ftime" 34 "github.com/google/fleetspeak/fleetspeak/src/server/service" 35 "github.com/google/fleetspeak/fleetspeak/src/server/stats" 36 37 fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak" 38 spb "github.com/google/fleetspeak/fleetspeak/src/server/proto/fleetspeak_server" 39 ) 40 41 const MaxServiceFailureReasonLength = 900 42 43 // A Manager starts, remembers, and shuts down services. 44 type Manager struct { 45 services map[string]*liveService 46 dataStore db.Store 47 serviceRegistry map[string]service.Factory // Used to look up the correct factory when configuring services. 48 stats stats.Collector 49 cc *cache.Clients 50 } 51 52 // NewManager creates a new manager using the provided components. Initially it only contains the 'system' service. 53 func NewManager(dataStore db.Store, serviceRegistry map[string]service.Factory, stats stats.Collector, clientCache *cache.Clients) *Manager { 54 m := Manager{ 55 services: make(map[string]*liveService), 56 dataStore: dataStore, 57 serviceRegistry: serviceRegistry, 58 stats: stats, 59 cc: clientCache, 60 } 61 62 ssd := liveService{ 63 manager: &m, 64 name: "system", 65 maxParallelism: 100, 66 pLogLimiter: rate.NewLimiter(rate.Every(10*time.Second), 1), 67 } 68 ss := systemService{ 69 sctx: &ssd, 70 stats: stats, 71 datastore: dataStore, 72 cc: clientCache, 73 } 74 ssd.service = &ss 75 m.services["system"] = &ssd 76 ss.Start(&ssd) 77 78 return &m 79 } 80 81 // clientData returns client data corresponding to client that is the source of the given message. 82 func (c *Manager) clientData(ctx context.Context, m *fspb.Message) (*db.ClientData, error) { 83 cID, err := common.BytesToClientID(m.Source.ClientId) 84 if err != nil || cID.IsNil() { 85 return nil, fmt.Errorf("invalid source client id[%v]: %v", m.Source.ClientId, err) 86 } 87 88 cData, _, err := c.cc.GetOrRead(ctx, cID, c.dataStore) 89 if err != nil { 90 return nil, fmt.Errorf("can't get client data for id[%v]: %v", cID, err) 91 } 92 93 return cData, nil 94 } 95 96 // Install adds a service to the configuration, removing any existing service with 97 // the same name. 98 func (c *Manager) Install(cfg *spb.ServiceConfig) error { 99 cfg = proto.Clone(cfg).(*spb.ServiceConfig) 100 101 f := c.serviceRegistry[cfg.Factory] 102 if f == nil { 103 return fmt.Errorf("unable to find factory [%v]", cfg.Factory) 104 } 105 // "system" is a special service handling configuration and other 106 // message passing for Fleetspeak itself. "client" is the service name 107 // used for labels set by (and known by) the base Fleetspeak client 108 // itself. 109 if cfg.Name == "" || cfg.Name == "system" || cfg.Name == "client" { 110 return fmt.Errorf("illegal service name [%v]", cfg.Name) 111 } 112 113 s, err := f(cfg) 114 if err != nil { 115 return err 116 } 117 118 if cfg.MaxParallelism == 0 { 119 cfg.MaxParallelism = 100 120 } 121 122 d := liveService{ 123 manager: c, 124 name: cfg.Name, 125 service: s, 126 127 maxParallelism: cfg.MaxParallelism, 128 pLogLimiter: rate.NewLimiter(rate.Every(10*time.Second), 1), 129 } 130 131 if err = s.Start(&d); err != nil { 132 return err 133 } 134 c.services[cfg.Name] = &d 135 136 log.Infof("Installed %v service.", cfg.Name) 137 return nil 138 } 139 140 // Stop closes and removes all services in the configuration. 141 func (c *Manager) Stop() { 142 for _, d := range c.services { 143 d.stop() 144 } 145 c.services = map[string]*liveService{} 146 } 147 148 // ProcessMessages implements MessageProcessor and is called by the datastore on 149 // backlogged messages. 150 func (c *Manager) ProcessMessages(msgs []*fspb.Message) { 151 ctx, fin := context.WithTimeout(context.Background(), 30*time.Second) 152 153 hasResult := make([]bool, len(msgs)) 154 155 var working sync.WaitGroup 156 working.Add(len(msgs)) 157 158 for idx, msg := range msgs { 159 i, m := idx, msg 160 go func() { 161 defer working.Done() 162 l := c.services[m.Destination.ServiceName] 163 if l == nil { 164 log.Errorf("Message in datastore [%v] is for unknown service [%s].", hex.EncodeToString(m.MessageId), m.Destination.ServiceName) 165 return 166 } 167 cData, err := c.clientData(ctx, m) 168 if err != nil { 169 log.Warningf("Message in datastore [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err) 170 } 171 172 c.stats.MessageIngested(true, m, cData) 173 res := l.processMessage(ctx, m, false) 174 if res != nil { 175 hasResult[i] = true 176 m.Result = res 177 } 178 }() 179 } 180 working.Wait() 181 fin() 182 183 toSave := make([]*fspb.Message, 0, len(msgs)) 184 for i, m := range msgs { 185 if hasResult[i] { 186 toSave = append(toSave, m) 187 } 188 } 189 if len(toSave) == 0 { 190 return 191 } 192 ctx, fin = context.WithTimeout(context.Background(), 15*time.Second) 193 defer fin() 194 if err := c.dataStore.StoreMessages(ctx, toSave, ""); err != nil { 195 log.Errorf("Error saving results for %d messages: %v", len(toSave), err) 196 } 197 } 198 199 // processMessage attempts to processes m, returning a fspb.MessageResult. It 200 // also updates stats, calling exactly one of MessageDropped, MessageFailed, 201 // MessageProcessed. 202 func (s *liveService) processMessage(ctx context.Context, m *fspb.Message, isFirstTry bool) *fspb.MessageResult { 203 cData, err := s.manager.clientData(ctx, m) 204 if err != nil { 205 log.Warningf("Couldn't fetch client data for the message: %v", err) 206 } 207 208 if cData == nil { 209 log.Warningf("Can't annotate message with blocklisted status [service=%s] as client data couldn't be fetched.", s.name) 210 } else { 211 m.IsBlocklistedSource = cData.Blacklisted 212 } 213 214 p := atomic.AddUint32(&s.parallelism, 1) 215 // Documented decrement operation. 216 // https://golang.org/pkg/sync/atomic/#AddUint32 217 defer atomic.AddUint32(&s.parallelism, ^uint32(0)) 218 if p > s.maxParallelism { 219 if s.pLogLimiter.Allow() { 220 log.Warningf("%s: Overloaded with %d concurrent messages, dropping excess, will retry.", s.name, s.maxParallelism) 221 } 222 s.manager.stats.MessageDropped(m, isFirstTry, cData) 223 return nil 224 } 225 226 mid, err := common.BytesToMessageID(m.MessageId) 227 if err != nil || mid.IsNil() { 228 // message id should be validated before it gets to us. 229 log.Fatalf("Invalid message id presented for processing: %v, %v", m.MessageId, err) 230 } 231 232 start := ftime.Now() 233 e := s.service.ProcessMessage(ctx, m) 234 switch { 235 case e == nil: 236 s.manager.stats.MessageProcessed(start, ftime.Now(), m, isFirstTry, cData) 237 return &fspb.MessageResult{ProcessedTime: db.NowProto()} 238 case service.IsTemporary(e): 239 s.manager.stats.MessageErrored(start, ftime.Now(), true, m, isFirstTry, cData) 240 log.Warningf("%s: Temporary error processing message %v, will retry: %v", s.name, mid, e) 241 return nil 242 case !service.IsTemporary(e): 243 s.manager.stats.MessageErrored(start, ftime.Now(), false, m, isFirstTry, cData) 244 log.Errorf("%s: Permanent error processing message %v, giving up: %v", s.name, mid, e) 245 failedReason := e.Error() 246 if len(failedReason) > MaxServiceFailureReasonLength { 247 failedReason = failedReason[:MaxServiceFailureReasonLength-3] + "..." 248 } 249 return &fspb.MessageResult{ 250 ProcessedTime: db.NowProto(), 251 Failed: true, 252 FailedReason: failedReason, 253 } 254 } 255 log.Fatal("Error is neither temporary or permanent.") 256 return nil 257 } 258 259 // HandleNewMessages handles newly arrived messages that should be processed on 260 // the fleetspeak server. This handling includes validating that we recognize 261 // its ServiceNames, saving the messages to the datastore and attempting to 262 // process them. 263 func (c *Manager) HandleNewMessages(ctx context.Context, msgs []*fspb.Message, contact db.ContactID) error { 264 now := db.NowProto() 265 for _, m := range msgs { 266 if m.Destination == nil || len(m.Destination.ClientId) != 0 { 267 return fmt.Errorf("HandleNewMessage called with bad Destination: %v", m.Destination) 268 } 269 m.CreationTime = now 270 } 271 272 // Try to processes all the messages in parallel, with a 30 second timeout. 273 ctx1, fin1 := context.WithTimeout(ctx, 30*time.Second) 274 var wg sync.WaitGroup 275 wg.Add(len(msgs)) 276 for _, msg := range msgs { 277 m := msg 278 go func() { 279 defer wg.Done() 280 l := c.services[m.Destination.ServiceName] 281 if l == nil { 282 log.Errorf("Received new message [%v] for unknown service [%s].", hex.EncodeToString(m.MessageId), m.Destination.ServiceName) 283 return 284 } 285 286 cData, err := c.clientData(ctx1, m) 287 if err != nil { 288 log.Warningf("Can't get client data for message [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err) 289 } 290 c.stats.MessageIngested(false, m, cData) 291 292 res := l.processMessage(ctx1, m, true) 293 if res == nil { 294 return 295 } 296 m.Result = res 297 m.Data = nil 298 }() 299 } 300 wg.Wait() 301 fin1() 302 303 if ctx.Err() != nil { 304 return ctx.Err() 305 } 306 307 ctx2, fin2 := context.WithTimeout(ctx, 30*time.Second) 308 defer fin2() 309 310 // Record that we are saving messages. 311 for _, m := range msgs { 312 cData, err := c.clientData(ctx2, m) 313 if err != nil { 314 log.Warningf("Can't get client data for message [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err) 315 } 316 317 c.stats.MessageSaved(false, m, cData) 318 } 319 320 return c.dataStore.StoreMessages(ctx2, msgs, contact) 321 } 322 323 // A liveService is a running Service, including implementation provided by the 324 // associated ServiceFactory and bookkeeping structures and methods. 325 type liveService struct { 326 manager *Manager 327 name string 328 service service.Service 329 330 parallelism uint32 // Current number of calls, used for load shedding. atomic access only. 331 maxParallelism uint32 332 pLogLimiter *rate.Limiter 333 } 334 335 func (s *liveService) stop() { 336 if err := s.service.Stop(); err != nil { 337 log.Errorf("Error shutting down service [%v]: %v", s.name, err) 338 } 339 } 340 341 // Send implements service.Context. 342 func (s *liveService) Send(ctx context.Context, m *fspb.Message) error { 343 m.Source = &fspb.Address{ServiceName: s.name} 344 if len(m.Destination.ClientId) == 0 { 345 return s.manager.HandleNewMessages(ctx, []*fspb.Message{m}, "") 346 } 347 348 return s.manager.dataStore.StoreMessages(ctx, []*fspb.Message{m}, "") 349 } 350 351 // GetClientData implements service.Context. 352 func (s *liveService) GetClientData(ctx context.Context, id common.ClientID) (*db.ClientData, error) { 353 cd, _, err := s.manager.cc.GetOrRead(ctx, id, s.manager.dataStore) 354 if err != nil { 355 return nil, err 356 } 357 return cd, nil 358 }