github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/server/internal/services/manager.go (about)

     1  // Copyright 2017 Google Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package services defines internal fleetspeak components relating to services.
    16  package services
    17  
    18  import (
    19  	"context"
    20  	"encoding/hex"
    21  	"fmt"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	log "github.com/golang/glog"
    27  	"golang.org/x/time/rate"
    28  	"google.golang.org/protobuf/proto"
    29  
    30  	"github.com/google/fleetspeak/fleetspeak/src/common"
    31  	"github.com/google/fleetspeak/fleetspeak/src/server/db"
    32  	"github.com/google/fleetspeak/fleetspeak/src/server/internal/cache"
    33  	"github.com/google/fleetspeak/fleetspeak/src/server/internal/ftime"
    34  	"github.com/google/fleetspeak/fleetspeak/src/server/service"
    35  	"github.com/google/fleetspeak/fleetspeak/src/server/stats"
    36  
    37  	fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak"
    38  	spb "github.com/google/fleetspeak/fleetspeak/src/server/proto/fleetspeak_server"
    39  )
    40  
// MaxServiceFailureReasonLength is the maximum length, in bytes, of the
// FailedReason recorded in a MessageResult when a service permanently fails to
// process a message. Longer error texts are shortened to fit and end in "...".
const MaxServiceFailureReasonLength = 900
    42  
// A Manager starts, remembers, and shuts down services.
//
// services maps a service name to its running liveService. It is populated by
// NewManager (the "system" service) and Install, and replaced with an empty map
// by Stop. Nothing in this file guards the map with a lock, while
// ProcessMessages and HandleNewMessages read it from multiple goroutines.
//
// dataStore and cc are used together to look up the client data of a message's
// source; dataStore is also where message results and new messages are stored.
// stats receives the per-message events (ingested, processed, errored, dropped,
// saved) reported while messages are handled.
type Manager struct {
	services        map[string]*liveService
	dataStore       db.Store
	serviceRegistry map[string]service.Factory // Used to look up the correct factory when configuring services.
	stats           stats.Collector
	cc              *cache.Clients
}
    51  
    52  // NewManager creates a new manager using the provided components. Initially it only contains the 'system' service.
    53  func NewManager(dataStore db.Store, serviceRegistry map[string]service.Factory, stats stats.Collector, clientCache *cache.Clients) *Manager {
    54  	m := Manager{
    55  		services:        make(map[string]*liveService),
    56  		dataStore:       dataStore,
    57  		serviceRegistry: serviceRegistry,
    58  		stats:           stats,
    59  		cc:              clientCache,
    60  	}
    61  
    62  	ssd := liveService{
    63  		manager:        &m,
    64  		name:           "system",
    65  		maxParallelism: 100,
    66  		pLogLimiter:    rate.NewLimiter(rate.Every(10*time.Second), 1),
    67  	}
    68  	ss := systemService{
    69  		sctx:      &ssd,
    70  		stats:     stats,
    71  		datastore: dataStore,
    72  		cc:        clientCache,
    73  	}
    74  	ssd.service = &ss
    75  	m.services["system"] = &ssd
    76  	ss.Start(&ssd)
    77  
    78  	return &m
    79  }
    80  
    81  // clientData returns client data corresponding to client that is the source of the given message.
    82  func (c *Manager) clientData(ctx context.Context, m *fspb.Message) (*db.ClientData, error) {
    83  	cID, err := common.BytesToClientID(m.Source.ClientId)
    84  	if err != nil || cID.IsNil() {
    85  		return nil, fmt.Errorf("invalid source client id[%v]: %v", m.Source.ClientId, err)
    86  	}
    87  
    88  	cData, _, err := c.cc.GetOrRead(ctx, cID, c.dataStore)
    89  	if err != nil {
    90  		return nil, fmt.Errorf("can't get client data for id[%v]: %v", cID, err)
    91  	}
    92  
    93  	return cData, nil
    94  }
    95  
    96  // Install adds a service to the configuration, removing any existing service with
    97  // the same name.
    98  func (c *Manager) Install(cfg *spb.ServiceConfig) error {
    99  	cfg = proto.Clone(cfg).(*spb.ServiceConfig)
   100  
   101  	f := c.serviceRegistry[cfg.Factory]
   102  	if f == nil {
   103  		return fmt.Errorf("unable to find factory [%v]", cfg.Factory)
   104  	}
   105  	// "system" is a special service handling configuration and other
   106  	// message passing for Fleetspeak itself. "client" is the service name
   107  	// used for labels set by (and known by) the base Fleetspeak client
   108  	// itself.
   109  	if cfg.Name == "" || cfg.Name == "system" || cfg.Name == "client" {
   110  		return fmt.Errorf("illegal service name [%v]", cfg.Name)
   111  	}
   112  
   113  	s, err := f(cfg)
   114  	if err != nil {
   115  		return err
   116  	}
   117  
   118  	if cfg.MaxParallelism == 0 {
   119  		cfg.MaxParallelism = 100
   120  	}
   121  
   122  	d := liveService{
   123  		manager: c,
   124  		name:    cfg.Name,
   125  		service: s,
   126  
   127  		maxParallelism: cfg.MaxParallelism,
   128  		pLogLimiter:    rate.NewLimiter(rate.Every(10*time.Second), 1),
   129  	}
   130  
   131  	if err = s.Start(&d); err != nil {
   132  		return err
   133  	}
   134  	c.services[cfg.Name] = &d
   135  
   136  	log.Infof("Installed %v service.", cfg.Name)
   137  	return nil
   138  }
   139  
   140  // Stop closes and removes all services in the configuration.
   141  func (c *Manager) Stop() {
   142  	for _, d := range c.services {
   143  		d.stop()
   144  	}
   145  	c.services = map[string]*liveService{}
   146  }
   147  
   148  // ProcessMessages implements MessageProcessor and is called by the datastore on
   149  // backlogged messages.
   150  func (c *Manager) ProcessMessages(msgs []*fspb.Message) {
   151  	ctx, fin := context.WithTimeout(context.Background(), 30*time.Second)
   152  
   153  	hasResult := make([]bool, len(msgs))
   154  
   155  	var working sync.WaitGroup
   156  	working.Add(len(msgs))
   157  
   158  	for idx, msg := range msgs {
   159  		i, m := idx, msg
   160  		go func() {
   161  			defer working.Done()
   162  			l := c.services[m.Destination.ServiceName]
   163  			if l == nil {
   164  				log.Errorf("Message in datastore [%v] is for unknown service [%s].", hex.EncodeToString(m.MessageId), m.Destination.ServiceName)
   165  				return
   166  			}
   167  			cData, err := c.clientData(ctx, m)
   168  			if err != nil {
   169  				log.Warningf("Message in datastore [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err)
   170  			}
   171  
   172  			c.stats.MessageIngested(true, m, cData)
   173  			res := l.processMessage(ctx, m, false)
   174  			if res != nil {
   175  				hasResult[i] = true
   176  				m.Result = res
   177  			}
   178  		}()
   179  	}
   180  	working.Wait()
   181  	fin()
   182  
   183  	toSave := make([]*fspb.Message, 0, len(msgs))
   184  	for i, m := range msgs {
   185  		if hasResult[i] {
   186  			toSave = append(toSave, m)
   187  		}
   188  	}
   189  	if len(toSave) == 0 {
   190  		return
   191  	}
   192  	ctx, fin = context.WithTimeout(context.Background(), 15*time.Second)
   193  	defer fin()
   194  	if err := c.dataStore.StoreMessages(ctx, toSave, ""); err != nil {
   195  		log.Errorf("Error saving results for %d messages: %v", len(toSave), err)
   196  	}
   197  }
   198  
   199  // processMessage attempts to processes m, returning a fspb.MessageResult. It
   200  // also updates stats, calling exactly one of MessageDropped, MessageFailed,
   201  // MessageProcessed.
   202  func (s *liveService) processMessage(ctx context.Context, m *fspb.Message, isFirstTry bool) *fspb.MessageResult {
   203  	cData, err := s.manager.clientData(ctx, m)
   204  	if err != nil {
   205  		log.Warningf("Couldn't fetch client data for the message: %v", err)
   206  	}
   207  
   208  	if cData == nil {
   209  		log.Warningf("Can't annotate message with blocklisted status [service=%s] as client data couldn't be fetched.", s.name)
   210  	} else {
   211  		m.IsBlocklistedSource = cData.Blacklisted
   212  	}
   213  
   214  	p := atomic.AddUint32(&s.parallelism, 1)
   215  	// Documented decrement operation.
   216  	// https://golang.org/pkg/sync/atomic/#AddUint32
   217  	defer atomic.AddUint32(&s.parallelism, ^uint32(0))
   218  	if p > s.maxParallelism {
   219  		if s.pLogLimiter.Allow() {
   220  			log.Warningf("%s: Overloaded with %d concurrent messages, dropping excess, will retry.", s.name, s.maxParallelism)
   221  		}
   222  		s.manager.stats.MessageDropped(m, isFirstTry, cData)
   223  		return nil
   224  	}
   225  
   226  	mid, err := common.BytesToMessageID(m.MessageId)
   227  	if err != nil || mid.IsNil() {
   228  		// message id should be validated before it gets to us.
   229  		log.Fatalf("Invalid message id presented for processing: %v, %v", m.MessageId, err)
   230  	}
   231  
   232  	start := ftime.Now()
   233  	e := s.service.ProcessMessage(ctx, m)
   234  	switch {
   235  	case e == nil:
   236  		s.manager.stats.MessageProcessed(start, ftime.Now(), m, isFirstTry, cData)
   237  		return &fspb.MessageResult{ProcessedTime: db.NowProto()}
   238  	case service.IsTemporary(e):
   239  		s.manager.stats.MessageErrored(start, ftime.Now(), true, m, isFirstTry, cData)
   240  		log.Warningf("%s: Temporary error processing message %v, will retry: %v", s.name, mid, e)
   241  		return nil
   242  	case !service.IsTemporary(e):
   243  		s.manager.stats.MessageErrored(start, ftime.Now(), false, m, isFirstTry, cData)
   244  		log.Errorf("%s: Permanent error processing message %v, giving up: %v", s.name, mid, e)
   245  		failedReason := e.Error()
   246  		if len(failedReason) > MaxServiceFailureReasonLength {
   247  			failedReason = failedReason[:MaxServiceFailureReasonLength-3] + "..."
   248  		}
   249  		return &fspb.MessageResult{
   250  			ProcessedTime: db.NowProto(),
   251  			Failed:        true,
   252  			FailedReason:  failedReason,
   253  		}
   254  	}
   255  	log.Fatal("Error is neither temporary or permanent.")
   256  	return nil
   257  }
   258  
   259  // HandleNewMessages handles newly arrived messages that should be processed on
   260  // the fleetspeak server. This handling includes validating that we recognize
   261  // its ServiceNames, saving the messages to the datastore and attempting to
   262  // process them.
   263  func (c *Manager) HandleNewMessages(ctx context.Context, msgs []*fspb.Message, contact db.ContactID) error {
   264  	now := db.NowProto()
   265  	for _, m := range msgs {
   266  		if m.Destination == nil || len(m.Destination.ClientId) != 0 {
   267  			return fmt.Errorf("HandleNewMessage called with bad Destination: %v", m.Destination)
   268  		}
   269  		m.CreationTime = now
   270  	}
   271  
   272  	// Try to processes all the messages in parallel, with a 30 second timeout.
   273  	ctx1, fin1 := context.WithTimeout(ctx, 30*time.Second)
   274  	var wg sync.WaitGroup
   275  	wg.Add(len(msgs))
   276  	for _, msg := range msgs {
   277  		m := msg
   278  		go func() {
   279  			defer wg.Done()
   280  			l := c.services[m.Destination.ServiceName]
   281  			if l == nil {
   282  				log.Errorf("Received new message [%v] for unknown service [%s].", hex.EncodeToString(m.MessageId), m.Destination.ServiceName)
   283  				return
   284  			}
   285  
   286  			cData, err := c.clientData(ctx1, m)
   287  			if err != nil {
   288  				log.Warningf("Can't get client data for message [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err)
   289  			}
   290  			c.stats.MessageIngested(false, m, cData)
   291  
   292  			res := l.processMessage(ctx1, m, true)
   293  			if res == nil {
   294  				return
   295  			}
   296  			m.Result = res
   297  			m.Data = nil
   298  		}()
   299  	}
   300  	wg.Wait()
   301  	fin1()
   302  
   303  	if ctx.Err() != nil {
   304  		return ctx.Err()
   305  	}
   306  
   307  	ctx2, fin2 := context.WithTimeout(ctx, 30*time.Second)
   308  	defer fin2()
   309  
   310  	// Record that we are saving messages.
   311  	for _, m := range msgs {
   312  		cData, err := c.clientData(ctx2, m)
   313  		if err != nil {
   314  			log.Warningf("Can't get client data for message [%v] for service [%s] is from unknown client: %v.", hex.EncodeToString(m.MessageId), m.Destination.ServiceName, err)
   315  		}
   316  
   317  		c.stats.MessageSaved(false, m, cData)
   318  	}
   319  
   320  	return c.dataStore.StoreMessages(ctx2, msgs, contact)
   321  }
   322  
// A liveService is a running Service, including implementation provided by the
// associated ServiceFactory and bookkeeping structures and methods.
//
// A pointer to the liveService is what NewManager and Install pass to the
// service's Start method; its Send and GetClientData methods implement
// service.Context.
//
// parallelism counts the processMessage calls currently in flight. A call that
// raises it above maxParallelism drops its message instead of processing it,
// and pLogLimiter rate limits the warning logged when that happens.
type liveService struct {
	manager *Manager
	name    string
	service service.Service

	parallelism    uint32 // Current number of calls, used for load shedding. atomic access only.
	maxParallelism uint32
	pLogLimiter    *rate.Limiter
}
   334  
   335  func (s *liveService) stop() {
   336  	if err := s.service.Stop(); err != nil {
   337  		log.Errorf("Error shutting down service [%v]: %v", s.name, err)
   338  	}
   339  }
   340  
   341  // Send implements service.Context.
   342  func (s *liveService) Send(ctx context.Context, m *fspb.Message) error {
   343  	m.Source = &fspb.Address{ServiceName: s.name}
   344  	if len(m.Destination.ClientId) == 0 {
   345  		return s.manager.HandleNewMessages(ctx, []*fspb.Message{m}, "")
   346  	}
   347  
   348  	return s.manager.dataStore.StoreMessages(ctx, []*fspb.Message{m}, "")
   349  }
   350  
   351  // GetClientData implements service.Context.
   352  func (s *liveService) GetClientData(ctx context.Context, id common.ClientID) (*db.ClientData, error) {
   353  	cd, _, err := s.manager.cc.GetOrRead(ctx, id, s.manager.dataStore)
   354  	if err != nil {
   355  		return nil, err
   356  	}
   357  	return cd, nil
   358  }