github.com/aporeto-inc/trireme-lib@v10.358.0+incompatible/monitor/server/server.go

     1  package server
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  
     8  	"time"
     9  
    10  	"github.com/golang/protobuf/ptypes/empty"
    11  	"go.aporeto.io/enforcerd/internal/extractors/containermetadata"
    12  	"go.aporeto.io/enforcerd/trireme-lib/common"
    13  	"go.aporeto.io/enforcerd/trireme-lib/controller/pkg/counters"
    14  	monitorpb "go.aporeto.io/enforcerd/trireme-lib/monitor/api/spec/protos"
    15  	"go.aporeto.io/enforcerd/trireme-lib/monitor/constants"
    16  	"go.aporeto.io/enforcerd/trireme-lib/monitor/external"
    17  	"go.uber.org/zap"
    18  	"google.golang.org/grpc"
    19  )
    20  
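        // compile-time assertions that *Server satisfies the interfaces it is used as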
    21  var _ Controls = &Server{}
    22  
    23  var _ external.ReceiverRegistration = &Server{}
    24  
    25  var _ monitorpb.CNIServer = &Server{}
    26  var _ monitorpb.RunCServer = &Server{}
    27  
    28  // Controls is the controlling interface for starting/stopping the server
    29  type Controls interface {
    30  	Start(context.Context) error
    31  	Stop() error
    32  }
    33  
    34  // Server is the gRPC Monitor server
    35  type Server struct {
    36  	ctx                             context.Context
    37  	enforcerID                      string
    38  	stop                            chan struct{}
    39  	enforcerStop                    chan struct{}
    40  	socketAddress                   string
    41  	socketType                      int
    42  	running                         bool
    43  	monitors                        map[string]external.ReceiveEvents
    44  	monitorsLock                    sync.RWMutex
    45  	runcProxyStarted                bool
    46  	cniInstalled                    bool
    47  	notifyProcessRuncProxyStartedCh chan struct{}
    48  	notifyProcessCniInstalledCh     chan struct{}
    49  	extMonitorStartedLock           sync.RWMutex
    50  	waitStopGrp                     sync.WaitGroup
    51  	apoRuncWaitGrp                  *sync.WaitGroup
    52  }
    53  
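        // the flavors of listener sockets the server can be bound to; the unix socket
        // and named-pipe variants are cleaned up again when the server shuts down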
    54  const (
    55  	socketTypeUnix = iota
    56  	socketTypeTCP  // nolint: varcheck
    57  	socketTypeWindowsNamedPipe
    58  )
    59  
    60  // NewMonitorServer creates a gRPC server for the Twistlock Defender integration
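        //
        // A minimal usage sketch (illustrative only: the socket path, the enforcer ID,
        // the context and the wait-group bookkeeping are assumptions about the caller,
        // not values defined by this package):
        //
        //	var runcWaitGrp sync.WaitGroup
        //	runcWaitGrp.Add(1)
        //	stopCh := make(chan struct{})
        //	srv := NewMonitorServer("/var/run/monitor.sock", stopCh, "enforcer-01", &runcWaitGrp)
        //	if err := srv.Start(ctx); err != nil {
        //		// handle the startup error
        //	}
        //	defer srv.Stop() // nolint: errcheck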
    61  func NewMonitorServer(
    62  	socketAddress string,
    63  	stopchan chan struct{},
    64  	enforcerID string,
    65  	runcWaitGrp *sync.WaitGroup,
    66  ) *Server {
    67  	return &Server{
    68  		enforcerID:                      enforcerID,
    69  		stop:                            make(chan struct{}),
    70  		enforcerStop:                    stopchan,
    71  		socketAddress:                   socketAddress,
    72  		socketType:                      socketTypeUnix,
    73  		running:                         false,
    74  		monitors:                        make(map[string]external.ReceiveEvents),
    75  		notifyProcessRuncProxyStartedCh: make(chan struct{}),
    76  		notifyProcessCniInstalledCh:     make(chan struct{}),
    77  		waitStopGrp:                     sync.WaitGroup{},
    78  		apoRuncWaitGrp:                  runcWaitGrp,
    79  	}
    80  }
    81  
    82  // Start starts the gRPC Monitor server
    83  func (s *Server) Start(ctx context.Context) (err error) {
    84  
    85  	s.ctx = ctx
    86  
    87  	errChan := make(chan error)
    88  	zap.L().Info("Starting the gRPC Monitor server, listening on", zap.String("address", s.socketAddress))
    89  
    90  	if err := cleanupPipe(s.socketAddress); err != nil {
    91  		zap.L().Fatal("unable to cleanup the old gRPC Monitor server socket address", zap.String("address", s.socketAddress), zap.Error(err))
    92  	}
    93  
    94  	// create the listener
    95  	lis, err := makePipe(s.socketAddress)
    96  	if err != nil {
    97  		zap.L().Fatal("Failed to create the listener socket", zap.String("address", s.socketAddress), zap.Error(err))
    98  	}
    99  
   100  	var opts []grpc.ServerOption
   101  
   102  	// TODO - TLS certs for the gRPC connection ??
   103  	// if tls {
   104  	// 	creds, err := credentials.NewServerTLSFromFile(tls.certFile, tls.keyFile)
   105  	// 	if err != nil {
   106  	// 		zap.L().Fatal("Failed to load TLS credentials %v", zap.Error(err))
   107  	// 	}
   108  	//
   109  	// 	opts = []grpc.ServerOption{grpc.Creds(creds)}
   110  	// }
   111  
   112  	grpcServer := grpc.NewServer(opts...)
   113  
   114  	// now register the runc and CNI servers.
   115  	monitorpb.RegisterCNIServer(grpcServer, s)
   116  	monitorpb.RegisterRunCServer(grpcServer, s)
   117  	zap.L().Debug("Starting the gRPC Monitor server loop")
   118  
   119  	go s.processExtMonitorStarted(ctx)
   120  
   121  	// run blocking call in a separate goroutine, report errors via channel
   122  	go func() {
   123  		if err := grpcServer.Serve(lis); err != nil {
   124  			zap.L().Error("failed to start the gRPC Monitor server", zap.Error(err))
   125  			errChan <- err
   126  		}
   127  		zap.L().Debug("Exiting the gRPC Monitor server goroutine")
   128  
   129  		// the listener should be closed by this time, remove it
   130  		if s.socketType == socketTypeUnix || s.socketType == socketTypeWindowsNamedPipe {
   131  			if err := cleanupPipe(s.socketAddress); err != nil {
   132  				zap.L().Error("unable to cleanup the gRPC Monitor server socket address", zap.String("address", s.socketAddress), zap.Error(err))
   133  				errChan <- err
   134  			}
   135  		}
   136  	}()
   137  	// add to the wait group to make sure that the gRPC server shuts down gracefully.
   138  	s.waitStopGrp.Add(1)
   139  
   140  	// Start() is non-blocking, but we block in this goroutine
   141  	// until we are told to stop or the server hits a fatal error
   142  	go func() {
   143  
   144  		s.running = true
   145  		zap.L().Debug("the gRPC Monitor server loop is running")
   146  
   147  		// terminate gracefully
   148  		defer func() {
   149  			zap.L().Debug("Stopping the gRPC Monitor server loop and listener socket")
   150  			grpcServer.GracefulStop()
   151  			// now we are sure that the connections have been drained completely.
   152  			s.waitStopGrp.Done()
   153  			s.running = false
   154  		}()
   155  
   156  		for {
   157  			select {
   158  			case <-s.stop:
   159  				zap.L().Debug("gRPC Monitor server channel loop: got a stop notification on the stop channel")
   160  				return
   161  			case err := <-errChan:
   162  				zap.L().Fatal("gRPC Monitor server channel loop: got an error notification on the error channel", zap.Error(err))
   163  			case <-ctx.Done():
   164  				return
   165  			}
   166  		}
   167  	}()
   168  
   169  	return nil
   170  }
   171  
   172  // Stop stops the Monitor gRPC server (it does not stop the enforcer)
   173  func (s *Server) Stop() error {
   174  	if s.running {
   175  		zap.L().Debug("gRPC Server: notified the graceful stop")
   176  		close(s.stop)
   177  	}
   178  	// wait here to make sure the gRPC GracefulStop drains all the connections.
   179  	s.waitStopGrp.Wait()
   180  	return nil
   181  }
   182  
   183  // RuncProxyStarted gets sent by the defender once it has started the runc-proxy.
   184  func (s *Server) RuncProxyStarted(context.Context, *empty.Empty) (*empty.Empty, error) {
   185  	zap.L().Info("grpc: runc-proxy has started")
   186  	s.extMonitorStartedLock.Lock()
   187  	s.runcProxyStarted = true
   188  	s.extMonitorStartedLock.Unlock()
   189  	s.notifyProcessRuncProxyStartedCh <- struct{}{}
   190  	return &empty.Empty{}, nil
   191  }
   192  
   193  // isRuncProxyStarted returns the internal state of RuncProxyStarted as a copy
   194  func (s *Server) isRuncProxyStarted() bool {
   195  	s.extMonitorStartedLock.RLock()
   196  	defer s.extMonitorStartedLock.RUnlock()
   197  	return s.runcProxyStarted
   198  }
   199  
   200  // CniPluginInstalled gets sent by the defender once it has installed the CNI plugin.
   201  func (s *Server) CniPluginInstalled(context.Context, *empty.Empty) (*empty.Empty, error) {
   202  	zap.L().Info("grpc: cni Plugin is installed")
   203  	s.extMonitorStartedLock.Lock()
   204  	s.cniInstalled = true
   205  	s.extMonitorStartedLock.Unlock()
   206  	s.notifyProcessCniInstalledCh <- struct{}{}
   207  	return &empty.Empty{}, nil
   208  }
   209  
   210  // isCniInstalled returns the internal state of cniInstalled as a copy
   211  func (s *Server) isCniInstalled() bool {
   212  	s.extMonitorStartedLock.RLock()
   213  	defer s.extMonitorStartedLock.RUnlock()
   214  	return s.cniInstalled
   215  }
   216  
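        // processExtMonitorStarted signals the runc wait group while neither the runc-proxy
        // nor the CNI plugin has started yet; once one of them has started, it delivers
        // SenderReady() exactly once to every registered monitor.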
   217  func (s *Server) processExtMonitorStarted(ctx context.Context) {
   218  	m := make(map[string]struct{})
   219  	for {
   220  		// signal the runc wait group only while neither the runc-proxy nor the CNI plugin has started yet
   221  		if !s.isRuncProxyStarted() && !s.isCniInstalled() {
   222  			s.apoRuncWaitGrp.Done()
   223  		}
   224  		// wait for a notification; this is sent whenever RuncProxyStarted or
   225  		// CniPluginInstalled was called, or when a new monitor registers
   226  		// with the gRPC server
   227  		select {
   228  		case <-ctx.Done():
   229  			return
   230  		case <-s.notifyProcessRuncProxyStartedCh:
   231  			// continue here
   232  		case <-s.notifyProcessCniInstalledCh:
   233  		}
   234  		if s.isRuncProxyStarted() || s.isCniInstalled() {
   235  			s.monitorsLock.RLock()
   236  			// iterate over all currently registered monitors
   237  			// and send SenderReady() to every monitor that
   238  			// has not received it yet
   239  			for name, monitor := range s.monitors {
   240  				if _, ok := m[name]; ok {
   241  					continue
   242  				}
   243  				monitor.SenderReady()
   244  				m[name] = struct{}{}
   245  			}
   246  			s.monitorsLock.RUnlock()
   247  		}
   248  	}
   249  }
   250  
   251  const maxProcessingTime = time.Second * 5
   252  
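        // calProcessingTime logs how long the processing of a container event took and
        // increments a counter if it exceeded maxProcessingTime.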
   253  func calProcessingTime(onStart time.Time, containerID string) {
   254  	processingTime := time.Since(onStart)
   255  	if processingTime > (maxProcessingTime) {
   256  		counters.IncrementCounter(counters.ErrSegmentServerContainerEventExceedsProcessingTime)
   257  		zap.L().Warn(
   258  			"grpc: ContainerEvent: processing of container event took longer than allowed processing time",
   259  			zap.String("id", containerID),
   260  			zap.Duration("processingTime", processingTime),
   261  			zap.Duration("maxProcessingTime", maxProcessingTime),
   262  		)
   263  	} else {
   264  		zap.L().Debug(
   265  			"grpc: ContainerEvent: processing of container event was within allowed time frame",
   266  			zap.String("id", containerID),
   267  			zap.Duration("processingTime", processingTime),
   268  			zap.Duration("maxProcessingTime", maxProcessingTime),
   269  		)
   270  	}
   271  }
   272  
   273  // CNIContainerEvent handles container event requests sent by the CNI plugin
   274  func (s *Server) CNIContainerEvent(ctx context.Context, req *monitorpb.CNIContainerEventRequest) (*monitorpb.ContainerEventResponse, error) {
   275  	zap.L().Debug("grpc: CNI ContainerEvent received", zap.Any("request", req), zap.Any("type", req.Type))
   276  
   277  	// calculate the time that this function takes and log accordingly
   278  	onStart := time.Now()
   279  	defer func() {
   280  		calProcessingTime(onStart, req.ContainerID)
   281  	}()
   282  	containerArgs := containermetadata.NewCniArguments(req)
   283  	// now send the container event to the monitor
   284  	return s.sendContainerEvent(ctx, containerArgs)
   285  }
   286  
   287  // RunCContainerEvent handles container event requests sent by the runc-proxy
   288  func (s *Server) RunCContainerEvent(ctx context.Context, req *monitorpb.RunCContainerEventRequest) (*monitorpb.ContainerEventResponse, error) {
   289  	zap.L().Debug("grpc: runc ContainerEvent received", zap.Strings("commandLine", req.GetCommandLine()))
   290  
   291  	if !s.isRuncProxyStarted() {
   292  		zap.L().Warn("grpc: receiving ContainerEvent, but have not received RuncProxyStarted event yet. Compensating...")
   293  		s.RuncProxyStarted(ctx, &empty.Empty{}) // nolint
   294  		return &monitorpb.ContainerEventResponse{
   295  			ErrorMessage: "received ContainerEvent before RuncProxyStarted event",
   296  		}, nil
   297  	}
   298  
   299  	// parse the runc command-line first
   300  	containerArgs, err := containermetadata.ParseRuncArguments(req.GetCommandLine())
   301  	if err != nil {
   302  		zap.L().Error("grpc: ContainerEvent: failed to parse runc commandline")
   303  		return &monitorpb.ContainerEventResponse{
   304  			ErrorMessage: fmt.Sprintf("failed to parse runc commandline: %s", err),
   305  		}, nil
   306  	}
   307  	// calculate the time that this function takes and log accordingly
   308  	onStart := time.Now()
   309  	defer func() {
   310  		calProcessingTime(onStart, containerArgs.ID())
   311  	}()
   312  	// now send the container event to the monitor
   313  	return s.sendContainerEvent(ctx, containerArgs)
   314  }
   315  
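        // sendContainerEvent extracts the Kubernetes container metadata for the given
        // container arguments and forwards the start/delete event to the registered K8s monitor.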
   316  func (s *Server) sendContainerEvent(ctx context.Context, containerArgs containermetadata.ContainerArgs) (*monitorpb.ContainerEventResponse, error) {
   317  	var kmd containermetadata.CommonKubernetesContainerMetadata
   318  	var md containermetadata.CommonContainerMetadata
   319  	var err error
   320  	// first check whether the netnsPath is given: if so, this is a CNI event and we process it directly.
   321  	// if the netnsPath is not given, we fall back to the default extraction mechanism,
   322  	// provided we can identify that we have this container
   323  	if len(containerArgs.NetNsPath()) > 0 && len(containerArgs.PodName()) > 0 && len(containerArgs.PodNamespace()) > 0 {
   324  		// create the cni containerMetadata
   325  		kmd = containermetadata.NewCniContainerMetadata(containerArgs)
   326  	} else if containermetadata.AutoDetect().Has(containerArgs) {
   327  
   328  		// then extract the common container metadata
   329  		md, kmd, err = containermetadata.AutoDetect().Extract(containerArgs)
   330  		if err != nil {
   331  			return &monitorpb.ContainerEventResponse{
   332  				ErrorMessage: fmt.Sprintf("failed to extract the container metadata: %s", err),
   333  			}, nil
   334  		}
   335  
   336  		// as we are only interested in Kubernetes containers at the moment
   337  		// simply log if this is a non-Kubernetes event
   338  		if md != nil && kmd == nil {
   339  			zap.L().Debug(
   340  				"grpc: ContainerEvent: container event does not belong to a Kubernetes container",
   341  				zap.String("md.ID()", md.ID()),
   342  				zap.String("md.Root()", md.Root()),
   343  				zap.String("md.Kind()", md.Kind().String()),
   344  				zap.String("md.Runtime()", md.Runtime().String()),
   345  				zap.Int("md.PID()", md.PID()),
   346  				zap.Bool("md.SystemdCgroups()", md.SystemdCgroups()),
   347  			)
   348  			return &monitorpb.ContainerEventResponse{}, nil
   349  		}
   350  	}
   351  
   352  	// and now send an event to the K8s monitor
   353  	if kmd != nil {
   354  		zap.L().Debug(
   355  			"grpc: ContainerEvent: container event belongs to a Kubernetes container",
   356  			zap.String("kmd.ID()", kmd.ID()),
   357  			zap.String("kmd.Root()", kmd.Root()),
   358  			zap.String("kmd.Kind()", kmd.Kind().String()),
   359  			zap.String("kmd.Runtime()", kmd.Runtime().String()),
   360  			zap.Int("kmd.PID()", kmd.PID()),
   361  			zap.Bool("kmd.SystemdCgroups()", kmd.SystemdCgroups()),
   362  			zap.String("kmd.PodName()", kmd.PodName()),
   363  			zap.String("kmd.NetNsPath()", kmd.NetNSPath()),
   364  			zap.String("kmd.PodNamespace()", kmd.PodNamespace()),
   365  			zap.String("kmd.PodUID()", kmd.PodUID()),
   366  			zap.String("kmd.PodSandboxID()", kmd.PodSandboxID()),
   367  		)
   368  
   369  		s.monitorsLock.RLock()
   370  		defer s.monitorsLock.RUnlock()
   371  		monitor, ok := s.monitors[constants.K8sMonitorRegistrationName]
   372  		if !ok {
   373  			zap.L().Debug("grpc: K8s monitor is not registered yet. Skipping processing of event.")
   374  			return &monitorpb.ContainerEventResponse{
   375  				ErrorMessage: "K8s monitor is not initialized yet",
   376  			}, nil
   377  		}
   378  
   379  		switch containerArgs.Action() {
   380  		case containermetadata.StartAction:
   381  			// the start action MUST be synchronous at all costs
   382  			monitor.Event(ctx, common.EventStart, kmd) // nolint: errcheck
   383  		case containermetadata.DeleteAction:
   384  			// the delete event SHOULD be synchronous
   385  			// however, we can unblock the caller and respect the context if it is not
   386  			ch := make(chan struct{})
   387  			go func() {
   388  				monitor.Event(context.Background(), common.EventDestroy, kmd) // nolint: errcheck
   389  				close(ch)
   390  			}()
   391  			select {
   392  			case <-ctx.Done():
   393  				zap.L().Warn("grpc: ContainerEvent: failed to process delete event within the context constraints",
   394  					zap.String("kmd.ID()", kmd.ID()),
   395  					zap.String("kmd.PodName()", kmd.PodName()),
   396  					zap.String("kmd.PodNamespace()", kmd.PodNamespace()),
   397  					zap.String("kmd.PodUID()", kmd.PodUID()),
   398  					zap.String("kmd.NetNsPath()", kmd.NetNSPath()),
   399  					zap.Error(ctx.Err()),
   400  				)
   401  			case <-ch:
   402  				// success, nothing more needs to be done
   403  			}
   404  		default:
   405  			zap.L().Debug("grpc: unsupported action by the K8s monitor", zap.String("action", containerArgs.Action().String()))
   406  			return &monitorpb.ContainerEventResponse{
   407  				ErrorMessage: "unexpected action received: " + containerArgs.Action().String(),
   408  			}, nil
   409  		}
   410  
   411  		return &monitorpb.ContainerEventResponse{}, nil
   412  	}
   413  
   414  	// log an error if we cannot find the container: we should always be able to find it, so this indicates an error in the extractor
   415  	zap.L().Error("grpc: ContainerEvent: container not found", zap.String("containerID", containerArgs.ID()), zap.String("action", containerArgs.Action().String()))
   416  	return &monitorpb.ContainerEventResponse{
   417  		ErrorMessage: "container not found",
   418  	}, nil
   419  }
   420  
   421  // SenderName must return a globally unique name of the implementor.
   422  func (s *Server) SenderName() string {
   423  	return constants.MonitorExtSenderName
   424  }
   425  
   426  // Register will register the given `monitor` for receiving events under `name`.
   427  // Multiple calls to this function for the same `name` must update the internal
   428  // state of the implementor to now send events to the newly registered monitor of this
   429  // name. Only one registration of a monitor of the same name is allowed.
   430  func (s *Server) Register(name string, monitor external.ReceiveEvents) error {
   431  	s.monitorsLock.Lock()
   432  	defer s.monitorsLock.Unlock()
   433  	s.monitors[name] = monitor
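        	// nudge processExtMonitorStarted so that it sends SenderReady() to the newly
        	// registered monitor in case the runc-proxy or the CNI plugin has already started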
   434  	s.notifyProcessRuncProxyStartedCh <- struct{}{}
   435  	return nil
   436  }