github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/observer/server.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package observer
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  
    10  	"github.com/sirupsen/logrus"
    11  	"golang.org/x/sync/errgroup"
    12  	"google.golang.org/grpc/codes"
    13  	"google.golang.org/grpc/metadata"
    14  	grpcStatus "google.golang.org/grpc/status"
    15  	"google.golang.org/protobuf/types/known/wrapperspb"
    16  
    17  	observerpb "github.com/cilium/cilium/api/v1/observer"
    18  	relaypb "github.com/cilium/cilium/api/v1/relay"
    19  	"github.com/cilium/cilium/pkg/hubble/build"
    20  	"github.com/cilium/cilium/pkg/hubble/observer"
    21  	poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types"
    22  	"github.com/cilium/cilium/pkg/inctimer"
    23  	"github.com/cilium/cilium/pkg/lock"
    24  )
    25  
// numUnavailableNodesReportMax is the maximum number of unavailable nodes
// listed by name in a ServerStatus response. The list is intentionally not
// exhaustive: naming every unavailable node could clutter the response in
// large clusters, so at most 10 are reported (the total count is still
// reported separately via NumUnavailableNodes).
const numUnavailableNodesReportMax = 10
    32  
// PeerLister is the interface that wraps the List method.
type PeerLister interface {
	// List returns a list of peers with active connections. If a peer
	// cannot be connected to, its Conn attribute must be nil so that
	// callers can detect it via isAvailable.
	List() []poolTypes.Peer
}
    39  
// Server implements the observerpb.ObserverServer interface by fanning
// requests out to every peer returned by the configured PeerLister and
// aggregating the answers.
type Server struct {
	opts  options    // configuration applied in NewServer via Option functions
	peers PeerLister // source of the current set of Hubble peers
}
    45  
    46  // NewServer creates a new Server.
    47  func NewServer(peers PeerLister, options ...Option) (*Server, error) {
    48  	opts := defaultOptions
    49  	for _, opt := range options {
    50  		if err := opt(&opts); err != nil {
    51  			return nil, fmt.Errorf("failed to apply option: %w", err)
    52  		}
    53  	}
    54  	return &Server{
    55  		opts:  opts,
    56  		peers: peers,
    57  	}, nil
    58  }
    59  
// GetFlows implements observerpb.ObserverServer.GetFlows by proxying requests to
// the hubble instance the proxy is connected to.
func (s *Server) GetFlows(req *observerpb.GetFlowsRequest, stream observerpb.Observer_GetFlowsServer) error {
	ctx := stream.Context()
	// Forward the incoming gRPC metadata on the outgoing requests to peers.
	md, ok := metadata.FromIncomingContext(ctx)
	if ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	peers := s.peers.List()
	qlen := s.opts.sortBufferMaxLen // we don't want to buffer too many flows
	if nqlen := req.GetNumber() * uint64(len(peers)); nqlen > 0 && nqlen < uint64(qlen) {
		// don't make the queue bigger than necessary as it would be a problem
		// with the priority queue (we pop out when the queue is full)
		qlen = int(nqlen)
	}

	g, gctx := errgroup.WithContext(ctx)
	flows := make(chan *observerpb.GetFlowsResponse, qlen)

	// fc.collect fans the request out to the given peers, feeding their
	// responses into the shared flows channel via goroutines on g, and
	// reports which nodes it could and could not reach.
	fc := newFlowCollector(req, s.opts)
	connectedNodes, unavailableNodes := fc.collect(gctx, g, peers, flows)

	if req.GetFollow() {
		// In follow mode, periodically re-list the peers so that nodes that
		// appear or reconnect after the request started are collected from
		// as well. The collector is presumably expected to dedupe peers it
		// already collects from — confirm in newFlowCollector.
		go func() {
			updateTimer, updateTimerDone := inctimer.New()
			defer updateTimerDone()
			for {
				select {
				case <-updateTimer.After(s.opts.peerUpdateInterval):
					peers := s.peers.List()
					_, _ = fc.collect(gctx, g, peers, flows)
				case <-gctx.Done():
					return
				}
			}
		}()
	}
	// Close flows once every collector goroutine has finished so the
	// aggregation/sorting consumers below terminate. The error (if any) is
	// picked up by the second g.Wait() at the end of this function.
	go func() {
		g.Wait()
		close(flows)
	}()

	aggregated := aggregateErrors(ctx, flows, s.opts.errorAggregationWindow)
	sortedFlows := sortFlows(ctx, aggregated, qlen, s.opts.sortBufferDrainTimeout)

	// inform the client about the nodes from which we expect to receive flows first
	if len(connectedNodes) > 0 {
		status := nodeStatusEvent(relaypb.NodeState_NODE_CONNECTED, connectedNodes...)
		if err := stream.Send(status); err != nil {
			return err
		}
	}
	if len(unavailableNodes) > 0 {
		status := nodeStatusEvent(relaypb.NodeState_NODE_UNAVAILABLE, unavailableNodes...)
		if err := stream.Send(status); err != nil {
			return err
		}
	}

	// Stream the sorted flows to the client, then surface any error the
	// collector goroutines produced.
	err := sendFlowsResponse(ctx, stream, sortedFlows)
	if err != nil {
		return err
	}
	return g.Wait()
}
   128  
   129  // GetAgentEvents implements observerpb.ObserverServer.GetAgentEvents by proxying requests to
   130  // the hubble instance the proxy is connected to.
   131  func (s *Server) GetAgentEvents(req *observerpb.GetAgentEventsRequest, stream observerpb.Observer_GetAgentEventsServer) error {
   132  	return grpcStatus.Errorf(codes.Unimplemented, "GetAgentEvents not yet implemented")
   133  }
   134  
   135  // GetDebugEvents implements observerpb.ObserverServer.GetDebugEvents by proxying requests to
   136  // the hubble instance the proxy is connected to.
   137  func (s *Server) GetDebugEvents(req *observerpb.GetDebugEventsRequest, stream observerpb.Observer_GetDebugEventsServer) error {
   138  	return grpcStatus.Errorf(codes.Unimplemented, "GetDebugEvents not yet implemented")
   139  }
   140  
   141  // GetNodes implements observerpb.ObserverClient.GetNodes.
   142  func (s *Server) GetNodes(ctx context.Context, req *observerpb.GetNodesRequest) (*observerpb.GetNodesResponse, error) {
   143  	if md, ok := metadata.FromIncomingContext(ctx); ok {
   144  		ctx = metadata.NewOutgoingContext(ctx, md)
   145  	}
   146  	ctx, cancel := context.WithCancel(ctx)
   147  	defer cancel()
   148  	g, ctx := errgroup.WithContext(ctx)
   149  
   150  	peers := s.peers.List()
   151  	nodes := make([]*observerpb.Node, 0, len(peers))
   152  	for _, p := range peers {
   153  		n := &observerpb.Node{
   154  			Name: p.Name,
   155  			Tls: &observerpb.TLS{
   156  				Enabled:    p.TLSEnabled,
   157  				ServerName: p.TLSServerName,
   158  			},
   159  		}
   160  		if p.Address != nil {
   161  			n.Address = p.Address.String()
   162  		}
   163  		nodes = append(nodes, n)
   164  		if !isAvailable(p.Conn) {
   165  			n.State = relaypb.NodeState_NODE_UNAVAILABLE
   166  			s.opts.log.WithField("address", p.Address).Infof(
   167  				"No connection to peer %s, skipping", p.Name,
   168  			)
   169  			continue
   170  		}
   171  		n.State = relaypb.NodeState_NODE_CONNECTED
   172  		g.Go(func() error {
   173  			n := n
   174  			client := s.opts.ocb.observerClient(&p)
   175  			status, err := client.ServerStatus(ctx, &observerpb.ServerStatusRequest{})
   176  			if err != nil {
   177  				n.State = relaypb.NodeState_NODE_ERROR
   178  				s.opts.log.WithFields(logrus.Fields{
   179  					"error": err,
   180  					"peer":  p,
   181  				}).Warning("Failed to retrieve server status")
   182  				return nil
   183  			}
   184  			n.Version = status.GetVersion()
   185  			n.UptimeNs = status.GetUptimeNs()
   186  			n.MaxFlows = status.GetMaxFlows()
   187  			n.NumFlows = status.GetNumFlows()
   188  			n.SeenFlows = status.GetSeenFlows()
   189  			return nil
   190  		})
   191  	}
   192  	if err := g.Wait(); err != nil {
   193  		return nil, err
   194  	}
   195  	return &observerpb.GetNodesResponse{Nodes: nodes}, nil
   196  }
   197  
   198  // GetNamespaces implements observerpb.ObserverClient.GetNamespaces.
   199  func (s *Server) GetNamespaces(ctx context.Context, req *observerpb.GetNamespacesRequest) (*observerpb.GetNamespacesResponse, error) {
   200  	if md, ok := metadata.FromIncomingContext(ctx); ok {
   201  		ctx = metadata.NewOutgoingContext(ctx, md)
   202  	}
   203  	// We are not using errgroup.WithContext because we will return partial
   204  	// results over failing on the first error
   205  	g := new(errgroup.Group)
   206  
   207  	namespaceManager := observer.NewNamespaceManager()
   208  
   209  	for _, p := range s.peers.List() {
   210  		if !isAvailable(p.Conn) {
   211  			s.opts.log.WithField("address", p.Address).Infof(
   212  				"No connection to peer %s, skipping", p.Name,
   213  			)
   214  			continue
   215  		}
   216  
   217  		g.Go(func() error {
   218  			client := s.opts.ocb.observerClient(&p)
   219  			nsResp, err := client.GetNamespaces(ctx, req)
   220  			if err != nil {
   221  				s.opts.log.WithFields(logrus.Fields{
   222  					"error": err,
   223  					"peer":  p,
   224  				}).Warning("Failed to retrieve namespaces")
   225  				return nil
   226  			}
   227  			for _, ns := range nsResp.GetNamespaces() {
   228  				namespaceManager.AddNamespace(ns)
   229  			}
   230  			return nil
   231  		})
   232  	}
   233  
   234  	if err := g.Wait(); err != nil {
   235  		return nil, err
   236  	}
   237  
   238  	return &observerpb.GetNamespacesResponse{Namespaces: namespaceManager.GetNamespaces()}, nil
   239  }
   240  
// ServerStatus implements observerpb.ObserverServer.ServerStatus by aggregating
// the ServerStatus answer of all hubble peers.
func (s *Server) ServerStatus(ctx context.Context, req *observerpb.ServerStatusRequest) (*observerpb.ServerStatusResponse, error) {
	var (
		cancel context.CancelFunc
		g      *errgroup.Group
	)
	// Forward the incoming gRPC metadata on the outgoing requests to peers.
	md, ok := metadata.FromIncomingContext(ctx)
	if ok {
		ctx = metadata.NewOutgoingContext(ctx, md)
	}
	ctx, cancel = context.WithCancel(ctx)
	defer cancel()
	g, ctx = errgroup.WithContext(ctx)

	peers := s.peers.List()
	// mu guards numUnavailableNodes and unavailableNodes, which are updated
	// both synchronously (peers with no connection) and from the per-peer
	// goroutines below (peers whose status query fails).
	mu := lock.Mutex{}
	numUnavailableNodes := 0
	var unavailableNodes []string
	// Buffered to len(peers) so a send can never block even if this
	// function stops receiving early.
	statuses := make(chan *observerpb.ServerStatusResponse, len(peers))
	for _, p := range peers {
		if !isAvailable(p.Conn) {
			s.opts.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			mu.Lock()
			numUnavailableNodes++
			// Only report up to numUnavailableNodesReportMax node names;
			// the counter still reflects the full total.
			if len(unavailableNodes) < numUnavailableNodesReportMax {
				unavailableNodes = append(unavailableNodes, p.Name)
			}
			mu.Unlock()
			continue
		}

		// Query each connected peer concurrently.
		g.Go(func() error {
			client := s.opts.ocb.observerClient(&p)
			status, err := client.ServerStatus(ctx, req)
			if err != nil {
				// Best effort: a failing peer is counted as unavailable
				// rather than failing the whole aggregation.
				s.opts.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve server status")
				mu.Lock()
				numUnavailableNodes++
				if len(unavailableNodes) < numUnavailableNodesReportMax {
					unavailableNodes = append(unavailableNodes, p.Name)
				}
				mu.Unlock()
				return nil
			}
			select {
			case statuses <- status:
			case <-ctx.Done():
			}
			return nil
		})
	}
	// Close statuses once all queries are done so the range loop below
	// terminates; the group's error is surfaced by the final g.Wait().
	go func() {
		g.Wait()
		close(statuses)
	}()
	resp := &observerpb.ServerStatusResponse{
		Version: build.RelayVersion.String(),
	}
	// Aggregate the per-peer statuses into a single relay-wide response.
	for status := range statuses {
		if status == nil {
			continue
		}
		resp.MaxFlows += status.MaxFlows
		resp.NumFlows += status.NumFlows
		resp.SeenFlows += status.SeenFlows
		// use the oldest uptime as a reference for the uptime as cumulating
		// values would make little sense
		if resp.UptimeNs < status.UptimeNs {
			resp.UptimeNs = status.UptimeNs
		}
		resp.FlowsRate += status.FlowsRate
	}

	// No locking needed past this point: the range over statuses only ends
	// after close(statuses), i.e. after every goroutine has finished.
	resp.NumConnectedNodes = &wrapperspb.UInt32Value{
		Value: uint32(len(peers) - numUnavailableNodes),
	}
	resp.NumUnavailableNodes = &wrapperspb.UInt32Value{
		Value: uint32(numUnavailableNodes),
	}
	resp.UnavailableNodes = unavailableNodes

	return resp, g.Wait()
}