github.com/cilium/cilium@v1.16.2/pkg/hubble/relay/observer/observer.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package observer

import (
	"context"
	"errors"
	"io"

	"github.com/sirupsen/logrus"
	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"

	observerpb "github.com/cilium/cilium/api/v1/observer"
	relaypb "github.com/cilium/cilium/api/v1/relay"
	poolTypes "github.com/cilium/cilium/pkg/hubble/relay/pool/types"
	"github.com/cilium/cilium/pkg/hubble/relay/queue"
	"github.com/cilium/cilium/pkg/inctimer"
	"github.com/cilium/cilium/pkg/lock"
	nodeTypes "github.com/cilium/cilium/pkg/node/types"
	"github.com/cilium/cilium/pkg/time"
)

// isAvailable reports whether the given peer connection can be used to
// retrieve flows, i.e. it exists and is neither in transient failure nor
// shut down.
func isAvailable(conn poolTypes.ClientConn) bool {
	if conn == nil {
		return false
	}
	state := conn.GetState()
	return state != connectivity.TransientFailure &&
		state != connectivity.Shutdown
}

// retrieveFlowsFromPeer issues req against the given peer's Observer client
// and forwards every received response to the flows channel. It returns nil
// when the stream or the context ends, and the stream error otherwise.
func retrieveFlowsFromPeer(
	ctx context.Context,
	client observerpb.ObserverClient,
	req *observerpb.GetFlowsRequest,
	flows chan<- *observerpb.GetFlowsResponse,
) error {
	c, err := client.GetFlows(ctx, req)
	if err != nil {
		return err
	}
	for {
		flow, err := c.Recv()
		if err != nil {
			if errors.Is(err, io.EOF) || errors.Is(err, context.Canceled) {
				return nil
			}
			if status.Code(err) == codes.Canceled {
				return nil
			}
			return err
		}

		select {
		case flows <- flow:
		case <-ctx.Done():
			return nil
		}
	}
}
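
// A minimal usage sketch (not part of the original file): streaming one
// peer's flows into a shared channel. The client, ctx, and req values are
// hypothetical inputs supplied by the caller.
//
//	flows := make(chan *observerpb.GetFlowsResponse, 128)
//	go func() {
//		// client is an observerpb.ObserverClient for one peer, e.g.
//		// observerpb.NewObserverClient(conn)
//		if err := retrieveFlowsFromPeer(ctx, client, req, flows); err != nil {
//			// surface the failure to the consumer as a node status event
//			flows <- nodeStatusError(err, "peer-name")
//		}
//	}()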

// sortFlows reads responses from the flows channel into a priority queue of
// capacity qlen and emits them on the returned channel ordered by timestamp,
// using bufferDrainTimeout as the sorting window.
func sortFlows(
	ctx context.Context,
	flows <-chan *observerpb.GetFlowsResponse,
	qlen int,
	bufferDrainTimeout time.Duration,
) <-chan *observerpb.GetFlowsResponse {
	pq := queue.NewPriorityQueue(qlen)
	sortedFlows := make(chan *observerpb.GetFlowsResponse, qlen)

	go func() {
		defer close(sortedFlows)
		bufferTimer, bufferTimerDone := inctimer.New()
		defer bufferTimerDone()
	flowsLoop:
		for {
			select {
			case flow, ok := <-flows:
				if !ok {
					break flowsLoop
				}
				if pq.Len() == qlen {
					f := pq.Pop()
					select {
					case sortedFlows <- f:
					case <-ctx.Done():
						return
					}
				}
				pq.Push(flow)
			case t := <-bufferTimer.After(bufferDrainTimeout):
				// Make sure to drain old flows from the queue when no new
				// flows are received. The bufferDrainTimeout duration is used
				// as a sorting window.
				for _, f := range pq.PopOlderThan(t.Add(-bufferDrainTimeout)) {
					select {
					case sortedFlows <- f:
					case <-ctx.Done():
						return
					}
				}
			case <-ctx.Done():
				return
			}
		}
		// drain the queue
		for f := pq.Pop(); f != nil; f = pq.Pop() {
			select {
			case sortedFlows <- f:
			case <-ctx.Done():
				return
			}
		}
	}()
	return sortedFlows
}
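
// A minimal usage sketch (not part of the original file): merging responses
// from several peers through the sorting buffer. The queue length and drain
// timeout below are example values; ctx comes from the caller.
//
//	flows := make(chan *observerpb.GetFlowsResponse, 128)
//	sorted := sortFlows(ctx, flows, 128, time.Second)
//	// ... start producers writing into flows, then close(flows) when done ...
//	for resp := range sorted {
//		// responses arrive ordered by timestamp within the one-second
//		// sorting window
//		_ = resp
//	}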

// nodeStatusError returns a NODE_ERROR node status event for the given nodes,
// carrying the error's message.
func nodeStatusError(err error, nodeNames ...string) *observerpb.GetFlowsResponse {
	msg := err.Error()
	if s, ok := status.FromError(err); ok && s.Code() == codes.Unknown {
		msg = s.Message()
	}

	return &observerpb.GetFlowsResponse{
		NodeName: nodeTypes.GetAbsoluteNodeName(),
		Time:     timestamppb.New(time.Now()),
		ResponseTypes: &observerpb.GetFlowsResponse_NodeStatus{
			NodeStatus: &relaypb.NodeStatusEvent{
				StateChange: relaypb.NodeState_NODE_ERROR,
				NodeNames:   nodeNames,
				Message:     msg,
			},
		},
	}
}

// nodeStatusEvent returns a node status event with the given state for the
// given nodes.
func nodeStatusEvent(state relaypb.NodeState, nodeNames ...string) *observerpb.GetFlowsResponse {
	return &observerpb.GetFlowsResponse{
		NodeName: nodeTypes.GetAbsoluteNodeName(),
		Time:     timestamppb.New(time.Now()),
		ResponseTypes: &observerpb.GetFlowsResponse_NodeStatus{
			NodeStatus: &relaypb.NodeStatusEvent{
				StateChange: state,
				NodeNames:   nodeNames,
			},
		},
	}
}
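
// Illustrative sketch (not part of the original file): reporting peer state
// to clients through a flows channel. The flows channel and err value are
// hypothetical.
//
//	// announce which peers are connected or unavailable
//	flows <- nodeStatusEvent(relaypb.NodeState_NODE_CONNECTED, "node-1", "node-2")
//	flows <- nodeStatusEvent(relaypb.NodeState_NODE_UNAVAILABLE, "node-3")
//	// surface a per-peer failure as a NODE_ERROR status
//	flows <- nodeStatusError(err, "node-3")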

// aggregateErrors forwards responses to the returned channel, merging
// consecutive NODE_ERROR node status events that carry the same message
// within errorAggregationWindow into a single event.
func aggregateErrors(
	ctx context.Context,
	responses <-chan *observerpb.GetFlowsResponse,
	errorAggregationWindow time.Duration,
) <-chan *observerpb.GetFlowsResponse {
	aggregated := make(chan *observerpb.GetFlowsResponse, cap(responses))

	var flushPending <-chan time.Time
	var pendingResponse *observerpb.GetFlowsResponse

	go func() {
		defer close(aggregated)
	aggregateErrorsLoop:
		for {
			select {
			case response, ok := <-responses:
				if !ok {
					// flush any pending response before exiting
					if pendingResponse != nil {
						select {
						case aggregated <- pendingResponse:
						case <-ctx.Done():
						}
					}
					return
				}

				// any non-error responses are directly forwarded
				current := response.GetNodeStatus()
				if current.GetStateChange() != relaypb.NodeState_NODE_ERROR {
					select {
					case aggregated <- response:
						continue aggregateErrorsLoop
					case <-ctx.Done():
						return
					}
				}

				// either merge with pending or flush it
				if pending := pendingResponse.GetNodeStatus(); pending != nil {
					if current.GetMessage() == pending.GetMessage() {
						pending.NodeNames = append(pending.NodeNames, current.NodeNames...)
						continue aggregateErrorsLoop
					}

					select {
					case aggregated <- pendingResponse:
					case <-ctx.Done():
						return
					}
				}

				pendingResponse = response
				flushPending = inctimer.After(errorAggregationWindow)
			case <-flushPending:
				select {
				case aggregated <- pendingResponse:
					pendingResponse = nil
					flushPending = nil
				case <-ctx.Done():
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}()
	return aggregated
}
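
// A minimal usage sketch (not part of the original file): collapsing
// identical per-peer errors received within a short window into a single
// NodeStatusEvent. The window is an example value; ctx and err are
// hypothetical.
//
//	statuses := make(chan *observerpb.GetFlowsResponse, 16)
//	merged := aggregateErrors(ctx, statuses, 500*time.Millisecond)
//	statuses <- nodeStatusError(err, "node-1")
//	statuses <- nodeStatusError(err, "node-2") // same message: merged with node-1
//	close(statuses)
//	for resp := range merged {
//		// a single NODE_ERROR event listing both node names
//		_ = resp
//	}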

// sendFlowsResponse pumps responses from sortedFlows to the given gRPC stream
// until the channel is closed or ctx is done.
func sendFlowsResponse(ctx context.Context, stream observerpb.Observer_GetFlowsServer, sortedFlows <-chan *observerpb.GetFlowsResponse) error {
	for {
		select {
		case flow, ok := <-sortedFlows:
			if !ok {
				return nil
			}
			if err := stream.Send(flow); err != nil {
				return err
			}
		case <-ctx.Done():
			return nil
		}
	}
}

// newFlowCollector returns a flowCollector for the given request.
func newFlowCollector(req *observerpb.GetFlowsRequest, opts options) *flowCollector {
	fc := &flowCollector{
		log: opts.log,
		ocb: opts.ocb,

		req: req,

		connectedNodes: map[string]struct{}{},
	}
	return fc
}

// flowCollector tracks which peers a GetFlows request is connected to and
// streams flows from newly available peers.
type flowCollector struct {
	log logrus.FieldLogger
	ocb observerClientBuilder

	req *observerpb.GetFlowsRequest

	mu             lock.Mutex
	connectedNodes map[string]struct{}
}

// collect starts one goroutine per newly connected peer to stream its flows
// into the flows channel. It returns the names of connected and unavailable
// peers.
func (fc *flowCollector) collect(ctx context.Context, g *errgroup.Group, peers []poolTypes.Peer, flows chan *observerpb.GetFlowsResponse) ([]string, []string) {
	var connected, unavailable []string
	fc.mu.Lock()
	defer fc.mu.Unlock()
	for _, p := range peers {
		if _, ok := fc.connectedNodes[p.Name]; ok {
			connected = append(connected, p.Name)
			continue
		}
		if !isAvailable(p.Conn) {
			fc.log.WithField("address", p.Address).Infof(
				"No connection to peer %s, skipping", p.Name,
			)
			unavailable = append(unavailable, p.Name)
			continue
		}
		connected = append(connected, p.Name)
		fc.connectedNodes[p.Name] = struct{}{}
		g.Go(func() error {
			// retrieveFlowsFromPeer blocks until the peer finishes
			// the request by closing the connection, an error occurs,
			// or ctx expires.
			err := retrieveFlowsFromPeer(ctx, fc.ocb.observerClient(&p), fc.req, flows)
			if err != nil {
				fc.log.WithFields(logrus.Fields{
					"error": err,
					"peer":  p,
				}).Warning("Failed to retrieve flows from peer")
				fc.mu.Lock()
				delete(fc.connectedNodes, p.Name)
				fc.mu.Unlock()
				select {
				case flows <- nodeStatusError(err, p.Name):
				case <-ctx.Done():
				}
			}
			return nil
		})
	}
	return connected, unavailable
}
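
// A minimal usage sketch (not part of the original file): wiring the
// collector into a GetFlows handler loop. The ctx, req, opts, and peers
// values are hypothetical inputs supplied by the caller.
//
//	g, ctx := errgroup.WithContext(ctx)
//	flows := make(chan *observerpb.GetFlowsResponse, 128)
//	fc := newFlowCollector(req, opts)
//	_, unavailable := fc.collect(ctx, g, peers, flows)
//	for _, name := range unavailable {
//		// tell the client which peers could not be reached
//		flows <- nodeStatusEvent(relaypb.NodeState_NODE_UNAVAILABLE, name)
//	}
//	go func() {
//		g.Wait() // all peer streams finished
//		close(flows)
//	}()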