github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/state/netflow_pipeline.go (about)

     1  package state
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/binary"
     7  	"fmt"
     8  	"net/netip"
     9  	"time"
    10  
    11  	kubepb "github.com/castai/kvisor/api/v1/kube"
    12  	castpb "github.com/castai/kvisor/api/v1/runtime"
    13  	"github.com/castai/kvisor/pkg/ebpftracer/types"
    14  	"github.com/castai/kvisor/pkg/metrics"
    15  	"golang.org/x/sync/errgroup"
    16  )
    17  
    18  type clusterInfo struct {
    19  	podCidr     netip.Prefix
    20  	serviceCidr netip.Prefix
    21  }
    22  
    23  func (c *Controller) getClusterInfo(ctx context.Context) (*clusterInfo, error) {
    24  	for {
    25  		select {
    26  		case <-ctx.Done():
    27  			return nil, ctx.Err()
    28  		default:
    29  		}
    30  
    31  		resp, err := c.kubeClient.GetClusterInfo(ctx, &kubepb.GetClusterInfoRequest{})
    32  		if err != nil {
    33  			c.log.Warnf("getting cluster info: %v", err)
    34  			sleep(ctx, 2*time.Second)
    35  			continue
    36  		}
    37  		res := clusterInfo{}
    38  		res.podCidr, err = netip.ParsePrefix(resp.PodsCidr)
    39  		if err != nil {
    40  			return nil, err
    41  		}
    42  		res.serviceCidr, err = netip.ParsePrefix(resp.ServiceCidr)
    43  		if err != nil {
    44  			return nil, err
    45  		}
    46  		return &res, nil
    47  	}
    48  }
    49  
    50  func (c *Controller) runNetflowPipeline(ctx context.Context) error {
    51  	c.log.Info("running netflow pipeline")
    52  	defer c.log.Info("netflow pipeline done")
    53  
    54  	var err error
    55  	c.clusterInfo, err = c.getClusterInfo(ctx)
    56  	if err != nil {
    57  		return fmt.Errorf("get cluster info: %w", err)
    58  	}
    59  	c.log.Infof("fetched cluster info, pod_cidr=%s, cluster_cidr=%s", c.clusterInfo.podCidr, c.clusterInfo.serviceCidr)
    60  
    61  	errg, ctx := errgroup.WithContext(ctx)
    62  	errg.Go(func() error {
    63  		for {
    64  			select {
    65  			case <-ctx.Done():
    66  				return ctx.Err()
    67  			case e := <-c.tracer.NetflowEvents():
    68  				c.upsertNetflow(e)
    69  			}
    70  		}
    71  	})
    72  	errg.Go(func() error {
    73  		t := time.NewTicker(c.cfg.NetflowCleanupInterval)
    74  		defer t.Stop()
    75  		for {
    76  			select {
    77  			case <-ctx.Done():
    78  				return ctx.Err()
    79  			case <-t.C:
    80  				c.cleanupNetflow()
    81  			}
    82  		}
    83  	})
    84  	return errg.Wait()
    85  }
    86  
// netflowVal is the in-memory state kept for one flow entry in
// Controller.netflows.
type netflowVal struct {
	// updatedAt is the wall-clock time of the most recent upsertNetflow call
	// for this flow; cleanupNetflow compares it against the export interval
	// to expire idle flows.
	updatedAt    time.Time
	// event is the event that created this entry (set once, when the flow
	// key is first seen); its context and container feed the exported proto.
	event        *types.Event
	// destinations holds per-destination traffic counters keyed by a uint64
	// hash.
	destinations map[uint64]*netflowDest
}
    92  
    93  type netflowDest struct {
    94  	addrPort  netip.AddrPort
    95  	txBytes   uint64
    96  	rxBytes   uint64
    97  	txPackets uint64
    98  	rxPackets uint64
    99  }
   100  
   101  func (c *Controller) upsertNetflow(e *types.Event) {
   102  	c.netflowsMu.Lock()
   103  	defer c.netflowsMu.Unlock()
   104  
   105  	args := e.Args.(types.NetFlowBaseArgs)
   106  	key := c.netflowKey(e, &args)
   107  	netflow, found := c.netflows[key]
   108  	if !found {
   109  		netflow = &netflowVal{
   110  			event:        e,
   111  			destinations: map[uint64]*netflowDest{},
   112  		}
   113  		c.netflows[key] = netflow
   114  	}
   115  
   116  	destKey := c.netflowDestKey(&args)
   117  	dest, found := netflow.destinations[destKey]
   118  	if !found {
   119  		dest = &netflowDest{
   120  			addrPort: args.Tuple.Dst,
   121  		}
   122  		netflow.destinations[key] = dest
   123  	}
   124  	// Update stats
   125  	dest.txBytes += args.TxBytes
   126  	dest.rxBytes += args.RxBytes
   127  	dest.txPackets += args.TxPackets
   128  	dest.rxPackets += args.RxPackets
   129  
   130  	now := time.Now()
   131  	start := time.UnixMicro(int64(e.Context.Ts) / 1e3)
   132  	netflow.updatedAt = now
   133  	flowType := e.Context.GetNetflowType()
   134  	if now.Sub(start) >= c.cfg.NetflowExportInterval || flowType == types.NetflowTypeTCPBegin || flowType == types.NetflowTypeTCPEnd {
   135  		pbNetFlow := c.toProtoNetflow(netflow, &args, now)
   136  		for _, exp := range c.exporters.Netflow {
   137  			exp.Enqueue(pbNetFlow)
   138  		}
   139  		// Reset flow stats after export.
   140  		for _, flowDest := range netflow.destinations {
   141  			flowDest.txBytes = 0
   142  			flowDest.rxBytes = 0
   143  			flowDest.txPackets = 0
   144  			flowDest.rxPackets = 0
   145  		}
   146  	}
   147  
   148  	// Cleanup flow.
   149  	if flowType == types.NetflowTypeTCPEnd {
   150  		delete(c.netflows, key)
   151  	}
   152  }
   153  
   154  func (c *Controller) toProtoNetflow(flow *netflowVal, args *types.NetFlowBaseArgs, now time.Time) *castpb.Netflow {
   155  	ctx := flow.event.Context
   156  	cont := flow.event.Container
   157  
   158  	res := &castpb.Netflow{
   159  		StartTs:       ctx.Ts,
   160  		EndTs:         uint64(now.UnixNano()),
   161  		ProcessName:   string(bytes.TrimRight(ctx.Comm[:], "\x00")),
   162  		Namespace:     cont.PodNamespace,
   163  		PodName:       cont.PodName,
   164  		ContainerName: cont.Name,
   165  		Addr:          args.Tuple.Src.Addr().AsSlice(),
   166  		Port:          uint32(args.Tuple.Src.Port()),
   167  		Protocol:      toProtoProtocol(args.Proto),
   168  		Destinations:  make([]*castpb.NetflowDestination, 0, len(flow.destinations)),
   169  	}
   170  
   171  	c.enrichFlowKubeInfo(cont.PodUID, res)
   172  
   173  	for _, dest := range flow.destinations {
   174  		dst := dest.addrPort
   175  		dns := c.getAddrDnsQuestion(ctx.CgroupID, dst.Addr())
   176  
   177  		if c.clusterInfo.serviceCidr.Contains(dst.Addr()) {
   178  			if realDst, found := c.ct.GetDestination(args.Tuple.Src, args.Tuple.Dst); found {
   179  				dst = realDst
   180  			}
   181  		}
   182  
   183  		pbDest := &castpb.NetflowDestination{
   184  			DnsQuestion: dns,
   185  			Addr:        dst.Addr().AsSlice(),
   186  			Port:        uint32(dst.Port()),
   187  			TxBytes:     dest.txBytes,
   188  			RxBytes:     dest.rxBytes,
   189  			TxPackets:   dest.txPackets,
   190  			RxPackets:   dest.rxPackets,
   191  		}
   192  
   193  		c.enrichFlowDestinationKubeInfo(dst.Addr(), pbDest)
   194  
   195  		res.Destinations = append(res.Destinations, pbDest)
   196  	}
   197  	return res
   198  }
   199  
   200  func (c *Controller) enrichFlowKubeInfo(podID string, res *castpb.Netflow) {
   201  	ipInfo, found := c.getPodInfo(podID)
   202  	if !found {
   203  		return
   204  	}
   205  	res.WorkloadName = ipInfo.WorkloadName
   206  	res.WorkloadKind = ipInfo.WorkloadKind
   207  	res.Zone = ipInfo.Zone
   208  }
   209  
   210  func (c *Controller) enrichFlowDestinationKubeInfo(dstAddr netip.Addr, pbDest *castpb.NetflowDestination) {
   211  	if !c.clusterInfo.serviceCidr.Contains(dstAddr) && !c.clusterInfo.podCidr.Contains(dstAddr) {
   212  		return
   213  	}
   214  
   215  	ipInfo, found := c.getIPInfo(dstAddr)
   216  	if !found {
   217  		return
   218  	}
   219  
   220  	pbDest.PodName = ipInfo.PodName
   221  	pbDest.Namespace = ipInfo.Namespace
   222  	pbDest.WorkloadName = ipInfo.WorkloadName
   223  	pbDest.WorkloadKind = ipInfo.WorkloadKind
   224  	pbDest.Zone = ipInfo.Zone
   225  }
   226  
   227  func (c *Controller) getIPInfo(addr netip.Addr) (*kubepb.IPInfo, bool) {
   228  	ipInfo, found := c.ipInfoCache.Get(addr)
   229  	if !found {
   230  		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
   231  		defer cancel()
   232  		resp, err := c.kubeClient.GetIPInfo(ctx, &kubepb.GetIPInfoRequest{Ip: addr.Unmap().String()})
   233  		if err != nil {
   234  			metrics.AgentFetchKubeIPInfoErrorsTotal.Inc()
   235  			return nil, false
   236  		}
   237  		ipInfo = resp.Info
   238  		c.ipInfoCache.Add(addr, ipInfo)
   239  	}
   240  	return ipInfo, true
   241  }
   242  
   243  func (c *Controller) getPodInfo(podID string) (*kubepb.Pod, bool) {
   244  	pod, found := c.podCache.Get(podID)
   245  	if !found {
   246  		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
   247  		defer cancel()
   248  		resp, err := c.kubeClient.GetPod(ctx, &kubepb.GetPodRequest{Uid: podID})
   249  		if err != nil {
   250  			return nil, false
   251  		}
   252  		pod = resp.Pod
   253  		c.podCache.Add(podID, pod)
   254  	}
   255  	return pod, true
   256  }
   257  
   258  func (c *Controller) cleanupNetflow() {
   259  	c.netflowsMu.Lock()
   260  	defer c.netflowsMu.Unlock()
   261  
   262  	now := time.Now()
   263  	var totalRemoved int
   264  	for key, flow := range c.netflows {
   265  		lastFlowUpdate := now.Sub(flow.updatedAt)
   266  		if lastFlowUpdate >= c.cfg.NetflowExportInterval*2 {
   267  			totalRemoved++
   268  			delete(c.netflows, key)
   269  		}
   270  	}
   271  	c.log.Debugf("removed expired netflow flows, count=%d", totalRemoved)
   272  }
   273  
   274  func (c *Controller) netflowKey(e *types.Event, args *types.NetFlowBaseArgs) uint64 {
   275  	c.netflowKeyHash.Reset()
   276  
   277  	// Cgroup id.
   278  	var cgroup [8]byte
   279  	binary.LittleEndian.PutUint64(cgroup[:], e.Context.CgroupID)
   280  	_, _ = c.netflowKeyHash.Write(cgroup[:])
   281  
   282  	// Pid.
   283  	var pid [4]byte
   284  	binary.LittleEndian.PutUint32(cgroup[:], e.Context.HostPid)
   285  	_, _ = c.netflowKeyHash.Write(pid[:])
   286  
   287  	// Source addr+port.
   288  	srcBytes, _ := args.Tuple.Src.MarshalBinary()
   289  	_, _ = c.netflowKeyHash.Write(srcBytes)
   290  
   291  	// Protocol.
   292  	_ = c.netflowKeyHash.WriteByte(args.Proto)
   293  
   294  	return c.netflowKeyHash.Sum64()
   295  }
   296  
   297  func (c *Controller) netflowDestKey(args *types.NetFlowBaseArgs) uint64 {
   298  	c.netflowDestKeyHash.Reset()
   299  
   300  	// Destination addr+port.
   301  	srcBytes, _ := args.Tuple.Dst.MarshalBinary()
   302  	_, _ = c.netflowKeyHash.Write(srcBytes)
   303  
   304  	return c.netflowKeyHash.Sum64()
   305  }
   306  
   307  func toProtoProtocol(proto uint8) castpb.NetflowProtocol {
   308  	switch proto {
   309  	case 6:
   310  		return castpb.NetflowProtocol_NETFLOW_PROTOCOL_TCP
   311  	default:
   312  		return castpb.NetflowProtocol_NETFLOW_PROTOCOL_UNKNOWN
   313  	}
   314  }
   315  
   316  func sleep(ctx context.Context, timeout time.Duration) {
   317  	t := time.NewTimer(timeout)
   318  	defer t.Stop()
   319  	select {
   320  	case <-t.C:
   321  	case <-ctx.Done():
   322  	}
   323  }