github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/state/netflow_pipeline.go (about) 1 package state 2 3 import ( 4 "bytes" 5 "context" 6 "encoding/binary" 7 "fmt" 8 "net/netip" 9 "time" 10 11 kubepb "github.com/castai/kvisor/api/v1/kube" 12 castpb "github.com/castai/kvisor/api/v1/runtime" 13 "github.com/castai/kvisor/pkg/ebpftracer/types" 14 "github.com/castai/kvisor/pkg/metrics" 15 "golang.org/x/sync/errgroup" 16 ) 17 18 type clusterInfo struct { 19 podCidr netip.Prefix 20 serviceCidr netip.Prefix 21 } 22 23 func (c *Controller) getClusterInfo(ctx context.Context) (*clusterInfo, error) { 24 for { 25 select { 26 case <-ctx.Done(): 27 return nil, ctx.Err() 28 default: 29 } 30 31 resp, err := c.kubeClient.GetClusterInfo(ctx, &kubepb.GetClusterInfoRequest{}) 32 if err != nil { 33 c.log.Warnf("getting cluster info: %v", err) 34 sleep(ctx, 2*time.Second) 35 continue 36 } 37 res := clusterInfo{} 38 res.podCidr, err = netip.ParsePrefix(resp.PodsCidr) 39 if err != nil { 40 return nil, err 41 } 42 res.serviceCidr, err = netip.ParsePrefix(resp.ServiceCidr) 43 if err != nil { 44 return nil, err 45 } 46 return &res, nil 47 } 48 } 49 50 func (c *Controller) runNetflowPipeline(ctx context.Context) error { 51 c.log.Info("running netflow pipeline") 52 defer c.log.Info("netflow pipeline done") 53 54 var err error 55 c.clusterInfo, err = c.getClusterInfo(ctx) 56 if err != nil { 57 return fmt.Errorf("get cluster info: %w", err) 58 } 59 c.log.Infof("fetched cluster info, pod_cidr=%s, cluster_cidr=%s", c.clusterInfo.podCidr, c.clusterInfo.serviceCidr) 60 61 errg, ctx := errgroup.WithContext(ctx) 62 errg.Go(func() error { 63 for { 64 select { 65 case <-ctx.Done(): 66 return ctx.Err() 67 case e := <-c.tracer.NetflowEvents(): 68 c.upsertNetflow(e) 69 } 70 } 71 }) 72 errg.Go(func() error { 73 t := time.NewTicker(c.cfg.NetflowCleanupInterval) 74 defer t.Stop() 75 for { 76 select { 77 case <-ctx.Done(): 78 return ctx.Err() 79 case <-t.C: 80 c.cleanupNetflow() 81 } 82 } 83 }) 84 return errg.Wait() 85 } 86 87 type netflowVal struct { 88 updatedAt time.Time 89 event *types.Event 90 destinations map[uint64]*netflowDest 91 } 92 93 type netflowDest struct { 94 addrPort netip.AddrPort 95 txBytes uint64 96 rxBytes uint64 97 txPackets uint64 98 rxPackets uint64 99 } 100 101 func (c *Controller) upsertNetflow(e *types.Event) { 102 c.netflowsMu.Lock() 103 defer c.netflowsMu.Unlock() 104 105 args := e.Args.(types.NetFlowBaseArgs) 106 key := c.netflowKey(e, &args) 107 netflow, found := c.netflows[key] 108 if !found { 109 netflow = &netflowVal{ 110 event: e, 111 destinations: map[uint64]*netflowDest{}, 112 } 113 c.netflows[key] = netflow 114 } 115 116 destKey := c.netflowDestKey(&args) 117 dest, found := netflow.destinations[destKey] 118 if !found { 119 dest = &netflowDest{ 120 addrPort: args.Tuple.Dst, 121 } 122 netflow.destinations[key] = dest 123 } 124 // Update stats 125 dest.txBytes += args.TxBytes 126 dest.rxBytes += args.RxBytes 127 dest.txPackets += args.TxPackets 128 dest.rxPackets += args.RxPackets 129 130 now := time.Now() 131 start := time.UnixMicro(int64(e.Context.Ts) / 1e3) 132 netflow.updatedAt = now 133 flowType := e.Context.GetNetflowType() 134 if now.Sub(start) >= c.cfg.NetflowExportInterval || flowType == types.NetflowTypeTCPBegin || flowType == types.NetflowTypeTCPEnd { 135 pbNetFlow := c.toProtoNetflow(netflow, &args, now) 136 for _, exp := range c.exporters.Netflow { 137 exp.Enqueue(pbNetFlow) 138 } 139 // Reset flow stats after export. 140 for _, flowDest := range netflow.destinations { 141 flowDest.txBytes = 0 142 flowDest.rxBytes = 0 143 flowDest.txPackets = 0 144 flowDest.rxPackets = 0 145 } 146 } 147 148 // Cleanup flow. 149 if flowType == types.NetflowTypeTCPEnd { 150 delete(c.netflows, key) 151 } 152 } 153 154 func (c *Controller) toProtoNetflow(flow *netflowVal, args *types.NetFlowBaseArgs, now time.Time) *castpb.Netflow { 155 ctx := flow.event.Context 156 cont := flow.event.Container 157 158 res := &castpb.Netflow{ 159 StartTs: ctx.Ts, 160 EndTs: uint64(now.UnixNano()), 161 ProcessName: string(bytes.TrimRight(ctx.Comm[:], "\x00")), 162 Namespace: cont.PodNamespace, 163 PodName: cont.PodName, 164 ContainerName: cont.Name, 165 Addr: args.Tuple.Src.Addr().AsSlice(), 166 Port: uint32(args.Tuple.Src.Port()), 167 Protocol: toProtoProtocol(args.Proto), 168 Destinations: make([]*castpb.NetflowDestination, 0, len(flow.destinations)), 169 } 170 171 c.enrichFlowKubeInfo(cont.PodUID, res) 172 173 for _, dest := range flow.destinations { 174 dst := dest.addrPort 175 dns := c.getAddrDnsQuestion(ctx.CgroupID, dst.Addr()) 176 177 if c.clusterInfo.serviceCidr.Contains(dst.Addr()) { 178 if realDst, found := c.ct.GetDestination(args.Tuple.Src, args.Tuple.Dst); found { 179 dst = realDst 180 } 181 } 182 183 pbDest := &castpb.NetflowDestination{ 184 DnsQuestion: dns, 185 Addr: dst.Addr().AsSlice(), 186 Port: uint32(dst.Port()), 187 TxBytes: dest.txBytes, 188 RxBytes: dest.rxBytes, 189 TxPackets: dest.txPackets, 190 RxPackets: dest.rxPackets, 191 } 192 193 c.enrichFlowDestinationKubeInfo(dst.Addr(), pbDest) 194 195 res.Destinations = append(res.Destinations, pbDest) 196 } 197 return res 198 } 199 200 func (c *Controller) enrichFlowKubeInfo(podID string, res *castpb.Netflow) { 201 ipInfo, found := c.getPodInfo(podID) 202 if !found { 203 return 204 } 205 res.WorkloadName = ipInfo.WorkloadName 206 res.WorkloadKind = ipInfo.WorkloadKind 207 res.Zone = ipInfo.Zone 208 } 209 210 func (c *Controller) enrichFlowDestinationKubeInfo(dstAddr netip.Addr, pbDest *castpb.NetflowDestination) { 211 if !c.clusterInfo.serviceCidr.Contains(dstAddr) && !c.clusterInfo.podCidr.Contains(dstAddr) { 212 return 213 } 214 215 ipInfo, found := c.getIPInfo(dstAddr) 216 if !found { 217 return 218 } 219 220 pbDest.PodName = ipInfo.PodName 221 pbDest.Namespace = ipInfo.Namespace 222 pbDest.WorkloadName = ipInfo.WorkloadName 223 pbDest.WorkloadKind = ipInfo.WorkloadKind 224 pbDest.Zone = ipInfo.Zone 225 } 226 227 func (c *Controller) getIPInfo(addr netip.Addr) (*kubepb.IPInfo, bool) { 228 ipInfo, found := c.ipInfoCache.Get(addr) 229 if !found { 230 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 231 defer cancel() 232 resp, err := c.kubeClient.GetIPInfo(ctx, &kubepb.GetIPInfoRequest{Ip: addr.Unmap().String()}) 233 if err != nil { 234 metrics.AgentFetchKubeIPInfoErrorsTotal.Inc() 235 return nil, false 236 } 237 ipInfo = resp.Info 238 c.ipInfoCache.Add(addr, ipInfo) 239 } 240 return ipInfo, true 241 } 242 243 func (c *Controller) getPodInfo(podID string) (*kubepb.Pod, bool) { 244 pod, found := c.podCache.Get(podID) 245 if !found { 246 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 247 defer cancel() 248 resp, err := c.kubeClient.GetPod(ctx, &kubepb.GetPodRequest{Uid: podID}) 249 if err != nil { 250 return nil, false 251 } 252 pod = resp.Pod 253 c.podCache.Add(podID, pod) 254 } 255 return pod, true 256 } 257 258 func (c *Controller) cleanupNetflow() { 259 c.netflowsMu.Lock() 260 defer c.netflowsMu.Unlock() 261 262 now := time.Now() 263 var totalRemoved int 264 for key, flow := range c.netflows { 265 lastFlowUpdate := now.Sub(flow.updatedAt) 266 if lastFlowUpdate >= c.cfg.NetflowExportInterval*2 { 267 totalRemoved++ 268 delete(c.netflows, key) 269 } 270 } 271 c.log.Debugf("removed expired netflow flows, count=%d", totalRemoved) 272 } 273 274 func (c *Controller) netflowKey(e *types.Event, args *types.NetFlowBaseArgs) uint64 { 275 c.netflowKeyHash.Reset() 276 277 // Cgroup id. 278 var cgroup [8]byte 279 binary.LittleEndian.PutUint64(cgroup[:], e.Context.CgroupID) 280 _, _ = c.netflowKeyHash.Write(cgroup[:]) 281 282 // Pid. 283 var pid [4]byte 284 binary.LittleEndian.PutUint32(cgroup[:], e.Context.HostPid) 285 _, _ = c.netflowKeyHash.Write(pid[:]) 286 287 // Source addr+port. 288 srcBytes, _ := args.Tuple.Src.MarshalBinary() 289 _, _ = c.netflowKeyHash.Write(srcBytes) 290 291 // Protocol. 292 _ = c.netflowKeyHash.WriteByte(args.Proto) 293 294 return c.netflowKeyHash.Sum64() 295 } 296 297 func (c *Controller) netflowDestKey(args *types.NetFlowBaseArgs) uint64 { 298 c.netflowDestKeyHash.Reset() 299 300 // Destination addr+port. 301 srcBytes, _ := args.Tuple.Dst.MarshalBinary() 302 _, _ = c.netflowKeyHash.Write(srcBytes) 303 304 return c.netflowKeyHash.Sum64() 305 } 306 307 func toProtoProtocol(proto uint8) castpb.NetflowProtocol { 308 switch proto { 309 case 6: 310 return castpb.NetflowProtocol_NETFLOW_PROTOCOL_TCP 311 default: 312 return castpb.NetflowProtocol_NETFLOW_PROTOCOL_UNKNOWN 313 } 314 } 315 316 func sleep(ctx context.Context, timeout time.Duration) { 317 t := time.NewTimer(timeout) 318 defer t.Stop() 319 select { 320 case <-t.C: 321 case <-ctx.Done(): 322 } 323 }