github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/cmd/agent/daemon/state/container_stats_pipeline.go

package state

import (
	"context"
	"errors"
	"math"
	"time"

	castpb "github.com/castai/kvisor/api/v1/runtime"
	"github.com/castai/kvisor/cmd/agent/daemon/netstats"
	"github.com/castai/kvisor/pkg/containers"
	"github.com/castai/kvisor/pkg/ebpftracer"
	"github.com/castai/kvisor/pkg/stats"
	"github.com/samber/lo"
	"k8s.io/apimachinery/pkg/api/resource"
)

// runContainerStatsPipeline periodically scrapes resource and syscall stats
// for all known containers and enqueues the resulting batch to the exporters.
func (c *Controller) runContainerStatsPipeline(ctx context.Context) error {
	c.log.Info("running container stats pipeline")
	defer c.log.Info("container stats pipeline done")

	ticker := time.NewTicker(c.cfg.ContainerStatsScrapeInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			batch := &castpb.ContainerStatsBatch{}
			c.scrapeContainersResourceStats(batch)
			c.scrapeContainersSyscallStats(ctx, batch)
			if len(batch.Items) > 0 {
				for _, exp := range c.exporters.ContainerStats {
					exp.Enqueue(batch)
				}
			}
		}
	}
}

func (c *Controller) scrapeContainersResourceStats(batch *castpb.ContainerStatsBatch) {
	for _, cont := range c.containersClient.ListContainers() {
		if cont.Name == "" {
			// We ignore containers that do not have a name, as they are likely just pause containers.
			continue
		}

		if len(cont.PIDs) == 0 {
			continue
		}

		if c.IsMutedNamespace(cont.PodNamespace) {
			continue
		}

		now := time.Now().UTC()
		cpu, err := c.containersClient.GetCgroupCpuStats(cont)
		if err != nil {
			// TODO: Metrics
			continue
		}
		mem, err := c.containersClient.GetCgroupMemoryStats(cont)
		if err != nil {
			// TODO: Metrics
			continue
		}

		netStats, err := c.netStatsReader.Read(cont.PIDs[0])
		if err != nil {
			// TODO: Add metrics or handle this better; we sometimes get 'no such file or directory'.
			continue
		}
		mainNicStats, _ := lo.Find(netStats, func(item netstats.InterfaceStats) bool {
			return item.Name == "eth0"
		})

		currScrape := &resourcesStatsScrapePoint{
			ts:       now,
			cpuStat:  cpu,
			memStats: mem,
			netStats: &mainNicStats,
		}

		// We need at least 2 scrapes to calculate the diff count.
		c.resourcesStatsScrapePointsMu.RLock()
		prevScrape, found := c.resourcesStatsScrapePoints[cont.CgroupID]
		c.resourcesStatsScrapePointsMu.RUnlock()
		if !found {
			c.resourcesStatsScrapePointsMu.Lock()
			c.resourcesStatsScrapePoints[cont.CgroupID] = currScrape
			c.resourcesStatsScrapePointsMu.Unlock()
			continue
		}

		pbStats := c.collectContainerResourcesStats(prevScrape, currScrape)
		if len(pbStats) > 0 {
			batch.Items = append(batch.Items, &castpb.ContainerStats{
				Namespace:     cont.PodNamespace,
				PodName:       cont.PodName,
				ContainerName: cont.Name,
				PodUid:        cont.PodUID,
				ContainerId:   cont.ID,
				Stats:         pbStats,
			})
		}

		prevScrape.ts = currScrape.ts
		prevScrape.cpuStat = currScrape.cpuStat
		prevScrape.memStats = currScrape.memStats
		prevScrape.netStats = currScrape.netStats
	}
}

func (c *Controller) collectContainerResourcesStats(prev, curr *resourcesStatsScrapePoint) []*castpb.Stats {
	// TODO(Kvisord): Add io stats.
	var pbStats []*castpb.Stats

	// CPU stats. Stored as CPU millicores used during this scrape period.
	// Stored values can be used directly with `avg(value)`.
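	// The per-second deltas below are scaled to nanocores (1e9) and wrapped in
	// a resource.Quantity with scale -9, so MilliValue() yields millicores.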
	window := curr.ts.Sub(prev.ts)
	cpuUsage := uint64Quantity(uint64((curr.cpuStat.UsageSeconds-prev.cpuStat.UsageSeconds)/window.Seconds()*1e9), resource.DecimalSI, -9)
	cpuThrottled := uint64Quantity(uint64((curr.cpuStat.ThrottledTimeSeconds-prev.cpuStat.ThrottledTimeSeconds)/window.Seconds()*1e9), resource.DecimalSI, -9)
	if v := cpuUsage.MilliValue(); v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_CPU,
			Subgroup: stats.SubgroupCPUUsage,
			Value:    float64(v),
		})
	}
	if v := cpuThrottled.MilliValue(); v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_CPU,
			Subgroup: stats.SubgroupCPUThrottled,
			Value:    float64(v),
		})
	}

	// Memory stats.
	if v := curr.memStats.RSS; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_MEMORY,
			Subgroup: stats.SubgroupMemoryUsage,
			Value:    float64(v),
		})
	}
	if v := curr.memStats.Limit; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_MEMORY,
			Subgroup: stats.SubgroupMemoryLimit,
			Value:    float64(v),
		})
	}

	// Network stats. Saved as deltas.
	// Stored values can be converted to a Prometheus-style rate with `sum(value)/60`
	// when the scrape period is one minute.
	if v := curr.netStats.TxBytes - prev.netStats.TxBytes; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_NET,
			Subgroup: stats.SubgroupNetworkTxBytes,
			Value:    float64(v),
		})
	}
	if v := curr.netStats.TxDropped - prev.netStats.TxDropped; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_NET,
			Subgroup: stats.SubgroupNetworkTxDropped,
			Value:    float64(v),
		})
	}
	if v := curr.netStats.RxBytes - prev.netStats.RxBytes; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_NET,
			Subgroup: stats.SubgroupNetworkRxBytes,
			Value:    float64(v),
		})
	}
	if v := curr.netStats.RxDropped - prev.netStats.RxDropped; v > 0 {
		pbStats = append(pbStats, &castpb.Stats{
			Group:    castpb.StatsGroup_STATS_GROUP_NET,
			Subgroup: stats.SubgroupNetworkRxDropped,
			Value:    float64(v),
		})
	}

	return pbStats
}

func (c *Controller) scrapeContainersSyscallStats(ctx context.Context, batch *castpb.ContainerStatsBatch) {
	st, err := c.tracer.ReadSyscallStats()
	if err != nil {
		c.log.Errorf("reading syscall stats from kernel: %v", err)
		return
	}

	for cgroupID, syscallStats := range st {
		cont, err := c.containersClient.GetContainerForCgroup(ctx, uint64(cgroupID))
		if err != nil {
			if !errors.Is(err, containers.ErrContainerNotFound) {
				c.log.Errorf("getting container: %v", err)
			}
			continue
		}

		if c.IsMutedNamespace(cont.PodNamespace) {
			continue
		}

		// We need at least 2 scrapes to calculate the diff count.
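		// On the first scrape for a cgroup there is nothing to diff against:
		// the counters below are only stored as a baseline, and stats for this
		// container are first emitted on the following scrape.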
		c.syscallScrapePointsMu.RLock()
		prevScrape, found := c.syscallScrapePoints[cont.CgroupID]
		c.syscallScrapePointsMu.RUnlock()
		if !found {
			syscalls := make(map[ebpftracer.SyscallID]uint64, len(syscallStats))
			for _, v := range syscallStats {
				syscalls[v.ID] = v.Count
			}
			c.syscallScrapePointsMu.Lock()
			c.syscallScrapePoints[cont.CgroupID] = &syscallScrapePoint{
				syscalls: syscalls,
			}
			c.syscallScrapePointsMu.Unlock()
			continue
		}

		cgStats := &castpb.ContainerStats{
			Namespace:     cont.PodNamespace,
			PodName:       cont.PodName,
			ContainerName: cont.Name,
			PodUid:        cont.PodUID,
			ContainerId:   cont.ID,
		}
		for _, stat := range syscallStats {
			prevValue := prevScrape.syscalls[stat.ID]
			currValue := float64(stat.Count - prevValue)
			if currValue == 0 {
				continue
			}
			cgStats.Stats = append(cgStats.Stats, &castpb.Stats{
				Group:    castpb.StatsGroup_STATS_GROUP_SYSCALL,
				Subgroup: uint32(stat.ID),
				Value:    currValue,
			})
		}
		if len(cgStats.Stats) > 0 {
			batch.Items = append(batch.Items, cgStats)
		}

		syscalls := make(map[ebpftracer.SyscallID]uint64, len(syscallStats))
		for _, v := range syscallStats {
			syscalls[v.ID] = v.Count
		}
		prevScrape.syscalls = syscalls
	}
}

// uint64Quantity converts a uint64 value into a resource.Quantity with the
// given format and scale. Values that do not fit into int64 lose one decimal
// digit of precision: they are divided by 10 and the scale is bumped by one.
func uint64Quantity(val uint64, format resource.Format, scale resource.Scale) resource.Quantity {
	q := *resource.NewScaledQuantity(int64(val), scale)
	if val > math.MaxInt64 {
		q = *resource.NewScaledQuantity(int64(val/10), resource.Scale(1)+scale)
	}
	q.Format = format
	return q
}
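// A worked example with hypothetical values: uint64Quantity(1_500_000_000, resource.DecimalSI, -9)
// yields a quantity of 1.5, so MilliValue() returns 1500; the CPU path above
// reports this as 1500 millicores.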