github.com/cilium/cilium@v1.16.2/pkg/maps/metricsmap/metricsmap.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package metricsmap 5 6 import ( 7 "unsafe" 8 9 "github.com/cilium/hive/cell" 10 "github.com/prometheus/client_golang/prometheus" 11 12 "github.com/cilium/cilium/pkg/ebpf" 13 "github.com/cilium/cilium/pkg/lock" 14 "github.com/cilium/cilium/pkg/logging" 15 "github.com/cilium/cilium/pkg/logging/logfields" 16 "github.com/cilium/cilium/pkg/metrics" 17 monitorAPI "github.com/cilium/cilium/pkg/monitor/api" 18 ) 19 20 var Cell = cell.Module( 21 "metricsmap", 22 "eBPF Metrics Map", 23 cell.Invoke(RegisterCollector), 24 ) 25 26 // IterateCallback represents the signature of the callback function expected by 27 // the IterateWithCallback method, which in turn is used to iterate all the 28 // keys/values of a metrics map. 29 type IterateCallback func(*Key, *Values) 30 31 // MetricsMap interface represents a metrics map, and can be reused to implement 32 // mock maps for unit tests. 33 type MetricsMap interface { 34 IterateWithCallback(IterateCallback) error 35 } 36 37 type metricsMap struct { 38 *ebpf.Map 39 } 40 41 var ( 42 // Metrics is the bpf metrics map 43 Metrics = metricsMap{ebpf.NewMap(&ebpf.MapSpec{ 44 Name: MapName, 45 Type: ebpf.PerCPUHash, 46 KeySize: uint32(unsafe.Sizeof(Key{})), 47 ValueSize: uint32(unsafe.Sizeof(Value{})), 48 MaxEntries: MaxEntries, 49 Pinning: ebpf.PinByName, 50 })} 51 log = logging.DefaultLogger.WithField(logfields.LogSubsys, "map-metrics") 52 ) 53 54 const ( 55 // MapName for metrics map. 56 MapName = "cilium_metrics" 57 // MaxEntries is the maximum number of keys that can be present in the 58 // Metrics Map. 59 // 60 // Currently max. 2 bits of the Key.Dir member are used (unknown, 61 // ingress or egress). Thus we can reduce from the theoretical max. size 62 // of 2**16 (2 uint8) to 2**10 (1 uint8 + 2 bits). 63 MaxEntries = 1024 64 // dirIngress and dirEgress values should match with 65 // METRIC_INGRESS, METRIC_EGRESS and METRIC_SERVICE 66 // in bpf/lib/common.h 67 dirUnknown = 0 68 dirIngress = 1 69 dirEgress = 2 70 dirService = 3 71 ) 72 73 // direction is the metrics direction i.e ingress (to an endpoint), 74 // egress (from an endpoint) or service (NodePort service being accessed from 75 // outside or a ClusterIP service being accessed from inside the cluster). 76 // If it's none of the above, we return UNKNOWN direction. 77 var direction = map[uint8]string{ 78 dirUnknown: "UNKNOWN", 79 dirIngress: "INGRESS", 80 dirEgress: "EGRESS", 81 dirService: "SERVICE", 82 } 83 84 // Key must be in sync with struct metrics_key in <bpf/lib/common.h> 85 type Key struct { 86 Reason uint8 `align:"reason"` 87 Dir uint8 `align:"dir"` 88 // Line contains the line number of the metrics statement. 89 Line uint16 `align:"line"` 90 // File is the number of the source file containing the metrics statement. 91 File uint8 `align:"file"` 92 Reserved [3]uint8 `align:"reserved"` 93 } 94 95 // Value must be in sync with struct metrics_value in <bpf/lib/common.h> 96 type Value struct { 97 Count uint64 `align:"count"` 98 Bytes uint64 `align:"bytes"` 99 } 100 101 // Values is a slice of Values 102 type Values []Value 103 104 // IterateWithCallback iterates through all the keys/values of a metrics map, 105 // passing each key/value pair to the cb callback 106 func (m metricsMap) IterateWithCallback(cb IterateCallback) error { 107 return m.Map.IterateWithCallback(&Key{}, &Values{}, func(k, v interface{}) { 108 key := k.(*Key) 109 values := v.(*Values) 110 cb(key, values) 111 }) 112 } 113 114 // MetricDirection gets the direction in human readable string format 115 func MetricDirection(dir uint8) string { 116 if desc, ok := direction[dir]; ok { 117 return desc 118 } 119 return direction[dirUnknown] 120 } 121 122 // Direction gets the direction in human readable string format 123 func (k *Key) Direction() string { 124 return MetricDirection(k.Dir) 125 } 126 127 // DropForwardReason gets the forwarded/dropped reason in human readable string format 128 func (k *Key) DropForwardReason() string { 129 return monitorAPI.DropReason(k.Reason) 130 } 131 132 // FileName returns the filename where the event occurred, in string format. 133 func (k *Key) FileName() string { 134 return monitorAPI.BPFFileName(k.File) 135 } 136 137 // IsDrop checks if the reason is drop or not. 138 func (k *Key) IsDrop() bool { 139 return k.Reason == monitorAPI.DropInvalid || k.Reason >= monitorAPI.DropMin 140 } 141 142 // Count returns the sum of all the per-CPU count values 143 func (vs Values) Count() uint64 { 144 c := uint64(0) 145 for _, v := range vs { 146 c += v.Count 147 } 148 149 return c 150 } 151 152 // Bytes returns the sum of all the per-CPU bytes values 153 func (vs Values) Bytes() uint64 { 154 b := uint64(0) 155 for _, v := range vs { 156 b += v.Bytes 157 } 158 159 return b 160 } 161 162 // metricsMapCollector implements Prometheus Collector interface 163 type metricsmapCollector struct { 164 mutex lock.Mutex 165 166 droppedCountDesc *prometheus.Desc 167 droppedByteDesc *prometheus.Desc 168 forwardCountDesc *prometheus.Desc 169 forwardByteDesc *prometheus.Desc 170 } 171 172 func newMetricsMapCollector() prometheus.Collector { 173 return &metricsmapCollector{ 174 droppedByteDesc: prometheus.NewDesc( 175 prometheus.BuildFQName(metrics.Namespace, "", "drop_bytes_total"), 176 "Total dropped bytes, tagged by drop reason and ingress/egress direction", 177 []string{metrics.LabelDropReason, metrics.LabelDirection}, nil, 178 ), 179 droppedCountDesc: prometheus.NewDesc( 180 prometheus.BuildFQName(metrics.Namespace, "", "drop_count_total"), 181 "Total dropped packets, tagged by drop reason and ingress/egress direction", 182 []string{metrics.LabelDropReason, metrics.LabelDirection}, nil, 183 ), 184 forwardCountDesc: prometheus.NewDesc( 185 prometheus.BuildFQName(metrics.Namespace, "", "forward_count_total"), 186 "Total forwarded packets, tagged by ingress/egress direction", 187 []string{metrics.LabelDirection}, nil, 188 ), 189 forwardByteDesc: prometheus.NewDesc( 190 prometheus.BuildFQName(metrics.Namespace, "", "forward_bytes_total"), 191 "Total forwarded bytes, tagged by ingress/egress direction", 192 []string{metrics.LabelDirection}, nil, 193 ), 194 } 195 } 196 197 type forwardLabels struct { 198 direction string 199 } 200 201 type dropLabels struct { 202 direction string 203 reason string 204 } 205 206 type metricValues struct { 207 bytes float64 208 count float64 209 } 210 211 type labels comparable 212 213 // promMetrics is used to sum values by a desired set of labels for both 214 // forwarded and dropped metrics. 215 type promMetrics[k labels] map[k]*metricValues 216 217 // sum accumulates a value for the given label set k and stores it in p. Can be 218 // called multiple times with the same label set. 219 // 220 // values is a row from the metrics map, a per-cpu data structure. All entries 221 // in the row are summed, and the result is added to any preexisting values 222 // belonging to the label set. 223 func (p promMetrics[k]) sum(labels k, values *Values) { 224 if v, ok := p[labels]; ok { 225 v.bytes += float64(values.Bytes()) 226 v.count += float64(values.Count()) 227 return 228 } 229 230 p[labels] = &metricValues{ 231 bytes: float64(values.Bytes()), 232 count: float64(values.Count()), 233 } 234 } 235 236 func (mc *metricsmapCollector) Collect(ch chan<- prometheus.Metric) { 237 mc.mutex.Lock() 238 defer mc.mutex.Unlock() 239 240 // The datapath knows many reasons for forwarding or dropping a packet. All 241 // packet metrics carry a direction label, and forwarded packets can carry 242 // either the 'success' or 'interface' forward reason depending on where it 243 // came in. 244 // 245 // Drop metrics carry direction and one of many possible drop reasons. 246 // 247 // Since Cilium 1.16, the underlying metrics map contains line/file 248 // information for all metrics to enable troubleshooting. We don't expose 249 // these as labels through the /metrics endpoint to keep cardinality low and 250 // to avoid breaking user queries and recording rules. `cilium-dbg bpf metrics 251 // list` always shows all properties and is included in sysdumps. 252 // 253 // The code below first generates a label set, typically a subset of the 254 // members of the metrics key, and sums up all byte/packet counters matching 255 // the label set. This accounts for future versions of Cilium adding new 256 // fields, causing surprising behaviour without the summing logic in place in 257 // case the agent is downgraded. From the perspective of the downgraded agent, 258 // this will cause multiple identical metrics to appear with different values. 259 // The Prometheus library rejects metrics with duplicate label sets. 260 261 drop := make(promMetrics[dropLabels]) 262 fwd := make(promMetrics[forwardLabels]) 263 264 err := Metrics.IterateWithCallback(func(key *Key, values *Values) { 265 if key.IsDrop() { 266 labelSet := dropLabels{ 267 direction: key.Direction(), 268 reason: key.DropForwardReason(), 269 } 270 drop.sum(labelSet, values) 271 272 return 273 } 274 275 labelSet := forwardLabels{ 276 direction: key.Direction(), 277 } 278 fwd.sum(labelSet, values) 279 }) 280 if err != nil { 281 log.WithError(err).Warn("Failed to read metrics from BPF map") 282 // Do not update partial metrics 283 return 284 } 285 286 for labels, value := range fwd { 287 mc.updateCounterMetric(mc.forwardCountDesc, ch, value.count, labels.direction) 288 mc.updateCounterMetric(mc.forwardByteDesc, ch, value.bytes, labels.direction) 289 } 290 291 for labels, value := range drop { 292 mc.updateCounterMetric(mc.droppedCountDesc, ch, value.count, labels.reason, labels.direction) 293 mc.updateCounterMetric(mc.droppedByteDesc, ch, value.bytes, labels.reason, labels.direction) 294 } 295 } 296 297 func (mc *metricsmapCollector) updateCounterMetric(desc *prometheus.Desc, metricsChan chan<- prometheus.Metric, value float64, labelValues ...string) { 298 metricsChan <- prometheus.MustNewConstMetric( 299 desc, 300 prometheus.CounterValue, 301 value, 302 labelValues...) 303 } 304 305 func (mc *metricsmapCollector) Describe(ch chan<- *prometheus.Desc) { 306 ch <- mc.forwardByteDesc 307 ch <- mc.forwardCountDesc 308 ch <- mc.droppedCountDesc 309 ch <- mc.droppedByteDesc 310 } 311 312 func RegisterCollector() { 313 if err := metrics.Register(newMetricsMapCollector()); err != nil { 314 log.WithError(err).Error("Failed to register metrics map collector to Prometheus registry. " + 315 "cilium_datapath_drop/forward metrics will not be collected") 316 } 317 }