github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/tracker/resource_tracker.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package tracker

import (
    "errors"
    "fmt"
    "sync"
    "time"

    "github.com/prometheus/client_golang/prometheus"

    "github.com/MetalBlockchain/metalgo/ids"
    "github.com/MetalBlockchain/metalgo/utils/linked"
    "github.com/MetalBlockchain/metalgo/utils/math/meter"
    "github.com/MetalBlockchain/metalgo/utils/resource"
)

const epsilon = 1e-9

var _ ResourceTracker = (*resourceTracker)(nil)

// Tracker reports the usage of a single resource, attributed per node.
type Tracker interface {
    // Returns the current usage for the given node.
    Usage(nodeID ids.NodeID, now time.Time) float64
    // Returns the current usage by all nodes.
    TotalUsage() float64
    // Returns the duration between [now] and when the usage of [nodeID]
    // reaches [value], assuming that the node uses no more resources.
    // If the node's usage isn't known, or is already <= [value], returns the
    // zero duration.
    TimeUntilUsage(nodeID ids.NodeID, now time.Time, value float64) time.Duration
}

type DiskTracker interface {
    Tracker
    AvailableDiskBytes() uint64
}

// ResourceTracker is an interface for tracking peers' usage of resources.
type ResourceTracker interface {
    CPUTracker() Tracker
    DiskTracker() DiskTracker
    // Registers that the given node started processing at the given time.
    StartProcessing(ids.NodeID, time.Time)
    // Registers that the given node stopped processing at the given time.
    StopProcessing(ids.NodeID, time.Time)
}

// cpuResourceTracker attributes the process's real CPU usage to individual
// peers in proportion to each peer's share of the tracked processing time.
type cpuResourceTracker struct {
    t *resourceTracker
}

func (t *cpuResourceTracker) Usage(nodeID ids.NodeID, now time.Time) float64 {
    rt := t.t
    rt.lock.Lock()
    defer rt.lock.Unlock()

    realCPUUsage := rt.resources.CPUUsage()
    rt.metrics.cpuMetric.Set(realCPUUsage)

    measuredProcessingTime := rt.processingMeter.Read(now)
    rt.metrics.processingTimeMetric.Set(measuredProcessingTime)

    if measuredProcessingTime == 0 {
        return 0
    }

    m, exists := rt.meters.Get(nodeID)
    if !exists {
        return 0
    }

    portionUsageByNode := m.Read(now) / measuredProcessingTime
    return realCPUUsage * portionUsageByNode
}

func (t *cpuResourceTracker) TotalUsage() float64 {
    return t.t.resources.CPUUsage()
}

func (t *cpuResourceTracker) TimeUntilUsage(nodeID ids.NodeID, now time.Time, value float64) time.Duration {
    rt := t.t
    rt.lock.Lock()
    defer rt.lock.Unlock()

    rt.prune(now)

    m, exists := rt.meters.Get(nodeID)
    if !exists {
        return 0
    }

    measuredProcessingTime := rt.processingMeter.Read(now)
    rt.metrics.processingTimeMetric.Set(measuredProcessingTime)

    if measuredProcessingTime == 0 {
        return 0
    }

    realCPUUsage := rt.resources.CPUUsage()
    rt.metrics.cpuMetric.Set(realCPUUsage)

    if realCPUUsage == 0 {
        return 0
    }

    scale := realCPUUsage / measuredProcessingTime
    return m.TimeUntil(now, value/scale)
}
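// To make the attribution above concrete, a hedged sketch with invented
// numbers: if the process is using 1.5 cores and the meters report 2.0s of
// decayed processing time in total, 0.5s of which belongs to one node, that
// node is charged 1.5 * (0.5 / 2.0) = 0.375 cores. Illustrative only; none
// of these values come from the code above.
func exampleCPUAttribution() float64 {
    const (
        realCPUUsage           = 1.5 // cores, as resources.CPUUsage() would report
        measuredProcessingTime = 2.0 // processingMeter.Read(now), all nodes combined
        nodeProcessingTime     = 0.5 // the node's own meter reading
    )
    portionUsedByNode := nodeProcessingTime / measuredProcessingTime
    return realCPUUsage * portionUsedByNode // 0.375
}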
// diskResourceTracker attributes disk read usage to peers the same way
// cpuResourceTracker attributes CPU: by each node's share of the tracked
// processing time.
type diskResourceTracker struct {
    t *resourceTracker
}

func (t *diskResourceTracker) Usage(nodeID ids.NodeID, now time.Time) float64 {
    rt := t.t
    rt.lock.Lock()
    defer rt.lock.Unlock()

    // [realWriteUsage] is only used for metrics.
    realReadUsage, realWriteUsage := rt.resources.DiskUsage()
    rt.metrics.diskReadsMetric.Set(realReadUsage)
    rt.metrics.diskWritesMetric.Set(realWriteUsage)

    measuredProcessingTime := rt.processingMeter.Read(now)
    rt.metrics.processingTimeMetric.Set(measuredProcessingTime)

    if measuredProcessingTime == 0 {
        return 0
    }

    m, exists := rt.meters.Get(nodeID)
    if !exists {
        return 0
    }

    portionUsageByNode := m.Read(now) / measuredProcessingTime
    return realReadUsage * portionUsageByNode
}

func (t *diskResourceTracker) AvailableDiskBytes() uint64 {
    rt := t.t
    rt.lock.Lock()
    defer rt.lock.Unlock()

    bytesAvailable := rt.resources.AvailableDiskBytes()
    rt.metrics.diskSpaceAvailable.Set(float64(bytesAvailable))
    return bytesAvailable
}

func (t *diskResourceTracker) TotalUsage() float64 {
    realReadUsage, _ := t.t.resources.DiskUsage()
    return realReadUsage
}

func (t *diskResourceTracker) TimeUntilUsage(nodeID ids.NodeID, now time.Time, value float64) time.Duration {
    rt := t.t
    rt.lock.Lock()
    defer rt.lock.Unlock()

    rt.prune(now)

    m, exists := rt.meters.Get(nodeID)
    if !exists {
        return 0
    }

    measuredProcessingTime := rt.processingMeter.Read(now)
    rt.metrics.processingTimeMetric.Set(measuredProcessingTime)

    if measuredProcessingTime == 0 {
        return 0
    }

    // [realWriteUsage] is only used for metrics.
    realReadUsage, realWriteUsage := rt.resources.DiskUsage()
    rt.metrics.diskReadsMetric.Set(realReadUsage)
    rt.metrics.diskWritesMetric.Set(realWriteUsage)

    if realReadUsage == 0 {
        return 0
    }

    scale := realReadUsage / measuredProcessingTime
    return m.TimeUntil(now, value/scale)
}
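// Callers of TimeUntilUsage pass [value] in real resource units (cores, or
// bytes/sec here), while the per-node meters track processing time, so both
// implementations divide by scale = realUsage / measuredProcessingTime to
// translate the target into meter units before calling TimeUntil. A hedged
// sketch of that conversion with invented numbers:
func exampleTimeUntilTarget() float64 {
    const (
        realReadUsage          = 10e6 // bytes/sec, as resources.DiskUsage() would report
        measuredProcessingTime = 4.0  // processingMeter.Read(now)
        targetReadUsage        = 1e6  // bytes/sec the caller wants to wait for
    )
    scale := realReadUsage / measuredProcessingTime
    // The node's meter must decay to 0.4 before its attributed read usage
    // drops to 1e6 bytes/sec; TimeUntil is asked for that meter-domain value.
    return targetReadUsage / scale
}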
type resourceTracker struct {
    lock sync.RWMutex

    resources resource.User
    factory   meter.Factory
    // Tracks the total number of currently processing requests by all nodes.
    processingMeter meter.Meter
    halflife        time.Duration
    // Each element is a meter that tracks the number of currently processing
    // requests by a node. [meters] is ordered by the last time that a meter
    // was utilized. This doesn't necessarily result in the meters being
    // sorted based on their usage. However, in practice the nodes that are
    // not being utilized will move towards the oldest elements, where they
    // can be deleted.
    meters  *linked.Hashmap[ids.NodeID, meter.Meter]
    metrics *trackerMetrics
}

func NewResourceTracker(
    reg prometheus.Registerer,
    resources resource.User,
    factory meter.Factory,
    halflife time.Duration,
) (ResourceTracker, error) {
    t := &resourceTracker{
        factory:         factory,
        resources:       resources,
        processingMeter: factory.New(halflife),
        halflife:        halflife,
        meters:          linked.NewHashmap[ids.NodeID, meter.Meter](),
    }
    var err error
    t.metrics, err = newCPUTrackerMetrics(reg)
    if err != nil {
        return nil, fmt.Errorf("failed to initialize resourceTracker metrics: %w", err)
    }
    return t, nil
}
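// A hedged construction sketch: prometheus.NewRegistry is the standard
// client_golang constructor, while the resource.User and meter.Factory
// values are assumed to be supplied by the caller (in metalgo they come from
// the node's resource and meter packages). The 15s halflife is an invented
// example value, not a recommendation from this file.
func exampleNewTracker(resources resource.User, factory meter.Factory) (ResourceTracker, error) {
    return NewResourceTracker(prometheus.NewRegistry(), resources, factory, 15*time.Second)
}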
func (rt *resourceTracker) CPUTracker() Tracker {
    return &cpuResourceTracker{t: rt}
}

func (rt *resourceTracker) DiskTracker() DiskTracker {
    return &diskResourceTracker{t: rt}
}

func (rt *resourceTracker) StartProcessing(nodeID ids.NodeID, now time.Time) {
    rt.lock.Lock()
    defer rt.lock.Unlock()

    meter := rt.getMeter(nodeID)
    meter.Inc(now, 1)
    rt.processingMeter.Inc(now, 1)
}

func (rt *resourceTracker) StopProcessing(nodeID ids.NodeID, now time.Time) {
    rt.lock.Lock()
    defer rt.lock.Unlock()

    meter := rt.getMeter(nodeID)
    meter.Dec(now, 1)
    rt.processingMeter.Dec(now, 1)
}

// getMeter returns the meter used to measure CPU time spent processing
// messages from [nodeID].
// Assumes [rt.lock] is held.
func (rt *resourceTracker) getMeter(nodeID ids.NodeID) meter.Meter {
    m, exists := rt.meters.Get(nodeID)
    if exists {
        return m
    }

    newMeter := rt.factory.New(rt.halflife)
    rt.meters.Put(nodeID, newMeter)
    return newMeter
}

// prune attempts to remove meters that currently show a value less than
// [epsilon].
//
// Because [rt.meters] isn't guaranteed to be sorted by their values, this
// doesn't guarantee that all meters showing less than [epsilon] are removed.
func (rt *resourceTracker) prune(now time.Time) {
    for {
        oldest, meter, exists := rt.meters.Oldest()
        if !exists {
            return
        }

        if meter.Read(now) > epsilon {
            return
        }

        rt.meters.Delete(oldest)
    }
}

type trackerMetrics struct {
    processingTimeMetric prometheus.Gauge
    cpuMetric            prometheus.Gauge
    diskReadsMetric      prometheus.Gauge
    diskWritesMetric     prometheus.Gauge
    diskSpaceAvailable   prometheus.Gauge
}

func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
    m := &trackerMetrics{
        processingTimeMetric: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "processing_time",
            Help: "Tracked processing time over all nodes. Value expected to be in [0, number of CPU cores], but can go higher due to IO bound processes and thread multiplexing",
        }),
        cpuMetric: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "cpu_usage",
            Help: "CPU usage tracked by the resource manager. Value should be in [0, number of CPU cores]",
        }),
        diskReadsMetric: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "disk_reads",
            Help: "Disk reads (bytes/sec) tracked by the resource manager",
        }),
        diskWritesMetric: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "disk_writes",
            Help: "Disk writes (bytes/sec) tracked by the resource manager",
        }),
        diskSpaceAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
            Name: "disk_available_space",
            Help: "Available space remaining (bytes) on the database volume",
        }),
    }
    err := errors.Join(
        reg.Register(m.processingTimeMetric),
        reg.Register(m.cpuMetric),
        reg.Register(m.diskReadsMetric),
        reg.Register(m.diskWritesMetric),
        reg.Register(m.diskSpaceAvailable),
    )
    return m, err
}
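// A hedged sketch of the call pattern the ResourceTracker interface implies:
// StartProcessing before a peer's message is handled, StopProcessing when
// handling ends, and the per-resource trackers to observe attributed usage.
// handleMessage is a hypothetical stand-in for the caller's actual work.
func exampleProcessMessage(tracker ResourceTracker, nodeID ids.NodeID, handleMessage func()) {
    tracker.StartProcessing(nodeID, time.Now())
    handleMessage()
    tracker.StopProcessing(nodeID, time.Now())

    // The CPU usage attributed to this peer could now feed, e.g., a throttler.
    _ = tracker.CPUTracker().Usage(nodeID, time.Now())
}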