github.com/evdatsion/aphelion-dpos-bft@v0.32.1/tools/tm-monitor/monitor/network.go (about) 1 package monitor 2 3 import ( 4 "sync" 5 "time" 6 7 metrics "github.com/rcrowley/go-metrics" 8 tmtypes "github.com/evdatsion/aphelion-dpos-bft/types" 9 ) 10 11 // UptimeData stores data for how long network has been running. 12 type UptimeData struct { 13 StartTime time.Time `json:"start_time"` 14 Uptime float64 `json:"uptime" amino:"unsafe"` // percentage of time we've been healthy, ever 15 16 totalDownTime time.Duration // total downtime (only updated when we come back online) 17 wentDown time.Time 18 } 19 20 // Health describes the health of the network. Note that this applies only to 21 // the observed nodes, and not to the entire cluster, which may consist of 22 // thousands of machines. It may change in the future. 23 type Health int 24 25 const ( 26 // FullHealth means all nodes online, synced, validators making blocks 27 FullHealth = Health(0) 28 // ModerateHealth means we're making blocks 29 ModerateHealth = Health(1) 30 // Dead means we're not making blocks due to all validators freezing or crashing 31 Dead = Health(2) 32 ) 33 34 // Common statistics for network of nodes 35 type Network struct { 36 Height int64 `json:"height"` 37 38 AvgBlockTime float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute) 39 blockTimeMeter metrics.Meter 40 AvgTxThroughput float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute) 41 txThroughputMeter metrics.Meter 42 AvgBlockLatency float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute) 43 blockLatencyMeter metrics.Meter 44 45 NumValidators int `json:"num_validators"` 46 NumNodesMonitored int `json:"num_nodes_monitored"` 47 NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"` 48 49 Health Health `json:"health"` 50 51 UptimeData *UptimeData `json:"uptime_data"` 52 53 nodeStatusMap map[string]bool 54 55 mu sync.Mutex 56 } 57 58 func NewNetwork() *Network { 59 return &Network{ 60 blockTimeMeter: metrics.NewMeter(), 61 txThroughputMeter: metrics.NewMeter(), 62 blockLatencyMeter: metrics.NewMeter(), 63 Health: FullHealth, 64 UptimeData: &UptimeData{ 65 StartTime: time.Now(), 66 Uptime: 100.0, 67 }, 68 nodeStatusMap: make(map[string]bool), 69 } 70 } 71 72 func (n *Network) NewBlock(b tmtypes.Header) { 73 n.mu.Lock() 74 defer n.mu.Unlock() 75 76 if n.Height >= b.Height { 77 return 78 } 79 80 n.Height = b.Height 81 82 n.blockTimeMeter.Mark(1) 83 if n.blockTimeMeter.Rate1() > 0.0 { 84 n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms 85 } else { 86 n.AvgBlockTime = 0.0 87 } 88 n.txThroughputMeter.Mark(int64(b.NumTxs)) 89 n.AvgTxThroughput = n.txThroughputMeter.Rate1() 90 } 91 92 func (n *Network) NewBlockLatency(l float64) { 93 n.mu.Lock() 94 defer n.mu.Unlock() 95 96 n.blockLatencyMeter.Mark(int64(l)) 97 n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms 98 } 99 100 // RecalculateUptime calculates uptime on demand. 101 func (n *Network) RecalculateUptime() { 102 n.mu.Lock() 103 defer n.mu.Unlock() 104 105 since := time.Since(n.UptimeData.StartTime) 106 uptime := since - n.UptimeData.totalDownTime 107 if n.Health != FullHealth { 108 uptime -= time.Since(n.UptimeData.wentDown) 109 } 110 n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0 111 } 112 113 // NodeIsDown is called when the node disconnects for whatever reason. 114 // Must be safe to call multiple times. 115 func (n *Network) NodeIsDown(name string) { 116 n.mu.Lock() 117 defer n.mu.Unlock() 118 119 if online, ok := n.nodeStatusMap[name]; !ok || online { 120 n.nodeStatusMap[name] = false 121 n.NumNodesMonitoredOnline-- 122 n.UptimeData.wentDown = time.Now() 123 n.updateHealth() 124 } 125 } 126 127 // NodeIsOnline is called when connection to the node is restored. 128 // Must be safe to call multiple times. 129 func (n *Network) NodeIsOnline(name string) { 130 n.mu.Lock() 131 defer n.mu.Unlock() 132 133 if online, ok := n.nodeStatusMap[name]; ok && !online { 134 n.nodeStatusMap[name] = true 135 n.NumNodesMonitoredOnline++ 136 n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown) 137 n.updateHealth() 138 } 139 } 140 141 // NewNode is called when the new node is added to the monitor. 142 func (n *Network) NewNode(name string) { 143 n.mu.Lock() 144 defer n.mu.Unlock() 145 146 n.NumNodesMonitored++ 147 n.NumNodesMonitoredOnline++ 148 n.updateHealth() 149 } 150 151 // NodeDeleted is called when the node is deleted from under the monitor. 152 func (n *Network) NodeDeleted(name string) { 153 n.mu.Lock() 154 defer n.mu.Unlock() 155 156 n.NumNodesMonitored-- 157 n.NumNodesMonitoredOnline-- 158 n.updateHealth() 159 } 160 161 func (n *Network) updateHealth() { 162 // if we are connected to all validators, we're at full health 163 // TODO: make sure they're all at the same height (within a block) 164 // and all proposing (and possibly validating ) Alternatively, just 165 // check there hasn't been a new round in numValidators rounds 166 if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators { 167 n.Health = FullHealth 168 } else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored { 169 n.Health = ModerateHealth 170 } else { 171 n.Health = Dead 172 } 173 } 174 175 func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) { 176 n.mu.Lock() 177 defer n.mu.Unlock() 178 179 if n.Height <= height { 180 n.NumValidators = num 181 } 182 } 183 184 func (n *Network) GetHealthString() string { 185 switch n.Health { 186 case FullHealth: 187 return "full" 188 case ModerateHealth: 189 return "moderate" 190 case Dead: 191 return "dead" 192 default: 193 return "undefined" 194 } 195 } 196 197 // Uptime returns network's uptime in percentages. 198 func (n *Network) Uptime() float64 { 199 n.mu.Lock() 200 defer n.mu.Unlock() 201 return n.UptimeData.Uptime 202 } 203 204 // StartTime returns time we started monitoring. 205 func (n *Network) StartTime() time.Time { 206 return n.UptimeData.StartTime 207 }