github.com/evdatsion/aphelion-dpos-bft@v0.32.1/tools/tm-monitor/monitor/monitor.go (about) 1 package monitor 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 "github.com/pkg/errors" 10 "github.com/evdatsion/aphelion-dpos-bft/libs/log" 11 tmtypes "github.com/evdatsion/aphelion-dpos-bft/types" 12 ) 13 14 // waiting more than this many seconds for a block means we're unhealthy 15 const nodeLivenessTimeout = 5 * time.Second 16 17 // Monitor keeps track of the nodes and updates common statistics upon 18 // receiving new events from nodes. 19 // 20 // Common statistics is stored in Network struct. 21 type Monitor struct { 22 mtx sync.Mutex 23 Nodes []*Node 24 25 Network *Network 26 27 monitorQuit chan struct{} // monitor exitting 28 nodeQuit map[string]chan struct{} // node is being stopped and removed from under the monitor 29 30 recalculateNetworkUptimeEvery time.Duration 31 numValidatorsUpdateInterval time.Duration 32 33 logger log.Logger 34 } 35 36 // NewMonitor creates new instance of a Monitor. You can provide options to 37 // change some default values. 38 // 39 // Example: 40 // NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second)) 41 func NewMonitor(options ...func(*Monitor)) *Monitor { 42 m := &Monitor{ 43 Nodes: make([]*Node, 0), 44 Network: NewNetwork(), 45 monitorQuit: make(chan struct{}), 46 nodeQuit: make(map[string]chan struct{}), 47 recalculateNetworkUptimeEvery: 10 * time.Second, 48 numValidatorsUpdateInterval: 5 * time.Second, 49 logger: log.NewNopLogger(), 50 } 51 52 for _, option := range options { 53 option(m) 54 } 55 56 return m 57 } 58 59 // RecalculateNetworkUptimeEvery lets you change network uptime update interval. 60 func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) { 61 return func(m *Monitor) { 62 m.recalculateNetworkUptimeEvery = d 63 } 64 } 65 66 // SetNumValidatorsUpdateInterval lets you change num validators update interval. 67 func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) { 68 return func(m *Monitor) { 69 m.numValidatorsUpdateInterval = d 70 } 71 } 72 73 // SetLogger lets you set your own logger 74 func (m *Monitor) SetLogger(l log.Logger) { 75 m.logger = l 76 } 77 78 // Monitor begins to monitor the node `n`. The node will be started and added 79 // to the monitor. 80 func (m *Monitor) Monitor(n *Node) error { 81 m.mtx.Lock() 82 m.Nodes = append(m.Nodes, n) 83 m.mtx.Unlock() 84 85 blockCh := make(chan tmtypes.Header, 10) 86 n.SendBlocksTo(blockCh) 87 blockLatencyCh := make(chan float64, 10) 88 n.SendBlockLatenciesTo(blockLatencyCh) 89 disconnectCh := make(chan bool, 10) 90 n.NotifyAboutDisconnects(disconnectCh) 91 92 if err := n.Start(); err != nil { 93 return err 94 } 95 96 m.Network.NewNode(n.Name) 97 98 m.nodeQuit[n.Name] = make(chan struct{}) 99 go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name]) 100 101 return nil 102 } 103 104 // Unmonitor stops monitoring node `n`. The node will be stopped and removed 105 // from the monitor. 106 func (m *Monitor) Unmonitor(n *Node) { 107 m.Network.NodeDeleted(n.Name) 108 109 n.Stop() 110 close(m.nodeQuit[n.Name]) 111 delete(m.nodeQuit, n.Name) 112 i, _ := m.NodeByName(n.Name) 113 114 m.mtx.Lock() 115 m.Nodes[i] = m.Nodes[len(m.Nodes)-1] 116 m.Nodes = m.Nodes[:len(m.Nodes)-1] 117 m.mtx.Unlock() 118 } 119 120 // NodeByName returns the node and its index if such node exists within the 121 // monitor. Otherwise, -1 and nil are returned. 122 func (m *Monitor) NodeByName(name string) (index int, node *Node) { 123 m.mtx.Lock() 124 defer m.mtx.Unlock() 125 126 for i, n := range m.Nodes { 127 if name == n.Name { 128 return i, n 129 } 130 } 131 return -1, nil 132 } 133 134 // NodeIsOnline is called when connection to the node is restored. 135 // Must be safe to call multiple times. 136 func (m *Monitor) NodeIsOnline(name string) { 137 138 _, node := m.NodeByName(name) 139 if nil != node { 140 if online, ok := m.Network.nodeStatusMap[name]; ok && online { 141 m.mtx.Lock() 142 node.Online = online 143 m.mtx.Unlock() 144 } 145 } 146 147 } 148 149 // Start starts the monitor's routines: recalculating network uptime and 150 // updating number of validators. 151 func (m *Monitor) Start() error { 152 go m.recalculateNetworkUptimeLoop() 153 go m.updateNumValidatorLoop() 154 155 return nil 156 } 157 158 // Stop stops the monitor's routines. 159 func (m *Monitor) Stop() { 160 close(m.monitorQuit) 161 162 for _, n := range m.Nodes { 163 m.Unmonitor(n) 164 } 165 } 166 167 // main loop where we listen for events from the node 168 func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) { 169 logger := m.logger.With("node", nodeName) 170 171 for { 172 select { 173 case <-quit: 174 return 175 case b := <-blockCh: 176 m.Network.NewBlock(b) 177 m.Network.NodeIsOnline(nodeName) 178 m.NodeIsOnline(nodeName) 179 case l := <-blockLatencyCh: 180 m.Network.NewBlockLatency(l) 181 m.Network.NodeIsOnline(nodeName) 182 m.NodeIsOnline(nodeName) 183 case disconnected := <-disconnectCh: 184 if disconnected { 185 m.Network.NodeIsDown(nodeName) 186 } else { 187 m.Network.NodeIsOnline(nodeName) 188 m.NodeIsOnline(nodeName) 189 } 190 case <-time.After(nodeLivenessTimeout): 191 logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout)) 192 m.Network.NodeIsDown(nodeName) 193 } 194 } 195 } 196 197 // recalculateNetworkUptimeLoop every N seconds. 198 func (m *Monitor) recalculateNetworkUptimeLoop() { 199 for { 200 select { 201 case <-m.monitorQuit: 202 return 203 case <-time.After(m.recalculateNetworkUptimeEvery): 204 m.Network.RecalculateUptime() 205 } 206 } 207 } 208 209 // updateNumValidatorLoop sends a request to a random node once every N seconds, 210 // which in turn makes an RPC call to get the latest validators. 211 func (m *Monitor) updateNumValidatorLoop() { 212 rand.Seed(time.Now().Unix()) 213 214 var height int64 215 var num int 216 var err error 217 218 for { 219 m.mtx.Lock() 220 nodesCount := len(m.Nodes) 221 m.mtx.Unlock() 222 if 0 == nodesCount { 223 time.Sleep(m.numValidatorsUpdateInterval) 224 continue 225 } 226 227 randomNodeIndex := rand.Intn(nodesCount) 228 229 select { 230 case <-m.monitorQuit: 231 return 232 case <-time.After(m.numValidatorsUpdateInterval): 233 i := 0 234 235 m.mtx.Lock() 236 for _, n := range m.Nodes { 237 if i == randomNodeIndex { 238 height, num, err = n.NumValidators() 239 if err != nil { 240 m.logger.Info("err", errors.Wrap(err, "update num validators failed")) 241 } 242 break 243 } 244 i++ 245 } 246 m.mtx.Unlock() 247 248 m.Network.UpdateNumValidatorsForHeight(num, height) 249 } 250 } 251 }