github.com/evdatsion/aphelion-dpos-bft@v0.32.1/tools/tm-monitor/monitor/network.go

github.com/evdatsion/aphelion-dpos-bft@v0.32.1/tools/tm-monitor/monitor/network.go (about)

     1  package monitor
     2  
     3  import (
     4  	"sync"
     5  	"time"
     6  
     7  	metrics "github.com/rcrowley/go-metrics"
     8  	tmtypes "github.com/evdatsion/aphelion-dpos-bft/types"
     9  )
    10  
    11  // UptimeData stores data for how long network has been running.
    12  type UptimeData struct {
    13  	StartTime time.Time `json:"start_time"`
    14  	Uptime    float64   `json:"uptime" amino:"unsafe"` // percentage of time we've been healthy, ever
    15  
    16  	totalDownTime time.Duration // total downtime (only updated when we come back online)
    17  	wentDown      time.Time
    18  }
    19  
    20  // Health describes the health of the network. Note that this applies only to
    21  // the observed nodes, and not to the entire cluster, which may consist of
    22  // thousands of machines. It may change in the future.
    23  type Health int
    24  
    25  const (
    26  	// FullHealth means all nodes online, synced, validators making blocks
    27  	FullHealth = Health(0)
    28  	// ModerateHealth means we're making blocks
    29  	ModerateHealth = Health(1)
    30  	// Dead means we're not making blocks due to all validators freezing or crashing
    31  	Dead = Health(2)
    32  )
    33  
    34  // Common statistics for network of nodes
    35  type Network struct {
    36  	Height int64 `json:"height"`
    37  
    38  	AvgBlockTime      float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
    39  	blockTimeMeter    metrics.Meter
    40  	AvgTxThroughput   float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
    41  	txThroughputMeter metrics.Meter
    42  	AvgBlockLatency   float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
    43  	blockLatencyMeter metrics.Meter
    44  
    45  	NumValidators           int `json:"num_validators"`
    46  	NumNodesMonitored       int `json:"num_nodes_monitored"`
    47  	NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
    48  
    49  	Health Health `json:"health"`
    50  
    51  	UptimeData *UptimeData `json:"uptime_data"`
    52  
    53  	nodeStatusMap map[string]bool
    54  
    55  	mu sync.Mutex
    56  }
    57  
    58  func NewNetwork() *Network {
    59  	return &Network{
    60  		blockTimeMeter:    metrics.NewMeter(),
    61  		txThroughputMeter: metrics.NewMeter(),
    62  		blockLatencyMeter: metrics.NewMeter(),
    63  		Health:            FullHealth,
    64  		UptimeData: &UptimeData{
    65  			StartTime: time.Now(),
    66  			Uptime:    100.0,
    67  		},
    68  		nodeStatusMap: make(map[string]bool),
    69  	}
    70  }
    71  
    72  func (n *Network) NewBlock(b tmtypes.Header) {
    73  	n.mu.Lock()
    74  	defer n.mu.Unlock()
    75  
    76  	if n.Height >= b.Height {
    77  		return
    78  	}
    79  
    80  	n.Height = b.Height
    81  
    82  	n.blockTimeMeter.Mark(1)
    83  	if n.blockTimeMeter.Rate1() > 0.0 {
    84  		n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
    85  	} else {
    86  		n.AvgBlockTime = 0.0
    87  	}
    88  	n.txThroughputMeter.Mark(int64(b.NumTxs))
    89  	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
    90  }
    91  
    92  func (n *Network) NewBlockLatency(l float64) {
    93  	n.mu.Lock()
    94  	defer n.mu.Unlock()
    95  
    96  	n.blockLatencyMeter.Mark(int64(l))
    97  	n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
    98  }
    99  
   100  // RecalculateUptime calculates uptime on demand.
   101  func (n *Network) RecalculateUptime() {
   102  	n.mu.Lock()
   103  	defer n.mu.Unlock()
   104  
   105  	since := time.Since(n.UptimeData.StartTime)
   106  	uptime := since - n.UptimeData.totalDownTime
   107  	if n.Health != FullHealth {
   108  		uptime -= time.Since(n.UptimeData.wentDown)
   109  	}
   110  	n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
   111  }
   112  
   113  // NodeIsDown is called when the node disconnects for whatever reason.
   114  // Must be safe to call multiple times.
   115  func (n *Network) NodeIsDown(name string) {
   116  	n.mu.Lock()
   117  	defer n.mu.Unlock()
   118  
   119  	if online, ok := n.nodeStatusMap[name]; !ok || online {
   120  		n.nodeStatusMap[name] = false
   121  		n.NumNodesMonitoredOnline--
   122  		n.UptimeData.wentDown = time.Now()
   123  		n.updateHealth()
   124  	}
   125  }
   126  
   127  // NodeIsOnline is called when connection to the node is restored.
   128  // Must be safe to call multiple times.
   129  func (n *Network) NodeIsOnline(name string) {
   130  	n.mu.Lock()
   131  	defer n.mu.Unlock()
   132  
   133  	if online, ok := n.nodeStatusMap[name]; ok && !online {
   134  		n.nodeStatusMap[name] = true
   135  		n.NumNodesMonitoredOnline++
   136  		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
   137  		n.updateHealth()
   138  	}
   139  }
   140  
   141  // NewNode is called when the new node is added to the monitor.
   142  func (n *Network) NewNode(name string) {
   143  	n.mu.Lock()
   144  	defer n.mu.Unlock()
   145  
   146  	n.NumNodesMonitored++
   147  	n.NumNodesMonitoredOnline++
   148  	n.updateHealth()
   149  }
   150  
   151  // NodeDeleted is called when the node is deleted from under the monitor.
   152  func (n *Network) NodeDeleted(name string) {
   153  	n.mu.Lock()
   154  	defer n.mu.Unlock()
   155  
   156  	n.NumNodesMonitored--
   157  	n.NumNodesMonitoredOnline--
   158  	n.updateHealth()
   159  }
   160  
   161  func (n *Network) updateHealth() {
   162  	// if we are connected to all validators, we're at full health
   163  	// TODO: make sure they're all at the same height (within a block)
   164  	// and all proposing (and possibly validating ) Alternatively, just
   165  	// check there hasn't been a new round in numValidators rounds
   166  	if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
   167  		n.Health = FullHealth
   168  	} else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
   169  		n.Health = ModerateHealth
   170  	} else {
   171  		n.Health = Dead
   172  	}
   173  }
   174  
   175  func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
   176  	n.mu.Lock()
   177  	defer n.mu.Unlock()
   178  
   179  	if n.Height <= height {
   180  		n.NumValidators = num
   181  	}
   182  }
   183  
   184  func (n *Network) GetHealthString() string {
   185  	switch n.Health {
   186  	case FullHealth:
   187  		return "full"
   188  	case ModerateHealth:
   189  		return "moderate"
   190  	case Dead:
   191  		return "dead"
   192  	default:
   193  		return "undefined"
   194  	}
   195  }
   196  
   197  // Uptime returns network's uptime in percentages.
   198  func (n *Network) Uptime() float64 {
   199  	n.mu.Lock()
   200  	defer n.mu.Unlock()
   201  	return n.UptimeData.Uptime
   202  }
   203  
   204  // StartTime returns time we started monitoring.
   205  func (n *Network) StartTime() time.Time {
   206  	return n.UptimeData.StartTime
   207  }