code.vegaprotocol.io/vega@v0.79.0/wallet/api/node/round_robin_selector.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package node 17 18 import ( 19 "context" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "sort" 24 "sync" 25 "sync/atomic" 26 27 vgcrypto "code.vegaprotocol.io/vega/libs/crypto" 28 29 "go.uber.org/zap" 30 ) 31 32 var ( 33 ErrNoNodeConfigured = errors.New("no node configured on round-robin selector") 34 ErrNoHealthyNodeAvailable = errors.New("no healthy node available") 35 ) 36 37 // RoundRobinSelector uses a classic round-robin algorithm to select a node. 38 // When requesting the next node, this is the node right behind the current one 39 // that is selected. When the last node is reached, it starts over the first one. 40 type RoundRobinSelector struct { 41 log *zap.Logger 42 43 // currentIndex is the index used to determine which node is returned. 44 currentIndex *atomic.Int64 45 46 // nodes is the list of the nodes we are connected to. 47 nodes []Node 48 49 mu sync.Mutex 50 } 51 52 // Node returns the next node in line among the healthiest nodes. 53 // 54 // Algorithm: 55 // 1. It gets the statistics of the nodes configured 56 // 2. It filters out the nodes that returns data different from the majority, 57 // and label those left as the "healthiest" nodes. 58 // 3. It tries to resolve the next node in line, based on the previous selection 59 // and availability of the node. If the next node that should have selected 60 // is not healthy, it skips the node. It applies this logic until it ends up 61 // on a healthy node. 62 // 63 // Warning: 64 // We look for the network information that are the most commonly shared among 65 // the nodes, because, in decentralized system, the most commonly shared data 66 // represents the truth. While true from the entire network point of view, on a 67 // limited subset of nodes, this might not be true. If most of the nodes 68 // set up in the configuration are late, or misbehaving, the algorithm will 69 // fail to identify the truly healthy ones. That's the major reason to favour 70 // highly trusted and stable nodes. 71 func (ns *RoundRobinSelector) Node(ctx context.Context, reporterFn SelectionReporter) (Node, error) { 72 ns.mu.Lock() 73 defer ns.mu.Unlock() 74 75 healthiestNodesIndexes, err := ns.retrieveHealthiestNodes(ctx, reporterFn) 76 if err != nil { 77 ns.log.Error("no healthy node available") 78 return nil, err 79 } 80 81 var selectedIndex int 82 if len(healthiestNodesIndexes) > 1 { 83 reporterFn(InfoEvent, "Starting round-robin selection of the node...") 84 85 lowestHealthyIndex := healthiestNodesIndexes[0] 86 highestHealthyIndex := healthiestNodesIndexes[len(healthiestNodesIndexes)-1] 87 88 if lowestHealthyIndex == highestHealthyIndex { 89 // We have a single healthy node, so no other choice than using it. 90 return ns.selectNode(lowestHealthyIndex, reporterFn), nil 91 } 92 93 currentIndex := int(ns.currentIndex.Load()) 94 95 if currentIndex < lowestHealthyIndex || currentIndex >= highestHealthyIndex { 96 // If the current index is outside the boundaries of the healthy indexes, 97 // or already equal to the highest index, we get back to the first healthy 98 // index. 99 return ns.selectNode(lowestHealthyIndex, reporterFn), nil 100 } 101 102 selectedIndex = lowestHealthyIndex 103 for _, healthyIndex := range healthiestNodesIndexes { 104 if currentIndex < healthyIndex { 105 // As soon as the current index is lower than the healthy index, it 106 // means we found the next healthy node to use. 107 selectedIndex = healthyIndex 108 break 109 } 110 } 111 } else { 112 selectedIndex = healthiestNodesIndexes[0] 113 } 114 115 selectedNode := ns.selectNode(selectedIndex, reporterFn) 116 117 return selectedNode, nil 118 } 119 120 // Stop stops all the registered nodes. If a node raises an error during 121 // closing, the selector ignores it and carry on a best-effort. 122 func (ns *RoundRobinSelector) Stop() { 123 ns.mu.Lock() 124 defer ns.mu.Unlock() 125 126 for _, n := range ns.nodes { 127 // Ignoring errors to ensure we close as many connections as possible. 128 _ = n.Stop() 129 } 130 ns.log.Info("Stopped all the nodes") 131 } 132 133 func (ns *RoundRobinSelector) selectNode(selectedIndex int, reporterFn SelectionReporter) Node { 134 ns.currentIndex.Store(int64(selectedIndex)) 135 selectedNode := ns.nodes[ns.currentIndex.Load()] 136 137 reporterFn(SuccessEvent, fmt.Sprintf("The node %q has been selected", selectedNode.Host())) 138 ns.log.Info("a node has been selected", 139 zap.String("host", selectedNode.Host()), 140 zap.Int("index", selectedIndex), 141 ) 142 143 return selectedNode 144 } 145 146 func (ns *RoundRobinSelector) retrieveHealthiestNodes(ctx context.Context, reporterFn SelectionReporter) ([]int, error) { 147 ns.log.Info("start evaluating nodes health based on each others state") 148 149 nodeStats, err := ns.collectNodesInformation(ctx, reporterFn) 150 if err != nil { 151 return nil, err 152 } 153 154 if len(nodeStats) == 1 { 155 return []int{nodeStats[0].index}, nil 156 } 157 158 nodesGroupedByHash := ns.groupNodesByStatsHash(nodeStats) 159 160 hashCount := len(nodesGroupedByHash) 161 162 reporterFn(InfoEvent, "Figuring out the healthy nodes...") 163 164 rankedHashes := ns.rankHashes(hashCount, nodesGroupedByHash) 165 166 // We return the nodes indexes that generate the same hash the most often. 167 // Since the slice is sorted for the lowest to the highest occurrences, 168 // the last element is the highest. 169 selectedHash := rankedHashes[hashCount-1] 170 171 healthiestNodesIndexes := selectedHash.nodesIndexes 172 173 healthyNodesCount := len(healthiestNodesIndexes) 174 if healthyNodesCount > 1 { 175 reporterFn(SuccessEvent, fmt.Sprintf("%d healthy nodes found", healthyNodesCount)) 176 } else { 177 reporterFn(SuccessEvent, "1 healthy node found") 178 } 179 ns.log.Info("healthy nodes found", zap.Any("node-indexes", healthiestNodesIndexes)) 180 181 return healthiestNodesIndexes, nil 182 } 183 184 func (ns *RoundRobinSelector) rankHashes(hashCount int, nodesGroupedByHash map[string]nodesByHash) []nodesByHash { 185 rankedHashes := make([]nodesByHash, 0, hashCount) 186 for _, groupedNodes := range nodesGroupedByHash { 187 rankedHashes = append(rankedHashes, groupedNodes) 188 } 189 190 sort.Slice(rankedHashes, func(i, j int) bool { 191 if len(rankedHashes[i].nodesIndexes) == len(rankedHashes[j].nodesIndexes) { 192 // if we have the same number of nodes indexes, we select the ones that 193 // have the most recent block height, as we think it's the most 194 // sensible thing to do. 195 // However, if they also have the same block height, nothing can be 196 // done to really figure out which nodes are the healthiest one, so 197 // we just ensure a deterministic sorting. 198 // This can be wrong, but at least it's consistently wrong. 199 if rankedHashes[i].blockHeight == rankedHashes[j].blockHeight { 200 return rankedHashes[i].statsHash < rankedHashes[j].statsHash 201 } 202 return rankedHashes[i].blockHeight < rankedHashes[j].blockHeight 203 } 204 return len(rankedHashes[i].nodesIndexes) < len(rankedHashes[j].nodesIndexes) 205 }) 206 207 return rankedHashes 208 } 209 210 func (ns *RoundRobinSelector) groupNodesByStatsHash(nodesStats []nodeStat) map[string]nodesByHash { 211 nodesGroupedByStatsHash := map[string]nodesByHash{} 212 for _, nodeStats := range nodesStats { 213 sh, hashAlreadyTracked := nodesGroupedByStatsHash[nodeStats.statsHash] 214 if !hashAlreadyTracked { 215 nodesGroupedByStatsHash[nodeStats.statsHash] = nodesByHash{ 216 statsHash: nodeStats.statsHash, 217 blockHeight: nodeStats.blockHeight, 218 nodesIndexes: []int{nodeStats.index}, 219 } 220 continue 221 } 222 223 sh.nodesIndexes = append(sh.nodesIndexes, nodeStats.index) 224 nodesGroupedByStatsHash[nodeStats.statsHash] = sh 225 } 226 return nodesGroupedByStatsHash 227 } 228 229 func (ns *RoundRobinSelector) collectNodesInformation(ctx context.Context, reporterFn SelectionReporter) ([]nodeStat, error) { 230 reporterFn(InfoEvent, "Collecting nodes information to evaluate their health...") 231 232 nodesCount := len(ns.nodes) 233 234 wg := sync.WaitGroup{} 235 wg.Add(nodesCount) 236 237 nodeHashes := make([]*nodeStat, nodesCount) 238 for nodeIndex, node := range ns.nodes { 239 _index := nodeIndex 240 _node := node 241 go func() { 242 defer wg.Done() 243 244 statsHash, blockHeight := ns.queryNodeInformation(ctx, _node, reporterFn) 245 if statsHash == "" { 246 return 247 } 248 249 nodeHashes[_index] = &nodeStat{ 250 statsHash: statsHash, 251 blockHeight: blockHeight, 252 index: _index, 253 } 254 }() 255 } 256 257 wg.Wait() 258 259 filteredNodeHashes := []nodeStat{} 260 for _, nodeHash := range nodeHashes { 261 if nodeHash != nil { 262 filteredNodeHashes = append(filteredNodeHashes, *nodeHash) 263 } 264 } 265 266 respondingNodeCount := len(filteredNodeHashes) 267 268 if respondingNodeCount == 0 { 269 ns.log.Error("No healthy node available") 270 return nil, ErrNoHealthyNodeAvailable 271 } 272 273 if respondingNodeCount > 1 { 274 reporterFn(SuccessEvent, fmt.Sprintf("%d nodes are responding", respondingNodeCount)) 275 } else { 276 reporterFn(SuccessEvent, "1 node is responding") 277 } 278 279 return filteredNodeHashes, nil 280 } 281 282 func (ns *RoundRobinSelector) queryNodeInformation(ctx context.Context, node Node, reporterFn SelectionReporter) (string, uint64) { 283 stats, err := node.Statistics(ctx) 284 if err != nil { 285 reporterFn(WarningEvent, fmt.Sprintf("Could not collect information from the node %q, skipping...", node.Host())) 286 ns.log.Warn("Could not collect statistics for the node, skipping", zap.Error(err), zap.String("host", node.Host())) 287 return "", 0 288 } 289 290 marshaledStats, err := json.Marshal(stats) 291 if err != nil { 292 // It's very unlikely to happen. 293 reporterFn(ErrorEvent, fmt.Sprintf("[internal error] Could not prepare the collected information from the node %q for the health check", node.Host())) 294 ns.log.Error("Could not marshal statistics to JSON, skipping", zap.Error(err), zap.String("host", node.Host())) 295 return "", 0 296 } 297 298 ns.log.Info("The node is responding and staged for the health check", zap.String("host", node.Host())) 299 300 return vgcrypto.HashToHex(marshaledStats), stats.BlockHeight 301 } 302 303 func NewRoundRobinSelector(log *zap.Logger, nodes ...Node) (*RoundRobinSelector, error) { 304 if len(nodes) == 0 { 305 return nil, ErrNoNodeConfigured 306 } 307 308 currentIndex := &atomic.Int64{} 309 currentIndex.Store(-1) 310 return &RoundRobinSelector{ 311 log: log, 312 currentIndex: currentIndex, 313 nodes: nodes, 314 }, nil 315 } 316 317 type nodeStat struct { 318 statsHash string 319 blockHeight uint64 320 index int 321 } 322 323 type nodesByHash struct { 324 statsHash string 325 blockHeight uint64 326 nodesIndexes []int 327 }