code.vegaprotocol.io/vega@v0.79.0/wallet/api/node/round_robin_selector.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package node
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"sync"
    25  	"sync/atomic"
    26  
    27  	vgcrypto "code.vegaprotocol.io/vega/libs/crypto"
    28  
    29  	"go.uber.org/zap"
    30  )
    31  
    32  var (
    33  	ErrNoNodeConfigured       = errors.New("no node configured on round-robin selector")
    34  	ErrNoHealthyNodeAvailable = errors.New("no healthy node available")
    35  )
    36  
    37  // RoundRobinSelector uses a classic round-robin algorithm to select a node.
    38  // When requesting the next node, this is the node right behind the current one
    39  // that is selected. When the last node is reached, it starts over the first one.
    40  type RoundRobinSelector struct {
    41  	log *zap.Logger
    42  
    43  	// currentIndex is the index used to determine which node is returned.
    44  	currentIndex *atomic.Int64
    45  
    46  	// nodes is the list of the nodes we are connected to.
    47  	nodes []Node
    48  
    49  	mu sync.Mutex
    50  }
    51  
    52  // Node returns the next node in line among the healthiest nodes.
    53  //
    54  // Algorithm:
    55  //  1. It gets the statistics of the nodes configured
    56  //  2. It filters out the nodes that returns data different from the majority,
    57  //     and label those left as the "healthiest" nodes.
    58  //  3. It tries to resolve the next node in line, based on the previous selection
    59  //     and availability of the node. If the next node that should have selected
    60  //     is not healthy, it skips the node. It applies this logic until it ends up
    61  //     on a healthy node.
    62  //
    63  // Warning:
    64  // We look for the network information that are the most commonly shared among
    65  // the nodes, because, in decentralized system, the most commonly shared data
    66  // represents the truth. While true from the entire network point of view, on a
    67  // limited subset of nodes, this might not be true. If most of the nodes
    68  // set up in the configuration are late, or misbehaving, the algorithm will
    69  // fail to identify the truly healthy ones. That's the major reason to favour
    70  // highly trusted and stable nodes.
    71  func (ns *RoundRobinSelector) Node(ctx context.Context, reporterFn SelectionReporter) (Node, error) {
    72  	ns.mu.Lock()
    73  	defer ns.mu.Unlock()
    74  
    75  	healthiestNodesIndexes, err := ns.retrieveHealthiestNodes(ctx, reporterFn)
    76  	if err != nil {
    77  		ns.log.Error("no healthy node available")
    78  		return nil, err
    79  	}
    80  
    81  	var selectedIndex int
    82  	if len(healthiestNodesIndexes) > 1 {
    83  		reporterFn(InfoEvent, "Starting round-robin selection of the node...")
    84  
    85  		lowestHealthyIndex := healthiestNodesIndexes[0]
    86  		highestHealthyIndex := healthiestNodesIndexes[len(healthiestNodesIndexes)-1]
    87  
    88  		if lowestHealthyIndex == highestHealthyIndex {
    89  			// We have a single healthy node, so no other choice than using it.
    90  			return ns.selectNode(lowestHealthyIndex, reporterFn), nil
    91  		}
    92  
    93  		currentIndex := int(ns.currentIndex.Load())
    94  
    95  		if currentIndex < lowestHealthyIndex || currentIndex >= highestHealthyIndex {
    96  			// If the current index is outside the boundaries of the healthy indexes,
    97  			// or already equal to the highest index, we get back to the first healthy
    98  			// index.
    99  			return ns.selectNode(lowestHealthyIndex, reporterFn), nil
   100  		}
   101  
   102  		selectedIndex = lowestHealthyIndex
   103  		for _, healthyIndex := range healthiestNodesIndexes {
   104  			if currentIndex < healthyIndex {
   105  				// As soon as the current index is lower than the healthy index, it
   106  				// means we found the next healthy node to use.
   107  				selectedIndex = healthyIndex
   108  				break
   109  			}
   110  		}
   111  	} else {
   112  		selectedIndex = healthiestNodesIndexes[0]
   113  	}
   114  
   115  	selectedNode := ns.selectNode(selectedIndex, reporterFn)
   116  
   117  	return selectedNode, nil
   118  }
   119  
   120  // Stop stops all the registered nodes. If a node raises an error during
   121  // closing, the selector ignores it and carry on a best-effort.
   122  func (ns *RoundRobinSelector) Stop() {
   123  	ns.mu.Lock()
   124  	defer ns.mu.Unlock()
   125  
   126  	for _, n := range ns.nodes {
   127  		// Ignoring errors to ensure we close as many connections as possible.
   128  		_ = n.Stop()
   129  	}
   130  	ns.log.Info("Stopped all the nodes")
   131  }
   132  
   133  func (ns *RoundRobinSelector) selectNode(selectedIndex int, reporterFn SelectionReporter) Node {
   134  	ns.currentIndex.Store(int64(selectedIndex))
   135  	selectedNode := ns.nodes[ns.currentIndex.Load()]
   136  
   137  	reporterFn(SuccessEvent, fmt.Sprintf("The node %q has been selected", selectedNode.Host()))
   138  	ns.log.Info("a node has been selected",
   139  		zap.String("host", selectedNode.Host()),
   140  		zap.Int("index", selectedIndex),
   141  	)
   142  
   143  	return selectedNode
   144  }
   145  
   146  func (ns *RoundRobinSelector) retrieveHealthiestNodes(ctx context.Context, reporterFn SelectionReporter) ([]int, error) {
   147  	ns.log.Info("start evaluating nodes health based on each others state")
   148  
   149  	nodeStats, err := ns.collectNodesInformation(ctx, reporterFn)
   150  	if err != nil {
   151  		return nil, err
   152  	}
   153  
   154  	if len(nodeStats) == 1 {
   155  		return []int{nodeStats[0].index}, nil
   156  	}
   157  
   158  	nodesGroupedByHash := ns.groupNodesByStatsHash(nodeStats)
   159  
   160  	hashCount := len(nodesGroupedByHash)
   161  
   162  	reporterFn(InfoEvent, "Figuring out the healthy nodes...")
   163  
   164  	rankedHashes := ns.rankHashes(hashCount, nodesGroupedByHash)
   165  
   166  	// We return the nodes indexes that generate the same hash the most often.
   167  	// Since the slice is sorted for the lowest to the highest occurrences,
   168  	// the last element is the highest.
   169  	selectedHash := rankedHashes[hashCount-1]
   170  
   171  	healthiestNodesIndexes := selectedHash.nodesIndexes
   172  
   173  	healthyNodesCount := len(healthiestNodesIndexes)
   174  	if healthyNodesCount > 1 {
   175  		reporterFn(SuccessEvent, fmt.Sprintf("%d healthy nodes found", healthyNodesCount))
   176  	} else {
   177  		reporterFn(SuccessEvent, "1 healthy node found")
   178  	}
   179  	ns.log.Info("healthy nodes found", zap.Any("node-indexes", healthiestNodesIndexes))
   180  
   181  	return healthiestNodesIndexes, nil
   182  }
   183  
   184  func (ns *RoundRobinSelector) rankHashes(hashCount int, nodesGroupedByHash map[string]nodesByHash) []nodesByHash {
   185  	rankedHashes := make([]nodesByHash, 0, hashCount)
   186  	for _, groupedNodes := range nodesGroupedByHash {
   187  		rankedHashes = append(rankedHashes, groupedNodes)
   188  	}
   189  
   190  	sort.Slice(rankedHashes, func(i, j int) bool {
   191  		if len(rankedHashes[i].nodesIndexes) == len(rankedHashes[j].nodesIndexes) {
   192  			// if we have the same number of nodes indexes, we select the ones that
   193  			// have the most recent block height, as we think it's the most
   194  			// sensible thing to do.
   195  			// However, if they also have the same block height, nothing can be
   196  			// done to really figure out which nodes are the healthiest one, so
   197  			// we just ensure a deterministic sorting.
   198  			// This can be wrong, but at least it's consistently wrong.
   199  			if rankedHashes[i].blockHeight == rankedHashes[j].blockHeight {
   200  				return rankedHashes[i].statsHash < rankedHashes[j].statsHash
   201  			}
   202  			return rankedHashes[i].blockHeight < rankedHashes[j].blockHeight
   203  		}
   204  		return len(rankedHashes[i].nodesIndexes) < len(rankedHashes[j].nodesIndexes)
   205  	})
   206  
   207  	return rankedHashes
   208  }
   209  
   210  func (ns *RoundRobinSelector) groupNodesByStatsHash(nodesStats []nodeStat) map[string]nodesByHash {
   211  	nodesGroupedByStatsHash := map[string]nodesByHash{}
   212  	for _, nodeStats := range nodesStats {
   213  		sh, hashAlreadyTracked := nodesGroupedByStatsHash[nodeStats.statsHash]
   214  		if !hashAlreadyTracked {
   215  			nodesGroupedByStatsHash[nodeStats.statsHash] = nodesByHash{
   216  				statsHash:    nodeStats.statsHash,
   217  				blockHeight:  nodeStats.blockHeight,
   218  				nodesIndexes: []int{nodeStats.index},
   219  			}
   220  			continue
   221  		}
   222  
   223  		sh.nodesIndexes = append(sh.nodesIndexes, nodeStats.index)
   224  		nodesGroupedByStatsHash[nodeStats.statsHash] = sh
   225  	}
   226  	return nodesGroupedByStatsHash
   227  }
   228  
   229  func (ns *RoundRobinSelector) collectNodesInformation(ctx context.Context, reporterFn SelectionReporter) ([]nodeStat, error) {
   230  	reporterFn(InfoEvent, "Collecting nodes information to evaluate their health...")
   231  
   232  	nodesCount := len(ns.nodes)
   233  
   234  	wg := sync.WaitGroup{}
   235  	wg.Add(nodesCount)
   236  
   237  	nodeHashes := make([]*nodeStat, nodesCount)
   238  	for nodeIndex, node := range ns.nodes {
   239  		_index := nodeIndex
   240  		_node := node
   241  		go func() {
   242  			defer wg.Done()
   243  
   244  			statsHash, blockHeight := ns.queryNodeInformation(ctx, _node, reporterFn)
   245  			if statsHash == "" {
   246  				return
   247  			}
   248  
   249  			nodeHashes[_index] = &nodeStat{
   250  				statsHash:   statsHash,
   251  				blockHeight: blockHeight,
   252  				index:       _index,
   253  			}
   254  		}()
   255  	}
   256  
   257  	wg.Wait()
   258  
   259  	filteredNodeHashes := []nodeStat{}
   260  	for _, nodeHash := range nodeHashes {
   261  		if nodeHash != nil {
   262  			filteredNodeHashes = append(filteredNodeHashes, *nodeHash)
   263  		}
   264  	}
   265  
   266  	respondingNodeCount := len(filteredNodeHashes)
   267  
   268  	if respondingNodeCount == 0 {
   269  		ns.log.Error("No healthy node available")
   270  		return nil, ErrNoHealthyNodeAvailable
   271  	}
   272  
   273  	if respondingNodeCount > 1 {
   274  		reporterFn(SuccessEvent, fmt.Sprintf("%d nodes are responding", respondingNodeCount))
   275  	} else {
   276  		reporterFn(SuccessEvent, "1 node is responding")
   277  	}
   278  
   279  	return filteredNodeHashes, nil
   280  }
   281  
   282  func (ns *RoundRobinSelector) queryNodeInformation(ctx context.Context, node Node, reporterFn SelectionReporter) (string, uint64) {
   283  	stats, err := node.Statistics(ctx)
   284  	if err != nil {
   285  		reporterFn(WarningEvent, fmt.Sprintf("Could not collect information from the node %q, skipping...", node.Host()))
   286  		ns.log.Warn("Could not collect statistics for the node, skipping", zap.Error(err), zap.String("host", node.Host()))
   287  		return "", 0
   288  	}
   289  
   290  	marshaledStats, err := json.Marshal(stats)
   291  	if err != nil {
   292  		// It's very unlikely to happen.
   293  		reporterFn(ErrorEvent, fmt.Sprintf("[internal error] Could not prepare the collected information from the node %q for the health check", node.Host()))
   294  		ns.log.Error("Could not marshal statistics to JSON, skipping", zap.Error(err), zap.String("host", node.Host()))
   295  		return "", 0
   296  	}
   297  
   298  	ns.log.Info("The node is responding and staged for the health check", zap.String("host", node.Host()))
   299  
   300  	return vgcrypto.HashToHex(marshaledStats), stats.BlockHeight
   301  }
   302  
   303  func NewRoundRobinSelector(log *zap.Logger, nodes ...Node) (*RoundRobinSelector, error) {
   304  	if len(nodes) == 0 {
   305  		return nil, ErrNoNodeConfigured
   306  	}
   307  
   308  	currentIndex := &atomic.Int64{}
   309  	currentIndex.Store(-1)
   310  	return &RoundRobinSelector{
   311  		log:          log,
   312  		currentIndex: currentIndex,
   313  		nodes:        nodes,
   314  	}, nil
   315  }
   316  
   317  type nodeStat struct {
   318  	statsHash   string
   319  	blockHeight uint64
   320  	index       int
   321  }
   322  
   323  type nodesByHash struct {
   324  	statsHash    string
   325  	blockHeight  uint64
   326  	nodesIndexes []int
   327  }