github.com/zhiqiangxu/go-ethereum@v1.9.16-0.20210824055606-be91cfdebc48/les/serverpool.go (about)

     1  // Copyright 2020 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  package les
    18  
    19  import (
    20  	"errors"
    21  	"math/rand"
    22  	"reflect"
    23  	"sync"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"github.com/zhiqiangxu/go-ethereum/common/mclock"
    28  	"github.com/zhiqiangxu/go-ethereum/ethdb"
    29  	lpc "github.com/zhiqiangxu/go-ethereum/les/lespay/client"
    30  	"github.com/zhiqiangxu/go-ethereum/les/utils"
    31  	"github.com/zhiqiangxu/go-ethereum/log"
    32  	"github.com/zhiqiangxu/go-ethereum/p2p/enode"
    33  	"github.com/zhiqiangxu/go-ethereum/p2p/enr"
    34  	"github.com/zhiqiangxu/go-ethereum/p2p/nodestate"
    35  	"github.com/zhiqiangxu/go-ethereum/rlp"
    36  )
    37  
const (
	minTimeout          = time.Millisecond * 500 // minimum request timeout suggested by the server pool
	timeoutRefresh      = time.Second * 5        // recalculate timeout if older than this
	dialCost            = 10000                  // cost of a TCP dial (used for known node selection weight calculation); also the lower bound of the total dial cost
	dialWaitStep        = 1.5                    // exponential multiplier of redial wait time when no value was provided by the server
	queryCost           = 500                    // cost of a UDP pre-negotiation query
	queryWaitStep       = 1.02                   // exponential multiplier of redial wait time after a pre-negotiation query that did not return a positive answer
	waitThreshold       = time.Hour * 2000       // discard the node's history and weight if the redial wait time reaches this threshold
	nodeWeightMul       = 1000000                // multiplier constant for node weight calculation
	nodeWeightThreshold = 100                    // minimum weight for keeping a node in the known (valuable) set
	minRedialWait       = 10                     // minimum redial wait time in seconds
	preNegLimit         = 5                      // maximum number of simultaneous pre-negotiation queries
	maxQueryFails       = 100                    // number of consecutive unanswered UDP queries at which a warning is printed; also caps the chance of skipping pre-negotiation at 50%
)
    52  
// serverPool provides a node iterator for dial candidates. The output is a mix of newly discovered
// nodes, a weighted random selection of known (previously valuable) nodes and trusted/paid nodes.
type serverPool struct {
	// clock supplies the timestamps (clock.Now) used for timeout refreshing and
	// dial cost expiration.
	clock mclock.Clock
	// unixTime returns the current unix time in seconds. newServerPool sets it to
	// a wrapper around time.Now; it is used for redial wait bookkeeping.
	unixTime func() int64
	// db is the key-value store handed to the node state machine.
	db ethdb.KeyValueStore

	// ns holds the state flags and fields of every node known to the pool.
	ns *nodestate.NodeStateMachine
	// vt provides the response time statistics the timeout and the service
	// values are calculated from.
	vt *lpc.ValueTracker
	// mixer combines mixSources; the sources are only added to it in start.
	mixer      *enode.FairMix
	mixSources []enode.Iterator
	// dialIterator is the final output; every node read from it is put into
	// the sfDialing state.
	dialIterator enode.Iterator
	// validSchemes is used for parsing trustedURLs.
	validSchemes enr.IdentityScheme
	// trustedURLs lists the nodes that receive the sfAlwaysConnect flag in start.
	trustedURLs []string
	// fillSet is only created by addPreNegFilter, so it is nil when no
	// pre-negotiation query function was supplied.
	fillSet *lpc.FillSet
	// queryFails counts consecutive unanswered pre-negotiation queries. It is
	// accessed through sync/atomic only.
	queryFails uint32

	// timeoutLock protects timeout, timeWeights and timeoutRefreshed.
	timeoutLock      sync.RWMutex
	timeout          time.Duration
	timeWeights      lpc.ResponseTimeWeights
	timeoutRefreshed mclock.AbsTime
}
    75  
// nodeHistory keeps track of dial costs which determine node weight together with the
// service value calculated by lpc.ValueTracker. It also records the start and the
// planned end of the most recent redial wait period.
type nodeHistory struct {
	// dialCost is the accumulated, expiring cost of dial attempts and queries.
	dialCost                       utils.ExpiredValue
	redialWaitStart, redialWaitEnd int64 // unix time (seconds)
}
    82  
// nodeHistoryEnc is the form of nodeHistory that is RLP encoded when the nodeHistory
// field is persisted. The timestamps are stored as unsigned integers and converted
// back to int64 when decoding.
type nodeHistoryEnc struct {
	DialCost                       utils.ExpiredValue
	RedialWaitStart, RedialWaitEnd uint64
}
    87  
// queryFunc sends a pre-negotiation query to the given node and blocks until a response
// arrives or a timeout occurs.
// It returns 1 if the remote node has confirmed that connection is possible, 0 if it is
// not possible and -1 if no response arrived (timeout).
type queryFunc func(*enode.Node) int
    92  
    93  var (
    94  	serverPoolSetup    = &nodestate.Setup{Version: 1}
    95  	sfHasValue         = serverPoolSetup.NewPersistentFlag("hasValue")
    96  	sfQueried          = serverPoolSetup.NewFlag("queried")
    97  	sfCanDial          = serverPoolSetup.NewFlag("canDial")
    98  	sfDialing          = serverPoolSetup.NewFlag("dialed")
    99  	sfWaitDialTimeout  = serverPoolSetup.NewFlag("dialTimeout")
   100  	sfConnected        = serverPoolSetup.NewFlag("connected")
   101  	sfRedialWait       = serverPoolSetup.NewFlag("redialWait")
   102  	sfAlwaysConnect    = serverPoolSetup.NewFlag("alwaysConnect")
   103  	sfDisableSelection = nodestate.MergeFlags(sfQueried, sfCanDial, sfDialing, sfConnected, sfRedialWait)
   104  
   105  	sfiNodeHistory = serverPoolSetup.NewPersistentField("nodeHistory", reflect.TypeOf(nodeHistory{}),
   106  		func(field interface{}) ([]byte, error) {
   107  			if n, ok := field.(nodeHistory); ok {
   108  				ne := nodeHistoryEnc{
   109  					DialCost:        n.dialCost,
   110  					RedialWaitStart: uint64(n.redialWaitStart),
   111  					RedialWaitEnd:   uint64(n.redialWaitEnd),
   112  				}
   113  				enc, err := rlp.EncodeToBytes(&ne)
   114  				return enc, err
   115  			} else {
   116  				return nil, errors.New("invalid field type")
   117  			}
   118  		},
   119  		func(enc []byte) (interface{}, error) {
   120  			var ne nodeHistoryEnc
   121  			err := rlp.DecodeBytes(enc, &ne)
   122  			n := nodeHistory{
   123  				dialCost:        ne.DialCost,
   124  				redialWaitStart: int64(ne.RedialWaitStart),
   125  				redialWaitEnd:   int64(ne.RedialWaitEnd),
   126  			}
   127  			return n, err
   128  		},
   129  	)
   130  	sfiNodeWeight     = serverPoolSetup.NewField("nodeWeight", reflect.TypeOf(uint64(0)))
   131  	sfiConnectedStats = serverPoolSetup.NewField("connectedStats", reflect.TypeOf(lpc.ResponseTimeStats{}))
   132  )
   133  
   134  // newServerPool creates a new server pool
   135  func newServerPool(db ethdb.KeyValueStore, dbKey []byte, vt *lpc.ValueTracker, discovery enode.Iterator, mixTimeout time.Duration, query queryFunc, clock mclock.Clock, trustedURLs []string) *serverPool {
   136  	s := &serverPool{
   137  		db:           db,
   138  		clock:        clock,
   139  		unixTime:     func() int64 { return time.Now().Unix() },
   140  		validSchemes: enode.ValidSchemes,
   141  		trustedURLs:  trustedURLs,
   142  		vt:           vt,
   143  		ns:           nodestate.NewNodeStateMachine(db, []byte(string(dbKey)+"ns:"), clock, serverPoolSetup),
   144  	}
   145  	s.recalTimeout()
   146  	s.mixer = enode.NewFairMix(mixTimeout)
   147  	knownSelector := lpc.NewWrsIterator(s.ns, sfHasValue, sfDisableSelection, sfiNodeWeight)
   148  	alwaysConnect := lpc.NewQueueIterator(s.ns, sfAlwaysConnect, sfDisableSelection, true, nil)
   149  	s.mixSources = append(s.mixSources, knownSelector)
   150  	s.mixSources = append(s.mixSources, alwaysConnect)
   151  	if discovery != nil {
   152  		s.mixSources = append(s.mixSources, discovery)
   153  	}
   154  
   155  	iter := enode.Iterator(s.mixer)
   156  	if query != nil {
   157  		iter = s.addPreNegFilter(iter, query)
   158  	}
   159  	s.dialIterator = enode.Filter(iter, func(node *enode.Node) bool {
   160  		s.ns.SetState(node, sfDialing, sfCanDial, 0)
   161  		s.ns.SetState(node, sfWaitDialTimeout, nodestate.Flags{}, time.Second*10)
   162  		return true
   163  	})
   164  
   165  	s.ns.SubscribeState(nodestate.MergeFlags(sfWaitDialTimeout, sfConnected), func(n *enode.Node, oldState, newState nodestate.Flags) {
   166  		if oldState.Equals(sfWaitDialTimeout) && newState.IsEmpty() {
   167  			// dial timeout, no connection
   168  			s.setRedialWait(n, dialCost, dialWaitStep)
   169  			s.ns.SetState(n, nodestate.Flags{}, sfDialing, 0)
   170  		}
   171  	})
   172  
   173  	s.ns.AddLogMetrics(sfHasValue, sfDisableSelection, "selectable", nil, nil, serverSelectableGauge)
   174  	s.ns.AddLogMetrics(sfDialing, nodestate.Flags{}, "dialed", serverDialedMeter, nil, nil)
   175  	s.ns.AddLogMetrics(sfConnected, nodestate.Flags{}, "connected", nil, nil, serverConnectedGauge)
   176  	return s
   177  }
   178  
   179  // addPreNegFilter installs a node filter mechanism that performs a pre-negotiation query.
   180  // Nodes that are filtered out and does not appear on the output iterator are put back
   181  // into redialWait state.
   182  func (s *serverPool) addPreNegFilter(input enode.Iterator, query queryFunc) enode.Iterator {
   183  	s.fillSet = lpc.NewFillSet(s.ns, input, sfQueried)
   184  	s.ns.SubscribeState(sfQueried, func(n *enode.Node, oldState, newState nodestate.Flags) {
   185  		if newState.Equals(sfQueried) {
   186  			fails := atomic.LoadUint32(&s.queryFails)
   187  			if fails == maxQueryFails {
   188  				log.Warn("UDP pre-negotiation query does not seem to work")
   189  			}
   190  			if fails > maxQueryFails {
   191  				fails = maxQueryFails
   192  			}
   193  			if rand.Intn(maxQueryFails*2) < int(fails) {
   194  				// skip pre-negotiation with increasing chance, max 50%
   195  				// this ensures that the client can operate even if UDP is not working at all
   196  				s.ns.SetState(n, sfCanDial, nodestate.Flags{}, time.Second*10)
   197  				// set canDial before resetting queried so that FillSet will not read more
   198  				// candidates unnecessarily
   199  				s.ns.SetState(n, nodestate.Flags{}, sfQueried, 0)
   200  				return
   201  			}
   202  			go func() {
   203  				q := query(n)
   204  				if q == -1 {
   205  					atomic.AddUint32(&s.queryFails, 1)
   206  				} else {
   207  					atomic.StoreUint32(&s.queryFails, 0)
   208  				}
   209  				if q == 1 {
   210  					s.ns.SetState(n, sfCanDial, nodestate.Flags{}, time.Second*10)
   211  				} else {
   212  					s.setRedialWait(n, queryCost, queryWaitStep)
   213  				}
   214  				s.ns.SetState(n, nodestate.Flags{}, sfQueried, 0)
   215  			}()
   216  		}
   217  	})
   218  	return lpc.NewQueueIterator(s.ns, sfCanDial, nodestate.Flags{}, false, func(waiting bool) {
   219  		if waiting {
   220  			s.fillSet.SetTarget(preNegLimit)
   221  		} else {
   222  			s.fillSet.SetTarget(0)
   223  		}
   224  	})
   225  }
   226  
   227  // start starts the server pool. Note that NodeStateMachine should be started first.
   228  func (s *serverPool) start() {
   229  	s.ns.Start()
   230  	for _, iter := range s.mixSources {
   231  		// add sources to mixer at startup because the mixer instantly tries to read them
   232  		// which should only happen after NodeStateMachine has been started
   233  		s.mixer.AddSource(iter)
   234  	}
   235  	for _, url := range s.trustedURLs {
   236  		if node, err := enode.Parse(s.validSchemes, url); err == nil {
   237  			s.ns.SetState(node, sfAlwaysConnect, nodestate.Flags{}, 0)
   238  		} else {
   239  			log.Error("Invalid trusted server URL", "url", url, "error", err)
   240  		}
   241  	}
   242  	unixTime := s.unixTime()
   243  	s.ns.ForEach(sfHasValue, nodestate.Flags{}, func(node *enode.Node, state nodestate.Flags) {
   244  		s.calculateWeight(node)
   245  		if n, ok := s.ns.GetField(node, sfiNodeHistory).(nodeHistory); ok && n.redialWaitEnd > unixTime {
   246  			wait := n.redialWaitEnd - unixTime
   247  			lastWait := n.redialWaitEnd - n.redialWaitStart
   248  			if wait > lastWait {
   249  				// if the time until expiration is larger than the last suggested
   250  				// waiting time then the system clock was probably adjusted
   251  				wait = lastWait
   252  			}
   253  			s.ns.SetState(node, sfRedialWait, nodestate.Flags{}, time.Duration(wait)*time.Second)
   254  		}
   255  	})
   256  }
   257  
   258  // stop stops the server pool
   259  func (s *serverPool) stop() {
   260  	s.dialIterator.Close()
   261  	if s.fillSet != nil {
   262  		s.fillSet.Close()
   263  	}
   264  	s.ns.ForEach(sfConnected, nodestate.Flags{}, func(n *enode.Node, state nodestate.Flags) {
   265  		// recalculate weight of connected nodes in order to update hasValue flag if necessary
   266  		s.calculateWeight(n)
   267  	})
   268  	s.ns.Stop()
   269  }
   270  
   271  // registerPeer implements serverPeerSubscriber
   272  func (s *serverPool) registerPeer(p *serverPeer) {
   273  	s.ns.SetState(p.Node(), sfConnected, sfDialing.Or(sfWaitDialTimeout), 0)
   274  	nvt := s.vt.Register(p.ID())
   275  	s.ns.SetField(p.Node(), sfiConnectedStats, nvt.RtStats())
   276  	p.setValueTracker(s.vt, nvt)
   277  	p.updateVtParams()
   278  }
   279  
   280  // unregisterPeer implements serverPeerSubscriber
   281  func (s *serverPool) unregisterPeer(p *serverPeer) {
   282  	s.setRedialWait(p.Node(), dialCost, dialWaitStep)
   283  	s.ns.SetState(p.Node(), nodestate.Flags{}, sfConnected, 0)
   284  	s.ns.SetField(p.Node(), sfiConnectedStats, nil)
   285  	s.vt.Unregister(p.ID())
   286  	p.setValueTracker(nil, nil)
   287  }
   288  
   289  // recalTimeout calculates the current recommended timeout. This value is used by
   290  // the client as a "soft timeout" value. It also affects the service value calculation
   291  // of individual nodes.
   292  func (s *serverPool) recalTimeout() {
   293  	// Use cached result if possible, avoid recalculating too frequently.
   294  	s.timeoutLock.RLock()
   295  	refreshed := s.timeoutRefreshed
   296  	s.timeoutLock.RUnlock()
   297  	now := s.clock.Now()
   298  	if refreshed != 0 && time.Duration(now-refreshed) < timeoutRefresh {
   299  		return
   300  	}
   301  	// Cached result is stale, recalculate a new one.
   302  	rts := s.vt.RtStats()
   303  
   304  	// Add a fake statistic here. It is an easy way to initialize with some
   305  	// conservative values when the database is new. As soon as we have a
   306  	// considerable amount of real stats this small value won't matter.
   307  	rts.Add(time.Second*2, 10, s.vt.StatsExpFactor())
   308  
   309  	// Use either 10% failure rate timeout or twice the median response time
   310  	// as the recommended timeout.
   311  	timeout := minTimeout
   312  	if t := rts.Timeout(0.1); t > timeout {
   313  		timeout = t
   314  	}
   315  	if t := rts.Timeout(0.5) * 2; t > timeout {
   316  		timeout = t
   317  	}
   318  	s.timeoutLock.Lock()
   319  	if s.timeout != timeout {
   320  		s.timeout = timeout
   321  		s.timeWeights = lpc.TimeoutWeights(s.timeout)
   322  
   323  		suggestedTimeoutGauge.Update(int64(s.timeout / time.Millisecond))
   324  		totalValueGauge.Update(int64(rts.Value(s.timeWeights, s.vt.StatsExpFactor())))
   325  	}
   326  	s.timeoutRefreshed = now
   327  	s.timeoutLock.Unlock()
   328  }
   329  
   330  // getTimeout returns the recommended request timeout.
   331  func (s *serverPool) getTimeout() time.Duration {
   332  	s.recalTimeout()
   333  	s.timeoutLock.RLock()
   334  	defer s.timeoutLock.RUnlock()
   335  	return s.timeout
   336  }
   337  
   338  // getTimeoutAndWeight returns the recommended request timeout as well as the
   339  // response time weight which is necessary to calculate service value.
   340  func (s *serverPool) getTimeoutAndWeight() (time.Duration, lpc.ResponseTimeWeights) {
   341  	s.recalTimeout()
   342  	s.timeoutLock.RLock()
   343  	defer s.timeoutLock.RUnlock()
   344  	return s.timeout, s.timeWeights
   345  }
   346  
   347  // addDialCost adds the given amount of dial cost to the node history and returns the current
   348  // amount of total dial cost
   349  func (s *serverPool) addDialCost(n *nodeHistory, amount int64) uint64 {
   350  	logOffset := s.vt.StatsExpirer().LogOffset(s.clock.Now())
   351  	if amount > 0 {
   352  		n.dialCost.Add(amount, logOffset)
   353  	}
   354  	totalDialCost := n.dialCost.Value(logOffset)
   355  	if totalDialCost < dialCost {
   356  		totalDialCost = dialCost
   357  	}
   358  	return totalDialCost
   359  }
   360  
   361  // serviceValue returns the service value accumulated in this session and in total
   362  func (s *serverPool) serviceValue(node *enode.Node) (sessionValue, totalValue float64) {
   363  	nvt := s.vt.GetNode(node.ID())
   364  	if nvt == nil {
   365  		return 0, 0
   366  	}
   367  	currentStats := nvt.RtStats()
   368  	_, timeWeights := s.getTimeoutAndWeight()
   369  	expFactor := s.vt.StatsExpFactor()
   370  
   371  	totalValue = currentStats.Value(timeWeights, expFactor)
   372  	if connStats, ok := s.ns.GetField(node, sfiConnectedStats).(lpc.ResponseTimeStats); ok {
   373  		diff := currentStats
   374  		diff.SubStats(&connStats)
   375  		sessionValue = diff.Value(timeWeights, expFactor)
   376  		sessionValueMeter.Mark(int64(sessionValue))
   377  	}
   378  	return
   379  }
   380  
   381  // updateWeight calculates the node weight and updates the nodeWeight field and the
   382  // hasValue flag. It also saves the node state if necessary.
   383  func (s *serverPool) updateWeight(node *enode.Node, totalValue float64, totalDialCost uint64) {
   384  	weight := uint64(totalValue * nodeWeightMul / float64(totalDialCost))
   385  	if weight >= nodeWeightThreshold {
   386  		s.ns.SetState(node, sfHasValue, nodestate.Flags{}, 0)
   387  		s.ns.SetField(node, sfiNodeWeight, weight)
   388  	} else {
   389  		s.ns.SetState(node, nodestate.Flags{}, sfHasValue, 0)
   390  		s.ns.SetField(node, sfiNodeWeight, nil)
   391  	}
   392  	s.ns.Persist(node) // saved if node history or hasValue changed
   393  }
   394  
   395  // setRedialWait calculates and sets the redialWait timeout based on the service value
   396  // and dial cost accumulated during the last session/attempt and in total.
   397  // The waiting time is raised exponentially if no service value has been received in order
   398  // to prevent dialing an unresponsive node frequently for a very long time just because it
   399  // was useful in the past. It can still be occasionally dialed though and once it provides
   400  // a significant amount of service value again its waiting time is quickly reduced or reset
   401  // to the minimum.
   402  // Note: node weight is also recalculated and updated by this function.
   403  func (s *serverPool) setRedialWait(node *enode.Node, addDialCost int64, waitStep float64) {
   404  	n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory)
   405  	sessionValue, totalValue := s.serviceValue(node)
   406  	totalDialCost := s.addDialCost(&n, addDialCost)
   407  
   408  	// if the current dial session has yielded at least the average value/dial cost ratio
   409  	// then the waiting time should be reset to the minimum. If the session value
   410  	// is below average but still positive then timeout is limited to the ratio of
   411  	// average / current service value multiplied by the minimum timeout. If the attempt
   412  	// was unsuccessful then timeout is raised exponentially without limitation.
   413  	// Note: dialCost is used in the formula below even if dial was not attempted at all
   414  	// because the pre-negotiation query did not return a positive result. In this case
   415  	// the ratio has no meaning anyway and waitFactor is always raised, though in smaller
   416  	// steps because queries are cheaper and therefore we can allow more failed attempts.
   417  	unixTime := s.unixTime()
   418  	plannedTimeout := float64(n.redialWaitEnd - n.redialWaitStart) // last planned redialWait timeout
   419  	var actualWait float64                                         // actual waiting time elapsed
   420  	if unixTime > n.redialWaitEnd {
   421  		// the planned timeout has elapsed
   422  		actualWait = plannedTimeout
   423  	} else {
   424  		// if the node was redialed earlier then we do not raise the planned timeout
   425  		// exponentially because that could lead to the timeout rising very high in
   426  		// a short amount of time
   427  		// Note that in case of an early redial actualWait also includes the dial
   428  		// timeout or connection time of the last attempt but it still serves its
   429  		// purpose of preventing the timeout rising quicker than linearly as a function
   430  		// of total time elapsed without a successful connection.
   431  		actualWait = float64(unixTime - n.redialWaitStart)
   432  	}
   433  	// raise timeout exponentially if the last planned timeout has elapsed
   434  	// (use at least the last planned timeout otherwise)
   435  	nextTimeout := actualWait * waitStep
   436  	if plannedTimeout > nextTimeout {
   437  		nextTimeout = plannedTimeout
   438  	}
   439  	// we reduce the waiting time if the server has provided service value during the
   440  	// connection (but never under the minimum)
   441  	a := totalValue * dialCost * float64(minRedialWait)
   442  	b := float64(totalDialCost) * sessionValue
   443  	if a < b*nextTimeout {
   444  		nextTimeout = a / b
   445  	}
   446  	if nextTimeout < minRedialWait {
   447  		nextTimeout = minRedialWait
   448  	}
   449  	wait := time.Duration(float64(time.Second) * nextTimeout)
   450  	if wait < waitThreshold {
   451  		n.redialWaitStart = unixTime
   452  		n.redialWaitEnd = unixTime + int64(nextTimeout)
   453  		s.ns.SetField(node, sfiNodeHistory, n)
   454  		s.ns.SetState(node, sfRedialWait, nodestate.Flags{}, wait)
   455  		s.updateWeight(node, totalValue, totalDialCost)
   456  	} else {
   457  		// discard known node statistics if waiting time is very long because the node
   458  		// hasn't been responsive for a very long time
   459  		s.ns.SetField(node, sfiNodeHistory, nil)
   460  		s.ns.SetField(node, sfiNodeWeight, nil)
   461  		s.ns.SetState(node, nodestate.Flags{}, sfHasValue, 0)
   462  	}
   463  }
   464  
   465  // calculateWeight calculates and sets the node weight without altering the node history.
   466  // This function should be called during startup and shutdown only, otherwise setRedialWait
   467  // will keep the weights updated as the underlying statistics are adjusted.
   468  func (s *serverPool) calculateWeight(node *enode.Node) {
   469  	n, _ := s.ns.GetField(node, sfiNodeHistory).(nodeHistory)
   470  	_, totalValue := s.serviceValue(node)
   471  	totalDialCost := s.addDialCost(&n, 0)
   472  	s.updateWeight(node, totalValue, totalDialCost)
   473  }