github.com/theQRL/go-zond@v0.1.1/zond/downloader/fetchers_concurrent.go

// Copyright 2021 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package downloader

import (
	"errors"
	"sort"
	"time"

	"github.com/theQRL/go-zond/common"
	"github.com/theQRL/go-zond/common/prque"
	"github.com/theQRL/go-zond/log"
	"github.com/theQRL/go-zond/zond/protocols/zond"
)

// timeoutGracePeriod is the amount of time to allow for a peer to deliver a
// response to a locally already timed out request. Timeouts are not penalized,
// as a peer might be temporarily overloaded; however, peers still must reply
// to each request. Failing to do so is considered a protocol violation.
var timeoutGracePeriod = 2 * time.Minute

// typedQueue is an interface defining the adaptor needed to translate the type
// specific downloader/queue schedulers into the type-agnostic general concurrent
// fetcher algorithm calls.
type typedQueue interface {
	// waker returns a notification channel that gets pinged in case more fetches
	// have been queued up, so the fetcher might assign them to idle peers.
	waker() chan bool

	// pending returns the number of wrapped items that are currently queued for
	// fetching by the concurrent downloader.
	pending() int

	// capacity is responsible for calculating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve within the
	// allotted round trip time.
	capacity(peer *peerConnection, rtt time.Duration) int

	// updateCapacity is responsible for updating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve in a unit time.
	updateCapacity(peer *peerConnection, items int, elapsed time.Duration)

	// reserve is responsible for allocating a requested number of pending items
	// from the download queue to the specified peer.
	reserve(peer *peerConnection, items int) (*fetchRequest, bool, bool)

	// unreserve is responsible for removing the current retrieval allocation
	// assigned to a specific peer and placing it back into the pool to allow
	// reassigning to some other peer.
	unreserve(peer string) int

	// request is responsible for converting a generic fetch request into a typed
	// one and sending it to the remote peer for fulfillment.
	request(peer *peerConnection, req *fetchRequest, resCh chan *zond.Response) (*zond.Request, error)

	// deliver is responsible for taking a generic response packet from the
	// concurrent fetcher, unpacking the type specific data and delivering
	// it to the downloader's queue.
	deliver(peer *peerConnection, packet *zond.Response) (int, error)
}
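
// For orientation, the sketch below shows the shape a concrete adaptor takes;
// it is illustrative only and not part of the original file. The sketchQueue
// name and its do-nothing method bodies are hypothetical stand-ins, whereas
// the real adaptors in this package wrap the downloader's queue with type
// specific reservation, request and delivery logic:
//
//	type sketchQueue struct{ wake chan bool }
//
//	func (q *sketchQueue) waker() chan bool { return q.wake }
//	func (q *sketchQueue) pending() int     { return 0 } // nothing scheduled
//
//	// capacity: pretend every peer can serve one item per round trip.
//	func (q *sketchQueue) capacity(peer *peerConnection, rtt time.Duration) int { return 1 }
//
//	func (q *sketchQueue) updateCapacity(peer *peerConnection, items int, elapsed time.Duration) {}
//
//	// reserve: no request, no progress, no throttling.
//	func (q *sketchQueue) reserve(peer *peerConnection, items int) (*fetchRequest, bool, bool) {
//		return nil, false, false
//	}
//
//	func (q *sketchQueue) unreserve(peer string) int { return 0 }
//
//	func (q *sketchQueue) request(peer *peerConnection, req *fetchRequest, resCh chan *zond.Response) (*zond.Request, error) {
//		return nil, errors.New("sketchQueue cannot fetch")
//	}
//
//	func (q *sketchQueue) deliver(peer *peerConnection, packet *zond.Response) (int, error) {
//		return 0, nil // accept nothing
//	}
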

// concurrentFetch iteratively downloads scheduled block parts, taking available
// peers, reserving a chunk of fetch requests for each and waiting for delivery
// or timeouts.
func (d *Downloader) concurrentFetch(queue typedQueue, beaconMode bool) error {
	// Create a delivery channel to accept responses from all peers
	responses := make(chan *zond.Response)

	// Track the currently active requests and their timeout order
	pending := make(map[string]*zond.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range pending {
			req.Close()
		}
	}()
	ordering := make(map[*zond.Request]int)
	timeouts := prque.New[int64, *zond.Request](func(data *zond.Request, index int) {
		ordering[data] = index
	})
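	// Note on ordering: prque pops the highest priority first, so deadlines are
	// pushed below as negated unix nanos. The soonest-expiring request thereby
	// sits at the root of the heap, and -exp recovers the absolute deadline.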

	timeout := time.NewTimer(0)
	if !timeout.Stop() {
		<-timeout.C
	}
	defer timeout.Stop()
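	// The dance above produces a timer that is valid but not running: NewTimer(0)
	// fires immediately and the Stop/drain pair empties its channel, so the first
	// Reset later on arms it cleanly without a stale tick left pending.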

	// Track the timed-out but not-yet-answered requests separately. We want to
	// keep tracking which peers are busy (potentially overloaded), so removing
	// all trace of a timed out request is not good. We also can't just cancel
	// the pending request altogether as that would prevent a late response from
	// being delivered, thus never unblocking the peer.
	stales := make(map[string]*zond.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range stales {
			req.Close()
		}
	}()
	// Subscribe to peer lifecycle events to schedule tasks to new joiners and
	// reschedule tasks upon disconnections. We don't care which event happened
	// for simplicity, so just use a single channel.
	peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection

	peeringSub := d.peers.SubscribeEvents(peering)
	defer peeringSub.Unsubscribe()

	// Prepare the queue and fetch block parts until the block header fetcher's done
	finished := false
	for {
		// Short circuit if we lost all our peers
		if d.peers.Len() == 0 && !beaconMode {
			return errNoPeers
		}
		// If there's nothing more to fetch, wait or terminate
		if queue.pending() == 0 {
			if len(pending) == 0 && finished {
				return nil
			}
		} else {
			// Send a download request to all idle peers, until throttled
			var (
				idles []*peerConnection
				caps  []int
			)
			for _, peer := range d.peers.AllPeers() {
				pending, stale := pending[peer.id], stales[peer.id]
				if pending == nil && stale == nil {
					idles = append(idles, peer)
					caps = append(caps, queue.capacity(peer, time.Second))
				} else if stale != nil {
					if waited := time.Since(stale.Sent); waited > timeoutGracePeriod {
						// Request has been in flight longer than the grace period
						// permitted it; consider the peer malicious, attempting to
						// stall the sync.
						peer.log.Warn("Peer stalling, dropping", "waited", common.PrettyDuration(waited))
						d.dropPeer(peer.id)
					}
				}
			}
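			// Try the most capable idle peers first. peerCapacitySort (defined
			// elsewhere in this package) sorts both slices in tandem by the
			// estimated capacity, highest first.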
			sort.Sort(&peerCapacitySort{idles, caps})

			var (
				progressed bool
				throttled  bool
				queued     = queue.pending()
			)
			for _, peer := range idles {
				// Short circuit if throttling was activated or there are no more
				// queued tasks to be retrieved
				if throttled {
					break
				}
				if queued = queue.pending(); queued == 0 {
					break
				}
				// Reserve a chunk of fetches for a peer. A nil can mean either that
				// no more items are available, or that the peer is known not to
				// have them.
				request, progress, throttle := queue.reserve(peer, queue.capacity(peer, d.peers.rates.TargetRoundTrip()))
				if progress {
					progressed = true
				}
				if throttle {
					throttled = true
					throttleCounter.Inc(1)
				}
				if request == nil {
					continue
				}
				// Fetch the chunk and make sure any errors return the hashes to the queue
				req, err := queue.request(peer, request, responses)
				if err != nil {
					// Sending the request failed, which generally means the peer
					// was disconnected in between assignment and network send.
					// Although all peer removal operations return allocated tasks
					// to the queue, that is async, and we can do better here by
					// immediately pushing the unfulfilled requests.
					queue.unreserve(peer.id) // TODO(karalabe): This needs a non-expiration method
					continue
				}
				pending[peer.id] = req

				ttl := d.peers.rates.TargetTimeout()
				ordering[req] = timeouts.Size()

				timeouts.Push(req, -time.Now().Add(ttl).UnixNano())
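				// The timer only needs arming on the empty-to-non-empty transition:
				// deadlines are assumed to be pushed in (mostly) increasing order,
				// so a non-empty heap already has an earlier deadline scheduled.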
				if timeouts.Size() == 1 {
					timeout.Reset(ttl)
				}
			}
			// Make sure that we have peers available for fetching. If all peers
			// have been tried and all failed, throw an error
			if !progressed && !throttled && len(pending) == 0 && len(idles) == d.peers.Len() && queued > 0 && !beaconMode {
				return errPeersUnavailable
			}
		}
		// Wait for something to happen
		select {
		case <-d.cancelCh:
			// If sync was cancelled, tear down the parallel retriever. Pending
			// requests will be cancelled locally, and the remote responses will
			// be dropped when they arrive
			return errCanceled

		case event := <-peering:
			// A peer joined or left, so the task queue and allocations need to
			// be checked for potential assignment or reassignment
			peerid := event.peer.id

			if event.join {
				// Sanity check the internal state; this can be dropped later
				if _, ok := pending[peerid]; ok {
					event.peer.log.Error("Pending request exists for joining peer")
				}
				if _, ok := stales[peerid]; ok {
					event.peer.log.Error("Stale request exists for joining peer")
				}
				// Loop back to the entry point for task assignment
				continue
			}
			// A peer left, so any existing requests need to be untracked, pending
			// tasks returned and possible reassignment checked
			if req, ok := pending[peerid]; ok {
				queue.unreserve(peerid) // TODO(karalabe): This needs a non-expiration method
				delete(pending, peerid)
				req.Close()

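				// If the request was still tracked for timeouts, remove it from the
				// heap. When it sat at the root (index 0), the running timer targets
				// its deadline, so the timer must be stopped, drained and re-armed
				// against the next soonest deadline, if any remains.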
				if index, live := ordering[req]; live {
					timeouts.Remove(index)
					if index == 0 {
						if !timeout.Stop() {
							<-timeout.C
						}
						if timeouts.Size() > 0 {
							_, exp := timeouts.Peek()
							timeout.Reset(time.Until(time.Unix(0, -exp)))
						}
					}
					delete(ordering, req)
				}
			}
			if req, ok := stales[peerid]; ok {
				delete(stales, peerid)
				req.Close()
			}

		case <-timeout.C:
			// Retrieve the next request which should have timed out. The check
			// below is purely to catch programming errors: given correct code,
			// there's no possible order of events that should result in a
			// timeout firing for a non-existent event.
			req, exp := timeouts.Peek()
			if now, at := time.Now(), time.Unix(0, -exp); now.Before(at) {
				log.Error("Timeout triggered but not reached", "left", at.Sub(now))
				timeout.Reset(at.Sub(now))
				continue
			}
			// Stop tracking the timed out request from a timing perspective,
			// cancel it, so it's not considered in-flight anymore, but keep
			// the peer marked busy to prevent assigning a second request and
			// overloading it further.
			delete(pending, req.Peer)
			stales[req.Peer] = req
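			// From here the peer either redeems itself with a late delivery (the
			// responses case below clears the stale marker) or keeps stalling
			// until the grace-period check in the assignment loop drops it.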

			// Popping reorders the indices tracked in `ordering` (the heap's swap
			// callback rewrites them), so delete the entry only after the pop;
			// deleting first would let the callback resurrect it!
			timeouts.Pop()
			if timeouts.Size() > 0 {
				_, exp := timeouts.Peek()
				timeout.Reset(time.Until(time.Unix(0, -exp)))
			}
			delete(ordering, req)

			// New timeout potentially set if there are more requests pending,
			// reschedule the failed one to a free peer
			fails := queue.unreserve(req.Peer)

			// Finally, update the peer's retrieval capacity, or if it's already
			// below the minimum allowance, drop the peer. If a lot of retrieval
			// elements expired, we might have overestimated the remote peer or
			// perhaps ourselves. Only reset to minimal throughput, but don't drop
			// just yet.
			//
			// The reason the minimum threshold is 2 is that the downloader tries
			// to estimate the bandwidth and latency of a peer separately, which
			// requires pushing the measured capacity a bit and seeing how response
			// times react to it, so it always requests one more than the minimum
			// (i.e. min 2).
			peer := d.peers.Peer(req.Peer)
			if peer == nil {
				// If the peer got disconnected in between, we should really have
				// short-circuited it already. Just in case there's some strange
				// codepath, leave this check in to avoid a crash.
				log.Error("Delivery timeout from unknown peer", "peer", req.Peer)
				continue
			}
			if fails > 2 {
				queue.updateCapacity(peer, 0, 0)
			} else {
				d.dropPeer(peer.id)

				// If this peer was the master peer, abort sync immediately
				d.cancelLock.RLock()
				master := peer.id == d.cancelPeer
				d.cancelLock.RUnlock()

				if master {
					d.cancel()
					return errTimeout
				}
			}

		case res := <-responses:
			// Response arrived, it may be for an existing or an already timed
			// out request. If the former, update the timeout heap and perhaps
			// reschedule the timeout timer.
			index, live := ordering[res.Req]
			if live {
				timeouts.Remove(index)
				if index == 0 {
					if !timeout.Stop() {
						<-timeout.C
					}
					if timeouts.Size() > 0 {
						_, exp := timeouts.Peek()
						timeout.Reset(time.Until(time.Unix(0, -exp)))
					}
				}
				delete(ordering, res.Req)
			}
			// Delete the pending request (if it still exists) and mark the peer idle
			delete(pending, res.Req.Peer)
			delete(stales, res.Req.Peer)

			// Signal the dispatcher that the round trip is done. We'll drop the
			// peer if the data turns out to be junk.
			res.Done <- nil
			res.Req.Close()

			// If the peer was previously banned or dropped for failing to deliver
			// its pack in a reasonable time frame, it is no longer tracked, so
			// ignore its message.
			if peer := d.peers.Peer(res.Req.Peer); peer != nil {
				// Deliver the received chunk of data and check chain validity
				accepted, err := queue.deliver(peer, res)
				if errors.Is(err, errInvalidChain) {
					return err
				}
				// Unless a peer delivered something completely different from what
				// was requested (usually caused by a timed out request which came
				// through in the end), set it to idle. If the delivery's stale,
				// the peer should have already been idled.
				if !errors.Is(err, errStaleDelivery) {
					queue.updateCapacity(peer, accepted, res.Time)
				}
			}

		case cont := <-queue.waker():
			// The header fetcher sent a continuation flag, check if it's done
			if !cont {
				finished = true
			}
		}
	}
}
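
// For context, this is roughly how the fetcher is driven. In upstream
// go-ethereum the type-specific fetchers hand concurrentFetch a queue adaptor
// cast from the downloader itself; the bodyQueue adaptor named below follows
// that upstream pattern and is assumed here, not quoted from this repository:
//
//	func (d *Downloader) fetchBodies(from uint64, beaconMode bool) error {
//		log.Debug("Downloading block bodies", "origin", from)
//		err := d.concurrentFetch((*bodyQueue)(d), beaconMode)
//		log.Debug("Block body download terminated", "err", err)
//		return err
//	}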