github.com/theQRL/go-zond@v0.1.1/zond/downloader/fetchers_concurrent.go

// Copyright 2021 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package downloader

import (
	"errors"
	"sort"
	"time"

	"github.com/theQRL/go-zond/common"
	"github.com/theQRL/go-zond/common/prque"
	"github.com/theQRL/go-zond/log"
	"github.com/theQRL/go-zond/zond/protocols/zond"
)

// timeoutGracePeriod is the amount of time to allow for a peer to deliver a
// response to a locally already timed out request. Timeouts are not penalized
// as a peer might be temporarily overloaded, however, they still must reply
// to each request. Failing to do so is considered a protocol violation.
var timeoutGracePeriod = 2 * time.Minute

// typedQueue is an interface defining the adaptor needed to translate the type
// specific downloader/queue schedulers into the type-agnostic general concurrent
// fetcher algorithm calls.
type typedQueue interface {
	// waker returns a notification channel that gets pinged in case more fetches
	// have been queued up, so the fetcher might assign it to idle peers.
	waker() chan bool

	// pending returns the number of wrapped items that are currently queued for
	// fetching by the concurrent downloader.
	pending() int

	// capacity is responsible for calculating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve within the
	// allotted round trip time.
	capacity(peer *peerConnection, rtt time.Duration) int

	// updateCapacity is responsible for updating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve in a unit time.
	updateCapacity(peer *peerConnection, items int, elapsed time.Duration)

	// reserve is responsible for allocating a requested number of pending items
	// from the download queue to the specified peer.
	reserve(peer *peerConnection, items int) (*fetchRequest, bool, bool)

	// unreserve is responsible for removing the current retrieval allocation
	// assigned to a specific peer and placing it back into the pool to allow
	// reassigning to some other peer.
	unreserve(peer string) int

	// request is responsible for converting a generic fetch request into a typed
	// one and sending it to the remote peer for fulfillment.
	request(peer *peerConnection, req *fetchRequest, resCh chan *zond.Response) (*zond.Request, error)

	// deliver is responsible for taking a generic response packet from the
	// concurrent fetcher, unpacking the type specific data and delivering
	// it to the downloader's queue.
	deliver(peer *peerConnection, packet *zond.Response) (int, error)
}
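
// The adaptors implementing typedQueue are expected to be thin shims that
// forward these calls to the downloader's shared queue and rate trackers, one
// per downloadable block part. As a purely illustrative sketch (the type and
// method bodies below are hypothetical, not defined in this file), a body
// adaptor could look like:
//
//	type bodyQueue Downloader
//
//	func (q *bodyQueue) waker() chan bool { /* return the queue's body wake channel */ }
//	func (q *bodyQueue) pending() int     { /* return the number of queued body fetches */ }
//	// ...with capacity, reserve, request and deliver delegating likewise.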

// concurrentFetch iteratively downloads scheduled block parts, taking available
// peers, reserving a chunk of fetch requests for each and waiting for delivery
// or timeouts.
func (d *Downloader) concurrentFetch(queue typedQueue, beaconMode bool) error {
	// Create a delivery channel to accept responses from all peers
	responses := make(chan *zond.Response)

	// Track the currently active requests and their timeout order
	pending := make(map[string]*zond.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range pending {
			req.Close()
		}
	}()
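	// On top of the per-peer `pending` map above, the in-flight requests are
	// kept in a priority queue keyed by the negated absolute deadline (UnixNano),
	// so the request expiring soonest is always on top; the `ordering` map
	// mirrors each request's heap index via the prque callback so entries can
	// later be removed from the middle of the heap. The timer below starts out
	// disarmed (created, stopped and drained) and is only armed once the first
	// request is dispatched.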
	ordering := make(map[*zond.Request]int)
	timeouts := prque.New[int64, *zond.Request](func(data *zond.Request, index int) {
		ordering[data] = index
	})

	timeout := time.NewTimer(0)
	if !timeout.Stop() {
		<-timeout.C
	}
	defer timeout.Stop()

	// Track the timed-out but not-yet-answered requests separately. We want to
	// keep tracking which peers are busy (potentially overloaded), so removing
	// all trace of a timed out request is not good. We also can't just cancel
	// the pending request altogether as that would prevent a late response from
	// being delivered, thus never unblocking the peer.
	stales := make(map[string]*zond.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range stales {
			req.Close()
		}
	}()
	// Subscribe to peer lifecycle events to schedule tasks to new joiners and
	// reschedule tasks upon disconnections. We don't care which event happened
	// for simplicity, so just use a single channel.
	peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection

	peeringSub := d.peers.SubscribeEvents(peering)
	defer peeringSub.Unsubscribe()

	// Prepare the queue and fetch block parts until the block header fetcher's done
	finished := false
	for {
		// Short circuit if we lost all our peers
		if d.peers.Len() == 0 && !beaconMode {
			return errNoPeers
		}
		// If there's nothing more to fetch, wait or terminate
		if queue.pending() == 0 {
			if len(pending) == 0 && finished {
				return nil
			}
		} else {
			// Send a download request to all idle peers, until throttled
			var (
				idles []*peerConnection
				caps  []int
			)
			for _, peer := range d.peers.AllPeers() {
				pending, stale := pending[peer.id], stales[peer.id]
				if pending == nil && stale == nil {
					idles = append(idles, peer)
					caps = append(caps, queue.capacity(peer, time.Second))
				} else if stale != nil {
					if waited := time.Since(stale.Sent); waited > timeoutGracePeriod {
						// Request has been in flight longer than the grace period
						// permitted it, consider the peer malicious attempting to
						// stall the sync.
						peer.log.Warn("Peer stalling, dropping", "waited", common.PrettyDuration(waited))
						d.dropPeer(peer.id)
					}
				}
			}
			sort.Sort(&peerCapacitySort{idles, caps})

			var (
				progressed bool
				throttled  bool
				queued     = queue.pending()
			)
			for _, peer := range idles {
				// Short circuit if throttling activated or there are no more
				// queued tasks to be retrieved
				if throttled {
					break
				}
				if queued = queue.pending(); queued == 0 {
					break
				}
				// Reserve a chunk of fetches for a peer. A nil can mean either that
				// no more headers are available, or that the peer is known not to
				// have them.
				request, progress, throttle := queue.reserve(peer, queue.capacity(peer, d.peers.rates.TargetRoundTrip()))
				if progress {
					progressed = true
				}
				if throttle {
					throttled = true
					throttleCounter.Inc(1)
				}
				if request == nil {
					continue
				}
				// Fetch the chunk and make sure any errors return the hashes to the queue
				req, err := queue.request(peer, request, responses)
				if err != nil {
					// Sending the request failed, which generally means the peer
					// was disconnected in between assignment and network send.
					// Although all peer removal operations return allocated tasks
					// to the queue, that is async, and we can do better here by
					// immediately pushing the unfulfilled requests.
					queue.unreserve(peer.id) // TODO(karalabe): This needs a non-expiration method
					continue
				}
				pending[peer.id] = req

				ttl := d.peers.rates.TargetTimeout()
				ordering[req] = timeouts.Size()

				timeouts.Push(req, -time.Now().Add(ttl).UnixNano())
				if timeouts.Size() == 1 {
					timeout.Reset(ttl)
				}
			}
			// Make sure that we have peers available for fetching. If all peers have been tried
			// and all failed throw an error
			if !progressed && !throttled && len(pending) == 0 && len(idles) == d.peers.Len() && queued > 0 && !beaconMode {
				return errPeersUnavailable
			}
		}
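		// Scheduling (if any) is done for this iteration; block below until one
		// of five events fires: sync cancellation, a peer joining or leaving, a
		// request timing out, a response arriving, or the queue's waker channel
		// signalling more queued tasks or completion.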
		// Wait for something to happen
		select {
		case <-d.cancelCh:
			// If sync was cancelled, tear down the parallel retriever. Pending
			// requests will be cancelled locally, and the remote responses will
			// be dropped when they arrive
			return errCanceled

		case event := <-peering:
			// A peer joined or left, the tasks queue and allocations need to be
			// checked for potential assignment or reassignment
			peerid := event.peer.id

			if event.join {
				// Sanity check the internal state; this can be dropped later
				if _, ok := pending[peerid]; ok {
					event.peer.log.Error("Pending request exists for joining peer")
				}
				if _, ok := stales[peerid]; ok {
					event.peer.log.Error("Stale request exists for joining peer")
				}
				// Loop back to the entry point for task assignment
				continue
			}
			// A peer left, any existing requests need to be untracked, pending
			// tasks returned and possible reassignment checked
			if req, ok := pending[peerid]; ok {
				queue.unreserve(peerid) // TODO(karalabe): This needs a non-expiration method
				delete(pending, peerid)
				req.Close()

				if index, live := ordering[req]; live {
					timeouts.Remove(index)
					if index == 0 {
						if !timeout.Stop() {
							<-timeout.C
						}
						if timeouts.Size() > 0 {
							_, exp := timeouts.Peek()
							timeout.Reset(time.Until(time.Unix(0, -exp)))
						}
					}
					delete(ordering, req)
				}
			}
			if req, ok := stales[peerid]; ok {
				delete(stales, peerid)
				req.Close()
			}

		case <-timeout.C:
			// Retrieve the next request which should have timed out. The check
			// below is purely to catch programming errors; given correct code,
			// there's no possible order of events that should result in a
			// timeout firing for a non-existent event.
			req, exp := timeouts.Peek()
			if now, at := time.Now(), time.Unix(0, -exp); now.Before(at) {
				log.Error("Timeout triggered but not reached", "left", at.Sub(now))
				timeout.Reset(at.Sub(now))
				continue
			}
			// Stop tracking the timed out request from a timing perspective,
			// cancel it, so it's not considered in-flight anymore, but keep
			// the peer marked busy to prevent assigning a second request and
			// overloading it further.
			delete(pending, req.Peer)
			stales[req.Peer] = req

			timeouts.Pop() // Popping an item will reorder indices in `ordering`, delete after, otherwise will resurrect!
			if timeouts.Size() > 0 {
				_, exp := timeouts.Peek()
				timeout.Reset(time.Until(time.Unix(0, -exp)))
			}
			delete(ordering, req)

			// New timeout potentially set if there are more requests pending,
			// reschedule the failed one to a free peer
			fails := queue.unreserve(req.Peer)

			// Finally, update the peer's retrieval capacity, or if it's already
			// below the minimum allowance, drop the peer. If a lot of retrieval
			// elements expired, we might have overestimated the remote peer or
			// perhaps ourselves. Only reset to minimal throughput but don't drop
			// just yet.
			//
			// The reason the minimum threshold is 2 is that the downloader tries
			// to estimate the bandwidth and latency of a peer separately, which
			// requires pushing the measured capacity a bit and seeing how response
			// times react, so it always requests one more than the minimum (i.e.
			// min 2).
			peer := d.peers.Peer(req.Peer)
			if peer == nil {
				// If the peer got disconnected in between, we should really have
				// short-circuited it already. Just in case there's some strange
				// codepath, leave this check in not to crash.
				log.Error("Delivery timeout from unknown peer", "peer", req.Peer)
				continue
			}
			if fails > 2 {
				queue.updateCapacity(peer, 0, 0)
			} else {
				d.dropPeer(peer.id)

				// If this peer was the master peer, abort sync immediately
				d.cancelLock.RLock()
				master := peer.id == d.cancelPeer
				d.cancelLock.RUnlock()

				if master {
					d.cancel()
					return errTimeout
				}
			}

		case res := <-responses:
			// Response arrived, it may be for an existing or an already timed
			// out request. If the former, update the timeout heap and perhaps
			// reschedule the timeout timer.
			index, live := ordering[res.Req]
			if live {
				timeouts.Remove(index)
				if index == 0 {
					if !timeout.Stop() {
						<-timeout.C
					}
					if timeouts.Size() > 0 {
						_, exp := timeouts.Peek()
						timeout.Reset(time.Until(time.Unix(0, -exp)))
					}
				}
				delete(ordering, res.Req)
			}
			// Delete the pending request (if it still exists) and mark the peer idle
			delete(pending, res.Req.Peer)
			delete(stales, res.Req.Peer)

			// Signal the dispatcher that the round trip is done. We'll drop the
			// peer if the data turns out to be junk.
			res.Done <- nil
			res.Req.Close()

			// If the peer was previously banned and failed to deliver its pack
			// in a reasonable time frame, ignore its message.
			if peer := d.peers.Peer(res.Req.Peer); peer != nil {
				// Deliver the received chunk of data and check chain validity
				accepted, err := queue.deliver(peer, res)
				if errors.Is(err, errInvalidChain) {
					return err
				}
				// Unless a peer delivered something completely different than
				// requested (usually caused by a timed out request which came
				// through in the end), set it to idle. If the delivery's stale,
				// the peer should have already been idled.
				if !errors.Is(err, errStaleDelivery) {
					queue.updateCapacity(peer, accepted, res.Time)
				}
			}

		case cont := <-queue.waker():
			// The header fetcher sent a continuation flag, check if it's done
			if !cont {
				finished = true
			}
		}
	}
}
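
// Usage note: concurrentFetch is not driven directly with a concrete scheduler
// in this file; the typed fetch entry points elsewhere in the package are
// expected to wrap the Downloader in one of the typedQueue adaptors and hand it
// over, along the lines of (names below are illustrative only):
//
//	func (d *Downloader) fetchBodies(beaconMode bool) error {
//		return d.concurrentFetch((*bodyQueue)(d), beaconMode)
//	}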