github.com/codysnider/go-ethereum@v1.10.18-0.20220420071915-14f4ae99222a/eth/downloader/fetchers_concurrent.go

// Copyright 2021 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package downloader

import (
	"errors"
	"sort"
	"time"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/common/prque"
	"github.com/ethereum/go-ethereum/eth/protocols/eth"
	"github.com/ethereum/go-ethereum/log"
)

// timeoutGracePeriod is the amount of time to allow for a peer to deliver a
// response to a locally already timed out request. Timeouts are not penalized
// as a peer might be temporarily overloaded, however, they still must reply
// to each request. Failing to do so is considered a protocol violation.
var timeoutGracePeriod = 2 * time.Minute

// typedQueue is an interface defining the adaptor needed to translate the type
// specific downloader/queue schedulers into the type-agnostic general concurrent
// fetcher algorithm calls.
type typedQueue interface {
	// waker returns a notification channel that gets pinged in case more fetches
	// have been queued up, so the fetcher might assign it to idle peers.
	waker() chan bool

	// pending returns the number of wrapped items that are currently queued for
	// fetching by the concurrent downloader.
	pending() int

	// capacity is responsible for calculating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve within the
	// allotted round trip time.
	capacity(peer *peerConnection, rtt time.Duration) int

	// updateCapacity is responsible for updating how many items of the abstracted
	// type a particular peer is estimated to be able to retrieve in a unit time.
	updateCapacity(peer *peerConnection, items int, elapsed time.Duration)

	// reserve is responsible for allocating a requested number of pending items
	// from the download queue to the specified peer.
	reserve(peer *peerConnection, items int) (*fetchRequest, bool, bool)

	// unreserve is responsible for removing the current retrieval allocation
	// assigned to a specific peer and placing it back into the pool to allow
	// reassigning to some other peer.
	unreserve(peer string) int

	// request is responsible for converting a generic fetch request into a typed
	// one and sending it to the remote peer for fulfillment.
	request(peer *peerConnection, req *fetchRequest, resCh chan *eth.Response) (*eth.Request, error)

	// deliver is responsible for taking a generic response packet from the
	// concurrent fetcher, unpacking the type specific data and delivering
	// it to the downloader's queue.
	deliver(peer *peerConnection, packet *eth.Response) (int, error)
}

// concurrentFetch iteratively downloads scheduled block parts, taking available
// peers, reserving a chunk of fetch requests for each and waiting for delivery
// or timeouts.
func (d *Downloader) concurrentFetch(queue typedQueue, beaconMode bool) error {
	// Create a delivery channel to accept responses from all peers
	responses := make(chan *eth.Response)

	// Track the currently active requests and their timeout order
	pending := make(map[string]*eth.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range pending {
			req.Close()
		}
	}()
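	// In-flight requests are also tracked in a timeout priority queue. The index
	// callback below keeps the ordering map in sync with each request's current
	// position in the heap, so a request can be removed out of order if its
	// response arrives before the timeout fires.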
	ordering := make(map[*eth.Request]int)
	timeouts := prque.New(func(data interface{}, index int) {
		ordering[data.(*eth.Request)] = index
	})

	timeout := time.NewTimer(0)
	if !timeout.Stop() {
		<-timeout.C
	}
	defer timeout.Stop()

	// Track the timed-out but not-yet-answered requests separately. We want to
	// keep tracking which peers are busy (potentially overloaded), so removing
	// all trace of a timed out request is not good. We also can't just cancel
	// the pending request altogether as that would prevent a late response from
	// being delivered, thus never unblocking the peer.
	stales := make(map[string]*eth.Request)
	defer func() {
		// Abort all requests on sync cycle cancellation. The requests may still
		// be fulfilled by the remote side, but the dispatcher will not wait to
		// deliver them since nobody's going to be listening.
		for _, req := range stales {
			req.Close()
		}
	}()
	// Subscribe to peer lifecycle events to schedule tasks to new joiners and
	// reschedule tasks upon disconnections. We don't care which event happened
	// for simplicity, so just use a single channel.
	peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection

	peeringSub := d.peers.SubscribeEvents(peering)
	defer peeringSub.Unsubscribe()

	// Prepare the queue and fetch block parts until the block header fetcher's done
	finished := false
	for {
		// Short circuit if we lost all our peers
		if d.peers.Len() == 0 && !beaconMode {
			return errNoPeers
		}
		// If there's nothing more to fetch, wait or terminate
		if queue.pending() == 0 {
			if len(pending) == 0 && finished {
				return nil
			}
		} else {
			// Send a download request to all idle peers, until throttled
			var (
				idles []*peerConnection
				caps  []int
			)
			for _, peer := range d.peers.AllPeers() {
				pending, stale := pending[peer.id], stales[peer.id]
				if pending == nil && stale == nil {
					idles = append(idles, peer)
					caps = append(caps, queue.capacity(peer, time.Second))
				} else if stale != nil {
					if waited := time.Since(stale.Sent); waited > timeoutGracePeriod {
						// Request has been in flight longer than the grace period
						// permitted it, consider the peer malicious attempting to
						// stall the sync.
						peer.log.Warn("Peer stalling, dropping", "waited", common.PrettyDuration(waited))
						d.dropPeer(peer.id)
					}
				}
			}
			sort.Sort(&peerCapacitySort{idles, caps})

			var (
				progressed bool
				throttled  bool
				queued     = queue.pending()
			)
			for _, peer := range idles {
				// Short circuit if throttling activated or there are no more
				// queued tasks to be retrieved
				if throttled {
					break
				}
				if queued = queue.pending(); queued == 0 {
					break
				}
				// Reserve a chunk of fetches for a peer. A nil can mean either that
				// no more headers are available, or that the peer is known not to
				// have them.
				request, progress, throttle := queue.reserve(peer, queue.capacity(peer, d.peers.rates.TargetRoundTrip()))
				if progress {
					progressed = true
				}
				if throttle {
					throttled = true
					throttleCounter.Inc(1)
				}
				if request == nil {
					continue
				}
				// Fetch the chunk and make sure any errors return the hashes to the queue
				req, err := queue.request(peer, request, responses)
				if err != nil {
					// Sending the request failed, which generally means the peer
					// was disconnected in between assignment and network send.
					// Although all peer removal operations return allocated tasks
					// to the queue, that is async, and we can do better here by
					// immediately pushing the unfulfilled requests.
					queue.unreserve(peer.id) // TODO(karalabe): This needs a non-expiration method
					continue
				}
				pending[peer.id] = req

				ttl := d.peers.rates.TargetTimeout()
				ordering[req] = timeouts.Size()

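				// The priority queue pops its highest priority entry first, so push
				// the negated absolute expiry time: the request expiring soonest
				// surfaces on top. The timer only needs arming when the heap was
				// previously empty; otherwise it is already running.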
				timeouts.Push(req, -time.Now().Add(ttl).UnixNano())
				if timeouts.Size() == 1 {
					timeout.Reset(ttl)
				}
			}
			// Make sure that we have peers available for fetching. If all peers
			// have been tried and all failed, throw an error
			if !progressed && !throttled && len(pending) == 0 && len(idles) == d.peers.Len() && queued > 0 && !beaconMode {
				return errPeersUnavailable
			}
		}
		// Wait for something to happen
		select {
		case <-d.cancelCh:
			// If sync was cancelled, tear down the parallel retriever. Pending
			// requests will be cancelled locally, and the remote responses will
			// be dropped when they arrive
			return errCanceled

		case event := <-peering:
			// A peer joined or left, the tasks queue and allocations need to be
			// checked for potential assignment or reassignment
			peerid := event.peer.id

			if event.join {
				// Sanity check the internal state; this can be dropped later
				if _, ok := pending[peerid]; ok {
					event.peer.log.Error("Pending request exists for joining peer")
				}
				if _, ok := stales[peerid]; ok {
					event.peer.log.Error("Stale request exists for joining peer")
				}
				// Loop back to the entry point for task assignment
				continue
			}
			// A peer left, any existing requests need to be untracked, pending
			// tasks returned and possible reassignment checked
			if req, ok := pending[peerid]; ok {
				queue.unreserve(peerid) // TODO(karalabe): This needs a non-expiration method
				delete(pending, peerid)
				req.Close()

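				// If the request was still tracked in the timeout heap, drop it.
				// An index of 0 means it was the next one due to expire, so the
				// armed timer has to be stopped (and drained if it already fired)
				// and re-armed for the new heap top, if any remains.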
				if index, live := ordering[req]; live {
					timeouts.Remove(index)
					if index == 0 {
						if !timeout.Stop() {
							<-timeout.C
						}
						if timeouts.Size() > 0 {
							_, exp := timeouts.Peek()
							timeout.Reset(time.Until(time.Unix(0, -exp)))
						}
					}
					delete(ordering, req)
				}
			}
			if req, ok := stales[peerid]; ok {
				delete(stales, peerid)
				req.Close()
			}

		case <-timeout.C:
			// Retrieve the next request which should have timed out. The check
			// below is purely to catch programming errors; given correct code,
			// there's no possible order of events that should result in a
			// timeout firing for a non-existent event.
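			// Priorities were pushed as negated UnixNano deadlines, so -exp below
			// converts the heap priority back into the absolute expiry time.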
			item, exp := timeouts.Peek()
			if now, at := time.Now(), time.Unix(0, -exp); now.Before(at) {
				log.Error("Timeout triggered but not reached", "left", at.Sub(now))
				timeout.Reset(at.Sub(now))
				continue
			}
			req := item.(*eth.Request)

			// Stop tracking the timed out request from a timing perspective,
			// cancel it, so it's not considered in-flight anymore, but keep
			// the peer marked busy to prevent assigning a second request and
			// overloading it further.
			delete(pending, req.Peer)
			stales[req.Peer] = req
			delete(ordering, req)

			timeouts.Pop()
			if timeouts.Size() > 0 {
				_, exp := timeouts.Peek()
				timeout.Reset(time.Until(time.Unix(0, -exp)))
			}
			// New timeout potentially set if there are more requests pending,
			// reschedule the failed one to a free peer
			fails := queue.unreserve(req.Peer)

			// Finally, update the peer's retrieval capacity, or if it's already
			// below the minimum allowance, drop the peer. If a lot of retrieval
			// elements expired, we might have overestimated the remote peer or
			// perhaps ourselves. Only reset to minimal throughput but don't drop
			// just yet.
			//
			// The reason the minimum threshold is 2 is that the downloader tries
			// to estimate the bandwidth and latency of a peer separately, which
			// requires pushing the measured capacity a bit and seeing how response
			// times react, so it always requests one more than the minimum (i.e.
			// min 2).
			peer := d.peers.Peer(req.Peer)
			if peer == nil {
				// If the peer got disconnected in between, we should really have
				// short-circuited it already. Just in case there's some strange
				// codepath, leave this check in to avoid a crash.
				log.Error("Delivery timeout from unknown peer", "peer", req.Peer)
				continue
			}
			if fails > 2 {
				queue.updateCapacity(peer, 0, 0)
			} else {
				d.dropPeer(peer.id)

				// If this peer was the master peer, abort sync immediately
				d.cancelLock.RLock()
				master := peer.id == d.cancelPeer
				d.cancelLock.RUnlock()

				if master {
					d.cancel()
					return errTimeout
				}
			}

		case res := <-responses:
			// Response arrived, it may be for an existing or an already timed
			// out request. If the former, update the timeout heap and perhaps
			// reschedule the timeout timer.
			index, live := ordering[res.Req]
			if live {
				timeouts.Remove(index)
				if index == 0 {
					if !timeout.Stop() {
						<-timeout.C
					}
					if timeouts.Size() > 0 {
						_, exp := timeouts.Peek()
						timeout.Reset(time.Until(time.Unix(0, -exp)))
					}
				}
				delete(ordering, res.Req)
			}
			// Delete the pending request (if it still exists) and mark the peer idle
			delete(pending, res.Req.Peer)
			delete(stales, res.Req.Peer)

			// Signal the dispatcher that the round trip is done. We'll drop the
			// peer if the data turns out to be junk.
			res.Done <- nil
			res.Req.Close()

			// If the peer was previously banned and failed to deliver its pack
			// in a reasonable time frame, ignore its message.
			if peer := d.peers.Peer(res.Req.Peer); peer != nil {
				// Deliver the received chunk of data and check chain validity
				accepted, err := queue.deliver(peer, res)
				if errors.Is(err, errInvalidChain) {
					return err
				}
				// Unless a peer delivered something completely different than requested
				// (usually caused by a timed out request which came through in the end),
				// set it to idle. If the delivery's stale, the peer should have already
				// been idled.
				if !errors.Is(err, errStaleDelivery) {
					queue.updateCapacity(peer, accepted, res.Time)
				}
			}

		case cont := <-queue.waker():
			// The header fetcher sent a continuation flag, check if it's done
			if !cont {
				finished = true
			}
		}
	}
}