github.com/bamzi/go-ethereum@v1.6.7-0.20170704111104-138f26c93af1/eth/downloader/statesync.go

github.com/bamzi/go-ethereum@v1.6.7-0.20170704111104-138f26c93af1/eth/downloader/statesync.go (about)

     1  // Copyright 2017 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  package downloader
    18  
    19  import (
    20  	"fmt"
    21  	"hash"
    22  	"sync"
    23  	"sync/atomic"
    24  	"time"
    25  
    26  	"github.com/ethereum/go-ethereum/common"
    27  	"github.com/ethereum/go-ethereum/core/state"
    28  	"github.com/ethereum/go-ethereum/crypto/sha3"
    29  	"github.com/ethereum/go-ethereum/log"
    30  	"github.com/ethereum/go-ethereum/trie"
    31  )
    32  
    33  // stateReq represents a batch of state fetch requests groupped together into
    34  // a single data retrieval network packet.
    35  type stateReq struct {
    36  	items    []common.Hash              // Hashes of the state items to download
    37  	tasks    map[common.Hash]*stateTask // Download tasks to track previous attempts
    38  	timeout  time.Duration              // Maximum round trip time for this to complete
    39  	timer    *time.Timer                // Timer to fire when the RTT timeout expires
    40  	peer     *peer                      // Peer that we're requesting from
    41  	response [][]byte                   // Response data of the peer (nil for timeouts)
    42  }
    43  
    44  // timedOut returns if this request timed out.
    45  func (req *stateReq) timedOut() bool {
    46  	return req.response == nil
    47  }
    48  
    49  // stateSyncStats is a collection of progress stats to report during a state trie
    50  // sync to RPC requests as well as to display in user logs.
    51  type stateSyncStats struct {
    52  	processed  uint64 // Number of state entries processed
    53  	duplicate  uint64 // Number of state entries downloaded twice
    54  	unexpected uint64 // Number of non-requested state entries received
    55  	pending    uint64 // Number of still pending state entries
    56  }
    57  
    58  // syncState starts downloading state with the given root hash.
    59  func (d *Downloader) syncState(root common.Hash) *stateSync {
    60  	s := newStateSync(d, root)
    61  	select {
    62  	case d.stateSyncStart <- s:
    63  	case <-d.quitCh:
    64  		s.err = errCancelStateFetch
    65  		close(s.done)
    66  	}
    67  	return s
    68  }
    69  
    70  // stateFetcher manages the active state sync and accepts requests
    71  // on its behalf.
    72  func (d *Downloader) stateFetcher() {
    73  	for {
    74  		select {
    75  		case s := <-d.stateSyncStart:
    76  			for next := s; next != nil; {
    77  				next = d.runStateSync(next)
    78  			}
    79  		case <-d.stateCh:
    80  			// Ignore state responses while no sync is running.
    81  		case <-d.quitCh:
    82  			return
    83  		}
    84  	}
    85  }
    86  
    87  // runStateSync runs a state synchronisation until it completes or another root
    88  // hash is requested to be switched over to.
    89  func (d *Downloader) runStateSync(s *stateSync) *stateSync {
    90  	var (
    91  		active   = make(map[string]*stateReq) // Currently in-flight requests
    92  		finished []*stateReq                  // Completed or failed requests
    93  		timeout  = make(chan *stateReq)       // Timed out active requests
    94  	)
    95  	defer func() {
    96  		// Cancel active request timers on exit. Also set peers to idle so they're
    97  		// available for the next sync.
    98  		for _, req := range active {
    99  			req.timer.Stop()
   100  			req.peer.SetNodeDataIdle(len(req.items))
   101  		}
   102  	}()
   103  	// Run the state sync.
   104  	go s.run()
   105  	defer s.Cancel()
   106  
   107  	for {
   108  		// Enable sending of the first buffered element if there is one.
   109  		var (
   110  			deliverReq   *stateReq
   111  			deliverReqCh chan *stateReq
   112  		)
   113  		if len(finished) > 0 {
   114  			deliverReq = finished[0]
   115  			deliverReqCh = s.deliver
   116  		}
   117  
   118  		select {
   119  		// The stateSync lifecycle:
   120  		case next := <-d.stateSyncStart:
   121  			return next
   122  
   123  		case <-s.done:
   124  			return nil
   125  
   126  		// Send the next finished request to the current sync:
   127  		case deliverReqCh <- deliverReq:
   128  			finished = append(finished[:0], finished[1:]...)
   129  
   130  		// Handle incoming state packs:
   131  		case pack := <-d.stateCh:
   132  			// Discard any data not requested (or previsouly timed out)
   133  			req := active[pack.PeerId()]
   134  			if req == nil {
   135  				log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items())
   136  				continue
   137  			}
   138  			// Finalize the request and queue up for processing
   139  			req.timer.Stop()
   140  			req.response = pack.(*statePack).states
   141  
   142  			finished = append(finished, req)
   143  			delete(active, pack.PeerId())
   144  
   145  		// Handle timed-out requests:
   146  		case req := <-timeout:
   147  			// If the peer is already requesting something else, ignore the stale timeout.
   148  			// This can happen when the timeout and the delivery happens simultaneously,
   149  			// causing both pathways to trigger.
   150  			if active[req.peer.id] != req {
   151  				continue
   152  			}
   153  			// Move the timed out data back into the download queue
   154  			finished = append(finished, req)
   155  			delete(active, req.peer.id)
   156  
   157  		// Track outgoing state requests:
   158  		case req := <-d.trackStateReq:
   159  			// If an active request already exists for this peer, we have a problem. In
   160  			// theory the trie node schedule must never assign two requests to the same
   161  			// peer. In practive however, a peer might receive a request, disconnect and
   162  			// immediately reconnect before the previous times out. In this case the first
   163  			// request is never honored, alas we must not silently overwrite it, as that
   164  			// causes valid requests to go missing and sync to get stuck.
   165  			if old := active[req.peer.id]; old != nil {
   166  				log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id)
   167  
   168  				// Make sure the previous one doesn't get siletly lost
   169  				finished = append(finished, old)
   170  			}
   171  			// Start a timer to notify the sync loop if the peer stalled.
   172  			req.timer = time.AfterFunc(req.timeout, func() {
   173  				select {
   174  				case timeout <- req:
   175  				case <-s.done:
   176  					// Prevent leaking of timer goroutines in the unlikely case where a
   177  					// timer is fired just before exiting runStateSync.
   178  				}
   179  			})
   180  			active[req.peer.id] = req
   181  		}
   182  	}
   183  }
   184  
   185  // stateSync schedules requests for downloading a particular state trie defined
   186  // by a given state root.
   187  type stateSync struct {
   188  	d *Downloader // Downloader instance to access and manage current peerset
   189  
   190  	sched  *state.StateSync           // State trie sync scheduler defining the tasks
   191  	keccak hash.Hash                  // Keccak256 hasher to verify deliveries with
   192  	tasks  map[common.Hash]*stateTask // Set of tasks currently queued for retrieval
   193  
   194  	deliver    chan *stateReq // Delivery channel multiplexing peer responses
   195  	cancel     chan struct{}  // Channel to signal a termination request
   196  	cancelOnce sync.Once      // Ensures cancel only ever gets called once
   197  	done       chan struct{}  // Channel to signal termination completion
   198  	err        error          // Any error hit during sync (set before completion)
   199  }
   200  
   201  // stateTask represents a single trie node download taks, containing a set of
   202  // peers already attempted retrieval from to detect stalled syncs and abort.
   203  type stateTask struct {
   204  	attempts map[string]struct{}
   205  }
   206  
   207  // newStateSync creates a new state trie download scheduler. This method does not
   208  // yet start the sync. The user needs to call run to initiate.
   209  func newStateSync(d *Downloader, root common.Hash) *stateSync {
   210  	return &stateSync{
   211  		d:       d,
   212  		sched:   state.NewStateSync(root, d.stateDB),
   213  		keccak:  sha3.NewKeccak256(),
   214  		tasks:   make(map[common.Hash]*stateTask),
   215  		deliver: make(chan *stateReq),
   216  		cancel:  make(chan struct{}),
   217  		done:    make(chan struct{}),
   218  	}
   219  }
   220  
   221  // run starts the task assignment and response processing loop, blocking until
   222  // it finishes, and finally notifying any goroutines waiting for the loop to
   223  // finish.
   224  func (s *stateSync) run() {
   225  	s.err = s.loop()
   226  	close(s.done)
   227  }
   228  
   229  // Wait blocks until the sync is done or canceled.
   230  func (s *stateSync) Wait() error {
   231  	<-s.done
   232  	return s.err
   233  }
   234  
   235  // Cancel cancels the sync and waits until it has shut down.
   236  func (s *stateSync) Cancel() error {
   237  	s.cancelOnce.Do(func() { close(s.cancel) })
   238  	return s.Wait()
   239  }
   240  
   241  // loop is the main event loop of a state trie sync. It it responsible for the
   242  // assignment of new tasks to peers (including sending it to them) as well as
   243  // for the processing of inbound data. Note, that the loop does not directly
   244  // receive data from peers, rather those are buffered up in the downloader and
   245  // pushed here async. The reason is to decouple processing from data receipt
   246  // and timeouts.
   247  func (s *stateSync) loop() error {
   248  	// Listen for new peer events to assign tasks to them
   249  	newPeer := make(chan *peer, 1024)
   250  	peerSub := s.d.peers.SubscribeNewPeers(newPeer)
   251  	defer peerSub.Unsubscribe()
   252  
   253  	// Keep assigning new tasks until the sync completes or aborts
   254  	for s.sched.Pending() > 0 {
   255  		if err := s.assignTasks(); err != nil {
   256  			return err
   257  		}
   258  		// Tasks assigned, wait for something to happen
   259  		select {
   260  		case <-newPeer:
   261  			// New peer arrived, try to assign it download tasks
   262  
   263  		case <-s.cancel:
   264  			return errCancelStateFetch
   265  
   266  		case req := <-s.deliver:
   267  			// Response or timeout triggered, drop the peer if stalling
   268  			log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "timeout", req.timedOut())
   269  			if len(req.items) <= 2 && req.timedOut() {
   270  				// 2 items are the minimum requested, if even that times out, we've no use of
   271  				// this peer at the moment.
   272  				log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id)
   273  				s.d.dropPeer(req.peer.id)
   274  			}
   275  			// Process all the received blobs and check for stale delivery
   276  			stale, err := s.process(req)
   277  			if err != nil {
   278  				log.Warn("Node data write error", "err", err)
   279  				return err
   280  			}
   281  			// The the delivery contains requested data, mark the node idle (otherwise it's a timed out delivery)
   282  			if !stale {
   283  				req.peer.SetNodeDataIdle(len(req.response))
   284  			}
   285  		}
   286  	}
   287  	return nil
   288  }
   289  
   290  // assignTasks attempts to assing new tasks to all idle peers, either from the
   291  // batch currently being retried, or fetching new data from the trie sync itself.
   292  func (s *stateSync) assignTasks() error {
   293  	// Iterate over all idle peers and try to assign them state fetches
   294  	peers, _ := s.d.peers.NodeDataIdlePeers()
   295  	for _, p := range peers {
   296  		// Assign a batch of fetches proportional to the estimated latency/bandwidth
   297  		cap := p.NodeDataCapacity(s.d.requestRTT())
   298  		req := &stateReq{peer: p, timeout: s.d.requestTTL()}
   299  		s.fillTasks(cap, req)
   300  
   301  		// If the peer was assigned tasks to fetch, send the network request
   302  		if len(req.items) > 0 {
   303  			req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items))
   304  
   305  			select {
   306  			case s.d.trackStateReq <- req:
   307  				req.peer.FetchNodeData(req.items)
   308  			case <-s.cancel:
   309  			}
   310  		}
   311  	}
   312  	return nil
   313  }
   314  
   315  // fillTasks fills the given request object with a maximum of n state download
   316  // tasks to send to the remote peer.
   317  func (s *stateSync) fillTasks(n int, req *stateReq) {
   318  	// Refill available tasks from the scheduler.
   319  	if len(s.tasks) < n {
   320  		new := s.sched.Missing(n - len(s.tasks))
   321  		for _, hash := range new {
   322  			s.tasks[hash] = &stateTask{make(map[string]struct{})}
   323  		}
   324  	}
   325  	// Find tasks that haven't been tried with the request's peer.
   326  	req.items = make([]common.Hash, 0, n)
   327  	req.tasks = make(map[common.Hash]*stateTask, n)
   328  	for hash, t := range s.tasks {
   329  		// Stop when we've gathered enough requests
   330  		if len(req.items) == n {
   331  			break
   332  		}
   333  		// Skip any requests we've already tried from this peer
   334  		if _, ok := t.attempts[req.peer.id]; ok {
   335  			continue
   336  		}
   337  		// Assign the request to this peer
   338  		t.attempts[req.peer.id] = struct{}{}
   339  		req.items = append(req.items, hash)
   340  		req.tasks[hash] = t
   341  		delete(s.tasks, hash)
   342  	}
   343  }
   344  
   345  // process iterates over a batch of delivered state data, injecting each item
   346  // into a running state sync, re-queuing any items that were requested but not
   347  // delivered.
   348  func (s *stateSync) process(req *stateReq) (bool, error) {
   349  	// Collect processing stats and update progress if valid data was received
   350  	processed, written, duplicate, unexpected := 0, 0, 0, 0
   351  
   352  	defer func(start time.Time) {
   353  		if processed+written+duplicate+unexpected > 0 {
   354  			s.updateStats(processed, written, duplicate, unexpected, time.Since(start))
   355  		}
   356  	}(time.Now())
   357  
   358  	// Iterate over all the delivered data and inject one-by-one into the trie
   359  	progress, stale := false, len(req.response) > 0
   360  
   361  	for _, blob := range req.response {
   362  		prog, hash, err := s.processNodeData(blob)
   363  		switch err {
   364  		case nil:
   365  			processed++
   366  		case trie.ErrNotRequested:
   367  			unexpected++
   368  		case trie.ErrAlreadyProcessed:
   369  			duplicate++
   370  		default:
   371  			return stale, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err)
   372  		}
   373  		if prog {
   374  			progress = true
   375  		}
   376  		// If the node delivered a requested item, mark the delivery non-stale
   377  		if _, ok := req.tasks[hash]; ok {
   378  			delete(req.tasks, hash)
   379  			stale = false
   380  		}
   381  	}
   382  	// If some data managed to hit the database, flush and reset failure counters
   383  	if progress {
   384  		// Flush any accumulated data out to disk
   385  		batch := s.d.stateDB.NewBatch()
   386  
   387  		count, err := s.sched.Commit(batch)
   388  		if err != nil {
   389  			return stale, err
   390  		}
   391  		if err := batch.Write(); err != nil {
   392  			return stale, err
   393  		}
   394  		written = count
   395  
   396  		// If we're inside the critical section, reset fail counter since we progressed
   397  		if atomic.LoadUint32(&s.d.fsPivotFails) > 1 {
   398  			log.Trace("Fast-sync progressed, resetting fail counter", "previous", atomic.LoadUint32(&s.d.fsPivotFails))
   399  			atomic.StoreUint32(&s.d.fsPivotFails, 1) // Don't ever reset to 0, as that will unlock the pivot block
   400  		}
   401  	}
   402  	// Put unfulfilled tasks back into the retry queue
   403  	npeers := s.d.peers.Len()
   404  
   405  	for hash, task := range req.tasks {
   406  		// If the node did deliver something, missing items may be due to a protocol
   407  		// limit or a previous timeout + delayed delivery. Both cases should permit
   408  		// the node to retry the missing items (to avoid single-peer stalls).
   409  		if len(req.response) > 0 || req.timedOut() {
   410  			delete(task.attempts, req.peer.id)
   411  		}
   412  		// If we've requested the node too many times already, it may be a malicious
   413  		// sync where nobody has the right data. Abort.
   414  		if len(task.attempts) >= npeers {
   415  			return stale, fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers)
   416  		}
   417  		// Missing item, place into the retry queue.
   418  		s.tasks[hash] = task
   419  	}
   420  	return stale, nil
   421  }
   422  
   423  // processNodeData tries to inject a trie node data blob delivered from a remote
   424  // peer into the state trie, returning whether anything useful was written or any
   425  // error occurred.
   426  func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) {
   427  	res := trie.SyncResult{Data: blob}
   428  
   429  	s.keccak.Reset()
   430  	s.keccak.Write(blob)
   431  	s.keccak.Sum(res.Hash[:0])
   432  
   433  	committed, _, err := s.sched.Process([]trie.SyncResult{res})
   434  	return committed, res.Hash, err
   435  }
   436  
   437  // updateStats bumps the various state sync progress counters and displays a log
   438  // message for the user to see.
   439  func (s *stateSync) updateStats(processed, written, duplicate, unexpected int, duration time.Duration) {
   440  	s.d.syncStatsLock.Lock()
   441  	defer s.d.syncStatsLock.Unlock()
   442  
   443  	s.d.syncStatsState.pending = uint64(s.sched.Pending())
   444  	s.d.syncStatsState.processed += uint64(processed)
   445  	s.d.syncStatsState.duplicate += uint64(duplicate)
   446  	s.d.syncStatsState.unexpected += uint64(unexpected)
   447  
   448  	log.Info("Imported new state entries", "count", processed, "flushed", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected)
   449  }