github.com/arieschain/arieschain@v0.0.0-20191023063405-37c074544356/qct/downloader/statesync.go (about)

     1  package downloader
     2  
     3  import (
     4  	"fmt"
     5  	"hash"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/quickchainproject/quickchain/common"
    10  	"github.com/quickchainproject/quickchain/core"
    11  	"github.com/quickchainproject/quickchain/core/state"
    12  	"github.com/quickchainproject/quickchain/crypto/sha3"
    13  	"github.com/quickchainproject/quickchain/qctdb"
    14  	"github.com/quickchainproject/quickchain/log"
    15  	"github.com/quickchainproject/quickchain/trie"
    16  )
    17  
    18  // stateReq represents a batch of state fetch requests grouped together into
    19  // a single data retrieval network packet.
    20  type stateReq struct {
    21  	items    []common.Hash              // Hashes of the state items to download
    22  	tasks    map[common.Hash]*stateTask // Download tasks to track previous attempts
    23  	timeout  time.Duration              // Maximum round trip time for this to complete
    24  	timer    *time.Timer                // Timer to fire when the RTT timeout expires
    25  	peer     *peerConnection            // Peer that we're requesting from
    26  	response [][]byte                   // Response data of the peer (nil for timeouts)
    27  	dropped  bool                       // Flag whether the peer dropped off early
    28  }
    29  
    30  // timedOut returns if this request timed out.
    31  func (req *stateReq) timedOut() bool {
    32  	return req.response == nil
    33  }
    34  
    35  // stateSyncStats is a collection of progress stats to report during a state trie
    36  // sync to RPC requests as well as to display in user logs.
    37  type stateSyncStats struct {
    38  	processed  uint64 // Number of state entries processed
    39  	duplicate  uint64 // Number of state entries downloaded twice
    40  	unexpected uint64 // Number of non-requested state entries received
    41  	pending    uint64 // Number of still pending state entries
    42  }
    43  
    44  // syncState starts downloading state with the given root hash.
    45  func (d *Downloader) syncState(root common.Hash) *stateSync {
    46  	s := newStateSync(d, root)
    47  	select {
    48  	case d.stateSyncStart <- s:
    49  	case <-d.quitCh:
    50  		s.err = errCancelStateFetch
    51  		close(s.done)
    52  	}
    53  	return s
    54  }
    55  
    56  // stateFetcher manages the active state sync and accepts requests
    57  // on its behalf.
    58  func (d *Downloader) stateFetcher() {
    59  	for {
    60  		select {
    61  		case s := <-d.stateSyncStart:
    62  			for next := s; next != nil; {
    63  				next = d.runStateSync(next)
    64  			}
    65  		case <-d.stateCh:
    66  			// Ignore state responses while no sync is running.
    67  		case <-d.quitCh:
    68  			return
    69  		}
    70  	}
    71  }
    72  
    73  // runStateSync runs a state synchronisation until it completes or another root
    74  // hash is requested to be switched over to.
    75  func (d *Downloader) runStateSync(s *stateSync) *stateSync {
    76  	var (
    77  		active   = make(map[string]*stateReq) // Currently in-flight requests
    78  		finished []*stateReq                  // Completed or failed requests
    79  		timeout  = make(chan *stateReq)       // Timed out active requests
    80  	)
    81  	defer func() {
    82  		// Cancel active request timers on exit. Also set peers to idle so they're
    83  		// available for the next sync.
    84  		for _, req := range active {
    85  			req.timer.Stop()
    86  			req.peer.SetNodeDataIdle(len(req.items))
    87  		}
    88  	}()
    89  	// Run the state sync.
    90  	go s.run()
    91  	defer s.Cancel()
    92  
    93  	// Listen for peer departure events to cancel assigned tasks
    94  	peerDrop := make(chan *peerConnection, 1024)
    95  	peerSub := s.d.peers.SubscribePeerDrops(peerDrop)
    96  	defer peerSub.Unsubscribe()
    97  
    98  	for {
    99  		// Enable sending of the first buffered element if there is one.
   100  		var (
   101  			deliverReq   *stateReq
   102  			deliverReqCh chan *stateReq
   103  		)
   104  		if len(finished) > 0 {
   105  			deliverReq = finished[0]
   106  			deliverReqCh = s.deliver
   107  		}
   108  
   109  		select {
   110  		// The stateSync lifecycle:
   111  		case next := <-d.stateSyncStart:
   112  			return next
   113  
   114  		case <-s.done:
   115  			return nil
   116  
   117  		// Send the next finished request to the current sync:
   118  		case deliverReqCh <- deliverReq:
   119  			// Shift out the first request, but also set the emptied slot to nil for GC
   120  			copy(finished, finished[1:])
   121  			finished[len(finished)-1] = nil
   122  			finished = finished[:len(finished)-1]
   123  
   124  		// Handle incoming state packs:
   125  		case pack := <-d.stateCh:
   126  			// Discard any data not requested (or previously timed out)
   127  			req := active[pack.PeerId()]
   128  			if req == nil {
   129  				log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items())
   130  				continue
   131  			}
   132  			// Finalize the request and queue up for processing
   133  			req.timer.Stop()
   134  			req.response = pack.(*statePack).states
   135  
   136  			finished = append(finished, req)
   137  			delete(active, pack.PeerId())
   138  
   139  			// Handle dropped peer connections:
   140  		case p := <-peerDrop:
   141  			// Skip if no request is currently pending
   142  			req := active[p.id]
   143  			if req == nil {
   144  				continue
   145  			}
   146  			// Finalize the request and queue up for processing
   147  			req.timer.Stop()
   148  			req.dropped = true
   149  
   150  			finished = append(finished, req)
   151  			delete(active, p.id)
   152  
   153  		// Handle timed-out requests:
   154  		case req := <-timeout:
   155  			// If the peer is already requesting something else, ignore the stale timeout.
   156  			// This can happen when the timeout and the delivery happens simultaneously,
   157  			// causing both pathways to trigger.
   158  			if active[req.peer.id] != req {
   159  				continue
   160  			}
   161  			// Move the timed out data back into the download queue
   162  			finished = append(finished, req)
   163  			delete(active, req.peer.id)
   164  
   165  		// Track outgoing state requests:
   166  		case req := <-d.trackStateReq:
   167  			// If an active request already exists for this peer, we have a problem. In
   168  			// theory the trie node schedule must never assign two requests to the same
   169  			// peer. In practice however, a peer might receive a request, disconnect and
   170  			// immediately reconnect before the previous times out. In this case the first
   171  			// request is never honored, alas we must not silently overwrite it, as that
   172  			// causes valid requests to go missing and sync to get stuck.
   173  			if old := active[req.peer.id]; old != nil {
   174  				log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id)
   175  
   176  				// Make sure the previous one doesn't get siletly lost
   177  				old.timer.Stop()
   178  				old.dropped = true
   179  
   180  				finished = append(finished, old)
   181  			}
   182  			// Start a timer to notify the sync loop if the peer stalled.
   183  			req.timer = time.AfterFunc(req.timeout, func() {
   184  				select {
   185  				case timeout <- req:
   186  				case <-s.done:
   187  					// Prevent leaking of timer goroutines in the unlikely case where a
   188  					// timer is fired just before exiting runStateSync.
   189  				}
   190  			})
   191  			active[req.peer.id] = req
   192  		}
   193  	}
   194  }
   195  
   196  // stateSync schedules requests for downloading a particular state trie defined
   197  // by a given state root.
   198  type stateSync struct {
   199  	d *Downloader // Downloader instance to access and manage current peerset
   200  
   201  	sched  *trie.TrieSync             // State trie sync scheduler defining the tasks
   202  	keccak hash.Hash                  // Keccak256 hasher to verify deliveries with
   203  	tasks  map[common.Hash]*stateTask // Set of tasks currently queued for retrieval
   204  
   205  	numUncommitted   int
   206  	bytesUncommitted int
   207  
   208  	deliver    chan *stateReq // Delivery channel multiplexing peer responses
   209  	cancel     chan struct{}  // Channel to signal a termination request
   210  	cancelOnce sync.Once      // Ensures cancel only ever gets called once
   211  	done       chan struct{}  // Channel to signal termination completion
   212  	err        error          // Any error hit during sync (set before completion)
   213  }
   214  
   215  // stateTask represents a single trie node download task, containing a set of
   216  // peers already attempted retrieval from to detect stalled syncs and abort.
   217  type stateTask struct {
   218  	attempts map[string]struct{}
   219  }
   220  
   221  // newStateSync creates a new state trie download scheduler. This method does not
   222  // yet start the sync. The user needs to call run to initiate.
   223  func newStateSync(d *Downloader, root common.Hash) *stateSync {
   224  	return &stateSync{
   225  		d:       d,
   226  		sched:   state.NewStateSync(root, d.stateDB),
   227  		keccak:  sha3.NewKeccak256(),
   228  		tasks:   make(map[common.Hash]*stateTask),
   229  		deliver: make(chan *stateReq),
   230  		cancel:  make(chan struct{}),
   231  		done:    make(chan struct{}),
   232  	}
   233  }
   234  
   235  // run starts the task assignment and response processing loop, blocking until
   236  // it finishes, and finally notifying any goroutines waiting for the loop to
   237  // finish.
   238  func (s *stateSync) run() {
   239  	s.err = s.loop()
   240  	close(s.done)
   241  }
   242  
   243  // Wait blocks until the sync is done or canceled.
   244  func (s *stateSync) Wait() error {
   245  	<-s.done
   246  	return s.err
   247  }
   248  
   249  // Cancel cancels the sync and waits until it has shut down.
   250  func (s *stateSync) Cancel() error {
   251  	s.cancelOnce.Do(func() { close(s.cancel) })
   252  	return s.Wait()
   253  }
   254  
   255  // loop is the main event loop of a state trie sync. It it responsible for the
   256  // assignment of new tasks to peers (including sending it to them) as well as
   257  // for the processing of inbound data. Note, that the loop does not directly
   258  // receive data from peers, rather those are buffered up in the downloader and
   259  // pushed here async. The reason is to decouple processing from data receipt
   260  // and timeouts.
   261  func (s *stateSync) loop() (err error) {
   262  	// Listen for new peer events to assign tasks to them
   263  	newPeer := make(chan *peerConnection, 1024)
   264  	peerSub := s.d.peers.SubscribeNewPeers(newPeer)
   265  	defer peerSub.Unsubscribe()
   266  	defer func() {
   267  		cerr := s.commit(true)
   268  		if err == nil {
   269  			err = cerr
   270  		}
   271  	}()
   272  
   273  	// Keep assigning new tasks until the sync completes or aborts
   274  	for s.sched.Pending() > 0 {
   275  		if err = s.commit(false); err != nil {
   276  			return err
   277  		}
   278  		s.assignTasks()
   279  		// Tasks assigned, wait for something to happen
   280  		select {
   281  		case <-newPeer:
   282  			// New peer arrived, try to assign it download tasks
   283  
   284  		case <-s.cancel:
   285  			return errCancelStateFetch
   286  
   287  		case <-s.d.cancelCh:
   288  			return errCancelStateFetch
   289  
   290  		case req := <-s.deliver:
   291  			// Response, disconnect or timeout triggered, drop the peer if stalling
   292  			log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut())
   293  			if len(req.items) <= 2 && !req.dropped && req.timedOut() {
   294  				// 2 items are the minimum requested, if even that times out, we've no use of
   295  				// this peer at the moment.
   296  				log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id)
   297  				s.d.dropPeer(req.peer.id)
   298  			}
   299  			// Process all the received blobs and check for stale delivery
   300  			if err = s.process(req); err != nil {
   301  				log.Warn("Node data write error", "err", err)
   302  				return err
   303  			}
   304  			req.peer.SetNodeDataIdle(len(req.response))
   305  		}
   306  	}
   307  	return nil
   308  }
   309  
   310  func (s *stateSync) commit(force bool) error {
   311  	if !force && s.bytesUncommitted < qctdb.IdealBatchSize {
   312  		return nil
   313  	}
   314  	start := time.Now()
   315  	b := s.d.stateDB.NewBatch()
   316  	if written, err := s.sched.Commit(b); written == 0 || err != nil {
   317  		return err
   318  	}
   319  	if err := b.Write(); err != nil {
   320  		return fmt.Errorf("DB write error: %v", err)
   321  	}
   322  	s.updateStats(s.numUncommitted, 0, 0, time.Since(start))
   323  	s.numUncommitted = 0
   324  	s.bytesUncommitted = 0
   325  	return nil
   326  }
   327  
   328  // assignTasks attempts to assign new tasks to all idle peers, either from the
   329  // batch currently being retried, or fetching new data from the trie sync itself.
   330  func (s *stateSync) assignTasks() {
   331  	// Iterate over all idle peers and try to assign them state fetches
   332  	peers, _ := s.d.peers.NodeDataIdlePeers()
   333  	for _, p := range peers {
   334  		// Assign a batch of fetches proportional to the estimated latency/bandwidth
   335  		cap := p.NodeDataCapacity(s.d.requestRTT())
   336  		req := &stateReq{peer: p, timeout: s.d.requestTTL()}
   337  		s.fillTasks(cap, req)
   338  
   339  		// If the peer was assigned tasks to fetch, send the network request
   340  		if len(req.items) > 0 {
   341  			req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items))
   342  			select {
   343  			case s.d.trackStateReq <- req:
   344  				req.peer.FetchNodeData(req.items)
   345  			case <-s.cancel:
   346  			case <-s.d.cancelCh:
   347  			}
   348  		}
   349  	}
   350  }
   351  
   352  // fillTasks fills the given request object with a maximum of n state download
   353  // tasks to send to the remote peer.
   354  func (s *stateSync) fillTasks(n int, req *stateReq) {
   355  	// Refill available tasks from the scheduler.
   356  	if len(s.tasks) < n {
   357  		new := s.sched.Missing(n - len(s.tasks))
   358  		for _, hash := range new {
   359  			s.tasks[hash] = &stateTask{make(map[string]struct{})}
   360  		}
   361  	}
   362  	// Find tasks that haven't been tried with the request's peer.
   363  	req.items = make([]common.Hash, 0, n)
   364  	req.tasks = make(map[common.Hash]*stateTask, n)
   365  	for hash, t := range s.tasks {
   366  		// Stop when we've gathered enough requests
   367  		if len(req.items) == n {
   368  			break
   369  		}
   370  		// Skip any requests we've already tried from this peer
   371  		if _, ok := t.attempts[req.peer.id]; ok {
   372  			continue
   373  		}
   374  		// Assign the request to this peer
   375  		t.attempts[req.peer.id] = struct{}{}
   376  		req.items = append(req.items, hash)
   377  		req.tasks[hash] = t
   378  		delete(s.tasks, hash)
   379  	}
   380  }
   381  
   382  // process iterates over a batch of delivered state data, injecting each item
   383  // into a running state sync, re-queuing any items that were requested but not
   384  // delivered.
   385  func (s *stateSync) process(req *stateReq) error {
   386  	// Collect processing stats and update progress if valid data was received
   387  	duplicate, unexpected := 0, 0
   388  
   389  	defer func(start time.Time) {
   390  		if duplicate > 0 || unexpected > 0 {
   391  			s.updateStats(0, duplicate, unexpected, time.Since(start))
   392  		}
   393  	}(time.Now())
   394  
   395  	// Iterate over all the delivered data and inject one-by-one into the trie
   396  	progress := false
   397  
   398  	for _, blob := range req.response {
   399  		prog, hash, err := s.processNodeData(blob)
   400  		switch err {
   401  		case nil:
   402  			s.numUncommitted++
   403  			s.bytesUncommitted += len(blob)
   404  			progress = progress || prog
   405  		case trie.ErrNotRequested:
   406  			unexpected++
   407  		case trie.ErrAlreadyProcessed:
   408  			duplicate++
   409  		default:
   410  			return fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err)
   411  		}
   412  		if _, ok := req.tasks[hash]; ok {
   413  			delete(req.tasks, hash)
   414  		}
   415  	}
   416  	// Put unfulfilled tasks back into the retry queue
   417  	npeers := s.d.peers.Len()
   418  	for hash, task := range req.tasks {
   419  		// If the node did deliver something, missing items may be due to a protocol
   420  		// limit or a previous timeout + delayed delivery. Both cases should permit
   421  		// the node to retry the missing items (to avoid single-peer stalls).
   422  		if len(req.response) > 0 || req.timedOut() {
   423  			delete(task.attempts, req.peer.id)
   424  		}
   425  		// If we've requested the node too many times already, it may be a malicious
   426  		// sync where nobody has the right data. Abort.
   427  		if len(task.attempts) >= npeers {
   428  			return fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers)
   429  		}
   430  		// Missing item, place into the retry queue.
   431  		s.tasks[hash] = task
   432  	}
   433  	return nil
   434  }
   435  
   436  // processNodeData tries to inject a trie node data blob delivered from a remote
   437  // peer into the state trie, returning whether anything useful was written or any
   438  // error occurred.
   439  func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) {
   440  	res := trie.SyncResult{Data: blob}
   441  	s.keccak.Reset()
   442  	s.keccak.Write(blob)
   443  	s.keccak.Sum(res.Hash[:0])
   444  	committed, _, err := s.sched.Process([]trie.SyncResult{res})
   445  	return committed, res.Hash, err
   446  }
   447  
   448  // updateStats bumps the various state sync progress counters and displays a log
   449  // message for the user to see.
   450  func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) {
   451  	s.d.syncStatsLock.Lock()
   452  	defer s.d.syncStatsLock.Unlock()
   453  
   454  	s.d.syncStatsState.pending = uint64(s.sched.Pending())
   455  	s.d.syncStatsState.processed += uint64(written)
   456  	s.d.syncStatsState.duplicate += uint64(duplicate)
   457  	s.d.syncStatsState.unexpected += uint64(unexpected)
   458  
   459  	if written > 0 || duplicate > 0 || unexpected > 0 {
   460  		log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected)
   461  	}
   462  	if written > 0 {
   463  		core.WriteTrieSyncProgress(s.d.stateDB, s.d.syncStatsState.processed)
   464  	}
   465  }