github.com/NebulousLabs/Sia@v1.3.7/modules/renter/hostdb/scan.go

package hostdb

// scan.go contains the functions which periodically scan the list of all hosts
// to see which hosts are online or offline, and to get any updates to the
// settings of the hosts.

import (
	"net"
	"sort"
	"time"

	"github.com/NebulousLabs/Sia/build"
	"github.com/NebulousLabs/Sia/crypto"
	"github.com/NebulousLabs/Sia/encoding"
	"github.com/NebulousLabs/Sia/modules"
	"github.com/NebulousLabs/fastrand"
)

// queueScan will add a host to the queue to be scanned. The host will be added
// at a random position, which means that the order in which queueScan is
// called is not necessarily the order in which the hosts get scanned. This
// guarantees a random scan order during the initial scan.
func (hdb *HostDB) queueScan(entry modules.HostDBEntry) {
	// If this entry is already in the scan pool, we can return immediately.
	_, exists := hdb.scanMap[entry.PublicKey.String()]
	if exists {
		return
	}
	// Add the entry to a random position in the waitlist.
	hdb.scanMap[entry.PublicKey.String()] = struct{}{}
	hdb.scanList = append(hdb.scanList, entry)
	if len(hdb.scanList) > 1 {
		i := len(hdb.scanList) - 1
		j := fastrand.Intn(i)
		hdb.scanList[i], hdb.scanList[j] = hdb.scanList[j], hdb.scanList[i]
	}
	// Check if any thread is currently emptying the waitlist. If not, spawn a
	// thread to empty the waitlist.
	if hdb.scanWait {
		// Another thread is emptying the scan list, nothing to worry about.
		return
	}

	// Sanity check - the scan map should never be more than maxScanningThreads
	// entries larger than the scan list.
	if build.DEBUG && len(hdb.scanMap) > len(hdb.scanList)+maxScanningThreads {
		hdb.log.Critical("The hostdb scan map has seemingly grown too large:", len(hdb.scanMap), len(hdb.scanList), maxScanningThreads)
	}

	hdb.scanWait = true
	go func() {
		scanPool := make(chan modules.HostDBEntry)
		defer close(scanPool)
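		// scanPool is unbuffered, so every send blocks until a worker is
		// ready to receive a host. Closing the channel on exit causes the
		// workers ranging over it in threadedProbeHosts to terminate.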

		// Nobody is emptying the scan list, volunteer.
		if hdb.tg.Add() != nil {
			// Hostdb is shutting down, don't spin up another thread. It is
			// okay to leave scanWait set to true as that will not affect
			// shutdown.
			return
		}
		defer hdb.tg.Done()

		// Block scan when a specific dependency is provided.
		hdb.deps.Disrupt("BlockScan")

		// Due to the patterns used to spin up scanning threads, it's possible
		// that we get to this point while all scanning threads are currently
		// used up, completing jobs that were sent out by the previous pool
		// managing thread. This thread is at risk of deadlocking if there's
		// not at least one scanning thread accepting work that it created
		// itself, so we use a starterThread exception and spin up
		// one-thread-too-many on the first iteration to ensure that we do not
		// deadlock.
		starterThread := false
		for {
			// If the scanList is empty, this thread can spin down.
			hdb.mu.Lock()
			if len(hdb.scanList) == 0 {
				// Scan list is empty, can exit. Let the world know that nobody
				// is emptying the scan list anymore.
				hdb.scanWait = false
				hdb.mu.Unlock()
				return
			}

			// Get the next host, shrink the scan list.
			entry := hdb.scanList[0]
			hdb.scanList = hdb.scanList[1:]
			delete(hdb.scanMap, entry.PublicKey.String())
			scansRemaining := len(hdb.scanList)

			// Grab the most recent entry for this host.
			recentEntry, exists := hdb.hostTree.Select(entry.PublicKey)
			if exists {
				entry = recentEntry
			}

			// Try to send this entry to an existing idle worker (non-blocking).
			select {
			case scanPool <- entry:
				hdb.log.Debugf("Sending host %v for scan, %v hosts remain", entry.PublicKey.String(), scansRemaining)
				hdb.mu.Unlock()
				continue
			default:
			}

			// Create a new worker thread.
			if hdb.scanningThreads < maxScanningThreads || !starterThread {
				starterThread = true
				hdb.scanningThreads++
				if err := hdb.tg.Add(); err != nil {
					hdb.mu.Unlock()
					return
				}
				go func() {
					defer hdb.tg.Done()
					hdb.threadedProbeHosts(scanPool)
					hdb.mu.Lock()
					hdb.scanningThreads--
					hdb.mu.Unlock()
				}()
			}
			hdb.mu.Unlock()

			// Block while waiting for an opening in the scan pool.
			hdb.log.Debugf("Sending host %v for scan, %v hosts remain", entry.PublicKey.String(), scansRemaining)
			select {
			case scanPool <- entry:
				// iterate again
			case <-hdb.tg.StopChan():
				// quit
				return
			}
		}
	}()
}

// updateEntry updates an entry in the hostdb after a scan has taken place.
//
// CAUTION: This function will automatically add multiple scan datapoints for a
// new host to give that host some base uptime. This makes this function
// co-dependent with the host weight functions. Adjustments to the host weight
// functions need to keep this function in mind, and vice-versa.
func (hdb *HostDB) updateEntry(entry modules.HostDBEntry, netErr error) {
	// If the scan failed because we don't have Internet access, toss out this update.
	if netErr != nil && !hdb.gateway.Online() {
		return
	}

	// Grab the host from the host tree, and update it with the new settings.
	newEntry, exists := hdb.hostTree.Select(entry.PublicKey)
	if exists {
		newEntry.HostExternalSettings = entry.HostExternalSettings
	} else {
		newEntry = entry
	}

	// Update the recent interactions with this host.
	if netErr == nil {
		newEntry.RecentSuccessfulInteractions++
	} else {
		newEntry.RecentFailedInteractions++
	}

	// Add the datapoints for the scan.
	if len(newEntry.ScanHistory) < 2 {
		// Add two scans to the scan history. Two are needed because the scans
		// are forward-looking, but we want this first scan to represent as
		// much as one week of uptime or downtime.
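		// Sia targets one block roughly every 10 minutes, so multiplying the
		// number of blocks since the host was first seen by 10 minutes
		// approximates how long the host has been known.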
		earliestStartTime := time.Now().Add(time.Hour * 7 * 24 * -1)                                                   // Permit up to a week of starting uptime or downtime.
		suggestedStartTime := time.Now().Add(time.Minute * 10 * time.Duration(hdb.blockHeight-entry.FirstSeen+1) * -1) // Add one to FirstSeen in case FirstSeen is this block, which guarantees increasing order.
		if suggestedStartTime.Before(earliestStartTime) {
			suggestedStartTime = earliestStartTime
		}
		newEntry.ScanHistory = modules.HostDBScans{
			{Timestamp: suggestedStartTime, Success: netErr == nil},
			{Timestamp: time.Now(), Success: netErr == nil},
		}
	} else {
		if newEntry.ScanHistory[len(newEntry.ScanHistory)-1].Success && netErr != nil {
			hdb.log.Debugf("Host %v is being downgraded from an online host to an offline host: %v\n", newEntry.PublicKey.String(), netErr)
		}

		// Make sure that the current time is after the timestamp of the
		// previous scan. It may not be if the system clock has changed. This
		// will prevent the sort-check sanity checks from triggering.
		newTimestamp := time.Now()
		prevTimestamp := newEntry.ScanHistory[len(newEntry.ScanHistory)-1].Timestamp
		if !newTimestamp.After(prevTimestamp) {
			newTimestamp = prevTimestamp.Add(time.Second)
		}

		// Append the new scan with the adjusted timestamp.
		newEntry.ScanHistory = append(newEntry.ScanHistory, modules.HostDBScan{Timestamp: newTimestamp, Success: netErr == nil})
	}

	// Check whether any of the recent scans demonstrate uptime. The pruning and
	// compression of the history ensure that there are only relatively recent
	// scans represented.
	var recentUptime bool
	for _, scan := range newEntry.ScanHistory {
		if scan.Success {
			recentUptime = true
		}
	}

	// If the host has been offline for too long, delete the host from the
	// hostdb. Only delete if there have been enough scans over a long enough
	// period to be confident that the host really is offline for good.
	if time.Since(newEntry.ScanHistory[0].Timestamp) > maxHostDowntime && !recentUptime && len(newEntry.ScanHistory) >= minScans {
		err := hdb.hostTree.Remove(newEntry.PublicKey)
		if err != nil {
			hdb.log.Println("ERROR: unable to remove host newEntry which has had a ton of downtime:", err)
		}

		// The function should terminate here as no more interaction is needed
		// with this host.
		return
	}

	// Compress any old scans into the historic values.
	for len(newEntry.ScanHistory) > minScans && time.Since(newEntry.ScanHistory[0].Timestamp) > maxHostDowntime {
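		// Credit the interval between the two oldest scans to the result of
		// the older scan, then drop that scan from the history.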
		timePassed := newEntry.ScanHistory[1].Timestamp.Sub(newEntry.ScanHistory[0].Timestamp)
		if newEntry.ScanHistory[0].Success {
			newEntry.HistoricUptime += timePassed
		} else {
			newEntry.HistoricDowntime += timePassed
		}
		newEntry.ScanHistory = newEntry.ScanHistory[1:]
	}

	// Add the updated entry.
	if !exists {
		err := hdb.hostTree.Insert(newEntry)
		if err != nil {
			hdb.log.Println("ERROR: unable to insert entry which was thought to be new:", err)
		} else {
			hdb.log.Debugf("Adding host %v to the hostdb. Net error: %v\n", newEntry.PublicKey.String(), netErr)
		}
	} else {
		err := hdb.hostTree.Modify(newEntry)
		if err != nil {
			hdb.log.Println("ERROR: unable to modify entry which is thought to exist:", err)
		} else {
			hdb.log.Debugf("Updating host %v in the hostdb. Net error: %v\n", newEntry.PublicKey.String(), netErr)
		}
	}
}

// managedScanHost will connect to a host, grab its settings, and update the
// hostdb with the results of the scan.
func (hdb *HostDB) managedScanHost(entry modules.HostDBEntry) {
	// Request settings from the queued host entry.
	netAddr := entry.NetAddress
	pubKey := entry.PublicKey
	hdb.log.Debugf("Scanning host %v at %v", pubKey, netAddr)

	// Update the historic interactions of the entry if necessary.
	hdb.mu.RLock()
	updateHostHistoricInteractions(&entry, hdb.blockHeight)
	hdb.mu.RUnlock()

	var settings modules.HostExternalSettings
	var latency time.Duration
	err := func() error {
		timeout := hostRequestTimeout
		hdb.mu.RLock()
		if len(hdb.initialScanLatencies) > minScansForSpeedup {
			build.Critical("len(initialScanLatencies) should never exceed minScansForSpeedup")
		}
		if !hdb.initialScanComplete && len(hdb.initialScanLatencies) == minScansForSpeedup {
			// During an initial scan, once minScansForSpeedup scans have
			// completed, we use scanSpeedupMedianMultiplier times the median
			// of initialScanLatencies as the new hostRequestTimeout to speed
			// up the scanning process, never exceeding the default timeout.
			timeout = hdb.initialScanLatencies[len(hdb.initialScanLatencies)/2]
			timeout *= scanSpeedupMedianMultiplier
			if hostRequestTimeout < timeout {
				timeout = hostRequestTimeout
			}
		}
		hdb.mu.RUnlock()

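		// The dialer's Cancel channel aborts a dial that is still in progress
		// if the hostdb begins shutting down.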
		dialer := &net.Dialer{
			Cancel:  hdb.tg.StopChan(),
			Timeout: timeout,
		}
		start := time.Now()
		conn, err := dialer.Dial("tcp", string(netAddr))
		latency = time.Since(start)
		if err != nil {
			return err
		}
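		// Spawn a watcher that closes the connection as soon as either the
		// hostdb shuts down or this function returns, unblocking any pending
		// reads or writes.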
		connCloseChan := make(chan struct{})
		go func() {
			select {
			case <-hdb.tg.StopChan():
			case <-connCloseChan:
			}
			conn.Close()
		}()
		defer close(connCloseChan)
		conn.SetDeadline(time.Now().Add(hostScanDeadline))

		err = encoding.WriteObject(conn, modules.RPCSettings)
		if err != nil {
			return err
		}
		var pubkey crypto.PublicKey
		copy(pubkey[:], pubKey.Key)
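		// ReadSignedObject checks that the received settings were signed by
		// the host's public key, so a man-in-the-middle cannot forge them.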
		return crypto.ReadSignedObject(conn, &settings, maxSettingsLen, pubkey)
	}()
	if err != nil {
		hdb.log.Debugf("Scan of host at %v failed: %v", netAddr, err)
	} else {
		hdb.log.Debugf("Scan of host at %v succeeded.", netAddr)
		entry.HostExternalSettings = settings
	}
	success := err == nil

	hdb.mu.Lock()
	defer hdb.mu.Unlock()
	// Update the entry in the host tree, recording the outcome of the scan.
	hdb.updateEntry(entry, err)

	// Add the scan to the initialScanLatencies if it was successful.
	if success && len(hdb.initialScanLatencies) < minScansForSpeedup {
		hdb.initialScanLatencies = append(hdb.initialScanLatencies, latency)
		// Once the slice reaches its maximum size, sort it so that the median
		// latency can be read from the middle of the slice.
		if len(hdb.initialScanLatencies) == minScansForSpeedup {
			sort.Slice(hdb.initialScanLatencies, func(i, j int) bool {
				return hdb.initialScanLatencies[i] < hdb.initialScanLatencies[j]
			})
		}
	}
}

// managedWaitForScans is a helper function that blocks until the hostDB's
// scanList is empty.
func (hdb *HostDB) managedWaitForScans() {
	for {
		hdb.mu.Lock()
		length := len(hdb.scanList)
		hdb.mu.Unlock()
		if length == 0 {
			break
		}
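		// Wait for either a shutdown signal or the next check interval before
		// polling the scan list length again.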
		select {
		case <-hdb.tg.StopChan():
		case <-time.After(scanCheckInterval):
		}
	}
}

// threadedProbeHosts pulls hosts from the scan pool and runs a scan on them.
func (hdb *HostDB) threadedProbeHosts(scanPool <-chan modules.HostDBEntry) {
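	// The range loop terminates once the pool-managing goroutine in queueScan
	// closes scanPool.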
	for hostEntry := range scanPool {
		// Block until the hostdb has internet connectivity.
		for {
			hdb.mu.RLock()
			online := hdb.gateway.Online()
			hdb.mu.RUnlock()
			if online {
				break
			}
			select {
			case <-time.After(time.Second * 30):
				continue
			case <-hdb.tg.StopChan():
				return
			}
		}

		// There appears to be internet connectivity, continue with the scan.
		hdb.managedScanHost(hostEntry)
	}
}

// threadedScan is an ongoing function which will query the full set of hosts
// every few hours to see who is online and available for uploading.
func (hdb *HostDB) threadedScan() {
	err := hdb.tg.Add()
	if err != nil {
		return
	}
	defer hdb.tg.Done()

	// Wait until the consensus set is synced. Only then can we be sure that
	// the initial scan covers the whole network.
	for {
		if hdb.cs.Synced() {
			break
		}
		select {
		case <-hdb.tg.StopChan():
			return
		case <-time.After(scanCheckInterval):
		}
	}

	// Block scan when a specific dependency is provided.
	hdb.deps.Disrupt("BlockScan")

	// The initial scan might have been interrupted. Queue one scan for every
	// announced host that was missed by the initial scan and wait for the
	// scans to finish before starting the scan loop.
	allHosts := hdb.hostTree.All()
	hdb.mu.Lock()
	for _, host := range allHosts {
		if len(host.ScanHistory) == 0 && host.HistoricUptime == 0 && host.HistoricDowntime == 0 {
			hdb.queueScan(host)
		}
	}
	hdb.mu.Unlock()
	hdb.managedWaitForScans()

	// Set the flag to indicate that the initial scan is complete.
	hdb.mu.Lock()
	hdb.initialScanComplete = true
	hdb.mu.Unlock()

	for {
		// Set up a scan for the hostCheckupQuantity most valuable hosts in
		// the hostdb. Hosts that fail their scans will be docked
		// significantly, pushing them further back in the hierarchy, ensuring
		// that for the most part only online hosts are getting scanned unless
		// there are fewer than hostCheckupQuantity of them.

		// Grab a set of hosts to scan, including both online and offline
		// hosts, to get high diversity.
		var onlineHosts, offlineHosts []modules.HostDBEntry
		allHosts := hdb.hostTree.All()
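		// hostTree.All is assumed to return hosts sorted by ascending score,
		// so iterating from the back of the slice visits the most valuable
		// hosts first.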
		for i := len(allHosts) - 1; i >= 0; i-- {
			if len(onlineHosts) >= hostCheckupQuantity && len(offlineHosts) >= hostCheckupQuantity {
				break
			}

			// Figure out if the host is online or offline.
			host := allHosts[i]
			online := len(host.ScanHistory) > 0 && host.ScanHistory[len(host.ScanHistory)-1].Success
			if online && len(onlineHosts) < hostCheckupQuantity {
				onlineHosts = append(onlineHosts, host)
			} else if !online && len(offlineHosts) < hostCheckupQuantity {
				offlineHosts = append(offlineHosts, host)
			}
		}

		// Queue the scans for each host.
		hdb.log.Println("Performing scan on", len(onlineHosts), "online hosts and", len(offlineHosts), "offline hosts.")
		hdb.mu.Lock()
		for _, host := range onlineHosts {
			hdb.queueScan(host)
		}
		for _, host := range offlineHosts {
			hdb.queueScan(host)
		}
		hdb.mu.Unlock()

		// Sleep for a random amount of time before doing another round of
		// scanning. The minimums and maximums keep the scan time reasonable,
		// while the randomness prevents the scanning from always happening at
		// the same time of day or week.
		sleepRange := uint64(maxScanSleep - minScanSleep)
		sleepTime := minScanSleep + time.Duration(fastrand.Uint64n(sleepRange))

		// Sleep until it's time for the next scan cycle.
		select {
		case <-hdb.tg.StopChan():
			return
		case <-time.After(sleepTime):
		}
	}
}