github.com/NebulousLabs/Sia@v1.3.7/modules/renter/hostdb/scan.go

package hostdb

// scan.go contains the functions which periodically scan the list of all hosts
// to see which hosts are online or offline, and to get any updates to the
// settings of the hosts.

import (
	"net"
	"sort"
	"time"

	"github.com/NebulousLabs/Sia/build"
	"github.com/NebulousLabs/Sia/crypto"
	"github.com/NebulousLabs/Sia/encoding"
	"github.com/NebulousLabs/Sia/modules"
	"github.com/NebulousLabs/fastrand"
)

// queueScan will add a host to the queue to be scanned. The host will be added
// at a random position which means that the order in which queueScan is called
// is not necessarily the order in which the hosts get scanned. That guarantees
// a random scan order during the initial scan.
func (hdb *HostDB) queueScan(entry modules.HostDBEntry) {
	// If this entry is already in the scan pool, can return immediately.
	_, exists := hdb.scanMap[entry.PublicKey.String()]
	if exists {
		return
	}
	// Add the entry to a random position in the waitlist.
	hdb.scanMap[entry.PublicKey.String()] = struct{}{}
	hdb.scanList = append(hdb.scanList, entry)
	if len(hdb.scanList) > 1 {
		i := len(hdb.scanList) - 1
		j := fastrand.Intn(i)
		hdb.scanList[i], hdb.scanList[j] = hdb.scanList[j], hdb.scanList[i]
	}
	// Check if any thread is currently emptying the waitlist. If not, spawn a
	// thread to empty the waitlist.
	if hdb.scanWait {
		// Another thread is emptying the scan list, nothing to worry about.
		return
	}

	// Sanity check - the scan map and the scan list should have the same
	// length.
	if build.DEBUG && len(hdb.scanMap) > len(hdb.scanList)+maxScanningThreads {
		hdb.log.Critical("The hostdb scan map has seemingly grown too large:", len(hdb.scanMap), len(hdb.scanList), maxScanningThreads)
	}

	hdb.scanWait = true
	go func() {
		scanPool := make(chan modules.HostDBEntry)
		defer close(scanPool)

		// Nobody is emptying the scan list, volunteer.
		if hdb.tg.Add() != nil {
			// Hostdb is shutting down, don't spin up another thread. It is
			// okay to leave scanWait set to true as that will not affect
			// shutdown.
			return
		}
		defer hdb.tg.Done()

		// Block scan when a specific dependency is provided.
		hdb.deps.Disrupt("BlockScan")

		// Due to the patterns used to spin up scanning threads, it's possible
		// that we get to this point while all scanning threads are currently
		// used up, completing jobs that were sent out by the previous pool
		// managing thread. This thread is at risk of deadlocking if there's
		// not at least one scanning thread accepting work that it created
		// itself, so we use a starterThread exception and spin up
		// one-thread-too-many on the first iteration to ensure that we do not
		// deadlock.
		starterThread := false
		for {
			// If the scanList is empty, this thread can spin down.
			hdb.mu.Lock()
			if len(hdb.scanList) == 0 {
				// Scan list is empty, can exit. Let the world know that nobody
				// is emptying the scan list anymore.
				hdb.scanWait = false
				hdb.mu.Unlock()
				return
			}

			// Get the next host, shrink the scan list.
			entry := hdb.scanList[0]
			hdb.scanList = hdb.scanList[1:]
			delete(hdb.scanMap, entry.PublicKey.String())
			scansRemaining := len(hdb.scanList)

			// Grab the most recent entry for this host.
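			// The host tree holds the most recently updated copy of each host,
			// so prefer it over the queued copy, whose settings may be stale.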
			recentEntry, exists := hdb.hostTree.Select(entry.PublicKey)
			if exists {
				entry = recentEntry
			}

			// Try to send this entry to an existing idle worker (non-blocking).
			select {
			case scanPool <- entry:
				hdb.log.Debugf("Sending host %v for scan, %v hosts remain", entry.PublicKey.String(), scansRemaining)
				hdb.mu.Unlock()
				continue
			default:
			}

			// Create a new worker thread.
			if hdb.scanningThreads < maxScanningThreads || !starterThread {
				starterThread = true
				hdb.scanningThreads++
				if err := hdb.tg.Add(); err != nil {
					hdb.mu.Unlock()
					return
				}
				go func() {
					defer hdb.tg.Done()
					hdb.threadedProbeHosts(scanPool)
					hdb.mu.Lock()
					hdb.scanningThreads--
					hdb.mu.Unlock()
				}()
			}
			hdb.mu.Unlock()

			// Block while waiting for an opening in the scan pool.
			hdb.log.Debugf("Sending host %v for scan, %v hosts remain", entry.PublicKey.String(), scansRemaining)
			select {
			case scanPool <- entry:
				// iterate again
			case <-hdb.tg.StopChan():
				// quit
				return
			}
		}
	}()
}

// updateEntry updates an entry in the hostdb after a scan has taken place.
//
// CAUTION: This function will automatically add multiple scan datapoints to a
// new host to give that host some base uptime. This makes this function
// co-dependent with the host weight functions. Adjustments to the host weight
// functions need to keep this function in mind, and vice-versa.
func (hdb *HostDB) updateEntry(entry modules.HostDBEntry, netErr error) {
	// If the scan failed because we don't have Internet access, toss out this update.
	if netErr != nil && !hdb.gateway.Online() {
		return
	}

	// Grab the host from the host tree, and update it with the new settings.
	newEntry, exists := hdb.hostTree.Select(entry.PublicKey)
	if exists {
		newEntry.HostExternalSettings = entry.HostExternalSettings
	} else {
		newEntry = entry
	}

	// Update the recent interactions with this host.
	if netErr == nil {
		newEntry.RecentSuccessfulInteractions++
	} else {
		newEntry.RecentFailedInteractions++
	}

	// Add the datapoints for the scan.
	if len(newEntry.ScanHistory) < 2 {
		// Add two scans to the scan history. Two are needed because the scans
		// are forward looking, but we want this first scan to represent as
		// much as one week of uptime or downtime.
		earliestStartTime := time.Now().Add(time.Hour * 7 * 24 * -1)                                                   // Permit up to a week of starting uptime or downtime.
		suggestedStartTime := time.Now().Add(time.Minute * 10 * time.Duration(hdb.blockHeight-entry.FirstSeen+1) * -1) // Add one to FirstSeen in case FirstSeen is this block, which guarantees increasing order.
		if suggestedStartTime.Before(earliestStartTime) {
			suggestedStartTime = earliestStartTime
		}
		newEntry.ScanHistory = modules.HostDBScans{
			{Timestamp: suggestedStartTime, Success: netErr == nil},
			{Timestamp: time.Now(), Success: netErr == nil},
		}
	} else {
		if newEntry.ScanHistory[len(newEntry.ScanHistory)-1].Success && netErr != nil {
			hdb.log.Debugf("Host %v is being downgraded from an online host to an offline host: %v\n", newEntry.PublicKey.String(), netErr)
		}

		// Make sure that the current time is after the timestamp of the
		// previous scan. It may not be if the system clock has changed. This
		// will prevent the sort-check sanity checks from triggering.
		newTimestamp := time.Now()
		prevTimestamp := newEntry.ScanHistory[len(newEntry.ScanHistory)-1].Timestamp
		if !newTimestamp.After(prevTimestamp) {
			newTimestamp = prevTimestamp.Add(time.Second)
		}

		// Append the new scan, which is now guaranteed to be timestamped after
		// the previous scan.
		newEntry.ScanHistory = append(newEntry.ScanHistory, modules.HostDBScan{Timestamp: newTimestamp, Success: netErr == nil})
	}

	// Check whether any of the recent scans demonstrate uptime. The pruning and
	// compression of the history ensure that there are only relatively recent
	// scans represented.
	var recentUptime bool
	for _, scan := range newEntry.ScanHistory {
		if scan.Success {
			recentUptime = true
		}
	}

	// If the host has been offline for too long, delete the host from the
	// hostdb. Only delete if there have been enough scans over a long enough
	// period to be confident that the host really is offline for good.
	if time.Now().Sub(newEntry.ScanHistory[0].Timestamp) > maxHostDowntime && !recentUptime && len(newEntry.ScanHistory) >= minScans {
		err := hdb.hostTree.Remove(newEntry.PublicKey)
		if err != nil {
			hdb.log.Println("ERROR: unable to remove host entry which has had a ton of downtime:", err)
		}

		// The function should terminate here as no more interaction is needed
		// with this host.
		return
	}

	// Compress any old scans into the historic values.
	for len(newEntry.ScanHistory) > minScans && time.Now().Sub(newEntry.ScanHistory[0].Timestamp) > maxHostDowntime {
		timePassed := newEntry.ScanHistory[1].Timestamp.Sub(newEntry.ScanHistory[0].Timestamp)
		if newEntry.ScanHistory[0].Success {
			newEntry.HistoricUptime += timePassed
		} else {
			newEntry.HistoricDowntime += timePassed
		}
		newEntry.ScanHistory = newEntry.ScanHistory[1:]
	}

	// Add the updated entry.
	if !exists {
		err := hdb.hostTree.Insert(newEntry)
		if err != nil {
			hdb.log.Println("ERROR: unable to insert entry which was thought to be new:", err)
		} else {
			hdb.log.Debugf("Adding host %v to the hostdb. Net error: %v\n", newEntry.PublicKey.String(), netErr)
		}
	} else {
		err := hdb.hostTree.Modify(newEntry)
		if err != nil {
			hdb.log.Println("ERROR: unable to modify entry which is thought to exist:", err)
		} else {
			hdb.log.Debugf("Updating host %v in the hostdb. Net error: %v\n", newEntry.PublicKey.String(), netErr)
		}
	}
}

// managedScanHost will connect to a host and request its settings, verifying
// uptime and updating the hostdb entry to reflect the outcome of the scan.
func (hdb *HostDB) managedScanHost(entry modules.HostDBEntry) {
	// Request settings from the queued host entry.
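	// The entry is a local copy; the authoritative version is re-fetched from
	// the host tree inside updateEntry once the scan completes.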
	netAddr := entry.NetAddress
	pubKey := entry.PublicKey
	hdb.log.Debugf("Scanning host %v at %v", pubKey, netAddr)

	// Update historic interactions of the entry if necessary.
	hdb.mu.RLock()
	updateHostHistoricInteractions(&entry, hdb.blockHeight)
	hdb.mu.RUnlock()

	var settings modules.HostExternalSettings
	var latency time.Duration
	err := func() error {
		timeout := hostRequestTimeout
		hdb.mu.RLock()
		if len(hdb.initialScanLatencies) > minScansForSpeedup {
			build.Critical("initialScanLatencies should never be greater than minScansForSpeedup")
		}
		if !hdb.initialScanComplete && len(hdb.initialScanLatencies) == minScansForSpeedup {
			// During the initial scan, once minScansForSpeedup latencies have
			// been collected, use median(initialScanLatencies) multiplied by
			// scanSpeedupMedianMultiplier as the new timeout to speed up the
			// scanning process, capped at the default hostRequestTimeout.
			timeout = hdb.initialScanLatencies[len(hdb.initialScanLatencies)/2]
			timeout *= scanSpeedupMedianMultiplier
			if hostRequestTimeout < timeout {
				timeout = hostRequestTimeout
			}
		}
		hdb.mu.RUnlock()

		dialer := &net.Dialer{
			Cancel:  hdb.tg.StopChan(),
			Timeout: timeout,
		}
		start := time.Now()
		conn, err := dialer.Dial("tcp", string(netAddr))
		latency = time.Since(start)
		if err != nil {
			return err
		}
		connCloseChan := make(chan struct{})
		go func() {
			select {
			case <-hdb.tg.StopChan():
			case <-connCloseChan:
			}
			conn.Close()
		}()
		defer close(connCloseChan)
		conn.SetDeadline(time.Now().Add(hostScanDeadline))

		err = encoding.WriteObject(conn, modules.RPCSettings)
		if err != nil {
			return err
		}
		var pubkey crypto.PublicKey
		copy(pubkey[:], pubKey.Key)
		return crypto.ReadSignedObject(conn, &settings, maxSettingsLen, pubkey)
	}()
	if err != nil {
		hdb.log.Debugf("Scan of host at %v failed: %v", netAddr, err)
	} else {
		hdb.log.Debugf("Scan of host at %v succeeded.", netAddr)
		entry.HostExternalSettings = settings
	}
	success := err == nil

	hdb.mu.Lock()
	defer hdb.mu.Unlock()
	// Update the host tree with the results of the scan, including any error
	// that was returned.
	hdb.updateEntry(entry, err)

	// Add the scan to the initialScanLatencies if it was successful.
	if success && len(hdb.initialScanLatencies) < minScansForSpeedup {
		hdb.initialScanLatencies = append(hdb.initialScanLatencies, latency)
		// If the slice has reached its maximum size we sort it.
		if len(hdb.initialScanLatencies) == minScansForSpeedup {
			sort.Slice(hdb.initialScanLatencies, func(i, j int) bool {
				return hdb.initialScanLatencies[i] < hdb.initialScanLatencies[j]
			})
		}
	}
}

// managedWaitForScans is a helper function that blocks until the hostDB's
// scanList is empty.
func (hdb *HostDB) managedWaitForScans() {
	for {
		hdb.mu.Lock()
		length := len(hdb.scanList)
		hdb.mu.Unlock()
		if length == 0 {
			break
		}
		select {
		case <-hdb.tg.StopChan():
		case <-time.After(scanCheckInterval):
		}
	}
}

// threadedProbeHosts pulls hosts from the scan pool and runs a scan on each of
// them.
func (hdb *HostDB) threadedProbeHosts(scanPool <-chan modules.HostDBEntry) {
	for hostEntry := range scanPool {
		// Block until the hostdb has internet connectivity.
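		// The gateway is polled every thirty seconds; a shutdown signal aborts
		// the wait and the worker exits.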
		for {
			hdb.mu.RLock()
			online := hdb.gateway.Online()
			hdb.mu.RUnlock()
			if online {
				break
			}
			select {
			case <-time.After(time.Second * 30):
				continue
			case <-hdb.tg.StopChan():
				return
			}
		}

		// There appears to be internet connectivity, continue with the
		// scan.
		hdb.managedScanHost(hostEntry)
	}
}

// threadedScan is an ongoing function which will query the full set of hosts
// every few hours to see who is online and available for uploading.
func (hdb *HostDB) threadedScan() {
	err := hdb.tg.Add()
	if err != nil {
		return
	}
	defer hdb.tg.Done()

	// Wait until the consensus set is synced. Only then can we be sure that
	// the initial scan covers the whole network.
	for {
		if hdb.cs.Synced() {
			break
		}
		select {
		case <-hdb.tg.StopChan():
			return
		case <-time.After(scanCheckInterval):
		}
	}

	// Block scan when a specific dependency is provided.
	hdb.deps.Disrupt("BlockScan")

	// The initial scan might have been interrupted. Queue one scan for every
	// announced host that was missed by the initial scan and wait for the
	// scans to finish before starting the scan loop.
	allHosts := hdb.hostTree.All()
	hdb.mu.Lock()
	for _, host := range allHosts {
		if len(host.ScanHistory) == 0 && host.HistoricUptime == 0 && host.HistoricDowntime == 0 {
			hdb.queueScan(host)
		}
	}
	hdb.mu.Unlock()
	hdb.managedWaitForScans()

	// Set the flag to indicate that the initial scan is complete.
	hdb.mu.Lock()
	hdb.initialScanComplete = true
	hdb.mu.Unlock()

	for {
		// Set up a scan for the hostCheckupQuantity most valuable hosts in the
		// hostdb. Hosts that fail their scans will be docked significantly,
		// pushing them further back in the hierarchy, ensuring that for the
		// most part only online hosts are getting scanned unless there are
		// fewer than hostCheckupQuantity of them.

		// Grab a set of hosts to scan, grabbing both online and offline hosts
		// to get high diversity.
		var onlineHosts, offlineHosts []modules.HostDBEntry
		allHosts := hdb.hostTree.All()
		for i := len(allHosts) - 1; i >= 0; i-- {
			if len(onlineHosts) >= hostCheckupQuantity && len(offlineHosts) >= hostCheckupQuantity {
				break
			}

			// Figure out if the host is online or offline.
			host := allHosts[i]
			online := len(host.ScanHistory) > 0 && host.ScanHistory[len(host.ScanHistory)-1].Success
			if online && len(onlineHosts) < hostCheckupQuantity {
				onlineHosts = append(onlineHosts, host)
			} else if !online && len(offlineHosts) < hostCheckupQuantity {
				offlineHosts = append(offlineHosts, host)
			}
		}

		// Queue the scans for each host.
		hdb.log.Println("Performing scan on", len(onlineHosts), "online hosts and", len(offlineHosts), "offline hosts.")
		hdb.mu.Lock()
		for _, host := range onlineHosts {
			hdb.queueScan(host)
		}
		for _, host := range offlineHosts {
			hdb.queueScan(host)
		}
		hdb.mu.Unlock()

		// Sleep for a random amount of time before doing another round of
		// scanning. The minimums and maximums keep the scan time reasonable,
		// while the randomness prevents the scanning from always happening at
		// the same time of day or week.
		sleepRange := uint64(maxScanSleep - minScanSleep)
		sleepTime := minScanSleep + time.Duration(fastrand.Uint64n(sleepRange))

		// Sleep until it's time for the next scan cycle.
		select {
		case <-hdb.tg.StopChan():
			return
		case <-time.After(sleepTime):
		}
	}
}