gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/healthloop.go

package renter

// healthloop.go houses the code that runs the health loop. The health loop is
// called the health loop because its main purpose is to check the churn levels
// on all of the files. As hosts enter and leave the Sia network, we need to
// make sure that files are being repaired.
//
// The health loop does its job by just generally updating the metadata of all
// directories, so it could just as well be called the 'metadata loop'. But if
// there wasn't host churn on the network, the health loop probably wouldn't
// exist, and aggregate metadata would just be updated as the files are updated.
//
// NOTE: The stateful variable of the health loop is not exposed to the renter
// in any way. For the most part, the renter can learn everything useful about
// the health loop by checking the numbers in the root siadir.

// TODO: Once tagged logging is in place, everywhere that a log has the prefix
// 'HEALTH LOOP', we can swap out that log.Println for a
// log.Tagline("health-loop", $msg), meaning those lines will always print if
// someone activates the tag "health-loop" in the logger. "HEALTH LOOP VERBOSE"
// logs can be given the tag "health-loop-verbose".

import (
	"fmt"
	"sync/atomic"
	"time"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/skymodules"
)

const (
	// The system scan time estimator decay values determine how much decay
	// should be applied to the estimator. A decay that is closer to 1 will take
	// into account more historical data, and a decay that is closer to 0 will
	// be more responsive to changes in the environment.
	systemScanTimeEstimatorDecay = 0.9
)

var (
	// emptyFilesystemSleepDuration determines how long the health loop will
	// sleep if there are no files in the filesystem.
	emptyFilesystemSleepDuration = build.Select(build.Var{
		Dev:      5 * time.Second,
		Standard: 5 * time.Minute,
		Testing:  2 * time.Second,
	}).(time.Duration)

	// healthLoopErrorSleepDuration indicates how long the health loop should
	// sleep before retrying if there is an error preventing progress.
	healthLoopErrorSleepDuration = build.Select(build.Var{
		Dev:      9 * time.Second,
		Standard: 5 * time.Minute,
		Testing:  time.Second,
	}).(time.Duration)

	// healthLoopResetInterval defines how frequently the health loop resets,
	// cleaning out its cache and restarting from root.
	healthLoopResetInterval = build.Select(build.Var{
		Dev:      30 * time.Second,
		Standard: 15 * time.Minute,
		Testing:  2 * time.Second,
	}).(time.Duration)

	// TargetHealthCheckFrequency defines how frequently we want to update the
	// health of the filesystem when everything is running smoothly. The goal
	// of the health check system is to spread the health checks of files over
	// this interval, so that the load of performing health checks is as light
	// as possible when the system is healthy.
	//
	// For standard builds, we're targeting 24 hours as a sign of a filesystem
	// in good health. This value is picked based on the rate at which hosts
	// churn through Skynet - in the course of 24 hours, we should never have
	// enough churn to have built up a concerning amount of repair burden.
	TargetHealthCheckFrequency = build.Select(build.Var{
		Dev:      3 * time.Minute,
		Standard: 24 * time.Hour,
		Testing:  5 * time.Second,
	}).(time.Duration)

	// urgentHealthCheckFrequency is the time at which we feel the health of
	// the system has reached an urgent state: we haven't checked the health of
	// certain files in so long that the system should be running at full speed
	// performing only health checks.
	//
	// As the health time of the files in the filesystem grows from the target
	// health check frequency to the urgent health check frequency, the
	// percentage of resources that are devoted to the health checks will
	// linearly increase. If the recent health time of current files is nearly
	// at the urgent frequency, the health loop will be running nearly at full
	// speed. If the recent health time of the current files is only halfway
	// there, the health loop will run halfway between proportional speed and
	// full speed.
	urgentHealthCheckFrequency = build.Select(build.Var{
		Dev:      9 * time.Minute,
		Standard: 72 * time.Hour,
		Testing:  10 * time.Second,
	}).(time.Duration)
)

// healthLoopDirFinder is a helper structure which keeps track of which
// directories the health loop has visited and which directories still need to
// be visited.
//
// NOTE: this struct is not thread safe; it is only intended to be used in
// single-threaded situations.
type healthLoopDirFinder struct {
	// If the user has triggered a manual check, the health loop should run at
	// full speed until the check is complete. We track whether it's complete
	// by looking at whether the latest aggregate health time is later than the
	// moment the check was triggered.
	manualCheckTime time.Time

	nextDir          skymodules.SiaPath // The next dir to scan and update.
	filesInNextDir   uint64             // An approximation of the number of files in the next dir we will be scanning.
	leastRecentCheck time.Time          // The time of the least recently checked dir in the filesystem.
	totalFiles       uint64             // An approximation of the total number of files in the filesystem.

	// These variables are used to estimate how long it takes to scan the
	// filesystem when you exclude the sleeps. The weighted values are used to
	// compute an exponential moving average to get a more accurate estimate
	// based on historic data. The window variables count up the stats in the
	// most recent window of time.
	estimatedSystemScanDuration time.Duration
	systemScanDurationAvg       *expMovingAvg
	windowFilesProcessed        uint64
	windowSleepTime             time.Duration
	windowStartTime             time.Time

	renter *Renter
}

// updateEstimatedSystemScanDuration computes the estimated system scan
// duration of the dirFinder. It uses an exponential moving average,
// compressing historic values and then adding the new values from the recent
// window. Finally, it resets the new values from the recent window so that the
// EMA is not corrupted if called multiple times.
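//
// As a rough, illustrative example (the numbers here are assumed, not taken
// from the source): if the current window processed 1,000 of 100,000 total
// files in 30 seconds of non-sleep time, the window's extrapolated full-scan
// duration is 30s * 100,000 / 1,000 = 3,000 seconds. Assuming the conventional
// EMA formulation with the 0.9 decay defined above,
//
//	avg = 0.9*avg + 0.1*newEstimate
//
// a previous estimate of 2,000 seconds would move to roughly 2,100 seconds
// rather than jumping straight to 3,000.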
func (dirFinder *healthLoopDirFinder) updateEstimatedSystemScanDuration() {
	if dirFinder.windowFilesProcessed > 0 {
		processingTime := time.Since(dirFinder.windowStartTime) - dirFinder.windowSleepTime
		estimatedScanDuration := float64(processingTime) * float64(dirFinder.totalFiles) / float64(dirFinder.windowFilesProcessed)
		dirFinder.systemScanDurationAvg.addDataPoint(estimatedScanDuration)
		dirFinder.estimatedSystemScanDuration = time.Duration(dirFinder.systemScanDurationAvg.average())

		// Set the renter's estimated system scan duration as well.
		atomic.StoreUint64(&dirFinder.renter.atomicSystemHealthScanDuration, uint64(dirFinder.estimatedSystemScanDuration))
	}

	// Reset the window variables.
	dirFinder.windowFilesProcessed = 0
	dirFinder.windowSleepTime = 0
	dirFinder.windowStartTime = time.Now()
}

// reset will reset the dirFinder and start the dirFinder back at the root
// level.
//
// TODO: When tiered caching is added, reset the tiered caching here.
//
// TODO: If we aren't doing everything from root, then upon reset we need to
// commit the directory metadata changes in every part of our caching layer, so
// the changes exist on disk.
func (dirFinder *healthLoopDirFinder) reset() {
	filesProcessed := dirFinder.windowFilesProcessed
	timeTaken := time.Since(dirFinder.windowStartTime) - dirFinder.windowSleepTime
	dirFinder.updateEstimatedSystemScanDuration()
	dirFinder.renter.staticLog.Debugf("HEALTH LOOP: scanned %v files in %v, resulting in a new estimated full scan duration of %v", filesProcessed, timeTaken, dirFinder.estimatedSystemScanDuration)
}

// loadNextDir will find the next directory with the worst health and load
// it.
//
// TODO: This function can be significantly optimized by remembering/caching
// the healths of the levels above us; it's still roughly log(n) space but
// allows us to cut down on the reads and even attempt to linearize.
//
// TODO: We can attempt to linearize by refusing to retreat back up a level if
// the other directories at our current level are reasonably within the timeout
// range, preferring to go deeper here and making the structure more linear in
// the future.
//
// TODO: There's an idea of regionalized DFS, where we attempt to explore all
// the potential directories in order of health, but while we are in a given
// region we potentially add extra directories since it is cheaper to do them
// now while in the region than to come back to them later. The full algorithm
// is a bit involved, consult David before attempting.
func (dirFinder *healthLoopDirFinder) loadNextDir() error {
	// Check if we need to reset the dirFinder.
	if time.Since(dirFinder.windowStartTime) > healthLoopResetInterval {
		dirFinder.reset()
	}

	// Check the siadir metadata for the root files directory.
	siaPath := skymodules.RootSiaPath()
	metadata, err := dirFinder.renter.managedDirectoryMetadata(siaPath)
	if err != nil {
		return errors.AddContext(err, "unable to load root metadata")
	}
	dirFinder.totalFiles = metadata.AggregateNumFiles
	dirFinder.leastRecentCheck = metadata.AggregateLastHealthCheckTime

	// Run a loop that will continually descend into child directories until it
	// discovers the directory with the least recent health check time.
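	//
	// As a rough illustration (paths and times assumed for the example): if
	// the root directory reports an aggregate last health check of 70 hours
	// ago, and its subdirectories 'home' and 'var' report 30 and 70 hours
	// respectively, the loop descends into 'var' and repeats the comparison
	// there, stopping once no subdirectory is at least as stale as the
	// directory currently being held.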
	for {
		// Load any subdirectories.
		subDirSiaPaths, err := dirFinder.renter.managedSubDirectories(siaPath)
		if err != nil {
			errStr := fmt.Sprintf("error when fetching the sub directories of %s", siaPath)
			return errors.AddContext(err, errStr)
		}

		// Find the oldest LastHealthCheckTime of the subdirectories.
		betterSubdirFound := false
		for _, subDirPath := range subDirSiaPaths {
			// Load the metadata of this subdir.
			subMetadata, err := dirFinder.renter.managedDirectoryMetadata(subDirPath)
			if err != nil {
				errStr := fmt.Sprintf("unable to load the metadata of subdirectory %s", subDirPath)
				return errors.AddContext(err, errStr)
			}

			// Check whether this subdir is better.
			if !subMetadata.AggregateLastHealthCheckTime.After(metadata.AggregateLastHealthCheckTime) {
				betterSubdirFound = true
				siaPath = subDirPath
				metadata = subMetadata
			}
		}
		// If no better subdir was discovered, the current directory is the
		// winner.
		if !betterSubdirFound {
			break
		}
	}

	dirFinder.filesInNextDir = metadata.NumFiles
	dirFinder.nextDir = siaPath
	return nil
}

// sleepDurationBeforeNextDir will determine how long the health loop should
// sleep before processing the next directory.
//
// NOTE: The dir finder tries to estimate the amount of time that it takes to
// process the entire filesystem if there was no sleeping. It does this by
// remembering how long it has told callers to sleep, which means that in order
// for the estimate to be correct, the callers *must* sleep after making this
// call.
func (dirFinder *healthLoopDirFinder) sleepDurationBeforeNextDir() time.Duration {
	// If there are no files, return a standard time for sleeping.
	//
	// NOTE: Without this check, you get a divide by zero.
	if dirFinder.totalFiles == 0 {
		dirFinder.renter.staticLog.Println("HEALTH LOOP: sleeping because the total files in the filesystem is zero")
		return emptyFilesystemSleepDuration
	}

	// Sleep before processing any directories. The amount of sleep will be
	// determined by the recent health time of the provided directory compared
	// against the target health time. If the health time is more recent, we
	// will sleep a proportionate amount of time so that we average scanning
	// the entire filesystem once per target interval, but evenly spaced
	// throughout that interval.
	//
	// If the recent check time is later than the target interval, the amount
	// of sleep is reduced proportionally to the distance from the urgent time.
	// This proportional reduction still has a bit of a spreading effect, to
	// keep the load distributed over a large range of time rather than
	// clustered.
	//
	// If the recent check is later than the urgent interval, there is no sleep
	// at all because we need to get the updated health status on the files.
	lrc := dirFinder.leastRecentCheck
	timeSinceLRC := time.Since(lrc)
	urgent := timeSinceLRC > urgentHealthCheckFrequency
	slowScanTime := dirFinder.estimatedSystemScanDuration >= TargetHealthCheckFrequency
	manualCheckActive := dirFinder.manualCheckTime.After(lrc)
	// If a manual check is currently active, or if the condition of the file
	// health is urgent, or if the amount of time it takes to scan the
	// filesystem is longer than the target health interval, do not try to
	// sleep.
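	//
	// As a rough sense of scale for the computation further below (numbers
	// assumed for the example): with the standard 24 hour target, 240,000
	// total files, a 2 hour estimated full scan, and 100 files in the next
	// directory, the proportional sleep is (24h - 2h) * 100 / 240,000, or
	// roughly 33 seconds. If the least recent check is already 12 hours past
	// the 24 hour target - a quarter of the way to the 72 hour urgent mark -
	// that sleep is compressed by 25%, to roughly 24.75 seconds.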
	if urgent || manualCheckActive || slowScanTime {
		dirFinder.renter.staticLog.Debugln("HEALTH LOOP VERBOSE: skipping a sleep", urgent, manualCheckActive, slowScanTime)
		return 0
	}

	// Compute the sleepTime. We want to sleep such that we check files at a
	// rate that is perfectly evenly spread over the target health check
	// interval. To compute that, you divide the target health check interval
	// by the total number of files.
	//
	// We update an entire directory at once though, so we need to multiply the
	// sleep time by the total number of files in the directory.
	//
	// Implemented naively, the average amount of time we sleep per cycle is
	// exactly equal to the target health check interval, which gives us zero
	// computational time to do the health check itself. To compensate for
	// that, we track how much time we spend in system scan per cycle and
	// subtract that from the numerator of the above described equation.
	desiredSleepPerScan := TargetHealthCheckFrequency - dirFinder.estimatedSystemScanDuration
	sleepTime := desiredSleepPerScan * time.Duration(dirFinder.filesInNextDir) / time.Duration(dirFinder.totalFiles)
	// If we are behind schedule, we compress the sleep time proportionally to
	// how far behind schedule we are.
	if timeSinceLRC > TargetHealthCheckFrequency {
		// We are behind schedule, compute the percentage progress towards
		// urgency that we are. For example, if we are 1 minute later than the
		// target health check frequency, and the urgent frequency is 100
		// minutes later than the target frequency, reduce the amount of sleep
		// by 1%. If 2 minutes later than target, reduce by 2%, etc.
		//
		// NOTE: This is safe from divide by zero errors because we check
		// earlier in the program that the urgent time is strictly greater than
		// the target time.
		compressionNum := float64(timeSinceLRC - TargetHealthCheckFrequency)
		compressionDenom := float64(urgentHealthCheckFrequency - TargetHealthCheckFrequency)
		compression := 1 - (compressionNum / compressionDenom)
		sleepTime = time.Duration(float64(sleepTime) * compression)
	}
	dirFinder.windowSleepTime += sleepTime
	return sleepTime
}

// processNextDir performs the actual health check and update on the directory
// that was discovered in loadNextDir.
func (dirFinder *healthLoopDirFinder) processNextDir() error {
	// Scan and update the healths of all the files in the directory, and
	// update the corresponding directory metadata.
	nextDir := dirFinder.nextDir
	err := dirFinder.renter.managedUpdateFilesInDir(nextDir)
	if err != nil {
		errStr := fmt.Sprintf("unable to process directory %s from within the health loop", nextDir)
		return errors.AddContext(err, errStr)
	}
	dirFinder.windowFilesProcessed += dirFinder.filesInNextDir
	err = dirFinder.renter.managedUpdateDirMetadata(nextDir)
	if err != nil {
		errStr := fmt.Sprintf("unable to update the metadata of directory %s", nextDir)
		return errors.AddContext(err, errStr)
	}

	// Update the metadatas of all the parent directories up to root. This
	// won't scan and update all of the inner files, it'll just use the
	// metadatas that the inner files already have. Most Skynet portals only
	// have files at the leaf directories, so it shouldn't make a big
	// difference either way.
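	//
	// For example (path assumed for illustration), after processing a
	// directory like var/skynet/foo, this walk refreshes the metadata of
	// var/skynet, then var, then the root directory, each time using the
	// already-computed aggregates of its children.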
	for !nextDir.IsRoot() {
		parent, err := nextDir.Dir()
		if err != nil {
			str := fmt.Sprint("unable to get the parent of the non-root siapath:", nextDir)
			err = errors.AddContext(err, str)
			build.Critical(err)
			return err
		}
		nextDir = parent
		err = dirFinder.renter.managedUpdateDirMetadata(nextDir)
		if err != nil {
			errStr := fmt.Sprintf("unable to update the metadata of directory %s", nextDir)
			return errors.AddContext(err, errStr)
		}
	}
	return nil
}

// newHealthLoopDirFinder creates a new dir finder that is ready to perform
// health checks.
func (r *Renter) newHealthLoopDirFinder() *healthLoopDirFinder {
	return &healthLoopDirFinder{
		windowStartTime:       time.Now(),
		systemScanDurationAvg: newExpMovingAvg(systemScanTimeEstimatorDecay),

		renter: r,
	}
}

// threadedHealthLoop is a permanent background loop in the renter that keeps
// the health of the files up to date.
//
// NOTE: The entire health loop is single threaded. If the system is under load
// such that the health loop could benefit from being multi-threaded, the CPU
// and disk IO cost of doing the health checks would probably cause significant
// disruption to other services on the Skynet portal. The health checks really
// should never be consuming more than a fraction of the total system
// resources; if your health loop is not keeping up on a single thread, it's a
// sign that you need more servers rather than more threads.
func (r *Renter) threadedHealthLoop() {
	err := r.tg.Add()
	if err != nil {
		return
	}
	defer r.tg.Done()

	// Perform a check that the constants are configured correctly.
	//
	// NOTE: If this invariant is broken, it could cause divide by zero errors.
	if urgentHealthCheckFrequency <= TargetHealthCheckFrequency {
		panic("constants are set incorrectly, TargetHealthCheckFrequency needs to be smaller than urgentHealthCheckFrequency")
	}

	// Launch the background loop to perform health checks on the filesystem.
	dirFinder := r.newHealthLoopDirFinder()
	for {
		// Load the next directory. In the event of an error, reset and try
		// again.
		err := dirFinder.loadNextDir()
		for err != nil {
			// Log the error and then sleep.
			r.staticLog.Println("Error loading next directory:", err)
			select {
			case <-time.After(healthLoopErrorSleepDuration):
			case <-r.tg.StopChan():
				return
			}

			// Try again to load the next directory. The logic inside the
			// function handles any resets that are required. Normally the
			// reset would be handled out here, but that made the error
			// handling and logging incredibly verbose.
			err = dirFinder.loadNextDir()
		}

		// Sleep before processing the next directory. This also serves as the
		// exit condition for the loop.
		//
		// NOTE: The dirFinder tries to measure a throughput to estimate how
		// long it would take to scan the entire filesystem; this estimate
		// ignores the sleep time. In order for the estimate to be correct,
		// this loop *must* sleep every time that it calls
		// sleepDurationBeforeNextDir().
		//
		// NOTE: Need to make sure this is called after 'loadNextDir' so that
		// the right amount of sleep time is chosen, as the sleep duration will
		// depend on which directory is up next.
		sleepTime := dirFinder.sleepDurationBeforeNextDir()
		r.staticLog.Debugln("HEALTH LOOP VERBOSE: sleeping before next directory", sleepTime)
		select {
		case <-time.After(sleepTime):
		case <-r.tg.StopChan():
			return
		}

		// Process the next directory. We don't retry on error, we just move on
		// to the next directory.
		err = dirFinder.processNextDir()
		if err != nil {
			r.staticLog.Println("Error processing a directory in the health loop:", err)
		}
	}
}