gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/healthloop.go

package renter

// healthloop.go houses the code that runs the health loop. The health loop is
// called the health loop because its main purpose is to check the churn levels
// on all of the files. As hosts enter and leave the Sia network, we need to
// make sure that files are being repaired.
//
// The health loop does its job by just generally updating the metadata of all
// directories, so it could just as well be called the 'metadata loop'. But if
// there wasn't host churn on the network, the health loop probably wouldn't
// exist, and aggregate metadata would just be updated as the files are updated.
//
// NOTE: The stateful variable of the health loop is not exposed to the renter
// in any way. For the most part, the renter can learn everything useful about
// the health loop by checking the numbers in the root siadir.

// TODO: Once tagged logging is in place, everywhere that a log has the prefix
// 'HEALTH LOOP', we can swap out that log.Println for a
// log.Tagline("health-loop", $msg), meaning those lines will always print if
// someone activates the tag "health-loop" in the logger. "HEALTH LOOP VERBOSE"
// logs can be given the tag "health-loop-verbose".

import (
	"fmt"
	"sync/atomic"
	"time"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/skymodules"
)
const (
	// The system scan time estimator decay values determine how much decay
	// should be applied to the estimator. A decay that is closer to 1 will take
	// into account more historical data, and a decay that is closer to 0 will
	// be more responsive to changes in the environment. (An illustrative sketch
	// of this weighting follows this const block.)
	systemScanTimeEstimatorDecay = 0.9
)

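// exampleDecayedAverage is an illustrative sketch only; the expMovingAvg type
// used by the dir finder is defined elsewhere in this package and may differ
// in detail. Assuming a conventional exponential moving average update of the
// form avg = decay*avg + (1-decay)*sample, a decay of 0.9 keeps roughly 90% of
// the weight on historical data and gives each new sample roughly 10% of the
// weight.
func exampleDecayedAverage(prevAvg, sample, decay float64) float64 {
	return decay*prevAvg + (1-decay)*sample
}
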
var (
	// emptyFilesystemSleepDuration determines how long the health loop will
	// sleep if there are no files in the filesystem.
	emptyFilesystemSleepDuration = build.Select(build.Var{
		Dev:      5 * time.Second,
		Standard: 5 * time.Minute,
		Testing:  2 * time.Second,
	}).(time.Duration)

	// healthLoopErrorSleepDuration indicates how long the health loop should
	// sleep before retrying if there is an error preventing progress.
	healthLoopErrorSleepDuration = build.Select(build.Var{
		Dev:      9 * time.Second,
		Standard: 5 * time.Minute,
		Testing:  time.Second,
	}).(time.Duration)

	// healthLoopResetInterval defines how frequently the health loop resets,
	// cleaning out its cache and restarting from root.
	healthLoopResetInterval = build.Select(build.Var{
		Dev:      30 * time.Second,
		Standard: 15 * time.Minute,
		Testing:  2 * time.Second,
	}).(time.Duration)

	// TargetHealthCheckFrequency defines how frequently we want to update the
	// health of the filesystem when everything is running smoothly. The goal of
	// the health check system is to spread the health checks of files over this
	// interval, so that the load of performing health checks is as light as
	// possible when the system is healthy.
	//
	// For standard builds, we're targeting 24 hours as a sign of a filesystem
	// in good health. This value is picked based on the rate at which hosts
	// churn through Skynet - in the course of 24 hours, we should never have
	// enough churn to have built up a concerning amount of repair burden.
	TargetHealthCheckFrequency = build.Select(build.Var{
		Dev:      3 * time.Minute,
		Standard: 24 * time.Hour,
		Testing:  5 * time.Second,
	}).(time.Duration)

	// urgentHealthCheckFrequency is the time at which we feel the health of the
	// system has reached an urgent state: we haven't checked the health of
	// certain files in so long that the system should be running at full speed
	// performing only health checks.
	//
	// As the health time of the files in the filesystem grows from the target
	// health check frequency to the urgent health check frequency, the
	// percentage of resources that are devoted to the health checks will
	// linearly increase. If the recent health time of current files is nearly
	// at the urgent frequency, the health loop will be running nearly at full
	// speed. If the recent health time of the current files is only halfway
	// there, the health loop will run halfway between proportional speed and
	// full speed. (An illustrative sketch of this ramp follows this var block.)
	urgentHealthCheckFrequency = build.Select(build.Var{
		Dev:      9 * time.Minute,
		Standard: 72 * time.Hour,
		Testing:  10 * time.Second,
	}).(time.Duration)
)

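// exampleSleepCompressionFactor is an illustrative helper; it is not called by
// the health loop, whose real math lives in sleepDurationBeforeNextDir below.
// It sketches the linear ramp described above: as the age of the least recent
// health check moves from TargetHealthCheckFrequency toward
// urgentHealthCheckFrequency, the factor applied to the proportional sleep
// moves from 1 (full sleep) down to 0 (no sleep). With the standard build
// values, a directory last checked 36 hours ago is a quarter of the way from
// the 24 hour target to the 72 hour urgent threshold, so its sleep is
// compressed to 75% of the proportional value.
func exampleSleepCompressionFactor(timeSinceLastCheck time.Duration) float64 {
	if timeSinceLastCheck <= TargetHealthCheckFrequency {
		// Not behind schedule, sleep the full proportional amount.
		return 1
	}
	if timeSinceLastCheck >= urgentHealthCheckFrequency {
		// Urgent, do not sleep at all.
		return 0
	}
	progress := float64(timeSinceLastCheck-TargetHealthCheckFrequency) / float64(urgentHealthCheckFrequency-TargetHealthCheckFrequency)
	return 1 - progress
}
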
// healthLoopDirFinder is a helper structure which keeps track of which
// directories the health loop has visited and which directories still need to
// be visited.
//
// NOTE: this struct is not thread safe; it is only intended to be used in
// single-threaded situations.
type healthLoopDirFinder struct {
	// If the user has triggered a manual check, the health loop should run at
	// full speed until the check is complete. We track whether it's complete by
	// looking at whether the latest aggregate health time is later than the
	// moment the check was triggered.
	manualCheckTime time.Time

	nextDir          skymodules.SiaPath // The next dir to scan and update.
	filesInNextDir   uint64             // An approximation of the number of files in the next dir we will be scanning.
	leastRecentCheck time.Time          // The time of the least recently checked dir in the filesystem.
	totalFiles       uint64             // An approximation of the total number of files in the filesystem.

	// These variables are used to estimate how long it takes to scan the
	// filesystem when you exclude the sleeps. The weighted values are used to
	// compute an exponential moving average to get a more accurate estimate
	// based on historic data. The window variables count up the stats in the
	// most recent window of time.
	estimatedSystemScanDuration time.Duration
	systemScanDurationAvg       *expMovingAvg
	windowFilesProcessed        uint64
	windowSleepTime             time.Duration
	windowStartTime             time.Time

	renter *Renter
}

// updateEstimatedSystemScanDuration computes the estimated system scan
// duration of the dirFinder. It uses an exponential moving average, compressing
// historic values and then adding the new values from the recent window.
// Finally, it resets the new values from the recent window so that the EMA is
// not corrupted if called multiple times. (An illustrative sketch of the
// extrapolation follows this function.)
func (dirFinder *healthLoopDirFinder) updateEstimatedSystemScanDuration() {
	if dirFinder.windowFilesProcessed > 0 {
		processingTime := time.Since(dirFinder.windowStartTime) - dirFinder.windowSleepTime
		estimatedScanDuration := float64(processingTime) * float64(dirFinder.totalFiles) / float64(dirFinder.windowFilesProcessed)
		dirFinder.systemScanDurationAvg.addDataPoint(estimatedScanDuration)
		dirFinder.estimatedSystemScanDuration = time.Duration(dirFinder.systemScanDurationAvg.average())

		// Set the renter's estimated system scan duration as well.
		atomic.StoreUint64(&dirFinder.renter.atomicSystemHealthScanDuration, uint64(dirFinder.estimatedSystemScanDuration))
	}

	// Reset the window variables.
	dirFinder.windowFilesProcessed = 0
	dirFinder.windowSleepTime = 0
	dirFinder.windowStartTime = time.Now()
}

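// exampleFullScanExtrapolation is an illustrative (hypothetical) helper
// mirroring the extrapolation performed above: the non-sleep time spent on the
// files in the recent window is scaled up to the whole filesystem before being
// fed into the moving average. For example, 2 seconds spent processing 100 of
// 10,000 files extrapolates to a 200 second full scan.
func exampleFullScanExtrapolation(processingTime time.Duration, filesProcessed, totalFiles uint64) time.Duration {
	if filesProcessed == 0 {
		// Avoid a divide by zero when nothing was processed in the window.
		return 0
	}
	return time.Duration(float64(processingTime) * float64(totalFiles) / float64(filesProcessed))
}
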
// reset will reset the dirFinder and start the dirFinder back at the root
// level.
//
// TODO: When tiered caching is added, reset the tiered caching here.
//
// TODO: If we aren't doing everything from root, then upon reset we need to
// commit the directory metadata changes in every part of our caching layer, so
// the changes exist on disk.
func (dirFinder *healthLoopDirFinder) reset() {
	filesProcessed := dirFinder.windowFilesProcessed
	timeTaken := time.Since(dirFinder.windowStartTime) - dirFinder.windowSleepTime
	dirFinder.updateEstimatedSystemScanDuration()
	dirFinder.renter.staticLog.Debugf("HEALTH LOOP: scanned %v files in %v, resulting in a new estimated full scan duration of %v", filesProcessed, timeTaken, dirFinder.estimatedSystemScanDuration)
}

// loadNextDir will find the next directory with the worst health and load
// it.
//
// TODO: This function can be significantly optimized by remembering/caching
// the healths of the levels above us; it's still roughly log(n) space but
// allows us to cut down on the reads and even attempt to linearize.
//
// TODO: We can attempt to linearize by refusing to retreat back up a level if
// the other directories at our current level are reasonably within the timeout
// range, preferring to go deeper here and making the structure more linear in
// the future.
//
// TODO: There's an idea of regionalized dfs, where we attempt to explore all
// the potential directories in order of health, but while we are in a given
// region we potentially add extra directories since it is cheaper to do them
// now while in the region than to come back to them later. The full algorithm
// is a bit involved, consult David before attempting.
func (dirFinder *healthLoopDirFinder) loadNextDir() error {
	// Check if we need to reset the dirFinder.
	if time.Since(dirFinder.windowStartTime) > healthLoopResetInterval {
		dirFinder.reset()
	}

	// Check the siadir metadata for the root files directory.
	siaPath := skymodules.RootSiaPath()
	metadata, err := dirFinder.renter.managedDirectoryMetadata(siaPath)
	if err != nil {
		return errors.AddContext(err, "unable to load root metadata")
	}
	dirFinder.totalFiles = metadata.AggregateNumFiles
	dirFinder.leastRecentCheck = metadata.AggregateLastHealthCheckTime

	// Run a loop that will continually descend into child directories until it
	// discovers the directory with the least recent health check time.
	for {
		// Load any subdirectories.
		subDirSiaPaths, err := dirFinder.renter.managedSubDirectories(siaPath)
		if err != nil {
			errStr := fmt.Sprintf("error when fetching the sub directories of %s", siaPath)
			return errors.AddContext(err, errStr)
		}

		// Find the oldest LastHealthCheckTime of the sub directories
		betterSubdirFound := false
		for _, subDirPath := range subDirSiaPaths {
			// Load the metadata of this subdir.
			subMetadata, err := dirFinder.renter.managedDirectoryMetadata(subDirPath)
			if err != nil {
				errStr := fmt.Sprintf("unable to load the metadata of subdirectory %s", subDirPath)
				return errors.AddContext(err, errStr)
			}

			// Check whether this subdir is better.
			if !subMetadata.AggregateLastHealthCheckTime.After(metadata.AggregateLastHealthCheckTime) {
				betterSubdirFound = true
				siaPath = subDirPath
				metadata = subMetadata
			}
		}
		// If a better subdir was not discovered, this is the winning subdir.
		if !betterSubdirFound {
			break
		}
	}

	dirFinder.filesInNextDir = metadata.NumFiles
	dirFinder.nextDir = siaPath
	return nil
}

// sleepDurationBeforeNextDir will determine how long the health loop should
// sleep before processing the next directory.
//
// NOTE: The dir finder tries to estimate the amount of time that it takes to
// process the entire filesystem if there was no sleeping. It does this by
// remembering how long it has told callers to sleep, which means that in order
// for the estimate to be correct, the callers *must* sleep after making this
// call.
func (dirFinder *healthLoopDirFinder) sleepDurationBeforeNextDir() time.Duration {
	// If there are no files, return a standard time for sleeping.
	//
	// NOTE: Without this check, you get a divide by zero.
	if dirFinder.totalFiles == 0 {
		dirFinder.renter.staticLog.Println("HEALTH LOOP: sleeping because the total number of files in the filesystem is zero")
		return emptyFilesystemSleepDuration
	}

	// Sleep before processing any directories. The amount of sleep will be
	// determined by the recent health time of the provided directory compared
	// against the target health time. If the health time is more recent, we
	// will sleep a proportionate amount of time so that we average scanning the
	// entire filesystem once per target interval, but evenly spaced throughout
	// that interval.
	//
	// If the recent check time is later than the target interval, the amount of
	// sleep is reduced proportionally to the distance from the urgent time.
	// This proportional reduction still has a bit of a spreading effect, to
	// keep the load distributed over a large range of time rather than
	// clustered.
	//
	// If the recent check is later than the urgent interval, there is no sleep
	// at all because we need to get the updated health status on the files.
	lrc := dirFinder.leastRecentCheck
	timeSinceLRC := time.Since(lrc)
	urgent := timeSinceLRC > urgentHealthCheckFrequency
	slowScanTime := dirFinder.estimatedSystemScanDuration >= TargetHealthCheckFrequency
	manualCheckActive := dirFinder.manualCheckTime.After(lrc)
	// If a manual check is currently active, or if the condition of the
	// file health is urgent, or if the amount of time it takes to scan the
	// filesystem is longer than the target health interval, do not try to
	// sleep.
	if urgent || manualCheckActive || slowScanTime {
		dirFinder.renter.staticLog.Debugln("HEALTH LOOP VERBOSE: skipping a sleep", urgent, manualCheckActive, slowScanTime)
		return 0
	}

	// Compute the sleepTime. We want to sleep such that we check files
	// at a rate that is perfectly evenly spread over the target health
	// check interval. To compute that, you divide the target health
	// check interval by the total number of files.
	//
	// We update an entire directory at once though, so we need to
	// multiply the sleep time by the total number of files in the
	// directory.
	//
	// Implemented naively, the average amount of time we sleep per
	// cycle is exactly equal to the target health check interval, which
	// gives us zero computational time to do the health check itself.
	// To compensate for that, we track how much time we spend in system
	// scan per cycle and subtract that from the numerator of the above
	// described equation. (A worked example follows this function.)
	desiredSleepPerScan := TargetHealthCheckFrequency - dirFinder.estimatedSystemScanDuration
	sleepTime := desiredSleepPerScan * time.Duration(dirFinder.filesInNextDir) / time.Duration(dirFinder.totalFiles)
	// If we are behind schedule, we compress the sleep time
	// proportionally to how far behind schedule we are.
	if timeSinceLRC > TargetHealthCheckFrequency {
		// We are behind schedule, compute the percentage progress
		// towards urgency that we are. For example, if we are 1 minute
		// later than the target health check frequency, and the urgent
		// frequency is 100 minutes later than the target frequency,
		// reduce the amount of sleep by 1%. If 2 minutes later than
		// target, reduce by 2%, etc.
		//
		// NOTE: This is safe from divide by zero errors because we check
		// earlier in the program that the urgent time is strictly greater than
		// the target time.
		compressionNum := float64(timeSinceLRC - TargetHealthCheckFrequency)
		compressionDenom := float64(urgentHealthCheckFrequency - TargetHealthCheckFrequency)
		compression := 1 - (compressionNum / compressionDenom)
		sleepTime = time.Duration(float64(sleepTime) * compression)
	}
	dirFinder.windowSleepTime += sleepTime
	return sleepTime
}

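// exampleSleepBeforeNextDir is an illustrative (hypothetical) helper mirroring
// the sleep math above; the real loop uses sleepDurationBeforeNextDir, which
// also handles empty filesystems, manual checks, and window bookkeeping. As a
// worked example with the standard build values: a 4 hour estimated scan
// duration, 50 files in the next directory, 10,000 files total, and a least
// recent check 36 hours ago gives a proportional sleep of
// (24h-4h)*50/10000 = 6 minutes, compressed by 25% to 4.5 minutes.
func exampleSleepBeforeNextDir(estimatedScanDuration time.Duration, filesInNextDir, totalFiles uint64, timeSinceLastCheck time.Duration) time.Duration {
	// The real loop skips sleeping entirely for urgent backlogs and slow
	// scans; this sketch just returns 0 for those cases.
	if totalFiles == 0 || timeSinceLastCheck >= urgentHealthCheckFrequency || estimatedScanDuration >= TargetHealthCheckFrequency {
		return 0
	}
	sleep := (TargetHealthCheckFrequency - estimatedScanDuration) * time.Duration(filesInNextDir) / time.Duration(totalFiles)
	return time.Duration(float64(sleep) * exampleSleepCompressionFactor(timeSinceLastCheck))
}
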
// processNextDir performs the actual health check and update on the directory
// that was discovered in loadNextDir.
func (dirFinder *healthLoopDirFinder) processNextDir() error {
	// Scan and update the healths of all the files in the directory, and update
	// the corresponding directory metadata.
	nextDir := dirFinder.nextDir
	err := dirFinder.renter.managedUpdateFilesInDir(nextDir)
	if err != nil {
		errStr := fmt.Sprintf("unable to process directory %s from within the health loop", nextDir)
		return errors.AddContext(err, errStr)
	}
	dirFinder.windowFilesProcessed += dirFinder.filesInNextDir
	err = dirFinder.renter.managedUpdateDirMetadata(nextDir)
	if err != nil {
		errStr := fmt.Sprintf("unable to update the metadata of directory %s", nextDir)
		return errors.AddContext(err, errStr)
	}

	// Update the metadata of all the parent directories up to root. This
	// won't scan and update all of the inner files, it'll just use the
	// metadata that the inner files already have. Most Skynet portals only
	// have files at the leaf directories, so it shouldn't make a big difference
	// either way.
	for !nextDir.IsRoot() {
		parent, err := nextDir.Dir()
		if err != nil {
			str := fmt.Sprint("unable to get the parent of the non-root siapath:", nextDir)
			err = errors.AddContext(err, str)
			build.Critical(err)
			return err
		}
		nextDir = parent
		err = dirFinder.renter.managedUpdateDirMetadata(nextDir)
		if err != nil {
			errStr := fmt.Sprintf("unable to update the metadata of directory %s", nextDir)
			return errors.AddContext(err, errStr)
		}
	}
	return nil
}

// newHealthLoopDirFinder creates a new dir finder that is ready to perform
// health checks.
func (r *Renter) newHealthLoopDirFinder() *healthLoopDirFinder {
	return &healthLoopDirFinder{
		windowStartTime:       time.Now(),
		systemScanDurationAvg: newExpMovingAvg(systemScanTimeEstimatorDecay),

		renter: r,
	}
}

// threadedHealthLoop is a permanent background loop in the renter that keeps
// the health of the files up to date.
//
// NOTE: The entire health loop is single threaded. If the system is under load
// such that the health loop could benefit from being multi-threaded, the CPU
// and disk IO cost of doing the health checks would probably cause significant
// disruption to other services on the Skynet portal. The health checks really
// should never consume more than a fraction of the total system resources; if
// your health loop is not keeping up on a single thread, it's a sign that you
// need more servers rather than more threads.
func (r *Renter) threadedHealthLoop() {
	err := r.tg.Add()
	if err != nil {
		return
	}
	defer r.tg.Done()

	// Perform a check that the constants are configured correctly.
	//
	// NOTE: If this invariant is broken, it could cause divide by zero errors.
	if urgentHealthCheckFrequency <= TargetHealthCheckFrequency {
		panic("constants are set incorrectly, TargetHealthCheckFrequency needs to be smaller than urgentHealthCheckFrequency")
	}

	// Launch the background loop to perform health checks on the filesystem.
	dirFinder := r.newHealthLoopDirFinder()
	for {
		// Load the next directory. In the event of an error, reset and try again.
		err := dirFinder.loadNextDir()
		for err != nil {
			// Log the error and then sleep.
			r.staticLog.Println("Error loading next directory:", err)
			select {
			case <-time.After(healthLoopErrorSleepDuration):
			case <-r.tg.StopChan():
				return
			}

			// Try again to load the next directory. The logic inside the
			// function handles any resets that are required. Normally the reset
			// would be handled out here, but that made the error handling and
			// logging incredibly verbose.
			err = dirFinder.loadNextDir()
		}

		// Sleep before processing the next directory. This also serves as the
		// exit condition for the loop.
		//
		// NOTE: The dirFinder tries to measure a throughput to estimate how
		// long it would take to scan the entire filesystem; this estimate
		// ignores the sleep time. In order for the estimate to be correct, this
		// loop *must* sleep every time that it calls
		// sleepDurationBeforeNextDir().
		//
		// NOTE: Need to make sure this is called after 'loadNextDir' so that
		// the right amount of sleep time is chosen, as the sleep duration will
		// depend on which directory is up next.
		sleepTime := dirFinder.sleepDurationBeforeNextDir()
		r.staticLog.Debugln("HEALTH LOOP VERBOSE: sleeping before next directory", sleepTime)
		select {
		case <-time.After(sleepTime):
		case <-r.tg.StopChan():
			return
		}

		// Process the next directory. We don't retry on error, we just move on
		// to the next directory.
		err = dirFinder.processNextDir()
		if err != nil {
			r.staticLog.Println("Error processing a directory in the health loop:", err)
		}
	}
}
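
// NOTE: This file does not show where the loop is launched. A hypothetical
// (illustrative only) launch site would start it as a goroutine during renter
// startup, e.g.:
//
//	go r.threadedHealthLoop()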