gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/proto/refcounter_acid_test.go

package proto

import (
	"fmt"
	"math"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/NebulousLabs/fastrand"
	"gitlab.com/NebulousLabs/writeaheadlog"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/siatest/dependencies"
	"gitlab.com/SkynetLabs/skyd/skymodules"
	"go.sia.tech/siad/crypto"
	"go.sia.tech/siad/types"
)

// errTestTimeout is returned when the time allotted for testing runs out. It's
// not a real error in the sense that it doesn't cause the test to fail.
var errTestTimeout = errors.New("test timeout has run out")

// tracker allows us to manually keep track of all the changes that happen to a
// refcounter in order to validate them.
type tracker struct {
	// crashed denotes whether we can create new updates or whether we first
	// need to reload from disk
	crashed bool
	counts  []uint16
	mu      sync.Mutex

	// stat counters, updated atomically
	atomicNumRecoveries           uint64
	atomicNumSuccessfulIterations uint64
}

// managedSetCrashed marks the tracker as crashed
func (t *tracker) managedSetCrashed(c bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.crashed = c
}

// managedCrashed checks if the tracker is marked as crashed
func (t *tracker) managedCrashed() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.crashed
}

// TestRefCounterFaultyDisk simulates interacting with a refcounter on a faulty disk.
func TestRefCounterFaultyDisk(t *testing.T) {
	if testing.Short() {
		t.SkipNow()
	}
	t.Parallel()

	// Determine a reasonable timeout for the test
	testTimeout := 5 * time.Second
	if build.VLONG {
		testTimeout = 30 * time.Second
	}

	// Prepare for the tests
	testDir := build.TempDir(t.Name())
	wal, walPath := newTestWAL()
	if err := os.MkdirAll(testDir, skymodules.DefaultDirPerm); err != nil {
		t.Fatal("Failed to create test directory:", err)
	}
	// Create a new ref counter
	testContractID := types.FileContractID(crypto.HashBytes([]byte("contractId")))
	rcFilePath := filepath.Join(testDir, testContractID.String()+refCounterExtension)
	// Create the faulty disk dependency
	fdd := dependencies.NewFaultyDiskDependency(10000) // Fails after 10000 writes.
	fdd.Enable()
	// Attach it to the refcounter
	rc, err := newCustomRefCounter(rcFilePath, 200, wal, fdd)
	if err != nil {
		t.Fatal("Failed to create a reference counter:", err)
	}

	// Create a struct to monitor all changes happening to the test refcounter.
	// At the end we'll use it to validate all changes.
	track := newTracker(rc)

	// testTimeoutChan will signal to all goroutines that it's time to wrap up and exit
	testTimeoutChan := make(chan struct{})
	time.AfterFunc(testTimeout, func() {
		close(testTimeoutChan)
	})

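	// wg is used to wait for all updater goroutines to finish before the
	// refcounter is reloaded from disk.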
	var wg sync.WaitGroup
	// The outer loop keeps restarting the tests until the time runs out
OUTER:
	for {
		if track.managedCrashed() {
			t.Fatal("The tracker is crashed on start.")
		}
		// Run a high number of tests in parallel
		for i := 0; i < runtime.NumCPU()*10; i++ {
			wg.Add(1)
			go func(n int) {
				defer wg.Done()
				errLocal := performUpdates(rc, track, testTimeoutChan)
				if errLocal != nil && !errors.Contains(errLocal, dependencies.ErrDiskFault) && !errors.Contains(errLocal, errTimeoutOnLock) {
					// We have a real error - fail the test
					t.Error(errLocal)
				}
			}(i)
		}
		wg.Wait()

		select {
		case <-testTimeoutChan:
			// the time has run out, finish the test
			break OUTER
		default:
			// there is still time, load the wal from disk and re-run the test
			rcFromDisk, err := reloadRefCounter(rcFilePath, walPath, fdd, track, testTimeoutChan)
			if errors.Contains(err, errTestTimeout) {
				break OUTER
			}
			if err != nil {
				t.Fatal("Failed to reload wal from disk:", err)
			}
			// we only assign it when there is no error because we need the
			// latest reference for the sanity check we do against the tracker
			// struct at the end
			rc = rcFromDisk
		}
	}

	// Load the WAL from disk and apply all outstanding txns.
	// Try until successful.
	for {
		wal, err = loadWal(rcFilePath, walPath, fdd)
		if errors.Contains(err, dependencies.ErrDiskFault) {
			// Disk has failed, all future attempts to load the wal will fail so
			// we need to reset the dependency and try again
			fdd.Reset()
			atomic.AddUint64(&track.atomicNumRecoveries, 1)
			continue
		}
		if err != nil {
			t.Fatal(err)
		}
		break // successful
	}

	// Load the refcounter from disk.
	rc, err = loadRefCounter(rcFilePath, wal)
	if err != nil {
		t.Fatal(err)
	}

	// Validate changes
	if err = validateStatusAfterAllTests(rc, track); err != nil {
		t.Fatal(err)
	}

	t.Logf("\nRecovered from %v disk failures\n", track.atomicNumRecoveries)
	t.Logf("Inner loop %v iterations without failures\n", track.atomicNumSuccessfulIterations)
}

// loadWal reads the wal from disk and applies all outstanding transactions
func loadWal(rcFilePath string, walPath string, fdd *dependencies.DependencyFaultyDisk) (_ *writeaheadlog.WAL, err error) {
	// load the wal from disk
	txns, newWal, err := writeaheadlog.New(walPath)
	if err != nil {
		return nil, errors.AddContext(err, "failed to load wal from disk")
	}
	f, err := fdd.OpenFile(rcFilePath, os.O_RDWR, skymodules.DefaultFilePerm)
	if err != nil {
		return nil, errors.AddContext(err, "failed to open refcounter file in order to apply updates")
	}
	defer func() {
		err = errors.Compose(err, f.Close())
	}()
	// apply any outstanding transactions
	for _, txn := range txns {
		if err := applyUpdates(f, txn.Updates...); err != nil {
			return nil, errors.AddContext(err, "failed to apply updates")
		}
		if err := txn.SignalUpdatesApplied(); err != nil {
			return nil, errors.AddContext(err, "failed to signal updates applied")
		}
	}
	return newWal, f.Sync()
}

// newTracker creates a tracker instance and initialises its counts slice
func newTracker(rc *refCounter) *tracker {
	t := &tracker{
		counts: make([]uint16, rc.numSectors),
	}
	for i := uint64(0); i < rc.numSectors; i++ {
		c, err := rc.callCount(i)
		if err != nil {
			panic("Failed to read count from refcounter.")
		}
		t.counts[i] = c
	}
	return t
}

// performUpdates keeps applying a random number of operations on the refcounter
// until an error occurs.
func performUpdates(rcLocal *refCounter, tr *tracker, testTimeoutChan <-chan struct{}) error {
	for {
		err := performUpdateOperations(rcLocal, tr)
		if err != nil {
			// we have an error; simulated or not, we should return
			return err
		}
		if tr.managedCrashed() {
			return nil
		}

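		// Count an iteration of the inner loop that completed without errors.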
		atomic.AddUint64(&tr.atomicNumSuccessfulIterations, 1)

		select {
		case <-testTimeoutChan:
			return nil
		default:
		}
	}
}

// performUpdateOperations executes a randomised set of updates within an
// update session.
func performUpdateOperations(rc *refCounter, tr *tracker) (err error) {
	err = rc.managedStartUpdateWithTimeout(100 * time.Millisecond)
	if err != nil {
		// don't fail the test on a timeout on the lock
		if errors.Contains(err, errTimeoutOnLock) {
			err = nil
		}
		return
	}
	// This will wipe the temporary in-mem changes to the counters.
	// On success that's OK.
	// On error we need to crash anyway, so it's OK as well.
	defer rc.callUpdateApplied()

	// We can afford to lock the tracker because only one goroutine is
	// allowed to make changes at any given time anyway.
	tr.mu.Lock()
	defer func() {
		// If we're returning a disk error we need to crash in order to avoid
		// data corruption.
		if errors.Contains(err, dependencies.ErrDiskFault) {
			tr.crashed = true
		}
		tr.mu.Unlock()
	}()
	if tr.crashed {
		return nil
	}

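	// Collect all updates generated below so they can be applied in a single
	// WAL transaction at the end of the session.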
	var updates []writeaheadlog.Update
	var u writeaheadlog.Update

	// 50% chance to increment, 2 chances
	for i := 0; i < 2; i++ {
		if fastrand.Intn(100) < 50 {
			secIdx := fastrand.Uint64n(rc.numSectors)
			// check if the operation is valid - we won't gain anything
			// from hitting an overflow
			if errValidate := validateIncrement(rc, secIdx); errValidate != nil {
				continue
			}
			if u, err = rc.callIncrement(secIdx); err != nil {
				return
			}
			tr.counts[secIdx]++
			updates = append(updates, u)
		}
	}

	// 50% chance to decrement, 2 chances
	for i := 0; i < 2; i++ {
		if fastrand.Intn(100) < 50 {
			secIdx := fastrand.Uint64n(rc.numSectors)
			// check if the operation is valid - we won't gain anything
			// from hitting an underflow
			if errValidate := validateDecrement(rc, secIdx); errValidate != nil {
				continue
			}
			if u, err = rc.callDecrement(secIdx); err != nil {
				return
			}
			tr.counts[secIdx]--
			updates = append(updates, u)
		}
	}

	// 40% chance to append
	if fastrand.Intn(100) < 40 {
		if u, err = rc.callAppend(); err != nil {
			return
		}
		tr.counts = append(tr.counts, 1)
		updates = append(updates, u)
	}

	// 20% chance to drop up to 2 sectors
	if fastrand.Intn(100) < 20 {
		secNum := fastrand.Uint64n(3)
		// check if the operation is valid - we won't gain anything
		// from running out of sectors
		if errValidate := validateDropSectors(rc, secNum); errValidate == nil {
			if u, err = rc.callDropSectors(secNum); err != nil {
				return
			}
			tr.counts = tr.counts[:len(tr.counts)-int(secNum)]
			updates = append(updates, u)
		}
	}

	// 20% chance to swap sectors
	if fastrand.Intn(100) < 20 {
		var us []writeaheadlog.Update
		secIdx1 := fastrand.Uint64n(rc.numSectors)
		secIdx2 := fastrand.Uint64n(rc.numSectors)
		if us, err = rc.callSwap(secIdx1, secIdx2); err != nil {
			return
		}
		tr.counts[secIdx1], tr.counts[secIdx2] = tr.counts[secIdx2], tr.counts[secIdx1]
		updates = append(updates, us...)
	}

	// In case of an error in its critical section, CreateAndApplyTransaction
	// will panic. An injected disk error can cause such a panic. We need to
	// recover from it and handle it as a normal faulty disk error within this
	// test.
	defer func() {
		r := recover()
		e, ok := r.(error)
		if ok && errors.Contains(e, dependencies.ErrDiskFault) {
			// we recovered from a panic caused by the faulty disk dependency
			err = dependencies.ErrDiskFault
		} else if r != nil {
			panic(r)
		}
	}()

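	// Apply all the accumulated updates in a single transaction.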
	if len(updates) > 0 {
		err = rc.callCreateAndApplyTransaction(updates...)
	}
	return
}

// reloadRefCounter tries to reload the refcounter from disk. This simulates
// failures during recovery.
func reloadRefCounter(rcFilePath, walPath string, fdd *dependencies.DependencyFaultyDisk, tr *tracker, doneChan <-chan struct{}) (*refCounter, error) {
	// Try to reload the file. This simulates failures during recovery.
	for tries := 1; ; tries++ {
		select {
		case <-doneChan:
			return nil, errTestTimeout
		default:
		}

		// 20% chance to auto-recover
		if fastrand.Intn(100) < 20 {
			fdd.Reset()
		}

		// Every 10 times, we reset the dependency to avoid getting stuck here.
		if tries%10 == 0 {
			fdd.Reset()
		}
		// Reload the wal from disk and apply unfinished txns
		newWal, err := loadWal(rcFilePath, walPath, fdd)
		if errors.Contains(err, dependencies.ErrDiskFault) {
			atomic.AddUint64(&tr.atomicNumRecoveries, 1)
			continue // try again
		} else if err != nil {
			// an actual error occurred, the test must fail
			return nil, err
		}
		// Reload the refcounter from disk
		newRc, err := loadRefCounter(rcFilePath, newWal)
		if err != nil {
			return nil, err
		}
		newRc.staticDeps = fdd
		tr.managedSetCrashed(false)
		return newRc, nil
	}
}

// validateDecrement is a helper function that ensures the counter is above 0.
// This allows us to avoid a counter underflow.
func validateDecrement(rc *refCounter, secNum uint64) error {
	n, err := rc.readCount(secNum)
	// Ignore errors coming from the dependency for this one
	if err != nil && !errors.Contains(err, dependencies.ErrDiskFault) {
		// If the error wasn't caused by the dependency, the test fails.
		return err
	}
	if n < 1 {
		return errors.New("Cannot decrement a zero counter")
	}
	return nil
}

// validateDropSectors is a helper function that ensures the number of sectors
// we want to drop does not exceed the number of sectors in the refcounter.
func validateDropSectors(rc *refCounter, secNum uint64) error {
	if rc.numSectors < secNum {
		return errors.New("Cannot drop more sectors than the total")
	}
	return nil
}

// validateIncrement is a helper function that ensures the counter has not
// reached its maximum value. This allows us to avoid a counter overflow.
func validateIncrement(rc *refCounter, secNum uint64) error {
	n, err := rc.readCount(secNum)
	// Ignore errors coming from the dependency for this one
	if err != nil && !errors.Contains(err, dependencies.ErrDiskFault) {
		// If the error wasn't caused by the dependency, the test fails.
		return err
	}
	if n == math.MaxUint16 {
		return errors.New("Cannot increment a counter at max value")
	}
	return nil
}

// validateStatusAfterAllTests does the final validation of the test by
// comparing the state of the refcounter to the tracker's expected state after
// all the test updates are applied.
func validateStatusAfterAllTests(rc *refCounter, tr *tracker) error {
	rc.mu.Lock()
	numSec := rc.numSectors
	rc.mu.Unlock()
	if numSec != uint64(len(tr.counts)) {
		return fmt.Errorf("Expected %d sectors, got %d\n", uint64(len(tr.counts)), numSec)
	}
	var errorList []error
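	// Compare each sector's counter on disk to the value expected by the tracker.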
	for i := uint64(0); i < numSec; i++ {
		n, err := rc.callCount(i)
		if err != nil {
			return errors.AddContext(err, "failed to read count")
		}
		tr.mu.Lock()
		if n != tr.counts[i] {
			errorList = append(errorList, fmt.Errorf("expected counter value for sector %d to be %d, got %d", i, tr.counts[i], n))
		}
		tr.mu.Unlock()
	}
	if len(errorList) > 0 {
		return errors.AddContext(errors.Compose(errorList...), "sector counter values do not match expectations")
	}
	return nil
}