gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/proto/refcounter_acid_test.go

package proto

import (
	"fmt"
	"math"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"gitlab.com/NebulousLabs/errors"
	"gitlab.com/NebulousLabs/fastrand"
	"gitlab.com/NebulousLabs/writeaheadlog"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/siatest/dependencies"
	"gitlab.com/SkynetLabs/skyd/skymodules"
	"go.sia.tech/siad/crypto"
	"go.sia.tech/siad/types"
)

// errTestTimeout is returned when the time allotted for testing runs out. It's
// not a real error in the sense that it doesn't cause the test to fail.
var errTestTimeout = errors.New("test timeout has run out")

// tracker allows us to manually keep track of all the changes that happen to a
// refcounter in order to validate them
type tracker struct {
	// denotes whether we can create new updates or whether we first need to
	// reload from disk
	crashed bool
	counts  []uint16
	mu      sync.Mutex

	// stat counters
	atomicNumRecoveries           uint64
	atomicNumSuccessfulIterations uint64
}

// managedSetCrashed marks the tracker as crashed
func (t *tracker) managedSetCrashed(c bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.crashed = c
}

// managedCrashed checks if the tracker is marked as crashed
func (t *tracker) managedCrashed() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.crashed
}

// TestRefCounterFaultyDisk simulates interacting with a SiaFile on a faulty disk.
func TestRefCounterFaultyDisk(t *testing.T) {
	if testing.Short() {
		t.SkipNow()
	}
	t.Parallel()

	// Determine a reasonable timeout for the test
	testTimeout := 5 * time.Second
	if build.VLONG {
		testTimeout = 30 * time.Second
	}

	// Prepare for the tests
	testDir := build.TempDir(t.Name())
	wal, walPath := newTestWAL()
	if err := os.MkdirAll(testDir, skymodules.DefaultDirPerm); err != nil {
		t.Fatal("Failed to create test directory:", err)
	}
	// Create a new ref counter
	testContractID := types.FileContractID(crypto.HashBytes([]byte("contractId")))
	rcFilePath := filepath.Join(testDir, testContractID.String()+refCounterExtension)
	// Create the faulty disk dependency
	fdd := dependencies.NewFaultyDiskDependency(10000) // Fails after 10000 writes.
	fdd.Enable()
	// Attach it to the refcounter
	rc, err := newCustomRefCounter(rcFilePath, 200, wal, fdd)
	if err != nil {
		t.Fatal("Failed to create a reference counter:", err)
	}

	// Create a struct to monitor all changes happening to the test refcounter.
	// At the end we'll use it to validate all changes.
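	// The tracker mirrors every increment, decrement, append, drop and swap
	// that the refcounter applies successfully.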
	track := newTracker(rc)

	// testTimeoutChan will signal to all goroutines that it's time to wrap up and exit
	testTimeoutChan := make(chan struct{})
	time.AfterFunc(testTimeout, func() {
		close(testTimeoutChan)
	})

	var wg sync.WaitGroup
	// The outer loop keeps restarting the tests until the time runs out
OUTER:
	for {
		if track.managedCrashed() {
			t.Fatal("The tracker is marked as crashed on start.")
		}
		// Run a high number of tests in parallel
		for i := 0; i < runtime.NumCPU()*10; i++ {
			wg.Add(1)
			go func(n int) {
				defer wg.Done()
				errLocal := performUpdates(rc, track, testTimeoutChan)
				if errLocal != nil && !errors.Contains(errLocal, dependencies.ErrDiskFault) && !errors.Contains(errLocal, errTimeoutOnLock) {
					// We have a real error - fail the test
					t.Error(errLocal)
				}
			}(i)
		}
		wg.Wait()

		select {
		case <-testTimeoutChan:
			// the time has run out, finish the test
			break OUTER
		default:
			// there is still time, load the wal from disk and re-run the test
			rcFromDisk, err := reloadRefCounter(rcFilePath, walPath, fdd, track, testTimeoutChan)
			if errors.Contains(err, errTestTimeout) {
				break OUTER
			}
			if err != nil {
				t.Fatal("Failed to reload wal from disk:", err)
			}
			// we only assign it when there is no error because we need the
			// latest reference for the sanity check we do against the tracker
			// struct at the end
			rc = rcFromDisk
		}
	}

	// Load the WAL from disk and apply all outstanding txns.
	// Try until successful.
	for {
		wal, err = loadWal(rcFilePath, walPath, fdd)
		if errors.Contains(err, dependencies.ErrDiskFault) {
			// Disk has failed, all future attempts to load the wal will fail so
			// we need to reset the dependency and try again
			fdd.Reset()
			atomic.AddUint64(&track.atomicNumRecoveries, 1)
			continue
		}
		if err != nil {
			t.Fatal(err)
		}
		break // successful
	}

	// Load the refcounter from disk.
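	// This gives us the final on-disk state, which is compared against the
	// tracker's in-memory counts in validateStatusAfterAllTests below.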
	rc, err = loadRefCounter(rcFilePath, wal)
	if err != nil {
		t.Fatal(err)
	}

	// Validate changes
	if err = validateStatusAfterAllTests(rc, track); err != nil {
		t.Fatal(err)
	}

	t.Logf("\nRecovered from %v disk failures\n", track.atomicNumRecoveries)
	t.Logf("Inner loop %v iterations without failures\n", track.atomicNumSuccessfulIterations)
}

// loadWal reads the wal from disk and applies all outstanding transactions
func loadWal(rcFilePath string, walPath string, fdd *dependencies.DependencyFaultyDisk) (_ *writeaheadlog.WAL, err error) {
	// load the wal from disk
	txns, newWal, err := writeaheadlog.New(walPath)
	if err != nil {
		return nil, errors.AddContext(err, "failed to load wal from disk")
	}
	f, err := fdd.OpenFile(rcFilePath, os.O_RDWR, skymodules.DefaultFilePerm)
	if err != nil {
		return nil, errors.AddContext(err, "failed to open refcounter file in order to apply updates")
	}
	defer func() {
		err = errors.Compose(err, f.Close())
	}()
	// apply any outstanding transactions
	for _, txn := range txns {
		if err := applyUpdates(f, txn.Updates...); err != nil {
			return nil, errors.AddContext(err, "failed to apply updates")
		}
		if err := txn.SignalUpdatesApplied(); err != nil {
			return nil, errors.AddContext(err, "failed to signal updates applied")
		}
	}
	return newWal, f.Sync()
}

// newTracker creates a tracker instance and initialises its counts
// slice
func newTracker(rc *refCounter) *tracker {
	t := &tracker{
		counts: make([]uint16, rc.numSectors),
	}
	for i := uint64(0); i < rc.numSectors; i++ {
		c, err := rc.callCount(i)
		if err != nil {
			panic("Failed to read count from refcounter.")
		}
		t.counts[i] = c
	}
	return t
}

// performUpdates keeps applying a random number of operations on the refcounter
// until an error occurs.
func performUpdates(rcLocal *refCounter, tr *tracker, testTimeoutChan <-chan struct{}) error {
	for {
		err := performUpdateOperations(rcLocal, tr)
		if err != nil {
			// we have an error, fake or not we should return
			return err
		}
		if tr.managedCrashed() {
			return nil
		}

		atomic.AddUint64(&tr.atomicNumSuccessfulIterations, 1)

		select {
		case <-testTimeoutChan:
			return nil
		default:
		}
	}
}

// performUpdateOperations executes a randomised set of updates within an
// update session.
func performUpdateOperations(rc *refCounter, tr *tracker) (err error) {
	err = rc.managedStartUpdateWithTimeout(100 * time.Millisecond)
	if err != nil {
		// don't fail the test on a timeout on the lock
		if errors.Contains(err, errTimeoutOnLock) {
			err = nil
		}
		return
	}
	// This will wipe the temporary in-mem changes to the counters.
	// On success that's OK.
	// On error we need to crash anyway, so it's OK as well.
	defer rc.callUpdateApplied()

	// We can afford to lock the tracker because only one goroutine is
	// allowed to make changes at any given time anyway.
	tr.mu.Lock()
	defer func() {
		// If we're returning a disk error we need to crash in order to avoid
		// data corruption.
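		// "Crashing" here just means flagging the tracker so that no further
		// updates are recorded until the refcounter is reloaded from disk and
		// the flag is cleared again.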
		if errors.Contains(err, dependencies.ErrDiskFault) {
			tr.crashed = true
		}
		tr.mu.Unlock()
	}()
	if tr.crashed {
		return nil
	}

	var updates []writeaheadlog.Update
	var u writeaheadlog.Update

	// 50% chance to increment, 2 chances
	for i := 0; i < 2; i++ {
		if fastrand.Intn(100) < 50 {
			secIdx := fastrand.Uint64n(rc.numSectors)
			// check if the operation is valid - we won't gain anything
			// from hitting an overflow
			if errValidate := validateIncrement(rc, secIdx); errValidate != nil {
				continue
			}
			if u, err = rc.callIncrement(secIdx); err != nil {
				return
			}
			tr.counts[secIdx]++
			updates = append(updates, u)
		}
	}

	// 50% chance to decrement, 2 chances
	for i := 0; i < 2; i++ {
		if fastrand.Intn(100) < 50 {
			secIdx := fastrand.Uint64n(rc.numSectors)
			// check if the operation is valid - we won't gain anything
			// from hitting an underflow
			if errValidate := validateDecrement(rc, secIdx); errValidate != nil {
				continue
			}
			if u, err = rc.callDecrement(secIdx); err != nil {
				return
			}
			tr.counts[secIdx]--
			updates = append(updates, u)
		}
	}

	// 40% chance to append
	if fastrand.Intn(100) < 40 {
		if u, err = rc.callAppend(); err != nil {
			return
		}
		tr.counts = append(tr.counts, 1)
		updates = append(updates, u)
	}

	// 20% chance to drop up to 2 sectors
	if fastrand.Intn(100) < 20 {
		secNum := fastrand.Uint64n(3)
		// check if the operation is valid - we won't gain anything
		// from running out of sectors
		if errValidate := validateDropSectors(rc, secNum); errValidate == nil {
			if u, err = rc.callDropSectors(secNum); err != nil {
				return
			}
			tr.counts = tr.counts[:len(tr.counts)-int(secNum)]
			updates = append(updates, u)
		}
	}

	// 20% chance to swap sectors
	if fastrand.Intn(100) < 20 {
		var us []writeaheadlog.Update
		secIdx1 := fastrand.Uint64n(rc.numSectors)
		secIdx2 := fastrand.Uint64n(rc.numSectors)
		if us, err = rc.callSwap(secIdx1, secIdx2); err != nil {
			return
		}
		tr.counts[secIdx1], tr.counts[secIdx2] = tr.counts[secIdx2], tr.counts[secIdx1]
		updates = append(updates, us...)
	}

	// In case of an error in its critical section, CreateAndApplyTransaction
	// will panic. An injected disk error can cause such a panic. We need to
	// recover from it and handle it as a normal faulty disk error within this
	// test.
	defer func() {
		r := recover()
		e, ok := r.(error)
		if ok && errors.Contains(e, dependencies.ErrDiskFault) {
			// we recovered from a panic caused by the faulty disk dependency
			err = dependencies.ErrDiskFault
		} else if r != nil {
			panic(r)
		}
	}()

	if len(updates) > 0 {
		err = rc.callCreateAndApplyTransaction(updates...)
	}
	return
}

// reloadRefCounter tries to reload the file. This simulates failures during recovery.
func reloadRefCounter(rcFilePath, walPath string, fdd *dependencies.DependencyFaultyDisk, tr *tracker, doneChan <-chan struct{}) (*refCounter, error) {
	// Try to reload the file. This simulates failures during recovery.
	for tries := 1; ; tries++ {
		select {
		case <-doneChan:
			return nil, errTestTimeout
		default:
		}

		// 20% chance to auto-recover
		if fastrand.Intn(100) < 20 {
			fdd.Reset()
		}

		// Every 10 tries, we reset the dependency to avoid getting stuck here.
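		// This guarantees forward progress even when the 20% auto-recovery
		// above never fires.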
		if tries%10 == 0 {
			fdd.Reset()
		}
		// Reload the wal from disk and apply unfinished txns
		newWal, err := loadWal(rcFilePath, walPath, fdd)
		if errors.Contains(err, dependencies.ErrDiskFault) {
			atomic.AddUint64(&tr.atomicNumRecoveries, 1)
			continue // try again
		} else if err != nil {
			// an actual error occurred, the test must fail
			return nil, err
		}
		// Reload the refcounter from disk
		newRc, err := loadRefCounter(rcFilePath, newWal)
		if err != nil {
			return nil, err
		}
		newRc.staticDeps = fdd
		tr.managedSetCrashed(false)
		return newRc, nil
	}
}

// validateDecrement is a helper method that ensures the counter is above 0.
// This allows us to avoid a counter underflow.
func validateDecrement(rc *refCounter, secNum uint64) error {
	n, err := rc.readCount(secNum)
	// Ignore errors coming from the dependency for this one
	if err != nil && !errors.Contains(err, dependencies.ErrDiskFault) {
		// If the error wasn't caused by the dependency, the test fails.
		return err
	}
	if n < 1 {
		return errors.New("Cannot decrement a zero counter")
	}
	return nil
}

// validateDropSectors is a helper method that ensures the number of sectors we
// want to drop does not exceed the number of sectors in the refcounter.
func validateDropSectors(rc *refCounter, secNum uint64) error {
	if rc.numSectors < secNum {
		return errors.New("Cannot drop more sectors than the total")
	}
	return nil
}

// validateIncrement is a helper method that ensures the counter has not
// reached its maximum value. This allows us to avoid a counter overflow.
func validateIncrement(rc *refCounter, secNum uint64) error {
	n, err := rc.readCount(secNum)
	// Ignore errors coming from the dependency for this one
	if err != nil && !errors.Contains(err, dependencies.ErrDiskFault) {
		// If the error wasn't caused by the dependency, the test fails.
		return err
	}
	if n == math.MaxUint16 {
		return errors.New("Cannot increment a counter at max value")
	}
	return nil
}

// validateStatusAfterAllTests does the final validation of the test by
// comparing the state of the refcounter against the tracker after all the test
// updates are applied.
func validateStatusAfterAllTests(rc *refCounter, tr *tracker) error {
	rc.mu.Lock()
	numSec := rc.numSectors
	rc.mu.Unlock()
	if numSec != uint64(len(tr.counts)) {
		return fmt.Errorf("Expected %d sectors, got %d\n", uint64(len(tr.counts)), numSec)
	}
	var errorList []error
	for i := uint64(0); i < numSec; i++ {
		n, err := rc.callCount(i)
		if err != nil {
			return errors.AddContext(err, "failed to read count")
		}
		tr.mu.Lock()
		if n != tr.counts[i] {
			errorList = append(errorList, fmt.Errorf("expected counter value for sector %d to be %d, got %d", i, tr.counts[i], n))
		}
		tr.mu.Unlock()
	}
	if len(errorList) > 0 {
		return errors.AddContext(errors.Compose(errorList...), "sector counter values do not match expectations")
	}
	return nil
}