gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobgeneric_test.go (about) 1 package renter 2 3 import ( 4 "reflect" 5 "runtime" 6 "runtime/debug" 7 "sync" 8 "testing" 9 "time" 10 11 "gitlab.com/SkynetLabs/skyd/build" 12 "golang.org/x/net/context" 13 14 "gitlab.com/NebulousLabs/errors" 15 ) 16 17 // jobTest is a minimum viable implementation for a worker job. It most 18 // importantly needs a channel that it can send the result of its work down, so 19 // the caller can see how the job panned out. Technically this is not actually 20 // necessary, but most jobs will need to communicate some result to the caller. 21 // 22 // There are also some variables for tracking whether the job has been executed 23 // or discarded, these are for testing purposes and not actually part of a 24 // minimum viable job. 25 type jobTest struct { 26 // jobGeneric implements a lot of the boilerplate job code for us. 27 jobGeneric 28 29 // When a job completes it will send a result down the resultChan. 30 resultChan chan *jobTestResult 31 32 // When a job is executed the time is registered in this field. 33 executeTime time.Time 34 35 // These are variables for tracking the execution status of the job, they 36 // are only used for testing. 'staticShouldFail' tells the execution function 37 // whether the job should simulate a success or a failure. 38 staticShouldFail bool 39 discarded bool 40 executed bool 41 mu sync.Mutex 42 } 43 44 // jobTestResult is a minimum viable implementation for a worker job result. 45 type jobTestResult struct { 46 // Generally a caller minimally needs to know if there was an error. Often 47 // the caller will also be expecting some result such as a piece of data. 48 staticErr error 49 } 50 51 // jobTestMetadata is a test struct that represents test job metadata. 52 type jobTestMetadata struct { 53 staticField bool 54 } 55 56 // sendResult will send the result of a job down the resultChan. Note that 57 // sending the result should be done in a goroutine so that the worker does not 58 // get blocked if nobody is listening on the resultChan. Note that also the 59 // resultChan should generally be created as a buffered channel with enough 60 // result slots that this should never block, but defensive programming suggests 61 // that we should implement precautions on both ends. 62 func (j *jobTest) sendResult(result *jobTestResult) { 63 w := j.staticQueue.staticWorker() 64 err := w.staticTG.Launch(func() { 65 select { 66 case j.resultChan <- result: 67 case <-w.staticTG.StopChan(): 68 case <-j.staticCtx.Done(): 69 } 70 }) 71 if err != nil { 72 panic(err) 73 } 74 } 75 76 // callDiscard expires the job. This typically requires telling the caller that 77 // the job has failed. 78 func (j *jobTest) callDiscard(err error) { 79 // Send a failed result to the caller. 80 result := &jobTestResult{ 81 staticErr: errors.AddContext(err, "test job is being discarded"), 82 } 83 j.sendResult(result) 84 85 // Mark 'j.discarded' as true so that we can verify in the test that this 86 // function is being called. Do a sanity check that the job is only being 87 // discarded once. 88 j.mu.Lock() 89 if j.discarded { 90 build.Critical("double discard on job") 91 } 92 j.discarded = true 93 j.mu.Unlock() 94 } 95 96 // callExecute will mark the job as executed. 97 func (j *jobTest) callExecute() (err error) { 98 j.mu.Lock() 99 j.executed = true 100 j.executeTime = time.Now() 101 staticShouldFail := j.staticShouldFail 102 j.mu.Unlock() 103 104 // Need to report a success if the job succeeded, and a fail otherwise. 105 if staticShouldFail { 106 j.mu.Lock() 107 executeTime := j.executeTime 108 j.mu.Unlock() 109 110 err = errors.New("job is simulated to have failed") 111 j.staticQueue.callReportFailure(err, executeTime, time.Now()) 112 } else { 113 j.staticQueue.callReportSuccess() 114 } 115 116 // Send the error the caller. 117 result := &jobTestResult{ 118 staticErr: err, 119 } 120 j.sendResult(result) 121 122 return 123 } 124 125 // callExpectedBandwidth returns the amount of bandwidth this job is expected to 126 // consume. 127 func (j *jobTest) callExpectedBandwidth() (ul, dl uint64) { 128 return 0, 0 129 } 130 131 // TestWorkerJobGeneric tests that all of the code for the generic worker job is 132 // functioning correctly. 133 func TestWorkerJobGeneric(t *testing.T) { 134 if testing.Short() { 135 t.SkipNow() 136 } 137 138 // Create a job queue. 139 w := new(worker) 140 w.staticRenter = new(Renter) 141 jq := newJobGenericQueue(w) 142 cancelCtx, cancel := context.WithCancel(context.Background()) 143 144 // Create a job, add the job to the queue, and then ensure that the 145 // cancelation is working correctly. 146 resultChan := make(chan *jobTestResult, 1) 147 j := &jobTest{ 148 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 149 150 resultChan: resultChan, 151 } 152 if j.staticCanceled() { 153 t.Error("job should not be canceled yet") 154 } 155 if !jq.callAdd(j) { 156 t.Fatal("call to add job to new job queue should succeed") 157 } 158 cancel() 159 job := jq.callNext() 160 if job != nil { 161 t.Error("queue should not be returning canceled jobs") 162 } 163 if !j.staticCanceled() { 164 t.Error("job should be reporting itself as canceled") 165 } 166 j.mu.Lock() 167 discarded := j.discarded 168 executed := j.executed 169 j.mu.Unlock() 170 if !discarded || executed { 171 t.Error("job should not have executed but discarded") 172 } 173 // NOTE: the job is not expected to send a result when it has been 174 // explicitly canceled. Check that no result was sent. 175 select { 176 case <-resultChan: 177 t.Error("there should not be any result after a job was canceled successfully") 178 default: 179 } 180 // NOTE: a job being canceled is not considered to be an error, the queue 181 // will not go on cooldown. Next job should be able to succeed without any 182 // sort of waiting for a cooldown. 183 184 // Create two new jobs, add them to the queue, and then simulate the work 185 // loop executing the jobs. 186 cancelCtx, cancel = context.WithCancel(context.Background()) 187 resultChan = make(chan *jobTestResult, 1) 188 j = &jobTest{ 189 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 190 191 resultChan: resultChan, 192 } 193 if !jq.callAdd(j) { 194 t.Fatal("call to add job to new job queue should succeed") 195 } 196 // Add a second job to the queue to check that the queue function is working 197 // correctly. 198 cancelCtx2, _ := context.WithCancel(context.Background()) 199 resultChan2 := make(chan *jobTestResult, 1) 200 j2 := &jobTest{ 201 jobGeneric: newJobGeneric(cancelCtx2, jq, nil), 202 203 resultChan: resultChan2, 204 } 205 if !jq.callAdd(j2) { 206 t.Fatal("call to add job to new job queue should succeed") 207 } 208 job = jq.callNext() 209 if job == nil { 210 t.Fatal("call to grab the next job failed, there should be a job ready in the queue") 211 } 212 // Simulate a successful execution by the control loop. 213 job.callExecute() 214 // There should be one more job in the queue. 215 job = jq.callNext() 216 if job == nil { 217 t.Fatal("call to grab the next job failed, there should be a job ready in the queue") 218 } 219 job.callExecute() 220 // Queue should be empty now. 221 job = jq.callNext() 222 if job != nil { 223 t.Fatal("job queue should be empty") 224 } 225 // jobs should be marked as executed, and should not be marked as discarded. 226 j.mu.Lock() 227 if !j.executed || j.discarded { 228 t.Error("job state indicates that the wrong code ran") 229 } 230 j.mu.Unlock() 231 j2.mu.Lock() 232 if !j2.executed || j2.discarded { 233 t.Error("job state indicates that the wrong code ran") 234 } 235 j2.mu.Unlock() 236 // There should be a result with no error in the result chan. 237 select { 238 case res := <-resultChan: 239 if res == nil || res.staticErr != nil { 240 t.Error("there should be a result with a nil error") 241 } 242 case <-time.After(time.Second * 3): 243 t.Error("there should be a result") 244 } 245 select { 246 case res := <-resultChan2: 247 if res == nil || res.staticErr != nil { 248 t.Error("there should be a result with a nil error") 249 } 250 case <-time.After(time.Second * 3): 251 t.Error("there should be a result") 252 } 253 254 // Create several jobs and add them to the queue. Have the first job fail, 255 // this should result in the worker going on cooldown and cause all of the 256 // rest of the jobs to fail as well. 257 cancelCtx, cancel = context.WithCancel(context.Background()) 258 resultChan = make(chan *jobTestResult, 1) 259 j = &jobTest{ 260 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 261 262 resultChan: resultChan, 263 264 // Set staticShouldFail to true, so the execution knows to fail the job. 265 staticShouldFail: true, 266 } 267 if !jq.callAdd(j) { 268 t.Fatal("call to add job to new job queue should succeed") 269 } 270 cancelCtx2, _ = context.WithCancel(context.Background()) 271 resultChan2 = make(chan *jobTestResult, 1) 272 j2 = &jobTest{ 273 jobGeneric: newJobGeneric(cancelCtx2, jq, nil), 274 275 resultChan: resultChan2, 276 } 277 if !jq.callAdd(j2) { 278 t.Fatal("call to add job to new job queue should succeed") 279 } 280 cancelCtx3, _ := context.WithCancel(context.Background()) 281 resultChan3 := make(chan *jobTestResult, 1) 282 j3 := &jobTest{ 283 jobGeneric: newJobGeneric(cancelCtx3, jq, nil), 284 285 resultChan: resultChan3, 286 } 287 if !jq.callAdd(j3) { 288 t.Fatal("call to add job to new job queue should succeed") 289 } 290 // Simulate execution of the first job, this should fail. 291 job = jq.callNext() 292 if job == nil { 293 t.Fatal("there should be a job in the queue") 294 } 295 job.callExecute() 296 // Queue should be empty now and the other jobs should be discarded. 297 job = jq.callNext() 298 if job != nil { 299 t.Error("there should be no more jobs in the queue") 300 } 301 // j should be marked as executed, the others should be marked as discarded. 302 j.mu.Lock() 303 if !j.executed || j.discarded { 304 t.Error("j indicates wrong execution path") 305 } 306 j.mu.Unlock() 307 j2.mu.Lock() 308 if j2.executed || !j2.discarded { 309 t.Error("j2 indicates wrong execution path") 310 } 311 j2.mu.Unlock() 312 j3.mu.Lock() 313 if j3.executed || !j3.discarded { 314 t.Error("j3 indicates wrong execution path") 315 } 316 j3.mu.Unlock() 317 // All three jobs should be giving out errors on their resultChans. 318 select { 319 case res := <-resultChan: 320 if res == nil || res.staticErr == nil { 321 t.Error("there should be a result with an error") 322 } 323 case <-time.After(time.Second * 3): 324 t.Error("there should be a result") 325 } 326 select { 327 case res := <-resultChan2: 328 if res == nil || res.staticErr == nil { 329 t.Error("there should be a result with an error") 330 } 331 case <-time.After(time.Second * 3): 332 t.Error("there should be a result") 333 } 334 select { 335 case res := <-resultChan3: 336 if res == nil || res.staticErr == nil { 337 t.Error("there should be a result with an error") 338 } 339 case <-time.After(time.Second * 3): 340 t.Error("there should be a result") 341 } 342 // Check the recentErr and consecutive failures field of the generic job, 343 // they should be set since there was a failure. 344 jq.mu.Lock() 345 if jq.recentErr == nil { 346 t.Error("the recentErr field should be set since there was a failure") 347 } 348 if jq.consecutiveFailures != 1 { 349 t.Error("job queue should be reporting consecutive failures") 350 } 351 cu := jq.cooldownUntil 352 jq.mu.Unlock() 353 354 // Check the queue is on a cooldown 355 if !jq.callOnCooldown() { 356 t.Error("queue should be on cooldown") 357 } 358 359 // The queue should be on cooldown now, adding a new job should fail. 360 cancelCtx, cancel = context.WithCancel(context.Background()) 361 resultChan = make(chan *jobTestResult, 1) 362 j = &jobTest{ 363 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 364 365 resultChan: resultChan, 366 367 // Set staticShouldFail to true, so the execution knows to fail the job. 368 staticShouldFail: true, 369 } 370 if jq.callAdd(j) { 371 t.Fatal("job queue should be on cooldown") 372 } 373 // Sleep until the cooldown has ended. 374 time.Sleep(time.Until(cu)) 375 // Try adding the job again, this time adding the job should succeed. 376 if !jq.callAdd(j) { 377 t.Fatal("job queue should be off cooldown now") 378 } 379 // Execute the job, which should cause a failure and more cooldown. 380 job = jq.callNext() 381 if job == nil { 382 t.Fatal("there should be a job") 383 } 384 job.callExecute() 385 // Drain the result of the job, make sure it's an error. 386 select { 387 case res := <-resultChan: 388 if res == nil || res.staticErr == nil { 389 t.Error("there should be a result with an error") 390 } 391 case <-time.After(time.Second * 3): 392 t.Error("there should be a result") 393 } 394 // Check the job execution status. 395 j.mu.Lock() 396 if !j.executed || j.discarded { 397 t.Error("j has wrong execution flags") 398 } 399 j.mu.Unlock() 400 // Check the queue cooldown status. 401 jq.mu.Lock() 402 if jq.recentErr == nil { 403 t.Error("the recentErr field should be set since there was a failure") 404 } 405 if jq.consecutiveFailures != 2 { 406 t.Error("job queue should be reporting consecutive failures") 407 } 408 cu = jq.cooldownUntil 409 jq.mu.Unlock() 410 // Sleep off the cooldown. 411 time.Sleep(time.Until(cu)) 412 413 // Check the cooldown status 414 if jq.callOnCooldown() { 415 t.Error("queue should not be on cooldown") 416 } 417 418 // Add a job with metadata to the queue 419 j5 := &jobTest{ 420 jobGeneric: newJobGeneric(context.Background(), jq, jobTestMetadata{ 421 staticField: true, 422 }), 423 resultChan: make(chan *jobTestResult, 1), 424 } 425 if !jq.callAdd(j5) { 426 t.Fatal("call to add job to new job queue should succeed") 427 } 428 job = jq.callNext() 429 if job == nil { 430 t.Fatal("call to grab the next job failed, there should be a job ready in the queue") 431 } 432 meta, ok := job.staticGetMetadata().(jobTestMetadata) 433 if !ok { 434 t.Fatal("expected job metadata to be present on the job", ok, job.staticGetMetadata()) 435 } 436 if !reflect.DeepEqual(meta, jobTestMetadata{ 437 staticField: true, 438 }) { 439 t.Fatal("unexpected metadata") 440 } 441 442 // Add one more job, and check that killing the queue kills the job. 443 cancelCtx, cancel = context.WithCancel(context.Background()) 444 resultChan = make(chan *jobTestResult, 1) 445 j = &jobTest{ 446 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 447 448 resultChan: resultChan, 449 } 450 if !jq.callAdd(j) { 451 t.Fatal("job queue should be off cooldown now") 452 } 453 454 // Kill the queue. 455 jq.callKill() 456 job = jq.callNext() 457 if job != nil { 458 t.Fatal("after killing the queue, there should be no more jobs") 459 } 460 // Check that the job result is an error. 461 select { 462 case res := <-resultChan: 463 if res == nil || res.staticErr == nil { 464 t.Error("there should be a result with an error") 465 } 466 case <-time.After(time.Second * 3): 467 t.Error("there should be a result") 468 } 469 // Check the job execution status. 470 j.mu.Lock() 471 if j.executed || !j.discarded { 472 t.Error("j has wrong execution flags") 473 } 474 j.mu.Unlock() 475 476 // Try adding a new job, this should fail because the queue was killed. 477 cancelCtx, cancel = context.WithCancel(context.Background()) 478 resultChan = make(chan *jobTestResult, 1) 479 j = &jobTest{ 480 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 481 482 resultChan: resultChan, 483 } 484 if jq.callAdd(j) { 485 t.Fatal("should not be able to add jobs after the queue has been killed") 486 } 487 } 488 489 // TestQueueMemoryLeak makes sure that adding jobs to a queue in a tight loop 490 // won't cause too many allocated objects in memory. 491 func TestQueueMemoryLeak(t *testing.T) { 492 if testing.Short() { 493 t.SkipNow() 494 } 495 t.Parallel() 496 497 // Create queue. 498 w := new(worker) 499 w.staticRenter = new(Renter) 500 jq := newJobGenericQueue(w) 501 502 // Prepare a job. 503 cancelCtx, cancel := context.WithCancel(context.Background()) 504 defer cancel() 505 resultChan := make(chan *jobTestResult, 1) 506 j := &jobTest{ 507 jobGeneric: newJobGeneric(cancelCtx, jq, nil), 508 resultChan: resultChan, 509 } 510 511 // Add the job 1 million times and remove it again. 512 n := 1000000 513 for i := 0; i < n; i++ { 514 if !jq.callAdd(j) { 515 t.Fatal("failed to add job") 516 } 517 jq.callNext() 518 } 519 520 // Get the memory stats and print them. 521 var ms runtime.MemStats 522 runtime.ReadMemStats(&ms) 523 t.Log("before gc", ms.HeapObjects, ms.HeapAlloc) 524 525 // Less than 250k objects should be allocated. 526 // NOTE: This number was chosen after manually testing and printing the 527 // stats. During testing it turned out that running the loop above 1 528 // million times would cause the number of objects to be <250k vs 250+k 529 // with the old code. 530 if ms.HeapObjects > 250000 { 531 t.Fatal("Too many allocated objects", ms.HeapObjects) 532 } 533 534 // Free memory. 535 debug.FreeOSMemory() 536 537 // Print the stats again. 538 runtime.ReadMemStats(&ms) 539 t.Log("after gc", ms.HeapObjects, ms.HeapAlloc) 540 } 541 542 // TestJobGenericQueue is a collection of unit tests that cover the 543 // functionality of the JobGenericQueue 544 func TestJobGenericQueue(t *testing.T) { 545 t.Parallel() 546 547 t.Run("callCooldownStatus", testCallCooldownStatus) 548 t.Run("callReportFailure", testCallReportFailure) 549 } 550 551 // testCallCooldownStatus is a unit test that covers the method 552 // callCooldownStatus on the JobGenericQueue 553 func testCallCooldownStatus(t *testing.T) { 554 w := new(worker) 555 q := newJobGenericQueue(w) 556 557 // base case 558 onCD, killed, numJobs, onCDFor, cdErr := q.callCooldownStatus() 559 if onCD { 560 t.Fatal("unexpected") 561 } 562 if killed { 563 t.Fatal("unexpected") 564 } 565 if numJobs != 0 { 566 t.Fatal("unexpected") 567 } 568 if onCDFor != 0 { 569 t.Fatal("unexpected") 570 } 571 if cdErr != "" { 572 t.Fatal("unexpected") 573 } 574 575 // jobs|killed case 576 q.callAdd(&jobTest{}) 577 q.killed = true 578 579 _, killed, numJobs, _, _ = q.callCooldownStatus() 580 if !killed { 581 t.Fatal("unexpected") 582 } 583 if numJobs != 1 { 584 t.Fatal("unexpected", numJobs) 585 } 586 587 q.cooldownUntil = time.Now().Add(time.Minute) 588 q.recentErr = errors.New("foo") 589 590 // cooldown case 591 onCD, _, _, onCDFor, cdErr = q.callCooldownStatus() 592 if !onCD { 593 t.Fatal("unexpected") 594 } 595 if onCDFor.Round(time.Second) != time.Minute { 596 t.Fatal("unexpected") 597 } 598 if cdErr != "foo" { 599 t.Fatal("unexpected") 600 } 601 } 602 603 // testCallReportFailure is a unit test that covers the method callReportFailure 604 // on the JobGenericQueue 605 func testCallReportFailure(t *testing.T) { 606 w := new(worker) 607 q := newJobGenericQueue(w) 608 err := errors.New("job failure") 609 610 // assert initial state 611 if q.consecutiveFailures != 0 || !q.recentErrTime.IsZero() || !q.firstFailureTime.IsZero() { 612 t.Fatal("unexpected") 613 } 614 615 // prepare some timings 616 // 617 // | - - - - - - - - - - - - - - - - - - - - - - J1 - - J1X - - - - - - - - 618 // | - - - - - - - - - - - - - - - - - - - - - - - - J2 - - - - J2X - - - - 619 // | - - - - - - - - - - - - - - - - - - - - - - - - J3 - - J3X - - - - - - 620 // | - - - - - - - - - - - - - - - - - - - - - - - - - - - J4 J4Y - - - - - 621 // | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - J5 J5X 622 // 623 // the idea is to mimic parallel job execution where some jobs don't 624 // increment the consecutive failures, while others will if they executed 625 // after the last fail time or if a job executed successfully in between 626 627 startJ1 := time.Now().Add(-11 * time.Minute) 628 endJ1 := time.Now().Add(-8 * time.Minute) 629 630 startJ2 := time.Now().Add(-9 * time.Minute) 631 endJ2 := time.Now().Add(-4 * time.Minute) 632 633 startJ3 := time.Now().Add(-9 * time.Minute) 634 endJ3 := time.Now().Add(-6 * time.Minute) 635 636 startJ5 := time.Now().Add(-1 * time.Minute) 637 endJ5 := time.Now() 638 639 // report job outcome for J1 640 q.callReportFailure(err, startJ1, endJ1) 641 642 // assert updated state 643 if q.consecutiveFailures != 1 || q.recentErrTime.IsZero() || !q.firstFailureTime.Equal(endJ1) { 644 t.Fatal("unexpected") 645 } 646 647 // report job outcome for J3 648 q.callReportFailure(err, startJ3, endJ3) 649 650 // assert updated state 651 if q.consecutiveFailures != 1 || !q.firstFailureTime.Equal(endJ1) { 652 t.Fatal("unexpected") 653 } 654 655 // report job outcome for J4 656 q.callReportSuccess() 657 658 // assert updated state 659 if q.consecutiveFailures != 0 || !q.firstFailureTime.IsZero() { 660 t.Fatal("unexpected") 661 } 662 663 // report job outcome for J2 664 q.callReportFailure(err, startJ2, endJ2) 665 666 // assert updated state 667 if q.consecutiveFailures != 1 || !q.firstFailureTime.Equal(endJ2) { 668 t.Fatal("unexpected") 669 } 670 671 // report job outcome for J5 672 q.callReportFailure(err, startJ5, endJ5) 673 if q.consecutiveFailures != 2 || !q.firstFailureTime.Equal(endJ5) { 674 t.Fatal("unexpected", q.consecutiveFailures) 675 } 676 }