github.com/djenriquez/nomad-1@v0.8.1/command/agent/consul/check_watcher_test.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"testing"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/api"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  // checkRestartRecord is used by a testFakeCtx to record when restarts occur
    14  // due to a watched check.
    15  type checkRestartRecord struct {
    16  	timestamp time.Time
    17  	source    string
    18  	reason    string
    19  	failure   bool
    20  }
    21  
    22  // fakeCheckRestarter is a test implementation of TaskRestarter.
    23  type fakeCheckRestarter struct {
    24  	// restarts is a slice of all of the restarts triggered by the checkWatcher
    25  	restarts []checkRestartRecord
    26  
    27  	// need the checkWatcher to re-Watch restarted tasks like TaskRunner
    28  	watcher *checkWatcher
    29  
    30  	// check to re-Watch on restarts
    31  	check     *structs.ServiceCheck
    32  	allocID   string
    33  	taskName  string
    34  	checkName string
    35  }
    36  
    37  // newFakeCheckRestart creates a new TaskRestarter. It needs all of the
    38  // parameters checkWatcher.Watch expects.
    39  func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
    40  	return &fakeCheckRestarter{
    41  		watcher:   w,
    42  		check:     c,
    43  		allocID:   allocID,
    44  		taskName:  taskName,
    45  		checkName: checkName,
    46  	}
    47  }
    48  
    49  // Restart implements part of the TaskRestarter interface needed for check
    50  // watching and is normally fulfilled by a TaskRunner.
    51  //
    52  // Restarts are recorded in the []restarts field and re-Watch the check.
    53  func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) {
    54  	c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure})
    55  
    56  	// Re-Watch the check just like TaskRunner
    57  	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
    58  }
    59  
    60  // String for debugging
    61  func (c *fakeCheckRestarter) String() string {
    62  	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
    63  	for _, r := range c.restarts {
    64  		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
    65  	}
    66  	return s
    67  }
    68  
    69  // checkResponse is a response returned by the fakeChecksAPI after the given
    70  // time.
    71  type checkResponse struct {
    72  	at     time.Time
    73  	id     string
    74  	status string
    75  }
    76  
    77  // fakeChecksAPI implements the Checks() method for testing Consul.
    78  type fakeChecksAPI struct {
    79  	// responses is a map of check ids to their status at a particular
    80  	// time. checkResponses must be in chronological order.
    81  	responses map[string][]checkResponse
    82  }
    83  
    84  func newFakeChecksAPI() *fakeChecksAPI {
    85  	return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
    86  }
    87  
    88  // add a new check status to Consul at the given time.
    89  func (c *fakeChecksAPI) add(id, status string, at time.Time) {
    90  	c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
    91  }
    92  
    93  func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) {
    94  	now := time.Now()
    95  	result := make(map[string]*api.AgentCheck, len(c.responses))
    96  
    97  	// Use the latest response for each check
    98  	for k, vs := range c.responses {
    99  		for _, v := range vs {
   100  			if v.at.After(now) {
   101  				break
   102  			}
   103  			result[k] = &api.AgentCheck{
   104  				CheckID: k,
   105  				Name:    k,
   106  				Status:  v.status,
   107  			}
   108  		}
   109  	}
   110  
   111  	return result, nil
   112  }
   113  
   114  // testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
   115  // logger and faster poll frequency.
   116  func testWatcherSetup() (*fakeChecksAPI, *checkWatcher) {
   117  	fakeAPI := newFakeChecksAPI()
   118  	cw := newCheckWatcher(testLogger(), fakeAPI)
   119  	cw.pollFreq = 10 * time.Millisecond
   120  	return fakeAPI, cw
   121  }
   122  
   123  func testCheck() *structs.ServiceCheck {
   124  	return &structs.ServiceCheck{
   125  		Name:     "testcheck",
   126  		Interval: 100 * time.Millisecond,
   127  		Timeout:  100 * time.Millisecond,
   128  		CheckRestart: &structs.CheckRestart{
   129  			Limit:          3,
   130  			Grace:          100 * time.Millisecond,
   131  			IgnoreWarnings: false,
   132  		},
   133  	}
   134  }
   135  
   136  // TestCheckWatcher_Skip asserts unwatched checks are ignored.
   137  func TestCheckWatcher_Skip(t *testing.T) {
   138  	t.Parallel()
   139  
   140  	// Create a check with restarting disabled
   141  	check := testCheck()
   142  	check.CheckRestart = nil
   143  
   144  	cw := newCheckWatcher(testLogger(), newFakeChecksAPI())
   145  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
   146  	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)
   147  
   148  	// Check should have been dropped as it's not watched
   149  	if n := len(cw.checkUpdateCh); n != 0 {
   150  		t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
   151  	}
   152  }
   153  
   154  // TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
   155  func TestCheckWatcher_Healthy(t *testing.T) {
   156  	t.Parallel()
   157  
   158  	fakeAPI, cw := testWatcherSetup()
   159  
   160  	check1 := testCheck()
   161  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   162  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   163  
   164  	check2 := testCheck()
   165  	check2.CheckRestart.Limit = 1
   166  	check2.CheckRestart.Grace = 0
   167  	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
   168  	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)
   169  
   170  	// Make both checks healthy from the beginning
   171  	fakeAPI.add("testcheck1", "passing", time.Time{})
   172  	fakeAPI.add("testcheck2", "passing", time.Time{})
   173  
   174  	// Run
   175  	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
   176  	defer cancel()
   177  	cw.Run(ctx)
   178  
   179  	// Ensure restart was never called
   180  	if n := len(restarter1.restarts); n > 0 {
   181  		t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
   182  	}
   183  	if n := len(restarter2.restarts); n > 0 {
   184  		t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
   185  	}
   186  }
   187  
   188  // TestCheckWatcher_HealthyWarning asserts checks in warning with
   189  // ignore_warnings=true do not restart tasks.
   190  func TestCheckWatcher_HealthyWarning(t *testing.T) {
   191  	t.Parallel()
   192  
   193  	fakeAPI, cw := testWatcherSetup()
   194  
   195  	check1 := testCheck()
   196  	check1.CheckRestart.Limit = 1
   197  	check1.CheckRestart.Grace = 0
   198  	check1.CheckRestart.IgnoreWarnings = true
   199  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   200  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   201  
   202  	// Check is always in warning but that's ok
   203  	fakeAPI.add("testcheck1", "warning", time.Time{})
   204  
   205  	// Run
   206  	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
   207  	defer cancel()
   208  	cw.Run(ctx)
   209  
   210  	// Ensure restart was never called on check 1
   211  	if n := len(restarter1.restarts); n > 0 {
   212  		t.Errorf("expected check 1 to not be restarted but found %d", n)
   213  	}
   214  }
   215  
   216  // TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
   217  // before the unhealthy limit is reached do not restart tasks.
   218  func TestCheckWatcher_Flapping(t *testing.T) {
   219  	t.Parallel()
   220  
   221  	fakeAPI, cw := testWatcherSetup()
   222  
   223  	check1 := testCheck()
   224  	check1.CheckRestart.Grace = 0
   225  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   226  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   227  
   228  	// Check flaps and is never failing for the full 200ms needed to restart
   229  	now := time.Now()
   230  	fakeAPI.add("testcheck1", "passing", now)
   231  	fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
   232  	fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
   233  	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
   234  	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))
   235  
   236  	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
   237  	defer cancel()
   238  	cw.Run(ctx)
   239  
   240  	// Ensure restart was never called on check 1
   241  	if n := len(restarter1.restarts); n > 0 {
   242  		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
   243  	}
   244  }
   245  
   246  // TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
   247  func TestCheckWatcher_Unwatch(t *testing.T) {
   248  	t.Parallel()
   249  
   250  	fakeAPI, cw := testWatcherSetup()
   251  
   252  	// Unwatch immediately
   253  	check1 := testCheck()
   254  	check1.CheckRestart.Limit = 1
   255  	check1.CheckRestart.Grace = 100 * time.Millisecond
   256  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   257  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   258  	cw.Unwatch("testcheck1")
   259  
   260  	// Always failing
   261  	fakeAPI.add("testcheck1", "critical", time.Time{})
   262  
   263  	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
   264  	defer cancel()
   265  	cw.Run(ctx)
   266  
   267  	// Ensure restart was never called on check 1
   268  	if n := len(restarter1.restarts); n > 0 {
   269  		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
   270  	}
   271  }
   272  
   273  // TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
   274  // for a single task, all checks should be removed when any of them restart the
   275  // task to avoid multiple restarts.
   276  func TestCheckWatcher_MultipleChecks(t *testing.T) {
   277  	t.Parallel()
   278  
   279  	fakeAPI, cw := testWatcherSetup()
   280  
   281  	check1 := testCheck()
   282  	check1.CheckRestart.Limit = 1
   283  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   284  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   285  
   286  	check2 := testCheck()
   287  	check2.CheckRestart.Limit = 1
   288  	restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
   289  	cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)
   290  
   291  	check3 := testCheck()
   292  	check3.CheckRestart.Limit = 1
   293  	restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
   294  	cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)
   295  
   296  	// check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart
   297  	now := time.Now()
   298  	fakeAPI.add("testcheck1", "critical", now)
   299  	fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
   300  	fakeAPI.add("testcheck2", "critical", now)
   301  	fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
   302  	fakeAPI.add("testcheck3", "passing", time.Time{})
   303  
   304  	// Run
   305  	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
   306  	defer cancel()
   307  	cw.Run(ctx)
   308  
   309  	// Ensure that restart was only called once on check 1 or 2. Since
   310  	// checks are in a map it's random which check triggers the restart
   311  	// first.
   312  	if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
   313  		t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
   314  			n, restarter1, restarter2)
   315  	}
   316  
   317  	if n := len(restarter3.restarts); n != 0 {
   318  		t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
   319  	}
   320  }