github.com/smintz/nomad@v0.8.3/command/agent/consul/check_watcher_test.go

package consul

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/nomad/structs"
)

// checkRestartRecord is used by a fakeCheckRestarter to record when restarts
// occur due to a watched check.
type checkRestartRecord struct {
	timestamp time.Time
	source    string
	reason    string
	failure   bool
}

// fakeCheckRestarter is a test implementation of TaskRestarter.
type fakeCheckRestarter struct {
	// restarts is a slice of all of the restarts triggered by the checkWatcher
	restarts []checkRestartRecord

	// watcher is needed to re-Watch restarted tasks just as TaskRunner does
	watcher *checkWatcher

	// check to re-Watch on restarts
	check     *structs.ServiceCheck
	allocID   string
	taskName  string
	checkName string
}

// newFakeCheckRestarter creates a new fakeCheckRestarter. It needs all of the
// parameters checkWatcher.Watch expects.
func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
	return &fakeCheckRestarter{
		watcher:   w,
		check:     c,
		allocID:   allocID,
		taskName:  taskName,
		checkName: checkName,
	}
}

// Restart implements part of the TaskRestarter interface needed for check
// watching and is normally fulfilled by a TaskRunner.
//
// Restarts are recorded in the restarts field and the check is re-Watched,
// just like TaskRunner would do.
func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) {
	c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure})

	// Re-Watch the check just like TaskRunner
	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
}

// String returns a debug-friendly summary of all recorded restarts.
func (c *fakeCheckRestarter) String() string {
	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
	for _, r := range c.restarts {
		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
	}
	return s
}

// checkResponse is a response returned by the fakeChecksAPI after the given
// time.
type checkResponse struct {
	at     time.Time
	id     string
	status string
}

// fakeChecksAPI is a fake implementation of the Consul Checks() API used for
// testing the checkWatcher.
type fakeChecksAPI struct {
	// responses is a map of check ids to their status at a particular
	// time. Responses must be in chronological order.
	responses map[string][]checkResponse
}

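// newFakeChecksAPI creates a fakeChecksAPI with no check responses registered.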
func newFakeChecksAPI() *fakeChecksAPI {
	return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
}

// add a check status response that takes effect at the given time.
func (c *fakeChecksAPI) add(id, status string, at time.Time) {
	c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
}

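// Checks returns the most recent status for each check as of the current
// time, mimicking the Consul agent's Checks API.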
func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) {
	now := time.Now()
	result := make(map[string]*api.AgentCheck, len(c.responses))

	// Use the latest response at or before now for each check; responses are
	// in chronological order, so stop at the first future response.
	for k, vs := range c.responses {
		for _, v := range vs {
			if v.at.After(now) {
				break
			}
			result[k] = &api.AgentCheck{
				CheckID: k,
				Name:    k,
				Status:  v.status,
			}
		}
	}

	return result, nil
}

// testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
// logger and faster poll frequency.
func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) {
	fakeAPI := newFakeChecksAPI()
	cw := newCheckWatcher(testlog.Logger(t), fakeAPI)
	cw.pollFreq = 10 * time.Millisecond
	return fakeAPI, cw
}

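// testCheck returns a ServiceCheck with a short interval and a restart policy
// suitable for these tests.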
func testCheck() *structs.ServiceCheck {
	return &structs.ServiceCheck{
		Name:     "testcheck",
		Interval: 100 * time.Millisecond,
		Timeout:  100 * time.Millisecond,
		CheckRestart: &structs.CheckRestart{
			Limit:          3,
			Grace:          100 * time.Millisecond,
			IgnoreWarnings: false,
		},
	}
}

// TestCheckWatcher_Skip asserts unwatched checks are ignored.
func TestCheckWatcher_Skip(t *testing.T) {
	t.Parallel()

	// Create a check with restarting disabled
	check := testCheck()
	check.CheckRestart = nil

	cw := newCheckWatcher(testlog.Logger(t), newFakeChecksAPI())
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)

	// Check should have been dropped as it's not watched
	if n := len(cw.checkUpdateCh); n != 0 {
		t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
	}
}

// TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
func TestCheckWatcher_Healthy(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	check2.CheckRestart.Grace = 0
	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)

	// Make both checks healthy from the beginning
	fakeAPI.add("testcheck1", "passing", time.Time{})
	fakeAPI.add("testcheck2", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
	}
	if n := len(restarter2.restarts); n > 0 {
		t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
	}
}

// TestCheckWatcher_HealthyWarning asserts checks in warning with
// ignore_warnings=true do not restart tasks.
func TestCheckWatcher_HealthyWarning(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 0
	check1.CheckRestart.IgnoreWarnings = true
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check is always in warning but that's ok
	fakeAPI.add("testcheck1", "warning", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d", n)
	}
}

// TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
// before the unhealthy limit is reached do not restart tasks.
func TestCheckWatcher_Flapping(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Grace = 0
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check flaps and is never failing for the full 200ms needed to restart
	now := time.Now()
	fakeAPI.add("testcheck1", "passing", now)
	fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))

	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}

// TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
func TestCheckWatcher_Unwatch(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	// Unwatch immediately
	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 100 * time.Millisecond
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
	cw.Unwatch("testcheck1")

	// Always failing
	fakeAPI.add("testcheck1", "critical", time.Time{})

	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}

// TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
// for a single task, all checks should be removed when any of them restart the
// task to avoid multiple restarts.
func TestCheckWatcher_MultipleChecks(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
	cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)

	check3 := testCheck()
	check3.CheckRestart.Limit = 1
	restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
	cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)

	// checks 1 & 2 fail long enough to trigger a restart, but only 1 restart
	// should occur
	now := time.Now()
	fakeAPI.add("testcheck1", "critical", now)
	fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck2", "critical", now)
	fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck3", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure that restart was only called once on check 1 or 2. Since
	// checks are in a map it's random which check triggers the restart
	// first.
	if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
		t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
			n, restarter1, restarter2)
	}

	if n := len(restarter3.restarts); n != 0 {
		t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
	}
}