github.com/bigcommerce/nomad@v0.9.3-bc/command/agent/consul/check_watcher_test.go

github.com/bigcommerce/nomad@v0.9.3-bc/command/agent/consul/check_watcher_test.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"testing"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/api"
    10  	"github.com/hashicorp/nomad/helper/testlog"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
    14  // checkRestartRecord is used by a testFakeCtx to record when restarts occur
    15  // due to a watched check.
    16  type checkRestartRecord struct {
    17  	timestamp time.Time
    18  	source    string
    19  	reason    string
    20  	failure   bool
    21  }
    22  
    23  // fakeCheckRestarter is a test implementation of TaskRestarter.
    24  type fakeCheckRestarter struct {
    25  	// restarts is a slice of all of the restarts triggered by the checkWatcher
    26  	restarts []checkRestartRecord
    27  
    28  	// need the checkWatcher to re-Watch restarted tasks like TaskRunner
    29  	watcher *checkWatcher
    30  
    31  	// check to re-Watch on restarts
    32  	check     *structs.ServiceCheck
    33  	allocID   string
    34  	taskName  string
    35  	checkName string
    36  }
    37  
    38  // newFakeCheckRestart creates a new TaskRestarter. It needs all of the
    39  // parameters checkWatcher.Watch expects.
    40  func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
    41  	return &fakeCheckRestarter{
    42  		watcher:   w,
    43  		check:     c,
    44  		allocID:   allocID,
    45  		taskName:  taskName,
    46  		checkName: checkName,
    47  	}
    48  }
    49  
    50  // Restart implements part of the TaskRestarter interface needed for check
    51  // watching and is normally fulfilled by a TaskRunner.
    52  //
    53  // Restarts are recorded in the []restarts field and re-Watch the check.
    54  //func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) {
    55  func (c *fakeCheckRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
    56  	restart := checkRestartRecord{
    57  		timestamp: time.Now(),
    58  		source:    event.Type,
    59  		reason:    event.DisplayMessage,
    60  		failure:   failure,
    61  	}
    62  	c.restarts = append(c.restarts, restart)
    63  
    64  	// Re-Watch the check just like TaskRunner
    65  	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
    66  	return nil
    67  }
    68  
    69  // String for debugging
    70  func (c *fakeCheckRestarter) String() string {
    71  	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
    72  	for _, r := range c.restarts {
    73  		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
    74  	}
    75  	return s
    76  }
    77  
    78  // checkResponse is a response returned by the fakeChecksAPI after the given
    79  // time.
    80  type checkResponse struct {
    81  	at     time.Time
    82  	id     string
    83  	status string
    84  }
    85  
    86  // fakeChecksAPI implements the Checks() method for testing Consul.
    87  type fakeChecksAPI struct {
    88  	// responses is a map of check ids to their status at a particular
    89  	// time. checkResponses must be in chronological order.
    90  	responses map[string][]checkResponse
    91  }
    92  
    93  func newFakeChecksAPI() *fakeChecksAPI {
    94  	return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
    95  }
    96  
    97  // add a new check status to Consul at the given time.
    98  func (c *fakeChecksAPI) add(id, status string, at time.Time) {
    99  	c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
   100  }
   101  
   102  func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) {
   103  	now := time.Now()
   104  	result := make(map[string]*api.AgentCheck, len(c.responses))
   105  
   106  	// Use the latest response for each check
   107  	for k, vs := range c.responses {
   108  		for _, v := range vs {
   109  			if v.at.After(now) {
   110  				break
   111  			}
   112  			result[k] = &api.AgentCheck{
   113  				CheckID: k,
   114  				Name:    k,
   115  				Status:  v.status,
   116  			}
   117  		}
   118  	}
   119  
   120  	return result, nil
   121  }
   122  
   123  // testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
   124  // logger and faster poll frequency.
   125  func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) {
   126  	fakeAPI := newFakeChecksAPI()
   127  	cw := newCheckWatcher(testlog.HCLogger(t), fakeAPI)
   128  	cw.pollFreq = 10 * time.Millisecond
   129  	return fakeAPI, cw
   130  }
   131  
   132  func testCheck() *structs.ServiceCheck {
   133  	return &structs.ServiceCheck{
   134  		Name:     "testcheck",
   135  		Interval: 100 * time.Millisecond,
   136  		Timeout:  100 * time.Millisecond,
   137  		CheckRestart: &structs.CheckRestart{
   138  			Limit:          3,
   139  			Grace:          100 * time.Millisecond,
   140  			IgnoreWarnings: false,
   141  		},
   142  	}
   143  }
   144  
   145  // TestCheckWatcher_Skip asserts unwatched checks are ignored.
   146  func TestCheckWatcher_Skip(t *testing.T) {
   147  	t.Parallel()
   148  
   149  	// Create a check with restarting disabled
   150  	check := testCheck()
   151  	check.CheckRestart = nil
   152  
   153  	cw := newCheckWatcher(testlog.HCLogger(t), newFakeChecksAPI())
   154  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
   155  	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)
   156  
   157  	// Check should have been dropped as it's not watched
   158  	if n := len(cw.checkUpdateCh); n != 0 {
   159  		t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
   160  	}
   161  }
   162  
   163  // TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
   164  func TestCheckWatcher_Healthy(t *testing.T) {
   165  	t.Parallel()
   166  
   167  	fakeAPI, cw := testWatcherSetup(t)
   168  
   169  	check1 := testCheck()
   170  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   171  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   172  
   173  	check2 := testCheck()
   174  	check2.CheckRestart.Limit = 1
   175  	check2.CheckRestart.Grace = 0
   176  	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
   177  	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)
   178  
   179  	// Make both checks healthy from the beginning
   180  	fakeAPI.add("testcheck1", "passing", time.Time{})
   181  	fakeAPI.add("testcheck2", "passing", time.Time{})
   182  
   183  	// Run
   184  	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
   185  	defer cancel()
   186  	cw.Run(ctx)
   187  
   188  	// Ensure restart was never called
   189  	if n := len(restarter1.restarts); n > 0 {
   190  		t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
   191  	}
   192  	if n := len(restarter2.restarts); n > 0 {
   193  		t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
   194  	}
   195  }
   196  
   197  // TestCheckWatcher_HealthyWarning asserts checks in warning with
   198  // ignore_warnings=true do not restart tasks.
   199  func TestCheckWatcher_HealthyWarning(t *testing.T) {
   200  	t.Parallel()
   201  
   202  	fakeAPI, cw := testWatcherSetup(t)
   203  
   204  	check1 := testCheck()
   205  	check1.CheckRestart.Limit = 1
   206  	check1.CheckRestart.Grace = 0
   207  	check1.CheckRestart.IgnoreWarnings = true
   208  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   209  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   210  
   211  	// Check is always in warning but that's ok
   212  	fakeAPI.add("testcheck1", "warning", time.Time{})
   213  
   214  	// Run
   215  	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
   216  	defer cancel()
   217  	cw.Run(ctx)
   218  
   219  	// Ensure restart was never called on check 1
   220  	if n := len(restarter1.restarts); n > 0 {
   221  		t.Errorf("expected check 1 to not be restarted but found %d", n)
   222  	}
   223  }
   224  
   225  // TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
   226  // before the unhealthy limit is reached do not restart tasks.
   227  func TestCheckWatcher_Flapping(t *testing.T) {
   228  	t.Parallel()
   229  
   230  	fakeAPI, cw := testWatcherSetup(t)
   231  
   232  	check1 := testCheck()
   233  	check1.CheckRestart.Grace = 0
   234  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   235  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   236  
   237  	// Check flaps and is never failing for the full 200ms needed to restart
   238  	now := time.Now()
   239  	fakeAPI.add("testcheck1", "passing", now)
   240  	fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
   241  	fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
   242  	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
   243  	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))
   244  
   245  	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
   246  	defer cancel()
   247  	cw.Run(ctx)
   248  
   249  	// Ensure restart was never called on check 1
   250  	if n := len(restarter1.restarts); n > 0 {
   251  		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
   252  	}
   253  }
   254  
   255  // TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
   256  func TestCheckWatcher_Unwatch(t *testing.T) {
   257  	t.Parallel()
   258  
   259  	fakeAPI, cw := testWatcherSetup(t)
   260  
   261  	// Unwatch immediately
   262  	check1 := testCheck()
   263  	check1.CheckRestart.Limit = 1
   264  	check1.CheckRestart.Grace = 100 * time.Millisecond
   265  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   266  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   267  	cw.Unwatch("testcheck1")
   268  
   269  	// Always failing
   270  	fakeAPI.add("testcheck1", "critical", time.Time{})
   271  
   272  	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
   273  	defer cancel()
   274  	cw.Run(ctx)
   275  
   276  	// Ensure restart was never called on check 1
   277  	if n := len(restarter1.restarts); n > 0 {
   278  		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
   279  	}
   280  }
   281  
   282  // TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
   283  // for a single task, all checks should be removed when any of them restart the
   284  // task to avoid multiple restarts.
   285  func TestCheckWatcher_MultipleChecks(t *testing.T) {
   286  	t.Parallel()
   287  
   288  	fakeAPI, cw := testWatcherSetup(t)
   289  
   290  	check1 := testCheck()
   291  	check1.CheckRestart.Limit = 1
   292  	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
   293  	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
   294  
   295  	check2 := testCheck()
   296  	check2.CheckRestart.Limit = 1
   297  	restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
   298  	cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)
   299  
   300  	check3 := testCheck()
   301  	check3.CheckRestart.Limit = 1
   302  	restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
   303  	cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)
   304  
   305  	// check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart
   306  	now := time.Now()
   307  	fakeAPI.add("testcheck1", "critical", now)
   308  	fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
   309  	fakeAPI.add("testcheck2", "critical", now)
   310  	fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
   311  	fakeAPI.add("testcheck3", "passing", time.Time{})
   312  
   313  	// Run
   314  	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
   315  	defer cancel()
   316  	cw.Run(ctx)
   317  
   318  	// Ensure that restart was only called once on check 1 or 2. Since
   319  	// checks are in a map it's random which check triggers the restart
   320  	// first.
   321  	if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
   322  		t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
   323  			n, restarter1, restarter2)
   324  	}
   325  
   326  	if n := len(restarter3.restarts); n != 0 {
   327  		t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
   328  	}
   329  }