github.com/smintz/nomad@v0.8.3/command/agent/consul/check_watcher_test.go (about) 1 package consul 2 3 import ( 4 "context" 5 "fmt" 6 "testing" 7 "time" 8 9 "github.com/hashicorp/consul/api" 10 "github.com/hashicorp/nomad/helper/testlog" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 // checkRestartRecord is used by a testFakeCtx to record when restarts occur 15 // due to a watched check. 16 type checkRestartRecord struct { 17 timestamp time.Time 18 source string 19 reason string 20 failure bool 21 } 22 23 // fakeCheckRestarter is a test implementation of TaskRestarter. 24 type fakeCheckRestarter struct { 25 // restarts is a slice of all of the restarts triggered by the checkWatcher 26 restarts []checkRestartRecord 27 28 // need the checkWatcher to re-Watch restarted tasks like TaskRunner 29 watcher *checkWatcher 30 31 // check to re-Watch on restarts 32 check *structs.ServiceCheck 33 allocID string 34 taskName string 35 checkName string 36 } 37 38 // newFakeCheckRestart creates a new TaskRestarter. It needs all of the 39 // parameters checkWatcher.Watch expects. 40 func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter { 41 return &fakeCheckRestarter{ 42 watcher: w, 43 check: c, 44 allocID: allocID, 45 taskName: taskName, 46 checkName: checkName, 47 } 48 } 49 50 // Restart implements part of the TaskRestarter interface needed for check 51 // watching and is normally fulfilled by a TaskRunner. 52 // 53 // Restarts are recorded in the []restarts field and re-Watch the check. 54 func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { 55 c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure}) 56 57 // Re-Watch the check just like TaskRunner 58 c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c) 59 } 60 61 // String for debugging 62 func (c *fakeCheckRestarter) String() string { 63 s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName) 64 for _, r := range c.restarts { 65 s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) 66 } 67 return s 68 } 69 70 // checkResponse is a response returned by the fakeChecksAPI after the given 71 // time. 72 type checkResponse struct { 73 at time.Time 74 id string 75 status string 76 } 77 78 // fakeChecksAPI implements the Checks() method for testing Consul. 79 type fakeChecksAPI struct { 80 // responses is a map of check ids to their status at a particular 81 // time. checkResponses must be in chronological order. 82 responses map[string][]checkResponse 83 } 84 85 func newFakeChecksAPI() *fakeChecksAPI { 86 return &fakeChecksAPI{responses: make(map[string][]checkResponse)} 87 } 88 89 // add a new check status to Consul at the given time. 90 func (c *fakeChecksAPI) add(id, status string, at time.Time) { 91 c.responses[id] = append(c.responses[id], checkResponse{at, id, status}) 92 } 93 94 func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) { 95 now := time.Now() 96 result := make(map[string]*api.AgentCheck, len(c.responses)) 97 98 // Use the latest response for each check 99 for k, vs := range c.responses { 100 for _, v := range vs { 101 if v.at.After(now) { 102 break 103 } 104 result[k] = &api.AgentCheck{ 105 CheckID: k, 106 Name: k, 107 Status: v.status, 108 } 109 } 110 } 111 112 return result, nil 113 } 114 115 // testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test 116 // logger and faster poll frequency. 117 func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) { 118 fakeAPI := newFakeChecksAPI() 119 cw := newCheckWatcher(testlog.Logger(t), fakeAPI) 120 cw.pollFreq = 10 * time.Millisecond 121 return fakeAPI, cw 122 } 123 124 func testCheck() *structs.ServiceCheck { 125 return &structs.ServiceCheck{ 126 Name: "testcheck", 127 Interval: 100 * time.Millisecond, 128 Timeout: 100 * time.Millisecond, 129 CheckRestart: &structs.CheckRestart{ 130 Limit: 3, 131 Grace: 100 * time.Millisecond, 132 IgnoreWarnings: false, 133 }, 134 } 135 } 136 137 // TestCheckWatcher_Skip asserts unwatched checks are ignored. 138 func TestCheckWatcher_Skip(t *testing.T) { 139 t.Parallel() 140 141 // Create a check with restarting disabled 142 check := testCheck() 143 check.CheckRestart = nil 144 145 cw := newCheckWatcher(testlog.Logger(t), newFakeChecksAPI()) 146 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check) 147 cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) 148 149 // Check should have been dropped as it's not watched 150 if n := len(cw.checkUpdateCh); n != 0 { 151 t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n) 152 } 153 } 154 155 // TestCheckWatcher_Healthy asserts healthy tasks are not restarted. 156 func TestCheckWatcher_Healthy(t *testing.T) { 157 t.Parallel() 158 159 fakeAPI, cw := testWatcherSetup(t) 160 161 check1 := testCheck() 162 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 163 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 164 165 check2 := testCheck() 166 check2.CheckRestart.Limit = 1 167 check2.CheckRestart.Grace = 0 168 restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2) 169 cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) 170 171 // Make both checks healthy from the beginning 172 fakeAPI.add("testcheck1", "passing", time.Time{}) 173 fakeAPI.add("testcheck2", "passing", time.Time{}) 174 175 // Run 176 ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) 177 defer cancel() 178 cw.Run(ctx) 179 180 // Ensure restart was never called 181 if n := len(restarter1.restarts); n > 0 { 182 t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1) 183 } 184 if n := len(restarter2.restarts); n > 0 { 185 t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2) 186 } 187 } 188 189 // TestCheckWatcher_HealthyWarning asserts checks in warning with 190 // ignore_warnings=true do not restart tasks. 191 func TestCheckWatcher_HealthyWarning(t *testing.T) { 192 t.Parallel() 193 194 fakeAPI, cw := testWatcherSetup(t) 195 196 check1 := testCheck() 197 check1.CheckRestart.Limit = 1 198 check1.CheckRestart.Grace = 0 199 check1.CheckRestart.IgnoreWarnings = true 200 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 201 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 202 203 // Check is always in warning but that's ok 204 fakeAPI.add("testcheck1", "warning", time.Time{}) 205 206 // Run 207 ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) 208 defer cancel() 209 cw.Run(ctx) 210 211 // Ensure restart was never called on check 1 212 if n := len(restarter1.restarts); n > 0 { 213 t.Errorf("expected check 1 to not be restarted but found %d", n) 214 } 215 } 216 217 // TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy 218 // before the unhealthy limit is reached do not restart tasks. 219 func TestCheckWatcher_Flapping(t *testing.T) { 220 t.Parallel() 221 222 fakeAPI, cw := testWatcherSetup(t) 223 224 check1 := testCheck() 225 check1.CheckRestart.Grace = 0 226 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 227 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 228 229 // Check flaps and is never failing for the full 200ms needed to restart 230 now := time.Now() 231 fakeAPI.add("testcheck1", "passing", now) 232 fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond)) 233 fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond)) 234 fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond)) 235 fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond)) 236 237 ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond) 238 defer cancel() 239 cw.Run(ctx) 240 241 // Ensure restart was never called on check 1 242 if n := len(restarter1.restarts); n > 0 { 243 t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) 244 } 245 } 246 247 // TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts. 248 func TestCheckWatcher_Unwatch(t *testing.T) { 249 t.Parallel() 250 251 fakeAPI, cw := testWatcherSetup(t) 252 253 // Unwatch immediately 254 check1 := testCheck() 255 check1.CheckRestart.Limit = 1 256 check1.CheckRestart.Grace = 100 * time.Millisecond 257 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 258 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 259 cw.Unwatch("testcheck1") 260 261 // Always failing 262 fakeAPI.add("testcheck1", "critical", time.Time{}) 263 264 ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) 265 defer cancel() 266 cw.Run(ctx) 267 268 // Ensure restart was never called on check 1 269 if n := len(restarter1.restarts); n > 0 { 270 t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) 271 } 272 } 273 274 // TestCheckWatcher_MultipleChecks asserts that when there are multiple checks 275 // for a single task, all checks should be removed when any of them restart the 276 // task to avoid multiple restarts. 277 func TestCheckWatcher_MultipleChecks(t *testing.T) { 278 t.Parallel() 279 280 fakeAPI, cw := testWatcherSetup(t) 281 282 check1 := testCheck() 283 check1.CheckRestart.Limit = 1 284 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 285 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 286 287 check2 := testCheck() 288 check2.CheckRestart.Limit = 1 289 restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2) 290 cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2) 291 292 check3 := testCheck() 293 check3.CheckRestart.Limit = 1 294 restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3) 295 cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3) 296 297 // check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart 298 now := time.Now() 299 fakeAPI.add("testcheck1", "critical", now) 300 fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond)) 301 fakeAPI.add("testcheck2", "critical", now) 302 fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond)) 303 fakeAPI.add("testcheck3", "passing", time.Time{}) 304 305 // Run 306 ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) 307 defer cancel() 308 cw.Run(ctx) 309 310 // Ensure that restart was only called once on check 1 or 2. Since 311 // checks are in a map it's random which check triggers the restart 312 // first. 313 if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 { 314 t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s", 315 n, restarter1, restarter2) 316 } 317 318 if n := len(restarter3.restarts); n != 0 { 319 t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3) 320 } 321 }