github.com/bigcommerce/nomad@v0.9.3-bc/command/agent/consul/check_watcher_test.go (about) 1 package consul 2 3 import ( 4 "context" 5 "fmt" 6 "testing" 7 "time" 8 9 "github.com/hashicorp/consul/api" 10 "github.com/hashicorp/nomad/helper/testlog" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 // checkRestartRecord is used by a testFakeCtx to record when restarts occur 15 // due to a watched check. 16 type checkRestartRecord struct { 17 timestamp time.Time 18 source string 19 reason string 20 failure bool 21 } 22 23 // fakeCheckRestarter is a test implementation of TaskRestarter. 24 type fakeCheckRestarter struct { 25 // restarts is a slice of all of the restarts triggered by the checkWatcher 26 restarts []checkRestartRecord 27 28 // need the checkWatcher to re-Watch restarted tasks like TaskRunner 29 watcher *checkWatcher 30 31 // check to re-Watch on restarts 32 check *structs.ServiceCheck 33 allocID string 34 taskName string 35 checkName string 36 } 37 38 // newFakeCheckRestart creates a new TaskRestarter. It needs all of the 39 // parameters checkWatcher.Watch expects. 40 func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter { 41 return &fakeCheckRestarter{ 42 watcher: w, 43 check: c, 44 allocID: allocID, 45 taskName: taskName, 46 checkName: checkName, 47 } 48 } 49 50 // Restart implements part of the TaskRestarter interface needed for check 51 // watching and is normally fulfilled by a TaskRunner. 52 // 53 // Restarts are recorded in the []restarts field and re-Watch the check. 54 //func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { 55 func (c *fakeCheckRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error { 56 restart := checkRestartRecord{ 57 timestamp: time.Now(), 58 source: event.Type, 59 reason: event.DisplayMessage, 60 failure: failure, 61 } 62 c.restarts = append(c.restarts, restart) 63 64 // Re-Watch the check just like TaskRunner 65 c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c) 66 return nil 67 } 68 69 // String for debugging 70 func (c *fakeCheckRestarter) String() string { 71 s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName) 72 for _, r := range c.restarts { 73 s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) 74 } 75 return s 76 } 77 78 // checkResponse is a response returned by the fakeChecksAPI after the given 79 // time. 80 type checkResponse struct { 81 at time.Time 82 id string 83 status string 84 } 85 86 // fakeChecksAPI implements the Checks() method for testing Consul. 87 type fakeChecksAPI struct { 88 // responses is a map of check ids to their status at a particular 89 // time. checkResponses must be in chronological order. 90 responses map[string][]checkResponse 91 } 92 93 func newFakeChecksAPI() *fakeChecksAPI { 94 return &fakeChecksAPI{responses: make(map[string][]checkResponse)} 95 } 96 97 // add a new check status to Consul at the given time. 98 func (c *fakeChecksAPI) add(id, status string, at time.Time) { 99 c.responses[id] = append(c.responses[id], checkResponse{at, id, status}) 100 } 101 102 func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) { 103 now := time.Now() 104 result := make(map[string]*api.AgentCheck, len(c.responses)) 105 106 // Use the latest response for each check 107 for k, vs := range c.responses { 108 for _, v := range vs { 109 if v.at.After(now) { 110 break 111 } 112 result[k] = &api.AgentCheck{ 113 CheckID: k, 114 Name: k, 115 Status: v.status, 116 } 117 } 118 } 119 120 return result, nil 121 } 122 123 // testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test 124 // logger and faster poll frequency. 125 func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) { 126 fakeAPI := newFakeChecksAPI() 127 cw := newCheckWatcher(testlog.HCLogger(t), fakeAPI) 128 cw.pollFreq = 10 * time.Millisecond 129 return fakeAPI, cw 130 } 131 132 func testCheck() *structs.ServiceCheck { 133 return &structs.ServiceCheck{ 134 Name: "testcheck", 135 Interval: 100 * time.Millisecond, 136 Timeout: 100 * time.Millisecond, 137 CheckRestart: &structs.CheckRestart{ 138 Limit: 3, 139 Grace: 100 * time.Millisecond, 140 IgnoreWarnings: false, 141 }, 142 } 143 } 144 145 // TestCheckWatcher_Skip asserts unwatched checks are ignored. 146 func TestCheckWatcher_Skip(t *testing.T) { 147 t.Parallel() 148 149 // Create a check with restarting disabled 150 check := testCheck() 151 check.CheckRestart = nil 152 153 cw := newCheckWatcher(testlog.HCLogger(t), newFakeChecksAPI()) 154 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check) 155 cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) 156 157 // Check should have been dropped as it's not watched 158 if n := len(cw.checkUpdateCh); n != 0 { 159 t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n) 160 } 161 } 162 163 // TestCheckWatcher_Healthy asserts healthy tasks are not restarted. 164 func TestCheckWatcher_Healthy(t *testing.T) { 165 t.Parallel() 166 167 fakeAPI, cw := testWatcherSetup(t) 168 169 check1 := testCheck() 170 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 171 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 172 173 check2 := testCheck() 174 check2.CheckRestart.Limit = 1 175 check2.CheckRestart.Grace = 0 176 restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2) 177 cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) 178 179 // Make both checks healthy from the beginning 180 fakeAPI.add("testcheck1", "passing", time.Time{}) 181 fakeAPI.add("testcheck2", "passing", time.Time{}) 182 183 // Run 184 ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) 185 defer cancel() 186 cw.Run(ctx) 187 188 // Ensure restart was never called 189 if n := len(restarter1.restarts); n > 0 { 190 t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1) 191 } 192 if n := len(restarter2.restarts); n > 0 { 193 t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2) 194 } 195 } 196 197 // TestCheckWatcher_HealthyWarning asserts checks in warning with 198 // ignore_warnings=true do not restart tasks. 199 func TestCheckWatcher_HealthyWarning(t *testing.T) { 200 t.Parallel() 201 202 fakeAPI, cw := testWatcherSetup(t) 203 204 check1 := testCheck() 205 check1.CheckRestart.Limit = 1 206 check1.CheckRestart.Grace = 0 207 check1.CheckRestart.IgnoreWarnings = true 208 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 209 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 210 211 // Check is always in warning but that's ok 212 fakeAPI.add("testcheck1", "warning", time.Time{}) 213 214 // Run 215 ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) 216 defer cancel() 217 cw.Run(ctx) 218 219 // Ensure restart was never called on check 1 220 if n := len(restarter1.restarts); n > 0 { 221 t.Errorf("expected check 1 to not be restarted but found %d", n) 222 } 223 } 224 225 // TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy 226 // before the unhealthy limit is reached do not restart tasks. 227 func TestCheckWatcher_Flapping(t *testing.T) { 228 t.Parallel() 229 230 fakeAPI, cw := testWatcherSetup(t) 231 232 check1 := testCheck() 233 check1.CheckRestart.Grace = 0 234 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 235 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 236 237 // Check flaps and is never failing for the full 200ms needed to restart 238 now := time.Now() 239 fakeAPI.add("testcheck1", "passing", now) 240 fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond)) 241 fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond)) 242 fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond)) 243 fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond)) 244 245 ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond) 246 defer cancel() 247 cw.Run(ctx) 248 249 // Ensure restart was never called on check 1 250 if n := len(restarter1.restarts); n > 0 { 251 t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) 252 } 253 } 254 255 // TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts. 256 func TestCheckWatcher_Unwatch(t *testing.T) { 257 t.Parallel() 258 259 fakeAPI, cw := testWatcherSetup(t) 260 261 // Unwatch immediately 262 check1 := testCheck() 263 check1.CheckRestart.Limit = 1 264 check1.CheckRestart.Grace = 100 * time.Millisecond 265 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 266 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 267 cw.Unwatch("testcheck1") 268 269 // Always failing 270 fakeAPI.add("testcheck1", "critical", time.Time{}) 271 272 ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) 273 defer cancel() 274 cw.Run(ctx) 275 276 // Ensure restart was never called on check 1 277 if n := len(restarter1.restarts); n > 0 { 278 t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) 279 } 280 } 281 282 // TestCheckWatcher_MultipleChecks asserts that when there are multiple checks 283 // for a single task, all checks should be removed when any of them restart the 284 // task to avoid multiple restarts. 285 func TestCheckWatcher_MultipleChecks(t *testing.T) { 286 t.Parallel() 287 288 fakeAPI, cw := testWatcherSetup(t) 289 290 check1 := testCheck() 291 check1.CheckRestart.Limit = 1 292 restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) 293 cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) 294 295 check2 := testCheck() 296 check2.CheckRestart.Limit = 1 297 restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2) 298 cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2) 299 300 check3 := testCheck() 301 check3.CheckRestart.Limit = 1 302 restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3) 303 cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3) 304 305 // check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart 306 now := time.Now() 307 fakeAPI.add("testcheck1", "critical", now) 308 fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond)) 309 fakeAPI.add("testcheck2", "critical", now) 310 fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond)) 311 fakeAPI.add("testcheck3", "passing", time.Time{}) 312 313 // Run 314 ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) 315 defer cancel() 316 cw.Run(ctx) 317 318 // Ensure that restart was only called once on check 1 or 2. Since 319 // checks are in a map it's random which check triggers the restart 320 // first. 321 if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 { 322 t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s", 323 n, restarter1, restarter2) 324 } 325 326 if n := len(restarter3.restarts); n != 0 { 327 t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3) 328 } 329 }