package consul

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// checkRestartRecord is used by fakeCheckRestarter to record when restarts
// occur due to a watched check.
type checkRestartRecord struct {
	timestamp time.Time
	source    string
	reason    string
	failure   bool
}

// fakeCheckRestarter is a test implementation of TaskRestarter.
type fakeCheckRestarter struct {
	// restarts is a slice of all of the restarts triggered by the checkWatcher
	restarts []checkRestartRecord

	// need the checkWatcher to re-Watch restarted tasks like TaskRunner
	watcher *checkWatcher

	// check to re-Watch on restarts
	check     *structs.ServiceCheck
	allocID   string
	taskName  string
	checkName string

	mu sync.Mutex
}

// newFakeCheckRestarter creates a new TaskRestarter. It needs all of the
// parameters checkWatcher.Watch expects.
func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
	return &fakeCheckRestarter{
		watcher:   w,
		check:     c,
		allocID:   allocID,
		taskName:  taskName,
		checkName: checkName,
	}
}

// Restart implements part of the TaskRestarter interface needed for check
// watching and is normally fulfilled by a TaskRunner.
//
// Restarts are recorded in the restarts field and the check is re-Watched,
// just like TaskRunner would do.
func (c *fakeCheckRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	restart := checkRestartRecord{
		timestamp: time.Now(),
		source:    event.Type,
		reason:    event.DisplayMessage,
		failure:   failure,
	}
	c.restarts = append(c.restarts, restart)

	// Re-Watch the check just like TaskRunner
	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
	return nil
}

// String returns a summary of the recorded restarts for debugging.
func (c *fakeCheckRestarter) String() string {
	c.mu.Lock()
	defer c.mu.Unlock()

	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
	for _, r := range c.restarts {
		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
	}
	return s
}

// GetRestarts returns a copy of the recorded restarts in a threadsafe way.
func (c *fakeCheckRestarter) GetRestarts() []checkRestartRecord {
	c.mu.Lock()
	defer c.mu.Unlock()

	o := make([]checkRestartRecord, len(c.restarts))
	copy(o, c.restarts)
	return o
}

// checkResponse is a response returned by the fakeChecksAPI after the given
// time.
type checkResponse struct {
	at     time.Time
	id     string
	status string
}

// fakeChecksAPI implements the Checks() method of the Consul agent API for
// testing.
type fakeChecksAPI struct {
	// responses maps check IDs to their status at particular times.
	// Responses must be in chronological order.
	responses map[string][]checkResponse

	mu sync.Mutex
}

func newFakeChecksAPI() *fakeChecksAPI {
	return &fakeChecksAPI{responses: make(map[string][]checkResponse)}
}
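// Tests script the fake API with timestamped statuses and then run the
// watcher against it; a minimal sketch (the check ID and timings below are
// illustrative only):
//
//	api := newFakeChecksAPI()
//	api.add("mycheck", "passing", time.Now())
//	api.add("mycheck", "critical", time.Now().Add(100*time.Millisecond))
//	// Checks() then reports, for each check, the latest status whose
//	// timestamp is at or before time.Now().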
// add records a new check status that takes effect at the given time.
func (c *fakeChecksAPI) add(id, status string, at time.Time) {
	c.mu.Lock()
	c.responses[id] = append(c.responses[id], checkResponse{at, id, status})
	c.mu.Unlock()
}

func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	now := time.Now()
	result := make(map[string]*api.AgentCheck, len(c.responses))

	// Use the most recent response at or before now for each check
	for k, vs := range c.responses {
		for _, v := range vs {
			if v.at.After(now) {
				break
			}
			result[k] = &api.AgentCheck{
				CheckID: k,
				Name:    k,
				Status:  v.status,
			}
		}
	}

	return result, nil
}

// testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test
// logger and faster poll frequency.
func testWatcherSetup(t *testing.T) (*fakeChecksAPI, *checkWatcher) {
	fakeAPI := newFakeChecksAPI()
	cw := newCheckWatcher(testlog.HCLogger(t), fakeAPI)
	cw.pollFreq = 10 * time.Millisecond
	return fakeAPI, cw
}

func testCheck() *structs.ServiceCheck {
	return &structs.ServiceCheck{
		Name:     "testcheck",
		Interval: 100 * time.Millisecond,
		Timeout:  100 * time.Millisecond,
		CheckRestart: &structs.CheckRestart{
			Limit:          3,
			Grace:          100 * time.Millisecond,
			IgnoreWarnings: false,
		},
	}
}

// TestCheckWatcher_Skip asserts unwatched checks are ignored.
func TestCheckWatcher_Skip(t *testing.T) {
	t.Parallel()

	// Create a check with restarting disabled
	check := testCheck()
	check.CheckRestart = nil

	cw := newCheckWatcher(testlog.HCLogger(t), newFakeChecksAPI())
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)

	// Check should have been dropped as it's not watched
	if n := len(cw.checkUpdateCh); n != 0 {
		t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n)
	}
}

// TestCheckWatcher_Healthy asserts healthy tasks are not restarted.
func TestCheckWatcher_Healthy(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	check2.CheckRestart.Grace = 0
	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)

	// Make both checks healthy from the beginning
	fakeAPI.add("testcheck1", "passing", time.Time{})
	fakeAPI.add("testcheck2", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1)
	}
	if n := len(restarter2.restarts); n > 0 {
		t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2)
	}
}
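// A note on timing in the failure tests below: the watcher restarts a task
// once a check has been failing past its grace period for roughly
// interval * (limit - 1) (a reading of checkRestart's time limit in
// check_watcher.go, not a documented contract). With testCheck's 100ms
// interval, limit of 3, and 100ms grace, a persistently critical check
// triggers a restart around the 300ms mark, well within the 500ms test
// timeouts used here.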
// TestCheckWatcher_Unhealthy asserts unhealthy tasks are restarted exactly once.
func TestCheckWatcher_Unhealthy(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check has always been failing
	fakeAPI.add("testcheck1", "critical", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was called exactly once
	require.Len(t, restarter1.restarts, 1)
}

// TestCheckWatcher_HealthyWarning asserts checks in warning with
// ignore_warnings=true do not restart tasks.
func TestCheckWatcher_HealthyWarning(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 0
	check1.CheckRestart.IgnoreWarnings = true
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check is always in warning but that's ok
	fakeAPI.add("testcheck1", "warning", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d", n)
	}
}

// TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy
// before the unhealthy limit is reached do not restart tasks.
func TestCheckWatcher_Flapping(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Grace = 0
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	// Check flaps and is never failing for the full 200ms needed to restart
	// (100ms interval * (3 - 1) with no grace period)
	now := time.Now()
	fakeAPI.add("testcheck1", "passing", now)
	fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond))
	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))

	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}
// TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts.
func TestCheckWatcher_Unwatch(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	// Unwatch immediately
	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	check1.CheckRestart.Grace = 100 * time.Millisecond
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
	cw.Unwatch("testcheck1")

	// Always failing
	fakeAPI.add("testcheck1", "critical", time.Time{})

	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure restart was never called on check 1
	if n := len(restarter1.restarts); n > 0 {
		t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1)
	}
}

// TestCheckWatcher_MultipleChecks asserts that when there are multiple checks
// for a single task, all checks are removed when any of them restarts the
// task, to avoid multiple restarts.
func TestCheckWatcher_MultipleChecks(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	check1 := testCheck()
	check1.CheckRestart.Limit = 1
	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)

	check2 := testCheck()
	check2.CheckRestart.Limit = 1
	restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2)
	cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2)

	check3 := testCheck()
	check3.CheckRestart.Limit = 1
	restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3)
	cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3)

	// Checks 1 & 2 fail long enough to trigger a restart, but the task
	// should only be restarted once
	now := time.Now()
	fakeAPI.add("testcheck1", "critical", now)
	fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck2", "critical", now)
	fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond))
	fakeAPI.add("testcheck3", "passing", time.Time{})

	// Run
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// Ensure that restart was only called once on check 1 or 2. Since
	// checks are in a map it's random which check triggers the restart
	// first.
	if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 {
		t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s",
			n, restarter1, restarter2)
	}

	if n := len(restarter3.restarts); n != 0 {
		t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3)
	}
}

// TestCheckWatcher_Deadlock asserts that the check watcher will not deadlock
// when attempting to restart a task even if its update queue is full.
// https://github.com/hashicorp/nomad/issues/5395
func TestCheckWatcher_Deadlock(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	// If TR.Restart blocks, restarting cap(checkUpdateCh)+1 checks causes
	// a deadlock: checkWatcher.Run blocks in checkRestart.apply and can no
	// longer process updates from the channel.
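	// Watching one more check than checkUpdateCh can buffer therefore forces
	// Run to drain the channel while a restart is still in flight, which is
	// precisely the scenario from the issue above.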
	n := cap(cw.checkUpdateCh) + 1
	checks := make([]*structs.ServiceCheck, n)
	restarters := make([]*fakeCheckRestarter, n)
	for i := 0; i < n; i++ {
		c := testCheck()
		r := newFakeCheckRestarter(cw,
			fmt.Sprintf("alloc%d", i),
			fmt.Sprintf("task%d", i),
			fmt.Sprintf("check%d", i),
			c,
		)
		checks[i] = c
		restarters[i] = r
	}

	// Run
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go cw.Run(ctx)

	// Watch
	for _, r := range restarters {
		cw.Watch(r.allocID, r.taskName, r.checkName, r.check, r)
	}

	// Make them all fail
	for _, r := range restarters {
		fakeAPI.add(r.checkName, "critical", time.Time{})
	}

	// Ensure that restart was called exactly once on all checks
	testutil.WaitForResult(func() (bool, error) {
		for _, r := range restarters {
			if n := len(r.GetRestarts()); n != 1 {
				return false, fmt.Errorf("expected 1 restart but found %d", n)
			}
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}