github.com/outbrain/consul@v1.4.5/agent/proxyprocess/daemon_test.go (about) 1 package proxyprocess 2 3 import ( 4 "io/ioutil" 5 "os" 6 "os/exec" 7 "path/filepath" 8 "strconv" 9 "syscall" 10 "testing" 11 "time" 12 13 "github.com/hashicorp/consul/testutil/retry" 14 "github.com/hashicorp/go-uuid" 15 "github.com/stretchr/testify/require" 16 ) 17 18 func TestDaemon_impl(t *testing.T) { 19 var _ Proxy = new(Daemon) 20 } 21 22 func TestDaemonStartStop(t *testing.T) { 23 t.Parallel() 24 25 require := require.New(t) 26 td, closer := testTempDir(t) 27 defer closer() 28 29 path := filepath.Join(td, "file") 30 uuid, err := uuid.GenerateUUID() 31 require.NoError(err) 32 33 cmd, destroy := helperProcess("start-stop", path) 34 defer destroy() 35 36 d := &Daemon{ 37 Command: cmd, 38 ProxyID: "tubes", 39 ProxyToken: uuid, 40 Logger: testLogger, 41 } 42 require.NoError(d.Start()) 43 defer d.Stop() 44 45 // Wait for the file to exist 46 retry.Run(t, func(r *retry.R) { 47 _, err := os.Stat(path) 48 if err == nil { 49 return 50 } 51 52 r.Fatalf("error: %s", err) 53 }) 54 55 // Verify that the contents of the file is the token. This verifies 56 // that we properly passed the token as an env var. 57 data, err := ioutil.ReadFile(path) 58 require.NoError(err) 59 require.Equal("tubes:"+uuid, string(data)) 60 61 // Stop the process 62 require.NoError(d.Stop()) 63 64 // File should no longer exist. 65 retry.Run(t, func(r *retry.R) { 66 _, err := os.Stat(path) 67 if os.IsNotExist(err) { 68 return 69 } 70 71 // err might be nil here but that's okay 72 r.Fatalf("should not exist: %s", err) 73 }) 74 } 75 76 func TestDaemonRestart(t *testing.T) { 77 t.Parallel() 78 79 require := require.New(t) 80 td, closer := testTempDir(t) 81 defer closer() 82 path := filepath.Join(td, "file") 83 84 cmd, destroy := helperProcess("restart", path) 85 defer destroy() 86 87 d := &Daemon{ 88 Command: cmd, 89 Logger: testLogger, 90 } 91 require.NoError(d.Start()) 92 defer d.Stop() 93 94 // Wait for the file to exist. We save the func so we can reuse the test. 95 waitFile := func() { 96 retry.Run(t, func(r *retry.R) { 97 _, err := os.Stat(path) 98 if err == nil { 99 return 100 } 101 r.Fatalf("error waiting for path: %s", err) 102 }) 103 } 104 waitFile() 105 106 // Delete the file 107 require.NoError(os.Remove(path)) 108 109 // File should re-appear because the process is restart 110 waitFile() 111 } 112 113 func TestDaemonLaunchesNewProcessGroup(t *testing.T) { 114 t.Parallel() 115 116 require := require.New(t) 117 td, closer := testTempDir(t) 118 defer closer() 119 120 path := filepath.Join(td, "file") 121 pidPath := filepath.Join(td, "child.pid") 122 123 // Start the parent process wrapping a start-stop test. The parent is acting 124 // as our "agent". We need an extra indirection to be able to kill the "agent" 125 // and still be running the test process. 126 parentCmd, destroy := helperProcess("parent", pidPath, "start-stop", path) 127 defer destroy() 128 129 // We MUST run this as a separate process group otherwise the Kill below will 130 // kill this test process (and possibly your shell/editor that launched it!) 131 parentCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} 132 133 require.NoError(parentCmd.Start()) 134 135 // Wait for the pid file to exist so we know parent is running 136 retry.Run(t, func(r *retry.R) { 137 _, err := os.Stat(pidPath) 138 if err == nil { 139 return 140 } 141 142 r.Fatalf("error: %s", err) 143 }) 144 145 // And wait for the actual file to be sure the child is running (it should be 146 // since parent doesn't write PID until child starts but the child might not 147 // have completed the write to disk yet which causes flakiness below). 148 retry.Run(t, func(r *retry.R) { 149 _, err := os.Stat(path) 150 if err == nil { 151 return 152 } 153 154 r.Fatalf("error: %s", err) 155 }) 156 157 // Get the child PID 158 bs, err := ioutil.ReadFile(pidPath) 159 require.NoError(err) 160 pid, err := strconv.Atoi(string(bs)) 161 require.NoError(err) 162 proc, err := os.FindProcess(pid) 163 require.NoError(err) 164 165 // Always cleanup child process after 166 defer func() { 167 if proc != nil { 168 proc.Kill() 169 } 170 }() 171 172 // Now kill the parent's whole process group and wait for it 173 pgid, err := syscall.Getpgid(parentCmd.Process.Pid) 174 175 require.NoError(err) 176 // Yep the minus PGid is how you kill a whole process group in unix... no idea 177 // how this works on windows. We TERM no KILL since we rely on the child 178 // catching the signal and deleting it's file to detect correct behavior. 179 require.NoError(syscall.Kill(-pgid, syscall.SIGTERM)) 180 181 _, err = parentCmd.Process.Wait() 182 require.NoError(err) 183 184 // The child should still be running so file should still be there 185 _, err = os.Stat(path) 186 require.NoError(err, "child should still be running") 187 188 // TEST PART 2 - verify that adopting an existing process works and picks up 189 // monitoring even though it's not a child. We can't do this accurately with 190 // Restart test since even if we create a new `Daemon` object the test process 191 // is still the parent. We need the indirection of the `parent` test helper to 192 // actually verify "adoption" on restart works. 193 194 // Start a new parent that will "adopt" the existing child even though it will 195 // not be an actual child process. 196 fosterCmd, destroy := helperProcess("parent", pidPath, "start-stop", path) 197 defer destroy() 198 199 // Don't care about it being same process group this time as we will just kill 200 // it normally. 201 require.NoError(fosterCmd.Start()) 202 defer func() { 203 // Clean up the daemon and wait for it to prevent it becoming a zombie. 204 fosterCmd.Process.Kill() 205 fosterCmd.Wait() 206 }() 207 208 // The child should still be running so file should still be there 209 _, err = os.Stat(path) 210 require.NoError(err, "child should still be running") 211 212 { 213 // Get the child PID - it shouldn't have changed and should be running 214 bs2, err := ioutil.ReadFile(pidPath) 215 require.NoError(err) 216 pid2, err := strconv.Atoi(string(bs2)) 217 require.NoError(err) 218 // Defer a cleanup (til end of test function) 219 proc, err := os.FindProcess(pid) 220 require.NoError(err) 221 defer func() { proc.Kill() }() 222 223 require.Equal(pid, pid2) 224 t.Logf("Child PID was %d and still %d", pid, pid2) 225 } 226 227 // Now killing the child directly should still be restarted by the Daemon 228 require.NoError(proc.Kill()) 229 proc = nil 230 231 retry.Run(t, func(r *retry.R) { 232 // Get the child PID - it should have changed 233 bs, err := ioutil.ReadFile(pidPath) 234 r.Check(err) 235 236 newPid, err := strconv.Atoi(string(bs)) 237 r.Check(err) 238 if newPid == pid { 239 r.Fatalf("Child PID file not changed, Daemon not restarting it") 240 } 241 t.Logf("Child PID was %d and is now %d", pid, newPid) 242 }) 243 244 // I had to run through this test in debugger a lot of times checking ps state 245 // by hand at different points to convince myself it was doing the right 246 // thing. It doesn't help that with verbose logs on it seems that the stdio 247 // from the `parent` process can sometimes miss lines out due to timing. For 248 // example the `[INFO] agent/proxy: daemon exited...` log from Daemon that 249 // indicates that the child was detected to have failed and is restarting is 250 // never output on my Mac at full speed. But if I run in debugger and have it 251 // pause at the step after the child is killed above, then it shows. The 252 // `[DEBUG] agent/proxy: starting proxy:` for the restart does always come 253 // through though which is odd. I assume this is some odd quirk of timing 254 // between processes and stdio or something but it makes debugging this stuff 255 // even harder! 256 257 // Let defer clean up the child process(es) 258 259 // Get the NEW child PID 260 bs, err = ioutil.ReadFile(pidPath) 261 require.NoError(err) 262 pid, err = strconv.Atoi(string(bs)) 263 require.NoError(err) 264 proc2, err := os.FindProcess(pid) 265 require.NoError(err) 266 267 // Always cleanup child process after 268 defer func() { 269 if proc2 != nil { 270 proc2.Kill() 271 } 272 }() 273 } 274 275 func TestDaemonStop_kill(t *testing.T) { 276 t.Parallel() 277 278 require := require.New(t) 279 td, closer := testTempDir(t) 280 defer closer() 281 282 path := filepath.Join(td, "file") 283 284 cmd, destroy := helperProcess("stop-kill", path) 285 defer destroy() 286 287 d := &Daemon{ 288 Command: cmd, 289 ProxyToken: "hello", 290 Logger: testLogger, 291 gracefulWait: 200 * time.Millisecond, 292 } 293 require.NoError(d.Start()) 294 295 // Wait for the file to exist 296 retry.Run(t, func(r *retry.R) { 297 _, err := os.Stat(path) 298 if err == nil { 299 return 300 } 301 302 r.Fatalf("error: %s", err) 303 }) 304 305 // Stop the process 306 require.NoError(d.Stop()) 307 308 // Stat the file so that we can get the mtime 309 fi, err := os.Stat(path) 310 require.NoError(err) 311 mtime := fi.ModTime() 312 313 // The mtime shouldn't change 314 time.Sleep(100 * time.Millisecond) 315 fi, err = os.Stat(path) 316 require.NoError(err) 317 require.Equal(mtime, fi.ModTime()) 318 } 319 320 func TestDaemonStop_killAdopted(t *testing.T) { 321 t.Parallel() 322 323 require := require.New(t) 324 td, closer := testTempDir(t) 325 defer closer() 326 327 path := filepath.Join(td, "file") 328 329 // In this test we want to ensure that graceful/ungraceful stop works with 330 // processes that were adopted by current process but not started by it. (i.e. 331 // we have to poll them not use Wait). 332 // 333 // We could use `parent` indirection to get a child that is actually not 334 // started by this process but that's a lot of hoops to jump through on top of 335 // an already complex multi-process test case. 336 // 337 // For now we rely on an implementation detail of Daemon which is potentially 338 // brittle but beats lots of extra complexity here. Currently, if 339 // Daemon.process is non-nil, the keepAlive loop will explicitly assume it's 340 // not a child and so will use polling to monitor it. If we ever change that 341 // it might invalidate this test and we would either need more indirection 342 // here, or an alternative explicit signal on Daemon like Daemon.forcePoll to 343 // ensure we are exercising that code path. 344 345 // Start the "child" process 346 childCmd, destroy := helperProcess("stop-kill", path) 347 defer destroy() 348 349 require.NoError(childCmd.Start()) 350 go func() { childCmd.Wait() }() // Prevent it becoming a zombie when killed 351 defer func() { childCmd.Process.Kill() }() 352 353 // Create the Daemon 354 cmd, destroy := helperProcess("stop-kill", path) 355 defer destroy() 356 357 d := &Daemon{ 358 Command: cmd, 359 ProxyToken: "hello", 360 Logger: testLogger, 361 gracefulWait: 200 * time.Millisecond, 362 // Can't just set process as it will bypass intializing stopCh etc. 363 } 364 // Adopt the pid from a fake state snapshot (this correctly initializes Daemon 365 // for adoption) 366 fakeSnap := map[string]interface{}{ 367 "Pid": childCmd.Process.Pid, 368 "CommandPath": childCmd.Path, 369 "CommandArgs": childCmd.Args, 370 "CommandDir": childCmd.Dir, 371 "CommandEnv": childCmd.Env, 372 "ProxyToken": d.ProxyToken, 373 } 374 require.NoError(d.UnmarshalSnapshot(fakeSnap)) 375 require.NoError(d.Start()) 376 377 // Wait for the file to exist (child was already running so this doesn't 378 // guarantee that Daemon is in "polling" state) 379 retry.Run(t, func(r *retry.R) { 380 _, err := os.Stat(path) 381 if err == nil { 382 return 383 } 384 385 r.Fatalf("error: %s", err) 386 }) 387 388 // Stop the process 389 require.NoError(d.Stop()) 390 391 // Stat the file so that we can get the mtime 392 fi, err := os.Stat(path) 393 require.NoError(err) 394 mtime := fi.ModTime() 395 396 // The mtime shouldn't change 397 time.Sleep(100 * time.Millisecond) 398 fi, err = os.Stat(path) 399 require.NoError(err) 400 require.Equal(mtime, fi.ModTime()) 401 } 402 403 func TestDaemonStart_pidFile(t *testing.T) { 404 t.Parallel() 405 406 require := require.New(t) 407 td, closer := testTempDir(t) 408 defer closer() 409 410 path := filepath.Join(td, "file") 411 pidPath := filepath.Join(td, "pid") 412 uuid, err := uuid.GenerateUUID() 413 require.NoError(err) 414 415 cmd, destroy := helperProcess("start-once", path) 416 defer destroy() 417 418 d := &Daemon{ 419 Command: cmd, 420 ProxyToken: uuid, 421 Logger: testLogger, 422 PidPath: pidPath, 423 } 424 require.NoError(d.Start()) 425 defer d.Stop() 426 427 // Wait for the file to exist 428 retry.Run(t, func(r *retry.R) { 429 _, err := os.Stat(pidPath) 430 if err == nil { 431 return 432 } 433 434 r.Fatalf("error: %s", err) 435 }) 436 437 // Check the pid file 438 pidRaw, err := ioutil.ReadFile(pidPath) 439 require.NoError(err) 440 require.NotEmpty(pidRaw) 441 442 // Stop 443 require.NoError(d.Stop()) 444 445 // Pid file should be gone 446 _, err = os.Stat(pidPath) 447 require.True(os.IsNotExist(err)) 448 } 449 450 // Verify the pid file changes on restart 451 func TestDaemonRestart_pidFile(t *testing.T) { 452 t.Parallel() 453 454 require := require.New(t) 455 td, closer := testTempDir(t) 456 defer closer() 457 path := filepath.Join(td, "file") 458 pidPath := filepath.Join(td, "pid") 459 460 cmd, destroy := helperProcess("restart", path) 461 defer destroy() 462 463 d := &Daemon{ 464 Command: cmd, 465 Logger: testLogger, 466 PidPath: pidPath, 467 } 468 require.NoError(d.Start()) 469 defer d.Stop() 470 471 // Wait for the file to exist. We save the func so we can reuse the test. 472 waitFile := func(path string) { 473 retry.Run(t, func(r *retry.R) { 474 _, err := os.Stat(path) 475 if err == nil { 476 return 477 } 478 r.Fatalf("error waiting for path: %s", err) 479 }) 480 } 481 waitFile(path) 482 waitFile(pidPath) 483 484 // Check the pid file 485 pidRaw, err := ioutil.ReadFile(pidPath) 486 require.NoError(err) 487 require.NotEmpty(pidRaw) 488 489 // Delete the file 490 require.NoError(os.Remove(pidPath)) 491 require.NoError(os.Remove(path)) 492 493 // File should re-appear because the process is restart 494 waitFile(path) 495 waitFile(pidPath) 496 497 // Check the pid file and it should not equal 498 pidRaw2, err := ioutil.ReadFile(pidPath) 499 require.NoError(err) 500 require.NotEmpty(pidRaw2) 501 require.NotEqual(pidRaw, pidRaw2) 502 } 503 504 func TestDaemonEqual(t *testing.T) { 505 cases := []struct { 506 Name string 507 D1, D2 Proxy 508 Expected bool 509 }{ 510 { 511 "Different type", 512 &Daemon{ 513 Command: &exec.Cmd{}, 514 }, 515 &Noop{}, 516 false, 517 }, 518 519 { 520 "Nil", 521 &Daemon{ 522 Command: &exec.Cmd{}, 523 }, 524 nil, 525 false, 526 }, 527 528 { 529 "Equal", 530 &Daemon{ 531 Command: &exec.Cmd{}, 532 }, 533 &Daemon{ 534 Command: &exec.Cmd{}, 535 }, 536 true, 537 }, 538 539 { 540 "Different proxy ID", 541 &Daemon{ 542 Command: &exec.Cmd{Path: "/foo"}, 543 ProxyID: "web", 544 }, 545 &Daemon{ 546 Command: &exec.Cmd{Path: "/foo"}, 547 ProxyID: "db", 548 }, 549 false, 550 }, 551 552 { 553 "Different path", 554 &Daemon{ 555 Command: &exec.Cmd{Path: "/foo"}, 556 }, 557 &Daemon{ 558 Command: &exec.Cmd{Path: "/bar"}, 559 }, 560 false, 561 }, 562 563 { 564 "Different dir", 565 &Daemon{ 566 Command: &exec.Cmd{Dir: "/foo"}, 567 }, 568 &Daemon{ 569 Command: &exec.Cmd{Dir: "/bar"}, 570 }, 571 false, 572 }, 573 574 { 575 "Different args", 576 &Daemon{ 577 Command: &exec.Cmd{Args: []string{"foo"}}, 578 }, 579 &Daemon{ 580 Command: &exec.Cmd{Args: []string{"bar"}}, 581 }, 582 false, 583 }, 584 585 { 586 "Different token", 587 &Daemon{ 588 Command: &exec.Cmd{}, 589 ProxyToken: "one", 590 }, 591 &Daemon{ 592 Command: &exec.Cmd{}, 593 ProxyToken: "two", 594 }, 595 false, 596 }, 597 } 598 599 for _, tc := range cases { 600 t.Run(tc.Name, func(t *testing.T) { 601 actual := tc.D1.Equal(tc.D2) 602 require.Equal(t, tc.Expected, actual) 603 }) 604 } 605 } 606 607 func TestDaemonMarshalSnapshot(t *testing.T) { 608 cases := []struct { 609 Name string 610 Proxy Proxy 611 Expected map[string]interface{} 612 }{ 613 { 614 "stopped daemon", 615 &Daemon{ 616 Command: &exec.Cmd{Path: "/foo"}, 617 }, 618 nil, 619 }, 620 621 { 622 "basic", 623 &Daemon{ 624 Command: &exec.Cmd{Path: "/foo"}, 625 ProxyID: "web", 626 process: &os.Process{Pid: 42}, 627 }, 628 map[string]interface{}{ 629 "Pid": 42, 630 "CommandPath": "/foo", 631 "CommandArgs": []string(nil), 632 "CommandDir": "", 633 "CommandEnv": []string(nil), 634 "ProxyToken": "", 635 "ProxyID": "web", 636 }, 637 }, 638 } 639 640 for _, tc := range cases { 641 t.Run(tc.Name, func(t *testing.T) { 642 actual := tc.Proxy.MarshalSnapshot() 643 require.Equal(t, tc.Expected, actual) 644 }) 645 } 646 } 647 648 func TestDaemonUnmarshalSnapshot(t *testing.T) { 649 t.Parallel() 650 651 require := require.New(t) 652 td, closer := testTempDir(t) 653 defer closer() 654 655 path := filepath.Join(td, "file") 656 uuid, err := uuid.GenerateUUID() 657 require.NoError(err) 658 659 cmd, destroy := helperProcess("start-stop", path) 660 defer destroy() 661 662 d := &Daemon{ 663 Command: cmd, 664 ProxyToken: uuid, 665 Logger: testLogger, 666 } 667 defer d.Stop() 668 require.NoError(d.Start()) 669 670 // Wait for the file to exist 671 retry.Run(t, func(r *retry.R) { 672 _, err := os.Stat(path) 673 if err == nil { 674 return 675 } 676 677 r.Fatalf("error: %s", err) 678 }) 679 680 // Snapshot 681 snap := d.MarshalSnapshot() 682 683 // Stop the original daemon but keep it alive 684 require.NoError(d.Close()) 685 686 // Restore the second daemon 687 d2 := &Daemon{Logger: testLogger} 688 require.NoError(d2.UnmarshalSnapshot(snap)) 689 690 // Verify the daemon is still running 691 _, err = os.Stat(path) 692 require.NoError(err) 693 694 // Stop the process 695 require.NoError(d2.Stop()) 696 697 // File should no longer exist. 698 retry.Run(t, func(r *retry.R) { 699 _, err := os.Stat(path) 700 if os.IsNotExist(err) { 701 return 702 } 703 704 // err might be nil here but that's okay 705 r.Fatalf("should not exist: %s", err) 706 }) 707 } 708 709 func TestDaemonUnmarshalSnapshot_notRunning(t *testing.T) { 710 t.Parallel() 711 712 require := require.New(t) 713 td, closer := testTempDir(t) 714 defer closer() 715 716 path := filepath.Join(td, "file") 717 uuid, err := uuid.GenerateUUID() 718 require.NoError(err) 719 720 cmd, destroy := helperProcess("start-stop", path) 721 defer destroy() 722 723 d := &Daemon{ 724 Command: cmd, 725 ProxyToken: uuid, 726 Logger: testLogger, 727 } 728 defer d.Stop() 729 require.NoError(d.Start()) 730 731 // Wait for the file to exist 732 retry.Run(t, func(r *retry.R) { 733 _, err := os.Stat(path) 734 if err == nil { 735 return 736 } 737 738 r.Fatalf("error: %s", err) 739 }) 740 741 // Snapshot 742 snap := d.MarshalSnapshot() 743 744 // Stop the original daemon 745 require.NoError(d.Stop()) 746 747 // Restore the second daemon 748 d2 := &Daemon{Logger: testLogger} 749 require.Error(d2.UnmarshalSnapshot(snap)) 750 }