github.com/bigcommerce/nomad@v0.9.3-bc/e2e/clientstate/clientstate.go

package clientstate

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/execagent"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/discover"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "clientstate",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			&ClientStateTC{},
		},
	})
}

type ClientStateTC struct {
	framework.TC

	// bin is the path to the Nomad binary
	bin string
}

func (tc *ClientStateTC) BeforeAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_STATE") == "" {
		f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
	}

	bin, err := discover.NomadExecutable()
	f.NoError(err)
	tc.bin = bin
}

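// getPID reads the file at path within the allocation's filesystem and
// parses its single line as a pid. The test jobs below write their pid,
// followed by a newline, to this file.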
func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
	allocfs := client.AllocFS()
	r, err := allocfs.Cat(alloc, path, nil)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	out, err := ioutil.ReadAll(r)
	if err != nil {
		return 0, err
	}

	lines := bytes.SplitN(out, []byte{'\n'}, 2)
	if len(lines) != 2 || len(lines[1]) > 0 {
		return 0, fmt.Errorf("expected 1 line, got %q", string(out))
	}

	// Capture pid
	pid, err := strconv.Atoi(string(lines[0]))
	if err != nil {
		return 0, err
	}

	return pid, nil
}
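
// The sleeper job referenced by the tests below ("clientstate/sleeper.nomad",
// not reproduced here) writes its pid to a file named "pid" in the task
// directory. A minimal sketch of such a task, assuming raw_exec and a POSIX
// shell, appends its pid and then blocks:
//
//	config {
//		command = "/bin/sh"
//		args    = ["-c", "echo $$ >> pid && sleep 999999"]
//	}
//
// Appending rather than truncating matters: TestClientState_Corrupt below
// expects a second pid line once Nomad loses task state and starts a new
// copy of the task in the same alloc dir.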

// TestClientState_Kill force-kills the Nomad client agent and restarts it in
// a tight loop to assert Nomad is crash safe.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
	t := f.T()
	t.Parallel()

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client from the server agent so it stays usable even
	// while the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID)
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
			t.Logf("error stopping job: %v", err)
		}

		testutil.WaitForResult(func() (bool, error) {
			sum, _, err := client.Jobs().Summary(jobID, nil)
			if err != nil {
				return false, err
			}
			if r := sum.Summary["sleeper"].Running; r > 0 {
				return false, fmt.Errorf("still running: %d", r)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})

		//XXX Must use client agent for gc'ing allocs?
		clientAPI, err := clientAgent.Client()
		f.NoError(err)
		if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
			t.Logf("error garbage collecting alloc: %v", err)
		}

		if err := client.System().GarbageCollect(); err != nil {
			t.Logf("error doing full gc: %v", err)
		}

		//HACK to wait until things have GC'd
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and restart a few times
	tries := 10
	for i := 0; i < tries; i++ {
		t.Logf("TEST RUN %d/%d", i+1, tries)

		// Kill -9 the Agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

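		// Wait reaps the killed agent process so it doesn't linger as
		// a zombie and reports how it terminated.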
		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Assert sleeper is still running
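		// (signal 0 checks the process exists without delivering a
		// signal)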
		f.NoError(syscall.Kill(pid, 0))
		assertHealthy()

		// Should not be able to reach the alloc's filesystem while
		// the client agent is down
		_, err = getPID(client, alloc, "sleeper/pid")
		f.Error(err)

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Retrieving the pid should work once it restarts
		testutil.WaitForResult(func() (bool, error) {
			newPid, err := getPID(client, alloc, "sleeper/pid")
			return newPid == pid, err
		}, func(err error) {
			f.NoError(err)
		})

		// Alloc should still be running
		assertHealthy()
	}
}

// TestClientState_KillDuringRestart force-kills the Nomad client agent and
// restarts it in a tight loop to assert Nomad is crash safe while a task is
// restarting.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
	t := f.T()
	t.Parallel()

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()

	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client from the server agent so it stays usable even
	// while the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "restarter-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID)
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	var restarts uint64
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error

		if len(alloc.TaskStates) == 0 {
			return false, fmt.Errorf("waiting for tasks to start")
		}

		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}

		// Capture current restarts
		restarts = n
		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	dice := rand.New(rand.NewSource(time.Now().UnixNano()))
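	// The random sleeps below land kills at varying points in the task's
	// restart cycle.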

	// Kill and restart agent a few times
	i := 0
	for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
		i++
		sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
		t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

		time.Sleep(sleep)

		// Ensure restarts are progressing
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error
		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}
		if i > 5 && n == 0 {
			// At least one restart should have happened by now
			f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
		}
		restarts = n

		// Kill -9 Agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
		t.Logf("[TEST] ===> Killed %d", agentPid)

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)
		clientURL := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
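		// /v1/client/stats is served on the client agent's own HTTP
		// port, so a 200 means the restarted agent is back up.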
		testutil.WaitForResult(func() (bool, error) {
			resp, err := http.Get(clientURL)
			if err != nil {
				return false, err
			}
			resp.Body.Close()
			if resp.StatusCode != 200 {
				return false, fmt.Errorf("%d != 200", resp.StatusCode)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}

	t.Logf("[TEST] ===> Final restarts: %d", restarts)
}

// TestClientState_Corrupt removes task state from the client's state db to
// assert it recovers.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
	t := f.T()
	t.Parallel()

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client from the server agent so it stays usable even
	// while the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID)
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and corrupt the state
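	// (os.Interrupt lets the agent shut down cleanly, so its state db is
	// written out and closed before we corrupt it)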
	agentPid := clientAgent.Cmd.Process.Pid
	f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

	procState, err := clientAgent.Cmd.Process.Wait()
	f.NoError(err)
	f.True(procState.Exited())

	// Assert sleeper is still running
	f.NoError(syscall.Kill(pid, 0))
	assertHealthy()

	// Remove task bucket from client state
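	// (safe to open the bolt db directly since the agent has exited and
	// released its lock on the file)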
	db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
	f.NoError(db.Close())

	// Restart the agent (have to create a new Cmd)
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	clientAgent.Cmd.Stdout = clientOut
	clientAgent.Cmd.Stderr = clientOut
	f.NoError(clientAgent.Start())

	// Assert a new process did start
	f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

	// Retrieving the pid should work once the agent restarts.
	// Critically there are now 2 pids in the file: because the client's
	// task state was lost, Nomad started a new copy of the task.
	testutil.WaitForResult(func() (bool, error) {
		allocfs := client.AllocFS()
		r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
		if err != nil {
			return false, err
		}
		defer r.Close()

		out, err := ioutil.ReadAll(r)
		if err != nil {
			return false, err
		}

		lines := bytes.SplitN(out, []byte{'\n'}, 3)
		if len(lines) != 3 || len(lines[2]) > 0 {
			return false, fmt.Errorf("expected 2 lines, got %v", lines)
		}

		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Alloc should still be running
	assertHealthy()
}