github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/clientstate/clientstate.go (about)

     1  package clientstate
     2  
import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/execagent"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/discover"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)
    27  
    28  func init() {
    29  	framework.AddSuites(&framework.TestSuite{
    30  		Component:   "clientstate",
    31  		CanRunLocal: true,
    32  		Cases: []framework.TestCase{
    33  			&ClientStateTC{},
    34  		},
    35  	})
    36  }
    37  
// ClientStateTC is the e2e test case for client state durability. It spawns
// its own server/client agent pair per test so it can kill and corrupt the
// client without affecting a shared cluster.
type ClientStateTC struct {
	framework.TC

	// bin is the path to Nomad binary used to launch the server and client
	// agents; populated in BeforeAll via discover.NomadExecutable.
	bin string
}
    44  
    45  func (tc *ClientStateTC) BeforeAll(f *framework.F) {
    46  	if os.Getenv("NOMAD_TEST_STATE") == "" {
    47  		f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
    48  	}
    49  
    50  	bin, err := discover.NomadExecutable()
    51  	f.NoError(err)
    52  	tc.bin = bin
    53  }
    54  
    55  func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
    56  	allocfs := client.AllocFS()
    57  	r, err := allocfs.Cat(alloc, path, nil)
    58  	if err != nil {
    59  		return 0, err
    60  	}
    61  	defer r.Close()
    62  
    63  	out, err := ioutil.ReadAll(r)
    64  	if err != nil {
    65  		return 0, err
    66  	}
    67  
    68  	lines := bytes.SplitN(out, []byte{'\n'}, 2)
    69  	if len(lines) != 2 || len(lines[1]) > 0 {
    70  		return 0, fmt.Errorf("expected 1 line not %q", string(out))
    71  	}
    72  
    73  	// Capture pid
    74  	pid, err := strconv.Atoi(string(lines[0]))
    75  	if err != nil {
    76  		return 0, err
    77  	}
    78  
    79  	return pid, nil
    80  }
    81  
// TestClientState_Kill force kills Nomad agents and restarts them in a tight
// loop to assert Nomad is crash safe: the task keeps running across client
// SIGKILLs and is never restarted by the recovered client.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	// Spawn a dedicated server+client agent pair for this test so killing
	// the client agent cannot disturb other tests.
	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	// Register the sleeper job; its task is expected to write its pid to
	// "sleeper/pid" in the alloc dir (read below via getPID) — see
	// clientstate/sleeper.nomad for the task definition.
	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	// Cleanup: stop the job, wait for the task to stop running, then GC the
	// alloc and do a full server-side GC. Errors are logged, not fatal.
	defer func() {
		if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
			t.Logf("error stopping job: %v", err)
		}

		testutil.WaitForResult(func() (bool, error) {
			sum, _, err := client.Jobs().Summary(jobID, nil)
			if err != nil {
				return false, err
			}
			if r := sum.Summary["sleeper"].Running; r > 0 {
				return false, fmt.Errorf("still running: %d", r)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})

		//XXX Must use client agent for gc'ing allocs?
		clientAPI, err := clientAgent.Client()
		f.NoError(err)
		if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
			t.Logf("error garbage collecting alloc: %v", err)
		}

		if err := client.System().GarbageCollect(); err != nil {
			t.Logf("error doing full gc: %v", err)
		}

		//HACK to wait until things have GC'd
		time.Sleep(time.Second)
	}()

	// assertHealthy waits until the sleeper task is "running" with zero
	// restarts. Note it mutates the captured alloc (and err) so later
	// iterations always see the freshest alloc state.
	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid of the sleeper task (written by the task into its alloc dir)
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and restart a few times
	tries := 10
	for i := 0; i < tries; i++ {
		t.Logf("TEST RUN %d/%d", i+1, tries)

		// Kill -9 the Agent — SIGKILL gives the client no chance to shut
		// down cleanly, which is exactly the crash we want to survive.
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Assert sleeper is still running (signal 0 only checks existence)
		f.NoError(syscall.Kill(pid, 0))
		assertHealthy()

		// Should not be able to reach its filesystem while the client
		// agent serving the fs API is down
		_, err = getPID(client, alloc, "sleeper/pid")
		f.Error(err)

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Retrieving the pid should work once it restarts, and the pid must
		// be unchanged — proving the client recovered the task rather than
		// launching a new copy.
		testutil.WaitForResult(func() (bool, error) {
			newPid, err := getPID(client, alloc, "sleeper/pid")
			return newPid == pid, err
		}, func(err error) {
			f.NoError(err)
		})

		// Alloc should still be running
		assertHealthy()
	}
}
   222  
// TestClientState_KillDuringRestart force kills Nomad agents and restarts them
// in a tight loop to assert Nomad is crash safe while a task is restarting.
// Unlike TestClientState_Kill, the job here ("restarter") is expected to keep
// restarting, so the invariant is that the restart counter only ever
// increases across agent crashes.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	// Dedicated server+client pair so SIGKILLing the client is isolated.
	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()

	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "restarter-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	// Wait for task state to appear and capture the initial restart count.
	var restarts uint64
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error

		if len(alloc.TaskStates) == 0 {
			return false, fmt.Errorf("waiting for tasks to start")
		}

		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}

		// Capture current restarts
		restarts = n
		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Random sleeps so kills land at varying points of the restart cycle.
	dice := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Kill and restart agent a few times (as many iterations as fit in a
	// 5 minute wall-clock budget)
	i := 0
	for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
		i++
		sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
		t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

		time.Sleep(sleep)

		// Ensure restarts are progressing (monotonically non-decreasing,
		// and at least one restart after a few iterations)
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error
		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}
		if i > 5 && n == 0 {
			// At least one restart should have happened by now
			f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
		}
		restarts = n

		// Kill -9 Agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
		t.Logf("[TEST] ===> Killed %d", agentPid)

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Wait for the restarted client's HTTP API to come back before the
		// next iteration.
		clientUrl := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
		testutil.WaitForResult(func() (bool, error) {
			resp, err := http.Get(clientUrl)
			if err != nil {
				return false, err
			}
			resp.Body.Close()
			// NOTE(review): a non-nil error is returned even on success;
			// this relies on WaitForResult ignoring the error when the
			// bool is true — confirm against testutil.WaitForResult.
			return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
		}, func(err error) {
			f.NoError(err)
		})
	}

	t.Logf("[TEST] ===> Final restarts: %d", restarts)
}
   344  
// TestClientState_Corrupt removes task state from the client's state db to
// assert it recovers: after losing the task's state bucket the client starts
// a fresh copy of the task rather than crashing or wedging.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	// Dedicated server+client pair so stopping the client and editing its
	// state db is isolated from other tests.
	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	// assertHealthy waits until the sleeper task is "running" with zero
	// restarts; it mutates the captured alloc/err.
	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid of the sleeper task
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and corrupt the state. SIGINT (unlike the SIGKILL used in other
	// cases) lets the agent exit cleanly, so the state db is closed and safe
	// to reopen below.
	agentPid := clientAgent.Cmd.Process.Pid
	f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

	procState, err := clientAgent.Cmd.Process.Wait()
	f.NoError(err)
	f.True(procState.Exited())

	// Assert sleeper is still running (signal 0 only checks existence)
	f.NoError(syscall.Kill(pid, 0))
	assertHealthy()

	// Remove task bucket from client state to simulate corruption/loss
	db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
	f.NoError(db.Close())

	// Restart the agent (have to create a new Cmd)
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	clientAgent.Cmd.Stdout = clientOut
	clientAgent.Cmd.Stderr = clientOut
	f.NoError(clientAgent.Start())

	// Assert a new process did start
	f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

	// Retrieving the pid should work once it restarts.
	// Critically there are now 2 pids because the client task state was
	// lost Nomad started a new copy. The pid file therefore contains two
	// newline-terminated lines, checked via SplitN with n=3.
	testutil.WaitForResult(func() (bool, error) {
		allocfs := client.AllocFS()
		r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
		if err != nil {
			return false, err
		}
		defer r.Close()

		out, err := ioutil.ReadAll(r)
		if err != nil {
			return false, err
		}

		lines := bytes.SplitN(out, []byte{'\n'}, 3)
		if len(lines) != 3 || len(lines[2]) > 0 {
			return false, fmt.Errorf("expected 2 lines not %v", lines)
		}

		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Alloc should still be running
	assertHealthy()
}