github.com/hernad/nomad@v1.6.112/e2e/clientstate/clientstate.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package clientstate

import (
	"bytes"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"time"

	"github.com/hernad/nomad/api"
	"github.com/hernad/nomad/ci"
	"github.com/hernad/nomad/client/state"
	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/execagent"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/discover"
	"github.com/hernad/nomad/helper/testlog"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/testutil"
)

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "clientstate",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			&ClientStateTC{},
		},
	})
}

type ClientStateTC struct {
	framework.TC

	// bin is the path to the Nomad binary
	bin string
}

func (tc *ClientStateTC) BeforeAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_STATE") == "" {
		f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
	}

	bin, err := discover.NomadExecutable()
	f.NoError(err)
	tc.bin = bin
}

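// getPID reads the pid file a task writes at the given path within the
// allocation's filesystem and returns the process ID it contains. The file
// must hold exactly one newline-terminated line, e.g. "1234\n".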
func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
	allocfs := client.AllocFS()
	r, err := allocfs.Cat(alloc, path, nil)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	out, err := io.ReadAll(r)
	if err != nil {
		return 0, err
	}

	lines := bytes.SplitN(out, []byte{'\n'}, 2)
	if len(lines) != 2 || len(lines[1]) > 0 {
		return 0, fmt.Errorf("expected 1 line, got %q", string(out))
	}

	// Capture pid
	pid, err := strconv.Atoi(string(lines[0]))
	if err != nil {
		return 0, err
	}

	return pid, nil
}

// TestClientState_Kill force-kills Nomad agents and restarts them in a tight
// loop to assert Nomad is crash safe.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

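	// Spawn a real server agent and client agent as child processes, with
	// their output prefixed into the test log.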
	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client agent
	// is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
			t.Logf("error stopping job: %v", err)
		}

		testutil.WaitForResult(func() (bool, error) {
			sum, _, err := client.Jobs().Summary(jobID, nil)
			if err != nil {
				return false, err
			}
			if r := sum.Summary["sleeper"].Running; r > 0 {
				return false, fmt.Errorf("still running: %d", r)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})

		//XXX Must use client agent for gc'ing allocs?
		clientAPI, err := clientAgent.Client()
		f.NoError(err)
		if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
			t.Logf("error garbage collecting alloc: %v", err)
		}

		if err := client.System().GarbageCollect(); err != nil {
			t.Logf("error doing full gc: %v", err)
		}

		//HACK to wait until things have GC'd
		time.Sleep(time.Second)
	}()

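	// assertHealthy blocks until the sleeper task reports running, and
	// asserts it has never restarted.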
	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find the sleeper task's pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and restart the client agent a few times
	tries := 10
	for i := 0; i < tries; i++ {
		t.Logf("TEST RUN %d/%d", i+1, tries)

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // killed by signal, not exited
		f.False(state.Success())

		// Assert the sleeper is still running; signal 0 checks that the
		// process exists without delivering a signal.
		f.NoError(syscall.Kill(pid, 0))
		assertHealthy()

		// The alloc filesystem should be unreachable while the client
		// agent is down
		_, err = getPID(client, alloc, "sleeper/pid")
		f.Error(err)

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Retrieving the pid should work again once the agent restarts
		testutil.WaitForResult(func() (bool, error) {
			newPid, err := getPID(client, alloc, "sleeper/pid")
			return newPid == pid, err
		}, func(err error) {
			f.NoError(err)
		})

		// Alloc should still be running
		assertHealthy()
	}
}

// TestClientState_KillDuringRestart force-kills Nomad agents and restarts them
// in a tight loop to assert Nomad is crash safe while a task is restarting.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()

	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client agent
	// is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "restarter-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

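	// Wait for the restarter task to report state, then capture its restart
	// count; the count must never decrease across agent restarts.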
	var restarts uint64
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error

		if len(alloc.TaskStates) == 0 {
			return false, fmt.Errorf("waiting for tasks to start")
		}

		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}

		// Capture current restarts
		restarts = n
		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

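	// Seed a local RNG so each kill lands at a random point in the task's
	// restart cycle.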
	dice := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Kill and restart the agent in a loop for five minutes
	i := 0
	for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
		i++
		sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
		t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

		time.Sleep(sleep)

		// Ensure restarts are progressing
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error
		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}
		if i > 5 && n == 0 {
			// At least one restart should have happened by now
			f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
		}
		restarts = n

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
		t.Logf("[TEST] ===> Killed %d", agentPid)

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // killed by signal, not exited
		f.False(state.Success())

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)
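		// Wait for the restarted agent's HTTP API to come back up before
		// the next kill.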
		clientURL := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
		testutil.WaitForResult(func() (bool, error) {
			resp, err := http.Get(clientURL)
			if err != nil {
				return false, err
			}
			resp.Body.Close()
			if resp.StatusCode != 200 {
				return false, fmt.Errorf("%d != 200", resp.StatusCode)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}

	t.Logf("[TEST] ===> Final restarts: %d", restarts)
}

// TestClientState_Corrupt removes task state from the client's state db to
// assert it recovers.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client agent
	// is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		//                   figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

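	// assertHealthy blocks until the sleeper task reports running, and
	// asserts it has never restarted.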
	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find the sleeper task's pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Gracefully stop the agent so its state db is closed cleanly, then
	// corrupt the state
	agentPid := clientAgent.Cmd.Process.Pid
	f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

	procState, err := clientAgent.Cmd.Process.Wait()
	f.NoError(err)
	f.True(procState.Exited())

	// Assert the sleeper is still running
	f.NoError(syscall.Kill(pid, 0))
	assertHealthy()

	// Remove the task's bucket from the client state db
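	// The client persists alloc and task runner state in a BoltDB under its
	// data dir; deleting the sleeper task's bucket simulates lost or corrupt
	// on-disk state.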
	db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
	f.NoError(db.Close())

	// Restart the agent (have to create a new Cmd)
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	clientAgent.Cmd.Stdout = clientOut
	clientAgent.Cmd.Stderr = clientOut
	f.NoError(clientAgent.Start())

	// Assert a new process did start
	f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

	// Retrieving the pid should work again once the agent restarts.
	// Critically, there are now 2 pids in the file: because the client's
	// task state was lost, Nomad started a new copy of the task.
	testutil.WaitForResult(func() (bool, error) {
		allocfs := client.AllocFS()
		r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
		if err != nil {
			return false, err
		}
		defer r.Close()

		out, err := io.ReadAll(r)
		if err != nil {
			return false, err
		}

		// Expect exactly two newline-terminated lines
		lines := bytes.SplitN(out, []byte{'\n'}, 3)
		if len(lines) != 3 || len(lines[2]) > 0 {
			return false, fmt.Errorf("expected 2 lines, got %q", string(out))
		}

		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Alloc should still be running
	assertHealthy()
}