github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/disconnectedclients/disconnectedclients_test.go

github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/disconnectedclients/disconnectedclients_test.go (about)

     1  package disconnectedclients
     2  
     3  import (
     4  	"fmt"
     5  	"testing"
     6  	"time"
     7  
     8  	"github.com/hashicorp/go-multierror"
     9  	"github.com/hashicorp/nomad/e2e/e2eutil"
    10  	"github.com/hashicorp/nomad/helper/uuid"
    11  	"github.com/hashicorp/nomad/testutil"
    12  	"github.com/shoenig/test/must"
    13  	"github.com/stretchr/testify/require"
    14  )
    15  
    16  const ns = ""
    17  
    18  // typical wait times for this test package
    19  var wait30s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 30}
    20  var wait60s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 60}
    21  
// expectedAllocStatus is the set of allocation statuses that
// waitForAllocStatusMap waits for at one point in a test case.
type expectedAllocStatus struct {
	// status expected of the allocation on the node that gets disconnected
	disconnected string
	// status expected of the allocation whose node is left alone
	unchanged    string
	// status expected of every other allocation of the job
	replacement  string
}
    27  
    28  func TestDisconnectedClients(t *testing.T) {
    29  
    30  	nomad := e2eutil.NomadClient(t)
    31  	e2eutil.WaitForLeader(t, nomad)
    32  	e2eutil.WaitForNodesReady(t, nomad, 2) // needs at least 2 to test replacement
    33  
    34  	testCases := []struct {
    35  		skip                    bool
    36  		name                    string
    37  		jobFile                 string
    38  		disconnectFn            func(string, time.Duration) (string, error)
    39  		expectedAfterDisconnect expectedAllocStatus
    40  		expectedAfterReconnect  expectedAllocStatus
    41  	}{
    42  		{
    43  			// test that allocations on clients that are netsplit and
    44  			// marked disconnected are replaced
    45  			name:         "netsplit client no max disconnect",
    46  			jobFile:      "./input/lost_simple.nomad",
    47  			disconnectFn: e2eutil.AgentDisconnect,
    48  			expectedAfterDisconnect: expectedAllocStatus{
    49  				disconnected: "lost",
    50  				unchanged:    "running",
    51  				replacement:  "running",
    52  			},
    53  			expectedAfterReconnect: expectedAllocStatus{
    54  				disconnected: "complete",
    55  				unchanged:    "running",
    56  				replacement:  "running",
    57  			},
    58  		},
    59  		{
    60  			// test that allocations on clients that are netsplit and
    61  			// marked disconnected are replaced but that the
    62  			// replacements are rolled back after reconnection
    63  			skip:         true,
    64  			name:         "netsplit client with max disconnect",
    65  			jobFile:      "./input/lost_max_disconnect.nomad",
    66  			disconnectFn: e2eutil.AgentDisconnect,
    67  			expectedAfterDisconnect: expectedAllocStatus{
    68  				disconnected: "unknown",
    69  				unchanged:    "running",
    70  				replacement:  "running",
    71  			},
    72  			expectedAfterReconnect: expectedAllocStatus{
    73  				disconnected: "running",
    74  				unchanged:    "running",
    75  				replacement:  "complete",
    76  			},
    77  		},
    78  		{
    79  			// test that allocations on clients that are shutdown and
    80  			// marked disconnected are replaced
    81  			skip:         true,
    82  			name:         "shutdown client no max disconnect",
    83  			jobFile:      "./input/lost_simple.nomad",
    84  			disconnectFn: e2eutil.AgentDisconnect,
    85  			expectedAfterDisconnect: expectedAllocStatus{
    86  				disconnected: "lost",
    87  				unchanged:    "running",
    88  				replacement:  "running",
    89  			},
    90  			expectedAfterReconnect: expectedAllocStatus{
    91  				disconnected: "complete",
    92  				unchanged:    "running",
    93  				replacement:  "running",
    94  			},
    95  		},
    96  		{
    97  			// test that allocations on clients that are shutdown and
    98  			// marked disconnected are replaced
    99  			skip:         true,
   100  			name:         "shutdown client with max disconnect",
   101  			jobFile:      "./input/lost_max_disconnect.nomad",
   102  			disconnectFn: e2eutil.AgentDisconnect,
   103  			expectedAfterDisconnect: expectedAllocStatus{
   104  				disconnected: "unknown",
   105  				unchanged:    "running",
   106  				replacement:  "running",
   107  			},
   108  			expectedAfterReconnect: expectedAllocStatus{
   109  				disconnected: "running",
   110  				unchanged:    "running",
   111  				replacement:  "complete",
   112  			},
   113  		},
   114  	}
   115  
   116  	for _, tc := range testCases {
   117  		tc := tc
   118  		t.Run(tc.name, func(t *testing.T) {
   119  
   120  			if tc.skip {
   121  				t.Skip("SKIP BROKEN TEST")
   122  			}
   123  
   124  			jobIDs := []string{}
   125  			t.Cleanup(disconnectedClientsCleanup(t))
   126  			t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))
   127  
   128  			jobID := "test-disconnected-clients-" + uuid.Short()
   129  
   130  			err := e2eutil.Register(jobID, tc.jobFile)
   131  			must.NoError(t, err, must.Sprint("failed to register job"))
   132  			jobIDs = append(jobIDs, jobID)
   133  
   134  			err = e2eutil.WaitForAllocStatusExpected(jobID, ns,
   135  				[]string{"running", "running"})
   136  			must.NoError(t, err, must.Sprint("job did not become running"))
   137  
   138  			err = e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil)
   139  			must.NoError(t, err, must.Sprint("deployment did not complete"))
   140  
   141  			// pick one alloc to make our disconnected alloc (and its node)
   142  			allocs, err := e2eutil.AllocsForJob(jobID, ns)
   143  			must.NoError(t, err, must.Sprint("could not query allocs for job"))
   144  			must.SliceLen(t, 2, allocs, must.Sprint("could not find 2 allocs for job"))
   145  
   146  			disconnectedAllocID := allocs[0]["ID"]
   147  			disconnectedNodeID := allocs[0]["Node ID"]
   148  			unchangedAllocID := allocs[1]["ID"]
   149  
   150  			// disconnect the node and wait for the results
   151  
   152  			restartJobID, err := tc.disconnectFn(disconnectedNodeID, 30*time.Second)
   153  			must.NoError(t, err, must.Sprint("expected agent disconnect job to register"))
   154  			jobIDs = append(jobIDs, restartJobID)
   155  
   156  			err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "disconnected", wait60s)
   157  			must.NoError(t, err, must.Sprint("expected node to go down"))
   158  			must.NoError(t, waitForAllocStatusMap(
   159  				jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterDisconnect, wait60s),
   160  			)
   161  
   162  			// wait for the client reconnect
   163  
   164  			err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "ready", wait30s)
   165  			must.NoError(t, err, must.Sprint("expected node to come back up"))
   166  			must.NoError(t, waitForAllocStatusMap(
   167  				jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterReconnect, wait60s),
   168  			)
   169  
   170  			// now get the resulting allocations, should be 3
   171  
   172  			allocs, err = e2eutil.AllocsForJob(jobID, ns)
   173  			must.NoError(t, err, must.Sprint("could not query allocs for job"))
   174  			must.SliceLen(t, 3, allocs, must.Sprint("could not find 3 allocs for job"))
   175  		})
   176  	}
   177  
   178  }
   179  
   180  // disconnectedClientsCleanup sets up a cleanup function to make sure
   181  // we've waited for all the nodes to come back up between tests
   182  func disconnectedClientsCleanup(t *testing.T) func() {
   183  	nodeIDs := []string{}
   184  	nodeStatuses, err := e2eutil.NodeStatusList()
   185  	require.NoError(t, err)
   186  	for _, nodeStatus := range nodeStatuses {
   187  		nodeIDs = append(nodeIDs, nodeStatus["ID"])
   188  	}
   189  	return func() {
   190  		nomad := e2eutil.NomadClient(t)
   191  		t.Logf("waiting for %d nodes to become ready again", len(nodeIDs))
   192  		e2eutil.WaitForNodesReady(t, nomad, len(nodeIDs))
   193  	}
   194  }
   195  
   196  func waitForAllocStatusMap(jobID, disconnectedAllocID, unchangedAllocID string, expected expectedAllocStatus, wc *e2eutil.WaitConfig) error {
   197  	var err error
   198  	interval, retries := wc.OrDefault()
   199  	testutil.WaitForResultRetries(retries, func() (bool, error) {
   200  		time.Sleep(interval)
   201  		allocs, err := e2eutil.AllocsForJob(jobID, ns)
   202  		if err != nil {
   203  			return false, err
   204  		}
   205  
   206  		var merr *multierror.Error
   207  
   208  		for _, alloc := range allocs {
   209  			switch allocID, allocStatus := alloc["ID"], alloc["Status"]; allocID {
   210  			case disconnectedAllocID:
   211  				if allocStatus != expected.disconnected {
   212  					merr = multierror.Append(merr, fmt.Errorf(
   213  						"disconnected alloc %q on node %q should be %q, got %q",
   214  						allocID, alloc["Node ID"], expected.disconnected, allocStatus))
   215  				}
   216  			case unchangedAllocID:
   217  				if allocStatus != expected.unchanged {
   218  					merr = multierror.Append(merr, fmt.Errorf(
   219  						"unchanged alloc %q on node %q should be %q, got %q",
   220  						allocID, alloc["Node ID"], expected.unchanged, allocStatus))
   221  				}
   222  			default:
   223  				if allocStatus != expected.replacement {
   224  					merr = multierror.Append(merr, fmt.Errorf(
   225  						"replacement alloc %q on node %q should be %q, got %q",
   226  						allocID, alloc["Node ID"], expected.replacement, allocStatus))
   227  				}
   228  			}
   229  		}
   230  		if merr != nil {
   231  			return false, merr.ErrorOrNil()
   232  		}
   233  		return true, nil
   234  	}, func(e error) {
   235  		err = e
   236  	})
   237  
   238  	// TODO(tgross): remove this block once this test has stabilized
   239  	if err != nil {
   240  		fmt.Printf("test failed, printing allocation status of all %q allocs for analysis\n", jobID)
   241  		fmt.Println("----------------")
   242  		allocs, _ := e2eutil.AllocsForJob(jobID, ns)
   243  		for _, alloc := range allocs {
   244  			out, _ := e2eutil.Command("nomad", "alloc", "status", alloc["ID"])
   245  			fmt.Println(out)
   246  			fmt.Println("----------------")
   247  		}
   248  	}
   249  
   250  	return err
   251  }