github.com/hernad/nomad@v1.6.112/e2e/disconnectedclients/disconnectedclients_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package disconnectedclients
     5  
     6  import (
     7  	"fmt"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/hashicorp/go-multierror"
    12  	"github.com/hernad/nomad/e2e/e2eutil"
    13  	"github.com/hernad/nomad/helper/uuid"
    14  	"github.com/hernad/nomad/testutil"
    15  	"github.com/shoenig/test/must"
    16  	"github.com/stretchr/testify/require"
    17  )
    18  
    19  const ns = ""
    20  
    21  // typical wait times for this test package
    22  var wait30s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 30}
    23  var wait60s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 60}
    24  
    25  type expectedAllocStatus struct {
    26  	disconnected string
    27  	unchanged    string
    28  	replacement  string
    29  }
    30  
    31  func TestDisconnectedClients(t *testing.T) {
    32  	t.Skip("disconnected clients tests disabled for now")
    33  
    34  	nomad := e2eutil.NomadClient(t)
    35  	e2eutil.WaitForLeader(t, nomad)
    36  	e2eutil.WaitForNodesReady(t, nomad, 2) // needs at least 2 to test replacement
    37  
    38  	testCases := []struct {
    39  		skip                    bool
    40  		name                    string
    41  		jobFile                 string
    42  		disconnectFn            func(string, time.Duration) (string, error)
    43  		expectedAfterDisconnect expectedAllocStatus
    44  		expectedAfterReconnect  expectedAllocStatus
    45  	}{
    46  		{
    47  			// test that allocations on clients that are netsplit and
    48  			// marked disconnected are replaced
    49  			name:         "netsplit client no max disconnect",
    50  			jobFile:      "./input/lost_simple.nomad",
    51  			disconnectFn: e2eutil.AgentDisconnect,
    52  			expectedAfterDisconnect: expectedAllocStatus{
    53  				disconnected: "lost",
    54  				unchanged:    "running",
    55  				replacement:  "running",
    56  			},
    57  			expectedAfterReconnect: expectedAllocStatus{
    58  				disconnected: "complete",
    59  				unchanged:    "running",
    60  				replacement:  "running",
    61  			},
    62  		},
    63  		{
    64  			// test that allocations on clients that are netsplit and
    65  			// marked disconnected are replaced but that the
    66  			// replacements are rolled back after reconnection
    67  			skip:         true,
    68  			name:         "netsplit client with max disconnect",
    69  			jobFile:      "./input/lost_max_disconnect.nomad",
    70  			disconnectFn: e2eutil.AgentDisconnect,
    71  			expectedAfterDisconnect: expectedAllocStatus{
    72  				disconnected: "unknown",
    73  				unchanged:    "running",
    74  				replacement:  "running",
    75  			},
    76  			expectedAfterReconnect: expectedAllocStatus{
    77  				disconnected: "running",
    78  				unchanged:    "running",
    79  				replacement:  "complete",
    80  			},
    81  		},
    82  		{
    83  			// test that allocations on clients that are shutdown and
    84  			// marked disconnected are replaced
    85  			skip:         true,
    86  			name:         "shutdown client no max disconnect",
    87  			jobFile:      "./input/lost_simple.nomad",
    88  			disconnectFn: e2eutil.AgentDisconnect,
    89  			expectedAfterDisconnect: expectedAllocStatus{
    90  				disconnected: "lost",
    91  				unchanged:    "running",
    92  				replacement:  "running",
    93  			},
    94  			expectedAfterReconnect: expectedAllocStatus{
    95  				disconnected: "complete",
    96  				unchanged:    "running",
    97  				replacement:  "running",
    98  			},
    99  		},
   100  		{
   101  			// test that allocations on clients that are shutdown and
   102  			// marked disconnected are replaced
   103  			skip:         true,
   104  			name:         "shutdown client with max disconnect",
   105  			jobFile:      "./input/lost_max_disconnect.nomad",
   106  			disconnectFn: e2eutil.AgentDisconnect,
   107  			expectedAfterDisconnect: expectedAllocStatus{
   108  				disconnected: "unknown",
   109  				unchanged:    "running",
   110  				replacement:  "running",
   111  			},
   112  			expectedAfterReconnect: expectedAllocStatus{
   113  				disconnected: "running",
   114  				unchanged:    "running",
   115  				replacement:  "complete",
   116  			},
   117  		},
   118  	}
   119  
   120  	for _, tc := range testCases {
   121  		tc := tc
   122  		t.Run(tc.name, func(t *testing.T) {
   123  
   124  			if tc.skip {
   125  				t.Skip("SKIP BROKEN TEST")
   126  			}
   127  
   128  			jobIDs := []string{}
   129  			t.Cleanup(disconnectedClientsCleanup(t))
   130  			t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))
   131  
   132  			jobID := "test-disconnected-clients-" + uuid.Short()
   133  
   134  			err := e2eutil.Register(jobID, tc.jobFile)
   135  			must.NoError(t, err, must.Sprint("failed to register job"))
   136  			jobIDs = append(jobIDs, jobID)
   137  
   138  			err = e2eutil.WaitForAllocStatusExpected(jobID, ns,
   139  				[]string{"running", "running"})
   140  			must.NoError(t, err, must.Sprint("job did not become running"))
   141  
   142  			err = e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil)
   143  			must.NoError(t, err, must.Sprint("deployment did not complete"))
   144  
   145  			// pick one alloc to make our disconnected alloc (and its node)
   146  			allocs, err := e2eutil.AllocsForJob(jobID, ns)
   147  			must.NoError(t, err, must.Sprint("could not query allocs for job"))
   148  			must.SliceLen(t, 2, allocs, must.Sprint("could not find 2 allocs for job"))
   149  
   150  			disconnectedAllocID := allocs[0]["ID"]
   151  			disconnectedNodeID := allocs[0]["Node ID"]
   152  			unchangedAllocID := allocs[1]["ID"]
   153  
   154  			// disconnect the node and wait for the results
   155  
   156  			restartJobID, err := tc.disconnectFn(disconnectedNodeID, 30*time.Second)
   157  			must.NoError(t, err, must.Sprint("expected agent disconnect job to register"))
   158  			jobIDs = append(jobIDs, restartJobID)
   159  
   160  			err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "disconnected", wait60s)
   161  			must.NoError(t, err, must.Sprint("expected node to go down"))
   162  			must.NoError(t, waitForAllocStatusMap(
   163  				jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterDisconnect, wait60s),
   164  			)
   165  
   166  			// wait for the client reconnect
   167  
   168  			err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "ready", wait30s)
   169  			must.NoError(t, err, must.Sprint("expected node to come back up"))
   170  			must.NoError(t, waitForAllocStatusMap(
   171  				jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterReconnect, wait60s),
   172  			)
   173  
   174  			// now get the resulting allocations, should be 3
   175  
   176  			allocs, err = e2eutil.AllocsForJob(jobID, ns)
   177  			must.NoError(t, err, must.Sprint("could not query allocs for job"))
   178  			must.SliceLen(t, 3, allocs, must.Sprint("could not find 3 allocs for job"))
   179  		})
   180  	}
   181  
   182  }
   183  
   184  // disconnectedClientsCleanup sets up a cleanup function to make sure
   185  // we've waited for all the nodes to come back up between tests
   186  func disconnectedClientsCleanup(t *testing.T) func() {
   187  	nodeIDs := []string{}
   188  	nodeStatuses, err := e2eutil.NodeStatusList()
   189  	require.NoError(t, err)
   190  	for _, nodeStatus := range nodeStatuses {
   191  		nodeIDs = append(nodeIDs, nodeStatus["ID"])
   192  	}
   193  	return func() {
   194  		nomad := e2eutil.NomadClient(t)
   195  		t.Logf("waiting for %d nodes to become ready again", len(nodeIDs))
   196  		e2eutil.WaitForNodesReady(t, nomad, len(nodeIDs))
   197  	}
   198  }
   199  
   200  func waitForAllocStatusMap(jobID, disconnectedAllocID, unchangedAllocID string, expected expectedAllocStatus, wc *e2eutil.WaitConfig) error {
   201  	var err error
   202  	interval, retries := wc.OrDefault()
   203  	testutil.WaitForResultRetries(retries, func() (bool, error) {
   204  		time.Sleep(interval)
   205  		allocs, err := e2eutil.AllocsForJob(jobID, ns)
   206  		if err != nil {
   207  			return false, err
   208  		}
   209  
   210  		var merr *multierror.Error
   211  
   212  		for _, alloc := range allocs {
   213  			switch allocID, allocStatus := alloc["ID"], alloc["Status"]; allocID {
   214  			case disconnectedAllocID:
   215  				if allocStatus != expected.disconnected {
   216  					merr = multierror.Append(merr, fmt.Errorf(
   217  						"disconnected alloc %q on node %q should be %q, got %q",
   218  						allocID, alloc["Node ID"], expected.disconnected, allocStatus))
   219  				}
   220  			case unchangedAllocID:
   221  				if allocStatus != expected.unchanged {
   222  					merr = multierror.Append(merr, fmt.Errorf(
   223  						"unchanged alloc %q on node %q should be %q, got %q",
   224  						allocID, alloc["Node ID"], expected.unchanged, allocStatus))
   225  				}
   226  			default:
   227  				if allocStatus != expected.replacement {
   228  					merr = multierror.Append(merr, fmt.Errorf(
   229  						"replacement alloc %q on node %q should be %q, got %q",
   230  						allocID, alloc["Node ID"], expected.replacement, allocStatus))
   231  				}
   232  			}
   233  		}
   234  		if merr != nil {
   235  			return false, merr.ErrorOrNil()
   236  		}
   237  		return true, nil
   238  	}, func(e error) {
   239  		err = e
   240  	})
   241  
   242  	// TODO(tgross): remove this block once this test has stabilized
   243  	if err != nil {
   244  		fmt.Printf("test failed, printing allocation status of all %q allocs for analysis\n", jobID)
   245  		fmt.Println("----------------")
   246  		allocs, _ := e2eutil.AllocsForJob(jobID, ns)
   247  		for _, alloc := range allocs {
   248  			out, _ := e2eutil.Command("nomad", "alloc", "status", alloc["ID"])
   249  			fmt.Println(out)
   250  			fmt.Println("----------------")
   251  		}
   252  	}
   253  
   254  	return err
   255  }