github.com/hernad/nomad@v1.6.112/e2e/disconnectedclients/disconnectedclients_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package disconnectedclients 5 6 import ( 7 "fmt" 8 "testing" 9 "time" 10 11 "github.com/hashicorp/go-multierror" 12 "github.com/hernad/nomad/e2e/e2eutil" 13 "github.com/hernad/nomad/helper/uuid" 14 "github.com/hernad/nomad/testutil" 15 "github.com/shoenig/test/must" 16 "github.com/stretchr/testify/require" 17 ) 18 19 const ns = "" 20 21 // typical wait times for this test package 22 var wait30s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 30} 23 var wait60s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 60} 24 25 type expectedAllocStatus struct { 26 disconnected string 27 unchanged string 28 replacement string 29 } 30 31 func TestDisconnectedClients(t *testing.T) { 32 t.Skip("disconnected clients tests disabled for now") 33 34 nomad := e2eutil.NomadClient(t) 35 e2eutil.WaitForLeader(t, nomad) 36 e2eutil.WaitForNodesReady(t, nomad, 2) // needs at least 2 to test replacement 37 38 testCases := []struct { 39 skip bool 40 name string 41 jobFile string 42 disconnectFn func(string, time.Duration) (string, error) 43 expectedAfterDisconnect expectedAllocStatus 44 expectedAfterReconnect expectedAllocStatus 45 }{ 46 { 47 // test that allocations on clients that are netsplit and 48 // marked disconnected are replaced 49 name: "netsplit client no max disconnect", 50 jobFile: "./input/lost_simple.nomad", 51 disconnectFn: e2eutil.AgentDisconnect, 52 expectedAfterDisconnect: expectedAllocStatus{ 53 disconnected: "lost", 54 unchanged: "running", 55 replacement: "running", 56 }, 57 expectedAfterReconnect: expectedAllocStatus{ 58 disconnected: "complete", 59 unchanged: "running", 60 replacement: "running", 61 }, 62 }, 63 { 64 // test that allocations on clients that are netsplit and 65 // marked disconnected are replaced but that the 66 // replacements are rolled back after reconnection 67 skip: true, 68 name: "netsplit client with max disconnect", 69 jobFile: "./input/lost_max_disconnect.nomad", 70 disconnectFn: e2eutil.AgentDisconnect, 71 expectedAfterDisconnect: expectedAllocStatus{ 72 disconnected: "unknown", 73 unchanged: "running", 74 replacement: "running", 75 }, 76 expectedAfterReconnect: expectedAllocStatus{ 77 disconnected: "running", 78 unchanged: "running", 79 replacement: "complete", 80 }, 81 }, 82 { 83 // test that allocations on clients that are shutdown and 84 // marked disconnected are replaced 85 skip: true, 86 name: "shutdown client no max disconnect", 87 jobFile: "./input/lost_simple.nomad", 88 disconnectFn: e2eutil.AgentDisconnect, 89 expectedAfterDisconnect: expectedAllocStatus{ 90 disconnected: "lost", 91 unchanged: "running", 92 replacement: "running", 93 }, 94 expectedAfterReconnect: expectedAllocStatus{ 95 disconnected: "complete", 96 unchanged: "running", 97 replacement: "running", 98 }, 99 }, 100 { 101 // test that allocations on clients that are shutdown and 102 // marked disconnected are replaced 103 skip: true, 104 name: "shutdown client with max disconnect", 105 jobFile: "./input/lost_max_disconnect.nomad", 106 disconnectFn: e2eutil.AgentDisconnect, 107 expectedAfterDisconnect: expectedAllocStatus{ 108 disconnected: "unknown", 109 unchanged: "running", 110 replacement: "running", 111 }, 112 expectedAfterReconnect: expectedAllocStatus{ 113 disconnected: "running", 114 unchanged: "running", 115 replacement: "complete", 116 }, 117 }, 118 } 119 120 for _, tc := range testCases { 121 tc := tc 122 t.Run(tc.name, func(t *testing.T) { 123 124 if tc.skip { 125 t.Skip("SKIP BROKEN TEST") 126 } 127 128 jobIDs := []string{} 129 t.Cleanup(disconnectedClientsCleanup(t)) 130 t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) 131 132 jobID := "test-disconnected-clients-" + uuid.Short() 133 134 err := e2eutil.Register(jobID, tc.jobFile) 135 must.NoError(t, err, must.Sprint("failed to register job")) 136 jobIDs = append(jobIDs, jobID) 137 138 err = e2eutil.WaitForAllocStatusExpected(jobID, ns, 139 []string{"running", "running"}) 140 must.NoError(t, err, must.Sprint("job did not become running")) 141 142 err = e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil) 143 must.NoError(t, err, must.Sprint("deployment did not complete")) 144 145 // pick one alloc to make our disconnected alloc (and its node) 146 allocs, err := e2eutil.AllocsForJob(jobID, ns) 147 must.NoError(t, err, must.Sprint("could not query allocs for job")) 148 must.SliceLen(t, 2, allocs, must.Sprint("could not find 2 allocs for job")) 149 150 disconnectedAllocID := allocs[0]["ID"] 151 disconnectedNodeID := allocs[0]["Node ID"] 152 unchangedAllocID := allocs[1]["ID"] 153 154 // disconnect the node and wait for the results 155 156 restartJobID, err := tc.disconnectFn(disconnectedNodeID, 30*time.Second) 157 must.NoError(t, err, must.Sprint("expected agent disconnect job to register")) 158 jobIDs = append(jobIDs, restartJobID) 159 160 err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "disconnected", wait60s) 161 must.NoError(t, err, must.Sprint("expected node to go down")) 162 must.NoError(t, waitForAllocStatusMap( 163 jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterDisconnect, wait60s), 164 ) 165 166 // wait for the client reconnect 167 168 err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "ready", wait30s) 169 must.NoError(t, err, must.Sprint("expected node to come back up")) 170 must.NoError(t, waitForAllocStatusMap( 171 jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterReconnect, wait60s), 172 ) 173 174 // now get the resulting allocations, should be 3 175 176 allocs, err = e2eutil.AllocsForJob(jobID, ns) 177 must.NoError(t, err, must.Sprint("could not query allocs for job")) 178 must.SliceLen(t, 3, allocs, must.Sprint("could not find 3 allocs for job")) 179 }) 180 } 181 182 } 183 184 // disconnectedClientsCleanup sets up a cleanup function to make sure 185 // we've waited for all the nodes to come back up between tests 186 func disconnectedClientsCleanup(t *testing.T) func() { 187 nodeIDs := []string{} 188 nodeStatuses, err := e2eutil.NodeStatusList() 189 require.NoError(t, err) 190 for _, nodeStatus := range nodeStatuses { 191 nodeIDs = append(nodeIDs, nodeStatus["ID"]) 192 } 193 return func() { 194 nomad := e2eutil.NomadClient(t) 195 t.Logf("waiting for %d nodes to become ready again", len(nodeIDs)) 196 e2eutil.WaitForNodesReady(t, nomad, len(nodeIDs)) 197 } 198 } 199 200 func waitForAllocStatusMap(jobID, disconnectedAllocID, unchangedAllocID string, expected expectedAllocStatus, wc *e2eutil.WaitConfig) error { 201 var err error 202 interval, retries := wc.OrDefault() 203 testutil.WaitForResultRetries(retries, func() (bool, error) { 204 time.Sleep(interval) 205 allocs, err := e2eutil.AllocsForJob(jobID, ns) 206 if err != nil { 207 return false, err 208 } 209 210 var merr *multierror.Error 211 212 for _, alloc := range allocs { 213 switch allocID, allocStatus := alloc["ID"], alloc["Status"]; allocID { 214 case disconnectedAllocID: 215 if allocStatus != expected.disconnected { 216 merr = multierror.Append(merr, fmt.Errorf( 217 "disconnected alloc %q on node %q should be %q, got %q", 218 allocID, alloc["Node ID"], expected.disconnected, allocStatus)) 219 } 220 case unchangedAllocID: 221 if allocStatus != expected.unchanged { 222 merr = multierror.Append(merr, fmt.Errorf( 223 "unchanged alloc %q on node %q should be %q, got %q", 224 allocID, alloc["Node ID"], expected.unchanged, allocStatus)) 225 } 226 default: 227 if allocStatus != expected.replacement { 228 merr = multierror.Append(merr, fmt.Errorf( 229 "replacement alloc %q on node %q should be %q, got %q", 230 allocID, alloc["Node ID"], expected.replacement, allocStatus)) 231 } 232 } 233 } 234 if merr != nil { 235 return false, merr.ErrorOrNil() 236 } 237 return true, nil 238 }, func(e error) { 239 err = e 240 }) 241 242 // TODO(tgross): remove this block once this test has stabilized 243 if err != nil { 244 fmt.Printf("test failed, printing allocation status of all %q allocs for analysis\n", jobID) 245 fmt.Println("----------------") 246 allocs, _ := e2eutil.AllocsForJob(jobID, ns) 247 for _, alloc := range allocs { 248 out, _ := e2eutil.Command("nomad", "alloc", "status", alloc["ID"]) 249 fmt.Println(out) 250 fmt.Println("----------------") 251 } 252 } 253 254 return err 255 }