github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/disconnectedclients/disconnectedclients_test.go (about) 1 package disconnectedclients 2 3 import ( 4 "fmt" 5 "testing" 6 "time" 7 8 "github.com/hashicorp/go-multierror" 9 "github.com/hashicorp/nomad/e2e/e2eutil" 10 "github.com/hashicorp/nomad/helper/uuid" 11 "github.com/hashicorp/nomad/testutil" 12 "github.com/shoenig/test/must" 13 "github.com/stretchr/testify/require" 14 ) 15 16 const ns = "" 17 18 // typical wait times for this test package 19 var wait30s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 30} 20 var wait60s = &e2eutil.WaitConfig{Interval: time.Second, Retries: 60} 21 22 type expectedAllocStatus struct { 23 disconnected string 24 unchanged string 25 replacement string 26 } 27 28 func TestDisconnectedClients(t *testing.T) { 29 30 nomad := e2eutil.NomadClient(t) 31 e2eutil.WaitForLeader(t, nomad) 32 e2eutil.WaitForNodesReady(t, nomad, 2) // needs at least 2 to test replacement 33 34 testCases := []struct { 35 skip bool 36 name string 37 jobFile string 38 disconnectFn func(string, time.Duration) (string, error) 39 expectedAfterDisconnect expectedAllocStatus 40 expectedAfterReconnect expectedAllocStatus 41 }{ 42 { 43 // test that allocations on clients that are netsplit and 44 // marked disconnected are replaced 45 name: "netsplit client no max disconnect", 46 jobFile: "./input/lost_simple.nomad", 47 disconnectFn: e2eutil.AgentDisconnect, 48 expectedAfterDisconnect: expectedAllocStatus{ 49 disconnected: "lost", 50 unchanged: "running", 51 replacement: "running", 52 }, 53 expectedAfterReconnect: expectedAllocStatus{ 54 disconnected: "complete", 55 unchanged: "running", 56 replacement: "running", 57 }, 58 }, 59 { 60 // test that allocations on clients that are netsplit and 61 // marked disconnected are replaced but that the 62 // replacements are rolled back after reconnection 63 skip: true, 64 name: "netsplit client with max disconnect", 65 jobFile: "./input/lost_max_disconnect.nomad", 66 disconnectFn: e2eutil.AgentDisconnect, 67 expectedAfterDisconnect: expectedAllocStatus{ 68 disconnected: "unknown", 69 unchanged: "running", 70 replacement: "running", 71 }, 72 expectedAfterReconnect: expectedAllocStatus{ 73 disconnected: "running", 74 unchanged: "running", 75 replacement: "complete", 76 }, 77 }, 78 { 79 // test that allocations on clients that are shutdown and 80 // marked disconnected are replaced 81 skip: true, 82 name: "shutdown client no max disconnect", 83 jobFile: "./input/lost_simple.nomad", 84 disconnectFn: e2eutil.AgentDisconnect, 85 expectedAfterDisconnect: expectedAllocStatus{ 86 disconnected: "lost", 87 unchanged: "running", 88 replacement: "running", 89 }, 90 expectedAfterReconnect: expectedAllocStatus{ 91 disconnected: "complete", 92 unchanged: "running", 93 replacement: "running", 94 }, 95 }, 96 { 97 // test that allocations on clients that are shutdown and 98 // marked disconnected are replaced 99 skip: true, 100 name: "shutdown client with max disconnect", 101 jobFile: "./input/lost_max_disconnect.nomad", 102 disconnectFn: e2eutil.AgentDisconnect, 103 expectedAfterDisconnect: expectedAllocStatus{ 104 disconnected: "unknown", 105 unchanged: "running", 106 replacement: "running", 107 }, 108 expectedAfterReconnect: expectedAllocStatus{ 109 disconnected: "running", 110 unchanged: "running", 111 replacement: "complete", 112 }, 113 }, 114 } 115 116 for _, tc := range testCases { 117 tc := tc 118 t.Run(tc.name, func(t *testing.T) { 119 120 if tc.skip { 121 t.Skip("SKIP BROKEN TEST") 122 } 123 124 jobIDs := []string{} 125 t.Cleanup(disconnectedClientsCleanup(t)) 126 t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) 127 128 jobID := "test-disconnected-clients-" + uuid.Short() 129 130 err := e2eutil.Register(jobID, tc.jobFile) 131 must.NoError(t, err, must.Sprint("failed to register job")) 132 jobIDs = append(jobIDs, jobID) 133 134 err = e2eutil.WaitForAllocStatusExpected(jobID, ns, 135 []string{"running", "running"}) 136 must.NoError(t, err, must.Sprint("job did not become running")) 137 138 err = e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil) 139 must.NoError(t, err, must.Sprint("deployment did not complete")) 140 141 // pick one alloc to make our disconnected alloc (and its node) 142 allocs, err := e2eutil.AllocsForJob(jobID, ns) 143 must.NoError(t, err, must.Sprint("could not query allocs for job")) 144 must.SliceLen(t, 2, allocs, must.Sprint("could not find 2 allocs for job")) 145 146 disconnectedAllocID := allocs[0]["ID"] 147 disconnectedNodeID := allocs[0]["Node ID"] 148 unchangedAllocID := allocs[1]["ID"] 149 150 // disconnect the node and wait for the results 151 152 restartJobID, err := tc.disconnectFn(disconnectedNodeID, 30*time.Second) 153 must.NoError(t, err, must.Sprint("expected agent disconnect job to register")) 154 jobIDs = append(jobIDs, restartJobID) 155 156 err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "disconnected", wait60s) 157 must.NoError(t, err, must.Sprint("expected node to go down")) 158 must.NoError(t, waitForAllocStatusMap( 159 jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterDisconnect, wait60s), 160 ) 161 162 // wait for the client reconnect 163 164 err = e2eutil.WaitForNodeStatus(disconnectedNodeID, "ready", wait30s) 165 must.NoError(t, err, must.Sprint("expected node to come back up")) 166 must.NoError(t, waitForAllocStatusMap( 167 jobID, disconnectedAllocID, unchangedAllocID, tc.expectedAfterReconnect, wait60s), 168 ) 169 170 // now get the resulting allocations, should be 3 171 172 allocs, err = e2eutil.AllocsForJob(jobID, ns) 173 must.NoError(t, err, must.Sprint("could not query allocs for job")) 174 must.SliceLen(t, 3, allocs, must.Sprint("could not find 3 allocs for job")) 175 }) 176 } 177 178 } 179 180 // disconnectedClientsCleanup sets up a cleanup function to make sure 181 // we've waited for all the nodes to come back up between tests 182 func disconnectedClientsCleanup(t *testing.T) func() { 183 nodeIDs := []string{} 184 nodeStatuses, err := e2eutil.NodeStatusList() 185 require.NoError(t, err) 186 for _, nodeStatus := range nodeStatuses { 187 nodeIDs = append(nodeIDs, nodeStatus["ID"]) 188 } 189 return func() { 190 nomad := e2eutil.NomadClient(t) 191 t.Logf("waiting for %d nodes to become ready again", len(nodeIDs)) 192 e2eutil.WaitForNodesReady(t, nomad, len(nodeIDs)) 193 } 194 } 195 196 func waitForAllocStatusMap(jobID, disconnectedAllocID, unchangedAllocID string, expected expectedAllocStatus, wc *e2eutil.WaitConfig) error { 197 var err error 198 interval, retries := wc.OrDefault() 199 testutil.WaitForResultRetries(retries, func() (bool, error) { 200 time.Sleep(interval) 201 allocs, err := e2eutil.AllocsForJob(jobID, ns) 202 if err != nil { 203 return false, err 204 } 205 206 var merr *multierror.Error 207 208 for _, alloc := range allocs { 209 switch allocID, allocStatus := alloc["ID"], alloc["Status"]; allocID { 210 case disconnectedAllocID: 211 if allocStatus != expected.disconnected { 212 merr = multierror.Append(merr, fmt.Errorf( 213 "disconnected alloc %q on node %q should be %q, got %q", 214 allocID, alloc["Node ID"], expected.disconnected, allocStatus)) 215 } 216 case unchangedAllocID: 217 if allocStatus != expected.unchanged { 218 merr = multierror.Append(merr, fmt.Errorf( 219 "unchanged alloc %q on node %q should be %q, got %q", 220 allocID, alloc["Node ID"], expected.unchanged, allocStatus)) 221 } 222 default: 223 if allocStatus != expected.replacement { 224 merr = multierror.Append(merr, fmt.Errorf( 225 "replacement alloc %q on node %q should be %q, got %q", 226 allocID, alloc["Node ID"], expected.replacement, allocStatus)) 227 } 228 } 229 } 230 if merr != nil { 231 return false, merr.ErrorOrNil() 232 } 233 return true, nil 234 }, func(e error) { 235 err = e 236 }) 237 238 // TODO(tgross): remove this block once this test has stabilized 239 if err != nil { 240 fmt.Printf("test failed, printing allocation status of all %q allocs for analysis\n", jobID) 241 fmt.Println("----------------") 242 allocs, _ := e2eutil.AllocsForJob(jobID, ns) 243 for _, alloc := range allocs { 244 out, _ := e2eutil.Command("nomad", "alloc", "status", alloc["ID"]) 245 fmt.Println(out) 246 fmt.Println("----------------") 247 } 248 } 249 250 return err 251 }