// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package consul

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"strings"
	"time"

	capi "github.com/hashicorp/consul/api"
	napi "github.com/hernad/nomad/api"
	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/stretchr/testify/require"
)

// ScriptChecksE2ETest exercises registration, update, and deregistration of
// Consul script checks for Nomad jobs. jobIds records every job ID registered
// during the test run so AfterEach can stop and garbage-collect them.
type ScriptChecksE2ETest struct {
	framework.TC
	jobIds []string
}

// BeforeAll gates the suite on a usable cluster: a leader must be elected and
// at least one client node must be ready before any allocations are placed.
func (tc *ScriptChecksE2ETest) BeforeAll(f *framework.F) {
	// Ensure cluster has leader before running tests
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	// Ensure that we have at least 1 client node in ready state
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1)
}

// TestGroupScriptCheck runs a job with a single task group with several services
// and associated script checks. It updates, stops, etc. the job to verify
// that script checks are re-registered as expected.
37 func (tc *ScriptChecksE2ETest) TestGroupScriptCheck(f *framework.F) { 38 r := require.New(f.T()) 39 40 nomadClient := tc.Nomad() 41 consulClient := tc.Consul() 42 43 jobId := "checks_group" + uuid.Short() 44 tc.jobIds = append(tc.jobIds, jobId) 45 46 // Job run: verify that checks were registered in Consul 47 allocs := e2eutil.RegisterAndWaitForAllocs(f.T(), 48 nomadClient, "consul/input/checks_group.nomad", jobId, "") 49 r.Equal(1, len(allocs)) 50 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-1", capi.HealthPassing) 51 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-2", capi.HealthWarning) 52 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-3", capi.HealthCritical) 53 54 // Check in warning state becomes healthy after check passes 55 _, _, err := exec(nomadClient, allocs, 56 []string{"/bin/sh", "-c", "touch /tmp/${NOMAD_ALLOC_ID}-alive-2b"}) 57 r.NoError(err) 58 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-2", capi.HealthPassing) 59 60 // Job update: verify checks are re-registered in Consul 61 allocs = e2eutil.RegisterAndWaitForAllocs(f.T(), 62 nomadClient, "consul/input/checks_group_update.nomad", jobId, "") 63 r.Equal(1, len(allocs)) 64 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-1", capi.HealthPassing) 65 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-2", capi.HealthPassing) 66 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-3", capi.HealthCritical) 67 68 // Verify we don't have any linger script checks running on the client 69 out, _, err := exec(nomadClient, allocs, []string{"pgrep", "sleep"}) 70 r.NoError(err) 71 running := strings.Split(strings.TrimSpace(out.String()), "\n") 72 r.LessOrEqual(len(running), 2) // task itself + 1 check == 2 73 74 // Clean job stop: verify that checks were deregistered in Consul 75 _, _, err = 
nomadClient.Jobs().Deregister(jobId, false, nil) // nomad job stop 76 r.NoError(err) 77 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "group-service-1") 78 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "group-service-2") 79 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "group-service-3") 80 81 // Restore for next test 82 allocs = e2eutil.RegisterAndWaitForAllocs(f.T(), 83 nomadClient, "consul/input/checks_group.nomad", jobId, "") 84 r.Equal(2, len(allocs)) 85 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-1", capi.HealthPassing) 86 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-2", capi.HealthWarning) 87 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-3", capi.HealthCritical) 88 89 // Crash a task: verify that checks become healthy again 90 _, _, err = exec(nomadClient, allocs, []string{"pkill", "sleep"}) 91 if err != nil && err.Error() != "plugin is shut down" { 92 r.FailNow("unexpected error: %v", err) 93 } 94 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-1", capi.HealthPassing) 95 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-2", capi.HealthWarning) 96 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "group-service-3", capi.HealthCritical) 97 98 // TODO(tgross) ... 99 // Restart client: verify that checks are re-registered 100 } 101 102 // TestTaskScriptCheck runs a job with a single task with several services 103 // and associated script checks. It updates, stops, etc. the job to verify 104 // that script checks are re-registered as expected. 
105 func (tc *ScriptChecksE2ETest) TestTaskScriptCheck(f *framework.F) { 106 r := require.New(f.T()) 107 108 nomadClient := tc.Nomad() 109 consulClient := tc.Consul() 110 111 jobId := "checks_task" + uuid.Short() 112 tc.jobIds = append(tc.jobIds, jobId) 113 114 // Job run: verify that checks were registered in Consul 115 allocs := e2eutil.RegisterAndWaitForAllocs(f.T(), 116 nomadClient, "consul/input/checks_task.nomad", jobId, "") 117 r.Equal(1, len(allocs)) 118 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-1", capi.HealthPassing) 119 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-2", capi.HealthWarning) 120 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-3", capi.HealthCritical) 121 122 // Check in warning state becomes healthy after check passes 123 _, _, err := exec(nomadClient, allocs, 124 []string{"/bin/sh", "-c", "touch ${NOMAD_TASK_DIR}/alive-2b"}) 125 r.NoError(err) 126 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-2", capi.HealthPassing) 127 128 // Job update: verify checks are re-registered in Consul 129 allocs = e2eutil.RegisterAndWaitForAllocs(f.T(), 130 nomadClient, "consul/input/checks_task_update.nomad", jobId, "") 131 r.Equal(1, len(allocs)) 132 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-1", capi.HealthPassing) 133 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-2", capi.HealthPassing) 134 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-3", capi.HealthCritical) 135 136 // Verify we don't have any linger script checks running on the client 137 out, _, err := exec(nomadClient, allocs, []string{"pgrep", "sleep"}) 138 r.NoError(err) 139 running := strings.Split(strings.TrimSpace(out.String()), "\n") 140 r.LessOrEqual(len(running), 2) // task itself + 1 check == 2 141 142 // Clean job stop: verify that checks were deregistered in Consul 143 _, _, err = 
nomadClient.Jobs().Deregister(jobId, false, nil) // nomad job stop 144 r.NoError(err) 145 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "task-service-1") 146 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "task-service-2") 147 e2eutil.RequireConsulDeregistered(r, consulClient, consulNamespace, "task-service-3") 148 149 // Restore for next test 150 allocs = e2eutil.RegisterAndWaitForAllocs(f.T(), 151 nomadClient, "consul/input/checks_task.nomad", jobId, "") 152 r.Equal(2, len(allocs)) 153 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-1", capi.HealthPassing) 154 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-2", capi.HealthWarning) 155 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-3", capi.HealthCritical) 156 157 // Crash a task: verify that checks become healthy again 158 _, _, err = exec(nomadClient, allocs, []string{"pkill", "sleep"}) 159 if err != nil && err.Error() != "plugin is shut down" { 160 r.FailNow("unexpected error: %v", err) 161 } 162 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-1", capi.HealthPassing) 163 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-2", capi.HealthWarning) 164 e2eutil.RequireConsulStatus(r, consulClient, consulNamespace, "task-service-3", capi.HealthCritical) 165 166 // TODO(tgross) ... 
167 // Restart client: verify that checks are re-registered 168 } 169 170 func (tc *ScriptChecksE2ETest) AfterEach(f *framework.F) { 171 r := require.New(f.T()) 172 173 nomadClient := tc.Nomad() 174 jobs := nomadClient.Jobs() 175 // Stop all jobs in test 176 for _, id := range tc.jobIds { 177 _, _, err := jobs.Deregister(id, true, nil) 178 r.NoError(err) 179 } 180 // Garbage collect 181 r.NoError(nomadClient.System().GarbageCollect()) 182 } 183 184 func exec(client *napi.Client, allocs []*napi.AllocationListStub, command []string) (bytes.Buffer, bytes.Buffer, error) { 185 ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second) 186 defer cancelFn() 187 188 // we're getting a list of from the registration call here but 189 // one of them might be stopped or stopping, which will return 190 // an error if we try to exec into it. 191 var alloc *napi.Allocation 192 for _, stub := range allocs { 193 if stub.DesiredStatus == "run" { 194 alloc = &napi.Allocation{ 195 ID: stub.ID, 196 Namespace: stub.Namespace, 197 NodeID: stub.NodeID, 198 } 199 } 200 } 201 var stdout, stderr bytes.Buffer 202 if alloc == nil { 203 return stdout, stderr, fmt.Errorf("no allocation ready for exec") 204 } 205 _, err := client.Allocations().Exec(ctx, 206 alloc, "test", false, 207 command, 208 os.Stdin, &stdout, &stderr, 209 make(chan napi.TerminalSize), nil) 210 return stdout, stderr, err 211 }