github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/e2e/connect/acls.go (about) 1 package connect 2 3 import ( 4 "os" 5 "regexp" 6 "strings" 7 "testing" 8 "time" 9 10 capi "github.com/hashicorp/consul/api" 11 napi "github.com/hashicorp/nomad/api" 12 "github.com/hashicorp/nomad/e2e/consulacls" 13 "github.com/hashicorp/nomad/e2e/e2eutil" 14 "github.com/hashicorp/nomad/e2e/framework" 15 "github.com/hashicorp/nomad/helper/uuid" 16 "github.com/hashicorp/nomad/jobspec" 17 "github.com/kr/pretty" 18 "github.com/stretchr/testify/require" 19 ) 20 21 const ( 22 // envConsulToken is the consul http token environment variable 23 envConsulToken = "CONSUL_HTTP_TOKEN" 24 25 // demoConnectJob is the example connect enabled job useful for testing 26 demoConnectJob = "connect/input/demo.nomad" 27 ) 28 29 type ConnectACLsE2ETest struct { 30 framework.TC 31 32 // manageConsulACLs is used to 'enable' and 'disable' Consul ACLs in the 33 // Consul Cluster that has been setup for e2e testing. 34 manageConsulACLs consulacls.Manager 35 // consulMasterToken is set to the generated Consul ACL token after using 36 // the consul-acls-manage.sh script to enable ACLs. 37 consulMasterToken string 38 39 // things to cleanup after each test case 40 jobIDs []string 41 consulPolicyIDs []string 42 consulTokenIDs []string 43 } 44 45 func (tc *ConnectACLsE2ETest) BeforeAll(f *framework.F) { 46 // Wait for Nomad to be ready before doing anything. 47 e2eutil.WaitForLeader(f.T(), tc.Nomad()) 48 e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2) 49 50 // Now enable Consul ACLs, the bootstrapping process for which will be 51 // managed automatically if needed. 52 var err error 53 tc.manageConsulACLs, err = consulacls.New(consulacls.DefaultTFStateFile) 54 require.NoError(f.T(), err) 55 tc.enableConsulACLs(f) 56 57 // Sanity check the consul master token exists, otherwise tests are just 58 // going to be a train wreck. 59 tokenLength := len(tc.consulMasterToken) 60 require.Equal(f.T(), 36, tokenLength, "consul master token wrong length") 61 62 // Sanity check the CONSUL_HTTP_TOKEN is NOT set, because that will cause 63 // the agent checks to fail (which do not allow having a token set (!)). 64 consulTokenEnv := os.Getenv(envConsulToken) 65 require.Empty(f.T(), consulTokenEnv) 66 67 // Wait for Nomad to be ready _again_, since everything was restarted during 68 // the bootstrap process. 69 e2eutil.WaitForLeader(f.T(), tc.Nomad()) 70 e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2) 71 } 72 73 // enableConsulACLs effectively executes `consul-acls-manage.sh enable`, which 74 // will activate Consul ACLs, going through the bootstrap process if necessary. 75 func (tc *ConnectACLsE2ETest) enableConsulACLs(f *framework.F) { 76 tc.consulMasterToken = tc.manageConsulACLs.Enable(f.T()) 77 } 78 79 // AfterAll runs after all tests are complete. 80 // 81 // We disable ConsulACLs in here to isolate the use of Consul ACLs only to 82 // test suites that explicitly want to test with them enabled. 83 func (tc *ConnectACLsE2ETest) AfterAll(f *framework.F) { 84 tc.disableConsulACLs(f) 85 } 86 87 // disableConsulACLs effectively executes `consul-acls-manage.sh disable`, which 88 // will de-activate Consul ACLs. 89 func (tc *ConnectACLsE2ETest) disableConsulACLs(f *framework.F) { 90 tc.manageConsulACLs.Disable(f.T()) 91 } 92 93 // AfterEach does cleanup of Consul ACL objects that were created during each 94 // test case. Each test case may assume it is starting from a "fresh" state - 95 // as if the consul ACL bootstrap process had just taken place. 96 func (tc *ConnectACLsE2ETest) AfterEach(f *framework.F) { 97 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 98 return 99 } 100 101 t := f.T() 102 r := require.New(t) 103 104 // cleanup jobs 105 for _, id := range tc.jobIDs { 106 t.Log("cleanup: deregister nomad job id:", id) 107 _, _, err := tc.Nomad().Jobs().Deregister(id, true, nil) 108 r.NoError(err) 109 } 110 111 // cleanup consul tokens 112 for _, id := range tc.consulTokenIDs { 113 t.Log("cleanup: delete consul token id:", id) 114 _, err := tc.Consul().ACL().TokenDelete(id, &capi.WriteOptions{Token: tc.consulMasterToken}) 115 r.NoError(err) 116 } 117 118 // cleanup consul policies 119 for _, id := range tc.consulPolicyIDs { 120 t.Log("cleanup: delete consul policy id:", id) 121 _, err := tc.Consul().ACL().PolicyDelete(id, &capi.WriteOptions{Token: tc.consulMasterToken}) 122 r.NoError(err) 123 } 124 125 // do garbage collection 126 err := tc.Nomad().System().GarbageCollect() 127 r.NoError(err) 128 129 // assert there are no leftover SI tokens, which may take a minute to be 130 // cleaned up 131 r.Eventually(func() bool { 132 siTokens := tc.countSITokens(t) 133 t.Log("cleanup: checking for remaining SI tokens:", siTokens) 134 return len(siTokens) == 0 135 }, 2*time.Minute, 2*time.Second, "SI tokens did not get removed") 136 137 tc.jobIDs = []string{} 138 tc.consulTokenIDs = []string{} 139 tc.consulPolicyIDs = []string{} 140 } 141 142 type consulPolicy struct { 143 Name string // e.g. nomad-operator 144 Rules string // e.g. service "" { policy="write" } 145 } 146 147 func (tc *ConnectACLsE2ETest) createConsulPolicy(p consulPolicy, f *framework.F) string { 148 r := require.New(f.T()) 149 result, _, err := tc.Consul().ACL().PolicyCreate(&capi.ACLPolicy{ 150 Name: p.Name, 151 Description: "test policy " + p.Name, 152 Rules: p.Rules, 153 }, &capi.WriteOptions{Token: tc.consulMasterToken}) 154 r.NoError(err, "failed to create consul policy") 155 tc.consulPolicyIDs = append(tc.consulPolicyIDs, result.ID) 156 return result.ID 157 } 158 159 func (tc *ConnectACLsE2ETest) createOperatorToken(policyID string, f *framework.F) string { 160 r := require.New(f.T()) 161 token, _, err := tc.Consul().ACL().TokenCreate(&capi.ACLToken{ 162 Description: "operator token", 163 Policies: []*capi.ACLTokenPolicyLink{{ID: policyID}}, 164 }, &capi.WriteOptions{Token: tc.consulMasterToken}) 165 r.NoError(err, "failed to create operator token") 166 tc.consulTokenIDs = append(tc.consulTokenIDs, token.AccessorID) 167 return token.SecretID 168 } 169 170 func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterMasterToken(f *framework.F) { 171 t := f.T() 172 r := require.New(t) 173 174 t.Log("test register Connect job w/ ACLs enabled w/ master token") 175 176 jobID := "connect" + uuid.Generate()[0:8] 177 tc.jobIDs = append(tc.jobIDs, jobID) 178 179 jobAPI := tc.Nomad().Jobs() 180 181 job, err := jobspec.ParseFile(demoConnectJob) 182 r.NoError(err) 183 184 // Set the job file to use the consul master token. 185 // One should never do this in practice, but, it should work. 186 // https://www.consul.io/docs/acl/acl-system.html#builtin-tokens 187 job.ConsulToken = &tc.consulMasterToken 188 189 // Avoid using Register here, because that would actually create and run the 190 // Job which runs the task, creates the SI token, which all needs to be 191 // given time to settle and cleaned up. That is all covered in the big slow 192 // test at the bottom. 193 resp, _, err := jobAPI.Plan(job, false, nil) 194 r.NoError(err) 195 r.NotNil(resp) 196 } 197 198 func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterMissingOperatorToken(f *framework.F) { 199 t := f.T() 200 r := require.New(t) 201 202 t.Log("test register Connect job w/ ACLs enabled w/o operator token") 203 204 job, err := jobspec.ParseFile(demoConnectJob) 205 r.NoError(err) 206 207 jobAPI := tc.Nomad().Jobs() 208 209 // Explicitly show the ConsulToken is not set 210 job.ConsulToken = nil 211 212 _, _, err = jobAPI.Register(job, nil) 213 r.Error(err) 214 215 t.Log("job correctly rejected, with error:", err) 216 } 217 218 func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterFakeOperatorToken(f *framework.F) { 219 t := f.T() 220 r := require.New(t) 221 222 t.Log("test register Connect job w/ ACLs enabled w/ operator token") 223 224 policyID := tc.createConsulPolicy(consulPolicy{ 225 Name: "nomad-operator-policy", 226 Rules: `service "count-api" { policy = "write" } service "count-dashboard" { policy = "write" }`, 227 }, f) 228 t.Log("created operator policy:", policyID) 229 230 // generate a fake consul token token 231 fakeToken := uuid.Generate() 232 job := tc.parseJobSpecFile(t, demoConnectJob) 233 234 jobAPI := tc.Nomad().Jobs() 235 236 // deliberately set the fake Consul token 237 job.ConsulToken = &fakeToken 238 239 // should fail, because the token is fake 240 _, _, err := jobAPI.Register(job, nil) 241 r.Error(err) 242 t.Log("job correctly rejected, with error:", err) 243 } 244 245 func (tc *ConnectACLsE2ETest) TestConnectACLsConnectDemo(f *framework.F) { 246 t := f.T() 247 r := require.New(t) 248 249 t.Log("test register Connect job w/ ACLs enabled w/ operator token") 250 251 // === Setup ACL policy and token === 252 253 // create a policy allowing writes of services "count-api" and "count-dashboard" 254 policyID := tc.createConsulPolicy(consulPolicy{ 255 Name: "nomad-operator-policy", 256 Rules: `service "count-api" { policy = "write" } service "count-dashboard" { policy = "write" }`, 257 }, f) 258 t.Log("created operator policy:", policyID) 259 260 // create a Consul "operator token" blessed with the above policy 261 operatorToken := tc.createOperatorToken(policyID, f) 262 t.Log("created operator token:", operatorToken) 263 264 // === Register the Nomad job === 265 jobID := "connectACL_connect_demo" 266 267 var allocs []*napi.AllocationListStub 268 allocIDs := make(map[string]bool, 2) 269 { 270 271 // parse the example connect jobspec file 272 tc.jobIDs = append(tc.jobIDs, jobID) 273 job := tc.parseJobSpecFile(t, demoConnectJob) 274 job.ID = &jobID 275 jobAPI := tc.Nomad().Jobs() 276 277 // set the valid consul operator token 278 job.ConsulToken = &operatorToken 279 280 // registering the job should succeed 281 resp, _, err := jobAPI.Register(job, nil) 282 r.NoError(err) 283 r.NotNil(resp) 284 r.Empty(resp.Warnings) 285 t.Log("job has been registered with evalID:", resp.EvalID) 286 287 // === Make sure the evaluation actually succeeds === 288 EVAL: 289 qOpts := &napi.QueryOptions{WaitIndex: resp.EvalCreateIndex} 290 evalAPI := tc.Nomad().Evaluations() 291 eval, qMeta, err := evalAPI.Info(resp.EvalID, qOpts) 292 r.NoError(err) 293 qOpts.WaitIndex = qMeta.LastIndex 294 295 switch eval.Status { 296 case "pending": 297 goto EVAL 298 case "complete": 299 // ok! 300 case "failed", "canceled", "blocked": 301 r.Failf("eval %s\n%s\n", eval.Status, pretty.Sprint(eval)) 302 default: 303 r.Failf("unknown eval status: %s\n%s\n", eval.Status, pretty.Sprint(eval)) 304 } 305 306 // assert there were no placement failures 307 r.Zero(eval.FailedTGAllocs, pretty.Sprint(eval.FailedTGAllocs)) 308 r.Len(eval.QueuedAllocations, 2, pretty.Sprint(eval.QueuedAllocations)) 309 310 // === Assert allocs are running === 311 for i := 0; i < 20; i++ { 312 allocs, qMeta, err = evalAPI.Allocations(eval.ID, qOpts) 313 r.NoError(err) 314 r.Len(allocs, 2) 315 qOpts.WaitIndex = qMeta.LastIndex 316 317 running := 0 318 for _, alloc := range allocs { 319 switch alloc.ClientStatus { 320 case "running": 321 running++ 322 case "pending": 323 // keep trying 324 default: 325 r.Failf("alloc failed", "alloc: %s", pretty.Sprint(alloc)) 326 } 327 } 328 329 if running == len(allocs) { 330 t.Log("running:", running, "allocs:", allocs) 331 break 332 } 333 334 time.Sleep(500 * time.Millisecond) 335 } 336 337 for _, a := range allocs { 338 if a.ClientStatus != "running" || a.DesiredStatus != "run" { 339 r.Failf("terminal alloc", "alloc %s (%s) terminal; client=%s desired=%s", a.TaskGroup, a.ID, a.ClientStatus, a.DesiredStatus) 340 } 341 allocIDs[a.ID] = true 342 } 343 } 344 345 // === Check Consul service health === 346 agentAPI := tc.Consul().Agent() 347 348 failing := map[string]*capi.AgentCheck{} 349 for i := 0; i < 60; i++ { 350 checks, err := agentAPI.Checks() 351 require.NoError(t, err) 352 353 // filter out checks for other services 354 for cid, check := range checks { 355 found := false 356 // for _, allocID := range allocIDs { // list 357 for allocID := range allocIDs { // map 358 if strings.Contains(check.ServiceID, allocID) { 359 found = true 360 break 361 } 362 } 363 364 if !found { 365 delete(checks, cid) 366 } 367 } 368 369 // ensure checks are all passing 370 failing = map[string]*capi.AgentCheck{} 371 for _, check := range checks { 372 if check.Status != "passing" { 373 failing[check.CheckID] = check 374 break 375 } 376 } 377 378 if len(failing) == 0 { 379 break 380 } 381 382 t.Logf("still %d checks not passing", len(failing)) 383 384 time.Sleep(time.Second) 385 } 386 387 require.Len(t, failing, 0, pretty.Sprint(failing)) 388 389 // === Check Consul SI tokens were generated for sidecars === 390 foundSITokens := tc.countSITokens(t) 391 r.Equal(2, len(foundSITokens), "expected 2 SI tokens total: %v", foundSITokens) 392 r.Equal(1, foundSITokens["connect-proxy-count-api"], "expected 1 SI token for connect-proxy-count-api: %v", foundSITokens) 393 r.Equal(1, foundSITokens["connect-proxy-count-dashboard"], "expected 1 SI token for connect-proxy-count-dashboard: %v", foundSITokens) 394 395 t.Log("connect job with ACLs enable finished") 396 } 397 398 var ( 399 siTokenRe = regexp.MustCompile(`_nomad_si \[[\w-]{36}] \[[\w-]{36}] \[([\S]+)]`) 400 ) 401 402 func (tc *ConnectACLsE2ETest) serviceofSIToken(description string) string { 403 if m := siTokenRe.FindStringSubmatch(description); len(m) == 2 { 404 return m[1] 405 } 406 return "" 407 } 408 409 func (tc *ConnectACLsE2ETest) countSITokens(t *testing.T) map[string]int { 410 aclAPI := tc.Consul().ACL() 411 tokens, _, err := aclAPI.TokenList(&capi.QueryOptions{ 412 Token: tc.consulMasterToken, 413 }) 414 require.NoError(t, err) 415 416 // count the number of SI tokens matching each service name 417 foundSITokens := make(map[string]int) 418 for _, token := range tokens { 419 if service := tc.serviceofSIToken(token.Description); service != "" { 420 foundSITokens[service]++ 421 } 422 } 423 424 return foundSITokens 425 } 426 427 func (tc *ConnectACLsE2ETest) parseJobSpecFile(t *testing.T, filename string) *napi.Job { 428 job, err := jobspec.ParseFile(filename) 429 require.NoError(t, err) 430 return job 431 }