github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/e2e/connect/acls.go (about)

     1  package connect
     2  
     3  import (
     4  	"os"
     5  	"regexp"
     6  	"strings"
     7  	"testing"
     8  	"time"
     9  
    10  	capi "github.com/hashicorp/consul/api"
    11  	napi "github.com/hashicorp/nomad/api"
    12  	"github.com/hashicorp/nomad/e2e/consulacls"
    13  	"github.com/hashicorp/nomad/e2e/e2eutil"
    14  	"github.com/hashicorp/nomad/e2e/framework"
    15  	"github.com/hashicorp/nomad/helper/uuid"
    16  	"github.com/hashicorp/nomad/jobspec"
    17  	"github.com/kr/pretty"
    18  	"github.com/stretchr/testify/require"
    19  )
    20  
    21  const (
    22  	// envConsulToken is the consul http token environment variable
    23  	envConsulToken = "CONSUL_HTTP_TOKEN"
    24  
    25  	// demoConnectJob is the example connect enabled job useful for testing
    26  	demoConnectJob = "connect/input/demo.nomad"
    27  )
    28  
    29  type ConnectACLsE2ETest struct {
    30  	framework.TC
    31  
    32  	// manageConsulACLs is used to 'enable' and 'disable' Consul ACLs in the
    33  	// Consul Cluster that has been setup for e2e testing.
    34  	manageConsulACLs consulacls.Manager
    35  	// consulMasterToken is set to the generated Consul ACL token after using
    36  	// the consul-acls-manage.sh script to enable ACLs.
    37  	consulMasterToken string
    38  
    39  	// things to cleanup after each test case
    40  	jobIDs          []string
    41  	consulPolicyIDs []string
    42  	consulTokenIDs  []string
    43  }
    44  
    45  func (tc *ConnectACLsE2ETest) BeforeAll(f *framework.F) {
    46  	// Wait for Nomad to be ready before doing anything.
    47  	e2eutil.WaitForLeader(f.T(), tc.Nomad())
    48  	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)
    49  
    50  	// Now enable Consul ACLs, the bootstrapping process for which will be
    51  	// managed automatically if needed.
    52  	var err error
    53  	tc.manageConsulACLs, err = consulacls.New(consulacls.DefaultTFStateFile)
    54  	require.NoError(f.T(), err)
    55  	tc.enableConsulACLs(f)
    56  
    57  	// Sanity check the consul master token exists, otherwise tests are just
    58  	// going to be a train wreck.
    59  	tokenLength := len(tc.consulMasterToken)
    60  	require.Equal(f.T(), 36, tokenLength, "consul master token wrong length")
    61  
    62  	// Sanity check the CONSUL_HTTP_TOKEN is NOT set, because that will cause
    63  	// the agent checks to fail (which do not allow having a token set (!)).
    64  	consulTokenEnv := os.Getenv(envConsulToken)
    65  	require.Empty(f.T(), consulTokenEnv)
    66  
    67  	// Wait for Nomad to be ready _again_, since everything was restarted during
    68  	// the bootstrap process.
    69  	e2eutil.WaitForLeader(f.T(), tc.Nomad())
    70  	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)
    71  }
    72  
    73  // enableConsulACLs effectively executes `consul-acls-manage.sh enable`, which
    74  // will activate Consul ACLs, going through the bootstrap process if necessary.
    75  func (tc *ConnectACLsE2ETest) enableConsulACLs(f *framework.F) {
    76  	tc.consulMasterToken = tc.manageConsulACLs.Enable(f.T())
    77  }
    78  
    79  // AfterAll runs after all tests are complete.
    80  //
    81  // We disable ConsulACLs in here to isolate the use of Consul ACLs only to
    82  // test suites that explicitly want to test with them enabled.
    83  func (tc *ConnectACLsE2ETest) AfterAll(f *framework.F) {
    84  	tc.disableConsulACLs(f)
    85  }
    86  
    87  // disableConsulACLs effectively executes `consul-acls-manage.sh disable`, which
    88  // will de-activate Consul ACLs.
    89  func (tc *ConnectACLsE2ETest) disableConsulACLs(f *framework.F) {
    90  	tc.manageConsulACLs.Disable(f.T())
    91  }
    92  
    93  // AfterEach does cleanup of Consul ACL objects that were created during each
    94  // test case. Each test case may assume it is starting from a "fresh" state -
    95  // as if the consul ACL bootstrap process had just taken place.
    96  func (tc *ConnectACLsE2ETest) AfterEach(f *framework.F) {
    97  	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
    98  		return
    99  	}
   100  
   101  	t := f.T()
   102  	r := require.New(t)
   103  
   104  	// cleanup jobs
   105  	for _, id := range tc.jobIDs {
   106  		t.Log("cleanup: deregister nomad job id:", id)
   107  		_, _, err := tc.Nomad().Jobs().Deregister(id, true, nil)
   108  		r.NoError(err)
   109  	}
   110  
   111  	// cleanup consul tokens
   112  	for _, id := range tc.consulTokenIDs {
   113  		t.Log("cleanup: delete consul token id:", id)
   114  		_, err := tc.Consul().ACL().TokenDelete(id, &capi.WriteOptions{Token: tc.consulMasterToken})
   115  		r.NoError(err)
   116  	}
   117  
   118  	// cleanup consul policies
   119  	for _, id := range tc.consulPolicyIDs {
   120  		t.Log("cleanup: delete consul policy id:", id)
   121  		_, err := tc.Consul().ACL().PolicyDelete(id, &capi.WriteOptions{Token: tc.consulMasterToken})
   122  		r.NoError(err)
   123  	}
   124  
   125  	// do garbage collection
   126  	err := tc.Nomad().System().GarbageCollect()
   127  	r.NoError(err)
   128  
   129  	// assert there are no leftover SI tokens, which may take a minute to be
   130  	// cleaned up
   131  	r.Eventually(func() bool {
   132  		siTokens := tc.countSITokens(t)
   133  		t.Log("cleanup: checking for remaining SI tokens:", siTokens)
   134  		return len(siTokens) == 0
   135  	}, 2*time.Minute, 2*time.Second, "SI tokens did not get removed")
   136  
   137  	tc.jobIDs = []string{}
   138  	tc.consulTokenIDs = []string{}
   139  	tc.consulPolicyIDs = []string{}
   140  }
   141  
   142  type consulPolicy struct {
   143  	Name  string // e.g. nomad-operator
   144  	Rules string // e.g. service "" { policy="write" }
   145  }
   146  
   147  func (tc *ConnectACLsE2ETest) createConsulPolicy(p consulPolicy, f *framework.F) string {
   148  	r := require.New(f.T())
   149  	result, _, err := tc.Consul().ACL().PolicyCreate(&capi.ACLPolicy{
   150  		Name:        p.Name,
   151  		Description: "test policy " + p.Name,
   152  		Rules:       p.Rules,
   153  	}, &capi.WriteOptions{Token: tc.consulMasterToken})
   154  	r.NoError(err, "failed to create consul policy")
   155  	tc.consulPolicyIDs = append(tc.consulPolicyIDs, result.ID)
   156  	return result.ID
   157  }
   158  
   159  func (tc *ConnectACLsE2ETest) createOperatorToken(policyID string, f *framework.F) string {
   160  	r := require.New(f.T())
   161  	token, _, err := tc.Consul().ACL().TokenCreate(&capi.ACLToken{
   162  		Description: "operator token",
   163  		Policies:    []*capi.ACLTokenPolicyLink{{ID: policyID}},
   164  	}, &capi.WriteOptions{Token: tc.consulMasterToken})
   165  	r.NoError(err, "failed to create operator token")
   166  	tc.consulTokenIDs = append(tc.consulTokenIDs, token.AccessorID)
   167  	return token.SecretID
   168  }
   169  
   170  func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterMasterToken(f *framework.F) {
   171  	t := f.T()
   172  	r := require.New(t)
   173  
   174  	t.Log("test register Connect job w/ ACLs enabled w/ master token")
   175  
   176  	jobID := "connect" + uuid.Generate()[0:8]
   177  	tc.jobIDs = append(tc.jobIDs, jobID)
   178  
   179  	jobAPI := tc.Nomad().Jobs()
   180  
   181  	job, err := jobspec.ParseFile(demoConnectJob)
   182  	r.NoError(err)
   183  
   184  	// Set the job file to use the consul master token.
   185  	// One should never do this in practice, but, it should work.
   186  	// https://www.consul.io/docs/acl/acl-system.html#builtin-tokens
   187  	job.ConsulToken = &tc.consulMasterToken
   188  
   189  	// Avoid using Register here, because that would actually create and run the
   190  	// Job which runs the task, creates the SI token, which all needs to be
   191  	// given time to settle and cleaned up. That is all covered in the big slow
   192  	// test at the bottom.
   193  	resp, _, err := jobAPI.Plan(job, false, nil)
   194  	r.NoError(err)
   195  	r.NotNil(resp)
   196  }
   197  
   198  func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterMissingOperatorToken(f *framework.F) {
   199  	t := f.T()
   200  	r := require.New(t)
   201  
   202  	t.Log("test register Connect job w/ ACLs enabled w/o operator token")
   203  
   204  	job, err := jobspec.ParseFile(demoConnectJob)
   205  	r.NoError(err)
   206  
   207  	jobAPI := tc.Nomad().Jobs()
   208  
   209  	// Explicitly show the ConsulToken is not set
   210  	job.ConsulToken = nil
   211  
   212  	_, _, err = jobAPI.Register(job, nil)
   213  	r.Error(err)
   214  
   215  	t.Log("job correctly rejected, with error:", err)
   216  }
   217  
   218  func (tc *ConnectACLsE2ETest) TestConnectACLsRegisterFakeOperatorToken(f *framework.F) {
   219  	t := f.T()
   220  	r := require.New(t)
   221  
   222  	t.Log("test register Connect job w/ ACLs enabled w/ operator token")
   223  
   224  	policyID := tc.createConsulPolicy(consulPolicy{
   225  		Name:  "nomad-operator-policy",
   226  		Rules: `service "count-api" { policy = "write" } service "count-dashboard" { policy = "write" }`,
   227  	}, f)
   228  	t.Log("created operator policy:", policyID)
   229  
   230  	// generate a fake consul token token
   231  	fakeToken := uuid.Generate()
   232  	job := tc.parseJobSpecFile(t, demoConnectJob)
   233  
   234  	jobAPI := tc.Nomad().Jobs()
   235  
   236  	// deliberately set the fake Consul token
   237  	job.ConsulToken = &fakeToken
   238  
   239  	// should fail, because the token is fake
   240  	_, _, err := jobAPI.Register(job, nil)
   241  	r.Error(err)
   242  	t.Log("job correctly rejected, with error:", err)
   243  }
   244  
   245  func (tc *ConnectACLsE2ETest) TestConnectACLsConnectDemo(f *framework.F) {
   246  	t := f.T()
   247  	r := require.New(t)
   248  
   249  	t.Log("test register Connect job w/ ACLs enabled w/ operator token")
   250  
   251  	// === Setup ACL policy and token ===
   252  
   253  	// create a policy allowing writes of services "count-api" and "count-dashboard"
   254  	policyID := tc.createConsulPolicy(consulPolicy{
   255  		Name:  "nomad-operator-policy",
   256  		Rules: `service "count-api" { policy = "write" } service "count-dashboard" { policy = "write" }`,
   257  	}, f)
   258  	t.Log("created operator policy:", policyID)
   259  
   260  	// create a Consul "operator token" blessed with the above policy
   261  	operatorToken := tc.createOperatorToken(policyID, f)
   262  	t.Log("created operator token:", operatorToken)
   263  
   264  	// === Register the Nomad job ===
   265  	jobID := "connectACL_connect_demo"
   266  
   267  	var allocs []*napi.AllocationListStub
   268  	allocIDs := make(map[string]bool, 2)
   269  	{
   270  
   271  		// parse the example connect jobspec file
   272  		tc.jobIDs = append(tc.jobIDs, jobID)
   273  		job := tc.parseJobSpecFile(t, demoConnectJob)
   274  		job.ID = &jobID
   275  		jobAPI := tc.Nomad().Jobs()
   276  
   277  		// set the valid consul operator token
   278  		job.ConsulToken = &operatorToken
   279  
   280  		// registering the job should succeed
   281  		resp, _, err := jobAPI.Register(job, nil)
   282  		r.NoError(err)
   283  		r.NotNil(resp)
   284  		r.Empty(resp.Warnings)
   285  		t.Log("job has been registered with evalID:", resp.EvalID)
   286  
   287  		// === Make sure the evaluation actually succeeds ===
   288  	EVAL:
   289  		qOpts := &napi.QueryOptions{WaitIndex: resp.EvalCreateIndex}
   290  		evalAPI := tc.Nomad().Evaluations()
   291  		eval, qMeta, err := evalAPI.Info(resp.EvalID, qOpts)
   292  		r.NoError(err)
   293  		qOpts.WaitIndex = qMeta.LastIndex
   294  
   295  		switch eval.Status {
   296  		case "pending":
   297  			goto EVAL
   298  		case "complete":
   299  		// ok!
   300  		case "failed", "canceled", "blocked":
   301  			r.Failf("eval %s\n%s\n", eval.Status, pretty.Sprint(eval))
   302  		default:
   303  			r.Failf("unknown eval status: %s\n%s\n", eval.Status, pretty.Sprint(eval))
   304  		}
   305  
   306  		// assert there were no placement failures
   307  		r.Zero(eval.FailedTGAllocs, pretty.Sprint(eval.FailedTGAllocs))
   308  		r.Len(eval.QueuedAllocations, 2, pretty.Sprint(eval.QueuedAllocations))
   309  
   310  		// === Assert allocs are running ===
   311  		for i := 0; i < 20; i++ {
   312  			allocs, qMeta, err = evalAPI.Allocations(eval.ID, qOpts)
   313  			r.NoError(err)
   314  			r.Len(allocs, 2)
   315  			qOpts.WaitIndex = qMeta.LastIndex
   316  
   317  			running := 0
   318  			for _, alloc := range allocs {
   319  				switch alloc.ClientStatus {
   320  				case "running":
   321  					running++
   322  				case "pending":
   323  					// keep trying
   324  				default:
   325  					r.Failf("alloc failed", "alloc: %s", pretty.Sprint(alloc))
   326  				}
   327  			}
   328  
   329  			if running == len(allocs) {
   330  				t.Log("running:", running, "allocs:", allocs)
   331  				break
   332  			}
   333  
   334  			time.Sleep(500 * time.Millisecond)
   335  		}
   336  
   337  		for _, a := range allocs {
   338  			if a.ClientStatus != "running" || a.DesiredStatus != "run" {
   339  				r.Failf("terminal alloc", "alloc %s (%s) terminal; client=%s desired=%s", a.TaskGroup, a.ID, a.ClientStatus, a.DesiredStatus)
   340  			}
   341  			allocIDs[a.ID] = true
   342  		}
   343  	}
   344  
   345  	// === Check Consul service health ===
   346  	agentAPI := tc.Consul().Agent()
   347  
   348  	failing := map[string]*capi.AgentCheck{}
   349  	for i := 0; i < 60; i++ {
   350  		checks, err := agentAPI.Checks()
   351  		require.NoError(t, err)
   352  
   353  		// filter out checks for other services
   354  		for cid, check := range checks {
   355  			found := false
   356  			// for _, allocID := range allocIDs { // list
   357  			for allocID := range allocIDs { // map
   358  				if strings.Contains(check.ServiceID, allocID) {
   359  					found = true
   360  					break
   361  				}
   362  			}
   363  
   364  			if !found {
   365  				delete(checks, cid)
   366  			}
   367  		}
   368  
   369  		// ensure checks are all passing
   370  		failing = map[string]*capi.AgentCheck{}
   371  		for _, check := range checks {
   372  			if check.Status != "passing" {
   373  				failing[check.CheckID] = check
   374  				break
   375  			}
   376  		}
   377  
   378  		if len(failing) == 0 {
   379  			break
   380  		}
   381  
   382  		t.Logf("still %d checks not passing", len(failing))
   383  
   384  		time.Sleep(time.Second)
   385  	}
   386  
   387  	require.Len(t, failing, 0, pretty.Sprint(failing))
   388  
   389  	// === Check Consul SI tokens were generated for sidecars ===
   390  	foundSITokens := tc.countSITokens(t)
   391  	r.Equal(2, len(foundSITokens), "expected 2 SI tokens total: %v", foundSITokens)
   392  	r.Equal(1, foundSITokens["connect-proxy-count-api"], "expected 1 SI token for connect-proxy-count-api: %v", foundSITokens)
   393  	r.Equal(1, foundSITokens["connect-proxy-count-dashboard"], "expected 1 SI token for connect-proxy-count-dashboard: %v", foundSITokens)
   394  
   395  	t.Log("connect job with ACLs enable finished")
   396  }
   397  
   398  var (
   399  	siTokenRe = regexp.MustCompile(`_nomad_si \[[\w-]{36}] \[[\w-]{36}] \[([\S]+)]`)
   400  )
   401  
   402  func (tc *ConnectACLsE2ETest) serviceofSIToken(description string) string {
   403  	if m := siTokenRe.FindStringSubmatch(description); len(m) == 2 {
   404  		return m[1]
   405  	}
   406  	return ""
   407  }
   408  
   409  func (tc *ConnectACLsE2ETest) countSITokens(t *testing.T) map[string]int {
   410  	aclAPI := tc.Consul().ACL()
   411  	tokens, _, err := aclAPI.TokenList(&capi.QueryOptions{
   412  		Token: tc.consulMasterToken,
   413  	})
   414  	require.NoError(t, err)
   415  
   416  	// count the number of SI tokens matching each service name
   417  	foundSITokens := make(map[string]int)
   418  	for _, token := range tokens {
   419  		if service := tc.serviceofSIToken(token.Description); service != "" {
   420  			foundSITokens[service]++
   421  		}
   422  	}
   423  
   424  	return foundSITokens
   425  }
   426  
   427  func (tc *ConnectACLsE2ETest) parseJobSpecFile(t *testing.T, filename string) *napi.Job {
   428  	job, err := jobspec.ParseFile(filename)
   429  	require.NoError(t, err)
   430  	return job
   431  }