github.com/hernad/nomad@v1.6.112/nomad/node_pool_endpoint.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"errors"
     8  	"fmt"
     9  	"net/http"
    10  	"time"
    11  
    12  	metrics "github.com/armon/go-metrics"
    13  	"github.com/hashicorp/go-memdb"
    14  	multierror "github.com/hashicorp/go-multierror"
    15  
    16  	"github.com/hernad/nomad/acl"
    17  	"github.com/hernad/nomad/helper"
    18  	"github.com/hernad/nomad/nomad/state"
    19  	"github.com/hernad/nomad/nomad/state/paginator"
    20  	"github.com/hernad/nomad/nomad/structs"
    21  )
    22  
// NodePool is the RPC endpoint used for node pool management and interaction.
type NodePool struct {
	// srv is the server the endpoint's methods use for authentication,
	// request forwarding, ACL resolution, state access, and Raft writes.
	srv *Server
	// ctx is the RPC context handed to the server when authenticating a
	// request.
	ctx *RPCContext
}
    28  
    29  func NewNodePoolEndpoint(srv *Server, ctx *RPCContext) *NodePool {
    30  	return &NodePool{srv: srv, ctx: ctx}
    31  }
    32  
    33  // List is used to retrieve multiple node pools. It supports prefix listing,
    34  // pagination, and filtering.
    35  func (n *NodePool) List(args *structs.NodePoolListRequest, reply *structs.NodePoolListResponse) error {
    36  	authErr := n.srv.Authenticate(n.ctx, args)
    37  	if done, err := n.srv.forward("NodePool.List", args, args, reply); done {
    38  		return err
    39  	}
    40  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricList, args)
    41  	if authErr != nil {
    42  		return structs.ErrPermissionDenied
    43  	}
    44  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "list"}, time.Now())
    45  
    46  	// Resolve ACL token to only return node pools it has access to.
    47  	aclObj, err := n.srv.ResolveACL(args)
    48  	if err != nil {
    49  		return err
    50  	}
    51  
    52  	// Only warn for expiration of a read request.
    53  	_ = n.validateLicense(nil)
    54  
    55  	// Setup blocking query.
    56  	sort := state.SortOption(args.Reverse)
    57  	opts := blockingOptions{
    58  		queryOpts: &args.QueryOptions,
    59  		queryMeta: &reply.QueryMeta,
    60  		run: func(ws memdb.WatchSet, store *state.StateStore) error {
    61  			var err error
    62  			var iter memdb.ResultIterator
    63  
    64  			if prefix := args.QueryOptions.Prefix; prefix != "" {
    65  				iter, err = store.NodePoolsByNamePrefix(ws, prefix, sort)
    66  			} else {
    67  				iter, err = store.NodePools(ws, sort)
    68  			}
    69  			if err != nil {
    70  				return err
    71  			}
    72  
    73  			pageOpts := paginator.StructsTokenizerOptions{WithID: true}
    74  			tokenizer := paginator.NewStructsTokenizer(iter, pageOpts)
    75  			filters := []paginator.Filter{
    76  				// Filter out node pools based on ACL token capabilities.
    77  				paginator.GenericFilter{
    78  					Allow: func(raw interface{}) (bool, error) {
    79  						pool := raw.(*structs.NodePool)
    80  						return aclObj.AllowNodePoolOperation(pool.Name, acl.NodePoolCapabilityRead), nil
    81  					},
    82  				},
    83  			}
    84  
    85  			var pools []*structs.NodePool
    86  			pager, err := paginator.NewPaginator(iter, tokenizer, filters, args.QueryOptions,
    87  				func(raw interface{}) error {
    88  					pool := raw.(*structs.NodePool)
    89  					pools = append(pools, pool)
    90  					return nil
    91  				})
    92  			if err != nil {
    93  				return structs.NewErrRPCCodedf(http.StatusBadRequest, "failed to create result paginator: %v", err)
    94  			}
    95  
    96  			nextToken, err := pager.Page()
    97  			if err != nil {
    98  				return structs.NewErrRPCCodedf(http.StatusBadRequest, "failed to read result page: %v", err)
    99  			}
   100  
   101  			reply.QueryMeta.NextToken = nextToken
   102  			reply.NodePools = pools
   103  
   104  			// Use the last index that affected the node pools table.
   105  			index, err := store.Index("node_pools")
   106  			if err != nil {
   107  				return err
   108  			}
   109  			reply.Index = helper.Max(1, index)
   110  
   111  			// Set the query response.
   112  			n.srv.setQueryMeta(&reply.QueryMeta)
   113  			return nil
   114  		}}
   115  	return n.srv.blockingRPC(&opts)
   116  }
   117  
   118  // GetNodePool returns the specific node pool requested or nil if the node pool
   119  // doesn't exist.
   120  func (n *NodePool) GetNodePool(args *structs.NodePoolSpecificRequest, reply *structs.SingleNodePoolResponse) error {
   121  	authErr := n.srv.Authenticate(n.ctx, args)
   122  	if done, err := n.srv.forward("NodePool.GetNodePool", args, args, reply); done {
   123  		return err
   124  	}
   125  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricRead, args)
   126  	if authErr != nil {
   127  		return structs.ErrPermissionDenied
   128  	}
   129  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "get_node_pool"}, time.Now())
   130  
   131  	// Resolve ACL token and verify it has read capability for the pool.
   132  	aclObj, err := n.srv.ResolveACL(args)
   133  	if err != nil {
   134  		return err
   135  	}
   136  	if !aclObj.AllowNodePoolOperation(args.Name, acl.NodePoolCapabilityRead) {
   137  		return structs.ErrPermissionDenied
   138  	}
   139  
   140  	// Only warn for expiration of a read request.
   141  	_ = n.validateLicense(nil)
   142  
   143  	// Setup the blocking query.
   144  	opts := blockingOptions{
   145  		queryOpts: &args.QueryOptions,
   146  		queryMeta: &reply.QueryMeta,
   147  		run: func(ws memdb.WatchSet, store *state.StateStore) error {
   148  			// Fetch node pool.
   149  			pool, err := store.NodePoolByName(ws, args.Name)
   150  			if err != nil {
   151  				return err
   152  			}
   153  
   154  			reply.NodePool = pool
   155  			if pool != nil {
   156  				reply.Index = pool.ModifyIndex
   157  			} else {
   158  				// Return the last index that affected the node pools table if
   159  				// the requested node pool doesn't exist.
   160  				index, err := store.Index(state.TableNodePools)
   161  				if err != nil {
   162  					return err
   163  				}
   164  				reply.Index = helper.Max(1, index)
   165  			}
   166  			return nil
   167  		}}
   168  	return n.srv.blockingRPC(&opts)
   169  }
   170  
   171  // UpsertNodePools creates or updates the given node pools. Built-in node pools
   172  // cannot be updated.
   173  func (n *NodePool) UpsertNodePools(args *structs.NodePoolUpsertRequest, reply *structs.GenericResponse) error {
   174  	authErr := n.srv.Authenticate(n.ctx, args)
   175  	args.Region = n.srv.config.AuthoritativeRegion
   176  	if done, err := n.srv.forward("NodePool.UpsertNodePools", args, args, reply); done {
   177  		return err
   178  	}
   179  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricWrite, args)
   180  	if authErr != nil {
   181  		return structs.ErrPermissionDenied
   182  	}
   183  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "upsert_node_pools"}, time.Now())
   184  
   185  	// Resolve ACL token and verify it has write capability to all pools in the
   186  	// request.
   187  	aclObj, err := n.srv.ResolveACL(args)
   188  	if err != nil {
   189  		return err
   190  	}
   191  	for _, pool := range args.NodePools {
   192  		if !aclObj.AllowNodePoolOperation(pool.Name, acl.NodePoolCapabilityWrite) {
   193  			return structs.ErrPermissionDenied
   194  		}
   195  
   196  		// Strict enforcement for write requests.
   197  		// If not licensed then requests will be denied.
   198  		if err := n.validateLicense(pool); err != nil {
   199  			return err
   200  		}
   201  	}
   202  
   203  	if !ServersMeetMinimumVersion(
   204  		n.srv.serf.Members(), n.srv.Region(), minNodePoolsVersion, true) {
   205  		return fmt.Errorf("all servers must be running version %v or later to upsert node pools", minNodePoolsVersion)
   206  	}
   207  
   208  	// Validate request.
   209  	if len(args.NodePools) == 0 {
   210  		return structs.NewErrRPCCodedf(http.StatusBadRequest, "must specify at least one node pool")
   211  	}
   212  	for _, pool := range args.NodePools {
   213  		if err := pool.Validate(); err != nil {
   214  			return structs.NewErrRPCCodedf(http.StatusBadRequest, "invalid node pool %q: %v", pool.Name, err)
   215  		}
   216  		if pool.IsBuiltIn() {
   217  			return structs.NewErrRPCCodedf(http.StatusBadRequest, "modifying node pool %q is not allowed", pool.Name)
   218  		}
   219  
   220  		pool.SetHash()
   221  	}
   222  
   223  	// Update via Raft.
   224  	_, index, err := n.srv.raftApply(structs.NodePoolUpsertRequestType, args)
   225  	if err != nil {
   226  		return err
   227  	}
   228  	reply.Index = index
   229  	return nil
   230  }
   231  
   232  // DeleteNodePools deletes the given node pools. Built-in node pools cannot be
   233  // deleted.
   234  func (n *NodePool) DeleteNodePools(args *structs.NodePoolDeleteRequest, reply *structs.GenericResponse) error {
   235  	authErr := n.srv.Authenticate(n.ctx, args)
   236  	args.Region = n.srv.config.AuthoritativeRegion
   237  	if done, err := n.srv.forward("NodePool.DeleteNodePools", args, args, reply); done {
   238  		return err
   239  	}
   240  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricWrite, args)
   241  	if authErr != nil {
   242  		return structs.ErrPermissionDenied
   243  	}
   244  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "delete_node_pools"}, time.Now())
   245  
   246  	// Resolve ACL token and verify it has delete capability to all pools in
   247  	// the request.
   248  	aclObj, err := n.srv.ResolveACL(args)
   249  	if err != nil {
   250  		return err
   251  	}
   252  	for _, name := range args.Names {
   253  		if !aclObj.AllowNodePoolOperation(name, acl.NodePoolCapabilityDelete) {
   254  			return structs.ErrPermissionDenied
   255  		}
   256  	}
   257  
   258  	// Only warn for expiration on delete because just parts of node pools are
   259  	// licensed, so they are allowed to be deleted.
   260  	_ = n.validateLicense(nil)
   261  
   262  	if !ServersMeetMinimumVersion(
   263  		n.srv.serf.Members(), n.srv.Region(), minNodePoolsVersion, true) {
   264  		return fmt.Errorf("all servers must be running version %v or later to delete node pools", minNodePoolsVersion)
   265  	}
   266  
   267  	// Validate request.
   268  	if len(args.Names) == 0 {
   269  		return structs.NewErrRPCCodedf(http.StatusBadRequest, "must specify at least one node pool to delete")
   270  	}
   271  	for _, name := range args.Names {
   272  		if name == "" {
   273  			return structs.NewErrRPCCodedf(http.StatusBadRequest, "node pool name is empty")
   274  		}
   275  	}
   276  
   277  	// Verify that the node pools we're deleting do not have nodes or
   278  	// non-terminal jobs in this region or in any federated region.
   279  	var mErr multierror.Error
   280  	for _, name := range args.Names {
   281  		regionsWithNonTerminal, regionsWithNodes, err := n.nodePoolRegionsInUse(args.AuthToken, name)
   282  		if err != nil {
   283  			_ = multierror.Append(&mErr, err)
   284  		}
   285  		if len(regionsWithNonTerminal) != 0 {
   286  			_ = multierror.Append(&mErr, fmt.Errorf(
   287  				"node pool %q has non-terminal jobs in regions: %v", name, regionsWithNonTerminal))
   288  		}
   289  		if len(regionsWithNodes) != 0 {
   290  			_ = multierror.Append(&mErr, fmt.Errorf(
   291  				"node pool %q has nodes in regions: %v", name, regionsWithNodes))
   292  		}
   293  	}
   294  
   295  	if err := mErr.ErrorOrNil(); err != nil {
   296  		return err
   297  	}
   298  
   299  	// Delete via Raft.
   300  	_, index, err := n.srv.raftApply(structs.NodePoolDeleteRequestType, args)
   301  	if err != nil {
   302  		return err
   303  	}
   304  
   305  	reply.Index = index
   306  	return nil
   307  }
   308  
   309  // nodePoolRegionsInUse returns a list of regions where the node pool is still
   310  // in use for non-terminal jobs, and a list of regions where it is in use by
   311  // nodes.
   312  func (n *NodePool) nodePoolRegionsInUse(token, poolName string) ([]string, []string, error) {
   313  	regions := n.srv.Regions()
   314  	thisRegion := n.srv.Region()
   315  	hasNodes := make([]string, 0, len(regions))
   316  	hasNonTerminal := make([]string, 0, len(regions))
   317  
   318  	// Check if the pool in use in this region
   319  	snap, err := n.srv.State().Snapshot()
   320  	if err != nil {
   321  		return nil, nil, err
   322  	}
   323  	iter, err := snap.NodesByNodePool(nil, poolName)
   324  	if err != nil {
   325  		return nil, nil, err
   326  	}
   327  	found := iter.Next()
   328  	if found != nil {
   329  		hasNodes = append(hasNodes, thisRegion)
   330  	}
   331  	iter, err = snap.JobsByPool(nil, poolName)
   332  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
   333  		job := raw.(*structs.Job)
   334  		if job.Status != structs.JobStatusDead {
   335  			hasNonTerminal = append(hasNonTerminal, thisRegion)
   336  			break
   337  		}
   338  	}
   339  
   340  	for _, region := range regions {
   341  		if region == thisRegion {
   342  			continue
   343  		}
   344  
   345  		nodesReq := &structs.NodePoolNodesRequest{
   346  			Name: poolName,
   347  			QueryOptions: structs.QueryOptions{
   348  				Region:    region,
   349  				AuthToken: token,
   350  				PerPage:   1, // we only care if there are any
   351  			},
   352  		}
   353  		var nodesResp structs.NodePoolNodesResponse
   354  		err := n.srv.RPC("NodePool.ListNodes", nodesReq, &nodesResp)
   355  		if err != nil {
   356  			return hasNodes, hasNonTerminal, err
   357  		}
   358  		if len(nodesResp.Nodes) != 0 {
   359  			hasNodes = append(hasNodes, region)
   360  		}
   361  
   362  		jobsReq := &structs.NodePoolJobsRequest{
   363  			Name: poolName,
   364  			QueryOptions: structs.QueryOptions{
   365  				Region:    region,
   366  				AuthToken: token,
   367  				PerPage:   1, // we only care if there are any
   368  				Filter:    `Status != "dead"`,
   369  			},
   370  		}
   371  		var jobsResp structs.NodePoolJobsResponse
   372  		err = n.srv.RPC("NodePool.ListJobs", jobsReq, &jobsResp)
   373  		if err != nil {
   374  			return hasNodes, hasNonTerminal, err
   375  		}
   376  
   377  		if len(jobsResp.Jobs) != 0 {
   378  			hasNonTerminal = append(hasNonTerminal, region)
   379  		}
   380  
   381  	}
   382  
   383  	return hasNonTerminal, hasNodes, err
   384  }
   385  
   386  // ListJobs is used to retrieve a list of jobs for a given node pool. It supports
   387  // pagination and filtering.
   388  func (n *NodePool) ListJobs(args *structs.NodePoolJobsRequest, reply *structs.NodePoolJobsResponse) error {
   389  	authErr := n.srv.Authenticate(n.ctx, args)
   390  	if done, err := n.srv.forward("NodePool.ListJobs", args, args, reply); done {
   391  		return err
   392  	}
   393  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricList, args)
   394  	if authErr != nil {
   395  		return structs.ErrPermissionDenied
   396  	}
   397  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "list_jobs"}, time.Now())
   398  
   399  	// Resolve ACL token and verify it has read capability for the pool.
   400  	aclObj, err := n.srv.ResolveACL(args)
   401  	if err != nil {
   402  		return err
   403  	}
   404  	if !aclObj.AllowNodePoolOperation(args.Name, acl.NodePoolCapabilityRead) {
   405  		return structs.ErrPermissionDenied
   406  	}
   407  	allowNsFunc := aclObj.AllowNsOpFunc(acl.NamespaceCapabilityListJobs)
   408  	namespace := args.RequestNamespace()
   409  
   410  	// Setup the blocking query. This largely mirrors the Jobs.List RPC but with
   411  	// an additional paginator filter for the node pool.
   412  	opts := blockingOptions{
   413  		queryOpts: &args.QueryOptions,
   414  		queryMeta: &reply.QueryMeta,
   415  		run: func(ws memdb.WatchSet, store *state.StateStore) error {
   416  			// ensure the node pool exists
   417  			pool, err := store.NodePoolByName(ws, args.Name)
   418  			if err != nil {
   419  				return err
   420  			}
   421  			if pool == nil {
   422  				return nil
   423  			}
   424  
   425  			var iter memdb.ResultIterator
   426  
   427  			// Get the namespaces the user is allowed to access.
   428  			allowableNamespaces, err := allowedNSes(aclObj, store, allowNsFunc)
   429  			if errors.Is(err, structs.ErrPermissionDenied) {
   430  				// return empty jobs if token isn't authorized for any
   431  				// namespace, matching other endpoints
   432  				reply.Jobs = make([]*structs.JobListStub, 0)
   433  			} else if err != nil {
   434  				return err
   435  			} else {
   436  
   437  				filters := []paginator.Filter{
   438  					paginator.NamespaceFilter{
   439  						AllowableNamespaces: allowableNamespaces,
   440  					},
   441  				}
   442  
   443  				if namespace == structs.AllNamespacesSentinel {
   444  					iter, err = store.JobsByPool(ws, args.Name)
   445  				} else {
   446  					iter, err = store.JobsByNamespace(ws, namespace)
   447  					filters = append(filters,
   448  						paginator.GenericFilter{
   449  							Allow: func(raw interface{}) (bool, error) {
   450  								job := raw.(*structs.Job)
   451  								if job == nil || job.NodePool != args.Name {
   452  									return false, nil
   453  								}
   454  								return true, nil
   455  							},
   456  						})
   457  				}
   458  				if err != nil {
   459  					return err
   460  				}
   461  
   462  				tokenizer := paginator.NewStructsTokenizer(
   463  					iter,
   464  					paginator.StructsTokenizerOptions{
   465  						WithNamespace: true,
   466  						WithID:        true,
   467  					},
   468  				)
   469  
   470  				var jobs []*structs.JobListStub
   471  
   472  				paginator, err := paginator.NewPaginator(iter, tokenizer, filters, args.QueryOptions,
   473  					func(raw interface{}) error {
   474  						job := raw.(*structs.Job)
   475  						summary, err := store.JobSummaryByID(ws, job.Namespace, job.ID)
   476  						if err != nil || summary == nil {
   477  							return fmt.Errorf("unable to look up summary for job: %v", job.ID)
   478  						}
   479  						jobs = append(jobs, job.Stub(summary, args.Fields))
   480  						return nil
   481  					})
   482  				if err != nil {
   483  					return structs.NewErrRPCCodedf(
   484  						http.StatusBadRequest, "failed to create result paginator: %v", err)
   485  				}
   486  
   487  				nextToken, err := paginator.Page()
   488  				if err != nil {
   489  					return structs.NewErrRPCCodedf(
   490  						http.StatusBadRequest, "failed to read result page: %v", err)
   491  				}
   492  
   493  				reply.QueryMeta.NextToken = nextToken
   494  				reply.Jobs = jobs
   495  			}
   496  
   497  			// Use the last index that affected the jobs table or summary
   498  			jindex, err := store.Index("jobs")
   499  			if err != nil {
   500  				return err
   501  			}
   502  			sindex, err := store.Index("job_summary")
   503  			if err != nil {
   504  				return err
   505  			}
   506  			reply.Index = helper.Max(jindex, sindex)
   507  
   508  			// Set the query response
   509  			n.srv.setQueryMeta(&reply.QueryMeta)
   510  			return nil
   511  		}}
   512  	return n.srv.blockingRPC(&opts)
   513  }
   514  
   515  // ListNodes is used to retrieve a list of nodes for a give node pool. It
   516  // supports pagination and filtering.
   517  func (n *NodePool) ListNodes(args *structs.NodePoolNodesRequest, reply *structs.NodePoolNodesResponse) error {
   518  	authErr := n.srv.Authenticate(n.ctx, args)
   519  	if done, err := n.srv.forward("NodePool.ListNodes", args, args, reply); done {
   520  		return err
   521  	}
   522  	n.srv.MeasureRPCRate("node_pool", structs.RateMetricList, args)
   523  	if authErr != nil {
   524  		return structs.ErrPermissionDenied
   525  	}
   526  	defer metrics.MeasureSince([]string{"nomad", "node_pool", "list_nodes"}, time.Now())
   527  
   528  	// Resolve ACL token and verify it has read capability for nodes and the
   529  	// node pool.
   530  	aclObj, err := n.srv.ResolveACL(args)
   531  	if err != nil {
   532  		return err
   533  	}
   534  
   535  	allowed := aclObj.AllowNodeRead() &&
   536  		aclObj.AllowNodePoolOperation(args.Name, acl.NodePoolCapabilityRead)
   537  	if !allowed {
   538  		return structs.ErrPermissionDenied
   539  	}
   540  
   541  	// Setup blocking query.
   542  	opts := blockingOptions{
   543  		queryOpts: &args.QueryOptions,
   544  		queryMeta: &reply.QueryMeta,
   545  		run: func(ws memdb.WatchSet, store *state.StateStore) error {
   546  			// Verify node pool exists.
   547  			pool, err := store.NodePoolByName(ws, args.Name)
   548  			if err != nil {
   549  				return err
   550  			}
   551  			if pool == nil {
   552  				return nil
   553  			}
   554  
   555  			// Fetch nodes in the pool.
   556  			var iter memdb.ResultIterator
   557  			if args.Name == structs.NodePoolAll {
   558  				iter, err = store.Nodes(ws)
   559  			} else {
   560  				iter, err = store.NodesByNodePool(ws, args.Name)
   561  			}
   562  			if err != nil {
   563  				return err
   564  			}
   565  
   566  			// Setup paginator by node ID.
   567  			pageOpts := paginator.StructsTokenizerOptions{
   568  				WithID: true,
   569  			}
   570  			tokenizer := paginator.NewStructsTokenizer(iter, pageOpts)
   571  
   572  			var nodes []*structs.NodeListStub
   573  			pager, err := paginator.NewPaginator(iter, tokenizer, nil, args.QueryOptions,
   574  				func(raw interface{}) error {
   575  					node := raw.(*structs.Node)
   576  					nodes = append(nodes, node.Stub(args.Fields))
   577  					return nil
   578  				})
   579  			if err != nil {
   580  				return structs.NewErrRPCCodedf(http.StatusBadRequest, "failed to create result paginator: %v", err)
   581  			}
   582  
   583  			nextToken, err := pager.Page()
   584  			if err != nil {
   585  				return structs.NewErrRPCCodedf(http.StatusBadRequest, "failed to read result page: %v", err)
   586  			}
   587  
   588  			reply.QueryMeta.NextToken = nextToken
   589  			reply.Nodes = nodes
   590  
   591  			// Use the last index that affected the nodes table.
   592  			index, err := store.Index("nodes")
   593  			if err != nil {
   594  				return err
   595  			}
   596  			reply.Index = helper.Max(1, index)
   597  
   598  			// Set the query response.
   599  			n.srv.setQueryMeta(&reply.QueryMeta)
   600  			return nil
   601  		}}
   602  	return n.srv.blockingRPC(&opts)
   603  }