github.com/hernad/nomad@v1.6.112/nomad/state/state_store.go

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package state
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"reflect"
    11  	"sort"
    12  	"strings"
    13  	"time"
    14  
    15  	"github.com/hashicorp/go-bexpr"
    16  	"github.com/hashicorp/go-hclog"
    17  	"github.com/hashicorp/go-memdb"
    18  	"github.com/hashicorp/go-multierror"
    19  	"github.com/hashicorp/go-set"
    20  	"github.com/hernad/nomad/helper/pointer"
    21  	"github.com/hernad/nomad/lib/lang"
    22  	"github.com/hernad/nomad/nomad/stream"
    23  	"github.com/hernad/nomad/nomad/structs"
    24  	"golang.org/x/exp/slices"
    25  )
    26  
    27  // Txn is a transaction against a state store.
    28  // This can be a read or write transaction.
    29  type Txn = *txn
    30  
    31  // SortOption represents how results can be sorted.
    32  type SortOption bool
    33  
    34  const (
    35  	// SortDefault indicates that the result should be returned using the
    36  	// default go-memdb ResultIterator order.
    37  	SortDefault SortOption = false
    38  
    39  	// SortReverse indicates that the result should be returned using the
    40  	// reversed go-memdb ResultIterator order.
    41  	SortReverse SortOption = true
    42  )
    43  
    44  // NodeUpsertOption represents options to configure a NodeUpsert operation.
    45  type NodeUpsertOption uint8
    46  
    47  const (
    48  	// NodeUpsertWithNodePool indicates that the node pool in the node should
    49  	// be created if it doesn't exist.
    50  	NodeUpsertWithNodePool NodeUpsertOption = iota
    51  )
    52  
    53  const (
    54  	// NodeEligibilityEventPlanRejectThreshold is the message used when the node
    55  	// is set to ineligible due to multiple plan failures.
    56  	// This is a preventive measure to signal scheduler workers to not consider
    57  	// the node for future placements.
    58  	// Plan rejections for a node are expected due to the optimistic and
    59  	// concurrent nature of the scheduling process, but repeated failures for
    60  	// the same node may indicate an underlying issue not detected by Nomad.
    61  	// The plan applier keeps track of plan rejection history and will mark
    62  	// nodes as ineligible if they cross a given threshold.
    63  	NodeEligibilityEventPlanRejectThreshold = "Node marked as ineligible for scheduling due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information"
    64  
    65  	// NodeRegisterEventRegistered is the message used when the node becomes
    66  	// registered.
    67  	NodeRegisterEventRegistered = "Node registered"
    68  
    69  	// NodeRegisterEventReregistered is the message used when the node becomes
    70  	// re-registered.
    71  	NodeRegisterEventReregistered = "Node re-registered"
    72  )
    73  
    74  // terminate appends the go-memdb terminator character to s.
    75  //
    76  // We can then use the result for exact matches during prefix
    77  // scans over compound indexes that start with s.
    78  func terminate(s string) string {
    79  	return s + "\x00"
    80  }
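
// Illustrative sketch: without the terminator, a prefix scan for namespace
// "prod" would also match "production"; terminating the key makes the match
// exact while still using the compound prefix index (txn here is assumed to
// be a read transaction).
//
//	it, err := txn.Get("deployment", "namespace_create_prefix", terminate("prod"))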
    81  
    82  // IndexEntry is used with the "index" table
    83  // for managing the latest Raft index affecting a table.
    84  type IndexEntry struct {
    85  	Key   string
    86  	Value uint64
    87  }
    88  
    89  // StateStoreConfig is used to configure a new state store
    90  type StateStoreConfig struct {
    91  	// Logger is used to output the state store's logs
    92  	Logger hclog.Logger
    93  
    94  	// Region is the region of the server embedding the state store.
    95  	Region string
    96  
    97  	// EnablePublisher is used to enable or disable the event publisher
    98  	EnablePublisher bool
    99  
    100  	// EventBufferSize configures the number of events to hold in memory
   101  	EventBufferSize int64
   102  }
   103  
   104  // The StateStore is responsible for maintaining all the Nomad
   105  // state. It is manipulated by the FSM which maintains consistency
   106  // through the use of Raft. The goals of the StateStore are to provide
   107  // high concurrency for read operations without blocking writes, and
   108  // to provide write availability in the face of reads. EVERY object
   109  // returned as a result of a read against the state store should be
   110  // considered a constant and NEVER modified in place.
   111  type StateStore struct {
   112  	logger hclog.Logger
   113  	db     *changeTrackerDB
   114  
   115  	// config is the passed in configuration
   116  	config *StateStoreConfig
   117  
   118  	// abandonCh is used to signal watchers that this state store has been
   119  	// abandoned (usually during a restore). This is only ever closed.
   120  	abandonCh chan struct{}
   121  
   122  	// TODO: refactor abandonCh to use a context so that both can use the same
   123  	// cancel mechanism.
   124  	stopEventBroker func()
   125  }
   126  
   127  type streamACLDelegate struct {
   128  	s *StateStore
   129  }
   130  
   131  func (a *streamACLDelegate) TokenProvider() stream.ACLTokenProvider {
   132  	resolver, _ := a.s.Snapshot()
   133  	return resolver
   134  }
   135  
   136  // NewStateStore is used to create a new state store
   137  func NewStateStore(config *StateStoreConfig) (*StateStore, error) {
   138  	// Create the MemDB
   139  	db, err := memdb.NewMemDB(stateStoreSchema())
   140  	if err != nil {
   141  		return nil, fmt.Errorf("state store setup failed: %v", err)
   142  	}
   143  
   144  	// Create the state store
   145  	ctx, cancel := context.WithCancel(context.TODO())
   146  	s := &StateStore{
   147  		logger:          config.Logger.Named("state_store"),
   148  		config:          config,
   149  		abandonCh:       make(chan struct{}),
   150  		stopEventBroker: cancel,
   151  	}
   152  
   153  	if config.EnablePublisher {
   154  		// Create new event publisher using provided config
   155  		broker, err := stream.NewEventBroker(ctx, &streamACLDelegate{s}, stream.EventBrokerCfg{
   156  			EventBufferSize: config.EventBufferSize,
   157  			Logger:          config.Logger,
   158  		})
   159  		if err != nil {
    160  		return nil, fmt.Errorf("creating state store event broker: %w", err)
   161  		}
   162  		s.db = NewChangeTrackerDB(db, broker, eventsFromChanges)
   163  	} else {
   164  		s.db = NewChangeTrackerDB(db, nil, noOpProcessChanges)
   165  	}
   166  
   167  	// Initialize the state store with the default namespace and built-in node
   168  	// pools.
   169  	if err := s.namespaceInit(); err != nil {
   170  		return nil, fmt.Errorf("namespace state store initialization failed: %v", err)
   171  	}
   172  	if err := s.nodePoolInit(); err != nil {
   173  		return nil, fmt.Errorf("node pool state store initialization failed: %w", err)
   174  	}
   175  
   176  	return s, nil
   177  }
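
// newExampleStateStore is an illustrative sketch of how a state store is
// typically constructed from a StateStoreConfig; it assumes a no-op hclog
// logger and event publishing enabled with a small buffer.
func newExampleStateStore() (*StateStore, error) {
	return NewStateStore(&StateStoreConfig{
		Logger:          hclog.NewNullLogger(),
		Region:          "global",
		EnablePublisher: true,
		EventBufferSize: 100,
	})
}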
   178  
    179  // NewWatchSet returns a new memdb.WatchSet that adds the state store's abandonCh
    180  // as a watcher. This is important in that it will notify when this specific
    181  // state store is no longer valid, usually due to a new snapshot being loaded.
   182  func (s *StateStore) NewWatchSet() memdb.WatchSet {
   183  	ws := memdb.NewWatchSet()
   184  	ws.Add(s.AbandonCh())
   185  	return ws
   186  }
   187  
   188  func (s *StateStore) EventBroker() (*stream.EventBroker, error) {
   189  	if s.db.publisher == nil {
   190  		return nil, fmt.Errorf("EventBroker not configured")
   191  	}
   192  	return s.db.publisher, nil
   193  }
   194  
   195  // namespaceInit ensures the default namespace exists.
   196  func (s *StateStore) namespaceInit() error {
    197  	// Create the default namespace. This is safe to do every time we create the
    198  	// state store. There are two main cases: a brand new cluster, in which case
    199  	// each server will have the same default namespace object, or a restore of an
    200  	// existing cluster, in which case any modified default namespace will override
    201  	// this one via the restore code path.
   202  	defaultNs := &structs.Namespace{
   203  		Name:        structs.DefaultNamespace,
   204  		Description: structs.DefaultNamespaceDescription,
   205  	}
   206  
   207  	if err := s.UpsertNamespaces(1, []*structs.Namespace{defaultNs}); err != nil {
   208  		return fmt.Errorf("inserting default namespace failed: %v", err)
   209  	}
   210  
   211  	return nil
   212  }
   213  
   214  // Config returns the state store configuration.
   215  func (s *StateStore) Config() *StateStoreConfig {
   216  	return s.config
   217  }
   218  
   219  // Snapshot is used to create a point in time snapshot. Because
   220  // we use MemDB, we just need to snapshot the state of the underlying
   221  // database.
   222  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
   223  	memDBSnap := s.db.memdb.Snapshot()
   224  
   225  	store := StateStore{
   226  		logger: s.logger,
   227  		config: s.config,
   228  	}
   229  
   230  	// Create a new change tracker DB that does not publish or track changes
   231  	store.db = NewChangeTrackerDB(memDBSnap, nil, noOpProcessChanges)
   232  
   233  	snap := &StateSnapshot{
   234  		StateStore: store,
   235  	}
   236  	return snap, nil
   237  }
   238  
   239  // SnapshotMinIndex is used to create a state snapshot where the index is
   240  // guaranteed to be greater than or equal to the index parameter.
   241  //
   242  // Some server operations (such as scheduling) exchange objects via RPC
   243  // concurrent with Raft log application, so they must ensure the state store
   244  // snapshot they are operating on is at or after the index the objects
   245  // retrieved via RPC were applied to the Raft log at.
   246  //
   247  // Callers should maintain their own timer metric as the time this method
   248  // blocks indicates Raft log application latency relative to scheduling.
   249  func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) {
   250  	// Ported from work.go:waitForIndex prior to 0.9
   251  
   252  	const backoffBase = 20 * time.Millisecond
   253  	const backoffLimit = 1 * time.Second
   254  	var retries uint
   255  	var retryTimer *time.Timer
   256  
   257  	// XXX: Potential optimization is to set up a watch on the state
   258  	// store's index table and only unblock via a trigger rather than
   259  	// polling.
   260  	for {
   261  		// Get the states current index
   262  		snapshotIndex, err := s.LatestIndex()
   263  		if err != nil {
   264  			return nil, fmt.Errorf("failed to determine state store's index: %w", err)
   265  		}
   266  
   267  		// We only need the FSM state to be as recent as the given index
   268  		if snapshotIndex >= index {
   269  			return s.Snapshot()
   270  		}
   271  
   272  		// Exponential back off
   273  		retries++
   274  		if retryTimer == nil {
   275  			// First retry, start at baseline
   276  			retryTimer = time.NewTimer(backoffBase)
   277  		} else {
   278  			// Subsequent retry, reset timer
   279  			deadline := 1 << (2 * retries) * backoffBase
   280  			if deadline > backoffLimit {
   281  				deadline = backoffLimit
   282  			}
   283  			retryTimer.Reset(deadline)
   284  		}
   285  
   286  		select {
   287  		case <-ctx.Done():
   288  			return nil, ctx.Err()
   289  		case <-retryTimer.C:
   290  		}
   291  	}
   292  }
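
// exampleSnapshotMinIndex is an illustrative sketch: it gives the state store
// up to five seconds to catch up to minIndex before taking a snapshot, much
// as a scheduler worker might after receiving objects over RPC.
func exampleSnapshotMinIndex(s *StateStore, minIndex uint64) (*StateSnapshot, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	return s.SnapshotMinIndex(ctx, minIndex)
}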
   293  
   294  // Restore is used to optimize the efficiency of rebuilding
   295  // state by minimizing the number of transactions and checking
   296  // overhead.
   297  func (s *StateStore) Restore() (*StateRestore, error) {
   298  	txn := s.db.WriteTxnRestore()
   299  	r := &StateRestore{
   300  		txn: txn,
   301  	}
   302  	return r, nil
   303  }
   304  
   305  // AbandonCh returns a channel you can wait on to know if the state store was
   306  // abandoned.
   307  func (s *StateStore) AbandonCh() <-chan struct{} {
   308  	return s.abandonCh
   309  }
   310  
   311  // Abandon is used to signal that the given state store has been abandoned.
    312  // Calling this more than once will panic.
   313  func (s *StateStore) Abandon() {
   314  	s.StopEventBroker()
   315  	close(s.abandonCh)
   316  }
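
// Illustrative sketch: a long-lived watcher can select on AbandonCh to learn
// that this state store instance has been replaced (for example after a
// snapshot restore); store and ctx are hypothetical caller-side names.
//
//	select {
//	case <-store.AbandonCh():
//		// re-acquire the new state store before issuing further reads
//	case <-ctx.Done():
//	}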
   317  
    318  // StopEventBroker calls the cancel func for the state store's event
    319  // publisher. It should be called during server shutdown.
   320  func (s *StateStore) StopEventBroker() {
   321  	s.stopEventBroker()
   322  }
   323  
   324  // QueryFn is the definition of a function that can be used to implement a basic
   325  // blocking query against the state store.
   326  type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error)
   327  
   328  // BlockingQuery takes a query function and runs the function until the minimum
   329  // query index is met or until the passed context is cancelled.
   330  func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) (
   331  	resp interface{}, index uint64, err error) {
   332  
   333  RUN_QUERY:
   334  	// We capture the state store and its abandon channel but pass a snapshot to
   335  	// the blocking query function. We operate on the snapshot to allow separate
   336  	// calls to the state store not all wrapped within the same transaction.
   337  	abandonCh := s.AbandonCh()
   338  	snap, _ := s.Snapshot()
   339  	stateSnap := &snap.StateStore
   340  
   341  	// We can skip all watch tracking if this isn't a blocking query.
   342  	var ws memdb.WatchSet
   343  	if minIndex > 0 {
   344  		ws = memdb.NewWatchSet()
   345  
   346  		// This channel will be closed if a snapshot is restored and the
   347  		// whole state store is abandoned.
   348  		ws.Add(abandonCh)
   349  	}
   350  
   351  	resp, index, err = query(ws, stateSnap)
   352  	if err != nil {
   353  		return nil, index, err
   354  	}
   355  
   356  	// We haven't reached the min-index yet.
   357  	if minIndex > 0 && index <= minIndex {
   358  		if err := ws.WatchCtx(ctx); err != nil {
   359  			return nil, index, err
   360  		}
   361  
   362  		goto RUN_QUERY
   363  	}
   364  
   365  	return resp, index, nil
   366  }
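
// exampleBlockingDeploymentQuery is an illustrative sketch of a QueryFn used
// with BlockingQuery: it watches a single deployment and blocks until the
// state store index advances past minIndex or the context is cancelled. The
// function name and the use of LatestIndex as the reported index are
// assumptions for the sketch, not part of the original file.
func exampleBlockingDeploymentQuery(ctx context.Context, s *StateStore, deploymentID string, minIndex uint64) (*structs.Deployment, uint64, error) {
	query := func(ws memdb.WatchSet, store *StateStore) (interface{}, uint64, error) {
		// DeploymentByID registers a watch channel on ws, so the blocking
		// query wakes up when this deployment is written.
		d, err := store.DeploymentByID(ws, deploymentID)
		if err != nil {
			return nil, 0, err
		}
		index, err := store.LatestIndex()
		if err != nil {
			return nil, 0, err
		}
		return d, index, nil
	}

	raw, index, err := s.BlockingQuery(query, minIndex, ctx)
	if err != nil {
		return nil, index, err
	}
	return raw.(*structs.Deployment), index, nil
}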
   367  
   368  // UpsertPlanResults is used to upsert the results of a plan.
   369  func (s *StateStore) UpsertPlanResults(msgType structs.MessageType, index uint64, results *structs.ApplyPlanResultsRequest) error {
   370  	snapshot, err := s.Snapshot()
   371  	if err != nil {
   372  		return err
   373  	}
   374  
   375  	allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped)
   376  	if err != nil {
   377  		return err
   378  	}
   379  
   380  	allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted)
   381  	if err != nil {
   382  		return err
   383  	}
   384  
   385  	// COMPAT 0.11: Remove this denormalization when NodePreemptions is removed
   386  	results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions)
   387  	if err != nil {
   388  		return err
   389  	}
   390  
   391  	txn := s.db.WriteTxnMsgT(msgType, index)
   392  	defer txn.Abort()
   393  
   394  	// Mark nodes as ineligible.
   395  	for _, nodeID := range results.IneligibleNodes {
   396  		s.logger.Warn("marking node as ineligible due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information", "node_id", nodeID)
   397  
   398  		nodeEvent := structs.NewNodeEvent().
   399  			SetSubsystem(structs.NodeEventSubsystemScheduler).
   400  			SetMessage(NodeEligibilityEventPlanRejectThreshold)
   401  
   402  		err := s.updateNodeEligibilityImpl(index, nodeID,
   403  			structs.NodeSchedulingIneligible, results.UpdatedAt, nodeEvent, txn)
   404  		if err != nil {
   405  			return err
   406  		}
   407  	}
   408  
   409  	// Upsert the newly created or updated deployment
   410  	if results.Deployment != nil {
   411  		if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil {
   412  			return err
   413  		}
   414  	}
   415  
    416  	// Update the status of deployments affected by the plan.
    417  	if err := s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn); err != nil {
    418  		return err
    419  	}
   420  
   421  	if results.EvalID != "" {
   422  		// Update the modify index of the eval id
   423  		if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil {
   424  			return err
   425  		}
   426  	}
   427  
   428  	numAllocs := 0
   429  	if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 {
   430  		// COMPAT 0.11: This branch will be removed, when Alloc is removed
   431  		// Attach the job to all the allocations. It is pulled out in the payload to
   432  		// avoid the redundancy of encoding, but should be denormalized prior to
   433  		// being inserted into MemDB.
   434  		addComputedAllocAttrs(results.Alloc, results.Job)
   435  		numAllocs = len(results.Alloc) + len(results.NodePreemptions)
   436  	} else {
   437  		// Attach the job to all the allocations. It is pulled out in the payload to
   438  		// avoid the redundancy of encoding, but should be denormalized prior to
   439  		// being inserted into MemDB.
   440  		addComputedAllocAttrs(results.AllocsUpdated, results.Job)
   441  		numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted)
   442  	}
   443  
   444  	allocsToUpsert := make([]*structs.Allocation, 0, numAllocs)
   445  
   446  	// COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed
   447  	allocsToUpsert = append(allocsToUpsert, results.Alloc...)
   448  	allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...)
   449  
   450  	allocsToUpsert = append(allocsToUpsert, allocsStopped...)
   451  	allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...)
   452  	allocsToUpsert = append(allocsToUpsert, allocsPreempted...)
   453  
   454  	// handle upgrade path
   455  	for _, alloc := range allocsToUpsert {
   456  		alloc.Canonicalize()
   457  	}
   458  
   459  	if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil {
   460  		return err
   461  	}
   462  
   463  	// Upsert followup evals for allocs that were preempted
   464  	for _, eval := range results.PreemptionEvals {
   465  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   466  			return err
   467  		}
   468  	}
   469  
   470  	return txn.Commit()
   471  }
   472  
   473  // addComputedAllocAttrs adds the computed/derived attributes to the allocation.
   474  // This method is used when an allocation is being denormalized.
   475  func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) {
   476  	structs.DenormalizeAllocationJobs(job, allocs)
   477  
   478  	// COMPAT(0.11): Remove in 0.11
   479  	// Calculate the total resources of allocations. It is pulled out in the
   480  	// payload to avoid encoding something that can be computed, but should be
   481  	// denormalized prior to being inserted into MemDB.
   482  	for _, alloc := range allocs {
   483  		if alloc.Resources != nil {
   484  			continue
   485  		}
   486  
   487  		alloc.Resources = new(structs.Resources)
   488  		for _, task := range alloc.TaskResources {
   489  			alloc.Resources.Add(task)
   490  		}
   491  
   492  		// Add the shared resources
   493  		alloc.Resources.Add(alloc.SharedResources)
   494  	}
   495  }
   496  
   497  // upsertDeploymentUpdates updates the deployments given the passed status
   498  // updates.
   499  func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *txn) error {
   500  	for _, u := range updates {
   501  		if err := s.updateDeploymentStatusImpl(index, u, txn); err != nil {
   502  			return err
   503  		}
   504  	}
   505  
   506  	return nil
   507  }
   508  
   509  // UpsertJobSummary upserts a job summary into the state store.
   510  func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
   511  	txn := s.db.WriteTxn(index)
   512  	defer txn.Abort()
   513  
   514  	// Check if the job summary already exists
   515  	existing, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID)
   516  	if err != nil {
   517  		return fmt.Errorf("job summary lookup failed: %v", err)
   518  	}
   519  
   520  	// Setup the indexes correctly
   521  	if existing != nil {
   522  		jobSummary.CreateIndex = existing.(*structs.JobSummary).CreateIndex
   523  		jobSummary.ModifyIndex = index
   524  	} else {
   525  		jobSummary.CreateIndex = index
   526  		jobSummary.ModifyIndex = index
   527  	}
   528  
   529  	// Update the index
   530  	if err := txn.Insert("job_summary", jobSummary); err != nil {
   531  		return err
   532  	}
   533  
   534  	// Update the indexes table for job summary
   535  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   536  		return fmt.Errorf("index update failed: %v", err)
   537  	}
   538  
   539  	return txn.Commit()
   540  }
   541  
   542  // DeleteJobSummary deletes the job summary with the given ID. This is for
   543  // testing purposes only.
   544  func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error {
   545  	txn := s.db.WriteTxn(index)
   546  	defer txn.Abort()
   547  
   548  	// Delete the job summary
   549  	if _, err := txn.DeleteAll("job_summary", "id", namespace, id); err != nil {
   550  		return fmt.Errorf("deleting job summary failed: %v", err)
   551  	}
   552  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   553  		return fmt.Errorf("index update failed: %v", err)
   554  	}
   555  	return txn.Commit()
   556  }
   557  
   558  // UpsertDeployment is used to insert or update a new deployment.
   559  func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error {
   560  	txn := s.db.WriteTxn(index)
   561  	defer txn.Abort()
   562  	if err := s.upsertDeploymentImpl(index, deployment, txn); err != nil {
   563  		return err
   564  	}
   565  	return txn.Commit()
   566  }
   567  
   568  func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *txn) error {
   569  	// Check if the deployment already exists
   570  	existing, err := txn.First("deployment", "id", deployment.ID)
   571  	if err != nil {
   572  		return fmt.Errorf("deployment lookup failed: %v", err)
   573  	}
   574  
   575  	// Setup the indexes correctly
   576  	if existing != nil {
   577  		deployment.CreateIndex = existing.(*structs.Deployment).CreateIndex
   578  		deployment.ModifyIndex = index
   579  	} else {
   580  		deployment.CreateIndex = index
   581  		deployment.ModifyIndex = index
   582  	}
   583  
   584  	// Insert the deployment
   585  	if err := txn.Insert("deployment", deployment); err != nil {
   586  		return err
   587  	}
   588  
   589  	// Update the indexes table for deployment
   590  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   591  		return fmt.Errorf("index update failed: %v", err)
   592  	}
   593  
   594  	// If the deployment is being marked as complete, set the job to stable.
   595  	if deployment.Status == structs.DeploymentStatusSuccessful {
   596  		if err := s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn); err != nil {
   597  			return fmt.Errorf("failed to update job stability: %v", err)
   598  		}
   599  	}
   600  
   601  	return nil
   602  }
   603  
   604  func (s *StateStore) Deployments(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
   605  	txn := s.db.ReadTxn()
   606  
   607  	var it memdb.ResultIterator
   608  	var err error
   609  
   610  	switch sort {
   611  	case SortReverse:
   612  		it, err = txn.GetReverse("deployment", "create")
   613  	default:
   614  		it, err = txn.Get("deployment", "create")
   615  	}
   616  
   617  	if err != nil {
   618  		return nil, err
   619  	}
   620  
   621  	ws.Add(it.WatchCh())
   622  
   623  	return it, nil
   624  }
   625  
   626  func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
   627  	txn := s.db.ReadTxn()
   628  
   629  	// Walk the entire deployments table
   630  	iter, err := txn.Get("deployment", "namespace", namespace)
   631  	if err != nil {
   632  		return nil, err
   633  	}
   634  
   635  	ws.Add(iter.WatchCh())
   636  	return iter, nil
   637  }
   638  
   639  func (s *StateStore) DeploymentsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
   640  	txn := s.db.ReadTxn()
   641  
   642  	var (
   643  		it    memdb.ResultIterator
   644  		err   error
   645  		exact = terminate(namespace)
   646  	)
   647  
   648  	switch sort {
   649  	case SortReverse:
   650  		it, err = txn.GetReverse("deployment", "namespace_create_prefix", exact)
   651  	default:
   652  		it, err = txn.Get("deployment", "namespace_create_prefix", exact)
   653  	}
   654  
   655  	if err != nil {
   656  		return nil, err
   657  	}
   658  
   659  	ws.Add(it.WatchCh())
   660  
   661  	return it, nil
   662  }
   663  
   664  func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string, sort SortOption) (memdb.ResultIterator, error) {
   665  	txn := s.db.ReadTxn()
   666  
   667  	var iter memdb.ResultIterator
   668  	var err error
   669  
   670  	// Walk the entire deployments table
   671  	switch sort {
   672  	case SortReverse:
   673  		iter, err = txn.GetReverse("deployment", "id_prefix", deploymentID)
   674  	default:
   675  		iter, err = txn.Get("deployment", "id_prefix", deploymentID)
   676  	}
   677  	if err != nil {
   678  		return nil, err
   679  	}
   680  
   681  	ws.Add(iter.WatchCh())
   682  
   683  	// Wrap the iterator in a filter
   684  	wrap := memdb.NewFilterIterator(iter, deploymentNamespaceFilter(namespace))
   685  	return wrap, nil
   686  }
   687  
   688  // deploymentNamespaceFilter returns a filter function that filters all
    689  // deployments not in the given namespace.
   690  func deploymentNamespaceFilter(namespace string) func(interface{}) bool {
   691  	return func(raw interface{}) bool {
   692  		d, ok := raw.(*structs.Deployment)
   693  		if !ok {
   694  			return true
   695  		}
   696  
   697  		return namespace != structs.AllNamespacesSentinel &&
   698  			d.Namespace != namespace
   699  	}
   700  }
   701  
   702  func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) {
   703  	txn := s.db.ReadTxn()
   704  	return s.deploymentByIDImpl(ws, deploymentID, txn)
   705  }
   706  
   707  func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *txn) (*structs.Deployment, error) {
   708  	watchCh, existing, err := txn.FirstWatch("deployment", "id", deploymentID)
   709  	if err != nil {
   710  		return nil, fmt.Errorf("deployment lookup failed: %v", err)
   711  	}
   712  	ws.Add(watchCh)
   713  
   714  	if existing != nil {
   715  		return existing.(*structs.Deployment), nil
   716  	}
   717  
   718  	return nil, nil
   719  }
   720  
   721  func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) {
   722  	txn := s.db.ReadTxn()
   723  
   724  	var job *structs.Job
   725  	// Read job from state store
   726  	_, existing, err := txn.FirstWatch("jobs", "id", namespace, jobID)
   727  	if err != nil {
   728  		return nil, fmt.Errorf("job lookup failed: %v", err)
   729  	}
   730  	if existing != nil {
   731  		job = existing.(*structs.Job)
   732  	}
   733  
   734  	// Get an iterator over the deployments
   735  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   736  	if err != nil {
   737  		return nil, err
   738  	}
   739  
   740  	ws.Add(iter.WatchCh())
   741  
   742  	var out []*structs.Deployment
   743  	for {
   744  		raw := iter.Next()
   745  		if raw == nil {
   746  			break
   747  		}
   748  		d := raw.(*structs.Deployment)
   749  
    750  		// If the deployment belongs to a job with the same ID but a different
    751  		// create index, and we are not requesting all deployments for that
    752  		// job ID, then skip it.
   753  		if !all && job != nil && d.JobCreateIndex != job.CreateIndex {
   754  			continue
   755  		}
   756  		out = append(out, d)
   757  	}
   758  
   759  	return out, nil
   760  }
   761  
   762  // LatestDeploymentByJobID returns the latest deployment for the given job. The
   763  // latest is determined strictly by CreateIndex.
   764  func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) {
   765  	txn := s.db.ReadTxn()
   766  
   767  	// Get an iterator over the deployments
   768  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   769  	if err != nil {
   770  		return nil, err
   771  	}
   772  
   773  	ws.Add(iter.WatchCh())
   774  
   775  	var out *structs.Deployment
   776  	for {
   777  		raw := iter.Next()
   778  		if raw == nil {
   779  			break
   780  		}
   781  
   782  		d := raw.(*structs.Deployment)
   783  		if out == nil || out.CreateIndex < d.CreateIndex {
   784  			out = d
   785  		}
   786  	}
   787  
   788  	return out, nil
   789  }
   790  
   791  // DeleteDeployment is used to delete a set of deployments by ID
   792  func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error {
   793  	txn := s.db.WriteTxn(index)
   794  	defer txn.Abort()
   795  
   796  	if len(deploymentIDs) == 0 {
   797  		return nil
   798  	}
   799  
   800  	for _, deploymentID := range deploymentIDs {
   801  		// Lookup the deployment
   802  		existing, err := txn.First("deployment", "id", deploymentID)
   803  		if err != nil {
   804  			return fmt.Errorf("deployment lookup failed: %v", err)
   805  		}
   806  		if existing == nil {
   807  			return fmt.Errorf("deployment not found")
   808  		}
   809  
   810  		// Delete the deployment
   811  		if err := txn.Delete("deployment", existing); err != nil {
   812  			return fmt.Errorf("deployment delete failed: %v", err)
   813  		}
   814  	}
   815  
   816  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   817  		return fmt.Errorf("index update failed: %v", err)
   818  	}
   819  
   820  	return txn.Commit()
   821  }
   822  
   823  // UpsertScalingEvent is used to insert a new scaling event.
   824  // Only the most recent JobTrackedScalingEvents will be kept.
   825  func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error {
   826  	txn := s.db.WriteTxn(index)
   827  	defer txn.Abort()
   828  
   829  	// Get the existing events
   830  	existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID)
   831  	if err != nil {
   832  		return fmt.Errorf("scaling event lookup failed: %v", err)
   833  	}
   834  
   835  	var jobEvents *structs.JobScalingEvents
   836  	if existing != nil {
   837  		jobEvents = existing.(*structs.JobScalingEvents)
   838  	} else {
   839  		jobEvents = &structs.JobScalingEvents{
   840  			Namespace:     req.Namespace,
   841  			JobID:         req.JobID,
   842  			ScalingEvents: make(map[string][]*structs.ScalingEvent),
   843  		}
   844  	}
   845  
   846  	jobEvents.ModifyIndex = index
   847  	req.ScalingEvent.CreateIndex = index
   848  
   849  	events := jobEvents.ScalingEvents[req.TaskGroup]
   850  	// Prepend this latest event
   851  	events = append(
   852  		[]*structs.ScalingEvent{req.ScalingEvent},
   853  		events...,
   854  	)
   855  	// Truncate older events
   856  	if len(events) > structs.JobTrackedScalingEvents {
   857  		events = events[0:structs.JobTrackedScalingEvents]
   858  	}
   859  	jobEvents.ScalingEvents[req.TaskGroup] = events
   860  
   861  	// Insert the new event
   862  	if err := txn.Insert("scaling_event", jobEvents); err != nil {
   863  		return fmt.Errorf("scaling event insert failed: %v", err)
   864  	}
   865  
   866  	// Update the indexes table for scaling_event
   867  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
   868  		return fmt.Errorf("index update failed: %v", err)
   869  	}
   870  
   871  	return txn.Commit()
   872  }
   873  
   874  // ScalingEvents returns an iterator over all the job scaling events
   875  func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) {
   876  	txn := s.db.ReadTxn()
   877  
   878  	// Walk the entire scaling_event table
   879  	iter, err := txn.Get("scaling_event", "id")
   880  	if err != nil {
   881  		return nil, err
   882  	}
   883  
   884  	ws.Add(iter.WatchCh())
   885  
   886  	return iter, nil
   887  }
   888  
   889  func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) {
   890  	txn := s.db.ReadTxn()
   891  
   892  	watchCh, existing, err := txn.FirstWatch("scaling_event", "id", namespace, jobID)
   893  	if err != nil {
   894  		return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err)
   895  	}
   896  	ws.Add(watchCh)
   897  
   898  	if existing != nil {
   899  		events := existing.(*structs.JobScalingEvents)
   900  		return events.ScalingEvents, events.ModifyIndex, nil
   901  	}
   902  	return nil, 0, nil
   903  }
   904  
   905  // UpsertNode is used to register a node or update a node definition
   906  // This is assumed to be triggered by the client, so we retain the value
   907  // of drain/eligibility which is set by the scheduler.
   908  func (s *StateStore) UpsertNode(msgType structs.MessageType, index uint64, node *structs.Node, opts ...NodeUpsertOption) error {
   909  	txn := s.db.WriteTxnMsgT(msgType, index)
   910  	defer txn.Abort()
   911  
   912  	for _, opt := range opts {
   913  		// Create node pool if necessary.
   914  		if opt == NodeUpsertWithNodePool && node.NodePool != "" {
   915  			_, err := s.fetchOrCreateNodePoolTxn(txn, index, node.NodePool)
   916  			if err != nil {
   917  				return err
   918  			}
   919  		}
   920  	}
   921  
   922  	err := upsertNodeTxn(txn, index, node)
   923  	if err != nil {
    924  		return err
   925  	}
   926  	return txn.Commit()
   927  }
   928  
   929  func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error {
   930  	// Check if the node already exists
   931  	existing, err := txn.First("nodes", "id", node.ID)
   932  	if err != nil {
   933  		return fmt.Errorf("node lookup failed: %v", err)
   934  	}
   935  
   936  	// Setup the indexes correctly
   937  	if existing != nil {
   938  		exist := existing.(*structs.Node)
   939  		node.CreateIndex = exist.CreateIndex
   940  		node.ModifyIndex = index
   941  
   942  		// Update last missed heartbeat if the node became unresponsive.
   943  		if !exist.UnresponsiveStatus() && node.UnresponsiveStatus() {
   944  			node.LastMissedHeartbeatIndex = index
   945  		}
   946  
   947  		// Retain node events that have already been set on the node
   948  		node.Events = exist.Events
   949  
   950  		// If we are transitioning from down, record the re-registration
   951  		if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown {
   952  			appendNodeEvents(index, node, []*structs.NodeEvent{
   953  				structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   954  					SetMessage(NodeRegisterEventReregistered).
   955  					SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))})
   956  		}
   957  
   958  		node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
   959  		node.DrainStrategy = exist.DrainStrategy                 // Retain the drain strategy
   960  		node.LastDrain = exist.LastDrain                         // Retain the drain metadata
   961  
   962  		// Retain the last index the node missed a heartbeat.
   963  		if node.LastMissedHeartbeatIndex < exist.LastMissedHeartbeatIndex {
   964  			node.LastMissedHeartbeatIndex = exist.LastMissedHeartbeatIndex
   965  		}
   966  
   967  		// Retain the last index the node updated its allocs.
   968  		if node.LastAllocUpdateIndex < exist.LastAllocUpdateIndex {
   969  			node.LastAllocUpdateIndex = exist.LastAllocUpdateIndex
   970  		}
   971  	} else {
   972  		// Because this is the first time the node is being registered, we should
   973  		// also create a node registration event
   974  		nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   975  			SetMessage(NodeRegisterEventRegistered).
   976  			SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))
   977  		node.Events = []*structs.NodeEvent{nodeEvent}
   978  		node.CreateIndex = index
   979  		node.ModifyIndex = index
   980  	}
   981  
   982  	// Insert the node
   983  	if err := txn.Insert("nodes", node); err != nil {
   984  		return fmt.Errorf("node insert failed: %v", err)
   985  	}
   986  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   987  		return fmt.Errorf("index update failed: %v", err)
   988  	}
   989  	if err := upsertCSIPluginsForNode(txn, node, index); err != nil {
   990  		return fmt.Errorf("csi plugin update failed: %v", err)
   991  	}
   992  
   993  	return nil
   994  }
   995  
   996  // DeleteNode deregisters a batch of nodes
   997  func (s *StateStore) DeleteNode(msgType structs.MessageType, index uint64, nodes []string) error {
   998  	txn := s.db.WriteTxn(index)
   999  	defer txn.Abort()
  1000  
  1001  	err := deleteNodeTxn(txn, index, nodes)
  1002  	if err != nil {
   1003  		return err
  1004  	}
  1005  	return txn.Commit()
  1006  }
  1007  
  1008  func deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
  1009  	if len(nodes) == 0 {
  1010  		return fmt.Errorf("node ids missing")
  1011  	}
  1012  
  1013  	for _, nodeID := range nodes {
  1014  		existing, err := txn.First("nodes", "id", nodeID)
  1015  		if err != nil {
  1016  			return fmt.Errorf("node lookup failed: %s: %v", nodeID, err)
  1017  		}
  1018  		if existing == nil {
  1019  			return fmt.Errorf("node not found: %s", nodeID)
  1020  		}
  1021  
  1022  		// Delete the node
  1023  		if err := txn.Delete("nodes", existing); err != nil {
  1024  			return fmt.Errorf("node delete failed: %s: %v", nodeID, err)
  1025  		}
  1026  
  1027  		node := existing.(*structs.Node)
  1028  		if err := deleteNodeCSIPlugins(txn, node, index); err != nil {
  1029  			return fmt.Errorf("csi plugin delete failed: %v", err)
  1030  		}
  1031  	}
  1032  
  1033  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1034  		return fmt.Errorf("index update failed: %v", err)
  1035  	}
  1036  
  1037  	return nil
  1038  }
  1039  
  1040  // UpdateNodeStatus is used to update the status of a node
  1041  func (s *StateStore) UpdateNodeStatus(msgType structs.MessageType, index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
  1042  	txn := s.db.WriteTxnMsgT(msgType, index)
  1043  	defer txn.Abort()
  1044  
  1045  	if err := s.updateNodeStatusTxn(txn, nodeID, status, updatedAt, event); err != nil {
  1046  		return err
  1047  	}
  1048  
  1049  	return txn.Commit()
  1050  }
  1051  
  1052  func (s *StateStore) updateNodeStatusTxn(txn *txn, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
  1053  
  1054  	// Lookup the node
  1055  	existing, err := txn.First("nodes", "id", nodeID)
  1056  	if err != nil {
  1057  		return fmt.Errorf("node lookup failed: %v", err)
  1058  	}
  1059  	if existing == nil {
  1060  		return fmt.Errorf("node not found")
  1061  	}
  1062  
  1063  	// Copy the existing node
  1064  	existingNode := existing.(*structs.Node)
  1065  	copyNode := existingNode.Copy()
  1066  	copyNode.StatusUpdatedAt = updatedAt
  1067  
  1068  	// Add the event if given
  1069  	if event != nil {
  1070  		appendNodeEvents(txn.Index, copyNode, []*structs.NodeEvent{event})
  1071  	}
  1072  
  1073  	// Update the status in the copy
  1074  	copyNode.Status = status
  1075  	copyNode.ModifyIndex = txn.Index
  1076  
   1077  	// Update last missed heartbeat if the node became unresponsive, or reset
   1078  	// it to zero if the node became ready.
  1079  	if !existingNode.UnresponsiveStatus() && copyNode.UnresponsiveStatus() {
  1080  		copyNode.LastMissedHeartbeatIndex = txn.Index
  1081  	} else if existingNode.Status != structs.NodeStatusReady &&
  1082  		copyNode.Status == structs.NodeStatusReady {
  1083  		copyNode.LastMissedHeartbeatIndex = 0
  1084  	}
  1085  
  1086  	// Insert the node
  1087  	if err := txn.Insert("nodes", copyNode); err != nil {
  1088  		return fmt.Errorf("node update failed: %v", err)
  1089  	}
  1090  	if err := txn.Insert("index", &IndexEntry{"nodes", txn.Index}); err != nil {
  1091  		return fmt.Errorf("index update failed: %v", err)
  1092  	}
  1093  	return nil
  1094  }
  1095  
   1096  // BatchUpdateNodeDrain is used to update the drain of a set of nodes.
  1097  // This is currently only called when node drain is completed by the drainer.
  1098  func (s *StateStore) BatchUpdateNodeDrain(msgType structs.MessageType, index uint64, updatedAt int64,
  1099  	updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error {
  1100  	txn := s.db.WriteTxnMsgT(msgType, index)
  1101  	defer txn.Abort()
  1102  	for node, update := range updates {
  1103  		if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible, updatedAt,
  1104  			events[node], nil, "", true); err != nil {
  1105  			return err
  1106  		}
  1107  	}
  1108  	return txn.Commit()
  1109  }
  1110  
  1111  // UpdateNodeDrain is used to update the drain of a node
  1112  func (s *StateStore) UpdateNodeDrain(msgType structs.MessageType, index uint64, nodeID string,
  1113  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
  1114  	event *structs.NodeEvent, drainMeta map[string]string, accessorId string) error {
  1115  
  1116  	txn := s.db.WriteTxnMsgT(msgType, index)
  1117  	defer txn.Abort()
  1118  	if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible, updatedAt, event,
  1119  		drainMeta, accessorId, false); err != nil {
  1120  
  1121  		return err
  1122  	}
  1123  	return txn.Commit()
  1124  }
  1125  
  1126  func (s *StateStore) updateNodeDrainImpl(txn *txn, index uint64, nodeID string,
  1127  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
  1128  	event *structs.NodeEvent, drainMeta map[string]string, accessorId string,
  1129  	drainCompleted bool) error {
  1130  
  1131  	// Lookup the node
  1132  	existing, err := txn.First("nodes", "id", nodeID)
  1133  	if err != nil {
  1134  		return fmt.Errorf("node lookup failed: %v", err)
  1135  	}
  1136  	if existing == nil {
  1137  		return fmt.Errorf("node not found")
  1138  	}
  1139  
  1140  	// Copy the existing node
  1141  	existingNode := existing.(*structs.Node)
  1142  	updatedNode := existingNode.Copy()
  1143  	updatedNode.StatusUpdatedAt = updatedAt
  1144  
  1145  	// Add the event if given
  1146  	if event != nil {
  1147  		appendNodeEvents(index, updatedNode, []*structs.NodeEvent{event})
  1148  	}
  1149  
  1150  	// Update the drain in the copy
  1151  	updatedNode.DrainStrategy = drain
  1152  	if drain != nil {
  1153  		updatedNode.SchedulingEligibility = structs.NodeSchedulingIneligible
  1154  	} else if markEligible {
  1155  		updatedNode.SchedulingEligibility = structs.NodeSchedulingEligible
  1156  	}
  1157  
  1158  	// Update LastDrain
  1159  	updateTime := time.Unix(updatedAt, 0)
  1160  
  1161  	// if drain strategy isn't set before or after, this wasn't a drain operation
  1162  	// in that case, we don't care about .LastDrain
  1163  	drainNoop := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy == nil
  1164  	// otherwise, when done with this method, updatedNode.LastDrain should be set
  1165  	// if starting a new drain operation, create a new LastDrain. otherwise, update the existing one.
  1166  	startedDraining := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy != nil
  1167  	if !drainNoop {
  1168  		if startedDraining {
  1169  			updatedNode.LastDrain = &structs.DrainMetadata{
  1170  				StartedAt: updateTime,
  1171  				Meta:      drainMeta,
  1172  			}
  1173  		} else if updatedNode.LastDrain == nil {
  1174  			// if already draining and LastDrain doesn't exist, we need to create a new one
  1175  			// this could happen if we upgraded to 1.1.x during a drain
  1176  			updatedNode.LastDrain = &structs.DrainMetadata{
  1177  				// we don't have sub-second accuracy on these fields, so truncate this
  1178  				StartedAt: time.Unix(existingNode.DrainStrategy.StartedAt.Unix(), 0),
  1179  				Meta:      drainMeta,
  1180  			}
  1181  		}
  1182  
  1183  		updatedNode.LastDrain.UpdatedAt = updateTime
  1184  
  1185  		// won't have new metadata on drain complete; keep the existing operator-provided metadata
  1186  		// also, keep existing if they didn't provide it
  1187  		if len(drainMeta) != 0 {
  1188  			updatedNode.LastDrain.Meta = drainMeta
  1189  		}
  1190  
  1191  		// we won't have an accessor ID on drain complete, so don't overwrite the existing one
  1192  		if accessorId != "" {
  1193  			updatedNode.LastDrain.AccessorID = accessorId
  1194  		}
  1195  
  1196  		if updatedNode.DrainStrategy != nil {
  1197  			updatedNode.LastDrain.Status = structs.DrainStatusDraining
  1198  		} else if drainCompleted {
  1199  			updatedNode.LastDrain.Status = structs.DrainStatusComplete
  1200  		} else {
  1201  			updatedNode.LastDrain.Status = structs.DrainStatusCanceled
  1202  		}
  1203  	}
  1204  
  1205  	updatedNode.ModifyIndex = index
  1206  
  1207  	// Insert the node
  1208  	if err := txn.Insert("nodes", updatedNode); err != nil {
  1209  		return fmt.Errorf("node update failed: %v", err)
  1210  	}
  1211  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1212  		return fmt.Errorf("index update failed: %v", err)
  1213  	}
  1214  
  1215  	return nil
  1216  }
  1217  
  1218  // UpdateNodeEligibility is used to update the scheduling eligibility of a node
  1219  func (s *StateStore) UpdateNodeEligibility(msgType structs.MessageType, index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error {
  1220  	txn := s.db.WriteTxnMsgT(msgType, index)
  1221  	defer txn.Abort()
  1222  	if err := s.updateNodeEligibilityImpl(index, nodeID, eligibility, updatedAt, event, txn); err != nil {
  1223  		return err
  1224  	}
  1225  	return txn.Commit()
  1226  }
  1227  
  1228  func (s *StateStore) updateNodeEligibilityImpl(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent, txn *txn) error {
  1229  	// Lookup the node
  1230  	existing, err := txn.First("nodes", "id", nodeID)
  1231  	if err != nil {
  1232  		return fmt.Errorf("node lookup failed: %v", err)
  1233  	}
  1234  	if existing == nil {
  1235  		return fmt.Errorf("node not found")
  1236  	}
  1237  
  1238  	// Copy the existing node
  1239  	existingNode := existing.(*structs.Node)
  1240  	copyNode := existingNode.Copy()
  1241  	copyNode.StatusUpdatedAt = updatedAt
  1242  
  1243  	// Add the event if given
  1244  	if event != nil {
  1245  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
  1246  	}
  1247  
  1248  	// Check if this is a valid action
  1249  	if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible {
  1250  		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
  1251  	}
  1252  
  1253  	// Update the eligibility in the copy
  1254  	copyNode.SchedulingEligibility = eligibility
  1255  	copyNode.ModifyIndex = index
  1256  
  1257  	// Insert the node
  1258  	if err := txn.Insert("nodes", copyNode); err != nil {
  1259  		return fmt.Errorf("node update failed: %v", err)
  1260  	}
  1261  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1262  		return fmt.Errorf("index update failed: %v", err)
  1263  	}
  1264  
  1265  	return nil
  1266  }
  1267  
  1268  // UpsertNodeEvents adds the node events to the nodes, rotating events as
  1269  // necessary.
  1270  func (s *StateStore) UpsertNodeEvents(msgType structs.MessageType, index uint64, nodeEvents map[string][]*structs.NodeEvent) error {
  1271  	txn := s.db.WriteTxnMsgT(msgType, index)
  1272  	defer txn.Abort()
  1273  
  1274  	for nodeID, events := range nodeEvents {
  1275  		if err := s.upsertNodeEvents(index, nodeID, events, txn); err != nil {
  1276  			return err
  1277  		}
  1278  	}
  1279  
  1280  	return txn.Commit()
  1281  }
  1282  
   1283  // upsertNodeEvents upserts node events for the given node. It also ensures
   1284  // that no more than a fixed number of node events are stored at once,
   1285  // deleting the oldest events once this bound has been reached.
  1286  func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *txn) error {
  1287  	// Lookup the node
  1288  	existing, err := txn.First("nodes", "id", nodeID)
  1289  	if err != nil {
  1290  		return fmt.Errorf("node lookup failed: %v", err)
  1291  	}
  1292  	if existing == nil {
  1293  		return fmt.Errorf("node not found")
  1294  	}
  1295  
  1296  	// Copy the existing node
  1297  	existingNode := existing.(*structs.Node)
  1298  	copyNode := existingNode.Copy()
  1299  	appendNodeEvents(index, copyNode, events)
  1300  
  1301  	// Insert the node
  1302  	if err := txn.Insert("nodes", copyNode); err != nil {
  1303  		return fmt.Errorf("node update failed: %v", err)
  1304  	}
  1305  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1306  		return fmt.Errorf("index update failed: %v", err)
  1307  	}
  1308  
  1309  	return nil
  1310  }
  1311  
  1312  // appendNodeEvents is a helper that takes a node and new events and appends
  1313  // them, pruning older events as needed.
  1314  func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) {
  1315  	// Add the events, updating the indexes
  1316  	for _, e := range events {
  1317  		e.CreateIndex = index
  1318  		node.Events = append(node.Events, e)
  1319  	}
  1320  
  1321  	// Keep node events pruned to not exceed the max allowed
  1322  	if l := len(node.Events); l > structs.MaxRetainedNodeEvents {
  1323  		delta := l - structs.MaxRetainedNodeEvents
  1324  		node.Events = node.Events[delta:]
  1325  	}
  1326  }
  1327  
   1328  // upsertCSIPluginsForNode indexes CSI plugins for volume retrieval, with health. It's called
   1329  // on node upsert, so that event-driven health changes are updated.
  1330  func upsertCSIPluginsForNode(txn *txn, node *structs.Node, index uint64) error {
  1331  
  1332  	upsertFn := func(info *structs.CSIInfo) error {
  1333  		raw, err := txn.First("csi_plugins", "id", info.PluginID)
  1334  		if err != nil {
  1335  			return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err)
  1336  		}
  1337  
  1338  		var plug *structs.CSIPlugin
  1339  		if raw != nil {
  1340  			plug = raw.(*structs.CSIPlugin).Copy()
  1341  		} else {
  1342  			if !info.Healthy {
  1343  				// we don't want to create new plugins for unhealthy
  1344  				// allocs, otherwise we'd recreate the plugin when we
  1345  				// get the update for the alloc becoming terminal
  1346  				return nil
  1347  			}
  1348  			plug = structs.NewCSIPlugin(info.PluginID, index)
  1349  		}
  1350  
  1351  		// the plugin may have been created by the job being updated, in which case
  1352  		// this data will not be configured, it's only available to the fingerprint
  1353  		// system
  1354  		plug.Provider = info.Provider
  1355  		plug.Version = info.ProviderVersion
  1356  
  1357  		err = plug.AddPlugin(node.ID, info)
  1358  		if err != nil {
  1359  			return err
  1360  		}
  1361  
  1362  		plug.ModifyIndex = index
  1363  
  1364  		err = txn.Insert("csi_plugins", plug)
  1365  		if err != nil {
  1366  			return fmt.Errorf("csi_plugins insert error: %v", err)
  1367  		}
  1368  
  1369  		return nil
  1370  	}
  1371  
  1372  	inUseController := map[string]struct{}{}
  1373  	inUseNode := map[string]struct{}{}
  1374  
  1375  	for _, info := range node.CSIControllerPlugins {
  1376  		err := upsertFn(info)
  1377  		if err != nil {
  1378  			return err
  1379  		}
  1380  		inUseController[info.PluginID] = struct{}{}
  1381  	}
  1382  
  1383  	for _, info := range node.CSINodePlugins {
  1384  		err := upsertFn(info)
  1385  		if err != nil {
  1386  			return err
  1387  		}
  1388  		inUseNode[info.PluginID] = struct{}{}
  1389  	}
  1390  
  1391  	// remove the client node from any plugin that's not
  1392  	// running on it.
  1393  	iter, err := txn.Get("csi_plugins", "id")
  1394  	if err != nil {
  1395  		return fmt.Errorf("csi_plugins lookup failed: %v", err)
  1396  	}
  1397  	for {
  1398  		raw := iter.Next()
  1399  		if raw == nil {
  1400  			break
  1401  		}
  1402  		plug, ok := raw.(*structs.CSIPlugin)
  1403  		if !ok {
  1404  			continue
  1405  		}
  1406  		plug = plug.Copy()
  1407  
  1408  		var hadDelete bool
  1409  		if _, ok := inUseController[plug.ID]; !ok {
  1410  			if _, asController := plug.Controllers[node.ID]; asController {
  1411  				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeController)
  1412  				if err != nil {
  1413  					return err
  1414  				}
  1415  				hadDelete = true
  1416  			}
  1417  		}
  1418  		if _, ok := inUseNode[plug.ID]; !ok {
  1419  			if _, asNode := plug.Nodes[node.ID]; asNode {
  1420  				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeNode)
  1421  				if err != nil {
  1422  					return err
  1423  				}
  1424  				hadDelete = true
  1425  			}
  1426  		}
  1427  		// we check this flag both for performance and to make sure we
  1428  		// don't delete a plugin when registering a node plugin but
  1429  		// no controller
  1430  		if hadDelete {
  1431  			err = updateOrGCPlugin(index, txn, plug)
  1432  			if err != nil {
  1433  				return err
  1434  			}
  1435  		}
  1436  	}
  1437  
  1438  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1439  		return fmt.Errorf("index update failed: %v", err)
  1440  	}
  1441  
  1442  	return nil
  1443  }
  1444  
  1445  // deleteNodeCSIPlugins cleans up CSIInfo node health status, called in DeleteNode
  1446  func deleteNodeCSIPlugins(txn *txn, node *structs.Node, index uint64) error {
  1447  	if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 {
  1448  		return nil
  1449  	}
  1450  
  1451  	names := map[string]struct{}{}
  1452  	for _, info := range node.CSIControllerPlugins {
  1453  		names[info.PluginID] = struct{}{}
  1454  	}
  1455  	for _, info := range node.CSINodePlugins {
  1456  		names[info.PluginID] = struct{}{}
  1457  	}
  1458  
  1459  	for id := range names {
  1460  		raw, err := txn.First("csi_plugins", "id", id)
  1461  		if err != nil {
  1462  			return fmt.Errorf("csi_plugins lookup error %s: %v", id, err)
  1463  		}
  1464  		if raw == nil {
  1465  			// plugin may have been deregistered but we didn't
  1466  			// update the fingerprint yet
  1467  			continue
  1468  		}
  1469  
  1470  		plug := raw.(*structs.CSIPlugin).Copy()
  1471  		err = plug.DeleteNode(node.ID)
  1472  		if err != nil {
  1473  			return err
  1474  		}
  1475  		err = updateOrGCPlugin(index, txn, plug)
  1476  		if err != nil {
  1477  			return err
  1478  		}
  1479  	}
  1480  
  1481  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1482  		return fmt.Errorf("index update failed: %v", err)
  1483  	}
  1484  
  1485  	return nil
  1486  }
  1487  
  1488  // updateOrGCPlugin updates a plugin but will delete it if the plugin is empty
  1489  func updateOrGCPlugin(index uint64, txn Txn, plug *structs.CSIPlugin) error {
  1490  	if plug.IsEmpty() {
  1491  		err := txn.Delete("csi_plugins", plug)
  1492  		if err != nil {
  1493  			return fmt.Errorf("csi_plugins delete error: %v", err)
  1494  		}
  1495  	} else {
  1496  		plug.ModifyIndex = index
  1497  		err := txn.Insert("csi_plugins", plug)
  1498  		if err != nil {
  1499  			return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
  1500  		}
  1501  	}
  1502  	return nil
  1503  }
  1504  
  1505  // deleteJobFromPlugins removes the allocations of this job from any plugins the job is
  1506  // running, possibly deleting the plugin if it's no longer in use. It's called in DeleteJobTxn
  1507  func (s *StateStore) deleteJobFromPlugins(index uint64, txn Txn, job *structs.Job) error {
  1508  	ws := memdb.NewWatchSet()
  1509  	summary, err := s.JobSummaryByID(ws, job.Namespace, job.ID)
  1510  	if err != nil {
  1511  		return fmt.Errorf("error getting job summary: %v", err)
  1512  	}
  1513  
  1514  	allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false)
  1515  	if err != nil {
  1516  		return fmt.Errorf("error getting allocations: %v", err)
  1517  	}
  1518  
  1519  	type pair struct {
  1520  		pluginID string
  1521  		alloc    *structs.Allocation
  1522  	}
  1523  
  1524  	plugAllocs := []*pair{}
  1525  	found := map[string]struct{}{}
  1526  
  1527  	// Find plugins for allocs that belong to this job
  1528  	for _, a := range allocs {
  1529  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  1530  		found[tg.Name] = struct{}{}
  1531  		for _, t := range tg.Tasks {
  1532  			if t.CSIPluginConfig == nil {
  1533  				continue
  1534  			}
  1535  			plugAllocs = append(plugAllocs, &pair{
  1536  				pluginID: t.CSIPluginConfig.ID,
  1537  				alloc:    a,
  1538  			})
  1539  		}
  1540  	}
  1541  
  1542  	// Find any plugins that do not yet have allocs for this job
  1543  	for _, tg := range job.TaskGroups {
  1544  		if _, ok := found[tg.Name]; ok {
  1545  			continue
  1546  		}
  1547  
  1548  		for _, t := range tg.Tasks {
  1549  			if t.CSIPluginConfig == nil {
  1550  				continue
  1551  			}
  1552  			plugAllocs = append(plugAllocs, &pair{
  1553  				pluginID: t.CSIPluginConfig.ID,
  1554  			})
  1555  		}
  1556  	}
  1557  
  1558  	plugins := map[string]*structs.CSIPlugin{}
  1559  
  1560  	for _, x := range plugAllocs {
  1561  		plug, ok := plugins[x.pluginID]
  1562  
  1563  		if !ok {
  1564  			plug, err = s.CSIPluginByIDTxn(txn, nil, x.pluginID)
  1565  			if err != nil {
  1566  				return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err)
  1567  			}
  1568  			if plug == nil {
  1569  				// plugin was never successfully registered or has been
  1570  				// GC'd out from under us
  1571  				continue
  1572  			}
  1573  			// only copy once, so we update the same plugin on each alloc
  1574  			plugins[x.pluginID] = plug.Copy()
  1575  			plug = plugins[x.pluginID]
  1576  		}
  1577  
  1578  		if x.alloc == nil {
  1579  			continue
  1580  		}
  1581  		err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID)
  1582  		if err != nil {
  1583  			return err
  1584  		}
  1585  	}
  1586  
  1587  	for _, plug := range plugins {
  1588  		plug.DeleteJob(job, summary)
  1589  		err = updateOrGCPlugin(index, txn, plug)
  1590  		if err != nil {
  1591  			return err
  1592  		}
  1593  	}
  1594  
  1595  	if len(plugins) > 0 {
  1596  		if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1597  			return fmt.Errorf("index update failed: %v", err)
  1598  		}
  1599  	}
  1600  
  1601  	return nil
  1602  }
  1603  
  1604  // NodeByID is used to lookup a node by ID
  1605  func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
  1606  	txn := s.db.ReadTxn()
  1607  
  1608  	watchCh, existing, err := txn.FirstWatch("nodes", "id", nodeID)
  1609  	if err != nil {
  1610  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1611  	}
  1612  	ws.Add(watchCh)
  1613  
  1614  	if existing != nil {
  1615  		return existing.(*structs.Node), nil
  1616  	}
  1617  	return nil, nil
  1618  }
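
// exampleNodeLookup is a minimal sketch of the watch-set pattern used by the
// read methods in this file: register interest via a memdb.WatchSet, read the
// current value, and let the caller block on the watch set for changes. The
// function is illustrative only; nodeID is assumed to be supplied by the
// caller.
func exampleNodeLookup(s *StateStore, ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
	node, err := s.NodeByID(ws, nodeID)
	if err != nil {
		return nil, err
	}
	// A blocking-query caller would now wait on ws (e.g. ws.WatchCtx) and
	// re-run the lookup when the watch fires.
	return node, nil
}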
  1619  
  1620  // NodesByIDPrefix is used to lookup nodes by prefix
  1621  func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  1622  	txn := s.db.ReadTxn()
  1623  
  1624  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
  1625  	if err != nil {
  1626  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1627  	}
  1628  	ws.Add(iter.WatchCh())
  1629  
  1630  	return iter, nil
  1631  }
  1632  
  1633  // NodeBySecretID is used to lookup a node by SecretID
  1634  func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) {
  1635  	txn := s.db.ReadTxn()
  1636  
  1637  	watchCh, existing, err := txn.FirstWatch("nodes", "secret_id", secretID)
  1638  	if err != nil {
  1639  		return nil, fmt.Errorf("node lookup by SecretID failed: %v", err)
  1640  	}
  1641  	ws.Add(watchCh)
  1642  
  1643  	if existing != nil {
  1644  		return existing.(*structs.Node), nil
  1645  	}
  1646  	return nil, nil
  1647  }
  1648  
  1649  // NodesByNodePool returns an iterator over all nodes that are part of the
  1650  // given node pool.
  1651  func (s *StateStore) NodesByNodePool(ws memdb.WatchSet, pool string) (memdb.ResultIterator, error) {
  1652  	txn := s.db.ReadTxn()
  1653  
  1654  	iter, err := txn.Get("nodes", "node_pool", pool)
  1655  	if err != nil {
  1656  		return nil, err
  1657  	}
  1658  
  1659  	ws.Add(iter.WatchCh())
  1660  	return iter, nil
  1661  }
  1662  
  1663  // Nodes returns an iterator over all the nodes
  1664  func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1665  	txn := s.db.ReadTxn()
  1666  
  1667  	// Walk the entire nodes table
  1668  	iter, err := txn.Get("nodes", "id")
  1669  	if err != nil {
  1670  		return nil, err
  1671  	}
  1672  	ws.Add(iter.WatchCh())
  1673  	return iter, nil
  1674  }
  1675  
  1676  // UpsertJob is used to register a job or update a job definition
  1677  func (s *StateStore) UpsertJob(msgType structs.MessageType, index uint64, sub *structs.JobSubmission, job *structs.Job) error {
  1678  	txn := s.db.WriteTxnMsgT(msgType, index)
  1679  	defer txn.Abort()
  1680  	if err := s.upsertJobImpl(index, sub, job, false, txn); err != nil {
  1681  		return err
  1682  	}
  1683  	return txn.Commit()
  1684  }
  1685  
  1686  // UpsertJobTxn is used to register a job or update a job definition, like UpsertJob,
  1687  // but in a transaction. Useful when making multiple modifications atomically
  1688  func (s *StateStore) UpsertJobTxn(index uint64, sub *structs.JobSubmission, job *structs.Job, txn Txn) error {
  1689  	return s.upsertJobImpl(index, sub, job, false, txn)
  1690  }
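
// exampleAtomicJobUpsert is a minimal sketch of combining the *Txn variants
// under a single write transaction so that several modifications commit
// atomically. Illustrative only; index, sub, and job are assumed to be
// supplied by the caller, and other *Txn helpers could be applied against the
// same txn before committing.
func exampleAtomicJobUpsert(s *StateStore, index uint64, sub *structs.JobSubmission, job *structs.Job) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	if err := s.UpsertJobTxn(index, sub, job, txn); err != nil {
		return err
	}
	return txn.Commit()
}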
  1691  
  1692  // upsertJobImpl is the implementation for registering a job or updating a job definition
  1693  func (s *StateStore) upsertJobImpl(index uint64, sub *structs.JobSubmission, job *structs.Job, keepVersion bool, txn *txn) error {
  1694  	// Assert the namespace exists
  1695  	if exists, err := s.namespaceExists(txn, job.Namespace); err != nil {
  1696  		return err
  1697  	} else if !exists {
  1698  		return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace)
  1699  	}
  1700  
  1701  	// Upgrade path.
  1702  	// Assert the node pool is set and exists.
  1703  	if job.NodePool == "" {
  1704  		job.NodePool = structs.NodePoolDefault
  1705  	}
  1706  	if exists, err := s.nodePoolExists(txn, job.NodePool); err != nil {
  1707  		return err
  1708  	} else if !exists {
  1709  		return fmt.Errorf("job %q is in nonexistent node pool %q", job.ID, job.NodePool)
  1710  	}
  1711  
  1712  	// Check if the job already exists
  1713  	existing, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1714  	var existingJob *structs.Job
  1715  	if err != nil {
  1716  		return fmt.Errorf("job lookup failed: %v", err)
  1717  	}
  1718  
  1719  	// Setup the indexes correctly
  1720  	if existing != nil {
  1721  		job.CreateIndex = existing.(*structs.Job).CreateIndex
  1722  		job.ModifyIndex = index
  1723  
  1724  		existingJob = existing.(*structs.Job)
  1725  
  1726  		// Bump the version unless asked to keep it. This should only be done
  1727  		// when changing an internal field such as Stable. A spec change should
  1728  		// always come with a version bump
  1729  		if !keepVersion {
  1730  			job.JobModifyIndex = index
  1731  			if job.Version <= existingJob.Version {
  1732  				if sub == nil {
  1733  					// in the reversion case we must set the submission to be
  1734  					// that of the job version we are reverting to
  1735  					sub, _ = s.jobSubmission(nil, job.Namespace, job.ID, job.Version, txn)
  1736  				}
  1737  				job.Version = existingJob.Version + 1
  1738  			}
  1739  		}
  1740  
  1741  		// Compute the job status
  1742  		var err error
  1743  		job.Status, err = s.getJobStatus(txn, job, false)
  1744  		if err != nil {
  1745  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1746  		}
  1747  	} else {
  1748  		job.CreateIndex = index
  1749  		job.ModifyIndex = index
  1750  		job.JobModifyIndex = index
  1751  
  1752  		if err := s.setJobStatus(index, txn, job, false, ""); err != nil {
  1753  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1754  		}
  1755  
  1756  		// Have to get the job again since it could have been updated
  1757  		updated, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1758  		if err != nil {
  1759  			return fmt.Errorf("job lookup failed: %v", err)
  1760  		}
  1761  		if updated != nil {
  1762  			job = updated.(*structs.Job)
  1763  		}
  1764  	}
  1765  
  1766  	if err := s.updateSummaryWithJob(index, job, txn); err != nil {
  1767  		return fmt.Errorf("unable to create job summary: %v", err)
  1768  	}
  1769  
  1770  	if err := s.upsertJobVersion(index, job, txn); err != nil {
  1771  		return fmt.Errorf("unable to upsert job into job_version table: %v", err)
  1772  	}
  1773  
  1774  	if err := s.updateJobScalingPolicies(index, job, txn); err != nil {
  1775  		return fmt.Errorf("unable to update job scaling policies: %v", err)
  1776  	}
  1777  
  1778  	if err := s.updateJobRecommendations(index, txn, existingJob, job); err != nil {
  1779  		return fmt.Errorf("unable to update job recommendations: %v", err)
  1780  	}
  1781  
  1782  	if err := s.updateJobCSIPlugins(index, job, existingJob, txn); err != nil {
  1783  		return fmt.Errorf("unable to update job csi plugins: %v", err)
  1784  	}
  1785  
  1786  	if err := s.updateJobSubmission(index, sub, job.Namespace, job.ID, job.Version, txn); err != nil {
  1787  		return fmt.Errorf("unable to update job submission: %v", err)
  1788  	}
  1789  
  1790  	// Insert the job
  1791  	if err := txn.Insert("jobs", job); err != nil {
  1792  		return fmt.Errorf("job insert failed: %v", err)
  1793  	}
  1794  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1795  		return fmt.Errorf("index update failed: %v", err)
  1796  	}
  1797  
  1798  	return nil
  1799  }
  1800  
  1801  // DeleteJob is used to deregister a job
  1802  func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error {
  1803  	txn := s.db.WriteTxn(index)
  1804  	defer txn.Abort()
  1805  
  1806  	err := s.DeleteJobTxn(index, namespace, jobID, txn)
  1807  	if err == nil {
  1808  		return txn.Commit()
  1809  	}
  1810  	return err
  1811  }
  1812  
  1813  // DeleteJobTxn is used to deregister a job, like DeleteJob,
  1814  // but in a transaction. Useful when making multiple modifications atomically
  1815  func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error {
  1816  	// Lookup the job
  1817  	existing, err := txn.First("jobs", "id", namespace, jobID)
  1818  	if err != nil {
  1819  		return fmt.Errorf("job lookup failed: %v", err)
  1820  	}
  1821  	if existing == nil {
  1822  		return fmt.Errorf("job not found")
  1823  	}
  1824  
  1825  	// Check if we should update a parent job summary
  1826  	job := existing.(*structs.Job)
  1827  	if job.ParentID != "" {
  1828  		summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID)
  1829  		if err != nil {
  1830  			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  1831  		}
  1832  
  1833  		// Only continue if the summary exists. It may not exist if the parent
  1834  		// job was removed
  1835  		if summaryRaw != nil {
  1836  			existing := summaryRaw.(*structs.JobSummary)
  1837  			pSummary := existing.Copy()
  1838  			if pSummary.Children != nil {
  1839  
  1840  				modified := false
  1841  				switch job.Status {
  1842  				case structs.JobStatusPending:
  1843  					pSummary.Children.Pending--
  1844  					pSummary.Children.Dead++
  1845  					modified = true
  1846  				case structs.JobStatusRunning:
  1847  					pSummary.Children.Running--
  1848  					pSummary.Children.Dead++
  1849  					modified = true
  1850  				case structs.JobStatusDead:
  1851  				default:
  1852  					return fmt.Errorf("unknown old job status %q", job.Status)
  1853  				}
  1854  
  1855  				if modified {
  1856  					// Update the modify index
  1857  					pSummary.ModifyIndex = index
  1858  
  1859  					// Insert the summary
  1860  					if err := txn.Insert("job_summary", pSummary); err != nil {
  1861  						return fmt.Errorf("job summary insert failed: %v", err)
  1862  					}
  1863  					if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1864  						return fmt.Errorf("index update failed: %v", err)
  1865  					}
  1866  				}
  1867  			}
  1868  		}
  1869  	}
  1870  
  1871  	// Delete the job
  1872  	if err := txn.Delete("jobs", existing); err != nil {
  1873  		return fmt.Errorf("job delete failed: %v", err)
  1874  	}
  1875  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1876  		return fmt.Errorf("index update failed: %v", err)
  1877  	}
  1878  
  1879  	// Delete the job versions
  1880  	if err := s.deleteJobVersions(index, job, txn); err != nil {
  1881  		return err
  1882  	}
  1883  
  1884  	// Cleanup plugins registered by this job, before we delete the summary
  1885  	err = s.deleteJobFromPlugins(index, txn, job)
  1886  	if err != nil {
  1887  		return fmt.Errorf("deleting job from plugin: %v", err)
  1888  	}
  1889  
  1890  	// Delete the job summary
  1891  	if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil {
  1892  		return fmt.Errorf("deleting job summary failed: %v", err)
  1893  	}
  1894  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1895  		return fmt.Errorf("index update failed: %v", err)
  1896  	}
  1897  
  1898  	// Delete the job submission
  1899  	if err := s.deleteJobSubmission(job, txn); err != nil {
  1900  		return fmt.Errorf("deleting job submission failed: %v", err)
  1901  	}
  1902  
  1903  	// Delete any remaining job scaling policies
  1904  	if err := s.deleteJobScalingPolicies(index, job, txn); err != nil {
  1905  		return fmt.Errorf("deleting job scaling policies failed: %v", err)
  1906  	}
  1907  
  1908  	// Delete any job recommendations
  1909  	if err := s.deleteRecommendationsByJob(index, txn, job); err != nil {
  1910  		return fmt.Errorf("deleting job recommendatons failed: %v", err)
  1911  	}
  1912  
  1913  	// Delete the scaling events
  1914  	if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil {
  1915  		return fmt.Errorf("deleting job scaling events failed: %v", err)
  1916  	}
  1917  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
  1918  		return fmt.Errorf("index update failed: %v", err)
  1919  	}
  1920  
  1921  	return nil
  1922  }
  1923  
  1924  // deleteJobScalingPolicies deletes any scaling policies associated with the job
  1925  func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {
  1926  	iter, err := s.ScalingPoliciesByJobTxn(nil, job.Namespace, job.ID, txn)
  1927  	if err != nil {
  1928  		return fmt.Errorf("getting job scaling policies for deletion failed: %v", err)
  1929  	}
  1930  
  1931  	// Put them into a slice so there are no safety concerns while actually
  1932  	// performing the deletes
  1933  	policies := []interface{}{}
  1934  	for {
  1935  		raw := iter.Next()
  1936  		if raw == nil {
  1937  			break
  1938  		}
  1939  		policies = append(policies, raw)
  1940  	}
  1941  
  1942  	// Do the deletes
  1943  	for _, p := range policies {
  1944  		if err := txn.Delete("scaling_policy", p); err != nil {
  1945  			return fmt.Errorf("deleting scaling policy failed: %v", err)
  1946  		}
  1947  	}
  1948  
  1949  	if len(policies) > 0 {
  1950  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  1951  			return fmt.Errorf("index update failed: %v", err)
  1952  		}
  1953  	}
  1954  	return nil
  1955  }
  1956  
  1957  func (s *StateStore) deleteJobSubmission(job *structs.Job, txn *txn) error {
  1958  	// find submissions associated with job
  1959  	remove := *set.NewHashSet[*structs.JobSubmission, string](structs.JobTrackedVersions)
  1960  
  1961  	iter, err := txn.Get("job_submission", "id_prefix", job.Namespace, job.ID)
  1962  	if err != nil {
  1963  		return err
  1964  	}
  1965  
  1966  	for {
  1967  		obj := iter.Next()
  1968  		if obj == nil {
  1969  			break
  1970  		}
  1971  		sub := obj.(*structs.JobSubmission)
  1972  
  1973  		// iterating by prefix; ensure we have an exact match
  1974  		if sub.Namespace == job.Namespace && sub.JobID == job.ID {
  1975  			remove.Insert(sub)
  1976  		}
  1977  	}
  1978  
  1979  	// now delete the submissions we found associated with the job
  1980  	for _, sub := range remove.Slice() {
  1981  		err := txn.Delete("job_submission", sub)
  1982  		if err != nil {
  1983  			return err
  1984  		}
  1985  	}
  1986  
  1987  	return nil
  1988  }
  1989  
  1990  // deleteJobVersions deletes all versions of the given job.
  1991  func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *txn) error {
  1992  	iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID)
  1993  	if err != nil {
  1994  		return err
  1995  	}
  1996  
  1997  	// Put them into a slice so there are no safety concerns while actually
  1998  	// performing the deletes
  1999  	jobs := []*structs.Job{}
  2000  	for {
  2001  		raw := iter.Next()
  2002  		if raw == nil {
  2003  			break
  2004  		}
  2005  
  2006  		// Ensure the ID is an exact match
  2007  		j := raw.(*structs.Job)
  2008  		if j.ID != job.ID {
  2009  			continue
  2010  		}
  2011  
  2012  		jobs = append(jobs, j)
  2013  	}
  2014  
  2015  	// Do the deletes
  2016  	for _, j := range jobs {
  2017  		if err := txn.Delete("job_version", j); err != nil {
  2018  			return fmt.Errorf("deleting job versions failed: %v", err)
  2019  		}
  2020  	}
  2021  
  2022  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  2023  		return fmt.Errorf("index update failed: %v", err)
  2024  	}
  2025  
  2026  	return nil
  2027  }
  2028  
  2029  // upsertJobVersion inserts a job into its historic version table and limits the
  2030  // number of job versions that are tracked.
  2031  func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn) error {
  2032  	// Insert the job
  2033  	if err := txn.Insert("job_version", job); err != nil {
  2034  		return fmt.Errorf("failed to insert job into job_version table: %v", err)
  2035  	}
  2036  
  2037  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  2038  		return fmt.Errorf("index update failed: %v", err)
  2039  	}
  2040  
  2041  	// Get all the historic jobs for this ID
  2042  	all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID)
  2043  	if err != nil {
  2044  		return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err)
  2045  	}
  2046  
  2047  	// If we are below the limit there is no GCing to be done
  2048  	if len(all) <= structs.JobTrackedVersions {
  2049  		return nil
  2050  	}
  2051  
  2052  	// We have to delete a historic job to make room.
  2053  	// Find index of the highest versioned stable job
  2054  	stableIdx := -1
  2055  	for i, j := range all {
  2056  		if j.Stable {
  2057  			stableIdx = i
  2058  			break
  2059  		}
  2060  	}
  2061  
  2062  	// If the stable job is the oldest version, do a swap to bring it into the
  2063  	// keep set.
  2064  	max := structs.JobTrackedVersions
  2065  	if stableIdx == max {
  2066  		all[max-1], all[max] = all[max], all[max-1]
  2067  	}
  2068  
  2069  	// Delete the one job that falls outside the set being kept.
  2070  	d := all[max]
  2071  	if err := txn.Delete("job_version", d); err != nil {
  2072  		return fmt.Errorf("failed to delete job %v (%d) from job_version", d.ID, d.Version)
  2073  	}
  2074  
  2075  	return nil
  2076  }
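
// A concrete illustration of the pruning in upsertJobVersion: suppose
// structs.JobTrackedVersions were 6 and a seventh version has just been
// inserted, so `all` holds versions sorted newest-first. Normally the oldest
// entry all[6] is deleted. If that oldest entry happens to be the highest
// stable version (stableIdx == 6), the swap moves it into the kept set and
// the second-oldest version is deleted instead. (The value 6 is only an
// assumption for the example; the real limit is defined in the structs
// package.)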
  2077  
  2078  // JobSubmission returns the original HCL/Variables context of a job, if available.
  2079  //
  2080  // Note: it is a normal case for the submission context to be unavailable, in which case
  2081  // nil is returned with no error.
  2082  func (s *StateStore) JobSubmission(ws memdb.WatchSet, namespace, jobName string, version uint64) (*structs.JobSubmission, error) {
  2083  	txn := s.db.ReadTxn()
  2084  	return s.jobSubmission(ws, namespace, jobName, version, txn)
  2085  }
  2086  
  2087  func (s *StateStore) jobSubmission(ws memdb.WatchSet, namespace, jobName string, version uint64, txn Txn) (*structs.JobSubmission, error) {
  2088  	watchCh, existing, err := txn.FirstWatch("job_submission", "id", namespace, jobName, version)
  2089  	if err != nil {
  2090  		return nil, fmt.Errorf("job submission lookup failed: %v", err)
  2091  	}
  2092  	ws.Add(watchCh)
  2093  	if existing != nil {
  2094  		return existing.(*structs.JobSubmission), nil
  2095  	}
  2096  	return nil, nil
  2097  }
  2098  
  2099  // JobByID is used to lookup a job by its ID. JobByID returns the current/latest job
  2100  // version.
  2101  func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) {
  2102  	txn := s.db.ReadTxn()
  2103  	return s.JobByIDTxn(ws, namespace, id, txn)
  2104  }
  2105  
  2106  // JobByIDTxn is used to lookup a job by its ID, like JobByID, but it returns
  2107  // the job version accessible within the given transaction.
  2108  func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) {
  2109  	watchCh, existing, err := txn.FirstWatch("jobs", "id", namespace, id)
  2110  	if err != nil {
  2111  		return nil, fmt.Errorf("job lookup failed: %v", err)
  2112  	}
  2113  	ws.Add(watchCh)
  2114  
  2115  	if existing != nil {
  2116  		return existing.(*structs.Job), nil
  2117  	}
  2118  	return nil, nil
  2119  }
  2120  
  2121  // JobsByIDPrefix is used to lookup a job by prefix. If querying all namespaces
  2122  // the prefix will not be filtered by an index.
  2123  func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2124  	if namespace == structs.AllNamespacesSentinel {
  2125  		return s.jobsByIDPrefixAllNamespaces(ws, id)
  2126  	}
  2127  
  2128  	txn := s.db.ReadTxn()
  2129  
  2130  	iter, err := txn.Get("jobs", "id_prefix", namespace, id)
  2131  	if err != nil {
  2132  		return nil, fmt.Errorf("job lookup failed: %v", err)
  2133  	}
  2134  
  2135  	ws.Add(iter.WatchCh())
  2136  
  2137  	return iter, nil
  2138  }
  2139  
  2140  func (s *StateStore) jobsByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  2141  	txn := s.db.ReadTxn()
  2142  
  2143  	// Walk the entire jobs table
  2144  	iter, err := txn.Get("jobs", "id")
  2145  
  2146  	if err != nil {
  2147  		return nil, err
  2148  	}
  2149  
  2150  	ws.Add(iter.WatchCh())
  2151  
  2152  	// Filter the iterator by ID prefix
  2153  	f := func(raw interface{}) bool {
  2154  		job, ok := raw.(*structs.Job)
  2155  		if !ok {
  2156  			return true
  2157  		}
  2158  		return !strings.HasPrefix(job.ID, prefix)
  2159  	}
  2160  	wrap := memdb.NewFilterIterator(iter, f)
  2161  	return wrap, nil
  2162  }
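
// exampleCollectJobsByPrefix is a minimal sketch of draining the
// memdb.ResultIterator returned by the prefix lookups above. Illustrative
// only; namespace and prefix are assumed to come from the caller.
func exampleCollectJobsByPrefix(s *StateStore, namespace, prefix string) ([]*structs.Job, error) {
	ws := memdb.NewWatchSet()
	iter, err := s.JobsByIDPrefix(ws, namespace, prefix)
	if err != nil {
		return nil, err
	}

	var jobs []*structs.Job
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		jobs = append(jobs, raw.(*structs.Job))
	}
	return jobs, nil
}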
  2163  
  2164  // JobVersionsByID returns all the tracked versions of a job.
  2165  func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  2166  	txn := s.db.ReadTxn()
  2167  
  2168  	return s.jobVersionByID(txn, ws, namespace, id)
  2169  }
  2170  
  2171  // jobVersionByID is the underlying implementation for retrieving all tracked
  2172  // versions of a job and is called under an existing transaction. A watch set
  2173  // can optionally be passed in to add the job histories to the watch set.
  2174  func (s *StateStore) jobVersionByID(txn *txn, ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  2175  	// Get all the historic jobs for this ID
  2176  	iter, err := txn.Get("job_version", "id_prefix", namespace, id)
  2177  	if err != nil {
  2178  		return nil, err
  2179  	}
  2180  
  2181  	ws.Add(iter.WatchCh())
  2182  
  2183  	var all []*structs.Job
  2184  	for {
  2185  		raw := iter.Next()
  2186  		if raw == nil {
  2187  			break
  2188  		}
  2189  
  2190  		// Ensure the ID is an exact match
  2191  		j := raw.(*structs.Job)
  2192  		if j.ID != id {
  2193  			continue
  2194  		}
  2195  
  2196  		all = append(all, j)
  2197  	}
  2198  
  2199  	// Sort in reverse order so that the highest version is first
  2200  	sort.Slice(all, func(i, j int) bool {
  2201  		return all[i].Version > all[j].Version
  2202  	})
  2203  
  2204  	return all, nil
  2205  }
  2206  
  2207  // JobByIDAndVersion returns the job identified by its ID and Version. The
  2208  // passed watchset may be nil.
  2209  func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) {
  2210  	txn := s.db.ReadTxn()
  2211  	return s.jobByIDAndVersionImpl(ws, namespace, id, version, txn)
  2212  }
  2213  
  2214  // jobByIDAndVersionImpl returns the job identified by its ID and Version. The
  2215  // passed watchset may be nil.
  2216  func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string,
  2217  	version uint64, txn *txn) (*structs.Job, error) {
  2218  
  2219  	watchCh, existing, err := txn.FirstWatch("job_version", "id", namespace, id, version)
  2220  	if err != nil {
  2221  		return nil, err
  2222  	}
  2223  
  2224  	ws.Add(watchCh)
  2225  
  2226  	if existing != nil {
  2227  		job := existing.(*structs.Job)
  2228  		return job, nil
  2229  	}
  2230  
  2231  	return nil, nil
  2232  }
  2233  
  2234  func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2235  	txn := s.db.ReadTxn()
  2236  
  2237  	// Walk the entire job_version table
  2238  	iter, err := txn.Get("job_version", "id")
  2239  	if err != nil {
  2240  		return nil, err
  2241  	}
  2242  
  2243  	ws.Add(iter.WatchCh())
  2244  	return iter, nil
  2245  }
  2246  
  2247  // Jobs returns an iterator over all the jobs
  2248  func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2249  	txn := s.db.ReadTxn()
  2250  
  2251  	// Walk the entire jobs table
  2252  	iter, err := txn.Get("jobs", "id")
  2253  	if err != nil {
  2254  		return nil, err
  2255  	}
  2256  
  2257  	ws.Add(iter.WatchCh())
  2258  
  2259  	return iter, nil
  2260  }
  2261  
  2262  // JobsByNamespace returns an iterator over all the jobs for the given namespace
  2263  func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2264  	txn := s.db.ReadTxn()
  2265  	return s.jobsByNamespaceImpl(ws, namespace, txn)
  2266  }
  2267  
  2268  // jobsByNamespaceImpl returns an iterator over all the jobs for the given namespace
  2269  func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *txn) (memdb.ResultIterator, error) {
  2270  	// Walk the entire jobs table
  2271  	iter, err := txn.Get("jobs", "id_prefix", namespace, "")
  2272  	if err != nil {
  2273  		return nil, err
  2274  	}
  2275  
  2276  	ws.Add(iter.WatchCh())
  2277  
  2278  	return iter, nil
  2279  }
  2280  
  2281  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
  2282  func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) {
  2283  	txn := s.db.ReadTxn()
  2284  
  2285  	iter, err := txn.Get("jobs", "periodic", periodic)
  2286  	if err != nil {
  2287  		return nil, err
  2288  	}
  2289  
  2290  	ws.Add(iter.WatchCh())
  2291  
  2292  	return iter, nil
  2293  }
  2294  
  2295  // JobsByScheduler returns an iterator over all the jobs with the specific
  2296  // scheduler type.
  2297  func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) {
  2298  	txn := s.db.ReadTxn()
  2299  
  2300  	// Return an iterator for jobs with the specific type.
  2301  	iter, err := txn.Get("jobs", "type", schedulerType)
  2302  	if err != nil {
  2303  		return nil, err
  2304  	}
  2305  
  2306  	ws.Add(iter.WatchCh())
  2307  
  2308  	return iter, nil
  2309  }
  2310  
  2311  // JobsByGC returns an iterator over all jobs eligible or ineligible for garbage
  2312  // collection.
  2313  func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) {
  2314  	txn := s.db.ReadTxn()
  2315  
  2316  	iter, err := txn.Get("jobs", "gc", gc)
  2317  	if err != nil {
  2318  		return nil, err
  2319  	}
  2320  
  2321  	ws.Add(iter.WatchCh())
  2322  
  2323  	return iter, nil
  2324  }
  2325  
  2326  // JobsByPool returns an iterator over all jobs in a given node pool.
  2327  func (s *StateStore) JobsByPool(ws memdb.WatchSet, pool string) (memdb.ResultIterator, error) {
  2328  	txn := s.db.ReadTxn()
  2329  
  2330  	iter, err := txn.Get("jobs", "pool", pool)
  2331  	if err != nil {
  2332  		return nil, err
  2333  	}
  2334  
  2335  	ws.Add(iter.WatchCh())
  2336  
  2337  	return iter, nil
  2338  }
  2339  
  2340  // JobSummaryByID returns a job summary object which matches a specific id.
  2341  func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) {
  2342  	txn := s.db.ReadTxn()
  2343  
  2344  	watchCh, existing, err := txn.FirstWatch("job_summary", "id", namespace, jobID)
  2345  	if err != nil {
  2346  		return nil, err
  2347  	}
  2348  
  2349  	ws.Add(watchCh)
  2350  
  2351  	if existing != nil {
  2352  		summary := existing.(*structs.JobSummary)
  2353  		return summary, nil
  2354  	}
  2355  
  2356  	return nil, nil
  2357  }
  2358  
  2359  // JobSummaries walks the entire job summary table and returns all the job
  2360  // summary objects
  2361  func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2362  	txn := s.db.ReadTxn()
  2363  
  2364  	iter, err := txn.Get("job_summary", "id")
  2365  	if err != nil {
  2366  		return nil, err
  2367  	}
  2368  
  2369  	ws.Add(iter.WatchCh())
  2370  
  2371  	return iter, nil
  2372  }
  2373  
  2374  // JobSummaryByPrefix is used to look up Job Summary by id prefix
  2375  func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2376  	txn := s.db.ReadTxn()
  2377  
  2378  	iter, err := txn.Get("job_summary", "id_prefix", namespace, id)
  2379  	if err != nil {
  2380  		return nil, fmt.Errorf("job_summary lookup failed: %v", err)
  2381  	}
  2382  
  2383  	ws.Add(iter.WatchCh())
  2384  
  2385  	return iter, nil
  2386  }
  2387  
  2388  // UpsertCSIVolume inserts or updates CSI volumes in the state store.
  2389  func (s *StateStore) UpsertCSIVolume(index uint64, volumes []*structs.CSIVolume) error {
  2390  	txn := s.db.WriteTxn(index)
  2391  	defer txn.Abort()
  2392  
  2393  	for _, v := range volumes {
  2394  		if exists, err := s.namespaceExists(txn, v.Namespace); err != nil {
  2395  			return err
  2396  		} else if !exists {
  2397  			return fmt.Errorf("volume %s is in nonexistent namespace %s", v.ID, v.Namespace)
  2398  		}
  2399  
  2400  		obj, err := txn.First("csi_volumes", "id", v.Namespace, v.ID)
  2401  		if err != nil {
  2402  			return fmt.Errorf("volume existence check error: %v", err)
  2403  		}
  2404  		if obj != nil {
  2405  			// Allow some properties of a volume to be updated in place, but
  2406  			// prevent accidentally overwriting important properties, or
  2407  			// overwriting a volume in use
  2408  			old := obj.(*structs.CSIVolume)
  2409  			if old.ExternalID != v.ExternalID ||
  2410  				old.PluginID != v.PluginID ||
  2411  				old.Provider != v.Provider {
  2412  				return fmt.Errorf("volume identity cannot be updated: %s", v.ID)
  2413  			}
  2414  			s.CSIVolumeDenormalize(nil, old.Copy())
  2415  			if old.InUse() {
  2416  				return fmt.Errorf("volume cannot be updated while in use")
  2417  			}
  2418  
  2419  			v.CreateIndex = old.CreateIndex
  2420  			v.ModifyIndex = index
  2421  		} else {
  2422  			v.CreateIndex = index
  2423  			v.ModifyIndex = index
  2424  		}
  2425  
  2426  		// Allocations are copy on write, so we want to keep the Allocation ID
  2427  		// but we need to clear the pointer so that we don't store it when we
  2428  		// write the volume to the state store. We'll get it from the db in
  2429  		// denormalize.
  2430  		for allocID := range v.ReadAllocs {
  2431  			v.ReadAllocs[allocID] = nil
  2432  		}
  2433  		for allocID := range v.WriteAllocs {
  2434  			v.WriteAllocs[allocID] = nil
  2435  		}
  2436  
  2437  		err = txn.Insert("csi_volumes", v)
  2438  		if err != nil {
  2439  			return fmt.Errorf("volume insert: %v", err)
  2440  		}
  2441  	}
  2442  
  2443  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2444  		return fmt.Errorf("index update failed: %v", err)
  2445  	}
  2446  
  2447  	return txn.Commit()
  2448  }
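
// exampleRegisterVolume is a minimal sketch of calling UpsertCSIVolume. It is
// illustrative only: the ID, namespace, and plugin ID are placeholders, and a
// real volume carries many more fields (capabilities, topology, and so on).
func exampleRegisterVolume(s *StateStore, index uint64) error {
	vol := &structs.CSIVolume{
		ID:        "example-volume", // placeholder
		Namespace: "default",        // namespace must already exist
		PluginID:  "example-plugin", // placeholder
	}
	return s.UpsertCSIVolume(index, []*structs.CSIVolume{vol})
}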
  2449  
  2450  // CSIVolumes returns the unfiltered list of all volumes. Caller should
  2451  // snapshot if it wants to also denormalize the plugins.
  2452  func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2453  	txn := s.db.ReadTxn()
  2454  	defer txn.Abort()
  2455  
  2456  	iter, err := txn.Get("csi_volumes", "id")
  2457  	if err != nil {
  2458  		return nil, fmt.Errorf("csi_volumes lookup failed: %v", err)
  2459  	}
  2460  
  2461  	ws.Add(iter.WatchCh())
  2462  
  2463  	return iter, nil
  2464  }
  2465  
  2466  // CSIVolumeByID is used to lookup a single volume. Returns a copy of the
  2467  // volume because its plugins and allocations are denormalized to provide
  2468  // accurate Health.
  2469  func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) {
  2470  	txn := s.db.ReadTxn()
  2471  
  2472  	watchCh, obj, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
  2473  	if err != nil {
  2474  		return nil, fmt.Errorf("volume lookup failed for %s: %v", id, err)
  2475  	}
  2476  	ws.Add(watchCh)
  2477  
  2478  	if obj == nil {
  2479  		return nil, nil
  2480  	}
  2481  	vol := obj.(*structs.CSIVolume)
  2482  
  2483  	// we return the volume with the plugins denormalized by default,
  2484  	// because the scheduler needs them for feasibility checking
  2485  	return s.csiVolumeDenormalizePluginsTxn(txn, vol.Copy())
  2486  }
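
// exampleVolumeSchedulable is a minimal sketch of reading a volume and using
// the plugin health that CSIVolumeByID denormalizes onto it. Illustrative
// only; namespace and id are assumed to be supplied by the caller, and a nil
// watch set is passed since no blocking query is needed here.
func exampleVolumeSchedulable(s *StateStore, namespace, id string) (bool, error) {
	vol, err := s.CSIVolumeByID(nil, namespace, id)
	if err != nil {
		return false, err
	}
	if vol == nil {
		return false, nil
	}
	return vol.Schedulable, nil
}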
  2487  
  2488  // CSIVolumesByPluginID looks up csi_volumes by pluginID. Caller should
  2489  // snapshot if it wants to also denormalize the plugins.
  2490  func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, prefix, pluginID string) (memdb.ResultIterator, error) {
  2491  	txn := s.db.ReadTxn()
  2492  
  2493  	iter, err := txn.Get("csi_volumes", "plugin_id", pluginID)
  2494  	if err != nil {
  2495  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2496  	}
  2497  
  2498  	// Filter the iterator by namespace
  2499  	f := func(raw interface{}) bool {
  2500  		v, ok := raw.(*structs.CSIVolume)
  2501  		if !ok {
  2502  			return false
  2503  		}
  2504  		return v.Namespace != namespace && strings.HasPrefix(v.ID, prefix)
  2505  	}
  2506  
  2507  	wrap := memdb.NewFilterIterator(iter, f)
  2508  	return wrap, nil
  2509  }
  2510  
  2511  // CSIVolumesByIDPrefix supports search. Caller should snapshot if it wants to
  2512  // also denormalize the plugins. If using a prefix with the wildcard namespace,
  2513  // the results will not use the index prefix.
  2514  func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) {
  2515  	if namespace == structs.AllNamespacesSentinel {
  2516  		return s.csiVolumeByIDPrefixAllNamespaces(ws, volumeID)
  2517  	}
  2518  
  2519  	txn := s.db.ReadTxn()
  2520  
  2521  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, volumeID)
  2522  	if err != nil {
  2523  		return nil, err
  2524  	}
  2525  
  2526  	ws.Add(iter.WatchCh())
  2527  
  2528  	return iter, nil
  2529  }
  2530  
  2531  func (s *StateStore) csiVolumeByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  2532  	txn := s.db.ReadTxn()
  2533  
  2534  	// Walk the entire csi_volumes table
  2535  	iter, err := txn.Get("csi_volumes", "id")
  2536  
  2537  	if err != nil {
  2538  		return nil, err
  2539  	}
  2540  
  2541  	ws.Add(iter.WatchCh())
  2542  
  2543  	// Filter the iterator by ID prefix
  2544  	f := func(raw interface{}) bool {
  2545  		v, ok := raw.(*structs.CSIVolume)
  2546  		if !ok {
  2547  			return false
  2548  		}
  2549  		return !strings.HasPrefix(v.ID, prefix)
  2550  	}
  2551  	wrap := memdb.NewFilterIterator(iter, f)
  2552  	return wrap, nil
  2553  }
  2554  
  2555  // CSIVolumesByNodeID looks up CSIVolumes in use on a node. Caller should
  2556  // snapshot if it wants to also denormalize the plugins.
  2557  func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, prefix, nodeID string) (memdb.ResultIterator, error) {
  2558  	allocs, err := s.AllocsByNode(ws, nodeID)
  2559  	if err != nil {
  2560  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  2561  	}
  2562  
  2563  	// Find volume ids for CSI volumes in running allocs, or allocs that we desire to run
  2564  	ids := map[string]string{} // Map volumeID to Namespace
  2565  	for _, a := range allocs {
  2566  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  2567  
  2568  		if !(a.DesiredStatus == structs.AllocDesiredStatusRun ||
  2569  			a.ClientStatus == structs.AllocClientStatusRunning) ||
  2570  			len(tg.Volumes) == 0 {
  2571  			continue
  2572  		}
  2573  
  2574  		for _, v := range tg.Volumes {
  2575  			if v.Type != structs.VolumeTypeCSI {
  2576  				continue
  2577  			}
  2578  			ids[v.Source] = a.Namespace
  2579  		}
  2580  	}
  2581  
  2582  	// Lookup the raw CSIVolumes to match the other list interfaces
  2583  	iter := NewSliceIterator()
  2584  	txn := s.db.ReadTxn()
  2585  	for id, namespace := range ids {
  2586  		if strings.HasPrefix(id, prefix) {
  2587  			watchCh, raw, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
  2588  			if err != nil {
  2589  				return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  2590  			}
  2591  			ws.Add(watchCh)
  2592  			iter.Add(raw)
  2593  		}
  2594  	}
  2595  
  2596  	return iter, nil
  2597  }
  2598  
  2599  // CSIVolumesByNamespace looks up volumes in the given namespace, filtered by ID prefix
  2600  func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
  2601  	txn := s.db.ReadTxn()
  2602  
  2603  	return s.csiVolumesByNamespaceImpl(txn, ws, namespace, prefix)
  2604  }
  2605  
  2606  func (s *StateStore) csiVolumesByNamespaceImpl(txn *txn, ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
  2607  
  2608  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, prefix)
  2609  	if err != nil {
  2610  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2611  	}
  2612  
  2613  	ws.Add(iter.WatchCh())
  2614  
  2615  	return iter, nil
  2616  }
  2617  
  2618  // CSIVolumeClaim updates the volume's claim count and allocation list
  2619  func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, claim *structs.CSIVolumeClaim) error {
  2620  	txn := s.db.WriteTxn(index)
  2621  	defer txn.Abort()
  2622  
  2623  	row, err := txn.First("csi_volumes", "id", namespace, id)
  2624  	if err != nil {
  2625  		return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2626  	}
  2627  	if row == nil {
  2628  		return fmt.Errorf("volume not found: %s", id)
  2629  	}
  2630  
  2631  	orig, ok := row.(*structs.CSIVolume)
  2632  	if !ok {
  2633  		return fmt.Errorf("volume row conversion error")
  2634  	}
  2635  
  2636  	var alloc *structs.Allocation
  2637  	if claim.State == structs.CSIVolumeClaimStateTaken {
  2638  		alloc, err = s.allocByIDImpl(txn, nil, claim.AllocationID)
  2639  		if err != nil {
  2640  			s.logger.Error("AllocByID failed", "error", err)
  2641  			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2642  		}
  2643  		if alloc == nil {
  2644  			s.logger.Error("AllocByID failed to find alloc", "alloc_id", claim.AllocationID)
  2645  			// err is nil at this point; the allocation simply was not
  2646  			// found, so return the unknown-allocation error directly
  2647  			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2648  		}
  2649  	}
  2650  
  2651  	volume, err := s.csiVolumeDenormalizePluginsTxn(txn, orig.Copy())
  2652  	if err != nil {
  2653  		return err
  2654  	}
  2655  	volume, err = s.csiVolumeDenormalizeTxn(txn, nil, volume)
  2656  	if err != nil {
  2657  		return err
  2658  	}
  2659  
  2660  	// in the case of a job deregistration, there will be no allocation ID
  2661  	// for the claim but we still want to write an updated index to the volume
  2662  	// so that volume reaping is triggered
  2663  	if claim.AllocationID != "" {
  2664  		err = volume.Claim(claim, alloc)
  2665  		if err != nil {
  2666  			return err
  2667  		}
  2668  	}
  2669  
  2670  	volume.ModifyIndex = index
  2671  
  2672  	// Allocations are copy on write, so we want to keep the Allocation ID
  2673  	// but we need to clear the pointer so that we don't store it when we
  2674  	// write the volume to the state store. We'll get it from the db in
  2675  	// denormalize.
  2676  	for allocID := range volume.ReadAllocs {
  2677  		volume.ReadAllocs[allocID] = nil
  2678  	}
  2679  	for allocID := range volume.WriteAllocs {
  2680  		volume.WriteAllocs[allocID] = nil
  2681  	}
  2682  
  2683  	if err = txn.Insert("csi_volumes", volume); err != nil {
  2684  		return fmt.Errorf("volume update failed: %s: %v", id, err)
  2685  	}
  2686  
  2687  	if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2688  		return fmt.Errorf("index update failed: %v", err)
  2689  	}
  2690  
  2691  	return txn.Commit()
  2692  }
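
// exampleClaimRead is a minimal sketch of recording a read claim against a
// volume via CSIVolumeClaim. Illustrative only; the namespace, volume ID, and
// allocation are assumed to be supplied by the caller.
func exampleClaimRead(s *StateStore, index uint64, namespace, volID string, alloc *structs.Allocation) error {
	claim := &structs.CSIVolumeClaim{
		AllocationID: alloc.ID,
		NodeID:       alloc.NodeID,
		Mode:         structs.CSIVolumeClaimRead,
		State:        structs.CSIVolumeClaimStateTaken,
	}
	return s.CSIVolumeClaim(index, namespace, volID, claim)
}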
  2693  
  2694  // CSIVolumeDeregister removes the volume from the server
  2695  func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string, force bool) error {
  2696  	txn := s.db.WriteTxn(index)
  2697  	defer txn.Abort()
  2698  
  2699  	for _, id := range ids {
  2700  		existing, err := txn.First("csi_volumes", "id", namespace, id)
  2701  		if err != nil {
  2702  			return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2703  		}
  2704  
  2705  		if existing == nil {
  2706  			return fmt.Errorf("volume not found: %s", id)
  2707  		}
  2708  
  2709  		vol, ok := existing.(*structs.CSIVolume)
  2710  		if !ok {
  2711  			return fmt.Errorf("volume row conversion error: %s", id)
  2712  		}
  2713  
  2714  		// The common case for a volume deregister is when the volume is
  2715  		// unused, but we can also let an operator intervene in the case where
  2716  		// allocations have been stopped but claims can't be freed because,
  2717  		// for example, the plugins have all been removed.
  2718  		if vol.InUse() {
  2719  			if !force || !s.volSafeToForce(txn, vol) {
  2720  				return fmt.Errorf("volume in use: %s", id)
  2721  			}
  2722  		}
  2723  
  2724  		if err = txn.Delete("csi_volumes", existing); err != nil {
  2725  			return fmt.Errorf("volume delete failed: %s: %v", id, err)
  2726  		}
  2727  	}
  2728  
  2729  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2730  		return fmt.Errorf("index update failed: %v", err)
  2731  	}
  2732  
  2733  	return txn.Commit()
  2734  }
  2735  
  2736  // volSafeToForce checks whether any of the remaining allocations are in a
  2737  // non-terminal state; if so, the volume is not safe to force-deregister.
  2738  func (s *StateStore) volSafeToForce(txn Txn, v *structs.CSIVolume) bool {
  2739  	v = v.Copy()
  2740  	vol, err := s.csiVolumeDenormalizeTxn(txn, nil, v)
  2741  	if err != nil {
  2742  		return false
  2743  	}
  2744  
  2745  	for _, alloc := range vol.ReadAllocs {
  2746  		if alloc != nil && !alloc.TerminalStatus() {
  2747  			return false
  2748  		}
  2749  	}
  2750  	for _, alloc := range vol.WriteAllocs {
  2751  		if alloc != nil && !alloc.TerminalStatus() {
  2752  			return false
  2753  		}
  2754  	}
  2755  	return true
  2756  }
  2757  
  2758  // CSIVolumeDenormalizePlugins returns a CSIVolume with current health and
  2759  // plugins, but without allocations.
  2760  // Use this for current volume metadata, handling lists of volumes.
  2761  // Use CSIVolumeDenormalize for volumes containing both health and current
  2762  // allocations.
  2763  func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2764  	if vol == nil {
  2765  		return nil, nil
  2766  	}
  2767  	txn := s.db.ReadTxn()
  2768  	defer txn.Abort()
  2769  	return s.csiVolumeDenormalizePluginsTxn(txn, vol)
  2770  }
  2771  
  2772  // csiVolumeDenormalizePluginsTxn implements
  2773  // CSIVolumeDenormalizePlugins, inside a transaction.
  2774  func (s *StateStore) csiVolumeDenormalizePluginsTxn(txn Txn, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2775  	if vol == nil {
  2776  		return nil, nil
  2777  	}
  2778  	plug, err := s.CSIPluginByIDTxn(txn, nil, vol.PluginID)
  2779  	if err != nil {
  2780  		return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
  2781  	}
  2782  	if plug == nil {
  2783  		vol.ControllersHealthy = 0
  2784  		vol.NodesHealthy = 0
  2785  		vol.Schedulable = false
  2786  		return vol, nil
  2787  	}
  2788  
  2789  	vol.Provider = plug.Provider
  2790  	vol.ProviderVersion = plug.Version
  2791  	vol.ControllerRequired = plug.ControllerRequired
  2792  	vol.ControllersHealthy = plug.ControllersHealthy
  2793  	vol.NodesHealthy = plug.NodesHealthy
  2794  
  2795  	// This value may be stale, but stale is ok
  2796  	vol.ControllersExpected = plug.ControllersExpected
  2797  	vol.NodesExpected = plug.NodesExpected
  2798  
  2799  	vol.Schedulable = vol.NodesHealthy > 0
  2800  	if vol.ControllerRequired {
  2801  		vol.Schedulable = vol.ControllersHealthy > 0 && vol.Schedulable
  2802  	}
  2803  
  2804  	return vol, nil
  2805  }
  2806  
  2807  // CSIVolumeDenormalize returns a CSIVolume with its current
  2808  // Allocations and Claims, including creating new PastClaims for
  2809  // terminal or garbage collected allocations. This ensures we have a
  2810  // consistent state. Note that it mutates the original volume and so
  2811  // should always be called on a Copy after reading from the state
  2812  // store.
  2813  func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2814  	txn := s.db.ReadTxn()
  2815  	return s.csiVolumeDenormalizeTxn(txn, ws, vol)
  2816  }
  2817  
  2818  // csiVolumeDenormalizeTxn implements CSIVolumeDenormalize inside a transaction
  2819  func (s *StateStore) csiVolumeDenormalizeTxn(txn Txn, ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2820  	if vol == nil {
  2821  		return nil, nil
  2822  	}
  2823  
  2824  	// note: denormalize mutates the maps we pass in!
  2825  	denormalize := func(
  2826  		currentAllocs map[string]*structs.Allocation,
  2827  		currentClaims, pastClaims map[string]*structs.CSIVolumeClaim,
  2828  		fallbackMode structs.CSIVolumeClaimMode) error {
  2829  
  2830  		for id := range currentAllocs {
  2831  			a, err := s.allocByIDImpl(txn, ws, id)
  2832  			if err != nil {
  2833  				return err
  2834  			}
  2835  			pastClaim := pastClaims[id]
  2836  			currentClaim := currentClaims[id]
  2837  			if currentClaim == nil {
  2838  				// COMPAT(1.4.0): the CSIVolumeClaim fields were added
  2839  				// after 0.11.1, so claims made before that may be
  2840  				// missing this value. No clusters should see this
  2841  				// anymore, so warn noisily in the logs so that
  2842  				// operators ask us about it. Remove this block and
  2843  				// the now-unused fallbackMode parameter, and return
  2844  				// an error if currentClaim is nil in 1.4.0
  2845  				s.logger.Warn("volume was missing claim for allocation",
  2846  					"volume_id", vol.ID, "alloc", id)
  2847  				currentClaim = &structs.CSIVolumeClaim{
  2848  					AllocationID: a.ID,
  2849  					NodeID:       a.NodeID,
  2850  					Mode:         fallbackMode,
  2851  					State:        structs.CSIVolumeClaimStateTaken,
  2852  				}
  2853  				currentClaims[id] = currentClaim
  2854  			}
  2855  
  2856  			currentAllocs[id] = a
  2857  			if (a == nil || a.TerminalStatus()) && pastClaim == nil {
  2858  				// the alloc is garbage collected but nothing has written a PastClaim,
  2859  				// so create one now
  2860  				pastClaim = &structs.CSIVolumeClaim{
  2861  					AllocationID:   id,
  2862  					NodeID:         currentClaim.NodeID,
  2863  					Mode:           currentClaim.Mode,
  2864  					State:          structs.CSIVolumeClaimStateUnpublishing,
  2865  					AccessMode:     currentClaim.AccessMode,
  2866  					AttachmentMode: currentClaim.AttachmentMode,
  2867  				}
  2868  				pastClaims[id] = pastClaim
  2869  			}
  2870  
  2871  		}
  2872  		return nil
  2873  	}
  2874  
  2875  	err := denormalize(vol.ReadAllocs, vol.ReadClaims, vol.PastClaims,
  2876  		structs.CSIVolumeClaimRead)
  2877  	if err != nil {
  2878  		return nil, err
  2879  	}
  2880  	err = denormalize(vol.WriteAllocs, vol.WriteClaims, vol.PastClaims,
  2881  		structs.CSIVolumeClaimWrite)
  2882  	if err != nil {
  2883  		return nil, err
  2884  	}
  2885  
  2886  	// COMPAT: the AccessMode and AttachmentMode fields were added to claims
  2887  	// in 1.1.0, so claims made before that may be missing this value. In this
  2888  	// case, the volume will already have AccessMode/AttachmentMode until it
  2889  	// no longer has any claims, so set from those values
  2890  	for _, claim := range vol.ReadClaims {
  2891  		if claim.AccessMode == "" || claim.AttachmentMode == "" {
  2892  			claim.AccessMode = vol.AccessMode
  2893  			claim.AttachmentMode = vol.AttachmentMode
  2894  		}
  2895  	}
  2896  	for _, claim := range vol.WriteClaims {
  2897  		if claim.AccessMode == "" || claim.AttachmentMode == "" {
  2898  			claim.AccessMode = vol.AccessMode
  2899  			claim.AttachmentMode = vol.AttachmentMode
  2900  		}
  2901  	}
  2902  
  2903  	return vol, nil
  2904  }
  2905  
  2906  // CSIPlugins returns the unfiltered list of all plugin health status
  2907  func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2908  	txn := s.db.ReadTxn()
  2909  	defer txn.Abort()
  2910  
  2911  	iter, err := txn.Get("csi_plugins", "id")
  2912  	if err != nil {
  2913  		return nil, fmt.Errorf("csi_plugins lookup failed: %v", err)
  2914  	}
  2915  
  2916  	ws.Add(iter.WatchCh())
  2917  
  2918  	return iter, nil
  2919  }
  2920  
  2921  // CSIPluginsByIDPrefix supports search
  2922  func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) {
  2923  	txn := s.db.ReadTxn()
  2924  
  2925  	iter, err := txn.Get("csi_plugins", "id_prefix", pluginID)
  2926  	if err != nil {
  2927  		return nil, err
  2928  	}
  2929  
  2930  	ws.Add(iter.WatchCh())
  2931  
  2932  	return iter, nil
  2933  }
  2934  
  2935  // CSIPluginByID returns a named CSIPlugin. This method creates a new
  2936  // transaction so you should not call it from within another transaction.
  2937  func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2938  	txn := s.db.ReadTxn()
  2939  	plugin, err := s.CSIPluginByIDTxn(txn, ws, id)
  2940  	if err != nil {
  2941  		return nil, err
  2942  	}
  2943  	return plugin, nil
  2944  }
  2945  
  2946  // CSIPluginByIDTxn returns a named CSIPlugin
  2947  func (s *StateStore) CSIPluginByIDTxn(txn Txn, ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2948  
  2949  	watchCh, obj, err := txn.FirstWatch("csi_plugins", "id", id)
  2950  	if err != nil {
  2951  		return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err)
  2952  	}
  2953  
  2954  	ws.Add(watchCh)
  2955  
  2956  	if obj != nil {
  2957  		return obj.(*structs.CSIPlugin), nil
  2958  	}
  2959  	return nil, nil
  2960  }
  2961  
  2962  // CSIPluginDenormalize returns a CSIPlugin with allocation details. Always called on a copy of the plugin.
  2963  func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2964  	txn := s.db.ReadTxn()
  2965  	return s.CSIPluginDenormalizeTxn(txn, ws, plug)
  2966  }
  2967  
  2968  func (s *StateStore) CSIPluginDenormalizeTxn(txn Txn, ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2969  	if plug == nil {
  2970  		return nil, nil
  2971  	}
  2972  
  2973  	// Get the unique list of allocation ids
  2974  	ids := map[string]struct{}{}
  2975  	for _, info := range plug.Controllers {
  2976  		ids[info.AllocID] = struct{}{}
  2977  	}
  2978  	for _, info := range plug.Nodes {
  2979  		ids[info.AllocID] = struct{}{}
  2980  	}
  2981  
  2982  	for id := range ids {
  2983  		alloc, err := s.allocByIDImpl(txn, ws, id)
  2984  		if err != nil {
  2985  			return nil, err
  2986  		}
  2987  		if alloc == nil {
  2988  			continue
  2989  		}
  2990  		plug.Allocations = append(plug.Allocations, alloc.Stub(nil))
  2991  	}
  2992  	sort.Slice(plug.Allocations, func(i, j int) bool {
  2993  		return plug.Allocations[i].ModifyIndex > plug.Allocations[j].ModifyIndex
  2994  	})
  2995  
  2996  	return plug, nil
  2997  }
  2998  
  2999  // UpsertCSIPlugin writes the plugin to the state store. Note: there
  3000  // is currently no raft message for this, as it's intended to support
  3001  // testing use cases.
  3002  func (s *StateStore) UpsertCSIPlugin(index uint64, plug *structs.CSIPlugin) error {
  3003  	txn := s.db.WriteTxn(index)
  3004  	defer txn.Abort()
  3005  
  3006  	existing, err := txn.First("csi_plugins", "id", plug.ID)
  3007  	if err != nil {
  3008  		return fmt.Errorf("csi_plugin lookup error: %s %v", plug.ID, err)
  3009  	}
  3010  
  3011  	plug.ModifyIndex = index
  3012  	if existing != nil {
  3013  		plug.CreateIndex = existing.(*structs.CSIPlugin).CreateIndex
  3014  	}
  3015  
  3016  	err = txn.Insert("csi_plugins", plug)
  3017  	if err != nil {
  3018  		return fmt.Errorf("csi_plugins insert error: %v", err)
  3019  	}
  3020  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  3021  		return fmt.Errorf("index update failed: %v", err)
  3022  	}
  3023  	return txn.Commit()
  3024  }
  3025  
  3026  // DeleteCSIPlugin deletes the plugin if it's not in use.
  3027  func (s *StateStore) DeleteCSIPlugin(index uint64, id string) error {
  3028  	txn := s.db.WriteTxn(index)
  3029  	defer txn.Abort()
  3030  
  3031  	plug, err := s.CSIPluginByIDTxn(txn, nil, id)
  3032  	if err != nil {
  3033  		return err
  3034  	}
  3035  
  3036  	if plug == nil {
  3037  		return nil
  3038  	}
  3039  
  3040  	plug, err = s.CSIPluginDenormalizeTxn(txn, nil, plug.Copy())
  3041  	if err != nil {
  3042  		return err
  3043  	}
  3044  	if !plug.IsEmpty() {
  3045  		return fmt.Errorf("plugin in use")
  3046  	}
  3047  
  3048  	err = txn.Delete("csi_plugins", plug)
  3049  	if err != nil {
  3050  		return fmt.Errorf("csi_plugins delete error: %v", err)
  3051  	}
  3052  	return txn.Commit()
  3053  }
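
// Illustrative sketch (assumed test-style usage, since UpsertCSIPlugin has no
// raft message): write a minimal plugin at one index and delete it at a later
// one. DeleteCSIPlugin fails with "plugin in use" while any controller or node
// allocations still reference the plugin. The plugin ID is a placeholder and
// real plugins are populated by fingerprinting, not a bare literal.
//
//	plug := &structs.CSIPlugin{ID: "aws-ebs0"}
//	if err := store.UpsertCSIPlugin(1000, plug); err != nil {
//		return err
//	}
//	err := store.DeleteCSIPlugin(1001, plug.ID)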
  3054  
  3055  // UpsertPeriodicLaunch is used to register a launch or update it.
  3056  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
  3057  	txn := s.db.WriteTxn(index)
  3058  	defer txn.Abort()
  3059  
  3060  	// Check if the launch already exists
  3061  	existing, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID)
  3062  	if err != nil {
  3063  		return fmt.Errorf("periodic launch lookup failed: %v", err)
  3064  	}
  3065  
  3066  	// Setup the indexes correctly
  3067  	if existing != nil {
  3068  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
  3069  		launch.ModifyIndex = index
  3070  	} else {
  3071  		launch.CreateIndex = index
  3072  		launch.ModifyIndex = index
  3073  	}
  3074  
  3075  	// Insert the launch
  3076  	if err := txn.Insert("periodic_launch", launch); err != nil {
  3077  		return fmt.Errorf("launch insert failed: %v", err)
  3078  	}
  3079  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  3080  		return fmt.Errorf("index update failed: %v", err)
  3081  	}
  3082  
  3083  	return txn.Commit()
  3084  }
  3085  
  3086  // DeletePeriodicLaunch is used to delete the periodic launch
  3087  func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error {
  3088  	txn := s.db.WriteTxn(index)
  3089  	defer txn.Abort()
  3090  
  3091  	err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn)
  3092  	if err == nil {
  3093  		return txn.Commit()
  3094  	}
  3095  	return err
  3096  }
  3097  
  3098  // DeletePeriodicLaunchTxn is used to delete the periodic launch, like DeletePeriodicLaunch
  3099  // but in a transaction. Useful when making multiple modifications atomically.
  3100  func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error {
  3101  	// Lookup the launch
  3102  	existing, err := txn.First("periodic_launch", "id", namespace, jobID)
  3103  	if err != nil {
  3104  		return fmt.Errorf("launch lookup failed: %v", err)
  3105  	}
  3106  	if existing == nil {
  3107  		return fmt.Errorf("launch not found")
  3108  	}
  3109  
  3110  	// Delete the launch
  3111  	if err := txn.Delete("periodic_launch", existing); err != nil {
  3112  		return fmt.Errorf("launch delete failed: %v", err)
  3113  	}
  3114  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  3115  		return fmt.Errorf("index update failed: %v", err)
  3116  	}
  3117  
  3118  	return nil
  3119  }
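
// Illustrative sketch (hypothetical composition, not in the original source):
// the *Txn variants exist so that several writes can share one transaction.
// A caller already holding a write transaction can delete a launch and upsert
// evaluations atomically, committing only if every step succeeds. The ns,
// jobID, evals, and index values are placeholders.
//
//	txn := s.db.WriteTxn(index)
//	defer txn.Abort()
//	if err := s.DeletePeriodicLaunchTxn(index, ns, jobID, txn); err != nil {
//		return err
//	}
//	if err := s.UpsertEvalsTxn(index, evals, txn); err != nil {
//		return err
//	}
//	return txn.Commit()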
  3120  
  3121  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
  3122  // ID.
  3123  func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) {
  3124  	txn := s.db.ReadTxn()
  3125  
  3126  	watchCh, existing, err := txn.FirstWatch("periodic_launch", "id", namespace, id)
  3127  	if err != nil {
  3128  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
  3129  	}
  3130  
  3131  	ws.Add(watchCh)
  3132  
  3133  	if existing != nil {
  3134  		return existing.(*structs.PeriodicLaunch), nil
  3135  	}
  3136  	return nil, nil
  3137  }
  3138  
  3139  // PeriodicLaunches returns an iterator over all the periodic launches
  3140  func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3141  	txn := s.db.ReadTxn()
  3142  
  3143  	// Walk the entire table
  3144  	iter, err := txn.Get("periodic_launch", "id")
  3145  	if err != nil {
  3146  		return nil, err
  3147  	}
  3148  
  3149  	ws.Add(iter.WatchCh())
  3150  
  3151  	return iter, nil
  3152  }
  3153  
  3154  // UpsertEvals is used to upsert a set of evaluations
  3155  func (s *StateStore) UpsertEvals(msgType structs.MessageType, index uint64, evals []*structs.Evaluation) error {
  3156  	txn := s.db.WriteTxnMsgT(msgType, index)
  3157  	defer txn.Abort()
  3158  
  3159  	err := s.UpsertEvalsTxn(index, evals, txn)
  3160  	if err == nil {
  3161  		return txn.Commit()
  3162  	}
  3163  	return err
  3164  }
  3165  
  3166  // UpsertEvalsTxn is used to upsert a set of evaluations, like UpsertEvals but
  3167  // in a transaction. Useful when making multiple modifications atomically.
  3168  func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error {
  3169  	// Do a nested upsert
  3170  	jobs := make(map[structs.NamespacedID]string, len(evals))
  3171  	for _, eval := range evals {
  3172  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  3173  			return err
  3174  		}
  3175  
  3176  		tuple := structs.NamespacedID{
  3177  			ID:        eval.JobID,
  3178  			Namespace: eval.Namespace,
  3179  		}
  3180  		jobs[tuple] = ""
  3181  	}
  3182  
  3183  	// Set the job's status
  3184  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  3185  		return fmt.Errorf("setting job status failed: %v", err)
  3186  	}
  3187  
  3188  	return nil
  3189  }
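
// Illustrative sketch (not part of the original source): upserting a single
// evaluation through the public API. The message type shown is the one the
// FSM typically uses for eval updates; the eval fields and index are
// placeholders.
//
//	eval := &structs.Evaluation{
//		ID:        evalID, // a UUID generated by the caller
//		Namespace: structs.DefaultNamespace,
//		JobID:     "example",
//		Status:    structs.EvalStatusPending,
//	}
//	err := store.UpsertEvals(structs.EvalUpdateRequestType, 1000,
//		[]*structs.Evaluation{eval})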
  3190  
  3191  // nestedUpsertEval is used to nest an evaluation upsert within a transaction.
  3192  func (s *StateStore) nestedUpsertEval(txn *txn, index uint64, eval *structs.Evaluation) error {
  3193  	// Lookup the evaluation
  3194  	existing, err := txn.First("evals", "id", eval.ID)
  3195  	if err != nil {
  3196  		return fmt.Errorf("eval lookup failed: %v", err)
  3197  	}
  3198  
  3199  	// Update the indexes
  3200  	if existing != nil {
  3201  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
  3202  		eval.ModifyIndex = index
  3203  	} else {
  3204  		eval.CreateIndex = index
  3205  		eval.ModifyIndex = index
  3206  	}
  3207  
  3208  	// Update the job summary
  3209  	summaryRaw, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID)
  3210  	if err != nil {
  3211  		return fmt.Errorf("job summary lookup failed: %v", err)
  3212  	}
  3213  	if summaryRaw != nil {
  3214  		js := summaryRaw.(*structs.JobSummary).Copy()
  3215  		hasSummaryChanged := false
  3216  		for tg, num := range eval.QueuedAllocations {
  3217  			if summary, ok := js.Summary[tg]; ok {
  3218  				if summary.Queued != num {
  3219  					summary.Queued = num
  3220  					js.Summary[tg] = summary
  3221  					hasSummaryChanged = true
  3222  				}
  3223  			} else {
  3224  				s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
  3225  			}
  3226  		}
  3227  
  3228  		// Insert the job summary
  3229  		if hasSummaryChanged {
  3230  			js.ModifyIndex = index
  3231  			if err := txn.Insert("job_summary", js); err != nil {
  3232  				return fmt.Errorf("job summary insert failed: %v", err)
  3233  			}
  3234  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  3235  				return fmt.Errorf("index update failed: %v", err)
  3236  			}
  3237  		}
  3238  	}
  3239  
  3240  	// Check if the job has any blocked evaluations and cancel them
  3241  	if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 {
  3242  		// Get the blocked evaluation for a job if it exists
  3243  		iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked)
  3244  		if err != nil {
  3245  			return fmt.Errorf("failed to get blocked evals for job %q in namespace %q: %v", eval.JobID, eval.Namespace, err)
  3246  		}
  3247  
  3248  		var blocked []*structs.Evaluation
  3249  		for {
  3250  			raw := iter.Next()
  3251  			if raw == nil {
  3252  				break
  3253  			}
  3254  			blocked = append(blocked, raw.(*structs.Evaluation))
  3255  		}
  3256  
  3257  		// Go through and update the evals
  3258  		for _, blockedEval := range blocked {
  3259  			newEval := blockedEval.Copy()
  3260  			newEval.Status = structs.EvalStatusCancelled
  3261  			newEval.StatusDescription = fmt.Sprintf("evaluation %q successful", eval.ID)
  3262  			newEval.ModifyIndex = index
  3263  			newEval.ModifyTime = eval.ModifyTime
  3264  
  3265  			if err := txn.Insert("evals", newEval); err != nil {
  3266  				return fmt.Errorf("eval insert failed: %v", err)
  3267  			}
  3268  		}
  3269  	}
  3270  
  3271  	// Insert the eval
  3272  	if err := txn.Insert("evals", eval); err != nil {
  3273  		return fmt.Errorf("eval insert failed: %v", err)
  3274  	}
  3275  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3276  		return fmt.Errorf("index update failed: %v", err)
  3277  	}
  3278  	return nil
  3279  }
  3280  
  3281  // updateEvalModifyIndex is used to update the modify index of an evaluation that has been
  3282  // through a scheduler pass. This is done as part of plan apply. It ensures that when subsequent
  3283  // scheduler workers process a re-queued evaluation they see any partial updates from the plan apply.
  3284  func (s *StateStore) updateEvalModifyIndex(txn *txn, index uint64, evalID string) error {
  3285  	// Lookup the evaluation
  3286  	existing, err := txn.First("evals", "id", evalID)
  3287  	if err != nil {
  3288  		return fmt.Errorf("eval lookup failed: %v", err)
  3289  	}
  3290  	if existing == nil {
  3291  		s.logger.Error("unable to find eval", "eval_id", evalID)
  3292  		return fmt.Errorf("unable to find eval id %q", evalID)
  3293  	}
  3294  	eval := existing.(*structs.Evaluation).Copy()
  3295  	// Update the indexes
  3296  	eval.ModifyIndex = index
  3297  
  3298  	// Insert the eval
  3299  	if err := txn.Insert("evals", eval); err != nil {
  3300  		return fmt.Errorf("eval insert failed: %v", err)
  3301  	}
  3302  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3303  		return fmt.Errorf("index update failed: %v", err)
  3304  	}
  3305  	return nil
  3306  }
  3307  
  3308  // DeleteEvalsByFilter is used to delete all evals that are both safe to delete
  3309  // and match a filter.
  3310  func (s *StateStore) DeleteEvalsByFilter(index uint64, filterExpr string, pageToken string, perPage int32) error {
  3311  	txn := s.db.WriteTxn(index)
  3312  	defer txn.Abort()
  3313  
  3314  	// These are always user-initiated, so ensure the eval broker is paused.
  3315  	_, schedConfig, err := s.schedulerConfigTxn(txn)
  3316  	if err != nil {
  3317  		return err
  3318  	}
  3319  	if schedConfig == nil || !schedConfig.PauseEvalBroker {
  3320  		return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
  3321  	}
  3322  
  3323  	filter, err := bexpr.CreateEvaluator(filterExpr)
  3324  	if err != nil {
  3325  		return err
  3326  	}
  3327  
  3328  	iter, err := s.Evals(nil, SortDefault)
  3329  	if err != nil {
  3330  		return fmt.Errorf("failed to lookup evals: %v", err)
  3331  	}
  3332  
  3333  	// Note: Paginator imports this package for testing so we can't just use
  3334  	// Paginator
  3335  	pageCount := int32(0)
  3336  
  3337  	for {
  3338  		if pageCount >= perPage {
  3339  			break
  3340  		}
  3341  		raw := iter.Next()
  3342  		if raw == nil {
  3343  			break
  3344  		}
  3345  		eval := raw.(*structs.Evaluation)
  3346  		if eval.ID < pageToken {
  3347  			continue
  3348  		}
  3349  
  3350  		deleteOk, err := s.EvalIsUserDeleteSafe(nil, eval)
  3351  		if !deleteOk || err != nil {
  3352  			continue
  3353  		}
  3354  		match, err := filter.Evaluate(eval)
  3355  		if !match || err != nil {
  3356  			continue
  3357  		}
  3358  		if err := txn.Delete("evals", eval); err != nil {
  3359  			return fmt.Errorf("eval delete failed: %v", err)
  3360  		}
  3361  		pageCount++
  3362  	}
  3363  
  3364  	err = txn.Commit()
  3365  	return err
  3366  }
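
// Illustrative sketch (not in the original source): batch-deleting evals that
// match a go-bexpr filter. The scheduler configuration must have
// PauseEvalBroker set before this call or it returns an error. The filter,
// page token, and page size are placeholder values.
//
//	err := store.DeleteEvalsByFilter(
//		index,                // raft index for the write
//		`JobID == "example"`, // bexpr expression evaluated against each eval
//		"",                   // empty page token starts from the first eval ID
//		100,                  // delete at most 100 matching evals per call
//	)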
  3367  
  3368  // EvalIsUserDeleteSafe ensures an evaluation is safe to delete based on its
  3369  // related allocation and job information. This follows similar, but different
  3370  // rules to the eval reap checking, to ensure evaluations for running allocs or
  3371  // allocs which need the evaluation detail are not deleted.
  3372  //
  3373  // Returns both a bool and an error so that error in querying the related
  3374  // objects can be differentiated from reporting that the eval isn't safe to
  3375  // delete.
  3376  func (s *StateStore) EvalIsUserDeleteSafe(ws memdb.WatchSet, eval *structs.Evaluation) (bool, error) {
  3377  
  3378  	job, err := s.JobByID(ws, eval.Namespace, eval.JobID)
  3379  	if err != nil {
  3380  		return false, fmt.Errorf("failed to lookup job for eval: %v", err)
  3381  	}
  3382  
  3383  	allocs, err := s.AllocsByEval(ws, eval.ID)
  3384  	if err != nil {
  3385  		return false, fmt.Errorf("failed to lookup eval allocs: %v", err)
  3386  	}
  3387  
  3388  	return isEvalDeleteSafe(allocs, job), nil
  3389  }
  3390  
  3391  func isEvalDeleteSafe(allocs []*structs.Allocation, job *structs.Job) bool {
  3392  
  3393  	// If the job is deleted, stopped, or dead, all allocs are terminal and
  3394  	// the eval can be deleted.
  3395  	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
  3396  		return true
  3397  	}
  3398  
  3399  	// Iterate the allocations associated to the eval, if any, and check
  3400  	// whether we can delete the eval.
  3401  	for _, alloc := range allocs {
  3402  
  3403  		// If the allocation is still classed as running on the client, or
  3404  		// might be, we can't delete.
  3405  		switch alloc.ClientStatus {
  3406  		case structs.AllocClientStatusRunning, structs.AllocClientStatusUnknown:
  3407  			return false
  3408  		}
  3409  
  3410  		// If the alloc hasn't failed then we don't need to consider it for
  3411  		// rescheduling. Rescheduling needs to copy over information from the
  3412  		// previous alloc so that it can enforce the reschedule policy.
  3413  		if alloc.ClientStatus != structs.AllocClientStatusFailed {
  3414  			continue
  3415  		}
  3416  
  3417  		var reschedulePolicy *structs.ReschedulePolicy
  3418  		tg := job.LookupTaskGroup(alloc.TaskGroup)
  3419  
  3420  		if tg != nil {
  3421  			reschedulePolicy = tg.ReschedulePolicy
  3422  		}
  3423  
  3424  		// No reschedule policy or rescheduling is disabled
  3425  		if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
  3426  			continue
  3427  		}
  3428  
  3429  		// The restart tracking information has not been carried forward.
  3430  		if alloc.NextAllocation == "" {
  3431  			return false
  3432  		}
  3433  
  3434  		// This task has unlimited rescheduling and the alloc has not been
  3435  		// replaced, so we can't delete the eval yet.
  3436  		if reschedulePolicy.Unlimited {
  3437  			return false
  3438  		}
  3439  
  3440  		// No restarts have been attempted yet.
  3441  		if alloc.RescheduleTracker == nil || len(alloc.RescheduleTracker.Events) == 0 {
  3442  			return false
  3443  		}
  3444  	}
  3445  
  3446  	return true
  3447  }
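
// Illustrative sketch (not in the original source): checking delete safety
// before removing an eval by hand. An eval whose job is stopped or dead, or
// whose allocations are all terminal with reschedule tracking carried
// forward, is reported as safe. DeleteEval with userInitiated=true also
// requires the eval broker to be paused.
//
//	safe, err := store.EvalIsUserDeleteSafe(nil, eval)
//	if err == nil && safe {
//		err = store.DeleteEval(index, []string{eval.ID}, nil, true)
//	}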
  3448  
  3449  // DeleteEval is used to delete an evaluation
  3450  func (s *StateStore) DeleteEval(index uint64, evals, allocs []string, userInitiated bool) error {
  3451  	txn := s.db.WriteTxn(index)
  3452  	defer txn.Abort()
  3453  
  3454  	// If this deletion has been initiated by an operator, ensure the eval
  3455  	// broker is paused.
  3456  	if userInitiated {
  3457  		_, schedConfig, err := s.schedulerConfigTxn(txn)
  3458  		if err != nil {
  3459  			return err
  3460  		}
  3461  		if schedConfig == nil || !schedConfig.PauseEvalBroker {
  3462  			return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
  3463  		}
  3464  	}
  3465  
  3466  	jobs := make(map[structs.NamespacedID]string, len(evals))
  3467  
  3468  	// evalsTableUpdated and allocsTableUpdated allow us to track whether each
  3469  	// table has been modified. This allows us to skip updating the index table
  3470  	// entries if we do not need to.
  3471  	var evalsTableUpdated, allocsTableUpdated bool
  3472  
  3473  	for _, eval := range evals {
  3474  		existing, err := txn.First("evals", "id", eval)
  3475  		if err != nil {
  3476  			return fmt.Errorf("eval lookup failed: %v", err)
  3477  		}
  3478  		if existing == nil {
  3479  			continue
  3480  		}
  3481  		if err := txn.Delete("evals", existing); err != nil {
  3482  			return fmt.Errorf("eval delete failed: %v", err)
  3483  		}
  3484  
  3485  		// Mark that we have made a successful modification to the evals
  3486  		// table.
  3487  		evalsTableUpdated = true
  3488  
  3489  		eval := existing.(*structs.Evaluation)
  3490  
  3491  		tuple := structs.NamespacedID{
  3492  			ID:        eval.JobID,
  3493  			Namespace: eval.Namespace,
  3494  		}
  3495  		jobs[tuple] = ""
  3496  	}
  3497  
  3498  	for _, alloc := range allocs {
  3499  		raw, err := txn.First("allocs", "id", alloc)
  3500  		if err != nil {
  3501  			return fmt.Errorf("alloc lookup failed: %v", err)
  3502  		}
  3503  		if raw == nil {
  3504  			continue
  3505  		}
  3506  		if err := txn.Delete("allocs", raw); err != nil {
  3507  			return fmt.Errorf("alloc delete failed: %v", err)
  3508  		}
  3509  
  3510  		// Mark that we have made a successful modification to the allocs
  3511  		// table.
  3512  		allocsTableUpdated = true
  3513  	}
  3514  
  3515  	// Update the indexes
  3516  	if evalsTableUpdated {
  3517  		if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3518  			return fmt.Errorf("index update failed: %v", err)
  3519  		}
  3520  	}
  3521  	if allocsTableUpdated {
  3522  		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3523  			return fmt.Errorf("index update failed: %v", err)
  3524  		}
  3525  	}
  3526  
  3527  	// Set the job's status
  3528  	if err := s.setJobStatuses(index, txn, jobs, true); err != nil {
  3529  		return fmt.Errorf("setting job status failed: %v", err)
  3530  	}
  3531  
  3532  	return txn.Commit()
  3533  }
  3534  
  3535  // EvalByID is used to lookup an eval by its ID
  3536  func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) {
  3537  	txn := s.db.ReadTxn()
  3538  
  3539  	watchCh, existing, err := txn.FirstWatch("evals", "id", id)
  3540  	if err != nil {
  3541  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3542  	}
  3543  
  3544  	ws.Add(watchCh)
  3545  
  3546  	if existing != nil {
  3547  		return existing.(*structs.Evaluation), nil
  3548  	}
  3549  	return nil, nil
  3550  }
  3551  
  3552  // EvalsRelatedToID is used to retrieve the evals that are related (next,
  3553  // previous, or blocked) to the provided eval ID.
  3554  func (s *StateStore) EvalsRelatedToID(ws memdb.WatchSet, id string) ([]*structs.EvaluationStub, error) {
  3555  	txn := s.db.ReadTxn()
  3556  
  3557  	raw, err := txn.First("evals", "id", id)
  3558  	if err != nil {
  3559  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3560  	}
  3561  	if raw == nil {
  3562  		return nil, nil
  3563  	}
  3564  	eval := raw.(*structs.Evaluation)
  3565  
  3566  	relatedEvals := []*structs.EvaluationStub{}
  3567  	todo := eval.RelatedIDs()
  3568  	done := map[string]bool{
  3569  		eval.ID: true, // don't place the requested eval in the related list.
  3570  	}
  3571  
  3572  	for len(todo) > 0 {
  3573  		// Pop the first value from the todo list.
  3574  		current := todo[0]
  3575  		todo = todo[1:]
  3576  		if current == "" {
  3577  			continue
  3578  		}
  3579  
  3580  		// Skip value if we already have it in the results.
  3581  		if done[current] {
  3582  			continue
  3583  		}
  3584  
  3585  		eval, err := s.EvalByID(ws, current)
  3586  		if err != nil {
  3587  			return nil, err
  3588  		}
  3589  		if eval == nil {
  3590  			continue
  3591  		}
  3592  
  3593  		todo = append(todo, eval.RelatedIDs()...)
  3594  		relatedEvals = append(relatedEvals, eval.Stub())
  3595  		done[eval.ID] = true
  3596  	}
  3597  
  3598  	return relatedEvals, nil
  3599  }
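
// Illustrative sketch (not in the original source): walking the chain of
// related evaluations for an eval. The result excludes the eval that was
// asked about and contains each related eval only once.
//
//	ws := memdb.NewWatchSet()
//	related, err := store.EvalsRelatedToID(ws, evalID)
//	if err == nil {
//		for _, stub := range related {
//			fmt.Println(stub.ID, stub.Status)
//		}
//	}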
  3600  
  3601  // EvalsByIDPrefix is used to lookup evaluations by prefix in a particular
  3602  // namespace
  3603  func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
  3604  	txn := s.db.ReadTxn()
  3605  
  3606  	var iter memdb.ResultIterator
  3607  	var err error
  3608  
  3609  	// Get an iterator over all evals by the id prefix
  3610  	switch sort {
  3611  	case SortReverse:
  3612  		iter, err = txn.GetReverse("evals", "id_prefix", id)
  3613  	default:
  3614  		iter, err = txn.Get("evals", "id_prefix", id)
  3615  	}
  3616  	if err != nil {
  3617  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3618  	}
  3619  
  3620  	ws.Add(iter.WatchCh())
  3621  
  3622  	// Wrap the iterator in a filter
  3623  	wrap := memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace))
  3624  	return wrap, nil
  3625  }
  3626  
  3627  // evalNamespaceFilter returns a filter function that filters all evaluations
  3628  // not in the given namespace.
  3629  func evalNamespaceFilter(namespace string) func(interface{}) bool {
  3630  	return func(raw interface{}) bool {
  3631  		eval, ok := raw.(*structs.Evaluation)
  3632  		if !ok {
  3633  			return true
  3634  		}
  3635  
  3636  		return namespace != structs.AllNamespacesSentinel &&
  3637  			eval.Namespace != namespace
  3638  	}
  3639  }
  3640  
  3641  // EvalsByJob returns all the evaluations by job id
  3642  func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) {
  3643  	txn := s.db.ReadTxn()
  3644  
  3645  	// Get an iterator over the job's evaluations
  3646  	iter, err := txn.Get("evals", "job_prefix", namespace, jobID)
  3647  	if err != nil {
  3648  		return nil, err
  3649  	}
  3650  
  3651  	ws.Add(iter.WatchCh())
  3652  
  3653  	var out []*structs.Evaluation
  3654  	for {
  3655  		raw := iter.Next()
  3656  		if raw == nil {
  3657  			break
  3658  		}
  3659  
  3660  		e := raw.(*structs.Evaluation)
  3661  
  3662  		// Filter non-exact matches
  3663  		if e.JobID != jobID {
  3664  			continue
  3665  		}
  3666  
  3667  		out = append(out, e)
  3668  	}
  3669  	return out, nil
  3670  }
  3671  
  3672  // Evals returns an iterator over all the evaluations in ascending or descending
  3673  // order of CreateIndex, as determined by the sort parameter.
  3674  func (s *StateStore) Evals(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  3675  	txn := s.db.ReadTxn()
  3676  
  3677  	var it memdb.ResultIterator
  3678  	var err error
  3679  
  3680  	switch sort {
  3681  	case SortReverse:
  3682  		it, err = txn.GetReverse("evals", "create")
  3683  	default:
  3684  		it, err = txn.Get("evals", "create")
  3685  	}
  3686  
  3687  	if err != nil {
  3688  		return nil, err
  3689  	}
  3690  
  3691  	ws.Add(it.WatchCh())
  3692  
  3693  	return it, nil
  3694  }
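
// Illustrative sketch (not in the original source): iterating evaluations
// newest-first by passing SortReverse, which walks the "create" index in
// reverse order.
//
//	iter, err := store.Evals(nil, SortReverse)
//	if err == nil {
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			eval := raw.(*structs.Evaluation)
//			_ = eval // most recently created evals come first
//		}
//	}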
  3695  
  3696  // EvalsByNamespace returns an iterator over all evaluations in no particular
  3697  // order.
  3698  //
  3699  // todo(shoenig): can this be removed?
  3700  func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  3701  	txn := s.db.ReadTxn()
  3702  
  3703  	it, err := txn.Get("evals", "namespace", namespace)
  3704  	if err != nil {
  3705  		return nil, err
  3706  	}
  3707  
  3708  	ws.Add(it.WatchCh())
  3709  
  3710  	return it, nil
  3711  }
  3712  
  3713  func (s *StateStore) EvalsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
  3714  	txn := s.db.ReadTxn()
  3715  
  3716  	var (
  3717  		it    memdb.ResultIterator
  3718  		err   error
  3719  		exact = terminate(namespace)
  3720  	)
  3721  
  3722  	switch sort {
  3723  	case SortReverse:
  3724  		it, err = txn.GetReverse("evals", "namespace_create_prefix", exact)
  3725  	default:
  3726  		it, err = txn.Get("evals", "namespace_create_prefix", exact)
  3727  	}
  3728  
  3729  	if err != nil {
  3730  		return nil, err
  3731  	}
  3732  
  3733  	ws.Add(it.WatchCh())
  3734  
  3735  	return it, nil
  3736  }
  3737  
  3738  // UpdateAllocsFromClient is used to update allocations based on input
  3739  // from a client. While the schedulers are the authority on the allocation for
  3740  // most things, some updates are authoritative from the client. Specifically,
  3741  // the desired state comes from the schedulers, while the actual state comes
  3742  // from clients.
  3743  func (s *StateStore) UpdateAllocsFromClient(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
  3744  	txn := s.db.WriteTxnMsgT(msgType, index)
  3745  	defer txn.Abort()
  3746  
  3747  	// Capture all nodes being affected. Alloc updates from clients are batched
  3748  	// so this request may include allocs from several nodes.
  3749  	nodeIDs := set.New[string](1)
  3750  
  3751  	// Handle each of the updated allocations
  3752  	for _, alloc := range allocs {
  3753  		nodeIDs.Insert(alloc.NodeID)
  3754  		if err := s.nestedUpdateAllocFromClient(txn, index, alloc); err != nil {
  3755  			return err
  3756  		}
  3757  	}
  3758  
  3759  	// Update the indexes
  3760  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3761  		return fmt.Errorf("index update failed: %v", err)
  3762  	}
  3763  
  3764  	// Update the index of when nodes last updated their allocs.
  3765  	for _, nodeID := range nodeIDs.List() {
  3766  		if err := s.updateClientAllocUpdateIndex(txn, index, nodeID); err != nil {
  3767  			return fmt.Errorf("node update failed: %v", err)
  3768  		}
  3769  	}
  3770  
  3771  	return txn.Commit()
  3772  }
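
// Illustrative sketch (not in the original source): a client-originated update
// carries only the fields the client is authoritative for (client status,
// description, task states, network status); scheduler-owned fields on the
// stored allocation are preserved. The message type is the one commonly used
// for client alloc updates; the IDs and task states are placeholders.
//
//	update := &structs.Allocation{
//		ID:           allocID,
//		NodeID:       nodeID,
//		ClientStatus: structs.AllocClientStatusComplete,
//		TaskStates:   taskStates,
//	}
//	err := store.UpdateAllocsFromClient(
//		structs.AllocClientUpdateRequestType, index,
//		[]*structs.Allocation{update})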
  3773  
  3774  // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status
  3775  func (s *StateStore) nestedUpdateAllocFromClient(txn *txn, index uint64, alloc *structs.Allocation) error {
  3776  	// Look for existing alloc
  3777  	existing, err := txn.First("allocs", "id", alloc.ID)
  3778  	if err != nil {
  3779  		return fmt.Errorf("alloc lookup failed: %v", err)
  3780  	}
  3781  
  3782  	// Nothing to do if this does not exist
  3783  	if existing == nil {
  3784  		return nil
  3785  	}
  3786  	exist := existing.(*structs.Allocation)
  3787  
  3788  	// Copy everything from the existing allocation
  3789  	copyAlloc := exist.Copy()
  3790  
  3791  	// Pull in anything the client is the authority on
  3792  	copyAlloc.ClientStatus = alloc.ClientStatus
  3793  	copyAlloc.ClientDescription = alloc.ClientDescription
  3794  	copyAlloc.TaskStates = alloc.TaskStates
  3795  	copyAlloc.NetworkStatus = alloc.NetworkStatus
  3796  
  3797  	// The client can only set its deployment health and timestamp, so just take
  3798  	// those
  3799  	if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil {
  3800  		oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth()
  3801  		newHasHealthy := alloc.DeploymentStatus.HasHealth()
  3802  
  3803  		// We got new health information from the client
  3804  		if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) {
  3805  			// Updated deployment health and timestamp
  3806  			copyAlloc.DeploymentStatus.Healthy = pointer.Of(*alloc.DeploymentStatus.Healthy)
  3807  			copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp
  3808  			copyAlloc.DeploymentStatus.ModifyIndex = index
  3809  		}
  3810  	} else if alloc.DeploymentStatus != nil {
  3811  		// First time getting a deployment status so copy everything and just
  3812  		// set the index
  3813  		copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy()
  3814  		copyAlloc.DeploymentStatus.ModifyIndex = index
  3815  	}
  3816  
  3817  	// Update the modify index
  3818  	copyAlloc.ModifyIndex = index
  3819  
  3820  	// Update the modify time
  3821  	copyAlloc.ModifyTime = alloc.ModifyTime
  3822  
  3823  	if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3824  		return fmt.Errorf("error updating deployment: %v", err)
  3825  	}
  3826  
  3827  	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3828  		return fmt.Errorf("error updating job summary: %v", err)
  3829  	}
  3830  
  3831  	if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3832  		return err
  3833  	}
  3834  
  3835  	if err := s.updatePluginForTerminalAlloc(index, copyAlloc, txn); err != nil {
  3836  		return err
  3837  	}
  3838  
  3839  	// Update the allocation
  3840  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  3841  		return fmt.Errorf("alloc insert failed: %v", err)
  3842  	}
  3843  
  3844  	// Set the job's status
  3845  	forceStatus := ""
  3846  	if !copyAlloc.TerminalStatus() {
  3847  		forceStatus = structs.JobStatusRunning
  3848  	}
  3849  
  3850  	tuple := structs.NamespacedID{
  3851  		ID:        exist.JobID,
  3852  		Namespace: exist.Namespace,
  3853  	}
  3854  	jobs := map[structs.NamespacedID]string{tuple: forceStatus}
  3855  
  3856  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  3857  		return fmt.Errorf("setting job status failed: %v", err)
  3858  	}
  3859  	return nil
  3860  }
  3861  
  3862  func (s *StateStore) updateClientAllocUpdateIndex(txn *txn, index uint64, nodeID string) error {
  3863  	existing, err := txn.First("nodes", "id", nodeID)
  3864  	if err != nil {
  3865  		return fmt.Errorf("node lookup failed: %v", err)
  3866  	}
  3867  	if existing == nil {
  3868  		return nil
  3869  	}
  3870  
  3871  	node := existing.(*structs.Node)
  3872  	copyNode := node.Copy()
  3873  	copyNode.LastAllocUpdateIndex = index
  3874  
  3875  	if err := txn.Insert("nodes", copyNode); err != nil {
  3876  		return fmt.Errorf("node update failed: %v", err)
  3877  	}
  3878  	if err := txn.Insert("index", &IndexEntry{"nodes", txn.Index}); err != nil {
  3879  		return fmt.Errorf("index update failed: %v", err)
  3880  	}
  3881  	return nil
  3882  }
  3883  
  3884  // UpsertAllocs is used to evict a set of allocations and allocate new ones at
  3885  // the same time.
  3886  func (s *StateStore) UpsertAllocs(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
  3887  	txn := s.db.WriteTxn(index)
  3888  	defer txn.Abort()
  3889  	if err := s.upsertAllocsImpl(index, allocs, txn); err != nil {
  3890  		return err
  3891  	}
  3892  	return txn.Commit()
  3893  }
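
// Illustrative sketch (not in the original source): upserting a brand new
// allocation. As upsertAllocsImpl below requires, a first-time alloc must
// carry its Job or the upsert fails; later upserts may omit it and the
// stored job is re-attached. The alloc value and message type are
// placeholders.
//
//	alloc := newAlloc() // caller-provided alloc with alloc.Job populated
//	err := store.UpsertAllocs(structs.AllocUpdateRequestType, index,
//		[]*structs.Allocation{alloc})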
  3894  
  3895  // upsertAllocsImpl is the actual implementation of UpsertAllocs so that it may
  3896  // be used within an existing transaction.
  3897  func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *txn) error {
  3898  	// Handle the allocations
  3899  	jobs := make(map[structs.NamespacedID]string, 1)
  3900  	for _, alloc := range allocs {
  3901  		existing, err := txn.First("allocs", "id", alloc.ID)
  3902  		if err != nil {
  3903  			return fmt.Errorf("alloc lookup failed: %v", err)
  3904  		}
  3905  		exist, _ := existing.(*structs.Allocation)
  3906  
  3907  		if exist == nil {
  3908  			alloc.CreateIndex = index
  3909  			alloc.ModifyIndex = index
  3910  			alloc.AllocModifyIndex = index
  3911  			if alloc.DeploymentStatus != nil {
  3912  				alloc.DeploymentStatus.ModifyIndex = index
  3913  			}
  3914  
  3915  			// Issue https://github.com/hernad/nomad/issues/2583 uncovered
  3916  			// a race between a forced garbage collection and the scheduler
  3917  			// marking an allocation as terminal. The issue is that the
  3918  			// allocation from the scheduler has its job normalized and the FSM
  3919  			// will only denormalize if the allocation is not terminal. However,
  3920  			// if the allocation is garbage collected, that will result in an
  3921  			// allocation being upserted for the first time without a job
  3922  			// attached. By returning an error here, it will cause the FSM to
  3923  			// error, causing the plan_apply to error and thus causing the
  3924  			// evaluation to be failed. This will force an index refresh that
  3925  			// should solve this issue.
  3926  			if alloc.Job == nil {
  3927  				return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID)
  3928  			}
  3929  		} else {
  3930  			alloc.CreateIndex = exist.CreateIndex
  3931  			alloc.ModifyIndex = index
  3932  			alloc.AllocModifyIndex = index
  3933  
  3934  			// Keep the clients task states
  3935  			alloc.TaskStates = exist.TaskStates
  3936  
  3937  			// If the scheduler is marking this allocation as lost or unknown we do not
  3938  			// want to reuse the status of the existing allocation.
  3939  			if alloc.ClientStatus != structs.AllocClientStatusLost &&
  3940  				alloc.ClientStatus != structs.AllocClientStatusUnknown {
  3941  				alloc.ClientStatus = exist.ClientStatus
  3942  				alloc.ClientDescription = exist.ClientDescription
  3943  			}
  3944  
  3945  			// The job has been denormalized so re-attach the original job
  3946  			if alloc.Job == nil {
  3947  				alloc.Job = exist.Job
  3948  			}
  3949  		}
  3950  
  3951  		// OPTIMIZATION:
  3952  		// These should be given a map of new to old allocations and the updates
  3953  		// should be done once over all changes. The current implementation causes
  3954  		// O(n) lookups/copies/insertions rather than O(1).
  3955  		if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil {
  3956  			return fmt.Errorf("error updating deployment: %v", err)
  3957  		}
  3958  
  3959  		if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil {
  3960  			return fmt.Errorf("error updating job summary: %v", err)
  3961  		}
  3962  
  3963  		if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil {
  3964  			return err
  3965  		}
  3966  
  3967  		if err := s.updatePluginForTerminalAlloc(index, alloc, txn); err != nil {
  3968  			return err
  3969  		}
  3970  
  3971  		if err := txn.Insert("allocs", alloc); err != nil {
  3972  			return fmt.Errorf("alloc insert failed: %v", err)
  3973  		}
  3974  
  3975  		if alloc.PreviousAllocation != "" {
  3976  			prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation)
  3977  			if err != nil {
  3978  				return fmt.Errorf("alloc lookup failed: %v", err)
  3979  			}
  3980  			existingPrevAlloc, _ := prevAlloc.(*structs.Allocation)
  3981  			if existingPrevAlloc != nil {
  3982  				prevAllocCopy := existingPrevAlloc.Copy()
  3983  				prevAllocCopy.NextAllocation = alloc.ID
  3984  				prevAllocCopy.ModifyIndex = index
  3985  				if err := txn.Insert("allocs", prevAllocCopy); err != nil {
  3986  					return fmt.Errorf("alloc insert failed: %v", err)
  3987  				}
  3988  			}
  3989  		}
  3990  
  3991  		// If the allocation is running, force the job to running status.
  3992  		forceStatus := ""
  3993  		if !alloc.TerminalStatus() {
  3994  			forceStatus = structs.JobStatusRunning
  3995  		}
  3996  
  3997  		tuple := structs.NamespacedID{
  3998  			ID:        alloc.JobID,
  3999  			Namespace: alloc.Namespace,
  4000  		}
  4001  		jobs[tuple] = forceStatus
  4002  	}
  4003  
  4004  	// Update the indexes
  4005  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4006  		return fmt.Errorf("index update failed: %v", err)
  4007  	}
  4008  
  4009  	// Set the job's status
  4010  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  4011  		return fmt.Errorf("setting job status failed: %v", err)
  4012  	}
  4013  
  4014  	return nil
  4015  }
  4016  
  4017  // UpdateAllocsDesiredTransitions is used to update a set of allocations
  4018  // desired transitions.
  4019  func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType, index uint64, allocs map[string]*structs.DesiredTransition,
  4020  	evals []*structs.Evaluation) error {
  4021  
  4022  	txn := s.db.WriteTxnMsgT(msgType, index)
  4023  	defer txn.Abort()
  4024  
  4025  	// Handle each of the updated allocations
  4026  	for id, transition := range allocs {
  4027  		if err := s.UpdateAllocDesiredTransitionTxn(txn, index, id, transition); err != nil {
  4028  			return err
  4029  		}
  4030  	}
  4031  
  4032  	for _, eval := range evals {
  4033  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  4034  			return err
  4035  		}
  4036  	}
  4037  
  4038  	// Update the indexes
  4039  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4040  		return fmt.Errorf("index update failed: %v", err)
  4041  	}
  4042  
  4043  	return txn.Commit()
  4044  }
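
// Illustrative sketch (not in the original source): asking an allocation to
// migrate by merging a DesiredTransition, together with a follow-up eval.
// The message type shown is the one typically used for this request; the
// alloc ID and eval are placeholders.
//
//	transitions := map[string]*structs.DesiredTransition{
//		allocID: {Migrate: pointer.Of(true)},
//	}
//	err := store.UpdateAllocsDesiredTransitions(
//		structs.AllocUpdateDesiredTransitionRequestType, index,
//		transitions, []*structs.Evaluation{followUpEval})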
  4045  
  4046  // UpdateAllocDesiredTransitionTxn is used to nest an update of an
  4047  // allocation's desired transition within an existing transaction.
  4048  func (s *StateStore) UpdateAllocDesiredTransitionTxn(
  4049  	txn *txn, index uint64, allocID string,
  4050  	transition *structs.DesiredTransition) error {
  4051  
  4052  	// Look for existing alloc
  4053  	existing, err := txn.First("allocs", "id", allocID)
  4054  	if err != nil {
  4055  		return fmt.Errorf("alloc lookup failed: %v", err)
  4056  	}
  4057  
  4058  	// Nothing to do if this does not exist
  4059  	if existing == nil {
  4060  		return nil
  4061  	}
  4062  	exist := existing.(*structs.Allocation)
  4063  
  4064  	// Copy everything from the existing allocation
  4065  	copyAlloc := exist.Copy()
  4066  
  4067  	// Merge the desired transitions
  4068  	copyAlloc.DesiredTransition.Merge(transition)
  4069  
  4070  	// Update the modify indexes
  4071  	copyAlloc.ModifyIndex = index
  4072  	copyAlloc.AllocModifyIndex = index
  4073  
  4074  	// Update the allocation
  4075  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  4076  		return fmt.Errorf("alloc insert failed: %v", err)
  4077  	}
  4078  
  4079  	return nil
  4080  }
  4081  
  4082  // AllocByID is used to lookup an allocation by its ID
  4083  func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  4084  	txn := s.db.ReadTxn()
  4085  	return s.allocByIDImpl(txn, ws, id)
  4086  }
  4087  
  4088  // allocByIDImpl retrieves an allocation and is called within an existing
  4089  // transaction. An optional watch set can be passed to add the allocation to
  4090  // the watch set.
  4091  func (s *StateStore) allocByIDImpl(txn Txn, ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  4092  	watchCh, raw, err := txn.FirstWatch("allocs", "id", id)
  4093  	if err != nil {
  4094  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  4095  	}
  4096  
  4097  	ws.Add(watchCh)
  4098  
  4099  	if raw == nil {
  4100  		return nil, nil
  4101  	}
  4102  	alloc := raw.(*structs.Allocation)
  4103  	return alloc, nil
  4104  }
  4105  
  4106  // AllocsByIDPrefix is used to lookup allocs by prefix
  4107  func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
  4108  	txn := s.db.ReadTxn()
  4109  
  4110  	var iter memdb.ResultIterator
  4111  	var err error
  4112  
  4113  	switch sort {
  4114  	case SortReverse:
  4115  		iter, err = txn.GetReverse("allocs", "id_prefix", id)
  4116  	default:
  4117  		iter, err = txn.Get("allocs", "id_prefix", id)
  4118  	}
  4119  	if err != nil {
  4120  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  4121  	}
  4122  
  4123  	ws.Add(iter.WatchCh())
  4124  
  4125  	// Wrap the iterator in a filter
  4126  	wrap := memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace))
  4127  	return wrap, nil
  4128  }
  4129  
  4130  // allocNamespaceFilter returns a filter function that filters all allocations
  4131  // not in the given namespace.
  4132  func allocNamespaceFilter(namespace string) func(interface{}) bool {
  4133  	return func(raw interface{}) bool {
  4134  		alloc, ok := raw.(*structs.Allocation)
  4135  		if !ok {
  4136  			return true
  4137  		}
  4138  
  4139  		if namespace == structs.AllNamespacesSentinel {
  4140  			return false
  4141  		}
  4142  
  4143  		return alloc.Namespace != namespace
  4144  	}
  4145  }
  4146  
  4147  // AllocsByIDPrefixAllNSs is used to lookup allocs by ID prefix across all namespaces.
  4148  func (s *StateStore) AllocsByIDPrefixAllNSs(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  4149  	txn := s.db.ReadTxn()
  4150  
  4151  	iter, err := txn.Get("allocs", "id_prefix", prefix)
  4152  	if err != nil {
  4153  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  4154  	}
  4155  
  4156  	ws.Add(iter.WatchCh())
  4157  
  4158  	return iter, nil
  4159  }
  4160  
  4161  // AllocsByNode returns all the allocations by node
  4162  func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  4163  	txn := s.db.ReadTxn()
  4164  
  4165  	return allocsByNodeTxn(txn, ws, node)
  4166  }
  4167  
  4168  func allocsByNodeTxn(txn ReadTxn, ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  4169  	// Get an iterator over the node allocations, using only the
  4170  	// node prefix which ignores the terminal status
  4171  	iter, err := txn.Get("allocs", "node_prefix", node)
  4172  	if err != nil {
  4173  		return nil, err
  4174  	}
  4175  
  4176  	ws.Add(iter.WatchCh())
  4177  
  4178  	var out []*structs.Allocation
  4179  	for {
  4180  		raw := iter.Next()
  4181  		if raw == nil {
  4182  			break
  4183  		}
  4184  		out = append(out, raw.(*structs.Allocation))
  4185  	}
  4186  	return out, nil
  4187  }
  4188  
  4189  // AllocsByNodeTerminal returns all the allocations by node and terminal
  4190  // status.
  4191  func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) {
  4192  	txn := s.db.ReadTxn()
  4193  
  4194  	// Get an iterator over the node allocations
  4195  	iter, err := txn.Get("allocs", "node", node, terminal)
  4196  	if err != nil {
  4197  		return nil, err
  4198  	}
  4199  
  4200  	ws.Add(iter.WatchCh())
  4201  
  4202  	var out []*structs.Allocation
  4203  	for {
  4204  		raw := iter.Next()
  4205  		if raw == nil {
  4206  			break
  4207  		}
  4208  		out = append(out, raw.(*structs.Allocation))
  4209  	}
  4210  	return out, nil
  4211  }
  4212  
  4213  // AllocsByJob returns allocations by job id
  4214  func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) {
  4215  	txn := s.db.ReadTxn()
  4216  
  4217  	// Get the job
  4218  	var job *structs.Job
  4219  	rawJob, err := txn.First("jobs", "id", namespace, jobID)
  4220  	if err != nil {
  4221  		return nil, err
  4222  	}
  4223  	if rawJob != nil {
  4224  		job = rawJob.(*structs.Job)
  4225  	}
  4226  
  4227  	// Get an iterator over the job's allocations
  4228  	iter, err := txn.Get("allocs", "job", namespace, jobID)
  4229  	if err != nil {
  4230  		return nil, err
  4231  	}
  4232  
  4233  	ws.Add(iter.WatchCh())
  4234  
  4235  	var out []*structs.Allocation
  4236  	for {
  4237  		raw := iter.Next()
  4238  		if raw == nil {
  4239  			break
  4240  		}
  4241  
  4242  		alloc := raw.(*structs.Allocation)
  4243  		// If the allocation belongs to a job with the same ID but a different
  4244  		// create index, and we are not fetching all allocations whose job
  4245  		// matches the same job ID, then skip it.
  4246  		if !anyCreateIndex && job != nil && alloc.Job.CreateIndex != job.CreateIndex {
  4247  			continue
  4248  		}
  4249  		out = append(out, raw.(*structs.Allocation))
  4250  	}
  4251  	return out, nil
  4252  }
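
// Illustrative sketch (not in the original source): with anyCreateIndex set
// to false only allocations created by the current registration of the job
// are returned, while true also includes allocs left over from an earlier
// job with the same ID that was purged and re-registered.
//
//	ws := memdb.NewWatchSet()
//	current, _ := store.AllocsByJob(ws, structs.DefaultNamespace, "example", false)
//	all, _ := store.AllocsByJob(ws, structs.DefaultNamespace, "example", true)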
  4253  
  4254  // AllocsByEval returns all the allocations by eval id
  4255  func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) {
  4256  	txn := s.db.ReadTxn()
  4257  
  4258  	// Get an iterator over the eval allocations
  4259  	iter, err := txn.Get("allocs", "eval", evalID)
  4260  	if err != nil {
  4261  		return nil, err
  4262  	}
  4263  
  4264  	ws.Add(iter.WatchCh())
  4265  
  4266  	var out []*structs.Allocation
  4267  	for {
  4268  		raw := iter.Next()
  4269  		if raw == nil {
  4270  			break
  4271  		}
  4272  		out = append(out, raw.(*structs.Allocation))
  4273  	}
  4274  	return out, nil
  4275  }
  4276  
  4277  // AllocsByDeployment returns all the allocations by deployment id
  4278  func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) {
  4279  	txn := s.db.ReadTxn()
  4280  
  4281  	// Get an iterator over the deployment's allocations
  4282  	iter, err := txn.Get("allocs", "deployment", deploymentID)
  4283  	if err != nil {
  4284  		return nil, err
  4285  	}
  4286  
  4287  	ws.Add(iter.WatchCh())
  4288  
  4289  	var out []*structs.Allocation
  4290  	for {
  4291  		raw := iter.Next()
  4292  		if raw == nil {
  4293  			break
  4294  		}
  4295  		out = append(out, raw.(*structs.Allocation))
  4296  	}
  4297  	return out, nil
  4298  }
  4299  
  4300  // Allocs returns an iterator over all the allocations.
  4301  func (s *StateStore) Allocs(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  4302  	txn := s.db.ReadTxn()
  4303  
  4304  	var it memdb.ResultIterator
  4305  	var err error
  4306  
  4307  	switch sort {
  4308  	case SortReverse:
  4309  		it, err = txn.GetReverse("allocs", "create")
  4310  	default:
  4311  		it, err = txn.Get("allocs", "create")
  4312  	}
  4313  
  4314  	if err != nil {
  4315  		return nil, err
  4316  	}
  4317  
  4318  	ws.Add(it.WatchCh())
  4319  
  4320  	return it, nil
  4321  }
  4322  
  4323  func (s *StateStore) AllocsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
  4324  	txn := s.db.ReadTxn()
  4325  
  4326  	var (
  4327  		it    memdb.ResultIterator
  4328  		err   error
  4329  		exact = terminate(namespace)
  4330  	)
  4331  
  4332  	switch sort {
  4333  	case SortReverse:
  4334  		it, err = txn.GetReverse("allocs", "namespace_create_prefix", exact)
  4335  	default:
  4336  		it, err = txn.Get("allocs", "namespace_create_prefix", exact)
  4337  	}
  4338  
  4339  	if err != nil {
  4340  		return nil, err
  4341  	}
  4342  
  4343  	ws.Add(it.WatchCh())
  4344  
  4345  	return it, nil
  4346  }
  4347  
  4348  // AllocsByNamespace returns an iterator over all the allocations in the
  4349  // namespace
  4350  func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  4351  	txn := s.db.ReadTxn()
  4352  	return s.allocsByNamespaceImpl(ws, txn, namespace)
  4353  }
  4354  
  4355  // allocsByNamespaceImpl returns an iterator over all the allocations in the
  4356  // namespace
  4357  func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *txn, namespace string) (memdb.ResultIterator, error) {
  4358  	// Walk the entire table
  4359  	iter, err := txn.Get("allocs", "namespace", namespace)
  4360  	if err != nil {
  4361  		return nil, err
  4362  	}
  4363  
  4364  	ws.Add(iter.WatchCh())
  4365  
  4366  	return iter, nil
  4367  }
  4368  
  4369  // UpsertVaultAccessor is used to register a set of Vault Accessors.
  4370  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  4371  	txn := s.db.WriteTxn(index)
  4372  	defer txn.Abort()
  4373  
  4374  	for _, accessor := range accessors {
  4375  		// Set the create index
  4376  		accessor.CreateIndex = index
  4377  
  4378  		// Insert the accessor
  4379  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  4380  			return fmt.Errorf("accessor insert failed: %v", err)
  4381  		}
  4382  	}
  4383  
  4384  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  4385  		return fmt.Errorf("index update failed: %v", err)
  4386  	}
  4387  
  4388  	return txn.Commit()
  4389  }
  4390  
  4391  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  4392  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  4393  	txn := s.db.WriteTxn(index)
  4394  	defer txn.Abort()
  4395  
  4396  	// Lookup the accessor
  4397  	for _, accessor := range accessors {
  4398  		// Delete the accessor
  4399  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  4400  			return fmt.Errorf("accessor delete failed: %v", err)
  4401  		}
  4402  	}
  4403  
  4404  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  4405  		return fmt.Errorf("index update failed: %v", err)
  4406  	}
  4407  
  4408  	return txn.Commit()
  4409  }
  4410  
  4411  // VaultAccessor returns the given Vault accessor
  4412  func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) {
  4413  	txn := s.db.ReadTxn()
  4414  
  4415  	watchCh, existing, err := txn.FirstWatch("vault_accessors", "id", accessor)
  4416  	if err != nil {
  4417  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  4418  	}
  4419  
  4420  	ws.Add(watchCh)
  4421  
  4422  	if existing != nil {
  4423  		return existing.(*structs.VaultAccessor), nil
  4424  	}
  4425  
  4426  	return nil, nil
  4427  }
  4428  
  4429  // VaultAccessors returns an iterator of Vault accessors.
  4430  func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4431  	txn := s.db.ReadTxn()
  4432  
  4433  	iter, err := txn.Get("vault_accessors", "id")
  4434  	if err != nil {
  4435  		return nil, err
  4436  	}
  4437  
  4438  	ws.Add(iter.WatchCh())
  4439  
  4440  	return iter, nil
  4441  }
  4442  
  4443  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  4444  func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) {
  4445  	txn := s.db.ReadTxn()
  4446  
  4447  	// Get an iterator over the accessors
  4448  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  4449  	if err != nil {
  4450  		return nil, err
  4451  	}
  4452  
  4453  	ws.Add(iter.WatchCh())
  4454  
  4455  	var out []*structs.VaultAccessor
  4456  	for {
  4457  		raw := iter.Next()
  4458  		if raw == nil {
  4459  			break
  4460  		}
  4461  		out = append(out, raw.(*structs.VaultAccessor))
  4462  	}
  4463  	return out, nil
  4464  }
  4465  
  4466  // VaultAccessorsByNode returns all the Vault accessors by node id
  4467  func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) {
  4468  	txn := s.db.ReadTxn()
  4469  
  4470  	// Get an iterator over the accessors
  4471  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  4472  	if err != nil {
  4473  		return nil, err
  4474  	}
  4475  
  4476  	ws.Add(iter.WatchCh())
  4477  
  4478  	var out []*structs.VaultAccessor
  4479  	for {
  4480  		raw := iter.Next()
  4481  		if raw == nil {
  4482  			break
  4483  		}
  4484  		out = append(out, raw.(*structs.VaultAccessor))
  4485  	}
  4486  	return out, nil
  4487  }
  4488  
  4489  func indexEntry(table string, index uint64) *IndexEntry {
  4490  	return &IndexEntry{
  4491  		Key:   table,
  4492  		Value: index,
  4493  	}
  4494  }
  4495  
  4496  const siTokenAccessorTable = "si_token_accessors"
  4497  
  4498  // UpsertSITokenAccessors is used to register a set of Service Identity token accessors.
  4499  func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  4500  	txn := s.db.WriteTxn(index)
  4501  	defer txn.Abort()
  4502  
  4503  	for _, accessor := range accessors {
  4504  		// set the create index
  4505  		accessor.CreateIndex = index
  4506  
  4507  		// insert the accessor
  4508  		if err := txn.Insert(siTokenAccessorTable, accessor); err != nil {
  4509  			return fmt.Errorf("accessor insert failed: %w", err)
  4510  		}
  4511  	}
  4512  
  4513  	// update the index for this table
  4514  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  4515  		return fmt.Errorf("index update failed: %w", err)
  4516  	}
  4517  
  4518  	return txn.Commit()
  4519  }
  4520  
  4521  // DeleteSITokenAccessors is used to delete a set of Service Identity token accessors.
  4522  func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  4523  	txn := s.db.WriteTxn(index)
  4524  	defer txn.Abort()
  4525  
  4526  	// Lookup each accessor
  4527  	for _, accessor := range accessors {
  4528  		// Delete the accessor
  4529  		if err := txn.Delete(siTokenAccessorTable, accessor); err != nil {
  4530  			return fmt.Errorf("accessor delete failed: %w", err)
  4531  		}
  4532  	}
  4533  
  4534  	// update the index for this table
  4535  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  4536  		return fmt.Errorf("index update failed: %w", err)
  4537  	}
  4538  
  4539  	return txn.Commit()
  4540  }
  4541  
  4542  // SITokenAccessor returns the given Service Identity token accessor.
  4543  func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) {
  4544  	txn := s.db.ReadTxn()
  4545  	defer txn.Abort()
  4546  
  4547  	watchCh, existing, err := txn.FirstWatch(siTokenAccessorTable, "id", accessorID)
  4548  	if err != nil {
  4549  		return nil, fmt.Errorf("accessor lookup failed: %w", err)
  4550  	}
  4551  
  4552  	ws.Add(watchCh)
  4553  
  4554  	if existing != nil {
  4555  		return existing.(*structs.SITokenAccessor), nil
  4556  	}
  4557  
  4558  	return nil, nil
  4559  }
  4560  
  4561  // SITokenAccessors returns an iterator of Service Identity token accessors.
  4562  func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4563  	txn := s.db.ReadTxn()
  4564  	defer txn.Abort()
  4565  
  4566  	iter, err := txn.Get(siTokenAccessorTable, "id")
  4567  	if err != nil {
  4568  		return nil, err
  4569  	}
  4570  
  4571  	ws.Add(iter.WatchCh())
  4572  
  4573  	return iter, nil
  4574  }
  4575  
  4576  // SITokenAccessorsByAlloc returns all the Service Identity token accessors by alloc ID.
  4577  func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) {
  4578  	txn := s.db.ReadTxn()
  4579  	defer txn.Abort()
  4580  
  4581  	// Get an iterator over the accessors
  4582  	iter, err := txn.Get(siTokenAccessorTable, "alloc_id", allocID)
  4583  	if err != nil {
  4584  		return nil, err
  4585  	}
  4586  
  4587  	ws.Add(iter.WatchCh())
  4588  
  4589  	var result []*structs.SITokenAccessor
  4590  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  4591  		result = append(result, raw.(*structs.SITokenAccessor))
  4592  	}
  4593  
  4594  	return result, nil
  4595  }
  4596  
  4597  // SITokenAccessorsByNode returns all the Service Identity token accessors by node ID.
  4598  func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) {
  4599  	txn := s.db.ReadTxn()
  4600  	defer txn.Abort()
  4601  
  4602  	// Get an iterator over the accessors
  4603  	iter, err := txn.Get(siTokenAccessorTable, "node_id", nodeID)
  4604  	if err != nil {
  4605  		return nil, err
  4606  	}
  4607  
  4608  	ws.Add(iter.WatchCh())
  4609  
  4610  	var result []*structs.SITokenAccessor
  4611  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  4612  		result = append(result, raw.(*structs.SITokenAccessor))
  4613  	}
  4614  
  4615  	return result, nil
  4616  }
  4617  
  4618  // UpdateDeploymentStatus is used to make deployment status updates and
  4619  // potentially upsert an evaluation.
  4620  func (s *StateStore) UpdateDeploymentStatus(msgType structs.MessageType, index uint64, req *structs.DeploymentStatusUpdateRequest) error {
  4621  	txn := s.db.WriteTxnMsgT(msgType, index)
  4622  	defer txn.Abort()
  4623  
  4624  	if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  4625  		return err
  4626  	}
  4627  
  4628  	// Upsert the job if necessary
  4629  	if req.Job != nil {
  4630  		if err := s.upsertJobImpl(index, nil, req.Job, false, txn); err != nil {
  4631  			return err
  4632  		}
  4633  	}
  4634  
  4635  	// Upsert the optional eval
  4636  	if req.Eval != nil {
  4637  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4638  			return err
  4639  		}
  4640  	}
  4641  
  4642  	return txn.Commit()
  4643  }
  4644  
  4645  // updateDeploymentStatusImpl is used to make deployment status updates
  4646  func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *txn) error {
  4647  	// Retrieve deployment
  4648  	ws := memdb.NewWatchSet()
  4649  	deployment, err := s.deploymentByIDImpl(ws, u.DeploymentID, txn)
  4650  	if err != nil {
  4651  		return err
  4652  	} else if deployment == nil {
  4653  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID)
  4654  	} else if !deployment.Active() {
  4655  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4656  	}
  4657  
  4658  	// Apply the new status
  4659  	copy := deployment.Copy()
  4660  	copy.Status = u.Status
  4661  	copy.StatusDescription = u.StatusDescription
  4662  	copy.ModifyIndex = index
  4663  
  4664  	// Insert the deployment
  4665  	if err := txn.Insert("deployment", copy); err != nil {
  4666  		return err
  4667  	}
  4668  
  4669  	// Update the index
  4670  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
  4671  		return fmt.Errorf("index update failed: %v", err)
  4672  	}
  4673  
  4674  	// If the deployment is being marked as complete, set the job to stable.
  4675  	if copy.Status == structs.DeploymentStatusSuccessful {
  4676  		if err := s.updateJobStabilityImpl(index, copy.Namespace, copy.JobID, copy.JobVersion, true, txn); err != nil {
  4677  			return fmt.Errorf("failed to update job stability: %v", err)
  4678  		}
  4679  	}
  4680  
  4681  	return nil
  4682  }
  4683  
  4684  // UpdateJobStability updates the stability of the given job and version to the
  4685  // desired status.
  4686  func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error {
  4687  	txn := s.db.WriteTxn(index)
  4688  	defer txn.Abort()
  4689  
  4690  	if err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, txn); err != nil {
  4691  		return err
  4692  	}
  4693  
  4694  	return txn.Commit()
  4695  }
  4696  
  4697  // updateJobStabilityImpl updates the stability of the given job and version
  4698  func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *txn) error {
  4699  	// Get the job that is referenced
  4700  	job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn)
  4701  	if err != nil {
  4702  		return err
  4703  	}
  4704  
  4705  	// Has already been cleared, nothing to do
  4706  	if job == nil {
  4707  		return nil
  4708  	}
  4709  
  4710  	// If the job already has the desired stability, nothing to do
  4711  	if job.Stable == stable {
  4712  		return nil
  4713  	}
  4714  
  4715  	copy := job.Copy()
  4716  	copy.Stable = stable
  4717  	return s.upsertJobImpl(index, nil, copy, true, txn)
  4718  }
  4719  
  4720  // UpdateDeploymentPromotion is used to promote canaries in a deployment and
  4721  // potentially create an evaluation
  4722  func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentPromoteRequest) error {
  4723  	txn := s.db.WriteTxnMsgT(msgType, index)
  4724  	defer txn.Abort()
  4725  
  4726  	// Retrieve deployment and ensure it is not terminal and is active
  4727  	ws := memdb.NewWatchSet()
  4728  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  4729  	if err != nil {
  4730  		return err
  4731  	} else if deployment == nil {
  4732  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  4733  	} else if !deployment.Active() {
  4734  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4735  	}
  4736  
  4737  	// Retrieve affected allocations
  4738  	iter, err := txn.Get("allocs", "deployment", req.DeploymentID)
  4739  	if err != nil {
  4740  		return err
  4741  	}
  4742  
  4743  	// groupIndex is a map of groups being promoted
  4744  	groupIndex := make(map[string]struct{}, len(req.Groups))
  4745  	for _, g := range req.Groups {
  4746  		groupIndex[g] = struct{}{}
  4747  	}
  4748  
  4749  	// canaryIndex is the set of placed canaries in the deployment
  4750  	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
  4751  	for _, dstate := range deployment.TaskGroups {
  4752  		for _, c := range dstate.PlacedCanaries {
  4753  			canaryIndex[c] = struct{}{}
  4754  		}
  4755  	}
  4756  
  4757  	// healthyCounts is a mapping of group to the number of healthy canaries
  4758  	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
  4759  
  4760  	// promotable is the set of allocations that we can move from canary to
  4761  	// non-canary
  4762  	var promotable []*structs.Allocation
  4763  
  4764  	for {
  4765  		raw := iter.Next()
  4766  		if raw == nil {
  4767  			break
  4768  		}
  4769  
  4770  		alloc := raw.(*structs.Allocation)
  4771  
  4772  		// Check that the alloc is a canary
  4773  		if _, ok := canaryIndex[alloc.ID]; !ok {
  4774  			continue
  4775  		}
  4776  
  4777  		// Check that the canary is part of a group being promoted
  4778  		if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok {
  4779  			continue
  4780  		}
  4781  
  4782  		// Ensure the canaries are healthy
  4783  		if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() {
  4784  			continue
  4785  		}
  4786  
  4787  		healthyCounts[alloc.TaskGroup]++
  4788  		promotable = append(promotable, alloc)
  4789  	}
  4790  
  4791  	// Determine if we have enough healthy allocations
  4792  	var unhealthyErr multierror.Error
  4793  	for tg, dstate := range deployment.TaskGroups {
  4794  		if _, ok := groupIndex[tg]; !req.All && !ok {
  4795  			continue
  4796  		}
  4797  
  4798  		need := dstate.DesiredCanaries
  4799  		if need == 0 {
  4800  			continue
  4801  		}
  4802  
  4803  		if have := healthyCounts[tg]; have < need {
  4804  			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
  4805  		}
  4806  	}
  4807  
  4808  	if err := unhealthyErr.ErrorOrNil(); err != nil {
  4809  		return err
  4810  	}
  4811  
  4812  	// Update deployment
  4813  	copy := deployment.Copy()
  4814  	copy.ModifyIndex = index
  4815  	for tg, status := range copy.TaskGroups {
  4816  		_, ok := groupIndex[tg]
  4817  		if !req.All && !ok {
  4818  			continue
  4819  		}
  4820  
  4821  		// reset the progress deadline
  4822  		if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
  4823  			status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
  4824  		}
  4825  		status.Promoted = true
  4826  	}
  4827  
  4828  	// If the deployment no longer needs promotion, update its status
  4829  	if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning {
  4830  		copy.StatusDescription = structs.DeploymentStatusDescriptionRunning
  4831  	}
  4832  
  4833  	// Insert the deployment
  4834  	if err := s.upsertDeploymentImpl(index, copy, txn); err != nil {
  4835  		return err
  4836  	}
  4837  
  4838  	// Upsert the optional eval
  4839  	if req.Eval != nil {
  4840  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4841  			return err
  4842  		}
  4843  	}
  4844  
  4845  	// For each promotable allocation remove the canary field
  4846  	for _, alloc := range promotable {
  4847  		promoted := alloc.Copy()
  4848  		promoted.DeploymentStatus.Canary = false
  4849  		promoted.DeploymentStatus.ModifyIndex = index
  4850  		promoted.ModifyIndex = index
  4851  		promoted.AllocModifyIndex = index
  4852  
  4853  		if err := txn.Insert("allocs", promoted); err != nil {
  4854  			return fmt.Errorf("alloc insert failed: %v", err)
  4855  		}
  4856  	}
  4857  
  4858  	// Update the alloc index
  4859  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4860  		return fmt.Errorf("index update failed: %v", err)
  4861  	}
  4862  
  4863  	return txn.Commit()
  4864  }
  4865  
  4866  // UpdateDeploymentAllocHealth is used to update the health of allocations as
  4867  // part of the deployment and potentially create an evaluation
  4868  func (s *StateStore) UpdateDeploymentAllocHealth(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error {
  4869  	txn := s.db.WriteTxnMsgT(msgType, index)
  4870  	defer txn.Abort()
  4871  
  4872  	// Retrieve deployment and ensure it is not terminal and is active
  4873  	ws := memdb.NewWatchSet()
  4874  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  4875  	if err != nil {
  4876  		return err
  4877  	} else if deployment == nil {
  4878  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  4879  	} else if !deployment.Active() {
  4880  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4881  	}
  4882  
  4883  	// Update the health status of each allocation
  4884  	if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 {
  4885  		setAllocHealth := func(id string, healthy bool, ts time.Time) error {
  4886  			existing, err := txn.First("allocs", "id", id)
  4887  			if err != nil {
  4888  				return fmt.Errorf("alloc %q lookup failed: %v", id, err)
  4889  			}
  4890  			if existing == nil {
  4891  				return fmt.Errorf("unknown alloc %q", id)
  4892  			}
  4893  
  4894  			old := existing.(*structs.Allocation)
  4895  			if old.DeploymentID != req.DeploymentID {
  4896  				return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID)
  4897  			}
  4898  
  4899  			// Set the health
  4900  			copy := old.Copy()
  4901  			if copy.DeploymentStatus == nil {
  4902  				copy.DeploymentStatus = &structs.AllocDeploymentStatus{}
  4903  			}
  4904  			copy.DeploymentStatus.Healthy = pointer.Of(healthy)
  4905  			copy.DeploymentStatus.Timestamp = ts
  4906  			copy.DeploymentStatus.ModifyIndex = index
  4907  			copy.ModifyIndex = index
  4908  
  4909  			if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil {
  4910  				return fmt.Errorf("error updating deployment: %v", err)
  4911  			}
  4912  
  4913  			if err := txn.Insert("allocs", copy); err != nil {
  4914  				return fmt.Errorf("alloc insert failed: %v", err)
  4915  			}
  4916  
  4917  			return nil
  4918  		}
  4919  
  4920  		for _, id := range req.HealthyAllocationIDs {
  4921  			if err := setAllocHealth(id, true, req.Timestamp); err != nil {
  4922  				return err
  4923  			}
  4924  		}
  4925  		for _, id := range req.UnhealthyAllocationIDs {
  4926  			if err := setAllocHealth(id, false, req.Timestamp); err != nil {
  4927  				return err
  4928  			}
  4929  		}
  4930  
  4931  		// Update the indexes
  4932  		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4933  			return fmt.Errorf("index update failed: %v", err)
  4934  		}
  4935  	}
  4936  
  4937  	// Update the deployment status as needed.
  4938  	if req.DeploymentUpdate != nil {
  4939  		if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  4940  			return err
  4941  		}
  4942  	}
  4943  
  4944  	// Upsert the job if necessary
  4945  	if req.Job != nil {
  4946  		if err := s.upsertJobImpl(index, nil, req.Job, false, txn); err != nil {
  4947  			return err
  4948  		}
  4949  	}
  4950  
  4951  	// Upsert the optional eval
  4952  	if req.Eval != nil {
  4953  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4954  			return err
  4955  		}
  4956  	}
  4957  
  4958  	return txn.Commit()
  4959  }
  4960  
  4961  // LatestIndex returns the greatest index value for all indexes.
  4962  func (s *StateStore) LatestIndex() (uint64, error) {
  4963  	indexes, err := s.Indexes()
  4964  	if err != nil {
  4965  		return 0, err
  4966  	}
  4967  
  4968  	var max uint64 = 0
  4969  	for {
  4970  		raw := indexes.Next()
  4971  		if raw == nil {
  4972  			break
  4973  		}
  4974  
  4975  		// Cast to an index entry
  4976  		idx := raw.(*IndexEntry)
  4977  
  4978  		// Determine the max
  4979  		if idx.Value > max {
  4980  			max = idx.Value
  4981  		}
  4982  	}
  4983  
  4984  	return max, nil
  4985  }
  4986  
  4987  // Index finds the matching index value
  4988  func (s *StateStore) Index(name string) (uint64, error) {
  4989  	txn := s.db.ReadTxn()
  4990  
  4991  	// Lookup the first matching index
  4992  	out, err := txn.First("index", "id", name)
  4993  	if err != nil {
  4994  		return 0, err
  4995  	}
  4996  	if out == nil {
  4997  		return 0, nil
  4998  	}
  4999  	return out.(*IndexEntry).Value, nil
  5000  }
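
        // Illustrative usage sketch (not part of the original source): using Index to
        // detect whether a table has been written since a previously observed Raft
        // index. The helper name tableChangedSince is hypothetical.
        func tableChangedSince(s *StateStore, table string, since uint64) (bool, error) {
        	idx, err := s.Index(table)
        	if err != nil {
        		return false, err
        	}
        	return idx > since, nil
        }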
  5001  
  5002  // Indexes returns an iterator over all the indexes
  5003  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  5004  	txn := s.db.ReadTxn()
  5005  
  5006  	// Walk the entire index table
  5007  	iter, err := txn.Get("index", "id")
  5008  	if err != nil {
  5009  		return nil, err
  5010  	}
  5011  	return iter, nil
  5012  }
  5013  
  5014  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  5015  // store
  5016  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  5017  	txn := s.db.WriteTxn(index)
  5018  	defer txn.Abort()
  5019  
  5020  	// Get all the jobs
  5021  	iter, err := txn.Get("jobs", "id")
  5022  	if err != nil {
  5023  		return err
  5024  	}
  5025  	// COMPAT: Remove after 0.11
  5026  	// Iterate over jobs to build a list of parent jobs and their children
  5027  	parentMap := make(map[string][]*structs.Job)
  5028  	for {
  5029  		rawJob := iter.Next()
  5030  		if rawJob == nil {
  5031  			break
  5032  		}
  5033  		job := rawJob.(*structs.Job)
  5034  		if job.ParentID != "" {
  5035  			children := parentMap[job.ParentID]
  5036  			children = append(children, job)
  5037  			parentMap[job.ParentID] = children
  5038  		}
  5039  	}
  5040  
  5041  	// Get all the jobs again
  5042  	iter, err = txn.Get("jobs", "id")
  5043  	if err != nil {
  5044  		return err
  5045  	}
  5046  
  5047  	for {
  5048  		rawJob := iter.Next()
  5049  		if rawJob == nil {
  5050  			break
  5051  		}
  5052  		job := rawJob.(*structs.Job)
  5053  
  5054  		if job.IsParameterized() || job.IsPeriodic() {
  5055  			// COMPAT: Remove after 0.11
  5056  
  5057  			// The following block of code fixes incorrect child summaries due to a bug
  5058  			// See https://github.com/hernad/nomad/issues/3886 for details
  5059  			rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  5060  			if err != nil {
  5061  				return err
  5062  			}
  5063  			if rawSummary == nil {
  5064  				continue
  5065  			}
  5066  
  5067  			oldSummary := rawSummary.(*structs.JobSummary)
  5068  
  5069  			// Create an empty summary
  5070  			summary := &structs.JobSummary{
  5071  				JobID:     job.ID,
  5072  				Namespace: job.Namespace,
  5073  				Summary:   make(map[string]structs.TaskGroupSummary),
  5074  				Children:  &structs.JobChildrenSummary{},
  5075  			}
  5076  
  5077  			// Iterate over children of this job if any to fix summary counts
  5078  			children := parentMap[job.ID]
  5079  			for _, childJob := range children {
  5080  				switch childJob.Status {
  5081  				case structs.JobStatusPending:
  5082  					summary.Children.Pending++
  5083  				case structs.JobStatusDead:
  5084  					summary.Children.Dead++
  5085  				case structs.JobStatusRunning:
  5086  					summary.Children.Running++
  5087  				}
  5088  			}
  5089  
  5090  			// Insert the job summary if it's different
  5091  			if !reflect.DeepEqual(summary, oldSummary) {
  5092  				// Set the create index of the summary same as the job's create index
  5093  				// and the modify index to the current index
  5094  				summary.CreateIndex = job.CreateIndex
  5095  				summary.ModifyIndex = index
  5096  
  5097  				if err := txn.Insert("job_summary", summary); err != nil {
  5098  					return fmt.Errorf("error inserting job summary: %v", err)
  5099  				}
  5100  			}
  5101  
  5102  			// Done with handling a parent job, continue to next
  5103  			continue
  5104  		}
  5105  
  5106  		// Create a job summary for the job
  5107  		summary := &structs.JobSummary{
  5108  			JobID:     job.ID,
  5109  			Namespace: job.Namespace,
  5110  			Summary:   make(map[string]structs.TaskGroupSummary),
  5111  		}
  5112  		for _, tg := range job.TaskGroups {
  5113  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  5114  		}
  5115  
  5116  		// Find all the allocations for the job
  5117  		iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  5118  		if err != nil {
  5119  			return err
  5120  		}
  5121  
  5122  		// Calculate the summary for the job
  5123  		for {
  5124  			rawAlloc := iterAllocs.Next()
  5125  			if rawAlloc == nil {
  5126  				break
  5127  			}
  5128  			alloc := rawAlloc.(*structs.Allocation)
  5129  
  5130  			// Ignore the allocation if it doesn't belong to the currently
  5131  			// registered job. The allocation is checked because of issue #2304
  5132  			if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex {
  5133  				continue
  5134  			}
  5135  
  5136  			tg := summary.Summary[alloc.TaskGroup]
  5137  			switch alloc.ClientStatus {
  5138  			case structs.AllocClientStatusFailed:
  5139  				tg.Failed += 1
  5140  			case structs.AllocClientStatusLost:
  5141  				tg.Lost += 1
  5142  			case structs.AllocClientStatusUnknown:
  5143  				tg.Unknown += 1
  5144  			case structs.AllocClientStatusComplete:
  5145  				tg.Complete += 1
  5146  			case structs.AllocClientStatusRunning:
  5147  				tg.Running += 1
  5148  			case structs.AllocClientStatusPending:
  5149  				tg.Starting += 1
  5150  			default:
  5151  				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
  5152  			}
  5153  			summary.Summary[alloc.TaskGroup] = tg
  5154  		}
  5155  
  5156  		// Set the create index of the summary same as the job's create index
  5157  		// and the modify index to the current index
  5158  		summary.CreateIndex = job.CreateIndex
  5159  		summary.ModifyIndex = index
  5160  
  5161  		// Insert the job summary
  5162  		if err := txn.Insert("job_summary", summary); err != nil {
  5163  			return fmt.Errorf("error inserting job summary: %v", err)
  5164  		}
  5165  	}
  5166  
  5167  	// Update the indexes table for job summary
  5168  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5169  		return fmt.Errorf("index update failed: %v", err)
  5170  	}
  5171  	return txn.Commit()
  5172  }
  5173  
  5174  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
  5175  // It takes a map of job IDs to an optional forceStatus string. Jobs that no
  5176  // longer exist are skipped; an error is returned if setJobStatus fails.
  5177  func (s *StateStore) setJobStatuses(index uint64, txn *txn,
  5178  	jobs map[structs.NamespacedID]string, evalDelete bool) error {
  5179  	for tuple, forceStatus := range jobs {
  5180  
  5181  		existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID)
  5182  		if err != nil {
  5183  			return fmt.Errorf("job lookup failed: %v", err)
  5184  		}
  5185  
  5186  		if existing == nil {
  5187  			continue
  5188  		}
  5189  
  5190  		if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  5191  			return err
  5192  		}
  5193  
  5194  	}
  5195  
  5196  	return nil
  5197  }
  5198  
  5199  // setJobStatus sets the status of the job by looking up associated evaluations
  5200  // and allocations. evalDelete should be set to true if setJobStatus is being
  5201  // called because an evaluation is being deleted (potentially because of garbage
  5202  // collection). If forceStatus is non-empty, the job's status will be set to the
  5203  // passed status.
  5204  func (s *StateStore) setJobStatus(index uint64, txn *txn,
  5205  	job *structs.Job, evalDelete bool, forceStatus string) error {
  5206  
  5207  	// Capture the current status so we can check if there is a change
  5208  	oldStatus := job.Status
  5209  	newStatus := forceStatus
  5210  
  5211  	// If forceStatus is not set, compute the job's status.
  5212  	if forceStatus == "" {
  5213  		var err error
  5214  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  5215  		if err != nil {
  5216  			return err
  5217  		}
  5218  	}
  5219  
  5220  	// Fast-path if the job has not changed.
  5221  	if oldStatus == newStatus {
  5222  		return nil
  5223  	}
  5224  
  5225  	// Copy and update the existing job
  5226  	updated := job.Copy()
  5227  	updated.Status = newStatus
  5228  	updated.ModifyIndex = index
  5229  
  5230  	// Insert the job
  5231  	if err := txn.Insert("jobs", updated); err != nil {
  5232  		return fmt.Errorf("job insert failed: %v", err)
  5233  	}
  5234  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  5235  		return fmt.Errorf("index update failed: %v", err)
  5236  	}
  5237  
  5238  	// Update the children summary
  5239  	if err := s.setJobSummary(txn, updated, index, oldStatus, newStatus); err != nil {
  5240  		return fmt.Errorf("job summary update failed %w", err)
  5241  	}
  5242  	return nil
  5243  }
  5244  
  5245  func (s *StateStore) setJobSummary(txn *txn, updated *structs.Job, index uint64, oldStatus, newStatus string) error {
  5246  	if updated.ParentID == "" {
  5247  		return nil
  5248  	}
  5249  
  5250  	// Try to update the summary of the parent job
  5251  	summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID)
  5252  	if err != nil {
  5253  		return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  5254  	}
  5255  
  5256  	// Only continue if the summary exists. It may not exist if the parent
  5257  	// job was removed
  5258  	if summaryRaw != nil {
  5259  		existing := summaryRaw.(*structs.JobSummary)
  5260  		pSummary := existing.Copy()
  5261  		if pSummary.Children == nil {
  5262  			pSummary.Children = new(structs.JobChildrenSummary)
  5263  		}
  5264  
  5265  		// Determine the transition and update the correct fields
  5266  		children := pSummary.Children
  5267  
  5268  		// Decrement old status
  5269  		if oldStatus != "" {
  5270  			switch oldStatus {
  5271  			case structs.JobStatusPending:
  5272  				children.Pending--
  5273  			case structs.JobStatusRunning:
  5274  				children.Running--
  5275  			case structs.JobStatusDead:
  5276  				children.Dead--
  5277  			default:
  5278  				return fmt.Errorf("unknown old job status %q", oldStatus)
  5279  			}
  5280  		}
  5281  
  5282  		// Increment new status
  5283  		switch newStatus {
  5284  		case structs.JobStatusPending:
  5285  			children.Pending++
  5286  		case structs.JobStatusRunning:
  5287  			children.Running++
  5288  		case structs.JobStatusDead:
  5289  			children.Dead++
  5290  		default:
  5291  			return fmt.Errorf("unknown new job status %q", newStatus)
  5292  		}
  5293  
  5294  		// Update the index
  5295  		pSummary.ModifyIndex = index
  5296  
  5297  		// Insert the summary
  5298  		if err := txn.Insert("job_summary", pSummary); err != nil {
  5299  			return fmt.Errorf("job summary insert failed: %v", err)
  5300  		}
  5301  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5302  			return fmt.Errorf("index update failed: %v", err)
  5303  		}
  5304  	}
  5305  	return nil
  5306  }
  5307  
  5308  func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) {
  5309  	// System, Periodic and Parameterized jobs are running until explicitly
  5310  	// stopped.
  5311  	if job.Type == structs.JobTypeSystem ||
  5312  		job.IsParameterized() ||
  5313  		job.IsPeriodic() {
  5314  		if job.Stop {
  5315  			return structs.JobStatusDead, nil
  5316  		}
  5317  		return structs.JobStatusRunning, nil
  5318  	}
  5319  
  5320  	allocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  5321  	if err != nil {
  5322  		return "", err
  5323  	}
  5324  
  5325  	// If there is a non-terminal allocation, the job is running.
  5326  	hasAlloc := false
  5327  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  5328  		hasAlloc = true
  5329  		if !alloc.(*structs.Allocation).TerminalStatus() {
  5330  			return structs.JobStatusRunning, nil
  5331  		}
  5332  	}
  5333  
  5334  	evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID)
  5335  	if err != nil {
  5336  		return "", err
  5337  	}
  5338  
  5339  	hasEval := false
  5340  	for raw := evals.Next(); raw != nil; raw = evals.Next() {
  5341  		e := raw.(*structs.Evaluation)
  5342  
  5343  		// Filter non-exact matches
  5344  		if e.JobID != job.ID {
  5345  			continue
  5346  		}
  5347  
  5348  		hasEval = true
  5349  		if !e.TerminalStatus() {
  5350  			return structs.JobStatusPending, nil
  5351  		}
  5352  	}
  5353  
  5354  	// The job is dead if all the allocations and evals are terminal or if there
  5355  	// are no evals because of garbage collection.
  5356  	if evalDelete || hasEval || hasAlloc {
  5357  		return structs.JobStatusDead, nil
  5358  	}
  5359  
  5360  	return structs.JobStatusPending, nil
  5361  }
  5362  
  5363  // updateSummaryWithJob creates or updates job summaries when new jobs are
  5364  // upserted or existing ones are updated
  5365  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  5366  	txn *txn) error {
  5367  
  5368  	// Update the job summary
  5369  	summaryRaw, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  5370  	if err != nil {
  5371  		return fmt.Errorf("job summary lookup failed: %v", err)
  5372  	}
  5373  
  5374  	// Get the summary or create if necessary
  5375  	var summary *structs.JobSummary
  5376  	hasSummaryChanged := false
  5377  	if summaryRaw != nil {
  5378  		summary = summaryRaw.(*structs.JobSummary).Copy()
  5379  	} else {
  5380  		summary = &structs.JobSummary{
  5381  			JobID:       job.ID,
  5382  			Namespace:   job.Namespace,
  5383  			Summary:     make(map[string]structs.TaskGroupSummary),
  5384  			Children:    new(structs.JobChildrenSummary),
  5385  			CreateIndex: index,
  5386  		}
  5387  		hasSummaryChanged = true
  5388  	}
  5389  
  5390  	for _, tg := range job.TaskGroups {
  5391  		if _, ok := summary.Summary[tg.Name]; !ok {
  5392  			newSummary := structs.TaskGroupSummary{
  5393  				Complete: 0,
  5394  				Failed:   0,
  5395  				Running:  0,
  5396  				Starting: 0,
  5397  			}
  5398  			summary.Summary[tg.Name] = newSummary
  5399  			hasSummaryChanged = true
  5400  		}
  5401  	}
  5402  
  5403  	// The job summary has changed, so update the modify index.
  5404  	if hasSummaryChanged {
  5405  		summary.ModifyIndex = index
  5406  
  5407  		// Update the indexes table for job summary
  5408  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5409  			return fmt.Errorf("index update failed: %v", err)
  5410  		}
  5411  		if err := txn.Insert("job_summary", summary); err != nil {
  5412  			return err
  5413  		}
  5414  	}
  5415  
  5416  	return nil
  5417  }
  5418  
  5419  // updateJobScalingPolicies upserts any scaling policies contained in the job and removes
  5420  // any previous scaling policies that were removed from the job
  5421  func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {
  5422  
  5423  	ws := memdb.NewWatchSet()
  5424  
  5425  	scalingPolicies := job.GetScalingPolicies()
  5426  	newTargets := map[string]bool{}
  5427  	for _, p := range scalingPolicies {
  5428  		newTargets[p.JobKey()] = true
  5429  	}
  5430  	// find existing policies that need to be deleted
  5431  	deletedPolicies := []string{}
  5432  	iter, err := s.ScalingPoliciesByJobTxn(ws, job.Namespace, job.ID, txn)
  5433  	if err != nil {
  5434  		return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err)
  5435  	}
  5436  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  5437  		oldPolicy := raw.(*structs.ScalingPolicy)
  5438  		if !newTargets[oldPolicy.JobKey()] {
  5439  			deletedPolicies = append(deletedPolicies, oldPolicy.ID)
  5440  		}
  5441  	}
  5442  	err = s.DeleteScalingPoliciesTxn(index, deletedPolicies, txn)
  5443  	if err != nil {
  5444  		return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err)
  5445  	}
  5446  
  5447  	err = s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn)
  5448  	if err != nil {
  5449  		return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err)
  5450  	}
  5451  
  5452  	return nil
  5453  }
  5454  
  5455  // updateJobSubmission stores the original job source and variables from which
  5456  // the job structure originates. It is up to the job submitter to include the
  5457  // source material, and as such sub may be nil, in which case nothing is stored.
  5458  func (s *StateStore) updateJobSubmission(index uint64, sub *structs.JobSubmission, namespace, jobID string, version uint64, txn *txn) error {
  5459  	// critical that we operate on a copy; the original must not be modified
  5460  	// e.g. in the case of job gc and its last second version bump
  5461  	sub = sub.Copy()
  5462  
  5463  	switch {
  5464  	case sub == nil:
  5465  		return nil
  5466  	case namespace == "":
  5467  		return errors.New("job_submission requires a namespace")
  5468  	case jobID == "":
  5469  		return errors.New("job_submission requires a jobID")
  5470  	default:
  5471  		sub.Namespace = namespace
  5472  		sub.JobID = jobID
  5473  		sub.JobModifyIndex = index
  5474  		sub.Version = version
  5475  	}
  5476  
  5477  	// check if we already have a submission for this (namespace, jobID, version)
  5478  	obj, err := txn.First("job_submission", "id", namespace, jobID, version)
  5479  	if err != nil {
  5480  		return err
  5481  	}
  5482  	if obj != nil {
  5483  		// if we already have a submission for this (namespace, jobID, version)
  5484  		// then there is nothing to do; manually avoid potential for duplicates
  5485  		return nil
  5486  	}
  5487  
  5488  	// insert the job submission for this (namespace, jobID, version)
  5489  	if err := txn.Insert("job_submission", sub); err != nil {
  5490  		return err
  5491  	}
  5492  
  5493  	// prune old job submissions
  5494  	return s.pruneJobSubmissions(namespace, jobID, txn)
  5495  }
  5496  
  5497  func (s *StateStore) pruneJobSubmissions(namespace, jobID string, txn *txn) error {
  5498  	// although the number of tracked submissions is the same as the number of
  5499  	// tracked job versions, do not assume a 1:1 correlation, as there could be
  5500  	// holes in the submissions (or none at all)
  5501  	limit := structs.JobTrackedVersions
  5502  
  5503  	// iterate through all stored submissions
  5504  	iter, err := txn.Get("job_submission", "id_prefix", namespace, jobID)
  5505  	if err != nil {
  5506  		return err
  5507  	}
  5508  
  5509  	stored := make([]lang.Pair[uint64, uint64], 0, limit+1)
  5510  	for next := iter.Next(); next != nil; next = iter.Next() {
  5511  		sub := next.(*structs.JobSubmission)
  5512  		// scanning by prefix; make sure we collect exact matches only
  5513  		if sub.Namespace == namespace && sub.JobID == jobID {
  5514  			stored = append(stored, lang.Pair[uint64, uint64]{First: sub.JobModifyIndex, Second: sub.Version})
  5515  		}
  5516  	}
  5517  
  5518  	// if we are still below the limit, nothing to do
  5519  	if len(stored) <= limit {
  5520  		return nil
  5521  	}
  5522  
  5523  	// sort by job modify index descending so we can just keep the first N
  5524  	slices.SortFunc(stored, func(a, b lang.Pair[uint64, uint64]) bool {
  5525  		return a.First > b.First
  5526  	})
  5527  
  5528  	// remove the outdated submission versions
  5529  	for _, sub := range stored[limit:] {
  5530  		if err = txn.Delete("job_submission", &structs.JobSubmission{
  5531  			Namespace: namespace,
  5532  			JobID:     jobID,
  5533  			Version:   sub.Second,
  5534  		}); err != nil {
  5535  			return err
  5536  		}
  5537  	}
  5538  	return nil
  5539  }
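
        // Illustrative sketch (not part of the original source): the pruning rule above
        // keeps only the newest structs.JobTrackedVersions submissions, ranked by job
        // modify index. This standalone helper (hypothetical name) shows the same
        // "sort descending, keep the first N" selection on a plain slice of pairs.
        func keepNewestSubmissions(stored []lang.Pair[uint64, uint64], limit int) (keep, drop []lang.Pair[uint64, uint64]) {
        	slices.SortFunc(stored, func(a, b lang.Pair[uint64, uint64]) bool {
        		return a.First > b.First
        	})
        	if len(stored) <= limit {
        		return stored, nil
        	}
        	return stored[:limit], stored[limit:]
        }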
  5540  
  5541  // updateJobCSIPlugins runs on job update, and indexes the job in the CSI plugins it references
  5542  func (s *StateStore) updateJobCSIPlugins(index uint64, job, prev *structs.Job, txn *txn) error {
  5543  	plugIns := make(map[string]*structs.CSIPlugin)
  5544  
  5545  	upsertFn := func(job *structs.Job, delete bool) error {
  5546  		for _, tg := range job.TaskGroups {
  5547  			for _, t := range tg.Tasks {
  5548  				if t.CSIPluginConfig == nil {
  5549  					continue
  5550  				}
  5551  
  5552  				plugIn, ok := plugIns[t.CSIPluginConfig.ID]
  5553  				if !ok {
  5554  					p, err := s.CSIPluginByIDTxn(txn, nil, t.CSIPluginConfig.ID)
  5555  					if err != nil {
  5556  						return err
  5557  					}
  5558  					if p == nil {
  5559  						plugIn = structs.NewCSIPlugin(t.CSIPluginConfig.ID, index)
  5560  					} else {
  5561  						plugIn = p.Copy()
  5562  						plugIn.ModifyIndex = index
  5563  					}
  5564  					plugIns[plugIn.ID] = plugIn
  5565  				}
  5566  
  5567  				if delete {
  5568  					plugIn.DeleteJob(job, nil)
  5569  				} else {
  5570  					plugIn.AddJob(job, nil)
  5571  				}
  5572  			}
  5573  		}
  5574  
  5575  		return nil
  5576  	}
  5577  
  5578  	if prev != nil {
  5579  		err := upsertFn(prev, true)
  5580  		if err != nil {
  5581  			return err
  5582  		}
  5583  	}
  5584  
  5585  	err := upsertFn(job, false)
  5586  	if err != nil {
  5587  		return err
  5588  	}
  5589  
  5590  	for _, plugIn := range plugIns {
  5591  		err = txn.Insert("csi_plugins", plugIn)
  5592  		if err != nil {
  5593  			return fmt.Errorf("csi_plugins insert error: %v", err)
  5594  		}
  5595  	}
  5596  
  5597  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  5598  		return fmt.Errorf("index update failed: %v", err)
  5599  	}
  5600  
  5601  	return nil
  5602  }
  5603  
  5604  // updateDeploymentWithAlloc is used to update the deployment state associated
  5605  // with the given allocation. The passed alloc may be updated if the deployment
  5606  // status has changed to capture the modify index at which it has changed.
  5607  func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *txn) error {
  5608  	// Nothing to do if the allocation is not associated with a deployment
  5609  	if alloc.DeploymentID == "" {
  5610  		return nil
  5611  	}
  5612  
  5613  	// Get the deployment
  5614  	ws := memdb.NewWatchSet()
  5615  	deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn)
  5616  	if err != nil {
  5617  		return err
  5618  	}
  5619  	if deployment == nil {
  5620  		return nil
  5621  	}
  5622  
  5623  	// Retrieve the deployment state object
  5624  	_, ok := deployment.TaskGroups[alloc.TaskGroup]
  5625  	if !ok {
  5626  		// If the task group isn't part of the deployment, the task group wasn't
  5627  		// part of a rolling update so nothing to do
  5628  		return nil
  5629  	}
  5630  
  5631  	// Do not modify in-place. Instead keep track of what must be done
  5632  	placed := 0
  5633  	healthy := 0
  5634  	unhealthy := 0
  5635  
  5636  	// If there was no existing allocation, this is a placement and we increment
  5637  	// the placed count
  5638  	existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth()
  5639  	allocHealthSet := alloc.DeploymentStatus.HasHealth()
  5640  	if existing == nil || existing.DeploymentID != alloc.DeploymentID {
  5641  		placed++
  5642  	} else if !existingHealthSet && allocHealthSet {
  5643  		if *alloc.DeploymentStatus.Healthy {
  5644  			healthy++
  5645  		} else {
  5646  			unhealthy++
  5647  		}
  5648  	} else if existingHealthSet && allocHealthSet {
  5649  		// See if it has gone from healthy to unhealthy
  5650  		if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy {
  5651  			healthy--
  5652  			unhealthy++
  5653  		}
  5654  	}
  5655  
  5656  	// Nothing to do
  5657  	if placed == 0 && healthy == 0 && unhealthy == 0 {
  5658  		return nil
  5659  	}
  5660  
  5661  	// Update the allocation's deployment status modify index
  5662  	if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 {
  5663  		alloc.DeploymentStatus.ModifyIndex = index
  5664  	}
  5665  
  5666  	// Create a copy of the deployment object
  5667  	deploymentCopy := deployment.Copy()
  5668  	deploymentCopy.ModifyIndex = index
  5669  
  5670  	dstate := deploymentCopy.TaskGroups[alloc.TaskGroup]
  5671  	dstate.PlacedAllocs += placed
  5672  	dstate.HealthyAllocs += healthy
  5673  	dstate.UnhealthyAllocs += unhealthy
  5674  
  5675  	// Ensure PlacedCanaries accurately reflects the alloc canary status
  5676  	if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
  5677  		found := false
  5678  		for _, canary := range dstate.PlacedCanaries {
  5679  			if alloc.ID == canary {
  5680  				found = true
  5681  				break
  5682  			}
  5683  		}
  5684  		if !found {
  5685  			dstate.PlacedCanaries = append(dstate.PlacedCanaries, alloc.ID)
  5686  		}
  5687  	}
  5688  
  5689  	// Update the progress deadline
  5690  	if pd := dstate.ProgressDeadline; pd != 0 {
  5691  		// If we are the first placed allocation for the deployment start the progress deadline.
  5692  		if placed != 0 && dstate.RequireProgressBy.IsZero() {
  5693  			// Use modify time instead of create time because we may in-place
  5694  			// update the allocation to be part of a new deployment.
  5695  			dstate.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd)
  5696  		} else if healthy != 0 {
  5697  			if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(dstate.RequireProgressBy) {
  5698  				dstate.RequireProgressBy = d
  5699  			}
  5700  		}
  5701  	}
  5702  
  5703  	// Upsert the deployment
  5704  	if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil {
  5705  		return err
  5706  	}
  5707  
  5708  	return nil
  5709  }
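
        // Illustrative sketch (not part of the original source): the progress-deadline
        // arithmetic used above, in isolation. A healthy allocation pushes the
        // deployment's RequireProgressBy forward to its health timestamp plus the
        // configured deadline, but never backwards. The helper name
        // nextProgressDeadline is hypothetical.
        func nextProgressDeadline(current, eventTime time.Time, deadline time.Duration) time.Time {
        	if deadline == 0 {
        		return current
        	}
        	if d := eventTime.Add(deadline); d.After(current) {
        		return d
        	}
        	return current
        }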
  5710  
  5711  // updateSummaryWithAlloc updates the job summary when allocations are updated
  5712  // or inserted
  5713  func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
  5714  	existingAlloc *structs.Allocation, txn *txn) error {
  5715  
  5716  	// We don't have to update the summary if the job is missing
  5717  	if alloc.Job == nil {
  5718  		return nil
  5719  	}
  5720  
  5721  	summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID)
  5722  	if err != nil {
  5723  		return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err)
  5724  	}
  5725  
  5726  	if summaryRaw == nil {
  5727  		// Check if the job is de-registered
  5728  		rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID)
  5729  		if err != nil {
  5730  			return fmt.Errorf("unable to query job: %v", err)
  5731  		}
  5732  
  5733  		// If the job is de-registered then we skip updating its summary
  5734  		if rawJob == nil {
  5735  			return nil
  5736  		}
  5737  
  5738  		return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace)
  5739  	}
  5740  
  5741  	// Get a copy of the existing summary
  5742  	jobSummary := summaryRaw.(*structs.JobSummary).Copy()
  5743  
  5744  	// Not updating the job summary because the allocation doesn't belong to the
  5745  	// currently registered job
  5746  	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
  5747  		return nil
  5748  	}
  5749  
  5750  	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
  5751  	if !ok {
  5752  		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
  5753  	}
  5754  
  5755  	summaryChanged := false
  5756  	if existingAlloc == nil {
  5757  		switch alloc.DesiredStatus {
  5758  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
  5759  			s.logger.Error("new allocation inserted into state store with bad desired status",
  5760  				"alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
  5761  		}
  5762  		switch alloc.ClientStatus {
  5763  		case structs.AllocClientStatusPending:
  5764  			tgSummary.Starting += 1
  5765  			if tgSummary.Queued > 0 {
  5766  				tgSummary.Queued -= 1
  5767  			}
  5768  			summaryChanged = true
  5769  		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
  5770  			structs.AllocClientStatusComplete:
  5771  			s.logger.Error("new allocation inserted into state store with bad client status",
  5772  				"alloc_id", alloc.ID, "client_status", alloc.ClientStatus)
  5773  		}
  5774  	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
  5775  		// Increment the count of the bin for the current client state
  5776  		switch alloc.ClientStatus {
  5777  		case structs.AllocClientStatusRunning:
  5778  			tgSummary.Running += 1
  5779  		case structs.AllocClientStatusFailed:
  5780  			tgSummary.Failed += 1
  5781  		case structs.AllocClientStatusPending:
  5782  			tgSummary.Starting += 1
  5783  		case structs.AllocClientStatusComplete:
  5784  			tgSummary.Complete += 1
  5785  		case structs.AllocClientStatusLost:
  5786  			tgSummary.Lost += 1
  5787  		case structs.AllocClientStatusUnknown:
  5788  			tgSummary.Unknown += 1
  5789  		}
  5790  
  5791  		// Decrement the count of the bin for the previous client state
  5792  		switch existingAlloc.ClientStatus {
  5793  		case structs.AllocClientStatusRunning:
  5794  			if tgSummary.Running > 0 {
  5795  				tgSummary.Running -= 1
  5796  			}
  5797  		case structs.AllocClientStatusPending:
  5798  			if tgSummary.Starting > 0 {
  5799  				tgSummary.Starting -= 1
  5800  			}
  5801  		case structs.AllocClientStatusLost:
  5802  			if tgSummary.Lost > 0 {
  5803  				tgSummary.Lost -= 1
  5804  			}
  5805  		case structs.AllocClientStatusUnknown:
  5806  			if tgSummary.Unknown > 0 {
  5807  				tgSummary.Unknown -= 1
  5808  			}
  5809  		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
  5810  		default:
  5811  			s.logger.Error("invalid old client status for allocation",
  5812  				"alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus)
  5813  		}
  5814  		summaryChanged = true
  5815  	}
  5816  	jobSummary.Summary[alloc.TaskGroup] = tgSummary
  5817  
  5818  	if summaryChanged {
  5819  		jobSummary.ModifyIndex = index
  5820  
  5821  		if err := s.updatePluginWithJobSummary(index, jobSummary, alloc, txn); err != nil {
        			return fmt.Errorf("plugin update with job summary failed: %v", err)
        		}
  5822  
  5823  		// Update the indexes table for job summary
  5824  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5825  			return fmt.Errorf("index update failed: %v", err)
  5826  		}
  5827  
  5828  		if err := txn.Insert("job_summary", jobSummary); err != nil {
  5829  			return fmt.Errorf("updating job summary failed: %v", err)
  5830  		}
  5831  	}
  5832  
  5833  	return nil
  5834  }
  5835  
  5836  // updatePluginForTerminalAlloc updates the CSI plugins for an alloc when the
  5837  // allocation is updated or inserted with a terminal server status.
  5838  func (s *StateStore) updatePluginForTerminalAlloc(index uint64, alloc *structs.Allocation,
  5839  	txn *txn) error {
  5840  
  5841  	if !alloc.ServerTerminalStatus() {
  5842  		return nil
  5843  	}
  5844  
  5845  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  5846  	for _, t := range tg.Tasks {
  5847  		if t.CSIPluginConfig != nil {
  5848  			pluginID := t.CSIPluginConfig.ID
  5849  			plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID)
  5850  			if err != nil {
  5851  				return err
  5852  			}
  5853  			if plug == nil {
  5854  				// plugin may not have been created because it never
  5855  				// became healthy, just move on
  5856  				return nil
  5857  			}
  5858  			plug = plug.Copy()
  5859  			err = plug.DeleteAlloc(alloc.ID, alloc.NodeID)
  5860  			if err != nil {
  5861  				return err
  5862  			}
  5863  			err = updateOrGCPlugin(index, txn, plug)
  5864  			if err != nil {
  5865  				return err
  5866  			}
  5867  		}
  5868  	}
  5869  
  5870  	return nil
  5871  }
  5872  
  5873  // updatePluginWithJobSummary updates the CSI plugins for a job when the
  5874  // job summary is updated by an alloc
  5875  func (s *StateStore) updatePluginWithJobSummary(index uint64, summary *structs.JobSummary, alloc *structs.Allocation,
  5876  	txn *txn) error {
  5877  
  5878  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  5879  	if tg == nil {
  5880  		return nil
  5881  	}
  5882  
  5883  	for _, t := range tg.Tasks {
  5884  		if t.CSIPluginConfig != nil {
  5885  			pluginID := t.CSIPluginConfig.ID
  5886  			plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID)
  5887  			if err != nil {
  5888  				return err
  5889  			}
  5890  			if plug == nil {
  5891  				plug = structs.NewCSIPlugin(pluginID, index)
  5892  			} else {
  5893  				plug = plug.Copy()
  5894  			}
  5895  
  5896  			plug.UpdateExpectedWithJob(alloc.Job, summary,
  5897  				alloc.Job.Status == structs.JobStatusDead)
  5898  
  5899  			err = updateOrGCPlugin(index, txn, plug)
  5900  			if err != nil {
  5901  				return err
  5902  			}
  5903  		}
  5904  	}
  5905  
  5906  	return nil
  5907  }
  5908  
  5909  // UpsertACLPolicies is used to create or update a set of ACL policies
  5910  func (s *StateStore) UpsertACLPolicies(msgType structs.MessageType, index uint64, policies []*structs.ACLPolicy) error {
  5911  	txn := s.db.WriteTxnMsgT(msgType, index)
  5912  	defer txn.Abort()
  5913  
  5914  	for _, policy := range policies {
  5915  		// Ensure the policy hash is non-nil. This should be done outside the state store
  5916  		// for performance reasons, but we check here for defense in depth.
  5917  		if len(policy.Hash) == 0 {
  5918  			policy.SetHash()
  5919  		}
  5920  
  5921  		// Check if the policy already exists
  5922  		existing, err := txn.First("acl_policy", "id", policy.Name)
  5923  		if err != nil {
  5924  			return fmt.Errorf("policy lookup failed: %v", err)
  5925  		}
  5926  
  5927  		// Update all the indexes
  5928  		if existing != nil {
  5929  			policy.CreateIndex = existing.(*structs.ACLPolicy).CreateIndex
  5930  			policy.ModifyIndex = index
  5931  		} else {
  5932  			policy.CreateIndex = index
  5933  			policy.ModifyIndex = index
  5934  		}
  5935  
  5936  		// Update the policy
  5937  		if err := txn.Insert("acl_policy", policy); err != nil {
  5938  			return fmt.Errorf("upserting policy failed: %v", err)
  5939  		}
  5940  	}
  5941  
  5942  	// Update the indexes table
  5943  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  5944  		return fmt.Errorf("index update failed: %v", err)
  5945  	}
  5946  
  5947  	return txn.Commit()
  5948  }
  5949  
  5950  // DeleteACLPolicies deletes the policies with the given names
  5951  func (s *StateStore) DeleteACLPolicies(msgType structs.MessageType, index uint64, names []string) error {
  5952  	txn := s.db.WriteTxnMsgT(msgType, index)
  5953  	defer txn.Abort()
  5954  
  5955  	// Delete the policies
  5956  	for _, name := range names {
  5957  		if _, err := txn.DeleteAll("acl_policy", "id", name); err != nil {
  5958  			return fmt.Errorf("deleting acl policy failed: %v", err)
  5959  		}
  5960  	}
  5961  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  5962  		return fmt.Errorf("index update failed: %v", err)
  5963  	}
  5964  	return txn.Commit()
  5965  }
  5966  
  5967  // ACLPolicyByName is used to lookup a policy by name
  5968  func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
  5969  	txn := s.db.ReadTxn()
  5970  
  5971  	watchCh, existing, err := txn.FirstWatch("acl_policy", "id", name)
  5972  	if err != nil {
  5973  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  5974  	}
  5975  	ws.Add(watchCh)
  5976  
  5977  	if existing != nil {
  5978  		return existing.(*structs.ACLPolicy), nil
  5979  	}
  5980  	return nil, nil
  5981  }
  5982  
  5983  // ACLPolicyByNamePrefix is used to lookup policies by prefix
  5984  func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  5985  	txn := s.db.ReadTxn()
  5986  
  5987  	iter, err := txn.Get("acl_policy", "id_prefix", prefix)
  5988  	if err != nil {
  5989  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  5990  	}
  5991  	ws.Add(iter.WatchCh())
  5992  
  5993  	return iter, nil
  5994  }
  5995  
  5996  // ACLPolicyByJob is used to lookup policies that have been attached to a
  5997  // specific job
  5998  func (s *StateStore) ACLPolicyByJob(ws memdb.WatchSet, ns, jobID string) (memdb.ResultIterator, error) {
  5999  	txn := s.db.ReadTxn()
  6000  
  6001  	iter, err := txn.Get("acl_policy", "job_prefix", ns, jobID)
  6002  	if err != nil {
  6003  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  6004  	}
  6005  	ws.Add(iter.WatchCh())
  6006  
  6007  	return iter, nil
  6008  }
  6009  
  6010  // ACLPolicies returns an iterator over all the acl policies
  6011  func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6012  	txn := s.db.ReadTxn()
  6013  
  6014  	// Walk the entire table
  6015  	iter, err := txn.Get("acl_policy", "id")
  6016  	if err != nil {
  6017  		return nil, err
  6018  	}
  6019  	ws.Add(iter.WatchCh())
  6020  	return iter, nil
  6021  }
  6022  
  6023  // UpsertACLTokens is used to create or update a set of ACL tokens
  6024  func (s *StateStore) UpsertACLTokens(msgType structs.MessageType, index uint64, tokens []*structs.ACLToken) error {
  6025  	txn := s.db.WriteTxnMsgT(msgType, index)
  6026  	defer txn.Abort()
  6027  
  6028  	for _, token := range tokens {
  6029  		// Ensure the token hash is non-nil. This should be done outside the state store
  6030  		// for performance reasons, but we check here for defense in depth.
  6031  		if len(token.Hash) == 0 {
  6032  			token.SetHash()
  6033  		}
  6034  
  6035  		// Check if the token already exists
  6036  		existing, err := txn.First("acl_token", "id", token.AccessorID)
  6037  		if err != nil {
  6038  			return fmt.Errorf("token lookup failed: %v", err)
  6039  		}
  6040  
  6041  		// Update all the indexes
  6042  		if existing != nil {
  6043  			existTK := existing.(*structs.ACLToken)
  6044  			token.CreateIndex = existTK.CreateIndex
  6045  			token.ModifyIndex = index
  6046  
  6047  			// Do not allow SecretID or create time to change
  6048  			token.SecretID = existTK.SecretID
  6049  			token.CreateTime = existTK.CreateTime
  6050  
  6051  		} else {
  6052  			token.CreateIndex = index
  6053  			token.ModifyIndex = index
  6054  		}
  6055  
  6056  		// Update the token
  6057  		if err := txn.Insert("acl_token", token); err != nil {
  6058  			return fmt.Errorf("upserting token failed: %v", err)
  6059  		}
  6060  	}
  6061  
  6062  	// Update the indexes table
  6063  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  6064  		return fmt.Errorf("index update failed: %v", err)
  6065  	}
  6066  	return txn.Commit()
  6067  }
  6068  
  6069  // DeleteACLTokens deletes the tokens with the given accessor ids
  6070  func (s *StateStore) DeleteACLTokens(msgType structs.MessageType, index uint64, ids []string) error {
  6071  	txn := s.db.WriteTxnMsgT(msgType, index)
  6072  	defer txn.Abort()
  6073  
  6074  	// Delete the tokens
  6075  	for _, id := range ids {
  6076  		if _, err := txn.DeleteAll("acl_token", "id", id); err != nil {
  6077  			return fmt.Errorf("deleting acl token failed: %v", err)
  6078  		}
  6079  	}
  6080  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  6081  		return fmt.Errorf("index update failed: %v", err)
  6082  	}
  6083  	return txn.Commit()
  6084  }
  6085  
  6086  // ACLTokenByAccessorID is used to lookup a token by accessor ID
  6087  func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
  6088  	if id == "" {
  6089  		return nil, fmt.Errorf("acl token lookup failed: missing accessor id")
  6090  	}
  6091  
  6092  	txn := s.db.ReadTxn()
  6093  
  6094  	watchCh, existing, err := txn.FirstWatch("acl_token", "id", id)
  6095  	if err != nil {
  6096  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  6097  	}
  6098  	ws.Add(watchCh)
  6099  
  6100  	// If the existing token is nil, this indicates it does not exist in state.
  6101  	if existing == nil {
  6102  		return nil, nil
  6103  	}
  6104  
  6105  	// Assert the token type which allows us to perform additional work on the
  6106  	// token that is needed before returning it to the caller.
  6107  	token := existing.(*structs.ACLToken)
  6108  
  6109  	// Handle potential staleness of ACL role links.
  6110  	if token, err = s.fixTokenRoleLinks(txn, token); err != nil {
  6111  		return nil, err
  6112  	}
  6113  	return token, nil
  6114  }
  6115  
  6116  // ACLTokenBySecretID is used to lookup a token by secret ID
  6117  func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) {
  6118  	if secretID == "" {
  6119  		return nil, fmt.Errorf("acl token lookup failed: missing secret id")
  6120  	}
  6121  
  6122  	txn := s.db.ReadTxn()
  6123  
  6124  	watchCh, existing, err := txn.FirstWatch("acl_token", "secret", secretID)
  6125  	if err != nil {
  6126  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  6127  	}
  6128  	ws.Add(watchCh)
  6129  
  6130  	// If the existing token is nil, this indicates it does not exist in state.
  6131  	if existing == nil {
  6132  		return nil, nil
  6133  	}
  6134  
  6135  	// Assert the token type which allows us to perform additional work on the
  6136  	// token that is needed before returning it to the caller.
  6137  	token := existing.(*structs.ACLToken)
  6138  
  6139  	// Handle potential staleness of ACL role links.
  6140  	if token, err = s.fixTokenRoleLinks(txn, token); err != nil {
  6141  		return nil, err
  6142  	}
  6143  	return token, nil
  6144  }
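
        // Illustrative usage sketch (not part of the original source): resolving a
        // token first by accessor ID and then by secret ID, mirroring the two lookup
        // paths above. The helper name resolveACLToken is hypothetical.
        func resolveACLToken(s *StateStore, ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
        	token, err := s.ACLTokenByAccessorID(ws, id)
        	if err != nil || token != nil {
        		return token, err
        	}
        	return s.ACLTokenBySecretID(ws, id)
        }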
  6145  
  6146  // ACLTokenByAccessorIDPrefix is used to lookup tokens by prefix
  6147  func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string, sort SortOption) (memdb.ResultIterator, error) {
  6148  	txn := s.db.ReadTxn()
  6149  
  6150  	var iter memdb.ResultIterator
  6151  	var err error
  6152  
  6153  	switch sort {
  6154  	case SortReverse:
  6155  		iter, err = txn.GetReverse("acl_token", "id_prefix", prefix)
  6156  	default:
  6157  		iter, err = txn.Get("acl_token", "id_prefix", prefix)
  6158  	}
  6159  	if err != nil {
  6160  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  6161  	}
  6162  
  6163  	ws.Add(iter.WatchCh())
  6164  	return iter, nil
  6165  }
  6166  
  6167  // ACLTokens returns an iterator over all the tokens
  6168  func (s *StateStore) ACLTokens(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  6169  	txn := s.db.ReadTxn()
  6170  
  6171  	var iter memdb.ResultIterator
  6172  	var err error
  6173  
  6174  	switch sort {
  6175  	case SortReverse:
  6176  		iter, err = txn.GetReverse("acl_token", "create")
  6177  	default:
  6178  		iter, err = txn.Get("acl_token", "create")
  6179  	}
  6180  	if err != nil {
  6181  		return nil, err
  6182  	}
  6183  
  6184  	ws.Add(iter.WatchCh())
  6185  	return iter, nil
  6186  }
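
        // Illustrative iteration over the result (a sketch, not part of this file):
        // passing a nil WatchSet is safe when no blocking query is involved, and
        // SortReverse walks the "create" index from newest to oldest.
        //
        //	iter, err := store.ACLTokens(nil, SortReverse)
        //	if err != nil {
        //		return err
        //	}
        //	for raw := iter.Next(); raw != nil; raw = iter.Next() {
        //		token := raw.(*structs.ACLToken)
        //		_ = token // process token
        //	}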
  6187  
  6188  // ACLTokensByGlobal returns an iterator over all the tokens filtered by global value
  6189  func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool, sort SortOption) (memdb.ResultIterator, error) {
  6190  	txn := s.db.ReadTxn()
  6191  
  6192  	var iter memdb.ResultIterator
  6193  	var err error
  6194  
  6195  	// Walk the tokens that match the requested global value
  6196  	switch sort {
  6197  	case SortReverse:
  6198  		iter, err = txn.GetReverse("acl_token", "global", globalVal)
  6199  	default:
  6200  		iter, err = txn.Get("acl_token", "global", globalVal)
  6201  	}
  6202  	if err != nil {
  6203  		return nil, err
  6204  	}
  6205  
  6206  	ws.Add(iter.WatchCh())
  6207  	return iter, nil
  6208  }
  6209  
  6210  // CanBootstrapACLToken checks if bootstrapping is possible and returns the reset index
  6211  func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) {
  6212  	txn := s.db.ReadTxn()
  6213  
  6214  	// Lookup the bootstrap sentinel
  6215  	out, err := txn.First("index", "id", "acl_token_bootstrap")
  6216  	if err != nil {
  6217  		return false, 0, err
  6218  	}
  6219  
  6220  	// No entry, we haven't bootstrapped yet
  6221  	if out == nil {
  6222  		return true, 0, nil
  6223  	}
  6224  
  6225  	// Return the reset index if we've already bootstrapped
  6226  	return false, out.(*IndexEntry).Value, nil
  6227  }
  6228  
  6229  // BootstrapACLTokens is used to create an initial ACL token.
  6230  func (s *StateStore) BootstrapACLTokens(msgType structs.MessageType, index uint64, resetIndex uint64, token *structs.ACLToken) error {
  6231  	txn := s.db.WriteTxnMsgT(msgType, index)
  6232  	defer txn.Abort()
  6233  
  6234  	// Check if we have already done a bootstrap
  6235  	existing, err := txn.First("index", "id", "acl_token_bootstrap")
  6236  	if err != nil {
  6237  		return fmt.Errorf("bootstrap check failed: %v", err)
  6238  	}
  6239  	if existing != nil {
  6240  		if resetIndex == 0 {
  6241  			return fmt.Errorf("ACL bootstrap already done")
  6242  		} else if resetIndex != existing.(*IndexEntry).Value {
  6243  			return fmt.Errorf("Invalid reset index for ACL bootstrap")
  6244  		}
  6245  	}
  6246  
  6247  	// Update the Create/Modify indexes
  6248  	token.CreateIndex = index
  6249  	token.ModifyIndex = index
  6250  
  6251  	// Insert the token
  6252  	if err := txn.Insert("acl_token", token); err != nil {
  6253  		return fmt.Errorf("upserting token failed: %v", err)
  6254  	}
  6255  
  6256  	// Update the indexes table, prevents future bootstrap until reset
  6257  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  6258  		return fmt.Errorf("index update failed: %v", err)
  6259  	}
  6260  	if err := txn.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil {
  6261  		return fmt.Errorf("index update failed: %v", err)
  6262  	}
  6263  	return txn.Commit()
  6264  }
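
        // Sketch of how a caller might combine CanBootstrapACLToken and
        // BootstrapACLTokens; msgType, raftIndex, and newToken are assumed names
        // supplied by the RPC/FSM layer, not defined in this file:
        //
        //	ok, resetIdx, err := store.CanBootstrapACLToken()
        //	if err != nil {
        //		return err
        //	}
        //	if ok {
        //		return store.BootstrapACLTokens(msgType, raftIndex, 0, newToken)
        //	}
        //	// a re-bootstrap only succeeds when the caller echoes back resetIdx
        //	return store.BootstrapACLTokens(msgType, raftIndex, resetIdx, newToken)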
  6265  
  6266  // UpsertOneTimeToken is used to create or update a one-time ACL token.
  6267  // Validating that we're not upserting an already-expired token is the
  6268  // responsibility of the caller, to facilitate testing.
  6269  func (s *StateStore) UpsertOneTimeToken(msgType structs.MessageType, index uint64, token *structs.OneTimeToken) error {
  6270  	txn := s.db.WriteTxnMsgT(msgType, index)
  6271  	defer txn.Abort()
  6272  
  6273  	// we expect the RPC call to set the ExpiresAt
  6274  	if token.ExpiresAt.IsZero() {
  6275  		return fmt.Errorf("one-time token must have an ExpiresAt time")
  6276  	}
  6277  
  6278  	// Update all the indexes
  6279  	token.CreateIndex = index
  6280  	token.ModifyIndex = index
  6281  
  6282  	// Create the token
  6283  	if err := txn.Insert("one_time_token", token); err != nil {
  6284  		return fmt.Errorf("upserting one-time token failed: %v", err)
  6285  	}
  6286  
  6287  	// Update the indexes table
  6288  	if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6289  		return fmt.Errorf("index update failed: %v", err)
  6290  	}
  6291  	return txn.Commit()
  6292  }
  6293  
  6294  // DeleteOneTimeTokens deletes the one-time tokens with the given accessor IDs
  6295  func (s *StateStore) DeleteOneTimeTokens(msgType structs.MessageType, index uint64, ids []string) error {
  6296  	txn := s.db.WriteTxnMsgT(msgType, index)
  6297  	defer txn.Abort()
  6298  
  6299  	var deleted int
  6300  	for _, id := range ids {
  6301  		d, err := txn.DeleteAll("one_time_token", "id", id)
  6302  		if err != nil {
  6303  			return fmt.Errorf("deleting one-time token failed: %v", err)
  6304  		}
  6305  		deleted += d
  6306  	}
  6307  
  6308  	if deleted > 0 {
  6309  		if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6310  			return fmt.Errorf("index update failed: %v", err)
  6311  		}
  6312  	}
  6313  	return txn.Commit()
  6314  }
  6315  
  6316  // ExpireOneTimeTokens deletes tokens that have expired
  6317  func (s *StateStore) ExpireOneTimeTokens(msgType structs.MessageType, index uint64, timestamp time.Time) error {
  6318  	txn := s.db.WriteTxnMsgT(msgType, index)
  6319  	defer txn.Abort()
  6320  
  6321  	iter, err := s.oneTimeTokensExpiredTxn(txn, nil, timestamp)
  6322  	if err != nil {
  6323  		return err
  6324  	}
  6325  
  6326  	var deleted int
  6327  	for {
  6328  		raw := iter.Next()
  6329  		if raw == nil {
  6330  			break
  6331  		}
  6332  		ott, ok := raw.(*structs.OneTimeToken)
  6333  		if !ok || ott == nil {
  6334  			return fmt.Errorf("could not decode one-time token")
  6335  		}
  6336  		d, err := txn.DeleteAll("one_time_token", "secret", ott.OneTimeSecretID)
  6337  		if err != nil {
  6338  			return fmt.Errorf("deleting one-time token failed: %v", err)
  6339  		}
  6340  		deleted += d
  6341  	}
  6342  
  6343  	if deleted > 0 {
  6344  		if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6345  			return fmt.Errorf("index update failed: %v", err)
  6346  		}
  6347  	}
  6348  	return txn.Commit()
  6349  }
  6350  
  6351  // oneTimeTokensExpiredTxn returns an iterator over all expired one-time tokens
  6352  func (s *StateStore) oneTimeTokensExpiredTxn(txn *txn, ws memdb.WatchSet, timestamp time.Time) (memdb.ResultIterator, error) {
  6353  	iter, err := txn.Get("one_time_token", "id")
  6354  	if err != nil {
  6355  		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
  6356  	}
  6357  
  6358  	ws.Add(iter.WatchCh())
  6359  	iter = memdb.NewFilterIterator(iter, expiredOneTimeTokenFilter(timestamp))
  6360  	return iter, nil
  6361  }
  6362  
  6363  // OneTimeTokenBySecret is used to lookup a token by secret
  6364  func (s *StateStore) OneTimeTokenBySecret(ws memdb.WatchSet, secret string) (*structs.OneTimeToken, error) {
  6365  	if secret == "" {
  6366  		return nil, fmt.Errorf("one-time token lookup failed: missing secret")
  6367  	}
  6368  
  6369  	txn := s.db.ReadTxn()
  6370  
  6371  	watchCh, existing, err := txn.FirstWatch("one_time_token", "secret", secret)
  6372  	if err != nil {
  6373  		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
  6374  	}
  6375  	ws.Add(watchCh)
  6376  
  6377  	if existing != nil {
  6378  		return existing.(*structs.OneTimeToken), nil
  6379  	}
  6380  	return nil, nil
  6381  }
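
        // Sketch of a one-time token exchange as a caller might implement it:
        // resolve the OTT by its secret, then delete it by accessor ID so it
        // cannot be replayed (secret, msgType, and index are assumed names):
        //
        //	ott, err := store.OneTimeTokenBySecret(nil, secret)
        //	if err != nil || ott == nil {
        //		return err
        //	}
        //	return store.DeleteOneTimeTokens(msgType, index, []string{ott.AccessorID})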
  6382  
  6383  // expiredOneTimeTokenFilter returns a memdb filter function that rejects
  6384  // tokens that have not yet expired, leaving only expired one-time tokens
  6385  func expiredOneTimeTokenFilter(now time.Time) func(interface{}) bool {
  6386  	return func(raw interface{}) bool {
  6387  		ott, ok := raw.(*structs.OneTimeToken)
  6388  		if !ok {
  6389  			return true
  6390  		}
  6391  
  6392  		return ott.ExpiresAt.After(now)
  6393  	}
  6394  }
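
        // Note on the filter semantics used above: memdb.NewFilterIterator drops any
        // object for which the filter func returns true, so returning
        // ExpiresAt.After(now) removes live tokens and leaves only expired ones.
        // A minimal sketch of the same pattern:
        //
        //	iter, _ := txn.Get("one_time_token", "id")
        //	expired := memdb.NewFilterIterator(iter, expiredOneTimeTokenFilter(time.Now()))
        //	for raw := expired.Next(); raw != nil; raw = expired.Next() {
        //		// raw is a *structs.OneTimeToken whose ExpiresAt is not after now
        //	}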
  6395  
  6396  // SchedulerConfig is used to get the current Scheduler configuration.
  6397  func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) {
  6398  	tx := s.db.ReadTxn()
  6399  	defer tx.Abort()
  6400  	return s.schedulerConfigTxn(tx)
  6401  }
  6402  
  6403  func (s *StateStore) schedulerConfigTxn(txn *txn) (uint64, *structs.SchedulerConfiguration, error) {
  6404  
  6405  	// Get the scheduler config
  6406  	c, err := txn.First("scheduler_config", "id")
  6407  	if err != nil {
  6408  		return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err)
  6409  	}
  6410  
  6411  	config, ok := c.(*structs.SchedulerConfiguration)
  6412  	if !ok {
  6413  		return 0, nil, nil
  6414  	}
  6415  
  6416  	return config.ModifyIndex, config, nil
  6417  }
  6418  
  6419  // SchedulerSetConfig is used to set the current Scheduler configuration.
  6420  func (s *StateStore) SchedulerSetConfig(index uint64, config *structs.SchedulerConfiguration) error {
  6421  	tx := s.db.WriteTxn(index)
  6422  	defer tx.Abort()
  6423  
  6424  	s.schedulerSetConfigTxn(index, tx, config)
  6425  
  6426  	return tx.Commit()
  6427  }
  6428  
  6429  func (s *StateStore) ClusterMetadata(ws memdb.WatchSet) (*structs.ClusterMetadata, error) {
  6430  	txn := s.db.ReadTxn()
  6431  	defer txn.Abort()
  6432  
  6433  	// Get the cluster metadata
  6434  	watchCh, m, err := txn.FirstWatch("cluster_meta", "id")
  6435  	if err != nil {
  6436  		return nil, fmt.Errorf("failed cluster metadata lookup: %w", err)
  6437  	}
  6438  	ws.Add(watchCh)
  6439  
  6440  	if m != nil {
  6441  		return m.(*structs.ClusterMetadata), nil
  6442  	}
  6443  
  6444  	return nil, nil
  6445  }
  6446  
  6447  func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error {
  6448  	txn := s.db.WriteTxn(index)
  6449  	defer txn.Abort()
  6450  
  6451  	if err := s.setClusterMetadata(txn, meta); err != nil {
  6452  		return fmt.Errorf("set cluster metadata failed: %w", err)
  6453  	}
  6454  
  6455  	return txn.Commit()
  6456  }
  6457  
  6458  // WithWriteTransaction executes the passed function within a write transaction,
  6459  // and returns its result. If the function returns an error, the transaction is
  6460  // aborted and that error is returned; otherwise the transaction is committed.
  6461  func (s *StateStore) WithWriteTransaction(msgType structs.MessageType, index uint64, fn func(Txn) error) error {
  6462  	tx := s.db.WriteTxnMsgT(msgType, index)
  6463  	defer tx.Abort()
  6464  
  6465  	err := fn(tx)
  6466  	if err == nil {
  6467  		return tx.Commit()
  6468  	}
  6469  	return err
  6470  }
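
        // A minimal sketch of composing state mutations atomically with
        // WithWriteTransaction; the table name "example_table" is hypothetical:
        //
        //	err := store.WithWriteTransaction(msgType, index, func(tx Txn) error {
        //		if err := tx.Insert("index", &IndexEntry{"example_table", index}); err != nil {
        //			return err // a non-nil error aborts the whole transaction
        //		}
        //		return nil
        //	})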
  6471  
  6472  // SchedulerCASConfig is used to update the scheduler configuration with a
  6473  // given Raft index. If the CAS index specified is not equal to the last observed index
  6474  // for the config, then the call is a noop.
  6475  func (s *StateStore) SchedulerCASConfig(index, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) {
  6476  	tx := s.db.WriteTxn(index)
  6477  	defer tx.Abort()
  6478  
  6479  	// Check for an existing config
  6480  	existing, err := tx.First("scheduler_config", "id")
  6481  	if err != nil {
  6482  		return false, fmt.Errorf("failed scheduler config lookup: %s", err)
  6483  	}
  6484  
  6485  	// If the existing index does not match the provided CAS
  6486  	// index arg, then we shouldn't update anything and can safely
  6487  	// return early here.
  6488  	e, ok := existing.(*structs.SchedulerConfiguration)
  6489  	if !ok || (e != nil && e.ModifyIndex != cidx) {
  6490  		return false, nil
  6491  	}
  6492  
  6493  	s.schedulerSetConfigTxn(index, tx, config)
  6494  
  6495  	if err := tx.Commit(); err != nil {
  6496  		return false, err
  6497  	}
  6498  	return true, nil
  6499  }
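
        // Sketch of the read-modify-write pattern a caller might use with
        // SchedulerConfig and SchedulerCASConfig (store and raftIndex are assumed
        // names, and an existing configuration is assumed to be present):
        //
        //	modifyIdx, cfg, err := store.SchedulerConfig()
        //	if err != nil {
        //		return err
        //	}
        //	cfg.MemoryOversubscriptionEnabled = true
        //	applied, err := store.SchedulerCASConfig(raftIndex, modifyIdx, cfg)
        //	if err != nil || !applied {
        //		// the config changed concurrently; retry or surface the conflict
        //	}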
  6500  
  6501  func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *txn, config *structs.SchedulerConfiguration) error {
  6502  	// Check for an existing config
  6503  	existing, err := tx.First("scheduler_config", "id")
  6504  	if err != nil {
  6505  		return fmt.Errorf("failed scheduler config lookup: %s", err)
  6506  	}
  6507  
  6508  	// Set the indexes.
  6509  	if existing != nil {
  6510  		config.CreateIndex = existing.(*structs.SchedulerConfiguration).CreateIndex
  6511  	} else {
  6512  		config.CreateIndex = idx
  6513  	}
  6514  	config.ModifyIndex = idx
  6515  
  6516  	if err := tx.Insert("scheduler_config", config); err != nil {
  6517  		return fmt.Errorf("failed updating scheduler config: %s", err)
  6518  	}
  6519  	return nil
  6520  }
  6521  
  6522  func (s *StateStore) setClusterMetadata(txn *txn, meta *structs.ClusterMetadata) error {
  6523  	// Check for an existing config, if it exists, verify that the cluster ID matches
  6524  	existing, err := txn.First("cluster_meta", "id")
  6525  	if err != nil {
  6526  		return fmt.Errorf("failed cluster meta lookup: %v", err)
  6527  	}
  6528  
  6529  	if existing != nil {
  6530  		existingClusterID := existing.(*structs.ClusterMetadata).ClusterID
  6531  		if meta.ClusterID != existingClusterID && existingClusterID != "" {
  6532  			// there is a bug in cluster ID detection
  6533  			return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID)
  6534  		}
  6535  	}
  6536  
  6537  	// update is technically a noop, unless someday we add more / mutable fields
  6538  	if err := txn.Insert("cluster_meta", meta); err != nil {
  6539  		return fmt.Errorf("set cluster metadata failed: %v", err)
  6540  	}
  6541  
  6542  	return nil
  6543  }
  6544  
  6545  // UpsertScalingPolicies is used to insert or update a set of scaling policies.
  6546  func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error {
  6547  	txn := s.db.WriteTxn(index)
  6548  	defer txn.Abort()
  6549  
  6550  	if err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn); err != nil {
  6551  		return err
  6552  	}
  6553  
  6554  	return txn.Commit()
  6555  }
  6556  
  6557  // UpsertScalingPoliciesTxn is used to insert or update a set of scaling policies.
  6558  func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy,
  6559  	txn *txn) error {
  6560  
  6561  	hadUpdates := false
  6562  
  6563  	for _, policy := range scalingPolicies {
  6564  		// Check if the scaling policy already exists
  6565  		// Policy uniqueness is based on target and type
  6566  		it, err := txn.Get("scaling_policy", "target",
  6567  			policy.Target[structs.ScalingTargetNamespace],
  6568  			policy.Target[structs.ScalingTargetJob],
  6569  			policy.Target[structs.ScalingTargetGroup],
  6570  			policy.Target[structs.ScalingTargetTask],
  6571  		)
  6572  		if err != nil {
  6573  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  6574  		}
  6575  
  6576  		// Check if type matches
  6577  		var existing *structs.ScalingPolicy
  6578  		for raw := it.Next(); raw != nil; raw = it.Next() {
  6579  			p := raw.(*structs.ScalingPolicy)
  6580  			if p.Type == policy.Type {
  6581  				existing = p
  6582  				break
  6583  			}
  6584  		}
  6585  
  6586  		// Setup the indexes correctly
  6587  		if existing != nil {
  6588  			if !existing.Diff(policy) {
  6589  				continue
  6590  			}
  6591  			policy.ID = existing.ID
  6592  			policy.CreateIndex = existing.CreateIndex
  6593  		} else {
  6594  			// policy.ID must have been set already in Job.Register before log apply
  6595  			policy.CreateIndex = index
  6596  		}
  6597  		policy.ModifyIndex = index
  6598  
  6599  		// Insert the scaling policy
  6600  		hadUpdates = true
  6601  		if err := txn.Insert("scaling_policy", policy); err != nil {
  6602  			return err
  6603  		}
  6604  	}
  6605  
  6606  	// Update the indexes table for scaling policy if we updated any policies
  6607  	if hadUpdates {
  6608  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  6609  			return fmt.Errorf("index update failed: %v", err)
  6610  		}
  6611  	}
  6612  
  6613  	return nil
  6614  }
  6615  
  6616  // NamespaceByName is used to lookup a namespace by name
  6617  func (s *StateStore) NamespaceByName(ws memdb.WatchSet, name string) (*structs.Namespace, error) {
  6618  	txn := s.db.ReadTxn()
  6619  	return s.namespaceByNameImpl(ws, txn, name)
  6620  }
  6621  
  6622  // namespaceByNameImpl is used to lookup a namespace by name
  6623  func (s *StateStore) namespaceByNameImpl(ws memdb.WatchSet, txn *txn, name string) (*structs.Namespace, error) {
  6624  	watchCh, existing, err := txn.FirstWatch(TableNamespaces, "id", name)
  6625  	if err != nil {
  6626  		return nil, fmt.Errorf("namespace lookup failed: %v", err)
  6627  	}
  6628  	ws.Add(watchCh)
  6629  
  6630  	if existing != nil {
  6631  		return existing.(*structs.Namespace), nil
  6632  	}
  6633  	return nil, nil
  6634  }
  6635  
  6636  // namespaceExists returns whether a namespace exists
  6637  func (s *StateStore) namespaceExists(txn *txn, namespace string) (bool, error) {
  6638  	if namespace == structs.DefaultNamespace {
  6639  		return true, nil
  6640  	}
  6641  
  6642  	existing, err := txn.First(TableNamespaces, "id", namespace)
  6643  	if err != nil {
  6644  		return false, fmt.Errorf("namespace lookup failed: %v", err)
  6645  	}
  6646  
  6647  	return existing != nil, nil
  6648  }
  6649  
  6650  // NamespacesByNamePrefix is used to lookup namespaces by prefix
  6651  func (s *StateStore) NamespacesByNamePrefix(ws memdb.WatchSet, namePrefix string) (memdb.ResultIterator, error) {
  6652  	txn := s.db.ReadTxn()
  6653  
  6654  	iter, err := txn.Get(TableNamespaces, "id_prefix", namePrefix)
  6655  	if err != nil {
  6656  		return nil, fmt.Errorf("namespaces lookup failed: %v", err)
  6657  	}
  6658  	ws.Add(iter.WatchCh())
  6659  
  6660  	return iter, nil
  6661  }
  6662  
  6663  // Namespaces returns an iterator over all the namespaces
  6664  func (s *StateStore) Namespaces(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6665  	txn := s.db.ReadTxn()
  6666  
  6667  	// Walk the entire namespace table
  6668  	iter, err := txn.Get(TableNamespaces, "id")
  6669  	if err != nil {
  6670  		return nil, err
  6671  	}
  6672  	ws.Add(iter.WatchCh())
  6673  	return iter, nil
  6674  }
  6675  
  6676  func (s *StateStore) NamespaceNames() ([]string, error) {
  6677  	it, err := s.Namespaces(nil)
  6678  	if err != nil {
  6679  		return nil, err
  6680  	}
  6681  
  6682  	nses := []string{}
  6683  	for {
  6684  		next := it.Next()
  6685  		if next == nil {
  6686  			break
  6687  		}
  6688  		ns := next.(*structs.Namespace)
  6689  		nses = append(nses, ns.Name)
  6690  	}
  6691  
  6692  	return nses, nil
  6693  }
  6694  
  6695  // UpsertNamespaces is used to register or update a set of namespaces.
  6696  func (s *StateStore) UpsertNamespaces(index uint64, namespaces []*structs.Namespace) error {
  6697  	txn := s.db.WriteTxn(index)
  6698  	defer txn.Abort()
  6699  
  6700  	for _, ns := range namespaces {
  6701  		// Handle upgrade path.
  6702  		ns.Canonicalize()
  6703  		if err := s.upsertNamespaceImpl(index, txn, ns); err != nil {
  6704  			return err
  6705  		}
  6706  	}
  6707  
  6708  	if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil {
  6709  		return fmt.Errorf("index update failed: %v", err)
  6710  	}
  6711  
  6712  	return txn.Commit()
  6713  }
  6714  
  6715  // upsertNamespaceImpl is used to upsert a namespace
  6716  func (s *StateStore) upsertNamespaceImpl(index uint64, txn *txn, namespace *structs.Namespace) error {
  6717  	// Ensure the namespace hash is non-nil. This should be done outside the state store
  6718  	// for performance reasons, but we check here for defense in depth.
  6719  	ns := namespace
  6720  	if len(ns.Hash) == 0 {
  6721  		ns.SetHash()
  6722  	}
  6723  
  6724  	// Check if the namespace already exists
  6725  	existing, err := txn.First(TableNamespaces, "id", ns.Name)
  6726  	if err != nil {
  6727  		return fmt.Errorf("namespace lookup failed: %v", err)
  6728  	}
  6729  
  6730  	// Setup the indexes correctly and determine which quotas need to be
  6731  	// reconciled
  6732  	var oldQuota string
  6733  	if existing != nil {
  6734  		exist := existing.(*structs.Namespace)
  6735  		ns.CreateIndex = exist.CreateIndex
  6736  		ns.ModifyIndex = index
  6737  
  6738  		// Grab the old quota on the namespace
  6739  		oldQuota = exist.Quota
  6740  	} else {
  6741  		ns.CreateIndex = index
  6742  		ns.ModifyIndex = index
  6743  	}
  6744  
  6745  	// Validate that the quota on the new namespace exists
  6746  	if ns.Quota != "" {
  6747  		exists, err := s.quotaSpecExists(txn, ns.Quota)
  6748  		if err != nil {
  6749  			return fmt.Errorf("looking up namespace quota %q failed: %v", ns.Quota, err)
  6750  		} else if !exists {
  6751  			return fmt.Errorf("namespace %q using non-existent quota %q", ns.Name, ns.Quota)
  6752  		}
  6753  	}
  6754  
  6755  	// Insert the namespace
  6756  	if err := txn.Insert(TableNamespaces, ns); err != nil {
  6757  		return fmt.Errorf("namespace insert failed: %v", err)
  6758  	}
  6759  
  6760  	// Reconcile changed quotas
  6761  	return s.quotaReconcile(index, txn, ns.Quota, oldQuota)
  6762  }
  6763  
  6764  // DeleteNamespaces is used to remove a set of namespaces
  6765  func (s *StateStore) DeleteNamespaces(index uint64, names []string) error {
  6766  	txn := s.db.WriteTxn(index)
  6767  	defer txn.Abort()
  6768  
  6769  	for _, name := range names {
  6770  		// Lookup the namespace
  6771  		existing, err := txn.First(TableNamespaces, "id", name)
  6772  		if err != nil {
  6773  			return fmt.Errorf("namespace lookup failed: %v", err)
  6774  		}
  6775  		if existing == nil {
  6776  			return fmt.Errorf("namespace not found")
  6777  		}
  6778  
  6779  		ns := existing.(*structs.Namespace)
  6780  		if ns.Name == structs.DefaultNamespace {
  6781  			return fmt.Errorf("default namespace can not be deleted")
  6782  		}
  6783  
  6784  		// Ensure that the namespace doesn't have any non-terminal jobs
  6785  		iter, err := s.jobsByNamespaceImpl(nil, name, txn)
  6786  		if err != nil {
  6787  			return err
  6788  		}
  6789  
  6790  		for {
  6791  			raw := iter.Next()
  6792  			if raw == nil {
  6793  				break
  6794  			}
  6795  			job := raw.(*structs.Job)
  6796  
  6797  			if job.Status != structs.JobStatusDead {
  6798  				return fmt.Errorf("namespace %q contains at least one non-terminal job %q. "+
  6799  					"All jobs must be terminal in namespace before it can be deleted", name, job.ID)
  6800  			}
  6801  		}
  6802  
  6803  		vIter, err := s.csiVolumesByNamespaceImpl(txn, nil, name, "")
  6804  		if err != nil {
  6805  			return err
  6806  		}
  6807  		rawVol := vIter.Next()
  6808  		if rawVol != nil {
  6809  			vol := rawVol.(*structs.CSIVolume)
  6810  			return fmt.Errorf("namespace %q contains at least one CSI volume %q. "+
  6811  				"All CSI volumes in namespace must be deleted before it can be deleted", name, vol.ID)
  6812  		}
  6813  
  6814  		varIter, err := s.getVariablesByNamespaceImpl(txn, nil, name)
  6815  		if err != nil {
  6816  			return err
  6817  		}
  6818  		if varIter.Next() != nil {
  6819  			// unlike job/volume, don't show the path here because the user may
  6820  			// not have List permissions on the vars in this namespace
  6821  			return fmt.Errorf("namespace %q contains at least one variable. "+
  6822  				"All variables in namespace must be deleted before it can be deleted", name)
  6823  		}
  6824  
  6825  		// Delete the namespace
  6826  		if err := txn.Delete(TableNamespaces, existing); err != nil {
  6827  			return fmt.Errorf("namespace deletion failed: %v", err)
  6828  		}
  6829  	}
  6830  
  6831  	if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil {
  6832  		return fmt.Errorf("index update failed: %v", err)
  6833  	}
  6834  
  6835  	return txn.Commit()
  6836  }
  6837  
  6838  func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error {
  6839  	txn := s.db.WriteTxn(index)
  6840  	defer txn.Abort()
  6841  
  6842  	err := s.DeleteScalingPoliciesTxn(index, ids, txn)
  6843  	if err == nil {
  6844  		return txn.Commit()
  6845  	}
  6846  
  6847  	return err
  6848  }
  6849  
  6850  // DeleteScalingPoliciesTxn is used to delete a set of scaling policies by ID.
  6851  func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *txn) error {
  6852  	if len(ids) == 0 {
  6853  		return nil
  6854  	}
  6855  
  6856  	for _, id := range ids {
  6857  		// Lookup the scaling policy
  6858  		existing, err := txn.First("scaling_policy", "id", id)
  6859  		if err != nil {
  6860  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  6861  		}
  6862  		if existing == nil {
  6863  			return fmt.Errorf("scaling policy not found")
  6864  		}
  6865  
  6866  		// Delete the scaling policy
  6867  		if err := txn.Delete("scaling_policy", existing); err != nil {
  6868  			return fmt.Errorf("scaling policy delete failed: %v", err)
  6869  		}
  6870  	}
  6871  
  6872  	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  6873  		return fmt.Errorf("index update failed: %v", err)
  6874  	}
  6875  
  6876  	return nil
  6877  }
  6878  
  6879  // ScalingPolicies returns an iterator over all the scaling policies
  6880  func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6881  	txn := s.db.ReadTxn()
  6882  
  6883  	// Walk the entire scaling_policy table
  6884  	iter, err := txn.Get("scaling_policy", "id")
  6885  	if err != nil {
  6886  		return nil, err
  6887  	}
  6888  
  6889  	ws.Add(iter.WatchCh())
  6890  
  6891  	return iter, nil
  6892  }
  6893  
  6894  // ScalingPoliciesByTypePrefix returns an iterator over scaling policies with a certain type prefix.
  6895  func (s *StateStore) ScalingPoliciesByTypePrefix(ws memdb.WatchSet, t string) (memdb.ResultIterator, error) {
  6896  	txn := s.db.ReadTxn()
  6897  
  6898  	iter, err := txn.Get("scaling_policy", "type_prefix", t)
  6899  	if err != nil {
  6900  		return nil, err
  6901  	}
  6902  
  6903  	ws.Add(iter.WatchCh())
  6904  	return iter, nil
  6905  }
  6906  
  6907  func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace, typ string) (memdb.ResultIterator, error) {
  6908  	txn := s.db.ReadTxn()
  6909  
  6910  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace)
  6911  	if err != nil {
  6912  		return nil, err
  6913  	}
  6914  
  6915  	ws.Add(iter.WatchCh())
  6916  
  6917  	// Wrap the iterator in a filter to exact match the namespace
  6918  	iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace))
  6919  
  6920  	// If policy type is specified as well, wrap again
  6921  	if typ != "" {
  6922  		iter = memdb.NewFilterIterator(iter, func(raw interface{}) bool {
  6923  			p, ok := raw.(*structs.ScalingPolicy)
  6924  			if !ok {
  6925  				return true
  6926  			}
  6927  			return !strings.HasPrefix(p.Type, typ)
  6928  		})
  6929  	}
  6930  
  6931  	return iter, nil
  6932  }
  6933  
  6934  func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID, policyType string) (memdb.ResultIterator,
  6935  	error) {
  6936  	txn := s.db.ReadTxn()
  6937  	iter, err := s.ScalingPoliciesByJobTxn(ws, namespace, jobID, txn)
  6938  	if err != nil {
  6939  		return nil, err
  6940  	}
  6941  
  6942  	if policyType == "" {
  6943  		return iter, nil
  6944  	}
  6945  
  6946  	filter := func(raw interface{}) bool {
  6947  		p, ok := raw.(*structs.ScalingPolicy)
  6948  		if !ok {
  6949  			return true
  6950  		}
  6951  		return policyType != p.Type
  6952  	}
  6953  
  6954  	return memdb.NewFilterIterator(iter, filter), nil
  6955  }
  6956  
  6957  func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string,
  6958  	txn *txn) (memdb.ResultIterator, error) {
  6959  
  6960  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID)
  6961  	if err != nil {
  6962  		return nil, err
  6963  	}
  6964  
  6965  	ws.Add(iter.WatchCh())
  6966  
  6967  	filter := func(raw interface{}) bool {
  6968  		d, ok := raw.(*structs.ScalingPolicy)
  6969  		if !ok {
  6970  			return true
  6971  		}
  6972  
  6973  		return d.Target[structs.ScalingTargetJob] != jobID
  6974  	}
  6975  
  6976  	// Wrap the iterator in a filter
  6977  	wrap := memdb.NewFilterIterator(iter, filter)
  6978  	return wrap, nil
  6979  }
  6980  
  6981  func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) {
  6982  	txn := s.db.ReadTxn()
  6983  
  6984  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "id", id)
  6985  	if err != nil {
  6986  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  6987  	}
  6988  	ws.Add(watchCh)
  6989  
  6990  	if existing != nil {
  6991  		return existing.(*structs.ScalingPolicy), nil
  6992  	}
  6993  
  6994  	return nil, nil
  6995  }
  6996  
  6997  // ScalingPolicyByTargetAndType returns the fully-qualified policy matching the given target and policy type,
  6998  // or nil if it does not exist. The watchset is registered on the target lookup only, not on the policy type.
  6999  func (s *StateStore) ScalingPolicyByTargetAndType(ws memdb.WatchSet, target map[string]string, typ string) (*structs.ScalingPolicy,
  7000  	error) {
  7001  	txn := s.db.ReadTxn()
  7002  
  7003  	namespace := target[structs.ScalingTargetNamespace]
  7004  	job := target[structs.ScalingTargetJob]
  7005  	group := target[structs.ScalingTargetGroup]
  7006  	task := target[structs.ScalingTargetTask]
  7007  
  7008  	it, err := txn.Get("scaling_policy", "target", namespace, job, group, task)
  7009  	if err != nil {
  7010  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  7011  	}
  7012  
  7013  	ws.Add(it.WatchCh())
  7014  
  7015  	// Check for type
  7016  	var existing *structs.ScalingPolicy
  7017  	for raw := it.Next(); raw != nil; raw = it.Next() {
  7018  		p := raw.(*structs.ScalingPolicy)
  7019  		if p.Type == typ {
  7020  			existing = p
  7021  			break
  7022  		}
  7023  	}
  7024  
  7025  	if existing != nil {
  7026  		return existing, nil
  7027  	}
  7028  
  7029  	return nil, nil
  7030  }
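
        // Illustrative target map for the lookup above; the key constants are the
        // ones used throughout this file, while the values and the "horizontal"
        // policy type are placeholders:
        //
        //	target := map[string]string{
        //		structs.ScalingTargetNamespace: "default",
        //		structs.ScalingTargetJob:       "example",
        //		structs.ScalingTargetGroup:     "cache",
        //		structs.ScalingTargetTask:      "redis",
        //	}
        //	policy, err := store.ScalingPolicyByTargetAndType(nil, target, "horizontal")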
  7031  
  7032  func (s *StateStore) ScalingPoliciesByIDPrefix(ws memdb.WatchSet, namespace string, prefix string) (memdb.ResultIterator, error) {
  7033  	txn := s.db.ReadTxn()
  7034  
  7035  	iter, err := txn.Get("scaling_policy", "id_prefix", prefix)
  7036  	if err != nil {
  7037  		return nil, fmt.Errorf("scaling policy lookup failed: %v", err)
  7038  	}
  7039  
  7040  	ws.Add(iter.WatchCh())
  7041  
  7042  	iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace))
  7043  
  7044  	return iter, nil
  7045  }
  7046  
  7047  // scalingPolicyNamespaceFilter returns a filter function that filters all
  7048  // scaling policies not targeting the given namespace.
  7049  func scalingPolicyNamespaceFilter(namespace string) func(interface{}) bool {
  7050  	return func(raw interface{}) bool {
  7051  		p, ok := raw.(*structs.ScalingPolicy)
  7052  		if !ok {
  7053  			return true
  7054  		}
  7055  
  7056  		return p.Target[structs.ScalingTargetNamespace] != namespace
  7057  	}
  7058  }
  7059  
  7060  // StateSnapshot is used to provide a point-in-time snapshot
  7061  type StateSnapshot struct {
  7062  	StateStore
  7063  }
  7064  
  7065  // DenormalizeAllocationsMap takes in a map of nodes to allocations, queries the
  7066  // state store for the Allocation behind each Allocation diff, and merges the
  7067  // updated attributes into a copy of the existing Allocation.
  7068  func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error {
  7069  	for nodeID, allocs := range nodeAllocations {
  7070  		denormalizedAllocs, err := s.DenormalizeAllocationSlice(allocs)
  7071  		if err != nil {
  7072  			return err
  7073  		}
  7074  
  7075  		nodeAllocations[nodeID] = denormalizedAllocs
  7076  	}
  7077  	return nil
  7078  }
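
        // Hypothetical use from a plan-apply style caller: fill in the full
        // allocation for each stopped or preempted diff before the plan result is
        // persisted (snap and plan are assumed names):
        //
        //	if err := snap.DenormalizeAllocationsMap(plan.NodeUpdate); err != nil {
        //		return err
        //	}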
  7079  
  7080  // DenormalizeAllocationSlice queries the state store for each allocation diff
  7081  // represented as an Allocation and merges the updated attributes into a copy of
  7082  // the existing Allocation.
  7083  //
  7084  // This should only be called on terminal allocs, particularly stopped or preempted allocs
  7085  func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) {
  7086  	allocDiffs := make([]*structs.AllocationDiff, len(allocs))
  7087  	for i, alloc := range allocs {
  7088  		allocDiffs[i] = alloc.AllocationDiff()
  7089  	}
  7090  
  7091  	return s.DenormalizeAllocationDiffSlice(allocDiffs)
  7092  }
  7093  
  7094  // DenormalizeAllocationDiffSlice queries the state store for each AllocationDiff and merges
  7095  // the updated attributes into a copy of the existing Allocation.
  7096  //
  7097  // This should only be called on terminal allocs, particularly stopped or preempted allocs
  7098  func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) {
  7099  	// Output index for denormalized Allocations
  7100  	j := 0
  7101  
  7102  	denormalizedAllocs := make([]*structs.Allocation, len(allocDiffs))
  7103  	for _, allocDiff := range allocDiffs {
  7104  		alloc, err := s.AllocByID(nil, allocDiff.ID)
  7105  		if err != nil {
  7106  			return nil, fmt.Errorf("alloc lookup failed: %v", err)
  7107  		}
  7108  		if alloc == nil {
  7109  			return nil, fmt.Errorf("alloc %v doesn't exist", allocDiff.ID)
  7110  		}
  7111  
  7112  		// Merge the updates into the Allocation. Don't update alloc.Job for terminal allocs
  7113  		// so that alloc refers to the latest Job view before destruction, and to ease handler implementations.
  7114  		allocCopy := alloc.Copy()
  7115  
  7116  		if allocDiff.PreemptedByAllocation != "" {
  7117  			allocCopy.PreemptedByAllocation = allocDiff.PreemptedByAllocation
  7118  			allocCopy.DesiredDescription = getPreemptedAllocDesiredDescription(allocDiff.PreemptedByAllocation)
  7119  			allocCopy.DesiredStatus = structs.AllocDesiredStatusEvict
  7120  		} else {
  7121  			// If alloc is a stopped alloc
  7122  			allocCopy.DesiredDescription = allocDiff.DesiredDescription
  7123  			allocCopy.DesiredStatus = structs.AllocDesiredStatusStop
  7124  			if allocDiff.ClientStatus != "" {
  7125  				allocCopy.ClientStatus = allocDiff.ClientStatus
  7126  			}
  7127  			if allocDiff.FollowupEvalID != "" {
  7128  				allocCopy.FollowupEvalID = allocDiff.FollowupEvalID
  7129  			}
  7130  		}
  7131  		if allocDiff.ModifyTime != 0 {
  7132  			allocCopy.ModifyTime = allocDiff.ModifyTime
  7133  		}
  7134  
  7135  		// Store the denormalized alloc in the output slice
  7136  		denormalizedAllocs[j] = allocCopy
  7137  		j++
  7138  	}
  7139  	// Retain only the denormalized Allocations in the slice
  7140  	denormalizedAllocs = denormalizedAllocs[:j]
  7141  	return denormalizedAllocs, nil
  7142  }
  7143  
  7144  func getPreemptedAllocDesiredDescription(preemptedByAllocID string) string {
  7145  	return fmt.Sprintf("Preempted by alloc ID %v", preemptedByAllocID)
  7146  }
  7147  
  7148  // UpsertRootKeyMeta saves root key meta or updates it in-place.
  7149  func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKeyMeta, rekey bool) error {
  7150  	txn := s.db.WriteTxn(index)
  7151  	defer txn.Abort()
  7152  
  7153  	// get any existing key for updating
  7154  	raw, err := txn.First(TableRootKeyMeta, indexID, rootKeyMeta.KeyID)
  7155  	if err != nil {
  7156  		return fmt.Errorf("root key metadata lookup failed: %v", err)
  7157  	}
  7158  
  7159  	isRotation := false
  7160  
  7161  	if raw != nil {
  7162  		existing := raw.(*structs.RootKeyMeta)
  7163  		rootKeyMeta.CreateIndex = existing.CreateIndex
  7164  		rootKeyMeta.CreateTime = existing.CreateTime
  7165  		isRotation = !existing.Active() && rootKeyMeta.Active()
  7166  	} else {
  7167  		rootKeyMeta.CreateIndex = index
  7168  		isRotation = rootKeyMeta.Active()
  7169  	}
  7170  	rootKeyMeta.ModifyIndex = index
  7171  
  7172  	if rekey && !isRotation {
  7173  		return fmt.Errorf("cannot rekey without setting the new key active")
  7174  	}
  7175  
  7176  	// if the upsert is for a newly-active key, we need to set all the
  7177  	// other keys as inactive in the same transaction.
  7178  	if isRotation {
  7179  		iter, err := txn.Get(TableRootKeyMeta, indexID)
  7180  		if err != nil {
  7181  			return err
  7182  		}
  7183  		for {
  7184  			raw := iter.Next()
  7185  			if raw == nil {
  7186  				break
  7187  			}
  7188  			key := raw.(*structs.RootKeyMeta)
  7189  			modified := false
  7190  
  7191  			switch key.State {
  7192  			case structs.RootKeyStateInactive:
  7193  				if rekey {
  7194  					key.SetRekeying()
  7195  					modified = true
  7196  				}
  7197  			case structs.RootKeyStateActive:
  7198  				if rekey {
  7199  					key.SetRekeying()
  7200  				} else {
  7201  					key.SetInactive()
  7202  				}
  7203  				modified = true
  7204  			case structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated:
  7205  				// nothing to do
  7206  			}
  7207  
  7208  			if modified {
  7209  				key.ModifyIndex = index
  7210  				if err := txn.Insert(TableRootKeyMeta, key); err != nil {
  7211  					return err
  7212  				}
  7213  			}
  7214  
  7215  		}
  7216  	}
  7217  
  7218  	if err := txn.Insert(TableRootKeyMeta, rootKeyMeta); err != nil {
  7219  		return err
  7220  	}
  7221  
  7222  	// update the indexes table
  7223  	if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
  7224  		return fmt.Errorf("index update failed: %v", err)
  7225  	}
  7226  	return txn.Commit()
  7227  }
  7228  
  7229  // DeleteRootKeyMeta deletes a single root key, or returns an error if
  7230  // it doesn't exist.
  7231  func (s *StateStore) DeleteRootKeyMeta(index uint64, keyID string) error {
  7232  	txn := s.db.WriteTxn(index)
  7233  	defer txn.Abort()
  7234  
  7235  	// find the old key
  7236  	existing, err := txn.First(TableRootKeyMeta, indexID, keyID)
  7237  	if err != nil {
  7238  		return fmt.Errorf("root key metadata lookup failed: %v", err)
  7239  	}
  7240  	if existing == nil {
  7241  		return fmt.Errorf("root key metadata not found")
  7242  	}
  7243  	if err := txn.Delete(TableRootKeyMeta, existing); err != nil {
  7244  		return fmt.Errorf("root key metadata delete failed: %v", err)
  7245  	}
  7246  
  7247  	// update the indexes table
  7248  	if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
  7249  		return fmt.Errorf("index update failed: %v", err)
  7250  	}
  7251  
  7252  	return txn.Commit()
  7253  }
  7254  
  7255  // RootKeyMetas returns an iterator over all root key metadata
  7256  func (s *StateStore) RootKeyMetas(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  7257  	txn := s.db.ReadTxn()
  7258  
  7259  	iter, err := txn.Get(TableRootKeyMeta, indexID)
  7260  	if err != nil {
  7261  		return nil, err
  7262  	}
  7263  
  7264  	ws.Add(iter.WatchCh())
  7265  	return iter, nil
  7266  }
  7267  
  7268  // RootKeyMetaByID returns a specific root key meta
  7269  func (s *StateStore) RootKeyMetaByID(ws memdb.WatchSet, id string) (*structs.RootKeyMeta, error) {
  7270  	txn := s.db.ReadTxn()
  7271  
  7272  	watchCh, raw, err := txn.FirstWatch(TableRootKeyMeta, indexID, id)
  7273  	if err != nil {
  7274  		return nil, fmt.Errorf("root key metadata lookup failed: %v", err)
  7275  	}
  7276  	ws.Add(watchCh)
  7277  
  7278  	if raw != nil {
  7279  		return raw.(*structs.RootKeyMeta), nil
  7280  	}
  7281  	return nil, nil
  7282  }
  7283  
  7284  // GetActiveRootKeyMeta returns the metadata for the currently active root key
  7285  func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMeta, error) {
  7286  	txn := s.db.ReadTxn()
  7287  
  7288  	iter, err := txn.Get(TableRootKeyMeta, indexID)
  7289  	if err != nil {
  7290  		return nil, err
  7291  	}
  7292  	ws.Add(iter.WatchCh())
  7293  
  7294  	for {
  7295  		raw := iter.Next()
  7296  		if raw == nil {
  7297  			break
  7298  		}
  7299  		key := raw.(*structs.RootKeyMeta)
  7300  		if key.Active() {
  7301  			return key, nil
  7302  		}
  7303  	}
  7304  	return nil, nil
  7305  }
  7306  
  7307  // IsRootKeyMetaInUse determines whether a key has been used to sign a workload
  7308  // identity for a live allocation or encrypt any variables
  7309  func (s *StateStore) IsRootKeyMetaInUse(keyID string) (bool, error) {
  7310  	txn := s.db.ReadTxn()
  7311  
  7312  	iter, err := txn.Get(TableAllocs, indexSigningKey, keyID, true)
  7313  	if err != nil {
  7314  		return false, err
  7315  	}
  7316  	alloc := iter.Next()
  7317  	if alloc != nil {
  7318  		return true, nil
  7319  	}
  7320  
  7321  	iter, err = txn.Get(TableVariables, indexKeyID, keyID)
  7322  	if err != nil {
  7323  		return false, err
  7324  	}
  7325  	variable := iter.Next()
  7326  	if variable != nil {
  7327  		return true, nil
  7328  	}
  7329  
  7330  	return false, nil
  7331  }
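
        // Sketch of how a key garbage collector might combine IsRootKeyMetaInUse and
        // DeleteRootKeyMeta before removing old key metadata (store, keyID, and
        // raftIndex are assumed names):
        //
        //	inUse, err := store.IsRootKeyMetaInUse(keyID)
        //	if err != nil {
        //		return err
        //	}
        //	if !inUse {
        //		return store.DeleteRootKeyMeta(raftIndex, keyID)
        //	}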