github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/state/state_store.go

package state

import (
	"context"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/go-bexpr"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/nomad/stream"
	"github.com/hashicorp/nomad/nomad/structs"
)

// Txn is a transaction against a state store.
// This can be a read or write transaction.
type Txn = *txn

// SortOption represents how results can be sorted.
type SortOption bool

const (
	// SortDefault indicates that the result should be returned using the
	// default go-memdb ResultIterator order.
	SortDefault SortOption = false

	// SortReverse indicates that the result should be returned using the
	// reversed go-memdb ResultIterator order.
	SortReverse SortOption = true
)

const (
	// NodeEligibilityEventPlanRejectThreshold is the message used when the node
	// is set to ineligible due to multiple plan failures.
	// This is a preventive measure to signal scheduler workers to not consider
	// the node for future placements.
	// Plan rejections for a node are expected due to the optimistic and
	// concurrent nature of the scheduling process, but repeated failures for
	// the same node may indicate an underlying issue not detected by Nomad.
	// The plan applier keeps track of plan rejection history and will mark
	// nodes as ineligible if they cross a given threshold.
	NodeEligibilityEventPlanRejectThreshold = "Node marked as ineligible for scheduling due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information"

	// NodeRegisterEventRegistered is the message used when the node becomes
	// registered.
	NodeRegisterEventRegistered = "Node registered"

	// NodeRegisterEventReregistered is the message used when the node becomes
	// re-registered.
	NodeRegisterEventReregistered = "Node re-registered"
)

// terminate appends the go-memdb terminator character to s.
//
// We can then use the result for exact matches during prefix
// scans over compound indexes that start with s.
func terminate(s string) string {
	return s + "\x00"
}
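
// For instance, a raw prefix scan like
//
//	iter, err := txn.Get("deployment", "namespace_create_prefix", "prod")
//
// would also match a namespace named "production", whereas
//
//	iter, err := txn.Get("deployment", "namespace_create_prefix", terminate("prod"))
//
// matches only the namespace "prod" while still prefix-scanning the rest of
// the compound index ("prod" is a hypothetical namespace name; see
// DeploymentsByNamespaceOrdered below for a real caller of this pattern).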

// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
type IndexEntry struct {
	Key   string
	Value uint64
}

// StateStoreConfig is used to configure a new state store
type StateStoreConfig struct {
	// Logger is used to output the state store's logs
	Logger hclog.Logger

	// Region is the region of the server embedding the state store.
	Region string

	// EnablePublisher is used to enable or disable the event publisher
	EnablePublisher bool

	// EventBufferSize configures the number of events to hold in memory
	EventBufferSize int64
}

// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	logger hclog.Logger
	db     *changeTrackerDB

	// config is the passed in configuration
	config *StateStoreConfig

	// abandonCh is used to signal watchers that this state store has been
	// abandoned (usually during a restore). This is only ever closed.
	abandonCh chan struct{}

	// TODO: refactor abandonCh to use a context so that both can use the same
	// cancel mechanism.
	stopEventBroker func()
}

type streamACLDelegate struct {
	s *StateStore
}

func (a *streamACLDelegate) TokenProvider() stream.ACLTokenProvider {
	resolver, _ := a.s.Snapshot()
	return resolver
}

// NewStateStore is used to create a new state store
func NewStateStore(config *StateStoreConfig) (*StateStore, error) {
	// Create the MemDB
	db, err := memdb.NewMemDB(stateStoreSchema())
	if err != nil {
		return nil, fmt.Errorf("state store setup failed: %v", err)
	}

	// Create the state store
	ctx, cancel := context.WithCancel(context.TODO())
	s := &StateStore{
		logger:          config.Logger.Named("state_store"),
		config:          config,
		abandonCh:       make(chan struct{}),
		stopEventBroker: cancel,
	}

	if config.EnablePublisher {
		// Create new event publisher using provided config
		broker, err := stream.NewEventBroker(ctx, &streamACLDelegate{s}, stream.EventBrokerCfg{
			EventBufferSize: config.EventBufferSize,
			Logger:          config.Logger,
		})
		if err != nil {
			return nil, fmt.Errorf("creating state store event broker: %w", err)
		}
		s.db = NewChangeTrackerDB(db, broker, eventsFromChanges)
	} else {
		s.db = NewChangeTrackerDB(db, nil, noOpProcessChanges)
	}

	// Initialize the state store with the default namespace.
	if err := s.namespaceInit(); err != nil {
		return nil, fmt.Errorf("enterprise state store initialization failed: %v", err)
	}

	return s, nil
}
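
// A minimal construction sketch; hclog.Default and the literal values here
// are illustrative choices, not requirements:
//
//	store, err := NewStateStore(&StateStoreConfig{
//		Logger:          hclog.Default(),
//		Region:          "global",
//		EnablePublisher: true,
//		EventBufferSize: 100,
//	})
//	if err != nil {
//		// handle setup failure
//	}
//	defer store.StopEventBroker()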

// NewWatchSet returns a new memdb.WatchSet that adds the state store's
// abandonCh as a watcher. This is important in that it will notify when this
// specific state store is no longer valid, usually due to a new snapshot
// being loaded.
func (s *StateStore) NewWatchSet() memdb.WatchSet {
	ws := memdb.NewWatchSet()
	ws.Add(s.AbandonCh())
	return ws
}

func (s *StateStore) EventBroker() (*stream.EventBroker, error) {
	if s.db.publisher == nil {
		return nil, fmt.Errorf("EventBroker not configured")
	}
	return s.db.publisher, nil
}

// namespaceInit ensures the default namespace exists.
func (s *StateStore) namespaceInit() error {
	// Create the default namespace. This is safe to do every time we create the
	// state store. There are two main cases: a brand new cluster, in which case
	// each server will have the same default namespace object, or an existing
	// cluster being restored, in which case any modifications to the default
	// namespace will be overridden by the restore code path.
	defaultNs := &structs.Namespace{
		Name:        structs.DefaultNamespace,
		Description: structs.DefaultNamespaceDescription,
	}

	if err := s.UpsertNamespaces(1, []*structs.Namespace{defaultNs}); err != nil {
		return fmt.Errorf("inserting default namespace failed: %v", err)
	}

	return nil
}

// Config returns the state store configuration.
func (s *StateStore) Config() *StateStoreConfig {
	return s.config
}

// Snapshot is used to create a point in time snapshot. Because
// we use MemDB, we just need to snapshot the state of the underlying
// database.
func (s *StateStore) Snapshot() (*StateSnapshot, error) {
	memDBSnap := s.db.memdb.Snapshot()

	store := StateStore{
		logger: s.logger,
		config: s.config,
	}

	// Create a new change tracker DB that does not publish or track changes
	store.db = NewChangeTrackerDB(memDBSnap, nil, noOpProcessChanges)

	snap := &StateSnapshot{
		StateStore: store,
	}
	return snap, nil
}

// SnapshotMinIndex is used to create a state snapshot where the index is
// guaranteed to be greater than or equal to the index parameter.
//
// Some server operations (such as scheduling) exchange objects via RPC
// concurrent with Raft log application, so they must ensure the state store
// snapshot they are operating on is at or after the index the objects
// retrieved via RPC were applied to the Raft log at.
//
// Callers should maintain their own timer metric as the time this method
// blocks indicates Raft log application latency relative to scheduling.
func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) {
	// Ported from work.go:waitForIndex prior to 0.9

	const backoffBase = 20 * time.Millisecond
	const backoffLimit = 1 * time.Second
	var retries uint
	var retryTimer *time.Timer

	// XXX: Potential optimization is to set up a watch on the state
	// store's index table and only unblock via a trigger rather than
	// polling.
	for {
		// Get the state's current index
		snapshotIndex, err := s.LatestIndex()
		if err != nil {
			return nil, fmt.Errorf("failed to determine state store's index: %v", err)
		}

		// We only need the FSM state to be as recent as the given index
		if snapshotIndex >= index {
			return s.Snapshot()
		}

		// Exponential back off
		retries++
		if retryTimer == nil {
			// First retry, start at baseline
			retryTimer = time.NewTimer(backoffBase)
		} else {
			// Subsequent retry, reset timer
			deadline := 1 << (2 * retries) * backoffBase
			if deadline > backoffLimit {
				deadline = backoffLimit
			}
			retryTimer.Reset(deadline)
		}

		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-retryTimer.C:
		}
	}
}
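
// A caller-side sketch: bounding the wait with a context deadline turns a
// stalled Raft apply pipeline into an error instead of an indefinite block
// (the 5-second timeout and waitIndex are illustrative values):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//	defer cancel()
//	snap, err := store.SnapshotMinIndex(ctx, waitIndex)
//	if errors.Is(err, context.DeadlineExceeded) {
//		// the state store never caught up to waitIndex in time
//	}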

// Restore is used to optimize the efficiency of rebuilding
// state by minimizing the number of transactions and checking
// overhead.
func (s *StateStore) Restore() (*StateRestore, error) {
	txn := s.db.WriteTxnRestore()
	r := &StateRestore{
		txn: txn,
	}
	return r, nil
}

// AbandonCh returns a channel you can wait on to know if the state store was
// abandoned.
func (s *StateStore) AbandonCh() <-chan struct{} {
	return s.abandonCh
}
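
// A watcher sketch: because abandonCh is only ever closed, a receive on it
// acts as a one-shot "this store is stale" signal (someWorkCh is a
// hypothetical placeholder for the caller's own event source):
//
//	select {
//	case <-store.AbandonCh():
//		// a restore replaced this state store; drop it and re-snapshot
//		return
//	case <-someWorkCh:
//		// normal processing
//	}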

// Abandon is used to signal that the given state store has been abandoned.
// Calling this more than one time will panic.
func (s *StateStore) Abandon() {
	s.StopEventBroker()
	close(s.abandonCh)
}

// StopEventBroker calls the cancel func for the state store's event
// publisher. It should be called during server shutdown.
func (s *StateStore) StopEventBroker() {
	s.stopEventBroker()
}

// QueryFn is the definition of a function that can be used to implement a basic
// blocking query against the state store.
type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error)

// BlockingQuery takes a query function and runs the function until the minimum
// query index is met or until the passed context is cancelled.
func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) (
	resp interface{}, index uint64, err error) {

RUN_QUERY:
	// We capture the state store and its abandon channel but pass a snapshot to
	// the blocking query function. We operate on the snapshot so that separate
	// calls to the state store are not all wrapped within the same transaction.
	abandonCh := s.AbandonCh()
	snap, _ := s.Snapshot()
	stateSnap := &snap.StateStore

	// We can skip all watch tracking if this isn't a blocking query.
	var ws memdb.WatchSet
	if minIndex > 0 {
		ws = memdb.NewWatchSet()

		// This channel will be closed if a snapshot is restored and the
		// whole state store is abandoned.
		ws.Add(abandonCh)
	}

	resp, index, err = query(ws, stateSnap)
	if err != nil {
		return nil, index, err
	}

	// We haven't reached the min-index yet.
	if minIndex > 0 && index <= minIndex {
		if err := ws.WatchCtx(ctx); err != nil {
			return nil, index, err
		}

		goto RUN_QUERY
	}

	return resp, index, nil
}
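
// A QueryFn sketch: fetch a node and report the index to block on. A query
// should add its watch channels to ws and return the relevant table index
// (nodeID, minIndex, and ctx are hypothetical caller-supplied values, and
// store.Index is assumed here to return the last index recorded for a table):
//
//	queryNode := func(ws memdb.WatchSet, store *StateStore) (interface{}, uint64, error) {
//		node, err := store.NodeByID(ws, nodeID)
//		if err != nil {
//			return nil, 0, err
//		}
//		index, err := store.Index("nodes")
//		if err != nil {
//			return nil, 0, err
//		}
//		return node, index, nil
//	}
//	resp, index, err := store.BlockingQuery(queryNode, minIndex, ctx)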

// UpsertPlanResults is used to upsert the results of a plan.
func (s *StateStore) UpsertPlanResults(msgType structs.MessageType, index uint64, results *structs.ApplyPlanResultsRequest) error {
	snapshot, err := s.Snapshot()
	if err != nil {
		return err
	}

	allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped)
	if err != nil {
		return err
	}

	allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted)
	if err != nil {
		return err
	}

	// COMPAT 0.11: Remove this denormalization when NodePreemptions is removed
	results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions)
	if err != nil {
		return err
	}

	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Mark nodes as ineligible.
	for _, nodeID := range results.IneligibleNodes {
		s.logger.Warn("marking node as ineligible due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information", "node_id", nodeID)

		nodeEvent := structs.NewNodeEvent().
			SetSubsystem(structs.NodeEventSubsystemScheduler).
			SetMessage(NodeEligibilityEventPlanRejectThreshold)

		err := s.updateNodeEligibilityImpl(index, nodeID,
			structs.NodeSchedulingIneligible, results.UpdatedAt, nodeEvent, txn)
		if err != nil {
			return err
		}
	}

	// Upsert the newly created or updated deployment
	if results.Deployment != nil {
		if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil {
			return err
		}
	}

	// Update the status of deployments affected by the plan.
	if len(results.DeploymentUpdates) != 0 {
		if err := s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn); err != nil {
			return err
		}
	}

	if results.EvalID != "" {
		// Update the modify index of the eval id
		if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil {
			return err
		}
	}

	numAllocs := 0
	if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 {
		// COMPAT 0.11: This branch will be removed, when Alloc is removed
		// Attach the job to all the allocations. It is pulled out in the payload to
		// avoid the redundancy of encoding, but should be denormalized prior to
		// being inserted into MemDB.
		addComputedAllocAttrs(results.Alloc, results.Job)
		numAllocs = len(results.Alloc) + len(results.NodePreemptions)
	} else {
		// Attach the job to all the allocations. It is pulled out in the payload to
		// avoid the redundancy of encoding, but should be denormalized prior to
		// being inserted into MemDB.
		addComputedAllocAttrs(results.AllocsUpdated, results.Job)
		numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted)
	}

	allocsToUpsert := make([]*structs.Allocation, 0, numAllocs)

	// COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed
	allocsToUpsert = append(allocsToUpsert, results.Alloc...)
	allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...)

	allocsToUpsert = append(allocsToUpsert, allocsStopped...)
	allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...)
	allocsToUpsert = append(allocsToUpsert, allocsPreempted...)

	// handle upgrade path
	for _, alloc := range allocsToUpsert {
		alloc.Canonicalize()
	}

	if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil {
		return err
	}

	// Upsert followup evals for allocs that were preempted
	for _, eval := range results.PreemptionEvals {
		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
			return err
		}
	}

	return txn.Commit()
}

// addComputedAllocAttrs adds the computed/derived attributes to the allocation.
// This method is used when an allocation is being denormalized.
func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) {
	structs.DenormalizeAllocationJobs(job, allocs)

	// COMPAT(0.11): Remove in 0.11
	// Calculate the total resources of allocations. It is pulled out in the
	// payload to avoid encoding something that can be computed, but should be
	// denormalized prior to being inserted into MemDB.
	for _, alloc := range allocs {
		if alloc.Resources != nil {
			continue
		}

		alloc.Resources = new(structs.Resources)
		for _, task := range alloc.TaskResources {
			alloc.Resources.Add(task)
		}

		// Add the shared resources
		alloc.Resources.Add(alloc.SharedResources)
	}
}

// upsertDeploymentUpdates updates the deployments given the passed status
// updates.
func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *txn) error {
	for _, u := range updates {
		if err := s.updateDeploymentStatusImpl(index, u, txn); err != nil {
			return err
		}
	}

	return nil
}

// UpsertJobSummary upserts a job summary into the state store.
func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Check if the job summary already exists
	existing, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID)
	if err != nil {
		return fmt.Errorf("job summary lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		jobSummary.CreateIndex = existing.(*structs.JobSummary).CreateIndex
		jobSummary.ModifyIndex = index
	} else {
		jobSummary.CreateIndex = index
		jobSummary.ModifyIndex = index
	}

	// Update the index
	if err := txn.Insert("job_summary", jobSummary); err != nil {
		return err
	}

	// Update the indexes table for job summary
	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// DeleteJobSummary deletes the job summary with the given ID. This is for
// testing purposes only.
func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Delete the job summary
	if _, err := txn.DeleteAll("job_summary", "id", namespace, id); err != nil {
		return fmt.Errorf("deleting job summary failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// UpsertDeployment is used to insert or update a new deployment.
func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()
	if err := s.upsertDeploymentImpl(index, deployment, txn); err != nil {
		return err
	}
	return txn.Commit()
}

func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *txn) error {
	// Check if the deployment already exists
	existing, err := txn.First("deployment", "id", deployment.ID)
	if err != nil {
		return fmt.Errorf("deployment lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		deployment.CreateIndex = existing.(*structs.Deployment).CreateIndex
		deployment.ModifyIndex = index
	} else {
		deployment.CreateIndex = index
		deployment.ModifyIndex = index
	}

	// Insert the deployment
	if err := txn.Insert("deployment", deployment); err != nil {
		return err
	}

	// Update the indexes table for deployment
	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// If the deployment is being marked as complete, set the job to stable.
	if deployment.Status == structs.DeploymentStatusSuccessful {
		if err := s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn); err != nil {
			return fmt.Errorf("failed to update job stability: %v", err)
		}
	}

	return nil
}

func (s *StateStore) Deployments(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var it memdb.ResultIterator
	var err error

	switch sort {
	case SortReverse:
		it, err = txn.GetReverse("deployment", "create")
	default:
		it, err = txn.Get("deployment", "create")
	}

	if err != nil {
		return nil, err
	}

	ws.Add(it.WatchCh())

	return it, nil
}
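
// An iteration sketch: with SortReverse the "create" index is walked from
// highest create index to lowest, so deployments come back newest first:
//
//	ws := store.NewWatchSet()
//	iter, err := store.Deployments(ws, SortReverse)
//	if err != nil {
//		// handle err
//	}
//	for raw := iter.Next(); raw != nil; raw = iter.Next() {
//		d := raw.(*structs.Deployment)
//		// d is the next-newest deployment
//	}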

func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	// Walk the deployments table for the given namespace
	iter, err := txn.Get("deployment", "namespace", namespace)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

func (s *StateStore) DeploymentsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		it    memdb.ResultIterator
		err   error
		exact = terminate(namespace)
	)

	switch sort {
	case SortReverse:
		it, err = txn.GetReverse("deployment", "namespace_create_prefix", exact)
	default:
		it, err = txn.Get("deployment", "namespace_create_prefix", exact)
	}

	if err != nil {
		return nil, err
	}

	ws.Add(it.WatchCh())

	return it, nil
}

func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var iter memdb.ResultIterator
	var err error

	// Walk the deployments table by deployment ID prefix
	switch sort {
	case SortReverse:
		iter, err = txn.GetReverse("deployment", "id_prefix", deploymentID)
	default:
		iter, err = txn.Get("deployment", "id_prefix", deploymentID)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	// Wrap the iterator in a filter
	wrap := memdb.NewFilterIterator(iter, deploymentNamespaceFilter(namespace))
	return wrap, nil
}

// deploymentNamespaceFilter returns a filter function that filters out all
// deployments not in the given namespace.
func deploymentNamespaceFilter(namespace string) func(interface{}) bool {
	return func(raw interface{}) bool {
		d, ok := raw.(*structs.Deployment)
		if !ok {
			return true
		}

		return d.Namespace != namespace
	}
}

func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) {
	txn := s.db.ReadTxn()
	return s.deploymentByIDImpl(ws, deploymentID, txn)
}

func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *txn) (*structs.Deployment, error) {
	watchCh, existing, err := txn.FirstWatch("deployment", "id", deploymentID)
	if err != nil {
		return nil, fmt.Errorf("deployment lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if existing != nil {
		return existing.(*structs.Deployment), nil
	}

	return nil, nil
}

func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) {
	txn := s.db.ReadTxn()

	var job *structs.Job
	// Read job from state store
	_, existing, err := txn.FirstWatch("jobs", "id", namespace, jobID)
	if err != nil {
		return nil, fmt.Errorf("job lookup failed: %v", err)
	}
	if existing != nil {
		job = existing.(*structs.Job)
	}

	// Get an iterator over the deployments
	iter, err := txn.Get("deployment", "job", namespace, jobID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var out []*structs.Deployment
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		d := raw.(*structs.Deployment)

		// If the deployment belongs to a job with the same ID but a different
		// create index, and we are not fetching all deployments for that job
		// ID, then skip it.
		if !all && job != nil && d.JobCreateIndex != job.CreateIndex {
			continue
		}
		out = append(out, d)
	}

	return out, nil
}

// LatestDeploymentByJobID returns the latest deployment for the given job. The
// latest is determined strictly by CreateIndex.
func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) {
	txn := s.db.ReadTxn()

	// Get an iterator over the deployments
	iter, err := txn.Get("deployment", "job", namespace, jobID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var out *structs.Deployment
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		d := raw.(*structs.Deployment)
		if out == nil || out.CreateIndex < d.CreateIndex {
			out = d
		}
	}

	return out, nil
}

// DeleteDeployment is used to delete a set of deployments by ID
func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	if len(deploymentIDs) == 0 {
		return nil
	}

	for _, deploymentID := range deploymentIDs {
		// Lookup the deployment
		existing, err := txn.First("deployment", "id", deploymentID)
		if err != nil {
			return fmt.Errorf("deployment lookup failed: %v", err)
		}
		if existing == nil {
			return fmt.Errorf("deployment not found")
		}

		// Delete the deployment
		if err := txn.Delete("deployment", existing); err != nil {
			return fmt.Errorf("deployment delete failed: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// UpsertScalingEvent is used to insert a new scaling event.
// Only the most recent structs.JobTrackedScalingEvents events are kept per
// task group.
func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Get the existing events
	existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID)
	if err != nil {
		return fmt.Errorf("scaling event lookup failed: %v", err)
	}

	var jobEvents *structs.JobScalingEvents
	if existing != nil {
		jobEvents = existing.(*structs.JobScalingEvents)
	} else {
		jobEvents = &structs.JobScalingEvents{
			Namespace:     req.Namespace,
			JobID:         req.JobID,
			ScalingEvents: make(map[string][]*structs.ScalingEvent),
		}
	}

	jobEvents.ModifyIndex = index
	req.ScalingEvent.CreateIndex = index

	events := jobEvents.ScalingEvents[req.TaskGroup]
	// Prepend this latest event
	events = append(
		[]*structs.ScalingEvent{req.ScalingEvent},
		events...,
	)
	// Truncate older events
	if len(events) > structs.JobTrackedScalingEvents {
		events = events[0:structs.JobTrackedScalingEvents]
	}
	jobEvents.ScalingEvents[req.TaskGroup] = events

	// Insert the new event
	if err := txn.Insert("scaling_event", jobEvents); err != nil {
		return fmt.Errorf("scaling event insert failed: %v", err)
	}

	// Update the indexes table for scaling_event
	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// ScalingEvents returns an iterator over all the job scaling events
func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	// Walk the entire scaling_event table
	iter, err := txn.Get("scaling_event", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) {
	txn := s.db.ReadTxn()

	watchCh, existing, err := txn.FirstWatch("scaling_event", "id", namespace, jobID)
	if err != nil {
		return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if existing != nil {
		events := existing.(*structs.JobScalingEvents)
		return events.ScalingEvents, events.ModifyIndex, nil
	}
	return nil, 0, nil
}
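
// A read sketch: events come back grouped by task group, newest first, along
// with the index of the last modification ("default" and "web" are
// hypothetical namespace and job ID values):
//
//	ws := store.NewWatchSet()
//	byGroup, modifyIndex, err := store.ScalingEventsByJob(ws, "default", "web")
//	if err == nil {
//		for group, events := range byGroup {
//			// events[0] is the most recent event recorded for group
//		}
//	}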

// UpsertNode is used to register a node or update a node definition.
// This is assumed to be triggered by the client, so we retain the value
// of drain/eligibility, which is set by the scheduler.
func (s *StateStore) UpsertNode(msgType structs.MessageType, index uint64, node *structs.Node) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	err := upsertNodeTxn(txn, index, node)
	if err != nil {
		return err
	}
	return txn.Commit()
}

func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error {
	// Check if the node already exists
	existing, err := txn.First("nodes", "id", node.ID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		exist := existing.(*structs.Node)
		node.CreateIndex = exist.CreateIndex
		node.ModifyIndex = index

		// Retain node events that have already been set on the node
		node.Events = exist.Events

		// If we are transitioning from down, record the re-registration
		if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown {
			appendNodeEvents(index, node, []*structs.NodeEvent{
				structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
					SetMessage(NodeRegisterEventReregistered).
					SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))})
		}

		node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
		node.DrainStrategy = exist.DrainStrategy                 // Retain the drain strategy
		node.LastDrain = exist.LastDrain                         // Retain the drain metadata
	} else {
		// Because this is the first time the node is being registered, we should
		// also create a node registration event
		nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
			SetMessage(NodeRegisterEventRegistered).
			SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))
		node.Events = []*structs.NodeEvent{nodeEvent}
		node.CreateIndex = index
		node.ModifyIndex = index
	}

	// Insert the node
	if err := txn.Insert("nodes", node); err != nil {
		return fmt.Errorf("node insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	if err := upsertCSIPluginsForNode(txn, node, index); err != nil {
		return fmt.Errorf("csi plugin update failed: %v", err)
	}

	return nil
}

// DeleteNode deregisters a batch of nodes
func (s *StateStore) DeleteNode(msgType structs.MessageType, index uint64, nodes []string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	err := deleteNodeTxn(txn, index, nodes)
	if err != nil {
		return err
	}
	return txn.Commit()
}

func deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
	if len(nodes) == 0 {
		return fmt.Errorf("node ids missing")
	}

	for _, nodeID := range nodes {
		existing, err := txn.First("nodes", "id", nodeID)
		if err != nil {
			return fmt.Errorf("node lookup failed: %s: %v", nodeID, err)
		}
		if existing == nil {
			return fmt.Errorf("node not found: %s", nodeID)
		}

		// Delete the node
		if err := txn.Delete("nodes", existing); err != nil {
			return fmt.Errorf("node delete failed: %s: %v", nodeID, err)
		}

		node := existing.(*structs.Node)
		if err := deleteNodeCSIPlugins(txn, node, index); err != nil {
			return fmt.Errorf("csi plugin delete failed: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// UpdateNodeStatus is used to update the status of a node
func (s *StateStore) UpdateNodeStatus(msgType structs.MessageType, index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	if err := s.updateNodeStatusTxn(txn, nodeID, status, updatedAt, event); err != nil {
		return err
	}

	return txn.Commit()
}

func (s *StateStore) updateNodeStatusTxn(txn *txn, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
	// Lookup the node
	existing, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("node not found")
	}

	// Copy the existing node
	existingNode := existing.(*structs.Node)
	copyNode := existingNode.Copy()
	copyNode.StatusUpdatedAt = updatedAt

	// Add the event if given
	if event != nil {
		appendNodeEvents(txn.Index, copyNode, []*structs.NodeEvent{event})
	}

	// Update the status in the copy
	copyNode.Status = status
	copyNode.ModifyIndex = txn.Index

	// Insert the node
	if err := txn.Insert("nodes", copyNode); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", txn.Index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// BatchUpdateNodeDrain is used to update the drain of a set of nodes.
// This is currently only called when node drain is completed by the drainer.
func (s *StateStore) BatchUpdateNodeDrain(msgType structs.MessageType, index uint64, updatedAt int64,
	updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()
	for node, update := range updates {
		if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible, updatedAt,
			events[node], nil, "", true); err != nil {
			return err
		}
	}
	return txn.Commit()
}

// UpdateNodeDrain is used to update the drain of a node
func (s *StateStore) UpdateNodeDrain(msgType structs.MessageType, index uint64, nodeID string,
	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
	event *structs.NodeEvent, drainMeta map[string]string, accessorId string) error {

	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()
	if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible, updatedAt, event,
		drainMeta, accessorId, false); err != nil {

		return err
	}
	return txn.Commit()
}

func (s *StateStore) updateNodeDrainImpl(txn *txn, index uint64, nodeID string,
	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
	event *structs.NodeEvent, drainMeta map[string]string, accessorId string,
	drainCompleted bool) error {

	// Lookup the node
	existing, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("node not found")
	}

	// Copy the existing node
	existingNode := existing.(*structs.Node)
	updatedNode := existingNode.Copy()
	updatedNode.StatusUpdatedAt = updatedAt

	// Add the event if given
	if event != nil {
		appendNodeEvents(index, updatedNode, []*structs.NodeEvent{event})
	}

	// Update the drain in the copy
	updatedNode.DrainStrategy = drain
	if drain != nil {
		updatedNode.SchedulingEligibility = structs.NodeSchedulingIneligible
	} else if markEligible {
		updatedNode.SchedulingEligibility = structs.NodeSchedulingEligible
	}

	// Update LastDrain
	updateTime := time.Unix(updatedAt, 0)

	// If the drain strategy isn't set before or after, this wasn't a drain
	// operation, so LastDrain does not need to change.
	drainNoop := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy == nil
	// Otherwise, updatedNode.LastDrain should be set by the time this method
	// returns: when starting a new drain operation we create a new LastDrain,
	// and otherwise we update the existing one.
	startedDraining := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy != nil
	if !drainNoop {
		if startedDraining {
			updatedNode.LastDrain = &structs.DrainMetadata{
				StartedAt: updateTime,
				Meta:      drainMeta,
			}
		} else if updatedNode.LastDrain == nil {
			// if already draining and LastDrain doesn't exist, we need to create a new one
			// this could happen if we upgraded to 1.1.x during a drain
			updatedNode.LastDrain = &structs.DrainMetadata{
				// we don't have sub-second accuracy on these fields, so truncate this
				StartedAt: time.Unix(existingNode.DrainStrategy.StartedAt.Unix(), 0),
				Meta:      drainMeta,
			}
		}

		updatedNode.LastDrain.UpdatedAt = updateTime

		// won't have new metadata on drain complete; keep the existing operator-provided metadata
		// also, keep existing if they didn't provide it
		if len(drainMeta) != 0 {
			updatedNode.LastDrain.Meta = drainMeta
		}

		// we won't have an accessor ID on drain complete, so don't overwrite the existing one
		if accessorId != "" {
			updatedNode.LastDrain.AccessorID = accessorId
		}

		if updatedNode.DrainStrategy != nil {
			updatedNode.LastDrain.Status = structs.DrainStatusDraining
		} else if drainCompleted {
			updatedNode.LastDrain.Status = structs.DrainStatusComplete
		} else {
			updatedNode.LastDrain.Status = structs.DrainStatusCanceled
		}
	}

	updatedNode.ModifyIndex = index

	// Insert the node
	if err := txn.Insert("nodes", updatedNode); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// UpdateNodeEligibility is used to update the scheduling eligibility of a node
func (s *StateStore) UpdateNodeEligibility(msgType structs.MessageType, index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()
	if err := s.updateNodeEligibilityImpl(index, nodeID, eligibility, updatedAt, event, txn); err != nil {
		return err
	}
	return txn.Commit()
}

func (s *StateStore) updateNodeEligibilityImpl(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent, txn *txn) error {
	// Lookup the node
	existing, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("node not found")
	}

	// Copy the existing node
	existingNode := existing.(*structs.Node)
	copyNode := existingNode.Copy()
	copyNode.StatusUpdatedAt = updatedAt

	// Add the event if given
	if event != nil {
		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
	}

	// Check if this is a valid action
	if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible {
		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
	}

	// Update the eligibility in the copy
	copyNode.SchedulingEligibility = eligibility
	copyNode.ModifyIndex = index

	// Insert the node
	if err := txn.Insert("nodes", copyNode); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// UpsertNodeEvents adds the node events to the nodes, rotating events as
// necessary.
func (s *StateStore) UpsertNodeEvents(msgType structs.MessageType, index uint64, nodeEvents map[string][]*structs.NodeEvent) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	for nodeID, events := range nodeEvents {
		if err := s.upsertNodeEvents(index, nodeID, events, txn); err != nil {
			return err
		}
	}

	return txn.Commit()
}
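
// A usage sketch: batch events for several nodes into one indexed write. The
// event text is illustrative, and structs.UpsertNodeEventsType is assumed to
// be the message type used for node event upserts:
//
//	events := map[string][]*structs.NodeEvent{
//		nodeID: {
//			structs.NewNodeEvent().
//				SetSubsystem(structs.NodeEventSubsystemCluster).
//				SetMessage("Node heartbeat missed"),
//		},
//	}
//	if err := store.UpsertNodeEvents(structs.UpsertNodeEventsType, index, events); err != nil {
//		// handle err
//	}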

// upsertNodeEvents upserts node events for a given node. It also maintains
// that only a fixed number of node events are ever stored simultaneously,
// deleting older events once this bound has been reached.
func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *txn) error {
	// Lookup the node
	existing, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("node not found")
	}

	// Copy the existing node
	existingNode := existing.(*structs.Node)
	copyNode := existingNode.Copy()
	appendNodeEvents(index, copyNode, events)

	// Insert the node
	if err := txn.Insert("nodes", copyNode); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// appendNodeEvents is a helper that takes a node and new events and appends
// them, pruning older events as needed.
func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) {
	// Add the events, updating the indexes
	for _, e := range events {
		e.CreateIndex = index
		node.Events = append(node.Events, e)
	}

	// Keep node events pruned to not exceed the max allowed
	if l := len(node.Events); l > structs.MaxRetainedNodeEvents {
		delta := l - structs.MaxRetainedNodeEvents
		node.Events = node.Events[delta:]
	}
}
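
// A pruning sketch: assuming structs.MaxRetainedNodeEvents is 10 (the actual
// value is defined in the structs package, not here), appending 3 events to a
// node that already holds 9 gives l = 12 and delta = 2, so the slice keeps
// only the 10 most recent events.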

// upsertCSIPluginsForNode indexes CSI plugins for volume retrieval, with
// health. It's called from upsertNodeTxn, so that event-driven health changes
// are updated.
func upsertCSIPluginsForNode(txn *txn, node *structs.Node, index uint64) error {

	upsertFn := func(info *structs.CSIInfo) error {
		raw, err := txn.First("csi_plugins", "id", info.PluginID)
		if err != nil {
			return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err)
		}

		var plug *structs.CSIPlugin
		if raw != nil {
			plug = raw.(*structs.CSIPlugin).Copy()
		} else {
			if !info.Healthy {
				// we don't want to create new plugins for unhealthy
				// allocs, otherwise we'd recreate the plugin when we
				// get the update for the alloc becoming terminal
				return nil
			}
			plug = structs.NewCSIPlugin(info.PluginID, index)
		}

		// The plugin may have been created by the job being updated, in which
		// case this data will not be configured; it's only available from the
		// fingerprint system.
		plug.Provider = info.Provider
		plug.Version = info.ProviderVersion

		err = plug.AddPlugin(node.ID, info)
		if err != nil {
			return err
		}

		plug.ModifyIndex = index

		err = txn.Insert("csi_plugins", plug)
		if err != nil {
			return fmt.Errorf("csi_plugins insert error: %v", err)
		}

		return nil
	}

	inUseController := map[string]struct{}{}
	inUseNode := map[string]struct{}{}

	for _, info := range node.CSIControllerPlugins {
		err := upsertFn(info)
		if err != nil {
			return err
		}
		inUseController[info.PluginID] = struct{}{}
	}

	for _, info := range node.CSINodePlugins {
		err := upsertFn(info)
		if err != nil {
			return err
		}
		inUseNode[info.PluginID] = struct{}{}
	}

	// Remove the client node from any plugin that is no longer running on it.
	iter, err := txn.Get("csi_plugins", "id")
	if err != nil {
		return fmt.Errorf("csi_plugins lookup failed: %v", err)
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		plug, ok := raw.(*structs.CSIPlugin)
		if !ok {
			continue
		}
		plug = plug.Copy()

		var hadDelete bool
		if _, ok := inUseController[plug.ID]; !ok {
			if _, asController := plug.Controllers[node.ID]; asController {
				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeController)
				if err != nil {
					return err
				}
				hadDelete = true
			}
		}
		if _, ok := inUseNode[plug.ID]; !ok {
			if _, asNode := plug.Nodes[node.ID]; asNode {
				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeNode)
				if err != nil {
					return err
				}
				hadDelete = true
			}
		}
		// we check this flag both for performance and to make sure we
		// don't delete a plugin when registering a node plugin but
		// no controller
		if hadDelete {
			err = updateOrGCPlugin(index, txn, plug)
			if err != nil {
				return err
			}
		}
	}

	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// deleteNodeCSIPlugins cleans up CSIInfo node health status, called in DeleteNode
func deleteNodeCSIPlugins(txn *txn, node *structs.Node, index uint64) error {
	if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 {
		return nil
	}

	names := map[string]struct{}{}
	for _, info := range node.CSIControllerPlugins {
		names[info.PluginID] = struct{}{}
	}
	for _, info := range node.CSINodePlugins {
		names[info.PluginID] = struct{}{}
	}

	for id := range names {
		raw, err := txn.First("csi_plugins", "id", id)
		if err != nil {
			return fmt.Errorf("csi_plugins lookup error %s: %v", id, err)
		}
		if raw == nil {
			// plugin may have been deregistered but we didn't
			// update the fingerprint yet
			continue
		}

		plug := raw.(*structs.CSIPlugin).Copy()
		err = plug.DeleteNode(node.ID)
		if err != nil {
			return err
		}
		err = updateOrGCPlugin(index, txn, plug)
		if err != nil {
			return err
		}
	}

	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// updateOrGCPlugin updates a plugin but will delete it if the plugin is empty
func updateOrGCPlugin(index uint64, txn Txn, plug *structs.CSIPlugin) error {
	plug.ModifyIndex = index

	if plug.IsEmpty() {
		err := txn.Delete("csi_plugins", plug)
		if err != nil {
			return fmt.Errorf("csi_plugins delete error: %v", err)
		}
	} else {
		err := txn.Insert("csi_plugins", plug)
		if err != nil {
			return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
		}
	}
	return nil
}

// deleteJobFromPlugins removes the allocations of this job from any plugins the job is
// running, possibly deleting the plugin if it's no longer in use. It's called in DeleteJobTxn
func (s *StateStore) deleteJobFromPlugins(index uint64, txn Txn, job *structs.Job) error {
	ws := memdb.NewWatchSet()
	summary, err := s.JobSummaryByID(ws, job.Namespace, job.ID)
	if err != nil {
		return fmt.Errorf("error getting job summary: %v", err)
	}

	allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false)
	if err != nil {
		return fmt.Errorf("error getting allocations: %v", err)
	}

	type pair struct {
		pluginID string
		alloc    *structs.Allocation
	}

	plugAllocs := []*pair{}
	found := map[string]struct{}{}

	// Find plugins for allocs that belong to this job
	for _, a := range allocs {
		tg := a.Job.LookupTaskGroup(a.TaskGroup)
		found[tg.Name] = struct{}{}
		for _, t := range tg.Tasks {
			if t.CSIPluginConfig == nil {
				continue
			}
			plugAllocs = append(plugAllocs, &pair{
				pluginID: t.CSIPluginConfig.ID,
				alloc:    a,
			})
		}
	}

	// Find any plugins that do not yet have allocs for this job
	for _, tg := range job.TaskGroups {
		if _, ok := found[tg.Name]; ok {
			continue
		}

		for _, t := range tg.Tasks {
			if t.CSIPluginConfig == nil {
				continue
			}
			plugAllocs = append(plugAllocs, &pair{
				pluginID: t.CSIPluginConfig.ID,
			})
		}
	}

	plugins := map[string]*structs.CSIPlugin{}

	for _, x := range plugAllocs {
		plug, ok := plugins[x.pluginID]

		if !ok {
			plug, err = s.CSIPluginByIDTxn(txn, nil, x.pluginID)
			if err != nil {
				return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err)
			}
			if plug == nil {
				// plugin was never successfully registered or has been
				// GC'd out from under us
				continue
			}
			// only copy once, so we update the same plugin on each alloc
			plugins[x.pluginID] = plug.Copy()
			plug = plugins[x.pluginID]
		}

		if x.alloc == nil {
			continue
		}
		err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID)
		if err != nil {
			return err
		}
	}

	for _, plug := range plugins {
		plug.DeleteJob(job, summary)
		err = updateOrGCPlugin(index, txn, plug)
		if err != nil {
			return err
		}
	}

	if len(plugins) > 0 {
		if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	return nil
}
  1550  
  1551  // NodeByID is used to lookup a node by ID
  1552  func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
  1553  	txn := s.db.ReadTxn()
  1554  
  1555  	watchCh, existing, err := txn.FirstWatch("nodes", "id", nodeID)
  1556  	if err != nil {
  1557  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1558  	}
  1559  	ws.Add(watchCh)
  1560  
  1561  	if existing != nil {
  1562  		return existing.(*structs.Node), nil
  1563  	}
  1564  	return nil, nil
  1565  }
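
        // A minimal blocking-query sketch (illustrative only; ctx and nodeID are
        // assumed to be supplied by the caller): the channel registered on ws
        // fires when the node or the table changes, so callers can wait without
        // polling and then re-run the query.
        //
        //	ws := memdb.NewWatchSet()
        //	node, err := store.NodeByID(ws, nodeID)
        //	if err != nil {
        //		return err
        //	}
        //	_ = node // act on the current value
        //	if err := ws.WatchCtx(ctx); err == nil {
        //		// something changed; re-run NodeByID for the new state
        //	}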
  1566  
  1567  // NodesByIDPrefix is used to lookup nodes by prefix
  1568  func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  1569  	txn := s.db.ReadTxn()
  1570  
  1571  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
  1572  	if err != nil {
  1573  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1574  	}
  1575  	ws.Add(iter.WatchCh())
  1576  
  1577  	return iter, nil
  1578  }
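
        // Result iterators follow the standard go-memdb consumption pattern; a
        // hedged sketch (the "f00d" prefix is illustrative):
        //
        //	iter, err := store.NodesByIDPrefix(ws, "f00d")
        //	if err != nil {
        //		return err
        //	}
        //	for raw := iter.Next(); raw != nil; raw = iter.Next() {
        //		node := raw.(*structs.Node)
        //		fmt.Println(node.ID, node.Status)
        //	}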
  1579  
  1580  // NodeBySecretID is used to lookup a node by SecretID
  1581  func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) {
  1582  	txn := s.db.ReadTxn()
  1583  
  1584  	watchCh, existing, err := txn.FirstWatch("nodes", "secret_id", secretID)
  1585  	if err != nil {
  1586  		return nil, fmt.Errorf("node lookup by SecretID failed: %v", err)
  1587  	}
  1588  	ws.Add(watchCh)
  1589  
  1590  	if existing != nil {
  1591  		return existing.(*structs.Node), nil
  1592  	}
  1593  	return nil, nil
  1594  }
  1595  
  1596  // Nodes returns an iterator over all the nodes
  1597  func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1598  	txn := s.db.ReadTxn()
  1599  
  1600  	// Walk the entire nodes table
  1601  	iter, err := txn.Get("nodes", "id")
  1602  	if err != nil {
  1603  		return nil, err
  1604  	}
  1605  	ws.Add(iter.WatchCh())
  1606  	return iter, nil
  1607  }
  1608  
  1609  // UpsertJob is used to register a job or update a job definition
  1610  func (s *StateStore) UpsertJob(msgType structs.MessageType, index uint64, job *structs.Job) error {
  1611  	txn := s.db.WriteTxnMsgT(msgType, index)
  1612  	defer txn.Abort()
  1613  	if err := s.upsertJobImpl(index, job, false, txn); err != nil {
  1614  		return err
  1615  	}
  1616  	return txn.Commit()
  1617  }
  1618  
  1619  // UpsertJobTxn is used to register a job or update a job definition, like
  1620  // UpsertJob, but in an existing transaction. Useful when making multiple modifications atomically.
  1621  func (s *StateStore) UpsertJobTxn(index uint64, job *structs.Job, txn Txn) error {
  1622  	return s.upsertJobImpl(index, job, false, txn)
  1623  }
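
        // A hedged sketch of how the Txn variants compose (internal use only,
        // since the underlying transaction type is unexported): several writes
        // share one transaction so they commit or abort together.
        //
        //	txn := s.db.WriteTxn(index)
        //	defer txn.Abort()
        //	if err := s.UpsertJobTxn(index, job, txn); err != nil {
        //		return err
        //	}
        //	if err := s.DeletePeriodicLaunchTxn(index, job.Namespace, job.ID, txn); err != nil {
        //		return err
        //	}
        //	return txn.Commit()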
  1624  
  1625  // upsertJobImpl is the implementation for registering a job or updating a job definition
  1626  func (s *StateStore) upsertJobImpl(index uint64, job *structs.Job, keepVersion bool, txn *txn) error {
  1627  	// Assert the namespace exists
  1628  	if exists, err := s.namespaceExists(txn, job.Namespace); err != nil {
  1629  		return err
  1630  	} else if !exists {
  1631  		return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace)
  1632  	}
  1633  
  1634  	// Check if the job already exists
  1635  	existing, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1636  	var existingJob *structs.Job
  1637  	if err != nil {
  1638  		return fmt.Errorf("job lookup failed: %v", err)
  1639  	}
  1640  
  1641  	// Set up the indexes correctly
  1642  	if existing != nil {
  1643  		job.CreateIndex = existing.(*structs.Job).CreateIndex
  1644  		job.ModifyIndex = index
  1645  
  1646  		existingJob = existing.(*structs.Job)
  1647  
  1648  		// Bump the version unless asked to keep it. This should only be done
  1649  		// when changing an internal field such as Stable. A spec change should
  1650  		// always come with a version bump
  1651  		if !keepVersion {
  1652  			job.JobModifyIndex = index
  1653  			if job.Version <= existingJob.Version {
  1654  				job.Version = existingJob.Version + 1
  1655  			}
  1656  		}
  1657  
  1658  		// Compute the job status
  1659  		var err error
  1660  		job.Status, err = s.getJobStatus(txn, job, false)
  1661  		if err != nil {
  1662  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1663  		}
  1664  	} else {
  1665  		job.CreateIndex = index
  1666  		job.ModifyIndex = index
  1667  		job.JobModifyIndex = index
  1668  
  1669  		if err := s.setJobStatus(index, txn, job, false, ""); err != nil {
  1670  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1671  		}
  1672  
  1673  		// Have to get the job again since it could have been updated
  1674  		updated, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1675  		if err != nil {
  1676  			return fmt.Errorf("job lookup failed: %v", err)
  1677  		}
  1678  		if updated != nil {
  1679  			job = updated.(*structs.Job)
  1680  		}
  1681  	}
  1682  
  1683  	if err := s.updateSummaryWithJob(index, job, txn); err != nil {
  1684  		return fmt.Errorf("unable to create job summary: %v", err)
  1685  	}
  1686  
  1687  	if err := s.upsertJobVersion(index, job, txn); err != nil {
  1688  		return fmt.Errorf("unable to upsert job into job_version table: %v", err)
  1689  	}
  1690  
  1691  	if err := s.updateJobScalingPolicies(index, job, txn); err != nil {
  1692  		return fmt.Errorf("unable to update job scaling policies: %v", err)
  1693  	}
  1694  
  1695  	if err := s.updateJobRecommendations(index, txn, existingJob, job); err != nil {
  1696  		return fmt.Errorf("unable to update job recommendations: %v", err)
  1697  	}
  1698  
  1699  	if err := s.updateJobCSIPlugins(index, job, existingJob, txn); err != nil {
  1700  		return fmt.Errorf("unable to update job csi plugins: %v", err)
  1701  	}
  1702  
  1703  	// Insert the job
  1704  	if err := txn.Insert("jobs", job); err != nil {
  1705  		return fmt.Errorf("job insert failed: %v", err)
  1706  	}
  1707  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1708  		return fmt.Errorf("index update failed: %v", err)
  1709  	}
  1710  
  1711  	return nil
  1712  }
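
        // A worked example of the version bookkeeping above (values assumed):
        // re-registering an existing job always moves the version forward.
        //
        //	// given: existingJob.Version == 3, job.Version == 0, keepVersion == false
        //	// after upsertJobImpl: job.Version == 4, job.JobModifyIndex == index
        //	// with keepVersion == true, Version and JobModifyIndex are left as-is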
  1713  
  1714  // DeleteJob is used to deregister a job
  1715  func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error {
  1716  	txn := s.db.WriteTxn(index)
  1717  	defer txn.Abort()
  1718  
  1719  	err := s.DeleteJobTxn(index, namespace, jobID, txn)
  1720  	if err == nil {
  1721  		return txn.Commit()
  1722  	}
  1723  	return err
  1724  }
  1725  
  1726  // DeleteJobTxn is used to deregister a job, like DeleteJob, but in an
  1727  // existing transaction. Useful when making multiple modifications atomically.
  1728  func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error {
  1729  	// Lookup the job
  1730  	existing, err := txn.First("jobs", "id", namespace, jobID)
  1731  	if err != nil {
  1732  		return fmt.Errorf("job lookup failed: %v", err)
  1733  	}
  1734  	if existing == nil {
  1735  		return fmt.Errorf("job not found")
  1736  	}
  1737  
  1738  	// Check if we should update a parent job summary
  1739  	job := existing.(*structs.Job)
  1740  	if job.ParentID != "" {
  1741  		summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID)
  1742  		if err != nil {
  1743  			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  1744  		}
  1745  
  1746  		// Only continue if the summary exists. It could not exist if the parent
  1747  		// job was removed
  1748  		if summaryRaw != nil {
  1749  			existing := summaryRaw.(*structs.JobSummary)
  1750  			pSummary := existing.Copy()
  1751  			if pSummary.Children != nil {
  1752  
  1753  				modified := false
  1754  				switch job.Status {
  1755  				case structs.JobStatusPending:
  1756  					pSummary.Children.Pending--
  1757  					pSummary.Children.Dead++
  1758  					modified = true
  1759  				case structs.JobStatusRunning:
  1760  					pSummary.Children.Running--
  1761  					pSummary.Children.Dead++
  1762  					modified = true
  1763  				case structs.JobStatusDead:
  1764  				default:
  1765  					return fmt.Errorf("unknown old job status %q", job.Status)
  1766  				}
  1767  
  1768  				if modified {
  1769  					// Update the modify index
  1770  					pSummary.ModifyIndex = index
  1771  
  1772  					// Insert the summary
  1773  					if err := txn.Insert("job_summary", pSummary); err != nil {
  1774  						return fmt.Errorf("job summary insert failed: %v", err)
  1775  					}
  1776  					if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1777  						return fmt.Errorf("index update failed: %v", err)
  1778  					}
  1779  				}
  1780  			}
  1781  		}
  1782  	}
  1783  
  1784  	// Delete the job
  1785  	if err := txn.Delete("jobs", existing); err != nil {
  1786  		return fmt.Errorf("job delete failed: %v", err)
  1787  	}
  1788  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1789  		return fmt.Errorf("index update failed: %v", err)
  1790  	}
  1791  
  1792  	// Delete the job versions
  1793  	if err := s.deleteJobVersions(index, job, txn); err != nil {
  1794  		return err
  1795  	}
  1796  
  1797  	// Cleanup plugins registered by this job, before we delete the summary
  1798  	err = s.deleteJobFromPlugins(index, txn, job)
  1799  	if err != nil {
  1800  		return fmt.Errorf("deleting job from plugin: %v", err)
  1801  	}
  1802  
  1803  	// Delete the job summary
  1804  	if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil {
  1805  		return fmt.Errorf("deleting job summary failed: %v", err)
  1806  	}
  1807  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1808  		return fmt.Errorf("index update failed: %v", err)
  1809  	}
  1810  
  1811  	// Delete any remaining job scaling policies
  1812  	if err := s.deleteJobScalingPolicies(index, job, txn); err != nil {
  1813  		return fmt.Errorf("deleting job scaling policies failed: %v", err)
  1814  	}
  1815  
  1816  	// Delete any job recommendations
  1817  	if err := s.deleteRecommendationsByJob(index, txn, job); err != nil {
  1818  		return fmt.Errorf("deleting job recommendations failed: %v", err)
  1819  	}
  1820  
  1821  	// Delete the scaling events
  1822  	if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil {
  1823  		return fmt.Errorf("deleting job scaling events failed: %v", err)
  1824  	}
  1825  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
  1826  		return fmt.Errorf("index update failed: %v", err)
  1827  	}
  1828  
  1829  	return nil
  1830  }
  1831  
  1832  // deleteJobScalingPolicies deletes any scaling policies associated with the job
  1833  func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {
  1834  	iter, err := s.ScalingPoliciesByJobTxn(nil, job.Namespace, job.ID, txn)
  1835  	if err != nil {
  1836  		return fmt.Errorf("getting job scaling policies for deletion failed: %v", err)
  1837  	}
  1838  
  1839  	// Put them into a slice so there are no safety concerns while actually
  1840  	// performing the deletes
  1841  	policies := []interface{}{}
  1842  	for {
  1843  		raw := iter.Next()
  1844  		if raw == nil {
  1845  			break
  1846  		}
  1847  		policies = append(policies, raw)
  1848  	}
  1849  
  1850  	// Do the deletes
  1851  	for _, p := range policies {
  1852  		if err := txn.Delete("scaling_policy", p); err != nil {
  1853  			return fmt.Errorf("deleting scaling policy failed: %v", err)
  1854  		}
  1855  	}
  1856  
  1857  	if len(policies) > 0 {
  1858  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  1859  			return fmt.Errorf("index update failed: %v", err)
  1860  		}
  1861  	}
  1862  	return nil
  1863  }
  1864  
  1865  // deleteJobVersions deletes all versions of the given job.
  1866  func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *txn) error {
  1867  	iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID)
  1868  	if err != nil {
  1869  		return err
  1870  	}
  1871  
  1872  	// Put them into a slice so there are no safety concerns while actually
  1873  	// performing the deletes
  1874  	jobs := []*structs.Job{}
  1875  	for {
  1876  		raw := iter.Next()
  1877  		if raw == nil {
  1878  			break
  1879  		}
  1880  
  1881  		// Ensure the ID is an exact match
  1882  		j := raw.(*structs.Job)
  1883  		if j.ID != job.ID {
  1884  			continue
  1885  		}
  1886  
  1887  		jobs = append(jobs, j)
  1888  	}
  1889  
  1890  	// Do the deletes
  1891  	for _, j := range jobs {
  1892  		if err := txn.Delete("job_version", j); err != nil {
  1893  			return fmt.Errorf("deleting job versions failed: %v", err)
  1894  		}
  1895  	}
  1896  
  1897  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1898  		return fmt.Errorf("index update failed: %v", err)
  1899  	}
  1900  
  1901  	return nil
  1902  }
  1903  
  1904  // upsertJobVersion inserts a job into its historic version table and limits the
  1905  // number of job versions that are tracked.
  1906  func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn) error {
  1907  	// Insert the job
  1908  	if err := txn.Insert("job_version", job); err != nil {
  1909  		return fmt.Errorf("failed to insert job into job_version table: %v", err)
  1910  	}
  1911  
  1912  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1913  		return fmt.Errorf("index update failed: %v", err)
  1914  	}
  1915  
  1916  	// Get all the historic jobs for this ID
  1917  	all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID)
  1918  	if err != nil {
  1919  		return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err)
  1920  	}
  1921  
  1922  	// If we are below the limit there is no GCing to be done
  1923  	if len(all) <= structs.JobTrackedVersions {
  1924  		return nil
  1925  	}
  1926  
  1927  	// We have to delete a historic job to make room.
  1928  	// Find index of the highest versioned stable job
  1929  	stableIdx := -1
  1930  	for i, j := range all {
  1931  		if j.Stable {
  1932  			stableIdx = i
  1933  			break
  1934  		}
  1935  	}
  1936  
  1937  	// If the stable job is the oldest version, do a swap to bring it into the
  1938  	// keep set.
  1939  	max := structs.JobTrackedVersions
  1940  	if stableIdx == max {
  1941  		all[max-1], all[max] = all[max], all[max-1]
  1942  	}
  1943  
  1944  	// Delete the job outside of the set that are being kept.
  1945  	d := all[max]
  1946  	if err := txn.Delete("job_version", d); err != nil {
  1947  		return fmt.Errorf("failed to delete job %v (%d) from job_version: %v", d.ID, d.Version, err)
  1948  	}
  1949  
  1950  	return nil
  1951  }
  1952  
  1953  // JobByID is used to lookup a job by its ID. JobByID returns the current/latest job
  1954  // version.
  1955  func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) {
  1956  	txn := s.db.ReadTxn()
  1957  	return s.JobByIDTxn(ws, namespace, id, txn)
  1958  }
  1959  
  1960  // JobByIDTxn is used to lookup a job by its ID, like JobByID, but inside an
  1961  // existing transaction. It returns the job version visible to that transaction.
  1962  func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) {
  1963  	watchCh, existing, err := txn.FirstWatch("jobs", "id", namespace, id)
  1964  	if err != nil {
  1965  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1966  	}
  1967  	ws.Add(watchCh)
  1968  
  1969  	if existing != nil {
  1970  		return existing.(*structs.Job), nil
  1971  	}
  1972  	return nil, nil
  1973  }
  1974  
  1975  // JobsByIDPrefix is used to lookup jobs by ID prefix. When querying all
  1976  // namespaces, the prefix is matched by filtering rather than by an index.
  1977  func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  1978  	if namespace == structs.AllNamespacesSentinel {
  1979  		return s.jobsByIDPrefixAllNamespaces(ws, id)
  1980  	}
  1981  
  1982  	txn := s.db.ReadTxn()
  1983  
  1984  	iter, err := txn.Get("jobs", "id_prefix", namespace, id)
  1985  	if err != nil {
  1986  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1987  	}
  1988  
  1989  	ws.Add(iter.WatchCh())
  1990  
  1991  	return iter, nil
  1992  }
  1993  
  1994  func (s *StateStore) jobsByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  1995  	txn := s.db.ReadTxn()
  1996  
  1997  	// Walk the entire jobs table
  1998  	iter, err := txn.Get("jobs", "id")
  1999  
  2000  	if err != nil {
  2001  		return nil, err
  2002  	}
  2003  
  2004  	ws.Add(iter.WatchCh())
  2005  
  2006  	// Filter the iterator by ID prefix
  2007  	f := func(raw interface{}) bool {
  2008  		job, ok := raw.(*structs.Job)
  2009  		if !ok {
  2010  			return true
  2011  		}
  2012  		return !strings.HasPrefix(job.ID, prefix)
  2013  	}
  2014  	wrap := memdb.NewFilterIterator(iter, f)
  2015  	return wrap, nil
  2016  }
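
        // Note on semantics, as a hedged sketch: memdb filter functions return
        // true to drop an object, so the closure above keeps exactly the jobs
        // whose ID carries the prefix.
        //
        //	drop := func(raw interface{}) bool {
        //		job, ok := raw.(*structs.Job)
        //		return !ok || !strings.HasPrefix(job.ID, prefix)
        //	}
        //	iter = memdb.NewFilterIterator(iter, drop)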
  2017  
  2018  // JobVersionsByID returns all the tracked versions of a job.
  2019  func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  2020  	txn := s.db.ReadTxn()
  2021  
  2022  	return s.jobVersionByID(txn, ws, namespace, id)
  2023  }
  2024  
  2025  // jobVersionByID is the underlying implementation for retrieving all tracked
  2026  // versions of a job and is called under an existing transaction. A watch set
  2027  // can optionally be passed in to add the job histories to the watch set.
  2028  func (s *StateStore) jobVersionByID(txn *txn, ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  2029  	// Get all the historic jobs for this ID
  2030  	iter, err := txn.Get("job_version", "id_prefix", namespace, id)
  2031  	if err != nil {
  2032  		return nil, err
  2033  	}
  2034  
  2035  	ws.Add(iter.WatchCh())
  2036  
  2037  	var all []*structs.Job
  2038  	for {
  2039  		raw := iter.Next()
  2040  		if raw == nil {
  2041  			break
  2042  		}
  2043  
  2044  		// Ensure the ID is an exact match
  2045  		j := raw.(*structs.Job)
  2046  		if j.ID != id {
  2047  			continue
  2048  		}
  2049  
  2050  		all = append(all, j)
  2051  	}
  2052  
  2053  	// Sort in reverse order so that the highest version is first
  2054  	sort.Slice(all, func(i, j int) bool {
  2055  		return all[i].Version > all[j].Version
  2056  	})
  2057  
  2058  	return all, nil
  2059  }
  2060  
  2061  // JobByIDAndVersion returns the job identified by its ID and Version. The
  2062  // passed watchset may be nil.
  2063  func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) {
  2064  	txn := s.db.ReadTxn()
  2065  	return s.jobByIDAndVersionImpl(ws, namespace, id, version, txn)
  2066  }
  2067  
  2068  // jobByIDAndVersionImpl returns the job identified by its ID and Version. The
  2069  // passed watchset may be nil.
  2070  func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string,
  2071  	version uint64, txn *txn) (*structs.Job, error) {
  2072  
  2073  	watchCh, existing, err := txn.FirstWatch("job_version", "id", namespace, id, version)
  2074  	if err != nil {
  2075  		return nil, err
  2076  	}
  2077  
  2078  	ws.Add(watchCh)
  2079  
  2080  	if existing != nil {
  2081  		job := existing.(*structs.Job)
  2082  		return job, nil
  2083  	}
  2084  
  2085  	return nil, nil
  2086  }
  2087  
        // JobVersions returns an iterator over all the tracked job versions.
  2088  func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2089  	txn := s.db.ReadTxn()
  2090  
  2091  	// Walk the entire job_version table
  2092  	iter, err := txn.Get("job_version", "id")
  2093  	if err != nil {
  2094  		return nil, err
  2095  	}
  2096  
  2097  	ws.Add(iter.WatchCh())
  2098  	return iter, nil
  2099  }
  2100  
  2101  // Jobs returns an iterator over all the jobs
  2102  func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2103  	txn := s.db.ReadTxn()
  2104  
  2105  	// Walk the entire jobs table
  2106  	iter, err := txn.Get("jobs", "id")
  2107  	if err != nil {
  2108  		return nil, err
  2109  	}
  2110  
  2111  	ws.Add(iter.WatchCh())
  2112  
  2113  	return iter, nil
  2114  }
  2115  
  2116  // JobsByNamespace returns an iterator over all the jobs for the given namespace
  2117  func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2118  	txn := s.db.ReadTxn()
  2119  	return s.jobsByNamespaceImpl(ws, namespace, txn)
  2120  }
  2121  
  2122  // jobsByNamespaceImpl returns an iterator over all the jobs for the given namespace
  2123  func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *txn) (memdb.ResultIterator, error) {
  2124  	// Walk the entire jobs table
  2125  	iter, err := txn.Get("jobs", "id_prefix", namespace, "")
  2126  	if err != nil {
  2127  		return nil, err
  2128  	}
  2129  
  2130  	ws.Add(iter.WatchCh())
  2131  
  2132  	return iter, nil
  2133  }
  2134  
  2135  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
  2136  func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) {
  2137  	txn := s.db.ReadTxn()
  2138  
  2139  	iter, err := txn.Get("jobs", "periodic", periodic)
  2140  	if err != nil {
  2141  		return nil, err
  2142  	}
  2143  
  2144  	ws.Add(iter.WatchCh())
  2145  
  2146  	return iter, nil
  2147  }
  2148  
  2149  // JobsByScheduler returns an iterator over all the jobs with the specific
  2150  // scheduler type.
  2151  func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) {
  2152  	txn := s.db.ReadTxn()
  2153  
  2154  	// Return an iterator for jobs with the specific type.
  2155  	iter, err := txn.Get("jobs", "type", schedulerType)
  2156  	if err != nil {
  2157  		return nil, err
  2158  	}
  2159  
  2160  	ws.Add(iter.WatchCh())
  2161  
  2162  	return iter, nil
  2163  }
  2164  
  2165  // JobsByGC returns an iterator over all jobs eligible or ineligible for garbage
  2166  // collection.
  2167  func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) {
  2168  	txn := s.db.ReadTxn()
  2169  
  2170  	iter, err := txn.Get("jobs", "gc", gc)
  2171  	if err != nil {
  2172  		return nil, err
  2173  	}
  2174  
  2175  	ws.Add(iter.WatchCh())
  2176  
  2177  	return iter, nil
  2178  }
  2179  
  2180  // JobSummaryByID returns a job summary object which matches a specific id.
  2181  func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) {
  2182  	txn := s.db.ReadTxn()
  2183  
  2184  	watchCh, existing, err := txn.FirstWatch("job_summary", "id", namespace, jobID)
  2185  	if err != nil {
  2186  		return nil, err
  2187  	}
  2188  
  2189  	ws.Add(watchCh)
  2190  
  2191  	if existing != nil {
  2192  		summary := existing.(*structs.JobSummary)
  2193  		return summary, nil
  2194  	}
  2195  
  2196  	return nil, nil
  2197  }
  2198  
  2199  // JobSummaries walks the entire job summary table and returns all the job
  2200  // summary objects
  2201  func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2202  	txn := s.db.ReadTxn()
  2203  
  2204  	iter, err := txn.Get("job_summary", "id")
  2205  	if err != nil {
  2206  		return nil, err
  2207  	}
  2208  
  2209  	ws.Add(iter.WatchCh())
  2210  
  2211  	return iter, nil
  2212  }
  2213  
  2214  // JobSummaryByPrefix is used to look up job summaries by ID prefix
  2215  func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2216  	txn := s.db.ReadTxn()
  2217  
  2218  	iter, err := txn.Get("job_summary", "id_prefix", namespace, id)
  2219  	if err != nil {
  2220  		return nil, fmt.Errorf("job_summary lookup failed: %v", err)
  2221  	}
  2222  
  2223  	ws.Add(iter.WatchCh())
  2224  
  2225  	return iter, nil
  2226  }
  2227  
  2228  // UpsertCSIVolume inserts a set of volumes into the state store.
  2229  func (s *StateStore) UpsertCSIVolume(index uint64, volumes []*structs.CSIVolume) error {
  2230  	txn := s.db.WriteTxn(index)
  2231  	defer txn.Abort()
  2232  
  2233  	for _, v := range volumes {
  2234  		if exists, err := s.namespaceExists(txn, v.Namespace); err != nil {
  2235  			return err
  2236  		} else if !exists {
  2237  			return fmt.Errorf("volume %s is in nonexistent namespace %s", v.ID, v.Namespace)
  2238  		}
  2239  
  2240  		obj, err := txn.First("csi_volumes", "id", v.Namespace, v.ID)
  2241  		if err != nil {
  2242  			return fmt.Errorf("volume existence check error: %v", err)
  2243  		}
  2244  		if obj != nil {
  2245  			// Allow some properties of a volume to be updated in place, but
  2246  			// prevent accidentally overwriting important properties, or
  2247  			// overwriting a volume in use
  2248  			old := obj.(*structs.CSIVolume)
  2249  			if old.ExternalID != v.ExternalID ||
  2250  				old.PluginID != v.PluginID ||
  2251  				old.Provider != v.Provider {
  2252  				return fmt.Errorf("volume identity cannot be updated: %s", v.ID)
  2253  			}
  2254  			denormalized, err := s.csiVolumeDenormalizeTxn(txn, nil, old.Copy())
        			if err != nil {
        				return fmt.Errorf("volume denormalize failed: %s: %v", v.ID, err)
        			}
  2255  			if denormalized.InUse() {
  2256  				return fmt.Errorf("volume cannot be updated while in use")
  2257  			}
  2258  
  2259  			v.CreateIndex = old.CreateIndex
  2260  			v.ModifyIndex = index
  2261  		} else {
  2262  			v.CreateIndex = index
  2263  			v.ModifyIndex = index
  2264  		}
  2265  
  2266  		// Allocations are copy on write, so we want to keep the Allocation ID
  2267  		// but we need to clear the pointer so that we don't store it when we
  2268  		// write the volume to the state store. We'll get it from the db in
  2269  		// denormalize.
  2270  		for allocID := range v.ReadAllocs {
  2271  			v.ReadAllocs[allocID] = nil
  2272  		}
  2273  		for allocID := range v.WriteAllocs {
  2274  			v.WriteAllocs[allocID] = nil
  2275  		}
  2276  
  2277  		err = txn.Insert("csi_volumes", v)
  2278  		if err != nil {
  2279  			return fmt.Errorf("volume insert: %v", err)
  2280  		}
  2281  	}
  2282  
  2283  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2284  		return fmt.Errorf("index update failed: %v", err)
  2285  	}
  2286  
  2287  	return txn.Commit()
  2288  }
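
        // A hedged registration sketch (field values are illustrative): volumes
        // are upserted in batches, and each must name an existing namespace.
        //
        //	vol := &structs.CSIVolume{
        //		ID:        "vol-database",
        //		Namespace: structs.DefaultNamespace,
        //		PluginID:  "ebs0",
        //	}
        //	if err := store.UpsertCSIVolume(index, []*structs.CSIVolume{vol}); err != nil {
        //		return err
        //	}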
  2289  
  2290  // CSIVolumes returns the unfiltered list of all volumes. Caller should
  2291  // snapshot if it wants to also denormalize the plugins.
  2292  func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2293  	txn := s.db.ReadTxn()
  2294  	defer txn.Abort()
  2295  
  2296  	iter, err := txn.Get("csi_volumes", "id")
  2297  	if err != nil {
  2298  		return nil, fmt.Errorf("csi_volumes lookup failed: %v", err)
  2299  	}
  2300  
  2301  	ws.Add(iter.WatchCh())
  2302  
  2303  	return iter, nil
  2304  }
  2305  
  2306  // CSIVolumeByID is used to lookup a single volume. Returns a copy of the
  2307  // volume because its plugins and allocations are denormalized to provide
  2308  // accurate Health.
  2309  func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) {
  2310  	txn := s.db.ReadTxn()
  2311  
  2312  	watchCh, obj, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
  2313  	if err != nil {
  2314  		return nil, fmt.Errorf("volume lookup failed for %s: %v", id, err)
  2315  	}
  2316  	ws.Add(watchCh)
  2317  
  2318  	if obj == nil {
  2319  		return nil, nil
  2320  	}
  2321  	vol := obj.(*structs.CSIVolume)
  2322  
  2323  	// we return the volume with the plugins denormalized by default,
  2324  	// because the scheduler needs them for feasibility checking
  2325  	return s.csiVolumeDenormalizePluginsTxn(txn, vol.Copy())
  2326  }
  2327  
  2328  // CSIVolumesByPluginID looks up csi_volumes by pluginID. Caller should
  2329  // snapshot if it wants to also denormalize the plugins.
  2330  func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, prefix, pluginID string) (memdb.ResultIterator, error) {
  2331  	txn := s.db.ReadTxn()
  2332  
  2333  	iter, err := txn.Get("csi_volumes", "plugin_id", pluginID)
  2334  	if err != nil {
  2335  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2336  	}
  2337  
  2338  	// Filter out volumes in other namespaces or without the ID prefix
  2339  	f := func(raw interface{}) bool {
  2340  		v, ok := raw.(*structs.CSIVolume)
  2341  		if !ok {
  2342  			return true
  2343  		}
  2344  		return v.Namespace != namespace || !strings.HasPrefix(v.ID, prefix)
  2345  	}
  2346  
  2347  	wrap := memdb.NewFilterIterator(iter, f)
  2348  	return wrap, nil
  2349  }
  2350  
  2351  // CSIVolumesByIDPrefix supports search. Caller should snapshot if it wants to
  2352  // also denormalize the plugins. If using a prefix with the wildcard namespace,
  2353  // the results will not use the index prefix.
  2354  func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) {
  2355  	if namespace == structs.AllNamespacesSentinel {
  2356  		return s.csiVolumeByIDPrefixAllNamespaces(ws, volumeID)
  2357  	}
  2358  
  2359  	txn := s.db.ReadTxn()
  2360  
  2361  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, volumeID)
  2362  	if err != nil {
  2363  		return nil, err
  2364  	}
  2365  
  2366  	ws.Add(iter.WatchCh())
  2367  
  2368  	return iter, nil
  2369  }
  2370  
  2371  func (s *StateStore) csiVolumeByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  2372  	txn := s.db.ReadTxn()
  2373  
  2374  	// Walk the entire csi_volumes table
  2375  	iter, err := txn.Get("csi_volumes", "id")
  2376  
  2377  	if err != nil {
  2378  		return nil, err
  2379  	}
  2380  
  2381  	ws.Add(iter.WatchCh())
  2382  
  2383  	// Filter the iterator by ID prefix
  2384  	f := func(raw interface{}) bool {
  2385  		v, ok := raw.(*structs.CSIVolume)
  2386  		if !ok {
  2387  			return false
  2388  			return true
  2389  		return !strings.HasPrefix(v.ID, prefix)
  2390  	}
  2391  	wrap := memdb.NewFilterIterator(iter, f)
  2392  	return wrap, nil
  2393  }
  2394  
  2395  // CSIVolumesByNodeID looks up CSIVolumes in use on a node. Caller should
  2396  // snapshot if it wants to also denormalize the plugins.
  2397  func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, prefix, nodeID string) (memdb.ResultIterator, error) {
  2398  	allocs, err := s.AllocsByNode(ws, nodeID)
  2399  	if err != nil {
  2400  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  2401  	}
  2402  
  2403  	// Find volume ids for CSI volumes in running allocs, or allocs that we desire to run
  2404  	ids := map[string]string{} // Map volumeID to Namespace
  2405  	for _, a := range allocs {
  2406  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  2407  
  2408  		if !(a.DesiredStatus == structs.AllocDesiredStatusRun ||
  2409  			a.ClientStatus == structs.AllocClientStatusRunning) ||
  2410  			len(tg.Volumes) == 0 {
  2411  			continue
  2412  		}
  2413  
  2414  		for _, v := range tg.Volumes {
  2415  			if v.Type != structs.VolumeTypeCSI {
  2416  				continue
  2417  			}
  2418  			ids[v.Source] = a.Namespace
  2419  		}
  2420  	}
  2421  
  2422  	// Lookup the raw CSIVolumes to match the other list interfaces
  2423  	iter := NewSliceIterator()
  2424  	txn := s.db.ReadTxn()
  2425  	for id, namespace := range ids {
  2426  		if strings.HasPrefix(id, prefix) {
  2427  			watchCh, raw, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
  2428  			if err != nil {
  2429  				return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  2430  			}
  2431  			ws.Add(watchCh)
  2432  			iter.Add(raw)
  2433  		}
  2434  	}
  2435  
  2436  	return iter, nil
  2437  }
  2438  
  2439  // CSIVolumesByNamespace looks up volumes by namespace, optionally filtered by ID prefix
  2440  func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
  2441  	txn := s.db.ReadTxn()
  2442  
  2443  	return s.csiVolumesByNamespaceImpl(txn, ws, namespace, prefix)
  2444  }
  2445  
  2446  func (s *StateStore) csiVolumesByNamespaceImpl(txn *txn, ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
  2447  
  2448  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, prefix)
  2449  	if err != nil {
  2450  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2451  	}
  2452  
  2453  	ws.Add(iter.WatchCh())
  2454  
  2455  	return iter, nil
  2456  }
  2457  
  2458  // CSIVolumeClaim updates the volume's claim count and allocation list
  2459  func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, claim *structs.CSIVolumeClaim) error {
  2460  	txn := s.db.WriteTxn(index)
  2461  	defer txn.Abort()
  2462  
  2463  	row, err := txn.First("csi_volumes", "id", namespace, id)
  2464  	if err != nil {
  2465  		return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2466  	}
  2467  	if row == nil {
  2468  		return fmt.Errorf("volume not found: %s", id)
  2469  	}
  2470  
  2471  	orig, ok := row.(*structs.CSIVolume)
  2472  	if !ok {
  2473  		return fmt.Errorf("volume row conversion error")
  2474  	}
  2475  
  2476  	var alloc *structs.Allocation
  2477  	if claim.State == structs.CSIVolumeClaimStateTaken {
  2478  		alloc, err = s.allocByIDImpl(txn, nil, claim.AllocationID)
  2479  		if err != nil {
  2480  			s.logger.Error("AllocByID failed", "error", err)
  2481  			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2482  		}
  2483  		if alloc == nil {
  2484  			s.logger.Error("AllocByID failed to find alloc", "alloc_id", claim.AllocationID)
  2485  			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2486  		}
  2489  	}
  2490  
  2491  	volume, err := s.csiVolumeDenormalizePluginsTxn(txn, orig.Copy())
  2492  	if err != nil {
  2493  		return err
  2494  	}
  2495  	volume, err = s.csiVolumeDenormalizeTxn(txn, nil, volume)
  2496  	if err != nil {
  2497  		return err
  2498  	}
  2499  
  2500  	// in the case of a job deregistration, there will be no allocation ID
  2501  	// for the claim but we still want to write an updated index to the volume
  2502  	// so that volume reaping is triggered
  2503  	if claim.AllocationID != "" {
  2504  		err = volume.Claim(claim, alloc)
  2505  		if err != nil {
  2506  			return err
  2507  		}
  2508  	}
  2509  
  2510  	volume.ModifyIndex = index
  2511  
  2512  	// Allocations are copy on write, so we want to keep the Allocation ID
  2513  	// but we need to clear the pointer so that we don't store it when we
  2514  	// write the volume to the state store. We'll get it from the db in
  2515  	// denormalize.
  2516  	for allocID := range volume.ReadAllocs {
  2517  		volume.ReadAllocs[allocID] = nil
  2518  	}
  2519  	for allocID := range volume.WriteAllocs {
  2520  		volume.WriteAllocs[allocID] = nil
  2521  	}
  2522  
  2523  	if err = txn.Insert("csi_volumes", volume); err != nil {
  2524  		return fmt.Errorf("volume update failed: %s: %v", id, err)
  2525  	}
  2526  
  2527  	if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2528  		return fmt.Errorf("index update failed: %v", err)
  2529  	}
  2530  
  2531  	return txn.Commit()
  2532  }
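
        // A hedged claim sketch (alloc and vol assumed in scope): taking a write
        // claim for a running allocation.
        //
        //	claim := &structs.CSIVolumeClaim{
        //		AllocationID: alloc.ID,
        //		NodeID:       alloc.NodeID,
        //		Mode:         structs.CSIVolumeClaimWrite,
        //		State:        structs.CSIVolumeClaimStateTaken,
        //	}
        //	err := store.CSIVolumeClaim(index, vol.Namespace, vol.ID, claim)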
  2533  
  2534  // CSIVolumeDeregister removes the volume from the server
  2535  func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string, force bool) error {
  2536  	txn := s.db.WriteTxn(index)
  2537  	defer txn.Abort()
  2538  
  2539  	for _, id := range ids {
  2540  		existing, err := txn.First("csi_volumes", "id", namespace, id)
  2541  		if err != nil {
  2542  			return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2543  		}
  2544  
  2545  		if existing == nil {
  2546  			return fmt.Errorf("volume not found: %s", id)
  2547  		}
  2548  
  2549  		vol, ok := existing.(*structs.CSIVolume)
  2550  		if !ok {
  2551  			return fmt.Errorf("volume row conversion error: %s", id)
  2552  		}
  2553  
  2554  		// The common case for a volume deregister is when the volume is
  2555  		// unused, but we can also let an operator intervene in the case where
  2556  		// allocations have been stopped but claims can't be freed because,
  2557  		// for example, the plugins have all been removed.
  2558  		if vol.InUse() {
  2559  			if !force || !s.volSafeToForce(txn, vol) {
  2560  				return fmt.Errorf("volume in use: %s", id)
  2561  			}
  2562  		}
  2563  
  2564  		if err = txn.Delete("csi_volumes", existing); err != nil {
  2565  			return fmt.Errorf("volume delete failed: %s: %v", id, err)
  2566  		}
  2567  	}
  2568  
  2569  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2570  		return fmt.Errorf("index update failed: %v", err)
  2571  	}
  2572  
  2573  	return txn.Commit()
  2574  }
  2575  
  2576  // volSafeToForce checks whether any of the remaining allocations
  2577  // are in a non-terminal state.
  2578  func (s *StateStore) volSafeToForce(txn Txn, v *structs.CSIVolume) bool {
  2579  	vol, err := s.csiVolumeDenormalizeTxn(txn, nil, v)
  2580  	if err != nil {
  2581  		return false
  2582  	}
  2583  
  2584  	for _, alloc := range vol.ReadAllocs {
  2585  		if alloc != nil && !alloc.TerminalStatus() {
  2586  			return false
  2587  		}
  2588  	}
  2589  	for _, alloc := range vol.WriteAllocs {
  2590  		if alloc != nil && !alloc.TerminalStatus() {
  2591  			return false
  2592  		}
  2593  	}
  2594  	return true
  2595  }
  2596  
  2597  // CSIVolumeDenormalizePlugins returns a CSIVolume with current health and
  2598  // plugins, but without allocations.
  2599  // Use this for current volume metadata, handling lists of volumes.
  2600  // Use CSIVolumeDenormalize for volumes containing both health and current
  2601  // allocations.
  2602  func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2603  	if vol == nil {
  2604  		return nil, nil
  2605  	}
  2606  	txn := s.db.ReadTxn()
  2607  	defer txn.Abort()
  2608  	return s.csiVolumeDenormalizePluginsTxn(txn, vol)
  2609  }
  2610  
  2611  // csiVolumeDenormalizePluginsTxn implements
  2612  // CSIVolumeDenormalizePlugins, inside a transaction.
  2613  func (s *StateStore) csiVolumeDenormalizePluginsTxn(txn Txn, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2614  	if vol == nil {
  2615  		return nil, nil
  2616  	}
  2617  	plug, err := s.CSIPluginByIDTxn(txn, nil, vol.PluginID)
  2618  	if err != nil {
  2619  		return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
  2620  	}
  2621  	if plug == nil {
  2622  		vol.ControllersHealthy = 0
  2623  		vol.NodesHealthy = 0
  2624  		vol.Schedulable = false
  2625  		return vol, nil
  2626  	}
  2627  
  2628  	vol.Provider = plug.Provider
  2629  	vol.ProviderVersion = plug.Version
  2630  	vol.ControllerRequired = plug.ControllerRequired
  2631  	vol.ControllersHealthy = plug.ControllersHealthy
  2632  	vol.NodesHealthy = plug.NodesHealthy
  2633  
  2634  	// This value may be stale, but stale is ok
  2635  	vol.ControllersExpected = plug.ControllersExpected
  2636  	vol.NodesExpected = plug.NodesExpected
  2637  
  2638  	vol.Schedulable = vol.NodesHealthy > 0
  2639  	if vol.ControllerRequired {
  2640  		vol.Schedulable = vol.ControllersHealthy > 0 && vol.Schedulable
  2641  	}
  2642  
  2643  	return vol, nil
  2644  }
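
        // The schedulability rule above, restated as a sketch: a volume is
        // schedulable when its plugin has healthy node instances, plus healthy
        // controllers whenever the plugin requires a controller.
        //
        //	// NodesHealthy=2, ControllerRequired=false                      => schedulable
        //	// NodesHealthy=2, ControllerRequired=true, ControllersHealthy=0 => not schedulable
        //	// NodesHealthy=0                                                => not schedulable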
  2645  
  2646  // CSIVolumeDenormalize returns a CSIVolume with its current
  2647  // Allocations and Claims, including creating new PastClaims for
  2648  // terminal or garbage collected allocations. This ensures we have a
  2649  // consistent state. Note that it mutates the original volume and so
  2650  // should always be called on a Copy after reading from the state
  2651  // store.
  2652  func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2653  	txn := s.db.ReadTxn()
  2654  	return s.csiVolumeDenormalizeTxn(txn, ws, vol)
  2655  }
  2656  
  2657  // csiVolumeDenormalizeTxn implements CSIVolumeDenormalize inside a transaction
  2658  func (s *StateStore) csiVolumeDenormalizeTxn(txn Txn, ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2659  	if vol == nil {
  2660  		return nil, nil
  2661  	}
  2662  
  2663  	// note: denormalize mutates the maps we pass in!
  2664  	denormalize := func(
  2665  		currentAllocs map[string]*structs.Allocation,
  2666  		currentClaims, pastClaims map[string]*structs.CSIVolumeClaim,
  2667  		fallbackMode structs.CSIVolumeClaimMode) error {
  2668  
  2669  		for id := range currentAllocs {
  2670  			a, err := s.allocByIDImpl(txn, ws, id)
  2671  			if err != nil {
  2672  				return err
  2673  			}
  2674  			pastClaim := pastClaims[id]
  2675  			currentClaim := currentClaims[id]
  2676  			if currentClaim == nil {
  2677  				// COMPAT(1.4.0): the CSIVolumeClaim fields were added
  2678  				// after 0.11.1, so claims made before that may be
  2679  				// missing this value. No clusters should see this
  2680  				// anymore, so warn noisily in the logs so that
  2681  				// operators ask us about it. Remove this block and
  2682  				// the now-unused fallbackMode parameter, and return
  2683  				// an error if currentClaim is nil in 1.4.0
  2684  				s.logger.Warn("volume was missing claim for allocation",
  2685  					"volume_id", vol.ID, "alloc", id)
  2686  				currentClaim = &structs.CSIVolumeClaim{
  2687  					AllocationID: a.ID,
  2688  					NodeID:       a.NodeID,
  2689  					Mode:         fallbackMode,
  2690  					State:        structs.CSIVolumeClaimStateTaken,
  2691  				}
  2692  				currentClaims[id] = currentClaim
  2693  			}
  2694  
  2695  			currentAllocs[id] = a
  2696  			if (a == nil || a.TerminalStatus()) && pastClaim == nil {
  2697  				// the alloc is garbage collected but nothing has written a PastClaim,
  2698  				// so create one now
  2699  				pastClaim = &structs.CSIVolumeClaim{
  2700  					AllocationID:   id,
  2701  					NodeID:         currentClaim.NodeID,
  2702  					Mode:           currentClaim.Mode,
  2703  					State:          structs.CSIVolumeClaimStateUnpublishing,
  2704  					AccessMode:     currentClaim.AccessMode,
  2705  					AttachmentMode: currentClaim.AttachmentMode,
  2706  				}
  2707  				pastClaims[id] = pastClaim
  2708  			}
  2709  
  2710  		}
  2711  		return nil
  2712  	}
  2713  
  2714  	err := denormalize(vol.ReadAllocs, vol.ReadClaims, vol.PastClaims,
  2715  		structs.CSIVolumeClaimRead)
  2716  	if err != nil {
  2717  		return nil, err
  2718  	}
  2719  	err = denormalize(vol.WriteAllocs, vol.WriteClaims, vol.PastClaims,
  2720  		structs.CSIVolumeClaimWrite)
  2721  	if err != nil {
  2722  		return nil, err
  2723  	}
  2724  
  2725  	// COMPAT: the AccessMode and AttachmentMode fields were added to claims
  2726  	// in 1.1.0, so claims made before that may be missing this value. In this
  2727  	// case, the volume will already have AccessMode/AttachmentMode until it
  2728  	// no longer has any claims, so set from those values
  2729  	for _, claim := range vol.ReadClaims {
  2730  		if claim.AccessMode == "" || claim.AttachmentMode == "" {
  2731  			claim.AccessMode = vol.AccessMode
  2732  			claim.AttachmentMode = vol.AttachmentMode
  2733  		}
  2734  	}
  2735  	for _, claim := range vol.WriteClaims {
  2736  		if claim.AccessMode == "" || claim.AttachmentMode == "" {
  2737  			claim.AccessMode = vol.AccessMode
  2738  			claim.AttachmentMode = vol.AttachmentMode
  2739  		}
  2740  	}
  2741  
  2742  	return vol, nil
  2743  }
  2744  
  2745  // CSIPlugins returns the unfiltered list of all plugin health statuses
  2746  func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2747  	txn := s.db.ReadTxn()
  2748  	defer txn.Abort()
  2749  
  2750  	iter, err := txn.Get("csi_plugins", "id")
  2751  	if err != nil {
  2752  		return nil, fmt.Errorf("csi_plugins lookup failed: %v", err)
  2753  	}
  2754  
  2755  	ws.Add(iter.WatchCh())
  2756  
  2757  	return iter, nil
  2758  }
  2759  
  2760  // CSIPluginsByIDPrefix supports search
  2761  func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) {
  2762  	txn := s.db.ReadTxn()
  2763  
  2764  	iter, err := txn.Get("csi_plugins", "id_prefix", pluginID)
  2765  	if err != nil {
  2766  		return nil, err
  2767  	}
  2768  
  2769  	ws.Add(iter.WatchCh())
  2770  
  2771  	return iter, nil
  2772  }
  2773  
  2774  // CSIPluginByID returns a named CSIPlugin. This method creates a new
  2775  // transaction so you should not call it from within another transaction.
  2776  func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2777  	txn := s.db.ReadTxn()
  2778  	plugin, err := s.CSIPluginByIDTxn(txn, ws, id)
  2779  	if err != nil {
  2780  		return nil, err
  2781  	}
  2782  	return plugin, nil
  2783  }
  2784  
  2785  // CSIPluginByIDTxn returns a named CSIPlugin
  2786  func (s *StateStore) CSIPluginByIDTxn(txn Txn, ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2787  
  2788  	watchCh, obj, err := txn.FirstWatch("csi_plugins", "id", id)
  2789  	if err != nil {
  2790  		return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err)
  2791  	}
  2792  
  2793  	ws.Add(watchCh)
  2794  
  2795  	if obj != nil {
  2796  		return obj.(*structs.CSIPlugin), nil
  2797  	}
  2798  	return nil, nil
  2799  }
  2800  
  2801  // CSIPluginDenormalize returns a CSIPlugin with allocation details. Always called on a copy of the plugin.
  2802  func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2803  	txn := s.db.ReadTxn()
  2804  	return s.CSIPluginDenormalizeTxn(txn, ws, plug)
  2805  }
  2806  
  2807  func (s *StateStore) CSIPluginDenormalizeTxn(txn Txn, ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2808  	if plug == nil {
  2809  		return nil, nil
  2810  	}
  2811  
  2812  	// Get the unique list of allocation ids
  2813  	ids := map[string]struct{}{}
  2814  	for _, info := range plug.Controllers {
  2815  		ids[info.AllocID] = struct{}{}
  2816  	}
  2817  	for _, info := range plug.Nodes {
  2818  		ids[info.AllocID] = struct{}{}
  2819  	}
  2820  
  2821  	for id := range ids {
  2822  		alloc, err := s.allocByIDImpl(txn, ws, id)
  2823  		if err != nil {
  2824  			return nil, err
  2825  		}
  2826  		if alloc == nil {
  2827  			continue
  2828  		}
  2829  		plug.Allocations = append(plug.Allocations, alloc.Stub(nil))
  2830  	}
  2831  	sort.Slice(plug.Allocations, func(i, j int) bool {
  2832  		return plug.Allocations[i].ModifyIndex > plug.Allocations[j].ModifyIndex
  2833  	})
  2834  
  2835  	return plug, nil
  2836  }
  2837  
  2838  // UpsertCSIPlugin writes the plugin to the state store. Note: there
  2839  // is currently no raft message for this, as it's intended to support
  2840  // testing use cases.
  2841  func (s *StateStore) UpsertCSIPlugin(index uint64, plug *structs.CSIPlugin) error {
  2842  	txn := s.db.WriteTxn(index)
  2843  	defer txn.Abort()
  2844  
  2845  	existing, err := txn.First("csi_plugins", "id", plug.ID)
  2846  	if err != nil {
  2847  		return fmt.Errorf("csi_plugin lookup error: %s %v", plug.ID, err)
  2848  	}
  2849  
  2850  	plug.ModifyIndex = index
  2851  	if existing != nil {
  2852  		plug.CreateIndex = existing.(*structs.CSIPlugin).CreateIndex
  2853  	}
  2854  
  2855  	err = txn.Insert("csi_plugins", plug)
  2856  	if err != nil {
  2857  		return fmt.Errorf("csi_plugins insert error: %v", err)
  2858  	}
  2859  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  2860  		return fmt.Errorf("index update failed: %v", err)
  2861  	}
  2862  	return txn.Commit()
  2863  }
  2864  
  2865  // DeleteCSIPlugin deletes the plugin if it's not in use.
  2866  func (s *StateStore) DeleteCSIPlugin(index uint64, id string) error {
  2867  	txn := s.db.WriteTxn(index)
  2868  	defer txn.Abort()
  2869  
  2870  	plug, err := s.CSIPluginByIDTxn(txn, nil, id)
  2871  	if err != nil {
  2872  		return err
  2873  	}
  2874  
  2875  	if plug == nil {
  2876  		return nil
  2877  	}
  2878  
  2879  	plug, err = s.CSIPluginDenormalizeTxn(txn, nil, plug.Copy())
  2880  	if err != nil {
  2881  		return err
  2882  	}
  2883  	if !plug.IsEmpty() {
  2884  		return fmt.Errorf("plugin in use")
  2885  	}
  2886  
  2887  	err = txn.Delete("csi_plugins", plug)
  2888  	if err != nil {
  2889  		return fmt.Errorf("csi_plugins delete error: %v", err)
  2890  	}
  2891  	return txn.Commit()
  2892  }
  2893  
  2894  // UpsertPeriodicLaunch is used to register a launch or update it.
  2895  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
  2896  	txn := s.db.WriteTxn(index)
  2897  	defer txn.Abort()
  2898  
  2899  	// Check if the launch already exists
  2900  	existing, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID)
  2901  	if err != nil {
  2902  		return fmt.Errorf("periodic launch lookup failed: %v", err)
  2903  	}
  2904  
  2905  	// Set up the indexes correctly
  2906  	if existing != nil {
  2907  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
  2908  		launch.ModifyIndex = index
  2909  	} else {
  2910  		launch.CreateIndex = index
  2911  		launch.ModifyIndex = index
  2912  	}
  2913  
  2914  	// Insert the launch
  2915  	if err := txn.Insert("periodic_launch", launch); err != nil {
  2916  		return fmt.Errorf("launch insert failed: %v", err)
  2917  	}
  2918  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2919  		return fmt.Errorf("index update failed: %v", err)
  2920  	}
  2921  
  2922  	return txn.Commit()
  2923  }
  2924  
  2925  // DeletePeriodicLaunch is used to delete the periodic launch
  2926  func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error {
  2927  	txn := s.db.WriteTxn(index)
  2928  	defer txn.Abort()
  2929  
  2930  	err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn)
  2931  	if err == nil {
  2932  		return txn.Commit()
  2933  	}
  2934  	return err
  2935  }
  2936  
  2937  // DeletePeriodicLaunchTxn is used to delete the periodic launch, like
  2938  // DeletePeriodicLaunch, but in an existing transaction. Useful when making multiple modifications atomically.
  2939  func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error {
  2940  	// Lookup the launch
  2941  	existing, err := txn.First("periodic_launch", "id", namespace, jobID)
  2942  	if err != nil {
  2943  		return fmt.Errorf("launch lookup failed: %v", err)
  2944  	}
  2945  	if existing == nil {
  2946  		return fmt.Errorf("launch not found")
  2947  	}
  2948  
  2949  	// Delete the launch
  2950  	if err := txn.Delete("periodic_launch", existing); err != nil {
  2951  		return fmt.Errorf("launch delete failed: %v", err)
  2952  	}
  2953  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2954  		return fmt.Errorf("index update failed: %v", err)
  2955  	}
  2956  
  2957  	return nil
  2958  }
  2959  
  2960  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
  2961  // ID.
  2962  func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) {
  2963  	txn := s.db.ReadTxn()
  2964  
  2965  	watchCh, existing, err := txn.FirstWatch("periodic_launch", "id", namespace, id)
  2966  	if err != nil {
  2967  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
  2968  	}
  2969  
  2970  	ws.Add(watchCh)
  2971  
  2972  	if existing != nil {
  2973  		return existing.(*structs.PeriodicLaunch), nil
  2974  	}
  2975  	return nil, nil
  2976  }
  2977  
  2978  // PeriodicLaunches returns an iterator over all the periodic launches
  2979  func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2980  	txn := s.db.ReadTxn()
  2981  
  2982  	// Walk the entire table
  2983  	iter, err := txn.Get("periodic_launch", "id")
  2984  	if err != nil {
  2985  		return nil, err
  2986  	}
  2987  
  2988  	ws.Add(iter.WatchCh())
  2989  
  2990  	return iter, nil
  2991  }
  2992  
  2993  // UpsertEvals is used to upsert a set of evaluations
  2994  func (s *StateStore) UpsertEvals(msgType structs.MessageType, index uint64, evals []*structs.Evaluation) error {
  2995  	txn := s.db.WriteTxnMsgT(msgType, index)
  2996  	defer txn.Abort()
  2997  
  2998  	err := s.UpsertEvalsTxn(index, evals, txn)
  2999  	if err == nil {
  3000  		return txn.Commit()
  3001  	}
  3002  	return err
  3003  }
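
        // A hedged sketch of the Raft-applied path (eval assumed in scope):
        //
        //	evals := []*structs.Evaluation{eval}
        //	if err := store.UpsertEvals(structs.EvalUpdateRequestType, index, evals); err != nil {
        //		return err
        //	}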
  3004  
  3005  // UpsertEvalsTxn is used to upsert a set of evaluations, like UpsertEvals but
  3006  // in an existing transaction. Useful when making multiple modifications atomically.
  3007  func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error {
  3008  	// Do a nested upsert
  3009  	jobs := make(map[structs.NamespacedID]string, len(evals))
  3010  	for _, eval := range evals {
  3011  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  3012  			return err
  3013  		}
  3014  
  3015  		tuple := structs.NamespacedID{
  3016  			ID:        eval.JobID,
  3017  			Namespace: eval.Namespace,
  3018  		}
  3019  		jobs[tuple] = ""
  3020  	}
  3021  
  3022  	// Set the job's status
  3023  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  3024  		return fmt.Errorf("setting job status failed: %v", err)
  3025  	}
  3026  
  3027  	return nil
  3028  }
  3029  
  3030  // nestedUpsertEval is used to nest an evaluation upsert within a transaction
  3031  func (s *StateStore) nestedUpsertEval(txn *txn, index uint64, eval *structs.Evaluation) error {
  3032  	// Lookup the evaluation
  3033  	existing, err := txn.First("evals", "id", eval.ID)
  3034  	if err != nil {
  3035  		return fmt.Errorf("eval lookup failed: %v", err)
  3036  	}
  3037  
  3038  	// Update the indexes
  3039  	if existing != nil {
  3040  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
  3041  		eval.ModifyIndex = index
  3042  	} else {
  3043  		eval.CreateIndex = index
  3044  		eval.ModifyIndex = index
  3045  	}
  3046  
  3047  	// Update the job summary
  3048  	summaryRaw, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID)
  3049  	if err != nil {
  3050  		return fmt.Errorf("job summary lookup failed: %v", err)
  3051  	}
  3052  	if summaryRaw != nil {
  3053  		js := summaryRaw.(*structs.JobSummary).Copy()
  3054  		hasSummaryChanged := false
  3055  		for tg, num := range eval.QueuedAllocations {
  3056  			if summary, ok := js.Summary[tg]; ok {
  3057  				if summary.Queued != num {
  3058  					summary.Queued = num
  3059  					js.Summary[tg] = summary
  3060  					hasSummaryChanged = true
  3061  				}
  3062  			} else {
  3063  				s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
  3064  			}
  3065  		}
  3066  
  3067  		// Insert the job summary
  3068  		if hasSummaryChanged {
  3069  			js.ModifyIndex = index
  3070  			if err := txn.Insert("job_summary", js); err != nil {
  3071  				return fmt.Errorf("job summary insert failed: %v", err)
  3072  			}
  3073  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  3074  				return fmt.Errorf("index update failed: %v", err)
  3075  			}
  3076  		}
  3077  	}
  3078  
  3079  	// Check if the job has any blocked evaluations and cancel them
  3080  	if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 {
  3081  		// Get the blocked evaluation for a job if it exists
  3082  		iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked)
  3083  		if err != nil {
  3084  			return fmt.Errorf("failed to get blocked evals for job %q in namespace %q: %v", eval.JobID, eval.Namespace, err)
  3085  		}
  3086  
  3087  		var blocked []*structs.Evaluation
  3088  		for {
  3089  			raw := iter.Next()
  3090  			if raw == nil {
  3091  				break
  3092  			}
  3093  			blocked = append(blocked, raw.(*structs.Evaluation))
  3094  		}
  3095  
  3096  		// Go through and update the evals
  3097  		for _, blockedEval := range blocked {
  3098  			newEval := blockedEval.Copy()
  3099  			newEval.Status = structs.EvalStatusCancelled
  3100  			newEval.StatusDescription = fmt.Sprintf("evaluation %q successful", eval.ID)
  3101  			newEval.ModifyIndex = index
  3102  
  3103  			if err := txn.Insert("evals", newEval); err != nil {
  3104  				return fmt.Errorf("eval insert failed: %v", err)
  3105  			}
  3106  		}
  3107  	}
  3108  
  3109  	// Insert the eval
  3110  	if err := txn.Insert("evals", eval); err != nil {
  3111  		return fmt.Errorf("eval insert failed: %v", err)
  3112  	}
  3113  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3114  		return fmt.Errorf("index update failed: %v", err)
  3115  	}
  3116  	return nil
  3117  }
  3118  
  3119  // updateEvalModifyIndex is used to update the modify index of an evaluation that has been
  3120  // through a scheduler pass. This is done as part of plan apply. It ensures that when subsequent
  3121  // scheduler workers process a re-queued evaluation they see any partial updates from the plan apply.
  3122  func (s *StateStore) updateEvalModifyIndex(txn *txn, index uint64, evalID string) error {
  3123  	// Lookup the evaluation
  3124  	existing, err := txn.First("evals", "id", evalID)
  3125  	if err != nil {
  3126  		return fmt.Errorf("eval lookup failed: %v", err)
  3127  	}
  3128  	if existing == nil {
  3129  		s.logger.Error("unable to find eval", "eval_id", evalID)
  3130  		return fmt.Errorf("unable to find eval id %q", evalID)
  3131  	}
  3132  	eval := existing.(*structs.Evaluation).Copy()
  3133  	// Update the indexes
  3134  	eval.ModifyIndex = index
  3135  
  3136  	// Insert the eval
  3137  	if err := txn.Insert("evals", eval); err != nil {
  3138  		return fmt.Errorf("eval insert failed: %v", err)
  3139  	}
  3140  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3141  		return fmt.Errorf("index update failed: %v", err)
  3142  	}
  3143  	return nil
  3144  }
  3145  
  3146  // DeleteEvalsByFilter is used to delete all evals that are both safe to delete
  3147  // and match a filter, deleting at most perPage evals starting from pageToken.
  3148  func (s *StateStore) DeleteEvalsByFilter(index uint64, filterExpr string, pageToken string, perPage int32) error {
  3149  	txn := s.db.WriteTxn(index)
  3150  	defer txn.Abort()
  3151  
  3152  	// These are always user-initiated, so ensure the eval broker is paused.
  3153  	_, schedConfig, err := s.schedulerConfigTxn(txn)
  3154  	if err != nil {
  3155  		return err
  3156  	}
  3157  	if schedConfig == nil || !schedConfig.PauseEvalBroker {
  3158  		return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
  3159  	}
  3160  
  3161  	filter, err := bexpr.CreateEvaluator(filterExpr)
  3162  	if err != nil {
  3163  		return err
  3164  	}
  3165  
  3166  	iter, err := s.Evals(nil, SortDefault)
  3167  	if err != nil {
  3168  		return fmt.Errorf("failed to lookup evals: %v", err)
  3169  	}
  3170  
  3171  	// Note: the paginator package imports this package for testing, so we
  3172  	// can't use Paginator here without creating an import cycle.
  3173  	pageCount := int32(0)
  3174  
  3175  	for {
  3176  		if pageCount >= perPage {
  3177  			break
  3178  		}
  3179  		raw := iter.Next()
  3180  		if raw == nil {
  3181  			break
  3182  		}
  3183  		eval := raw.(*structs.Evaluation)
  3184  		if eval.ID < pageToken {
  3185  			continue
  3186  		}
  3187  
  3188  		deleteOk, err := s.EvalIsUserDeleteSafe(nil, eval)
  3189  		if !deleteOk || err != nil {
  3190  			continue
  3191  		}
  3192  		match, err := filter.Evaluate(eval)
  3193  		if !match || err != nil {
  3194  			continue
  3195  		}
  3196  		if err := txn.Delete("evals", eval); err != nil {
  3197  			return fmt.Errorf("eval delete failed: %v", err)
  3198  		}
  3199  		pageCount++
  3200  	}
  3201  
  3202  	err = txn.Commit()
  3203  	return err
  3204  }
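
        // Sketch (editor's illustration): deleting up to one page of matching
        // evals. The filter is a go-bexpr expression evaluated against each
        // structs.Evaluation; the selector name below is an assumption. Note the
        // eval broker must already be paused or the call returns an error.
        //
        //	err := s.DeleteEvalsByFilter(index, `Status == "complete"`, "", 100)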
  3205  
  3206  // EvalIsUserDeleteSafe ensures an evaluation is safe to delete based on its
  3207  // related allocation and job information. This follows similar, but not
  3208  // identical, rules to the eval reap checking, ensuring evaluations for running
  3209  // allocs, or allocs which still need the evaluation detail, are not deleted.
  3210  //
  3211  // Returns both a bool and an error so that an error in querying the related
  3212  // objects can be differentiated from reporting that the eval isn't safe to
  3213  // delete.
  3214  func (s *StateStore) EvalIsUserDeleteSafe(ws memdb.WatchSet, eval *structs.Evaluation) (bool, error) {
  3215  
  3216  	job, err := s.JobByID(ws, eval.Namespace, eval.JobID)
  3217  	if err != nil {
  3218  		return false, fmt.Errorf("failed to lookup job for eval: %v", err)
  3219  	}
  3220  
  3221  	allocs, err := s.AllocsByEval(ws, eval.ID)
  3222  	if err != nil {
  3223  		return false, fmt.Errorf("failed to lookup eval allocs: %v", err)
  3224  	}
  3225  
  3226  	return isEvalDeleteSafe(allocs, job), nil
  3227  }
  3228  
  3229  func isEvalDeleteSafe(allocs []*structs.Allocation, job *structs.Job) bool {
  3230  
  3231  	// If the job is deleted, stopped, or dead, all allocs are terminal and
  3232  	// the eval can be deleted.
  3233  	if job == nil || job.Stop || job.Status == structs.JobStatusDead {
  3234  		return true
  3235  	}
  3236  
  3237  	// Iterate the allocations associated to the eval, if any, and check
  3238  	// whether we can delete the eval.
  3239  	for _, alloc := range allocs {
  3240  
  3241  		// If the allocation is still classed as running on the client, or
  3242  		// might be, we can't delete.
  3243  		switch alloc.ClientStatus {
  3244  		case structs.AllocClientStatusRunning, structs.AllocClientStatusUnknown:
  3245  			return false
  3246  		}
  3247  
  3248  		// If the alloc hasn't failed then we don't need to consider it for
  3249  		// rescheduling. Rescheduling needs to copy over information from the
  3250  		// previous alloc so that it can enforce the reschedule policy.
  3251  		if alloc.ClientStatus != structs.AllocClientStatusFailed {
  3252  			continue
  3253  		}
  3254  
  3255  		var reschedulePolicy *structs.ReschedulePolicy
  3256  		tg := job.LookupTaskGroup(alloc.TaskGroup)
  3257  
  3258  		if tg != nil {
  3259  			reschedulePolicy = tg.ReschedulePolicy
  3260  		}
  3261  
  3262  		// No reschedule policy or rescheduling is disabled
  3263  		if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
  3264  			continue
  3265  		}
  3266  
  3267  		// The restart tracking information has not been carried forward.
  3268  		if alloc.NextAllocation == "" {
  3269  			return false
  3270  		}
  3271  
  3272  		// This task has unlimited rescheduling and the alloc has not been
  3273  		// replaced, so we can't delete the eval yet.
  3274  		if reschedulePolicy.Unlimited {
  3275  			return false
  3276  		}
  3277  
  3278  		// No restarts have been attempted yet.
  3279  		if alloc.RescheduleTracker == nil || len(alloc.RescheduleTracker.Events) == 0 {
  3280  			return false
  3281  		}
  3282  	}
  3283  
  3284  	return true
  3285  }
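
        // Sketch (editor's illustration) of the rules above: a deleted job makes
        // any eval safe to delete, while a running alloc always blocks deletion.
        //
        //	running := &structs.Allocation{ClientStatus: structs.AllocClientStatusRunning}
        //	job := &structs.Job{Status: structs.JobStatusRunning}
        //
        //	isEvalDeleteSafe(nil, nil)                            // => true (job deleted)
        //	isEvalDeleteSafe([]*structs.Allocation{running}, job) // => false (alloc running)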
  3286  
  3287  // DeleteEval is used to delete a set of evaluations and their allocations
  3288  func (s *StateStore) DeleteEval(index uint64, evals, allocs []string, userInitiated bool) error {
  3289  	txn := s.db.WriteTxn(index)
  3290  	defer txn.Abort()
  3291  
  3292  	// If this deletion has been initiated by an operator, ensure the eval
  3293  	// broker is paused.
  3294  	if userInitiated {
  3295  		_, schedConfig, err := s.schedulerConfigTxn(txn)
  3296  		if err != nil {
  3297  			return err
  3298  		}
  3299  		if schedConfig == nil || !schedConfig.PauseEvalBroker {
  3300  			return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
  3301  		}
  3302  	}
  3303  
  3304  	jobs := make(map[structs.NamespacedID]string, len(evals))
  3305  
  3306  	// evalsTableUpdated and allocsTableUpdated allow us to track whether each
  3307  	// table has been modified. This allows us to skip updating the index table
  3308  	// entries if we do not need to.
  3309  	var evalsTableUpdated, allocsTableUpdated bool
  3310  
  3311  	for _, eval := range evals {
  3312  		existing, err := txn.First("evals", "id", eval)
  3313  		if err != nil {
  3314  			return fmt.Errorf("eval lookup failed: %v", err)
  3315  		}
  3316  		if existing == nil {
  3317  			continue
  3318  		}
  3319  		if err := txn.Delete("evals", existing); err != nil {
  3320  			return fmt.Errorf("eval delete failed: %v", err)
  3321  		}
  3322  
  3323  		// Mark that we have made a successful modification to the evals
  3324  		// table.
  3325  		evalsTableUpdated = true
  3326  
  3327  		eval := existing.(*structs.Evaluation)
  3328  
  3329  		tuple := structs.NamespacedID{
  3330  			ID:        eval.JobID,
  3331  			Namespace: eval.Namespace,
  3332  		}
  3333  		jobs[tuple] = ""
  3334  	}
  3335  
  3336  	for _, alloc := range allocs {
  3337  		raw, err := txn.First("allocs", "id", alloc)
  3338  		if err != nil {
  3339  			return fmt.Errorf("alloc lookup failed: %v", err)
  3340  		}
  3341  		if raw == nil {
  3342  			continue
  3343  		}
  3344  		if err := txn.Delete("allocs", raw); err != nil {
  3345  			return fmt.Errorf("alloc delete failed: %v", err)
  3346  		}
  3347  
  3348  		// Mark that we have made a successful modification to the allocs
  3349  		// table.
  3350  		allocsTableUpdated = true
  3351  	}
  3352  
  3353  	// Update the indexes
  3354  	if evalsTableUpdated {
  3355  		if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  3356  			return fmt.Errorf("index update failed: %v", err)
  3357  		}
  3358  	}
  3359  	if allocsTableUpdated {
  3360  		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3361  			return fmt.Errorf("index update failed: %v", err)
  3362  		}
  3363  	}
  3364  
  3365  	// Set the job's status
  3366  	if err := s.setJobStatuses(index, txn, jobs, true); err != nil {
  3367  		return fmt.Errorf("setting job status failed: %v", err)
  3368  	}
  3369  
  3370  	return txn.Commit()
  3371  }
  3372  
  3373  // EvalByID is used to lookup an eval by its ID
  3374  func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) {
  3375  	txn := s.db.ReadTxn()
  3376  
  3377  	watchCh, existing, err := txn.FirstWatch("evals", "id", id)
  3378  	if err != nil {
  3379  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3380  	}
  3381  
  3382  	ws.Add(watchCh)
  3383  
  3384  	if existing != nil {
  3385  		return existing.(*structs.Evaluation), nil
  3386  	}
  3387  	return nil, nil
  3388  }
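
        // Sketch (editor's illustration): the blocking-query loop RPC handlers
        // typically build around EvalByID. ctx and evalID are assumptions.
        //
        //	for {
        //		ws := memdb.NewWatchSet()
        //		eval, err := s.EvalByID(ws, evalID)
        //		if err != nil {
        //			return err
        //		}
        //		if eval != nil && eval.Status != structs.EvalStatusPending {
        //			return nil // eval reached a non-pending state
        //		}
        //		if err := ws.WatchCtx(ctx); err != nil {
        //			return err // timeout or cancellation
        //		}
        //	}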
  3389  
  3390  // EvalsRelatedToID is used to retrieve the evals that are related (next,
  3391  // previous, or blocked) to the provided eval ID.
  3392  func (s *StateStore) EvalsRelatedToID(ws memdb.WatchSet, id string) ([]*structs.EvaluationStub, error) {
  3393  	txn := s.db.ReadTxn()
  3394  
  3395  	raw, err := txn.First("evals", "id", id)
  3396  	if err != nil {
  3397  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3398  	}
  3399  	if raw == nil {
  3400  		return nil, nil
  3401  	}
  3402  	eval := raw.(*structs.Evaluation)
  3403  
  3404  	relatedEvals := []*structs.EvaluationStub{}
  3405  	todo := eval.RelatedIDs()
  3406  	done := map[string]bool{
  3407  		eval.ID: true, // don't place the requested eval in the related list.
  3408  	}
  3409  
  3410  	for len(todo) > 0 {
  3411  		// Pop the first value from the todo list.
  3412  		current := todo[0]
  3413  		todo = todo[1:]
  3414  		if current == "" {
  3415  			continue
  3416  		}
  3417  
  3418  		// Skip value if we already have it in the results.
  3419  		if done[current] {
  3420  			continue
  3421  		}
  3422  
  3423  		eval, err := s.EvalByID(ws, current)
  3424  		if err != nil {
  3425  			return nil, err
  3426  		}
  3427  		if eval == nil {
  3428  			continue
  3429  		}
  3430  
  3431  		todo = append(todo, eval.RelatedIDs()...)
  3432  		relatedEvals = append(relatedEvals, eval.Stub())
  3433  		done[eval.ID] = true
  3434  	}
  3435  
  3436  	return relatedEvals, nil
  3437  }
  3438  
  3439  // EvalsByIDPrefix is used to lookup evaluations by prefix in a particular
  3440  // namespace
  3441  func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
  3442  	txn := s.db.ReadTxn()
  3443  
  3444  	var iter memdb.ResultIterator
  3445  	var err error
  3446  
  3447  	// Get an iterator over all evals by the id prefix
  3448  	switch sort {
  3449  	case SortReverse:
  3450  		iter, err = txn.GetReverse("evals", "id_prefix", id)
  3451  	default:
  3452  		iter, err = txn.Get("evals", "id_prefix", id)
  3453  	}
  3454  	if err != nil {
  3455  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  3456  	}
  3457  
  3458  	ws.Add(iter.WatchCh())
  3459  
  3460  	// Wrap the iterator in a filter
  3461  	wrap := memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace))
  3462  	return wrap, nil
  3463  }
  3464  
  3465  // evalNamespaceFilter returns a filter function that filters all evaluations
  3466  // not in the given namespace.
  3467  func evalNamespaceFilter(namespace string) func(interface{}) bool {
  3468  	return func(raw interface{}) bool {
  3469  		eval, ok := raw.(*structs.Evaluation)
  3470  		if !ok {
  3471  			return true
  3472  		}
  3473  
  3474  		return namespace != structs.AllNamespacesSentinel &&
  3475  			eval.Namespace != namespace
  3476  	}
  3477  }
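
        // Note (editor's illustration): passing structs.AllNamespacesSentinel
        // short-circuits the filter, so a prefix lookup can span namespaces:
        //
        //	iter, err := s.EvalsByIDPrefix(ws, structs.AllNamespacesSentinel, "c0ffee", SortDefault)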
  3478  
  3479  // EvalsByJob returns all the evaluations by job id
  3480  func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) {
  3481  	txn := s.db.ReadTxn()
  3482  
  3483  	// Get an iterator over the job's evaluations
  3484  	iter, err := txn.Get("evals", "job_prefix", namespace, jobID)
  3485  	if err != nil {
  3486  		return nil, err
  3487  	}
  3488  
  3489  	ws.Add(iter.WatchCh())
  3490  
  3491  	var out []*structs.Evaluation
  3492  	for {
  3493  		raw := iter.Next()
  3494  		if raw == nil {
  3495  			break
  3496  		}
  3497  
  3498  		e := raw.(*structs.Evaluation)
  3499  
  3500  		// Filter non-exact matches
  3501  		if e.JobID != jobID {
  3502  			continue
  3503  		}
  3504  
  3505  		out = append(out, e)
  3506  	}
  3507  	return out, nil
  3508  }
  3509  
  3510  // Evals returns an iterator over all the evaluations in ascending or descending
  3511  // order of CreateIndex as determined by the sort parameter.
  3512  func (s *StateStore) Evals(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  3513  	txn := s.db.ReadTxn()
  3514  
  3515  	var it memdb.ResultIterator
  3516  	var err error
  3517  
  3518  	switch sort {
  3519  	case SortReverse:
  3520  		it, err = txn.GetReverse("evals", "create")
  3521  	default:
  3522  		it, err = txn.Get("evals", "create")
  3523  	}
  3524  
  3525  	if err != nil {
  3526  		return nil, err
  3527  	}
  3528  
  3529  	ws.Add(it.WatchCh())
  3530  
  3531  	return it, nil
  3532  }
  3533  
  3534  // EvalsByNamespace returns an iterator over all evaluations in no particular
  3535  // order.
  3536  //
  3537  // todo(shoenig): can this be removed?
  3538  func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  3539  	txn := s.db.ReadTxn()
  3540  
  3541  	it, err := txn.Get("evals", "namespace", namespace)
  3542  	if err != nil {
  3543  		return nil, err
  3544  	}
  3545  
  3546  	ws.Add(it.WatchCh())
  3547  
  3548  	return it, nil
  3549  }
  3550  
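        // EvalsByNamespaceOrdered returns an iterator over all evaluations in the
        // given namespace, in ascending or descending order of CreateIndex as
        // determined by the sort parameter.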
  3551  func (s *StateStore) EvalsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
  3552  	txn := s.db.ReadTxn()
  3553  
  3554  	var (
  3555  		it    memdb.ResultIterator
  3556  		err   error
  3557  		exact = terminate(namespace)
  3558  	)
  3559  
  3560  	switch sort {
  3561  	case SortReverse:
  3562  		it, err = txn.GetReverse("evals", "namespace_create_prefix", exact)
  3563  	default:
  3564  		it, err = txn.Get("evals", "namespace_create_prefix", exact)
  3565  	}
  3566  
  3567  	if err != nil {
  3568  		return nil, err
  3569  	}
  3570  
  3571  	ws.Add(it.WatchCh())
  3572  
  3573  	return it, nil
  3574  }
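
        // Sketch (editor's illustration): why the namespace is run through
        // terminate above. The "namespace_create_prefix" index is compound, so a
        // raw prefix scan for "dev" would also match a namespace such as "dev-2";
        // the appended terminator keeps the namespace match exact. With the read
        // txn above in scope:
        //
        //	it, _ := txn.Get("evals", "namespace_create_prefix", "dev")           // "dev" and "dev-2"
        //	it, _ = txn.Get("evals", "namespace_create_prefix", terminate("dev")) // only "dev"
        //	_ = it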
  3575  
  3576  // UpdateAllocsFromClient is used to update an allocation based on input
  3577  // from a client. While the schedulers are the authority on the allocation for
  3578  // most things, some updates are authoritative from the client. Specifically,
  3579  // the desired state comes from the schedulers, while the actual state comes
  3580  // from clients.
  3581  func (s *StateStore) UpdateAllocsFromClient(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
  3582  	txn := s.db.WriteTxnMsgT(msgType, index)
  3583  	defer txn.Abort()
  3584  
  3585  	// Handle each of the updated allocations
  3586  	for _, alloc := range allocs {
  3587  		if err := s.nestedUpdateAllocFromClient(txn, index, alloc); err != nil {
  3588  			return err
  3589  		}
  3590  	}
  3591  
  3592  	// Update the indexes
  3593  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3594  		return fmt.Errorf("index update failed: %v", err)
  3595  	}
  3596  
  3597  	return txn.Commit()
  3598  }
  3599  
  3600  // nestedUpdateAllocFromClient is used to nest an update of an allocation's client status within a transaction
  3601  func (s *StateStore) nestedUpdateAllocFromClient(txn *txn, index uint64, alloc *structs.Allocation) error {
  3602  	// Look for existing alloc
  3603  	existing, err := txn.First("allocs", "id", alloc.ID)
  3604  	if err != nil {
  3605  		return fmt.Errorf("alloc lookup failed: %v", err)
  3606  	}
  3607  
  3608  	// Nothing to do if this does not exist
  3609  	if existing == nil {
  3610  		return nil
  3611  	}
  3612  	exist := existing.(*structs.Allocation)
  3613  
  3614  	// Copy everything from the existing allocation
  3615  	copyAlloc := exist.Copy()
  3616  
  3617  	// Pull in anything the client is the authority on
  3618  	copyAlloc.ClientStatus = alloc.ClientStatus
  3619  	copyAlloc.ClientDescription = alloc.ClientDescription
  3620  	copyAlloc.TaskStates = alloc.TaskStates
  3621  	copyAlloc.NetworkStatus = alloc.NetworkStatus
  3622  
  3623  	// The client can only set its deployment health and timestamp, so just take
  3624  	// those
  3625  	if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil {
  3626  		oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth()
  3627  		newHasHealthy := alloc.DeploymentStatus.HasHealth()
  3628  
  3629  		// We got new health information from the client
  3630  		if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) {
  3631  			// Updated deployment health and timestamp
  3632  			copyAlloc.DeploymentStatus.Healthy = pointer.Of(*alloc.DeploymentStatus.Healthy)
  3633  			copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp
  3634  			copyAlloc.DeploymentStatus.ModifyIndex = index
  3635  		}
  3636  	} else if alloc.DeploymentStatus != nil {
  3637  		// First time getting a deployment status so copy everything and just
  3638  		// set the index
  3639  		copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy()
  3640  		copyAlloc.DeploymentStatus.ModifyIndex = index
  3641  	}
  3642  
  3643  	// Update the modify index
  3644  	copyAlloc.ModifyIndex = index
  3645  
  3646  	// Update the modify time
  3647  	copyAlloc.ModifyTime = alloc.ModifyTime
  3648  
  3649  	if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3650  		return fmt.Errorf("error updating deployment: %v", err)
  3651  	}
  3652  
  3653  	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3654  		return fmt.Errorf("error updating job summary: %v", err)
  3655  	}
  3656  
  3657  	if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil {
  3658  		return err
  3659  	}
  3660  
  3661  	if err := s.updatePluginForTerminalAlloc(index, copyAlloc, txn); err != nil {
  3662  		return err
  3663  	}
  3664  
  3665  	// Update the allocation
  3666  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  3667  		return fmt.Errorf("alloc insert failed: %v", err)
  3668  	}
  3669  
  3670  	// Set the job's status
  3671  	forceStatus := ""
  3672  	if !copyAlloc.TerminalStatus() {
  3673  		forceStatus = structs.JobStatusRunning
  3674  	}
  3675  
  3676  	tuple := structs.NamespacedID{
  3677  		ID:        exist.JobID,
  3678  		Namespace: exist.Namespace,
  3679  	}
  3680  	jobs := map[structs.NamespacedID]string{tuple: forceStatus}
  3681  
  3682  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  3683  		return fmt.Errorf("setting job status failed: %v", err)
  3684  	}
  3685  	return nil
  3686  }
  3687  
  3688  // UpsertAllocs is used to evict a set of allocations and allocate new ones at
  3689  // the same time.
  3690  func (s *StateStore) UpsertAllocs(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
  3691  	txn := s.db.WriteTxn(index)
  3692  	defer txn.Abort()
  3693  	if err := s.upsertAllocsImpl(index, allocs, txn); err != nil {
  3694  		return err
  3695  	}
  3696  	return txn.Commit()
  3697  }
  3698  
  3699  // upsertAllocsImpl is the actual implementation of UpsertAllocs so that it may
  3700  // be used within an existing transaction.
  3701  func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *txn) error {
  3702  	// Handle the allocations
  3703  	jobs := make(map[structs.NamespacedID]string, 1)
  3704  	for _, alloc := range allocs {
  3705  		existing, err := txn.First("allocs", "id", alloc.ID)
  3706  		if err != nil {
  3707  			return fmt.Errorf("alloc lookup failed: %v", err)
  3708  		}
  3709  		exist, _ := existing.(*structs.Allocation)
  3710  
  3711  		if exist == nil {
  3712  			alloc.CreateIndex = index
  3713  			alloc.ModifyIndex = index
  3714  			alloc.AllocModifyIndex = index
  3715  			if alloc.DeploymentStatus != nil {
  3716  				alloc.DeploymentStatus.ModifyIndex = index
  3717  			}
  3718  
  3719  			// Issue https://github.com/hashicorp/nomad/issues/2583 uncovered
  3720  			// a race between a forced garbage collection and the scheduler
  3721  			// marking an allocation as terminal. The issue is that the
  3722  			// allocation from the scheduler has its job normalized and the FSM
  3723  			// will only denormalize if the allocation is not terminal. However,
  3724  			// if the allocation is garbage collected, that will result in an
  3725  			// allocation being upserted for the first time without a job
  3726  			// attached. By returning an error here, we cause the FSM to error,
  3727  			// which causes the plan_apply to error and thus causes the
  3728  			// evaluation to be failed. This forces an index refresh that
  3729  			// should solve this issue.
  3730  			if alloc.Job == nil {
  3731  				return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID)
  3732  			}
  3733  		} else {
  3734  			alloc.CreateIndex = exist.CreateIndex
  3735  			alloc.ModifyIndex = index
  3736  			alloc.AllocModifyIndex = index
  3737  
  3738  			// Keep the clients task states
  3739  			alloc.TaskStates = exist.TaskStates
  3740  
  3741  			// If the scheduler is marking this allocation as lost or unknown we do not
  3742  			// want to reuse the status of the existing allocation.
  3743  			if alloc.ClientStatus != structs.AllocClientStatusLost &&
  3744  				alloc.ClientStatus != structs.AllocClientStatusUnknown {
  3745  				alloc.ClientStatus = exist.ClientStatus
  3746  				alloc.ClientDescription = exist.ClientDescription
  3747  			}
  3748  
  3749  			// The job has been denormalized so re-attach the original job
  3750  			if alloc.Job == nil {
  3751  				alloc.Job = exist.Job
  3752  			}
  3753  		}
  3754  
  3755  		// OPTIMIZATION:
  3756  		// These should be given a map of new to old allocations, and the
  3757  		// updates should be done in one pass across all changes. The current
  3758  		// implementation causes O(n) lookups/copies/insertions rather than O(1).
  3759  		if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil {
  3760  			return fmt.Errorf("error updating deployment: %v", err)
  3761  		}
  3762  
  3763  		if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil {
  3764  			return fmt.Errorf("error updating job summary: %v", err)
  3765  		}
  3766  
  3767  		if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil {
  3768  			return err
  3769  		}
  3770  
  3771  		if err := s.updatePluginForTerminalAlloc(index, alloc, txn); err != nil {
  3772  			return err
  3773  		}
  3774  
  3775  		if err := txn.Insert("allocs", alloc); err != nil {
  3776  			return fmt.Errorf("alloc insert failed: %v", err)
  3777  		}
  3778  
  3779  		if alloc.PreviousAllocation != "" {
  3780  			prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation)
  3781  			if err != nil {
  3782  				return fmt.Errorf("alloc lookup failed: %v", err)
  3783  			}
  3784  			existingPrevAlloc, _ := prevAlloc.(*structs.Allocation)
  3785  			if existingPrevAlloc != nil {
  3786  				prevAllocCopy := existingPrevAlloc.Copy()
  3787  				prevAllocCopy.NextAllocation = alloc.ID
  3788  				prevAllocCopy.ModifyIndex = index
  3789  				if err := txn.Insert("allocs", prevAllocCopy); err != nil {
  3790  					return fmt.Errorf("alloc insert failed: %v", err)
  3791  				}
  3792  			}
  3793  		}
  3794  
  3795  		// If the allocation is running, force the job to running status.
  3796  		forceStatus := ""
  3797  		if !alloc.TerminalStatus() {
  3798  			forceStatus = structs.JobStatusRunning
  3799  		}
  3800  
  3801  		tuple := structs.NamespacedID{
  3802  			ID:        alloc.JobID,
  3803  			Namespace: alloc.Namespace,
  3804  		}
  3805  		jobs[tuple] = forceStatus
  3806  	}
  3807  
  3808  	// Update the indexes
  3809  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3810  		return fmt.Errorf("index update failed: %v", err)
  3811  	}
  3812  
  3813  	// Set the job's status
  3814  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  3815  		return fmt.Errorf("setting job status failed: %v", err)
  3816  	}
  3817  
  3818  	return nil
  3819  }
  3820  
  3821  // UpdateAllocsDesiredTransitions is used to update a set of allocations'
  3822  // desired transitions.
  3823  func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType, index uint64, allocs map[string]*structs.DesiredTransition,
  3824  	evals []*structs.Evaluation) error {
  3825  
  3826  	txn := s.db.WriteTxnMsgT(msgType, index)
  3827  	defer txn.Abort()
  3828  
  3829  	// Handle each of the updated allocations
  3830  	for id, transition := range allocs {
  3831  		if err := s.UpdateAllocDesiredTransitionTxn(txn, index, id, transition); err != nil {
  3832  			return err
  3833  		}
  3834  	}
  3835  
  3836  	for _, eval := range evals {
  3837  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  3838  			return err
  3839  		}
  3840  	}
  3841  
  3842  	// Update the indexes
  3843  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3844  		return fmt.Errorf("index update failed: %v", err)
  3845  	}
  3846  
  3847  	return txn.Commit()
  3848  }
  3849  
  3850  // UpdateAllocDesiredTransitionTxn is used to nest an update of an
  3851  // allocation's desired transition within a transaction.
  3852  func (s *StateStore) UpdateAllocDesiredTransitionTxn(
  3853  	txn *txn, index uint64, allocID string,
  3854  	transition *structs.DesiredTransition) error {
  3855  
  3856  	// Look for existing alloc
  3857  	existing, err := txn.First("allocs", "id", allocID)
  3858  	if err != nil {
  3859  		return fmt.Errorf("alloc lookup failed: %v", err)
  3860  	}
  3861  
  3862  	// Nothing to do if this does not exist
  3863  	if existing == nil {
  3864  		return nil
  3865  	}
  3866  	exist := existing.(*structs.Allocation)
  3867  
  3868  	// Copy everything from the existing allocation
  3869  	copyAlloc := exist.Copy()
  3870  
  3871  	// Merge the desired transitions
  3872  	copyAlloc.DesiredTransition.Merge(transition)
  3873  
  3874  	// Update the modify indexes
  3875  	copyAlloc.ModifyIndex = index
  3876  	copyAlloc.AllocModifyIndex = index
  3877  
  3878  	// Update the allocation
  3879  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  3880  		return fmt.Errorf("alloc insert failed: %v", err)
  3881  	}
  3882  
  3883  	return nil
  3884  }
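
        // Sketch (editor's illustration): marking an allocation for migration,
        // e.g. during a node drain. MsgTypeTestSetup stands in for a real message
        // type; allocID and index are assumptions.
        //
        //	transitions := map[string]*structs.DesiredTransition{
        //		allocID: {Migrate: pointer.Of(true)},
        //	}
        //	err := s.UpdateAllocsDesiredTransitions(structs.MsgTypeTestSetup, index, transitions, nil)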
  3885  
  3886  // AllocByID is used to lookup an allocation by its ID
  3887  func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  3888  	txn := s.db.ReadTxn()
  3889  	return s.allocByIDImpl(txn, ws, id)
  3890  }
  3891  
  3892  // allocByIDImpl retrieves an allocation and is called within an existing
  3893  // transaction. An optional watch set can be passed to add the allocation to
  3894  // the watch set.
  3895  func (s *StateStore) allocByIDImpl(txn Txn, ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  3896  	watchCh, raw, err := txn.FirstWatch("allocs", "id", id)
  3897  	if err != nil {
  3898  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  3899  	}
  3900  
  3901  	ws.Add(watchCh)
  3902  
  3903  	if raw == nil {
  3904  		return nil, nil
  3905  	}
  3906  	alloc := raw.(*structs.Allocation)
  3907  	return alloc, nil
  3908  }
  3909  
  3910  // AllocsByIDPrefix is used to lookup allocs by prefix
  3911  func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
  3912  	txn := s.db.ReadTxn()
  3913  
  3914  	var iter memdb.ResultIterator
  3915  	var err error
  3916  
  3917  	switch sort {
  3918  	case SortReverse:
  3919  		iter, err = txn.GetReverse("allocs", "id_prefix", id)
  3920  	default:
  3921  		iter, err = txn.Get("allocs", "id_prefix", id)
  3922  	}
  3923  	if err != nil {
  3924  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  3925  	}
  3926  
  3927  	ws.Add(iter.WatchCh())
  3928  
  3929  	// Wrap the iterator in a filter
  3930  	wrap := memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace))
  3931  	return wrap, nil
  3932  }
  3933  
  3934  // allocNamespaceFilter returns a filter function that filters all allocations
  3935  // not in the given namespace.
  3936  func allocNamespaceFilter(namespace string) func(interface{}) bool {
  3937  	return func(raw interface{}) bool {
  3938  		alloc, ok := raw.(*structs.Allocation)
  3939  		if !ok {
  3940  			return true
  3941  		}
  3942  
  3943  		if namespace == structs.AllNamespacesSentinel {
  3944  			return false
  3945  		}
  3946  
  3947  		return alloc.Namespace != namespace
  3948  	}
  3949  }
  3950  
  3951  // AllocsByIDPrefixAllNSs is used to lookup allocs by prefix across all namespaces.
  3952  func (s *StateStore) AllocsByIDPrefixAllNSs(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  3953  	txn := s.db.ReadTxn()
  3954  
  3955  	iter, err := txn.Get("allocs", "id_prefix", prefix)
  3956  	if err != nil {
  3957  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  3958  	}
  3959  
  3960  	ws.Add(iter.WatchCh())
  3961  
  3962  	return iter, nil
  3963  }
  3964  
  3965  // AllocsByNode returns all the allocations by node
  3966  func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  3967  	txn := s.db.ReadTxn()
  3968  
  3969  	return allocsByNodeTxn(txn, ws, node)
  3970  }
  3971  
  3972  func allocsByNodeTxn(txn ReadTxn, ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  3973  	// Get an iterator over the node allocations, using only the
  3974  	// node prefix which ignores the terminal status
  3975  	iter, err := txn.Get("allocs", "node_prefix", node)
  3976  	if err != nil {
  3977  		return nil, err
  3978  	}
  3979  
  3980  	ws.Add(iter.WatchCh())
  3981  
  3982  	var out []*structs.Allocation
  3983  	for {
  3984  		raw := iter.Next()
  3985  		if raw == nil {
  3986  			break
  3987  		}
  3988  		out = append(out, raw.(*structs.Allocation))
  3989  	}
  3990  	return out, nil
  3991  }
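
        // Sketch (editor's illustration): counting the live allocations on a
        // node, e.g. to decide whether a drain has completed. nodeID is an
        // assumption.
        //
        //	allocs, err := s.AllocsByNode(nil, nodeID)
        //	if err != nil {
        //		return err
        //	}
        //	live := 0
        //	for _, alloc := range allocs {
        //		if !alloc.TerminalStatus() {
        //			live++
        //		}
        //	}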
  3992  
  3993  // AllocsByNodeTerminal returns all the allocations by node and terminal
  3994  // status.
  3995  func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) {
  3996  	txn := s.db.ReadTxn()
  3997  
  3998  	// Get an iterator over the node allocations
  3999  	iter, err := txn.Get("allocs", "node", node, terminal)
  4000  	if err != nil {
  4001  		return nil, err
  4002  	}
  4003  
  4004  	ws.Add(iter.WatchCh())
  4005  
  4006  	var out []*structs.Allocation
  4007  	for {
  4008  		raw := iter.Next()
  4009  		if raw == nil {
  4010  			break
  4011  		}
  4012  		out = append(out, raw.(*structs.Allocation))
  4013  	}
  4014  	return out, nil
  4015  }
  4016  
  4017  // AllocsByJob returns allocations by job id
  4018  func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) {
  4019  	txn := s.db.ReadTxn()
  4020  
  4021  	// Get the job
  4022  	var job *structs.Job
  4023  	rawJob, err := txn.First("jobs", "id", namespace, jobID)
  4024  	if err != nil {
  4025  		return nil, err
  4026  	}
  4027  	if rawJob != nil {
  4028  		job = rawJob.(*structs.Job)
  4029  	}
  4030  
  4031  	// Get an iterator over the job's allocations
  4032  	iter, err := txn.Get("allocs", "job", namespace, jobID)
  4033  	if err != nil {
  4034  		return nil, err
  4035  	}
  4036  
  4037  	ws.Add(iter.WatchCh())
  4038  
  4039  	var out []*structs.Allocation
  4040  	for {
  4041  		raw := iter.Next()
  4042  		if raw == nil {
  4043  			break
  4044  		}
  4045  
  4046  		alloc := raw.(*structs.Allocation)
  4047  		// Skip the allocation if it belongs to a job with the same ID but a
  4048  		// different create index, unless the caller asked for allocations
  4049  		// across any create index of the job.
  4050  		if !anyCreateIndex && job != nil && alloc.Job.CreateIndex != job.CreateIndex {
  4051  			continue
  4052  		}
  4053  		out = append(out, alloc)
  4054  	}
  4055  	return out, nil
  4056  }
  4057  
  4058  // AllocsByEval returns all the allocations by eval id
  4059  func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) {
  4060  	txn := s.db.ReadTxn()
  4061  
  4062  	// Get an iterator over the eval allocations
  4063  	iter, err := txn.Get("allocs", "eval", evalID)
  4064  	if err != nil {
  4065  		return nil, err
  4066  	}
  4067  
  4068  	ws.Add(iter.WatchCh())
  4069  
  4070  	var out []*structs.Allocation
  4071  	for {
  4072  		raw := iter.Next()
  4073  		if raw == nil {
  4074  			break
  4075  		}
  4076  		out = append(out, raw.(*structs.Allocation))
  4077  	}
  4078  	return out, nil
  4079  }
  4080  
  4081  // AllocsByDeployment returns all the allocations by deployment id
  4082  func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) {
  4083  	txn := s.db.ReadTxn()
  4084  
  4085  	// Get an iterator over the deployment's allocations
  4086  	iter, err := txn.Get("allocs", "deployment", deploymentID)
  4087  	if err != nil {
  4088  		return nil, err
  4089  	}
  4090  
  4091  	ws.Add(iter.WatchCh())
  4092  
  4093  	var out []*structs.Allocation
  4094  	for {
  4095  		raw := iter.Next()
  4096  		if raw == nil {
  4097  			break
  4098  		}
  4099  		out = append(out, raw.(*structs.Allocation))
  4100  	}
  4101  	return out, nil
  4102  }
  4103  
  4104  // Allocs returns an iterator over all the allocations, in ascending or descending order of CreateIndex as determined by the sort parameter.
  4105  func (s *StateStore) Allocs(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  4106  	txn := s.db.ReadTxn()
  4107  
  4108  	var it memdb.ResultIterator
  4109  	var err error
  4110  
  4111  	switch sort {
  4112  	case SortReverse:
  4113  		it, err = txn.GetReverse("allocs", "create")
  4114  	default:
  4115  		it, err = txn.Get("allocs", "create")
  4116  	}
  4117  
  4118  	if err != nil {
  4119  		return nil, err
  4120  	}
  4121  
  4122  	ws.Add(it.WatchCh())
  4123  
  4124  	return it, nil
  4125  }
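
        // Sketch (editor's illustration): newest-first iteration, the shape used
        // by paginated list endpoints.
        //
        //	iter, err := s.Allocs(ws, SortReverse)
        //	if err != nil {
        //		return err
        //	}
        //	for raw := iter.Next(); raw != nil; raw = iter.Next() {
        //		alloc := raw.(*structs.Allocation) // highest CreateIndex first
        //		_ = alloc
        //	}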
  4126  
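        // AllocsByNamespaceOrdered returns an iterator over all allocations in the
        // given namespace, in ascending or descending order of CreateIndex as
        // determined by the sort parameter.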
  4127  func (s *StateStore) AllocsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
  4128  	txn := s.db.ReadTxn()
  4129  
  4130  	var (
  4131  		it    memdb.ResultIterator
  4132  		err   error
  4133  		exact = terminate(namespace)
  4134  	)
  4135  
  4136  	switch sort {
  4137  	case SortReverse:
  4138  		it, err = txn.GetReverse("allocs", "namespace_create_prefix", exact)
  4139  	default:
  4140  		it, err = txn.Get("allocs", "namespace_create_prefix", exact)
  4141  	}
  4142  
  4143  	if err != nil {
  4144  		return nil, err
  4145  	}
  4146  
  4147  	ws.Add(it.WatchCh())
  4148  
  4149  	return it, nil
  4150  }
  4151  
  4152  // AllocsByNamespace returns an iterator over all the allocations in the
  4153  // namespace
  4154  func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  4155  	txn := s.db.ReadTxn()
  4156  	return s.allocsByNamespaceImpl(ws, txn, namespace)
  4157  }
  4158  
  4159  // allocsByNamespaceImpl returns an iterator over all the allocations in the
  4160  // namespace
  4161  func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *txn, namespace string) (memdb.ResultIterator, error) {
  4162  	// Walk the entire table
  4163  	iter, err := txn.Get("allocs", "namespace", namespace)
  4164  	if err != nil {
  4165  		return nil, err
  4166  	}
  4167  
  4168  	ws.Add(iter.WatchCh())
  4169  
  4170  	return iter, nil
  4171  }
  4172  
  4173  // UpsertVaultAccessor is used to register a set of Vault Accessors.
  4174  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  4175  	txn := s.db.WriteTxn(index)
  4176  	defer txn.Abort()
  4177  
  4178  	for _, accessor := range accessors {
  4179  		// Set the create index
  4180  		accessor.CreateIndex = index
  4181  
  4182  		// Insert the accessor
  4183  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  4184  			return fmt.Errorf("accessor insert failed: %v", err)
  4185  		}
  4186  	}
  4187  
  4188  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  4189  		return fmt.Errorf("index update failed: %v", err)
  4190  	}
  4191  
  4192  	return txn.Commit()
  4193  }
  4194  
  4195  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  4196  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  4197  	txn := s.db.WriteTxn(index)
  4198  	defer txn.Abort()
  4199  
  4200  	// Delete each accessor
  4201  	for _, accessor := range accessors {
  4202  		// Delete the accessor
  4203  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  4204  			return fmt.Errorf("accessor delete failed: %v", err)
  4205  		}
  4206  	}
  4207  
  4208  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  4209  		return fmt.Errorf("index update failed: %v", err)
  4210  	}
  4211  
  4212  	return txn.Commit()
  4213  }
  4214  
  4215  // VaultAccessor returns the given Vault accessor
  4216  func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) {
  4217  	txn := s.db.ReadTxn()
  4218  
  4219  	watchCh, existing, err := txn.FirstWatch("vault_accessors", "id", accessor)
  4220  	if err != nil {
  4221  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  4222  	}
  4223  
  4224  	ws.Add(watchCh)
  4225  
  4226  	if existing != nil {
  4227  		return existing.(*structs.VaultAccessor), nil
  4228  	}
  4229  
  4230  	return nil, nil
  4231  }
  4232  
  4233  // VaultAccessors returns an iterator of Vault accessors.
  4234  func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4235  	txn := s.db.ReadTxn()
  4236  
  4237  	iter, err := txn.Get("vault_accessors", "id")
  4238  	if err != nil {
  4239  		return nil, err
  4240  	}
  4241  
  4242  	ws.Add(iter.WatchCh())
  4243  
  4244  	return iter, nil
  4245  }
  4246  
  4247  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  4248  func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) {
  4249  	txn := s.db.ReadTxn()
  4250  
  4251  	// Get an iterator over the accessors
  4252  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  4253  	if err != nil {
  4254  		return nil, err
  4255  	}
  4256  
  4257  	ws.Add(iter.WatchCh())
  4258  
  4259  	var out []*structs.VaultAccessor
  4260  	for {
  4261  		raw := iter.Next()
  4262  		if raw == nil {
  4263  			break
  4264  		}
  4265  		out = append(out, raw.(*structs.VaultAccessor))
  4266  	}
  4267  	return out, nil
  4268  }
  4269  
  4270  // VaultAccessorsByNode returns all the Vault accessors by node id
  4271  func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) {
  4272  	txn := s.db.ReadTxn()
  4273  
  4274  	// Get an iterator over the accessors
  4275  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  4276  	if err != nil {
  4277  		return nil, err
  4278  	}
  4279  
  4280  	ws.Add(iter.WatchCh())
  4281  
  4282  	var out []*structs.VaultAccessor
  4283  	for {
  4284  		raw := iter.Next()
  4285  		if raw == nil {
  4286  			break
  4287  		}
  4288  		out = append(out, raw.(*structs.VaultAccessor))
  4289  	}
  4290  	return out, nil
  4291  }
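
        // Sketch (editor's illustration): a minimal version of the cleanup flow
        // when a node is removed. The accessors are looked up, revoked against
        // Vault outside the state store, then deleted; nodeID and index are
        // assumptions.
        //
        //	accessors, err := s.VaultAccessorsByNode(nil, nodeID)
        //	if err != nil {
        //		return err
        //	}
        //	// ...revoke the accessors with the Vault client...
        //	return s.DeleteVaultAccessors(index, accessors)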
  4292  
  4293  func indexEntry(table string, index uint64) *IndexEntry {
  4294  	return &IndexEntry{
  4295  		Key:   table,
  4296  		Value: index,
  4297  	}
  4298  }
  4299  
  4300  const siTokenAccessorTable = "si_token_accessors"
  4301  
  4302  // UpsertSITokenAccessors is used to register a set of Service Identity token accessors.
  4303  func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  4304  	txn := s.db.WriteTxn(index)
  4305  	defer txn.Abort()
  4306  
  4307  	for _, accessor := range accessors {
  4308  		// set the create index
  4309  		accessor.CreateIndex = index
  4310  
  4311  		// insert the accessor
  4312  		if err := txn.Insert(siTokenAccessorTable, accessor); err != nil {
  4313  			return fmt.Errorf("accessor insert failed: %w", err)
  4314  		}
  4315  	}
  4316  
  4317  	// update the index for this table
  4318  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  4319  		return fmt.Errorf("index update failed: %w", err)
  4320  	}
  4321  
  4322  	return txn.Commit()
  4323  }
  4324  
  4325  // DeleteSITokenAccessors is used to delete a set of Service Identity token accessors.
  4326  func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  4327  	txn := s.db.WriteTxn(index)
  4328  	defer txn.Abort()
  4329  
  4330  	// Delete each accessor
  4331  	for _, accessor := range accessors {
  4332  		// Delete the accessor
  4333  		if err := txn.Delete(siTokenAccessorTable, accessor); err != nil {
  4334  			return fmt.Errorf("accessor delete failed: %w", err)
  4335  		}
  4336  	}
  4337  
  4338  	// update the index for this table
  4339  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  4340  		return fmt.Errorf("index update failed: %w", err)
  4341  	}
  4342  
  4343  	return txn.Commit()
  4344  }
  4345  
  4346  // SITokenAccessor returns the given Service Identity token accessor.
  4347  func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) {
  4348  	txn := s.db.ReadTxn()
  4349  	defer txn.Abort()
  4350  
  4351  	watchCh, existing, err := txn.FirstWatch(siTokenAccessorTable, "id", accessorID)
  4352  	if err != nil {
  4353  		return nil, fmt.Errorf("accessor lookup failed: %w", err)
  4354  	}
  4355  
  4356  	ws.Add(watchCh)
  4357  
  4358  	if existing != nil {
  4359  		return existing.(*structs.SITokenAccessor), nil
  4360  	}
  4361  
  4362  	return nil, nil
  4363  }
  4364  
  4365  // SITokenAccessors returns an iterator of Service Identity token accessors.
  4366  func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4367  	txn := s.db.ReadTxn()
  4368  	defer txn.Abort()
  4369  
  4370  	iter, err := txn.Get(siTokenAccessorTable, "id")
  4371  	if err != nil {
  4372  		return nil, err
  4373  	}
  4374  
  4375  	ws.Add(iter.WatchCh())
  4376  
  4377  	return iter, nil
  4378  }
  4379  
  4380  // SITokenAccessorsByAlloc returns all the Service Identity token accessors by alloc ID.
  4381  func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) {
  4382  	txn := s.db.ReadTxn()
  4383  	defer txn.Abort()
  4384  
  4385  	// Get an iterator over the accessors
  4386  	iter, err := txn.Get(siTokenAccessorTable, "alloc_id", allocID)
  4387  	if err != nil {
  4388  		return nil, err
  4389  	}
  4390  
  4391  	ws.Add(iter.WatchCh())
  4392  
  4393  	var result []*structs.SITokenAccessor
  4394  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  4395  		result = append(result, raw.(*structs.SITokenAccessor))
  4396  	}
  4397  
  4398  	return result, nil
  4399  }
  4400  
  4401  // SITokenAccessorsByNode returns all the Service Identity token accessors by node ID.
  4402  func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) {
  4403  	txn := s.db.ReadTxn()
  4404  	defer txn.Abort()
  4405  
  4406  	// Get an iterator over the accessors
  4407  	iter, err := txn.Get(siTokenAccessorTable, "node_id", nodeID)
  4408  	if err != nil {
  4409  		return nil, err
  4410  	}
  4411  
  4412  	ws.Add(iter.WatchCh())
  4413  
  4414  	var result []*structs.SITokenAccessor
  4415  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  4416  		result = append(result, raw.(*structs.SITokenAccessor))
  4417  	}
  4418  
  4419  	return result, nil
  4420  }
  4421  
  4422  // UpdateDeploymentStatus is used to make deployment status updates and
  4423  // potentially create an evaluation
  4424  func (s *StateStore) UpdateDeploymentStatus(msgType structs.MessageType, index uint64, req *structs.DeploymentStatusUpdateRequest) error {
  4425  	txn := s.db.WriteTxnMsgT(msgType, index)
  4426  	defer txn.Abort()
  4427  
  4428  	if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  4429  		return err
  4430  	}
  4431  
  4432  	// Upsert the job if necessary
  4433  	if req.Job != nil {
  4434  		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
  4435  			return err
  4436  		}
  4437  	}
  4438  
  4439  	// Upsert the optional eval
  4440  	if req.Eval != nil {
  4441  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4442  			return err
  4443  		}
  4444  	}
  4445  
  4446  	return txn.Commit()
  4447  }
  4448  
  4449  // updateDeploymentStatusImpl is used to make deployment status updates
  4450  func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *txn) error {
  4451  	// Retrieve deployment
  4452  	ws := memdb.NewWatchSet()
  4453  	deployment, err := s.deploymentByIDImpl(ws, u.DeploymentID, txn)
  4454  	if err != nil {
  4455  		return err
  4456  	} else if deployment == nil {
  4457  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID)
  4458  	} else if !deployment.Active() {
  4459  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4460  	}
  4461  
  4462  	// Apply the new status
  4463  	copy := deployment.Copy()
  4464  	copy.Status = u.Status
  4465  	copy.StatusDescription = u.StatusDescription
  4466  	copy.ModifyIndex = index
  4467  
  4468  	// Insert the deployment
  4469  	if err := txn.Insert("deployment", copy); err != nil {
  4470  		return err
  4471  	}
  4472  
  4473  	// Update the index
  4474  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
  4475  		return fmt.Errorf("index update failed: %v", err)
  4476  	}
  4477  
  4478  	// If the deployment is being marked as complete, set the job to stable.
  4479  	if copy.Status == structs.DeploymentStatusSuccessful {
  4480  		if err := s.updateJobStabilityImpl(index, copy.Namespace, copy.JobID, copy.JobVersion, true, txn); err != nil {
  4481  			return fmt.Errorf("failed to update job stability: %v", err)
  4482  		}
  4483  	}
  4484  
  4485  	return nil
  4486  }
  4487  
  4488  // UpdateJobStability updates the stability of the given job and version to the
  4489  // desired status.
  4490  func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error {
  4491  	txn := s.db.WriteTxn(index)
  4492  	defer txn.Abort()
  4493  
  4494  	if err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, txn); err != nil {
  4495  		return err
  4496  	}
  4497  
  4498  	return txn.Commit()
  4499  }
  4500  
  4501  // updateJobStabilityImpl updates the stability of the given job and version
  4502  func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *txn) error {
  4503  	// Get the job that is referenced
  4504  	job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn)
  4505  	if err != nil {
  4506  		return err
  4507  	}
  4508  
  4509  	// Has already been cleared, nothing to do
  4510  	if job == nil {
  4511  		return nil
  4512  	}
  4513  
  4514  	// If the job already has the desired stability, nothing to do
  4515  	if job.Stable == stable {
  4516  		return nil
  4517  	}
  4518  
  4519  	copy := job.Copy()
  4520  	copy.Stable = stable
  4521  	return s.upsertJobImpl(index, copy, true, txn)
  4522  }
  4523  
  4524  // UpdateDeploymentPromotion is used to promote canaries in a deployment and
  4525  // potentially create an evaluation
  4526  func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentPromoteRequest) error {
  4527  	txn := s.db.WriteTxnMsgT(msgType, index)
  4528  	defer txn.Abort()
  4529  
  4530  	// Retrieve deployment and ensure it is not terminal and is active
  4531  	ws := memdb.NewWatchSet()
  4532  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  4533  	if err != nil {
  4534  		return err
  4535  	} else if deployment == nil {
  4536  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  4537  	} else if !deployment.Active() {
  4538  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4539  	}
  4540  
  4541  	// Retrieve affected allocations
  4542  	iter, err := txn.Get("allocs", "deployment", req.DeploymentID)
  4543  	if err != nil {
  4544  		return err
  4545  	}
  4546  
  4547  	// groupIndex is a map of groups being promoted
  4548  	groupIndex := make(map[string]struct{}, len(req.Groups))
  4549  	for _, g := range req.Groups {
  4550  		groupIndex[g] = struct{}{}
  4551  	}
  4552  
  4553  	// canaryIndex is the set of placed canaries in the deployment
  4554  	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
  4555  	for _, dstate := range deployment.TaskGroups {
  4556  		for _, c := range dstate.PlacedCanaries {
  4557  			canaryIndex[c] = struct{}{}
  4558  		}
  4559  	}
  4560  
  4561  	// healthyCounts is a mapping of group to the number of healthy canaries
  4562  	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
  4563  
  4564  	// promotable is the set of allocations that we can move from canary to
  4565  	// non-canary
  4566  	var promotable []*structs.Allocation
  4567  
  4568  	for {
  4569  		raw := iter.Next()
  4570  		if raw == nil {
  4571  			break
  4572  		}
  4573  
  4574  		alloc := raw.(*structs.Allocation)
  4575  
  4576  		// Check that the alloc is a canary
  4577  		if _, ok := canaryIndex[alloc.ID]; !ok {
  4578  			continue
  4579  		}
  4580  
  4581  		// Check that the canary is part of a group being promoted
  4582  		if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok {
  4583  			continue
  4584  		}
  4585  
  4586  		// Ensure the canaries are healthy
  4587  		if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() {
  4588  			continue
  4589  		}
  4590  
  4591  		healthyCounts[alloc.TaskGroup]++
  4592  		promotable = append(promotable, alloc)
  4593  	}
  4594  
  4595  	// Determine if we have enough healthy allocations
  4596  	var unhealthyErr multierror.Error
  4597  	for tg, dstate := range deployment.TaskGroups {
  4598  		if _, ok := groupIndex[tg]; !req.All && !ok {
  4599  			continue
  4600  		}
  4601  
  4602  		need := dstate.DesiredCanaries
  4603  		if need == 0 {
  4604  			continue
  4605  		}
  4606  
  4607  		if have := healthyCounts[tg]; have < need {
  4608  			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
  4609  		}
  4610  	}
  4611  
  4612  	if err := unhealthyErr.ErrorOrNil(); err != nil {
  4613  		return err
  4614  	}
  4615  
  4616  	// Update deployment
  4617  	copy := deployment.Copy()
  4618  	copy.ModifyIndex = index
  4619  	for tg, status := range copy.TaskGroups {
  4620  		_, ok := groupIndex[tg]
  4621  		if !req.All && !ok {
  4622  			continue
  4623  		}
  4624  
  4625  		// reset the progress deadline
  4626  		if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
  4627  			status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
  4628  		}
  4629  		status.Promoted = true
  4630  	}
  4631  
  4632  	// If the deployment no longer needs promotion, update its status
  4633  	if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning {
  4634  		copy.StatusDescription = structs.DeploymentStatusDescriptionRunning
  4635  	}
  4636  
  4637  	// Insert the deployment
  4638  	if err := s.upsertDeploymentImpl(index, copy, txn); err != nil {
  4639  		return err
  4640  	}
  4641  
  4642  	// Upsert the optional eval
  4643  	if req.Eval != nil {
  4644  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4645  			return err
  4646  		}
  4647  	}
  4648  
  4649  	// For each promotable allocation remove the canary field
  4650  	for _, alloc := range promotable {
  4651  		promoted := alloc.Copy()
  4652  		promoted.DeploymentStatus.Canary = false
  4653  		promoted.DeploymentStatus.ModifyIndex = index
  4654  		promoted.ModifyIndex = index
  4655  		promoted.AllocModifyIndex = index
  4656  
  4657  		if err := txn.Insert("allocs", promoted); err != nil {
  4658  			return fmt.Errorf("alloc insert failed: %v", err)
  4659  		}
  4660  	}
  4661  
  4662  	// Update the alloc index
  4663  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4664  		return fmt.Errorf("index update failed: %v", err)
  4665  	}
  4666  
  4667  	return txn.Commit()
  4668  }
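
        // A hedged usage sketch of the promotion applier above (store, msgType,
        // index, deploymentID, and promoteEval are hypothetical, and the method
        // name is assumed from the surrounding state-store conventions):
        //
        //	req := &structs.ApplyDeploymentPromoteRequest{}
        //	req.DeploymentID = deploymentID
        //	req.Groups = []string{"web"} // ignored when req.All is true
        //	req.Eval = promoteEval       // optional follow-up evaluation
        //	err := store.UpdateDeploymentPromotion(msgType, index, req)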
  4669  
  4670  // UpdateDeploymentAllocHealth is used to update the health of allocations as
  4671  // part of the deployment and potentially make an evaluation
  4672  func (s *StateStore) UpdateDeploymentAllocHealth(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error {
  4673  	txn := s.db.WriteTxnMsgT(msgType, index)
  4674  	defer txn.Abort()
  4675  
  4676  	// Retrieve deployment and ensure it is not terminal and is active
  4677  	ws := memdb.NewWatchSet()
  4678  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  4679  	if err != nil {
  4680  		return err
  4681  	} else if deployment == nil {
  4682  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  4683  	} else if !deployment.Active() {
  4684  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  4685  	}
  4686  
  4687  	// Update the health status of each allocation
  4688  	if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 {
  4689  		setAllocHealth := func(id string, healthy bool, ts time.Time) error {
  4690  			existing, err := txn.First("allocs", "id", id)
  4691  			if err != nil {
  4692  				return fmt.Errorf("alloc %q lookup failed: %v", id, err)
  4693  			}
  4694  			if existing == nil {
  4695  				return fmt.Errorf("unknown alloc %q", id)
  4696  			}
  4697  
  4698  			old := existing.(*structs.Allocation)
  4699  			if old.DeploymentID != req.DeploymentID {
  4700  				return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID)
  4701  			}
  4702  
  4703  			// Set the health
  4704  			copy := old.Copy()
  4705  			if copy.DeploymentStatus == nil {
  4706  				copy.DeploymentStatus = &structs.AllocDeploymentStatus{}
  4707  			}
  4708  			copy.DeploymentStatus.Healthy = pointer.Of(healthy)
  4709  			copy.DeploymentStatus.Timestamp = ts
  4710  			copy.DeploymentStatus.ModifyIndex = index
  4711  			copy.ModifyIndex = index
  4712  
  4713  			if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil {
  4714  				return fmt.Errorf("error updating deployment: %v", err)
  4715  			}
  4716  
  4717  			if err := txn.Insert("allocs", copy); err != nil {
  4718  				return fmt.Errorf("alloc insert failed: %v", err)
  4719  			}
  4720  
  4721  			return nil
  4722  		}
  4723  
  4724  		for _, id := range req.HealthyAllocationIDs {
  4725  			if err := setAllocHealth(id, true, req.Timestamp); err != nil {
  4726  				return err
  4727  			}
  4728  		}
  4729  		for _, id := range req.UnhealthyAllocationIDs {
  4730  			if err := setAllocHealth(id, false, req.Timestamp); err != nil {
  4731  				return err
  4732  			}
  4733  		}
  4734  
  4735  		// Update the indexes
  4736  		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  4737  			return fmt.Errorf("index update failed: %v", err)
  4738  		}
  4739  	}
  4740  
  4741  	// Update the deployment status as needed.
  4742  	if req.DeploymentUpdate != nil {
  4743  		if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  4744  			return err
  4745  		}
  4746  	}
  4747  
  4748  	// Upsert the job if necessary
  4749  	if req.Job != nil {
  4750  		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
  4751  			return err
  4752  		}
  4753  	}
  4754  
  4755  	// Upsert the optional eval
  4756  	if req.Eval != nil {
  4757  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  4758  			return err
  4759  		}
  4760  	}
  4761  
  4762  	return txn.Commit()
  4763  }
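
        // A hedged usage sketch for recording canary health (store, msgType,
        // index, and the alloc IDs are hypothetical):
        //
        //	req := &structs.ApplyDeploymentAllocHealthRequest{}
        //	req.DeploymentID = deploymentID
        //	req.HealthyAllocationIDs = []string{healthyID}
        //	req.UnhealthyAllocationIDs = []string{unhealthyID}
        //	req.Timestamp = time.Now()
        //	err := store.UpdateDeploymentAllocHealth(msgType, index, req)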
  4764  
  4765  // LatestIndex returns the greatest index value for all indexes.
  4766  func (s *StateStore) LatestIndex() (uint64, error) {
  4767  	indexes, err := s.Indexes()
  4768  	if err != nil {
  4769  		return 0, err
  4770  	}
  4771  
  4772  	var max uint64 = 0
  4773  	for {
  4774  		raw := indexes.Next()
  4775  		if raw == nil {
  4776  			break
  4777  		}
  4778  
  4779  		// Cast the raw value to an IndexEntry
  4780  		idx := raw.(*IndexEntry)
  4781  
  4782  		// Determine the max
  4783  		if idx.Value > max {
  4784  			max = idx.Value
  4785  		}
  4786  	}
  4787  
  4788  	return max, nil
  4789  }
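
        // A minimal sketch (illustrative only): read the high-water mark across
        // all tables, e.g. to seed the next Raft index.
        //
        //	idx, err := store.LatestIndex()
        //	if err == nil {
        //		fmt.Printf("state store is at index %d\n", idx)
        //	}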
  4790  
  4791  // Index finds the matching index value
  4792  func (s *StateStore) Index(name string) (uint64, error) {
  4793  	txn := s.db.ReadTxn()
  4794  
  4795  	// Lookup the first matching index
  4796  	out, err := txn.First("index", "id", name)
  4797  	if err != nil {
  4798  		return 0, err
  4799  	}
  4800  	if out == nil {
  4801  		return 0, nil
  4802  	}
  4803  	return out.(*IndexEntry).Value, nil
  4804  }
  4805  
  4806  // Indexes returns an iterator over all the indexes
  4807  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  4808  	txn := s.db.ReadTxn()
  4809  
  4810  	// Walk the entire index table
  4811  	iter, err := txn.Get("index", "id")
  4812  	if err != nil {
  4813  		return nil, err
  4814  	}
  4815  	return iter, nil
  4816  }
  4817  
  4818  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  4819  // store
  4820  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  4821  	txn := s.db.WriteTxn(index)
  4822  	defer txn.Abort()
  4823  
  4824  	// Get all the jobs
  4825  	iter, err := txn.Get("jobs", "id")
  4826  	if err != nil {
  4827  		return err
  4828  	}
  4829  	// COMPAT: Remove after 0.11
  4830  	// Iterate over jobs to build a list of parent jobs and their children
  4831  	parentMap := make(map[string][]*structs.Job)
  4832  	for {
  4833  		rawJob := iter.Next()
  4834  		if rawJob == nil {
  4835  			break
  4836  		}
  4837  		job := rawJob.(*structs.Job)
  4838  		if job.ParentID != "" {
  4839  			children := parentMap[job.ParentID]
  4840  			children = append(children, job)
  4841  			parentMap[job.ParentID] = children
  4842  		}
  4843  	}
  4844  
  4845  	// Get all the jobs again
  4846  	iter, err = txn.Get("jobs", "id")
  4847  	if err != nil {
  4848  		return err
  4849  	}
  4850  
  4851  	for {
  4852  		rawJob := iter.Next()
  4853  		if rawJob == nil {
  4854  			break
  4855  		}
  4856  		job := rawJob.(*structs.Job)
  4857  
  4858  		if job.IsParameterized() || job.IsPeriodic() {
  4859  			// COMPAT: Remove after 0.11
  4860  
  4861  			// The following block of code fixes incorrect child summaries due to a bug
  4862  			// See https://github.com/hashicorp/nomad/issues/3886 for details
  4863  			rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  4864  			if err != nil {
  4865  				return err
  4866  			}
  4867  			if rawSummary == nil {
  4868  				continue
  4869  			}
  4870  
  4871  			oldSummary := rawSummary.(*structs.JobSummary)
  4872  
  4873  			// Create an empty summary
  4874  			summary := &structs.JobSummary{
  4875  				JobID:     job.ID,
  4876  				Namespace: job.Namespace,
  4877  				Summary:   make(map[string]structs.TaskGroupSummary),
  4878  				Children:  &structs.JobChildrenSummary{},
  4879  			}
  4880  
  4881  			// Iterate over children of this job if any to fix summary counts
  4882  			children := parentMap[job.ID]
  4883  			for _, childJob := range children {
  4884  				switch childJob.Status {
  4885  				case structs.JobStatusPending:
  4886  					summary.Children.Pending++
  4887  				case structs.JobStatusDead:
  4888  					summary.Children.Dead++
  4889  				case structs.JobStatusRunning:
  4890  					summary.Children.Running++
  4891  				}
  4892  			}
  4893  
  4894  			// Insert the job summary if it's different
  4895  			if !reflect.DeepEqual(summary, oldSummary) {
  4896  				// Set the create index of the summary same as the job's create index
  4897  				// and the modify index to the current index
  4898  				summary.CreateIndex = job.CreateIndex
  4899  				summary.ModifyIndex = index
  4900  
  4901  				if err := txn.Insert("job_summary", summary); err != nil {
  4902  					return fmt.Errorf("error inserting job summary: %v", err)
  4903  				}
  4904  			}
  4905  
  4906  			// Done with handling a parent job, continue to next
  4907  			continue
  4908  		}
  4909  
  4910  		// Create a job summary for the job
  4911  		summary := &structs.JobSummary{
  4912  			JobID:     job.ID,
  4913  			Namespace: job.Namespace,
  4914  			Summary:   make(map[string]structs.TaskGroupSummary),
  4915  		}
  4916  		for _, tg := range job.TaskGroups {
  4917  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  4918  		}
  4919  
  4920  		// Find all the allocations for the job
  4921  		iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  4922  		if err != nil {
  4923  			return err
  4924  		}
  4925  
  4926  		// Calculate the summary for the job
  4927  		for {
  4928  			rawAlloc := iterAllocs.Next()
  4929  			if rawAlloc == nil {
  4930  				break
  4931  			}
  4932  			alloc := rawAlloc.(*structs.Allocation)
  4933  
  4934  			// Ignore the allocation if it doesn't belong to the currently
  4935  			// registered job. The allocation is checked because of issue #2304
  4936  			if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex {
  4937  				continue
  4938  			}
  4939  
  4940  			tg := summary.Summary[alloc.TaskGroup]
  4941  			switch alloc.ClientStatus {
  4942  			case structs.AllocClientStatusFailed:
  4943  				tg.Failed += 1
  4944  			case structs.AllocClientStatusLost:
  4945  				tg.Lost += 1
  4946  			case structs.AllocClientStatusUnknown:
  4947  				tg.Unknown += 1
  4948  			case structs.AllocClientStatusComplete:
  4949  				tg.Complete += 1
  4950  			case structs.AllocClientStatusRunning:
  4951  				tg.Running += 1
  4952  			case structs.AllocClientStatusPending:
  4953  				tg.Starting += 1
  4954  			default:
  4955  				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
  4956  			}
  4957  			summary.Summary[alloc.TaskGroup] = tg
  4958  		}
  4959  
  4960  		// Set the create index of the summary same as the job's create index
  4961  		// and the modify index to the current index
  4962  		summary.CreateIndex = job.CreateIndex
  4963  		summary.ModifyIndex = index
  4964  
  4965  		// Insert the job summary
  4966  		if err := txn.Insert("job_summary", summary); err != nil {
  4967  			return fmt.Errorf("error inserting job summary: %v", err)
  4968  		}
  4969  	}
  4970  
  4971  	// Update the indexes table for job summary
  4972  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4973  		return fmt.Errorf("index update failed: %v", err)
  4974  	}
  4975  	return txn.Commit()
  4976  }
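
        // A minimal sketch (illustrative only): rebuild every job summary at a
        // fresh index, as an operator-driven repair.
        //
        //	idx, _ := store.LatestIndex()
        //	err := store.ReconcileJobSummaries(idx + 1)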
  4977  
  4978  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
  4979  // It takes a map of job IDs to an optional forceStatus string. Jobs that no
  4980  // longer exist are skipped; an error is returned if setJobStatus fails.
  4981  func (s *StateStore) setJobStatuses(index uint64, txn *txn,
  4982  	jobs map[structs.NamespacedID]string, evalDelete bool) error {
  4983  	for tuple, forceStatus := range jobs {
  4984  
  4985  		existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID)
  4986  		if err != nil {
  4987  			return fmt.Errorf("job lookup failed: %v", err)
  4988  		}
  4989  
  4990  		if existing == nil {
  4991  			continue
  4992  		}
  4993  
  4994  		if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  4995  			return err
  4996  		}
  4997  
  4998  	}
  4999  
  5000  	return nil
  5001  }
  5002  
  5003  // setJobStatus sets the status of the job by looking up associated evaluations
  5004  // and allocations. evalDelete should be set to true if setJobStatus is being
  5005  // called because an evaluation is being deleted (potentially because of garbage
  5006  // collection). If forceStatus is non-empty, the job's status will be set to the
  5007  // passed status.
  5008  func (s *StateStore) setJobStatus(index uint64, txn *txn,
  5009  	job *structs.Job, evalDelete bool, forceStatus string) error {
  5010  
  5011  	// Capture the current status so we can check if there is a change
  5012  	oldStatus := job.Status
  5013  	newStatus := forceStatus
  5014  
  5015  	// If forceStatus is not set, compute the job's status.
  5016  	if forceStatus == "" {
  5017  		var err error
  5018  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  5019  		if err != nil {
  5020  			return err
  5021  		}
  5022  	}
  5023  
  5024  	// Fast-path if the job has not changed.
  5025  	if oldStatus == newStatus {
  5026  		return nil
  5027  	}
  5028  
  5029  	// Copy and update the existing job
  5030  	updated := job.Copy()
  5031  	updated.Status = newStatus
  5032  	updated.ModifyIndex = index
  5033  
  5034  	// Insert the job
  5035  	if err := txn.Insert("jobs", updated); err != nil {
  5036  		return fmt.Errorf("job insert failed: %v", err)
  5037  	}
  5038  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  5039  		return fmt.Errorf("index update failed: %v", err)
  5040  	}
  5041  
  5042  	// Update the children summary
  5043  	if err := s.setJobSummary(txn, updated, index, oldStatus, newStatus); err != nil {
  5044  		return fmt.Errorf("job summary update failed: %w", err)
  5045  	}
  5046  	return nil
  5047  }
  5048  
  5049  func (s *StateStore) setJobSummary(txn *txn, updated *structs.Job, index uint64, oldStatus, newStatus string) error {
  5050  	if updated.ParentID == "" {
  5051  		return nil
  5052  	}
  5053  
  5054  	// Try to update the summary of the parent job summary
  5055  	summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID)
  5056  	if err != nil {
  5057  		return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  5058  	}
  5059  
  5060  	// Only continue if the summary exists. It may not exist if the parent
  5061  	// job was removed
  5062  	if summaryRaw != nil {
  5063  		existing := summaryRaw.(*structs.JobSummary)
  5064  		pSummary := existing.Copy()
  5065  		if pSummary.Children == nil {
  5066  			pSummary.Children = new(structs.JobChildrenSummary)
  5067  		}
  5068  
  5069  		// Determine the transition and update the correct fields
  5070  		children := pSummary.Children
  5071  
  5072  		// Decrement old status
  5073  		if oldStatus != "" {
  5074  			switch oldStatus {
  5075  			case structs.JobStatusPending:
  5076  				children.Pending--
  5077  			case structs.JobStatusRunning:
  5078  				children.Running--
  5079  			case structs.JobStatusDead:
  5080  				children.Dead--
  5081  			default:
  5082  				return fmt.Errorf("unknown old job status %q", oldStatus)
  5083  			}
  5084  		}
  5085  
  5086  		// Increment new status
  5087  		switch newStatus {
  5088  		case structs.JobStatusPending:
  5089  			children.Pending++
  5090  		case structs.JobStatusRunning:
  5091  			children.Running++
  5092  		case structs.JobStatusDead:
  5093  			children.Dead++
  5094  		default:
  5095  			return fmt.Errorf("unknown new job status %q", newStatus)
  5096  		}
  5097  
  5098  		// Update the index
  5099  		pSummary.ModifyIndex = index
  5100  
  5101  		// Insert the summary
  5102  		if err := txn.Insert("job_summary", pSummary); err != nil {
  5103  			return fmt.Errorf("job summary insert failed: %v", err)
  5104  		}
  5105  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5106  			return fmt.Errorf("index update failed: %v", err)
  5107  		}
  5108  	}
  5109  	return nil
  5110  }
  5111  
  5112  func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) {
  5113  	// System, Periodic and Parameterized jobs are running until explicitly
  5114  	// stopped.
  5115  	if job.Type == structs.JobTypeSystem ||
  5116  		job.IsParameterized() ||
  5117  		job.IsPeriodic() {
  5118  		if job.Stop {
  5119  			return structs.JobStatusDead, nil
  5120  		}
  5121  		return structs.JobStatusRunning, nil
  5122  	}
  5123  
  5124  	allocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  5125  	if err != nil {
  5126  		return "", err
  5127  	}
  5128  
  5129  	// If there is a non-terminal allocation, the job is running.
  5130  	hasAlloc := false
  5131  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  5132  		hasAlloc = true
  5133  		if !alloc.(*structs.Allocation).TerminalStatus() {
  5134  			return structs.JobStatusRunning, nil
  5135  		}
  5136  	}
  5137  
  5138  	evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID)
  5139  	if err != nil {
  5140  		return "", err
  5141  	}
  5142  
  5143  	hasEval := false
  5144  	for raw := evals.Next(); raw != nil; raw = evals.Next() {
  5145  		e := raw.(*structs.Evaluation)
  5146  
  5147  		// Filter non-exact matches
  5148  		if e.JobID != job.ID {
  5149  			continue
  5150  		}
  5151  
  5152  		hasEval = true
  5153  		if !e.TerminalStatus() {
  5154  			return structs.JobStatusPending, nil
  5155  		}
  5156  	}
  5157  
  5158  	// The job is dead if all the allocations and evals are terminal or if there
  5159  	// are no evals because of garbage collection.
  5160  	if evalDelete || hasEval || hasAlloc {
  5161  		return structs.JobStatusDead, nil
  5162  	}
  5163  
  5164  	return structs.JobStatusPending, nil
  5165  }
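
        // In summary, setting evalDelete aside, the status computed above is:
        //
        //	any non-terminal allocation          -> running
        //	any non-terminal evaluation          -> pending
        //	only terminal allocs/evals remain    -> dead
        //	no allocs and no evals (new or GC'd) -> pending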
  5166  
  5167  // updateSummaryWithJob creates or updates job summaries when new jobs are
  5168  // upserted or existing ones are updated
  5169  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  5170  	txn *txn) error {
  5171  
  5172  	// Update the job summary
  5173  	summaryRaw, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  5174  	if err != nil {
  5175  		return fmt.Errorf("job summary lookup failed: %v", err)
  5176  	}
  5177  
  5178  	// Get the summary or create if necessary
  5179  	var summary *structs.JobSummary
  5180  	hasSummaryChanged := false
  5181  	if summaryRaw != nil {
  5182  		summary = summaryRaw.(*structs.JobSummary).Copy()
  5183  	} else {
  5184  		summary = &structs.JobSummary{
  5185  			JobID:       job.ID,
  5186  			Namespace:   job.Namespace,
  5187  			Summary:     make(map[string]structs.TaskGroupSummary),
  5188  			Children:    new(structs.JobChildrenSummary),
  5189  			CreateIndex: index,
  5190  		}
  5191  		hasSummaryChanged = true
  5192  	}
  5193  
  5194  	for _, tg := range job.TaskGroups {
  5195  		if _, ok := summary.Summary[tg.Name]; !ok {
  5196  			newSummary := structs.TaskGroupSummary{
  5197  				Complete: 0,
  5198  				Failed:   0,
  5199  				Running:  0,
  5200  				Starting: 0,
  5201  			}
  5202  			summary.Summary[tg.Name] = newSummary
  5203  			hasSummaryChanged = true
  5204  		}
  5205  	}
  5206  
  5207  	// The job summary has changed, so update the modify index.
  5208  	if hasSummaryChanged {
  5209  		summary.ModifyIndex = index
  5210  
  5211  		// Update the indexes table for job summary
  5212  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5213  			return fmt.Errorf("index update failed: %v", err)
  5214  		}
  5215  		if err := txn.Insert("job_summary", summary); err != nil {
  5216  			return err
  5217  		}
  5218  	}
  5219  
  5220  	return nil
  5221  }
  5222  
  5223  // updateJobScalingPolicies upserts any scaling policies contained in the job and removes
  5224  // any previous scaling policies that were removed from the job
  5225  func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {
  5226  
  5227  	ws := memdb.NewWatchSet()
  5228  
  5229  	scalingPolicies := job.GetScalingPolicies()
  5230  	newTargets := map[string]bool{}
  5231  	for _, p := range scalingPolicies {
  5232  		newTargets[p.JobKey()] = true
  5233  	}
  5234  	// find existing policies that need to be deleted
  5235  	deletedPolicies := []string{}
  5236  	iter, err := s.ScalingPoliciesByJobTxn(ws, job.Namespace, job.ID, txn)
  5237  	if err != nil {
  5238  		return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err)
  5239  	}
  5240  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  5241  		oldPolicy := raw.(*structs.ScalingPolicy)
  5242  		if !newTargets[oldPolicy.JobKey()] {
  5243  			deletedPolicies = append(deletedPolicies, oldPolicy.ID)
  5244  		}
  5245  	}
  5246  	err = s.DeleteScalingPoliciesTxn(index, deletedPolicies, txn)
  5247  	if err != nil {
  5248  		return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err)
  5249  	}
  5250  
  5251  	err = s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn)
  5252  	if err != nil {
  5253  		return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err)
  5254  	}
  5255  
  5256  	return nil
  5257  }
  5258  
  5259  // updateJobCSIPlugins runs on job update and indexes the job in each CSI plugin it uses
  5260  func (s *StateStore) updateJobCSIPlugins(index uint64, job, prev *structs.Job, txn *txn) error {
  5261  	plugIns := make(map[string]*structs.CSIPlugin)
  5262  
  5263  	upsertFn := func(job *structs.Job, delete bool) error {
  5264  		for _, tg := range job.TaskGroups {
  5265  			for _, t := range tg.Tasks {
  5266  				if t.CSIPluginConfig == nil {
  5267  					continue
  5268  				}
  5269  
  5270  				plugIn, ok := plugIns[t.CSIPluginConfig.ID]
  5271  				if !ok {
  5272  					p, err := s.CSIPluginByIDTxn(txn, nil, t.CSIPluginConfig.ID)
  5273  					if err != nil {
  5274  						return err
  5275  					}
  5276  					if p == nil {
  5277  						plugIn = structs.NewCSIPlugin(t.CSIPluginConfig.ID, index)
  5278  					} else {
  5279  						plugIn = p.Copy()
  5280  						plugIn.ModifyIndex = index
  5281  					}
  5282  					plugIns[plugIn.ID] = plugIn
  5283  				}
  5284  
  5285  				if delete {
  5286  					plugIn.DeleteJob(job, nil)
  5287  				} else {
  5288  					plugIn.AddJob(job, nil)
  5289  				}
  5290  			}
  5291  		}
  5292  
  5293  		return nil
  5294  	}
  5295  
  5296  	if prev != nil {
  5297  		err := upsertFn(prev, true)
  5298  		if err != nil {
  5299  			return err
  5300  		}
  5301  	}
  5302  
  5303  	err := upsertFn(job, false)
  5304  	if err != nil {
  5305  		return err
  5306  	}
  5307  
  5308  	for _, plugIn := range plugIns {
  5309  		err = txn.Insert("csi_plugins", plugIn)
  5310  		if err != nil {
  5311  			return fmt.Errorf("csi_plugins insert error: %v", err)
  5312  		}
  5313  	}
  5314  
  5315  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  5316  		return fmt.Errorf("index update failed: %v", err)
  5317  	}
  5318  
  5319  	return nil
  5320  }
  5321  
  5322  // updateDeploymentWithAlloc is used to update the deployment state associated
  5323  // with the given allocation. The passed alloc may be updated if the deployment
  5324  // status has changed to capture the modify index at which it has changed.
  5325  func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *txn) error {
  5326  	// Nothing to do if the allocation is not associated with a deployment
  5327  	if alloc.DeploymentID == "" {
  5328  		return nil
  5329  	}
  5330  
  5331  	// Get the deployment
  5332  	ws := memdb.NewWatchSet()
  5333  	deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn)
  5334  	if err != nil {
  5335  		return err
  5336  	}
  5337  	if deployment == nil {
  5338  		return nil
  5339  	}
  5340  
  5341  	// Check whether the task group is part of the deployment
  5342  	_, ok := deployment.TaskGroups[alloc.TaskGroup]
  5343  	if !ok {
  5344  		// If the task group isn't part of the deployment, it wasn't part of a
  5345  		// rolling update, so there is nothing to do
  5346  		return nil
  5347  	}
  5348  
  5349  	// Do not modify in-place. Instead keep track of what must be done
  5350  	placed := 0
  5351  	healthy := 0
  5352  	unhealthy := 0
  5353  
  5354  	// If there was no existing allocation, this is a placement, so we
  5355  	// increment the placed count
  5356  	existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth()
  5357  	allocHealthSet := alloc.DeploymentStatus.HasHealth()
  5358  	if existing == nil || existing.DeploymentID != alloc.DeploymentID {
  5359  		placed++
  5360  	} else if !existingHealthSet && allocHealthSet {
  5361  		if *alloc.DeploymentStatus.Healthy {
  5362  			healthy++
  5363  		} else {
  5364  			unhealthy++
  5365  		}
  5366  	} else if existingHealthSet && allocHealthSet {
  5367  		// See if it has gone from healthy to unhealthy
  5368  		if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy {
  5369  			healthy--
  5370  			unhealthy++
  5371  		}
  5372  	}
  5373  
  5374  	// Nothing to do
  5375  	if placed == 0 && healthy == 0 && unhealthy == 0 {
  5376  		return nil
  5377  	}
  5378  
  5379  	// Update the allocation's deployment status modify index
  5380  	if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 {
  5381  		alloc.DeploymentStatus.ModifyIndex = index
  5382  	}
  5383  
  5384  	// Create a copy of the deployment object
  5385  	deploymentCopy := deployment.Copy()
  5386  	deploymentCopy.ModifyIndex = index
  5387  
  5388  	dstate := deploymentCopy.TaskGroups[alloc.TaskGroup]
  5389  	dstate.PlacedAllocs += placed
  5390  	dstate.HealthyAllocs += healthy
  5391  	dstate.UnhealthyAllocs += unhealthy
  5392  
  5393  	// Ensure PlacedCanaries accurately reflects the alloc canary status
  5394  	if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
  5395  		found := false
  5396  		for _, canary := range dstate.PlacedCanaries {
  5397  			if alloc.ID == canary {
  5398  				found = true
  5399  				break
  5400  			}
  5401  		}
  5402  		if !found {
  5403  			dstate.PlacedCanaries = append(dstate.PlacedCanaries, alloc.ID)
  5404  		}
  5405  	}
  5406  
  5407  	// Update the progress deadline
  5408  	if pd := dstate.ProgressDeadline; pd != 0 {
  5409  		// If we are the first placed allocation for the deployment start the progress deadline.
  5410  		if placed != 0 && dstate.RequireProgressBy.IsZero() {
  5411  			// Use modify time instead of create time because we may in-place
  5412  			// update the allocation to be part of a new deployment.
  5413  			dstate.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd)
  5414  		} else if healthy != 0 {
  5415  			if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(dstate.RequireProgressBy) {
  5416  				dstate.RequireProgressBy = d
  5417  			}
  5418  		}
  5419  	}
  5420  
  5421  	// Upsert the deployment
  5422  	if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil {
  5423  		return err
  5424  	}
  5425  
  5426  	return nil
  5427  }
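
        // In summary, each alloc update moves the deployment counters as follows:
        //
        //	new placement (no prior alloc, or new deployment ID) -> PlacedAllocs++
        //	health set for the first time                        -> HealthyAllocs++ or UnhealthyAllocs++
        //	healthy flipped to unhealthy                         -> HealthyAllocs--, UnhealthyAllocs++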
  5428  
  5429  // updateSummaryWithAlloc updates the job summary when allocations are updated
  5430  // or inserted
  5431  func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
  5432  	existingAlloc *structs.Allocation, txn *txn) error {
  5433  
  5434  	// We don't have to update the summary if the job is missing
  5435  	if alloc.Job == nil {
  5436  		return nil
  5437  	}
  5438  
  5439  	summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID)
  5440  	if err != nil {
  5441  		return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err)
  5442  	}
  5443  
  5444  	if summaryRaw == nil {
  5445  		// Check if the job is de-registered
  5446  		rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID)
  5447  		if err != nil {
  5448  			return fmt.Errorf("unable to query job: %v", err)
  5449  		}
  5450  
  5451  		// If the job is de-registered then we skip updating its summary
  5452  		if rawJob == nil {
  5453  			return nil
  5454  		}
  5455  
  5456  		return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace)
  5457  	}
  5458  
  5459  	// Get a copy of the existing summary
  5460  	jobSummary := summaryRaw.(*structs.JobSummary).Copy()
  5461  
  5462  	// Not updating the job summary because the allocation doesn't belong to the
  5463  	// currently registered job
  5464  	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
  5465  		return nil
  5466  	}
  5467  
  5468  	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
  5469  	if !ok {
  5470  		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
  5471  	}
  5472  
  5473  	summaryChanged := false
  5474  	if existingAlloc == nil {
  5475  		switch alloc.DesiredStatus {
  5476  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
  5477  			s.logger.Error("new allocation inserted into state store with bad desired status",
  5478  				"alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
  5479  		}
  5480  		switch alloc.ClientStatus {
  5481  		case structs.AllocClientStatusPending:
  5482  			tgSummary.Starting += 1
  5483  			if tgSummary.Queued > 0 {
  5484  				tgSummary.Queued -= 1
  5485  			}
  5486  			summaryChanged = true
  5487  		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
  5488  			structs.AllocClientStatusComplete:
  5489  			s.logger.Error("new allocation inserted into state store with bad client status",
  5490  				"alloc_id", alloc.ID, "client_status", alloc.ClientStatus)
  5491  		}
  5492  	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
  5493  		// Incrementing the count of the bin of the current state
  5494  		switch alloc.ClientStatus {
  5495  		case structs.AllocClientStatusRunning:
  5496  			tgSummary.Running += 1
  5497  		case structs.AllocClientStatusFailed:
  5498  			tgSummary.Failed += 1
  5499  		case structs.AllocClientStatusPending:
  5500  			tgSummary.Starting += 1
  5501  		case structs.AllocClientStatusComplete:
  5502  			tgSummary.Complete += 1
  5503  		case structs.AllocClientStatusLost:
  5504  			tgSummary.Lost += 1
  5505  		case structs.AllocClientStatusUnknown:
  5506  			tgSummary.Unknown += 1
  5507  		}
  5508  
  5509  		// Decrementing the count of the bin of the last state
  5510  		switch existingAlloc.ClientStatus {
  5511  		case structs.AllocClientStatusRunning:
  5512  			if tgSummary.Running > 0 {
  5513  				tgSummary.Running -= 1
  5514  			}
  5515  		case structs.AllocClientStatusPending:
  5516  			if tgSummary.Starting > 0 {
  5517  				tgSummary.Starting -= 1
  5518  			}
  5519  		case structs.AllocClientStatusLost:
  5520  			if tgSummary.Lost > 0 {
  5521  				tgSummary.Lost -= 1
  5522  			}
  5523  		case structs.AllocClientStatusUnknown:
  5524  			if tgSummary.Unknown > 0 {
  5525  				tgSummary.Unknown -= 1
  5526  			}
  5527  		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
  5528  		default:
  5529  			s.logger.Error("invalid old client status for allocation",
  5530  				"alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus)
  5531  		}
  5532  		summaryChanged = true
  5533  	}
  5534  	jobSummary.Summary[alloc.TaskGroup] = tgSummary
  5535  
  5536  	if summaryChanged {
  5537  		jobSummary.ModifyIndex = index
  5538  
  5539  		if err := s.updatePluginWithJobSummary(index, jobSummary, alloc, txn); err != nil {
        			return fmt.Errorf("error updating CSI plugins with job summary: %v", err)
        		}
  5540  
  5541  		// Update the indexes table for job summary
  5542  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  5543  			return fmt.Errorf("index update failed: %v", err)
  5544  		}
  5545  
  5546  		if err := txn.Insert("job_summary", jobSummary); err != nil {
  5547  			return fmt.Errorf("updating job summary failed: %v", err)
  5548  		}
  5549  	}
  5550  
  5551  	return nil
  5552  }
  5553  
  5554  // updatePluginForTerminalAlloc updates the CSI plugins for an alloc when the
  5555  // allocation is updated or inserted with a terminal server status.
  5556  func (s *StateStore) updatePluginForTerminalAlloc(index uint64, alloc *structs.Allocation,
  5557  	txn *txn) error {
  5558  
  5559  	if !alloc.ServerTerminalStatus() {
  5560  		return nil
  5561  	}
  5562  
  5563  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  5564  	for _, t := range tg.Tasks {
  5565  		if t.CSIPluginConfig != nil {
  5566  			pluginID := t.CSIPluginConfig.ID
  5567  			plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID)
  5568  			if err != nil {
  5569  				return err
  5570  			}
  5571  			if plug == nil {
  5572  				// plugin may not have been created because it never
  5573  				// became healthy, just move on
  5574  				continue
  5575  			}
  5576  			plug = plug.Copy()
  5577  			err = plug.DeleteAlloc(alloc.ID, alloc.NodeID)
  5578  			if err != nil {
  5579  				return err
  5580  			}
  5581  			err = updateOrGCPlugin(index, txn, plug)
  5582  			if err != nil {
  5583  				return err
  5584  			}
  5585  		}
  5586  	}
  5587  
  5588  	return nil
  5589  }
  5590  
  5591  // updatePluginWithJobSummary updates the CSI plugins for a job when the
  5592  // job summary is updated by an alloc
  5593  func (s *StateStore) updatePluginWithJobSummary(index uint64, summary *structs.JobSummary, alloc *structs.Allocation,
  5594  	txn *txn) error {
  5595  
  5596  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  5597  	if tg == nil {
  5598  		return nil
  5599  	}
  5600  
  5601  	for _, t := range tg.Tasks {
  5602  		if t.CSIPluginConfig != nil {
  5603  			pluginID := t.CSIPluginConfig.ID
  5604  			plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID)
  5605  			if err != nil {
  5606  				return err
  5607  			}
  5608  			if plug == nil {
  5609  				plug = structs.NewCSIPlugin(pluginID, index)
  5610  			} else {
  5611  				plug = plug.Copy()
  5612  			}
  5613  
  5614  			plug.UpdateExpectedWithJob(alloc.Job, summary,
  5615  				alloc.Job.Status == structs.JobStatusDead)
  5616  
  5617  			err = updateOrGCPlugin(index, txn, plug)
  5618  			if err != nil {
  5619  				return err
  5620  			}
  5621  		}
  5622  	}
  5623  
  5624  	return nil
  5625  }
  5626  
  5627  // UpsertACLPolicies is used to create or update a set of ACL policies
  5628  func (s *StateStore) UpsertACLPolicies(msgType structs.MessageType, index uint64, policies []*structs.ACLPolicy) error {
  5629  	txn := s.db.WriteTxnMsgT(msgType, index)
  5630  	defer txn.Abort()
  5631  
  5632  	for _, policy := range policies {
  5633  		// Ensure the policy hash is non-nil. This should be done outside the state store
  5634  		// for performance reasons, but we check here for defense in depth.
  5635  		if len(policy.Hash) == 0 {
  5636  			policy.SetHash()
  5637  		}
  5638  
  5639  		// Check if the policy already exists
  5640  		existing, err := txn.First("acl_policy", "id", policy.Name)
  5641  		if err != nil {
  5642  			return fmt.Errorf("policy lookup failed: %v", err)
  5643  		}
  5644  
  5645  		// Update all the indexes
  5646  		if existing != nil {
  5647  			policy.CreateIndex = existing.(*structs.ACLPolicy).CreateIndex
  5648  			policy.ModifyIndex = index
  5649  		} else {
  5650  			policy.CreateIndex = index
  5651  			policy.ModifyIndex = index
  5652  		}
  5653  
  5654  		// Update the policy
  5655  		if err := txn.Insert("acl_policy", policy); err != nil {
  5656  			return fmt.Errorf("upserting policy failed: %v", err)
  5657  		}
  5658  	}
  5659  
  5660  	// Update the indexes table
  5661  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  5662  		return fmt.Errorf("index update failed: %v", err)
  5663  	}
  5664  
  5665  	return txn.Commit()
  5666  }
  5667  
  5668  // DeleteACLPolicies deletes the policies with the given names
  5669  func (s *StateStore) DeleteACLPolicies(msgType structs.MessageType, index uint64, names []string) error {
  5670  	txn := s.db.WriteTxnMsgT(msgType, index)
  5671  	defer txn.Abort()
  5672  
  5673  	// Delete the policies
  5674  	for _, name := range names {
  5675  		if _, err := txn.DeleteAll("acl_policy", "id", name); err != nil {
  5676  			return fmt.Errorf("deleting acl policy failed: %v", err)
  5677  		}
  5678  	}
  5679  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  5680  		return fmt.Errorf("index update failed: %v", err)
  5681  	}
  5682  	return txn.Commit()
  5683  }
  5684  
  5685  // ACLPolicyByName is used to lookup a policy by name
  5686  func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
  5687  	txn := s.db.ReadTxn()
  5688  
  5689  	watchCh, existing, err := txn.FirstWatch("acl_policy", "id", name)
  5690  	if err != nil {
  5691  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  5692  	}
  5693  	ws.Add(watchCh)
  5694  
  5695  	if existing != nil {
  5696  		return existing.(*structs.ACLPolicy), nil
  5697  	}
  5698  	return nil, nil
  5699  }
  5700  
  5701  // ACLPolicyByNamePrefix is used to lookup policies by prefix
  5702  func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  5703  	txn := s.db.ReadTxn()
  5704  
  5705  	iter, err := txn.Get("acl_policy", "id_prefix", prefix)
  5706  	if err != nil {
  5707  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  5708  	}
  5709  	ws.Add(iter.WatchCh())
  5710  
  5711  	return iter, nil
  5712  }
  5713  
  5714  // ACLPolicyByJob is used to lookup policies that have been attached to a
  5715  // specific job
  5716  func (s *StateStore) ACLPolicyByJob(ws memdb.WatchSet, ns, jobID string) (memdb.ResultIterator, error) {
  5717  	txn := s.db.ReadTxn()
  5718  
  5719  	iter, err := txn.Get("acl_policy", "job_prefix", ns, jobID)
  5720  	if err != nil {
  5721  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  5722  	}
  5723  	ws.Add(iter.WatchCh())
  5724  
  5725  	return iter, nil
  5726  }
  5727  
  5728  // ACLPolicies returns an iterator over all the acl policies
  5729  func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  5730  	txn := s.db.ReadTxn()
  5731  
  5732  	// Walk the entire table
  5733  	iter, err := txn.Get("acl_policy", "id")
  5734  	if err != nil {
  5735  		return nil, err
  5736  	}
  5737  	ws.Add(iter.WatchCh())
  5738  	return iter, nil
  5739  }
  5740  
  5741  // UpsertACLTokens is used to create or update a set of ACL tokens
  5742  func (s *StateStore) UpsertACLTokens(msgType structs.MessageType, index uint64, tokens []*structs.ACLToken) error {
  5743  	txn := s.db.WriteTxnMsgT(msgType, index)
  5744  	defer txn.Abort()
  5745  
  5746  	for _, token := range tokens {
  5747  		// Ensure the token hash is non-nil. This should be done outside the state store
  5748  		// for performance reasons, but we check here for defense in depth.
  5749  		if len(token.Hash) == 0 {
  5750  			token.SetHash()
  5751  		}
  5752  
  5753  		// Check if the token already exists
  5754  		existing, err := txn.First("acl_token", "id", token.AccessorID)
  5755  		if err != nil {
  5756  			return fmt.Errorf("token lookup failed: %v", err)
  5757  		}
  5758  
  5759  		// Update all the indexes
  5760  		if existing != nil {
  5761  			existTK := existing.(*structs.ACLToken)
  5762  			token.CreateIndex = existTK.CreateIndex
  5763  			token.ModifyIndex = index
  5764  
  5765  			// Do not allow SecretID or create time to change
  5766  			token.SecretID = existTK.SecretID
  5767  			token.CreateTime = existTK.CreateTime
  5768  
  5769  		} else {
  5770  			token.CreateIndex = index
  5771  			token.ModifyIndex = index
  5772  		}
  5773  
  5774  		// Update the token
  5775  		if err := txn.Insert("acl_token", token); err != nil {
  5776  			return fmt.Errorf("upserting token failed: %v", err)
  5777  		}
  5778  	}
  5779  
  5780  	// Update the indexes table
  5781  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  5782  		return fmt.Errorf("index update failed: %v", err)
  5783  	}
  5784  	return txn.Commit()
  5785  }
  5786  
  5787  // DeleteACLTokens deletes the tokens with the given accessor ids
  5788  func (s *StateStore) DeleteACLTokens(msgType structs.MessageType, index uint64, ids []string) error {
  5789  	txn := s.db.WriteTxnMsgT(msgType, index)
  5790  	defer txn.Abort()
  5791  
  5792  	// Delete the tokens
  5793  	for _, id := range ids {
  5794  		if _, err := txn.DeleteAll("acl_token", "id", id); err != nil {
  5795  			return fmt.Errorf("deleting acl token failed: %v", err)
  5796  		}
  5797  	}
  5798  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  5799  		return fmt.Errorf("index update failed: %v", err)
  5800  	}
  5801  	return txn.Commit()
  5802  }
  5803  
  5804  // ACLTokenByAccessorID is used to lookup a token by accessor ID
  5805  func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
  5806  	if id == "" {
  5807  		return nil, fmt.Errorf("acl token lookup failed: missing accessor id")
  5808  	}
  5809  
  5810  	txn := s.db.ReadTxn()
  5811  
  5812  	watchCh, existing, err := txn.FirstWatch("acl_token", "id", id)
  5813  	if err != nil {
  5814  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  5815  	}
  5816  	ws.Add(watchCh)
  5817  
  5818  	// If the existing token is nil, this indicates it does not exist in state.
  5819  	if existing == nil {
  5820  		return nil, nil
  5821  	}
  5822  
  5823  	// Assert the token type, which allows us to perform additional work on
  5824  	// the token before returning it.
  5825  	token := existing.(*structs.ACLToken)
  5826  
  5827  	// Handle potential staleness of ACL role links.
  5828  	if token, err = s.fixTokenRoleLinks(txn, token); err != nil {
  5829  		return nil, err
  5830  	}
  5831  	return token, nil
  5832  }
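
        // A minimal blocking-query sketch (accessorID is hypothetical): the watch
        // channel registered on ws fires when the token changes.
        //
        //	ws := memdb.NewWatchSet()
        //	token, err := store.ACLTokenByAccessorID(ws, accessorID)
        //	if err == nil && token == nil {
        //		// token does not exist (yet); ws still watches the slot
        //	}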
  5833  
  5834  // ACLTokenBySecretID is used to lookup a token by secret ID
  5835  func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) {
  5836  	if secretID == "" {
  5837  		return nil, fmt.Errorf("acl token lookup failed: missing secret id")
  5838  	}
  5839  
  5840  	txn := s.db.ReadTxn()
  5841  
  5842  	watchCh, existing, err := txn.FirstWatch("acl_token", "secret", secretID)
  5843  	if err != nil {
  5844  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  5845  	}
  5846  	ws.Add(watchCh)
  5847  
  5848  	// If the existing token is nil, this indicates it does not exist in state.
  5849  	if existing == nil {
  5850  		return nil, nil
  5851  	}
  5852  
  5853  	// Assert the token type, which allows us to perform additional work on
  5854  	// the token before returning it.
  5855  	token := existing.(*structs.ACLToken)
  5856  
  5857  	// Handle potential staleness of ACL role links.
  5858  	if token, err = s.fixTokenRoleLinks(txn, token); err != nil {
  5859  		return nil, err
  5860  	}
  5861  	return token, nil
  5862  }
  5863  
  5864  // ACLTokenByAccessorIDPrefix is used to lookup tokens by prefix
  5865  func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string, sort SortOption) (memdb.ResultIterator, error) {
  5866  	txn := s.db.ReadTxn()
  5867  
  5868  	var iter memdb.ResultIterator
  5869  	var err error
  5870  
  5871  	switch sort {
  5872  	case SortReverse:
  5873  		iter, err = txn.GetReverse("acl_token", "id_prefix", prefix)
  5874  	default:
  5875  		iter, err = txn.Get("acl_token", "id_prefix", prefix)
  5876  	}
  5877  	if err != nil {
  5878  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  5879  	}
  5880  
  5881  	ws.Add(iter.WatchCh())
  5882  	return iter, nil
  5883  }
  5884  
  5885  // ACLTokens returns an iterator over all the tokens
  5886  func (s *StateStore) ACLTokens(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
  5887  	txn := s.db.ReadTxn()
  5888  
  5889  	var iter memdb.ResultIterator
  5890  	var err error
  5891  
  5892  	switch sort {
  5893  	case SortReverse:
  5894  		iter, err = txn.GetReverse("acl_token", "create")
  5895  	default:
  5896  		iter, err = txn.Get("acl_token", "create")
  5897  	}
  5898  	if err != nil {
  5899  		return nil, err
  5900  	}
  5901  
  5902  	ws.Add(iter.WatchCh())
  5903  	return iter, nil
  5904  }
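
        // A usage sketch listing tokens newest-first (assuming the "create" index
        // orders tokens by create time):
        //
        //	iter, err := store.ACLTokens(memdb.NewWatchSet(), SortReverse)
        //	if err == nil {
        //		for raw := iter.Next(); raw != nil; raw = iter.Next() {
        //			token := raw.(*structs.ACLToken)
        //			_ = token
        //		}
        //	}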
  5905  
  5906  // ACLTokensByGlobal returns an iterator over all the tokens filtered by global value
  5907  func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool, sort SortOption) (memdb.ResultIterator, error) {
  5908  	txn := s.db.ReadTxn()
  5909  
  5910  	var iter memdb.ResultIterator
  5911  	var err error
  5912  
  5913  	// Walk the entire table
  5914  	switch sort {
  5915  	case SortReverse:
  5916  		iter, err = txn.GetReverse("acl_token", "global", globalVal)
  5917  	default:
  5918  		iter, err = txn.Get("acl_token", "global", globalVal)
  5919  	}
  5920  	if err != nil {
  5921  		return nil, err
  5922  	}
  5923  
  5924  	ws.Add(iter.WatchCh())
  5925  	return iter, nil
  5926  }
  5927  
  5928  // CanBootstrapACLToken checks if bootstrapping is possible and returns the reset index
  5929  func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) {
  5930  	txn := s.db.ReadTxn()
  5931  
  5932  	// Lookup the bootstrap sentinel
  5933  	out, err := txn.First("index", "id", "acl_token_bootstrap")
  5934  	if err != nil {
  5935  		return false, 0, err
  5936  	}
  5937  
  5938  	// No entry, we haven't bootstrapped yet
  5939  	if out == nil {
  5940  		return true, 0, nil
  5941  	}
  5942  
  5943  	// Return the reset index if we've already bootstrapped
  5944  	return false, out.(*IndexEntry).Value, nil
  5945  }
  5946  
  5947  // BootstrapACLTokens is used to create an initial ACL token.
  5948  func (s *StateStore) BootstrapACLTokens(msgType structs.MessageType, index uint64, resetIndex uint64, token *structs.ACLToken) error {
  5949  	txn := s.db.WriteTxnMsgT(msgType, index)
  5950  	defer txn.Abort()
  5951  
  5952  	// Check if we have already done a bootstrap
  5953  	existing, err := txn.First("index", "id", "acl_token_bootstrap")
  5954  	if err != nil {
  5955  		return fmt.Errorf("bootstrap check failed: %v", err)
  5956  	}
  5957  	if existing != nil {
  5958  		if resetIndex == 0 {
  5959  			return fmt.Errorf("ACL bootstrap already done")
  5960  		} else if resetIndex != existing.(*IndexEntry).Value {
  5961  			return fmt.Errorf("Invalid reset index for ACL bootstrap")
  5962  		}
  5963  	}
  5964  
  5965  	// Update the Create/Modify time
  5966  	token.CreateIndex = index
  5967  	token.ModifyIndex = index
  5968  
  5969  	// Insert the token
  5970  	if err := txn.Insert("acl_token", token); err != nil {
  5971  		return fmt.Errorf("upserting token failed: %v", err)
  5972  	}
  5973  
  5974  	// Update the indexes table, prevents future bootstrap until reset
  5975  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  5976  		return fmt.Errorf("index update failed: %v", err)
  5977  	}
  5978  	if err := txn.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil {
  5979  		return fmt.Errorf("index update failed: %v", err)
  5980  	}
  5981  	return txn.Commit()
  5982  }
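
        // A hedged sketch of the bootstrap flow built from the two calls above
        // (msgType, index, and rootToken are hypothetical):
        //
        //	ok, resetIdx, err := store.CanBootstrapACLToken()
        //	if err == nil && ok {
        //		err = store.BootstrapACLTokens(msgType, index, 0, rootToken)
        //	} else if err == nil {
        //		// re-bootstrap requires the published reset index
        //		err = store.BootstrapACLTokens(msgType, index, resetIdx, rootToken)
        //	}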
  5983  
  5984  // UpsertOneTimeToken is used to create or update a one-time token.
  5985  // Validating that we're not upserting an already-expired token is made the
  5986  // responsibility of the caller to facilitate testing.
  5987  func (s *StateStore) UpsertOneTimeToken(msgType structs.MessageType, index uint64, token *structs.OneTimeToken) error {
  5988  	txn := s.db.WriteTxnMsgT(msgType, index)
  5989  	defer txn.Abort()
  5990  
  5991  	// we expect the RPC call to set the ExpiresAt
  5992  	if token.ExpiresAt.IsZero() {
  5993  		return fmt.Errorf("one-time token must have an ExpiresAt time")
  5994  	}
  5995  
  5996  	// Update all the indexes
  5997  	token.CreateIndex = index
  5998  	token.ModifyIndex = index
  5999  
  6000  	// Create the token
  6001  	if err := txn.Insert("one_time_token", token); err != nil {
  6002  		return fmt.Errorf("upserting one-time token failed: %v", err)
  6003  	}
  6004  
  6005  	// Update the indexes table
  6006  	if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6007  		return fmt.Errorf("index update failed: %v", err)
  6008  	}
  6009  	return txn.Commit()
  6010  }
  6011  
  6012  // DeleteOneTimeTokens deletes the tokens with the given ACLToken Accessor IDs
  6013  func (s *StateStore) DeleteOneTimeTokens(msgType structs.MessageType, index uint64, ids []string) error {
  6014  	txn := s.db.WriteTxnMsgT(msgType, index)
  6015  	defer txn.Abort()
  6016  
  6017  	var deleted int
  6018  	for _, id := range ids {
  6019  		d, err := txn.DeleteAll("one_time_token", "id", id)
  6020  		if err != nil {
  6021  			return fmt.Errorf("deleting one-time token failed: %v", err)
  6022  		}
  6023  		deleted += d
  6024  	}
  6025  
  6026  	if deleted > 0 {
  6027  		if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6028  			return fmt.Errorf("index update failed: %v", err)
  6029  		}
  6030  	}
  6031  	return txn.Commit()
  6032  }
  6033  
  6034  // ExpireOneTimeTokens deletes tokens that have expired
  6035  func (s *StateStore) ExpireOneTimeTokens(msgType structs.MessageType, index uint64, timestamp time.Time) error {
  6036  	txn := s.db.WriteTxnMsgT(msgType, index)
  6037  	defer txn.Abort()
  6038  
  6039  	iter, err := s.oneTimeTokensExpiredTxn(txn, nil, timestamp)
  6040  	if err != nil {
  6041  		return err
  6042  	}
  6043  
  6044  	var deleted int
  6045  	for {
  6046  		raw := iter.Next()
  6047  		if raw == nil {
  6048  			break
  6049  		}
  6050  		ott, ok := raw.(*structs.OneTimeToken)
  6051  		if !ok || ott == nil {
  6052  			return fmt.Errorf("could not decode one-time token")
  6053  		}
  6054  		d, err := txn.DeleteAll("one_time_token", "secret", ott.OneTimeSecretID)
  6055  		if err != nil {
  6056  			return fmt.Errorf("deleting one-time token failed: %v", err)
  6057  		}
  6058  		deleted += d
  6059  	}
  6060  
  6061  	if deleted > 0 {
  6062  		if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil {
  6063  			return fmt.Errorf("index update failed: %v", err)
  6064  		}
  6065  	}
  6066  	return txn.Commit()
  6067  }
  6068  
  6069  // oneTimeTokensExpiredTxn returns an iterator over all expired one-time tokens
  6070  func (s *StateStore) oneTimeTokensExpiredTxn(txn *txn, ws memdb.WatchSet, timestamp time.Time) (memdb.ResultIterator, error) {
  6071  	iter, err := txn.Get("one_time_token", "id")
  6072  	if err != nil {
  6073  		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
  6074  	}
  6075  
  6076  	ws.Add(iter.WatchCh())
  6077  	iter = memdb.NewFilterIterator(iter, expiredOneTimeTokenFilter(timestamp))
  6078  	return iter, nil
  6079  }
  6080  
  6081  // OneTimeTokenBySecret is used to lookup a token by secret
  6082  func (s *StateStore) OneTimeTokenBySecret(ws memdb.WatchSet, secret string) (*structs.OneTimeToken, error) {
  6083  	if secret == "" {
  6084  		return nil, fmt.Errorf("one-time token lookup failed: missing secret")
  6085  	}
  6086  
  6087  	txn := s.db.ReadTxn()
  6088  
  6089  	watchCh, existing, err := txn.FirstWatch("one_time_token", "secret", secret)
  6090  	if err != nil {
  6091  		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
  6092  	}
  6093  	ws.Add(watchCh)
  6094  
  6095  	if existing != nil {
  6096  		return existing.(*structs.OneTimeToken), nil
  6097  	}
  6098  	return nil, nil
  6099  }
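
        // A hedged sketch of redeeming a one-time token (secret is hypothetical,
        // and the AccessorID field linking back to the full ACL token is assumed):
        //
        //	ott, err := store.OneTimeTokenBySecret(memdb.NewWatchSet(), secret)
        //	if err == nil && ott != nil && ott.ExpiresAt.After(time.Now()) {
        //		full, _ := store.ACLTokenByAccessorID(memdb.NewWatchSet(), ott.AccessorID)
        //		_ = full
        //	}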
  6100  
  6101  // expiredOneTimeTokenFilter returns a memdb filter function that filters out
  6102  // unexpired one-time tokens, so a filtered iterator yields only expired ones
  6103  func expiredOneTimeTokenFilter(now time.Time) func(interface{}) bool {
  6104  	return func(raw interface{}) bool {
  6105  		ott, ok := raw.(*structs.OneTimeToken)
  6106  		if !ok {
  6107  			return true
  6108  		}
  6109  
  6110  		return ott.ExpiresAt.After(now)
  6111  	}
  6112  }
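
        // A quick sanity sketch of the filter semantics (values hypothetical):
        // memdb.NewFilterIterator drops objects for which the filter returns true.
        //
        //	filter := expiredOneTimeTokenFilter(time.Now())
        //	expired := &structs.OneTimeToken{ExpiresAt: time.Now().Add(-time.Hour)}
        //	fresh := &structs.OneTimeToken{ExpiresAt: time.Now().Add(time.Hour)}
        //	_ = filter(expired) // false: kept by the iterator
        //	_ = filter(fresh)   // true: filtered out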
  6113  
  6114  // SchedulerConfig is used to get the current Scheduler configuration.
  6115  func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) {
  6116  	tx := s.db.ReadTxn()
  6117  	defer tx.Abort()
  6118  	return s.schedulerConfigTxn(tx)
  6119  }
  6120  
  6121  func (s *StateStore) schedulerConfigTxn(txn *txn) (uint64, *structs.SchedulerConfiguration, error) {
  6122  
  6123  	// Get the scheduler config
  6124  	c, err := txn.First("scheduler_config", "id")
  6125  	if err != nil {
  6126  		return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err)
  6127  	}
  6128  
  6129  	config, ok := c.(*structs.SchedulerConfiguration)
  6130  	if !ok {
  6131  		return 0, nil, nil
  6132  	}
  6133  
  6134  	return config.ModifyIndex, config, nil
  6135  }
  6136  
  6137  // SchedulerSetConfig is used to set the current Scheduler configuration.
  6138  func (s *StateStore) SchedulerSetConfig(index uint64, config *structs.SchedulerConfiguration) error {
  6139  	tx := s.db.WriteTxn(index)
  6140  	defer tx.Abort()
  6141  
  6142  	if err := s.schedulerSetConfigTxn(index, tx, config); err != nil {
        		return err
        	}
  6143  
  6144  	return tx.Commit()
  6145  }
  6146  
  6147  func (s *StateStore) ClusterMetadata(ws memdb.WatchSet) (*structs.ClusterMetadata, error) {
  6148  	txn := s.db.ReadTxn()
  6149  	defer txn.Abort()
  6150  
  6151  	// Get the cluster metadata
  6152  	watchCh, m, err := txn.FirstWatch("cluster_meta", "id")
  6153  	if err != nil {
  6154  		return nil, fmt.Errorf("failed cluster metadata lookup: %w", err)
  6155  	}
  6156  	ws.Add(watchCh)
  6157  
  6158  	if m != nil {
  6159  		return m.(*structs.ClusterMetadata), nil
  6160  	}
  6161  
  6162  	return nil, nil
  6163  }
  6164  
  6165  func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error {
  6166  	txn := s.db.WriteTxn(index)
  6167  	defer txn.Abort()
  6168  
  6169  	if err := s.setClusterMetadata(txn, meta); err != nil {
  6170  		return fmt.Errorf("set cluster metadata failed: %w", err)
  6171  	}
  6172  
  6173  	return txn.Commit()
  6174  }
  6175  
  6176  // WithWriteTransaction executes the passed function within a write transaction,
  6177  // and returns its result. If the invocation returns no error, the transaction
  6178  // is committed; otherwise, it's aborted.
  6179  func (s *StateStore) WithWriteTransaction(msgType structs.MessageType, index uint64, fn func(Txn) error) error {
  6180  	tx := s.db.WriteTxnMsgT(msgType, index)
  6181  	defer tx.Abort()
  6182  
  6183  	err := fn(tx)
  6184  	if err == nil {
  6185  		return tx.Commit()
  6186  	}
  6187  	return err
  6188  }
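
        // A minimal usage sketch (msgType and index are hypothetical):
        //
        //	err := store.WithWriteTransaction(msgType, index, func(tx Txn) error {
        //		if err := tx.Insert("index", &IndexEntry{"example", index}); err != nil {
        //			return err // the deferred Abort discards the transaction
        //		}
        //		return nil // fn succeeded, so the transaction commits
        //	})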
  6189  
  6190  // SchedulerCASConfig is used to update the scheduler configuration with a
  6191  // given Raft index. If the CAS index specified is not equal to the last observed index
  6192  // for the config, then the call is a noop.
  6193  func (s *StateStore) SchedulerCASConfig(index, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) {
  6194  	tx := s.db.WriteTxn(index)
  6195  	defer tx.Abort()
  6196  
  6197  	// Check for an existing config
  6198  	existing, err := tx.First("scheduler_config", "id")
  6199  	if err != nil {
  6200  		return false, fmt.Errorf("failed scheduler config lookup: %s", err)
  6201  	}
  6202  
  6203  	// If the existing index does not match the provided CAS
  6204  	// index arg, then we shouldn't update anything and can safely
  6205  	// return early here.
  6206  	e, ok := existing.(*structs.SchedulerConfiguration)
  6207  	if !ok || (e != nil && e.ModifyIndex != cidx) {
  6208  		return false, nil
  6209  	}
  6210  
	if err := s.schedulerSetConfigTxn(index, tx, config); err != nil {
		return false, err
	}
  6212  
  6213  	if err := tx.Commit(); err != nil {
  6214  		return false, err
  6215  	}
  6216  	return true, nil
  6217  }
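
// Check-and-set sketch (illustrative only): read the configuration, mutate a
// copy, and pass the ModifyIndex observed at read time so that a concurrent
// write causes this update to be rejected rather than silently overwritten.
// The field flipped below is arbitrary.
//
//	readIdx, cfg, err := s.SchedulerConfig()
//	if err == nil && cfg != nil {
//		updated := *cfg
//		updated.MemoryOversubscriptionEnabled = true
//		applied, err := s.SchedulerCASConfig(raftIndex, readIdx, &updated)
//		_ = applied // false means the CAS index no longer matched
//		_ = err
//	}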
  6218  
  6219  func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *txn, config *structs.SchedulerConfiguration) error {
  6220  	// Check for an existing config
  6221  	existing, err := tx.First("scheduler_config", "id")
  6222  	if err != nil {
  6223  		return fmt.Errorf("failed scheduler config lookup: %s", err)
  6224  	}
  6225  
  6226  	// Set the indexes.
  6227  	if existing != nil {
  6228  		config.CreateIndex = existing.(*structs.SchedulerConfiguration).CreateIndex
  6229  	} else {
  6230  		config.CreateIndex = idx
  6231  	}
  6232  	config.ModifyIndex = idx
  6233  
  6234  	if err := tx.Insert("scheduler_config", config); err != nil {
  6235  		return fmt.Errorf("failed updating scheduler config: %s", err)
  6236  	}
  6237  	return nil
  6238  }
  6239  
  6240  func (s *StateStore) setClusterMetadata(txn *txn, meta *structs.ClusterMetadata) error {
  6241  	// Check for an existing config, if it exists, verify that the cluster ID matches
  6242  	existing, err := txn.First("cluster_meta", "id")
  6243  	if err != nil {
  6244  		return fmt.Errorf("failed cluster meta lookup: %v", err)
  6245  	}
  6246  
  6247  	if existing != nil {
  6248  		existingClusterID := existing.(*structs.ClusterMetadata).ClusterID
  6249  		if meta.ClusterID != existingClusterID && existingClusterID != "" {
  6250  			// there is a bug in cluster ID detection
  6251  			return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID)
  6252  		}
  6253  	}
  6254  
	// The update is technically a noop, unless we someday add more, or
	// mutable, fields.
  6256  	if err := txn.Insert("cluster_meta", meta); err != nil {
  6257  		return fmt.Errorf("set cluster metadata failed: %v", err)
  6258  	}
  6259  
  6260  	return nil
  6261  }
  6262  
// UpsertScalingPolicies is used to insert or update a set of scaling policies.
  6264  func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error {
  6265  	txn := s.db.WriteTxn(index)
  6266  	defer txn.Abort()
  6267  
  6268  	if err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn); err != nil {
  6269  		return err
  6270  	}
  6271  
  6272  	return txn.Commit()
  6273  }
  6274  
// UpsertScalingPoliciesTxn is used to insert or update a set of scaling
// policies within an existing write transaction.
  6276  func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy,
  6277  	txn *txn) error {
  6278  
  6279  	hadUpdates := false
  6280  
  6281  	for _, policy := range scalingPolicies {
  6282  		// Check if the scaling policy already exists
  6283  		// Policy uniqueness is based on target and type
  6284  		it, err := txn.Get("scaling_policy", "target",
  6285  			policy.Target[structs.ScalingTargetNamespace],
  6286  			policy.Target[structs.ScalingTargetJob],
  6287  			policy.Target[structs.ScalingTargetGroup],
  6288  			policy.Target[structs.ScalingTargetTask],
  6289  		)
  6290  		if err != nil {
  6291  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  6292  		}
  6293  
  6294  		// Check if type matches
  6295  		var existing *structs.ScalingPolicy
  6296  		for raw := it.Next(); raw != nil; raw = it.Next() {
  6297  			p := raw.(*structs.ScalingPolicy)
  6298  			if p.Type == policy.Type {
  6299  				existing = p
  6300  				break
  6301  			}
  6302  		}
  6303  
		// Set up the indexes correctly
  6305  		if existing != nil {
  6306  			if !existing.Diff(policy) {
  6307  				continue
  6308  			}
  6309  			policy.ID = existing.ID
  6310  			policy.CreateIndex = existing.CreateIndex
  6311  		} else {
  6312  			// policy.ID must have been set already in Job.Register before log apply
  6313  			policy.CreateIndex = index
  6314  		}
  6315  		policy.ModifyIndex = index
  6316  
  6317  		// Insert the scaling policy
  6318  		hadUpdates = true
  6319  		if err := txn.Insert("scaling_policy", policy); err != nil {
  6320  			return err
  6321  		}
  6322  	}
  6323  
  6324  	// Update the indexes table for scaling policy if we updated any policies
  6325  	if hadUpdates {
  6326  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  6327  			return fmt.Errorf("index update failed: %v", err)
  6328  		}
  6329  	}
  6330  
  6331  	return nil
  6332  }
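
// Construction sketch (illustrative only): policy uniqueness is keyed on the
// target map plus the policy type, so an upsert with the same target and type
// updates the existing policy in place. The names below are placeholders, and
// in practice policy.ID is set during Job.Register before the log is applied.
//
//	policy := &structs.ScalingPolicy{
//		Type: structs.ScalingPolicyTypeHorizontal,
//		Target: map[string]string{
//			structs.ScalingTargetNamespace: "default",
//			structs.ScalingTargetJob:       "web",
//			structs.ScalingTargetGroup:     "frontend",
//		},
//	}
//	err := s.UpsertScalingPolicies(index, []*structs.ScalingPolicy{policy})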
  6333  
// NamespaceByName is used to look up a namespace by name
  6335  func (s *StateStore) NamespaceByName(ws memdb.WatchSet, name string) (*structs.Namespace, error) {
  6336  	txn := s.db.ReadTxn()
  6337  	return s.namespaceByNameImpl(ws, txn, name)
  6338  }
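
// Blocking-read sketch (illustrative only): the watch set registered during
// the lookup fires when the namespace changes, which is how blocking queries
// are built on top of the state store.
//
//	ws := memdb.NewWatchSet()
//	ns, err := s.NamespaceByName(ws, "default")
//	// ... use ns, then block until it changes before re-querying:
//	// err = ws.WatchCtx(ctx)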
  6339  
// namespaceByNameImpl is used to look up a namespace by name within an
// existing transaction
  6341  func (s *StateStore) namespaceByNameImpl(ws memdb.WatchSet, txn *txn, name string) (*structs.Namespace, error) {
  6342  	watchCh, existing, err := txn.FirstWatch(TableNamespaces, "id", name)
  6343  	if err != nil {
  6344  		return nil, fmt.Errorf("namespace lookup failed: %v", err)
  6345  	}
  6346  	ws.Add(watchCh)
  6347  
  6348  	if existing != nil {
  6349  		return existing.(*structs.Namespace), nil
  6350  	}
  6351  	return nil, nil
  6352  }
  6353  
  6354  // namespaceExists returns whether a namespace exists
  6355  func (s *StateStore) namespaceExists(txn *txn, namespace string) (bool, error) {
  6356  	if namespace == structs.DefaultNamespace {
  6357  		return true, nil
  6358  	}
  6359  
  6360  	existing, err := txn.First(TableNamespaces, "id", namespace)
  6361  	if err != nil {
  6362  		return false, fmt.Errorf("namespace lookup failed: %v", err)
  6363  	}
  6364  
  6365  	return existing != nil, nil
  6366  }
  6367  
// NamespacesByNamePrefix is used to look up namespaces by name prefix
  6369  func (s *StateStore) NamespacesByNamePrefix(ws memdb.WatchSet, namePrefix string) (memdb.ResultIterator, error) {
  6370  	txn := s.db.ReadTxn()
  6371  
  6372  	iter, err := txn.Get(TableNamespaces, "id_prefix", namePrefix)
  6373  	if err != nil {
  6374  		return nil, fmt.Errorf("namespaces lookup failed: %v", err)
  6375  	}
  6376  	ws.Add(iter.WatchCh())
  6377  
  6378  	return iter, nil
  6379  }
  6380  
  6381  // Namespaces returns an iterator over all the namespaces
  6382  func (s *StateStore) Namespaces(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6383  	txn := s.db.ReadTxn()
  6384  
  6385  	// Walk the entire namespace table
  6386  	iter, err := txn.Get(TableNamespaces, "id")
  6387  	if err != nil {
  6388  		return nil, err
  6389  	}
  6390  	ws.Add(iter.WatchCh())
  6391  	return iter, nil
  6392  }
  6393  
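// NamespaceNames returns the names of all existing namespaces.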
  6394  func (s *StateStore) NamespaceNames() ([]string, error) {
  6395  	it, err := s.Namespaces(nil)
  6396  	if err != nil {
  6397  		return nil, err
  6398  	}
  6399  
  6400  	nses := []string{}
  6401  	for {
  6402  		next := it.Next()
  6403  		if next == nil {
  6404  			break
  6405  		}
  6406  		ns := next.(*structs.Namespace)
  6407  		nses = append(nses, ns.Name)
  6408  	}
  6409  
  6410  	return nses, nil
  6411  }
  6412  
  6413  // UpsertNamespaces is used to register or update a set of namespaces.
  6414  func (s *StateStore) UpsertNamespaces(index uint64, namespaces []*structs.Namespace) error {
  6415  	txn := s.db.WriteTxn(index)
  6416  	defer txn.Abort()
  6417  
  6418  	for _, ns := range namespaces {
  6419  		if err := s.upsertNamespaceImpl(index, txn, ns); err != nil {
  6420  			return err
  6421  		}
  6422  	}
  6423  
  6424  	if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil {
  6425  		return fmt.Errorf("index update failed: %v", err)
  6426  	}
  6427  
  6428  	return txn.Commit()
  6429  }
  6430  
  6431  // upsertNamespaceImpl is used to upsert a namespace
  6432  func (s *StateStore) upsertNamespaceImpl(index uint64, txn *txn, namespace *structs.Namespace) error {
  6433  	// Ensure the namespace hash is non-nil. This should be done outside the state store
  6434  	// for performance reasons, but we check here for defense in depth.
  6435  	ns := namespace
  6436  	if len(ns.Hash) == 0 {
  6437  		ns.SetHash()
  6438  	}
  6439  
  6440  	// Check if the namespace already exists
  6441  	existing, err := txn.First(TableNamespaces, "id", ns.Name)
  6442  	if err != nil {
  6443  		return fmt.Errorf("namespace lookup failed: %v", err)
  6444  	}
  6445  
	// Set up the indexes correctly and determine which quotas need to be
	// reconciled
  6448  	var oldQuota string
  6449  	if existing != nil {
  6450  		exist := existing.(*structs.Namespace)
  6451  		ns.CreateIndex = exist.CreateIndex
  6452  		ns.ModifyIndex = index
  6453  
  6454  		// Grab the old quota on the namespace
  6455  		oldQuota = exist.Quota
  6456  	} else {
  6457  		ns.CreateIndex = index
  6458  		ns.ModifyIndex = index
  6459  	}
  6460  
  6461  	// Validate that the quota on the new namespace exists
  6462  	if ns.Quota != "" {
  6463  		exists, err := s.quotaSpecExists(txn, ns.Quota)
  6464  		if err != nil {
  6465  			return fmt.Errorf("looking up namespace quota %q failed: %v", ns.Quota, err)
  6466  		} else if !exists {
  6467  			return fmt.Errorf("namespace %q using non-existent quota %q", ns.Name, ns.Quota)
  6468  		}
  6469  	}
  6470  
  6471  	// Insert the namespace
  6472  	if err := txn.Insert(TableNamespaces, ns); err != nil {
  6473  		return fmt.Errorf("namespace insert failed: %v", err)
  6474  	}
  6475  
  6476  	// Reconcile changed quotas
  6477  	return s.quotaReconcile(index, txn, ns.Quota, oldQuota)
  6478  }
  6479  
  6480  // DeleteNamespaces is used to remove a set of namespaces
  6481  func (s *StateStore) DeleteNamespaces(index uint64, names []string) error {
  6482  	txn := s.db.WriteTxn(index)
  6483  	defer txn.Abort()
  6484  
  6485  	for _, name := range names {
  6486  		// Lookup the namespace
  6487  		existing, err := txn.First(TableNamespaces, "id", name)
  6488  		if err != nil {
  6489  			return fmt.Errorf("namespace lookup failed: %v", err)
  6490  		}
  6491  		if existing == nil {
  6492  			return fmt.Errorf("namespace not found")
  6493  		}
  6494  
  6495  		ns := existing.(*structs.Namespace)
  6496  		if ns.Name == structs.DefaultNamespace {
  6497  			return fmt.Errorf("default namespace can not be deleted")
  6498  		}
  6499  
  6500  		// Ensure that the namespace doesn't have any non-terminal jobs
  6501  		iter, err := s.jobsByNamespaceImpl(nil, name, txn)
  6502  		if err != nil {
  6503  			return err
  6504  		}
  6505  
  6506  		for {
  6507  			raw := iter.Next()
  6508  			if raw == nil {
  6509  				break
  6510  			}
  6511  			job := raw.(*structs.Job)
  6512  
  6513  			if job.Status != structs.JobStatusDead {
  6514  				return fmt.Errorf("namespace %q contains at least one non-terminal job %q. "+
  6515  					"All jobs must be terminal in namespace before it can be deleted", name, job.ID)
  6516  			}
  6517  		}
  6518  
  6519  		vIter, err := s.csiVolumesByNamespaceImpl(txn, nil, name, "")
  6520  		if err != nil {
  6521  			return err
  6522  		}
  6523  		rawVol := vIter.Next()
  6524  		if rawVol != nil {
  6525  			vol := rawVol.(*structs.CSIVolume)
  6526  			return fmt.Errorf("namespace %q contains at least one CSI volume %q. "+
  6527  				"All CSI volumes in namespace must be deleted before it can be deleted", name, vol.ID)
  6528  		}
  6529  
  6530  		varIter, err := s.getVariablesByNamespaceImpl(txn, nil, name)
  6531  		if err != nil {
  6532  			return err
  6533  		}
  6534  		if varIter.Next() != nil {
  6535  			// unlike job/volume, don't show the path here because the user may
  6536  			// not have List permissions on the vars in this namespace
  6537  			return fmt.Errorf("namespace %q contains at least one variable. "+
  6538  				"All variables in namespace must be deleted before it can be deleted", name)
  6539  		}
  6540  
  6541  		// Delete the namespace
  6542  		if err := txn.Delete(TableNamespaces, existing); err != nil {
  6543  			return fmt.Errorf("namespace deletion failed: %v", err)
  6544  		}
  6545  	}
  6546  
  6547  	if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil {
  6548  		return fmt.Errorf("index update failed: %v", err)
  6549  	}
  6550  
  6551  	return txn.Commit()
  6552  }
  6553  
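// DeleteScalingPolicies is used to delete a set of scaling policies by ID.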
  6554  func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error {
  6555  	txn := s.db.WriteTxn(index)
  6556  	defer txn.Abort()
  6557  
  6558  	err := s.DeleteScalingPoliciesTxn(index, ids, txn)
  6559  	if err == nil {
  6560  		return txn.Commit()
  6561  	}
  6562  
  6563  	return err
  6564  }
  6565  
// DeleteScalingPoliciesTxn is used to delete a set of scaling policies by ID
// within an existing write transaction.
  6567  func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *txn) error {
  6568  	if len(ids) == 0 {
  6569  		return nil
  6570  	}
  6571  
  6572  	for _, id := range ids {
  6573  		// Lookup the scaling policy
  6574  		existing, err := txn.First("scaling_policy", "id", id)
  6575  		if err != nil {
  6576  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  6577  		}
  6578  		if existing == nil {
  6579  			return fmt.Errorf("scaling policy not found")
  6580  		}
  6581  
  6582  		// Delete the scaling policy
  6583  		if err := txn.Delete("scaling_policy", existing); err != nil {
  6584  			return fmt.Errorf("scaling policy delete failed: %v", err)
  6585  		}
  6586  	}
  6587  
  6588  	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  6589  		return fmt.Errorf("index update failed: %v", err)
  6590  	}
  6591  
  6592  	return nil
  6593  }
  6594  
  6595  // ScalingPolicies returns an iterator over all the scaling policies
  6596  func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6597  	txn := s.db.ReadTxn()
  6598  
  6599  	// Walk the entire scaling_policy table
  6600  	iter, err := txn.Get("scaling_policy", "id")
  6601  	if err != nil {
  6602  		return nil, err
  6603  	}
  6604  
  6605  	ws.Add(iter.WatchCh())
  6606  
  6607  	return iter, nil
  6608  }
  6609  
  6610  // ScalingPoliciesByTypePrefix returns an iterator over scaling policies with a certain type prefix.
  6611  func (s *StateStore) ScalingPoliciesByTypePrefix(ws memdb.WatchSet, t string) (memdb.ResultIterator, error) {
  6612  	txn := s.db.ReadTxn()
  6613  
  6614  	iter, err := txn.Get("scaling_policy", "type_prefix", t)
  6615  	if err != nil {
  6616  		return nil, err
  6617  	}
  6618  
  6619  	ws.Add(iter.WatchCh())
  6620  	return iter, nil
  6621  }
  6622  
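// ScalingPoliciesByNamespace returns an iterator over the scaling policies
// that target the given namespace, optionally restricted to policies whose
// type starts with the given prefix.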
  6623  func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace, typ string) (memdb.ResultIterator, error) {
  6624  	txn := s.db.ReadTxn()
  6625  
  6626  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace)
  6627  	if err != nil {
  6628  		return nil, err
  6629  	}
  6630  
  6631  	ws.Add(iter.WatchCh())
  6632  
  6633  	// Wrap the iterator in a filter to exact match the namespace
  6634  	iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace))
  6635  
  6636  	// If policy type is specified as well, wrap again
  6637  	if typ != "" {
  6638  		iter = memdb.NewFilterIterator(iter, func(raw interface{}) bool {
  6639  			p, ok := raw.(*structs.ScalingPolicy)
  6640  			if !ok {
  6641  				return true
  6642  			}
  6643  			return !strings.HasPrefix(p.Type, typ)
  6644  		})
  6645  	}
  6646  
  6647  	return iter, nil
  6648  }
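
// Query sketch (illustrative only): list the policies in a namespace whose
// type starts with a given prefix. The names below are placeholders.
//
//	iter, err := s.ScalingPoliciesByNamespace(ws, "default", structs.ScalingPolicyTypeHorizontal)
//	if err == nil {
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			p := raw.(*structs.ScalingPolicy)
//			_ = p // each p targets the "default" namespace
//		}
//	}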
  6649  
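// ScalingPoliciesByJob returns an iterator over the scaling policies that
// target the given job, optionally restricted to an exact policy type.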
  6650  func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID, policyType string) (memdb.ResultIterator,
  6651  	error) {
  6652  	txn := s.db.ReadTxn()
  6653  	iter, err := s.ScalingPoliciesByJobTxn(ws, namespace, jobID, txn)
  6654  	if err != nil {
  6655  		return nil, err
  6656  	}
  6657  
  6658  	if policyType == "" {
  6659  		return iter, nil
  6660  	}
  6661  
  6662  	filter := func(raw interface{}) bool {
  6663  		p, ok := raw.(*structs.ScalingPolicy)
  6664  		if !ok {
  6665  			return true
  6666  		}
  6667  		return policyType != p.Type
  6668  	}
  6669  
  6670  	return memdb.NewFilterIterator(iter, filter), nil
  6671  }
  6672  
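// ScalingPoliciesByJobTxn returns an iterator over the scaling policies that
// target the given job, within an existing transaction.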
  6673  func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string,
  6674  	txn *txn) (memdb.ResultIterator, error) {
  6675  
  6676  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID)
  6677  	if err != nil {
  6678  		return nil, err
  6679  	}
  6680  
  6681  	ws.Add(iter.WatchCh())
  6682  
  6683  	filter := func(raw interface{}) bool {
  6684  		d, ok := raw.(*structs.ScalingPolicy)
  6685  		if !ok {
  6686  			return true
  6687  		}
  6688  
  6689  		return d.Target[structs.ScalingTargetJob] != jobID
  6690  	}
  6691  
  6692  	// Wrap the iterator in a filter
  6693  	wrap := memdb.NewFilterIterator(iter, filter)
  6694  	return wrap, nil
  6695  }
  6696  
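// ScalingPolicyByID returns the scaling policy with the given ID, or nil if
// it does not exist.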
  6697  func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) {
  6698  	txn := s.db.ReadTxn()
  6699  
  6700  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "id", id)
  6701  	if err != nil {
  6702  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  6703  	}
  6704  	ws.Add(watchCh)
  6705  
  6706  	if existing != nil {
  6707  		return existing.(*structs.ScalingPolicy), nil
  6708  	}
  6709  
  6710  	return nil, nil
  6711  }
  6712  
// ScalingPolicyByTargetAndType returns the fully-qualified policy matching
// the given target and policy type, or nil if one does not exist. The watch
// set is registered against the target only, not the policy type.
  6715  func (s *StateStore) ScalingPolicyByTargetAndType(ws memdb.WatchSet, target map[string]string, typ string) (*structs.ScalingPolicy,
  6716  	error) {
  6717  	txn := s.db.ReadTxn()
  6718  
  6719  	namespace := target[structs.ScalingTargetNamespace]
  6720  	job := target[structs.ScalingTargetJob]
  6721  	group := target[structs.ScalingTargetGroup]
  6722  	task := target[structs.ScalingTargetTask]
  6723  
  6724  	it, err := txn.Get("scaling_policy", "target", namespace, job, group, task)
  6725  	if err != nil {
  6726  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  6727  	}
  6728  
  6729  	ws.Add(it.WatchCh())
  6730  
  6731  	// Check for type
  6732  	var existing *structs.ScalingPolicy
  6733  	for raw := it.Next(); raw != nil; raw = it.Next() {
  6734  		p := raw.(*structs.ScalingPolicy)
  6735  		if p.Type == typ {
  6736  			existing = p
  6737  			break
  6738  		}
  6739  	}
  6740  
  6741  	if existing != nil {
  6742  		return existing, nil
  6743  	}
  6744  
  6745  	return nil, nil
  6746  }
  6747  
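// ScalingPoliciesByIDPrefix returns an iterator over the scaling policies in
// the given namespace whose IDs match the given prefix.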
  6748  func (s *StateStore) ScalingPoliciesByIDPrefix(ws memdb.WatchSet, namespace string, prefix string) (memdb.ResultIterator, error) {
  6749  	txn := s.db.ReadTxn()
  6750  
  6751  	iter, err := txn.Get("scaling_policy", "id_prefix", prefix)
  6752  	if err != nil {
  6753  		return nil, fmt.Errorf("scaling policy lookup failed: %v", err)
  6754  	}
  6755  
  6756  	ws.Add(iter.WatchCh())
  6757  
  6758  	iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace))
  6759  
  6760  	return iter, nil
  6761  }
  6762  
// scalingPolicyNamespaceFilter returns a memdb filter function that removes
// all scaling policies not targeting the given namespace.
  6765  func scalingPolicyNamespaceFilter(namespace string) func(interface{}) bool {
  6766  	return func(raw interface{}) bool {
  6767  		p, ok := raw.(*structs.ScalingPolicy)
  6768  		if !ok {
  6769  			return true
  6770  		}
  6771  
  6772  		return p.Target[structs.ScalingTargetNamespace] != namespace
  6773  	}
  6774  }
  6775  
  6776  // StateSnapshot is used to provide a point-in-time snapshot
  6777  type StateSnapshot struct {
  6778  	StateStore
  6779  }
  6780  
// DenormalizeAllocationsMap takes in a map of nodes to allocations and
// denormalizes each allocation in place: the stored Allocation is looked up
// for each allocation diff and the updated attributes are merged into a copy
// of it.
  6784  func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error {
  6785  	for nodeID, allocs := range nodeAllocations {
  6786  		denormalizedAllocs, err := s.DenormalizeAllocationSlice(allocs)
  6787  		if err != nil {
  6788  			return err
  6789  		}
  6790  
  6791  		nodeAllocations[nodeID] = denormalizedAllocs
  6792  	}
  6793  	return nil
  6794  }
  6795  
// DenormalizeAllocationSlice queries the stored Allocation for each
// allocation diff (represented as an Allocation) and merges the updated
// attributes into a copy of the stored Allocation.
//
// This should only be called on terminal allocs, particularly stopped or
// preempted allocs.
  6801  func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) {
  6802  	allocDiffs := make([]*structs.AllocationDiff, len(allocs))
  6803  	for i, alloc := range allocs {
  6804  		allocDiffs[i] = alloc.AllocationDiff()
  6805  	}
  6806  
  6807  	return s.DenormalizeAllocationDiffSlice(allocDiffs)
  6808  }
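
// Usage sketch (illustrative only): the plan applier denormalizes a plan's
// stopped allocs against a snapshot. The snap, plan, and nodeID values are
// placeholders.
//
//	stopped, err := snap.DenormalizeAllocationSlice(plan.NodeUpdate[nodeID])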
  6809  
// DenormalizeAllocationDiffSlice queries the stored Allocation for each
// AllocationDiff and merges the updated attributes into a copy of the stored
// Allocation.
//
// This should only be called on terminal allocs, particularly stopped or
// preempted allocs.
  6814  func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) {
  6815  	// Output index for denormalized Allocations
  6816  	j := 0
  6817  
  6818  	denormalizedAllocs := make([]*structs.Allocation, len(allocDiffs))
  6819  	for _, allocDiff := range allocDiffs {
  6820  		alloc, err := s.AllocByID(nil, allocDiff.ID)
  6821  		if err != nil {
  6822  			return nil, fmt.Errorf("alloc lookup failed: %v", err)
  6823  		}
  6824  		if alloc == nil {
  6825  			return nil, fmt.Errorf("alloc %v doesn't exist", allocDiff.ID)
  6826  		}
  6827  
		// Merge the updates into a copy of the Allocation. Don't update
		// alloc.Job for terminal allocs, so that alloc refers to the latest
		// Job view before destruction, which eases handler implementations.
  6830  		allocCopy := alloc.Copy()
  6831  
  6832  		if allocDiff.PreemptedByAllocation != "" {
  6833  			allocCopy.PreemptedByAllocation = allocDiff.PreemptedByAllocation
  6834  			allocCopy.DesiredDescription = getPreemptedAllocDesiredDescription(allocDiff.PreemptedByAllocation)
  6835  			allocCopy.DesiredStatus = structs.AllocDesiredStatusEvict
  6836  		} else {
  6837  			// If alloc is a stopped alloc
  6838  			allocCopy.DesiredDescription = allocDiff.DesiredDescription
  6839  			allocCopy.DesiredStatus = structs.AllocDesiredStatusStop
  6840  			if allocDiff.ClientStatus != "" {
  6841  				allocCopy.ClientStatus = allocDiff.ClientStatus
  6842  			}
  6843  			if allocDiff.FollowupEvalID != "" {
  6844  				allocCopy.FollowupEvalID = allocDiff.FollowupEvalID
  6845  			}
  6846  		}
  6847  		if allocDiff.ModifyTime != 0 {
  6848  			allocCopy.ModifyTime = allocDiff.ModifyTime
  6849  		}
  6850  
		// Store the denormalized alloc in the output slice
  6852  		denormalizedAllocs[j] = allocCopy
  6853  		j++
  6854  	}
  6855  	// Retain only the denormalized Allocations in the slice
  6856  	denormalizedAllocs = denormalizedAllocs[:j]
  6857  	return denormalizedAllocs, nil
  6858  }
  6859  
  6860  func getPreemptedAllocDesiredDescription(preemptedByAllocID string) string {
  6861  	return fmt.Sprintf("Preempted by alloc ID %v", preemptedByAllocID)
  6862  }
  6863  
  6864  // UpsertRootKeyMeta saves root key meta or updates it in-place.
  6865  func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKeyMeta, rekey bool) error {
  6866  	txn := s.db.WriteTxn(index)
  6867  	defer txn.Abort()
  6868  
  6869  	// get any existing key for updating
  6870  	raw, err := txn.First(TableRootKeyMeta, indexID, rootKeyMeta.KeyID)
  6871  	if err != nil {
  6872  		return fmt.Errorf("root key metadata lookup failed: %v", err)
  6873  	}
  6874  
  6875  	isRotation := false
  6876  
  6877  	if raw != nil {
  6878  		existing := raw.(*structs.RootKeyMeta)
  6879  		rootKeyMeta.CreateIndex = existing.CreateIndex
  6880  		rootKeyMeta.CreateTime = existing.CreateTime
  6881  		isRotation = !existing.Active() && rootKeyMeta.Active()
  6882  	} else {
  6883  		rootKeyMeta.CreateIndex = index
  6884  		isRotation = rootKeyMeta.Active()
  6885  	}
  6886  	rootKeyMeta.ModifyIndex = index
  6887  
  6888  	if rekey && !isRotation {
  6889  		return fmt.Errorf("cannot rekey without setting the new key active")
  6890  	}
  6891  
	// if the upsert is for a newly-active key, we need to demote the other
	// keys in the same transaction: the previously active key becomes
	// inactive, or, if a rekey was requested, it and any inactive keys are
	// marked as rekeying.
  6894  	if isRotation {
  6895  		iter, err := txn.Get(TableRootKeyMeta, indexID)
  6896  		if err != nil {
  6897  			return err
  6898  		}
  6899  		for {
  6900  			raw := iter.Next()
  6901  			if raw == nil {
  6902  				break
  6903  			}
  6904  			key := raw.(*structs.RootKeyMeta)
  6905  			modified := false
  6906  
  6907  			switch key.State {
  6908  			case structs.RootKeyStateInactive:
  6909  				if rekey {
  6910  					key.SetRekeying()
  6911  					modified = true
  6912  				}
  6913  			case structs.RootKeyStateActive:
  6914  				if rekey {
  6915  					key.SetRekeying()
  6916  				} else {
  6917  					key.SetInactive()
  6918  				}
  6919  				modified = true
  6920  			case structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated:
  6921  				// nothing to do
  6922  			}
  6923  
  6924  			if modified {
  6925  				key.ModifyIndex = index
  6926  				if err := txn.Insert(TableRootKeyMeta, key); err != nil {
  6927  					return err
  6928  				}
  6929  			}
		}
  6932  	}
  6933  
  6934  	if err := txn.Insert(TableRootKeyMeta, rootKeyMeta); err != nil {
  6935  		return err
  6936  	}
  6937  
  6938  	// update the indexes table
  6939  	if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
  6940  		return fmt.Errorf("index update failed: %v", err)
  6941  	}
  6942  	return txn.Commit()
  6943  }
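
// Rotation sketch (illustrative only): upserting a new key in the active
// state demotes the previously active key within the same transaction. This
// assumes the RootKeyMeta helpers in the structs package.
//
//	meta := structs.NewRootKeyMeta()
//	meta.SetActive()
//	err := s.UpsertRootKeyMeta(index, meta, false)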
  6944  
// DeleteRootKeyMeta deletes the metadata for a single root key, or returns an
// error if it doesn't exist.
  6947  func (s *StateStore) DeleteRootKeyMeta(index uint64, keyID string) error {
  6948  	txn := s.db.WriteTxn(index)
  6949  	defer txn.Abort()
  6950  
  6951  	// find the old key
  6952  	existing, err := txn.First(TableRootKeyMeta, indexID, keyID)
  6953  	if err != nil {
  6954  		return fmt.Errorf("root key metadata lookup failed: %v", err)
  6955  	}
  6956  	if existing == nil {
  6957  		return fmt.Errorf("root key metadata not found")
  6958  	}
  6959  	if err := txn.Delete(TableRootKeyMeta, existing); err != nil {
  6960  		return fmt.Errorf("root key metadata delete failed: %v", err)
  6961  	}
  6962  
  6963  	// update the indexes table
  6964  	if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
  6965  		return fmt.Errorf("index update failed: %v", err)
  6966  	}
  6967  
  6968  	return txn.Commit()
  6969  }
  6970  
  6971  // RootKeyMetas returns an iterator over all root key metadata
  6972  func (s *StateStore) RootKeyMetas(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  6973  	txn := s.db.ReadTxn()
  6974  
  6975  	iter, err := txn.Get(TableRootKeyMeta, indexID)
  6976  	if err != nil {
  6977  		return nil, err
  6978  	}
  6979  
  6980  	ws.Add(iter.WatchCh())
  6981  	return iter, nil
  6982  }
  6983  
  6984  // RootKeyMetaByID returns a specific root key meta
  6985  func (s *StateStore) RootKeyMetaByID(ws memdb.WatchSet, id string) (*structs.RootKeyMeta, error) {
  6986  	txn := s.db.ReadTxn()
  6987  
  6988  	watchCh, raw, err := txn.FirstWatch(TableRootKeyMeta, indexID, id)
  6989  	if err != nil {
  6990  		return nil, fmt.Errorf("root key metadata lookup failed: %v", err)
  6991  	}
  6992  	ws.Add(watchCh)
  6993  
  6994  	if raw != nil {
  6995  		return raw.(*structs.RootKeyMeta), nil
  6996  	}
  6997  	return nil, nil
  6998  }
  6999  
  7000  // GetActiveRootKeyMeta returns the metadata for the currently active root key
  7001  func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMeta, error) {
  7002  	txn := s.db.ReadTxn()
  7003  
  7004  	iter, err := txn.Get(TableRootKeyMeta, indexID)
  7005  	if err != nil {
  7006  		return nil, err
  7007  	}
  7008  	ws.Add(iter.WatchCh())
  7009  
  7010  	for {
  7011  		raw := iter.Next()
  7012  		if raw == nil {
  7013  			break
  7014  		}
  7015  		key := raw.(*structs.RootKeyMeta)
  7016  		if key.Active() {
  7017  			return key, nil
  7018  		}
  7019  	}
  7020  	return nil, nil
  7021  }
  7022  
  7023  // IsRootKeyMetaInUse determines whether a key has been used to sign a workload
  7024  // identity for a live allocation or encrypt any variables
  7025  func (s *StateStore) IsRootKeyMetaInUse(keyID string) (bool, error) {
  7026  	txn := s.db.ReadTxn()
  7027  
  7028  	iter, err := txn.Get(TableAllocs, indexSigningKey, keyID, true)
  7029  	if err != nil {
  7030  		return false, err
  7031  	}
  7032  	alloc := iter.Next()
  7033  	if alloc != nil {
  7034  		return true, nil
  7035  	}
  7036  
  7037  	iter, err = txn.Get(TableVariables, indexKeyID, keyID)
  7038  	if err != nil {
  7039  		return false, err
  7040  	}
  7041  	variable := iter.Next()
  7042  	if variable != nil {
  7043  		return true, nil
  7044  	}
  7045  
  7046  	return false, nil
  7047  }
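
// Guarded-delete sketch (illustrative only): callers typically verify a key
// is unused before removing its metadata.
//
//	inUse, err := s.IsRootKeyMetaInUse(keyID)
//	if err == nil && !inUse {
//		err = s.DeleteRootKeyMeta(index, keyID)
//	}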