github.com/ThomasObenaus/nomad@v0.11.1/nomad/state/state_store.go

     1  package state
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"reflect"
     7  	"sort"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	memdb "github.com/hashicorp/go-memdb"
    12  	multierror "github.com/hashicorp/go-multierror"
    13  	"github.com/pkg/errors"
    14  
    15  	"github.com/hashicorp/nomad/helper"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
    19  // Txn is a transaction against a state store.
    20  // This can be a read or write transaction.
    21  type Txn = *memdb.Txn
    22  
    23  const (
     24  	// NodeRegisterEventRegistered is the message used when the node is
     25  	// initially registered.
     26  	NodeRegisterEventRegistered = "Node registered"
    27  
    28  	// NodeRegisterEventReregistered is the message used when the node becomes
    29  	// reregistered.
    30  	NodeRegisterEventReregistered = "Node re-registered"
    31  )
    32  
    33  // IndexEntry is used with the "index" table
    34  // for managing the latest Raft index affecting a table.
    35  type IndexEntry struct {
    36  	Key   string
    37  	Value uint64
    38  }
    39  
    40  // StateStoreConfig is used to configure a new state store
    41  type StateStoreConfig struct {
    42  	// Logger is used to output the state store's logs
    43  	Logger log.Logger
    44  
    45  	// Region is the region of the server embedding the state store.
    46  	Region string
    47  }
    48  
    49  // The StateStore is responsible for maintaining all the Nomad
    50  // state. It is manipulated by the FSM which maintains consistency
    51  // through the use of Raft. The goals of the StateStore are to provide
    52  // high concurrency for read operations without blocking writes, and
    53  // to provide write availability in the face of reads. EVERY object
    54  // returned as a result of a read against the state store should be
    55  // considered a constant and NEVER modified in place.
    56  type StateStore struct {
    57  	logger log.Logger
    58  	db     *memdb.MemDB
    59  
    60  	// config is the passed in configuration
    61  	config *StateStoreConfig
    62  
    63  	// abandonCh is used to signal watchers that this state store has been
    64  	// abandoned (usually during a restore). This is only ever closed.
    65  	abandonCh chan struct{}
    66  }
    67  
    68  // NewStateStore is used to create a new state store
    69  func NewStateStore(config *StateStoreConfig) (*StateStore, error) {
    70  	// Create the MemDB
    71  	db, err := memdb.NewMemDB(stateStoreSchema())
    72  	if err != nil {
    73  		return nil, fmt.Errorf("state store setup failed: %v", err)
    74  	}
    75  
    76  	// Create the state store
    77  	s := &StateStore{
    78  		logger:    config.Logger.Named("state_store"),
    79  		db:        db,
    80  		config:    config,
    81  		abandonCh: make(chan struct{}),
    82  	}
    83  	return s, nil
    84  }
    85  
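        // A minimal construction sketch; the "global" region value and the use of
        // go-hclog's default logger are illustrative, not prescribed:
        //
        //	store, err := NewStateStore(&StateStoreConfig{
        //		Logger: log.Default(),
        //		Region: "global",
        //	})
        //	if err != nil {
        //		// state store setup failed (e.g. bad schema)
        //	}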
    86  // Config returns the state store configuration.
    87  func (s *StateStore) Config() *StateStoreConfig {
    88  	return s.config
    89  }
    90  
    91  // Snapshot is used to create a point in time snapshot. Because
    92  // we use MemDB, we just need to snapshot the state of the underlying
    93  // database.
    94  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    95  	snap := &StateSnapshot{
    96  		StateStore: StateStore{
    97  			logger: s.logger,
    98  			config: s.config,
    99  			db:     s.db.Snapshot(),
   100  		},
   101  	}
   102  	return snap, nil
   103  }
   104  
   105  // SnapshotMinIndex is used to create a state snapshot where the index is
   106  // guaranteed to be greater than or equal to the index parameter.
   107  //
   108  // Some server operations (such as scheduling) exchange objects via RPC
   109  // concurrent with Raft log application, so they must ensure the state store
   110  // snapshot they are operating on is at or after the index the objects
   111  // retrieved via RPC were applied to the Raft log at.
   112  //
   113  // Callers should maintain their own timer metric as the time this method
   114  // blocks indicates Raft log application latency relative to scheduling.
   115  func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) {
   116  	// Ported from work.go:waitForIndex prior to 0.9
   117  
   118  	const backoffBase = 20 * time.Millisecond
   119  	const backoffLimit = 1 * time.Second
   120  	var retries uint
   121  	var retryTimer *time.Timer
   122  
   123  	// XXX: Potential optimization is to set up a watch on the state
   124  	// store's index table and only unblock via a trigger rather than
   125  	// polling.
   126  	for {
    127  		// Get the state store's current index
   128  		snapshotIndex, err := s.LatestIndex()
   129  		if err != nil {
   130  			return nil, fmt.Errorf("failed to determine state store's index: %v", err)
   131  		}
   132  
   133  		// We only need the FSM state to be as recent as the given index
   134  		if snapshotIndex >= index {
   135  			return s.Snapshot()
   136  		}
   137  
   138  		// Exponential back off
   139  		retries++
   140  		if retryTimer == nil {
   141  			// First retry, start at baseline
   142  			retryTimer = time.NewTimer(backoffBase)
   143  		} else {
   144  			// Subsequent retry, reset timer
   145  			deadline := 1 << (2 * retries) * backoffBase
   146  			if deadline > backoffLimit {
   147  				deadline = backoffLimit
   148  			}
   149  			retryTimer.Reset(deadline)
   150  		}
   151  
   152  		select {
   153  		case <-ctx.Done():
   154  			return nil, ctx.Err()
   155  		case <-retryTimer.C:
   156  		}
   157  	}
   158  }
   159  
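        // A usage sketch for SnapshotMinIndex, assuming a caller-supplied Raft
        // index named raftIndex; the five-second deadline is illustrative:
        //
        //	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        //	defer cancel()
        //	snap, err := store.SnapshotMinIndex(ctx, raftIndex)
        //	if err != nil {
        //		// the deadline elapsed before the store caught up, or
        //		// the LatestIndex lookup failed
        //	}
        //	_ = snap // read-only view at index >= raftIndex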
   160  // Restore is used to optimize the efficiency of rebuilding
   161  // state by minimizing the number of transactions and checking
   162  // overhead.
   163  func (s *StateStore) Restore() (*StateRestore, error) {
   164  	txn := s.db.Txn(true)
   165  	r := &StateRestore{
   166  		txn: txn,
   167  	}
   168  	return r, nil
   169  }
   170  
   171  // AbandonCh returns a channel you can wait on to know if the state store was
   172  // abandoned.
   173  func (s *StateStore) AbandonCh() <-chan struct{} {
   174  	return s.abandonCh
   175  }
   176  
   177  // Abandon is used to signal that the given state store has been abandoned.
   178  // Calling this more than one time will panic.
   179  func (s *StateStore) Abandon() {
   180  	close(s.abandonCh)
   181  }
   182  
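        // A watcher sketch: long-lived readers can select on AbandonCh so that a
        // snapshot restore invalidates their cached reads. stopCh is an assumed
        // shutdown channel, not part of this file:
        //
        //	select {
        //	case <-store.AbandonCh():
        //		// store replaced during a restore; re-read all state
        //	case <-stopCh:
        //	}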
   183  // QueryFn is the definition of a function that can be used to implement a basic
   184  // blocking query against the state store.
   185  type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error)
   186  
   187  // BlockingQuery takes a query function and runs the function until the minimum
   188  // query index is met or until the passed context is cancelled.
   189  func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) (
   190  	resp interface{}, index uint64, err error) {
   191  
   192  RUN_QUERY:
    193  	// We capture the state store and its abandon channel but pass a snapshot to
    194  	// the blocking query function. Operating on the snapshot lets the query
    195  	// make separate reads against one consistent view of the state store.
   196  	abandonCh := s.AbandonCh()
   197  	snap, _ := s.Snapshot()
   198  	stateSnap := &snap.StateStore
   199  
   200  	// We can skip all watch tracking if this isn't a blocking query.
   201  	var ws memdb.WatchSet
   202  	if minIndex > 0 {
   203  		ws = memdb.NewWatchSet()
   204  
   205  		// This channel will be closed if a snapshot is restored and the
   206  		// whole state store is abandoned.
   207  		ws.Add(abandonCh)
   208  	}
   209  
   210  	resp, index, err = query(ws, stateSnap)
   211  	if err != nil {
   212  		return nil, index, err
   213  	}
   214  
   215  	// We haven't reached the min-index yet.
   216  	if minIndex > 0 && index <= minIndex {
   217  		if err := ws.WatchCtx(ctx); err != nil {
   218  			return nil, index, err
   219  		}
   220  
   221  		goto RUN_QUERY
   222  	}
   223  
   224  	return resp, index, nil
   225  }
   226  
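        // A QueryFn sketch that can be fed to BlockingQuery, built only from
        // accessors defined in this file; the namespace and job ID are
        // illustrative:
        //
        //	queryJob := func(ws memdb.WatchSet, state *StateStore) (interface{}, uint64, error) {
        //		job, err := state.JobByID(ws, "default", "example")
        //		if err != nil {
        //			return nil, 0, err
        //		}
        //		index, err := state.LatestIndex()
        //		if err != nil {
        //			return nil, 0, err
        //		}
        //		return job, index, nil
        //	}
        //	resp, index, err := store.BlockingQuery(queryJob, minIndex, ctx)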
   227  // UpsertPlanResults is used to upsert the results of a plan.
   228  func (s *StateStore) UpsertPlanResults(index uint64, results *structs.ApplyPlanResultsRequest) error {
   229  	snapshot, err := s.Snapshot()
   230  	if err != nil {
   231  		return err
   232  	}
   233  
   234  	allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped)
   235  	if err != nil {
   236  		return err
   237  	}
   238  
   239  	allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted)
   240  	if err != nil {
   241  		return err
   242  	}
   243  
   244  	// COMPAT 0.11: Remove this denormalization when NodePreemptions is removed
   245  	results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions)
   246  	if err != nil {
   247  		return err
   248  	}
   249  
   250  	txn := s.db.Txn(true)
   251  	defer txn.Abort()
   252  
   253  	// Upsert the newly created or updated deployment
   254  	if results.Deployment != nil {
   255  		if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil {
   256  			return err
   257  		}
   258  	}
   259  
    260  	// Update the status of deployments affected by the plan; surface any error.
    261  	if err := s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn); err != nil {
    262  		return err
    263  	}
   264  
   265  	if results.EvalID != "" {
   266  		// Update the modify index of the eval id
   267  		if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil {
   268  			return err
   269  		}
   270  	}
   271  
   272  	numAllocs := 0
   273  	if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 {
    274  		// COMPAT 0.11: This branch will be removed when Alloc is removed
   275  		// Attach the job to all the allocations. It is pulled out in the payload to
   276  		// avoid the redundancy of encoding, but should be denormalized prior to
   277  		// being inserted into MemDB.
   278  		addComputedAllocAttrs(results.Alloc, results.Job)
   279  		numAllocs = len(results.Alloc) + len(results.NodePreemptions)
   280  	} else {
   281  		// Attach the job to all the allocations. It is pulled out in the payload to
   282  		// avoid the redundancy of encoding, but should be denormalized prior to
   283  		// being inserted into MemDB.
   284  		addComputedAllocAttrs(results.AllocsUpdated, results.Job)
   285  		numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted)
   286  	}
   287  
   288  	allocsToUpsert := make([]*structs.Allocation, 0, numAllocs)
   289  
   290  	// COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed
   291  	allocsToUpsert = append(allocsToUpsert, results.Alloc...)
   292  	allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...)
   293  
   294  	allocsToUpsert = append(allocsToUpsert, allocsStopped...)
   295  	allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...)
   296  	allocsToUpsert = append(allocsToUpsert, allocsPreempted...)
   297  
    298  	// Handle the upgrade path: canonicalize allocs written by older servers
   299  	for _, alloc := range allocsToUpsert {
   300  		alloc.Canonicalize()
   301  	}
   302  
   303  	if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil {
   304  		return err
   305  	}
   306  
   307  	// Upsert followup evals for allocs that were preempted
   308  	for _, eval := range results.PreemptionEvals {
   309  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   310  			return err
   311  		}
   312  	}
   313  
   314  	txn.Commit()
   315  	return nil
   316  }
   317  
    318  // addComputedAllocAttrs adds the computed/derived attributes to the given
    319  // allocations. It is used when allocations are being denormalized.
   320  func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) {
   321  	structs.DenormalizeAllocationJobs(job, allocs)
   322  
   323  	// COMPAT(0.11): Remove in 0.11
   324  	// Calculate the total resources of allocations. It is pulled out in the
   325  	// payload to avoid encoding something that can be computed, but should be
   326  	// denormalized prior to being inserted into MemDB.
   327  	for _, alloc := range allocs {
   328  		if alloc.Resources != nil {
   329  			continue
   330  		}
   331  
   332  		alloc.Resources = new(structs.Resources)
   333  		for _, task := range alloc.TaskResources {
   334  			alloc.Resources.Add(task)
   335  		}
   336  
   337  		// Add the shared resources
   338  		alloc.Resources.Add(alloc.SharedResources)
   339  	}
   340  }
   341  
   342  // upsertDeploymentUpdates updates the deployments given the passed status
   343  // updates.
   344  func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *memdb.Txn) error {
   345  	for _, u := range updates {
   346  		if err := s.updateDeploymentStatusImpl(index, u, txn); err != nil {
   347  			return err
   348  		}
   349  	}
   350  
   351  	return nil
   352  }
   353  
   354  // UpsertJobSummary upserts a job summary into the state store.
   355  func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
   356  	txn := s.db.Txn(true)
   357  	defer txn.Abort()
   358  
   359  	// Check if the job summary already exists
   360  	existing, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID)
   361  	if err != nil {
   362  		return fmt.Errorf("job summary lookup failed: %v", err)
   363  	}
   364  
   365  	// Setup the indexes correctly
   366  	if existing != nil {
   367  		jobSummary.CreateIndex = existing.(*structs.JobSummary).CreateIndex
   368  		jobSummary.ModifyIndex = index
   369  	} else {
   370  		jobSummary.CreateIndex = index
   371  		jobSummary.ModifyIndex = index
   372  	}
   373  
    374  	// Insert the job summary
   375  	if err := txn.Insert("job_summary", jobSummary); err != nil {
   376  		return err
   377  	}
   378  
   379  	// Update the indexes table for job summary
   380  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   381  		return fmt.Errorf("index update failed: %v", err)
   382  	}
   383  
   384  	txn.Commit()
   385  	return nil
   386  }
   387  
   388  // DeleteJobSummary deletes the job summary with the given ID. This is for
   389  // testing purposes only.
   390  func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error {
   391  	txn := s.db.Txn(true)
   392  	defer txn.Abort()
   393  
   394  	// Delete the job summary
   395  	if _, err := txn.DeleteAll("job_summary", "id", namespace, id); err != nil {
   396  		return fmt.Errorf("deleting job summary failed: %v", err)
   397  	}
   398  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   399  		return fmt.Errorf("index update failed: %v", err)
   400  	}
   401  	txn.Commit()
   402  	return nil
   403  }
   404  
    405  // UpsertDeployment is used to insert a new deployment or update an
    406  // existing one.
   407  func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error {
   408  	txn := s.db.Txn(true)
   409  	defer txn.Abort()
   410  	if err := s.upsertDeploymentImpl(index, deployment, txn); err != nil {
   411  		return err
   412  	}
   413  	txn.Commit()
   414  	return nil
   415  }
   416  
   417  func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *memdb.Txn) error {
   418  	// Check if the deployment already exists
   419  	existing, err := txn.First("deployment", "id", deployment.ID)
   420  	if err != nil {
   421  		return fmt.Errorf("deployment lookup failed: %v", err)
   422  	}
   423  
   424  	// Setup the indexes correctly
   425  	if existing != nil {
   426  		deployment.CreateIndex = existing.(*structs.Deployment).CreateIndex
   427  		deployment.ModifyIndex = index
   428  	} else {
   429  		deployment.CreateIndex = index
   430  		deployment.ModifyIndex = index
   431  	}
   432  
   433  	// Insert the deployment
   434  	if err := txn.Insert("deployment", deployment); err != nil {
   435  		return err
   436  	}
   437  
   438  	// Update the indexes table for deployment
   439  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   440  		return fmt.Errorf("index update failed: %v", err)
   441  	}
   442  
   443  	// If the deployment is being marked as complete, set the job to stable.
   444  	if deployment.Status == structs.DeploymentStatusSuccessful {
   445  		if err := s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn); err != nil {
   446  			return fmt.Errorf("failed to update job stability: %v", err)
   447  		}
   448  	}
   449  
   450  	return nil
   451  }
   452  
   453  func (s *StateStore) Deployments(ws memdb.WatchSet) (memdb.ResultIterator, error) {
   454  	txn := s.db.Txn(false)
   455  
   456  	// Walk the entire deployments table
   457  	iter, err := txn.Get("deployment", "id")
   458  	if err != nil {
   459  		return nil, err
   460  	}
   461  
   462  	ws.Add(iter.WatchCh())
   463  	return iter, nil
   464  }
   465  
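        // An iteration sketch for the returned memdb.ResultIterator, the same
        // pattern the lookup helpers below use; results must be treated as
        // immutable:
        //
        //	ws := memdb.NewWatchSet()
        //	iter, err := store.Deployments(ws)
        //	if err != nil {
        //		// handle lookup failure
        //	}
        //	for raw := iter.Next(); raw != nil; raw = iter.Next() {
        //		d := raw.(*structs.Deployment)
        //		// inspect d; never modify it in place
        //	}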
   466  func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
   467  	txn := s.db.Txn(false)
   468  
    469  	// Walk the deployments in the given namespace
   470  	iter, err := txn.Get("deployment", "namespace", namespace)
   471  	if err != nil {
   472  		return nil, err
   473  	}
   474  
   475  	ws.Add(iter.WatchCh())
   476  	return iter, nil
   477  }
   478  
   479  func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string) (memdb.ResultIterator, error) {
   480  	txn := s.db.Txn(false)
   481  
    482  	// Walk the deployments whose IDs match the given prefix
   483  	iter, err := txn.Get("deployment", "id_prefix", deploymentID)
   484  	if err != nil {
   485  		return nil, err
   486  	}
   487  
   488  	ws.Add(iter.WatchCh())
   489  
   490  	// Wrap the iterator in a filter
   491  	wrap := memdb.NewFilterIterator(iter, deploymentNamespaceFilter(namespace))
   492  	return wrap, nil
   493  }
   494  
    495  // deploymentNamespaceFilter returns a filter function that filters out all
    496  // deployments not in the given namespace.
   497  func deploymentNamespaceFilter(namespace string) func(interface{}) bool {
   498  	return func(raw interface{}) bool {
   499  		d, ok := raw.(*structs.Deployment)
   500  		if !ok {
   501  			return true
   502  		}
   503  
   504  		return d.Namespace != namespace
   505  	}
   506  }
   507  
   508  func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) {
   509  	txn := s.db.Txn(false)
   510  	return s.deploymentByIDImpl(ws, deploymentID, txn)
   511  }
   512  
   513  func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *memdb.Txn) (*structs.Deployment, error) {
   514  	watchCh, existing, err := txn.FirstWatch("deployment", "id", deploymentID)
   515  	if err != nil {
   516  		return nil, fmt.Errorf("deployment lookup failed: %v", err)
   517  	}
   518  	ws.Add(watchCh)
   519  
   520  	if existing != nil {
   521  		return existing.(*structs.Deployment), nil
   522  	}
   523  
   524  	return nil, nil
   525  }
   526  
   527  func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) {
   528  	txn := s.db.Txn(false)
   529  
   530  	var job *structs.Job
   531  	// Read job from state store
   532  	_, existing, err := txn.FirstWatch("jobs", "id", namespace, jobID)
   533  	if err != nil {
   534  		return nil, fmt.Errorf("job lookup failed: %v", err)
   535  	}
   536  	if existing != nil {
   537  		job = existing.(*structs.Job)
   538  	}
   539  
   540  	// Get an iterator over the deployments
   541  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   542  	if err != nil {
   543  		return nil, err
   544  	}
   545  
   546  	ws.Add(iter.WatchCh())
   547  
   548  	var out []*structs.Deployment
   549  	for {
   550  		raw := iter.Next()
   551  		if raw == nil {
   552  			break
   553  		}
   554  		d := raw.(*structs.Deployment)
   555  
    556  		// If the deployment belongs to a job with the same ID but a different
    557  		// create index, and we are not fetching all deployments for that job
    558  		// ID, then skip it
   559  		if !all && job != nil && d.JobCreateIndex != job.CreateIndex {
   560  			continue
   561  		}
   562  		out = append(out, d)
   563  	}
   564  
   565  	return out, nil
   566  }
   567  
   568  // LatestDeploymentByJobID returns the latest deployment for the given job. The
   569  // latest is determined strictly by CreateIndex.
   570  func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) {
   571  	txn := s.db.Txn(false)
   572  
   573  	// Get an iterator over the deployments
   574  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   575  	if err != nil {
   576  		return nil, err
   577  	}
   578  
   579  	ws.Add(iter.WatchCh())
   580  
   581  	var out *structs.Deployment
   582  	for {
   583  		raw := iter.Next()
   584  		if raw == nil {
   585  			break
   586  		}
   587  
   588  		d := raw.(*structs.Deployment)
   589  		if out == nil || out.CreateIndex < d.CreateIndex {
   590  			out = d
   591  		}
   592  	}
   593  
   594  	return out, nil
   595  }
   596  
   597  // DeleteDeployment is used to delete a set of deployments by ID
   598  func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error {
   599  	txn := s.db.Txn(true)
   600  	defer txn.Abort()
   601  
   602  	if len(deploymentIDs) == 0 {
   603  		return nil
   604  	}
   605  
   606  	for _, deploymentID := range deploymentIDs {
   607  		// Lookup the deployment
   608  		existing, err := txn.First("deployment", "id", deploymentID)
   609  		if err != nil {
   610  			return fmt.Errorf("deployment lookup failed: %v", err)
   611  		}
   612  		if existing == nil {
   613  			return fmt.Errorf("deployment not found")
   614  		}
   615  
   616  		// Delete the deployment
   617  		if err := txn.Delete("deployment", existing); err != nil {
   618  			return fmt.Errorf("deployment delete failed: %v", err)
   619  		}
   620  	}
   621  
   622  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   623  		return fmt.Errorf("index update failed: %v", err)
   624  	}
   625  
   626  	txn.Commit()
   627  	return nil
   628  }
   629  
    630  // UpsertScalingEvent is used to insert a new scaling event. Only the most
    631  // recent structs.JobTrackedScalingEvents events are kept per task group.
   632  func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error {
   633  	txn := s.db.Txn(true)
   634  	defer txn.Abort()
   635  
   636  	// Get the existing events
   637  	existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID)
   638  	if err != nil {
   639  		return fmt.Errorf("scaling event lookup failed: %v", err)
   640  	}
   641  
   642  	var jobEvents *structs.JobScalingEvents
   643  	if existing != nil {
   644  		jobEvents = existing.(*structs.JobScalingEvents)
   645  	} else {
   646  		jobEvents = &structs.JobScalingEvents{
   647  			Namespace:     req.Namespace,
   648  			JobID:         req.JobID,
   649  			ScalingEvents: make(map[string][]*structs.ScalingEvent),
   650  		}
   651  	}
   652  
   653  	jobEvents.ModifyIndex = index
   654  	req.ScalingEvent.CreateIndex = index
   655  
   656  	events := jobEvents.ScalingEvents[req.TaskGroup]
   657  	// Prepend this latest event
   658  	events = append(
   659  		[]*structs.ScalingEvent{req.ScalingEvent},
   660  		events...,
   661  	)
   662  	// Truncate older events
   663  	if len(events) > structs.JobTrackedScalingEvents {
   664  		events = events[0:structs.JobTrackedScalingEvents]
   665  	}
   666  	jobEvents.ScalingEvents[req.TaskGroup] = events
   667  
   668  	// Insert the new event
   669  	if err := txn.Insert("scaling_event", jobEvents); err != nil {
   670  		return fmt.Errorf("scaling event insert failed: %v", err)
   671  	}
   672  
   673  	// Update the indexes table for scaling_event
   674  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
   675  		return fmt.Errorf("index update failed: %v", err)
   676  	}
   677  
   678  	txn.Commit()
   679  	return nil
   680  }
   681  
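        // A usage sketch, assuming a populated *structs.ScalingEvent named ev;
        // the index and the namespace/job/group names are illustrative:
        //
        //	err := store.UpsertScalingEvent(1000, &structs.ScalingEventRequest{
        //		Namespace:    "default",
        //		JobID:        "example",
        //		TaskGroup:    "web",
        //		ScalingEvent: ev,
        //	})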
   682  // ScalingEvents returns an iterator over all the job scaling events
   683  func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) {
   684  	txn := s.db.Txn(false)
   685  
   686  	// Walk the entire scaling_event table
   687  	iter, err := txn.Get("scaling_event", "id")
   688  	if err != nil {
   689  		return nil, err
   690  	}
   691  
   692  	ws.Add(iter.WatchCh())
   693  
   694  	return iter, nil
   695  }
   696  
   697  func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) {
   698  	txn := s.db.Txn(false)
   699  
   700  	watchCh, existing, err := txn.FirstWatch("scaling_event", "id", namespace, jobID)
   701  	if err != nil {
   702  		return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err)
   703  	}
   704  	ws.Add(watchCh)
   705  
   706  	if existing != nil {
   707  		events := existing.(*structs.JobScalingEvents)
   708  		return events.ScalingEvents, events.ModifyIndex, nil
   709  	}
   710  	return nil, 0, nil
   711  }
   712  
    713  // UpsertNode is used to register a node or update a node definition.
    714  // This is assumed to be triggered by the client, so we retain the value
    715  // of drain/eligibility which is set by the scheduler.
   716  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
   717  	txn := s.db.Txn(true)
   718  	defer txn.Abort()
   719  
   720  	// Check if the node already exists
   721  	existing, err := txn.First("nodes", "id", node.ID)
   722  	if err != nil {
   723  		return fmt.Errorf("node lookup failed: %v", err)
   724  	}
   725  
   726  	// Setup the indexes correctly
   727  	if existing != nil {
   728  		exist := existing.(*structs.Node)
   729  		node.CreateIndex = exist.CreateIndex
   730  		node.ModifyIndex = index
   731  
   732  		// Retain node events that have already been set on the node
   733  		node.Events = exist.Events
   734  
   735  		// If we are transitioning from down, record the re-registration
   736  		if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown {
   737  			appendNodeEvents(index, node, []*structs.NodeEvent{
   738  				structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   739  					SetMessage(NodeRegisterEventReregistered).
   740  					SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))})
   741  		}
   742  
   743  		node.Drain = exist.Drain                                 // Retain the drain mode
   744  		node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
   745  		node.DrainStrategy = exist.DrainStrategy                 // Retain the drain strategy
   746  	} else {
   747  		// Because this is the first time the node is being registered, we should
   748  		// also create a node registration event
   749  		nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   750  			SetMessage(NodeRegisterEventRegistered).
   751  			SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))
   752  		node.Events = []*structs.NodeEvent{nodeEvent}
   753  		node.CreateIndex = index
   754  		node.ModifyIndex = index
   755  	}
   756  
   757  	// Insert the node
   758  	if err := txn.Insert("nodes", node); err != nil {
   759  		return fmt.Errorf("node insert failed: %v", err)
   760  	}
   761  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   762  		return fmt.Errorf("index update failed: %v", err)
   763  	}
   764  	if err := upsertNodeCSIPlugins(txn, node, index); err != nil {
   765  		return fmt.Errorf("csi plugin update failed: %v", err)
   766  	}
   767  
   768  	txn.Commit()
   769  	return nil
   770  }
   771  
   772  // DeleteNode deregisters a batch of nodes
   773  func (s *StateStore) DeleteNode(index uint64, nodes []string) error {
   774  	if len(nodes) == 0 {
   775  		return fmt.Errorf("node ids missing")
   776  	}
   777  
   778  	txn := s.db.Txn(true)
   779  	defer txn.Abort()
   780  
   781  	for _, nodeID := range nodes {
   782  		existing, err := txn.First("nodes", "id", nodeID)
   783  		if err != nil {
   784  			return fmt.Errorf("node lookup failed: %s: %v", nodeID, err)
   785  		}
   786  		if existing == nil {
   787  			return fmt.Errorf("node not found: %s", nodeID)
   788  		}
   789  
   790  		// Delete the node
   791  		if err := txn.Delete("nodes", existing); err != nil {
   792  			return fmt.Errorf("node delete failed: %s: %v", nodeID, err)
   793  		}
   794  
   795  		node := existing.(*structs.Node)
   796  		if err := deleteNodeCSIPlugins(txn, node, index); err != nil {
   797  			return fmt.Errorf("csi plugin delete failed: %v", err)
   798  		}
   799  	}
   800  
   801  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   802  		return fmt.Errorf("index update failed: %v", err)
   803  	}
   804  
   805  	txn.Commit()
   806  	return nil
   807  }
   808  
   809  // UpdateNodeStatus is used to update the status of a node
   810  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
   811  	txn := s.db.Txn(true)
   812  	defer txn.Abort()
   813  
   814  	// Lookup the node
   815  	existing, err := txn.First("nodes", "id", nodeID)
   816  	if err != nil {
   817  		return fmt.Errorf("node lookup failed: %v", err)
   818  	}
   819  	if existing == nil {
   820  		return fmt.Errorf("node not found")
   821  	}
   822  
   823  	// Copy the existing node
   824  	existingNode := existing.(*structs.Node)
   825  	copyNode := existingNode.Copy()
   826  	copyNode.StatusUpdatedAt = updatedAt
   827  
   828  	// Add the event if given
   829  	if event != nil {
   830  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   831  	}
   832  
   833  	// Update the status in the copy
   834  	copyNode.Status = status
   835  	copyNode.ModifyIndex = index
   836  
   837  	// Insert the node
   838  	if err := txn.Insert("nodes", copyNode); err != nil {
   839  		return fmt.Errorf("node update failed: %v", err)
   840  	}
   841  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   842  		return fmt.Errorf("index update failed: %v", err)
   843  	}
   844  
   845  	txn.Commit()
   846  	return nil
   847  }
   848  
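        // A usage sketch marking a node down with an explanatory event, built with
        // the same NodeEvent helpers used in UpsertNode above; the message text is
        // illustrative:
        //
        //	event := structs.NewNodeEvent().
        //		SetSubsystem(structs.NodeEventSubsystemCluster).
        //		SetMessage("Node heartbeat missed")
        //	err := store.UpdateNodeStatus(index, nodeID, structs.NodeStatusDown,
        //		time.Now().Unix(), event)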
    849  // BatchUpdateNodeDrain is used to update the drain of a set of nodes
   850  func (s *StateStore) BatchUpdateNodeDrain(index uint64, updatedAt int64, updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error {
   851  	txn := s.db.Txn(true)
   852  	defer txn.Abort()
   853  	for node, update := range updates {
   854  		if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible, updatedAt, events[node]); err != nil {
   855  			return err
   856  		}
   857  	}
   858  	txn.Commit()
   859  	return nil
   860  }
   861  
   862  // UpdateNodeDrain is used to update the drain of a node
   863  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string,
   864  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent) error {
   865  
   866  	txn := s.db.Txn(true)
   867  	defer txn.Abort()
   868  	if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible, updatedAt, event); err != nil {
   869  		return err
   870  	}
   871  	txn.Commit()
   872  	return nil
   873  }
   874  
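        // A drain sketch, assuming the DrainStrategy/DrainSpec shapes from
        // nomad/structs; the one-hour deadline is illustrative. Passing a nil
        // strategy with markEligible=true clears the drain instead:
        //
        //	strategy := &structs.DrainStrategy{
        //		DrainSpec: structs.DrainSpec{Deadline: time.Hour},
        //	}
        //	err := store.UpdateNodeDrain(index, nodeID, strategy, false,
        //		time.Now().Unix(), nil)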
   875  func (s *StateStore) updateNodeDrainImpl(txn *memdb.Txn, index uint64, nodeID string,
   876  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent) error {
   877  
   878  	// Lookup the node
   879  	existing, err := txn.First("nodes", "id", nodeID)
   880  	if err != nil {
   881  		return fmt.Errorf("node lookup failed: %v", err)
   882  	}
   883  	if existing == nil {
   884  		return fmt.Errorf("node not found")
   885  	}
   886  
   887  	// Copy the existing node
   888  	existingNode := existing.(*structs.Node)
   889  	copyNode := existingNode.Copy()
   890  	copyNode.StatusUpdatedAt = updatedAt
   891  
   892  	// Add the event if given
   893  	if event != nil {
   894  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   895  	}
   896  
   897  	// Update the drain in the copy
   898  	copyNode.Drain = drain != nil // COMPAT: Remove in Nomad 0.10
   899  	copyNode.DrainStrategy = drain
   900  	if drain != nil {
   901  		copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible
   902  	} else if markEligible {
   903  		copyNode.SchedulingEligibility = structs.NodeSchedulingEligible
   904  	}
   905  
   906  	copyNode.ModifyIndex = index
   907  
   908  	// Insert the node
   909  	if err := txn.Insert("nodes", copyNode); err != nil {
   910  		return fmt.Errorf("node update failed: %v", err)
   911  	}
   912  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   913  		return fmt.Errorf("index update failed: %v", err)
   914  	}
   915  
   916  	return nil
   917  }
   918  
   919  // UpdateNodeEligibility is used to update the scheduling eligibility of a node
   920  func (s *StateStore) UpdateNodeEligibility(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error {
   921  
   922  	txn := s.db.Txn(true)
   923  	defer txn.Abort()
   924  
   925  	// Lookup the node
   926  	existing, err := txn.First("nodes", "id", nodeID)
   927  	if err != nil {
   928  		return fmt.Errorf("node lookup failed: %v", err)
   929  	}
   930  	if existing == nil {
   931  		return fmt.Errorf("node not found")
   932  	}
   933  
   934  	// Copy the existing node
   935  	existingNode := existing.(*structs.Node)
   936  	copyNode := existingNode.Copy()
   937  	copyNode.StatusUpdatedAt = updatedAt
   938  
   939  	// Add the event if given
   940  	if event != nil {
   941  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   942  	}
   943  
   944  	// Check if this is a valid action
   945  	if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible {
   946  		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
   947  	}
   948  
   949  	// Update the eligibility in the copy
   950  	copyNode.SchedulingEligibility = eligibility
   951  	copyNode.ModifyIndex = index
   952  
   953  	// Insert the node
   954  	if err := txn.Insert("nodes", copyNode); err != nil {
   955  		return fmt.Errorf("node update failed: %v", err)
   956  	}
   957  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   958  		return fmt.Errorf("index update failed: %v", err)
   959  	}
   960  
   961  	txn.Commit()
   962  	return nil
   963  }
   964  
   965  // UpsertNodeEvents adds the node events to the nodes, rotating events as
   966  // necessary.
   967  func (s *StateStore) UpsertNodeEvents(index uint64, nodeEvents map[string][]*structs.NodeEvent) error {
   968  	txn := s.db.Txn(true)
   969  	defer txn.Abort()
   970  
   971  	for nodeID, events := range nodeEvents {
   972  		if err := s.upsertNodeEvents(index, nodeID, events, txn); err != nil {
   973  			return err
   974  		}
   975  	}
   976  
   977  	txn.Commit()
   978  	return nil
   979  }
   980  
    981  // upsertNodeEvents upserts node events for the given node. It also ensures
    982  // that only a fixed number of node events are ever stored simultaneously,
    983  // deleting older events once this bound has been reached.
   984  func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *memdb.Txn) error {
   985  	// Lookup the node
   986  	existing, err := txn.First("nodes", "id", nodeID)
   987  	if err != nil {
   988  		return fmt.Errorf("node lookup failed: %v", err)
   989  	}
   990  	if existing == nil {
   991  		return fmt.Errorf("node not found")
   992  	}
   993  
   994  	// Copy the existing node
   995  	existingNode := existing.(*structs.Node)
   996  	copyNode := existingNode.Copy()
   997  	appendNodeEvents(index, copyNode, events)
   998  
   999  	// Insert the node
  1000  	if err := txn.Insert("nodes", copyNode); err != nil {
  1001  		return fmt.Errorf("node update failed: %v", err)
  1002  	}
  1003  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1004  		return fmt.Errorf("index update failed: %v", err)
  1005  	}
  1006  
  1007  	return nil
  1008  }
  1009  
  1010  // appendNodeEvents is a helper that takes a node and new events and appends
  1011  // them, pruning older events as needed.
  1012  func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) {
  1013  	// Add the events, updating the indexes
  1014  	for _, e := range events {
  1015  		e.CreateIndex = index
  1016  		node.Events = append(node.Events, e)
  1017  	}
  1018  
  1019  	// Keep node events pruned to not exceed the max allowed
  1020  	if l := len(node.Events); l > structs.MaxRetainedNodeEvents {
  1021  		delta := l - structs.MaxRetainedNodeEvents
  1022  		node.Events = node.Events[delta:]
  1023  	}
  1024  }
  1025  
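        // Worked sketch of the pruning below: if the retention bound
        // structs.MaxRetainedNodeEvents were 10 and a node held 12 events after
        // the append, delta would be 2 and the two oldest events
        // (node.Events[0] and node.Events[1]) would be dropped.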
   1026  // upsertNodeCSIPlugins indexes CSI plugins for volume retrieval, with health.
   1027  // It's called in UpsertNode, so that event-driven health changes are updated.
  1028  func upsertNodeCSIPlugins(txn *memdb.Txn, node *structs.Node, index uint64) error {
  1029  
  1030  	loop := func(info *structs.CSIInfo) error {
  1031  		raw, err := txn.First("csi_plugins", "id", info.PluginID)
  1032  		if err != nil {
  1033  			return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err)
  1034  		}
  1035  
  1036  		var plug *structs.CSIPlugin
  1037  		if raw != nil {
  1038  			plug = raw.(*structs.CSIPlugin).Copy()
  1039  		} else {
  1040  			plug = structs.NewCSIPlugin(info.PluginID, index)
  1041  			plug.Provider = info.Provider
  1042  			plug.Version = info.ProviderVersion
  1043  		}
  1044  
  1045  		err = plug.AddPlugin(node.ID, info)
  1046  		if err != nil {
  1047  			return err
  1048  		}
  1049  
  1050  		plug.ModifyIndex = index
  1051  
  1052  		err = txn.Insert("csi_plugins", plug)
  1053  		if err != nil {
  1054  			return fmt.Errorf("csi_plugins insert error: %v", err)
  1055  		}
  1056  
  1057  		return nil
  1058  	}
  1059  
  1060  	inUse := map[string]struct{}{}
  1061  	for _, info := range node.CSIControllerPlugins {
  1062  		err := loop(info)
  1063  		if err != nil {
  1064  			return err
  1065  		}
  1066  		inUse[info.PluginID] = struct{}{}
  1067  	}
  1068  
  1069  	for _, info := range node.CSINodePlugins {
  1070  		err := loop(info)
  1071  		if err != nil {
  1072  			return err
  1073  		}
  1074  		inUse[info.PluginID] = struct{}{}
  1075  	}
  1076  
  1077  	// remove the client node from any plugin that's not
  1078  	// running on it.
  1079  	iter, err := txn.Get("csi_plugins", "id")
  1080  	if err != nil {
  1081  		return fmt.Errorf("csi_plugins lookup failed: %v", err)
  1082  	}
  1083  	for {
  1084  		raw := iter.Next()
  1085  		if raw == nil {
  1086  			break
  1087  		}
  1088  		plug := raw.(*structs.CSIPlugin)
  1089  		_, ok := inUse[plug.ID]
  1090  		if !ok {
  1091  			_, asController := plug.Controllers[node.ID]
  1092  			_, asNode := plug.Nodes[node.ID]
  1093  			if asController || asNode {
  1094  				err = deleteNodeFromPlugin(txn, plug.Copy(), node, index)
  1095  				if err != nil {
  1096  					return err
  1097  				}
  1098  			}
  1099  		}
  1100  	}
  1101  
  1102  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1103  		return fmt.Errorf("index update failed: %v", err)
  1104  	}
  1105  
  1106  	return nil
  1107  }
  1108  
  1109  // deleteNodeCSIPlugins cleans up CSIInfo node health status, called in DeleteNode
  1110  func deleteNodeCSIPlugins(txn *memdb.Txn, node *structs.Node, index uint64) error {
  1111  	if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 {
  1112  		return nil
  1113  	}
  1114  
  1115  	names := map[string]struct{}{}
  1116  	for _, info := range node.CSIControllerPlugins {
  1117  		names[info.PluginID] = struct{}{}
  1118  	}
  1119  	for _, info := range node.CSINodePlugins {
  1120  		names[info.PluginID] = struct{}{}
  1121  	}
  1122  
  1123  	for id := range names {
  1124  		raw, err := txn.First("csi_plugins", "id", id)
  1125  		if err != nil {
  1126  			return fmt.Errorf("csi_plugins lookup error %s: %v", id, err)
  1127  		}
  1128  		if raw == nil {
  1129  			return fmt.Errorf("csi_plugins missing plugin %s", id)
  1130  		}
  1131  
  1132  		plug := raw.(*structs.CSIPlugin).Copy()
  1133  		err = deleteNodeFromPlugin(txn, plug, node, index)
  1134  		if err != nil {
  1135  			return err
  1136  		}
  1137  	}
  1138  
  1139  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1140  		return fmt.Errorf("index update failed: %v", err)
  1141  	}
  1142  
  1143  	return nil
  1144  }
  1145  
  1146  func deleteNodeFromPlugin(txn *memdb.Txn, plug *structs.CSIPlugin, node *structs.Node, index uint64) error {
  1147  	err := plug.DeleteNode(node.ID)
  1148  	if err != nil {
  1149  		return err
  1150  	}
  1151  	return updateOrGCPlugin(index, txn, plug)
  1152  }
  1153  
  1154  // updateOrGCPlugin updates a plugin but will delete it if the plugin is empty
  1155  func updateOrGCPlugin(index uint64, txn *memdb.Txn, plug *structs.CSIPlugin) error {
  1156  	plug.ModifyIndex = index
  1157  
  1158  	if plug.IsEmpty() {
  1159  		err := txn.Delete("csi_plugins", plug)
  1160  		if err != nil {
  1161  			return fmt.Errorf("csi_plugins delete error: %v", err)
  1162  		}
  1163  	} else {
  1164  		err := txn.Insert("csi_plugins", plug)
  1165  		if err != nil {
  1166  			return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
  1167  		}
  1168  	}
  1169  	return nil
  1170  }
  1171  
  1172  // deleteJobFromPlugin removes the allocations of this job from any plugins the job is
  1173  // running, possibly deleting the plugin if it's no longer in use. It's called in DeleteJobTxn
  1174  func (s *StateStore) deleteJobFromPlugin(index uint64, txn *memdb.Txn, job *structs.Job) error {
  1175  	ws := memdb.NewWatchSet()
  1176  	allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false)
  1177  	if err != nil {
  1178  		return fmt.Errorf("error getting allocations: %v", err)
  1179  	}
  1180  
  1181  	type pair struct {
  1182  		pluginID string
  1183  		alloc    *structs.Allocation
  1184  	}
  1185  
  1186  	plugAllocs := []*pair{}
  1187  	plugins := map[string]*structs.CSIPlugin{}
  1188  
  1189  	for _, a := range allocs {
   1190  		// if the task group is nil, panicking on the dereference below is acceptable
  1191  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  1192  		for _, t := range tg.Tasks {
  1193  			if t.CSIPluginConfig != nil {
  1194  				plugAllocs = append(plugAllocs, &pair{
  1195  					pluginID: t.CSIPluginConfig.ID,
  1196  					alloc:    a,
  1197  				})
  1198  			}
  1199  		}
  1200  	}
  1201  
  1202  	for _, x := range plugAllocs {
  1203  		plug, ok := plugins[x.pluginID]
  1204  
  1205  		if !ok {
  1206  			plug, err = s.CSIPluginByID(ws, x.pluginID)
  1207  			if err != nil {
  1208  				return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err)
  1209  			}
  1210  			if plug == nil {
   1211  				return fmt.Errorf("plugin missing: %s", x.pluginID)
  1212  			}
  1213  			// only copy once, so we update the same plugin on each alloc
  1214  			plugins[x.pluginID] = plug.Copy()
  1215  			plug = plugins[x.pluginID]
  1216  		}
  1217  
  1218  		err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID)
  1219  		if err != nil {
  1220  			return err
  1221  		}
  1222  	}
  1223  
  1224  	for _, plug := range plugins {
  1225  		err = updateOrGCPlugin(index, txn, plug)
  1226  		if err != nil {
  1227  			return err
  1228  		}
  1229  	}
  1230  
  1231  	if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1232  		return fmt.Errorf("index update failed: %v", err)
  1233  	}
  1234  
  1235  	return nil
  1236  }
  1237  
  1238  // NodeByID is used to lookup a node by ID
  1239  func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
  1240  	txn := s.db.Txn(false)
  1241  
  1242  	watchCh, existing, err := txn.FirstWatch("nodes", "id", nodeID)
  1243  	if err != nil {
  1244  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1245  	}
  1246  	ws.Add(watchCh)
  1247  
  1248  	if existing != nil {
  1249  		return existing.(*structs.Node), nil
  1250  	}
  1251  	return nil, nil
  1252  }
  1253  
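        // A blocking-read sketch combining NodeByID with a watch set, using the
        // same WatchCtx call that BlockingQuery relies on; ctx and nodeID are
        // assumed to be supplied by the caller:
        //
        //	ws := memdb.NewWatchSet()
        //	node, err := store.NodeByID(ws, nodeID)
        //	if err != nil {
        //		// handle lookup failure
        //	}
        //	// block until this node (or the table) changes, or ctx ends
        //	if err := ws.WatchCtx(ctx); err != nil {
        //		// ctx was cancelled or timed out
        //	}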
  1254  // NodesByIDPrefix is used to lookup nodes by prefix
  1255  func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  1256  	txn := s.db.Txn(false)
  1257  
  1258  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
  1259  	if err != nil {
  1260  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1261  	}
  1262  	ws.Add(iter.WatchCh())
  1263  
  1264  	return iter, nil
  1265  }
  1266  
  1267  // NodeBySecretID is used to lookup a node by SecretID
  1268  func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) {
  1269  	txn := s.db.Txn(false)
  1270  
  1271  	watchCh, existing, err := txn.FirstWatch("nodes", "secret_id", secretID)
  1272  	if err != nil {
  1273  		return nil, fmt.Errorf("node lookup by SecretID failed: %v", err)
  1274  	}
  1275  	ws.Add(watchCh)
  1276  
  1277  	if existing != nil {
  1278  		return existing.(*structs.Node), nil
  1279  	}
  1280  	return nil, nil
  1281  }
  1282  
  1283  // Nodes returns an iterator over all the nodes
  1284  func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1285  	txn := s.db.Txn(false)
  1286  
  1287  	// Walk the entire nodes table
  1288  	iter, err := txn.Get("nodes", "id")
  1289  	if err != nil {
  1290  		return nil, err
  1291  	}
  1292  	ws.Add(iter.WatchCh())
  1293  	return iter, nil
  1294  }
  1295  
  1296  // UpsertJob is used to register a job or update a job definition
  1297  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
  1298  	txn := s.db.Txn(true)
  1299  	defer txn.Abort()
  1300  	if err := s.upsertJobImpl(index, job, false, txn); err != nil {
  1301  		return err
  1302  	}
  1303  	txn.Commit()
  1304  	return nil
  1305  }
  1306  
   1307  // UpsertJobTxn is used to register a job or update a job definition, like UpsertJob,
   1308  // but in a transaction. Useful when making multiple modifications atomically.
  1309  func (s *StateStore) UpsertJobTxn(index uint64, job *structs.Job, txn Txn) error {
  1310  	return s.upsertJobImpl(index, job, false, txn)
  1311  }
  1312  
  1313  // upsertJobImpl is the implementation for registering a job or updating a job definition
  1314  func (s *StateStore) upsertJobImpl(index uint64, job *structs.Job, keepVersion bool, txn *memdb.Txn) error {
  1315  	// Assert the namespace exists
  1316  	if exists, err := s.namespaceExists(txn, job.Namespace); err != nil {
  1317  		return err
  1318  	} else if !exists {
  1319  		return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace)
  1320  	}
  1321  
  1322  	// Check if the job already exists
  1323  	existing, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1324  	if err != nil {
  1325  		return fmt.Errorf("job lookup failed: %v", err)
  1326  	}
  1327  
  1328  	// Setup the indexes correctly
  1329  	if existing != nil {
  1330  		job.CreateIndex = existing.(*structs.Job).CreateIndex
  1331  		job.ModifyIndex = index
  1332  
  1333  		// Bump the version unless asked to keep it. This should only be done
  1334  		// when changing an internal field such as Stable. A spec change should
  1335  		// always come with a version bump
  1336  		if !keepVersion {
  1337  			job.JobModifyIndex = index
  1338  			job.Version = existing.(*structs.Job).Version + 1
  1339  		}
  1340  
  1341  		// Compute the job status
  1342  		var err error
  1343  		job.Status, err = s.getJobStatus(txn, job, false)
  1344  		if err != nil {
  1345  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1346  		}
  1347  	} else {
  1348  		job.CreateIndex = index
  1349  		job.ModifyIndex = index
  1350  		job.JobModifyIndex = index
  1351  		job.Version = 0
  1352  
  1353  		if err := s.setJobStatus(index, txn, job, false, ""); err != nil {
  1354  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
  1355  		}
  1356  
  1357  		// Have to get the job again since it could have been updated
  1358  		updated, err := txn.First("jobs", "id", job.Namespace, job.ID)
  1359  		if err != nil {
  1360  			return fmt.Errorf("job lookup failed: %v", err)
  1361  		}
  1362  		if updated != nil {
  1363  			job = updated.(*structs.Job)
  1364  		}
  1365  	}
  1366  
  1367  	if err := s.updateSummaryWithJob(index, job, txn); err != nil {
  1368  		return fmt.Errorf("unable to create job summary: %v", err)
  1369  	}
  1370  
  1371  	if err := s.upsertJobVersion(index, job, txn); err != nil {
  1372  		return fmt.Errorf("unable to upsert job into job_version table: %v", err)
  1373  	}
  1374  
  1375  	if err := s.updateJobScalingPolicies(index, job, txn); err != nil {
  1376  		return fmt.Errorf("unable to update job scaling policies: %v", err)
  1377  	}
  1378  
  1379  	// Insert the job
  1380  	if err := txn.Insert("jobs", job); err != nil {
  1381  		return fmt.Errorf("job insert failed: %v", err)
  1382  	}
  1383  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1384  		return fmt.Errorf("index update failed: %v", err)
  1385  	}
  1386  
  1387  	return nil
  1388  }
  1389  
  1390  // DeleteJob is used to deregister a job
  1391  func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error {
  1392  	txn := s.db.Txn(true)
  1393  	defer txn.Abort()
  1394  
  1395  	err := s.DeleteJobTxn(index, namespace, jobID, txn)
  1396  	if err == nil {
  1397  		txn.Commit()
  1398  	}
  1399  	return err
  1400  }
  1401  
   1402  // DeleteJobTxn is used to deregister a job, like DeleteJob,
   1403  // but in a transaction. Useful when making multiple modifications atomically.
  1404  func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error {
   1405  	// Lookup the job
  1406  	existing, err := txn.First("jobs", "id", namespace, jobID)
  1407  	if err != nil {
  1408  		return fmt.Errorf("job lookup failed: %v", err)
  1409  	}
  1410  	if existing == nil {
  1411  		return fmt.Errorf("job not found")
  1412  	}
  1413  
  1414  	// Check if we should update a parent job summary
  1415  	job := existing.(*structs.Job)
  1416  	if job.ParentID != "" {
  1417  		summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID)
  1418  		if err != nil {
  1419  			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  1420  		}
  1421  
   1422  		// Only continue if the summary exists. It may not exist if the parent
   1423  		// job was removed.
  1424  		if summaryRaw != nil {
  1425  			existing := summaryRaw.(*structs.JobSummary)
  1426  			pSummary := existing.Copy()
  1427  			if pSummary.Children != nil {
  1428  
  1429  				modified := false
  1430  				switch job.Status {
  1431  				case structs.JobStatusPending:
  1432  					pSummary.Children.Pending--
  1433  					pSummary.Children.Dead++
  1434  					modified = true
  1435  				case structs.JobStatusRunning:
  1436  					pSummary.Children.Running--
  1437  					pSummary.Children.Dead++
  1438  					modified = true
  1439  				case structs.JobStatusDead:
  1440  				default:
  1441  					return fmt.Errorf("unknown old job status %q", job.Status)
  1442  				}
  1443  
  1444  				if modified {
  1445  					// Update the modify index
  1446  					pSummary.ModifyIndex = index
  1447  
  1448  					// Insert the summary
  1449  					if err := txn.Insert("job_summary", pSummary); err != nil {
  1450  						return fmt.Errorf("job summary insert failed: %v", err)
  1451  					}
  1452  					if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1453  						return fmt.Errorf("index update failed: %v", err)
  1454  					}
  1455  				}
  1456  			}
  1457  		}
  1458  	}
  1459  
  1460  	// Delete the job
  1461  	if err := txn.Delete("jobs", existing); err != nil {
  1462  		return fmt.Errorf("job delete failed: %v", err)
  1463  	}
  1464  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1465  		return fmt.Errorf("index update failed: %v", err)
  1466  	}
  1467  
  1468  	// Delete the job versions
  1469  	if err := s.deleteJobVersions(index, job, txn); err != nil {
  1470  		return err
  1471  	}
  1472  
  1473  	// Delete the job summary
  1474  	if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil {
  1475  		return fmt.Errorf("deleting job summary failed: %v", err)
  1476  	}
  1477  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1478  		return fmt.Errorf("index update failed: %v", err)
  1479  	}
  1480  
  1481  	// Delete any job scaling policies
  1482  	numDeletedScalingPolicies, err := txn.DeleteAll("scaling_policy", "target_prefix", namespace, jobID)
  1483  	if err != nil {
  1484  		return fmt.Errorf("deleting job scaling policies failed: %v", err)
  1485  	}
  1486  	if numDeletedScalingPolicies > 0 {
  1487  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  1488  			return fmt.Errorf("index update failed: %v", err)
  1489  		}
  1490  	}
  1491  
  1492  	// Delete the scaling events
  1493  	if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil {
  1494  		return fmt.Errorf("deleting job scaling events failed: %v", err)
  1495  	}
  1496  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
  1497  		return fmt.Errorf("index update failed: %v", err)
  1498  	}
  1499  
  1500  	// Cleanup plugins registered by this job
  1501  	err = s.deleteJobFromPlugin(index, txn, job)
  1502  	if err != nil {
  1503  		return fmt.Errorf("deleting job from plugin: %v", err)
  1504  	}
  1505  
  1506  	return nil
  1507  }
  1508  
  1509  // deleteJobVersions deletes all versions of the given job.
  1510  func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *memdb.Txn) error {
  1511  	iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID)
  1512  	if err != nil {
  1513  		return err
  1514  	}
  1515  
  1516  	// Put them into a slice so there are no safety concerns while actually
  1517  	// performing the deletes
  1518  	jobs := []*structs.Job{}
  1519  	for {
  1520  		raw := iter.Next()
  1521  		if raw == nil {
  1522  			break
  1523  		}
  1524  
  1525  		// Ensure the ID is an exact match
  1526  		j := raw.(*structs.Job)
  1527  		if j.ID != job.ID {
  1528  			continue
  1529  		}
  1530  
  1531  		jobs = append(jobs, j)
  1532  	}
  1533  
  1534  	// Do the deletes
  1535  	for _, j := range jobs {
  1536  		if err := txn.Delete("job_version", j); err != nil {
  1537  			return fmt.Errorf("deleting job versions failed: %v", err)
  1538  		}
  1539  	}
  1540  
  1541  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1542  		return fmt.Errorf("index update failed: %v", err)
  1543  	}
  1544  
  1545  	return nil
  1546  }
  1547  
  1548  // upsertJobVersion inserts a job into its historic version table and limits the
  1549  // number of job versions that are tracked.
  1550  func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *memdb.Txn) error {
  1551  	// Insert the job
  1552  	if err := txn.Insert("job_version", job); err != nil {
  1553  		return fmt.Errorf("failed to insert job into job_version table: %v", err)
  1554  	}
  1555  
  1556  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1557  		return fmt.Errorf("index update failed: %v", err)
  1558  	}
  1559  
  1560  	// Get all the historic jobs for this ID
  1561  	all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID)
  1562  	if err != nil {
  1563  		return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err)
  1564  	}
  1565  
  1566  	// If we are below the limit there is no GCing to be done
  1567  	if len(all) <= structs.JobTrackedVersions {
  1568  		return nil
  1569  	}
  1570  
  1571  	// We have to delete a historic job to make room.
  1572  	// Find index of the highest versioned stable job
  1573  	stableIdx := -1
  1574  	for i, j := range all {
  1575  		if j.Stable {
  1576  			stableIdx = i
  1577  			break
  1578  		}
  1579  	}
  1580  
  1581  	// If the stable job is the oldest version, do a swap to bring it into the
  1582  	// keep set.
  1583  	max := structs.JobTrackedVersions
  1584  	if stableIdx == max {
  1585  		all[max-1], all[max] = all[max], all[max-1]
  1586  	}
  1587  
  1588  	// Delete the job outside of the set that are being kept.
  1589  	d := all[max]
  1590  	// Delete the job outside of the set that is being kept.
  1591  		return fmt.Errorf("failed to delete job %v (%d) from job_version: %v", d.ID, d.Version, err)
  1592  	}
  1593  
  1594  	return nil
  1595  }
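
// Editor's sketch (not part of the original file): the retention rule above,
// distilled. Given versions sorted highest-first and a tracking limit (the
// role structs.JobTrackedVersions plays), the stable version sitting just
// outside the keep set is swapped in before the overflow entry is dropped.
func keepJobVersionsSketch(all []*structs.Job, limit int) []*structs.Job {
	if len(all) <= limit {
		return all // under the limit, nothing to trim
	}
	// Find the index of the highest versioned stable job
	stableIdx := -1
	for i, j := range all {
		if j.Stable {
			stableIdx = i
			break
		}
	}
	// If the stable job is the first version outside the keep set, swap it in
	if stableIdx == limit {
		all[limit-1], all[limit] = all[limit], all[limit-1]
	}
	return all[:limit]
}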
  1596  
  1597  // JobByID is used to lookup a job by its ID. JobByID returns the current/latest job
  1598  // version.
  1599  func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) {
  1600  	txn := s.db.Txn(false)
  1601  	return s.JobByIDTxn(ws, namespace, id, txn)
  1602  }
  1603  
  1604  // JobByIDTxn is used to lookup a job by its ID, like JobByID, but it returns the job version
  1605  // accessible through the given transaction
  1606  func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) {
  1607  	watchCh, existing, err := txn.FirstWatch("jobs", "id", namespace, id)
  1608  	if err != nil {
  1609  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1610  	}
  1611  	ws.Add(watchCh)
  1612  
  1613  	if existing != nil {
  1614  		return existing.(*structs.Job), nil
  1615  	}
  1616  	return nil, nil
  1617  }
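
// Editor's sketch (not part of the original file): typical read-path usage of
// JobByID, assuming s is an initialized *StateStore. The watch channel added
// to ws fires when the job row changes; blocking queries re-run the lookup
// when the watch set unblocks.
func jobByIDUsageSketch(s *StateStore, namespace, id string) (*structs.Job, error) {
	ws := memdb.NewWatchSet()
	job, err := s.JobByID(ws, namespace, id)
	if err != nil {
		return nil, err
	}
	// A blocking caller would now wait, e.g. ws.Watch(timeoutCh), and retry.
	return job, nil
}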
  1618  
  1619  // JobsByIDPrefix is used to lookup a job by prefix
  1620  func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  1621  	txn := s.db.Txn(false)
  1622  
  1623  	iter, err := txn.Get("jobs", "id_prefix", namespace, id)
  1624  	if err != nil {
  1625  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1626  	}
  1627  
  1628  	ws.Add(iter.WatchCh())
  1629  
  1630  	return iter, nil
  1631  }
  1632  
  1633  // JobVersionsByID returns all the tracked versions of a job.
  1634  func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  1635  	txn := s.db.Txn(false)
  1636  
  1637  	return s.jobVersionByID(txn, &ws, namespace, id)
  1638  }
  1639  
  1640  // jobVersionByID is the underlying implementation for retrieving all tracked
  1641  // versions of a job and is called under an existing transaction. A watch set
  1642  // can optionally be passed in to add the job histories to the watch set.
  1643  func (s *StateStore) jobVersionByID(txn *memdb.Txn, ws *memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  1644  	// Get all the historic jobs for this ID
  1645  	iter, err := txn.Get("job_version", "id_prefix", namespace, id)
  1646  	if err != nil {
  1647  		return nil, err
  1648  	}
  1649  
  1650  	if ws != nil {
  1651  		ws.Add(iter.WatchCh())
  1652  	}
  1653  
  1654  	var all []*structs.Job
  1655  	for {
  1656  		raw := iter.Next()
  1657  		if raw == nil {
  1658  			break
  1659  		}
  1660  
  1661  		// Ensure the ID is an exact match
  1662  		j := raw.(*structs.Job)
  1663  		if j.ID != id {
  1664  			continue
  1665  		}
  1666  
  1667  		all = append(all, j)
  1668  	}
  1669  
  1670  	// Sort in reverse order so that the highest version is first
  1671  	sort.Slice(all, func(i, j int) bool {
  1672  		return all[i].Version > all[j].Version
  1673  	})
  1674  
  1675  	return all, nil
  1676  }
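
// Editor's sketch (not part of the original file): why the exact-ID check in
// the loop above is needed. An "id_prefix" lookup for job "web" also yields
// jobs such as "web-frontend", so prefix results must be narrowed to exact
// matches before they are used.
func exactJobIDFilterSketch(prefixMatches []*structs.Job, id string) []*structs.Job {
	var out []*structs.Job
	for _, j := range prefixMatches {
		if j.ID == id {
			out = append(out, j)
		}
	}
	return out
}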
  1677  
  1678  // JobByIDAndVersion returns the job identified by its ID and Version. The
  1679  // passed watchset may be nil.
  1680  func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) {
  1681  	txn := s.db.Txn(false)
  1682  	return s.jobByIDAndVersionImpl(ws, namespace, id, version, txn)
  1683  }
  1684  
  1685  // jobByIDAndVersionImpl returns the job identified by its ID and Version. The
  1686  // passed watchset may be nil.
  1687  func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string,
  1688  	version uint64, txn *memdb.Txn) (*structs.Job, error) {
  1689  
  1690  	watchCh, existing, err := txn.FirstWatch("job_version", "id", namespace, id, version)
  1691  	if err != nil {
  1692  		return nil, err
  1693  	}
  1694  
  1695  	if ws != nil {
  1696  		ws.Add(watchCh)
  1697  	}
  1698  
  1699  	if existing != nil {
  1700  		job := existing.(*structs.Job)
  1701  		return job, nil
  1702  	}
  1703  
  1704  	return nil, nil
  1705  }
  1706  
  1707  func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1708  	txn := s.db.Txn(false)
  1709  
  1710  	// Walk the entire job_version table
  1711  	iter, err := txn.Get("job_version", "id")
  1712  	if err != nil {
  1713  		return nil, err
  1714  	}
  1715  
  1716  	ws.Add(iter.WatchCh())
  1717  	return iter, nil
  1718  }
  1719  
  1720  // Jobs returns an iterator over all the jobs
  1721  func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1722  	txn := s.db.Txn(false)
  1723  
  1724  	// Walk the entire jobs table
  1725  	iter, err := txn.Get("jobs", "id")
  1726  	if err != nil {
  1727  		return nil, err
  1728  	}
  1729  
  1730  	ws.Add(iter.WatchCh())
  1731  
  1732  	return iter, nil
  1733  }
  1734  
  1735  // JobsByNamespace returns an iterator over all the jobs for the given namespace
  1736  func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  1737  	txn := s.db.Txn(false)
  1738  	return s.jobsByNamespaceImpl(ws, namespace, txn)
  1739  }
  1740  
  1741  // jobsByNamespaceImpl returns an iterator over all the jobs for the given namespace
  1742  func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *memdb.Txn) (memdb.ResultIterator, error) {
  1743  	// Walk the entire jobs table
  1744  	iter, err := txn.Get("jobs", "id_prefix", namespace, "")
  1745  	if err != nil {
  1746  		return nil, err
  1747  	}
  1748  
  1749  	ws.Add(iter.WatchCh())
  1750  
  1751  	return iter, nil
  1752  }
  1753  
  1754  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
  1755  func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) {
  1756  	txn := s.db.Txn(false)
  1757  
  1758  	iter, err := txn.Get("jobs", "periodic", periodic)
  1759  	if err != nil {
  1760  		return nil, err
  1761  	}
  1762  
  1763  	ws.Add(iter.WatchCh())
  1764  
  1765  	return iter, nil
  1766  }
  1767  
  1768  // JobsByScheduler returns an iterator over all the jobs with the specific
  1769  // scheduler type.
  1770  func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) {
  1771  	txn := s.db.Txn(false)
  1772  
  1773  	// Return an iterator for jobs with the specific type.
  1774  	iter, err := txn.Get("jobs", "type", schedulerType)
  1775  	if err != nil {
  1776  		return nil, err
  1777  	}
  1778  
  1779  	ws.Add(iter.WatchCh())
  1780  
  1781  	return iter, nil
  1782  }
  1783  
  1784  // JobsByGC returns an iterator over all jobs eligible or ineligible for garbage
  1785  // collection.
  1786  func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) {
  1787  	txn := s.db.Txn(false)
  1788  
  1789  	iter, err := txn.Get("jobs", "gc", gc)
  1790  	if err != nil {
  1791  		return nil, err
  1792  	}
  1793  
  1794  	ws.Add(iter.WatchCh())
  1795  
  1796  	return iter, nil
  1797  }
  1798  
  1799  // JobSummaryByID returns the job summary object which matches a specific id.
  1800  func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) {
  1801  	txn := s.db.Txn(false)
  1802  
  1803  	watchCh, existing, err := txn.FirstWatch("job_summary", "id", namespace, jobID)
  1804  	if err != nil {
  1805  		return nil, err
  1806  	}
  1807  
  1808  	ws.Add(watchCh)
  1809  
  1810  	if existing != nil {
  1811  		summary := existing.(*structs.JobSummary)
  1812  		return summary, nil
  1813  	}
  1814  
  1815  	return nil, nil
  1816  }
  1817  
  1818  // JobSummaries walks the entire job summary table and returns all the job
  1819  // summary objects
  1820  func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1821  	txn := s.db.Txn(false)
  1822  
  1823  	iter, err := txn.Get("job_summary", "id")
  1824  	if err != nil {
  1825  		return nil, err
  1826  	}
  1827  
  1828  	ws.Add(iter.WatchCh())
  1829  
  1830  	return iter, nil
  1831  }
  1832  
  1833  // JobSummaryByPrefix is used to look up Job Summary by id prefix
  1834  func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  1835  	txn := s.db.Txn(false)
  1836  
  1837  	iter, err := txn.Get("job_summary", "id_prefix", namespace, id)
  1838  	if err != nil {
  1839  		return nil, fmt.Errorf("job_summary lookup failed: %v", err)
  1840  	}
  1841  
  1842  	ws.Add(iter.WatchCh())
  1843  
  1844  	return iter, nil
  1845  }
  1846  
  1847  // CSIVolumeRegister adds a volume to the server store, failing if it already exists
  1848  func (s *StateStore) CSIVolumeRegister(index uint64, volumes []*structs.CSIVolume) error {
  1849  	txn := s.db.Txn(true)
  1850  	defer txn.Abort()
  1851  
  1852  	for _, v := range volumes {
  1853  		if exists, err := s.namespaceExists(txn, v.Namespace); err != nil {
  1854  			return err
  1855  		} else if !exists {
  1856  			return fmt.Errorf("volume %s is in nonexistent namespace %s", v.ID, v.Namespace)
  1857  		}
  1858  
  1859  		// Check for volume existence
  1860  		obj, err := txn.First("csi_volumes", "id", v.Namespace, v.ID)
  1861  		if err != nil {
  1862  			return fmt.Errorf("volume existence check error: %v", err)
  1863  		}
  1864  		if obj != nil {
  1865  			// Allow some properties of a volume to be updated in place, but
  1866  			// prevent accidentally overwriting important properties, or
  1867  			// overwriting a volume in use
  1868  			old, ok := obj.(*structs.CSIVolume)
  1869  			if ok &&
  1870  				(old.InUse() ||
  1871  					old.ExternalID != v.ExternalID ||
  1872  					old.PluginID != v.PluginID ||
  1873  					old.Provider != v.Provider) {
  1874  				return fmt.Errorf("volume exists: %s", v.ID)
  1875  			}
  1876  		}
  1877  
  1878  		if v.CreateIndex == 0 {
  1879  			v.CreateIndex = index
  1880  			v.ModifyIndex = index
  1881  		}
  1882  
  1883  		err = txn.Insert("csi_volumes", v)
  1884  		if err != nil {
  1885  			return fmt.Errorf("volume insert: %v", err)
  1886  		}
  1887  	}
  1888  
  1889  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  1890  		return fmt.Errorf("index update failed: %v", err)
  1891  	}
  1892  
  1893  	txn.Commit()
  1894  	return nil
  1895  }
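
// Editor's sketch (not part of the original file): registering a volume at a
// given Raft index, assuming s is an initialized *StateStore and the target
// namespace already exists. The volume and plugin IDs are placeholders; the
// field names are the ones checked by CSIVolumeRegister above.
func registerVolumeSketch(s *StateStore, index uint64) error {
	vol := &structs.CSIVolume{
		ID:        "vol-example", // hypothetical volume ID
		Namespace: "default",
		PluginID:  "plugin-example", // hypothetical plugin ID
	}
	return s.CSIVolumeRegister(index, []*structs.CSIVolume{vol})
}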
  1896  
  1897  // CSIVolumes returns the unfiltered list of all volumes
  1898  func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1899  	txn := s.db.Txn(false)
  1900  	defer txn.Abort()
  1901  
  1902  	iter, err := txn.Get("csi_volumes", "id")
  1903  	if err != nil {
  1904  		return nil, fmt.Errorf("csi_volumes lookup failed: %v", err)
  1905  	}
  1906  
  1907  	ws.Add(iter.WatchCh())
  1908  
  1909  	return iter, nil
  1910  }
  1911  
  1912  // CSIVolumeByID is used to lookup a single volume. Returns a copy of the volume
  1913  // because its plugins are denormalized to provide accurate Health.
  1914  func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) {
  1915  	txn := s.db.Txn(false)
  1916  
  1917  	watchCh, obj, err := txn.FirstWatch("csi_volumes", "id_prefix", namespace, id)
  1918  	if err != nil {
  1919  		return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  1920  	}
  1921  	ws.Add(watchCh)
  1922  
  1923  	if obj == nil {
  1924  		return nil, nil
  1925  	}
  1926  
  1927  	vol := obj.(*structs.CSIVolume)
  1928  	return s.CSIVolumeDenormalizePlugins(ws, vol.Copy())
  1929  }
  1930  
  1931  // CSIVolumesByPluginID looks up csi_volumes by pluginID
  1932  func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, pluginID string) (memdb.ResultIterator, error) {
  1933  	txn := s.db.Txn(false)
  1934  
  1935  	iter, err := txn.Get("csi_volumes", "plugin_id", pluginID)
  1936  	if err != nil {
  1937  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  1938  	}
  1939  
  1940  	// Filter out volumes that are not in the requested namespace; the filter drops entries for which it returns true
  1941  	f := func(raw interface{}) bool {
  1942  		v, ok := raw.(*structs.CSIVolume)
  1943  		if !ok {
  1944  			return true
  1945  		}
  1946  		return v.Namespace != namespace
  1947  	}
  1948  
  1949  	wrap := memdb.NewFilterIterator(iter, f)
  1950  	return wrap, nil
  1951  }
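
// Editor's sketch (not part of the original file): memdb.NewFilterIterator
// drops every object for which the filter function returns true, so the
// namespace filter above keeps exactly the volumes whose Namespace matches.
// This sketch drains such a filtered iterator into a slice.
func drainVolumeIteratorSketch(iter memdb.ResultIterator) []*structs.CSIVolume {
	var out []*structs.CSIVolume
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		out = append(out, raw.(*structs.CSIVolume))
	}
	return out
}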
  1952  
  1953  // CSIVolumesByIDPrefix supports search
  1954  func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) {
  1955  	txn := s.db.Txn(false)
  1956  
  1957  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, volumeID)
  1958  	if err != nil {
  1959  		return nil, err
  1960  	}
  1961  
  1962  	ws.Add(iter.WatchCh())
  1963  	return iter, nil
  1964  }
  1965  
  1966  // CSIVolumesByNodeID looks up CSIVolumes in use on a node
  1967  func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  1968  	allocs, err := s.AllocsByNode(ws, nodeID)
  1969  	if err != nil {
  1970  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  1971  	}
  1972  
  1973  	// Find volume ids for CSI volumes in running allocs, or allocs that we desire to run
  1974  	ids := map[string]string{} // Map volumeID to Namespace
  1975  	for _, a := range allocs {
  1976  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  1977  
  1978  		if !(a.DesiredStatus == structs.AllocDesiredStatusRun ||
  1979  			a.ClientStatus == structs.AllocClientStatusRunning) ||
  1980  			len(tg.Volumes) == 0 {
  1981  			continue
  1982  		}
  1983  
  1984  		for _, v := range tg.Volumes {
  1985  			if v.Type != structs.VolumeTypeCSI {
  1986  				continue
  1987  			}
  1988  			ids[v.Source] = a.Namespace
  1989  		}
  1990  	}
  1991  
  1992  	// Lookup the raw CSIVolumes to match the other list interfaces
  1993  	iter := NewSliceIterator()
  1994  	txn := s.db.Txn(false)
  1995  	for id, namespace := range ids {
  1996  		raw, err := txn.First("csi_volumes", "id", namespace, id)
  1997  		if err != nil {
  1998  			return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  1999  		}
  2000  		iter.Add(raw)
  2001  	}
  2002  
  2003  	return iter, nil
  2004  }
  2005  
  2006  // CSIVolumesByNamespace returns an iterator over all the volumes in the namespace
  2007  func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2008  	txn := s.db.Txn(false)
  2009  
  2010  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, "")
  2011  	if err != nil {
  2012  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2013  	}
  2014  	ws.Add(iter.WatchCh())
  2015  
  2016  	return iter, nil
  2017  }
  2018  
  2019  // CSIVolumeClaim updates the volume's claim count and allocation list
  2020  func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, alloc *structs.Allocation, claim structs.CSIVolumeClaimMode) error {
  2021  	txn := s.db.Txn(true)
  2022  	defer txn.Abort()
  2023  
  2024  	row, err := txn.First("csi_volumes", "id", namespace, id)
  2025  	if err != nil {
  2026  		return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2027  	}
  2028  	if row == nil {
  2029  		return fmt.Errorf("volume not found: %s", id)
  2030  	}
  2031  
  2032  	orig, ok := row.(*structs.CSIVolume)
  2033  	if !ok {
  2034  		return fmt.Errorf("volume row conversion error")
  2035  	}
  2036  
  2037  	ws := memdb.NewWatchSet()
  2038  	volume, err := s.CSIVolumeDenormalizePlugins(ws, orig.Copy())
  2039  	if err != nil {
  2040  		return err
  2041  	}
  2042  
  2043  	volume, err = s.CSIVolumeDenormalize(ws, volume)
  2044  	if err != nil {
  2045  		return err
  2046  	}
  2047  
  2048  	err = volume.Claim(claim, alloc)
  2049  	if err != nil {
  2050  		return err
  2051  	}
  2052  
  2053  	volume.ModifyIndex = index
  2054  
  2055  	if err = txn.Insert("csi_volumes", volume); err != nil {
  2056  		return fmt.Errorf("volume update failed: %s: %v", id, err)
  2057  	}
  2058  
  2059  	if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2060  		return fmt.Errorf("index update failed: %v", err)
  2061  	}
  2062  
  2063  	txn.Commit()
  2064  	return nil
  2065  }
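
// Editor's sketch (not part of the original file): claiming a registered
// volume for an allocation, assuming s is an initialized *StateStore and
// that structs.CSIVolumeClaimRead is one of the CSIVolumeClaimMode values
// (as used by Nomad's CSI RPCs).
func claimVolumeForReadSketch(s *StateStore, index uint64, alloc *structs.Allocation, volID string) error {
	return s.CSIVolumeClaim(index, alloc.Namespace, volID, alloc, structs.CSIVolumeClaimRead)
}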
  2066  
  2067  // CSIVolumeDeregister removes the volume from the server
  2068  func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string) error {
  2069  	txn := s.db.Txn(true)
  2070  	defer txn.Abort()
  2071  
  2072  	for _, id := range ids {
  2073  		existing, err := txn.First("csi_volumes", "id_prefix", namespace, id)
  2074  		if err != nil {
  2075  			return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2076  		}
  2077  
  2078  		if existing == nil {
  2079  			return fmt.Errorf("volume not found: %s", id)
  2080  		}
  2081  
  2082  		vol, ok := existing.(*structs.CSIVolume)
  2083  		if !ok {
  2084  			return fmt.Errorf("volume row conversion error: %s", id)
  2085  		}
  2086  
  2087  		if vol.InUse() {
  2088  			return fmt.Errorf("volume in use: %s", id)
  2089  		}
  2090  
  2091  		if err = txn.Delete("csi_volumes", existing); err != nil {
  2092  			return fmt.Errorf("volume delete failed: %s: %v", id, err)
  2093  		}
  2094  	}
  2095  
  2096  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2097  		return fmt.Errorf("index update failed: %v", err)
  2098  	}
  2099  
  2100  	txn.Commit()
  2101  	return nil
  2102  }
  2103  
  2104  // CSIVolumeDenormalizePlugins returns a CSIVolume with current health and plugins, but
  2105  // without allocations.
  2106  // Use this for current volume metadata, handling lists of volumes.
  2107  // Use CSIVolumeDenormalize for volumes containing both health and current allocations.
  2108  func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2109  	if vol == nil {
  2110  		return nil, nil
  2111  	}
  2112  
  2113  	// Lookup CSIPlugin, the health records, and calculate volume health
  2114  	txn := s.db.Txn(false)
  2115  	defer txn.Abort()
  2116  
  2117  	plug, err := s.CSIPluginByID(ws, vol.PluginID)
  2118  	if err != nil {
  2119  		return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
  2120  	}
  2121  	if plug == nil {
  2122  		vol.ControllersHealthy = 0
  2123  		vol.NodesHealthy = 0
  2124  		vol.Schedulable = false
  2125  		return vol, nil
  2126  	}
  2127  
  2128  	vol.Provider = plug.Provider
  2129  	vol.ProviderVersion = plug.Version
  2130  	vol.ControllerRequired = plug.ControllerRequired
  2131  	vol.ControllersHealthy = plug.ControllersHealthy
  2132  	vol.NodesHealthy = plug.NodesHealthy
  2133  	// These counts are incorrect! The expected number of plugin instances is actually
  2134  	// this plus the number of blocked evaluations for the jobs controlling these plugins
  2135  	vol.ControllersExpected = len(plug.Controllers)
  2136  	vol.NodesExpected = len(plug.Nodes)
  2137  
  2138  	vol.Schedulable = vol.NodesHealthy > 0
  2139  	if vol.ControllerRequired {
  2140  		vol.Schedulable = vol.ControllersHealthy > 0 && vol.Schedulable
  2141  	}
  2142  
  2143  	return vol, nil
  2144  }
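
// Editor's sketch (not part of the original file): the schedulability rule
// applied above, factored out. A volume is schedulable when at least one node
// plugin is healthy and, if a controller is required, at least one controller
// is healthy as well.
func volumeSchedulableSketch(nodesHealthy, controllersHealthy int, controllerRequired bool) bool {
	schedulable := nodesHealthy > 0
	if controllerRequired {
		schedulable = schedulable && controllersHealthy > 0
	}
	return schedulable
}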
  2145  
  2146  // CSIVolumeDenormalize returns a CSIVolume with allocations
  2147  func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2148  	for id := range vol.ReadAllocs {
  2149  		a, err := s.AllocByID(ws, id)
  2150  		if err != nil {
  2151  			return nil, err
  2152  		}
  2153  		if a != nil {
  2154  			vol.ReadAllocs[id] = a
  2155  		}
  2156  	}
  2157  
  2158  	for id := range vol.WriteAllocs {
  2159  		a, err := s.AllocByID(ws, id)
  2160  		if err != nil {
  2161  			return nil, err
  2162  		}
  2163  		if a != nil {
  2164  			vol.WriteAllocs[id] = a
  2165  		}
  2166  	}
  2167  
  2168  	return vol, nil
  2169  }
  2170  
  2171  // CSIPlugins returns the unfiltered list of all plugin health status
  2172  func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2173  	txn := s.db.Txn(false)
  2174  	defer txn.Abort()
  2175  
  2176  	iter, err := txn.Get("csi_plugins", "id")
  2177  	if err != nil {
  2178  		return nil, fmt.Errorf("csi_plugins lookup failed: %v", err)
  2179  	}
  2180  
  2181  	ws.Add(iter.WatchCh())
  2182  
  2183  	return iter, nil
  2184  }
  2185  
  2186  // CSIPluginsByIDPrefix supports search
  2187  func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) {
  2188  	txn := s.db.Txn(false)
  2189  
  2190  	iter, err := txn.Get("csi_plugins", "id_prefix", pluginID)
  2191  	if err != nil {
  2192  		return nil, err
  2193  	}
  2194  
  2195  	ws.Add(iter.WatchCh())
  2196  
  2197  	return iter, nil
  2198  }
  2199  
  2200  // CSIPluginByID returns the one named CSIPlugin
  2201  func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2202  	txn := s.db.Txn(false)
  2203  	defer txn.Abort()
  2204  
  2205  	raw, err := txn.First("csi_plugins", "id_prefix", id)
  2206  	if err != nil {
  2207  		return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err)
  2208  	}
  2209  
  2210  	if raw == nil {
  2211  		return nil, nil
  2212  	}
  2213  
  2214  	plug := raw.(*structs.CSIPlugin)
  2215  
  2216  	return plug, nil
  2217  }
  2218  
  2219  // CSIPluginDenormalize returns a CSIPlugin with allocation details
  2220  func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2221  	if plug == nil {
  2222  		return nil, nil
  2223  	}
  2224  
  2225  	// Get the unique list of allocation ids
  2226  	ids := map[string]struct{}{}
  2227  	for _, info := range plug.Controllers {
  2228  		ids[info.AllocID] = struct{}{}
  2229  	}
  2230  	for _, info := range plug.Nodes {
  2231  		ids[info.AllocID] = struct{}{}
  2232  	}
  2233  
  2234  	for id := range ids {
  2235  		alloc, err := s.AllocByID(ws, id)
  2236  		if err != nil {
  2237  			return nil, err
  2238  		}
  2239  		if alloc == nil {
  2240  			continue
  2241  		}
  2242  		plug.Allocations = append(plug.Allocations, alloc.Stub())
  2243  	}
  2244  
  2245  	return plug, nil
  2246  }
  2247  
  2248  // UpsertPeriodicLaunch is used to register a launch or update it.
  2249  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
  2250  	txn := s.db.Txn(true)
  2251  	defer txn.Abort()
  2252  
  2253  	// Check if the launch already exists
  2254  	existing, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID)
  2255  	if err != nil {
  2256  		return fmt.Errorf("periodic launch lookup failed: %v", err)
  2257  	}
  2258  
  2259  	// Setup the indexes correctly
  2260  	if existing != nil {
  2261  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
  2262  		launch.ModifyIndex = index
  2263  	} else {
  2264  		launch.CreateIndex = index
  2265  		launch.ModifyIndex = index
  2266  	}
  2267  
  2268  	// Insert the launch
  2269  	if err := txn.Insert("periodic_launch", launch); err != nil {
  2270  		return fmt.Errorf("launch insert failed: %v", err)
  2271  	}
  2272  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2273  		return fmt.Errorf("index update failed: %v", err)
  2274  	}
  2275  
  2276  	txn.Commit()
  2277  	return nil
  2278  }
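
// Editor's sketch (not part of the original file): the create/modify index
// convention used by UpsertPeriodicLaunch and the other upserts in this file.
// On first insert both indexes are set to the current Raft index; on update
// the original CreateIndex is preserved and only ModifyIndex advances.
func stampLaunchIndexesSketch(existing, launch *structs.PeriodicLaunch, index uint64) {
	if existing != nil {
		launch.CreateIndex = existing.CreateIndex
	} else {
		launch.CreateIndex = index
	}
	launch.ModifyIndex = index
}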
  2279  
  2280  // DeletePeriodicLaunch is used to delete the periodic launch
  2281  func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error {
  2282  	txn := s.db.Txn(true)
  2283  	defer txn.Abort()
  2284  
  2285  	err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn)
  2286  	if err == nil {
  2287  		txn.Commit()
  2288  	}
  2289  	return err
  2290  }
  2291  
  2292  // DeletePeriodicLaunchTxn is used to delete the periodic launch, like DeletePeriodicLaunch
  2293  // but in a transaction. Useful when making multiple modifications atomically
  2294  func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error {
  2295  	// Lookup the launch
  2296  	existing, err := txn.First("periodic_launch", "id", namespace, jobID)
  2297  	if err != nil {
  2298  		return fmt.Errorf("launch lookup failed: %v", err)
  2299  	}
  2300  	if existing == nil {
  2301  		return fmt.Errorf("launch not found")
  2302  	}
  2303  
  2304  	// Delete the launch
  2305  	if err := txn.Delete("periodic_launch", existing); err != nil {
  2306  		return fmt.Errorf("launch delete failed: %v", err)
  2307  	}
  2308  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2309  		return fmt.Errorf("index update failed: %v", err)
  2310  	}
  2311  
  2312  	return nil
  2313  }
  2314  
  2315  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
  2316  // ID.
  2317  func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) {
  2318  	txn := s.db.Txn(false)
  2319  
  2320  	watchCh, existing, err := txn.FirstWatch("periodic_launch", "id", namespace, id)
  2321  	if err != nil {
  2322  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
  2323  	}
  2324  
  2325  	ws.Add(watchCh)
  2326  
  2327  	if existing != nil {
  2328  		return existing.(*structs.PeriodicLaunch), nil
  2329  	}
  2330  	return nil, nil
  2331  }
  2332  
  2333  // PeriodicLaunches returns an iterator over all the periodic launches
  2334  func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2335  	txn := s.db.Txn(false)
  2336  
  2337  	// Walk the entire table
  2338  	iter, err := txn.Get("periodic_launch", "id")
  2339  	if err != nil {
  2340  		return nil, err
  2341  	}
  2342  
  2343  	ws.Add(iter.WatchCh())
  2344  
  2345  	return iter, nil
  2346  }
  2347  
  2348  // UpsertEvals is used to upsert a set of evaluations
  2349  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
  2350  	txn := s.db.Txn(true)
  2351  	defer txn.Abort()
  2352  
  2353  	err := s.UpsertEvalsTxn(index, evals, txn)
  2354  	if err == nil {
  2355  		txn.Commit()
  2356  	}
  2357  	return err
  2358  }
  2359  
  2360  // UpsertEvalsTxn is used to upsert a set of evaluations, like UpsertEvals
  2361  // but in a transaction. Useful when making multiple modifications atomically
  2362  func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error {
  2363  	// Do a nested upsert
  2364  	jobs := make(map[structs.NamespacedID]string, len(evals))
  2365  	for _, eval := range evals {
  2366  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  2367  			return err
  2368  		}
  2369  
  2370  		tuple := structs.NamespacedID{
  2371  			ID:        eval.JobID,
  2372  			Namespace: eval.Namespace,
  2373  		}
  2374  		jobs[tuple] = ""
  2375  	}
  2376  
  2377  	// Set the job's status
  2378  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  2379  		return fmt.Errorf("setting job status failed: %v", err)
  2380  	}
  2381  
  2382  	return nil
  2383  }
  2384  
  2385  // nestedUpsertEval is used to nest an evaluation upsert within a transaction
  2386  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
  2387  	// Lookup the evaluation
  2388  	existing, err := txn.First("evals", "id", eval.ID)
  2389  	if err != nil {
  2390  		return fmt.Errorf("eval lookup failed: %v", err)
  2391  	}
  2392  
  2393  	// Update the indexes
  2394  	if existing != nil {
  2395  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
  2396  		eval.ModifyIndex = index
  2397  	} else {
  2398  		eval.CreateIndex = index
  2399  		eval.ModifyIndex = index
  2400  	}
  2401  
  2402  	// Update the job summary
  2403  	summaryRaw, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID)
  2404  	if err != nil {
  2405  		return fmt.Errorf("job summary lookup failed: %v", err)
  2406  	}
  2407  	if summaryRaw != nil {
  2408  		js := summaryRaw.(*structs.JobSummary).Copy()
  2409  		hasSummaryChanged := false
  2410  		for tg, num := range eval.QueuedAllocations {
  2411  			if summary, ok := js.Summary[tg]; ok {
  2412  				if summary.Queued != num {
  2413  					summary.Queued = num
  2414  					js.Summary[tg] = summary
  2415  					hasSummaryChanged = true
  2416  				}
  2417  			} else {
  2418  				s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
  2419  			}
  2420  		}
  2421  
  2422  		// Insert the job summary
  2423  		if hasSummaryChanged {
  2424  			js.ModifyIndex = index
  2425  			if err := txn.Insert("job_summary", js); err != nil {
  2426  				return fmt.Errorf("job summary insert failed: %v", err)
  2427  			}
  2428  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  2429  				return fmt.Errorf("index update failed: %v", err)
  2430  			}
  2431  		}
  2432  	}
  2433  
  2434  	// Check if the job has any blocked evaluations and cancel them
  2435  	if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 {
  2436  		// Get the blocked evaluation for a job if it exists
  2437  		iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked)
  2438  		if err != nil {
  2439  			return fmt.Errorf("failed to get blocked evals for job %q in namespace %q: %v", eval.JobID, eval.Namespace, err)
  2440  		}
  2441  
  2442  		var blocked []*structs.Evaluation
  2443  		for {
  2444  			raw := iter.Next()
  2445  			if raw == nil {
  2446  				break
  2447  			}
  2448  			blocked = append(blocked, raw.(*structs.Evaluation))
  2449  		}
  2450  
  2451  		// Go through and update the evals
  2452  		for _, eval := range blocked {
  2453  			newEval := eval.Copy()
  2454  			newEval.Status = structs.EvalStatusCancelled
  2455  			newEval.StatusDescription = fmt.Sprintf("evaluation %q successful", newEval.ID)
  2456  			newEval.ModifyIndex = index
  2457  
  2458  			if err := txn.Insert("evals", newEval); err != nil {
  2459  				return fmt.Errorf("eval insert failed: %v", err)
  2460  			}
  2461  		}
  2462  	}
  2463  
  2464  	// Insert the eval
  2465  	if err := txn.Insert("evals", eval); err != nil {
  2466  		return fmt.Errorf("eval insert failed: %v", err)
  2467  	}
  2468  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2469  		return fmt.Errorf("index update failed: %v", err)
  2470  	}
  2471  	return nil
  2472  }
  2473  
  2474  // updateEvalModifyIndex is used to update the modify index of an evaluation that has been
  2475  // through a scheduler pass. This is done as part of plan apply. It ensures that when a subsequent
  2476  // scheduler worker processes a re-queued evaluation it sees any partial updates from the plan apply.
  2477  func (s *StateStore) updateEvalModifyIndex(txn *memdb.Txn, index uint64, evalID string) error {
  2478  	// Lookup the evaluation
  2479  	existing, err := txn.First("evals", "id", evalID)
  2480  	if err != nil {
  2481  		return fmt.Errorf("eval lookup failed: %v", err)
  2482  	}
  2483  	if existing == nil {
  2484  		s.logger.Error("unable to find eval", "eval_id", evalID)
  2485  		return fmt.Errorf("unable to find eval id %q", evalID)
  2486  	}
  2487  	eval := existing.(*structs.Evaluation).Copy()
  2488  	// Update the indexes
  2489  	eval.ModifyIndex = index
  2490  
  2491  	// Insert the eval
  2492  	if err := txn.Insert("evals", eval); err != nil {
  2493  		return fmt.Errorf("eval insert failed: %v", err)
  2494  	}
  2495  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2496  		return fmt.Errorf("index update failed: %v", err)
  2497  	}
  2498  	return nil
  2499  }
  2500  
  2501  // DeleteEval is used to delete a set of evaluations and their allocations
  2502  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
  2503  	txn := s.db.Txn(true)
  2504  	defer txn.Abort()
  2505  
  2506  	jobs := make(map[structs.NamespacedID]string, len(evals))
  2507  	for _, eval := range evals {
  2508  		existing, err := txn.First("evals", "id", eval)
  2509  		if err != nil {
  2510  			return fmt.Errorf("eval lookup failed: %v", err)
  2511  		}
  2512  		if existing == nil {
  2513  			continue
  2514  		}
  2515  		if err := txn.Delete("evals", existing); err != nil {
  2516  			return fmt.Errorf("eval delete failed: %v", err)
  2517  		}
  2518  		eval := existing.(*structs.Evaluation)
  2519  
  2520  		tuple := structs.NamespacedID{
  2521  			ID:        eval.JobID,
  2522  			Namespace: eval.Namespace,
  2523  		}
  2524  		jobs[tuple] = ""
  2525  	}
  2526  
  2527  	for _, alloc := range allocs {
  2528  		raw, err := txn.First("allocs", "id", alloc)
  2529  		if err != nil {
  2530  			return fmt.Errorf("alloc lookup failed: %v", err)
  2531  		}
  2532  		if raw == nil {
  2533  			continue
  2534  		}
  2535  		if err := txn.Delete("allocs", raw); err != nil {
  2536  			return fmt.Errorf("alloc delete failed: %v", err)
  2537  		}
  2538  	}
  2539  
  2540  	// Update the indexes
  2541  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2542  		return fmt.Errorf("index update failed: %v", err)
  2543  	}
  2544  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2545  		return fmt.Errorf("index update failed: %v", err)
  2546  	}
  2547  
  2548  	// Set the job's status
  2549  	if err := s.setJobStatuses(index, txn, jobs, true); err != nil {
  2550  		return fmt.Errorf("setting job status failed: %v", err)
  2551  	}
  2552  
  2553  	txn.Commit()
  2554  	return nil
  2555  }
  2556  
  2557  // EvalByID is used to lookup an eval by its ID
  2558  func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) {
  2559  	txn := s.db.Txn(false)
  2560  
  2561  	watchCh, existing, err := txn.FirstWatch("evals", "id", id)
  2562  	if err != nil {
  2563  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  2564  	}
  2565  
  2566  	ws.Add(watchCh)
  2567  
  2568  	if existing != nil {
  2569  		return existing.(*structs.Evaluation), nil
  2570  	}
  2571  	return nil, nil
  2572  }
  2573  
  2574  // EvalsByIDPrefix is used to lookup evaluations by prefix in a particular
  2575  // namespace
  2576  func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2577  	txn := s.db.Txn(false)
  2578  
  2579  	// Get an iterator over all evals by the id prefix
  2580  	iter, err := txn.Get("evals", "id_prefix", id)
  2581  	if err != nil {
  2582  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  2583  	}
  2584  
  2585  	ws.Add(iter.WatchCh())
  2586  
  2587  	// Wrap the iterator in a filter
  2588  	wrap := memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace))
  2589  	return wrap, nil
  2590  }
  2591  
  2592  // evalNamespaceFilter returns a filter function that filters all evaluations
  2593  // not in the given namespace.
  2594  func evalNamespaceFilter(namespace string) func(interface{}) bool {
  2595  	return func(raw interface{}) bool {
  2596  		eval, ok := raw.(*structs.Evaluation)
  2597  		if !ok {
  2598  			return true
  2599  		}
  2600  
  2601  		return eval.Namespace != namespace
  2602  	}
  2603  }
  2604  
  2605  // EvalsByJob returns all the evaluations by job id
  2606  func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) {
  2607  	txn := s.db.Txn(false)
  2608  
  2609  	// Get an iterator over the job's evaluations
  2610  	iter, err := txn.Get("evals", "job_prefix", namespace, jobID)
  2611  	if err != nil {
  2612  		return nil, err
  2613  	}
  2614  
  2615  	ws.Add(iter.WatchCh())
  2616  
  2617  	var out []*structs.Evaluation
  2618  	for {
  2619  		raw := iter.Next()
  2620  		if raw == nil {
  2621  			break
  2622  		}
  2623  
  2624  		e := raw.(*structs.Evaluation)
  2625  
  2626  		// Filter non-exact matches
  2627  		if e.JobID != jobID {
  2628  			continue
  2629  		}
  2630  
  2631  		out = append(out, e)
  2632  	}
  2633  	return out, nil
  2634  }
  2635  
  2636  // Evals returns an iterator over all the evaluations
  2637  func (s *StateStore) Evals(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2638  	txn := s.db.Txn(false)
  2639  
  2640  	// Walk the entire table
  2641  	iter, err := txn.Get("evals", "id")
  2642  	if err != nil {
  2643  		return nil, err
  2644  	}
  2645  
  2646  	ws.Add(iter.WatchCh())
  2647  
  2648  	return iter, nil
  2649  }
  2650  
  2651  // EvalsByNamespace returns an iterator over all the evaluations in the given
  2652  // namespace
  2653  func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2654  	txn := s.db.Txn(false)
  2655  
  2656  	// Walk the entire table
  2657  	iter, err := txn.Get("evals", "namespace", namespace)
  2658  	if err != nil {
  2659  		return nil, err
  2660  	}
  2661  
  2662  	ws.Add(iter.WatchCh())
  2663  
  2664  	return iter, nil
  2665  }
  2666  
  2667  // UpdateAllocsFromClient is used to update an allocation based on input
  2668  // from a client. While the schedulers are the authority on the allocation for
  2669  // most things, some updates are authoritative from the client. Specifically,
  2670  // the desired state comes from the schedulers, while the actual state comes
  2671  // from clients.
  2672  func (s *StateStore) UpdateAllocsFromClient(index uint64, allocs []*structs.Allocation) error {
  2673  	txn := s.db.Txn(true)
  2674  	defer txn.Abort()
  2675  
  2676  	// Handle each of the updated allocations
  2677  	for _, alloc := range allocs {
  2678  		if err := s.nestedUpdateAllocFromClient(txn, index, alloc); err != nil {
  2679  			return err
  2680  		}
  2681  	}
  2682  
  2683  	// Update the indexes
  2684  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2685  		return fmt.Errorf("index update failed: %v", err)
  2686  	}
  2687  
  2688  	txn.Commit()
  2689  	return nil
  2690  }
  2691  
  2692  // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status
  2693  func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, index uint64, alloc *structs.Allocation) error {
  2694  	// Look for existing alloc
  2695  	existing, err := txn.First("allocs", "id", alloc.ID)
  2696  	if err != nil {
  2697  		return fmt.Errorf("alloc lookup failed: %v", err)
  2698  	}
  2699  
  2700  	// Nothing to do if this does not exist
  2701  	if existing == nil {
  2702  		return nil
  2703  	}
  2704  	exist := existing.(*structs.Allocation)
  2705  
  2706  	// Copy everything from the existing allocation
  2707  	copyAlloc := exist.Copy()
  2708  
  2709  	// Pull in anything the client is the authority on
  2710  	copyAlloc.ClientStatus = alloc.ClientStatus
  2711  	copyAlloc.ClientDescription = alloc.ClientDescription
  2712  	copyAlloc.TaskStates = alloc.TaskStates
  2713  
  2714  	// The client can only set its deployment health and timestamp, so just take
  2715  	// those
  2716  	if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil {
  2717  		oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth()
  2718  		newHasHealthy := alloc.DeploymentStatus.HasHealth()
  2719  
  2720  		// We got new health information from the client
  2721  		if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) {
  2722  			// Updated deployment health and timestamp
  2723  			copyAlloc.DeploymentStatus.Healthy = helper.BoolToPtr(*alloc.DeploymentStatus.Healthy)
  2724  			copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp
  2725  			copyAlloc.DeploymentStatus.ModifyIndex = index
  2726  		}
  2727  	} else if alloc.DeploymentStatus != nil {
  2728  		// First time getting a deployment status so copy everything and just
  2729  		// set the index
  2730  		copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy()
  2731  		copyAlloc.DeploymentStatus.ModifyIndex = index
  2732  	}
  2733  
  2734  	// Update the modify index
  2735  	copyAlloc.ModifyIndex = index
  2736  
  2737  	// Update the modify time
  2738  	copyAlloc.ModifyTime = alloc.ModifyTime
  2739  
  2740  	if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil {
  2741  		return fmt.Errorf("error updating deployment: %v", err)
  2742  	}
  2743  
  2744  	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, txn); err != nil {
  2745  		return fmt.Errorf("error updating job summary: %v", err)
  2746  	}
  2747  
  2748  	if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil {
  2749  		return err
  2750  	}
  2751  
  2752  	// Update the allocation
  2753  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  2754  		return fmt.Errorf("alloc insert failed: %v", err)
  2755  	}
  2756  
  2757  	// Set the job's status
  2758  	forceStatus := ""
  2759  	if !copyAlloc.TerminalStatus() {
  2760  		forceStatus = structs.JobStatusRunning
  2761  	}
  2762  
  2763  	tuple := structs.NamespacedID{
  2764  		ID:        exist.JobID,
  2765  		Namespace: exist.Namespace,
  2766  	}
  2767  	jobs := map[structs.NamespacedID]string{tuple: forceStatus}
  2768  
  2769  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  2770  		return fmt.Errorf("setting job status failed: %v", err)
  2771  	}
  2772  	return nil
  2773  }
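
// Editor's sketch (not part of the original file): the split of authority
// described above, reduced to its field copies. The server remains the
// authority on desired state; only the client-owned fields are taken from
// the incoming update.
func mergeClientOwnedFieldsSketch(server, client *structs.Allocation, index uint64) *structs.Allocation {
	merged := server.Copy()
	merged.ClientStatus = client.ClientStatus
	merged.ClientDescription = client.ClientDescription
	merged.TaskStates = client.TaskStates
	merged.ModifyIndex = index
	merged.ModifyTime = client.ModifyTime
	return merged
}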
  2774  
  2775  // UpsertAllocs is used to evict a set of allocations and allocate new ones at
  2776  // the same time.
  2777  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
  2778  	txn := s.db.Txn(true)
  2779  	defer txn.Abort()
  2780  	if err := s.upsertAllocsImpl(index, allocs, txn); err != nil {
  2781  		return err
  2782  	}
  2783  	txn.Commit()
  2784  	return nil
  2785  }
  2786  
  2787  // upsertAllocsImpl is the actual implementation of UpsertAllocs so that it may be
  2788  // used with an existing transaction.
  2789  func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *memdb.Txn) error {
  2790  	// Handle the allocations
  2791  	jobs := make(map[structs.NamespacedID]string, 1)
  2792  	for _, alloc := range allocs {
  2793  		existing, err := txn.First("allocs", "id", alloc.ID)
  2794  		if err != nil {
  2795  			return fmt.Errorf("alloc lookup failed: %v", err)
  2796  		}
  2797  		exist, _ := existing.(*structs.Allocation)
  2798  
  2799  		if exist == nil {
  2800  			alloc.CreateIndex = index
  2801  			alloc.ModifyIndex = index
  2802  			alloc.AllocModifyIndex = index
  2803  			if alloc.DeploymentStatus != nil {
  2804  				alloc.DeploymentStatus.ModifyIndex = index
  2805  			}
  2806  
  2807  			// Issue https://github.com/hashicorp/nomad/issues/2583 uncovered
  2808  			// a race between a forced garbage collection and the scheduler
  2809  			// marking an allocation as terminal. The issue is that the
  2810  			// allocation from the scheduler has its job normalized and the FSM
  2811  			// will only denormalize if the allocation is not terminal. However,
  2812  			// if the allocation is garbage collected, that will result in an
  2813  			// allocation being upserted for the first time without a job
  2814  			// attached. By returning an error here, it will cause the FSM to
  2815  			// error, causing the plan_apply to error and thus causing the
  2816  			// evaluation to be failed. This will force an index refresh that
  2817  			// should solve this issue.
  2818  			if alloc.Job == nil {
  2819  				return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID)
  2820  			}
  2821  		} else {
  2822  			alloc.CreateIndex = exist.CreateIndex
  2823  			alloc.ModifyIndex = index
  2824  			alloc.AllocModifyIndex = index
  2825  
  2826  			// Keep the clients task states
  2827  			alloc.TaskStates = exist.TaskStates
  2828  
  2829  			// If the scheduler is marking this allocation as lost we do not
  2830  			// want to reuse the status of the existing allocation.
  2831  			if alloc.ClientStatus != structs.AllocClientStatusLost {
  2832  				alloc.ClientStatus = exist.ClientStatus
  2833  				alloc.ClientDescription = exist.ClientDescription
  2834  			}
  2835  
  2836  			// The job has been denormalized so re-attach the original job
  2837  			if alloc.Job == nil {
  2838  				alloc.Job = exist.Job
  2839  			}
  2840  		}
  2841  
  2842  		// OPTIMIZATION:
  2843  		// These should be given a map of new to old allocation and the updates
  2844  		// should be done in one pass over all changes. The current implementation causes O(n)
  2845  		// lookups/copies/insertions rather than O(1)
  2846  		if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil {
  2847  			return fmt.Errorf("error updating deployment: %v", err)
  2848  		}
  2849  
  2850  		if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil {
  2851  			return fmt.Errorf("error updating job summary: %v", err)
  2852  		}
  2853  
  2854  		if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil {
  2855  			return err
  2856  		}
  2857  
  2858  		if err := txn.Insert("allocs", alloc); err != nil {
  2859  			return fmt.Errorf("alloc insert failed: %v", err)
  2860  		}
  2861  
  2862  		if alloc.PreviousAllocation != "" {
  2863  			prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation)
  2864  			if err != nil {
  2865  				return fmt.Errorf("alloc lookup failed: %v", err)
  2866  			}
  2867  			existingPrevAlloc, _ := prevAlloc.(*structs.Allocation)
  2868  			if existingPrevAlloc != nil {
  2869  				prevAllocCopy := existingPrevAlloc.Copy()
  2870  				prevAllocCopy.NextAllocation = alloc.ID
  2871  				prevAllocCopy.ModifyIndex = index
  2872  				if err := txn.Insert("allocs", prevAllocCopy); err != nil {
  2873  					return fmt.Errorf("alloc insert failed: %v", err)
  2874  				}
  2875  			}
  2876  		}
  2877  
  2878  		// If the allocation is running, force the job to running status.
  2879  		forceStatus := ""
  2880  		if !alloc.TerminalStatus() {
  2881  			forceStatus = structs.JobStatusRunning
  2882  		}
  2883  
  2884  		tuple := structs.NamespacedID{
  2885  			ID:        alloc.JobID,
  2886  			Namespace: alloc.Namespace,
  2887  		}
  2888  		jobs[tuple] = forceStatus
  2889  	}
  2890  
  2891  	// Update the indexes
  2892  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2893  		return fmt.Errorf("index update failed: %v", err)
  2894  	}
  2895  
  2896  	// Set the job's status
  2897  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  2898  		return fmt.Errorf("setting job status failed: %v", err)
  2899  	}
  2900  
  2901  	return nil
  2902  }
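
// Editor's sketch (not part of the original file): the back-link maintained
// above. When an allocation names a PreviousAllocation, the older record is
// copied and its NextAllocation is pointed at the new one, so the chain of
// replacements can be walked in both directions.
func linkAllocationsSketch(prev, next *structs.Allocation, index uint64) *structs.Allocation {
	prevCopy := prev.Copy()
	prevCopy.NextAllocation = next.ID
	prevCopy.ModifyIndex = index
	return prevCopy
}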
  2903  
  2904  // UpdateAllocsDesiredTransitions is used to update a set of allocations
  2905  // desired transitions.
  2906  func (s *StateStore) UpdateAllocsDesiredTransitions(index uint64, allocs map[string]*structs.DesiredTransition,
  2907  	evals []*structs.Evaluation) error {
  2908  
  2909  	txn := s.db.Txn(true)
  2910  	defer txn.Abort()
  2911  
  2912  	// Handle each of the updated allocations
  2913  	for id, transition := range allocs {
  2914  		if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transition); err != nil {
  2915  			return err
  2916  		}
  2917  	}
  2918  
  2919  	for _, eval := range evals {
  2920  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  2921  			return err
  2922  		}
  2923  	}
  2924  
  2925  	// Update the indexes
  2926  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2927  		return fmt.Errorf("index update failed: %v", err)
  2928  	}
  2929  
  2930  	txn.Commit()
  2931  	return nil
  2932  }
  2933  
  2934  // nestedUpdateAllocDesiredTransition is used to nest an update of an
  2935  // allocation's desired transition
  2936  func (s *StateStore) nestedUpdateAllocDesiredTransition(
  2937  	txn *memdb.Txn, index uint64, allocID string,
  2938  	transition *structs.DesiredTransition) error {
  2939  
  2940  	// Look for existing alloc
  2941  	existing, err := txn.First("allocs", "id", allocID)
  2942  	if err != nil {
  2943  		return fmt.Errorf("alloc lookup failed: %v", err)
  2944  	}
  2945  
  2946  	// Nothing to do if this does not exist
  2947  	if existing == nil {
  2948  		return nil
  2949  	}
  2950  	exist := existing.(*structs.Allocation)
  2951  
  2952  	// Copy everything from the existing allocation
  2953  	copyAlloc := exist.Copy()
  2954  
  2955  	// Merge the desired transitions
  2956  	copyAlloc.DesiredTransition.Merge(transition)
  2957  
  2958  	// Update the modify index
  2959  	copyAlloc.ModifyIndex = index
  2960  
  2961  	// Update the allocation
  2962  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  2963  		return fmt.Errorf("alloc insert failed: %v", err)
  2964  	}
  2965  
  2966  	return nil
  2967  }
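
// Editor's sketch (not part of the original file): driving the update above
// for a batch of allocations, assuming s is an initialized *StateStore and
// that structs.DesiredTransition carries a Migrate *bool field (as used by
// Nomad's drain logic). Merge semantics are those of DesiredTransition.Merge.
func markAllocsForMigrationSketch(s *StateStore, index uint64, allocIDs []string) error {
	migrate := true
	transitions := make(map[string]*structs.DesiredTransition, len(allocIDs))
	for _, id := range allocIDs {
		transitions[id] = &structs.DesiredTransition{Migrate: &migrate}
	}
	return s.UpdateAllocsDesiredTransitions(index, transitions, nil)
}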
  2968  
  2969  // AllocByID is used to lookup an allocation by its ID
  2970  func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  2971  	txn := s.db.Txn(false)
  2972  
  2973  	watchCh, existing, err := txn.FirstWatch("allocs", "id", id)
  2974  	if err != nil {
  2975  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  2976  	}
  2977  
  2978  	ws.Add(watchCh)
  2979  
  2980  	if existing != nil {
  2981  		return existing.(*structs.Allocation), nil
  2982  	}
  2983  	return nil, nil
  2984  }
  2985  
  2986  // AllocsByIDPrefix is used to lookup allocs by prefix
  2987  func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2988  	txn := s.db.Txn(false)
  2989  
  2990  	iter, err := txn.Get("allocs", "id_prefix", id)
  2991  	if err != nil {
  2992  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  2993  	}
  2994  
  2995  	ws.Add(iter.WatchCh())
  2996  
  2997  	// Wrap the iterator in a filter
  2998  	wrap := memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace))
  2999  	return wrap, nil
  3000  }
  3001  
  3002  // allocNamespaceFilter returns a filter function that filters all allocations
  3003  // not in the given namespace.
  3004  func allocNamespaceFilter(namespace string) func(interface{}) bool {
  3005  	return func(raw interface{}) bool {
  3006  		alloc, ok := raw.(*structs.Allocation)
  3007  		if !ok {
  3008  			return true
  3009  		}
  3010  
  3011  		return alloc.Namespace != namespace
  3012  	}
  3013  }
  3014  
  3015  // AllocsByNode returns all the allocations by node
  3016  func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  3017  	txn := s.db.Txn(false)
  3018  
  3019  	// Get an iterator over the node allocations, using only the
  3020  	// node prefix which ignores the terminal status
  3021  	iter, err := txn.Get("allocs", "node_prefix", node)
  3022  	if err != nil {
  3023  		return nil, err
  3024  	}
  3025  
  3026  	ws.Add(iter.WatchCh())
  3027  
  3028  	var out []*structs.Allocation
  3029  	for {
  3030  		raw := iter.Next()
  3031  		if raw == nil {
  3032  			break
  3033  		}
  3034  		out = append(out, raw.(*structs.Allocation))
  3035  	}
  3036  	return out, nil
  3037  }
  3038  
  3039  // AllocsByNodeTerminal returns all the allocations by node and terminal status
  3040  func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) {
  3041  	txn := s.db.Txn(false)
  3042  
  3043  	// Get an iterator over the node allocations
  3044  	iter, err := txn.Get("allocs", "node", node, terminal)
  3045  	if err != nil {
  3046  		return nil, err
  3047  	}
  3048  
  3049  	ws.Add(iter.WatchCh())
  3050  
  3051  	var out []*structs.Allocation
  3052  	for {
  3053  		raw := iter.Next()
  3054  		if raw == nil {
  3055  			break
  3056  		}
  3057  		out = append(out, raw.(*structs.Allocation))
  3058  	}
  3059  	return out, nil
  3060  }
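
// Editor's sketch (not part of the original file): the "node" index above is
// compound (node ID plus terminal flag), while the "node_prefix" lookup in
// AllocsByNode matches on the node ID alone. This sketch fetches only the
// non-terminal allocations for a node, assuming s is an initialized
// *StateStore.
func runningAllocsOnNodeSketch(s *StateStore, nodeID string) ([]*structs.Allocation, error) {
	ws := memdb.NewWatchSet()
	return s.AllocsByNodeTerminal(ws, nodeID, false)
}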
  3061  
  3062  // AllocsByJob returns allocations by job id
  3063  func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) {
  3064  	txn := s.db.Txn(false)
  3065  
  3066  	// Get the job
  3067  	var job *structs.Job
  3068  	rawJob, err := txn.First("jobs", "id", namespace, jobID)
  3069  	if err != nil {
  3070  		return nil, err
  3071  	}
  3072  	if rawJob != nil {
  3073  		job = rawJob.(*structs.Job)
  3074  	}
  3075  
  3076  	// Get an iterator over the job's allocations
  3077  	iter, err := txn.Get("allocs", "job", namespace, jobID)
  3078  	if err != nil {
  3079  		return nil, err
  3080  	}
  3081  
  3082  	ws.Add(iter.WatchCh())
  3083  
  3084  	var out []*structs.Allocation
  3085  	for {
  3086  		raw := iter.Next()
  3087  		if raw == nil {
  3088  			break
  3089  		}
  3090  
  3091  		alloc := raw.(*structs.Allocation)
  3092  		// If the allocation belongs to a job with the same ID but a different
  3093  		// create index, and we were not asked for allocations from any create
  3094  		// index of the job, then we skip it
  3095  		if !anyCreateIndex && job != nil && alloc.Job.CreateIndex != job.CreateIndex {
  3096  			continue
  3097  		}
  3098  		out = append(out, alloc)
  3099  	}
  3100  	return out, nil
  3101  }
  3102  
  3103  // AllocsByEval returns all the allocations by eval id
  3104  func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) {
  3105  	txn := s.db.Txn(false)
  3106  
  3107  	// Get an iterator over the eval allocations
  3108  	iter, err := txn.Get("allocs", "eval", evalID)
  3109  	if err != nil {
  3110  		return nil, err
  3111  	}
  3112  
  3113  	ws.Add(iter.WatchCh())
  3114  
  3115  	var out []*structs.Allocation
  3116  	for {
  3117  		raw := iter.Next()
  3118  		if raw == nil {
  3119  			break
  3120  		}
  3121  		out = append(out, raw.(*structs.Allocation))
  3122  	}
  3123  	return out, nil
  3124  }
  3125  
  3126  // AllocsByDeployment returns all the allocations by deployment id
  3127  func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) {
  3128  	txn := s.db.Txn(false)
  3129  
  3130  	// Get an iterator over the deployments allocations
  3131  	iter, err := txn.Get("allocs", "deployment", deploymentID)
  3132  	if err != nil {
  3133  		return nil, err
  3134  	}
  3135  
  3136  	ws.Add(iter.WatchCh())
  3137  
  3138  	var out []*structs.Allocation
  3139  	for {
  3140  		raw := iter.Next()
  3141  		if raw == nil {
  3142  			break
  3143  		}
  3144  		out = append(out, raw.(*structs.Allocation))
  3145  	}
  3146  	return out, nil
  3147  }
  3148  
   3149  // Allocs returns an iterator over all the allocations.
  3150  func (s *StateStore) Allocs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3151  	txn := s.db.Txn(false)
  3152  
  3153  	// Walk the entire table
  3154  	iter, err := txn.Get("allocs", "id")
  3155  	if err != nil {
  3156  		return nil, err
  3157  	}
  3158  
  3159  	ws.Add(iter.WatchCh())
  3160  
  3161  	return iter, nil
  3162  }
  3163  
  3164  // AllocsByNamespace returns an iterator over all the allocations in the
  3165  // namespace
  3166  func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  3167  	txn := s.db.Txn(false)
  3168  	return s.allocsByNamespaceImpl(ws, txn, namespace)
  3169  }
  3170  
  3171  // allocsByNamespaceImpl returns an iterator over all the allocations in the
  3172  // namespace
  3173  func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *memdb.Txn, namespace string) (memdb.ResultIterator, error) {
   3174  	// Get an iterator over the namespace's allocations
  3175  	iter, err := txn.Get("allocs", "namespace", namespace)
  3176  	if err != nil {
  3177  		return nil, err
  3178  	}
  3179  
  3180  	ws.Add(iter.WatchCh())
  3181  
  3182  	return iter, nil
  3183  }
  3184  
   3185  // UpsertVaultAccessor is used to register a set of Vault accessors.
  3186  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  3187  	txn := s.db.Txn(true)
  3188  	defer txn.Abort()
  3189  
  3190  	for _, accessor := range accessors {
  3191  		// Set the create index
  3192  		accessor.CreateIndex = index
  3193  
  3194  		// Insert the accessor
  3195  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  3196  			return fmt.Errorf("accessor insert failed: %v", err)
  3197  		}
  3198  	}
  3199  
  3200  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  3201  		return fmt.Errorf("index update failed: %v", err)
  3202  	}
  3203  
  3204  	txn.Commit()
  3205  	return nil
  3206  }
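
         // Writes are applied by the FSM at a Raft index; each object is
         // stamped with that index and the table's entry in the "index" table
         // is advanced so blocking queries wake up. A sketch, assuming the
         // accessor came from a Vault token derivation (all field values
         // illustrative):
         //
         //	accessors := []*structs.VaultAccessor{{
         //		Accessor:   "8609694a-cdef-4e1f-9c9f-5c3c8e5b36a1",
         //		AllocID:    allocID,
         //		Task:       "web",
         //		NodeID:     nodeID,
         //		CreatorTTL: 3600,
         //	}}
         //	if err := store.UpsertVaultAccessor(raftIndex, accessors); err != nil {
         //		return err
         //	}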
  3207  
  3208  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  3209  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  3210  	txn := s.db.Txn(true)
  3211  	defer txn.Abort()
  3212  
   3213  	// Delete each of the given accessors
  3214  	for _, accessor := range accessors {
  3215  		// Delete the accessor
  3216  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  3217  			return fmt.Errorf("accessor delete failed: %v", err)
  3218  		}
  3219  	}
  3220  
  3221  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  3222  		return fmt.Errorf("index update failed: %v", err)
  3223  	}
  3224  
  3225  	txn.Commit()
  3226  	return nil
  3227  }
  3228  
  3229  // VaultAccessor returns the given Vault accessor
  3230  func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) {
  3231  	txn := s.db.Txn(false)
  3232  
  3233  	watchCh, existing, err := txn.FirstWatch("vault_accessors", "id", accessor)
  3234  	if err != nil {
  3235  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  3236  	}
  3237  
  3238  	ws.Add(watchCh)
  3239  
  3240  	if existing != nil {
  3241  		return existing.(*structs.VaultAccessor), nil
  3242  	}
  3243  
  3244  	return nil, nil
  3245  }
  3246  
  3247  // VaultAccessors returns an iterator of Vault accessors.
  3248  func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3249  	txn := s.db.Txn(false)
  3250  
  3251  	iter, err := txn.Get("vault_accessors", "id")
  3252  	if err != nil {
  3253  		return nil, err
  3254  	}
  3255  
  3256  	ws.Add(iter.WatchCh())
  3257  
  3258  	return iter, nil
  3259  }
  3260  
  3261  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  3262  func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) {
  3263  	txn := s.db.Txn(false)
  3264  
  3265  	// Get an iterator over the accessors
  3266  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  3267  	if err != nil {
  3268  		return nil, err
  3269  	}
  3270  
  3271  	ws.Add(iter.WatchCh())
  3272  
  3273  	var out []*structs.VaultAccessor
  3274  	for {
  3275  		raw := iter.Next()
  3276  		if raw == nil {
  3277  			break
  3278  		}
  3279  		out = append(out, raw.(*structs.VaultAccessor))
  3280  	}
  3281  	return out, nil
  3282  }
  3283  
  3284  // VaultAccessorsByNode returns all the Vault accessors by node id
  3285  func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) {
  3286  	txn := s.db.Txn(false)
  3287  
  3288  	// Get an iterator over the accessors
  3289  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  3290  	if err != nil {
  3291  		return nil, err
  3292  	}
  3293  
  3294  	ws.Add(iter.WatchCh())
  3295  
  3296  	var out []*structs.VaultAccessor
  3297  	for {
  3298  		raw := iter.Next()
  3299  		if raw == nil {
  3300  			break
  3301  		}
  3302  		out = append(out, raw.(*structs.VaultAccessor))
  3303  	}
  3304  	return out, nil
  3305  }
  3306  
  3307  func indexEntry(table string, index uint64) *IndexEntry {
  3308  	return &IndexEntry{
  3309  		Key:   table,
  3310  		Value: index,
  3311  	}
  3312  }
  3313  
  3314  const siTokenAccessorTable = "si_token_accessors"
  3315  
  3316  // UpsertSITokenAccessors is used to register a set of Service Identity token accessors.
  3317  func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  3318  	txn := s.db.Txn(true)
  3319  	defer txn.Abort()
  3320  
  3321  	for _, accessor := range accessors {
  3322  		// set the create index
  3323  		accessor.CreateIndex = index
  3324  
  3325  		// insert the accessor
  3326  		if err := txn.Insert(siTokenAccessorTable, accessor); err != nil {
  3327  			return errors.Wrap(err, "accessor insert failed")
  3328  		}
  3329  	}
  3330  
  3331  	// update the index for this table
  3332  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  3333  		return errors.Wrap(err, "index update failed")
  3334  	}
  3335  
  3336  	txn.Commit()
  3337  	return nil
  3338  }
  3339  
  3340  // DeleteSITokenAccessors is used to delete a set of Service Identity token accessors.
  3341  func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  3342  	txn := s.db.Txn(true)
  3343  	defer txn.Abort()
  3344  
  3345  	// Lookup each accessor
  3346  	for _, accessor := range accessors {
  3347  		// Delete the accessor
  3348  		if err := txn.Delete(siTokenAccessorTable, accessor); err != nil {
  3349  			return errors.Wrap(err, "accessor delete failed")
  3350  		}
  3351  	}
  3352  
  3353  	// update the index for this table
  3354  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  3355  		return errors.Wrap(err, "index update failed")
  3356  	}
  3357  
  3358  	txn.Commit()
  3359  	return nil
  3360  }
  3361  
  3362  // SITokenAccessor returns the given Service Identity token accessor.
  3363  func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) {
  3364  	txn := s.db.Txn(false)
  3365  	defer txn.Abort()
  3366  
  3367  	watchCh, existing, err := txn.FirstWatch(siTokenAccessorTable, "id", accessorID)
  3368  	if err != nil {
  3369  		return nil, errors.Wrap(err, "accessor lookup failed")
  3370  	}
  3371  
  3372  	ws.Add(watchCh)
  3373  
  3374  	if existing != nil {
  3375  		return existing.(*structs.SITokenAccessor), nil
  3376  	}
  3377  
  3378  	return nil, nil
  3379  }
  3380  
  3381  // SITokenAccessors returns an iterator of Service Identity token accessors.
  3382  func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3383  	txn := s.db.Txn(false)
  3384  	defer txn.Abort()
  3385  
  3386  	iter, err := txn.Get(siTokenAccessorTable, "id")
  3387  	if err != nil {
  3388  		return nil, err
  3389  	}
  3390  
  3391  	ws.Add(iter.WatchCh())
  3392  
  3393  	return iter, nil
  3394  }
  3395  
  3396  // SITokenAccessorsByAlloc returns all the Service Identity token accessors by alloc ID.
  3397  func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) {
  3398  	txn := s.db.Txn(false)
  3399  	defer txn.Abort()
  3400  
  3401  	// Get an iterator over the accessors
  3402  	iter, err := txn.Get(siTokenAccessorTable, "alloc_id", allocID)
  3403  	if err != nil {
  3404  		return nil, err
  3405  	}
  3406  
  3407  	ws.Add(iter.WatchCh())
  3408  
  3409  	var result []*structs.SITokenAccessor
  3410  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  3411  		result = append(result, raw.(*structs.SITokenAccessor))
  3412  	}
  3413  
  3414  	return result, nil
  3415  }
  3416  
  3417  // SITokenAccessorsByNode returns all the Service Identity token accessors by node ID.
  3418  func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) {
  3419  	txn := s.db.Txn(false)
  3420  	defer txn.Abort()
  3421  
  3422  	// Get an iterator over the accessors
  3423  	iter, err := txn.Get(siTokenAccessorTable, "node_id", nodeID)
  3424  	if err != nil {
  3425  		return nil, err
  3426  	}
  3427  
  3428  	ws.Add(iter.WatchCh())
  3429  
  3430  	var result []*structs.SITokenAccessor
  3431  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  3432  		result = append(result, raw.(*structs.SITokenAccessor))
  3433  	}
  3434  
  3435  	return result, nil
  3436  }
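
         // Sketch of how a garbage-collection pass for a down node might use
         // the two accessor tables together (hypothetical caller; the actual
         // revocation against Vault/Consul happens outside the state store):
         //
         //	ws := memdb.NewWatchSet()
         //	vaultAcc, _ := store.VaultAccessorsByNode(ws, nodeID)
         //	siAcc, _ := store.SITokenAccessorsByNode(ws, nodeID)
         //	// ...revoke the underlying tokens, then prune state:
         //	_ = store.DeleteVaultAccessors(raftIndex, vaultAcc)
         //	_ = store.DeleteSITokenAccessors(raftIndex, siAcc)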
  3437  
  3438  // UpdateDeploymentStatus is used to make deployment status updates and
   3439  // potentially create an evaluation.
  3440  func (s *StateStore) UpdateDeploymentStatus(index uint64, req *structs.DeploymentStatusUpdateRequest) error {
  3441  	txn := s.db.Txn(true)
  3442  	defer txn.Abort()
  3443  
  3444  	if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  3445  		return err
  3446  	}
  3447  
  3448  	// Upsert the job if necessary
  3449  	if req.Job != nil {
  3450  		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
  3451  			return err
  3452  		}
  3453  	}
  3454  
  3455  	// Upsert the optional eval
  3456  	if req.Eval != nil {
  3457  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  3458  			return err
  3459  		}
  3460  	}
  3461  
  3462  	txn.Commit()
  3463  	return nil
  3464  }
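
         // Sketch of a status-update request as the FSM might apply it
         // (deploymentID and eval are hypothetical):
         //
         //	req := &structs.DeploymentStatusUpdateRequest{
         //		DeploymentUpdate: &structs.DeploymentStatusUpdate{
         //			DeploymentID:      deploymentID,
         //			Status:            structs.DeploymentStatusFailed,
         //			StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
         //		},
         //		Eval: eval, // optional follow-up evaluation
         //	}
         //	err := store.UpdateDeploymentStatus(raftIndex, req)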
  3465  
  3466  // updateDeploymentStatusImpl is used to make deployment status updates
  3467  func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *memdb.Txn) error {
  3468  	// Retrieve deployment
  3469  	ws := memdb.NewWatchSet()
  3470  	deployment, err := s.deploymentByIDImpl(ws, u.DeploymentID, txn)
  3471  	if err != nil {
  3472  		return err
  3473  	} else if deployment == nil {
  3474  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID)
  3475  	} else if !deployment.Active() {
   3476  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  3477  	}
  3478  
  3479  	// Apply the new status
  3480  	copy := deployment.Copy()
  3481  	copy.Status = u.Status
  3482  	copy.StatusDescription = u.StatusDescription
  3483  	copy.ModifyIndex = index
  3484  
  3485  	// Insert the deployment
  3486  	if err := txn.Insert("deployment", copy); err != nil {
  3487  		return err
  3488  	}
  3489  
  3490  	// Update the index
  3491  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
  3492  		return fmt.Errorf("index update failed: %v", err)
  3493  	}
  3494  
  3495  	// If the deployment is being marked as complete, set the job to stable.
  3496  	if copy.Status == structs.DeploymentStatusSuccessful {
  3497  		if err := s.updateJobStabilityImpl(index, copy.Namespace, copy.JobID, copy.JobVersion, true, txn); err != nil {
  3498  			return fmt.Errorf("failed to update job stability: %v", err)
  3499  		}
  3500  	}
  3501  
  3502  	return nil
  3503  }
  3504  
  3505  // UpdateJobStability updates the stability of the given job and version to the
  3506  // desired status.
  3507  func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error {
  3508  	txn := s.db.Txn(true)
  3509  	defer txn.Abort()
  3510  
  3511  	if err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, txn); err != nil {
  3512  		return err
  3513  	}
  3514  
  3515  	txn.Commit()
  3516  	return nil
  3517  }
  3518  
  3519  // updateJobStabilityImpl updates the stability of the given job and version
  3520  func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *memdb.Txn) error {
  3521  	// Get the job that is referenced
  3522  	job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn)
  3523  	if err != nil {
  3524  		return err
  3525  	}
  3526  
  3527  	// Has already been cleared, nothing to do
  3528  	if job == nil {
  3529  		return nil
  3530  	}
  3531  
  3532  	// If the job already has the desired stability, nothing to do
  3533  	if job.Stable == stable {
  3534  		return nil
  3535  	}
  3536  
  3537  	copy := job.Copy()
  3538  	copy.Stable = stable
  3539  	return s.upsertJobImpl(index, copy, true, txn)
  3540  }
  3541  
  3542  // UpdateDeploymentPromotion is used to promote canaries in a deployment and
   3543  // potentially create an evaluation.
  3544  func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyDeploymentPromoteRequest) error {
  3545  	txn := s.db.Txn(true)
  3546  	defer txn.Abort()
  3547  
  3548  	// Retrieve deployment and ensure it is not terminal and is active
  3549  	ws := memdb.NewWatchSet()
  3550  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  3551  	if err != nil {
  3552  		return err
  3553  	} else if deployment == nil {
  3554  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  3555  	} else if !deployment.Active() {
   3556  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  3557  	}
  3558  
   3559  	// Retrieve the affected allocations
  3560  	iter, err := txn.Get("allocs", "deployment", req.DeploymentID)
  3561  	if err != nil {
  3562  		return err
  3563  	}
  3564  
  3565  	// groupIndex is a map of groups being promoted
  3566  	groupIndex := make(map[string]struct{}, len(req.Groups))
  3567  	for _, g := range req.Groups {
  3568  		groupIndex[g] = struct{}{}
  3569  	}
  3570  
  3571  	// canaryIndex is the set of placed canaries in the deployment
  3572  	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
  3573  	for _, state := range deployment.TaskGroups {
  3574  		for _, c := range state.PlacedCanaries {
  3575  			canaryIndex[c] = struct{}{}
  3576  		}
  3577  	}
  3578  
  3579  	// healthyCounts is a mapping of group to the number of healthy canaries
  3580  	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
  3581  
  3582  	// promotable is the set of allocations that we can move from canary to
  3583  	// non-canary
  3584  	var promotable []*structs.Allocation
  3585  
  3586  	for {
  3587  		raw := iter.Next()
  3588  		if raw == nil {
  3589  			break
  3590  		}
  3591  
  3592  		alloc := raw.(*structs.Allocation)
  3593  
  3594  		// Check that the alloc is a canary
  3595  		if _, ok := canaryIndex[alloc.ID]; !ok {
  3596  			continue
  3597  		}
  3598  
  3599  		// Check that the canary is part of a group being promoted
  3600  		if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok {
  3601  			continue
  3602  		}
  3603  
  3604  		// Ensure the canaries are healthy
  3605  		if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() {
  3606  			continue
  3607  		}
  3608  
  3609  		healthyCounts[alloc.TaskGroup]++
  3610  		promotable = append(promotable, alloc)
  3611  	}
  3612  
  3613  	// Determine if we have enough healthy allocations
  3614  	var unhealthyErr multierror.Error
  3615  	for tg, state := range deployment.TaskGroups {
  3616  		if _, ok := groupIndex[tg]; !req.All && !ok {
  3617  			continue
  3618  		}
  3619  
  3620  		need := state.DesiredCanaries
  3621  		if need == 0 {
  3622  			continue
  3623  		}
  3624  
  3625  		if have := healthyCounts[tg]; have < need {
  3626  			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
  3627  		}
  3628  	}
  3629  
  3630  	if err := unhealthyErr.ErrorOrNil(); err != nil {
  3631  		return err
  3632  	}
  3633  
  3634  	// Update deployment
  3635  	copy := deployment.Copy()
  3636  	copy.ModifyIndex = index
  3637  	for tg, status := range copy.TaskGroups {
  3638  		_, ok := groupIndex[tg]
  3639  		if !req.All && !ok {
  3640  			continue
  3641  		}
  3642  
  3643  		status.Promoted = true
  3644  	}
  3645  
  3646  	// If the deployment no longer needs promotion, update its status
  3647  	if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning {
  3648  		copy.StatusDescription = structs.DeploymentStatusDescriptionRunning
  3649  	}
  3650  
  3651  	// Insert the deployment
  3652  	if err := s.upsertDeploymentImpl(index, copy, txn); err != nil {
  3653  		return err
  3654  	}
  3655  
  3656  	// Upsert the optional eval
  3657  	if req.Eval != nil {
  3658  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  3659  			return err
  3660  		}
  3661  	}
  3662  
  3663  	// For each promotable allocation remove the canary field
  3664  	for _, alloc := range promotable {
  3665  		promoted := alloc.Copy()
  3666  		promoted.DeploymentStatus.Canary = false
  3667  		promoted.DeploymentStatus.ModifyIndex = index
  3668  		promoted.ModifyIndex = index
  3669  		promoted.AllocModifyIndex = index
  3670  
  3671  		if err := txn.Insert("allocs", promoted); err != nil {
  3672  			return fmt.Errorf("alloc insert failed: %v", err)
  3673  		}
  3674  	}
  3675  
  3676  	// Update the alloc index
  3677  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3678  		return fmt.Errorf("index update failed: %v", err)
  3679  	}
  3680  
  3681  	txn.Commit()
  3682  	return nil
  3683  }
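
         // Sketch of a promotion request; promoting only the "web" group
         // requires that group's canaries to be healthy, while All promotes
         // every group with canaries (values illustrative):
         //
         //	req := &structs.ApplyDeploymentPromoteRequest{
         //		DeploymentPromoteRequest: structs.DeploymentPromoteRequest{
         //			DeploymentID: deploymentID,
         //			Groups:       []string{"web"},
         //		},
         //		Eval: eval,
         //	}
         //	err := store.UpdateDeploymentPromotion(raftIndex, req)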
  3684  
  3685  // UpdateDeploymentAllocHealth is used to update the health of allocations as
   3686  // part of the deployment and potentially create an evaluation.
  3687  func (s *StateStore) UpdateDeploymentAllocHealth(index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error {
  3688  	txn := s.db.Txn(true)
  3689  	defer txn.Abort()
  3690  
  3691  	// Retrieve deployment and ensure it is not terminal and is active
  3692  	ws := memdb.NewWatchSet()
  3693  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  3694  	if err != nil {
  3695  		return err
  3696  	} else if deployment == nil {
  3697  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  3698  	} else if !deployment.Active() {
   3699  		return fmt.Errorf("Deployment %q has terminal status %q", deployment.ID, deployment.Status)
  3700  	}
  3701  
  3702  	// Update the health status of each allocation
  3703  	if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 {
  3704  		setAllocHealth := func(id string, healthy bool, ts time.Time) error {
  3705  			existing, err := txn.First("allocs", "id", id)
  3706  			if err != nil {
  3707  				return fmt.Errorf("alloc %q lookup failed: %v", id, err)
  3708  			}
  3709  			if existing == nil {
  3710  				return fmt.Errorf("unknown alloc %q", id)
  3711  			}
  3712  
  3713  			old := existing.(*structs.Allocation)
  3714  			if old.DeploymentID != req.DeploymentID {
  3715  				return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID)
  3716  			}
  3717  
  3718  			// Set the health
  3719  			copy := old.Copy()
  3720  			if copy.DeploymentStatus == nil {
  3721  				copy.DeploymentStatus = &structs.AllocDeploymentStatus{}
  3722  			}
  3723  			copy.DeploymentStatus.Healthy = helper.BoolToPtr(healthy)
  3724  			copy.DeploymentStatus.Timestamp = ts
  3725  			copy.DeploymentStatus.ModifyIndex = index
  3726  			copy.ModifyIndex = index
  3727  
  3728  			if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil {
  3729  				return fmt.Errorf("error updating deployment: %v", err)
  3730  			}
  3731  
  3732  			if err := txn.Insert("allocs", copy); err != nil {
  3733  				return fmt.Errorf("alloc insert failed: %v", err)
  3734  			}
  3735  
  3736  			return nil
  3737  		}
  3738  
  3739  		for _, id := range req.HealthyAllocationIDs {
  3740  			if err := setAllocHealth(id, true, req.Timestamp); err != nil {
  3741  				return err
  3742  			}
  3743  		}
  3744  		for _, id := range req.UnhealthyAllocationIDs {
  3745  			if err := setAllocHealth(id, false, req.Timestamp); err != nil {
  3746  				return err
  3747  			}
  3748  		}
  3749  
  3750  		// Update the indexes
  3751  		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3752  			return fmt.Errorf("index update failed: %v", err)
  3753  		}
  3754  	}
  3755  
  3756  	// Update the deployment status as needed.
  3757  	if req.DeploymentUpdate != nil {
  3758  		if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  3759  			return err
  3760  		}
  3761  	}
  3762  
  3763  	// Upsert the job if necessary
  3764  	if req.Job != nil {
  3765  		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
  3766  			return err
  3767  		}
  3768  	}
  3769  
  3770  	// Upsert the optional eval
  3771  	if req.Eval != nil {
  3772  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  3773  			return err
  3774  		}
  3775  	}
  3776  
  3777  	txn.Commit()
  3778  	return nil
  3779  }
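
         // Sketch of an alloc-health request; marking allocations unhealthy
         // typically arrives together with a DeploymentUpdate that fails the
         // deployment (IDs illustrative):
         //
         //	req := &structs.ApplyDeploymentAllocHealthRequest{
         //		DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
         //			DeploymentID:         deploymentID,
         //			HealthyAllocationIDs: []string{allocID},
         //		},
         //		Timestamp: time.Now(),
         //	}
         //	err := store.UpdateDeploymentAllocHealth(raftIndex, req)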
  3780  
   3781  // LatestIndex returns the greatest index value across all tables.
  3782  func (s *StateStore) LatestIndex() (uint64, error) {
  3783  	indexes, err := s.Indexes()
  3784  	if err != nil {
  3785  		return 0, err
  3786  	}
  3787  
  3788  	var max uint64 = 0
  3789  	for {
  3790  		raw := indexes.Next()
  3791  		if raw == nil {
  3792  			break
  3793  		}
  3794  
   3795  		// Extract the index entry
  3796  		idx := raw.(*IndexEntry)
  3797  
  3798  		// Determine the max
  3799  		if idx.Value > max {
  3800  			max = idx.Value
  3801  		}
  3802  	}
  3803  
  3804  	return max, nil
  3805  }
  3806  
  3807  // Index finds the matching index value
  3808  func (s *StateStore) Index(name string) (uint64, error) {
  3809  	txn := s.db.Txn(false)
  3810  
  3811  	// Lookup the first matching index
  3812  	out, err := txn.First("index", "id", name)
  3813  	if err != nil {
  3814  		return 0, err
  3815  	}
  3816  	if out == nil {
  3817  		return 0, nil
  3818  	}
  3819  	return out.(*IndexEntry).Value, nil
  3820  }
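
         // Per-table indexes are what make blocking queries work: an RPC
         // handler runs its query against a WatchSet, then reports the table's
         // index so clients can long-poll with a minimum index. A sketch of
         // the handler side (names illustrative):
         //
         //	out, err := store.AllocsByJob(ws, ns, jobID, false)
         //	if err != nil {
         //		return err
         //	}
         //	idx, err := store.Index("allocs")
         //	if err != nil {
         //		return err
         //	}
         //	reply.Allocations, reply.Index = out, idx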
  3821  
  3822  // RemoveIndex is a helper method to remove an index for testing purposes
  3823  func (s *StateStore) RemoveIndex(name string) error {
  3824  	txn := s.db.Txn(true)
  3825  	defer txn.Abort()
  3826  
  3827  	if _, err := txn.DeleteAll("index", "id", name); err != nil {
  3828  		return err
  3829  	}
  3830  
  3831  	txn.Commit()
  3832  	return nil
  3833  }
  3834  
  3835  // Indexes returns an iterator over all the indexes
  3836  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  3837  	txn := s.db.Txn(false)
  3838  
   3839  	// Walk the entire index table
  3840  	iter, err := txn.Get("index", "id")
  3841  	if err != nil {
  3842  		return nil, err
  3843  	}
  3844  	return iter, nil
  3845  }
  3846  
  3847  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  3848  // store
  3849  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  3850  	txn := s.db.Txn(true)
  3851  	defer txn.Abort()
  3852  
  3853  	// Get all the jobs
  3854  	iter, err := txn.Get("jobs", "id")
  3855  	if err != nil {
  3856  		return err
  3857  	}
  3858  	// COMPAT: Remove after 0.11
  3859  	// Iterate over jobs to build a list of parent jobs and their children
  3860  	parentMap := make(map[string][]*structs.Job)
  3861  	for {
  3862  		rawJob := iter.Next()
  3863  		if rawJob == nil {
  3864  			break
  3865  		}
  3866  		job := rawJob.(*structs.Job)
  3867  		if job.ParentID != "" {
  3868  			children := parentMap[job.ParentID]
  3869  			children = append(children, job)
  3870  			parentMap[job.ParentID] = children
  3871  		}
  3872  	}
  3873  
  3874  	// Get all the jobs again
  3875  	iter, err = txn.Get("jobs", "id")
  3876  	if err != nil {
  3877  		return err
  3878  	}
  3879  
  3880  	for {
  3881  		rawJob := iter.Next()
  3882  		if rawJob == nil {
  3883  			break
  3884  		}
  3885  		job := rawJob.(*structs.Job)
  3886  
  3887  		if job.IsParameterized() || job.IsPeriodic() {
  3888  			// COMPAT: Remove after 0.11
  3889  
  3890  			// The following block of code fixes incorrect child summaries due to a bug
  3891  			// See https://github.com/hashicorp/nomad/issues/3886 for details
  3892  			rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  3893  			if err != nil {
  3894  				return err
  3895  			}
  3896  			if rawSummary == nil {
  3897  				continue
  3898  			}
  3899  
  3900  			oldSummary := rawSummary.(*structs.JobSummary)
  3901  
  3902  			// Create an empty summary
  3903  			summary := &structs.JobSummary{
  3904  				JobID:     job.ID,
  3905  				Namespace: job.Namespace,
  3906  				Summary:   make(map[string]structs.TaskGroupSummary),
  3907  				Children:  &structs.JobChildrenSummary{},
  3908  			}
  3909  
   3910  			// Iterate over the children of this job, if any, to fix summary counts
  3911  			children := parentMap[job.ID]
  3912  			for _, childJob := range children {
  3913  				switch childJob.Status {
  3914  				case structs.JobStatusPending:
  3915  					summary.Children.Pending++
  3916  				case structs.JobStatusDead:
  3917  					summary.Children.Dead++
  3918  				case structs.JobStatusRunning:
  3919  					summary.Children.Running++
  3920  				}
  3921  			}
  3922  
   3923  			// Insert the job summary if it's different
  3924  			if !reflect.DeepEqual(summary, oldSummary) {
  3925  				// Set the create index of the summary same as the job's create index
  3926  				// and the modify index to the current index
  3927  				summary.CreateIndex = job.CreateIndex
  3928  				summary.ModifyIndex = index
  3929  
  3930  				if err := txn.Insert("job_summary", summary); err != nil {
  3931  					return fmt.Errorf("error inserting job summary: %v", err)
  3932  				}
  3933  			}
  3934  
  3935  			// Done with handling a parent job, continue to next
  3936  			continue
  3937  		}
  3938  
  3939  		// Create a job summary for the job
  3940  		summary := &structs.JobSummary{
  3941  			JobID:     job.ID,
  3942  			Namespace: job.Namespace,
  3943  			Summary:   make(map[string]structs.TaskGroupSummary),
  3944  		}
  3945  		for _, tg := range job.TaskGroups {
  3946  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  3947  		}
  3948  
   3949  		// Find all the allocations for the job
  3950  		iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  3951  		if err != nil {
  3952  			return err
  3953  		}
  3954  
  3955  		// Calculate the summary for the job
  3956  		for {
  3957  			rawAlloc := iterAllocs.Next()
  3958  			if rawAlloc == nil {
  3959  				break
  3960  			}
  3961  			alloc := rawAlloc.(*structs.Allocation)
  3962  
  3963  			// Ignore the allocation if it doesn't belong to the currently
  3964  			// registered job. The allocation is checked because of issue #2304
  3965  			if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex {
  3966  				continue
  3967  			}
  3968  
  3969  			tg := summary.Summary[alloc.TaskGroup]
  3970  			switch alloc.ClientStatus {
  3971  			case structs.AllocClientStatusFailed:
  3972  				tg.Failed += 1
  3973  			case structs.AllocClientStatusLost:
  3974  				tg.Lost += 1
  3975  			case structs.AllocClientStatusComplete:
  3976  				tg.Complete += 1
  3977  			case structs.AllocClientStatusRunning:
  3978  				tg.Running += 1
  3979  			case structs.AllocClientStatusPending:
  3980  				tg.Starting += 1
  3981  			default:
  3982  				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
  3983  			}
  3984  			summary.Summary[alloc.TaskGroup] = tg
  3985  		}
  3986  
  3987  		// Set the create index of the summary same as the job's create index
  3988  		// and the modify index to the current index
  3989  		summary.CreateIndex = job.CreateIndex
  3990  		summary.ModifyIndex = index
  3991  
  3992  		// Insert the job summary
  3993  		if err := txn.Insert("job_summary", summary); err != nil {
  3994  			return fmt.Errorf("error inserting job summary: %v", err)
  3995  		}
  3996  	}
  3997  
  3998  	// Update the indexes table for job summary
  3999  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4000  		return fmt.Errorf("index update failed: %v", err)
  4001  	}
  4002  	txn.Commit()
  4003  	return nil
  4004  }
  4005  
  4006  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
   4007  // It takes a map of job IDs to an optional forceStatus string. Jobs that no
   4008  // longer exist are skipped; an error is returned if setJobStatus fails.
  4009  func (s *StateStore) setJobStatuses(index uint64, txn *memdb.Txn,
  4010  	jobs map[structs.NamespacedID]string, evalDelete bool) error {
  4011  	for tuple, forceStatus := range jobs {
  4012  
  4013  		existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID)
  4014  		if err != nil {
  4015  			return fmt.Errorf("job lookup failed: %v", err)
  4016  		}
  4017  
  4018  		if existing == nil {
  4019  			continue
  4020  		}
  4021  
  4022  		if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  4023  			return err
  4024  		}
  4025  	}
  4026  
  4027  	return nil
  4028  }
  4029  
  4030  // setJobStatus sets the status of the job by looking up associated evaluations
  4031  // and allocations. evalDelete should be set to true if setJobStatus is being
  4032  // called because an evaluation is being deleted (potentially because of garbage
  4033  // collection). If forceStatus is non-empty, the job's status will be set to the
  4034  // passed status.
  4035  func (s *StateStore) setJobStatus(index uint64, txn *memdb.Txn,
  4036  	job *structs.Job, evalDelete bool, forceStatus string) error {
  4037  
  4038  	// Capture the current status so we can check if there is a change
  4039  	oldStatus := job.Status
  4040  	if index == job.CreateIndex {
  4041  		oldStatus = ""
  4042  	}
  4043  	newStatus := forceStatus
  4044  
   4045  	// If forceStatus is not set, compute the job's status.
  4046  	if forceStatus == "" {
  4047  		var err error
  4048  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  4049  		if err != nil {
  4050  			return err
  4051  		}
  4052  	}
  4053  
  4054  	// Fast-path if nothing has changed.
  4055  	if oldStatus == newStatus {
  4056  		return nil
  4057  	}
  4058  
  4059  	// Copy and update the existing job
  4060  	updated := job.Copy()
  4061  	updated.Status = newStatus
  4062  	updated.ModifyIndex = index
  4063  
  4064  	// Insert the job
  4065  	if err := txn.Insert("jobs", updated); err != nil {
  4066  		return fmt.Errorf("job insert failed: %v", err)
  4067  	}
  4068  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  4069  		return fmt.Errorf("index update failed: %v", err)
  4070  	}
  4071  
  4072  	// Update the children summary
  4073  	if updated.ParentID != "" {
  4074  		// Try to update the summary of the parent job summary
  4075  		summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID)
  4076  		if err != nil {
  4077  			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  4078  		}
  4079  
   4080  		// Only continue if the summary exists. It may not exist if the parent
   4081  		// job was removed
  4082  		if summaryRaw != nil {
  4083  			existing := summaryRaw.(*structs.JobSummary)
  4084  			pSummary := existing.Copy()
  4085  			if pSummary.Children == nil {
  4086  				pSummary.Children = new(structs.JobChildrenSummary)
  4087  			}
  4088  
  4089  			// Determine the transition and update the correct fields
  4090  			children := pSummary.Children
  4091  
  4092  			// Decrement old status
  4093  			if oldStatus != "" {
  4094  				switch oldStatus {
  4095  				case structs.JobStatusPending:
  4096  					children.Pending--
  4097  				case structs.JobStatusRunning:
  4098  					children.Running--
  4099  				case structs.JobStatusDead:
  4100  					children.Dead--
  4101  				default:
  4102  					return fmt.Errorf("unknown old job status %q", oldStatus)
  4103  				}
  4104  			}
  4105  
  4106  			// Increment new status
  4107  			switch newStatus {
  4108  			case structs.JobStatusPending:
  4109  				children.Pending++
  4110  			case structs.JobStatusRunning:
  4111  				children.Running++
  4112  			case structs.JobStatusDead:
  4113  				children.Dead++
  4114  			default:
  4115  				return fmt.Errorf("unknown new job status %q", newStatus)
  4116  			}
  4117  
  4118  			// Update the index
  4119  			pSummary.ModifyIndex = index
  4120  
  4121  			// Insert the summary
  4122  			if err := txn.Insert("job_summary", pSummary); err != nil {
  4123  				return fmt.Errorf("job summary insert failed: %v", err)
  4124  			}
  4125  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4126  				return fmt.Errorf("index update failed: %v", err)
  4127  			}
  4128  		}
  4129  	}
  4130  
  4131  	return nil
  4132  }
  4133  
  4134  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  4135  	// System, Periodic and Parameterized jobs are running until explicitly
  4136  	// stopped
  4137  	if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() {
  4138  		if job.Stop {
  4139  			return structs.JobStatusDead, nil
  4140  		}
  4141  
  4142  		return structs.JobStatusRunning, nil
  4143  	}
  4144  
  4145  	allocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  4146  	if err != nil {
  4147  		return "", err
  4148  	}
  4149  
  4150  	// If there is a non-terminal allocation, the job is running.
  4151  	hasAlloc := false
  4152  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  4153  		hasAlloc = true
  4154  		if !alloc.(*structs.Allocation).TerminalStatus() {
  4155  			return structs.JobStatusRunning, nil
  4156  		}
  4157  	}
  4158  
  4159  	evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID)
  4160  	if err != nil {
  4161  		return "", err
  4162  	}
  4163  
  4164  	hasEval := false
  4165  	for raw := evals.Next(); raw != nil; raw = evals.Next() {
  4166  		e := raw.(*structs.Evaluation)
  4167  
  4168  		// Filter non-exact matches
  4169  		if e.JobID != job.ID {
  4170  			continue
  4171  		}
  4172  
  4173  		hasEval = true
  4174  		if !e.TerminalStatus() {
  4175  			return structs.JobStatusPending, nil
  4176  		}
  4177  	}
  4178  
  4179  	// The job is dead if all the allocations and evals are terminal or if there
  4180  	// are no evals because of garbage collection.
  4181  	if evalDelete || hasEval || hasAlloc {
  4182  		return structs.JobStatusDead, nil
  4183  	}
  4184  
  4185  	return structs.JobStatusPending, nil
  4186  }
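
         // In summary, for jobs that are not system, periodic, or
         // parameterized, getJobStatus derives (sketch of the logic above):
         //
         //	any non-terminal allocation           -> running
         //	else any non-terminal evaluation      -> pending
         //	else allocs/evals exist, all terminal -> dead
         //	else evals GC'd (evalDelete == true)  -> dead
         //	else (brand new job)                  -> pending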
  4187  
  4188  // updateSummaryWithJob creates or updates job summaries when new jobs are
  4189  // upserted or existing ones are updated
  4190  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  4191  	txn *memdb.Txn) error {
  4192  
  4193  	// Update the job summary
  4194  	summaryRaw, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  4195  	if err != nil {
  4196  		return fmt.Errorf("job summary lookup failed: %v", err)
  4197  	}
  4198  
  4199  	// Get the summary or create if necessary
  4200  	var summary *structs.JobSummary
  4201  	hasSummaryChanged := false
  4202  	if summaryRaw != nil {
  4203  		summary = summaryRaw.(*structs.JobSummary).Copy()
  4204  	} else {
  4205  		summary = &structs.JobSummary{
  4206  			JobID:       job.ID,
  4207  			Namespace:   job.Namespace,
  4208  			Summary:     make(map[string]structs.TaskGroupSummary),
  4209  			Children:    new(structs.JobChildrenSummary),
  4210  			CreateIndex: index,
  4211  		}
  4212  		hasSummaryChanged = true
  4213  	}
  4214  
  4215  	for _, tg := range job.TaskGroups {
  4216  		if _, ok := summary.Summary[tg.Name]; !ok {
  4217  			newSummary := structs.TaskGroupSummary{
  4218  				Complete: 0,
  4219  				Failed:   0,
  4220  				Running:  0,
  4221  				Starting: 0,
  4222  			}
  4223  			summary.Summary[tg.Name] = newSummary
  4224  			hasSummaryChanged = true
  4225  		}
  4226  	}
  4227  
   4228  	// If the job summary has changed, update the modify index and persist it.
  4229  	if hasSummaryChanged {
  4230  		summary.ModifyIndex = index
  4231  
  4232  		// Update the indexes table for job summary
  4233  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4234  			return fmt.Errorf("index update failed: %v", err)
  4235  		}
  4236  		if err := txn.Insert("job_summary", summary); err != nil {
  4237  			return err
  4238  		}
  4239  	}
  4240  
  4241  	return nil
  4242  }
  4243  
  4244  // updateJobScalingPolicies upserts any scaling policies contained in the job and removes
  4245  // any previous scaling policies that were removed from the job
  4246  func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *memdb.Txn) error {
  4247  
  4248  	ws := memdb.NewWatchSet()
  4249  
  4250  	scalingPolicies := job.GetScalingPolicies()
  4251  	newTargets := map[string]struct{}{}
  4252  	for _, p := range scalingPolicies {
  4253  		newTargets[p.Target[structs.ScalingTargetGroup]] = struct{}{}
  4254  	}
  4255  	// find existing policies that need to be deleted
  4256  	deletedPolicies := []string{}
  4257  	iter, err := s.ScalingPoliciesByJobTxn(ws, job.Namespace, job.ID, txn)
  4258  	if err != nil {
  4259  		return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err)
  4260  	}
  4261  	for {
  4262  		raw := iter.Next()
  4263  		if raw == nil {
  4264  			break
  4265  		}
  4266  		oldPolicy := raw.(*structs.ScalingPolicy)
  4267  		if _, ok := newTargets[oldPolicy.Target[structs.ScalingTargetGroup]]; !ok {
  4268  			deletedPolicies = append(deletedPolicies, oldPolicy.ID)
  4269  		}
  4270  	}
  4271  	err = s.DeleteScalingPoliciesTxn(index, deletedPolicies, txn)
  4272  	if err != nil {
  4273  		return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err)
  4274  	}
  4275  
  4276  	err = s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn)
  4277  	if err != nil {
  4278  		return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err)
  4279  	}
  4280  
  4281  	return nil
  4282  }
  4283  
  4284  // updateDeploymentWithAlloc is used to update the deployment state associated
  4285  // with the given allocation. The passed alloc may be updated if the deployment
  4286  // status has changed to capture the modify index at which it has changed.
  4287  func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *memdb.Txn) error {
  4288  	// Nothing to do if the allocation is not associated with a deployment
  4289  	if alloc.DeploymentID == "" {
  4290  		return nil
  4291  	}
  4292  
  4293  	// Get the deployment
  4294  	ws := memdb.NewWatchSet()
  4295  	deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn)
  4296  	if err != nil {
  4297  		return err
  4298  	}
  4299  	if deployment == nil {
  4300  		return nil
  4301  	}
  4302  
  4303  	// Retrieve the deployment state object
  4304  	_, ok := deployment.TaskGroups[alloc.TaskGroup]
  4305  	if !ok {
  4306  		// If the task group isn't part of the deployment, the task group wasn't
  4307  		// part of a rolling update so nothing to do
  4308  		return nil
  4309  	}
  4310  
  4311  	// Do not modify in-place. Instead keep track of what must be done
  4312  	placed := 0
  4313  	healthy := 0
  4314  	unhealthy := 0
  4315  
   4316  	// If there was no existing allocation, this is a placement and we
   4317  	// increment the placed count
  4318  	existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth()
  4319  	allocHealthSet := alloc.DeploymentStatus.HasHealth()
  4320  	if existing == nil || existing.DeploymentID != alloc.DeploymentID {
  4321  		placed++
  4322  	} else if !existingHealthSet && allocHealthSet {
  4323  		if *alloc.DeploymentStatus.Healthy {
  4324  			healthy++
  4325  		} else {
  4326  			unhealthy++
  4327  		}
  4328  	} else if existingHealthSet && allocHealthSet {
  4329  		// See if it has gone from healthy to unhealthy
  4330  		if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy {
  4331  			healthy--
  4332  			unhealthy++
  4333  		}
  4334  	}
  4335  
  4336  	// Nothing to do
  4337  	if placed == 0 && healthy == 0 && unhealthy == 0 {
  4338  		return nil
  4339  	}
  4340  
  4341  	// Update the allocation's deployment status modify index
  4342  	if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 {
  4343  		alloc.DeploymentStatus.ModifyIndex = index
  4344  	}
  4345  
  4346  	// Create a copy of the deployment object
  4347  	deploymentCopy := deployment.Copy()
  4348  	deploymentCopy.ModifyIndex = index
  4349  
  4350  	state := deploymentCopy.TaskGroups[alloc.TaskGroup]
  4351  	state.PlacedAllocs += placed
  4352  	state.HealthyAllocs += healthy
  4353  	state.UnhealthyAllocs += unhealthy
  4354  
  4355  	// Ensure PlacedCanaries accurately reflects the alloc canary status
  4356  	if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
  4357  		found := false
  4358  		for _, canary := range state.PlacedCanaries {
  4359  			if alloc.ID == canary {
  4360  				found = true
  4361  				break
  4362  			}
  4363  		}
  4364  		if !found {
  4365  			state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
  4366  		}
  4367  	}
  4368  
  4369  	// Update the progress deadline
  4370  	if pd := state.ProgressDeadline; pd != 0 {
   4371  		// If this is the first placed allocation for the deployment, start the progress deadline.
  4372  		if placed != 0 && state.RequireProgressBy.IsZero() {
  4373  			// Use modify time instead of create time because we may in-place
  4374  			// update the allocation to be part of a new deployment.
  4375  			state.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd)
  4376  		} else if healthy != 0 {
  4377  			if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(state.RequireProgressBy) {
  4378  				state.RequireProgressBy = d
  4379  			}
  4380  		}
  4381  	}
  4382  
  4383  	// Upsert the deployment
  4384  	if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil {
  4385  		return err
  4386  	}
  4387  
  4388  	return nil
  4389  }
  4390  
  4391  // updateSummaryWithAlloc updates the job summary when allocations are updated
  4392  // or inserted
  4393  func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
  4394  	existingAlloc *structs.Allocation, txn *memdb.Txn) error {
  4395  
  4396  	// We don't have to update the summary if the job is missing
  4397  	if alloc.Job == nil {
  4398  		return nil
  4399  	}
  4400  
  4401  	summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID)
  4402  	if err != nil {
  4403  		return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err)
  4404  	}
  4405  
  4406  	if summaryRaw == nil {
  4407  		// Check if the job is de-registered
  4408  		rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID)
  4409  		if err != nil {
  4410  			return fmt.Errorf("unable to query job: %v", err)
  4411  		}
  4412  
   4413  		// If the job is de-registered then we skip updating its summary
  4414  		if rawJob == nil {
  4415  			return nil
  4416  		}
  4417  
  4418  		return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace)
  4419  	}
  4420  
  4421  	// Get a copy of the existing summary
  4422  	jobSummary := summaryRaw.(*structs.JobSummary).Copy()
  4423  
  4424  	// Not updating the job summary because the allocation doesn't belong to the
  4425  	// currently registered job
  4426  	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
  4427  		return nil
  4428  	}
  4429  
  4430  	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
  4431  	if !ok {
  4432  		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
  4433  	}
  4434  
  4435  	summaryChanged := false
  4436  	if existingAlloc == nil {
  4437  		switch alloc.DesiredStatus {
  4438  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
  4439  			s.logger.Error("new allocation inserted into state store with bad desired status",
  4440  				"alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
  4441  		}
  4442  		switch alloc.ClientStatus {
  4443  		case structs.AllocClientStatusPending:
  4444  			tgSummary.Starting += 1
  4445  			if tgSummary.Queued > 0 {
  4446  				tgSummary.Queued -= 1
  4447  			}
  4448  			summaryChanged = true
  4449  		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
  4450  			structs.AllocClientStatusComplete:
  4451  			s.logger.Error("new allocation inserted into state store with bad client status",
  4452  				"alloc_id", alloc.ID, "client_status", alloc.ClientStatus)
  4453  		}
  4454  	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
   4455  		// Increment the count of the bin for the current client status
  4456  		switch alloc.ClientStatus {
  4457  		case structs.AllocClientStatusRunning:
  4458  			tgSummary.Running += 1
  4459  		case structs.AllocClientStatusFailed:
  4460  			tgSummary.Failed += 1
  4461  		case structs.AllocClientStatusPending:
  4462  			tgSummary.Starting += 1
  4463  		case structs.AllocClientStatusComplete:
  4464  			tgSummary.Complete += 1
  4465  		case structs.AllocClientStatusLost:
  4466  			tgSummary.Lost += 1
  4467  		}
  4468  
   4469  		// Decrement the count of the bin for the previous client status
  4470  		switch existingAlloc.ClientStatus {
  4471  		case structs.AllocClientStatusRunning:
  4472  			if tgSummary.Running > 0 {
  4473  				tgSummary.Running -= 1
  4474  			}
  4475  		case structs.AllocClientStatusPending:
  4476  			if tgSummary.Starting > 0 {
  4477  				tgSummary.Starting -= 1
  4478  			}
  4479  		case structs.AllocClientStatusLost:
  4480  			if tgSummary.Lost > 0 {
  4481  				tgSummary.Lost -= 1
  4482  			}
  4483  		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
  4484  		default:
   4485  			s.logger.Error("invalid old client status for allocation",
  4486  				"alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus)
  4487  		}
  4488  		summaryChanged = true
  4489  	}
  4490  	jobSummary.Summary[alloc.TaskGroup] = tgSummary
  4491  
  4492  	if summaryChanged {
  4493  		jobSummary.ModifyIndex = index
  4494  
  4495  		// Update the indexes table for job summary
  4496  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4497  			return fmt.Errorf("index update failed: %v", err)
  4498  		}
  4499  
  4500  		if err := txn.Insert("job_summary", jobSummary); err != nil {
  4501  			return fmt.Errorf("updating job summary failed: %v", err)
  4502  		}
  4503  	}
  4504  
  4505  	return nil
  4506  }
  4507  
  4508  // UpsertACLPolicies is used to create or update a set of ACL policies
  4509  func (s *StateStore) UpsertACLPolicies(index uint64, policies []*structs.ACLPolicy) error {
  4510  	txn := s.db.Txn(true)
  4511  	defer txn.Abort()
  4512  
  4513  	for _, policy := range policies {
  4514  		// Ensure the policy hash is non-nil. This should be done outside the state store
  4515  		// for performance reasons, but we check here for defense in depth.
  4516  		if len(policy.Hash) == 0 {
  4517  			policy.SetHash()
  4518  		}
  4519  
  4520  		// Check if the policy already exists
  4521  		existing, err := txn.First("acl_policy", "id", policy.Name)
  4522  		if err != nil {
  4523  			return fmt.Errorf("policy lookup failed: %v", err)
  4524  		}
  4525  
  4526  		// Update all the indexes
  4527  		if existing != nil {
  4528  			policy.CreateIndex = existing.(*structs.ACLPolicy).CreateIndex
  4529  			policy.ModifyIndex = index
  4530  		} else {
  4531  			policy.CreateIndex = index
  4532  			policy.ModifyIndex = index
  4533  		}
  4534  
  4535  		// Update the policy
  4536  		if err := txn.Insert("acl_policy", policy); err != nil {
  4537  			return fmt.Errorf("upserting policy failed: %v", err)
  4538  		}
  4539  	}
  4540  
   4541  	// Update the indexes table
  4542  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  4543  		return fmt.Errorf("index update failed: %v", err)
  4544  	}
  4545  
  4546  	txn.Commit()
  4547  	return nil
  4548  }
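
         // Sketch of upserting a policy; computing the hash up front avoids
         // relying on the defensive SetHash call above (rule text
         // illustrative):
         //
         //	policy := &structs.ACLPolicy{
         //		Name:        "readonly",
         //		Description: "Read-only access to the default namespace",
         //		Rules:       `namespace "default" { policy = "read" }`,
         //	}
         //	policy.SetHash()
         //	err := store.UpsertACLPolicies(raftIndex, []*structs.ACLPolicy{policy})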
  4549  
  4550  // DeleteACLPolicies deletes the policies with the given names
  4551  func (s *StateStore) DeleteACLPolicies(index uint64, names []string) error {
  4552  	txn := s.db.Txn(true)
  4553  	defer txn.Abort()
  4554  
  4555  	// Delete the policy
  4556  	for _, name := range names {
  4557  		if _, err := txn.DeleteAll("acl_policy", "id", name); err != nil {
  4558  			return fmt.Errorf("deleting acl policy failed: %v", err)
  4559  		}
  4560  	}
  4561  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  4562  		return fmt.Errorf("index update failed: %v", err)
  4563  	}
  4564  	txn.Commit()
  4565  	return nil
  4566  }
  4567  
  4568  // ACLPolicyByName is used to lookup a policy by name
  4569  func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
  4570  	txn := s.db.Txn(false)
  4571  
  4572  	watchCh, existing, err := txn.FirstWatch("acl_policy", "id", name)
  4573  	if err != nil {
  4574  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  4575  	}
  4576  	ws.Add(watchCh)
  4577  
  4578  	if existing != nil {
  4579  		return existing.(*structs.ACLPolicy), nil
  4580  	}
  4581  	return nil, nil
  4582  }
  4583  
  4584  // ACLPolicyByNamePrefix is used to lookup policies by prefix
  4585  func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  4586  	txn := s.db.Txn(false)
  4587  
  4588  	iter, err := txn.Get("acl_policy", "id_prefix", prefix)
  4589  	if err != nil {
  4590  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  4591  	}
  4592  	ws.Add(iter.WatchCh())
  4593  
  4594  	return iter, nil
  4595  }
  4596  
  4597  // ACLPolicies returns an iterator over all the acl policies
  4598  func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4599  	txn := s.db.Txn(false)
  4600  
  4601  	// Walk the entire table
  4602  	iter, err := txn.Get("acl_policy", "id")
  4603  	if err != nil {
  4604  		return nil, err
  4605  	}
  4606  	ws.Add(iter.WatchCh())
  4607  	return iter, nil
  4608  }
  4609  
  4610  // UpsertACLTokens is used to create or update a set of ACL tokens
  4611  func (s *StateStore) UpsertACLTokens(index uint64, tokens []*structs.ACLToken) error {
  4612  	txn := s.db.Txn(true)
  4613  	defer txn.Abort()
  4614  
  4615  	for _, token := range tokens {
   4616  		// Ensure the token hash is non-nil. This should be done outside the state store
  4617  		// for performance reasons, but we check here for defense in depth.
  4618  		if len(token.Hash) == 0 {
  4619  			token.SetHash()
  4620  		}
  4621  
  4622  		// Check if the token already exists
  4623  		existing, err := txn.First("acl_token", "id", token.AccessorID)
  4624  		if err != nil {
  4625  			return fmt.Errorf("token lookup failed: %v", err)
  4626  		}
  4627  
  4628  		// Update all the indexes
  4629  		if existing != nil {
  4630  			existTK := existing.(*structs.ACLToken)
  4631  			token.CreateIndex = existTK.CreateIndex
  4632  			token.ModifyIndex = index
  4633  
  4634  			// Do not allow SecretID or create time to change
  4635  			token.SecretID = existTK.SecretID
  4636  			token.CreateTime = existTK.CreateTime
  4637  
  4638  		} else {
  4639  			token.CreateIndex = index
  4640  			token.ModifyIndex = index
  4641  		}
  4642  
  4643  		// Update the token
  4644  		if err := txn.Insert("acl_token", token); err != nil {
  4645  			return fmt.Errorf("upserting token failed: %v", err)
  4646  		}
  4647  	}
  4648  
  4649  	// Update the indexes table
  4650  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4651  		return fmt.Errorf("index update failed: %v", err)
  4652  	}
  4653  	txn.Commit()
  4654  	return nil
  4655  }
  4656  
  4657  // DeleteACLTokens deletes the tokens with the given accessor ids
  4658  func (s *StateStore) DeleteACLTokens(index uint64, ids []string) error {
  4659  	txn := s.db.Txn(true)
  4660  	defer txn.Abort()
  4661  
  4662  	// Delete the tokens
  4663  	for _, id := range ids {
  4664  		if _, err := txn.DeleteAll("acl_token", "id", id); err != nil {
  4665  			return fmt.Errorf("deleting acl token failed: %v", err)
  4666  		}
  4667  	}
  4668  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4669  		return fmt.Errorf("index update failed: %v", err)
  4670  	}
  4671  	txn.Commit()
  4672  	return nil
  4673  }
  4674  
  4675  // ACLTokenByAccessorID is used to lookup a token by accessor ID
  4676  func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
  4677  	if id == "" {
  4678  		return nil, fmt.Errorf("acl token lookup failed: missing accessor id")
  4679  	}
  4680  
  4681  	txn := s.db.Txn(false)
  4682  
  4683  	watchCh, existing, err := txn.FirstWatch("acl_token", "id", id)
  4684  	if err != nil {
  4685  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4686  	}
  4687  	ws.Add(watchCh)
  4688  
  4689  	if existing != nil {
  4690  		return existing.(*structs.ACLToken), nil
  4691  	}
  4692  	return nil, nil
  4693  }
  4694  
  4695  // ACLTokenBySecretID is used to lookup a token by secret ID
  4696  func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) {
  4697  	if secretID == "" {
  4698  		return nil, fmt.Errorf("acl token lookup failed: missing secret id")
  4699  	}
  4700  
  4701  	txn := s.db.Txn(false)
  4702  
  4703  	watchCh, existing, err := txn.FirstWatch("acl_token", "secret", secretID)
  4704  	if err != nil {
  4705  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4706  	}
  4707  	ws.Add(watchCh)
  4708  
  4709  	if existing != nil {
  4710  		return existing.(*structs.ACLToken), nil
  4711  	}
  4712  	return nil, nil
  4713  }
  4714  
  4715  // ACLTokenByAccessorIDPrefix is used to lookup tokens by prefix
  4716  func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  4717  	txn := s.db.Txn(false)
  4718  
  4719  	iter, err := txn.Get("acl_token", "id_prefix", prefix)
  4720  	if err != nil {
  4721  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4722  	}
  4723  	ws.Add(iter.WatchCh())
  4724  	return iter, nil
  4725  }
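
        // Hypothetical usage sketch: draining the prefix iterator into a slice. A nil
        // WatchSet is safe here (WatchSet.Add is a no-op on nil), matching how this
        // file itself passes nil elsewhere.
        func exampleTokensByAccessorPrefix(s *StateStore, prefix string) ([]*structs.ACLToken, error) {
        	iter, err := s.ACLTokenByAccessorIDPrefix(nil, prefix)
        	if err != nil {
        		return nil, err
        	}
        	var tokens []*structs.ACLToken
        	for raw := iter.Next(); raw != nil; raw = iter.Next() {
        		tokens = append(tokens, raw.(*structs.ACLToken))
        	}
        	return tokens, nil
        }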
  4726  
  4727  // ACLTokens returns an iterator over all the tokens
  4728  func (s *StateStore) ACLTokens(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4729  	txn := s.db.Txn(false)
  4730  
  4731  	// Walk the entire table
  4732  	iter, err := txn.Get("acl_token", "id")
  4733  	if err != nil {
  4734  		return nil, err
  4735  	}
  4736  	ws.Add(iter.WatchCh())
  4737  	return iter, nil
  4738  }
  4739  
  4740  // ACLTokensByGlobal returns an iterator over all the tokens filtered by global value
  4741  func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool) (memdb.ResultIterator, error) {
  4742  	txn := s.db.Txn(false)
  4743  
  4744  	// Walk the entire table
  4745  	iter, err := txn.Get("acl_token", "global", globalVal)
  4746  	if err != nil {
  4747  		return nil, err
  4748  	}
  4749  	ws.Add(iter.WatchCh())
  4750  	return iter, nil
  4751  }
  4752  
  4753  // CanBootstrapACLToken checks if bootstrapping is possible and returns the reset index
  4754  func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) {
  4755  	txn := s.db.Txn(false)
  4756  
  4757  	// Lookup the bootstrap sentinel
  4758  	out, err := txn.First("index", "id", "acl_token_bootstrap")
  4759  	if err != nil {
  4760  		return false, 0, err
  4761  	}
  4762  
  4763  	// No entry, we haven't bootstrapped yet
  4764  	if out == nil {
  4765  		return true, 0, nil
  4766  	}
  4767  
  4768  	// Return the reset index if we've already bootstrapped
  4769  	return false, out.(*IndexEntry).Value, nil
  4770  }
  4771  
  4772  // BootstrapACLTokens is used to create an initial ACL token
  4773  func (s *StateStore) BootstrapACLTokens(index, resetIndex uint64, token *structs.ACLToken) error {
  4774  	txn := s.db.Txn(true)
  4775  	defer txn.Abort()
  4776  
  4777  	// Check if we have already done a bootstrap
  4778  	existing, err := txn.First("index", "id", "acl_token_bootstrap")
  4779  	if err != nil {
  4780  		return fmt.Errorf("bootstrap check failed: %v", err)
  4781  	}
  4782  	if existing != nil {
  4783  		if resetIndex == 0 {
  4784  			return fmt.Errorf("ACL bootstrap already done")
  4785  		} else if resetIndex != existing.(*IndexEntry).Value {
  4786  			return fmt.Errorf("Invalid reset index for ACL bootstrap")
  4787  		}
  4788  	}
  4789  
  4790  	// Update the Create/Modify time
  4791  	token.CreateIndex = index
  4792  	token.ModifyIndex = index
  4793  
  4794  	// Insert the token
  4795  	if err := txn.Insert("acl_token", token); err != nil {
  4796  		return fmt.Errorf("upserting token failed: %v", err)
  4797  	}
  4798  
  4799  	// Update the indexes table; this prevents future bootstraps until a reset
  4800  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4801  		return fmt.Errorf("index update failed: %v", err)
  4802  	}
  4803  	if err := txn.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil {
  4804  		return fmt.Errorf("index update failed: %v", err)
  4805  	}
  4806  	txn.Commit()
  4807  	return nil
  4808  }
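
        // Hypothetical usage sketch: the two-step bootstrap flow implied by the pair of
        // methods above. A first bootstrap passes resetIndex == 0; re-bootstrapping must
        // echo back the reset index reported by CanBootstrapACLToken. raftIndex and
        // token are made-up inputs.
        func exampleBootstrapACL(s *StateStore, raftIndex uint64, token *structs.ACLToken) error {
        	ok, resetIdx, err := s.CanBootstrapACLToken()
        	if err != nil {
        		return err
        	}
        	if ok {
        		// Never bootstrapped: no reset index is needed.
        		return s.BootstrapACLTokens(raftIndex, 0, token)
        	}
        	// Already bootstrapped: a reset must supply the recorded index.
        	return s.BootstrapACLTokens(raftIndex, resetIdx, token)
        }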
  4809  
  4810  // SchedulerConfig is used to get the current Scheduler configuration.
  4811  func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) {
  4812  	tx := s.db.Txn(false)
  4813  	defer tx.Abort()
  4814  
  4815  	// Get the scheduler config
  4816  	c, err := tx.First("scheduler_config", "id")
  4817  	if err != nil {
  4818  		return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err)
  4819  	}
  4820  
  4821  	config, ok := c.(*structs.SchedulerConfiguration)
  4822  	if !ok {
  4823  		return 0, nil, nil
  4824  	}
  4825  
  4826  	return config.ModifyIndex, config, nil
  4827  }
  4828  
  4829  // SchedulerSetConfig is used to set the current Scheduler configuration.
  4830  func (s *StateStore) SchedulerSetConfig(idx uint64, config *structs.SchedulerConfiguration) error {
  4831  	tx := s.db.Txn(true)
  4832  	defer tx.Abort()
  4833  
  4834  	if err := s.schedulerSetConfigTxn(idx, tx, config); err != nil {
        		return err
        	}
  4835  
  4836  	tx.Commit()
  4837  	return nil
  4838  }
  4839  
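        // ClusterMetadata returns the cluster metadata, or nil if none has been set.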
  4840  func (s *StateStore) ClusterMetadata() (*structs.ClusterMetadata, error) {
  4841  	txn := s.db.Txn(false)
  4842  	defer txn.Abort()
  4843  
  4844  	// Get the cluster metadata
  4845  	m, err := txn.First("cluster_meta", "id")
  4846  	if err != nil {
  4847  		return nil, errors.Wrap(err, "failed cluster metadata lookup")
  4848  	}
  4849  
  4850  	if m != nil {
  4851  		return m.(*structs.ClusterMetadata), nil
  4852  	}
  4853  
  4854  	return nil, nil
  4855  }
  4856  
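        // ClusterSetMetadata stores the cluster metadata within a new write transaction.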
  4857  func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error {
  4858  	txn := s.db.Txn(true)
  4859  	defer txn.Abort()
  4860  
  4861  	if err := s.setClusterMetadata(txn, meta); err != nil {
  4862  		return errors.Wrap(err, "set cluster metadata failed")
  4863  	}
  4864  
  4865  	txn.Commit()
  4866  	return nil
  4867  }
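
        // Hypothetical usage sketch: seeding the cluster ID once. ClusterID and
        // CreateTime are assumed fields of structs.ClusterMetadata; a later call with a
        // different ID is rejected by setClusterMetadata below.
        func exampleSeedClusterMetadata(s *StateStore, index uint64, id string) error {
        	meta := &structs.ClusterMetadata{
        		ClusterID:  id,
        		CreateTime: time.Now().UnixNano(),
        	}
        	return s.ClusterSetMetadata(index, meta)
        }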
  4868  
  4869  // WithWriteTransaction executes the passed function within a write transaction,
  4870  // and returns its result.  If the invocation returns no error, the transaction
  4871  // is committed; otherwise, it's aborted.
  4872  func (s *StateStore) WithWriteTransaction(fn func(Txn) error) error {
  4873  	tx := s.db.Txn(true)
  4874  	defer tx.Abort()
  4875  
  4876  	err := fn(tx)
  4877  	if err == nil {
  4878  		tx.Commit()
  4879  	}
  4880  	return err
  4881  }
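
        // Hypothetical usage sketch: composing two of the *Txn helpers from this file
        // inside one write transaction so both updates commit or abort together. The
        // index, policies, and IDs are made-up inputs.
        func exampleAtomicScalingUpdate(s *StateStore, index uint64,
        	upserts []*structs.ScalingPolicy, deleteIDs []string) error {

        	return s.WithWriteTransaction(func(tx Txn) error {
        		if err := s.UpsertScalingPoliciesTxn(index, upserts, tx); err != nil {
        			return err
        		}
        		return s.DeleteScalingPoliciesTxn(index, deleteIDs, tx)
        	})
        }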
  4882  
  4883  // SchedulerCASConfig is used to update the scheduler configuration with a
  4884  // given Raft index. If the CAS index specified is not equal to the last observed index
  4885  // for the config, then the call is a noop.
  4886  func (s *StateStore) SchedulerCASConfig(idx, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) {
  4887  	tx := s.db.Txn(true)
  4888  	defer tx.Abort()
  4889  
  4890  	// Check for an existing config
  4891  	existing, err := tx.First("scheduler_config", "id")
  4892  	if err != nil {
  4893  		return false, fmt.Errorf("failed scheduler config lookup: %s", err)
  4894  	}
  4895  
  4896  	// If the existing index does not match the provided CAS
  4897  	// index arg, then we shouldn't update anything and can safely
  4898  	// return early here.
  4899  	e, ok := existing.(*structs.SchedulerConfiguration)
  4900  	if !ok || (e != nil && e.ModifyIndex != cidx) {
  4901  		return false, nil
  4902  	}
  4903  
  4904  	if err := s.schedulerSetConfigTxn(idx, tx, config); err != nil {
        		return false, err
        	}
  4905  
  4906  	tx.Commit()
  4907  	return true, nil
  4908  }
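
        // Hypothetical usage sketch: a read-modify-write of the scheduler configuration
        // via the CAS variant. PreemptionConfig.SystemSchedulerEnabled is an assumed
        // field of structs.SchedulerConfiguration; raftIndex is a made-up input.
        func exampleToggleSystemPreemption(s *StateStore, raftIndex uint64) (bool, error) {
        	modifyIdx, cfg, err := s.SchedulerConfig()
        	if err != nil || cfg == nil {
        		return false, err
        	}
        	updated := *cfg // shallow copy is enough for this sketch
        	updated.PreemptionConfig.SystemSchedulerEnabled = true
        	// Returns false, and writes nothing, if another writer raced us.
        	return s.SchedulerCASConfig(raftIndex, modifyIdx, &updated)
        }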
  4909  
  4910  func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *memdb.Txn, config *structs.SchedulerConfiguration) error {
  4911  	// Check for an existing config
  4912  	existing, err := tx.First("scheduler_config", "id")
  4913  	if err != nil {
  4914  		return fmt.Errorf("failed scheduler config lookup: %s", err)
  4915  	}
  4916  
  4917  	// Set the indexes.
  4918  	if existing != nil {
  4919  		config.CreateIndex = existing.(*structs.SchedulerConfiguration).CreateIndex
  4920  	} else {
  4921  		config.CreateIndex = idx
  4922  	}
  4923  	config.ModifyIndex = idx
  4924  
  4925  	if err := tx.Insert("scheduler_config", config); err != nil {
  4926  		return fmt.Errorf("failed updating scheduler config: %s", err)
  4927  	}
  4928  	return nil
  4929  }
  4930  
  4931  func (s *StateStore) setClusterMetadata(txn *memdb.Txn, meta *structs.ClusterMetadata) error {
  4932  	// Check for existing metadata; if found, sanity check that the cluster ID matches
  4933  	existing, err := txn.First("cluster_meta", "id")
  4934  	if err != nil {
  4935  		return fmt.Errorf("failed cluster meta lookup: %v", err)
  4936  	}
  4937  
  4938  	if existing != nil {
  4939  		existingClusterID := existing.(*structs.ClusterMetadata).ClusterID
  4940  		if meta.ClusterID != existingClusterID {
  4941  			// there is a bug in cluster ID detection
  4942  			return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID)
  4943  		}
  4944  	}
  4945  
  4946  	// the update is technically a noop unless more or mutable fields are added someday
  4947  	if err := txn.Insert("cluster_meta", meta); err != nil {
  4948  		return fmt.Errorf("set cluster metadata failed: %v", err)
  4949  	}
  4950  
  4951  	return nil
  4952  }
  4953  
  4954  // UpsertScalingPolicies is used to insert a set of scaling policies.
  4955  func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error {
  4956  	txn := s.db.Txn(true)
  4957  	defer txn.Abort()
  4958  
  4959  	if err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn); err != nil {
  4960  		return err
  4961  	}
  4962  
  4963  	txn.Commit()
  4964  	return nil
  4965  }
  4966  
  4967  // UpsertScalingPoliciesTxn is used to insert scaling policies within an existing transaction.
  4968  func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy,
  4969  	txn *memdb.Txn) error {
  4970  
  4971  	hadUpdates := false
  4972  
  4973  	for _, policy := range scalingPolicies {
  4974  		// Check if the scaling policy already exists
  4975  		existing, err := txn.First("scaling_policy", "target",
  4976  			policy.Target[structs.ScalingTargetNamespace],
  4977  			policy.Target[structs.ScalingTargetJob],
  4978  			policy.Target[structs.ScalingTargetGroup])
  4979  		if err != nil {
  4980  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  4981  		}
  4982  
  4983  		// Setup the indexes correctly
  4984  		if existing != nil {
  4985  			p := existing.(*structs.ScalingPolicy)
  4986  			if !p.Diff(policy) {
  4987  				continue
  4988  			}
  4989  			policy.ID = p.ID
  4990  			policy.CreateIndex = p.CreateIndex
  4991  			policy.ModifyIndex = index
  4992  		} else {
  4993  			// policy.ID must have been set already in Job.Register before log apply
  4994  			policy.CreateIndex = index
  4995  			policy.ModifyIndex = index
  4996  		}
  4997  
  4998  		// Insert the scaling policy
  4999  		hadUpdates = true
  5000  		if err := txn.Insert("scaling_policy", policy); err != nil {
  5001  			return err
  5002  		}
  5003  	}
  5004  
  5005  	// Update the indexes table for scaling policy
  5006  	if hadUpdates {
  5007  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  5008  			return fmt.Errorf("index update failed: %v", err)
  5009  		}
  5010  	}
  5011  
  5012  	return nil
  5013  }
  5014  
  5015  func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error {
  5016  	txn := s.db.Txn(true)
  5017  	defer txn.Abort()
  5018  
  5019  	err := s.DeleteScalingPoliciesTxn(index, ids, txn)
  5020  	if err == nil {
  5021  		txn.Commit()
  5022  	}
  5023  
  5024  	return err
  5025  }
  5026  
  5027  // DeleteScalingPoliciesTxn is used to delete a set of scaling policies by ID within an existing transaction
  5028  func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *memdb.Txn) error {
  5029  	if len(ids) == 0 {
  5030  		return nil
  5031  	}
  5032  
  5033  	for _, id := range ids {
  5034  		// Lookup the scaling policy
  5035  		existing, err := txn.First("scaling_policy", "id", id)
  5036  		if err != nil {
  5037  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  5038  		}
  5039  		if existing == nil {
  5040  			return fmt.Errorf("scaling policy not found")
  5041  		}
  5042  
  5043  		// Delete the scaling policy
  5044  		if err := txn.Delete("scaling_policy", existing); err != nil {
  5045  			return fmt.Errorf("scaling policy delete failed: %v", err)
  5046  		}
  5047  	}
  5048  
  5049  	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  5050  		return fmt.Errorf("index update failed: %v", err)
  5051  	}
  5052  
  5053  	return nil
  5054  }
  5055  
  5056  // ScalingPolicies returns an iterator over all the scaling policies
  5057  func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  5058  	txn := s.db.Txn(false)
  5059  
  5060  	// Walk the entire scaling_policy table
  5061  	iter, err := txn.Get("scaling_policy", "id")
  5062  	if err != nil {
  5063  		return nil, err
  5064  	}
  5065  
  5066  	ws.Add(iter.WatchCh())
  5067  
  5068  	return iter, nil
  5069  }
  5070  
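        // ScalingPoliciesByNamespace returns an iterator over all scaling policies in the given namespace.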
  5071  func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  5072  	txn := s.db.Txn(false)
  5073  
  5074  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace)
  5075  	if err != nil {
  5076  		return nil, err
  5077  	}
  5078  
  5079  	ws.Add(iter.WatchCh())
  5080  	return iter, nil
  5081  }
  5082  
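        // ScalingPoliciesByJob returns an iterator over all scaling policies targeting the given job.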
  5083  func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID string) (memdb.ResultIterator, error) {
  5084  	txn := s.db.Txn(false)
  5085  	return s.ScalingPoliciesByJobTxn(ws, namespace, jobID, txn)
  5086  }
  5087  
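        // ScalingPoliciesByJobTxn is the transaction-scoped variant of ScalingPoliciesByJob.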
  5088  func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string,
  5089  	txn *memdb.Txn) (memdb.ResultIterator, error) {
  5090  
  5091  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID)
  5092  	if err != nil {
  5093  		return nil, err
  5094  	}
  5095  
  5096  	ws.Add(iter.WatchCh())
  5097  	return iter, nil
  5098  }
  5099  
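        // ScalingPolicyByID returns the scaling policy with the given ID, or nil if none exists.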
  5100  func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) {
  5101  	txn := s.db.Txn(false)
  5102  
  5103  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "id", id)
  5104  	if err != nil {
  5105  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  5106  	}
  5107  	ws.Add(watchCh)
  5108  
  5109  	if existing != nil {
  5110  		return existing.(*structs.ScalingPolicy), nil
  5111  	}
  5112  
  5113  	return nil, nil
  5114  }
  5115  
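        // ScalingPolicyByTarget returns the scaling policy for the given target, or nil if none exists.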
  5116  func (s *StateStore) ScalingPolicyByTarget(ws memdb.WatchSet, target map[string]string) (*structs.ScalingPolicy,
  5117  	error) {
  5118  	txn := s.db.Txn(false)
  5119  
  5120  	// currently, the only scaling policy target type is a task group
  5121  	namespace := target[structs.ScalingTargetNamespace]
  5122  	job := target[structs.ScalingTargetJob]
  5123  	group := target[structs.ScalingTargetGroup]
  5124  
  5125  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "target", namespace, job, group)
  5126  	if err != nil {
  5127  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  5128  	}
  5129  	ws.Add(watchCh)
  5130  
  5131  	if existing != nil {
  5132  		return existing.(*structs.ScalingPolicy), nil
  5133  	}
  5134  
  5135  	return nil, nil
  5136  }
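
        // Hypothetical usage sketch: building the target map from the constants already
        // used above and fetching the group's policy. The namespace, job, and group
        // values are made up; a nil WatchSet skips change tracking.
        func exampleScalingPolicyForGroup(s *StateStore) (*structs.ScalingPolicy, error) {
        	target := map[string]string{
        		structs.ScalingTargetNamespace: "default",
        		structs.ScalingTargetJob:       "web",
        		structs.ScalingTargetGroup:     "frontend",
        	}
        	return s.ScalingPolicyByTarget(nil, target)
        }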
  5137  
  5138  // StateSnapshot is used to provide a point-in-time snapshot
  5139  type StateSnapshot struct {
  5140  	StateStore
  5141  }
  5142  
  5143  // DenormalizeAllocationsMap takes a map of node IDs to allocations, looks up the
  5144  // stored Allocation for each allocation diff, merges the updated attributes into
  5145  // it, and attaches the Job provided
  5146  func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error {
  5147  	for nodeID, allocs := range nodeAllocations {
  5148  		denormalizedAllocs, err := s.DenormalizeAllocationSlice(allocs)
  5149  		if err != nil {
  5150  			return err
  5151  		}
  5152  
  5153  		nodeAllocations[nodeID] = denormalizedAllocs
  5154  	}
  5155  	return nil
  5156  }
  5157  
  5158  // DenormalizeAllocationSlice reduces each allocation to its diff, looks up the
  5159  // stored Allocation for each diff, merges the updated attributes into it, and
  5160  // attaches the Job provided.
  5161  //
  5162  // This should only be called on terminal allocs, particularly stopped or preempted allocs
  5163  func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) {
  5164  	allocDiffs := make([]*structs.AllocationDiff, len(allocs))
  5165  	for i, alloc := range allocs {
  5166  		allocDiffs[i] = alloc.AllocationDiff()
  5167  	}
  5168  
  5169  	return s.DenormalizeAllocationDiffSlice(allocDiffs)
  5170  }
  5171  
  5172  // DenormalizeAllocationDiffSlice looks up the stored Allocation for each
  5173  // AllocationDiff, merges the updated attributes into it, and attaches the Job provided.
  5174  //
  5175  // This should only be called on terminal allocs, particularly stopped or preempted allocs
  5176  func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) {
  5177  	// Output index for denormalized Allocations
  5178  	j := 0
  5179  
  5180  	denormalizedAllocs := make([]*structs.Allocation, len(allocDiffs))
  5181  	for _, allocDiff := range allocDiffs {
  5182  		alloc, err := s.AllocByID(nil, allocDiff.ID)
  5183  		if err != nil {
  5184  			return nil, fmt.Errorf("alloc lookup failed: %v", err)
  5185  		}
  5186  		if alloc == nil {
  5187  			return nil, fmt.Errorf("alloc %v doesn't exist", allocDiff.ID)
  5188  		}
  5189  
  5190  		// Merge the updates into the Allocation. Don't update alloc.Job for terminal allocs,
  5191  		// so the alloc keeps the latest Job view from before destruction and handlers stay simple.
  5192  		allocCopy := alloc.Copy()
  5193  
  5194  		if allocDiff.PreemptedByAllocation != "" {
  5195  			allocCopy.PreemptedByAllocation = allocDiff.PreemptedByAllocation
  5196  			allocCopy.DesiredDescription = getPreemptedAllocDesiredDescription(allocDiff.PreemptedByAllocation)
  5197  			allocCopy.DesiredStatus = structs.AllocDesiredStatusEvict
  5198  		} else {
  5199  			// If alloc is a stopped alloc
  5200  			allocCopy.DesiredDescription = allocDiff.DesiredDescription
  5201  			allocCopy.DesiredStatus = structs.AllocDesiredStatusStop
  5202  			if allocDiff.ClientStatus != "" {
  5203  				allocCopy.ClientStatus = allocDiff.ClientStatus
  5204  			}
  5205  		}
  5206  		if allocDiff.ModifyTime != 0 {
  5207  			allocCopy.ModifyTime = allocDiff.ModifyTime
  5208  		}
  5209  
  5210  		// Update the allocDiff in the slice to equal the denormalized alloc
  5211  		denormalizedAllocs[j] = allocCopy
  5212  		j++
  5213  	}
  5214  	// Retain only the denormalized Allocations in the slice
  5215  	denormalizedAllocs = denormalizedAllocs[:j]
  5216  	return denormalizedAllocs, nil
  5217  }
  5218  
  5219  func getPreemptedAllocDesiredDescription(preemptedByAllocID string) string {
  5220  	return fmt.Sprintf("Preempted by alloc ID %v", preemptedByAllocID)
  5221  }
  5222  
  5223  // StateRestore is used to optimize performance when restoring state,
  5224  // by using a single large transaction instead of thousands of
  5225  // sub-transactions
  5226  type StateRestore struct {
  5227  	txn *memdb.Txn
  5228  }
  5229  
  5230  // Abort is used to abort the restore operation
  5231  func (s *StateRestore) Abort() {
  5232  	s.txn.Abort()
  5233  }
  5234  
  5235  // Commit is used to commit the restore operation
  5236  func (s *StateRestore) Commit() {
  5237  	s.txn.Commit()
  5238  }
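
        // Hypothetical usage sketch (same package): driving a StateRestore by hand over
        // a single large write transaction, the way the snapshot restore path does. The
        // input slices are made-up data; the restore helpers used are defined below.
        func exampleRestoreSnapshot(s *StateStore, nodes []*structs.Node, jobs []*structs.Job) error {
        	r := &StateRestore{txn: s.db.Txn(true)}
        	for _, node := range nodes {
        		if err := r.NodeRestore(node); err != nil {
        			r.Abort()
        			return err
        		}
        	}
        	for _, job := range jobs {
        		if err := r.JobRestore(job); err != nil {
        			r.Abort()
        			return err
        		}
        	}
        	r.Commit()
        	return nil
        }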
  5239  
  5240  // NodeRestore is used to restore a node
  5241  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  5242  	if err := r.txn.Insert("nodes", node); err != nil {
  5243  		return fmt.Errorf("node insert failed: %v", err)
  5244  	}
  5245  	return nil
  5246  }
  5247  
  5248  // JobRestore is used to restore a job
  5249  func (r *StateRestore) JobRestore(job *structs.Job) error {
  5250  	if err := r.txn.Insert("jobs", job); err != nil {
  5251  		return fmt.Errorf("job insert failed: %v", err)
  5252  	}
  5253  	return nil
  5254  }
  5255  
  5256  // EvalRestore is used to restore an evaluation
  5257  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  5258  	if err := r.txn.Insert("evals", eval); err != nil {
  5259  		return fmt.Errorf("eval insert failed: %v", err)
  5260  	}
  5261  	return nil
  5262  }
  5263  
  5264  // AllocRestore is used to restore an allocation
  5265  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  5266  	if err := r.txn.Insert("allocs", alloc); err != nil {
  5267  		return fmt.Errorf("alloc insert failed: %v", err)
  5268  	}
  5269  	return nil
  5270  }
  5271  
  5272  // IndexRestore is used to restore an index
  5273  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  5274  	if err := r.txn.Insert("index", idx); err != nil {
  5275  		return fmt.Errorf("index insert failed: %v", err)
  5276  	}
  5277  	return nil
  5278  }
  5279  
  5280  // PeriodicLaunchRestore is used to restore a periodic launch.
  5281  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  5282  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  5283  		return fmt.Errorf("periodic launch insert failed: %v", err)
  5284  	}
  5285  	return nil
  5286  }
  5287  
  5288  // JobSummaryRestore is used to restore a job summary
  5289  func (r *StateRestore) JobSummaryRestore(jobSummary *structs.JobSummary) error {
  5290  	if err := r.txn.Insert("job_summary", jobSummary); err != nil {
  5291  		return fmt.Errorf("job summary insert failed: %v", err)
  5292  	}
  5293  	return nil
  5294  }
  5295  
  5296  // JobVersionRestore is used to restore a job version
  5297  func (r *StateRestore) JobVersionRestore(version *structs.Job) error {
  5298  	if err := r.txn.Insert("job_version", version); err != nil {
  5299  		return fmt.Errorf("job version insert failed: %v", err)
  5300  	}
  5301  	return nil
  5302  }
  5303  
  5304  // DeploymentRestore is used to restore a deployment
  5305  func (r *StateRestore) DeploymentRestore(deployment *structs.Deployment) error {
  5306  	if err := r.txn.Insert("deployment", deployment); err != nil {
  5307  		return fmt.Errorf("deployment insert failed: %v", err)
  5308  	}
  5309  	return nil
  5310  }
  5311  
  5312  // VaultAccessorRestore is used to restore a vault accessor
  5313  func (r *StateRestore) VaultAccessorRestore(accessor *structs.VaultAccessor) error {
  5314  	if err := r.txn.Insert("vault_accessors", accessor); err != nil {
  5315  		return fmt.Errorf("vault accessor insert failed: %v", err)
  5316  	}
  5317  	return nil
  5318  }
  5319  
  5320  // SITokenAccessorRestore is used to restore an SI token accessor
  5321  func (r *StateRestore) SITokenAccessorRestore(accessor *structs.SITokenAccessor) error {
  5322  	if err := r.txn.Insert(siTokenAccessorTable, accessor); err != nil {
  5323  		return errors.Wrap(err, "si token accessor insert failed")
  5324  	}
  5325  	return nil
  5326  }
  5327  
  5328  // ACLPolicyRestore is used to restore an ACL policy
  5329  func (r *StateRestore) ACLPolicyRestore(policy *structs.ACLPolicy) error {
  5330  	if err := r.txn.Insert("acl_policy", policy); err != nil {
  5331  		return fmt.Errorf("inserting acl policy failed: %v", err)
  5332  	}
  5333  	return nil
  5334  }
  5335  
  5336  // ACLTokenRestore is used to restore an ACL token
  5337  func (r *StateRestore) ACLTokenRestore(token *structs.ACLToken) error {
  5338  	if err := r.txn.Insert("acl_token", token); err != nil {
  5339  		return fmt.Errorf("inserting acl token failed: %v", err)
  5340  	}
  5341  	return nil
  5342  }
  5343  
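        // SchedulerConfigRestore is used to restore the scheduler configuration.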
  5344  func (r *StateRestore) SchedulerConfigRestore(schedConfig *structs.SchedulerConfiguration) error {
  5345  	if err := r.txn.Insert("scheduler_config", schedConfig); err != nil {
  5346  		return fmt.Errorf("inserting scheduler config failed: %s", err)
  5347  	}
  5348  	return nil
  5349  }
  5350  
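        // ClusterMetadataRestore is used to restore the cluster metadata.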
  5351  func (r *StateRestore) ClusterMetadataRestore(meta *structs.ClusterMetadata) error {
  5352  	if err := r.txn.Insert("cluster_meta", meta); err != nil {
  5353  		return fmt.Errorf("inserting cluster meta failed: %v", err)
  5354  	}
  5355  	return nil
  5356  }
  5357  
  5358  // ScalingPolicyRestore is used to restore a scaling policy
  5359  func (r *StateRestore) ScalingPolicyRestore(scalingPolicy *structs.ScalingPolicy) error {
  5360  	if err := r.txn.Insert("scaling_policy", scalingPolicy); err != nil {
  5361  		return fmt.Errorf("scaling policy insert failed: %v", err)
  5362  	}
  5363  	return nil
  5364  }
  5365  
  5366  // CSIPluginRestore is used to restore a CSI plugin
  5367  func (r *StateRestore) CSIPluginRestore(plugin *structs.CSIPlugin) error {
  5368  	if err := r.txn.Insert("csi_plugins", plugin); err != nil {
  5369  		return fmt.Errorf("csi plugin insert failed: %v", err)
  5370  	}
  5371  	return nil
  5372  }
  5373  
  5374  // CSIVolumeRestore is used to restore a CSI volume
  5375  func (r *StateRestore) CSIVolumeRestore(volume *structs.CSIVolume) error {
  5376  	if err := r.txn.Insert("csi_volumes", volume); err != nil {
  5377  		return fmt.Errorf("csi volume insert failed: %v", err)
  5378  	}
  5379  	return nil
  5380  }
  5381  
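        // ScalingEventsRestore is used to restore a job's scaling events.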
  5382  func (r *StateRestore) ScalingEventsRestore(jobEvents *structs.JobScalingEvents) error {
  5383  	if err := r.txn.Insert("scaling_event", jobEvents); err != nil {
  5384  		return fmt.Errorf("scaling event insert failed: %v", err)
  5385  	}
  5386  	return nil
  5387  }