
     1  package state
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"reflect"
     7  	"sort"
     8  	"time"
    10  	log ""
    11  	memdb ""
    12  	multierror ""
    13  	""
    15  	""
    16  	""
    17  )
// Txn is a transaction against a state store.
// This can be a read or write transaction. It is an alias for *memdb.Txn, so
// values of the two types are interchangeable.
type Txn = *memdb.Txn
const (
	// NodeRegisterEventRegistered is the message used when the node becomes
	// registered for the first time.
	NodeRegisterEventRegistered = "Node registered"

	// NodeRegisterEventReregistered is the message used when the node becomes
	// reregistered.
	NodeRegisterEventReregistered = "Node re-registered"
)
// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
//
// NOTE: this file builds IndexEntry values with unkeyed literals
// (IndexEntry{"table", index}), so the field order must not change.
type IndexEntry struct {
	// Key is the name of the table the entry tracks (e.g. "nodes").
	Key string
	// Value is the index most recently written for that table.
	Value uint64
}
// StateStoreConfig is used to configure a new state store
type StateStoreConfig struct {
	// Logger is used to output the state store's logs. NewStateStore derives
	// a "state_store" named sub-logger from it.
	Logger log.Logger

	// Region is the region of the server embedding the state store.
	Region string
}
// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	// logger is the named logger derived from the configured Logger.
	logger log.Logger
	// db is the in-memory database holding every table.
	db *memdb.MemDB

	// config is the passed in configuration
	config *StateStoreConfig

	// abandonCh is used to signal watchers that this state store has been
	// abandoned (usually during a restore). This is only ever closed.
	abandonCh chan struct{}
}
    68  // NewStateStore is used to create a new state store
    69  func NewStateStore(config *StateStoreConfig) (*StateStore, error) {
    70  	// Create the MemDB
    71  	db, err := memdb.NewMemDB(stateStoreSchema())
    72  	if err != nil {
    73  		return nil, fmt.Errorf("state store setup failed: %v", err)
    74  	}
    76  	// Create the state store
    77  	s := &StateStore{
    78  		logger:    config.Logger.Named("state_store"),
    79  		db:        db,
    80  		config:    config,
    81  		abandonCh: make(chan struct{}),
    82  	}
    83  	return s, nil
    84  }
// Config returns the state store configuration. The returned pointer is the
// same one that was handed to NewStateStore, not a copy.
func (s *StateStore) Config() *StateStoreConfig {
	return s.config
}
    91  // Snapshot is used to create a point in time snapshot. Because
    92  // we use MemDB, we just need to snapshot the state of the underlying
    93  // database.
    94  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    95  	snap := &StateSnapshot{
    96  		StateStore: StateStore{
    97  			logger: s.logger,
    98  			config: s.config,
    99  			db:     s.db.Snapshot(),
   100  		},
   101  	}
   102  	return snap, nil
   103  }
   105  // SnapshotMinIndex is used to create a state snapshot where the index is
   106  // guaranteed to be greater than or equal to the index parameter.
   107  //
   108  // Some server operations (such as scheduling) exchange objects via RPC
   109  // concurrent with Raft log application, so they must ensure the state store
   110  // snapshot they are operating on is at or after the index the objects
   111  // retrieved via RPC were applied to the Raft log at.
   112  //
   113  // Callers should maintain their own timer metric as the time this method
   114  // blocks indicates Raft log application latency relative to scheduling.
   115  func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) {
   116  	// Ported from work.go:waitForIndex prior to 0.9
   118  	const backoffBase = 20 * time.Millisecond
   119  	const backoffLimit = 1 * time.Second
   120  	var retries uint
   121  	var retryTimer *time.Timer
   123  	// XXX: Potential optimization is to set up a watch on the state
   124  	// store's index table and only unblock via a trigger rather than
   125  	// polling.
   126  	for {
   127  		// Get the states current index
   128  		snapshotIndex, err := s.LatestIndex()
   129  		if err != nil {
   130  			return nil, fmt.Errorf("failed to determine state store's index: %v", err)
   131  		}
   133  		// We only need the FSM state to be as recent as the given index
   134  		if snapshotIndex >= index {
   135  			return s.Snapshot()
   136  		}
   138  		// Exponential back off
   139  		retries++
   140  		if retryTimer == nil {
   141  			// First retry, start at baseline
   142  			retryTimer = time.NewTimer(backoffBase)
   143  		} else {
   144  			// Subsequent retry, reset timer
   145  			deadline := 1 << (2 * retries) * backoffBase
   146  			if deadline > backoffLimit {
   147  				deadline = backoffLimit
   148  			}
   149  			retryTimer.Reset(deadline)
   150  		}
   152  		select {
   153  		case <-ctx.Done():
   154  			return nil, ctx.Err()
   155  		case <-retryTimer.C:
   156  		}
   157  	}
   158  }
   160  // Restore is used to optimize the efficiency of rebuilding
   161  // state by minimizing the number of transactions and checking
   162  // overhead.
   163  func (s *StateStore) Restore() (*StateRestore, error) {
   164  	txn := s.db.Txn(true)
   165  	r := &StateRestore{
   166  		txn: txn,
   167  	}
   168  	return r, nil
   169  }
// AbandonCh returns a channel you can wait on to know if the state store was
// abandoned. The channel is receive-only for callers and is closed by
// Abandon.
func (s *StateStore) AbandonCh() <-chan struct{} {
	return s.abandonCh
}
// Abandon is used to signal that the given state store has been abandoned.
// It closes the channel returned by AbandonCh, so calling this more than one
// time will panic.
func (s *StateStore) Abandon() {
	close(s.abandonCh)
}
// QueryFn is the definition of a function that can be used to implement a basic
// blocking query against the state store. It receives the watch set to
// register watches on and the state store to read from, and returns the
// response together with the index that response corresponds to.
type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error)
   187  // BlockingQuery takes a query function and runs the function until the minimum
   188  // query index is met or until the passed context is cancelled.
   189  func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) (
   190  	resp interface{}, index uint64, err error) {
   192  RUN_QUERY:
   193  	// We capture the state store and its abandon channel but pass a snapshot to
   194  	// the blocking query function. We operate on the snapshot to allow separate
   195  	// calls to the state store not all wrapped within the same transaction.
   196  	abandonCh := s.AbandonCh()
   197  	snap, _ := s.Snapshot()
   198  	stateSnap := &snap.StateStore
   200  	// We can skip all watch tracking if this isn't a blocking query.
   201  	var ws memdb.WatchSet
   202  	if minIndex > 0 {
   203  		ws = memdb.NewWatchSet()
   205  		// This channel will be closed if a snapshot is restored and the
   206  		// whole state store is abandoned.
   207  		ws.Add(abandonCh)
   208  	}
   210  	resp, index, err = query(ws, stateSnap)
   211  	if err != nil {
   212  		return nil, index, err
   213  	}
   215  	// We haven't reached the min-index yet.
   216  	if minIndex > 0 && index <= minIndex {
   217  		if err := ws.WatchCtx(ctx); err != nil {
   218  			return nil, index, err
   219  		}
   221  		goto RUN_QUERY
   222  	}
   224  	return resp, index, nil
   225  }
   227  // UpsertPlanResults is used to upsert the results of a plan.
   228  func (s *StateStore) UpsertPlanResults(index uint64, results *structs.ApplyPlanResultsRequest) error {
   229  	snapshot, err := s.Snapshot()
   230  	if err != nil {
   231  		return err
   232  	}
   234  	allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped)
   235  	if err != nil {
   236  		return err
   237  	}
   239  	allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted)
   240  	if err != nil {
   241  		return err
   242  	}
   244  	// COMPAT 0.11: Remove this denormalization when NodePreemptions is removed
   245  	results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions)
   246  	if err != nil {
   247  		return err
   248  	}
   250  	txn := s.db.Txn(true)
   251  	defer txn.Abort()
   253  	// Upsert the newly created or updated deployment
   254  	if results.Deployment != nil {
   255  		if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil {
   256  			return err
   257  		}
   258  	}
   260  	// Update the status of deployments effected by the plan.
   261  	if len(results.DeploymentUpdates) != 0 {
   262  		s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn)
   263  	}
   265  	if results.EvalID != "" {
   266  		// Update the modify index of the eval id
   267  		if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil {
   268  			return err
   269  		}
   270  	}
   272  	numAllocs := 0
   273  	if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 {
   274  		// COMPAT 0.11: This branch will be removed, when Alloc is removed
   275  		// Attach the job to all the allocations. It is pulled out in the payload to
   276  		// avoid the redundancy of encoding, but should be denormalized prior to
   277  		// being inserted into MemDB.
   278  		addComputedAllocAttrs(results.Alloc, results.Job)
   279  		numAllocs = len(results.Alloc) + len(results.NodePreemptions)
   280  	} else {
   281  		// Attach the job to all the allocations. It is pulled out in the payload to
   282  		// avoid the redundancy of encoding, but should be denormalized prior to
   283  		// being inserted into MemDB.
   284  		addComputedAllocAttrs(results.AllocsUpdated, results.Job)
   285  		numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted)
   286  	}
   288  	allocsToUpsert := make([]*structs.Allocation, 0, numAllocs)
   290  	// COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed
   291  	allocsToUpsert = append(allocsToUpsert, results.Alloc...)
   292  	allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...)
   294  	allocsToUpsert = append(allocsToUpsert, allocsStopped...)
   295  	allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...)
   296  	allocsToUpsert = append(allocsToUpsert, allocsPreempted...)
   298  	// handle upgrade path
   299  	for _, alloc := range allocsToUpsert {
   300  		alloc.Canonicalize()
   301  	}
   303  	if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil {
   304  		return err
   305  	}
   307  	// Upsert followup evals for allocs that were preempted
   308  	for _, eval := range results.PreemptionEvals {
   309  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   310  			return err
   311  		}
   312  	}
   314  	txn.Commit()
   315  	return nil
   316  }
   318  // addComputedAllocAttrs adds the computed/derived attributes to the allocation.
   319  // This method is used when an allocation is being denormalized.
   320  func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) {
   321  	structs.DenormalizeAllocationJobs(job, allocs)
   323  	// COMPAT(0.11): Remove in 0.11
   324  	// Calculate the total resources of allocations. It is pulled out in the
   325  	// payload to avoid encoding something that can be computed, but should be
   326  	// denormalized prior to being inserted into MemDB.
   327  	for _, alloc := range allocs {
   328  		if alloc.Resources != nil {
   329  			continue
   330  		}
   332  		alloc.Resources = new(structs.Resources)
   333  		for _, task := range alloc.TaskResources {
   334  			alloc.Resources.Add(task)
   335  		}
   337  		// Add the shared resources
   338  		alloc.Resources.Add(alloc.SharedResources)
   339  	}
   340  }
   342  // upsertDeploymentUpdates updates the deployments given the passed status
   343  // updates.
   344  func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *memdb.Txn) error {
   345  	for _, u := range updates {
   346  		if err := s.updateDeploymentStatusImpl(index, u, txn); err != nil {
   347  			return err
   348  		}
   349  	}
   351  	return nil
   352  }
   354  // UpsertJobSummary upserts a job summary into the state store.
   355  func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
   356  	txn := s.db.Txn(true)
   357  	defer txn.Abort()
   359  	// Check if the job summary already exists
   360  	existing, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID)
   361  	if err != nil {
   362  		return fmt.Errorf("job summary lookup failed: %v", err)
   363  	}
   365  	// Setup the indexes correctly
   366  	if existing != nil {
   367  		jobSummary.CreateIndex = existing.(*structs.JobSummary).CreateIndex
   368  		jobSummary.ModifyIndex = index
   369  	} else {
   370  		jobSummary.CreateIndex = index
   371  		jobSummary.ModifyIndex = index
   372  	}
   374  	// Update the index
   375  	if err := txn.Insert("job_summary", jobSummary); err != nil {
   376  		return err
   377  	}
   379  	// Update the indexes table for job summary
   380  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   381  		return fmt.Errorf("index update failed: %v", err)
   382  	}
   384  	txn.Commit()
   385  	return nil
   386  }
   388  // DeleteJobSummary deletes the job summary with the given ID. This is for
   389  // testing purposes only.
   390  func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error {
   391  	txn := s.db.Txn(true)
   392  	defer txn.Abort()
   394  	// Delete the job summary
   395  	if _, err := txn.DeleteAll("job_summary", "id", namespace, id); err != nil {
   396  		return fmt.Errorf("deleting job summary failed: %v", err)
   397  	}
   398  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   399  		return fmt.Errorf("index update failed: %v", err)
   400  	}
   401  	txn.Commit()
   402  	return nil
   403  }
   405  // UpsertDeployment is used to insert a new deployment. If cancelPrior is set to
   406  // true, all prior deployments for the same job will be cancelled.
   407  func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error {
   408  	txn := s.db.Txn(true)
   409  	defer txn.Abort()
   410  	if err := s.upsertDeploymentImpl(index, deployment, txn); err != nil {
   411  		return err
   412  	}
   413  	txn.Commit()
   414  	return nil
   415  }
   417  func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *memdb.Txn) error {
   418  	// Check if the deployment already exists
   419  	existing, err := txn.First("deployment", "id", deployment.ID)
   420  	if err != nil {
   421  		return fmt.Errorf("deployment lookup failed: %v", err)
   422  	}
   424  	// Setup the indexes correctly
   425  	if existing != nil {
   426  		deployment.CreateIndex = existing.(*structs.Deployment).CreateIndex
   427  		deployment.ModifyIndex = index
   428  	} else {
   429  		deployment.CreateIndex = index
   430  		deployment.ModifyIndex = index
   431  	}
   433  	// Insert the deployment
   434  	if err := txn.Insert("deployment", deployment); err != nil {
   435  		return err
   436  	}
   438  	// Update the indexes table for deployment
   439  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   440  		return fmt.Errorf("index update failed: %v", err)
   441  	}
   443  	// If the deployment is being marked as complete, set the job to stable.
   444  	if deployment.Status == structs.DeploymentStatusSuccessful {
   445  		if err := s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn); err != nil {
   446  			return fmt.Errorf("failed to update job stability: %v", err)
   447  		}
   448  	}
   450  	return nil
   451  }
   453  func (s *StateStore) Deployments(ws memdb.WatchSet) (memdb.ResultIterator, error) {
   454  	txn := s.db.Txn(false)
   456  	// Walk the entire deployments table
   457  	iter, err := txn.Get("deployment", "id")
   458  	if err != nil {
   459  		return nil, err
   460  	}
   462  	ws.Add(iter.WatchCh())
   463  	return iter, nil
   464  }
   466  func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
   467  	txn := s.db.Txn(false)
   469  	// Walk the entire deployments table
   470  	iter, err := txn.Get("deployment", "namespace", namespace)
   471  	if err != nil {
   472  		return nil, err
   473  	}
   475  	ws.Add(iter.WatchCh())
   476  	return iter, nil
   477  }
   479  func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string) (memdb.ResultIterator, error) {
   480  	txn := s.db.Txn(false)
   482  	// Walk the entire deployments table
   483  	iter, err := txn.Get("deployment", "id_prefix", deploymentID)
   484  	if err != nil {
   485  		return nil, err
   486  	}
   488  	ws.Add(iter.WatchCh())
   490  	// Wrap the iterator in a filter
   491  	wrap := memdb.NewFilterIterator(iter, deploymentNamespaceFilter(namespace))
   492  	return wrap, nil
   493  }
   495  // deploymentNamespaceFilter returns a filter function that filters all
   496  // deployment not in the given namespace.
   497  func deploymentNamespaceFilter(namespace string) func(interface{}) bool {
   498  	return func(raw interface{}) bool {
   499  		d, ok := raw.(*structs.Deployment)
   500  		if !ok {
   501  			return true
   502  		}
   504  		return d.Namespace != namespace
   505  	}
   506  }
   508  func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) {
   509  	txn := s.db.Txn(false)
   510  	return s.deploymentByIDImpl(ws, deploymentID, txn)
   511  }
   513  func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *memdb.Txn) (*structs.Deployment, error) {
   514  	watchCh, existing, err := txn.FirstWatch("deployment", "id", deploymentID)
   515  	if err != nil {
   516  		return nil, fmt.Errorf("deployment lookup failed: %v", err)
   517  	}
   518  	ws.Add(watchCh)
   520  	if existing != nil {
   521  		return existing.(*structs.Deployment), nil
   522  	}
   524  	return nil, nil
   525  }
   527  func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) {
   528  	txn := s.db.Txn(false)
   530  	var job *structs.Job
   531  	// Read job from state store
   532  	_, existing, err := txn.FirstWatch("jobs", "id", namespace, jobID)
   533  	if err != nil {
   534  		return nil, fmt.Errorf("job lookup failed: %v", err)
   535  	}
   536  	if existing != nil {
   537  		job = existing.(*structs.Job)
   538  	}
   540  	// Get an iterator over the deployments
   541  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   542  	if err != nil {
   543  		return nil, err
   544  	}
   546  	ws.Add(iter.WatchCh())
   548  	var out []*structs.Deployment
   549  	for {
   550  		raw := iter.Next()
   551  		if raw == nil {
   552  			break
   553  		}
   554  		d := raw.(*structs.Deployment)
   556  		// If the allocation belongs to a job with the same ID but a different
   557  		// create index and we are not getting all the allocations whose Jobs
   558  		// matches the same Job ID then we skip it
   559  		if !all && job != nil && d.JobCreateIndex != job.CreateIndex {
   560  			continue
   561  		}
   562  		out = append(out, d)
   563  	}
   565  	return out, nil
   566  }
   568  // LatestDeploymentByJobID returns the latest deployment for the given job. The
   569  // latest is determined strictly by CreateIndex.
   570  func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) {
   571  	txn := s.db.Txn(false)
   573  	// Get an iterator over the deployments
   574  	iter, err := txn.Get("deployment", "job", namespace, jobID)
   575  	if err != nil {
   576  		return nil, err
   577  	}
   579  	ws.Add(iter.WatchCh())
   581  	var out *structs.Deployment
   582  	for {
   583  		raw := iter.Next()
   584  		if raw == nil {
   585  			break
   586  		}
   588  		d := raw.(*structs.Deployment)
   589  		if out == nil || out.CreateIndex < d.CreateIndex {
   590  			out = d
   591  		}
   592  	}
   594  	return out, nil
   595  }
   597  // DeleteDeployment is used to delete a set of deployments by ID
   598  func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error {
   599  	txn := s.db.Txn(true)
   600  	defer txn.Abort()
   602  	if len(deploymentIDs) == 0 {
   603  		return nil
   604  	}
   606  	for _, deploymentID := range deploymentIDs {
   607  		// Lookup the deployment
   608  		existing, err := txn.First("deployment", "id", deploymentID)
   609  		if err != nil {
   610  			return fmt.Errorf("deployment lookup failed: %v", err)
   611  		}
   612  		if existing == nil {
   613  			return fmt.Errorf("deployment not found")
   614  		}
   616  		// Delete the deployment
   617  		if err := txn.Delete("deployment", existing); err != nil {
   618  			return fmt.Errorf("deployment delete failed: %v", err)
   619  		}
   620  	}
   622  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
   623  		return fmt.Errorf("index update failed: %v", err)
   624  	}
   626  	txn.Commit()
   627  	return nil
   628  }
   630  // UpsertScalingEvent is used to insert a new scaling event.
   631  // Only the most recent JobTrackedScalingEvents will be kept.
   632  func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error {
   633  	txn := s.db.Txn(true)
   634  	defer txn.Abort()
   636  	// Get the existing events
   637  	existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID)
   638  	if err != nil {
   639  		return fmt.Errorf("scaling event lookup failed: %v", err)
   640  	}
   642  	var jobEvents *structs.JobScalingEvents
   643  	if existing != nil {
   644  		jobEvents = existing.(*structs.JobScalingEvents)
   645  	} else {
   646  		jobEvents = &structs.JobScalingEvents{
   647  			Namespace:     req.Namespace,
   648  			JobID:         req.JobID,
   649  			ScalingEvents: make(map[string][]*structs.ScalingEvent),
   650  		}
   651  	}
   653  	jobEvents.ModifyIndex = index
   654  	req.ScalingEvent.CreateIndex = index
   656  	events := jobEvents.ScalingEvents[req.TaskGroup]
   657  	// Prepend this latest event
   658  	events = append(
   659  		[]*structs.ScalingEvent{req.ScalingEvent},
   660  		events...,
   661  	)
   662  	// Truncate older events
   663  	if len(events) > structs.JobTrackedScalingEvents {
   664  		events = events[0:structs.JobTrackedScalingEvents]
   665  	}
   666  	jobEvents.ScalingEvents[req.TaskGroup] = events
   668  	// Insert the new event
   669  	if err := txn.Insert("scaling_event", jobEvents); err != nil {
   670  		return fmt.Errorf("scaling event insert failed: %v", err)
   671  	}
   673  	// Update the indexes table for scaling_event
   674  	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
   675  		return fmt.Errorf("index update failed: %v", err)
   676  	}
   678  	txn.Commit()
   679  	return nil
   680  }
   682  // ScalingEvents returns an iterator over all the job scaling events
   683  func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) {
   684  	txn := s.db.Txn(false)
   686  	// Walk the entire scaling_event table
   687  	iter, err := txn.Get("scaling_event", "id")
   688  	if err != nil {
   689  		return nil, err
   690  	}
   692  	ws.Add(iter.WatchCh())
   694  	return iter, nil
   695  }
   697  func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) {
   698  	txn := s.db.Txn(false)
   700  	watchCh, existing, err := txn.FirstWatch("scaling_event", "id", namespace, jobID)
   701  	if err != nil {
   702  		return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err)
   703  	}
   704  	ws.Add(watchCh)
   706  	if existing != nil {
   707  		events := existing.(*structs.JobScalingEvents)
   708  		return events.ScalingEvents, events.ModifyIndex, nil
   709  	}
   710  	return nil, 0, nil
   711  }
   713  // UpsertNode is used to register a node or update a node definition
   714  // This is assumed to be triggered by the client, so we retain the value
   715  // of drain/eligibility which is set by the scheduler.
   716  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
   717  	txn := s.db.Txn(true)
   718  	defer txn.Abort()
   720  	// Check if the node already exists
   721  	existing, err := txn.First("nodes", "id", node.ID)
   722  	if err != nil {
   723  		return fmt.Errorf("node lookup failed: %v", err)
   724  	}
   726  	// Setup the indexes correctly
   727  	if existing != nil {
   728  		exist := existing.(*structs.Node)
   729  		node.CreateIndex = exist.CreateIndex
   730  		node.ModifyIndex = index
   732  		// Retain node events that have already been set on the node
   733  		node.Events = exist.Events
   735  		// If we are transitioning from down, record the re-registration
   736  		if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown {
   737  			appendNodeEvents(index, node, []*structs.NodeEvent{
   738  				structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   739  					SetMessage(NodeRegisterEventReregistered).
   740  					SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))})
   741  		}
   743  		node.Drain = exist.Drain                                 // Retain the drain mode
   744  		node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
   745  		node.DrainStrategy = exist.DrainStrategy                 // Retain the drain strategy
   746  	} else {
   747  		// Because this is the first time the node is being registered, we should
   748  		// also create a node registration event
   749  		nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
   750  			SetMessage(NodeRegisterEventRegistered).
   751  			SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))
   752  		node.Events = []*structs.NodeEvent{nodeEvent}
   753  		node.CreateIndex = index
   754  		node.ModifyIndex = index
   755  	}
   757  	// Insert the node
   758  	if err := txn.Insert("nodes", node); err != nil {
   759  		return fmt.Errorf("node insert failed: %v", err)
   760  	}
   761  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   762  		return fmt.Errorf("index update failed: %v", err)
   763  	}
   764  	if err := upsertNodeCSIPlugins(txn, node, index); err != nil {
   765  		return fmt.Errorf("csi plugin update failed: %v", err)
   766  	}
   768  	txn.Commit()
   769  	return nil
   770  }
   772  // DeleteNode deregisters a batch of nodes
   773  func (s *StateStore) DeleteNode(index uint64, nodes []string) error {
   774  	if len(nodes) == 0 {
   775  		return fmt.Errorf("node ids missing")
   776  	}
   778  	txn := s.db.Txn(true)
   779  	defer txn.Abort()
   781  	for _, nodeID := range nodes {
   782  		existing, err := txn.First("nodes", "id", nodeID)
   783  		if err != nil {
   784  			return fmt.Errorf("node lookup failed: %s: %v", nodeID, err)
   785  		}
   786  		if existing == nil {
   787  			return fmt.Errorf("node not found: %s", nodeID)
   788  		}
   790  		// Delete the node
   791  		if err := txn.Delete("nodes", existing); err != nil {
   792  			return fmt.Errorf("node delete failed: %s: %v", nodeID, err)
   793  		}
   795  		node := existing.(*structs.Node)
   796  		if err := deleteNodeCSIPlugins(txn, node, index); err != nil {
   797  			return fmt.Errorf("csi plugin delete failed: %v", err)
   798  		}
   799  	}
   801  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   802  		return fmt.Errorf("index update failed: %v", err)
   803  	}
   805  	txn.Commit()
   806  	return nil
   807  }
   809  // UpdateNodeStatus is used to update the status of a node
   810  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
   811  	txn := s.db.Txn(true)
   812  	defer txn.Abort()
   814  	// Lookup the node
   815  	existing, err := txn.First("nodes", "id", nodeID)
   816  	if err != nil {
   817  		return fmt.Errorf("node lookup failed: %v", err)
   818  	}
   819  	if existing == nil {
   820  		return fmt.Errorf("node not found")
   821  	}
   823  	// Copy the existing node
   824  	existingNode := existing.(*structs.Node)
   825  	copyNode := existingNode.Copy()
   826  	copyNode.StatusUpdatedAt = updatedAt
   828  	// Add the event if given
   829  	if event != nil {
   830  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   831  	}
   833  	// Update the status in the copy
   834  	copyNode.Status = status
   835  	copyNode.ModifyIndex = index
   837  	// Insert the node
   838  	if err := txn.Insert("nodes", copyNode); err != nil {
   839  		return fmt.Errorf("node update failed: %v", err)
   840  	}
   841  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   842  		return fmt.Errorf("index update failed: %v", err)
   843  	}
   845  	txn.Commit()
   846  	return nil
   847  }
   849  // BatchUpdateNodeDrain is used to update the drain of a node set of nodes
   850  func (s *StateStore) BatchUpdateNodeDrain(index uint64, updatedAt int64, updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error {
   851  	txn := s.db.Txn(true)
   852  	defer txn.Abort()
   853  	for node, update := range updates {
   854  		if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible, updatedAt, events[node]); err != nil {
   855  			return err
   856  		}
   857  	}
   858  	txn.Commit()
   859  	return nil
   860  }
   862  // UpdateNodeDrain is used to update the drain of a node
   863  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string,
   864  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent) error {
   866  	txn := s.db.Txn(true)
   867  	defer txn.Abort()
   868  	if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible, updatedAt, event); err != nil {
   869  		return err
   870  	}
   871  	txn.Commit()
   872  	return nil
   873  }
   875  func (s *StateStore) updateNodeDrainImpl(txn *memdb.Txn, index uint64, nodeID string,
   876  	drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent) error {
   878  	// Lookup the node
   879  	existing, err := txn.First("nodes", "id", nodeID)
   880  	if err != nil {
   881  		return fmt.Errorf("node lookup failed: %v", err)
   882  	}
   883  	if existing == nil {
   884  		return fmt.Errorf("node not found")
   885  	}
   887  	// Copy the existing node
   888  	existingNode := existing.(*structs.Node)
   889  	copyNode := existingNode.Copy()
   890  	copyNode.StatusUpdatedAt = updatedAt
   892  	// Add the event if given
   893  	if event != nil {
   894  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   895  	}
   897  	// Update the drain in the copy
   898  	copyNode.Drain = drain != nil // COMPAT: Remove in Nomad 0.10
   899  	copyNode.DrainStrategy = drain
   900  	if drain != nil {
   901  		copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible
   902  	} else if markEligible {
   903  		copyNode.SchedulingEligibility = structs.NodeSchedulingEligible
   904  	}
   906  	copyNode.ModifyIndex = index
   908  	// Insert the node
   909  	if err := txn.Insert("nodes", copyNode); err != nil {
   910  		return fmt.Errorf("node update failed: %v", err)
   911  	}
   912  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   913  		return fmt.Errorf("index update failed: %v", err)
   914  	}
   916  	return nil
   917  }
   919  // UpdateNodeEligibility is used to update the scheduling eligibility of a node
   920  func (s *StateStore) UpdateNodeEligibility(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error {
   922  	txn := s.db.Txn(true)
   923  	defer txn.Abort()
   925  	// Lookup the node
   926  	existing, err := txn.First("nodes", "id", nodeID)
   927  	if err != nil {
   928  		return fmt.Errorf("node lookup failed: %v", err)
   929  	}
   930  	if existing == nil {
   931  		return fmt.Errorf("node not found")
   932  	}
   934  	// Copy the existing node
   935  	existingNode := existing.(*structs.Node)
   936  	copyNode := existingNode.Copy()
   937  	copyNode.StatusUpdatedAt = updatedAt
   939  	// Add the event if given
   940  	if event != nil {
   941  		appendNodeEvents(index, copyNode, []*structs.NodeEvent{event})
   942  	}
   944  	// Check if this is a valid action
   945  	if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible {
   946  		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
   947  	}
   949  	// Update the eligibility in the copy
   950  	copyNode.SchedulingEligibility = eligibility
   951  	copyNode.ModifyIndex = index
   953  	// Insert the node
   954  	if err := txn.Insert("nodes", copyNode); err != nil {
   955  		return fmt.Errorf("node update failed: %v", err)
   956  	}
   957  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   958  		return fmt.Errorf("index update failed: %v", err)
   959  	}
   961  	txn.Commit()
   962  	return nil
   963  }
   965  // UpsertNodeEvents adds the node events to the nodes, rotating events as
   966  // necessary.
   967  func (s *StateStore) UpsertNodeEvents(index uint64, nodeEvents map[string][]*structs.NodeEvent) error {
   968  	txn := s.db.Txn(true)
   969  	defer txn.Abort()
   971  	for nodeID, events := range nodeEvents {
   972  		if err := s.upsertNodeEvents(index, nodeID, events, txn); err != nil {
   973  			return err
   974  		}
   975  	}
   977  	txn.Commit()
   978  	return nil
   979  }
   981  // upsertNodeEvent upserts a node event for a respective node. It also maintains
   982  // that a fixed number of node events are ever stored simultaneously, deleting
   983  // older events once this bound has been reached.
   984  func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *memdb.Txn) error {
   985  	// Lookup the node
   986  	existing, err := txn.First("nodes", "id", nodeID)
   987  	if err != nil {
   988  		return fmt.Errorf("node lookup failed: %v", err)
   989  	}
   990  	if existing == nil {
   991  		return fmt.Errorf("node not found")
   992  	}
   994  	// Copy the existing node
   995  	existingNode := existing.(*structs.Node)
   996  	copyNode := existingNode.Copy()
   997  	appendNodeEvents(index, copyNode, events)
   999  	// Insert the node
  1000  	if err := txn.Insert("nodes", copyNode); err != nil {
  1001  		return fmt.Errorf("node update failed: %v", err)
  1002  	}
  1003  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
  1004  		return fmt.Errorf("index update failed: %v", err)
  1005  	}
  1007  	return nil
  1008  }
  1010  // appendNodeEvents is a helper that takes a node and new events and appends
  1011  // them, pruning older events as needed.
  1012  func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) {
  1013  	// Add the events, updating the indexes
  1014  	for _, e := range events {
  1015  		e.CreateIndex = index
  1016  		node.Events = append(node.Events, e)
  1017  	}
  1019  	// Keep node events pruned to not exceed the max allowed
  1020  	if l := len(node.Events); l > structs.MaxRetainedNodeEvents {
  1021  		delta := l - structs.MaxRetainedNodeEvents
  1022  		node.Events = node.Events[delta:]
  1023  	}
  1024  }
  1026  // upsertNodeCSIPlugins indexes csi plugins for volume retrieval, with health. It's called
  1027  // on upsertNodeEvents, so that event driven health changes are updated
  1028  func upsertNodeCSIPlugins(txn *memdb.Txn, node *structs.Node, index uint64) error {
  1030  	loop := func(info *structs.CSIInfo) error {
  1031  		raw, err := txn.First("csi_plugins", "id", info.PluginID)
  1032  		if err != nil {
  1033  			return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err)
  1034  		}
  1036  		var plug *structs.CSIPlugin
  1037  		if raw != nil {
  1038  			plug = raw.(*structs.CSIPlugin).Copy()
  1039  		} else {
  1040  			if !info.Healthy {
  1041  				// we don't want to create new plugins for unhealthy
  1042  				// allocs, otherwise we'd recreate the plugin when we
  1043  				// get the update for the alloc becoming terminal
  1044  				return nil
  1045  			}
  1046  			plug = structs.NewCSIPlugin(info.PluginID, index)
  1047  			plug.Provider = info.Provider
  1048  			plug.Version = info.ProviderVersion
  1049  		}
  1051  		err = plug.AddPlugin(node.ID, info)
  1052  		if err != nil {
  1053  			return err
  1054  		}
  1056  		plug.ModifyIndex = index
  1058  		err = txn.Insert("csi_plugins", plug)
  1059  		if err != nil {
  1060  			return fmt.Errorf("csi_plugins insert error: %v", err)
  1061  		}
  1063  		return nil
  1064  	}
  1066  	inUseController := map[string]struct{}{}
  1067  	inUseNode := map[string]struct{}{}
  1069  	for _, info := range node.CSIControllerPlugins {
  1070  		err := loop(info)
  1071  		if err != nil {
  1072  			return err
  1073  		}
  1074  		inUseController[info.PluginID] = struct{}{}
  1075  	}
  1077  	for _, info := range node.CSINodePlugins {
  1078  		err := loop(info)
  1079  		if err != nil {
  1080  			return err
  1081  		}
  1082  		inUseNode[info.PluginID] = struct{}{}
  1083  	}
  1085  	// remove the client node from any plugin that's not
  1086  	// running on it.
  1087  	iter, err := txn.Get("csi_plugins", "id")
  1088  	if err != nil {
  1089  		return fmt.Errorf("csi_plugins lookup failed: %v", err)
  1090  	}
  1091  	for {
  1092  		raw := iter.Next()
  1093  		if raw == nil {
  1094  			break
  1095  		}
  1096  		plug := raw.(*structs.CSIPlugin)
  1098  		var hadDelete bool
  1099  		if _, ok := inUseController[plug.ID]; !ok {
  1100  			if _, asController := plug.Controllers[node.ID]; asController {
  1101  				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeController)
  1102  				if err != nil {
  1103  					return err
  1104  				}
  1105  				hadDelete = true
  1106  			}
  1107  		}
  1108  		if _, ok := inUseNode[plug.ID]; !ok {
  1109  			if _, asNode := plug.Nodes[node.ID]; asNode {
  1110  				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeNode)
  1111  				if err != nil {
  1112  					return err
  1113  				}
  1114  				hadDelete = true
  1115  			}
  1116  		}
  1117  		// we check this flag both for performance and to make sure we
  1118  		// don't delete a plugin when registering a node plugin but
  1119  		// no controller
  1120  		if hadDelete {
  1121  			err = updateOrGCPlugin(index, txn, plug)
  1122  			if err != nil {
  1123  				return err
  1124  			}
  1125  		}
  1126  	}
  1128  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1129  		return fmt.Errorf("index update failed: %v", err)
  1130  	}
  1132  	return nil
  1133  }
  1135  // deleteNodeCSIPlugins cleans up CSIInfo node health status, called in DeleteNode
  1136  func deleteNodeCSIPlugins(txn *memdb.Txn, node *structs.Node, index uint64) error {
  1137  	if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 {
  1138  		return nil
  1139  	}
  1141  	names := map[string]struct{}{}
  1142  	for _, info := range node.CSIControllerPlugins {
  1143  		names[info.PluginID] = struct{}{}
  1144  	}
  1145  	for _, info := range node.CSINodePlugins {
  1146  		names[info.PluginID] = struct{}{}
  1147  	}
  1149  	for id := range names {
  1150  		raw, err := txn.First("csi_plugins", "id", id)
  1151  		if err != nil {
  1152  			return fmt.Errorf("csi_plugins lookup error %s: %v", id, err)
  1153  		}
  1154  		if raw == nil {
  1155  			return fmt.Errorf("csi_plugins missing plugin %s", id)
  1156  		}
  1158  		plug := raw.(*structs.CSIPlugin).Copy()
  1159  		err = plug.DeleteNode(node.ID)
  1160  		if err != nil {
  1161  			return err
  1162  		}
  1163  		err = updateOrGCPlugin(index, txn, plug)
  1164  		if err != nil {
  1165  			return err
  1166  		}
  1167  	}
  1169  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1170  		return fmt.Errorf("index update failed: %v", err)
  1171  	}
  1173  	return nil
  1174  }
  1176  // updateOrGCPlugin updates a plugin but will delete it if the plugin is empty
  1177  func updateOrGCPlugin(index uint64, txn *memdb.Txn, plug *structs.CSIPlugin) error {
  1178  	plug.ModifyIndex = index
  1180  	if plug.IsEmpty() {
  1181  		err := txn.Delete("csi_plugins", plug)
  1182  		if err != nil {
  1183  			return fmt.Errorf("csi_plugins delete error: %v", err)
  1184  		}
  1185  	} else {
  1186  		err := txn.Insert("csi_plugins", plug)
  1187  		if err != nil {
  1188  			return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
  1189  		}
  1190  	}
  1191  	return nil
  1192  }
  1194  // deleteJobFromPlugin removes the allocations of this job from any plugins the job is
  1195  // running, possibly deleting the plugin if it's no longer in use. It's called in DeleteJobTxn
  1196  func (s *StateStore) deleteJobFromPlugin(index uint64, txn *memdb.Txn, job *structs.Job) error {
  1197  	ws := memdb.NewWatchSet()
  1198  	allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false)
  1199  	if err != nil {
  1200  		return fmt.Errorf("error getting allocations: %v", err)
  1201  	}
  1203  	type pair struct {
  1204  		pluginID string
  1205  		alloc    *structs.Allocation
  1206  	}
  1208  	plugAllocs := []*pair{}
  1209  	plugins := map[string]*structs.CSIPlugin{}
  1211  	for _, a := range allocs {
  1212  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  1213  		for _, t := range tg.Tasks {
  1214  			if t.CSIPluginConfig != nil {
  1215  				plugAllocs = append(plugAllocs, &pair{
  1216  					pluginID: t.CSIPluginConfig.ID,
  1217  					alloc:    a,
  1218  				})
  1219  			}
  1220  		}
  1221  	}
  1223  	for _, x := range plugAllocs {
  1224  		plug, ok := plugins[x.pluginID]
  1226  		if !ok {
  1227  			plug, err = s.CSIPluginByID(ws, x.pluginID)
  1228  			if err != nil {
  1229  				return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err)
  1230  			}
  1231  			if plug == nil {
  1232  				return fmt.Errorf("plugin missing: %s %v", x.pluginID, err)
  1233  			}
  1234  			// only copy once, so we update the same plugin on each alloc
  1235  			plugins[x.pluginID] = plug.Copy()
  1236  			plug = plugins[x.pluginID]
  1237  		}
  1239  		err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID)
  1240  		if err != nil {
  1241  			return err
  1242  		}
  1243  	}
  1245  	for _, plug := range plugins {
  1246  		err = updateOrGCPlugin(index, txn, plug)
  1247  		if err != nil {
  1248  			return err
  1249  		}
  1250  	}
  1252  	if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  1253  		return fmt.Errorf("index update failed: %v", err)
  1254  	}
  1256  	return nil
  1257  }
  1259  // NodeByID is used to lookup a node by ID
  1260  func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
  1261  	txn := s.db.Txn(false)
  1263  	watchCh, existing, err := txn.FirstWatch("nodes", "id", nodeID)
  1264  	if err != nil {
  1265  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1266  	}
  1267  	ws.Add(watchCh)
  1269  	if existing != nil {
  1270  		return existing.(*structs.Node), nil
  1271  	}
  1272  	return nil, nil
  1273  }
  1275  // NodesByIDPrefix is used to lookup nodes by prefix
  1276  func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  1277  	txn := s.db.Txn(false)
  1279  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
  1280  	if err != nil {
  1281  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1282  	}
  1283  	ws.Add(iter.WatchCh())
  1285  	return iter, nil
  1286  }
  1288  // NodesByToken is used to lookup nodes by token
  1289  func (s *StateStore) NodesByToken(ws memdb.WatchSet, token string) (memdb.ResultIterator, error) {
  1290  	txn := s.db.Txn(false)
  1292  	iter, err := txn.Get("nodes", "token", token)
  1293  	if err != nil {
  1294  		return nil, fmt.Errorf("node lookup failed: %v", err)
  1295  	}
  1296  	ws.Add(iter.WatchCh())
  1298  	return iter, nil
  1299  }
  1301  // NodeBySecretID is used to lookup a node by SecretID
  1302  func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) {
  1303  	txn := s.db.Txn(false)
  1305  	watchCh, existing, err := txn.FirstWatch("nodes", "secret_id", secretID)
  1306  	if err != nil {
  1307  		return nil, fmt.Errorf("node lookup by SecretID failed: %v", err)
  1308  	}
  1309  	ws.Add(watchCh)
  1311  	if existing != nil {
  1312  		return existing.(*structs.Node), nil
  1313  	}
  1314  	return nil, nil
  1315  }
  1317  // Nodes returns an iterator over all the nodes
  1318  func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1319  	txn := s.db.Txn(false)
  1321  	// Walk the entire nodes table
  1322  	iter, err := txn.Get("nodes", "id")
  1323  	if err != nil {
  1324  		return nil, err
  1325  	}
  1326  	ws.Add(iter.WatchCh())
  1327  	return iter, nil
  1328  }
  1330  // UpsertJob is used to register a job or update a job definition
  1331  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
  1332  	txn := s.db.Txn(true)
  1333  	defer txn.Abort()
  1334  	if err := s.upsertJobImpl(index, job, false, txn); err != nil {
  1335  		return err
  1336  	}
  1337  	txn.Commit()
  1338  	return nil
  1339  }
  1341  // UpsertJobTxn is used to register a job or update a job definition, like UpsertJob,
  1342  // but in a transaction.  Useful for when making multiple modifications atomically
  1343  func (s *StateStore) UpsertJobTxn(index uint64, job *structs.Job, txn Txn) error {
  1344  	return s.upsertJobImpl(index, job, false, txn)
  1345  }
// upsertJobImpl is the implementation for registering a job or updating a job
// definition. It runs inside the supplied transaction and never commits or
// aborts it.
//
// The passed job is modified in place: its CreateIndex, ModifyIndex,
// JobModifyIndex, Version and (for an existing job) Status fields are set
// here. When keepVersion is true and the job already exists, JobModifyIndex
// and Version are left as provided by the caller.
//
// An error is returned if the job's namespace does not exist or if any lookup
// or write against the transaction fails.
func (s *StateStore) upsertJobImpl(index uint64, job *structs.Job, keepVersion bool, txn *memdb.Txn) error {
	// Assert the namespace exists
	if exists, err := s.namespaceExists(txn, job.Namespace); err != nil {
		return err
	} else if !exists {
		return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace)
	}

	// Check if the job already exists
	existing, err := txn.First("jobs", "id", job.Namespace, job.ID)
	if err != nil {
		return fmt.Errorf("job lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		// Updating: keep the original creation index
		job.CreateIndex = existing.(*structs.Job).CreateIndex
		job.ModifyIndex = index

		// Bump the version unless asked to keep it. This should only be done
		// when changing an internal field such as Stable. A spec change should
		// always come with a version bump
		if !keepVersion {
			job.JobModifyIndex = index
			job.Version = existing.(*structs.Job).Version + 1
		}

		// Compute the job status
		var err error
		job.Status, err = s.getJobStatus(txn, job, false)
		if err != nil {
			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
		}
	} else {
		// Registering: every index starts at the current index, version at 0
		job.CreateIndex = index
		job.ModifyIndex = index
		job.JobModifyIndex = index
		job.Version = 0

		if err := s.setJobStatus(index, txn, job, false, ""); err != nil {
			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
		}

		// Have to get the job again since it could have been updated
		updated, err := txn.First("jobs", "id", job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("job lookup failed: %v", err)
		}
		if updated != nil {
			// From here on, work with the object stored in the transaction
			job = updated.(*structs.Job)
		}
	}

	// Update the tables derived from the job before storing the job itself
	if err := s.updateSummaryWithJob(index, job, txn); err != nil {
		return fmt.Errorf("unable to create job summary: %v", err)
	}

	if err := s.upsertJobVersion(index, job, txn); err != nil {
		return fmt.Errorf("unable to upsert job into job_version table: %v", err)
	}

	if err := s.updateJobScalingPolicies(index, job, txn); err != nil {
		return fmt.Errorf("unable to update job scaling policies: %v", err)
	}

	// Insert the job
	if err := txn.Insert("jobs", job); err != nil {
		return fmt.Errorf("job insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}
  1424  // DeleteJob is used to deregister a job
  1425  func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error {
  1426  	txn := s.db.Txn(true)
  1427  	defer txn.Abort()
  1429  	err := s.DeleteJobTxn(index, namespace, jobID, txn)
  1430  	if err == nil {
  1431  		txn.Commit()
  1432  	}
  1433  	return err
  1434  }
// DeleteJobTxn is used to deregister a job, like DeleteJob, but in a
// transaction supplied by the caller. Useful for when making multiple
// modifications atomically; the caller must commit or abort txn.
//
// Besides the job itself, it removes the job's versions, summary, scaling
// policies and scaling events, detaches the job's allocations from CSI
// plugins, and, for a child job, moves the job from the Pending/Running
// counter to the Dead counter in the parent's summary. It returns an error
// if the job does not exist or any step fails.
func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error {
	// Lookup the job
	existing, err := txn.First("jobs", "id", namespace, jobID)
	if err != nil {
		return fmt.Errorf("job lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("job not found")
	}

	// Check if we should update a parent job summary
	job := existing.(*structs.Job)
	if job.ParentID != "" {
		summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID)
		if err != nil {
			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
		}

		// Only continue if the summary exists. It could not exist if the parent
		// job was removed
		if summaryRaw != nil {
			// Modify a copy of the parent's summary, not the stored object
			existing := summaryRaw.(*structs.JobSummary)
			pSummary := existing.Copy()
			if pSummary.Children != nil {

				modified := false
				switch job.Status {
				case structs.JobStatusPending:
					pSummary.Children.Pending--
					pSummary.Children.Dead++
					modified = true
				case structs.JobStatusRunning:
					pSummary.Children.Running--
					pSummary.Children.Dead++
					modified = true
				case structs.JobStatusDead:
					// Already dead: the counters are left unchanged
				default:
					return fmt.Errorf("unknown old job status %q", job.Status)
				}

				if modified {
					// Update the modify index
					pSummary.ModifyIndex = index

					// Insert the summary
					if err := txn.Insert("job_summary", pSummary); err != nil {
						return fmt.Errorf("job summary insert failed: %v", err)
					}
					if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
						return fmt.Errorf("index update failed: %v", err)
					}
				}
			}
		}
	}

	// Delete the job
	if err := txn.Delete("jobs", existing); err != nil {
		return fmt.Errorf("job delete failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Delete the job versions
	if err := s.deleteJobVersions(index, job, txn); err != nil {
		return err
	}

	// Delete the job summary
	if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil {
		return fmt.Errorf("deleting job summary failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Delete any remaining job scaling policies
	if err := s.deleteJobScalingPolicies(index, job, txn); err != nil {
		return fmt.Errorf("deleting job scaling policies failed: %v", err)
	}

	// Delete the scaling events
	if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil {
		return fmt.Errorf("deleting job scaling events failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Cleanup plugins registered by this job
	err = s.deleteJobFromPlugin(index, txn, job)
	if err != nil {
		return fmt.Errorf("deleting job from plugin: %v", err)
	}

	return nil
}
  1537  // deleteJobScalingPolicies deletes any scaling policies associated with the job
  1538  func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, txn *memdb.Txn) error {
  1539  	numDeletedScalingPolicies, err := txn.DeleteAll("scaling_policy", "target_prefix", job.Namespace, job.ID)
  1540  	if err != nil {
  1541  		return fmt.Errorf("deleting job scaling policies failed: %v", err)
  1542  	}
  1543  	if numDeletedScalingPolicies > 0 {
  1544  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  1545  			return fmt.Errorf("index update failed: %v", err)
  1546  		}
  1547  	}
  1548  	return nil
  1549  }
  1551  // deleteJobVersions deletes all versions of the given job.
  1552  func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *memdb.Txn) error {
  1553  	iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID)
  1554  	if err != nil {
  1555  		return err
  1556  	}
  1558  	// Put them into a slice so there are no safety concerns while actually
  1559  	// performing the deletes
  1560  	jobs := []*structs.Job{}
  1561  	for {
  1562  		raw := iter.Next()
  1563  		if raw == nil {
  1564  			break
  1565  		}
  1567  		// Ensure the ID is an exact match
  1568  		j := raw.(*structs.Job)
  1569  		if j.ID != job.ID {
  1570  			continue
  1571  		}
  1573  		jobs = append(jobs, j)
  1574  	}
  1576  	// Do the deletes
  1577  	for _, j := range jobs {
  1578  		if err := txn.Delete("job_version", j); err != nil {
  1579  			return fmt.Errorf("deleting job versions failed: %v", err)
  1580  		}
  1581  	}
  1583  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1584  		return fmt.Errorf("index update failed: %v", err)
  1585  	}
  1587  	return nil
  1588  }
  1590  // upsertJobVersion inserts a job into its historic version table and limits the
  1591  // number of job versions that are tracked.
  1592  func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *memdb.Txn) error {
  1593  	// Insert the job
  1594  	if err := txn.Insert("job_version", job); err != nil {
  1595  		return fmt.Errorf("failed to insert job into job_version table: %v", err)
  1596  	}
  1598  	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
  1599  		return fmt.Errorf("index update failed: %v", err)
  1600  	}
  1602  	// Get all the historic jobs for this ID
  1603  	all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID)
  1604  	if err != nil {
  1605  		return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err)
  1606  	}
  1608  	// If we are below the limit there is no GCing to be done
  1609  	if len(all) <= structs.JobTrackedVersions {
  1610  		return nil
  1611  	}
  1613  	// We have to delete a historic job to make room.
  1614  	// Find index of the highest versioned stable job
  1615  	stableIdx := -1
  1616  	for i, j := range all {
  1617  		if j.Stable {
  1618  			stableIdx = i
  1619  			break
  1620  		}
  1621  	}
  1623  	// If the stable job is the oldest version, do a swap to bring it into the
  1624  	// keep set.
  1625  	max := structs.JobTrackedVersions
  1626  	if stableIdx == max {
  1627  		all[max-1], all[max] = all[max], all[max-1]
  1628  	}
  1630  	// Delete the job outside of the set that are being kept.
  1631  	d := all[max]
  1632  	if err := txn.Delete("job_version", d); err != nil {
  1633  		return fmt.Errorf("failed to delete job %v (%d) from job_version", d.ID, d.Version)
  1634  	}
  1636  	return nil
  1637  }
  1639  // JobByID is used to lookup a job by its ID. JobByID returns the current/latest job
  1640  // version.
  1641  func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) {
  1642  	txn := s.db.Txn(false)
  1643  	return s.JobByIDTxn(ws, namespace, id, txn)
  1644  }
  1646  // JobByIDTxn is used to lookup a job by its ID, like  JobByID. JobByID returns the job version
  1647  // accessible through in the transaction
  1648  func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) {
  1649  	watchCh, existing, err := txn.FirstWatch("jobs", "id", namespace, id)
  1650  	if err != nil {
  1651  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1652  	}
  1653  	ws.Add(watchCh)
  1655  	if existing != nil {
  1656  		return existing.(*structs.Job), nil
  1657  	}
  1658  	return nil, nil
  1659  }
  1661  // JobsByIDPrefix is used to lookup a job by prefix
  1662  func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  1663  	txn := s.db.Txn(false)
  1665  	iter, err := txn.Get("jobs", "id_prefix", namespace, id)
  1666  	if err != nil {
  1667  		return nil, fmt.Errorf("job lookup failed: %v", err)
  1668  	}
  1670  	ws.Add(iter.WatchCh())
  1672  	return iter, nil
  1673  }
  1675  // JobVersionsByID returns all the tracked versions of a job.
  1676  func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  1677  	txn := s.db.Txn(false)
  1679  	return s.jobVersionByID(txn, &ws, namespace, id)
  1680  }
  1682  // jobVersionByID is the underlying implementation for retrieving all tracked
  1683  // versions of a job and is called under an existing transaction. A watch set
  1684  // can optionally be passed in to add the job histories to the watch set.
  1685  func (s *StateStore) jobVersionByID(txn *memdb.Txn, ws *memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
  1686  	// Get all the historic jobs for this ID
  1687  	iter, err := txn.Get("job_version", "id_prefix", namespace, id)
  1688  	if err != nil {
  1689  		return nil, err
  1690  	}
  1692  	if ws != nil {
  1693  		ws.Add(iter.WatchCh())
  1694  	}
  1696  	var all []*structs.Job
  1697  	for {
  1698  		raw := iter.Next()
  1699  		if raw == nil {
  1700  			break
  1701  		}
  1703  		// Ensure the ID is an exact match
  1704  		j := raw.(*structs.Job)
  1705  		if j.ID != id {
  1706  			continue
  1707  		}
  1709  		all = append(all, j)
  1710  	}
  1712  	// Sort in reverse order so that the highest version is first
  1713  	sort.Slice(all, func(i, j int) bool {
  1714  		return all[i].Version > all[j].Version
  1715  	})
  1717  	return all, nil
  1718  }
  1720  // JobByIDAndVersion returns the job identified by its ID and Version. The
  1721  // passed watchset may be nil.
  1722  func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) {
  1723  	txn := s.db.Txn(false)
  1724  	return s.jobByIDAndVersionImpl(ws, namespace, id, version, txn)
  1725  }
  1727  // jobByIDAndVersionImpl returns the job identified by its ID and Version. The
  1728  // passed watchset may be nil.
  1729  func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string,
  1730  	version uint64, txn *memdb.Txn) (*structs.Job, error) {
  1732  	watchCh, existing, err := txn.FirstWatch("job_version", "id", namespace, id, version)
  1733  	if err != nil {
  1734  		return nil, err
  1735  	}
  1737  	if ws != nil {
  1738  		ws.Add(watchCh)
  1739  	}
  1741  	if existing != nil {
  1742  		job := existing.(*structs.Job)
  1743  		return job, nil
  1744  	}
  1746  	return nil, nil
  1747  }
  1749  func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1750  	txn := s.db.Txn(false)
  1752  	// Walk the entire deployments table
  1753  	iter, err := txn.Get("job_version", "id")
  1754  	if err != nil {
  1755  		return nil, err
  1756  	}
  1758  	ws.Add(iter.WatchCh())
  1759  	return iter, nil
  1760  }
  1762  // Jobs returns an iterator over all the jobs
  1763  func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1764  	txn := s.db.Txn(false)
  1766  	// Walk the entire jobs table
  1767  	iter, err := txn.Get("jobs", "id")
  1768  	if err != nil {
  1769  		return nil, err
  1770  	}
  1772  	ws.Add(iter.WatchCh())
  1774  	return iter, nil
  1775  }
  1777  // JobsByNamespace returns an iterator over all the jobs for the given namespace
  1778  func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  1779  	txn := s.db.Txn(false)
  1780  	return s.jobsByNamespaceImpl(ws, namespace, txn)
  1781  }
  1783  // jobsByNamespaceImpl returns an iterator over all the jobs for the given namespace
  1784  func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *memdb.Txn) (memdb.ResultIterator, error) {
  1785  	// Walk the entire jobs table
  1786  	iter, err := txn.Get("jobs", "id_prefix", namespace, "")
  1787  	if err != nil {
  1788  		return nil, err
  1789  	}
  1791  	ws.Add(iter.WatchCh())
  1793  	return iter, nil
  1794  }
  1796  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
  1797  func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) {
  1798  	txn := s.db.Txn(false)
  1800  	iter, err := txn.Get("jobs", "periodic", periodic)
  1801  	if err != nil {
  1802  		return nil, err
  1803  	}
  1805  	ws.Add(iter.WatchCh())
  1807  	return iter, nil
  1808  }
  1810  // JobsByScheduler returns an iterator over all the jobs with the specific
  1811  // scheduler type.
  1812  func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) {
  1813  	txn := s.db.Txn(false)
  1815  	// Return an iterator for jobs with the specific type.
  1816  	iter, err := txn.Get("jobs", "type", schedulerType)
  1817  	if err != nil {
  1818  		return nil, err
  1819  	}
  1821  	ws.Add(iter.WatchCh())
  1823  	return iter, nil
  1824  }
  1826  // JobsByGC returns an iterator over all jobs eligible or uneligible for garbage
  1827  // collection.
  1828  func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) {
  1829  	txn := s.db.Txn(false)
  1831  	iter, err := txn.Get("jobs", "gc", gc)
  1832  	if err != nil {
  1833  		return nil, err
  1834  	}
  1836  	ws.Add(iter.WatchCh())
  1838  	return iter, nil
  1839  }
  1841  // JobSummary returns a job summary object which matches a specific id.
  1842  func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) {
  1843  	txn := s.db.Txn(false)
  1845  	watchCh, existing, err := txn.FirstWatch("job_summary", "id", namespace, jobID)
  1846  	if err != nil {
  1847  		return nil, err
  1848  	}
  1850  	ws.Add(watchCh)
  1852  	if existing != nil {
  1853  		summary := existing.(*structs.JobSummary)
  1854  		return summary, nil
  1855  	}
  1857  	return nil, nil
  1858  }
  1860  // JobSummaries walks the entire job summary table and returns all the job
  1861  // summary objects
  1862  func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1863  	txn := s.db.Txn(false)
  1865  	iter, err := txn.Get("job_summary", "id")
  1866  	if err != nil {
  1867  		return nil, err
  1868  	}
  1870  	ws.Add(iter.WatchCh())
  1872  	return iter, nil
  1873  }
  1875  // JobSummaryByPrefix is used to look up Job Summary by id prefix
  1876  func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  1877  	txn := s.db.Txn(false)
  1879  	iter, err := txn.Get("job_summary", "id_prefix", namespace, id)
  1880  	if err != nil {
  1881  		return nil, fmt.Errorf("job_summary lookup failed: %v", err)
  1882  	}
  1884  	ws.Add(iter.WatchCh())
  1886  	return iter, nil
  1887  }
  1889  // CSIVolumeRegister adds a volume to the server store, failing if it already exists
  1890  func (s *StateStore) CSIVolumeRegister(index uint64, volumes []*structs.CSIVolume) error {
  1891  	txn := s.db.Txn(true)
  1892  	defer txn.Abort()
  1894  	for _, v := range volumes {
  1895  		if exists, err := s.namespaceExists(txn, v.Namespace); err != nil {
  1896  			return err
  1897  		} else if !exists {
  1898  			return fmt.Errorf("volume %s is in nonexistent namespace %s", v.ID, v.Namespace)
  1899  		}
  1901  		// Check for volume existence
  1902  		obj, err := txn.First("csi_volumes", "id", v.Namespace, v.ID)
  1903  		if err != nil {
  1904  			return fmt.Errorf("volume existence check error: %v", err)
  1905  		}
  1906  		if obj != nil {
  1907  			// Allow some properties of a volume to be updated in place, but
  1908  			// prevent accidentally overwriting important properties, or
  1909  			// overwriting a volume in use
  1910  			old, ok := obj.(*structs.CSIVolume)
  1911  			if ok &&
  1912  				old.InUse() ||
  1913  				old.ExternalID != v.ExternalID ||
  1914  				old.PluginID != v.PluginID ||
  1915  				old.Provider != v.Provider {
  1916  				return fmt.Errorf("volume exists: %s", v.ID)
  1917  			}
  1918  		}
  1920  		if v.CreateIndex == 0 {
  1921  			v.CreateIndex = index
  1922  			v.ModifyIndex = index
  1923  		}
  1925  		err = txn.Insert("csi_volumes", v)
  1926  		if err != nil {
  1927  			return fmt.Errorf("volume insert: %v", err)
  1928  		}
  1929  	}
  1931  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  1932  		return fmt.Errorf("index update failed: %v", err)
  1933  	}
  1935  	txn.Commit()
  1936  	return nil
  1937  }
  1939  // CSIVolumes returns the unfiltered list of all volumes
  1940  func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  1941  	txn := s.db.Txn(false)
  1942  	defer txn.Abort()
  1944  	iter, err := txn.Get("csi_volumes", "id")
  1945  	if err != nil {
  1946  		return nil, fmt.Errorf("csi_volumes lookup failed: %v", err)
  1947  	}
  1949  	ws.Add(iter.WatchCh())
  1951  	return iter, nil
  1952  }
  1954  // CSIVolumeByID is used to lookup a single volume. Returns a copy of the volume
  1955  // because its plugins are denormalized to provide accurate Health.
  1956  func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) {
  1957  	txn := s.db.Txn(false)
  1959  	watchCh, obj, err := txn.FirstWatch("csi_volumes", "id_prefix", namespace, id)
  1960  	if err != nil {
  1961  		return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  1962  	}
  1963  	ws.Add(watchCh)
  1965  	if obj == nil {
  1966  		return nil, nil
  1967  	}
  1969  	vol := obj.(*structs.CSIVolume)
  1970  	return s.CSIVolumeDenormalizePlugins(ws, vol.Copy())
  1971  }
  1973  // CSIVolumes looks up csi_volumes by pluginID
  1974  func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, pluginID string) (memdb.ResultIterator, error) {
  1975  	txn := s.db.Txn(false)
  1977  	iter, err := txn.Get("csi_volumes", "plugin_id", pluginID)
  1978  	if err != nil {
  1979  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  1980  	}
  1982  	// Filter the iterator by namespace
  1983  	f := func(raw interface{}) bool {
  1984  		v, ok := raw.(*structs.CSIVolume)
  1985  		if !ok {
  1986  			return false
  1987  		}
  1988  		return v.Namespace != namespace
  1989  	}
  1991  	wrap := memdb.NewFilterIterator(iter, f)
  1992  	return wrap, nil
  1993  }
  1995  // CSIVolumesByIDPrefix supports search
  1996  func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) {
  1997  	txn := s.db.Txn(false)
  1999  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, volumeID)
  2000  	if err != nil {
  2001  		return nil, err
  2002  	}
  2004  	ws.Add(iter.WatchCh())
  2005  	return iter, nil
  2006  }
  2008  // CSIVolumesByNodeID looks up CSIVolumes in use on a node
  2009  func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
  2010  	allocs, err := s.AllocsByNode(ws, nodeID)
  2011  	if err != nil {
  2012  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  2013  	}
  2015  	// Find volume ids for CSI volumes in running allocs, or allocs that we desire to run
  2016  	ids := map[string]string{} // Map volumeID to Namespace
  2017  	for _, a := range allocs {
  2018  		tg := a.Job.LookupTaskGroup(a.TaskGroup)
  2020  		if !(a.DesiredStatus == structs.AllocDesiredStatusRun ||
  2021  			a.ClientStatus == structs.AllocClientStatusRunning) ||
  2022  			len(tg.Volumes) == 0 {
  2023  			continue
  2024  		}
  2026  		for _, v := range tg.Volumes {
  2027  			if v.Type != structs.VolumeTypeCSI {
  2028  				continue
  2029  			}
  2030  			ids[v.Source] = a.Namespace
  2031  		}
  2032  	}
  2034  	// Lookup the raw CSIVolumes to match the other list interfaces
  2035  	iter := NewSliceIterator()
  2036  	txn := s.db.Txn(false)
  2037  	for id, namespace := range ids {
  2038  		raw, err := txn.First("csi_volumes", "id", namespace, id)
  2039  		if err != nil {
  2040  			return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
  2041  		}
  2042  		iter.Add(raw)
  2043  	}
  2045  	return iter, nil
  2046  }
  2048  // CSIVolumesByNamespace looks up the entire csi_volumes table
  2049  func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2050  	txn := s.db.Txn(false)
  2052  	iter, err := txn.Get("csi_volumes", "id_prefix", namespace, "")
  2053  	if err != nil {
  2054  		return nil, fmt.Errorf("volume lookup failed: %v", err)
  2055  	}
  2056  	ws.Add(iter.WatchCh())
  2058  	return iter, nil
  2059  }
  2061  // CSIVolumeClaim updates the volume's claim count and allocation list
  2062  func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, claim *structs.CSIVolumeClaim) error {
  2063  	txn := s.db.Txn(true)
  2064  	defer txn.Abort()
  2065  	ws := memdb.NewWatchSet()
  2067  	row, err := txn.First("csi_volumes", "id", namespace, id)
  2068  	if err != nil {
  2069  		return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2070  	}
  2071  	if row == nil {
  2072  		return fmt.Errorf("volume not found: %s", id)
  2073  	}
  2075  	orig, ok := row.(*structs.CSIVolume)
  2076  	if !ok {
  2077  		return fmt.Errorf("volume row conversion error")
  2078  	}
  2080  	var alloc *structs.Allocation
  2081  	if claim.Mode != structs.CSIVolumeClaimRelease {
  2082  		alloc, err = s.AllocByID(ws, claim.AllocationID)
  2083  		if err != nil {
  2084  			s.logger.Error("AllocByID failed", "error", err)
  2085  			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2086  		}
  2087  		if alloc == nil {
  2088  			s.logger.Error("AllocByID failed to find alloc", "alloc_id", claim.AllocationID)
  2089  			if err != nil {
  2090  				return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
  2091  			}
  2092  		}
  2093  	}
  2095  	volume, err := s.CSIVolumeDenormalizePlugins(ws, orig.Copy())
  2096  	if err != nil {
  2097  		return err
  2098  	}
  2100  	volume, err = s.CSIVolumeDenormalize(ws, volume)
  2101  	if err != nil {
  2102  		return err
  2103  	}
  2105  	// in the case of a job deregistration, there will be no allocation ID
  2106  	// for the claim but we still want to write an updated index to the volume
  2107  	// so that volume reaping is triggered
  2108  	if claim.AllocationID != "" {
  2109  		err = volume.Claim(claim, alloc)
  2110  		if err != nil {
  2111  			return err
  2112  		}
  2113  	}
  2115  	volume.ModifyIndex = index
  2117  	if err = txn.Insert("csi_volumes", volume); err != nil {
  2118  		return fmt.Errorf("volume update failed: %s: %v", id, err)
  2119  	}
  2121  	if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2122  		return fmt.Errorf("index update failed: %v", err)
  2123  	}
  2125  	txn.Commit()
  2126  	return nil
  2127  }
  2129  // CSIVolumeDeregister removes the volume from the server
  2130  func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string) error {
  2131  	txn := s.db.Txn(true)
  2132  	defer txn.Abort()
  2134  	for _, id := range ids {
  2135  		existing, err := txn.First("csi_volumes", "id_prefix", namespace, id)
  2136  		if err != nil {
  2137  			return fmt.Errorf("volume lookup failed: %s: %v", id, err)
  2138  		}
  2140  		if existing == nil {
  2141  			return fmt.Errorf("volume not found: %s", id)
  2142  		}
  2144  		vol, ok := existing.(*structs.CSIVolume)
  2145  		if !ok {
  2146  			return fmt.Errorf("volume row conversion error: %s", id)
  2147  		}
  2149  		if vol.InUse() {
  2150  			return fmt.Errorf("volume in use: %s", id)
  2151  		}
  2153  		if err = txn.Delete("csi_volumes", existing); err != nil {
  2154  			return fmt.Errorf("volume delete failed: %s: %v", id, err)
  2155  		}
  2156  	}
  2158  	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
  2159  		return fmt.Errorf("index update failed: %v", err)
  2160  	}
  2162  	txn.Commit()
  2163  	return nil
  2164  }
  2166  // CSIVolumeDenormalizePlugins returns a CSIVolume with current health and plugins, but
  2167  // without allocations
  2168  // Use this for current volume metadata, handling lists of volumes
  2169  // Use CSIVolumeDenormalize for volumes containing both health and current allocations
  2170  func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2171  	if vol == nil {
  2172  		return nil, nil
  2173  	}
  2174  	// Lookup CSIPlugin, the health records, and calculate volume health
  2175  	txn := s.db.Txn(false)
  2176  	defer txn.Abort()
  2178  	plug, err := s.CSIPluginByID(ws, vol.PluginID)
  2179  	if err != nil {
  2180  		return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
  2181  	}
  2182  	if plug == nil {
  2183  		vol.ControllersHealthy = 0
  2184  		vol.NodesHealthy = 0
  2185  		vol.Schedulable = false
  2186  		return vol, nil
  2187  	}
  2189  	vol.Provider = plug.Provider
  2190  	vol.ProviderVersion = plug.Version
  2191  	vol.ControllerRequired = plug.ControllerRequired
  2192  	vol.ControllersHealthy = plug.ControllersHealthy
  2193  	vol.NodesHealthy = plug.NodesHealthy
  2194  	// This number is incorrect! The expected number of node plugins is actually this +
  2195  	// the number of blocked evaluations for the jobs controlling these plugins
  2196  	vol.ControllersExpected = len(plug.Controllers)
  2197  	vol.NodesExpected = len(plug.Nodes)
  2199  	vol.Schedulable = vol.NodesHealthy > 0
  2200  	if vol.ControllerRequired {
  2201  		vol.Schedulable = vol.ControllersHealthy > 0 && vol.Schedulable
  2202  	}
  2204  	return vol, nil
  2205  }
  2207  // CSIVolumeDenormalize returns a CSIVolume with allocations
  2208  func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
  2209  	for id := range vol.ReadAllocs {
  2210  		a, err := s.AllocByID(ws, id)
  2211  		if err != nil {
  2212  			return nil, err
  2213  		}
  2214  		if a != nil {
  2215  			vol.ReadAllocs[id] = a
  2216  			// COMPAT(1.0): the CSIVolumeClaim fields were added
  2217  			// after 0.11.1, so claims made before that may be
  2218  			// missing this value. (same for WriteAlloc below)
  2219  			if _, ok := vol.ReadClaims[id]; !ok {
  2220  				vol.ReadClaims[id] = &structs.CSIVolumeClaim{
  2221  					AllocationID: a.ID,
  2222  					NodeID:       a.NodeID,
  2223  					Mode:         structs.CSIVolumeClaimRead,
  2224  					State:        structs.CSIVolumeClaimStateTaken,
  2225  				}
  2226  			}
  2227  		}
  2228  	}
  2230  	for id := range vol.WriteAllocs {
  2231  		a, err := s.AllocByID(ws, id)
  2232  		if err != nil {
  2233  			return nil, err
  2234  		}
  2235  		if a != nil {
  2236  			vol.WriteAllocs[id] = a
  2237  			if _, ok := vol.WriteClaims[id]; !ok {
  2238  				vol.WriteClaims[id] = &structs.CSIVolumeClaim{
  2239  					AllocationID: a.ID,
  2240  					NodeID:       a.NodeID,
  2241  					Mode:         structs.CSIVolumeClaimWrite,
  2242  					State:        structs.CSIVolumeClaimStateTaken,
  2243  				}
  2244  			}
  2245  		}
  2246  	}
  2248  	return vol, nil
  2249  }
  2251  // CSIPlugins returns the unfiltered list of all plugin health status
  2252  func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2253  	txn := s.db.Txn(false)
  2254  	defer txn.Abort()
  2256  	iter, err := txn.Get("csi_plugins", "id")
  2257  	if err != nil {
  2258  		return nil, fmt.Errorf("csi_plugins lookup failed: %v", err)
  2259  	}
  2261  	ws.Add(iter.WatchCh())
  2263  	return iter, nil
  2264  }
  2266  // CSIPluginsByIDPrefix supports search
  2267  func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) {
  2268  	txn := s.db.Txn(false)
  2270  	iter, err := txn.Get("csi_plugins", "id_prefix", pluginID)
  2271  	if err != nil {
  2272  		return nil, err
  2273  	}
  2275  	ws.Add(iter.WatchCh())
  2277  	return iter, nil
  2278  }
  2280  // CSIPluginByID returns the one named CSIPlugin
  2281  func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
  2282  	txn := s.db.Txn(false)
  2283  	defer txn.Abort()
  2285  	raw, err := txn.First("csi_plugins", "id_prefix", id)
  2286  	if err != nil {
  2287  		return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err)
  2288  	}
  2290  	if raw == nil {
  2291  		return nil, nil
  2292  	}
  2294  	plug := raw.(*structs.CSIPlugin)
  2296  	return plug, nil
  2297  }
  2299  // CSIPluginDenormalize returns a CSIPlugin with allocation details
  2300  func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
  2301  	if plug == nil {
  2302  		return nil, nil
  2303  	}
  2305  	// Get the unique list of allocation ids
  2306  	ids := map[string]struct{}{}
  2307  	for _, info := range plug.Controllers {
  2308  		ids[info.AllocID] = struct{}{}
  2309  	}
  2310  	for _, info := range plug.Nodes {
  2311  		ids[info.AllocID] = struct{}{}
  2312  	}
  2314  	for id := range ids {
  2315  		alloc, err := s.AllocByID(ws, id)
  2316  		if err != nil {
  2317  			return nil, err
  2318  		}
  2319  		if alloc == nil {
  2320  			continue
  2321  		}
  2322  		plug.Allocations = append(plug.Allocations, alloc.Stub())
  2323  	}
  2325  	return plug, nil
  2326  }
  2328  // UpsertCSIPlugin writes the plugin to the state store. Note: there
  2329  // is currently no raft message for this, as it's intended to support
  2330  // testing use cases.
  2331  func (s *StateStore) UpsertCSIPlugin(index uint64, plug *structs.CSIPlugin) error {
  2332  	txn := s.db.Txn(true)
  2333  	defer txn.Abort()
  2335  	existing, err := txn.First("csi_plugins", "id", plug.ID)
  2336  	if err != nil {
  2337  		return fmt.Errorf("csi_plugin lookup error: %s %v", plug.ID, err)
  2338  	}
  2340  	plug.ModifyIndex = index
  2341  	if existing != nil {
  2342  		plug.CreateIndex = existing.(*structs.CSIPlugin).CreateIndex
  2343  	}
  2345  	err = txn.Insert("csi_plugins", plug)
  2346  	if err != nil {
  2347  		return fmt.Errorf("csi_plugins insert error: %v", err)
  2348  	}
  2349  	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
  2350  		return fmt.Errorf("index update failed: %v", err)
  2351  	}
  2352  	txn.Commit()
  2353  	return nil
  2354  }
  2356  // DeleteCSIPlugin deletes the plugin if it's not in use.
  2357  func (s *StateStore) DeleteCSIPlugin(index uint64, id string) error {
  2358  	txn := s.db.Txn(true)
  2359  	defer txn.Abort()
  2360  	ws := memdb.NewWatchSet()
  2362  	plug, err := s.CSIPluginByID(ws, id)
  2363  	if err != nil {
  2364  		return err
  2365  	}
  2367  	if plug == nil {
  2368  		return nil
  2369  	}
  2371  	plug, err = s.CSIPluginDenormalize(ws, plug.Copy())
  2372  	if err != nil {
  2373  		return err
  2374  	}
  2375  	if !plug.IsEmpty() {
  2376  		return fmt.Errorf("plugin in use")
  2377  	}
  2379  	err = txn.Delete("csi_plugins", plug)
  2380  	if err != nil {
  2381  		return fmt.Errorf("csi_plugins delete error: %v", err)
  2382  	}
  2383  	txn.Commit()
  2384  	return nil
  2385  }
  2387  // UpsertPeriodicLaunch is used to register a launch or update it.
  2388  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
  2389  	txn := s.db.Txn(true)
  2390  	defer txn.Abort()
  2392  	// Check if the job already exists
  2393  	existing, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID)
  2394  	if err != nil {
  2395  		return fmt.Errorf("periodic launch lookup failed: %v", err)
  2396  	}
  2398  	// Setup the indexes correctly
  2399  	if existing != nil {
  2400  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
  2401  		launch.ModifyIndex = index
  2402  	} else {
  2403  		launch.CreateIndex = index
  2404  		launch.ModifyIndex = index
  2405  	}
  2407  	// Insert the job
  2408  	if err := txn.Insert("periodic_launch", launch); err != nil {
  2409  		return fmt.Errorf("launch insert failed: %v", err)
  2410  	}
  2411  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2412  		return fmt.Errorf("index update failed: %v", err)
  2413  	}
  2415  	txn.Commit()
  2416  	return nil
  2417  }
  2419  // DeletePeriodicLaunch is used to delete the periodic launch
  2420  func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error {
  2421  	txn := s.db.Txn(true)
  2422  	defer txn.Abort()
  2424  	err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn)
  2425  	if err == nil {
  2426  		txn.Commit()
  2427  	}
  2428  	return err
  2429  }
  2431  // DeletePeriodicLaunchTxn is used to delete the periodic launch, like DeletePeriodicLaunch
  2432  // but in a transaction.  Useful for when making multiple modifications atomically
  2433  func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error {
  2434  	// Lookup the launch
  2435  	existing, err := txn.First("periodic_launch", "id", namespace, jobID)
  2436  	if err != nil {
  2437  		return fmt.Errorf("launch lookup failed: %v", err)
  2438  	}
  2439  	if existing == nil {
  2440  		return fmt.Errorf("launch not found")
  2441  	}
  2443  	// Delete the launch
  2444  	if err := txn.Delete("periodic_launch", existing); err != nil {
  2445  		return fmt.Errorf("launch delete failed: %v", err)
  2446  	}
  2447  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
  2448  		return fmt.Errorf("index update failed: %v", err)
  2449  	}
  2451  	return nil
  2452  }
  2454  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
  2455  // ID.
  2456  func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) {
  2457  	txn := s.db.Txn(false)
  2459  	watchCh, existing, err := txn.FirstWatch("periodic_launch", "id", namespace, id)
  2460  	if err != nil {
  2461  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
  2462  	}
  2464  	ws.Add(watchCh)
  2466  	if existing != nil {
  2467  		return existing.(*structs.PeriodicLaunch), nil
  2468  	}
  2469  	return nil, nil
  2470  }
  2472  // PeriodicLaunches returns an iterator over all the periodic launches
  2473  func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2474  	txn := s.db.Txn(false)
  2476  	// Walk the entire table
  2477  	iter, err := txn.Get("periodic_launch", "id")
  2478  	if err != nil {
  2479  		return nil, err
  2480  	}
  2482  	ws.Add(iter.WatchCh())
  2484  	return iter, nil
  2485  }
  2487  // UpsertEvals is used to upsert a set of evaluations
  2488  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
  2489  	txn := s.db.Txn(true)
  2490  	defer txn.Abort()
  2492  	err := s.UpsertEvalsTxn(index, evals, txn)
  2493  	if err == nil {
  2494  		txn.Commit()
  2495  	}
  2496  	return err
  2497  }
  2499  // UpsertEvals is used to upsert a set of evaluations, like UpsertEvals
  2500  // but in a transaction.  Useful for when making multiple modifications atomically
  2501  func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error {
  2502  	// Do a nested upsert
  2503  	jobs := make(map[structs.NamespacedID]string, len(evals))
  2504  	for _, eval := range evals {
  2505  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  2506  			return err
  2507  		}
  2509  		tuple := structs.NamespacedID{
  2510  			ID:        eval.JobID,
  2511  			Namespace: eval.Namespace,
  2512  		}
  2513  		jobs[tuple] = ""
  2514  	}
  2516  	// Set the job's status
  2517  	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
  2518  		return fmt.Errorf("setting job status failed: %v", err)
  2519  	}
  2521  	return nil
  2522  }
  2524  // nestedUpsertEvaluation is used to nest an evaluation upsert within a transaction
  2525  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
  2526  	// Lookup the evaluation
  2527  	existing, err := txn.First("evals", "id", eval.ID)
  2528  	if err != nil {
  2529  		return fmt.Errorf("eval lookup failed: %v", err)
  2530  	}
  2532  	// Update the indexes
  2533  	if existing != nil {
  2534  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
  2535  		eval.ModifyIndex = index
  2536  	} else {
  2537  		eval.CreateIndex = index
  2538  		eval.ModifyIndex = index
  2539  	}
  2541  	// Update the job summary
  2542  	summaryRaw, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID)
  2543  	if err != nil {
  2544  		return fmt.Errorf("job summary lookup failed: %v", err)
  2545  	}
  2546  	if summaryRaw != nil {
  2547  		js := summaryRaw.(*structs.JobSummary).Copy()
  2548  		hasSummaryChanged := false
  2549  		for tg, num := range eval.QueuedAllocations {
  2550  			if summary, ok := js.Summary[tg]; ok {
  2551  				if summary.Queued != num {
  2552  					summary.Queued = num
  2553  					js.Summary[tg] = summary
  2554  					hasSummaryChanged = true
  2555  				}
  2556  			} else {
  2557  				s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
  2558  			}
  2559  		}
  2561  		// Insert the job summary
  2562  		if hasSummaryChanged {
  2563  			js.ModifyIndex = index
  2564  			if err := txn.Insert("job_summary", js); err != nil {
  2565  				return fmt.Errorf("job summary insert failed: %v", err)
  2566  			}
  2567  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  2568  				return fmt.Errorf("index update failed: %v", err)
  2569  			}
  2570  		}
  2571  	}
  2573  	// Check if the job has any blocked evaluations and cancel them
  2574  	if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 {
  2575  		// Get the blocked evaluation for a job if it exists
  2576  		iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked)
  2577  		if err != nil {
  2578  			return fmt.Errorf("failed to get blocked evals for job %q in namespace %q: %v", eval.JobID, eval.Namespace, err)
  2579  		}
  2581  		var blocked []*structs.Evaluation
  2582  		for {
  2583  			raw := iter.Next()
  2584  			if raw == nil {
  2585  				break
  2586  			}
  2587  			blocked = append(blocked, raw.(*structs.Evaluation))
  2588  		}
  2590  		// Go through and update the evals
  2591  		for _, eval := range blocked {
  2592  			newEval := eval.Copy()
  2593  			newEval.Status = structs.EvalStatusCancelled
  2594  			newEval.StatusDescription = fmt.Sprintf("evaluation %q successful", newEval.ID)
  2595  			newEval.ModifyIndex = index
  2597  			if err := txn.Insert("evals", newEval); err != nil {
  2598  				return fmt.Errorf("eval insert failed: %v", err)
  2599  			}
  2600  		}
  2601  	}
  2603  	// Insert the eval
  2604  	if err := txn.Insert("evals", eval); err != nil {
  2605  		return fmt.Errorf("eval insert failed: %v", err)
  2606  	}
  2607  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2608  		return fmt.Errorf("index update failed: %v", err)
  2609  	}
  2610  	return nil
  2611  }
  2613  // updateEvalModifyIndex is used to update the modify index of an evaluation that has been
  2614  // through a scheduler pass. This is done as part of plan apply. It ensures that when a subsequent
  2615  // scheduler workers process a re-queued evaluation it sees any partial updates from the plan apply.
  2616  func (s *StateStore) updateEvalModifyIndex(txn *memdb.Txn, index uint64, evalID string) error {
  2617  	// Lookup the evaluation
  2618  	existing, err := txn.First("evals", "id", evalID)
  2619  	if err != nil {
  2620  		return fmt.Errorf("eval lookup failed: %v", err)
  2621  	}
  2622  	if existing == nil {
  2623  		s.logger.Error("unable to find eval", "eval_id", evalID)
  2624  		return fmt.Errorf("unable to find eval id %q", evalID)
  2625  	}
  2626  	eval := existing.(*structs.Evaluation).Copy()
  2627  	// Update the indexes
  2628  	eval.ModifyIndex = index
  2630  	// Insert the eval
  2631  	if err := txn.Insert("evals", eval); err != nil {
  2632  		return fmt.Errorf("eval insert failed: %v", err)
  2633  	}
  2634  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2635  		return fmt.Errorf("index update failed: %v", err)
  2636  	}
  2637  	return nil
  2638  }
  2640  // DeleteEval is used to delete an evaluation
  2641  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
  2642  	txn := s.db.Txn(true)
  2643  	defer txn.Abort()
  2645  	jobs := make(map[structs.NamespacedID]string, len(evals))
  2646  	for _, eval := range evals {
  2647  		existing, err := txn.First("evals", "id", eval)
  2648  		if err != nil {
  2649  			return fmt.Errorf("eval lookup failed: %v", err)
  2650  		}
  2651  		if existing == nil {
  2652  			continue
  2653  		}
  2654  		if err := txn.Delete("evals", existing); err != nil {
  2655  			return fmt.Errorf("eval delete failed: %v", err)
  2656  		}
  2657  		eval := existing.(*structs.Evaluation)
  2659  		tuple := structs.NamespacedID{
  2660  			ID:        eval.JobID,
  2661  			Namespace: eval.Namespace,
  2662  		}
  2663  		jobs[tuple] = ""
  2664  	}
  2666  	for _, alloc := range allocs {
  2667  		raw, err := txn.First("allocs", "id", alloc)
  2668  		if err != nil {
  2669  			return fmt.Errorf("alloc lookup failed: %v", err)
  2670  		}
  2671  		if raw == nil {
  2672  			continue
  2673  		}
  2674  		if err := txn.Delete("allocs", raw); err != nil {
  2675  			return fmt.Errorf("alloc delete failed: %v", err)
  2676  		}
  2677  	}
  2679  	// Update the indexes
  2680  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
  2681  		return fmt.Errorf("index update failed: %v", err)
  2682  	}
  2683  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2684  		return fmt.Errorf("index update failed: %v", err)
  2685  	}
  2687  	// Set the job's status
  2688  	if err := s.setJobStatuses(index, txn, jobs, true); err != nil {
  2689  		return fmt.Errorf("setting job status failed: %v", err)
  2690  	}
  2692  	txn.Commit()
  2693  	return nil
  2694  }
  2696  // EvalByID is used to lookup an eval by its ID
  2697  func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) {
  2698  	txn := s.db.Txn(false)
  2700  	watchCh, existing, err := txn.FirstWatch("evals", "id", id)
  2701  	if err != nil {
  2702  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  2703  	}
  2705  	ws.Add(watchCh)
  2707  	if existing != nil {
  2708  		return existing.(*structs.Evaluation), nil
  2709  	}
  2710  	return nil, nil
  2711  }
  2713  // EvalsByIDPrefix is used to lookup evaluations by prefix in a particular
  2714  // namespace
  2715  func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  2716  	txn := s.db.Txn(false)
  2718  	// Get an iterator over all evals by the id prefix
  2719  	iter, err := txn.Get("evals", "id_prefix", id)
  2720  	if err != nil {
  2721  		return nil, fmt.Errorf("eval lookup failed: %v", err)
  2722  	}
  2724  	ws.Add(iter.WatchCh())
  2726  	// Wrap the iterator in a filter
  2727  	wrap := memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace))
  2728  	return wrap, nil
  2729  }
  2731  // evalNamespaceFilter returns a filter function that filters all evaluations
  2732  // not in the given namespace.
  2733  func evalNamespaceFilter(namespace string) func(interface{}) bool {
  2734  	return func(raw interface{}) bool {
  2735  		eval, ok := raw.(*structs.Evaluation)
  2736  		if !ok {
  2737  			return true
  2738  		}
  2740  		return eval.Namespace != namespace
  2741  	}
  2742  }
  2744  // EvalsByJob returns all the evaluations by job id
  2745  func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) {
  2746  	txn := s.db.Txn(false)
  2748  	// Get an iterator over the node allocations
  2749  	iter, err := txn.Get("evals", "job_prefix", namespace, jobID)
  2750  	if err != nil {
  2751  		return nil, err
  2752  	}
  2754  	ws.Add(iter.WatchCh())
  2756  	var out []*structs.Evaluation
  2757  	for {
  2758  		raw := iter.Next()
  2759  		if raw == nil {
  2760  			break
  2761  		}
  2763  		e := raw.(*structs.Evaluation)
  2765  		// Filter non-exact matches
  2766  		if e.JobID != jobID {
  2767  			continue
  2768  		}
  2770  		out = append(out, e)
  2771  	}
  2772  	return out, nil
  2773  }
  2775  // Evals returns an iterator over all the evaluations
  2776  func (s *StateStore) Evals(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  2777  	txn := s.db.Txn(false)
  2779  	// Walk the entire table
  2780  	iter, err := txn.Get("evals", "id")
  2781  	if err != nil {
  2782  		return nil, err
  2783  	}
  2785  	ws.Add(iter.WatchCh())
  2787  	return iter, nil
  2788  }
  2790  // EvalsByNamespace returns an iterator over all the evaluations in the given
  2791  // namespace
  2792  func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  2793  	txn := s.db.Txn(false)
  2795  	// Walk the entire table
  2796  	iter, err := txn.Get("evals", "namespace", namespace)
  2797  	if err != nil {
  2798  		return nil, err
  2799  	}
  2801  	ws.Add(iter.WatchCh())
  2803  	return iter, nil
  2804  }
  2806  // UpdateAllocsFromClient is used to update an allocation based on input
  2807  // from a client. While the schedulers are the authority on the allocation for
  2808  // most things, some updates are authoritative from the client. Specifically,
  2809  // the desired state comes from the schedulers, while the actual state comes
  2810  // from clients.
  2811  func (s *StateStore) UpdateAllocsFromClient(index uint64, allocs []*structs.Allocation) error {
  2812  	txn := s.db.Txn(true)
  2813  	defer txn.Abort()
  2815  	// Handle each of the updated allocations
  2816  	for _, alloc := range allocs {
  2817  		if err := s.nestedUpdateAllocFromClient(txn, index, alloc); err != nil {
  2818  			return err
  2819  		}
  2820  	}
  2822  	// Update the indexes
  2823  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  2824  		return fmt.Errorf("index update failed: %v", err)
  2825  	}
  2827  	txn.Commit()
  2828  	return nil
  2829  }
// nestedUpdateAllocFromClient applies a client-reported allocation update
// inside an existing write transaction. It is a no-op when the allocation is
// not present in the "allocs" table. Otherwise a copy of the stored allocation
// is updated with the client-owned fields (client status and description, task
// states, deployment health, modify time), the dependent deployment, job
// summary, enterprise and plugin state is updated through the corresponding
// helpers, the copy is inserted, and the status of the owning job is
// re-evaluated. The caller is responsible for committing txn.
func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, index uint64, alloc *structs.Allocation) error {
	// Look for existing alloc
	existing, err := txn.First("allocs", "id", alloc.ID)
	if err != nil {
		return fmt.Errorf("alloc lookup failed: %v", err)
	}

	// Nothing to do if this does not exist
	if existing == nil {
		return nil
	}
	exist := existing.(*structs.Allocation)

	// Copy everything from the existing allocation so the stored object is
	// never modified in place
	copyAlloc := exist.Copy()

	// Pull in anything the client is the authority on
	copyAlloc.ClientStatus = alloc.ClientStatus
	copyAlloc.ClientDescription = alloc.ClientDescription
	copyAlloc.TaskStates = alloc.TaskStates

	// The client can only set its deployment health and timestamp, so just take
	// those
	if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil {
		oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth()
		newHasHealthy := alloc.DeploymentStatus.HasHealth()

		// We got new health information from the client: either health was
		// not set before, or the reported value differs from the stored one
		if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) {
			// Updated deployment health and timestamp
			copyAlloc.DeploymentStatus.Healthy = helper.BoolToPtr(*alloc.DeploymentStatus.Healthy)
			copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp
			copyAlloc.DeploymentStatus.ModifyIndex = index
		}
	} else if alloc.DeploymentStatus != nil {
		// First time getting a deployment status so copy everything and just
		// set the index
		copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy()
		copyAlloc.DeploymentStatus.ModifyIndex = index
	}

	// Update the modify index
	copyAlloc.ModifyIndex = index

	// Update the modify time
	copyAlloc.ModifyTime = alloc.ModifyTime

	// Propagate the change to the state derived from the allocation. The
	// helpers receive both the updated copy and the previously stored
	// allocation (the plugin helper only takes the updated copy).
	if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return fmt.Errorf("error updating deployment: %v", err)
	}

	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return fmt.Errorf("error updating job summary: %v", err)
	}

	if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return err
	}

	if err := s.updatePluginWithAlloc(index, copyAlloc, txn); err != nil {
		return err
	}

	// Update the allocation
	if err := txn.Insert("allocs", copyAlloc); err != nil {
		return fmt.Errorf("alloc insert failed: %v", err)
	}

	// Set the job's status: a non-terminal allocation forces the job to
	// running, otherwise no status is forced (empty string)
	forceStatus := ""
	if !copyAlloc.TerminalStatus() {
		forceStatus = structs.JobStatusRunning
	}

	// The job is identified by the stored allocation's job ID and namespace
	tuple := structs.NamespacedID{
		ID:        exist.JobID,
		Namespace: exist.Namespace,
	}
	jobs := map[structs.NamespacedID]string{tuple: forceStatus}

	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}
	return nil
}
  2918  // UpsertAllocs is used to evict a set of allocations and allocate new ones at
  2919  // the same time.
  2920  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
  2921  	txn := s.db.Txn(true)
  2922  	defer txn.Abort()
  2923  	if err := s.upsertAllocsImpl(index, allocs, txn); err != nil {
  2924  		return err
  2925  	}
  2926  	txn.Commit()
  2927  	return nil
  2928  }
// upsertAllocsImpl is the actual implementation of UpsertAllocs so that it may
// be used with an existing transaction. For every allocation it sets the
// create/modify indexes, carries over client-owned fields from the stored
// allocation (if any), updates the dependent deployment, job summary,
// enterprise and plugin state, inserts the allocation, and links the previous
// allocation to it. Finally the "allocs" index is updated and the statuses of
// all affected jobs are re-evaluated. The caller is responsible for committing
// txn.
//
// NOTE: the passed-in allocations are modified in place.
func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *memdb.Txn) error {
	// Handle the allocations. jobs collects, per affected job, the status to
	// force when job statuses are updated at the end.
	jobs := make(map[structs.NamespacedID]string, 1)
	for _, alloc := range allocs {
		existing, err := txn.First("allocs", "id", alloc.ID)
		if err != nil {
			return fmt.Errorf("alloc lookup failed: %v", err)
		}
		// exist is nil when the allocation is not stored yet
		exist, _ := existing.(*structs.Allocation)

		if exist == nil {
			// First time this allocation is seen
			alloc.CreateIndex = index
			alloc.ModifyIndex = index
			alloc.AllocModifyIndex = index
			if alloc.DeploymentStatus != nil {
				alloc.DeploymentStatus.ModifyIndex = index
			}

			// A previously reported issue uncovered a race between a forced
			// garbage collection and the scheduler marking an allocation as
			// terminal. The issue is that the allocation from the scheduler
			// has its job normalized and the FSM will only denormalize if the
			// allocation is not terminal. However, if the allocation is
			// garbage collected, that will result in an allocation being
			// upserted for the first time without a job attached. By
			// returning an error here, it will cause the FSM to error,
			// causing the plan_apply to error and thus causing the evaluation
			// to be failed. This will force an index refresh that should
			// solve this issue.
			if alloc.Job == nil {
				return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID)
			}
		} else {
			// Updating a stored allocation: keep its original create index
			alloc.CreateIndex = exist.CreateIndex
			alloc.ModifyIndex = index
			alloc.AllocModifyIndex = index

			// Keep the client's task states
			alloc.TaskStates = exist.TaskStates

			// If the scheduler is marking this allocation as lost we do not
			// want to reuse the status of the existing allocation.
			if alloc.ClientStatus != structs.AllocClientStatusLost {
				alloc.ClientStatus = exist.ClientStatus
				alloc.ClientDescription = exist.ClientDescription
			}

			// The job has been normalized away, so re-attach the stored job
			if alloc.Job == nil {
				alloc.Job = exist.Job
			}
		}

		// OPTIMIZATION:
		// These should be given a map of new to old allocation and the updates
		// should be done once for all changes. The current implementation
		// causes O(n) lookups/copies/insertions rather than O(1)
		if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil {
			return fmt.Errorf("error updating deployment: %v", err)
		}

		if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil {
			return fmt.Errorf("error updating job summary: %v", err)
		}

		if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil {
			return err
		}

		if err := s.updatePluginWithAlloc(index, alloc, txn); err != nil {
			return err
		}

		if err := txn.Insert("allocs", alloc); err != nil {
			return fmt.Errorf("alloc insert failed: %v", err)
		}

		// If this allocation replaces another one, point the previous
		// allocation at it. The stored previous allocation is copied before
		// being modified; a missing previous allocation is ignored.
		if alloc.PreviousAllocation != "" {
			prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation)
			if err != nil {
				return fmt.Errorf("alloc lookup failed: %v", err)
			}
			existingPrevAlloc, _ := prevAlloc.(*structs.Allocation)
			if existingPrevAlloc != nil {
				prevAllocCopy := existingPrevAlloc.Copy()
				prevAllocCopy.NextAllocation = alloc.ID
				prevAllocCopy.ModifyIndex = index
				if err := txn.Insert("allocs", prevAllocCopy); err != nil {
					return fmt.Errorf("alloc insert failed: %v", err)
				}
			}
		}

		// If the allocation is not terminal, force the job to running status;
		// otherwise no status is forced (empty string).
		forceStatus := ""
		if !alloc.TerminalStatus() {
			forceStatus = structs.JobStatusRunning
		}

		// When several allocations share a job, the last one processed
		// determines the forced status recorded for that job
		tuple := structs.NamespacedID{
			ID:        alloc.JobID,
			Namespace: alloc.Namespace,
		}
		jobs[tuple] = forceStatus
	}

	// Update the indexes
	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Set the job's status
	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}

	return nil
}
  3051  // UpdateAllocsDesiredTransitions is used to update a set of allocations
  3052  // desired transitions.
  3053  func (s *StateStore) UpdateAllocsDesiredTransitions(index uint64, allocs map[string]*structs.DesiredTransition,
  3054  	evals []*structs.Evaluation) error {
  3056  	txn := s.db.Txn(true)
  3057  	defer txn.Abort()
  3059  	// Handle each of the updated allocations
  3060  	for id, transition := range allocs {
  3061  		if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transition); err != nil {
  3062  			return err
  3063  		}
  3064  	}
  3066  	for _, eval := range evals {
  3067  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
  3068  			return err
  3069  		}
  3070  	}
  3072  	// Update the indexes
  3073  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3074  		return fmt.Errorf("index update failed: %v", err)
  3075  	}
  3077  	txn.Commit()
  3078  	return nil
  3079  }
  3081  // nestedUpdateAllocDesiredTransition is used to nest an update of an
  3082  // allocations desired transition
  3083  func (s *StateStore) nestedUpdateAllocDesiredTransition(
  3084  	txn *memdb.Txn, index uint64, allocID string,
  3085  	transition *structs.DesiredTransition) error {
  3087  	// Look for existing alloc
  3088  	existing, err := txn.First("allocs", "id", allocID)
  3089  	if err != nil {
  3090  		return fmt.Errorf("alloc lookup failed: %v", err)
  3091  	}
  3093  	// Nothing to do if this does not exist
  3094  	if existing == nil {
  3095  		return nil
  3096  	}
  3097  	exist := existing.(*structs.Allocation)
  3099  	// Copy everything from the existing allocation
  3100  	copyAlloc := exist.Copy()
  3102  	// Merge the desired transitions
  3103  	copyAlloc.DesiredTransition.Merge(transition)
  3105  	// Update the modify index
  3106  	copyAlloc.ModifyIndex = index
  3108  	// Update the allocation
  3109  	if err := txn.Insert("allocs", copyAlloc); err != nil {
  3110  		return fmt.Errorf("alloc insert failed: %v", err)
  3111  	}
  3113  	return nil
  3114  }
  3116  // AllocByID is used to lookup an allocation by its ID
  3117  func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) {
  3118  	txn := s.db.Txn(false)
  3120  	watchCh, existing, err := txn.FirstWatch("allocs", "id", id)
  3121  	if err != nil {
  3122  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  3123  	}
  3125  	ws.Add(watchCh)
  3127  	if existing != nil {
  3128  		return existing.(*structs.Allocation), nil
  3129  	}
  3130  	return nil, nil
  3131  }
  3133  // AllocsByIDPrefix is used to lookup allocs by prefix
  3134  func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
  3135  	txn := s.db.Txn(false)
  3137  	iter, err := txn.Get("allocs", "id_prefix", id)
  3138  	if err != nil {
  3139  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  3140  	}
  3142  	ws.Add(iter.WatchCh())
  3144  	// Wrap the iterator in a filter
  3145  	wrap := memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace))
  3146  	return wrap, nil
  3147  }
  3149  // allocNamespaceFilter returns a filter function that filters all allocations
  3150  // not in the given namespace.
  3151  func allocNamespaceFilter(namespace string) func(interface{}) bool {
  3152  	return func(raw interface{}) bool {
  3153  		alloc, ok := raw.(*structs.Allocation)
  3154  		if !ok {
  3155  			return true
  3156  		}
  3158  		return alloc.Namespace != namespace
  3159  	}
  3160  }
  3162  // AllocsByNode returns all the allocations by node
  3163  func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
  3164  	txn := s.db.Txn(false)
  3166  	// Get an iterator over the node allocations, using only the
  3167  	// node prefix which ignores the terminal status
  3168  	iter, err := txn.Get("allocs", "node_prefix", node)
  3169  	if err != nil {
  3170  		return nil, err
  3171  	}
  3173  	ws.Add(iter.WatchCh())
  3175  	var out []*structs.Allocation
  3176  	for {
  3177  		raw := iter.Next()
  3178  		if raw == nil {
  3179  			break
  3180  		}
  3181  		out = append(out, raw.(*structs.Allocation))
  3182  	}
  3183  	return out, nil
  3184  }
  3186  // AllocsByNode returns all the allocations by node and terminal status
  3187  func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) {
  3188  	txn := s.db.Txn(false)
  3190  	// Get an iterator over the node allocations
  3191  	iter, err := txn.Get("allocs", "node", node, terminal)
  3192  	if err != nil {
  3193  		return nil, err
  3194  	}
  3196  	ws.Add(iter.WatchCh())
  3198  	var out []*structs.Allocation
  3199  	for {
  3200  		raw := iter.Next()
  3201  		if raw == nil {
  3202  			break
  3203  		}
  3204  		out = append(out, raw.(*structs.Allocation))
  3205  	}
  3206  	return out, nil
  3207  }
  3209  // AllocsByJob returns allocations by job id
  3210  func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) {
  3211  	txn := s.db.Txn(false)
  3213  	// Get the job
  3214  	var job *structs.Job
  3215  	rawJob, err := txn.First("jobs", "id", namespace, jobID)
  3216  	if err != nil {
  3217  		return nil, err
  3218  	}
  3219  	if rawJob != nil {
  3220  		job = rawJob.(*structs.Job)
  3221  	}
  3223  	// Get an iterator over the node allocations
  3224  	iter, err := txn.Get("allocs", "job", namespace, jobID)
  3225  	if err != nil {
  3226  		return nil, err
  3227  	}
  3229  	ws.Add(iter.WatchCh())
  3231  	var out []*structs.Allocation
  3232  	for {
  3233  		raw := iter.Next()
  3234  		if raw == nil {
  3235  			break
  3236  		}
  3238  		alloc := raw.(*structs.Allocation)
  3239  		// If the allocation belongs to a job with the same ID but a different
  3240  		// create index and we are not getting all the allocations whose Jobs
  3241  		// matches the same Job ID then we skip it
  3242  		if !anyCreateIndex && job != nil && alloc.Job.CreateIndex != job.CreateIndex {
  3243  			continue
  3244  		}
  3245  		out = append(out, raw.(*structs.Allocation))
  3246  	}
  3247  	return out, nil
  3248  }
  3250  // AllocsByEval returns all the allocations by eval id
  3251  func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) {
  3252  	txn := s.db.Txn(false)
  3254  	// Get an iterator over the eval allocations
  3255  	iter, err := txn.Get("allocs", "eval", evalID)
  3256  	if err != nil {
  3257  		return nil, err
  3258  	}
  3260  	ws.Add(iter.WatchCh())
  3262  	var out []*structs.Allocation
  3263  	for {
  3264  		raw := iter.Next()
  3265  		if raw == nil {
  3266  			break
  3267  		}
  3268  		out = append(out, raw.(*structs.Allocation))
  3269  	}
  3270  	return out, nil
  3271  }
  3273  // AllocsByDeployment returns all the allocations by deployment id
  3274  func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) {
  3275  	txn := s.db.Txn(false)
  3277  	// Get an iterator over the deployments allocations
  3278  	iter, err := txn.Get("allocs", "deployment", deploymentID)
  3279  	if err != nil {
  3280  		return nil, err
  3281  	}
  3283  	ws.Add(iter.WatchCh())
  3285  	var out []*structs.Allocation
  3286  	for {
  3287  		raw := iter.Next()
  3288  		if raw == nil {
  3289  			break
  3290  		}
  3291  		out = append(out, raw.(*structs.Allocation))
  3292  	}
  3293  	return out, nil
  3294  }
  3296  // Allocs returns an iterator over all the evaluations
  3297  func (s *StateStore) Allocs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3298  	txn := s.db.Txn(false)
  3300  	// Walk the entire table
  3301  	iter, err := txn.Get("allocs", "id")
  3302  	if err != nil {
  3303  		return nil, err
  3304  	}
  3306  	ws.Add(iter.WatchCh())
  3308  	return iter, nil
  3309  }
  3311  // AllocsByNamespace returns an iterator over all the allocations in the
  3312  // namespace
  3313  func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  3314  	txn := s.db.Txn(false)
  3315  	return s.allocsByNamespaceImpl(ws, txn, namespace)
  3316  }
  3318  // allocsByNamespaceImpl returns an iterator over all the allocations in the
  3319  // namespace
  3320  func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *memdb.Txn, namespace string) (memdb.ResultIterator, error) {
  3321  	// Walk the entire table
  3322  	iter, err := txn.Get("allocs", "namespace", namespace)
  3323  	if err != nil {
  3324  		return nil, err
  3325  	}
  3327  	ws.Add(iter.WatchCh())
  3329  	return iter, nil
  3330  }
  3332  // UpsertVaultAccessors is used to register a set of Vault Accessors
  3333  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  3334  	txn := s.db.Txn(true)
  3335  	defer txn.Abort()
  3337  	for _, accessor := range accessors {
  3338  		// Set the create index
  3339  		accessor.CreateIndex = index
  3341  		// Insert the accessor
  3342  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  3343  			return fmt.Errorf("accessor insert failed: %v", err)
  3344  		}
  3345  	}
  3347  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  3348  		return fmt.Errorf("index update failed: %v", err)
  3349  	}
  3351  	txn.Commit()
  3352  	return nil
  3353  }
  3355  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  3356  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  3357  	txn := s.db.Txn(true)
  3358  	defer txn.Abort()
  3360  	// Lookup the accessor
  3361  	for _, accessor := range accessors {
  3362  		// Delete the accessor
  3363  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  3364  			return fmt.Errorf("accessor delete failed: %v", err)
  3365  		}
  3366  	}
  3368  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  3369  		return fmt.Errorf("index update failed: %v", err)
  3370  	}
  3372  	txn.Commit()
  3373  	return nil
  3374  }
  3376  // VaultAccessor returns the given Vault accessor
  3377  func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) {
  3378  	txn := s.db.Txn(false)
  3380  	watchCh, existing, err := txn.FirstWatch("vault_accessors", "id", accessor)
  3381  	if err != nil {
  3382  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  3383  	}
  3385  	ws.Add(watchCh)
  3387  	if existing != nil {
  3388  		return existing.(*structs.VaultAccessor), nil
  3389  	}
  3391  	return nil, nil
  3392  }
  3394  // VaultAccessors returns an iterator of Vault accessors.
  3395  func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3396  	txn := s.db.Txn(false)
  3398  	iter, err := txn.Get("vault_accessors", "id")
  3399  	if err != nil {
  3400  		return nil, err
  3401  	}
  3403  	ws.Add(iter.WatchCh())
  3405  	return iter, nil
  3406  }
  3408  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  3409  func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) {
  3410  	txn := s.db.Txn(false)
  3412  	// Get an iterator over the accessors
  3413  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  3414  	if err != nil {
  3415  		return nil, err
  3416  	}
  3418  	ws.Add(iter.WatchCh())
  3420  	var out []*structs.VaultAccessor
  3421  	for {
  3422  		raw := iter.Next()
  3423  		if raw == nil {
  3424  			break
  3425  		}
  3426  		out = append(out, raw.(*structs.VaultAccessor))
  3427  	}
  3428  	return out, nil
  3429  }
  3431  // VaultAccessorsByNode returns all the Vault accessors by node id
  3432  func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) {
  3433  	txn := s.db.Txn(false)
  3435  	// Get an iterator over the accessors
  3436  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  3437  	if err != nil {
  3438  		return nil, err
  3439  	}
  3441  	ws.Add(iter.WatchCh())
  3443  	var out []*structs.VaultAccessor
  3444  	for {
  3445  		raw := iter.Next()
  3446  		if raw == nil {
  3447  			break
  3448  		}
  3449  		out = append(out, raw.(*structs.VaultAccessor))
  3450  	}
  3451  	return out, nil
  3452  }
  3454  func indexEntry(table string, index uint64) *IndexEntry {
  3455  	return &IndexEntry{
  3456  		Key:   table,
  3457  		Value: index,
  3458  	}
  3459  }
// siTokenAccessorTable is the name of the state store table that holds
// Service Identity token accessors; it is also used as the key of that
// table's entry in the "index" table.
const siTokenAccessorTable = "si_token_accessors"
  3463  // UpsertSITokenAccessors is used to register a set of Service Identity token accessors.
  3464  func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  3465  	txn := s.db.Txn(true)
  3466  	defer txn.Abort()
  3468  	for _, accessor := range accessors {
  3469  		// set the create index
  3470  		accessor.CreateIndex = index
  3472  		// insert the accessor
  3473  		if err := txn.Insert(siTokenAccessorTable, accessor); err != nil {
  3474  			return errors.Wrap(err, "accessor insert failed")
  3475  		}
  3476  	}
  3478  	// update the index for this table
  3479  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  3480  		return errors.Wrap(err, "index update failed")
  3481  	}
  3483  	txn.Commit()
  3484  	return nil
  3485  }
  3487  // DeleteSITokenAccessors is used to delete a set of Service Identity token accessors.
  3488  func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
  3489  	txn := s.db.Txn(true)
  3490  	defer txn.Abort()
  3492  	// Lookup each accessor
  3493  	for _, accessor := range accessors {
  3494  		// Delete the accessor
  3495  		if err := txn.Delete(siTokenAccessorTable, accessor); err != nil {
  3496  			return errors.Wrap(err, "accessor delete failed")
  3497  		}
  3498  	}
  3500  	// update the index for this table
  3501  	if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil {
  3502  		return errors.Wrap(err, "index update failed")
  3503  	}
  3505  	txn.Commit()
  3506  	return nil
  3507  }
  3509  // SITokenAccessor returns the given Service Identity token accessor.
  3510  func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) {
  3511  	txn := s.db.Txn(false)
  3512  	defer txn.Abort()
  3514  	watchCh, existing, err := txn.FirstWatch(siTokenAccessorTable, "id", accessorID)
  3515  	if err != nil {
  3516  		return nil, errors.Wrap(err, "accessor lookup failed")
  3517  	}
  3519  	ws.Add(watchCh)
  3521  	if existing != nil {
  3522  		return existing.(*structs.SITokenAccessor), nil
  3523  	}
  3525  	return nil, nil
  3526  }
  3528  // SITokenAccessors returns an iterator of Service Identity token accessors.
  3529  func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  3530  	txn := s.db.Txn(false)
  3531  	defer txn.Abort()
  3533  	iter, err := txn.Get(siTokenAccessorTable, "id")
  3534  	if err != nil {
  3535  		return nil, err
  3536  	}
  3538  	ws.Add(iter.WatchCh())
  3540  	return iter, nil
  3541  }
  3543  // SITokenAccessorsByAlloc returns all the Service Identity token accessors by alloc ID.
  3544  func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) {
  3545  	txn := s.db.Txn(false)
  3546  	defer txn.Abort()
  3548  	// Get an iterator over the accessors
  3549  	iter, err := txn.Get(siTokenAccessorTable, "alloc_id", allocID)
  3550  	if err != nil {
  3551  		return nil, err
  3552  	}
  3554  	ws.Add(iter.WatchCh())
  3556  	var result []*structs.SITokenAccessor
  3557  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  3558  		result = append(result, raw.(*structs.SITokenAccessor))
  3559  	}
  3561  	return result, nil
  3562  }
  3564  // SITokenAccessorsByNode returns all the Service Identity token accessors by node ID.
  3565  func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) {
  3566  	txn := s.db.Txn(false)
  3567  	defer txn.Abort()
  3569  	// Get an iterator over the accessors
  3570  	iter, err := txn.Get(siTokenAccessorTable, "node_id", nodeID)
  3571  	if err != nil {
  3572  		return nil, err
  3573  	}
  3575  	ws.Add(iter.WatchCh())
  3577  	var result []*structs.SITokenAccessor
  3578  	for raw := iter.Next(); raw != nil; raw = iter.Next() {
  3579  		result = append(result, raw.(*structs.SITokenAccessor))
  3580  	}
  3582  	return result, nil
  3583  }
  3585  // UpdateDeploymentStatus is used to make deployment status updates and
  3586  // potentially make a evaluation
  3587  func (s *StateStore) UpdateDeploymentStatus(index uint64, req *structs.DeploymentStatusUpdateRequest) error {
  3588  	txn := s.db.Txn(true)
  3589  	defer txn.Abort()
  3591  	if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
  3592  		return err
  3593  	}
  3595  	// Upsert the job if necessary
  3596  	if req.Job != nil {
  3597  		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
  3598  			return err
  3599  		}
  3600  	}
  3602  	// Upsert the optional eval
  3603  	if req.Eval != nil {
  3604  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  3605  			return err
  3606  		}
  3607  	}
  3609  	txn.Commit()
  3610  	return nil
  3611  }
  3613  // updateDeploymentStatusImpl is used to make deployment status updates
  3614  func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *memdb.Txn) error {
  3615  	// Retrieve deployment
  3616  	ws := memdb.NewWatchSet()
  3617  	deployment, err := s.deploymentByIDImpl(ws, u.DeploymentID, txn)
  3618  	if err != nil {
  3619  		return err
  3620  	} else if deployment == nil {
  3621  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID)
  3622  	} else if !deployment.Active() {
  3623  		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
  3624  	}
  3626  	// Apply the new status
  3627  	copy := deployment.Copy()
  3628  	copy.Status = u.Status
  3629  	copy.StatusDescription = u.StatusDescription
  3630  	copy.ModifyIndex = index
  3632  	// Insert the deployment
  3633  	if err := txn.Insert("deployment", copy); err != nil {
  3634  		return err
  3635  	}
  3637  	// Update the index
  3638  	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
  3639  		return fmt.Errorf("index update failed: %v", err)
  3640  	}
  3642  	// If the deployment is being marked as complete, set the job to stable.
  3643  	if copy.Status == structs.DeploymentStatusSuccessful {
  3644  		if err := s.updateJobStabilityImpl(index, copy.Namespace, copy.JobID, copy.JobVersion, true, txn); err != nil {
  3645  			return fmt.Errorf("failed to update job stability: %v", err)
  3646  		}
  3647  	}
  3649  	return nil
  3650  }
  3652  // UpdateJobStability updates the stability of the given job and version to the
  3653  // desired status.
  3654  func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error {
  3655  	txn := s.db.Txn(true)
  3656  	defer txn.Abort()
  3658  	if err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, txn); err != nil {
  3659  		return err
  3660  	}
  3662  	txn.Commit()
  3663  	return nil
  3664  }
  3666  // updateJobStabilityImpl updates the stability of the given job and version
  3667  func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *memdb.Txn) error {
  3668  	// Get the job that is referenced
  3669  	job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn)
  3670  	if err != nil {
  3671  		return err
  3672  	}
  3674  	// Has already been cleared, nothing to do
  3675  	if job == nil {
  3676  		return nil
  3677  	}
  3679  	// If the job already has the desired stability, nothing to do
  3680  	if job.Stable == stable {
  3681  		return nil
  3682  	}
  3684  	copy := job.Copy()
  3685  	copy.Stable = stable
  3686  	return s.upsertJobImpl(index, copy, true, txn)
  3687  }
  3689  // UpdateDeploymentPromotion is used to promote canaries in a deployment and
  3690  // potentially make a evaluation
  3691  func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyDeploymentPromoteRequest) error {
  3692  	txn := s.db.Txn(true)
  3693  	defer txn.Abort()
  3695  	// Retrieve deployment and ensure it is not terminal and is active
  3696  	ws := memdb.NewWatchSet()
  3697  	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
  3698  	if err != nil {
  3699  		return err
  3700  	} else if deployment == nil {
  3701  		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
  3702  	} else if !deployment.Active() {
  3703  		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
  3704  	}
  3706  	// Retrieve effected allocations
  3707  	iter, err := txn.Get("allocs", "deployment", req.DeploymentID)
  3708  	if err != nil {
  3709  		return err
  3710  	}
  3712  	// groupIndex is a map of groups being promoted
  3713  	groupIndex := make(map[string]struct{}, len(req.Groups))
  3714  	for _, g := range req.Groups {
  3715  		groupIndex[g] = struct{}{}
  3716  	}
  3718  	// canaryIndex is the set of placed canaries in the deployment
  3719  	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
  3720  	for _, state := range deployment.TaskGroups {
  3721  		for _, c := range state.PlacedCanaries {
  3722  			canaryIndex[c] = struct{}{}
  3723  		}
  3724  	}
  3726  	// healthyCounts is a mapping of group to the number of healthy canaries
  3727  	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
  3729  	// promotable is the set of allocations that we can move from canary to
  3730  	// non-canary
  3731  	var promotable []*structs.Allocation
  3733  	for {
  3734  		raw := iter.Next()
  3735  		if raw == nil {
  3736  			break
  3737  		}
  3739  		alloc := raw.(*structs.Allocation)
  3741  		// Check that the alloc is a canary
  3742  		if _, ok := canaryIndex[alloc.ID]; !ok {
  3743  			continue
  3744  		}
  3746  		// Check that the canary is part of a group being promoted
  3747  		if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok {
  3748  			continue
  3749  		}
  3751  		// Ensure the canaries are healthy
  3752  		if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() {
  3753  			continue
  3754  		}
  3756  		healthyCounts[alloc.TaskGroup]++
  3757  		promotable = append(promotable, alloc)
  3758  	}
  3760  	// Determine if we have enough healthy allocations
  3761  	var unhealthyErr multierror.Error
  3762  	for tg, state := range deployment.TaskGroups {
  3763  		if _, ok := groupIndex[tg]; !req.All && !ok {
  3764  			continue
  3765  		}
  3767  		need := state.DesiredCanaries
  3768  		if need == 0 {
  3769  			continue
  3770  		}
  3772  		if have := healthyCounts[tg]; have < need {
  3773  			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
  3774  		}
  3775  	}
  3777  	if err := unhealthyErr.ErrorOrNil(); err != nil {
  3778  		return err
  3779  	}
  3781  	// Update deployment
  3782  	copy := deployment.Copy()
  3783  	copy.ModifyIndex = index
  3784  	for tg, status := range copy.TaskGroups {
  3785  		_, ok := groupIndex[tg]
  3786  		if !req.All && !ok {
  3787  			continue
  3788  		}
  3790  		status.Promoted = true
  3791  	}
  3793  	// If the deployment no longer needs promotion, update its status
  3794  	if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning {
  3795  		copy.StatusDescription = structs.DeploymentStatusDescriptionRunning
  3796  	}
  3798  	// Insert the deployment
  3799  	if err := s.upsertDeploymentImpl(index, copy, txn); err != nil {
  3800  		return err
  3801  	}
  3803  	// Upsert the optional eval
  3804  	if req.Eval != nil {
  3805  		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
  3806  			return err
  3807  		}
  3808  	}
  3810  	// For each promotable allocation remove the canary field
  3811  	for _, alloc := range promotable {
  3812  		promoted := alloc.Copy()
  3813  		promoted.DeploymentStatus.Canary = false
  3814  		promoted.DeploymentStatus.ModifyIndex = index
  3815  		promoted.ModifyIndex = index
  3816  		promoted.AllocModifyIndex = index
  3818  		if err := txn.Insert("allocs", promoted); err != nil {
  3819  			return fmt.Errorf("alloc insert failed: %v", err)
  3820  		}
  3821  	}
  3823  	// Update the alloc index
  3824  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
  3825  		return fmt.Errorf("index update failed: %v", err)
  3826  	}
  3828  	txn.Commit()
  3829  	return nil
  3830  }
// UpdateDeploymentAllocHealth records the health of the allocations listed in
// the request as part of a deployment, and then applies the optional
// deployment status update, job upsert and evaluation carried by the request.
//
// It returns an error if the deployment does not exist or is not active, if a
// listed allocation is unknown or belongs to a different deployment, or if
// any of the underlying writes fail. All changes share a single write
// transaction that is committed only when every step succeeds.
func (s *StateStore) UpdateDeploymentAllocHealth(index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error {
	txn := s.db.Txn(true)
	defer txn.Abort()

	// Retrieve deployment and ensure it is not terminal and is active
	ws := memdb.NewWatchSet()
	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
	if err != nil {
		return err
	} else if deployment == nil {
		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
	} else if !deployment.Active() {
		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
	}

	// Update the health status of each allocation. The allocs index is only
	// touched when the request lists at least one allocation.
	if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 {
		// setAllocHealth stores a copy of the allocation with the given health
		// and timestamp, after updating the deployment's state for it.
		setAllocHealth := func(id string, healthy bool, ts time.Time) error {
			existing, err := txn.First("allocs", "id", id)
			if err != nil {
				return fmt.Errorf("alloc %q lookup failed: %v", id, err)
			}
			if existing == nil {
				return fmt.Errorf("unknown alloc %q", id)
			}

			// The allocation must belong to the deployment being updated.
			old := existing.(*structs.Allocation)
			if old.DeploymentID != req.DeploymentID {
				return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID)
			}

			// Set the health on a copy; stored objects are never modified
			// in place.
			copy := old.Copy()
			if copy.DeploymentStatus == nil {
				copy.DeploymentStatus = &structs.AllocDeploymentStatus{}
			}
			copy.DeploymentStatus.Healthy = helper.BoolToPtr(healthy)
			copy.DeploymentStatus.Timestamp = ts
			copy.DeploymentStatus.ModifyIndex = index
			copy.ModifyIndex = index

			// Update the deployment before inserting the allocation; the
			// call receives both the new copy and the previous allocation.
			if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil {
				return fmt.Errorf("error updating deployment: %v", err)
			}

			if err := txn.Insert("allocs", copy); err != nil {
				return fmt.Errorf("alloc insert failed: %v", err)
			}

			return nil
		}

		// Healthy allocations are processed first, then unhealthy ones.
		for _, id := range req.HealthyAllocationIDs {
			if err := setAllocHealth(id, true, req.Timestamp); err != nil {
				return err
			}
		}
		for _, id := range req.UnhealthyAllocationIDs {
			if err := setAllocHealth(id, false, req.Timestamp); err != nil {
				return err
			}
		}

		// Update the indexes
		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	// Update the deployment status as needed.
	if req.DeploymentUpdate != nil {
		if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
			return err
		}
	}

	// Upsert the job if necessary
	if req.Job != nil {
		if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil {
			return err
		}
	}

	// Upsert the optional eval
	if req.Eval != nil {
		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
			return err
		}
	}

	txn.Commit()
	return nil
}
  3928  // LastIndex returns the greatest index value for all indexes
  3929  func (s *StateStore) LatestIndex() (uint64, error) {
  3930  	indexes, err := s.Indexes()
  3931  	if err != nil {
  3932  		return 0, err
  3933  	}
  3935  	var max uint64 = 0
  3936  	for {
  3937  		raw := indexes.Next()
  3938  		if raw == nil {
  3939  			break
  3940  		}
  3942  		// Prepare the request struct
  3943  		idx := raw.(*IndexEntry)
  3945  		// Determine the max
  3946  		if idx.Value > max {
  3947  			max = idx.Value
  3948  		}
  3949  	}
  3951  	return max, nil
  3952  }
  3954  // Index finds the matching index value
  3955  func (s *StateStore) Index(name string) (uint64, error) {
  3956  	txn := s.db.Txn(false)
  3958  	// Lookup the first matching index
  3959  	out, err := txn.First("index", "id", name)
  3960  	if err != nil {
  3961  		return 0, err
  3962  	}
  3963  	if out == nil {
  3964  		return 0, nil
  3965  	}
  3966  	return out.(*IndexEntry).Value, nil
  3967  }
  3969  // RemoveIndex is a helper method to remove an index for testing purposes
  3970  func (s *StateStore) RemoveIndex(name string) error {
  3971  	txn := s.db.Txn(true)
  3972  	defer txn.Abort()
  3974  	if _, err := txn.DeleteAll("index", "id", name); err != nil {
  3975  		return err
  3976  	}
  3978  	txn.Commit()
  3979  	return nil
  3980  }
  3982  // Indexes returns an iterator over all the indexes
  3983  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  3984  	txn := s.db.Txn(false)
  3986  	// Walk the entire nodes table
  3987  	iter, err := txn.Get("index", "id")
  3988  	if err != nil {
  3989  		return nil, err
  3990  	}
  3991  	return iter, nil
  3992  }
  3994  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  3995  // store
  3996  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  3997  	txn := s.db.Txn(true)
  3998  	defer txn.Abort()
  4000  	// Get all the jobs
  4001  	iter, err := txn.Get("jobs", "id")
  4002  	if err != nil {
  4003  		return err
  4004  	}
  4005  	// COMPAT: Remove after 0.11
  4006  	// Iterate over jobs to build a list of parent jobs and their children
  4007  	parentMap := make(map[string][]*structs.Job)
  4008  	for {
  4009  		rawJob := iter.Next()
  4010  		if rawJob == nil {
  4011  			break
  4012  		}
  4013  		job := rawJob.(*structs.Job)
  4014  		if job.ParentID != "" {
  4015  			children := parentMap[job.ParentID]
  4016  			children = append(children, job)
  4017  			parentMap[job.ParentID] = children
  4018  		}
  4019  	}
  4021  	// Get all the jobs again
  4022  	iter, err = txn.Get("jobs", "id")
  4023  	if err != nil {
  4024  		return err
  4025  	}
  4027  	for {
  4028  		rawJob := iter.Next()
  4029  		if rawJob == nil {
  4030  			break
  4031  		}
  4032  		job := rawJob.(*structs.Job)
  4034  		if job.IsParameterized() || job.IsPeriodic() {
  4035  			// COMPAT: Remove after 0.11
  4037  			// The following block of code fixes incorrect child summaries due to a bug
  4038  			// See for details
  4039  			rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  4040  			if err != nil {
  4041  				return err
  4042  			}
  4043  			if rawSummary == nil {
  4044  				continue
  4045  			}
  4047  			oldSummary := rawSummary.(*structs.JobSummary)
  4049  			// Create an empty summary
  4050  			summary := &structs.JobSummary{
  4051  				JobID:     job.ID,
  4052  				Namespace: job.Namespace,
  4053  				Summary:   make(map[string]structs.TaskGroupSummary),
  4054  				Children:  &structs.JobChildrenSummary{},
  4055  			}
  4057  			// Iterate over children of this job if any to fix summary counts
  4058  			children := parentMap[job.ID]
  4059  			for _, childJob := range children {
  4060  				switch childJob.Status {
  4061  				case structs.JobStatusPending:
  4062  					summary.Children.Pending++
  4063  				case structs.JobStatusDead:
  4064  					summary.Children.Dead++
  4065  				case structs.JobStatusRunning:
  4066  					summary.Children.Running++
  4067  				}
  4068  			}
  4070  			// Insert the job summary if its different
  4071  			if !reflect.DeepEqual(summary, oldSummary) {
  4072  				// Set the create index of the summary same as the job's create index
  4073  				// and the modify index to the current index
  4074  				summary.CreateIndex = job.CreateIndex
  4075  				summary.ModifyIndex = index
  4077  				if err := txn.Insert("job_summary", summary); err != nil {
  4078  					return fmt.Errorf("error inserting job summary: %v", err)
  4079  				}
  4080  			}
  4082  			// Done with handling a parent job, continue to next
  4083  			continue
  4084  		}
  4086  		// Create a job summary for the job
  4087  		summary := &structs.JobSummary{
  4088  			JobID:     job.ID,
  4089  			Namespace: job.Namespace,
  4090  			Summary:   make(map[string]structs.TaskGroupSummary),
  4091  		}
  4092  		for _, tg := range job.TaskGroups {
  4093  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  4094  		}
  4096  		// Find all the allocations for the jobs
  4097  		iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  4098  		if err != nil {
  4099  			return err
  4100  		}
  4102  		// Calculate the summary for the job
  4103  		for {
  4104  			rawAlloc := iterAllocs.Next()
  4105  			if rawAlloc == nil {
  4106  				break
  4107  			}
  4108  			alloc := rawAlloc.(*structs.Allocation)
  4110  			// Ignore the allocation if it doesn't belong to the currently
  4111  			// registered job. The allocation is checked because of issue #2304
  4112  			if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex {
  4113  				continue
  4114  			}
  4116  			tg := summary.Summary[alloc.TaskGroup]
  4117  			switch alloc.ClientStatus {
  4118  			case structs.AllocClientStatusFailed:
  4119  				tg.Failed += 1
  4120  			case structs.AllocClientStatusLost:
  4121  				tg.Lost += 1
  4122  			case structs.AllocClientStatusComplete:
  4123  				tg.Complete += 1
  4124  			case structs.AllocClientStatusRunning:
  4125  				tg.Running += 1
  4126  			case structs.AllocClientStatusPending:
  4127  				tg.Starting += 1
  4128  			default:
  4129  				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
  4130  			}
  4131  			summary.Summary[alloc.TaskGroup] = tg
  4132  		}
  4134  		// Set the create index of the summary same as the job's create index
  4135  		// and the modify index to the current index
  4136  		summary.CreateIndex = job.CreateIndex
  4137  		summary.ModifyIndex = index
  4139  		// Insert the job summary
  4140  		if err := txn.Insert("job_summary", summary); err != nil {
  4141  			return fmt.Errorf("error inserting job summary: %v", err)
  4142  		}
  4143  	}
  4145  	// Update the indexes table for job summary
  4146  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4147  		return fmt.Errorf("index update failed: %v", err)
  4148  	}
  4149  	txn.Commit()
  4150  	return nil
  4151  }
  4153  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
  4154  // It takes a map of job IDs to an optional forceStatus string. It returns an
  4155  // error if the job doesn't exist or setJobStatus fails.
  4156  func (s *StateStore) setJobStatuses(index uint64, txn *memdb.Txn,
  4157  	jobs map[structs.NamespacedID]string, evalDelete bool) error {
  4158  	for tuple, forceStatus := range jobs {
  4160  		existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID)
  4161  		if err != nil {
  4162  			return fmt.Errorf("job lookup failed: %v", err)
  4163  		}
  4165  		if existing == nil {
  4166  			continue
  4167  		}
  4169  		if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  4170  			return err
  4171  		}
  4172  	}
  4174  	return nil
  4175  }
  4177  // setJobStatus sets the status of the job by looking up associated evaluations
  4178  // and allocations. evalDelete should be set to true if setJobStatus is being
  4179  // called because an evaluation is being deleted (potentially because of garbage
  4180  // collection). If forceStatus is non-empty, the job's status will be set to the
  4181  // passed status.
  4182  func (s *StateStore) setJobStatus(index uint64, txn *memdb.Txn,
  4183  	job *structs.Job, evalDelete bool, forceStatus string) error {
  4185  	// Capture the current status so we can check if there is a change
  4186  	oldStatus := job.Status
  4187  	if index == job.CreateIndex {
  4188  		oldStatus = ""
  4189  	}
  4190  	newStatus := forceStatus
  4192  	// If forceStatus is not set, compute the jobs status.
  4193  	if forceStatus == "" {
  4194  		var err error
  4195  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  4196  		if err != nil {
  4197  			return err
  4198  		}
  4199  	}
  4201  	// Fast-path if nothing has changed.
  4202  	if oldStatus == newStatus {
  4203  		return nil
  4204  	}
  4206  	// Copy and update the existing job
  4207  	updated := job.Copy()
  4208  	updated.Status = newStatus
  4209  	updated.ModifyIndex = index
  4211  	// Insert the job
  4212  	if err := txn.Insert("jobs", updated); err != nil {
  4213  		return fmt.Errorf("job insert failed: %v", err)
  4214  	}
  4215  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  4216  		return fmt.Errorf("index update failed: %v", err)
  4217  	}
  4219  	// Update the children summary
  4220  	if updated.ParentID != "" {
  4221  		// Try to update the summary of the parent job summary
  4222  		summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID)
  4223  		if err != nil {
  4224  			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
  4225  		}
  4227  		// Only continue if the summary exists. It could not exist if the parent
  4228  		// job was removed
  4229  		if summaryRaw != nil {
  4230  			existing := summaryRaw.(*structs.JobSummary)
  4231  			pSummary := existing.Copy()
  4232  			if pSummary.Children == nil {
  4233  				pSummary.Children = new(structs.JobChildrenSummary)
  4234  			}
  4236  			// Determine the transition and update the correct fields
  4237  			children := pSummary.Children
  4239  			// Decrement old status
  4240  			if oldStatus != "" {
  4241  				switch oldStatus {
  4242  				case structs.JobStatusPending:
  4243  					children.Pending--
  4244  				case structs.JobStatusRunning:
  4245  					children.Running--
  4246  				case structs.JobStatusDead:
  4247  					children.Dead--
  4248  				default:
  4249  					return fmt.Errorf("unknown old job status %q", oldStatus)
  4250  				}
  4251  			}
  4253  			// Increment new status
  4254  			switch newStatus {
  4255  			case structs.JobStatusPending:
  4256  				children.Pending++
  4257  			case structs.JobStatusRunning:
  4258  				children.Running++
  4259  			case structs.JobStatusDead:
  4260  				children.Dead++
  4261  			default:
  4262  				return fmt.Errorf("unknown new job status %q", newStatus)
  4263  			}
  4265  			// Update the index
  4266  			pSummary.ModifyIndex = index
  4268  			// Insert the summary
  4269  			if err := txn.Insert("job_summary", pSummary); err != nil {
  4270  				return fmt.Errorf("job summary insert failed: %v", err)
  4271  			}
  4272  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4273  				return fmt.Errorf("index update failed: %v", err)
  4274  			}
  4275  		}
  4276  	}
  4278  	return nil
  4279  }
  4281  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  4282  	// System, Periodic and Parameterized jobs are running until explicitly
  4283  	// stopped
  4284  	if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() {
  4285  		if job.Stop {
  4286  			return structs.JobStatusDead, nil
  4287  		}
  4289  		return structs.JobStatusRunning, nil
  4290  	}
  4292  	allocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
  4293  	if err != nil {
  4294  		return "", err
  4295  	}
  4297  	// If there is a non-terminal allocation, the job is running.
  4298  	hasAlloc := false
  4299  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  4300  		hasAlloc = true
  4301  		if !alloc.(*structs.Allocation).TerminalStatus() {
  4302  			return structs.JobStatusRunning, nil
  4303  		}
  4304  	}
  4306  	evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID)
  4307  	if err != nil {
  4308  		return "", err
  4309  	}
  4311  	hasEval := false
  4312  	for raw := evals.Next(); raw != nil; raw = evals.Next() {
  4313  		e := raw.(*structs.Evaluation)
  4315  		// Filter non-exact matches
  4316  		if e.JobID != job.ID {
  4317  			continue
  4318  		}
  4320  		hasEval = true
  4321  		if !e.TerminalStatus() {
  4322  			return structs.JobStatusPending, nil
  4323  		}
  4324  	}
  4326  	// The job is dead if all the allocations and evals are terminal or if there
  4327  	// are no evals because of garbage collection.
  4328  	if evalDelete || hasEval || hasAlloc {
  4329  		return structs.JobStatusDead, nil
  4330  	}
  4332  	return structs.JobStatusPending, nil
  4333  }
  4335  // updateSummaryWithJob creates or updates job summaries when new jobs are
  4336  // upserted or existing ones are updated
  4337  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  4338  	txn *memdb.Txn) error {
  4340  	// Update the job summary
  4341  	summaryRaw, err := txn.First("job_summary", "id", job.Namespace, job.ID)
  4342  	if err != nil {
  4343  		return fmt.Errorf("job summary lookup failed: %v", err)
  4344  	}
  4346  	// Get the summary or create if necessary
  4347  	var summary *structs.JobSummary
  4348  	hasSummaryChanged := false
  4349  	if summaryRaw != nil {
  4350  		summary = summaryRaw.(*structs.JobSummary).Copy()
  4351  	} else {
  4352  		summary = &structs.JobSummary{
  4353  			JobID:       job.ID,
  4354  			Namespace:   job.Namespace,
  4355  			Summary:     make(map[string]structs.TaskGroupSummary),
  4356  			Children:    new(structs.JobChildrenSummary),
  4357  			CreateIndex: index,
  4358  		}
  4359  		hasSummaryChanged = true
  4360  	}
  4362  	for _, tg := range job.TaskGroups {
  4363  		if _, ok := summary.Summary[tg.Name]; !ok {
  4364  			newSummary := structs.TaskGroupSummary{
  4365  				Complete: 0,
  4366  				Failed:   0,
  4367  				Running:  0,
  4368  				Starting: 0,
  4369  			}
  4370  			summary.Summary[tg.Name] = newSummary
  4371  			hasSummaryChanged = true
  4372  		}
  4373  	}
  4375  	// The job summary has changed, so update the modify index.
  4376  	if hasSummaryChanged {
  4377  		summary.ModifyIndex = index
  4379  		// Update the indexes table for job summary
  4380  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  4381  			return fmt.Errorf("index update failed: %v", err)
  4382  		}
  4383  		if err := txn.Insert("job_summary", summary); err != nil {
  4384  			return err
  4385  		}
  4386  	}
  4388  	return nil
  4389  }
  4391  // updateJobScalingPolicies upserts any scaling policies contained in the job and removes
  4392  // any previous scaling policies that were removed from the job
  4393  func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *memdb.Txn) error {
  4395  	ws := memdb.NewWatchSet()
  4397  	if job.Stop {
  4398  		if err := s.deleteJobScalingPolicies(index, job, txn); err != nil {
  4399  			return fmt.Errorf("deleting job scaling policies failed: %v", err)
  4400  		}
  4401  		return nil
  4402  	}
  4404  	scalingPolicies := job.GetScalingPolicies()
  4405  	newTargets := map[string]struct{}{}
  4406  	for _, p := range scalingPolicies {
  4407  		newTargets[p.Target[structs.ScalingTargetGroup]] = struct{}{}
  4408  	}
  4409  	// find existing policies that need to be deleted
  4410  	deletedPolicies := []string{}
  4411  	iter, err := s.ScalingPoliciesByJobTxn(ws, job.Namespace, job.ID, txn)
  4412  	if err != nil {
  4413  		return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err)
  4414  	}
  4415  	for {
  4416  		raw := iter.Next()
  4417  		if raw == nil {
  4418  			break
  4419  		}
  4420  		oldPolicy := raw.(*structs.ScalingPolicy)
  4421  		if _, ok := newTargets[oldPolicy.Target[structs.ScalingTargetGroup]]; !ok {
  4422  			deletedPolicies = append(deletedPolicies, oldPolicy.ID)
  4423  		}
  4424  	}
  4425  	err = s.DeleteScalingPoliciesTxn(index, deletedPolicies, txn)
  4426  	if err != nil {
  4427  		return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err)
  4428  	}
  4430  	err = s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn)
  4431  	if err != nil {
  4432  		return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err)
  4433  	}
  4435  	return nil
  4436  }
  4438  // updateDeploymentWithAlloc is used to update the deployment state associated
  4439  // with the given allocation. The passed alloc may be updated if the deployment
  4440  // status has changed to capture the modify index at which it has changed.
  4441  func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *memdb.Txn) error {
  4442  	// Nothing to do if the allocation is not associated with a deployment
  4443  	if alloc.DeploymentID == "" {
  4444  		return nil
  4445  	}
  4447  	// Get the deployment
  4448  	ws := memdb.NewWatchSet()
  4449  	deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn)
  4450  	if err != nil {
  4451  		return err
  4452  	}
  4453  	if deployment == nil {
  4454  		return nil
  4455  	}
  4457  	// Retrieve the deployment state object
  4458  	_, ok := deployment.TaskGroups[alloc.TaskGroup]
  4459  	if !ok {
  4460  		// If the task group isn't part of the deployment, the task group wasn't
  4461  		// part of a rolling update so nothing to do
  4462  		return nil
  4463  	}
  4465  	// Do not modify in-place. Instead keep track of what must be done
  4466  	placed := 0
  4467  	healthy := 0
  4468  	unhealthy := 0
  4470  	// If there was no existing allocation, this is a placement and we increment
  4471  	// the placement
  4472  	existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth()
  4473  	allocHealthSet := alloc.DeploymentStatus.HasHealth()
  4474  	if existing == nil || existing.DeploymentID != alloc.DeploymentID {
  4475  		placed++
  4476  	} else if !existingHealthSet && allocHealthSet {
  4477  		if *alloc.DeploymentStatus.Healthy {
  4478  			healthy++
  4479  		} else {
  4480  			unhealthy++
  4481  		}
  4482  	} else if existingHealthSet && allocHealthSet {
  4483  		// See if it has gone from healthy to unhealthy
  4484  		if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy {
  4485  			healthy--
  4486  			unhealthy++
  4487  		}
  4488  	}
  4490  	// Nothing to do
  4491  	if placed == 0 && healthy == 0 && unhealthy == 0 {
  4492  		return nil
  4493  	}
  4495  	// Update the allocation's deployment status modify index
  4496  	if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 {
  4497  		alloc.DeploymentStatus.ModifyIndex = index
  4498  	}
  4500  	// Create a copy of the deployment object
  4501  	deploymentCopy := deployment.Copy()
  4502  	deploymentCopy.ModifyIndex = index
  4504  	state := deploymentCopy.TaskGroups[alloc.TaskGroup]
  4505  	state.PlacedAllocs += placed
  4506  	state.HealthyAllocs += healthy
  4507  	state.UnhealthyAllocs += unhealthy
  4509  	// Ensure PlacedCanaries accurately reflects the alloc canary status
  4510  	if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
  4511  		found := false
  4512  		for _, canary := range state.PlacedCanaries {
  4513  			if alloc.ID == canary {
  4514  				found = true
  4515  				break
  4516  			}
  4517  		}
  4518  		if !found {
  4519  			state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
  4520  		}
  4521  	}
  4523  	// Update the progress deadline
  4524  	if pd := state.ProgressDeadline; pd != 0 {
  4525  		// If we are the first placed allocation for the deployment start the progress deadline.
  4526  		if placed != 0 && state.RequireProgressBy.IsZero() {
  4527  			// Use modify time instead of create time because we may in-place
  4528  			// update the allocation to be part of a new deployment.
  4529  			state.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd)
  4530  		} else if healthy != 0 {
  4531  			if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(state.RequireProgressBy) {
  4532  				state.RequireProgressBy = d
  4533  			}
  4534  		}
  4535  	}
  4537  	// Upsert the deployment
  4538  	if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil {
  4539  		return err
  4540  	}
  4542  	return nil
  4543  }
// updateSummaryWithAlloc updates the task group counts of the job summary
// when an allocation is inserted or updated. alloc is the new allocation and
// existingAlloc the previously stored one, or nil for an insert.
//
// Nothing is changed if the allocation has no job attached, if the job has no
// summary because it is no longer stored, or if the summary's create index
// differs from that of the allocation's job. An error is returned if the job
// is stored but has no summary, or if the summary has no entry for the
// allocation's task group.
func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
	existingAlloc *structs.Allocation, txn *memdb.Txn) error {

	// We don't have to update the summary if the job is missing
	if alloc.Job == nil {
		return nil
	}

	summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID)
	if err != nil {
		return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err)
	}

	if summaryRaw == nil {
		// Check if the job is de-registered
		rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID)
		if err != nil {
			return fmt.Errorf("unable to query job: %v", err)
		}

		// If the job is de-registered then we skip updating its summary
		if rawJob == nil {
			return nil
		}

		// The job is stored but its summary is not: report the inconsistency.
		return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace)
	}

	// Get a copy of the existing summary
	jobSummary := summaryRaw.(*structs.JobSummary).Copy()

	// Not updating the job summary because the allocation doesn't belong to the
	// currently registered job
	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
		return nil
	}

	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
	if !ok {
		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
	}

	summaryChanged := false
	if existingAlloc == nil {
		// New allocation. A desired status of stop or evict is unexpected
		// here and is only logged.
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			s.logger.Error("new allocation inserted into state store with bad desired status",
				"alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
		}
		switch alloc.ClientStatus {
		case structs.AllocClientStatusPending:
			// A pending allocation counts as starting and takes one slot
			// out of the queued count, which is never driven below zero.
			tgSummary.Starting += 1
			if tgSummary.Queued > 0 {
				tgSummary.Queued -= 1
			}
			summaryChanged = true
		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
			structs.AllocClientStatusComplete:
			// These client statuses are unexpected on a new allocation; the
			// summary is left unchanged and the event is only logged.
			s.logger.Error("new allocation inserted into state store with bad client status",
				"alloc_id", alloc.ID, "client_status", alloc.ClientStatus)
		}
	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
		// Increment the count of the bin of the new client status
		switch alloc.ClientStatus {
		case structs.AllocClientStatusRunning:
			tgSummary.Running += 1
		case structs.AllocClientStatusFailed:
			tgSummary.Failed += 1
		case structs.AllocClientStatusPending:
			tgSummary.Starting += 1
		case structs.AllocClientStatusComplete:
			tgSummary.Complete += 1
		case structs.AllocClientStatusLost:
			tgSummary.Lost += 1
		}

		// Decrement the count of the bin of the previous client status,
		// never going below zero
		switch existingAlloc.ClientStatus {
		case structs.AllocClientStatusRunning:
			if tgSummary.Running > 0 {
				tgSummary.Running -= 1
			}
		case structs.AllocClientStatusPending:
			if tgSummary.Starting > 0 {
				tgSummary.Starting -= 1
			}
		case structs.AllocClientStatusLost:
			if tgSummary.Lost > 0 {
				tgSummary.Lost -= 1
			}
		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
			// Failed and complete counts are not decremented when an
			// allocation leaves those states.
		default:
			s.logger.Error("invalid old client status for allocation",
				"alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus)
		}
		summaryChanged = true
	}
	// tgSummary is a value copy, so write it back into the summary's map.
	jobSummary.Summary[alloc.TaskGroup] = tgSummary

	if summaryChanged {
		jobSummary.ModifyIndex = index

		// Update the indexes table for job summary
		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}

		if err := txn.Insert("job_summary", jobSummary); err != nil {
			return fmt.Errorf("updating job summary failed: %v", err)
		}
	}

	return nil
}
  4662  // updatePluginWithAlloc updates the CSI plugins for an alloc when the
  4663  // allocation is updated or inserted with a terminal server status.
  4664  func (s *StateStore) updatePluginWithAlloc(index uint64, alloc *structs.Allocation,
  4665  	txn *memdb.Txn) error {
  4666  	if !alloc.ServerTerminalStatus() {
  4667  		return nil
  4668  	}
  4670  	ws := memdb.NewWatchSet()
  4671  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
  4672  	for _, t := range tg.Tasks {
  4673  		if t.CSIPluginConfig != nil {
  4674  			pluginID := t.CSIPluginConfig.ID
  4675  			plug, err := s.CSIPluginByID(ws, pluginID)
  4676  			if err != nil {
  4677  				return err
  4678  			}
  4679  			if plug == nil {
  4680  				// plugin may not have been created because it never
  4681  				// became healthy, just move on
  4682  				return nil
  4683  			}
  4684  			err = plug.DeleteAlloc(alloc.ID, alloc.NodeID)
  4685  			if err != nil {
  4686  				return err
  4687  			}
  4688  			err = updateOrGCPlugin(index, txn, plug)
  4689  			if err != nil {
  4690  				return err
  4691  			}
  4692  		}
  4693  	}
  4695  	return nil
  4696  }
  4698  // UpsertACLPolicies is used to create or update a set of ACL policies
  4699  func (s *StateStore) UpsertACLPolicies(index uint64, policies []*structs.ACLPolicy) error {
  4700  	txn := s.db.Txn(true)
  4701  	defer txn.Abort()
  4703  	for _, policy := range policies {
  4704  		// Ensure the policy hash is non-nil. This should be done outside the state store
  4705  		// for performance reasons, but we check here for defense in depth.
  4706  		if len(policy.Hash) == 0 {
  4707  			policy.SetHash()
  4708  		}
  4710  		// Check if the policy already exists
  4711  		existing, err := txn.First("acl_policy", "id", policy.Name)
  4712  		if err != nil {
  4713  			return fmt.Errorf("policy lookup failed: %v", err)
  4714  		}
  4716  		// Update all the indexes
  4717  		if existing != nil {
  4718  			policy.CreateIndex = existing.(*structs.ACLPolicy).CreateIndex
  4719  			policy.ModifyIndex = index
  4720  		} else {
  4721  			policy.CreateIndex = index
  4722  			policy.ModifyIndex = index
  4723  		}
  4725  		// Update the policy
  4726  		if err := txn.Insert("acl_policy", policy); err != nil {
  4727  			return fmt.Errorf("upserting policy failed: %v", err)
  4728  		}
  4729  	}
  4731  	// Update the indexes tabl
  4732  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  4733  		return fmt.Errorf("index update failed: %v", err)
  4734  	}
  4736  	txn.Commit()
  4737  	return nil
  4738  }
  4740  // DeleteACLPolicies deletes the policies with the given names
  4741  func (s *StateStore) DeleteACLPolicies(index uint64, names []string) error {
  4742  	txn := s.db.Txn(true)
  4743  	defer txn.Abort()
  4745  	// Delete the policy
  4746  	for _, name := range names {
  4747  		if _, err := txn.DeleteAll("acl_policy", "id", name); err != nil {
  4748  			return fmt.Errorf("deleting acl policy failed: %v", err)
  4749  		}
  4750  	}
  4751  	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
  4752  		return fmt.Errorf("index update failed: %v", err)
  4753  	}
  4754  	txn.Commit()
  4755  	return nil
  4756  }
  4758  // ACLPolicyByName is used to lookup a policy by name
  4759  func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
  4760  	txn := s.db.Txn(false)
  4762  	watchCh, existing, err := txn.FirstWatch("acl_policy", "id", name)
  4763  	if err != nil {
  4764  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  4765  	}
  4766  	ws.Add(watchCh)
  4768  	if existing != nil {
  4769  		return existing.(*structs.ACLPolicy), nil
  4770  	}
  4771  	return nil, nil
  4772  }
  4774  // ACLPolicyByNamePrefix is used to lookup policies by prefix
  4775  func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  4776  	txn := s.db.Txn(false)
  4778  	iter, err := txn.Get("acl_policy", "id_prefix", prefix)
  4779  	if err != nil {
  4780  		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
  4781  	}
  4782  	ws.Add(iter.WatchCh())
  4784  	return iter, nil
  4785  }
  4787  // ACLPolicies returns an iterator over all the acl policies
  4788  func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4789  	txn := s.db.Txn(false)
  4791  	// Walk the entire table
  4792  	iter, err := txn.Get("acl_policy", "id")
  4793  	if err != nil {
  4794  		return nil, err
  4795  	}
  4796  	ws.Add(iter.WatchCh())
  4797  	return iter, nil
  4798  }
  4800  // UpsertACLTokens is used to create or update a set of ACL tokens
  4801  func (s *StateStore) UpsertACLTokens(index uint64, tokens []*structs.ACLToken) error {
  4802  	txn := s.db.Txn(true)
  4803  	defer txn.Abort()
  4805  	for _, token := range tokens {
  4806  		// Ensure the policy hash is non-nil. This should be done outside the state store
  4807  		// for performance reasons, but we check here for defense in depth.
  4808  		if len(token.Hash) == 0 {
  4809  			token.SetHash()
  4810  		}
  4812  		// Check if the token already exists
  4813  		existing, err := txn.First("acl_token", "id", token.AccessorID)
  4814  		if err != nil {
  4815  			return fmt.Errorf("token lookup failed: %v", err)
  4816  		}
  4818  		// Update all the indexes
  4819  		if existing != nil {
  4820  			existTK := existing.(*structs.ACLToken)
  4821  			token.CreateIndex = existTK.CreateIndex
  4822  			token.ModifyIndex = index
  4824  			// Do not allow SecretID or create time to change
  4825  			token.SecretID = existTK.SecretID
  4826  			token.CreateTime = existTK.CreateTime
  4828  		} else {
  4829  			token.CreateIndex = index
  4830  			token.ModifyIndex = index
  4831  		}
  4833  		// Update the token
  4834  		if err := txn.Insert("acl_token", token); err != nil {
  4835  			return fmt.Errorf("upserting token failed: %v", err)
  4836  		}
  4837  	}
  4839  	// Update the indexes table
  4840  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4841  		return fmt.Errorf("index update failed: %v", err)
  4842  	}
  4843  	txn.Commit()
  4844  	return nil
  4845  }
  4847  // DeleteACLTokens deletes the tokens with the given accessor ids
  4848  func (s *StateStore) DeleteACLTokens(index uint64, ids []string) error {
  4849  	txn := s.db.Txn(true)
  4850  	defer txn.Abort()
  4852  	// Delete the tokens
  4853  	for _, id := range ids {
  4854  		if _, err := txn.DeleteAll("acl_token", "id", id); err != nil {
  4855  			return fmt.Errorf("deleting acl token failed: %v", err)
  4856  		}
  4857  	}
  4858  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4859  		return fmt.Errorf("index update failed: %v", err)
  4860  	}
  4861  	txn.Commit()
  4862  	return nil
  4863  }
  4865  // ACLTokenByAccessorID is used to lookup a token by accessor ID
  4866  func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
  4867  	if id == "" {
  4868  		return nil, fmt.Errorf("acl token lookup failed: missing accessor id")
  4869  	}
  4871  	txn := s.db.Txn(false)
  4873  	watchCh, existing, err := txn.FirstWatch("acl_token", "id", id)
  4874  	if err != nil {
  4875  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4876  	}
  4877  	ws.Add(watchCh)
  4879  	if existing != nil {
  4880  		return existing.(*structs.ACLToken), nil
  4881  	}
  4882  	return nil, nil
  4883  }
  4885  // ACLTokenBySecretID is used to lookup a token by secret ID
  4886  func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) {
  4887  	if secretID == "" {
  4888  		return nil, fmt.Errorf("acl token lookup failed: missing secret id")
  4889  	}
  4891  	txn := s.db.Txn(false)
  4893  	watchCh, existing, err := txn.FirstWatch("acl_token", "secret", secretID)
  4894  	if err != nil {
  4895  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4896  	}
  4897  	ws.Add(watchCh)
  4899  	if existing != nil {
  4900  		return existing.(*structs.ACLToken), nil
  4901  	}
  4902  	return nil, nil
  4903  }
  4905  // ACLTokenByAccessorIDPrefix is used to lookup tokens by prefix
  4906  func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
  4907  	txn := s.db.Txn(false)
  4909  	iter, err := txn.Get("acl_token", "id_prefix", prefix)
  4910  	if err != nil {
  4911  		return nil, fmt.Errorf("acl token lookup failed: %v", err)
  4912  	}
  4913  	ws.Add(iter.WatchCh())
  4914  	return iter, nil
  4915  }
  4917  // ACLTokens returns an iterator over all the tokens
  4918  func (s *StateStore) ACLTokens(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  4919  	txn := s.db.Txn(false)
  4921  	// Walk the entire table
  4922  	iter, err := txn.Get("acl_token", "id")
  4923  	if err != nil {
  4924  		return nil, err
  4925  	}
  4926  	ws.Add(iter.WatchCh())
  4927  	return iter, nil
  4928  }
  4930  // ACLTokensByGlobal returns an iterator over all the tokens filtered by global value
  4931  func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool) (memdb.ResultIterator, error) {
  4932  	txn := s.db.Txn(false)
  4934  	// Walk the entire table
  4935  	iter, err := txn.Get("acl_token", "global", globalVal)
  4936  	if err != nil {
  4937  		return nil, err
  4938  	}
  4939  	ws.Add(iter.WatchCh())
  4940  	return iter, nil
  4941  }
  4943  // CanBootstrapACLToken checks if bootstrapping is possible and returns the reset index
  4944  func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) {
  4945  	txn := s.db.Txn(false)
  4947  	// Lookup the bootstrap sentinel
  4948  	out, err := txn.First("index", "id", "acl_token_bootstrap")
  4949  	if err != nil {
  4950  		return false, 0, err
  4951  	}
  4953  	// No entry, we haven't bootstrapped yet
  4954  	if out == nil {
  4955  		return true, 0, nil
  4956  	}
  4958  	// Return the reset index if we've already bootstrapped
  4959  	return false, out.(*IndexEntry).Value, nil
  4960  }
  4962  // BootstrapACLToken is used to create an initial ACL token
  4963  func (s *StateStore) BootstrapACLTokens(index, resetIndex uint64, token *structs.ACLToken) error {
  4964  	txn := s.db.Txn(true)
  4965  	defer txn.Abort()
  4967  	// Check if we have already done a bootstrap
  4968  	existing, err := txn.First("index", "id", "acl_token_bootstrap")
  4969  	if err != nil {
  4970  		return fmt.Errorf("bootstrap check failed: %v", err)
  4971  	}
  4972  	if existing != nil {
  4973  		if resetIndex == 0 {
  4974  			return fmt.Errorf("ACL bootstrap already done")
  4975  		} else if resetIndex != existing.(*IndexEntry).Value {
  4976  			return fmt.Errorf("Invalid reset index for ACL bootstrap")
  4977  		}
  4978  	}
  4980  	// Update the Create/Modify time
  4981  	token.CreateIndex = index
  4982  	token.ModifyIndex = index
  4984  	// Insert the token
  4985  	if err := txn.Insert("acl_token", token); err != nil {
  4986  		return fmt.Errorf("upserting token failed: %v", err)
  4987  	}
  4989  	// Update the indexes table, prevents future bootstrap until reset
  4990  	if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
  4991  		return fmt.Errorf("index update failed: %v", err)
  4992  	}
  4993  	if err := txn.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil {
  4994  		return fmt.Errorf("index update failed: %v", err)
  4995  	}
  4996  	txn.Commit()
  4997  	return nil
  4998  }
  5000  // SchedulerConfig is used to get the current Scheduler configuration.
  5001  func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) {
  5002  	tx := s.db.Txn(false)
  5003  	defer tx.Abort()
  5005  	// Get the scheduler config
  5006  	c, err := tx.First("scheduler_config", "id")
  5007  	if err != nil {
  5008  		return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err)
  5009  	}
  5011  	config, ok := c.(*structs.SchedulerConfiguration)
  5012  	if !ok {
  5013  		return 0, nil, nil
  5014  	}
  5016  	return config.ModifyIndex, config, nil
  5017  }
  5019  // SchedulerSetConfig is used to set the current Scheduler configuration.
  5020  func (s *StateStore) SchedulerSetConfig(idx uint64, config *structs.SchedulerConfiguration) error {
  5021  	tx := s.db.Txn(true)
  5022  	defer tx.Abort()
  5024  	s.schedulerSetConfigTxn(idx, tx, config)
  5026  	tx.Commit()
  5027  	return nil
  5028  }
  5030  func (s *StateStore) ClusterMetadata() (*structs.ClusterMetadata, error) {
  5031  	txn := s.db.Txn(false)
  5032  	defer txn.Abort()
  5034  	// Get the cluster metadata
  5035  	m, err := txn.First("cluster_meta", "id")
  5036  	if err != nil {
  5037  		return nil, errors.Wrap(err, "failed cluster metadata lookup")
  5038  	}
  5040  	if m != nil {
  5041  		return m.(*structs.ClusterMetadata), nil
  5042  	}
  5044  	return nil, nil
  5045  }
  5047  func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error {
  5048  	txn := s.db.Txn(true)
  5049  	defer txn.Abort()
  5051  	if err := s.setClusterMetadata(txn, meta); err != nil {
  5052  		return errors.Wrap(err, "set cluster metadata failed")
  5053  	}
  5055  	txn.Commit()
  5056  	return nil
  5057  }
  5059  // WithWriteTransaction executes the passed function within a write transaction,
  5060  // and returns its result.  If the invocation returns no error, the transaction
  5061  // is committed; otherwise, it's aborted.
  5062  func (s *StateStore) WithWriteTransaction(fn func(Txn) error) error {
  5063  	tx := s.db.Txn(true)
  5064  	defer tx.Abort()
  5066  	err := fn(tx)
  5067  	if err == nil {
  5068  		tx.Commit()
  5069  	}
  5070  	return err
  5071  }
  5073  // SchedulerCASConfig is used to update the scheduler configuration with a
  5074  // given Raft index. If the CAS index specified is not equal to the last observed index
  5075  // for the config, then the call is a noop.
  5076  func (s *StateStore) SchedulerCASConfig(idx, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) {
  5077  	tx := s.db.Txn(true)
  5078  	defer tx.Abort()
  5080  	// Check for an existing config
  5081  	existing, err := tx.First("scheduler_config", "id")
  5082  	if err != nil {
  5083  		return false, fmt.Errorf("failed scheduler config lookup: %s", err)
  5084  	}
  5086  	// If the existing index does not match the provided CAS
  5087  	// index arg, then we shouldn't update anything and can safely
  5088  	// return early here.
  5089  	e, ok := existing.(*structs.SchedulerConfiguration)
  5090  	if !ok || (e != nil && e.ModifyIndex != cidx) {
  5091  		return false, nil
  5092  	}
  5094  	s.schedulerSetConfigTxn(idx, tx, config)
  5096  	tx.Commit()
  5097  	return true, nil
  5098  }
  5100  func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *memdb.Txn, config *structs.SchedulerConfiguration) error {
  5101  	// Check for an existing config
  5102  	existing, err := tx.First("scheduler_config", "id")
  5103  	if err != nil {
  5104  		return fmt.Errorf("failed scheduler config lookup: %s", err)
  5105  	}
  5107  	// Set the indexes.
  5108  	if existing != nil {
  5109  		config.CreateIndex = existing.(*structs.SchedulerConfiguration).CreateIndex
  5110  	} else {
  5111  		config.CreateIndex = idx
  5112  	}
  5113  	config.ModifyIndex = idx
  5115  	if err := tx.Insert("scheduler_config", config); err != nil {
  5116  		return fmt.Errorf("failed updating scheduler config: %s", err)
  5117  	}
  5118  	return nil
  5119  }
  5121  func (s *StateStore) setClusterMetadata(txn *memdb.Txn, meta *structs.ClusterMetadata) error {
  5122  	// Check for an existing config, if it exists, sanity check the cluster ID matches
  5123  	existing, err := txn.First("cluster_meta", "id")
  5124  	if err != nil {
  5125  		return fmt.Errorf("failed cluster meta lookup: %v", err)
  5126  	}
  5128  	if existing != nil {
  5129  		existingClusterID := existing.(*structs.ClusterMetadata).ClusterID
  5130  		if meta.ClusterID != existingClusterID {
  5131  			// there is a bug in cluster ID detection
  5132  			return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID)
  5133  		}
  5134  	}
  5136  	// update is technically a noop, unless someday we add more / mutable fields
  5137  	if err := txn.Insert("cluster_meta", meta); err != nil {
  5138  		return fmt.Errorf("set cluster metadata failed: %v", err)
  5139  	}
  5141  	return nil
  5142  }
  5144  // UpsertScalingPolicy is used to insert a new scaling policy.
  5145  func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error {
  5146  	txn := s.db.Txn(true)
  5147  	defer txn.Abort()
  5149  	if err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn); err != nil {
  5150  		return err
  5151  	}
  5153  	txn.Commit()
  5154  	return nil
  5155  }
  5157  // upsertScalingPolicy is used to insert a new scaling policy.
  5158  func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy,
  5159  	txn *memdb.Txn) error {
  5161  	hadUpdates := false
  5163  	for _, policy := range scalingPolicies {
  5164  		// Check if the scaling policy already exists
  5165  		existing, err := txn.First("scaling_policy", "target",
  5166  			policy.Target[structs.ScalingTargetNamespace],
  5167  			policy.Target[structs.ScalingTargetJob],
  5168  			policy.Target[structs.ScalingTargetGroup])
  5169  		if err != nil {
  5170  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  5171  		}
  5173  		// Setup the indexes correctly
  5174  		if existing != nil {
  5175  			p := existing.(*structs.ScalingPolicy)
  5176  			if !p.Diff(policy) {
  5177  				continue
  5178  			}
  5179  			policy.ID = p.ID
  5180  			policy.CreateIndex = p.CreateIndex
  5181  			policy.ModifyIndex = index
  5182  		} else {
  5183  			// policy.ID must have been set already in Job.Register before log apply
  5184  			policy.CreateIndex = index
  5185  			policy.ModifyIndex = index
  5186  		}
  5188  		// Insert the scaling policy
  5189  		hadUpdates = true
  5190  		if err := txn.Insert("scaling_policy", policy); err != nil {
  5191  			return err
  5192  		}
  5193  	}
  5195  	// Update the indexes table for scaling policy
  5196  	if hadUpdates {
  5197  		if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  5198  			return fmt.Errorf("index update failed: %v", err)
  5199  		}
  5200  	}
  5202  	return nil
  5203  }
  5205  func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error {
  5206  	txn := s.db.Txn(true)
  5207  	defer txn.Abort()
  5209  	err := s.DeleteScalingPoliciesTxn(index, ids, txn)
  5210  	if err == nil {
  5211  		txn.Commit()
  5212  	}
  5214  	return err
  5215  }
  5217  // DeleteScalingPolicies is used to delete a set of scaling policies by ID
  5218  func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *memdb.Txn) error {
  5219  	if len(ids) == 0 {
  5220  		return nil
  5221  	}
  5223  	for _, id := range ids {
  5224  		// Lookup the scaling policy
  5225  		existing, err := txn.First("scaling_policy", "id", id)
  5226  		if err != nil {
  5227  			return fmt.Errorf("scaling policy lookup failed: %v", err)
  5228  		}
  5229  		if existing == nil {
  5230  			return fmt.Errorf("scaling policy not found")
  5231  		}
  5233  		// Delete the scaling policy
  5234  		if err := txn.Delete("scaling_policy", existing); err != nil {
  5235  			return fmt.Errorf("scaling policy delete failed: %v", err)
  5236  		}
  5237  	}
  5239  	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
  5240  		return fmt.Errorf("index update failed: %v", err)
  5241  	}
  5243  	return nil
  5244  }
  5246  // ScalingPolicies returns an iterator over all the scaling policies
  5247  func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
  5248  	txn := s.db.Txn(false)
  5250  	// Walk the entire scaling_policy table
  5251  	iter, err := txn.Get("scaling_policy", "id")
  5252  	if err != nil {
  5253  		return nil, err
  5254  	}
  5256  	ws.Add(iter.WatchCh())
  5258  	return iter, nil
  5259  }
  5261  func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
  5262  	txn := s.db.Txn(false)
  5264  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace)
  5265  	if err != nil {
  5266  		return nil, err
  5267  	}
  5269  	ws.Add(iter.WatchCh())
  5270  	return iter, nil
  5271  }
  5273  func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID string) (memdb.ResultIterator, error) {
  5274  	txn := s.db.Txn(false)
  5275  	return s.ScalingPoliciesByJobTxn(ws, namespace, jobID, txn)
  5276  }
  5278  func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string,
  5279  	txn *memdb.Txn) (memdb.ResultIterator, error) {
  5281  	iter, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID)
  5282  	if err != nil {
  5283  		return nil, err
  5284  	}
  5286  	ws.Add(iter.WatchCh())
  5287  	return iter, nil
  5288  }
  5290  func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) {
  5291  	txn := s.db.Txn(false)
  5293  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "id", id)
  5294  	if err != nil {
  5295  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  5296  	}
  5297  	ws.Add(watchCh)
  5299  	if existing != nil {
  5300  		return existing.(*structs.ScalingPolicy), nil
  5301  	}
  5303  	return nil, nil
  5304  }
  5306  func (s *StateStore) ScalingPolicyByTarget(ws memdb.WatchSet, target map[string]string) (*structs.ScalingPolicy,
  5307  	error) {
  5308  	txn := s.db.Txn(false)
  5310  	// currently, only scaling policy type is against a task group
  5311  	namespace := target[structs.ScalingTargetNamespace]
  5312  	job := target[structs.ScalingTargetJob]
  5313  	group := target[structs.ScalingTargetGroup]
  5315  	watchCh, existing, err := txn.FirstWatch("scaling_policy", "target", namespace, job, group)
  5316  	if err != nil {
  5317  		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
  5318  	}
  5319  	ws.Add(watchCh)
  5321  	if existing != nil {
  5322  		return existing.(*structs.ScalingPolicy), nil
  5323  	}
  5325  	return nil, nil
  5326  }
// StateSnapshot is used to provide a point-in-time snapshot.
// It embeds StateStore, so the state store's lookup methods (for example
// AllocByID, used by DenormalizeAllocationDiffSlice) are callable on it.
type StateSnapshot struct {
	StateStore
}
  5333  // DenormalizeAllocationsMap takes in a map of nodes to allocations, and queries the
  5334  // Allocation for each of the Allocation diffs and merges the updated attributes with
  5335  // the existing Allocation, and attaches the Job provided
  5336  func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error {
  5337  	for nodeID, allocs := range nodeAllocations {
  5338  		denormalizedAllocs, err := s.DenormalizeAllocationSlice(allocs)
  5339  		if err != nil {
  5340  			return err
  5341  		}
  5343  		nodeAllocations[nodeID] = denormalizedAllocs
  5344  	}
  5345  	return nil
  5346  }
  5348  // DenormalizeAllocationSlice queries the Allocation for each allocation diff
  5349  // represented as an Allocation and merges the updated attributes with the existing
  5350  // Allocation, and attaches the Job provided.
  5351  //
  5352  // This should only be called on terminal allocs, particularly stopped or preempted allocs
  5353  func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) {
  5354  	allocDiffs := make([]*structs.AllocationDiff, len(allocs))
  5355  	for i, alloc := range allocs {
  5356  		allocDiffs[i] = alloc.AllocationDiff()
  5357  	}
  5359  	return s.DenormalizeAllocationDiffSlice(allocDiffs)
  5360  }
  5362  // DenormalizeAllocationDiffSlice queries the Allocation for each AllocationDiff and merges
  5363  // the updated attributes with the existing Allocation, and attaches the Job provided.
  5364  //
  5365  // This should only be called on terminal alloc, particularly stopped or preempted allocs
  5366  func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) {
  5367  	// Output index for denormalized Allocations
  5368  	j := 0
  5370  	denormalizedAllocs := make([]*structs.Allocation, len(allocDiffs))
  5371  	for _, allocDiff := range allocDiffs {
  5372  		alloc, err := s.AllocByID(nil, allocDiff.ID)
  5373  		if err != nil {
  5374  			return nil, fmt.Errorf("alloc lookup failed: %v", err)
  5375  		}
  5376  		if alloc == nil {
  5377  			return nil, fmt.Errorf("alloc %v doesn't exist", allocDiff.ID)
  5378  		}
  5380  		// Merge the updates to the Allocation.  Don't update alloc.Job for terminal allocs
  5381  		// so alloc refers to the latest Job view before destruction and to ease handler implementations
  5382  		allocCopy := alloc.Copy()
  5384  		if allocDiff.PreemptedByAllocation != "" {
  5385  			allocCopy.PreemptedByAllocation = allocDiff.PreemptedByAllocation
  5386  			allocCopy.DesiredDescription = getPreemptedAllocDesiredDescription(allocDiff.PreemptedByAllocation)
  5387  			allocCopy.DesiredStatus = structs.AllocDesiredStatusEvict
  5388  		} else {
  5389  			// If alloc is a stopped alloc
  5390  			allocCopy.DesiredDescription = allocDiff.DesiredDescription
  5391  			allocCopy.DesiredStatus = structs.AllocDesiredStatusStop
  5392  			if allocDiff.ClientStatus != "" {
  5393  				allocCopy.ClientStatus = allocDiff.ClientStatus
  5394  			}
  5395  		}
  5396  		if allocDiff.ModifyTime != 0 {
  5397  			allocCopy.ModifyTime = allocDiff.ModifyTime
  5398  		}
  5400  		// Update the allocDiff in the slice to equal the denormalized alloc
  5401  		denormalizedAllocs[j] = allocCopy
  5402  		j++
  5403  	}
  5404  	// Retain only the denormalized Allocations in the slice
  5405  	denormalizedAllocs = denormalizedAllocs[:j]
  5406  	return denormalizedAllocs, nil
  5407  }
  5409  func getPreemptedAllocDesiredDescription(PreemptedByAllocID string) string {
  5410  	return fmt.Sprintf("Preempted by alloc ID %v", PreemptedByAllocID)
  5411  }
// StateRestore is used to optimize the performance when
// restoring state by only using a single large transaction
// instead of thousands of sub transactions
type StateRestore struct {
	// txn is the transaction the *Restore methods insert into; it is ended
	// by Abort or Commit.
	txn *memdb.Txn
}
  5420  // Abort is used to abort the restore operation
  5421  func (s *StateRestore) Abort() {
  5422  	s.txn.Abort()
  5423  }
  5425  // Commit is used to commit the restore operation
  5426  func (s *StateRestore) Commit() {
  5427  	s.txn.Commit()
  5428  }
  5430  // NodeRestore is used to restore a node
  5431  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  5432  	if err := r.txn.Insert("nodes", node); err != nil {
  5433  		return fmt.Errorf("node insert failed: %v", err)
  5434  	}
  5435  	return nil
  5436  }
  5438  // JobRestore is used to restore a job
  5439  func (r *StateRestore) JobRestore(job *structs.Job) error {
  5440  	if err := r.txn.Insert("jobs", job); err != nil {
  5441  		return fmt.Errorf("job insert failed: %v", err)
  5442  	}
  5443  	return nil
  5444  }
  5446  // EvalRestore is used to restore an evaluation
  5447  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  5448  	if err := r.txn.Insert("evals", eval); err != nil {
  5449  		return fmt.Errorf("eval insert failed: %v", err)
  5450  	}
  5451  	return nil
  5452  }
  5454  // AllocRestore is used to restore an allocation
  5455  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  5456  	if err := r.txn.Insert("allocs", alloc); err != nil {
  5457  		return fmt.Errorf("alloc insert failed: %v", err)
  5458  	}
  5459  	return nil
  5460  }
  5462  // IndexRestore is used to restore an index
  5463  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  5464  	if err := r.txn.Insert("index", idx); err != nil {
  5465  		return fmt.Errorf("index insert failed: %v", err)
  5466  	}
  5467  	return nil
  5468  }
  5470  // PeriodicLaunchRestore is used to restore a periodic launch.
  5471  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  5472  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  5473  		return fmt.Errorf("periodic launch insert failed: %v", err)
  5474  	}
  5475  	return nil
  5476  }
  5478  // JobSummaryRestore is used to restore a job summary
  5479  func (r *StateRestore) JobSummaryRestore(jobSummary *structs.JobSummary) error {
  5480  	if err := r.txn.Insert("job_summary", jobSummary); err != nil {
  5481  		return fmt.Errorf("job summary insert failed: %v", err)
  5482  	}
  5483  	return nil
  5484  }
  5486  // JobVersionRestore is used to restore a job version
  5487  func (r *StateRestore) JobVersionRestore(version *structs.Job) error {
  5488  	if err := r.txn.Insert("job_version", version); err != nil {
  5489  		return fmt.Errorf("job version insert failed: %v", err)
  5490  	}
  5491  	return nil
  5492  }
  5494  // DeploymentRestore is used to restore a deployment
  5495  func (r *StateRestore) DeploymentRestore(deployment *structs.Deployment) error {
  5496  	if err := r.txn.Insert("deployment", deployment); err != nil {
  5497  		return fmt.Errorf("deployment insert failed: %v", err)
  5498  	}
  5499  	return nil
  5500  }
  5502  // VaultAccessorRestore is used to restore a vault accessor
  5503  func (r *StateRestore) VaultAccessorRestore(accessor *structs.VaultAccessor) error {
  5504  	if err := r.txn.Insert("vault_accessors", accessor); err != nil {
  5505  		return fmt.Errorf("vault accessor insert failed: %v", err)
  5506  	}
  5507  	return nil
  5508  }
  5510  // SITokenAccessorRestore is used to restore an SI token accessor
  5511  func (r *StateRestore) SITokenAccessorRestore(accessor *structs.SITokenAccessor) error {
  5512  	if err := r.txn.Insert(siTokenAccessorTable, accessor); err != nil {
  5513  		return errors.Wrap(err, "si token accessor insert failed")
  5514  	}
  5515  	return nil
  5516  }
  5518  // ACLPolicyRestore is used to restore an ACL policy
  5519  func (r *StateRestore) ACLPolicyRestore(policy *structs.ACLPolicy) error {
  5520  	if err := r.txn.Insert("acl_policy", policy); err != nil {
  5521  		return fmt.Errorf("inserting acl policy failed: %v", err)
  5522  	}
  5523  	return nil
  5524  }
  5526  // ACLTokenRestore is used to restore an ACL token
  5527  func (r *StateRestore) ACLTokenRestore(token *structs.ACLToken) error {
  5528  	if err := r.txn.Insert("acl_token", token); err != nil {
  5529  		return fmt.Errorf("inserting acl token failed: %v", err)
  5530  	}
  5531  	return nil
  5532  }
  5534  func (r *StateRestore) SchedulerConfigRestore(schedConfig *structs.SchedulerConfiguration) error {
  5535  	if err := r.txn.Insert("scheduler_config", schedConfig); err != nil {
  5536  		return fmt.Errorf("inserting scheduler config failed: %s", err)
  5537  	}
  5538  	return nil
  5539  }
  5541  func (r *StateRestore) ClusterMetadataRestore(meta *structs.ClusterMetadata) error {
  5542  	if err := r.txn.Insert("cluster_meta", meta); err != nil {
  5543  		return fmt.Errorf("inserting cluster meta failed: %v", err)
  5544  	}
  5545  	return nil
  5546  }
  5548  // ScalingPolicyRestore is used to restore a scaling policy
  5549  func (r *StateRestore) ScalingPolicyRestore(scalingPolicy *structs.ScalingPolicy) error {
  5550  	if err := r.txn.Insert("scaling_policy", scalingPolicy); err != nil {
  5551  		return fmt.Errorf("scaling policy insert failed: %v", err)
  5552  	}
  5553  	return nil
  5554  }
  5556  // CSIPluginRestore is used to restore a CSI plugin
  5557  func (r *StateRestore) CSIPluginRestore(plugin *structs.CSIPlugin) error {
  5558  	if err := r.txn.Insert("csi_plugins", plugin); err != nil {
  5559  		return fmt.Errorf("csi plugin insert failed: %v", err)
  5560  	}
  5561  	return nil
  5562  }
  5564  // CSIVolumeRestore is used to restore a CSI volume
  5565  func (r *StateRestore) CSIVolumeRestore(volume *structs.CSIVolume) error {
  5566  	if err := r.txn.Insert("csi_volumes", volume); err != nil {
  5567  		return fmt.Errorf("csi volume insert failed: %v", err)
  5568  	}
  5569  	return nil
  5570  }
  5572  func (r *StateRestore) ScalingEventsRestore(jobEvents *structs.JobScalingEvents) error {
  5573  	if err := r.txn.Insert("scaling_event", jobEvents); err != nil {
  5574  		return fmt.Errorf("scaling event insert failed: %v", err)
  5575  	}
  5576  	return nil
  5577  }