github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/state/state_store.go

     1  package state
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/nomad/nomad/watch"
    12  )
    13  
    14  // IndexEntry is used with the "index" table
    15  // for managing the latest Raft index affecting a table.
    16  type IndexEntry struct {
    17  	Key   string
    18  	Value uint64
    19  }
    20  
    21  // The StateStore is responsible for maintaining all the Nomad
    22  // state. It is manipulated by the FSM which maintains consistency
    23  // through the use of Raft. The goals of the StateStore are to provide
    24  // high concurrency for read operations without blocking writes, and
    25  // to provide write availability in the face of reads. EVERY object
    26  // returned as a result of a read against the state store should be
    27  // considered a constant and NEVER modified in place.
    28  type StateStore struct {
    29  	logger *log.Logger
    30  	db     *memdb.MemDB
    31  	watch  *stateWatch
    32  }
    33  
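         // Because objects returned from reads are shared, mutators in this file
         // follow a copy-on-write pattern: look up the existing object, copy it,
         // modify the copy, and insert the copy back into memdb (see
         // UpdateNodeStatus and UpdateNodeDrain for the canonical shape).
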
    34  // NewStateStore is used to create a new state store
    35  func NewStateStore(logOutput io.Writer) (*StateStore, error) {
    36  	// Create the MemDB
    37  	db, err := memdb.NewMemDB(stateStoreSchema())
    38  	if err != nil {
    39  		return nil, fmt.Errorf("state store setup failed: %v", err)
    40  	}
    41  
    42  	// Create the state store
    43  	s := &StateStore{
    44  		logger: log.New(logOutput, "", log.LstdFlags),
    45  		db:     db,
    46  		watch:  newStateWatch(),
    47  	}
    48  	return s, nil
    49  }
    50  
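         // A minimal usage sketch (the literal IDs below are illustrative, not part
         // of this package): the FSM creates one store, applies Raft log entries via
         // the indexed mutators, and serves reads from read-only transactions.
         //
         //	state, err := NewStateStore(os.Stderr)
         //	if err != nil {
         //		panic(err)
         //	}
         //	node := &structs.Node{ID: "node-1"}
         //	_ = state.UpsertNode(1000, node)
         //	out, _ := state.NodeByID("node-1")
         //	_ = out // treat as read-only; never modify in place
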
    51  // Snapshot is used to create a point in time snapshot. Because
    52  // we use MemDB, we just need to snapshot the state of the underlying
    53  // database.
    54  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    55  	snap := &StateSnapshot{
    56  		StateStore: StateStore{
    57  			logger: s.logger,
    58  			db:     s.db.Snapshot(),
    59  			watch:  s.watch,
    60  		},
    61  	}
    62  	return snap, nil
    63  }
    64  
    65  // Restore is used to optimize the efficiency of rebuilding
    66  // state by minimizing the number of transactions and checking
    67  // overhead.
    68  func (s *StateStore) Restore() (*StateRestore, error) {
    69  	txn := s.db.Txn(true)
    70  	r := &StateRestore{
    71  		txn:   txn,
    72  		watch: s.watch,
    73  		items: watch.NewItems(),
    74  	}
    75  	return r, nil
    76  }
    77  
    78  // Watch subscribes a channel to a set of watch items.
    79  func (s *StateStore) Watch(items watch.Items, notify chan struct{}) {
    80  	s.watch.watch(items, notify)
    81  }
    82  
    83  // StopWatch unsubscribes a channel from a set of watch items.
    84  func (s *StateStore) StopWatch(items watch.Items, notify chan struct{}) {
    85  	s.watch.stopWatch(items, notify)
    86  }
    87  
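         // A hedged example of the watch mechanism (the buffered channel is the
         // caller's, not allocated here): subscribe to the items of interest, block
         // until a committed write touches one of them, then unsubscribe.
         //
         //	notify := make(chan struct{}, 1)
         //	items := watch.NewItems()
         //	items.Add(watch.Item{Table: "nodes"})
         //	state.Watch(items, notify)
         //	<-notify // signalled after a write affecting "nodes" commits
         //	state.StopWatch(items, notify)
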
    88  // UpsertJobSummary upserts a job summary into the state store.
    89  func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
    90  	txn := s.db.Txn(true)
    91  	defer txn.Abort()
    92  
     93  	// Insert the job summary
    94  	if err := txn.Insert("job_summary", *jobSummary); err != nil {
    95  		return err
    96  	}
    97  
    98  	// Update the indexes table for job summary
    99  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   100  		return fmt.Errorf("index update failed: %v", err)
   101  	}
   102  
   103  	txn.Commit()
   104  	return nil
   105  }
   106  
   107  // DeleteJobSummary deletes the job summary with the given ID. This is for
   108  // testing purposes only.
   109  func (s *StateStore) DeleteJobSummary(index uint64, id string) error {
   110  	txn := s.db.Txn(true)
   111  	defer txn.Abort()
   112  
   113  	// Delete the job summary
   114  	if _, err := txn.DeleteAll("job_summary", "id", id); err != nil {
   115  		return fmt.Errorf("deleting job summary failed: %v", err)
   116  	}
   117  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   118  		return fmt.Errorf("index update failed: %v", err)
   119  	}
   120  	txn.Commit()
   121  	return nil
   122  }
   123  
   124  // UpsertNode is used to register a node or update a node definition
   125  // This is assumed to be triggered by the client, so we retain the value
   126  // of drain which is set by the scheduler.
   127  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
   128  	txn := s.db.Txn(true)
   129  	defer txn.Abort()
   130  
   131  	watcher := watch.NewItems()
   132  	watcher.Add(watch.Item{Table: "nodes"})
   133  	watcher.Add(watch.Item{Node: node.ID})
   134  
   135  	// Check if the node already exists
   136  	existing, err := txn.First("nodes", "id", node.ID)
   137  	if err != nil {
   138  		return fmt.Errorf("node lookup failed: %v", err)
   139  	}
   140  
   141  	// Setup the indexes correctly
   142  	if existing != nil {
   143  		exist := existing.(*structs.Node)
   144  		node.CreateIndex = exist.CreateIndex
   145  		node.ModifyIndex = index
   146  		node.Drain = exist.Drain // Retain the drain mode
   147  	} else {
   148  		node.CreateIndex = index
   149  		node.ModifyIndex = index
   150  	}
   151  
   152  	// Insert the node
   153  	if err := txn.Insert("nodes", node); err != nil {
   154  		return fmt.Errorf("node insert failed: %v", err)
   155  	}
   156  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   157  		return fmt.Errorf("index update failed: %v", err)
   158  	}
   159  
   160  	txn.Defer(func() { s.watch.notify(watcher) })
   161  	txn.Commit()
   162  	return nil
   163  }
   164  
   165  // DeleteNode is used to deregister a node
   166  func (s *StateStore) DeleteNode(index uint64, nodeID string) error {
   167  	txn := s.db.Txn(true)
   168  	defer txn.Abort()
   169  
   170  	// Lookup the node
   171  	existing, err := txn.First("nodes", "id", nodeID)
   172  	if err != nil {
   173  		return fmt.Errorf("node lookup failed: %v", err)
   174  	}
   175  	if existing == nil {
   176  		return fmt.Errorf("node not found")
   177  	}
   178  
   179  	watcher := watch.NewItems()
   180  	watcher.Add(watch.Item{Table: "nodes"})
   181  	watcher.Add(watch.Item{Node: nodeID})
   182  
   183  	// Delete the node
   184  	if err := txn.Delete("nodes", existing); err != nil {
   185  		return fmt.Errorf("node delete failed: %v", err)
   186  	}
   187  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   188  		return fmt.Errorf("index update failed: %v", err)
   189  	}
   190  
   191  	txn.Defer(func() { s.watch.notify(watcher) })
   192  	txn.Commit()
   193  	return nil
   194  }
   195  
   196  // UpdateNodeStatus is used to update the status of a node
   197  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error {
   198  	txn := s.db.Txn(true)
   199  	defer txn.Abort()
   200  
   201  	watcher := watch.NewItems()
   202  	watcher.Add(watch.Item{Table: "nodes"})
   203  	watcher.Add(watch.Item{Node: nodeID})
   204  
   205  	// Lookup the node
   206  	existing, err := txn.First("nodes", "id", nodeID)
   207  	if err != nil {
   208  		return fmt.Errorf("node lookup failed: %v", err)
   209  	}
   210  	if existing == nil {
   211  		return fmt.Errorf("node not found")
   212  	}
   213  
   214  	// Copy the existing node
   215  	existingNode := existing.(*structs.Node)
   216  	copyNode := new(structs.Node)
   217  	*copyNode = *existingNode
   218  
   219  	// Update the status in the copy
   220  	copyNode.Status = status
   221  	copyNode.ModifyIndex = index
   222  
   223  	// Insert the node
   224  	if err := txn.Insert("nodes", copyNode); err != nil {
   225  		return fmt.Errorf("node update failed: %v", err)
   226  	}
   227  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   228  		return fmt.Errorf("index update failed: %v", err)
   229  	}
   230  
   231  	txn.Defer(func() { s.watch.notify(watcher) })
   232  	txn.Commit()
   233  	return nil
   234  }
   235  
   236  // UpdateNodeDrain is used to update the drain of a node
   237  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error {
   238  	txn := s.db.Txn(true)
   239  	defer txn.Abort()
   240  
   241  	watcher := watch.NewItems()
   242  	watcher.Add(watch.Item{Table: "nodes"})
   243  	watcher.Add(watch.Item{Node: nodeID})
   244  
   245  	// Lookup the node
   246  	existing, err := txn.First("nodes", "id", nodeID)
   247  	if err != nil {
   248  		return fmt.Errorf("node lookup failed: %v", err)
   249  	}
   250  	if existing == nil {
   251  		return fmt.Errorf("node not found")
   252  	}
   253  
   254  	// Copy the existing node
   255  	existingNode := existing.(*structs.Node)
   256  	copyNode := new(structs.Node)
   257  	*copyNode = *existingNode
   258  
   259  	// Update the drain in the copy
   260  	copyNode.Drain = drain
   261  	copyNode.ModifyIndex = index
   262  
   263  	// Insert the node
   264  	if err := txn.Insert("nodes", copyNode); err != nil {
   265  		return fmt.Errorf("node update failed: %v", err)
   266  	}
   267  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   268  		return fmt.Errorf("index update failed: %v", err)
   269  	}
   270  
   271  	txn.Defer(func() { s.watch.notify(watcher) })
   272  	txn.Commit()
   273  	return nil
   274  }
   275  
   276  // NodeByID is used to lookup a node by ID
   277  func (s *StateStore) NodeByID(nodeID string) (*structs.Node, error) {
   278  	txn := s.db.Txn(false)
   279  
   280  	existing, err := txn.First("nodes", "id", nodeID)
   281  	if err != nil {
   282  		return nil, fmt.Errorf("node lookup failed: %v", err)
   283  	}
   284  
   285  	if existing != nil {
   286  		return existing.(*structs.Node), nil
   287  	}
   288  	return nil, nil
   289  }
   290  
   291  // NodesByIDPrefix is used to lookup nodes by prefix
   292  func (s *StateStore) NodesByIDPrefix(nodeID string) (memdb.ResultIterator, error) {
   293  	txn := s.db.Txn(false)
   294  
   295  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
   296  	if err != nil {
   297  		return nil, fmt.Errorf("node lookup failed: %v", err)
   298  	}
   299  
   300  	return iter, nil
   301  }
   302  
   303  // Nodes returns an iterator over all the nodes
   304  func (s *StateStore) Nodes() (memdb.ResultIterator, error) {
   305  	txn := s.db.Txn(false)
   306  
   307  	// Walk the entire nodes table
   308  	iter, err := txn.Get("nodes", "id")
   309  	if err != nil {
   310  		return nil, err
   311  	}
   312  	return iter, nil
   313  }
   314  
   315  // UpsertJob is used to register a job or update a job definition
   316  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
   317  	txn := s.db.Txn(true)
   318  	defer txn.Abort()
   319  
   320  	watcher := watch.NewItems()
   321  	watcher.Add(watch.Item{Table: "jobs"})
   322  	watcher.Add(watch.Item{Job: job.ID})
   323  
   324  	// Check if the job already exists
   325  	existing, err := txn.First("jobs", "id", job.ID)
   326  	if err != nil {
   327  		return fmt.Errorf("job lookup failed: %v", err)
   328  	}
   329  
   330  	// Setup the indexes correctly
   331  	if existing != nil {
   332  		job.CreateIndex = existing.(*structs.Job).CreateIndex
   333  		job.ModifyIndex = index
   334  		job.JobModifyIndex = index
   335  
   336  		// Compute the job status
   337  		var err error
   338  		job.Status, err = s.getJobStatus(txn, job, false)
   339  		if err != nil {
   340  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
   341  		}
   342  	} else {
   343  		job.CreateIndex = index
   344  		job.ModifyIndex = index
   345  		job.JobModifyIndex = index
   346  
   347  		// If we are inserting the job for the first time, we don't need to
    348  		// calculate the job's status as it is known.
   349  		if job.IsPeriodic() {
   350  			job.Status = structs.JobStatusRunning
   351  		} else {
   352  			job.Status = structs.JobStatusPending
   353  		}
   354  	}
   355  
   356  	if err := s.updateSummaryWithJob(index, job, watcher, txn); err != nil {
   357  		return fmt.Errorf("unable to create job summary: %v", err)
   358  	}
   359  
   360  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
   361  	// COMPAT 0.4.1 -> 0.5
   362  	s.addEphemeralDiskToTaskGroups(job)
   363  
   364  	// Insert the job
   365  	if err := txn.Insert("jobs", job); err != nil {
   366  		return fmt.Errorf("job insert failed: %v", err)
   367  	}
   368  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   369  		return fmt.Errorf("index update failed: %v", err)
   370  	}
   371  
   372  	txn.Defer(func() { s.watch.notify(watcher) })
   373  	txn.Commit()
   374  	return nil
   375  }
   376  
   377  // DeleteJob is used to deregister a job
   378  func (s *StateStore) DeleteJob(index uint64, jobID string) error {
   379  	txn := s.db.Txn(true)
   380  	defer txn.Abort()
   381  
    382  	// Lookup the job
   383  	existing, err := txn.First("jobs", "id", jobID)
   384  	if err != nil {
   385  		return fmt.Errorf("job lookup failed: %v", err)
   386  	}
   387  	if existing == nil {
   388  		return fmt.Errorf("job not found")
   389  	}
   390  
   391  	watcher := watch.NewItems()
   392  	watcher.Add(watch.Item{Table: "jobs"})
   393  	watcher.Add(watch.Item{Job: jobID})
   394  	watcher.Add(watch.Item{Table: "job_summary"})
   395  	watcher.Add(watch.Item{JobSummary: jobID})
   396  
    397  	// Delete the job
   398  	if err := txn.Delete("jobs", existing); err != nil {
   399  		return fmt.Errorf("job delete failed: %v", err)
   400  	}
   401  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   402  		return fmt.Errorf("index update failed: %v", err)
   403  	}
   404  
   405  	// Delete the job summary
   406  	if _, err = txn.DeleteAll("job_summary", "id", jobID); err != nil {
    407  		return fmt.Errorf("deleting job summary failed: %v", err)
   408  	}
   409  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   410  		return fmt.Errorf("index update failed: %v", err)
   411  	}
   412  
   413  	txn.Defer(func() { s.watch.notify(watcher) })
   414  	txn.Commit()
   415  	return nil
   416  }
   417  
   418  // JobByID is used to lookup a job by its ID
   419  func (s *StateStore) JobByID(id string) (*structs.Job, error) {
   420  	txn := s.db.Txn(false)
   421  
   422  	existing, err := txn.First("jobs", "id", id)
   423  	if err != nil {
   424  		return nil, fmt.Errorf("job lookup failed: %v", err)
   425  	}
   426  
   427  	if existing != nil {
   428  		return existing.(*structs.Job), nil
   429  	}
   430  	return nil, nil
   431  }
   432  
   433  // JobsByIDPrefix is used to lookup a job by prefix
   434  func (s *StateStore) JobsByIDPrefix(id string) (memdb.ResultIterator, error) {
   435  	txn := s.db.Txn(false)
   436  
   437  	iter, err := txn.Get("jobs", "id_prefix", id)
   438  	if err != nil {
   439  		return nil, fmt.Errorf("job lookup failed: %v", err)
   440  	}
   441  
   442  	return iter, nil
   443  }
   444  
   445  // Jobs returns an iterator over all the jobs
   446  func (s *StateStore) Jobs() (memdb.ResultIterator, error) {
   447  	txn := s.db.Txn(false)
   448  
   449  	// Walk the entire jobs table
   450  	iter, err := txn.Get("jobs", "id")
   451  	if err != nil {
   452  		return nil, err
   453  	}
   454  	return iter, nil
   455  }
   456  
   457  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
   458  func (s *StateStore) JobsByPeriodic(periodic bool) (memdb.ResultIterator, error) {
   459  	txn := s.db.Txn(false)
   460  
   461  	iter, err := txn.Get("jobs", "periodic", periodic)
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  	return iter, nil
   466  }
   467  
   468  // JobsByScheduler returns an iterator over all the jobs with the specific
   469  // scheduler type.
   470  func (s *StateStore) JobsByScheduler(schedulerType string) (memdb.ResultIterator, error) {
   471  	txn := s.db.Txn(false)
   472  
   473  	// Return an iterator for jobs with the specific type.
   474  	iter, err := txn.Get("jobs", "type", schedulerType)
   475  	if err != nil {
   476  		return nil, err
   477  	}
   478  	return iter, nil
   479  }
   480  
    481  // JobsByGC returns an iterator over all jobs eligible or ineligible for garbage
   482  // collection.
   483  func (s *StateStore) JobsByGC(gc bool) (memdb.ResultIterator, error) {
   484  	txn := s.db.Txn(false)
   485  
   486  	iter, err := txn.Get("jobs", "gc", gc)
   487  	if err != nil {
   488  		return nil, err
   489  	}
   490  	return iter, nil
   491  }
   492  
    493  // JobSummaryByID returns the job summary object that matches the given job ID.
   494  func (s *StateStore) JobSummaryByID(jobID string) (*structs.JobSummary, error) {
   495  	txn := s.db.Txn(false)
   496  
   497  	existing, err := txn.First("job_summary", "id", jobID)
   498  	if err != nil {
   499  		return nil, err
   500  	}
   501  	if existing != nil {
   502  		summary := existing.(structs.JobSummary)
   503  		return summary.Copy(), nil
   504  	}
   505  
   506  	return nil, nil
   507  }
   508  
   509  // JobSummaries walks the entire job summary table and returns all the job
   510  // summary objects
   511  func (s *StateStore) JobSummaries() (memdb.ResultIterator, error) {
   512  	txn := s.db.Txn(false)
   513  
   514  	iter, err := txn.Get("job_summary", "id")
   515  	if err != nil {
   516  		return nil, err
   517  	}
   518  	return iter, nil
   519  }
   520  
   521  // JobSummaryByPrefix is used to look up Job Summary by id prefix
   522  func (s *StateStore) JobSummaryByPrefix(id string) (memdb.ResultIterator, error) {
   523  	txn := s.db.Txn(false)
   524  
   525  	iter, err := txn.Get("job_summary", "id_prefix", id)
   526  	if err != nil {
    527  		return nil, fmt.Errorf("job summary lookup failed: %v", err)
   528  	}
   529  
   530  	return iter, nil
   531  }
   532  
   533  // UpsertPeriodicLaunch is used to register a launch or update it.
   534  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
   535  	txn := s.db.Txn(true)
   536  	defer txn.Abort()
   537  
   538  	watcher := watch.NewItems()
   539  	watcher.Add(watch.Item{Table: "periodic_launch"})
   540  	watcher.Add(watch.Item{Job: launch.ID})
   541  
    542  	// Check if the launch already exists
   543  	existing, err := txn.First("periodic_launch", "id", launch.ID)
   544  	if err != nil {
   545  		return fmt.Errorf("periodic launch lookup failed: %v", err)
   546  	}
   547  
   548  	// Setup the indexes correctly
   549  	if existing != nil {
   550  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
   551  		launch.ModifyIndex = index
   552  	} else {
   553  		launch.CreateIndex = index
   554  		launch.ModifyIndex = index
   555  	}
   556  
    557  	// Insert the launch
   558  	if err := txn.Insert("periodic_launch", launch); err != nil {
   559  		return fmt.Errorf("launch insert failed: %v", err)
   560  	}
   561  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   562  		return fmt.Errorf("index update failed: %v", err)
   563  	}
   564  
   565  	txn.Defer(func() { s.watch.notify(watcher) })
   566  	txn.Commit()
   567  	return nil
   568  }
   569  
   570  // DeletePeriodicLaunch is used to delete the periodic launch
   571  func (s *StateStore) DeletePeriodicLaunch(index uint64, jobID string) error {
   572  	txn := s.db.Txn(true)
   573  	defer txn.Abort()
   574  
   575  	// Lookup the launch
   576  	existing, err := txn.First("periodic_launch", "id", jobID)
   577  	if err != nil {
   578  		return fmt.Errorf("launch lookup failed: %v", err)
   579  	}
   580  	if existing == nil {
   581  		return fmt.Errorf("launch not found")
   582  	}
   583  
   584  	watcher := watch.NewItems()
   585  	watcher.Add(watch.Item{Table: "periodic_launch"})
   586  	watcher.Add(watch.Item{Job: jobID})
   587  
   588  	// Delete the launch
   589  	if err := txn.Delete("periodic_launch", existing); err != nil {
   590  		return fmt.Errorf("launch delete failed: %v", err)
   591  	}
   592  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   593  		return fmt.Errorf("index update failed: %v", err)
   594  	}
   595  
   596  	txn.Defer(func() { s.watch.notify(watcher) })
   597  	txn.Commit()
   598  	return nil
   599  }
   600  
   601  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
   602  // ID.
   603  func (s *StateStore) PeriodicLaunchByID(id string) (*structs.PeriodicLaunch, error) {
   604  	txn := s.db.Txn(false)
   605  
   606  	existing, err := txn.First("periodic_launch", "id", id)
   607  	if err != nil {
   608  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
   609  	}
   610  
   611  	if existing != nil {
   612  		return existing.(*structs.PeriodicLaunch), nil
   613  	}
   614  	return nil, nil
   615  }
   616  
   617  // PeriodicLaunches returns an iterator over all the periodic launches
   618  func (s *StateStore) PeriodicLaunches() (memdb.ResultIterator, error) {
   619  	txn := s.db.Txn(false)
   620  
   621  	// Walk the entire table
   622  	iter, err := txn.Get("periodic_launch", "id")
   623  	if err != nil {
   624  		return nil, err
   625  	}
   626  	return iter, nil
   627  }
   628  
    629  // UpsertEvals is used to upsert a set of evaluations
   630  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
   631  	txn := s.db.Txn(true)
   632  	defer txn.Abort()
   633  
   634  	watcher := watch.NewItems()
   635  	watcher.Add(watch.Item{Table: "evals"})
   636  
   637  	// Do a nested upsert
   638  	jobs := make(map[string]string, len(evals))
   639  	for _, eval := range evals {
   640  		watcher.Add(watch.Item{Eval: eval.ID})
   641  		watcher.Add(watch.Item{EvalJob: eval.JobID})
   642  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   643  			return err
   644  		}
   645  
   646  		jobs[eval.JobID] = ""
   647  	}
   648  
   649  	// Set the job's status
   650  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   651  		return fmt.Errorf("setting job status failed: %v", err)
   652  	}
   653  
   654  	txn.Defer(func() { s.watch.notify(watcher) })
   655  	txn.Commit()
   656  	return nil
   657  }
   658  
    659  // nestedUpsertEval is used to nest an evaluation upsert within a transaction
   660  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
   661  	// Lookup the evaluation
   662  	existing, err := txn.First("evals", "id", eval.ID)
   663  	if err != nil {
   664  		return fmt.Errorf("eval lookup failed: %v", err)
   665  	}
   666  
   667  	// Update the indexes
   668  	if existing != nil {
   669  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
   670  		eval.ModifyIndex = index
   671  	} else {
   672  		eval.CreateIndex = index
   673  		eval.ModifyIndex = index
   674  	}
   675  
   676  	// Update the job summary
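         	// Propagate the eval's QueuedAllocations counts into the per-task-group
         	// Queued fields of the summary, if they changed.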
   677  	summaryRaw, err := txn.First("job_summary", "id", eval.JobID)
   678  	if err != nil {
   679  		return fmt.Errorf("job summary lookup failed: %v", err)
   680  	}
   681  	if summaryRaw != nil {
   682  		js := summaryRaw.(structs.JobSummary)
   683  		var hasSummaryChanged bool
   684  		for tg, num := range eval.QueuedAllocations {
   685  			if summary, ok := js.Summary[tg]; ok {
   686  				if summary.Queued != num {
   687  					summary.Queued = num
   688  					js.Summary[tg] = summary
   689  					hasSummaryChanged = true
   690  				}
   691  			} else {
   692  				s.logger.Printf("[ERR] state_store: unable to update queued for job %q and task group %q", eval.JobID, tg)
   693  			}
   694  		}
   695  
   696  		// Insert the job summary
   697  		if hasSummaryChanged {
   698  			js.ModifyIndex = index
   699  			if err := txn.Insert("job_summary", js); err != nil {
   700  				return fmt.Errorf("job summary insert failed: %v", err)
   701  			}
   702  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   703  				return fmt.Errorf("index update failed: %v", err)
   704  			}
   705  		}
   706  	}
   707  
   708  	// Insert the eval
   709  	if err := txn.Insert("evals", eval); err != nil {
   710  		return fmt.Errorf("eval insert failed: %v", err)
   711  	}
   712  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   713  		return fmt.Errorf("index update failed: %v", err)
   714  	}
   715  	return nil
   716  }
   717  
    718  // DeleteEval is used to delete a set of evaluations and allocations
   719  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
   720  	txn := s.db.Txn(true)
   721  	defer txn.Abort()
   722  	watcher := watch.NewItems()
   723  	watcher.Add(watch.Item{Table: "evals"})
   724  	watcher.Add(watch.Item{Table: "allocs"})
   725  
   726  	jobs := make(map[string]string, len(evals))
   727  	for _, eval := range evals {
   728  		existing, err := txn.First("evals", "id", eval)
   729  		if err != nil {
   730  			return fmt.Errorf("eval lookup failed: %v", err)
   731  		}
   732  		if existing == nil {
   733  			continue
   734  		}
   735  		if err := txn.Delete("evals", existing); err != nil {
   736  			return fmt.Errorf("eval delete failed: %v", err)
   737  		}
   738  		jobID := existing.(*structs.Evaluation).JobID
   739  		watcher.Add(watch.Item{Eval: eval})
   740  		watcher.Add(watch.Item{EvalJob: jobID})
   741  		jobs[jobID] = ""
   742  	}
   743  
   744  	for _, alloc := range allocs {
   745  		existing, err := txn.First("allocs", "id", alloc)
   746  		if err != nil {
   747  			return fmt.Errorf("alloc lookup failed: %v", err)
   748  		}
   749  		if existing == nil {
   750  			continue
   751  		}
   752  		if err := txn.Delete("allocs", existing); err != nil {
   753  			return fmt.Errorf("alloc delete failed: %v", err)
   754  		}
   755  		realAlloc := existing.(*structs.Allocation)
   756  		watcher.Add(watch.Item{Alloc: realAlloc.ID})
   757  		watcher.Add(watch.Item{AllocEval: realAlloc.EvalID})
   758  		watcher.Add(watch.Item{AllocJob: realAlloc.JobID})
   759  		watcher.Add(watch.Item{AllocNode: realAlloc.NodeID})
   760  	}
   761  
   762  	// Update the indexes
   763  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   764  		return fmt.Errorf("index update failed: %v", err)
   765  	}
   766  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   767  		return fmt.Errorf("index update failed: %v", err)
   768  	}
   769  
   770  	// Set the job's status
   771  	if err := s.setJobStatuses(index, watcher, txn, jobs, true); err != nil {
   772  		return fmt.Errorf("setting job status failed: %v", err)
   773  	}
   774  
   775  	txn.Defer(func() { s.watch.notify(watcher) })
   776  	txn.Commit()
   777  	return nil
   778  }
   779  
   780  // EvalByID is used to lookup an eval by its ID
   781  func (s *StateStore) EvalByID(id string) (*structs.Evaluation, error) {
   782  	txn := s.db.Txn(false)
   783  
   784  	existing, err := txn.First("evals", "id", id)
   785  	if err != nil {
   786  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   787  	}
   788  
   789  	if existing != nil {
   790  		return existing.(*structs.Evaluation), nil
   791  	}
   792  	return nil, nil
   793  }
   794  
   795  // EvalsByIDPrefix is used to lookup evaluations by prefix
   796  func (s *StateStore) EvalsByIDPrefix(id string) (memdb.ResultIterator, error) {
   797  	txn := s.db.Txn(false)
   798  
   799  	iter, err := txn.Get("evals", "id_prefix", id)
   800  	if err != nil {
   801  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   802  	}
   803  
   804  	return iter, nil
   805  }
   806  
   807  // EvalsByJob returns all the evaluations by job id
   808  func (s *StateStore) EvalsByJob(jobID string) ([]*structs.Evaluation, error) {
   809  	txn := s.db.Txn(false)
   810  
    811  	// Get an iterator over the job's evaluations
   812  	iter, err := txn.Get("evals", "job", jobID)
   813  	if err != nil {
   814  		return nil, err
   815  	}
   816  
   817  	var out []*structs.Evaluation
   818  	for {
   819  		raw := iter.Next()
   820  		if raw == nil {
   821  			break
   822  		}
   823  		out = append(out, raw.(*structs.Evaluation))
   824  	}
   825  	return out, nil
   826  }
   827  
   828  // Evals returns an iterator over all the evaluations
   829  func (s *StateStore) Evals() (memdb.ResultIterator, error) {
   830  	txn := s.db.Txn(false)
   831  
   832  	// Walk the entire table
   833  	iter, err := txn.Get("evals", "id")
   834  	if err != nil {
   835  		return nil, err
   836  	}
   837  	return iter, nil
   838  }
   839  
    840  
    841  // UpdateAllocsFromClient is used to update an allocation based on input
   842  // from a client. While the schedulers are the authority on the allocation for
   843  // most things, some updates are authoritative from the client. Specifically,
   844  // the desired state comes from the schedulers, while the actual state comes
   845  // from clients.
   846  func (s *StateStore) UpdateAllocsFromClient(index uint64, allocs []*structs.Allocation) error {
   847  	txn := s.db.Txn(true)
   848  	defer txn.Abort()
   849  
   850  	// Setup the watcher
   851  	watcher := watch.NewItems()
   852  	watcher.Add(watch.Item{Table: "allocs"})
   853  
   854  	// Handle each of the updated allocations
   855  	for _, alloc := range allocs {
   856  		if err := s.nestedUpdateAllocFromClient(txn, watcher, index, alloc); err != nil {
   857  			return err
   858  		}
   859  	}
   860  
   861  	// Update the indexes
   862  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   863  		return fmt.Errorf("index update failed: %v", err)
   864  	}
   865  
   866  	txn.Defer(func() { s.watch.notify(watcher) })
   867  	txn.Commit()
   868  	return nil
   869  }
   870  
   871  // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status
   872  func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, watcher watch.Items, index uint64, alloc *structs.Allocation) error {
   873  	// Look for existing alloc
   874  	existing, err := txn.First("allocs", "id", alloc.ID)
   875  	if err != nil {
   876  		return fmt.Errorf("alloc lookup failed: %v", err)
   877  	}
   878  
   879  	// Nothing to do if this does not exist
   880  	if existing == nil {
   881  		return nil
   882  	}
   883  	exist := existing.(*structs.Allocation)
   884  	// Trigger the watcher
   885  	watcher.Add(watch.Item{Alloc: alloc.ID})
   886  	watcher.Add(watch.Item{AllocEval: exist.EvalID})
   887  	watcher.Add(watch.Item{AllocJob: exist.JobID})
   888  	watcher.Add(watch.Item{AllocNode: exist.NodeID})
   889  
   890  	// Copy everything from the existing allocation
   891  	copyAlloc := new(structs.Allocation)
   892  	*copyAlloc = *exist
   893  
   894  	// Pull in anything the client is the authority on
   895  	copyAlloc.ClientStatus = alloc.ClientStatus
   896  	copyAlloc.ClientDescription = alloc.ClientDescription
   897  	copyAlloc.TaskStates = alloc.TaskStates
   898  
   899  	// Update the modify index
   900  	copyAlloc.ModifyIndex = index
   901  
   902  	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, watcher, txn); err != nil {
   903  		return fmt.Errorf("error updating job summary: %v", err)
   904  	}
   905  
   906  	// Update the allocation
   907  	if err := txn.Insert("allocs", copyAlloc); err != nil {
   908  		return fmt.Errorf("alloc insert failed: %v", err)
   909  	}
   910  
   911  	// Set the job's status
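         	// A non-terminal allocation forces the job to running; otherwise the
         	// status is recomputed from the remaining allocs/evals by setJobStatuses.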
   912  	forceStatus := ""
   913  	if !copyAlloc.TerminalStatus() {
   914  		forceStatus = structs.JobStatusRunning
   915  	}
   916  	jobs := map[string]string{exist.JobID: forceStatus}
   917  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   918  		return fmt.Errorf("setting job status failed: %v", err)
   919  	}
   920  	return nil
   921  }
   922  
   923  // UpsertAllocs is used to evict a set of allocations
   924  // and allocate new ones at the same time.
   925  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
   926  	txn := s.db.Txn(true)
   927  	defer txn.Abort()
   928  
   929  	watcher := watch.NewItems()
   930  	watcher.Add(watch.Item{Table: "allocs"})
   931  
   932  	// Handle the allocations
   933  	jobs := make(map[string]string, 1)
   934  	for _, alloc := range allocs {
   935  		existing, err := txn.First("allocs", "id", alloc.ID)
   936  		if err != nil {
   937  			return fmt.Errorf("alloc lookup failed: %v", err)
   938  		}
   939  		exist, _ := existing.(*structs.Allocation)
   940  
   941  		if exist == nil {
   942  			alloc.CreateIndex = index
   943  			alloc.ModifyIndex = index
   944  			alloc.AllocModifyIndex = index
   945  		} else {
   946  			alloc.CreateIndex = exist.CreateIndex
   947  			alloc.ModifyIndex = index
   948  			alloc.AllocModifyIndex = index
   949  
   950  			// If the scheduler is marking this allocation as lost we do not
   951  			// want to reuse the status of the existing allocation.
   952  			if alloc.ClientStatus != structs.AllocClientStatusLost {
   953  				alloc.ClientStatus = exist.ClientStatus
   954  				alloc.ClientDescription = exist.ClientDescription
   955  			}
   956  
   957  			// The job has been denormalized so re-attach the original job
   958  			if alloc.Job == nil {
   959  				alloc.Job = exist.Job
   960  			}
   961  		}
   962  
   963  		if err := s.updateSummaryWithAlloc(index, alloc, exist, watcher, txn); err != nil {
   964  			return fmt.Errorf("error updating job summary: %v", err)
   965  		}
   966  
   967  		// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
   968  		// COMPAT 0.4.1 -> 0.5
   969  		if alloc.Job != nil {
   970  			s.addEphemeralDiskToTaskGroups(alloc.Job)
   971  		}
   972  
   973  		if err := txn.Insert("allocs", alloc); err != nil {
   974  			return fmt.Errorf("alloc insert failed: %v", err)
   975  		}
   976  
   977  		// If the allocation is running, force the job to running status.
   978  		forceStatus := ""
   979  		if !alloc.TerminalStatus() {
   980  			forceStatus = structs.JobStatusRunning
   981  		}
   982  		jobs[alloc.JobID] = forceStatus
   983  
   984  		watcher.Add(watch.Item{Alloc: alloc.ID})
   985  		watcher.Add(watch.Item{AllocEval: alloc.EvalID})
   986  		watcher.Add(watch.Item{AllocJob: alloc.JobID})
   987  		watcher.Add(watch.Item{AllocNode: alloc.NodeID})
   988  	}
   989  
   990  	// Update the indexes
   991  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   992  		return fmt.Errorf("index update failed: %v", err)
   993  	}
   994  
   995  	// Set the job's status
   996  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   997  		return fmt.Errorf("setting job status failed: %v", err)
   998  	}
   999  
  1000  	txn.Defer(func() { s.watch.notify(watcher) })
  1001  	txn.Commit()
  1002  	return nil
  1003  }
  1004  
  1005  // AllocByID is used to lookup an allocation by its ID
  1006  func (s *StateStore) AllocByID(id string) (*structs.Allocation, error) {
  1007  	txn := s.db.Txn(false)
  1008  
  1009  	existing, err := txn.First("allocs", "id", id)
  1010  	if err != nil {
  1011  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  1012  	}
  1013  
  1014  	if existing != nil {
  1015  		return existing.(*structs.Allocation), nil
  1016  	}
  1017  	return nil, nil
  1018  }
  1019  
  1020  // AllocsByIDPrefix is used to lookup allocs by prefix
  1021  func (s *StateStore) AllocsByIDPrefix(id string) (memdb.ResultIterator, error) {
  1022  	txn := s.db.Txn(false)
  1023  
  1024  	iter, err := txn.Get("allocs", "id_prefix", id)
  1025  	if err != nil {
  1026  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  1027  	}
  1028  
  1029  	return iter, nil
  1030  }
  1031  
  1032  // AllocsByNode returns all the allocations by node
  1033  func (s *StateStore) AllocsByNode(node string) ([]*structs.Allocation, error) {
  1034  	txn := s.db.Txn(false)
  1035  
  1036  	// Get an iterator over the node allocations, using only the
  1037  	// node prefix which ignores the terminal status
  1038  	iter, err := txn.Get("allocs", "node_prefix", node)
  1039  	if err != nil {
  1040  		return nil, err
  1041  	}
  1042  
  1043  	var out []*structs.Allocation
  1044  	for {
  1045  		raw := iter.Next()
  1046  		if raw == nil {
  1047  			break
  1048  		}
  1049  		out = append(out, raw.(*structs.Allocation))
  1050  	}
  1051  	return out, nil
  1052  }
  1053  
   1054  // AllocsByNodeTerminal returns all the allocations by node and terminal status
  1055  func (s *StateStore) AllocsByNodeTerminal(node string, terminal bool) ([]*structs.Allocation, error) {
  1056  	txn := s.db.Txn(false)
  1057  
  1058  	// Get an iterator over the node allocations
  1059  	iter, err := txn.Get("allocs", "node", node, terminal)
  1060  	if err != nil {
  1061  		return nil, err
  1062  	}
  1063  
  1064  	var out []*structs.Allocation
  1065  	for {
  1066  		raw := iter.Next()
  1067  		if raw == nil {
  1068  			break
  1069  		}
  1070  		out = append(out, raw.(*structs.Allocation))
  1071  	}
  1072  	return out, nil
  1073  }
  1074  
  1075  // AllocsByJob returns all the allocations by job id
  1076  func (s *StateStore) AllocsByJob(jobID string) ([]*structs.Allocation, error) {
  1077  	txn := s.db.Txn(false)
  1078  
  1079  	// Get an iterator over the node allocations
  1080  	iter, err := txn.Get("allocs", "job", jobID)
  1081  	if err != nil {
  1082  		return nil, err
  1083  	}
  1084  
  1085  	var out []*structs.Allocation
  1086  	for {
  1087  		raw := iter.Next()
  1088  		if raw == nil {
  1089  			break
  1090  		}
  1091  		out = append(out, raw.(*structs.Allocation))
  1092  	}
  1093  	return out, nil
  1094  }
  1095  
  1096  // AllocsByEval returns all the allocations by eval id
  1097  func (s *StateStore) AllocsByEval(evalID string) ([]*structs.Allocation, error) {
  1098  	txn := s.db.Txn(false)
  1099  
  1100  	// Get an iterator over the eval allocations
  1101  	iter, err := txn.Get("allocs", "eval", evalID)
  1102  	if err != nil {
  1103  		return nil, err
  1104  	}
  1105  
  1106  	var out []*structs.Allocation
  1107  	for {
  1108  		raw := iter.Next()
  1109  		if raw == nil {
  1110  			break
  1111  		}
  1112  		out = append(out, raw.(*structs.Allocation))
  1113  	}
  1114  	return out, nil
  1115  }
  1116  
   1117  // Allocs returns an iterator over all the allocations
  1118  func (s *StateStore) Allocs() (memdb.ResultIterator, error) {
  1119  	txn := s.db.Txn(false)
  1120  
  1121  	// Walk the entire table
  1122  	iter, err := txn.Get("allocs", "id")
  1123  	if err != nil {
  1124  		return nil, err
  1125  	}
  1126  	return iter, nil
  1127  }
  1128  
   1129  // UpsertVaultAccessor is used to register a set of Vault accessors
  1130  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  1131  	txn := s.db.Txn(true)
  1132  	defer txn.Abort()
  1133  
  1134  	for _, accessor := range accessors {
  1135  		// Set the create index
  1136  		accessor.CreateIndex = index
  1137  
  1138  		// Insert the accessor
  1139  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  1140  			return fmt.Errorf("accessor insert failed: %v", err)
  1141  		}
  1142  	}
  1143  
  1144  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  1145  		return fmt.Errorf("index update failed: %v", err)
  1146  	}
  1147  
  1148  	txn.Commit()
  1149  	return nil
  1150  }
  1151  
  1152  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  1153  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  1154  	txn := s.db.Txn(true)
  1155  	defer txn.Abort()
  1156  
   1157  	// Walk the accessors and delete each one
  1158  	for _, accessor := range accessors {
  1159  		// Delete the accessor
  1160  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  1161  			return fmt.Errorf("accessor delete failed: %v", err)
  1162  		}
  1163  	}
  1164  
  1165  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  1166  		return fmt.Errorf("index update failed: %v", err)
  1167  	}
  1168  
  1169  	txn.Commit()
  1170  	return nil
  1171  }
  1172  
  1173  // VaultAccessor returns the given Vault accessor
  1174  func (s *StateStore) VaultAccessor(accessor string) (*structs.VaultAccessor, error) {
  1175  	txn := s.db.Txn(false)
  1176  
  1177  	existing, err := txn.First("vault_accessors", "id", accessor)
  1178  	if err != nil {
  1179  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  1180  	}
  1181  
  1182  	if existing != nil {
  1183  		return existing.(*structs.VaultAccessor), nil
  1184  	}
  1185  
  1186  	return nil, nil
  1187  }
  1188  
  1189  // VaultAccessors returns an iterator of Vault accessors.
  1190  func (s *StateStore) VaultAccessors() (memdb.ResultIterator, error) {
  1191  	txn := s.db.Txn(false)
  1192  
  1193  	iter, err := txn.Get("vault_accessors", "id")
  1194  	if err != nil {
  1195  		return nil, err
  1196  	}
  1197  	return iter, nil
  1198  }
  1199  
  1200  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  1201  func (s *StateStore) VaultAccessorsByAlloc(allocID string) ([]*structs.VaultAccessor, error) {
  1202  	txn := s.db.Txn(false)
  1203  
  1204  	// Get an iterator over the accessors
  1205  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  1206  	if err != nil {
  1207  		return nil, err
  1208  	}
  1209  
  1210  	var out []*structs.VaultAccessor
  1211  	for {
  1212  		raw := iter.Next()
  1213  		if raw == nil {
  1214  			break
  1215  		}
  1216  		out = append(out, raw.(*structs.VaultAccessor))
  1217  	}
  1218  	return out, nil
  1219  }
  1220  
  1221  // VaultAccessorsByNode returns all the Vault accessors by node id
  1222  func (s *StateStore) VaultAccessorsByNode(nodeID string) ([]*structs.VaultAccessor, error) {
  1223  	txn := s.db.Txn(false)
  1224  
  1225  	// Get an iterator over the accessors
  1226  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  1227  	if err != nil {
  1228  		return nil, err
  1229  	}
  1230  
  1231  	var out []*structs.VaultAccessor
  1232  	for {
  1233  		raw := iter.Next()
  1234  		if raw == nil {
  1235  			break
  1236  		}
  1237  		out = append(out, raw.(*structs.VaultAccessor))
  1238  	}
  1239  	return out, nil
  1240  }
  1241  
   1242  // LatestIndex returns the greatest index value for all indexes
  1243  func (s *StateStore) LatestIndex() (uint64, error) {
  1244  	indexes, err := s.Indexes()
  1245  	if err != nil {
  1246  		return 0, err
  1247  	}
  1248  
  1249  	var max uint64 = 0
  1250  	for {
  1251  		raw := indexes.Next()
  1252  		if raw == nil {
  1253  			break
  1254  		}
  1255  
   1256  		// Cast to an IndexEntry
  1257  		idx := raw.(*IndexEntry)
  1258  
  1259  		// Determine the max
  1260  		if idx.Value > max {
  1261  			max = idx.Value
  1262  		}
  1263  	}
  1264  
  1265  	return max, nil
  1266  }
  1267  
  1268  // Index finds the matching index value
  1269  func (s *StateStore) Index(name string) (uint64, error) {
  1270  	txn := s.db.Txn(false)
  1271  
  1272  	// Lookup the first matching index
  1273  	out, err := txn.First("index", "id", name)
  1274  	if err != nil {
  1275  		return 0, err
  1276  	}
  1277  	if out == nil {
  1278  		return 0, nil
  1279  	}
  1280  	return out.(*IndexEntry).Value, nil
  1281  }
  1282  
  1283  // RemoveIndex is a helper method to remove an index for testing purposes
  1284  func (s *StateStore) RemoveIndex(name string) error {
  1285  	txn := s.db.Txn(true)
  1286  	defer txn.Abort()
  1287  
  1288  	if _, err := txn.DeleteAll("index", "id", name); err != nil {
  1289  		return err
  1290  	}
  1291  
  1292  	txn.Commit()
  1293  	return nil
  1294  }
  1295  
  1296  // Indexes returns an iterator over all the indexes
  1297  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  1298  	txn := s.db.Txn(false)
  1299  
   1300  	// Walk the entire index table
  1301  	iter, err := txn.Get("index", "id")
  1302  	if err != nil {
  1303  		return nil, err
  1304  	}
  1305  	return iter, nil
  1306  }
  1307  
  1308  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  1309  // store
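         // by walking each job's allocations and recounting them per client status.
         // Allocations whose Job.CreateIndex differs from the current job's create
         // index are skipped, so the summary reflects only the registered version.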
  1310  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  1311  	txn := s.db.Txn(true)
  1312  	defer txn.Abort()
  1313  
  1314  	// Get all the jobs
  1315  	iter, err := txn.Get("jobs", "id")
  1316  	if err != nil {
  1317  		return err
  1318  	}
  1319  	for {
  1320  		rawJob := iter.Next()
  1321  		if rawJob == nil {
  1322  			break
  1323  		}
  1324  		job := rawJob.(*structs.Job)
  1325  
  1326  		// Create a job summary for the job
  1327  		summary := structs.JobSummary{
  1328  			JobID:   job.ID,
  1329  			Summary: make(map[string]structs.TaskGroupSummary),
  1330  		}
  1331  		for _, tg := range job.TaskGroups {
  1332  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  1333  		}
  1334  
  1335  		// Find all the allocations for the jobs
  1336  		iterAllocs, err := txn.Get("allocs", "job", job.ID)
  1337  		if err != nil {
  1338  			return err
  1339  		}
  1340  
  1341  		// Calculate the summary for the job
  1342  		for {
  1343  			rawAlloc := iterAllocs.Next()
  1344  			if rawAlloc == nil {
  1345  				break
  1346  			}
  1347  			alloc := rawAlloc.(*structs.Allocation)
  1348  
  1349  			// Ignore the allocation if it doesn't belong to the currently
  1350  			// registered job
  1351  			if alloc.Job.CreateIndex != job.CreateIndex {
  1352  				continue
  1353  			}
  1354  
  1355  			tg := summary.Summary[alloc.TaskGroup]
  1356  			switch alloc.ClientStatus {
  1357  			case structs.AllocClientStatusFailed:
  1358  				tg.Failed += 1
  1359  			case structs.AllocClientStatusLost:
  1360  				tg.Lost += 1
  1361  			case structs.AllocClientStatusComplete:
  1362  				tg.Complete += 1
  1363  			case structs.AllocClientStatusRunning:
  1364  				tg.Running += 1
  1365  			case structs.AllocClientStatusPending:
  1366  				tg.Starting += 1
  1367  			default:
  1368  				s.logger.Printf("[ERR] state_store: invalid client status: %v in allocation %q", alloc.ClientStatus, alloc.ID)
  1369  			}
  1370  			summary.Summary[alloc.TaskGroup] = tg
  1371  		}
  1372  
   1373  		// Set the summary's create index to the job's create index
  1374  		// and the modify index to the current index
  1375  		summary.CreateIndex = job.CreateIndex
  1376  		summary.ModifyIndex = index
  1377  
  1378  		// Insert the job summary
  1379  		if err := txn.Insert("job_summary", summary); err != nil {
  1380  			return fmt.Errorf("error inserting job summary: %v", err)
  1381  		}
  1382  	}
  1383  
  1384  	// Update the indexes table for job summary
  1385  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1386  		return fmt.Errorf("index update failed: %v", err)
  1387  	}
  1388  	txn.Commit()
  1389  	return nil
  1390  }
  1391  
  1392  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
   1393  // It takes a map of job IDs to an optional forceStatus string. Jobs that no
   1394  // longer exist are skipped; an error is returned only if setJobStatus fails.
  1395  func (s *StateStore) setJobStatuses(index uint64, watcher watch.Items, txn *memdb.Txn,
  1396  	jobs map[string]string, evalDelete bool) error {
  1397  	for job, forceStatus := range jobs {
  1398  		existing, err := txn.First("jobs", "id", job)
  1399  		if err != nil {
  1400  			return fmt.Errorf("job lookup failed: %v", err)
  1401  		}
  1402  
  1403  		if existing == nil {
  1404  			continue
  1405  		}
  1406  
  1407  		if err := s.setJobStatus(index, watcher, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  1408  			return err
  1409  		}
  1410  	}
  1411  
  1412  	return nil
  1413  }
  1414  
  1415  // setJobStatus sets the status of the job by looking up associated evaluations
  1416  // and allocations. evalDelete should be set to true if setJobStatus is being
  1417  // called because an evaluation is being deleted (potentially because of garbage
  1418  // collection). If forceStatus is non-empty, the job's status will be set to the
  1419  // passed status.
  1420  func (s *StateStore) setJobStatus(index uint64, watcher watch.Items, txn *memdb.Txn,
  1421  	job *structs.Job, evalDelete bool, forceStatus string) error {
  1422  
  1423  	// Capture the current status so we can check if there is a change
  1424  	oldStatus := job.Status
  1425  	newStatus := forceStatus
  1426  
   1427  	// If forceStatus is not set, compute the job's status.
  1428  	if forceStatus == "" {
  1429  		var err error
  1430  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  1431  		if err != nil {
  1432  			return err
  1433  		}
  1434  	}
  1435  
  1436  	// Fast-path if nothing has changed.
  1437  	if oldStatus == newStatus {
  1438  		return nil
  1439  	}
  1440  
  1441  	// The job has changed, so add to watcher.
  1442  	watcher.Add(watch.Item{Table: "jobs"})
  1443  	watcher.Add(watch.Item{Job: job.ID})
  1444  
  1445  	// Copy and update the existing job
  1446  	updated := job.Copy()
  1447  	updated.Status = newStatus
  1448  	updated.ModifyIndex = index
  1449  
  1450  	// Insert the job
  1451  	if err := txn.Insert("jobs", updated); err != nil {
  1452  		return fmt.Errorf("job insert failed: %v", err)
  1453  	}
  1454  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1455  		return fmt.Errorf("index update failed: %v", err)
  1456  	}
  1457  	return nil
  1458  }
  1459  
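         // getJobStatus derives a job's status from its allocations and evaluations:
         // any non-terminal allocation means running; otherwise any non-terminal
         // evaluation means pending; if everything is terminal (or the evals were
         // garbage collected) the job is dead; a brand-new job with no allocs or
         // evals is pending, or running if it is periodic.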
  1460  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  1461  	allocs, err := txn.Get("allocs", "job", job.ID)
  1462  	if err != nil {
  1463  		return "", err
  1464  	}
  1465  
  1466  	// If there is a non-terminal allocation, the job is running.
  1467  	hasAlloc := false
  1468  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  1469  		hasAlloc = true
  1470  		if !alloc.(*structs.Allocation).TerminalStatus() {
  1471  			return structs.JobStatusRunning, nil
  1472  		}
  1473  	}
  1474  
  1475  	evals, err := txn.Get("evals", "job", job.ID)
  1476  	if err != nil {
  1477  		return "", err
  1478  	}
  1479  
  1480  	hasEval := false
  1481  	for eval := evals.Next(); eval != nil; eval = evals.Next() {
  1482  		hasEval = true
  1483  		if !eval.(*structs.Evaluation).TerminalStatus() {
  1484  			return structs.JobStatusPending, nil
  1485  		}
  1486  	}
  1487  
  1488  	// The job is dead if all the allocations and evals are terminal or if there
  1489  	// are no evals because of garbage collection.
  1490  	if evalDelete || hasEval || hasAlloc {
  1491  		return structs.JobStatusDead, nil
  1492  	}
  1493  
  1494  	// If there are no allocations or evaluations it is a new job. If the job is
  1495  	// periodic, we mark it as running as it will never have an
  1496  	// allocation/evaluation against it.
  1497  	if job.IsPeriodic() {
  1498  		return structs.JobStatusRunning, nil
  1499  	}
  1500  	return structs.JobStatusPending, nil
  1501  }
  1502  
  1503  // updateSummaryWithJob creates or updates job summaries when new jobs are
  1504  // upserted or existing ones are updated
  1505  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  1506  	watcher watch.Items, txn *memdb.Txn) error {
  1507  
  1508  	existing, err := s.JobSummaryByID(job.ID)
  1509  	if err != nil {
  1510  		return fmt.Errorf("unable to retrieve summary for job: %v", err)
  1511  	}
  1512  	var hasSummaryChanged bool
  1513  	if existing == nil {
  1514  		existing = &structs.JobSummary{
  1515  			JobID:       job.ID,
  1516  			Summary:     make(map[string]structs.TaskGroupSummary),
  1517  			CreateIndex: index,
  1518  		}
  1519  		hasSummaryChanged = true
  1520  	}
  1521  	for _, tg := range job.TaskGroups {
  1522  		if _, ok := existing.Summary[tg.Name]; !ok {
  1523  			newSummary := structs.TaskGroupSummary{
  1524  				Complete: 0,
  1525  				Failed:   0,
  1526  				Running:  0,
  1527  				Starting: 0,
  1528  			}
  1529  			existing.Summary[tg.Name] = newSummary
  1530  			hasSummaryChanged = true
  1531  		}
  1532  	}
  1533  
  1534  	// The job summary has changed, so add to watcher and update the modify
  1535  	// index.
  1536  	if hasSummaryChanged {
  1537  		existing.ModifyIndex = index
  1538  		watcher.Add(watch.Item{Table: "job_summary"})
  1539  		watcher.Add(watch.Item{JobSummary: job.ID})
  1540  
  1541  		// Update the indexes table for job summary
  1542  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1543  			return fmt.Errorf("index update failed: %v", err)
  1544  		}
  1545  		if err := txn.Insert("job_summary", *existing); err != nil {
  1546  			return err
  1547  		}
  1548  	}
  1549  
  1550  	return nil
  1551  }
  1552  
  1553  // updateSummaryWithAlloc updates the job summary when allocations are updated
  1554  // or inserted
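         // New allocations bump the Starting count (and drain Queued); client-status
         // transitions move the allocation between the per-task-group counters.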
  1555  func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
  1556  	existingAlloc *structs.Allocation, watcher watch.Items, txn *memdb.Txn) error {
  1557  
  1558  	// We don't have to update the summary if the job is missing
  1559  	if alloc.Job == nil {
  1560  		return nil
  1561  	}
  1562  
  1563  	summaryRaw, err := txn.First("job_summary", "id", alloc.JobID)
  1564  	if err != nil {
   1565  		return fmt.Errorf("unable to lookup job summary for job id %q: %v", alloc.JobID, err)
  1566  	}
  1567  	if summaryRaw == nil {
  1568  		// Check if the job is de-registered
  1569  		rawJob, err := txn.First("jobs", "id", alloc.JobID)
  1570  		if err != nil {
  1571  			return fmt.Errorf("unable to query job: %v", err)
  1572  		}
  1573  
   1574  	// If the job is de-registered then we skip updating its summary
  1575  		if rawJob == nil {
  1576  			return nil
  1577  		}
  1578  		return fmt.Errorf("job summary for job %q is not present", alloc.JobID)
  1579  	}
  1580  	summary := summaryRaw.(structs.JobSummary)
  1581  	jobSummary := summary.Copy()
  1582  
  1583  	// Not updating the job summary because the allocation doesn't belong to the
  1584  	// currently registered job
  1585  	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
  1586  		return nil
  1587  	}
  1588  
  1589  	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
  1590  	if !ok {
  1591  		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
  1592  	}
  1593  	var summaryChanged bool
  1594  	if existingAlloc == nil {
  1595  		switch alloc.DesiredStatus {
  1596  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
  1597  			s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v",
  1598  				alloc.ID, alloc.DesiredStatus)
  1599  		}
  1600  		switch alloc.ClientStatus {
  1601  		case structs.AllocClientStatusPending:
  1602  			tgSummary.Starting += 1
  1603  			if tgSummary.Queued > 0 {
  1604  				tgSummary.Queued -= 1
  1605  			}
  1606  			summaryChanged = true
  1607  		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
  1608  			structs.AllocClientStatusComplete:
  1609  			s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v",
  1610  				alloc.ID, alloc.ClientStatus)
  1611  		}
  1612  	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
   1613  		// Incrementing the count of the bin of the current state
  1614  		switch alloc.ClientStatus {
  1615  		case structs.AllocClientStatusRunning:
  1616  			tgSummary.Running += 1
  1617  		case structs.AllocClientStatusFailed:
  1618  			tgSummary.Failed += 1
  1619  		case structs.AllocClientStatusPending:
  1620  			tgSummary.Starting += 1
  1621  		case structs.AllocClientStatusComplete:
  1622  			tgSummary.Complete += 1
  1623  		case structs.AllocClientStatusLost:
  1624  			tgSummary.Lost += 1
  1625  		}
  1626  
  1627  		// Decrementing the count of the bin of the last state
  1628  		switch existingAlloc.ClientStatus {
  1629  		case structs.AllocClientStatusRunning:
  1630  			tgSummary.Running -= 1
  1631  		case structs.AllocClientStatusPending:
  1632  			tgSummary.Starting -= 1
  1633  		case structs.AllocClientStatusLost:
  1634  			tgSummary.Lost -= 1
  1635  		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
  1636  		default:
  1637  			s.logger.Printf("[ERR] state_store: invalid old state of allocation with id: %v, and state: %v",
  1638  				existingAlloc.ID, existingAlloc.ClientStatus)
  1639  		}
  1640  		summaryChanged = true
  1641  	}
  1642  	jobSummary.Summary[alloc.TaskGroup] = tgSummary
  1643  
  1644  	if summaryChanged {
  1645  		jobSummary.ModifyIndex = index
  1646  		watcher.Add(watch.Item{Table: "job_summary"})
  1647  		watcher.Add(watch.Item{JobSummary: alloc.JobID})
  1648  
  1649  		// Update the indexes table for job summary
  1650  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1651  			return fmt.Errorf("index update failed: %v", err)
  1652  		}
  1653  
  1654  		if err := txn.Insert("job_summary", *jobSummary); err != nil {
  1655  			return fmt.Errorf("updating job summary failed: %v", err)
  1656  		}
  1657  	}
  1658  
  1659  	return nil
  1660  }
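
        // Illustrative sketch, not part of the original file: how the bin
        // counting above plays out for a client status transition. If an
        // existing allocation moves from pending to running, the Starting bin
        // of its task group summary is decremented and the Running bin
        // incremented (field names as on structs.TaskGroupSummary):
        //
        //	before := structs.TaskGroupSummary{Starting: 1, Running: 2}
        //	// existingAlloc.ClientStatus == structs.AllocClientStatusPending
        //	// alloc.ClientStatus == structs.AllocClientStatusRunning
        //	after := structs.TaskGroupSummary{Starting: 0, Running: 3}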
  1661  
  1662  // addEphemeralDiskToTaskGroups adds missing EphemeralDisk objects to TaskGroups
  1663  func (s *StateStore) addEphemeralDiskToTaskGroups(job *structs.Job) {
  1664  	for _, tg := range job.TaskGroups {
  1665  		var diskMB int
  1666  		for _, task := range tg.Tasks {
  1667  			if task.Resources != nil {
  1668  				diskMB += task.Resources.DiskMB
  1669  				task.Resources.DiskMB = 0
  1670  			}
  1671  		}
  1672  		if tg.EphemeralDisk != nil {
  1673  			continue
  1674  		}
  1675  		tg.EphemeralDisk = &structs.EphemeralDisk{
  1676  			SizeMB: diskMB,
  1677  		}
  1678  	}
  1679  }
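
        // Illustrative sketch, not part of the original file: the COMPAT
        // conversion performed above. A task group with no EphemeralDisk whose
        // tasks request 100 MB and 200 MB of task-level disk ends up with the
        // task DiskMB fields zeroed and a group-level disk of their sum:
        //
        //	tg.EphemeralDisk = &structs.EphemeralDisk{SizeMB: 300}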
  1680  
  1681  // StateSnapshot is used to provide a point-in-time snapshot
  1682  type StateSnapshot struct {
  1683  	StateStore
  1684  }
  1685  
  1686  // StateRestore is used to optimize the performance when
  1687  // restoring state by only using a single large transaction
  1688  // instead of thousands of sub-transactions
  1689  type StateRestore struct {
  1690  	txn   *memdb.Txn
  1691  	watch *stateWatch
  1692  	items watch.Items
  1693  }
  1694  
  1695  // Abort is used to abort the restore operation
  1696  func (s *StateRestore) Abort() {
  1697  	s.txn.Abort()
  1698  }
  1699  
  1700  // Commit is used to commit the restore operation
  1701  func (s *StateRestore) Commit() {
  1702  	s.txn.Defer(func() { s.watch.notify(s.items) })
  1703  	s.txn.Commit()
  1704  }
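
        // Illustrative sketch, not part of the original file: a typical
        // restore cycle as a snapshot consumer might drive it. The names
        // stateStore and decodedJobs are placeholders. Every object lands in
        // the single write transaction held by StateRestore, and watchers are
        // notified only once, when Commit is called:
        //
        //	restore, err := stateStore.Restore()
        //	if err != nil {
        //		return err
        //	}
        //	for _, job := range decodedJobs {
        //		if err := restore.JobRestore(job); err != nil {
        //			restore.Abort()
        //			return err
        //		}
        //	}
        //	restore.Commit()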
  1705  
  1706  // NodeRestore is used to restore a node
  1707  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  1708  	r.items.Add(watch.Item{Table: "nodes"})
  1709  	r.items.Add(watch.Item{Node: node.ID})
  1710  	if err := r.txn.Insert("nodes", node); err != nil {
  1711  		return fmt.Errorf("node insert failed: %v", err)
  1712  	}
  1713  	return nil
  1714  }
  1715  
  1716  // JobRestore is used to restore a job
  1717  func (r *StateRestore) JobRestore(job *structs.Job) error {
  1718  	r.items.Add(watch.Item{Table: "jobs"})
  1719  	r.items.Add(watch.Item{Job: job.ID})
  1720  
  1721  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
  1722  	// COMPAT 0.4.1 -> 0.5
  1723  	r.addEphemeralDiskToTaskGroups(job)
  1724  
  1725  	if err := r.txn.Insert("jobs", job); err != nil {
  1726  		return fmt.Errorf("job insert failed: %v", err)
  1727  	}
  1728  	return nil
  1729  }
  1730  
  1731  // EvalRestore is used to restore an evaluation
  1732  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  1733  	r.items.Add(watch.Item{Table: "evals"})
  1734  	r.items.Add(watch.Item{Eval: eval.ID})
  1735  	r.items.Add(watch.Item{EvalJob: eval.JobID})
  1736  	if err := r.txn.Insert("evals", eval); err != nil {
  1737  		return fmt.Errorf("eval insert failed: %v", err)
  1738  	}
  1739  	return nil
  1740  }
  1741  
  1742  // AllocRestore is used to restore an allocation
  1743  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  1744  	r.items.Add(watch.Item{Table: "allocs"})
  1745  	r.items.Add(watch.Item{Alloc: alloc.ID})
  1746  	r.items.Add(watch.Item{AllocEval: alloc.EvalID})
  1747  	r.items.Add(watch.Item{AllocJob: alloc.JobID})
  1748  	r.items.Add(watch.Item{AllocNode: alloc.NodeID})
  1749  
  1750  	// Set the shared resources if they are not present
  1751  	// COMPAT 0.4.1 -> 0.5
  1752  	if alloc.SharedResources == nil {
  1753  		alloc.SharedResources = &structs.Resources{
  1754  			DiskMB: alloc.Resources.DiskMB,
  1755  		}
  1756  	}
  1757  
  1758  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
  1759  	if alloc.Job != nil {
  1760  		r.addEphemeralDiskToTaskGroups(alloc.Job)
  1761  	}
  1762  
  1763  	if err := r.txn.Insert("allocs", alloc); err != nil {
  1764  		return fmt.Errorf("alloc insert failed: %v", err)
  1765  	}
  1766  	return nil
  1767  }
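
        // Illustrative sketch, not part of the original file: the COMPAT
        // back-fill above. An allocation restored from a pre-0.5 snapshot with
        // no shared resources has its disk carried over from the task-level
        // total, for example:
        //
        //	// before restore: alloc.SharedResources == nil, alloc.Resources.DiskMB == 150
        //	// after AllocRestore: alloc.SharedResources.DiskMB == 150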
  1768  
  1769  // IndexRestore is used to restore an index
  1770  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  1771  	if err := r.txn.Insert("index", idx); err != nil {
  1772  		return fmt.Errorf("index insert failed: %v", err)
  1773  	}
  1774  	return nil
  1775  }
  1776  
  1777  // PeriodicLaunchRestore is used to restore a periodic launch.
  1778  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  1779  	r.items.Add(watch.Item{Table: "periodic_launch"})
  1780  	r.items.Add(watch.Item{Job: launch.ID})
  1781  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  1782  		return fmt.Errorf("periodic launch insert failed: %v", err)
  1783  	}
  1784  	return nil
  1785  }
  1786  
  1787  // JobSummaryRestore is used to restore a job summary
  1788  func (r *StateRestore) JobSummaryRestore(jobSummary *structs.JobSummary) error {
  1789  	if err := r.txn.Insert("job_summary", *jobSummary); err != nil {
  1790  		return fmt.Errorf("job summary insert failed: %v", err)
  1791  	}
  1792  	return nil
  1793  }
  1794  
  1795  // VaultAccessorRestore is used to restore a vault accessor
  1796  func (r *StateRestore) VaultAccessorRestore(accessor *structs.VaultAccessor) error {
  1797  	if err := r.txn.Insert("vault_accessors", accessor); err != nil {
  1798  		return fmt.Errorf("vault accessor insert failed: %v", err)
  1799  	}
  1800  	return nil
  1801  }
  1802  
  1803  // addEphemeralDiskToTaskGroups adds missing EphemeralDisk objects to TaskGroups
  1804  func (r *StateRestore) addEphemeralDiskToTaskGroups(job *structs.Job) {
  1805  	for _, tg := range job.TaskGroups {
  1806  		if tg.EphemeralDisk != nil {
  1807  			continue
  1808  		}
  1809  		var sizeMB int
  1810  		for _, task := range tg.Tasks {
  1811  			if task.Resources != nil {
  1812  				sizeMB += task.Resources.DiskMB
  1813  				task.Resources.DiskMB = 0
  1814  			}
  1815  		}
  1816  		tg.EphemeralDisk = &structs.EphemeralDisk{
  1817  			SizeMB: sizeMB,
  1818  		}
  1819  	}
  1820  }
  1821  
  1822  // stateWatch holds shared state for watching updates. This is
  1823  // outside of StateStore so it can be shared with snapshots.
  1824  type stateWatch struct {
  1825  	items map[watch.Item]*NotifyGroup
  1826  	l     sync.Mutex
  1827  }
  1828  
  1829  // newStateWatch creates a new stateWatch for change notification.
  1830  func newStateWatch() *stateWatch {
  1831  	return &stateWatch{
  1832  		items: make(map[watch.Item]*NotifyGroup),
  1833  	}
  1834  }
  1835  
  1836  // watch subscribes a channel to the given watch items.
  1837  func (w *stateWatch) watch(items watch.Items, ch chan struct{}) {
  1838  	w.l.Lock()
  1839  	defer w.l.Unlock()
  1840  
  1841  	for item := range items {
  1842  		grp, ok := w.items[item]
  1843  		if !ok {
  1844  			grp = new(NotifyGroup)
  1845  			w.items[item] = grp
  1846  		}
  1847  		grp.Wait(ch)
  1848  	}
  1849  }
  1850  
  1851  // stopWatch unsubscribes a channel from the given watch items.
  1852  func (w *stateWatch) stopWatch(items watch.Items, ch chan struct{}) {
  1853  	w.l.Lock()
  1854  	defer w.l.Unlock()
  1855  
  1856  	for item := range items {
  1857  		if grp, ok := w.items[item]; ok {
  1858  			grp.Clear(ch)
  1859  			if grp.Empty() {
  1860  				delete(w.items, item)
  1861  			}
  1862  		}
  1863  	}
  1864  }
  1865  
  1866  // notify is used to fire notifications on the given watch items.
  1867  func (w *stateWatch) notify(items watch.Items) {
  1868  	w.l.Lock()
  1869  	defer w.l.Unlock()
  1870  
  1871  	for wi := range items {
  1872  		if grp, ok := w.items[wi]; ok {
  1873  			grp.Notify()
  1874  		}
  1875  	}
  1876  }
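
        // Illustrative sketch, not part of the original file: the
        // subscribe/notify cycle implemented by stateWatch. A consumer
        // registers a buffered channel for the items it cares about and is
        // signalled whenever notify fires for any of them:
        //
        //	w := newStateWatch()
        //	ch := make(chan struct{}, 1)
        //	items := watch.NewItems()
        //	items.Add(watch.Item{Table: "job_summary"})
        //	w.watch(items, ch)
        //	// ... after a write touching job_summary commits:
        //	w.notify(items) // ch receives a signal
        //	w.stopWatch(items, ch)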