github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/nomad/state/state_store.go (about)

     1  package state
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/nomad/nomad/watch"
    12  )
    13  
// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
type IndexEntry struct {
	// Key is the name of the table the entry tracks (e.g. "nodes", "jobs").
	Key string
	// Value is the index most recently recorded for that table.
	Value uint64
}
    20  
// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	// logger receives the messages emitted by the state store.
	logger *log.Logger
	// db is the MemDB instance that all reads and writes go through.
	db *memdb.MemDB
	// watch tracks watch subscriptions and is notified by write operations.
	watch *stateWatch
}
    33  
    34  // NewStateStore is used to create a new state store
    35  func NewStateStore(logOutput io.Writer) (*StateStore, error) {
    36  	// Create the MemDB
    37  	db, err := memdb.NewMemDB(stateStoreSchema())
    38  	if err != nil {
    39  		return nil, fmt.Errorf("state store setup failed: %v", err)
    40  	}
    41  
    42  	// Create the state store
    43  	s := &StateStore{
    44  		logger: log.New(logOutput, "", log.LstdFlags),
    45  		db:     db,
    46  		watch:  newStateWatch(),
    47  	}
    48  	return s, nil
    49  }
    50  
    51  // Snapshot is used to create a point in time snapshot. Because
    52  // we use MemDB, we just need to snapshot the state of the underlying
    53  // database.
    54  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    55  	snap := &StateSnapshot{
    56  		StateStore: StateStore{
    57  			logger: s.logger,
    58  			db:     s.db.Snapshot(),
    59  			watch:  s.watch,
    60  		},
    61  	}
    62  	return snap, nil
    63  }
    64  
    65  // Restore is used to optimize the efficiency of rebuilding
    66  // state by minimizing the number of transactions and checking
    67  // overhead.
    68  func (s *StateStore) Restore() (*StateRestore, error) {
    69  	txn := s.db.Txn(true)
    70  	r := &StateRestore{
    71  		txn:   txn,
    72  		watch: s.watch,
    73  		items: watch.NewItems(),
    74  	}
    75  	return r, nil
    76  }
    77  
// Watch subscribes a channel to a set of watch items. The call is forwarded
// unchanged to the store's watch registry.
func (s *StateStore) Watch(items watch.Items, notify chan struct{}) {
	s.watch.watch(items, notify)
}
    82  
// StopWatch unsubscribes a channel from a set of watch items. The call is
// forwarded unchanged to the store's watch registry.
func (s *StateStore) StopWatch(items watch.Items, notify chan struct{}) {
	s.watch.stopWatch(items, notify)
}
    87  
    88  // UpsertJobSummary upserts a job summary into the state store.
    89  func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
    90  	txn := s.db.Txn(true)
    91  	defer txn.Abort()
    92  
    93  	// Update the index
    94  	if err := txn.Insert("job_summary", *jobSummary); err != nil {
    95  		return err
    96  	}
    97  
    98  	// Update the indexes table for job summary
    99  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   100  		return fmt.Errorf("index update failed: %v", err)
   101  	}
   102  
   103  	txn.Commit()
   104  	return nil
   105  }
   106  
   107  // DeleteJobSummary deletes the job summary with the given ID. This is for
   108  // testing purposes only.
   109  func (s *StateStore) DeleteJobSummary(index uint64, id string) error {
   110  	txn := s.db.Txn(true)
   111  	defer txn.Abort()
   112  
   113  	// Delete the job summary
   114  	if _, err := txn.DeleteAll("job_summary", "id", id); err != nil {
   115  		return fmt.Errorf("deleting job summary failed: %v", err)
   116  	}
   117  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   118  		return fmt.Errorf("index update failed: %v", err)
   119  	}
   120  	txn.Commit()
   121  	return nil
   122  }
   123  
   124  // UpsertNode is used to register a node or update a node definition
   125  // This is assumed to be triggered by the client, so we retain the value
   126  // of drain which is set by the scheduler.
   127  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
   128  	txn := s.db.Txn(true)
   129  	defer txn.Abort()
   130  
   131  	watcher := watch.NewItems()
   132  	watcher.Add(watch.Item{Table: "nodes"})
   133  	watcher.Add(watch.Item{Node: node.ID})
   134  
   135  	// Check if the node already exists
   136  	existing, err := txn.First("nodes", "id", node.ID)
   137  	if err != nil {
   138  		return fmt.Errorf("node lookup failed: %v", err)
   139  	}
   140  
   141  	// Setup the indexes correctly
   142  	if existing != nil {
   143  		exist := existing.(*structs.Node)
   144  		node.CreateIndex = exist.CreateIndex
   145  		node.ModifyIndex = index
   146  		node.Drain = exist.Drain // Retain the drain mode
   147  	} else {
   148  		node.CreateIndex = index
   149  		node.ModifyIndex = index
   150  	}
   151  
   152  	// Insert the node
   153  	if err := txn.Insert("nodes", node); err != nil {
   154  		return fmt.Errorf("node insert failed: %v", err)
   155  	}
   156  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   157  		return fmt.Errorf("index update failed: %v", err)
   158  	}
   159  
   160  	txn.Defer(func() { s.watch.notify(watcher) })
   161  	txn.Commit()
   162  	return nil
   163  }
   164  
   165  // DeleteNode is used to deregister a node
   166  func (s *StateStore) DeleteNode(index uint64, nodeID string) error {
   167  	txn := s.db.Txn(true)
   168  	defer txn.Abort()
   169  
   170  	// Lookup the node
   171  	existing, err := txn.First("nodes", "id", nodeID)
   172  	if err != nil {
   173  		return fmt.Errorf("node lookup failed: %v", err)
   174  	}
   175  	if existing == nil {
   176  		return fmt.Errorf("node not found")
   177  	}
   178  
   179  	watcher := watch.NewItems()
   180  	watcher.Add(watch.Item{Table: "nodes"})
   181  	watcher.Add(watch.Item{Node: nodeID})
   182  
   183  	// Delete the node
   184  	if err := txn.Delete("nodes", existing); err != nil {
   185  		return fmt.Errorf("node delete failed: %v", err)
   186  	}
   187  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   188  		return fmt.Errorf("index update failed: %v", err)
   189  	}
   190  
   191  	txn.Defer(func() { s.watch.notify(watcher) })
   192  	txn.Commit()
   193  	return nil
   194  }
   195  
   196  // UpdateNodeStatus is used to update the status of a node
   197  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error {
   198  	txn := s.db.Txn(true)
   199  	defer txn.Abort()
   200  
   201  	watcher := watch.NewItems()
   202  	watcher.Add(watch.Item{Table: "nodes"})
   203  	watcher.Add(watch.Item{Node: nodeID})
   204  
   205  	// Lookup the node
   206  	existing, err := txn.First("nodes", "id", nodeID)
   207  	if err != nil {
   208  		return fmt.Errorf("node lookup failed: %v", err)
   209  	}
   210  	if existing == nil {
   211  		return fmt.Errorf("node not found")
   212  	}
   213  
   214  	// Copy the existing node
   215  	existingNode := existing.(*structs.Node)
   216  	copyNode := new(structs.Node)
   217  	*copyNode = *existingNode
   218  
   219  	// Update the status in the copy
   220  	copyNode.Status = status
   221  	copyNode.ModifyIndex = index
   222  
   223  	// Insert the node
   224  	if err := txn.Insert("nodes", copyNode); err != nil {
   225  		return fmt.Errorf("node update failed: %v", err)
   226  	}
   227  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   228  		return fmt.Errorf("index update failed: %v", err)
   229  	}
   230  
   231  	txn.Defer(func() { s.watch.notify(watcher) })
   232  	txn.Commit()
   233  	return nil
   234  }
   235  
   236  // UpdateNodeDrain is used to update the drain of a node
   237  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error {
   238  	txn := s.db.Txn(true)
   239  	defer txn.Abort()
   240  
   241  	watcher := watch.NewItems()
   242  	watcher.Add(watch.Item{Table: "nodes"})
   243  	watcher.Add(watch.Item{Node: nodeID})
   244  
   245  	// Lookup the node
   246  	existing, err := txn.First("nodes", "id", nodeID)
   247  	if err != nil {
   248  		return fmt.Errorf("node lookup failed: %v", err)
   249  	}
   250  	if existing == nil {
   251  		return fmt.Errorf("node not found")
   252  	}
   253  
   254  	// Copy the existing node
   255  	existingNode := existing.(*structs.Node)
   256  	copyNode := new(structs.Node)
   257  	*copyNode = *existingNode
   258  
   259  	// Update the drain in the copy
   260  	copyNode.Drain = drain
   261  	copyNode.ModifyIndex = index
   262  
   263  	// Insert the node
   264  	if err := txn.Insert("nodes", copyNode); err != nil {
   265  		return fmt.Errorf("node update failed: %v", err)
   266  	}
   267  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   268  		return fmt.Errorf("index update failed: %v", err)
   269  	}
   270  
   271  	txn.Defer(func() { s.watch.notify(watcher) })
   272  	txn.Commit()
   273  	return nil
   274  }
   275  
   276  // NodeByID is used to lookup a node by ID
   277  func (s *StateStore) NodeByID(nodeID string) (*structs.Node, error) {
   278  	txn := s.db.Txn(false)
   279  
   280  	existing, err := txn.First("nodes", "id", nodeID)
   281  	if err != nil {
   282  		return nil, fmt.Errorf("node lookup failed: %v", err)
   283  	}
   284  
   285  	if existing != nil {
   286  		return existing.(*structs.Node), nil
   287  	}
   288  	return nil, nil
   289  }
   290  
   291  // NodesByIDPrefix is used to lookup nodes by prefix
   292  func (s *StateStore) NodesByIDPrefix(nodeID string) (memdb.ResultIterator, error) {
   293  	txn := s.db.Txn(false)
   294  
   295  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
   296  	if err != nil {
   297  		return nil, fmt.Errorf("node lookup failed: %v", err)
   298  	}
   299  
   300  	return iter, nil
   301  }
   302  
   303  // Nodes returns an iterator over all the nodes
   304  func (s *StateStore) Nodes() (memdb.ResultIterator, error) {
   305  	txn := s.db.Txn(false)
   306  
   307  	// Walk the entire nodes table
   308  	iter, err := txn.Get("nodes", "id")
   309  	if err != nil {
   310  		return nil, err
   311  	}
   312  	return iter, nil
   313  }
   314  
   315  // UpsertJob is used to register a job or update a job definition
   316  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
   317  	txn := s.db.Txn(true)
   318  	defer txn.Abort()
   319  
   320  	watcher := watch.NewItems()
   321  	watcher.Add(watch.Item{Table: "jobs"})
   322  	watcher.Add(watch.Item{Job: job.ID})
   323  
   324  	// Check if the job already exists
   325  	existing, err := txn.First("jobs", "id", job.ID)
   326  	if err != nil {
   327  		return fmt.Errorf("job lookup failed: %v", err)
   328  	}
   329  
   330  	// Setup the indexes correctly
   331  	if existing != nil {
   332  		job.CreateIndex = existing.(*structs.Job).CreateIndex
   333  		job.ModifyIndex = index
   334  		job.JobModifyIndex = index
   335  
   336  		// Compute the job status
   337  		var err error
   338  		job.Status, err = s.getJobStatus(txn, job, false)
   339  		if err != nil {
   340  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
   341  		}
   342  	} else {
   343  		job.CreateIndex = index
   344  		job.ModifyIndex = index
   345  		job.JobModifyIndex = index
   346  
   347  		// If we are inserting the job for the first time, we don't need to
   348  		// calculate the jobs status as it is known.
   349  		if job.IsPeriodic() {
   350  			job.Status = structs.JobStatusRunning
   351  		} else {
   352  			job.Status = structs.JobStatusPending
   353  		}
   354  	}
   355  
   356  	if err := s.updateSummaryWithJob(index, job, watcher, txn); err != nil {
   357  		return fmt.Errorf("unable to create job summary: %v", err)
   358  	}
   359  
   360  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
   361  	// COMPAT 0.4.1 -> 0.5
   362  	s.addEphemeralDiskToTaskGroups(job)
   363  
   364  	// Insert the job
   365  	if err := txn.Insert("jobs", job); err != nil {
   366  		return fmt.Errorf("job insert failed: %v", err)
   367  	}
   368  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   369  		return fmt.Errorf("index update failed: %v", err)
   370  	}
   371  
   372  	txn.Defer(func() { s.watch.notify(watcher) })
   373  	txn.Commit()
   374  	return nil
   375  }
   376  
   377  // DeleteJob is used to deregister a job
   378  func (s *StateStore) DeleteJob(index uint64, jobID string) error {
   379  	txn := s.db.Txn(true)
   380  	defer txn.Abort()
   381  
   382  	// Lookup the node
   383  	existing, err := txn.First("jobs", "id", jobID)
   384  	if err != nil {
   385  		return fmt.Errorf("job lookup failed: %v", err)
   386  	}
   387  	if existing == nil {
   388  		return fmt.Errorf("job not found")
   389  	}
   390  
   391  	watcher := watch.NewItems()
   392  	watcher.Add(watch.Item{Table: "jobs"})
   393  	watcher.Add(watch.Item{Job: jobID})
   394  	watcher.Add(watch.Item{Table: "job_summary"})
   395  	watcher.Add(watch.Item{JobSummary: jobID})
   396  
   397  	// Delete the node
   398  	if err := txn.Delete("jobs", existing); err != nil {
   399  		return fmt.Errorf("job delete failed: %v", err)
   400  	}
   401  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   402  		return fmt.Errorf("index update failed: %v", err)
   403  	}
   404  
   405  	// Delete the job summary
   406  	if _, err = txn.DeleteAll("job_summary", "id", jobID); err != nil {
   407  		return fmt.Errorf("deleing job summary failed: %v", err)
   408  	}
   409  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   410  		return fmt.Errorf("index update failed: %v", err)
   411  	}
   412  
   413  	txn.Defer(func() { s.watch.notify(watcher) })
   414  	txn.Commit()
   415  	return nil
   416  }
   417  
   418  // JobByID is used to lookup a job by its ID
   419  func (s *StateStore) JobByID(id string) (*structs.Job, error) {
   420  	txn := s.db.Txn(false)
   421  
   422  	existing, err := txn.First("jobs", "id", id)
   423  	if err != nil {
   424  		return nil, fmt.Errorf("job lookup failed: %v", err)
   425  	}
   426  
   427  	if existing != nil {
   428  		return existing.(*structs.Job), nil
   429  	}
   430  	return nil, nil
   431  }
   432  
   433  // JobsByIDPrefix is used to lookup a job by prefix
   434  func (s *StateStore) JobsByIDPrefix(id string) (memdb.ResultIterator, error) {
   435  	txn := s.db.Txn(false)
   436  
   437  	iter, err := txn.Get("jobs", "id_prefix", id)
   438  	if err != nil {
   439  		return nil, fmt.Errorf("job lookup failed: %v", err)
   440  	}
   441  
   442  	return iter, nil
   443  }
   444  
   445  // Jobs returns an iterator over all the jobs
   446  func (s *StateStore) Jobs() (memdb.ResultIterator, error) {
   447  	txn := s.db.Txn(false)
   448  
   449  	// Walk the entire jobs table
   450  	iter, err := txn.Get("jobs", "id")
   451  	if err != nil {
   452  		return nil, err
   453  	}
   454  	return iter, nil
   455  }
   456  
   457  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
   458  func (s *StateStore) JobsByPeriodic(periodic bool) (memdb.ResultIterator, error) {
   459  	txn := s.db.Txn(false)
   460  
   461  	iter, err := txn.Get("jobs", "periodic", periodic)
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  	return iter, nil
   466  }
   467  
   468  // JobsByScheduler returns an iterator over all the jobs with the specific
   469  // scheduler type.
   470  func (s *StateStore) JobsByScheduler(schedulerType string) (memdb.ResultIterator, error) {
   471  	txn := s.db.Txn(false)
   472  
   473  	// Return an iterator for jobs with the specific type.
   474  	iter, err := txn.Get("jobs", "type", schedulerType)
   475  	if err != nil {
   476  		return nil, err
   477  	}
   478  	return iter, nil
   479  }
   480  
   481  // JobsByGC returns an iterator over all jobs eligible or uneligible for garbage
   482  // collection.
   483  func (s *StateStore) JobsByGC(gc bool) (memdb.ResultIterator, error) {
   484  	txn := s.db.Txn(false)
   485  
   486  	iter, err := txn.Get("jobs", "gc", gc)
   487  	if err != nil {
   488  		return nil, err
   489  	}
   490  	return iter, nil
   491  }
   492  
   493  // JobSummary returns a job summary object which matches a specific id.
   494  func (s *StateStore) JobSummaryByID(jobID string) (*structs.JobSummary, error) {
   495  	txn := s.db.Txn(false)
   496  
   497  	existing, err := txn.First("job_summary", "id", jobID)
   498  	if err != nil {
   499  		return nil, err
   500  	}
   501  	if existing != nil {
   502  		summary := existing.(structs.JobSummary)
   503  		return summary.Copy(), nil
   504  	}
   505  
   506  	return nil, nil
   507  }
   508  
   509  // JobSummaries walks the entire job summary table and returns all the job
   510  // summary objects
   511  func (s *StateStore) JobSummaries() (memdb.ResultIterator, error) {
   512  	txn := s.db.Txn(false)
   513  
   514  	iter, err := txn.Get("job_summary", "id")
   515  	if err != nil {
   516  		return nil, err
   517  	}
   518  	return iter, nil
   519  }
   520  
   521  // JobSummaryByPrefix is used to look up Job Summary by id prefix
   522  func (s *StateStore) JobSummaryByPrefix(id string) (memdb.ResultIterator, error) {
   523  	txn := s.db.Txn(false)
   524  
   525  	iter, err := txn.Get("job_summary", "id_prefix", id)
   526  	if err != nil {
   527  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   528  	}
   529  
   530  	return iter, nil
   531  }
   532  
   533  // UpsertPeriodicLaunch is used to register a launch or update it.
   534  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
   535  	txn := s.db.Txn(true)
   536  	defer txn.Abort()
   537  
   538  	watcher := watch.NewItems()
   539  	watcher.Add(watch.Item{Table: "periodic_launch"})
   540  	watcher.Add(watch.Item{Job: launch.ID})
   541  
   542  	// Check if the job already exists
   543  	existing, err := txn.First("periodic_launch", "id", launch.ID)
   544  	if err != nil {
   545  		return fmt.Errorf("periodic launch lookup failed: %v", err)
   546  	}
   547  
   548  	// Setup the indexes correctly
   549  	if existing != nil {
   550  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
   551  		launch.ModifyIndex = index
   552  	} else {
   553  		launch.CreateIndex = index
   554  		launch.ModifyIndex = index
   555  	}
   556  
   557  	// Insert the job
   558  	if err := txn.Insert("periodic_launch", launch); err != nil {
   559  		return fmt.Errorf("launch insert failed: %v", err)
   560  	}
   561  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   562  		return fmt.Errorf("index update failed: %v", err)
   563  	}
   564  
   565  	txn.Defer(func() { s.watch.notify(watcher) })
   566  	txn.Commit()
   567  	return nil
   568  }
   569  
   570  // DeletePeriodicLaunch is used to delete the periodic launch
   571  func (s *StateStore) DeletePeriodicLaunch(index uint64, jobID string) error {
   572  	txn := s.db.Txn(true)
   573  	defer txn.Abort()
   574  
   575  	// Lookup the launch
   576  	existing, err := txn.First("periodic_launch", "id", jobID)
   577  	if err != nil {
   578  		return fmt.Errorf("launch lookup failed: %v", err)
   579  	}
   580  	if existing == nil {
   581  		return fmt.Errorf("launch not found")
   582  	}
   583  
   584  	watcher := watch.NewItems()
   585  	watcher.Add(watch.Item{Table: "periodic_launch"})
   586  	watcher.Add(watch.Item{Job: jobID})
   587  
   588  	// Delete the launch
   589  	if err := txn.Delete("periodic_launch", existing); err != nil {
   590  		return fmt.Errorf("launch delete failed: %v", err)
   591  	}
   592  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   593  		return fmt.Errorf("index update failed: %v", err)
   594  	}
   595  
   596  	txn.Defer(func() { s.watch.notify(watcher) })
   597  	txn.Commit()
   598  	return nil
   599  }
   600  
   601  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
   602  // ID.
   603  func (s *StateStore) PeriodicLaunchByID(id string) (*structs.PeriodicLaunch, error) {
   604  	txn := s.db.Txn(false)
   605  
   606  	existing, err := txn.First("periodic_launch", "id", id)
   607  	if err != nil {
   608  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
   609  	}
   610  
   611  	if existing != nil {
   612  		return existing.(*structs.PeriodicLaunch), nil
   613  	}
   614  	return nil, nil
   615  }
   616  
   617  // PeriodicLaunches returns an iterator over all the periodic launches
   618  func (s *StateStore) PeriodicLaunches() (memdb.ResultIterator, error) {
   619  	txn := s.db.Txn(false)
   620  
   621  	// Walk the entire table
   622  	iter, err := txn.Get("periodic_launch", "id")
   623  	if err != nil {
   624  		return nil, err
   625  	}
   626  	return iter, nil
   627  }
   628  
   629  // UpsertEvaluation is used to upsert an evaluation
   630  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
   631  	txn := s.db.Txn(true)
   632  	defer txn.Abort()
   633  
   634  	watcher := watch.NewItems()
   635  	watcher.Add(watch.Item{Table: "evals"})
   636  
   637  	// Do a nested upsert
   638  	jobs := make(map[string]string, len(evals))
   639  	for _, eval := range evals {
   640  		watcher.Add(watch.Item{Eval: eval.ID})
   641  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   642  			return err
   643  		}
   644  
   645  		jobs[eval.JobID] = ""
   646  	}
   647  
   648  	// Set the job's status
   649  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   650  		return fmt.Errorf("setting job status failed: %v", err)
   651  	}
   652  
   653  	txn.Defer(func() { s.watch.notify(watcher) })
   654  	txn.Commit()
   655  	return nil
   656  }
   657  
   658  // nestedUpsertEvaluation is used to nest an evaluation upsert within a transaction
   659  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
   660  	// Lookup the evaluation
   661  	existing, err := txn.First("evals", "id", eval.ID)
   662  	if err != nil {
   663  		return fmt.Errorf("eval lookup failed: %v", err)
   664  	}
   665  
   666  	// Update the indexes
   667  	if existing != nil {
   668  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
   669  		eval.ModifyIndex = index
   670  	} else {
   671  		eval.CreateIndex = index
   672  		eval.ModifyIndex = index
   673  	}
   674  
   675  	// Update the job summary
   676  	summaryRaw, err := txn.First("job_summary", "id", eval.JobID)
   677  	if err != nil {
   678  		return fmt.Errorf("job summary lookup failed: %v", err)
   679  	}
   680  	if summaryRaw != nil {
   681  		js := summaryRaw.(structs.JobSummary)
   682  		var hasSummaryChanged bool
   683  		for tg, num := range eval.QueuedAllocations {
   684  			if summary, ok := js.Summary[tg]; ok {
   685  				if summary.Queued != num {
   686  					summary.Queued = num
   687  					js.Summary[tg] = summary
   688  					hasSummaryChanged = true
   689  				}
   690  			} else {
   691  				s.logger.Printf("[ERR] state_store: unable to update queued for job %q and task group %q", eval.JobID, tg)
   692  			}
   693  		}
   694  
   695  		// Insert the job summary
   696  		if hasSummaryChanged {
   697  			js.ModifyIndex = index
   698  			if err := txn.Insert("job_summary", js); err != nil {
   699  				return fmt.Errorf("job summary insert failed: %v", err)
   700  			}
   701  			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
   702  				return fmt.Errorf("index update failed: %v", err)
   703  			}
   704  		}
   705  	}
   706  
   707  	// Insert the eval
   708  	if err := txn.Insert("evals", eval); err != nil {
   709  		return fmt.Errorf("eval insert failed: %v", err)
   710  	}
   711  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   712  		return fmt.Errorf("index update failed: %v", err)
   713  	}
   714  	return nil
   715  }
   716  
   717  // DeleteEval is used to delete an evaluation
   718  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
   719  	txn := s.db.Txn(true)
   720  	defer txn.Abort()
   721  	watcher := watch.NewItems()
   722  	watcher.Add(watch.Item{Table: "evals"})
   723  	watcher.Add(watch.Item{Table: "allocs"})
   724  
   725  	jobs := make(map[string]string, len(evals))
   726  	for _, eval := range evals {
   727  		existing, err := txn.First("evals", "id", eval)
   728  		if err != nil {
   729  			return fmt.Errorf("eval lookup failed: %v", err)
   730  		}
   731  		if existing == nil {
   732  			continue
   733  		}
   734  		if err := txn.Delete("evals", existing); err != nil {
   735  			return fmt.Errorf("eval delete failed: %v", err)
   736  		}
   737  		watcher.Add(watch.Item{Eval: eval})
   738  		jobs[existing.(*structs.Evaluation).JobID] = ""
   739  	}
   740  
   741  	for _, alloc := range allocs {
   742  		existing, err := txn.First("allocs", "id", alloc)
   743  		if err != nil {
   744  			return fmt.Errorf("alloc lookup failed: %v", err)
   745  		}
   746  		if existing == nil {
   747  			continue
   748  		}
   749  		if err := txn.Delete("allocs", existing); err != nil {
   750  			return fmt.Errorf("alloc delete failed: %v", err)
   751  		}
   752  		realAlloc := existing.(*structs.Allocation)
   753  		watcher.Add(watch.Item{Alloc: realAlloc.ID})
   754  		watcher.Add(watch.Item{AllocEval: realAlloc.EvalID})
   755  		watcher.Add(watch.Item{AllocJob: realAlloc.JobID})
   756  		watcher.Add(watch.Item{AllocNode: realAlloc.NodeID})
   757  	}
   758  
   759  	// Update the indexes
   760  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   761  		return fmt.Errorf("index update failed: %v", err)
   762  	}
   763  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   764  		return fmt.Errorf("index update failed: %v", err)
   765  	}
   766  
   767  	// Set the job's status
   768  	if err := s.setJobStatuses(index, watcher, txn, jobs, true); err != nil {
   769  		return fmt.Errorf("setting job status failed: %v", err)
   770  	}
   771  
   772  	txn.Defer(func() { s.watch.notify(watcher) })
   773  	txn.Commit()
   774  	return nil
   775  }
   776  
   777  // EvalByID is used to lookup an eval by its ID
   778  func (s *StateStore) EvalByID(id string) (*structs.Evaluation, error) {
   779  	txn := s.db.Txn(false)
   780  
   781  	existing, err := txn.First("evals", "id", id)
   782  	if err != nil {
   783  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   784  	}
   785  
   786  	if existing != nil {
   787  		return existing.(*structs.Evaluation), nil
   788  	}
   789  	return nil, nil
   790  }
   791  
   792  // EvalsByIDPrefix is used to lookup evaluations by prefix
   793  func (s *StateStore) EvalsByIDPrefix(id string) (memdb.ResultIterator, error) {
   794  	txn := s.db.Txn(false)
   795  
   796  	iter, err := txn.Get("evals", "id_prefix", id)
   797  	if err != nil {
   798  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   799  	}
   800  
   801  	return iter, nil
   802  }
   803  
   804  // EvalsByJob returns all the evaluations by job id
   805  func (s *StateStore) EvalsByJob(jobID string) ([]*structs.Evaluation, error) {
   806  	txn := s.db.Txn(false)
   807  
   808  	// Get an iterator over the node allocations
   809  	iter, err := txn.Get("evals", "job", jobID)
   810  	if err != nil {
   811  		return nil, err
   812  	}
   813  
   814  	var out []*structs.Evaluation
   815  	for {
   816  		raw := iter.Next()
   817  		if raw == nil {
   818  			break
   819  		}
   820  		out = append(out, raw.(*structs.Evaluation))
   821  	}
   822  	return out, nil
   823  }
   824  
   825  // Evals returns an iterator over all the evaluations
   826  func (s *StateStore) Evals() (memdb.ResultIterator, error) {
   827  	txn := s.db.Txn(false)
   828  
   829  	// Walk the entire table
   830  	iter, err := txn.Get("evals", "id")
   831  	if err != nil {
   832  		return nil, err
   833  	}
   834  	return iter, nil
   835  }
   836  
   837  // UpdateAllocsFromClient is used to update an allocation based on input
   838  
   839  // from a client. While the schedulers are the authority on the allocation for
   840  // most things, some updates are authoritative from the client. Specifically,
   841  // the desired state comes from the schedulers, while the actual state comes
   842  // from clients.
   843  func (s *StateStore) UpdateAllocsFromClient(index uint64, allocs []*structs.Allocation) error {
   844  	txn := s.db.Txn(true)
   845  	defer txn.Abort()
   846  
   847  	// Setup the watcher
   848  	watcher := watch.NewItems()
   849  	watcher.Add(watch.Item{Table: "allocs"})
   850  
   851  	// Handle each of the updated allocations
   852  	for _, alloc := range allocs {
   853  		if err := s.nestedUpdateAllocFromClient(txn, watcher, index, alloc); err != nil {
   854  			return err
   855  		}
   856  	}
   857  
   858  	// Update the indexes
   859  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   860  		return fmt.Errorf("index update failed: %v", err)
   861  	}
   862  
   863  	txn.Defer(func() { s.watch.notify(watcher) })
   864  	txn.Commit()
   865  	return nil
   866  }
   867  
   868  // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status
   869  func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, watcher watch.Items, index uint64, alloc *structs.Allocation) error {
   870  	// Look for existing alloc
   871  	existing, err := txn.First("allocs", "id", alloc.ID)
   872  	if err != nil {
   873  		return fmt.Errorf("alloc lookup failed: %v", err)
   874  	}
   875  
   876  	// Nothing to do if this does not exist
   877  	if existing == nil {
   878  		return nil
   879  	}
   880  	exist := existing.(*structs.Allocation)
   881  	// Trigger the watcher
   882  	watcher.Add(watch.Item{Alloc: alloc.ID})
   883  	watcher.Add(watch.Item{AllocEval: exist.EvalID})
   884  	watcher.Add(watch.Item{AllocJob: exist.JobID})
   885  	watcher.Add(watch.Item{AllocNode: exist.NodeID})
   886  
   887  	// Copy everything from the existing allocation
   888  	copyAlloc := new(structs.Allocation)
   889  	*copyAlloc = *exist
   890  
   891  	// Pull in anything the client is the authority on
   892  	copyAlloc.ClientStatus = alloc.ClientStatus
   893  	copyAlloc.ClientDescription = alloc.ClientDescription
   894  	copyAlloc.TaskStates = alloc.TaskStates
   895  
   896  	// Update the modify index
   897  	copyAlloc.ModifyIndex = index
   898  
   899  	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, watcher, txn); err != nil {
   900  		return fmt.Errorf("error updating job summary: %v", err)
   901  	}
   902  
   903  	// Update the allocation
   904  	if err := txn.Insert("allocs", copyAlloc); err != nil {
   905  		return fmt.Errorf("alloc insert failed: %v", err)
   906  	}
   907  
   908  	// Set the job's status
   909  	forceStatus := ""
   910  	if !copyAlloc.TerminalStatus() {
   911  		forceStatus = structs.JobStatusRunning
   912  	}
   913  	jobs := map[string]string{exist.JobID: forceStatus}
   914  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   915  		return fmt.Errorf("setting job status failed: %v", err)
   916  	}
   917  	return nil
   918  }
   919  
   920  // UpsertAllocs is used to evict a set of allocations
   921  // and allocate new ones at the same time.
   922  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
   923  	txn := s.db.Txn(true)
   924  	defer txn.Abort()
   925  
   926  	watcher := watch.NewItems()
   927  	watcher.Add(watch.Item{Table: "allocs"})
   928  
   929  	// Handle the allocations
   930  	jobs := make(map[string]string, 1)
   931  	for _, alloc := range allocs {
   932  		existing, err := txn.First("allocs", "id", alloc.ID)
   933  		if err != nil {
   934  			return fmt.Errorf("alloc lookup failed: %v", err)
   935  		}
   936  		exist, _ := existing.(*structs.Allocation)
   937  
   938  		if exist == nil {
   939  			alloc.CreateIndex = index
   940  			alloc.ModifyIndex = index
   941  			alloc.AllocModifyIndex = index
   942  		} else {
   943  			alloc.CreateIndex = exist.CreateIndex
   944  			alloc.ModifyIndex = index
   945  			alloc.AllocModifyIndex = index
   946  
   947  			// If the scheduler is marking this allocation as lost we do not
   948  			// want to reuse the status of the existing allocation.
   949  			if alloc.ClientStatus != structs.AllocClientStatusLost {
   950  				alloc.ClientStatus = exist.ClientStatus
   951  				alloc.ClientDescription = exist.ClientDescription
   952  			}
   953  
   954  			// The job has been denormalized so re-attach the original job
   955  			if alloc.Job == nil {
   956  				alloc.Job = exist.Job
   957  			}
   958  		}
   959  
   960  		if err := s.updateSummaryWithAlloc(index, alloc, exist, watcher, txn); err != nil {
   961  			return fmt.Errorf("error updating job summary: %v", err)
   962  		}
   963  
   964  		// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
   965  		// COMPAT 0.4.1 -> 0.5
   966  		if alloc.Job != nil {
   967  			s.addEphemeralDiskToTaskGroups(alloc.Job)
   968  		}
   969  
   970  		if err := txn.Insert("allocs", alloc); err != nil {
   971  			return fmt.Errorf("alloc insert failed: %v", err)
   972  		}
   973  
   974  		// If the allocation is running, force the job to running status.
   975  		forceStatus := ""
   976  		if !alloc.TerminalStatus() {
   977  			forceStatus = structs.JobStatusRunning
   978  		}
   979  		jobs[alloc.JobID] = forceStatus
   980  
   981  		watcher.Add(watch.Item{Alloc: alloc.ID})
   982  		watcher.Add(watch.Item{AllocEval: alloc.EvalID})
   983  		watcher.Add(watch.Item{AllocJob: alloc.JobID})
   984  		watcher.Add(watch.Item{AllocNode: alloc.NodeID})
   985  	}
   986  
   987  	// Update the indexes
   988  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   989  		return fmt.Errorf("index update failed: %v", err)
   990  	}
   991  
   992  	// Set the job's status
   993  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   994  		return fmt.Errorf("setting job status failed: %v", err)
   995  	}
   996  
   997  	txn.Defer(func() { s.watch.notify(watcher) })
   998  	txn.Commit()
   999  	return nil
  1000  }
  1001  
  1002  // AllocByID is used to lookup an allocation by its ID
  1003  func (s *StateStore) AllocByID(id string) (*structs.Allocation, error) {
  1004  	txn := s.db.Txn(false)
  1005  
  1006  	existing, err := txn.First("allocs", "id", id)
  1007  	if err != nil {
  1008  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  1009  	}
  1010  
  1011  	if existing != nil {
  1012  		return existing.(*structs.Allocation), nil
  1013  	}
  1014  	return nil, nil
  1015  }
  1016  
  1017  // AllocsByIDPrefix is used to lookup allocs by prefix
  1018  func (s *StateStore) AllocsByIDPrefix(id string) (memdb.ResultIterator, error) {
  1019  	txn := s.db.Txn(false)
  1020  
  1021  	iter, err := txn.Get("allocs", "id_prefix", id)
  1022  	if err != nil {
  1023  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
  1024  	}
  1025  
  1026  	return iter, nil
  1027  }
  1028  
  1029  // AllocsByNode returns all the allocations by node
  1030  func (s *StateStore) AllocsByNode(node string) ([]*structs.Allocation, error) {
  1031  	txn := s.db.Txn(false)
  1032  
  1033  	// Get an iterator over the node allocations, using only the
  1034  	// node prefix which ignores the terminal status
  1035  	iter, err := txn.Get("allocs", "node_prefix", node)
  1036  	if err != nil {
  1037  		return nil, err
  1038  	}
  1039  
  1040  	var out []*structs.Allocation
  1041  	for {
  1042  		raw := iter.Next()
  1043  		if raw == nil {
  1044  			break
  1045  		}
  1046  		out = append(out, raw.(*structs.Allocation))
  1047  	}
  1048  	return out, nil
  1049  }
  1050  
  1051  // AllocsByNode returns all the allocations by node and terminal status
  1052  func (s *StateStore) AllocsByNodeTerminal(node string, terminal bool) ([]*structs.Allocation, error) {
  1053  	txn := s.db.Txn(false)
  1054  
  1055  	// Get an iterator over the node allocations
  1056  	iter, err := txn.Get("allocs", "node", node, terminal)
  1057  	if err != nil {
  1058  		return nil, err
  1059  	}
  1060  
  1061  	var out []*structs.Allocation
  1062  	for {
  1063  		raw := iter.Next()
  1064  		if raw == nil {
  1065  			break
  1066  		}
  1067  		out = append(out, raw.(*structs.Allocation))
  1068  	}
  1069  	return out, nil
  1070  }
  1071  
  1072  // AllocsByJob returns all the allocations by job id
  1073  func (s *StateStore) AllocsByJob(jobID string) ([]*structs.Allocation, error) {
  1074  	txn := s.db.Txn(false)
  1075  
  1076  	// Get an iterator over the node allocations
  1077  	iter, err := txn.Get("allocs", "job", jobID)
  1078  	if err != nil {
  1079  		return nil, err
  1080  	}
  1081  
  1082  	var out []*structs.Allocation
  1083  	for {
  1084  		raw := iter.Next()
  1085  		if raw == nil {
  1086  			break
  1087  		}
  1088  		out = append(out, raw.(*structs.Allocation))
  1089  	}
  1090  	return out, nil
  1091  }
  1092  
  1093  // AllocsByEval returns all the allocations by eval id
  1094  func (s *StateStore) AllocsByEval(evalID string) ([]*structs.Allocation, error) {
  1095  	txn := s.db.Txn(false)
  1096  
  1097  	// Get an iterator over the eval allocations
  1098  	iter, err := txn.Get("allocs", "eval", evalID)
  1099  	if err != nil {
  1100  		return nil, err
  1101  	}
  1102  
  1103  	var out []*structs.Allocation
  1104  	for {
  1105  		raw := iter.Next()
  1106  		if raw == nil {
  1107  			break
  1108  		}
  1109  		out = append(out, raw.(*structs.Allocation))
  1110  	}
  1111  	return out, nil
  1112  }
  1113  
  1114  // Allocs returns an iterator over all the evaluations
  1115  func (s *StateStore) Allocs() (memdb.ResultIterator, error) {
  1116  	txn := s.db.Txn(false)
  1117  
  1118  	// Walk the entire table
  1119  	iter, err := txn.Get("allocs", "id")
  1120  	if err != nil {
  1121  		return nil, err
  1122  	}
  1123  	return iter, nil
  1124  }
  1125  
  1126  // UpsertVaultAccessors is used to register a set of Vault Accessors
  1127  func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
  1128  	txn := s.db.Txn(true)
  1129  	defer txn.Abort()
  1130  
  1131  	for _, accessor := range accessors {
  1132  		// Set the create index
  1133  		accessor.CreateIndex = index
  1134  
  1135  		// Insert the accessor
  1136  		if err := txn.Insert("vault_accessors", accessor); err != nil {
  1137  			return fmt.Errorf("accessor insert failed: %v", err)
  1138  		}
  1139  	}
  1140  
  1141  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  1142  		return fmt.Errorf("index update failed: %v", err)
  1143  	}
  1144  
  1145  	txn.Commit()
  1146  	return nil
  1147  }
  1148  
  1149  // DeleteVaultAccessors is used to delete a set of Vault Accessors
  1150  func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
  1151  	txn := s.db.Txn(true)
  1152  	defer txn.Abort()
  1153  
  1154  	// Lookup the accessor
  1155  	for _, accessor := range accessors {
  1156  		// Delete the accessor
  1157  		if err := txn.Delete("vault_accessors", accessor); err != nil {
  1158  			return fmt.Errorf("accessor delete failed: %v", err)
  1159  		}
  1160  	}
  1161  
  1162  	if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil {
  1163  		return fmt.Errorf("index update failed: %v", err)
  1164  	}
  1165  
  1166  	txn.Commit()
  1167  	return nil
  1168  }
  1169  
  1170  // VaultAccessor returns the given Vault accessor
  1171  func (s *StateStore) VaultAccessor(accessor string) (*structs.VaultAccessor, error) {
  1172  	txn := s.db.Txn(false)
  1173  
  1174  	existing, err := txn.First("vault_accessors", "id", accessor)
  1175  	if err != nil {
  1176  		return nil, fmt.Errorf("accessor lookup failed: %v", err)
  1177  	}
  1178  
  1179  	if existing != nil {
  1180  		return existing.(*structs.VaultAccessor), nil
  1181  	}
  1182  
  1183  	return nil, nil
  1184  }
  1185  
  1186  // VaultAccessors returns an iterator of Vault accessors.
  1187  func (s *StateStore) VaultAccessors() (memdb.ResultIterator, error) {
  1188  	txn := s.db.Txn(false)
  1189  
  1190  	iter, err := txn.Get("vault_accessors", "id")
  1191  	if err != nil {
  1192  		return nil, err
  1193  	}
  1194  	return iter, nil
  1195  }
  1196  
  1197  // VaultAccessorsByAlloc returns all the Vault accessors by alloc id
  1198  func (s *StateStore) VaultAccessorsByAlloc(allocID string) ([]*structs.VaultAccessor, error) {
  1199  	txn := s.db.Txn(false)
  1200  
  1201  	// Get an iterator over the accessors
  1202  	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
  1203  	if err != nil {
  1204  		return nil, err
  1205  	}
  1206  
  1207  	var out []*structs.VaultAccessor
  1208  	for {
  1209  		raw := iter.Next()
  1210  		if raw == nil {
  1211  			break
  1212  		}
  1213  		out = append(out, raw.(*structs.VaultAccessor))
  1214  	}
  1215  	return out, nil
  1216  }
  1217  
  1218  // VaultAccessorsByNode returns all the Vault accessors by node id
  1219  func (s *StateStore) VaultAccessorsByNode(nodeID string) ([]*structs.VaultAccessor, error) {
  1220  	txn := s.db.Txn(false)
  1221  
  1222  	// Get an iterator over the accessors
  1223  	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
  1224  	if err != nil {
  1225  		return nil, err
  1226  	}
  1227  
  1228  	var out []*structs.VaultAccessor
  1229  	for {
  1230  		raw := iter.Next()
  1231  		if raw == nil {
  1232  			break
  1233  		}
  1234  		out = append(out, raw.(*structs.VaultAccessor))
  1235  	}
  1236  	return out, nil
  1237  }
  1238  
  1239  // LastIndex returns the greatest index value for all indexes
  1240  func (s *StateStore) LatestIndex() (uint64, error) {
  1241  	indexes, err := s.Indexes()
  1242  	if err != nil {
  1243  		return 0, err
  1244  	}
  1245  
  1246  	var max uint64 = 0
  1247  	for {
  1248  		raw := indexes.Next()
  1249  		if raw == nil {
  1250  			break
  1251  		}
  1252  
  1253  		// Prepare the request struct
  1254  		idx := raw.(*IndexEntry)
  1255  
  1256  		// Determine the max
  1257  		if idx.Value > max {
  1258  			max = idx.Value
  1259  		}
  1260  	}
  1261  
  1262  	return max, nil
  1263  }
  1264  
  1265  // Index finds the matching index value
  1266  func (s *StateStore) Index(name string) (uint64, error) {
  1267  	txn := s.db.Txn(false)
  1268  
  1269  	// Lookup the first matching index
  1270  	out, err := txn.First("index", "id", name)
  1271  	if err != nil {
  1272  		return 0, err
  1273  	}
  1274  	if out == nil {
  1275  		return 0, nil
  1276  	}
  1277  	return out.(*IndexEntry).Value, nil
  1278  }
  1279  
  1280  // RemoveIndex is a helper method to remove an index for testing purposes
  1281  func (s *StateStore) RemoveIndex(name string) error {
  1282  	txn := s.db.Txn(true)
  1283  	defer txn.Abort()
  1284  
  1285  	if _, err := txn.DeleteAll("index", "id", name); err != nil {
  1286  		return err
  1287  	}
  1288  
  1289  	txn.Commit()
  1290  	return nil
  1291  }
  1292  
  1293  // Indexes returns an iterator over all the indexes
  1294  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  1295  	txn := s.db.Txn(false)
  1296  
  1297  	// Walk the entire nodes table
  1298  	iter, err := txn.Get("index", "id")
  1299  	if err != nil {
  1300  		return nil, err
  1301  	}
  1302  	return iter, nil
  1303  }
  1304  
  1305  // ReconcileJobSummaries re-creates summaries for all jobs present in the state
  1306  // store
  1307  func (s *StateStore) ReconcileJobSummaries(index uint64) error {
  1308  	txn := s.db.Txn(true)
  1309  	defer txn.Abort()
  1310  
  1311  	// Get all the jobs
  1312  	iter, err := txn.Get("jobs", "id")
  1313  	if err != nil {
  1314  		return err
  1315  	}
  1316  	for {
  1317  		rawJob := iter.Next()
  1318  		if rawJob == nil {
  1319  			break
  1320  		}
  1321  		job := rawJob.(*structs.Job)
  1322  
  1323  		// Create a job summary for the job
  1324  		summary := structs.JobSummary{
  1325  			JobID:   job.ID,
  1326  			Summary: make(map[string]structs.TaskGroupSummary),
  1327  		}
  1328  		for _, tg := range job.TaskGroups {
  1329  			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
  1330  		}
  1331  
  1332  		// Find all the allocations for the jobs
  1333  		iterAllocs, err := txn.Get("allocs", "job", job.ID)
  1334  		if err != nil {
  1335  			return err
  1336  		}
  1337  
  1338  		// Calculate the summary for the job
  1339  		for {
  1340  			rawAlloc := iterAllocs.Next()
  1341  			if rawAlloc == nil {
  1342  				break
  1343  			}
  1344  			alloc := rawAlloc.(*structs.Allocation)
  1345  
  1346  			// Ignore the allocation if it doesn't belong to the currently
  1347  			// registered job
  1348  			if alloc.Job.CreateIndex != job.CreateIndex {
  1349  				continue
  1350  			}
  1351  
  1352  			tg := summary.Summary[alloc.TaskGroup]
  1353  			switch alloc.ClientStatus {
  1354  			case structs.AllocClientStatusFailed:
  1355  				tg.Failed += 1
  1356  			case structs.AllocClientStatusLost:
  1357  				tg.Lost += 1
  1358  			case structs.AllocClientStatusComplete:
  1359  				tg.Complete += 1
  1360  			case structs.AllocClientStatusRunning:
  1361  				tg.Running += 1
  1362  			case structs.AllocClientStatusPending:
  1363  				tg.Starting += 1
  1364  			default:
  1365  				s.logger.Printf("[ERR] state_store: invalid client status: %v in allocation %q", alloc.ClientStatus, alloc.ID)
  1366  			}
  1367  			summary.Summary[alloc.TaskGroup] = tg
  1368  		}
  1369  
  1370  		// Set the create index of the summary same as the job's create index
  1371  		// and the modify index to the current index
  1372  		summary.CreateIndex = job.CreateIndex
  1373  		summary.ModifyIndex = index
  1374  
  1375  		// Insert the job summary
  1376  		if err := txn.Insert("job_summary", summary); err != nil {
  1377  			return fmt.Errorf("error inserting job summary: %v", err)
  1378  		}
  1379  	}
  1380  
  1381  	// Update the indexes table for job summary
  1382  	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1383  		return fmt.Errorf("index update failed: %v", err)
  1384  	}
  1385  	txn.Commit()
  1386  	return nil
  1387  }
  1388  
  1389  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
  1390  // It takes a map of job IDs to an optional forceStatus string. It returns an
  1391  // error if the job doesn't exist or setJobStatus fails.
  1392  func (s *StateStore) setJobStatuses(index uint64, watcher watch.Items, txn *memdb.Txn,
  1393  	jobs map[string]string, evalDelete bool) error {
  1394  	for job, forceStatus := range jobs {
  1395  		existing, err := txn.First("jobs", "id", job)
  1396  		if err != nil {
  1397  			return fmt.Errorf("job lookup failed: %v", err)
  1398  		}
  1399  
  1400  		if existing == nil {
  1401  			continue
  1402  		}
  1403  
  1404  		if err := s.setJobStatus(index, watcher, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  1405  			return err
  1406  		}
  1407  	}
  1408  
  1409  	return nil
  1410  }
  1411  
  1412  // setJobStatus sets the status of the job by looking up associated evaluations
  1413  // and allocations. evalDelete should be set to true if setJobStatus is being
  1414  // called because an evaluation is being deleted (potentially because of garbage
  1415  // collection). If forceStatus is non-empty, the job's status will be set to the
  1416  // passed status.
  1417  func (s *StateStore) setJobStatus(index uint64, watcher watch.Items, txn *memdb.Txn,
  1418  	job *structs.Job, evalDelete bool, forceStatus string) error {
  1419  
  1420  	// Capture the current status so we can check if there is a change
  1421  	oldStatus := job.Status
  1422  	newStatus := forceStatus
  1423  
  1424  	// If forceStatus is not set, compute the jobs status.
  1425  	if forceStatus == "" {
  1426  		var err error
  1427  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  1428  		if err != nil {
  1429  			return err
  1430  		}
  1431  	}
  1432  
  1433  	// Fast-path if nothing has changed.
  1434  	if oldStatus == newStatus {
  1435  		return nil
  1436  	}
  1437  
  1438  	// The job has changed, so add to watcher.
  1439  	watcher.Add(watch.Item{Table: "jobs"})
  1440  	watcher.Add(watch.Item{Job: job.ID})
  1441  
  1442  	// Copy and update the existing job
  1443  	updated := job.Copy()
  1444  	updated.Status = newStatus
  1445  	updated.ModifyIndex = index
  1446  
  1447  	// Insert the job
  1448  	if err := txn.Insert("jobs", updated); err != nil {
  1449  		return fmt.Errorf("job insert failed: %v", err)
  1450  	}
  1451  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1452  		return fmt.Errorf("index update failed: %v", err)
  1453  	}
  1454  	return nil
  1455  }
  1456  
  1457  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  1458  	allocs, err := txn.Get("allocs", "job", job.ID)
  1459  	if err != nil {
  1460  		return "", err
  1461  	}
  1462  
  1463  	// If there is a non-terminal allocation, the job is running.
  1464  	hasAlloc := false
  1465  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  1466  		hasAlloc = true
  1467  		if !alloc.(*structs.Allocation).TerminalStatus() {
  1468  			return structs.JobStatusRunning, nil
  1469  		}
  1470  	}
  1471  
  1472  	evals, err := txn.Get("evals", "job", job.ID)
  1473  	if err != nil {
  1474  		return "", err
  1475  	}
  1476  
  1477  	hasEval := false
  1478  	for eval := evals.Next(); eval != nil; eval = evals.Next() {
  1479  		hasEval = true
  1480  		if !eval.(*structs.Evaluation).TerminalStatus() {
  1481  			return structs.JobStatusPending, nil
  1482  		}
  1483  	}
  1484  
  1485  	// The job is dead if all the allocations and evals are terminal or if there
  1486  	// are no evals because of garbage collection.
  1487  	if evalDelete || hasEval || hasAlloc {
  1488  		return structs.JobStatusDead, nil
  1489  	}
  1490  
  1491  	// If there are no allocations or evaluations it is a new job. If the job is
  1492  	// periodic, we mark it as running as it will never have an
  1493  	// allocation/evaluation against it.
  1494  	if job.IsPeriodic() {
  1495  		return structs.JobStatusRunning, nil
  1496  	}
  1497  	return structs.JobStatusPending, nil
  1498  }
  1499  
  1500  // updateSummaryWithJob creates or updates job summaries when new jobs are
  1501  // upserted or existing ones are updated
  1502  func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
  1503  	watcher watch.Items, txn *memdb.Txn) error {
  1504  
  1505  	existing, err := s.JobSummaryByID(job.ID)
  1506  	if err != nil {
  1507  		return fmt.Errorf("unable to retrieve summary for job: %v", err)
  1508  	}
  1509  	var hasSummaryChanged bool
  1510  	if existing == nil {
  1511  		existing = &structs.JobSummary{
  1512  			JobID:       job.ID,
  1513  			Summary:     make(map[string]structs.TaskGroupSummary),
  1514  			CreateIndex: index,
  1515  		}
  1516  		hasSummaryChanged = true
  1517  	}
  1518  	for _, tg := range job.TaskGroups {
  1519  		if _, ok := existing.Summary[tg.Name]; !ok {
  1520  			newSummary := structs.TaskGroupSummary{
  1521  				Complete: 0,
  1522  				Failed:   0,
  1523  				Running:  0,
  1524  				Starting: 0,
  1525  			}
  1526  			existing.Summary[tg.Name] = newSummary
  1527  			hasSummaryChanged = true
  1528  		}
  1529  	}
  1530  
  1531  	// The job summary has changed, so add to watcher and update the modify
  1532  	// index.
  1533  	if hasSummaryChanged {
  1534  		existing.ModifyIndex = index
  1535  		watcher.Add(watch.Item{Table: "job_summary"})
  1536  		watcher.Add(watch.Item{JobSummary: job.ID})
  1537  
  1538  		// Update the indexes table for job summary
  1539  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1540  			return fmt.Errorf("index update failed: %v", err)
  1541  		}
  1542  		if err := txn.Insert("job_summary", *existing); err != nil {
  1543  			return err
  1544  		}
  1545  	}
  1546  
  1547  	return nil
  1548  }
  1549  
  1550  // updateSummaryWithAlloc updates the job summary when allocations are updated
  1551  // or inserted
  1552  func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
  1553  	existingAlloc *structs.Allocation, watcher watch.Items, txn *memdb.Txn) error {
  1554  
  1555  	// We don't have to update the summary if the job is missing
  1556  	if alloc.Job == nil {
  1557  		return nil
  1558  	}
  1559  
  1560  	summaryRaw, err := txn.First("job_summary", "id", alloc.JobID)
  1561  	if err != nil {
  1562  		return fmt.Errorf("unable to lookup job summary for job id %q: %v", err)
  1563  	}
  1564  	if summaryRaw == nil {
  1565  		// Check if the job is de-registered
  1566  		rawJob, err := txn.First("jobs", "id", alloc.JobID)
  1567  		if err != nil {
  1568  			return fmt.Errorf("unable to query job: %v", err)
  1569  		}
  1570  
  1571  		// If the job is de-registered then we skip updating it's summary
  1572  		if rawJob == nil {
  1573  			return nil
  1574  		}
  1575  		return fmt.Errorf("job summary for job %q is not present", alloc.JobID)
  1576  	}
  1577  	summary := summaryRaw.(structs.JobSummary)
  1578  	jobSummary := summary.Copy()
  1579  
  1580  	// Not updating the job summary because the allocation doesn't belong to the
  1581  	// currently registered job
  1582  	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
  1583  		return nil
  1584  	}
  1585  
  1586  	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
  1587  	if !ok {
  1588  		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
  1589  	}
  1590  	var summaryChanged bool
  1591  	if existingAlloc == nil {
  1592  		switch alloc.DesiredStatus {
  1593  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
  1594  			s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v",
  1595  				alloc.ID, alloc.DesiredStatus)
  1596  		}
  1597  		switch alloc.ClientStatus {
  1598  		case structs.AllocClientStatusPending:
  1599  			tgSummary.Starting += 1
  1600  			if tgSummary.Queued > 0 {
  1601  				tgSummary.Queued -= 1
  1602  			}
  1603  			summaryChanged = true
  1604  		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
  1605  			structs.AllocClientStatusComplete:
  1606  			s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v",
  1607  				alloc.ID, alloc.ClientStatus)
  1608  		}
  1609  	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
  1610  		// Incrementing the client of the bin of the current state
  1611  		switch alloc.ClientStatus {
  1612  		case structs.AllocClientStatusRunning:
  1613  			tgSummary.Running += 1
  1614  		case structs.AllocClientStatusFailed:
  1615  			tgSummary.Failed += 1
  1616  		case structs.AllocClientStatusPending:
  1617  			tgSummary.Starting += 1
  1618  		case structs.AllocClientStatusComplete:
  1619  			tgSummary.Complete += 1
  1620  		case structs.AllocClientStatusLost:
  1621  			tgSummary.Lost += 1
  1622  		}
  1623  
  1624  		// Decrementing the count of the bin of the last state
  1625  		switch existingAlloc.ClientStatus {
  1626  		case structs.AllocClientStatusRunning:
  1627  			tgSummary.Running -= 1
  1628  		case structs.AllocClientStatusPending:
  1629  			tgSummary.Starting -= 1
  1630  		case structs.AllocClientStatusLost:
  1631  			tgSummary.Lost -= 1
  1632  		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
  1633  		default:
  1634  			s.logger.Printf("[ERR] state_store: invalid old state of allocation with id: %v, and state: %v",
  1635  				existingAlloc.ID, existingAlloc.ClientStatus)
  1636  		}
  1637  		summaryChanged = true
  1638  	}
  1639  	jobSummary.Summary[alloc.TaskGroup] = tgSummary
  1640  
  1641  	if summaryChanged {
  1642  		jobSummary.ModifyIndex = index
  1643  		watcher.Add(watch.Item{Table: "job_summary"})
  1644  		watcher.Add(watch.Item{JobSummary: alloc.JobID})
  1645  
  1646  		// Update the indexes table for job summary
  1647  		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
  1648  			return fmt.Errorf("index update failed: %v", err)
  1649  		}
  1650  
  1651  		if err := txn.Insert("job_summary", *jobSummary); err != nil {
  1652  			return fmt.Errorf("updating job summary failed: %v", err)
  1653  		}
  1654  	}
  1655  
  1656  	return nil
  1657  }
  1658  
  1659  // addEphemeralDiskToTaskGroups adds missing EphemeralDisk objects to TaskGroups
  1660  func (s *StateStore) addEphemeralDiskToTaskGroups(job *structs.Job) {
  1661  	for _, tg := range job.TaskGroups {
  1662  		if tg.EphemeralDisk != nil {
  1663  			continue
  1664  		}
  1665  		var diskMB int
  1666  		for _, task := range tg.Tasks {
  1667  			if task.Resources != nil {
  1668  				diskMB += task.Resources.DiskMB
  1669  				task.Resources.DiskMB = 0
  1670  			}
  1671  		}
  1672  		tg.EphemeralDisk = &structs.EphemeralDisk{
  1673  			SizeMB: diskMB,
  1674  		}
  1675  	}
  1676  }
  1677  
// StateSnapshot is used to provide a point-in-time snapshot. It embeds a
// StateStore, so the StateStore methods are promoted to the snapshot.
type StateSnapshot struct {
	StateStore
}
  1682  
// StateRestore is used to optimize the performance when
// restoring state by only using a single large transaction
// instead of thousands of sub transactions
type StateRestore struct {
	// txn is the transaction every *Restore method inserts into; it is
	// finished by Commit or Abort.
	txn *memdb.Txn
	// watch receives the accumulated items when the restore is committed.
	watch *stateWatch
	// items collects the watch items touched by the restored objects.
	items watch.Items
}
  1691  
  1692  // Abort is used to abort the restore operation
  1693  func (s *StateRestore) Abort() {
  1694  	s.txn.Abort()
  1695  }
  1696  
  1697  // Commit is used to commit the restore operation
  1698  func (s *StateRestore) Commit() {
  1699  	s.txn.Defer(func() { s.watch.notify(s.items) })
  1700  	s.txn.Commit()
  1701  }
  1702  
  1703  // NodeRestore is used to restore a node
  1704  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  1705  	r.items.Add(watch.Item{Table: "nodes"})
  1706  	r.items.Add(watch.Item{Node: node.ID})
  1707  	if err := r.txn.Insert("nodes", node); err != nil {
  1708  		return fmt.Errorf("node insert failed: %v", err)
  1709  	}
  1710  	return nil
  1711  }
  1712  
  1713  // JobRestore is used to restore a job
  1714  func (r *StateRestore) JobRestore(job *structs.Job) error {
  1715  	r.items.Add(watch.Item{Table: "jobs"})
  1716  	r.items.Add(watch.Item{Job: job.ID})
  1717  
  1718  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
  1719  	// COMPAT 0.4.1 -> 0.5
  1720  	r.addEphemeralDiskToTaskGroups(job)
  1721  
  1722  	if err := r.txn.Insert("jobs", job); err != nil {
  1723  		return fmt.Errorf("job insert failed: %v", err)
  1724  	}
  1725  	return nil
  1726  }
  1727  
  1728  // EvalRestore is used to restore an evaluation
  1729  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  1730  	r.items.Add(watch.Item{Table: "evals"})
  1731  	r.items.Add(watch.Item{Eval: eval.ID})
  1732  	if err := r.txn.Insert("evals", eval); err != nil {
  1733  		return fmt.Errorf("eval insert failed: %v", err)
  1734  	}
  1735  	return nil
  1736  }
  1737  
  1738  // AllocRestore is used to restore an allocation
  1739  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  1740  	r.items.Add(watch.Item{Table: "allocs"})
  1741  	r.items.Add(watch.Item{Alloc: alloc.ID})
  1742  	r.items.Add(watch.Item{AllocEval: alloc.EvalID})
  1743  	r.items.Add(watch.Item{AllocJob: alloc.JobID})
  1744  	r.items.Add(watch.Item{AllocNode: alloc.NodeID})
  1745  
  1746  	// Set the shared resources if it's not present
  1747  	// COMPAT 0.4.1 -> 0.5
  1748  	if alloc.SharedResources == nil {
  1749  		alloc.SharedResources = &structs.Resources{
  1750  			DiskMB: alloc.Resources.DiskMB,
  1751  		}
  1752  	}
  1753  
  1754  	// Create the EphemeralDisk if it's nil by adding up DiskMB from task resources.
  1755  	if alloc.Job != nil {
  1756  		r.addEphemeralDiskToTaskGroups(alloc.Job)
  1757  	}
  1758  
  1759  	if err := r.txn.Insert("allocs", alloc); err != nil {
  1760  		return fmt.Errorf("alloc insert failed: %v", err)
  1761  	}
  1762  	return nil
  1763  }
  1764  
  1765  // IndexRestore is used to restore an index
  1766  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  1767  	if err := r.txn.Insert("index", idx); err != nil {
  1768  		return fmt.Errorf("index insert failed: %v", err)
  1769  	}
  1770  	return nil
  1771  }
  1772  
  1773  // PeriodicLaunchRestore is used to restore a periodic launch.
  1774  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  1775  	r.items.Add(watch.Item{Table: "periodic_launch"})
  1776  	r.items.Add(watch.Item{Job: launch.ID})
  1777  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  1778  		return fmt.Errorf("periodic launch insert failed: %v", err)
  1779  	}
  1780  	return nil
  1781  }
  1782  
  1783  // JobSummaryRestore is used to restore a job summary
  1784  func (r *StateRestore) JobSummaryRestore(jobSummary *structs.JobSummary) error {
  1785  	if err := r.txn.Insert("job_summary", *jobSummary); err != nil {
  1786  		return fmt.Errorf("job summary insert failed: %v", err)
  1787  	}
  1788  	return nil
  1789  }
  1790  
  1791  // VaultAccessorRestore is used to restore a vault accessor
  1792  func (r *StateRestore) VaultAccessorRestore(accessor *structs.VaultAccessor) error {
  1793  	if err := r.txn.Insert("vault_accessors", accessor); err != nil {
  1794  		return fmt.Errorf("vault accessor insert failed: %v", err)
  1795  	}
  1796  	return nil
  1797  }
  1798  
  1799  // addEphemeralDiskToTaskGroups adds missing EphemeralDisk objects to TaskGroups
  1800  func (r *StateRestore) addEphemeralDiskToTaskGroups(job *structs.Job) {
  1801  	for _, tg := range job.TaskGroups {
  1802  		if tg.EphemeralDisk != nil {
  1803  			continue
  1804  		}
  1805  		var sizeMB int
  1806  		for _, task := range tg.Tasks {
  1807  			if task.Resources != nil {
  1808  				sizeMB += task.Resources.DiskMB
  1809  				task.Resources.DiskMB = 0
  1810  			}
  1811  		}
  1812  		tg.EphemeralDisk = &structs.EphemeralDisk{
  1813  			SizeMB: sizeMB,
  1814  		}
  1815  	}
  1816  }
  1817  
  1818  // stateWatch holds shared state for watching updates. This is
  1819  // outside of StateStore so it can be shared with snapshots.
  1820  type stateWatch struct {
  1821  	items map[watch.Item]*NotifyGroup
  1822  	l     sync.Mutex
  1823  }
  1824  
  1825  // newStateWatch creates a new stateWatch for change notification.
  1826  func newStateWatch() *stateWatch {
  1827  	return &stateWatch{
  1828  		items: make(map[watch.Item]*NotifyGroup),
  1829  	}
  1830  }
  1831  
  1832  // watch subscribes a channel to the given watch items.
  1833  func (w *stateWatch) watch(items watch.Items, ch chan struct{}) {
  1834  	w.l.Lock()
  1835  	defer w.l.Unlock()
  1836  
  1837  	for item, _ := range items {
  1838  		grp, ok := w.items[item]
  1839  		if !ok {
  1840  			grp = new(NotifyGroup)
  1841  			w.items[item] = grp
  1842  		}
  1843  		grp.Wait(ch)
  1844  	}
  1845  }
  1846  
  1847  // stopWatch unsubscribes a channel from the given watch items.
  1848  func (w *stateWatch) stopWatch(items watch.Items, ch chan struct{}) {
  1849  	w.l.Lock()
  1850  	defer w.l.Unlock()
  1851  
  1852  	for item, _ := range items {
  1853  		if grp, ok := w.items[item]; ok {
  1854  			grp.Clear(ch)
  1855  			if grp.Empty() {
  1856  				delete(w.items, item)
  1857  			}
  1858  		}
  1859  	}
  1860  }
  1861  
  1862  // notify is used to fire notifications on the given watch items.
  1863  func (w *stateWatch) notify(items watch.Items) {
  1864  	w.l.Lock()
  1865  	defer w.l.Unlock()
  1866  
  1867  	for wi, _ := range items {
  1868  		if grp, ok := w.items[wi]; ok {
  1869  			grp.Notify()
  1870  		}
  1871  	}
  1872  }