github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/nomad/state/state_store.go (about)

     1  package state
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/nomad/nomad/watch"
    12  )
    13  
// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
type IndexEntry struct {
	// Key is the name of the table the entry refers to (for example "nodes").
	Key   string
	// Value is the index most recently recorded for that table.
	Value uint64
}
    20  
// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	// logger is the logger built from the writer handed to NewStateStore.
	logger *log.Logger
	// db is the in-memory database that holds every table.
	db     *memdb.MemDB
	// watch tracks watch subscriptions and is notified once a write
	// transaction has committed.
	watch  *stateWatch
}
    33  
    34  // NewStateStore is used to create a new state store
    35  func NewStateStore(logOutput io.Writer) (*StateStore, error) {
    36  	// Create the MemDB
    37  	db, err := memdb.NewMemDB(stateStoreSchema())
    38  	if err != nil {
    39  		return nil, fmt.Errorf("state store setup failed: %v", err)
    40  	}
    41  
    42  	// Create the state store
    43  	s := &StateStore{
    44  		logger: log.New(logOutput, "", log.LstdFlags),
    45  		db:     db,
    46  		watch:  newStateWatch(),
    47  	}
    48  	return s, nil
    49  }
    50  
    51  // Snapshot is used to create a point in time snapshot. Because
    52  // we use MemDB, we just need to snapshot the state of the underlying
    53  // database.
    54  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    55  	snap := &StateSnapshot{
    56  		StateStore: StateStore{
    57  			logger: s.logger,
    58  			db:     s.db.Snapshot(),
    59  			watch:  s.watch,
    60  		},
    61  	}
    62  	return snap, nil
    63  }
    64  
    65  // Restore is used to optimize the efficiency of rebuilding
    66  // state by minimizing the number of transactions and checking
    67  // overhead.
    68  func (s *StateStore) Restore() (*StateRestore, error) {
    69  	txn := s.db.Txn(true)
    70  	r := &StateRestore{
    71  		txn:   txn,
    72  		watch: s.watch,
    73  		items: watch.NewItems(),
    74  	}
    75  	return r, nil
    76  }
    77  
    78  // Watch subscribes a channel to a set of watch items.
    79  func (s *StateStore) Watch(items watch.Items, notify chan struct{}) {
    80  	s.watch.watch(items, notify)
    81  }
    82  
    83  // StopWatch unsubscribes a channel from a set of watch items.
    84  func (s *StateStore) StopWatch(items watch.Items, notify chan struct{}) {
    85  	s.watch.stopWatch(items, notify)
    86  }
    87  
    88  // UpsertNode is used to register a node or update a node definition
    89  // This is assumed to be triggered by the client, so we retain the value
    90  // of drain which is set by the scheduler.
    91  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
    92  	txn := s.db.Txn(true)
    93  	defer txn.Abort()
    94  
    95  	watcher := watch.NewItems()
    96  	watcher.Add(watch.Item{Table: "nodes"})
    97  	watcher.Add(watch.Item{Node: node.ID})
    98  
    99  	// Check if the node already exists
   100  	existing, err := txn.First("nodes", "id", node.ID)
   101  	if err != nil {
   102  		return fmt.Errorf("node lookup failed: %v", err)
   103  	}
   104  
   105  	// Setup the indexes correctly
   106  	if existing != nil {
   107  		exist := existing.(*structs.Node)
   108  		node.CreateIndex = exist.CreateIndex
   109  		node.ModifyIndex = index
   110  		node.Drain = exist.Drain // Retain the drain mode
   111  	} else {
   112  		node.CreateIndex = index
   113  		node.ModifyIndex = index
   114  	}
   115  
   116  	// Insert the node
   117  	if err := txn.Insert("nodes", node); err != nil {
   118  		return fmt.Errorf("node insert failed: %v", err)
   119  	}
   120  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   121  		return fmt.Errorf("index update failed: %v", err)
   122  	}
   123  
   124  	txn.Defer(func() { s.watch.notify(watcher) })
   125  	txn.Commit()
   126  	return nil
   127  }
   128  
   129  // DeleteNode is used to deregister a node
   130  func (s *StateStore) DeleteNode(index uint64, nodeID string) error {
   131  	txn := s.db.Txn(true)
   132  	defer txn.Abort()
   133  
   134  	// Lookup the node
   135  	existing, err := txn.First("nodes", "id", nodeID)
   136  	if err != nil {
   137  		return fmt.Errorf("node lookup failed: %v", err)
   138  	}
   139  	if existing == nil {
   140  		return fmt.Errorf("node not found")
   141  	}
   142  
   143  	watcher := watch.NewItems()
   144  	watcher.Add(watch.Item{Table: "nodes"})
   145  	watcher.Add(watch.Item{Node: nodeID})
   146  
   147  	// Delete the node
   148  	if err := txn.Delete("nodes", existing); err != nil {
   149  		return fmt.Errorf("node delete failed: %v", err)
   150  	}
   151  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   152  		return fmt.Errorf("index update failed: %v", err)
   153  	}
   154  
   155  	txn.Defer(func() { s.watch.notify(watcher) })
   156  	txn.Commit()
   157  	return nil
   158  }
   159  
   160  // UpdateNodeStatus is used to update the status of a node
   161  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error {
   162  	txn := s.db.Txn(true)
   163  	defer txn.Abort()
   164  
   165  	watcher := watch.NewItems()
   166  	watcher.Add(watch.Item{Table: "nodes"})
   167  	watcher.Add(watch.Item{Node: nodeID})
   168  
   169  	// Lookup the node
   170  	existing, err := txn.First("nodes", "id", nodeID)
   171  	if err != nil {
   172  		return fmt.Errorf("node lookup failed: %v", err)
   173  	}
   174  	if existing == nil {
   175  		return fmt.Errorf("node not found")
   176  	}
   177  
   178  	// Copy the existing node
   179  	existingNode := existing.(*structs.Node)
   180  	copyNode := new(structs.Node)
   181  	*copyNode = *existingNode
   182  
   183  	// Update the status in the copy
   184  	copyNode.Status = status
   185  	copyNode.ModifyIndex = index
   186  
   187  	// Insert the node
   188  	if err := txn.Insert("nodes", copyNode); err != nil {
   189  		return fmt.Errorf("node update failed: %v", err)
   190  	}
   191  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   192  		return fmt.Errorf("index update failed: %v", err)
   193  	}
   194  
   195  	txn.Defer(func() { s.watch.notify(watcher) })
   196  	txn.Commit()
   197  	return nil
   198  }
   199  
   200  // UpdateNodeDrain is used to update the drain of a node
   201  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error {
   202  	txn := s.db.Txn(true)
   203  	defer txn.Abort()
   204  
   205  	watcher := watch.NewItems()
   206  	watcher.Add(watch.Item{Table: "nodes"})
   207  	watcher.Add(watch.Item{Node: nodeID})
   208  
   209  	// Lookup the node
   210  	existing, err := txn.First("nodes", "id", nodeID)
   211  	if err != nil {
   212  		return fmt.Errorf("node lookup failed: %v", err)
   213  	}
   214  	if existing == nil {
   215  		return fmt.Errorf("node not found")
   216  	}
   217  
   218  	// Copy the existing node
   219  	existingNode := existing.(*structs.Node)
   220  	copyNode := new(structs.Node)
   221  	*copyNode = *existingNode
   222  
   223  	// Update the drain in the copy
   224  	copyNode.Drain = drain
   225  	copyNode.ModifyIndex = index
   226  
   227  	// Insert the node
   228  	if err := txn.Insert("nodes", copyNode); err != nil {
   229  		return fmt.Errorf("node update failed: %v", err)
   230  	}
   231  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   232  		return fmt.Errorf("index update failed: %v", err)
   233  	}
   234  
   235  	txn.Defer(func() { s.watch.notify(watcher) })
   236  	txn.Commit()
   237  	return nil
   238  }
   239  
   240  // NodeByID is used to lookup a node by ID
   241  func (s *StateStore) NodeByID(nodeID string) (*structs.Node, error) {
   242  	txn := s.db.Txn(false)
   243  
   244  	existing, err := txn.First("nodes", "id", nodeID)
   245  	if err != nil {
   246  		return nil, fmt.Errorf("node lookup failed: %v", err)
   247  	}
   248  
   249  	if existing != nil {
   250  		return existing.(*structs.Node), nil
   251  	}
   252  	return nil, nil
   253  }
   254  
   255  // NodesByIDPrefix is used to lookup nodes by prefix
   256  func (s *StateStore) NodesByIDPrefix(nodeID string) (memdb.ResultIterator, error) {
   257  	txn := s.db.Txn(false)
   258  
   259  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
   260  	if err != nil {
   261  		return nil, fmt.Errorf("node lookup failed: %v", err)
   262  	}
   263  
   264  	return iter, nil
   265  }
   266  
   267  // Nodes returns an iterator over all the nodes
   268  func (s *StateStore) Nodes() (memdb.ResultIterator, error) {
   269  	txn := s.db.Txn(false)
   270  
   271  	// Walk the entire nodes table
   272  	iter, err := txn.Get("nodes", "id")
   273  	if err != nil {
   274  		return nil, err
   275  	}
   276  	return iter, nil
   277  }
   278  
   279  // UpsertJob is used to register a job or update a job definition
   280  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
   281  	txn := s.db.Txn(true)
   282  	defer txn.Abort()
   283  
   284  	watcher := watch.NewItems()
   285  	watcher.Add(watch.Item{Table: "jobs"})
   286  	watcher.Add(watch.Item{Job: job.ID})
   287  
   288  	// Check if the job already exists
   289  	existing, err := txn.First("jobs", "id", job.ID)
   290  	if err != nil {
   291  		return fmt.Errorf("job lookup failed: %v", err)
   292  	}
   293  
   294  	// Setup the indexes correctly
   295  	if existing != nil {
   296  		job.CreateIndex = existing.(*structs.Job).CreateIndex
   297  		job.ModifyIndex = index
   298  		job.JobModifyIndex = index
   299  
   300  		// Compute the job status
   301  		var err error
   302  		job.Status, err = s.getJobStatus(txn, job, false)
   303  		if err != nil {
   304  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
   305  		}
   306  	} else {
   307  		job.CreateIndex = index
   308  		job.ModifyIndex = index
   309  		job.JobModifyIndex = index
   310  
   311  		// If we are inserting the job for the first time, we don't need to
   312  		// calculate the jobs status as it is known.
   313  		if job.IsPeriodic() {
   314  			job.Status = structs.JobStatusRunning
   315  		} else {
   316  			job.Status = structs.JobStatusPending
   317  		}
   318  	}
   319  
   320  	// Insert the job
   321  	if err := txn.Insert("jobs", job); err != nil {
   322  		return fmt.Errorf("job insert failed: %v", err)
   323  	}
   324  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   325  		return fmt.Errorf("index update failed: %v", err)
   326  	}
   327  
   328  	txn.Defer(func() { s.watch.notify(watcher) })
   329  	txn.Commit()
   330  	return nil
   331  }
   332  
   333  // DeleteJob is used to deregister a job
   334  func (s *StateStore) DeleteJob(index uint64, jobID string) error {
   335  	txn := s.db.Txn(true)
   336  	defer txn.Abort()
   337  
   338  	// Lookup the node
   339  	existing, err := txn.First("jobs", "id", jobID)
   340  	if err != nil {
   341  		return fmt.Errorf("job lookup failed: %v", err)
   342  	}
   343  	if existing == nil {
   344  		return fmt.Errorf("job not found")
   345  	}
   346  
   347  	watcher := watch.NewItems()
   348  	watcher.Add(watch.Item{Table: "jobs"})
   349  	watcher.Add(watch.Item{Job: jobID})
   350  
   351  	// Delete the node
   352  	if err := txn.Delete("jobs", existing); err != nil {
   353  		return fmt.Errorf("job delete failed: %v", err)
   354  	}
   355  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   356  		return fmt.Errorf("index update failed: %v", err)
   357  	}
   358  
   359  	txn.Defer(func() { s.watch.notify(watcher) })
   360  	txn.Commit()
   361  	return nil
   362  }
   363  
   364  // JobByID is used to lookup a job by its ID
   365  func (s *StateStore) JobByID(id string) (*structs.Job, error) {
   366  	txn := s.db.Txn(false)
   367  
   368  	existing, err := txn.First("jobs", "id", id)
   369  	if err != nil {
   370  		return nil, fmt.Errorf("job lookup failed: %v", err)
   371  	}
   372  
   373  	if existing != nil {
   374  		return existing.(*structs.Job), nil
   375  	}
   376  	return nil, nil
   377  }
   378  
   379  // JobsByIDPrefix is used to lookup a job by prefix
   380  func (s *StateStore) JobsByIDPrefix(id string) (memdb.ResultIterator, error) {
   381  	txn := s.db.Txn(false)
   382  
   383  	iter, err := txn.Get("jobs", "id_prefix", id)
   384  	if err != nil {
   385  		return nil, fmt.Errorf("job lookup failed: %v", err)
   386  	}
   387  
   388  	return iter, nil
   389  }
   390  
   391  // Jobs returns an iterator over all the jobs
   392  func (s *StateStore) Jobs() (memdb.ResultIterator, error) {
   393  	txn := s.db.Txn(false)
   394  
   395  	// Walk the entire jobs table
   396  	iter, err := txn.Get("jobs", "id")
   397  	if err != nil {
   398  		return nil, err
   399  	}
   400  	return iter, nil
   401  }
   402  
   403  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
   404  func (s *StateStore) JobsByPeriodic(periodic bool) (memdb.ResultIterator, error) {
   405  	txn := s.db.Txn(false)
   406  
   407  	iter, err := txn.Get("jobs", "periodic", periodic)
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	return iter, nil
   412  }
   413  
   414  // JobsByScheduler returns an iterator over all the jobs with the specific
   415  // scheduler type.
   416  func (s *StateStore) JobsByScheduler(schedulerType string) (memdb.ResultIterator, error) {
   417  	txn := s.db.Txn(false)
   418  
   419  	// Return an iterator for jobs with the specific type.
   420  	iter, err := txn.Get("jobs", "type", schedulerType)
   421  	if err != nil {
   422  		return nil, err
   423  	}
   424  	return iter, nil
   425  }
   426  
   427  // JobsByGC returns an iterator over all jobs eligible or uneligible for garbage
   428  // collection.
   429  func (s *StateStore) JobsByGC(gc bool) (memdb.ResultIterator, error) {
   430  	txn := s.db.Txn(false)
   431  
   432  	iter, err := txn.Get("jobs", "gc", gc)
   433  	if err != nil {
   434  		return nil, err
   435  	}
   436  	return iter, nil
   437  }
   438  
   439  // UpsertPeriodicLaunch is used to register a launch or update it.
   440  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
   441  	txn := s.db.Txn(true)
   442  	defer txn.Abort()
   443  
   444  	watcher := watch.NewItems()
   445  	watcher.Add(watch.Item{Table: "periodic_launch"})
   446  	watcher.Add(watch.Item{Job: launch.ID})
   447  
   448  	// Check if the job already exists
   449  	existing, err := txn.First("periodic_launch", "id", launch.ID)
   450  	if err != nil {
   451  		return fmt.Errorf("periodic launch lookup failed: %v", err)
   452  	}
   453  
   454  	// Setup the indexes correctly
   455  	if existing != nil {
   456  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
   457  		launch.ModifyIndex = index
   458  	} else {
   459  		launch.CreateIndex = index
   460  		launch.ModifyIndex = index
   461  	}
   462  
   463  	// Insert the job
   464  	if err := txn.Insert("periodic_launch", launch); err != nil {
   465  		return fmt.Errorf("launch insert failed: %v", err)
   466  	}
   467  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   468  		return fmt.Errorf("index update failed: %v", err)
   469  	}
   470  
   471  	txn.Defer(func() { s.watch.notify(watcher) })
   472  	txn.Commit()
   473  	return nil
   474  }
   475  
   476  // DeletePeriodicLaunch is used to delete the periodic launch
   477  func (s *StateStore) DeletePeriodicLaunch(index uint64, jobID string) error {
   478  	txn := s.db.Txn(true)
   479  	defer txn.Abort()
   480  
   481  	// Lookup the launch
   482  	existing, err := txn.First("periodic_launch", "id", jobID)
   483  	if err != nil {
   484  		return fmt.Errorf("launch lookup failed: %v", err)
   485  	}
   486  	if existing == nil {
   487  		return fmt.Errorf("launch not found")
   488  	}
   489  
   490  	watcher := watch.NewItems()
   491  	watcher.Add(watch.Item{Table: "periodic_launch"})
   492  	watcher.Add(watch.Item{Job: jobID})
   493  
   494  	// Delete the launch
   495  	if err := txn.Delete("periodic_launch", existing); err != nil {
   496  		return fmt.Errorf("launch delete failed: %v", err)
   497  	}
   498  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   499  		return fmt.Errorf("index update failed: %v", err)
   500  	}
   501  
   502  	txn.Defer(func() { s.watch.notify(watcher) })
   503  	txn.Commit()
   504  	return nil
   505  }
   506  
   507  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
   508  // ID.
   509  func (s *StateStore) PeriodicLaunchByID(id string) (*structs.PeriodicLaunch, error) {
   510  	txn := s.db.Txn(false)
   511  
   512  	existing, err := txn.First("periodic_launch", "id", id)
   513  	if err != nil {
   514  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
   515  	}
   516  
   517  	if existing != nil {
   518  		return existing.(*structs.PeriodicLaunch), nil
   519  	}
   520  	return nil, nil
   521  }
   522  
   523  // PeriodicLaunches returns an iterator over all the periodic launches
   524  func (s *StateStore) PeriodicLaunches() (memdb.ResultIterator, error) {
   525  	txn := s.db.Txn(false)
   526  
   527  	// Walk the entire table
   528  	iter, err := txn.Get("periodic_launch", "id")
   529  	if err != nil {
   530  		return nil, err
   531  	}
   532  	return iter, nil
   533  }
   534  
   535  // UpsertEvaluation is used to upsert an evaluation
   536  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
   537  	txn := s.db.Txn(true)
   538  	defer txn.Abort()
   539  
   540  	watcher := watch.NewItems()
   541  	watcher.Add(watch.Item{Table: "evals"})
   542  
   543  	// Do a nested upsert
   544  	jobs := make(map[string]string, len(evals))
   545  	for _, eval := range evals {
   546  		watcher.Add(watch.Item{Eval: eval.ID})
   547  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   548  			return err
   549  		}
   550  
   551  		jobs[eval.JobID] = ""
   552  	}
   553  
   554  	// Set the job's status
   555  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   556  		return fmt.Errorf("setting job status failed: %v", err)
   557  	}
   558  
   559  	txn.Defer(func() { s.watch.notify(watcher) })
   560  	txn.Commit()
   561  	return nil
   562  }
   563  
   564  // nestedUpsertEvaluation is used to nest an evaluation upsert within a transaction
   565  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
   566  	// Lookup the evaluation
   567  	existing, err := txn.First("evals", "id", eval.ID)
   568  	if err != nil {
   569  		return fmt.Errorf("eval lookup failed: %v", err)
   570  	}
   571  
   572  	// Update the indexes
   573  	if existing != nil {
   574  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
   575  		eval.ModifyIndex = index
   576  	} else {
   577  		eval.CreateIndex = index
   578  		eval.ModifyIndex = index
   579  	}
   580  
   581  	// Insert the eval
   582  	if err := txn.Insert("evals", eval); err != nil {
   583  		return fmt.Errorf("eval insert failed: %v", err)
   584  	}
   585  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   586  		return fmt.Errorf("index update failed: %v", err)
   587  	}
   588  	return nil
   589  }
   590  
   591  // DeleteEval is used to delete an evaluation
   592  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
   593  	txn := s.db.Txn(true)
   594  	defer txn.Abort()
   595  	watcher := watch.NewItems()
   596  	watcher.Add(watch.Item{Table: "evals"})
   597  	watcher.Add(watch.Item{Table: "allocs"})
   598  
   599  	jobs := make(map[string]string, len(evals))
   600  	for _, eval := range evals {
   601  		existing, err := txn.First("evals", "id", eval)
   602  		if err != nil {
   603  			return fmt.Errorf("eval lookup failed: %v", err)
   604  		}
   605  		if existing == nil {
   606  			continue
   607  		}
   608  		if err := txn.Delete("evals", existing); err != nil {
   609  			return fmt.Errorf("eval delete failed: %v", err)
   610  		}
   611  		watcher.Add(watch.Item{Eval: eval})
   612  		jobs[existing.(*structs.Evaluation).JobID] = ""
   613  	}
   614  
   615  	for _, alloc := range allocs {
   616  		existing, err := txn.First("allocs", "id", alloc)
   617  		if err != nil {
   618  			return fmt.Errorf("alloc lookup failed: %v", err)
   619  		}
   620  		if existing == nil {
   621  			continue
   622  		}
   623  		if err := txn.Delete("allocs", existing); err != nil {
   624  			return fmt.Errorf("alloc delete failed: %v", err)
   625  		}
   626  		realAlloc := existing.(*structs.Allocation)
   627  		watcher.Add(watch.Item{Alloc: realAlloc.ID})
   628  		watcher.Add(watch.Item{AllocEval: realAlloc.EvalID})
   629  		watcher.Add(watch.Item{AllocJob: realAlloc.JobID})
   630  		watcher.Add(watch.Item{AllocNode: realAlloc.NodeID})
   631  	}
   632  
   633  	// Update the indexes
   634  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   635  		return fmt.Errorf("index update failed: %v", err)
   636  	}
   637  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   638  		return fmt.Errorf("index update failed: %v", err)
   639  	}
   640  
   641  	// Set the job's status
   642  	if err := s.setJobStatuses(index, watcher, txn, jobs, true); err != nil {
   643  		return fmt.Errorf("setting job status failed: %v", err)
   644  	}
   645  
   646  	txn.Defer(func() { s.watch.notify(watcher) })
   647  	txn.Commit()
   648  	return nil
   649  }
   650  
   651  // EvalByID is used to lookup an eval by its ID
   652  func (s *StateStore) EvalByID(id string) (*structs.Evaluation, error) {
   653  	txn := s.db.Txn(false)
   654  
   655  	existing, err := txn.First("evals", "id", id)
   656  	if err != nil {
   657  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   658  	}
   659  
   660  	if existing != nil {
   661  		return existing.(*structs.Evaluation), nil
   662  	}
   663  	return nil, nil
   664  }
   665  
   666  // EvalsByIDPrefix is used to lookup evaluations by prefix
   667  func (s *StateStore) EvalsByIDPrefix(id string) (memdb.ResultIterator, error) {
   668  	txn := s.db.Txn(false)
   669  
   670  	iter, err := txn.Get("evals", "id_prefix", id)
   671  	if err != nil {
   672  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   673  	}
   674  
   675  	return iter, nil
   676  }
   677  
   678  // EvalsByJob returns all the evaluations by job id
   679  func (s *StateStore) EvalsByJob(jobID string) ([]*structs.Evaluation, error) {
   680  	txn := s.db.Txn(false)
   681  
   682  	// Get an iterator over the node allocations
   683  	iter, err := txn.Get("evals", "job", jobID)
   684  	if err != nil {
   685  		return nil, err
   686  	}
   687  
   688  	var out []*structs.Evaluation
   689  	for {
   690  		raw := iter.Next()
   691  		if raw == nil {
   692  			break
   693  		}
   694  		out = append(out, raw.(*structs.Evaluation))
   695  	}
   696  	return out, nil
   697  }
   698  
   699  // Evals returns an iterator over all the evaluations
   700  func (s *StateStore) Evals() (memdb.ResultIterator, error) {
   701  	txn := s.db.Txn(false)
   702  
   703  	// Walk the entire table
   704  	iter, err := txn.Get("evals", "id")
   705  	if err != nil {
   706  		return nil, err
   707  	}
   708  	return iter, nil
   709  }
   710  
   711  // UpdateAllocFromClient is used to update an allocation based on input
   712  // from a client. While the schedulers are the authority on the allocation for
   713  // most things, some updates are authoritative from the client. Specifically,
   714  // the desired state comes from the schedulers, while the actual state comes
   715  // from clients.
   716  func (s *StateStore) UpdateAllocsFromClient(index uint64, allocs []*structs.Allocation) error {
   717  	txn := s.db.Txn(true)
   718  	defer txn.Abort()
   719  
   720  	// Setup the watcher
   721  	watcher := watch.NewItems()
   722  	watcher.Add(watch.Item{Table: "allocs"})
   723  
   724  	// Handle each of the updated allocations
   725  	for _, alloc := range allocs {
   726  		if err := s.nestedUpdateAllocFromClient(txn, watcher, index, alloc); err != nil {
   727  			return err
   728  		}
   729  	}
   730  
   731  	// Update the indexes
   732  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   733  		return fmt.Errorf("index update failed: %v", err)
   734  	}
   735  
   736  	txn.Defer(func() { s.watch.notify(watcher) })
   737  	txn.Commit()
   738  	return nil
   739  }
   740  
   741  // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status
   742  func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, watcher watch.Items, index uint64, alloc *structs.Allocation) error {
   743  	// Look for existing alloc
   744  	existing, err := txn.First("allocs", "id", alloc.ID)
   745  	if err != nil {
   746  		return fmt.Errorf("alloc lookup failed: %v", err)
   747  	}
   748  
   749  	// Nothing to do if this does not exist
   750  	if existing == nil {
   751  		return nil
   752  	}
   753  	exist := existing.(*structs.Allocation)
   754  
   755  	// Trigger the watcher
   756  	watcher.Add(watch.Item{Alloc: alloc.ID})
   757  	watcher.Add(watch.Item{AllocEval: exist.EvalID})
   758  	watcher.Add(watch.Item{AllocJob: exist.JobID})
   759  	watcher.Add(watch.Item{AllocNode: exist.NodeID})
   760  
   761  	// Copy everything from the existing allocation
   762  	copyAlloc := new(structs.Allocation)
   763  	*copyAlloc = *exist
   764  
   765  	// Pull in anything the client is the authority on
   766  	copyAlloc.ClientStatus = alloc.ClientStatus
   767  	copyAlloc.ClientDescription = alloc.ClientDescription
   768  	copyAlloc.TaskStates = alloc.TaskStates
   769  
   770  	// Update the modify index
   771  	copyAlloc.ModifyIndex = index
   772  
   773  	// Update the allocation
   774  	if err := txn.Insert("allocs", copyAlloc); err != nil {
   775  		return fmt.Errorf("alloc insert failed: %v", err)
   776  	}
   777  
   778  	// Set the job's status
   779  	forceStatus := ""
   780  	if !copyAlloc.TerminalStatus() {
   781  		forceStatus = structs.JobStatusRunning
   782  	}
   783  	jobs := map[string]string{exist.JobID: forceStatus}
   784  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   785  		return fmt.Errorf("setting job status failed: %v", err)
   786  	}
   787  	return nil
   788  }
   789  
   790  // UpsertAllocs is used to evict a set of allocations
   791  // and allocate new ones at the same time.
   792  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
   793  	txn := s.db.Txn(true)
   794  	defer txn.Abort()
   795  
   796  	watcher := watch.NewItems()
   797  	watcher.Add(watch.Item{Table: "allocs"})
   798  
   799  	// Handle the allocations
   800  	jobs := make(map[string]string, 1)
   801  	for _, alloc := range allocs {
   802  		existing, err := txn.First("allocs", "id", alloc.ID)
   803  		if err != nil {
   804  			return fmt.Errorf("alloc lookup failed: %v", err)
   805  		}
   806  
   807  		if existing == nil {
   808  			alloc.CreateIndex = index
   809  			alloc.ModifyIndex = index
   810  			alloc.AllocModifyIndex = index
   811  		} else {
   812  			exist := existing.(*structs.Allocation)
   813  			alloc.CreateIndex = exist.CreateIndex
   814  			alloc.ModifyIndex = index
   815  			alloc.AllocModifyIndex = index
   816  			alloc.ClientStatus = exist.ClientStatus
   817  			alloc.ClientDescription = exist.ClientDescription
   818  		}
   819  		if err := txn.Insert("allocs", alloc); err != nil {
   820  			return fmt.Errorf("alloc insert failed: %v", err)
   821  		}
   822  
   823  		// If the allocation is running, force the job to running status.
   824  		forceStatus := ""
   825  		if !alloc.TerminalStatus() {
   826  			forceStatus = structs.JobStatusRunning
   827  		}
   828  		jobs[alloc.JobID] = forceStatus
   829  
   830  		watcher.Add(watch.Item{Alloc: alloc.ID})
   831  		watcher.Add(watch.Item{AllocEval: alloc.EvalID})
   832  		watcher.Add(watch.Item{AllocJob: alloc.JobID})
   833  		watcher.Add(watch.Item{AllocNode: alloc.NodeID})
   834  	}
   835  
   836  	// Update the indexes
   837  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   838  		return fmt.Errorf("index update failed: %v", err)
   839  	}
   840  
   841  	// Set the job's status
   842  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   843  		return fmt.Errorf("setting job status failed: %v", err)
   844  	}
   845  
   846  	txn.Defer(func() { s.watch.notify(watcher) })
   847  	txn.Commit()
   848  	return nil
   849  }
   850  
   851  // AllocByID is used to lookup an allocation by its ID
   852  func (s *StateStore) AllocByID(id string) (*structs.Allocation, error) {
   853  	txn := s.db.Txn(false)
   854  
   855  	existing, err := txn.First("allocs", "id", id)
   856  	if err != nil {
   857  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
   858  	}
   859  
   860  	if existing != nil {
   861  		return existing.(*structs.Allocation), nil
   862  	}
   863  	return nil, nil
   864  }
   865  
   866  // AllocsByIDPrefix is used to lookup allocs by prefix
   867  func (s *StateStore) AllocsByIDPrefix(id string) (memdb.ResultIterator, error) {
   868  	txn := s.db.Txn(false)
   869  
   870  	iter, err := txn.Get("allocs", "id_prefix", id)
   871  	if err != nil {
   872  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
   873  	}
   874  
   875  	return iter, nil
   876  }
   877  
   878  // AllocsByNode returns all the allocations by node
   879  func (s *StateStore) AllocsByNode(node string) ([]*structs.Allocation, error) {
   880  	txn := s.db.Txn(false)
   881  
   882  	// Get an iterator over the node allocations, using only the
   883  	// node prefix which ignores the terminal status
   884  	iter, err := txn.Get("allocs", "node_prefix", node)
   885  	if err != nil {
   886  		return nil, err
   887  	}
   888  
   889  	var out []*structs.Allocation
   890  	for {
   891  		raw := iter.Next()
   892  		if raw == nil {
   893  			break
   894  		}
   895  		out = append(out, raw.(*structs.Allocation))
   896  	}
   897  	return out, nil
   898  }
   899  
   900  // AllocsByNode returns all the allocations by node and terminal status
   901  func (s *StateStore) AllocsByNodeTerminal(node string, terminal bool) ([]*structs.Allocation, error) {
   902  	txn := s.db.Txn(false)
   903  
   904  	// Get an iterator over the node allocations
   905  	iter, err := txn.Get("allocs", "node", node, terminal)
   906  	if err != nil {
   907  		return nil, err
   908  	}
   909  
   910  	var out []*structs.Allocation
   911  	for {
   912  		raw := iter.Next()
   913  		if raw == nil {
   914  			break
   915  		}
   916  		out = append(out, raw.(*structs.Allocation))
   917  	}
   918  	return out, nil
   919  }
   920  
   921  // AllocsByJob returns all the allocations by job id
   922  func (s *StateStore) AllocsByJob(jobID string) ([]*structs.Allocation, error) {
   923  	txn := s.db.Txn(false)
   924  
   925  	// Get an iterator over the node allocations
   926  	iter, err := txn.Get("allocs", "job", jobID)
   927  	if err != nil {
   928  		return nil, err
   929  	}
   930  
   931  	var out []*structs.Allocation
   932  	for {
   933  		raw := iter.Next()
   934  		if raw == nil {
   935  			break
   936  		}
   937  		out = append(out, raw.(*structs.Allocation))
   938  	}
   939  	return out, nil
   940  }
   941  
   942  // AllocsByEval returns all the allocations by eval id
   943  func (s *StateStore) AllocsByEval(evalID string) ([]*structs.Allocation, error) {
   944  	txn := s.db.Txn(false)
   945  
   946  	// Get an iterator over the eval allocations
   947  	iter, err := txn.Get("allocs", "eval", evalID)
   948  	if err != nil {
   949  		return nil, err
   950  	}
   951  
   952  	var out []*structs.Allocation
   953  	for {
   954  		raw := iter.Next()
   955  		if raw == nil {
   956  			break
   957  		}
   958  		out = append(out, raw.(*structs.Allocation))
   959  	}
   960  	return out, nil
   961  }
   962  
   963  // Allocs returns an iterator over all the evaluations
   964  func (s *StateStore) Allocs() (memdb.ResultIterator, error) {
   965  	txn := s.db.Txn(false)
   966  
   967  	// Walk the entire table
   968  	iter, err := txn.Get("allocs", "id")
   969  	if err != nil {
   970  		return nil, err
   971  	}
   972  	return iter, nil
   973  }
   974  
   975  // LastIndex returns the greatest index value for all indexes
   976  func (s *StateStore) LatestIndex() (uint64, error) {
   977  	indexes, err := s.Indexes()
   978  	if err != nil {
   979  		return 0, err
   980  	}
   981  
   982  	var max uint64 = 0
   983  	for {
   984  		raw := indexes.Next()
   985  		if raw == nil {
   986  			break
   987  		}
   988  
   989  		// Prepare the request struct
   990  		idx := raw.(*IndexEntry)
   991  
   992  		// Determine the max
   993  		if idx.Value > max {
   994  			max = idx.Value
   995  		}
   996  	}
   997  
   998  	return max, nil
   999  }
  1000  
  1001  // Index finds the matching index value
  1002  func (s *StateStore) Index(name string) (uint64, error) {
  1003  	txn := s.db.Txn(false)
  1004  
  1005  	// Lookup the first matching index
  1006  	out, err := txn.First("index", "id", name)
  1007  	if err != nil {
  1008  		return 0, err
  1009  	}
  1010  	if out == nil {
  1011  		return 0, nil
  1012  	}
  1013  	return out.(*IndexEntry).Value, nil
  1014  }
  1015  
  1016  // Indexes returns an iterator over all the indexes
  1017  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
  1018  	txn := s.db.Txn(false)
  1019  
  1020  	// Walk the entire nodes table
  1021  	iter, err := txn.Get("index", "id")
  1022  	if err != nil {
  1023  		return nil, err
  1024  	}
  1025  	return iter, nil
  1026  }
  1027  
  1028  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
  1029  // It takes a map of job IDs to an optional forceStatus string. It returns an
  1030  // error if the job doesn't exist or setJobStatus fails.
  1031  func (s *StateStore) setJobStatuses(index uint64, watcher watch.Items, txn *memdb.Txn,
  1032  	jobs map[string]string, evalDelete bool) error {
  1033  	for job, forceStatus := range jobs {
  1034  		existing, err := txn.First("jobs", "id", job)
  1035  		if err != nil {
  1036  			return fmt.Errorf("job lookup failed: %v", err)
  1037  		}
  1038  
  1039  		if existing == nil {
  1040  			continue
  1041  		}
  1042  
  1043  		if err := s.setJobStatus(index, watcher, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
  1044  			return err
  1045  		}
  1046  	}
  1047  
  1048  	return nil
  1049  }
  1050  
  1051  // setJobStatus sets the status of the job by looking up associated evaluations
  1052  // and allocations. evalDelete should be set to true if setJobStatus is being
  1053  // called because an evaluation is being deleted (potentially because of garbage
  1054  // collection). If forceStatus is non-empty, the job's status will be set to the
  1055  // passed status.
  1056  func (s *StateStore) setJobStatus(index uint64, watcher watch.Items, txn *memdb.Txn,
  1057  	job *structs.Job, evalDelete bool, forceStatus string) error {
  1058  
  1059  	// Capture the current status so we can check if there is a change
  1060  	oldStatus := job.Status
  1061  	newStatus := forceStatus
  1062  
  1063  	// If forceStatus is not set, compute the jobs status.
  1064  	if forceStatus == "" {
  1065  		var err error
  1066  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  1067  		if err != nil {
  1068  			return err
  1069  		}
  1070  	}
  1071  
  1072  	// Fast-path if nothing has changed.
  1073  	if oldStatus == newStatus {
  1074  		return nil
  1075  	}
  1076  
  1077  	// The job has changed, so add to watcher.
  1078  	watcher.Add(watch.Item{Table: "jobs"})
  1079  	watcher.Add(watch.Item{Job: job.ID})
  1080  
  1081  	// Copy and update the existing job
  1082  	updated := job.Copy()
  1083  	updated.Status = newStatus
  1084  	updated.ModifyIndex = index
  1085  
  1086  	// Insert the job
  1087  	if err := txn.Insert("jobs", updated); err != nil {
  1088  		return fmt.Errorf("job insert failed: %v", err)
  1089  	}
  1090  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1091  		return fmt.Errorf("index update failed: %v", err)
  1092  	}
  1093  	return nil
  1094  }
  1095  
  1096  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  1097  	allocs, err := txn.Get("allocs", "job", job.ID)
  1098  	if err != nil {
  1099  		return "", err
  1100  	}
  1101  
  1102  	// If there is a non-terminal allocation, the job is running.
  1103  	hasAlloc := false
  1104  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  1105  		hasAlloc = true
  1106  		if !alloc.(*structs.Allocation).TerminalStatus() {
  1107  			return structs.JobStatusRunning, nil
  1108  		}
  1109  	}
  1110  
  1111  	evals, err := txn.Get("evals", "job", job.ID)
  1112  	if err != nil {
  1113  		return "", err
  1114  	}
  1115  
  1116  	hasEval := false
  1117  	for eval := evals.Next(); eval != nil; eval = evals.Next() {
  1118  		hasEval = true
  1119  		if !eval.(*structs.Evaluation).TerminalStatus() {
  1120  			return structs.JobStatusPending, nil
  1121  		}
  1122  	}
  1123  
  1124  	// The job is dead if all the allocations and evals are terminal or if there
  1125  	// are no evals because of garbage collection.
  1126  	if evalDelete || hasEval || hasAlloc {
  1127  		return structs.JobStatusDead, nil
  1128  	}
  1129  
  1130  	// If there are no allocations or evaluations it is a new job. If the job is
  1131  	// periodic, we mark it as running as it will never have an
  1132  	// allocation/evaluation against it.
  1133  	if job.IsPeriodic() {
  1134  		return structs.JobStatusRunning, nil
  1135  	}
  1136  	return structs.JobStatusPending, nil
  1137  }
  1138  
  1139  // StateSnapshot is used to provide a point-in-time snapshot
  1140  type StateSnapshot struct {
  1141  	StateStore
  1142  }
  1143  
  1144  // StateRestore is used to optimize the performance when
  1145  // restoring state by only using a single large transaction
  1146  // instead of thousands of sub transactions
  1147  type StateRestore struct {
  1148  	txn   *memdb.Txn
  1149  	watch *stateWatch
  1150  	items watch.Items
  1151  }
  1152  
  1153  // Abort is used to abort the restore operation
  1154  func (s *StateRestore) Abort() {
  1155  	s.txn.Abort()
  1156  }
  1157  
  1158  // Commit is used to commit the restore operation
  1159  func (s *StateRestore) Commit() {
  1160  	s.txn.Defer(func() { s.watch.notify(s.items) })
  1161  	s.txn.Commit()
  1162  }
  1163  
  1164  // NodeRestore is used to restore a node
  1165  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  1166  	r.items.Add(watch.Item{Table: "nodes"})
  1167  	r.items.Add(watch.Item{Node: node.ID})
  1168  	if err := r.txn.Insert("nodes", node); err != nil {
  1169  		return fmt.Errorf("node insert failed: %v", err)
  1170  	}
  1171  	return nil
  1172  }
  1173  
  1174  // JobRestore is used to restore a job
  1175  func (r *StateRestore) JobRestore(job *structs.Job) error {
  1176  	r.items.Add(watch.Item{Table: "jobs"})
  1177  	r.items.Add(watch.Item{Job: job.ID})
  1178  	if err := r.txn.Insert("jobs", job); err != nil {
  1179  		return fmt.Errorf("job insert failed: %v", err)
  1180  	}
  1181  	return nil
  1182  }
  1183  
  1184  // EvalRestore is used to restore an evaluation
  1185  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  1186  	r.items.Add(watch.Item{Table: "evals"})
  1187  	r.items.Add(watch.Item{Eval: eval.ID})
  1188  	if err := r.txn.Insert("evals", eval); err != nil {
  1189  		return fmt.Errorf("eval insert failed: %v", err)
  1190  	}
  1191  	return nil
  1192  }
  1193  
  1194  // AllocRestore is used to restore an allocation
  1195  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  1196  	r.items.Add(watch.Item{Table: "allocs"})
  1197  	r.items.Add(watch.Item{Alloc: alloc.ID})
  1198  	r.items.Add(watch.Item{AllocEval: alloc.EvalID})
  1199  	r.items.Add(watch.Item{AllocJob: alloc.JobID})
  1200  	r.items.Add(watch.Item{AllocNode: alloc.NodeID})
  1201  	if err := r.txn.Insert("allocs", alloc); err != nil {
  1202  		return fmt.Errorf("alloc insert failed: %v", err)
  1203  	}
  1204  	return nil
  1205  }
  1206  
  1207  // IndexRestore is used to restore an index
  1208  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  1209  	if err := r.txn.Insert("index", idx); err != nil {
  1210  		return fmt.Errorf("index insert failed: %v", err)
  1211  	}
  1212  	return nil
  1213  }
  1214  
  1215  // PeriodicLaunchRestore is used to restore a periodic launch.
  1216  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  1217  	r.items.Add(watch.Item{Table: "periodic_launch"})
  1218  	r.items.Add(watch.Item{Job: launch.ID})
  1219  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  1220  		return fmt.Errorf("periodic launch insert failed: %v", err)
  1221  	}
  1222  	return nil
  1223  }
  1224  
  1225  // stateWatch holds shared state for watching updates. This is
  1226  // outside of StateStore so it can be shared with snapshots.
  1227  type stateWatch struct {
  1228  	items map[watch.Item]*NotifyGroup
  1229  	l     sync.Mutex
  1230  }
  1231  
  1232  // newStateWatch creates a new stateWatch for change notification.
  1233  func newStateWatch() *stateWatch {
  1234  	return &stateWatch{
  1235  		items: make(map[watch.Item]*NotifyGroup),
  1236  	}
  1237  }
  1238  
  1239  // watch subscribes a channel to the given watch items.
  1240  func (w *stateWatch) watch(items watch.Items, ch chan struct{}) {
  1241  	w.l.Lock()
  1242  	defer w.l.Unlock()
  1243  
  1244  	for item, _ := range items {
  1245  		grp, ok := w.items[item]
  1246  		if !ok {
  1247  			grp = new(NotifyGroup)
  1248  			w.items[item] = grp
  1249  		}
  1250  		grp.Wait(ch)
  1251  	}
  1252  }
  1253  
  1254  // stopWatch unsubscribes a channel from the given watch items.
  1255  func (w *stateWatch) stopWatch(items watch.Items, ch chan struct{}) {
  1256  	w.l.Lock()
  1257  	defer w.l.Unlock()
  1258  
  1259  	for item, _ := range items {
  1260  		if grp, ok := w.items[item]; ok {
  1261  			grp.Clear(ch)
  1262  			if grp.Empty() {
  1263  				delete(w.items, item)
  1264  			}
  1265  		}
  1266  	}
  1267  }
  1268  
  1269  // notify is used to fire notifications on the given watch items.
  1270  func (w *stateWatch) notify(items watch.Items) {
  1271  	w.l.Lock()
  1272  	defer w.l.Unlock()
  1273  
  1274  	for wi, _ := range items {
  1275  		if grp, ok := w.items[wi]; ok {
  1276  			grp.Notify()
  1277  		}
  1278  	}
  1279  }