github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/state/state_store.go (about)

     1  package state
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"sync"
     8  
     9  	"github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/hashicorp/nomad/nomad/watch"
    12  )
    13  
// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
//
// The write paths in this file build entries positionally, e.g.
// &IndexEntry{"nodes", index}, so the field order below must not change.
type IndexEntry struct {
	// Key is the name of the table this entry tracks (e.g. "nodes", "jobs").
	Key   string
	// Value is the index passed to the most recent write against that table.
	Value uint64
}
    20  
// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	// logger is created in NewStateStore on top of the supplied io.Writer.
	logger *log.Logger
	// db is the MemDB instance that holds every table.
	db     *memdb.MemDB
	// watch receives the set of touched watch items from each write method
	// once its transaction has been committed.
	watch  *stateWatch
}
    33  
    34  // NewStateStore is used to create a new state store
    35  func NewStateStore(logOutput io.Writer) (*StateStore, error) {
    36  	// Create the MemDB
    37  	db, err := memdb.NewMemDB(stateStoreSchema())
    38  	if err != nil {
    39  		return nil, fmt.Errorf("state store setup failed: %v", err)
    40  	}
    41  
    42  	// Create the state store
    43  	s := &StateStore{
    44  		logger: log.New(logOutput, "", log.LstdFlags),
    45  		db:     db,
    46  		watch:  newStateWatch(),
    47  	}
    48  	return s, nil
    49  }
    50  
    51  // Snapshot is used to create a point in time snapshot. Because
    52  // we use MemDB, we just need to snapshot the state of the underlying
    53  // database.
    54  func (s *StateStore) Snapshot() (*StateSnapshot, error) {
    55  	snap := &StateSnapshot{
    56  		StateStore: StateStore{
    57  			logger: s.logger,
    58  			db:     s.db.Snapshot(),
    59  			watch:  s.watch,
    60  		},
    61  	}
    62  	return snap, nil
    63  }
    64  
    65  // Restore is used to optimize the efficiency of rebuilding
    66  // state by minimizing the number of transactions and checking
    67  // overhead.
    68  func (s *StateStore) Restore() (*StateRestore, error) {
    69  	txn := s.db.Txn(true)
    70  	r := &StateRestore{
    71  		txn:   txn,
    72  		watch: s.watch,
    73  		items: watch.NewItems(),
    74  	}
    75  	return r, nil
    76  }
    77  
    78  // Watch subscribes a channel to a set of watch items.
    79  func (s *StateStore) Watch(items watch.Items, notify chan struct{}) {
    80  	s.watch.watch(items, notify)
    81  }
    82  
    83  // StopWatch unsubscribes a channel from a set of watch items.
    84  func (s *StateStore) StopWatch(items watch.Items, notify chan struct{}) {
    85  	s.watch.stopWatch(items, notify)
    86  }
    87  
    88  // UpsertNode is used to register a node or update a node definition
    89  // This is assumed to be triggered by the client, so we retain the value
    90  // of drain which is set by the scheduler.
    91  func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error {
    92  	txn := s.db.Txn(true)
    93  	defer txn.Abort()
    94  
    95  	watcher := watch.NewItems()
    96  	watcher.Add(watch.Item{Table: "nodes"})
    97  	watcher.Add(watch.Item{Node: node.ID})
    98  
    99  	// Check if the node already exists
   100  	existing, err := txn.First("nodes", "id", node.ID)
   101  	if err != nil {
   102  		return fmt.Errorf("node lookup failed: %v", err)
   103  	}
   104  
   105  	// Setup the indexes correctly
   106  	if existing != nil {
   107  		exist := existing.(*structs.Node)
   108  		node.CreateIndex = exist.CreateIndex
   109  		node.ModifyIndex = index
   110  		node.Drain = exist.Drain // Retain the drain mode
   111  	} else {
   112  		node.CreateIndex = index
   113  		node.ModifyIndex = index
   114  	}
   115  
   116  	// Insert the node
   117  	if err := txn.Insert("nodes", node); err != nil {
   118  		return fmt.Errorf("node insert failed: %v", err)
   119  	}
   120  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   121  		return fmt.Errorf("index update failed: %v", err)
   122  	}
   123  
   124  	txn.Defer(func() { s.watch.notify(watcher) })
   125  	txn.Commit()
   126  	return nil
   127  }
   128  
   129  // DeleteNode is used to deregister a node
   130  func (s *StateStore) DeleteNode(index uint64, nodeID string) error {
   131  	txn := s.db.Txn(true)
   132  	defer txn.Abort()
   133  
   134  	// Lookup the node
   135  	existing, err := txn.First("nodes", "id", nodeID)
   136  	if err != nil {
   137  		return fmt.Errorf("node lookup failed: %v", err)
   138  	}
   139  	if existing == nil {
   140  		return fmt.Errorf("node not found")
   141  	}
   142  
   143  	watcher := watch.NewItems()
   144  	watcher.Add(watch.Item{Table: "nodes"})
   145  	watcher.Add(watch.Item{Node: nodeID})
   146  
   147  	// Delete the node
   148  	if err := txn.Delete("nodes", existing); err != nil {
   149  		return fmt.Errorf("node delete failed: %v", err)
   150  	}
   151  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   152  		return fmt.Errorf("index update failed: %v", err)
   153  	}
   154  
   155  	txn.Defer(func() { s.watch.notify(watcher) })
   156  	txn.Commit()
   157  	return nil
   158  }
   159  
   160  // UpdateNodeStatus is used to update the status of a node
   161  func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error {
   162  	txn := s.db.Txn(true)
   163  	defer txn.Abort()
   164  
   165  	watcher := watch.NewItems()
   166  	watcher.Add(watch.Item{Table: "nodes"})
   167  	watcher.Add(watch.Item{Node: nodeID})
   168  
   169  	// Lookup the node
   170  	existing, err := txn.First("nodes", "id", nodeID)
   171  	if err != nil {
   172  		return fmt.Errorf("node lookup failed: %v", err)
   173  	}
   174  	if existing == nil {
   175  		return fmt.Errorf("node not found")
   176  	}
   177  
   178  	// Copy the existing node
   179  	existingNode := existing.(*structs.Node)
   180  	copyNode := new(structs.Node)
   181  	*copyNode = *existingNode
   182  
   183  	// Update the status in the copy
   184  	copyNode.Status = status
   185  	copyNode.ModifyIndex = index
   186  
   187  	// Insert the node
   188  	if err := txn.Insert("nodes", copyNode); err != nil {
   189  		return fmt.Errorf("node update failed: %v", err)
   190  	}
   191  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   192  		return fmt.Errorf("index update failed: %v", err)
   193  	}
   194  
   195  	txn.Defer(func() { s.watch.notify(watcher) })
   196  	txn.Commit()
   197  	return nil
   198  }
   199  
   200  // UpdateNodeDrain is used to update the drain of a node
   201  func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error {
   202  	txn := s.db.Txn(true)
   203  	defer txn.Abort()
   204  
   205  	watcher := watch.NewItems()
   206  	watcher.Add(watch.Item{Table: "nodes"})
   207  	watcher.Add(watch.Item{Node: nodeID})
   208  
   209  	// Lookup the node
   210  	existing, err := txn.First("nodes", "id", nodeID)
   211  	if err != nil {
   212  		return fmt.Errorf("node lookup failed: %v", err)
   213  	}
   214  	if existing == nil {
   215  		return fmt.Errorf("node not found")
   216  	}
   217  
   218  	// Copy the existing node
   219  	existingNode := existing.(*structs.Node)
   220  	copyNode := new(structs.Node)
   221  	*copyNode = *existingNode
   222  
   223  	// Update the drain in the copy
   224  	copyNode.Drain = drain
   225  	copyNode.ModifyIndex = index
   226  
   227  	// Insert the node
   228  	if err := txn.Insert("nodes", copyNode); err != nil {
   229  		return fmt.Errorf("node update failed: %v", err)
   230  	}
   231  	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
   232  		return fmt.Errorf("index update failed: %v", err)
   233  	}
   234  
   235  	txn.Defer(func() { s.watch.notify(watcher) })
   236  	txn.Commit()
   237  	return nil
   238  }
   239  
   240  // NodeByID is used to lookup a node by ID
   241  func (s *StateStore) NodeByID(nodeID string) (*structs.Node, error) {
   242  	txn := s.db.Txn(false)
   243  
   244  	existing, err := txn.First("nodes", "id", nodeID)
   245  	if err != nil {
   246  		return nil, fmt.Errorf("node lookup failed: %v", err)
   247  	}
   248  
   249  	if existing != nil {
   250  		return existing.(*structs.Node), nil
   251  	}
   252  	return nil, nil
   253  }
   254  
   255  // NodesByIDPrefix is used to lookup nodes by prefix
   256  func (s *StateStore) NodesByIDPrefix(nodeID string) (memdb.ResultIterator, error) {
   257  	txn := s.db.Txn(false)
   258  
   259  	iter, err := txn.Get("nodes", "id_prefix", nodeID)
   260  	if err != nil {
   261  		return nil, fmt.Errorf("node lookup failed: %v", err)
   262  	}
   263  
   264  	return iter, nil
   265  }
   266  
   267  // Nodes returns an iterator over all the nodes
   268  func (s *StateStore) Nodes() (memdb.ResultIterator, error) {
   269  	txn := s.db.Txn(false)
   270  
   271  	// Walk the entire nodes table
   272  	iter, err := txn.Get("nodes", "id")
   273  	if err != nil {
   274  		return nil, err
   275  	}
   276  	return iter, nil
   277  }
   278  
   279  // UpsertJob is used to register a job or update a job definition
   280  func (s *StateStore) UpsertJob(index uint64, job *structs.Job) error {
   281  	txn := s.db.Txn(true)
   282  	defer txn.Abort()
   283  
   284  	watcher := watch.NewItems()
   285  	watcher.Add(watch.Item{Table: "jobs"})
   286  	watcher.Add(watch.Item{Job: job.ID})
   287  
   288  	// Check if the job already exists
   289  	existing, err := txn.First("jobs", "id", job.ID)
   290  	if err != nil {
   291  		return fmt.Errorf("job lookup failed: %v", err)
   292  	}
   293  
   294  	// Setup the indexes correctly
   295  	if existing != nil {
   296  		job.CreateIndex = existing.(*structs.Job).CreateIndex
   297  		job.ModifyIndex = index
   298  		job.JobModifyIndex = index
   299  
   300  		// Compute the job status
   301  		var err error
   302  		job.Status, err = s.getJobStatus(txn, job, false)
   303  		if err != nil {
   304  			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
   305  		}
   306  	} else {
   307  		job.CreateIndex = index
   308  		job.ModifyIndex = index
   309  		job.JobModifyIndex = index
   310  
   311  		// If we are inserting the job for the first time, we don't need to
   312  		// calculate the jobs status as it is known.
   313  		if job.IsPeriodic() {
   314  			job.Status = structs.JobStatusRunning
   315  		} else {
   316  			job.Status = structs.JobStatusPending
   317  		}
   318  	}
   319  
   320  	// Insert the job
   321  	if err := txn.Insert("jobs", job); err != nil {
   322  		return fmt.Errorf("job insert failed: %v", err)
   323  	}
   324  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   325  		return fmt.Errorf("index update failed: %v", err)
   326  	}
   327  
   328  	txn.Defer(func() { s.watch.notify(watcher) })
   329  	txn.Commit()
   330  	return nil
   331  }
   332  
   333  // DeleteJob is used to deregister a job
   334  func (s *StateStore) DeleteJob(index uint64, jobID string) error {
   335  	txn := s.db.Txn(true)
   336  	defer txn.Abort()
   337  
   338  	// Lookup the node
   339  	existing, err := txn.First("jobs", "id", jobID)
   340  	if err != nil {
   341  		return fmt.Errorf("job lookup failed: %v", err)
   342  	}
   343  	if existing == nil {
   344  		return fmt.Errorf("job not found")
   345  	}
   346  
   347  	watcher := watch.NewItems()
   348  	watcher.Add(watch.Item{Table: "jobs"})
   349  	watcher.Add(watch.Item{Job: jobID})
   350  
   351  	// Delete the node
   352  	if err := txn.Delete("jobs", existing); err != nil {
   353  		return fmt.Errorf("job delete failed: %v", err)
   354  	}
   355  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
   356  		return fmt.Errorf("index update failed: %v", err)
   357  	}
   358  
   359  	txn.Defer(func() { s.watch.notify(watcher) })
   360  	txn.Commit()
   361  	return nil
   362  }
   363  
   364  // JobByID is used to lookup a job by its ID
   365  func (s *StateStore) JobByID(id string) (*structs.Job, error) {
   366  	txn := s.db.Txn(false)
   367  
   368  	existing, err := txn.First("jobs", "id", id)
   369  	if err != nil {
   370  		return nil, fmt.Errorf("job lookup failed: %v", err)
   371  	}
   372  
   373  	if existing != nil {
   374  		return existing.(*structs.Job), nil
   375  	}
   376  	return nil, nil
   377  }
   378  
   379  // JobsByIDPrefix is used to lookup a job by prefix
   380  func (s *StateStore) JobsByIDPrefix(id string) (memdb.ResultIterator, error) {
   381  	txn := s.db.Txn(false)
   382  
   383  	iter, err := txn.Get("jobs", "id_prefix", id)
   384  	if err != nil {
   385  		return nil, fmt.Errorf("job lookup failed: %v", err)
   386  	}
   387  
   388  	return iter, nil
   389  }
   390  
   391  // Jobs returns an iterator over all the jobs
   392  func (s *StateStore) Jobs() (memdb.ResultIterator, error) {
   393  	txn := s.db.Txn(false)
   394  
   395  	// Walk the entire jobs table
   396  	iter, err := txn.Get("jobs", "id")
   397  	if err != nil {
   398  		return nil, err
   399  	}
   400  	return iter, nil
   401  }
   402  
   403  // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs.
   404  func (s *StateStore) JobsByPeriodic(periodic bool) (memdb.ResultIterator, error) {
   405  	txn := s.db.Txn(false)
   406  
   407  	iter, err := txn.Get("jobs", "periodic", periodic)
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	return iter, nil
   412  }
   413  
   414  // JobsByScheduler returns an iterator over all the jobs with the specific
   415  // scheduler type.
   416  func (s *StateStore) JobsByScheduler(schedulerType string) (memdb.ResultIterator, error) {
   417  	txn := s.db.Txn(false)
   418  
   419  	// Return an iterator for jobs with the specific type.
   420  	iter, err := txn.Get("jobs", "type", schedulerType)
   421  	if err != nil {
   422  		return nil, err
   423  	}
   424  	return iter, nil
   425  }
   426  
   427  // JobsByGC returns an iterator over all jobs eligible or uneligible for garbage
   428  // collection.
   429  func (s *StateStore) JobsByGC(gc bool) (memdb.ResultIterator, error) {
   430  	txn := s.db.Txn(false)
   431  
   432  	iter, err := txn.Get("jobs", "gc", gc)
   433  	if err != nil {
   434  		return nil, err
   435  	}
   436  	return iter, nil
   437  }
   438  
   439  // UpsertPeriodicLaunch is used to register a launch or update it.
   440  func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
   441  	txn := s.db.Txn(true)
   442  	defer txn.Abort()
   443  
   444  	watcher := watch.NewItems()
   445  	watcher.Add(watch.Item{Table: "periodic_launch"})
   446  	watcher.Add(watch.Item{Job: launch.ID})
   447  
   448  	// Check if the job already exists
   449  	existing, err := txn.First("periodic_launch", "id", launch.ID)
   450  	if err != nil {
   451  		return fmt.Errorf("periodic launch lookup failed: %v", err)
   452  	}
   453  
   454  	// Setup the indexes correctly
   455  	if existing != nil {
   456  		launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex
   457  		launch.ModifyIndex = index
   458  	} else {
   459  		launch.CreateIndex = index
   460  		launch.ModifyIndex = index
   461  	}
   462  
   463  	// Insert the job
   464  	if err := txn.Insert("periodic_launch", launch); err != nil {
   465  		return fmt.Errorf("launch insert failed: %v", err)
   466  	}
   467  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   468  		return fmt.Errorf("index update failed: %v", err)
   469  	}
   470  
   471  	txn.Defer(func() { s.watch.notify(watcher) })
   472  	txn.Commit()
   473  	return nil
   474  }
   475  
   476  // DeletePeriodicLaunch is used to delete the periodic launch
   477  func (s *StateStore) DeletePeriodicLaunch(index uint64, jobID string) error {
   478  	txn := s.db.Txn(true)
   479  	defer txn.Abort()
   480  
   481  	// Lookup the launch
   482  	existing, err := txn.First("periodic_launch", "id", jobID)
   483  	if err != nil {
   484  		return fmt.Errorf("launch lookup failed: %v", err)
   485  	}
   486  	if existing == nil {
   487  		return fmt.Errorf("launch not found")
   488  	}
   489  
   490  	watcher := watch.NewItems()
   491  	watcher.Add(watch.Item{Table: "periodic_launch"})
   492  	watcher.Add(watch.Item{Job: jobID})
   493  
   494  	// Delete the launch
   495  	if err := txn.Delete("periodic_launch", existing); err != nil {
   496  		return fmt.Errorf("launch delete failed: %v", err)
   497  	}
   498  	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
   499  		return fmt.Errorf("index update failed: %v", err)
   500  	}
   501  
   502  	txn.Defer(func() { s.watch.notify(watcher) })
   503  	txn.Commit()
   504  	return nil
   505  }
   506  
   507  // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job
   508  // ID.
   509  func (s *StateStore) PeriodicLaunchByID(id string) (*structs.PeriodicLaunch, error) {
   510  	txn := s.db.Txn(false)
   511  
   512  	existing, err := txn.First("periodic_launch", "id", id)
   513  	if err != nil {
   514  		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
   515  	}
   516  
   517  	if existing != nil {
   518  		return existing.(*structs.PeriodicLaunch), nil
   519  	}
   520  	return nil, nil
   521  }
   522  
   523  // PeriodicLaunches returns an iterator over all the periodic launches
   524  func (s *StateStore) PeriodicLaunches() (memdb.ResultIterator, error) {
   525  	txn := s.db.Txn(false)
   526  
   527  	// Walk the entire table
   528  	iter, err := txn.Get("periodic_launch", "id")
   529  	if err != nil {
   530  		return nil, err
   531  	}
   532  	return iter, nil
   533  }
   534  
   535  // UpsertEvaluation is used to upsert an evaluation
   536  func (s *StateStore) UpsertEvals(index uint64, evals []*structs.Evaluation) error {
   537  	txn := s.db.Txn(true)
   538  	defer txn.Abort()
   539  
   540  	watcher := watch.NewItems()
   541  	watcher.Add(watch.Item{Table: "evals"})
   542  
   543  	// Do a nested upsert
   544  	jobs := make(map[string]string, len(evals))
   545  	for _, eval := range evals {
   546  		watcher.Add(watch.Item{Eval: eval.ID})
   547  		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
   548  			return err
   549  		}
   550  
   551  		jobs[eval.JobID] = ""
   552  	}
   553  
   554  	// Set the job's status
   555  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   556  		return fmt.Errorf("setting job status failed: %v", err)
   557  	}
   558  
   559  	txn.Defer(func() { s.watch.notify(watcher) })
   560  	txn.Commit()
   561  	return nil
   562  }
   563  
   564  // nestedUpsertEvaluation is used to nest an evaluation upsert within a transaction
   565  func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *structs.Evaluation) error {
   566  	// Lookup the evaluation
   567  	existing, err := txn.First("evals", "id", eval.ID)
   568  	if err != nil {
   569  		return fmt.Errorf("eval lookup failed: %v", err)
   570  	}
   571  
   572  	// Update the indexes
   573  	if existing != nil {
   574  		eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex
   575  		eval.ModifyIndex = index
   576  	} else {
   577  		eval.CreateIndex = index
   578  		eval.ModifyIndex = index
   579  	}
   580  
   581  	// Insert the eval
   582  	if err := txn.Insert("evals", eval); err != nil {
   583  		return fmt.Errorf("eval insert failed: %v", err)
   584  	}
   585  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   586  		return fmt.Errorf("index update failed: %v", err)
   587  	}
   588  	return nil
   589  }
   590  
   591  // DeleteEval is used to delete an evaluation
   592  func (s *StateStore) DeleteEval(index uint64, evals []string, allocs []string) error {
   593  	txn := s.db.Txn(true)
   594  	defer txn.Abort()
   595  	watcher := watch.NewItems()
   596  	watcher.Add(watch.Item{Table: "evals"})
   597  	watcher.Add(watch.Item{Table: "allocs"})
   598  
   599  	jobs := make(map[string]string, len(evals))
   600  	for _, eval := range evals {
   601  		existing, err := txn.First("evals", "id", eval)
   602  		if err != nil {
   603  			return fmt.Errorf("eval lookup failed: %v", err)
   604  		}
   605  		if existing == nil {
   606  			continue
   607  		}
   608  		if err := txn.Delete("evals", existing); err != nil {
   609  			return fmt.Errorf("eval delete failed: %v", err)
   610  		}
   611  		watcher.Add(watch.Item{Eval: eval})
   612  		jobs[existing.(*structs.Evaluation).JobID] = ""
   613  	}
   614  
   615  	for _, alloc := range allocs {
   616  		existing, err := txn.First("allocs", "id", alloc)
   617  		if err != nil {
   618  			return fmt.Errorf("alloc lookup failed: %v", err)
   619  		}
   620  		if existing == nil {
   621  			continue
   622  		}
   623  		if err := txn.Delete("allocs", existing); err != nil {
   624  			return fmt.Errorf("alloc delete failed: %v", err)
   625  		}
   626  		realAlloc := existing.(*structs.Allocation)
   627  		watcher.Add(watch.Item{Alloc: realAlloc.ID})
   628  		watcher.Add(watch.Item{AllocEval: realAlloc.EvalID})
   629  		watcher.Add(watch.Item{AllocJob: realAlloc.JobID})
   630  		watcher.Add(watch.Item{AllocNode: realAlloc.NodeID})
   631  	}
   632  
   633  	// Update the indexes
   634  	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
   635  		return fmt.Errorf("index update failed: %v", err)
   636  	}
   637  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   638  		return fmt.Errorf("index update failed: %v", err)
   639  	}
   640  
   641  	// Set the job's status
   642  	if err := s.setJobStatuses(index, watcher, txn, jobs, true); err != nil {
   643  		return fmt.Errorf("setting job status failed: %v", err)
   644  	}
   645  
   646  	txn.Defer(func() { s.watch.notify(watcher) })
   647  	txn.Commit()
   648  	return nil
   649  }
   650  
   651  // EvalByID is used to lookup an eval by its ID
   652  func (s *StateStore) EvalByID(id string) (*structs.Evaluation, error) {
   653  	txn := s.db.Txn(false)
   654  
   655  	existing, err := txn.First("evals", "id", id)
   656  	if err != nil {
   657  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   658  	}
   659  
   660  	if existing != nil {
   661  		return existing.(*structs.Evaluation), nil
   662  	}
   663  	return nil, nil
   664  }
   665  
   666  // EvalsByIDPrefix is used to lookup evaluations by prefix
   667  func (s *StateStore) EvalsByIDPrefix(id string) (memdb.ResultIterator, error) {
   668  	txn := s.db.Txn(false)
   669  
   670  	iter, err := txn.Get("evals", "id_prefix", id)
   671  	if err != nil {
   672  		return nil, fmt.Errorf("eval lookup failed: %v", err)
   673  	}
   674  
   675  	return iter, nil
   676  }
   677  
   678  // EvalsByJob returns all the evaluations by job id
   679  func (s *StateStore) EvalsByJob(jobID string) ([]*structs.Evaluation, error) {
   680  	txn := s.db.Txn(false)
   681  
   682  	// Get an iterator over the node allocations
   683  	iter, err := txn.Get("evals", "job", jobID)
   684  	if err != nil {
   685  		return nil, err
   686  	}
   687  
   688  	var out []*structs.Evaluation
   689  	for {
   690  		raw := iter.Next()
   691  		if raw == nil {
   692  			break
   693  		}
   694  		out = append(out, raw.(*structs.Evaluation))
   695  	}
   696  	return out, nil
   697  }
   698  
   699  // Evals returns an iterator over all the evaluations
   700  func (s *StateStore) Evals() (memdb.ResultIterator, error) {
   701  	txn := s.db.Txn(false)
   702  
   703  	// Walk the entire table
   704  	iter, err := txn.Get("evals", "id")
   705  	if err != nil {
   706  		return nil, err
   707  	}
   708  	return iter, nil
   709  }
   710  
   711  // UpdateAllocFromClient is used to update an allocation based on input
   712  // from a client. While the schedulers are the authority on the allocation for
   713  // most things, some updates are authoritative from the client. Specifically,
   714  // the desired state comes from the schedulers, while the actual state comes
   715  // from clients.
   716  func (s *StateStore) UpdateAllocFromClient(index uint64, alloc *structs.Allocation) error {
   717  	txn := s.db.Txn(true)
   718  	defer txn.Abort()
   719  
   720  	watcher := watch.NewItems()
   721  	watcher.Add(watch.Item{Table: "allocs"})
   722  	watcher.Add(watch.Item{Alloc: alloc.ID})
   723  	watcher.Add(watch.Item{AllocEval: alloc.EvalID})
   724  	watcher.Add(watch.Item{AllocJob: alloc.JobID})
   725  	watcher.Add(watch.Item{AllocNode: alloc.NodeID})
   726  
   727  	// Look for existing alloc
   728  	existing, err := txn.First("allocs", "id", alloc.ID)
   729  	if err != nil {
   730  		return fmt.Errorf("alloc lookup failed: %v", err)
   731  	}
   732  
   733  	// Nothing to do if this does not exist
   734  	if existing == nil {
   735  		return nil
   736  	}
   737  	exist := existing.(*structs.Allocation)
   738  
   739  	// Copy everything from the existing allocation
   740  	copyAlloc := new(structs.Allocation)
   741  	*copyAlloc = *exist
   742  
   743  	// Pull in anything the client is the authority on
   744  	copyAlloc.ClientStatus = alloc.ClientStatus
   745  	copyAlloc.ClientDescription = alloc.ClientDescription
   746  	copyAlloc.TaskStates = alloc.TaskStates
   747  
   748  	// Update the modify index
   749  	copyAlloc.ModifyIndex = index
   750  
   751  	// Update the allocation
   752  	if err := txn.Insert("allocs", copyAlloc); err != nil {
   753  		return fmt.Errorf("alloc insert failed: %v", err)
   754  	}
   755  
   756  	// Update the indexes
   757  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   758  		return fmt.Errorf("index update failed: %v", err)
   759  	}
   760  
   761  	// Set the job's status
   762  	forceStatus := ""
   763  	if !copyAlloc.TerminalStatus() {
   764  		forceStatus = structs.JobStatusRunning
   765  	}
   766  	jobs := map[string]string{alloc.JobID: forceStatus}
   767  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   768  		return fmt.Errorf("setting job status failed: %v", err)
   769  	}
   770  
   771  	txn.Defer(func() { s.watch.notify(watcher) })
   772  	txn.Commit()
   773  	return nil
   774  }
   775  
   776  // UpsertAllocs is used to evict a set of allocations
   777  // and allocate new ones at the same time.
   778  func (s *StateStore) UpsertAllocs(index uint64, allocs []*structs.Allocation) error {
   779  	txn := s.db.Txn(true)
   780  	defer txn.Abort()
   781  
   782  	watcher := watch.NewItems()
   783  	watcher.Add(watch.Item{Table: "allocs"})
   784  
   785  	// Handle the allocations
   786  	jobs := make(map[string]string, 1)
   787  	for _, alloc := range allocs {
   788  		existing, err := txn.First("allocs", "id", alloc.ID)
   789  		if err != nil {
   790  			return fmt.Errorf("alloc lookup failed: %v", err)
   791  		}
   792  
   793  		if existing == nil {
   794  			alloc.CreateIndex = index
   795  			alloc.ModifyIndex = index
   796  		} else {
   797  			exist := existing.(*structs.Allocation)
   798  			alloc.CreateIndex = exist.CreateIndex
   799  			alloc.ModifyIndex = index
   800  			alloc.ClientStatus = exist.ClientStatus
   801  			alloc.ClientDescription = exist.ClientDescription
   802  		}
   803  		if err := txn.Insert("allocs", alloc); err != nil {
   804  			return fmt.Errorf("alloc insert failed: %v", err)
   805  		}
   806  
   807  		// If the allocation is running, force the job to running status.
   808  		forceStatus := ""
   809  		if !alloc.TerminalStatus() {
   810  			forceStatus = structs.JobStatusRunning
   811  		}
   812  		jobs[alloc.JobID] = forceStatus
   813  
   814  		watcher.Add(watch.Item{Alloc: alloc.ID})
   815  		watcher.Add(watch.Item{AllocEval: alloc.EvalID})
   816  		watcher.Add(watch.Item{AllocJob: alloc.JobID})
   817  		watcher.Add(watch.Item{AllocNode: alloc.NodeID})
   818  	}
   819  
   820  	// Update the indexes
   821  	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
   822  		return fmt.Errorf("index update failed: %v", err)
   823  	}
   824  
   825  	// Set the job's status
   826  	if err := s.setJobStatuses(index, watcher, txn, jobs, false); err != nil {
   827  		return fmt.Errorf("setting job status failed: %v", err)
   828  	}
   829  
   830  	txn.Defer(func() { s.watch.notify(watcher) })
   831  	txn.Commit()
   832  	return nil
   833  }
   834  
   835  // AllocByID is used to lookup an allocation by its ID
   836  func (s *StateStore) AllocByID(id string) (*structs.Allocation, error) {
   837  	txn := s.db.Txn(false)
   838  
   839  	existing, err := txn.First("allocs", "id", id)
   840  	if err != nil {
   841  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
   842  	}
   843  
   844  	if existing != nil {
   845  		return existing.(*structs.Allocation), nil
   846  	}
   847  	return nil, nil
   848  }
   849  
   850  // AllocsByIDPrefix is used to lookup allocs by prefix
   851  func (s *StateStore) AllocsByIDPrefix(id string) (memdb.ResultIterator, error) {
   852  	txn := s.db.Txn(false)
   853  
   854  	iter, err := txn.Get("allocs", "id_prefix", id)
   855  	if err != nil {
   856  		return nil, fmt.Errorf("alloc lookup failed: %v", err)
   857  	}
   858  
   859  	return iter, nil
   860  }
   861  
   862  // AllocsByNode returns all the allocations by node
   863  func (s *StateStore) AllocsByNode(node string) ([]*structs.Allocation, error) {
   864  	txn := s.db.Txn(false)
   865  
   866  	// Get an iterator over the node allocations
   867  	iter, err := txn.Get("allocs", "node", node)
   868  	if err != nil {
   869  		return nil, err
   870  	}
   871  
   872  	var out []*structs.Allocation
   873  	for {
   874  		raw := iter.Next()
   875  		if raw == nil {
   876  			break
   877  		}
   878  		out = append(out, raw.(*structs.Allocation))
   879  	}
   880  	return out, nil
   881  }
   882  
   883  // AllocsByJob returns all the allocations by job id
   884  func (s *StateStore) AllocsByJob(jobID string) ([]*structs.Allocation, error) {
   885  	txn := s.db.Txn(false)
   886  
   887  	// Get an iterator over the node allocations
   888  	iter, err := txn.Get("allocs", "job", jobID)
   889  	if err != nil {
   890  		return nil, err
   891  	}
   892  
   893  	var out []*structs.Allocation
   894  	for {
   895  		raw := iter.Next()
   896  		if raw == nil {
   897  			break
   898  		}
   899  		out = append(out, raw.(*structs.Allocation))
   900  	}
   901  	return out, nil
   902  }
   903  
   904  // AllocsByEval returns all the allocations by eval id
   905  func (s *StateStore) AllocsByEval(evalID string) ([]*structs.Allocation, error) {
   906  	txn := s.db.Txn(false)
   907  
   908  	// Get an iterator over the eval allocations
   909  	iter, err := txn.Get("allocs", "eval", evalID)
   910  	if err != nil {
   911  		return nil, err
   912  	}
   913  
   914  	var out []*structs.Allocation
   915  	for {
   916  		raw := iter.Next()
   917  		if raw == nil {
   918  			break
   919  		}
   920  		out = append(out, raw.(*structs.Allocation))
   921  	}
   922  	return out, nil
   923  }
   924  
   925  // Allocs returns an iterator over all the evaluations
   926  func (s *StateStore) Allocs() (memdb.ResultIterator, error) {
   927  	txn := s.db.Txn(false)
   928  
   929  	// Walk the entire table
   930  	iter, err := txn.Get("allocs", "id")
   931  	if err != nil {
   932  		return nil, err
   933  	}
   934  	return iter, nil
   935  }
   936  
   937  // Index finds the matching index value
   938  func (s *StateStore) Index(name string) (uint64, error) {
   939  	txn := s.db.Txn(false)
   940  
   941  	// Lookup the first matching index
   942  	out, err := txn.First("index", "id", name)
   943  	if err != nil {
   944  		return 0, err
   945  	}
   946  	if out == nil {
   947  		return 0, nil
   948  	}
   949  	return out.(*IndexEntry).Value, nil
   950  }
   951  
   952  // Indexes returns an iterator over all the indexes
   953  func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
   954  	txn := s.db.Txn(false)
   955  
   956  	// Walk the entire nodes table
   957  	iter, err := txn.Get("index", "id")
   958  	if err != nil {
   959  		return nil, err
   960  	}
   961  	return iter, nil
   962  }
   963  
   964  // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
   965  // It takes a map of job IDs to an optional forceStatus string. It returns an
   966  // error if the job doesn't exist or setJobStatus fails.
   967  func (s *StateStore) setJobStatuses(index uint64, watcher watch.Items, txn *memdb.Txn,
   968  	jobs map[string]string, evalDelete bool) error {
   969  	for job, forceStatus := range jobs {
   970  		existing, err := txn.First("jobs", "id", job)
   971  		if err != nil {
   972  			return fmt.Errorf("job lookup failed: %v", err)
   973  		}
   974  
   975  		if existing == nil {
   976  			continue
   977  		}
   978  
   979  		if err := s.setJobStatus(index, watcher, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
   980  			return err
   981  		}
   982  	}
   983  
   984  	return nil
   985  }
   986  
   987  // setJobStatus sets the status of the job by looking up associated evaluations
   988  // and allocations. evalDelete should be set to true if setJobStatus is being
   989  // called because an evaluation is being deleted (potentially because of garbage
   990  // collection). If forceStatus is non-empty, the job's status will be set to the
   991  // passed status.
   992  func (s *StateStore) setJobStatus(index uint64, watcher watch.Items, txn *memdb.Txn,
   993  	job *structs.Job, evalDelete bool, forceStatus string) error {
   994  
   995  	// Capture the current status so we can check if there is a change
   996  	oldStatus := job.Status
   997  	newStatus := forceStatus
   998  
   999  	// If forceStatus is not set, compute the jobs status.
  1000  	if forceStatus == "" {
  1001  		var err error
  1002  		newStatus, err = s.getJobStatus(txn, job, evalDelete)
  1003  		if err != nil {
  1004  			return err
  1005  		}
  1006  	}
  1007  
  1008  	// Fast-path if nothing has changed.
  1009  	if oldStatus == newStatus {
  1010  		return nil
  1011  	}
  1012  
  1013  	// The job has changed, so add to watcher.
  1014  	watcher.Add(watch.Item{Table: "jobs"})
  1015  	watcher.Add(watch.Item{Job: job.ID})
  1016  
  1017  	// Copy and update the existing job
  1018  	updated := job.Copy()
  1019  	updated.Status = newStatus
  1020  	updated.ModifyIndex = index
  1021  
  1022  	// Insert the job
  1023  	if err := txn.Insert("jobs", updated); err != nil {
  1024  		return fmt.Errorf("job insert failed: %v", err)
  1025  	}
  1026  	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
  1027  		return fmt.Errorf("index update failed: %v", err)
  1028  	}
  1029  	return nil
  1030  }
  1031  
  1032  func (s *StateStore) getJobStatus(txn *memdb.Txn, job *structs.Job, evalDelete bool) (string, error) {
  1033  	allocs, err := txn.Get("allocs", "job", job.ID)
  1034  	if err != nil {
  1035  		return "", err
  1036  	}
  1037  
  1038  	// If there is a non-terminal allocation, the job is running.
  1039  	hasAlloc := false
  1040  	for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() {
  1041  		hasAlloc = true
  1042  		if !alloc.(*structs.Allocation).TerminalStatus() {
  1043  			return structs.JobStatusRunning, nil
  1044  		}
  1045  	}
  1046  
  1047  	evals, err := txn.Get("evals", "job", job.ID)
  1048  	if err != nil {
  1049  		return "", err
  1050  	}
  1051  
  1052  	hasEval := false
  1053  	for eval := evals.Next(); eval != nil; eval = evals.Next() {
  1054  		hasEval = true
  1055  		if !eval.(*structs.Evaluation).TerminalStatus() {
  1056  			return structs.JobStatusPending, nil
  1057  		}
  1058  	}
  1059  
  1060  	// The job is dead if all the allocations and evals are terminal or if there
  1061  	// are no evals because of garbage collection.
  1062  	if evalDelete || hasEval || hasAlloc {
  1063  		return structs.JobStatusDead, nil
  1064  	}
  1065  
  1066  	// If there are no allocations or evaluations it is a new job. If the job is
  1067  	// periodic, we mark it as running as it will never have an
  1068  	// allocation/evaluation against it.
  1069  	if job.IsPeriodic() {
  1070  		return structs.JobStatusRunning, nil
  1071  	}
  1072  	return structs.JobStatusPending, nil
  1073  }
  1074  
// StateSnapshot is used to provide a point-in-time snapshot. It embeds
// StateStore, so the store's methods are promoted onto the snapshot.
type StateSnapshot struct {
	StateStore
}
  1079  
// StateRestore is used to optimize the performance when
// restoring state by only using a single large transaction
// instead of thousands of sub transactions
type StateRestore struct {
	// txn is the transaction every *Restore method inserts into; it is
	// finished by Abort or Commit.
	txn   *memdb.Txn
	// watch receives the notification for items when Commit is called.
	watch *stateWatch
	// items accumulates the watch items touched by the *Restore methods.
	items watch.Items
}
  1088  
  1089  // Abort is used to abort the restore operation
  1090  func (s *StateRestore) Abort() {
  1091  	s.txn.Abort()
  1092  }
  1093  
  1094  // Commit is used to commit the restore operation
  1095  func (s *StateRestore) Commit() {
  1096  	s.txn.Defer(func() { s.watch.notify(s.items) })
  1097  	s.txn.Commit()
  1098  }
  1099  
  1100  // NodeRestore is used to restore a node
  1101  func (r *StateRestore) NodeRestore(node *structs.Node) error {
  1102  	r.items.Add(watch.Item{Table: "nodes"})
  1103  	r.items.Add(watch.Item{Node: node.ID})
  1104  	if err := r.txn.Insert("nodes", node); err != nil {
  1105  		return fmt.Errorf("node insert failed: %v", err)
  1106  	}
  1107  	return nil
  1108  }
  1109  
  1110  // JobRestore is used to restore a job
  1111  func (r *StateRestore) JobRestore(job *structs.Job) error {
  1112  	r.items.Add(watch.Item{Table: "jobs"})
  1113  	r.items.Add(watch.Item{Job: job.ID})
  1114  	if err := r.txn.Insert("jobs", job); err != nil {
  1115  		return fmt.Errorf("job insert failed: %v", err)
  1116  	}
  1117  	return nil
  1118  }
  1119  
  1120  // EvalRestore is used to restore an evaluation
  1121  func (r *StateRestore) EvalRestore(eval *structs.Evaluation) error {
  1122  	r.items.Add(watch.Item{Table: "evals"})
  1123  	r.items.Add(watch.Item{Eval: eval.ID})
  1124  	if err := r.txn.Insert("evals", eval); err != nil {
  1125  		return fmt.Errorf("eval insert failed: %v", err)
  1126  	}
  1127  	return nil
  1128  }
  1129  
  1130  // AllocRestore is used to restore an allocation
  1131  func (r *StateRestore) AllocRestore(alloc *structs.Allocation) error {
  1132  	r.items.Add(watch.Item{Table: "allocs"})
  1133  	r.items.Add(watch.Item{Alloc: alloc.ID})
  1134  	r.items.Add(watch.Item{AllocEval: alloc.EvalID})
  1135  	r.items.Add(watch.Item{AllocJob: alloc.JobID})
  1136  	r.items.Add(watch.Item{AllocNode: alloc.NodeID})
  1137  	if err := r.txn.Insert("allocs", alloc); err != nil {
  1138  		return fmt.Errorf("alloc insert failed: %v", err)
  1139  	}
  1140  	return nil
  1141  }
  1142  
  1143  // IndexRestore is used to restore an index
  1144  func (r *StateRestore) IndexRestore(idx *IndexEntry) error {
  1145  	if err := r.txn.Insert("index", idx); err != nil {
  1146  		return fmt.Errorf("index insert failed: %v", err)
  1147  	}
  1148  	return nil
  1149  }
  1150  
  1151  // PeriodicLaunchRestore is used to restore a periodic launch.
  1152  func (r *StateRestore) PeriodicLaunchRestore(launch *structs.PeriodicLaunch) error {
  1153  	r.items.Add(watch.Item{Table: "periodic_launch"})
  1154  	r.items.Add(watch.Item{Job: launch.ID})
  1155  	if err := r.txn.Insert("periodic_launch", launch); err != nil {
  1156  		return fmt.Errorf("periodic launch insert failed: %v", err)
  1157  	}
  1158  	return nil
  1159  }
  1160  
// stateWatch holds shared state for watching updates. This is
// outside of StateStore so it can be shared with snapshots.
type stateWatch struct {
	// items maps each watched item to the notify group of channels
	// subscribed to it.
	items map[watch.Item]*NotifyGroup
	// l is held by watch, stopWatch and notify while they access items.
	l     sync.Mutex
}
  1167  
  1168  // newStateWatch creates a new stateWatch for change notification.
  1169  func newStateWatch() *stateWatch {
  1170  	return &stateWatch{
  1171  		items: make(map[watch.Item]*NotifyGroup),
  1172  	}
  1173  }
  1174  
  1175  // watch subscribes a channel to the given watch items.
  1176  func (w *stateWatch) watch(items watch.Items, ch chan struct{}) {
  1177  	w.l.Lock()
  1178  	defer w.l.Unlock()
  1179  
  1180  	for item, _ := range items {
  1181  		grp, ok := w.items[item]
  1182  		if !ok {
  1183  			grp = new(NotifyGroup)
  1184  			w.items[item] = grp
  1185  		}
  1186  		grp.Wait(ch)
  1187  	}
  1188  }
  1189  
  1190  // stopWatch unsubscribes a channel from the given watch items.
  1191  func (w *stateWatch) stopWatch(items watch.Items, ch chan struct{}) {
  1192  	w.l.Lock()
  1193  	defer w.l.Unlock()
  1194  
  1195  	for item, _ := range items {
  1196  		if grp, ok := w.items[item]; ok {
  1197  			grp.Clear(ch)
  1198  			if grp.Empty() {
  1199  				delete(w.items, item)
  1200  			}
  1201  		}
  1202  	}
  1203  }
  1204  
  1205  // notify is used to fire notifications on the given watch items.
  1206  func (w *stateWatch) notify(items watch.Items) {
  1207  	w.l.Lock()
  1208  	defer w.l.Unlock()
  1209  
  1210  	for wi, _ := range items {
  1211  		if grp, ok := w.items[wi]; ok {
  1212  			grp.Notify()
  1213  		}
  1214  	}
  1215  }