github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/fsm.go

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"reflect"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/armon/go-metrics"
    12  	memdb "github.com/hashicorp/go-memdb"
    13  	"github.com/hashicorp/nomad/nomad/state"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  	"github.com/hashicorp/nomad/scheduler"
    16  	"github.com/hashicorp/raft"
    17  	"github.com/ugorji/go/codec"
    18  )
    19  
    20  const (
    21  	// timeTableGranularity is the granularity of index to time tracking
    22  	timeTableGranularity = 5 * time.Minute
    23  
    24  	// timeTableLimit is the maximum limit of our tracking
    25  	timeTableLimit = 72 * time.Hour
    26  )
    27  
    28  // SnapshotType is prefixed to a record in the FSM snapshot
    29  // so that we can determine the type for restore
    30  type SnapshotType byte
    31  
    32  const (
    33  	NodeSnapshot SnapshotType = iota
    34  	JobSnapshot
    35  	IndexSnapshot
    36  	EvalSnapshot
    37  	AllocSnapshot
    38  	TimeTableSnapshot
    39  	PeriodicLaunchSnapshot
    40  	JobSummarySnapshot
    41  	VaultAccessorSnapshot
    42  	JobVersionSnapshot
    43  	DeploymentSnapshot
    44  )
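
        // Each record written to a snapshot consists of one of these type bytes
        // followed by the msgpack-encoded record itself; Persist emits records in
        // that form and Restore switches on the leading byte to decode them.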
    45  
    46  // nomadFSM implements a finite state machine that is used
    47  // along with Raft to provide strong consistency. We implement
    48  // this outside the Server to avoid exposing this outside the package.
    49  type nomadFSM struct {
    50  	evalBroker         *EvalBroker
    51  	blockedEvals       *BlockedEvals
    52  	periodicDispatcher *PeriodicDispatch
    53  	logOutput          io.Writer
    54  	logger             *log.Logger
    55  	state              *state.StateStore
    56  	timetable          *TimeTable
    57  
    58  	// stateLock is only used to protect outside callers to State() from
    59  	// racing with Restore(), which is called by Raft (it puts in a totally
    60  	// new state store). Everything internal here is synchronized by the
    61  	// Raft side, so it doesn't need to take this lock.
    62  	stateLock sync.RWMutex
    63  }
    64  
    65  // nomadSnapshot is used to provide a snapshot of the current
    66  // state in a way that can be accessed concurrently with operations
    67  // that may modify the live state.
    68  type nomadSnapshot struct {
    69  	snap      *state.StateSnapshot
    70  	timetable *TimeTable
    71  }
    72  
    73  // snapshotHeader is the first entry in our snapshot
    74  type snapshotHeader struct {
    75  }
    76  
    77  // NewFSM is used to construct a new FSM with a blank state
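        //
        // A minimal usage sketch (hypothetical variable names; the eval broker,
        // periodic dispatcher, and blocked-evals tracker are assumed to be
        // constructed elsewhere):
        //
        //	fsm, err := NewFSM(evalBroker, periodicDispatcher, blockedEvals, os.Stderr)
        //	if err != nil {
        //		return err
        //	}
        //	// The FSM is then handed to Raft, which calls Apply for committed log
        //	// entries and Snapshot/Restore for snapshotting.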
    78  func NewFSM(evalBroker *EvalBroker, periodic *PeriodicDispatch,
    79  	blocked *BlockedEvals, logOutput io.Writer) (*nomadFSM, error) {
    80  	// Create a state store
    81  	state, err := state.NewStateStore(logOutput)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  
    86  	fsm := &nomadFSM{
    87  		evalBroker:         evalBroker,
    88  		periodicDispatcher: periodic,
    89  		blockedEvals:       blocked,
    90  		logOutput:          logOutput,
    91  		logger:             log.New(logOutput, "", log.LstdFlags),
    92  		state:              state,
    93  		timetable:          NewTimeTable(timeTableGranularity, timeTableLimit),
    94  	}
    95  	return fsm, nil
    96  }
    97  
    98  // Close is used to clean up resources associated with the FSM
    99  func (n *nomadFSM) Close() error {
   100  	return nil
   101  }
   102  
   103  // State is used to return a handle to the current state
   104  func (n *nomadFSM) State() *state.StateStore {
   105  	n.stateLock.RLock()
   106  	defer n.stateLock.RUnlock()
   107  	return n.state
   108  }
   109  
   110  // TimeTable returns the time table of transactions
   111  func (n *nomadFSM) TimeTable() *TimeTable {
   112  	return n.timetable
   113  }
   114  
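        // Apply is invoked by Raft once a log entry is committed. The first byte of
        // log.Data encodes the structs.MessageType (optionally OR'd with
        // structs.IgnoreUnknownTypeFlag) and the remainder is the msgpack-encoded
        // request, which is dispatched to the matching apply* handler below.
        //
        // A sketch of how callers typically frame such a command (the request
        // variable here is illustrative):
        //
        //	buf, err := structs.Encode(structs.NodeRegisterRequestType, &req)
        //	if err != nil {
        //		return err
        //	}
        //	// buf[0] carries the message type; buf[1:] is the msgpack payload that
        //	// the handler decodes with structs.Decode.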
   115  func (n *nomadFSM) Apply(log *raft.Log) interface{} {
   116  	buf := log.Data
   117  	msgType := structs.MessageType(buf[0])
   118  
   119  	// Witness this write
   120  	n.timetable.Witness(log.Index, time.Now().UTC())
   121  
   122  	// Check if this message type should be ignored when unknown. This gives
   123  	// developers control over whether older versions can safely ignore a new
   124  	// command or whether they should crash on it.
   125  	ignoreUnknown := false
   126  	if msgType&structs.IgnoreUnknownTypeFlag == structs.IgnoreUnknownTypeFlag {
   127  		msgType &= ^structs.IgnoreUnknownTypeFlag
   128  		ignoreUnknown = true
   129  	}
   130  
   131  	switch msgType {
   132  	case structs.NodeRegisterRequestType:
   133  		return n.applyUpsertNode(buf[1:], log.Index)
   134  	case structs.NodeDeregisterRequestType:
   135  		return n.applyDeregisterNode(buf[1:], log.Index)
   136  	case structs.NodeUpdateStatusRequestType:
   137  		return n.applyStatusUpdate(buf[1:], log.Index)
   138  	case structs.NodeUpdateDrainRequestType:
   139  		return n.applyDrainUpdate(buf[1:], log.Index)
   140  	case structs.JobRegisterRequestType:
   141  		return n.applyUpsertJob(buf[1:], log.Index)
   142  	case structs.JobDeregisterRequestType:
   143  		return n.applyDeregisterJob(buf[1:], log.Index)
   144  	case structs.EvalUpdateRequestType:
   145  		return n.applyUpdateEval(buf[1:], log.Index)
   146  	case structs.EvalDeleteRequestType:
   147  		return n.applyDeleteEval(buf[1:], log.Index)
   148  	case structs.AllocUpdateRequestType:
   149  		return n.applyAllocUpdate(buf[1:], log.Index)
   150  	case structs.AllocClientUpdateRequestType:
   151  		return n.applyAllocClientUpdate(buf[1:], log.Index)
   152  	case structs.ReconcileJobSummariesRequestType:
   153  		return n.applyReconcileSummaries(buf[1:], log.Index)
   154  	case structs.VaultAccessorRegisterRequestType:
   155  		return n.applyUpsertVaultAccessor(buf[1:], log.Index)
   156  	case structs.VaultAccessorDegisterRequestType:
   157  		return n.applyDeregisterVaultAccessor(buf[1:], log.Index)
   158  	case structs.ApplyPlanResultsRequestType:
   159  		return n.applyPlanResults(buf[1:], log.Index)
   160  	default:
   161  		if ignoreUnknown {
   162  			n.logger.Printf("[WARN] nomad.fsm: ignoring unknown message type (%d), upgrade to newer version", msgType)
   163  			return nil
   164  		} else {
   165  			panic(fmt.Errorf("failed to apply request: %#v", buf))
   166  		}
   167  	}
   168  }
   169  
   170  func (n *nomadFSM) applyUpsertNode(buf []byte, index uint64) interface{} {
   171  	defer metrics.MeasureSince([]string{"nomad", "fsm", "register_node"}, time.Now())
   172  	var req structs.NodeRegisterRequest
   173  	if err := structs.Decode(buf, &req); err != nil {
   174  		panic(fmt.Errorf("failed to decode request: %v", err))
   175  	}
   176  
   177  	if err := n.state.UpsertNode(index, req.Node); err != nil {
   178  		n.logger.Printf("[ERR] nomad.fsm: UpsertNode failed: %v", err)
   179  		return err
   180  	}
   181  
   182  	// Unblock evals for the node's computed node class if it is in a ready
   183  	// state.
   184  	if req.Node.Status == structs.NodeStatusReady {
   185  		n.blockedEvals.Unblock(req.Node.ComputedClass, index)
   186  	}
   187  
   188  	return nil
   189  }
   190  
   191  func (n *nomadFSM) applyDeregisterNode(buf []byte, index uint64) interface{} {
   192  	defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_node"}, time.Now())
   193  	var req structs.NodeDeregisterRequest
   194  	if err := structs.Decode(buf, &req); err != nil {
   195  		panic(fmt.Errorf("failed to decode request: %v", err))
   196  	}
   197  
   198  	if err := n.state.DeleteNode(index, req.NodeID); err != nil {
   199  		n.logger.Printf("[ERR] nomad.fsm: DeleteNode failed: %v", err)
   200  		return err
   201  	}
   202  	return nil
   203  }
   204  
   205  func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} {
   206  	defer metrics.MeasureSince([]string{"nomad", "fsm", "node_status_update"}, time.Now())
   207  	var req structs.NodeUpdateStatusRequest
   208  	if err := structs.Decode(buf, &req); err != nil {
   209  		panic(fmt.Errorf("failed to decode request: %v", err))
   210  	}
   211  
   212  	if err := n.state.UpdateNodeStatus(index, req.NodeID, req.Status); err != nil {
   213  		n.logger.Printf("[ERR] nomad.fsm: UpdateNodeStatus failed: %v", err)
   214  		return err
   215  	}
   216  
   217  	// Unblock evals for the node's computed node class if it is in a ready
   218  	// state.
   219  	if req.Status == structs.NodeStatusReady {
   220  		ws := memdb.NewWatchSet()
   221  		node, err := n.state.NodeByID(ws, req.NodeID)
   222  		if err != nil {
   223  			n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", req.NodeID, err)
   224  			return err
   225  
   226  		}
   227  		n.blockedEvals.Unblock(node.ComputedClass, index)
   228  	}
   229  
   230  	return nil
   231  }
   232  
   233  func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} {
   234  	defer metrics.MeasureSince([]string{"nomad", "fsm", "node_drain_update"}, time.Now())
   235  	var req structs.NodeUpdateDrainRequest
   236  	if err := structs.Decode(buf, &req); err != nil {
   237  		panic(fmt.Errorf("failed to decode request: %v", err))
   238  	}
   239  
   240  	if err := n.state.UpdateNodeDrain(index, req.NodeID, req.Drain); err != nil {
   241  		n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err)
   242  		return err
   243  	}
   244  	return nil
   245  }
   246  
   247  func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} {
   248  	defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now())
   249  	var req structs.JobRegisterRequest
   250  	if err := structs.Decode(buf, &req); err != nil {
   251  		panic(fmt.Errorf("failed to decode request: %v", err))
   252  	}
   253  
   254  	// COMPAT: Remove in 0.6
   255  	// Empty maps and slices should be treated as nil to avoid
   256  	// unintended destructive updates in the scheduler since we use
   257  	// reflect.DeepEqual. Starting Nomad 0.4.1, job submission sanitizes
   258  	// the incoming job.
   259  	req.Job.Canonicalize()
   260  
   261  	if err := n.state.UpsertJob(index, req.Job); err != nil {
   262  		n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err)
   263  		return err
   264  	}
   265  
   266  	// We always add the job to the periodic dispatcher because the periodic
   267  	// spec may have been removed, in which case the dispatcher needs to stop
   268  	// tracking it.
   269  	if err := n.periodicDispatcher.Add(req.Job); err != nil {
   270  		n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Add failed: %v", err)
   271  		return err
   272  	}
   273  
   274  	// Create a watch set
   275  	ws := memdb.NewWatchSet()
   276  
   277  	// If it is periodic, record the time it was inserted. This is necessary for
   278  	// recovering during leader election. It is possible that from the time it
   279  	// is added to when it was supposed to launch, leader election occurs and the
   280  	// job was not launched. In this case, we use the insertion time to
   281  	// determine if a launch was missed.
   282  	if req.Job.IsPeriodic() {
   283  		prevLaunch, err := n.state.PeriodicLaunchByID(ws, req.Job.ID)
   284  		if err != nil {
   285  			n.logger.Printf("[ERR] nomad.fsm: PeriodicLaunchByID failed: %v", err)
   286  			return err
   287  		}
   288  
   289  		// Record the insertion time as a launch. We overload the launch table
   290  		// such that the first entry is the insertion time.
   291  		if prevLaunch == nil {
   292  			launch := &structs.PeriodicLaunch{ID: req.Job.ID, Launch: time.Now()}
   293  			if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil {
   294  				n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err)
   295  				return err
   296  			}
   297  		}
   298  	}
   299  
   300  	// Check if the parent job is periodic and mark the launch time.
   301  	parentID := req.Job.ParentID
   302  	if parentID != "" {
   303  		parent, err := n.state.JobByID(ws, parentID)
   304  		if err != nil {
   305  			n.logger.Printf("[ERR] nomad.fsm: JobByID(%v) lookup for parent failed: %v", parentID, err)
   306  			return err
   307  		} else if parent == nil {
   308  			// The parent has been deregistered.
   309  			return nil
   310  		}
   311  
   312  		if parent.IsPeriodic() && !parent.IsParameterized() {
   313  			t, err := n.periodicDispatcher.LaunchTime(req.Job.ID)
   314  			if err != nil {
   315  				n.logger.Printf("[ERR] nomad.fsm: LaunchTime(%v) failed: %v", req.Job.ID, err)
   316  				return err
   317  			}
   318  
   319  			launch := &structs.PeriodicLaunch{ID: parentID, Launch: t}
   320  			if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil {
   321  				n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err)
   322  				return err
   323  			}
   324  		}
   325  	}
   326  
   327  	return nil
   328  }
   329  
   330  func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} {
   331  	defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_job"}, time.Now())
   332  	var req structs.JobDeregisterRequest
   333  	if err := structs.Decode(buf, &req); err != nil {
   334  		panic(fmt.Errorf("failed to decode request: %v", err))
   335  	}
   336  
   337  	// If it is periodic, remove it from the dispatcher
   338  	if err := n.periodicDispatcher.Remove(req.JobID); err != nil {
   339  		n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Remove failed: %v", err)
   340  		return err
   341  	}
   342  
   343  	if req.Purge {
   344  		if err := n.state.DeleteJob(index, req.JobID); err != nil {
   345  			n.logger.Printf("[ERR] nomad.fsm: DeleteJob failed: %v", err)
   346  			return err
   347  		}
   348  
   349  		// We always delete from the periodic launch table because it is possible that
   350  		// the job was updated to be non-periodic, thus checking if it is periodic
   351  		// doesn't ensure we clean it up properly.
   352  		n.state.DeletePeriodicLaunch(index, req.JobID)
   353  	} else {
   354  		// Get the current job, mark it as stopped, and re-insert it.
   355  		ws := memdb.NewWatchSet()
   356  		current, err := n.state.JobByID(ws, req.JobID)
   357  		if err != nil {
   358  			n.logger.Printf("[ERR] nomad.fsm: JobByID lookup failed: %v", err)
   359  			return err
   360  		}
   361  
   362  		if current == nil {
   363  			return fmt.Errorf("job %q doesn't exist to be deregistered", req.JobID)
   364  		}
   365  
   366  		stopped := current.Copy()
   367  		stopped.Stop = true
   368  
   369  		if err := n.state.UpsertJob(index, stopped); err != nil {
   370  			n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err)
   371  			return err
   372  		}
   373  	}
   374  
   375  	return nil
   376  }
   377  
   378  func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} {
   379  	defer metrics.MeasureSince([]string{"nomad", "fsm", "update_eval"}, time.Now())
   380  	var req structs.EvalUpdateRequest
   381  	if err := structs.Decode(buf, &req); err != nil {
   382  		panic(fmt.Errorf("failed to decode request: %v", err))
   383  	}
   384  
   385  	if err := n.state.UpsertEvals(index, req.Evals); err != nil {
   386  		n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err)
   387  		return err
   388  	}
   389  
   390  	for _, eval := range req.Evals {
   391  		if eval.ShouldEnqueue() {
   392  			n.evalBroker.Enqueue(eval)
   393  		} else if eval.ShouldBlock() {
   394  			n.blockedEvals.Block(eval)
   395  		} else if eval.Status == structs.EvalStatusComplete &&
   396  			len(eval.FailedTGAllocs) == 0 {
   397  			// If we have a successful evaluation for the job with no failed
   398  			// allocations, untrack any blocked evaluation for that job
   399  			n.blockedEvals.Untrack(eval.JobID)
   400  		}
   401  	}
   402  	return nil
   403  }
   404  
   405  func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} {
   406  	defer metrics.MeasureSince([]string{"nomad", "fsm", "delete_eval"}, time.Now())
   407  	var req structs.EvalDeleteRequest
   408  	if err := structs.Decode(buf, &req); err != nil {
   409  		panic(fmt.Errorf("failed to decode request: %v", err))
   410  	}
   411  
   412  	if err := n.state.DeleteEval(index, req.Evals, req.Allocs); err != nil {
   413  		n.logger.Printf("[ERR] nomad.fsm: DeleteEval failed: %v", err)
   414  		return err
   415  	}
   416  	return nil
   417  }
   418  
   419  func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} {
   420  	defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update"}, time.Now())
   421  	var req structs.AllocUpdateRequest
   422  	if err := structs.Decode(buf, &req); err != nil {
   423  		panic(fmt.Errorf("failed to decode request: %v", err))
   424  	}
   425  
   426  	// Attach the job to all the allocations. The job is pulled out of the
   427  	// payload to avoid redundant encoding, but should be denormalized
   428  	// prior to being inserted into MemDB.
   429  	structs.DenormalizeAllocationJobs(req.Job, req.Alloc)
   430  
   431  	// Calculate the total resources of the allocations. This is pulled out of
   432  	// the payload to avoid encoding something that can be computed, but should
   433  	// be denormalized prior to being inserted into MemDB.
   434  	for _, alloc := range req.Alloc {
   435  		if alloc.Resources != nil {
   436  			// COMPAT 0.4.1 -> 0.5
   437  			// Set the shared resources for allocations which don't have them
   438  			if alloc.SharedResources == nil {
   439  				alloc.SharedResources = &structs.Resources{
   440  					DiskMB: alloc.Resources.DiskMB,
   441  				}
   442  			}
   443  
   444  			continue
   445  		}
   446  
   447  		alloc.Resources = new(structs.Resources)
   448  		for _, task := range alloc.TaskResources {
   449  			alloc.Resources.Add(task)
   450  		}
   451  
   452  		// Add the shared resources
   453  		alloc.Resources.Add(alloc.SharedResources)
   454  	}
   455  
   456  	if err := n.state.UpsertAllocs(index, req.Alloc); err != nil {
   457  		n.logger.Printf("[ERR] nomad.fsm: UpsertAllocs failed: %v", err)
   458  		return err
   459  	}
   460  	return nil
   461  }
   462  
   463  func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} {
   464  	defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_client_update"}, time.Now())
   465  	var req structs.AllocUpdateRequest
   466  	if err := structs.Decode(buf, &req); err != nil {
   467  		panic(fmt.Errorf("failed to decode request: %v", err))
   468  	}
   469  	if len(req.Alloc) == 0 {
   470  		return nil
   471  	}
   472  
   473  	// Create a watch set
   474  	ws := memdb.NewWatchSet()
   475  
   476  	// Update the allocs with the job ID and task group name
   477  	for _, alloc := range req.Alloc {
   478  		if existing, _ := n.state.AllocByID(ws, alloc.ID); existing != nil {
   479  			alloc.JobID = existing.JobID
   480  			alloc.TaskGroup = existing.TaskGroup
   481  		}
   482  	}
   483  
   484  	// Update all the client allocations
   485  	if err := n.state.UpdateAllocsFromClient(index, req.Alloc); err != nil {
   486  		n.logger.Printf("[ERR] nomad.fsm: UpdateAllocFromClient failed: %v", err)
   487  		return err
   488  	}
   489  
   490  	// Unblock evals for the node's computed node class if the client has
   491  	// finished running an allocation.
   492  	for _, alloc := range req.Alloc {
   493  		if alloc.ClientStatus == structs.AllocClientStatusComplete ||
   494  			alloc.ClientStatus == structs.AllocClientStatusFailed {
   495  			nodeID := alloc.NodeID
   496  			node, err := n.state.NodeByID(ws, nodeID)
   497  			if err != nil || node == nil {
   498  				n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", nodeID, err)
   499  				return err
   500  
   501  			}
   502  			n.blockedEvals.Unblock(node.ComputedClass, index)
   503  		}
   504  	}
   505  
   506  	return nil
   507  }
   508  
   509  // applyReconcileSummaries reconciles summaries for all the jobs
   510  func (n *nomadFSM) applyReconcileSummaries(buf []byte, index uint64) interface{} {
   511  	if err := n.state.ReconcileJobSummaries(index); err != nil {
   512  		return err
   513  	}
   514  	return n.reconcileQueuedAllocations(index)
   515  }
   516  
   517  // applyUpsertVaultAccessor stores the Vault accessors for a given allocation
   518  // and task
   519  func (n *nomadFSM) applyUpsertVaultAccessor(buf []byte, index uint64) interface{} {
   520  	defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_vault_accessor"}, time.Now())
   521  	var req structs.VaultAccessorsRequest
   522  	if err := structs.Decode(buf, &req); err != nil {
   523  		panic(fmt.Errorf("failed to decode request: %v", err))
   524  	}
   525  
   526  	if err := n.state.UpsertVaultAccessor(index, req.Accessors); err != nil {
   527  		n.logger.Printf("[ERR] nomad.fsm: UpsertVaultAccessor failed: %v", err)
   528  		return err
   529  	}
   530  
   531  	return nil
   532  }
   533  
   534  // applyDeregisterVaultAccessor deregisters a set of Vault accessors
   535  func (n *nomadFSM) applyDeregisterVaultAccessor(buf []byte, index uint64) interface{} {
   536  	defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_vault_accessor"}, time.Now())
   537  	var req structs.VaultAccessorsRequest
   538  	if err := structs.Decode(buf, &req); err != nil {
   539  		panic(fmt.Errorf("failed to decode request: %v", err))
   540  	}
   541  
   542  	if err := n.state.DeleteVaultAccessors(index, req.Accessors); err != nil {
   543  		n.logger.Printf("[ERR] nomad.fsm: DeregisterVaultAccessor failed: %v", err)
   544  		return err
   545  	}
   546  
   547  	return nil
   548  }
   549  
   550  // applyPlanResults applies the results of a plan application
   551  func (n *nomadFSM) applyPlanResults(buf []byte, index uint64) interface{} {
   552  	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_plan_results"}, time.Now())
   553  	var req structs.ApplyPlanResultsRequest
   554  	if err := structs.Decode(buf, &req); err != nil {
   555  		panic(fmt.Errorf("failed to decode request: %v", err))
   556  	}
   557  
   558  	if err := n.state.UpsertPlanResults(index, &req); err != nil {
   559  		n.logger.Printf("[ERR] nomad.fsm: ApplyPlan failed: %v", err)
   560  		return err
   561  	}
   562  
   563  	return nil
   564  }
   565  
   566  func (n *nomadFSM) Snapshot() (raft.FSMSnapshot, error) {
   567  	// Create a new snapshot
   568  	snap, err := n.state.Snapshot()
   569  	if err != nil {
   570  		return nil, err
   571  	}
   572  
   573  	ns := &nomadSnapshot{
   574  		snap:      snap,
   575  		timetable: n.timetable,
   576  	}
   577  	return ns, nil
   578  }
   579  
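        // Restore rebuilds the FSM from a snapshot stream. It creates a fresh state
        // store, decodes the snapshot header, replays each type-prefixed record into
        // the new store, and finally swaps the store in under stateLock and abandons
        // the old one so that blocking queries against it are woken up.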
   580  func (n *nomadFSM) Restore(old io.ReadCloser) error {
   581  	defer old.Close()
   582  
   583  	// Create a new state store
   584  	newState, err := state.NewStateStore(n.logOutput)
   585  	if err != nil {
   586  		return err
   587  	}
   588  
   589  	// Start the state restore
   590  	restore, err := newState.Restore()
   591  	if err != nil {
   592  		return err
   593  	}
   594  	defer restore.Abort()
   595  
   596  	// Create a decoder
   597  	dec := codec.NewDecoder(old, structs.MsgpackHandle)
   598  
   599  	// Read in the header
   600  	var header snapshotHeader
   601  	if err := dec.Decode(&header); err != nil {
   602  		return err
   603  	}
   604  
   605  	// Populate the new state
   606  	msgType := make([]byte, 1)
   607  	for {
   608  		// Read the message type
   609  		_, err := old.Read(msgType)
   610  		if err == io.EOF {
   611  			break
   612  		} else if err != nil {
   613  			return err
   614  		}
   615  
   616  		// Decode
   617  		switch SnapshotType(msgType[0]) {
   618  		case TimeTableSnapshot:
   619  			if err := n.timetable.Deserialize(dec); err != nil {
   620  				return fmt.Errorf("time table deserialize failed: %v", err)
   621  			}
   622  
   623  		case NodeSnapshot:
   624  			node := new(structs.Node)
   625  			if err := dec.Decode(node); err != nil {
   626  				return err
   627  			}
   628  			if err := restore.NodeRestore(node); err != nil {
   629  				return err
   630  			}
   631  
   632  		case JobSnapshot:
   633  			job := new(structs.Job)
   634  			if err := dec.Decode(job); err != nil {
   635  				return err
   636  			}
   637  
   638  			// COMPAT: Remove in 0.5
   639  			// Empty maps and slices should be treated as nil to avoid
   640  			// unintended destructive updates in the scheduler since we use
   641  			// reflect.DeepEqual. Starting Nomad 0.4.1, job submission sanitizes
   642  			// the incoming job.
   643  			job.Canonicalize()
   644  
   645  			if err := restore.JobRestore(job); err != nil {
   646  				return err
   647  			}
   648  
   649  		case EvalSnapshot:
   650  			eval := new(structs.Evaluation)
   651  			if err := dec.Decode(eval); err != nil {
   652  				return err
   653  			}
   654  			if err := restore.EvalRestore(eval); err != nil {
   655  				return err
   656  			}
   657  
   658  		case AllocSnapshot:
   659  			alloc := new(structs.Allocation)
   660  			if err := dec.Decode(alloc); err != nil {
   661  				return err
   662  			}
   663  			if err := restore.AllocRestore(alloc); err != nil {
   664  				return err
   665  			}
   666  
   667  		case IndexSnapshot:
   668  			idx := new(state.IndexEntry)
   669  			if err := dec.Decode(idx); err != nil {
   670  				return err
   671  			}
   672  			if err := restore.IndexRestore(idx); err != nil {
   673  				return err
   674  			}
   675  
   676  		case PeriodicLaunchSnapshot:
   677  			launch := new(structs.PeriodicLaunch)
   678  			if err := dec.Decode(launch); err != nil {
   679  				return err
   680  			}
   681  			if err := restore.PeriodicLaunchRestore(launch); err != nil {
   682  				return err
   683  			}
   684  
   685  		case JobSummarySnapshot:
   686  			summary := new(structs.JobSummary)
   687  			if err := dec.Decode(summary); err != nil {
   688  				return err
   689  			}
   690  			if err := restore.JobSummaryRestore(summary); err != nil {
   691  				return err
   692  			}
   693  
   694  		case VaultAccessorSnapshot:
   695  			accessor := new(structs.VaultAccessor)
   696  			if err := dec.Decode(accessor); err != nil {
   697  				return err
   698  			}
   699  			if err := restore.VaultAccessorRestore(accessor); err != nil {
   700  				return err
   701  			}
   702  
   703  		case JobVersionSnapshot:
   704  			version := new(structs.Job)
   705  			if err := dec.Decode(version); err != nil {
   706  				return err
   707  			}
   708  			if err := restore.JobVersionRestore(version); err != nil {
   709  				return err
   710  			}
   711  
   712  		case DeploymentSnapshot:
   713  			deployment := new(structs.Deployment)
   714  			if err := dec.Decode(deployment); err != nil {
   715  				return err
   716  			}
   717  			if err := restore.DeploymentRestore(deployment); err != nil {
   718  				return err
   719  			}
   720  
   721  		default:
   722  			return fmt.Errorf("Unrecognized snapshot type: %v", msgType)
   723  		}
   724  	}
   725  
   726  	restore.Commit()
   727  
   728  	// Create Job Summaries
   729  	// COMPAT 0.4 -> 0.4.1
   730  	// We can remove this in 0.5. This exists so that the server creates job
   731  	// summaries if they were not present previously. When users upgrade to 0.5
   732  	// from 0.4.1, the snapshot will contain job summaries so it will be safe to
   733  	// remove this block.
   734  	index, err := newState.Index("job_summary")
   735  	if err != nil {
   736  		return fmt.Errorf("couldn't fetch index of job summary table: %v", err)
   737  	}
   738  
   739  	// If the index is 0, there are no job summaries in the snapshot, so
   740  	// we will have to create them
   741  	if index == 0 {
   742  		// query the latest index
   743  		latestIndex, err := newState.LatestIndex()
   744  		if err != nil {
   745  			return fmt.Errorf("unable to query latest index: %v", err)
   746  		}
   747  		if err := newState.ReconcileJobSummaries(latestIndex); err != nil {
   748  			return fmt.Errorf("error reconciling summaries: %v", err)
   749  		}
   750  	}
   751  
   752  	// External code might be calling State(), so we need to synchronize
   753  	// here to make sure we swap in the new state store atomically.
   754  	n.stateLock.Lock()
   755  	stateOld := n.state
   756  	n.state = newState
   757  	n.stateLock.Unlock()
   758  
   759  	// Signal that the old state store has been abandoned. This is required
   760  	// because we don't operate on it any more, we just throw it away, so
   761  	// blocking queries won't see any changes and need to be woken up.
   762  	stateOld.Abandon()
   763  
   764  	return nil
   765  }
   766  
   767  // reconcileQueuedAllocations re-calculates the queued allocations for every job
   768  // for which a job summary was created during the snapshot restore.
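        //
        // Each job is run through the scheduler against a state snapshot with a
        // scheduler.Harness planner, so plans are only annotated rather than
        // committed, and the resulting queued counts are folded back into the job
        // summary in the live state store.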
   769  func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error {
   770  	// Get all the jobs
   771  	ws := memdb.NewWatchSet()
   772  	iter, err := n.state.Jobs(ws)
   773  	if err != nil {
   774  		return err
   775  	}
   776  
   777  	snap, err := n.state.Snapshot()
   778  	if err != nil {
   779  		return fmt.Errorf("unable to create snapshot: %v", err)
   780  	}
   781  
   782  	// Invoke the scheduler for every job so that we can populate the number
   783  	// of queued allocations for each job
   784  	for {
   785  		rawJob := iter.Next()
   786  		if rawJob == nil {
   787  			break
   788  		}
   789  		job := rawJob.(*structs.Job)
   790  		planner := &scheduler.Harness{
   791  			State: &snap.StateStore,
   792  		}
   793  		// Create an eval, mark it as requiring annotations, and process it below
   794  		eval := &structs.Evaluation{
   795  			ID:             structs.GenerateUUID(),
   796  			Priority:       job.Priority,
   797  			Type:           job.Type,
   798  			TriggeredBy:    structs.EvalTriggerJobRegister,
   799  			JobID:          job.ID,
   800  			JobModifyIndex: job.JobModifyIndex + 1,
   801  			Status:         structs.EvalStatusPending,
   802  			AnnotatePlan:   true,
   803  		}
   804  
   805  		// Create the scheduler and run it
   806  		sched, err := scheduler.NewScheduler(eval.Type, n.logger, snap, planner)
   807  		if err != nil {
   808  			return err
   809  		}
   810  
   811  		if err := sched.Process(eval); err != nil {
   812  			return err
   813  		}
   814  
   815  		// Get the job summary from the fsm state store
   816  		originalSummary, err := n.state.JobSummaryByID(ws, job.ID)
   817  		if err != nil {
   818  			return err
   819  		}
   820  		summary := originalSummary.Copy()
   821  
   822  		// Add the allocations the scheduler has made to the queued count, since
   823  		// these allocations will never be placed until the scheduler is invoked
   824  		// with a real planner
   825  		if l := len(planner.Plans); l != 1 {
   826  			return fmt.Errorf("unexpected number of plans during restore %d. Please file an issue including the logs", l)
   827  		}
   828  		for _, allocations := range planner.Plans[0].NodeAllocation {
   829  			for _, allocation := range allocations {
   830  				tgSummary, ok := summary.Summary[allocation.TaskGroup]
   831  				if !ok {
   832  					return fmt.Errorf("task group %q not found while updating queued count", allocation.TaskGroup)
   833  				}
   834  				tgSummary.Queued += 1
   835  				summary.Summary[allocation.TaskGroup] = tgSummary
   836  			}
   837  		}
   838  
   839  		// Add the queued allocations attached to the evaluation to the queued
   840  		// counter of the job summary
   841  		if l := len(planner.Evals); l != 1 {
   842  			return fmt.Errorf("unexpected number of evals during restore %d. Please file an issue including the logs", l)
   843  		}
   844  		for tg, queued := range planner.Evals[0].QueuedAllocations {
   845  			tgSummary, ok := summary.Summary[tg]
   846  			if !ok {
   847  				return fmt.Errorf("task group %q not found while updating queued count", tg)
   848  			}
   849  
   850  			// We add instead of setting here because we want to take into
   851  			// consideration what the scheduler with a mock planner thinks it
   852  			// placed. Those should be counted as queued as well
   853  			tgSummary.Queued += queued
   854  			summary.Summary[tg] = tgSummary
   855  		}
   856  
   857  		if !reflect.DeepEqual(summary, originalSummary) {
   858  			summary.ModifyIndex = index
   859  			if err := n.state.UpsertJobSummary(index, summary); err != nil {
   860  				return err
   861  			}
   862  		}
   863  	}
   864  	return nil
   865  }
   866  
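        // Persist writes the snapshot to the given sink: a msgpack-encoded
        // snapshotHeader, then the time table, then every table's records, each
        // prefixed by its SnapshotType byte. Any encoding error cancels the sink.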
   867  func (s *nomadSnapshot) Persist(sink raft.SnapshotSink) error {
   868  	defer metrics.MeasureSince([]string{"nomad", "fsm", "persist"}, time.Now())
   869  	// Create an encoder for writing snapshot records to the sink
   870  	encoder := codec.NewEncoder(sink, structs.MsgpackHandle)
   871  
   872  	// Write the header
   873  	header := snapshotHeader{}
   874  	if err := encoder.Encode(&header); err != nil {
   875  		sink.Cancel()
   876  		return err
   877  	}
   878  
   879  	// Write the time table
   880  	sink.Write([]byte{byte(TimeTableSnapshot)})
   881  	if err := s.timetable.Serialize(encoder); err != nil {
   882  		sink.Cancel()
   883  		return err
   884  	}
   885  
   886  	// Write all the data out
   887  	if err := s.persistIndexes(sink, encoder); err != nil {
   888  		sink.Cancel()
   889  		return err
   890  	}
   891  	if err := s.persistNodes(sink, encoder); err != nil {
   892  		sink.Cancel()
   893  		return err
   894  	}
   895  	if err := s.persistJobs(sink, encoder); err != nil {
   896  		sink.Cancel()
   897  		return err
   898  	}
   899  	if err := s.persistEvals(sink, encoder); err != nil {
   900  		sink.Cancel()
   901  		return err
   902  	}
   903  	if err := s.persistAllocs(sink, encoder); err != nil {
   904  		sink.Cancel()
   905  		return err
   906  	}
   907  	if err := s.persistPeriodicLaunches(sink, encoder); err != nil {
   908  		sink.Cancel()
   909  		return err
   910  	}
   911  	if err := s.persistJobSummaries(sink, encoder); err != nil {
   912  		sink.Cancel()
   913  		return err
   914  	}
   915  	if err := s.persistVaultAccessors(sink, encoder); err != nil {
   916  		sink.Cancel()
   917  		return err
   918  	}
   919  	if err := s.persistJobVersions(sink, encoder); err != nil {
   920  		sink.Cancel()
   921  		return err
   922  	}
   923  	if err := s.persistDeployments(sink, encoder); err != nil {
   924  		sink.Cancel()
   925  		return err
   926  	}
   927  	return nil
   928  }
   929  
   930  func (s *nomadSnapshot) persistIndexes(sink raft.SnapshotSink,
   931  	encoder *codec.Encoder) error {
   932  	// Get all the indexes
   933  	iter, err := s.snap.Indexes()
   934  	if err != nil {
   935  		return err
   936  	}
   937  
   938  	for {
   939  		// Get the next item
   940  		raw := iter.Next()
   941  		if raw == nil {
   942  			break
   943  		}
   944  
   945  		// Prepare the request struct
   946  		idx := raw.(*state.IndexEntry)
   947  
   948  		// Write out the index entry
   949  		sink.Write([]byte{byte(IndexSnapshot)})
   950  		if err := encoder.Encode(idx); err != nil {
   951  			return err
   952  		}
   953  	}
   954  	return nil
   955  }
   956  
   957  func (s *nomadSnapshot) persistNodes(sink raft.SnapshotSink,
   958  	encoder *codec.Encoder) error {
   959  	// Get all the nodes
   960  	ws := memdb.NewWatchSet()
   961  	nodes, err := s.snap.Nodes(ws)
   962  	if err != nil {
   963  		return err
   964  	}
   965  
   966  	for {
   967  		// Get the next item
   968  		raw := nodes.Next()
   969  		if raw == nil {
   970  			break
   971  		}
   972  
   973  		// Prepare the request struct
   974  		node := raw.(*structs.Node)
   975  
   976  		// Write out a node registration
   977  		sink.Write([]byte{byte(NodeSnapshot)})
   978  		if err := encoder.Encode(node); err != nil {
   979  			return err
   980  		}
   981  	}
   982  	return nil
   983  }
   984  
   985  func (s *nomadSnapshot) persistJobs(sink raft.SnapshotSink,
   986  	encoder *codec.Encoder) error {
   987  	// Get all the jobs
   988  	ws := memdb.NewWatchSet()
   989  	jobs, err := s.snap.Jobs(ws)
   990  	if err != nil {
   991  		return err
   992  	}
   993  
   994  	for {
   995  		// Get the next item
   996  		raw := jobs.Next()
   997  		if raw == nil {
   998  			break
   999  		}
  1000  
  1001  		// Prepare the request struct
  1002  		job := raw.(*structs.Job)
  1003  
  1004  		// Write out a job registration
  1005  		sink.Write([]byte{byte(JobSnapshot)})
  1006  		if err := encoder.Encode(job); err != nil {
  1007  			return err
  1008  		}
  1009  	}
  1010  	return nil
  1011  }
  1012  
  1013  func (s *nomadSnapshot) persistEvals(sink raft.SnapshotSink,
  1014  	encoder *codec.Encoder) error {
  1015  	// Get all the evaluations
  1016  	ws := memdb.NewWatchSet()
  1017  	evals, err := s.snap.Evals(ws)
  1018  	if err != nil {
  1019  		return err
  1020  	}
  1021  
  1022  	for {
  1023  		// Get the next item
  1024  		raw := evals.Next()
  1025  		if raw == nil {
  1026  			break
  1027  		}
  1028  
  1029  		// Prepare the request struct
  1030  		eval := raw.(*structs.Evaluation)
  1031  
  1032  		// Write out the evaluation
  1033  		sink.Write([]byte{byte(EvalSnapshot)})
  1034  		if err := encoder.Encode(eval); err != nil {
  1035  			return err
  1036  		}
  1037  	}
  1038  	return nil
  1039  }
  1040  
  1041  func (s *nomadSnapshot) persistAllocs(sink raft.SnapshotSink,
  1042  	encoder *codec.Encoder) error {
  1043  	// Get all the allocations
  1044  	ws := memdb.NewWatchSet()
  1045  	allocs, err := s.snap.Allocs(ws)
  1046  	if err != nil {
  1047  		return err
  1048  	}
  1049  
  1050  	for {
  1051  		// Get the next item
  1052  		raw := allocs.Next()
  1053  		if raw == nil {
  1054  			break
  1055  		}
  1056  
  1057  		// Prepare the request struct
  1058  		alloc := raw.(*structs.Allocation)
  1059  
  1060  		// Write out the allocation
  1061  		sink.Write([]byte{byte(AllocSnapshot)})
  1062  		if err := encoder.Encode(alloc); err != nil {
  1063  			return err
  1064  		}
  1065  	}
  1066  	return nil
  1067  }
  1068  
  1069  func (s *nomadSnapshot) persistPeriodicLaunches(sink raft.SnapshotSink,
  1070  	encoder *codec.Encoder) error {
  1071  	// Get all the periodic launches
  1072  	ws := memdb.NewWatchSet()
  1073  	launches, err := s.snap.PeriodicLaunches(ws)
  1074  	if err != nil {
  1075  		return err
  1076  	}
  1077  
  1078  	for {
  1079  		// Get the next item
  1080  		raw := launches.Next()
  1081  		if raw == nil {
  1082  			break
  1083  		}
  1084  
  1085  		// Prepare the request struct
  1086  		launch := raw.(*structs.PeriodicLaunch)
  1087  
  1088  		// Write out the periodic launch
  1089  		sink.Write([]byte{byte(PeriodicLaunchSnapshot)})
  1090  		if err := encoder.Encode(launch); err != nil {
  1091  			return err
  1092  		}
  1093  	}
  1094  	return nil
  1095  }
  1096  
  1097  func (s *nomadSnapshot) persistJobSummaries(sink raft.SnapshotSink,
  1098  	encoder *codec.Encoder) error {
  1099  
  1100  	ws := memdb.NewWatchSet()
  1101  	summaries, err := s.snap.JobSummaries(ws)
  1102  	if err != nil {
  1103  		return err
  1104  	}
  1105  
  1106  	for {
  1107  		raw := summaries.Next()
  1108  		if raw == nil {
  1109  			break
  1110  		}
  1111  
  1112  		jobSummary := raw.(*structs.JobSummary)
  1113  
  1114  		sink.Write([]byte{byte(JobSummarySnapshot)})
  1115  		if err := encoder.Encode(jobSummary); err != nil {
  1116  			return err
  1117  		}
  1118  	}
  1119  	return nil
  1120  }
  1121  
  1122  func (s *nomadSnapshot) persistVaultAccessors(sink raft.SnapshotSink,
  1123  	encoder *codec.Encoder) error {
  1124  
  1125  	ws := memdb.NewWatchSet()
  1126  	accessors, err := s.snap.VaultAccessors(ws)
  1127  	if err != nil {
  1128  		return err
  1129  	}
  1130  
  1131  	for {
  1132  		raw := accessors.Next()
  1133  		if raw == nil {
  1134  			break
  1135  		}
  1136  
  1137  		accessor := raw.(*structs.VaultAccessor)
  1138  
  1139  		sink.Write([]byte{byte(VaultAccessorSnapshot)})
  1140  		if err := encoder.Encode(accessor); err != nil {
  1141  			return err
  1142  		}
  1143  	}
  1144  	return nil
  1145  }
  1146  
  1147  func (s *nomadSnapshot) persistJobVersions(sink raft.SnapshotSink,
  1148  	encoder *codec.Encoder) error {
  1149  	// Get all the job versions
  1150  	ws := memdb.NewWatchSet()
  1151  	versions, err := s.snap.JobVersions(ws)
  1152  	if err != nil {
  1153  		return err
  1154  	}
  1155  
  1156  	for {
  1157  		// Get the next item
  1158  		raw := versions.Next()
  1159  		if raw == nil {
  1160  			break
  1161  		}
  1162  
  1163  		// Prepare the request struct
  1164  		job := raw.(*structs.Job)
  1165  
  1166  		// Write out the job version
  1167  		sink.Write([]byte{byte(JobVersionSnapshot)})
  1168  		if err := encoder.Encode(job); err != nil {
  1169  			return err
  1170  		}
  1171  	}
  1172  	return nil
  1173  }
  1174  
  1175  func (s *nomadSnapshot) persistDeployments(sink raft.SnapshotSink,
  1176  	encoder *codec.Encoder) error {
  1177  	// Get all the deployments
  1178  	ws := memdb.NewWatchSet()
  1179  	deployments, err := s.snap.Deployments(ws)
  1180  	if err != nil {
  1181  		return err
  1182  	}
  1183  
  1184  	for {
  1185  		// Get the next item
  1186  		raw := deployments.Next()
  1187  		if raw == nil {
  1188  			break
  1189  		}
  1190  
  1191  		// Prepare the request struct
  1192  		deployment := raw.(*structs.Deployment)
  1193  
  1194  		// Write out the deployment
  1195  		sink.Write([]byte{byte(DeploymentSnapshot)})
  1196  		if err := encoder.Encode(deployment); err != nil {
  1197  			return err
  1198  		}
  1199  	}
  1200  	return nil
  1201  }
  1202  
  1203  // Release is a no-op, as we just need to GC the pointer
  1204  // to the state store snapshot. There is nothing to explicitly
  1205  // clean up.
  1206  func (s *nomadSnapshot) Release() {}