github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/nomad/fsm.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "io" 6 "reflect" 7 "sync" 8 "time" 9 10 metrics "github.com/armon/go-metrics" 11 log "github.com/hashicorp/go-hclog" 12 memdb "github.com/hashicorp/go-memdb" 13 "github.com/hashicorp/go-msgpack/codec" 14 "github.com/hashicorp/nomad/helper/uuid" 15 "github.com/hashicorp/nomad/nomad/state" 16 "github.com/hashicorp/nomad/nomad/structs" 17 "github.com/hashicorp/nomad/scheduler" 18 "github.com/hashicorp/raft" 19 "github.com/pkg/errors" 20 ) 21 22 const ( 23 // timeTableGranularity is the granularity of index to time tracking 24 timeTableGranularity = 5 * time.Minute 25 26 // timeTableLimit is the maximum limit of our tracking 27 timeTableLimit = 72 * time.Hour 28 ) 29 30 // SnapshotType is prefixed to a record in the FSM snapshot 31 // so that we can determine the type for restore 32 type SnapshotType byte 33 34 const ( 35 NodeSnapshot SnapshotType = 0 36 JobSnapshot SnapshotType = 1 37 IndexSnapshot SnapshotType = 2 38 EvalSnapshot SnapshotType = 3 39 AllocSnapshot SnapshotType = 4 40 TimeTableSnapshot SnapshotType = 5 41 PeriodicLaunchSnapshot SnapshotType = 6 42 JobSummarySnapshot SnapshotType = 7 43 VaultAccessorSnapshot SnapshotType = 8 44 JobVersionSnapshot SnapshotType = 9 45 DeploymentSnapshot SnapshotType = 10 46 ACLPolicySnapshot SnapshotType = 11 47 ACLTokenSnapshot SnapshotType = 12 48 SchedulerConfigSnapshot SnapshotType = 13 49 ClusterMetadataSnapshot SnapshotType = 14 50 ServiceIdentityTokenAccessorSnapshot SnapshotType = 15 51 ScalingPolicySnapshot SnapshotType = 16 52 CSIPluginSnapshot SnapshotType = 17 53 CSIVolumeSnapshot SnapshotType = 18 54 ScalingEventsSnapshot SnapshotType = 19 55 EventSinkSnapshot SnapshotType = 20 56 // Namespace appliers were moved from enterprise and therefore start at 64 57 NamespaceSnapshot SnapshotType = 64 58 ) 59 60 // LogApplier is the definition of a function that can apply a Raft log 61 type 
LogApplier func(buf []byte, index uint64) interface{} 62 63 // LogAppliers is a mapping of the Raft MessageType to the appropriate log 64 // applier 65 type LogAppliers map[structs.MessageType]LogApplier 66 67 // SnapshotRestorer is the definition of a function that can apply a Raft log 68 type SnapshotRestorer func(restore *state.StateRestore, dec *codec.Decoder) error 69 70 // SnapshotRestorers is a mapping of the SnapshotType to the appropriate 71 // snapshot restorer. 72 type SnapshotRestorers map[SnapshotType]SnapshotRestorer 73 74 // nomadFSM implements a finite state machine that is used 75 // along with Raft to provide strong consistency. We implement 76 // this outside the Server to avoid exposing this outside the package. 77 type nomadFSM struct { 78 evalBroker *EvalBroker 79 blockedEvals *BlockedEvals 80 periodicDispatcher *PeriodicDispatch 81 logger log.Logger 82 state *state.StateStore 83 timetable *TimeTable 84 85 // config is the FSM config 86 config *FSMConfig 87 88 // enterpriseAppliers holds the set of enterprise only LogAppliers 89 enterpriseAppliers LogAppliers 90 91 // enterpriseRestorers holds the set of enterprise only snapshot restorers 92 enterpriseRestorers SnapshotRestorers 93 94 // stateLock is only used to protect outside callers to State() from 95 // racing with Restore(), which is called by Raft (it puts in a totally 96 // new state store). Everything internal here is synchronized by the 97 // Raft side, so doesn't need to lock this. 98 stateLock sync.RWMutex 99 } 100 101 // nomadSnapshot is used to provide a snapshot of the current 102 // state in a way that can be accessed concurrently with operations 103 // that may modify the live state. 
104 type nomadSnapshot struct { 105 snap *state.StateSnapshot 106 timetable *TimeTable 107 } 108 109 // snapshotHeader is the first entry in our snapshot 110 type snapshotHeader struct { 111 } 112 113 // FSMConfig is used to configure the FSM 114 type FSMConfig struct { 115 // EvalBroker is the evaluation broker evaluations should be added to 116 EvalBroker *EvalBroker 117 118 // Periodic is the periodic job dispatcher that periodic jobs should be 119 // added/removed from 120 Periodic *PeriodicDispatch 121 122 // BlockedEvals is the blocked eval tracker that blocked evaluations should 123 // be added to. 124 Blocked *BlockedEvals 125 126 // Logger is the logger used by the FSM 127 Logger log.Logger 128 129 // Region is the region of the server embedding the FSM 130 Region string 131 132 // EnableEventBroker specifies if the FSMs state store should enable 133 // it's event publisher. 134 EnableEventBroker bool 135 136 // EventBufferSize is the amount of messages to hold in memory 137 EventBufferSize int64 138 } 139 140 // NewFSMPath is used to construct a new FSM with a blank state 141 func NewFSM(config *FSMConfig) (*nomadFSM, error) { 142 // Create a state store 143 sconfig := &state.StateStoreConfig{ 144 Logger: config.Logger, 145 Region: config.Region, 146 EnablePublisher: config.EnableEventBroker, 147 EventBufferSize: config.EventBufferSize, 148 } 149 state, err := state.NewStateStore(sconfig) 150 if err != nil { 151 return nil, err 152 } 153 154 fsm := &nomadFSM{ 155 evalBroker: config.EvalBroker, 156 periodicDispatcher: config.Periodic, 157 blockedEvals: config.Blocked, 158 logger: config.Logger.Named("fsm"), 159 config: config, 160 state: state, 161 timetable: NewTimeTable(timeTableGranularity, timeTableLimit), 162 enterpriseAppliers: make(map[structs.MessageType]LogApplier, 8), 163 enterpriseRestorers: make(map[SnapshotType]SnapshotRestorer, 8), 164 } 165 166 // Register all the log applier functions 167 fsm.registerLogAppliers() 168 169 // Register all 
the snapshot restorer functions 170 fsm.registerSnapshotRestorers() 171 172 return fsm, nil 173 } 174 175 // Close is used to cleanup resources associated with the FSM 176 func (n *nomadFSM) Close() error { 177 n.state.StopEventBroker() 178 return nil 179 } 180 181 // State is used to return a handle to the current state 182 func (n *nomadFSM) State() *state.StateStore { 183 n.stateLock.RLock() 184 defer n.stateLock.RUnlock() 185 return n.state 186 } 187 188 // TimeTable returns the time table of transactions 189 func (n *nomadFSM) TimeTable() *TimeTable { 190 return n.timetable 191 } 192 193 func (n *nomadFSM) Apply(log *raft.Log) interface{} { 194 buf := log.Data 195 msgType := structs.MessageType(buf[0]) 196 197 // Witness this write 198 n.timetable.Witness(log.Index, time.Now().UTC()) 199 200 // Check if this message type should be ignored when unknown. This is 201 // used so that new commands can be added with developer control if older 202 // versions can safely ignore the command, or if they should crash. 
203 ignoreUnknown := false 204 if msgType&structs.IgnoreUnknownTypeFlag == structs.IgnoreUnknownTypeFlag { 205 msgType &= ^structs.IgnoreUnknownTypeFlag 206 ignoreUnknown = true 207 } 208 209 switch msgType { 210 case structs.NodeRegisterRequestType: 211 return n.applyUpsertNode(msgType, buf[1:], log.Index) 212 case structs.NodeDeregisterRequestType: 213 return n.applyDeregisterNode(msgType, buf[1:], log.Index) 214 case structs.NodeUpdateStatusRequestType: 215 return n.applyStatusUpdate(msgType, buf[1:], log.Index) 216 case structs.NodeUpdateDrainRequestType: 217 return n.applyDrainUpdate(msgType, buf[1:], log.Index) 218 case structs.JobRegisterRequestType: 219 return n.applyUpsertJob(msgType, buf[1:], log.Index) 220 case structs.JobDeregisterRequestType: 221 return n.applyDeregisterJob(msgType, buf[1:], log.Index) 222 case structs.EvalUpdateRequestType: 223 return n.applyUpdateEval(msgType, buf[1:], log.Index) 224 case structs.EvalDeleteRequestType: 225 return n.applyDeleteEval(buf[1:], log.Index) 226 case structs.AllocUpdateRequestType: 227 return n.applyAllocUpdate(msgType, buf[1:], log.Index) 228 case structs.AllocClientUpdateRequestType: 229 return n.applyAllocClientUpdate(msgType, buf[1:], log.Index) 230 case structs.ReconcileJobSummariesRequestType: 231 return n.applyReconcileSummaries(buf[1:], log.Index) 232 case structs.VaultAccessorRegisterRequestType: 233 return n.applyUpsertVaultAccessor(buf[1:], log.Index) 234 case structs.VaultAccessorDeregisterRequestType: 235 return n.applyDeregisterVaultAccessor(buf[1:], log.Index) 236 case structs.ApplyPlanResultsRequestType: 237 return n.applyPlanResults(msgType, buf[1:], log.Index) 238 case structs.DeploymentStatusUpdateRequestType: 239 return n.applyDeploymentStatusUpdate(msgType, buf[1:], log.Index) 240 case structs.DeploymentPromoteRequestType: 241 return n.applyDeploymentPromotion(msgType, buf[1:], log.Index) 242 case structs.DeploymentAllocHealthRequestType: 243 return n.applyDeploymentAllocHealth(msgType, 
buf[1:], log.Index) 244 case structs.DeploymentDeleteRequestType: 245 return n.applyDeploymentDelete(buf[1:], log.Index) 246 case structs.JobStabilityRequestType: 247 return n.applyJobStability(buf[1:], log.Index) 248 case structs.ACLPolicyUpsertRequestType: 249 return n.applyACLPolicyUpsert(msgType, buf[1:], log.Index) 250 case structs.ACLPolicyDeleteRequestType: 251 return n.applyACLPolicyDelete(msgType, buf[1:], log.Index) 252 case structs.ACLTokenUpsertRequestType: 253 return n.applyACLTokenUpsert(msgType, buf[1:], log.Index) 254 case structs.ACLTokenDeleteRequestType: 255 return n.applyACLTokenDelete(msgType, buf[1:], log.Index) 256 case structs.ACLTokenBootstrapRequestType: 257 return n.applyACLTokenBootstrap(msgType, buf[1:], log.Index) 258 case structs.AutopilotRequestType: 259 return n.applyAutopilotUpdate(buf[1:], log.Index) 260 case structs.UpsertNodeEventsType: 261 return n.applyUpsertNodeEvent(msgType, buf[1:], log.Index) 262 case structs.JobBatchDeregisterRequestType: 263 return n.applyBatchDeregisterJob(msgType, buf[1:], log.Index) 264 case structs.AllocUpdateDesiredTransitionRequestType: 265 return n.applyAllocUpdateDesiredTransition(msgType, buf[1:], log.Index) 266 case structs.NodeUpdateEligibilityRequestType: 267 return n.applyNodeEligibilityUpdate(msgType, buf[1:], log.Index) 268 case structs.BatchNodeUpdateDrainRequestType: 269 return n.applyBatchDrainUpdate(msgType, buf[1:], log.Index) 270 case structs.SchedulerConfigRequestType: 271 return n.applySchedulerConfigUpdate(buf[1:], log.Index) 272 case structs.NodeBatchDeregisterRequestType: 273 return n.applyDeregisterNodeBatch(msgType, buf[1:], log.Index) 274 case structs.ClusterMetadataRequestType: 275 return n.applyClusterMetadata(buf[1:], log.Index) 276 case structs.ServiceIdentityAccessorRegisterRequestType: 277 return n.applyUpsertSIAccessor(buf[1:], log.Index) 278 case structs.ServiceIdentityAccessorDeregisterRequestType: 279 return n.applyDeregisterSIAccessor(buf[1:], log.Index) 280 case 
structs.CSIVolumeRegisterRequestType: 281 return n.applyCSIVolumeRegister(buf[1:], log.Index) 282 case structs.CSIVolumeDeregisterRequestType: 283 return n.applyCSIVolumeDeregister(buf[1:], log.Index) 284 case structs.CSIVolumeClaimRequestType: 285 return n.applyCSIVolumeClaim(buf[1:], log.Index) 286 case structs.ScalingEventRegisterRequestType: 287 return n.applyUpsertScalingEvent(buf[1:], log.Index) 288 case structs.CSIVolumeClaimBatchRequestType: 289 return n.applyCSIVolumeBatchClaim(buf[1:], log.Index) 290 case structs.CSIPluginDeleteRequestType: 291 return n.applyCSIPluginDelete(buf[1:], log.Index) 292 case structs.NamespaceUpsertRequestType: 293 return n.applyNamespaceUpsert(buf[1:], log.Index) 294 case structs.NamespaceDeleteRequestType: 295 return n.applyNamespaceDelete(buf[1:], log.Index) 296 // COMPAT(1.0): These messages were added and removed during the 1.0-beta 297 // series and should not be immediately reused for other purposes 298 case structs.EventSinkUpsertRequestType, 299 structs.EventSinkDeleteRequestType, 300 structs.BatchEventSinkUpdateProgressType: 301 return nil 302 } 303 304 // Check enterprise only message types. 
305 if applier, ok := n.enterpriseAppliers[msgType]; ok { 306 return applier(buf[1:], log.Index) 307 } 308 309 // We didn't match anything, either panic or ignore 310 if ignoreUnknown { 311 n.logger.Warn("ignoring unknown message type, upgrade to newer version", "msg_type", msgType) 312 return nil 313 } 314 315 panic(fmt.Errorf("failed to apply request: %#v", buf)) 316 } 317 318 func (n *nomadFSM) applyClusterMetadata(buf []byte, index uint64) interface{} { 319 defer metrics.MeasureSince([]string{"nomad", "fsm", "cluster_meta"}, time.Now()) 320 321 var req structs.ClusterMetadata 322 if err := structs.Decode(buf, &req); err != nil { 323 panic(fmt.Errorf("failed to decode request: %v", err)) 324 } 325 326 if err := n.state.ClusterSetMetadata(index, &req); err != nil { 327 n.logger.Error("ClusterSetMetadata failed", "error", err) 328 return err 329 } 330 331 n.logger.Trace("ClusterSetMetadata", "cluster_id", req.ClusterID, "create_time", req.CreateTime) 332 333 return nil 334 } 335 336 func (n *nomadFSM) applyUpsertNode(reqType structs.MessageType, buf []byte, index uint64) interface{} { 337 defer metrics.MeasureSince([]string{"nomad", "fsm", "register_node"}, time.Now()) 338 var req structs.NodeRegisterRequest 339 if err := structs.Decode(buf, &req); err != nil { 340 panic(fmt.Errorf("failed to decode request: %v", err)) 341 } 342 343 // Handle upgrade paths 344 req.Node.Canonicalize() 345 346 if err := n.state.UpsertNode(reqType, index, req.Node); err != nil { 347 n.logger.Error("UpsertNode failed", "error", err) 348 return err 349 } 350 351 // Unblock evals for the nodes computed node class if it is in a ready 352 // state. 
353 if req.Node.Status == structs.NodeStatusReady { 354 n.blockedEvals.Unblock(req.Node.ComputedClass, index) 355 } 356 357 return nil 358 } 359 360 func (n *nomadFSM) applyDeregisterNode(reqType structs.MessageType, buf []byte, index uint64) interface{} { 361 defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_node"}, time.Now()) 362 var req structs.NodeDeregisterRequest 363 if err := structs.Decode(buf, &req); err != nil { 364 panic(fmt.Errorf("failed to decode request: %v", err)) 365 } 366 367 if err := n.state.DeleteNode(reqType, index, []string{req.NodeID}); err != nil { 368 n.logger.Error("DeleteNode failed", "error", err) 369 return err 370 } 371 372 return nil 373 } 374 375 func (n *nomadFSM) applyDeregisterNodeBatch(reqType structs.MessageType, buf []byte, index uint64) interface{} { 376 defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_deregister_node"}, time.Now()) 377 var req structs.NodeBatchDeregisterRequest 378 if err := structs.Decode(buf, &req); err != nil { 379 panic(fmt.Errorf("failed to decode request: %v", err)) 380 } 381 382 if err := n.state.DeleteNode(reqType, index, req.NodeIDs); err != nil { 383 n.logger.Error("DeleteNode failed", "error", err) 384 return err 385 } 386 387 return nil 388 } 389 390 func (n *nomadFSM) applyStatusUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} { 391 defer metrics.MeasureSince([]string{"nomad", "fsm", "node_status_update"}, time.Now()) 392 var req structs.NodeUpdateStatusRequest 393 if err := structs.Decode(buf, &req); err != nil { 394 panic(fmt.Errorf("failed to decode request: %v", err)) 395 } 396 397 if err := n.state.UpdateNodeStatus(msgType, index, req.NodeID, req.Status, req.UpdatedAt, req.NodeEvent); err != nil { 398 n.logger.Error("UpdateNodeStatus failed", "error", err) 399 return err 400 } 401 402 // Unblock evals for the nodes computed node class if it is in a ready 403 // state. 
404 if req.Status == structs.NodeStatusReady { 405 ws := memdb.NewWatchSet() 406 node, err := n.state.NodeByID(ws, req.NodeID) 407 if err != nil { 408 n.logger.Error("looking up node failed", "node_id", req.NodeID, "error", err) 409 return err 410 411 } 412 n.blockedEvals.Unblock(node.ComputedClass, index) 413 n.blockedEvals.UnblockNode(req.NodeID, index) 414 } 415 416 return nil 417 } 418 419 func (n *nomadFSM) applyDrainUpdate(reqType structs.MessageType, buf []byte, index uint64) interface{} { 420 defer metrics.MeasureSince([]string{"nomad", "fsm", "node_drain_update"}, time.Now()) 421 var req structs.NodeUpdateDrainRequest 422 if err := structs.Decode(buf, &req); err != nil { 423 panic(fmt.Errorf("failed to decode request: %v", err)) 424 } 425 426 // COMPAT Remove in version 0.10 427 // As part of Nomad 0.8 we have deprecated the drain boolean in favor of a 428 // drain strategy but we need to handle the upgrade path where the Raft log 429 // contains drain updates with just the drain boolean being manipulated. 430 if req.Drain && req.DrainStrategy == nil { 431 // Mark the drain strategy as a force to imitate the old style drain 432 // functionality. 
433 req.DrainStrategy = &structs.DrainStrategy{ 434 DrainSpec: structs.DrainSpec{ 435 Deadline: -1 * time.Second, 436 }, 437 } 438 } 439 440 if err := n.state.UpdateNodeDrain(reqType, index, req.NodeID, req.DrainStrategy, req.MarkEligible, req.UpdatedAt, req.NodeEvent); err != nil { 441 n.logger.Error("UpdateNodeDrain failed", "error", err) 442 return err 443 } 444 return nil 445 } 446 447 func (n *nomadFSM) applyBatchDrainUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} { 448 defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) 449 var req structs.BatchNodeUpdateDrainRequest 450 if err := structs.Decode(buf, &req); err != nil { 451 panic(fmt.Errorf("failed to decode request: %v", err)) 452 } 453 454 if err := n.state.BatchUpdateNodeDrain(msgType, index, req.UpdatedAt, req.Updates, req.NodeEvents); err != nil { 455 n.logger.Error("BatchUpdateNodeDrain failed", "error", err) 456 return err 457 } 458 return nil 459 } 460 461 func (n *nomadFSM) applyNodeEligibilityUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} { 462 defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) 463 var req structs.NodeUpdateEligibilityRequest 464 if err := structs.Decode(buf, &req); err != nil { 465 panic(fmt.Errorf("failed to decode request: %v", err)) 466 } 467 468 // Lookup the existing node 469 node, err := n.state.NodeByID(nil, req.NodeID) 470 if err != nil { 471 n.logger.Error("UpdateNodeEligibility failed to lookup node", "node_id", req.NodeID, "error", err) 472 return err 473 } 474 475 if err := n.state.UpdateNodeEligibility(msgType, index, req.NodeID, req.Eligibility, req.UpdatedAt, req.NodeEvent); err != nil { 476 n.logger.Error("UpdateNodeEligibility failed", "error", err) 477 return err 478 } 479 480 // Unblock evals for the nodes computed node class if it is in a ready 481 // state. 
482 if node != nil && node.SchedulingEligibility == structs.NodeSchedulingIneligible && 483 req.Eligibility == structs.NodeSchedulingEligible { 484 n.blockedEvals.Unblock(node.ComputedClass, index) 485 n.blockedEvals.UnblockNode(req.NodeID, index) 486 } 487 488 return nil 489 } 490 491 func (n *nomadFSM) applyUpsertJob(msgType structs.MessageType, buf []byte, index uint64) interface{} { 492 defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) 493 var req structs.JobRegisterRequest 494 if err := structs.Decode(buf, &req); err != nil { 495 panic(fmt.Errorf("failed to decode request: %v", err)) 496 } 497 498 /* Handle upgrade paths: 499 * - Empty maps and slices should be treated as nil to avoid 500 * un-intended destructive updates in scheduler since we use 501 * reflect.DeepEqual. Starting Nomad 0.4.1, job submission sanitizes 502 * the incoming job. 503 * - Migrate from old style upgrade stanza that used only a stagger. 504 */ 505 req.Job.Canonicalize() 506 507 if err := n.state.UpsertJob(msgType, index, req.Job); err != nil { 508 n.logger.Error("UpsertJob failed", "error", err) 509 return err 510 } 511 512 // We always add the job to the periodic dispatcher because there is the 513 // possibility that the periodic spec was removed and then we should stop 514 // tracking it. 515 if err := n.periodicDispatcher.Add(req.Job); err != nil { 516 n.logger.Error("periodicDispatcher.Add failed", "error", err) 517 return fmt.Errorf("failed adding job to periodic dispatcher: %v", err) 518 } 519 520 // Create a watch set 521 ws := memdb.NewWatchSet() 522 523 // If it is an active periodic job, record the time it was inserted. This is 524 // necessary for recovering during leader election. It is possible that from 525 // the time it is added to when it was suppose to launch, leader election 526 // occurs and the job was not launched. In this case, we use the insertion 527 // time to determine if a launch was missed. 
528 if req.Job.IsPeriodicActive() { 529 prevLaunch, err := n.state.PeriodicLaunchByID(ws, req.Namespace, req.Job.ID) 530 if err != nil { 531 n.logger.Error("PeriodicLaunchByID failed", "error", err) 532 return err 533 } 534 535 // Record the insertion time as a launch. We overload the launch table 536 // such that the first entry is the insertion time. 537 if prevLaunch == nil { 538 launch := &structs.PeriodicLaunch{ 539 ID: req.Job.ID, 540 Namespace: req.Namespace, 541 Launch: time.Now(), 542 } 543 if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { 544 n.logger.Error("UpsertPeriodicLaunch failed", "error", err) 545 return err 546 } 547 } 548 } 549 550 // Check if the parent job is periodic and mark the launch time. 551 parentID := req.Job.ParentID 552 if parentID != "" { 553 parent, err := n.state.JobByID(ws, req.Namespace, parentID) 554 if err != nil { 555 n.logger.Error("JobByID lookup for parent failed", "parent_id", parentID, "namespace", req.Namespace, "error", err) 556 return err 557 } else if parent == nil { 558 // The parent has been deregistered. 559 return nil 560 } 561 562 if parent.IsPeriodic() && !parent.IsParameterized() { 563 t, err := n.periodicDispatcher.LaunchTime(req.Job.ID) 564 if err != nil { 565 n.logger.Error("LaunchTime failed", "job", req.Job.NamespacedID(), "error", err) 566 return err 567 } 568 569 launch := &structs.PeriodicLaunch{ 570 ID: parentID, 571 Namespace: req.Namespace, 572 Launch: t, 573 } 574 if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { 575 n.logger.Error("UpsertPeriodicLaunch failed", "error", err) 576 return err 577 } 578 } 579 } 580 581 // COMPAT: Prior to Nomad 0.12.x evaluations were submitted in a separate Raft log, 582 // so this may be nil during server upgrades. 
583 if req.Eval != nil { 584 req.Eval.JobModifyIndex = index 585 if err := n.upsertEvals(msgType, index, []*structs.Evaluation{req.Eval}); err != nil { 586 return err 587 } 588 } 589 590 return nil 591 } 592 593 func (n *nomadFSM) applyDeregisterJob(msgType structs.MessageType, buf []byte, index uint64) interface{} { 594 defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_job"}, time.Now()) 595 var req structs.JobDeregisterRequest 596 if err := structs.Decode(buf, &req); err != nil { 597 panic(fmt.Errorf("failed to decode request: %v", err)) 598 } 599 600 err := n.state.WithWriteTransaction(msgType, index, func(tx state.Txn) error { 601 err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge, tx) 602 603 if err != nil { 604 n.logger.Error("deregistering job failed", 605 "error", err, "job", req.JobID, "namespace", req.Namespace) 606 return err 607 } 608 609 return nil 610 }) 611 612 // COMPAT: Prior to Nomad 0.12.x evaluations were submitted in a separate Raft log, 613 // so this may be nil during server upgrades. 614 // always attempt upsert eval even if job deregister fail 615 if req.Eval != nil { 616 req.Eval.JobModifyIndex = index 617 if err := n.upsertEvals(msgType, index, []*structs.Evaluation{req.Eval}); err != nil { 618 return err 619 } 620 } 621 622 if err != nil { 623 return err 624 } 625 626 return nil 627 } 628 629 func (n *nomadFSM) applyBatchDeregisterJob(msgType structs.MessageType, buf []byte, index uint64) interface{} { 630 defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_deregister_job"}, time.Now()) 631 var req structs.JobBatchDeregisterRequest 632 if err := structs.Decode(buf, &req); err != nil { 633 panic(fmt.Errorf("failed to decode request: %v", err)) 634 } 635 636 // Perform all store updates atomically to ensure a consistent view for store readers. 
637 // A partial update may increment the snapshot index, allowing eval brokers to process 638 // evals for jobs whose deregistering didn't get committed yet. 639 err := n.state.WithWriteTransaction(msgType, index, func(tx state.Txn) error { 640 for jobNS, options := range req.Jobs { 641 if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge, tx); err != nil { 642 n.logger.Error("deregistering job failed", "job", jobNS.ID, "error", err) 643 return err 644 } 645 } 646 647 if err := n.state.UpsertEvalsTxn(index, req.Evals, tx); err != nil { 648 n.logger.Error("UpsertEvals failed", "error", err) 649 return err 650 } 651 652 return nil 653 }) 654 655 if err != nil { 656 return err 657 } 658 659 // perform the side effects outside the transactions 660 n.handleUpsertedEvals(req.Evals) 661 return nil 662 } 663 664 // handleJobDeregister is used to deregister a job. Leaves error logging up to 665 // caller. 666 func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool, tx state.Txn) error { 667 // If it is periodic remove it from the dispatcher 668 if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil { 669 return fmt.Errorf("periodicDispatcher.Remove failed: %w", err) 670 } 671 672 if purge { 673 if err := n.state.DeleteJobTxn(index, namespace, jobID, tx); err != nil { 674 return fmt.Errorf("DeleteJob failed: %w", err) 675 } 676 677 // We always delete from the periodic launch table because it is possible that 678 // the job was updated to be non-periodic, thus checking if it is periodic 679 // doesn't ensure we clean it up properly. 680 n.state.DeletePeriodicLaunchTxn(index, namespace, jobID, tx) 681 } else { 682 // Get the current job and mark it as stopped and re-insert it. 
683 ws := memdb.NewWatchSet() 684 current, err := n.state.JobByIDTxn(ws, namespace, jobID, tx) 685 if err != nil { 686 return fmt.Errorf("JobByID lookup failed: %w", err) 687 } 688 689 if current == nil { 690 return fmt.Errorf("job %q in namespace %q doesn't exist to be deregistered", jobID, namespace) 691 } 692 693 stopped := current.Copy() 694 stopped.Stop = true 695 696 if err := n.state.UpsertJobTxn(index, stopped, tx); err != nil { 697 return fmt.Errorf("UpsertJob failed: %w", err) 698 } 699 } 700 701 return nil 702 } 703 704 func (n *nomadFSM) applyUpdateEval(msgType structs.MessageType, buf []byte, index uint64) interface{} { 705 defer metrics.MeasureSince([]string{"nomad", "fsm", "update_eval"}, time.Now()) 706 var req structs.EvalUpdateRequest 707 if err := structs.Decode(buf, &req); err != nil { 708 panic(fmt.Errorf("failed to decode request: %v", err)) 709 } 710 711 return n.upsertEvals(msgType, index, req.Evals) 712 } 713 714 func (n *nomadFSM) upsertEvals(msgType structs.MessageType, index uint64, evals []*structs.Evaluation) error { 715 if err := n.state.UpsertEvals(msgType, index, evals); err != nil { 716 n.logger.Error("UpsertEvals failed", "error", err) 717 return err 718 } 719 720 n.handleUpsertedEvals(evals) 721 return nil 722 } 723 724 // handleUpsertingEval is a helper for taking action after upserting 725 // evaluations. 726 func (n *nomadFSM) handleUpsertedEvals(evals []*structs.Evaluation) { 727 for _, eval := range evals { 728 n.handleUpsertedEval(eval) 729 } 730 } 731 732 // handleUpsertingEval is a helper for taking action after upserting an eval. 
733 func (n *nomadFSM) handleUpsertedEval(eval *structs.Evaluation) { 734 if eval == nil { 735 return 736 } 737 738 if eval.ShouldEnqueue() { 739 n.evalBroker.Enqueue(eval) 740 } else if eval.ShouldBlock() { 741 n.blockedEvals.Block(eval) 742 } else if eval.Status == structs.EvalStatusComplete && 743 len(eval.FailedTGAllocs) == 0 { 744 // If we have a successful evaluation for a node, untrack any 745 // blocked evaluation 746 n.blockedEvals.Untrack(eval.JobID, eval.Namespace) 747 } 748 } 749 750 func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} { 751 defer metrics.MeasureSince([]string{"nomad", "fsm", "delete_eval"}, time.Now()) 752 var req structs.EvalDeleteRequest 753 if err := structs.Decode(buf, &req); err != nil { 754 panic(fmt.Errorf("failed to decode request: %v", err)) 755 } 756 757 if err := n.state.DeleteEval(index, req.Evals, req.Allocs); err != nil { 758 n.logger.Error("DeleteEval failed", "error", err) 759 return err 760 } 761 return nil 762 } 763 764 func (n *nomadFSM) applyAllocUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} { 765 defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update"}, time.Now()) 766 var req structs.AllocUpdateRequest 767 if err := structs.Decode(buf, &req); err != nil { 768 panic(fmt.Errorf("failed to decode request: %v", err)) 769 } 770 771 // Attach the job to all the allocations. It is pulled out in the 772 // payload to avoid the redundancy of encoding, but should be denormalized 773 // prior to being inserted into MemDB. 774 structs.DenormalizeAllocationJobs(req.Job, req.Alloc) 775 776 for _, alloc := range req.Alloc { 777 // COMPAT(0.11): Remove in 0.11 778 // Calculate the total resources of allocations. It is pulled out in the 779 // payload to avoid encoding something that can be computed, but should be 780 // denormalized prior to being inserted into MemDB. 
781 if alloc.Resources == nil { 782 alloc.Resources = new(structs.Resources) 783 for _, task := range alloc.TaskResources { 784 alloc.Resources.Add(task) 785 } 786 787 // Add the shared resources 788 alloc.Resources.Add(alloc.SharedResources) 789 } 790 791 // Handle upgrade path 792 alloc.Canonicalize() 793 } 794 795 if err := n.state.UpsertAllocs(msgType, index, req.Alloc); err != nil { 796 n.logger.Error("UpsertAllocs failed", "error", err) 797 return err 798 } 799 return nil 800 } 801 802 func (n *nomadFSM) applyAllocClientUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} { 803 defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_client_update"}, time.Now()) 804 var req structs.AllocUpdateRequest 805 if err := structs.Decode(buf, &req); err != nil { 806 panic(fmt.Errorf("failed to decode request: %v", err)) 807 } 808 if len(req.Alloc) == 0 { 809 return nil 810 } 811 812 // Create a watch set 813 ws := memdb.NewWatchSet() 814 815 // Updating the allocs with the job id and task group name 816 for _, alloc := range req.Alloc { 817 if existing, _ := n.state.AllocByID(ws, alloc.ID); existing != nil { 818 alloc.JobID = existing.JobID 819 alloc.TaskGroup = existing.TaskGroup 820 } 821 } 822 823 // Update all the client allocations 824 if err := n.state.UpdateAllocsFromClient(msgType, index, req.Alloc); err != nil { 825 n.logger.Error("UpdateAllocFromClient failed", "error", err) 826 return err 827 } 828 829 // Update any evals 830 if len(req.Evals) > 0 { 831 if err := n.upsertEvals(msgType, index, req.Evals); err != nil { 832 n.logger.Error("applyAllocClientUpdate failed to update evaluations", "error", err) 833 return err 834 } 835 } 836 837 // Unblock evals for the nodes computed node class if the client has 838 // finished running an allocation. 
839 for _, alloc := range req.Alloc { 840 if alloc.ClientStatus == structs.AllocClientStatusComplete || 841 alloc.ClientStatus == structs.AllocClientStatusFailed { 842 nodeID := alloc.NodeID 843 node, err := n.state.NodeByID(ws, nodeID) 844 if err != nil || node == nil { 845 n.logger.Error("looking up node failed", "node_id", nodeID, "error", err) 846 return err 847 848 } 849 850 // Unblock any associated quota 851 quota, err := n.allocQuota(alloc.ID) 852 if err != nil { 853 n.logger.Error("looking up quota associated with alloc failed", "alloc_id", alloc.ID, "error", err) 854 return err 855 } 856 857 n.blockedEvals.UnblockClassAndQuota(node.ComputedClass, quota, index) 858 n.blockedEvals.UnblockNode(node.ID, index) 859 } 860 } 861 862 return nil 863 } 864 865 // applyAllocUpdateDesiredTransition is used to update the desired transitions 866 // of a set of allocations. 867 func (n *nomadFSM) applyAllocUpdateDesiredTransition(msgType structs.MessageType, buf []byte, index uint64) interface{} { 868 defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) 869 var req structs.AllocUpdateDesiredTransitionRequest 870 if err := structs.Decode(buf, &req); err != nil { 871 panic(fmt.Errorf("failed to decode request: %v", err)) 872 } 873 874 if err := n.state.UpdateAllocsDesiredTransitions(msgType, index, req.Allocs, req.Evals); err != nil { 875 n.logger.Error("UpdateAllocsDesiredTransitions failed", "error", err) 876 return err 877 } 878 879 n.handleUpsertedEvals(req.Evals) 880 return nil 881 } 882 883 // applyReconcileSummaries reconciles summaries for all the jobs 884 func (n *nomadFSM) applyReconcileSummaries(buf []byte, index uint64) interface{} { 885 if err := n.state.ReconcileJobSummaries(index); err != nil { 886 return err 887 } 888 return n.reconcileQueuedAllocations(index) 889 } 890 891 // applyUpsertNodeEvent tracks the given node events. 
func (n *nomadFSM) applyUpsertNodeEvent(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_node_events"}, time.Now())
	var req structs.EmitNodeEventsRequest
	// Decode failures panic: presumably an undecodable Raft log entry is
	// unrecoverable and crashing beats diverging state — confirm convention.
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode EmitNodeEventsRequest: %v", err))
	}

	if err := n.state.UpsertNodeEvents(msgType, index, req.NodeEvents); err != nil {
		n.logger.Error("failed to add node events", "error", err)
		return err
	}

	return nil
}

// applyUpsertVaultAccessor stores the Vault accessors for a given allocation
// and task
func (n *nomadFSM) applyUpsertVaultAccessor(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_vault_accessor"}, time.Now())
	var req structs.VaultAccessorsRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpsertVaultAccessor(index, req.Accessors); err != nil {
		n.logger.Error("UpsertVaultAccessor failed", "error", err)
		return err
	}

	return nil
}

// applyDeregisterVaultAccessor deregisters a set of Vault accessors
func (n *nomadFSM) applyDeregisterVaultAccessor(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_vault_accessor"}, time.Now())
	var req structs.VaultAccessorsRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.DeleteVaultAccessors(index, req.Accessors); err != nil {
		n.logger.Error("DeregisterVaultAccessor failed", "error", err)
		return err
	}

	return nil
}

// applyUpsertSIAccessor stores a set of service identity token accessors in
// the state store.
func (n *nomadFSM) applyUpsertSIAccessor(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_si_accessor"}, time.Now())
	var request structs.SITokenAccessorsRequest
	if err := structs.Decode(buf, &request); err != nil {
		panic(errors.Wrap(err, "failed to decode request"))
	}

	if err := n.state.UpsertSITokenAccessors(index, request.Accessors); err != nil {
		n.logger.Error("UpsertSITokenAccessors failed", "error", err)
		return err
	}

	return nil
}

// applyDeregisterSIAccessor removes a set of service identity token accessors
// from the state store.
func (n *nomadFSM) applyDeregisterSIAccessor(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_si_accessor"}, time.Now())
	var request structs.SITokenAccessorsRequest
	if err := structs.Decode(buf, &request); err != nil {
		panic(errors.Wrap(err, "failed to decode request"))
	}

	if err := n.state.DeleteSITokenAccessors(index, request.Accessors); err != nil {
		n.logger.Error("DeregisterSITokenAccessor failed", "error", err)
		return err
	}

	return nil
}

// applyPlanResults applies the results of a plan application
func (n *nomadFSM) applyPlanResults(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_plan_results"}, time.Now())
	var req structs.ApplyPlanResultsRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpsertPlanResults(msgType, index, &req); err != nil {
		n.logger.Error("ApplyPlan failed", "error", err)
		return err
	}

	// Add evals for jobs that were preempted
	n.handleUpsertedEvals(req.PreemptionEvals)
	return nil
}

// applyDeploymentStatusUpdate is used to update the status of an existing
// deployment
func (n *nomadFSM) applyDeploymentStatusUpdate(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_status_update"}, time.Now())
	var req structs.DeploymentStatusUpdateRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpdateDeploymentStatus(msgType, index, &req); err != nil {
		n.logger.Error("UpsertDeploymentStatusUpdate failed", "error", err)
		return err
	}

	n.handleUpsertedEval(req.Eval)
	return nil
}

// applyDeploymentPromotion is used to promote canaries in a deployment
func (n *nomadFSM) applyDeploymentPromotion(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_promotion"}, time.Now())
	var req structs.ApplyDeploymentPromoteRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpdateDeploymentPromotion(msgType, index, &req); err != nil {
		n.logger.Error("UpsertDeploymentPromotion failed", "error", err)
		return err
	}

	n.handleUpsertedEval(req.Eval)
	return nil
}

// applyDeploymentAllocHealth is used to set the health of allocations as part
// of a deployment
func (n *nomadFSM) applyDeploymentAllocHealth(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_alloc_health"}, time.Now())
	var req structs.ApplyDeploymentAllocHealthRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpdateDeploymentAllocHealth(msgType, index, &req); err != nil {
		n.logger.Error("UpsertDeploymentAllocHealth failed", "error", err)
		return err
	}

	n.handleUpsertedEval(req.Eval)
	return nil
}

// applyDeploymentDelete is used to delete a set of deployments
func (n *nomadFSM) applyDeploymentDelete(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_delete"}, time.Now())
	var req structs.DeploymentDeleteRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.DeleteDeployment(index, req.Deployments); err != nil {
		n.logger.Error("DeleteDeployment failed", "error", err)
		return err
	}

	return nil
}

// applyJobStability is used to set the stability of a job
func (n *nomadFSM) applyJobStability(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_job_stability"}, time.Now())
	var req structs.JobStabilityRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpdateJobStability(index, req.Namespace, req.JobID, req.JobVersion, req.Stable); err != nil {
		n.logger.Error("UpdateJobStability failed", "error", err)
		return err
	}

	return nil
}

// applyACLPolicyUpsert is used to upsert a set of policies
func (n *nomadFSM) applyACLPolicyUpsert(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_upsert"}, time.Now())
	var req structs.ACLPolicyUpsertRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpsertACLPolicies(msgType, index, req.Policies); err != nil {
		n.logger.Error("UpsertACLPolicies failed", "error", err)
		return err
	}
	return nil
}

// applyACLPolicyDelete is used to delete a set of policies
func (n *nomadFSM) applyACLPolicyDelete(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_delete"}, time.Now())
	var req structs.ACLPolicyDeleteRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.DeleteACLPolicies(msgType, index, req.Names); err != nil {
		n.logger.Error("DeleteACLPolicies failed", "error", err)
		return err
	}
	return nil
}

// applyACLTokenUpsert is used to upsert a set of ACL tokens
func (n *nomadFSM) applyACLTokenUpsert(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_upsert"}, time.Now())
	var req structs.ACLTokenUpsertRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.UpsertACLTokens(msgType, index, req.Tokens); err != nil {
		n.logger.Error("UpsertACLTokens failed", "error", err)
		return err
	}
	return nil
}

// applyACLTokenDelete is used to delete a set of ACL tokens by accessor ID
func (n *nomadFSM) applyACLTokenDelete(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_delete"}, time.Now())
	var req structs.ACLTokenDeleteRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.DeleteACLTokens(msgType, index, req.AccessorIDs); err != nil {
		n.logger.Error("DeleteACLTokens failed", "error", err)
		return err
	}
	return nil
}

// applyACLTokenBootstrap is used to bootstrap an ACL token
func (n *nomadFSM) applyACLTokenBootstrap(msgType structs.MessageType, buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_bootstrap"}, time.Now())
	var req structs.ACLTokenBootstrapRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	if err := n.state.BootstrapACLTokens(msgType, index, req.ResetIndex, req.Token); err != nil {
		n.logger.Error("BootstrapACLToken failed", "error", err)
		return err
	}
	return nil
}

// applyAutopilotUpdate stores a new autopilot configuration, optionally as a
// check-and-set against the config's ModifyIndex (the CAS result is returned
// to the caller).
func (n *nomadFSM) applyAutopilotUpdate(buf []byte, index uint64) interface{} {
	var req structs.AutopilotSetConfigRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "autopilot"}, time.Now())

	if req.CAS {
		act, err := n.state.AutopilotCASConfig(index, req.Config.ModifyIndex, &req.Config)
		if err != nil {
			return err
		}
		return act
	}
	return n.state.AutopilotSetConfig(index, &req.Config)
}

// applySchedulerConfigUpdate stores a new scheduler configuration, optionally
// as a check-and-set against the config's ModifyIndex.
func (n *nomadFSM) applySchedulerConfigUpdate(buf []byte, index uint64) interface{} {
	var req structs.SchedulerSetConfigRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_scheduler_config"}, time.Now())

	req.Config.Canonicalize()

	if req.CAS {
		applied, err := n.state.SchedulerCASConfig(index, req.Config.ModifyIndex, &req.Config)
		if err != nil {
			return err
		}
		return applied
	}
	return n.state.SchedulerSetConfig(index, &req.Config)
}

// applyCSIVolumeRegister registers a set of CSI volumes in the state store.
func (n *nomadFSM) applyCSIVolumeRegister(buf []byte, index uint64) interface{} {
	var req structs.CSIVolumeRegisterRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_csi_volume_register"}, time.Now())

	if err := n.state.CSIVolumeRegister(index, req.Volumes); err != nil {
		n.logger.Error("CSIVolumeRegister failed", "error", err)
		return err
	}

	return nil
}

// applyCSIVolumeDeregister removes a set of CSI volumes from the state store.
func (n *nomadFSM) applyCSIVolumeDeregister(buf []byte, index uint64) interface{} {
	var req structs.CSIVolumeDeregisterRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_csi_volume_deregister"}, time.Now())

	if err := n.state.CSIVolumeDeregister(index, req.RequestNamespace(), req.VolumeIDs, req.Force); err != nil {
		n.logger.Error("CSIVolumeDeregister failed", "error", err)
		return err
	}

	return nil
}

// applyCSIVolumeBatchClaim applies a batch of CSI volume claims; the first
// failing claim aborts the remainder of the batch.
func (n *nomadFSM) applyCSIVolumeBatchClaim(buf []byte, index uint64) interface{} {
	var batch *structs.CSIVolumeClaimBatchRequest
	if err := structs.Decode(buf, &batch); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_csi_volume_batch_claim"}, time.Now())

	for _, req := range batch.Claims {
		err := n.state.CSIVolumeClaim(index, req.RequestNamespace(),
			req.VolumeID, req.ToClaim())
		if err != nil {
			n.logger.Error("CSIVolumeClaim for batch failed", "error", err)
			return err // note: fails the remaining batch
		}
	}
	return nil
}

// applyCSIVolumeClaim applies a single CSI volume claim.
func (n *nomadFSM) applyCSIVolumeClaim(buf []byte, index uint64) interface{} {
	var req structs.CSIVolumeClaimRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_csi_volume_claim"}, time.Now())

	if err := n.state.CSIVolumeClaim(index, req.RequestNamespace(), req.VolumeID, req.ToClaim()); err != nil {
		n.logger.Error("CSIVolumeClaim failed", "error", err)
		return err
	}
	return nil
}

// applyCSIPluginDelete removes a CSI plugin from the state store by ID.
func (n *nomadFSM) applyCSIPluginDelete(buf []byte, index uint64) interface{} {
	var req structs.CSIPluginDeleteRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_csi_plugin_delete"}, time.Now())

	if err := n.state.DeleteCSIPlugin(index, req.ID); err != nil {
		// "plugin in use" is an error for the state store but not for typical
		// callers, so reduce log noise by not logging that case here
		if err.Error() != "plugin in use" {
			n.logger.Error("DeleteCSIPlugin failed", "error", err)
		}
		return err
	}
	return nil
}

// applyNamespaceUpsert is used to upsert a set of namespaces
func (n *nomadFSM) applyNamespaceUpsert(buf []byte, index uint64) interface{} {
	defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_namespace_upsert"}, time.Now())
	var req structs.NamespaceUpsertRequest
	if err := structs.Decode(buf, &req); err != nil {
		panic(fmt.Errorf("failed to decode request: %v", err))
	}

	// Collect the quotas that are being replaced so blocked evaluations
	// tracking the old quota can be unblocked after the write below.
	var trigger []string
	for _, ns := range req.Namespaces {
		old, err := n.state.NamespaceByName(nil, ns.Name)
		if err != nil {
			n.logger.Error("namespace lookup failed", "error", err)
			return err
		}

		// If we are changing the quota on a namespace trigger evals for the
		// older quota.
1282 if old != nil && old.Quota != "" && old.Quota != ns.Quota { 1283 trigger = append(trigger, old.Quota) 1284 } 1285 } 1286 1287 if err := n.state.UpsertNamespaces(index, req.Namespaces); err != nil { 1288 n.logger.Error("UpsertNamespaces failed", "error", err) 1289 return err 1290 } 1291 1292 // Send the unblocks 1293 for _, quota := range trigger { 1294 n.blockedEvals.UnblockQuota(quota, index) 1295 } 1296 1297 return nil 1298 } 1299 1300 // applyNamespaceDelete is used to delete a set of namespaces 1301 func (n *nomadFSM) applyNamespaceDelete(buf []byte, index uint64) interface{} { 1302 defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_namespace_delete"}, time.Now()) 1303 var req structs.NamespaceDeleteRequest 1304 if err := structs.Decode(buf, &req); err != nil { 1305 panic(fmt.Errorf("failed to decode request: %v", err)) 1306 } 1307 1308 if err := n.state.DeleteNamespaces(index, req.Namespaces); err != nil { 1309 n.logger.Error("DeleteNamespaces failed", "error", err) 1310 } 1311 1312 return nil 1313 } 1314 1315 func (n *nomadFSM) Snapshot() (raft.FSMSnapshot, error) { 1316 // Create a new snapshot 1317 snap, err := n.state.Snapshot() 1318 if err != nil { 1319 return nil, err 1320 } 1321 1322 ns := &nomadSnapshot{ 1323 snap: snap, 1324 timetable: n.timetable, 1325 } 1326 return ns, nil 1327 } 1328 1329 func (n *nomadFSM) Restore(old io.ReadCloser) error { 1330 defer old.Close() 1331 1332 // Create a new state store 1333 config := &state.StateStoreConfig{ 1334 Logger: n.config.Logger, 1335 Region: n.config.Region, 1336 EnablePublisher: n.config.EnableEventBroker, 1337 EventBufferSize: n.config.EventBufferSize, 1338 } 1339 newState, err := state.NewStateStore(config) 1340 if err != nil { 1341 return err 1342 } 1343 1344 // Start the state restore 1345 restore, err := newState.Restore() 1346 if err != nil { 1347 return err 1348 } 1349 defer restore.Abort() 1350 1351 // Create a decoder 1352 dec := codec.NewDecoder(old, structs.MsgpackHandle) 1353 1354 
	// Read in the header
	var header snapshotHeader
	if err := dec.Decode(&header); err != nil {
		return err
	}

	// Populate the new state
	msgType := make([]byte, 1)
	for {
		// Read the message type. NOTE(review): the one-byte record type is
		// read from the raw reader while record payloads go through dec;
		// this presumably relies on the msgpack decoder not buffering past
		// each record — confirm.
		_, err := old.Read(msgType)
		if err == io.EOF {
			break
		} else if err != nil {
			return err
		}

		// Decode
		snapType := SnapshotType(msgType[0])
		switch snapType {
		case TimeTableSnapshot:
			if err := n.timetable.Deserialize(dec); err != nil {
				return fmt.Errorf("time table deserialize failed: %v", err)
			}

		case NodeSnapshot:
			node := new(structs.Node)
			if err := dec.Decode(node); err != nil {
				return err
			}

			// Handle upgrade paths
			node.Canonicalize()

			if err := restore.NodeRestore(node); err != nil {
				return err
			}

		case JobSnapshot:
			job := new(structs.Job)
			if err := dec.Decode(job); err != nil {
				return err
			}

			/* Handle upgrade paths:
			 * - Empty maps and slices should be treated as nil to avoid
			 *   un-intended destructive updates in scheduler since we use
			 *   reflect.DeepEqual. Starting Nomad 0.4.1, job submission sanitizes
			 *   the incoming job.
			 * - Migrate from old style upgrade stanza that used only a stagger.
			 */
			job.Canonicalize()

			if err := restore.JobRestore(job); err != nil {
				return err
			}

		case EvalSnapshot:
			eval := new(structs.Evaluation)
			if err := dec.Decode(eval); err != nil {
				return err
			}

			if err := restore.EvalRestore(eval); err != nil {
				return err
			}

		case AllocSnapshot:
			alloc := new(structs.Allocation)
			if err := dec.Decode(alloc); err != nil {
				return err
			}

			// Handle upgrade path
			alloc.Canonicalize()

			if err := restore.AllocRestore(alloc); err != nil {
				return err
			}

		case IndexSnapshot:
			idx := new(state.IndexEntry)
			if err := dec.Decode(idx); err != nil {
				return err
			}
			if err := restore.IndexRestore(idx); err != nil {
				return err
			}

		case PeriodicLaunchSnapshot:
			launch := new(structs.PeriodicLaunch)
			if err := dec.Decode(launch); err != nil {
				return err
			}

			if err := restore.PeriodicLaunchRestore(launch); err != nil {
				return err
			}

		case JobSummarySnapshot:
			summary := new(structs.JobSummary)
			if err := dec.Decode(summary); err != nil {
				return err
			}

			if err := restore.JobSummaryRestore(summary); err != nil {
				return err
			}

		case VaultAccessorSnapshot:
			accessor := new(structs.VaultAccessor)
			if err := dec.Decode(accessor); err != nil {
				return err
			}
			if err := restore.VaultAccessorRestore(accessor); err != nil {
				return err
			}

		case ServiceIdentityTokenAccessorSnapshot:
			accessor := new(structs.SITokenAccessor)
			if err := dec.Decode(accessor); err != nil {
				return err
			}
			if err := restore.SITokenAccessorRestore(accessor); err != nil {
				return err
			}

		case JobVersionSnapshot:
			version := new(structs.Job)
			if err := dec.Decode(version); err != nil {
				return err
			}

			if err := restore.JobVersionRestore(version); err != nil {
				return err
			}

		case DeploymentSnapshot:
			deployment := new(structs.Deployment)
			if err := dec.Decode(deployment); err != nil {
				return err
			}

			if err := restore.DeploymentRestore(deployment); err != nil {
				return err
			}

		case ACLPolicySnapshot:
			policy := new(structs.ACLPolicy)
			if err := dec.Decode(policy); err != nil {
				return err
			}
			if err := restore.ACLPolicyRestore(policy); err != nil {
				return err
			}

		case ACLTokenSnapshot:
			token := new(structs.ACLToken)
			if err := dec.Decode(token); err != nil {
				return err
			}
			if err := restore.ACLTokenRestore(token); err != nil {
				return err
			}

		case SchedulerConfigSnapshot:
			schedConfig := new(structs.SchedulerConfiguration)
			if err := dec.Decode(schedConfig); err != nil {
				return err
			}
			// Handle upgrade path: fill in defaults for fields added since
			// the snapshot was written.
			schedConfig.Canonicalize()
			if err := restore.SchedulerConfigRestore(schedConfig); err != nil {
				return err
			}

		case ClusterMetadataSnapshot:
			meta := new(structs.ClusterMetadata)
			if err := dec.Decode(meta); err != nil {
				return err
			}
			if err := restore.ClusterMetadataRestore(meta); err != nil {
				return err
			}

		case ScalingEventsSnapshot:
			jobScalingEvents := new(structs.JobScalingEvents)
			if err := dec.Decode(jobScalingEvents); err != nil {
				return err
			}

			if err := restore.ScalingEventsRestore(jobScalingEvents); err != nil {
				return err
			}

		case ScalingPolicySnapshot:
			scalingPolicy := new(structs.ScalingPolicy)
			if err := dec.Decode(scalingPolicy); err != nil {
				return err
			}

			// Handle upgrade path:
			// - Set policy type if empty
			scalingPolicy.Canonicalize()

			if err := restore.ScalingPolicyRestore(scalingPolicy); err != nil {
				return err
			}

		case CSIPluginSnapshot:
			plugin := new(structs.CSIPlugin)
			if err := dec.Decode(plugin); err != nil {
				return err
			}

			if err := restore.CSIPluginRestore(plugin); err != nil {
				return err
			}

		case CSIVolumeSnapshot:
			plugin := new(structs.CSIVolume)
			if err := dec.Decode(plugin); err != nil {
				return err
			}

			if err := restore.CSIVolumeRestore(plugin); err != nil {
				return err
			}

		case NamespaceSnapshot:
			namespace := new(structs.Namespace)
			if err := dec.Decode(namespace); err != nil {
				return err
			}
			if err := restore.NamespaceRestore(namespace); err != nil {
				return err
			}

		// COMPAT(1.0): Allow 1.0-beta clusterers to gracefully handle
		case EventSinkSnapshot:
			// NOTE(review): returning nil here exits Restore before
			// restore.Commit() runs, so everything restored so far is
			// discarded via the deferred Abort — confirm this is the
			// intended compat behavior rather than skipping the record.
			return nil
		default:
			// Check if this is an enterprise only object being restored
			restorer, ok := n.enterpriseRestorers[snapType]
			if !ok {
				return fmt.Errorf("Unrecognized snapshot type: %v", msgType)
			}

			// Restore the enterprise only object
			if err := restorer(restore, dec); err != nil {
				return err
			}
		}
	}

	if err := restore.Commit(); err != nil {
		return err
	}

	// COMPAT Remove in 0.10
	// Clean up active deployments that do not have a job
	if err := n.failLeakedDeployments(newState); err != nil {
		return err
	}

	// External code might be calling State(), so we need to synchronize
	// here to make sure we swap in the new state store atomically.
	n.stateLock.Lock()
	stateOld := n.state
	n.state = newState
	n.stateLock.Unlock()

	// Signal that the old state store has been abandoned. This is required
	// because we don't operate on it any more, we just throw it away, so
	// blocking queries won't see any changes and need to be woken up.
	stateOld.Abandon()

	return nil
}

// failLeakedDeployments is used to fail deployments that do not have a job.
// This state is a broken invariant that should not occur since 0.8.X.
func (n *nomadFSM) failLeakedDeployments(state *state.StateStore) error {
	// Scan for deployments that are referencing a job that no longer exists.
	// This could happen if multiple deployments were created for a given job
	// and thus the older deployment leaks and then the job is removed.
	iter, err := state.Deployments(nil)
	if err != nil {
		return fmt.Errorf("failed to query deployments: %v", err)
	}

	// Reuse the deployments table's current index for the terminal-status
	// writes below.
	dindex, err := state.Index("deployment")
	if err != nil {
		return fmt.Errorf("couldn't fetch index of deployments table: %v", err)
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		d := raw.(*structs.Deployment)

		// We are only looking for active deployments where the job no longer
		// exists
		if !d.Active() {
			continue
		}

		// Find the job
		job, err := state.JobByID(nil, d.Namespace, d.JobID)
		if err != nil {
			return fmt.Errorf("failed to lookup job %s from deployment %q: %v", d.JobID, d.ID, err)
		}

		// Job exists.
		if job != nil {
			continue
		}

		// Update the deployment to be terminal
		failed := d.Copy()
		failed.Status = structs.DeploymentStatusCancelled
		failed.StatusDescription = structs.DeploymentStatusDescriptionStoppedJob
		if err := state.UpsertDeployment(dindex, failed); err != nil {
			return fmt.Errorf("failed to mark leaked deployment %q as failed: %v", failed.ID, err)
		}
	}

	return nil
}

// reconcileQueuedAllocations re-calculates the queued allocations for every job that we
// created a Job Summary during the snap shot restore
func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error {
	// Get all the jobs
	ws := memdb.NewWatchSet()
	iter, err := n.state.Jobs(ws)
	if err != nil {
		return err
	}

	// Work against a snapshot so the dry-run scheduling below cannot mutate
	// the live state store.
	snap, err := n.state.Snapshot()
	if err != nil {
		return fmt.Errorf("unable to create snapshot: %v", err)
	}

	// Invoking the scheduler for every job so that we can populate the number
	// of queued allocations for every job
	for {
		rawJob := iter.Next()
		if rawJob == nil {
			break
		}
		job := rawJob.(*structs.Job)

		// Nothing to do for queued allocations if the job is a parent periodic/parameterized job
		if job.IsParameterized() || job.IsPeriodic() {
			continue
		}
		planner := &scheduler.Harness{
			State: &snap.StateStore,
		}
		// Create an eval and mark it as requiring annotations and insert that as well
		eval := &structs.Evaluation{
			ID:             uuid.Generate(),
			Namespace:      job.Namespace,
			Priority:       job.Priority,
			Type:           job.Type,
			TriggeredBy:    structs.EvalTriggerJobRegister,
			JobID:          job.ID,
			JobModifyIndex: job.JobModifyIndex + 1,
			Status:         structs.EvalStatusPending,
			AnnotatePlan:   true,
		}
		// Ignore eval event creation during snapshot restore
		// NOTE(review): the error return is also discarded here — presumably
		// acceptable for a scratch snapshot eval; confirm.
		snap.UpsertEvals(structs.IgnoreUnknownTypeFlag, 100, []*structs.Evaluation{eval})
// Create the scheduler and run it 1732 sched, err := scheduler.NewScheduler(eval.Type, n.logger, snap, planner) 1733 if err != nil { 1734 return err 1735 } 1736 1737 if err := sched.Process(eval); err != nil { 1738 return err 1739 } 1740 1741 // Get the job summary from the fsm state store 1742 originalSummary, err := n.state.JobSummaryByID(ws, job.Namespace, job.ID) 1743 if err != nil { 1744 return err 1745 } 1746 summary := originalSummary.Copy() 1747 1748 // Add the allocations scheduler has made to queued since these 1749 // allocations are never getting placed until the scheduler is invoked 1750 // with a real planner 1751 if l := len(planner.Plans); l != 1 { 1752 return fmt.Errorf("unexpected number of plans during restore %d. Please file an issue including the logs", l) 1753 } 1754 for _, allocations := range planner.Plans[0].NodeAllocation { 1755 for _, allocation := range allocations { 1756 tgSummary, ok := summary.Summary[allocation.TaskGroup] 1757 if !ok { 1758 return fmt.Errorf("task group %q not found while updating queued count", allocation.TaskGroup) 1759 } 1760 tgSummary.Queued += 1 1761 summary.Summary[allocation.TaskGroup] = tgSummary 1762 } 1763 } 1764 1765 // Add the queued allocations attached to the evaluation to the queued 1766 // counter of the job summary 1767 if l := len(planner.Evals); l != 1 { 1768 return fmt.Errorf("unexpected number of evals during restore %d. Please file an issue including the logs", l) 1769 } 1770 for tg, queued := range planner.Evals[0].QueuedAllocations { 1771 tgSummary, ok := summary.Summary[tg] 1772 if !ok { 1773 return fmt.Errorf("task group %q not found while updating queued count", tg) 1774 } 1775 1776 // We add instead of setting here because we want to take into 1777 // consideration what the scheduler with a mock planner thinks it 1778 // placed. 
Those should be counted as queued as well 1779 tgSummary.Queued += queued 1780 summary.Summary[tg] = tgSummary 1781 } 1782 1783 if !reflect.DeepEqual(summary, originalSummary) { 1784 summary.ModifyIndex = index 1785 if err := n.state.UpsertJobSummary(index, summary); err != nil { 1786 return err 1787 } 1788 } 1789 } 1790 return nil 1791 } 1792 1793 func (n *nomadFSM) applyUpsertScalingEvent(buf []byte, index uint64) interface{} { 1794 defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_scaling_event"}, time.Now()) 1795 var req structs.ScalingEventRequest 1796 if err := structs.Decode(buf, &req); err != nil { 1797 panic(fmt.Errorf("failed to decode request: %v", err)) 1798 } 1799 1800 if err := n.state.UpsertScalingEvent(index, &req); err != nil { 1801 n.logger.Error("UpsertScalingEvent failed", "error", err) 1802 return err 1803 } 1804 1805 return nil 1806 } 1807 1808 func (s *nomadSnapshot) Persist(sink raft.SnapshotSink) error { 1809 defer metrics.MeasureSince([]string{"nomad", "fsm", "persist"}, time.Now()) 1810 // Register the nodes 1811 encoder := codec.NewEncoder(sink, structs.MsgpackHandle) 1812 1813 // Write the header 1814 header := snapshotHeader{} 1815 if err := encoder.Encode(&header); err != nil { 1816 sink.Cancel() 1817 return err 1818 } 1819 1820 // Write the time table 1821 sink.Write([]byte{byte(TimeTableSnapshot)}) 1822 if err := s.timetable.Serialize(encoder); err != nil { 1823 sink.Cancel() 1824 return err 1825 } 1826 1827 // Write all the data out 1828 if err := s.persistIndexes(sink, encoder); err != nil { 1829 sink.Cancel() 1830 return err 1831 } 1832 if err := s.persistNodes(sink, encoder); err != nil { 1833 sink.Cancel() 1834 return err 1835 } 1836 if err := s.persistJobs(sink, encoder); err != nil { 1837 sink.Cancel() 1838 return err 1839 } 1840 if err := s.persistEvals(sink, encoder); err != nil { 1841 sink.Cancel() 1842 return err 1843 } 1844 if err := s.persistAllocs(sink, encoder); err != nil { 1845 sink.Cancel() 1846 return 
err 1847 } 1848 if err := s.persistPeriodicLaunches(sink, encoder); err != nil { 1849 sink.Cancel() 1850 return err 1851 } 1852 if err := s.persistJobSummaries(sink, encoder); err != nil { 1853 sink.Cancel() 1854 return err 1855 } 1856 if err := s.persistVaultAccessors(sink, encoder); err != nil { 1857 sink.Cancel() 1858 return err 1859 } 1860 if err := s.persistSITokenAccessors(sink, encoder); err != nil { 1861 sink.Cancel() 1862 return err 1863 } 1864 if err := s.persistJobVersions(sink, encoder); err != nil { 1865 sink.Cancel() 1866 return err 1867 } 1868 if err := s.persistDeployments(sink, encoder); err != nil { 1869 sink.Cancel() 1870 return err 1871 } 1872 if err := s.persistScalingPolicies(sink, encoder); err != nil { 1873 sink.Cancel() 1874 return err 1875 } 1876 if err := s.persistScalingEvents(sink, encoder); err != nil { 1877 sink.Cancel() 1878 return err 1879 } 1880 if err := s.persistCSIPlugins(sink, encoder); err != nil { 1881 sink.Cancel() 1882 return err 1883 } 1884 if err := s.persistCSIVolumes(sink, encoder); err != nil { 1885 sink.Cancel() 1886 return err 1887 } 1888 if err := s.persistACLPolicies(sink, encoder); err != nil { 1889 sink.Cancel() 1890 return err 1891 } 1892 if err := s.persistACLTokens(sink, encoder); err != nil { 1893 sink.Cancel() 1894 return err 1895 } 1896 if err := s.persistNamespaces(sink, encoder); err != nil { 1897 sink.Cancel() 1898 return err 1899 } 1900 if err := s.persistEnterpriseTables(sink, encoder); err != nil { 1901 sink.Cancel() 1902 return err 1903 } 1904 if err := s.persistSchedulerConfig(sink, encoder); err != nil { 1905 sink.Cancel() 1906 return err 1907 } 1908 if err := s.persistClusterMetadata(sink, encoder); err != nil { 1909 sink.Cancel() 1910 return err 1911 } 1912 return nil 1913 } 1914 1915 func (s *nomadSnapshot) persistIndexes(sink raft.SnapshotSink, 1916 encoder *codec.Encoder) error { 1917 // Get all the indexes 1918 iter, err := s.snap.Indexes() 1919 if err != nil { 1920 return err 1921 } 1922 
1923 for { 1924 // Get the next item 1925 raw := iter.Next() 1926 if raw == nil { 1927 break 1928 } 1929 1930 // Prepare the request struct 1931 idx := raw.(*state.IndexEntry) 1932 1933 // Write out a node registration 1934 sink.Write([]byte{byte(IndexSnapshot)}) 1935 if err := encoder.Encode(idx); err != nil { 1936 return err 1937 } 1938 } 1939 return nil 1940 } 1941 1942 func (s *nomadSnapshot) persistNodes(sink raft.SnapshotSink, 1943 encoder *codec.Encoder) error { 1944 // Get all the nodes 1945 ws := memdb.NewWatchSet() 1946 nodes, err := s.snap.Nodes(ws) 1947 if err != nil { 1948 return err 1949 } 1950 1951 for { 1952 // Get the next item 1953 raw := nodes.Next() 1954 if raw == nil { 1955 break 1956 } 1957 1958 // Prepare the request struct 1959 node := raw.(*structs.Node) 1960 1961 // Write out a node registration 1962 sink.Write([]byte{byte(NodeSnapshot)}) 1963 if err := encoder.Encode(node); err != nil { 1964 return err 1965 } 1966 } 1967 return nil 1968 } 1969 1970 func (s *nomadSnapshot) persistJobs(sink raft.SnapshotSink, 1971 encoder *codec.Encoder) error { 1972 // Get all the jobs 1973 ws := memdb.NewWatchSet() 1974 jobs, err := s.snap.Jobs(ws) 1975 if err != nil { 1976 return err 1977 } 1978 1979 for { 1980 // Get the next item 1981 raw := jobs.Next() 1982 if raw == nil { 1983 break 1984 } 1985 1986 // Prepare the request struct 1987 job := raw.(*structs.Job) 1988 1989 // Write out a job registration 1990 sink.Write([]byte{byte(JobSnapshot)}) 1991 if err := encoder.Encode(job); err != nil { 1992 return err 1993 } 1994 } 1995 return nil 1996 } 1997 1998 func (s *nomadSnapshot) persistEvals(sink raft.SnapshotSink, 1999 encoder *codec.Encoder) error { 2000 // Get all the evaluations 2001 ws := memdb.NewWatchSet() 2002 evals, err := s.snap.Evals(ws) 2003 if err != nil { 2004 return err 2005 } 2006 2007 for { 2008 // Get the next item 2009 raw := evals.Next() 2010 if raw == nil { 2011 break 2012 } 2013 2014 // Prepare the request struct 2015 eval := 
raw.(*structs.Evaluation) 2016 2017 // Write out the evaluation 2018 sink.Write([]byte{byte(EvalSnapshot)}) 2019 if err := encoder.Encode(eval); err != nil { 2020 return err 2021 } 2022 } 2023 return nil 2024 } 2025 2026 func (s *nomadSnapshot) persistAllocs(sink raft.SnapshotSink, 2027 encoder *codec.Encoder) error { 2028 // Get all the allocations 2029 ws := memdb.NewWatchSet() 2030 allocs, err := s.snap.Allocs(ws) 2031 if err != nil { 2032 return err 2033 } 2034 2035 for { 2036 // Get the next item 2037 raw := allocs.Next() 2038 if raw == nil { 2039 break 2040 } 2041 2042 // Prepare the request struct 2043 alloc := raw.(*structs.Allocation) 2044 2045 // Write out the evaluation 2046 sink.Write([]byte{byte(AllocSnapshot)}) 2047 if err := encoder.Encode(alloc); err != nil { 2048 return err 2049 } 2050 } 2051 return nil 2052 } 2053 2054 func (s *nomadSnapshot) persistPeriodicLaunches(sink raft.SnapshotSink, 2055 encoder *codec.Encoder) error { 2056 // Get all the jobs 2057 ws := memdb.NewWatchSet() 2058 launches, err := s.snap.PeriodicLaunches(ws) 2059 if err != nil { 2060 return err 2061 } 2062 2063 for { 2064 // Get the next item 2065 raw := launches.Next() 2066 if raw == nil { 2067 break 2068 } 2069 2070 // Prepare the request struct 2071 launch := raw.(*structs.PeriodicLaunch) 2072 2073 // Write out a job registration 2074 sink.Write([]byte{byte(PeriodicLaunchSnapshot)}) 2075 if err := encoder.Encode(launch); err != nil { 2076 return err 2077 } 2078 } 2079 return nil 2080 } 2081 2082 func (s *nomadSnapshot) persistJobSummaries(sink raft.SnapshotSink, 2083 encoder *codec.Encoder) error { 2084 2085 ws := memdb.NewWatchSet() 2086 summaries, err := s.snap.JobSummaries(ws) 2087 if err != nil { 2088 return err 2089 } 2090 2091 for { 2092 raw := summaries.Next() 2093 if raw == nil { 2094 break 2095 } 2096 2097 jobSummary := raw.(*structs.JobSummary) 2098 2099 sink.Write([]byte{byte(JobSummarySnapshot)}) 2100 if err := encoder.Encode(jobSummary); err != nil { 2101 
return err 2102 } 2103 } 2104 return nil 2105 } 2106 2107 func (s *nomadSnapshot) persistVaultAccessors(sink raft.SnapshotSink, 2108 encoder *codec.Encoder) error { 2109 2110 ws := memdb.NewWatchSet() 2111 accessors, err := s.snap.VaultAccessors(ws) 2112 if err != nil { 2113 return err 2114 } 2115 2116 for { 2117 raw := accessors.Next() 2118 if raw == nil { 2119 break 2120 } 2121 2122 accessor := raw.(*structs.VaultAccessor) 2123 2124 sink.Write([]byte{byte(VaultAccessorSnapshot)}) 2125 if err := encoder.Encode(accessor); err != nil { 2126 return err 2127 } 2128 } 2129 return nil 2130 } 2131 2132 func (s *nomadSnapshot) persistSITokenAccessors(sink raft.SnapshotSink, encoder *codec.Encoder) error { 2133 ws := memdb.NewWatchSet() 2134 accessors, err := s.snap.SITokenAccessors(ws) 2135 if err != nil { 2136 return err 2137 } 2138 2139 for raw := accessors.Next(); raw != nil; raw = accessors.Next() { 2140 accessor := raw.(*structs.SITokenAccessor) 2141 sink.Write([]byte{byte(ServiceIdentityTokenAccessorSnapshot)}) 2142 if err := encoder.Encode(accessor); err != nil { 2143 return err 2144 } 2145 } 2146 return nil 2147 } 2148 2149 func (s *nomadSnapshot) persistJobVersions(sink raft.SnapshotSink, 2150 encoder *codec.Encoder) error { 2151 // Get all the jobs 2152 ws := memdb.NewWatchSet() 2153 versions, err := s.snap.JobVersions(ws) 2154 if err != nil { 2155 return err 2156 } 2157 2158 for { 2159 // Get the next item 2160 raw := versions.Next() 2161 if raw == nil { 2162 break 2163 } 2164 2165 // Prepare the request struct 2166 job := raw.(*structs.Job) 2167 2168 // Write out a job registration 2169 sink.Write([]byte{byte(JobVersionSnapshot)}) 2170 if err := encoder.Encode(job); err != nil { 2171 return err 2172 } 2173 } 2174 return nil 2175 } 2176 2177 func (s *nomadSnapshot) persistDeployments(sink raft.SnapshotSink, 2178 encoder *codec.Encoder) error { 2179 // Get all the jobs 2180 ws := memdb.NewWatchSet() 2181 deployments, err := s.snap.Deployments(ws) 2182 if err != 
nil { 2183 return err 2184 } 2185 2186 for { 2187 // Get the next item 2188 raw := deployments.Next() 2189 if raw == nil { 2190 break 2191 } 2192 2193 // Prepare the request struct 2194 deployment := raw.(*structs.Deployment) 2195 2196 // Write out a job registration 2197 sink.Write([]byte{byte(DeploymentSnapshot)}) 2198 if err := encoder.Encode(deployment); err != nil { 2199 return err 2200 } 2201 } 2202 return nil 2203 } 2204 2205 func (s *nomadSnapshot) persistACLPolicies(sink raft.SnapshotSink, 2206 encoder *codec.Encoder) error { 2207 // Get all the policies 2208 ws := memdb.NewWatchSet() 2209 policies, err := s.snap.ACLPolicies(ws) 2210 if err != nil { 2211 return err 2212 } 2213 2214 for { 2215 // Get the next item 2216 raw := policies.Next() 2217 if raw == nil { 2218 break 2219 } 2220 2221 // Prepare the request struct 2222 policy := raw.(*structs.ACLPolicy) 2223 2224 // Write out a policy registration 2225 sink.Write([]byte{byte(ACLPolicySnapshot)}) 2226 if err := encoder.Encode(policy); err != nil { 2227 return err 2228 } 2229 } 2230 return nil 2231 } 2232 2233 func (s *nomadSnapshot) persistACLTokens(sink raft.SnapshotSink, 2234 encoder *codec.Encoder) error { 2235 // Get all the policies 2236 ws := memdb.NewWatchSet() 2237 tokens, err := s.snap.ACLTokens(ws) 2238 if err != nil { 2239 return err 2240 } 2241 2242 for { 2243 // Get the next item 2244 raw := tokens.Next() 2245 if raw == nil { 2246 break 2247 } 2248 2249 // Prepare the request struct 2250 token := raw.(*structs.ACLToken) 2251 2252 // Write out a token registration 2253 sink.Write([]byte{byte(ACLTokenSnapshot)}) 2254 if err := encoder.Encode(token); err != nil { 2255 return err 2256 } 2257 } 2258 return nil 2259 } 2260 2261 // persistNamespaces persists all the namespaces. 
2262 func (s *nomadSnapshot) persistNamespaces(sink raft.SnapshotSink, encoder *codec.Encoder) error { 2263 // Get all the jobs 2264 ws := memdb.NewWatchSet() 2265 namespaces, err := s.snap.Namespaces(ws) 2266 if err != nil { 2267 return err 2268 } 2269 2270 for { 2271 // Get the next item 2272 raw := namespaces.Next() 2273 if raw == nil { 2274 break 2275 } 2276 2277 // Prepare the request struct 2278 namespace := raw.(*structs.Namespace) 2279 2280 // Write out a namespace registration 2281 sink.Write([]byte{byte(NamespaceSnapshot)}) 2282 if err := encoder.Encode(namespace); err != nil { 2283 return err 2284 } 2285 } 2286 return nil 2287 } 2288 2289 func (s *nomadSnapshot) persistSchedulerConfig(sink raft.SnapshotSink, 2290 encoder *codec.Encoder) error { 2291 // Get scheduler config 2292 _, schedConfig, err := s.snap.SchedulerConfig() 2293 if err != nil { 2294 return err 2295 } 2296 if schedConfig == nil { 2297 return nil 2298 } 2299 // Write out scheduler config 2300 sink.Write([]byte{byte(SchedulerConfigSnapshot)}) 2301 if err := encoder.Encode(schedConfig); err != nil { 2302 return err 2303 } 2304 return nil 2305 } 2306 2307 func (s *nomadSnapshot) persistClusterMetadata(sink raft.SnapshotSink, 2308 encoder *codec.Encoder) error { 2309 2310 // Get the cluster metadata 2311 ws := memdb.NewWatchSet() 2312 clusterMetadata, err := s.snap.ClusterMetadata(ws) 2313 if err != nil { 2314 return err 2315 } 2316 if clusterMetadata == nil { 2317 return nil 2318 } 2319 2320 // Write out the cluster metadata 2321 sink.Write([]byte{byte(ClusterMetadataSnapshot)}) 2322 if err := encoder.Encode(clusterMetadata); err != nil { 2323 return err 2324 } 2325 2326 return nil 2327 } 2328 2329 func (s *nomadSnapshot) persistScalingPolicies(sink raft.SnapshotSink, 2330 encoder *codec.Encoder) error { 2331 2332 // Get all the scaling policies 2333 ws := memdb.NewWatchSet() 2334 scalingPolicies, err := s.snap.ScalingPolicies(ws) 2335 if err != nil { 2336 return err 2337 } 2338 2339 for { 
2340 // Get the next item 2341 raw := scalingPolicies.Next() 2342 if raw == nil { 2343 break 2344 } 2345 2346 // Prepare the request struct 2347 scalingPolicy := raw.(*structs.ScalingPolicy) 2348 2349 // Write out a scaling policy snapshot 2350 sink.Write([]byte{byte(ScalingPolicySnapshot)}) 2351 if err := encoder.Encode(scalingPolicy); err != nil { 2352 return err 2353 } 2354 } 2355 return nil 2356 } 2357 2358 func (s *nomadSnapshot) persistScalingEvents(sink raft.SnapshotSink, encoder *codec.Encoder) error { 2359 // Get all the scaling events 2360 ws := memdb.NewWatchSet() 2361 iter, err := s.snap.ScalingEvents(ws) 2362 if err != nil { 2363 return err 2364 } 2365 2366 for { 2367 // Get the next item 2368 raw := iter.Next() 2369 if raw == nil { 2370 break 2371 } 2372 2373 // Prepare the request struct 2374 events := raw.(*structs.JobScalingEvents) 2375 2376 // Write out a scaling events snapshot 2377 sink.Write([]byte{byte(ScalingEventsSnapshot)}) 2378 if err := encoder.Encode(events); err != nil { 2379 return err 2380 } 2381 } 2382 return nil 2383 } 2384 2385 func (s *nomadSnapshot) persistCSIPlugins(sink raft.SnapshotSink, 2386 encoder *codec.Encoder) error { 2387 2388 // Get all the CSI plugins 2389 ws := memdb.NewWatchSet() 2390 plugins, err := s.snap.CSIPlugins(ws) 2391 if err != nil { 2392 return err 2393 } 2394 2395 for { 2396 // Get the next item 2397 raw := plugins.Next() 2398 if raw == nil { 2399 break 2400 } 2401 2402 // Prepare the request struct 2403 plugin := raw.(*structs.CSIPlugin) 2404 2405 // Write out a plugin snapshot 2406 sink.Write([]byte{byte(CSIPluginSnapshot)}) 2407 if err := encoder.Encode(plugin); err != nil { 2408 return err 2409 } 2410 } 2411 return nil 2412 } 2413 2414 func (s *nomadSnapshot) persistCSIVolumes(sink raft.SnapshotSink, 2415 encoder *codec.Encoder) error { 2416 2417 // Get all the CSI volumes 2418 ws := memdb.NewWatchSet() 2419 volumes, err := s.snap.CSIVolumes(ws) 2420 if err != nil { 2421 return err 2422 } 2423 2424 
for { 2425 // Get the next item 2426 raw := volumes.Next() 2427 if raw == nil { 2428 break 2429 } 2430 2431 // Prepare the request struct 2432 volume := raw.(*structs.CSIVolume) 2433 2434 // Write out a volume snapshot 2435 sink.Write([]byte{byte(CSIVolumeSnapshot)}) 2436 if err := encoder.Encode(volume); err != nil { 2437 return err 2438 } 2439 } 2440 return nil 2441 } 2442 2443 // Release is a no-op, as we just need to GC the pointer 2444 // to the state store snapshot. There is nothing to explicitly 2445 // cleanup. 2446 func (s *nomadSnapshot) Release() {}