----------------------------- MODULE WorkerSpec -----------------------------

EXTENDS Types, Tasks, EventCounter

VARIABLE nodes          \* For every node, the NodeState SwarmKit has on record

(* SwarmKit records each node as being in exactly one of these two states. *)
nodeUp   == "up"
nodeDown == "down"
NodeState == { nodeDown, nodeUp }

(* Type invariant: `nodes' is a function giving every node a NodeState. *)
WorkerTypeOK ==
  nodes \in [ Node -> NodeState ]

-----------------------------------------------------------------------------

\* Actions performed by worker nodes (actually, by the dispatcher on their behalf)

(* True when SwarmKit believes node `n' is up,
   i.e. its agent is connected to a manager. *)
IsUp(n) == nodes[n] = nodeUp

(* Advance one task by a single step towards its `desired_state',
   provided it has not got there yet. *)
ProgressTask ==
  /\ \E tk \in tasks :
       \E dest \in TaskState :              \* Candidate state to step into
         LET moved == [tk EXCEPT !.status.state = dest]
             \* The only ``progress'' (desirable) transitions
             steps == { << assigned,  accepted  >>,
                        << accepted,  preparing >>,
                        << preparing, ready     >>,
                        << ready,     starting  >>,
                        << starting,  running   >> }
         IN
         /\ << State(tk), State(moved) >> \in steps
         /\ dest \preceq tk.desired_state   \* Never overshoot the desired state
         /\ IsUp(tk.node)                   \* The node must be connected to SwarmKit
         /\ UpdateTasks(tk :> moved)
  /\ UNCHANGED << nodes, nEvents >>

(* A running container stops because we asked it to. *)
ShutdownComplete ==
  /\ \E tk \in tasks :
       LET stopped == [tk EXCEPT !.status.state = shutdown]
       IN
       /\ tk.desired_state \in { remove, shutdown }   \* We want it stopped
       /\ State(tk) = running                         \* ... and it is still running
       /\ IsUp(tk.node)
       /\ UpdateTasks(tk :> stopped)                  \* It is now `shutdown'
  /\ UNCHANGED << nodes, nEvents >>

(* From the moment a node becomes responsible for a task (state `assigned')
   up to, but not including, `running', the node may reject the task.
   In particular, a task that was already ``accepted'' can still be rejected. *)
RejectTask ==
  /\ CountEvent
  /\ \E tk \in tasks :
       LET refused == [tk EXCEPT !.status.state = rejected]
       IN
       /\ State(tk) \in { assigned, accepted, preparing, ready, starting }
       /\ IsUp(tk.node)
       /\ UpdateTasks(tk :> refused)
  /\ UNCHANGED nodes

(* The managers are told that some running containers on one node have finished.
   Several tasks may be updated in one step (e.g. if we're reconnecting). *)
ContainerExit ==
  /\ CountEvent
  /\ \E w \in Node :
       LET running_here == { tk \in tasks : tk.node = w /\ State(tk) = running }
           \* The state recorded for task `x' whose container ended as `fate':
           \* a `failed' exit is recorded as `shutdown' if shutdown was desired.
           Reported(x, fate) ==
             IF fate = failed /\ x.desired_state = shutdown
             THEN shutdown
             ELSE fate
       IN
       /\ IsUp(w)
       /\ \E exited \in SUBSET running_here :
            \* Each container independently ended as `failed' or `complete':
            \E fates \in [ exited -> { failed, complete } ] :
              UpdateTasks( [ tk \in exited |->
                             [tk EXCEPT !.status.state = Reported(tk, fates[tk])] ] )
  /\ UNCHANGED nodes

(* The tasks placed on node `n' that the node is responsible for:
   those whose state is at least `assigned' and strictly before `remove'. *)
TasksOwnedByNode(n) ==
  { tk \in tasks :
      LET st == State(tk)
      IN
      /\ tk.node = n
      /\ assigned \preceq st
      /\ st \prec remove }

(* The dispatcher notices that a worker is down (the connection is lost). *)
WorkerDown ==
  /\ CountEvent
  /\ \E w \in Node :
       /\ IsUp(w)
       /\ nodes' = [nodes EXCEPT ![w] = nodeDown]
  /\ UNCHANGED tasks

(* A node that was down reconnects to the cluster.
   On reconnecting it gets an assignment set from the dispatcher, which does
   not include any tasks that have been marked orphaned and then deleted.
   Any time an agent gets an assignment set that does not include some task
   it has running, it shuts down those tasks.
   This action itself only marks the node as up again. *)
WorkerUp ==
  /\ \E w \in Node :
       /\ ~IsUp(w)
       /\ nodes' = [nodes EXCEPT ![w] = nodeUp]
  /\ UNCHANGED << tasks, nEvents >>

(* If SwarmKit sees a node as down for a long time (48 hours or so) then
   it marks all the node's tasks as orphaned.

   ``Moving a task to the Orphaned state is not desirable,
   because it's the one case where we break the otherwise invariant
   that the agent sets all states past ASSIGNED.''
*)
OrphanTasks ==
  /\ \E w \in Node :
       /\ ~IsUp(w)                          \* Node `w' is still seen as down
       /\ LET stranded == { tk \in TasksOwnedByNode(w) : Runnable(tk) }
          IN  UpdateTasks( [ tk \in stranded |->
                             [tk EXCEPT !.status.state = orphaned] ] )
  /\ UNCHANGED << nodes, nEvents >>

(* The actions that are required to happen eventually whenever they are possible. *)
AgentProgress ==
  \/ WorkerUp
  \/ OrphanTasks
  \/ ShutdownComplete
  \/ ProgressTask

(* Every action of the agent/worker. *)
Agent ==
  \/ WorkerDown
  \/ ContainerExit
  \/ RejectTask
  \/ AgentProgress

=============================================================================