github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/design/tla/WorkerSpec.tla

github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/design/tla/WorkerSpec.tla (about)

     1  ----------------------------- MODULE WorkerSpec -----------------------------
     2  
     3  EXTENDS Types, Tasks, EventCounter
     4  
     5  VARIABLE nodes          \* Function from each Node to SwarmKit's recorded view of it, a NodeState (see WorkerTypeOK)
     6  
     7  (* The two values SwarmKit can record as the state of a node. *)
     8  nodeDown == "down"
     9  nodeUp   == "up"
    10  NodeState == { nodeDown, nodeUp }
    11  
    12  WorkerTypeOK ==
    13    \* Type invariant: `nodes' maps every Node to one of the two NodeState values
    14    nodes \in [ Node -> NodeState ]
    15  
    16  -----------------------------------------------------------------------------
    17  
    18  \*  Actions performed by worker nodes (actually, by the dispatcher on their behalf)
    19  
    20  (* True iff SwarmKit records node `n' as up, i.e. the agent is connected to a manager. *)
    21  IsUp(n) == nodeUp = nodes[n]
    22  
    23  (* Try to advance containers towards `desired_state' if we're not there yet. One task on a connected node moves forward by a single step of the sequence assigned, accepted, preparing, ready, starting, running. *)
    24  ProgressTask ==
    25    /\ UNCHANGED << nodes, nEvents >>     \* Node states and the event counter are left alone
    26    /\ \E t  \in tasks,
    27          s2 \in TaskState :   \* The state we want to move to
    28          LET t2 == [t EXCEPT !.status.state = s2]   \* `t' with only its status state replaced by `s2'
    29          IN
    30          /\ s2 \preceq t.desired_state       \* Can't be after the desired state
    31          /\ << State(t), State(t2) >> \in {  \* Possible ``progress'' (desirable) transitions
    32               << assigned, accepted >>,
    33               << accepted, preparing >>,
    34               << preparing, ready >>,
    35               << ready, starting >>,
    36               << starting, running >>
    37             }
    38          /\ IsUp(t.node)                     \* Node must be connected to SwarmKit
    39          /\ UpdateTasks(t :> t2)             \* Map `t' to `t2'; UpdateTasks is defined outside this module (presumably it rewrites `tasks' -- confirm there)
    40  
    41  (* We asked for a running container to be stopped, and it has now finished. *)
    42  ShutdownComplete ==
    43    /\ \E t \in tasks :
    44       /\ IsUp(t.node)                                            \* Its node is connected
    45       /\ State(t) = running                                      \* The container is running right now
    46       /\ t.desired_state \in {remove, shutdown}                  \* ... although we want it stopped
    47       /\ UpdateTasks(t :> [t EXCEPT !.status.state = shutdown])  \* So it ends up in `shutdown'
    48    /\ UNCHANGED << nodes, nEvents >>
    49  
    50  (* From the moment a node becomes responsible for a task (the task has reached `assigned')
    51     and for as long as the task has not reached `running', the node may reject it.
    52     In particular, having ``accepted'' a task does not stop the node rejecting it later. *)
    53  RejectTask ==
    54    /\ CountEvent
    55    /\ \E t \in tasks :
    56         /\ IsUp(t.node)
    57         /\ State(t) \in { starting, ready, preparing, accepted, assigned }
    58         /\ UpdateTasks(t :> [t EXCEPT !.status.state = rejected])
    59    /\ UNCHANGED nodes
    60  
    61  (* We notify the managers that some running containers have finished.
    62     There might be several updates at once (e.g. if we're reconnecting). *)
    63  ContainerExit ==
    64    /\ UNCHANGED << nodes >>              \* Node states are not affected
    65    /\ CountEvent                         \* Defined outside this module; presumably accounts for this step in `nEvents' -- confirm there
    66    /\ \E n \in Node :
    67          /\ IsUp(n)                      \* Only a connected node reports exits
    68          /\ \E ts \in SUBSET { t \in tasks : t.node = n /\ State(t) = running } :   \* `ts': any subset (possibly empty) of the tasks running on `n'
    69             \* Each container could have ended in either state:
    70             \E s2 \in [ ts -> { failed, complete } ] :   \* `s2' picks an exit state for every task in `ts'
    71               UpdateTasks( [ t \in ts |->
    72                               [t EXCEPT !.status.state =
    73                                 \* Report `failed' as `shutdown' if we wanted to shut down. NOTE(review): only `shutdown' is tested here, while ShutdownComplete also accepts `remove' -- confirm this is intended
    74                                 IF s2[t] = failed /\ t.desired_state = shutdown THEN shutdown
    75                                 ELSE s2[t]]
    76                          ] )
    77  
    78  (* The tasks placed on node `n' for which that node is currently responsible. *)
    79  TasksOwnedByNode(n) ==
    80    LET placed == { t \in tasks : t.node = n }      \* Tasks whose `node' field is `n'
    81    IN  { t \in placed :
    82            /\ assigned \preceq State(t)            \* ... that have reached at least `assigned'
    83            /\ State(t) \prec remove }              \* ... and are still before `remove'
    84  
    85  (* The connection to a worker is lost, and the dispatcher records that node as down. *)
    86  WorkerDown ==
    87    /\ CountEvent
    88    /\ \E n \in Node :
    89         /\ IsUp(n)                                    \* Only a node seen as up can go down
    90         /\ nodes' = [nodes EXCEPT ![n] = nodeDown]
    91    /\ UNCHANGED tasks
    92  
    93  (* A node that SwarmKit saw as down reconnects to the cluster. On reconnecting it gets an
    94     assignment set from the dispatcher, which does not include any tasks that have been marked
    95     orphaned and then deleted; an agent shuts down any task it has running that is missing from
    96     such a set. This action only records the reconnection: `tasks' is left unchanged here. *)
    97  WorkerUp ==
    98    /\ \E n \in Node :
    99         /\ ~IsUp(n)                                  \* Only a node seen as down can come back
   100         /\ nodes' = [nodes EXCEPT ![n] = nodeUp]
   101    /\ UNCHANGED << tasks, nEvents >>
   102  
   103  (* If SwarmKit sees a node as down for a long time (48 hours or so) then
   104     it marks the node's owned tasks that satisfy Runnable as orphaned.
   105  
   106     ``Moving a task to the Orphaned state is not desirable,
   107     because it's the one case where we break the otherwise invariant
   108     that the agent sets all states past ASSIGNED.''
   109  *)
   110  OrphanTasks ==
   111    /\ UNCHANGED << nodes, nEvents >>
   112    /\ \E n \in Node :
   113         LET affected == { t \in TasksOwnedByNode(n) : Runnable(t) }   \* Runnable is defined outside this module
   114         IN
   115         /\ ~IsUp(n)    \* Node `n' is still detected as down (the elapsed time itself is not modelled)
   116         /\ UpdateTasks([ t \in affected |->
   117                           [t EXCEPT !.status.state = orphaned] ])   \* Every affected task gets status state `orphaned'
   118  
   119  (* The agent steps that we require to be taken eventually whenever they are possible. *)
   120  AgentProgress ==
   121    \/ WorkerUp
   122    \/ OrphanTasks
   123    \/ ShutdownComplete
   124    \/ ProgressTask
   125  
   126  (* Every step the agent/worker can take: the AgentProgress steps plus the remaining ones. *)
   127  Agent ==
   128    \/ WorkerDown
   129    \/ ContainerExit
   130    \/ RejectTask
   131    \/ AgentProgress
   132  
   133  =============================================================================