gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/threads.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"gvisor.dev/gvisor/pkg/atomicbitops"
    21  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    22  	"gvisor.dev/gvisor/pkg/sync"
    23  	"gvisor.dev/gvisor/pkg/waiter"
    24  )
    25  
    26  // TasksLimit is the maximum number of threads for untrusted application.
    27  // Linux doesn't really limit this directly, rather it is limited by total
    28  // memory size, stacks allocated and a global maximum. There's no real reason
    29  // for us to limit it either, (esp. since threads are backed by go routines),
    30  // and we would expect to hit resource limits long before hitting this number.
    31  // However, for correctness, we still check that the user doesn't exceed this
    32  // number.
    33  //
    34  // Note that because of the way futexes are implemented, there *are* in fact
    35  // serious restrictions on valid thread IDs. They are limited to 2^30 - 1
    36  // (kernel/fork.c:MAX_THREADS).
    37  const TasksLimit = (1 << 16)
    38  
    39  // ThreadID is a generic thread identifier.
    40  //
    41  // +marshal
    42  type ThreadID int32
    43  
    44  // String returns a decimal representation of the ThreadID.
    45  func (tid ThreadID) String() string {
    46  	return fmt.Sprintf("%d", tid)
    47  }
    48  
    49  // initTID is the TID given to the first task added to each PID namespace. The
    50  // thread group led by initTID is called the namespace's init process. The
    51  // death of a PID namespace's init process causes all tasks visible in that
    52  // namespace to be killed.
    53  const initTID ThreadID = 1
    54  
// A TaskSet comprises all tasks in a system. Every task in the TaskSet is
// visible in its root PID namespace (Root).
//
// +stateify savable
type TaskSet struct {
	// mu protects all relationships between tasks and thread groups in the
	// TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
	//
	// mu is not saved.
	mu taskSetRWMutex `state:"nosave"`

	// Root is the root PID namespace, in which all tasks in the TaskSet are
	// visible. The Root pointer is immutable; it is set by newTaskSet.
	Root *PIDNamespace

	// sessions is the set of all sessions.
	sessions sessionList

	// stopCount is the number of active external stops applicable to all tasks
	// in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
	// paired with a call to TaskSet.EndExternalStop). stopCount is protected
	// by mu.
	//
	// stopCount is not saved for the same reason as Task.stopCount; it is
	// always reset to zero after restore.
	stopCount int32 `state:"nosave"`

	// liveGoroutines is the number of non-exited task goroutines in the
	// TaskSet.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	// runningGoroutines is the number of running task goroutines in the
	// TaskSet.
	//
	// runningGoroutines is not saved; its counter value is required to be zero
	// at time of save (but note that this is not necessarily the same thing as
	// sync.WaitGroup's zero value).
	runningGoroutines sync.WaitGroup `state:"nosave"`

	// aioGoroutines is the number of goroutines running async I/O
	// callbacks.
	//
	// aioGoroutines is not saved but is required to be zero at the time of
	// save.
	aioGoroutines sync.WaitGroup `state:"nosave"`
}
   101  
   102  // newTaskSet returns a new, empty TaskSet.
   103  func newTaskSet(pidns *PIDNamespace) *TaskSet {
   104  	ts := &TaskSet{Root: pidns}
   105  	pidns.owner = ts
   106  	return ts
   107  }
   108  
   109  // forEachThreadGroupLocked applies f to each thread group in ts.
   110  //
   111  // Preconditions: ts.mu must be locked (for reading or writing).
   112  func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
   113  	for tg := range ts.Root.tgids {
   114  		f(tg)
   115  	}
   116  }
   117  
   118  // forEachTaskLocked applies f to each Task in ts.
   119  //
   120  // Preconditions: ts.mu must be locked (for reading or writing).
   121  func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) {
   122  	for t := range ts.Root.tids {
   123  		f(t)
   124  	}
   125  }
   126  
// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
// tasks. See the pid_namespaces(7) man page for further details.
//
// N.B. A task is said to be visible in a PID namespace if the PID namespace
// contains a thread ID that maps to that task.
//
// +stateify savable
type PIDNamespace struct {
	// owner is the TaskSet that this PID namespace belongs to. The owner
	// pointer is immutable once set: child namespaces receive it at
	// construction, while the root namespace is created with a nil owner that
	// is filled in by newTaskSet.
	owner *TaskSet

	// parent is the PID namespace of the process that created this one. If
	// this is the root PID namespace, parent is nil. The parent pointer is
	// immutable.
	//
	// Invariant: All tasks that are visible in this namespace are also visible
	// in all ancestor namespaces.
	parent *PIDNamespace

	// userns is the user namespace with which this PID namespace is
	// associated. Privileged operations on this PID namespace must have
	// appropriate capabilities in userns. The userns pointer is immutable.
	userns *auth.UserNamespace

	// id is a unique ID assigned to the PID namespace from lastPIDNSID when
	// the namespace is created. id is immutable.
	id uint64

	// The following fields are protected by owner.mu.

	// last is the last ThreadID to be allocated in this namespace.
	last ThreadID

	// tasks is a mapping from ThreadIDs in this namespace to tasks visible in
	// the namespace. Together with tids it forms the two halves of the bimap.
	tasks map[ThreadID]*Task

	// tids is a mapping from tasks visible in this namespace to their
	// identifiers in this namespace.
	tids map[*Task]ThreadID

	// tgids is a mapping from thread groups visible in this namespace to
	// their identifiers in this namespace.
	//
	// The content of tgids is equivalent to tids[tg.leader]. This exists
	// primarily as an optimization to quickly find all thread groups.
	tgids map[*ThreadGroup]ThreadID

	// sessions is a mapping from SessionIDs in this namespace to sessions
	// visible in the namespace.
	sessions map[SessionID]*Session

	// sids is a mapping from sessions visible in this namespace to their
	// identifiers in this namespace.
	sids map[*Session]SessionID

	// processGroups is a mapping from ProcessGroupIDs in this namespace to
	// process groups visible in the namespace.
	processGroups map[ProcessGroupID]*ProcessGroup

	// pgids is a mapping from process groups visible in this namespace to
	// their identifiers in this namespace.
	pgids map[*ProcessGroup]ProcessGroupID

	// exiting indicates that the namespace's init process is exiting or has
	// exited.
	exiting bool

	// extra contains additional per-PID-namespace data (see
	// pidNamespaceData). It is initialized by newPIDNamespaceData.
	extra pidNamespaceData
}
   198  
   199  func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
   200  	return &PIDNamespace{
   201  		owner:         ts,
   202  		parent:        parent,
   203  		userns:        userns,
   204  		id:            lastPIDNSID.Add(1),
   205  		tasks:         make(map[ThreadID]*Task),
   206  		tids:          make(map[*Task]ThreadID),
   207  		tgids:         make(map[*ThreadGroup]ThreadID),
   208  		sessions:      make(map[SessionID]*Session),
   209  		sids:          make(map[*Session]SessionID),
   210  		processGroups: make(map[ProcessGroupID]*ProcessGroup),
   211  		pgids:         make(map[*ProcessGroup]ProcessGroupID),
   212  		extra:         newPIDNamespaceData(),
   213  	}
   214  }
   215  
// lastPIDNSID is the most recent ID handed out to a PID namespace (see
// PIDNamespace.ID). newPIDNamespace atomically increments it and uses the
// result as the new namespace's id.
//
// This is global rather than being per-TaskSet or Kernel because
// NewRootPIDNamespace() is called before the Kernel is initialized.
var lastPIDNSID atomicbitops.Uint64
   222  
   223  // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available
   224  // yet when root namespace is created and must be set by caller.
   225  func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace {
   226  	return newPIDNamespace(nil, nil, userns)
   227  }
   228  
   229  // NewChild returns a new, empty PID namespace that is a child of ns. Authority
   230  // over the new PID namespace is controlled by userns.
   231  func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
   232  	return newPIDNamespace(ns.owner, ns, userns)
   233  }
   234  
   235  // TaskWithID returns the task with thread ID tid in PID namespace ns. If no
   236  // task has that TID, TaskWithID returns nil.
   237  func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
   238  	ns.owner.mu.RLock()
   239  	t := ns.tasks[tid]
   240  	ns.owner.mu.RUnlock()
   241  	return t
   242  }
   243  
   244  // ID returns a non-zero ID that is unique across PID namespaces.
   245  func (ns *PIDNamespace) ID() uint64 {
   246  	return ns.id
   247  }
   248  
   249  // ThreadGroupWithID returns the thread group led by the task with thread ID
   250  // tid in PID namespace ns. If no task has that TID, or if the task with that
   251  // TID is not a thread group leader, ThreadGroupWithID returns nil.
   252  func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
   253  	ns.owner.mu.RLock()
   254  	defer ns.owner.mu.RUnlock()
   255  	t := ns.tasks[tid]
   256  	if t == nil {
   257  		return nil
   258  	}
   259  	if t != t.tg.leader {
   260  		return nil
   261  	}
   262  	return t.tg
   263  }
   264  
   265  // IDOfTask returns the TID assigned to the given task in PID namespace ns. If
   266  // the task is not visible in that namespace, IDOfTask returns 0. (This return
   267  // value is significant in some cases, e.g. getppid() is documented as
   268  // returning 0 if the caller's parent is in an ancestor namespace and
   269  // consequently not visible to the caller.) If the task is nil, IDOfTask returns
   270  // 0.
   271  func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
   272  	ns.owner.mu.RLock()
   273  	id := ns.tids[t]
   274  	ns.owner.mu.RUnlock()
   275  	return id
   276  }
   277  
   278  // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
   279  // If the task is not visible in that namespace, IDOfThreadGroup returns 0.
   280  func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
   281  	ns.owner.mu.RLock()
   282  	id := ns.tgids[tg]
   283  	ns.owner.mu.RUnlock()
   284  	return id
   285  }
   286  
   287  // Tasks returns a snapshot of the tasks in ns.
   288  func (ns *PIDNamespace) Tasks() []*Task {
   289  	ns.owner.mu.RLock()
   290  	defer ns.owner.mu.RUnlock()
   291  	tasks := make([]*Task, 0, len(ns.tasks))
   292  	for t := range ns.tids {
   293  		tasks = append(tasks, t)
   294  	}
   295  	return tasks
   296  }
   297  
   298  // NumTasks returns the number of tasks in ns.
   299  func (ns *PIDNamespace) NumTasks() int {
   300  	ns.owner.mu.RLock()
   301  	defer ns.owner.mu.RUnlock()
   302  	return len(ns.tids)
   303  }
   304  
   305  // NumTasksPerContainer returns the number of tasks in ns that belongs to given container.
   306  func (ns *PIDNamespace) NumTasksPerContainer(cid string) int {
   307  	ns.owner.mu.RLock()
   308  	defer ns.owner.mu.RUnlock()
   309  
   310  	tasks := 0
   311  	for t := range ns.tids {
   312  		if t.ContainerID() == cid {
   313  			tasks++
   314  		}
   315  	}
   316  	return tasks
   317  }
   318  
   319  // ThreadGroups returns a snapshot of the thread groups in ns.
   320  func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
   321  	return ns.ThreadGroupsAppend(nil)
   322  }
   323  
   324  // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
   325  func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
   326  	ns.owner.mu.RLock()
   327  	defer ns.owner.mu.RUnlock()
   328  	for tg := range ns.tgids {
   329  		tgs = append(tgs, tg)
   330  	}
   331  	return tgs
   332  }
   333  
   334  // UserNamespace returns the user namespace associated with PID namespace ns.
   335  func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
   336  	return ns.userns
   337  }
   338  
   339  // Root returns the root PID namespace of ns.
   340  func (ns *PIDNamespace) Root() *PIDNamespace {
   341  	return ns.owner.Root
   342  }
   343  
// A threadGroupNode defines the relationship between a thread group and the
// rest of the system. Conceptually, threadGroupNode is data belonging to the
// owning TaskSet, as if TaskSet contained a field `nodes
// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
// threadGroupNode is embedded in the ThreadGroup it represents.
// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
// threadGroupEntry's methods on ThreadGroup to make it implement
// threadGroupLinker.)
//
// +stateify savable
type threadGroupNode struct {
	// pidns is the PID namespace containing the thread group and all of its
	// member tasks. The pidns pointer is immutable.
	pidns *PIDNamespace

	// pidWithinNS is the thread ID of the leader of this thread group within
	// pidns. It is accessed atomically, which makes it useful to avoid using
	// locks when determining a thread group leader's own TID.
	pidWithinNS atomicbitops.Int32

	// eventQueue is notified whenever an event of interest to Task.Wait occurs
	// in a child of this thread group, or a ptrace tracee of a task in this
	// thread group. Events are defined in task_exit.go.
	eventQueue waiter.Queue

	// leader is the thread group's leader, which is the oldest task in the
	// thread group; usually the last task in the thread group to call
	// execve(), or if no such task exists then the first task in the thread
	// group, which was created by a call to fork() or clone() without
	// CLONE_THREAD. Once a thread group has been made visible to the rest of
	// the system by TaskSet.newTask, leader is never nil.
	//
	// Note that it's possible for the leader to exit without causing the rest
	// of the thread group to exit; in such a case, leader will still be valid
	// and non-nil, but leader will not be in tasks.
	//
	// leader is protected by the TaskSet mutex.
	leader *Task

	// If execing is not nil, it is a task in the thread group that has killed
	// all other tasks so that it can become the thread group leader and
	// perform an execve. (execing may already be the thread group leader.)
	//
	// execing is analogous to Linux's signal_struct::group_exit_task.
	//
	// execing is protected by the TaskSet mutex.
	execing *Task

	// tasks is all tasks in the thread group that have not yet been reaped.
	//
	// tasks is protected by both the TaskSet mutex and the signal mutex:
	// Mutating tasks requires locking the TaskSet mutex for writing *and*
	// locking the signal mutex. Reading tasks requires locking the TaskSet
	// mutex *or* locking the signal mutex.
	tasks taskList

	// tasksCount is the number of tasks in the thread group that have not yet
	// been reaped; equivalently, tasksCount is the number of tasks in tasks.
	//
	// tasksCount is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	tasksCount int

	// liveTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitZombie.
	//
	// liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
	liveTasks int

	// activeTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitInitiated.
	//
	// activeTasks is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	activeTasks int
}
   420  
   421  // PIDNamespace returns the PID namespace containing tg.
   422  func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
   423  	return tg.pidns
   424  }
   425  
   426  // TaskSet returns the TaskSet containing tg.
   427  func (tg *ThreadGroup) TaskSet() *TaskSet {
   428  	return tg.pidns.owner
   429  }
   430  
   431  // Leader returns tg's leader.
   432  func (tg *ThreadGroup) Leader() *Task {
   433  	tg.pidns.owner.mu.RLock()
   434  	defer tg.pidns.owner.mu.RUnlock()
   435  	return tg.leader
   436  }
   437  
   438  // Count returns the number of non-exited threads in the group.
   439  func (tg *ThreadGroup) Count() int {
   440  	tg.pidns.owner.mu.RLock()
   441  	defer tg.pidns.owner.mu.RUnlock()
   442  	var count int
   443  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   444  		count++
   445  	}
   446  	return count
   447  }
   448  
   449  // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for
   450  // all tasks in tg.
   451  func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
   452  	tg.pidns.owner.mu.RLock()
   453  	defer tg.pidns.owner.mu.RUnlock()
   454  
   455  	var tasks []ThreadID
   456  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   457  		if id, ok := pidns.tids[t]; ok {
   458  			tasks = append(tasks, id)
   459  		}
   460  	}
   461  	return tasks
   462  }
   463  
   464  // ID returns tg's leader's thread ID in its own PID namespace.
   465  // If tg's leader is dead, ID returns 0.
   466  func (tg *ThreadGroup) ID() ThreadID {
   467  	return ThreadID(tg.pidWithinNS.Load())
   468  }
   469  
// A taskNode defines the relationship between a task and the rest of the
// system. The comments on threadGroupNode also apply to taskNode.
//
// +stateify savable
type taskNode struct {
	// tg is the thread group that this task belongs to. The tg pointer is
	// immutable.
	tg *ThreadGroup `state:"wait"`

	// taskEntry links into tg.tasks. Note that this means that
	// Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
	// group. See threadGroupNode.tasks for synchronization info.
	taskEntry

	// parent is the task's parent. parent may be nil.
	//
	// parent is protected by the TaskSet mutex.
	parent *Task

	// children is the set of this task's children, keyed by child task.
	//
	// children is protected by the TaskSet mutex.
	children map[*Task]struct{}

	// If childPIDNamespace is not nil, all new tasks created by this task will
	// be members of childPIDNamespace rather than this one. (As a corollary,
	// this task becomes unable to create sibling tasks in the same thread
	// group.)
	//
	// childPIDNamespace is exclusive to the task goroutine.
	childPIDNamespace *PIDNamespace
}
   502  
   503  // ThreadGroup returns the thread group containing t.
   504  func (t *Task) ThreadGroup() *ThreadGroup {
   505  	return t.tg
   506  }
   507  
   508  // PIDNamespace returns the PID namespace containing t.
   509  func (t *Task) PIDNamespace() *PIDNamespace {
   510  	return t.tg.pidns
   511  }
   512  
   513  // TaskSet returns the TaskSet containing t.
   514  func (t *Task) TaskSet() *TaskSet {
   515  	return t.tg.pidns.owner
   516  }
   517  
   518  // Timekeeper returns the system Timekeeper.
   519  func (t *Task) Timekeeper() *Timekeeper {
   520  	return t.k.timekeeper
   521  }
   522  
   523  // Parent returns t's parent.
   524  func (t *Task) Parent() *Task {
   525  	t.tg.pidns.owner.mu.RLock()
   526  	defer t.tg.pidns.owner.mu.RUnlock()
   527  	return t.parent
   528  }
   529  
   530  // ParentLocked returns t's parent. Caller must ensure t's TaskSet mu
   531  // is locked for at least reading.
   532  //
   533  // +checklocks:t.tg.pidns.owner.mu
   534  func (t *Task) ParentLocked() *Task {
   535  	return t.parent
   536  }
   537  
   538  // ThreadID returns t's thread ID in its own PID namespace. If the task is
   539  // dead, ThreadID returns 0.
   540  func (t *Task) ThreadID() ThreadID {
   541  	return t.tg.pidns.IDOfTask(t)
   542  }
   543  
   544  // TGIDInRoot returns t's TGID in the root PID namespace.
   545  func (t *Task) TGIDInRoot() ThreadID {
   546  	return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg)
   547  }
   548  
   549  // Children returns children of this task.
   550  func (t *Task) Children() map[*Task]struct{} {
   551  	t.tg.pidns.owner.mu.RLock()
   552  	defer t.tg.pidns.owner.mu.RUnlock()
   553  
   554  	children := make(map[*Task]struct{}, len(t.children))
   555  	for child, val := range t.children {
   556  		children[child] = val
   557  	}
   558  
   559  	return children
   560  }