github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/threads.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    21  	"github.com/SagerNet/gvisor/pkg/sync"
    22  	"github.com/SagerNet/gvisor/pkg/waiter"
    23  )
    24  
    25  // TasksLimit is the maximum number of threads for untrusted application.
    26  // Linux doesn't really limit this directly, rather it is limited by total
    27  // memory size, stacks allocated and a global maximum. There's no real reason
    28  // for us to limit it either, (esp. since threads are backed by go routines),
    29  // and we would expect to hit resource limits long before hitting this number.
    30  // However, for correctness, we still check that the user doesn't exceed this
    31  // number.
    32  //
    33  // Note that because of the way futexes are implemented, there *are* in fact
    34  // serious restrictions on valid thread IDs. They are limited to 2^30 - 1
    35  // (kernel/fork.c:MAX_THREADS).
    36  const TasksLimit = (1 << 16)
    37  
    38  // ThreadID is a generic thread identifier.
    39  //
    40  // +marshal
    41  type ThreadID int32
    42  
    43  // String returns a decimal representation of the ThreadID.
    44  func (tid ThreadID) String() string {
    45  	return fmt.Sprintf("%d", tid)
    46  }
    47  
    48  // InitTID is the TID given to the first task added to each PID namespace. The
    49  // thread group led by InitTID is called the namespace's init process. The
    50  // death of a PID namespace's init process causes all tasks visible in that
    51  // namespace to be killed.
    52  const InitTID ThreadID = 1
    53  
// A TaskSet comprises all tasks in a system.
//
// A TaskSet is created by newTaskSet, which also records the new TaskSet as
// the owner of its root PID namespace.
//
// +stateify savable
type TaskSet struct {
	// mu protects all relationships between tasks and thread groups in the
	// TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
	//
	// The PID namespace accessors in this file take mu for reading via
	// PIDNamespace.owner.
	mu sync.RWMutex `state:"nosave"`

	// Root is the root PID namespace, in which all tasks in the TaskSet are
	// visible. The Root pointer is immutable.
	Root *PIDNamespace

	// sessions is the set of all sessions.
	sessions sessionList

	// stopCount is the number of active external stops applicable to all tasks
	// in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
	// paired with a call to TaskSet.EndExternalStop). stopCount is protected
	// by mu.
	//
	// stopCount is not saved for the same reason as Task.stopCount; it is
	// always reset to zero after restore.
	stopCount int32 `state:"nosave"`

	// liveGoroutines is the number of non-exited task goroutines in the
	// TaskSet.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	// runningGoroutines is the number of running task goroutines in the
	// TaskSet.
	//
	// runningGoroutines is not saved; its counter value is required to be zero
	// at time of save (but note that this is not necessarily the same thing as
	// sync.WaitGroup's zero value).
	runningGoroutines sync.WaitGroup `state:"nosave"`

	// aioGoroutines is the number of goroutines running async I/O
	// callbacks.
	//
	// aioGoroutines is not saved but is required to be zero at the time of
	// save.
	aioGoroutines sync.WaitGroup `state:"nosave"`
}
   100  
   101  // newTaskSet returns a new, empty TaskSet.
   102  func newTaskSet(pidns *PIDNamespace) *TaskSet {
   103  	ts := &TaskSet{Root: pidns}
   104  	pidns.owner = ts
   105  	return ts
   106  }
   107  
   108  // forEachThreadGroupLocked applies f to each thread group in ts.
   109  //
   110  // Preconditions: ts.mu must be locked (for reading or writing).
   111  func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
   112  	for tg := range ts.Root.tgids {
   113  		f(tg)
   114  	}
   115  }
   116  
   117  // forEachTaskLocked applies f to each Task in ts.
   118  //
   119  // Preconditions: ts.mu must be locked (for reading or writing).
   120  func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) {
   121  	for t := range ts.Root.tids {
   122  		f(t)
   123  	}
   124  }
   125  
// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
// tasks. See the pid_namespaces(7) man page for further details.
//
// N.B. A task is said to be visible in a PID namespace if the PID namespace
// contains a thread ID that maps to that task.
//
// PIDNamespaces are constructed by newPIDNamespace, which allocates every map
// field below so that they are never nil.
//
// +stateify savable
type PIDNamespace struct {
	// owner is the TaskSet that this PID namespace belongs to. The owner
	// pointer is immutable.
	//
	// For the root namespace, owner is nil when the namespace is created and
	// is filled in by newTaskSet.
	owner *TaskSet

	// parent is the PID namespace of the process that created this one. If
	// this is the root PID namespace, parent is nil. The parent pointer is
	// immutable.
	//
	// Invariant: All tasks that are visible in this namespace are also visible
	// in all ancestor namespaces.
	parent *PIDNamespace

	// userns is the user namespace with which this PID namespace is
	// associated. Privileged operations on this PID namespace must have
	// appropriate capabilities in userns. The userns pointer is immutable.
	userns *auth.UserNamespace

	// The following fields are protected by owner.mu.

	// last is the last ThreadID to be allocated in this namespace.
	last ThreadID

	// tasks is a mapping from ThreadIDs in this namespace to tasks visible in
	// the namespace. It is the forward half of the bimap; tids is the reverse.
	tasks map[ThreadID]*Task

	// tids is a mapping from tasks visible in this namespace to their
	// identifiers in this namespace. A lookup for a task that is not visible
	// yields 0.
	tids map[*Task]ThreadID

	// tgids is a mapping from thread groups visible in this namespace to
	// their identifiers in this namespace.
	//
	// The content of tgids is equivalent to tids[tg.leader]. This exists
	// primarily as an optimization to quickly find all thread groups.
	tgids map[*ThreadGroup]ThreadID

	// sessions is a mapping from SessionIDs in this namespace to sessions
	// visible in the namespace.
	sessions map[SessionID]*Session

	// sids is a mapping from sessions visible in this namespace to their
	// identifiers in this namespace.
	sids map[*Session]SessionID

	// processGroups is a mapping from ProcessGroupIDs in this namespace to
	// process groups visible in the namespace.
	processGroups map[ProcessGroupID]*ProcessGroup

	// pgids is a mapping from process groups visible in this namespace to
	// their identifiers in this namespace.
	pgids map[*ProcessGroup]ProcessGroupID

	// exiting indicates that the namespace's init process is exiting or has
	// exited.
	exiting bool
}
   191  
   192  func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
   193  	return &PIDNamespace{
   194  		owner:         ts,
   195  		parent:        parent,
   196  		userns:        userns,
   197  		tasks:         make(map[ThreadID]*Task),
   198  		tids:          make(map[*Task]ThreadID),
   199  		tgids:         make(map[*ThreadGroup]ThreadID),
   200  		sessions:      make(map[SessionID]*Session),
   201  		sids:          make(map[*Session]SessionID),
   202  		processGroups: make(map[ProcessGroupID]*ProcessGroup),
   203  		pgids:         make(map[*ProcessGroup]ProcessGroupID),
   204  	}
   205  }
   206  
   207  // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available
   208  // yet when root namespace is created and must be set by caller.
   209  func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace {
   210  	return newPIDNamespace(nil, nil, userns)
   211  }
   212  
   213  // NewChild returns a new, empty PID namespace that is a child of ns. Authority
   214  // over the new PID namespace is controlled by userns.
   215  func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
   216  	return newPIDNamespace(ns.owner, ns, userns)
   217  }
   218  
   219  // TaskWithID returns the task with thread ID tid in PID namespace ns. If no
   220  // task has that TID, TaskWithID returns nil.
   221  func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
   222  	ns.owner.mu.RLock()
   223  	t := ns.tasks[tid]
   224  	ns.owner.mu.RUnlock()
   225  	return t
   226  }
   227  
   228  // ThreadGroupWithID returns the thread group led by the task with thread ID
   229  // tid in PID namespace ns. If no task has that TID, or if the task with that
   230  // TID is not a thread group leader, ThreadGroupWithID returns nil.
   231  func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
   232  	ns.owner.mu.RLock()
   233  	defer ns.owner.mu.RUnlock()
   234  	t := ns.tasks[tid]
   235  	if t == nil {
   236  		return nil
   237  	}
   238  	if t != t.tg.leader {
   239  		return nil
   240  	}
   241  	return t.tg
   242  }
   243  
   244  // IDOfTask returns the TID assigned to the given task in PID namespace ns. If
   245  // the task is not visible in that namespace, IDOfTask returns 0. (This return
   246  // value is significant in some cases, e.g. getppid() is documented as
   247  // returning 0 if the caller's parent is in an ancestor namespace and
   248  // consequently not visible to the caller.) If the task is nil, IDOfTask returns
   249  // 0.
   250  func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
   251  	ns.owner.mu.RLock()
   252  	id := ns.tids[t]
   253  	ns.owner.mu.RUnlock()
   254  	return id
   255  }
   256  
   257  // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
   258  // If the task is not visible in that namespace, IDOfThreadGroup returns 0.
   259  func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
   260  	ns.owner.mu.RLock()
   261  	id := ns.tgids[tg]
   262  	ns.owner.mu.RUnlock()
   263  	return id
   264  }
   265  
   266  // Tasks returns a snapshot of the tasks in ns.
   267  func (ns *PIDNamespace) Tasks() []*Task {
   268  	ns.owner.mu.RLock()
   269  	defer ns.owner.mu.RUnlock()
   270  	tasks := make([]*Task, 0, len(ns.tasks))
   271  	for t := range ns.tids {
   272  		tasks = append(tasks, t)
   273  	}
   274  	return tasks
   275  }
   276  
   277  // NumTasks returns the number of tasks in ns.
   278  func (ns *PIDNamespace) NumTasks() int {
   279  	ns.owner.mu.RLock()
   280  	defer ns.owner.mu.RUnlock()
   281  	return len(ns.tids)
   282  }
   283  
   284  // ThreadGroups returns a snapshot of the thread groups in ns.
   285  func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
   286  	return ns.ThreadGroupsAppend(nil)
   287  }
   288  
   289  // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
   290  func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
   291  	ns.owner.mu.RLock()
   292  	defer ns.owner.mu.RUnlock()
   293  	for tg := range ns.tgids {
   294  		tgs = append(tgs, tg)
   295  	}
   296  	return tgs
   297  }
   298  
   299  // UserNamespace returns the user namespace associated with PID namespace ns.
   300  func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
   301  	return ns.userns
   302  }
   303  
   304  // Root returns the root PID namespace of ns.
   305  func (ns *PIDNamespace) Root() *PIDNamespace {
   306  	return ns.owner.Root
   307  }
   308  
// A threadGroupNode defines the relationship between a thread group and the
// rest of the system. Conceptually, threadGroupNode is data belonging to the
// owning TaskSet, as if TaskSet contained a field `nodes
// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
// threadGroupNode is embedded in the ThreadGroup it represents.
// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
// threadGroupEntry's methods on ThreadGroup to make it implement
// threadGroupLinker.)
//
// +stateify savable
type threadGroupNode struct {
	// pidns is the PID namespace containing the thread group and all of its
	// member tasks. The pidns pointer is immutable.
	pidns *PIDNamespace

	// eventQueue is notified whenever an event of interest to Task.Wait occurs
	// in a child of this thread group, or a ptrace tracee of a task in this
	// thread group. Events are defined in task_exit.go.
	//
	// Note that we cannot check and save this wait queue similarly to other
	// wait queues, as the queue will not be empty by the time of saving, due
	// to the wait sourced from Exec().
	eventQueue waiter.Queue `state:"nosave"`

	// leader is the thread group's leader, which is the oldest task in the
	// thread group; usually the last task in the thread group to call
	// execve(), or if no such task exists then the first task in the thread
	// group, which was created by a call to fork() or clone() without
	// CLONE_THREAD. Once a thread group has been made visible to the rest of
	// the system by TaskSet.newTask, leader is never nil.
	//
	// Note that it's possible for the leader to exit without causing the rest
	// of the thread group to exit; in such a case, leader will still be valid
	// and non-nil, but leader will not be in tasks.
	//
	// leader is protected by the TaskSet mutex.
	leader *Task

	// If execing is not nil, it is a task in the thread group that has killed
	// all other tasks so that it can become the thread group leader and
	// perform an execve. (execing may already be the thread group leader.)
	//
	// execing is analogous to Linux's signal_struct::group_exit_task.
	//
	// execing is protected by the TaskSet mutex.
	execing *Task

	// tasks is all tasks in the thread group that have not yet been reaped.
	//
	// tasks is protected by both the TaskSet mutex and the signal mutex:
	// Mutating tasks requires locking the TaskSet mutex for writing *and*
	// locking the signal mutex. Reading tasks requires locking the TaskSet
	// mutex *or* locking the signal mutex.
	tasks taskList

	// tasksCount is the number of tasks in the thread group that have not yet
	// been reaped; equivalently, tasksCount is the number of tasks in tasks.
	//
	// tasksCount is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	tasksCount int

	// liveTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitZombie.
	//
	// liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
	liveTasks int

	// activeTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitInitiated.
	//
	// activeTasks is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	activeTasks int
}
   384  
   385  // PIDNamespace returns the PID namespace containing tg.
   386  func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
   387  	return tg.pidns
   388  }
   389  
   390  // TaskSet returns the TaskSet containing tg.
   391  func (tg *ThreadGroup) TaskSet() *TaskSet {
   392  	return tg.pidns.owner
   393  }
   394  
   395  // Leader returns tg's leader.
   396  func (tg *ThreadGroup) Leader() *Task {
   397  	tg.pidns.owner.mu.RLock()
   398  	defer tg.pidns.owner.mu.RUnlock()
   399  	return tg.leader
   400  }
   401  
   402  // Count returns the number of non-exited threads in the group.
   403  func (tg *ThreadGroup) Count() int {
   404  	tg.pidns.owner.mu.RLock()
   405  	defer tg.pidns.owner.mu.RUnlock()
   406  	var count int
   407  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   408  		count++
   409  	}
   410  	return count
   411  }
   412  
   413  // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for
   414  // all tasks in tg.
   415  func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
   416  	tg.pidns.owner.mu.RLock()
   417  	defer tg.pidns.owner.mu.RUnlock()
   418  
   419  	var tasks []ThreadID
   420  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   421  		if id, ok := pidns.tids[t]; ok {
   422  			tasks = append(tasks, id)
   423  		}
   424  	}
   425  	return tasks
   426  }
   427  
   428  // ID returns tg's leader's thread ID in its own PID namespace. If tg's leader
   429  // is dead, ID returns 0.
   430  func (tg *ThreadGroup) ID() ThreadID {
   431  	tg.pidns.owner.mu.RLock()
   432  	id := tg.pidns.tgids[tg]
   433  	tg.pidns.owner.mu.RUnlock()
   434  	return id
   435  }
   436  
// A taskNode defines the relationship between a task and the rest of the
// system. The comments on threadGroupNode also apply to taskNode.
//
// +stateify savable
type taskNode struct {
	// tg is the thread group that this task belongs to. The tg pointer is
	// immutable.
	tg *ThreadGroup `state:"wait"`

	// taskEntry links into tg.tasks. Note that this means that
	// Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
	// group. See threadGroupNode.tasks for synchronization info.
	taskEntry

	// parent is the task's parent. parent may be nil.
	//
	// parent is protected by the TaskSet mutex.
	parent *Task

	// children is the set of this task's children, keyed by task.
	//
	// children is protected by the TaskSet mutex.
	children map[*Task]struct{}

	// If childPIDNamespace is not nil, all new tasks created by this task will
	// be members of childPIDNamespace rather than this one. (As a corollary,
	// this task becomes unable to create sibling tasks in the same thread
	// group.)
	//
	// childPIDNamespace is exclusive to the task goroutine.
	childPIDNamespace *PIDNamespace
}
   469  
   470  // ThreadGroup returns the thread group containing t.
   471  func (t *Task) ThreadGroup() *ThreadGroup {
   472  	return t.tg
   473  }
   474  
   475  // PIDNamespace returns the PID namespace containing t.
   476  func (t *Task) PIDNamespace() *PIDNamespace {
   477  	return t.tg.pidns
   478  }
   479  
   480  // TaskSet returns the TaskSet containing t.
   481  func (t *Task) TaskSet() *TaskSet {
   482  	return t.tg.pidns.owner
   483  }
   484  
   485  // Timekeeper returns the system Timekeeper.
   486  func (t *Task) Timekeeper() *Timekeeper {
   487  	return t.k.timekeeper
   488  }
   489  
   490  // Parent returns t's parent.
   491  func (t *Task) Parent() *Task {
   492  	t.tg.pidns.owner.mu.RLock()
   493  	defer t.tg.pidns.owner.mu.RUnlock()
   494  	return t.parent
   495  }
   496  
   497  // ThreadID returns t's thread ID in its own PID namespace. If the task is
   498  // dead, ThreadID returns 0.
   499  func (t *Task) ThreadID() ThreadID {
   500  	return t.tg.pidns.IDOfTask(t)
   501  }
   502  
   503  // TGIDInRoot returns t's TGID in the root PID namespace.
   504  func (t *Task) TGIDInRoot() ThreadID {
   505  	return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg)
   506  }