github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/threads.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
    21  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth"
    22  	"github.com/ttpreport/gvisor-ligolo/pkg/sync"
    23  	"github.com/ttpreport/gvisor-ligolo/pkg/waiter"
    24  )
    25  
// TasksLimit is the maximum number of threads for an untrusted application.
// Linux doesn't really limit this directly; rather, it is limited by total
// memory size, stacks allocated and a global maximum. There's no real reason
// for us to limit it either (esp. since threads are backed by goroutines),
// and we would expect to hit resource limits long before hitting this number.
// However, for correctness, we still check that the user doesn't exceed this
// number.
//
// Note that because of the way futexes are implemented, there *are* in fact
// serious restrictions on valid thread IDs. They are limited to 2^30 - 1
// (kernel/fork.c:MAX_THREADS).
const TasksLimit = (1 << 16)
    38  
    39  // ThreadID is a generic thread identifier.
    40  //
    41  // +marshal
    42  type ThreadID int32
    43  
    44  // String returns a decimal representation of the ThreadID.
    45  func (tid ThreadID) String() string {
    46  	return fmt.Sprintf("%d", tid)
    47  }
    48  
// initTID is the TID given to the first task added to each PID namespace. The
// thread group led by initTID is called the namespace's init process. The
// death of a PID namespace's init process causes all tasks visible in that
// namespace to be killed (see PIDNamespace.exiting, which records that the
// init process is exiting or has exited).
const initTID ThreadID = 1
    54  
// A TaskSet comprises all tasks in a system.
//
// A TaskSet is created by newTaskSet, which also records the TaskSet as the
// owner of its root PID namespace.
//
// +stateify savable
type TaskSet struct {
	// mu protects all relationships between tasks and thread groups in the
	// TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
	//
	// mu is also the lock referred to as "owner.mu" by PIDNamespace and as
	// "the TaskSet mutex" by threadGroupNode and taskNode.
	mu taskSetRWMutex `state:"nosave"`

	// Root is the root PID namespace, in which all tasks in the TaskSet are
	// visible. The Root pointer is immutable.
	Root *PIDNamespace

	// sessions is the set of all sessions.
	sessions sessionList

	// stopCount is the number of active external stops applicable to all tasks
	// in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
	// paired with a call to TaskSet.EndExternalStop). stopCount is protected
	// by mu.
	//
	// stopCount is not saved for the same reason as Task.stopCount; it is
	// always reset to zero after restore.
	stopCount int32 `state:"nosave"`

	// liveGoroutines is the number of non-exited task goroutines in the
	// TaskSet.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	// runningGoroutines is the number of running task goroutines in the
	// TaskSet.
	//
	// runningGoroutines is not saved; its counter value is required to be zero
	// at time of save (but note that this is not necessarily the same thing as
	// sync.WaitGroup's zero value).
	runningGoroutines sync.WaitGroup `state:"nosave"`

	// aioGoroutines is the number of goroutines running async I/O
	// callbacks.
	//
	// aioGoroutines is not saved but is required to be zero at the time of
	// save.
	aioGoroutines sync.WaitGroup `state:"nosave"`
}
   101  
   102  // newTaskSet returns a new, empty TaskSet.
   103  func newTaskSet(pidns *PIDNamespace) *TaskSet {
   104  	ts := &TaskSet{Root: pidns}
   105  	pidns.owner = ts
   106  	return ts
   107  }
   108  
   109  // forEachThreadGroupLocked applies f to each thread group in ts.
   110  //
   111  // Preconditions: ts.mu must be locked (for reading or writing).
   112  func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
   113  	for tg := range ts.Root.tgids {
   114  		f(tg)
   115  	}
   116  }
   117  
   118  // forEachTaskLocked applies f to each Task in ts.
   119  //
   120  // Preconditions: ts.mu must be locked (for reading or writing).
   121  func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) {
   122  	for t := range ts.Root.tids {
   123  		f(t)
   124  	}
   125  }
   126  
// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
// tasks. See the pid_namespaces(7) man page for further details.
//
// N.B. A task is said to be visible in a PID namespace if the PID namespace
// contains a thread ID that maps to that task.
//
// +stateify savable
type PIDNamespace struct {
	// owner is the TaskSet that this PID namespace belongs to. The owner
	// pointer is immutable.
	//
	// For the root namespace, owner is nil between NewRootPIDNamespace and
	// the newTaskSet call that assigns it.
	owner *TaskSet

	// parent is the PID namespace of the process that created this one. If
	// this is the root PID namespace, parent is nil. The parent pointer is
	// immutable.
	//
	// Invariant: All tasks that are visible in this namespace are also visible
	// in all ancestor namespaces.
	parent *PIDNamespace

	// userns is the user namespace with which this PID namespace is
	// associated. Privileged operations on this PID namespace must have
	// appropriate capabilities in userns. The userns pointer is immutable.
	userns *auth.UserNamespace

	// id is a unique ID assigned to the PID namespace (taken from
	// lastPIDNSID at construction). id is immutable.
	id uint64

	// The following fields are protected by owner.mu.

	// last is the last ThreadID to be allocated in this namespace.
	last ThreadID

	// tasks is a mapping from ThreadIDs in this namespace to tasks visible in
	// the namespace. It is the forward half of the bimap; tids is the
	// reverse half.
	tasks map[ThreadID]*Task

	// tids is a mapping from tasks visible in this namespace to their
	// identifiers in this namespace.
	tids map[*Task]ThreadID

	// tgids is a mapping from thread groups visible in this namespace to
	// their identifiers in this namespace.
	//
	// The content of tgids is equivalent to tids[tg.leader]. This exists
	// primarily as an optimization to quickly find all thread groups.
	tgids map[*ThreadGroup]ThreadID

	// sessions is a mapping from SessionIDs in this namespace to sessions
	// visible in the namespace.
	sessions map[SessionID]*Session

	// sids is a mapping from sessions visible in this namespace to their
	// identifiers in this namespace.
	sids map[*Session]SessionID

	// processGroups is a mapping from ProcessGroupIDs in this namespace to
	// process groups visible in the namespace.
	processGroups map[ProcessGroupID]*ProcessGroup

	// pgids is a mapping from process groups visible in this namespace to
	// their identifiers in this namespace.
	pgids map[*ProcessGroup]ProcessGroupID

	// exiting indicates that the namespace's init process is exiting or has
	// exited.
	exiting bool

	// extra contains additional per-PID-namespace data (a pidNamespaceData,
	// initialized by newPIDNamespaceData).
	extra pidNamespaceData
}
   198  
   199  func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
   200  	return &PIDNamespace{
   201  		owner:         ts,
   202  		parent:        parent,
   203  		userns:        userns,
   204  		id:            lastPIDNSID.Add(1),
   205  		tasks:         make(map[ThreadID]*Task),
   206  		tids:          make(map[*Task]ThreadID),
   207  		tgids:         make(map[*ThreadGroup]ThreadID),
   208  		sessions:      make(map[SessionID]*Session),
   209  		sids:          make(map[*Session]SessionID),
   210  		processGroups: make(map[ProcessGroupID]*ProcessGroup),
   211  		pgids:         make(map[*ProcessGroup]ProcessGroupID),
   212  		extra:         newPIDNamespaceData(),
   213  	}
   214  }
   215  
// lastPIDNSID is the last value of PIDNamespace.id assigned to a PID
// namespace. newPIDNamespace advances it atomically for every namespace it
// creates.
//
// This is global rather than being per-TaskSet or Kernel because
// NewRootPIDNamespace() is called before the Kernel is initialized.
var lastPIDNSID atomicbitops.Uint64
   222  
// NewRootPIDNamespace creates the root PID namespace, associated with userns.
// The returned namespace has no parent. 'owner' is not available yet when the
// root namespace is created, so it is left nil and must be set by the caller
// (newTaskSet does this).
func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace {
	return newPIDNamespace(nil, nil, userns)
}
   228  
// NewChild returns a new, empty PID namespace that is a child of ns. The
// child belongs to the same TaskSet as ns (it inherits ns.owner). Authority
// over the new PID namespace is controlled by userns.
func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
	return newPIDNamespace(ns.owner, ns, userns)
}
   234  
   235  // TaskWithID returns the task with thread ID tid in PID namespace ns. If no
   236  // task has that TID, TaskWithID returns nil.
   237  func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
   238  	ns.owner.mu.RLock()
   239  	t := ns.tasks[tid]
   240  	ns.owner.mu.RUnlock()
   241  	return t
   242  }
   243  
// ID returns a non-zero ID that is unique across PID namespaces. The ID is
// assigned once at construction and is immutable, so no locking is needed.
func (ns *PIDNamespace) ID() uint64 {
	return ns.id
}
   248  
   249  // ThreadGroupWithID returns the thread group led by the task with thread ID
   250  // tid in PID namespace ns. If no task has that TID, or if the task with that
   251  // TID is not a thread group leader, ThreadGroupWithID returns nil.
   252  func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
   253  	ns.owner.mu.RLock()
   254  	defer ns.owner.mu.RUnlock()
   255  	t := ns.tasks[tid]
   256  	if t == nil {
   257  		return nil
   258  	}
   259  	if t != t.tg.leader {
   260  		return nil
   261  	}
   262  	return t.tg
   263  }
   264  
   265  // IDOfTask returns the TID assigned to the given task in PID namespace ns. If
   266  // the task is not visible in that namespace, IDOfTask returns 0. (This return
   267  // value is significant in some cases, e.g. getppid() is documented as
   268  // returning 0 if the caller's parent is in an ancestor namespace and
   269  // consequently not visible to the caller.) If the task is nil, IDOfTask returns
   270  // 0.
   271  func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
   272  	ns.owner.mu.RLock()
   273  	id := ns.tids[t]
   274  	ns.owner.mu.RUnlock()
   275  	return id
   276  }
   277  
   278  // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
   279  // If the task is not visible in that namespace, IDOfThreadGroup returns 0.
   280  func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
   281  	ns.owner.mu.RLock()
   282  	id := ns.tgids[tg]
   283  	ns.owner.mu.RUnlock()
   284  	return id
   285  }
   286  
   287  // Tasks returns a snapshot of the tasks in ns.
   288  func (ns *PIDNamespace) Tasks() []*Task {
   289  	ns.owner.mu.RLock()
   290  	defer ns.owner.mu.RUnlock()
   291  	tasks := make([]*Task, 0, len(ns.tasks))
   292  	for t := range ns.tids {
   293  		tasks = append(tasks, t)
   294  	}
   295  	return tasks
   296  }
   297  
// NumTasks returns the number of tasks visible in ns, i.e. the number of
// entries in ns.tids. The count is read with the owning TaskSet's mutex held
// for reading.
func (ns *PIDNamespace) NumTasks() int {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return len(ns.tids)
}
   304  
// ThreadGroups returns a snapshot of the thread groups in ns, in unspecified
// order. It is shorthand for ns.ThreadGroupsAppend(nil), so the result is nil
// when ns contains no thread groups.
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
	return ns.ThreadGroupsAppend(nil)
}
   309  
   310  // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
   311  func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
   312  	ns.owner.mu.RLock()
   313  	defer ns.owner.mu.RUnlock()
   314  	for tg := range ns.tgids {
   315  		tgs = append(tgs, tg)
   316  	}
   317  	return tgs
   318  }
   319  
// UserNamespace returns the user namespace associated with PID namespace ns.
// The userns pointer is immutable, so no locking is needed.
func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
	return ns.userns
}
   324  
// Root returns the root PID namespace of the TaskSet that owns ns. Both
// ns.owner and TaskSet.Root are immutable, so no locking is needed.
//
// ns.owner must already have been set; a namespace fresh from
// NewRootPIDNamespace has a nil owner until newTaskSet assigns it.
func (ns *PIDNamespace) Root() *PIDNamespace {
	return ns.owner.Root
}
   329  
// A threadGroupNode defines the relationship between a thread group and the
// rest of the system. Conceptually, threadGroupNode is data belonging to the
// owning TaskSet, as if TaskSet contained a field `nodes
// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
// threadGroupNode is embedded in the ThreadGroup it represents.
// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
// threadGroupEntry's methods on ThreadGroup to make it implement
// threadGroupLinker.)
//
// NOTE(review): threadGroupEntry and threadGroupLinker are not defined in
// this file; confirm that these names are still current.
//
// +stateify savable
type threadGroupNode struct {
	// pidns is the PID namespace containing the thread group and all of its
	// member tasks. The pidns pointer is immutable.
	pidns *PIDNamespace

	// pidWithinNS is the thread ID of the leader of this thread group within
	// pidns. Useful to avoid using locks when determining a thread group
	// leader's own TID.
	pidWithinNS atomicbitops.Int32

	// eventQueue is notified whenever an event of interest to Task.Wait
	// occurs in a child of this thread group, or a ptrace tracee of a task in
	// this thread group. Events are defined in task_exit.go.
	eventQueue waiter.Queue

	// leader is the thread group's leader, which is the oldest task in the
	// thread group; usually the last task in the thread group to call
	// execve(), or if no such task exists then the first task in the thread
	// group, which was created by a call to fork() or clone() without
	// CLONE_THREAD. Once a thread group has been made visible to the rest of
	// the system by TaskSet.newTask, leader is never nil.
	//
	// Note that it's possible for the leader to exit without causing the rest
	// of the thread group to exit; in such a case, leader will still be valid
	// and non-nil, but leader will not be in tasks.
	//
	// leader is protected by the TaskSet mutex.
	leader *Task

	// If execing is not nil, it is a task in the thread group that has killed
	// all other tasks so that it can become the thread group leader and
	// perform an execve. (execing may already be the thread group leader.)
	//
	// execing is analogous to Linux's signal_struct::group_exit_task.
	//
	// execing is protected by the TaskSet mutex.
	execing *Task

	// tasks is all tasks in the thread group that have not yet been reaped.
	//
	// tasks is protected by both the TaskSet mutex and the signal mutex:
	// Mutating tasks requires locking the TaskSet mutex for writing *and*
	// locking the signal mutex. Reading tasks requires locking the TaskSet
	// mutex *or* locking the signal mutex.
	tasks taskList

	// tasksCount is the number of tasks in the thread group that have not yet
	// been reaped; equivalently, tasksCount is the number of tasks in tasks.
	//
	// tasksCount is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	tasksCount int

	// liveTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitZombie.
	//
	// liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
	liveTasks int

	// activeTasks is the number of tasks in the thread group that have not yet
	// reached TaskExitInitiated.
	//
	// activeTasks is protected by both the TaskSet mutex and the signal mutex,
	// as with tasks.
	activeTasks int
}
   406  
// PIDNamespace returns the PID namespace containing tg. The pidns pointer is
// immutable, so no locking is needed.
func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
	return tg.pidns
}
   411  
// TaskSet returns the TaskSet containing tg, i.e. the owner of tg's PID
// namespace. Both pointers involved are immutable, so no locking is needed.
func (tg *ThreadGroup) TaskSet() *TaskSet {
	return tg.pidns.owner
}
   416  
// Leader returns tg's leader. The leader field is protected by the TaskSet
// mutex, which is held for reading while the field is read.
func (tg *ThreadGroup) Leader() *Task {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.leader
}
   423  
   424  // Count returns the number of non-exited threads in the group.
   425  func (tg *ThreadGroup) Count() int {
   426  	tg.pidns.owner.mu.RLock()
   427  	defer tg.pidns.owner.mu.RUnlock()
   428  	var count int
   429  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   430  		count++
   431  	}
   432  	return count
   433  }
   434  
   435  // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for
   436  // all tasks in tg.
   437  func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
   438  	tg.pidns.owner.mu.RLock()
   439  	defer tg.pidns.owner.mu.RUnlock()
   440  
   441  	var tasks []ThreadID
   442  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   443  		if id, ok := pidns.tids[t]; ok {
   444  			tasks = append(tasks, id)
   445  		}
   446  	}
   447  	return tasks
   448  }
   449  
// ID returns tg's leader's thread ID in its own PID namespace.
// If tg's leader is dead, ID returns 0.
//
// The value is read atomically from pidWithinNS, so ID does not take the
// TaskSet mutex.
func (tg *ThreadGroup) ID() ThreadID {
	return ThreadID(tg.pidWithinNS.Load())
}
   455  
// A taskNode defines the relationship between a task and the rest of the
// system. The comments on threadGroupNode also apply to taskNode.
//
// +stateify savable
type taskNode struct {
	// tg is the thread group that this task belongs to. The tg pointer is
	// immutable.
	tg *ThreadGroup `state:"wait"`

	// taskEntry links into tg.tasks. Note that this means that
	// Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
	// group. See threadGroupNode.tasks for synchronization info.
	taskEntry

	// parent is the task's parent. parent may be nil.
	//
	// parent is protected by the TaskSet mutex.
	parent *Task

	// children is the set of this task's children.
	//
	// children is protected by the TaskSet mutex.
	children map[*Task]struct{}

	// If childPIDNamespace is not nil, all new tasks created by this task will
	// be members of childPIDNamespace rather than this one. (As a corollary,
	// this task becomes unable to create sibling tasks in the same thread
	// group.)
	//
	// childPIDNamespace is exclusive to the task goroutine.
	childPIDNamespace *PIDNamespace
}
   488  
// ThreadGroup returns the thread group containing t. The tg pointer is
// immutable, so no locking is needed.
func (t *Task) ThreadGroup() *ThreadGroup {
	return t.tg
}
   493  
// PIDNamespace returns the PID namespace containing t, which is the PID
// namespace of t's thread group. Both pointers involved are immutable, so no
// locking is needed.
func (t *Task) PIDNamespace() *PIDNamespace {
	return t.tg.pidns
}
   498  
// TaskSet returns the TaskSet containing t, i.e. the owner of the PID
// namespace of t's thread group. All pointers involved are immutable, so no
// locking is needed.
func (t *Task) TaskSet() *TaskSet {
	return t.tg.pidns.owner
}
   503  
// Timekeeper returns the system Timekeeper, read from t.k.timekeeper without
// taking any lock.
func (t *Task) Timekeeper() *Timekeeper {
	return t.k.timekeeper
}
   508  
// Parent returns t's parent, which may be nil. The parent field is protected
// by the TaskSet mutex, which is held for reading while the field is read.
//
// Callers that already hold the TaskSet mutex should use ParentLocked
// instead.
func (t *Task) Parent() *Task {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	return t.parent
}
   515  
// ParentLocked returns t's parent, which may be nil. Caller must ensure t's
// TaskSet mu is locked for at least reading; unlike Parent, ParentLocked does
// not acquire the lock itself.
//
// +checklocks:t.tg.pidns.owner.mu
func (t *Task) ParentLocked() *Task {
	return t.parent
}
   523  
// ThreadID returns t's thread ID in its own PID namespace. If the task is
// dead, ThreadID returns 0.
//
// The lookup goes through PIDNamespace.IDOfTask, which takes the TaskSet
// mutex for reading.
func (t *Task) ThreadID() ThreadID {
	return t.tg.pidns.IDOfTask(t)
}
   529  
// TGIDInRoot returns t's TGID in the root PID namespace, i.e. the ID that the
// root namespace of t's TaskSet assigns to t's thread group.
//
// The lookup goes through PIDNamespace.IDOfThreadGroup, which takes the
// TaskSet mutex for reading.
func (t *Task) TGIDInRoot() ThreadID {
	return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg)
}