github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task_start.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/metacubex/gvisor/pkg/abi/linux"
    21  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    22  	"github.com/metacubex/gvisor/pkg/context"
    23  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    24  	"github.com/metacubex/gvisor/pkg/hostarch"
    25  	"github.com/metacubex/gvisor/pkg/sentry/inet"
    26  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/metacubex/gvisor/pkg/sentry/kernel/futex"
    28  	"github.com/metacubex/gvisor/pkg/sentry/kernel/sched"
    29  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    30  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    31  )
    32  
    33  // TaskConfig defines the configuration of a new Task (see below).
    34  type TaskConfig struct {
    35  	// Kernel is the owning Kernel.
    36  	Kernel *Kernel
    37  
    38  	// Parent is the new task's parent. Parent may be nil.
    39  	Parent *Task
    40  
    41  	// If InheritParent is not nil, use InheritParent's parent as the new
    42  	// task's parent.
    43  	InheritParent *Task
    44  
    45  	// ThreadGroup is the ThreadGroup the new task belongs to.
    46  	ThreadGroup *ThreadGroup
    47  
    48  	// SignalMask is the new task's initial signal mask.
    49  	SignalMask linux.SignalSet
    50  
    51  	// TaskImage is the TaskImage of the new task. Ownership of the
    52  	// TaskImage is transferred to TaskSet.NewTask, whether or not it
    53  	// succeeds.
    54  	TaskImage *TaskImage
    55  
    56  	// FSContext is the FSContext of the new task. A reference must be held on
    57  	// FSContext, which is transferred to TaskSet.NewTask whether or not it
    58  	// succeeds.
    59  	FSContext *FSContext
    60  
    61  	// FDTable is the FDTableof the new task. A reference must be held on
    62  	// FDMap, which is transferred to TaskSet.NewTask whether or not it
    63  	// succeeds.
    64  	FDTable *FDTable
    65  
    66  	// Credentials is the Credentials of the new task.
    67  	Credentials *auth.Credentials
    68  
    69  	// Niceness is the niceness of the new task.
    70  	Niceness int
    71  
    72  	// NetworkNamespace is the network namespace to be used for the new task.
    73  	NetworkNamespace *inet.Namespace
    74  
    75  	// AllowedCPUMask contains the cpus that this task can run on.
    76  	AllowedCPUMask sched.CPUSet
    77  
    78  	// UTSNamespace is the UTSNamespace of the new task.
    79  	UTSNamespace *UTSNamespace
    80  
    81  	// IPCNamespace is the IPCNamespace of the new task.
    82  	IPCNamespace *IPCNamespace
    83  
    84  	// MountNamespace is the MountNamespace of the new task.
    85  	MountNamespace *vfs.MountNamespace
    86  
    87  	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
    88  	RSeqAddr hostarch.Addr
    89  
    90  	// RSeqSignature is the signature that the rseq abort IP must be signed
    91  	// with.
    92  	RSeqSignature uint32
    93  
    94  	// ContainerID is the container the new task belongs to.
    95  	ContainerID string
    96  
    97  	// InitialCgroups are the cgroups the container is initialised to.
    98  	InitialCgroups map[Cgroup]struct{}
    99  
   100  	// UserCounters is user resource counters.
   101  	UserCounters *UserCounters
   102  
   103  	// SessionKeyring is the session keyring associated with the parent task.
   104  	// It may be nil.
   105  	SessionKeyring *auth.Key
   106  }
   107  
   108  // NewTask creates a new task defined by cfg.
   109  //
   110  // NewTask does not start the returned task; the caller must call Task.Start.
   111  //
   112  // If successful, NewTask transfers references held by cfg to the new task.
   113  // Otherwise, NewTask releases them.
   114  func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
   115  	var err error
   116  	cleanup := func() {
   117  		cfg.TaskImage.release(ctx)
   118  		cfg.FSContext.DecRef(ctx)
   119  		cfg.FDTable.DecRef(ctx)
   120  		cfg.UTSNamespace.DecRef(ctx)
   121  		cfg.IPCNamespace.DecRef(ctx)
   122  		cfg.NetworkNamespace.DecRef(ctx)
   123  		if cfg.MountNamespace != nil {
   124  			cfg.MountNamespace.DecRef(ctx)
   125  		}
   126  	}
   127  	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
   128  		cleanup()
   129  		return nil, err
   130  	}
   131  	t, err := ts.newTask(ctx, cfg)
   132  	if err != nil {
   133  		cfg.UserCounters.decRLimitNProc()
   134  		cleanup()
   135  		return nil, err
   136  	}
   137  	return t, nil
   138  }
   139  
   140  // newTask is a helper for TaskSet.NewTask that only takes ownership of parts
   141  // of cfg if it succeeds.
   142  func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
   143  	srcT := TaskFromContext(ctx)
   144  	tg := cfg.ThreadGroup
   145  	image := cfg.TaskImage
   146  	t := &Task{
   147  		taskNode: taskNode{
   148  			tg:       tg,
   149  			parent:   cfg.Parent,
   150  			children: make(map[*Task]struct{}),
   151  		},
   152  		runState:       (*runApp)(nil),
   153  		interruptChan:  make(chan struct{}, 1),
   154  		signalMask:     atomicbitops.FromUint64(uint64(cfg.SignalMask)),
   155  		signalStack:    linux.SignalStack{Flags: linux.SS_DISABLE},
   156  		image:          *image,
   157  		fsContext:      cfg.FSContext,
   158  		fdTable:        cfg.FDTable,
   159  		k:              cfg.Kernel,
   160  		ptraceTracees:  make(map[*Task]struct{}),
   161  		allowedCPUMask: cfg.AllowedCPUMask.Copy(),
   162  		ioUsage:        &usage.IO{},
   163  		niceness:       cfg.Niceness,
   164  		utsns:          cfg.UTSNamespace,
   165  		ipcns:          cfg.IPCNamespace,
   166  		mountNamespace: cfg.MountNamespace,
   167  		rseqCPU:        -1,
   168  		rseqAddr:       cfg.RSeqAddr,
   169  		rseqSignature:  cfg.RSeqSignature,
   170  		futexWaiter:    futex.NewWaiter(),
   171  		containerID:    cfg.ContainerID,
   172  		cgroups:        make(map[Cgroup]struct{}),
   173  		userCounters:   cfg.UserCounters,
   174  		sessionKeyring: cfg.SessionKeyring,
   175  	}
   176  	t.netns = cfg.NetworkNamespace
   177  	t.creds.Store(cfg.Credentials)
   178  	t.endStopCond.L = &t.tg.signalHandlers.mu
   179  	// We don't construct t.blockingTimer until Task.run(); see that function
   180  	// for justification.
   181  
   182  	var (
   183  		cg                 Cgroup
   184  		charged, committed bool
   185  	)
   186  
   187  	// Reserve cgroup PIDs controller charge. This is either committed when the
   188  	// new task enters the cgroup below, or rolled back on failure.
   189  	//
   190  	// We may also get here from a non-task context (for example, when
   191  	// creating the init task, or from the exec control command). In these cases
   192  	// we skip charging the pids controller, as non-userspace task creation
   193  	// bypasses pid limits.
   194  	if srcT != nil {
   195  		var err error
   196  		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
   197  			return nil, err
   198  		}
   199  		if charged {
   200  			defer func() {
   201  				if !committed {
   202  					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
   203  						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
   204  					}
   205  				}
   206  				// Ref from ChargeFor. Note that we need to drop this outside of
   207  				// TaskSet.mu critical sections.
   208  				cg.DecRef(ctx)
   209  			}()
   210  		}
   211  	}
   212  
   213  	// Make the new task (and possibly thread group) visible to the rest of
   214  	// the system atomically.
   215  	ts.mu.Lock()
   216  	defer ts.mu.Unlock()
   217  	tg.signalHandlers.mu.Lock()
   218  	defer tg.signalHandlers.mu.Unlock()
   219  	if tg.exiting || tg.execing != nil {
   220  		// If the caller is in the same thread group, then what we return
   221  		// doesn't matter too much since the caller will exit before it returns
   222  		// to userspace. If the caller isn't in the same thread group, then
   223  		// we're in uncharted territory and can return whatever we want.
   224  		return nil, linuxerr.EINTR
   225  	}
   226  	if err := ts.assignTIDsLocked(t); err != nil {
   227  		return nil, err
   228  	}
   229  	// Below this point, newTask is expected not to fail (there is no rollback
   230  	// of assignTIDsLocked or any of the following).
   231  
   232  	// Logging on t's behalf will panic if t.logPrefix hasn't been
   233  	// initialized. This is the earliest point at which we can do so
   234  	// (since t now has thread IDs).
   235  	t.updateInfoLocked()
   236  
   237  	if cfg.InheritParent != nil {
   238  		t.parent = cfg.InheritParent.parent
   239  	}
   240  	if t.parent != nil {
   241  		t.parent.children[t] = struct{}{}
   242  	}
   243  
   244  	// If InitialCgroups is not nil, the new task will be placed in the
   245  	// specified cgroups. Otherwise, if srcT is not nil, the new task will
   246  	// be placed in the srcT's cgroups. If neither is specified, the new task
   247  	// will be in the root cgroups.
   248  	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
   249  	committed = true
   250  
   251  	if tg.leader == nil {
   252  		// New thread group.
   253  		tg.leader = t
   254  		if parentPG := tg.parentPG(); parentPG == nil {
   255  			tg.createSession()
   256  		} else {
   257  			// Inherit the process group and terminal.
   258  			parentPG.incRefWithParent(parentPG)
   259  			tg.processGroup = parentPG
   260  			tg.tty = t.parent.tg.tty
   261  		}
   262  
   263  		// If our parent is a child subreaper, or if it has a child
   264  		// subreaper, then this new thread group does as well.
   265  		if t.parent != nil {
   266  			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
   267  		}
   268  	}
   269  	tg.tasks.PushBack(t)
   270  	tg.tasksCount++
   271  	tg.liveTasks++
   272  	tg.activeTasks++
   273  
   274  	// Propagate external TaskSet stops to the new task.
   275  	t.stopCount = atomicbitops.FromInt32(ts.stopCount)
   276  
   277  	t.mu.Lock()
   278  	defer t.mu.Unlock()
   279  
   280  	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))
   281  
   282  	t.startTime = t.k.RealtimeClock().Now()
   283  
   284  	// As a final step, initialize the platform context. This may require
   285  	// other pieces to be initialized as the task is used the context.
   286  	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())
   287  
   288  	return t, nil
   289  }
   290  
   291  // assignTIDsLocked ensures that new task t is visible in all PID namespaces in
   292  // which it should be visible.
   293  //
   294  // Preconditions: ts.mu must be locked for writing.
   295  func (ts *TaskSet) assignTIDsLocked(t *Task) error {
   296  	type allocatedTID struct {
   297  		ns  *PIDNamespace
   298  		tid ThreadID
   299  	}
   300  	var allocatedTIDs []allocatedTID
   301  	var tid ThreadID
   302  	var err error
   303  	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
   304  		if tid, err = ns.allocateTID(); err != nil {
   305  			break
   306  		}
   307  		if err = ns.addTask(t, tid); err != nil {
   308  			break
   309  		}
   310  		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
   311  	}
   312  	if err != nil {
   313  		// Failure. Remove the tids we already allocated in descendant
   314  		// namespaces.
   315  		for _, a := range allocatedTIDs {
   316  			a.ns.deleteTask(t)
   317  		}
   318  		return err
   319  	}
   320  	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
   321  	return nil
   322  }
   323  
   324  // allocateTID returns an unused ThreadID from ns.
   325  //
   326  // Preconditions: ns.owner.mu must be locked for writing.
   327  func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
   328  	if ns.exiting {
   329  		// "In this case, a subsequent fork(2) into this PID namespace will
   330  		// fail with the error ENOMEM; it is not possible to create a new
   331  		// processes [sic] in a PID namespace whose init process has
   332  		// terminated." - pid_namespaces(7)
   333  		return 0, linuxerr.ENOMEM
   334  	}
   335  	tid := ns.last
   336  	for {
   337  		// Next.
   338  		tid++
   339  		if tid > TasksLimit {
   340  			tid = initTID + 1
   341  		}
   342  
   343  		// Is it available?
   344  		tidInUse := func() bool {
   345  			if _, ok := ns.tasks[tid]; ok {
   346  				return true
   347  			}
   348  			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
   349  				return true
   350  			}
   351  			if _, ok := ns.sessions[SessionID(tid)]; ok {
   352  				return true
   353  			}
   354  			return false
   355  		}()
   356  
   357  		if !tidInUse {
   358  			ns.last = tid
   359  			return tid, nil
   360  		}
   361  
   362  		// Did we do a full cycle?
   363  		if tid == ns.last {
   364  			// No tid available.
   365  			return 0, linuxerr.EAGAIN
   366  		}
   367  	}
   368  }
   369  
   370  // Start starts the task goroutine. Start must be called exactly once for each
   371  // task returned by NewTask.
   372  //
   373  // 'tid' must be the task's TID in the root PID namespace and it's used for
   374  // debugging purposes only (set as parameter to Task.run to make it visible
   375  // in stack dumps).
   376  func (t *Task) Start(tid ThreadID) {
   377  	// If the task was restored, it may be "starting" after having already exited.
   378  	if t.runState == nil {
   379  		return
   380  	}
   381  	t.goroutineStopped.Add(1)
   382  	t.tg.liveGoroutines.Add(1)
   383  	t.tg.pidns.owner.liveGoroutines.Add(1)
   384  	t.tg.pidns.owner.runningGoroutines.Add(1)
   385  
   386  	// Task is now running in system mode.
   387  	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
   388  
   389  	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
   390  	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
   391  }