gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_start.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// TaskConfig defines the configuration of a new Task (see below).
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// MountNamespace is the MountNamespace of the new task.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialised to.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters holds the user resource counters for the new task.
	UserCounters *UserCounters

	// SessionKeyring is the session keyring associated with the parent task.
	// It may be nil.
	SessionKeyring *auth.Key

	// Origin is the origin of the new task.
	Origin TaskOrigin
}

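// newTaskConfigSketch is an illustrative, hypothetical example (it is not part
// of the original file) of how a caller might fill in a TaskConfig. Every
// pointer parameter is assumed to carry a reference owned by the caller; as
// documented on the fields above, those references are consumed by
// TaskSet.NewTask whether or not it succeeds. Fields not set here (Niceness,
// AllowedCPUMask, the rseq fields, and so on) are left at their zero values
// for brevity.
func newTaskConfigSketch(
	k *Kernel, tg *ThreadGroup, creds *auth.Credentials,
	image *TaskImage, fsc *FSContext, fdt *FDTable,
	utsns *UTSNamespace, ipcns *IPCNamespace, netns *inet.Namespace,
	mntns *vfs.MountNamespace, uc *UserCounters) *TaskConfig {
	return &TaskConfig{
		Kernel:           k,
		ThreadGroup:      tg,
		SignalMask:       linux.SignalSet(0), // no signals blocked initially
		TaskImage:        image,
		FSContext:        fsc,
		FDTable:          fdt,
		Credentials:      creds,
		NetworkNamespace: netns,
		UTSNamespace:     utsns,
		IPCNamespace:     ipcns,
		MountNamespace:   mntns,
		UserCounters:     uc,
	}
}
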
// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	var err error
	cleanup := func() {
		cfg.TaskImage.release(ctx)
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.UTSNamespace.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		cfg.NetworkNamespace.DecRef(ctx)
		if cfg.MountNamespace != nil {
			cfg.MountNamespace.DecRef(ctx)
		}
	}
	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
		cleanup()
		return nil, err
	}
	t, err := ts.newTask(ctx, cfg)
	if err != nil {
		cfg.UserCounters.decRLimitNProc()
		cleanup()
		return nil, err
	}
	return t, nil
}

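// newTaskUsageSketch is an illustrative, hypothetical example (it is not part
// of the original file) of the reference-transfer contract documented on
// NewTask: the caller hands its references over in cfg and must not release
// them itself, because NewTask either transfers them to the new task or, on
// failure, releases them via its cleanup path.
func newTaskUsageSketch(ctx context.Context, ts *TaskSet, cfg *TaskConfig) (*Task, error) {
	t, err := ts.NewTask(ctx, cfg)
	if err != nil {
		// No DecRefs here: NewTask has already released the references
		// that cfg held.
		return nil, err
	}
	// The caller is still responsible for starting the task; see Task.Start
	// below.
	return t, nil
}
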
// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:       (*runApp)(nil),
		interruptChan:  make(chan struct{}, 1),
		signalMask:     atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:    linux.SignalStack{Flags: linux.SS_DISABLE},
		image:          *image,
		fsContext:      cfg.FSContext,
		fdTable:        cfg.FDTable,
		k:              cfg.Kernel,
		ptraceTracees:  make(map[*Task]struct{}),
		allowedCPUMask: cfg.AllowedCPUMask.Copy(),
		ioUsage:        &usage.IO{},
		niceness:       cfg.Niceness,
		utsns:          cfg.UTSNamespace,
		ipcns:          cfg.IPCNamespace,
		mountNamespace: cfg.MountNamespace,
		rseqCPU:        -1,
		rseqAddr:       cfg.RSeqAddr,
		rseqSignature:  cfg.RSeqSignature,
		futexWaiter:    futex.NewWaiter(),
		containerID:    cfg.ContainerID,
		cgroups:        make(map[Cgroup]struct{}),
		userCounters:   cfg.UserCounters,
		sessionKeyring: cfg.SessionKeyring,
		Origin:         cfg.Origin,
	}
	t.netns = cfg.NetworkNamespace
	t.creds.Store(cfg.Credentials)
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		cg                 Cgroup
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			defer func() {
				if !committed {
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	committed = true

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This may require
	// other pieces to be initialized first, since the task itself is used as
	// the context.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}

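// chargeRollbackSketch is an illustrative, hypothetical helper (it is not part
// of the original file) showing the charge/commit/rollback pattern that
// newTask uses for the cgroup PIDs controller above: reserve the charge before
// doing fallible work, and undo it in a deferred function unless the work
// completed and the charge was marked committed.
func chargeRollbackSketch(reserve func() error, rollback func(), body func() error) error {
	if err := reserve(); err != nil {
		return err
	}
	committed := false
	defer func() {
		if !committed {
			rollback()
		}
	}()
	if err := body(); err != nil {
		return err
	}
	committed = true
	return nil
}
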
// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
// which it should be visible.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	type allocatedTID struct {
		ns  *PIDNamespace
		tid ThreadID
	}
	var allocatedTIDs []allocatedTID
	var tid ThreadID
	var err error
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		if tid, err = ns.allocateTID(); err != nil {
			break
		}
		if err = ns.addTask(t, tid); err != nil {
			break
		}
		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
	}
	if err != nil {
		// Failure. Remove the tids we already allocated in descendant
		// namespaces.
		for _, a := range allocatedTIDs {
			a.ns.deleteTask(t)
		}
		return err
	}
	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
	return nil
}

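// ancestryTIDSketch is an illustrative, hypothetical helper (it is not part of
// the original file) showing the invariant that assignTIDsLocked establishes:
// a task has one ThreadID in its own PID namespace and in every ancestor
// namespace, and is invisible elsewhere. Like assignTIDsLocked, reading the
// namespaces' maps requires holding ts.mu, which is assumed to be held by the
// caller here.
func ancestryTIDSketch(t *Task) map[*PIDNamespace]ThreadID {
	tids := make(map[*PIDNamespace]ThreadID)
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		tids[ns] = ns.tids[t]
	}
	return tids
}
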
// allocateTID returns an unused ThreadID from ns.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, linuxerr.ENOMEM
	}
	tid := ns.last
	for {
		// Next.
		tid++
		if tid > TasksLimit {
			tid = initTID + 1
		}

		// Is it available?
		tidInUse := func() bool {
			if _, ok := ns.tasks[tid]; ok {
				return true
			}
			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
				return true
			}
			if _, ok := ns.sessions[SessionID(tid)]; ok {
				return true
			}
			return false
		}()

		if !tidInUse {
			ns.last = tid
			return tid, nil
		}

		// Did we do a full cycle?
		if tid == ns.last {
			// No tid available.
			return 0, linuxerr.EAGAIN
		}
	}
}

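// tidScanSketch is an illustrative, hypothetical helper (it is not part of the
// original file) that isolates the wrap-around search performed by allocateTID
// above: starting just after the last allocated ID, walk the ID space, wrap
// past TasksLimit back to initTID+1, and give up with EAGAIN once a full cycle
// has been made. The namespace's task, process group, and session maps are
// replaced by a single inUse predicate, and ns.last is not updated.
func tidScanSketch(last ThreadID, inUse func(ThreadID) bool) (ThreadID, error) {
	tid := last
	for {
		tid++
		if tid > TasksLimit {
			tid = initTID + 1
		}
		if !inUse(tid) {
			return tid, nil
		}
		if tid == last {
			// Every ID in the range is taken.
			return 0, linuxerr.EAGAIN
		}
	}
}
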
// Start starts the task goroutine. Start must be called exactly once for each
// task returned by NewTask.
//
// 'tid' must be the task's TID in the root PID namespace. It is used for
// debugging purposes only: it is passed as a parameter to Task.run to make it
// visible in stack dumps.
func (t *Task) Start(tid ThreadID) {
	// If the task was restored, it may be "starting" after having already exited.
	if t.runState == nil {
		return
	}
	t.goroutineStopped.Add(1)
	t.tg.liveGoroutines.Add(1)
	t.tg.pidns.owner.liveGoroutines.Add(1)
	t.tg.pidns.owner.runningGoroutines.Add(1)

	// Task is now running in system mode.
	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)

	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
}
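
// startSketch is an illustrative, hypothetical example (it is not part of the
// original file) of the NewTask/Start pairing described above: every task
// returned by NewTask must be started exactly once, passing its TID in the
// root PID namespace. Looking that TID up via PIDNamespace.IDOfTask is an
// assumption of this sketch, not something prescribed by this file.
func startSketch(ctx context.Context, ts *TaskSet, cfg *TaskConfig) (*Task, error) {
	t, err := ts.NewTask(ctx, cfg)
	if err != nil {
		return nil, err
	}
	t.Start(ts.Root.IDOfTask(t))
	return t, nil
}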