github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_start.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    21  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    22  	"github.com/MerlinKodo/gvisor/pkg/context"
    23  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    24  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    25  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/futex"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/sched"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    31  )
    32  
// TaskConfig defines the configuration of a new Task. It is consumed by
// TaskSet.NewTask (see below).
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent. This takes precedence over Parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace is the MountNamespace of the new task. It may be nil.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the new task is initially placed in.
	// If nil, the cgroups are chosen based on the creating task, if any.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters holds the per-user resource counters charged for the new
	// task.
	UserCounters *userCounters

	// SessionKeyring is the session keyring associated with the parent task.
	// It may be nil.
	SessionKeyring *auth.Key
}
   110  
   111  // NewTask creates a new task defined by cfg.
   112  //
   113  // NewTask does not start the returned task; the caller must call Task.Start.
   114  //
   115  // If successful, NewTask transfers references held by cfg to the new task.
   116  // Otherwise, NewTask releases them.
   117  func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
   118  	var err error
   119  	cleanup := func() {
   120  		cfg.TaskImage.release(ctx)
   121  		cfg.FSContext.DecRef(ctx)
   122  		cfg.FDTable.DecRef(ctx)
   123  		cfg.UTSNamespace.DecRef(ctx)
   124  		cfg.IPCNamespace.DecRef(ctx)
   125  		cfg.NetworkNamespace.DecRef(ctx)
   126  		if cfg.MountNamespace != nil {
   127  			cfg.MountNamespace.DecRef(ctx)
   128  		}
   129  	}
   130  	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
   131  		cleanup()
   132  		return nil, err
   133  	}
   134  	t, err := ts.newTask(ctx, cfg)
   135  	if err != nil {
   136  		cfg.UserCounters.decRLimitNProc()
   137  		cleanup()
   138  		return nil, err
   139  	}
   140  	return t, nil
   141  }
   142  
// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
//
// It constructs the Task from cfg, optionally reserves a PIDs-controller
// charge on behalf of the creating task, and then, while holding ts.mu and
// tg.signalHandlers.mu, assigns thread IDs and links the task into its
// parent, its cgroups and its thread group.
//
// It returns a nil Task and an error if the cgroup charge is rejected, if the
// thread group is exiting or execing (EINTR), or if thread IDs cannot be
// assigned.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	// srcT is the task creating the new task; it is nil when newTask is
	// reached from a non-task context.
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:        (*runApp)(nil),
		interruptChan:   make(chan struct{}, 1),
		signalMask:      atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:     linux.SignalStack{Flags: linux.SS_DISABLE},
		image:           *image,
		fsContext:       cfg.FSContext,
		fdTable:         cfg.FDTable,
		k:               cfg.Kernel,
		ptraceTracees:   make(map[*Task]struct{}),
		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
		ioUsage:         &usage.IO{},
		niceness:        cfg.Niceness,
		utsns:           cfg.UTSNamespace,
		ipcns:           cfg.IPCNamespace,
		abstractSockets: cfg.AbstractSocketNamespace,
		mountNamespace:  cfg.MountNamespace,
		rseqCPU:         -1,
		rseqAddr:        cfg.RSeqAddr,
		rseqSignature:   cfg.RSeqSignature,
		futexWaiter:     futex.NewWaiter(),
		containerID:     cfg.ContainerID,
		cgroups:         make(map[Cgroup]struct{}),
		userCounters:    cfg.UserCounters,
		sessionKeyring:  cfg.SessionKeyring,
	}
	t.netns = cfg.NetworkNamespace
	t.creds.Store(cfg.Credentials)
	// endStopCond shares the thread group's signal handler mutex as its
	// lock.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		// cg is the cgroup returned by ChargeFor; it is only meaningful when
		// charged is true.
		cg Cgroup
		// charged records whether a PIDs charge was reserved; committed
		// records whether the new task has entered its cgroups, at which
		// point the reservation must no longer be rolled back.
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			// This defer is registered before the lock defers below, so it
			// runs after both locks have been released.
			defer func() {
				if !committed {
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	// InheritParent, when set, overrides the parent taken from cfg.Parent.
	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in the srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	// From here on the deferred cleanup above must not roll back the PIDs
	// charge.
	committed = true

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	// Register the task with its thread group and bump the group's task
	// counters.
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	// Pick the initial CPU from the allowed mask and the task's TID in the
	// root PID namespace.
	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This may require
	// other pieces to be initialized, as the task (via t.AsyncContext()) is
	// used as the context.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}
   295  
   296  // assignTIDsLocked ensures that new task t is visible in all PID namespaces in
   297  // which it should be visible.
   298  //
   299  // Preconditions: ts.mu must be locked for writing.
   300  func (ts *TaskSet) assignTIDsLocked(t *Task) error {
   301  	type allocatedTID struct {
   302  		ns  *PIDNamespace
   303  		tid ThreadID
   304  	}
   305  	var allocatedTIDs []allocatedTID
   306  	var tid ThreadID
   307  	var err error
   308  	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
   309  		if tid, err = ns.allocateTID(); err != nil {
   310  			break
   311  		}
   312  		if err = ns.addTask(t, tid); err != nil {
   313  			break
   314  		}
   315  		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
   316  	}
   317  	if err != nil {
   318  		// Failure. Remove the tids we already allocated in descendant
   319  		// namespaces.
   320  		for _, a := range allocatedTIDs {
   321  			a.ns.deleteTask(t)
   322  		}
   323  		return err
   324  	}
   325  	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
   326  	return nil
   327  }
   328  
   329  // allocateTID returns an unused ThreadID from ns.
   330  //
   331  // Preconditions: ns.owner.mu must be locked for writing.
   332  func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
   333  	if ns.exiting {
   334  		// "In this case, a subsequent fork(2) into this PID namespace will
   335  		// fail with the error ENOMEM; it is not possible to create a new
   336  		// processes [sic] in a PID namespace whose init process has
   337  		// terminated." - pid_namespaces(7)
   338  		return 0, linuxerr.ENOMEM
   339  	}
   340  	tid := ns.last
   341  	for {
   342  		// Next.
   343  		tid++
   344  		if tid > TasksLimit {
   345  			tid = initTID + 1
   346  		}
   347  
   348  		// Is it available?
   349  		tidInUse := func() bool {
   350  			if _, ok := ns.tasks[tid]; ok {
   351  				return true
   352  			}
   353  			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
   354  				return true
   355  			}
   356  			if _, ok := ns.sessions[SessionID(tid)]; ok {
   357  				return true
   358  			}
   359  			return false
   360  		}()
   361  
   362  		if !tidInUse {
   363  			ns.last = tid
   364  			return tid, nil
   365  		}
   366  
   367  		// Did we do a full cycle?
   368  		if tid == ns.last {
   369  			// No tid available.
   370  			return 0, linuxerr.EAGAIN
   371  		}
   372  	}
   373  }
   374  
   375  // Start starts the task goroutine. Start must be called exactly once for each
   376  // task returned by NewTask.
   377  //
   378  // 'tid' must be the task's TID in the root PID namespace and it's used for
   379  // debugging purposes only (set as parameter to Task.run to make it visible
   380  // in stack dumps).
   381  func (t *Task) Start(tid ThreadID) {
   382  	// If the task was restored, it may be "starting" after having already exited.
   383  	if t.runState == nil {
   384  		return
   385  	}
   386  	t.goroutineStopped.Add(1)
   387  	t.tg.liveGoroutines.Add(1)
   388  	t.tg.pidns.owner.liveGoroutines.Add(1)
   389  	t.tg.pidns.owner.runningGoroutines.Add(1)
   390  
   391  	// Task is now running in system mode.
   392  	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
   393  
   394  	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
   395  	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
   396  }