github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_start.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    19  	"github.com/SagerNet/gvisor/pkg/context"
    20  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    21  	"github.com/SagerNet/gvisor/pkg/hostarch"
    22  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    28  	"github.com/SagerNet/gvisor/pkg/syserror"
    29  )
    30  
// TaskConfig defines the configuration of a new Task (see below).
//
// It is consumed by TaskSet.NewTask. Several fields carry references whose
// ownership passes to NewTask regardless of outcome; see the individual field
// comments.
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent. This takes precedence over Parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to. If the thread
	// group has no leader yet, the new task becomes its leader.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds. TaskImage must not be nil: it is dereferenced
	// unconditionally when the task is constructed.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on. The new
	// task stores its own copy of the mask.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task. TaskSet.NewTask
	// drops a reference on it if task creation fails.
	IPCNamespace *IPCNamespace

	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespaceVFS2 is the MountNamespace of the new task. It may be
	// nil; if it is not, TaskSet.NewTask drops a reference on it if task
	// creation fails.
	MountNamespaceVFS2 *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string
}
    98  
    99  // NewTask creates a new task defined by cfg.
   100  //
   101  // NewTask does not start the returned task; the caller must call Task.Start.
   102  //
   103  // If successful, NewTask transfers references held by cfg to the new task.
   104  // Otherwise, NewTask releases them.
   105  func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
   106  	t, err := ts.newTask(cfg)
   107  	if err != nil {
   108  		cfg.TaskImage.release()
   109  		cfg.FSContext.DecRef(ctx)
   110  		cfg.FDTable.DecRef(ctx)
   111  		cfg.IPCNamespace.DecRef(ctx)
   112  		if cfg.MountNamespaceVFS2 != nil {
   113  			cfg.MountNamespaceVFS2.DecRef(ctx)
   114  		}
   115  		return nil, err
   116  	}
   117  	return t, nil
   118  }
   119  
// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
//
// It first builds the Task value from cfg without holding any locks, then
// takes ts.mu followed by cfg.ThreadGroup's signalHandlers.mu (and finally
// the new task's own mu) to publish the task. All three locks are released by
// deferred unlocks when the function returns.
//
// It returns syserror.EINTR if the thread group is exiting or in the middle
// of an exec, or the error from assignTIDsLocked if thread IDs cannot be
// allocated. After TID assignment succeeds there are no further failure
// paths.
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:           (*runApp)(nil),
		interruptChan:      make(chan struct{}, 1),
		signalMask:         cfg.SignalMask,
		signalStack:        linux.SignalStack{Flags: linux.SS_DISABLE},
		image:              *image, // Copied by value; panics if cfg.TaskImage is nil.
		fsContext:          cfg.FSContext,
		fdTable:            cfg.FDTable,
		p:                  cfg.Kernel.Platform.NewContext(),
		k:                  cfg.Kernel,
		ptraceTracees:      make(map[*Task]struct{}),
		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
		ioUsage:            &usage.IO{},
		niceness:           cfg.Niceness,
		netns:              cfg.NetworkNamespace,
		utsns:              cfg.UTSNamespace,
		ipcns:              cfg.IPCNamespace,
		abstractSockets:    cfg.AbstractSocketNamespace,
		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
		rseqCPU:            -1,
		rseqAddr:           cfg.RSeqAddr,
		rseqSignature:      cfg.RSeqSignature,
		futexWaiter:        futex.NewWaiter(),
		containerID:        cfg.ContainerID,
		cgroups:            make(map[Cgroup]struct{}),
	}
	t.creds.Store(cfg.Credentials)
	// endStopCond uses the thread group's signal handler mutex as its lock.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// Store a typed nil so that ptraceTracer always holds a *Task.
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, syserror.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	// InheritParent, when set, overrides the parent taken from cfg.Parent.
	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	if VFS2Enabled {
		// NOTE(review): presumably places t in the same cgroups as its
		// parent; confirm against EnterInitialCgroups.
		t.EnterInitialCgroups(t.parent)
	}

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}
	}
	// Add t to the thread group's task list and bump its task counters.
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = ts.stopCount

	// t.mu is acquired last, after ts.mu and tg.signalHandlers.mu.
	t.mu.Lock()
	defer t.mu.Unlock()

	// The CPU is chosen from the allowed mask using t's TID in the root PID
	// namespace.
	t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t])

	t.startTime = t.k.RealtimeClock().Now()

	return t, nil
}
   226  
   227  // assignTIDsLocked ensures that new task t is visible in all PID namespaces in
   228  // which it should be visible.
   229  //
   230  // Preconditions: ts.mu must be locked for writing.
   231  func (ts *TaskSet) assignTIDsLocked(t *Task) error {
   232  	type allocatedTID struct {
   233  		ns  *PIDNamespace
   234  		tid ThreadID
   235  	}
   236  	var allocatedTIDs []allocatedTID
   237  	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
   238  		tid, err := ns.allocateTID()
   239  		if err != nil {
   240  			// Failure. Remove the tids we already allocated in descendant
   241  			// namespaces.
   242  			for _, a := range allocatedTIDs {
   243  				delete(a.ns.tasks, a.tid)
   244  				delete(a.ns.tids, t)
   245  				if t.tg.leader == nil {
   246  					delete(a.ns.tgids, t.tg)
   247  				}
   248  			}
   249  			return err
   250  		}
   251  		ns.tasks[tid] = t
   252  		ns.tids[t] = tid
   253  		if t.tg.leader == nil {
   254  			// New thread group.
   255  			ns.tgids[t.tg] = tid
   256  		}
   257  		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
   258  	}
   259  	return nil
   260  }
   261  
   262  // allocateTID returns an unused ThreadID from ns.
   263  //
   264  // Preconditions: ns.owner.mu must be locked for writing.
   265  func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
   266  	if ns.exiting {
   267  		// "In this case, a subsequent fork(2) into this PID namespace will
   268  		// fail with the error ENOMEM; it is not possible to create a new
   269  		// processes [sic] in a PID namespace whose init process has
   270  		// terminated." - pid_namespaces(7)
   271  		return 0, syserror.ENOMEM
   272  	}
   273  	tid := ns.last
   274  	for {
   275  		// Next.
   276  		tid++
   277  		if tid > TasksLimit {
   278  			tid = InitTID + 1
   279  		}
   280  
   281  		// Is it available?
   282  		tidInUse := func() bool {
   283  			if _, ok := ns.tasks[tid]; ok {
   284  				return true
   285  			}
   286  			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
   287  				return true
   288  			}
   289  			if _, ok := ns.sessions[SessionID(tid)]; ok {
   290  				return true
   291  			}
   292  			return false
   293  		}()
   294  
   295  		if !tidInUse {
   296  			ns.last = tid
   297  			return tid, nil
   298  		}
   299  
   300  		// Did we do a full cycle?
   301  		if tid == ns.last {
   302  			// No tid available.
   303  			return 0, linuxerr.EAGAIN
   304  		}
   305  	}
   306  }
   307  
   308  // Start starts the task goroutine. Start must be called exactly once for each
   309  // task returned by NewTask.
   310  //
   311  // 'tid' must be the task's TID in the root PID namespace and it's used for
   312  // debugging purposes only (set as parameter to Task.run to make it visible
   313  // in stack dumps).
   314  func (t *Task) Start(tid ThreadID) {
   315  	// If the task was restored, it may be "starting" after having already exited.
   316  	if t.runState == nil {
   317  		return
   318  	}
   319  	t.goroutineStopped.Add(1)
   320  	t.tg.liveGoroutines.Add(1)
   321  	t.tg.pidns.owner.liveGoroutines.Add(1)
   322  	t.tg.pidns.owner.runningGoroutines.Add(1)
   323  
   324  	// Task is now running in system mode.
   325  	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
   326  
   327  	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
   328  	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
   329  }