github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/task_start.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
	"github.com/ttpreport/gvisor-ligolo/pkg/context"
	"github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/ttpreport/gvisor-ligolo/pkg/hostarch"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/inet"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/futex"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/sched"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/usage"
	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/vfs"
)

// TaskConfig defines the configuration of a new Task (see below).
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace is the MountNamespace of the new task.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters holds the resource counters of the user that owns the new
	// task.
	UserCounters *userCounters
}
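
// TaskConfig values are assembled by in-package callers, for example
// Kernel.CreateProcess when building a new process or Task.Clone for
// fork/clone, rather than constructed by hand. As a rough, illustrative
// sketch only (k, tg, image, fsContext, fdTable and creds stand for values
// such a caller already holds; the remaining namespace, cgroup and counter
// fields are populated the same way):
//
//	cfg := &TaskConfig{
//		Kernel:      k,
//		ThreadGroup: tg,
//		TaskImage:   image,     // ownership passes to TaskSet.NewTask
//		FSContext:   fsContext, // reference is consumed by TaskSet.NewTask
//		FDTable:     fdTable,   // reference is consumed by TaskSet.NewTask
//		Credentials: creds,
//	}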

// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	var err error
	cleanup := func() {
		cfg.TaskImage.release(ctx)
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		cfg.NetworkNamespace.DecRef(ctx)
		if cfg.MountNamespace != nil {
			cfg.MountNamespace.DecRef(ctx)
		}
	}
	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
		cleanup()
		return nil, err
	}
	t, err := ts.newTask(ctx, cfg)
	if err != nil {
		cfg.UserCounters.decRLimitNProc()
		cleanup()
		return nil, err
	}
	return t, nil
}
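
// A typical call sequence, shown as an illustrative sketch rather than code
// taken from a real caller (k is a *Kernel, cfg a fully populated
// *TaskConfig; IDOfTask is assumed to be the PIDNamespace lookup used to
// recover the new task's TID in the root namespace):
//
//	t, err := k.tasks.NewTask(ctx, cfg)
//	if err != nil {
//		return err // references held by cfg have already been released
//	}
//	t.Start(k.tasks.Root.IDOfTask(t)) // TID in the root PID namespace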

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:        (*runApp)(nil),
		interruptChan:   make(chan struct{}, 1),
		signalMask:      atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:     linux.SignalStack{Flags: linux.SS_DISABLE},
		image:           *image,
		fsContext:       cfg.FSContext,
		fdTable:         cfg.FDTable,
		k:               cfg.Kernel,
		ptraceTracees:   make(map[*Task]struct{}),
		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
		ioUsage:         &usage.IO{},
		niceness:        cfg.Niceness,
		utsns:           cfg.UTSNamespace,
		ipcns:           cfg.IPCNamespace,
		abstractSockets: cfg.AbstractSocketNamespace,
		mountNamespace:  cfg.MountNamespace,
		rseqCPU:         -1,
		rseqAddr:        cfg.RSeqAddr,
		rseqSignature:   cfg.RSeqSignature,
		futexWaiter:     futex.NewWaiter(),
		containerID:     cfg.ContainerID,
		cgroups:         make(map[Cgroup]struct{}),
		userCounters:    cfg.UserCounters,
	}
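	// The remaining fields are set outside the composite literal: netns,
	// creds and ptraceTracer are stored through their atomic wrappers since
	// they may be read without holding task locks, and endStopCond must use
	// the thread group's signal-handler mutex.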
	t.netns.Store(cfg.NetworkNamespace)
	t.creds.Store(cfg.Credentials)
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		cg                 Cgroup
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			defer func() {
				if !committed {
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in the srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	committed = true

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This is done last
	// because the task itself is used as the context, so other pieces of the
	// task must already be initialized.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}

// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
// which it should be visible.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	type allocatedTID struct {
		ns  *PIDNamespace
		tid ThreadID
	}
	var allocatedTIDs []allocatedTID
	var tid ThreadID
	var err error
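	// Allocate a TID in the task's own PID namespace and in every ancestor
	// namespace up to the root; the task must be visible, possibly under a
	// different TID, in each of them.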
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		if tid, err = ns.allocateTID(); err != nil {
			break
		}
		if err = ns.addTask(t, tid); err != nil {
			break
		}
		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
	}
	if err != nil {
		// Failure. Remove the tids we already allocated in descendant
		// namespaces.
		for _, a := range allocatedTIDs {
			a.ns.deleteTask(t)
		}
		return err
	}
	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
	return nil
}

// allocateTID returns an unused ThreadID from ns.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, linuxerr.ENOMEM
	}
	tid := ns.last
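	// Scan upward from the most recently allocated TID, wrapping around at
	// TasksLimit. The wrap restarts at initTID+1, so the namespace's first
	// TID is never handed out again.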
	for {
		// Next.
		tid++
		if tid > TasksLimit {
			tid = initTID + 1
		}

		// Is it available?
		tidInUse := func() bool {
			if _, ok := ns.tasks[tid]; ok {
				return true
			}
			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
				return true
			}
			if _, ok := ns.sessions[SessionID(tid)]; ok {
				return true
			}
			return false
		}()

		if !tidInUse {
			ns.last = tid
			return tid, nil
		}

		// Did we do a full cycle?
		if tid == ns.last {
			// No tid available.
			return 0, linuxerr.EAGAIN
		}
	}
}

// Start starts the task goroutine. Start must be called exactly once for each
// task returned by NewTask.
//
// 'tid' must be the task's TID in the root PID namespace and it's used for
// debugging purposes only (set as parameter to Task.run to make it visible
// in stack dumps).
func (t *Task) Start(tid ThreadID) {
	// If the task was restored, it may be "starting" after having already exited.
	if t.runState == nil {
		return
	}
	t.goroutineStopped.Add(1)
	t.tg.liveGoroutines.Add(1)
	t.tg.pidns.owner.liveGoroutines.Add(1)
	t.tg.pidns.owner.runningGoroutines.Add(1)

	// Task is now running in system mode.
	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)

	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
}