github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/task_start.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/ttpreport/gvisor-ligolo/pkg/abi/linux" 21 "github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops" 22 "github.com/ttpreport/gvisor-ligolo/pkg/context" 23 "github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr" 24 "github.com/ttpreport/gvisor-ligolo/pkg/hostarch" 25 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/inet" 26 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth" 27 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/futex" 28 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/sched" 29 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/usage" 30 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/vfs" 31 ) 32 33 // TaskConfig defines the configuration of a new Task (see below). 34 type TaskConfig struct { 35 // Kernel is the owning Kernel. 36 Kernel *Kernel 37 38 // Parent is the new task's parent. Parent may be nil. 39 Parent *Task 40 41 // If InheritParent is not nil, use InheritParent's parent as the new 42 // task's parent. 43 InheritParent *Task 44 45 // ThreadGroup is the ThreadGroup the new task belongs to. 46 ThreadGroup *ThreadGroup 47 48 // SignalMask is the new task's initial signal mask. 49 SignalMask linux.SignalSet 50 51 // TaskImage is the TaskImage of the new task. Ownership of the 52 // TaskImage is transferred to TaskSet.NewTask, whether or not it 53 // succeeds. 54 TaskImage *TaskImage 55 56 // FSContext is the FSContext of the new task. A reference must be held on 57 // FSContext, which is transferred to TaskSet.NewTask whether or not it 58 // succeeds. 59 FSContext *FSContext 60 61 // FDTable is the FDTableof the new task. A reference must be held on 62 // FDMap, which is transferred to TaskSet.NewTask whether or not it 63 // succeeds. 64 FDTable *FDTable 65 66 // Credentials is the Credentials of the new task. 67 Credentials *auth.Credentials 68 69 // Niceness is the niceness of the new task. 70 Niceness int 71 72 // NetworkNamespace is the network namespace to be used for the new task. 73 NetworkNamespace *inet.Namespace 74 75 // AllowedCPUMask contains the cpus that this task can run on. 76 AllowedCPUMask sched.CPUSet 77 78 // UTSNamespace is the UTSNamespace of the new task. 79 UTSNamespace *UTSNamespace 80 81 // IPCNamespace is the IPCNamespace of the new task. 82 IPCNamespace *IPCNamespace 83 84 // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. 85 AbstractSocketNamespace *AbstractSocketNamespace 86 87 // MountNamespace is the MountNamespace of the new task. 88 MountNamespace *vfs.MountNamespace 89 90 // RSeqAddr is a pointer to the the userspace linux.RSeq structure. 91 RSeqAddr hostarch.Addr 92 93 // RSeqSignature is the signature that the rseq abort IP must be signed 94 // with. 95 RSeqSignature uint32 96 97 // ContainerID is the container the new task belongs to. 98 ContainerID string 99 100 // InitialCgroups are the cgroups the container is initialised to. 101 InitialCgroups map[Cgroup]struct{} 102 103 // UserCounters is user resource counters. 104 UserCounters *userCounters 105 } 106 107 // NewTask creates a new task defined by cfg. 108 // 109 // NewTask does not start the returned task; the caller must call Task.Start. 110 // 111 // If successful, NewTask transfers references held by cfg to the new task. 112 // Otherwise, NewTask releases them. 113 func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { 114 var err error 115 cleanup := func() { 116 cfg.TaskImage.release(ctx) 117 cfg.FSContext.DecRef(ctx) 118 cfg.FDTable.DecRef(ctx) 119 cfg.IPCNamespace.DecRef(ctx) 120 cfg.NetworkNamespace.DecRef(ctx) 121 if cfg.MountNamespace != nil { 122 cfg.MountNamespace.DecRef(ctx) 123 } 124 } 125 if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil { 126 cleanup() 127 return nil, err 128 } 129 t, err := ts.newTask(ctx, cfg) 130 if err != nil { 131 cfg.UserCounters.decRLimitNProc() 132 cleanup() 133 return nil, err 134 } 135 return t, nil 136 } 137 138 // newTask is a helper for TaskSet.NewTask that only takes ownership of parts 139 // of cfg if it succeeds. 140 func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { 141 srcT := TaskFromContext(ctx) 142 tg := cfg.ThreadGroup 143 image := cfg.TaskImage 144 t := &Task{ 145 taskNode: taskNode{ 146 tg: tg, 147 parent: cfg.Parent, 148 children: make(map[*Task]struct{}), 149 }, 150 runState: (*runApp)(nil), 151 interruptChan: make(chan struct{}, 1), 152 signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)), 153 signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, 154 image: *image, 155 fsContext: cfg.FSContext, 156 fdTable: cfg.FDTable, 157 k: cfg.Kernel, 158 ptraceTracees: make(map[*Task]struct{}), 159 allowedCPUMask: cfg.AllowedCPUMask.Copy(), 160 ioUsage: &usage.IO{}, 161 niceness: cfg.Niceness, 162 utsns: cfg.UTSNamespace, 163 ipcns: cfg.IPCNamespace, 164 abstractSockets: cfg.AbstractSocketNamespace, 165 mountNamespace: cfg.MountNamespace, 166 rseqCPU: -1, 167 rseqAddr: cfg.RSeqAddr, 168 rseqSignature: cfg.RSeqSignature, 169 futexWaiter: futex.NewWaiter(), 170 containerID: cfg.ContainerID, 171 cgroups: make(map[Cgroup]struct{}), 172 userCounters: cfg.UserCounters, 173 } 174 t.netns.Store(cfg.NetworkNamespace) 175 t.creds.Store(cfg.Credentials) 176 t.endStopCond.L = &t.tg.signalHandlers.mu 177 t.ptraceTracer.Store((*Task)(nil)) 178 // We don't construct t.blockingTimer until Task.run(); see that function 179 // for justification. 180 181 var ( 182 cg Cgroup 183 charged, committed bool 184 ) 185 186 // Reserve cgroup PIDs controller charge. This is either commited when the 187 // new task enters the cgroup below, or rolled back on failure. 188 // 189 // We may also get here from a non-task context (for example, when 190 // creating the init task, or from the exec control command). In these cases 191 // we skip charging the pids controller, as non-userspace task creation 192 // bypasses pid limits. 193 if srcT != nil { 194 var err error 195 if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil { 196 return nil, err 197 } 198 if charged { 199 defer func() { 200 if !committed { 201 if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil { 202 panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err)) 203 } 204 } 205 // Ref from ChargeFor. Note that we need to drop this outside of 206 // TaskSet.mu critical sections. 207 cg.DecRef(ctx) 208 }() 209 } 210 } 211 212 // Make the new task (and possibly thread group) visible to the rest of 213 // the system atomically. 214 ts.mu.Lock() 215 defer ts.mu.Unlock() 216 tg.signalHandlers.mu.Lock() 217 defer tg.signalHandlers.mu.Unlock() 218 if tg.exiting || tg.execing != nil { 219 // If the caller is in the same thread group, then what we return 220 // doesn't matter too much since the caller will exit before it returns 221 // to userspace. If the caller isn't in the same thread group, then 222 // we're in uncharted territory and can return whatever we want. 223 return nil, linuxerr.EINTR 224 } 225 if err := ts.assignTIDsLocked(t); err != nil { 226 return nil, err 227 } 228 // Below this point, newTask is expected not to fail (there is no rollback 229 // of assignTIDsLocked or any of the following). 230 231 // Logging on t's behalf will panic if t.logPrefix hasn't been 232 // initialized. This is the earliest point at which we can do so 233 // (since t now has thread IDs). 234 t.updateInfoLocked() 235 236 if cfg.InheritParent != nil { 237 t.parent = cfg.InheritParent.parent 238 } 239 if t.parent != nil { 240 t.parent.children[t] = struct{}{} 241 } 242 243 // If InitialCgroups is not nil, the new task will be placed in the 244 // specified cgroups. Otherwise, if srcT is not nil, the new task will 245 // be placed in the srcT's cgroups. If neither is specified, the new task 246 // will be in the root cgroups. 247 t.EnterInitialCgroups(srcT, cfg.InitialCgroups) 248 committed = true 249 250 if tg.leader == nil { 251 // New thread group. 252 tg.leader = t 253 if parentPG := tg.parentPG(); parentPG == nil { 254 tg.createSession() 255 } else { 256 // Inherit the process group and terminal. 257 parentPG.incRefWithParent(parentPG) 258 tg.processGroup = parentPG 259 tg.tty = t.parent.tg.tty 260 } 261 262 // If our parent is a child subreaper, or if it has a child 263 // subreaper, then this new thread group does as well. 264 if t.parent != nil { 265 tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper 266 } 267 } 268 tg.tasks.PushBack(t) 269 tg.tasksCount++ 270 tg.liveTasks++ 271 tg.activeTasks++ 272 273 // Propagate external TaskSet stops to the new task. 274 t.stopCount = atomicbitops.FromInt32(ts.stopCount) 275 276 t.mu.Lock() 277 defer t.mu.Unlock() 278 279 t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t])) 280 281 t.startTime = t.k.RealtimeClock().Now() 282 283 // As a final step, initialize the platform context. This may require 284 // other pieces to be initialized as the task is used the context. 285 t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext()) 286 287 return t, nil 288 } 289 290 // assignTIDsLocked ensures that new task t is visible in all PID namespaces in 291 // which it should be visible. 292 // 293 // Preconditions: ts.mu must be locked for writing. 294 func (ts *TaskSet) assignTIDsLocked(t *Task) error { 295 type allocatedTID struct { 296 ns *PIDNamespace 297 tid ThreadID 298 } 299 var allocatedTIDs []allocatedTID 300 var tid ThreadID 301 var err error 302 for ns := t.tg.pidns; ns != nil; ns = ns.parent { 303 if tid, err = ns.allocateTID(); err != nil { 304 break 305 } 306 if err = ns.addTask(t, tid); err != nil { 307 break 308 } 309 allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) 310 } 311 if err != nil { 312 // Failure. Remove the tids we already allocated in descendant 313 // namespaces. 314 for _, a := range allocatedTIDs { 315 a.ns.deleteTask(t) 316 } 317 return err 318 } 319 t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg])) 320 return nil 321 } 322 323 // allocateTID returns an unused ThreadID from ns. 324 // 325 // Preconditions: ns.owner.mu must be locked for writing. 326 func (ns *PIDNamespace) allocateTID() (ThreadID, error) { 327 if ns.exiting { 328 // "In this case, a subsequent fork(2) into this PID namespace will 329 // fail with the error ENOMEM; it is not possible to create a new 330 // processes [sic] in a PID namespace whose init process has 331 // terminated." - pid_namespaces(7) 332 return 0, linuxerr.ENOMEM 333 } 334 tid := ns.last 335 for { 336 // Next. 337 tid++ 338 if tid > TasksLimit { 339 tid = initTID + 1 340 } 341 342 // Is it available? 343 tidInUse := func() bool { 344 if _, ok := ns.tasks[tid]; ok { 345 return true 346 } 347 if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { 348 return true 349 } 350 if _, ok := ns.sessions[SessionID(tid)]; ok { 351 return true 352 } 353 return false 354 }() 355 356 if !tidInUse { 357 ns.last = tid 358 return tid, nil 359 } 360 361 // Did we do a full cycle? 362 if tid == ns.last { 363 // No tid available. 364 return 0, linuxerr.EAGAIN 365 } 366 } 367 } 368 369 // Start starts the task goroutine. Start must be called exactly once for each 370 // task returned by NewTask. 371 // 372 // 'tid' must be the task's TID in the root PID namespace and it's used for 373 // debugging purposes only (set as parameter to Task.run to make it visible 374 // in stack dumps). 375 func (t *Task) Start(tid ThreadID) { 376 // If the task was restored, it may be "starting" after having already exited. 377 if t.runState == nil { 378 return 379 } 380 t.goroutineStopped.Add(1) 381 t.tg.liveGoroutines.Add(1) 382 t.tg.pidns.owner.liveGoroutines.Add(1) 383 t.tg.pidns.owner.runningGoroutines.Add(1) 384 385 // Task is now running in system mode. 386 t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) 387 388 // Use the task's TID in the root PID namespace to make it visible in stack dumps. 389 go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops 390 }