// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// TaskConfig defines the configuration of a new Task (see below).
//
// It is consumed by TaskSet.NewTask, which copies most fields directly into
// the Task it creates.
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent. When set, this overrides Parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	// TaskSet.NewTask drops a reference on it if task creation fails.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on. The new
	// task stores a copy of the mask.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task. TaskSet.NewTask drops
	// a reference on it if task creation fails.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task. TaskSet.NewTask drops
	// a reference on it if task creation fails.
	IPCNamespace *IPCNamespace

	// MountNamespace is the MountNamespace of the new task. TaskSet.NewTask
	// tolerates a nil MountNamespace; if it is non-nil, a reference is
	// dropped on it when task creation fails.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialised to.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters is user resource counters.
	UserCounters *UserCounters

	// SessionKeyring is the session keyring associated with the parent task.
	// It may be nil.
	SessionKeyring *auth.Key

	// Origin is copied unchanged into the new task's Origin field.
	Origin TaskOrigin
}

// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	var err error
	// cleanup releases everything in cfg whose ownership was handed to
	// NewTask. It is called on every failure path below.
	cleanup := func() {
		cfg.TaskImage.release(ctx)
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.UTSNamespace.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		cfg.NetworkNamespace.DecRef(ctx)
		// MountNamespace is the only member that is allowed to be nil here.
		if cfg.MountNamespace != nil {
			cfg.MountNamespace.DecRef(ctx)
		}
	}
	// Account for the new task in cfg.UserCounters before creating it; the
	// increment is undone below if newTask fails.
	// NOTE(review): presumably this enforces RLIMIT_NPROC — confirm against
	// UserCounters.incRLimitNProc.
	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
		cleanup()
		return nil, err
	}
	t, err := ts.newTask(ctx, cfg)
	if err != nil {
		// Roll back the counter increment before releasing cfg's resources.
		cfg.UserCounters.decRLimitNProc()
		cleanup()
		return nil, err
	}
	return t, nil
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
//
// On failure it returns a nil Task and leaves releasing cfg's resources to
// the caller.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	// srcT is nil when ctx is not a task context (see the PIDs charge
	// comment below).
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:      (*runApp)(nil),
		interruptChan: make(chan struct{}, 1),
		signalMask:    atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:   linux.SignalStack{Flags: linux.SS_DISABLE},
		// The TaskImage is copied by value into the task.
		image:          *image,
		fsContext:      cfg.FSContext,
		fdTable:        cfg.FDTable,
		k:              cfg.Kernel,
		ptraceTracees:  make(map[*Task]struct{}),
		allowedCPUMask: cfg.AllowedCPUMask.Copy(),
		ioUsage:        &usage.IO{},
		niceness:       cfg.Niceness,
		utsns:          cfg.UTSNamespace,
		ipcns:          cfg.IPCNamespace,
		mountNamespace: cfg.MountNamespace,
		rseqCPU:        -1,
		rseqAddr:       cfg.RSeqAddr,
		rseqSignature:  cfg.RSeqSignature,
		futexWaiter:    futex.NewWaiter(),
		containerID:    cfg.ContainerID,
		cgroups:        make(map[Cgroup]struct{}),
		userCounters:   cfg.UserCounters,
		sessionKeyring: cfg.SessionKeyring,
		Origin:         cfg.Origin,
	}
	t.netns = cfg.NetworkNamespace
	t.creds.Store(cfg.Credentials)
	// endStopCond shares the thread group's signal handler mutex as its
	// locker.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		cg                 Cgroup
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			// This defer is registered before the lock/unlock defers below,
			// so it runs after both mutexes have been released.
			defer func() {
				// committed is still false if we return before the task
				// enters its cgroups; undo the reservation in that case.
				if !committed {
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	// InheritParent, when set, overrides the parent taken from cfg.Parent.
	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in the srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	// From here on the deferred PIDs-charge handler must not roll back.
	committed = true

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	// The CPU is chosen from the allowed mask and the task's TID in the root
	// PID namespace.
	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This is done last
	// because the task itself (via t.AsyncContext()) is handed to NewContext,
	// which may require other pieces of the task to be initialized.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}

// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
// which it should be visible.
296 // 297 // Preconditions: ts.mu must be locked for writing. 298 func (ts *TaskSet) assignTIDsLocked(t *Task) error { 299 type allocatedTID struct { 300 ns *PIDNamespace 301 tid ThreadID 302 } 303 var allocatedTIDs []allocatedTID 304 var tid ThreadID 305 var err error 306 for ns := t.tg.pidns; ns != nil; ns = ns.parent { 307 if tid, err = ns.allocateTID(); err != nil { 308 break 309 } 310 if err = ns.addTask(t, tid); err != nil { 311 break 312 } 313 allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) 314 } 315 if err != nil { 316 // Failure. Remove the tids we already allocated in descendant 317 // namespaces. 318 for _, a := range allocatedTIDs { 319 a.ns.deleteTask(t) 320 } 321 return err 322 } 323 t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg])) 324 return nil 325 } 326 327 // allocateTID returns an unused ThreadID from ns. 328 // 329 // Preconditions: ns.owner.mu must be locked for writing. 330 func (ns *PIDNamespace) allocateTID() (ThreadID, error) { 331 if ns.exiting { 332 // "In this case, a subsequent fork(2) into this PID namespace will 333 // fail with the error ENOMEM; it is not possible to create a new 334 // processes [sic] in a PID namespace whose init process has 335 // terminated." - pid_namespaces(7) 336 return 0, linuxerr.ENOMEM 337 } 338 tid := ns.last 339 for { 340 // Next. 341 tid++ 342 if tid > TasksLimit { 343 tid = initTID + 1 344 } 345 346 // Is it available? 347 tidInUse := func() bool { 348 if _, ok := ns.tasks[tid]; ok { 349 return true 350 } 351 if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { 352 return true 353 } 354 if _, ok := ns.sessions[SessionID(tid)]; ok { 355 return true 356 } 357 return false 358 }() 359 360 if !tidInUse { 361 ns.last = tid 362 return tid, nil 363 } 364 365 // Did we do a full cycle? 366 if tid == ns.last { 367 // No tid available. 368 return 0, linuxerr.EAGAIN 369 } 370 } 371 } 372 373 // Start starts the task goroutine. 
Start must be called exactly once for each 374 // task returned by NewTask. 375 // 376 // 'tid' must be the task's TID in the root PID namespace and it's used for 377 // debugging purposes only (set as parameter to Task.run to make it visible 378 // in stack dumps). 379 func (t *Task) Start(tid ThreadID) { 380 // If the task was restored, it may be "starting" after having already exited. 381 if t.runState == nil { 382 return 383 } 384 t.goroutineStopped.Add(1) 385 t.tg.liveGoroutines.Add(1) 386 t.tg.pidns.owner.liveGoroutines.Add(1) 387 t.tg.pidns.owner.runningGoroutines.Add(1) 388 389 // Task is now running in system mode. 390 t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) 391 392 // Use the task's TID in the root PID namespace to make it visible in stack dumps. 393 go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops 394 }