github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task_start.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/atomicbitops" 22 "github.com/metacubex/gvisor/pkg/context" 23 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 24 "github.com/metacubex/gvisor/pkg/hostarch" 25 "github.com/metacubex/gvisor/pkg/sentry/inet" 26 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 27 "github.com/metacubex/gvisor/pkg/sentry/kernel/futex" 28 "github.com/metacubex/gvisor/pkg/sentry/kernel/sched" 29 "github.com/metacubex/gvisor/pkg/sentry/usage" 30 "github.com/metacubex/gvisor/pkg/sentry/vfs" 31 ) 32 33 // TaskConfig defines the configuration of a new Task (see below). 34 type TaskConfig struct { 35 // Kernel is the owning Kernel. 36 Kernel *Kernel 37 38 // Parent is the new task's parent. Parent may be nil. 39 Parent *Task 40 41 // If InheritParent is not nil, use InheritParent's parent as the new 42 // task's parent. 43 InheritParent *Task 44 45 // ThreadGroup is the ThreadGroup the new task belongs to. 46 ThreadGroup *ThreadGroup 47 48 // SignalMask is the new task's initial signal mask. 49 SignalMask linux.SignalSet 50 51 // TaskImage is the TaskImage of the new task. Ownership of the 52 // TaskImage is transferred to TaskSet.NewTask, whether or not it 53 // succeeds. 54 TaskImage *TaskImage 55 56 // FSContext is the FSContext of the new task. A reference must be held on 57 // FSContext, which is transferred to TaskSet.NewTask whether or not it 58 // succeeds. 59 FSContext *FSContext 60 61 // FDTable is the FDTableof the new task. A reference must be held on 62 // FDMap, which is transferred to TaskSet.NewTask whether or not it 63 // succeeds. 64 FDTable *FDTable 65 66 // Credentials is the Credentials of the new task. 67 Credentials *auth.Credentials 68 69 // Niceness is the niceness of the new task. 70 Niceness int 71 72 // NetworkNamespace is the network namespace to be used for the new task. 73 NetworkNamespace *inet.Namespace 74 75 // AllowedCPUMask contains the cpus that this task can run on. 76 AllowedCPUMask sched.CPUSet 77 78 // UTSNamespace is the UTSNamespace of the new task. 79 UTSNamespace *UTSNamespace 80 81 // IPCNamespace is the IPCNamespace of the new task. 82 IPCNamespace *IPCNamespace 83 84 // MountNamespace is the MountNamespace of the new task. 85 MountNamespace *vfs.MountNamespace 86 87 // RSeqAddr is a pointer to the the userspace linux.RSeq structure. 88 RSeqAddr hostarch.Addr 89 90 // RSeqSignature is the signature that the rseq abort IP must be signed 91 // with. 92 RSeqSignature uint32 93 94 // ContainerID is the container the new task belongs to. 95 ContainerID string 96 97 // InitialCgroups are the cgroups the container is initialised to. 98 InitialCgroups map[Cgroup]struct{} 99 100 // UserCounters is user resource counters. 101 UserCounters *UserCounters 102 103 // SessionKeyring is the session keyring associated with the parent task. 104 // It may be nil. 105 SessionKeyring *auth.Key 106 } 107 108 // NewTask creates a new task defined by cfg. 109 // 110 // NewTask does not start the returned task; the caller must call Task.Start. 111 // 112 // If successful, NewTask transfers references held by cfg to the new task. 113 // Otherwise, NewTask releases them. 114 func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { 115 var err error 116 cleanup := func() { 117 cfg.TaskImage.release(ctx) 118 cfg.FSContext.DecRef(ctx) 119 cfg.FDTable.DecRef(ctx) 120 cfg.UTSNamespace.DecRef(ctx) 121 cfg.IPCNamespace.DecRef(ctx) 122 cfg.NetworkNamespace.DecRef(ctx) 123 if cfg.MountNamespace != nil { 124 cfg.MountNamespace.DecRef(ctx) 125 } 126 } 127 if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil { 128 cleanup() 129 return nil, err 130 } 131 t, err := ts.newTask(ctx, cfg) 132 if err != nil { 133 cfg.UserCounters.decRLimitNProc() 134 cleanup() 135 return nil, err 136 } 137 return t, nil 138 } 139 140 // newTask is a helper for TaskSet.NewTask that only takes ownership of parts 141 // of cfg if it succeeds. 142 func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { 143 srcT := TaskFromContext(ctx) 144 tg := cfg.ThreadGroup 145 image := cfg.TaskImage 146 t := &Task{ 147 taskNode: taskNode{ 148 tg: tg, 149 parent: cfg.Parent, 150 children: make(map[*Task]struct{}), 151 }, 152 runState: (*runApp)(nil), 153 interruptChan: make(chan struct{}, 1), 154 signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)), 155 signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, 156 image: *image, 157 fsContext: cfg.FSContext, 158 fdTable: cfg.FDTable, 159 k: cfg.Kernel, 160 ptraceTracees: make(map[*Task]struct{}), 161 allowedCPUMask: cfg.AllowedCPUMask.Copy(), 162 ioUsage: &usage.IO{}, 163 niceness: cfg.Niceness, 164 utsns: cfg.UTSNamespace, 165 ipcns: cfg.IPCNamespace, 166 mountNamespace: cfg.MountNamespace, 167 rseqCPU: -1, 168 rseqAddr: cfg.RSeqAddr, 169 rseqSignature: cfg.RSeqSignature, 170 futexWaiter: futex.NewWaiter(), 171 containerID: cfg.ContainerID, 172 cgroups: make(map[Cgroup]struct{}), 173 userCounters: cfg.UserCounters, 174 sessionKeyring: cfg.SessionKeyring, 175 } 176 t.netns = cfg.NetworkNamespace 177 t.creds.Store(cfg.Credentials) 178 t.endStopCond.L = &t.tg.signalHandlers.mu 179 // We don't construct t.blockingTimer until Task.run(); see that function 180 // for justification. 181 182 var ( 183 cg Cgroup 184 charged, committed bool 185 ) 186 187 // Reserve cgroup PIDs controller charge. This is either committed when the 188 // new task enters the cgroup below, or rolled back on failure. 189 // 190 // We may also get here from a non-task context (for example, when 191 // creating the init task, or from the exec control command). In these cases 192 // we skip charging the pids controller, as non-userspace task creation 193 // bypasses pid limits. 194 if srcT != nil { 195 var err error 196 if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil { 197 return nil, err 198 } 199 if charged { 200 defer func() { 201 if !committed { 202 if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil { 203 panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err)) 204 } 205 } 206 // Ref from ChargeFor. Note that we need to drop this outside of 207 // TaskSet.mu critical sections. 208 cg.DecRef(ctx) 209 }() 210 } 211 } 212 213 // Make the new task (and possibly thread group) visible to the rest of 214 // the system atomically. 215 ts.mu.Lock() 216 defer ts.mu.Unlock() 217 tg.signalHandlers.mu.Lock() 218 defer tg.signalHandlers.mu.Unlock() 219 if tg.exiting || tg.execing != nil { 220 // If the caller is in the same thread group, then what we return 221 // doesn't matter too much since the caller will exit before it returns 222 // to userspace. If the caller isn't in the same thread group, then 223 // we're in uncharted territory and can return whatever we want. 224 return nil, linuxerr.EINTR 225 } 226 if err := ts.assignTIDsLocked(t); err != nil { 227 return nil, err 228 } 229 // Below this point, newTask is expected not to fail (there is no rollback 230 // of assignTIDsLocked or any of the following). 231 232 // Logging on t's behalf will panic if t.logPrefix hasn't been 233 // initialized. This is the earliest point at which we can do so 234 // (since t now has thread IDs). 235 t.updateInfoLocked() 236 237 if cfg.InheritParent != nil { 238 t.parent = cfg.InheritParent.parent 239 } 240 if t.parent != nil { 241 t.parent.children[t] = struct{}{} 242 } 243 244 // If InitialCgroups is not nil, the new task will be placed in the 245 // specified cgroups. Otherwise, if srcT is not nil, the new task will 246 // be placed in the srcT's cgroups. If neither is specified, the new task 247 // will be in the root cgroups. 248 t.EnterInitialCgroups(srcT, cfg.InitialCgroups) 249 committed = true 250 251 if tg.leader == nil { 252 // New thread group. 253 tg.leader = t 254 if parentPG := tg.parentPG(); parentPG == nil { 255 tg.createSession() 256 } else { 257 // Inherit the process group and terminal. 258 parentPG.incRefWithParent(parentPG) 259 tg.processGroup = parentPG 260 tg.tty = t.parent.tg.tty 261 } 262 263 // If our parent is a child subreaper, or if it has a child 264 // subreaper, then this new thread group does as well. 265 if t.parent != nil { 266 tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper 267 } 268 } 269 tg.tasks.PushBack(t) 270 tg.tasksCount++ 271 tg.liveTasks++ 272 tg.activeTasks++ 273 274 // Propagate external TaskSet stops to the new task. 275 t.stopCount = atomicbitops.FromInt32(ts.stopCount) 276 277 t.mu.Lock() 278 defer t.mu.Unlock() 279 280 t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t])) 281 282 t.startTime = t.k.RealtimeClock().Now() 283 284 // As a final step, initialize the platform context. This may require 285 // other pieces to be initialized as the task is used the context. 286 t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext()) 287 288 return t, nil 289 } 290 291 // assignTIDsLocked ensures that new task t is visible in all PID namespaces in 292 // which it should be visible. 293 // 294 // Preconditions: ts.mu must be locked for writing. 295 func (ts *TaskSet) assignTIDsLocked(t *Task) error { 296 type allocatedTID struct { 297 ns *PIDNamespace 298 tid ThreadID 299 } 300 var allocatedTIDs []allocatedTID 301 var tid ThreadID 302 var err error 303 for ns := t.tg.pidns; ns != nil; ns = ns.parent { 304 if tid, err = ns.allocateTID(); err != nil { 305 break 306 } 307 if err = ns.addTask(t, tid); err != nil { 308 break 309 } 310 allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) 311 } 312 if err != nil { 313 // Failure. Remove the tids we already allocated in descendant 314 // namespaces. 315 for _, a := range allocatedTIDs { 316 a.ns.deleteTask(t) 317 } 318 return err 319 } 320 t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg])) 321 return nil 322 } 323 324 // allocateTID returns an unused ThreadID from ns. 325 // 326 // Preconditions: ns.owner.mu must be locked for writing. 327 func (ns *PIDNamespace) allocateTID() (ThreadID, error) { 328 if ns.exiting { 329 // "In this case, a subsequent fork(2) into this PID namespace will 330 // fail with the error ENOMEM; it is not possible to create a new 331 // processes [sic] in a PID namespace whose init process has 332 // terminated." - pid_namespaces(7) 333 return 0, linuxerr.ENOMEM 334 } 335 tid := ns.last 336 for { 337 // Next. 338 tid++ 339 if tid > TasksLimit { 340 tid = initTID + 1 341 } 342 343 // Is it available? 344 tidInUse := func() bool { 345 if _, ok := ns.tasks[tid]; ok { 346 return true 347 } 348 if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { 349 return true 350 } 351 if _, ok := ns.sessions[SessionID(tid)]; ok { 352 return true 353 } 354 return false 355 }() 356 357 if !tidInUse { 358 ns.last = tid 359 return tid, nil 360 } 361 362 // Did we do a full cycle? 363 if tid == ns.last { 364 // No tid available. 365 return 0, linuxerr.EAGAIN 366 } 367 } 368 } 369 370 // Start starts the task goroutine. Start must be called exactly once for each 371 // task returned by NewTask. 372 // 373 // 'tid' must be the task's TID in the root PID namespace and it's used for 374 // debugging purposes only (set as parameter to Task.run to make it visible 375 // in stack dumps). 376 func (t *Task) Start(tid ThreadID) { 377 // If the task was restored, it may be "starting" after having already exited. 378 if t.runState == nil { 379 return 380 } 381 t.goroutineStopped.Add(1) 382 t.tg.liveGoroutines.Add(1) 383 t.tg.pidns.owner.liveGoroutines.Add(1) 384 t.tg.pidns.owner.runningGoroutines.Add(1) 385 386 // Task is now running in system mode. 387 t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) 388 389 // Use the task's TID in the root PID namespace to make it visible in stack dumps. 390 go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops 391 }