// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/futex"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/sched"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
)

// TaskConfig defines the configuration of a new Task (see below).
//
// It is consumed by TaskSet.NewTask, which copies most fields directly into
// the new Task and takes over (or, on failure, releases) the references
// described on the individual fields.
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent. This overrides Parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on. The new
	// task stores a copy of this mask.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace is the MountNamespace of the new task. It may be nil.
	MountNamespace *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialised to.
	InitialCgroups map[Cgroup]struct{}

	// UserCounters is user resource counters.
	UserCounters *userCounters

	// SessionKeyring is the session keyring associated with the parent task.
	// It may be nil.
	SessionKeyring *auth.Key
}

// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them: the TaskImage, FSContext, FDTable,
// UTSNamespace, IPCNamespace, NetworkNamespace and (if non-nil)
// MountNamespace are all dropped before the error is returned.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	var err error
	// cleanup releases every cfg-held resource that NewTask owns. It is only
	// invoked on the failure paths below.
	cleanup := func() {
		cfg.TaskImage.release(ctx)
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.UTSNamespace.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		cfg.NetworkNamespace.DecRef(ctx)
		if cfg.MountNamespace != nil {
			cfg.MountNamespace.DecRef(ctx)
		}
	}
	// Account for the new task in cfg.UserCounters first; this is undone via
	// decRLimitNProc if task creation fails below.
	if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil {
		cleanup()
		return nil, err
	}
	t, err := ts.newTask(ctx, cfg)
	if err != nil {
		cfg.UserCounters.decRLimitNProc()
		cleanup()
		return nil, err
	}
	return t, nil
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
//
// It builds the Task from cfg, optionally reserves a PIDs-controller cgroup
// charge on behalf of the calling task, and then, while holding ts.mu and the
// thread group's signal-handler mutex, assigns thread IDs and links the task
// into its parent, cgroups and thread group. On error no cfg references are
// released here; that is left to NewTask.
func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	// srcT is the task (if any) on whose behalf the new task is created; it
	// is nil when called from a non-task context.
	srcT := TaskFromContext(ctx)
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:        (*runApp)(nil),
		interruptChan:   make(chan struct{}, 1),
		signalMask:      atomicbitops.FromUint64(uint64(cfg.SignalMask)),
		signalStack:     linux.SignalStack{Flags: linux.SS_DISABLE},
		image:           *image,
		fsContext:       cfg.FSContext,
		fdTable:         cfg.FDTable,
		k:               cfg.Kernel,
		ptraceTracees:   make(map[*Task]struct{}),
		allowedCPUMask:  cfg.AllowedCPUMask.Copy(),
		ioUsage:         &usage.IO{},
		niceness:        cfg.Niceness,
		utsns:           cfg.UTSNamespace,
		ipcns:           cfg.IPCNamespace,
		abstractSockets: cfg.AbstractSocketNamespace,
		mountNamespace:  cfg.MountNamespace,
		rseqCPU:         -1,
		rseqAddr:        cfg.RSeqAddr,
		rseqSignature:   cfg.RSeqSignature,
		futexWaiter:     futex.NewWaiter(),
		containerID:     cfg.ContainerID,
		cgroups:         make(map[Cgroup]struct{}),
		userCounters:    cfg.UserCounters,
		sessionKeyring:  cfg.SessionKeyring,
	}
	t.netns = cfg.NetworkNamespace
	t.creds.Store(cfg.Credentials)
	// endStopCond uses the thread group's signal-handler mutex as its lock.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	var (
		cg                 Cgroup
		charged, committed bool
	)

	// Reserve cgroup PIDs controller charge. This is either committed when the
	// new task enters the cgroup below, or rolled back on failure.
	//
	// We may also get here from a non-task context (for example, when
	// creating the init task, or from the exec control command). In these cases
	// we skip charging the pids controller, as non-userspace task creation
	// bypasses pid limits.
	if srcT != nil {
		var err error
		if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil {
			return nil, err
		}
		if charged {
			// This defer is registered before the locks below are taken, so
			// (deferred calls run last-in, first-out) it executes only after
			// ts.mu and tg.signalHandlers.mu have been released.
			defer func() {
				if !committed {
					// Task creation failed after the charge was reserved;
					// give the charge back.
					if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil {
						panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err))
					}
				}
				// Ref from ChargeFor. Note that we need to drop this outside of
				// TaskSet.mu critical sections.
				cg.DecRef(ctx)
			}()
		}
	}

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, linuxerr.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
	t.updateInfoLocked()

	// InheritParent, when set, overrides the parent taken from cfg.Parent.
	if cfg.InheritParent != nil {
		t.parent = cfg.InheritParent.parent
	}
	if t.parent != nil {
		t.parent.children[t] = struct{}{}
	}

	// If InitialCgroups is not nil, the new task will be placed in the
	// specified cgroups. Otherwise, if srcT is not nil, the new task will
	// be placed in the srcT's cgroups. If neither is specified, the new task
	// will be in the root cgroups.
	t.EnterInitialCgroups(srcT, cfg.InitialCgroups)
	// From here on the deferred cleanup above must not roll back the PIDs
	// charge.
	committed = true

	if tg.leader == nil {
		// New thread group.
		tg.leader = t
		if parentPG := tg.parentPG(); parentPG == nil {
			tg.createSession()
		} else {
			// Inherit the process group and terminal.
			parentPG.incRefWithParent(parentPG)
			tg.processGroup = parentPG
			tg.tty = t.parent.tg.tty
		}

		// If our parent is a child subreaper, or if it has a child
		// subreaper, then this new thread group does as well.
		if t.parent != nil {
			tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper
		}
	}
	tg.tasks.PushBack(t)
	tg.tasksCount++
	tg.liveTasks++
	tg.activeTasks++

	// Propagate external TaskSet stops to the new task.
	t.stopCount = atomicbitops.FromInt32(ts.stopCount)

	t.mu.Lock()
	defer t.mu.Unlock()

	// The CPU assignment is derived from the allowed mask and the task's TID
	// in the root PID namespace.
	t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t]))

	t.startTime = t.k.RealtimeClock().Now()

	// As a final step, initialize the platform context. This is done last
	// because the task itself (via t.AsyncContext()) is passed to the
	// platform as the context, which may require other pieces of the task to
	// already be initialized.
	t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext())

	return t, nil
}

// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
// which it should be visible.
//
// It walks from t's own PID namespace up through every ancestor namespace,
// allocating a thread ID and registering t in each. If any step fails, the
// registrations already made are removed and the error is returned.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	// allocatedTID records a namespace in which t was successfully
	// registered, together with the tid it was given there.
	type allocatedTID struct {
		ns  *PIDNamespace
		tid ThreadID
	}
	var allocatedTIDs []allocatedTID
	var tid ThreadID
	var err error
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		if tid, err = ns.allocateTID(); err != nil {
			break
		}
		if err = ns.addTask(t, tid); err != nil {
			break
		}
		// Only namespaces where both steps succeeded are recorded for
		// rollback.
		allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
	}
	if err != nil {
		// Failure. Remove the tids we already allocated in descendant
		// namespaces.
		for _, a := range allocatedTIDs {
			a.ns.deleteTask(t)
		}
		return err
	}
	// Publish the thread group's ID as seen from its own PID namespace.
	t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg]))
	return nil
}

// allocateTID returns an unused ThreadID from ns.
//
// The search starts just after ns.last and proceeds upwards, wrapping from
// TasksLimit back to initTID + 1. A candidate is considered in use if it is
// currently a key in ns.tasks, ns.processGroups or ns.sessions. On success
// ns.last is updated to the returned value. It returns ENOMEM if ns is
// exiting and EAGAIN if the search returns to ns.last without finding a free
// ID.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, linuxerr.ENOMEM
	}
	tid := ns.last
	for {
		// Next.
		tid++
		if tid > TasksLimit {
			// Wrap around, skipping initTID itself.
			tid = initTID + 1
		}

		// Is it available? The ID must not collide with an existing task,
		// process group or session in this namespace.
		tidInUse := func() bool {
			if _, ok := ns.tasks[tid]; ok {
				return true
			}
			if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok {
				return true
			}
			if _, ok := ns.sessions[SessionID(tid)]; ok {
				return true
			}
			return false
		}()

		if !tidInUse {
			ns.last = tid
			return tid, nil
		}

		// Did we do a full cycle?
		if tid == ns.last {
			// No tid available.
			return 0, linuxerr.EAGAIN
		}
	}
}

// Start starts the task goroutine. Start must be called exactly once for each
// task returned by NewTask.
//
// 'tid' must be the task's TID in the root PID namespace and it's used for
// debugging purposes only (set as parameter to Task.run to make it visible
// in stack dumps).
func (t *Task) Start(tid ThreadID) {
	// If the task was restored, it may be "starting" after having already exited.
	if t.runState == nil {
		return
	}
	// Register the goroutine about to be launched with the per-task,
	// per-thread-group and per-TaskSet counters before it starts running.
	t.goroutineStopped.Add(1)
	t.tg.liveGoroutines.Add(1)
	t.tg.pidns.owner.liveGoroutines.Add(1)
	t.tg.pidns.owner.runningGoroutines.Add(1)

	// Task is now running in system mode.
	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)

	// Use the task's TID in the root PID namespace to make it visible in stack dumps.
	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
}