github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_start.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "github.com/SagerNet/gvisor/pkg/abi/linux" 19 "github.com/SagerNet/gvisor/pkg/context" 20 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 21 "github.com/SagerNet/gvisor/pkg/hostarch" 22 "github.com/SagerNet/gvisor/pkg/sentry/inet" 23 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 24 "github.com/SagerNet/gvisor/pkg/sentry/kernel/futex" 25 "github.com/SagerNet/gvisor/pkg/sentry/kernel/sched" 26 "github.com/SagerNet/gvisor/pkg/sentry/usage" 27 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 28 "github.com/SagerNet/gvisor/pkg/syserror" 29 ) 30 31 // TaskConfig defines the configuration of a new Task (see below). 32 type TaskConfig struct { 33 // Kernel is the owning Kernel. 34 Kernel *Kernel 35 36 // Parent is the new task's parent. Parent may be nil. 37 Parent *Task 38 39 // If InheritParent is not nil, use InheritParent's parent as the new 40 // task's parent. 41 InheritParent *Task 42 43 // ThreadGroup is the ThreadGroup the new task belongs to. 44 ThreadGroup *ThreadGroup 45 46 // SignalMask is the new task's initial signal mask. 47 SignalMask linux.SignalSet 48 49 // TaskImage is the TaskImage of the new task. Ownership of the 50 // TaskImage is transferred to TaskSet.NewTask, whether or not it 51 // succeeds. 52 TaskImage *TaskImage 53 54 // FSContext is the FSContext of the new task. A reference must be held on 55 // FSContext, which is transferred to TaskSet.NewTask whether or not it 56 // succeeds. 57 FSContext *FSContext 58 59 // FDTable is the FDTableof the new task. A reference must be held on 60 // FDMap, which is transferred to TaskSet.NewTask whether or not it 61 // succeeds. 62 FDTable *FDTable 63 64 // Credentials is the Credentials of the new task. 65 Credentials *auth.Credentials 66 67 // Niceness is the niceness of the new task. 68 Niceness int 69 70 // NetworkNamespace is the network namespace to be used for the new task. 71 NetworkNamespace *inet.Namespace 72 73 // AllowedCPUMask contains the cpus that this task can run on. 74 AllowedCPUMask sched.CPUSet 75 76 // UTSNamespace is the UTSNamespace of the new task. 77 UTSNamespace *UTSNamespace 78 79 // IPCNamespace is the IPCNamespace of the new task. 80 IPCNamespace *IPCNamespace 81 82 // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. 83 AbstractSocketNamespace *AbstractSocketNamespace 84 85 // MountNamespaceVFS2 is the MountNamespace of the new task. 86 MountNamespaceVFS2 *vfs.MountNamespace 87 88 // RSeqAddr is a pointer to the the userspace linux.RSeq structure. 89 RSeqAddr hostarch.Addr 90 91 // RSeqSignature is the signature that the rseq abort IP must be signed 92 // with. 93 RSeqSignature uint32 94 95 // ContainerID is the container the new task belongs to. 96 ContainerID string 97 } 98 99 // NewTask creates a new task defined by cfg. 100 // 101 // NewTask does not start the returned task; the caller must call Task.Start. 102 // 103 // If successful, NewTask transfers references held by cfg to the new task. 104 // Otherwise, NewTask releases them. 105 func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { 106 t, err := ts.newTask(cfg) 107 if err != nil { 108 cfg.TaskImage.release() 109 cfg.FSContext.DecRef(ctx) 110 cfg.FDTable.DecRef(ctx) 111 cfg.IPCNamespace.DecRef(ctx) 112 if cfg.MountNamespaceVFS2 != nil { 113 cfg.MountNamespaceVFS2.DecRef(ctx) 114 } 115 return nil, err 116 } 117 return t, nil 118 } 119 120 // newTask is a helper for TaskSet.NewTask that only takes ownership of parts 121 // of cfg if it succeeds. 122 func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { 123 tg := cfg.ThreadGroup 124 image := cfg.TaskImage 125 t := &Task{ 126 taskNode: taskNode{ 127 tg: tg, 128 parent: cfg.Parent, 129 children: make(map[*Task]struct{}), 130 }, 131 runState: (*runApp)(nil), 132 interruptChan: make(chan struct{}, 1), 133 signalMask: cfg.SignalMask, 134 signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, 135 image: *image, 136 fsContext: cfg.FSContext, 137 fdTable: cfg.FDTable, 138 p: cfg.Kernel.Platform.NewContext(), 139 k: cfg.Kernel, 140 ptraceTracees: make(map[*Task]struct{}), 141 allowedCPUMask: cfg.AllowedCPUMask.Copy(), 142 ioUsage: &usage.IO{}, 143 niceness: cfg.Niceness, 144 netns: cfg.NetworkNamespace, 145 utsns: cfg.UTSNamespace, 146 ipcns: cfg.IPCNamespace, 147 abstractSockets: cfg.AbstractSocketNamespace, 148 mountNamespaceVFS2: cfg.MountNamespaceVFS2, 149 rseqCPU: -1, 150 rseqAddr: cfg.RSeqAddr, 151 rseqSignature: cfg.RSeqSignature, 152 futexWaiter: futex.NewWaiter(), 153 containerID: cfg.ContainerID, 154 cgroups: make(map[Cgroup]struct{}), 155 } 156 t.creds.Store(cfg.Credentials) 157 t.endStopCond.L = &t.tg.signalHandlers.mu 158 t.ptraceTracer.Store((*Task)(nil)) 159 // We don't construct t.blockingTimer until Task.run(); see that function 160 // for justification. 161 162 // Make the new task (and possibly thread group) visible to the rest of 163 // the system atomically. 164 ts.mu.Lock() 165 defer ts.mu.Unlock() 166 tg.signalHandlers.mu.Lock() 167 defer tg.signalHandlers.mu.Unlock() 168 if tg.exiting || tg.execing != nil { 169 // If the caller is in the same thread group, then what we return 170 // doesn't matter too much since the caller will exit before it returns 171 // to userspace. If the caller isn't in the same thread group, then 172 // we're in uncharted territory and can return whatever we want. 173 return nil, syserror.EINTR 174 } 175 if err := ts.assignTIDsLocked(t); err != nil { 176 return nil, err 177 } 178 // Below this point, newTask is expected not to fail (there is no rollback 179 // of assignTIDsLocked or any of the following). 180 181 // Logging on t's behalf will panic if t.logPrefix hasn't been 182 // initialized. This is the earliest point at which we can do so 183 // (since t now has thread IDs). 184 t.updateInfoLocked() 185 186 if cfg.InheritParent != nil { 187 t.parent = cfg.InheritParent.parent 188 } 189 if t.parent != nil { 190 t.parent.children[t] = struct{}{} 191 } 192 193 if VFS2Enabled { 194 t.EnterInitialCgroups(t.parent) 195 } 196 197 if tg.leader == nil { 198 // New thread group. 199 tg.leader = t 200 if parentPG := tg.parentPG(); parentPG == nil { 201 tg.createSession() 202 } else { 203 // Inherit the process group and terminal. 204 parentPG.incRefWithParent(parentPG) 205 tg.processGroup = parentPG 206 tg.tty = t.parent.tg.tty 207 } 208 } 209 tg.tasks.PushBack(t) 210 tg.tasksCount++ 211 tg.liveTasks++ 212 tg.activeTasks++ 213 214 // Propagate external TaskSet stops to the new task. 215 t.stopCount = ts.stopCount 216 217 t.mu.Lock() 218 defer t.mu.Unlock() 219 220 t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) 221 222 t.startTime = t.k.RealtimeClock().Now() 223 224 return t, nil 225 } 226 227 // assignTIDsLocked ensures that new task t is visible in all PID namespaces in 228 // which it should be visible. 229 // 230 // Preconditions: ts.mu must be locked for writing. 231 func (ts *TaskSet) assignTIDsLocked(t *Task) error { 232 type allocatedTID struct { 233 ns *PIDNamespace 234 tid ThreadID 235 } 236 var allocatedTIDs []allocatedTID 237 for ns := t.tg.pidns; ns != nil; ns = ns.parent { 238 tid, err := ns.allocateTID() 239 if err != nil { 240 // Failure. Remove the tids we already allocated in descendant 241 // namespaces. 242 for _, a := range allocatedTIDs { 243 delete(a.ns.tasks, a.tid) 244 delete(a.ns.tids, t) 245 if t.tg.leader == nil { 246 delete(a.ns.tgids, t.tg) 247 } 248 } 249 return err 250 } 251 ns.tasks[tid] = t 252 ns.tids[t] = tid 253 if t.tg.leader == nil { 254 // New thread group. 255 ns.tgids[t.tg] = tid 256 } 257 allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) 258 } 259 return nil 260 } 261 262 // allocateTID returns an unused ThreadID from ns. 263 // 264 // Preconditions: ns.owner.mu must be locked for writing. 265 func (ns *PIDNamespace) allocateTID() (ThreadID, error) { 266 if ns.exiting { 267 // "In this case, a subsequent fork(2) into this PID namespace will 268 // fail with the error ENOMEM; it is not possible to create a new 269 // processes [sic] in a PID namespace whose init process has 270 // terminated." - pid_namespaces(7) 271 return 0, syserror.ENOMEM 272 } 273 tid := ns.last 274 for { 275 // Next. 276 tid++ 277 if tid > TasksLimit { 278 tid = InitTID + 1 279 } 280 281 // Is it available? 282 tidInUse := func() bool { 283 if _, ok := ns.tasks[tid]; ok { 284 return true 285 } 286 if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { 287 return true 288 } 289 if _, ok := ns.sessions[SessionID(tid)]; ok { 290 return true 291 } 292 return false 293 }() 294 295 if !tidInUse { 296 ns.last = tid 297 return tid, nil 298 } 299 300 // Did we do a full cycle? 301 if tid == ns.last { 302 // No tid available. 303 return 0, linuxerr.EAGAIN 304 } 305 } 306 } 307 308 // Start starts the task goroutine. Start must be called exactly once for each 309 // task returned by NewTask. 310 // 311 // 'tid' must be the task's TID in the root PID namespace and it's used for 312 // debugging purposes only (set as parameter to Task.run to make it visible 313 // in stack dumps). 314 func (t *Task) Start(tid ThreadID) { 315 // If the task was restored, it may be "starting" after having already exited. 316 if t.runState == nil { 317 return 318 } 319 t.goroutineStopped.Add(1) 320 t.tg.liveGoroutines.Add(1) 321 t.tg.pidns.owner.liveGoroutines.Add(1) 322 t.tg.pidns.owner.runningGoroutines.Add(1) 323 324 // Task is now running in system mode. 325 t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) 326 327 // Use the task's TID in the root PID namespace to make it visible in stack dumps. 328 go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops 329 }