github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/threads.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 21 "github.com/SagerNet/gvisor/pkg/sync" 22 "github.com/SagerNet/gvisor/pkg/waiter" 23 ) 24 25 // TasksLimit is the maximum number of threads for untrusted application. 26 // Linux doesn't really limit this directly, rather it is limited by total 27 // memory size, stacks allocated and a global maximum. There's no real reason 28 // for us to limit it either, (esp. since threads are backed by go routines), 29 // and we would expect to hit resource limits long before hitting this number. 30 // However, for correctness, we still check that the user doesn't exceed this 31 // number. 32 // 33 // Note that because of the way futexes are implemented, there *are* in fact 34 // serious restrictions on valid thread IDs. They are limited to 2^30 - 1 35 // (kernel/fork.c:MAX_THREADS). 36 const TasksLimit = (1 << 16) 37 38 // ThreadID is a generic thread identifier. 39 // 40 // +marshal 41 type ThreadID int32 42 43 // String returns a decimal representation of the ThreadID. 44 func (tid ThreadID) String() string { 45 return fmt.Sprintf("%d", tid) 46 } 47 48 // InitTID is the TID given to the first task added to each PID namespace. The 49 // thread group led by InitTID is called the namespace's init process. The 50 // death of a PID namespace's init process causes all tasks visible in that 51 // namespace to be killed. 52 const InitTID ThreadID = 1 53 54 // A TaskSet comprises all tasks in a system. 55 // 56 // +stateify savable 57 type TaskSet struct { 58 // mu protects all relationships between tasks and thread groups in the 59 // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) 60 mu sync.RWMutex `state:"nosave"` 61 62 // Root is the root PID namespace, in which all tasks in the TaskSet are 63 // visible. The Root pointer is immutable. 64 Root *PIDNamespace 65 66 // sessions is the set of all sessions. 67 sessions sessionList 68 69 // stopCount is the number of active external stops applicable to all tasks 70 // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been 71 // paired with a call to TaskSet.EndExternalStop). stopCount is protected 72 // by mu. 73 // 74 // stopCount is not saved for the same reason as Task.stopCount; it is 75 // always reset to zero after restore. 76 stopCount int32 `state:"nosave"` 77 78 // liveGoroutines is the number of non-exited task goroutines in the 79 // TaskSet. 80 // 81 // liveGoroutines is not saved; it is reset as task goroutines are 82 // restarted by Task.Start. 83 liveGoroutines sync.WaitGroup `state:"nosave"` 84 85 // runningGoroutines is the number of running task goroutines in the 86 // TaskSet. 87 // 88 // runningGoroutines is not saved; its counter value is required to be zero 89 // at time of save (but note that this is not necessarily the same thing as 90 // sync.WaitGroup's zero value). 91 runningGoroutines sync.WaitGroup `state:"nosave"` 92 93 // aioGoroutines is the number of goroutines running async I/O 94 // callbacks. 95 // 96 // aioGoroutines is not saved but is required to be zero at the time of 97 // save. 98 aioGoroutines sync.WaitGroup `state:"nosave"` 99 } 100 101 // newTaskSet returns a new, empty TaskSet. 102 func newTaskSet(pidns *PIDNamespace) *TaskSet { 103 ts := &TaskSet{Root: pidns} 104 pidns.owner = ts 105 return ts 106 } 107 108 // forEachThreadGroupLocked applies f to each thread group in ts. 109 // 110 // Preconditions: ts.mu must be locked (for reading or writing). 111 func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { 112 for tg := range ts.Root.tgids { 113 f(tg) 114 } 115 } 116 117 // forEachTaskLocked applies f to each Task in ts. 118 // 119 // Preconditions: ts.mu must be locked (for reading or writing). 120 func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { 121 for t := range ts.Root.tids { 122 f(t) 123 } 124 } 125 126 // A PIDNamespace represents a PID namespace, a bimap between thread IDs and 127 // tasks. See the pid_namespaces(7) man page for further details. 128 // 129 // N.B. A task is said to be visible in a PID namespace if the PID namespace 130 // contains a thread ID that maps to that task. 131 // 132 // +stateify savable 133 type PIDNamespace struct { 134 // owner is the TaskSet that this PID namespace belongs to. The owner 135 // pointer is immutable. 136 owner *TaskSet 137 138 // parent is the PID namespace of the process that created this one. If 139 // this is the root PID namespace, parent is nil. The parent pointer is 140 // immutable. 141 // 142 // Invariant: All tasks that are visible in this namespace are also visible 143 // in all ancestor namespaces. 144 parent *PIDNamespace 145 146 // userns is the user namespace with which this PID namespace is 147 // associated. Privileged operations on this PID namespace must have 148 // appropriate capabilities in userns. The userns pointer is immutable. 149 userns *auth.UserNamespace 150 151 // The following fields are protected by owner.mu. 152 153 // last is the last ThreadID to be allocated in this namespace. 154 last ThreadID 155 156 // tasks is a mapping from ThreadIDs in this namespace to tasks visible in 157 // the namespace. 158 tasks map[ThreadID]*Task 159 160 // tids is a mapping from tasks visible in this namespace to their 161 // identifiers in this namespace. 162 tids map[*Task]ThreadID 163 164 // tgids is a mapping from thread groups visible in this namespace to 165 // their identifiers in this namespace. 166 // 167 // The content of tgids is equivalent to tids[tg.leader]. This exists 168 // primarily as an optimization to quickly find all thread groups. 169 tgids map[*ThreadGroup]ThreadID 170 171 // sessions is a mapping from SessionIDs in this namespace to sessions 172 // visible in the namespace. 173 sessions map[SessionID]*Session 174 175 // sids is a mapping from sessions visible in this namespace to their 176 // identifiers in this namespace. 177 sids map[*Session]SessionID 178 179 // processGroups is a mapping from ProcessGroupIDs in this namespace to 180 // process groups visible in the namespace. 181 processGroups map[ProcessGroupID]*ProcessGroup 182 183 // pgids is a mapping from process groups visible in this namespace to 184 // their identifiers in this namespace. 185 pgids map[*ProcessGroup]ProcessGroupID 186 187 // exiting indicates that the namespace's init process is exiting or has 188 // exited. 189 exiting bool 190 } 191 192 func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { 193 return &PIDNamespace{ 194 owner: ts, 195 parent: parent, 196 userns: userns, 197 tasks: make(map[ThreadID]*Task), 198 tids: make(map[*Task]ThreadID), 199 tgids: make(map[*ThreadGroup]ThreadID), 200 sessions: make(map[SessionID]*Session), 201 sids: make(map[*Session]SessionID), 202 processGroups: make(map[ProcessGroupID]*ProcessGroup), 203 pgids: make(map[*ProcessGroup]ProcessGroupID), 204 } 205 } 206 207 // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available 208 // yet when root namespace is created and must be set by caller. 209 func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { 210 return newPIDNamespace(nil, nil, userns) 211 } 212 213 // NewChild returns a new, empty PID namespace that is a child of ns. Authority 214 // over the new PID namespace is controlled by userns. 215 func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { 216 return newPIDNamespace(ns.owner, ns, userns) 217 } 218 219 // TaskWithID returns the task with thread ID tid in PID namespace ns. If no 220 // task has that TID, TaskWithID returns nil. 221 func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { 222 ns.owner.mu.RLock() 223 t := ns.tasks[tid] 224 ns.owner.mu.RUnlock() 225 return t 226 } 227 228 // ThreadGroupWithID returns the thread group led by the task with thread ID 229 // tid in PID namespace ns. If no task has that TID, or if the task with that 230 // TID is not a thread group leader, ThreadGroupWithID returns nil. 231 func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { 232 ns.owner.mu.RLock() 233 defer ns.owner.mu.RUnlock() 234 t := ns.tasks[tid] 235 if t == nil { 236 return nil 237 } 238 if t != t.tg.leader { 239 return nil 240 } 241 return t.tg 242 } 243 244 // IDOfTask returns the TID assigned to the given task in PID namespace ns. If 245 // the task is not visible in that namespace, IDOfTask returns 0. (This return 246 // value is significant in some cases, e.g. getppid() is documented as 247 // returning 0 if the caller's parent is in an ancestor namespace and 248 // consequently not visible to the caller.) If the task is nil, IDOfTask returns 249 // 0. 250 func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { 251 ns.owner.mu.RLock() 252 id := ns.tids[t] 253 ns.owner.mu.RUnlock() 254 return id 255 } 256 257 // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. 258 // If the task is not visible in that namespace, IDOfThreadGroup returns 0. 259 func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { 260 ns.owner.mu.RLock() 261 id := ns.tgids[tg] 262 ns.owner.mu.RUnlock() 263 return id 264 } 265 266 // Tasks returns a snapshot of the tasks in ns. 267 func (ns *PIDNamespace) Tasks() []*Task { 268 ns.owner.mu.RLock() 269 defer ns.owner.mu.RUnlock() 270 tasks := make([]*Task, 0, len(ns.tasks)) 271 for t := range ns.tids { 272 tasks = append(tasks, t) 273 } 274 return tasks 275 } 276 277 // NumTasks returns the number of tasks in ns. 278 func (ns *PIDNamespace) NumTasks() int { 279 ns.owner.mu.RLock() 280 defer ns.owner.mu.RUnlock() 281 return len(ns.tids) 282 } 283 284 // ThreadGroups returns a snapshot of the thread groups in ns. 285 func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { 286 return ns.ThreadGroupsAppend(nil) 287 } 288 289 // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. 290 func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { 291 ns.owner.mu.RLock() 292 defer ns.owner.mu.RUnlock() 293 for tg := range ns.tgids { 294 tgs = append(tgs, tg) 295 } 296 return tgs 297 } 298 299 // UserNamespace returns the user namespace associated with PID namespace ns. 300 func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { 301 return ns.userns 302 } 303 304 // Root returns the root PID namespace of ns. 305 func (ns *PIDNamespace) Root() *PIDNamespace { 306 return ns.owner.Root 307 } 308 309 // A threadGroupNode defines the relationship between a thread group and the 310 // rest of the system. Conceptually, threadGroupNode is data belonging to the 311 // owning TaskSet, as if TaskSet contained a field `nodes 312 // map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, 313 // threadGroupNode is embedded in the ThreadGroup it represents. 314 // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose 315 // threadGroupEntry's methods on ThreadGroup to make it implement 316 // threadGroupLinker.) 317 // 318 // +stateify savable 319 type threadGroupNode struct { 320 // pidns is the PID namespace containing the thread group and all of its 321 // member tasks. The pidns pointer is immutable. 322 pidns *PIDNamespace 323 324 // eventQueue is notified whenever a event of interest to Task.Wait occurs 325 // in a child of this thread group, or a ptrace tracee of a task in this 326 // thread group. Events are defined in task_exit.go. 327 // 328 // Note that we cannot check and save this wait queue similarly to other 329 // wait queues, as the queue will not be empty by the time of saving, due 330 // to the wait sourced from Exec(). 331 eventQueue waiter.Queue `state:"nosave"` 332 333 // leader is the thread group's leader, which is the oldest task in the 334 // thread group; usually the last task in the thread group to call 335 // execve(), or if no such task exists then the first task in the thread 336 // group, which was created by a call to fork() or clone() without 337 // CLONE_THREAD. Once a thread group has been made visible to the rest of 338 // the system by TaskSet.newTask, leader is never nil. 339 // 340 // Note that it's possible for the leader to exit without causing the rest 341 // of the thread group to exit; in such a case, leader will still be valid 342 // and non-nil, but leader will not be in tasks. 343 // 344 // leader is protected by the TaskSet mutex. 345 leader *Task 346 347 // If execing is not nil, it is a task in the thread group that has killed 348 // all other tasks so that it can become the thread group leader and 349 // perform an execve. (execing may already be the thread group leader.) 350 // 351 // execing is analogous to Linux's signal_struct::group_exit_task. 352 // 353 // execing is protected by the TaskSet mutex. 354 execing *Task 355 356 // tasks is all tasks in the thread group that have not yet been reaped. 357 // 358 // tasks is protected by both the TaskSet mutex and the signal mutex: 359 // Mutating tasks requires locking the TaskSet mutex for writing *and* 360 // locking the signal mutex. Reading tasks requires locking the TaskSet 361 // mutex *or* locking the signal mutex. 362 tasks taskList 363 364 // tasksCount is the number of tasks in the thread group that have not yet 365 // been reaped; equivalently, tasksCount is the number of tasks in tasks. 366 // 367 // tasksCount is protected by both the TaskSet mutex and the signal mutex, 368 // as with tasks. 369 tasksCount int 370 371 // liveTasks is the number of tasks in the thread group that have not yet 372 // reached TaskExitZombie. 373 // 374 // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). 375 liveTasks int 376 377 // activeTasks is the number of tasks in the thread group that have not yet 378 // reached TaskExitInitiated. 379 // 380 // activeTasks is protected by both the TaskSet mutex and the signal mutex, 381 // as with tasks. 382 activeTasks int 383 } 384 385 // PIDNamespace returns the PID namespace containing tg. 386 func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { 387 return tg.pidns 388 } 389 390 // TaskSet returns the TaskSet containing tg. 391 func (tg *ThreadGroup) TaskSet() *TaskSet { 392 return tg.pidns.owner 393 } 394 395 // Leader returns tg's leader. 396 func (tg *ThreadGroup) Leader() *Task { 397 tg.pidns.owner.mu.RLock() 398 defer tg.pidns.owner.mu.RUnlock() 399 return tg.leader 400 } 401 402 // Count returns the number of non-exited threads in the group. 403 func (tg *ThreadGroup) Count() int { 404 tg.pidns.owner.mu.RLock() 405 defer tg.pidns.owner.mu.RUnlock() 406 var count int 407 for t := tg.tasks.Front(); t != nil; t = t.Next() { 408 count++ 409 } 410 return count 411 } 412 413 // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for 414 // all tasks in tg. 415 func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { 416 tg.pidns.owner.mu.RLock() 417 defer tg.pidns.owner.mu.RUnlock() 418 419 var tasks []ThreadID 420 for t := tg.tasks.Front(); t != nil; t = t.Next() { 421 if id, ok := pidns.tids[t]; ok { 422 tasks = append(tasks, id) 423 } 424 } 425 return tasks 426 } 427 428 // ID returns tg's leader's thread ID in its own PID namespace. If tg's leader 429 // is dead, ID returns 0. 430 func (tg *ThreadGroup) ID() ThreadID { 431 tg.pidns.owner.mu.RLock() 432 id := tg.pidns.tgids[tg] 433 tg.pidns.owner.mu.RUnlock() 434 return id 435 } 436 437 // A taskNode defines the relationship between a task and the rest of the 438 // system. The comments on threadGroupNode also apply to taskNode. 439 // 440 // +stateify savable 441 type taskNode struct { 442 // tg is the thread group that this task belongs to. The tg pointer is 443 // immutable. 444 tg *ThreadGroup `state:"wait"` 445 446 // taskEntry links into tg.tasks. Note that this means that 447 // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread 448 // group. See threadGroupNode.tasks for synchronization info. 449 taskEntry 450 451 // parent is the task's parent. parent may be nil. 452 // 453 // parent is protected by the TaskSet mutex. 454 parent *Task 455 456 // children is this task's children. 457 // 458 // children is protected by the TaskSet mutex. 459 children map[*Task]struct{} 460 461 // If childPIDNamespace is not nil, all new tasks created by this task will 462 // be members of childPIDNamespace rather than this one. (As a corollary, 463 // this task becomes unable to create sibling tasks in the same thread 464 // group.) 465 // 466 // childPIDNamespace is exclusive to the task goroutine. 467 childPIDNamespace *PIDNamespace 468 } 469 470 // ThreadGroup returns the thread group containing t. 471 func (t *Task) ThreadGroup() *ThreadGroup { 472 return t.tg 473 } 474 475 // PIDNamespace returns the PID namespace containing t. 476 func (t *Task) PIDNamespace() *PIDNamespace { 477 return t.tg.pidns 478 } 479 480 // TaskSet returns the TaskSet containing t. 481 func (t *Task) TaskSet() *TaskSet { 482 return t.tg.pidns.owner 483 } 484 485 // Timekeeper returns the system Timekeeper. 486 func (t *Task) Timekeeper() *Timekeeper { 487 return t.k.timekeeper 488 } 489 490 // Parent returns t's parent. 491 func (t *Task) Parent() *Task { 492 t.tg.pidns.owner.mu.RLock() 493 defer t.tg.pidns.owner.mu.RUnlock() 494 return t.parent 495 } 496 497 // ThreadID returns t's thread ID in its own PID namespace. If the task is 498 // dead, ThreadID returns 0. 499 func (t *Task) ThreadID() ThreadID { 500 return t.tg.pidns.IDOfTask(t) 501 } 502 503 // TGIDInRoot returns t's TGID in the root PID namespace. 504 func (t *Task) TGIDInRoot() ThreadID { 505 return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) 506 }