github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/threads.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 21 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 22 "github.com/MerlinKodo/gvisor/pkg/sync" 23 "github.com/MerlinKodo/gvisor/pkg/waiter" 24 ) 25 26 // TasksLimit is the maximum number of threads for an untrusted application. 27 // Linux doesn't really limit this directly, rather it is limited by total 28 // memory size, stacks allocated and a global maximum. There's no real reason 29 // for us to limit it either, (esp. since threads are backed by goroutines), 30 // and we would expect to hit resource limits long before hitting this number. 31 // However, for correctness, we still check that the user doesn't exceed this 32 // number. 33 // 34 // Note that because of the way futexes are implemented, there *are* in fact 35 // serious restrictions on valid thread IDs. They are limited to 2^30 - 1 36 // (kernel/fork.c:MAX_THREADS). 37 const TasksLimit = (1 << 16) 38 39 // ThreadID is a generic thread identifier. 40 // 41 // +marshal 42 type ThreadID int32 43 44 // String returns a decimal representation of the ThreadID. 
45 func (tid ThreadID) String() string { 46 return fmt.Sprintf("%d", tid) 47 } 48 49 // initTID is the TID given to the first task added to each PID namespace. The 50 // thread group led by initTID is called the namespace's init process. The 51 // death of a PID namespace's init process causes all tasks visible in that 52 // namespace to be killed. 53 const initTID ThreadID = 1 54 55 // A TaskSet comprises all tasks in a system. 56 // 57 // +stateify savable 58 type TaskSet struct { 59 // mu protects all relationships between tasks and thread groups in the 60 // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) 61 mu taskSetRWMutex `state:"nosave"` 62 63 // Root is the root PID namespace, in which all tasks in the TaskSet are 64 // visible. The Root pointer is immutable. 65 Root *PIDNamespace 66 67 // sessions is the set of all sessions. 68 sessions sessionList 69 70 // stopCount is the number of active external stops applicable to all tasks 71 // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been 72 // paired with a call to TaskSet.EndExternalStop). stopCount is protected 73 // by mu. 74 // 75 // stopCount is not saved for the same reason as Task.stopCount; it is 76 // always reset to zero after restore. 77 stopCount int32 `state:"nosave"` 78 79 // liveGoroutines is the number of non-exited task goroutines in the 80 // TaskSet. 81 // 82 // liveGoroutines is not saved; it is reset as task goroutines are 83 // restarted by Task.Start. 84 liveGoroutines sync.WaitGroup `state:"nosave"` 85 86 // runningGoroutines is the number of running task goroutines in the 87 // TaskSet. 88 // 89 // runningGoroutines is not saved; its counter value is required to be zero 90 // at time of save (but note that this is not necessarily the same thing as 91 // sync.WaitGroup's zero value). 92 runningGoroutines sync.WaitGroup `state:"nosave"` 93 94 // aioGoroutines is the number of goroutines running async I/O 95 // callbacks. 
96 // 97 // aioGoroutines is not saved but is required to be zero at the time of 98 // save. 99 aioGoroutines sync.WaitGroup `state:"nosave"` 100 } 101 102 // newTaskSet returns a new, empty TaskSet. 103 func newTaskSet(pidns *PIDNamespace) *TaskSet { 104 ts := &TaskSet{Root: pidns} 105 pidns.owner = ts 106 return ts 107 } 108 109 // forEachThreadGroupLocked applies f to each thread group in ts. 110 // 111 // Preconditions: ts.mu must be locked (for reading or writing). 112 func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { 113 for tg := range ts.Root.tgids { 114 f(tg) 115 } 116 } 117 118 // forEachTaskLocked applies f to each Task in ts. 119 // 120 // Preconditions: ts.mu must be locked (for reading or writing). 121 func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { 122 for t := range ts.Root.tids { 123 f(t) 124 } 125 } 126 127 // A PIDNamespace represents a PID namespace, a bimap between thread IDs and 128 // tasks. See the pid_namespaces(7) man page for further details. 129 // 130 // N.B. A task is said to be visible in a PID namespace if the PID namespace 131 // contains a thread ID that maps to that task. 132 // 133 // +stateify savable 134 type PIDNamespace struct { 135 // owner is the TaskSet that this PID namespace belongs to. The owner 136 // pointer is immutable. 137 owner *TaskSet 138 139 // parent is the PID namespace of the process that created this one. If 140 // this is the root PID namespace, parent is nil. The parent pointer is 141 // immutable. 142 // 143 // Invariant: All tasks that are visible in this namespace are also visible 144 // in all ancestor namespaces. 145 parent *PIDNamespace 146 147 // userns is the user namespace with which this PID namespace is 148 // associated. Privileged operations on this PID namespace must have 149 // appropriate capabilities in userns. The userns pointer is immutable. 150 userns *auth.UserNamespace 151 152 // id is a unique ID assigned to the PID namespace. id is immutable. 
153 id uint64 154 155 // The following fields are protected by owner.mu. 156 157 // last is the last ThreadID to be allocated in this namespace. 158 last ThreadID 159 160 // tasks is a mapping from ThreadIDs in this namespace to tasks visible in 161 // the namespace. 162 tasks map[ThreadID]*Task 163 164 // tids is a mapping from tasks visible in this namespace to their 165 // identifiers in this namespace. 166 tids map[*Task]ThreadID 167 168 // tgids is a mapping from thread groups visible in this namespace to 169 // their identifiers in this namespace. 170 // 171 // The content of tgids is equivalent to tids[tg.leader]. This exists 172 // primarily as an optimization to quickly find all thread groups. 173 tgids map[*ThreadGroup]ThreadID 174 175 // sessions is a mapping from SessionIDs in this namespace to sessions 176 // visible in the namespace. 177 sessions map[SessionID]*Session 178 179 // sids is a mapping from sessions visible in this namespace to their 180 // identifiers in this namespace. 181 sids map[*Session]SessionID 182 183 // processGroups is a mapping from ProcessGroupIDs in this namespace to 184 // process groups visible in the namespace. 185 processGroups map[ProcessGroupID]*ProcessGroup 186 187 // pgids is a mapping from process groups visible in this namespace to 188 // their identifiers in this namespace. 189 pgids map[*ProcessGroup]ProcessGroupID 190 191 // exiting indicates that the namespace's init process is exiting or has 192 // exited. 193 exiting bool 194 195 // pidNamespaceData contains additional per-PID-namespace data. 
196 extra pidNamespaceData 197 } 198 199 func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { 200 return &PIDNamespace{ 201 owner: ts, 202 parent: parent, 203 userns: userns, 204 id: lastPIDNSID.Add(1), 205 tasks: make(map[ThreadID]*Task), 206 tids: make(map[*Task]ThreadID), 207 tgids: make(map[*ThreadGroup]ThreadID), 208 sessions: make(map[SessionID]*Session), 209 sids: make(map[*Session]SessionID), 210 processGroups: make(map[ProcessGroupID]*ProcessGroup), 211 pgids: make(map[*ProcessGroup]ProcessGroupID), 212 extra: newPIDNamespaceData(), 213 } 214 } 215 216 // lastPIDNSID is the last value of PIDNamespace.ID assigned to a PID 217 // namespace. 218 // 219 // This is global rather than being per-TaskSet or Kernel because 220 // NewRootPIDNamespace() is called before the Kernel is initialized. 221 var lastPIDNSID atomicbitops.Uint64 222 223 // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available 224 // yet when root namespace is created and must be set by caller. 225 func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { 226 return newPIDNamespace(nil, nil, userns) 227 } 228 229 // NewChild returns a new, empty PID namespace that is a child of ns. Authority 230 // over the new PID namespace is controlled by userns. 231 func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { 232 return newPIDNamespace(ns.owner, ns, userns) 233 } 234 235 // TaskWithID returns the task with thread ID tid in PID namespace ns. If no 236 // task has that TID, TaskWithID returns nil. 237 func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { 238 ns.owner.mu.RLock() 239 t := ns.tasks[tid] 240 ns.owner.mu.RUnlock() 241 return t 242 } 243 244 // ID returns a non-zero ID that is unique across PID namespaces. 
245 func (ns *PIDNamespace) ID() uint64 { 246 return ns.id 247 } 248 249 // ThreadGroupWithID returns the thread group led by the task with thread ID 250 // tid in PID namespace ns. If no task has that TID, or if the task with that 251 // TID is not a thread group leader, ThreadGroupWithID returns nil. 252 func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { 253 ns.owner.mu.RLock() 254 defer ns.owner.mu.RUnlock() 255 t := ns.tasks[tid] 256 if t == nil { 257 return nil 258 } 259 if t != t.tg.leader { 260 return nil 261 } 262 return t.tg 263 } 264 265 // IDOfTask returns the TID assigned to the given task in PID namespace ns. If 266 // the task is not visible in that namespace, IDOfTask returns 0. (This return 267 // value is significant in some cases, e.g. getppid() is documented as 268 // returning 0 if the caller's parent is in an ancestor namespace and 269 // consequently not visible to the caller.) If the task is nil, IDOfTask returns 270 // 0. 271 func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { 272 ns.owner.mu.RLock() 273 id := ns.tids[t] 274 ns.owner.mu.RUnlock() 275 return id 276 } 277 278 // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. 279 // If the task is not visible in that namespace, IDOfThreadGroup returns 0. 280 func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { 281 ns.owner.mu.RLock() 282 id := ns.tgids[tg] 283 ns.owner.mu.RUnlock() 284 return id 285 } 286 287 // Tasks returns a snapshot of the tasks in ns. 288 func (ns *PIDNamespace) Tasks() []*Task { 289 ns.owner.mu.RLock() 290 defer ns.owner.mu.RUnlock() 291 tasks := make([]*Task, 0, len(ns.tasks)) 292 for t := range ns.tids { 293 tasks = append(tasks, t) 294 } 295 return tasks 296 } 297 298 // NumTasks returns the number of tasks in ns. 
299 func (ns *PIDNamespace) NumTasks() int { 300 ns.owner.mu.RLock() 301 defer ns.owner.mu.RUnlock() 302 return len(ns.tids) 303 } 304 305 // NumTasksPerContainer returns the number of tasks in ns that belongs to given container. 306 func (ns *PIDNamespace) NumTasksPerContainer(cid string) int { 307 ns.owner.mu.RLock() 308 defer ns.owner.mu.RUnlock() 309 310 tasks := 0 311 for t := range ns.tids { 312 if t.ContainerID() == cid { 313 tasks++ 314 } 315 } 316 return tasks 317 } 318 319 // ThreadGroups returns a snapshot of the thread groups in ns. 320 func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { 321 return ns.ThreadGroupsAppend(nil) 322 } 323 324 // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. 325 func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { 326 ns.owner.mu.RLock() 327 defer ns.owner.mu.RUnlock() 328 for tg := range ns.tgids { 329 tgs = append(tgs, tg) 330 } 331 return tgs 332 } 333 334 // UserNamespace returns the user namespace associated with PID namespace ns. 335 func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { 336 return ns.userns 337 } 338 339 // Root returns the root PID namespace of ns. 340 func (ns *PIDNamespace) Root() *PIDNamespace { 341 return ns.owner.Root 342 } 343 344 // A threadGroupNode defines the relationship between a thread group and the 345 // rest of the system. Conceptually, threadGroupNode is data belonging to the 346 // owning TaskSet, as if TaskSet contained a field `nodes 347 // map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, 348 // threadGroupNode is embedded in the ThreadGroup it represents. 349 // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose 350 // threadGroupEntry's methods on ThreadGroup to make it implement 351 // threadGroupLinker.) 352 // 353 // +stateify savable 354 type threadGroupNode struct { 355 // pidns is the PID namespace containing the thread group and all of its 356 // member tasks. 
The pidns pointer is immutable. 357 pidns *PIDNamespace 358 359 // pidWithinNS the thread ID of the leader of this thread group within pidns. 360 // Useful to avoid using locks when determining a thread group leader's own 361 // TID. 362 pidWithinNS atomicbitops.Int32 363 364 // eventQueue is notified whenever a event of interest to Task.Wait occurs 365 // in a child of this thread group, or a ptrace tracee of a task in this 366 // thread group. Events are defined in task_exit.go. 367 eventQueue waiter.Queue 368 369 // leader is the thread group's leader, which is the oldest task in the 370 // thread group; usually the last task in the thread group to call 371 // execve(), or if no such task exists then the first task in the thread 372 // group, which was created by a call to fork() or clone() without 373 // CLONE_THREAD. Once a thread group has been made visible to the rest of 374 // the system by TaskSet.newTask, leader is never nil. 375 // 376 // Note that it's possible for the leader to exit without causing the rest 377 // of the thread group to exit; in such a case, leader will still be valid 378 // and non-nil, but leader will not be in tasks. 379 // 380 // leader is protected by the TaskSet mutex. 381 leader *Task 382 383 // If execing is not nil, it is a task in the thread group that has killed 384 // all other tasks so that it can become the thread group leader and 385 // perform an execve. (execing may already be the thread group leader.) 386 // 387 // execing is analogous to Linux's signal_struct::group_exit_task. 388 // 389 // execing is protected by the TaskSet mutex. 390 execing *Task 391 392 // tasks is all tasks in the thread group that have not yet been reaped. 393 // 394 // tasks is protected by both the TaskSet mutex and the signal mutex: 395 // Mutating tasks requires locking the TaskSet mutex for writing *and* 396 // locking the signal mutex. Reading tasks requires locking the TaskSet 397 // mutex *or* locking the signal mutex. 
398 tasks taskList 399 400 // tasksCount is the number of tasks in the thread group that have not yet 401 // been reaped; equivalently, tasksCount is the number of tasks in tasks. 402 // 403 // tasksCount is protected by both the TaskSet mutex and the signal mutex, 404 // as with tasks. 405 tasksCount int 406 407 // liveTasks is the number of tasks in the thread group that have not yet 408 // reached TaskExitZombie. 409 // 410 // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). 411 liveTasks int 412 413 // activeTasks is the number of tasks in the thread group that have not yet 414 // reached TaskExitInitiated. 415 // 416 // activeTasks is protected by both the TaskSet mutex and the signal mutex, 417 // as with tasks. 418 activeTasks int 419 } 420 421 // PIDNamespace returns the PID namespace containing tg. 422 func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { 423 return tg.pidns 424 } 425 426 // TaskSet returns the TaskSet containing tg. 427 func (tg *ThreadGroup) TaskSet() *TaskSet { 428 return tg.pidns.owner 429 } 430 431 // Leader returns tg's leader. 432 func (tg *ThreadGroup) Leader() *Task { 433 tg.pidns.owner.mu.RLock() 434 defer tg.pidns.owner.mu.RUnlock() 435 return tg.leader 436 } 437 438 // Count returns the number of non-exited threads in the group. 439 func (tg *ThreadGroup) Count() int { 440 tg.pidns.owner.mu.RLock() 441 defer tg.pidns.owner.mu.RUnlock() 442 var count int 443 for t := tg.tasks.Front(); t != nil; t = t.Next() { 444 count++ 445 } 446 return count 447 } 448 449 // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for 450 // all tasks in tg. 
451 func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { 452 tg.pidns.owner.mu.RLock() 453 defer tg.pidns.owner.mu.RUnlock() 454 455 var tasks []ThreadID 456 for t := tg.tasks.Front(); t != nil; t = t.Next() { 457 if id, ok := pidns.tids[t]; ok { 458 tasks = append(tasks, id) 459 } 460 } 461 return tasks 462 } 463 464 // ID returns tg's leader's thread ID in its own PID namespace. 465 // If tg's leader is dead, ID returns 0. 466 func (tg *ThreadGroup) ID() ThreadID { 467 return ThreadID(tg.pidWithinNS.Load()) 468 } 469 470 // A taskNode defines the relationship between a task and the rest of the 471 // system. The comments on threadGroupNode also apply to taskNode. 472 // 473 // +stateify savable 474 type taskNode struct { 475 // tg is the thread group that this task belongs to. The tg pointer is 476 // immutable. 477 tg *ThreadGroup `state:"wait"` 478 479 // taskEntry links into tg.tasks. Note that this means that 480 // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread 481 // group. See threadGroupNode.tasks for synchronization info. 482 taskEntry 483 484 // parent is the task's parent. parent may be nil. 485 // 486 // parent is protected by the TaskSet mutex. 487 parent *Task 488 489 // children is this task's children. 490 // 491 // children is protected by the TaskSet mutex. 492 children map[*Task]struct{} 493 494 // If childPIDNamespace is not nil, all new tasks created by this task will 495 // be members of childPIDNamespace rather than this one. (As a corollary, 496 // this task becomes unable to create sibling tasks in the same thread 497 // group.) 498 // 499 // childPIDNamespace is exclusive to the task goroutine. 500 childPIDNamespace *PIDNamespace 501 } 502 503 // ThreadGroup returns the thread group containing t. 504 func (t *Task) ThreadGroup() *ThreadGroup { 505 return t.tg 506 } 507 508 // PIDNamespace returns the PID namespace containing t. 
509 func (t *Task) PIDNamespace() *PIDNamespace { 510 return t.tg.pidns 511 } 512 513 // TaskSet returns the TaskSet containing t. 514 func (t *Task) TaskSet() *TaskSet { 515 return t.tg.pidns.owner 516 } 517 518 // Timekeeper returns the system Timekeeper. 519 func (t *Task) Timekeeper() *Timekeeper { 520 return t.k.timekeeper 521 } 522 523 // Parent returns t's parent. 524 func (t *Task) Parent() *Task { 525 t.tg.pidns.owner.mu.RLock() 526 defer t.tg.pidns.owner.mu.RUnlock() 527 return t.parent 528 } 529 530 // ParentLocked returns t's parent. Caller must ensure t's TaskSet mu 531 // is locked for at least reading. 532 // 533 // +checklocks:t.tg.pidns.owner.mu 534 func (t *Task) ParentLocked() *Task { 535 return t.parent 536 } 537 538 // ThreadID returns t's thread ID in its own PID namespace. If the task is 539 // dead, ThreadID returns 0. 540 func (t *Task) ThreadID() ThreadID { 541 return t.tg.pidns.IDOfTask(t) 542 } 543 544 // TGIDInRoot returns t's TGID in the root PID namespace. 545 func (t *Task) TGIDInRoot() ThreadID { 546 return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) 547 } 548 549 // Children returns children of this task. 550 func (t *Task) Children() map[*Task]struct{} { 551 t.tg.pidns.owner.mu.RLock() 552 defer t.tg.pidns.owner.mu.RUnlock() 553 554 children := make(map[*Task]struct{}, len(t.children)) 555 for child, val := range t.children { 556 children[child] = val 557 } 558 559 return children 560 }