github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/kernel/threads.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops" 21 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth" 22 "github.com/ttpreport/gvisor-ligolo/pkg/sync" 23 "github.com/ttpreport/gvisor-ligolo/pkg/waiter" 24 ) 25 26 // TasksLimit is the maximum number of threads for untrusted application. 27 // Linux doesn't really limit this directly, rather it is limited by total 28 // memory size, stacks allocated and a global maximum. There's no real reason 29 // for us to limit it either, (esp. since threads are backed by go routines), 30 // and we would expect to hit resource limits long before hitting this number. 31 // However, for correctness, we still check that the user doesn't exceed this 32 // number. 33 // 34 // Note that because of the way futexes are implemented, there *are* in fact 35 // serious restrictions on valid thread IDs. They are limited to 2^30 - 1 36 // (kernel/fork.c:MAX_THREADS). 37 const TasksLimit = (1 << 16) 38 39 // ThreadID is a generic thread identifier. 40 // 41 // +marshal 42 type ThreadID int32 43 44 // String returns a decimal representation of the ThreadID. 45 func (tid ThreadID) String() string { 46 return fmt.Sprintf("%d", tid) 47 } 48 49 // initTID is the TID given to the first task added to each PID namespace. The 50 // thread group led by initTID is called the namespace's init process. The 51 // death of a PID namespace's init process causes all tasks visible in that 52 // namespace to be killed. 53 const initTID ThreadID = 1 54 55 // A TaskSet comprises all tasks in a system. 56 // 57 // +stateify savable 58 type TaskSet struct { 59 // mu protects all relationships between tasks and thread groups in the 60 // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) 61 mu taskSetRWMutex `state:"nosave"` 62 63 // Root is the root PID namespace, in which all tasks in the TaskSet are 64 // visible. The Root pointer is immutable. 65 Root *PIDNamespace 66 67 // sessions is the set of all sessions. 68 sessions sessionList 69 70 // stopCount is the number of active external stops applicable to all tasks 71 // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been 72 // paired with a call to TaskSet.EndExternalStop). stopCount is protected 73 // by mu. 74 // 75 // stopCount is not saved for the same reason as Task.stopCount; it is 76 // always reset to zero after restore. 77 stopCount int32 `state:"nosave"` 78 79 // liveGoroutines is the number of non-exited task goroutines in the 80 // TaskSet. 81 // 82 // liveGoroutines is not saved; it is reset as task goroutines are 83 // restarted by Task.Start. 84 liveGoroutines sync.WaitGroup `state:"nosave"` 85 86 // runningGoroutines is the number of running task goroutines in the 87 // TaskSet. 88 // 89 // runningGoroutines is not saved; its counter value is required to be zero 90 // at time of save (but note that this is not necessarily the same thing as 91 // sync.WaitGroup's zero value). 92 runningGoroutines sync.WaitGroup `state:"nosave"` 93 94 // aioGoroutines is the number of goroutines running async I/O 95 // callbacks. 96 // 97 // aioGoroutines is not saved but is required to be zero at the time of 98 // save. 99 aioGoroutines sync.WaitGroup `state:"nosave"` 100 } 101 102 // newTaskSet returns a new, empty TaskSet. 103 func newTaskSet(pidns *PIDNamespace) *TaskSet { 104 ts := &TaskSet{Root: pidns} 105 pidns.owner = ts 106 return ts 107 } 108 109 // forEachThreadGroupLocked applies f to each thread group in ts. 110 // 111 // Preconditions: ts.mu must be locked (for reading or writing). 112 func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { 113 for tg := range ts.Root.tgids { 114 f(tg) 115 } 116 } 117 118 // forEachTaskLocked applies f to each Task in ts. 119 // 120 // Preconditions: ts.mu must be locked (for reading or writing). 121 func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { 122 for t := range ts.Root.tids { 123 f(t) 124 } 125 } 126 127 // A PIDNamespace represents a PID namespace, a bimap between thread IDs and 128 // tasks. See the pid_namespaces(7) man page for further details. 129 // 130 // N.B. A task is said to be visible in a PID namespace if the PID namespace 131 // contains a thread ID that maps to that task. 132 // 133 // +stateify savable 134 type PIDNamespace struct { 135 // owner is the TaskSet that this PID namespace belongs to. The owner 136 // pointer is immutable. 137 owner *TaskSet 138 139 // parent is the PID namespace of the process that created this one. If 140 // this is the root PID namespace, parent is nil. The parent pointer is 141 // immutable. 142 // 143 // Invariant: All tasks that are visible in this namespace are also visible 144 // in all ancestor namespaces. 145 parent *PIDNamespace 146 147 // userns is the user namespace with which this PID namespace is 148 // associated. Privileged operations on this PID namespace must have 149 // appropriate capabilities in userns. The userns pointer is immutable. 150 userns *auth.UserNamespace 151 152 // id is a unique ID assigned to the PID namespace. id is immutable. 153 id uint64 154 155 // The following fields are protected by owner.mu. 156 157 // last is the last ThreadID to be allocated in this namespace. 158 last ThreadID 159 160 // tasks is a mapping from ThreadIDs in this namespace to tasks visible in 161 // the namespace. 162 tasks map[ThreadID]*Task 163 164 // tids is a mapping from tasks visible in this namespace to their 165 // identifiers in this namespace. 166 tids map[*Task]ThreadID 167 168 // tgids is a mapping from thread groups visible in this namespace to 169 // their identifiers in this namespace. 170 // 171 // The content of tgids is equivalent to tids[tg.leader]. This exists 172 // primarily as an optimization to quickly find all thread groups. 173 tgids map[*ThreadGroup]ThreadID 174 175 // sessions is a mapping from SessionIDs in this namespace to sessions 176 // visible in the namespace. 177 sessions map[SessionID]*Session 178 179 // sids is a mapping from sessions visible in this namespace to their 180 // identifiers in this namespace. 181 sids map[*Session]SessionID 182 183 // processGroups is a mapping from ProcessGroupIDs in this namespace to 184 // process groups visible in the namespace. 185 processGroups map[ProcessGroupID]*ProcessGroup 186 187 // pgids is a mapping from process groups visible in this namespace to 188 // their identifiers in this namespace. 189 pgids map[*ProcessGroup]ProcessGroupID 190 191 // exiting indicates that the namespace's init process is exiting or has 192 // exited. 193 exiting bool 194 195 // pidNamespaceData contains additional per-PID-namespace data. 196 extra pidNamespaceData 197 } 198 199 func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { 200 return &PIDNamespace{ 201 owner: ts, 202 parent: parent, 203 userns: userns, 204 id: lastPIDNSID.Add(1), 205 tasks: make(map[ThreadID]*Task), 206 tids: make(map[*Task]ThreadID), 207 tgids: make(map[*ThreadGroup]ThreadID), 208 sessions: make(map[SessionID]*Session), 209 sids: make(map[*Session]SessionID), 210 processGroups: make(map[ProcessGroupID]*ProcessGroup), 211 pgids: make(map[*ProcessGroup]ProcessGroupID), 212 extra: newPIDNamespaceData(), 213 } 214 } 215 216 // lastPIDNSID is the last value of PIDNamespace.ID assigned to a PID 217 // namespace. 218 // 219 // This is global rather than being per-TaskSet or Kernel because 220 // NewRootPIDNamespace() is called before the Kernel is initialized. 221 var lastPIDNSID atomicbitops.Uint64 222 223 // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available 224 // yet when root namespace is created and must be set by caller. 225 func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { 226 return newPIDNamespace(nil, nil, userns) 227 } 228 229 // NewChild returns a new, empty PID namespace that is a child of ns. Authority 230 // over the new PID namespace is controlled by userns. 231 func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { 232 return newPIDNamespace(ns.owner, ns, userns) 233 } 234 235 // TaskWithID returns the task with thread ID tid in PID namespace ns. If no 236 // task has that TID, TaskWithID returns nil. 237 func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { 238 ns.owner.mu.RLock() 239 t := ns.tasks[tid] 240 ns.owner.mu.RUnlock() 241 return t 242 } 243 244 // ID returns a non-zero ID that is unique across PID namespaces. 245 func (ns *PIDNamespace) ID() uint64 { 246 return ns.id 247 } 248 249 // ThreadGroupWithID returns the thread group led by the task with thread ID 250 // tid in PID namespace ns. If no task has that TID, or if the task with that 251 // TID is not a thread group leader, ThreadGroupWithID returns nil. 252 func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { 253 ns.owner.mu.RLock() 254 defer ns.owner.mu.RUnlock() 255 t := ns.tasks[tid] 256 if t == nil { 257 return nil 258 } 259 if t != t.tg.leader { 260 return nil 261 } 262 return t.tg 263 } 264 265 // IDOfTask returns the TID assigned to the given task in PID namespace ns. If 266 // the task is not visible in that namespace, IDOfTask returns 0. (This return 267 // value is significant in some cases, e.g. getppid() is documented as 268 // returning 0 if the caller's parent is in an ancestor namespace and 269 // consequently not visible to the caller.) If the task is nil, IDOfTask returns 270 // 0. 271 func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { 272 ns.owner.mu.RLock() 273 id := ns.tids[t] 274 ns.owner.mu.RUnlock() 275 return id 276 } 277 278 // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. 279 // If the task is not visible in that namespace, IDOfThreadGroup returns 0. 280 func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { 281 ns.owner.mu.RLock() 282 id := ns.tgids[tg] 283 ns.owner.mu.RUnlock() 284 return id 285 } 286 287 // Tasks returns a snapshot of the tasks in ns. 288 func (ns *PIDNamespace) Tasks() []*Task { 289 ns.owner.mu.RLock() 290 defer ns.owner.mu.RUnlock() 291 tasks := make([]*Task, 0, len(ns.tasks)) 292 for t := range ns.tids { 293 tasks = append(tasks, t) 294 } 295 return tasks 296 } 297 298 // NumTasks returns the number of tasks in ns. 299 func (ns *PIDNamespace) NumTasks() int { 300 ns.owner.mu.RLock() 301 defer ns.owner.mu.RUnlock() 302 return len(ns.tids) 303 } 304 305 // ThreadGroups returns a snapshot of the thread groups in ns. 306 func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { 307 return ns.ThreadGroupsAppend(nil) 308 } 309 310 // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. 311 func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { 312 ns.owner.mu.RLock() 313 defer ns.owner.mu.RUnlock() 314 for tg := range ns.tgids { 315 tgs = append(tgs, tg) 316 } 317 return tgs 318 } 319 320 // UserNamespace returns the user namespace associated with PID namespace ns. 321 func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { 322 return ns.userns 323 } 324 325 // Root returns the root PID namespace of ns. 326 func (ns *PIDNamespace) Root() *PIDNamespace { 327 return ns.owner.Root 328 } 329 330 // A threadGroupNode defines the relationship between a thread group and the 331 // rest of the system. Conceptually, threadGroupNode is data belonging to the 332 // owning TaskSet, as if TaskSet contained a field `nodes 333 // map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, 334 // threadGroupNode is embedded in the ThreadGroup it represents. 335 // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose 336 // threadGroupEntry's methods on ThreadGroup to make it implement 337 // threadGroupLinker.) 338 // 339 // +stateify savable 340 type threadGroupNode struct { 341 // pidns is the PID namespace containing the thread group and all of its 342 // member tasks. The pidns pointer is immutable. 343 pidns *PIDNamespace 344 345 // pidWithinNS the thread ID of the leader of this thread group within pidns. 346 // Useful to avoid using locks when determining a thread group leader's own 347 // TID. 348 pidWithinNS atomicbitops.Int32 349 350 // eventQueue is notified whenever a event of interest to Task.Wait occurs 351 // in a child of this thread group, or a ptrace tracee of a task in this 352 // thread group. Events are defined in task_exit.go. 353 eventQueue waiter.Queue 354 355 // leader is the thread group's leader, which is the oldest task in the 356 // thread group; usually the last task in the thread group to call 357 // execve(), or if no such task exists then the first task in the thread 358 // group, which was created by a call to fork() or clone() without 359 // CLONE_THREAD. Once a thread group has been made visible to the rest of 360 // the system by TaskSet.newTask, leader is never nil. 361 // 362 // Note that it's possible for the leader to exit without causing the rest 363 // of the thread group to exit; in such a case, leader will still be valid 364 // and non-nil, but leader will not be in tasks. 365 // 366 // leader is protected by the TaskSet mutex. 367 leader *Task 368 369 // If execing is not nil, it is a task in the thread group that has killed 370 // all other tasks so that it can become the thread group leader and 371 // perform an execve. (execing may already be the thread group leader.) 372 // 373 // execing is analogous to Linux's signal_struct::group_exit_task. 374 // 375 // execing is protected by the TaskSet mutex. 376 execing *Task 377 378 // tasks is all tasks in the thread group that have not yet been reaped. 379 // 380 // tasks is protected by both the TaskSet mutex and the signal mutex: 381 // Mutating tasks requires locking the TaskSet mutex for writing *and* 382 // locking the signal mutex. Reading tasks requires locking the TaskSet 383 // mutex *or* locking the signal mutex. 384 tasks taskList 385 386 // tasksCount is the number of tasks in the thread group that have not yet 387 // been reaped; equivalently, tasksCount is the number of tasks in tasks. 388 // 389 // tasksCount is protected by both the TaskSet mutex and the signal mutex, 390 // as with tasks. 391 tasksCount int 392 393 // liveTasks is the number of tasks in the thread group that have not yet 394 // reached TaskExitZombie. 395 // 396 // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). 397 liveTasks int 398 399 // activeTasks is the number of tasks in the thread group that have not yet 400 // reached TaskExitInitiated. 401 // 402 // activeTasks is protected by both the TaskSet mutex and the signal mutex, 403 // as with tasks. 404 activeTasks int 405 } 406 407 // PIDNamespace returns the PID namespace containing tg. 408 func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { 409 return tg.pidns 410 } 411 412 // TaskSet returns the TaskSet containing tg. 413 func (tg *ThreadGroup) TaskSet() *TaskSet { 414 return tg.pidns.owner 415 } 416 417 // Leader returns tg's leader. 418 func (tg *ThreadGroup) Leader() *Task { 419 tg.pidns.owner.mu.RLock() 420 defer tg.pidns.owner.mu.RUnlock() 421 return tg.leader 422 } 423 424 // Count returns the number of non-exited threads in the group. 425 func (tg *ThreadGroup) Count() int { 426 tg.pidns.owner.mu.RLock() 427 defer tg.pidns.owner.mu.RUnlock() 428 var count int 429 for t := tg.tasks.Front(); t != nil; t = t.Next() { 430 count++ 431 } 432 return count 433 } 434 435 // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for 436 // all tasks in tg. 437 func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { 438 tg.pidns.owner.mu.RLock() 439 defer tg.pidns.owner.mu.RUnlock() 440 441 var tasks []ThreadID 442 for t := tg.tasks.Front(); t != nil; t = t.Next() { 443 if id, ok := pidns.tids[t]; ok { 444 tasks = append(tasks, id) 445 } 446 } 447 return tasks 448 } 449 450 // ID returns tg's leader's thread ID in its own PID namespace. 451 // If tg's leader is dead, ID returns 0. 452 func (tg *ThreadGroup) ID() ThreadID { 453 return ThreadID(tg.pidWithinNS.Load()) 454 } 455 456 // A taskNode defines the relationship between a task and the rest of the 457 // system. The comments on threadGroupNode also apply to taskNode. 458 // 459 // +stateify savable 460 type taskNode struct { 461 // tg is the thread group that this task belongs to. The tg pointer is 462 // immutable. 463 tg *ThreadGroup `state:"wait"` 464 465 // taskEntry links into tg.tasks. Note that this means that 466 // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread 467 // group. See threadGroupNode.tasks for synchronization info. 468 taskEntry 469 470 // parent is the task's parent. parent may be nil. 471 // 472 // parent is protected by the TaskSet mutex. 473 parent *Task 474 475 // children is this task's children. 476 // 477 // children is protected by the TaskSet mutex. 478 children map[*Task]struct{} 479 480 // If childPIDNamespace is not nil, all new tasks created by this task will 481 // be members of childPIDNamespace rather than this one. (As a corollary, 482 // this task becomes unable to create sibling tasks in the same thread 483 // group.) 484 // 485 // childPIDNamespace is exclusive to the task goroutine. 486 childPIDNamespace *PIDNamespace 487 } 488 489 // ThreadGroup returns the thread group containing t. 490 func (t *Task) ThreadGroup() *ThreadGroup { 491 return t.tg 492 } 493 494 // PIDNamespace returns the PID namespace containing t. 495 func (t *Task) PIDNamespace() *PIDNamespace { 496 return t.tg.pidns 497 } 498 499 // TaskSet returns the TaskSet containing t. 500 func (t *Task) TaskSet() *TaskSet { 501 return t.tg.pidns.owner 502 } 503 504 // Timekeeper returns the system Timekeeper. 505 func (t *Task) Timekeeper() *Timekeeper { 506 return t.k.timekeeper 507 } 508 509 // Parent returns t's parent. 510 func (t *Task) Parent() *Task { 511 t.tg.pidns.owner.mu.RLock() 512 defer t.tg.pidns.owner.mu.RUnlock() 513 return t.parent 514 } 515 516 // ParentLocked returns t's parent. Caller must ensure t's TaskSet mu 517 // is locked for at least reading. 518 // 519 // +checklocks:t.tg.pidns.owner.mu 520 func (t *Task) ParentLocked() *Task { 521 return t.parent 522 } 523 524 // ThreadID returns t's thread ID in its own PID namespace. If the task is 525 // dead, ThreadID returns 0. 526 func (t *Task) ThreadID() ThreadID { 527 return t.tg.pidns.IDOfTask(t) 528 } 529 530 // TGIDInRoot returns t's TGID in the root PID namespace. 531 func (t *Task) TGIDInRoot() ThreadID { 532 return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) 533 }