github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_sched.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// CPU scheduling, real and fake.

import (
	"fmt"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
)

// TaskGoroutineState is a coarse representation of the current execution
// status of a kernel.Task goroutine.
type TaskGoroutineState int

const (
	// TaskGoroutineNonexistent indicates that the task goroutine has either
	// not yet been created by Task.Start() or has returned from Task.run().
	// This must be the zero value for TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys indicates that the task goroutine is executing
	// sentry code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp indicates that the task goroutine is executing
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
	// blocked in Task.block(), and hence may be woken by Task.interrupt()
	// (e.g. due to signal delivery).
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
	// woken by Task.interrupt().
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped indicates that the task goroutine is blocked in
	// Task.doStop(). TaskGoroutineStopped is similar to
	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
	// possible to determine when Task.stop is meaningful.
	TaskGoroutineStopped
)

// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
// be read and updated atomically.
//
// +stateify savable
type TaskGoroutineSchedInfo struct {
	// Timestamp was the value of Kernel.cpuClock when this
	// TaskGoroutineSchedInfo was last updated.
	Timestamp uint64

	// State is the current state of the task goroutine.
	State TaskGoroutineState

	// UserTicks is the amount of time the task goroutine has spent executing
	// its associated Task's application code, in units of linux.ClockTick.
	UserTicks uint64

	// SysTicks is the amount of time the task goroutine has spent executing in
	// the sentry, in units of linux.ClockTick.
	SysTicks uint64
}

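// For example (see userTicksAt below): if a task goroutine entered
// TaskGoroutineRunningApp when Kernel.cpuClock read 100 with UserTicks = 5,
// then at cpuClock = 103 its extrapolated user time is 5 + (103 - 100) = 8
// ticks, even though UserTicks itself is only updated on the next state
// change.
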
// userTicksAt returns the extrapolated value of ts.UserTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
// monotonic, this is satisfied if now is the result of a previous call to
// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
// change to t.gosched can cause userTicksAt to adjust stats by too much,
// making the observed stats non-monotonic.
func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
		// Update stats to reflect execution since the last update.
		return ts.UserTicks + (now - ts.Timestamp)
	}
	return ts.UserTicks
}

// sysTicksAt returns the extrapolated value of ts.SysTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: As for userTicksAt.
func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
		return ts.SysTicks + (now - ts.Timestamp)
	}
	return ts.SysTicks
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.gosched.State = state
	t.goschedSeq.EndWrite()

	if state != TaskGoroutineRunningApp {
		// Task is blocking/stopping.
		t.k.decRunningTasks()
	}
}

// Preconditions:
// * The caller must be running on the task goroutine.
// * The caller must be leaving a state indicated by a previous call to
//   t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
	if state != TaskGoroutineRunningApp {
		// Task is unblocking/continuing.
		t.k.incRunningTasks()
	}

	now := t.k.CPUClockNow()
	if t.gosched.State != state {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	if state == TaskGoroutineRunningApp {
		t.gosched.UserTicks += now - t.gosched.Timestamp
	}
	t.gosched.Timestamp = now
	t.gosched.State = TaskGoroutineRunningSys
	t.goschedSeq.EndWrite()
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineRunning() {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.goschedSeq.EndWrite()
}

// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
}

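// Note on synchronization: the accountTaskGoroutine* writers above only
// mutate t.gosched between goschedSeq.BeginWrite and goschedSeq.EndWrite,
// and SeqAtomicLoadTaskGoroutineSchedInfo re-reads t.gosched until it
// observes a copy that did not race with a write, so the returned snapshot
// is never torn.
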
// CPUStats returns the CPU usage statistics of t.
func (t *Task) CPUStats() usage.CPUStats {
	return t.cpuStatsAt(t.k.CPUClockNow())
}

// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
	tsched := t.TaskGoroutineSchedInfo()
	return usage.CPUStats{
		UserTime:          time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
		SysTime:           time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
		VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
	}
}

// CPUStats returns the combined CPU usage statistics of all past and present
// threads in tg.
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	// Hack to get a pointer to the Kernel.
	if tg.leader == nil {
		// Per comment on tg.leader, this is only possible if nothing in the
		// ThreadGroup has ever executed anyway.
		return usage.CPUStats{}
	}
	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}

// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
// * The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
	stats := tg.exitedCPUStats
	// Account for live tasks.
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		stats.Accumulate(t.cpuStatsAt(now))
	}
	return stats
}

// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
// resource usage statistics for all children of [tg] that have terminated and
// been waited for. These statistics will include the resources used by
// grandchildren, and further removed descendants, if all of the intervening
// descendants waited on their terminated children."
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.childCPUStats
}

// taskClock is a ktime.Clock that measures the time that a task has spent
// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
//
// +stateify savable
type taskClock struct {
	t *Task

	// If includeSys is true, the taskClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// taskClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable. TimeUntil wouldn't change its estimation
	// based on either of the clock events, so there's no event to be
	// notified for.
	ktime.NoClockEvents `state:"nosave"`

	// Implements ktime.Clock.WallTimeUntil.
	//
	// As an upper bound, a task's clock cannot advance faster than CPU
	// time. It would have to execute at a rate of more than 1 task-second
	// per 1 CPU-second, which isn't possible.
	ktime.WallRateClock `state:"nosave"`
}

// UserCPUClock returns a clock measuring the CPU time the task has spent
// executing application code.
func (t *Task) UserCPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: false}
}

// CPUClock returns a clock measuring the CPU time the task has spent executing
// application and "kernel" code.
func (t *Task) CPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: true}
}

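// For example, a task whose accumulated stats are UserTime = 1500ms and
// SysTime = 500ms reads 1.5s from UserCPUClock and 2s from CPUClock;
// taskClock.Now below simply sums the relevant components of t.CPUStats().
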
// Now implements ktime.Clock.Now.
func (tc *taskClock) Now() ktime.Time {
	stats := tc.t.CPUStats()
	if tc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// tgClock is a ktime.Clock that measures the time a thread group has spent
// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
//
// +stateify savable
type tgClock struct {
	tg *ThreadGroup

	// If includeSys is true, the tgClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// tgClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable.
	ktime.ClockEventsQueue `state:"nosave"`
}

// Now implements ktime.Clock.Now.
func (tgc *tgClock) Now() ktime.Time {
	stats := tgc.tg.CPUStats()
	if tgc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// WallTimeUntil implements ktime.Clock.WallTimeUntil.
func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
	// Thread group CPU time should not exceed wall time * live tasks, since
	// task goroutines exit after the transition to TaskExitZombie in
	// runExitNotify.
	tgc.tg.pidns.owner.mu.RLock()
	n := tgc.tg.liveTasks
	tgc.tg.pidns.owner.mu.RUnlock()
	if n == 0 {
		if t.Before(now) {
			return 0
		}
		// The timer tick raced with thread group exit, after which no more
		// tasks can enter the thread group. So tgc.Now() will never advance
		// again. Return a large delay; the timer should be stopped long before
		// it comes again anyway.
		return time.Hour
	}
	// This is a lower bound on the amount of time that can elapse before an
	// associated timer expires, so returning this value tends to result in a
	// sequence of closely-spaced ticks just before timer expiry. To avoid
	// this, round up to the nearest ClockTick; CPU usage measurements are
	// limited to this resolution anyway.
	remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
	return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
}

// UserCPUClock returns a ktime.Clock that measures the time that a thread
// group has spent executing.
func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: false}
}

// CPUClock returns a ktime.Clock that measures the time that a thread group
// has spent executing, including sentry time.
func (tg *ThreadGroup) CPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: true}
}

type kernelCPUClockTicker struct {
	k *Kernel

	// These are essentially kernelCPUClockTicker.Notify local variables that
	// are cached between calls to reduce allocations.
	rng *rand.Rand
	tgs []*ThreadGroup
}

func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker {
	return &kernelCPUClockTicker{
		k:   k,
		rng: rand.New(rand.NewSource(rand.Int63())),
	}
}

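// Notify below picks signal recipients by single-element reservoir sampling:
// the i-th eligible task replaces the current candidate with probability 1/i
// (randInt31n(rng, i) == 0), so after k eligible tasks have been scanned each
// one has been selected with probability 1/k.
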
// Notify implements ktime.TimerListener.Notify.
func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
	// Only increment cpuClock by 1 regardless of the number of expirations.
	// This approximately compensates for cases where thread throttling or bad
	// Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and
	// presumably task goroutines as well, from executing for a long period of
	// time. It's also necessary to prevent CPU clocks from seeing large
	// discontinuous jumps.
	now := atomic.AddUint64(&ticker.k.cpuClock, 1)

	// Check thread group CPU timers.
	tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs)
	for _, tg := range tgs {
		if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 {
			continue
		}

		ticker.k.tasks.mu.RLock()
		if tg.leader == nil {
			// No tasks have ever run in this thread group.
			ticker.k.tasks.mu.RUnlock()
			continue
		}
		// Accumulate thread group CPU stats, and randomly select running tasks
		// using reservoir sampling to receive CPU timer signals.
		var virtReceiver *Task
		nrVirtCandidates := 0
		var profReceiver *Task
		nrProfCandidates := 0
		tgUserTime := tg.exitedCPUStats.UserTime
		tgSysTime := tg.exitedCPUStats.SysTime
		for t := tg.tasks.Front(); t != nil; t = t.Next() {
			tsched := t.TaskGoroutineSchedInfo()
			tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
			tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
			switch tsched.State {
			case TaskGoroutineRunningApp:
				// Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
				// timers.
				nrVirtCandidates++
				if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 {
					virtReceiver = t
				}
				fallthrough
			case TaskGoroutineRunningSys:
				// Considered by ITIMER_PROF and RLIMIT_CPU timers.
				nrProfCandidates++
				if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 {
					profReceiver = t
				}
			}
		}
		tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
		tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())

		// All of the following are standard (not real-time) signals, which are
		// automatically deduplicated, so we ignore the number of expirations.
		tg.signalHandlers.mu.Lock()
		// It should only be possible for these timers to advance if we found
		// at least one running task.
		if virtReceiver != nil {
			// ITIMER_VIRTUAL
			newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
			tg.itimerVirtSetting = newItimerVirtSetting
			if exp != 0 {
				virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true)
			}
		}
		if profReceiver != nil {
			// ITIMER_PROF
			newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
			tg.itimerProfSetting = newItimerProfSetting
			if exp != 0 {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true)
			}
			// RLIMIT_CPU soft limit
			newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
			tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
			if exp != 0 {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true)
			}
			// RLIMIT_CPU hard limit
			rlimitCPUMax := tg.limits.Get(limits.CPU).Max
			if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
			}
		}
		tg.signalHandlers.mu.Unlock()

		ticker.k.tasks.mu.RUnlock()
	}

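	// If no tasks are running, the ticker is stopped below; the current
	// setting is cached in cpuClockTickerSetting so that the kernel can
	// re-arm the ticker (see Kernel.incRunningTasks) on the next 0 -> 1
	// transition of runningTasks.
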
	// Retain tgs between calls to Notify to reduce allocations.
	for i := range tgs {
		tgs[i] = nil
	}
	ticker.tgs = tgs[:0]

	// If nothing is running, we can disable the timer.
	tasks := atomic.LoadInt64(&ticker.k.runningTasks)
	if tasks == 0 {
		ticker.k.runningTasksMu.Lock()
		defer ticker.k.runningTasksMu.Unlock()
		tasks := atomic.LoadInt64(&ticker.k.runningTasks)
		if tasks != 0 {
			// Raced with a 0 -> 1 transition.
			return setting, false
		}

		// Stop the timer. We must cache the current setting so the
		// kernel can access it without violating the lock order.
		ticker.k.cpuClockTickerSetting = setting
		ticker.k.cpuClockTickerDisabled = true
		setting.Enabled = false
		return setting, true
	}

	return setting, false
}

// Destroy implements ktime.TimerListener.Destroy.
func (ticker *kernelCPUClockTicker) Destroy() {
}

// randInt31n returns a random integer in [0, n).
//
// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
// See that function for details.
func randInt31n(rng *rand.Rand, n int32) int32 {
	v := rng.Uint32()
	prod := uint64(v) * uint64(n)
	low := uint32(prod)
	if low < uint32(n) {
		thresh := uint32(-n) % uint32(n)
		for low < thresh {
			v = rng.Uint32()
			prod = uint64(v) * uint64(n)
			low = uint32(prod)
		}
	}
	return int32(prod >> 32)
}

// NotifyRlimitCPUUpdated is called by setrlimit.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) NotifyRlimitCPUUpdated() {
	t.k.cpuClockTicker.Atomically(func() {
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		rlimitCPU := t.tg.limits.Get(limits.CPU)
		t.tg.rlimitCPUSoftSetting = ktime.Setting{
			Enabled: rlimitCPU.Cur != limits.Infinity,
			Next:    ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
			Period:  time.Second,
		}
		if rlimitCPU.Max != limits.Infinity {
			// Check if tg is already over the hard limit.
			tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
			tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
			if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
				t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
			}
		}
		t.tg.updateCPUTimersEnabledLocked()
	})
}

// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
	rlimitCPU := tg.limits.Get(limits.CPU)
	if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
		atomic.StoreUint32(&tg.cpuTimersEnabled, 1)
	} else {
		atomic.StoreUint32(&tg.cpuTimersEnabled, 0)
	}
}

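// For example, setrlimit(RLIMIT_CPU) with cur=2 and max=5 causes
// NotifyRlimitCPUUpdated above to arm rlimitCPUSoftSetting so that SIGXCPU
// is delivered when the thread group's combined CPU time first reaches 2s
// and then after each further second of CPU time (Period), while the
// hard-limit check in kernelCPUClockTicker.Notify sends SIGKILL once that
// time reaches 5s.
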
// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
	switch s := t.TaskGoroutineSchedInfo().State; s {
	case TaskGoroutineNonexistent, TaskGoroutineRunningSys:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		switch t.exitState {
		case TaskExitZombie:
			return "Z (zombie)"
		case TaskExitDead:
			return "X (dead)"
		default:
			// The task goroutine can't exit before passing through
			// runExitNotify, so if s == TaskGoroutineNonexistent, the task has
			// been created but the task goroutine hasn't yet started. The
			// Linux equivalent is struct task_struct::state == TASK_NEW
			// (kernel/fork.c:copy_process() =>
			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
			// TASK_RUNNING.
			return "R (running)"
		}
	case TaskGoroutineRunningApp:
		return "R (running)"
	case TaskGoroutineBlockedInterruptible:
		return "S (sleeping)"
	case TaskGoroutineStopped:
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		switch t.stop.(type) {
		case *groupStop:
			return "T (stopped)"
		case *ptraceStop:
			return "t (tracing stop)"
		}
		fallthrough
	case TaskGoroutineBlockedUninterruptible:
		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
		// fs/proc/array.c:task_state_array.
		return "D (disk sleep)"
	default:
		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
	}
}

// CPUMask returns a copy of t's allowed CPU mask.
func (t *Task) CPUMask() sched.CPUSet {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.allowedCPUMask.Copy()
}

// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
// mask.
//
// Preconditions: mask.Size() ==
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
	if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
	}

	// Remove CPUs in mask above Kernel.applicationCores.
	mask.ClearAbove(t.k.applicationCores)

	// Ensure that at least 1 CPU is still allowed.
	if mask.NumCPUs() == 0 {
		return linuxerr.EINVAL
	}

	if t.k.useHostCores {
		// No-op; pretend the mask was immediately changed back.
		return nil
	}

	t.tg.pidns.owner.mu.RLock()
	rootTID := t.tg.pidns.owner.Root.tids[t]
	t.tg.pidns.owner.mu.RUnlock()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.allowedCPUMask = mask
	atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
	return nil
}

// CPU returns the cpu id for a given task.
func (t *Task) CPU() int32 {
	if t.k.useHostCores {
		return int32(hostcpu.GetCPU())
	}

	return atomic.LoadInt32(&t.cpu)
}

// assignCPU returns the virtualized CPU number for the task with global TID
// tid and allowedCPUMask allowed.
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
	// To pretend that threads are evenly distributed to allowed CPUs, choose n
	// to be less than the number of CPUs in allowed ...
	n := int(tid) % int(allowed.NumCPUs())
	// ... then pick the nth CPU in allowed.
	allowed.ForEachCPU(func(c uint) {
		if n--; n == 0 {
			cpu = int32(c)
		}
	})
	return cpu
}

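// For example, with allowed = {1, 3, 5} and tid = 8: n = 8 % 3 = 2, and
// (assuming ForEachCPU visits CPUs in ascending order) the walk decrements n
// at CPU 1 and reaches 0 at CPU 3, so the task reports CPU 3.
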
// Niceness returns t's niceness.
func (t *Task) Niceness() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness
}

// Priority returns t's priority.
func (t *Task) Priority() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness + 20
}

// SetNiceness sets t's niceness to n.
func (t *Task) SetNiceness(n int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.niceness = n
}

// NumaPolicy returns t's current numa policy.
func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.numaPolicy, t.numaNodeMask
}

// SetNumaPolicy sets t's numa policy.
func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.numaPolicy = policy
	t.numaNodeMask = nodeMask
}