github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_sched.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// CPU scheduling, real and fake.

import (
	"fmt"
	"math/rand"
	"time"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostcpu"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
)

// TaskGoroutineState is a coarse representation of the current execution
// status of a kernel.Task goroutine.
type TaskGoroutineState int

const (
	// TaskGoroutineNonexistent indicates that the task goroutine has either
	// not yet been created by Task.Start() or has returned from Task.run().
	// This must be the zero value for TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys indicates that the task goroutine is executing
	// sentry code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp indicates that the task goroutine is executing
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
	// blocked in Task.block(), and hence may be woken by Task.interrupt()
	// (e.g. due to signal delivery).
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
	// woken by Task.interrupt().
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped indicates that the task goroutine is blocked in
	// Task.doStop(). TaskGoroutineStopped is similar to
	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
	// possible to determine when Task.stop is meaningful.
	TaskGoroutineStopped
)

// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
// be read and updated atomically.
//
// +stateify savable
type TaskGoroutineSchedInfo struct {
	// Timestamp was the value of Kernel.cpuClock when this
	// TaskGoroutineSchedInfo was last updated.
	Timestamp uint64

	// State is the current state of the task goroutine.
	State TaskGoroutineState

	// UserTicks is the amount of time the task goroutine has spent executing
	// its associated Task's application code, in units of linux.ClockTick.
	UserTicks uint64

	// SysTicks is the amount of time the task goroutine has spent executing in
	// the sentry, in units of linux.ClockTick.
	SysTicks uint64
}

// userTicksAt returns the extrapolated value of ts.UserTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
// monotonic, this is satisfied if now is the result of a previous call to
// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
// change to t.gosched can cause userTicksAt to adjust stats by too much,
// making the observed stats non-monotonic.
func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
		// Update stats to reflect execution since the last update.
		return ts.UserTicks + (now - ts.Timestamp)
	}
	return ts.UserTicks
}

// sysTicksAt returns the extrapolated value of ts.SysTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: As for userTicksAt.
func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
		return ts.SysTicks + (now - ts.Timestamp)
	}
	return ts.SysTicks
}
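
// For illustration (arbitrary values): with ts.Timestamp == 100, ts.UserTicks
// == 50, and ts.State == TaskGoroutineRunningApp, userTicksAt(110) returns 60,
// attributing the 10 CPU clock ticks since the last update to application
// code; sysTicksAt(110) returns ts.SysTicks unchanged, since the goroutine has
// not been in TaskGoroutineRunningSys during that interval.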

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.gosched.State = state
	t.goschedSeq.EndWrite()

	if state != TaskGoroutineRunningApp {
		// Task is blocking/stopping.
		t.k.decRunningTasks()
	}
}

// Preconditions:
//   - The caller must be running on the task goroutine
//   - The caller must be leaving a state indicated by a previous call to
//     t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
	if state != TaskGoroutineRunningApp {
		// Task is unblocking/continuing.
		t.k.incRunningTasks()
	}

	now := t.k.CPUClockNow()
	if t.gosched.State != state {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	if state == TaskGoroutineRunningApp {
		t.gosched.UserTicks += now - t.gosched.Timestamp
	}
	t.gosched.Timestamp = now
	t.gosched.State = TaskGoroutineRunningSys
	t.goschedSeq.EndWrite()
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineRunning() {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.goschedSeq.EndWrite()
}
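
// A minimal sketch of how the two accounting functions above pair up around a
// blocking region (Task.block is the interruptible-blocking path described by
// the TaskGoroutineBlockedInterruptible state):
//
//	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
//	// ... wait on a channel, timer, or signal ...
//	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
//
// Time spent between Enter and Leave is charged to neither UserTicks nor
// SysTicks, and after Leave the goroutine is again accounted as
// TaskGoroutineRunningSys.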

// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
}

// CPUStats returns the CPU usage statistics of t.
func (t *Task) CPUStats() usage.CPUStats {
	return t.cpuStatsAt(t.k.CPUClockNow())
}

// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
	tsched := t.TaskGoroutineSchedInfo()
	return usage.CPUStats{
		UserTime:          time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
		SysTime:           time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
		VoluntarySwitches: t.yieldCount.Load(),
	}
}
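
// As a worked example (assuming the usual 100 Hz tick, i.e. linux.ClockTick ==
// 10ms): a task that has accumulated 150 user ticks and 30 sys ticks reports
// UserTime == 1.5s and SysTime == 300ms from cpuStatsAt.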

// CPUStats returns the combined CPU usage statistics of all past and present
// threads in tg.
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	// Hack to get a pointer to the Kernel.
	if tg.leader == nil {
		// Per comment on tg.leader, this is only possible if nothing in the
		// ThreadGroup has ever executed anyway.
		return usage.CPUStats{}
	}
	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}

// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
//   - The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
	stats := tg.exitedCPUStats
	// Account for live tasks.
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		stats.Accumulate(t.cpuStatsAt(now))
	}
	return stats
}

// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
// resource usage statistics for all children of [tg] that have terminated and
// been waited for. These statistics will include the resources used by
// grandchildren, and further removed descendants, if all of the intervening
// descendants waited on their terminated children."
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.childCPUStats
}

// taskClock is a ktime.Clock that measures the time that a task has spent
// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
//
// +stateify savable
type taskClock struct {
	t *Task

	// If includeSys is true, the taskClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// taskClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable. TimeUntil wouldn't change its estimation
	// based on either of the clock events, so there's no event to be
	// notified for.
	ktime.NoClockEvents `state:"nosave"`

	// Implements ktime.Clock.WallTimeUntil.
	//
	// As an upper bound, a task's clock cannot advance faster than CPU
	// time. It would have to execute at a rate of more than 1 task-second
	// per 1 CPU-second, which isn't possible.
	ktime.WallRateClock `state:"nosave"`
}

// UserCPUClock returns a clock measuring the CPU time the task has spent
// executing application code.
func (t *Task) UserCPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: false}
}

// CPUClock returns a clock measuring the CPU time the task has spent executing
// application and "kernel" code.
func (t *Task) CPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: true}
}

// Now implements ktime.Clock.Now.
func (tc *taskClock) Now() ktime.Time {
	stats := tc.t.CPUStats()
	if tc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// tgClock is a ktime.Clock that measures the time a thread group has spent
// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
//
// +stateify savable
type tgClock struct {
	tg *ThreadGroup

	// If includeSys is true, the tgClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// tgClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable.
	ktime.ClockEventsQueue `state:"nosave"`
}

// Now implements ktime.Clock.Now.
func (tgc *tgClock) Now() ktime.Time {
	stats := tgc.tg.CPUStats()
	if tgc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// WallTimeUntil implements ktime.Clock.WallTimeUntil.
func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
	// Thread group CPU time should not exceed wall time * live tasks, since
	// task goroutines exit after the transition to TaskExitZombie in
	// runExitNotify.
	tgc.tg.pidns.owner.mu.RLock()
	n := tgc.tg.liveTasks
	tgc.tg.pidns.owner.mu.RUnlock()
	if n == 0 {
		if t.Before(now) {
			return 0
		}
		// The timer tick raced with thread group exit, after which no more
		// tasks can enter the thread group. So tgc.Now() will never advance
		// again. Return a large delay; the timer should be stopped long before
		// it comes again anyway.
		return time.Hour
	}
	// This is a lower bound on the amount of time that can elapse before an
	// associated timer expires, so returning this value tends to result in a
	// sequence of closely-spaced ticks just before timer expiry. To avoid
	// this, round up to the nearest ClockTick; CPU usage measurements are
	// limited to this resolution anyway.
	remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
	return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
}
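
// For example (assuming linux.ClockTick == 10ms): with 4 live tasks and 12ms
// of thread-group CPU time remaining until expiry, remaining is 3ms, which the
// round-up above turns into a 10ms wall delay; the next tick then re-evaluates
// rather than firing a burst of sub-tick wakeups.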

// UserCPUClock returns a ktime.Clock that measures the time that a thread
// group has spent executing.
func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: false}
}

// CPUClock returns a ktime.Clock that measures the time that a thread group
// has spent executing, including sentry time.
func (tg *ThreadGroup) CPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: true}
}

func (k *Kernel) runCPUClockTicker() {
	rng := rand.New(rand.NewSource(rand.Int63()))
	var tgs []*ThreadGroup

	for {
		// Stop the CPU clock while nothing is running.
		if k.runningTasks.Load() == 0 {
			k.runningTasksMu.Lock()
			if k.runningTasks.Load() == 0 {
				k.cpuClockTickerRunning = false
				k.cpuClockTickerStopCond.Broadcast()
				k.runningTasksCond.Wait()
				// k.cpuClockTickerRunning was set to true by our waker
				// (Kernel.incRunningTasks()). For reasons described there, we must
				// process at least one CPU clock tick between calls to
				// k.runningTasksCond.Wait().
			}
			k.runningTasksMu.Unlock()
		}

		// Wait for the next CPU clock tick.
		select {
		case <-k.cpuClockTickTimer.C:
			k.cpuClockTickTimer.Reset(linux.ClockTick)
		case <-k.cpuClockTickerWakeCh:
			continue
		}

		// Advance the CPU clock, and timers based on the CPU clock, atomically
		// under cpuClockMu.
		k.cpuClockMu.Lock()
		now := k.cpuClock.Add(1)

		// Check thread group CPU timers.
		tgs = k.tasks.Root.ThreadGroupsAppend(tgs)
		for _, tg := range tgs {
			if tg.cpuTimersEnabled.Load() == 0 {
				continue
			}

			k.tasks.mu.RLock()
			if tg.leader == nil {
				// No tasks have ever run in this thread group.
				k.tasks.mu.RUnlock()
				continue
			}
			// Accumulate thread group CPU stats, and randomly select running tasks
			// using reservoir sampling to receive CPU timer signals.
			var virtReceiver *Task
			nrVirtCandidates := 0
			var profReceiver *Task
			nrProfCandidates := 0
			tgUserTime := tg.exitedCPUStats.UserTime
			tgSysTime := tg.exitedCPUStats.SysTime
			for t := tg.tasks.Front(); t != nil; t = t.Next() {
				tsched := t.TaskGoroutineSchedInfo()
				tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
				tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
				switch tsched.State {
				case TaskGoroutineRunningApp:
					// Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
					// timers.
					nrVirtCandidates++
					if int(randInt31n(rng, int32(nrVirtCandidates))) == 0 {
						virtReceiver = t
					}
					fallthrough
				case TaskGoroutineRunningSys:
					// Considered by ITIMER_PROF and RLIMIT_CPU timers.
					nrProfCandidates++
					if int(randInt31n(rng, int32(nrProfCandidates))) == 0 {
						profReceiver = t
					}
				}
			}
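
			// The selection above is single-element reservoir sampling: the
			// i-th eligible task replaces the current candidate with
			// probability 1/i, so every eligible task ends up as the receiver
			// with equal probability without a second pass over the list.
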
			tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
			tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())

			// All of the following are standard (not real-time) signals, which are
			// automatically deduplicated, so we ignore the number of expirations.
			tg.signalHandlers.mu.Lock()
			// It should only be possible for these timers to advance if we found
			// at least one running task.
			if virtReceiver != nil {
				// ITIMER_VIRTUAL
				newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
				tg.itimerVirtSetting = newItimerVirtSetting
				if exp != 0 {
					virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true)
				}
			}
			if profReceiver != nil {
				// ITIMER_PROF
				newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
				tg.itimerProfSetting = newItimerProfSetting
				if exp != 0 {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true)
				}
				// RLIMIT_CPU soft limit
				newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
				tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
				if exp != 0 {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true)
				}
				// RLIMIT_CPU hard limit
				rlimitCPUMax := tg.limits.Get(limits.CPU).Max
				if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
				}
			}
			tg.signalHandlers.mu.Unlock()

			k.tasks.mu.RUnlock()
		}

		k.cpuClockMu.Unlock()

		// Retain tgs between calls to Notify to reduce allocations.
		for i := range tgs {
			tgs[i] = nil
		}
		tgs = tgs[:0]
	}
}

// randInt31n returns a random integer in [0, n).
//
// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
// See that function for details.
func randInt31n(rng *rand.Rand, n int32) int32 {
	v := rng.Uint32()
	prod := uint64(v) * uint64(n)
	low := uint32(prod)
	if low < uint32(n) {
		thresh := uint32(-n) % uint32(n)
		for low < thresh {
			v = rng.Uint32()
			prod = uint64(v) * uint64(n)
			low = uint32(prod)
		}
	}
	return int32(prod >> 32)
}
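
// In rough terms, randInt31n maps a 32-bit random value v to (v*n)>>32, which
// is uniform except for a small bias when 2^32 is not a multiple of n; the
// inner loop rejects and redraws the few values that would introduce that
// bias. This is the same multiply-and-reject bounded-random technique that
// math/rand's unexported int31n uses.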

// NotifyRlimitCPUUpdated is called by setrlimit.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) NotifyRlimitCPUUpdated() {
	t.k.cpuClockMu.Lock()
	defer t.k.cpuClockMu.Unlock()
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	rlimitCPU := t.tg.limits.Get(limits.CPU)
	t.tg.rlimitCPUSoftSetting = ktime.Setting{
		Enabled: rlimitCPU.Cur != limits.Infinity,
		Next:    ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
		Period:  time.Second,
	}
	if rlimitCPU.Max != limits.Infinity {
		// Check if tg is already over the hard limit.
		tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
		tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
		if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
			t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
		}
	}
	t.tg.updateCPUTimersEnabledLocked()
}

// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
	rlimitCPU := tg.limits.Get(limits.CPU)
	if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
		tg.cpuTimersEnabled.Store(1)
	} else {
		tg.cpuTimersEnabled.Store(0)
	}
}

// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
	switch s := t.TaskGoroutineSchedInfo().State; s {
	case TaskGoroutineNonexistent, TaskGoroutineRunningSys:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		switch t.exitState {
		case TaskExitZombie:
			return "Z (zombie)"
		case TaskExitDead:
			return "X (dead)"
		default:
			// The task goroutine can't exit before passing through
			// runExitNotify, so if s == TaskGoroutineNonexistent, the task has
			// been created but the task goroutine hasn't yet started. The
			// Linux equivalent is struct task_struct::state == TASK_NEW
			// (kernel/fork.c:copy_process() =>
			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
			// TASK_RUNNING.
			return "R (running)"
		}
	case TaskGoroutineRunningApp:
		return "R (running)"
	case TaskGoroutineBlockedInterruptible:
		return "S (sleeping)"
	case TaskGoroutineStopped:
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		switch t.stop.(type) {
		case *groupStop:
			return "T (stopped)"
		case *ptraceStop:
			return "t (tracing stop)"
		}
		fallthrough
	case TaskGoroutineBlockedUninterruptible:
		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
		// fs/proc/array.c:task_state_array.
		return "D (disk sleep)"
	default:
		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
	}
}

// CPUMask returns a copy of t's allowed CPU mask.
func (t *Task) CPUMask() sched.CPUSet {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.allowedCPUMask.Copy()
}

// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
// mask.
//
// Preconditions: mask.Size() ==
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
	if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
	}

	// Remove CPUs in mask above Kernel.applicationCores.
	mask.ClearAbove(t.k.applicationCores)

	// Ensure that at least 1 CPU is still allowed.
	if mask.NumCPUs() == 0 {
		return linuxerr.EINVAL
	}

	if t.k.useHostCores {
		// No-op; pretend the mask was immediately changed back.
		return nil
	}

	t.tg.pidns.owner.mu.RLock()
	rootTID := t.tg.pidns.owner.Root.tids[t]
	t.tg.pidns.owner.mu.RUnlock()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.allowedCPUMask = mask
	t.cpu.Store(assignCPU(mask, rootTID))
	return nil
}

// CPU returns the cpu id for a given task.
func (t *Task) CPU() int32 {
	if t.k.useHostCores {
		return int32(hostcpu.GetCPU())
	}

	return t.cpu.Load()
}

// assignCPU returns the virtualized CPU number for the task with global TID
// tid and allowedCPUMask allowed.
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
	// To pretend that threads are evenly distributed to allowed CPUs, choose n
	// to be less than the number of CPUs in allowed ...
	n := int(tid) % int(allowed.NumCPUs())
	// ... then pick the nth CPU in allowed.
	allowed.ForEachCPU(func(c uint) {
		if n--; n == 0 {
			cpu = int32(c)
		}
	})
	return cpu
}
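
// For illustration: if allowed contains CPUs {1, 3, 5, 7} and tid == 6, then
// n == 6 % 4 == 2 and the closure above selects the second CPU in the set, so
// the task reports CPU 3; threads with consecutive TIDs thus appear spread
// across the allowed CPUs.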

// Niceness returns t's niceness.
func (t *Task) Niceness() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness
}

// Priority returns t's priority.
func (t *Task) Priority() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness + 20
}

// SetNiceness sets t's niceness to n.
func (t *Task) SetNiceness(n int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.niceness = n
}

// NumaPolicy returns t's current numa policy.
func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.numaPolicy, t.numaNodeMask
}

// SetNumaPolicy sets t's numa policy.
func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.numaPolicy = policy
	t.numaNodeMask = nodeMask
}