github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_sched.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// CPU scheduling, real and fake.

import (
	"fmt"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
)

// TaskGoroutineState is a coarse representation of the current execution
// status of a kernel.Task goroutine.
type TaskGoroutineState int

const (
	// TaskGoroutineNonexistent indicates that the task goroutine has either
	// not yet been created by Task.Start() or has returned from Task.run().
	// This must be the zero value for TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys indicates that the task goroutine is executing
	// sentry code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp indicates that the task goroutine is executing
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
	// blocked in Task.block(), and hence may be woken by Task.interrupt()
	// (e.g. due to signal delivery).
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
	// woken by Task.interrupt().
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped indicates that the task goroutine is blocked in
	// Task.doStop(). TaskGoroutineStopped is similar to
	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
	// possible to determine when Task.stop is meaningful.
	TaskGoroutineStopped
)

// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
// be read and updated atomically.
//
// +stateify savable
type TaskGoroutineSchedInfo struct {
	// Timestamp was the value of Kernel.cpuClock when this
	// TaskGoroutineSchedInfo was last updated.
	Timestamp uint64

	// State is the current state of the task goroutine.
	State TaskGoroutineState

	// UserTicks is the amount of time the task goroutine has spent executing
	// its associated Task's application code, in units of linux.ClockTick.
	UserTicks uint64

	// SysTicks is the amount of time the task goroutine has spent executing in
	// the sentry, in units of linux.ClockTick.
	SysTicks uint64
}

// userTicksAt returns the extrapolated value of ts.UserTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
// monotonic, this is satisfied if now is the result of a previous call to
// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
// change to t.gosched can cause userTicksAt to adjust stats by too much,
// making the observed stats non-monotonic.
func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
		// Update stats to reflect execution since the last update.
		return ts.UserTicks + (now - ts.Timestamp)
	}
	return ts.UserTicks
}

// sysTicksAt returns the extrapolated value of ts.SysTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: As for userTicksAt.
func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
		return ts.SysTicks + (now - ts.Timestamp)
	}
	return ts.SysTicks
}
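
// For example, if the goroutine entered TaskGoroutineRunningApp with
// UserTicks = 40 at Timestamp = 100 and Kernel.cpuClock has since advanced to
// now = 103, userTicksAt(103) returns 40 + (103 - 100) = 43, crediting the
// application execution that has not yet been folded back into UserTicks by
// accountTaskGoroutineLeave.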

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.gosched.State = state
	t.goschedSeq.EndWrite()

	if state != TaskGoroutineRunningApp {
		// Task is blocking/stopping.
		t.k.decRunningTasks()
	}
}

// Preconditions:
// * The caller must be running on the task goroutine
// * The caller must be leaving a state indicated by a previous call to
//   t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
	if state != TaskGoroutineRunningApp {
		// Task is unblocking/continuing.
		t.k.incRunningTasks()
	}

	now := t.k.CPUClockNow()
	if t.gosched.State != state {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	if state == TaskGoroutineRunningApp {
		t.gosched.UserTicks += now - t.gosched.Timestamp
	}
	t.gosched.Timestamp = now
	t.gosched.State = TaskGoroutineRunningSys
	t.goschedSeq.EndWrite()
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineRunning() {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.goschedSeq.EndWrite()
}

// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
}
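
// SeqAtomicLoadTaskGoroutineSchedInfo is a generated seqcount-protected
// loader: conceptually it re-reads t.gosched until the read does not overlap
// a writer's BeginWrite/EndWrite critical section on t.goschedSeq, yielding a
// consistent snapshot without taking a lock.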

// CPUStats returns the CPU usage statistics of t.
func (t *Task) CPUStats() usage.CPUStats {
	return t.cpuStatsAt(t.k.CPUClockNow())
}

// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
	tsched := t.TaskGoroutineSchedInfo()
	return usage.CPUStats{
		UserTime:          time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
		SysTime:           time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
		VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
	}
}
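
// For example, with a 10ms linux.ClockTick, 250 user ticks and 40 sys ticks
// translate to UserTime = 2.5s and SysTime = 400ms; VoluntarySwitches simply
// reports the task's current yield count.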

// CPUStats returns the combined CPU usage statistics of all past and present
// threads in tg.
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	// Hack to get a pointer to the Kernel.
	if tg.leader == nil {
		// Per comment on tg.leader, this is only possible if nothing in the
		// ThreadGroup has ever executed anyway.
		return usage.CPUStats{}
	}
	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}

// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
// * The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
	stats := tg.exitedCPUStats
	// Account for live tasks.
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		stats.Accumulate(t.cpuStatsAt(now))
	}
	return stats
}

// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
// resource usage statistics for all children of [tg] that have terminated and
// been waited for. These statistics will include the resources used by
// grandchildren, and further removed descendants, if all of the intervening
// descendants waited on their terminated children."
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.childCPUStats
}

// taskClock is a ktime.Clock that measures the time that a task has spent
// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
//
// +stateify savable
type taskClock struct {
	t *Task

	// If includeSys is true, the taskClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// taskClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable. TimeUntil wouldn't change its estimation
	// based on either of the clock events, so there's no event to be
	// notified for.
	ktime.NoClockEvents `state:"nosave"`

	// Implements ktime.Clock.WallTimeUntil.
	//
	// As an upper bound, a task's clock cannot advance faster than CPU
	// time. It would have to execute at a rate of more than 1 task-second
	// per 1 CPU-second, which isn't possible.
	ktime.WallRateClock `state:"nosave"`
}

// UserCPUClock returns a clock measuring the CPU time the task has spent
// executing application code.
func (t *Task) UserCPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: false}
}

// CPUClock returns a clock measuring the CPU time the task has spent executing
// application and "kernel" code.
func (t *Task) CPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: true}
}

// Now implements ktime.Clock.Now.
func (tc *taskClock) Now() ktime.Time {
	stats := tc.t.CPUStats()
	if tc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// tgClock is a ktime.Clock that measures the time a thread group has spent
// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
//
// +stateify savable
type tgClock struct {
	tg *ThreadGroup

	// If includeSys is true, the tgClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// tgClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable.
	ktime.ClockEventsQueue `state:"nosave"`
}

// Now implements ktime.Clock.Now.
func (tgc *tgClock) Now() ktime.Time {
	stats := tgc.tg.CPUStats()
	if tgc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// WallTimeUntil implements ktime.Clock.WallTimeUntil.
func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
	// Thread group CPU time should not exceed wall time * live tasks, since
	// task goroutines exit after the transition to TaskExitZombie in
	// runExitNotify.
	tgc.tg.pidns.owner.mu.RLock()
	n := tgc.tg.liveTasks
	tgc.tg.pidns.owner.mu.RUnlock()
	if n == 0 {
		if t.Before(now) {
			return 0
		}
		// The timer tick raced with thread group exit, after which no more
		// tasks can enter the thread group. So tgc.Now() will never advance
		// again. Return a large delay; the timer should be stopped long before
		// it comes again anyway.
		return time.Hour
	}
	// This is a lower bound on the amount of time that can elapse before an
	// associated timer expires, so returning this value tends to result in a
	// sequence of closely-spaced ticks just before timer expiry. To avoid
	// this, round up to the nearest ClockTick; CPU usage measurements are
	// limited to this resolution anyway.
	remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
	return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
}
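
// For example, with n = 4 live tasks, a timer 30ms of thread-group CPU time
// away from expiry, and a 10ms linux.ClockTick: remaining = 30ms / 4 = 7.5ms,
// which rounds up to a full 10ms tick, so the timer goroutine sleeps at least
// one clock tick rather than re-firing every few milliseconds near expiry.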

// UserCPUClock returns a ktime.Clock that measures the time that a thread
// group has spent executing application code.
func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: false}
}

// CPUClock returns a ktime.Clock that measures the time that a thread group
// has spent executing, including sentry time.
func (tg *ThreadGroup) CPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: true}
}

type kernelCPUClockTicker struct {
	k *Kernel

	// These are essentially kernelCPUClockTicker.Notify local variables that
	// are cached between calls to reduce allocations.
	rng *rand.Rand
	tgs []*ThreadGroup
}

func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker {
	return &kernelCPUClockTicker{
		k:   k,
		rng: rand.New(rand.NewSource(rand.Int63())),
	}
}

// Notify implements ktime.TimerListener.Notify.
func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
	// Only increment cpuClock by 1 regardless of the number of expirations.
	// This approximately compensates for cases where thread throttling or bad
	// Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and
	// presumably task goroutines as well, from executing for a long period of
	// time. It's also necessary to prevent CPU clocks from seeing large
	// discontinuous jumps.
	now := atomic.AddUint64(&ticker.k.cpuClock, 1)

	// Check thread group CPU timers.
	tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs)
	for _, tg := range tgs {
		if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 {
			continue
		}

		ticker.k.tasks.mu.RLock()
		if tg.leader == nil {
			// No tasks have ever run in this thread group.
			ticker.k.tasks.mu.RUnlock()
			continue
		}
		// Accumulate thread group CPU stats, and randomly select running tasks
		// using reservoir sampling to receive CPU timer signals.
		var virtReceiver *Task
		nrVirtCandidates := 0
		var profReceiver *Task
		nrProfCandidates := 0
		tgUserTime := tg.exitedCPUStats.UserTime
		tgSysTime := tg.exitedCPUStats.SysTime
		for t := tg.tasks.Front(); t != nil; t = t.Next() {
			tsched := t.TaskGoroutineSchedInfo()
			tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
			tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
			switch tsched.State {
			case TaskGoroutineRunningApp:
				// Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
				// timers.
				nrVirtCandidates++
				if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 {
					virtReceiver = t
				}
				fallthrough
			case TaskGoroutineRunningSys:
				// Considered by ITIMER_PROF and RLIMIT_CPU timers.
				nrProfCandidates++
				if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 {
					profReceiver = t
				}
			}
		}
		tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
		tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())

		// All of the following are standard (not real-time) signals, which are
		// automatically deduplicated, so we ignore the number of expirations.
		tg.signalHandlers.mu.Lock()
		// It should only be possible for these timers to advance if we found
		// at least one running task.
		if virtReceiver != nil {
			// ITIMER_VIRTUAL
			newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
			tg.itimerVirtSetting = newItimerVirtSetting
			if exp != 0 {
				virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true)
			}
		}
		if profReceiver != nil {
			// ITIMER_PROF
			newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
			tg.itimerProfSetting = newItimerProfSetting
			if exp != 0 {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true)
			}
			// RLIMIT_CPU soft limit
			newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
			tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
			if exp != 0 {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true)
			}
			// RLIMIT_CPU hard limit
			rlimitCPUMax := tg.limits.Get(limits.CPU).Max
			if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
				profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
			}
		}
		tg.signalHandlers.mu.Unlock()

		ticker.k.tasks.mu.RUnlock()
	}

	// Retain tgs between calls to Notify to reduce allocations.
	for i := range tgs {
		tgs[i] = nil
	}
	ticker.tgs = tgs[:0]

	// If nothing is running, we can disable the timer.
	tasks := atomic.LoadInt64(&ticker.k.runningTasks)
	if tasks == 0 {
		ticker.k.runningTasksMu.Lock()
		defer ticker.k.runningTasksMu.Unlock()
		tasks := atomic.LoadInt64(&ticker.k.runningTasks)
		if tasks != 0 {
			// Raced with a 0 -> 1 transition.
			return setting, false
		}

		// Stop the timer. We must cache the current setting so the
		// kernel can access it without violating the lock order.
		ticker.k.cpuClockTickerSetting = setting
		ticker.k.cpuClockTickerDisabled = true
		setting.Enabled = false
		return setting, true
	}

	return setting, false
}

// Destroy implements ktime.TimerListener.Destroy.
func (ticker *kernelCPUClockTicker) Destroy() {
}

// randInt31n returns a random integer in [0, n).
//
// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
// See that function for details.
func randInt31n(rng *rand.Rand, n int32) int32 {
	v := rng.Uint32()
	prod := uint64(v) * uint64(n)
	low := uint32(prod)
	if low < uint32(n) {
		thresh := uint32(-n) % uint32(n)
		for low < thresh {
			v = rng.Uint32()
			prod = uint64(v) * uint64(n)
			low = uint32(prod)
		}
	}
	return int32(prod >> 32)
}
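
// The candidate loops in Notify above use size-one reservoir sampling:
// candidate i (1-based) replaces the current choice with probability 1/i, so
// once the loop ends every candidate is equally likely to have been kept. A
// minimal standalone sketch of the pattern (reservoirPick is an illustrative
// name, not used elsewhere in this package):
//
//	func reservoirPick(rng *rand.Rand, n int) int {
//		chosen := -1
//		for i := 1; i <= n; i++ {
//			if randInt31n(rng, int32(i)) == 0 {
//				chosen = i - 1
//			}
//		}
//		return chosen
//	}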

// NotifyRlimitCPUUpdated is called by setrlimit.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) NotifyRlimitCPUUpdated() {
	t.k.cpuClockTicker.Atomically(func() {
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		rlimitCPU := t.tg.limits.Get(limits.CPU)
		t.tg.rlimitCPUSoftSetting = ktime.Setting{
			Enabled: rlimitCPU.Cur != limits.Infinity,
			Next:    ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
			Period:  time.Second,
		}
		if rlimitCPU.Max != limits.Infinity {
			// Check if tg is already over the hard limit.
			tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
			tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
			if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
				t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
			}
		}
		t.tg.updateCPUTimersEnabledLocked()
	})
}

// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
	rlimitCPU := tg.limits.Get(limits.CPU)
	if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
		atomic.StoreUint32(&tg.cpuTimersEnabled, 1)
	} else {
		atomic.StoreUint32(&tg.cpuTimersEnabled, 0)
	}
}

// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
	switch s := t.TaskGoroutineSchedInfo().State; s {
	case TaskGoroutineNonexistent, TaskGoroutineRunningSys:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		switch t.exitState {
		case TaskExitZombie:
			return "Z (zombie)"
		case TaskExitDead:
			return "X (dead)"
		default:
			// The task goroutine can't exit before passing through
			// runExitNotify, so if s == TaskGoroutineNonexistent, the task has
			// been created but the task goroutine hasn't yet started. The
			// Linux equivalent is struct task_struct::state == TASK_NEW
			// (kernel/fork.c:copy_process() =>
			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
			// TASK_RUNNING.
			return "R (running)"
		}
	case TaskGoroutineRunningApp:
		return "R (running)"
	case TaskGoroutineBlockedInterruptible:
		return "S (sleeping)"
	case TaskGoroutineStopped:
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		switch t.stop.(type) {
		case *groupStop:
			return "T (stopped)"
		case *ptraceStop:
			return "t (tracing stop)"
		}
		fallthrough
	case TaskGoroutineBlockedUninterruptible:
		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
		// fs/proc/array.c:task_state_array.
		return "D (disk sleep)"
	default:
		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
	}
}

// CPUMask returns a copy of t's allowed CPU mask.
func (t *Task) CPUMask() sched.CPUSet {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.allowedCPUMask.Copy()
}

// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
// mask.
//
// Preconditions: mask.Size() ==
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
	if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
	}

	// Remove CPUs in mask above Kernel.applicationCores.
	mask.ClearAbove(t.k.applicationCores)

	// Ensure that at least 1 CPU is still allowed.
	if mask.NumCPUs() == 0 {
		return linuxerr.EINVAL
	}

	if t.k.useHostCores {
		// No-op; pretend the mask was immediately changed back.
		return nil
	}

	t.tg.pidns.owner.mu.RLock()
	rootTID := t.tg.pidns.owner.Root.tids[t]
	t.tg.pidns.owner.mu.RUnlock()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.allowedCPUMask = mask
	atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
	return nil
}

// CPU returns the cpu id for a given task.
func (t *Task) CPU() int32 {
	if t.k.useHostCores {
		return int32(hostcpu.GetCPU())
	}

	return atomic.LoadInt32(&t.cpu)
}

// assignCPU returns the virtualized CPU number for the task with global TID
// tid and allowedCPUMask allowed.
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
	// To pretend that threads are evenly distributed to allowed CPUs, choose n
	// to be less than the number of CPUs in allowed ...
	n := int(tid) % int(allowed.NumCPUs())
	// ... then pick the nth CPU in allowed.
	allowed.ForEachCPU(func(c uint) {
		if n--; n == 0 {
			cpu = int32(c)
		}
	})
	return cpu
}
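
// For example, with allowed = {1, 2, 5} (three CPUs) and tid = 8, n = 8 % 3 =
// 2, so the second CPU visited by ForEachCPU is chosen; assuming ForEachCPU
// visits CPUs in ascending order, cpu = 2. When tid is a multiple of
// allowed.NumCPUs(), n starts at 0 and never returns to 0, so cpu keeps its
// zero value of 0.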

// Niceness returns t's niceness.
func (t *Task) Niceness() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness
}

// Priority returns t's priority.
func (t *Task) Priority() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness + 20
}

// SetNiceness sets t's niceness to n.
func (t *Task) SetNiceness(n int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.niceness = n
}

// NumaPolicy returns t's current numa policy.
func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.numaPolicy, t.numaNodeMask
}

// SetNumaPolicy sets t's numa policy.
func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.numaPolicy = policy
	t.numaNodeMask = nodeMask
}