github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_sched.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// CPU scheduling, real and fake.

import (
	"fmt"
	"math/rand"
	"time"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostcpu"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched"
	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
)

// TaskGoroutineState is a coarse representation of the current execution
// status of a kernel.Task goroutine.
type TaskGoroutineState int

const (
	// TaskGoroutineNonexistent indicates that the task goroutine has either
	// not yet been created by Task.Start() or has returned from Task.run().
	// This must be the zero value for TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys indicates that the task goroutine is executing
	// sentry code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp indicates that the task goroutine is executing
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
	// blocked in Task.block(), and hence may be woken by Task.interrupt()
	// (e.g. due to signal delivery).
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
	// woken by Task.interrupt().
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped indicates that the task goroutine is blocked in
	// Task.doStop(). TaskGoroutineStopped is similar to
	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
	// possible to determine when Task.stop is meaningful.
	TaskGoroutineStopped
)

// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
// be read and updated atomically.
//
// +stateify savable
type TaskGoroutineSchedInfo struct {
	// Timestamp was the value of Kernel.cpuClock when this
	// TaskGoroutineSchedInfo was last updated.
	Timestamp uint64

	// State is the current state of the task goroutine.
	State TaskGoroutineState

	// UserTicks is the amount of time the task goroutine has spent executing
	// its associated Task's application code, in units of linux.ClockTick.
	UserTicks uint64

	// SysTicks is the amount of time the task goroutine has spent executing in
	// the sentry, in units of linux.ClockTick.
	SysTicks uint64
}
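
// As used in this file, Task.gosched is written only by the task goroutine
// itself, with every update bracketed by Task.goschedSeq.BeginWrite() and
// EndWrite(); other goroutines take a consistent snapshot via
// SeqAtomicLoadTaskGoroutineSchedInfo (see Task.TaskGoroutineSchedInfo), so
// no separate mutex is needed for this struct.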

// userTicksAt returns the extrapolated value of ts.UserTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
// monotonic, this is satisfied if now is the result of a previous call to
// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
// change to t.gosched can cause userTicksAt to adjust stats by too much,
// making the observed stats non-monotonic.
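//
// For example (illustrative numbers only): with Timestamp = 1000, UserTicks =
// 40, and State == TaskGoroutineRunningApp, userTicksAt(1006) returns
// 40 + (1006 - 1000) = 46, crediting the ticks elapsed since the last update;
// in any other state it returns UserTicks unchanged.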
func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
		// Update stats to reflect execution since the last update.
		return ts.UserTicks + (now - ts.Timestamp)
	}
	return ts.UserTicks
}

// sysTicksAt returns the extrapolated value of ts.SysTicks after
// Kernel.CPUClockNow() indicates a time of now.
//
// Preconditions: As for userTicksAt.
func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
	if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
		return ts.SysTicks + (now - ts.Timestamp)
	}
	return ts.SysTicks
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.gosched.State = state
	t.goschedSeq.EndWrite()

	if state != TaskGoroutineRunningApp {
		// Task is blocking/stopping.
		t.k.decRunningTasks()
	}
}

// Preconditions:
//   - The caller must be running on the task goroutine
//   - The caller must be leaving a state indicated by a previous call to
//     t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
	if state != TaskGoroutineRunningApp {
		// Task is unblocking/continuing.
		t.k.incRunningTasks()
	}

	now := t.k.CPUClockNow()
	if t.gosched.State != state {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	if state == TaskGoroutineRunningApp {
		t.gosched.UserTicks += now - t.gosched.Timestamp
	}
	t.gosched.Timestamp = now
	t.gosched.State = TaskGoroutineRunningSys
	t.goschedSeq.EndWrite()
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineRunning() {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
	}
	t.goschedSeq.BeginWrite()
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.goschedSeq.EndWrite()
}

// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
}

// CPUStats returns the CPU usage statistics of t.
func (t *Task) CPUStats() usage.CPUStats {
	return t.cpuStatsAt(t.k.CPUClockNow())
}

// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
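//
// The returned times are tick counts scaled by linux.ClockTick; for example,
// assuming the usual 100 Hz tick (linux.ClockTick = 10ms), a task that has
// accumulated 250 user ticks reports UserTime = 2.5s.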
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
	tsched := t.TaskGoroutineSchedInfo()
	return usage.CPUStats{
		UserTime:          time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
		SysTime:           time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
		VoluntarySwitches: t.yieldCount.Load(),
	}
}

// CPUStats returns the combined CPU usage statistics of all past and present
// threads in tg.
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	// Hack to get a pointer to the Kernel.
	if tg.leader == nil {
		// Per comment on tg.leader, this is only possible if nothing in the
		// ThreadGroup has ever executed anyway.
		return usage.CPUStats{}
	}
	return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
}

// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus:
//   - The TaskSet mutex must be locked.
func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
	stats := tg.exitedCPUStats
	// Account for live tasks.
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		stats.Accumulate(t.cpuStatsAt(now))
	}
	return stats
}

// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
// resource usage statistics for all children of [tg] that have terminated and
// been waited for. These statistics will include the resources used by
// grandchildren, and further removed descendants, if all of the intervening
// descendants waited on their terminated children."
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.childCPUStats
}

// taskClock is a ktime.Clock that measures the time that a task has spent
// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
//
// +stateify savable
type taskClock struct {
	t *Task

	// If includeSys is true, the taskClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// taskClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable. TimeUntil wouldn't change its estimation
	// based on either of the clock events, so there's no event to be
	// notified for.
	ktime.NoClockEvents `state:"nosave"`

	// Implements ktime.Clock.WallTimeUntil.
	//
	// As an upper bound, a task's clock cannot advance faster than CPU
	// time. It would have to execute at a rate of more than 1 task-second
	// per 1 CPU-second, which isn't possible.
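	//
	// Concretely, if a timer on this clock is set to expire at clock time t,
	// at least t - Now() of wall time must elapse first, which is
	// (presumably) the lower bound that WallRateClock reports.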
	ktime.WallRateClock `state:"nosave"`
}

// UserCPUClock returns a clock measuring the CPU time the task has spent
// executing application code.
func (t *Task) UserCPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: false}
}

// CPUClock returns a clock measuring the CPU time the task has spent executing
// application and "kernel" code.
func (t *Task) CPUClock() ktime.Clock {
	return &taskClock{t: t, includeSys: true}
}

// Now implements ktime.Clock.Now.
func (tc *taskClock) Now() ktime.Time {
	stats := tc.t.CPUStats()
	if tc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}
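
// Note that taskClock.Now returns accumulated CPU time rather than wall time:
// it starts at zero and advances only while the task runs, so the difference
// between two samples is the CPU time consumed in between, matching
// CLOCK_THREAD_CPUTIME_ID semantics.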

// tgClock is a ktime.Clock that measures the time a thread group has spent
// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
//
// +stateify savable
type tgClock struct {
	tg *ThreadGroup

	// If includeSys is true, the tgClock includes both time spent executing
	// application code as well as time spent in the sentry. Otherwise, the
	// tgClock includes only time spent executing application code.
	includeSys bool

	// Implements waiter.Waitable.
	ktime.ClockEventsQueue `state:"nosave"`
}

// Now implements ktime.Clock.Now.
func (tgc *tgClock) Now() ktime.Time {
	stats := tgc.tg.CPUStats()
	if tgc.includeSys {
		return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
	}
	return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
}

// WallTimeUntil implements ktime.Clock.WallTimeUntil.
func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
	// Thread group CPU time should not exceed wall time * live tasks, since
	// task goroutines exit after the transition to TaskExitZombie in
	// runExitNotify.
	tgc.tg.pidns.owner.mu.RLock()
	n := tgc.tg.liveTasks
	tgc.tg.pidns.owner.mu.RUnlock()
	if n == 0 {
		if t.Before(now) {
			return 0
		}
		// The timer tick raced with thread group exit, after which no more
		// tasks can enter the thread group. So tgc.Now() will never advance
		// again. Return a large delay; the timer should be stopped long before
		// it comes again anyway.
		return time.Hour
	}
	// This is a lower bound on the amount of time that can elapse before an
	// associated timer expires, so returning this value tends to result in a
	// sequence of closely-spaced ticks just before timer expiry. To avoid
	// this, round up to the nearest ClockTick; CPU usage measurements are
	// limited to this resolution anyway.
	remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
	return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
}
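
// As a worked example (assuming linux.ClockTick is 10ms): with n = 2 live
// tasks and 25ms of CPU time left before expiry, remaining is 12.5ms, which
// rounds up to 20ms of wall time before the timer needs to be checked again.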

// UserCPUClock returns a ktime.Clock that measures the time that a thread
// group has spent executing.
func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: false}
}

// CPUClock returns a ktime.Clock that measures the time that a thread group
// has spent executing, including sentry time.
func (tg *ThreadGroup) CPUClock() ktime.Clock {
	return &tgClock{tg: tg, includeSys: true}
}

func (k *Kernel) runCPUClockTicker() {
	rng := rand.New(rand.NewSource(rand.Int63()))
	var tgs []*ThreadGroup

	for {
		// Stop the CPU clock while nothing is running.
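		// runningTasks is checked twice: once optimistically without the
		// lock, and again under runningTasksMu before parking on
		// runningTasksCond, so a wakeup from a concurrent incRunningTasks()
		// cannot be missed between the check and the Wait().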
		if k.runningTasks.Load() == 0 {
			k.runningTasksMu.Lock()
			if k.runningTasks.Load() == 0 {
				k.cpuClockTickerRunning = false
				k.cpuClockTickerStopCond.Broadcast()
				k.runningTasksCond.Wait()
				// k.cpuClockTickerRunning was set to true by our waker
				// (Kernel.incRunningTasks()). For reasons described there, we must
				// process at least one CPU clock tick between calls to
				// k.runningTasksCond.Wait().
			}
			k.runningTasksMu.Unlock()
		}

		// Wait for the next CPU clock tick.
		select {
		case <-k.cpuClockTickTimer.C:
			k.cpuClockTickTimer.Reset(linux.ClockTick)
		case <-k.cpuClockTickerWakeCh:
			continue
		}

		// Advance the CPU clock, and timers based on the CPU clock, atomically
		// under cpuClockMu.
		k.cpuClockMu.Lock()
		now := k.cpuClock.Add(1)

		// Check thread group CPU timers.
		tgs = k.tasks.Root.ThreadGroupsAppend(tgs)
		for _, tg := range tgs {
			if tg.cpuTimersEnabled.Load() == 0 {
				continue
			}

			k.tasks.mu.RLock()
			if tg.leader == nil {
				// No tasks have ever run in this thread group.
				k.tasks.mu.RUnlock()
				continue
			}
			// Accumulate thread group CPU stats, and randomly select running tasks
			// using reservoir sampling to receive CPU timer signals.
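			// (Reservoir sampling of size one: the i-th eligible task replaces
			// the current pick with probability 1/i, so after the loop every
			// eligible task is equally likely to have been chosen, without
			// needing to know the number of candidates in advance.)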
			var virtReceiver *Task
			nrVirtCandidates := 0
			var profReceiver *Task
			nrProfCandidates := 0
			tgUserTime := tg.exitedCPUStats.UserTime
			tgSysTime := tg.exitedCPUStats.SysTime
			for t := tg.tasks.Front(); t != nil; t = t.Next() {
				tsched := t.TaskGoroutineSchedInfo()
				tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
				tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
				switch tsched.State {
				case TaskGoroutineRunningApp:
					// Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
					// timers.
					nrVirtCandidates++
					if int(randInt31n(rng, int32(nrVirtCandidates))) == 0 {
						virtReceiver = t
					}
					fallthrough
				case TaskGoroutineRunningSys:
					// Considered by ITIMER_PROF and RLIMIT_CPU timers.
					nrProfCandidates++
					if int(randInt31n(rng, int32(nrProfCandidates))) == 0 {
						profReceiver = t
					}
				}
			}
			tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
			tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())

			// All of the following are standard (not real-time) signals, which are
			// automatically deduplicated, so we ignore the number of expirations.
			tg.signalHandlers.mu.Lock()
			// It should only be possible for these timers to advance if we found
			// at least one running task.
			if virtReceiver != nil {
				// ITIMER_VIRTUAL
				newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
				tg.itimerVirtSetting = newItimerVirtSetting
				if exp != 0 {
					virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true)
				}
			}
			if profReceiver != nil {
				// ITIMER_PROF
				newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
				tg.itimerProfSetting = newItimerProfSetting
				if exp != 0 {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true)
				}
				// RLIMIT_CPU soft limit
				newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
				tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
				if exp != 0 {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true)
				}
				// RLIMIT_CPU hard limit
				rlimitCPUMax := tg.limits.Get(limits.CPU).Max
				if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
					profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
				}
			}
			tg.signalHandlers.mu.Unlock()

			k.tasks.mu.RUnlock()
		}

		k.cpuClockMu.Unlock()

		// Retain tgs between calls to Notify to reduce allocations.
		for i := range tgs {
			tgs[i] = nil
		}
		tgs = tgs[:0]
	}
}

// randInt31n returns a random integer in [0, n).
//
// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
// See that function for details.
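//
// Sketch of why this works (multiply-and-shift, as in Lemire's method): the
// 64-bit product v*n is uniform over the multiples of n in [0, n<<32), so its
// high 32 bits are nearly uniform over [0, n). The rejection loop redraws the
// few low products that would otherwise make some results slightly more
// likely; e.g. for n = 3, thresh = uint32(-3)%3 = 1, so only products whose
// low 32 bits are 0 are redrawn.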
func randInt31n(rng *rand.Rand, n int32) int32 {
	v := rng.Uint32()
	prod := uint64(v) * uint64(n)
	low := uint32(prod)
	if low < uint32(n) {
		thresh := uint32(-n) % uint32(n)
		for low < thresh {
			v = rng.Uint32()
			prod = uint64(v) * uint64(n)
			low = uint32(prod)
		}
	}
	return int32(prod >> 32)
}

// NotifyRlimitCPUUpdated is called by setrlimit.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) NotifyRlimitCPUUpdated() {
	t.k.cpuClockMu.Lock()
	defer t.k.cpuClockMu.Unlock()
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	rlimitCPU := t.tg.limits.Get(limits.CPU)
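	// The soft limit first fires once the thread group has consumed
	// rlimitCPU.Cur seconds of CPU time; the one-second Period below matches
	// Linux's documented behavior of re-sending SIGXCPU every second
	// thereafter until the hard limit is reached (see setrlimit(2)).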
	t.tg.rlimitCPUSoftSetting = ktime.Setting{
		Enabled: rlimitCPU.Cur != limits.Infinity,
		Next:    ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
		Period:  time.Second,
	}
	if rlimitCPU.Max != limits.Infinity {
		// Check if tg is already over the hard limit.
		tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
		tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
		if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
			t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true)
		}
	}
	t.tg.updateCPUTimersEnabledLocked()
}

// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
	rlimitCPU := tg.limits.Get(limits.CPU)
	if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
		tg.cpuTimersEnabled.Store(1)
	} else {
		tg.cpuTimersEnabled.Store(0)
	}
}

// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
	switch s := t.TaskGoroutineSchedInfo().State; s {
	case TaskGoroutineNonexistent, TaskGoroutineRunningSys:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		switch t.exitState {
		case TaskExitZombie:
			return "Z (zombie)"
		case TaskExitDead:
			return "X (dead)"
		default:
			// The task goroutine can't exit before passing through
			// runExitNotify, so if s == TaskGoroutineNonexistent, the task has
			// been created but the task goroutine hasn't yet started. The
			// Linux equivalent is struct task_struct::state == TASK_NEW
			// (kernel/fork.c:copy_process() =>
			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
			// TASK_RUNNING.
			return "R (running)"
		}
	case TaskGoroutineRunningApp:
		return "R (running)"
	case TaskGoroutineBlockedInterruptible:
		return "S (sleeping)"
	case TaskGoroutineStopped:
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		switch t.stop.(type) {
		case *groupStop:
			return "T (stopped)"
		case *ptraceStop:
			return "t (tracing stop)"
		}
		fallthrough
	case TaskGoroutineBlockedUninterruptible:
		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
		// fs/proc/array.c:task_state_array.
		return "D (disk sleep)"
	default:
		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
	}
}

// CPUMask returns a copy of t's allowed CPU mask.
func (t *Task) CPUMask() sched.CPUSet {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.allowedCPUMask.Copy()
}

// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
// mask.
//
// Preconditions: mask.Size() ==
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
	if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
	}

	// Remove CPUs in mask above Kernel.applicationCores.
	mask.ClearAbove(t.k.applicationCores)

	// Ensure that at least 1 CPU is still allowed.
	if mask.NumCPUs() == 0 {
		return linuxerr.EINVAL
	}

	if t.k.useHostCores {
		// No-op; pretend the mask was immediately changed back.
		return nil
	}

	t.tg.pidns.owner.mu.RLock()
	rootTID := t.tg.pidns.owner.Root.tids[t]
	t.tg.pidns.owner.mu.RUnlock()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.allowedCPUMask = mask
	t.cpu.Store(assignCPU(mask, rootTID))
	return nil
}

// CPU returns the cpu id for a given task.
func (t *Task) CPU() int32 {
	if t.k.useHostCores {
		return int32(hostcpu.GetCPU())
	}

	return t.cpu.Load()
}

// assignCPU returns the virtualized CPU number for the task with global TID
// tid and allowedCPUMask allowed.
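//
// Illustrative example: with allowed = {1, 3, 5, 7} and tid = 6, n is
// 6 % 4 = 2 and the loop lands on the second allowed CPU, so assignCPU
// returns 3. (When n is 0 the callback never matches, so the zero value,
// CPU 0, is returned.)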
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
	// To pretend that threads are evenly distributed to allowed CPUs, choose n
	// to be less than the number of CPUs in allowed ...
	n := int(tid) % int(allowed.NumCPUs())
	// ... then pick the nth CPU in allowed.
	allowed.ForEachCPU(func(c uint) {
		if n--; n == 0 {
			cpu = int32(c)
		}
	})
	return cpu
}

// Niceness returns t's niceness.
func (t *Task) Niceness() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness
}

// Priority returns t's priority.
func (t *Task) Priority() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.niceness + 20
}
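
// (The +20 offset maps the nice range [-20, 19] onto [0, 39]; Linux reports
// the same normalization for non-real-time tasks in the priority field of
// /proc/[pid]/stat.)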

// SetNiceness sets t's niceness to n.
func (t *Task) SetNiceness(n int) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.niceness = n
}

// NumaPolicy returns t's current numa policy.
func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.numaPolicy, t.numaNodeMask
}

// SetNumaPolicy sets t's numa policy.
func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.numaPolicy = policy
	t.numaNodeMask = nodeMask
}