github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/rseq.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/metacubex/gvisor/pkg/abi/linux"
    21  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    22  	"github.com/metacubex/gvisor/pkg/hostarch"
    23  	"github.com/metacubex/gvisor/pkg/sentry/hostcpu"
    24  	"github.com/metacubex/gvisor/pkg/usermem"
    25  )
    26  
    27  // Restartable sequences.
    28  //
    29  // We support two different APIs for restartable sequences.
    30  //
    31  //  1. The upstream interface added in v4.18.
    32  //  2. The interface described in https://lwn.net/Articles/650333/.
    33  //
    34  // Throughout this file and other parts of the kernel, the latter is referred
    35  // to as "old rseq". This interface was never merged upstream, but is supported
    36  // for a limited set of applications that use it regardless.
    37  
// OldRSeqCriticalRegion describes an old rseq critical region.
//
// A region whose CriticalSection.Start is 0 is treated as disabled:
// Task.SetOldRSeqCriticalRegion stores such a region with every field zeroed.
//
// +stateify savable
type OldRSeqCriticalRegion struct {
	// When a task in this thread group has its CPU preempted (as defined by
	// platform.ErrContextCPUPreempted) or has a signal delivered to an
	// application handler while its instruction pointer is in CriticalSection,
	// set the instruction pointer to Restart and application register r10 (on
	// amd64) to the former instruction pointer.
	//
	// CriticalSection is the address range the instruction pointer is tested
	// against; Restart is the address execution is redirected to.
	CriticalSection hostarch.AddrRange
	Restart         hostarch.Addr
}
    50  
    51  // RSeqAvailable returns true if t supports (old and new) restartable sequences.
    52  func (t *Task) RSeqAvailable() bool {
    53  	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
    54  }
    55  
    56  // SetRSeq registers addr as this thread's rseq structure.
    57  //
    58  // Preconditions: The caller must be running on the task goroutine.
    59  func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
    60  	if t.rseqAddr != 0 {
    61  		if t.rseqAddr != addr {
    62  			return linuxerr.EINVAL
    63  		}
    64  		if t.rseqSignature != signature {
    65  			return linuxerr.EINVAL
    66  		}
    67  		return linuxerr.EBUSY
    68  	}
    69  
    70  	// rseq must be aligned and correctly sized.
    71  	if addr&(linux.AlignOfRSeq-1) != 0 {
    72  		return linuxerr.EINVAL
    73  	}
    74  	if length != linux.SizeOfRSeq {
    75  		return linuxerr.EINVAL
    76  	}
    77  	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
    78  		return linuxerr.EFAULT
    79  	}
    80  
    81  	t.rseqAddr = addr
    82  	t.rseqSignature = signature
    83  
    84  	// Initialize the CPUID.
    85  	//
    86  	// Linux implicitly does this on return from userspace, where failure
    87  	// would cause SIGSEGV.
    88  	if err := t.rseqUpdateCPU(); err != nil {
    89  		t.rseqAddr = 0
    90  		t.rseqSignature = 0
    91  
    92  		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
    93  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
    94  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
    95  		return linuxerr.EFAULT
    96  	}
    97  
    98  	return nil
    99  }
   100  
   101  // ClearRSeq unregisters addr as this thread's rseq structure.
   102  //
   103  // Preconditions: The caller must be running on the task goroutine.
   104  func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error {
   105  	if t.rseqAddr == 0 {
   106  		return linuxerr.EINVAL
   107  	}
   108  	if t.rseqAddr != addr {
   109  		return linuxerr.EINVAL
   110  	}
   111  	if length != linux.SizeOfRSeq {
   112  		return linuxerr.EINVAL
   113  	}
   114  	if t.rseqSignature != signature {
   115  		return linuxerr.EPERM
   116  	}
   117  
   118  	if err := t.rseqClearCPU(); err != nil {
   119  		return err
   120  	}
   121  
   122  	t.rseqAddr = 0
   123  	t.rseqSignature = 0
   124  
   125  	if t.oldRSeqCPUAddr == 0 {
   126  		// rseqCPU no longer needed.
   127  		t.rseqCPU = -1
   128  	}
   129  
   130  	return nil
   131  }
   132  
   133  // OldRSeqCriticalRegion returns a copy of t's thread group's current
   134  // old restartable sequence.
   135  func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
   136  	return *t.tg.oldRSeqCritical.Load()
   137  }
   138  
   139  // SetOldRSeqCriticalRegion replaces t's thread group's old restartable
   140  // sequence.
   141  //
   142  // Preconditions: t.RSeqAvailable() == true.
   143  func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
   144  	// These checks are somewhat more lenient than in Linux, which (bizarrely)
   145  	// requires r.CriticalSection to be non-empty and r.Restart to be
   146  	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
   147  	// (which disables the critical region).
   148  	if r.CriticalSection.Start == 0 {
   149  		r.CriticalSection.End = 0
   150  		r.Restart = 0
   151  		t.tg.oldRSeqCritical.Store(&r)
   152  		return nil
   153  	}
   154  	if r.CriticalSection.Start >= r.CriticalSection.End {
   155  		return linuxerr.EINVAL
   156  	}
   157  	if r.CriticalSection.Contains(r.Restart) {
   158  		return linuxerr.EINVAL
   159  	}
   160  	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
   161  	// the application address range, for consistency with Linux.
   162  	t.tg.oldRSeqCritical.Store(&r)
   163  	return nil
   164  }
   165  
   166  // OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
   167  // CPU number.
   168  //
   169  // Preconditions: The caller must be running on the task goroutine.
   170  func (t *Task) OldRSeqCPUAddr() hostarch.Addr {
   171  	return t.oldRSeqCPUAddr
   172  }
   173  
   174  // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
   175  // t's CPU number.
   176  //
   177  // Preconditions:
   178  //   - t.RSeqAvailable() == true.
   179  //   - The caller must be running on the task goroutine.
   180  //   - t's AddressSpace must be active.
   181  func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error {
   182  	t.oldRSeqCPUAddr = addr
   183  
   184  	// Check that addr is writable.
   185  	//
   186  	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
   187  	// unfortunate, but unlikely in a correct program.
   188  	if err := t.rseqUpdateCPU(); err != nil {
   189  		t.oldRSeqCPUAddr = 0
   190  		return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT
   191  	}
   192  	return nil
   193  }
   194  
   195  // Preconditions:
   196  //   - The caller must be running on the task goroutine.
   197  //   - t's AddressSpace must be active.
   198  func (t *Task) rseqUpdateCPU() error {
   199  	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
   200  		t.rseqCPU = -1
   201  		return nil
   202  	}
   203  
   204  	t.rseqCPU = int32(hostcpu.GetCPU())
   205  
   206  	// Update both CPUs, even if one fails.
   207  	rerr := t.rseqCopyOutCPU()
   208  	oerr := t.oldRSeqCopyOutCPU()
   209  
   210  	if rerr != nil {
   211  		return rerr
   212  	}
   213  	return oerr
   214  }
   215  
   216  // Preconditions:
   217  //   - The caller must be running on the task goroutine.
   218  //   - t's AddressSpace must be active.
   219  func (t *Task) oldRSeqCopyOutCPU() error {
   220  	if t.oldRSeqCPUAddr == 0 {
   221  		return nil
   222  	}
   223  
   224  	buf := t.CopyScratchBuffer(4)
   225  	hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
   226  	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
   227  	return err
   228  }
   229  
   230  // Preconditions:
   231  //   - The caller must be running on the task goroutine.
   232  //   - t's AddressSpace must be active.
   233  func (t *Task) rseqCopyOutCPU() error {
   234  	if t.rseqAddr == 0 {
   235  		return nil
   236  	}
   237  
   238  	buf := t.CopyScratchBuffer(8)
   239  	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
   240  	hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
   241  	hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
   242  	// N.B. This write is not atomic, but since this occurs on the task
   243  	// goroutine then as long as userspace uses a single-instruction read
   244  	// it can't see an invalid value.
   245  	_, err := t.CopyOutBytes(t.rseqAddr, buf)
   246  	return err
   247  }
   248  
   249  // Preconditions:
   250  //   - The caller must be running on the task goroutine.
   251  //   - t's AddressSpace must be active.
   252  func (t *Task) rseqClearCPU() error {
   253  	buf := t.CopyScratchBuffer(8)
   254  	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
   255  	hostarch.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
   256  	hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
   257  	// N.B. This write is not atomic, but since this occurs on the task
   258  	// goroutine then as long as userspace uses a single-instruction read
   259  	// it can't see an invalid value.
   260  	_, err := t.CopyOutBytes(t.rseqAddr, buf)
   261  	return err
   262  }
   263  
   264  // rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
   265  //
   266  // This is a bit complex since both the RSeq and RSeqCriticalSection structs
   267  // are stored in userspace. So we must:
   268  //
   269  //  1. Copy in the address of RSeqCriticalSection from RSeq.
   270  //  2. Copy in RSeqCriticalSection itself.
   271  //  3. Validate critical section struct version, address range, abort address.
   272  //  4. Validate the abort signature (4 bytes preceding abort IP match expected
   273  //     signature).
   274  //
   275  // 5. Clear address of RSeqCriticalSection from RSeq.
   276  // 6. Finally, conditionally abort.
   277  //
   278  // See kernel/rseq.c:rseq_ip_fixup for reference.
   279  //
   280  // Preconditions:
   281  //   - The caller must be running on the task goroutine.
   282  //   - t's AddressSpace must be active.
   283  func (t *Task) rseqAddrInterrupt() {
   284  	if t.rseqAddr == 0 {
   285  		return
   286  	}
   287  
   288  	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
   289  	if !ok {
   290  		// SetRSeq should validate this.
   291  		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
   292  	}
   293  
   294  	if t.Arch().Width() != 8 {
   295  		// We only handle 64-bit for now.
   296  		t.Debugf("Only 64-bit rseq supported.")
   297  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   298  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   299  		return
   300  	}
   301  
   302  	buf := t.CopyScratchBuffer(8)
   303  	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
   304  		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
   305  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   306  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   307  		return
   308  	}
   309  
   310  	critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf))
   311  	if critAddr == 0 {
   312  		return
   313  	}
   314  
   315  	var cs linux.RSeqCriticalSection
   316  	if _, err := cs.CopyIn(t, critAddr); err != nil {
   317  		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
   318  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   319  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   320  		return
   321  	}
   322  
   323  	if cs.Version != 0 {
   324  		t.Debugf("Unknown version in %+v", cs)
   325  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   326  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   327  		return
   328  	}
   329  
   330  	start := hostarch.Addr(cs.Start)
   331  	critRange, ok := start.ToRange(cs.PostCommitOffset)
   332  	if !ok {
   333  		t.Debugf("Invalid start and offset in %+v", cs)
   334  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   335  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   336  		return
   337  	}
   338  
   339  	abort := hostarch.Addr(cs.Abort)
   340  	if critRange.Contains(abort) {
   341  		t.Debugf("Abort in critical section in %+v", cs)
   342  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   343  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   344  		return
   345  	}
   346  
   347  	// Verify signature.
   348  	sigAddr := abort - linux.SizeOfRSeqSignature
   349  
   350  	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
   351  	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
   352  		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
   353  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   354  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   355  		return
   356  	}
   357  
   358  	sig := hostarch.ByteOrder.Uint32(buf)
   359  	if sig != t.rseqSignature {
   360  		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
   361  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   362  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   363  		return
   364  	}
   365  
   366  	// Clear the critical section address.
   367  	//
   368  	// NOTE(b/143949567): We don't support any rseq flags, so we always
   369  	// restart if we are in the critical section, and thus *always* clear
   370  	// critAddrAddr.
   371  	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
   372  		AddressSpaceActive: true,
   373  	}); err != nil {
   374  		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
   375  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   376  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   377  		return
   378  	}
   379  
   380  	// Finally we can actually decide whether or not to restart.
   381  	if !critRange.Contains(hostarch.Addr(t.Arch().IP())) {
   382  		return
   383  	}
   384  
   385  	t.Arch().SetIP(uintptr(cs.Abort))
   386  }
   387  
   388  // Preconditions: The caller must be running on the task goroutine.
   389  func (t *Task) oldRSeqInterrupt() {
   390  	r := t.tg.oldRSeqCritical.Load()
   391  	if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) {
   392  		t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
   393  		t.Arch().SetIP(uintptr(r.Restart))
   394  		t.Arch().SetOldRSeqInterruptedIP(ip)
   395  	}
   396  }
   397  
   398  // Preconditions: The caller must be running on the task goroutine.
   399  func (t *Task) rseqInterrupt() {
   400  	t.rseqAddrInterrupt()
   401  	t.oldRSeqInterrupt()
   402  }