github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/rseq.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/hostarch"
    23  	"github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
    24  	"github.com/SagerNet/gvisor/pkg/syserror"
    25  	"github.com/SagerNet/gvisor/pkg/usermem"
    26  )
    27  
    28  // Restartable sequences.
    29  //
    30  // We support two different APIs for restartable sequences.
    31  //
    32  //  1. The upstream interface added in v4.18.
    33  //  2. The interface described in https://lwn.net/Articles/650333/.
    34  //
    35  // Throughout this file and other parts of the kernel, the latter is referred
    36  // to as "old rseq". This interface was never merged upstream, but is supported
    37  // for a limited set of applications that use it regardless.
    38  
    39  // OldRSeqCriticalRegion describes an old rseq critical region.
    40  //
    41  // +stateify savable
    42  type OldRSeqCriticalRegion struct {
    43  	// When a task in this thread group has its CPU preempted (as defined by
    44  	// platform.ErrContextCPUPreempted) or has a signal delivered to an
    45  	// application handler while its instruction pointer is in CriticalSection,
    46  	// set the instruction pointer to Restart and application register r10 (on
    47  	// amd64) to the former instruction pointer.
    48  	CriticalSection hostarch.AddrRange
    49  	Restart         hostarch.Addr
    50  }
    51  
    52  // RSeqAvailable returns true if t supports (old and new) restartable sequences.
    53  func (t *Task) RSeqAvailable() bool {
    54  	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
    55  }
    56  
    57  // SetRSeq registers addr as this thread's rseq structure.
    58  //
    59  // Preconditions: The caller must be running on the task goroutine.
    60  func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error {
    61  	if t.rseqAddr != 0 {
    62  		if t.rseqAddr != addr {
    63  			return linuxerr.EINVAL
    64  		}
    65  		if t.rseqSignature != signature {
    66  			return linuxerr.EINVAL
    67  		}
    68  		return linuxerr.EBUSY
    69  	}
    70  
    71  	// rseq must be aligned and correctly sized.
    72  	if addr&(linux.AlignOfRSeq-1) != 0 {
    73  		return linuxerr.EINVAL
    74  	}
    75  	if length != linux.SizeOfRSeq {
    76  		return linuxerr.EINVAL
    77  	}
    78  	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
    79  		return syserror.EFAULT
    80  	}
    81  
    82  	t.rseqAddr = addr
    83  	t.rseqSignature = signature
    84  
    85  	// Initialize the CPUID.
    86  	//
    87  	// Linux implicitly does this on return from userspace, where failure
    88  	// would cause SIGSEGV.
    89  	if err := t.rseqUpdateCPU(); err != nil {
    90  		t.rseqAddr = 0
    91  		t.rseqSignature = 0
    92  
    93  		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
    94  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
    95  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
    96  		return syserror.EFAULT
    97  	}
    98  
    99  	return nil
   100  }
   101  
   102  // ClearRSeq unregisters addr as this thread's rseq structure.
   103  //
   104  // Preconditions: The caller must be running on the task goroutine.
   105  func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error {
   106  	if t.rseqAddr == 0 {
   107  		return linuxerr.EINVAL
   108  	}
   109  	if t.rseqAddr != addr {
   110  		return linuxerr.EINVAL
   111  	}
   112  	if length != linux.SizeOfRSeq {
   113  		return linuxerr.EINVAL
   114  	}
   115  	if t.rseqSignature != signature {
   116  		return linuxerr.EPERM
   117  	}
   118  
   119  	if err := t.rseqClearCPU(); err != nil {
   120  		return err
   121  	}
   122  
   123  	t.rseqAddr = 0
   124  	t.rseqSignature = 0
   125  
   126  	if t.oldRSeqCPUAddr == 0 {
   127  		// rseqCPU no longer needed.
   128  		t.rseqCPU = -1
   129  	}
   130  
   131  	return nil
   132  }
   133  
   134  // OldRSeqCriticalRegion returns a copy of t's thread group's current
   135  // old restartable sequence.
   136  func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
   137  	return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
   138  }
   139  
   140  // SetOldRSeqCriticalRegion replaces t's thread group's old restartable
   141  // sequence.
   142  //
   143  // Preconditions: t.RSeqAvailable() == true.
   144  func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
   145  	// These checks are somewhat more lenient than in Linux, which (bizarrely)
   146  	// requires r.CriticalSection to be non-empty and r.Restart to be
   147  	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
   148  	// (which disables the critical region).
   149  	if r.CriticalSection.Start == 0 {
   150  		r.CriticalSection.End = 0
   151  		r.Restart = 0
   152  		t.tg.oldRSeqCritical.Store(&r)
   153  		return nil
   154  	}
   155  	if r.CriticalSection.Start >= r.CriticalSection.End {
   156  		return linuxerr.EINVAL
   157  	}
   158  	if r.CriticalSection.Contains(r.Restart) {
   159  		return linuxerr.EINVAL
   160  	}
   161  	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
   162  	// the application address range, for consistency with Linux.
   163  	t.tg.oldRSeqCritical.Store(&r)
   164  	return nil
   165  }
   166  
   167  // OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
   168  // CPU number.
   169  //
   170  // Preconditions: The caller must be running on the task goroutine.
   171  func (t *Task) OldRSeqCPUAddr() hostarch.Addr {
   172  	return t.oldRSeqCPUAddr
   173  }
   174  
   175  // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
   176  // t's CPU number.
   177  //
   178  // Preconditions:
   179  // * t.RSeqAvailable() == true.
   180  // * The caller must be running on the task goroutine.
   181  // * t's AddressSpace must be active.
   182  func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error {
   183  	t.oldRSeqCPUAddr = addr
   184  
   185  	// Check that addr is writable.
   186  	//
   187  	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
   188  	// unfortunate, but unlikely in a correct program.
   189  	if err := t.rseqUpdateCPU(); err != nil {
   190  		t.oldRSeqCPUAddr = 0
   191  		return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT
   192  	}
   193  	return nil
   194  }
   195  
   196  // Preconditions:
   197  // * The caller must be running on the task goroutine.
   198  // * t's AddressSpace must be active.
   199  func (t *Task) rseqUpdateCPU() error {
   200  	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
   201  		t.rseqCPU = -1
   202  		return nil
   203  	}
   204  
   205  	t.rseqCPU = int32(hostcpu.GetCPU())
   206  
   207  	// Update both CPUs, even if one fails.
   208  	rerr := t.rseqCopyOutCPU()
   209  	oerr := t.oldRSeqCopyOutCPU()
   210  
   211  	if rerr != nil {
   212  		return rerr
   213  	}
   214  	return oerr
   215  }
   216  
   217  // Preconditions:
   218  // * The caller must be running on the task goroutine.
   219  // * t's AddressSpace must be active.
   220  func (t *Task) oldRSeqCopyOutCPU() error {
   221  	if t.oldRSeqCPUAddr == 0 {
   222  		return nil
   223  	}
   224  
   225  	buf := t.CopyScratchBuffer(4)
   226  	hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
   227  	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
   228  	return err
   229  }
   230  
   231  // Preconditions:
   232  // * The caller must be running on the task goroutine.
   233  // * t's AddressSpace must be active.
   234  func (t *Task) rseqCopyOutCPU() error {
   235  	if t.rseqAddr == 0 {
   236  		return nil
   237  	}
   238  
   239  	buf := t.CopyScratchBuffer(8)
   240  	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
   241  	hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
   242  	hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
   243  	// N.B. This write is not atomic, but since this occurs on the task
   244  	// goroutine then as long as userspace uses a single-instruction read
   245  	// it can't see an invalid value.
   246  	_, err := t.CopyOutBytes(t.rseqAddr, buf)
   247  	return err
   248  }
   249  
   250  // Preconditions:
   251  // * The caller must be running on the task goroutine.
   252  // * t's AddressSpace must be active.
   253  func (t *Task) rseqClearCPU() error {
   254  	buf := t.CopyScratchBuffer(8)
   255  	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
   256  	hostarch.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
   257  	hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
   258  	// N.B. This write is not atomic, but since this occurs on the task
   259  	// goroutine then as long as userspace uses a single-instruction read
   260  	// it can't see an invalid value.
   261  	_, err := t.CopyOutBytes(t.rseqAddr, buf)
   262  	return err
   263  }
   264  
   265  // rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
   266  //
   267  // This is a bit complex since both the RSeq and RSeqCriticalSection structs
   268  // are stored in userspace. So we must:
   269  //
   270  // 1. Copy in the address of RSeqCriticalSection from RSeq.
   271  // 2. Copy in RSeqCriticalSection itself.
   272  // 3. Validate critical section struct version, address range, abort address.
   273  // 4. Validate the abort signature (4 bytes preceding abort IP match expected
   274  //    signature).
   275  // 5. Clear address of RSeqCriticalSection from RSeq.
   276  // 6. Finally, conditionally abort.
   277  //
   278  // See kernel/rseq.c:rseq_ip_fixup for reference.
   279  //
   280  // Preconditions:
   281  // * The caller must be running on the task goroutine.
   282  // * t's AddressSpace must be active.
   283  func (t *Task) rseqAddrInterrupt() {
   284  	if t.rseqAddr == 0 {
   285  		return
   286  	}
   287  
   288  	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
   289  	if !ok {
   290  		// SetRSeq should validate this.
   291  		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
   292  	}
   293  
   294  	if t.Arch().Width() != 8 {
   295  		// We only handle 64-bit for now.
   296  		t.Debugf("Only 64-bit rseq supported.")
   297  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   298  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   299  		return
   300  	}
   301  
   302  	buf := t.CopyScratchBuffer(8)
   303  	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
   304  		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
   305  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   306  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   307  		return
   308  	}
   309  
   310  	critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf))
   311  	if critAddr == 0 {
   312  		return
   313  	}
   314  
   315  	var cs linux.RSeqCriticalSection
   316  	if _, err := cs.CopyIn(t, critAddr); err != nil {
   317  		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
   318  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   319  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   320  		return
   321  	}
   322  
   323  	if cs.Version != 0 {
   324  		t.Debugf("Unknown version in %+v", cs)
   325  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   326  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   327  		return
   328  	}
   329  
   330  	start := hostarch.Addr(cs.Start)
   331  	critRange, ok := start.ToRange(cs.PostCommitOffset)
   332  	if !ok {
   333  		t.Debugf("Invalid start and offset in %+v", cs)
   334  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   335  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   336  		return
   337  	}
   338  
   339  	abort := hostarch.Addr(cs.Abort)
   340  	if critRange.Contains(abort) {
   341  		t.Debugf("Abort in critical section in %+v", cs)
   342  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   343  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   344  		return
   345  	}
   346  
   347  	// Verify signature.
   348  	sigAddr := abort - linux.SizeOfRSeqSignature
   349  
   350  	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
   351  	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
   352  		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
   353  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   354  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   355  		return
   356  	}
   357  
   358  	sig := hostarch.ByteOrder.Uint32(buf)
   359  	if sig != t.rseqSignature {
   360  		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
   361  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   362  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   363  		return
   364  	}
   365  
   366  	// Clear the critical section address.
   367  	//
   368  	// NOTE(b/143949567): We don't support any rseq flags, so we always
   369  	// restart if we are in the critical section, and thus *always* clear
   370  	// critAddrAddr.
   371  	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
   372  		AddressSpaceActive: true,
   373  	}); err != nil {
   374  		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
   375  		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
   376  		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   377  		return
   378  	}
   379  
   380  	// Finally we can actually decide whether or not to restart.
   381  	if !critRange.Contains(hostarch.Addr(t.Arch().IP())) {
   382  		return
   383  	}
   384  
   385  	t.Arch().SetIP(uintptr(cs.Abort))
   386  }
   387  
   388  // Preconditions: The caller must be running on the task goroutine.
   389  func (t *Task) oldRSeqInterrupt() {
   390  	r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
   391  	if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) {
   392  		t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
   393  		t.Arch().SetIP(uintptr(r.Restart))
   394  		t.Arch().SetOldRSeqInterruptedIP(ip)
   395  	}
   396  }
   397  
   398  // Preconditions: The caller must be running on the task goroutine.
   399  func (t *Task) rseqInterrupt() {
   400  	t.rseqAddrInterrupt()
   401  	t.oldRSeqInterrupt()
   402  }