github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/rseq.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 20 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 21 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 22 "github.com/nicocha30/gvisor-ligolo/pkg/hostarch" 23 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostcpu" 24 "github.com/nicocha30/gvisor-ligolo/pkg/usermem" 25 ) 26 27 // Restartable sequences. 28 // 29 // We support two different APIs for restartable sequences. 30 // 31 // 1. The upstream interface added in v4.18. 32 // 2. The interface described in https://lwn.net/Articles/650333/. 33 // 34 // Throughout this file and other parts of the kernel, the latter is referred 35 // to as "old rseq". This interface was never merged upstream, but is supported 36 // for a limited set of applications that use it regardless. 37 38 // OldRSeqCriticalRegion describes an old rseq critical region. 39 // 40 // +stateify savable 41 type OldRSeqCriticalRegion struct { 42 // When a task in this thread group has its CPU preempted (as defined by 43 // platform.ErrContextCPUPreempted) or has a signal delivered to an 44 // application handler while its instruction pointer is in CriticalSection, 45 // set the instruction pointer to Restart and application register r10 (on 46 // amd64) to the former instruction pointer. 47 CriticalSection hostarch.AddrRange 48 Restart hostarch.Addr 49 } 50 51 // RSeqAvailable returns true if t supports (old and new) restartable sequences. 52 func (t *Task) RSeqAvailable() bool { 53 return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() 54 } 55 56 // SetRSeq registers addr as this thread's rseq structure. 57 // 58 // Preconditions: The caller must be running on the task goroutine. 59 func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { 60 if t.rseqAddr != 0 { 61 if t.rseqAddr != addr { 62 return linuxerr.EINVAL 63 } 64 if t.rseqSignature != signature { 65 return linuxerr.EINVAL 66 } 67 return linuxerr.EBUSY 68 } 69 70 // rseq must be aligned and correctly sized. 71 if addr&(linux.AlignOfRSeq-1) != 0 { 72 return linuxerr.EINVAL 73 } 74 if length != linux.SizeOfRSeq { 75 return linuxerr.EINVAL 76 } 77 if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { 78 return linuxerr.EFAULT 79 } 80 81 t.rseqAddr = addr 82 t.rseqSignature = signature 83 84 // Initialize the CPUID. 85 // 86 // Linux implicitly does this on return from userspace, where failure 87 // would cause SIGSEGV. 88 if err := t.rseqUpdateCPU(); err != nil { 89 t.rseqAddr = 0 90 t.rseqSignature = 0 91 92 t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) 93 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 94 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 95 return linuxerr.EFAULT 96 } 97 98 return nil 99 } 100 101 // ClearRSeq unregisters addr as this thread's rseq structure. 102 // 103 // Preconditions: The caller must be running on the task goroutine. 104 func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { 105 if t.rseqAddr == 0 { 106 return linuxerr.EINVAL 107 } 108 if t.rseqAddr != addr { 109 return linuxerr.EINVAL 110 } 111 if length != linux.SizeOfRSeq { 112 return linuxerr.EINVAL 113 } 114 if t.rseqSignature != signature { 115 return linuxerr.EPERM 116 } 117 118 if err := t.rseqClearCPU(); err != nil { 119 return err 120 } 121 122 t.rseqAddr = 0 123 t.rseqSignature = 0 124 125 if t.oldRSeqCPUAddr == 0 { 126 // rseqCPU no longer needed. 127 t.rseqCPU = -1 128 } 129 130 return nil 131 } 132 133 // OldRSeqCriticalRegion returns a copy of t's thread group's current 134 // old restartable sequence. 135 func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { 136 return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) 137 } 138 139 // SetOldRSeqCriticalRegion replaces t's thread group's old restartable 140 // sequence. 141 // 142 // Preconditions: t.RSeqAvailable() == true. 143 func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { 144 // These checks are somewhat more lenient than in Linux, which (bizarrely) 145 // requires r.CriticalSection to be non-empty and r.Restart to be 146 // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 147 // (which disables the critical region). 148 if r.CriticalSection.Start == 0 { 149 r.CriticalSection.End = 0 150 r.Restart = 0 151 t.tg.oldRSeqCritical.Store(&r) 152 return nil 153 } 154 if r.CriticalSection.Start >= r.CriticalSection.End { 155 return linuxerr.EINVAL 156 } 157 if r.CriticalSection.Contains(r.Restart) { 158 return linuxerr.EINVAL 159 } 160 // TODO(jamieliu): check that r.CriticalSection and r.Restart are in 161 // the application address range, for consistency with Linux. 162 t.tg.oldRSeqCritical.Store(&r) 163 return nil 164 } 165 166 // OldRSeqCPUAddr returns the address that old rseq will keep updated with t's 167 // CPU number. 168 // 169 // Preconditions: The caller must be running on the task goroutine. 170 func (t *Task) OldRSeqCPUAddr() hostarch.Addr { 171 return t.oldRSeqCPUAddr 172 } 173 174 // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with 175 // t's CPU number. 176 // 177 // Preconditions: 178 // - t.RSeqAvailable() == true. 179 // - The caller must be running on the task goroutine. 180 // - t's AddressSpace must be active. 181 func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { 182 t.oldRSeqCPUAddr = addr 183 184 // Check that addr is writable. 185 // 186 // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's 187 // unfortunate, but unlikely in a correct program. 188 if err := t.rseqUpdateCPU(); err != nil { 189 t.oldRSeqCPUAddr = 0 190 return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT 191 } 192 return nil 193 } 194 195 // Preconditions: 196 // - The caller must be running on the task goroutine. 197 // - t's AddressSpace must be active. 198 func (t *Task) rseqUpdateCPU() error { 199 if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { 200 t.rseqCPU = -1 201 return nil 202 } 203 204 t.rseqCPU = int32(hostcpu.GetCPU()) 205 206 // Update both CPUs, even if one fails. 207 rerr := t.rseqCopyOutCPU() 208 oerr := t.oldRSeqCopyOutCPU() 209 210 if rerr != nil { 211 return rerr 212 } 213 return oerr 214 } 215 216 // Preconditions: 217 // - The caller must be running on the task goroutine. 218 // - t's AddressSpace must be active. 219 func (t *Task) oldRSeqCopyOutCPU() error { 220 if t.oldRSeqCPUAddr == 0 { 221 return nil 222 } 223 224 buf := t.CopyScratchBuffer(4) 225 hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) 226 _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) 227 return err 228 } 229 230 // Preconditions: 231 // - The caller must be running on the task goroutine. 232 // - t's AddressSpace must be active. 233 func (t *Task) rseqCopyOutCPU() error { 234 if t.rseqAddr == 0 { 235 return nil 236 } 237 238 buf := t.CopyScratchBuffer(8) 239 // CPUIDStart and CPUID are the first two fields in linux.RSeq. 240 hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart 241 hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID 242 // N.B. This write is not atomic, but since this occurs on the task 243 // goroutine then as long as userspace uses a single-instruction read 244 // it can't see an invalid value. 245 _, err := t.CopyOutBytes(t.rseqAddr, buf) 246 return err 247 } 248 249 // Preconditions: 250 // - The caller must be running on the task goroutine. 251 // - t's AddressSpace must be active. 252 func (t *Task) rseqClearCPU() error { 253 buf := t.CopyScratchBuffer(8) 254 // CPUIDStart and CPUID are the first two fields in linux.RSeq. 255 hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart 256 hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID 257 // N.B. This write is not atomic, but since this occurs on the task 258 // goroutine then as long as userspace uses a single-instruction read 259 // it can't see an invalid value. 260 _, err := t.CopyOutBytes(t.rseqAddr, buf) 261 return err 262 } 263 264 // rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. 265 // 266 // This is a bit complex since both the RSeq and RSeqCriticalSection structs 267 // are stored in userspace. So we must: 268 // 269 // 1. Copy in the address of RSeqCriticalSection from RSeq. 270 // 2. Copy in RSeqCriticalSection itself. 271 // 3. Validate critical section struct version, address range, abort address. 272 // 4. Validate the abort signature (4 bytes preceding abort IP match expected 273 // signature). 274 // 275 // 5. Clear address of RSeqCriticalSection from RSeq. 276 // 6. Finally, conditionally abort. 277 // 278 // See kernel/rseq.c:rseq_ip_fixup for reference. 279 // 280 // Preconditions: 281 // - The caller must be running on the task goroutine. 282 // - t's AddressSpace must be active. 283 func (t *Task) rseqAddrInterrupt() { 284 if t.rseqAddr == 0 { 285 return 286 } 287 288 critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) 289 if !ok { 290 // SetRSeq should validate this. 291 panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) 292 } 293 294 if t.Arch().Width() != 8 { 295 // We only handle 64-bit for now. 296 t.Debugf("Only 64-bit rseq supported.") 297 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 298 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 299 return 300 } 301 302 buf := t.CopyScratchBuffer(8) 303 if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { 304 t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) 305 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 306 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 307 return 308 } 309 310 critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf)) 311 if critAddr == 0 { 312 return 313 } 314 315 var cs linux.RSeqCriticalSection 316 if _, err := cs.CopyIn(t, critAddr); err != nil { 317 t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) 318 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 319 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 320 return 321 } 322 323 if cs.Version != 0 { 324 t.Debugf("Unknown version in %+v", cs) 325 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 326 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 327 return 328 } 329 330 start := hostarch.Addr(cs.Start) 331 critRange, ok := start.ToRange(cs.PostCommitOffset) 332 if !ok { 333 t.Debugf("Invalid start and offset in %+v", cs) 334 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 335 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 336 return 337 } 338 339 abort := hostarch.Addr(cs.Abort) 340 if critRange.Contains(abort) { 341 t.Debugf("Abort in critical section in %+v", cs) 342 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 343 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 344 return 345 } 346 347 // Verify signature. 348 sigAddr := abort - linux.SizeOfRSeqSignature 349 350 buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) 351 if _, err := t.CopyInBytes(sigAddr, buf); err != nil { 352 t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) 353 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 354 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 355 return 356 } 357 358 sig := hostarch.ByteOrder.Uint32(buf) 359 if sig != t.rseqSignature { 360 t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) 361 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 362 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 363 return 364 } 365 366 // Clear the critical section address. 367 // 368 // NOTE(b/143949567): We don't support any rseq flags, so we always 369 // restart if we are in the critical section, and thus *always* clear 370 // critAddrAddr. 371 if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ 372 AddressSpaceActive: true, 373 }); err != nil { 374 t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) 375 t.forceSignal(linux.SIGSEGV, false /* unconditional */) 376 t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) 377 return 378 } 379 380 // Finally we can actually decide whether or not to restart. 381 if !critRange.Contains(hostarch.Addr(t.Arch().IP())) { 382 return 383 } 384 385 t.Arch().SetIP(uintptr(cs.Abort)) 386 } 387 388 // Preconditions: The caller must be running on the task goroutine. 389 func (t *Task) oldRSeqInterrupt() { 390 r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) 391 if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) { 392 t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) 393 t.Arch().SetIP(uintptr(r.Restart)) 394 t.Arch().SetOldRSeqInterruptedIP(ip) 395 } 396 } 397 398 // Preconditions: The caller must be running on the task goroutine. 399 func (t *Task) rseqInterrupt() { 400 t.rseqAddrInterrupt() 401 t.oldRSeqInterrupt() 402 }