github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/seccomp/patchbpf/enosys_linux.go

//go:build cgo && seccomp
// +build cgo,seccomp

package patchbpf

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"unsafe"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);

// Copied from <linux/seccomp.h>.

#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;

#ifndef SECCOMP_FILTER_FLAG_LOG
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;

#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
# define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
const uintptr_t C_FILTER_FLAG_SPEC_ALLOW = SECCOMP_FILTER_FLAG_SPEC_ALLOW;

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV 243
#endif
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif

// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.

const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
*/
import "C"

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

// Assume sizeof(int) == 4 in the BPF program.
const bpfSizeofInt = 4
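
// For reference (an approximation -- the authoritative value comes from the
// cgo constant above): SCMP_ACT_ERRNO(x) in <seccomp.h> packs the errno into
// the low 16 "data" bits of SECCOMP_RET_ERRNO, so retErrnoEnosys is roughly
// 0x00050000 | (ENOSYS & 0xffff).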

// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

func isAllowAction(action configs.Action) bool {
	switch action {
	// Trace is considered an "allow" action because a good tracer should
	// support future syscalls (by handling -ENOSYS on its own), and giving
	// -ENOSYS will be disruptive for emulation.
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
	for {
		// Read the next instruction. We have to use NativeEndian because
		// seccomp_export_bpf outputs the program in *host* endian-ness.
		var insn unix.SockFilter
		if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
			if errors.Is(err, io.EOF) {
				// Parsing complete.
				break
			}
			if errors.Is(err, io.ErrUnexpectedEOF) {
				// Parsing stopped mid-instruction.
				return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
			}
			// All other errors.
			return nil, fmt.Errorf("error parsing instructions: %w", err)
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}

func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("error creating scratch pipe: %w", err)
	}
	defer wtr.Close()
	defer rdr.Close()

	readerBuffer := new(bytes.Buffer)
	errChan := make(chan error, 1)
	go func() {
		_, err := io.Copy(readerBuffer, rdr)
		errChan <- err
		close(errChan)
	}()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, fmt.Errorf("error exporting BPF: %w", err)
	}
	// Close so that the reader actually gets EOF.
	_ = wtr.Close()

	if copyErr := <-errChan; copyErr != nil {
		return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
	}

	// Parse the instructions.
	rawProgram, err := parseProgram(readerBuffer)
	if err != nil {
		return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.New("could not disassemble entire BPF filter")
	}
	return program, nil
}

type linuxAuditArch uint32

const invalidArch linuxAuditArch = 0

func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
		}
		return scmpArchToAuditArch(arch)
	case libseccomp.ArchX86:
		return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
		// 30th bit of the syscall number set to indicate that it's not a
		// normal x86_64 syscall.
		return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
	case libseccomp.ArchRISCV64:
		return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
	default:
		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
	}
}

type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
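
// Illustrative only (the syscall numbers are made up): for an amd64 container
// config that also lists x32, findLastSyscalls below would produce something
// along the lines of
//
//	lastSyscallMap{AUDIT_ARCH_X86_64: {ArchAMD64: 451, ArchX32: 441}}
//
// i.e. one native audit arch whose ScmpArch "modes" may each have a different
// largest referenced syscall number.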

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	scmpArchs := make(map[libseccomp.ScmpArch]struct{})
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}
		scmpArchs[arch] = struct{}{}
	}
	// On architectures like ppc64le, Docker inexplicably doesn't include the
	// native architecture in the architecture list which results in no
	// architectures being present in the list at all (rendering the ENOSYS
	// stub a no-op). So, always include the native architecture.
	if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
		return nil, fmt.Errorf("unable to get native arch: %w", err)
	} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
		logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
		scmpArchs[nativeScmpArch] = struct{}{}
	}
	logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	lastSyscalls := make(lastSyscallMap)
	for arch := range scmpArchs {
		auditArch, err := scmpArchToAuditArch(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[auditArch]; !ok {
			lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[auditArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
			lastSyscalls[auditArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %v", arch)
			delete(lastSyscalls[auditArch], arch)
		}
	}
	return lastSyscalls, nil
}
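
// For orientation (illustrative only, not generated verbatim): for a filter
// whose config lists just amd64, the stub prepended by generateEnosysStub
// below looks roughly like this, where <filter> means "fall through to the
// original libseccomp-generated program" and <enosys> means "return -ENOSYS":
//
//	load [4]                    ; seccomp_data.arch
//	jeq AUDIT_ARCH_X86_64,1     ; known arch -> jump to its section
//	ja <filter>                 ; unknown arch -> let the original filter decide
//	load [0]                    ; seccomp_data.nr
//	jset (1<<30),<filter>       ; x32-flagged syscalls are left to the filter
//	jgt <largest amd64 sysno>,<enosys>
//	ja <filter>
//	ja <filter>                 ; shared fall-through slot
//	ret SCMP_ACT_ERRNO(ENOSYS)  ; <enosys>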

// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each linuxAuditArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[linuxAuditArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

	// Generate the syscall -ENOSYS rules.
	for auditArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0] (syscall number)
			bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall and scmpArch in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
			// multiplexing "large syscall number" syscalls, but if the syscall
			// number is not known to the kernel then the syscall number is
			// left unchanged (and because it is sysno=0, you'll end up with
			// EPERM for syscalls the kernel doesn't know about).
			//
			// The actual setup(2) syscall is never used by userspace anymore
			// (and hasn't existed for decades) outside of this multiplexing
			// scheme so returning -ENOSYS is fine.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [setup=0],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys is larger than 255 (and
			// thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86 we need to add a check for x32 and if we're in
			// the wrong mode we jump over the section.
			if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case, we can't handle any others.
			if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ret [ENOSYS]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[auditArch] = uint32(len(programTail))
	}
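
	// Illustrative only: for a config that lists just amd64, programTail is
	// now 6 instructions long (the 4-instruction amd64 section plus the
	// shared fall-through and -ENOSYS tail), and archJumpTable records 6 for
	// AUDIT_ARCH_X86_64 -- that arch's section starts 6 instructions from the
	// end. Once the dummy "jump to filter" below is prepended, the per-arch
	// jeq needs to skip len(programTail) - 6 = 1 instruction to land on it.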

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	//       architectures based on how large the jumps are going to be, or
	//       re-sort the candidate architectures each time to make sure that we
	//       pick the largest jump which is going to be smaller than 255.
	for auditArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[auditArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(auditArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(auditArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}

func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to []unix.SockFilter so the program can be loaded through
	// unix.SockFprog in sysSeccompSetFilter.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// Patch the generated cBPF only when there is no defaultErrnoRet set, or
	// when the one that is set is different from ENOSYS.
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf(" [%4.1d] %s", idx, insn)
	}
	logrus.Debugf(" [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}

func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}
	if apiLevel >= 4 {
		if ssb, err := filter.GetSSB(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_SPEC_ALLOW bit: %w", err)
		} else if ssb {
			flags |= uint(C.C_FILTER_FLAG_SPEC_ALLOW)
		}
	}
	// XXX: add newly supported filter flags above this line.
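
	// Notify actions need a seccomp(2) listener fd to deliver notifications
	// to, and the kernel only hands one back when
	// SECCOMP_FILTER_FLAG_NEW_LISTENER is passed, so request the flag if any
	// rule uses the Notify action.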
	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}

func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	// This debug output is validated in tests/integration/seccomp.bats
	// by the SECCOMP_FILTER_FLAG_* test.
	logrus.Debugf("seccomp filter flags: %d", flags)
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}

// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default action behaviour, and loads the patched filter into the
// kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return nil, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return nil, fmt.Errorf("error loading seccomp filter: %w", err)
	}
	return os.NewFile(uintptr(fd), "[seccomp filter]"), nil
}
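
// For illustration: a caller might drive this package roughly as follows,
// assuming `config` is the container's *configs.Seccomp and `filter` is a
// *libseccomp.ScmpFilter that has already been populated with the config's
// rules:
//
//	listenerFile, err := patchbpf.PatchAndLoad(config, filter)
//	if err != nil {
//		return fmt.Errorf("loading seccomp filter: %w", err)
//	}
//	// listenerFile may be nil; it carries the seccomp notify fd only when a
//	// Notify rule caused SECCOMP_FILTER_FLAG_NEW_LISTENER to be requested.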