github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/seccomp/patchbpf/enosys_linux.go

//go:build cgo && seccomp
// +build cgo,seccomp

package patchbpf

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"unsafe"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);
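// Note: SCMP_ACT_ERRNO(x) is SECCOMP_RET_ERRNO (0x00050000) with the errno
// value in the low 16 bits, so on architectures where ENOSYS is 38 this works
// out to 0x00050026. The exact value is computed at build time for the build
// architecture.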

// Copied from <linux/seccomp.h>.

#ifndef SECCOMP_SET_MODE_FILTER
#	define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;

#ifndef SECCOMP_FILTER_FLAG_LOG
#	define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;

#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
#	define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
const uintptr_t C_FILTER_FLAG_SPEC_ALLOW = SECCOMP_FILTER_FLAG_SPEC_ALLOW;

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#	define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV		243
#endif
#define AUDIT_ARCH_RISCV64	(EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif

// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.

const uint32_t C_AUDIT_ARCH_I386         = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64       = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM          = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64      = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS         = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64       = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32    = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL       = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64     = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32  = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC          = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64        = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE      = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390         = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X        = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64      = AUDIT_ARCH_RISCV64;
*/
import "C"

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

// Assume sizeof(int) == 4 in the BPF program.
const bpfSizeofInt = 4

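// For reference, the BPF loads below index into the kernel's struct
// seccomp_data, which begins:
//
//	struct seccomp_data {
//		int   nr;    /* syscall number (offset 0) */
//		__u32 arch;  /* AUDIT_ARCH_* value (offset 4) */
//		/* ...followed by instruction_pointer and args[6]. */
//	};
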
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

func isAllowAction(action configs.Action) bool {
	switch action {
	// Trace is considered an "allow" action because a good tracer should
	// support future syscalls (by handling -ENOSYS on its own), and giving
	// -ENOSYS will be disruptive for emulation.
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

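// parseProgram decodes a raw cBPF program, as serialised by libseccomp's
// seccomp_export_bpf, into its component instructions. Each instruction is an
// 8-byte struct sock_filter record in host byte order. For example, a lone
// "ret ALLOW" filter on a little-endian host is the single record
// 06 00 00 00 00 00 ff 7f (code=BPF_RET|BPF_K, k=SECCOMP_RET_ALLOW).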
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
	for {
		// Read the next instruction. We have to use NativeEndian because
		// seccomp_export_bpf outputs the program in *host* endian-ness.
		var insn unix.SockFilter
		if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
			if errors.Is(err, io.EOF) {
				// Parsing complete.
				break
			}
			if errors.Is(err, io.ErrUnexpectedEOF) {
				// Parsing stopped mid-instruction.
				return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
			}
			// All other errors.
			return nil, fmt.Errorf("error parsing instructions: %w", err)
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}

func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("error creating scratch pipe: %w", err)
	}
	defer wtr.Close()
	defer rdr.Close()

	readerBuffer := new(bytes.Buffer)
	errChan := make(chan error, 1)
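	// Drain the pipe from a separate goroutine: ExportBPF writes the whole
	// program, which would otherwise deadlock if it exceeded the kernel's
	// pipe buffer (typically 64 KiB).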
	go func() {
		_, err := io.Copy(readerBuffer, rdr)
		errChan <- err
		close(errChan)
	}()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, fmt.Errorf("error exporting BPF: %w", err)
	}
	// Close so that the reader actually gets EOF.
	_ = wtr.Close()

	if copyErr := <-errChan; copyErr != nil {
		return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
	}

	// Parse the instructions.
	rawProgram, err := parseProgram(readerBuffer)
	if err != nil {
		return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.New("could not disassemble entire BPF filter")
	}
	return program, nil
}

type linuxAuditArch uint32

const invalidArch linuxAuditArch = 0

func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
		}
		return scmpArchToAuditArch(arch)
	case libseccomp.ArchX86:
		return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
		//       30th bit of the syscall number set to indicate that it's not a
		//       normal x86_64 syscall.
		return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
	case libseccomp.ArchRISCV64:
		return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
	default:
		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
	}
}

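// lastSyscallMap maps each kernel audit architecture to the largest syscall
// number referenced by the filter for every ScmpArch sharing that audit arch
// (x32 and x86_64 share AUDIT_ARCH_X86_64 but get separate entries).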
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
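//
// As an illustrative sketch (the syscall numbers here are hypothetical), a
// config listing amd64 and x32 would produce something like:
//
//	lastSyscallMap{
//		C.C_AUDIT_ARCH_X86_64: {ArchAMD64: 334, ArchX32: 545},
//	}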
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	scmpArchs := make(map[libseccomp.ScmpArch]struct{})
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}
		scmpArchs[arch] = struct{}{}
	}
	// On architectures like ppc64le, Docker inexplicably doesn't include the
	// native architecture in the architecture list which results in no
	// architectures being present in the list at all (rendering the ENOSYS
	// stub a no-op). So, always include the native architecture.
	if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
		return nil, fmt.Errorf("unable to get native arch: %w", err)
	} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
		logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
		scmpArchs[nativeScmpArch] = struct{}{}
	}
	logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	lastSyscalls := make(lastSyscallMap)
	for arch := range scmpArchs {
		auditArch, err := scmpArchToAuditArch(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[auditArch]; !ok {
			lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[auditArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
			lastSyscalls[auditArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %v", arch)
			delete(lastSyscalls[auditArch], arch)
		}
	}
	return lastSyscalls, nil
}

// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
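//
// As an illustrative sketch (assuming a single architecture whose largest
// filtered syscall number is N, and none of the x32 or s390x special cases),
// the generated stub looks like:
//
//	 0: ld [4]           ; load seccomp_data.arch
//	 1: jeq [arch],1     ; matching arch: skip to its section (3)
//	 2: ja 5             ; unknown arch: jump to the real filter (8)
//	 3: ld [0]           ; load seccomp_data.nr
//	 4: jgt N,2          ; nr > N: jump to ret ENOSYS (7)
//	 5: ja 2             ; nr <= N: jump to the real filter (8)
//	 6: ja 1             ; fall-through pad: jump to the real filter
//	 7: ret [ENOSYS]
//	 8: (start of the original libseccomp filter)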
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each linuxAuditArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[linuxAuditArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

	// Generate the syscall -ENOSYS rules.
	for auditArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0] (syscall number)
			bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall and scmpArch in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
			// multiplexing "large syscall number" syscalls, but if the syscall
			// number is not known to the kernel then the syscall number is
			// left unchanged (and because it is sysno=0, you'll end up with
			// EPERM for syscalls the kernel doesn't know about).
			//
			// The actual setup(2) syscall is never used by userspace anymore
			// (and hasn't existed for decades) outside of this multiplexing
			// scheme so returning -ENOSYS is fine.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [setup=0],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys+1 doesn't fit in the
			// 8-bit jump offset (and thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86_64 we need to add a check for x32, and if we're
			// in the wrong mode we jump over the section.
			if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case, we can't handle any others.
			if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ret [ENOSYS]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[auditArch] = uint32(len(programTail))
	}

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	//       architectures based on how large the jumps are going to be, or
	//       re-sort the candidate architectures each time to make sure that we
	//       pick the largest jump which is going to be smaller than 255.
	for auditArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[auditArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(auditArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(auditArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}

func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to []unix.SockFilter for use with unix.SockFprog.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

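// generatePatch generates the -ENOSYS stub to prepend to the filter, or nil
// if the configuration does not require one.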
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// Patch the generated cBPF only when there is no defaultErrnoRet set, or
	// when it is set to something other than ENOSYS (a default of ENOSYS
	// already gives unknown syscalls the behaviour our stub provides).
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

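// enosysPatchFilter disassembles the given libseccomp filter, prepends the
// -ENOSYS stub produced by generatePatch, and reassembles the combined
// program into a form loadable with seccomp(2).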
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf("  [%4.1d] %s", idx, insn)
	}
	logrus.Debugf("  [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}

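// filterFlags computes the SECCOMP_FILTER_FLAG_* flags implied by the filter
// and config (LOG, SPEC_ALLOW, and NEW_LISTENER for notify rules), along with
// whether the filter requested the no_new_privs bit.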
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}
	if apiLevel >= 4 {
		if ssb, err := filter.GetSSB(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_SPEC_ALLOW bit: %w", err)
		} else if ssb {
			flags |= uint(C.C_FILTER_FLAG_SPEC_ALLOW)
		}
	}
	// XXX: add newly supported filter flags above this line.

	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}

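// sysSeccompSetFilter loads the filter, using prctl(PR_SET_SECCOMP) when no
// flags are needed and seccomp(SECCOMP_SET_MODE_FILTER, flags, &fprog)
// otherwise. The returned fd is only valid if SECCOMP_FILTER_FLAG_NEW_LISTENER
// was set; it is -1 in all other cases.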
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	// This debug output is validated in tests/integration/seccomp.bats
	// by the SECCOMP_FILTER_FLAG_* test.
	logrus.Debugf("seccomp filter flags: %d", flags)
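	// The kernel rejects filters with more than BPF_MAXINSNS (4096)
	// instructions, so the uint16 length below cannot overflow in practice.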
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}

// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default action behaviour, and loads the patched filter into
// the kernel for the current process.
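//
// When the filter does not request a seccomp notification listener, the
// underlying descriptor is -1 and the returned *os.File is nil (os.NewFile
// returns nil for negative descriptors). An illustrative caller (the filter
// is assumed to have been populated from config via libseccomp-golang):
//
//	listener, err := PatchAndLoad(config, filter)
//	if err != nil {
//		return err
//	}
//	if listener != nil {
//		// Hand the seccomp notify fd off to the agent.
//	}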
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return nil, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return nil, fmt.Errorf("error loading seccomp filter: %w", err)
	}
	return os.NewFile(uintptr(fd), "[seccomp filter]"), nil
}