github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/seccomp/seccomp_linux.go (about)

     1  //go:build cgo && seccomp
     2  // +build cgo,seccomp
     3  
     4  package seccomp
     5  
     6  import (
     7  	"errors"
     8  	"fmt"
     9  	"os"
    10  
    11  	libseccomp "github.com/seccomp/libseccomp-golang"
    12  	"github.com/sirupsen/logrus"
    13  	"golang.org/x/sys/unix"
    14  
    15  	"github.com/opencontainers/runc/libcontainer/configs"
    16  	"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
    17  	"github.com/opencontainers/runtime-spec/specs-go"
    18  )
    19  
    20  var (
    21  	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
    22  	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
    23  )
    24  
    25  const (
    26  	// Linux system calls can have at most 6 arguments
    27  	syscallMaxArguments int = 6
    28  )
    29  
    30  // InitSeccomp installs the seccomp filters to be used in the container as
    31  // specified in config. Returns the seccomp file descriptor if any of the
    32  // filters include a SCMP_ACT_NOTIFY action.
    33  func InitSeccomp(config *configs.Seccomp) (*os.File, error) {
    34  	if config == nil {
    35  		return nil, errors.New("cannot initialize Seccomp - nil config passed")
    36  	}
    37  
    38  	defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
    39  	if err != nil {
    40  		return nil, errors.New("error initializing seccomp - invalid default action")
    41  	}
    42  
    43  	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
    44  	apiLevel, _ := libseccomp.GetAPI()
    45  	for _, call := range config.Syscalls {
    46  		if call.Action == configs.Notify {
    47  			if apiLevel < 6 {
    48  				return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
    49  			}
    50  
    51  			// We can't allow the write syscall to notify to the seccomp agent.
    52  			// After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain
    53  			// number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we
    54  			// never can write the seccomp fd to the parent and therefore the seccomp agent never receives
    55  			// the seccomp fd and runc is hang during initialization.
    56  			//
    57  			// Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY.
    58  			// Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and
    59  			// send the seccomp fd to the agent (it is another process and not subject to the seccomp
    60  			// filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp
    61  			// agent allows those syscalls to proceed, initialization works just fine and the agent can
    62  			// handle future read()/close() syscalls as it wanted.
    63  			if call.Name == "write" {
    64  				return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
    65  			}
    66  		}
    67  	}
    68  
    69  	// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
    70  	if defaultAction == libseccomp.ActNotify {
    71  		return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
    72  	}
    73  
    74  	filter, err := libseccomp.NewFilter(defaultAction)
    75  	if err != nil {
    76  		return nil, fmt.Errorf("error creating filter: %w", err)
    77  	}
    78  
    79  	// Add extra architectures
    80  	for _, arch := range config.Architectures {
    81  		scmpArch, err := libseccomp.GetArchFromString(arch)
    82  		if err != nil {
    83  			return nil, fmt.Errorf("error validating Seccomp architecture: %w", err)
    84  		}
    85  		if err := filter.AddArch(scmpArch); err != nil {
    86  			return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
    87  		}
    88  	}
    89  
    90  	// Add extra flags.
    91  	for _, flag := range config.Flags {
    92  		if err := setFlag(filter, flag); err != nil {
    93  			return nil, err
    94  		}
    95  	}
    96  
    97  	// Enable libseccomp binary tree optimization for longer rulesets.
    98  	//
    99  	// The number below chosen semi-arbitrarily, considering the following:
   100  	// 1. libseccomp <= 2.5.4 misbehaves when binary tree optimization
   101  	// is enabled and there are 0 rules.
   102  	// 2. All known libseccomp versions (2.5.0 to 2.5.4) generate a binary
   103  	// tree with 4 syscalls per node.
   104  	if len(config.Syscalls) > 32 {
   105  		if err := filter.SetOptimize(2); err != nil {
   106  			// The error is not fatal and is probably means we have older libseccomp.
   107  			logrus.Debugf("seccomp binary tree optimization not available: %v", err)
   108  		}
   109  	}
   110  
   111  	// Unset no new privs bit
   112  	if err := filter.SetNoNewPrivsBit(false); err != nil {
   113  		return nil, fmt.Errorf("error setting no new privileges: %w", err)
   114  	}
   115  
   116  	// Add a rule for each syscall
   117  	for _, call := range config.Syscalls {
   118  		if call == nil {
   119  			return nil, errors.New("encountered nil syscall while initializing Seccomp")
   120  		}
   121  
   122  		if err := matchCall(filter, call, defaultAction); err != nil {
   123  			return nil, err
   124  		}
   125  	}
   126  
   127  	seccompFd, err := patchbpf.PatchAndLoad(config, filter)
   128  	if err != nil {
   129  		return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
   130  	}
   131  	return seccompFd, nil
   132  }
   133  
   134  type unknownFlagError struct {
   135  	flag specs.LinuxSeccompFlag
   136  }
   137  
   138  func (e *unknownFlagError) Error() string {
   139  	return "seccomp flag " + string(e.flag) + " is not known to runc"
   140  }
   141  
   142  func setFlag(filter *libseccomp.ScmpFilter, flag specs.LinuxSeccompFlag) error {
   143  	switch flag {
   144  	case flagTsync:
   145  		// libseccomp-golang always use filterAttrTsync when
   146  		// possible so all goroutines will receive the same
   147  		// rules, so there is nothing to do. It does not make
   148  		// sense to apply the seccomp filter on only one
   149  		// thread; other threads will be terminated after exec
   150  		// anyway.
   151  		return nil
   152  	case specs.LinuxSeccompFlagLog:
   153  		if err := filter.SetLogBit(true); err != nil {
   154  			return fmt.Errorf("error adding log flag to seccomp filter: %w", err)
   155  		}
   156  		return nil
   157  	case specs.LinuxSeccompFlagSpecAllow:
   158  		if err := filter.SetSSB(true); err != nil {
   159  			return fmt.Errorf("error adding SSB flag to seccomp filter: %w", err)
   160  		}
   161  		return nil
   162  	}
   163  	// NOTE when adding more flags above, do not forget to also:
   164  	// - add new flags to `flags` slice in config.go;
   165  	// - add new flag values to flags_value() in tests/integration/seccomp.bats;
   166  	// - modify func filterFlags in patchbpf/ accordingly.
   167  
   168  	return &unknownFlagError{flag: flag}
   169  }
   170  
   171  // FlagSupported checks if the flag is known to runc and supported by
   172  // currently used libseccomp and kernel (i.e. it can be set).
   173  func FlagSupported(flag specs.LinuxSeccompFlag) error {
   174  	filter := &libseccomp.ScmpFilter{}
   175  	err := setFlag(filter, flag)
   176  
   177  	// For flags we don't know, setFlag returns unknownFlagError.
   178  	var uf *unknownFlagError
   179  	if errors.As(err, &uf) {
   180  		return err
   181  	}
   182  	// For flags that are known to runc and libseccomp-golang but can not
   183  	// be applied because either libseccomp or the kernel is too old,
   184  	// seccomp.VersionError is returned.
   185  	var verErr *libseccomp.VersionError
   186  	if errors.As(err, &verErr) {
   187  		// Not supported by libseccomp or the kernel.
   188  		return err
   189  	}
   190  
   191  	// All other flags are known and supported.
   192  	return nil
   193  }
   194  
   195  // Convert Libcontainer Action to Libseccomp ScmpAction
   196  func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
   197  	switch act {
   198  	case configs.Kill, configs.KillThread:
   199  		return libseccomp.ActKillThread, nil
   200  	case configs.Errno:
   201  		if errnoRet != nil {
   202  			return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
   203  		}
   204  		return actErrno, nil
   205  	case configs.Trap:
   206  		return libseccomp.ActTrap, nil
   207  	case configs.Allow:
   208  		return libseccomp.ActAllow, nil
   209  	case configs.Trace:
   210  		if errnoRet != nil {
   211  			return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil
   212  		}
   213  		return actTrace, nil
   214  	case configs.Log:
   215  		return libseccomp.ActLog, nil
   216  	case configs.Notify:
   217  		return libseccomp.ActNotify, nil
   218  	case configs.KillProcess:
   219  		return libseccomp.ActKillProcess, nil
   220  	default:
   221  		return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule")
   222  	}
   223  }
   224  
   225  // Convert Libcontainer Operator to Libseccomp ScmpCompareOp
   226  func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
   227  	switch op {
   228  	case configs.EqualTo:
   229  		return libseccomp.CompareEqual, nil
   230  	case configs.NotEqualTo:
   231  		return libseccomp.CompareNotEqual, nil
   232  	case configs.GreaterThan:
   233  		return libseccomp.CompareGreater, nil
   234  	case configs.GreaterThanOrEqualTo:
   235  		return libseccomp.CompareGreaterEqual, nil
   236  	case configs.LessThan:
   237  		return libseccomp.CompareLess, nil
   238  	case configs.LessThanOrEqualTo:
   239  		return libseccomp.CompareLessOrEqual, nil
   240  	case configs.MaskEqualTo:
   241  		return libseccomp.CompareMaskedEqual, nil
   242  	default:
   243  		return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule")
   244  	}
   245  }
   246  
   247  // Convert Libcontainer Arg to Libseccomp ScmpCondition
   248  func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
   249  	cond := libseccomp.ScmpCondition{}
   250  
   251  	if arg == nil {
   252  		return cond, errors.New("cannot convert nil to syscall condition")
   253  	}
   254  
   255  	op, err := getOperator(arg.Op)
   256  	if err != nil {
   257  		return cond, err
   258  	}
   259  
   260  	return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
   261  }
   262  
   263  // Add a rule to match a single syscall
   264  func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error {
   265  	if call == nil || filter == nil {
   266  		return errors.New("cannot use nil as syscall to block")
   267  	}
   268  
   269  	if len(call.Name) == 0 {
   270  		return errors.New("empty string is not a valid syscall")
   271  	}
   272  
   273  	// Convert the call's action to the libseccomp equivalent
   274  	callAct, err := getAction(call.Action, call.ErrnoRet)
   275  	if err != nil {
   276  		return fmt.Errorf("action in seccomp profile is invalid: %w", err)
   277  	}
   278  	if callAct == defAct {
   279  		// This rule is redundant, silently skip it
   280  		// to avoid error from AddRule.
   281  		return nil
   282  	}
   283  
   284  	// If we can't resolve the syscall, assume it is not supported
   285  	// by this kernel. Warn about it, don't error out.
   286  	callNum, err := libseccomp.GetSyscallFromName(call.Name)
   287  	if err != nil {
   288  		logrus.Debugf("unknown seccomp syscall %q ignored", call.Name)
   289  		return nil
   290  	}
   291  
   292  	// Unconditional match - just add the rule
   293  	if len(call.Args) == 0 {
   294  		if err := filter.AddRule(callNum, callAct); err != nil {
   295  			return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err)
   296  		}
   297  	} else {
   298  		// If two or more arguments have the same condition,
   299  		// Revert to old behavior, adding each condition as a separate rule
   300  		argCounts := make([]uint, syscallMaxArguments)
   301  		conditions := []libseccomp.ScmpCondition{}
   302  
   303  		for _, cond := range call.Args {
   304  			newCond, err := getCondition(cond)
   305  			if err != nil {
   306  				return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err)
   307  			}
   308  
   309  			argCounts[cond.Index] += 1
   310  
   311  			conditions = append(conditions, newCond)
   312  		}
   313  
   314  		hasMultipleArgs := false
   315  		for _, count := range argCounts {
   316  			if count > 1 {
   317  				hasMultipleArgs = true
   318  				break
   319  			}
   320  		}
   321  
   322  		if hasMultipleArgs {
   323  			// Revert to old behavior
   324  			// Add each condition attached to a separate rule
   325  			for _, cond := range conditions {
   326  				condArr := []libseccomp.ScmpCondition{cond}
   327  
   328  				if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
   329  					return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
   330  				}
   331  			}
   332  		} else {
   333  			// No conditions share same argument
   334  			// Use new, proper behavior
   335  			if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
   336  				return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
   337  			}
   338  		}
   339  	}
   340  
   341  	return nil
   342  }
   343  
   344  // Version returns major, minor, and micro.
   345  func Version() (uint, uint, uint) {
   346  	return libseccomp.GetLibraryVersion()
   347  }
   348  
   349  // Enabled is true if seccomp support is compiled in.
   350  const Enabled = true