github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/common/seccomp.go (about)

     1  // Copyright 2016 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package common
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"strings"
    23  
    24  	stage1commontypes "github.com/rkt/rkt/stage1/common/types"
    25  
    26  	"github.com/appc/spec/schema/types"
    27  	"github.com/coreos/go-systemd/unit"
    28  )
    29  
    30  var (
    31  	ErrTooManySeccompIsolators = errors.New("too many seccomp isolators specified")
    32  )
    33  
    34  // Systemd filter mode, see
    35  // https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=
    36  const (
    37  	sdBlacklistPrefix = "~"
    38  	sdWhitelistPrefix = ""
    39  )
    40  
    41  type filterType int
    42  
    43  const (
    44  	ModeBlacklist filterType = iota
    45  	ModeWhitelist
    46  )
    47  
    48  // seccompFilter is an internal representation of the seccomp filtering
    49  // supplied by the isolators.
    50  type seccompFilter struct {
    51  	syscalls             []string   // List of syscalls to filter
    52  	mode                 filterType // whitelist or blacklist
    53  	errno                string     // optional - empty string = use default
    54  	forceNoNewPrivileges bool       // If true, then override the NoNewPrivileges isolator
    55  }
    56  
    57  // generateSeccompFilter computes the concrete seccomp filter from the isolators
    58  func generateSeccompFilter(p *stage1commontypes.Pod, pa *preparedApp) (*seccompFilter, error) {
    59  	sf := seccompFilter{}
    60  	seenIsolators := 0
    61  	for _, i := range pa.app.App.Isolators {
    62  		var flag string
    63  		var err error
    64  		if seccomp, ok := i.Value().(types.LinuxSeccompSet); ok {
    65  			seenIsolators++
    66  			// By appc spec, only one seccomp isolator per app is allowed
    67  			if seenIsolators > 1 {
    68  				return nil, ErrTooManySeccompIsolators
    69  			}
    70  			switch i.Name {
    71  			case types.LinuxSeccompRemoveSetName:
    72  				sf.mode = ModeBlacklist
    73  				sf.syscalls, flag, err = parseLinuxSeccompSet(p, seccomp)
    74  				if err != nil {
    75  					return nil, err
    76  				}
    77  				if flag == "empty" {
    78  					// we interpret "remove @empty" to mean "default whitelist"
    79  					sf.mode = ModeWhitelist
    80  					sf.syscalls = RktDefaultSeccompWhitelist
    81  				}
    82  			case types.LinuxSeccompRetainSetName:
    83  				sf.mode = ModeWhitelist
    84  				sf.syscalls, flag, err = parseLinuxSeccompSet(p, seccomp)
    85  				if err != nil {
    86  					return nil, err
    87  				}
    88  				if flag == "all" {
    89  					// Opt-out seccomp filtering
    90  					return nil, nil
    91  				}
    92  			}
    93  			sf.errno = string(seccomp.Errno())
    94  		}
    95  	}
    96  
    97  	// If unset, use rkt default whitelist
    98  	if seenIsolators == 0 {
    99  		sf.mode = ModeWhitelist
   100  		sf.syscalls = RktDefaultSeccompWhitelist
   101  	}
   102  
   103  	// Non-priv apps *must* have NoNewPrivileges set if they have seccomp
   104  	sf.forceNoNewPrivileges = (pa.uid != 0)
   105  
   106  	return &sf, nil
   107  }
   108  
   109  // seccompUnitOptions converts a concrete seccomp filter to systemd unit options
   110  func seccompUnitOptions(opts []*unit.UnitOption, sf *seccompFilter) ([]*unit.UnitOption, error) {
   111  	if sf == nil {
   112  		return opts, nil
   113  	}
   114  	if sf.errno != "" {
   115  		opts = append(opts, unit.NewUnitOption("Service", "SystemCallErrorNumber", sf.errno))
   116  	}
   117  
   118  	var filterPrefix string
   119  	switch sf.mode {
   120  	case ModeWhitelist:
   121  		filterPrefix = sdWhitelistPrefix
   122  	case ModeBlacklist:
   123  		filterPrefix = sdBlacklistPrefix
   124  	default:
   125  		return nil, fmt.Errorf("unknown filter mode %v", sf.mode)
   126  	}
   127  
   128  	// SystemCallFilter options are written down one entry per line, because
   129  	// filtering sets may be quite large and overlong lines break unit serialization.
   130  	opts = appendOptionsList(opts, "Service", "SystemCallFilter", filterPrefix, sf.syscalls...)
   131  	return opts, nil
   132  }
   133  
   134  // parseLinuxSeccompSet gets an appc LinuxSeccompSet and returns an array
   135  // of values suitable for systemd SystemCallFilter.
   136  func parseLinuxSeccompSet(p *stage1commontypes.Pod, s types.LinuxSeccompSet) (syscallFilter []string, flag string, err error) {
   137  	for _, item := range s.Set() {
   138  		if item[0] == '@' {
   139  			// Wildcards
   140  			wildcard := strings.SplitN(string(item), "/", 2)
   141  			if len(wildcard) != 2 {
   142  				continue
   143  			}
   144  			scope := wildcard[0]
   145  			name := wildcard[1]
   146  			switch scope {
   147  			case "@appc.io":
   148  				// appc-reserved wildcards
   149  				switch name {
   150  				case "all":
   151  					return nil, "all", nil
   152  				case "empty":
   153  					return nil, "empty", nil
   154  				}
   155  			case "@docker":
   156  				// Docker-originated wildcards
   157  				switch name {
   158  				case "default-blacklist":
   159  					syscallFilter = append(syscallFilter, DockerDefaultSeccompBlacklist...)
   160  				case "default-whitelist":
   161  					syscallFilter = append(syscallFilter, DockerDefaultSeccompWhitelist...)
   162  				}
   163  			case "@rkt":
   164  				// Custom rkt wildcards
   165  				switch name {
   166  				case "default-blacklist":
   167  					syscallFilter = append(syscallFilter, RktDefaultSeccompBlacklist...)
   168  				case "default-whitelist":
   169  					syscallFilter = append(syscallFilter, RktDefaultSeccompWhitelist...)
   170  				}
   171  			case "@systemd":
   172  				// Custom systemd wildcards (systemd >= 231)
   173  				_, systemdVersion, err := GetFlavor(p)
   174  				if err != nil || systemdVersion < 231 {
   175  					return nil, "", errors.New("Unsupported or unknown systemd version, seccomp groups need systemd >= v231")
   176  				}
   177  				switch name {
   178  				case "clock":
   179  					syscallFilter = append(syscallFilter, "@clock")
   180  				case "default-whitelist":
   181  					syscallFilter = append(syscallFilter, "@default")
   182  				case "mount":
   183  					syscallFilter = append(syscallFilter, "@mount")
   184  				case "network-io":
   185  					syscallFilter = append(syscallFilter, "@network-io")
   186  				case "obsolete":
   187  					syscallFilter = append(syscallFilter, "@obsolete")
   188  				case "privileged":
   189  					syscallFilter = append(syscallFilter, "@privileged")
   190  				case "process":
   191  					syscallFilter = append(syscallFilter, "@process")
   192  				case "raw-io":
   193  					syscallFilter = append(syscallFilter, "@raw-io")
   194  				}
   195  			}
   196  		} else {
   197  			// Plain syscall name
   198  			syscallFilter = append(syscallFilter, string(item))
   199  		}
   200  	}
   201  	return syscallFilter, "", nil
   202  }