github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/common/seccomp.go (about) 1 // Copyright 2016 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package common 18 19 import ( 20 "errors" 21 "fmt" 22 "strings" 23 24 stage1commontypes "github.com/rkt/rkt/stage1/common/types" 25 26 "github.com/appc/spec/schema/types" 27 "github.com/coreos/go-systemd/unit" 28 ) 29 30 var ( 31 ErrTooManySeccompIsolators = errors.New("too many seccomp isolators specified") 32 ) 33 34 // Systemd filter mode, see 35 // https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter= 36 const ( 37 sdBlacklistPrefix = "~" 38 sdWhitelistPrefix = "" 39 ) 40 41 type filterType int 42 43 const ( 44 ModeBlacklist filterType = iota 45 ModeWhitelist 46 ) 47 48 // seccompFilter is an internal representation of the seccomp filtering 49 // supplied by the isolators. 50 type seccompFilter struct { 51 syscalls []string // List of syscalls to filter 52 mode filterType // whitelist or blacklist 53 errno string // optional - empty string = use default 54 forceNoNewPrivileges bool // If true, then override the NoNewPrivileges isolator 55 } 56 57 // generateSeccompFilter computes the concrete seccomp filter from the isolators 58 func generateSeccompFilter(p *stage1commontypes.Pod, pa *preparedApp) (*seccompFilter, error) { 59 sf := seccompFilter{} 60 seenIsolators := 0 61 for _, i := range pa.app.App.Isolators { 62 var flag string 63 var err error 64 if seccomp, ok := i.Value().(types.LinuxSeccompSet); ok { 65 seenIsolators++ 66 // By appc spec, only one seccomp isolator per app is allowed 67 if seenIsolators > 1 { 68 return nil, ErrTooManySeccompIsolators 69 } 70 switch i.Name { 71 case types.LinuxSeccompRemoveSetName: 72 sf.mode = ModeBlacklist 73 sf.syscalls, flag, err = parseLinuxSeccompSet(p, seccomp) 74 if err != nil { 75 return nil, err 76 } 77 if flag == "empty" { 78 // we interpret "remove @empty" to mean "default whitelist" 79 sf.mode = ModeWhitelist 80 sf.syscalls = RktDefaultSeccompWhitelist 81 } 82 case types.LinuxSeccompRetainSetName: 83 sf.mode = ModeWhitelist 84 sf.syscalls, flag, err = parseLinuxSeccompSet(p, seccomp) 85 if err != nil { 86 return nil, err 87 } 88 if flag == "all" { 89 // Opt-out seccomp filtering 90 return nil, nil 91 } 92 } 93 sf.errno = string(seccomp.Errno()) 94 } 95 } 96 97 // If unset, use rkt default whitelist 98 if seenIsolators == 0 { 99 sf.mode = ModeWhitelist 100 sf.syscalls = RktDefaultSeccompWhitelist 101 } 102 103 // Non-priv apps *must* have NoNewPrivileges set if they have seccomp 104 sf.forceNoNewPrivileges = (pa.uid != 0) 105 106 return &sf, nil 107 } 108 109 // seccompUnitOptions converts a concrete seccomp filter to systemd unit options 110 func seccompUnitOptions(opts []*unit.UnitOption, sf *seccompFilter) ([]*unit.UnitOption, error) { 111 if sf == nil { 112 return opts, nil 113 } 114 if sf.errno != "" { 115 opts = append(opts, unit.NewUnitOption("Service", "SystemCallErrorNumber", sf.errno)) 116 } 117 118 var filterPrefix string 119 switch sf.mode { 120 case ModeWhitelist: 121 filterPrefix = sdWhitelistPrefix 122 case ModeBlacklist: 123 filterPrefix = sdBlacklistPrefix 124 default: 125 return nil, fmt.Errorf("unknown filter mode %v", sf.mode) 126 } 127 128 // SystemCallFilter options are written down one entry per line, because 129 // filtering sets may be quite large and overlong lines break unit serialization. 130 opts = appendOptionsList(opts, "Service", "SystemCallFilter", filterPrefix, sf.syscalls...) 131 return opts, nil 132 } 133 134 // parseLinuxSeccompSet gets an appc LinuxSeccompSet and returns an array 135 // of values suitable for systemd SystemCallFilter. 136 func parseLinuxSeccompSet(p *stage1commontypes.Pod, s types.LinuxSeccompSet) (syscallFilter []string, flag string, err error) { 137 for _, item := range s.Set() { 138 if item[0] == '@' { 139 // Wildcards 140 wildcard := strings.SplitN(string(item), "/", 2) 141 if len(wildcard) != 2 { 142 continue 143 } 144 scope := wildcard[0] 145 name := wildcard[1] 146 switch scope { 147 case "@appc.io": 148 // appc-reserved wildcards 149 switch name { 150 case "all": 151 return nil, "all", nil 152 case "empty": 153 return nil, "empty", nil 154 } 155 case "@docker": 156 // Docker-originated wildcards 157 switch name { 158 case "default-blacklist": 159 syscallFilter = append(syscallFilter, DockerDefaultSeccompBlacklist...) 160 case "default-whitelist": 161 syscallFilter = append(syscallFilter, DockerDefaultSeccompWhitelist...) 162 } 163 case "@rkt": 164 // Custom rkt wildcards 165 switch name { 166 case "default-blacklist": 167 syscallFilter = append(syscallFilter, RktDefaultSeccompBlacklist...) 168 case "default-whitelist": 169 syscallFilter = append(syscallFilter, RktDefaultSeccompWhitelist...) 170 } 171 case "@systemd": 172 // Custom systemd wildcards (systemd >= 231) 173 _, systemdVersion, err := GetFlavor(p) 174 if err != nil || systemdVersion < 231 { 175 return nil, "", errors.New("Unsupported or unknown systemd version, seccomp groups need systemd >= v231") 176 } 177 switch name { 178 case "clock": 179 syscallFilter = append(syscallFilter, "@clock") 180 case "default-whitelist": 181 syscallFilter = append(syscallFilter, "@default") 182 case "mount": 183 syscallFilter = append(syscallFilter, "@mount") 184 case "network-io": 185 syscallFilter = append(syscallFilter, "@network-io") 186 case "obsolete": 187 syscallFilter = append(syscallFilter, "@obsolete") 188 case "privileged": 189 syscallFilter = append(syscallFilter, "@privileged") 190 case "process": 191 syscallFilter = append(syscallFilter, "@process") 192 case "raw-io": 193 syscallFilter = append(syscallFilter, "@raw-io") 194 } 195 } 196 } else { 197 // Plain syscall name 198 syscallFilter = append(syscallFilter, string(item)) 199 } 200 } 201 return syscallFilter, "", nil 202 }