github.com/ubuntu-core/snappy@v0.0.0-20210827154228-9e584df982bb/interfaces/builtin/kubernetes_support.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2017-2018 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package builtin 21 22 import ( 23 "fmt" 24 "strings" 25 26 "github.com/snapcore/snapd/interfaces" 27 "github.com/snapcore/snapd/interfaces/apparmor" 28 "github.com/snapcore/snapd/interfaces/kmod" 29 "github.com/snapcore/snapd/interfaces/seccomp" 30 "github.com/snapcore/snapd/interfaces/udev" 31 "github.com/snapcore/snapd/snap" 32 ) 33 34 const kubernetesSupportSummary = `allows operating as the Kubernetes service` 35 36 const kubernetesSupportBaseDeclarationPlugs = ` 37 kubernetes-support: 38 allow-installation: false 39 deny-auto-connection: true 40 ` 41 42 const kubernetesSupportBaseDeclarationSlots = ` 43 kubernetes-support: 44 allow-installation: 45 slot-snap-type: 46 - core 47 deny-auto-connection: true 48 ` 49 50 const kubernetesSupportConnectedPlugAppArmorCommon = ` 51 # Common rules for running as a kubernetes node 52 53 # reading cgroups 54 capability sys_resource, 55 /sys/fs/cgroup/{,**} r, 56 57 # Allow adjusting the OOM score for containers. Note, this allows adjusting for 58 # all processes, not just containers. 59 @{PROC}/@{pid}/oom_score_adj rw, 60 @{PROC}/sys/vm/overcommit_memory rw, 61 /sys/kernel/mm/hugepages/{,**} r, 62 /sys/kernel/mm/transparent_hugepage/{,**} r, 63 64 capability dac_override, 65 66 /{,usr/}bin/systemd-run Cxr -> systemd_run, 67 /run/systemd/private r, 68 profile systemd_run (attach_disconnected,mediate_deleted) { 69 # Common rules for kubernetes use of systemd_run 70 #include <abstractions/base> 71 72 /{,usr/}bin/systemd-run rm, 73 owner @{PROC}/@{pid}/stat r, 74 owner @{PROC}/@{pid}/environ r, 75 @{PROC}/cmdline r, # proc_cmdline() 76 77 # setsockopt() 78 capability net_admin, 79 80 # systemd-run's detect_container() looks at several files to determine if it 81 # is running in a container. 82 @{PROC}/sys/kernel/osrelease r, 83 @{PROC}/1/sched r, 84 /run/systemd/container r, 85 86 # kubelet calls 'systemd-run --scope true' to determine if systemd is 87 # available and usable for calling certain mount commands under transient 88 # units as part of its lifecycle management. This requires ptrace 'read' on 89 # unconfined since systemd-run will call its detect_container() which will 90 # try to read /proc/1/environ. This is mediated via PTRACE_MODE_READ when 91 # run within kubelet's namespace. 92 ptrace (read) peer=unconfined, 93 /run/systemd/private rw, 94 95 # kubelet calling 'systemd-run --scope true' triggers this when kubelet is 96 # run in a nested container (eg, under lxd). 97 @{PROC}/1/cmdline r, 98 99 # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate 100 # PTRACE_MODE_READ and policy required 'trace' instead of 'read'. 101 # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so 102 # continue to allow this historic 'trace' rule on unconfined (which systemd 103 # runs as) since systemd-run won't be able to ptrace this snap's processes. 104 # This can be dropped once LP: #1890848 is fixed. 105 ptrace (trace) peer=unconfined, 106 107 /{,usr/}bin/true ixr, 108 @{INSTALL_DIR}/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,usr/}bin/true ixr, 109 ###KUBERNETES_SUPPORT_SYSTEMD_RUN### 110 } 111 ` 112 113 const kubernetesSupportConnectedPlugAppArmorKubelet = ` 114 # Allow running as the kubelet service 115 116 # Ideally this would be snap-specific 117 /run/dockershim.sock rw, 118 119 # Ideally this would be snap-specific (it could if the control plane was a 120 # snap), but in deployments where the control plane is not a snap, it will tell 121 # flannel to use this path. 122 /run/flannel/{,**} rw, 123 /run/flannel/** k, 124 125 # allow managing pods' cgroups 126 /sys/fs/cgroup/*/kubepods/{,**} rw, 127 128 # kubelet can be configured to use the systemd cgroup driver which moves 129 # container processes into systemd-managed cgroups. This is now the recommended 130 # configuration since it provides a single cgroup manager (systemd) in an 131 # effort to achieve consistent views of resources. 132 /sys/fs/cgroup/*/systemd/{,system.slice/} rw, # create missing dirs 133 /sys/fs/cgroup/*/systemd/system.slice/** r, 134 /sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w, 135 136 # Allow tracing our own processes. Note, this allows seccomp sandbox escape on 137 # kernels < 4.8 138 capability sys_ptrace, 139 ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.*, 140 141 # Allow ptracing other processes (as part of ps-style process lookups). Note, 142 # the peer needs a corresponding tracedby rule. As a special case, disallow 143 # ptracing unconfined. 144 ptrace (trace), 145 deny ptrace (trace) peer=unconfined, 146 147 @{PROC}/[0-9]*/attr/ r, 148 @{PROC}/[0-9]*/fdinfo/ r, 149 @{PROC}/[0-9]*/map_files/ r, 150 @{PROC}/[0-9]*/ns/{,*} r, 151 # dac_read_search needed for lstat'ing non-root owned ns/* files 152 capability dac_read_search, 153 154 # kubernetes will verify and set panic and panic_on_oops to values it considers 155 # sane 156 @{PROC}/sys/kernel/panic w, 157 @{PROC}/sys/kernel/panic_on_oops w, 158 @{PROC}/sys/kernel/keys/root_maxbytes r, 159 @{PROC}/sys/kernel/keys/root_maxkeys r, 160 161 /dev/kmsg r, 162 163 # kubelet calls out to systemd-run for some mounts, but not all of them and not 164 # unmounts... 165 capability sys_admin, 166 mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 167 mount options=(rw, rshared) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 168 169 /{,usr/}bin/mount ixr, 170 /{,usr/}bin/umount ixr, 171 deny /run/mount/utab{,.lock} rw, 172 umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 173 174 # When fsGroup is set, the pod's volume will be recursively chowned with the 175 # setgid bit set on directories so new files will be owned by the fsGroup. See 176 # kubernetes pkg/volume/volume_linux.go:changeFilePermission() 177 capability fsetid, 178 ` 179 180 const kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun = ` 181 # kubelet mount rules 182 capability sys_admin, 183 /{,usr/}bin/mount ixr, 184 mount fstype="tmpfs" tmpfs -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 185 deny /run/mount/utab{,.lock} rw, 186 187 # For mounting volume subPaths 188 mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 189 mount options=(rw, remount, bind) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 190 # nvme0-99, 1-63 partitions with 1-63 optional namespaces 191 mount /dev/nvme{[0-9],[1-9][0-9]}n{[1-9],[1-5][0-9],6[0-3]}{,p{[1-9],[1-5][0-9],6[0-3]}} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 192 # SCSI sda-sdiv, 1-15 partitions 193 mount /dev/sd{[a-z],[a-h][a-z],i[a-v]}{[1-9],1[0-5]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 194 # virtio vda-vdz, 1-63 partitions 195 mount /dev/vd[a-z]{[1-9],[1-5][0-9],6[0-3]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 196 umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 197 198 # When mounting a volume subPath, kubelet binds mounts on an open fd (eg, 199 # /proc/.../fd/N) which triggers a ptrace 'read' denial on the parent 200 # kubelet peer process from this child profile due to PTRACE_MODE_READ (man 201 # ptrace) checks. 202 ptrace (read) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME}, 203 204 # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate 205 # PTRACE_MODE_READ and policy required 'trace' instead of 'read'. 206 # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so 207 # continue to allow this historic 'trace' rule on kubelet (our parent peer) 208 # since systemd-run won't be able to ptrace this snap's processes (kubelet 209 # would also need a corresponding tracedby rule). This can be dropped once 210 # LP: #1890848 is fixed. 211 ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME}, 212 ` 213 214 // k8s.io/apiserver/pkg/storage/etcd3/logger.go pulls in go-systemd via 215 // go.etcd.io/etcd/clientv3. See: 216 // https://github.com/coreos/go-systemd/blob/master/journal/journal.go#L211 217 const kubernetesSupportConnectedPlugAppArmorAutobindUnix = ` 218 # Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd) 219 # unix (bind) type=dgram addr=auto, 220 # TODO: when snapd vendors in AppArmor userspace, then enable the new syntax 221 # above which allows only "empty"/automatic addresses, for now we simply permit 222 # all addresses with SOCK_DGRAM type, which leaks info for other addresses than 223 # what docker tries to use 224 # see https://bugs.launchpad.net/snapd/+bug/1867216 225 unix (bind) type=dgram, 226 ` 227 228 const kubernetesSupportConnectedPlugSeccompAutobindUnix = ` 229 # Allow using the 'autobind' feature of bind() (eg, for journald). 230 bind 231 ` 232 233 const kubernetesSupportConnectedPlugSeccompKubelet = ` 234 # Allow running as the kubelet service 235 mount 236 umount 237 umount2 238 239 unshare 240 setns - CLONE_NEWNET 241 242 # When fsGroup is set, the pod's volume will be recursively chowned with the 243 # setgid bit set on directories so new files will be owned by the fsGroup. See 244 # kubernetes pkg/volume/volume_linux.go:changeFilePermission() 245 fchownat 246 ` 247 248 var kubernetesSupportConnectedPlugUDevKubelet = []string{ 249 `KERNEL=="kmsg"`, 250 } 251 252 const kubernetesSupportConnectedPlugAppArmorKubeproxy = ` 253 # Allow running as the kubeproxy service 254 255 # managing our own cgroup 256 /sys/fs/cgroup/*/kube-proxy/{,**} rw, 257 258 # Allow reading the state of modules kubernetes needs 259 /sys/module/libcrc32c/initstate r, 260 /sys/module/llc/initstate r, 261 /sys/module/stp/initstate r, 262 /sys/module/ip_vs/initstate r, 263 /sys/module/ip_vs_rr/initstate r, 264 /sys/module/ip_vs_sh/initstate r, 265 /sys/module/ip_vs_wrr/initstate r, 266 ` 267 268 var kubernetesSupportConnectedPlugKmodKubeProxy = []string{ 269 `ip_vs_rr`, 270 `ip_vs_sh`, 271 `ip_vs_wrr`, 272 `libcrc32c`, 273 `llc`, 274 `stp`, 275 } 276 277 type kubernetesSupportInterface struct { 278 commonInterface 279 } 280 281 func (iface *kubernetesSupportInterface) ServicePermanentPlug(plug *snap.PlugInfo) []string { 282 // only autobind-unix flavor does not get Delegate=true, all other flavors 283 // are usable to manage control groups of processes/containers, and thus 284 // need Delegate=true 285 flavor := k8sFlavor(plug) 286 if flavor == "autobind-unix" { 287 return nil 288 } 289 290 return []string{"Delegate=true"} 291 } 292 293 func k8sFlavor(plug interfaces.Attrer) string { 294 var flavor string 295 _ = plug.Attr("flavor", &flavor) 296 return flavor 297 } 298 299 func (iface *kubernetesSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 300 snippet := kubernetesSupportConnectedPlugAppArmorCommon 301 systemd_run_extra := "" 302 303 // All flavors should include the autobind-unix rules, but we break it 304 // out so other k8s daemons can use this flavor without getting the 305 // privileged rules. 306 switch k8sFlavor(plug) { 307 case "kubelet": 308 systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun 309 snippet += kubernetesSupportConnectedPlugAppArmorKubelet 310 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 311 spec.SetUsesPtraceTrace() 312 case "kubeproxy": 313 snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy 314 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 315 case "autobind-unix": 316 snippet = kubernetesSupportConnectedPlugAppArmorAutobindUnix 317 default: 318 systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun 319 snippet += kubernetesSupportConnectedPlugAppArmorKubelet 320 snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy 321 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 322 spec.SetUsesPtraceTrace() 323 } 324 325 old := "###KUBERNETES_SUPPORT_SYSTEMD_RUN###" 326 spec.AddSnippet(strings.Replace(snippet, old, systemd_run_extra, -1)) 327 return nil 328 } 329 330 func (iface *kubernetesSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 331 // All flavors should include the autobind-unix rules, but we add the 332 // privileged kubelet rules conditionally. 333 snippet := kubernetesSupportConnectedPlugSeccompAutobindUnix 334 flavor := k8sFlavor(plug) 335 if flavor == "kubelet" || flavor == "" { 336 snippet += kubernetesSupportConnectedPlugSeccompKubelet 337 } 338 spec.AddSnippet(snippet) 339 return nil 340 } 341 342 func (iface *kubernetesSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 343 flavor := k8sFlavor(plug) 344 if flavor == "kubelet" || flavor == "" { 345 for _, rule := range kubernetesSupportConnectedPlugUDevKubelet { 346 spec.TagDevice(rule) 347 } 348 } 349 return nil 350 } 351 352 func (iface *kubernetesSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 353 flavor := k8sFlavor(plug) 354 if flavor == "kubeproxy" || flavor == "" { 355 for _, m := range kubernetesSupportConnectedPlugKmodKubeProxy { 356 if err := spec.AddModule(m); err != nil { 357 return err 358 } 359 } 360 } 361 return nil 362 } 363 364 func (iface *kubernetesSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error { 365 // It's fine if flavor isn't specified, but if it is, it needs to be 366 // either "kubelet", "kubeproxy" or "autobind-unix" 367 if t, ok := plug.Attrs["flavor"]; ok && t != "kubelet" && t != "kubeproxy" && t != "autobind-unix" { 368 return fmt.Errorf(`kubernetes-support plug requires "flavor" to be either "kubelet", "kubeproxy" or "autobind-unix"`) 369 } 370 371 return nil 372 } 373 374 func init() { 375 registerIface(&kubernetesSupportInterface{commonInterface{ 376 name: "kubernetes-support", 377 summary: kubernetesSupportSummary, 378 implicitOnClassic: true, 379 implicitOnCore: true, 380 baseDeclarationPlugs: kubernetesSupportBaseDeclarationPlugs, 381 baseDeclarationSlots: kubernetesSupportBaseDeclarationSlots, 382 }}) 383 }