gitee.com/mysnapcore/mysnapd@v0.1.0/interfaces/builtin/kubernetes_support.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2017-2018 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package builtin 21 22 import ( 23 "fmt" 24 "strings" 25 26 "gitee.com/mysnapcore/mysnapd/interfaces" 27 "gitee.com/mysnapcore/mysnapd/interfaces/apparmor" 28 "gitee.com/mysnapcore/mysnapd/interfaces/kmod" 29 "gitee.com/mysnapcore/mysnapd/interfaces/seccomp" 30 "gitee.com/mysnapcore/mysnapd/interfaces/udev" 31 "gitee.com/mysnapcore/mysnapd/snap" 32 ) 33 34 const kubernetesSupportSummary = `allows operating as the Kubernetes service` 35 36 const kubernetesSupportBaseDeclarationPlugs = ` 37 kubernetes-support: 38 allow-installation: false 39 deny-auto-connection: true 40 ` 41 42 const kubernetesSupportBaseDeclarationSlots = ` 43 kubernetes-support: 44 allow-installation: 45 slot-snap-type: 46 - core 47 deny-auto-connection: true 48 ` 49 50 const kubernetesSupportConnectedPlugAppArmorCommon = ` 51 # Common rules for running as a kubernetes node 52 53 # reading cgroups 54 capability sys_resource, 55 /sys/fs/cgroup/{,**} r, 56 57 # Allow adjusting the OOM score for containers. Note, this allows adjusting for 58 # all processes, not just containers. 59 @{PROC}/@{pid}/oom_score_adj rw, 60 @{PROC}/sys/vm/overcommit_memory rw, 61 /sys/kernel/mm/hugepages/{,**} r, 62 /sys/kernel/mm/transparent_hugepage/{,**} r, 63 64 capability dac_override, 65 66 # Lock file used by Calico's IPAM plugin. This is configurable via the 67 # (undocumented) "ipam_lock_file" configuration key: 68 # https://github.com/projectcalico/cni-plugin/blob/master/pkg/types/types.go 69 /{,var/}run/calico/ipam.lock rwk, 70 71 # manually add java certs here 72 # see also https://bugs.launchpad.net/apparmor/+bug/1816372 73 /etc/ssl/certs/java/{,*} r, 74 #include <abstractions/ssl_certs> 75 76 /{,usr/}bin/systemd-run Cxr -> systemd_run, 77 /run/systemd/private r, 78 profile systemd_run (attach_disconnected,mediate_deleted) { 79 # Common rules for kubernetes use of systemd_run 80 #include <abstractions/base> 81 82 /{,usr/}bin/systemd-run rm, 83 owner @{PROC}/@{pid}/stat r, 84 owner @{PROC}/@{pid}/environ r, 85 @{PROC}/cmdline r, # proc_cmdline() 86 87 # setsockopt() 88 capability net_admin, 89 90 # systemd-run's detect_container() looks at several files to determine if it 91 # is running in a container. 92 @{PROC}/sys/kernel/osrelease r, 93 @{PROC}/1/sched r, 94 /run/systemd/container r, 95 96 # kubelet calls 'systemd-run --scope true' to determine if systemd is 97 # available and usable for calling certain mount commands under transient 98 # units as part of its lifecycle management. This requires ptrace 'read' on 99 # unconfined since systemd-run will call its detect_container() which will 100 # try to read /proc/1/environ. This is mediated via PTRACE_MODE_READ when 101 # run within kubelet's namespace. 102 ptrace (read) peer=unconfined, 103 /run/systemd/private rw, 104 105 # kubelet calling 'systemd-run --scope true' triggers this when kubelet is 106 # run in a nested container (eg, under lxd). 107 @{PROC}/1/cmdline r, 108 109 # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate 110 # PTRACE_MODE_READ and policy required 'trace' instead of 'read'. 111 # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so 112 # continue to allow this historic 'trace' rule on unconfined (which systemd 113 # runs as) since systemd-run won't be able to ptrace this snap's processes. 114 # This can be dropped once LP: #1890848 is fixed. 115 ptrace (trace) peer=unconfined, 116 117 /{,usr/}bin/true ixr, 118 @{INSTALL_DIR}/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,usr/}bin/true ixr, 119 ###KUBERNETES_SUPPORT_SYSTEMD_RUN### 120 } 121 ` 122 123 const kubernetesSupportConnectedPlugAppArmorKubelet = ` 124 # Allow running as the kubelet service 125 126 # Ideally this would be snap-specific 127 /run/dockershim.sock rw, 128 129 # Ideally this would be snap-specific (it could if the control plane was a 130 # snap), but in deployments where the control plane is not a snap, it will tell 131 # flannel to use this path. 132 /run/flannel/{,**} rw, 133 /run/flannel/** k, 134 135 # allow managing pods' cgroups 136 /sys/fs/cgroup/*/kubepods/{,**} rw, 137 138 # kubelet can be configured to use the systemd cgroup driver which moves 139 # container processes into systemd-managed cgroups. This is now the recommended 140 # configuration since it provides a single cgroup manager (systemd) in an 141 # effort to achieve consistent views of resources. 142 /sys/fs/cgroup/*/systemd/{,system.slice/} rw, # create missing dirs 143 /sys/fs/cgroup/*/systemd/system.slice/** r, 144 /sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w, 145 146 # Allow tracing our own processes. Note, this allows seccomp sandbox escape on 147 # kernels < 4.8 148 capability sys_ptrace, 149 ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.*, 150 151 # Allow ptracing other processes (as part of ps-style process lookups). Note, 152 # the peer needs a corresponding tracedby rule. As a special case, disallow 153 # ptracing unconfined. 154 ptrace (trace), 155 deny ptrace (trace) peer=unconfined, 156 157 @{PROC}/[0-9]*/attr/ r, 158 @{PROC}/[0-9]*/fdinfo/ r, 159 @{PROC}/[0-9]*/map_files/ r, 160 @{PROC}/[0-9]*/ns/{,*} r, 161 # dac_read_search needed for lstat'ing non-root owned ns/* files 162 capability dac_read_search, 163 164 # kubernetes will verify and set panic and panic_on_oops to values it considers 165 # sane 166 @{PROC}/sys/kernel/panic w, 167 @{PROC}/sys/kernel/panic_on_oops w, 168 @{PROC}/sys/kernel/keys/root_maxbytes r, 169 @{PROC}/sys/kernel/keys/root_maxkeys r, 170 171 /dev/kmsg r, 172 173 # kubelet calls out to systemd-run for some mounts, but not all of them and not 174 # unmounts... 175 capability sys_admin, 176 mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 177 mount options=(rw, rshared) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 178 179 /{,usr/}bin/mount ixr, 180 /{,usr/}bin/umount ixr, 181 deny /run/mount/utab{,.lock} rw, 182 umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 183 184 # When fsGroup is set, the pod's volume will be recursively chowned with the 185 # setgid bit set on directories so new files will be owned by the fsGroup. See 186 # kubernetes pkg/volume/volume_linux.go:changeFilePermission() 187 capability fsetid, 188 ` 189 190 const kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun = ` 191 # kubelet mount rules 192 capability sys_admin, 193 /{,usr/}bin/mount ixr, 194 mount fstype="tmpfs" tmpfs -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 195 deny /run/mount/utab{,.lock} rw, 196 197 # For mounting volume subPaths 198 mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 199 mount options=(rw, remount, bind) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**}, 200 # nvme0-99, 1-63 partitions with 1-63 optional namespaces 201 mount /dev/nvme{[0-9],[1-9][0-9]}n{[1-9],[1-5][0-9],6[0-3]}{,p{[1-9],[1-5][0-9],6[0-3]}} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 202 # SCSI sda-sdiv, 1-15 partitions 203 mount /dev/sd{[a-z],[a-h][a-z],i[a-v]}{[1-9],1[0-5]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 204 # virtio vda-vdz, 1-63 partitions 205 mount /dev/vd[a-z]{[1-9],[1-5][0-9],6[0-3]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 206 umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**, 207 208 # When mounting a volume subPath, kubelet binds mounts on an open fd (eg, 209 # /proc/.../fd/N) which triggers a ptrace 'read' denial on the parent 210 # kubelet peer process from this child profile due to PTRACE_MODE_READ (man 211 # ptrace) checks. 212 ptrace (read) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME}, 213 214 # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate 215 # PTRACE_MODE_READ and policy required 'trace' instead of 'read'. 216 # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so 217 # continue to allow this historic 'trace' rule on kubelet (our parent peer) 218 # since systemd-run won't be able to ptrace this snap's processes (kubelet 219 # would also need a corresponding tracedby rule). This can be dropped once 220 # LP: #1890848 is fixed. 221 ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME}, 222 ` 223 224 // k8s.io/apiserver/pkg/storage/etcd3/logger.go pulls in go-systemd via 225 // go.etcd.io/etcd/clientv3. See: 226 // https://github.com/coreos/go-systemd/blob/master/journal/journal.go#L211 227 const kubernetesSupportConnectedPlugAppArmorAutobindUnix = ` 228 # Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd) 229 # unix (bind) type=dgram addr=auto, 230 # TODO: when snapd vendors in AppArmor userspace, then enable the new syntax 231 # above which allows only "empty"/automatic addresses, for now we simply permit 232 # all addresses with SOCK_DGRAM type, which leaks info for other addresses than 233 # what docker tries to use 234 # see https://bugs.launchpad.net/snapd/+bug/1867216 235 unix (bind) type=dgram, 236 ` 237 238 const kubernetesSupportConnectedPlugSeccompAutobindUnix = ` 239 # Allow using the 'autobind' feature of bind() (eg, for journald). 240 bind 241 ` 242 243 const kubernetesSupportConnectedPlugSeccompKubelet = ` 244 # Allow running as the kubelet service 245 mount 246 umount 247 umount2 248 249 unshare 250 setns - CLONE_NEWNET 251 252 # When fsGroup is set, the pod's volume will be recursively chowned with the 253 # setgid bit set on directories so new files will be owned by the fsGroup. See 254 # kubernetes pkg/volume/volume_linux.go:changeFilePermission() 255 fchownat 256 ` 257 258 var kubernetesSupportConnectedPlugUDevKubelet = []string{ 259 `KERNEL=="kmsg"`, 260 } 261 262 const kubernetesSupportConnectedPlugAppArmorKubeproxy = ` 263 # Allow running as the kubeproxy service 264 265 # managing our own cgroup 266 /sys/fs/cgroup/*/kube-proxy/{,**} rw, 267 268 # Allow reading the state of modules kubernetes needs 269 /sys/module/libcrc32c/initstate r, 270 /sys/module/llc/initstate r, 271 /sys/module/stp/initstate r, 272 /sys/module/ip_vs/initstate r, 273 /sys/module/ip_vs_rr/initstate r, 274 /sys/module/ip_vs_sh/initstate r, 275 /sys/module/ip_vs_wrr/initstate r, 276 ` 277 278 var kubernetesSupportConnectedPlugKmodKubeProxy = []string{ 279 `ip_vs_rr`, 280 `ip_vs_sh`, 281 `ip_vs_wrr`, 282 `libcrc32c`, 283 `llc`, 284 `stp`, 285 } 286 287 type kubernetesSupportInterface struct { 288 commonInterface 289 } 290 291 func (iface *kubernetesSupportInterface) ServicePermanentPlug(plug *snap.PlugInfo) []string { 292 // only autobind-unix flavor does not get Delegate=true, all other flavors 293 // are usable to manage control groups of processes/containers, and thus 294 // need Delegate=true 295 flavor := k8sFlavor(plug) 296 if flavor == "autobind-unix" { 297 return nil 298 } 299 300 return []string{"Delegate=true"} 301 } 302 303 func k8sFlavor(plug interfaces.Attrer) string { 304 var flavor string 305 _ = plug.Attr("flavor", &flavor) 306 return flavor 307 } 308 309 func (iface *kubernetesSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 310 snippet := kubernetesSupportConnectedPlugAppArmorCommon 311 systemd_run_extra := "" 312 313 // All flavors should include the autobind-unix rules, but we break it 314 // out so other k8s daemons can use this flavor without getting the 315 // privileged rules. 316 switch k8sFlavor(plug) { 317 case "kubelet": 318 systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun 319 snippet += kubernetesSupportConnectedPlugAppArmorKubelet 320 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 321 spec.SetUsesPtraceTrace() 322 case "kubeproxy": 323 snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy 324 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 325 case "autobind-unix": 326 snippet = kubernetesSupportConnectedPlugAppArmorAutobindUnix 327 default: 328 systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun 329 snippet += kubernetesSupportConnectedPlugAppArmorKubelet 330 snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy 331 snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix 332 spec.SetUsesPtraceTrace() 333 } 334 335 old := "###KUBERNETES_SUPPORT_SYSTEMD_RUN###" 336 spec.AddSnippet(strings.Replace(snippet, old, systemd_run_extra, -1)) 337 return nil 338 } 339 340 func (iface *kubernetesSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 341 // All flavors should include the autobind-unix rules, but we add the 342 // privileged kubelet rules conditionally. 343 snippet := kubernetesSupportConnectedPlugSeccompAutobindUnix 344 flavor := k8sFlavor(plug) 345 if flavor == "kubelet" || flavor == "" { 346 snippet += kubernetesSupportConnectedPlugSeccompKubelet 347 } 348 spec.AddSnippet(snippet) 349 return nil 350 } 351 352 func (iface *kubernetesSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 353 flavor := k8sFlavor(plug) 354 if flavor == "kubelet" || flavor == "" { 355 for _, rule := range kubernetesSupportConnectedPlugUDevKubelet { 356 spec.TagDevice(rule) 357 } 358 } 359 return nil 360 } 361 362 func (iface *kubernetesSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 363 flavor := k8sFlavor(plug) 364 if flavor == "kubeproxy" || flavor == "" { 365 for _, m := range kubernetesSupportConnectedPlugKmodKubeProxy { 366 if err := spec.AddModule(m); err != nil { 367 return err 368 } 369 } 370 } 371 return nil 372 } 373 374 func (iface *kubernetesSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error { 375 // It's fine if flavor isn't specified, but if it is, it needs to be 376 // either "kubelet", "kubeproxy" or "autobind-unix" 377 if t, ok := plug.Attrs["flavor"]; ok && t != "kubelet" && t != "kubeproxy" && t != "autobind-unix" { 378 return fmt.Errorf(`kubernetes-support plug requires "flavor" to be either "kubelet", "kubeproxy" or "autobind-unix"`) 379 } 380 381 return nil 382 } 383 384 func init() { 385 registerIface(&kubernetesSupportInterface{commonInterface{ 386 name: "kubernetes-support", 387 summary: kubernetesSupportSummary, 388 implicitOnClassic: true, 389 implicitOnCore: true, 390 baseDeclarationPlugs: kubernetesSupportBaseDeclarationPlugs, 391 baseDeclarationSlots: kubernetesSupportBaseDeclarationSlots, 392 }}) 393 }