github.com/ubuntu-core/snappy@v0.0.0-20210827154228-9e584df982bb/interfaces/builtin/kubernetes_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2017-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  	"strings"
    25  
    26  	"github.com/snapcore/snapd/interfaces"
    27  	"github.com/snapcore/snapd/interfaces/apparmor"
    28  	"github.com/snapcore/snapd/interfaces/kmod"
    29  	"github.com/snapcore/snapd/interfaces/seccomp"
    30  	"github.com/snapcore/snapd/interfaces/udev"
    31  	"github.com/snapcore/snapd/snap"
    32  )
    33  
// kubernetesSupportSummary is the one-line summary registered for the
// kubernetes-support interface in init.
const kubernetesSupportSummary = `allows operating as the Kubernetes service`
    35  
// kubernetesSupportBaseDeclarationPlugs is the plug-side base declaration
// registered for kubernetes-support in init. It sets allow-installation to
// false and deny-auto-connection to true.
const kubernetesSupportBaseDeclarationPlugs = `
  kubernetes-support:
    allow-installation: false
    deny-auto-connection: true
`
    41  
// kubernetesSupportBaseDeclarationSlots is the slot-side base declaration
// registered for kubernetes-support in init. It limits allow-installation to
// the "core" slot-snap-type and sets deny-auto-connection to true.
const kubernetesSupportBaseDeclarationSlots = `
  kubernetes-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    49  
// kubernetesSupportConnectedPlugAppArmorCommon is the AppArmor policy that
// AppArmorConnectedPlug uses as the starting snippet for every flavor except
// "autobind-unix" (which replaces it entirely).
//
// It defines a systemd_run child profile whose last line is the
// ###KUBERNETES_SUPPORT_SYSTEMD_RUN### marker. AppArmorConnectedPlug replaces
// that marker with flavor-specific rules, or with the empty string when the
// flavor has none, so the marker must stay inside the child profile.
const kubernetesSupportConnectedPlugAppArmorCommon = `
# Common rules for running as a kubernetes node

# reading cgroups
capability sys_resource,
/sys/fs/cgroup/{,**} r,

# Allow adjusting the OOM score for containers. Note, this allows adjusting for
# all processes, not just containers.
@{PROC}/@{pid}/oom_score_adj rw,
@{PROC}/sys/vm/overcommit_memory rw,
/sys/kernel/mm/hugepages/{,**} r,
/sys/kernel/mm/transparent_hugepage/{,**} r,

capability dac_override,

/{,usr/}bin/systemd-run Cxr -> systemd_run,
/run/systemd/private r,
profile systemd_run (attach_disconnected,mediate_deleted) {
  # Common rules for kubernetes use of systemd_run
  #include <abstractions/base>

  /{,usr/}bin/systemd-run rm,
  owner @{PROC}/@{pid}/stat r,
  owner @{PROC}/@{pid}/environ r,
  @{PROC}/cmdline r,  # proc_cmdline()

  # setsockopt()
  capability net_admin,

  # systemd-run's detect_container() looks at several files to determine if it
  # is running in a container.
  @{PROC}/sys/kernel/osrelease r,
  @{PROC}/1/sched r,
  /run/systemd/container r,

  # kubelet calls 'systemd-run --scope true' to determine if systemd is
  # available and usable for calling certain mount commands under transient
  # units as part of its lifecycle management. This requires ptrace 'read' on
  # unconfined since systemd-run will call its detect_container() which will
  # try to read /proc/1/environ. This is mediated via PTRACE_MODE_READ when
  # run within kubelet's namespace.
  ptrace (read) peer=unconfined,
  /run/systemd/private rw,

  # kubelet calling 'systemd-run --scope true' triggers this when kubelet is
  # run in a nested container (eg, under lxd).
  @{PROC}/1/cmdline r,

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on unconfined (which systemd
  # runs as) since systemd-run won't be able to ptrace this snap's processes.
  # This can be dropped once LP: #1890848 is fixed.
  ptrace (trace) peer=unconfined,

  /{,usr/}bin/true ixr,
  @{INSTALL_DIR}/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,usr/}bin/true ixr,
###KUBERNETES_SUPPORT_SYSTEMD_RUN###
}
`
   112  
// kubernetesSupportConnectedPlugAppArmorKubelet holds the AppArmor rules that
// AppArmorConnectedPlug appends to the common snippet when the plug flavor is
// "kubelet" or falls through to the default case. In both of those cases
// AppArmorConnectedPlug also calls spec.SetUsesPtraceTrace(), matching the
// ptrace rules below.
const kubernetesSupportConnectedPlugAppArmorKubelet = `
# Allow running as the kubelet service

# Ideally this would be snap-specific
/run/dockershim.sock rw,

# Ideally this would be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# flannel to use this path.
/run/flannel/{,**} rw,
/run/flannel/** k,

# allow managing pods' cgroups
/sys/fs/cgroup/*/kubepods/{,**} rw,

# kubelet can be configured to use the systemd cgroup driver which moves
# container processes into systemd-managed cgroups. This is now the recommended
# configuration since it provides a single cgroup manager (systemd) in an
# effort to achieve consistent views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing our own processes. Note, this allows seccomp sandbox escape on
# kernels < 4.8
capability sys_ptrace,
ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.*,

# Allow ptracing other processes (as part of ps-style process lookups). Note,
# the peer needs a corresponding tracedby rule. As a special case, disallow
# ptracing unconfined.
ptrace (trace),
deny ptrace (trace) peer=unconfined,

@{PROC}/[0-9]*/attr/ r,
@{PROC}/[0-9]*/fdinfo/ r,
@{PROC}/[0-9]*/map_files/ r,
@{PROC}/[0-9]*/ns/{,*} r,
# dac_read_search needed for lstat'ing non-root owned ns/* files
capability dac_read_search,

# kubernetes will verify and set panic and panic_on_oops to values it considers
# sane
@{PROC}/sys/kernel/panic w,
@{PROC}/sys/kernel/panic_on_oops w,
@{PROC}/sys/kernel/keys/root_maxbytes r,
@{PROC}/sys/kernel/keys/root_maxkeys r,

/dev/kmsg r,

# kubelet calls out to systemd-run for some mounts, but not all of them and not
# unmounts...
capability sys_admin,
mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
mount options=(rw, rshared) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},

/{,usr/}bin/mount ixr,
/{,usr/}bin/umount ixr,
deny /run/mount/utab{,.lock} rw,
umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

# When fsGroup is set, the pod's volume will be recursively chowned with the
# setgid bit set on directories so new files will be owned by the fsGroup. See
# kubernetes pkg/volume/volume_linux.go:changeFilePermission()
capability fsetid,
`
   179  
// kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun holds the rules that
// AppArmorConnectedPlug substitutes for the
// ###KUBERNETES_SUPPORT_SYSTEMD_RUN### marker when the plug flavor is
// "kubelet" or falls through to the default case. The marker sits inside the
// systemd_run child profile of the common snippet, which is why every rule
// here is indented by two spaces.
const kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun = `
  # kubelet mount rules
  capability sys_admin,
  /{,usr/}bin/mount ixr,
  mount fstype="tmpfs" tmpfs -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  deny /run/mount/utab{,.lock} rw,

  # For mounting volume subPaths
  mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  mount options=(rw, remount, bind) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  # nvme0-99, 1-63 partitions with 1-63 optional namespaces
  mount /dev/nvme{[0-9],[1-9][0-9]}n{[1-9],[1-5][0-9],6[0-3]}{,p{[1-9],[1-5][0-9],6[0-3]}} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # SCSI sda-sdiv, 1-15 partitions
  mount /dev/sd{[a-z],[a-h][a-z],i[a-v]}{[1-9],1[0-5]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # virtio vda-vdz, 1-63 partitions
  mount /dev/vd[a-z]{[1-9],[1-5][0-9],6[0-3]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

  # When mounting a volume subPath, kubelet binds mounts on an open fd (eg,
  # /proc/.../fd/N) which triggers a ptrace 'read' denial on the parent
  # kubelet peer process from this child profile due to PTRACE_MODE_READ (man
  # ptrace) checks.
  ptrace (read) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on kubelet (our parent peer)
  # since systemd-run won't be able to ptrace this snap's processes (kubelet
  # would also need a corresponding tracedby rule). This can be dropped once
  # LP: #1890848 is fixed.
  ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},
`
   213  
// kubernetesSupportConnectedPlugAppArmorAutobindUnix holds the AppArmor rule
// permitting bind() on SOCK_DGRAM unix sockets. AppArmorConnectedPlug uses it
// as the whole snippet for the "autobind-unix" flavor and appends it to the
// snippet of every other flavor.
//
// k8s.io/apiserver/pkg/storage/etcd3/logger.go pulls in go-systemd via
// go.etcd.io/etcd/clientv3. See:
// https://github.com/coreos/go-systemd/blob/master/journal/journal.go#L211
const kubernetesSupportConnectedPlugAppArmorAutobindUnix = `
# Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd)
# unix (bind) type=dgram addr=auto,
# TODO: when snapd vendors in AppArmor userspace, then enable the new syntax
# above which allows only "empty"/automatic addresses, for now we simply permit
# all addresses with SOCK_DGRAM type, which leaks info for other addresses than
# what docker tries to use
# see https://bugs.launchpad.net/snapd/+bug/1867216
unix (bind) type=dgram,
`
   227  
// kubernetesSupportConnectedPlugSeccompAutobindUnix is the seccomp snippet
// allowing the bind syscall. SecCompConnectedPlug starts from it for every
// flavor.
const kubernetesSupportConnectedPlugSeccompAutobindUnix = `
# Allow using the 'autobind' feature of bind() (eg, for journald).
bind
`
   232  
// kubernetesSupportConnectedPlugSeccompKubelet is the seccomp snippet that
// SecCompConnectedPlug appends when the plug flavor is "kubelet" or unset. It
// allows the mount/umount family, unshare, setns restricted to CLONE_NEWNET,
// and fchownat.
const kubernetesSupportConnectedPlugSeccompKubelet = `
# Allow running as the kubelet service
mount
umount
umount2

unshare
setns - CLONE_NEWNET

# When fsGroup is set, the pod's volume will be recursively chowned with the
# setgid bit set on directories so new files will be owned by the fsGroup. See
# kubernetes pkg/volume/volume_linux.go:changeFilePermission()
fchownat
`
   247  
// kubernetesSupportConnectedPlugUDevKubelet lists the udev match rules that
// UDevConnectedPlug passes to spec.TagDevice when the plug flavor is "kubelet"
// or unset. The single rule matches the kmsg kernel device, which the kubelet
// AppArmor snippet allows reading via /dev/kmsg.
var kubernetesSupportConnectedPlugUDevKubelet = []string{
	`KERNEL=="kmsg"`,
}
   251  
// kubernetesSupportConnectedPlugAppArmorKubeproxy holds the AppArmor rules
// that AppArmorConnectedPlug appends to the common snippet when the plug
// flavor is "kubeproxy" or falls through to the default case.
const kubernetesSupportConnectedPlugAppArmorKubeproxy = `
# Allow running as the kubeproxy service

# managing our own cgroup
/sys/fs/cgroup/*/kube-proxy/{,**} rw,

# Allow reading the state of modules kubernetes needs
/sys/module/libcrc32c/initstate r,
/sys/module/llc/initstate r,
/sys/module/stp/initstate r,
/sys/module/ip_vs/initstate r,
/sys/module/ip_vs_rr/initstate r,
/sys/module/ip_vs_sh/initstate r,
/sys/module/ip_vs_wrr/initstate r,
`
   267  
// kubernetesSupportConnectedPlugKmodKubeProxy lists the kernel modules that
// KModConnectedPlug adds to the kmod specification when the plug flavor is
// "kubeproxy" or unset.
//
// NOTE(review): the kubeproxy AppArmor snippet also reads
// /sys/module/ip_vs/initstate, but ip_vs itself is not listed here;
// presumably it is pulled in as a dependency of the ip_vs_* modules. Confirm.
var kubernetesSupportConnectedPlugKmodKubeProxy = []string{
	`ip_vs_rr`,
	`ip_vs_sh`,
	`ip_vs_wrr`,
	`libcrc32c`,
	`llc`,
	`stp`,
}
   276  
// kubernetesSupportInterface implements the kubernetes-support interface. It
// embeds commonInterface (populated in init with the name, summary and base
// declarations) and adds flavor-dependent AppArmor, seccomp, udev, kmod and
// service-snippet hooks defined in this file.
type kubernetesSupportInterface struct {
	commonInterface
}
   280  
   281  func (iface *kubernetesSupportInterface) ServicePermanentPlug(plug *snap.PlugInfo) []string {
   282  	// only autobind-unix flavor does not get Delegate=true, all other flavors
   283  	// are usable to manage control groups of processes/containers, and thus
   284  	// need Delegate=true
   285  	flavor := k8sFlavor(plug)
   286  	if flavor == "autobind-unix" {
   287  		return nil
   288  	}
   289  
   290  	return []string{"Delegate=true"}
   291  }
   292  
   293  func k8sFlavor(plug interfaces.Attrer) string {
   294  	var flavor string
   295  	_ = plug.Attr("flavor", &flavor)
   296  	return flavor
   297  }
   298  
   299  func (iface *kubernetesSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   300  	snippet := kubernetesSupportConnectedPlugAppArmorCommon
   301  	systemd_run_extra := ""
   302  
   303  	// All flavors should include the autobind-unix rules, but we break it
   304  	// out so other k8s daemons can use this flavor without getting the
   305  	// privileged rules.
   306  	switch k8sFlavor(plug) {
   307  	case "kubelet":
   308  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   309  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   310  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   311  		spec.SetUsesPtraceTrace()
   312  	case "kubeproxy":
   313  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   314  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   315  	case "autobind-unix":
   316  		snippet = kubernetesSupportConnectedPlugAppArmorAutobindUnix
   317  	default:
   318  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   319  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   320  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   321  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   322  		spec.SetUsesPtraceTrace()
   323  	}
   324  
   325  	old := "###KUBERNETES_SUPPORT_SYSTEMD_RUN###"
   326  	spec.AddSnippet(strings.Replace(snippet, old, systemd_run_extra, -1))
   327  	return nil
   328  }
   329  
   330  func (iface *kubernetesSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   331  	// All flavors should include the autobind-unix rules, but we add the
   332  	// privileged kubelet rules conditionally.
   333  	snippet := kubernetesSupportConnectedPlugSeccompAutobindUnix
   334  	flavor := k8sFlavor(plug)
   335  	if flavor == "kubelet" || flavor == "" {
   336  		snippet += kubernetesSupportConnectedPlugSeccompKubelet
   337  	}
   338  	spec.AddSnippet(snippet)
   339  	return nil
   340  }
   341  
   342  func (iface *kubernetesSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   343  	flavor := k8sFlavor(plug)
   344  	if flavor == "kubelet" || flavor == "" {
   345  		for _, rule := range kubernetesSupportConnectedPlugUDevKubelet {
   346  			spec.TagDevice(rule)
   347  		}
   348  	}
   349  	return nil
   350  }
   351  
   352  func (iface *kubernetesSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   353  	flavor := k8sFlavor(plug)
   354  	if flavor == "kubeproxy" || flavor == "" {
   355  		for _, m := range kubernetesSupportConnectedPlugKmodKubeProxy {
   356  			if err := spec.AddModule(m); err != nil {
   357  				return err
   358  			}
   359  		}
   360  	}
   361  	return nil
   362  }
   363  
   364  func (iface *kubernetesSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   365  	// It's fine if flavor isn't specified, but if it is, it needs to be
   366  	// either "kubelet", "kubeproxy" or "autobind-unix"
   367  	if t, ok := plug.Attrs["flavor"]; ok && t != "kubelet" && t != "kubeproxy" && t != "autobind-unix" {
   368  		return fmt.Errorf(`kubernetes-support plug requires "flavor" to be either "kubelet", "kubeproxy" or "autobind-unix"`)
   369  	}
   370  
   371  	return nil
   372  }
   373  
   374  func init() {
   375  	registerIface(&kubernetesSupportInterface{commonInterface{
   376  		name:                 "kubernetes-support",
   377  		summary:              kubernetesSupportSummary,
   378  		implicitOnClassic:    true,
   379  		implicitOnCore:       true,
   380  		baseDeclarationPlugs: kubernetesSupportBaseDeclarationPlugs,
   381  		baseDeclarationSlots: kubernetesSupportBaseDeclarationSlots,
   382  	}})
   383  }