github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/interfaces/builtin/kubernetes_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2017-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  	"strings"
    25  
    26  	"github.com/snapcore/snapd/interfaces"
    27  	"github.com/snapcore/snapd/interfaces/apparmor"
    28  	"github.com/snapcore/snapd/interfaces/kmod"
    29  	"github.com/snapcore/snapd/interfaces/seccomp"
    30  	"github.com/snapcore/snapd/interfaces/udev"
    31  	"github.com/snapcore/snapd/snap"
    32  )
    33  
    34  const kubernetesSupportSummary = `allows operating as the Kubernetes service`
    35  
    36  const kubernetesSupportBaseDeclarationPlugs = `
    37    kubernetes-support:
    38      allow-installation: false
    39      deny-auto-connection: true
    40  `
    41  
    42  const kubernetesSupportBaseDeclarationSlots = `
    43    kubernetes-support:
    44      allow-installation:
    45        slot-snap-type:
    46          - core
    47      deny-auto-connection: true
    48  `
    49  
// kubernetesSupportConnectedPlugAppArmorCommon is the AppArmor policy that
// AppArmorConnectedPlug uses as the starting snippet for every flavor except
// "autobind-unix". It also declares the systemd_run child profile; the
// ###KUBERNETES_SUPPORT_SYSTEMD_RUN### line inside that profile is a
// placeholder that AppArmorConnectedPlug replaces with flavor-specific rules
// (or with the empty string) before the snippet is added.
const kubernetesSupportConnectedPlugAppArmorCommon = `
# Common rules for running as a kubernetes node

# reading cgroups
capability sys_resource,
/sys/fs/cgroup/{,**} r,

# Allow adjusting the OOM score for containers. Note, this allows adjusting for
# all processes, not just containers.
@{PROC}/@{pid}/oom_score_adj rw,
@{PROC}/sys/vm/overcommit_memory rw,
/sys/kernel/mm/hugepages/{,**} r,
/sys/kernel/mm/transparent_hugepage/{,**} r,

capability dac_override,

/{,usr/}bin/systemd-run Cxr -> systemd_run,
/run/systemd/private r,
profile systemd_run (attach_disconnected,mediate_deleted) {
  # Common rules for kubernetes use of systemd_run
  #include <abstractions/base>

  /{,usr/}bin/systemd-run rm,
  owner @{PROC}/@{pid}/stat r,
  owner @{PROC}/@{pid}/environ r,
  @{PROC}/cmdline r,  # proc_cmdline()

  # setsockopt()
  capability net_admin,

  # systemd-run's detect_container() looks at several files to determine if it
  # is running in a container.
  @{PROC}/sys/kernel/osrelease r,
  @{PROC}/1/sched r,
  /run/systemd/container r,

  # kubelet calls 'systemd-run --scope true' to determine if systemd is
  # available and usable for calling certain mount commands under transient
  # units as part of its lifecycle management. This requires ptrace 'read' on
  # unconfined since systemd-run will call its detect_container() which will
  # try to read /proc/1/environ. This is mediated via PTRACE_MODE_READ when
  # run within kubelet's namespace.
  ptrace (read) peer=unconfined,
  /run/systemd/private rw,

  # kubelet calling 'systemd-run --scope true' triggers this when kubelet is
  # run in a nested container (eg, under lxd).
  @{PROC}/1/cmdline r,

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on unconfined (which systemd
  # runs as) since systemd-run won't be able to ptrace this snap's processes.
  # This can be dropped once LP: #1890848 is fixed.
  ptrace (trace) peer=unconfined,

  /{,usr/}bin/true ixr,
  @{INSTALL_DIR}/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,usr/}bin/true ixr,
###KUBERNETES_SUPPORT_SYSTEMD_RUN###
}
`
   112  
// kubernetesSupportConnectedPlugAppArmorKubelet is the AppArmor policy for
// running as the kubelet service. AppArmorConnectedPlug appends it to the
// common policy for the "kubelet" flavor and when no flavor is set. Because it
// contains 'ptrace (trace)' rules, the same code paths also call
// spec.SetUsesPtraceTrace().
const kubernetesSupportConnectedPlugAppArmorKubelet = `
# Allow running as the kubelet service

# Ideally this would be snap-specific
/run/dockershim.sock rw,

# Ideally this would be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# flannel to use this path.
/run/flannel/{,**} rw,
/run/flannel/** k,

# allow managing pods' cgroups
/sys/fs/cgroup/*/kubepods/{,**} rw,

# kubelet can be configured to use the systemd cgroup driver which moves
# container processes into systemd-managed cgroups. This is now the recommended
# configuration since it provides a single cgroup manager (systemd) in an
# effort to achieve consistent views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing our own processes. Note, this allows seccomp sandbox escape on
# kernels < 4.8
capability sys_ptrace,
ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.*,

# Allow ptracing other processes (as part of ps-style process lookups). Note,
# the peer needs a corresponding tracedby rule. As a special case, disallow
# ptracing unconfined.
ptrace (trace),
deny ptrace (trace) peer=unconfined,

@{PROC}/[0-9]*/attr/ r,
@{PROC}/[0-9]*/fdinfo/ r,
@{PROC}/[0-9]*/map_files/ r,
@{PROC}/[0-9]*/ns/{,*} r,
# dac_read_search needed for lstat'ing non-root owned ns/* files
capability dac_read_search,

# kubernetes will verify and set panic and panic_on_oops to values it considers
# sane
@{PROC}/sys/kernel/panic w,
@{PROC}/sys/kernel/panic_on_oops w,
@{PROC}/sys/kernel/keys/root_maxbytes r,
@{PROC}/sys/kernel/keys/root_maxkeys r,

/dev/kmsg r,

# kubelet calls out to systemd-run for some mounts, but not all of them and not
# unmounts...
capability sys_admin,
mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
mount options=(rw, rshared) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},

/{,usr/}bin/mount ixr,
/{,usr/}bin/umount ixr,
deny /run/mount/utab{,.lock} rw,
umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

# When fsGroup is set, the pod's volume will be recursively chowned with the
# setgid bit set on directories so new files will be owned by the fsGroup. See
# kubernetes pkg/volume/volume_linux.go:changeFilePermission()
capability fsetid,
`
   179  
// kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun holds the extra
// rules for the systemd_run child profile that kubelet needs. It is not added
// as a snippet of its own: AppArmorConnectedPlug substitutes it for the
// ###KUBERNETES_SUPPORT_SYSTEMD_RUN### placeholder of the common policy when
// the flavor is "kubelet" or unset. The rules are indented because they end up
// inside the 'profile systemd_run { ... }' block.
const kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun = `
  # kubelet mount rules
  capability sys_admin,
  /{,usr/}bin/mount ixr,
  mount fstype="tmpfs" tmpfs -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  deny /run/mount/utab{,.lock} rw,

  # For mounting volume subPaths
  mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  mount options=(rw, remount, bind) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  # nvme0-99, 1-63 partitions with 1-63 optional namespaces
  mount /dev/nvme{[0-9],[1-9][0-9]}n{[1-9],[1-5][0-9],6[0-3]}{,p{[1-9],[1-5][0-9],6[0-3]}} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # SCSI sda-sdiv, 1-15 partitions
  mount /dev/sd{[a-z],[a-h][a-z],i[a-v]}{[1-9],1[0-5]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # virtio vda-vdz, 1-63 partitions
  mount /dev/vd[a-z]{[1-9],[1-5][0-9],6[0-3]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

  # When mounting a volume subPath, kubelet binds mounts on an open fd (eg,
  # /proc/.../fd/N) which triggers a ptrace 'read' denial on the parent
  # kubelet peer process from this child profile due to PTRACE_MODE_READ (man
  # ptrace) checks.
  ptrace (read) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on kubelet (our parent peer)
  # since systemd-run won't be able to ptrace this snap's processes (kubelet
  # would also need a corresponding tracedby rule). This can be dropped once
  # LP: #1890848 is fixed.
  ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},
`
   213  
// kubernetesSupportConnectedPlugAppArmorAutobindUnix is the AppArmor policy
// that allows bind() on datagram unix sockets. AppArmorConnectedPlug appends
// it for every flavor; for the "autobind-unix" flavor it is the only policy
// that is added.
//
// k8s.io/apiserver/pkg/storage/etcd3/logger.go pulls in go-systemd via
// go.etcd.io/etcd/clientv3. See:
// https://github.com/coreos/go-systemd/blob/master/journal/journal.go#L211
const kubernetesSupportConnectedPlugAppArmorAutobindUnix = `
# Allow using the 'autobind' feature of bind() (eg, for journald).
#unix (bind) type=dgram addr=none,
# Due to LP: 1867216, we cannot use the above rule and must instead use this
# less specific rule that allows bind() to arbitrary SOCK_DGRAM abstract socket
# names (separate send and receive rules are still required for communicating
# over the socket).
unix (bind) type=dgram,
`
   226  
   227  const kubernetesSupportConnectedPlugSeccompAutobindUnix = `
   228  # Allow using the 'autobind' feature of bind() (eg, for journald).
   229  bind
   230  `
   231  
   232  const kubernetesSupportConnectedPlugSeccompKubelet = `
   233  # Allow running as the kubelet service
   234  mount
   235  umount
   236  umount2
   237  
   238  unshare
   239  setns - CLONE_NEWNET
   240  
   241  # When fsGroup is set, the pod's volume will be recursively chowned with the
   242  # setgid bit set on directories so new files will be owned by the fsGroup. See
   243  # kubernetes pkg/volume/volume_linux.go:changeFilePermission()
   244  fchownat
   245  `
   246  
   247  var kubernetesSupportConnectedPlugUDevKubelet = []string{
   248  	`KERNEL=="kmsg"`,
   249  }
   250  
// kubernetesSupportConnectedPlugAppArmorKubeproxy is the AppArmor policy for
// running as the kubeproxy service. AppArmorConnectedPlug appends it to the
// common policy for the "kubeproxy" flavor and when no flavor is set.
const kubernetesSupportConnectedPlugAppArmorKubeproxy = `
# Allow running as the kubeproxy service

# managing our own cgroup
/sys/fs/cgroup/*/kube-proxy/{,**} rw,

# Allow reading the state of modules kubernetes needs
/sys/module/libcrc32c/initstate r,
/sys/module/llc/initstate r,
/sys/module/stp/initstate r,
/sys/module/ip_vs/initstate r,
/sys/module/ip_vs_rr/initstate r,
/sys/module/ip_vs_sh/initstate r,
/sys/module/ip_vs_wrr/initstate r,
`
   266  
   267  var kubernetesSupportConnectedPlugKmodKubeProxy = []string{
   268  	`ip_vs_rr`,
   269  	`ip_vs_sh`,
   270  	`ip_vs_wrr`,
   271  	`libcrc32c`,
   272  	`llc`,
   273  	`stp`,
   274  }
   275  
// kubernetesSupportInterface implements the kubernetes-support interface. It
// embeds commonInterface (populated in init) and adds flavor-dependent
// AppArmor, seccomp, udev and kmod handling through its own methods.
type kubernetesSupportInterface struct {
	commonInterface
}
   279  
   280  func k8sFlavor(plug *interfaces.ConnectedPlug) string {
   281  	var flavor string
   282  	_ = plug.Attr("flavor", &flavor)
   283  	return flavor
   284  }
   285  
   286  func (iface *kubernetesSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   287  	snippet := kubernetesSupportConnectedPlugAppArmorCommon
   288  	systemd_run_extra := ""
   289  
   290  	// All flavors should include the autobind-unix rules, but we break it
   291  	// out so other k8s daemons can use this flavor without getting the
   292  	// privileged rules.
   293  	switch k8sFlavor(plug) {
   294  	case "kubelet":
   295  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   296  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   297  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   298  		spec.SetUsesPtraceTrace()
   299  	case "kubeproxy":
   300  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   301  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   302  	case "autobind-unix":
   303  		snippet = kubernetesSupportConnectedPlugAppArmorAutobindUnix
   304  	default:
   305  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   306  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   307  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   308  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   309  		spec.SetUsesPtraceTrace()
   310  	}
   311  
   312  	old := "###KUBERNETES_SUPPORT_SYSTEMD_RUN###"
   313  	spec.AddSnippet(strings.Replace(snippet, old, systemd_run_extra, -1))
   314  	return nil
   315  }
   316  
   317  func (iface *kubernetesSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   318  	// All flavors should include the autobind-unix rules, but we add the
   319  	// privileged kubelet rules conditionally.
   320  	snippet := kubernetesSupportConnectedPlugSeccompAutobindUnix
   321  	flavor := k8sFlavor(plug)
   322  	if flavor == "kubelet" || flavor == "" {
   323  		snippet += kubernetesSupportConnectedPlugSeccompKubelet
   324  	}
   325  	spec.AddSnippet(snippet)
   326  	return nil
   327  }
   328  
   329  func (iface *kubernetesSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   330  	flavor := k8sFlavor(plug)
   331  	if flavor == "kubelet" || flavor == "" {
   332  		for _, rule := range kubernetesSupportConnectedPlugUDevKubelet {
   333  			spec.TagDevice(rule)
   334  		}
   335  	}
   336  	return nil
   337  }
   338  
   339  func (iface *kubernetesSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   340  	flavor := k8sFlavor(plug)
   341  	if flavor == "kubeproxy" || flavor == "" {
   342  		for _, m := range kubernetesSupportConnectedPlugKmodKubeProxy {
   343  			if err := spec.AddModule(m); err != nil {
   344  				return err
   345  			}
   346  		}
   347  	}
   348  	return nil
   349  }
   350  
   351  func (iface *kubernetesSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   352  	// It's fine if flavor isn't specified, but if it is, it needs to be
   353  	// either "kubelet", "kubeproxy" or "autobind-unix"
   354  	if t, ok := plug.Attrs["flavor"]; ok && t != "kubelet" && t != "kubeproxy" && t != "autobind-unix" {
   355  		return fmt.Errorf(`kubernetes-support plug requires "flavor" to be either "kubelet", "kubeproxy" or "autobind-unix"`)
   356  	}
   357  
   358  	return nil
   359  }
   360  
   361  func init() {
   362  	registerIface(&kubernetesSupportInterface{commonInterface{
   363  		name:                 "kubernetes-support",
   364  		summary:              kubernetesSupportSummary,
   365  		implicitOnClassic:    true,
   366  		implicitOnCore:       true,
   367  		baseDeclarationPlugs: kubernetesSupportBaseDeclarationPlugs,
   368  		baseDeclarationSlots: kubernetesSupportBaseDeclarationSlots,
   369  	}})
   370  }