gitee.com/mysnapcore/mysnapd@v0.1.0/interfaces/builtin/kubernetes_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2017-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  	"strings"
    25  
    26  	"gitee.com/mysnapcore/mysnapd/interfaces"
    27  	"gitee.com/mysnapcore/mysnapd/interfaces/apparmor"
    28  	"gitee.com/mysnapcore/mysnapd/interfaces/kmod"
    29  	"gitee.com/mysnapcore/mysnapd/interfaces/seccomp"
    30  	"gitee.com/mysnapcore/mysnapd/interfaces/udev"
    31  	"gitee.com/mysnapcore/mysnapd/snap"
    32  )
    33  
// kubernetesSupportSummary is the one-line summary registered for the
// kubernetes-support interface (see init in this file).
const kubernetesSupportSummary = `allows operating as the Kubernetes service`
    35  
// kubernetesSupportBaseDeclarationPlugs is the plug-side base declaration
// fragment for kubernetes-support: installation of a snap with this plug is
// not allowed and auto-connection is denied.
const kubernetesSupportBaseDeclarationPlugs = `
  kubernetes-support:
    allow-installation: false
    deny-auto-connection: true
`
    41  
// kubernetesSupportBaseDeclarationSlots is the slot-side base declaration
// fragment for kubernetes-support: installation is allowed only for slots on
// snaps of type "core", and auto-connection is denied.
const kubernetesSupportBaseDeclarationSlots = `
  kubernetes-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    49  
// kubernetesSupportConnectedPlugAppArmorCommon holds the AppArmor rules used
// as the starting snippet by AppArmorConnectedPlug for every flavor except
// "autobind-unix" (which replaces the snippet entirely).
//
// The text defines a "systemd_run" child profile. Just before that profile's
// closing brace sits the ###KUBERNETES_SUPPORT_SYSTEMD_RUN### placeholder,
// which AppArmorConnectedPlug substitutes with flavor-specific rules or with
// the empty string.
const kubernetesSupportConnectedPlugAppArmorCommon = `
# Common rules for running as a kubernetes node

# reading cgroups
capability sys_resource,
/sys/fs/cgroup/{,**} r,

# Allow adjusting the OOM score for containers. Note, this allows adjusting for
# all processes, not just containers.
@{PROC}/@{pid}/oom_score_adj rw,
@{PROC}/sys/vm/overcommit_memory rw,
/sys/kernel/mm/hugepages/{,**} r,
/sys/kernel/mm/transparent_hugepage/{,**} r,

capability dac_override,

# Lock file used by Calico's IPAM plugin. This is configurable via the
# (undocumented) "ipam_lock_file" configuration key:
# https://github.com/projectcalico/cni-plugin/blob/master/pkg/types/types.go
/{,var/}run/calico/ipam.lock rwk,

# manually add java certs here
# see also https://bugs.launchpad.net/apparmor/+bug/1816372
/etc/ssl/certs/java/{,*} r,
#include <abstractions/ssl_certs>

/{,usr/}bin/systemd-run Cxr -> systemd_run,
/run/systemd/private r,
profile systemd_run (attach_disconnected,mediate_deleted) {
  # Common rules for kubernetes use of systemd_run
  #include <abstractions/base>

  /{,usr/}bin/systemd-run rm,
  owner @{PROC}/@{pid}/stat r,
  owner @{PROC}/@{pid}/environ r,
  @{PROC}/cmdline r,  # proc_cmdline()

  # setsockopt()
  capability net_admin,

  # systemd-run's detect_container() looks at several files to determine if it
  # is running in a container.
  @{PROC}/sys/kernel/osrelease r,
  @{PROC}/1/sched r,
  /run/systemd/container r,

  # kubelet calls 'systemd-run --scope true' to determine if systemd is
  # available and usable for calling certain mount commands under transient
  # units as part of its lifecycle management. This requires ptrace 'read' on
  # unconfined since systemd-run will call its detect_container() which will
  # try to read /proc/1/environ. This is mediated via PTRACE_MODE_READ when
  # run within kubelet's namespace.
  ptrace (read) peer=unconfined,
  /run/systemd/private rw,

  # kubelet calling 'systemd-run --scope true' triggers this when kubelet is
  # run in a nested container (eg, under lxd).
  @{PROC}/1/cmdline r,

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on unconfined (which systemd
  # runs as) since systemd-run won't be able to ptrace this snap's processes.
  # This can be dropped once LP: #1890848 is fixed.
  ptrace (trace) peer=unconfined,

  /{,usr/}bin/true ixr,
  @{INSTALL_DIR}/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,usr/}bin/true ixr,
###KUBERNETES_SUPPORT_SYSTEMD_RUN###
}
`
   122  
// kubernetesSupportConnectedPlugAppArmorKubelet holds the AppArmor rules for
// running as the kubelet service. AppArmorConnectedPlug appends it to the
// common snippet for the "kubelet" flavor and for the default (unspecified)
// flavor.
const kubernetesSupportConnectedPlugAppArmorKubelet = `
# Allow running as the kubelet service

# Ideally this would be snap-specific
/run/dockershim.sock rw,

# Ideally this would be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# flannel to use this path.
/run/flannel/{,**} rw,
/run/flannel/** k,

# allow managing pods' cgroups
/sys/fs/cgroup/*/kubepods/{,**} rw,

# kubelet can be configured to use the systemd cgroup driver which moves
# container processes into systemd-managed cgroups. This is now the recommended
# configuration since it provides a single cgroup manager (systemd) in an
# effort to achieve consistent views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing our own processes. Note, this allows seccomp sandbox escape on
# kernels < 4.8
capability sys_ptrace,
ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.*,

# Allow ptracing other processes (as part of ps-style process lookups). Note,
# the peer needs a corresponding tracedby rule. As a special case, disallow
# ptracing unconfined.
ptrace (trace),
deny ptrace (trace) peer=unconfined,

@{PROC}/[0-9]*/attr/ r,
@{PROC}/[0-9]*/fdinfo/ r,
@{PROC}/[0-9]*/map_files/ r,
@{PROC}/[0-9]*/ns/{,*} r,
# dac_read_search needed for lstat'ing non-root owned ns/* files
capability dac_read_search,

# kubernetes will verify and set panic and panic_on_oops to values it considers
# sane
@{PROC}/sys/kernel/panic w,
@{PROC}/sys/kernel/panic_on_oops w,
@{PROC}/sys/kernel/keys/root_maxbytes r,
@{PROC}/sys/kernel/keys/root_maxkeys r,

/dev/kmsg r,

# kubelet calls out to systemd-run for some mounts, but not all of them and not
# unmounts...
capability sys_admin,
mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
mount options=(rw, rshared) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},

/{,usr/}bin/mount ixr,
/{,usr/}bin/umount ixr,
deny /run/mount/utab{,.lock} rw,
umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

# When fsGroup is set, the pod's volume will be recursively chowned with the
# setgid bit set on directories so new files will be owned by the fsGroup. See
# kubernetes pkg/volume/volume_linux.go:changeFilePermission()
capability fsetid,
`
   189  
// kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun holds extra rules
// for the systemd_run child profile. AppArmorConnectedPlug substitutes this
// text for the ###KUBERNETES_SUPPORT_SYSTEMD_RUN### placeholder of the common
// snippet for the "kubelet" flavor and for the default (unspecified) flavor;
// the rules are indented to match the body of that child profile.
const kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun = `
  # kubelet mount rules
  capability sys_admin,
  /{,usr/}bin/mount ixr,
  mount fstype="tmpfs" tmpfs -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  deny /run/mount/utab{,.lock} rw,

  # For mounting volume subPaths
  mount /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  mount options=(rw, remount, bind) -> /var/snap/@{SNAP_INSTANCE_NAME}/common/{,**},
  # nvme0-99, 1-63 partitions with 1-63 optional namespaces
  mount /dev/nvme{[0-9],[1-9][0-9]}n{[1-9],[1-5][0-9],6[0-3]}{,p{[1-9],[1-5][0-9],6[0-3]}} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # SCSI sda-sdiv, 1-15 partitions
  mount /dev/sd{[a-z],[a-h][a-z],i[a-v]}{[1-9],1[0-5]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  # virtio vda-vdz, 1-63 partitions
  mount /dev/vd[a-z]{[1-9],[1-5][0-9],6[0-3]} -> /var/snap/@{SNAP_INSTANCE_NAME}/common/**,
  umount /var/snap/@{SNAP_INSTANCE_NAME}/common/**,

  # When mounting a volume subPath, kubelet binds mounts on an open fd (eg,
  # /proc/.../fd/N) which triggers a ptrace 'read' denial on the parent
  # kubelet peer process from this child profile due to PTRACE_MODE_READ (man
  # ptrace) checks.
  ptrace (read) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},

  # Ubuntu's ptrace patchset before (at least) 20.04 did not correctly evaluate
  # PTRACE_MODE_READ and policy required 'trace' instead of 'read'.
  # (LP: #1890848). This child profile doesn't have 'capability sys_ptrace', so
  # continue to allow this historic 'trace' rule on kubelet (our parent peer)
  # since systemd-run won't be able to ptrace this snap's processes (kubelet
  # would also need a corresponding tracedby rule). This can be dropped once
  # LP: #1890848 is fixed.
  ptrace (trace) peer=snap.@{SNAP_INSTANCE_NAME}.@{SNAP_COMMAND_NAME},
`
   223  
// kubernetesSupportConnectedPlugAppArmorAutobindUnix holds the AppArmor rule
// permitting bind() on SOCK_DGRAM unix sockets. AppArmorConnectedPlug adds it
// for every flavor; for the "autobind-unix" flavor it is the only snippet.
//
// Background: k8s.io/apiserver/pkg/storage/etcd3/logger.go pulls in go-systemd
// via go.etcd.io/etcd/clientv3. See:
// https://github.com/coreos/go-systemd/blob/master/journal/journal.go#L211
const kubernetesSupportConnectedPlugAppArmorAutobindUnix = `
# Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd)
# unix (bind) type=dgram addr=auto,
# TODO: when snapd vendors in AppArmor userspace, then enable the new syntax
# above which allows only "empty"/automatic addresses, for now we simply permit
# all addresses with SOCK_DGRAM type, which leaks info for other addresses than
# what docker tries to use
# see https://bugs.launchpad.net/snapd/+bug/1867216
unix (bind) type=dgram,
`
   237  
// kubernetesSupportConnectedPlugSeccompAutobindUnix is the seccomp snippet
// that SecCompConnectedPlug adds for every flavor; it allows the bind syscall.
const kubernetesSupportConnectedPlugSeccompAutobindUnix = `
# Allow using the 'autobind' feature of bind() (eg, for journald).
bind
`
   242  
// kubernetesSupportConnectedPlugSeccompKubelet is the additional seccomp
// snippet that SecCompConnectedPlug appends when the flavor is "kubelet" or
// unspecified.
const kubernetesSupportConnectedPlugSeccompKubelet = `
# Allow running as the kubelet service
mount
umount
umount2

unshare
setns - CLONE_NEWNET

# When fsGroup is set, the pod's volume will be recursively chowned with the
# setgid bit set on directories so new files will be owned by the fsGroup. See
# kubernetes pkg/volume/volume_linux.go:changeFilePermission()
fchownat
`
   257  
// kubernetesSupportConnectedPlugUDevKubelet lists the udev match expressions
// that UDevConnectedPlug passes to spec.TagDevice when the flavor is
// "kubelet" or unspecified.
var kubernetesSupportConnectedPlugUDevKubelet = []string{
	`KERNEL=="kmsg"`,
}
   261  
// kubernetesSupportConnectedPlugAppArmorKubeproxy holds the AppArmor rules
// for running as the kubeproxy service. AppArmorConnectedPlug appends it to
// the common snippet for the "kubeproxy" flavor and for the default
// (unspecified) flavor.
const kubernetesSupportConnectedPlugAppArmorKubeproxy = `
# Allow running as the kubeproxy service

# managing our own cgroup
/sys/fs/cgroup/*/kube-proxy/{,**} rw,

# Allow reading the state of modules kubernetes needs
/sys/module/libcrc32c/initstate r,
/sys/module/llc/initstate r,
/sys/module/stp/initstate r,
/sys/module/ip_vs/initstate r,
/sys/module/ip_vs_rr/initstate r,
/sys/module/ip_vs_sh/initstate r,
/sys/module/ip_vs_wrr/initstate r,
`
   277  
// kubernetesSupportConnectedPlugKmodKubeProxy lists the kernel module names
// that KModConnectedPlug passes to spec.AddModule when the flavor is
// "kubeproxy" or unspecified.
var kubernetesSupportConnectedPlugKmodKubeProxy = []string{
	`ip_vs_rr`,
	`ip_vs_sh`,
	`ip_vs_wrr`,
	`libcrc32c`,
	`llc`,
	`stp`,
}
   286  
// kubernetesSupportInterface is the type registered for the
// "kubernetes-support" interface (see init). It embeds commonInterface and
// adds the flavor-dependent methods defined in this file.
type kubernetesSupportInterface struct {
	commonInterface
}
   290  
   291  func (iface *kubernetesSupportInterface) ServicePermanentPlug(plug *snap.PlugInfo) []string {
   292  	// only autobind-unix flavor does not get Delegate=true, all other flavors
   293  	// are usable to manage control groups of processes/containers, and thus
   294  	// need Delegate=true
   295  	flavor := k8sFlavor(plug)
   296  	if flavor == "autobind-unix" {
   297  		return nil
   298  	}
   299  
   300  	return []string{"Delegate=true"}
   301  }
   302  
   303  func k8sFlavor(plug interfaces.Attrer) string {
   304  	var flavor string
   305  	_ = plug.Attr("flavor", &flavor)
   306  	return flavor
   307  }
   308  
   309  func (iface *kubernetesSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   310  	snippet := kubernetesSupportConnectedPlugAppArmorCommon
   311  	systemd_run_extra := ""
   312  
   313  	// All flavors should include the autobind-unix rules, but we break it
   314  	// out so other k8s daemons can use this flavor without getting the
   315  	// privileged rules.
   316  	switch k8sFlavor(plug) {
   317  	case "kubelet":
   318  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   319  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   320  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   321  		spec.SetUsesPtraceTrace()
   322  	case "kubeproxy":
   323  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   324  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   325  	case "autobind-unix":
   326  		snippet = kubernetesSupportConnectedPlugAppArmorAutobindUnix
   327  	default:
   328  		systemd_run_extra = kubernetesSupportConnectedPlugAppArmorKubeletSystemdRun
   329  		snippet += kubernetesSupportConnectedPlugAppArmorKubelet
   330  		snippet += kubernetesSupportConnectedPlugAppArmorKubeproxy
   331  		snippet += kubernetesSupportConnectedPlugAppArmorAutobindUnix
   332  		spec.SetUsesPtraceTrace()
   333  	}
   334  
   335  	old := "###KUBERNETES_SUPPORT_SYSTEMD_RUN###"
   336  	spec.AddSnippet(strings.Replace(snippet, old, systemd_run_extra, -1))
   337  	return nil
   338  }
   339  
   340  func (iface *kubernetesSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   341  	// All flavors should include the autobind-unix rules, but we add the
   342  	// privileged kubelet rules conditionally.
   343  	snippet := kubernetesSupportConnectedPlugSeccompAutobindUnix
   344  	flavor := k8sFlavor(plug)
   345  	if flavor == "kubelet" || flavor == "" {
   346  		snippet += kubernetesSupportConnectedPlugSeccompKubelet
   347  	}
   348  	spec.AddSnippet(snippet)
   349  	return nil
   350  }
   351  
   352  func (iface *kubernetesSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   353  	flavor := k8sFlavor(plug)
   354  	if flavor == "kubelet" || flavor == "" {
   355  		for _, rule := range kubernetesSupportConnectedPlugUDevKubelet {
   356  			spec.TagDevice(rule)
   357  		}
   358  	}
   359  	return nil
   360  }
   361  
   362  func (iface *kubernetesSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   363  	flavor := k8sFlavor(plug)
   364  	if flavor == "kubeproxy" || flavor == "" {
   365  		for _, m := range kubernetesSupportConnectedPlugKmodKubeProxy {
   366  			if err := spec.AddModule(m); err != nil {
   367  				return err
   368  			}
   369  		}
   370  	}
   371  	return nil
   372  }
   373  
   374  func (iface *kubernetesSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   375  	// It's fine if flavor isn't specified, but if it is, it needs to be
   376  	// either "kubelet", "kubeproxy" or "autobind-unix"
   377  	if t, ok := plug.Attrs["flavor"]; ok && t != "kubelet" && t != "kubeproxy" && t != "autobind-unix" {
   378  		return fmt.Errorf(`kubernetes-support plug requires "flavor" to be either "kubelet", "kubeproxy" or "autobind-unix"`)
   379  	}
   380  
   381  	return nil
   382  }
   383  
   384  func init() {
   385  	registerIface(&kubernetesSupportInterface{commonInterface{
   386  		name:                 "kubernetes-support",
   387  		summary:              kubernetesSupportSummary,
   388  		implicitOnClassic:    true,
   389  		implicitOnCore:       true,
   390  		baseDeclarationPlugs: kubernetesSupportBaseDeclarationPlugs,
   391  		baseDeclarationSlots: kubernetesSupportBaseDeclarationSlots,
   392  	}})
   393  }