github.com/ubuntu-core/snappy@v0.0.0-20210827154228-9e584df982bb/interfaces/builtin/docker_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  
    25  	"github.com/snapcore/snapd/interfaces"
    26  	"github.com/snapcore/snapd/interfaces/apparmor"
    27  	"github.com/snapcore/snapd/interfaces/kmod"
    28  	"github.com/snapcore/snapd/interfaces/seccomp"
    29  	"github.com/snapcore/snapd/release"
    30  	"github.com/snapcore/snapd/snap"
    31  )
    32  
// dockerSupportSummary is the one-line description registered as the
// interface summary in init.
const dockerSupportSummary = `allows operating as the Docker daemon`

// dockerSupportBaseDeclarationPlugs is the base-declaration fragment for
// docker-support plugs: installation is not allowed and auto-connection is
// denied.
const dockerSupportBaseDeclarationPlugs = `
  docker-support:
    allow-installation: false
    deny-auto-connection: true
`

// dockerSupportBaseDeclarationSlots is the base-declaration fragment for
// docker-support slots: installation is allowed only for slot snaps of type
// "core", and auto-connection is denied.
const dockerSupportBaseDeclarationSlots = `
  docker-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    48  
// dockerSupportConnectedPlugAppArmorCore is an extra AppArmor snippet that
// AppArmorConnectedPlug adds only when release.OnClassic is false. It grants
// access to the snap's data directories via their /system-data/ paths.
const dockerSupportConnectedPlugAppArmorCore = `
# These accesses are necessary for Ubuntu Core 16 and 18, likely due to the
# version of apparmor or the kernel which doesn't resolve the upper layer of an
# overlayfs mount correctly the accesses show up as runc trying to read from
# /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl,
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl,
`
    57  
// dockerSupportConnectedPlugAppArmor is the base AppArmor snippet that
// AppArmorConnectedPlug always adds for a connected docker-support plug. As
// the policy text itself notes, it is intentionally permissive and guards
// against programming errors rather than providing security confinement.
const dockerSupportConnectedPlugAppArmor = `
# Description: allow operating as the Docker daemon/containerd. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

#include <abstractions/dbus-strict>

# Allow sockets/etc for docker
/{,var/}run/docker.sock rw,
/{,var/}run/docker/     rw,
/{,var/}run/docker/**   mrwklix,
/{,var/}run/runc/       rw,
/{,var/}run/runc/**     mrwklix,

# Allow sockets/etc for containerd
/{,var/}run/containerd/{,s/,runc/,runc/k8s.io/,runc/k8s.io/*/} rw,
/{,var/}run/containerd/runc/k8s.io/*/** rwk,
/{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw,
/{,var/}run/containerd/io.containerd*/*/** rwk,
/{,var/}run/containerd/s/** rwk,

# Limit ipam-state to k8s
/run/ipam-state/k8s-** rw,
/run/ipam-state/k8s-*/lock k,

# Socket for docker-containerd-shim
unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00",

/{,var/}run/mount/utab r,

# Wide read access to /proc, but somewhat limited writes for now
@{PROC}/ r,
@{PROC}/** r,
@{PROC}/[0-9]*/attr/{,apparmor/}exec w,
@{PROC}/[0-9]*/oom_score_adj w,

# Limited read access to specific bits of /sys
/sys/kernel/mm/hugepages/ r,
/sys/kernel/mm/transparent_hugepage/{,**} r,
/sys/fs/cgroup/cpuset/cpuset.cpus r,
/sys/fs/cgroup/cpuset/cpuset.mems r,
/sys/module/apparmor/parameters/enabled r,

# Limit cgroup writes a bit (Docker uses a "docker" sub-group)
/sys/fs/cgroup/*/docker/   rw,
/sys/fs/cgroup/*/docker/** rw,

# Also allow cgroup writes to kubernetes pods
/sys/fs/cgroup/*/kubepods/ rw,
/sys/fs/cgroup/*/kubepods/** rw,

# containerd can also be configured to use the systemd cgroup driver via
# plugins.cri.systemd_cgroup = true which moves container processes into
# systemd-managed cgroups. This is now the recommended configuration since it
# provides a single cgroup manager (systemd) in an effort to achieve consistent
# views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing ourself (especially the "runc" process we create)
ptrace (trace) peer=@{profile_name},

# Docker needs a lot of caps, but limits them in the app container
capability,

# Docker does all kinds of mounts all over the filesystem
/dev/mapper/control rw,
/dev/mapper/docker* rw,
/dev/loop-control r,
/dev/loop[0-9]* rw,
/sys/devices/virtual/block/dm-[0-9]*/** r,
mount,
umount,

# After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN,
# Docker removes the leftover /.pivot_rootNNNNNN directory (which is now
# relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root)
pivot_root,
/.pivot_root[0-9]*/ rw,

# file descriptors (/proc/NNN/fd/X)
# file descriptors in the container show up here due to attach_disconnected
/[0-9]* rw,

# Docker needs to be able to create and load the profile it applies to
# containers ("docker-default")
/{,usr/}sbin/apparmor_parser ixr,
/etc/apparmor.d/cache/ r,            # apparmor 2.12 and below
/etc/apparmor.d/cache/.features r,
/etc/apparmor.d/{,cache/}docker* rw,
/var/cache/apparmor/{,*/} r,         # apparmor 2.13 and higher
/var/cache/apparmor/*/.features r,
/var/cache/apparmor/*/docker* rw,
/etc/apparmor.d/tunables/{,**} r,
/etc/apparmor.d/abstractions/{,**} r,
/etc/apparmor/parser.conf r,
/etc/apparmor/subdomain.conf r,
/sys/kernel/security/apparmor/.replace rw,
/sys/kernel/security/apparmor/{,**} r,

# use 'privileged-containers: true' to support --security-opts

# defaults for docker-default
change_profile unsafe /** -> docker-default,
signal (send) peer=docker-default,
ptrace (read, trace) peer=docker-default,

# defaults for containerd
change_profile unsafe /** -> cri-containerd.apparmor.d,
signal (send) peer=cri-containerd.apparmor.d,
ptrace (read, trace) peer=cri-containerd.apparmor.d,

# Graph (storage) driver bits
/{dev,run}/shm/aufs.xino mrw,
/proc/fs/aufs/plink_maint w,
/sys/fs/aufs/** r,

#cf bug 1502785
/ r,

# recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx
# and so to allow allocating a new shell we need this
/dev/pts/ptmx rw,

# needed by runc for mitigation of CVE-2019-5736
# For details see https://bugs.launchpad.net/apparmor/+bug/1820344
/ ix,
/bin/runc ixr,

/pause ixr,
/bin/busybox ixr,

# When kubernetes drives containerd, containerd needs access to CNI services,
# like flanneld's subnet.env for DNS. This would ideally be snap-specific (it
# could if the control plane was a snap), but in deployments where the control
# plane is not a snap, it will tell flannel to use this path.
/run/flannel/{,**} rk,

# When kubernetes drives containerd, containerd needs access to various
# secrets for the pods which are overlayed at /run/secrets/....
# This would ideally be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# containerd to use this path for various account information for pods.
/run/secrets/kubernetes.io/{,**} rk,

# Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd)
# unix (bind) type=dgram addr=auto,
# TODO: when snapd vendors in AppArmor userspace, then enable the new syntax
# above which allows only "empty"/automatic addresses, for now we simply permit
# all addresses with SOCK_DGRAM type, which leaks info for other addresses than
# what docker tries to use
# see https://bugs.launchpad.net/snapd/+bug/1867216
unix (bind) type=dgram,
`
   215  
// dockerSupportConnectedPlugSecComp is the base seccomp snippet that
// SecCompConnectedPlug always adds for a connected docker-support plug. It
// lists one allowed syscall per line; lines starting with '#' are comments,
// including the syscalls that are deliberately left commented out.
const dockerSupportConnectedPlugSecComp = `
# Description: allow operating as the Docker daemon. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

# Because seccomp may only go more strict, we must allow all syscalls to Docker
# that it expects to give to containers in addition to what it needs to run and
# trust that docker daemon # only gives out reasonable syscalls to containers.

# Docker includes these in the default container whitelist, but they're
# potentially dangerous.
#finit_module
#init_module
#query_module
#delete_module

# These have a history of vulnerabilities, are not widely used, and
# open_by_handle_at has been used to break out of Docker containers by brute
# forcing the handle value: http://stealth.openwall.net/xSports/shocker.c
#name_to_handle_at
#open_by_handle_at

# Calls the Docker daemon itself requires

# /snap/docker/VERSION/bin/docker-runc
#   "do not inherit the parent's session keyring"
#   "make session keyring searcheable"
# runC uses this to ensure the container doesn't have access to the host
# keyring
keyctl

# /snap/docker/VERSION/bin/docker-runc
pivot_root

# ptrace can be abused to break out of the seccomp sandbox
# but is required by the Docker daemon.
ptrace

# This list comes from Docker's default seccomp whitelist (which is applied to
#   all containers launched unless a custom profile is specified or
#   "--privileged" is used)
# https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879
# It has been further filtered to exclude certain known-troublesome syscalls.
accept
accept4
access
acct
adjtimex
alarm
arch_prctl
bind
bpf
breakpoint
brk
cacheflush
capget
capset
chdir
chmod
chown
chown32
chroot
clock_getres
clock_getres_time64
clock_gettime
clock_gettime64
clock_nanosleep
clock_nanosleep_time64
clone
close
connect
copy_file_range
creat
dup
dup2
dup3
epoll_create
epoll_create1
epoll_ctl
epoll_ctl_old
epoll_pwait
epoll_wait
epoll_wait_old
eventfd
eventfd2
execve
execveat
exit
exit_group
faccessat
fadvise64
fadvise64_64
fallocate
fanotify_init
fanotify_mark
fchdir
fchmod
fchmodat
fchown
fchown32
fchownat
fcntl
fcntl64
fdatasync
fgetxattr
flistxattr
flock
fork
fremovexattr
fsetxattr
fstat
fstat64
fstatat64
fstatfs
fstatfs64
fsync
ftruncate
ftruncate64
futex
futex_time64
futimesat
getcpu
getcwd
getdents
getdents64
getegid
getegid32
geteuid
geteuid32
getgid
getgid32
getgroups
getgroups32
getitimer
getpeername
getpgid
getpgrp
getpid
getppid
getpriority
getrandom
getresgid
getresgid32
getresuid
getresuid32
getrlimit
get_robust_list
getrusage
getsid
getsockname
getsockopt
get_thread_area
get_tls
gettid
gettimeofday
getuid
getuid32
getxattr
inotify_add_watch
inotify_init
inotify_init1
inotify_rm_watch
io_cancel
ioctl
io_destroy
io_getevents
ioperm
iopl
ioprio_get
ioprio_set
io_setup
io_submit
ipc
kcmp
kill
lchown
lchown32
lgetxattr
link
linkat
listen
listxattr
llistxattr
_llseek
lookup_dcookie
lremovexattr
lseek
lsetxattr
lstat
lstat64
madvise
memfd_create
mincore
mkdir
mkdirat
mknod
mknodat
mlock
mlock2
mlockall
mmap
mmap2
modify_ldt
mount
mprotect
mq_getsetattr
mq_notify
mq_open
mq_timedreceive
mq_timedreceive_time64
mq_timedsend
mq_timedsend_time64
mq_unlink
mremap
msgctl
msgget
msgrcv
msgsnd
msync
munlock
munlockall
munmap
nanosleep
newfstatat
_newselect
open
openat
pause
perf_event_open
personality
pipe
pipe2
poll
ppoll
ppoll_time64
prctl
pread64
preadv
prlimit64
process_vm_readv
process_vm_writev
pselect6
pselect6_time64
pwrite64
pwritev
read
readahead
readlink
readlinkat
readv
reboot
recv
recvfrom
recvmmsg
recvmmsg_time64
recvmsg
remap_file_pages
removexattr
rename
renameat
renameat2
restart_syscall
rmdir
rt_sigaction
rt_sigpending
rt_sigprocmask
rt_sigqueueinfo
rt_sigreturn
rt_sigsuspend
rt_sigtimedwait
rt_sigtimedwait_time64
rt_tgsigqueueinfo
s390_pci_mmio_read
s390_pci_mmio_write
s390_runtime_instr
sched_getaffinity
sched_getattr
sched_getparam
sched_get_priority_max
sched_get_priority_min
sched_getscheduler
sched_rr_get_interval
sched_rr_get_interval_time64
sched_setaffinity
sched_setattr
sched_setparam
sched_setscheduler
sched_yield
seccomp
select
semctl
semget
semop
semtimedop
semtimedop_time64
send
sendfile
sendfile64
sendmmsg
sendmsg
sendto
setdomainname
setfsgid
setfsgid32
setfsuid
setfsuid32
setgid
setgid32
setgroups
setgroups32
sethostname
setitimer
setns
setpgid
setpriority
setregid
setregid32
setresgid
setresgid32
setresuid
setresuid32
setreuid
setreuid32
setrlimit
set_robust_list
setsid
setsockopt
set_thread_area
set_tid_address
settimeofday
set_tls
setuid
setuid32
setxattr
shmat
shmctl
shmdt
shmget
shutdown
sigaltstack
signalfd
signalfd4
sigreturn
socket
socketcall
socketpair
splice
stat
stat64
statfs
statfs64
stime
symlink
symlinkat
sync
sync_file_range
syncfs
sysinfo
syslog
tee
tgkill
time
timer_create
timer_delete
timerfd_create
timerfd_gettime
timerfd_gettime64
timerfd_settime
timerfd_settime64
timer_getoverrun
timer_gettime
timer_gettime64
timer_settime
timer_settime64
times
tkill
truncate
truncate64
ugetrlimit
umask
umount
umount2
uname
unlink
unlinkat
unshare
utime
utimensat
utimensat_time64
utimes
vfork
vhangup
vmsplice
wait4
waitid
waitpid
write
writev
`
   617  
// dockerSupportPrivilegedAppArmor is the additional AppArmor snippet that
// AppArmorConnectedPlug adds only when the plug's "privileged-containers"
// attribute is true. As the policy text states, it effectively grants device
// ownership to the connected snap.
const dockerSupportPrivilegedAppArmor = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# These rules are here to allow Docker to launch unconfined containers but
# allow the docker daemon itself to go unconfined. Since it runs as root, this
# grants device ownership.
change_profile unsafe /**,
signal (send) peer=unconfined,
ptrace (read, trace) peer=unconfined,

# This grants raw access to device files and thus device ownership
/dev/** mrwkl,
@{PROC}/** mrwkl,

# When kubernetes drives docker/containerd, it creates and runs files in the
# container at arbitrary locations (eg, via pivot_root).
/** rwlix,
`
   638  
// dockerSupportPrivilegedSecComp is the additional seccomp snippet that
// SecCompConnectedPlug appends to the base snippet only when the plug's
// "privileged-containers" attribute is true. It consists of the single
// "@unrestricted" directive.
const dockerSupportPrivilegedSecComp = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# This grants, among other things, kernel module loading and therefore device
# ownership.
@unrestricted
`
   648  
// dockerSupportServiceSnippet is the only entry of the serviceSnippets list
// registered in init.
// NOTE(review): presumably emitted into the systemd unit of the snap's
// services, where Delegate=true hands cgroup management to the service;
// confirm against the serviceSnippets consumer.
const dockerSupportServiceSnippet = `Delegate=true`
   650  
// dockerSupportInterface implements the docker-support interface. It embeds
// commonInterface and adds its own KModConnectedPlug, AppArmorConnectedPlug,
// SecCompConnectedPlug, BeforePreparePlug and AutoConnect methods.
type dockerSupportInterface struct {
	commonInterface
}
   654  
   655  func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   656  	// https://kubernetes.io/docs/setup/production-environment/container-runtimes/
   657  	if err := spec.AddModule("overlay"); err != nil {
   658  		return err
   659  	}
   660  	return nil
   661  }
   662  
   663  func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   664  	var privileged bool
   665  	_ = plug.Attr("privileged-containers", &privileged)
   666  
   667  	// The 'change_profile unsafe' rules conflict with the 'ix' rules in
   668  	// the home interface, so suppress them (LP: #1797786)
   669  	spec.SetSuppressHomeIx()
   670  	spec.AddSnippet(dockerSupportConnectedPlugAppArmor)
   671  	if privileged {
   672  		spec.AddSnippet(dockerSupportPrivilegedAppArmor)
   673  	}
   674  	if !release.OnClassic {
   675  		spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore)
   676  	}
   677  	spec.SetUsesPtraceTrace()
   678  	return nil
   679  }
   680  
   681  func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   682  	var privileged bool
   683  	_ = plug.Attr("privileged-containers", &privileged)
   684  	snippet := dockerSupportConnectedPlugSecComp
   685  	if privileged {
   686  		snippet += dockerSupportPrivilegedSecComp
   687  	}
   688  	spec.AddSnippet(snippet)
   689  	return nil
   690  }
   691  
   692  func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   693  	if v, ok := plug.Attrs["privileged-containers"]; ok {
   694  		if _, ok = v.(bool); !ok {
   695  			return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'")
   696  		}
   697  	}
   698  	return nil
   699  }
   700  
   701  func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool {
   702  	// allow what declarations allowed
   703  	return true
   704  }
   705  
   706  func init() {
   707  	registerIface(&dockerSupportInterface{commonInterface{
   708  		name:                 "docker-support",
   709  		summary:              dockerSupportSummary,
   710  		implicitOnCore:       true,
   711  		implicitOnClassic:    true,
   712  		baseDeclarationPlugs: dockerSupportBaseDeclarationPlugs,
   713  		baseDeclarationSlots: dockerSupportBaseDeclarationSlots,
   714  		controlsDeviceCgroup: true,
   715  		serviceSnippets:      []string{dockerSupportServiceSnippet},
   716  		// docker-support also uses ptrace(trace), but it already declares this in
   717  		// the AppArmorConnectedPlug method
   718  	}})
   719  }