github.com/hugh712/snapd@v0.0.0-20200910133618-1a99902bd583/interfaces/builtin/docker_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  
    25  	"github.com/snapcore/snapd/interfaces"
    26  	"github.com/snapcore/snapd/interfaces/apparmor"
    27  	"github.com/snapcore/snapd/interfaces/kmod"
    28  	"github.com/snapcore/snapd/interfaces/seccomp"
    29  	"github.com/snapcore/snapd/interfaces/udev"
    30  	"github.com/snapcore/snapd/release"
    31  	apparmor_sandbox "github.com/snapcore/snapd/sandbox/apparmor"
    32  	"github.com/snapcore/snapd/snap"
    33  )
    34  
// dockerSupportSummary is the one-line interface summary reported through
// StaticInfo.
const dockerSupportSummary = `allows operating as the Docker daemon`
    36  
// dockerSupportBaseDeclarationPlugs is the base-declaration fragment for the
// plug side of docker-support, reported through StaticInfo. It disallows
// installation and denies auto-connection.
const dockerSupportBaseDeclarationPlugs = `
  docker-support:
    allow-installation: false
    deny-auto-connection: true
`
    42  
// dockerSupportBaseDeclarationSlots is the base-declaration fragment for the
// slot side of docker-support, reported through StaticInfo. It limits
// installation to slots on snaps of type "core" and denies auto-connection.
const dockerSupportBaseDeclarationSlots = `
  docker-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    50  
// dockerSupportConnectedPlugAppArmorCore holds extra AppArmor rules that
// AppArmorConnectedPlug adds only when release.OnClassic is false. The rules
// grant rwl access to the snap's common and per-revision data directories via
// their /system-data paths.
const dockerSupportConnectedPlugAppArmorCore = `
# These accesses are necessary for Ubuntu Core 16 and 18, likely due to the
# version of apparmor or the kernel which doesn't resolve the upper layer of an
# overlayfs mount correctly the accesses show up as runc trying to read from
# /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl,
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl,
`
    59  
// dockerSupportConnectedPlugAppArmor is the base AppArmor snippet that
// AppArmorConnectedPlug adds for every connected docker-support plug,
// regardless of the "privileged-containers" attribute. As the embedded policy
// comments state, it is intentionally permissive and is meant to guard against
// programming errors rather than to confine the daemon.
const dockerSupportConnectedPlugAppArmor = `
# Description: allow operating as the Docker daemon/containerd. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

#include <abstractions/dbus-strict>

# Allow sockets/etc for docker
/{,var/}run/docker.sock rw,
/{,var/}run/docker/     rw,
/{,var/}run/docker/**   mrwklix,
/{,var/}run/runc/       rw,
/{,var/}run/runc/**     mrwklix,

# Allow sockets/etc for containerd
/{,var/}run/containerd/{,runc/,runc/k8s.io/,runc/k8s.io/*/} rw,
/{,var/}run/containerd/runc/k8s.io/*/** rwk,
/{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw,
/{,var/}run/containerd/io.containerd*/*/** rwk,

# Limit ipam-state to k8s
/run/ipam-state/k8s-** rw,
/run/ipam-state/k8s-*/lock k,

# Socket for docker-containerd-shim
unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00",

/{,var/}run/mount/utab r,

# Wide read access to /proc, but somewhat limited writes for now
@{PROC}/ r,
@{PROC}/** r,
@{PROC}/[0-9]*/attr/exec w,
@{PROC}/[0-9]*/oom_score_adj w,

# Limited read access to specific bits of /sys
/sys/kernel/mm/hugepages/ r,
/sys/kernel/mm/transparent_hugepage/{,**} r,
/sys/fs/cgroup/cpuset/cpuset.cpus r,
/sys/fs/cgroup/cpuset/cpuset.mems r,
/sys/module/apparmor/parameters/enabled r,

# Limit cgroup writes a bit (Docker uses a "docker" sub-group)
/sys/fs/cgroup/*/docker/   rw,
/sys/fs/cgroup/*/docker/** rw,

# Also allow cgroup writes to kubernetes pods
/sys/fs/cgroup/*/kubepods/ rw,
/sys/fs/cgroup/*/kubepods/** rw,

# containerd can also be configured to use the systemd cgroup driver via
# plugins.cri.systemd_cgroup = true which moves container processes into
# systemd-managed cgroups. This is now the recommended configuration since it
# provides a single cgroup manager (systemd) in an effort to achieve consistent
# views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing ourself (especially the "runc" process we create)
ptrace (trace) peer=@{profile_name},

# Docker needs a lot of caps, but limits them in the app container
capability,

# Docker does all kinds of mounts all over the filesystem
/dev/mapper/control rw,
/dev/mapper/docker* rw,
/dev/loop-control r,
/dev/loop[0-9]* rw,
/sys/devices/virtual/block/dm-[0-9]*/** r,
mount,
umount,

# After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN,
# Docker removes the leftover /.pivot_rootNNNNNN directory (which is now
# relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root)
pivot_root,
/.pivot_root[0-9]*/ rw,

# file descriptors (/proc/NNN/fd/X)
# file descriptors in the container show up here due to attach_disconnected
/[0-9]* rw,

# Docker needs to be able to create and load the profile it applies to
# containers ("docker-default")
/sbin/apparmor_parser ixr,
/etc/apparmor.d/cache/ r,            # apparmor 2.12 and below
/etc/apparmor.d/cache/.features r,
/etc/apparmor.d/{,cache/}docker* rw,
/var/cache/apparmor/{,*/} r,         # apparmor 2.13 and higher
/var/cache/apparmor/*/.features r,
/var/cache/apparmor/*/docker* rw,
/etc/apparmor.d/tunables/{,**} r,
/etc/apparmor.d/abstractions/{,**} r,
/etc/apparmor/parser.conf r,
/etc/apparmor/subdomain.conf r,
/sys/kernel/security/apparmor/.replace rw,
/sys/kernel/security/apparmor/{,**} r,

# use 'privileged-containers: true' to support --security-opts

# defaults for docker-default
change_profile unsafe /** -> docker-default,
signal (send) peer=docker-default,
ptrace (read, trace) peer=docker-default,

# defaults for containerd
change_profile unsafe /** -> cri-containerd.apparmor.d,
signal (send) peer=cri-containerd.apparmor.d,
ptrace (read, trace) peer=cri-containerd.apparmor.d,

# Graph (storage) driver bits
/{dev,run}/shm/aufs.xino mrw,
/proc/fs/aufs/plink_maint w,
/sys/fs/aufs/** r,

#cf bug 1502785
/ r,

# recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx
# and so to allow allocating a new shell we need this
/dev/pts/ptmx rw,

# needed by runc for mitigation of CVE-2019-5736
# For details see https://bugs.launchpad.net/apparmor/+bug/1820344
/ ix,
/bin/runc ixr,

/pause ixr,
/bin/busybox ixr,

# When kubernetes drives containerd, containerd needs access to CNI services,
# like flanneld's subnet.env for DNS. This would ideally be snap-specific (it
# could if the control plane was a snap), but in deployments where the control
# plane is not a snap, it will tell flannel to use this path.
/run/flannel/{,**} rk,

# When kubernetes drives containerd, containerd needs access to various
# secrets for the pods which are overlayed at /run/secrets/....
# This would ideally be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# containerd to use this path for various account information for pods.
/run/secrets/kubernetes.io/{,**} rk,
`
   207  
// dockerSupportConnectedPlugSecComp is the base seccomp snippet that
// SecCompConnectedPlug adds for every connected docker-support plug. It is a
// list of allowed syscall names, one per line; lines starting with '#' are
// comments, including the module-loading and handle-based syscalls that are
// deliberately left commented out.
const dockerSupportConnectedPlugSecComp = `
# Description: allow operating as the Docker daemon. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

# Because seccomp may only go more strict, we must allow all syscalls to Docker
# that it expects to give to containers in addition to what it needs to run and
# trust that docker daemon # only gives out reasonable syscalls to containers.

# Docker includes these in the default container whitelist, but they're
# potentially dangerous.
#finit_module
#init_module
#query_module
#delete_module

# These have a history of vulnerabilities, are not widely used, and
# open_by_handle_at has been used to break out of Docker containers by brute
# forcing the handle value: http://stealth.openwall.net/xSports/shocker.c
#name_to_handle_at
#open_by_handle_at

# Calls the Docker daemon itself requires

# /snap/docker/VERSION/bin/docker-runc
#   "do not inherit the parent's session keyring"
#   "make session keyring searcheable"
# runC uses this to ensure the container doesn't have access to the host
# keyring
keyctl

# /snap/docker/VERSION/bin/docker-runc
pivot_root

# ptrace can be abused to break out of the seccomp sandbox
# but is required by the Docker daemon.
ptrace

# This list comes from Docker's default seccomp whitelist (which is applied to
#   all containers launched unless a custom profile is specified or
#   "--privileged" is used)
# https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879
# It has been further filtered to exclude certain known-troublesome syscalls.
accept
accept4
access
acct
adjtimex
alarm
arch_prctl
bind
bpf
breakpoint
brk
cacheflush
capget
capset
chdir
chmod
chown
chown32
chroot
clock_getres
clock_getres_time64
clock_gettime
clock_gettime64
clock_nanosleep
clock_nanosleep_time64
clone
close
connect
copy_file_range
creat
dup
dup2
dup3
epoll_create
epoll_create1
epoll_ctl
epoll_ctl_old
epoll_pwait
epoll_wait
epoll_wait_old
eventfd
eventfd2
execve
execveat
exit
exit_group
faccessat
fadvise64
fadvise64_64
fallocate
fanotify_init
fanotify_mark
fchdir
fchmod
fchmodat
fchown
fchown32
fchownat
fcntl
fcntl64
fdatasync
fgetxattr
flistxattr
flock
fork
fremovexattr
fsetxattr
fstat
fstat64
fstatat64
fstatfs
fstatfs64
fsync
ftruncate
ftruncate64
futex
futex_time64
futimesat
getcpu
getcwd
getdents
getdents64
getegid
getegid32
geteuid
geteuid32
getgid
getgid32
getgroups
getgroups32
getitimer
getpeername
getpgid
getpgrp
getpid
getppid
getpriority
getrandom
getresgid
getresgid32
getresuid
getresuid32
getrlimit
get_robust_list
getrusage
getsid
getsockname
getsockopt
get_thread_area
get_tls
gettid
gettimeofday
getuid
getuid32
getxattr
inotify_add_watch
inotify_init
inotify_init1
inotify_rm_watch
io_cancel
ioctl
io_destroy
io_getevents
ioperm
iopl
ioprio_get
ioprio_set
io_setup
io_submit
ipc
kcmp
kill
lchown
lchown32
lgetxattr
link
linkat
listen
listxattr
llistxattr
_llseek
lookup_dcookie
lremovexattr
lseek
lsetxattr
lstat
lstat64
madvise
memfd_create
mincore
mkdir
mkdirat
mknod
mknodat
mlock
mlock2
mlockall
mmap
mmap2
modify_ldt
mount
mprotect
mq_getsetattr
mq_notify
mq_open
mq_timedreceive
mq_timedreceive_time64
mq_timedsend
mq_timedsend_time64
mq_unlink
mremap
msgctl
msgget
msgrcv
msgsnd
msync
munlock
munlockall
munmap
nanosleep
newfstatat
_newselect
open
openat
pause
perf_event_open
personality
pipe
pipe2
poll
ppoll
ppoll_time64
prctl
pread64
preadv
prlimit64
process_vm_readv
process_vm_writev
pselect6
pselect6_time64
pwrite64
pwritev
read
readahead
readlink
readlinkat
readv
reboot
recv
recvfrom
recvmmsg
recvmmsg_time64
recvmsg
remap_file_pages
removexattr
rename
renameat
renameat2
restart_syscall
rmdir
rt_sigaction
rt_sigpending
rt_sigprocmask
rt_sigqueueinfo
rt_sigreturn
rt_sigsuspend
rt_sigtimedwait
rt_sigtimedwait_time64
rt_tgsigqueueinfo
s390_pci_mmio_read
s390_pci_mmio_write
s390_runtime_instr
sched_getaffinity
sched_getattr
sched_getparam
sched_get_priority_max
sched_get_priority_min
sched_getscheduler
sched_rr_get_interval
sched_rr_get_interval_time64
sched_setaffinity
sched_setattr
sched_setparam
sched_setscheduler
sched_yield
seccomp
select
semctl
semget
semop
semtimedop
semtimedop_time64
send
sendfile
sendfile64
sendmmsg
sendmsg
sendto
setdomainname
setfsgid
setfsgid32
setfsuid
setfsuid32
setgid
setgid32
setgroups
setgroups32
sethostname
setitimer
setns
setpgid
setpriority
setregid
setregid32
setresgid
setresgid32
setresuid
setresuid32
setreuid
setreuid32
setrlimit
set_robust_list
setsid
setsockopt
set_thread_area
set_tid_address
settimeofday
set_tls
setuid
setuid32
setxattr
shmat
shmctl
shmdt
shmget
shutdown
sigaltstack
signalfd
signalfd4
sigreturn
socket
socketcall
socketpair
splice
stat
stat64
statfs
statfs64
stime
symlink
symlinkat
sync
sync_file_range
syncfs
sysinfo
syslog
tee
tgkill
time
timer_create
timer_delete
timerfd_create
timerfd_gettime
timerfd_gettime64
timerfd_settime
timerfd_settime64
timer_getoverrun
timer_gettime
timer_gettime64
timer_settime
timer_settime64
times
tkill
truncate
truncate64
ugetrlimit
umask
umount
umount2
uname
unlink
unlinkat
unshare
utime
utimensat
utimensat_time64
utimes
vfork
vhangup
vmsplice
wait4
waitid
waitpid
write
writev
`
   609  
// dockerSupportPrivilegedAppArmor holds the additional AppArmor rules that
// AppArmorConnectedPlug adds when the plug sets "privileged-containers" to
// true. Per the embedded policy comments, these rules give connected snaps
// device ownership.
const dockerSupportPrivilegedAppArmor = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# These rules are here to allow Docker to launch unconfined containers but
# allow the docker daemon itself to go unconfined. Since it runs as root, this
# grants device ownership.
change_profile unsafe /**,
signal (send) peer=unconfined,
ptrace (read, trace) peer=unconfined,

# This grants raw access to device files and thus device ownership
/dev/** mrwkl,
@{PROC}/** mrwkl,

# When kubernetes drives docker/containerd, it creates and runs files in the
# container at arbitrary locations (eg, via pivot_root).
/** rwlix,
`
   630  
// dockerSupportPrivilegedSecComp is the seccomp text that
// SecCompConnectedPlug appends to the base snippet when the plug sets
// "privileged-containers" to true. Its only non-comment line is the
// "@unrestricted" directive.
const dockerSupportPrivilegedSecComp = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# This grants, among other things, kernel module loading and therefore device
# ownership.
@unrestricted
`
   640  
   641  type dockerSupportInterface struct{}
   642  
   643  func (iface *dockerSupportInterface) Name() string {
   644  	return "docker-support"
   645  }
   646  
   647  func (iface *dockerSupportInterface) StaticInfo() interfaces.StaticInfo {
   648  	return interfaces.StaticInfo{
   649  		Summary:              dockerSupportSummary,
   650  		ImplicitOnCore:       true,
   651  		ImplicitOnClassic:    true,
   652  		BaseDeclarationPlugs: dockerSupportBaseDeclarationPlugs,
   653  		BaseDeclarationSlots: dockerSupportBaseDeclarationSlots,
   654  	}
   655  }
   656  
// parserFeatures is a package-level alias for apparmor_sandbox.ParserFeatures.
// NOTE(review): it is not referenced anywhere in this file; presumably it is
// kept as an indirection that other files or tests in the package can
// replace — confirm before removing.
var (
	parserFeatures = apparmor_sandbox.ParserFeatures
)
   660  
// UDevConnectedPlug marks the udev specification of the connected plug as
// controlling the device cgroup by calling spec.SetControlsDeviceCgroup. The
// plug and slot arguments are not consulted. It always returns nil.
func (iface *dockerSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
	spec.SetControlsDeviceCgroup()

	return nil
}
   666  
   667  func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   668  	// https://kubernetes.io/docs/setup/production-environment/container-runtimes/
   669  	if err := spec.AddModule("overlay"); err != nil {
   670  		return err
   671  	}
   672  	return nil
   673  }
   674  
   675  func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   676  	var privileged bool
   677  	_ = plug.Attr("privileged-containers", &privileged)
   678  
   679  	// The 'change_profile unsafe' rules conflict with the 'ix' rules in
   680  	// the home interface, so suppress them (LP: #1797786)
   681  	spec.SetSuppressHomeIx()
   682  	spec.AddSnippet(dockerSupportConnectedPlugAppArmor)
   683  	if privileged {
   684  		spec.AddSnippet(dockerSupportPrivilegedAppArmor)
   685  	}
   686  	if !release.OnClassic {
   687  		spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore)
   688  	}
   689  	spec.SetUsesPtraceTrace()
   690  	return nil
   691  }
   692  
   693  func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   694  	var privileged bool
   695  	_ = plug.Attr("privileged-containers", &privileged)
   696  	snippet := dockerSupportConnectedPlugSecComp
   697  	if privileged {
   698  		snippet += dockerSupportPrivilegedSecComp
   699  	}
   700  	spec.AddSnippet(snippet)
   701  	return nil
   702  }
   703  
   704  func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   705  	if v, ok := plug.Attrs["privileged-containers"]; ok {
   706  		if _, ok = v.(bool); !ok {
   707  			return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'")
   708  		}
   709  	}
   710  	return nil
   711  }
   712  
// AutoConnect unconditionally returns true and ignores both arguments, so
// this method adds no restriction of its own on auto-connection.
func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool {
	// allow what declarations allowed
	return true
}
   717  
// init registers a dockerSupportInterface instance through registerIface
// when the package is initialised.
func init() {
	registerIface(&dockerSupportInterface{})
}