github.com/bugraaydogar/snapd@v0.0.0-20210315170335-8c70bb858939/interfaces/builtin/docker_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  
    25  	"github.com/snapcore/snapd/interfaces"
    26  	"github.com/snapcore/snapd/interfaces/apparmor"
    27  	"github.com/snapcore/snapd/interfaces/kmod"
    28  	"github.com/snapcore/snapd/interfaces/seccomp"
    29  	"github.com/snapcore/snapd/release"
    30  	"github.com/snapcore/snapd/snap"
    31  )
    32  
// dockerSupportSummary is the one-line description of the docker-support
// interface; init registers it as the interface summary.
const dockerSupportSummary = `allows operating as the Docker daemon`
    34  
// dockerSupportBaseDeclarationPlugs is the plug-side base-declaration fragment
// for docker-support: installation is not allowed and auto-connection is
// denied. init registers it as baseDeclarationPlugs.
const dockerSupportBaseDeclarationPlugs = `
  docker-support:
    allow-installation: false
    deny-auto-connection: true
`
    40  
// dockerSupportBaseDeclarationSlots is the slot-side base-declaration fragment
// for docker-support: installation is allowed only for slot snaps of type
// "core" and auto-connection is denied. init registers it as
// baseDeclarationSlots.
const dockerSupportBaseDeclarationSlots = `
  docker-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    48  
// dockerSupportConnectedPlugAppArmorCore holds extra AppArmor rules granting
// rwl access to the snap's data directories under /system-data.
// AppArmorConnectedPlug adds it only when release.OnClassic is false.
const dockerSupportConnectedPlugAppArmorCore = `
# These accesses are necessary for Ubuntu Core 16 and 18, likely due to the
# version of apparmor or the kernel which doesn't resolve the upper layer of an
# overlayfs mount correctly the accesses show up as runc trying to read from
# /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl,
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl,
`
    57  
// dockerSupportConnectedPlugAppArmor is the base AppArmor snippet for a
// connected docker-support plug. AppArmorConnectedPlug always adds it,
// regardless of the 'privileged-containers' attribute. As the policy text
// itself states, it is intentionally permissive and is not meant as security
// confinement.
const dockerSupportConnectedPlugAppArmor = `
# Description: allow operating as the Docker daemon/containerd. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

#include <abstractions/dbus-strict>

# Allow sockets/etc for docker
/{,var/}run/docker.sock rw,
/{,var/}run/docker/     rw,
/{,var/}run/docker/**   mrwklix,
/{,var/}run/runc/       rw,
/{,var/}run/runc/**     mrwklix,

# Allow sockets/etc for containerd
/{,var/}run/containerd/{,s/,runc/,runc/k8s.io/,runc/k8s.io/*/} rw,
/{,var/}run/containerd/runc/k8s.io/*/** rwk,
/{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw,
/{,var/}run/containerd/io.containerd*/*/** rwk,
/{,var/}run/containerd/s/** rwk,

# Limit ipam-state to k8s
/run/ipam-state/k8s-** rw,
/run/ipam-state/k8s-*/lock k,

# Socket for docker-containerd-shim
unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00",

/{,var/}run/mount/utab r,

# Wide read access to /proc, but somewhat limited writes for now
@{PROC}/ r,
@{PROC}/** r,
@{PROC}/[0-9]*/attr/{,apparmor/}exec w,
@{PROC}/[0-9]*/oom_score_adj w,

# Limited read access to specific bits of /sys
/sys/kernel/mm/hugepages/ r,
/sys/kernel/mm/transparent_hugepage/{,**} r,
/sys/fs/cgroup/cpuset/cpuset.cpus r,
/sys/fs/cgroup/cpuset/cpuset.mems r,
/sys/module/apparmor/parameters/enabled r,

# Limit cgroup writes a bit (Docker uses a "docker" sub-group)
/sys/fs/cgroup/*/docker/   rw,
/sys/fs/cgroup/*/docker/** rw,

# Also allow cgroup writes to kubernetes pods
/sys/fs/cgroup/*/kubepods/ rw,
/sys/fs/cgroup/*/kubepods/** rw,

# containerd can also be configured to use the systemd cgroup driver via
# plugins.cri.systemd_cgroup = true which moves container processes into
# systemd-managed cgroups. This is now the recommended configuration since it
# provides a single cgroup manager (systemd) in an effort to achieve consistent
# views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing ourself (especially the "runc" process we create)
ptrace (trace) peer=@{profile_name},

# Docker needs a lot of caps, but limits them in the app container
capability,

# Docker does all kinds of mounts all over the filesystem
/dev/mapper/control rw,
/dev/mapper/docker* rw,
/dev/loop-control r,
/dev/loop[0-9]* rw,
/sys/devices/virtual/block/dm-[0-9]*/** r,
mount,
umount,

# After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN,
# Docker removes the leftover /.pivot_rootNNNNNN directory (which is now
# relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root)
pivot_root,
/.pivot_root[0-9]*/ rw,

# file descriptors (/proc/NNN/fd/X)
# file descriptors in the container show up here due to attach_disconnected
/[0-9]* rw,

# Docker needs to be able to create and load the profile it applies to
# containers ("docker-default")
/sbin/apparmor_parser ixr,
/etc/apparmor.d/cache/ r,            # apparmor 2.12 and below
/etc/apparmor.d/cache/.features r,
/etc/apparmor.d/{,cache/}docker* rw,
/var/cache/apparmor/{,*/} r,         # apparmor 2.13 and higher
/var/cache/apparmor/*/.features r,
/var/cache/apparmor/*/docker* rw,
/etc/apparmor.d/tunables/{,**} r,
/etc/apparmor.d/abstractions/{,**} r,
/etc/apparmor/parser.conf r,
/etc/apparmor/subdomain.conf r,
/sys/kernel/security/apparmor/.replace rw,
/sys/kernel/security/apparmor/{,**} r,

# use 'privileged-containers: true' to support --security-opts

# defaults for docker-default
change_profile unsafe /** -> docker-default,
signal (send) peer=docker-default,
ptrace (read, trace) peer=docker-default,

# defaults for containerd
change_profile unsafe /** -> cri-containerd.apparmor.d,
signal (send) peer=cri-containerd.apparmor.d,
ptrace (read, trace) peer=cri-containerd.apparmor.d,

# Graph (storage) driver bits
/{dev,run}/shm/aufs.xino mrw,
/proc/fs/aufs/plink_maint w,
/sys/fs/aufs/** r,

#cf bug 1502785
/ r,

# recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx
# and so to allow allocating a new shell we need this
/dev/pts/ptmx rw,

# needed by runc for mitigation of CVE-2019-5736
# For details see https://bugs.launchpad.net/apparmor/+bug/1820344
/ ix,
/bin/runc ixr,

/pause ixr,
/bin/busybox ixr,

# When kubernetes drives containerd, containerd needs access to CNI services,
# like flanneld's subnet.env for DNS. This would ideally be snap-specific (it
# could if the control plane was a snap), but in deployments where the control
# plane is not a snap, it will tell flannel to use this path.
/run/flannel/{,**} rk,

# When kubernetes drives containerd, containerd needs access to various
# secrets for the pods which are overlayed at /run/secrets/....
# This would ideally be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# containerd to use this path for various account information for pods.
/run/secrets/kubernetes.io/{,**} rk,
`
   206  
// dockerSupportConnectedPlugSecComp is the base seccomp snippet for a
// connected docker-support plug: a list of allowed syscall names, one per
// line. SecCompConnectedPlug always includes it and appends
// dockerSupportPrivilegedSecComp when 'privileged-containers' is true.
const dockerSupportConnectedPlugSecComp = `
# Description: allow operating as the Docker daemon. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

# Because seccomp may only go more strict, we must allow all syscalls to Docker
# that it expects to give to containers in addition to what it needs to run and
# trust that docker daemon # only gives out reasonable syscalls to containers.

# Docker includes these in the default container whitelist, but they're
# potentially dangerous.
#finit_module
#init_module
#query_module
#delete_module

# These have a history of vulnerabilities, are not widely used, and
# open_by_handle_at has been used to break out of Docker containers by brute
# forcing the handle value: http://stealth.openwall.net/xSports/shocker.c
#name_to_handle_at
#open_by_handle_at

# Calls the Docker daemon itself requires

# /snap/docker/VERSION/bin/docker-runc
#   "do not inherit the parent's session keyring"
#   "make session keyring searcheable"
# runC uses this to ensure the container doesn't have access to the host
# keyring
keyctl

# /snap/docker/VERSION/bin/docker-runc
pivot_root

# ptrace can be abused to break out of the seccomp sandbox
# but is required by the Docker daemon.
ptrace

# This list comes from Docker's default seccomp whitelist (which is applied to
#   all containers launched unless a custom profile is specified or
#   "--privileged" is used)
# https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879
# It has been further filtered to exclude certain known-troublesome syscalls.
accept
accept4
access
acct
adjtimex
alarm
arch_prctl
bind
bpf
breakpoint
brk
cacheflush
capget
capset
chdir
chmod
chown
chown32
chroot
clock_getres
clock_getres_time64
clock_gettime
clock_gettime64
clock_nanosleep
clock_nanosleep_time64
clone
close
connect
copy_file_range
creat
dup
dup2
dup3
epoll_create
epoll_create1
epoll_ctl
epoll_ctl_old
epoll_pwait
epoll_wait
epoll_wait_old
eventfd
eventfd2
execve
execveat
exit
exit_group
faccessat
fadvise64
fadvise64_64
fallocate
fanotify_init
fanotify_mark
fchdir
fchmod
fchmodat
fchown
fchown32
fchownat
fcntl
fcntl64
fdatasync
fgetxattr
flistxattr
flock
fork
fremovexattr
fsetxattr
fstat
fstat64
fstatat64
fstatfs
fstatfs64
fsync
ftruncate
ftruncate64
futex
futex_time64
futimesat
getcpu
getcwd
getdents
getdents64
getegid
getegid32
geteuid
geteuid32
getgid
getgid32
getgroups
getgroups32
getitimer
getpeername
getpgid
getpgrp
getpid
getppid
getpriority
getrandom
getresgid
getresgid32
getresuid
getresuid32
getrlimit
get_robust_list
getrusage
getsid
getsockname
getsockopt
get_thread_area
get_tls
gettid
gettimeofday
getuid
getuid32
getxattr
inotify_add_watch
inotify_init
inotify_init1
inotify_rm_watch
io_cancel
ioctl
io_destroy
io_getevents
ioperm
iopl
ioprio_get
ioprio_set
io_setup
io_submit
ipc
kcmp
kill
lchown
lchown32
lgetxattr
link
linkat
listen
listxattr
llistxattr
_llseek
lookup_dcookie
lremovexattr
lseek
lsetxattr
lstat
lstat64
madvise
memfd_create
mincore
mkdir
mkdirat
mknod
mknodat
mlock
mlock2
mlockall
mmap
mmap2
modify_ldt
mount
mprotect
mq_getsetattr
mq_notify
mq_open
mq_timedreceive
mq_timedreceive_time64
mq_timedsend
mq_timedsend_time64
mq_unlink
mremap
msgctl
msgget
msgrcv
msgsnd
msync
munlock
munlockall
munmap
nanosleep
newfstatat
_newselect
open
openat
pause
perf_event_open
personality
pipe
pipe2
poll
ppoll
ppoll_time64
prctl
pread64
preadv
prlimit64
process_vm_readv
process_vm_writev
pselect6
pselect6_time64
pwrite64
pwritev
read
readahead
readlink
readlinkat
readv
reboot
recv
recvfrom
recvmmsg
recvmmsg_time64
recvmsg
remap_file_pages
removexattr
rename
renameat
renameat2
restart_syscall
rmdir
rt_sigaction
rt_sigpending
rt_sigprocmask
rt_sigqueueinfo
rt_sigreturn
rt_sigsuspend
rt_sigtimedwait
rt_sigtimedwait_time64
rt_tgsigqueueinfo
s390_pci_mmio_read
s390_pci_mmio_write
s390_runtime_instr
sched_getaffinity
sched_getattr
sched_getparam
sched_get_priority_max
sched_get_priority_min
sched_getscheduler
sched_rr_get_interval
sched_rr_get_interval_time64
sched_setaffinity
sched_setattr
sched_setparam
sched_setscheduler
sched_yield
seccomp
select
semctl
semget
semop
semtimedop
semtimedop_time64
send
sendfile
sendfile64
sendmmsg
sendmsg
sendto
setdomainname
setfsgid
setfsgid32
setfsuid
setfsuid32
setgid
setgid32
setgroups
setgroups32
sethostname
setitimer
setns
setpgid
setpriority
setregid
setregid32
setresgid
setresgid32
setresuid
setresuid32
setreuid
setreuid32
setrlimit
set_robust_list
setsid
setsockopt
set_thread_area
set_tid_address
settimeofday
set_tls
setuid
setuid32
setxattr
shmat
shmctl
shmdt
shmget
shutdown
sigaltstack
signalfd
signalfd4
sigreturn
socket
socketcall
socketpair
splice
stat
stat64
statfs
statfs64
stime
symlink
symlinkat
sync
sync_file_range
syncfs
sysinfo
syslog
tee
tgkill
time
timer_create
timer_delete
timerfd_create
timerfd_gettime
timerfd_gettime64
timerfd_settime
timerfd_settime64
timer_getoverrun
timer_gettime
timer_gettime64
timer_settime
timer_settime64
times
tkill
truncate
truncate64
ugetrlimit
umask
umount
umount2
uname
unlink
unlinkat
unshare
utime
utimensat
utimensat_time64
utimes
vfork
vhangup
vmsplice
wait4
waitid
waitpid
write
writev
`
   608  
// dockerSupportPrivilegedAppArmor holds the additional AppArmor rules for
// running privileged containers. AppArmorConnectedPlug adds it only when the
// plug's 'privileged-containers' attribute is true. As the policy text notes,
// these rules effectively grant device ownership.
const dockerSupportPrivilegedAppArmor = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# These rules are here to allow Docker to launch unconfined containers but
# allow the docker daemon itself to go unconfined. Since it runs as root, this
# grants device ownership.
change_profile unsafe /**,
signal (send) peer=unconfined,
ptrace (read, trace) peer=unconfined,

# This grants raw access to device files and thus device ownership
/dev/** mrwkl,
@{PROC}/** mrwkl,

# When kubernetes drives docker/containerd, it creates and runs files in the
# container at arbitrary locations (eg, via pivot_root).
/** rwlix,
`
   629  
// dockerSupportPrivilegedSecComp is the seccomp addition for privileged
// containers; it consists of the @unrestricted directive. SecCompConnectedPlug
// appends it to the base snippet only when the plug's 'privileged-containers'
// attribute is true.
const dockerSupportPrivilegedSecComp = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# This grants, among other things, kernel module loading and therefore device
# ownership.
@unrestricted
`
   639  
// dockerSupportServiceSnippet is the only entry in the serviceSnippets list
// that init registers for docker-support.
// NOTE(review): presumably a systemd unit directive (cgroup delegation) that
// ends up in the snap's service units — confirm against commonInterface.
const dockerSupportServiceSnippet = `Delegate=true`
   641  
// dockerSupportInterface is the docker-support interface type. It embeds
// commonInterface and adds the kmod, AppArmor, seccomp, plug-validation and
// auto-connect methods defined in this file.
type dockerSupportInterface struct {
	commonInterface
}
   645  
   646  func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   647  	// https://kubernetes.io/docs/setup/production-environment/container-runtimes/
   648  	if err := spec.AddModule("overlay"); err != nil {
   649  		return err
   650  	}
   651  	return nil
   652  }
   653  
   654  func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   655  	var privileged bool
   656  	_ = plug.Attr("privileged-containers", &privileged)
   657  
   658  	// The 'change_profile unsafe' rules conflict with the 'ix' rules in
   659  	// the home interface, so suppress them (LP: #1797786)
   660  	spec.SetSuppressHomeIx()
   661  	spec.AddSnippet(dockerSupportConnectedPlugAppArmor)
   662  	if privileged {
   663  		spec.AddSnippet(dockerSupportPrivilegedAppArmor)
   664  	}
   665  	if !release.OnClassic {
   666  		spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore)
   667  	}
   668  	spec.SetUsesPtraceTrace()
   669  	return nil
   670  }
   671  
   672  func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   673  	var privileged bool
   674  	_ = plug.Attr("privileged-containers", &privileged)
   675  	snippet := dockerSupportConnectedPlugSecComp
   676  	if privileged {
   677  		snippet += dockerSupportPrivilegedSecComp
   678  	}
   679  	spec.AddSnippet(snippet)
   680  	return nil
   681  }
   682  
   683  func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   684  	if v, ok := plug.Attrs["privileged-containers"]; ok {
   685  		if _, ok = v.(bool); !ok {
   686  			return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'")
   687  		}
   688  	}
   689  	return nil
   690  }
   691  
   692  func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool {
   693  	// allow what declarations allowed
   694  	return true
   695  }
   696  
   697  func init() {
   698  	registerIface(&dockerSupportInterface{commonInterface{
   699  		name:                 "docker-support",
   700  		summary:              dockerSupportSummary,
   701  		implicitOnCore:       true,
   702  		implicitOnClassic:    true,
   703  		baseDeclarationPlugs: dockerSupportBaseDeclarationPlugs,
   704  		baseDeclarationSlots: dockerSupportBaseDeclarationSlots,
   705  		controlsDeviceCgroup: true,
   706  		serviceSnippets:      []string{dockerSupportServiceSnippet},
   707  		// docker-support also uses ptrace(trace), but it already declares this in
   708  		// the AppArmorConnectedPlug method
   709  	}})
   710  }