github.com/chipaca/snappy@v0.0.0-20210104084008-1f06296fe8ad/interfaces/builtin/docker_support.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2016-2018 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package builtin
    21  
    22  import (
    23  	"fmt"
    24  
    25  	"github.com/snapcore/snapd/interfaces"
    26  	"github.com/snapcore/snapd/interfaces/apparmor"
    27  	"github.com/snapcore/snapd/interfaces/kmod"
    28  	"github.com/snapcore/snapd/interfaces/seccomp"
    29  	"github.com/snapcore/snapd/interfaces/udev"
    30  	"github.com/snapcore/snapd/release"
    31  	apparmor_sandbox "github.com/snapcore/snapd/sandbox/apparmor"
    32  	"github.com/snapcore/snapd/snap"
    33  )
    34  
// dockerSupportSummary is the one-line description of the docker-support
// interface; StaticInfo reports it as the interface summary.
const dockerSupportSummary = `allows operating as the Docker daemon`
    36  
// dockerSupportBaseDeclarationPlugs is the base-declaration fragment for the
// plug side of docker-support, handed out through StaticInfo. It sets
// allow-installation to false and deny-auto-connection to true.
const dockerSupportBaseDeclarationPlugs = `
  docker-support:
    allow-installation: false
    deny-auto-connection: true
`
    42  
// dockerSupportBaseDeclarationSlots is the base-declaration fragment for the
// slot side of docker-support, handed out through StaticInfo. It limits
// allow-installation to slot-snap-type "core" and sets deny-auto-connection
// to true.
const dockerSupportBaseDeclarationSlots = `
  docker-support:
    allow-installation:
      slot-snap-type:
        - core
    deny-auto-connection: true
`
    50  
// dockerSupportConnectedPlugAppArmorCore holds extra AppArmor rules that
// AppArmorConnectedPlug adds only when release.OnClassic is false. They grant
// rwl access to the snap's common and per-revision directories as seen under
// /system-data/var/snap; the rationale is given in the snippet's own comment.
const dockerSupportConnectedPlugAppArmorCore = `
# These accesses are necessary for Ubuntu Core 16 and 18, likely due to the
# version of apparmor or the kernel which doesn't resolve the upper layer of an
# overlayfs mount correctly the accesses show up as runc trying to read from
# /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl,
/system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl,
`
    59  
// dockerSupportConnectedPlugAppArmor is the baseline AppArmor snippet that
// AppArmorConnectedPlug always adds for a connected docker-support plug. As
// its leading description states, the policy is intentionally permissive and
// guards against programming errors rather than providing confinement. The
// text is emitted verbatim, so its embedded '#' comments are part of the
// string and must not be edited casually.
const dockerSupportConnectedPlugAppArmor = `
# Description: allow operating as the Docker daemon/containerd. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

#include <abstractions/dbus-strict>

# Allow sockets/etc for docker
/{,var/}run/docker.sock rw,
/{,var/}run/docker/     rw,
/{,var/}run/docker/**   mrwklix,
/{,var/}run/runc/       rw,
/{,var/}run/runc/**     mrwklix,

# Allow sockets/etc for containerd
/{,var/}run/containerd/{,s/,runc/,runc/k8s.io/,runc/k8s.io/*/} rw,
/{,var/}run/containerd/runc/k8s.io/*/** rwk,
/{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw,
/{,var/}run/containerd/io.containerd*/*/** rwk,
/{,var/}run/containerd/s/** rwk,

# Limit ipam-state to k8s
/run/ipam-state/k8s-** rw,
/run/ipam-state/k8s-*/lock k,

# Socket for docker-containerd-shim
unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00",

/{,var/}run/mount/utab r,

# Wide read access to /proc, but somewhat limited writes for now
@{PROC}/ r,
@{PROC}/** r,
@{PROC}/[0-9]*/attr/{,apparmor/}exec w,
@{PROC}/[0-9]*/oom_score_adj w,

# Limited read access to specific bits of /sys
/sys/kernel/mm/hugepages/ r,
/sys/kernel/mm/transparent_hugepage/{,**} r,
/sys/fs/cgroup/cpuset/cpuset.cpus r,
/sys/fs/cgroup/cpuset/cpuset.mems r,
/sys/module/apparmor/parameters/enabled r,

# Limit cgroup writes a bit (Docker uses a "docker" sub-group)
/sys/fs/cgroup/*/docker/   rw,
/sys/fs/cgroup/*/docker/** rw,

# Also allow cgroup writes to kubernetes pods
/sys/fs/cgroup/*/kubepods/ rw,
/sys/fs/cgroup/*/kubepods/** rw,

# containerd can also be configured to use the systemd cgroup driver via
# plugins.cri.systemd_cgroup = true which moves container processes into
# systemd-managed cgroups. This is now the recommended configuration since it
# provides a single cgroup manager (systemd) in an effort to achieve consistent
# views of resources.
/sys/fs/cgroup/*/systemd/{,system.slice/} rw,          # create missing dirs
/sys/fs/cgroup/*/systemd/system.slice/** r,
/sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w,

# Allow tracing ourself (especially the "runc" process we create)
ptrace (trace) peer=@{profile_name},

# Docker needs a lot of caps, but limits them in the app container
capability,

# Docker does all kinds of mounts all over the filesystem
/dev/mapper/control rw,
/dev/mapper/docker* rw,
/dev/loop-control r,
/dev/loop[0-9]* rw,
/sys/devices/virtual/block/dm-[0-9]*/** r,
mount,
umount,

# After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN,
# Docker removes the leftover /.pivot_rootNNNNNN directory (which is now
# relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root)
pivot_root,
/.pivot_root[0-9]*/ rw,

# file descriptors (/proc/NNN/fd/X)
# file descriptors in the container show up here due to attach_disconnected
/[0-9]* rw,

# Docker needs to be able to create and load the profile it applies to
# containers ("docker-default")
/sbin/apparmor_parser ixr,
/etc/apparmor.d/cache/ r,            # apparmor 2.12 and below
/etc/apparmor.d/cache/.features r,
/etc/apparmor.d/{,cache/}docker* rw,
/var/cache/apparmor/{,*/} r,         # apparmor 2.13 and higher
/var/cache/apparmor/*/.features r,
/var/cache/apparmor/*/docker* rw,
/etc/apparmor.d/tunables/{,**} r,
/etc/apparmor.d/abstractions/{,**} r,
/etc/apparmor/parser.conf r,
/etc/apparmor/subdomain.conf r,
/sys/kernel/security/apparmor/.replace rw,
/sys/kernel/security/apparmor/{,**} r,

# use 'privileged-containers: true' to support --security-opts

# defaults for docker-default
change_profile unsafe /** -> docker-default,
signal (send) peer=docker-default,
ptrace (read, trace) peer=docker-default,

# defaults for containerd
change_profile unsafe /** -> cri-containerd.apparmor.d,
signal (send) peer=cri-containerd.apparmor.d,
ptrace (read, trace) peer=cri-containerd.apparmor.d,

# Graph (storage) driver bits
/{dev,run}/shm/aufs.xino mrw,
/proc/fs/aufs/plink_maint w,
/sys/fs/aufs/** r,

#cf bug 1502785
/ r,

# recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx
# and so to allow allocating a new shell we need this
/dev/pts/ptmx rw,

# needed by runc for mitigation of CVE-2019-5736
# For details see https://bugs.launchpad.net/apparmor/+bug/1820344
/ ix,
/bin/runc ixr,

/pause ixr,
/bin/busybox ixr,

# When kubernetes drives containerd, containerd needs access to CNI services,
# like flanneld's subnet.env for DNS. This would ideally be snap-specific (it
# could if the control plane was a snap), but in deployments where the control
# plane is not a snap, it will tell flannel to use this path.
/run/flannel/{,**} rk,

# When kubernetes drives containerd, containerd needs access to various
# secrets for the pods which are overlayed at /run/secrets/....
# This would ideally be snap-specific (it could if the control plane was a
# snap), but in deployments where the control plane is not a snap, it will tell
# containerd to use this path for various account information for pods.
/run/secrets/kubernetes.io/{,**} rk,
`
   208  
// dockerSupportConnectedPlugSecComp is the baseline seccomp snippet that
// SecCompConnectedPlug always includes for a connected docker-support plug.
// Each non-comment line names one allowed syscall. The module-loading and
// file-handle syscalls near the top are listed but commented out, for the
// reasons given next to them. The text is emitted verbatim, so its embedded
// '#' comments are part of the string.
const dockerSupportConnectedPlugSecComp = `
# Description: allow operating as the Docker daemon. This policy is
# intentionally not restrictive and is here to help guard against programming
# errors and not for security confinement. The Docker daemon by design requires
# extensive access to the system and cannot be effectively confined against
# malicious activity.

# Because seccomp may only go more strict, we must allow all syscalls to Docker
# that it expects to give to containers in addition to what it needs to run and
# trust that docker daemon # only gives out reasonable syscalls to containers.

# Docker includes these in the default container whitelist, but they're
# potentially dangerous.
#finit_module
#init_module
#query_module
#delete_module

# These have a history of vulnerabilities, are not widely used, and
# open_by_handle_at has been used to break out of Docker containers by brute
# forcing the handle value: http://stealth.openwall.net/xSports/shocker.c
#name_to_handle_at
#open_by_handle_at

# Calls the Docker daemon itself requires

# /snap/docker/VERSION/bin/docker-runc
#   "do not inherit the parent's session keyring"
#   "make session keyring searcheable"
# runC uses this to ensure the container doesn't have access to the host
# keyring
keyctl

# /snap/docker/VERSION/bin/docker-runc
pivot_root

# ptrace can be abused to break out of the seccomp sandbox
# but is required by the Docker daemon.
ptrace

# This list comes from Docker's default seccomp whitelist (which is applied to
#   all containers launched unless a custom profile is specified or
#   "--privileged" is used)
# https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879
# It has been further filtered to exclude certain known-troublesome syscalls.
accept
accept4
access
acct
adjtimex
alarm
arch_prctl
bind
bpf
breakpoint
brk
cacheflush
capget
capset
chdir
chmod
chown
chown32
chroot
clock_getres
clock_getres_time64
clock_gettime
clock_gettime64
clock_nanosleep
clock_nanosleep_time64
clone
close
connect
copy_file_range
creat
dup
dup2
dup3
epoll_create
epoll_create1
epoll_ctl
epoll_ctl_old
epoll_pwait
epoll_wait
epoll_wait_old
eventfd
eventfd2
execve
execveat
exit
exit_group
faccessat
fadvise64
fadvise64_64
fallocate
fanotify_init
fanotify_mark
fchdir
fchmod
fchmodat
fchown
fchown32
fchownat
fcntl
fcntl64
fdatasync
fgetxattr
flistxattr
flock
fork
fremovexattr
fsetxattr
fstat
fstat64
fstatat64
fstatfs
fstatfs64
fsync
ftruncate
ftruncate64
futex
futex_time64
futimesat
getcpu
getcwd
getdents
getdents64
getegid
getegid32
geteuid
geteuid32
getgid
getgid32
getgroups
getgroups32
getitimer
getpeername
getpgid
getpgrp
getpid
getppid
getpriority
getrandom
getresgid
getresgid32
getresuid
getresuid32
getrlimit
get_robust_list
getrusage
getsid
getsockname
getsockopt
get_thread_area
get_tls
gettid
gettimeofday
getuid
getuid32
getxattr
inotify_add_watch
inotify_init
inotify_init1
inotify_rm_watch
io_cancel
ioctl
io_destroy
io_getevents
ioperm
iopl
ioprio_get
ioprio_set
io_setup
io_submit
ipc
kcmp
kill
lchown
lchown32
lgetxattr
link
linkat
listen
listxattr
llistxattr
_llseek
lookup_dcookie
lremovexattr
lseek
lsetxattr
lstat
lstat64
madvise
memfd_create
mincore
mkdir
mkdirat
mknod
mknodat
mlock
mlock2
mlockall
mmap
mmap2
modify_ldt
mount
mprotect
mq_getsetattr
mq_notify
mq_open
mq_timedreceive
mq_timedreceive_time64
mq_timedsend
mq_timedsend_time64
mq_unlink
mremap
msgctl
msgget
msgrcv
msgsnd
msync
munlock
munlockall
munmap
nanosleep
newfstatat
_newselect
open
openat
pause
perf_event_open
personality
pipe
pipe2
poll
ppoll
ppoll_time64
prctl
pread64
preadv
prlimit64
process_vm_readv
process_vm_writev
pselect6
pselect6_time64
pwrite64
pwritev
read
readahead
readlink
readlinkat
readv
reboot
recv
recvfrom
recvmmsg
recvmmsg_time64
recvmsg
remap_file_pages
removexattr
rename
renameat
renameat2
restart_syscall
rmdir
rt_sigaction
rt_sigpending
rt_sigprocmask
rt_sigqueueinfo
rt_sigreturn
rt_sigsuspend
rt_sigtimedwait
rt_sigtimedwait_time64
rt_tgsigqueueinfo
s390_pci_mmio_read
s390_pci_mmio_write
s390_runtime_instr
sched_getaffinity
sched_getattr
sched_getparam
sched_get_priority_max
sched_get_priority_min
sched_getscheduler
sched_rr_get_interval
sched_rr_get_interval_time64
sched_setaffinity
sched_setattr
sched_setparam
sched_setscheduler
sched_yield
seccomp
select
semctl
semget
semop
semtimedop
semtimedop_time64
send
sendfile
sendfile64
sendmmsg
sendmsg
sendto
setdomainname
setfsgid
setfsgid32
setfsuid
setfsuid32
setgid
setgid32
setgroups
setgroups32
sethostname
setitimer
setns
setpgid
setpriority
setregid
setregid32
setresgid
setresgid32
setresuid
setresuid32
setreuid
setreuid32
setrlimit
set_robust_list
setsid
setsockopt
set_thread_area
set_tid_address
settimeofday
set_tls
setuid
setuid32
setxattr
shmat
shmctl
shmdt
shmget
shutdown
sigaltstack
signalfd
signalfd4
sigreturn
socket
socketcall
socketpair
splice
stat
stat64
statfs
statfs64
stime
symlink
symlinkat
sync
sync_file_range
syncfs
sysinfo
syslog
tee
tgkill
time
timer_create
timer_delete
timerfd_create
timerfd_gettime
timerfd_gettime64
timerfd_settime
timerfd_settime64
timer_getoverrun
timer_gettime
timer_gettime64
timer_settime
timer_settime64
times
tkill
truncate
truncate64
ugetrlimit
umask
umount
umount2
uname
unlink
unlinkat
unshare
utime
utimensat
utimensat_time64
utimes
vfork
vhangup
vmsplice
wait4
waitid
waitpid
write
writev
`
   610  
// dockerSupportPrivilegedAppArmor is the additional AppArmor snippet that
// AppArmorConnectedPlug adds only when the plug's "privileged-containers"
// attribute is true. Per its own description it grants access to all system
// resources, and therefore device ownership, to connected snaps.
const dockerSupportPrivilegedAppArmor = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# These rules are here to allow Docker to launch unconfined containers but
# allow the docker daemon itself to go unconfined. Since it runs as root, this
# grants device ownership.
change_profile unsafe /**,
signal (send) peer=unconfined,
ptrace (read, trace) peer=unconfined,

# This grants raw access to device files and thus device ownership
/dev/** mrwkl,
@{PROC}/** mrwkl,

# When kubernetes drives docker/containerd, it creates and runs files in the
# container at arbitrary locations (eg, via pivot_root).
/** rwlix,
`
   631  
// dockerSupportPrivilegedSecComp is the additional seccomp snippet that
// SecCompConnectedPlug appends to the baseline only when the plug's
// "privileged-containers" attribute is true. Its sole directive is
// @unrestricted.
const dockerSupportPrivilegedSecComp = `
# Description: allow docker daemon to run privileged containers. This gives
# full access to all resources on the system and thus gives device ownership to
# connected snaps.

# This grants, among other things, kernel module loading and therefore device
# ownership.
@unrestricted
`
   641  
// dockerSupportInterface implements the "docker-support" interface. The type
// carries no state; all behavior lives in its methods.
type dockerSupportInterface struct{}
   643  
   644  func (iface *dockerSupportInterface) Name() string {
   645  	return "docker-support"
   646  }
   647  
   648  func (iface *dockerSupportInterface) StaticInfo() interfaces.StaticInfo {
   649  	return interfaces.StaticInfo{
   650  		Summary:              dockerSupportSummary,
   651  		ImplicitOnCore:       true,
   652  		ImplicitOnClassic:    true,
   653  		BaseDeclarationPlugs: dockerSupportBaseDeclarationPlugs,
   654  		BaseDeclarationSlots: dockerSupportBaseDeclarationSlots,
   655  	}
   656  }
   657  
   658  var (
   659  	parserFeatures = apparmor_sandbox.ParserFeatures
   660  )
   661  
   662  func (iface *dockerSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   663  	spec.SetControlsDeviceCgroup()
   664  
   665  	return nil
   666  }
   667  
   668  func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   669  	// https://kubernetes.io/docs/setup/production-environment/container-runtimes/
   670  	if err := spec.AddModule("overlay"); err != nil {
   671  		return err
   672  	}
   673  	return nil
   674  }
   675  
   676  func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   677  	var privileged bool
   678  	_ = plug.Attr("privileged-containers", &privileged)
   679  
   680  	// The 'change_profile unsafe' rules conflict with the 'ix' rules in
   681  	// the home interface, so suppress them (LP: #1797786)
   682  	spec.SetSuppressHomeIx()
   683  	spec.AddSnippet(dockerSupportConnectedPlugAppArmor)
   684  	if privileged {
   685  		spec.AddSnippet(dockerSupportPrivilegedAppArmor)
   686  	}
   687  	if !release.OnClassic {
   688  		spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore)
   689  	}
   690  	spec.SetUsesPtraceTrace()
   691  	return nil
   692  }
   693  
   694  func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error {
   695  	var privileged bool
   696  	_ = plug.Attr("privileged-containers", &privileged)
   697  	snippet := dockerSupportConnectedPlugSecComp
   698  	if privileged {
   699  		snippet += dockerSupportPrivilegedSecComp
   700  	}
   701  	spec.AddSnippet(snippet)
   702  	return nil
   703  }
   704  
   705  func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error {
   706  	if v, ok := plug.Attrs["privileged-containers"]; ok {
   707  		if _, ok = v.(bool); !ok {
   708  			return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'")
   709  		}
   710  	}
   711  	return nil
   712  }
   713  
   714  func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool {
   715  	// allow what declarations allowed
   716  	return true
   717  }
   718  
   719  func init() {
   720  	registerIface(&dockerSupportInterface{})
   721  }