github.com/ubuntu-core/snappy@v0.0.0-20210827154228-9e584df982bb/interfaces/builtin/docker_support.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2016-2018 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package builtin 21 22 import ( 23 "fmt" 24 25 "github.com/snapcore/snapd/interfaces" 26 "github.com/snapcore/snapd/interfaces/apparmor" 27 "github.com/snapcore/snapd/interfaces/kmod" 28 "github.com/snapcore/snapd/interfaces/seccomp" 29 "github.com/snapcore/snapd/release" 30 "github.com/snapcore/snapd/snap" 31 ) 32 33 const dockerSupportSummary = `allows operating as the Docker daemon` 34 35 const dockerSupportBaseDeclarationPlugs = ` 36 docker-support: 37 allow-installation: false 38 deny-auto-connection: true 39 ` 40 41 const dockerSupportBaseDeclarationSlots = ` 42 docker-support: 43 allow-installation: 44 slot-snap-type: 45 - core 46 deny-auto-connection: true 47 ` 48 49 const dockerSupportConnectedPlugAppArmorCore = ` 50 # These accesses are necessary for Ubuntu Core 16 and 18, likely due to the 51 # version of apparmor or the kernel which doesn't resolve the upper layer of an 52 # overlayfs mount correctly the accesses show up as runc trying to read from 53 # /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/ 54 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl, 55 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl, 56 ` 57 58 const dockerSupportConnectedPlugAppArmor = ` 59 # Description: allow operating as the Docker daemon/containerd. This policy is 60 # intentionally not restrictive and is here to help guard against programming 61 # errors and not for security confinement. The Docker daemon by design requires 62 # extensive access to the system and cannot be effectively confined against 63 # malicious activity. 64 65 #include <abstractions/dbus-strict> 66 67 # Allow sockets/etc for docker 68 /{,var/}run/docker.sock rw, 69 /{,var/}run/docker/ rw, 70 /{,var/}run/docker/** mrwklix, 71 /{,var/}run/runc/ rw, 72 /{,var/}run/runc/** mrwklix, 73 74 # Allow sockets/etc for containerd 75 /{,var/}run/containerd/{,s/,runc/,runc/k8s.io/,runc/k8s.io/*/} rw, 76 /{,var/}run/containerd/runc/k8s.io/*/** rwk, 77 /{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw, 78 /{,var/}run/containerd/io.containerd*/*/** rwk, 79 /{,var/}run/containerd/s/** rwk, 80 81 # Limit ipam-state to k8s 82 /run/ipam-state/k8s-** rw, 83 /run/ipam-state/k8s-*/lock k, 84 85 # Socket for docker-containerd-shim 86 unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00", 87 88 /{,var/}run/mount/utab r, 89 90 # Wide read access to /proc, but somewhat limited writes for now 91 @{PROC}/ r, 92 @{PROC}/** r, 93 @{PROC}/[0-9]*/attr/{,apparmor/}exec w, 94 @{PROC}/[0-9]*/oom_score_adj w, 95 96 # Limited read access to specific bits of /sys 97 /sys/kernel/mm/hugepages/ r, 98 /sys/kernel/mm/transparent_hugepage/{,**} r, 99 /sys/fs/cgroup/cpuset/cpuset.cpus r, 100 /sys/fs/cgroup/cpuset/cpuset.mems r, 101 /sys/module/apparmor/parameters/enabled r, 102 103 # Limit cgroup writes a bit (Docker uses a "docker" sub-group) 104 /sys/fs/cgroup/*/docker/ rw, 105 /sys/fs/cgroup/*/docker/** rw, 106 107 # Also allow cgroup writes to kubernetes pods 108 /sys/fs/cgroup/*/kubepods/ rw, 109 /sys/fs/cgroup/*/kubepods/** rw, 110 111 # containerd can also be configured to use the systemd cgroup driver via 112 # plugins.cri.systemd_cgroup = true which moves container processes into 113 # systemd-managed cgroups. This is now the recommended configuration since it 114 # provides a single cgroup manager (systemd) in an effort to achieve consistent 115 # views of resources. 116 /sys/fs/cgroup/*/systemd/{,system.slice/} rw, # create missing dirs 117 /sys/fs/cgroup/*/systemd/system.slice/** r, 118 /sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w, 119 120 # Allow tracing ourself (especially the "runc" process we create) 121 ptrace (trace) peer=@{profile_name}, 122 123 # Docker needs a lot of caps, but limits them in the app container 124 capability, 125 126 # Docker does all kinds of mounts all over the filesystem 127 /dev/mapper/control rw, 128 /dev/mapper/docker* rw, 129 /dev/loop-control r, 130 /dev/loop[0-9]* rw, 131 /sys/devices/virtual/block/dm-[0-9]*/** r, 132 mount, 133 umount, 134 135 # After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN, 136 # Docker removes the leftover /.pivot_rootNNNNNN directory (which is now 137 # relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root) 138 pivot_root, 139 /.pivot_root[0-9]*/ rw, 140 141 # file descriptors (/proc/NNN/fd/X) 142 # file descriptors in the container show up here due to attach_disconnected 143 /[0-9]* rw, 144 145 # Docker needs to be able to create and load the profile it applies to 146 # containers ("docker-default") 147 /{,usr/}sbin/apparmor_parser ixr, 148 /etc/apparmor.d/cache/ r, # apparmor 2.12 and below 149 /etc/apparmor.d/cache/.features r, 150 /etc/apparmor.d/{,cache/}docker* rw, 151 /var/cache/apparmor/{,*/} r, # apparmor 2.13 and higher 152 /var/cache/apparmor/*/.features r, 153 /var/cache/apparmor/*/docker* rw, 154 /etc/apparmor.d/tunables/{,**} r, 155 /etc/apparmor.d/abstractions/{,**} r, 156 /etc/apparmor/parser.conf r, 157 /etc/apparmor/subdomain.conf r, 158 /sys/kernel/security/apparmor/.replace rw, 159 /sys/kernel/security/apparmor/{,**} r, 160 161 # use 'privileged-containers: true' to support --security-opts 162 163 # defaults for docker-default 164 change_profile unsafe /** -> docker-default, 165 signal (send) peer=docker-default, 166 ptrace (read, trace) peer=docker-default, 167 168 # defaults for containerd 169 change_profile unsafe /** -> cri-containerd.apparmor.d, 170 signal (send) peer=cri-containerd.apparmor.d, 171 ptrace (read, trace) peer=cri-containerd.apparmor.d, 172 173 # Graph (storage) driver bits 174 /{dev,run}/shm/aufs.xino mrw, 175 /proc/fs/aufs/plink_maint w, 176 /sys/fs/aufs/** r, 177 178 #cf bug 1502785 179 / r, 180 181 # recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx 182 # and so to allow allocating a new shell we need this 183 /dev/pts/ptmx rw, 184 185 # needed by runc for mitigation of CVE-2019-5736 186 # For details see https://bugs.launchpad.net/apparmor/+bug/1820344 187 / ix, 188 /bin/runc ixr, 189 190 /pause ixr, 191 /bin/busybox ixr, 192 193 # When kubernetes drives containerd, containerd needs access to CNI services, 194 # like flanneld's subnet.env for DNS. This would ideally be snap-specific (it 195 # could if the control plane was a snap), but in deployments where the control 196 # plane is not a snap, it will tell flannel to use this path. 197 /run/flannel/{,**} rk, 198 199 # When kubernetes drives containerd, containerd needs access to various 200 # secrets for the pods which are overlayed at /run/secrets/.... 201 # This would ideally be snap-specific (it could if the control plane was a 202 # snap), but in deployments where the control plane is not a snap, it will tell 203 # containerd to use this path for various account information for pods. 204 /run/secrets/kubernetes.io/{,**} rk, 205 206 # Allow using the 'autobind' feature of bind() (eg, for journald via go-systemd) 207 # unix (bind) type=dgram addr=auto, 208 # TODO: when snapd vendors in AppArmor userspace, then enable the new syntax 209 # above which allows only "empty"/automatic addresses, for now we simply permit 210 # all addresses with SOCK_DGRAM type, which leaks info for other addresses than 211 # what docker tries to use 212 # see https://bugs.launchpad.net/snapd/+bug/1867216 213 unix (bind) type=dgram, 214 ` 215 216 const dockerSupportConnectedPlugSecComp = ` 217 # Description: allow operating as the Docker daemon. This policy is 218 # intentionally not restrictive and is here to help guard against programming 219 # errors and not for security confinement. The Docker daemon by design requires 220 # extensive access to the system and cannot be effectively confined against 221 # malicious activity. 222 223 # Because seccomp may only go more strict, we must allow all syscalls to Docker 224 # that it expects to give to containers in addition to what it needs to run and 225 # trust that docker daemon # only gives out reasonable syscalls to containers. 226 227 # Docker includes these in the default container whitelist, but they're 228 # potentially dangerous. 229 #finit_module 230 #init_module 231 #query_module 232 #delete_module 233 234 # These have a history of vulnerabilities, are not widely used, and 235 # open_by_handle_at has been used to break out of Docker containers by brute 236 # forcing the handle value: http://stealth.openwall.net/xSports/shocker.c 237 #name_to_handle_at 238 #open_by_handle_at 239 240 # Calls the Docker daemon itself requires 241 242 # /snap/docker/VERSION/bin/docker-runc 243 # "do not inherit the parent's session keyring" 244 # "make session keyring searcheable" 245 # runC uses this to ensure the container doesn't have access to the host 246 # keyring 247 keyctl 248 249 # /snap/docker/VERSION/bin/docker-runc 250 pivot_root 251 252 # ptrace can be abused to break out of the seccomp sandbox 253 # but is required by the Docker daemon. 254 ptrace 255 256 # This list comes from Docker's default seccomp whitelist (which is applied to 257 # all containers launched unless a custom profile is specified or 258 # "--privileged" is used) 259 # https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879 260 # It has been further filtered to exclude certain known-troublesome syscalls. 261 accept 262 accept4 263 access 264 acct 265 adjtimex 266 alarm 267 arch_prctl 268 bind 269 bpf 270 breakpoint 271 brk 272 cacheflush 273 capget 274 capset 275 chdir 276 chmod 277 chown 278 chown32 279 chroot 280 clock_getres 281 clock_getres_time64 282 clock_gettime 283 clock_gettime64 284 clock_nanosleep 285 clock_nanosleep_time64 286 clone 287 close 288 connect 289 copy_file_range 290 creat 291 dup 292 dup2 293 dup3 294 epoll_create 295 epoll_create1 296 epoll_ctl 297 epoll_ctl_old 298 epoll_pwait 299 epoll_wait 300 epoll_wait_old 301 eventfd 302 eventfd2 303 execve 304 execveat 305 exit 306 exit_group 307 faccessat 308 fadvise64 309 fadvise64_64 310 fallocate 311 fanotify_init 312 fanotify_mark 313 fchdir 314 fchmod 315 fchmodat 316 fchown 317 fchown32 318 fchownat 319 fcntl 320 fcntl64 321 fdatasync 322 fgetxattr 323 flistxattr 324 flock 325 fork 326 fremovexattr 327 fsetxattr 328 fstat 329 fstat64 330 fstatat64 331 fstatfs 332 fstatfs64 333 fsync 334 ftruncate 335 ftruncate64 336 futex 337 futex_time64 338 futimesat 339 getcpu 340 getcwd 341 getdents 342 getdents64 343 getegid 344 getegid32 345 geteuid 346 geteuid32 347 getgid 348 getgid32 349 getgroups 350 getgroups32 351 getitimer 352 getpeername 353 getpgid 354 getpgrp 355 getpid 356 getppid 357 getpriority 358 getrandom 359 getresgid 360 getresgid32 361 getresuid 362 getresuid32 363 getrlimit 364 get_robust_list 365 getrusage 366 getsid 367 getsockname 368 getsockopt 369 get_thread_area 370 get_tls 371 gettid 372 gettimeofday 373 getuid 374 getuid32 375 getxattr 376 inotify_add_watch 377 inotify_init 378 inotify_init1 379 inotify_rm_watch 380 io_cancel 381 ioctl 382 io_destroy 383 io_getevents 384 ioperm 385 iopl 386 ioprio_get 387 ioprio_set 388 io_setup 389 io_submit 390 ipc 391 kcmp 392 kill 393 lchown 394 lchown32 395 lgetxattr 396 link 397 linkat 398 listen 399 listxattr 400 llistxattr 401 _llseek 402 lookup_dcookie 403 lremovexattr 404 lseek 405 lsetxattr 406 lstat 407 lstat64 408 madvise 409 memfd_create 410 mincore 411 mkdir 412 mkdirat 413 mknod 414 mknodat 415 mlock 416 mlock2 417 mlockall 418 mmap 419 mmap2 420 modify_ldt 421 mount 422 mprotect 423 mq_getsetattr 424 mq_notify 425 mq_open 426 mq_timedreceive 427 mq_timedreceive_time64 428 mq_timedsend 429 mq_timedsend_time64 430 mq_unlink 431 mremap 432 msgctl 433 msgget 434 msgrcv 435 msgsnd 436 msync 437 munlock 438 munlockall 439 munmap 440 nanosleep 441 newfstatat 442 _newselect 443 open 444 openat 445 pause 446 perf_event_open 447 personality 448 pipe 449 pipe2 450 poll 451 ppoll 452 ppoll_time64 453 prctl 454 pread64 455 preadv 456 prlimit64 457 process_vm_readv 458 process_vm_writev 459 pselect6 460 pselect6_time64 461 pwrite64 462 pwritev 463 read 464 readahead 465 readlink 466 readlinkat 467 readv 468 reboot 469 recv 470 recvfrom 471 recvmmsg 472 recvmmsg_time64 473 recvmsg 474 remap_file_pages 475 removexattr 476 rename 477 renameat 478 renameat2 479 restart_syscall 480 rmdir 481 rt_sigaction 482 rt_sigpending 483 rt_sigprocmask 484 rt_sigqueueinfo 485 rt_sigreturn 486 rt_sigsuspend 487 rt_sigtimedwait 488 rt_sigtimedwait_time64 489 rt_tgsigqueueinfo 490 s390_pci_mmio_read 491 s390_pci_mmio_write 492 s390_runtime_instr 493 sched_getaffinity 494 sched_getattr 495 sched_getparam 496 sched_get_priority_max 497 sched_get_priority_min 498 sched_getscheduler 499 sched_rr_get_interval 500 sched_rr_get_interval_time64 501 sched_setaffinity 502 sched_setattr 503 sched_setparam 504 sched_setscheduler 505 sched_yield 506 seccomp 507 select 508 semctl 509 semget 510 semop 511 semtimedop 512 semtimedop_time64 513 send 514 sendfile 515 sendfile64 516 sendmmsg 517 sendmsg 518 sendto 519 setdomainname 520 setfsgid 521 setfsgid32 522 setfsuid 523 setfsuid32 524 setgid 525 setgid32 526 setgroups 527 setgroups32 528 sethostname 529 setitimer 530 setns 531 setpgid 532 setpriority 533 setregid 534 setregid32 535 setresgid 536 setresgid32 537 setresuid 538 setresuid32 539 setreuid 540 setreuid32 541 setrlimit 542 set_robust_list 543 setsid 544 setsockopt 545 set_thread_area 546 set_tid_address 547 settimeofday 548 set_tls 549 setuid 550 setuid32 551 setxattr 552 shmat 553 shmctl 554 shmdt 555 shmget 556 shutdown 557 sigaltstack 558 signalfd 559 signalfd4 560 sigreturn 561 socket 562 socketcall 563 socketpair 564 splice 565 stat 566 stat64 567 statfs 568 statfs64 569 stime 570 symlink 571 symlinkat 572 sync 573 sync_file_range 574 syncfs 575 sysinfo 576 syslog 577 tee 578 tgkill 579 time 580 timer_create 581 timer_delete 582 timerfd_create 583 timerfd_gettime 584 timerfd_gettime64 585 timerfd_settime 586 timerfd_settime64 587 timer_getoverrun 588 timer_gettime 589 timer_gettime64 590 timer_settime 591 timer_settime64 592 times 593 tkill 594 truncate 595 truncate64 596 ugetrlimit 597 umask 598 umount 599 umount2 600 uname 601 unlink 602 unlinkat 603 unshare 604 utime 605 utimensat 606 utimensat_time64 607 utimes 608 vfork 609 vhangup 610 vmsplice 611 wait4 612 waitid 613 waitpid 614 write 615 writev 616 ` 617 618 const dockerSupportPrivilegedAppArmor = ` 619 # Description: allow docker daemon to run privileged containers. This gives 620 # full access to all resources on the system and thus gives device ownership to 621 # connected snaps. 622 623 # These rules are here to allow Docker to launch unconfined containers but 624 # allow the docker daemon itself to go unconfined. Since it runs as root, this 625 # grants device ownership. 626 change_profile unsafe /**, 627 signal (send) peer=unconfined, 628 ptrace (read, trace) peer=unconfined, 629 630 # This grants raw access to device files and thus device ownership 631 /dev/** mrwkl, 632 @{PROC}/** mrwkl, 633 634 # When kubernetes drives docker/containerd, it creates and runs files in the 635 # container at arbitrary locations (eg, via pivot_root). 636 /** rwlix, 637 ` 638 639 const dockerSupportPrivilegedSecComp = ` 640 # Description: allow docker daemon to run privileged containers. This gives 641 # full access to all resources on the system and thus gives device ownership to 642 # connected snaps. 643 644 # This grants, among other things, kernel module loading and therefore device 645 # ownership. 646 @unrestricted 647 ` 648 649 const dockerSupportServiceSnippet = `Delegate=true` 650 651 type dockerSupportInterface struct { 652 commonInterface 653 } 654 655 func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 656 // https://kubernetes.io/docs/setup/production-environment/container-runtimes/ 657 if err := spec.AddModule("overlay"); err != nil { 658 return err 659 } 660 return nil 661 } 662 663 func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 664 var privileged bool 665 _ = plug.Attr("privileged-containers", &privileged) 666 667 // The 'change_profile unsafe' rules conflict with the 'ix' rules in 668 // the home interface, so suppress them (LP: #1797786) 669 spec.SetSuppressHomeIx() 670 spec.AddSnippet(dockerSupportConnectedPlugAppArmor) 671 if privileged { 672 spec.AddSnippet(dockerSupportPrivilegedAppArmor) 673 } 674 if !release.OnClassic { 675 spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore) 676 } 677 spec.SetUsesPtraceTrace() 678 return nil 679 } 680 681 func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 682 var privileged bool 683 _ = plug.Attr("privileged-containers", &privileged) 684 snippet := dockerSupportConnectedPlugSecComp 685 if privileged { 686 snippet += dockerSupportPrivilegedSecComp 687 } 688 spec.AddSnippet(snippet) 689 return nil 690 } 691 692 func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error { 693 if v, ok := plug.Attrs["privileged-containers"]; ok { 694 if _, ok = v.(bool); !ok { 695 return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'") 696 } 697 } 698 return nil 699 } 700 701 func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool { 702 // allow what declarations allowed 703 return true 704 } 705 706 func init() { 707 registerIface(&dockerSupportInterface{commonInterface{ 708 name: "docker-support", 709 summary: dockerSupportSummary, 710 implicitOnCore: true, 711 implicitOnClassic: true, 712 baseDeclarationPlugs: dockerSupportBaseDeclarationPlugs, 713 baseDeclarationSlots: dockerSupportBaseDeclarationSlots, 714 controlsDeviceCgroup: true, 715 serviceSnippets: []string{dockerSupportServiceSnippet}, 716 // docker-support also uses ptrace(trace), but it already declares this in 717 // the AppArmorConnectedPlug method 718 }}) 719 }