github.com/hugh712/snapd@v0.0.0-20200910133618-1a99902bd583/interfaces/builtin/docker_support.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2016-2018 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package builtin 21 22 import ( 23 "fmt" 24 25 "github.com/snapcore/snapd/interfaces" 26 "github.com/snapcore/snapd/interfaces/apparmor" 27 "github.com/snapcore/snapd/interfaces/kmod" 28 "github.com/snapcore/snapd/interfaces/seccomp" 29 "github.com/snapcore/snapd/interfaces/udev" 30 "github.com/snapcore/snapd/release" 31 apparmor_sandbox "github.com/snapcore/snapd/sandbox/apparmor" 32 "github.com/snapcore/snapd/snap" 33 ) 34 35 const dockerSupportSummary = `allows operating as the Docker daemon` 36 37 const dockerSupportBaseDeclarationPlugs = ` 38 docker-support: 39 allow-installation: false 40 deny-auto-connection: true 41 ` 42 43 const dockerSupportBaseDeclarationSlots = ` 44 docker-support: 45 allow-installation: 46 slot-snap-type: 47 - core 48 deny-auto-connection: true 49 ` 50 51 const dockerSupportConnectedPlugAppArmorCore = ` 52 # These accesses are necessary for Ubuntu Core 16 and 18, likely due to the 53 # version of apparmor or the kernel which doesn't resolve the upper layer of an 54 # overlayfs mount correctly the accesses show up as runc trying to read from 55 # /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/ 56 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl, 57 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl, 58 ` 59 60 const dockerSupportConnectedPlugAppArmor = ` 61 # Description: allow operating as the Docker daemon/containerd. This policy is 62 # intentionally not restrictive and is here to help guard against programming 63 # errors and not for security confinement. The Docker daemon by design requires 64 # extensive access to the system and cannot be effectively confined against 65 # malicious activity. 66 67 #include <abstractions/dbus-strict> 68 69 # Allow sockets/etc for docker 70 /{,var/}run/docker.sock rw, 71 /{,var/}run/docker/ rw, 72 /{,var/}run/docker/** mrwklix, 73 /{,var/}run/runc/ rw, 74 /{,var/}run/runc/** mrwklix, 75 76 # Allow sockets/etc for containerd 77 /{,var/}run/containerd/{,runc/,runc/k8s.io/,runc/k8s.io/*/} rw, 78 /{,var/}run/containerd/runc/k8s.io/*/** rwk, 79 /{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw, 80 /{,var/}run/containerd/io.containerd*/*/** rwk, 81 82 # Limit ipam-state to k8s 83 /run/ipam-state/k8s-** rw, 84 /run/ipam-state/k8s-*/lock k, 85 86 # Socket for docker-containerd-shim 87 unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00", 88 89 /{,var/}run/mount/utab r, 90 91 # Wide read access to /proc, but somewhat limited writes for now 92 @{PROC}/ r, 93 @{PROC}/** r, 94 @{PROC}/[0-9]*/attr/exec w, 95 @{PROC}/[0-9]*/oom_score_adj w, 96 97 # Limited read access to specific bits of /sys 98 /sys/kernel/mm/hugepages/ r, 99 /sys/kernel/mm/transparent_hugepage/{,**} r, 100 /sys/fs/cgroup/cpuset/cpuset.cpus r, 101 /sys/fs/cgroup/cpuset/cpuset.mems r, 102 /sys/module/apparmor/parameters/enabled r, 103 104 # Limit cgroup writes a bit (Docker uses a "docker" sub-group) 105 /sys/fs/cgroup/*/docker/ rw, 106 /sys/fs/cgroup/*/docker/** rw, 107 108 # Also allow cgroup writes to kubernetes pods 109 /sys/fs/cgroup/*/kubepods/ rw, 110 /sys/fs/cgroup/*/kubepods/** rw, 111 112 # containerd can also be configured to use the systemd cgroup driver via 113 # plugins.cri.systemd_cgroup = true which moves container processes into 114 # systemd-managed cgroups. This is now the recommended configuration since it 115 # provides a single cgroup manager (systemd) in an effort to achieve consistent 116 # views of resources. 117 /sys/fs/cgroup/*/systemd/{,system.slice/} rw, # create missing dirs 118 /sys/fs/cgroup/*/systemd/system.slice/** r, 119 /sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w, 120 121 # Allow tracing ourself (especially the "runc" process we create) 122 ptrace (trace) peer=@{profile_name}, 123 124 # Docker needs a lot of caps, but limits them in the app container 125 capability, 126 127 # Docker does all kinds of mounts all over the filesystem 128 /dev/mapper/control rw, 129 /dev/mapper/docker* rw, 130 /dev/loop-control r, 131 /dev/loop[0-9]* rw, 132 /sys/devices/virtual/block/dm-[0-9]*/** r, 133 mount, 134 umount, 135 136 # After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN, 137 # Docker removes the leftover /.pivot_rootNNNNNN directory (which is now 138 # relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root) 139 pivot_root, 140 /.pivot_root[0-9]*/ rw, 141 142 # file descriptors (/proc/NNN/fd/X) 143 # file descriptors in the container show up here due to attach_disconnected 144 /[0-9]* rw, 145 146 # Docker needs to be able to create and load the profile it applies to 147 # containers ("docker-default") 148 /sbin/apparmor_parser ixr, 149 /etc/apparmor.d/cache/ r, # apparmor 2.12 and below 150 /etc/apparmor.d/cache/.features r, 151 /etc/apparmor.d/{,cache/}docker* rw, 152 /var/cache/apparmor/{,*/} r, # apparmor 2.13 and higher 153 /var/cache/apparmor/*/.features r, 154 /var/cache/apparmor/*/docker* rw, 155 /etc/apparmor.d/tunables/{,**} r, 156 /etc/apparmor.d/abstractions/{,**} r, 157 /etc/apparmor/parser.conf r, 158 /etc/apparmor/subdomain.conf r, 159 /sys/kernel/security/apparmor/.replace rw, 160 /sys/kernel/security/apparmor/{,**} r, 161 162 # use 'privileged-containers: true' to support --security-opts 163 164 # defaults for docker-default 165 change_profile unsafe /** -> docker-default, 166 signal (send) peer=docker-default, 167 ptrace (read, trace) peer=docker-default, 168 169 # defaults for containerd 170 change_profile unsafe /** -> cri-containerd.apparmor.d, 171 signal (send) peer=cri-containerd.apparmor.d, 172 ptrace (read, trace) peer=cri-containerd.apparmor.d, 173 174 # Graph (storage) driver bits 175 /{dev,run}/shm/aufs.xino mrw, 176 /proc/fs/aufs/plink_maint w, 177 /sys/fs/aufs/** r, 178 179 #cf bug 1502785 180 / r, 181 182 # recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx 183 # and so to allow allocating a new shell we need this 184 /dev/pts/ptmx rw, 185 186 # needed by runc for mitigation of CVE-2019-5736 187 # For details see https://bugs.launchpad.net/apparmor/+bug/1820344 188 / ix, 189 /bin/runc ixr, 190 191 /pause ixr, 192 /bin/busybox ixr, 193 194 # When kubernetes drives containerd, containerd needs access to CNI services, 195 # like flanneld's subnet.env for DNS. This would ideally be snap-specific (it 196 # could if the control plane was a snap), but in deployments where the control 197 # plane is not a snap, it will tell flannel to use this path. 198 /run/flannel/{,**} rk, 199 200 # When kubernetes drives containerd, containerd needs access to various 201 # secrets for the pods which are overlayed at /run/secrets/.... 202 # This would ideally be snap-specific (it could if the control plane was a 203 # snap), but in deployments where the control plane is not a snap, it will tell 204 # containerd to use this path for various account information for pods. 205 /run/secrets/kubernetes.io/{,**} rk, 206 ` 207 208 const dockerSupportConnectedPlugSecComp = ` 209 # Description: allow operating as the Docker daemon. This policy is 210 # intentionally not restrictive and is here to help guard against programming 211 # errors and not for security confinement. The Docker daemon by design requires 212 # extensive access to the system and cannot be effectively confined against 213 # malicious activity. 214 215 # Because seccomp may only go more strict, we must allow all syscalls to Docker 216 # that it expects to give to containers in addition to what it needs to run and 217 # trust that docker daemon # only gives out reasonable syscalls to containers. 218 219 # Docker includes these in the default container whitelist, but they're 220 # potentially dangerous. 221 #finit_module 222 #init_module 223 #query_module 224 #delete_module 225 226 # These have a history of vulnerabilities, are not widely used, and 227 # open_by_handle_at has been used to break out of Docker containers by brute 228 # forcing the handle value: http://stealth.openwall.net/xSports/shocker.c 229 #name_to_handle_at 230 #open_by_handle_at 231 232 # Calls the Docker daemon itself requires 233 234 # /snap/docker/VERSION/bin/docker-runc 235 # "do not inherit the parent's session keyring" 236 # "make session keyring searcheable" 237 # runC uses this to ensure the container doesn't have access to the host 238 # keyring 239 keyctl 240 241 # /snap/docker/VERSION/bin/docker-runc 242 pivot_root 243 244 # ptrace can be abused to break out of the seccomp sandbox 245 # but is required by the Docker daemon. 246 ptrace 247 248 # This list comes from Docker's default seccomp whitelist (which is applied to 249 # all containers launched unless a custom profile is specified or 250 # "--privileged" is used) 251 # https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879 252 # It has been further filtered to exclude certain known-troublesome syscalls. 253 accept 254 accept4 255 access 256 acct 257 adjtimex 258 alarm 259 arch_prctl 260 bind 261 bpf 262 breakpoint 263 brk 264 cacheflush 265 capget 266 capset 267 chdir 268 chmod 269 chown 270 chown32 271 chroot 272 clock_getres 273 clock_getres_time64 274 clock_gettime 275 clock_gettime64 276 clock_nanosleep 277 clock_nanosleep_time64 278 clone 279 close 280 connect 281 copy_file_range 282 creat 283 dup 284 dup2 285 dup3 286 epoll_create 287 epoll_create1 288 epoll_ctl 289 epoll_ctl_old 290 epoll_pwait 291 epoll_wait 292 epoll_wait_old 293 eventfd 294 eventfd2 295 execve 296 execveat 297 exit 298 exit_group 299 faccessat 300 fadvise64 301 fadvise64_64 302 fallocate 303 fanotify_init 304 fanotify_mark 305 fchdir 306 fchmod 307 fchmodat 308 fchown 309 fchown32 310 fchownat 311 fcntl 312 fcntl64 313 fdatasync 314 fgetxattr 315 flistxattr 316 flock 317 fork 318 fremovexattr 319 fsetxattr 320 fstat 321 fstat64 322 fstatat64 323 fstatfs 324 fstatfs64 325 fsync 326 ftruncate 327 ftruncate64 328 futex 329 futex_time64 330 futimesat 331 getcpu 332 getcwd 333 getdents 334 getdents64 335 getegid 336 getegid32 337 geteuid 338 geteuid32 339 getgid 340 getgid32 341 getgroups 342 getgroups32 343 getitimer 344 getpeername 345 getpgid 346 getpgrp 347 getpid 348 getppid 349 getpriority 350 getrandom 351 getresgid 352 getresgid32 353 getresuid 354 getresuid32 355 getrlimit 356 get_robust_list 357 getrusage 358 getsid 359 getsockname 360 getsockopt 361 get_thread_area 362 get_tls 363 gettid 364 gettimeofday 365 getuid 366 getuid32 367 getxattr 368 inotify_add_watch 369 inotify_init 370 inotify_init1 371 inotify_rm_watch 372 io_cancel 373 ioctl 374 io_destroy 375 io_getevents 376 ioperm 377 iopl 378 ioprio_get 379 ioprio_set 380 io_setup 381 io_submit 382 ipc 383 kcmp 384 kill 385 lchown 386 lchown32 387 lgetxattr 388 link 389 linkat 390 listen 391 listxattr 392 llistxattr 393 _llseek 394 lookup_dcookie 395 lremovexattr 396 lseek 397 lsetxattr 398 lstat 399 lstat64 400 madvise 401 memfd_create 402 mincore 403 mkdir 404 mkdirat 405 mknod 406 mknodat 407 mlock 408 mlock2 409 mlockall 410 mmap 411 mmap2 412 modify_ldt 413 mount 414 mprotect 415 mq_getsetattr 416 mq_notify 417 mq_open 418 mq_timedreceive 419 mq_timedreceive_time64 420 mq_timedsend 421 mq_timedsend_time64 422 mq_unlink 423 mremap 424 msgctl 425 msgget 426 msgrcv 427 msgsnd 428 msync 429 munlock 430 munlockall 431 munmap 432 nanosleep 433 newfstatat 434 _newselect 435 open 436 openat 437 pause 438 perf_event_open 439 personality 440 pipe 441 pipe2 442 poll 443 ppoll 444 ppoll_time64 445 prctl 446 pread64 447 preadv 448 prlimit64 449 process_vm_readv 450 process_vm_writev 451 pselect6 452 pselect6_time64 453 pwrite64 454 pwritev 455 read 456 readahead 457 readlink 458 readlinkat 459 readv 460 reboot 461 recv 462 recvfrom 463 recvmmsg 464 recvmmsg_time64 465 recvmsg 466 remap_file_pages 467 removexattr 468 rename 469 renameat 470 renameat2 471 restart_syscall 472 rmdir 473 rt_sigaction 474 rt_sigpending 475 rt_sigprocmask 476 rt_sigqueueinfo 477 rt_sigreturn 478 rt_sigsuspend 479 rt_sigtimedwait 480 rt_sigtimedwait_time64 481 rt_tgsigqueueinfo 482 s390_pci_mmio_read 483 s390_pci_mmio_write 484 s390_runtime_instr 485 sched_getaffinity 486 sched_getattr 487 sched_getparam 488 sched_get_priority_max 489 sched_get_priority_min 490 sched_getscheduler 491 sched_rr_get_interval 492 sched_rr_get_interval_time64 493 sched_setaffinity 494 sched_setattr 495 sched_setparam 496 sched_setscheduler 497 sched_yield 498 seccomp 499 select 500 semctl 501 semget 502 semop 503 semtimedop 504 semtimedop_time64 505 send 506 sendfile 507 sendfile64 508 sendmmsg 509 sendmsg 510 sendto 511 setdomainname 512 setfsgid 513 setfsgid32 514 setfsuid 515 setfsuid32 516 setgid 517 setgid32 518 setgroups 519 setgroups32 520 sethostname 521 setitimer 522 setns 523 setpgid 524 setpriority 525 setregid 526 setregid32 527 setresgid 528 setresgid32 529 setresuid 530 setresuid32 531 setreuid 532 setreuid32 533 setrlimit 534 set_robust_list 535 setsid 536 setsockopt 537 set_thread_area 538 set_tid_address 539 settimeofday 540 set_tls 541 setuid 542 setuid32 543 setxattr 544 shmat 545 shmctl 546 shmdt 547 shmget 548 shutdown 549 sigaltstack 550 signalfd 551 signalfd4 552 sigreturn 553 socket 554 socketcall 555 socketpair 556 splice 557 stat 558 stat64 559 statfs 560 statfs64 561 stime 562 symlink 563 symlinkat 564 sync 565 sync_file_range 566 syncfs 567 sysinfo 568 syslog 569 tee 570 tgkill 571 time 572 timer_create 573 timer_delete 574 timerfd_create 575 timerfd_gettime 576 timerfd_gettime64 577 timerfd_settime 578 timerfd_settime64 579 timer_getoverrun 580 timer_gettime 581 timer_gettime64 582 timer_settime 583 timer_settime64 584 times 585 tkill 586 truncate 587 truncate64 588 ugetrlimit 589 umask 590 umount 591 umount2 592 uname 593 unlink 594 unlinkat 595 unshare 596 utime 597 utimensat 598 utimensat_time64 599 utimes 600 vfork 601 vhangup 602 vmsplice 603 wait4 604 waitid 605 waitpid 606 write 607 writev 608 ` 609 610 const dockerSupportPrivilegedAppArmor = ` 611 # Description: allow docker daemon to run privileged containers. This gives 612 # full access to all resources on the system and thus gives device ownership to 613 # connected snaps. 614 615 # These rules are here to allow Docker to launch unconfined containers but 616 # allow the docker daemon itself to go unconfined. Since it runs as root, this 617 # grants device ownership. 618 change_profile unsafe /**, 619 signal (send) peer=unconfined, 620 ptrace (read, trace) peer=unconfined, 621 622 # This grants raw access to device files and thus device ownership 623 /dev/** mrwkl, 624 @{PROC}/** mrwkl, 625 626 # When kubernetes drives docker/containerd, it creates and runs files in the 627 # container at arbitrary locations (eg, via pivot_root). 628 /** rwlix, 629 ` 630 631 const dockerSupportPrivilegedSecComp = ` 632 # Description: allow docker daemon to run privileged containers. This gives 633 # full access to all resources on the system and thus gives device ownership to 634 # connected snaps. 635 636 # This grants, among other things, kernel module loading and therefore device 637 # ownership. 638 @unrestricted 639 ` 640 641 type dockerSupportInterface struct{} 642 643 func (iface *dockerSupportInterface) Name() string { 644 return "docker-support" 645 } 646 647 func (iface *dockerSupportInterface) StaticInfo() interfaces.StaticInfo { 648 return interfaces.StaticInfo{ 649 Summary: dockerSupportSummary, 650 ImplicitOnCore: true, 651 ImplicitOnClassic: true, 652 BaseDeclarationPlugs: dockerSupportBaseDeclarationPlugs, 653 BaseDeclarationSlots: dockerSupportBaseDeclarationSlots, 654 } 655 } 656 657 var ( 658 parserFeatures = apparmor_sandbox.ParserFeatures 659 ) 660 661 func (iface *dockerSupportInterface) UDevConnectedPlug(spec *udev.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 662 spec.SetControlsDeviceCgroup() 663 664 return nil 665 } 666 667 func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 668 // https://kubernetes.io/docs/setup/production-environment/container-runtimes/ 669 if err := spec.AddModule("overlay"); err != nil { 670 return err 671 } 672 return nil 673 } 674 675 func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 676 var privileged bool 677 _ = plug.Attr("privileged-containers", &privileged) 678 679 // The 'change_profile unsafe' rules conflict with the 'ix' rules in 680 // the home interface, so suppress them (LP: #1797786) 681 spec.SetSuppressHomeIx() 682 spec.AddSnippet(dockerSupportConnectedPlugAppArmor) 683 if privileged { 684 spec.AddSnippet(dockerSupportPrivilegedAppArmor) 685 } 686 if !release.OnClassic { 687 spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore) 688 } 689 spec.SetUsesPtraceTrace() 690 return nil 691 } 692 693 func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 694 var privileged bool 695 _ = plug.Attr("privileged-containers", &privileged) 696 snippet := dockerSupportConnectedPlugSecComp 697 if privileged { 698 snippet += dockerSupportPrivilegedSecComp 699 } 700 spec.AddSnippet(snippet) 701 return nil 702 } 703 704 func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error { 705 if v, ok := plug.Attrs["privileged-containers"]; ok { 706 if _, ok = v.(bool); !ok { 707 return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'") 708 } 709 } 710 return nil 711 } 712 713 func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool { 714 // allow what declarations allowed 715 return true 716 } 717 718 func init() { 719 registerIface(&dockerSupportInterface{}) 720 }