github.com/bugraaydogar/snapd@v0.0.0-20210315170335-8c70bb858939/interfaces/builtin/docker_support.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2016-2018 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package builtin 21 22 import ( 23 "fmt" 24 25 "github.com/snapcore/snapd/interfaces" 26 "github.com/snapcore/snapd/interfaces/apparmor" 27 "github.com/snapcore/snapd/interfaces/kmod" 28 "github.com/snapcore/snapd/interfaces/seccomp" 29 "github.com/snapcore/snapd/release" 30 "github.com/snapcore/snapd/snap" 31 ) 32 33 const dockerSupportSummary = `allows operating as the Docker daemon` 34 35 const dockerSupportBaseDeclarationPlugs = ` 36 docker-support: 37 allow-installation: false 38 deny-auto-connection: true 39 ` 40 41 const dockerSupportBaseDeclarationSlots = ` 42 docker-support: 43 allow-installation: 44 slot-snap-type: 45 - core 46 deny-auto-connection: true 47 ` 48 49 const dockerSupportConnectedPlugAppArmorCore = ` 50 # These accesses are necessary for Ubuntu Core 16 and 18, likely due to the 51 # version of apparmor or the kernel which doesn't resolve the upper layer of an 52 # overlayfs mount correctly the accesses show up as runc trying to read from 53 # /system-data/var/snap/docker/common/var-lib-docker/overlay2/$SHA/diff/ 54 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/common/{,**} rwl, 55 /system-data/var/snap/{@{SNAP_NAME},@{SNAP_INSTANCE_NAME}}/@{SNAP_REVISION}/{,**} rwl, 56 ` 57 58 const dockerSupportConnectedPlugAppArmor = ` 59 # Description: allow operating as the Docker daemon/containerd. This policy is 60 # intentionally not restrictive and is here to help guard against programming 61 # errors and not for security confinement. The Docker daemon by design requires 62 # extensive access to the system and cannot be effectively confined against 63 # malicious activity. 64 65 #include <abstractions/dbus-strict> 66 67 # Allow sockets/etc for docker 68 /{,var/}run/docker.sock rw, 69 /{,var/}run/docker/ rw, 70 /{,var/}run/docker/** mrwklix, 71 /{,var/}run/runc/ rw, 72 /{,var/}run/runc/** mrwklix, 73 74 # Allow sockets/etc for containerd 75 /{,var/}run/containerd/{,s/,runc/,runc/k8s.io/,runc/k8s.io/*/} rw, 76 /{,var/}run/containerd/runc/k8s.io/*/** rwk, 77 /{,var/}run/containerd/{io.containerd*/,io.containerd*/k8s.io/,io.containerd*/k8s.io/*/} rw, 78 /{,var/}run/containerd/io.containerd*/*/** rwk, 79 /{,var/}run/containerd/s/** rwk, 80 81 # Limit ipam-state to k8s 82 /run/ipam-state/k8s-** rw, 83 /run/ipam-state/k8s-*/lock k, 84 85 # Socket for docker-containerd-shim 86 unix (bind,listen) type=stream addr="@/containerd-shim/**.sock\x00", 87 88 /{,var/}run/mount/utab r, 89 90 # Wide read access to /proc, but somewhat limited writes for now 91 @{PROC}/ r, 92 @{PROC}/** r, 93 @{PROC}/[0-9]*/attr/{,apparmor/}exec w, 94 @{PROC}/[0-9]*/oom_score_adj w, 95 96 # Limited read access to specific bits of /sys 97 /sys/kernel/mm/hugepages/ r, 98 /sys/kernel/mm/transparent_hugepage/{,**} r, 99 /sys/fs/cgroup/cpuset/cpuset.cpus r, 100 /sys/fs/cgroup/cpuset/cpuset.mems r, 101 /sys/module/apparmor/parameters/enabled r, 102 103 # Limit cgroup writes a bit (Docker uses a "docker" sub-group) 104 /sys/fs/cgroup/*/docker/ rw, 105 /sys/fs/cgroup/*/docker/** rw, 106 107 # Also allow cgroup writes to kubernetes pods 108 /sys/fs/cgroup/*/kubepods/ rw, 109 /sys/fs/cgroup/*/kubepods/** rw, 110 111 # containerd can also be configured to use the systemd cgroup driver via 112 # plugins.cri.systemd_cgroup = true which moves container processes into 113 # systemd-managed cgroups. This is now the recommended configuration since it 114 # provides a single cgroup manager (systemd) in an effort to achieve consistent 115 # views of resources. 116 /sys/fs/cgroup/*/systemd/{,system.slice/} rw, # create missing dirs 117 /sys/fs/cgroup/*/systemd/system.slice/** r, 118 /sys/fs/cgroup/*/systemd/system.slice/cgroup.procs w, 119 120 # Allow tracing ourself (especially the "runc" process we create) 121 ptrace (trace) peer=@{profile_name}, 122 123 # Docker needs a lot of caps, but limits them in the app container 124 capability, 125 126 # Docker does all kinds of mounts all over the filesystem 127 /dev/mapper/control rw, 128 /dev/mapper/docker* rw, 129 /dev/loop-control r, 130 /dev/loop[0-9]* rw, 131 /sys/devices/virtual/block/dm-[0-9]*/** r, 132 mount, 133 umount, 134 135 # After doing a pivot_root using <graph-dir>/<container-fs>/.pivot_rootNNNNNN, 136 # Docker removes the leftover /.pivot_rootNNNNNN directory (which is now 137 # relative to "/" instead of "<graph-dir>/<container-fs>" thanks to pivot_root) 138 pivot_root, 139 /.pivot_root[0-9]*/ rw, 140 141 # file descriptors (/proc/NNN/fd/X) 142 # file descriptors in the container show up here due to attach_disconnected 143 /[0-9]* rw, 144 145 # Docker needs to be able to create and load the profile it applies to 146 # containers ("docker-default") 147 /sbin/apparmor_parser ixr, 148 /etc/apparmor.d/cache/ r, # apparmor 2.12 and below 149 /etc/apparmor.d/cache/.features r, 150 /etc/apparmor.d/{,cache/}docker* rw, 151 /var/cache/apparmor/{,*/} r, # apparmor 2.13 and higher 152 /var/cache/apparmor/*/.features r, 153 /var/cache/apparmor/*/docker* rw, 154 /etc/apparmor.d/tunables/{,**} r, 155 /etc/apparmor.d/abstractions/{,**} r, 156 /etc/apparmor/parser.conf r, 157 /etc/apparmor/subdomain.conf r, 158 /sys/kernel/security/apparmor/.replace rw, 159 /sys/kernel/security/apparmor/{,**} r, 160 161 # use 'privileged-containers: true' to support --security-opts 162 163 # defaults for docker-default 164 change_profile unsafe /** -> docker-default, 165 signal (send) peer=docker-default, 166 ptrace (read, trace) peer=docker-default, 167 168 # defaults for containerd 169 change_profile unsafe /** -> cri-containerd.apparmor.d, 170 signal (send) peer=cri-containerd.apparmor.d, 171 ptrace (read, trace) peer=cri-containerd.apparmor.d, 172 173 # Graph (storage) driver bits 174 /{dev,run}/shm/aufs.xino mrw, 175 /proc/fs/aufs/plink_maint w, 176 /sys/fs/aufs/** r, 177 178 #cf bug 1502785 179 / r, 180 181 # recent versions of docker make a symlink from /dev/ptmx to /dev/pts/ptmx 182 # and so to allow allocating a new shell we need this 183 /dev/pts/ptmx rw, 184 185 # needed by runc for mitigation of CVE-2019-5736 186 # For details see https://bugs.launchpad.net/apparmor/+bug/1820344 187 / ix, 188 /bin/runc ixr, 189 190 /pause ixr, 191 /bin/busybox ixr, 192 193 # When kubernetes drives containerd, containerd needs access to CNI services, 194 # like flanneld's subnet.env for DNS. This would ideally be snap-specific (it 195 # could if the control plane was a snap), but in deployments where the control 196 # plane is not a snap, it will tell flannel to use this path. 197 /run/flannel/{,**} rk, 198 199 # When kubernetes drives containerd, containerd needs access to various 200 # secrets for the pods which are overlayed at /run/secrets/.... 201 # This would ideally be snap-specific (it could if the control plane was a 202 # snap), but in deployments where the control plane is not a snap, it will tell 203 # containerd to use this path for various account information for pods. 204 /run/secrets/kubernetes.io/{,**} rk, 205 ` 206 207 const dockerSupportConnectedPlugSecComp = ` 208 # Description: allow operating as the Docker daemon. This policy is 209 # intentionally not restrictive and is here to help guard against programming 210 # errors and not for security confinement. The Docker daemon by design requires 211 # extensive access to the system and cannot be effectively confined against 212 # malicious activity. 213 214 # Because seccomp may only go more strict, we must allow all syscalls to Docker 215 # that it expects to give to containers in addition to what it needs to run and 216 # trust that docker daemon # only gives out reasonable syscalls to containers. 217 218 # Docker includes these in the default container whitelist, but they're 219 # potentially dangerous. 220 #finit_module 221 #init_module 222 #query_module 223 #delete_module 224 225 # These have a history of vulnerabilities, are not widely used, and 226 # open_by_handle_at has been used to break out of Docker containers by brute 227 # forcing the handle value: http://stealth.openwall.net/xSports/shocker.c 228 #name_to_handle_at 229 #open_by_handle_at 230 231 # Calls the Docker daemon itself requires 232 233 # /snap/docker/VERSION/bin/docker-runc 234 # "do not inherit the parent's session keyring" 235 # "make session keyring searcheable" 236 # runC uses this to ensure the container doesn't have access to the host 237 # keyring 238 keyctl 239 240 # /snap/docker/VERSION/bin/docker-runc 241 pivot_root 242 243 # ptrace can be abused to break out of the seccomp sandbox 244 # but is required by the Docker daemon. 245 ptrace 246 247 # This list comes from Docker's default seccomp whitelist (which is applied to 248 # all containers launched unless a custom profile is specified or 249 # "--privileged" is used) 250 # https://github.com/docker/docker/blob/v1.12.0/profiles/seccomp/seccomp_default.go#L39-L1879 251 # It has been further filtered to exclude certain known-troublesome syscalls. 252 accept 253 accept4 254 access 255 acct 256 adjtimex 257 alarm 258 arch_prctl 259 bind 260 bpf 261 breakpoint 262 brk 263 cacheflush 264 capget 265 capset 266 chdir 267 chmod 268 chown 269 chown32 270 chroot 271 clock_getres 272 clock_getres_time64 273 clock_gettime 274 clock_gettime64 275 clock_nanosleep 276 clock_nanosleep_time64 277 clone 278 close 279 connect 280 copy_file_range 281 creat 282 dup 283 dup2 284 dup3 285 epoll_create 286 epoll_create1 287 epoll_ctl 288 epoll_ctl_old 289 epoll_pwait 290 epoll_wait 291 epoll_wait_old 292 eventfd 293 eventfd2 294 execve 295 execveat 296 exit 297 exit_group 298 faccessat 299 fadvise64 300 fadvise64_64 301 fallocate 302 fanotify_init 303 fanotify_mark 304 fchdir 305 fchmod 306 fchmodat 307 fchown 308 fchown32 309 fchownat 310 fcntl 311 fcntl64 312 fdatasync 313 fgetxattr 314 flistxattr 315 flock 316 fork 317 fremovexattr 318 fsetxattr 319 fstat 320 fstat64 321 fstatat64 322 fstatfs 323 fstatfs64 324 fsync 325 ftruncate 326 ftruncate64 327 futex 328 futex_time64 329 futimesat 330 getcpu 331 getcwd 332 getdents 333 getdents64 334 getegid 335 getegid32 336 geteuid 337 geteuid32 338 getgid 339 getgid32 340 getgroups 341 getgroups32 342 getitimer 343 getpeername 344 getpgid 345 getpgrp 346 getpid 347 getppid 348 getpriority 349 getrandom 350 getresgid 351 getresgid32 352 getresuid 353 getresuid32 354 getrlimit 355 get_robust_list 356 getrusage 357 getsid 358 getsockname 359 getsockopt 360 get_thread_area 361 get_tls 362 gettid 363 gettimeofday 364 getuid 365 getuid32 366 getxattr 367 inotify_add_watch 368 inotify_init 369 inotify_init1 370 inotify_rm_watch 371 io_cancel 372 ioctl 373 io_destroy 374 io_getevents 375 ioperm 376 iopl 377 ioprio_get 378 ioprio_set 379 io_setup 380 io_submit 381 ipc 382 kcmp 383 kill 384 lchown 385 lchown32 386 lgetxattr 387 link 388 linkat 389 listen 390 listxattr 391 llistxattr 392 _llseek 393 lookup_dcookie 394 lremovexattr 395 lseek 396 lsetxattr 397 lstat 398 lstat64 399 madvise 400 memfd_create 401 mincore 402 mkdir 403 mkdirat 404 mknod 405 mknodat 406 mlock 407 mlock2 408 mlockall 409 mmap 410 mmap2 411 modify_ldt 412 mount 413 mprotect 414 mq_getsetattr 415 mq_notify 416 mq_open 417 mq_timedreceive 418 mq_timedreceive_time64 419 mq_timedsend 420 mq_timedsend_time64 421 mq_unlink 422 mremap 423 msgctl 424 msgget 425 msgrcv 426 msgsnd 427 msync 428 munlock 429 munlockall 430 munmap 431 nanosleep 432 newfstatat 433 _newselect 434 open 435 openat 436 pause 437 perf_event_open 438 personality 439 pipe 440 pipe2 441 poll 442 ppoll 443 ppoll_time64 444 prctl 445 pread64 446 preadv 447 prlimit64 448 process_vm_readv 449 process_vm_writev 450 pselect6 451 pselect6_time64 452 pwrite64 453 pwritev 454 read 455 readahead 456 readlink 457 readlinkat 458 readv 459 reboot 460 recv 461 recvfrom 462 recvmmsg 463 recvmmsg_time64 464 recvmsg 465 remap_file_pages 466 removexattr 467 rename 468 renameat 469 renameat2 470 restart_syscall 471 rmdir 472 rt_sigaction 473 rt_sigpending 474 rt_sigprocmask 475 rt_sigqueueinfo 476 rt_sigreturn 477 rt_sigsuspend 478 rt_sigtimedwait 479 rt_sigtimedwait_time64 480 rt_tgsigqueueinfo 481 s390_pci_mmio_read 482 s390_pci_mmio_write 483 s390_runtime_instr 484 sched_getaffinity 485 sched_getattr 486 sched_getparam 487 sched_get_priority_max 488 sched_get_priority_min 489 sched_getscheduler 490 sched_rr_get_interval 491 sched_rr_get_interval_time64 492 sched_setaffinity 493 sched_setattr 494 sched_setparam 495 sched_setscheduler 496 sched_yield 497 seccomp 498 select 499 semctl 500 semget 501 semop 502 semtimedop 503 semtimedop_time64 504 send 505 sendfile 506 sendfile64 507 sendmmsg 508 sendmsg 509 sendto 510 setdomainname 511 setfsgid 512 setfsgid32 513 setfsuid 514 setfsuid32 515 setgid 516 setgid32 517 setgroups 518 setgroups32 519 sethostname 520 setitimer 521 setns 522 setpgid 523 setpriority 524 setregid 525 setregid32 526 setresgid 527 setresgid32 528 setresuid 529 setresuid32 530 setreuid 531 setreuid32 532 setrlimit 533 set_robust_list 534 setsid 535 setsockopt 536 set_thread_area 537 set_tid_address 538 settimeofday 539 set_tls 540 setuid 541 setuid32 542 setxattr 543 shmat 544 shmctl 545 shmdt 546 shmget 547 shutdown 548 sigaltstack 549 signalfd 550 signalfd4 551 sigreturn 552 socket 553 socketcall 554 socketpair 555 splice 556 stat 557 stat64 558 statfs 559 statfs64 560 stime 561 symlink 562 symlinkat 563 sync 564 sync_file_range 565 syncfs 566 sysinfo 567 syslog 568 tee 569 tgkill 570 time 571 timer_create 572 timer_delete 573 timerfd_create 574 timerfd_gettime 575 timerfd_gettime64 576 timerfd_settime 577 timerfd_settime64 578 timer_getoverrun 579 timer_gettime 580 timer_gettime64 581 timer_settime 582 timer_settime64 583 times 584 tkill 585 truncate 586 truncate64 587 ugetrlimit 588 umask 589 umount 590 umount2 591 uname 592 unlink 593 unlinkat 594 unshare 595 utime 596 utimensat 597 utimensat_time64 598 utimes 599 vfork 600 vhangup 601 vmsplice 602 wait4 603 waitid 604 waitpid 605 write 606 writev 607 ` 608 609 const dockerSupportPrivilegedAppArmor = ` 610 # Description: allow docker daemon to run privileged containers. This gives 611 # full access to all resources on the system and thus gives device ownership to 612 # connected snaps. 613 614 # These rules are here to allow Docker to launch unconfined containers but 615 # allow the docker daemon itself to go unconfined. Since it runs as root, this 616 # grants device ownership. 617 change_profile unsafe /**, 618 signal (send) peer=unconfined, 619 ptrace (read, trace) peer=unconfined, 620 621 # This grants raw access to device files and thus device ownership 622 /dev/** mrwkl, 623 @{PROC}/** mrwkl, 624 625 # When kubernetes drives docker/containerd, it creates and runs files in the 626 # container at arbitrary locations (eg, via pivot_root). 627 /** rwlix, 628 ` 629 630 const dockerSupportPrivilegedSecComp = ` 631 # Description: allow docker daemon to run privileged containers. This gives 632 # full access to all resources on the system and thus gives device ownership to 633 # connected snaps. 634 635 # This grants, among other things, kernel module loading and therefore device 636 # ownership. 637 @unrestricted 638 ` 639 640 const dockerSupportServiceSnippet = `Delegate=true` 641 642 type dockerSupportInterface struct { 643 commonInterface 644 } 645 646 func (iface *dockerSupportInterface) KModConnectedPlug(spec *kmod.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 647 // https://kubernetes.io/docs/setup/production-environment/container-runtimes/ 648 if err := spec.AddModule("overlay"); err != nil { 649 return err 650 } 651 return nil 652 } 653 654 func (iface *dockerSupportInterface) AppArmorConnectedPlug(spec *apparmor.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 655 var privileged bool 656 _ = plug.Attr("privileged-containers", &privileged) 657 658 // The 'change_profile unsafe' rules conflict with the 'ix' rules in 659 // the home interface, so suppress them (LP: #1797786) 660 spec.SetSuppressHomeIx() 661 spec.AddSnippet(dockerSupportConnectedPlugAppArmor) 662 if privileged { 663 spec.AddSnippet(dockerSupportPrivilegedAppArmor) 664 } 665 if !release.OnClassic { 666 spec.AddSnippet(dockerSupportConnectedPlugAppArmorCore) 667 } 668 spec.SetUsesPtraceTrace() 669 return nil 670 } 671 672 func (iface *dockerSupportInterface) SecCompConnectedPlug(spec *seccomp.Specification, plug *interfaces.ConnectedPlug, slot *interfaces.ConnectedSlot) error { 673 var privileged bool 674 _ = plug.Attr("privileged-containers", &privileged) 675 snippet := dockerSupportConnectedPlugSecComp 676 if privileged { 677 snippet += dockerSupportPrivilegedSecComp 678 } 679 spec.AddSnippet(snippet) 680 return nil 681 } 682 683 func (iface *dockerSupportInterface) BeforePreparePlug(plug *snap.PlugInfo) error { 684 if v, ok := plug.Attrs["privileged-containers"]; ok { 685 if _, ok = v.(bool); !ok { 686 return fmt.Errorf("docker-support plug requires bool with 'privileged-containers'") 687 } 688 } 689 return nil 690 } 691 692 func (iface *dockerSupportInterface) AutoConnect(*snap.PlugInfo, *snap.SlotInfo) bool { 693 // allow what declarations allowed 694 return true 695 } 696 697 func init() { 698 registerIface(&dockerSupportInterface{commonInterface{ 699 name: "docker-support", 700 summary: dockerSupportSummary, 701 implicitOnCore: true, 702 implicitOnClassic: true, 703 baseDeclarationPlugs: dockerSupportBaseDeclarationPlugs, 704 baseDeclarationSlots: dockerSupportBaseDeclarationSlots, 705 controlsDeviceCgroup: true, 706 serviceSnippets: []string{dockerSupportServiceSnippet}, 707 // docker-support also uses ptrace(trace), but it already declares this in 708 // the AppArmorConnectedPlug method 709 }}) 710 }