github.com/containerd/Containerd@v1.4.13/contrib/seccomp/seccomp_default.go (about)

     1  // +build linux
     2  
     3  /*
     4     Copyright The containerd Authors.
     5  
     6     Licensed under the Apache License, Version 2.0 (the "License");
     7     you may not use this file except in compliance with the License.
     8     You may obtain a copy of the License at
     9  
    10         http://www.apache.org/licenses/LICENSE-2.0
    11  
    12     Unless required by applicable law or agreed to in writing, software
    13     distributed under the License is distributed on an "AS IS" BASIS,
    14     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15     See the License for the specific language governing permissions and
    16     limitations under the License.
    17  */
    18  
    19  package seccomp
    20  
    21  import (
    22  	"runtime"
    23  
    24  	"golang.org/x/sys/unix"
    25  
    26  	"github.com/opencontainers/runtime-spec/specs-go"
    27  )
    28  
    29  func arches() []specs.Arch {
    30  	switch runtime.GOARCH {
    31  	case "amd64":
    32  		return []specs.Arch{specs.ArchX86_64, specs.ArchX86, specs.ArchX32}
    33  	case "arm64":
    34  		return []specs.Arch{specs.ArchARM, specs.ArchAARCH64}
    35  	case "mips64":
    36  		return []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64, specs.ArchMIPS64N32}
    37  	case "mips64n32":
    38  		return []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64, specs.ArchMIPS64N32}
    39  	case "mipsel64":
    40  		return []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64, specs.ArchMIPSEL64N32}
    41  	case "mipsel64n32":
    42  		return []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64, specs.ArchMIPSEL64N32}
    43  	case "s390x":
    44  		return []specs.Arch{specs.ArchS390, specs.ArchS390X}
    45  	default:
    46  		return []specs.Arch{}
    47  	}
    48  }
    49  
    50  // DefaultProfile defines the allowed syscalls for the default seccomp profile.
    51  func DefaultProfile(sp *specs.Spec) *specs.LinuxSeccomp {
    52  	nosys := uint(unix.ENOSYS)
    53  	syscalls := []specs.LinuxSyscall{
    54  		{
    55  			Names: []string{
    56  				"accept",
    57  				"accept4",
    58  				"access",
    59  				"adjtimex",
    60  				"alarm",
    61  				"bind",
    62  				"brk",
    63  				"capget",
    64  				"capset",
    65  				"chdir",
    66  				"chmod",
    67  				"chown",
    68  				"chown32",
    69  				"clock_adjtime",
    70  				"clock_adjtime64",
    71  				"clock_getres",
    72  				"clock_getres_time64",
    73  				"clock_gettime",
    74  				"clock_gettime64",
    75  				"clock_nanosleep",
    76  				"clock_nanosleep_time64",
    77  				"close",
    78  				"connect",
    79  				"copy_file_range",
    80  				"creat",
    81  				"dup",
    82  				"dup2",
    83  				"dup3",
    84  				"epoll_create",
    85  				"epoll_create1",
    86  				"epoll_ctl",
    87  				"epoll_ctl_old",
    88  				"epoll_pwait",
    89  				"epoll_wait",
    90  				"epoll_wait_old",
    91  				"eventfd",
    92  				"eventfd2",
    93  				"execve",
    94  				"execveat",
    95  				"exit",
    96  				"exit_group",
    97  				"faccessat",
    98  				"faccessat2",
    99  				"fadvise64",
   100  				"fadvise64_64",
   101  				"fallocate",
   102  				"fanotify_mark",
   103  				"fchdir",
   104  				"fchmod",
   105  				"fchmodat",
   106  				"fchown",
   107  				"fchown32",
   108  				"fchownat",
   109  				"fcntl",
   110  				"fcntl64",
   111  				"fdatasync",
   112  				"fgetxattr",
   113  				"flistxattr",
   114  				"flock",
   115  				"fork",
   116  				"fremovexattr",
   117  				"fsetxattr",
   118  				"fstat",
   119  				"fstat64",
   120  				"fstatat64",
   121  				"fstatfs",
   122  				"fstatfs64",
   123  				"fsync",
   124  				"ftruncate",
   125  				"ftruncate64",
   126  				"futex",
   127  				"futex_time64",
   128  				"futimesat",
   129  				"getcpu",
   130  				"getcwd",
   131  				"getdents",
   132  				"getdents64",
   133  				"getegid",
   134  				"getegid32",
   135  				"geteuid",
   136  				"geteuid32",
   137  				"getgid",
   138  				"getgid32",
   139  				"getgroups",
   140  				"getgroups32",
   141  				"getitimer",
   142  				"getpeername",
   143  				"getpgid",
   144  				"getpgrp",
   145  				"getpid",
   146  				"getppid",
   147  				"getpriority",
   148  				"getrandom",
   149  				"getresgid",
   150  				"getresgid32",
   151  				"getresuid",
   152  				"getresuid32",
   153  				"getrlimit",
   154  				"get_robust_list",
   155  				"getrusage",
   156  				"getsid",
   157  				"getsockname",
   158  				"getsockopt",
   159  				"get_thread_area",
   160  				"gettid",
   161  				"gettimeofday",
   162  				"getuid",
   163  				"getuid32",
   164  				"getxattr",
   165  				"inotify_add_watch",
   166  				"inotify_init",
   167  				"inotify_init1",
   168  				"inotify_rm_watch",
   169  				"io_cancel",
   170  				"ioctl",
   171  				"io_destroy",
   172  				"io_getevents",
   173  				"io_pgetevents",
   174  				"io_pgetevents_time64",
   175  				"ioprio_get",
   176  				"ioprio_set",
   177  				"io_setup",
   178  				"io_submit",
   179  				"io_uring_enter",
   180  				"io_uring_register",
   181  				"io_uring_setup",
   182  				"ipc",
   183  				"kill",
   184  				"lchown",
   185  				"lchown32",
   186  				"lgetxattr",
   187  				"link",
   188  				"linkat",
   189  				"listen",
   190  				"listxattr",
   191  				"llistxattr",
   192  				"_llseek",
   193  				"lremovexattr",
   194  				"lseek",
   195  				"lsetxattr",
   196  				"lstat",
   197  				"lstat64",
   198  				"madvise",
   199  				"membarrier",
   200  				"memfd_create",
   201  				"mincore",
   202  				"mkdir",
   203  				"mkdirat",
   204  				"mknod",
   205  				"mknodat",
   206  				"mlock",
   207  				"mlock2",
   208  				"mlockall",
   209  				"mmap",
   210  				"mmap2",
   211  				"mprotect",
   212  				"mq_getsetattr",
   213  				"mq_notify",
   214  				"mq_open",
   215  				"mq_timedreceive",
   216  				"mq_timedreceive_time64",
   217  				"mq_timedsend",
   218  				"mq_timedsend_time64",
   219  				"mq_unlink",
   220  				"mremap",
   221  				"msgctl",
   222  				"msgget",
   223  				"msgrcv",
   224  				"msgsnd",
   225  				"msync",
   226  				"munlock",
   227  				"munlockall",
   228  				"munmap",
   229  				"nanosleep",
   230  				"newfstatat",
   231  				"_newselect",
   232  				"open",
   233  				"openat",
   234  				"openat2",
   235  				"pause",
   236  				"pidfd_open",
   237  				"pidfd_send_signal",
   238  				"pipe",
   239  				"pipe2",
   240  				"poll",
   241  				"ppoll",
   242  				"ppoll_time64",
   243  				"prctl",
   244  				"pread64",
   245  				"preadv",
   246  				"preadv2",
   247  				"prlimit64",
   248  				"pselect6",
   249  				"pselect6_time64",
   250  				"pwrite64",
   251  				"pwritev",
   252  				"pwritev2",
   253  				"read",
   254  				"readahead",
   255  				"readlink",
   256  				"readlinkat",
   257  				"readv",
   258  				"recv",
   259  				"recvfrom",
   260  				"recvmmsg",
   261  				"recvmmsg_time64",
   262  				"recvmsg",
   263  				"remap_file_pages",
   264  				"removexattr",
   265  				"rename",
   266  				"renameat",
   267  				"renameat2",
   268  				"restart_syscall",
   269  				"rmdir",
   270  				"rseq",
   271  				"rt_sigaction",
   272  				"rt_sigpending",
   273  				"rt_sigprocmask",
   274  				"rt_sigqueueinfo",
   275  				"rt_sigreturn",
   276  				"rt_sigsuspend",
   277  				"rt_sigtimedwait",
   278  				"rt_sigtimedwait_time64",
   279  				"rt_tgsigqueueinfo",
   280  				"sched_getaffinity",
   281  				"sched_getattr",
   282  				"sched_getparam",
   283  				"sched_get_priority_max",
   284  				"sched_get_priority_min",
   285  				"sched_getscheduler",
   286  				"sched_rr_get_interval",
   287  				"sched_rr_get_interval_time64",
   288  				"sched_setaffinity",
   289  				"sched_setattr",
   290  				"sched_setparam",
   291  				"sched_setscheduler",
   292  				"sched_yield",
   293  				"seccomp",
   294  				"select",
   295  				"semctl",
   296  				"semget",
   297  				"semop",
   298  				"semtimedop",
   299  				"semtimedop_time64",
   300  				"send",
   301  				"sendfile",
   302  				"sendfile64",
   303  				"sendmmsg",
   304  				"sendmsg",
   305  				"sendto",
   306  				"setfsgid",
   307  				"setfsgid32",
   308  				"setfsuid",
   309  				"setfsuid32",
   310  				"setgid",
   311  				"setgid32",
   312  				"setgroups",
   313  				"setgroups32",
   314  				"setitimer",
   315  				"setpgid",
   316  				"setpriority",
   317  				"setregid",
   318  				"setregid32",
   319  				"setresgid",
   320  				"setresgid32",
   321  				"setresuid",
   322  				"setresuid32",
   323  				"setreuid",
   324  				"setreuid32",
   325  				"setrlimit",
   326  				"set_robust_list",
   327  				"setsid",
   328  				"setsockopt",
   329  				"set_thread_area",
   330  				"set_tid_address",
   331  				"setuid",
   332  				"setuid32",
   333  				"setxattr",
   334  				"shmat",
   335  				"shmctl",
   336  				"shmdt",
   337  				"shmget",
   338  				"shutdown",
   339  				"sigaltstack",
   340  				"signalfd",
   341  				"signalfd4",
   342  				"sigprocmask",
   343  				"sigreturn",
   344  				"socket",
   345  				"socketcall",
   346  				"socketpair",
   347  				"splice",
   348  				"stat",
   349  				"stat64",
   350  				"statfs",
   351  				"statfs64",
   352  				"statx",
   353  				"symlink",
   354  				"symlinkat",
   355  				"sync",
   356  				"sync_file_range",
   357  				"syncfs",
   358  				"sysinfo",
   359  				"tee",
   360  				"tgkill",
   361  				"time",
   362  				"timer_create",
   363  				"timer_delete",
   364  				"timer_getoverrun",
   365  				"timer_gettime",
   366  				"timer_gettime64",
   367  				"timer_settime",
   368  				"timer_settime64",
   369  				"timerfd_create",
   370  				"timerfd_gettime",
   371  				"timerfd_gettime64",
   372  				"timerfd_settime",
   373  				"timerfd_settime64",
   374  				"times",
   375  				"tkill",
   376  				"truncate",
   377  				"truncate64",
   378  				"ugetrlimit",
   379  				"umask",
   380  				"uname",
   381  				"unlink",
   382  				"unlinkat",
   383  				"utime",
   384  				"utimensat",
   385  				"utimensat_time64",
   386  				"utimes",
   387  				"vfork",
   388  				"vmsplice",
   389  				"wait4",
   390  				"waitid",
   391  				"waitpid",
   392  				"write",
   393  				"writev",
   394  			},
   395  			Action: specs.ActAllow,
   396  			Args:   []specs.LinuxSeccompArg{},
   397  		},
   398  		{
   399  			Names:  []string{"personality"},
   400  			Action: specs.ActAllow,
   401  			Args: []specs.LinuxSeccompArg{
   402  				{
   403  					Index: 0,
   404  					Value: 0x0,
   405  					Op:    specs.OpEqualTo,
   406  				},
   407  			},
   408  		},
   409  		{
   410  			Names:  []string{"personality"},
   411  			Action: specs.ActAllow,
   412  			Args: []specs.LinuxSeccompArg{
   413  				{
   414  					Index: 0,
   415  					Value: 0x0008,
   416  					Op:    specs.OpEqualTo,
   417  				},
   418  			},
   419  		},
   420  		{
   421  			Names:  []string{"personality"},
   422  			Action: specs.ActAllow,
   423  			Args: []specs.LinuxSeccompArg{
   424  				{
   425  					Index: 0,
   426  					Value: 0x20000,
   427  					Op:    specs.OpEqualTo,
   428  				},
   429  			},
   430  		},
   431  		{
   432  			Names:  []string{"personality"},
   433  			Action: specs.ActAllow,
   434  			Args: []specs.LinuxSeccompArg{
   435  				{
   436  					Index: 0,
   437  					Value: 0x20008,
   438  					Op:    specs.OpEqualTo,
   439  				},
   440  			},
   441  		},
   442  		{
   443  			Names:  []string{"personality"},
   444  			Action: specs.ActAllow,
   445  			Args: []specs.LinuxSeccompArg{
   446  				{
   447  					Index: 0,
   448  					Value: 0xffffffff,
   449  					Op:    specs.OpEqualTo,
   450  				},
   451  			},
   452  		},
   453  	}
   454  
   455  	s := &specs.LinuxSeccomp{
   456  		DefaultAction: specs.ActErrno,
   457  		Architectures: arches(),
   458  		Syscalls:      syscalls,
   459  	}
   460  
   461  	// include by arch
   462  	switch runtime.GOARCH {
   463  	case "ppc64le":
   464  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   465  			Names: []string{
   466  				"sync_file_range2",
   467  			},
   468  			Action: specs.ActAllow,
   469  			Args:   []specs.LinuxSeccompArg{},
   470  		})
   471  	case "arm", "arm64":
   472  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   473  			Names: []string{
   474  				"arm_fadvise64_64",
   475  				"arm_sync_file_range",
   476  				"sync_file_range2",
   477  				"breakpoint",
   478  				"cacheflush",
   479  				"set_tls",
   480  			},
   481  			Action: specs.ActAllow,
   482  			Args:   []specs.LinuxSeccompArg{},
   483  		})
   484  	case "amd64":
   485  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   486  			Names: []string{
   487  				"arch_prctl",
   488  				"modify_ldt",
   489  			},
   490  			Action: specs.ActAllow,
   491  			Args:   []specs.LinuxSeccompArg{},
   492  		})
   493  	case "386":
   494  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   495  			Names: []string{
   496  				"modify_ldt",
   497  			},
   498  			Action: specs.ActAllow,
   499  			Args:   []specs.LinuxSeccompArg{},
   500  		})
   501  	case "s390", "s390x":
   502  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   503  			Names: []string{
   504  				"s390_pci_mmio_read",
   505  				"s390_pci_mmio_write",
   506  				"s390_runtime_instr",
   507  			},
   508  			Action: specs.ActAllow,
   509  			Args:   []specs.LinuxSeccompArg{},
   510  		})
   511  	}
   512  
   513  	admin := false
   514  	for _, c := range sp.Process.Capabilities.Bounding {
   515  		switch c {
   516  		case "CAP_DAC_READ_SEARCH":
   517  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   518  				Names:  []string{"open_by_handle_at"},
   519  				Action: specs.ActAllow,
   520  				Args:   []specs.LinuxSeccompArg{},
   521  			})
   522  		case "CAP_SYS_ADMIN":
   523  			admin = true
   524  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   525  				Names: []string{
   526  					"bpf",
   527  					"clone",
   528  					"clone3",
   529  					"fanotify_init",
   530  					"lookup_dcookie",
   531  					"mount",
   532  					"name_to_handle_at",
   533  					"perf_event_open",
   534  					"quotactl",
   535  					"setdomainname",
   536  					"sethostname",
   537  					"setns",
   538  					"syslog",
   539  					"umount",
   540  					"umount2",
   541  					"unshare",
   542  				},
   543  				Action: specs.ActAllow,
   544  				Args:   []specs.LinuxSeccompArg{},
   545  			})
   546  		case "CAP_SYS_BOOT":
   547  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   548  				Names:  []string{"reboot"},
   549  				Action: specs.ActAllow,
   550  				Args:   []specs.LinuxSeccompArg{},
   551  			})
   552  		case "CAP_SYS_CHROOT":
   553  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   554  				Names:  []string{"chroot"},
   555  				Action: specs.ActAllow,
   556  				Args:   []specs.LinuxSeccompArg{},
   557  			})
   558  		case "CAP_SYS_MODULE":
   559  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   560  				Names: []string{
   561  					"delete_module",
   562  					"init_module",
   563  					"finit_module",
   564  				},
   565  				Action: specs.ActAllow,
   566  				Args:   []specs.LinuxSeccompArg{},
   567  			})
   568  		case "CAP_SYS_PACCT":
   569  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   570  				Names:  []string{"acct"},
   571  				Action: specs.ActAllow,
   572  				Args:   []specs.LinuxSeccompArg{},
   573  			})
   574  		case "CAP_SYS_PTRACE":
   575  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   576  				Names: []string{
   577  					"kcmp",
   578  					"pidfd_getfd",
   579  					"process_vm_readv",
   580  					"process_vm_writev",
   581  					"ptrace",
   582  				},
   583  				Action: specs.ActAllow,
   584  				Args:   []specs.LinuxSeccompArg{},
   585  			})
   586  		case "CAP_SYS_RAWIO":
   587  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   588  				Names: []string{
   589  					"iopl",
   590  					"ioperm",
   591  				},
   592  				Action: specs.ActAllow,
   593  				Args:   []specs.LinuxSeccompArg{},
   594  			})
   595  		case "CAP_SYS_TIME":
   596  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   597  				Names: []string{
   598  					"settimeofday",
   599  					"stime",
   600  					"clock_settime",
   601  				},
   602  				Action: specs.ActAllow,
   603  				Args:   []specs.LinuxSeccompArg{},
   604  			})
   605  		case "CAP_SYS_TTY_CONFIG":
   606  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   607  				Names:  []string{"vhangup"},
   608  				Action: specs.ActAllow,
   609  				Args:   []specs.LinuxSeccompArg{},
   610  			})
   611  		case "CAP_SYSLOG":
   612  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   613  				Names:  []string{"syslog"},
   614  				Action: specs.ActAllow,
   615  				Args:   []specs.LinuxSeccompArg{},
   616  			})
   617  		}
   618  	}
   619  
   620  	if !admin {
   621  		switch runtime.GOARCH {
   622  		case "s390", "s390x":
   623  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   624  				Names: []string{
   625  					"clone",
   626  				},
   627  				Action: specs.ActAllow,
   628  				Args: []specs.LinuxSeccompArg{
   629  					{
   630  						Index:    1,
   631  						Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
   632  						ValueTwo: 0,
   633  						Op:       specs.OpMaskedEqual,
   634  					},
   635  				},
   636  			})
   637  		default:
   638  			s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   639  				Names: []string{
   640  					"clone",
   641  				},
   642  				Action: specs.ActAllow,
   643  				Args: []specs.LinuxSeccompArg{
   644  					{
   645  						Index:    0,
   646  						Value:    unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
   647  						ValueTwo: 0,
   648  						Op:       specs.OpMaskedEqual,
   649  					},
   650  				},
   651  			})
   652  		}
   653  		// clone3 is explicitly requested to give ENOSYS instead of the default EPERM, when CAP_SYS_ADMIN is unset
   654  		// https://github.com/moby/moby/pull/42681
   655  		s.Syscalls = append(s.Syscalls, specs.LinuxSyscall{
   656  			Names: []string{
   657  				"clone3",
   658  			},
   659  			Action:   specs.ActErrno,
   660  			ErrnoRet: &nosys,
   661  		})
   662  	}
   663  
   664  	return s
   665  }