github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/test/syscalls/linux/seccomp.cc (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include <errno.h> 16 #include <linux/audit.h> 17 #include <linux/filter.h> 18 #include <linux/seccomp.h> 19 #include <pthread.h> 20 #include <sched.h> 21 #include <signal.h> 22 #include <string.h> 23 #include <sys/prctl.h> 24 #include <sys/syscall.h> 25 #include <time.h> 26 #include <ucontext.h> 27 #include <unistd.h> 28 29 #include <atomic> 30 31 #include "gmock/gmock.h" 32 #include "gtest/gtest.h" 33 #include "absl/base/macros.h" 34 #include "test/util/logging.h" 35 #include "test/util/memory_util.h" 36 #include "test/util/multiprocess_util.h" 37 #include "test/util/posix_error.h" 38 #include "test/util/proc_util.h" 39 #include "test/util/test_util.h" 40 #include "test/util/thread_util.h" 41 42 #ifndef SYS_SECCOMP 43 #define SYS_SECCOMP 1 44 #endif 45 46 namespace gvisor { 47 namespace testing { 48 49 namespace { 50 51 // A syscall not implemented by Linux that we don't expect to be called. 52 #ifdef __x86_64__ 53 constexpr uint32_t kFilteredSyscall = SYS_vserver; 54 #elif __aarch64__ 55 // Use the last of arch_specific_syscalls which are not implemented on arm64. 56 constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15; 57 #endif 58 59 // Applies a seccomp-bpf filter that returns `filtered_result` for 60 // `sysno` and allows all other syscalls. Async-signal-safe. 61 void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result, 62 uint32_t flags = 0) { 63 // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS, 64 // 1) or run with CAP_SYS_ADMIN privileges in its namespace." - 65 // Documentation/prctl/seccomp_filter.txt 66 // 67 // prctl(PR_SET_NO_NEW_PRIVS, 1) may be called repeatedly; calls after the 68 // first are no-ops. 69 TEST_PCHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0); 70 MaybeSave(); 71 72 struct sock_filter filter[] = { 73 // A = seccomp_data.arch 74 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4), 75 #if defined(__x86_64__) 76 // if (A != AUDIT_ARCH_X86_64) goto kill 77 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4), 78 #elif defined(__aarch64__) 79 // if (A != AUDIT_ARCH_AARCH64) goto kill 80 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4), 81 #else 82 #error "Unknown architecture" 83 #endif 84 // A = seccomp_data.nr 85 BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0), 86 // if (A != sysno) goto allow 87 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1), 88 // return filtered_result 89 BPF_STMT(BPF_RET | BPF_K, filtered_result), 90 // allow: return SECCOMP_RET_ALLOW 91 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), 92 // kill: return SECCOMP_RET_KILL 93 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL), 94 }; 95 struct sock_fprog prog; 96 prog.len = ABSL_ARRAYSIZE(filter); 97 prog.filter = filter; 98 if (flags) { 99 TEST_CHECK(syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog) == 100 0); 101 } else { 102 TEST_PCHECK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == 0); 103 } 104 MaybeSave(); 105 } 106 107 // Wrapper for sigaction. Async-signal-safe. 108 void RegisterSignalHandler(int signum, 109 void (*handler)(int, siginfo_t*, void*)) { 110 struct sigaction sa = {}; 111 sa.sa_sigaction = handler; 112 sigemptyset(&sa.sa_mask); 113 sa.sa_flags = SA_SIGINFO; 114 TEST_PCHECK(sigaction(signum, &sa, nullptr) == 0); 115 MaybeSave(); 116 } 117 118 // All of the following tests execute in a subprocess to ensure that each test 119 // is run in a separate process. This avoids cross-contamination of seccomp 120 // state between tests, and is necessary to ensure that test processes killed 121 // by SECCOMP_RET_KILL are single-threaded (since SECCOMP_RET_KILL only kills 122 // the offending thread, not the whole thread group). 123 124 TEST(SeccompTest, RetKillCausesDeathBySIGSYS) { 125 pid_t const pid = fork(); 126 if (pid == 0) { 127 // Register a signal handler for SIGSYS that we don't expect to be invoked. 128 RegisterSignalHandler( 129 SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); 130 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); 131 syscall(kFilteredSyscall); 132 TEST_CHECK_MSG(false, "Survived invocation of test syscall"); 133 } 134 ASSERT_THAT(pid, SyscallSucceeds()); 135 int status; 136 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 137 EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) 138 << "status " << status; 139 } 140 141 TEST(SeccompTest, RetKillOnlyKillsOneThread) { 142 Mapping stack = ASSERT_NO_ERRNO_AND_VALUE( 143 MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); 144 145 pid_t const pid = fork(); 146 if (pid == 0) { 147 // Register a signal handler for SIGSYS that we don't expect to be invoked. 148 RegisterSignalHandler( 149 SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); 150 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); 151 // Pass CLONE_VFORK to block the original thread in the child process until 152 // the clone thread exits with SIGSYS. 153 // 154 // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's 155 // x86_64 implementation is safe. See glibc 156 // sysdeps/unix/sysv/linux/x86_64/clone.S. 157 clone( 158 +[](void* arg) { 159 syscall(kFilteredSyscall); // should kill the thread 160 _exit(1); // should be unreachable 161 return 2; // should be very unreachable, shut up the compiler 162 }, 163 stack.endptr(), 164 CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | 165 CLONE_VFORK, 166 nullptr); 167 _exit(0); 168 } 169 ASSERT_THAT(pid, SyscallSucceeds()); 170 int status; 171 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 172 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 173 << "status " << status; 174 } 175 176 TEST(SeccompTest, RetTrapCausesSIGSYS) { 177 pid_t const pid = fork(); 178 if (pid == 0) { 179 constexpr uint16_t kTrapValue = 0xdead; 180 RegisterSignalHandler( 181 SIGSYS, +[](int signo, siginfo_t* info, void* ucv) { 182 ucontext_t* uc = static_cast<ucontext_t*>(ucv); 183 // This is a signal handler, so we must stay async-signal-safe. 184 TEST_CHECK(info->si_signo == SIGSYS); 185 TEST_CHECK(info->si_code == SYS_SECCOMP); 186 TEST_CHECK(info->si_errno == kTrapValue); 187 TEST_CHECK(info->si_call_addr != nullptr); 188 TEST_CHECK(info->si_syscall == kFilteredSyscall); 189 #if defined(__x86_64__) 190 TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); 191 TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall); 192 #elif defined(__aarch64__) 193 TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64); 194 TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall); 195 #endif // defined(__x86_64__) 196 _exit(0); 197 }); 198 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRAP | kTrapValue); 199 syscall(kFilteredSyscall); 200 TEST_CHECK_MSG(false, "Survived invocation of test syscall"); 201 } 202 ASSERT_THAT(pid, SyscallSucceeds()); 203 int status; 204 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 205 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 206 << "status " << status; 207 } 208 209 #ifdef __x86_64__ 210 211 constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400; 212 213 time_t vsyscall_time(time_t* t) { 214 return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t); 215 } 216 217 TEST(SeccompTest, SeccompAppliesToVsyscall) { 218 SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); 219 220 pid_t const pid = fork(); 221 if (pid == 0) { 222 constexpr uint16_t kTrapValue = 0xdead; 223 RegisterSignalHandler( 224 SIGSYS, +[](int signo, siginfo_t* info, void* ucv) { 225 ucontext_t* uc = static_cast<ucontext_t*>(ucv); 226 // This is a signal handler, so we must stay async-signal-safe. 227 TEST_CHECK(info->si_signo == SIGSYS); 228 TEST_CHECK(info->si_code == SYS_SECCOMP); 229 TEST_CHECK(info->si_errno == kTrapValue); 230 TEST_CHECK(info->si_call_addr != nullptr); 231 TEST_CHECK(info->si_syscall == SYS_time); 232 TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64); 233 TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == SYS_time); 234 _exit(0); 235 }); 236 ApplySeccompFilter(SYS_time, SECCOMP_RET_TRAP | kTrapValue); 237 vsyscall_time(nullptr); // Should result in death. 238 TEST_CHECK_MSG(false, "Survived invocation of test syscall"); 239 } 240 ASSERT_THAT(pid, SyscallSucceeds()); 241 int status; 242 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 243 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 244 << "status " << status; 245 } 246 247 TEST(SeccompTest, RetKillVsyscallCausesDeathBySIGSYS) { 248 SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled())); 249 250 pid_t const pid = fork(); 251 if (pid == 0) { 252 // Register a signal handler for SIGSYS that we don't expect to be invoked. 253 RegisterSignalHandler( 254 SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); 255 ApplySeccompFilter(SYS_time, SECCOMP_RET_KILL); 256 vsyscall_time(nullptr); // Should result in death. 257 TEST_CHECK_MSG(false, "Survived invocation of test syscall"); 258 } 259 ASSERT_THAT(pid, SyscallSucceeds()); 260 int status; 261 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 262 EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) 263 << "status " << status; 264 } 265 266 #endif // defined(__x86_64__) 267 268 TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) { 269 pid_t const pid = fork(); 270 if (pid == 0) { 271 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE); 272 TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); 273 _exit(0); 274 } 275 ASSERT_THAT(pid, SyscallSucceeds()); 276 int status; 277 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 278 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 279 << "status " << status; 280 } 281 282 TEST(SeccompTest, RetErrnoReturnsErrno) { 283 pid_t const pid = fork(); 284 if (pid == 0) { 285 // ENOTNAM: "Not a XENIX named type file" 286 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM); 287 TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM); 288 _exit(0); 289 } 290 ASSERT_THAT(pid, SyscallSucceeds()); 291 int status; 292 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 293 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 294 << "status " << status; 295 } 296 297 TEST(SeccompTest, RetAllowAllowsSyscall) { 298 pid_t const pid = fork(); 299 if (pid == 0) { 300 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ALLOW); 301 TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); 302 _exit(0); 303 } 304 ASSERT_THAT(pid, SyscallSucceeds()); 305 int status; 306 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 307 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 308 << "status " << status; 309 } 310 311 // This test will validate that TSYNC will apply to all threads. 312 TEST(SeccompTest, TsyncAppliesToAllThreads) { 313 Mapping stack = ASSERT_NO_ERRNO_AND_VALUE( 314 MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); 315 316 // We don't want to apply this policy to other test runner threads, so fork. 317 const pid_t pid = fork(); 318 319 if (pid == 0) { 320 // First check that we receive a ENOSYS before the policy is applied. 321 TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS); 322 323 // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's 324 // x86_64 implementation is safe. See glibc 325 // sysdeps/unix/sysv/linux/x86_64/clone.S. 326 clone( 327 +[](void* arg) { 328 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM, 329 SECCOMP_FILTER_FLAG_TSYNC); 330 return 0; 331 }, 332 stack.endptr(), 333 CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | 334 CLONE_VFORK, 335 nullptr); 336 337 // Because we're using CLONE_VFORK this thread will be blocked until 338 // the second thread has released resources to our virtual memory, since 339 // we're not execing that will happen on _exit. 340 341 // Now verify that the policy applied to this thread too. 342 TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM); 343 _exit(0); 344 } 345 346 ASSERT_THAT(pid, SyscallSucceeds()); 347 int status = 0; 348 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 349 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 350 << "status " << status; 351 } 352 353 // This test will validate that seccomp(2) rejects unsupported flags. 354 TEST(SeccompTest, SeccompRejectsUnknownFlags) { 355 constexpr uint32_t kInvalidFlag = 123; 356 ASSERT_THAT( 357 syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr), 358 SyscallFailsWithErrno(EINVAL)); 359 } 360 361 TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) { 362 // This is RetKillCausesDeathBySIGSYS, plus extra filters before and after the 363 // one that causes the kill that should be ignored. 364 pid_t const pid = fork(); 365 if (pid == 0) { 366 RegisterSignalHandler( 367 SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); }); 368 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE); 369 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); 370 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM); 371 syscall(kFilteredSyscall); 372 TEST_CHECK_MSG(false, "Survived invocation of test syscall"); 373 } 374 ASSERT_THAT(pid, SyscallSucceeds()); 375 int status; 376 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 377 EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS) 378 << "status " << status; 379 } 380 381 // Passed as argv[1] to cause the test binary to invoke kFilteredSyscall and 382 // exit. Not a real flag since flag parsing happens during initialization, 383 // which may create threads. 384 constexpr char kInvokeFilteredSyscallFlag[] = "--seccomp_test_child"; 385 386 TEST(SeccompTest, FiltersPreservedAcrossForkAndExecve) { 387 ExecveArray const grandchild_argv( 388 {"/proc/self/exe", kInvokeFilteredSyscallFlag}); 389 390 pid_t const pid = fork(); 391 if (pid == 0) { 392 ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL); 393 pid_t const grandchild_pid = fork(); 394 if (grandchild_pid == 0) { 395 execve(grandchild_argv.get()[0], grandchild_argv.get(), 396 /* envp = */ nullptr); 397 TEST_PCHECK_MSG(false, "execve failed"); 398 } 399 int status; 400 TEST_PCHECK(waitpid(grandchild_pid, &status, 0) == grandchild_pid); 401 TEST_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS); 402 _exit(0); 403 } 404 ASSERT_THAT(pid, SyscallSucceeds()); 405 int status; 406 ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid)); 407 EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) 408 << "status " << status; 409 } 410 411 } // namespace 412 413 } // namespace testing 414 } // namespace gvisor 415 416 int main(int argc, char** argv) { 417 if (argc >= 2 && 418 strcmp(argv[1], gvisor::testing::kInvokeFilteredSyscallFlag) == 0) { 419 syscall(gvisor::testing::kFilteredSyscall); 420 exit(0); 421 } 422 423 gvisor::testing::TestInit(&argc, &argv); 424 return gvisor::testing::RunAllTests(); 425 }