github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/test/syscalls/linux/seccomp.cc (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include <errno.h>
    16  #include <linux/audit.h>
    17  #include <linux/filter.h>
    18  #include <linux/seccomp.h>
    19  #include <pthread.h>
    20  #include <sched.h>
    21  #include <signal.h>
    22  #include <string.h>
    23  #include <sys/prctl.h>
    24  #include <sys/syscall.h>
    25  #include <time.h>
    26  #include <ucontext.h>
    27  #include <unistd.h>
    28  
    29  #include <atomic>
    30  
    31  #include "gmock/gmock.h"
    32  #include "gtest/gtest.h"
    33  #include "absl/base/macros.h"
    34  #include "test/util/logging.h"
    35  #include "test/util/memory_util.h"
    36  #include "test/util/multiprocess_util.h"
    37  #include "test/util/posix_error.h"
    38  #include "test/util/proc_util.h"
    39  #include "test/util/test_util.h"
    40  #include "test/util/thread_util.h"
    41  
    42  #ifndef SYS_SECCOMP
    43  #define SYS_SECCOMP 1
    44  #endif
    45  
    46  namespace gvisor {
    47  namespace testing {
    48  
    49  namespace {
    50  
    51  // A syscall not implemented by Linux that we don't expect to be called.
    52  #ifdef __x86_64__
    53  constexpr uint32_t kFilteredSyscall = SYS_vserver;
    54  #elif __aarch64__
    55  // Use the last of arch_specific_syscalls which are not implemented on arm64.
    56  constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
    57  #endif
    58  
    59  // Applies a seccomp-bpf filter that returns `filtered_result` for
    60  // `sysno` and allows all other syscalls. Async-signal-safe.
    61  void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
    62                          uint32_t flags = 0) {
    63    // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS,
    64    // 1) or run with CAP_SYS_ADMIN privileges in its namespace." -
    65    // Documentation/prctl/seccomp_filter.txt
    66    //
    67    // prctl(PR_SET_NO_NEW_PRIVS, 1) may be called repeatedly; calls after the
    68    // first are no-ops.
    69    TEST_PCHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0);
    70    MaybeSave();
    71  
    72    struct sock_filter filter[] = {
    73      // A = seccomp_data.arch
    74      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
    75  #if defined(__x86_64__)
    76      // if (A != AUDIT_ARCH_X86_64) goto kill
    77      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
    78  #elif defined(__aarch64__)
    79      // if (A != AUDIT_ARCH_AARCH64) goto kill
    80      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
    81  #else
    82  #error "Unknown architecture"
    83  #endif
    84      // A = seccomp_data.nr
    85      BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
    86      // if (A != sysno) goto allow
    87      BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
    88      // return filtered_result
    89      BPF_STMT(BPF_RET | BPF_K, filtered_result),
    90      // allow: return SECCOMP_RET_ALLOW
    91      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    92      // kill: return SECCOMP_RET_KILL
    93      BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
    94    };
    95    struct sock_fprog prog;
    96    prog.len = ABSL_ARRAYSIZE(filter);
    97    prog.filter = filter;
    98    if (flags) {
    99      TEST_CHECK(syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog) ==
   100                 0);
   101    } else {
   102      TEST_PCHECK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == 0);
   103    }
   104    MaybeSave();
   105  }
   106  
   107  // Wrapper for sigaction. Async-signal-safe.
   108  void RegisterSignalHandler(int signum,
   109                             void (*handler)(int, siginfo_t*, void*)) {
   110    struct sigaction sa = {};
   111    sa.sa_sigaction = handler;
   112    sigemptyset(&sa.sa_mask);
   113    sa.sa_flags = SA_SIGINFO;
   114    TEST_PCHECK(sigaction(signum, &sa, nullptr) == 0);
   115    MaybeSave();
   116  }
   117  
   118  // All of the following tests execute in a subprocess to ensure that each test
   119  // is run in a separate process. This avoids cross-contamination of seccomp
   120  // state between tests, and is necessary to ensure that test processes killed
   121  // by SECCOMP_RET_KILL are single-threaded (since SECCOMP_RET_KILL only kills
   122  // the offending thread, not the whole thread group).
   123  
   124  TEST(SeccompTest, RetKillCausesDeathBySIGSYS) {
   125    pid_t const pid = fork();
   126    if (pid == 0) {
   127      // Register a signal handler for SIGSYS that we don't expect to be invoked.
   128      RegisterSignalHandler(
   129          SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
   130      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
   131      syscall(kFilteredSyscall);
   132      TEST_CHECK_MSG(false, "Survived invocation of test syscall");
   133    }
   134    ASSERT_THAT(pid, SyscallSucceeds());
   135    int status;
   136    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   137    EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
   138        << "status " << status;
   139  }
   140  
   141  TEST(SeccompTest, RetKillOnlyKillsOneThread) {
   142    Mapping stack = ASSERT_NO_ERRNO_AND_VALUE(
   143        MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
   144  
   145    pid_t const pid = fork();
   146    if (pid == 0) {
   147      // Register a signal handler for SIGSYS that we don't expect to be invoked.
   148      RegisterSignalHandler(
   149          SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
   150      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
   151      // Pass CLONE_VFORK to block the original thread in the child process until
   152      // the clone thread exits with SIGSYS.
   153      //
   154      // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
   155      // x86_64 implementation is safe. See glibc
   156      // sysdeps/unix/sysv/linux/x86_64/clone.S.
   157      clone(
   158          +[](void* arg) {
   159            syscall(kFilteredSyscall);  // should kill the thread
   160            _exit(1);                   // should be unreachable
   161            return 2;  // should be very unreachable, shut up the compiler
   162          },
   163          stack.endptr(),
   164          CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
   165              CLONE_VFORK,
   166          nullptr);
   167      _exit(0);
   168    }
   169    ASSERT_THAT(pid, SyscallSucceeds());
   170    int status;
   171    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   172    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   173        << "status " << status;
   174  }
   175  
   176  TEST(SeccompTest, RetTrapCausesSIGSYS) {
   177    pid_t const pid = fork();
   178    if (pid == 0) {
   179      constexpr uint16_t kTrapValue = 0xdead;
   180      RegisterSignalHandler(
   181          SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
   182            ucontext_t* uc = static_cast<ucontext_t*>(ucv);
   183            // This is a signal handler, so we must stay async-signal-safe.
   184            TEST_CHECK(info->si_signo == SIGSYS);
   185            TEST_CHECK(info->si_code == SYS_SECCOMP);
   186            TEST_CHECK(info->si_errno == kTrapValue);
   187            TEST_CHECK(info->si_call_addr != nullptr);
   188            TEST_CHECK(info->si_syscall == kFilteredSyscall);
   189  #if defined(__x86_64__)
   190            TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
   191            TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
   192  #elif defined(__aarch64__)
   193            TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
   194            TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
   195  #endif  // defined(__x86_64__)
   196            _exit(0);
   197          });
   198      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRAP | kTrapValue);
   199      syscall(kFilteredSyscall);
   200      TEST_CHECK_MSG(false, "Survived invocation of test syscall");
   201    }
   202    ASSERT_THAT(pid, SyscallSucceeds());
   203    int status;
   204    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   205    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   206        << "status " << status;
   207  }
   208  
   209  #ifdef __x86_64__
   210  
   211  constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
   212  
   213  time_t vsyscall_time(time_t* t) {
   214    return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
   215  }
   216  
   217  TEST(SeccompTest, SeccompAppliesToVsyscall) {
   218    SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
   219  
   220    pid_t const pid = fork();
   221    if (pid == 0) {
   222      constexpr uint16_t kTrapValue = 0xdead;
   223      RegisterSignalHandler(
   224          SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
   225            ucontext_t* uc = static_cast<ucontext_t*>(ucv);
   226            // This is a signal handler, so we must stay async-signal-safe.
   227            TEST_CHECK(info->si_signo == SIGSYS);
   228            TEST_CHECK(info->si_code == SYS_SECCOMP);
   229            TEST_CHECK(info->si_errno == kTrapValue);
   230            TEST_CHECK(info->si_call_addr != nullptr);
   231            TEST_CHECK(info->si_syscall == SYS_time);
   232            TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
   233            TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == SYS_time);
   234            _exit(0);
   235          });
   236      ApplySeccompFilter(SYS_time, SECCOMP_RET_TRAP | kTrapValue);
   237      vsyscall_time(nullptr);  // Should result in death.
   238      TEST_CHECK_MSG(false, "Survived invocation of test syscall");
   239    }
   240    ASSERT_THAT(pid, SyscallSucceeds());
   241    int status;
   242    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   243    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   244        << "status " << status;
   245  }
   246  
   247  TEST(SeccompTest, RetKillVsyscallCausesDeathBySIGSYS) {
   248    SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
   249  
   250    pid_t const pid = fork();
   251    if (pid == 0) {
   252      // Register a signal handler for SIGSYS that we don't expect to be invoked.
   253      RegisterSignalHandler(
   254          SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
   255      ApplySeccompFilter(SYS_time, SECCOMP_RET_KILL);
   256      vsyscall_time(nullptr);  // Should result in death.
   257      TEST_CHECK_MSG(false, "Survived invocation of test syscall");
   258    }
   259    ASSERT_THAT(pid, SyscallSucceeds());
   260    int status;
   261    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   262    EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
   263        << "status " << status;
   264  }
   265  
   266  #endif  // defined(__x86_64__)
   267  
   268  TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) {
   269    pid_t const pid = fork();
   270    if (pid == 0) {
   271      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
   272      TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
   273      _exit(0);
   274    }
   275    ASSERT_THAT(pid, SyscallSucceeds());
   276    int status;
   277    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   278    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   279        << "status " << status;
   280  }
   281  
   282  TEST(SeccompTest, RetErrnoReturnsErrno) {
   283    pid_t const pid = fork();
   284    if (pid == 0) {
   285      // ENOTNAM: "Not a XENIX named type file"
   286      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
   287      TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM);
   288      _exit(0);
   289    }
   290    ASSERT_THAT(pid, SyscallSucceeds());
   291    int status;
   292    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   293    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   294        << "status " << status;
   295  }
   296  
   297  TEST(SeccompTest, RetAllowAllowsSyscall) {
   298    pid_t const pid = fork();
   299    if (pid == 0) {
   300      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ALLOW);
   301      TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
   302      _exit(0);
   303    }
   304    ASSERT_THAT(pid, SyscallSucceeds());
   305    int status;
   306    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   307    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   308        << "status " << status;
   309  }
   310  
   311  // This test will validate that TSYNC will apply to all threads.
   312  TEST(SeccompTest, TsyncAppliesToAllThreads) {
   313    Mapping stack = ASSERT_NO_ERRNO_AND_VALUE(
   314        MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
   315  
   316    // We don't want to apply this policy to other test runner threads, so fork.
   317    const pid_t pid = fork();
   318  
   319    if (pid == 0) {
   320      // First check that we receive a ENOSYS before the policy is applied.
   321      TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
   322  
   323      // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
   324      // x86_64 implementation is safe. See glibc
   325      // sysdeps/unix/sysv/linux/x86_64/clone.S.
   326      clone(
   327          +[](void* arg) {
   328            ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM,
   329                               SECCOMP_FILTER_FLAG_TSYNC);
   330            return 0;
   331          },
   332          stack.endptr(),
   333          CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
   334              CLONE_VFORK,
   335          nullptr);
   336  
   337      // Because we're using CLONE_VFORK this thread will be blocked until
   338      // the second thread has released resources to our virtual memory, since
   339      // we're not execing that will happen on _exit.
   340  
   341      // Now verify that the policy applied to this thread too.
   342      TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM);
   343      _exit(0);
   344    }
   345  
   346    ASSERT_THAT(pid, SyscallSucceeds());
   347    int status = 0;
   348    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   349    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   350        << "status " << status;
   351  }
   352  
   353  // This test will validate that seccomp(2) rejects unsupported flags.
   354  TEST(SeccompTest, SeccompRejectsUnknownFlags) {
   355    constexpr uint32_t kInvalidFlag = 123;
   356    ASSERT_THAT(
   357        syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr),
   358        SyscallFailsWithErrno(EINVAL));
   359  }
   360  
   361  TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) {
   362    // This is RetKillCausesDeathBySIGSYS, plus extra filters before and after the
   363    // one that causes the kill that should be ignored.
   364    pid_t const pid = fork();
   365    if (pid == 0) {
   366      RegisterSignalHandler(
   367          SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
   368      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
   369      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
   370      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
   371      syscall(kFilteredSyscall);
   372      TEST_CHECK_MSG(false, "Survived invocation of test syscall");
   373    }
   374    ASSERT_THAT(pid, SyscallSucceeds());
   375    int status;
   376    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   377    EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
   378        << "status " << status;
   379  }
   380  
   381  // Passed as argv[1] to cause the test binary to invoke kFilteredSyscall and
   382  // exit. Not a real flag since flag parsing happens during initialization,
   383  // which may create threads.
   384  constexpr char kInvokeFilteredSyscallFlag[] = "--seccomp_test_child";
   385  
   386  TEST(SeccompTest, FiltersPreservedAcrossForkAndExecve) {
   387    ExecveArray const grandchild_argv(
   388        {"/proc/self/exe", kInvokeFilteredSyscallFlag});
   389  
   390    pid_t const pid = fork();
   391    if (pid == 0) {
   392      ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
   393      pid_t const grandchild_pid = fork();
   394      if (grandchild_pid == 0) {
   395        execve(grandchild_argv.get()[0], grandchild_argv.get(),
   396               /* envp = */ nullptr);
   397        TEST_PCHECK_MSG(false, "execve failed");
   398      }
   399      int status;
   400      TEST_PCHECK(waitpid(grandchild_pid, &status, 0) == grandchild_pid);
   401      TEST_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS);
   402      _exit(0);
   403    }
   404    ASSERT_THAT(pid, SyscallSucceeds());
   405    int status;
   406    ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
   407    EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
   408        << "status " << status;
   409  }
   410  
   411  }  // namespace
   412  
   413  }  // namespace testing
   414  }  // namespace gvisor
   415  
   416  int main(int argc, char** argv) {
   417    if (argc >= 2 &&
   418        strcmp(argv[1], gvisor::testing::kInvokeFilteredSyscallFlag) == 0) {
   419      syscall(gvisor::testing::kFilteredSyscall);
   420      exit(0);
   421    }
   422  
   423    gvisor::testing::TestInit(&argc, &argv);
   424    return gvisor::testing::RunAllTests();
   425  }