// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <linux/unistd.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#include "atomic.h"
#include "sysmsg.h"

// __export_deep_sleep_timeout is the timeout after which the stub thread stops
// polling and falls asleep.
uint64_t __export_deep_sleep_timeout;

// LINT.IfChange
#define MAX_GUEST_CONTEXTS (4095)
#define MAX_CONTEXT_QUEUE_ENTRIES (MAX_GUEST_CONTEXTS + 1)
#define INVALID_CONTEXT_ID 0xfefefefe
#define INVALID_THREAD_ID 0xfefefefe

// Each element of a context_queue ring buffer is a sum of its index shifted by
// CQ_INDEX_SHIFT and context_id.
#define CQ_INDEX_SHIFT 32
#define CQ_CONTEXT_MASK ((1UL << CQ_INDEX_SHIFT) - 1)

// See systrap/context_queue.go
struct context_queue {
  uint32_t start;
  uint32_t end;
  uint32_t num_active_threads;
  uint32_t num_spinning_threads;
  uint32_t num_threads_to_wakeup;
  uint32_t num_active_contexts;
  uint32_t num_awake_contexts;
  uint32_t fast_path_disabled;
  uint32_t used_fast_path;
  uint64_t ringbuffer[MAX_CONTEXT_QUEUE_ENTRIES];
};

struct context_queue *__export_context_queue_addr;

// LINT.ThenChange(../context_queue.go)

uint32_t is_empty(struct context_queue *queue) {
  return atomic_load(&queue->start) == atomic_load(&queue->end);
}

int32_t queued_contexts(struct context_queue *queue) {
  return (atomic_load(&queue->end) + MAX_CONTEXT_QUEUE_ENTRIES -
          atomic_load(&queue->start)) %
         MAX_CONTEXT_QUEUE_ENTRIES;
}
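
// A worked example of the ring buffer encoding described above (the values
// are chosen purely for illustration): a slot written at queue index 5 for
// context id 7 holds
//
//   (5UL << CQ_INDEX_SHIFT) | 7 == 0x0000000500000007
//
// queue_get_context() below extracts the id with `v & CQ_CONTEXT_MASK` and
// treats the slot as stale if `v >> CQ_INDEX_SHIFT` no longer matches the
// index it loaded from queue->start.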

#if defined(__x86_64__)
static __inline__ unsigned long rdtsc(void) {
  unsigned h, l;
  __asm__ __volatile__("rdtsc" : "=a"(l), "=d"(h));
  return ((unsigned long)l) | (((unsigned long)h) << 32);
}

static __inline__ void spinloop(void) { asm("pause"); }
#elif defined(__aarch64__)
static __inline__ unsigned long rdtsc(void) {
  long val;
  asm volatile("mrs %0, cntvct_el0" : "=r"(val));
  return val;
}

static __inline__ void spinloop(void) { asm volatile("yield" : : : "memory"); }
#endif

void *__export_context_region;

static struct thread_context *thread_context_addr(uint32_t tcid) {
  return (struct thread_context *)(__export_context_region +
                                   tcid *
                                       ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}

void memcpy(uint8_t *dest, uint8_t *src, size_t n) {
  for (size_t i = 0; i < n; i += 1) {
    dest[i] = src[i];
  }
}

// The spinning queue is a queue of spinning threads. It solves the
// fragmentation problem. The idea is to minimize the number of threads
// processing requests. We can't control how system threads are scheduled, so
// we can't distribute requests efficiently. The spinning queue emulates
// virtual threads sorted by their spinning time.
//
// This queue is lock-less, so that a thread that gets scheduled off a CPU
// doesn't block the others.
//
// The size of the queue must be a divisor of 2^32, because queue indexes are
// maintained as uint32 counters that wrap around modulo 2^32.
#define SPINNING_QUEUE_SIZE 256
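
// To make the divisor requirement concrete (illustrative arithmetic only):
// because 256 divides 2^32, the slot index keeps advancing smoothly when the
// 32-bit counters wrap around, e.g.
//
//   0xffffffffu % SPINNING_QUEUE_SIZE == 255
//   0x00000000u % SPINNING_QUEUE_SIZE == 0
//
// so the slot following index UINT32_MAX is slot 0, with no jump. The same
// property is asserted for the context queue via the BUILD_BUG_ON in
// queue_get_context() below.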

// MAX_RE_ENQUEUE defines the number of times a given entry in the spinning
// queue has to reach the timeout before it is removed for good. Re-enqueuing
// a timed-out entry is done to mitigate rdtsc inaccuracies.
#define MAX_RE_ENQUEUE 2

struct spinning_queue {
  uint32_t len;
  uint32_t start;
  uint32_t end;
  uint64_t start_times[SPINNING_QUEUE_SIZE];
  uint8_t num_times_re_enqueued[SPINNING_QUEUE_SIZE];
};

struct spinning_queue *__export_spinning_queue_addr;

// spinning_queue_push adds a new thread to the queue. It returns false if the
// queue is full, or if re_enqueue_times has reached MAX_RE_ENQUEUE.
static bool spinning_queue_push(uint8_t re_enqueue_times)
    __attribute__((warn_unused_result));
static bool spinning_queue_push(uint8_t re_enqueue_times) {
  struct spinning_queue *queue = __export_spinning_queue_addr;
  uint32_t idx, end, len;

  BUILD_BUG_ON(sizeof(struct spinning_queue) > SPINNING_QUEUE_MEM_SIZE);
  if (re_enqueue_times >= MAX_RE_ENQUEUE) {
    return false;
  }

  len = atomic_add(&queue->len, 1);
  if (len > SPINNING_QUEUE_SIZE) {
    atomic_sub(&queue->len, 1);
    return false;
  }
  end = atomic_add(&queue->end, 1);

  idx = end - 1;
  atomic_store(&queue->num_times_re_enqueued[idx % SPINNING_QUEUE_SIZE],
               re_enqueue_times);
  atomic_store(&queue->start_times[idx % SPINNING_QUEUE_SIZE], rdtsc());
  return true;
}

// spinning_queue_pop removes the thread that has been spinning for the
// shortest time from the queue. It doesn't take re-enqueuing into account.
static void spinning_queue_pop() {
  struct spinning_queue *queue = __export_spinning_queue_addr;

  atomic_sub(&queue->end, 1);
  atomic_sub(&queue->len, 1);
}

// spinning_queue_remove_first removes one thread from the queue that has been
// spinning longer than the others and longer than the specified timeout.
//
// If `timeout` is zero, it always removes one element and never returns false.
//
// Returns true if one thread has been removed from the queue.
static bool spinning_queue_remove_first(uint64_t timeout)
    __attribute__((warn_unused_result));
static bool spinning_queue_remove_first(uint64_t timeout) {
  struct spinning_queue *queue = __export_spinning_queue_addr;
  uint64_t ts;
  uint8_t re_enqueue = 0;

  while (1) {
    uint32_t idx, qidx;

    idx = atomic_load(&queue->start);
    qidx = idx % SPINNING_QUEUE_SIZE;
    ts = atomic_load(&queue->start_times[qidx]);

    if (ts == 0) continue;
    if (rdtsc() - ts < timeout) return false;
    if (idx != atomic_load(&queue->start)) continue;  // Lost the race.

    re_enqueue = atomic_load(&queue->num_times_re_enqueued[qidx]);
    if (atomic_compare_exchange(&queue->start_times[qidx], &ts, 0)) {
      atomic_add(&queue->start, 1);
      break;
    }
  }

  atomic_sub(&queue->len, 1);
  if (timeout == 0) return true;
  return !spinning_queue_push(re_enqueue + 1);
}
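
// A concrete walk-through of the timeout handling above (illustrative only):
// with MAX_RE_ENQUEUE == 2, an entry starts with num_times_re_enqueued == 0.
// The first time the oldest entry exceeds `timeout`,
// spinning_queue_remove_first() pops it and re-pushes it with a fresh rdtsc()
// timestamp and a count of 1, returning false so the caller keeps spinning.
// The second time it exceeds the timeout, spinning_queue_push(2) rejects the
// entry, the function returns true, and the calling thread in
// get_context_fast() leaves the fast path.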

struct thread_context *queue_get_context(struct sysmsg *sysmsg) {
  struct context_queue *queue = __export_context_queue_addr;

  // Indexes must not jump when start or end overflow.
  BUILD_BUG_ON(UINT32_MAX % MAX_CONTEXT_QUEUE_ENTRIES !=
               MAX_CONTEXT_QUEUE_ENTRIES - 1);

  while (!is_empty(queue)) {
    uint64_t idx = atomic_load(&queue->start);
    uint32_t next = idx % MAX_CONTEXT_QUEUE_ENTRIES;
    uint64_t v = atomic_load(&queue->ringbuffer[next]);

    // Check the index to be sure that the ring buffer slot hasn't been
    // recycled.
    if ((v >> CQ_INDEX_SHIFT) != idx) continue;
    if (!atomic_compare_exchange(&queue->ringbuffer[next], &v,
                                 INVALID_CONTEXT_ID)) {
      continue;
    }

    uint32_t context_id = v & CQ_CONTEXT_MASK;
    if (context_id == INVALID_CONTEXT_ID) continue;

    atomic_add(&queue->start, 1);
    if (context_id > MAX_GUEST_CONTEXTS) {
      panic(STUB_ERROR_BAD_CONTEXT_ID, context_id);
    }
    struct thread_context *ctx = thread_context_addr(context_id);
    sysmsg->context = ctx;
    atomic_store(&ctx->acked_time, rdtsc());
    atomic_store(&ctx->thread_id, sysmsg->thread_id);
    return ctx;
  }
  return NULL;
}

// get_context_fast sets nr_active_threads_p only if it deactivates the thread.
static struct thread_context *get_context_fast(struct sysmsg *sysmsg,
                                               struct context_queue *queue,
                                               uint32_t *nr_active_threads_p) {
  uint32_t nr_active_threads, nr_awake_contexts;

  if (!spinning_queue_push(0)) return NULL;
  atomic_store(&queue->used_fast_path, 1);

  while (1) {
    struct thread_context *ctx;

    ctx = queue_get_context(sysmsg);
    if (ctx) {
      spinning_queue_pop();
      return ctx;
    }

    if (atomic_load(&queue->fast_path_disabled) != 0) {
      if (!spinning_queue_remove_first(0))
        panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
      break;
    }

    nr_active_threads = atomic_load(&queue->num_active_threads);
    nr_awake_contexts = atomic_load(&queue->num_awake_contexts);

    if (nr_awake_contexts < nr_active_threads) {
      if (atomic_compare_exchange(&queue->num_active_threads,
                                  &nr_active_threads, nr_active_threads - 1)) {
        nr_active_threads -= 1;
        if (!spinning_queue_remove_first(0))
          panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
        *nr_active_threads_p = nr_active_threads;
        break;
      }
    }

    if (spinning_queue_remove_first(__export_deep_sleep_timeout)) {
      break;
    }
    spinloop();
  }
  return NULL;
}

#define NR_IF_THREAD_IS_ACTIVE (~0)

static bool try_to_dec_threads_to_wakeup(struct context_queue *queue) {
  while (1) {
    uint32_t nr = atomic_load(&queue->num_threads_to_wakeup);
    if (nr == 0) {
      return false;
    }
    if (atomic_compare_exchange(&queue->num_threads_to_wakeup, &nr, nr - 1)) {
      return true;
    }
  }
}

void init_new_thread() {
  struct context_queue *queue = __export_context_queue_addr;

  atomic_add(&queue->num_active_threads, 1);
  try_to_dec_threads_to_wakeup(queue);
}
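
// How the wakeup counter is used (a descriptive note; the waking side lives in
// the sentry, see ../context_queue.go, not in this file): the sentry is
// expected to increment num_threads_to_wakeup and FUTEX_WAKE it when it wants
// more stub threads running. A woken thread must win the CAS in
// try_to_dec_threads_to_wakeup() before it may take a context, so no more
// threads than requested stay awake. init_new_thread(), called during stub
// thread setup elsewhere, both registers the thread as active and consumes one
// pending wakeup if there is one.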

// get_context retrieves a context that is ready to be restored to the user.
// This populates sysmsg->thread_context_id.
struct thread_context *get_context(struct sysmsg *sysmsg) {
  struct context_queue *queue = __export_context_queue_addr;
  uint32_t nr_active_threads;

  struct thread_context *ctx;
  for (;;) {
    atomic_add(&queue->num_spinning_threads, 1);

    // Change the sysmsg thread state just to indicate that the thread is not
    // asleep.
    atomic_store(&sysmsg->state, THREAD_STATE_PREP);
    ctx = queue_get_context(sysmsg);
    if (ctx) {
      goto exit;
    }

    bool fast_path_enabled = atomic_load(&queue->fast_path_disabled) == 0;

    nr_active_threads = NR_IF_THREAD_IS_ACTIVE;
    if (fast_path_enabled) {
      ctx = get_context_fast(sysmsg, queue, &nr_active_threads);
      if (ctx) goto exit;
    }
    if (nr_active_threads == NR_IF_THREAD_IS_ACTIVE) {
      nr_active_threads = atomic_sub(&queue->num_active_threads, 1);
    }

    atomic_sub(&queue->num_spinning_threads, 1);
    atomic_store(&sysmsg->state, THREAD_STATE_ASLEEP);
    uint32_t nr_active_contexts = atomic_load(&queue->num_active_contexts);
    // We have to make another attempt to get a context here to prevent
    // TOCTTOU races with waitOnState and kickSysmsgThread. There are two
    // assumptions:
    // * If the queue isn't empty, one or more threads have to be active.
    // * A new thread isn't kicked if the number of active threads is not less
    //   than the number of active contexts.
    if (nr_active_threads < nr_active_contexts) {
      ctx = queue_get_context(sysmsg);
      if (ctx) {
        atomic_store(&sysmsg->state, THREAD_STATE_PREP);
        atomic_add(&queue->num_active_threads, 1);
        return ctx;
      }
    }

    while (1) {
      if (!try_to_dec_threads_to_wakeup(queue)) {
        sys_futex(&queue->num_threads_to_wakeup, FUTEX_WAIT, 0, NULL, NULL, 0);
        continue;
      }
      // Mark this thread as active only if it can get a context.
      ctx = queue_get_context(sysmsg);
      if (ctx) {
        atomic_store(&sysmsg->state, THREAD_STATE_PREP);
        atomic_add(&queue->num_active_threads, 1);
        return ctx;
      }
    }
  }
exit:
  atomic_sub(&queue->num_spinning_threads, 1);
  return ctx;
}

// switch_context signals the sentry that the old context is ready to be worked
// on and retrieves a new context to switch to.
struct thread_context *switch_context(struct sysmsg *sysmsg,
                                      struct thread_context *ctx,
                                      enum context_state new_context_state) {
  struct context_queue *queue = __export_context_queue_addr;

  if (ctx) {
    atomic_sub(&queue->num_active_contexts, 1);
    atomic_store(&ctx->thread_id, INVALID_THREAD_ID);
    atomic_store(&ctx->last_thread_id, sysmsg->thread_id);
    atomic_store(&ctx->state_changed_time, rdtsc());
    atomic_store(&ctx->state, new_context_state);
    if (atomic_load(&ctx->sentry_fast_path) == 0) {
      int ret = sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 0);
      if (ret < 0) {
        panic(STUB_ERROR_FUTEX, ret);
      }
    }
  }

  return get_context(sysmsg);
}

void verify_offsets() {
  BUILD_BUG_ON(offsetof_sysmsg_self != offsetof(struct sysmsg, self));
  BUILD_BUG_ON(offsetof_sysmsg_ret_addr != offsetof(struct sysmsg, ret_addr));
  BUILD_BUG_ON(offsetof_sysmsg_syshandler !=
               offsetof(struct sysmsg, syshandler));
  BUILD_BUG_ON(offsetof_sysmsg_syshandler_stack !=
               offsetof(struct sysmsg, syshandler_stack));
  BUILD_BUG_ON(offsetof_sysmsg_app_stack != offsetof(struct sysmsg, app_stack));
  BUILD_BUG_ON(offsetof_sysmsg_interrupt != offsetof(struct sysmsg, interrupt));
  BUILD_BUG_ON(offsetof_sysmsg_state != offsetof(struct sysmsg, state));
  BUILD_BUG_ON(offsetof_sysmsg_context != offsetof(struct sysmsg, context));

  BUILD_BUG_ON(offsetof_thread_context_fpstate !=
               offsetof(struct thread_context, fpstate));
  BUILD_BUG_ON(offsetof_thread_context_fpstate_changed !=
               offsetof(struct thread_context, fpstate_changed));
  BUILD_BUG_ON(offsetof_thread_context_ptregs !=
               offsetof(struct thread_context, ptregs));

  BUILD_BUG_ON(kTHREAD_STATE_NONE != THREAD_STATE_NONE);

  BUILD_BUG_ON(sizeof(struct thread_context) >
               ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}
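
// A rough sketch of how the functions above are expected to be driven by the
// stub's event loop (which lives outside this file; the exact call sites and
// the particular context_state value passed are assumptions, not part of this
// file):
//
//   init_new_thread();
//   struct thread_context *ctx = get_context(sysmsg);
//   for (;;) {
//     /* run the guest until it stops on a syscall, fault or interrupt */
//     ctx = switch_context(sysmsg, ctx, /* new context_state */);
//   }
//
// switch_context() hands the finished context back to the sentry (waking it
// via futex when the sentry fast path is off) and then blocks in
// get_context() until another context is ready to run.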