// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <linux/unistd.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#include "atomic.h"
#include "sysmsg.h"

// __export_deep_sleep_timeout is the timeout after which the stub thread stops
// polling and falls asleep.
uint64_t __export_deep_sleep_timeout;

// LINT.IfChange
#define MAX_GUEST_CONTEXTS (4095)
#define MAX_CONTEXT_QUEUE_ENTRIES (MAX_GUEST_CONTEXTS + 1)
#define INVALID_CONTEXT_ID 0xfefefefe
#define INVALID_THREAD_ID 0xfefefefe

// Each element of a context_queue ring buffer is a sum of its index shifted by
// CQ_INDEX_SHIFT and context_id.
#define CQ_INDEX_SHIFT 32
#define CQ_CONTEXT_MASK ((1UL << CQ_INDEX_SHIFT) - 1)

// See systrap/context_queue.go
struct context_queue {
  uint32_t start;
  uint32_t end;
  uint32_t num_active_threads;
  uint32_t num_spinning_threads;
  uint32_t num_threads_to_wakeup;
  uint32_t num_active_contexts;
  uint32_t num_awake_contexts;
  uint32_t fast_path_disabled;
  uint32_t used_fast_path;
  uint64_t ringbuffer[MAX_CONTEXT_QUEUE_ENTRIES];
};

struct context_queue *__export_context_queue_addr;

// LINT.ThenChange(../context_queue.go)

uint32_t is_empty(struct context_queue *queue) {
  return atomic_load(&queue->start) == atomic_load(&queue->end);
}

int32_t queued_contexts(struct context_queue *queue) {
  return (atomic_load(&queue->end) + MAX_CONTEXT_QUEUE_ENTRIES -
          atomic_load(&queue->start)) %
         MAX_CONTEXT_QUEUE_ENTRIES;
}
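
// A worked example of the ring buffer encoding described above (the values
// are chosen purely for illustration): a slot written at queue index 5 for
// context id 7 holds
//
//   (5UL << CQ_INDEX_SHIFT) | 7 == 0x0000000500000007
//
// queue_get_context() below extracts the id with `v & CQ_CONTEXT_MASK` and
// treats the slot as stale if `v >> CQ_INDEX_SHIFT` no longer matches the
// index it loaded from queue->start.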

#if defined(__x86_64__)
static __inline__ unsigned long rdtsc(void) {
  unsigned h, l;
  __asm__ __volatile__("rdtsc" : "=a"(l), "=d"(h));
  return ((unsigned long)l) | (((unsigned long)h) << 32);
}

static __inline__ void spinloop(void) { asm("pause"); }
#elif defined(__aarch64__)
static __inline__ unsigned long rdtsc(void) {
  long val;
  asm volatile("mrs %0, cntvct_el0" : "=r"(val));
  return val;
}

static __inline__ void spinloop(void) { asm volatile("yield" : : : "memory"); }
#endif

void *__export_context_region;

static struct thread_context *thread_context_addr(uint32_t tcid) {
  return (struct thread_context *)(__export_context_region +
                                   tcid *
                                       ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}

void memcpy(uint8_t *dest, uint8_t *src, size_t n) {
  for (size_t i = 0; i < n; i += 1) {
    dest[i] = src[i];
  }
}

// The spinning queue is a queue of spinning threads. It solves the
// fragmentation problem. The idea is to minimize the number of threads
// processing requests. We can't control how system threads are scheduled, so
// we can't distribute requests efficiently. The spinning queue emulates
// virtual threads sorted by their spinning time.
//
// This queue is lock-less, so that a thread that gets scheduled off a CPU
// doesn't block the others.
//
// The size of the queue must be a divisor of 2^32, because queue indexes are
// maintained as uint32 counters that wrap around modulo 2^32.
#define SPINNING_QUEUE_SIZE 256
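
// To make the divisor requirement concrete (illustrative arithmetic only):
// because 256 divides 2^32, the slot index keeps advancing smoothly when the
// 32-bit counters wrap around, e.g.
//
//   0xffffffffu % SPINNING_QUEUE_SIZE == 255
//   0x00000000u % SPINNING_QUEUE_SIZE == 0
//
// so the slot following index UINT32_MAX is slot 0, with no jump. The same
// property is asserted for the context queue via the BUILD_BUG_ON in
// queue_get_context() below.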

// MAX_RE_ENQUEUE defines the number of times a given entry in the spinning
// queue has to reach the timeout before it is removed for good. Re-enqueuing
// a timed-out entry is done to mitigate rdtsc inaccuracies.
#define MAX_RE_ENQUEUE 2

struct spinning_queue {
  uint32_t len;
  uint32_t start;
  uint32_t end;
  uint64_t start_times[SPINNING_QUEUE_SIZE];
  uint8_t num_times_re_enqueued[SPINNING_QUEUE_SIZE];
};

struct spinning_queue *__export_spinning_queue_addr;

// spinning_queue_push adds a new thread to the queue. It returns false if the
// queue is full, or if re_enqueue_times has reached MAX_RE_ENQUEUE.
static bool spinning_queue_push(uint8_t re_enqueue_times)
    __attribute__((warn_unused_result));
static bool spinning_queue_push(uint8_t re_enqueue_times) {
  struct spinning_queue *queue = __export_spinning_queue_addr;
  uint32_t idx, end, len;

  BUILD_BUG_ON(sizeof(struct spinning_queue) > SPINNING_QUEUE_MEM_SIZE);
  if (re_enqueue_times >= MAX_RE_ENQUEUE) {
    return false;
  }

  len = atomic_add(&queue->len, 1);
  if (len > SPINNING_QUEUE_SIZE) {
    atomic_sub(&queue->len, 1);
    return false;
  }
  end = atomic_add(&queue->end, 1);

  idx = end - 1;
  atomic_store(&queue->num_times_re_enqueued[idx % SPINNING_QUEUE_SIZE],
               re_enqueue_times);
  atomic_store(&queue->start_times[idx % SPINNING_QUEUE_SIZE], rdtsc());
  return true;
}

// spinning_queue_pop removes the thread that has been spinning for the
// shortest time from the queue. It doesn't take re-enqueuing into account.
static void spinning_queue_pop() {
  struct spinning_queue *queue = __export_spinning_queue_addr;

  atomic_sub(&queue->end, 1);
  atomic_sub(&queue->len, 1);
}

// spinning_queue_remove_first removes one thread from the queue that has been
// spinning longer than the others and longer than the specified timeout.
//
// If `timeout` is zero, it always removes one element and never returns false.
//
// Returns true if one thread has been removed from the queue.
static bool spinning_queue_remove_first(uint64_t timeout)
    __attribute__((warn_unused_result));
static bool spinning_queue_remove_first(uint64_t timeout) {
  struct spinning_queue *queue = __export_spinning_queue_addr;
  uint64_t ts;
  uint8_t re_enqueue = 0;

  while (1) {
    uint32_t idx, qidx;

    idx = atomic_load(&queue->start);
    qidx = idx % SPINNING_QUEUE_SIZE;
    ts = atomic_load(&queue->start_times[qidx]);

    if (ts == 0) continue;
    if (rdtsc() - ts < timeout) return false;
    if (idx != atomic_load(&queue->start)) continue;  // Lost the race.

    re_enqueue = atomic_load(&queue->num_times_re_enqueued[qidx]);
    if (atomic_compare_exchange(&queue->start_times[qidx], &ts, 0)) {
      atomic_add(&queue->start, 1);
      break;
    }
  }

  atomic_sub(&queue->len, 1);
  if (timeout == 0) return true;
  return !spinning_queue_push(re_enqueue + 1);
}
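
// A concrete walk-through of the timeout handling above (illustrative only):
// with MAX_RE_ENQUEUE == 2, an entry starts with num_times_re_enqueued == 0.
// The first time the oldest entry exceeds `timeout`,
// spinning_queue_remove_first() pops it and re-pushes it with a fresh rdtsc()
// timestamp and a count of 1, returning false so the caller keeps spinning.
// The second time it exceeds the timeout, spinning_queue_push(2) rejects the
// entry, the function returns true, and the calling thread in
// get_context_fast() leaves the fast path.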

struct thread_context *queue_get_context(struct sysmsg *sysmsg) {
  struct context_queue *queue = __export_context_queue_addr;

  // Indexes must not jump when start or end overflow.
  BUILD_BUG_ON(UINT32_MAX % MAX_CONTEXT_QUEUE_ENTRIES !=
               MAX_CONTEXT_QUEUE_ENTRIES - 1);

  while (!is_empty(queue)) {
    uint64_t idx = atomic_load(&queue->start);
    uint32_t next = idx % MAX_CONTEXT_QUEUE_ENTRIES;
    uint64_t v = atomic_load(&queue->ringbuffer[next]);

    // Check the index to be sure that the ring buffer slot hasn't been
    // recycled.
    if ((v >> CQ_INDEX_SHIFT) != idx) continue;
    if (!atomic_compare_exchange(&queue->ringbuffer[next], &v,
                                 INVALID_CONTEXT_ID)) {
      continue;
    }

    uint32_t context_id = v & CQ_CONTEXT_MASK;
    if (context_id == INVALID_CONTEXT_ID) continue;

    atomic_add(&queue->start, 1);
    if (context_id > MAX_GUEST_CONTEXTS) {
      panic(STUB_ERROR_BAD_CONTEXT_ID, context_id);
    }
    struct thread_context *ctx = thread_context_addr(context_id);
    sysmsg->context = ctx;
    atomic_store(&ctx->acked_time, rdtsc());
    atomic_store(&ctx->thread_id, sysmsg->thread_id);
    return ctx;
  }
  return NULL;
}

// get_context_fast sets nr_active_threads_p only if it deactivates the thread.
static struct thread_context *get_context_fast(struct sysmsg *sysmsg,
                                               struct context_queue *queue,
                                               uint32_t *nr_active_threads_p) {
  uint32_t nr_active_threads, nr_awake_contexts;

  if (!spinning_queue_push(0)) return NULL;
  atomic_store(&queue->used_fast_path, 1);

  while (1) {
    struct thread_context *ctx;

    ctx = queue_get_context(sysmsg);
    if (ctx) {
      spinning_queue_pop();
      return ctx;
    }

    if (atomic_load(&queue->fast_path_disabled) != 0) {
      if (!spinning_queue_remove_first(0))
        panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
      break;
    }

    nr_active_threads = atomic_load(&queue->num_active_threads);
    nr_awake_contexts = atomic_load(&queue->num_awake_contexts);

    if (nr_awake_contexts < nr_active_threads) {
      if (atomic_compare_exchange(&queue->num_active_threads,
                                  &nr_active_threads, nr_active_threads - 1)) {
        nr_active_threads -= 1;
        if (!spinning_queue_remove_first(0))
          panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
        *nr_active_threads_p = nr_active_threads;
        break;
      }
    }

    if (spinning_queue_remove_first(__export_deep_sleep_timeout)) {
      break;
    }
    spinloop();
  }
  return NULL;
}

#define NR_IF_THREAD_IS_ACTIVE (~0)

static bool try_to_dec_threads_to_wakeup(struct context_queue *queue) {
  while (1) {
    uint32_t nr = atomic_load(&queue->num_threads_to_wakeup);
    if (nr == 0) {
      return false;
    }
    if (atomic_compare_exchange(&queue->num_threads_to_wakeup, &nr, nr - 1)) {
      return true;
    }
  }
}

void init_new_thread() {
  struct context_queue *queue = __export_context_queue_addr;

  atomic_add(&queue->num_active_threads, 1);
  try_to_dec_threads_to_wakeup(queue);
}
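
// How the wakeup counter is used (a descriptive note; the waking side lives in
// the sentry, see ../context_queue.go, not in this file): the sentry is
// expected to increment num_threads_to_wakeup and FUTEX_WAKE it when it wants
// more stub threads running. A woken thread must win the CAS in
// try_to_dec_threads_to_wakeup() before it may take a context, so no more
// threads than requested stay awake. init_new_thread(), called during stub
// thread setup elsewhere, both registers the thread as active and consumes one
// pending wakeup if there is one.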

// get_context retrieves a context that is ready to be restored to the user.
// This populates sysmsg->thread_context_id.
struct thread_context *get_context(struct sysmsg *sysmsg) {
  struct context_queue *queue = __export_context_queue_addr;
  uint32_t nr_active_threads;

  struct thread_context *ctx;
  for (;;) {
    atomic_add(&queue->num_spinning_threads, 1);

    // Change the sysmsg thread state just to indicate that the thread is not
    // asleep.
    atomic_store(&sysmsg->state, THREAD_STATE_PREP);
    ctx = queue_get_context(sysmsg);
    if (ctx) {
      goto exit;
    }

    bool fast_path_enabled = atomic_load(&queue->fast_path_disabled) == 0;

    nr_active_threads = NR_IF_THREAD_IS_ACTIVE;
    if (fast_path_enabled) {
      ctx = get_context_fast(sysmsg, queue, &nr_active_threads);
      if (ctx) goto exit;
    }
    if (nr_active_threads == NR_IF_THREAD_IS_ACTIVE) {
      nr_active_threads = atomic_sub(&queue->num_active_threads, 1);
    }

    atomic_sub(&queue->num_spinning_threads, 1);
    atomic_store(&sysmsg->state, THREAD_STATE_ASLEEP);
    uint32_t nr_active_contexts = atomic_load(&queue->num_active_contexts);
    // We have to make another attempt to get a context here to prevent
    // TOCTTOU races with waitOnState and kickSysmsgThread. There are two
    // assumptions:
    // * If the queue isn't empty, one or more threads have to be active.
    // * A new thread isn't kicked if the number of active threads is not less
    //   than the number of active contexts.
    if (nr_active_threads < nr_active_contexts) {
      ctx = queue_get_context(sysmsg);
      if (ctx) {
        atomic_store(&sysmsg->state, THREAD_STATE_PREP);
        atomic_add(&queue->num_active_threads, 1);
        return ctx;
      }
    }

    while (1) {
      if (!try_to_dec_threads_to_wakeup(queue)) {
        sys_futex(&queue->num_threads_to_wakeup, FUTEX_WAIT, 0, NULL, NULL, 0);
        continue;
      }
      // Mark this thread as active only if it can get a context.
      ctx = queue_get_context(sysmsg);
      if (ctx) {
        atomic_store(&sysmsg->state, THREAD_STATE_PREP);
        atomic_add(&queue->num_active_threads, 1);
        return ctx;
      }
    }
  }
exit:
  atomic_sub(&queue->num_spinning_threads, 1);
  return ctx;
}

// switch_context signals the sentry that the old context is ready to be worked
// on and retrieves a new context to switch to.
struct thread_context *switch_context(struct sysmsg *sysmsg,
                                      struct thread_context *ctx,
                                      enum context_state new_context_state) {
  struct context_queue *queue = __export_context_queue_addr;

  if (ctx) {
    atomic_sub(&queue->num_active_contexts, 1);
    atomic_store(&ctx->thread_id, INVALID_THREAD_ID);
    atomic_store(&ctx->last_thread_id, sysmsg->thread_id);
    atomic_store(&ctx->state_changed_time, rdtsc());
    atomic_store(&ctx->state, new_context_state);
    if (atomic_load(&ctx->sentry_fast_path) == 0) {
      int ret = sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 0);
      if (ret < 0) {
        panic(STUB_ERROR_FUTEX, ret);
      }
    }
  }

  return get_context(sysmsg);
}

void verify_offsets() {
  BUILD_BUG_ON(offsetof_sysmsg_self != offsetof(struct sysmsg, self));
  BUILD_BUG_ON(offsetof_sysmsg_ret_addr != offsetof(struct sysmsg, ret_addr));
  BUILD_BUG_ON(offsetof_sysmsg_syshandler !=
               offsetof(struct sysmsg, syshandler));
  BUILD_BUG_ON(offsetof_sysmsg_syshandler_stack !=
               offsetof(struct sysmsg, syshandler_stack));
  BUILD_BUG_ON(offsetof_sysmsg_app_stack != offsetof(struct sysmsg, app_stack));
  BUILD_BUG_ON(offsetof_sysmsg_interrupt != offsetof(struct sysmsg, interrupt));
  BUILD_BUG_ON(offsetof_sysmsg_state != offsetof(struct sysmsg, state));
  BUILD_BUG_ON(offsetof_sysmsg_context != offsetof(struct sysmsg, context));

  BUILD_BUG_ON(offsetof_thread_context_fpstate !=
               offsetof(struct thread_context, fpstate));
  BUILD_BUG_ON(offsetof_thread_context_fpstate_changed !=
               offsetof(struct thread_context, fpstate_changed));
  BUILD_BUG_ON(offsetof_thread_context_ptregs !=
               offsetof(struct thread_context, ptregs));

  BUILD_BUG_ON(kTHREAD_STATE_NONE != THREAD_STATE_NONE);

  BUILD_BUG_ON(sizeof(struct thread_context) >
               ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}
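
// A rough sketch of how the functions above are expected to be driven by the
// stub's event loop (which lives outside this file; the exact call sites and
// the particular context_state value passed are assumptions, not part of this
// file):
//
//   init_new_thread();
//   struct thread_context *ctx = get_context(sysmsg);
//   for (;;) {
//     /* run the guest until it stops on a syscall, fault or interrupt */
//     ctx = switch_context(sysmsg, ctx, /* new context_state */);
//   }
//
// switch_context() hands the finished context back to the sentry (waking it
// via futex when the sentry fast path is off) and then blocks in
// get_context() until another context is ready to run.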