github.com/moontrade/nogc@v0.1.7/collections/wormhole/lib.c (about) 1 /* 2 * Copyright (c) 2016--2021 Wu, Xingbo <wuxb45@gmail.com> 3 * 4 * All rights reserved. No warranty, explicit or implicit, provided. 5 */ 6 #define _GNU_SOURCE 7 8 // headers {{{ 9 #include "lib.h" 10 #include "ctypes.h" 11 #include <assert.h> 12 #include <execinfo.h> 13 #include <math.h> 14 #include <netdb.h> 15 #include <sched.h> 16 #include <signal.h> 17 #include <sys/socket.h> 18 #include <poll.h> 19 #include <sys/ioctl.h> 20 #include <time.h> 21 #include <stdarg.h> // va_start 22 23 #if defined(__linux__) 24 #include <linux/fs.h> 25 #include <malloc.h> // malloc_usable_size 26 #elif defined(__APPLE__) && defined(__MACH__) 27 #include <sys/disk.h> 28 #include <malloc/malloc.h> 29 #elif defined(__FreeBSD__) 30 #include <sys/disk.h> 31 #include <malloc_np.h> 32 #endif // OS 33 34 #if defined(__FreeBSD__) 35 #include <pthread_np.h> 36 #endif 37 // }}} headers 38 39 // math {{{ 40 inline u64 41 mhash64(const u64 v) 42 { 43 return v * 11400714819323198485lu; 44 } 45 46 inline u32 47 mhash32(const u32 v) 48 { 49 return v * 2654435761u; 50 } 51 52 // From Daniel Lemire's blog (2013, lemire.me) 53 u64 54 gcd64(u64 a, u64 b) 55 { 56 if (a == 0) 57 return b; 58 if (b == 0) 59 return a; 60 61 const u32 shift = (u32)__builtin_ctzl(a | b); 62 a >>= __builtin_ctzl(a); 63 do { 64 b >>= __builtin_ctzl(b); 65 if (a > b) { 66 const u64 t = b; 67 b = a; 68 a = t; 69 } 70 b = b - a; 71 } while (b); 72 return a << shift; 73 } 74 // }}} math 75 76 // random {{{ 77 // Lehmer's generator is 2x faster than xorshift 78 /** 79 * D. H. Lehmer, Mathematical methods in large-scale computing units. 80 * Proceedings of a Second Symposium on Large Scale Digital Calculating 81 * Machinery; 82 * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146. 83 * 84 * P L'Ecuyer, Tables of linear congruential generators of different sizes and 85 * good lattice structure. Mathematics of Computation of the American 86 * Mathematical 87 * Society 68.225 (1999): 249-260. 
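 *
 * Note on the implementation below: the generator keeps a 128-bit state;
 * each call returns the current high 64 bits of that state (v64[1] on a
 * little-endian machine) and then multiplies the state by the 64-bit
 * constant 0xda942042e4dd58b5. lehmer_u64_seed() sets the low half to
 * (seed | 1) so the state is always odd.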
88 */ 89 struct lehmer_u64 { 90 union { 91 u128 v128; 92 u64 v64[2]; 93 }; 94 }; 95 96 static __thread struct lehmer_u64 rseed_u128 = {.v64 = {4294967291, 1549556881}}; 97 98 static inline u64 99 lehmer_u64_next(struct lehmer_u64 * const s) 100 { 101 const u64 r = s->v64[1]; 102 s->v128 *= 0xda942042e4dd58b5lu; 103 return r; 104 } 105 106 static inline void 107 lehmer_u64_seed(struct lehmer_u64 * const s, const u64 seed) 108 { 109 s->v128 = (((u128)(~seed)) << 64) | (seed | 1); 110 (void)lehmer_u64_next(s); 111 } 112 113 inline u64 114 random_u64(void) 115 { 116 return lehmer_u64_next(&rseed_u128); 117 } 118 119 inline void 120 srandom_u64(const u64 seed) 121 { 122 lehmer_u64_seed(&rseed_u128, seed); 123 } 124 125 inline double 126 random_double(void) 127 { 128 // random between [0.0 - 1.0] 129 const u64 r = random_u64(); 130 return ((double)r) * (1.0 / ((double)(~0lu))); 131 } 132 // }}} random 133 134 // timing {{{ 135 inline u64 136 time_nsec(void) 137 { 138 struct timespec ts; 139 // MONO_RAW is 5x to 10x slower than MONO 140 clock_gettime(CLOCK_MONOTONIC, &ts); 141 return ((u64)ts.tv_sec) * 1000000000lu + ((u64)ts.tv_nsec); 142 } 143 144 inline double 145 time_sec(void) 146 { 147 const u64 nsec = time_nsec(); 148 return ((double)nsec) * 1.0e-9; 149 } 150 151 inline u64 152 time_diff_nsec(const u64 last) 153 { 154 return time_nsec() - last; 155 } 156 157 inline double 158 time_diff_sec(const double last) 159 { 160 return time_sec() - last; 161 } 162 163 // need char str[64] 164 void 165 time_stamp(char * str, const size_t size) 166 { 167 time_t now; 168 struct tm nowtm; 169 time(&now); 170 localtime_r(&now, &nowtm); 171 strftime(str, size, "%F %T %z", &nowtm); 172 } 173 174 void 175 time_stamp2(char * str, const size_t size) 176 { 177 time_t now; 178 struct tm nowtm; 179 time(&now); 180 localtime_r(&now, &nowtm); 181 strftime(str, size, "%F-%H-%M-%S%z", &nowtm); 182 } 183 // }}} timing 184 185 // cpucache {{{ 186 inline void 187 cpu_pause(void) 188 { 189 #if defined(__x86_64__) 190 _mm_pause(); 191 #elif defined(__aarch64__) 192 // nop 193 #endif 194 } 195 196 inline void 197 cpu_mfence(void) 198 { 199 atomic_thread_fence(MO_SEQ_CST); 200 } 201 202 // compiler fence 203 inline void 204 cpu_cfence(void) 205 { 206 atomic_thread_fence(MO_ACQ_REL); 207 } 208 209 inline void 210 cpu_prefetch0(const void * const ptr) 211 { 212 __builtin_prefetch(ptr, 0, 0); 213 } 214 215 inline void 216 cpu_prefetch1(const void * const ptr) 217 { 218 __builtin_prefetch(ptr, 0, 1); 219 } 220 221 inline void 222 cpu_prefetch2(const void * const ptr) 223 { 224 __builtin_prefetch(ptr, 0, 2); 225 } 226 227 inline void 228 cpu_prefetch3(const void * const ptr) 229 { 230 __builtin_prefetch(ptr, 0, 3); 231 } 232 233 inline void 234 cpu_prefetchw(const void * const ptr) 235 { 236 __builtin_prefetch(ptr, 1, 0); 237 } 238 // }}} cpucache 239 240 // crc32c {{{ 241 inline u32 242 crc32c_u8(const u32 crc, const u8 v) 243 { 244 #if defined(__x86_64__) 245 return _mm_crc32_u8(crc, v); 246 #elif defined(__aarch64__) 247 return __crc32cb(crc, v); 248 #endif 249 } 250 251 inline u32 252 crc32c_u16(const u32 crc, const u16 v) 253 { 254 #if defined(__x86_64__) 255 return _mm_crc32_u16(crc, v); 256 #elif defined(__aarch64__) 257 return __crc32ch(crc, v); 258 #endif 259 } 260 261 inline u32 262 crc32c_u32(const u32 crc, const u32 v) 263 { 264 #if defined(__x86_64__) 265 return _mm_crc32_u32(crc, v); 266 #elif defined(__aarch64__) 267 return __crc32cw(crc, v); 268 #endif 269 } 270 271 inline u32 272 crc32c_u64(const u32 crc, const 
u64 v) 273 { 274 #if defined(__x86_64__) 275 return (u32)_mm_crc32_u64(crc, v); 276 #elif defined(__aarch64__) 277 return (u32)__crc32cd(crc, v); 278 #endif 279 } 280 281 inline u32 282 crc32c_inc_123(const u8 * buf, u32 nr, u32 crc) 283 { 284 if (nr == 1) 285 return crc32c_u8(crc, buf[0]); 286 287 crc = crc32c_u16(crc, *(u16 *)buf); 288 return (nr == 2) ? crc : crc32c_u8(crc, buf[2]); 289 } 290 291 inline u32 292 crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc) 293 { 294 //debug_assert((nr & 3) == 0); 295 const u32 nr8 = nr >> 3; 296 #pragma nounroll 297 for (u32 i = 0; i < nr8; i++) 298 crc = crc32c_u64(crc, ((u64*)buf)[i]); 299 300 if (nr & 4u) 301 crc = crc32c_u32(crc, ((u32*)buf)[nr8<<1]); 302 return crc; 303 } 304 305 u32 306 crc32c_inc(const u8 * buf, u32 nr, u32 crc) 307 { 308 crc = crc32c_inc_x4(buf, nr, crc); 309 const u32 nr123 = nr & 3u; 310 return nr123 ? crc32c_inc_123(buf + nr - nr123, nr123, crc) : crc; 311 } 312 // }}} crc32c 313 314 // debug {{{ 315 void 316 debug_break(void) 317 { 318 usleep(100); 319 } 320 321 static u64 * debug_watch_u64 = NULL; 322 323 static void 324 watch_u64_handler(const int sig) 325 { 326 (void)sig; 327 const u64 v = debug_watch_u64 ? (*debug_watch_u64) : 0; 328 fprintf(stderr, "[USR1] %lu (0x%lx)\n", v, v); 329 } 330 331 void 332 watch_u64_usr1(u64 * const ptr) 333 { 334 debug_watch_u64 = ptr; 335 struct sigaction sa = {}; 336 sa.sa_handler = watch_u64_handler; 337 sigemptyset(&(sa.sa_mask)); 338 sa.sa_flags = SA_RESTART; 339 if (sigaction(SIGUSR1, &sa, NULL) == -1) { 340 fprintf(stderr, "Failed to set signal handler for SIGUSR1\n"); 341 } else { 342 fprintf(stderr, "to watch> kill -s SIGUSR1 %d\n", getpid()); 343 } 344 } 345 346 static void * debug_bt_state = NULL; 347 #if defined(BACKTRACE) && defined(__linux__) 348 // TODO: get exec path on MacOS and FreeBSD 349 350 #include <backtrace.h> 351 static char debug_filepath[1024] = {}; 352 353 static void 354 debug_bt_error_cb(void * const data, const char * const msg, const int errnum) 355 { 356 (void)data; 357 if (msg) 358 dprintf(2, "libbacktrace: %s %s\n", msg, strerror(errnum)); 359 } 360 361 static int 362 debug_bt_print_cb(void * const data, const uintptr_t pc, 363 const char * const file, const int lineno, const char * const func) 364 { 365 u32 * const plevel = (typeof(plevel))data; 366 if (file || func || lineno) { 367 dprintf(2, "[%u]0x%012lx " TERMCLR(35) "%s" TERMCLR(31) ":" TERMCLR(34) "%d" TERMCLR(0)" %s\n", 368 *plevel, pc, file ? file : "???", lineno, func ? 
func : "???"); 369 } else if (pc) { 370 dprintf(2, "[%u]0x%012lx ??\n", *plevel, pc); 371 } 372 (*plevel)++; 373 return 0; 374 } 375 376 __attribute__((constructor)) 377 static void 378 debug_backtrace_init(void) 379 { 380 const ssize_t len = readlink("/proc/self/exe", debug_filepath, 1023); 381 // disable backtrace 382 if (len < 0 || len >= 1023) 383 return; 384 385 debug_filepath[len] = '\0'; 386 debug_bt_state = backtrace_create_state(debug_filepath, 1, debug_bt_error_cb, NULL); 387 } 388 #endif // BACKTRACE 389 390 static void 391 debug_wait_gdb(void * const bt_state) 392 { 393 if (bt_state) { 394 #if defined(BACKTRACE) 395 dprintf(2, "Backtrace :\n"); 396 u32 level = 0; 397 backtrace_full(debug_bt_state, 1, debug_bt_print_cb, debug_bt_error_cb, &level); 398 #endif // BACKTRACE 399 } else { // fallback to execinfo if no backtrace or initialization failed 400 void *array[64]; 401 const int size = backtrace(array, 64); 402 dprintf(2, "Backtrace (%d):\n", size - 1); 403 backtrace_symbols_fd(array + 1, size - 1, 2); 404 } 405 406 abool v = true; 407 char timestamp[32]; 408 time_stamp(timestamp, 32); 409 char threadname[32] = {}; 410 thread_get_name(pthread_self(), threadname, 32); 411 strcat(threadname, "(!!)"); 412 thread_set_name(pthread_self(), threadname); 413 char hostname[32]; 414 gethostname(hostname, 32); 415 416 const char * const pattern = "[Waiting GDB] %1$s %2$s @ %3$s\n" 417 " Attach me: " TERMCLR(31) "sudo -Hi gdb -p %4$d" TERMCLR(0) "\n"; 418 char buf[256]; 419 sprintf(buf, pattern, timestamp, threadname, hostname, getpid()); 420 write(2, buf, strlen(buf)); 421 422 // to continue: gdb> set var v = 0 423 // to kill from shell: $ kill %pid; kill -CONT %pid 424 425 // uncomment this line to surrender the shell on error 426 // kill(getpid(), SIGSTOP); // stop burning cpu, once 427 428 static au32 nr_waiting = 0; 429 const u32 seq = atomic_fetch_add_explicit(&nr_waiting, 1, MO_RELAXED); 430 if (seq == 0) { 431 sprintf(buf, "/run/user/%u/.debug_wait_gdb_pid", getuid()); 432 const int pidfd = open(buf, O_CREAT|O_TRUNC|O_WRONLY, 00644); 433 if (pidfd >= 0) { 434 dprintf(pidfd, "%u", getpid()); 435 close(pidfd); 436 } 437 } 438 439 #pragma nounroll 440 while (atomic_load_explicit(&v, MO_CONSUME)) 441 sleep(1); 442 } 443 444 #ifndef NDEBUG 445 void 446 debug_assert(const bool v) 447 { 448 if (!v) 449 debug_wait_gdb(debug_bt_state); 450 } 451 #endif 452 453 __attribute__((noreturn)) 454 void 455 debug_die(void) 456 { 457 debug_wait_gdb(debug_bt_state); 458 exit(0); 459 } 460 461 __attribute__((noreturn)) 462 void 463 debug_die_perror(void) 464 { 465 perror(NULL); 466 debug_die(); 467 } 468 469 #if !defined(NOSIGNAL) 470 // signal handler for wait_gdb on fatal errors 471 static void 472 wait_gdb_handler(const int sig, siginfo_t * const info, void * const context) 473 { 474 (void)info; 475 (void)context; 476 char buf[64] = "[SIGNAL] "; 477 strcat(buf, strsignal(sig)); 478 write(2, buf, strlen(buf)); 479 debug_wait_gdb(NULL); 480 } 481 482 // setup hooks for catching fatal errors 483 __attribute__((constructor)) 484 static void 485 debug_init(void) 486 { 487 void * stack = pages_alloc_4kb(16); 488 //fprintf(stderr, "altstack %p\n", stack); 489 stack_t ss = {.ss_sp = stack, .ss_flags = 0, .ss_size = PGSZ*16}; 490 if (sigaltstack(&ss, NULL)) 491 fprintf(stderr, "sigaltstack failed\n"); 492 493 struct sigaction sa = {.sa_sigaction = wait_gdb_handler, .sa_flags = SA_SIGINFO | SA_ONSTACK}; 494 sigemptyset(&(sa.sa_mask)); 495 const int fatals[] = {SIGSEGV, SIGFPE, SIGILL, SIGBUS, 0}; 496 for 
(int i = 0; fatals[i]; i++) { 497 if (sigaction(fatals[i], &sa, NULL) == -1) { 498 fprintf(stderr, "Failed to set signal handler for %s\n", strsignal(fatals[i])); 499 fflush(stderr); 500 } 501 } 502 } 503 504 __attribute__((destructor)) 505 static void 506 debug_exit(void) 507 { 508 // to get rid of valgrind warnings 509 stack_t ss = {.ss_flags = SS_DISABLE}; 510 stack_t oss = {}; 511 sigaltstack(&ss, &oss); 512 if (oss.ss_sp) 513 pages_unmap(oss.ss_sp, PGSZ * 16); 514 } 515 #endif // !defined(NOSIGNAL) 516 517 void 518 debug_dump_maps(FILE * const out) 519 { 520 FILE * const in = fopen("/proc/self/smaps", "r"); 521 char * line0 = yalloc(1024); 522 size_t size0 = 1024; 523 while (!feof(in)) { 524 const ssize_t r1 = getline(&line0, &size0, in); 525 if (r1 < 0) break; 526 fprintf(out, "%s", line0); 527 } 528 fflush(out); 529 fclose(in); 530 } 531 532 static pid_t perf_pid = 0; 533 534 #if defined(__linux__) 535 __attribute__((constructor)) 536 static void 537 debug_perf_init(void) 538 { 539 const pid_t ppid = getppid(); 540 char tmp[256] = {}; 541 sprintf(tmp, "/proc/%d/cmdline", ppid); 542 FILE * const fc = fopen(tmp, "r"); 543 const size_t nr = fread(tmp, 1, sizeof(tmp) - 1, fc); 544 fclose(fc); 545 // look for "perf record" 546 if (nr < 12) 547 return; 548 tmp[nr] = '\0'; 549 for (u64 i = 0; i < nr; i++) 550 if (tmp[i] == 0) 551 tmp[i] = ' '; 552 553 char * const perf = strstr(tmp, "perf record"); 554 if (perf) { 555 fprintf(stderr, "%s: perf detected\n", __func__); 556 perf_pid = ppid; 557 } 558 } 559 #endif // __linux__ 560 561 bool 562 debug_perf_switch(void) 563 { 564 if (perf_pid > 0) { 565 kill(perf_pid, SIGUSR2); 566 return true; 567 } else { 568 return false; 569 } 570 } 571 // }}} debug 572 573 // mm {{{ 574 #ifdef ALLOCFAIL 575 bool 576 alloc_fail(void) 577 { 578 #define ALLOCFAIL_RECP ((64lu)) 579 #define ALLOCFAIL_MAGIC ((ALLOCFAIL_RECP / 3lu)) 580 return ((random_u64() % ALLOCFAIL_RECP) == ALLOCFAIL_MAGIC); 581 } 582 583 #ifdef MALLOCFAIL 584 extern void * __libc_malloc(size_t size); 585 void * 586 malloc(size_t size) 587 { 588 if (alloc_fail()) 589 return NULL; 590 return __libc_malloc(size); 591 } 592 593 extern void * __libc_calloc(size_t nmemb, size_t size); 594 void * 595 calloc(size_t nmemb, size_t size) 596 { 597 if (alloc_fail()) 598 return NULL; 599 return __libc_calloc(nmemb, size); 600 } 601 602 extern void *__libc_realloc(void *ptr, size_t size); 603 604 void * 605 realloc(void *ptr, size_t size) 606 { 607 if (alloc_fail()) 608 return NULL; 609 return __libc_realloc(ptr, size); 610 } 611 #endif // MALLOC_FAIL 612 #endif // ALLOC_FAIL 613 614 void * 615 xalloc(const size_t align, const size_t size) 616 { 617 #ifdef ALLOCFAIL 618 if (alloc_fail()) 619 return NULL; 620 #endif 621 void * p; 622 return (posix_memalign(&p, align, size) == 0) ? p : NULL; 623 } 624 625 // alloc cache-line aligned address 626 void * 627 yalloc(const size_t size) 628 { 629 #ifdef ALLOCFAIL 630 if (alloc_fail()) 631 return NULL; 632 #endif 633 void * p; 634 return (posix_memalign(&p, 64, size) == 0) ? 
p : NULL; 635 } 636 637 void ** 638 malloc_2d(const size_t nr, const size_t size) 639 { 640 const size_t size1 = nr * sizeof(void *); 641 const size_t size2 = nr * size; 642 void ** const mem = malloc(size1 + size2); 643 u8 * const mem2 = ((u8 *)mem) + size1; 644 for (size_t i = 0; i < nr; i++) 645 mem[i] = mem2 + (i * size); 646 return mem; 647 } 648 649 inline void ** 650 calloc_2d(const size_t nr, const size_t size) 651 { 652 void ** const ret = malloc_2d(nr, size); 653 memset(ret[0], 0, nr * size); 654 return ret; 655 } 656 657 inline void 658 pages_unmap(void * const ptr, const size_t size) 659 { 660 #ifndef HEAPCHECKING 661 munmap(ptr, size); 662 #else 663 (void)size; 664 free(ptr); 665 #endif 666 } 667 668 void 669 pages_lock(void * const ptr, const size_t size) 670 { 671 static bool use_mlock = true; 672 if (use_mlock) { 673 const int ret = mlock(ptr, size); 674 if (ret != 0) { 675 use_mlock = false; 676 fprintf(stderr, "%s: mlock disabled\n", __func__); 677 } 678 } 679 } 680 681 #ifndef HEAPCHECKING 682 static void * 683 pages_do_alloc(const size_t size, const int flags) 684 { 685 // vi /etc/security/limits.conf 686 // * - memlock unlimited 687 void * const p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); 688 if (p == MAP_FAILED) 689 return NULL; 690 691 pages_lock(p, size); 692 return p; 693 } 694 695 #if defined(__linux__) && defined(MAP_HUGETLB) 696 697 #if defined(MAP_HUGE_SHIFT) 698 #define PAGES_FLAGS_1G ((MAP_HUGETLB | (30 << MAP_HUGE_SHIFT))) 699 #define PAGES_FLAGS_2M ((MAP_HUGETLB | (21 << MAP_HUGE_SHIFT))) 700 #else // MAP_HUGE_SHIFT 701 #define PAGES_FLAGS_1G ((MAP_HUGETLB)) 702 #define PAGES_FLAGS_2M ((MAP_HUGETLB)) 703 #endif // MAP_HUGE_SHIFT 704 705 #else 706 #define PAGES_FLAGS_1G ((0)) 707 #define PAGES_FLAGS_2M ((0)) 708 #endif // __linux__ 709 710 #endif // HEAPCHECKING 711 712 inline void * 713 pages_alloc_1gb(const size_t nr_1gb) 714 { 715 const u64 sz = nr_1gb << 30; 716 #ifndef HEAPCHECKING 717 return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_1G); 718 #else 719 void * const p = xalloc(1lu << 21, sz); // Warning: valgrind fails with 30 720 if (p) 721 memset(p, 0, sz); 722 return p; 723 #endif 724 } 725 726 inline void * 727 pages_alloc_2mb(const size_t nr_2mb) 728 { 729 const u64 sz = nr_2mb << 21; 730 #ifndef HEAPCHECKING 731 return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_2M); 732 #else 733 void * const p = xalloc(1lu << 21, sz); 734 if (p) 735 memset(p, 0, sz); 736 return p; 737 #endif 738 } 739 740 inline void * 741 pages_alloc_4kb(const size_t nr_4kb) 742 { 743 const size_t sz = nr_4kb << 12; 744 #ifndef HEAPCHECKING 745 return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS); 746 #else 747 void * const p = xalloc(1lu << 12, sz); 748 if (p) 749 memset(p, 0, sz); 750 return p; 751 #endif 752 } 753 754 void * 755 pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out) 756 { 757 #ifdef ALLOCFAIL 758 if (alloc_fail()) 759 return NULL; 760 #endif 761 // 1gb huge page: at least 0.25GB 762 if (try_1gb) { 763 if (size >= (1lu << 28)) { 764 const size_t nr_1gb = bits_round_up(size, 30) >> 30; 765 void * const p1 = pages_alloc_1gb(nr_1gb); 766 if (p1) { 767 *size_out = nr_1gb << 30; 768 return p1; 769 } 770 } 771 } 772 773 // 2mb huge page: at least 0.5MB 774 if (size >= (1lu << 19)) { 775 const size_t nr_2mb = bits_round_up(size, 21) >> 21; 776 void * const p2 = pages_alloc_2mb(nr_2mb); 777 if (p2) { 778 *size_out = nr_2mb << 21; 779 return p2; 780 } 781 } 782 783 const size_t nr_4kb = 
bits_round_up(size, 12) >> 12; 784 void * const p3 = pages_alloc_4kb(nr_4kb); 785 if (p3) 786 *size_out = nr_4kb << 12; 787 return p3; 788 } 789 // }}} mm 790 791 // process/thread {{{ 792 static u32 process_ncpu; 793 #if defined(__FreeBSD__) 794 typedef cpuset_t cpu_set_t; 795 #elif defined(__APPLE__) && defined(__MACH__) 796 typedef u64 cpu_set_t; 797 #define CPU_SETSIZE ((64)) 798 #define CPU_COUNT(__cpu_ptr__) (__builtin_popcountl(*__cpu_ptr__)) 799 #define CPU_ISSET(__cpu_idx__, __cpu_ptr__) (((*__cpu_ptr__) >> __cpu_idx__) & 1lu) 800 #define CPU_ZERO(__cpu_ptr__) ((*__cpu_ptr__) = 0) 801 #define CPU_SET(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) |= (1lu << __cpu_idx__)) 802 #define CPU_CLR(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) &= ~(1lu << __cpu_idx__)) 803 #define pthread_attr_setaffinity_np(...) ((void)0) 804 #endif 805 806 __attribute__((constructor)) 807 static void 808 process_init(void) 809 { 810 // Linux's default is 1024 cpus 811 process_ncpu = (u32)sysconf(_SC_NPROCESSORS_CONF); 812 if (process_ncpu > CPU_SETSIZE) { 813 fprintf(stderr, "%s: can use only %zu cores\n", 814 __func__, (size_t)CPU_SETSIZE); 815 process_ncpu = CPU_SETSIZE; 816 } 817 thread_set_name(pthread_self(), "main"); 818 } 819 820 static inline int 821 thread_getaffinity_set(cpu_set_t * const cpuset) 822 { 823 #if defined(__linux__) 824 return sched_getaffinity(0, sizeof(*cpuset), cpuset); 825 #elif defined(__FreeBSD__) 826 return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); 827 #elif defined(__APPLE__) && defined(__MACH__) 828 *cpuset = (1lu << process_ncpu) - 1; 829 return (int)process_ncpu; // TODO 830 #endif // OS 831 } 832 833 static inline int 834 thread_setaffinity_set(const cpu_set_t * const cpuset) 835 { 836 #if defined(__linux__) 837 return sched_setaffinity(0, sizeof(*cpuset), cpuset); 838 #elif defined(__FreeBSD__) 839 return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); 840 #elif defined(__APPLE__) && defined(__MACH__) 841 (void)cpuset; // TODO 842 return 0; 843 #endif // OS 844 } 845 846 void 847 thread_get_name(const pthread_t pt, char * const name, const size_t len) 848 { 849 #if defined(__linux__) 850 pthread_getname_np(pt, name, len); 851 #elif defined(__FreeBSD__) 852 pthread_get_name_np(pt, name, len); 853 #elif defined(__APPLE__) && defined(__MACH__) 854 (void)pt; 855 (void)len; 856 strcpy(name, "unknown"); // TODO 857 #endif // OS 858 } 859 860 void 861 thread_set_name(const pthread_t pt, const char * const name) 862 { 863 #if defined(__linux__) 864 pthread_setname_np(pt, name); 865 #elif defined(__FreeBSD__) 866 pthread_set_name_np(pt, name); 867 #elif defined(__APPLE__) && defined(__MACH__) 868 (void)pt; 869 (void)name; // TODO 870 #endif // OS 871 } 872 873 // kB 874 long 875 process_get_rss(void) 876 { 877 struct rusage rs; 878 getrusage(RUSAGE_SELF, &rs); 879 return rs.ru_maxrss; 880 } 881 882 u32 883 process_affinity_count(void) 884 { 885 cpu_set_t set; 886 if (thread_getaffinity_set(&set) != 0) 887 return process_ncpu; 888 889 const u32 nr = (u32)CPU_COUNT(&set); 890 return nr ? nr : process_ncpu; 891 } 892 893 u32 894 process_getaffinity_list(const u32 max, u32 * const cores) 895 { 896 memset(cores, 0, max * sizeof(cores[0])); 897 cpu_set_t set; 898 if (thread_getaffinity_set(&set) != 0) 899 return 0; 900 901 const u32 nr_affinity = (u32)CPU_COUNT(&set); 902 const u32 nr = nr_affinity < max ? 
nr_affinity : max; 903 u32 j = 0; 904 for (u32 i = 0; i < process_ncpu; i++) { 905 if (CPU_ISSET(i, &set)) 906 cores[j++] = i; 907 908 if (j >= nr) 909 break; 910 } 911 return j; 912 } 913 914 void 915 thread_setaffinity_list(const u32 nr, const u32 * const list) 916 { 917 cpu_set_t set; 918 CPU_ZERO(&set); 919 for (u32 i = 0; i < nr; i++) 920 if (list[i] < process_ncpu) 921 CPU_SET(list[i], &set); 922 thread_setaffinity_set(&set); 923 } 924 925 void 926 thread_pin(const u32 cpu) 927 { 928 cpu_set_t set; 929 CPU_ZERO(&set); 930 CPU_SET(cpu % process_ncpu, &set); 931 thread_setaffinity_set(&set); 932 } 933 934 u64 935 process_cpu_time_usec(void) 936 { 937 struct rusage rs; 938 getrusage(RUSAGE_SELF, &rs); 939 const u64 usr = (((u64)rs.ru_utime.tv_sec) * 1000000lu) + ((u64)rs.ru_utime.tv_usec); 940 const u64 sys = (((u64)rs.ru_stime.tv_sec) * 1000000lu) + ((u64)rs.ru_stime.tv_usec); 941 return usr + sys; 942 } 943 944 struct fork_join_info { 945 u32 total; 946 u32 ncores; 947 u32 * cores; 948 void *(*func)(void *); 949 bool args; 950 union { 951 void * arg1; 952 void ** argn; 953 }; 954 union { 955 struct { volatile au32 ferr, jerr; }; 956 volatile au64 xerr; 957 }; 958 }; 959 960 // DON'T CHANGE! 961 #define FORK_JOIN_RANK_BITS ((16)) // 16 962 #define FORK_JOIN_MAX ((1u << FORK_JOIN_RANK_BITS)) 963 964 /* 965 * fj(6): T0 966 * / \ 967 * T0 T4 968 * / \ / 969 * T0 T2 T4 970 * / \ / \ / \ 971 * t0 t1 t2 t3 t4 t5 972 */ 973 974 // recursive tree fork-join 975 static void * 976 thread_do_fork_join_worker(void * const ptr) 977 { 978 struct entry13 fjp = {.ptr = ptr}; 979 // GCC: Without explicitly casting from fjp.fji (a 45-bit u64 value), 980 // the high bits will get truncated, which is always CORRECT in gcc. 981 // Don't use gcc. 982 struct fork_join_info * const fji = u64_to_ptr(fjp.e3); 983 const u32 rank = (u32)fjp.e1; 984 985 const u32 nchild = (u32)__builtin_ctz(rank ? rank : bits_p2_up_u32(fji->total)); 986 debug_assert(nchild <= FORK_JOIN_RANK_BITS); 987 pthread_t tids[FORK_JOIN_RANK_BITS]; 988 if (nchild) { 989 cpu_set_t set; 990 CPU_ZERO(&set); 991 pthread_attr_t attr; 992 pthread_attr_init(&attr); 993 //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Joinable by default 994 // fork top-down 995 for (u32 i = nchild - 1; i < nchild; i--) { 996 const u32 cr = rank + (1u << i); // child's rank 997 if (cr >= fji->total) 998 continue; // should not break 999 const u32 core = fji->cores[(cr < fji->ncores) ? cr : (cr % fji->ncores)]; 1000 CPU_SET(core, &set); 1001 pthread_attr_setaffinity_np(&attr, sizeof(set), &set); 1002 fjp.e1 = (u16)cr; 1003 const int r = pthread_create(&tids[i], &attr, thread_do_fork_join_worker, fjp.ptr); 1004 CPU_CLR(core, &set); 1005 if (unlikely(r)) { // fork failed 1006 memset(&tids[0], 0, sizeof(tids[0]) * (i+1)); 1007 u32 nmiss = (1u << (i + 1)) - 1; 1008 if ((rank + nmiss) >= fji->total) 1009 nmiss = fji->total - 1 - rank; 1010 (void)atomic_fetch_add_explicit(&fji->ferr, nmiss, MO_RELAXED); 1011 break; 1012 } 1013 } 1014 pthread_attr_destroy(&attr); 1015 } 1016 1017 char thname0[16]; 1018 char thname1[16]; 1019 thread_get_name(pthread_self(), thname0, 16); 1020 snprintf(thname1, 16, "%.8s_%u", thname0, rank); 1021 thread_set_name(pthread_self(), thname1); 1022 1023 void * const ret = fji->func(fji->args ? 
fji->argn[rank] : fji->arg1); 1024 1025 thread_set_name(pthread_self(), thname0); 1026 // join bottom-up 1027 for (u32 i = 0; i < nchild; i++) { 1028 const u32 cr = rank + (1u << i); // child rank 1029 if (cr >= fji->total) 1030 break; // safe to break 1031 if (tids[i]) { 1032 const int r = pthread_join(tids[i], NULL); 1033 if (unlikely(r)) { // error 1034 //fprintf(stderr, "pthread_join %u..%u = %d: %s\n", rank, cr, r, strerror(r)); 1035 (void)atomic_fetch_add_explicit(&fji->jerr, 1, MO_RELAXED); 1036 } 1037 } 1038 } 1039 return ret; 1040 } 1041 1042 u64 1043 thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx) 1044 { 1045 if (unlikely(nr > FORK_JOIN_MAX)) { 1046 fprintf(stderr, "%s reduce nr to %u\n", __func__, FORK_JOIN_MAX); 1047 nr = FORK_JOIN_MAX; 1048 } 1049 1050 u32 cores[CPU_SETSIZE]; 1051 u32 ncores = process_getaffinity_list(process_ncpu, cores); 1052 if (unlikely(ncores == 0)) { // force to use all cores 1053 ncores = process_ncpu; 1054 for (u32 i = 0; i < process_ncpu; i++) 1055 cores[i] = i; 1056 } 1057 if (unlikely(nr == 0)) 1058 nr = ncores; 1059 1060 // the compiler does not know fji can change since we cast &fji into fjp 1061 struct fork_join_info fji = {.total = nr, .cores = cores, .ncores = ncores, 1062 .func = func, .args = args, .arg1 = argx}; 1063 const struct entry13 fjp = entry13(0, (u64)(&fji)); 1064 1065 // save current affinity 1066 cpu_set_t set0; 1067 thread_getaffinity_set(&set0); 1068 1069 // master thread shares thread0's core 1070 cpu_set_t set; 1071 CPU_ZERO(&set); 1072 CPU_SET(fji.cores[0], &set); 1073 thread_setaffinity_set(&set); 1074 1075 const u64 t0 = time_nsec(); 1076 (void)thread_do_fork_join_worker(fjp.ptr); 1077 const u64 dt = time_diff_nsec(t0); 1078 1079 // restore original affinity 1080 thread_setaffinity_set(&set0); 1081 1082 // check and report errors (unlikely) 1083 if (atomic_load_explicit(&fji.xerr, MO_CONSUME)) 1084 fprintf(stderr, "%s errors: fork %u join %u\n", __func__, fji.ferr, fji.jerr); 1085 return dt; 1086 } 1087 1088 int 1089 thread_create_at(const u32 cpu, pthread_t * const thread, 1090 void *(*start_routine) (void *), void * const arg) 1091 { 1092 const u32 cpu_id = (cpu < process_ncpu) ? 
cpu : (cpu % process_ncpu); 1093 pthread_attr_t attr; 1094 pthread_attr_init(&attr); 1095 //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 1096 cpu_set_t set; 1097 1098 CPU_ZERO(&set); 1099 CPU_SET(cpu_id, &set); 1100 pthread_attr_setaffinity_np(&attr, sizeof(set), &set); 1101 const int r = pthread_create(thread, &attr, start_routine, arg); 1102 pthread_attr_destroy(&attr); 1103 return r; 1104 } 1105 // }}} process/thread 1106 1107 // locking {{{ 1108 1109 // spinlock {{{ 1110 #if defined(__linux__) 1111 #define SPINLOCK_PTHREAD 1112 #endif // __linux__ 1113 1114 #if defined(SPINLOCK_PTHREAD) 1115 static_assert(sizeof(pthread_spinlock_t) <= sizeof(spinlock), "spinlock size"); 1116 #else // SPINLOCK_PTHREAD 1117 static_assert(sizeof(au32) <= sizeof(spinlock), "spinlock size"); 1118 #endif // SPINLOCK_PTHREAD 1119 1120 void 1121 spinlock_init(spinlock * const lock) 1122 { 1123 #if defined(SPINLOCK_PTHREAD) 1124 pthread_spinlock_t * const p = (typeof(p))lock; 1125 pthread_spin_init(p, PTHREAD_PROCESS_PRIVATE); 1126 #else // SPINLOCK_PTHREAD 1127 au32 * const p = (typeof(p))lock; 1128 atomic_store_explicit(p, 0, MO_RELEASE); 1129 #endif // SPINLOCK_PTHREAD 1130 } 1131 1132 inline void 1133 spinlock_lock(spinlock * const lock) 1134 { 1135 #if defined(CORR) 1136 #pragma nounroll 1137 while (!spinlock_trylock(lock)) 1138 corr_yield(); 1139 #else // CORR 1140 #if defined(SPINLOCK_PTHREAD) 1141 pthread_spinlock_t * const p = (typeof(p))lock; 1142 pthread_spin_lock(p); // return value ignored 1143 #else // SPINLOCK_PTHREAD 1144 au32 * const p = (typeof(p))lock; 1145 #pragma nounroll 1146 do { 1147 if (atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0) 1148 return; 1149 #pragma nounroll 1150 do { 1151 cpu_pause(); 1152 } while (atomic_load_explicit(p, MO_CONSUME)); 1153 } while (true); 1154 #endif // SPINLOCK_PTHREAD 1155 #endif // CORR 1156 } 1157 1158 inline bool 1159 spinlock_trylock(spinlock * const lock) 1160 { 1161 #if defined(SPINLOCK_PTHREAD) 1162 pthread_spinlock_t * const p = (typeof(p))lock; 1163 return !pthread_spin_trylock(p); 1164 #else // SPINLOCK_PTHREAD 1165 au32 * const p = (typeof(p))lock; 1166 return atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0; 1167 #endif // SPINLOCK_PTHREAD 1168 } 1169 1170 inline void 1171 spinlock_unlock(spinlock * const lock) 1172 { 1173 #if defined(SPINLOCK_PTHREAD) 1174 pthread_spinlock_t * const p = (typeof(p))lock; 1175 pthread_spin_unlock(p); // return value ignored 1176 #else // SPINLOCK_PTHREAD 1177 au32 * const p = (typeof(p))lock; 1178 atomic_store_explicit(p, 0, MO_RELEASE); 1179 #endif // SPINLOCK_PTHREAD 1180 } 1181 // }}} spinlock 1182 1183 // pthread mutex {{{ 1184 static_assert(sizeof(pthread_mutex_t) <= sizeof(mutex), "mutexlock size"); 1185 inline void 1186 mutex_init(mutex * const lock) 1187 { 1188 pthread_mutex_t * const p = (typeof(p))lock; 1189 pthread_mutex_init(p, NULL); 1190 } 1191 1192 inline void 1193 mutex_lock(mutex * const lock) 1194 { 1195 #if defined(CORR) 1196 #pragma nounroll 1197 while (!mutex_trylock(lock)) 1198 corr_yield(); 1199 #else 1200 pthread_mutex_t * const p = (typeof(p))lock; 1201 pthread_mutex_lock(p); // return value ignored 1202 #endif 1203 } 1204 1205 inline bool 1206 mutex_trylock(mutex * const lock) 1207 { 1208 pthread_mutex_t * const p = (typeof(p))lock; 1209 return !pthread_mutex_trylock(p); // return value ignored 1210 } 1211 1212 inline void 1213 mutex_unlock(mutex * const lock) 1214 { 1215 pthread_mutex_t * const p = (typeof(p))lock; 1216 pthread_mutex_unlock(p); // return value ignored 
1217 } 1218 1219 inline void 1220 mutex_deinit(mutex * const lock) 1221 { 1222 pthread_mutex_t * const p = (typeof(p))lock; 1223 pthread_mutex_destroy(p); 1224 } 1225 // }}} pthread mutex 1226 1227 // rwdep {{{ 1228 // poor man's lockdep for rwlock 1229 // per-thread lock list 1230 // it calls debug_die() when local double-(un)locking is detected 1231 // cyclic dependencies can be manually identified by looking at the two lists below in gdb 1232 #ifdef RWDEP 1233 #define RWDEP_NR ((16)) 1234 __thread const rwlock * rwdep_readers[RWDEP_NR] = {}; 1235 __thread const rwlock * rwdep_writers[RWDEP_NR] = {}; 1236 1237 static void 1238 rwdep_check(const rwlock * const lock) 1239 { 1240 debug_assert(lock); 1241 for (u64 i = 0; i < RWDEP_NR; i++) { 1242 if (rwdep_readers[i] == lock) 1243 debug_die(); 1244 if (rwdep_writers[i] == lock) 1245 debug_die(); 1246 } 1247 } 1248 #endif // RWDEP 1249 1250 static void 1251 rwdep_lock_read(const rwlock * const lock) 1252 { 1253 #ifdef RWDEP 1254 rwdep_check(lock); 1255 for (u64 i = 0; i < RWDEP_NR; i++) { 1256 if (rwdep_readers[i] == NULL) { 1257 rwdep_readers[i] = lock; 1258 return; 1259 } 1260 } 1261 #else 1262 (void)lock; 1263 #endif // RWDEP 1264 } 1265 1266 static void 1267 rwdep_unlock_read(const rwlock * const lock) 1268 { 1269 #ifdef RWDEP 1270 for (u64 i = 0; i < RWDEP_NR; i++) { 1271 if (rwdep_readers[i] == lock) { 1272 rwdep_readers[i] = NULL; 1273 return; 1274 } 1275 } 1276 debug_die(); 1277 #else 1278 (void)lock; 1279 #endif // RWDEP 1280 } 1281 1282 static void 1283 rwdep_lock_write(const rwlock * const lock) 1284 { 1285 #ifdef RWDEP 1286 rwdep_check(lock); 1287 for (u64 i = 0; i < RWDEP_NR; i++) { 1288 if (rwdep_writers[i] == NULL) { 1289 rwdep_writers[i] = lock; 1290 return; 1291 } 1292 } 1293 #else 1294 (void)lock; 1295 #endif // RWDEP 1296 } 1297 1298 static void 1299 rwdep_unlock_write(const rwlock * const lock) 1300 { 1301 #ifdef RWDEP 1302 for (u64 i = 0; i < RWDEP_NR; i++) { 1303 if (rwdep_writers[i] == lock) { 1304 rwdep_writers[i] = NULL; 1305 return; 1306 } 1307 } 1308 debug_die(); 1309 #else 1310 (void)lock; 1311 #endif // RWDEP 1312 } 1313 // }}} rwlockdep 1314 1315 // rwlock {{{ 1316 typedef au32 lock_t; 1317 typedef u32 lock_v; 1318 static_assert(sizeof(lock_t) == sizeof(lock_v), "lock size"); 1319 static_assert(sizeof(lock_t) <= sizeof(rwlock), "lock size"); 1320 1321 #define RWLOCK_WSHIFT ((sizeof(lock_t) * 8 - 1)) 1322 #define RWLOCK_WBIT ((((lock_v)1) << RWLOCK_WSHIFT)) 1323 1324 inline void 1325 rwlock_init(rwlock * const lock) 1326 { 1327 lock_t * const pvar = (typeof(pvar))lock; 1328 atomic_store_explicit(pvar, 0, MO_RELEASE); 1329 } 1330 1331 inline bool 1332 rwlock_trylock_read(rwlock * const lock) 1333 { 1334 lock_t * const pvar = (typeof(pvar))lock; 1335 if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { 1336 rwdep_lock_read(lock); 1337 return true; 1338 } else { 1339 atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); 1340 return false; 1341 } 1342 } 1343 1344 inline bool 1345 rwlock_trylock_read_lp(rwlock * const lock) 1346 { 1347 lock_t * const pvar = (typeof(pvar))lock; 1348 if (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) { 1349 cpu_pause(); 1350 return false; 1351 } 1352 return rwlock_trylock_read(lock); 1353 } 1354 1355 // actually nr + 1 1356 inline bool 1357 rwlock_trylock_read_nr(rwlock * const lock, u16 nr) 1358 { 1359 lock_t * const pvar = (typeof(pvar))lock; 1360 if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { 1361 rwdep_lock_read(lock); 1362 
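// the fetch_add above already registered this reader and the old value had
// no writer bit set, so the read lock is held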
return true; 1363 } 1364 1365 #pragma nounroll 1366 do { // someone already locked; wait for a little while 1367 cpu_pause(); 1368 if ((atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) == 0) { 1369 rwdep_lock_read(lock); 1370 return true; 1371 } 1372 } while (nr--); 1373 1374 atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); 1375 return false; 1376 } 1377 1378 inline void 1379 rwlock_lock_read(rwlock * const lock) 1380 { 1381 lock_t * const pvar = (typeof(pvar))lock; 1382 #pragma nounroll 1383 do { 1384 if (rwlock_trylock_read(lock)) 1385 return; 1386 #pragma nounroll 1387 do { 1388 #if defined(CORR) 1389 corr_yield(); 1390 #else 1391 cpu_pause(); 1392 #endif 1393 } while (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT); 1394 } while (true); 1395 } 1396 1397 inline void 1398 rwlock_unlock_read(rwlock * const lock) 1399 { 1400 rwdep_unlock_read(lock); 1401 lock_t * const pvar = (typeof(pvar))lock; 1402 atomic_fetch_sub_explicit(pvar, 1, MO_RELEASE); 1403 } 1404 1405 inline bool 1406 rwlock_trylock_write(rwlock * const lock) 1407 { 1408 lock_t * const pvar = (typeof(pvar))lock; 1409 lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); 1410 if ((v0 == 0) && atomic_compare_exchange_weak_explicit(pvar, &v0, RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { 1411 rwdep_lock_write(lock); 1412 return true; 1413 } else { 1414 return false; 1415 } 1416 } 1417 1418 // actually nr + 1 1419 inline bool 1420 rwlock_trylock_write_nr(rwlock * const lock, u16 nr) 1421 { 1422 #pragma nounroll 1423 do { 1424 if (rwlock_trylock_write(lock)) 1425 return true; 1426 cpu_pause(); 1427 } while (nr--); 1428 return false; 1429 } 1430 1431 inline void 1432 rwlock_lock_write(rwlock * const lock) 1433 { 1434 lock_t * const pvar = (typeof(pvar))lock; 1435 #pragma nounroll 1436 do { 1437 if (rwlock_trylock_write(lock)) 1438 return; 1439 #pragma nounroll 1440 do { 1441 #if defined(CORR) 1442 corr_yield(); 1443 #else 1444 cpu_pause(); 1445 #endif 1446 } while (atomic_load_explicit(pvar, MO_CONSUME)); 1447 } while (true); 1448 } 1449 1450 inline bool 1451 rwlock_trylock_write_hp(rwlock * const lock) 1452 { 1453 lock_t * const pvar = (typeof(pvar))lock; 1454 lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); 1455 if (v0 >> RWLOCK_WSHIFT) 1456 return false; 1457 1458 if (atomic_compare_exchange_weak_explicit(pvar, &v0, v0|RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { 1459 rwdep_lock_write(lock); 1460 // WBIT successfully marked; must wait for readers to leave 1461 if (v0) { // saw active readers 1462 #pragma nounroll 1463 while (atomic_load_explicit(pvar, MO_CONSUME) != RWLOCK_WBIT) { 1464 #if defined(CORR) 1465 corr_yield(); 1466 #else 1467 cpu_pause(); 1468 #endif 1469 } 1470 } 1471 return true; 1472 } else { 1473 return false; 1474 } 1475 } 1476 1477 inline bool 1478 rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr) 1479 { 1480 #pragma nounroll 1481 do { 1482 if (rwlock_trylock_write_hp(lock)) 1483 return true; 1484 cpu_pause(); 1485 } while (nr--); 1486 return false; 1487 } 1488 1489 inline void 1490 rwlock_lock_write_hp(rwlock * const lock) 1491 { 1492 #pragma nounroll 1493 while (!rwlock_trylock_write_hp(lock)) { 1494 #if defined(CORR) 1495 corr_yield(); 1496 #else 1497 cpu_pause(); 1498 #endif 1499 } 1500 } 1501 1502 inline void 1503 rwlock_unlock_write(rwlock * const lock) 1504 { 1505 rwdep_unlock_write(lock); 1506 lock_t * const pvar = (typeof(pvar))lock; 1507 atomic_fetch_sub_explicit(pvar, RWLOCK_WBIT, MO_RELEASE); 1508 } 1509 1510 inline void 1511 rwlock_write_to_read(rwlock * const lock) 1512 { 1513 
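// downgrade a held write lock to a read lock: a single atomic fetch_add
// adds one reader and clears the writer bit, so no competing writer can
// acquire the lock in between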
rwdep_unlock_write(lock); 1514 rwdep_lock_read(lock); 1515 lock_t * const pvar = (typeof(pvar))lock; 1516 // +R -W 1517 atomic_fetch_add_explicit(pvar, ((lock_v)1) - RWLOCK_WBIT, MO_ACQ_REL); 1518 } 1519 1520 #undef RWLOCK_WSHIFT 1521 #undef RWLOCK_WBIT 1522 // }}} rwlock 1523 1524 // }}} locking 1525 1526 // coroutine {{{ 1527 1528 extern u64 co_switch_stack(u64 * const saversp, const u64 newrsp, const u64 retval) { 1529 // asm {{{ 1530 #if defined(__x86_64__) 1531 // number pushes in co_switch_stack 1532 #define CO_CONTEXT_SIZE ((6)) 1533 1534 // for switch/exit: pass a return value to the target 1535 asm ( 1536 ".align 16;" 1537 #if defined(__linux__) || defined(__FreeBSD__) 1538 ".global co_switch_stack;" 1539 ".type co_switch_stack, @function;" 1540 "co_switch_stack:" 1541 #elif defined(__APPLE__) && defined(__MACH__) 1542 ".global _co_switch_stack;" 1543 "_co_switch_stack:" 1544 #else 1545 #error Supported platforms: Linux/FreeBSD/Apple 1546 #endif // OS 1547 "push %rbp; push %rbx; push %r12;" 1548 "push %r13; push %r14; push %r15;" 1549 "mov %rsp, (%rdi);" 1550 "mov %rsi, %rsp;" 1551 "pop %r15; pop %r14; pop %r13;" 1552 "pop %r12; pop %rbx; pop %rbp;" 1553 "mov %rdx, %rax;" 1554 "retq;" 1555 ); 1556 1557 #elif defined(__aarch64__) 1558 // number pushes in co_switch_stack 1559 #define CO_CONTEXT_SIZE ((20)) 1560 asm ( 1561 ".align 16;" 1562 #if defined(__linux__) || defined(__FreeBSD__) 1563 ".global co_switch_stack;" 1564 ".type co_switch_stack, @function;" 1565 "co_switch_stack:" 1566 #elif defined(__APPLE__) && defined(__MACH__) 1567 ".global _co_switch_stack;" 1568 "_co_switch_stack:" 1569 #else 1570 #error supported platforms: Linux/FreeBSD/Apple 1571 #endif // OS 1572 "sub x8, sp, 160;" 1573 "str x8, [x0];" 1574 "stp x30, x19, [x8]; ldp x30, x19, [x1];" 1575 "stp x20, x21, [x8, 16]; ldp x20, x21, [x1, 16];" 1576 "stp x22, x23, [x8, 32]; ldp x22, x23, [x1, 32];" 1577 "stp x24, x25, [x8, 48]; ldp x24, x25, [x1, 48];" 1578 "stp x26, x27, [x8, 64]; ldp x26, x27, [x1, 64];" 1579 "stp x28, x29, [x8, 80]; ldp x28, x29, [x1, 80];" 1580 "stp d8, d9, [x8, 96]; ldp d8, d9, [x1, 96];" 1581 "stp d10, d11, [x8, 112]; ldp d10, d11, [x1, 112];" 1582 "stp d12, d13, [x8, 128]; ldp d12, d13, [x1, 128];" 1583 "stp d14, d15, [x8, 144]; ldp d14, d15, [x1, 144];" 1584 "add sp, x1, 160;" 1585 "mov x0, x2;" 1586 "br x30;" 1587 ); 1588 } 1589 1590 extern void co_entry_aarch64(void) { 1591 asm ( 1592 ".align 16;" 1593 #if defined(__linux__) || defined(__FreeBSD__) 1594 ".global co_entry_aarch64;" 1595 ".type co_entry_aarch64, @function;" 1596 "co_entry_aarch64:" 1597 #elif defined(__APPLE__) && defined(__MACH__) 1598 ".global _co_entry_aarch64;" 1599 "_co_entry_aarch64:" 1600 #else 1601 #error supported platforms: Linux/FreeBSD/Apple 1602 #endif // OS 1603 "ldr x8, [sp, 0];" 1604 "blr x8;" 1605 "ldr x8, [sp, 8];" 1606 "blr x8;" 1607 "ldr x8, [sp, 16];" 1608 "blr x8;" 1609 ); 1610 #else 1611 #error supported CPUs: x86_64 or AArch64 1612 #endif // co_switch_stack x86_64 and aarch64 1613 // }}} asm 1614 } 1615 1616 // co {{{ 1617 struct co { 1618 u64 rsp; 1619 void * priv; 1620 u64 * host; // set host to NULL to exit 1621 size_t stksz; 1622 }; 1623 1624 static __thread struct co * volatile co_curr = NULL; // NULL in host 1625 1626 // the stack sits under the struct co 1627 static void 1628 co_init(struct co * const co, void * func, void * priv, u64 * const host, 1629 const u64 stksz, void * func_exit) 1630 { 1631 debug_assert((stksz & 0x3f) == 0); // a multiple of 64 bytes 1632 u64 * rsp = ((u64 *)co) - 4; 
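// top of the new stack, right below struct co: the entry function, the
// normal exit handler, and debug_die as a guard in case the exit handler
// ever returns; the CO_CONTEXT_SIZE slots reserved below them hold the
// callee-saved register context consumed by the first co_switch_stack
// into this coroutine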
1633 rsp[0] = (u64)func; 1634 rsp[1] = (u64)func_exit; 1635 rsp[2] = (u64)debug_die; 1636 rsp[3] = 0; 1637 1638 rsp -= CO_CONTEXT_SIZE; 1639 1640 #if defined(__aarch64__) 1641 rsp[0] = (u64)co_entry_aarch64; 1642 #endif 1643 1644 co->rsp = (u64)rsp; 1645 co->priv = priv; 1646 co->host = host; 1647 co->stksz = stksz; 1648 } 1649 1650 static void 1651 co_exit0(void) 1652 { 1653 co_exit(0); 1654 } 1655 1656 struct co * 1657 co_create(const u64 stacksize, void * func, void * priv, u64 * const host) 1658 { 1659 const u64 stksz = bits_round_up(stacksize, 6); 1660 const size_t alloc_size = stksz + sizeof(struct co); 1661 u8 * const mem = yalloc(alloc_size); 1662 if (mem == NULL) 1663 return NULL; 1664 1665 #ifdef CO_STACK_CHECK 1666 memset(mem, 0x5c, stksz); 1667 #endif // CO_STACK_CHECK 1668 1669 struct co * const co = (typeof(co))(mem + stksz); 1670 co_init(co, func, priv, host, stksz, co_exit0); 1671 return co; 1672 } 1673 1674 inline void 1675 co_reuse(struct co * const co, void * func, void * priv, u64 * const host) 1676 { 1677 co_init(co, func, priv, host, co->stksz, co_exit0); 1678 } 1679 1680 inline struct co * 1681 co_fork(void * func, void * priv) 1682 { 1683 return co_curr ? co_create(co_curr->stksz, func, priv, co_curr->host) : NULL; 1684 } 1685 1686 inline void * 1687 co_priv(void) 1688 { 1689 return co_curr ? co_curr->priv : NULL; 1690 } 1691 1692 // the host calls this to enter a coroutine. 1693 inline u64 1694 co_enter(struct co * const to, const u64 retval) 1695 { 1696 debug_assert(co_curr == NULL); // must entry from the host 1697 debug_assert(to && to->host); 1698 u64 * const save = to->host; 1699 co_curr = to; 1700 const u64 ret = co_switch_stack(save, to->rsp, retval); 1701 co_curr = NULL; 1702 return ret; 1703 } 1704 1705 // switch from a coroutine to another coroutine 1706 // co_curr must be valid 1707 // the target will resume and receive the retval 1708 inline u64 1709 co_switch_to(struct co * const to, const u64 retval) 1710 { 1711 debug_assert(co_curr); 1712 debug_assert(co_curr != to); 1713 debug_assert(to && to->host); 1714 struct co * const save = co_curr; 1715 co_curr = to; 1716 return co_switch_stack(&(save->rsp), to->rsp, retval); 1717 } 1718 1719 // switch from a coroutine to the host routine 1720 // co_yield is now a c++ keyword... 
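//
// Illustrative host-side sketch ("worker" and "arg" are hypothetical
// placeholders): the host drives a coroutine with co_enter() until the
// guest calls co_exit(), e.g.
//
//   u64 host_rsp = 0;
//   struct co * const co = co_create(4096, worker, arg, &host_rsp);
//   while (co_valid(co))
//     (void)co_enter(co, 0); // starts or resumes worker(); returns its co_back()/co_exit() value
//   co_destroy(co);
//
// inside worker(), co_priv() returns arg, co_back(v) yields v to the host,
// and co_exit(v) terminates the coroutine.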
1721 inline u64 1722 co_back(const u64 retval) 1723 { 1724 debug_assert(co_curr); 1725 struct co * const save = co_curr; 1726 co_curr = NULL; 1727 return co_switch_stack(&(save->rsp), *(save->host), retval); 1728 } 1729 1730 #ifdef CO_STACK_CHECK 1731 static void 1732 co_stack_check(const u8 * const mem, const u64 stksz) 1733 { 1734 const u64 * const mem64 = (typeof(mem64))mem; 1735 const u64 size64 = stksz / sizeof(u64); 1736 for (u64 i = 0; i < size64; i++) { 1737 if (mem64[i] != 0x5c5c5c5c5c5c5c5clu) { 1738 fprintf(stderr, "%s co stack usage: %lu/%lu\n", __func__, stksz - (i * sizeof(u64)), stksz); 1739 break; 1740 } 1741 } 1742 } 1743 #endif // CO_STACK_CHECK 1744 1745 // return to host and set host to NULL 1746 __attribute__((noreturn)) 1747 void 1748 co_exit(const u64 retval) 1749 { 1750 debug_assert(co_curr); 1751 #ifdef CO_STACK_CHECK 1752 const u64 stksz = co_curr->stksz; 1753 u8 * const mem = ((u8 *)co_curr) - stksz; 1754 co_stack_check(mem, stksz); 1755 #endif // CO_STACK_CHECK 1756 const u64 hostrsp = *(co_curr->host); 1757 co_curr->host = NULL; 1758 struct co * const save = co_curr; 1759 co_curr = NULL; 1760 (void)co_switch_stack(&(save->rsp), hostrsp, retval); 1761 // return to co_enter 1762 debug_die(); 1763 } 1764 1765 // host is set to NULL on exit 1766 inline bool 1767 co_valid(struct co * const co) 1768 { 1769 return co->host != NULL; 1770 } 1771 1772 // return NULL on host 1773 inline struct co * 1774 co_self(void) 1775 { 1776 return co_curr; 1777 } 1778 1779 inline void 1780 co_destroy(struct co * const co) 1781 { 1782 u8 * const mem = ((u8 *)co) - co->stksz; 1783 free(mem); 1784 } 1785 // }}} co 1786 1787 // corr {{{ 1788 struct corr { 1789 struct co co; 1790 struct corr * next; 1791 struct corr * prev; 1792 }; 1793 1794 // initial and link guest to the run-queue 1795 struct corr * 1796 corr_create(const u64 stacksize, void * func, void * priv, u64 * const host) 1797 { 1798 const u64 stksz = bits_round_up(stacksize, 6); 1799 const size_t alloc_size = stksz + sizeof(struct corr); 1800 u8 * const mem = yalloc(alloc_size); 1801 if (mem == NULL) 1802 return NULL; 1803 1804 #ifdef CO_STACK_CHECK 1805 memset(mem, 0x5c, stksz); 1806 #endif // CO_STACK_CHECK 1807 1808 struct corr * const co = (typeof(co))(mem + stksz); 1809 co_init(&(co->co), func, priv, host, stksz, corr_exit); 1810 co->next = co; 1811 co->prev = co; 1812 return co; 1813 } 1814 1815 struct corr * 1816 corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev) 1817 { 1818 const u64 stksz = bits_round_up(stacksize, 6); 1819 const size_t alloc_size = stksz + sizeof(struct corr); 1820 u8 * const mem = yalloc(alloc_size); 1821 if (mem == NULL) 1822 return NULL; 1823 1824 #ifdef CO_STACK_CHECK 1825 memset(mem, 0x5c, stksz); 1826 #endif // CO_STACK_CHECK 1827 1828 struct corr * const co = (typeof(co))(mem + stksz); 1829 co_init(&(co->co), func, priv, prev->co.host, stksz, corr_exit); 1830 co->next = prev->next; 1831 co->prev = prev; 1832 co->prev->next = co; 1833 co->next->prev = co; 1834 return co; 1835 } 1836 1837 inline void 1838 corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host) 1839 { 1840 co_init(&(co->co), func, priv, host, co->co.stksz, corr_exit); 1841 co->next = co; 1842 co->prev = co; 1843 } 1844 1845 inline void 1846 corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev) 1847 { 1848 co_init(&(co->co), func, priv, prev->co.host, co->co.stksz, corr_exit); 1849 co->next = prev->next; 1850 co->prev = prev; 1851 
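// splice co into the ring immediately after prev (same linking as corr_link)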
co->prev->next = co; 1852 co->next->prev = co; 1853 } 1854 1855 inline void 1856 corr_enter(struct corr * const co) 1857 { 1858 (void)co_enter(&(co->co), 0); 1859 } 1860 1861 inline void 1862 corr_yield(void) 1863 { 1864 struct corr * const curr = (typeof(curr))co_curr; 1865 if (curr && (curr->next != curr)) 1866 (void)co_switch_to(&(curr->next->co), 0); 1867 } 1868 1869 __attribute__((noreturn)) 1870 inline void 1871 corr_exit(void) 1872 { 1873 debug_assert(co_curr); 1874 #ifdef CO_STACK_CHECK 1875 const u64 stksz = co_curr->stksz; 1876 const u8 * const mem = ((u8 *)(co_curr)) - stksz; 1877 co_stack_check(mem, stksz); 1878 #endif // CO_STACK_CHECK 1879 1880 struct corr * const curr = (typeof(curr))co_curr; 1881 if (curr->next != curr) { // have more corr 1882 struct corr * const next = curr->next; 1883 struct corr * const prev = curr->prev; 1884 next->prev = prev; 1885 prev->next = next; 1886 curr->next = NULL; 1887 curr->prev = NULL; 1888 curr->co.host = NULL; // invalidate 1889 (void)co_switch_to(&(next->co), 0); 1890 } else { // the last corr 1891 co_exit0(); 1892 } 1893 debug_die(); 1894 } 1895 1896 inline void 1897 corr_destroy(struct corr * const co) 1898 { 1899 co_destroy(&(co->co)); 1900 } 1901 // }}} corr 1902 1903 // }}} co 1904 1905 // bits {{{ 1906 inline u32 1907 bits_reverse_u32(const u32 v) 1908 { 1909 const u32 v2 = __builtin_bswap32(v); 1910 const u32 v3 = ((v2 & 0xf0f0f0f0u) >> 4) | ((v2 & 0x0f0f0f0fu) << 4); 1911 const u32 v4 = ((v3 & 0xccccccccu) >> 2) | ((v3 & 0x33333333u) << 2); 1912 const u32 v5 = ((v4 & 0xaaaaaaaau) >> 1) | ((v4 & 0x55555555u) << 1); 1913 return v5; 1914 } 1915 1916 inline u64 1917 bits_reverse_u64(const u64 v) 1918 { 1919 const u64 v2 = __builtin_bswap64(v); 1920 const u64 v3 = ((v2 & 0xf0f0f0f0f0f0f0f0lu) >> 4) | ((v2 & 0x0f0f0f0f0f0f0f0flu) << 4); 1921 const u64 v4 = ((v3 & 0xcccccccccccccccclu) >> 2) | ((v3 & 0x3333333333333333lu) << 2); 1922 const u64 v5 = ((v4 & 0xaaaaaaaaaaaaaaaalu) >> 1) | ((v4 & 0x5555555555555555lu) << 1); 1923 return v5; 1924 } 1925 1926 inline u64 1927 bits_rotl_u64(const u64 v, const u8 n) 1928 { 1929 const u8 sh = n & 0x3f; 1930 return (v << sh) | (v >> (64 - sh)); 1931 } 1932 1933 inline u64 1934 bits_rotr_u64(const u64 v, const u8 n) 1935 { 1936 const u8 sh = n & 0x3f; 1937 return (v >> sh) | (v << (64 - sh)); 1938 } 1939 1940 inline u32 1941 bits_rotl_u32(const u32 v, const u8 n) 1942 { 1943 const u8 sh = n & 0x1f; 1944 return (v << sh) | (v >> (32 - sh)); 1945 } 1946 1947 inline u32 1948 bits_rotr_u32(const u32 v, const u8 n) 1949 { 1950 const u8 sh = n & 0x1f; 1951 return (v >> sh) | (v << (32 - sh)); 1952 } 1953 1954 inline u64 1955 bits_p2_up_u64(const u64 v) 1956 { 1957 // clz(0) is undefined 1958 return (v > 1) ? (1lu << (64 - __builtin_clzl(v - 1lu))) : v; 1959 } 1960 1961 inline u32 1962 bits_p2_up_u32(const u32 v) 1963 { 1964 // clz(0) is undefined 1965 return (v > 1) ? (1u << (32 - __builtin_clz(v - 1u))) : v; 1966 } 1967 1968 inline u64 1969 bits_p2_down_u64(const u64 v) 1970 { 1971 return v ? (1lu << (63 - __builtin_clzl(v))) : v; 1972 } 1973 1974 inline u32 1975 bits_p2_down_u32(const u32 v) 1976 { 1977 return v ? 
(1u << (31 - __builtin_clz(v))) : v; 1978 } 1979 1980 inline u64 1981 bits_round_up(const u64 v, const u8 power) 1982 { 1983 return (v + (1lu << power) - 1lu) >> power << power; 1984 } 1985 1986 inline u64 1987 bits_round_up_a(const u64 v, const u64 a) 1988 { 1989 return (v + a - 1) / a * a; 1990 } 1991 1992 inline u64 1993 bits_round_down(const u64 v, const u8 power) 1994 { 1995 return v >> power << power; 1996 } 1997 1998 inline u64 1999 bits_round_down_a(const u64 v, const u64 a) 2000 { 2001 return v / a * a; 2002 } 2003 // }}} bits 2004 2005 // vi128 {{{ 2006 #if defined(__GNUC__) && __GNUC__ >= 7 2007 #define FALLTHROUGH __attribute__ ((fallthrough)) 2008 #else 2009 #define FALLTHROUGH ((void)0) 2010 #endif /* __GNUC__ >= 7 */ 2011 2012 inline u32 2013 vi128_estimate_u32(const u32 v) 2014 { 2015 static const u8 t[] = {5,5,5,5, 2016 4,4,4,4,4,4,4, 3,3,3,3,3,3,3, 2017 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; 2018 return v ? t[__builtin_clz(v)] : 2; 2019 // 0 -> [0x80 0x00] the first byte is non-zero 2020 2021 // nz bit range -> enc length offset in t[] 2022 // 0 -> 2 special case 2023 // 1 to 7 -> 1 31 to 25 2024 // 8 to 14 -> 2 24 to 18 2025 // 15 to 21 -> 3 17 to 11 2026 // 22 to 28 -> 4 10 to 4 2027 // 29 to 32 -> 5 3 to 0 2028 } 2029 2030 u8 * 2031 vi128_encode_u32(u8 * dst, u32 v) 2032 { 2033 switch (vi128_estimate_u32(v)) { 2034 case 5: 2035 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2036 case 4: 2037 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2038 case 3: 2039 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2040 case 2: 2041 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2042 case 1: 2043 *(dst++) = (u8)v; 2044 break; 2045 default: 2046 debug_die(); 2047 break; 2048 } 2049 return dst; 2050 } 2051 2052 const u8 * 2053 vi128_decode_u32(const u8 * src, u32 * const out) 2054 { 2055 debug_assert(*src); 2056 u32 r = 0; 2057 for (u32 shift = 0; shift < 32; shift += 7) { 2058 const u8 byte = *(src++); 2059 r |= (((u32)(byte & 0x7f)) << shift); 2060 if ((byte & 0x80) == 0) { // No more bytes to consume 2061 *out = r; 2062 return src; 2063 } 2064 } 2065 *out = 0; 2066 return NULL; // invalid 2067 } 2068 2069 inline u32 2070 vi128_estimate_u64(const u64 v) 2071 { 2072 static const u8 t[] = {10, 2073 9,9,9,9,9,9,9, 8,8,8,8,8,8,8, 7,7,7,7,7,7,7, 2074 6,6,6,6,6,6,6, 5,5,5,5,5,5,5, 4,4,4,4,4,4,4, 2075 3,3,3,3,3,3,3, 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; 2076 return v ? 
t[__builtin_clzl(v)] : 2; 2077 } 2078 2079 // return ptr after the generated bytes 2080 u8 * 2081 vi128_encode_u64(u8 * dst, u64 v) 2082 { 2083 switch (vi128_estimate_u64(v)) { 2084 case 10: 2085 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2086 case 9: 2087 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2088 case 8: 2089 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2090 case 7: 2091 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2092 case 6: 2093 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2094 case 5: 2095 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2096 case 4: 2097 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2098 case 3: 2099 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2100 case 2: 2101 *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; 2102 case 1: 2103 *(dst++) = (u8)v; 2104 break; 2105 default: 2106 debug_die(); 2107 break; 2108 } 2109 return dst; 2110 } 2111 2112 // return ptr after the consumed bytes 2113 const u8 * 2114 vi128_decode_u64(const u8 * src, u64 * const out) 2115 { 2116 u64 r = 0; 2117 for (u32 shift = 0; shift < 64; shift += 7) { 2118 const u8 byte = *(src++); 2119 r |= (((u64)(byte & 0x7f)) << shift); 2120 if ((byte & 0x80) == 0) { // No more bytes to consume 2121 *out = r; 2122 return src; 2123 } 2124 } 2125 *out = 0; 2126 return NULL; // invalid 2127 } 2128 2129 #undef FALLTHROUGH 2130 // }}} vi128 2131 2132 // misc {{{ 2133 inline struct entry13 2134 entry13(const u16 e1, const u64 e3) 2135 { 2136 debug_assert((e3 >> 48) == 0); 2137 return (struct entry13){.v64 = (e3 << 16) | e1}; 2138 } 2139 2140 inline void 2141 entry13_update_e3(struct entry13 * const e, const u64 e3) 2142 { 2143 debug_assert((e3 >> 48) == 0); 2144 *e = entry13(e->e1, e3); 2145 } 2146 2147 inline void * 2148 u64_to_ptr(const u64 v) 2149 { 2150 return (void *)v; 2151 } 2152 2153 inline u64 2154 ptr_to_u64(const void * const ptr) 2155 { 2156 return (u64)ptr; 2157 } 2158 2159 // portable malloc_usable_size 2160 inline size_t 2161 m_usable_size(void * const ptr) 2162 { 2163 #if defined(__linux__) || defined(__FreeBSD__) 2164 const size_t sz = malloc_usable_size(ptr); 2165 #elif defined(__APPLE__) && defined(__MACH__) 2166 const size_t sz = malloc_size(ptr); 2167 #endif // OS 2168 2169 #ifndef HEAPCHECKING 2170 // valgrind and asan may return unaligned usable size 2171 debug_assert((sz & 0x7lu) == 0); 2172 #endif // HEAPCHECKING 2173 2174 return sz; 2175 } 2176 2177 inline size_t 2178 fdsize(const int fd) 2179 { 2180 struct stat st; 2181 st.st_size = 0; 2182 if (fstat(fd, &st) != 0) 2183 return 0; 2184 2185 if (S_ISBLK(st.st_mode)) { 2186 #if defined(__linux__) 2187 ioctl(fd, BLKGETSIZE64, &st.st_size); 2188 #elif defined(__APPLE__) && defined(__MACH__) 2189 u64 blksz = 0; 2190 u64 nblks = 0; 2191 ioctl(fd, DKIOCGETBLOCKSIZE, &blksz); 2192 ioctl(fd, DKIOCGETBLOCKCOUNT, &nblks); 2193 st.st_size = (ssize_t)(blksz * nblks); 2194 #elif defined(__FreeBSD__) 2195 ioctl(fd, DIOCGMEDIASIZE, &st.st_size); 2196 #endif // OS 2197 } 2198 2199 return (size_t)st.st_size; 2200 } 2201 2202 u32 2203 memlcp(const u8 * const p1, const u8 * const p2, const u32 max) 2204 { 2205 const u32 max64 = max & (~7u); 2206 u32 clen = 0; 2207 while (clen < max64) { 2208 const u64 v1 = *(const u64 *)(p1+clen); 2209 const u64 v2 = *(const u64 *)(p2+clen); 2210 const u64 x = v1 ^ v2; 2211 if (x) 2212 return clen + (u32)(__builtin_ctzl(x) >> 3); 2213 2214 clen += sizeof(u64); 2215 } 2216 2217 if ((clen + sizeof(u32)) <= max) { 2218 const u32 v1 = *(const u32 *)(p1+clen); 2219 const u32 v2 = *(const u32 
*)(p2+clen); 2220 const u32 x = v1 ^ v2; 2221 if (x) 2222 return clen + (u32)(__builtin_ctz(x) >> 3); 2223 2224 clen += sizeof(u32); 2225 } 2226 2227 while ((clen < max) && (p1[clen] == p2[clen])) 2228 clen++; 2229 return clen; 2230 } 2231 2232 static double logger_t0 = 0.0; 2233 2234 __attribute__((constructor)) 2235 static void 2236 logger_init(void) 2237 { 2238 logger_t0 = time_sec(); 2239 } 2240 2241 __attribute__ ((format (printf, 2, 3))) 2242 void 2243 logger_printf(const int fd, const char * const fmt, ...) 2244 { 2245 char buf[4096]; 2246 va_list ap; 2247 va_start(ap, fmt); 2248 vsnprintf(buf, sizeof(buf), fmt, ap); 2249 va_end(ap); 2250 dprintf(fd, "%010.3lf %08x %s", time_diff_sec(logger_t0), crc32c_u64(0x12345678, (u64)pthread_self()), buf); 2251 } 2252 // }}} misc 2253 2254 // astk {{{ 2255 // atomic stack 2256 struct acell { struct acell * next; }; 2257 2258 // extract ptr from m value 2259 static inline struct acell * 2260 astk_ptr(const u64 m) 2261 { 2262 return (struct acell *)(m >> 16); 2263 } 2264 2265 // calculate the new magic 2266 static inline u64 2267 astk_m1(const u64 m0, struct acell * const ptr) 2268 { 2269 return ((m0 + 1) & 0xfffflu) | (((u64)ptr) << 16); 2270 } 2271 2272 // calculate the new magic 2273 static inline u64 2274 astk_m1_unsafe(struct acell * const ptr) 2275 { 2276 return ((u64)ptr) << 16; 2277 } 2278 2279 static bool 2280 astk_try_push(au64 * const pmagic, struct acell * const first, struct acell * const last) 2281 { 2282 u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2283 last->next = astk_ptr(m0); 2284 const u64 m1 = astk_m1(m0, first); 2285 return atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_RELEASE, MO_RELAXED); 2286 } 2287 2288 static void 2289 astk_push_safe(au64 * const pmagic, struct acell * const first, struct acell * const last) 2290 { 2291 while (!astk_try_push(pmagic, first, last)); 2292 } 2293 2294 static void 2295 astk_push_unsafe(au64 * const pmagic, struct acell * const first, 2296 struct acell * const last) 2297 { 2298 const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2299 last->next = astk_ptr(m0); 2300 const u64 m1 = astk_m1_unsafe(first); 2301 atomic_store_explicit(pmagic, m1, MO_RELAXED); 2302 } 2303 2304 //// can fail for two reasons: (1) NULL: no available object; (2) ~0lu: contention 2305 // static void * 2306 //astk_try_pop(au64 * const pmagic) 2307 //{ 2308 // u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2309 // struct acell * const ret = astk_ptr(m0); 2310 // if (ret == NULL) 2311 // return NULL; 2312 // 2313 // const u64 m1 = astk_m1(m0, ret->next); 2314 // if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) 2315 // return ret; 2316 // else 2317 // return (void *)(~0lu); 2318 //} 2319 2320 static void * 2321 astk_pop_safe(au64 * const pmagic) 2322 { 2323 do { 2324 u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2325 struct acell * const ret = astk_ptr(m0); 2326 if (ret == NULL) 2327 return NULL; 2328 2329 const u64 m1 = astk_m1(m0, ret->next); 2330 if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) 2331 return ret; 2332 } while (true); 2333 } 2334 2335 static void * 2336 astk_pop_unsafe(au64 * const pmagic) 2337 { 2338 const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2339 struct acell * const ret = astk_ptr(m0); 2340 if (ret == NULL) 2341 return NULL; 2342 2343 const u64 m1 = astk_m1_unsafe(ret->next); 2344 atomic_store_explicit(pmagic, m1, MO_RELAXED); 2345 return (void *)ret; 2346 } 2347 2348 static void * 2349 
astk_peek_unsafe(au64 * const pmagic) 2350 { 2351 const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); 2352 return astk_ptr(m0); 2353 } 2354 // }}} astk 2355 2356 // slab {{{ 2357 #define SLAB_OBJ0_OFFSET ((64)) 2358 struct slab { 2359 au64 magic; // hi 48: ptr, lo 16: seq 2360 u64 padding1[7]; 2361 2362 // 2nd line 2363 struct acell * head_active; // list of blocks in use or in magic 2364 struct acell * head_backup; // list of unused full blocks 2365 u64 nr_ready; // UNSAFE only! number of objects under magic 2366 u64 padding2[5]; 2367 2368 // 3rd line const 2369 u64 obj_size; // const: aligned size of each object 2370 u64 blk_size; // const: size of each memory block 2371 u64 objs_per_slab; // const: number of objects in a slab 2372 u64 obj0_offset; // const: offset of the first object in a block 2373 u64 padding3[4]; 2374 2375 // 4th line 2376 union { 2377 mutex lock; 2378 u64 padding4[8]; 2379 }; 2380 }; 2381 static_assert(sizeof(struct slab) == 256, "sizeof(struct slab) != 256"); 2382 2383 static void 2384 slab_add(struct slab * const slab, struct acell * const blk, const bool is_safe) 2385 { 2386 // insert into head_active 2387 blk->next = slab->head_active; 2388 slab->head_active = blk; 2389 2390 u8 * const base = ((u8 *)blk) + slab->obj0_offset; 2391 struct acell * iter = (typeof(iter))base; // [0] 2392 for (u64 i = 1; i < slab->objs_per_slab; i++) { 2393 struct acell * const next = (typeof(next))(base + (i * slab->obj_size)); 2394 iter->next = next; 2395 iter = next; 2396 } 2397 2398 // base points to the first block; iter points to the last block 2399 if (is_safe) { // other threads can poll magic 2400 astk_push_safe(&slab->magic, (struct acell *)base, iter); 2401 } else { // unsafe 2402 astk_push_unsafe(&slab->magic, (struct acell *)base, iter); 2403 slab->nr_ready += slab->objs_per_slab; 2404 } 2405 } 2406 2407 // critical section; call with lock 2408 static bool 2409 slab_expand(struct slab * const slab, const bool is_safe) 2410 { 2411 struct acell * const old = slab->head_backup; 2412 if (old) { // pop old from backup and add 2413 slab->head_backup = old->next; 2414 slab_add(slab, old, is_safe); 2415 } else { // more core 2416 size_t blk_size; 2417 struct acell * const new = pages_alloc_best(slab->blk_size, true, &blk_size); 2418 (void)blk_size; 2419 if (new == NULL) 2420 return false; 2421 2422 slab_add(slab, new, is_safe); 2423 } 2424 return true; 2425 } 2426 2427 // return 0 on failure; otherwise, obj0_offset 2428 static u64 2429 slab_check_sizes(const u64 obj_size, const u64 blk_size) 2430 { 2431 // obj must be non-zero and 8-byte aligned 2432 // blk must be at least of page size and power of 2 2433 if ((!obj_size) || (obj_size % 8lu) || (blk_size < 4096lu) || (blk_size & (blk_size - 1))) 2434 return 0; 2435 2436 // each slab should have at least one object 2437 const u64 obj0_offset = (obj_size & (obj_size - 1)) ? 
SLAB_OBJ0_OFFSET : obj_size; 2438 if (obj0_offset >= blk_size || (blk_size - obj0_offset) < obj_size) 2439 return 0; 2440 2441 return obj0_offset; 2442 } 2443 2444 static void 2445 slab_init_internal(struct slab * const slab, const u64 obj_size, const u64 blk_size, const u64 obj0_offset) 2446 { 2447 memset(slab, 0, sizeof(*slab)); 2448 slab->obj_size = obj_size; 2449 slab->blk_size = blk_size; 2450 slab->objs_per_slab = (blk_size - obj0_offset) / obj_size; 2451 debug_assert(slab->objs_per_slab); // >= 1 2452 slab->obj0_offset = obj0_offset; 2453 mutex_init(&(slab->lock)); 2454 } 2455 2456 struct slab * 2457 slab_create(const u64 obj_size, const u64 blk_size) 2458 { 2459 const u64 obj0_offset = slab_check_sizes(obj_size, blk_size); 2460 if (!obj0_offset) 2461 return NULL; 2462 2463 struct slab * const slab = yalloc(sizeof(*slab)); 2464 if (slab == NULL) 2465 return NULL; 2466 2467 slab_init_internal(slab, obj_size, blk_size, obj0_offset); 2468 return slab; 2469 } 2470 2471 // unsafe 2472 bool 2473 slab_reserve_unsafe(struct slab * const slab, const u64 nr) 2474 { 2475 while (slab->nr_ready < nr) 2476 if (!slab_expand(slab, false)) 2477 return false; 2478 return true; 2479 } 2480 2481 void * 2482 slab_alloc_unsafe(struct slab * const slab) 2483 { 2484 void * ret = astk_pop_unsafe(&slab->magic); 2485 if (ret == NULL) { 2486 if (!slab_expand(slab, false)) 2487 return NULL; 2488 ret = astk_pop_unsafe(&slab->magic); 2489 } 2490 debug_assert(ret); 2491 slab->nr_ready--; 2492 return ret; 2493 } 2494 2495 void * 2496 slab_alloc_safe(struct slab * const slab) 2497 { 2498 void * ret = astk_pop_safe(&slab->magic); 2499 if (ret) 2500 return ret; 2501 2502 mutex_lock(&slab->lock); 2503 do { 2504 ret = astk_pop_safe(&slab->magic); // may already have new objs 2505 if (ret) 2506 break; 2507 if (!slab_expand(slab, true)) 2508 break; 2509 } while (true); 2510 mutex_unlock(&slab->lock); 2511 return ret; 2512 } 2513 2514 void 2515 slab_free_unsafe(struct slab * const slab, void * const ptr) 2516 { 2517 debug_assert(ptr); 2518 astk_push_unsafe(&slab->magic, ptr, ptr); 2519 slab->nr_ready++; 2520 } 2521 2522 void 2523 slab_free_safe(struct slab * const slab, void * const ptr) 2524 { 2525 astk_push_safe(&slab->magic, ptr, ptr); 2526 } 2527 2528 // UNSAFE 2529 void 2530 slab_free_all(struct slab * const slab) 2531 { 2532 slab->magic = 0; 2533 slab->nr_ready = 0; // backup does not count 2534 2535 if (slab->head_active) { 2536 struct acell * iter = slab->head_active; 2537 while (iter->next) 2538 iter = iter->next; 2539 // now iter points to the last blk 2540 iter->next = slab->head_backup; // active..backup 2541 slab->head_backup = slab->head_active; // backup gets all 2542 slab->head_active = NULL; // empty active 2543 } 2544 } 2545 2546 // unsafe 2547 u64 2548 slab_get_nalloc(struct slab * const slab) 2549 { 2550 struct acell * iter = slab->head_active; 2551 u64 n = 0; 2552 while (iter) { 2553 n++; 2554 iter = iter->next; 2555 } 2556 n *= slab->objs_per_slab; 2557 2558 iter = astk_peek_unsafe(&slab->magic); 2559 while (iter) { 2560 n--; 2561 iter = iter->next; 2562 } 2563 return n; 2564 } 2565 2566 static void 2567 slab_deinit(struct slab * const slab) 2568 { 2569 debug_assert(slab); 2570 struct acell * iter = slab->head_active; 2571 while (iter) { 2572 struct acell * const next = iter->next; 2573 pages_unmap(iter, slab->blk_size); 2574 iter = next; 2575 } 2576 iter = slab->head_backup; 2577 while (iter) { 2578 struct acell * const next = iter->next; 2579 pages_unmap(iter, slab->blk_size); 2580 iter = next; 2581 
} 2582 } 2583 2584 void 2585 slab_destroy(struct slab * const slab) 2586 { 2587 slab_deinit(slab); 2588 free(slab); 2589 } 2590 // }}} slab 2591 2592 // string {{{ 2593 static union { u16 v16; u8 v8[2]; } strdec_table[100]; 2594 2595 __attribute__((constructor)) 2596 static void 2597 strdec_init(void) 2598 { 2599 for (u8 i = 0; i < 100; i++) { 2600 const u8 hi = (typeof(hi))('0' + (i / 10)); 2601 const u8 lo = (typeof(lo))('0' + (i % 10)); 2602 strdec_table[i].v8[0] = hi; 2603 strdec_table[i].v8[1] = lo; 2604 } 2605 } 2606 2607 // output 10 bytes 2608 void 2609 strdec_32(void * const out, const u32 v) 2610 { 2611 u32 vv = v; 2612 u16 * const ptr = (typeof(ptr))out; 2613 for (u64 i = 4; i <= 4; i--) { // x5 2614 ptr[i] = strdec_table[vv % 100].v16; 2615 vv /= 100u; 2616 } 2617 } 2618 2619 // output 20 bytes 2620 void 2621 strdec_64(void * const out, const u64 v) 2622 { 2623 u64 vv = v; 2624 u16 * const ptr = (typeof(ptr))out; 2625 for (u64 i = 9; i <= 9; i--) { // x10 2626 ptr[i] = strdec_table[vv % 100].v16; 2627 vv /= 100; 2628 } 2629 } 2630 2631 static const u8 strhex_table_16[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; 2632 2633 #if defined(__x86_64__) 2634 static inline m128 2635 strhex_helper(const u64 v) 2636 { 2637 static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; 2638 2639 const m128 tmp = _mm_set_epi64x((s64)(v>>4), (s64)v); // mm want s64 2640 const m128 hilo = _mm_and_si128(tmp, _mm_set1_epi8(0xf)); 2641 const m128 bin = _mm_shuffle_epi8(hilo, _mm_load_si128((void *)mask1)); 2642 const m128 str = _mm_shuffle_epi8(_mm_load_si128((const void *)strhex_table_16), bin); 2643 return str; 2644 } 2645 #elif defined(__aarch64__) 2646 static inline m128 2647 strhex_helper(const u64 v) 2648 { 2649 static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; 2650 u64 v2[2] = {v, v>>4}; 2651 const m128 tmp = vld1q_u8((u8 *)v2); 2652 const m128 hilo = vandq_u8(tmp, vdupq_n_u8(0xf)); 2653 const m128 bin = vqtbl1q_u8(hilo, vld1q_u8(mask1)); 2654 const m128 str = vqtbl1q_u8(vld1q_u8(strhex_table_16), bin); 2655 return str; 2656 } 2657 #else 2658 static u16 strhex_table_256[256]; 2659 2660 __attribute__((constructor)) 2661 static void 2662 strhex_init(void) 2663 { 2664 for (u64 i = 0; i < 256; i++) 2665 strhex_table_256[i] = (((u16)strhex_table_16[i & 0xf]) << 8) | (strhex_table_16[i>>4]); 2666 } 2667 #endif // __x86_64__ 2668 2669 // output 8 bytes 2670 void 2671 strhex_32(void * const out, u32 v) 2672 { 2673 #if defined(__x86_64__) 2674 const m128 str = strhex_helper((u64)v); 2675 _mm_storel_epi64(out, _mm_srli_si128(str, 8)); 2676 #elif defined(__aarch64__) 2677 const m128 str = strhex_helper((u64)v); 2678 vst1q_lane_u64(out, vreinterpretq_u64_u8(str), 1); 2679 #else 2680 u16 * const ptr = (typeof(ptr))out; 2681 for (u64 i = 0; i < 4; i++) { 2682 ptr[3-i] = strhex_table_256[v & 0xff]; 2683 v >>= 8; 2684 } 2685 #endif 2686 } 2687 2688 // output 16 bytes // buffer must be aligned by 16B 2689 void 2690 strhex_64(void * const out, u64 v) 2691 { 2692 #if defined(__x86_64__) 2693 const m128 str = strhex_helper(v); 2694 _mm_storeu_si128(out, str); 2695 #elif defined(__aarch64__) 2696 const m128 str = strhex_helper(v); 2697 vst1q_u8(out, str); 2698 #else 2699 u16 * const ptr = (typeof(ptr))out; 2700 for (u64 i = 0; i < 8; i++) { 2701 ptr[7-i] = strhex_table_256[v & 0xff]; 2702 v >>= 8; 2703 } 2704 #endif 2705 } 2706 2707 // string to u64 2708 inline u64 2709 a2u64(const void * const str) 2710 { 2711 return strtoull(str, NULL, 10); 2712 } 2713 
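
// Illustrative usage sketch, not part of the original library: sizing buffers
// for the strdec_64/strhex_64 helpers above. Both write fixed-width output
// with no NUL terminator, so the caller terminates the buffer itself. The
// function name and the example values below are assumptions for this sketch.
__attribute__((unused))
static void
strdec_strhex_example(void)
{
  char dec[21]; // strdec_64 writes exactly 20 decimal digits
  strdec_64(dec, 12345lu); // "00000000000000012345"
  dec[20] = '\0';

  char hex[17] __attribute__((aligned(16))); // strhex_64 writes exactly 16 hex digits
  strhex_64(hex, 0xdeadbeeflu); // "00000000deadbeef"
  hex[16] = '\0';
}
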
2714 inline u32 2715 a2u32(const void * const str) 2716 { 2717 return (u32)strtoull(str, NULL, 10); 2718 } 2719 2720 inline s64 2721 a2s64(const void * const str) 2722 { 2723 return strtoll(str, NULL, 10); 2724 } 2725 2726 inline s32 2727 a2s32(const void * const str) 2728 { 2729 return (s32)strtoll(str, NULL, 10); 2730 } 2731 2732 void 2733 str_print_hex(FILE * const out, const void * const data, const u32 len) 2734 { 2735 const u8 * const ptr = data; 2736 const u32 strsz = len * 3; 2737 u8 * const buf = malloc(strsz); 2738 for (u32 i = 0; i < len; i++) { 2739 buf[i*3] = ' '; 2740 buf[i*3+1] = strhex_table_16[ptr[i]>>4]; 2741 buf[i*3+2] = strhex_table_16[ptr[i] & 0xf]; 2742 } 2743 fwrite(buf, strsz, 1, out); 2744 free(buf); 2745 } 2746 2747 void 2748 str_print_dec(FILE * const out, const void * const data, const u32 len) 2749 { 2750 const u8 * const ptr = data; 2751 const u32 strsz = len * 4; 2752 u8 * const buf = malloc(strsz); 2753 for (u32 i = 0; i < len; i++) { 2754 const u8 v = ptr[i]; 2755 buf[i*4] = ' '; 2756 const u8 v1 = v / 100u; 2757 const u8 v23 = v % 100u; 2758 buf[i*4+1] = (u8)'0' + v1; 2759 buf[i*4+2] = (u8)'0' + (v23 / 10u); 2760 buf[i*4+3] = (u8)'0' + (v23 % 10u); 2761 } 2762 fwrite(buf, strsz, 1, out); 2763 free(buf); 2764 } 2765 2766 // returns a NULL-terminated list of string tokens. 2767 // After use you only need to free the returned pointer (char **). 2768 char ** 2769 strtoks(const char * const str, const char * const delim) 2770 { 2771 if (str == NULL) 2772 return NULL; 2773 size_t nptr_alloc = 32; 2774 char ** tokens = malloc(sizeof(tokens[0]) * nptr_alloc); 2775 if (tokens == NULL) 2776 return NULL; 2777 const size_t bufsize = strlen(str) + 1; 2778 char * const buf = malloc(bufsize); 2779 if (buf == NULL) 2780 goto fail_buf; 2781 2782 memcpy(buf, str, bufsize); 2783 char * saveptr = NULL; 2784 char * tok = strtok_r(buf, delim, &saveptr); 2785 size_t ntoks = 0; 2786 while (tok) { 2787 if (ntoks >= nptr_alloc) { 2788 nptr_alloc += 32; 2789 char ** const r = realloc(tokens, sizeof(tokens[0]) * nptr_alloc); 2790 if (r == NULL) 2791 goto fail_realloc; 2792 2793 tokens = r; 2794 } 2795 tokens[ntoks] = tok; 2796 ntoks++; 2797 tok = strtok_r(NULL, delim, &saveptr); 2798 } 2799 tokens[ntoks] = NULL; 2800 const size_t nptr = ntoks + 1; // append a NULL 2801 const size_t rsize = (sizeof(tokens[0]) * nptr) + bufsize; 2802 char ** const r = realloc(tokens, rsize); 2803 if (r == NULL) 2804 goto fail_realloc; 2805 2806 tokens = r; 2807 char * const dest = (char *)(&(tokens[nptr])); 2808 memcpy(dest, buf, bufsize); 2809 for (u64 i = 0; i < ntoks; i++) 2810 tokens[i] += (dest - buf); 2811 2812 free(buf); 2813 return tokens; 2814 2815 fail_realloc: 2816 free(buf); 2817 fail_buf: 2818 free(tokens); 2819 return NULL; 2820 } 2821 2822 u32 2823 strtoks_count(const char * const * const toks) 2824 { 2825 if (!toks) 2826 return 0; 2827 u32 n = 0; 2828 while (toks[n++]); 2829 return n; 2830 } 2831 // }}} string 2832 2833 // qsbr {{{ 2834 #define QSBR_STATES_NR ((23)) // shard capacity; valid values are 3*8-1 == 23; 5*8-1 == 39; 7*8-1 == 55 2835 #define QSBR_SHARD_BITS ((5)) // 2^n shards 2836 #define QSBR_SHARD_NR (((1u) << QSBR_SHARD_BITS)) 2837 #define QSBR_SHARD_MASK ((QSBR_SHARD_NR - 1)) 2838 2839 struct qsbr_ref_real { 2840 #ifdef QSBR_DEBUG 2841 pthread_t ptid; // 8 2842 u32 status; // 4 2843 u32 nbt; // 4 (number of backtrace frames) 2844 #define QSBR_DEBUG_BTNR ((14)) 2845 void * backtrace[QSBR_DEBUG_BTNR]; 2846 #endif 2847 volatile au64 qstate; // user updates it 2848 struct 
qsbr_ref_real * volatile * pptr; // internal only 2849 struct qsbr_ref_real * park; 2850 }; 2851 2852 static_assert(sizeof(struct qsbr_ref) == sizeof(struct qsbr_ref_real), "sizeof qsbr_ref"); 2853 2854 // Quiescent-State-Based Reclamation RCU 2855 struct qsbr { 2856 struct qsbr_ref_real target; 2857 u64 padding0[5]; 2858 struct qshard { 2859 au64 bitmap; 2860 struct qsbr_ref_real * volatile ptrs[QSBR_STATES_NR]; 2861 } shards[QSBR_SHARD_NR]; 2862 }; 2863 2864 struct qsbr * 2865 qsbr_create(void) 2866 { 2867 struct qsbr * const q = yalloc(sizeof(*q)); 2868 memset(q, 0, sizeof(*q)); 2869 return q; 2870 } 2871 2872 static inline struct qshard * 2873 qsbr_shard(struct qsbr * const q, void * const ptr) 2874 { 2875 const u32 sid = crc32c_u64(0, (u64)ptr) & QSBR_SHARD_MASK; 2876 debug_assert(sid < QSBR_SHARD_NR); 2877 return &(q->shards[sid]); 2878 } 2879 2880 static inline void 2881 qsbr_write_qstate(struct qsbr_ref_real * const ref, const u64 v) 2882 { 2883 atomic_store_explicit(&ref->qstate, v, MO_RELAXED); 2884 } 2885 2886 bool 2887 qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref) 2888 { 2889 struct qsbr_ref_real * const ref = (typeof(ref))qref; 2890 struct qshard * const shard = qsbr_shard(q, ref); 2891 qsbr_write_qstate(ref, 0); 2892 2893 do { 2894 u64 bits = atomic_load_explicit(&shard->bitmap, MO_CONSUME); 2895 const u32 pos = (u32)__builtin_ctzl(~bits); 2896 if (unlikely(pos >= QSBR_STATES_NR)) 2897 return false; 2898 2899 const u64 bits1 = bits | (1lu << pos); 2900 if (atomic_compare_exchange_weak_explicit(&shard->bitmap, &bits, bits1, MO_ACQUIRE, MO_RELAXED)) { 2901 shard->ptrs[pos] = ref; 2902 2903 ref->pptr = &(shard->ptrs[pos]); 2904 ref->park = &q->target; 2905 #ifdef QSBR_DEBUG 2906 ref->ptid = (u64)pthread_self(); 2907 ref->tid = 0; 2908 ref->status = 1; 2909 ref->nbt = backtrace(ref->backtrace, QSBR_DEBUG_BTNR); 2910 #endif 2911 return true; 2912 } 2913 } while (true); 2914 } 2915 2916 void 2917 qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref) 2918 { 2919 struct qsbr_ref_real * const ref = (typeof(ref))qref; 2920 struct qshard * const shard = qsbr_shard(q, ref); 2921 const u32 pos = (u32)(ref->pptr - shard->ptrs); 2922 debug_assert(pos < QSBR_STATES_NR); 2923 debug_assert(shard->bitmap & (1lu << pos)); 2924 2925 shard->ptrs[pos] = &q->target; 2926 (void)atomic_fetch_and_explicit(&shard->bitmap, ~(1lu << pos), MO_RELEASE); 2927 #ifdef QSBR_DEBUG 2928 ref->tid = 0; 2929 ref->ptid = 0; 2930 ref->status = 0xffff; // unregistered 2931 ref->nbt = 0; 2932 #endif 2933 ref->pptr = NULL; 2934 // wait for qsbr_wait to leave if it's working on the shard 2935 while (atomic_load_explicit(&shard->bitmap, MO_CONSUME) >> 63) 2936 cpu_pause(); 2937 } 2938 2939 inline void 2940 qsbr_update(struct qsbr_ref * const qref, const u64 v) 2941 { 2942 struct qsbr_ref_real * const ref = (typeof(ref))qref; 2943 debug_assert((*ref->pptr) == ref); // must be unparked 2944 // rcu update does not require release or acquire order 2945 qsbr_write_qstate(ref, v); 2946 } 2947 2948 inline void 2949 qsbr_park(struct qsbr_ref * const qref) 2950 { 2951 cpu_cfence(); 2952 struct qsbr_ref_real * const ref = (typeof(ref))qref; 2953 *ref->pptr = ref->park; 2954 #ifdef QSBR_DEBUG 2955 ref->status = 0xfff; // parked 2956 #endif 2957 } 2958 2959 inline void 2960 qsbr_resume(struct qsbr_ref * const qref) 2961 { 2962 struct qsbr_ref_real * const ref = (typeof(ref))qref; 2963 *ref->pptr = ref; 2964 #ifdef QSBR_DEBUG 2965 ref->status = 0xf; // resumed 2966 #endif 2967 cpu_cfence(); 2968 } 
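
// Illustrative usage sketch, not part of the original library: the intended
// register/update/park/resume and wait protocol, pieced together from the
// qsbr functions in this section. The function names qsbr_example_reader and
// qsbr_example_reclaimer and the version parameters are hypothetical.
__attribute__((unused))
static void
qsbr_example_reader(struct qsbr * const q, const u64 version)
{
  struct qsbr_ref ref;
  if (!qsbr_register(q, &ref)) // can fail when the hash shard is full
    return;

  // ... read-side work that may reference shared objects ...
  qsbr_update(&ref, version); // this thread no longer uses anything older than `version`

  qsbr_park(&ref); // about to block; parked readers never delay waiters
  // ... sleep, I/O, or other long blocking work ...
  qsbr_resume(&ref);

  qsbr_unregister(q, &ref);
}

__attribute__((unused))
static void
qsbr_example_reclaimer(struct qsbr * const q, const u64 new_version)
{
  // after unlinking objects that readers at older versions may still see,
  // wait until every registered reader has reported new_version (or is parked);
  // only one thread may wait at a time (see the comment on qsbr_wait below)
  qsbr_wait(q, new_version);
  // now the unlinked objects can be freed safely
}
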
2969 2970 // waiters needs external synchronization 2971 void 2972 qsbr_wait(struct qsbr * const q, const u64 target) 2973 { 2974 cpu_cfence(); 2975 qsbr_write_qstate(&q->target, target); 2976 u64 cbits = 0; // check-bits; each bit corresponds to a shard 2977 u64 bms[QSBR_SHARD_NR]; // copy of all bitmap 2978 // take an unsafe snapshot of active users 2979 for (u32 i = 0; i < QSBR_SHARD_NR; i++) { 2980 bms[i] = atomic_load_explicit(&q->shards[i].bitmap, MO_CONSUME); 2981 if (bms[i]) 2982 cbits |= (1lu << i); // set to 1 if [i] has ptrs 2983 } 2984 2985 while (cbits) { 2986 for (u64 ctmp = cbits; ctmp; ctmp &= (ctmp - 1)) { 2987 // shard id 2988 const u32 i = (u32)__builtin_ctzl(ctmp); 2989 struct qshard * const shard = &(q->shards[i]); 2990 const u64 bits1 = atomic_fetch_or_explicit(&(shard->bitmap), 1lu << 63, MO_ACQUIRE); 2991 for (u64 bits = bms[i]; bits; bits &= (bits - 1)) { 2992 const u64 bit = bits & -bits; // extract lowest bit 2993 if (((bits1 & bit) == 0) || 2994 (atomic_load_explicit(&(shard->ptrs[__builtin_ctzl(bit)]->qstate), MO_CONSUME) == target)) 2995 bms[i] &= ~bit; 2996 } 2997 (void)atomic_fetch_and_explicit(&(shard->bitmap), ~(1lu << 63), MO_RELEASE); 2998 if (bms[i] == 0) 2999 cbits &= ~(1lu << i); 3000 } 3001 #if defined(CORR) 3002 corr_yield(); 3003 #endif 3004 } 3005 debug_assert(cbits == 0); 3006 cpu_cfence(); 3007 } 3008 3009 void 3010 qsbr_destroy(struct qsbr * const q) 3011 { 3012 if (q) 3013 free(q); 3014 } 3015 #undef QSBR_STATES_NR 3016 #undef QSBR_BITMAP_NR 3017 // }}} qsbr 3018 3019 // vim:fdm=marker
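
// Illustrative usage sketch, not part of the original library: minimal use of
// the slab allocator defined in this file. slab_create() requires an
// 8-byte-aligned object size and a power-of-two block size of at least 4096;
// the 64-byte object size, 2MB block size, and the function name below are
// assumptions for this sketch only.
__attribute__((unused))
static bool
slab_usage_example(void)
{
  struct slab * const s = slab_create(64, 1lu << 21); // 64B objects, 2MB blocks
  if (s == NULL)
    return false;

  void * const obj = slab_alloc_safe(s); // thread-safe allocation
  if (obj)
    slab_free_safe(s, obj); // return the object to the slab

  slab_destroy(s); // unmaps all blocks and frees the slab struct
  return true;
}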