github.com/moontrade/nogc@v0.1.7/collections/wormhole/lib.c

     1  /*
     2   * Copyright (c) 2016--2021  Wu, Xingbo <wuxb45@gmail.com>
     3   *
     4   * All rights reserved. No warranty, explicit or implicit, provided.
     5   */
     6  #define _GNU_SOURCE
     7  
     8  // headers {{{
     9  #include "lib.h"
    10  #include "ctypes.h"
    11  #include <assert.h>
    12  #include <execinfo.h>
    13  #include <math.h>
    14  #include <netdb.h>
    15  #include <sched.h>
    16  #include <signal.h>
    17  #include <sys/socket.h>
    18  #include <poll.h>
    19  #include <sys/ioctl.h>
    20  #include <time.h>
    21  #include <stdarg.h> // va_start
    22  
    23  #if defined(__linux__)
    24  #include <linux/fs.h>
    25  #include <malloc.h>  // malloc_usable_size
    26  #elif defined(__APPLE__) && defined(__MACH__)
    27  #include <sys/disk.h>
    28  #include <malloc/malloc.h>
    29  #elif defined(__FreeBSD__)
    30  #include <sys/disk.h>
    31  #include <malloc_np.h>
    32  #endif // OS
    33  
    34  #if defined(__FreeBSD__)
    35  #include <pthread_np.h>
    36  #endif
    37  // }}} headers
    38  
    39  // math {{{
    40    inline u64
    41  mhash64(const u64 v)
    42  {
    43    return v * 11400714819323198485lu;
    44  }
    45  
    46    inline u32
    47  mhash32(const u32 v)
    48  {
    49    return v * 2654435761u;
    50  }
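// The multipliers above are the golden-ratio (Fibonacci hashing) constants:
// 11400714819323198485 is 0x9E3779B97F4A7C15 (about 2^64/phi), and 2654435761
// is 0x9E3779B1, a prime near 2^32/phi, as in Knuth's multiplicative hashing.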
    51  
    52  // From Daniel Lemire's blog (2013, lemire.me)
    53    u64
    54  gcd64(u64 a, u64 b)
    55  {
    56    if (a == 0)
    57      return b;
    58    if (b == 0)
    59      return a;
    60  
    61    const u32 shift = (u32)__builtin_ctzl(a | b);
    62    a >>= __builtin_ctzl(a);
    63    do {
    64      b >>= __builtin_ctzl(b);
    65      if (a > b) {
    66        const u64 t = b;
    67        b = a;
    68        a = t;
    69      }
    70      b = b - a;
    71    } while (b);
    72    return a << shift;
    73  }
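// Worked example of the binary (Stein) GCD above: gcd64(24, 36) computes
// shift = ctz(24|36) = 2, strips the factors of two, reduces the pair
// (3,9) -> (3,6) -> (3,3) -> (3,0), and returns 3 << 2 == 12.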
    74  // }}} math
    75  
    76  // random {{{
    77  // Lehmer's generator is 2x faster than xorshift
    78  /**
    79   * D. H. Lehmer, Mathematical methods in large-scale computing units.
    80   * Proceedings of a Second Symposium on Large Scale Digital Calculating
    81   * Machinery;
    82   * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146.
    83   *
    84   * P L'Ecuyer,  Tables of linear congruential generators of different sizes and
    85   * good lattice structure. Mathematics of Computation of the American
    86   * Mathematical
    87   * Society 68.225 (1999): 249-260.
    88   */
    89  struct lehmer_u64 {
    90    union {
    91      u128 v128;
    92      u64 v64[2];
    93    };
    94  };
    95  
    96  static __thread struct lehmer_u64 rseed_u128 = {.v64 = {4294967291, 1549556881}};
    97  
    98    static inline u64
    99  lehmer_u64_next(struct lehmer_u64 * const s)
   100  {
   101    const u64 r = s->v64[1];
   102    s->v128 *= 0xda942042e4dd58b5lu;
   103    return r;
   104  }
   105  
   106    static inline void
   107  lehmer_u64_seed(struct lehmer_u64 * const s, const u64 seed)
   108  {
   109    s->v128 = (((u128)(~seed)) << 64) | (seed | 1);
   110    (void)lehmer_u64_next(s);
   111  }
   112  
   113    inline u64
   114  random_u64(void)
   115  {
   116    return lehmer_u64_next(&rseed_u128);
   117  }
   118  
   119    inline void
   120  srandom_u64(const u64 seed)
   121  {
   122    lehmer_u64_seed(&rseed_u128, seed);
   123  }
   124  
   125    inline double
   126  random_double(void)
   127  {
    128    // uniformly distributed in [0.0, 1.0]
   129    const u64 r = random_u64();
   130    return ((double)r) * (1.0 / ((double)(~0lu)));
   131  }
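// Usage sketch (illustrative only, not compiled): the Lehmer state is
// per-thread, so each thread can seed and draw values independently.
#if 0
  static void
random_example(void)
{
  srandom_u64(42);                          // seed this thread's generator
  const u64 x = random_u64();               // full 64-bit value
  const u64 die = (random_u64() % 6) + 1;   // 1..6 (tiny modulo bias)
  const double p = random_double();         // in [0.0, 1.0]
  fprintf(stderr, "%lu %lu %.6f\n", x, die, p);
}
#endif // usage sketch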
   132  // }}} random
   133  
   134  // timing {{{
   135    inline u64
   136  time_nsec(void)
   137  {
   138    struct timespec ts;
   139    // MONO_RAW is 5x to 10x slower than MONO
   140    clock_gettime(CLOCK_MONOTONIC, &ts);
   141    return ((u64)ts.tv_sec) * 1000000000lu + ((u64)ts.tv_nsec);
   142  }
   143  
   144    inline double
   145  time_sec(void)
   146  {
   147    const u64 nsec = time_nsec();
   148    return ((double)nsec) * 1.0e-9;
   149  }
   150  
   151    inline u64
   152  time_diff_nsec(const u64 last)
   153  {
   154    return time_nsec() - last;
   155  }
   156  
   157    inline double
   158  time_diff_sec(const double last)
   159  {
   160    return time_sec() - last;
   161  }
   162  
   163  // need char str[64]
   164    void
   165  time_stamp(char * str, const size_t size)
   166  {
   167    time_t now;
   168    struct tm nowtm;
   169    time(&now);
   170    localtime_r(&now, &nowtm);
   171    strftime(str, size, "%F %T %z", &nowtm);
   172  }
   173  
   174    void
   175  time_stamp2(char * str, const size_t size)
   176  {
   177    time_t now;
   178    struct tm nowtm;
   179    time(&now);
   180    localtime_r(&now, &nowtm);
   181    strftime(str, size, "%F-%H-%M-%S%z", &nowtm);
   182  }
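// Usage sketch (illustrative only, not compiled); busy_work() is a placeholder
// for whatever code is being measured.
#if 0
  static void
timing_example(void)
{
  char ts[64];
  time_stamp(ts, 64);                 // e.g. "2021-06-01 12:00:00 +0000"
  const u64 t0 = time_nsec();
  busy_work();                        // placeholder
  const u64 dt = time_diff_nsec(t0);
  fprintf(stderr, "[%s] took %lu ns\n", ts, dt);
}
#endif // usage sketch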
   183  // }}} timing
   184  
   185  // cpucache {{{
   186    inline void
   187  cpu_pause(void)
   188  {
   189  #if defined(__x86_64__)
   190    _mm_pause();
   191  #elif defined(__aarch64__)
   192    // nop
   193  #endif
   194  }
   195  
   196    inline void
   197  cpu_mfence(void)
   198  {
   199    atomic_thread_fence(MO_SEQ_CST);
   200  }
   201  
   202  // compiler fence
   203    inline void
   204  cpu_cfence(void)
   205  {
   206    atomic_thread_fence(MO_ACQ_REL);
   207  }
   208  
   209    inline void
   210  cpu_prefetch0(const void * const ptr)
   211  {
   212    __builtin_prefetch(ptr, 0, 0);
   213  }
   214  
   215    inline void
   216  cpu_prefetch1(const void * const ptr)
   217  {
   218    __builtin_prefetch(ptr, 0, 1);
   219  }
   220  
   221    inline void
   222  cpu_prefetch2(const void * const ptr)
   223  {
   224    __builtin_prefetch(ptr, 0, 2);
   225  }
   226  
   227    inline void
   228  cpu_prefetch3(const void * const ptr)
   229  {
   230    __builtin_prefetch(ptr, 0, 3);
   231  }
   232  
   233    inline void
   234  cpu_prefetchw(const void * const ptr)
   235  {
   236    __builtin_prefetch(ptr, 1, 0);
   237  }
   238  // }}} cpucache
   239  
   240  // crc32c {{{
   241    inline u32
   242  crc32c_u8(const u32 crc, const u8 v)
   243  {
   244  #if defined(__x86_64__)
   245    return _mm_crc32_u8(crc, v);
   246  #elif defined(__aarch64__)
   247    return __crc32cb(crc, v);
   248  #endif
   249  }
   250  
   251    inline u32
   252  crc32c_u16(const u32 crc, const u16 v)
   253  {
   254  #if defined(__x86_64__)
   255    return _mm_crc32_u16(crc, v);
   256  #elif defined(__aarch64__)
   257    return __crc32ch(crc, v);
   258  #endif
   259  }
   260  
   261    inline u32
   262  crc32c_u32(const u32 crc, const u32 v)
   263  {
   264  #if defined(__x86_64__)
   265    return _mm_crc32_u32(crc, v);
   266  #elif defined(__aarch64__)
   267    return __crc32cw(crc, v);
   268  #endif
   269  }
   270  
   271    inline u32
   272  crc32c_u64(const u32 crc, const u64 v)
   273  {
   274  #if defined(__x86_64__)
   275    return (u32)_mm_crc32_u64(crc, v);
   276  #elif defined(__aarch64__)
   277    return (u32)__crc32cd(crc, v);
   278  #endif
   279  }
   280  
   281    inline u32
   282  crc32c_inc_123(const u8 * buf, u32 nr, u32 crc)
   283  {
   284    if (nr == 1)
   285      return crc32c_u8(crc, buf[0]);
   286  
   287    crc = crc32c_u16(crc, *(u16 *)buf);
   288    return (nr == 2) ? crc : crc32c_u8(crc, buf[2]);
   289  }
   290  
   291    inline u32
   292  crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc)
   293  {
   294    //debug_assert((nr & 3) == 0);
   295    const u32 nr8 = nr >> 3;
   296  #pragma nounroll
   297    for (u32 i = 0; i < nr8; i++)
   298      crc = crc32c_u64(crc, ((u64*)buf)[i]);
   299  
   300    if (nr & 4u)
   301      crc = crc32c_u32(crc, ((u32*)buf)[nr8<<1]);
   302    return crc;
   303  }
   304  
   305    u32
   306  crc32c_inc(const u8 * buf, u32 nr, u32 crc)
   307  {
   308    crc = crc32c_inc_x4(buf, nr, crc);
   309    const u32 nr123 = nr & 3u;
   310    return nr123 ? crc32c_inc_123(buf + nr - nr123, nr123, crc) : crc;
   311  }
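// Usage sketch (illustrative only, not compiled): crc32c_inc can be chained
// over consecutive fragments; the initial seed is chosen by the caller.
#if 0
  static u32
crc32c_example(const u8 * const buf, const u32 len)
{
  u32 crc = ~0u;                                            // caller-chosen seed
  crc = crc32c_inc(buf, len / 2, crc);                      // first half
  crc = crc32c_inc(buf + (len / 2), len - (len / 2), crc);  // second half
  return crc;                // same result as crc32c_inc(buf, len, ~0u)
}
#endif // usage sketch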
   312  // }}} crc32c
   313  
   314  // debug {{{
   315    void
   316  debug_break(void)
   317  {
   318    usleep(100);
   319  }
   320  
   321  static u64 * debug_watch_u64 = NULL;
   322  
   323    static void
   324  watch_u64_handler(const int sig)
   325  {
   326    (void)sig;
   327    const u64 v = debug_watch_u64 ? (*debug_watch_u64) : 0;
   328    fprintf(stderr, "[USR1] %lu (0x%lx)\n", v, v);
   329  }
   330  
   331    void
   332  watch_u64_usr1(u64 * const ptr)
   333  {
   334    debug_watch_u64 = ptr;
   335    struct sigaction sa = {};
   336    sa.sa_handler = watch_u64_handler;
   337    sigemptyset(&(sa.sa_mask));
   338    sa.sa_flags = SA_RESTART;
   339    if (sigaction(SIGUSR1, &sa, NULL) == -1) {
   340      fprintf(stderr, "Failed to set signal handler for SIGUSR1\n");
   341    } else {
   342      fprintf(stderr, "to watch> kill -s SIGUSR1 %d\n", getpid());
   343    }
   344  }
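// Usage sketch: hand the address of a live counter to watch_u64_usr1() and the
// SIGUSR1 handler above prints its current value without stopping the process:
//   static u64 nr_ops = 0;          // hypothetical counter
//   watch_u64_usr1(&nr_ops);        // then: $ kill -s SIGUSR1 <pid>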
   345  
   346  static void * debug_bt_state = NULL;
   347  #if defined(BACKTRACE) && defined(__linux__)
   348  // TODO: get exec path on MacOS and FreeBSD
   349  
   350  #include <backtrace.h>
   351  static char debug_filepath[1024] = {};
   352  
   353    static void
   354  debug_bt_error_cb(void * const data, const char * const msg, const int errnum)
   355  {
   356    (void)data;
   357    if (msg)
   358      dprintf(2, "libbacktrace: %s %s\n", msg, strerror(errnum));
   359  }
   360  
   361    static int
   362  debug_bt_print_cb(void * const data, const uintptr_t pc,
   363      const char * const file, const int lineno, const char * const func)
   364  {
   365    u32 * const plevel = (typeof(plevel))data;
   366    if (file || func || lineno) {
   367      dprintf(2, "[%u]0x%012lx " TERMCLR(35) "%s" TERMCLR(31) ":" TERMCLR(34) "%d" TERMCLR(0)" %s\n",
   368          *plevel, pc, file ? file : "???", lineno, func ? func : "???");
   369    } else if (pc) {
   370      dprintf(2, "[%u]0x%012lx ??\n", *plevel, pc);
   371    }
   372    (*plevel)++;
   373    return 0;
   374  }
   375  
   376  __attribute__((constructor))
   377    static void
   378  debug_backtrace_init(void)
   379  {
   380    const ssize_t len = readlink("/proc/self/exe", debug_filepath, 1023);
   381    // disable backtrace
   382    if (len < 0 || len >= 1023)
   383      return;
   384  
   385    debug_filepath[len] = '\0';
   386    debug_bt_state = backtrace_create_state(debug_filepath, 1, debug_bt_error_cb, NULL);
   387  }
   388  #endif // BACKTRACE
   389  
   390    static void
   391  debug_wait_gdb(void * const bt_state)
   392  {
   393    if (bt_state) {
   394  #if defined(BACKTRACE)
   395      dprintf(2, "Backtrace :\n");
   396      u32 level = 0;
   397      backtrace_full(debug_bt_state, 1, debug_bt_print_cb, debug_bt_error_cb, &level);
   398  #endif // BACKTRACE
   399    } else { // fallback to execinfo if no backtrace or initialization failed
   400      void *array[64];
   401      const int size = backtrace(array, 64);
   402      dprintf(2, "Backtrace (%d):\n", size - 1);
   403      backtrace_symbols_fd(array + 1, size - 1, 2);
   404    }
   405  
   406    abool v = true;
   407    char timestamp[32];
   408    time_stamp(timestamp, 32);
   409    char threadname[32] = {};
   410    thread_get_name(pthread_self(), threadname, 32);
   411    strcat(threadname, "(!!)");
   412    thread_set_name(pthread_self(), threadname);
   413    char hostname[32];
   414    gethostname(hostname, 32);
   415  
   416    const char * const pattern = "[Waiting GDB] %1$s %2$s @ %3$s\n"
   417      "    Attach me: " TERMCLR(31) "sudo -Hi gdb -p %4$d" TERMCLR(0) "\n";
   418    char buf[256];
   419    sprintf(buf, pattern, timestamp, threadname, hostname, getpid());
   420    write(2, buf, strlen(buf));
   421  
   422    // to continue: gdb> set var v = 0
   423    // to kill from shell: $ kill %pid; kill -CONT %pid
   424  
   425    // uncomment this line to surrender the shell on error
   426    // kill(getpid(), SIGSTOP); // stop burning cpu, once
   427  
   428    static au32 nr_waiting = 0;
   429    const u32 seq = atomic_fetch_add_explicit(&nr_waiting, 1, MO_RELAXED);
   430    if (seq == 0) {
   431      sprintf(buf, "/run/user/%u/.debug_wait_gdb_pid", getuid());
   432      const int pidfd = open(buf, O_CREAT|O_TRUNC|O_WRONLY, 00644);
   433      if (pidfd >= 0) {
   434        dprintf(pidfd, "%u", getpid());
   435        close(pidfd);
   436      }
   437    }
   438  
   439  #pragma nounroll
   440    while (atomic_load_explicit(&v, MO_CONSUME))
   441      sleep(1);
   442  }
   443  
   444  #ifndef NDEBUG
   445    void
   446  debug_assert(const bool v)
   447  {
   448    if (!v)
   449      debug_wait_gdb(debug_bt_state);
   450  }
   451  #endif
   452  
   453  __attribute__((noreturn))
   454    void
   455  debug_die(void)
   456  {
   457    debug_wait_gdb(debug_bt_state);
   458    exit(0);
   459  }
   460  
   461  __attribute__((noreturn))
   462    void
   463  debug_die_perror(void)
   464  {
   465    perror(NULL);
   466    debug_die();
   467  }
   468  
   469  #if !defined(NOSIGNAL)
   470  // signal handler for wait_gdb on fatal errors
   471    static void
   472  wait_gdb_handler(const int sig, siginfo_t * const info, void * const context)
   473  {
   474    (void)info;
   475    (void)context;
   476    char buf[64] = "[SIGNAL] ";
   477    strcat(buf, strsignal(sig));
   478    write(2, buf, strlen(buf));
   479    debug_wait_gdb(NULL);
   480  }
   481  
   482  // setup hooks for catching fatal errors
   483  __attribute__((constructor))
   484    static void
   485  debug_init(void)
   486  {
   487    void * stack = pages_alloc_4kb(16);
   488    //fprintf(stderr, "altstack %p\n", stack);
   489    stack_t ss = {.ss_sp = stack, .ss_flags = 0, .ss_size = PGSZ*16};
   490    if (sigaltstack(&ss, NULL))
   491      fprintf(stderr, "sigaltstack failed\n");
   492  
   493    struct sigaction sa = {.sa_sigaction = wait_gdb_handler, .sa_flags = SA_SIGINFO | SA_ONSTACK};
   494    sigemptyset(&(sa.sa_mask));
   495    const int fatals[] = {SIGSEGV, SIGFPE, SIGILL, SIGBUS, 0};
   496    for (int i = 0; fatals[i]; i++) {
   497      if (sigaction(fatals[i], &sa, NULL) == -1) {
   498        fprintf(stderr, "Failed to set signal handler for %s\n", strsignal(fatals[i]));
   499        fflush(stderr);
   500      }
   501    }
   502  }
   503  
   504  __attribute__((destructor))
   505    static void
   506  debug_exit(void)
   507  {
   508    // to get rid of valgrind warnings
   509    stack_t ss = {.ss_flags = SS_DISABLE};
   510    stack_t oss = {};
   511    sigaltstack(&ss, &oss);
   512    if (oss.ss_sp)
   513      pages_unmap(oss.ss_sp, PGSZ * 16);
   514  }
   515  #endif // !defined(NOSIGNAL)
   516  
   517    void
   518  debug_dump_maps(FILE * const out)
   519  {
   520    FILE * const in = fopen("/proc/self/smaps", "r");
   521    char * line0 = yalloc(1024);
   522    size_t size0 = 1024;
   523    while (!feof(in)) {
   524      const ssize_t r1 = getline(&line0, &size0, in);
   525      if (r1 < 0) break;
   526      fprintf(out, "%s", line0);
   527    }
   528    fflush(out);
   529    fclose(in);
   530  }
   531  
   532  static pid_t perf_pid = 0;
   533  
   534  #if defined(__linux__)
   535  __attribute__((constructor))
   536    static void
   537  debug_perf_init(void)
   538  {
   539    const pid_t ppid = getppid();
   540    char tmp[256] = {};
   541    sprintf(tmp, "/proc/%d/cmdline", ppid);
   542    FILE * const fc = fopen(tmp, "r");
   543    const size_t nr = fread(tmp, 1, sizeof(tmp) - 1, fc);
   544    fclose(fc);
   545    // look for "perf record"
   546    if (nr < 12)
   547      return;
   548    tmp[nr] = '\0';
   549    for (u64 i = 0; i < nr; i++)
   550      if (tmp[i] == 0)
   551        tmp[i] = ' ';
   552  
   553    char * const perf = strstr(tmp, "perf record");
   554    if (perf) {
   555      fprintf(stderr, "%s: perf detected\n", __func__);
   556      perf_pid = ppid;
   557    }
   558  }
   559  #endif // __linux__
   560  
   561    bool
   562  debug_perf_switch(void)
   563  {
   564    if (perf_pid > 0) {
   565      kill(perf_pid, SIGUSR2);
   566      return true;
   567    } else {
   568      return false;
   569    }
   570  }
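// Note: debug_perf_switch() signals a parent "perf record" (detected by the
// constructor above) with SIGUSR2; when perf was started with --switch-output,
// the signal rotates its output file, so a benchmark can bracket the phases it
// wants profiled. It is a no-op (returns false) when perf is not the parent.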
   571  // }}} debug
   572  
   573  // mm {{{
   574  #ifdef ALLOCFAIL
   575    bool
   576  alloc_fail(void)
   577  {
   578  #define ALLOCFAIL_RECP ((64lu))
   579  #define ALLOCFAIL_MAGIC ((ALLOCFAIL_RECP / 3lu))
   580    return ((random_u64() % ALLOCFAIL_RECP) == ALLOCFAIL_MAGIC);
   581  }
   582  
   583  #ifdef MALLOCFAIL
   584  extern void * __libc_malloc(size_t size);
   585    void *
   586  malloc(size_t size)
   587  {
   588    if (alloc_fail())
   589      return NULL;
   590    return __libc_malloc(size);
   591  }
   592  
   593  extern void * __libc_calloc(size_t nmemb, size_t size);
   594    void *
   595  calloc(size_t nmemb, size_t size)
   596  {
   597    if (alloc_fail())
   598      return NULL;
   599    return __libc_calloc(nmemb, size);
   600  }
   601  
   602  extern void *__libc_realloc(void *ptr, size_t size);
   603  
   604    void *
   605  realloc(void *ptr, size_t size)
   606  {
   607    if (alloc_fail())
   608      return NULL;
   609    return __libc_realloc(ptr, size);
   610  }
    611  #endif // MALLOCFAIL
    612  #endif // ALLOCFAIL
   613  
   614    void *
   615  xalloc(const size_t align, const size_t size)
   616  {
   617  #ifdef ALLOCFAIL
   618    if (alloc_fail())
   619      return NULL;
   620  #endif
   621    void * p;
   622    return (posix_memalign(&p, align, size) == 0) ? p : NULL;
   623  }
   624  
   625  // alloc cache-line aligned address
   626    void *
   627  yalloc(const size_t size)
   628  {
   629  #ifdef ALLOCFAIL
   630    if (alloc_fail())
   631      return NULL;
   632  #endif
   633    void * p;
   634    return (posix_memalign(&p, 64, size) == 0) ? p : NULL;
   635  }
   636  
   637    void **
   638  malloc_2d(const size_t nr, const size_t size)
   639  {
   640    const size_t size1 = nr * sizeof(void *);
   641    const size_t size2 = nr * size;
   642    void ** const mem = malloc(size1 + size2);
   643    u8 * const mem2 = ((u8 *)mem) + size1;
   644    for (size_t i = 0; i < nr; i++)
   645      mem[i] = mem2 + (i * size);
   646    return mem;
   647  }
   648  
   649    inline void **
   650  calloc_2d(const size_t nr, const size_t size)
   651  {
   652    void ** const ret = malloc_2d(nr, size);
   653    memset(ret[0], 0, nr * size);
   654    return ret;
   655  }
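// Usage sketch (illustrative only, not compiled): malloc_2d/calloc_2d pack the
// row pointers and the row payloads into a single allocation, so one free()
// releases everything.
#if 0
  static void
alloc_2d_example(void)
{
  u32 ** const m = (u32 **)calloc_2d(8, 16 * sizeof(u32)); // 8 rows of 16 u32
  if (m == NULL)
    return;
  m[3][5] = 42;   // rows live right after the pointer array, zero-initialized
  free(m);        // frees the pointers and the payload together
}
#endif // usage sketch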
   656  
   657    inline void
   658  pages_unmap(void * const ptr, const size_t size)
   659  {
   660  #ifndef HEAPCHECKING
   661    munmap(ptr, size);
   662  #else
   663    (void)size;
   664    free(ptr);
   665  #endif
   666  }
   667  
   668    void
   669  pages_lock(void * const ptr, const size_t size)
   670  {
   671    static bool use_mlock = true;
   672    if (use_mlock) {
   673      const int ret = mlock(ptr, size);
   674      if (ret != 0) {
   675        use_mlock = false;
   676        fprintf(stderr, "%s: mlock disabled\n", __func__);
   677      }
   678    }
   679  }
   680  
   681  #ifndef HEAPCHECKING
   682    static void *
   683  pages_do_alloc(const size_t size, const int flags)
   684  {
   685    // vi /etc/security/limits.conf
   686    // * - memlock unlimited
   687    void * const p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
   688    if (p == MAP_FAILED)
   689      return NULL;
   690  
   691    pages_lock(p, size);
   692    return p;
   693  }
   694  
   695  #if defined(__linux__) && defined(MAP_HUGETLB)
   696  
   697  #if defined(MAP_HUGE_SHIFT)
   698  #define PAGES_FLAGS_1G ((MAP_HUGETLB | (30 << MAP_HUGE_SHIFT)))
   699  #define PAGES_FLAGS_2M ((MAP_HUGETLB | (21 << MAP_HUGE_SHIFT)))
   700  #else // MAP_HUGE_SHIFT
   701  #define PAGES_FLAGS_1G ((MAP_HUGETLB))
   702  #define PAGES_FLAGS_2M ((MAP_HUGETLB))
   703  #endif // MAP_HUGE_SHIFT
   704  
   705  #else
   706  #define PAGES_FLAGS_1G ((0))
   707  #define PAGES_FLAGS_2M ((0))
   708  #endif // __linux__
   709  
   710  #endif // HEAPCHECKING
   711  
   712    inline void *
   713  pages_alloc_1gb(const size_t nr_1gb)
   714  {
   715    const u64 sz = nr_1gb << 30;
   716  #ifndef HEAPCHECKING
   717    return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_1G);
   718  #else
   719    void * const p = xalloc(1lu << 21, sz); // Warning: valgrind fails with 30
   720    if (p)
   721      memset(p, 0, sz);
   722    return p;
   723  #endif
   724  }
   725  
   726    inline void *
   727  pages_alloc_2mb(const size_t nr_2mb)
   728  {
   729    const u64 sz = nr_2mb << 21;
   730  #ifndef HEAPCHECKING
   731    return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_2M);
   732  #else
   733    void * const p = xalloc(1lu << 21, sz);
   734    if (p)
   735      memset(p, 0, sz);
   736    return p;
   737  #endif
   738  }
   739  
   740    inline void *
   741  pages_alloc_4kb(const size_t nr_4kb)
   742  {
   743    const size_t sz = nr_4kb << 12;
   744  #ifndef HEAPCHECKING
   745    return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS);
   746  #else
   747    void * const p = xalloc(1lu << 12, sz);
   748    if (p)
   749      memset(p, 0, sz);
   750    return p;
   751  #endif
   752  }
   753  
   754    void *
   755  pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out)
   756  {
   757  #ifdef ALLOCFAIL
   758    if (alloc_fail())
   759      return NULL;
   760  #endif
   761    // 1gb huge page: at least 0.25GB
   762    if (try_1gb) {
   763      if (size >= (1lu << 28)) {
   764        const size_t nr_1gb = bits_round_up(size, 30) >> 30;
   765        void * const p1 = pages_alloc_1gb(nr_1gb);
   766        if (p1) {
   767          *size_out = nr_1gb << 30;
   768          return p1;
   769        }
   770      }
   771    }
   772  
   773    // 2mb huge page: at least 0.5MB
   774    if (size >= (1lu << 19)) {
   775      const size_t nr_2mb = bits_round_up(size, 21) >> 21;
   776      void * const p2 = pages_alloc_2mb(nr_2mb);
   777      if (p2) {
   778        *size_out = nr_2mb << 21;
   779        return p2;
   780      }
   781    }
   782  
   783    const size_t nr_4kb = bits_round_up(size, 12) >> 12;
   784    void * const p3 = pages_alloc_4kb(nr_4kb);
   785    if (p3)
   786      *size_out = nr_4kb << 12;
   787    return p3;
   788  }
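// Usage sketch (illustrative only, not compiled): pages_alloc_best reports the
// size it actually mapped; keep it around for the matching pages_unmap().
#if 0
  static void
pages_example(void)
{
  u64 actual = 0;
  void * const buf = pages_alloc_best(300lu << 20, true, &actual); // ~300 MB
  if (buf == NULL)
    return;
  memset(buf, 0, actual);       // the mapping may be larger than requested
  pages_unmap(buf, actual);     // must pass the size reported by the allocator
}
#endif // usage sketch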
   789  // }}} mm
   790  
   791  // process/thread {{{
   792  static u32 process_ncpu;
   793  #if defined(__FreeBSD__)
   794  typedef cpuset_t cpu_set_t;
   795  #elif defined(__APPLE__) && defined(__MACH__)
   796  typedef u64 cpu_set_t;
   797  #define CPU_SETSIZE ((64))
   798  #define CPU_COUNT(__cpu_ptr__) (__builtin_popcountl(*__cpu_ptr__))
   799  #define CPU_ISSET(__cpu_idx__, __cpu_ptr__) (((*__cpu_ptr__) >> __cpu_idx__) & 1lu)
   800  #define CPU_ZERO(__cpu_ptr__) ((*__cpu_ptr__) = 0)
   801  #define CPU_SET(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) |= (1lu << __cpu_idx__))
   802  #define CPU_CLR(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) &= ~(1lu << __cpu_idx__))
   803  #define pthread_attr_setaffinity_np(...) ((void)0)
   804  #endif
   805  
   806  __attribute__((constructor))
   807    static void
   808  process_init(void)
   809  {
   810    // Linux's default is 1024 cpus
   811    process_ncpu = (u32)sysconf(_SC_NPROCESSORS_CONF);
   812    if (process_ncpu > CPU_SETSIZE) {
   813      fprintf(stderr, "%s: can use only %zu cores\n",
   814          __func__, (size_t)CPU_SETSIZE);
   815      process_ncpu = CPU_SETSIZE;
   816    }
   817    thread_set_name(pthread_self(), "main");
   818  }
   819  
   820    static inline int
   821  thread_getaffinity_set(cpu_set_t * const cpuset)
   822  {
   823  #if defined(__linux__)
   824    return sched_getaffinity(0, sizeof(*cpuset), cpuset);
   825  #elif defined(__FreeBSD__)
   826    return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset);
   827  #elif defined(__APPLE__) && defined(__MACH__)
   828    *cpuset = (1lu << process_ncpu) - 1;
   829    return (int)process_ncpu; // TODO
   830  #endif // OS
   831  }
   832  
   833    static inline int
   834  thread_setaffinity_set(const cpu_set_t * const cpuset)
   835  {
   836  #if defined(__linux__)
   837    return sched_setaffinity(0, sizeof(*cpuset), cpuset);
   838  #elif defined(__FreeBSD__)
   839    return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset);
   840  #elif defined(__APPLE__) && defined(__MACH__)
   841    (void)cpuset; // TODO
   842    return 0;
   843  #endif // OS
   844  }
   845  
   846    void
   847  thread_get_name(const pthread_t pt, char * const name, const size_t len)
   848  {
   849  #if defined(__linux__)
   850    pthread_getname_np(pt, name, len);
   851  #elif defined(__FreeBSD__)
   852    pthread_get_name_np(pt, name, len);
   853  #elif defined(__APPLE__) && defined(__MACH__)
   854    (void)pt;
   855    (void)len;
   856    strcpy(name, "unknown"); // TODO
   857  #endif // OS
   858  }
   859  
   860    void
   861  thread_set_name(const pthread_t pt, const char * const name)
   862  {
   863  #if defined(__linux__)
   864    pthread_setname_np(pt, name);
   865  #elif defined(__FreeBSD__)
   866    pthread_set_name_np(pt, name);
   867  #elif defined(__APPLE__) && defined(__MACH__)
   868    (void)pt;
   869    (void)name; // TODO
   870  #endif // OS
   871  }
   872  
   873  // kB
   874    long
   875  process_get_rss(void)
   876  {
   877    struct rusage rs;
   878    getrusage(RUSAGE_SELF, &rs);
   879    return rs.ru_maxrss;
   880  }
   881  
   882    u32
   883  process_affinity_count(void)
   884  {
   885    cpu_set_t set;
   886    if (thread_getaffinity_set(&set) != 0)
   887      return process_ncpu;
   888  
   889    const u32 nr = (u32)CPU_COUNT(&set);
   890    return nr ? nr : process_ncpu;
   891  }
   892  
   893    u32
   894  process_getaffinity_list(const u32 max, u32 * const cores)
   895  {
   896    memset(cores, 0, max * sizeof(cores[0]));
   897    cpu_set_t set;
   898    if (thread_getaffinity_set(&set) != 0)
   899      return 0;
   900  
   901    const u32 nr_affinity = (u32)CPU_COUNT(&set);
   902    const u32 nr = nr_affinity < max ? nr_affinity : max;
   903    u32 j = 0;
   904    for (u32 i = 0; i < process_ncpu; i++) {
   905      if (CPU_ISSET(i, &set))
   906        cores[j++] = i;
   907  
   908      if (j >= nr)
   909        break;
   910    }
   911    return j;
   912  }
   913  
   914    void
   915  thread_setaffinity_list(const u32 nr, const u32 * const list)
   916  {
   917    cpu_set_t set;
   918    CPU_ZERO(&set);
   919    for (u32 i = 0; i < nr; i++)
   920      if (list[i] < process_ncpu)
   921        CPU_SET(list[i], &set);
   922    thread_setaffinity_set(&set);
   923  }
   924  
   925    void
   926  thread_pin(const u32 cpu)
   927  {
   928    cpu_set_t set;
   929    CPU_ZERO(&set);
   930    CPU_SET(cpu % process_ncpu, &set);
   931    thread_setaffinity_set(&set);
   932  }
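// Usage sketch (illustrative only, not compiled): list the cores this process
// may run on, then pin the calling thread to the first one.
#if 0
  static void
affinity_example(void)
{
  u32 cores[CPU_SETSIZE];
  const u32 nr = process_getaffinity_list(CPU_SETSIZE, cores);
  if (nr)
    thread_pin(cores[0]);       // thread_pin() wraps the id modulo process_ncpu
  fprintf(stderr, "%u usable cores, max rss %ld kB\n", nr, process_get_rss());
}
#endif // usage sketch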
   933  
   934    u64
   935  process_cpu_time_usec(void)
   936  {
   937    struct rusage rs;
   938    getrusage(RUSAGE_SELF, &rs);
   939    const u64 usr = (((u64)rs.ru_utime.tv_sec) * 1000000lu) + ((u64)rs.ru_utime.tv_usec);
   940    const u64 sys = (((u64)rs.ru_stime.tv_sec) * 1000000lu) + ((u64)rs.ru_stime.tv_usec);
   941    return usr + sys;
   942  }
   943  
   944  struct fork_join_info {
   945    u32 total;
   946    u32 ncores;
   947    u32 * cores;
   948    void *(*func)(void *);
   949    bool args;
   950    union {
   951      void * arg1;
   952      void ** argn;
   953    };
   954    union {
   955      struct { volatile au32 ferr, jerr; };
   956      volatile au64 xerr;
   957    };
   958  };
   959  
   960  // DON'T CHANGE!
   961  #define FORK_JOIN_RANK_BITS ((16)) // 16
   962  #define FORK_JOIN_MAX ((1u << FORK_JOIN_RANK_BITS))
   963  
   964  /*
   965   * fj(6):     T0
   966   *         /      \
   967   *       T0        T4
   968   *     /   \      /
   969   *    T0   T2    T4
   970   *   / \   / \   / \
   971   *  t0 t1 t2 t3 t4 t5
   972   */
   973  
   974  // recursive tree fork-join
   975    static void *
   976  thread_do_fork_join_worker(void * const ptr)
   977  {
   978    struct entry13 fjp = {.ptr = ptr};
    979    // GCC: without an explicit cast, the value read from fjp.e3 (a bitfield
    980    // narrower than u64) gets its high bits truncated, behavior gcc treats as
    981    // correct; hence the u64_to_ptr() below.
   982    struct fork_join_info * const fji = u64_to_ptr(fjp.e3);
   983    const u32 rank = (u32)fjp.e1;
   984  
   985    const u32 nchild = (u32)__builtin_ctz(rank ? rank : bits_p2_up_u32(fji->total));
   986    debug_assert(nchild <= FORK_JOIN_RANK_BITS);
   987    pthread_t tids[FORK_JOIN_RANK_BITS];
   988    if (nchild) {
   989      cpu_set_t set;
   990      CPU_ZERO(&set);
   991      pthread_attr_t attr;
   992      pthread_attr_init(&attr);
   993      //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Joinable by default
   994      // fork top-down
   995      for (u32 i = nchild - 1; i < nchild; i--) {
   996        const u32 cr = rank + (1u << i); // child's rank
   997        if (cr >= fji->total)
   998          continue; // should not break
   999        const u32 core = fji->cores[(cr < fji->ncores) ? cr : (cr % fji->ncores)];
  1000        CPU_SET(core, &set);
  1001        pthread_attr_setaffinity_np(&attr, sizeof(set), &set);
  1002        fjp.e1 = (u16)cr;
  1003        const int r = pthread_create(&tids[i], &attr, thread_do_fork_join_worker, fjp.ptr);
  1004        CPU_CLR(core, &set);
  1005        if (unlikely(r)) { // fork failed
  1006          memset(&tids[0], 0, sizeof(tids[0]) * (i+1));
  1007          u32 nmiss = (1u << (i + 1)) - 1;
  1008          if ((rank + nmiss) >= fji->total)
  1009            nmiss = fji->total - 1 - rank;
  1010          (void)atomic_fetch_add_explicit(&fji->ferr, nmiss, MO_RELAXED);
  1011          break;
  1012        }
  1013      }
  1014      pthread_attr_destroy(&attr);
  1015    }
  1016  
  1017    char thname0[16];
  1018    char thname1[16];
  1019    thread_get_name(pthread_self(), thname0, 16);
  1020    snprintf(thname1, 16, "%.8s_%u", thname0, rank);
  1021    thread_set_name(pthread_self(), thname1);
  1022  
  1023    void * const ret = fji->func(fji->args ? fji->argn[rank] : fji->arg1);
  1024  
  1025    thread_set_name(pthread_self(), thname0);
  1026    // join bottom-up
  1027    for (u32 i = 0; i < nchild; i++) {
  1028      const u32 cr = rank + (1u << i); // child rank
  1029      if (cr >= fji->total)
  1030        break; // safe to break
  1031      if (tids[i]) {
  1032        const int r = pthread_join(tids[i], NULL);
  1033        if (unlikely(r)) { // error
  1034          //fprintf(stderr, "pthread_join %u..%u = %d: %s\n", rank, cr, r, strerror(r));
  1035          (void)atomic_fetch_add_explicit(&fji->jerr, 1, MO_RELAXED);
  1036        }
  1037      }
  1038    }
  1039    return ret;
  1040  }
  1041  
  1042    u64
  1043  thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx)
  1044  {
  1045    if (unlikely(nr > FORK_JOIN_MAX)) {
  1046      fprintf(stderr, "%s reduce nr to %u\n", __func__, FORK_JOIN_MAX);
  1047      nr = FORK_JOIN_MAX;
  1048    }
  1049  
  1050    u32 cores[CPU_SETSIZE];
  1051    u32 ncores = process_getaffinity_list(process_ncpu, cores);
  1052    if (unlikely(ncores == 0)) { // force to use all cores
  1053      ncores = process_ncpu;
  1054      for (u32 i = 0; i < process_ncpu; i++)
  1055        cores[i] = i;
  1056    }
  1057    if (unlikely(nr == 0))
  1058      nr = ncores;
  1059  
   1060    // the compiler cannot tell that fji may change, since &fji is only passed out through fjp
  1061    struct fork_join_info fji = {.total = nr, .cores = cores, .ncores = ncores,
  1062        .func = func, .args = args, .arg1 = argx};
  1063    const struct entry13 fjp = entry13(0, (u64)(&fji));
  1064  
  1065    // save current affinity
  1066    cpu_set_t set0;
  1067    thread_getaffinity_set(&set0);
  1068  
  1069    // master thread shares thread0's core
  1070    cpu_set_t set;
  1071    CPU_ZERO(&set);
  1072    CPU_SET(fji.cores[0], &set);
  1073    thread_setaffinity_set(&set);
  1074  
  1075    const u64 t0 = time_nsec();
  1076    (void)thread_do_fork_join_worker(fjp.ptr);
  1077    const u64 dt = time_diff_nsec(t0);
  1078  
  1079    // restore original affinity
  1080    thread_setaffinity_set(&set0);
  1081  
  1082    // check and report errors (unlikely)
  1083    if (atomic_load_explicit(&fji.xerr, MO_CONSUME))
  1084      fprintf(stderr, "%s errors: fork %u join %u\n", __func__, fji.ferr, fji.jerr);
  1085    return dt;
  1086  }
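// Usage sketch (illustrative only, not compiled): run one worker per core.
// With args == false every worker receives argx; with args == true, argx is
// treated as an array and worker rank i receives argn[i].
#if 0
  static void *
fj_worker(void * const arg)
{
  au64 * const sum = (typeof(sum))arg;
  (void)atomic_fetch_add_explicit(sum, 1, MO_RELAXED);
  return NULL;
}

  static void
fork_join_example(void)
{
  au64 sum = 0;
  const u64 dt = thread_fork_join(0, fj_worker, false, &sum); // 0: all cores
  fprintf(stderr, "%lu workers finished in %lu ns\n",
      atomic_load_explicit(&sum, MO_RELAXED), dt);
}
#endif // usage sketch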
  1087  
  1088    int
  1089  thread_create_at(const u32 cpu, pthread_t * const thread,
  1090      void *(*start_routine) (void *), void * const arg)
  1091  {
  1092    const u32 cpu_id = (cpu < process_ncpu) ? cpu : (cpu % process_ncpu);
  1093    pthread_attr_t attr;
  1094    pthread_attr_init(&attr);
  1095    //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  1096    cpu_set_t set;
  1097  
  1098    CPU_ZERO(&set);
  1099    CPU_SET(cpu_id, &set);
  1100    pthread_attr_setaffinity_np(&attr, sizeof(set), &set);
  1101    const int r = pthread_create(thread, &attr, start_routine, arg);
  1102    pthread_attr_destroy(&attr);
  1103    return r;
  1104  }
  1105  // }}} process/thread
  1106  
  1107  // locking {{{
  1108  
  1109  // spinlock {{{
  1110  #if defined(__linux__)
  1111  #define SPINLOCK_PTHREAD
  1112  #endif // __linux__
  1113  
  1114  #if defined(SPINLOCK_PTHREAD)
  1115  static_assert(sizeof(pthread_spinlock_t) <= sizeof(spinlock), "spinlock size");
  1116  #else // SPINLOCK_PTHREAD
  1117  static_assert(sizeof(au32) <= sizeof(spinlock), "spinlock size");
  1118  #endif // SPINLOCK_PTHREAD
  1119  
  1120    void
  1121  spinlock_init(spinlock * const lock)
  1122  {
  1123  #if defined(SPINLOCK_PTHREAD)
  1124    pthread_spinlock_t * const p = (typeof(p))lock;
  1125    pthread_spin_init(p, PTHREAD_PROCESS_PRIVATE);
  1126  #else // SPINLOCK_PTHREAD
  1127    au32 * const p = (typeof(p))lock;
  1128    atomic_store_explicit(p, 0, MO_RELEASE);
  1129  #endif // SPINLOCK_PTHREAD
  1130  }
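// In the non-pthread variant below the lock word is 0 when free: trylock does
// fetch_sub(1) and wins only if the previous value was 0; spinlock_lock then
// spins reading the word until unlock stores 0 again before retrying the
// decrement.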
  1131  
  1132    inline void
  1133  spinlock_lock(spinlock * const lock)
  1134  {
  1135  #if defined(CORR)
  1136  #pragma nounroll
  1137    while (!spinlock_trylock(lock))
  1138      corr_yield();
  1139  #else // CORR
  1140  #if defined(SPINLOCK_PTHREAD)
  1141    pthread_spinlock_t * const p = (typeof(p))lock;
  1142    pthread_spin_lock(p); // return value ignored
  1143  #else // SPINLOCK_PTHREAD
  1144    au32 * const p = (typeof(p))lock;
  1145  #pragma nounroll
  1146    do {
  1147      if (atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0)
  1148        return;
  1149  #pragma nounroll
  1150      do {
  1151        cpu_pause();
  1152      } while (atomic_load_explicit(p, MO_CONSUME));
  1153    } while (true);
  1154  #endif // SPINLOCK_PTHREAD
  1155  #endif // CORR
  1156  }
  1157  
  1158    inline bool
  1159  spinlock_trylock(spinlock * const lock)
  1160  {
  1161  #if defined(SPINLOCK_PTHREAD)
  1162    pthread_spinlock_t * const p = (typeof(p))lock;
  1163    return !pthread_spin_trylock(p);
  1164  #else // SPINLOCK_PTHREAD
  1165    au32 * const p = (typeof(p))lock;
  1166    return atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0;
  1167  #endif // SPINLOCK_PTHREAD
  1168  }
  1169  
  1170    inline void
  1171  spinlock_unlock(spinlock * const lock)
  1172  {
  1173  #if defined(SPINLOCK_PTHREAD)
  1174    pthread_spinlock_t * const p = (typeof(p))lock;
  1175    pthread_spin_unlock(p); // return value ignored
  1176  #else // SPINLOCK_PTHREAD
  1177    au32 * const p = (typeof(p))lock;
  1178    atomic_store_explicit(p, 0, MO_RELEASE);
  1179  #endif // SPINLOCK_PTHREAD
  1180  }
  1181  // }}} spinlock
  1182  
  1183  // pthread mutex {{{
  1184  static_assert(sizeof(pthread_mutex_t) <= sizeof(mutex), "mutexlock size");
  1185    inline void
  1186  mutex_init(mutex * const lock)
  1187  {
  1188    pthread_mutex_t * const p = (typeof(p))lock;
  1189    pthread_mutex_init(p, NULL);
  1190  }
  1191  
  1192    inline void
  1193  mutex_lock(mutex * const lock)
  1194  {
  1195  #if defined(CORR)
  1196  #pragma nounroll
  1197    while (!mutex_trylock(lock))
  1198      corr_yield();
  1199  #else
  1200    pthread_mutex_t * const p = (typeof(p))lock;
  1201    pthread_mutex_lock(p); // return value ignored
  1202  #endif
  1203  }
  1204  
  1205    inline bool
  1206  mutex_trylock(mutex * const lock)
  1207  {
  1208    pthread_mutex_t * const p = (typeof(p))lock;
  1209    return !pthread_mutex_trylock(p); // return value ignored
  1210  }
  1211  
  1212    inline void
  1213  mutex_unlock(mutex * const lock)
  1214  {
  1215    pthread_mutex_t * const p = (typeof(p))lock;
  1216    pthread_mutex_unlock(p); // return value ignored
  1217  }
  1218  
  1219    inline void
  1220  mutex_deinit(mutex * const lock)
  1221  {
  1222    pthread_mutex_t * const p = (typeof(p))lock;
  1223    pthread_mutex_destroy(p);
  1224  }
  1225  // }}} pthread mutex
  1226  
  1227  // rwdep {{{
  1228  // poor man's lockdep for rwlock
  1229  // per-thread lock list
  1230  // it calls debug_die() when local double-(un)locking is detected
  1231  // cyclic dependencies can be manually identified by looking at the two lists below in gdb
  1232  #ifdef RWDEP
  1233  #define RWDEP_NR ((16))
  1234  __thread const rwlock * rwdep_readers[RWDEP_NR] = {};
  1235  __thread const rwlock * rwdep_writers[RWDEP_NR] = {};
  1236  
  1237    static void
  1238  rwdep_check(const rwlock * const lock)
  1239  {
  1240    debug_assert(lock);
  1241    for (u64 i = 0; i < RWDEP_NR; i++) {
  1242      if (rwdep_readers[i] == lock)
  1243        debug_die();
  1244      if (rwdep_writers[i] == lock)
  1245        debug_die();
  1246    }
  1247  }
  1248  #endif // RWDEP
  1249  
  1250    static void
  1251  rwdep_lock_read(const rwlock * const lock)
  1252  {
  1253  #ifdef RWDEP
  1254    rwdep_check(lock);
  1255    for (u64 i = 0; i < RWDEP_NR; i++) {
  1256      if (rwdep_readers[i] == NULL) {
  1257        rwdep_readers[i] = lock;
  1258        return;
  1259      }
  1260    }
  1261  #else
  1262    (void)lock;
  1263  #endif // RWDEP
  1264  }
  1265  
  1266    static void
  1267  rwdep_unlock_read(const rwlock * const lock)
  1268  {
  1269  #ifdef RWDEP
  1270    for (u64 i = 0; i < RWDEP_NR; i++) {
  1271      if (rwdep_readers[i] == lock) {
  1272        rwdep_readers[i] = NULL;
  1273        return;
  1274      }
  1275    }
  1276    debug_die();
  1277  #else
  1278    (void)lock;
  1279  #endif // RWDEP
  1280  }
  1281  
  1282    static void
  1283  rwdep_lock_write(const rwlock * const lock)
  1284  {
  1285  #ifdef RWDEP
  1286    rwdep_check(lock);
  1287    for (u64 i = 0; i < RWDEP_NR; i++) {
  1288      if (rwdep_writers[i] == NULL) {
  1289        rwdep_writers[i] = lock;
  1290        return;
  1291      }
  1292    }
  1293  #else
  1294    (void)lock;
  1295  #endif // RWDEP
  1296  }
  1297  
  1298    static void
  1299  rwdep_unlock_write(const rwlock * const lock)
  1300  {
  1301  #ifdef RWDEP
  1302    for (u64 i = 0; i < RWDEP_NR; i++) {
  1303      if (rwdep_writers[i] == lock) {
  1304        rwdep_writers[i] = NULL;
  1305        return;
  1306      }
  1307    }
  1308    debug_die();
  1309  #else
  1310    (void)lock;
  1311  #endif // RWDEP
  1312  }
  1313  // }}} rwlockdep
  1314  
  1315  // rwlock {{{
  1316  typedef au32 lock_t;
  1317  typedef u32 lock_v;
  1318  static_assert(sizeof(lock_t) == sizeof(lock_v), "lock size");
  1319  static_assert(sizeof(lock_t) <= sizeof(rwlock), "lock size");
  1320  
  1321  #define RWLOCK_WSHIFT ((sizeof(lock_t) * 8 - 1))
  1322  #define RWLOCK_WBIT ((((lock_v)1) << RWLOCK_WSHIFT))
  1323  
  1324    inline void
  1325  rwlock_init(rwlock * const lock)
  1326  {
  1327    lock_t * const pvar = (typeof(pvar))lock;
  1328    atomic_store_explicit(pvar, 0, MO_RELEASE);
  1329  }
  1330  
  1331    inline bool
  1332  rwlock_trylock_read(rwlock * const lock)
  1333  {
  1334    lock_t * const pvar = (typeof(pvar))lock;
  1335    if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) {
  1336      rwdep_lock_read(lock);
  1337      return true;
  1338    } else {
  1339      atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED);
  1340      return false;
  1341    }
  1342  }
  1343  
  1344    inline bool
  1345  rwlock_trylock_read_lp(rwlock * const lock)
  1346  {
  1347    lock_t * const pvar = (typeof(pvar))lock;
  1348    if (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) {
  1349      cpu_pause();
  1350      return false;
  1351    }
  1352    return rwlock_trylock_read(lock);
  1353  }
  1354  
   1355  // note: retries up to nr + 1 times before giving up
  1356    inline bool
  1357  rwlock_trylock_read_nr(rwlock * const lock, u16 nr)
  1358  {
  1359    lock_t * const pvar = (typeof(pvar))lock;
  1360    if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) {
  1361      rwdep_lock_read(lock);
  1362      return true;
  1363    }
  1364  
  1365  #pragma nounroll
  1366    do { // someone already locked; wait for a little while
  1367      cpu_pause();
  1368      if ((atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) == 0) {
  1369        rwdep_lock_read(lock);
  1370        return true;
  1371      }
  1372    } while (nr--);
  1373  
  1374    atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED);
  1375    return false;
  1376  }
  1377  
  1378    inline void
  1379  rwlock_lock_read(rwlock * const lock)
  1380  {
  1381    lock_t * const pvar = (typeof(pvar))lock;
  1382  #pragma nounroll
  1383    do {
  1384      if (rwlock_trylock_read(lock))
  1385        return;
  1386  #pragma nounroll
  1387      do {
  1388  #if defined(CORR)
  1389        corr_yield();
  1390  #else
  1391        cpu_pause();
  1392  #endif
  1393      } while (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT);
  1394    } while (true);
  1395  }
  1396  
  1397    inline void
  1398  rwlock_unlock_read(rwlock * const lock)
  1399  {
  1400    rwdep_unlock_read(lock);
  1401    lock_t * const pvar = (typeof(pvar))lock;
  1402    atomic_fetch_sub_explicit(pvar, 1, MO_RELEASE);
  1403  }
  1404  
  1405    inline bool
  1406  rwlock_trylock_write(rwlock * const lock)
  1407  {
  1408    lock_t * const pvar = (typeof(pvar))lock;
  1409    lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME);
  1410    if ((v0 == 0) && atomic_compare_exchange_weak_explicit(pvar, &v0, RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) {
  1411      rwdep_lock_write(lock);
  1412      return true;
  1413    } else {
  1414      return false;
  1415    }
  1416  }
  1417  
   1418  // note: attempts the lock up to nr + 1 times before giving up
  1419    inline bool
  1420  rwlock_trylock_write_nr(rwlock * const lock, u16 nr)
  1421  {
  1422  #pragma nounroll
  1423    do {
  1424      if (rwlock_trylock_write(lock))
  1425        return true;
  1426      cpu_pause();
  1427    } while (nr--);
  1428    return false;
  1429  }
  1430  
  1431    inline void
  1432  rwlock_lock_write(rwlock * const lock)
  1433  {
  1434    lock_t * const pvar = (typeof(pvar))lock;
  1435  #pragma nounroll
  1436    do {
  1437      if (rwlock_trylock_write(lock))
  1438        return;
  1439  #pragma nounroll
  1440      do {
  1441  #if defined(CORR)
  1442        corr_yield();
  1443  #else
  1444        cpu_pause();
  1445  #endif
  1446      } while (atomic_load_explicit(pvar, MO_CONSUME));
  1447    } while (true);
  1448  }
  1449  
  1450    inline bool
  1451  rwlock_trylock_write_hp(rwlock * const lock)
  1452  {
  1453    lock_t * const pvar = (typeof(pvar))lock;
  1454    lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME);
  1455    if (v0 >> RWLOCK_WSHIFT)
  1456      return false;
  1457  
  1458    if (atomic_compare_exchange_weak_explicit(pvar, &v0, v0|RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) {
  1459      rwdep_lock_write(lock);
  1460      // WBIT successfully marked; must wait for readers to leave
  1461      if (v0) { // saw active readers
  1462  #pragma nounroll
  1463        while (atomic_load_explicit(pvar, MO_CONSUME) != RWLOCK_WBIT) {
  1464  #if defined(CORR)
  1465          corr_yield();
  1466  #else
  1467          cpu_pause();
  1468  #endif
  1469        }
  1470      }
  1471      return true;
  1472    } else {
  1473      return false;
  1474    }
  1475  }
  1476  
  1477    inline bool
  1478  rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr)
  1479  {
  1480  #pragma nounroll
  1481    do {
  1482      if (rwlock_trylock_write_hp(lock))
  1483        return true;
  1484      cpu_pause();
  1485    } while (nr--);
  1486    return false;
  1487  }
  1488  
  1489    inline void
  1490  rwlock_lock_write_hp(rwlock * const lock)
  1491  {
  1492  #pragma nounroll
  1493    while (!rwlock_trylock_write_hp(lock)) {
  1494  #if defined(CORR)
  1495      corr_yield();
  1496  #else
  1497      cpu_pause();
  1498  #endif
  1499    }
  1500  }
  1501  
  1502    inline void
  1503  rwlock_unlock_write(rwlock * const lock)
  1504  {
  1505    rwdep_unlock_write(lock);
  1506    lock_t * const pvar = (typeof(pvar))lock;
  1507    atomic_fetch_sub_explicit(pvar, RWLOCK_WBIT, MO_RELEASE);
  1508  }
  1509  
  1510    inline void
  1511  rwlock_write_to_read(rwlock * const lock)
  1512  {
  1513    rwdep_unlock_write(lock);
  1514    rwdep_lock_read(lock);
  1515    lock_t * const pvar = (typeof(pvar))lock;
  1516    // +R -W
  1517    atomic_fetch_add_explicit(pvar, ((lock_v)1) - RWLOCK_WBIT, MO_ACQ_REL);
  1518  }
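// Usage sketch (illustrative only, not compiled): the lock word keeps the
// reader count in the low bits and the writer flag in the top bit; readers
// share, a plain writer acquires only when the word is 0, and the _hp variants
// set the writer bit first and then wait for the readers to drain.
#if 0
  static u64 shared_value;
  static rwlock shared_lock;    // rwlock_init(&shared_lock) before first use

  static u64
rwlock_reader_example(void)
{
  rwlock_lock_read(&shared_lock);
  const u64 v = shared_value;
  rwlock_unlock_read(&shared_lock);
  return v;
}

  static void
rwlock_writer_example(const u64 v)
{
  rwlock_lock_write(&shared_lock);
  shared_value = v;
  rwlock_unlock_write(&shared_lock);
}
#endif // usage sketch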
  1519  
  1520  #undef RWLOCK_WSHIFT
  1521  #undef RWLOCK_WBIT
  1522  // }}} rwlock
  1523  
  1524  // }}} locking
  1525  
  1526  // coroutine {{{
  1527  
   1528  extern u64 co_switch_stack(u64 * const saversp, const u64 newrsp, const u64 retval); // defined in the file-scope asm below
  1529  // asm {{{
  1530  #if defined(__x86_64__)
   1531  // number of registers pushed by co_switch_stack
  1532  #define CO_CONTEXT_SIZE ((6))
  1533  
  1534  // for switch/exit: pass a return value to the target
  1535  asm (
  1536      ".align 16;"
  1537  #if defined(__linux__) || defined(__FreeBSD__)
  1538      ".global co_switch_stack;"
  1539      ".type co_switch_stack, @function;"
  1540      "co_switch_stack:"
  1541  #elif defined(__APPLE__) && defined(__MACH__)
  1542      ".global _co_switch_stack;"
  1543      "_co_switch_stack:"
  1544  #else
  1545  #error Supported platforms: Linux/FreeBSD/Apple
  1546  #endif // OS
  1547      "push %rbp; push %rbx; push %r12;"
  1548      "push %r13; push %r14; push %r15;"
  1549      "mov  %rsp, (%rdi);"
  1550      "mov  %rsi, %rsp;"
  1551      "pop  %r15; pop  %r14; pop  %r13;"
  1552      "pop  %r12; pop  %rbx; pop  %rbp;"
  1553      "mov  %rdx, %rax;"
  1554      "retq;"
  1555      );
  1556  
  1557  #elif defined(__aarch64__)
   1558  // number of 8-byte register slots saved by co_switch_stack (x30, x19-x29, d8-d15)
  1559  #define CO_CONTEXT_SIZE ((20))
  1560  asm (
  1561      ".align 16;"
  1562  #if defined(__linux__) || defined(__FreeBSD__)
  1563      ".global co_switch_stack;"
  1564      ".type co_switch_stack, @function;"
  1565      "co_switch_stack:"
  1566  #elif defined(__APPLE__) && defined(__MACH__)
  1567      ".global _co_switch_stack;"
  1568      "_co_switch_stack:"
  1569  #else
  1570  #error supported platforms: Linux/FreeBSD/Apple
  1571  #endif // OS
  1572      "sub  x8, sp, 160;"
  1573      "str  x8, [x0];"
  1574      "stp x30, x19, [x8];      ldp x30, x19, [x1];"
  1575      "stp x20, x21, [x8, 16];  ldp x20, x21, [x1, 16];"
  1576      "stp x22, x23, [x8, 32];  ldp x22, x23, [x1, 32];"
  1577      "stp x24, x25, [x8, 48];  ldp x24, x25, [x1, 48];"
  1578      "stp x26, x27, [x8, 64];  ldp x26, x27, [x1, 64];"
  1579      "stp x28, x29, [x8, 80];  ldp x28, x29, [x1, 80];"
  1580      "stp  d8,  d9, [x8, 96];  ldp  d8,  d9, [x1, 96];"
  1581      "stp d10, d11, [x8, 112]; ldp d10, d11, [x1, 112];"
  1582      "stp d12, d13, [x8, 128]; ldp d12, d13, [x1, 128];"
  1583      "stp d14, d15, [x8, 144]; ldp d14, d15, [x1, 144];"
  1584      "add  sp, x1, 160;"
  1585      "mov  x0, x2;"
  1586      "br  x30;"
  1587      );
   1589  
   1590  extern void co_entry_aarch64(void); // defined in the file-scope asm below
  1591  asm (
  1592      ".align 16;"
  1593  #if defined(__linux__) || defined(__FreeBSD__)
  1594      ".global co_entry_aarch64;"
  1595      ".type co_entry_aarch64, @function;"
  1596      "co_entry_aarch64:"
  1597  #elif defined(__APPLE__) && defined(__MACH__)
  1598      ".global _co_entry_aarch64;"
  1599      "_co_entry_aarch64:"
  1600  #else
  1601  #error supported platforms: Linux/FreeBSD/Apple
  1602  #endif // OS
  1603      "ldr x8, [sp, 0];"
  1604      "blr x8;"
  1605      "ldr x8, [sp, 8];"
  1606      "blr x8;"
  1607      "ldr x8, [sp, 16];"
  1608      "blr x8;"
  1609      );
  1610  #else
  1611  #error supported CPUs: x86_64 or AArch64
  1612  #endif // co_switch_stack x86_64 and aarch64
  1613  // }}} asm
  1615  
  1616  // co {{{
  1617  struct co {
  1618    u64 rsp;
  1619    void * priv;
  1620    u64 * host; // set host to NULL to exit
  1621    size_t stksz;
  1622  };
  1623  
  1624  static __thread struct co * volatile co_curr = NULL; // NULL in host
  1625  
  1626  // the stack sits under the struct co
  1627    static void
  1628  co_init(struct co * const co, void * func, void * priv, u64 * const host,
  1629      const u64 stksz, void * func_exit)
  1630  {
  1631    debug_assert((stksz & 0x3f) == 0); // a multiple of 64 bytes
  1632    u64 * rsp = ((u64 *)co) - 4;
  1633    rsp[0] = (u64)func;
  1634    rsp[1] = (u64)func_exit;
  1635    rsp[2] = (u64)debug_die;
  1636    rsp[3] = 0;
  1637  
  1638    rsp -= CO_CONTEXT_SIZE;
  1639  
  1640  #if defined(__aarch64__)
  1641    rsp[0] = (u64)co_entry_aarch64;
  1642  #endif
  1643  
  1644    co->rsp = (u64)rsp;
  1645    co->priv = priv;
  1646    co->host = host;
  1647    co->stksz = stksz;
  1648  }
  1649  
  1650    static void
  1651  co_exit0(void)
  1652  {
  1653    co_exit(0);
  1654  }
  1655  
  1656    struct co *
  1657  co_create(const u64 stacksize, void * func, void * priv, u64 * const host)
  1658  {
  1659    const u64 stksz = bits_round_up(stacksize, 6);
  1660    const size_t alloc_size = stksz + sizeof(struct co);
  1661    u8 * const mem = yalloc(alloc_size);
  1662    if (mem == NULL)
  1663      return NULL;
  1664  
  1665  #ifdef CO_STACK_CHECK
  1666    memset(mem, 0x5c, stksz);
  1667  #endif // CO_STACK_CHECK
  1668  
  1669    struct co * const co = (typeof(co))(mem + stksz);
  1670    co_init(co, func, priv, host, stksz, co_exit0);
  1671    return co;
  1672  }
  1673  
  1674    inline void
  1675  co_reuse(struct co * const co, void * func, void * priv, u64 * const host)
  1676  {
  1677    co_init(co, func, priv, host, co->stksz, co_exit0);
  1678  }
  1679  
  1680    inline struct co *
  1681  co_fork(void * func, void * priv)
  1682  {
  1683    return co_curr ? co_create(co_curr->stksz, func, priv, co_curr->host) : NULL;
  1684  }
  1685  
  1686    inline void *
  1687  co_priv(void)
  1688  {
  1689    return co_curr ? co_curr->priv : NULL;
  1690  }
  1691  
  1692  // the host calls this to enter a coroutine.
  1693    inline u64
  1694  co_enter(struct co * const to, const u64 retval)
  1695  {
   1696    debug_assert(co_curr == NULL); // must enter from the host
  1697    debug_assert(to && to->host);
  1698    u64 * const save = to->host;
  1699    co_curr = to;
  1700    const u64 ret = co_switch_stack(save, to->rsp, retval);
  1701    co_curr = NULL;
  1702    return ret;
  1703  }
  1704  
  1705  // switch from a coroutine to another coroutine
  1706  // co_curr must be valid
  1707  // the target will resume and receive the retval
  1708    inline u64
  1709  co_switch_to(struct co * const to, const u64 retval)
  1710  {
  1711    debug_assert(co_curr);
  1712    debug_assert(co_curr != to);
  1713    debug_assert(to && to->host);
  1714    struct co * const save = co_curr;
  1715    co_curr = to;
  1716    return co_switch_stack(&(save->rsp), to->rsp, retval);
  1717  }
  1718  
  1719  // switch from a coroutine to the host routine
  1720  // co_yield is now a c++ keyword...
  1721    inline u64
  1722  co_back(const u64 retval)
  1723  {
  1724    debug_assert(co_curr);
  1725    struct co * const save = co_curr;
  1726    co_curr = NULL;
  1727    return co_switch_stack(&(save->rsp), *(save->host), retval);
  1728  }
  1729  
  1730  #ifdef CO_STACK_CHECK
  1731    static void
  1732  co_stack_check(const u8 * const mem, const u64 stksz)
  1733  {
  1734    const u64 * const mem64 = (typeof(mem64))mem;
  1735    const u64 size64 = stksz / sizeof(u64);
  1736    for (u64 i = 0; i < size64; i++) {
  1737      if (mem64[i] != 0x5c5c5c5c5c5c5c5clu) {
  1738        fprintf(stderr, "%s co stack usage: %lu/%lu\n", __func__, stksz - (i * sizeof(u64)), stksz);
  1739        break;
  1740      }
  1741    }
  1742  }
  1743  #endif // CO_STACK_CHECK
  1744  
  1745  // return to host and set host to NULL
  1746  __attribute__((noreturn))
  1747    void
  1748  co_exit(const u64 retval)
  1749  {
  1750    debug_assert(co_curr);
  1751  #ifdef CO_STACK_CHECK
  1752    const u64 stksz = co_curr->stksz;
  1753    u8 * const mem = ((u8 *)co_curr) - stksz;
  1754    co_stack_check(mem, stksz);
  1755  #endif // CO_STACK_CHECK
  1756    const u64 hostrsp = *(co_curr->host);
  1757    co_curr->host = NULL;
  1758    struct co * const save = co_curr;
  1759    co_curr = NULL;
  1760    (void)co_switch_stack(&(save->rsp), hostrsp, retval);
  1761    // return to co_enter
  1762    debug_die();
  1763  }
  1764  
  1765  // host is set to NULL on exit
  1766    inline bool
  1767  co_valid(struct co * const co)
  1768  {
  1769    return co->host != NULL;
  1770  }
  1771  
  1772  // return NULL on host
  1773    inline struct co *
  1774  co_self(void)
  1775  {
  1776    return co_curr;
  1777  }
  1778  
  1779    inline void
  1780  co_destroy(struct co * const co)
  1781  {
  1782    u8 * const mem = ((u8 *)co) - co->stksz;
  1783    free(mem);
  1784  }
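// Usage sketch (illustrative only, not compiled): the host drives a coroutine
// with co_enter(); the coroutine yields with co_back() and finishes with
// co_exit(). The PGSZ stack size is only for illustration.
#if 0
  static void
co_func_example(void)
{
  u64 * const counter = (typeof(counter))co_priv();
  for (u64 i = 0; i < 3; i++) {
    (*counter)++;
    (void)co_back(0);           // yield back to the host
  }
  co_exit(0);                   // clears the host pointer and returns to host
}

  static void
co_example(void)
{
  u64 host_rsp = 0;             // the host's saved stack pointer lives here
  u64 counter = 0;
  struct co * const co = co_create(PGSZ, (void *)co_func_example, &counter, &host_rsp);
  if (co == NULL)
    return;
  while (co_valid(co))
    (void)co_enter(co, 0);      // resume until co_exit() invalidates the co
  co_destroy(co);
}
#endif // usage sketch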
  1785  // }}} co
  1786  
  1787  // corr {{{
  1788  struct corr {
  1789    struct co co;
  1790    struct corr * next;
  1791    struct corr * prev;
  1792  };
  1793  
   1794  // initialize the guest and link it into the run-queue
  1795    struct corr *
  1796  corr_create(const u64 stacksize, void * func, void * priv, u64 * const host)
  1797  {
  1798    const u64 stksz = bits_round_up(stacksize, 6);
  1799    const size_t alloc_size = stksz + sizeof(struct corr);
  1800    u8 * const mem = yalloc(alloc_size);
  1801    if (mem == NULL)
  1802      return NULL;
  1803  
  1804  #ifdef CO_STACK_CHECK
  1805    memset(mem, 0x5c, stksz);
  1806  #endif // CO_STACK_CHECK
  1807  
  1808    struct corr * const co = (typeof(co))(mem + stksz);
  1809    co_init(&(co->co), func, priv, host, stksz, corr_exit);
  1810    co->next = co;
  1811    co->prev = co;
  1812    return co;
  1813  }
  1814  
  1815    struct corr *
  1816  corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev)
  1817  {
  1818    const u64 stksz = bits_round_up(stacksize, 6);
  1819    const size_t alloc_size = stksz + sizeof(struct corr);
  1820    u8 * const mem = yalloc(alloc_size);
  1821    if (mem == NULL)
  1822      return NULL;
  1823  
  1824  #ifdef CO_STACK_CHECK
  1825    memset(mem, 0x5c, stksz);
  1826  #endif // CO_STACK_CHECK
  1827  
  1828    struct corr * const co = (typeof(co))(mem + stksz);
  1829    co_init(&(co->co), func, priv, prev->co.host, stksz, corr_exit);
  1830    co->next = prev->next;
  1831    co->prev = prev;
  1832    co->prev->next = co;
  1833    co->next->prev = co;
  1834    return co;
  1835  }
  1836  
  1837    inline void
  1838  corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host)
  1839  {
  1840    co_init(&(co->co), func, priv, host, co->co.stksz, corr_exit);
  1841    co->next = co;
  1842    co->prev = co;
  1843  }
  1844  
  1845    inline void
  1846  corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev)
  1847  {
  1848    co_init(&(co->co), func, priv, prev->co.host, co->co.stksz, corr_exit);
  1849    co->next = prev->next;
  1850    co->prev = prev;
  1851    co->prev->next = co;
  1852    co->next->prev = co;
  1853  }
  1854  
  1855    inline void
  1856  corr_enter(struct corr * const co)
  1857  {
  1858    (void)co_enter(&(co->co), 0);
  1859  }
  1860  
  1861    inline void
  1862  corr_yield(void)
  1863  {
  1864    struct corr * const curr = (typeof(curr))co_curr;
  1865    if (curr && (curr->next != curr))
  1866      (void)co_switch_to(&(curr->next->co), 0);
  1867  }
  1868  
  1869  __attribute__((noreturn))
  1870    inline void
  1871  corr_exit(void)
  1872  {
  1873    debug_assert(co_curr);
  1874  #ifdef CO_STACK_CHECK
  1875    const u64 stksz = co_curr->stksz;
  1876    const u8 * const mem = ((u8 *)(co_curr)) - stksz;
  1877    co_stack_check(mem, stksz);
  1878  #endif // CO_STACK_CHECK
  1879  
  1880    struct corr * const curr = (typeof(curr))co_curr;
  1881    if (curr->next != curr) { // have more corr
  1882      struct corr * const next = curr->next;
  1883      struct corr * const prev = curr->prev;
  1884      next->prev = prev;
  1885      prev->next = next;
  1886      curr->next = NULL;
  1887      curr->prev = NULL;
  1888      curr->co.host = NULL; // invalidate
  1889      (void)co_switch_to(&(next->co), 0);
  1890    } else { // the last corr
  1891      co_exit0();
  1892    }
  1893    debug_die();
  1894  }
  1895  
  1896    inline void
  1897  corr_destroy(struct corr * const co)
  1898  {
  1899    co_destroy(&(co->co));
  1900  }
  1901  // }}} corr
  1902  
  1903  // }}} co
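        // Illustrative sketch (not part of the library): one way a host thread might
        // drive two cooperative guests with the corr API above. The names guest_work,
        // corr_example, the "g1"/"g2" strings and the 64KB stack size are hypothetical,
        // and the sketch assumes the guest entry point receives its priv pointer as its
        // only argument (see co_init/co_enter for the exact calling convention).
        //
        //  static void
        //guest_work(void * const priv)
        //{
        //  for (u64 i = 0; i < 3; i++) {
        //    fprintf(stderr, "%s step %lu\n", (const char *)priv, i);
        //    corr_yield(); // hand control to the next corr in the ring
        //  }
        //  // returning ends the guest: corr_create passed corr_exit to co_init as the exit path
        //}
        //
        //  static void
        //corr_example(void)
        //{
        //  u64 hostrsp = 0; // scratch slot for the saved host stack pointer
        //  struct corr * const g1 = corr_create(1lu << 16, guest_work, (void *)"g1", &hostrsp);
        //  struct corr * const g2 = corr_link(1lu << 16, guest_work, (void *)"g2", g1);
        //  corr_enter(g1); // returns after the last guest has exited
        //  corr_destroy(g1);
        //  corr_destroy(g2);
        //}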
  1904  
  1905  // bits {{{
  1906    inline u32
  1907  bits_reverse_u32(const u32 v)
  1908  {
  1909    const u32 v2 = __builtin_bswap32(v);
  1910    const u32 v3 = ((v2 & 0xf0f0f0f0u) >> 4) | ((v2 & 0x0f0f0f0fu) << 4);
  1911    const u32 v4 = ((v3 & 0xccccccccu) >> 2) | ((v3 & 0x33333333u) << 2);
  1912    const u32 v5 = ((v4 & 0xaaaaaaaau) >> 1) | ((v4 & 0x55555555u) << 1);
  1913    return v5;
  1914  }
  1915  
  1916    inline u64
  1917  bits_reverse_u64(const u64 v)
  1918  {
  1919    const u64 v2 = __builtin_bswap64(v);
  1920    const u64 v3 = ((v2 & 0xf0f0f0f0f0f0f0f0lu) >>  4) | ((v2 & 0x0f0f0f0f0f0f0f0flu) <<  4);
  1921    const u64 v4 = ((v3 & 0xcccccccccccccccclu) >>  2) | ((v3 & 0x3333333333333333lu) <<  2);
  1922    const u64 v5 = ((v4 & 0xaaaaaaaaaaaaaaaalu) >>  1) | ((v4 & 0x5555555555555555lu) <<  1);
  1923    return v5;
  1924  }
  1925  
  1926    inline u64
  1927  bits_rotl_u64(const u64 v, const u8 n)
  1928  {
  1929    const u8 sh = n & 0x3f;
  1930    return (v << sh) | (v >> (64 - sh));
  1931  }
  1932  
  1933    inline u64
  1934  bits_rotr_u64(const u64 v, const u8 n)
  1935  {
  1936    const u8 sh = n & 0x3f;
  1937    return (v >> sh) | (v << (64 - sh));
  1938  }
  1939  
  1940    inline u32
  1941  bits_rotl_u32(const u32 v, const u8 n)
  1942  {
  1943    const u8 sh = n & 0x1f;
  1944    return (v << sh) | (v >> (32 - sh));
  1945  }
  1946  
  1947    inline u32
  1948  bits_rotr_u32(const u32 v, const u8 n)
  1949  {
  1950    const u8 sh = n & 0x1f;
  1951    return (v >> sh) | (v << (32 - sh));
  1952  }
  1953  
  1954    inline u64
  1955  bits_p2_up_u64(const u64 v)
  1956  {
  1957    // clz(0) is undefined
  1958    return (v > 1) ? (1lu << (64 - __builtin_clzl(v - 1lu))) : v;
  1959  }
  1960  
  1961    inline u32
  1962  bits_p2_up_u32(const u32 v)
  1963  {
  1964    // clz(0) is undefined
  1965    return (v > 1) ? (1u << (32 - __builtin_clz(v - 1u))) : v;
  1966  }
  1967  
  1968    inline u64
  1969  bits_p2_down_u64(const u64 v)
  1970  {
  1971    return v ? (1lu << (63 - __builtin_clzl(v))) : v;
  1972  }
  1973  
  1974    inline u32
  1975  bits_p2_down_u32(const u32 v)
  1976  {
  1977    return v ? (1u << (31 - __builtin_clz(v))) : v;
  1978  }
  1979  
  1980    inline u64
  1981  bits_round_up(const u64 v, const u8 power)
  1982  {
  1983    return (v + (1lu << power) - 1lu) >> power << power;
  1984  }
  1985  
  1986    inline u64
  1987  bits_round_up_a(const u64 v, const u64 a)
  1988  {
  1989    return (v + a - 1) / a * a;
  1990  }
  1991  
  1992    inline u64
  1993  bits_round_down(const u64 v, const u8 power)
  1994  {
  1995    return v >> power << power;
  1996  }
  1997  
  1998    inline u64
  1999  bits_round_down_a(const u64 v, const u64 a)
  2000  {
  2001    return v / a * a;
  2002  }
  2003  // }}} bits
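        // Worked examples for the helpers above (illustrative; easy to check by hand):
        //   bits_p2_up_u64(100)        == 128  // next power of two
        //   bits_p2_down_u64(100)      == 64   // previous power of two
        //   bits_round_up(100, 6)      == 128  // round up to a multiple of 2^6
        //   bits_round_down(100, 6)    == 64
        //   bits_round_up_a(100, 24)   == 120  // round up to a multiple of 24
        //   bits_round_down_a(100, 24) == 96
        //   bits_rotl_u32(0x80000001u, 1) == 0x3u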
  2004  
  2005  // vi128 {{{
  2006  #if defined(__GNUC__) && __GNUC__ >= 7
  2007  #define FALLTHROUGH __attribute__ ((fallthrough))
  2008  #else
  2009  #define FALLTHROUGH ((void)0)
  2010  #endif /* __GNUC__ >= 7 */
  2011  
  2012    inline u32
  2013  vi128_estimate_u32(const u32 v)
  2014  {
  2015    static const u8 t[] = {5,5,5,5,
  2016      4,4,4,4,4,4,4, 3,3,3,3,3,3,3,
  2017      2,2,2,2,2,2,2, 1,1,1,1,1,1,1};
  2018    return v ? t[__builtin_clz(v)] : 2;
  2019    // 0 encodes as [0x80 0x00] so that the first byte is never zero
  2020  
  2021    // nz bit range -> enc length    offset in t[]
  2022    // 0 -> 2          special case
  2023    // 1 to 7 -> 1     31 to 25
  2024    // 8 to 14 -> 2    24 to 18
  2025    // 15 to 21 -> 3   17 to 11
  2026    // 22 to 28 -> 4   10 to 4
  2027    // 29 to 32 -> 5    3 to 0
  2028  }
  2029  
  2030    u8 *
  2031  vi128_encode_u32(u8 * dst, u32 v)
  2032  {
  2033    switch (vi128_estimate_u32(v)) {
  2034    case 5:
  2035      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2036    case 4:
  2037      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2038    case 3:
  2039      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2040    case 2:
  2041      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2042    case 1:
  2043      *(dst++) = (u8)v;
  2044      break;
  2045    default:
  2046      debug_die();
  2047      break;
  2048    }
  2049    return dst;
  2050  }
  2051  
  2052    const u8 *
  2053  vi128_decode_u32(const u8 * src, u32 * const out)
  2054  {
  2055    debug_assert(*src);
  2056    u32 r = 0;
  2057    for (u32 shift = 0; shift < 32; shift += 7) {
  2058      const u8 byte = *(src++);
  2059      r |= (((u32)(byte & 0x7f)) << shift);
  2060      if ((byte & 0x80) == 0) { // No more bytes to consume
  2061        *out = r;
  2062        return src;
  2063      }
  2064    }
  2065    *out = 0;
  2066    return NULL; // invalid
  2067  }
  2068  
  2069    inline u32
  2070  vi128_estimate_u64(const u64 v)
  2071  {
  2072    static const u8 t[] = {10,
  2073      9,9,9,9,9,9,9, 8,8,8,8,8,8,8, 7,7,7,7,7,7,7,
  2074      6,6,6,6,6,6,6, 5,5,5,5,5,5,5, 4,4,4,4,4,4,4,
  2075      3,3,3,3,3,3,3, 2,2,2,2,2,2,2, 1,1,1,1,1,1,1};
  2076    return v ? t[__builtin_clzl(v)] : 2;
  2077  }
  2078  
  2079  // return ptr after the generated bytes
  2080    u8 *
  2081  vi128_encode_u64(u8 * dst, u64 v)
  2082  {
  2083    switch (vi128_estimate_u64(v)) {
  2084    case 10:
  2085      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2086    case 9:
  2087      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2088    case 8:
  2089      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2090    case 7:
  2091      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2092    case 6:
  2093      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2094    case 5:
  2095      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2096    case 4:
  2097      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2098    case 3:
  2099      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2100    case 2:
  2101      *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH;
  2102    case 1:
  2103      *(dst++) = (u8)v;
  2104      break;
  2105    default:
  2106      debug_die();
  2107      break;
  2108    }
  2109    return dst;
  2110  }
  2111  
  2112  // return ptr after the consumed bytes
  2113    const u8 *
  2114  vi128_decode_u64(const u8 * src, u64 * const out)
  2115  {
  2116    u64 r = 0;
  2117    for (u32 shift = 0; shift < 64; shift += 7) {
  2118      const u8 byte = *(src++);
  2119      r |= (((u64)(byte & 0x7f)) << shift);
  2120      if ((byte & 0x80) == 0) { // No more bytes to consume
  2121        *out = r;
  2122        return src;
  2123      }
  2124    }
  2125    *out = 0;
  2126    return NULL; // invalid
  2127  }
  2128  
  2129  #undef FALLTHROUGH
  2130  // }}} vi128
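        // Illustrative round-trip (not exercised by this file): the value 300 needs two
        // bytes in this varint encoding: 0xac (low 7 bits with the continuation bit set),
        // then 0x02 (remaining bits, continuation bit clear). The name vi128_example is
        // hypothetical.
        //
        //  static void
        //vi128_example(void)
        //{
        //  u8 buf[10];
        //  u8 * const end = vi128_encode_u64(buf, 300); // buf[0]==0xac, buf[1]==0x02
        //  u64 out = 0;
        //  const u8 * const next = vi128_decode_u64(buf, &out); // out==300
        //  debug_assert((end == buf + 2) && (next == end));
        //}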
  2131  
  2132  // misc {{{
  2133    inline struct entry13
  2134  entry13(const u16 e1, const u64 e3)
  2135  {
  2136    debug_assert((e3 >> 48) == 0);
  2137    return (struct entry13){.v64 = (e3 << 16) | e1};
  2138  }
  2139  
  2140    inline void
  2141  entry13_update_e3(struct entry13 * const e, const u64 e3)
  2142  {
  2143    debug_assert((e3 >> 48) == 0);
  2144    *e = entry13(e->e1, e3);
  2145  }
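        // Illustrative example of the 16/48-bit packing above:
        //   entry13(0x1234, 0xabcd).v64 == 0xabcd1234lu // e1 in the low 16 bits, e3 above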
  2146  
  2147    inline void *
  2148  u64_to_ptr(const u64 v)
  2149  {
  2150    return (void *)v;
  2151  }
  2152  
  2153    inline u64
  2154  ptr_to_u64(const void * const ptr)
  2155  {
  2156    return (u64)ptr;
  2157  }
  2158  
  2159  // portable malloc_usable_size
  2160    inline size_t
  2161  m_usable_size(void * const ptr)
  2162  {
  2163  #if defined(__linux__) || defined(__FreeBSD__)
  2164    const size_t sz = malloc_usable_size(ptr);
  2165  #elif defined(__APPLE__) && defined(__MACH__)
  2166    const size_t sz = malloc_size(ptr);
  2167  #endif // OS
  2168  
  2169  #ifndef HEAPCHECKING
  2170    // valgrind and asan may return unaligned usable size
  2171    debug_assert((sz & 0x7lu) == 0);
  2172  #endif // HEAPCHECKING
  2173  
  2174    return sz;
  2175  }
  2176  
  2177    inline size_t
  2178  fdsize(const int fd)
  2179  {
  2180    struct stat st;
  2181    st.st_size = 0;
  2182    if (fstat(fd, &st) != 0)
  2183      return 0;
  2184  
  2185    if (S_ISBLK(st.st_mode)) {
  2186  #if defined(__linux__)
  2187      ioctl(fd, BLKGETSIZE64, &st.st_size);
  2188  #elif defined(__APPLE__) && defined(__MACH__)
  2189      u64 blksz = 0;
  2190      u64 nblks = 0;
  2191      ioctl(fd, DKIOCGETBLOCKSIZE, &blksz);
  2192      ioctl(fd, DKIOCGETBLOCKCOUNT, &nblks);
  2193      st.st_size = (ssize_t)(blksz * nblks);
  2194  #elif defined(__FreeBSD__)
  2195      ioctl(fd, DIOCGMEDIASIZE, &st.st_size);
  2196  #endif // OS
  2197    }
  2198  
  2199    return (size_t)st.st_size;
  2200  }
  2201  
  2202    u32
  2203  memlcp(const u8 * const p1, const u8 * const p2, const u32 max)
  2204  {
  2205    const u32 max64 = max & (~7u);
  2206    u32 clen = 0;
  2207    while (clen < max64) {
  2208      const u64 v1 = *(const u64 *)(p1+clen);
  2209      const u64 v2 = *(const u64 *)(p2+clen);
  2210      const u64 x = v1 ^ v2;
  2211      if (x)
  2212        return clen + (u32)(__builtin_ctzl(x) >> 3);
  2213  
  2214      clen += sizeof(u64);
  2215    }
  2216  
  2217    if ((clen + sizeof(u32)) <= max) {
  2218      const u32 v1 = *(const u32 *)(p1+clen);
  2219      const u32 v2 = *(const u32 *)(p2+clen);
  2220      const u32 x = v1 ^ v2;
  2221      if (x)
  2222        return clen + (u32)(__builtin_ctz(x) >> 3);
  2223  
  2224      clen += sizeof(u32);
  2225    }
  2226  
  2227    while ((clen < max) && (p1[clen] == p2[clen]))
  2228      clen++;
  2229    return clen;
  2230  }
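        // Illustrative example: memlcp((const u8 *)"hello!!", (const u8 *)"helper!", 7) == 3;
        // the u32 fast path above finds the first mismatching byte ('l' vs 'p') with one
        // xor + ctz instead of a byte-by-byte scan.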
  2231  
  2232  static double logger_t0 = 0.0;
  2233  
  2234  __attribute__((constructor))
  2235    static void
  2236  logger_init(void)
  2237  {
  2238    logger_t0 = time_sec();
  2239  }
  2240  
  2241  __attribute__ ((format (printf, 2, 3)))
  2242    void
  2243  logger_printf(const int fd, const char * const fmt, ...)
  2244  {
  2245    char buf[4096];
  2246    va_list ap;
  2247    va_start(ap, fmt);
  2248    vsnprintf(buf, sizeof(buf), fmt, ap);
  2249    va_end(ap);
  2250    dprintf(fd, "%010.3lf %08x %s", time_diff_sec(logger_t0), crc32c_u64(0x12345678, (u64)pthread_self()), buf);
  2251  }
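        // Example line (values illustrative): logger_printf(2, "opened %d files\n", 3)
        // might write "000123.456 1a2b3c4d opened 3 files" to fd 2, i.e. seconds since
        // logger_init, a crc32c hash of the calling thread id, then the message.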
  2252  // }}} misc
  2253  
  2254  // astk {{{
  2255  // atomic stack
  2256  struct acell { struct acell * next; };
  2257  
  2258  // extract ptr from m value
  2259    static inline struct acell *
  2260  astk_ptr(const u64 m)
  2261  {
  2262    return (struct acell *)(m >> 16);
  2263  }
  2264  
  2265  // calculate the new magic
  2266    static inline u64
  2267  astk_m1(const u64 m0, struct acell * const ptr)
  2268  {
  2269    return ((m0 + 1) & 0xfffflu) | (((u64)ptr) << 16);
  2270  }
  2271  
  2272  // calculate the new magic; the unsafe variant discards the sequence number (single-threaded use)
  2273    static inline u64
  2274  astk_m1_unsafe(struct acell * const ptr)
  2275  {
  2276    return ((u64)ptr) << 16;
  2277  }
  2278  
  2279    static bool
  2280  astk_try_push(au64 * const pmagic, struct acell * const first, struct acell * const last)
  2281  {
  2282    u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2283    last->next = astk_ptr(m0);
  2284    const u64 m1 = astk_m1(m0, first);
  2285    return atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_RELEASE, MO_RELAXED);
  2286  }
  2287  
  2288    static void
  2289  astk_push_safe(au64 * const pmagic, struct acell * const first, struct acell * const last)
  2290  {
  2291    while (!astk_try_push(pmagic, first, last));
  2292  }
  2293  
  2294    static void
  2295  astk_push_unsafe(au64 * const pmagic, struct acell * const first,
  2296      struct acell * const last)
  2297  {
  2298    const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2299    last->next = astk_ptr(m0);
  2300    const u64 m1 = astk_m1_unsafe(first);
  2301    atomic_store_explicit(pmagic, m1, MO_RELAXED);
  2302  }
  2303  
  2304  //// can fail for two reasons: (1) NULL: no available object; (2) ~0lu: contention
  2305  //  static void *
  2306  //astk_try_pop(au64 * const pmagic)
  2307  //{
  2308  //  u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2309  //  struct acell * const ret = astk_ptr(m0);
  2310  //  if (ret == NULL)
  2311  //    return NULL;
  2312  //
  2313  //  const u64 m1 = astk_m1(m0, ret->next);
  2314  //  if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED))
  2315  //    return ret;
  2316  //  else
  2317  //    return (void *)(~0lu);
  2318  //}
  2319  
  2320    static void *
  2321  astk_pop_safe(au64 * const pmagic)
  2322  {
  2323    do {
  2324      u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2325      struct acell * const ret = astk_ptr(m0);
  2326      if (ret == NULL)
  2327        return NULL;
  2328  
  2329      const u64 m1 = astk_m1(m0, ret->next);
  2330      if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED))
  2331        return ret;
  2332    } while (true);
  2333  }
  2334  
  2335    static void *
  2336  astk_pop_unsafe(au64 * const pmagic)
  2337  {
  2338    const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2339    struct acell * const ret = astk_ptr(m0);
  2340    if (ret == NULL)
  2341      return NULL;
  2342  
  2343    const u64 m1 = astk_m1_unsafe(ret->next);
  2344    atomic_store_explicit(pmagic, m1, MO_RELAXED);
  2345    return (void *)ret;
  2346  }
  2347  
  2348    static void *
  2349  astk_peek_unsafe(au64 * const pmagic)
  2350  {
  2351    const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  2352    return astk_ptr(m0);
  2353  }
  2354  // }}} astk
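        // Layout of the packed "magic" word used above (illustrative, assuming canonical
        // 48-bit user-space addresses):
        //   magic == (ptr << 16) | seq
        //   e.g. ptr == (struct acell *)0x00007f12345678c0, seq == 0x0003
        //        -> magic == 0x7f12345678c00003lu
        // astk_m1() bumps the 16-bit sequence on every safe push/pop so a concurrent CAS
        // cannot be fooled by a pointer that was popped and pushed back in between (the
        // ABA problem); astk_m1_unsafe() drops the sequence since single-threaded use
        // cannot race.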
  2355  
  2356  // slab {{{
  2357  #define SLAB_OBJ0_OFFSET ((64))
  2358  struct slab {
  2359    au64 magic; // hi 48: ptr, lo 16: seq
  2360    u64 padding1[7];
  2361  
  2362    // 2nd line
  2363    struct acell * head_active; // list of blocks in use or in magic
  2364    struct acell * head_backup; // list of unused full blocks
  2365    u64 nr_ready; // UNSAFE only! number of objects under magic
  2366    u64 padding2[5];
  2367  
  2368    // 3rd line const
  2369    u64 obj_size; // const: aligned size of each object
  2370    u64 blk_size; // const: size of each memory block
  2371    u64 objs_per_slab; // const: number of objects in a slab
  2372    u64 obj0_offset; // const: offset of the first object in a block
  2373    u64 padding3[4];
  2374  
  2375    // 4th line
  2376    union {
  2377      mutex lock;
  2378      u64 padding4[8];
  2379    };
  2380  };
  2381  static_assert(sizeof(struct slab) == 256, "sizeof(struct slab) != 256");
  2382  
  2383    static void
  2384  slab_add(struct slab * const slab, struct acell * const blk, const bool is_safe)
  2385  {
  2386    // insert into head_active
  2387    blk->next = slab->head_active;
  2388    slab->head_active = blk;
  2389  
  2390    u8 * const base = ((u8 *)blk) + slab->obj0_offset;
  2391    struct acell * iter = (typeof(iter))base; // [0]
  2392    for (u64 i = 1; i < slab->objs_per_slab; i++) {
  2393      struct acell * const next = (typeof(next))(base + (i * slab->obj_size));
  2394      iter->next = next;
  2395      iter = next;
  2396    }
  2397  
  2398    // base points to the first object; iter points to the last object
  2399    if (is_safe) { // other threads can poll magic
  2400      astk_push_safe(&slab->magic, (struct acell *)base, iter);
  2401    } else { // unsafe
  2402      astk_push_unsafe(&slab->magic, (struct acell *)base, iter);
  2403      slab->nr_ready += slab->objs_per_slab;
  2404    }
  2405  }
  2406  
  2407  // critical section; call with lock
  2408    static bool
  2409  slab_expand(struct slab * const slab, const bool is_safe)
  2410  {
  2411    struct acell * const old = slab->head_backup;
  2412    if (old) { // pop old from backup and add
  2413      slab->head_backup = old->next;
  2414      slab_add(slab, old, is_safe);
  2415    } else { // more core
  2416      size_t blk_size;
  2417      struct acell * const new = pages_alloc_best(slab->blk_size, true, &blk_size);
  2418      (void)blk_size;
  2419      if (new == NULL)
  2420        return false;
  2421  
  2422      slab_add(slab, new, is_safe);
  2423    }
  2424    return true;
  2425  }
  2426  
  2427  // return 0 on failure; otherwise, obj0_offset
  2428    static u64
  2429  slab_check_sizes(const u64 obj_size, const u64 blk_size)
  2430  {
  2431    // obj must be non-zero and 8-byte aligned
  2432    // blk must be at least of page size and power of 2
  2433    if ((!obj_size) || (obj_size % 8lu) || (blk_size < 4096lu) || (blk_size & (blk_size - 1)))
  2434      return 0;
  2435  
  2436    // each slab should have at least one object
  2437    const u64 obj0_offset = (obj_size & (obj_size - 1)) ? SLAB_OBJ0_OFFSET : obj_size;
  2438    if (obj0_offset >= blk_size || (blk_size - obj0_offset) < obj_size)
  2439      return 0;
  2440  
  2441    return obj0_offset;
  2442  }
  2443  
  2444    static void
  2445  slab_init_internal(struct slab * const slab, const u64 obj_size, const u64 blk_size, const u64 obj0_offset)
  2446  {
  2447    memset(slab, 0, sizeof(*slab));
  2448    slab->obj_size = obj_size;
  2449    slab->blk_size = blk_size;
  2450    slab->objs_per_slab = (blk_size - obj0_offset) / obj_size;
  2451    debug_assert(slab->objs_per_slab); // >= 1
  2452    slab->obj0_offset = obj0_offset;
  2453    mutex_init(&(slab->lock));
  2454  }
  2455  
  2456    struct slab *
  2457  slab_create(const u64 obj_size, const u64 blk_size)
  2458  {
  2459    const u64 obj0_offset = slab_check_sizes(obj_size, blk_size);
  2460    if (!obj0_offset)
  2461      return NULL;
  2462  
  2463    struct slab * const slab = yalloc(sizeof(*slab));
  2464    if (slab == NULL)
  2465      return NULL;
  2466  
  2467    slab_init_internal(slab, obj_size, blk_size, obj0_offset);
  2468    return slab;
  2469  }
  2470  
  2471  // unsafe
  2472    bool
  2473  slab_reserve_unsafe(struct slab * const slab, const u64 nr)
  2474  {
  2475    while (slab->nr_ready < nr)
  2476      if (!slab_expand(slab, false))
  2477        return false;
  2478    return true;
  2479  }
  2480  
  2481    void *
  2482  slab_alloc_unsafe(struct slab * const slab)
  2483  {
  2484    void * ret = astk_pop_unsafe(&slab->magic);
  2485    if (ret == NULL) {
  2486      if (!slab_expand(slab, false))
  2487        return NULL;
  2488      ret = astk_pop_unsafe(&slab->magic);
  2489    }
  2490    debug_assert(ret);
  2491    slab->nr_ready--;
  2492    return ret;
  2493  }
  2494  
  2495    void *
  2496  slab_alloc_safe(struct slab * const slab)
  2497  {
  2498    void * ret = astk_pop_safe(&slab->magic);
  2499    if (ret)
  2500      return ret;
  2501  
  2502    mutex_lock(&slab->lock);
  2503    do {
  2504      ret = astk_pop_safe(&slab->magic); // may already have new objs
  2505      if (ret)
  2506        break;
  2507      if (!slab_expand(slab, true))
  2508        break;
  2509    } while (true);
  2510    mutex_unlock(&slab->lock);
  2511    return ret;
  2512  }
  2513  
  2514    void
  2515  slab_free_unsafe(struct slab * const slab, void * const ptr)
  2516  {
  2517    debug_assert(ptr);
  2518    astk_push_unsafe(&slab->magic, ptr, ptr);
  2519    slab->nr_ready++;
  2520  }
  2521  
  2522    void
  2523  slab_free_safe(struct slab * const slab, void * const ptr)
  2524  {
  2525    astk_push_safe(&slab->magic, ptr, ptr);
  2526  }
  2527  
  2528  // UNSAFE
  2529    void
  2530  slab_free_all(struct slab * const slab)
  2531  {
  2532    slab->magic = 0;
  2533    slab->nr_ready = 0; // backup does not count
  2534  
  2535    if (slab->head_active) {
  2536      struct acell * iter = slab->head_active;
  2537      while (iter->next)
  2538        iter = iter->next;
  2539      // now iter points to the last blk
  2540      iter->next = slab->head_backup; // active..backup
  2541      slab->head_backup = slab->head_active; // backup gets all
  2542      slab->head_active = NULL; // empty active
  2543    }
  2544  }
  2545  
  2546  // unsafe
  2547    u64
  2548  slab_get_nalloc(struct slab * const slab)
  2549  {
  2550    struct acell * iter = slab->head_active;
  2551    u64 n = 0;
  2552    while (iter) {
  2553      n++;
  2554      iter = iter->next;
  2555    }
  2556    n *= slab->objs_per_slab;
  2557  
  2558    iter = astk_peek_unsafe(&slab->magic);
  2559    while (iter) {
  2560      n--;
  2561      iter = iter->next;
  2562    }
  2563    return n;
  2564  }
  2565  
  2566    static void
  2567  slab_deinit(struct slab * const slab)
  2568  {
  2569    debug_assert(slab);
  2570    struct acell * iter = slab->head_active;
  2571    while (iter) {
  2572      struct acell * const next = iter->next;
  2573      pages_unmap(iter, slab->blk_size);
  2574      iter = next;
  2575    }
  2576    iter = slab->head_backup;
  2577    while (iter) {
  2578      struct acell * const next = iter->next;
  2579      pages_unmap(iter, slab->blk_size);
  2580      iter = next;
  2581    }
  2582  }
  2583  
  2584    void
  2585  slab_destroy(struct slab * const slab)
  2586  {
  2587    slab_deinit(slab);
  2588    free(slab);
  2589  }
  2590  // }}} slab
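        // Illustrative sketch (not part of the library): single-threaded use of the slab
        // allocator above. The 24-byte object size, 2MB block size and the name
        // slab_example are hypothetical.
        //
        //  static void
        //slab_example(void)
        //{
        //  struct slab * const s = slab_create(24, 1lu << 21); // 24B objects, 2MB blocks
        //  if (s == NULL)
        //    return;
        //  void * const obj = slab_alloc_unsafe(s); // single-threaded fast path
        //  // ... use obj ...
        //  slab_free_unsafe(s, obj); // push obj back onto the free stack
        //  // slab_alloc_safe()/slab_free_safe() are the locked/atomic counterparts
        //  slab_destroy(s); // unmaps all blocks and frees the slab struct
        //}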
  2591  
  2592  // string {{{
  2593  static union { u16 v16; u8 v8[2]; } strdec_table[100];
  2594  
  2595  __attribute__((constructor))
  2596    static void
  2597  strdec_init(void)
  2598  {
  2599    for (u8 i = 0; i < 100; i++) {
  2600      const u8 hi = (typeof(hi))('0' + (i / 10));
  2601      const u8 lo = (typeof(lo))('0' + (i % 10));
  2602      strdec_table[i].v8[0] = hi;
  2603      strdec_table[i].v8[1] = lo;
  2604    }
  2605  }
  2606  
  2607  // output 10 bytes
  2608    void
  2609  strdec_32(void * const out, const u32 v)
  2610  {
  2611    u32 vv = v;
  2612    u16 * const ptr = (typeof(ptr))out;
  2613    for (u64 i = 4; i <= 4; i--) { // 5 pairs; unsigned wrap of i ends the loop
  2614      ptr[i] = strdec_table[vv % 100].v16;
  2615      vv /= 100u;
  2616    }
  2617  }
  2618  
  2619  // output 20 bytes
  2620    void
  2621  strdec_64(void * const out, const u64 v)
  2622  {
  2623    u64 vv = v;
  2624    u16 * const ptr = (typeof(ptr))out;
  2625    for (u64 i = 9; i <= 9; i--) { // 10 pairs; unsigned wrap of i ends the loop
  2626      ptr[i] = strdec_table[vv % 100].v16;
  2627      vv /= 100;
  2628    }
  2629  }
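        // Illustrative example: both helpers write fixed-width, zero-padded decimal and
        // no terminating NUL, e.g. strdec_32(out, 123) stores the 10 bytes "0000000123"
        // and strdec_64(out, 123) stores the 20 bytes "00000000000000000123".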
  2630  
  2631  static const u8 strhex_table_16[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
  2632  
  2633  #if defined(__x86_64__)
  2634    static inline m128
  2635  strhex_helper(const u64 v)
  2636  {
  2637    static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0};
  2638  
  2639    const m128 tmp = _mm_set_epi64x((s64)(v>>4), (s64)v); // mm want s64
  2640    const m128 hilo = _mm_and_si128(tmp, _mm_set1_epi8(0xf));
  2641    const m128 bin = _mm_shuffle_epi8(hilo, _mm_load_si128((void *)mask1));
  2642    const m128 str = _mm_shuffle_epi8(_mm_load_si128((const void *)strhex_table_16), bin);
  2643    return str;
  2644  }
  2645  #elif defined(__aarch64__)
  2646    static inline m128
  2647  strhex_helper(const u64 v)
  2648  {
  2649    static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0};
  2650    u64 v2[2] = {v, v>>4};
  2651    const m128 tmp = vld1q_u8((u8 *)v2);
  2652    const m128 hilo = vandq_u8(tmp, vdupq_n_u8(0xf));
  2653    const m128 bin = vqtbl1q_u8(hilo, vld1q_u8(mask1));
  2654    const m128 str = vqtbl1q_u8(vld1q_u8(strhex_table_16), bin);
  2655    return str;
  2656  }
  2657  #else
  2658  static u16 strhex_table_256[256];
  2659  
  2660  __attribute__((constructor))
  2661    static void
  2662  strhex_init(void)
  2663  {
  2664    for (u64 i = 0; i < 256; i++)
  2665      strhex_table_256[i] = (((u16)strhex_table_16[i & 0xf]) << 8) | (strhex_table_16[i>>4]);
  2666  }
  2667  #endif // __x86_64__
  2668  
  2669  // output 8 bytes
  2670    void
  2671  strhex_32(void * const out, u32 v)
  2672  {
  2673  #if defined(__x86_64__)
  2674    const m128 str = strhex_helper((u64)v);
  2675    _mm_storel_epi64(out, _mm_srli_si128(str, 8));
  2676  #elif defined(__aarch64__)
  2677    const m128 str = strhex_helper((u64)v);
  2678    vst1q_lane_u64(out, vreinterpretq_u64_u8(str), 1);
  2679  #else
  2680    u16 * const ptr = (typeof(ptr))out;
  2681    for (u64 i = 0; i < 4; i++) {
  2682      ptr[3-i] = strhex_table_256[v & 0xff];
  2683      v >>= 8;
  2684    }
  2685  #endif
  2686  }
  2687  
  2688  // output 16 bytes // buffer must be aligned by 16B
  2689    void
  2690  strhex_64(void * const out, u64 v)
  2691  {
  2692  #if defined(__x86_64__)
  2693    const m128 str = strhex_helper(v);
  2694    _mm_storeu_si128(out, str);
  2695  #elif defined(__aarch64__)
  2696    const m128 str = strhex_helper(v);
  2697    vst1q_u8(out, str);
  2698  #else
  2699    u16 * const ptr = (typeof(ptr))out;
  2700    for (u64 i = 0; i < 8; i++) {
  2701      ptr[7-i] = strhex_table_256[v & 0xff];
  2702      v >>= 8;
  2703    }
  2704  #endif
  2705  }
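        // Illustrative example: fixed-width lowercase hex, most significant nibble first,
        // no terminating NUL, e.g. strhex_32(out, 0xdeadu) stores "0000dead" (8 bytes)
        // and strhex_64(out, 0xdeadbeeflu) stores "00000000deadbeef" (16 bytes).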
  2706  
  2707  // string to u64
  2708    inline u64
  2709  a2u64(const void * const str)
  2710  {
  2711    return strtoull(str, NULL, 10);
  2712  }
  2713  
  2714    inline u32
  2715  a2u32(const void * const str)
  2716  {
  2717    return (u32)strtoull(str, NULL, 10);
  2718  }
  2719  
  2720    inline s64
  2721  a2s64(const void * const str)
  2722  {
  2723    return strtoll(str, NULL, 10);
  2724  }
  2725  
  2726    inline s32
  2727  a2s32(const void * const str)
  2728  {
  2729    return (s32)strtoll(str, NULL, 10);
  2730  }
  2731  
  2732    void
  2733  str_print_hex(FILE * const out, const void * const data, const u32 len)
  2734  {
  2735    const u8 * const ptr = data;
  2736    const u32 strsz = len * 3;
  2737    u8 * const buf = malloc(strsz);
  2738    for (u32 i = 0; i < len; i++) {
  2739      buf[i*3] = ' ';
  2740      buf[i*3+1] = strhex_table_16[ptr[i]>>4];
  2741      buf[i*3+2] = strhex_table_16[ptr[i] & 0xf];
  2742    }
  2743    fwrite(buf, strsz, 1, out);
  2744    free(buf);
  2745  }
  2746  
  2747    void
  2748  str_print_dec(FILE * const out, const void * const data, const u32 len)
  2749  {
  2750    const u8 * const ptr = data;
  2751    const u32 strsz = len * 4;
  2752    u8 * const buf = malloc(strsz);
  2753    for (u32 i = 0; i < len; i++) {
  2754      const u8 v = ptr[i];
  2755      buf[i*4] = ' ';
  2756      const u8 v1 = v / 100u;
  2757      const u8 v23 = v % 100u;
  2758      buf[i*4+1] = (u8)'0' + v1;
  2759      buf[i*4+2] = (u8)'0' + (v23 / 10u);
  2760      buf[i*4+3] = (u8)'0' + (v23 % 10u);
  2761    }
  2762    fwrite(buf, strsz, 1, out);
  2763    free(buf);
  2764  }
  2765  
  2766  // returns a NULL-terminated list of string tokens.
  2767  // After use you only need to free the returned pointer (char **).
  2768    char **
  2769  strtoks(const char * const str, const char * const delim)
  2770  {
  2771    if (str == NULL)
  2772      return NULL;
  2773    size_t nptr_alloc = 32;
  2774    char ** tokens = malloc(sizeof(tokens[0]) * nptr_alloc);
  2775    if (tokens == NULL)
  2776      return NULL;
  2777    const size_t bufsize = strlen(str) + 1;
  2778    char * const buf = malloc(bufsize);
  2779    if (buf == NULL)
  2780      goto fail_buf;
  2781  
  2782    memcpy(buf, str, bufsize);
  2783    char * saveptr = NULL;
  2784    char * tok = strtok_r(buf, delim, &saveptr);
  2785    size_t ntoks = 0;
  2786    while (tok) {
  2787      if (ntoks >= nptr_alloc) {
  2788        nptr_alloc += 32;
  2789        char ** const r = realloc(tokens, sizeof(tokens[0]) * nptr_alloc);
  2790        if (r == NULL)
  2791          goto fail_realloc;
  2792  
  2793        tokens = r;
  2794      }
  2795      tokens[ntoks] = tok;
  2796      ntoks++;
  2797      tok = strtok_r(NULL, delim, &saveptr);
  2798    }
  2799    tokens[ntoks] = NULL;
  2800    const size_t nptr = ntoks + 1; // append a NULL
  2801    const size_t rsize = (sizeof(tokens[0]) * nptr) + bufsize;
  2802    char ** const r = realloc(tokens, rsize);
  2803    if (r == NULL)
  2804      goto fail_realloc;
  2805  
  2806    tokens = r;
  2807    char * const dest = (char *)(&(tokens[nptr]));
  2808    memcpy(dest, buf, bufsize);
  2809    for (u64 i = 0; i < ntoks; i++)
  2810      tokens[i] += (dest - buf);
  2811  
  2812    free(buf);
  2813    return tokens;
  2814  
  2815  fail_realloc:
  2816    free(buf);
  2817  fail_buf:
  2818    free(tokens);
  2819    return NULL;
  2820  }
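        // Illustrative sketch: splitting a delimited string. The tokens and their copied
        // characters live in the single allocation returned by strtoks, so one free()
        // releases everything. The name strtoks_example is hypothetical.
        //
        //  static void
        //strtoks_example(void)
        //{
        //  char ** const toks = strtoks("a,bb,ccc", ",");
        //  if (toks == NULL)
        //    return;
        //  for (u32 i = 0; toks[i]; i++)
        //    fprintf(stderr, "tok[%u] = %s\n", i, toks[i]); // a, bb, ccc
        //  free(toks); // frees the pointer array and the token bytes together
        //}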
  2821  
  2822    u32
  2823  strtoks_count(const char * const * const toks)
  2824  {
  2825    if (!toks)
  2826      return 0;
  2827    u32 n = 0;
  2828    while (toks[n++]);
  2829    return n;
  2830  }
  2831  // }}} string
  2832  
  2833  // qsbr {{{
  2834  #define QSBR_STATES_NR ((23)) // shard capacity; valid values are 3*8-1 == 23; 5*8-1 == 39; 7*8-1 == 55
  2835  #define QSBR_SHARD_BITS  ((5)) // 2^n shards
  2836  #define QSBR_SHARD_NR    (((1u) << QSBR_SHARD_BITS))
  2837  #define QSBR_SHARD_MASK  ((QSBR_SHARD_NR - 1))
  2838  
  2839  struct qsbr_ref_real {
  2840  #ifdef QSBR_DEBUG
  2841    pthread_t ptid; // 8
  2842    u32 status; // 4
  2843    u32 nbt; // 4 (number of backtrace frames)
  2844  #define QSBR_DEBUG_BTNR ((14))
  2845    void * backtrace[QSBR_DEBUG_BTNR];
  2846  #endif
  2847    volatile au64 qstate; // user updates it
  2848    struct qsbr_ref_real * volatile * pptr; // internal only
  2849    struct qsbr_ref_real * park;
  2850  };
  2851  
  2852  static_assert(sizeof(struct qsbr_ref) == sizeof(struct qsbr_ref_real), "sizeof qsbr_ref");
  2853  
  2854  // Quiescent-State-Based Reclamation RCU
  2855  struct qsbr {
  2856    struct qsbr_ref_real target;
  2857    u64 padding0[5];
  2858    struct qshard {
  2859      au64 bitmap;
  2860      struct qsbr_ref_real * volatile ptrs[QSBR_STATES_NR];
  2861    } shards[QSBR_SHARD_NR];
  2862  };
  2863  
  2864    struct qsbr *
  2865  qsbr_create(void)
  2866  {
  2867    struct qsbr * const q = yalloc(sizeof(*q));
  2868    if (q) memset(q, 0, sizeof(*q)); // yalloc can return NULL
  2869    return q;
  2870  }
  2871  
  2872    static inline struct qshard *
  2873  qsbr_shard(struct qsbr * const q, void * const ptr)
  2874  {
  2875    const u32 sid = crc32c_u64(0, (u64)ptr) & QSBR_SHARD_MASK;
  2876    debug_assert(sid < QSBR_SHARD_NR);
  2877    return &(q->shards[sid]);
  2878  }
  2879  
  2880    static inline void
  2881  qsbr_write_qstate(struct qsbr_ref_real * const ref, const u64 v)
  2882  {
  2883    atomic_store_explicit(&ref->qstate, v, MO_RELAXED);
  2884  }
  2885  
  2886    bool
  2887  qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref)
  2888  {
  2889    struct qsbr_ref_real * const ref = (typeof(ref))qref;
  2890    struct qshard * const shard = qsbr_shard(q, ref);
  2891    qsbr_write_qstate(ref, 0);
  2892  
  2893    do {
  2894      u64 bits = atomic_load_explicit(&shard->bitmap, MO_CONSUME);
  2895      const u32 pos = (u32)__builtin_ctzl(~bits);
  2896      if (unlikely(pos >= QSBR_STATES_NR))
  2897        return false;
  2898  
  2899      const u64 bits1 = bits | (1lu << pos);
  2900      if (atomic_compare_exchange_weak_explicit(&shard->bitmap, &bits, bits1, MO_ACQUIRE, MO_RELAXED)) {
  2901        shard->ptrs[pos] = ref;
  2902  
  2903        ref->pptr = &(shard->ptrs[pos]);
  2904        ref->park = &q->target;
  2905  #ifdef QSBR_DEBUG
  2906        ref->ptid = (u64)pthread_self();
  2908        ref->status = 1;
  2909        ref->nbt = backtrace(ref->backtrace, QSBR_DEBUG_BTNR);
  2910  #endif
  2911        return true;
  2912      }
  2913    } while (true);
  2914  }
  2915  
  2916    void
  2917  qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref)
  2918  {
  2919    struct qsbr_ref_real * const ref = (typeof(ref))qref;
  2920    struct qshard * const shard = qsbr_shard(q, ref);
  2921    const u32 pos = (u32)(ref->pptr - shard->ptrs);
  2922    debug_assert(pos < QSBR_STATES_NR);
  2923    debug_assert(shard->bitmap & (1lu << pos));
  2924  
  2925    shard->ptrs[pos] = &q->target;
  2926    (void)atomic_fetch_and_explicit(&shard->bitmap, ~(1lu << pos), MO_RELEASE);
  2927  #ifdef QSBR_DEBUG
  2929    ref->ptid = 0;
  2930    ref->status = 0xffff; // unregistered
  2931    ref->nbt = 0;
  2932  #endif
  2933    ref->pptr = NULL;
  2934    // wait for qsbr_wait to leave if it's working on the shard
  2935    while (atomic_load_explicit(&shard->bitmap, MO_CONSUME) >> 63)
  2936      cpu_pause();
  2937  }
  2938  
  2939    inline void
  2940  qsbr_update(struct qsbr_ref * const qref, const u64 v)
  2941  {
  2942    struct qsbr_ref_real * const ref = (typeof(ref))qref;
  2943    debug_assert((*ref->pptr) == ref); // must be unparked
  2944    // rcu update does not require release or acquire order
  2945    qsbr_write_qstate(ref, v);
  2946  }
  2947  
  2948    inline void
  2949  qsbr_park(struct qsbr_ref * const qref)
  2950  {
  2951    cpu_cfence();
  2952    struct qsbr_ref_real * const ref = (typeof(ref))qref;
  2953    *ref->pptr = ref->park;
  2954  #ifdef QSBR_DEBUG
  2955    ref->status = 0xfff; // parked
  2956  #endif
  2957  }
  2958  
  2959    inline void
  2960  qsbr_resume(struct qsbr_ref * const qref)
  2961  {
  2962    struct qsbr_ref_real * const ref = (typeof(ref))qref;
  2963    *ref->pptr = ref;
  2964  #ifdef QSBR_DEBUG
  2965    ref->status = 0xf; // resumed
  2966  #endif
  2967    cpu_cfence();
  2968  }
  2969  
  2970  // waiters need external synchronization
  2971    void
  2972  qsbr_wait(struct qsbr * const q, const u64 target)
  2973  {
  2974    cpu_cfence();
  2975    qsbr_write_qstate(&q->target, target);
  2976    u64 cbits = 0; // check-bits; each bit corresponds to a shard
  2977    u64 bms[QSBR_SHARD_NR]; // copy of all bitmap
  2978    // take an unsafe snapshot of active users
  2979    for (u32 i = 0; i < QSBR_SHARD_NR; i++) {
  2980      bms[i] = atomic_load_explicit(&q->shards[i].bitmap, MO_CONSUME);
  2981      if (bms[i])
  2982        cbits |= (1lu << i); // set to 1 if [i] has ptrs
  2983    }
  2984  
  2985    while (cbits) {
  2986      for (u64 ctmp = cbits; ctmp; ctmp &= (ctmp - 1)) {
  2987        // shard id
  2988        const u32 i = (u32)__builtin_ctzl(ctmp);
  2989        struct qshard * const shard = &(q->shards[i]);
  2990        const u64 bits1 = atomic_fetch_or_explicit(&(shard->bitmap), 1lu << 63, MO_ACQUIRE);
  2991        for (u64 bits = bms[i]; bits; bits &= (bits - 1)) {
  2992          const u64 bit = bits & -bits; // extract lowest bit
  2993          if (((bits1 & bit) == 0) ||
  2994              (atomic_load_explicit(&(shard->ptrs[__builtin_ctzl(bit)]->qstate), MO_CONSUME) == target))
  2995            bms[i] &= ~bit;
  2996        }
  2997        (void)atomic_fetch_and_explicit(&(shard->bitmap), ~(1lu << 63), MO_RELEASE);
  2998        if (bms[i] == 0)
  2999          cbits &= ~(1lu << i);
  3000      }
  3001  #if defined(CORR)
  3002      corr_yield();
  3003  #endif
  3004    }
  3005    debug_assert(cbits == 0);
  3006    cpu_cfence();
  3007  }
  3008  
  3009    void
  3010  qsbr_destroy(struct qsbr * const q)
  3011  {
  3012    if (q)
  3013      free(q);
  3014  }
  3015  #undef QSBR_STATES_NR
  3016  #undef QSBR_SHARD_BITS
        #undef QSBR_SHARD_NR
        #undef QSBR_SHARD_MASK
  3017  // }}} qsbr
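        // Illustrative sketch (not part of the library): the usual QSBR pattern with one
        // reader; the name qsbr_example and the version value are hypothetical.
        //
        //  static void
        //qsbr_example(void)
        //{
        //  struct qsbr * const q = qsbr_create();
        //  if (q == NULL)
        //    return;
        //  struct qsbr_ref ref;
        //  if (!qsbr_register(q, &ref)) {
        //    qsbr_destroy(q);
        //    return;
        //  }
        //
        //  const u64 version = 1;
        //  qsbr_update(&ref, version); // reader: announce a quiescent state
        //
        //  // writer: after unpublishing an object, wait until every registered reader
        //  // has reported the target version; then the object can be freed safely
        //  qsbr_wait(q, version);
        //
        //  qsbr_unregister(q, &ref);
        //  qsbr_destroy(q);
        //}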
  3018  
  3019  // vim:fdm=marker