github.com/moontrade/nogc@v0.1.7/sync/RWSpinLock.h

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace programs
 * from blocking.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * For a spinlock, RWSpinLock is a reasonable choice. (See the note
 * above for why a spin lock is frequently a bad idea in general.)
 * RWSpinLock has minimal overhead, and comparable contention
 * performance when the number of competing threads is less than or
 * equal to the number of logical CPUs. Even as the number of
 * threads gets larger, RWSpinLock can still be very competitive in
 * READ, although it is slower on WRITE, and also inherently unfair
 * to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *   writer priority) is NOT reentrant, even for lock_shared().
 *
 *   The lock will not grant any new shared (read) accesses while a thread
 *   attempting to acquire the lock in write mode is blocked. (That is,
 *   if the lock is held in shared mode by N threads, and a thread attempts
 *   to acquire it in write mode, no one else can acquire it in shared mode
 *   until these N threads release the lock and then the blocked thread
 *   acquires and releases the exclusive lock.)
 *   This also applies for
 *   attempts to reacquire the lock in shared mode by threads that already
 *   hold it in shared mode, making the lock non-reentrant.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */

#pragma once

/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                      Iters   Total t   t/iter  iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                      100000  1.786 ms  17.86 ns    53.4M
+30.5% BM_RWSpinLockWrite                     100000  2.331 ms  23.31 ns   40.91M
+85.7% BM_RWTicketSpinLock32Read              100000  3.317 ms  33.17 ns   28.75M
+96.0% BM_RWTicketSpinLock32Write             100000    3.5 ms     35 ns   27.25M
+85.6% BM_RWTicketSpinLock64Read              100000  3.315 ms  33.15 ns   28.77M
+96.0% BM_RWTicketSpinLock64Write             100000    3.5 ms     35 ns   27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead   100000  3.317 ms  33.17 ns   28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite  100000  2.316 ms  23.16 ns   41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead   100000  3.309 ms  33.09 ns   28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite  100000  2.325 ms  23.25 ns   41.02M
+ 175% BM_PThreadRWMutexRead                  100000  4.917 ms  49.17 ns    19.4M
+ 166% BM_PThreadRWMutexWrite                 100000  4.757 ms  47.57 ns   20.05M

------------------------------------------------------------------------------
2. Contention Benchmark      90% read  10% write
Benchmark                    hits       average    min       max        sigma
------------------------------------------------------------------------------
---------- 8  threads ------------
RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us

---------- 16 threads ------------
RWSpinLock       Write       124236     237ns      78ns      261us      801ns
RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us

---------- 50 threads ------------
RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
RWTicketSpinLock Read        744133     386ns      78ns      11ms       31.4us
pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us

*/

#include "Portability.h"
#include "Asm.h"

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif

#include <algorithm>
#include <atomic>
#include <thread>

#include "Likely.h"

/*
 * A simple, small (4 bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces. (A brief usage sketch follows the class definition below.)
 */
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;

  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() { bits_.fetch_add(-READER, std::memory_order_release); }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }

  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's done
    // with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }

  class FOLLY_NODISCARD ReadHolder;
  class FOLLY_NODISCARD UpgradedHolder;
  class FOLLY_NODISCARD WriteHolder;

  class FOLLY_NODISCARD ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock_shared();
      }
    }

    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) { std::swap(lock_, other->lock_); }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };

  class FOLLY_NODISCARD UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_and_lock_upgrade();
      }
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) {
        lock_->unlock_upgrade();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock_upgrade();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_upgrade();
      }
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };

  class FOLLY_NODISCARD WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) { lock_->lock(); }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) {
        lock_->unlock_upgrade_and_lock();
      }
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
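
/*
 * Usage sketch (illustrative only, not part of the original header). The
 * Counters struct and the read()/bumpIfNegative() functions below are
 * hypothetical; only the RWSpinLock, ReadHolder, UpgradedHolder, and
 * WriteHolder APIs are taken from this file. std::move assumes <utility>
 * is available.
 *
 *   struct Counters {
 *     RWSpinLock lock;
 *     int64_t value = 0;
 *   };
 *
 *   int64_t read(Counters& c) {
 *     RWSpinLock::ReadHolder guard(&c.lock);  // shared (read) access
 *     return c.value;
 *   }
 *
 *   void bumpIfNegative(Counters& c) {
 *     // Take the upgrade lock first: it excludes writers and other
 *     // upgraders but lets existing readers finish.
 *     RWSpinLock::UpgradedHolder upgraded(&c.lock);
 *     if (c.value < 0) {
 *       // Promote to exclusive access only when a write is actually needed.
 *       RWSpinLock::WriteHolder writer(std::move(upgraded));
 *       c.value = 0;
 *     }
 *   }
 */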

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.
// (A brief usage sketch follows the RWTicketSpinLock32/64 typedefs below.)

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(
      kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    // clang-format off
    return _mm_set_epi8(
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
    // clang-format on
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail

template <size_t kBitWidth, bool kFavorWriter = true>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }

 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }

  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }

  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * turns.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }

  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {
    }
  }

  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }

  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }

  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() { __sync_fetch_and_add(&ticket.write, 1); }

  class FOLLY_NODISCARD WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
  class FOLLY_NODISCARD ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock_shared();
      }
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) {
        lock_->unlock_shared();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock_) {
        lock_->unlock_shared();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock_shared();
      }
    }

    void swap(ReadHolder* other) { std::swap(this->lock_, other->lock_); }

   private:
    RWSpinLock* lock_;
  };

  class FOLLY_NODISCARD WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) {
        lock_->lock();
      }
    }
    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) {
        lock_->lock();
      }
    }

    ~WriteHolder() {
      if (lock_) {
        lock_->unlock();
      }
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) {
        return;
      }
      if (lock_) {
        lock_->unlock();
      }
      lock_ = lock;
      if (lock_) {
        lock_->lock();
      }
    }

    void swap(WriteHolder* other) { std::swap(this->lock_, other->lock_); }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
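
/*
 * Usage sketch for the ticket lock (illustrative only, not part of the
 * original header). The Stats struct and the total()/recordHit() functions
 * are hypothetical; only RWTicketSpinLock32 and its ReadHolder/WriteHolder
 * are taken from this file. With the default kFavorWriter = true, lock()
 * spins aggressively and acquires the lock once the readers that arrived
 * before it have drained.
 *
 *   struct Stats {
 *     RWTicketSpinLock32 lock;
 *     uint64_t hits = 0;
 *     uint64_t misses = 0;
 *   };
 *
 *   uint64_t total(Stats& s) {
 *     RWTicketSpinLock32::ReadHolder guard(&s.lock);  // shared access
 *     return s.hits + s.misses;
 *   }
 *
 *   void recordHit(Stats& s) {
 *     RWTicketSpinLock32::WriteHolder guard(&s.lock);  // exclusive access
 *     ++s.hits;
 *   }
 */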

#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif