github.com/moontrade/mdbx-go@v0.4.0/mdbx.c (about)

     1  /*
     2   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
     3   * and other libmdbx authors: please see AUTHORS file.
     4   * All rights reserved.
     5   *
     6   * Redistribution and use in source and binary forms, with or without
     7   * modification, are permitted only as authorized by the OpenLDAP
     8   * Public License.
     9   *
    10   * A copy of this license is available in the file LICENSE in the
    11   * top-level directory of the distribution or, alternatively, at
    12   * <http://www.OpenLDAP.org/license.html>. */
    13  
    14  #define xMDBX_ALLOY 1
    15  #define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
    16  #ifdef MDBX_CONFIG_H
    17  #include MDBX_CONFIG_H
    18  #endif
    19  
    20  #define LIBMDBX_INTERNALS
    21  #ifdef xMDBX_TOOLS
    22  #define MDBX_DEPRECATED
    23  #endif /* xMDBX_TOOLS */
    24  
    25  #ifdef xMDBX_ALLOY
    26  /* Amalgamated build */
    27  #define MDBX_INTERNAL_FUNC static
    28  #define MDBX_INTERNAL_VAR static
    29  #else
    30  /* Non-amalgamated build */
    31  #define MDBX_INTERNAL_FUNC
    32  #define MDBX_INTERNAL_VAR extern
    33  #endif /* xMDBX_ALLOY */
    34  
    35  /*----------------------------------------------------------------------------*/
    36  
    37  /** Disables using GNU/Linux libc extensions.
    38   * \ingroup build_option
    39   * \note This option couldn't be moved to the options.h since dependant
    40   * control macros/defined should be prepared before include the options.h */
    41  #ifndef MDBX_DISABLE_GNU_SOURCE
    42  #define MDBX_DISABLE_GNU_SOURCE 0
    43  #endif
    44  #if MDBX_DISABLE_GNU_SOURCE
    45  #undef _GNU_SOURCE
    46  #elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
    47  #define _GNU_SOURCE
    48  #endif /* MDBX_DISABLE_GNU_SOURCE */
    49  
    50  /* Should be defined before any includes */
    51  #if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) &&                \
    52      !defined(ANDROID)
    53  #define _FILE_OFFSET_BITS 64
    54  #endif
    55  
    56  #ifdef __APPLE__
    57  #define _DARWIN_C_SOURCE
    58  #endif
    59  
    60  #ifdef _MSC_VER
    61  #if _MSC_FULL_VER < 190024234
    62  /* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
    63   * Studio 2015 Update 3). But you could remove this #error and try to continue
    64   * at your own risk. In such case please don't rise up an issues related ONLY to
    65   * old compilers.
    66   *
    67   * NOTE:
    68   *   Unfortunately, there are several different builds of "Visual Studio" that
    69   *   are called "Visual Studio 2015 Update 3".
    70   *
    71   *   The 190024234 is used here because it is minimal version of Visual Studio
    72   *   that was used for build and testing libmdbx in recent years. Soon this
    73   *   value will be increased to 19.0.24241.7, since build and testing using
    74   *   "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
    75   *
    76   *   Please ask Microsoft (but not us) for information about version differences
    77   *   and how to and where you can obtain the latest "Visual Studio 2015" build
    78   *   with all fixes.
    79   */
    80  #error                                                                         \
    81      "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
    82  #endif
    83  #ifndef _CRT_SECURE_NO_WARNINGS
    84  #define _CRT_SECURE_NO_WARNINGS
    85  #endif /* _CRT_SECURE_NO_WARNINGS */
    86  #if _MSC_VER > 1800
    87  #pragma warning(disable : 4464) /* relative include path contains '..' */
    88  #endif
    89  #if _MSC_VER > 1913
    90  #pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation...  \
    91                                   */
    92  #endif
    93  #if _MSC_VER > 1914
    94  #pragma warning(                                                               \
    95      disable : 5105) /* winbase.h(9531): warning C5105: macro expansion         \
    96                         producing 'defined' has undefined behavior */
    97  #endif
    98  #pragma warning(disable : 4710) /* 'xyz': function not inlined */
    99  #pragma warning(disable : 4711) /* function 'xyz' selected for automatic       \
   100                                     inline expansion */
   101  #pragma warning(                                                               \
   102      disable : 4201) /* nonstandard extension used : nameless struct / union */
   103  #pragma warning(disable : 4702) /* unreachable code */
   104  #pragma warning(disable : 4706) /* assignment within conditional expression */
   105  #pragma warning(disable : 4127) /* conditional expression is constant */
   106  #pragma warning(disable : 4324) /* 'xyz': structure was padded due to          \
   107                                     alignment specifier */
   108  #pragma warning(disable : 4310) /* cast truncates constant value */
   109  #pragma warning(                                                               \
   110      disable : 4820) /* bytes padding added after data member for alignment */
   111  #pragma warning(disable : 4548) /* expression before comma has no effect;      \
   112                                     expected expression with side - effect */
   113  #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
   114                                     unaligned */
   115  #pragma warning(disable : 4200) /* nonstandard extension used: zero-sized      \
   116                                     array in struct/union */
   117  #pragma warning(disable : 4204) /* nonstandard extension used: non-constant    \
   118                                     aggregate initializer */
   119  #pragma warning(                                                               \
   120      disable : 4505) /* unreferenced local function has been removed */
   121  #endif              /* _MSC_VER (warnings) */
   122  
   123  #if defined(__GNUC__) && __GNUC__ < 9
   124  #pragma GCC diagnostic ignored "-Wattributes"
   125  #endif /* GCC < 9 */
   126  
   127  #if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) &&    \
   128      !defined(__USE_MINGW_ANSI_STDIO)
   129  #define __USE_MINGW_ANSI_STDIO 1
   130  #endif /* __USE_MINGW_ANSI_STDIO */
   131  
   132  #include "mdbx.h"
   133  /*
   134   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
   135   * and other libmdbx authors: please see AUTHORS file.
   136   * All rights reserved.
   137   *
   138   * Redistribution and use in source and binary forms, with or without
   139   * modification, are permitted only as authorized by the OpenLDAP
   140   * Public License.
   141   *
   142   * A copy of this license is available in the file LICENSE in the
   143   * top-level directory of the distribution or, alternatively, at
   144   * <http://www.OpenLDAP.org/license.html>.
   145   */
   146  
   147  
   148  /*----------------------------------------------------------------------------*/
   149  /* Microsoft compiler generates a lot of warning for self includes... */
   150  
   151  #ifdef _MSC_VER
   152  #pragma warning(push, 1)
   153  #pragma warning(disable : 4548) /* expression before comma has no effect;      \
   154                                     expected expression with side - effect */
   155  #pragma warning(disable : 4530) /* C++ exception handler used, but unwind      \
   156                                   * semantics are not enabled. Specify /EHsc */
   157  #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling  \
   158                                   * mode specified; termination on exception is \
   159                                   * not guaranteed. Specify /EHsc */
   160  #endif                          /* _MSC_VER (warnings) */
   161  
   162  #if defined(_WIN32) || defined(_WIN64)
   163  #if !defined(_CRT_SECURE_NO_WARNINGS)
   164  #define _CRT_SECURE_NO_WARNINGS
   165  #endif /* _CRT_SECURE_NO_WARNINGS */
   166  #if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY &&             \
   167      !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
   168  #define _NO_CRT_STDIO_INLINE
   169  #endif
   170  #elif !defined(_POSIX_C_SOURCE)
   171  #define _POSIX_C_SOURCE 200809L
   172  #endif /* Windows */
   173  
   174  /*----------------------------------------------------------------------------*/
   175  /* basic C99 includes */
   176  #include <inttypes.h>
   177  #include <stddef.h>
   178  #include <stdint.h>
   179  #include <stdlib.h>
   180  
   181  #include <assert.h>
   182  #include <fcntl.h>
   183  #include <limits.h>
   184  #include <stdio.h>
   185  #include <string.h>
   186  #include <time.h>
   187  
   188  #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
   189  #error                                                                         \
   190      "Sanity checking failed: Two's complement, reasonably sized integer types"
   191  #endif
   192  
   193  #ifndef SSIZE_MAX
   194  #define SSIZE_MAX INTPTR_MAX
   195  #endif
   196  
   197  #if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
   198  #define MDBX_WORDBITS 64
   199  #else
   200  #define MDBX_WORDBITS 32
   201  #endif /* MDBX_WORDBITS */
   202  
   203  /*----------------------------------------------------------------------------*/
   204  /* feature testing */
   205  
   206  #ifndef __has_warning
   207  #define __has_warning(x) (0)
   208  #endif
   209  
   210  #ifndef __has_include
   211  #define __has_include(x) (0)
   212  #endif
   213  
   214  #ifndef __has_feature
   215  #define __has_feature(x) (0)
   216  #endif
   217  
   218  #ifndef __has_extension
   219  #define __has_extension(x) (0)
   220  #endif
   221  
   222  #if __has_feature(thread_sanitizer)
   223  #define __SANITIZE_THREAD__ 1
   224  #endif
   225  
   226  #if __has_feature(address_sanitizer)
   227  #define __SANITIZE_ADDRESS__ 1
   228  #endif
   229  
   230  #ifndef __GNUC_PREREQ
   231  #if defined(__GNUC__) && defined(__GNUC_MINOR__)
   232  #define __GNUC_PREREQ(maj, min)                                                \
   233    ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
   234  #else
   235  #define __GNUC_PREREQ(maj, min) (0)
   236  #endif
   237  #endif /* __GNUC_PREREQ */
   238  
   239  #ifndef __CLANG_PREREQ
   240  #ifdef __clang__
   241  #define __CLANG_PREREQ(maj, min)                                               \
   242    ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
   243  #else
   244  #define __CLANG_PREREQ(maj, min) (0)
   245  #endif
   246  #endif /* __CLANG_PREREQ */
   247  
   248  #ifndef __GLIBC_PREREQ
   249  #if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
   250  #define __GLIBC_PREREQ(maj, min)                                               \
   251    ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
   252  #else
   253  #define __GLIBC_PREREQ(maj, min) (0)
   254  #endif
   255  #endif /* __GLIBC_PREREQ */
   256  
   257  /*----------------------------------------------------------------------------*/
   258  /* C11' alignas() */
   259  
   260  #if __has_include(<stdalign.h>)
   261  #include <stdalign.h>
   262  #endif
   263  #if defined(alignas) || defined(__cplusplus)
   264  #define MDBX_ALIGNAS(N) alignas(N)
   265  #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
   266  #define MDBX_ALIGNAS(N) _Alignas(N)
   267  #elif defined(_MSC_VER)
   268  #define MDBX_ALIGNAS(N) __declspec(align(N))
   269  #elif __has_attribute(__aligned__) || defined(__GNUC__)
   270  #define MDBX_ALIGNAS(N) __attribute__((__aligned__(N)))
   271  #else
   272  #error "FIXME: Required alignas() or equivalent."
   273  #endif /* MDBX_ALIGNAS */
   274  
   275  /*----------------------------------------------------------------------------*/
   276  /* Systems macros and includes */
   277  
   278  #ifndef __extern_C
   279  #ifdef __cplusplus
   280  #define __extern_C extern "C"
   281  #else
   282  #define __extern_C
   283  #endif
   284  #endif /* __extern_C */
   285  
   286  #if !defined(nullptr) && !defined(__cplusplus) ||                              \
   287      (__cplusplus < 201103L && !defined(_MSC_VER))
   288  #define nullptr NULL
   289  #endif
   290  
   291  #if defined(__APPLE__) || defined(_DARWIN_C_SOURCE)
   292  #include <AvailabilityMacros.h>
   293  #include <TargetConditionals.h>
   294  #ifndef MAC_OS_X_VERSION_MIN_REQUIRED
   295  #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */
   296  #endif
   297  #endif /* Apple OSX & iOS */
   298  
   299  #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
   300      defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) ||         \
   301      defined(__APPLE__) || defined(__MACH__)
   302  #include <sys/cdefs.h>
   303  #include <sys/mount.h>
   304  #include <sys/sysctl.h>
   305  #include <sys/types.h>
   306  #if defined(__FreeBSD__) || defined(__DragonFly__)
   307  #include <vm/vm_param.h>
   308  #elif defined(__OpenBSD__) || defined(__NetBSD__)
   309  #include <uvm/uvm_param.h>
   310  #else
   311  #define SYSCTL_LEGACY_NONCONST_MIB
   312  #endif
   313  #ifndef __MACH__
   314  #include <sys/vmmeter.h>
   315  #endif
   316  #else
   317  #include <malloc.h>
   318  #if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) ||                \
   319        defined(_WIN32) || defined(_WIN64))
   320  #include <mntent.h>
   321  #endif /* !Solaris */
   322  #endif /* !xBSD */
   323  
   324  #if defined(__FreeBSD__) || __has_include(<malloc_np.h>)
   325  #include <malloc_np.h>
   326  #endif
   327  
   328  #if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>)
   329  #include <malloc/malloc.h>
   330  #endif /* MacOS */
   331  
   332  #if defined(__MACH__)
   333  #include <mach/host_info.h>
   334  #include <mach/mach_host.h>
   335  #include <mach/mach_port.h>
   336  #include <uuid/uuid.h>
   337  #endif
   338  
   339  #if defined(__linux__) || defined(__gnu_linux__)
   340  #include <sched.h>
   341  #include <sys/sendfile.h>
   342  #include <sys/statfs.h>
   343  #endif /* Linux */
   344  
   345  #ifndef _XOPEN_SOURCE
   346  #define _XOPEN_SOURCE 0
   347  #endif
   348  
   349  #ifndef _XOPEN_SOURCE_EXTENDED
   350  #define _XOPEN_SOURCE_EXTENDED 0
   351  #else
   352  #include <utmpx.h>
   353  #endif /* _XOPEN_SOURCE_EXTENDED */
   354  
   355  #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
   356  #include <kstat.h>
   357  #include <sys/mnttab.h>
   358  /* On Solaris, it's easier to add a missing prototype rather than find a
   359   * combination of #defines that break nothing. */
   360  __extern_C key_t ftok(const char *, int);
   361  #endif /* SunOS/Solaris */
   362  
   363  #if defined(_WIN32) || defined(_WIN64) /*-------------------------------------*/
   364  
   365  #ifndef _WIN32_WINNT
   366  #define _WIN32_WINNT 0x0601 /* Windows 7 */
   367  #elif _WIN32_WINNT < 0x0500
   368  #error At least 'Windows 2000' API is required for libmdbx.
   369  #endif /* _WIN32_WINNT */
   370  #if (defined(__MINGW32__) || defined(__MINGW64__)) &&                          \
   371      !defined(__USE_MINGW_ANSI_STDIO)
   372  #define __USE_MINGW_ANSI_STDIO 1
   373  #endif /* MinGW */
   374  #ifndef WIN32_LEAN_AND_MEAN
   375  #define WIN32_LEAN_AND_MEAN
   376  #endif /* WIN32_LEAN_AND_MEAN */
   377  #include <excpt.h>
   378  #include <tlhelp32.h>
   379  #include <windows.h>
   380  #include <winnt.h>
   381  #include <winternl.h>
   382  
   383  #else /*----------------------------------------------------------------------*/
   384  
   385  #include <unistd.h>
   386  #if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1
   387  #error "libmdbx requires the _POSIX_MAPPED_FILES feature"
   388  #endif /* _POSIX_MAPPED_FILES */
   389  
   390  #include <pthread.h>
   391  #include <semaphore.h>
   392  #include <signal.h>
   393  #include <sys/file.h>
   394  #include <sys/ipc.h>
   395  #include <sys/mman.h>
   396  #include <sys/param.h>
   397  #include <sys/stat.h>
   398  #include <sys/statvfs.h>
   399  #include <sys/uio.h>
   400  
   401  #endif /*---------------------------------------------------------------------*/
   402  
   403  #if defined(__ANDROID_API__) || defined(ANDROID)
   404  #include <android/log.h>
   405  #if __ANDROID_API__ >= 21
   406  #include <sys/sendfile.h>
   407  #endif
   408  #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS != MDBX_WORDBITS
   409  #error "_FILE_OFFSET_BITS != MDBX_WORDBITS" (_FILE_OFFSET_BITS != MDBX_WORDBITS)
   410  #elif defined(__FILE_OFFSET_BITS) && __FILE_OFFSET_BITS != MDBX_WORDBITS
   411  #error "__FILE_OFFSET_BITS != MDBX_WORDBITS" (__FILE_OFFSET_BITS != MDBX_WORDBITS)
   412  #endif
   413  #endif /* Android */
   414  
   415  #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>)
   416  #include <sys/stat.h>
   417  #endif
   418  #if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>)
   419  #include <sys/types.h>
   420  #endif
   421  #if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>)
   422  #include <sys/file.h>
   423  #endif
   424  
   425  /*----------------------------------------------------------------------------*/
   426  /* Byteorder */
   427  
   428  #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
   429      defined(i486) || defined(__i486) || defined(__i486__) ||                   \
   430      defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) ||   \
   431      defined(__i686) || defined(__i686__) || defined(_M_IX86) ||                \
   432      defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) ||            \
   433      defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) ||          \
   434      defined(__amd64__) || defined(__amd64) || defined(_M_X64) ||               \
   435      defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__)
   436  #ifndef __ia32__
   437  /* LY: define neutral __ia32__ for x86 and x86-64 */
   438  #define __ia32__ 1
   439  #endif /* __ia32__ */
   440  #if !defined(__amd64__) &&                                                     \
   441      (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) ||           \
   442       defined(_M_X64) || defined(_M_AMD64))
   443  /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
   444  #define __amd64__ 1
   445  #endif /* __amd64__ */
   446  #endif /* all x86 */
   447  
   448  #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
   449      !defined(__ORDER_BIG_ENDIAN__)
   450  
   451  #if defined(__GLIBC__) || defined(__GNU_LIBRARY__) ||                          \
   452      defined(__ANDROID_API__) || defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
   453  #include <endian.h>
   454  #elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) ||       \
   455      defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
   456  #include <machine/endian.h>
   457  #elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
   458  #include <sys/isa_defs.h>
   459  #elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) ||             \
   460      (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
   461  #include <sys/endian.h>
   462  #include <sys/types.h>
   463  #elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) ||   \
   464      defined(__NetBSD__) || defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
   465  #include <sys/param.h>
   466  #endif /* OS */
   467  
   468  #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
   469  #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
   470  #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
   471  #define __BYTE_ORDER__ __BYTE_ORDER
   472  #elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
   473  #define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
   474  #define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
   475  #define __BYTE_ORDER__ _BYTE_ORDER
   476  #else
   477  #define __ORDER_LITTLE_ENDIAN__ 1234
   478  #define __ORDER_BIG_ENDIAN__ 4321
   479  
   480  #if defined(__LITTLE_ENDIAN__) ||                                              \
   481      (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) ||                      \
   482      defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) ||    \
   483      defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) ||            \
   484      defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) ||                \
   485      defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) ||   \
   486      defined(__BFIN__) || defined(__ia64__) || defined(_IA64) ||                \
   487      defined(__IA64__) || defined(__ia64) || defined(_M_IA64) ||                \
   488      defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) ||        \
   489      defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) ||              \
   490      defined(__WINDOWS__)
   491  #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
   492  
   493  #elif defined(__BIG_ENDIAN__) ||                                               \
   494      (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) ||                      \
   495      defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) ||    \
   496      defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) ||            \
   497      defined(__m68k__) || defined(M68000) || defined(__hppa__) ||               \
   498      defined(__hppa) || defined(__HPPA__) || defined(__sparc__) ||              \
   499      defined(__sparc) || defined(__370__) || defined(__THW_370__) ||            \
   500      defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
   501  #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
   502  
   503  #else
   504  #error __BYTE_ORDER__ should be defined.
   505  #endif /* Arch */
   506  
   507  #endif
   508  #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */
   509  
   510  /*----------------------------------------------------------------------------*/
   511  /* Availability of CMOV or equivalent */
   512  
   513  #ifndef MDBX_HAVE_CMOV
   514  #if defined(__e2k__)
   515  #define MDBX_HAVE_CMOV 1
   516  #elif defined(__thumb2__) || defined(__thumb2)
   517  #define MDBX_HAVE_CMOV 1
   518  #elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB)
   519  #define MDBX_HAVE_CMOV 0
   520  #elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) ||          \
   521      defined(__aarch64) || defined(__arm__) || defined(__arm) ||                \
   522      defined(__CC_ARM)
   523  #define MDBX_HAVE_CMOV 1
   524  #elif (defined(__riscv__) || defined(__riscv64)) &&                            \
   525      (defined(__riscv_b) || defined(__riscv_bitmanip))
   526  #define MDBX_HAVE_CMOV 1
   527  #elif defined(i686) || defined(__i686) || defined(__i686__) ||                 \
   528      (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) ||                \
   529      defined(__x86_64__) || defined(__amd64__) || defined(__amd64) ||           \
   530      defined(_M_X64) || defined(_M_AMD64)
   531  #define MDBX_HAVE_CMOV 1
   532  #else
   533  #define MDBX_HAVE_CMOV 0
   534  #endif
   535  #endif /* MDBX_HAVE_CMOV */
   536  
   537  /*----------------------------------------------------------------------------*/
   538  /* Compiler's includes for builtins/intrinsics */
   539  
   540  #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
   541  #include <intrin.h>
   542  #elif __GNUC_PREREQ(4, 4) || defined(__clang__)
   543  #if defined(__e2k__)
   544  #include <e2kintrin.h>
   545  #include <x86intrin.h>
   546  #endif /* __e2k__ */
   547  #if defined(__ia32__)
   548  #include <cpuid.h>
   549  #include <x86intrin.h>
   550  #endif /* __ia32__ */
   551  #ifdef __ARM_NEON
   552  #include <arm_neon.h>
   553  #endif
   554  #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
   555  #include <mbarrier.h>
   556  #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
   557      (defined(HP_IA64) || defined(__ia64))
   558  #include <machine/sys/inline.h>
   559  #elif defined(__IBMC__) && defined(__powerpc)
   560  #include <atomic.h>
   561  #elif defined(_AIX)
   562  #include <builtins.h>
   563  #include <sys/atomic_op.h>
   564  #elif (defined(__osf__) && defined(__DECC)) || defined(__alpha)
   565  #include <c_asm.h>
   566  #include <machine/builtins.h>
   567  #elif defined(__MWERKS__)
   568  /* CodeWarrior - troubles ? */
   569  #pragma gcc_extensions
   570  #elif defined(__SNC__)
   571  /* Sony PS3 - troubles ? */
   572  #elif defined(__hppa__) || defined(__hppa)
   573  #include <machine/inline.h>
   574  #else
   575  #error Unsupported C compiler, please use GNU C 4.4 or newer
   576  #endif /* Compiler */
   577  
   578  #if !defined(__noop) && !defined(_MSC_VER)
   579  #define __noop                                                                 \
   580    do {                                                                         \
   581    } while (0)
   582  #endif /* __noop */
   583  
   584  #if defined(__fallthrough) &&                                                  \
   585      (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__))
   586  #undef __fallthrough
   587  #endif /* __fallthrough workaround for MinGW */
   588  
   589  #ifndef __fallthrough
   590  #if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) &&               \
   591                               (!defined(__clang__) || __clang__ > 4)) ||        \
   592      __cplusplus >= 201703L
   593  #define __fallthrough [[fallthrough]]
   594  #elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L
   595  #define __fallthrough [[fallthrough]]
   596  #elif __GNUC_PREREQ(7, 0) &&                                                   \
   597      (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) ||           \
   598       (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126))
   599  #define __fallthrough __attribute__((__fallthrough__))
   600  #elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L &&  \
   601      __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
   602  #define __fallthrough [[clang::fallthrough]]
   603  #else
   604  #define __fallthrough
   605  #endif
   606  #endif /* __fallthrough */
   607  
   608  #ifndef __unreachable
   609  #if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable)
   610  #define __unreachable() __builtin_unreachable()
   611  #elif defined(_MSC_VER)
   612  #define __unreachable() __assume(0)
   613  #else
   614  #define __unreachable()                                                        \
   615    do {                                                                         \
   616    } while (1)
   617  #endif
   618  #endif /* __unreachable */
   619  
   620  #ifndef __prefetch
   621  #if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
   622  #define __prefetch(ptr) __builtin_prefetch(ptr)
   623  #else
   624  #define __prefetch(ptr)                                                        \
   625    do {                                                                         \
   626      (void)(ptr);                                                               \
   627    } while (0)
   628  #endif
   629  #endif /* __prefetch */
   630  
   631  #ifndef offsetof
   632  #define offsetof(type, member) __builtin_offsetof(type, member)
   633  #endif /* offsetof */
   634  
   635  #ifndef container_of
   636  #define container_of(ptr, type, member)                                        \
   637    ((type *)((char *)(ptr)-offsetof(type, member)))
   638  #endif /* container_of */
   639  
   640  /*----------------------------------------------------------------------------*/
   641  
   642  #ifndef __always_inline
   643  #if defined(__GNUC__) || __has_attribute(__always_inline__)
   644  #define __always_inline __inline __attribute__((__always_inline__))
   645  #elif defined(_MSC_VER)
   646  #define __always_inline __forceinline
   647  #else
   648  #define __always_inline
   649  #endif
   650  #endif /* __always_inline */
   651  
   652  #ifndef __noinline
   653  #if defined(__GNUC__) || __has_attribute(__noinline__)
   654  #define __noinline __attribute__((__noinline__))
   655  #elif defined(_MSC_VER)
   656  #define __noinline __declspec(noinline)
   657  #else
   658  #define __noinline
   659  #endif
   660  #endif /* __noinline */
   661  
   662  #ifndef __must_check_result
   663  #if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
   664  #define __must_check_result __attribute__((__warn_unused_result__))
   665  #else
   666  #define __must_check_result
   667  #endif
   668  #endif /* __must_check_result */
   669  
   670  #ifndef __nothrow
   671  #if defined(__cplusplus)
   672  #if __cplusplus < 201703L
   673  #define __nothrow throw()
   674  #else
   675  #define __nothrow noexcept(true)
   676  #endif /* __cplusplus */
   677  #elif defined(__GNUC__) || __has_attribute(__nothrow__)
   678  #define __nothrow __attribute__((__nothrow__))
   679  #elif defined(_MSC_VER) && defined(__cplusplus)
   680  #define __nothrow __declspec(nothrow)
   681  #else
   682  #define __nothrow
   683  #endif
   684  #endif /* __nothrow */
   685  
   686  #ifndef __hidden
   687  #if defined(__GNUC__) || __has_attribute(__visibility__)
   688  #define __hidden __attribute__((__visibility__("hidden")))
   689  #else
   690  #define __hidden
   691  #endif
   692  #endif /* __hidden */
   693  
   694  #ifndef __optimize
   695  #if defined(__OPTIMIZE__)
   696  #if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
   697  #define __optimize(ops) __attribute__((__optimize__(ops)))
   698  #else
   699  #define __optimize(ops)
   700  #endif
   701  #else
   702  #define __optimize(ops)
   703  #endif
   704  #endif /* __optimize */
   705  
   706  #ifndef __hot
   707  #if defined(__OPTIMIZE__)
   708  #if defined(__e2k__)
   709  #define __hot __attribute__((__hot__)) __optimize(3)
   710  #elif defined(__clang__) && !__has_attribute(__hot_) &&                        \
   711      __has_attribute(__section__) &&                                            \
   712      (defined(__linux__) || defined(__gnu_linux__))
   713  /* just put frequently used functions in separate section */
   714  #define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
   715  #elif defined(__LCC__)
   716  #define __hot __attribute__((__hot__, __optimize__("Ofast,O4")))
   717  #elif defined(__GNUC__) || __has_attribute(__hot__)
   718  #define __hot __attribute__((__hot__)) __optimize("O3")
   719  #else
   720  #define __hot __optimize("O3")
   721  #endif
   722  #else
   723  #define __hot
   724  #endif
   725  #endif /* __hot */
   726  
   727  #ifndef __cold
   728  #if defined(__OPTIMIZE__)
   729  #if defined(__e2k__)
   730  #define __cold __attribute__((__cold__)) __optimize(1)
   731  #elif defined(__clang__) && !__has_attribute(cold) &&                          \
   732      __has_attribute(__section__) &&                                            \
   733      (defined(__linux__) || defined(__gnu_linux__))
   734  /* just put infrequently used functions in separate section */
   735  #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
   736  #elif defined(__LCC__)
   737  #define __hot __attribute__((__cold__, __optimize__("Osize")))
   738  #elif defined(__GNUC__) || __has_attribute(cold)
   739  #define __cold __attribute__((__cold__)) __optimize("Os")
   740  #else
   741  #define __cold __optimize("Os")
   742  #endif
   743  #else
   744  #define __cold
   745  #endif
   746  #endif /* __cold */
   747  
#ifndef __flatten
/* Inline the whole call tree of the annotated function in optimized builds. */
#if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
#define __flatten __attribute__((__flatten__))
#else
#define __flatten
#endif
#endif /* __flatten */

#ifndef likely
/* Branch-prediction hints; disabled under Coverity, which is confused by
 * __builtin_expect in its static analysis. */
#if (defined(__GNUC__) || __has_builtin(__builtin_expect)) &&                  \
    !defined(__COVERITY__)
#define likely(cond) __builtin_expect(!!(cond), 1)
#else
#define likely(x) (!!(x))
#endif
#endif /* likely */

#ifndef unlikely
#if (defined(__GNUC__) || __has_builtin(__builtin_expect)) &&                  \
    !defined(__COVERITY__)
#define unlikely(cond) __builtin_expect(!!(cond), 0)
#else
#define unlikely(x) (!!(x))
#endif
#endif /* unlikely */

#ifndef __anonymous_struct_extension__
/* Suppresses pedantic warnings for anonymous structs/unions, which were a
 * GNU extension prior to C11. */
#if defined(__GNUC__)
#define __anonymous_struct_extension__ __extension__
#else
#define __anonymous_struct_extension__
#endif
#endif /* __anonymous_struct_extension__ */

#ifndef expect_with_probability
/* NOTE(review): defined(__builtin_expect_with_probability) is always false
 * since builtins are not preprocessor macros; the __has_builtin and
 * __GNUC_PREREQ operands carry the actual detection. */
#if defined(__builtin_expect_with_probability) ||                              \
    __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
#define expect_with_probability(expr, value, prob)                             \
  __builtin_expect_with_probability(expr, value, prob)
#else
#define expect_with_probability(expr, value, prob) (expr)
#endif
#endif /* expect_with_probability */

#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE
/* Weak-import marker for symbols that may be absent at runtime. */
#ifdef WEAK_IMPORT_ATTRIBUTE
#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE
#elif __has_attribute(__weak__) && __has_attribute(__weak_import__)
#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__))
#elif __has_attribute(__weak__) ||                                             \
    (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__))
#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__))
#else
#define MDBX_WEAK_IMPORT_ATTRIBUTE
#endif
#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */
   804  
   805  /*----------------------------------------------------------------------------*/
   806  
   807  #if defined(MDBX_USE_VALGRIND)
   808  #include <valgrind/memcheck.h>
   809  #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
   810  /* LY: available since Valgrind 3.10 */
   811  #define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
   812  #define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
   813  #endif
   814  #elif !defined(RUNNING_ON_VALGRIND)
   815  #define VALGRIND_CREATE_MEMPOOL(h, r, z)
   816  #define VALGRIND_DESTROY_MEMPOOL(h)
   817  #define VALGRIND_MEMPOOL_TRIM(h, a, s)
   818  #define VALGRIND_MEMPOOL_ALLOC(h, a, s)
   819  #define VALGRIND_MEMPOOL_FREE(h, a)
   820  #define VALGRIND_MEMPOOL_CHANGE(h, a, b, s)
   821  #define VALGRIND_MAKE_MEM_NOACCESS(a, s)
   822  #define VALGRIND_MAKE_MEM_DEFINED(a, s)
   823  #define VALGRIND_MAKE_MEM_UNDEFINED(a, s)
   824  #define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
   825  #define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
   826  #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0)
   827  #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0)
   828  #define RUNNING_ON_VALGRIND (0)
   829  #endif /* MDBX_USE_VALGRIND */
   830  
   831  #ifdef __SANITIZE_ADDRESS__
   832  #include <sanitizer/asan_interface.h>
   833  #elif !defined(ASAN_POISON_MEMORY_REGION)
   834  #define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
   835  #define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
   836  #endif /* __SANITIZE_ADDRESS__ */
   837  
   838  /*----------------------------------------------------------------------------*/
   839  
#ifndef ARRAY_LENGTH
/* Number of elements in a true array (NOT a pointer); the C++ variant
 * rejects pointers at compile time via template argument deduction. */
#ifdef __cplusplus
template <typename T, size_t N> char (&__ArraySizeHelper(T (&array)[N]))[N];
#define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
#else
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
#endif
#endif /* ARRAY_LENGTH */

#ifndef ARRAY_END
/* Past-the-end pointer of a true array. */
#define ARRAY_END(array) (&array[ARRAY_LENGTH(array)])
#endif /* ARRAY_END */

/* Token pasting; XCONCAT expands its arguments before pasting. */
#define CONCAT(a, b) a##b
#define XCONCAT(a, b) CONCAT(a, b)

/* Packs four byte values into a uint32_t, first argument most-significant. */
#define MDBX_TETRAD(a, b, c, d)                                                \
  ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d))

#define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3])

#define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__)

/* Compile-time assertion using the best mechanism the toolchain offers;
 * the last-resort fallback abuses a switch with duplicate case labels. */
#ifndef STATIC_ASSERT_MSG
#if defined(static_assert)
#define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
#elif defined(_STATIC_ASSERT)
#define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
#elif defined(_MSC_VER)
#include <crtdbg.h>
#define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
#elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||            \
    __has_feature(c_static_assert)
#define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
#else
#define STATIC_ASSERT_MSG(expr, msg)                                           \
  switch (0) {                                                                 \
  case 0:                                                                      \
  case (expr):;                                                                \
  }
#endif
#endif /* STATIC_ASSERT_MSG */

#ifndef STATIC_ASSERT
#define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
#endif
   886  
#ifndef __Wpedantic_format_voidptr
/* Launders any object pointer to 'const void *' for printf-style "%p"
 * arguments, so -Wpedantic does not warn about non-void pointers.
 * The macro deliberately shadows the function with a self-call wrapper. */
MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void *
__Wpedantic_format_voidptr(const void *ptr) {
  return ptr;
}
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */
   894  
/* Sanity warnings for toolchains older than those libmdbx is tested with. */
#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
/* Actually libmdbx was not tested with compilers older than GCC 4.2.
 * But you could ignore this warning at your own risk.
 * In such case please don't raise issues related ONLY to old compilers.
 */
#warning "libmdbx required GCC >= 4.2"
#endif

#if defined(__clang__) && !__CLANG_PREREQ(3, 8)
/* Actually libmdbx was not tested with CLANG older than 3.8.
 * But you could ignore this warning at your own risk.
 * In such case please don't raise issues related ONLY to old compilers.
 */
#warning "libmdbx required CLANG >= 3.8"
#endif

#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
/* Actually libmdbx was not tested with something older than glibc 2.12.
 * But you could ignore this warning at your own risk.
 * In such case please don't raise issues related ONLY to old systems.
 */
#warning "libmdbx was only tested with GLIBC >= 2.12."
#endif

#ifdef __SANITIZE_THREAD__
#warning                                                                       \
    "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
#endif /* __SANITIZE_THREAD__ */

/* Suppress benign warnings triggered by intentional constructs below
 * (anonymous structs, constant logical operands, reduced alignment). */
#if __has_warning("-Wnested-anon-types")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wnested-anon-types"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wnested-anon-types"
#else
#pragma warning disable "nested-anon-types"
#endif
#endif /* -Wnested-anon-types */

#if __has_warning("-Wconstant-logical-operand")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconstant-logical-operand"
#else
#pragma warning disable "constant-logical-operand"
#endif
#endif /* -Wconstant-logical-operand */

#if defined(__LCC__) && (__LCC__ <= 121)
/* bug #2798 */
#pragma diag_suppress alignment_reduction_ignored
#elif defined(__ICC)
#pragma warning(disable : 3453 1366)
#elif __has_warning("-Walignment-reduction-ignored")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Walignment-reduction-ignored"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
#else
#pragma warning disable "alignment-reduction-ignored"
#endif
#endif /* -Walignment-reduction-ignored */

/* Excludes hot internals from gprof instrumentation when profiling. */
#ifndef MDBX_EXCLUDE_FOR_GPROF
#ifdef ENABLE_GPROF
#define MDBX_EXCLUDE_FOR_GPROF                                                 \
  __attribute__((__no_instrument_function__,                                   \
                 __no_profile_instrument_function__))
#else
#define MDBX_EXCLUDE_FOR_GPROF
#endif /* ENABLE_GPROF */
#endif /* MDBX_EXCLUDE_FOR_GPROF */

#ifdef __cplusplus
extern "C" {
#endif
   972  
   973  /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
   974  
   975  /*
   976   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
   977   * and other libmdbx authors: please see AUTHORS file.
   978   * All rights reserved.
   979   *
   980   * Redistribution and use in source and binary forms, with or without
   981   * modification, are permitted only as authorized by the OpenLDAP
   982   * Public License.
   983   *
   984   * A copy of this license is available in the file LICENSE in the
   985   * top-level directory of the distribution or, alternatively, at
   986   * <http://www.OpenLDAP.org/license.html>.
   987   */
   988  
   989  
   990  /*----------------------------------------------------------------------------*/
   991  /* C11 Atomics */
   992  
   993  #if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
   994  #include <cstdatomic>
   995  #define MDBX_HAVE_C11ATOMICS
   996  #elif !defined(__cplusplus) &&                                                 \
   997      (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) &&              \
   998      !defined(__STDC_NO_ATOMICS__) &&                                           \
   999      (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) ||                            \
  1000       !(defined(__GNUC__) || defined(__clang__)))
  1001  #include <stdatomic.h>
  1002  #define MDBX_HAVE_C11ATOMICS
  1003  #elif defined(__GNUC__) || defined(__clang__)
  1004  #elif defined(_MSC_VER)
  1005  #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
  1006  #pragma warning(disable : 4133) /* 'function': incompatible types - from       \
  1007                                     'size_t' to 'LONGLONG' */
  1008  #pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to     \
  1009                                     'std::size_t', possible loss of data */
  1010  #pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to     \
  1011                                     'long', possible loss of data */
  1012  #pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
  1013  #pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
  1014  #elif defined(__APPLE__)
  1015  #include <libkern/OSAtomic.h>
  1016  #else
  1017  #error FIXME atomic-ops
  1018  #endif
  1019  
  1020  /*----------------------------------------------------------------------------*/
  1021  /* Memory/Compiler barriers, cache coherence */
  1022  
  1023  #if __has_include(<sys/cachectl.h>)
  1024  #include <sys/cachectl.h>
  1025  #elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
  1026      defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
  1027      defined(__MWERKS__) || defined(__sgi)
  1028  /* MIPS should have explicit cache control */
  1029  #include <sys/cachectl.h>
  1030  #endif
  1031  
/* Compiler-only barrier: prevents the optimizer from reordering memory
 * accesses across this point; emits no CPU fence instruction. */
MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
#if defined(__clang__) || defined(__GNUC__)
  __asm__ __volatile__("" ::: "memory");
#elif defined(_MSC_VER)
  _ReadWriteBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
  __memory_barrier();
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __compiler_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 0x3D3D */);
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
    defined(__ppc64__) || defined(__powerpc64__)
  __fence();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}
  1051  
/* Full hardware memory barrier (sequentially-consistent fence), selected per
 * compiler/platform. Prefer the C11 atomic fence when available. */
MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
#ifdef MDBX_HAVE_C11ATOMICS
  atomic_thread_fence(memory_order_seq_cst);
#elif defined(__ATOMIC_SEQ_CST)
#ifdef __clang__
  __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
#elif defined(__clang__) || defined(__GNUC__)
  __sync_synchronize();
#elif defined(_WIN32) || defined(_WIN64)
  MemoryBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
#if defined(__ia32__)
  _mm_mfence();
#else
  __mf();
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __machine_rw_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_mf();
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
    defined(__ppc64__) || defined(__powerpc64__)
  __lwsync();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}
  1083  
  1084  /*----------------------------------------------------------------------------*/
  1085  /* system-depended definitions */
  1086  
  1087  #if defined(_WIN32) || defined(_WIN64)
  1088  #define HAVE_SYS_STAT_H
  1089  #define HAVE_SYS_TYPES_H
  1090  typedef HANDLE osal_thread_t;
  1091  typedef unsigned osal_thread_key_t;
  1092  #define MAP_FAILED NULL
  1093  #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
  1094  #define THREAD_CALL WINAPI
  1095  #define THREAD_RESULT DWORD
  1096  typedef struct {
  1097    HANDLE mutex;
  1098    HANDLE event[2];
  1099  } osal_condpair_t;
  1100  typedef CRITICAL_SECTION osal_fastmutex_t;
  1101  
  1102  #if !defined(_MSC_VER) && !defined(__try)
  1103  #define __try
  1104  #define __except(COND) if (false)
  1105  #endif /* stub for MSVC's __try/__except */
  1106  
  1107  #if MDBX_WITHOUT_MSVC_CRT
  1108  
  1109  #ifndef osal_malloc
  1110  static inline void *osal_malloc(size_t bytes) {
  1111    return HeapAlloc(GetProcessHeap(), 0, bytes);
  1112  }
  1113  #endif /* osal_malloc */
  1114  
  1115  #ifndef osal_calloc
  1116  static inline void *osal_calloc(size_t nelem, size_t size) {
  1117    return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size);
  1118  }
  1119  #endif /* osal_calloc */
  1120  
#ifndef osal_realloc
static inline void *osal_realloc(void *ptr, size_t bytes) {
  /* HeapReAlloc() rejects a NULL pointer, so fall back to HeapAlloc(). */
  return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes)
             : HeapAlloc(GetProcessHeap(), 0, bytes);
}
#endif /* osal_realloc */

#ifndef osal_free
static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#endif /* osal_free */

#else /* MDBX_WITHOUT_MSVC_CRT */

/* With the CRT available, map straight to the standard allocator. */
#define osal_malloc malloc
#define osal_calloc calloc
#define osal_realloc realloc
#define osal_free free
#define osal_strdup _strdup

#endif /* MDBX_WITHOUT_MSVC_CRT */

/* NOTE(review): _snprintf() does not NUL-terminate on truncation, unlike C99
 * snprintf(); callers must tolerate that or guarantee no truncation. */
#ifndef snprintf
#define snprintf _snprintf /* ntdll */
#endif

#ifndef vsnprintf
#define vsnprintf _vsnprintf /* ntdll */
#endif

/* Multibyte (char) to wide (wchar_t) pathname conversion for Windows. */
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
                                    size_t src_n);

#else /*----------------------------------------------------------------------*/

/* POSIX flavors of the same abstractions. */
typedef pthread_t osal_thread_t;
typedef pthread_key_t osal_thread_key_t;
#define INVALID_HANDLE_VALUE (-1)
#define THREAD_CALL
#define THREAD_RESULT void *
typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond[2];
} osal_condpair_t;
typedef pthread_mutex_t osal_fastmutex_t;
#define osal_malloc malloc
#define osal_calloc calloc
#define osal_realloc realloc
#define osal_free free
#define osal_strdup strdup
#endif /* Platform */
  1171  
/* Provide malloc_usable_size() where the platform spells it differently. */
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
/* malloc_usable_size() already provided */
#elif defined(__APPLE__)
#define malloc_usable_size(ptr) malloc_size(ptr)
#elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT
#define malloc_usable_size(ptr) _msize(ptr)
#endif /* malloc_usable_size */

/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */

/* Get the size of a memory page for the system.
 * This is the basic size that the platform's memory manager uses, and is
 * fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwPageSize;
#else
  return sysconf(_SC_PAGE_SIZE);
#endif
}

/* Native character type for filesystem paths (wide chars on Windows). */
#if defined(_WIN32) || defined(_WIN64)
typedef wchar_t pathchar_t;
#else
typedef char pathchar_t;
#endif

/* Descriptor of one memory-mapped file region. */
typedef struct osal_mmap_param {
  union {
    void *address;
    uint8_t *dxb;
    struct MDBX_lockinfo *lck;
  };
  mdbx_filehandle_t fd;
  size_t limit;   /* mapping length, but NOT a size of file nor DB */
  size_t current; /* mapped region size, i.e. the size of file and DB */
  uint64_t filesize /* in-process cache of a file size */;
#if defined(_WIN32) || defined(_WIN64)
  HANDLE section; /* memory-mapped section handle */
#endif
} osal_mmap_t;

/* 128-bit value viewable as two 64-bit or four 32-bit lanes. */
typedef union bin128 {
  __anonymous_struct_extension__ struct { uint64_t x, y; };
  __anonymous_struct_extension__ struct { uint32_t a, b, c, d; };
} bin128_t;

#if defined(_WIN32) || defined(_WIN64)
/* Overlay of the native SRWLOCK (slim reader/writer lock) for Windows. */
typedef union osal_srwlock {
  __anonymous_struct_extension__ struct {
    long volatile readerCount;
    long volatile writerCount;
  };
  RTL_SRWLOCK native;
} osal_srwlock_t;
#endif /* Windows */
  1232  
#ifndef __cplusplus

/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */

/* NOTE(review): the '!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)' condition
 * looks inverted — with glibc absent __GLIBC_PREREQ normally evaluates to 0,
 * so this branch appears unreachable; verify against upstream intent. */
#if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) &&                           \
    (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
#define osal_asprintf asprintf
#define osal_vasprintf vasprintf
#else
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
    MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...);
MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
#endif

/* FreeBSD spells the madvise() core-dump control flags differently. */
#if !defined(MADV_DODUMP) && defined(MADV_CORE)
#define MADV_DODUMP MADV_CORE
#endif /* MADV_CORE -> MADV_DODUMP */

#if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
#define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */

/* Testing helpers — presumably inject small delays to shake out races;
 * see the implementation elsewhere in this file. */
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);

/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#endif

#if defined(__linux__) || defined(__gnu_linux__)
MDBX_INTERNAL_VAR uint32_t linux_kernel_version;
MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
#endif /* Linux */

#ifndef osal_strdup
LIBMDBX_API char *osal_strdup(const char *str);
#endif
  1274  
  1275  MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
  1276  #if defined(_WIN32) || defined(_WIN64)
  1277    DWORD rc = GetLastError();
  1278  #else
  1279    int rc = errno;
  1280  #endif
  1281    return rc;
  1282  }
  1283  
/* Aligned allocation pair; pointers from osal_memalign_alloc() must be
 * released with osal_memalign_free(). */
#ifndef osal_memalign_alloc
MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes,
                                           void **result);
#endif
#ifndef osal_memalign_free
MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr);
#endif

/* Operations over the mutex + two-condition bundle (osal_condpair_t). */
MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
                                            bool part);
MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part);
MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair);

/* Intra-process mutex (CRITICAL_SECTION / pthread_mutex_t, see typedefs). */
MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);

/* Positioned and sequential file I/O wrappers. */
MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
                                    int iovcnt, uint64_t offset,
                                    size_t expected_written);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
                                  uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
                                   size_t count, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
                                  size_t count);

MDBX_INTERNAL_FUNC int
osal_thread_create(osal_thread_t *thread,
                   THREAD_RESULT(THREAD_CALL *start_routine)(void *),
                   void *arg);
MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread);

/* Durability level requested from sync operations; values are or-able bits. */
enum osal_syncmode_bits {
  MDBX_SYNC_NONE = 0,
  MDBX_SYNC_DATA = 1,
  MDBX_SYNC_SIZE = 2,
  MDBX_SYNC_IODQ = 4
};

MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
                                  const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
  1333  
/* Intent with which a file is being opened; steers platform open flags. */
enum osal_openfile_purpose {
  MDBX_OPEN_DXB_READ = 0,
  MDBX_OPEN_DXB_LAZY = 1,
  MDBX_OPEN_DXB_DSYNC = 2,
  MDBX_OPEN_LCK = 3,
  MDBX_OPEN_COPY = 4,
  MDBX_OPEN_DELETE = 5
};

MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
                                     const MDBX_env *env,
                                     const pathchar_t *pathname,
                                     mdbx_filehandle_t *fd,
                                     mdbx_mode_t unix_mode_bits);
MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait);

/* Memory-mapping primitives over osal_mmap_t. */
#define MMAP_OPTION_TRUNCATE 1
#define MMAP_OPTION_SEMAPHORE 2
MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map,
                                 const size_t must, const size_t limit,
                                 const unsigned options);
MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map);
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
                                    size_t size, size_t limit);
#if defined(_WIN32) || defined(_WIN64)
/* Windows-only: other threads are suspended around a remap (per the function
 * names below); the array records the suspended thread handles to resume. */
typedef struct {
  unsigned limit, count;
  HANDLE handles[31];
} mdbx_handle_array_t;
MDBX_INTERNAL_FUNC int
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
                                            const pathchar_t *pathname,
                                            int err);
  1380  
/* Returns the current process id as uint32_t; the static asserts guard
 * against platforms whose pid type is wider than 32 bits. */
MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
  STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
#if defined(_WIN32) || defined(_WIN64)
  return GetCurrentProcessId();
#else
  STATIC_ASSERT(sizeof(pid_t) <= sizeof(uint32_t));
  return getpid();
#endif
}
  1390  
/* Returns the current thread's identifier widened to uintptr_t. */
MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
  mdbx_tid_t thunk;
  STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
#if defined(_WIN32) || defined(_WIN64)
  thunk = GetCurrentThreadId();
#else
  thunk = pthread_self();
#endif
  return (uintptr_t)thunk;
}
  1401  
#if !defined(_WIN32) && !defined(_WIN64)
/* Bionic (Android libc) gets an out-of-line thread-id check before locking
 * (see implementation elsewhere); other platforms use a zero-cost stub. */
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void);
#else
static __inline int osal_check_tid4bionic(void) { return 0; }
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */

/* pthread_mutex_lock() guarded by the bionic tid check above; returns the
 * check's error code without locking if the check fails. */
MDBX_MAYBE_UNUSED static __inline int
osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
  int err = osal_check_tid4bionic();
  return unlikely(err) ? err : pthread_mutex_lock(mutex);
}
#endif /* !Windows */
  1415  
/* Monotonic time: osal_monotime() yields an opaque monotonic value, with
 * converters to/from a 16.16 fixed-point seconds representation. */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);

/* 128-bit identifier of the current system boot session. */
MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
  1421  /*----------------------------------------------------------------------------*/
  1422  /* lck stuff */
  1423  
  1424  /// \brief Initialization of synchronization primitives linked with MDBX_env
  1425  ///   instance both in LCK-file and within the current process.
  1426  /// \param
  1427  ///   global_uniqueness_flag = true - denotes that there are no other processes
  1428  ///     working with DB and LCK-file. Thus the function MUST initialize
  1429  ///     shared synchronization objects in memory-mapped LCK-file.
  1430  ///   global_uniqueness_flag = false - denotes that at least one process is
  1431  ///     already working with DB and LCK-file, including the case when DB
  1432  ///     has already been opened in the current process. Thus the function
  1433  ///     MUST NOT initialize shared synchronization objects in memory-mapped
  1434  ///     LCK-file that are already in use.
  1435  /// \return Error code or zero on success.
  1436  MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
  1437                                       MDBX_env *inprocess_neighbor,
  1438                                       int global_uniqueness_flag);
  1439  
  1440  /// \brief Disconnects from shared interprocess objects and destructs
  1441  ///   synchronization objects linked with MDBX_env instance
  1442  ///   within the current process.
  1443  /// \param
  1444  ///   inprocess_neighbor = NULL - if the current process does not have other
  1445  ///     instances of MDBX_env linked with the DB being closed.
  1446  ///     Thus the function MUST check for other processes working with DB or
  1447  ///     LCK-file, and keep or destroy shared synchronization objects in
  1448  ///     memory-mapped LCK-file depending on the result.
  1449  ///   inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
  1450  ///     (anyone of there is several) working with DB or LCK-file within the
  1451  ///     current process. Thus the function MUST NOT try to acquire exclusive
  1452  ///     lock and/or try to destruct shared synchronization objects linked with
  1453  ///     DB or LCK-file. Moreover, the implementation MUST ensure correct work
  1454  ///     of other instances of MDBX_env within the current process, e.g.
  1455  ///     restore POSIX-fcntl locks after the closing of file descriptors.
  1456  /// \return Error code (MDBX_PANIC) or zero on success.
  1457  MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
  1458                                          MDBX_env *inprocess_neighbor);
  1459  
  1460  /// \brief Connects to shared interprocess locking objects and tries to acquire
  1461  ///   the maximum lock level (shared if exclusive is not available)
  1462  ///   Depending on implementation or/and platform (Windows) this function may
  1463  ///   acquire the non-OS super-level lock (e.g. for shared synchronization
  1464  ///   objects initialization), which will be downgraded to OS-exclusive or
  1465  ///   shared via explicit calling of osal_lck_downgrade().
  1466  /// \return
  1467  ///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
  1468  ///     the current process is the first and only after the last use of DB.
  1469  ///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
  1470  ///     DB has already been opened and now is used by other processes.
  1471  ///   Otherwise (not 0 and not -1) - error code.
  1472  MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env);
  1473  
  1474  /// \brief Downgrades the level of initially acquired lock to
///   operational level specified by argument. The reasons for such a downgrade:
  1476  ///    - unblocking of other processes that are waiting for access, i.e.
  1477  ///      if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
  1478  ///      should be made aware that access is unavailable rather than
  1479  ///      wait for it.
  1480  ///    - freeing locks that interfere file operation (especially for Windows)
  1481  ///   (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
  1482  ///   (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
  1483  ///   operational lock.
  1484  /// \return Error code or zero on success
  1485  MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env);
  1486  
  1487  /// \brief Locks LCK-file or/and table of readers for (de)registering.
  1488  /// \return Error code or zero on success
  1489  MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env);
  1490  
  1491  /// \brief Unlocks LCK-file or/and table of readers after (de)registering.
  1492  MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env);
  1493  
  1494  /// \brief Acquires lock for DB change (on writing transaction start)
  1495  ///   Reading transactions will not be blocked.
  1496  ///   Declared as LIBMDBX_API because it is used in mdbx_chk.
  1497  /// \return Error code or zero on success
  1498  LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);
  1499  
/// \brief Releases the lock once DB changes are made (after a writing transaction
  1501  ///   has finished).
  1502  ///   Declared as LIBMDBX_API because it is used in mdbx_chk.
  1503  LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
  1504  
  1505  /// \brief Sets alive-flag of reader presence (indicative lock) for PID of
  1506  ///   the current process. The function does no more than needed for
  1507  ///   the correct working of osal_rpid_check() in other processes.
  1508  /// \return Error code or zero on success
  1509  MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env);
  1510  
  1511  /// \brief Resets alive-flag of reader presence (indicative lock)
  1512  ///   for PID of the current process. The function does no more than needed
  1513  ///   for the correct working of osal_rpid_check() in other processes.
  1514  /// \return Error code or zero on success
  1515  MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env);
  1516  
  1517  /// \brief Checks for reading process status with the given pid with help of
  1518  ///   alive-flag of presence (indicative lock) or using another way.
  1519  /// \return
  1520  ///   MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
  1521  ///     and working with DB (indicative lock is present).
  1522  ///   MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
  1523  ///     or not working with DB (indicative lock is not present).
  1524  ///   Otherwise (not 0 and not -1) - error code.
  1525  MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid);
  1526  
  1527  #if defined(_WIN32) || defined(_WIN64)
  1528  
/// \brief Converts the multi-byte string FROM into a wide-character string
///   placed on the caller's stack and assigns the result to TO.
///   First probes osal_mb2w() for the required length, rejects empty or
///   oversized (> INT16_MAX) results, then allocates via _alloca() and
///   performs the actual conversion (including the terminating NUL).
/// \note On any failure this macro executes `return ERROR_INVALID_NAME;`
///   from the ENCLOSING function, so it may only be used inside functions
///   returning a Windows error code.
/// \note Because the buffer comes from _alloca(), TO is valid only until
///   the enclosing function returns.
#define OSAL_MB2WIDE(FROM, TO)                                                 \
  do {                                                                         \
    const char *const from_tmp = (FROM);                                       \
    const size_t from_mblen = strlen(from_tmp);                                \
    const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen);        \
    if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX)                     \
      return ERROR_INVALID_NAME;                                               \
    wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t));          \
    if (to_wlen + 1 !=                                                         \
        osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1))              \
      return ERROR_INVALID_NAME;                                               \
    (TO) = to_tmp;                                                             \
  } while (0)
  1542  
  1543  typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *);
  1544  MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init,
  1545      osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared,
  1546      osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive;
  1547  
  1548  #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
/* Local replacement for the FILE_INFO_BY_HANDLE_CLASS enumeration, which is
 * absent from SDKs targeting pre-Vista Windows. The enumerator order (and
 * therefore the values) must exactly match the system definition, since
 * these values are passed to Get/SetFileInformationByHandle(Ex). */
typedef enum _FILE_INFO_BY_HANDLE_CLASS {
  FileBasicInfo,
  FileStandardInfo,
  FileNameInfo,
  FileRenameInfo,
  FileDispositionInfo,
  FileAllocationInfo,
  FileEndOfFileInfo,
  FileStreamInfo,
  FileCompressionInfo,
  FileAttributeTagInfo,
  FileIdBothDirectoryInfo,
  FileIdBothDirectoryRestartInfo,
  FileIoPriorityHintInfo,
  FileRemoteProtocolInfo,
  MaximumFileInfoByHandleClass
} FILE_INFO_BY_HANDLE_CLASS,
    *PFILE_INFO_BY_HANDLE_CLASS;
  1567  
/* Local replacement for FILE_END_OF_FILE_INFO (absent on pre-Vista SDKs);
 * the layout must mirror the system definition used with FileEndOfFileInfo. */
typedef struct _FILE_END_OF_FILE_INFO {
  LARGE_INTEGER EndOfFile;
} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;
  1571  
  1572  #define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
  1573  #define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002
  1574  
/* Local replacement for FILE_REMOTE_PROTOCOL_INFO (absent on pre-Vista
 * SDKs); the layout must mirror the system definition returned for the
 * FileRemoteProtocolInfo information class. */
typedef struct _FILE_REMOTE_PROTOCOL_INFO {
  USHORT StructureVersion;
  USHORT StructureSize;
  DWORD Protocol;
  USHORT ProtocolMajorVersion;
  USHORT ProtocolMinorVersion;
  USHORT ProtocolRevision;
  USHORT Reserved;
  DWORD Flags;
  struct {
    DWORD Reserved[8];
  } GenericReserved;
  struct {
    DWORD Reserved[16];
  } ProtocolSpecificReserved;
} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;
  1591  
  1592  #endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
  1593  
  1594  typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
  1595      _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
  1596      _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
  1597  MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx
  1598      mdbx_GetFileInformationByHandleEx;
  1599  
  1600  typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
  1601      _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
  1602      _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
  1603      _Out_opt_ LPDWORD lpMaximumComponentLength,
  1604      _Out_opt_ LPDWORD lpFileSystemFlags,
  1605      _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
  1606  MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW
  1607      mdbx_GetVolumeInformationByHandleW;
  1608  
  1609  typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
  1610                                                        _Out_ LPWSTR lpszFilePath,
  1611                                                        _In_ DWORD cchFilePath,
  1612                                                        _In_ DWORD dwFlags);
  1613  MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
  1614  
  1615  typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
  1616      _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
  1617      _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
  1618  MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle
  1619      mdbx_SetFileInformationByHandle;
  1620  
  1621  typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
  1622      IN HANDLE FileHandle, IN OUT HANDLE Event,
  1623      IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
  1624      OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
  1625      IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
  1626      OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
  1627  MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile;
  1628  
  1629  typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
  1630  MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64;
  1631  
  1632  #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
/* Local replacement for WIN32_MEMORY_RANGE_ENTRY on SDKs older than
 * Windows 8; the layout must match the system definition consumed by
 * PrefetchVirtualMemory() (see MDBX_PrefetchVirtualMemory below). */
typedef struct _WIN32_MEMORY_RANGE_ENTRY {
  PVOID VirtualAddress;
  SIZE_T NumberOfBytes;
} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
  1637  #endif /* Windows 8.x */
  1638  
  1639  typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
  1640      HANDLE hProcess, ULONG_PTR NumberOfEntries,
  1641      PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
  1642  MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
  1643  
  1644  typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
  1645  
  1646  typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
  1647                                                IN PLARGE_INTEGER NewSectionSize);
  1648  MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection;
  1649  
  1650  static __inline bool mdbx_RunningUnderWine(void) {
  1651    return !mdbx_NtExtendSection;
  1652  }
  1653  
  1654  typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
  1655                                             LPCSTR lpValue, DWORD dwFlags,
  1656                                             LPDWORD pdwType, PVOID pvData,
  1657                                             LPDWORD pcbData);
  1658  MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
  1659  
  1660  NTSYSAPI ULONG RtlRandomEx(PULONG Seed);
  1661  
  1662  #endif /* Windows */
  1663  
  1664  #endif /* !__cplusplus */
  1665  
  1666  /*----------------------------------------------------------------------------*/
  1667  
  1668  #if defined(_MSC_VER) && _MSC_VER >= 1900
  1669  /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
  1670   * for internal format-args checker. */
  1671  #undef PRIuPTR
  1672  #undef PRIiPTR
  1673  #undef PRIdPTR
  1674  #undef PRIxPTR
  1675  #define PRIuPTR "Iu"
  1676  #define PRIiPTR "Ii"
  1677  #define PRIdPTR "Id"
  1678  #define PRIxPTR "Ix"
  1679  #define PRIuSIZE "zu"
  1680  #define PRIiSIZE "zi"
  1681  #define PRIdSIZE "zd"
  1682  #define PRIxSIZE "zx"
  1683  #endif /* fix PRI*PTR for _MSC_VER */
  1684  
  1685  #ifndef PRIuSIZE
  1686  #define PRIuSIZE PRIuPTR
  1687  #define PRIiSIZE PRIiPTR
  1688  #define PRIdSIZE PRIdPTR
  1689  #define PRIxSIZE PRIxPTR
  1690  #endif /* PRI*SIZE macros for MSVC */
  1691  
  1692  #ifdef _MSC_VER
  1693  #pragma warning(pop)
  1694  #endif
  1695  
  1696  #define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
  1697  #if defined(xMDBX_TOOLS)
  1698  extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
  1699  #endif
  1700  
  1701  /*******************************************************************************
  1702   *******************************************************************************
  1703   *******************************************************************************
  1704   *
  1705   *
  1706   *         ####   #####    #####     #     ####   #    #   ####
  1707   *        #    #  #    #     #       #    #    #  ##   #  #
  1708   *        #    #  #    #     #       #    #    #  # #  #   ####
  1709   *        #    #  #####      #       #    #    #  #  # #       #
  1710   *        #    #  #          #       #    #    #  #   ##  #    #
  1711   *         ####   #          #       #     ####   #    #   ####
  1712   *
  1713   *
  1714   */
  1715  
  1716  /** \defgroup build_option Build options
  1717   * The libmdbx build options.
  1718   @{ */
  1719  
  1720  /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
  1721  #define MDBX_OSX_WANNA_DURABILITY 0
  1722  /** Using fsync() with chance of data lost on power failure */
  1723  #define MDBX_OSX_WANNA_SPEED 1
  1724  
  1725  #ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
  1726  /** Choices \ref MDBX_OSX_WANNA_DURABILITY or \ref MDBX_OSX_WANNA_SPEED
  1727   * for OSX & iOS */
  1728  #define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
  1729  #endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */
  1730  
  1731  /** Controls checking PID against reuse DB environment after the fork() */
  1732  #ifndef MDBX_ENV_CHECKPID
  1733  #if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
  1734  /* PID check could be omitted:
  1735   *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
  1736   *    mapped pages will not be available for child process.
  1737   *  - in Windows where fork() not available. */
  1738  #define MDBX_ENV_CHECKPID 0
  1739  #else
  1740  #define MDBX_ENV_CHECKPID 1
  1741  #endif
  1742  #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
  1743  #else
  1744  #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
  1745  #endif /* MDBX_ENV_CHECKPID */
  1746  
  1747  /** Controls checking transaction owner thread against misuse transactions from
  1748   * other threads. */
  1749  #ifndef MDBX_TXN_CHECKOWNER
  1750  #define MDBX_TXN_CHECKOWNER 1
  1751  #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
  1752  #else
  1753  #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
  1754  #endif /* MDBX_TXN_CHECKOWNER */
  1755  
  1756  /** Does a system have battery-backed Real-Time Clock or just a fake. */
  1757  #ifndef MDBX_TRUST_RTC
  1758  #if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) ||     \
  1759      defined(__OpenBSD__)
  1760  #define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */
  1761  #else
  1762  #define MDBX_TRUST_RTC 1
  1763  #endif
  1764  #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC)
  1765  #else
  1766  #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC)
  1767  #endif /* MDBX_TRUST_RTC */
  1768  
  1769  /** Controls online database auto-compactification during write-transactions. */
  1770  #ifndef MDBX_ENABLE_REFUND
  1771  #define MDBX_ENABLE_REFUND 1
  1772  #elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
  1773  #error MDBX_ENABLE_REFUND must be defined as 0 or 1
  1774  #endif /* MDBX_ENABLE_REFUND */
  1775  
  1776  /** Controls gathering statistics for page operations. */
  1777  #ifndef MDBX_ENABLE_PGOP_STAT
  1778  #define MDBX_ENABLE_PGOP_STAT 1
  1779  #elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1)
  1780  #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
  1781  #endif /* MDBX_ENABLE_PGOP_STAT */
  1782  
/** Enables chunking of the long list of retired pages during the commit of
 * huge transactions, to avoid the need for long sequences of pages. */
  1785  #ifndef MDBX_ENABLE_BIGFOOT
  1786  #if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
  1787  #define MDBX_ENABLE_BIGFOOT 1
  1788  #else
  1789  #define MDBX_ENABLE_BIGFOOT 0
  1790  #endif
  1791  #elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1)
  1792  #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
  1793  #endif /* MDBX_ENABLE_BIGFOOT */
  1794  
  1795  /** Controls use of POSIX madvise() hints and friends. */
  1796  #ifndef MDBX_ENABLE_MADVISE
  1797  #define MDBX_ENABLE_MADVISE 1
  1798  #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
  1799  #error MDBX_ENABLE_MADVISE must be defined as 0 or 1
  1800  #endif /* MDBX_ENABLE_MADVISE */
  1801  
  1802  /** Disable some checks to reduce an overhead and detection probability of
  1803   * database corruption to a values closer to the LMDB. */
  1804  #ifndef MDBX_DISABLE_VALIDATION
  1805  #define MDBX_DISABLE_VALIDATION 0
  1806  #elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1)
  1807  #error MDBX_DISABLE_VALIDATION must be defined as 0 or 1
  1808  #endif /* MDBX_DISABLE_VALIDATION */
  1809  
  1810  #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT
  1811  #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1
  1812  #elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
  1813          MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1)
  1814  #error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
  1815  #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  1816  
  1817  #ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT
  1818  #define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1
  1819  #elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
  1820          MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1)
  1821  #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
  1822  #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
  1823  
  1824  /** Basically, this build-option is for TODO. Guess it should be replaced
  1825   * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
  1826   *  0/OFF = Don't track dirty pages at all and don't spilling ones.
  1827   *          This should be by-default on Linux and may-be other systems
  1828   *          (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
  1829   *          properly LRU tracking and async writing on-demand.
  1830   *  1/ON  = Lite tracking of dirty pages but with LRU labels and explicit
  1831   *          spilling with msync(MS_ASYNC). */
  1832  #ifndef MDBX_FAKE_SPILL_WRITEMAP
  1833  #if defined(__linux__) || defined(__gnu_linux__)
  1834  #define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
  1835  #else
  1836  #define MDBX_FAKE_SPILL_WRITEMAP 0
  1837  #endif
  1838  #elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
  1839  #error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
  1840  #endif /* MDBX_FAKE_SPILL_WRITEMAP */
  1841  
/** Controls the sort order of internal page number lists.
 * This is a mostly experimental/advanced option, not intended for regular
 * MDBX users.
 * \warning The database format depends on this option, and libmdbx builds
 * with different option values are incompatible. */
  1846  #ifndef MDBX_PNL_ASCENDING
  1847  #define MDBX_PNL_ASCENDING 0
  1848  #elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
  1849  #error MDBX_PNL_ASCENDING must be defined as 0 or 1
  1850  #endif /* MDBX_PNL_ASCENDING */
  1851  
  1852  /** Avoid dependence from MSVC CRT and use ntdll.dll instead. */
  1853  #ifndef MDBX_WITHOUT_MSVC_CRT
  1854  #define MDBX_WITHOUT_MSVC_CRT 1
  1855  #elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
  1856  #error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
  1857  #endif /* MDBX_WITHOUT_MSVC_CRT */
  1858  
/** Size of the buffer used while copying an environment/database file. */
  1860  #ifndef MDBX_ENVCOPY_WRITEBUF
  1861  #define MDBX_ENVCOPY_WRITEBUF 1048576u
  1862  #elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \
  1863      MDBX_ENVCOPY_WRITEBUF % 65536u
  1864  #error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536
  1865  #endif /* MDBX_ENVCOPY_WRITEBUF */
  1866  
  1867  /** Forces assertion checking */
  1868  #ifndef MDBX_FORCE_ASSERTIONS
  1869  #define MDBX_FORCE_ASSERTIONS 0
  1870  #elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1)
  1871  #error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1
  1872  #endif /* MDBX_FORCE_ASSERTIONS */
  1873  
  1874  /** Presumed malloc size overhead for each allocation
  1875   * to adjust allocations to be more aligned. */
  1876  #ifndef MDBX_ASSUME_MALLOC_OVERHEAD
  1877  #ifdef __SIZEOF_POINTER__
  1878  #define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u)
  1879  #else
  1880  #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
  1881  #endif
  1882  #elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 ||   \
  1883      MDBX_ASSUME_MALLOC_OVERHEAD % 4
  1884  #error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4
  1885  #endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
  1886  
  1887  /** If defined then enables integration with Valgrind,
  1888   * a memory analyzing tool. */
  1889  #ifndef MDBX_USE_VALGRIND
  1890  #endif /* MDBX_USE_VALGRIND */
  1891  
  1892  /** If defined then enables use C11 atomics,
  1893   *  otherwise detects ones availability automatically. */
  1894  #ifndef MDBX_HAVE_C11ATOMICS
  1895  #endif /* MDBX_HAVE_C11ATOMICS */
  1896  
  1897  //------------------------------------------------------------------------------
  1898  
  1899  /** Win32 File Locking API for \ref MDBX_LOCKING */
  1900  #define MDBX_LOCKING_WIN32FILES -1
  1901  
  1902  /** SystemV IPC semaphores for \ref MDBX_LOCKING */
  1903  #define MDBX_LOCKING_SYSV 5
  1904  
  1905  /** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */
  1906  #define MDBX_LOCKING_POSIX1988 1988
  1907  
  1908  /** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */
  1909  #define MDBX_LOCKING_POSIX2001 2001
  1910  
  1911  /** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */
  1912  #define MDBX_LOCKING_POSIX2008 2008
  1913  
  1914  /** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */
  1915  #define MDBX_LOCKING_BENAPHORE 1995
  1916  
  1917  /** Advanced: Choices the locking implementation (autodetection by default). */
  1918  #if defined(_WIN32) || defined(_WIN64)
  1919  #define MDBX_LOCKING MDBX_LOCKING_WIN32FILES
  1920  #else
  1921  #ifndef MDBX_LOCKING
  1922  #if defined(_POSIX_THREAD_PROCESS_SHARED) &&                                   \
  1923      _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__)
  1924  
/* Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. If in doubt, compile with -DMDBX_LOCKING=2001. */
  1927  #if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L &&          \
  1928      ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) &&                            \
  1929        _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) ||                                \
  1930       (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) &&                            \
  1931        _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) ||                                \
  1932       defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) &&     \
  1933      (!defined(__GLIBC__) ||                                                    \
  1934       __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */)
  1935  #define MDBX_LOCKING MDBX_LOCKING_POSIX2008
  1936  #else
  1937  #define MDBX_LOCKING MDBX_LOCKING_POSIX2001
  1938  #endif
  1939  #elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
  1940  #define MDBX_LOCKING MDBX_LOCKING_POSIX1988
  1941  #else
  1942  #define MDBX_LOCKING MDBX_LOCKING_SYSV
  1943  #endif
  1944  #define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING)
  1945  #else
  1946  #define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING)
  1947  #endif /* MDBX_LOCKING */
  1948  #endif /* !Windows */
  1949  
  1950  /** Advanced: Using POSIX OFD-locks (autodetection by default). */
  1951  #ifndef MDBX_USE_OFDLOCKS
  1952  #if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) &&   \
  1953      !defined(MDBX_SAFE4QEMU) &&                                                \
  1954      !defined(__sun) /* OFD-lock are broken on Solaris */
  1955  #define MDBX_USE_OFDLOCKS 1
  1956  #else
  1957  #define MDBX_USE_OFDLOCKS 0
  1958  #endif
  1959  #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
  1960  #else
  1961  #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
  1962  #endif /* MDBX_USE_OFDLOCKS */
  1963  
  1964  /** Advanced: Using sendfile() syscall (autodetection by default). */
  1965  #ifndef MDBX_USE_SENDFILE
  1966  #if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
  1967       !defined(__ANDROID_API__)) ||                                             \
  1968      (defined(__ANDROID_API__) && __ANDROID_API__ >= 21)
  1969  #define MDBX_USE_SENDFILE 1
  1970  #else
  1971  #define MDBX_USE_SENDFILE 0
  1972  #endif
  1973  #endif /* MDBX_USE_SENDFILE */
  1974  
  1975  /** Advanced: Using copy_file_range() syscall (autodetection by default). */
  1976  #ifndef MDBX_USE_COPYFILERANGE
  1977  #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
  1978  #define MDBX_USE_COPYFILERANGE 1
  1979  #else
  1980  #define MDBX_USE_COPYFILERANGE 0
  1981  #endif
  1982  #endif /* MDBX_USE_COPYFILERANGE */
  1983  
  1984  /** Advanced: Using sync_file_range() syscall (autodetection by default). */
  1985  #ifndef MDBX_USE_SYNCFILERANGE
  1986  #if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
  1987       defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) ||           \
  1988      (defined(__ANDROID_API__) && __ANDROID_API__ >= 26)
  1989  #define MDBX_USE_SYNCFILERANGE 1
  1990  #else
  1991  #define MDBX_USE_SYNCFILERANGE 0
  1992  #endif
  1993  #endif /* MDBX_USE_SYNCFILERANGE */
  1994  
  1995  //------------------------------------------------------------------------------
  1996  
  1997  #ifndef MDBX_CPU_WRITEBACK_INCOHERENT
  1998  #if defined(__ia32__) || defined(__e2k__) || defined(__hppa) ||                \
  1999      defined(__hppa__) || defined(DOXYGEN)
  2000  #define MDBX_CPU_WRITEBACK_INCOHERENT 0
  2001  #else
  2002  #define MDBX_CPU_WRITEBACK_INCOHERENT 1
  2003  #endif
  2004  #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
  2005  
  2006  #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
  2007  #ifdef __OpenBSD__
  2008  #define MDBX_MMAP_INCOHERENT_FILE_WRITE 1
  2009  #else
  2010  #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0
  2011  #endif
  2012  #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
  2013  
  2014  #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
  2015  #if defined(__mips) || defined(__mips__) || defined(__mips64) ||               \
  2016      defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
  2017      defined(__MWERKS__) || defined(__sgi)
  2018  /* MIPS has cache coherency issues. */
  2019  #define MDBX_MMAP_INCOHERENT_CPU_CACHE 1
  2020  #else
  2021  /* LY: assume no relevant mmap/dcache issues. */
  2022  #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0
  2023  #endif
  2024  #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
  2025  
  2026  #ifndef MDBX_64BIT_ATOMIC
  2027  #if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
  2028  #define MDBX_64BIT_ATOMIC 1
  2029  #else
  2030  #define MDBX_64BIT_ATOMIC 0
  2031  #endif
  2032  #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
  2033  #else
  2034  #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
  2035  #endif /* MDBX_64BIT_ATOMIC */
  2036  
  2037  #ifndef MDBX_64BIT_CAS
  2038  #if defined(ATOMIC_LLONG_LOCK_FREE)
  2039  #if ATOMIC_LLONG_LOCK_FREE > 1
  2040  #define MDBX_64BIT_CAS 1
  2041  #else
  2042  #define MDBX_64BIT_CAS 0
  2043  #endif
  2044  #elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
  2045  #if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
  2046  #define MDBX_64BIT_CAS 1
  2047  #else
  2048  #define MDBX_64BIT_CAS 0
  2049  #endif
  2050  #elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE)
  2051  #if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1
  2052  #define MDBX_64BIT_CAS 1
  2053  #else
  2054  #define MDBX_64BIT_CAS 0
  2055  #endif
  2056  #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
  2057  #define MDBX_64BIT_CAS 1
  2058  #else
  2059  #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC
  2060  #endif
  2061  #define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS)
  2062  #else
  2063  #define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS)
  2064  #endif /* MDBX_64BIT_CAS */
  2065  
  2066  #ifndef MDBX_UNALIGNED_OK
  2067  #if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) ||                 \
  2068      defined(ENABLE_UBSAN)
  2069  #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */
  2070  #elif defined(__ARM_FEATURE_UNALIGNED)
  2071  #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */
  2072  #elif defined(__e2k__) || defined(__elbrus__)
  2073  #if __iset__ > 4
  2074  #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */
  2075  #else
  2076  #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */
  2077  #endif
  2078  #elif defined(__ia32__)
  2079  #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */
  2080  #elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0)
  2081  /* expecting an optimization will well done, also this
  2082   * hushes false-positives from UBSAN (undefined behaviour sanitizer) */
  2083  #define MDBX_UNALIGNED_OK 0
  2084  #else
  2085  #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */
  2086  #endif
  2087  #elif MDBX_UNALIGNED_OK == 1
  2088  #undef MDBX_UNALIGNED_OK
  2089  #define MDBX_UNALIGNED_OK 32 /* any unaligned access allowed */
  2090  #endif                       /* MDBX_UNALIGNED_OK */
  2091  
  2092  #ifndef MDBX_CACHELINE_SIZE
  2093  #if defined(SYSTEM_CACHE_ALIGNMENT_SIZE)
  2094  #define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE
  2095  #elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
  2096  #define MDBX_CACHELINE_SIZE 128
  2097  #else
  2098  #define MDBX_CACHELINE_SIZE 64
  2099  #endif
  2100  #endif /* MDBX_CACHELINE_SIZE */
  2101  
  2102  /** @} end of build options */
  2103  /*******************************************************************************
  2104   *******************************************************************************
  2105   ******************************************************************************/
  2106  
  2107  #ifndef DOXYGEN
  2108  
  2109  /* In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */
  2110  #ifndef MDBX_DEBUG
  2111  #ifdef NDEBUG
  2112  #define MDBX_DEBUG 0
  2113  #else
  2114  #define MDBX_DEBUG 1
  2115  #endif
  2116  #endif /* MDBX_DEBUG */
  2117  
  2118  #else
  2119  
  2120  /* !!! Actually this is a fake definitions for Doxygen !!! */
  2121  
  2122  /** Controls enabling of debugging features.
  2123   *
  2124   *  - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all,
  2125   *                     including logging and assertion controls.
  2126   *                     Logging level and corresponding debug flags changing
  2127   *                     by \ref mdbx_setup_debug() will not have effect.
  2128   *  - `MDBX_DEBUG > 0` Enables code for the debugging features (logging,
  2129   *                     assertions checking and internal audit).
  2130   *                     Simultaneously sets the default logging level
  2131   *                     to the `MDBX_DEBUG` value.
  2132   *                     Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
  2133   *
  2134   * \ingroup build_option */
  2135  #define MDBX_DEBUG 0...7
  2136  
  2137  /** Disables using of GNU libc extensions. */
  2138  #define MDBX_DISABLE_GNU_SOURCE 0 or 1
  2139  
  2140  #endif /* DOXYGEN */
  2141  
  2142  /* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
  2143  #if MDBX_DEBUG
  2144  #undef NDEBUG
  2145  #endif
  2146  
  2147  /*----------------------------------------------------------------------------*/
  2148  /* Atomics */
  2149  
/* Memory-ordering constraints accepted by the atomic helpers below.
 * Only the two orders actually used by the code are defined; they are
 * translated to C11 orders by mo_c11_store()/mo_c11_load():
 *  - mo_Relaxed        -> memory_order_relaxed,
 *  - mo_AcquireRelease -> memory_order_release for stores,
 *                         memory_order_acquire for loads. */
enum MDBX_memory_order {
  mo_Relaxed,
  mo_AcquireRelease
  /* , mo_SequentialConsistency */
};
  2155  
/* 32-bit atomic cell: 'weak' provides plain volatile access, while 'c11a'
 * exposes the same storage as a C11 _Atomic when the compiler supports it. */
typedef union {
  volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
  volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
  2162  
/* 64-bit atomic cell. When no 64-bit CAS/atomics are available the value is
 * additionally accessible as a pair of 32-bit atomic halves (ordered per the
 * host byte order) so it can be updated by two 32-bit operations. */
typedef union {
  volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
  volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
  __anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif
} MDBX_atomic_uint64_t;
  2180  
#ifdef MDBX_HAVE_C11ATOMICS

/* Crutches for C11 atomic compiler's bugs: on affected compilers the
 * MDBX_c11a_ro/rw accessors fall back to the 'weak' member or re-cast
 * the _Atomic member, instead of using it directly. */
#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
#elif defined(__clang__) && __clang__ < 8
#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#else
#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#endif /* Crutches for C11 atomic compiler's bugs */

/* Maps enum MDBX_memory_order to the C11 order for a store:
 * mo_AcquireRelease means release; anything stronger is seq_cst. */
#define mo_c11_store(fence)                                                    \
  (((fence) == mo_Relaxed)          ? memory_order_relaxed                     \
   : ((fence) == mo_AcquireRelease) ? memory_order_release                     \
                                    : memory_order_seq_cst)
/* Maps enum MDBX_memory_order to the C11 order for a load:
 * mo_AcquireRelease means acquire; anything stronger is seq_cst. */
#define mo_c11_load(fence)                                                     \
  (((fence) == mo_Relaxed)          ? memory_order_relaxed                     \
   : ((fence) == mo_AcquireRelease) ? memory_order_acquire                     \
                                    : memory_order_seq_cst)

#endif /* MDBX_HAVE_C11ATOMICS */
  2205  
  2206  #ifndef __cplusplus
  2207  
/* osal_memory_fence(order, write): emit a fence matching 'order'.
 * With C11 atomics this is a real atomic_thread_fence(); otherwise it is a
 * compiler barrier, plus a hardware memory barrier when a stronger-than-the
 * CPU-provided write ordering is requested (the threshold depends on
 * MDBX_CPU_WRITEBACK_INCOHERENT). */
#ifdef MDBX_HAVE_C11ATOMICS
#define osal_memory_fence(order, write)                                        \
  atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order))
#else /* MDBX_HAVE_C11ATOMICS */
#define osal_memory_fence(order, write)                                        \
  do {                                                                         \
    osal_compiler_barrier();                                                   \
    if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed           \
                                                        : mo_AcquireRelease))  \
      osal_memory_barrier();                                                   \
  } while (0)
#endif /* MDBX_HAVE_C11ATOMICS */
  2220  
/* For the MCST LCC (e2k) compiler use GNU statement-expression wrappers.
 * Defining these macros suppresses the generic inline implementations
 * below, which are guarded by #ifndef atomic_store32 / atomic_load32. */
#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__)
#define atomic_store32(p, value, order)                                        \
  ({                                                                           \
    const uint32_t value_to_store = (value);                                   \
    atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store,           \
                          mo_c11_store(order));                                \
    value_to_store;                                                            \
  })
#define atomic_load32(p, order)                                                \
  atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order))
#define atomic_store64(p, value, order)                                        \
  ({                                                                           \
    const uint64_t value_to_store = (value);                                   \
    atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store,           \
                          mo_c11_store(order));                                \
    value_to_store;                                                            \
  })
#define atomic_load64(p, order)                                                \
  atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order))
#endif /* LCC && MDBX_HAVE_C11ATOMICS */
  2241  
#ifndef atomic_store32
/* Atomically stores 'value' into *p honoring 'order' and returns the value
 * as stored. On the non-C11 path this is a volatile write preceded by a
 * compiler barrier (for non-relaxed orders) and followed by
 * osal_memory_fence() — the statement order here is load/store-fence
 * critical and must not be rearranged. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value,
               enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  /* paranoia: the 32-bit atomic is expected to be lock-free everywhere */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  p->weak = value;
  osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
  return value;
}
#endif /* atomic_store32 */
  2259  
#ifndef atomic_load32
/* Atomically loads a 32-bit value from *p honoring 'order'. On the non-C11
 * path: fence first, then a volatile read, then a compiler barrier for
 * non-relaxed orders — the statement order is critical and must not be
 * rearranged. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
    const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  /* paranoia: the 32-bit atomic is expected to be lock-free everywhere */
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  osal_memory_fence(order, false);
  const uint32_t value = p->weak;
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
#endif /* atomic_load32 */
  2276  
  2277  #endif /* !__cplusplus */
  2278  
  2279  /*----------------------------------------------------------------------------*/
  2280  /* Basic constants and types */
  2281  
  2282  /* A stamp that identifies a file as an MDBX file.
  2283   * There's nothing special about this value other than that it is easily
  2284   * recognizable, and it will reflect any byte order mismatches. */
  2285  #define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
  2286  
  2287  /* FROZEN: The version number for a database's datafile format. */
  2288  #define MDBX_DATA_VERSION 3
  2289  /* The version number for a database's lockfile format. */
  2290  #define MDBX_LOCK_VERSION 4
  2291  
  2292  /* handle for the DB used to track free pages. */
  2293  #define FREE_DBI 0
  2294  /* handle for the default DB. */
  2295  #define MAIN_DBI 1
  2296  /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
  2297  #define CORE_DBS 2
  2298  
  2299  /* Number of meta pages - also hardcoded elsewhere */
  2300  #define NUM_METAS 3
  2301  
  2302  /* A page number in the database.
  2303   *
  2304   * MDBX uses 32 bit for page numbers. This limits database
  2305   * size up to 2^44 bytes, in case of 4K pages. */
  2306  typedef uint32_t pgno_t;
  2307  typedef MDBX_atomic_uint32_t atomic_pgno_t;
  2308  #define PRIaPGNO PRIu32
  2309  #define MAX_PAGENO UINT32_C(0x7FFFffff)
  2310  #define MIN_PAGENO NUM_METAS
  2311  
  2312  #define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)
  2313  
  2314  /* A transaction ID. */
  2315  typedef uint64_t txnid_t;
  2316  typedef MDBX_atomic_uint64_t atomic_txnid_t;
  2317  #define PRIaTXN PRIi64
  2318  #define MIN_TXNID UINT64_C(1)
  2319  #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
  2320  #define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
  2321  #define INVALID_TXNID UINT64_MAX
  2322  /* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
  2323   * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
  2324  #ifndef xMDBX_TXNID_STEP
  2325  #if MDBX_64BIT_CAS
  2326  #define xMDBX_TXNID_STEP 1u
  2327  #else
  2328  #define xMDBX_TXNID_STEP 2u
  2329  #endif
  2330  #endif /* xMDBX_TXNID_STEP */
  2331  
/* Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits would
 * suffice, so 16 bits are plenty. */
typedef uint16_t indx_t;
  2336  
  2337  #define MEGABYTE ((size_t)1 << 20)
  2338  
  2339  /*----------------------------------------------------------------------------*/
  2340  /* Core structures for database and shared memory (i.e. format definition) */
  2341  #pragma pack(push, 4)
  2342  
/* Information about a single database in the environment.
 * NOTE: this layout is part of the datafile format (4-byte packed, see the
 * #pragma pack above) and instances are embedded inside meta pages, so
 * neither field order nor field sizes may be changed. */
typedef struct MDBX_db {
  uint16_t md_flags;        /* see mdbx_dbi_open */
  uint16_t md_depth;        /* depth of this tree */
  uint32_t md_xsize;        /* key-size for MDBX_DUPFIXED (LEAF2 pages) */
  pgno_t md_root;           /* the root page of this tree */
  pgno_t md_branch_pages;   /* number of internal pages */
  pgno_t md_leaf_pages;     /* number of leaf pages */
  pgno_t md_overflow_pages; /* number of overflow pages */
  uint64_t md_seq;          /* table sequence counter */
  uint64_t md_entries;      /* number of data items */
  uint64_t md_mod_txnid;    /* txnid of last committed modification */
} MDBX_db;
  2356  
/* Database size-related parameters; all sizes are expressed in pages. */
typedef struct MDBX_geo {
  uint16_t grow_pv;   /* datafile growth step as a 16-bit packed (exponential
                           quantized) value */
  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
                           (exponential quantized) value */
  pgno_t lower;       /* minimal size of datafile in pages */
  pgno_t upper;       /* maximal size of datafile in pages */
  pgno_t now;         /* current size of datafile in pages */
  pgno_t next;        /* first unused page in the datafile,
                         but actually the file may be shorter. */
} MDBX_geo;
  2369  
/* Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * The first NUM_METAS (i.e. 3, see above) pages of the datafile are meta
 * pages; committing transactions update them in rotation.
 * (The old "Pages 0-1 ... (N % 2)" wording predated NUM_METAS == 3.) */
typedef struct MDBX_meta {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
  uint32_t mm_magic_and_version[2];

  /* txnid that committed this page, the first of a two-phase-update pair */
  union {
    MDBX_atomic_uint32_t mm_txnid_a[2];
    uint64_t unsafe_txnid;
  };

  uint16_t mm_extra_flags;  /* extra DB flags, zero (nothing) for now */
  uint8_t mm_validator_id;  /* ID of checksum and page validation method,
                             * zero (nothing) for now */
  uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
                             * zero (nothing) for now */

  MDBX_geo mm_geo; /* database size-related parameters */

  MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
                            /* The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize
  MDBX_canary mm_canary;

#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
#define META_IS_STEADY(meta)                                                   \
  SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign))
  /* Steadiness signature: any value above MDBX_DATASIGN_WEAK marks this
   * meta as a steady commit point (see SIGN_IS_STEADY above). */
  union {
    uint32_t mm_sign[2];
    uint64_t unsafe_sign;
  };

  /* txnid that committed this page, the second of a two-phase-update pair */
  MDBX_atomic_uint32_t mm_txnid_b[2];

  /* Number of non-meta pages which were put in GC after COW. May be 0 in case
   * DB was previously handled by libmdbx without corresponding feature.
   * This value in couple with mr_snapshot_pages_retired allows fast estimation
   * of "how much reader is restraining GC recycling". */
  uint32_t mm_pages_retired[2];

  /* The analogue of /proc/sys/kernel/random/boot_id or similar, used to
   * determine whether the system was rebooted after the last use of the
   * database files. If there was no reboot, then there is no need to
   * rollback to the last steady sync point. Zeros mean that no relevant
   * information is available from the system. */
  bin128_t mm_bootid;

} MDBX_meta;
  2424  
  2425  #pragma pack(1)
  2426  
/* Common header for all page types. The page type depends on mp_flags.
 *
 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
 *
 * P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of F_BIGDATA nodes.
 *
 * P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
 *
 * Each non-metapage up to MDBX_meta.mm_geo.next (the first unused page)
 * is reachable exactly once in the snapshot: either used by a database
 * or listed in a GC record. */
typedef struct MDBX_page {
  union {
    /* Page status within a txn is derived by comparing mp_txnid against
     * the txn's mt_txnid and mt_front: */
#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid)
#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid)
#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front)
#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front)
    uint64_t
        mp_txnid; /* txnid which created this page, maybe zero in legacy DB */
    struct MDBX_page *mp_next; /* for in-memory list of freed pages */
  };
  uint16_t mp_leaf2_ksize;   /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01u       /* branch page */
#define P_LEAF 0x02u         /* leaf page */
#define P_OVERFLOW 0x04u     /* overflow page */
#define P_META 0x08u         /* meta page */
#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */
#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */
#define P_LEAF2 0x20u        /* for MDBX_DUPFIXED records */
#define P_SUBP 0x40u         /* for MDBX_DUPSORT sub-pages */
#define P_SPILLED 0x2000u    /* spilled in parent txn */
#define P_LOOSE 0x4000u      /* page was dirtied then freed, can be reused */
#define P_FROZEN 0x8000u     /* used for retire page with known status */
#define P_ILL_BITS                                                             \
  ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED))
  uint16_t mp_flags;
  union {
    uint32_t mp_pages; /* number of overflow pages */
    __anonymous_struct_extension__ struct {
      indx_t mp_lower; /* lower bound of free space */
      indx_t mp_upper; /* upper bound of free space */
    };
  };
  pgno_t mp_pgno; /* page number */

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  indx_t mp_ptrs[] /* dynamic size */;
#endif /* C99 */
} MDBX_page;
  2484  
  2485  #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags)
  2486  
/* Drop legacy P_DIRTY flag for sub-pages for compatibility */
  2488  #define PAGETYPE_COMPAT(p)                                                     \
  2489    (unlikely(PAGETYPE_WHOLE(p) & P_SUBP)                                        \
  2490         ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY)                        \
  2491         : PAGETYPE_WHOLE(p))
  2492  
  2493  /* Size of the page header, excluding dynamic data at the end */
  2494  #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
  2495  
  2496  #pragma pack(pop)
  2497  
  2498  #if MDBX_ENABLE_PGOP_STAT
/* Statistics of page operations over all (running, completed and aborted)
 * transactions */
typedef struct {
  MDBX_atomic_uint64_t newly;   /* quantity of new pages added */
  MDBX_atomic_uint64_t cow;     /* quantity of pages copied for update */
  MDBX_atomic_uint64_t clone;   /* quantity of parent's dirty pages clones
                                   for nested transactions */
  MDBX_atomic_uint64_t split;   /* page splits */
  MDBX_atomic_uint64_t merge;   /* page merges */
  MDBX_atomic_uint64_t spill;   /* quantity of spilled dirty pages */
  MDBX_atomic_uint64_t unspill; /* quantity of unspilled/reloaded pages */
  MDBX_atomic_uint64_t
      wops; /* number of explicit write operations (not pages) to the disk */
  MDBX_atomic_uint64_t
      gcrtime; /* time spent reading/searching GC (aka FreeDB); the
                  unit/scale is platform-dependent, see osal_monotime(). */
} MDBX_pgop_stat_t;
  2516  #endif /* MDBX_ENABLE_PGOP_STAT */
  2517  
/* Selects the inter-process lock primitive (osal_ipclock_t) and the
 * per-flavor signature salt (MDBX_CLOCK_SIGN, mixed into MDBX_LOCK_FORMAT
 * below) according to the MDBX_LOCKING build option. */
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
typedef void osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV

#define MDBX_CLOCK_SIGN UINT32_C(0xF18D)
typedef mdbx_pid_t osal_ipclock_t;
#ifndef EOWNERDEAD
#define EOWNERDEAD MDBX_RESULT_TRUE
#endif

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#define MDBX_CLOCK_SIGN UINT32_C(0x8017)
typedef pthread_mutex_t osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
#define MDBX_CLOCK_SIGN UINT32_C(0xFC29)
typedef sem_t osal_ipclock_t;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

#if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus)
MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc);
MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc);
#endif /* MDBX_LOCKING */
  2544  
  2545  /* Reader Lock Table
  2546   *
  2547   * Readers don't acquire any locks for their data access. Instead, they
  2548   * simply record their transaction ID in the reader table. The reader
  2549   * mutex is needed just to find an empty slot in the reader table. The
  2550   * slot's address is saved in thread-specific data so that subsequent
  2551   * read transactions started by the same thread need no further locking to
  2552   * proceed.
  2553   *
  2554   * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
  2555   * No reader table is used if the database is on a read-only filesystem.
  2556   *
  2557   * Since the database uses multi-version concurrency control, readers don't
  2558   * actually need any locking. This table is used to keep track of which
  2559   * readers are using data from which old transactions, so that we'll know
  2560   * when a particular old transaction is no longer in use. Old transactions
  2561   * that have discarded any data pages can then have those pages reclaimed
  2562   * for use by a later write transaction.
  2563   *
  2564   * The lock table is constructed such that reader slots are aligned with the
  2565   * processor's cache line size. Any slot is only ever used by one thread.
  2566   * This alignment guarantees that there will be no contention or cache
  2567   * thrashing as threads update their own slot info, and also eliminates
  2568   * any need for locking when accessing a slot.
  2569   *
  2570   * A writer thread will scan every slot in the table to determine the oldest
  2571   * outstanding reader transaction. Any freed pages older than this will be
  2572   * reclaimed by the writer. The writer doesn't use any locks when scanning
  2573   * this table. This means that there's no guarantee that the writer will
  2574   * see the most up-to-date reader info, but that's not required for correct
  2575   * operation - all we need is to know the upper bound on the oldest reader,
  2576   * we don't care at all about the newest reader. So the only consequence of
  2577   * reading stale information here is that old pages might hang around a
  2578   * while longer before being reclaimed. That's actually good anyway, because
  2579   * the longer we delay reclaiming old pages, the more likely it is that a
  2580   * string of contiguous pages can be found after coalescing old pages from
  2581   * many old transactions together. */
  2582  
/* The actual reader record, with cacheline padding. */
typedef struct MDBX_reader {
  /* Current Transaction ID when this transaction began, or (txnid_t)-1.
   * Multiple readers that start at the same time will probably have the
   * same ID here. Again, it's not important to exclude them from
   * anything; all we need to know is which version of the DB they
   * started from so we can avoid overwriting any data used in that
   * particular version. */
  MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;

  /* The information we store in a single slot of the reader table.
   * In addition to a transaction ID, we also record the process and
   * thread ID that owns a slot, so that we can detect stale information,
   * e.g. threads or processes that went away without cleaning up.
   *
   * NOTE: We currently don't check for stale records.
   * We simply re-init the table when we know that we're the only process
   * opening the lock file. */

  /* The thread ID of the thread owning this txn. */
  MDBX_atomic_uint64_t mr_tid;

  /* The process ID of the process owning this reader txn. */
  MDBX_atomic_uint32_t mr_pid;

  /* The number of pages used in the reader's MVCC snapshot,
   * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
  atomic_pgno_t mr_snapshot_pages_used;
  /* Number of retired pages at the time this reader starts a transaction.
   * So at any time the difference mm_pages_retired - mr_snapshot_pages_retired
   * gives the number of pages which this reader is restraining from reuse. */
  MDBX_atomic_uint64_t mr_snapshot_pages_retired;
} MDBX_reader;
  2616  
/* The header for the reader table (a memory-mapped lock file). */
typedef struct MDBX_lockinfo {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
  uint64_t mti_magic_and_version;

  /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
  uint32_t mti_os_and_format;

  /* Flags with which the environment was opened. */
  MDBX_atomic_uint32_t mti_envmode;

  /* Threshold of un-synced-with-disk pages for auto-sync feature,
   * zero means no-threshold, i.e. auto-sync is disabled. */
  atomic_pgno_t mti_autosync_threshold;

  /* Low 32 bits of the txnid with which meta-pages were synced,
   * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
  MDBX_atomic_uint32_t mti_meta_sync_txnid;

  /* Period for timed auto-sync feature, i.e. at every steady checkpoint
   * the mti_unsynced_timeout is set to current_time + mti_autosync_period.
   * The time value is represented in a suitable system-dependent form, for
   * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
   * Zero means timed auto-sync is disabled. */
  MDBX_atomic_uint64_t mti_autosync_period;

  /* Marker to distinguish uniqueness of DB/CLK. */
  MDBX_atomic_uint64_t mti_bait_uniqueness;

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

#if MDBX_ENABLE_PGOP_STAT
  /* Statistics of costly ops of all (running, completed and aborted)
   * transactions */
  MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

  /* Write transaction lock. */
#if MDBX_LOCKING > 0
  osal_ipclock_t mti_wlock;
#endif /* MDBX_LOCKING > 0 */

  atomic_txnid_t mti_oldest_reader;

  /* Timestamp of the last steady sync. Value is represented in a suitable
   * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
   * clock_gettime(CLOCK_MONOTONIC). */
  MDBX_atomic_uint64_t mti_sync_timestamp;

  /* Number of un-synced-with-disk pages for auto-sync feature. */
  atomic_pgno_t mti_unsynced_pages;

  /* Number of pages which were discarded last time by madvise(MADV_FREE). */
  atomic_pgno_t mti_discarded_tail;

  /* Timestamp of the last readers check. */
  MDBX_atomic_uint64_t mti_reader_check_timestamp;

  /* Shared anchor for tracking readahead edge and enabled/disabled status. */
  pgno_t mti_readahead_anchor;

  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/

  /* Readers' registration lock. */
#if MDBX_LOCKING > 0
  osal_ipclock_t mti_rlock;
#endif /* MDBX_LOCKING > 0 */

  /* The number of slots that have been used in the reader table.
   * This always records the maximum count, it is not decremented
   * when readers release their slots. */
  MDBX_atomic_uint32_t mti_numreaders;
  MDBX_atomic_uint32_t mti_readers_refresh_flag;

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
  MDBX_reader mti_readers[] /* dynamic size */;
#endif /* C99 */
} MDBX_lockinfo;
  2700  
  2701  /* Lockfile format signature: version, features and field layout */
  2702  #define MDBX_LOCK_FORMAT                                                       \
  2703    (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 +              \
  2704     (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 +             \
  2705     (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 +                 \
  2706     (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 +                    \
  2707     (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
  2708  
  2709  #define MDBX_DATA_MAGIC                                                        \
  2710    ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
  2711  
  2712  #define MDBX_DATA_MAGIC_LEGACY_COMPAT                                          \
  2713    ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
  2714  
  2715  #define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
  2716  
  2717  #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
  2718  
  2719  /* The maximum size of a database page.
  2720   *
  2721   * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper.
  2722   *
  2723   * MDBX will use database pages < OS pages if needed.
  2724   * That causes more I/O in write transactions: The OS must
  2725   * know (read) the whole page before writing a partial page.
  2726   *
  2727   * Note that we don't currently support Huge pages. On Linux,
  2728   * regular data files cannot use Huge pages, and in general
  2729   * Huge pages aren't actually pageable. We rely on the OS
  2730   * demand-pager to read our data and page it out when memory
  2731   * pressure from other processes is high. So until OSs have
  2732   * actual paging support for Huge pages, they're not viable. */
  2733  #define MAX_PAGESIZE MDBX_MAX_PAGESIZE
  2734  #define MIN_PAGESIZE MDBX_MIN_PAGESIZE
  2735  
  2736  #define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
  2737  #if defined(_WIN32) || defined(_WIN64)
  2738  #define MAX_MAPSIZE32 UINT32_C(0x38000000)
  2739  #else
  2740  #define MAX_MAPSIZE32 UINT32_C(0x7f000000)
  2741  #endif
  2742  #define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MAX_PAGESIZE)
  2743  
  2744  #if MDBX_WORDBITS >= 64
  2745  #define MAX_MAPSIZE MAX_MAPSIZE64
  2746  #define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO)
  2747  #else
  2748  #define MAX_MAPSIZE MAX_MAPSIZE32
  2749  #define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
  2750  #endif /* MDBX_WORDBITS */
  2751  
  2752  #define MDBX_READERS_LIMIT 32767
  2753  #define MDBX_RADIXSORT_THRESHOLD 333
  2754  
  2755  /*----------------------------------------------------------------------------*/
  2756  
/* A PNL is a Page Number List, a sorted array of IDs.
 * The first element of the array is a counter for how many actual
 * page-numbers are in the list. By default PNLs are sorted in descending
 * order, which allows cutting off the page with the lowest pgno (at the
 * tail) by just truncating the list. The sort order of PNLs is controlled
 * by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
  2763  
  2764  #if MDBX_PNL_ASCENDING
  2765  #define MDBX_PNL_ORDERED(first, last) ((first) < (last))
  2766  #define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
  2767  #else
  2768  #define MDBX_PNL_ORDERED(first, last) ((first) > (last))
  2769  #define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
  2770  #endif
  2771  
  2772  /* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */
  2773  typedef txnid_t *MDBX_TXL;
  2774  
  2775  /* An Dirty-Page list item is an pgno/pointer pair. */
  2776  typedef struct MDBX_dp {
  2777    MDBX_page *ptr;
  2778    pgno_t pgno;
  2779    union {
  2780      unsigned extra;
  2781      __anonymous_struct_extension__ struct {
  2782        unsigned multi : 1;
  2783        unsigned lru : 31;
  2784      };
  2785    };
  2786  } MDBX_dp;
  2787  
  2788  /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
  2789  typedef struct MDBX_dpl {
  2790    unsigned sorted;
  2791    unsigned length;
  2792    unsigned pages_including_loose; /* number of pages, but not an entries. */
  2793    unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
  2794  #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
  2795      (!defined(__cplusplus) && defined(_MSC_VER))
  2796    MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
  2797  #endif
  2798  } MDBX_dpl;
  2799  
  2800  /* PNL sizes */
  2801  #define MDBX_PNL_GRANULATE 1024
  2802  #define MDBX_PNL_INITIAL                                                       \
  2803    (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
  2804  
  2805  #define MDBX_TXL_GRANULATE 32
  2806  #define MDBX_TXL_INITIAL                                                       \
  2807    (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
  2808  #define MDBX_TXL_MAX                                                           \
  2809    ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
  2810  
  2811  #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
  2812  #define MDBX_PNL_SIZE(pl) ((pl)[0])
  2813  #define MDBX_PNL_FIRST(pl) ((pl)[1])
  2814  #define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
  2815  #define MDBX_PNL_BEGIN(pl) (&(pl)[1])
  2816  #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
  2817  
  2818  #if MDBX_PNL_ASCENDING
  2819  #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
  2820  #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
  2821  #else
  2822  #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
  2823  #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
  2824  #endif
  2825  
  2826  #define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
  2827  #define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
  2828  
  2829  /*----------------------------------------------------------------------------*/
  2830  /* Internal structures */
  2831  
  2832  /* Auxiliary DB info.
  2833   * The information here is mostly static/read-only. There is
  2834   * only a single copy of this record in the environment. */
  2835  typedef struct MDBX_dbx {
  2836    MDBX_val md_name;                /* name of the database */
  2837    MDBX_cmp_func *md_cmp;           /* function for comparing keys */
  2838    MDBX_cmp_func *md_dcmp;          /* function for comparing data items */
  2839    size_t md_klen_min, md_klen_max; /* min/max key length for the database */
  2840    size_t md_vlen_min,
  2841        md_vlen_max; /* min/max value/data length for the database */
  2842  } MDBX_dbx;
  2843  
  2844  typedef struct troika {
  2845    uint8_t fsm, recent, prefer_steady, tail_and_flags;
  2846  #define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7)
  2847  #define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64)
  2848  #define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128)
  2849  #define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3)
  2850    txnid_t txnid[NUM_METAS];
  2851  } meta_troika_t;
  2852  
/* A database transaction.
 * Every operation requires a transaction handle. */
struct MDBX_txn {
/* Magic value stored in mt_signature while the handle is alive; used to
 * detect use of freed/corrupted transaction objects. */
#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31)
  uint32_t mt_signature;

  /* Transaction Flags */
  /* mdbx_txn_begin() flags */
#define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE)
#define MDBX_TXN_RW_BEGIN_FLAGS                                                \
  (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY)
  /* Additional flag for sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)

  /* Internal state flags kept in mt_flags alongside the begin-flags. */
#define TXN_FLAGS                                                              \
  (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS |     \
   MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID)

/* Compile-time guard: all three flag groups must occupy disjoint bits. */
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) ||       \
    ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) &         \
     MDBX_SHRINK_ALLOWED)
#error "Oops, some txn flags overlapped or wrong"
#endif
  uint32_t mt_flags;

  MDBX_txn *mt_parent; /* parent of a nested txn */
  /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */
  MDBX_txn *mt_child;
  MDBX_geo mt_geo; /* database geometry as seen by this txn */
  /* next unallocated page */
#define mt_next_pgno mt_geo.next
  /* corresponding to the current size of datafile */
#define mt_end_pgno mt_geo.now

  /* The ID of this transaction. IDs are integers incrementing from
   * INITIAL_TXNID. Only committed write transactions increment the ID. If a
   * transaction aborts, the ID may be re-used by the next writer. */
  txnid_t mt_txnid;
  /* NOTE(review): presumably the front txnid used for dirty-page ownership
   * checks in nested txns — confirm at the use sites. */
  txnid_t mt_front;

  MDBX_env *mt_env; /* the DB environment */
  /* Array of records for each DB known in the environment. */
  MDBX_dbx *mt_dbxs;
  /* Array of MDBX_db records for each known DB */
  MDBX_db *mt_dbs;
  /* Array of sequence numbers for each DB handle */
  MDBX_atomic_uint32_t *mt_dbiseqs;

  /* Transaction DBI Flags */
#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */
#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */
#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */
#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */
#define DBI_VALID 0x10           /* DB handle is valid, see also DB_VALID */
#define DBI_USRVALID 0x20        /* As DB_VALID, but not set for FREE_DBI */
#define DBI_AUDITED 0x40         /* Internal flag for accounting during audit */
  /* Array of flags for each DB */
  uint8_t *mt_dbistate;
  /* Number of DB records in use, or 0 when the txn is finished.
   * This number only ever increments until the txn finishes; we
   * don't decrement it when individual DB handles are closed. */
  MDBX_dbi mt_numdbs;
  size_t mt_owner; /* thread ID that owns this transaction */
  MDBX_canary mt_canary;
  void *mt_userctx; /* User-settable context */
  MDBX_cursor **mt_cursors; /* per-DBI lists of tracked cursors */

  /* Mode-specific state: `to` for read-only txns, `tw` for write txns. */
  union {
    struct {
      /* For read txns: This thread/txn's reader table slot, or NULL. */
      MDBX_reader *reader;
    } to;
    struct {
      meta_troika_t troika; /* meta-page snapshot taken at txn start */
      /* In write txns, array of cursors for each DB */
      pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
      txnid_t last_reclaimed;   /* ID of last used record */
#if MDBX_ENABLE_REFUND
      pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
      /* dirtylist room: Dirty array size - dirty pages visible to this txn.
       * Includes ancestor txns' dirty pages not hidden by other txns'
       * dirty/spilled pages. Thus commit(nested txn) has room to merge
       * dirtylist into mt_parent after freeing hidden mt_parent pages. */
      unsigned dirtyroom;
      /* a sequence to spilling dirty page with LRU policy */
      unsigned dirtylru;
      /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
      MDBX_dpl *dirtylist;
      /* The list of reclaimed txns from GC */
      MDBX_TXL lifo_reclaimed;
      /* The list of pages that became unused during this transaction. */
      MDBX_PNL retired_pages;
      /* The list of loose pages that became unused and may be reused
       * in this transaction, linked through `mp_next`. */
      MDBX_page *loose_pages;
      /* Number of loose pages (tw.loose_pages) */
      unsigned loose_count;
      unsigned spill_least_removed;
      /* The sorted list of dirty pages we temporarily wrote to disk
       * because the dirty list was full. page numbers in here are
       * shifted left by 1, deleted slots have the LSB set. */
      MDBX_PNL spill_pages;
    } tw;
  };
};
  2959  
/* Depth of the per-cursor page stack, i.e. the maximum supported B-tree
 * height; larger on 64-bit targets. */
#if MDBX_WORDBITS >= 64
#define CURSOR_STACK 32
#else
#define CURSOR_STACK 24
#endif

struct MDBX_xcursor;

/* Cursors are used for all DB operations.
 * A cursor holds a path of (page pointer, key index) from the DB
 * root to a position in the DB, plus other state. MDBX_DUPSORT
 * cursors include an xcursor to the current data item. Write txns
 * track their cursors and keep them up to date when data moves.
 * Exception: An xcursor's pointer to a P_SUBP page can be stale.
 * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
struct MDBX_cursor {
/* Lifecycle signatures stored in mc_signature to catch stale handles. */
#define MDBX_MC_LIVE UINT32_C(0xFE05D5B1)
#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
  uint32_t mc_signature;
  /* The database handle this cursor operates on */
  MDBX_dbi mc_dbi;
  /* Next cursor on this DB in this txn */
  MDBX_cursor *mc_next;
  /* Backup of the original cursor if this cursor is a shadow */
  MDBX_cursor *mc_backup;
  /* Context used for databases with MDBX_DUPSORT, otherwise NULL */
  struct MDBX_xcursor *mc_xcursor;
  /* The transaction that owns this cursor */
  MDBX_txn *mc_txn;
  /* The database record for this cursor */
  MDBX_db *mc_db;
  /* The database auxiliary record for this cursor */
  MDBX_dbx *mc_dbx;
  /* The mt_dbistate for this database */
  uint8_t *mc_dbistate;
  uint8_t mc_snum; /* number of pushed pages */
  uint8_t mc_top;  /* index of top page, normally mc_snum-1 */

  /* Cursor state flags. */
#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
#define C_EOF 0x02         /* No more data */
#define C_SUB 0x04         /* Cursor is a sub-cursor */
#define C_DEL 0x08         /* last op was a cursor_del */
#define C_UNTRACK 0x10     /* Un-track cursor when closing */
#define C_RECLAIMING 0x20  /* GC lookup is prohibited */
#define C_GCFREEZE 0x40    /* reclaimed_pglist must not be updated */
  uint8_t mc_flags;        /* see mdbx_cursor */

  /* Cursor checking flags. */
#define CC_BRANCH 0x01    /* same as P_BRANCH for CHECK_LEAF_TYPE() */
#define CC_LEAF 0x02      /* same as P_LEAF for CHECK_LEAF_TYPE() */
#define CC_OVERFLOW 0x04  /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */
#define CC_UPDATING 0x08  /* update/rebalance pending */
#define CC_SKIPORD 0x10   /* don't check keys ordering */
#define CC_LEAF2 0x20     /* same as P_LEAF2 for CHECK_LEAF_TYPE() */
#define CC_RETIRING 0x40  /* refs to child pages may be invalid */
#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */
  uint8_t mc_checking;    /* page checking level */

  MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
  indx_t mc_ki[CURSOR_STACK];     /* stack of page indices */
};

/* True when the page's type bits match the CC_* expectations recorded in
 * mc_checking (the CC_* values mirror the corresponding P_* page bits, so a
 * simple XOR-and-mask suffices). */
#define CHECK_LEAF_TYPE(mc, mp)                                                \
  (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) &                                 \
    (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0)
  3027  
/* Context for sorted-dup records.
 * We could have gone to a fully recursive design, with arbitrarily
 * deep nesting of sub-databases. But for now we only handle these
 * levels - main DB, optional sub-DB, sorted-duplicate DB. */
typedef struct MDBX_xcursor {
  /* A sub-cursor for traversing the Dup DB */
  MDBX_cursor mx_cursor;
  /* The database record for this Dup DB */
  MDBX_db mx_db;
  /* The auxiliary DB record for this Dup DB */
  MDBX_dbx mx_dbx;
} MDBX_xcursor;

/* An outer cursor paired with its DUPSORT sub-cursor; allocated as one unit
 * so outer.mc_xcursor can point at the adjacent inner member. */
typedef struct MDBX_cursor_couple {
  MDBX_cursor outer;
  void *mc_userctx; /* User-settable context */
  MDBX_xcursor inner;
} MDBX_cursor_couple;
  3046  
/* The database environment. */
struct MDBX_env {
  /* ----------------------------------------------------- mostly static part */
/* Magic value held in me_signature while the env handle is alive. */
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
  MDBX_atomic_uint32_t me_signature;
  /* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
  /* Some fields are initialized. */
#define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
  /* me_txkey is set */
#define MDBX_ENV_TXKEY UINT32_C(0x10000000)
  /* Legacy MDBX_MAPASYNC (prior v0.9) */
#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
  /* Legacy MDBX_COALESCE (prior v0.12) */
#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000)
#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
  uint32_t me_flags;
  osal_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
  mdbx_filehandle_t me_dsync_fd; /* secondary fd for synchronous writes */
  osal_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
  struct MDBX_lockinfo *me_lck; /* mapped lock-file contents */

  unsigned me_psize;        /* DB page size, initialized from me_os_psize */
  unsigned me_leaf_nodemax; /* max size of a leaf-node */
  uint8_t me_psize2log;     /* log2 of DB page size */
  int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
  uint16_t me_merge_threshold,
      me_merge_threshold_gc;  /* pages emptier than this are candidates for
                                 merging */
  unsigned me_os_psize;       /* OS page size, from osal_syspagesize() */
  unsigned me_maxreaders;     /* size of the reader table */
  MDBX_dbi me_maxdbs;         /* size of the DB table */
  uint32_t me_pid;            /* process ID of this env */
  osal_thread_key_t me_txkey; /* thread-key for readers */
  pathchar_t *me_pathname;    /* path to the DB files */
  void *me_pbuf;              /* scratch area for DUPSORT put() */
  MDBX_txn *me_txn0;          /* preallocated write transaction */

  MDBX_dbx *me_dbxs;                /* array of static DB info */
  uint16_t *me_dbflags;             /* array of flags from MDBX_db.md_flags */
  MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */
  unsigned
      me_maxgc_ov1page;    /* Number of pgno_t fit in a single overflow page */
  uint32_t me_live_reader; /* have liveness lock in reader table */
  void *me_userctx;        /* User-settable context */
  MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */

  /* Tunable runtime options, see mdbx_env_set_option(). */
  struct {
    unsigned dp_reserve_limit;
    unsigned rp_augment_limit;
    unsigned dp_limit;
    unsigned dp_initial;
    uint8_t dp_loose_limit;
    uint8_t spill_max_denominator;
    uint8_t spill_min_denominator;
    uint8_t spill_parent4child_denominator;
    unsigned merge_threshold_16dot16_percent;
    union {
      unsigned all;
      /* tracks options with non-auto values but tuned by user */
      struct {
        unsigned dp_limit : 1;
      } non_auto;
    } flags;
  } me_options;

  /* struct me_dbgeo used for accepting db-geo params from user for the new
   * database creation, i.e. when mdbx_env_set_geometry() was called before
   * mdbx_env_open(). */
  struct {
    size_t lower;  /* minimal size of datafile */
    size_t upper;  /* maximal size of datafile */
    size_t now;    /* current size of datafile */
    size_t grow;   /* step to grow datafile */
    size_t shrink; /* threshold to shrink datafile */
  } me_dbgeo;

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  /* System V semaphore identity used when built with SysV locking. */
  union {
    key_t key;
    int semid;
  } me_sysv_ipc;
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */

  MDBX_env *me_lcklist_next; /* link in the global list of environments */

  /* --------------------------------------------------- mostly volatile part */

  MDBX_txn *me_txn; /* current write transaction */
  osal_fastmutex_t me_dbi_lock; /* guards DBI table updates */
  MDBX_dbi me_numdbs; /* number of DBs opened */

  MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */
  unsigned me_dp_reserve_len;
  /* PNL of pages that became unused in a write txn */
  MDBX_PNL me_retired_pages;

#if defined(_WIN32) || defined(_WIN64)
  osal_srwlock_t me_remap_guard;
  /* Workaround for LockFileEx and WriteFile multithread bug */
  CRITICAL_SECTION me_windowsbug_lock;
#else
  osal_fastmutex_t me_remap_guard; /* guards remapping of me_dxb_mmap */
#endif

  /* -------------------------------------------------------------- debugging */

#if MDBX_DEBUG
  MDBX_assert_func *me_assert_func; /*  Callback for assertion failures */
#endif
#ifdef MDBX_USE_VALGRIND
  int me_valgrind_handle;
#endif
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
  pgno_t me_poison_edge; /* boundary of the memory poisoned for sanitizers */
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */

#ifndef xMDBX_DEBUG_SPILLING
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
  unsigned debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */

  /* ------------------------------------------------- stub for lck-less mode */
  MDBX_atomic_uint64_t
      x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) /
                     sizeof(MDBX_atomic_uint64_t)];
};
  3179  
#ifndef __cplusplus
/*----------------------------------------------------------------------------*/
/* Debug and Logging stuff */

/* Default runtime flags: assertions from MDBX_DEBUG >= 1, audit from >= 2. */
#define MDBX_RUNTIME_FLAGS_INIT                                                \
  ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT

extern uint8_t runtime_flags; /* MDBX_DBG_* bits, adjustable at runtime */
extern uint8_t loglevel;      /* current MDBX_LOG_* verbosity threshold */
extern MDBX_debug_func *debug_logger; /* sink for formatted log output */
  3190  
  3191  MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) {
  3192  #if MDBX_DEBUG
  3193    if (MDBX_DBG_JITTER & runtime_flags)
  3194      osal_jitter(tiny);
  3195  #else
  3196    (void)tiny;
  3197  #endif
  3198  }
  3199  
/* Core logging entry points; forward to debug_logger when set.
 * NOTE(review): MDBX_PRINTF_ARGS(4, 5) is applied both before and after the
 * declarator — presumably so either a prefix-style (SAL) or suffix-style
 * (GCC attribute) expansion lands in a valid position; confirm against the
 * MDBX_PRINTF_ARGS definition. */
MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5)
    debug_log(int level, const char *function, int line, const char *fmt, ...)
        MDBX_PRINTF_ARGS(4, 5);
MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line,
                                     const char *fmt, va_list args);

#if MDBX_DEBUG
#define LOG_ENABLED(msg) unlikely(msg <= loglevel)
#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT))
#else /* MDBX_DEBUG */
/* In non-debug builds, messages at/above MDBX_LOG_VERBOSE are compiled out. */
#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel)
#define AUDIT_ENABLED() (0)
#endif /* MDBX_DEBUG */

#if MDBX_FORCE_ASSERTIONS
#define ASSERT_ENABLED() (1)
#elif MDBX_DEBUG
#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT))
#else
#define ASSERT_ENABLED() (0)
#endif /* assertions */

/* Per-severity logging macros; each checks LOG_ENABLED() first so that
 * argument formatting is skipped when the level is filtered out. */
#define DEBUG_EXTRA(fmt, ...)                                                  \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_EXTRA))                                           \
      debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__);         \
  } while (0)

/* As DEBUG_EXTRA, but without function/line prefix (for continuations). */
#define DEBUG_EXTRA_PRINT(fmt, ...)                                            \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_EXTRA))                                           \
      debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__);                    \
  } while (0)

#define TRACE(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_TRACE))                                           \
      debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

#define DEBUG(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_DEBUG))                                           \
      debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

#define VERBOSE(fmt, ...)                                                      \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_VERBOSE))                                         \
      debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__);  \
  } while (0)

#define NOTICE(fmt, ...)                                                       \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_NOTICE))                                          \
      debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__);   \
  } while (0)

#define WARNING(fmt, ...)                                                      \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_WARN))                                            \
      debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__);     \
  } while (0)

#undef ERROR /* Defined by wingdi.h on Windows; remove it so ours wins. */

#define ERROR(fmt, ...)                                                        \
  do {                                                                         \
    if (LOG_ENABLED(MDBX_LOG_ERROR))                                           \
      debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__);    \
  } while (0)

/* Unconditional fatal-level log.
 * NOTE(review): unlike the macros above this is not wrapped in
 * do{}while(0) and carries a trailing semicolon, so callers must use it as
 * a bare statement; inconsistent but changing it could break call sites. */
#define FATAL(fmt, ...)                                                        \
  debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);

/* Always-on invariant check: calls mdbx_assert_fail() on violation. */
#define ENSURE_MSG(env, expr, msg)                                             \
  do {                                                                         \
    if (unlikely(!(expr)))                                                     \
      mdbx_assert_fail(env, msg, __func__, __LINE__);                          \
  } while (0)

#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr)

/* assert(3) variant in environment context */
#define eASSERT(env, expr)                                                     \
  do {                                                                         \
    if (ASSERT_ENABLED())                                                      \
      ENSURE(env, expr);                                                       \
  } while (0)

/* assert(3) variant in cursor context */
#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr)

/* assert(3) variant in transaction context */
#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr)

#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */
#undef assert
#define assert(expr) eASSERT(NULL, expr)
#endif
  3300  #endif
  3301  
  3302  /*----------------------------------------------------------------------------*/
  3303  /* Cache coherence and mmap invalidation */
  3304  
  3305  #if MDBX_CPU_WRITEBACK_INCOHERENT
  3306  #define osal_flush_incoherent_cpu_writeback() osal_memory_barrier()
  3307  #else
  3308  #define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier()
  3309  #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
  3310  
  3311  MDBX_MAYBE_UNUSED static __inline void
  3312  osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) {
  3313  #if MDBX_MMAP_INCOHERENT_FILE_WRITE
  3314    char *const begin = (char *)(-pagesize & (intptr_t)addr);
  3315    char *const end =
  3316        (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
  3317    int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
  3318    eASSERT(nullptr, err == 0);
  3319    (void)err;
  3320  #else
  3321    (void)pagesize;
  3322  #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
  3323  
  3324  #if MDBX_MMAP_INCOHERENT_CPU_CACHE
  3325  #ifdef DCACHE
  3326    /* MIPS has cache coherency issues.
  3327     * Note: for any nbytes >= on-chip cache size, entire is flushed. */
  3328    cacheflush(addr, nbytes, DCACHE);
  3329  #else
  3330  #error "Oops, cacheflush() not available"
  3331  #endif /* DCACHE */
  3332  #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
  3333  
  3334  #if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
  3335    (void)addr;
  3336    (void)nbytes;
  3337  #endif
  3338  }
  3339  
  3340  /*----------------------------------------------------------------------------*/
  3341  /* Internal prototypes */
  3342  
  3343  MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked,
  3344                                              int *dead);
  3345  MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin,
  3346                                    MDBX_reader *end);
  3347  MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key);
  3348  
  3349  MDBX_INTERNAL_FUNC void global_ctor(void);
  3350  MDBX_INTERNAL_FUNC void global_dtor(void);
  3351  MDBX_INTERNAL_FUNC void thread_dtor(void *ptr);
  3352  
  3353  #endif /* !__cplusplus */
  3354  
/* True for any rc other than the two non-error result codes. */
#define MDBX_IS_ERROR(rc)                                                      \
  ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)

/* Internal error codes, not exposed outside libmdbx */
#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10)

/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc)                                                               \
  (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)

/* Key size which fits in a DKBUF (debug key buffer).
 * The buffer holds two dump areas (key and value), each of 2*DKBUF_MAX+1
 * bytes, used by DKEY()/DVAL() below. */
#define DKBUF_MAX 511
#define DKBUF char _kbuf[DKBUF_MAX * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1)
#define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)

/* Debug-only variants: real dumps under MDBX_DEBUG, "-" placeholders else. */
#if MDBX_DEBUG
#define DKBUF_DEBUG DKBUF
#define DKEY_DEBUG(x) DKEY(x)
#define DVAL_DEBUG(x) DVAL(x)
#else
#define DKBUF_DEBUG ((void)(0))
#define DKEY_DEBUG(x) ("-")
#define DVAL_DEBUG(x) ("-")
#endif

/* An invalid page number.
 * Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)

/* Test if the flags f are set in a flag word w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))

/* Round n up to an even number. */
#define EVEN(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */

/* Default size of memory map.
 * This is certainly too small for any actual applications. Apps should
 * always set the size explicitly using mdbx_env_set_geometry(). */
#define DEFAULT_MAPSIZE MEGABYTE

/* Number of slots in the reader table.
 * This value was chosen somewhat arbitrarily. The 61 is a prime number,
 * and such readers plus a couple mutexes fit into single 4KB page.
 * Applications should set the table size using mdbx_env_set_maxreaders(). */
#define DEFAULT_READERS 61

/* Page-type predicates; the unlikely() hints mark the rarer page kinds. */
/* Test if a page is a leaf page */
#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0)
/* Test if a page is a LEAF2 page */
#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0)
/* Test if a page is a branch page */
#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0)
/* Test if a page is an overflow page */
#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0)
/* Test if a page is a sub page */
#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0)
  3412  
/* Header for a single key/data pair within a page.
 * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
 * We guarantee 2-byte alignment for 'MDBX_node's.
 *
 * Leaf node flags describe node contents.  F_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
 * a sub-page/sub-database, and named databases (just F_SUBDATA). */
typedef struct MDBX_node {
/* Field order depends on byte order so the on-disk layout stays identical
 * across endiannesses. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  union {
    uint32_t mn_dsize;  /* data size (leaf nodes) */
    uint32_t mn_pgno32; /* child page number (branch nodes) */
  };
  uint8_t mn_flags; /* see mdbx_node flags */
  uint8_t mn_extra;
  uint16_t mn_ksize; /* key size */
#else
  uint16_t mn_ksize; /* key size */
  uint8_t mn_extra;
  uint8_t mn_flags; /* see mdbx_node flags */
  union {
    uint32_t mn_pgno32; /* child page number (branch nodes) */
    uint32_t mn_dsize;  /* data size (leaf nodes) */
  };
#endif /* __BYTE_ORDER__ */

  /* mdbx_node Flags */
#define F_BIGDATA 0x01 /* data put on overflow page */
#define F_SUBDATA 0x02 /* data is a sub-database */
#define F_DUPDATA 0x04 /* data has duplicates */

  /* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)

/* C99 flexible array member where the compiler supports it. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  uint8_t mn_data[] /* key and data are appended here */;
#endif /* C99 */
} MDBX_node;
  3453  
/* Subset of MDBX_db_flags_t which (as the name implies) is stored
 * persistently with the database */
#define DB_PERSISTENT_FLAGS                                                    \
  (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED |          \
   MDBX_INTEGERDUP | MDBX_REVERSEDUP)

/* mdbx_dbi_open() flags */
#define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE)

#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */
#define DB_INTERNAL_FLAGS DB_VALID

/* Compile-time guards: the internal flag must not clash with user-visible
 * ones, and every persistent flag must be usable with mdbx_dbi_open(). */
#if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif
#if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif

/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif

/*
 *                /
 *                | -1, a < b
 * CMP2INT(a,b) = <  0, a == b
 *                |  1, a > b
 *                \
 * NOTE: both arguments are evaluated twice — avoid side effects.
 */
#ifndef __e2k__
/* LY: fast enough on most systems */
#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b))
#else
/* LY: more parallelable on VLIW Elbrus */
#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a)))
#endif

/* Do not spill pages to disk if txn is getting full, may fail instead */
#define MDBX_NOSPILL 0x8000
  3495  
  3496  MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
  3497  int64pgno(int64_t i64) {
  3498    if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1))
  3499      return (pgno_t)i64;
  3500    return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO;
  3501  }
  3502  
/* Adds augend to a base page number; the result is clamped into the valid
 * range by int64pgno(). The asserts document the expected caller contract. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_add(size_t base, size_t augend) {
  assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO);
  return int64pgno(base + augend);
}

/* Subtracts subtrahend from a base page number; the result is clamped into
 * the valid range by int64pgno(). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_sub(size_t base, size_t subtrahend) {
  assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 &&
         subtrahend < MAX_PAGENO);
  return int64pgno(base - subtrahend);
}
  3515  
/* Returns true iff x is a power of two
 * (note: zero is also reported as a power of two by this test). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool
is_powerof2(size_t x) {
  return (x & (x - 1)) == 0;
}

/* Rounds value down to a multiple of the power-of-two granularity. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
floor_powerof2(size_t value, size_t granularity) {
  assert(is_powerof2(granularity));
  return value & ~(granularity - 1);
}

/* Rounds value up to a multiple of the power-of-two granularity
 * (value + granularity - 1 may wrap for values close to SIZE_MAX —
 * callers are expected to stay well below that). */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
ceil_powerof2(size_t value, size_t granularity) {
  return floor_powerof2(value + granularity - 1, granularity);
}

/* Returns log2 of a power-of-two value, i.e. the index of its single set bit. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
  assert(value > 0 && value < INT32_MAX && is_powerof2(value));
  assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
  return __builtin_ctzl(value);
#elif defined(_MSC_VER)
  unsigned long index;
  _BitScanForward(&index, (unsigned long)value);
  return index;
#else
  /* Portable fallback: de Bruijn multiplication locates the set bit. */
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
#endif
}
  3549  
/* Only a subset of the mdbx_env flags can be changed
 * at runtime. Changing other flags requires closing the
 * environment and re-opening it with the new flags. */
#define ENV_CHANGEABLE_FLAGS                                                   \
  (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC |             \
   MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE |           \
   MDBX_VALIDATION)
#define ENV_CHANGELESS_FLAGS                                                   \
  (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
   MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)
#define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS)

/* Compile-time consistency checks of the flag partitioning above; the
 * function body consists only of static asserts and is never called. */
#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS
MDBX_MAYBE_UNUSED static void static_checks(void) {
  STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI,
                    "Oops, MDBX_MAX_DBI or CORE_DBS?");
  STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) ==
                        ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) &
                         (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)),
                    "Oops, some flags overlapped or wrong");
  STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0,
                    "Oops, some flags overlapped or wrong");
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */

/* Presumably closes an `extern "C"` block opened earlier (outside this
 * fragment) — confirm against the full file. */
#ifdef __cplusplus
}
#endif

/* ASAN poison/unpoison wrappers which also log the affected region and the
 * call site via TRACE() to ease debugging of poisoning-related issues. */
#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
  do {                                                                         \
    TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),               \
          (size_t)(size), __LINE__);                                           \
    ASAN_POISON_MEMORY_REGION(addr, size);                                     \
  } while (0)

#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
  do {                                                                         \
    TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),             \
          (size_t)(size), __LINE__);                                           \
    ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
  } while (0)
  3592  /*
  3593   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>.
  3594   * and other libmdbx authors: please see AUTHORS file.
  3595   * All rights reserved.
  3596   *
  3597   * This code is derived from "LMDB engine" written by
  3598   * Howard Chu (Symas Corporation), which itself derived from btree.c
  3599   * written by Martin Hedenfalk.
  3600   *
  3601   * ---
  3602   *
  3603   * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
  3604   *
  3605   * Redistribution and use in source and binary forms, with or without
  3606   * modification, are permitted only as authorized by the OpenLDAP
  3607   * Public License.
  3608   *
  3609   * A copy of this license is available in the file LICENSE in the
  3610   * top-level directory of the distribution or, alternatively, at
  3611   * <http://www.OpenLDAP.org/license.html>.
  3612   *
  3613   * ---
  3614   *
  3615   * Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
  3616   *
  3617   * Permission to use, copy, modify, and distribute this software for any
  3618   * purpose with or without fee is hereby granted, provided that the above
  3619   * copyright notice and this permission notice appear in all copies.
  3620   *
  3621   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  3622   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  3623   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  3624   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  3625   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  3626   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  3627   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
  3628  
  3629  
  3630  /*------------------------------------------------------------------------------
  3631   * Internal inline functions */
  3632  
  3633  MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) {
  3634    assert(value > INT_MIN);
  3635    const unsigned expanded_sign =
  3636        (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1));
  3637    return ((unsigned)value + expanded_sign) ^ expanded_sign;
  3638  }
  3639  
/* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m,
                                                        unsigned e) {
  /* Decode an 11-bit mantissa m and 3-bit exponent e into a page count. */
  assert(m < 2048 && e < 8);
  return (pgno_t)(32768 + ((m + 1) << (e + 8)));
}

MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v,
                                                          unsigned e) {
  /* Encode page count v using exponent e; the caller must have chosen e so
   * that v falls within this exponent's representable band (see asserts). */
  assert(v > (e ? me2v(2047, e - 1) : 32768));
  assert(v <= me2v(2047, e));
  /* round up to the quantization step, then derive the mantissa */
  size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8);
  m -= m > 0;
  assert(m < 2048 && e < 8);
  // f e d c b a 9 8 7 6 5 4 3 2 1 0
  // 1 e e e m m m m m m m m m m m 1
  const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1));
  assert(pv != 65535);
  return pv;
}

/* Convert 16-bit packed (exponential quantized) value to number of pages */
MDBX_NOTHROW_CONST_FUNCTION static pgno_t pv2pages(uint16_t pv) {
  /* Values without both marker bits of 0x8001 set are stored verbatim. */
  if ((pv & 0x8001) != 0x8001)
    return pv;
  if (pv == 65535)
    return 65536;
  // f e d c b a 9 8 7 6 5 4 3 2 1 0
  // 1 e e e m m m m m m m m m m m 1
  return me2v((pv >> 1) & 2047, (pv >> 12) & 7);
}
  3671  
  3672  /* Convert number of pages to 16-bit packed (exponential quantized) value */
  3673  MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) {
  3674    if (pages < 32769 || (pages < 65536 && (pages & 1) == 0))
  3675      return (uint16_t)pages;
  3676    if (pages <= me2v(2047, 0))
  3677      return v2me(pages, 0);
  3678    if (pages <= me2v(2047, 1))
  3679      return v2me(pages, 1);
  3680    if (pages <= me2v(2047, 2))
  3681      return v2me(pages, 2);
  3682    if (pages <= me2v(2047, 3))
  3683      return v2me(pages, 3);
  3684    if (pages <= me2v(2047, 4))
  3685      return v2me(pages, 4);
  3686    if (pages <= me2v(2047, 5))
  3687      return v2me(pages, 5);
  3688    if (pages <= me2v(2047, 6))
  3689      return v2me(pages, 6);
  3690    return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533;
  3691  }
  3692  
  3693  /*------------------------------------------------------------------------------
  3694   * Unaligned access */
  3695  
  3696  MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned
  3697  field_alignment(unsigned alignment_baseline, size_t field_offset) {
  3698    unsigned merge = alignment_baseline | (unsigned)field_offset;
  3699    return merge & -(int)merge;
  3700  }
  3701  
  3702  /* read-thunk for UB-sanitizer */
  3703  MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t
  3704  peek_u8(const uint8_t *const __restrict ptr) {
  3705    return *ptr;
  3706  }
  3707  
  3708  /* write-thunk for UB-sanitizer */
  3709  static __always_inline void poke_u8(uint8_t *const __restrict ptr,
  3710                                      const uint8_t v) {
  3711    *ptr = v;
  3712  }
  3713  
/* Reads an u16 from a location aligned only to expected_alignment bytes:
 * a plain load when that is safe, otherwise an __unaligned access or memcpy
 * to avoid UB from a misaligned dereference. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t
unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0)
    return *(const uint16_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint16_t *)ptr;
#else
    uint16_t v;
    memcpy(&v, ptr, sizeof(v));
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

/* Writes an u16 to a location aligned only to expected_alignment bytes;
 * mirror of unaligned_peek_u16(). */
static __always_inline void
unaligned_poke_u16(const unsigned expected_alignment,
                   void *const __restrict ptr, const uint16_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0)
    *(uint16_t *)ptr = v;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    *((uint16_t __unaligned *)ptr) = v;
#else
    memcpy(ptr, &v, sizeof(v));
#endif /* _MSC_VER || __unaligned */
  }
}
  3746  
/* Reads an u32 from a location aligned only to expected_alignment bytes:
 * plain load when safe, two u16 halves when 2-byte aligned, otherwise an
 * __unaligned access or memcpy to avoid UB from misaligned dereference. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32(
    const unsigned expected_alignment, const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0)
    return *(const uint32_t *)ptr;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    /* the indices pick the half holding the low-order bits per host
     * endianness, reproducing a native-order load */
    const uint16_t lo =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint16_t hi =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint32_t)hi << 16;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint32_t *)ptr;
#else
    uint32_t v;
    memcpy(&v, ptr, sizeof(v));
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

/* Writes an u32 to a location aligned only to expected_alignment bytes;
 * mirror of unaligned_peek_u32(). */
static __always_inline void
unaligned_poke_u32(const unsigned expected_alignment,
                   void *const __restrict ptr, const uint32_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0)
    *(uint32_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v;
    ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint16_t)(v >> 16);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    *((uint32_t __unaligned *)ptr) = v;
#else
    memcpy(ptr, &v, sizeof(v));
#endif /* _MSC_VER || __unaligned */
  }
}
  3789  
/* Reads an u64 from a location aligned only to expected_alignment bytes:
 * plain load when safe, two u32 halves when 4-byte aligned, otherwise an
 * __unaligned access or memcpy to avoid UB from misaligned dereference. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64(
    const unsigned expected_alignment, const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const uint64_t *)ptr;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    /* the indices pick the half holding the low-order bits per host
     * endianness, reproducing a native-order load */
    const uint32_t lo =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    return *(const __unaligned uint64_t *)ptr;
#else
    uint64_t v;
    memcpy(&v, ptr, sizeof(v));
    return v;
#endif /* _MSC_VER || __unaligned */
  }
}

/* As unaligned_peek_u64(), but reads through a volatile pointer so the
 * compiler may not cache or elide the loads; requires at least 32-bit
 * alignment (see the second assert). */
static __always_inline uint64_t
unaligned_peek_u64_volatile(const unsigned expected_alignment,
                            const volatile void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  assert(expected_alignment % sizeof(uint32_t) == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const volatile uint64_t *)ptr;
  else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    return *(const volatile __unaligned uint64_t *)ptr;
#else
    /* note: composed from two separate 32-bit loads, hence not atomic */
    const uint32_t lo = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi = ((const volatile uint32_t *)
                             ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
#endif /* _MSC_VER || __unaligned */
  }
}

/* Writes an u64 to a location aligned only to expected_alignment bytes;
 * mirror of unaligned_peek_u64(). */
static __always_inline void
unaligned_poke_u64(const unsigned expected_alignment,
                   void *const __restrict ptr, const uint64_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0)
    *(uint64_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v;
    ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint32_t)(v >> 32);
  } else {
#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) ||            \
    defined(_M_X64) || defined(_M_IA64)
    *((uint64_t __unaligned *)ptr) = v;
#else
    memcpy(ptr, &v, sizeof(v));
#endif /* _MSC_VER || __unaligned */
  }
}
  3853  
/* Field-wise accessors: read/write a struct member through its byte offset,
 * assuming only 1-byte alignment (hence the literal `1` passed below), since
 * offsetof() may yield an odd offset within a packed layout. */
#define UNALIGNED_PEEK_8(ptr, struct, field)                                   \
  peek_u8((const uint8_t *)(ptr) + offsetof(struct, field))
#define UNALIGNED_POKE_8(ptr, struct, field, value)                            \
  poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value)

#define UNALIGNED_PEEK_16(ptr, struct, field)                                  \
  unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field))
#define UNALIGNED_POKE_16(ptr, struct, field, value)                           \
  unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value)

#define UNALIGNED_PEEK_32(ptr, struct, field)                                  \
  unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field))
#define UNALIGNED_POKE_32(ptr, struct, field, value)                           \
  unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value)

#define UNALIGNED_PEEK_64(ptr, struct, field)                                  \
  unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field))
#define UNALIGNED_POKE_64(ptr, struct, field, value)                           \
  unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value)
  3873  
/* Get the page number pointed to by a branch node */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
node_pgno(const MDBX_node *const __restrict node) {
  /* Low 32 bits live in mn_pgno32; when pgno_t is wider, bits 32..39 are
   * kept in the mn_extra byte. The sizeof check folds at compile time. */
  pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32);
  if (sizeof(pgno) > 4)
    pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32;
  return pgno;
}

/* Set the page number in a branch node */
static __always_inline void node_set_pgno(MDBX_node *const __restrict node,
                                          pgno_t pgno) {
  assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO);

  /* Mirror of node_pgno(): store the low 32 bits, then the extra byte. */
  UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno);
  if (sizeof(pgno) > 4)
    UNALIGNED_POKE_8(node, MDBX_node, mn_extra,
                     (uint8_t)((uint64_t)pgno >> 32));
}
  3893  
/* Get the size of the data in a leaf node */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_ds(const MDBX_node *const __restrict node) {
  return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize);
}

/* Set the size of the data for a leaf node */
static __always_inline void node_set_ds(MDBX_node *const __restrict node,
                                        size_t size) {
  assert(size < INT_MAX);
  UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size);
}

/* The size of a key in a node */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_ks(const MDBX_node *const __restrict node) {
  return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize);
}

/* Set the size of the key for a leaf node */
static __always_inline void node_set_ks(MDBX_node *const __restrict node,
                                        size_t size) {
  assert(size < INT16_MAX);
  UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size);
}

/* Get the node flags (F_BIGDATA / F_SUBDATA / F_DUPDATA) */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t
node_flags(const MDBX_node *const __restrict node) {
  return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags);
}

/* Set the node flags (F_BIGDATA / F_SUBDATA / F_DUPDATA) */
static __always_inline void node_set_flags(MDBX_node *const __restrict node,
                                           uint8_t flags) {
  UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags);
}
  3929  
/* Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDBX_node, mn_data)

/* Address of the key for the node: the key bytes immediately follow the
 * fixed-size header */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
node_key(const MDBX_node *const __restrict node) {
  return (char *)node + NODESIZE;
}

/* Address of the data for a node: the data bytes immediately follow the key */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
node_data(const MDBX_node *const __restrict node) {
  return (char *)node_key(node) + node_ks(node);
}

/* Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size. */
MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
node_size_len(const size_t key_len, const size_t value_len) {
  return NODESIZE + EVEN(key_len + value_len);
}
/* As node_size_len(), but takes optional MDBX_val pointers
 * (NULL means zero length) */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_size(const MDBX_val *key, const MDBX_val *value) {
  return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0);
}
  3955  
/* Reads a pgno_t from an arbitrarily-aligned location; the sizeof dispatch
 * below is resolved at compile time. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
peek_pgno(const void *const __restrict ptr) {
  if (sizeof(pgno_t) == sizeof(uint32_t))
    return (pgno_t)unaligned_peek_u32(1, ptr);
  else if (sizeof(pgno_t) == sizeof(uint64_t))
    return (pgno_t)unaligned_peek_u64(1, ptr);
  else {
    pgno_t pgno;
    memcpy(&pgno, ptr, sizeof(pgno));
    return pgno;
  }
}

/* Writes a pgno_t to an arbitrarily-aligned location; mirror of peek_pgno() */
static __always_inline void poke_pgno(void *const __restrict ptr,
                                      const pgno_t pgno) {
  if (sizeof(pgno) == sizeof(uint32_t))
    unaligned_poke_u32(1, ptr, pgno);
  else if (sizeof(pgno) == sizeof(uint64_t))
    unaligned_poke_u64(1, ptr, pgno);
  else
    memcpy(ptr, &pgno, sizeof(pgno));
}

/* For an F_BIGDATA node the data part holds the number of the overflow page
 * carrying the actual value; return that page number. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
node_largedata_pgno(const MDBX_node *const __restrict node) {
  assert(node_flags(node) & F_BIGDATA);
  return peek_pgno(node_data(node));
}
  3984  
  3985  /*------------------------------------------------------------------------------
  3986   * Nodes, Keys & Values length limitation factors:
  3987   *
  3988   * BRANCH_NODE_MAX
  3989   *   Branch-page must contain at least two nodes, within each a key and a child
  3990   *   page number. But page can't be splitted if it contains less that 4 keys,
  3991   *   i.e. a page should not overflow before adding the fourth key. Therefore,
  3992   *   at least 3 branch-node should fit in the single branch-page. Further, the
  3993   *   first node of a branch-page doesn't contain a key, i.e. the first node
  3994   *   is always require space just for itself. Thus:
  3995   *       PAGEROOM = pagesize - page_hdr_len;
  3996   *       BRANCH_NODE_MAX = even_floor(
  3997   *         (PAGEROOM - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
  3998   *       KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
  3999   *
  4000   * LEAF_NODE_MAX
  4001   *   Leaf-node must fit into single leaf-page, where a value could be placed on
  4002   *   a large/overflow page. However, may require to insert a nearly page-sized
  4003   *   node between two large nodes are already fill-up a page. In this case the
  4004   *   page must be splitted to two if some pair of nodes fits on one page, or
  4005   *   otherwise the page should be splitted to the THREE with a single node
  4006   *   per each of ones. Such 1-into-3 page splitting is costly and complex since
  4007   *   requires TWO insertion into the parent page, that could lead to split it
  4008   *   and so on up to the root. Therefore double-splitting is avoided here and
  4009   *   the maximum node size is half of a leaf page space:
  4010   *       LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t));
  4011   *       DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX;
  4012   *
  4013   *  - SubDatabase-node must fit into one leaf-page:
  4014   *       SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db);
  4015   *
  4016   *  - Dupsort values itself are a keys in a dupsort-subdb and couldn't be longer
  4017   *    than the KEYLEN_MAX. But dupsort node must not great than LEAF_NODE_MAX,
  4018   *    since dupsort value couldn't be placed on a large/overflow page:
  4019   *       DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
  4020   *                                 max(DATALEN_NO_OVERFLOW, sizeof(MDBX_db));
  4021   */
  4022  
/* Usable bytes per page, i.e. the page size minus the page header */
#define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ)
/* Round down to an even value (2-byte alignment of MDBX_node) */
#define EVEN_FLOOR(n) ((n) & ~(size_t)1)
/* Max branch/leaf node sizes; see the "length limitation factors"
 * explanation above for the derivation of these formulas. */
#define BRANCH_NODE_MAX(pagesize)                                              \
  (EVEN_FLOOR((PAGEROOM(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) -     \
              sizeof(indx_t)))
#define LEAF_NODE_MAX(pagesize)                                                \
  (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t))
/* Number of pgno_t slots fitting into one page's room, minus one
 * (GC-related, per the name) */
#define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1)
  4031  
/* Returns the maximal key size for a table with the given flags, derived
 * from the node-size limits (see the "length limitation factors" above). */
static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
         is_powerof2(pagesize));
  STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8);
  /* integer-keyed tables use fixed 8-byte keys */
  if (flags & MDBX_INTEGERKEY)
    return 8 /* sizeof(uint64_t) */;

  const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) - NODESIZE -
                    /* sizeof(uint64) as a key */ 8 >
                sizeof(MDBX_db));
  /* for dupsort-family tables the key must also fit into a leaf node
   * together with a nested MDBX_db record, so take the smaller bound */
  if (flags &
      (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
    const intptr_t max_dupsort_leaf_key =
        LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db);
    return (max_branch_key < max_dupsort_leaf_key)
               ? (unsigned)max_branch_key
               : (unsigned)max_dupsort_leaf_key;
  }
  return (unsigned)max_branch_key;
}
  4053  
/* Returns the maximal data/value size for a table with the given flags. */
static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
         is_powerof2(pagesize));

  /* integer-dup tables use fixed 8-byte values */
  if (flags & MDBX_INTEGERDUP)
    return 8 /* sizeof(uint64_t) */;

  /* dupsort values act as keys (see the note above), so the key limit applies */
  if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
    return keysize_max(pagesize, 0);

  /* otherwise limit by the smallest of: the hard byte ceiling, a quarter of
   * the page-list limit (converted to bytes), and half of MAX_MAPSIZE */
  const unsigned page_ln2 = log2n_powerof2(pagesize);
  const size_t hard = 0x7FF00000ul;
  const size_t hard_pages = hard >> page_ln2;
  STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO);
  const size_t pages_limit = MDBX_PGL_LIMIT / 4;
  const size_t limit =
      (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
  return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
}
  4073  
/* Convenience wrapper: max key size for this env assuming MDBX_DUPSORT. */
__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
  return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
}

/* Returns the max key size for the env's page size and the given db flags,
 * or -1 when the environment handle is missing or has a bad signature. */
__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
                                      MDBX_db_flags_t flags) {
  if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
    return -1;

  return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags);
}
  4085  
  4086  size_t mdbx_default_pagesize(void) {
  4087    size_t pagesize = osal_syspagesize();
  4088    ENSURE(nullptr, is_powerof2(pagesize));
  4089    pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE;
  4090    pagesize = (pagesize <= MAX_PAGESIZE) ? pagesize : MAX_PAGESIZE;
  4091    return pagesize;
  4092  }
  4093  
/* Returns the max key size for the given page size (the default page size is
 * substituted when pagesize < 1), or -1 if the page size is out of range or
 * not a power of two. */
__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize,
                                        MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
               pagesize > (intptr_t)MAX_PAGESIZE ||
               !is_powerof2((size_t)pagesize)))
    return -1;

  return keysize_max(pagesize, flags);
}

/* Returns the max data size for the env's page size and the given db flags,
 * or -1 when the environment handle is missing or has a bad signature. */
__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env,
                                      MDBX_db_flags_t flags) {
  if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
    return -1;

  return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags);
}

/* Returns the max data size for the given page size (the default page size
 * is substituted when pagesize < 1), or -1 if the page size is out of range
 * or not a power of two. */
__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
                                        MDBX_db_flags_t flags) {
  if (pagesize < 1)
    pagesize = (intptr_t)mdbx_default_pagesize();
  if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
               pagesize > (intptr_t)MAX_PAGESIZE ||
               !is_powerof2((size_t)pagesize)))
    return -1;

  return valsize_max(pagesize, flags);
}
  4125  
/* Calculate the size of a leaf node.
 *
 * The size depends on the environment's page size; if a data item
 * is too large it will be put onto an large/overflow page and the node
 * size will only include the key and not the data. Sizes are always
 * rounded up to an even number of bytes, to guarantee 2-byte alignment
 * of the MDBX_node headers. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) {
  size_t node_bytes = node_size(key, data);
  if (node_bytes > env->me_leaf_nodemax) {
    /* put on large/overflow page: in-page node keeps only the key
     * plus a pgno_t reference to the overflow chain */
    node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);
  }

  /* every node also consumes one indx_t slot in the page's pointer array */
  return node_bytes + sizeof(indx_t);
}
  4143  
/* Calculate the size of a branch node.
 *
 * The size should depend on the environment's page size but since
 * we currently don't support spilling large keys onto large/overflow
 * pages, it's simply the size of the MDBX_node header plus the
 * size of the key. Sizes are always rounded up to an even number
 * of bytes, to guarantee 2-byte alignment of the MDBX_node headers.
 *
 * [in] env The environment handle.
 * [in] key The key for the node.
 *
 * Returns The number of bytes needed to store the node. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
branch_size(const MDBX_env *env, const MDBX_val *key) {
  /* Size of a node in a branch page with a given key.
   * This is just the node header plus the key, there is no data. */
  size_t node_bytes = node_size(key, nullptr);
  if (unlikely(node_bytes > env->me_leaf_nodemax)) {
    /* put on large/overflow page */
    /* not implemented: oversized branch keys are a hard failure */
    mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__,
                     __LINE__);
    node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
  }

  /* every node also consumes one indx_t slot in the page's pointer array */
  return node_bytes + sizeof(indx_t);
}
  4171  
/* Derives the flags for a nested (sub-)database from its parent table's
 * flags: DUPFIXED carries over as-is, while the *DUP key-ordering flags
 * map to the corresponding plain-key flags via constant bit shifts that
 * are verified at compile time. */
MDBX_NOTHROW_CONST_FUNCTION static __always_inline uint16_t
flags_db2sub(uint16_t db_flags) {
  uint16_t sub_flags = db_flags & MDBX_DUPFIXED;

  /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
  STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) ==
                MDBX_INTEGERKEY);
  sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;

  /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
  STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) ==
                MDBX_REVERSEKEY);
  sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;

  return sub_flags;
}
  4190  
  4191  /*----------------------------------------------------------------------------*/
  4192  
/* Converts a page number to a byte offset within the mapped file,
 * using the cached log2 of the page size for a cheap shift. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
pgno2bytes(const MDBX_env *env, pgno_t pgno) {
  eASSERT(env, (1u << env->me_psize2log) == env->me_psize);
  return ((size_t)pgno) << env->me_psize2log;
}
  4198  
/* Returns a pointer to the page with the given number inside the mmap. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page *
pgno2page(const MDBX_env *env, pgno_t pgno) {
  return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno));
}
  4203  
/* Converts a byte count to a page count (floor division by page size,
 * implemented as a shift by the cached log2). */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
bytes2pgno(const MDBX_env *env, size_t bytes) {
  eASSERT(env, (env->me_psize >> env->me_psize2log) == 1);
  return (pgno_t)(bytes >> env->me_psize2log);
}
  4209  
/* Byte size of the given page count, rounded up to the OS page size. */
MDBX_NOTHROW_PURE_FUNCTION static size_t
pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) {
  return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize);
}
  4214  
/* Rounds a page count up so the covered bytes align to the OS page size. */
MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env,
                                                            pgno_t pgno) {
  return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
}
  4219  
/* Rounds a byte count up to the DB page size, then up to the OS page size. */
MDBX_NOTHROW_PURE_FUNCTION static size_t
bytes_align2os_bytes(const MDBX_env *env, size_t bytes) {
  return ceil_powerof2(ceil_powerof2(bytes, env->me_psize), env->me_os_psize);
}
  4224  
/* Address of first usable data byte in a page, after the header */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
page_data(const MDBX_page *mp) {
  return (char *)mp + PAGEHDRSZ;
}
  4230  
/* Inverse of page_data(): recovers the page header from a pointer to
 * the page's data area (the mp_ptrs member marks the payload start). */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page *
data_page(const void *data) {
  return container_of(data, MDBX_page, mp_ptrs);
}
  4235  
/* Returns the MDBX_meta structure stored in the data area of a meta page. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_meta *
page_meta(MDBX_page *mp) {
  return (MDBX_meta *)page_data(mp);
}
  4240  
/* Number of nodes on a page */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_numkeys(const MDBX_page *mp) {
  /* mp_lower counts bytes of the indx_t slot array; each slot is 2 bytes */
  return mp->mp_lower >> 1;
}
  4246  
/* The amount of space remaining in the page */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_room(const MDBX_page *mp) {
  /* gap between the index array (grows up) and node heap (grows down) */
  return mp->mp_upper - mp->mp_lower;
}
  4252  
/* Maximum free space in an empty page */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_space(const MDBX_env *env) {
  /* even header size preserves 2-byte alignment of MDBX_node headers */
  STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
  return env->me_psize - PAGEHDRSZ;
}
  4259  
  4260  MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
  4261  page_used(const MDBX_env *env, const MDBX_page *mp) {
  4262    return page_space(env) - page_room(mp);
  4263  }
  4264  
/* The percentage of space used in the page, in a percents. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double
page_fill(const MDBX_env *env, const MDBX_page *mp) {
  return page_used(env, mp) * 100.0 / page_space(env);
}
  4270  
  4271  /* The number of large/overflow pages needed to store the given size. */
  4272  MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
  4273  number_of_ovpages(const MDBX_env *env, size_t bytes) {
  4274    return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
  4275  }
  4276  
/* Returns a human-readable name for a page-type bitmask, for diagnostics.
 * Unknown combinations are formatted into the caller-provided 16-byte
 * scratch buffer, whose lifetime bounds the returned pointer. */
__cold static const char *pagetype_caption(const uint8_t type,
                                           char buf4unknown[16]) {
  switch (type) {
  case P_BRANCH:
    return "branch";
  case P_LEAF:
    return "leaf";
  case P_LEAF | P_SUBP:
    return "subleaf";
  case P_LEAF | P_LEAF2:
    return "dupfixed-leaf";
  case P_LEAF | P_LEAF2 | P_SUBP:
    return "dupfixed-subleaf";
  case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY:
    return "dupfixed-subleaf.legacy-dirty";
  case P_OVERFLOW:
    return "large";
  default:
    snprintf(buf4unknown, 16, "unknown_0x%x", type);
    return buf4unknown;
  }
}
  4299  
/* Logs a corruption diagnostic for a page and returns MDBX_CORRUPTED.
 * The page-identifying header line is emitted only once per distinct page
 * (tracked via the function-local static `prev`; not thread-safe, which is
 * acceptable for best-effort logging), followed by the caller's message. */
__cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3)
    bad_page(const MDBX_page *mp, const char *fmt, ...) {
  if (LOG_ENABLED(MDBX_LOG_ERROR)) {
    static const MDBX_page *prev;
    if (prev != mp) {
      char buf4unknown[16];
      prev = mp;
      debug_log(MDBX_LOG_ERROR, "badpage", 0,
                "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n",
                pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno,
                mp->mp_txnid);
    }

    va_list args;
    va_start(args, fmt);
    debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args);
    va_end(args);
  }
  return MDBX_CORRUPTED;
}
  4320  
/* Logs a notice about a suboptimal (but not corrupted) page.
 * Mirrors bad_page(): the page-identifying header is emitted only once per
 * distinct page via the function-local static `prev`, then the caller's
 * formatted message is appended. */
__cold static void MDBX_PRINTF_ARGS(2, 3)
    poor_page(const MDBX_page *mp, const char *fmt, ...) {
  if (LOG_ENABLED(MDBX_LOG_NOTICE)) {
    static const MDBX_page *prev;
    if (prev != mp) {
      char buf4unknown[16];
      prev = mp;
      debug_log(MDBX_LOG_NOTICE, "poorpage", 0,
                "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n",
                pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno,
                mp->mp_txnid);
    }

    va_list args;
    va_start(args, fmt);
    debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args);
    va_end(args);
  }
}
  4340  
/* Address of node i in page p */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node *
page_node(const MDBX_page *mp, unsigned i) {
  assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH);
  assert(page_numkeys(mp) > (unsigned)(i));
  /* node offsets must be even to keep MDBX_node headers 2-byte aligned */
  assert(mp->mp_ptrs[i] % 2 == 0);
  return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ);
}
  4349  
/* The address of a key in a LEAF2 page.
 * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
 * There are no node headers, keys are stored contiguously. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) {
  assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2));
  assert(mp->mp_leaf2_ksize == keysize);
  /* keysize parameter is used only by the assert above */
  (void)keysize;
  return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize);
}
  4360  
  4361  /* Set the node's key into keyptr. */
  4362  static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) {
  4363    keyptr->iov_len = node_ks(node);
  4364    keyptr->iov_base = node_key(node);
  4365  }
  4366  
  4367  /* Set the node's key into keyptr, if requested. */
  4368  static __always_inline void
  4369  get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) {
  4370    if (keyptr)
  4371      get_key(node, keyptr);
  4372  }
  4373  
  4374  /*------------------------------------------------------------------------------
  4375   * safe read/write volatile 64-bit fields on 32-bit architectures. */
  4376  
#ifndef atomic_store64
/* Stores a 64-bit value with the requested memory-order semantics.
 * On targets without native 64-bit atomics the value is written as two
 * 32-bit halves (low first), relying on the safe64_* protocol above this
 * layer to keep readers consistent. Returns the stored value. */
MDBX_MAYBE_UNUSED static __always_inline uint64_t
atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value,
               enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  p->weak = value;
  osal_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
#else  /* !MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  atomic_store32(&p->low, (uint32_t)value, mo_Relaxed);
  jitter4testing(true);
  atomic_store32(&p->high, (uint32_t)(value >> 32), order);
  jitter4testing(true);
#endif /* !MDBX_64BIT_ATOMIC */
  return value;
}
#endif /* atomic_store64 */
  4402  
#ifndef atomic_load64
/* Loads a 64-bit value with the requested memory-order semantics.
 * Without native 64-bit atomics the two halves are read repeatedly
 * (high, low, then re-read) until two consecutive readings agree,
 * which guards against observing a torn value mid-update. */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC */
        uint64_t
        atomic_load64(const volatile MDBX_atomic_uint64_t *p,
                      enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  osal_memory_fence(order, false);
  const uint64_t value = p->weak;
  if (order != mo_Relaxed)
    osal_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
#else  /* !MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32;
  jitter4testing(true);
  value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                        : mo_AcquireRelease);
  jitter4testing(true);
  /* re-read until two consecutive composed readings match */
  for (;;) {
    osal_compiler_barrier();
    uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32;
    jitter4testing(true);
    again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                          : mo_AcquireRelease);
    jitter4testing(true);
    if (likely(value == again))
      return value;
    value = again;
  }
#endif /* !MDBX_64BIT_ATOMIC */
}
#endif /* atomic_load64 */
  4444  
/* Hints the CPU/scheduler that the current thread is spin-waiting.
 * Uses the architecture-specific pause/yield instruction where one exists,
 * otherwise falls back to yielding the timeslice to the OS scheduler. */
static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
  YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
  __builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
  _Asm_hint(_HINT_PAUSE);
#else
  __asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) ||       \
    defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
  __yield();
#else
  __asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \
    __mips_isa_rev >= 2
  __asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
    defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
    defined(__MWERKS__) || defined(__sgi)
  /* raw opcode: pause/ssnop for MIPS toolchains lacking the mnemonic */
  __asm__ __volatile__(".word 0x00000140");
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
  sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
  pthread_yield();
#endif
}
  4476  
#if MDBX_64BIT_CAS
/* 64-bit compare-and-swap: atomically replaces *p with v iff *p == c.
 * Returns true on success. Dispatches to C11 atomics, GCC/Clang builtins,
 * MSVC intrinsics or macOS OSAtomic depending on the toolchain. */
static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c,
                                         uint64_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t));
#ifdef ATOMIC_LLONG_LOCK_FREE
  STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0);
#if ATOMIC_LLONG_LOCK_FREE < 2
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
#endif /* ATOMIC_LLONG_LOCK_FREE < 2 */
#else  /* defined(ATOMIC_LLONG_LOCK_FREE) */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
#endif
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  return c == (uint64_t)_InterlockedCompareExchange64(
                  (volatile __int64 *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#endif /* MDBX_64BIT_CAS */
  4503  
/* 32-bit compare-and-swap: atomically replaces *p with v iff *p == c.
 * Returns true on success. Same toolchain dispatch as atomic_cas64(). */
static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c,
                                         uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
#ifdef ATOMIC_INT_LOCK_FREE
  STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0);
#if ATOMIC_INT_LOCK_FREE < 2
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
#else
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return c ==
         (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
  4529  
/* Atomic 32-bit fetch-and-add: adds v to *p and returns the PREVIOUS value.
 * Same toolchain dispatch as the CAS helpers above. */
static __always_inline uint32_t atomic_add32(MDBX_atomic_uint32_t *p,
                                             uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
#ifdef ATOMIC_INT_LOCK_FREE
  STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0);
#if ATOMIC_INT_LOCK_FREE < 2
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
#else
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
  return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_fetch_and_add(&p->weak, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
#elif defined(__APPLE__)
  return OSAtomicAdd32Barrier(v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}

/* Atomic subtraction expressed as addition of the two's-complement negation. */
#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
  4556  
/* Advances a transaction id by one step. Without 64-bit CAS the id is
 * additionally bumped past any value whose low 32 bits are all-ones, so
 * safe64_reset()'s low-part increment cannot overflow into the high part. */
static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) {
  txnid += xMDBX_TXNID_STEP;
#if !MDBX_64BIT_CAS
  /* avoid overflow of low-part in safe64_reset() */
  txnid += (UINT32_MAX == (uint32_t)txnid);
#endif
  return txnid;
}
  4565  
/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */
static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p,
                                         bool single_writer) {
  if (single_writer) {
    /* no concurrent writers: a plain store of the invalid marker suffices */
#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#else
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */
  } else {
#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#elif MDBX_64BIT_CAS
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#else
    /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1
     * and overflow was preserved in safe64_txnid_next() */
    STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */
  }
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  jitter4testing(true);
}
  4594  
/* Invalidates *p only if it still equals `compare`; returns true on success. */
static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p,
                                                 txnid_t compare) {
  /* LY: This function is used to reset `mr_txnid` from hsr-handler in case
   *     of asynchronous cancellation of a read transaction. Therefore,
   *     there may be a collision between the cleanup performed here and
   *     asynchronous termination and restarting of the read transaction
   *     in another process/thread. In general we MUST NOT reset the
   *     `mr_txnid` if a new transaction was started (i.e. if `mr_txnid`
   *     was changed). */
#if MDBX_64BIT_CAS
  bool rc = atomic_cas64(p, compare, UINT64_MAX);
#else
  /* LY: There is no gold ratio here since shared mutex is too costly,
   *     in such way we must acquire/release it for every update of mr_txnid,
   *     i.e. twice for each read transaction). */
  bool rc = false;
  if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare &&
             atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) {
    /* low-part changed between check and CAS: undo and report failure */
    if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) !=
                 (uint32_t)compare))
      atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32));
    else
      rc = true;
  }
#endif /* MDBX_64BIT_CAS */
  jitter4testing(true);
  return rc;
}
  4622  
/* Publishes a valid value into a slot that MUST currently hold an invalid
 * (>= SAFE64_INVALID_THRESHOLD) marker. On 32-bit targets the low half is
 * written first so the slot never transiently reads as a stale valid value. */
static __always_inline void safe64_write(MDBX_atomic_uint64_t *p,
                                         const uint64_t v) {
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS
  atomic_store64(p, v, mo_AcquireRelease);
#else  /* MDBX_64BIT_ATOMIC */
  osal_compiler_barrier();
  /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */
  atomic_store32(&p->low, (uint32_t)v, mo_Relaxed);
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  jitter4testing(true);
  /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */
  atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC */
  assert(p->weak == v);
  jitter4testing(true);
}
  4640  
/* Reads a 64-bit slot; on targets without native 64-bit atomics, retries
 * until the composed value matches the raw storage, filtering out torn
 * readings produced by a concurrent two-half update. */
static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) {
  jitter4testing(true);
  uint64_t v;
  do
    v = atomic_load64(p, mo_AcquireRelease);
  while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak));
  return v;
}
  4649  
#if 0 /* unused for now: kept as reference for the safe64 validity checks */
MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) {
#if MDBX_WORDBITS >= 64
  return v < SAFE64_INVALID_THRESHOLD;
#else
  return (v >> 32) != UINT32_MAX;
#endif /* MDBX_WORDBITS */
}

MDBX_MAYBE_UNUSED static __always_inline bool
 safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) {
#if MDBX_64BIT_ATOMIC
  return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD;
#else
  return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX;
#endif /* MDBX_64BIT_ATOMIC */
}
#endif /* unused for now */
  4668  
/* non-atomic write with safety for reading a half-updated value */
static __always_inline void safe64_update(MDBX_atomic_uint64_t *p,
                                          const uint64_t v) {
#if MDBX_64BIT_ATOMIC
  atomic_store64(p, v, mo_Relaxed);
#else
  /* invalidate first, then publish, so concurrent readers never observe
   * a torn half-old/half-new value as valid */
  safe64_reset(p, true);
  safe64_write(p, v);
#endif /* MDBX_64BIT_ATOMIC */
}
  4679  
/* non-atomic increment with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC */
    void
    safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) {
  assert(v > 0);
  safe64_update(p, safe64_read(p) + v);
}
  4690  
  4691  /*----------------------------------------------------------------------------*/
  4692  /* rthc (tls keys and destructors) */
  4693  
/* One registered reader-table slice and the TLS key whose per-thread
 * destructor must release reader slots within it. */
typedef struct rthc_entry_t {
  MDBX_reader *begin;            /* first reader slot of the range */
  MDBX_reader *end;              /* one past the last reader slot */
  osal_thread_key_t thr_tls_key; /* TLS key bound to this range */
} rthc_entry_t;
  4699  
  4700  #if MDBX_DEBUG
  4701  #define RTHC_INITIAL_LIMIT 1
  4702  #else
  4703  #define RTHC_INITIAL_LIMIT 16
  4704  #endif
  4705  
  4706  static bin128_t bootid;
  4707  
  4708  #if defined(_WIN32) || defined(_WIN64)
  4709  static CRITICAL_SECTION rthc_critical_section;
  4710  static CRITICAL_SECTION lcklist_critical_section;
  4711  #else
  4712  
  4713  static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER;
  4714  static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER;
  4715  static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER;
  4716  static osal_thread_key_t rthc_key;
  4717  static MDBX_atomic_uint32_t rthc_pending;
  4718  
/* Builds a 64-bit per-thread signature mixing the thread id and the
 * TLS-variable address, with the `kind` tag placed in the byte that sits
 * lowest in memory regardless of host endianness. */
static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) {
  uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^
                  UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return salt << 8 | kind;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (uint64_t)kind << 56 | salt >> 8;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
}
  4730  
  4731  #define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D)
  4732  #define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0)
  4733  static __thread uint64_t rthc_thread_state;
  4734  
  4735  #if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) &&                     \
  4736      !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS)
  4737  /* Avoid ASAN-trap due the target TLS-variable feed by Darwin's tlv_free() */
  4738  #define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS                                     \
  4739    __attribute__((__no_sanitize_address__, __noinline__))
  4740  #else
  4741  #define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS __inline
  4742  #endif
  4743  
/* Volatile read of a thread-state signature; the no-sanitize attribute
 * avoids an ASAN trap when the TLS slot is being torn down by the runtime. */
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) {
  return *(volatile uint64_t *)rthc;
}
  4747  
/* Clears the thread-state signature iff it still matches `signature`;
 * returns non-zero on success. Without 64-bit CAS, only the 32-bit half
 * that sits lowest in memory (endian-dependent) is compared-and-cleared,
 * which is sufficient since it contains the kind tag. */
MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t
rthc_compare_and_clean(const void *rthc, const uint64_t signature) {
#if MDBX_64BIT_CAS
  return atomic_cas64((MDBX_atomic_uint64_t *)rthc, signature, 0);
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)signature, 0);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32),
                      0);
#else
#error "FIXME: Unsupported byte order"
#endif
}
  4761  
/* Registers a thread-exit destructor for `obj` via the platform's
 * C++-runtime TLS-destructor hook (__cxa_thread_atexit[_impl] on
 * glibc/libc++abi, _tlv_atexit on Darwin), probed as a weak import.
 * Returns 0 on success or MDBX_ENOSYS when no such hook is available,
 * in which case the caller falls back to pthread TSD destructors. */
static __inline int rthc_atexit(void (*dtor)(void *), void *obj,
                                void *dso_symbol) {
#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) ||                           \
    defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) ||         \
    defined(ANDROID)
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1
#else
#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0
#endif
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */

#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT
#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) ||                                \
    defined(HAVE___CXA_THREAD_ATEXIT)
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL &&                                     \
    (defined(__linux__) || defined(__gnu_linux__))
#define MDBX_HAVE_CXA_THREAD_ATEXIT 1
#else
#define MDBX_HAVE_CXA_THREAD_ATEXIT 0
#endif
#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */

  int rc = MDBX_ENOSYS;
#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT
#define __cxa_thread_atexit __cxa_thread_atexit_impl
#endif
#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit)
  /* weak import: present only when the C++ runtime provides the hook */
  extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj,
                                 void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE;
  if (&__cxa_thread_atexit)
    rc = __cxa_thread_atexit(dtor, obj, dso_symbol);
#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE)
  extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr)
      MDBX_WEAK_IMPORT_ATTRIBUTE;
  if (&_tlv_atexit) {
    (void)dso_symbol;
    _tlv_atexit(dtor, obj);
    rc = 0;
  }
#else
  (void)dtor;
  (void)obj;
  (void)dso_symbol;
#endif
  return rc;
}
  4810  
  4811  __cold static void workaround_glibc_bug21031(void) {
  4812    /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
  4813     *
  4814     * Due race between pthread_key_delete() and __nptl_deallocate_tsd()
  4815     * The destructor(s) of thread-local-storage object(s) may be running
  4816     * in another thread(s) and be blocked or not finished yet.
  4817     * In such case we get a SEGFAULT after unload this library DSO.
  4818     *
  4819     * So just by yielding a few timeslices we give a chance
  4820     * to such destructor(s) for completion and avoids segfault. */
  4821    sched_yield();
  4822    sched_yield();
  4823    sched_yield();
  4824  }
  4825  #endif
  4826  
  4827  static unsigned rthc_count, rthc_limit;
  4828  static rthc_entry_t *rthc_table;
  4829  static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];
  4830  
/* Acquires the global rthc registry lock (critical section on Windows,
 * pthread mutex elsewhere); failure to lock is fatal via ENSURE. */
static __inline void rthc_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0);
#endif
}
  4838  
/* Releases the global rthc registry lock; failure is fatal via ENSURE. */
static __inline void rthc_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
#endif
}
  4846  
/* Allocates a TLS key (TlsAlloc on Windows, pthread_key_create elsewhere;
 * no destructor is attached here). Returns MDBX_SUCCESS or an OS error. */
static __inline int thread_key_create(osal_thread_key_t *key) {
  int rc;
#if defined(_WIN32) || defined(_WIN64)
  *key = TlsAlloc();
  rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
#else
  rc = pthread_key_create(key, nullptr);
#endif
  TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key),
        (uintptr_t)*key, rc);
  return rc;
}
  4859  
/* Releases a TLS key previously created by thread_key_create().
 * On POSIX, yields a few timeslices afterwards to dodge glibc bug 21031
 * (race between pthread_key_delete() and TSD destructors on DSO unload). */
static __inline void thread_key_delete(osal_thread_key_t key) {
  TRACE("key = %" PRIuPTR, (uintptr_t)key);
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsFree(key));
#else
  ENSURE(nullptr, pthread_key_delete(key) == 0);
  workaround_glibc_bug21031();
#endif
}
  4869  
/* Reads the calling thread's value for the given TLS key
 * (nullptr when unset). */
static __inline void *thread_rthc_get(osal_thread_key_t key) {
#if defined(_WIN32) || defined(_WIN64)
  return TlsGetValue(key);
#else
  return pthread_getspecific(key);
#endif
}
  4877  
/* Stores `value` under `key` for the calling thread and, on POSIX, lazily
 * registers the thread for teardown: first via the libc/loader TLS-dtor
 * mechanism (rthc_atexit); if that fails, falls back to pthreads TSD with
 * `rthc_key` and counts the thread in `rthc_pending` so global_dtor() can
 * wait for it. */
static void thread_rthc_set(osal_thread_key_t key, const void *value) {
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsSetValue(key, (void *)value));
#else
  const uint64_t sign_registered =
      MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state);
  const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state);
  /* Register only once per thread: skip when already in either state. */
  if (value && unlikely(rthc_thread_state != sign_registered &&
                        rthc_thread_state != sign_counted)) {
    rthc_thread_state = sign_registered;
    TRACE("thread registered 0x%" PRIxPTR, osal_thread_self());
    /* Non-zero return from rthc_atexit() means the dtor could not be
     * registered there — fall back to pthreads TSD and bump the counter
     * that global_dtor() waits on. */
    if (rthc_atexit(thread_dtor, &rthc_thread_state,
                    (void *)&mdbx_version /* dso_anchor */)) {
      ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0);
      rthc_thread_state = sign_counted;
      const unsigned count_before = atomic_add32(&rthc_pending, 1);
      ENSURE(nullptr, count_before < INT_MAX);
      NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
             (uintptr_t)rthc_key, count_before);
      (void)count_before;
    }
  }
  ENSURE(nullptr, pthread_setspecific(key, value) == 0);
#endif
}
  4903  
/* dtor called for thread, i.e. for all mdbx's environment objects.
 * Releases every reader slot still owned by the exiting thread, then (on
 * POSIX) clears this thread's registration state and wakes global_dtor()
 * when no registered threads remain pending. */
__cold void thread_dtor(void *rthc) {
  rthc_lock();
  TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(),
        osal_thread_self(), rthc);

  /* Scan every registered reader-table slice for a TLS value pointing
   * into it, i.e. a reader slot bound to this thread. */
  const uint32_t self_pid = osal_getpid();
  for (unsigned i = 0; i < rthc_count; ++i) {
    const osal_thread_key_t key = rthc_table[i].thr_tls_key;
    MDBX_reader *const reader = thread_rthc_get(key);
    /* NOTE(review): relational comparison of pointers into distinct
     * objects is formally UB in ISO C; used here intentionally as a
     * range check against the mapped reader table. */
    if (reader < rthc_table[i].begin || reader >= rthc_table[i].end)
      continue;
#if !defined(_WIN32) && !defined(_WIN64)
    if (pthread_setspecific(key, nullptr) != 0) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p: ignore race with tsd-key deletion",
            osal_thread_self(), __Wpedantic_format_voidptr(reader));
      continue /* ignore race with tsd-key deletion by mdbx_env_close() */;
    }
#endif

    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, "
          "current-pid %i",
          osal_thread_self(), __Wpedantic_format_voidptr(reader), i,
          __Wpedantic_format_voidptr(rthc_table[i].begin),
          __Wpedantic_format_voidptr(rthc_table[i].end),
          (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid);
    if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) {
      TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(),
            __Wpedantic_format_voidptr(reader));
      /* CAS rather than a plain store: only clear the slot if it is
       * still owned by this process. */
      atomic_cas32(&reader->mr_pid, self_pid, 0);
    }
  }

#if defined(_WIN32) || defined(_WIN64)
  TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc);
  rthc_unlock();
#else
  /* Transition this thread's registration state (see thread_rthc_set)
   * to "cleaned", decrementing rthc_pending for the pthreads-TSD
   * fallback path. */
  const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
  const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
  const uint64_t state = rthc_read(rthc);
  if (state == sign_registered &&
      rthc_compare_and_clean(rthc, sign_registered)) {
    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
          osal_thread_self(), rthc, osal_getpid(), "registered", state);
  } else if (state == sign_counted &&
             rthc_compare_and_clean(rthc, sign_counted)) {
    TRACE("== thread 0x%" PRIxPTR
          ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
          osal_thread_self(), rthc, osal_getpid(), "counted", state);
    ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
  } else {
    WARNING("thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), rthc, osal_getpid(), "wrong", state);
  }

  /* Wake global_dtor() if it is waiting for the last pending thread. */
  if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) {
    TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(),
          rthc, osal_getpid());
    ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0);
  }

  TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc);
  /* Allow tail call optimization, i.e. gcc should generate the jmp instruction
   * instead of a call for pthread_mutex_unlock() and therefore CPU could not
   * return to current DSO's code section, which may be unloaded immediately
   * after the mutex got released. */
  pthread_mutex_unlock(&rthc_mutex);
#endif
}
  4977  
MDBX_EXCLUDE_FOR_GPROF
/* Process-wide teardown: runs this thread's own rthc cleanup, waits
 * (bounded) for other registered threads' destructors, deletes all TLS
 * keys, clears reader slots owned by this process, and releases the
 * registry table. */
__cold void global_dtor(void) {
  TRACE(">> pid %d", osal_getpid());

  rthc_lock();
#if !defined(_WIN32) && !defined(_WIN64)
  /* First handle the calling thread's own registration state, mirroring
   * the POSIX branch of thread_dtor(). */
  uint64_t *rthc = pthread_getspecific(rthc_key);
  TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64
        ", left %d",
        osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(),
        rthc ? rthc_read(rthc) : ~UINT64_C(0),
        atomic_load32(&rthc_pending, mo_Relaxed));
  if (rthc) {
    const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc);
    const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc);
    const uint64_t state = rthc_read(rthc);
    if (state == sign_registered &&
        rthc_compare_and_clean(rthc, sign_registered)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(),
            "registered", state);
    } else if (state == sign_counted &&
               rthc_compare_and_clean(rthc, sign_counted)) {
      TRACE("== thread 0x%" PRIxPTR
            ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
            osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(),
            "counted", state);
      ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
    } else {
      WARNING("thread 0x%" PRIxPTR
              ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")",
              osal_thread_self(), __Wpedantic_format_voidptr(rthc),
              osal_getpid(), "wrong", state);
    }
  }

  /* Wait up to ~0.1s (plus 10 minutes in debug builds) for destructors
   * of other registered threads to drain rthc_pending; thread_dtor()
   * broadcasts rthc_cond when the counter reaches zero. */
  struct timespec abstime;
  ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0);
  abstime.tv_nsec += 1000000000l / 10;
  if (abstime.tv_nsec >= 1000000000l) {
    abstime.tv_nsec -= 1000000000l;
    abstime.tv_sec += 1;
  }
#if MDBX_DEBUG > 0
  abstime.tv_sec += 600;
#endif

  for (unsigned left;
       (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) {
    NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left);
    /* rthc_mutex is held here (rthc_lock above); timedwait releases and
     * reacquires it. Bail out on timeout or any error except EINTR. */
    const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime);
    if (rc && rc != EINTR)
      break;
  }
  thread_key_delete(rthc_key);
#endif

  /* Delete every registered TLS key and clear all reader slots that are
   * still marked as owned by this process. */
  const uint32_t self_pid = osal_getpid();
  for (unsigned i = 0; i < rthc_count; ++i) {
    const osal_thread_key_t key = rthc_table[i].thr_tls_key;
    thread_key_delete(key);
    for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
         ++rthc) {
      TRACE("== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), "
            "rthc-pid %i, current-pid %i",
            i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin),
            __Wpedantic_format_voidptr(rthc_table[i].end),
            __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin),
            rthc->mr_pid.weak, self_pid);
      if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
        atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
        TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc));
      }
    }
  }

  /* Reset the registry, freeing the table only if it was heap-grown. */
  rthc_limit = rthc_count = 0;
  if (rthc_table != rthc_table_static)
    osal_free(rthc_table);
  rthc_table = nullptr;
  rthc_unlock();

#if defined(_WIN32) || defined(_WIN64)
  DeleteCriticalSection(&lcklist_critical_section);
  DeleteCriticalSection(&rthc_critical_section);
#else
  /* LY: yielding a few timeslices to give a more chance
   * to racing destructor(s) for completion. */
  workaround_glibc_bug21031();
#endif

  TRACE("<< pid %d\n", osal_getpid());
}
  5072  
/* Allocates a TLS key bound to the reader slice [begin, end) and records
 * the pair in the rthc registry, so thread_dtor()/global_dtor() can
 * release reader slots on thread/process teardown.
 * Returns MDBX_SUCCESS, MDBX_ENOMEM, or a thread_key_create() error code;
 * on success *pkey receives the new key. */
__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin,
                      MDBX_reader *end) {
  assert(pkey != NULL);
#ifndef NDEBUG
  *pkey = (osal_thread_key_t)0xBADBADBAD; /* poison value for debug builds */
#endif /* NDEBUG */

  rthc_lock();
  TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit);
  int rc;
  if (rthc_count == rthc_limit) {
    /* Table full: double its capacity. The initial static table cannot be
     * realloc'ed, so pass nullptr and copy its contents afterwards.
     * NOTE(review): assumes rthc_limit > 0 here (i.e. pre-set to
     * RTHC_INITIAL_LIMIT elsewhere) — verify against the global ctor. */
    rthc_entry_t *new_table =
        osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table,
                     sizeof(rthc_entry_t) * rthc_limit * 2);
    if (new_table == nullptr) {
      rc = MDBX_ENOMEM;
      goto bailout;
    }
    if (rthc_table == rthc_table_static)
      memcpy(new_table, rthc_table_static, sizeof(rthc_table_static));
    rthc_table = new_table;
    rthc_limit *= 2;
  }

  rc = thread_key_create(&rthc_table[rthc_count].thr_tls_key);
  if (rc != MDBX_SUCCESS)
    goto bailout;

  *pkey = rthc_table[rthc_count].thr_tls_key;
  TRACE("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, (uintptr_t)*pkey,
        __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end));

  rthc_table[rthc_count].begin = begin;
  rthc_table[rthc_count].end = end;
  ++rthc_count;
  TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey,
        rthc_count, rthc_limit);
  rthc_unlock();
  return MDBX_SUCCESS;

bailout:
  rthc_unlock();
  return rc;
}
  5117  
/* Deletes the TLS key and removes its registry entry, clearing any reader
 * slots in the associated slice still owned by this process. */
__cold void rthc_remove(const osal_thread_key_t key) {
  /* The key is deleted before taking the lock; thread_dtor() tolerates
   * the resulting race via its pthread_setspecific() failure check. */
  thread_key_delete(key);
  rthc_lock();
  TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count,
        rthc_limit);

  for (unsigned i = 0; i < rthc_count; ++i) {
    if (key == rthc_table[i].thr_tls_key) {
      const uint32_t self_pid = osal_getpid();
      TRACE("== [%i], %p ...%p, current-pid %d", i,
            __Wpedantic_format_voidptr(rthc_table[i].begin),
            __Wpedantic_format_voidptr(rthc_table[i].end), self_pid);

      for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
           ++rthc) {
        if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
          atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
          TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc));
        }
      }
      /* Unordered removal: move the last entry into the freed slot; when
       * the table becomes empty, shrink back to the static array. */
      if (--rthc_count > 0)
        rthc_table[i] = rthc_table[rthc_count];
      else if (rthc_table != rthc_table_static) {
        osal_free(rthc_table);
        rthc_table = rthc_table_static;
        rthc_limit = RTHC_INITIAL_LIMIT;
      }
      break;
    }
  }

  TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count,
        rthc_limit);
  rthc_unlock();
}
  5153  
  5154  //------------------------------------------------------------------------------
  5155  
/* Head of the singly-linked list (via me_lcklist_next) of environments
 * open in this process; terminated by an arbitrary non-null sentinel
 * value that is never dereferenced. */
#define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459))
static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END;
  5158  
/* Acquires the global lock serializing access to the in-process
 * environment list. */
static __inline void lcklist_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&lcklist_critical_section);
#else
  ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0);
#endif
}
  5166  
/* Releases the lock taken by lcklist_lock(). */
static __inline void lcklist_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&lcklist_critical_section);
#else
  ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0);
#endif
}
  5174  
  5175  MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) {
  5176    /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */
  5177    v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50);
  5178    v *= UINT64_C(0xA24BAED4963EE407);
  5179    v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49);
  5180    v *= UINT64_C(0x9FB21C651E98DF25);
  5181    return v ^ v >> 28;
  5182  }
  5183  
/* Compares the "bait" uniqueness value of `pending` against `scan` to
 * detect whether both refer to the same LCK file. The bait is read from
 * the pending mmap when it is already mapped, otherwise via pread() from
 * the file after msync'ing `scan`. Returns MDBX_RESULT_TRUE when the
 * baits match (same file), MDBX_SUCCESS when they differ, or an error. */
static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) {
  int rc;
  uint64_t bait;
  MDBX_lockinfo *const pending_lck = pending->lck;
  MDBX_lockinfo *const scan_lck = scan->lck;
  if (pending_lck) {
    bait = atomic_load64(&pending_lck->mti_bait_uniqueness, mo_AcquireRelease);
    rc = MDBX_SUCCESS;
  } else {
    bait = 0 /* hush MSVC warning */;
    rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA);
    if (rc == MDBX_SUCCESS)
      rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness),
                      offsetof(MDBX_lockinfo, mti_bait_uniqueness));
  }
  if (likely(rc == MDBX_SUCCESS) &&
      bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease))
    rc = MDBX_RESULT_TRUE;

  TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d",
        pending_lck ? "mem" : "file", bait,
        (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc);
  return rc;
}
  5208  
/* Writes a fresh pseudo-random bait value into `scan`'s lockinfo and then
 * probes it through `pending` via uniq_peek(). `*abra` is the caller's
 * PRNG state, lazily seeded from the monotonic clock and thread id, then
 * advanced with an LCG step on each call. */
static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan,
                     uint64_t *abra) {
  if (*abra == 0) {
    const uintptr_t tid = osal_thread_self();
    uintptr_t uit = 0;
    /* memcpy handles uintptr_t widths differing from the thread-id type. */
    memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit));
    *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit);
  }
  const uint64_t cadabra =
      rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid())
          << 24 |
      *abra >> 40;
  MDBX_lockinfo *const scan_lck = scan->lck;
  atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease);
  *abra = *abra * UINT64_C(6364136223846793005) + 1;
  return uniq_peek(pending, scan);
}
  5226  
/* Determines whether `pending` refers to an LCK file already open by an
 * environment in this process. Returns MDBX_RESULT_TRUE when unique,
 * MDBX_RESULT_FALSE (with *found set to the neighbor) when another
 * in-process environment shares the file, or an error code. */
__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) {
  *found = nullptr;
  uint64_t salt = 0;
  for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END;
       scan = scan->me_lcklist_next) {
    MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck;
    /* Reuse an existing bait when present, otherwise plant a fresh one. */
    int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)
                  ? uniq_peek(pending, &scan->me_lck_mmap)
                  : uniq_poke(pending, &scan->me_lck_mmap, &salt);
    if (err == MDBX_ENODATA) {
      uint64_t length;
      if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS &&
                 length == 0)) {
        /* LY: skip checking since LCK-file is empty, i.e. just created. */
        DEBUG("uniq-probe: %s", "unique (new/empty lck)");
        return MDBX_RESULT_TRUE;
      }
    }
    /* Re-poke up to twice (with an msync in between) to rule out stale
     * matches before concluding the files really are the same. */
    if (err == MDBX_RESULT_TRUE)
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
    if (err == MDBX_RESULT_TRUE) {
      (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo),
                       MDBX_SYNC_NONE);
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
    }
    if (err == MDBX_RESULT_TRUE) {
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
      *found = scan;
      DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found));
      return MDBX_RESULT_FALSE;
    }
    if (unlikely(err != MDBX_SUCCESS)) {
      DEBUG("uniq-probe: failed rc %d", err);
      return err;
    }
  }

  DEBUG("uniq-probe: %s", "unique");
  return MDBX_RESULT_TRUE;
}
  5267  
/* Unlinks `env` from the in-process environment list (lcklist must be
 * held by the caller), then destroys its LCK resources unless another
 * in-process environment still shares the same LCK file. */
static int lcklist_detach_locked(MDBX_env *env) {
  MDBX_env *inprocess_neighbor = nullptr;
  int rc = MDBX_SUCCESS;
  if (env->me_lcklist_next != nullptr) {
    /* NOTE(review): this ENSURE restates the enclosing if-condition and
     * is redundant; harmless, kept as a belt-and-braces invariant. */
    ENSURE(env, env->me_lcklist_next != nullptr);
    ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END);
    for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END;
         ptr = &(*ptr)->me_lcklist_next) {
      if (*ptr == env) {
        *ptr = env->me_lcklist_next;
        env->me_lcklist_next = nullptr;
        break;
      }
    }
    ENSURE(env, env->me_lcklist_next == nullptr);
  }

  /* A pid mismatch means this env was inherited across fork(): refuse. */
  rc = likely(osal_getpid() == env->me_pid)
           ? uniq_check(&env->me_lck_mmap, &inprocess_neighbor)
           : MDBX_PANIC;
  if (!inprocess_neighbor && env->me_live_reader)
    (void)osal_rpid_clear(env);
  if (!MDBX_IS_ERROR(rc))
    rc = osal_lck_destroy(env, inprocess_neighbor);
  return rc;
}
  5294  
  5295  /*------------------------------------------------------------------------------
  5296   * LY: State of the art quicksort-based sorting, with internal stack
  5297   * and network-sort for small chunks.
  5298   * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */
  5299  
/* SORT_CMP_SWAP(TYPE, CMP, a, b): conditionally exchanges the lvalues
 * `a` and `b` so that CMP(a, b) holds afterwards, i.e. sorts the pair.
 * Arguments are evaluated more than once — they must be side-effect-free. */
#if MDBX_HAVE_CMOV
/* Branchless form: both assignments are unconditional selects, letting
 * the compiler emit conditional-move instructions instead of a branch. */
#define SORT_CMP_SWAP(TYPE, CMP, a, b)                                         \
  do {                                                                         \
    const TYPE swap_tmp = (a);                                                 \
    const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5);    \
    (a) = swap_cmp ? swap_tmp : b;                                             \
    (b) = swap_cmp ? b : swap_tmp;                                             \
  } while (0)
#else
/* Branchy form for targets without useful conditional-move support. */
#define SORT_CMP_SWAP(TYPE, CMP, a, b)                                         \
  do                                                                           \
    if (expect_with_probability(!CMP(a, b), 0, .5)) {                          \
      const TYPE swap_tmp = (a);                                               \
      (a) = (b);                                                               \
      (b) = swap_tmp;                                                          \
    }                                                                          \
  while (0)
#endif
  5318  
  5319  //  3 comparators, 3 parallel operations
  5320  //  o-----^--^--o
  5321  //        |  |
  5322  //  o--^--|--v--o
  5323  //     |  |
  5324  //  o--v--v-----o
  5325  //
  5326  //  [[1,2]]
  5327  //  [[0,2]]
  5328  //  [[0,1]]
/* Sorts begin[0..2] in place via the 3-element network diagrammed above. */
#define SORT_NETWORK_3(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
  } while (0)
  5335  
  5336  //  5 comparators, 3 parallel operations
  5337  //  o--^--^--------o
  5338  //     |  |
  5339  //  o--v--|--^--^--o
  5340  //        |  |  |
  5341  //  o--^--v--|--v--o
  5342  //     |     |
  5343  //  o--v-----v-----o
  5344  //
  5345  //  [[0,1],[2,3]]
  5346  //  [[0,2],[1,3]]
  5347  //  [[1,2]]
/* Sorts begin[0..3] in place via the 4-element network diagrammed above. */
#define SORT_NETWORK_4(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
  } while (0)
  5356  
  5357  //  9 comparators, 5 parallel operations
  5358  //  o--^--^-----^-----------o
  5359  //     |  |     |
  5360  //  o--|--|--^--v-----^--^--o
  5361  //     |  |  |        |  |
  5362  //  o--|--v--|--^--^--|--v--o
  5363  //     |     |  |  |  |
  5364  //  o--|-----v--|--v--|--^--o
  5365  //     |        |     |  |
  5366  //  o--v--------v-----v--v--o
  5367  //
  5368  //  [[0,4],[1,3]]
  5369  //  [[0,2]]
  5370  //  [[2,4],[0,1]]
  5371  //  [[2,3],[1,4]]
  5372  //  [[1,2],[3,4]]
/* Sorts begin[0..4] in place via the 5-element network diagrammed above. */
#define SORT_NETWORK_5(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
  } while (0)
  5385  
  5386  //  12 comparators, 6 parallel operations
  5387  //  o-----^--^--^-----------------o
  5388  //        |  |  |
  5389  //  o--^--|--v--|--^--------^-----o
  5390  //     |  |     |  |        |
  5391  //  o--v--v-----|--|--^--^--|--^--o
  5392  //              |  |  |  |  |  |
  5393  //  o-----^--^--v--|--|--|--v--v--o
  5394  //        |  |     |  |  |
  5395  //  o--^--|--v-----v--|--v--------o
  5396  //     |  |           |
  5397  //  o--v--v-----------v-----------o
  5398  //
  5399  //  [[1,2],[4,5]]
  5400  //  [[0,2],[3,5]]
  5401  //  [[0,1],[3,4],[2,5]]
  5402  //  [[0,3],[1,4]]
  5403  //  [[2,4],[1,3]]
  5404  //  [[2,3]]
/* Sorts begin[0..5] in place via the 6-element network diagrammed above. */
#define SORT_NETWORK_6(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
  } while (0)
  5420  
  5421  //  16 comparators, 6 parallel operations
  5422  //  o--^--------^-----^-----------------o
  5423  //     |        |     |
  5424  //  o--|--^-----|--^--v--------^--^-----o
  5425  //     |  |     |  |           |  |
  5426  //  o--|--|--^--v--|--^-----^--|--v-----o
  5427  //     |  |  |     |  |     |  |
  5428  //  o--|--|--|-----v--|--^--v--|--^--^--o
  5429  //     |  |  |        |  |     |  |  |
  5430  //  o--v--|--|--^-----v--|--^--v--|--v--o
  5431  //        |  |  |        |  |     |
  5432  //  o-----v--|--|--------v--v-----|--^--o
  5433  //           |  |                 |  |
  5434  //  o--------v--v-----------------v--v--o
  5435  //
  5436  //  [[0,4],[1,5],[2,6]]
  5437  //  [[0,2],[1,3],[4,6]]
  5438  //  [[2,4],[3,5],[0,1]]
  5439  //  [[2,3],[4,5]]
  5440  //  [[1,4],[3,6]]
  5441  //  [[1,2],[3,4],[5,6]]
/* Sorts begin[0..6] in place via the 7-element network diagrammed above. */
#define SORT_NETWORK_7(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
  } while (0)
  5461  
  5462  //  19 comparators, 6 parallel operations
  5463  //  o--^--------^-----^-----------------o
  5464  //     |        |     |
  5465  //  o--|--^-----|--^--v--------^--^-----o
  5466  //     |  |     |  |           |  |
  5467  //  o--|--|--^--v--|--^-----^--|--v-----o
  5468  //     |  |  |     |  |     |  |
  5469  //  o--|--|--|--^--v--|--^--v--|--^--^--o
  5470  //     |  |  |  |     |  |     |  |  |
  5471  //  o--v--|--|--|--^--v--|--^--v--|--v--o
  5472  //        |  |  |  |     |  |     |
  5473  //  o-----v--|--|--|--^--v--v-----|--^--o
  5474  //           |  |  |  |           |  |
  5475  //  o--------v--|--v--|--^--------v--v--o
  5476  //              |     |  |
  5477  //  o-----------v-----v--v--------------o
  5478  //
  5479  //  [[0,4],[1,5],[2,6],[3,7]]
  5480  //  [[0,2],[1,3],[4,6],[5,7]]
  5481  //  [[2,4],[3,5],[0,1],[6,7]]
  5482  //  [[2,3],[4,5]]
  5483  //  [[1,4],[3,6]]
  5484  //  [[1,2],[3,4],[5,6]]
/* Sorts begin[0..7] in place via the 8-element network diagrammed above. */
#define SORT_NETWORK_8(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
  } while (0)
  5507  
/* Finishes sorting a small range of 0..8 elements by dispatching to the
 * fixed sorting networks above. `len` MUST be in [0, 8]; any other value
 * hits the assert/__unreachable() in the default branch. The `end` argument
 * is not used by any branch — it is kept only for a uniform call shape from
 * SORT_IMPL. */
#define SORT_INNER(TYPE, CMP, begin, end, len)                                 \
  switch (len) {                                                               \
  default:                                                                     \
    assert(false);                                                             \
    __unreachable();                                                           \
  case 0:                                                                      \
  case 1:                                                                      \
    break;                                                                     \
  case 2:                                                                      \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    break;                                                                     \
  case 3:                                                                      \
    SORT_NETWORK_3(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 4:                                                                      \
    SORT_NETWORK_4(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 5:                                                                      \
    SORT_NETWORK_5(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 6:                                                                      \
    SORT_NETWORK_6(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 7:                                                                      \
    SORT_NETWORK_7(TYPE, CMP, begin);                                          \
    break;                                                                     \
  case 8:                                                                      \
    SORT_NETWORK_8(TYPE, CMP, begin);                                          \
    break;                                                                     \
  }
  5538  
  5539  #define SORT_SWAP(TYPE, a, b)                                                  \
  5540    do {                                                                         \
  5541      const TYPE swap_tmp = (a);                                                 \
  5542      (a) = (b);                                                                 \
  5543      (b) = swap_tmp;                                                            \
  5544    } while (0)
  5545  
/* Pushes a pending [low, high] sub-range onto the explicit quicksort stack;
 * `top` must be the current stack pointer in the enclosing SORT_IMPL body. */
#define SORT_PUSH(low, high)                                                   \
  do {                                                                         \
    top->lo = (low);                                                           \
    top->hi = (high);                                                          \
    ++top;                                                                     \
  } while (0)
  5552  
/* Pops the most recently pushed [low, high] sub-range from the explicit
 * quicksort stack; counterpart of SORT_PUSH, same `top` convention. */
#define SORT_POP(low, high)                                                    \
  do {                                                                         \
    --top;                                                                     \
    low = top->lo;                                                             \
    high = top->hi;                                                            \
  } while (0)
  5559  
/* Generates a non-recursive quicksort `static void NAME(begin, end)` for
 * TYPE, ordered by the predicate CMP (CMP(a, b) true when `a` must precede
 * `b`). Properties visible in the expansion:
 *  - sub-ranges shorter than 8 elements are finished by SORT_INNER's
 *    sorting networks;
 *  - the pivot is the median of the first/middle/last elements (the three
 *    SORT_CMP_SWAPs before partitioning);
 *  - the larger partition is saved with SORT_PUSH and the smaller one is
 *    iterated, which bounds the explicit stack depth (stack is sized
 *    sizeof(unsigned) * CHAR_BIT entries);
 *  - when EXPECT_LOW_CARDINALITY_OR_PRESORTED is true, already-sorted
 *    partitions are detected via NAME##_is_sorted() and skipped;
 *  - with AUDIT_ENABLED(), the final order is re-verified by a linear scan. */
#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP)        \
                                                                               \
  static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \
    while (++first <= last)                                                    \
      if (expect_with_probability(CMP(first[0], first[-1]), 1, .1))            \
        return false;                                                          \
    return true;                                                               \
  }                                                                            \
                                                                               \
  typedef struct {                                                             \
    TYPE *lo, *hi;                                                             \
  } NAME##_stack;                                                              \
                                                                               \
  __hot static void NAME(TYPE *const __restrict begin,                         \
                         TYPE *const __restrict end) {                         \
    NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack;  \
                                                                               \
    TYPE *__restrict hi = end - 1;                                             \
    TYPE *__restrict lo = begin;                                               \
    while (true) {                                                             \
      const ptrdiff_t len = hi - lo;                                           \
      if (len < 8) {                                                           \
        SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1);                            \
        if (unlikely(top == stack))                                            \
          break;                                                               \
        SORT_POP(lo, hi);                                                      \
        continue;                                                              \
      }                                                                        \
                                                                               \
      TYPE *__restrict mid = lo + (len >> 1);                                  \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *mid, *hi);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
                                                                               \
      TYPE *right = hi - 1;                                                    \
      TYPE *left = lo + 1;                                                     \
      while (1) {                                                              \
        while (expect_with_probability(CMP(*left, *mid), 0, .5))               \
          ++left;                                                              \
        while (expect_with_probability(CMP(*mid, *right), 0, .5))              \
          --right;                                                             \
        if (unlikely(left > right)) {                                          \
          if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) {                           \
            if (NAME##_is_sorted(lo, right))                                   \
              lo = right + 1;                                                  \
            if (NAME##_is_sorted(left, hi))                                    \
              hi = left;                                                       \
          }                                                                    \
          break;                                                               \
        }                                                                      \
        SORT_SWAP(TYPE, *left, *right);                                        \
        mid = (mid == left) ? right : (mid == right) ? left : mid;             \
        ++left;                                                                \
        --right;                                                               \
      }                                                                        \
                                                                               \
      if (right - lo > hi - left) {                                            \
        SORT_PUSH(lo, right);                                                  \
        lo = left;                                                             \
      } else {                                                                 \
        SORT_PUSH(left, hi);                                                   \
        hi = right;                                                            \
      }                                                                        \
    }                                                                          \
                                                                               \
    if (AUDIT_ENABLED()) {                                                     \
      for (TYPE *scan = begin + 1; scan < end; ++scan)                         \
        assert(CMP(scan[-1], scan[0]));                                        \
    }                                                                          \
  }
  5630  
  5631  /*------------------------------------------------------------------------------
  5632   * LY: radix sort for large chunks */
  5633  
/* Generates `static bool NAME##_radixsort(begin, length)`: an LSD radix
 * sort that processes 16 key bits (two 8-bit digit passes, histograms `a`
 * and `b`) per outer-loop iteration, keyed by the unsigned value
 * EXTRACT_KEY(ptr) >> key_shift. Scratch space: if BUFFER_PREALLOCATED, the
 * area at begin + length + END_GAP is reused; otherwise osal_malloc'ed,
 * returning false on allocation failure (true otherwise). `key_diff_mask`
 * accumulates XOR of consecutive keys: when keys differ only in the low
 * 8 bits, the first (low-digit) pass already fully sorts into tmp, which is
 * then memcpy'ed back and the loop exits early; otherwise the loop repeats
 * while differing key bits remain above the 16 just processed.
 * NOTE(review): `length` is presumed > 0 — the first do/while dereferences
 * begin unconditionally; confirm at call sites. */
#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP)  \
                                                                               \
  __hot static bool NAME##_radixsort(TYPE *const begin,                        \
                                     const unsigned length) {                  \
    TYPE *tmp;                                                                 \
    if (BUFFER_PREALLOCATED) {                                                 \
      tmp = begin + length + END_GAP;                                          \
      /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */                    \
    } else {                                                                   \
      tmp = osal_malloc(sizeof(TYPE) * length);                                \
      if (unlikely(!tmp))                                                      \
        return false;                                                          \
    }                                                                          \
                                                                               \
    unsigned key_shift = 0, key_diff_mask;                                     \
    do {                                                                       \
      struct {                                                                 \
        unsigned a[256], b[256];                                               \
      } counters;                                                              \
      memset(&counters, 0, sizeof(counters));                                  \
                                                                               \
      key_diff_mask = 0;                                                       \
      unsigned prev_key = EXTRACT_KEY(begin) >> key_shift;                     \
      TYPE *r = begin, *end = begin + length;                                  \
      do {                                                                     \
        const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
        counters.a[key & 255]++;                                               \
        counters.b[(key >> 8) & 255]++;                                        \
        key_diff_mask |= prev_key ^ key;                                       \
        prev_key = key;                                                        \
      } while (++r != end);                                                    \
                                                                               \
      unsigned ta = 0, tb = 0;                                                 \
      for (unsigned i = 0; i < 256; ++i) {                                     \
        const unsigned ia = counters.a[i];                                     \
        counters.a[i] = ta;                                                    \
        ta += ia;                                                              \
        const unsigned ib = counters.b[i];                                     \
        counters.b[i] = tb;                                                    \
        tb += ib;                                                              \
      }                                                                        \
                                                                               \
      r = begin;                                                               \
      do {                                                                     \
        const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
        tmp[counters.a[key & 255]++] = *r;                                     \
      } while (++r != end);                                                    \
                                                                               \
      if (unlikely(key_diff_mask < 256)) {                                     \
        memcpy(begin, tmp, (char *)end - (char *)begin);                       \
        break;                                                                 \
      }                                                                        \
      end = (r = tmp) + length;                                                \
      do {                                                                     \
        const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
        begin[counters.b[(key >> 8) & 255]++] = *r;                            \
      } while (++r != end);                                                    \
                                                                               \
      key_shift += 16;                                                         \
    } while (key_diff_mask >> 16);                                             \
                                                                               \
    if (!(BUFFER_PREALLOCATED))                                                \
      osal_free(tmp);                                                          \
    return true;                                                               \
  }
  5699  
  5700  /*------------------------------------------------------------------------------
  5701   * LY: Binary search */
  5702  
/* On clang/x86 this expands to an empty extended-asm statement that makes
 * `size` appear data-dependent on `flag`, preventing the optimizer from
 * turning the branchless binary-search step in SEARCH_IMPL back into
 * branches; on other compilers/targets it expands to a no-op. */
#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__)
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag)                         \
  do                                                                           \
    __asm __volatile(""                                                        \
                     : "+r"(size)                                              \
                     : "r" /* the `b` constraint is more suitable here, but    \
                              cause CLANG to allocate and push/pop an one more \
                              register, so using the `r` which avoids this. */ \
                     (flag));                                                  \
  while (0)
#else
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag)                         \
  do {                                                                         \
    /* nope for non-clang or non-x86 */;                                       \
  } while (0)
#endif /* Workaround for CLANG */
  5719  
/* NOTE(review): expands to an empty statement and all arguments are ignored;
 * looks like a leftover/placeholder — the actual search step is written
 * inline in SEARCH_IMPL below. */
#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key)                      \
  do {                                                                         \
  } while (0)
  5723  
/* Generates a lower-bound search `NAME(it, length, item)` over a sorted
 * TYPE_LIST array: returns a pointer to the first element for which
 * CMP(element, item) is false (i.e. the insertion point; may be one past
 * the end). With MDBX_HAVE_CMOV a branchless loop is used, otherwise a
 * conditional-branch variant; both narrow the range down to 0..2 elements,
 * which are then resolved by the two trailing conditional increments.
 * With AUDIT_ENABLED() the partition property of the answer is re-checked. */
#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP)                            \
  static __always_inline const TYPE_LIST *NAME(                                \
      const TYPE_LIST *it, unsigned length, const TYPE_ARG item) {             \
    const TYPE_LIST *const begin = it, *const end = begin + length;            \
                                                                               \
    if (MDBX_HAVE_CMOV)                                                        \
      do {                                                                     \
        /* Adaptively-simplified binary search step:                           \
         *  - branchless when cmov or an equivalent is available;              \
         *  - may perform a few redundant iterations;                          \
         *  - but only searches while size > 2, which requires finishing       \
         *    the search among the remaining 0-1-2 elements below. */          \
        const TYPE_LIST *const middle = it + (length >> 1);                    \
        length = (length + 1) >> 1;                                            \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5);  \
        WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag);                      \
        it = flag ? middle : it;                                               \
      } while (length > 2);                                                    \
    else                                                                       \
      while (length > 2) {                                                     \
        /* Variant using a conditional branch. The main difference is that     \
         * on "not equal" (true from the comparator) the step advances one     \
         * element closer to the end of the array. Algorithmically this is     \
         * correct and gives slightly faster convergence, but requires more    \
         * computation when the comparator returns true. Also IMPORTANT(!):    \
         * speculative execution at size == 0 is not permitted. */             \
        const TYPE_LIST *const middle = it + (length >> 1);                    \
        length = (length + 1) >> 1;                                            \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5);  \
        if (flag) {                                                            \
          it = middle + 1;                                                     \
          length -= 1;                                                         \
        }                                                                      \
      }                                                                        \
    it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5);        \
    it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5);        \
                                                                               \
    if (AUDIT_ENABLED()) {                                                     \
      for (const TYPE_LIST *scan = begin; scan < it; ++scan)                   \
        assert(CMP(*scan, item));                                              \
      for (const TYPE_LIST *scan = it; scan < end; ++scan)                     \
        assert(!CMP(*scan, item));                                             \
      (void)begin, (void)end;                                                  \
    }                                                                          \
                                                                               \
    return it;                                                                 \
  }
  5771  
  5772  /*----------------------------------------------------------------------------*/
  5773  
/* Converts a requested PNL capacity (in elements) into an allocation size in
 * bytes: adds 2 pgno_t header words (capacity and current length), doubles
 * the element count when MDBX_PNL_PREALLOC_FOR_RADIXSORT (scratch space for
 * the radix sort), then rounds up to a MDBX_PNL_GRANULATE-sized granule net
 * of the assumed malloc bookkeeping overhead. Inverse of pnl_bytes2size(). */
static __always_inline size_t pnl_size2bytes(size_t size) {
  assert(size > 0 && size <= MDBX_PGL_LIMIT);
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  size += size;
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  /* compile-time overflow guard for the worst-case byte size */
  STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD +
                    (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) +
                     MDBX_PNL_GRANULATE + 2) *
                        sizeof(pgno_t) <
                SIZE_MAX / 4 * 3);
  size_t bytes =
      ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2),
                    MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
      MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}
  5790  
  5791  static __always_inline pgno_t pnl_bytes2size(const size_t bytes) {
  5792    size_t size = bytes / sizeof(pgno_t);
  5793    assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536);
  5794    size -= 2;
  5795  #if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  5796    size >>= 1;
  5797  #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  5798    return (pgno_t)size;
  5799  }
  5800  
/* Allocates a PNL with capacity for at least `size` elements; returns NULL
 * on allocation failure. The returned pointer is advanced one word past the
 * allocation start, so the capacity word sits just before it (freed back
 * via pnl_free's `pl - 1`). Where malloc_usable_size() is available, the
 * actually granted (possibly larger) capacity is recorded. */
static MDBX_PNL pnl_alloc(size_t size) {
  size_t bytes = pnl_size2bytes(size);
  MDBX_PNL pl = osal_malloc(bytes);
  if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
    pl[0] = pnl_bytes2size(bytes); /* capacity word */
    assert(pl[0] >= size);
    pl[1] = 0; /* current length: empty */
    pl += 1;
  }
  return pl;
}
  5815  
  5816  static void pnl_free(MDBX_PNL pl) {
  5817    if (likely(pl))
  5818      osal_free(pl - 1);
  5819  }
  5820  
/* Shrink the PNL to the default size if it has grown larger.
 * Always resets the list length to zero (content is discarded); only when
 * the capacity has grown well past MDBX_PNL_INITIAL does it realloc down.
 * On realloc failure the old, larger buffer is silently kept — shrinking is
 * best-effort. */
static void pnl_shrink(MDBX_PNL *ppl) {
  /* sanity: the initial capacity must round-trip through the converters */
  assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
         pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) <
             MDBX_PNL_INITIAL * 3 / 2);
  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  MDBX_PNL_SIZE(*ppl) = 0;
  if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) >
               MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) {
    size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL);
    MDBX_PNL pl = osal_realloc(*ppl - 1, bytes);
    if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
      bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
      *pl = pnl_bytes2size(bytes); /* record the real granted capacity */
      *ppl = pl + 1;
    }
  }
}
  5842  
/* Grow the PNL to the size growed to at least given size.
 * Returns MDBX_SUCCESS when the capacity already suffices or the realloc
 * succeeded, MDBX_TXN_FULL when `wanna` exceeds MDBX_PGL_LIMIT, and
 * MDBX_ENOMEM on allocation failure (the original list stays valid). */
static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
  const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl);
  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;

  if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) {
    ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT);
    return MDBX_TXN_FULL;
  }

  /* geometric growth: request wanna plus the shortfall, capped at the limit */
  const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT)
                          ? wanna + wanna - allocated
                          : MDBX_PGL_LIMIT;
  size_t bytes = pnl_size2bytes(size);
  MDBX_PNL pl = osal_realloc(*ppl - 1, bytes);
  if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
    *pl = pnl_bytes2size(bytes); /* record the real granted capacity */
    assert(*pl >= wanna);
    *ppl = pl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}
  5872  
  5873  /* Make room for num additional elements in an PNL */
  5874  static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl,
  5875                                                          size_t num) {
  5876    assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
  5877           MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  5878    assert(num <= MDBX_PGL_LIMIT);
  5879    const size_t wanna = MDBX_PNL_SIZE(*ppl) + num;
  5880    return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? MDBX_SUCCESS
  5881                                                    : pnl_reserve(ppl, wanna);
  5882  }
  5883  
/* Appends `pgno` to the PNL without any capacity check — the caller must
 * have reserved room via pnl_need()/pnl_reserve(). Under AUDIT_ENABLED()
 * additionally asserts the page is not already present. */
static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) {
  assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl));
  if (AUDIT_ENABLED()) {
    for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i)
      assert(pgno != pl[i]);
  }
  MDBX_PNL_SIZE(pl) += 1;
  MDBX_PNL_LAST(pl) = pgno;
}
  5893  
/* Append an pgno range onto an unsorted PNL.
 * Appends `n` entries starting at `pgno`. For spilled lists consecutive
 * entries advance by 2 — consistent with the (pgno << 1) | deleted-bit
 * encoding used by spill_remove()/spill_purge() below. The entries are
 * written ascending or descending depending on MDBX_PNL_ASCENDING so the
 * run keeps the list's natural order. Returns MDBX_SUCCESS or a
 * pnl_need()/pnl_reserve() error. `n` must be > 0. */
__always_inline static int __must_check_result pnl_append_range(bool spilled,
                                                                MDBX_PNL *ppl,
                                                                pgno_t pgno,
                                                                unsigned n) {
  assert(n > 0);
  int rc = pnl_need(ppl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_PNL pnl = *ppl;
#if MDBX_PNL_ASCENDING
  unsigned w = MDBX_PNL_SIZE(pnl);
  do {
    pnl[++w] = pgno;
    pgno += spilled ? 2 : 1;
  } while (--n);
  MDBX_PNL_SIZE(pnl) = w;
#else
  /* descending order: write the run backwards from the new end */
  unsigned w = MDBX_PNL_SIZE(pnl) + n;
  MDBX_PNL_SIZE(pnl) = w;
  do {
    pnl[w--] = pgno;
    pgno += spilled ? 2 : 1;
  } while (--n);
#endif

  return MDBX_SUCCESS;
}
  5923  
/* Append an pgno range into the sorted PNL.
 * Inserts the run [pgno, pgno + n) at its sorted position: first shifts the
 * tail of existing entries that are out of order with respect to `pgno`,
 * then fills the opened gap with the run in the list's order (filling
 * downwards from pgno+n when MDBX_PNL_ASCENDING, upwards otherwise).
 * Returns MDBX_SUCCESS or a pnl_need() error. `n` must be > 0. */
__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl,
                                                      pgno_t pgno, unsigned n) {
  assert(n > 0);
  int rc = pnl_need(ppl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_PNL pnl = *ppl;
  unsigned r = MDBX_PNL_SIZE(pnl), w = r + n;
  MDBX_PNL_SIZE(pnl) = w;
  /* shift entries that must come after the inserted run */
  while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
    pnl[w--] = pnl[r--];

  /* fill the gap with the run itself */
  for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
    pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;

  return MDBX_SUCCESS;
}
  5943  
/* Validates a PNL: size within MDBX_PGL_LIMIT, smallest element at least
 * MIN_PAGENO, largest element below `limit`, and — unless validation is
 * compiled out and auditing is off — strictly ordered with no duplicates.
 * An empty list is always valid. */
__hot static bool pnl_check(const pgno_t *pl, const size_t limit) {
  assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND);
  if (likely(MDBX_PNL_SIZE(pl))) {
    if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT))
      return false;
    if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO))
      return false;
    if (unlikely(MDBX_PNL_MOST(pl) >= limit))
      return false;

    /* full ordering scan only when validation or auditing is enabled */
    if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) &&
        likely(MDBX_PNL_SIZE(pl) > 1)) {
      const pgno_t *scan = MDBX_PNL_BEGIN(pl);
      const pgno_t *const end = MDBX_PNL_END(pl);
      pgno_t prev = *scan++;
      do {
        if (unlikely(!MDBX_PNL_ORDERED(prev, *scan)))
          return false;
        prev = *scan;
      } while (likely(++scan != end));
    }
  }
  return true;
}
  5968  
  5969  static __always_inline bool pnl_check_allocated(const pgno_t *pl,
  5970                                                  const size_t limit) {
  5971    return pl == nullptr ||
  5972           (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl) && pnl_check(pl, limit));
  5973  }
  5974  
/* Backwards-merging core for pnl_merge(): writes into `dst` moving
 * downwards, consuming from `src_a` and `src_b` (both also moving
 * downwards), taking the element that must come later in list order first,
 * until src_b reaches src_b_detent. Any src_a elements left over are
 * presumed already in place (dst aliases the src_a buffer in pnl_merge),
 * and a sentinel at the detent guarantees the inner comparisons terminate.
 * The MDBX_HAVE_CMOV variants are branchless forms tuned per compiler. */
static __always_inline void
pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a,
                const pgno_t *__restrict src_b,
                const pgno_t *__restrict const src_b_detent) {
  do {
#if MDBX_HAVE_CMOV
    const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a);
#if defined(__LCC__) || __CLANG_PREREQ(13, 0)
    // lcc 1.26: 13 clocks (setup plus first iteration) + 7 clocks per loop
    //           iteration, WITHOUT loop-mode
    // gcc>=7: cmp+jmp with a jump back into the loop body (WTF?)
    // gcc<=6: cmov×3
    // clang<=12: cmov×3
    // clang>=13: cmov, set+add/sub
    *dst = flag ? *src_a-- : *src_b--;
#else
    // gcc: cmov, cmp+set+add/sub
    // clang<=5: cmov×2, set+add/sub
    // clang>=6: cmov, set+add/sub
    *dst = flag ? *src_a : *src_b;
    src_b += flag - 1;
    src_a -= flag;
#endif
    --dst;
#else  /* MDBX_HAVE_CMOV */
    while (MDBX_PNL_ORDERED(*src_b, *src_a))
      *dst-- = *src_a--;
    *dst-- = *src_b--;
#endif /* !MDBX_HAVE_CMOV */
  } while (likely(src_b > src_b_detent));
}
  6005  
/* Merge a PNL onto a PNL. The destination PNL must be big enough.
 * Both lists must be ordered; dst must have capacity for the combined
 * total. dst[0] (the word below the first element) is loaded with a detent
 * sentinel so pnl_merge_inner() needs no explicit bounds check on src_a. */
__hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) {
  assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
  assert(pnl_check(src, MAX_PAGENO + 1));
  const pgno_t src_len = MDBX_PNL_SIZE(src);
  const pgno_t dst_len = MDBX_PNL_SIZE(dst);
  if (likely(src_len > 0)) {
    const pgno_t total = dst_len + src_len;
    assert(MDBX_PNL_ALLOCLEN(dst) >= total);
    dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID);
    pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src);
    MDBX_PNL_SIZE(dst) = total;
  }
  assert(pnl_check_allocated(dst, MAX_PAGENO + 1));
}
  6021  
/* Marks `npages` consecutive spilled pages, starting at 1-based entry `idx`,
 * as deleted by setting the low tag bit of each entry (entries encode
 * pgno << 1, see the >> 1 extraction below). The list length is trimmed
 * only when the very last entry is removed; spill_least_removed tracks the
 * smallest tagged index so spill_purge() can start compaction there.
 * Walking the following entries stops early as soon as one is not the next
 * consecutive page. */
static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) {
  tASSERT(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) &&
                   txn->tw.spill_least_removed > 0);
  txn->tw.spill_least_removed =
      (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed;
  txn->tw.spill_pages[idx] |= 1; /* tag as deleted */
  MDBX_PNL_SIZE(txn->tw.spill_pages) -=
      (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));

  while (unlikely(npages > 1)) {
    const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1;
    if (MDBX_PNL_ASCENDING) {
      if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) ||
          (txn->tw.spill_pages[idx] >> 1) != pgno)
        return;
    } else {
      if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno)
        return;
      txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed)
                                        ? idx
                                        : txn->tw.spill_least_removed;
    }
    txn->tw.spill_pages[idx] |= 1; /* tag as deleted */
    MDBX_PNL_SIZE(txn->tw.spill_pages) -=
        (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
    --npages;
  }
}
  6050  
/* Physically drop all spill-list entries previously marked by
 * spill_remove() (bit 0 set), compacting the list in place.
 * The read/write sweep starts at spill_least_removed so the untouched
 * prefix is skipped; afterwards spill_least_removed is reset to INT_MAX
 * ("nothing marked"). Returns the (possibly shortened) list. */
static MDBX_PNL spill_purge(MDBX_txn *txn) {
  tASSERT(txn, txn->tw.spill_least_removed > 0);
  const MDBX_PNL sl = txn->tw.spill_pages;
  if (txn->tw.spill_least_removed != INT_MAX) {
    unsigned len = MDBX_PNL_SIZE(sl), r, w;
    /* copy each entry down, advancing the write cursor only for
     * entries without the "removed" mark */
    for (w = r = txn->tw.spill_least_removed; r <= len; ++r) {
      sl[w] = sl[r];
      w += 1 - (sl[r] & 1);
    }
    for (size_t i = 1; i < w; ++i)
      tASSERT(txn, (sl[i] & 1) == 0);
    MDBX_PNL_SIZE(sl) = w - 1;
    txn->tw.spill_least_removed = INT_MAX;
  } else {
    /* nothing marked: just verify the invariant in debug builds */
    for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i)
      tASSERT(txn, (sl[i] & 1) == 0);
  }
  return sl;
}
  6070  
/* Radix-sort key extraction: for descending PNLs the key is inverted so
 * the radix sort still yields the required ordering. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr))
#else
#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr))
#endif
/* Instantiate pgno_radixsort() and the comparison-based pgno_sort(). */
RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY,
               MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0)

SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
  6080  
  6081  __hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) {
  6082    if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) ||
  6083        unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl))))
  6084      pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
  6085  }
  6086  
  6087  static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) {
  6088    pnl_sort_nochk(pnl);
  6089    assert(pnl_check(pnl, limit4check));
  6090    (void)limit4check;
  6091  }
  6092  
/* Search for a pgno in a PNL.
 * Instantiates pgno_bsearch(): returns a pointer to the first item
 * greater than or equal to pgno (in PNL ordering). */
SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
  6096  
  6097  __hot __noinline static unsigned pnl_search_nochk(const MDBX_PNL pnl,
  6098                                                    pgno_t pgno) {
  6099    const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
  6100    const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno);
  6101    const pgno_t *end = begin + MDBX_PNL_SIZE(pnl);
  6102    assert(it >= begin && it <= end);
  6103    if (it != begin)
  6104      assert(MDBX_PNL_ORDERED(it[-1], pgno));
  6105    if (it != end)
  6106      assert(!MDBX_PNL_ORDERED(it[0], pgno));
  6107    return (unsigned)(it - begin + 1);
  6108  }
  6109  
  6110  static __inline unsigned pnl_search(const MDBX_PNL pnl, pgno_t pgno,
  6111                                      size_t limit) {
  6112    assert(pnl_check_allocated(pnl, limit));
  6113    assert(pgno < limit);
  6114    (void)limit;
  6115    return pnl_search_nochk(pnl, pgno);
  6116  }
  6117  
  6118  static __inline unsigned search_spilled(const MDBX_txn *txn, pgno_t pgno) {
  6119    const MDBX_PNL pnl = txn->tw.spill_pages;
  6120    if (likely(!pnl))
  6121      return 0;
  6122    pgno <<= 1;
  6123    unsigned n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1);
  6124    return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0;
  6125  }
  6126  
/* Check whether any page of the range [pgno, pgno + npages) is present
 * in the spill list. The range is translated into the shifted key space
 * (entries store pgno << 1); one pnl_search() at the appropriate range
 * edge for the list's ordering then suffices. Note: entries with the
 * "removed" mark (bit 0 set) still fall inside the shifted range and
 * thus count as an intersection. */
static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno,
                                       unsigned npages) {
  const MDBX_PNL pnl = txn->tw.spill_pages;
  if (likely(!pnl))
    return false;
  const unsigned len = MDBX_PNL_SIZE(pnl);
  if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
    /* dump the list; marked entries are printed negated */
    DEBUG_EXTRA("PNL len %u [", len);
    for (unsigned i = 1; i <= len; ++i)
      DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1)
                                             : (long)(pnl[i] >> 1));
    DEBUG_EXTRA_PRINT("%s\n", "]");
  }
  const pgno_t spilled_range_begin = pgno << 1;
  const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1;
#if MDBX_PNL_ASCENDING
  const unsigned n =
      pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1);
  assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n]));
  const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last;
#else
  const unsigned n =
      pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1);
  assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n]));
  const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin;
#endif
  if (ASSERT_ENABLED()) {
    /* cross-check against the naive per-page lookup */
    bool check = false;
    for (unsigned i = 0; i < npages; ++i)
      check |= search_spilled(txn, pgno + i) != 0;
    assert(check == rc);
  }
  return rc;
}
  6161  
  6162  /*----------------------------------------------------------------------------*/
  6163  
  6164  static __always_inline size_t txl_size2bytes(const size_t size) {
  6165    assert(size > 0 && size <= MDBX_TXL_MAX * 2);
  6166    size_t bytes =
  6167        ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2),
  6168                      MDBX_TXL_GRANULATE * sizeof(txnid_t)) -
  6169        MDBX_ASSUME_MALLOC_OVERHEAD;
  6170    return bytes;
  6171  }
  6172  
  6173  static __always_inline size_t txl_bytes2size(const size_t bytes) {
  6174    size_t size = bytes / sizeof(txnid_t);
  6175    assert(size > 2 && size <= MDBX_TXL_MAX * 2);
  6176    return size - 2;
  6177  }
  6178  
/* Allocate a fresh TXL with the initial capacity.
 * Layout: slot[0] holds the capacity, slot[1] the current length; the
 * returned pointer is advanced past the capacity slot so the
 * MDBX_PNL_SIZE/ALLOCLEN-style macros apply. Returns NULL on OOM. */
static MDBX_TXL txl_alloc(void) {
  size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL);
  MDBX_TXL tl = osal_malloc(bytes);
  if (likely(tl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* take advantage of the real (possibly larger) allocation size */
    bytes = malloc_usable_size(tl);
#endif /* malloc_usable_size */
    tl[0] = txl_bytes2size(bytes);
    assert(tl[0] >= MDBX_TXL_INITIAL);
    tl[1] = 0;
    tl += 1;
  }
  return tl;
}
  6193  
  6194  static void txl_free(MDBX_TXL tl) {
  6195    if (likely(tl))
  6196      osal_free(tl - 1);
  6197  }
  6198  
/* Grow the TXL referenced by *ptl to hold at least `wanna` items.
 * Growth is geometric (the shortfall is doubled) but never beyond
 * MDBX_TXL_MAX. On success *ptl may point at a new allocation.
 * Returns MDBX_SUCCESS, MDBX_TXN_FULL when `wanna` exceeds the hard
 * limit, or MDBX_ENOMEM (in which case *ptl stays valid/unchanged,
 * since osal_realloc() leaves the old block intact on failure). */
static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) {
  const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl);
  assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX &&
         MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;

  if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) {
    ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX);
    return MDBX_TXN_FULL;
  }

  const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX)
                          ? wanna + wanna - allocated
                          : MDBX_TXL_MAX;
  size_t bytes = txl_size2bytes(size);
  /* the allocation starts one slot before the user pointer */
  MDBX_TXL tl = osal_realloc(*ptl - 1, bytes);
  if (likely(tl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(tl);
#endif /* malloc_usable_size */
    *tl = txl_bytes2size(bytes);
    assert(*tl >= wanna);
    *ptl = tl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}
  6227  
  6228  static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl,
  6229                                                          size_t num) {
  6230    assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX &&
  6231           MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl));
  6232    assert(num <= MDBX_PGL_LIMIT);
  6233    const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num;
  6234    return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS
  6235                                                    : txl_reserve(ptl, wanna);
  6236  }
  6237  
/* Append `id` to the TXL without any capacity check; the caller must
 * already have guaranteed free space (see txl_need/txl_append). */
static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) {
  assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl));
  MDBX_PNL_SIZE(tl) += 1;
  MDBX_PNL_LAST(tl) = id;
}
  6243  
/* Sort a TXL in descending order (larger transaction ids first). */
#define TXNID_SORT_CMP(first, last) ((first) > (last))
SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP)
static void txl_sort(MDBX_TXL tl) {
  txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl));
}
  6249  
  6250  static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) {
  6251    if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) {
  6252      int rc = txl_need(ptl, MDBX_TXL_GRANULATE);
  6253      if (unlikely(rc != MDBX_SUCCESS))
  6254        return rc;
  6255    }
  6256    txl_xappend(*ptl, id);
  6257    return MDBX_SUCCESS;
  6258  }
  6259  
  6260  /*----------------------------------------------------------------------------*/
  6261  
/* Spare DPL slots kept beyond the nominal capacity: scratch room for
 * the merge-sort tail (see dpl_sort_slowpath) plus two edging slots
 * for the leading/trailing stub items. */
#define MDBX_DPL_UNSORTED_BACKLOG 16
#define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG
#define MDBX_DPL_GAP_FOR_EDGING 2
#define MDBX_DPL_RESERVE_GAP                                                   \
  (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING)
  6267  
/* Convert a DPL capacity (number of items) to an allocation size in
 * bytes: header plus items plus the reserve gap, rounded up to the
 * granule, net of the assumed malloc overhead. */
static __always_inline size_t dpl_size2bytes(ptrdiff_t size) {
  assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT);
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
  /* reserve a second copy of the items as radixsort scratch space */
  size += size;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
  /* guard against overflow of the worst-case size computation */
  STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) +
                    (MDBX_PGL_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1) +
                     MDBX_DPL_RESERVE_GAP) *
                        sizeof(MDBX_dp) +
                    MDBX_PNL_GRANULATE * sizeof(void *) * 2 <
                SIZE_MAX / 4 * 3);
  size_t bytes =
      ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) +
                        ((size_t)size + MDBX_DPL_RESERVE_GAP) * sizeof(MDBX_dp),
                    MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
      MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}
  6286  
/* Inverse of dpl_size2bytes(): item capacity implied by an allocation
 * of `bytes`, excluding the reserve gap (and halved when a radixsort
 * scratch copy is preallocated). */
static __always_inline unsigned dpl_bytes2size(const ptrdiff_t bytes) {
  size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp);
  assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP &&
         size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
  size -= MDBX_DPL_RESERVE_GAP;
#if MDBX_DPL_PREALLOC_FOR_RADIXSORT
  size >>= 1;
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
  return (unsigned)size;
}
  6297  
/* Set the DPL length and (re)install the trailing stub item with
 * pgno == P_INVALID, which acts as a sentinel for scans and searches. */
static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) {
  static const MDBX_page dpl_stub_pageE = {
      {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0};
  assert(dpl_stub_pageE.mp_flags == P_BAD &&
         dpl_stub_pageE.mp_pgno == P_INVALID);
  dl->length = len;
  dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE;
  dl->items[len + 1].pgno = P_INVALID;
  dl->items[len + 1].extra = 0;
  return len;
}
  6309  
/* Reset the DPL to empty: zero length, zero page accounting, and the
 * leading stub item with pgno == 0 as the low sentinel (the trailing
 * P_INVALID sentinel is installed by dpl_setlen). */
static __always_inline void dpl_clear(MDBX_dpl *dl) {
  static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0};
  assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0);
  dl->sorted = dpl_setlen(dl, 0);
  dl->pages_including_loose = 0;
  dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB;
  dl->items[0].pgno = 0;
  dl->items[0].extra = 0;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
  6320  
  6321  static void dpl_free(MDBX_txn *txn) {
  6322    if (likely(txn->tw.dirtylist)) {
  6323      osal_free(txn->tw.dirtylist);
  6324      txn->tw.dirtylist = NULL;
  6325    }
  6326  }
  6327  
/* (Re)allocate the transaction's dirty-page list for roughly `size`
 * items (clamped to MDBX_PGL_LIMIT) and refresh its detent (capacity).
 * Returns the new list, or NULL on OOM — in which case the previous
 * allocation, if any, remains attached to the txn and valid. */
static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) {
  size_t bytes =
      dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT);
  MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes);
  if (likely(dl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* take advantage of the real (possibly larger) allocation size */
    bytes = malloc_usable_size(dl);
#endif /* malloc_usable_size */
    dl->detent = dpl_bytes2size(bytes);
    tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent);
    txn->tw.dirtylist = dl;
  }
  return dl;
}
  6342  
/* Ensure the write-txn has a dirty-page list sized close to the
 * configured dp_initial (clamped by the geometry's upper bound).
 * An existing list is cleared and reused when its capacity is within
 * +/-64 items of the target; otherwise it is reallocated.
 * Returns MDBX_SUCCESS or MDBX_ENOMEM. */
static int dpl_alloc(MDBX_txn *txn) {
  tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
  const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper)
                        ? txn->mt_env->me_options.dp_initial
                        : txn->mt_geo.upper;
  if (txn->tw.dirtylist) {
    dpl_clear(txn->tw.dirtylist);
    const int realloc_threshold = 64;
    /* keep the current allocation when |detent - wanna| <= threshold */
    if (likely(
            !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold ||
              (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold)))
      return MDBX_SUCCESS;
  }
  if (unlikely(!dpl_reserve(txn, wanna)))
    return MDBX_ENOMEM;
  dpl_clear(txn->tw.dirtylist);
  return MDBX_SUCCESS;
}
  6361  
/* Instantiate dpl_radixsort() and dp_sort() over dirty-page items,
 * keyed/ordered by pgno (ascending). */
#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno)
RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY,
               MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1)

#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP)
  6368  
/* Restore the sorted-by-pgno invariant of the dirty-page list.
 * Strategy: radix-sort the whole list when the unsorted tail is large
 * and radixsort succeeds; otherwise, when the sorted prefix dominates
 * and scratch space is available, sort only the tail and merge it with
 * the prefix backwards in place; else fall back to a full comparison
 * sort. On return dl->sorted == dl->length. */
__hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  const unsigned unsorted = dl->length - dl->sorted;
  if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) ||
      unlikely(!dpl_radixsort(dl->items + 1, dl->length))) {
    if (dl->sorted > unsorted / 4 + 4 &&
        (MDBX_DPL_PREALLOC_FOR_RADIXSORT ||
         dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) {
      MDBX_dp *const sorted_begin = dl->items + 1;
      MDBX_dp *const sorted_end = sorted_begin + dl->sorted;
      /* end of the usable scratch area beyond the items */
      MDBX_dp *const end =
          dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT
                           ? dl->length + dl->length + 1
                           : dl->detent + MDBX_DPL_RESERVE_GAP);
      MDBX_dp *const tmp = end - unsorted;
      assert(dl->items + dl->length + 1 < tmp);
      /* copy unsorted to the end of allocated space and sort it */
      memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp));
      dp_sort(tmp, tmp + unsorted);
      /* merge two parts from end to begin */
      MDBX_dp *__restrict w = dl->items + dl->length;
      MDBX_dp *__restrict l = dl->items + dl->sorted;
      MDBX_dp *__restrict r = end - 1;
      do {
        const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5);
#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV
        *w = cmp ? *l-- : *r--;
#else
        /* branch-light form that compilers lower to cmov */
        *w = cmp ? *l : *r;
        l -= cmp;
        r += cmp - 1;
#endif
      } while (likely(--w > l));
      assert(r == tmp - 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
      if (ASSERT_ENABLED())
        for (unsigned i = 0; i <= dl->length; ++i)
          assert(dl->items[i].pgno < dl->items[i + 1].pgno);
    } else {
      dp_sort(dl->items + 1, dl->items + dl->length + 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
    }
  } else {
    assert(dl->items[0].pgno == 0 &&
           dl->items[dl->length + 1].pgno == P_INVALID);
  }
  dl->sorted = dl->length;
  return dl;
}
  6421  
  6422  static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) {
  6423    MDBX_dpl *dl = txn->tw.dirtylist;
  6424    assert(dl->length <= MDBX_PGL_LIMIT);
  6425    assert(dl->sorted <= dl->length);
  6426    assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  6427    return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn);
  6428  }
  6429  
/* Returns the index of the first dirty-page whose pgno
 * member is greater than or equal to id.
 * Instantiated below as dp_bsearch(). */
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP)
  6434  
/* Find the DPL index of the first item with pgno >= `pgno`.
 * A short unsorted tail (up to 7 items) is probed linearly through the
 * fall-through switch cases; a longer tail triggers a full re-sort.
 * Finally the sorted portion is binary-searched. Returns a 1-based
 * index; may point at the trailing P_INVALID stub when past the end. */
__hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  if (AUDIT_ENABLED()) {
    /* verify the sorted prefix is strictly increasing and past metas */
    for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
      assert(ptr[0].pgno < ptr[1].pgno);
      assert(ptr[0].pgno >= NUM_METAS);
    }
  }

  switch (dl->length - dl->sorted) {
  default:
    /* sort a whole */
    dpl_sort_slowpath(txn);
    break;
  case 0:
    /* whole sorted cases */
    break;

#define LINEAR_SEARCH_CASE(N)                                                  \
  case N:                                                                      \
    if (dl->items[dl->length - N + 1].pgno == pgno)                            \
      return dl->length - N + 1;                                               \
    __fallthrough

    /* use linear scan until the threshold */
    LINEAR_SEARCH_CASE(7); /* fall through */
    LINEAR_SEARCH_CASE(6); /* fall through */
    LINEAR_SEARCH_CASE(5); /* fall through */
    LINEAR_SEARCH_CASE(4); /* fall through */
    LINEAR_SEARCH_CASE(3); /* fall through */
    LINEAR_SEARCH_CASE(2); /* fall through */
  case 1:
    if (dl->items[dl->length].pgno == pgno)
      return dl->length;
    /* continue bsearch on the sorted part */
    break;
  }
  return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
}
  6475  
/* Number of pages covered by the i-th dirty entry: 1 for ordinary
 * pages, mp_pages for multi-page (overflow) entries flagged via
 * items[i].multi. The assert cross-checks against the page header. */
MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned
dpl_npages(const MDBX_dpl *dl, unsigned i) {
  assert(0 <= (int)i && i <= dl->length);
  unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages;
  assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1));
  return n;
}
  6483  
  6484  MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned
  6485  dpl_endpgno(const MDBX_dpl *dl, unsigned i) {
  6486    return dpl_npages(dl, i) + dl->items[i].pgno;
  6487  }
  6488  
/* Check whether the page range [pgno, pgno + npages) overlaps any entry
 * of the dirty-page list. Requires the list to be fully sorted; one
 * binary search locates the first entry at/after `pgno`, then only that
 * entry and its predecessor (via dpl_endpgno for multi-page entries)
 * need to be examined. */
static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno,
                                   unsigned npages) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->sorted == dl->length);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  unsigned const n = dpl_search(txn, pgno);
  assert(n >= 1 && n <= dl->length + 1);
  assert(pgno <= dl->items[n].pgno);
  assert(pgno > dl->items[n - 1].pgno);
  const bool rc =
      /* intersection with founded */ pgno + npages > dl->items[n].pgno ||
      /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno;
  if (ASSERT_ENABLED()) {
    /* cross-check against the naive full scan */
    bool check = false;
    for (unsigned i = 1; i <= dl->length; ++i) {
      const MDBX_page *const dp = dl->items[i].ptr;
      if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages ||
            dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno))
        check |= true;
    }
    assert(check == rc);
  }
  return rc;
}
  6513  
  6514  static __always_inline unsigned dpl_exist(MDBX_txn *txn, pgno_t pgno) {
  6515    MDBX_dpl *dl = txn->tw.dirtylist;
  6516    unsigned i = dpl_search(txn, pgno);
  6517    assert((int)i > 0);
  6518    return (dl->items[i].pgno == pgno) ? i : 0;
  6519  }
  6520  
/* Debug-only lookup of a dirty page by pgno: scans the unsorted tail
 * linearly, then binary-searches the sorted prefix.
 * Returns the page pointer or nullptr when not dirty. */
MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn,
                                                         const pgno_t pgno) {
  const MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  for (unsigned i = dl->length; i > dl->sorted; --i)
    if (dl->items[i].pgno == pgno)
      return dl->items[i].ptr;

  if (dl->sorted) {
    const unsigned i =
        (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
    if (dl->items[i].pgno == pgno)
      return dl->items[i].ptr;
  }
  return nullptr;
}
  6537  
/* Remove the i-th DPL entry, which accounts for `npages` pages, and
 * shift the tail down by one slot. The memmove count covers the
 * remaining tail plus the trailing stub item (length was already
 * decremented, hence the "+ 2"). */
static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert((int)i > 0 && i <= dl->length);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  dl->pages_including_loose -= npages;
  /* the sorted prefix shrinks only if the removed entry was inside it */
  dl->sorted -= dl->sorted >= i;
  dl->length -= 1;
  memmove(dl->items + i, dl->items + i + 1,
          (dl->length - i + 2) * sizeof(dl->items[0]));
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
  6549  
  6550  static void dpl_remove(const MDBX_txn *txn, unsigned i) {
  6551    dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i));
  6552  }
  6553  
/* Append a dirty page to the DPL, growing the list when full.
 * The sorted counter is advanced only when the new pgno extends an
 * already fully-sorted list, so sortedness is tracked cheaply.
 * Returns MDBX_SUCCESS, MDBX_TXN_FULL when the hard limit is reached,
 * MDBX_ENOMEM on allocation failure, or MDBX_PROBLEM on a duplicate
 * detected by the audit scan. */
static __always_inline int __must_check_result dpl_append(MDBX_txn *txn,
                                                          pgno_t pgno,
                                                          MDBX_page *page,
                                                          unsigned npages) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  if (AUDIT_ENABLED()) {
    /* a duplicate pgno would corrupt the list invariants */
    for (unsigned i = dl->length; i > 0; --i) {
      assert(dl->items[i].pgno != pgno);
      if (unlikely(dl->items[i].pgno == pgno)) {
        ERROR("Page %u already exist in the DPL at %u", pgno, i);
        return MDBX_PROBLEM;
      }
    }
  }

  const unsigned length = dl->length + 1;
  const unsigned sorted =
      (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno)
          ? length
          : dl->sorted;

  if (unlikely(dl->length == dl->detent)) {
    if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) {
      ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT);
      return MDBX_TXN_FULL;
    }
    /* grow 2x while small, then 1.5x */
    const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42)
                            ? dl->detent + dl->detent
                            : dl->detent + dl->detent / 2;
    dl = dpl_reserve(txn, size);
    if (unlikely(!dl))
      return MDBX_ENOMEM;
    tASSERT(txn, dl->length < dl->detent);
  }

  /* copy the stub beyond the end */
  dl->items[length + 1] = dl->items[length];
  /* append page */
  dl->items[length].ptr = page;
  dl->items[length].pgno = pgno;
  dl->items[length].multi = npages > 1;
  dl->items[length].lru = txn->tw.dirtylru++;
  dl->length = length;
  dl->sorted = sorted;
  dl->pages_including_loose += npages;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  return MDBX_SUCCESS;
}
  6604  
/* Age of the i-th dirty entry in dirtylru ticks. The counter may wrap,
 * so the difference is masked to 31 bits to stay non-negative. */
static __inline uint32_t dpl_age(const MDBX_txn *txn, unsigned i) {
  const MDBX_dpl *dl = txn->tw.dirtylist;
  assert((int)i > 0 && i <= dl->length);
  /* overflow could be here */
  return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF);
}
  6611  
  6612  /*----------------------------------------------------------------------------*/
  6613  
/* Global logging/debug state; debug_logger is NULL until installed. */
uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT;
uint8_t loglevel = MDBX_LOG_FATAL;
MDBX_debug_func *debug_logger;
  6617  
/* Forward declarations of page lifecycle and cursor-touch helpers. */
static __must_check_result __inline int page_retire(MDBX_cursor *mc,
                                                    MDBX_page *mp);

static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp,
                                          unsigned npages);
/* Result pair for page fetch/allocation: the page plus an error code. */
typedef struct page_result {
  MDBX_page *page;
  int err;
} pgr_t;

static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard);

static pgr_t page_new(MDBX_cursor *mc, const unsigned flags);
static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages);
static int page_touch(MDBX_cursor *mc);
static int cursor_touch(MDBX_cursor *mc);
static int touch_dbi(MDBX_cursor *mc);
  6635  
/* Human-readable names for txn_end() operations, indexed by the
 * MDBX_END_* enum below (logging only). */
#define MDBX_END_NAMES                                                         \
  {                                                                            \
    "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin",  \
        "fail-beginchild"                                                      \
  }
enum {
  /* txn_end operation number, for logging */
  MDBX_END_COMMITTED,
  MDBX_END_PURE_COMMIT,
  MDBX_END_ABORT,
  MDBX_END_RESET,
  MDBX_END_RESET_TMP,
  MDBX_END_FAIL_BEGIN,
  MDBX_END_FAIL_BEGINCHILD
};
/* The low nibble of `mode` selects the operation; high bits are flags. */
#define MDBX_END_OPMASK 0x0F  /* mask for txn_end() operation number */
#define MDBX_END_UPDATE 0x10  /* update env state (DBIs) */
#define MDBX_END_FREE 0x20    /* free txn unless it is MDBX_env.me_txn0 */
#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */
#define MDBX_END_SLOT 0x80    /* release any reader slot if MDBX_NOTLS */
static int txn_end(MDBX_txn *txn, const unsigned mode);
  6657  
/* Generic page fetch parameterized by a bitmask of page-type flags
 * (presumably the set considered ill/invalid for the caller — see the
 * ILL naming; confirm against the page_get_inline definition).
 * The wrappers below fix that mask for the common access patterns. */
static __always_inline pgr_t page_get_inline(const uint16_t ILL,
                                             MDBX_cursor *const mc,
                                             const pgno_t pgno,
                                             const txnid_t front);

/* Fetch with only the baseline P_ILL_BITS mask. */
static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno,
                          const txnid_t front) {
  return page_get_inline(P_ILL_BITS, mc, pgno, front);
}

/* Fetch with P_OVERFLOW added to the mask (hot path for b-tree pages). */
__hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno,
                                  const txnid_t front) {
  return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front);
}

/* Fetch with all b-tree page types added to the mask. */
static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno,
                            const txnid_t front) {
  return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno,
                         front);
}
  6678  
  6679  static __always_inline int __must_check_result page_get(MDBX_cursor *mc,
  6680                                                          const pgno_t pgno,
  6681                                                          MDBX_page **mp,
  6682                                                          const txnid_t front) {
  6683    pgr_t ret = page_get_three(mc, pgno, front);
  6684    *mp = ret.page;
  6685    return ret.err;
  6686  }
  6687  
/* Forward declarations: b-tree page search/split/merge machinery. */
static int __must_check_result page_search_root(MDBX_cursor *mc,
                                                const MDBX_val *key, int flags);

/* Flags for page_search(). */
#define MDBX_PS_MODIFY 1
#define MDBX_PS_ROOTONLY 2
#define MDBX_PS_FIRST 4
#define MDBX_PS_LAST 8
static int __must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key,
                                           int flags);
static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst);

#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */
static int __must_check_result page_split(MDBX_cursor *mc,
                                          const MDBX_val *const newkey,
                                          MDBX_val *const newdata,
                                          pgno_t newpgno, const unsigned naf);

/* Forward declarations: meta-page validation and environment teardown. */
static bool coherency_check_meta(const MDBX_env *env,
                                 const volatile MDBX_meta *meta, bool report);
static int __must_check_result validate_meta_copy(MDBX_env *env,
                                                  const MDBX_meta *meta,
                                                  MDBX_meta *dest);
static int __must_check_result override_meta(MDBX_env *env, unsigned target,
                                             txnid_t txnid,
                                             const MDBX_meta *shape);
static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta,
                                           const int lck_exclusive,
                                           const mdbx_mode_t mode_bits);
static int __must_check_result sync_locked(MDBX_env *env, unsigned flags,
                                           MDBX_meta *const pending,
                                           meta_troika_t *const troika);
static int env_close(MDBX_env *env);

/* Result of node_search(): the node plus an exact-match indicator. */
struct node_result {
  MDBX_node *node;
  bool exact;
};

static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key);

/* Forward declarations: node-level operations within pages. */
static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx,
                                               const MDBX_val *key,
                                               pgno_t pgno);
static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned indx,
                                             const MDBX_val *key,
                                             MDBX_val *data, unsigned flags);
static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx,
                                              const MDBX_val *key);

static void node_del(MDBX_cursor *mc, size_t ksize);
static void node_shrink(MDBX_page *mp, unsigned indx);
static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst,
                                         bool fromleft);
static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf,
                                         MDBX_val *data, const MDBX_page *mp);
static int __must_check_result rebalance(MDBX_cursor *mc);
static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key);

static void cursor_pop(MDBX_cursor *mc);
static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp);

static int __must_check_result audit_ex(MDBX_txn *txn, unsigned retired_stored,
                                        bool dont_filter_gc);

/* Forward declarations: consistency checks and deletion. */
static int __must_check_result page_check(MDBX_cursor *const mc,
                                          const MDBX_page *const mp);
static int __must_check_result cursor_check(MDBX_cursor *mc);
static int __must_check_result cursor_check_updating(MDBX_cursor *mc);
static int __must_check_result cursor_del(MDBX_cursor *mc);
static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi,
                                       const MDBX_val *key,
                                       const MDBX_val *data, unsigned flags);
/* Sibling-selection directions (see cursor_sibling usage elsewhere). */
#define SIBLING_LEFT 0
#define SIBLING_RIGHT 2
  6762  static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir);
  6763  static int __must_check_result cursor_next(MDBX_cursor *mc, MDBX_val *key,
  6764                                             MDBX_val *data, MDBX_cursor_op op);
  6765  static int __must_check_result cursor_prev(MDBX_cursor *mc, MDBX_val *key,
  6766                                             MDBX_val *data, MDBX_cursor_op op);
  6767  struct cursor_set_result {
  6768    int err;
  6769    bool exact;
  6770  };
  6771  
  6772  static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key,
  6773                                             MDBX_val *data, MDBX_cursor_op op);
  6774  static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key,
  6775                                              MDBX_val *data);
  6776  static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key,
  6777                                             MDBX_val *data);
  6778  
  6779  static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn,
  6780                                             MDBX_dbi dbi);
  6781  static int __must_check_result cursor_xinit0(MDBX_cursor *mc);
  6782  static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node,
  6783                                               const MDBX_page *mp);
  6784  static int __must_check_result cursor_xinit2(MDBX_cursor *mc,
  6785                                               MDBX_xcursor *src_mx,
  6786                                               bool new_dupdata);
  6787  static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst);
  6788  
  6789  static int __must_check_result drop_tree(MDBX_cursor *mc,
  6790                                           const bool may_have_subDBs);
  6791  static int __must_check_result fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi);
  6792  static int __must_check_result setup_dbx(MDBX_dbx *const dbx,
  6793                                           const MDBX_db *const db,
  6794                                           const unsigned pagesize);
  6795  
  6796  static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2,
  6797      cmp_int_unaligned, cmp_lenfast;
  6798  
  6799  static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags);
  6800  static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags);
  6801  
  6802  __cold const char *mdbx_liberr2str(int errnum) {
  6803    /* Table of descriptions for MDBX errors */
  6804    static const char *const tbl[] = {
  6805        "MDBX_KEYEXIST: Key/data pair already exists",
  6806        "MDBX_NOTFOUND: No matching key/data pair found",
  6807        "MDBX_PAGE_NOTFOUND: Requested page not found",
  6808        "MDBX_CORRUPTED: Database is corrupted",
  6809        "MDBX_PANIC: Environment had fatal error",
  6810        "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
  6811        "MDBX_INVALID: File is not an MDBX file",
  6812        "MDBX_MAP_FULL: Environment mapsize limit reached",
  6813        "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
  6814        "MDBX_READERS_FULL: Too many readers (maxreaders reached)",
  6815        NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */,
  6816        "MDBX_TXN_FULL: Transaction has too many dirty pages,"
  6817        " i.e transaction is too big",
  6818        "MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates"
  6819        " corruption, i.e branch-pages loop",
  6820        "MDBX_PAGE_FULL: Internal error - Page has no more space",
  6821        "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
  6822        " mapping, e.g. since address space is unavailable or busy,"
  6823        " or Operation system not supported such operations",
  6824        "MDBX_INCOMPATIBLE: Environment or database is not compatible"
  6825        " with the requested operation or the specified flags",
  6826        "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
  6827        " e.g. read-transaction already run for current thread",
  6828        "MDBX_BAD_TXN: Transaction is not valid for requested operation,"
  6829        " e.g. had errored and be must aborted, has a child, or is invalid",
  6830        "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
  6831        " for target database, either invalid subDB name",
  6832        "MDBX_BAD_DBI: The specified DBI-handle is invalid"
  6833        " or changed by another thread/transaction",
  6834        "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
  6835        "MDBX_BUSY: Another write transaction is running,"
  6836        " or environment is already used while opening with MDBX_EXCLUSIVE flag",
  6837    };
  6838  
  6839    if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
  6840      int i = errnum - MDBX_KEYEXIST;
  6841      return tbl[i];
  6842    }
  6843  
  6844    switch (errnum) {
  6845    case MDBX_SUCCESS:
  6846      return "MDBX_SUCCESS: Successful";
  6847    case MDBX_EMULTIVAL:
  6848      return "MDBX_EMULTIVAL: The specified key has"
  6849             " more than one associated value";
  6850    case MDBX_EBADSIGN:
  6851      return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
  6852             " e.g. memory corruption or double-free";
  6853    case MDBX_WANNA_RECOVERY:
  6854      return "MDBX_WANNA_RECOVERY: Database should be recovered,"
  6855             " but this could NOT be done automatically for now"
  6856             " since it opened in read-only mode";
  6857    case MDBX_EKEYMISMATCH:
  6858      return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
  6859             " current cursor position";
  6860    case MDBX_TOO_LARGE:
  6861      return "MDBX_TOO_LARGE: Database is too large for current system,"
  6862             " e.g. could NOT be mapped into RAM";
  6863    case MDBX_THREAD_MISMATCH:
  6864      return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
  6865             " owned object, e.g. a transaction that started by another thread";
  6866    case MDBX_TXN_OVERLAPPING:
  6867      return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for"
  6868             " the current thread";
  6869    default:
  6870      return NULL;
  6871    }
  6872  }
  6873  
/* Reentrant analogue of mdbx_strerror(): renders errnum into the
 * caller-provided buffer.  MDBX-specific codes resolve to static strings
 * via mdbx_liberr2str() without touching buf; positive (system) codes fall
 * back to the platform's message facility.  Returns a static string or buf. */
__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
#if defined(_WIN32) || defined(_WIN64)
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        NULL);
    return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
    /* GNU-specific strerror_r(): returns a char* that may point to a
     * static string rather than into buf. */
    if (errnum > 0)
      msg = strerror_r(errnum, buf, buflen);
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
    /* XSI-compliant strerror_r(): fills buf, returns 0 on success. */
    if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0)
      msg = buf;
#else
    /* Last resort: non-reentrant strerror() copied into buf. */
    if (errnum > 0) {
      msg = strerror(errnum);
      if (msg) {
        strncpy(buf, msg, buflen);
        msg = buf;
      }
    }
#endif
    if (!msg) {
      /* Unknown code: render a generic "error N" message. */
      (void)snprintf(buf, buflen, "error %d", errnum);
      msg = buf;
    }
    /* strncpy() above may leave buf unterminated - force termination. */
    buf[buflen - 1] = '\0';
  }
  return msg;
}
  6908  
/* Return a description string for the given error code (strerror(3)
 * analogue).  NOTE: the Windows path and the unknown-code fallback use
 * process-wide static buffers, so the result is not thread-safe there;
 * use mdbx_strerror_r() for the reentrant variant. */
__cold const char *mdbx_strerror(int errnum) {
#if defined(_WIN32) || defined(_WIN64)
  static char buf[1024];
  return mdbx_strerror_r(errnum, buf, sizeof(buf));
#else
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg) {
    /* Positive codes are system errno values. */
    if (errnum > 0)
      msg = strerror(errnum);
    if (!msg) {
      /* Unknown code: render "error N" into a small static buffer. */
      static char buf[32];
      (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum);
      msg = buf;
    }
  }
  return msg;
#endif
}
  6927  
  6928  #if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */
/* Like mdbx_strerror_r(), but converts the system message from the ANSI
 * codepage to the OEM codepage (for Windows console output). */
const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        NULL);
    if (!size)
      msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
    else if (!CharToOemBuffA(buf, buf, size))
      /* In-place ANSI -> OEM conversion is allowed by the API. */
      msg = "CharToOemBuffA() failed";
    else
      msg = buf;
  }
  return msg;
}
  6945  
/* Non-reentrant ANSI->OEM variant backed by a process-wide static buffer. */
const char *mdbx_strerror_ANSI2OEM(int errnum) {
  static char buf[1024];
  return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf));
}
  6950  #endif /* Bit of madness for Windows */
  6951  
/* Core of the debug logger: forward the message to the user-installed
 * callback if any, otherwise write it to the debugger (Windows) or stderr. */
__cold void debug_log_va(int level, const char *function, int line,
                         const char *fmt, va_list args) {
  if (debug_logger)
    debug_logger(level, function, line, fmt, args);
  else {
#if defined(_WIN32) || defined(_WIN64)
    /* No stderr worth writing to on Windows GUI apps; emit via the
     * debugger channel, and only when a debugger is attached. */
    if (IsDebuggerPresent()) {
      int prefix_len = 0;
      char *prefix = nullptr;
      /* Build the "function:line " prefix separately since
       * OutputDebugStringA() takes a single preformatted string. */
      if (function && line > 0)
        prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line);
      else if (function)
        prefix_len = osal_asprintf(&prefix, "%s: ", function);
      else if (line > 0)
        prefix_len = osal_asprintf(&prefix, "%d: ", line);
      if (prefix_len > 0 && prefix) {
        OutputDebugStringA(prefix);
        osal_free(prefix);
      }
      char *msg = nullptr;
      int msg_len = osal_vasprintf(&msg, fmt, args);
      if (msg_len > 0 && msg) {
        OutputDebugStringA(msg);
        osal_free(msg);
      }
    }
#else
    if (function && line > 0)
      fprintf(stderr, "%s:%d ", function, line);
    else if (function)
      fprintf(stderr, "%s: ", function);
    else if (line > 0)
      fprintf(stderr, "%d: ", line);
    vfprintf(stderr, fmt, args);
    /* Flush immediately so messages survive a subsequent crash. */
    fflush(stderr);
#endif
  }
}
  6990  
/* Printf-style convenience wrapper over debug_log_va(). */
__cold void debug_log(int level, const char *function, int line,
                      const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  debug_log_va(level, function, line, fmt, args);
  va_end(args);
}
  6998  
  6999  /* Dump a key in ascii or hexadecimal. */
  7000  const char *mdbx_dump_val(const MDBX_val *key, char *const buf,
  7001                            const size_t bufsize) {
  7002    if (!key)
  7003      return "<null>";
  7004    if (!key->iov_len)
  7005      return "<empty>";
  7006    if (!buf || bufsize < 4)
  7007      return nullptr;
  7008  
  7009    bool is_ascii = true;
  7010    const uint8_t *const data = key->iov_base;
  7011    for (unsigned i = 0; i < key->iov_len; i++)
  7012      if (data[i] < ' ' || data[i] > '~') {
  7013        is_ascii = false;
  7014        break;
  7015      }
  7016  
  7017    if (is_ascii) {
  7018      int len =
  7019          snprintf(buf, bufsize, "%.*s",
  7020                   (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data);
  7021      assert(len > 0 && (unsigned)len < bufsize);
  7022      (void)len;
  7023    } else {
  7024      char *const detent = buf + bufsize - 2;
  7025      char *ptr = buf;
  7026      *ptr++ = '<';
  7027      for (unsigned i = 0; i < key->iov_len; i++) {
  7028        const ptrdiff_t left = detent - ptr;
  7029        assert(left > 0);
  7030        int len = snprintf(ptr, left, "%02x", data[i]);
  7031        if (len < 0 || len >= left)
  7032          break;
  7033        ptr += len;
  7034      }
  7035      if (ptr < detent) {
  7036        ptr[0] = '>';
  7037        ptr[1] = '\0';
  7038      }
  7039    }
  7040    return buf;
  7041  }
  7042  
  7043  /*------------------------------------------------------------------------------
  7044   LY: debug stuff */
  7045  
  7046  static const char *leafnode_type(MDBX_node *n) {
  7047    static const char *const tp[2][2] = {{"", ": DB"},
  7048                                         {": sub-page", ": sub-DB"}};
  7049    return (node_flags(n) & F_BIGDATA)
  7050               ? ": large page"
  7051               : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)];
  7052  }
  7053  
  7054  /* Display all the keys in the page. */
/* Debug helper: display all the keys in the page, plus a space summary. */
MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) {
  pgno_t pgno = mp->mp_pgno;
  const char *type;
  MDBX_node *node;
  unsigned i, nkeys, nsize, total = 0;
  MDBX_val key;
  DKBUF;

  /* Classify the page; overflow/meta/bad pages have no keys to list. */
  switch (PAGETYPE_WHOLE(mp)) {
  case P_BRANCH:
    type = "Branch page";
    break;
  case P_LEAF:
    type = "Leaf page";
    break;
  case P_LEAF | P_SUBP:
    type = "Leaf sub-page";
    break;
  case P_LEAF | P_LEAF2:
    type = "Leaf2 page";
    break;
  case P_LEAF | P_LEAF2 | P_SUBP:
    type = "Leaf2 sub-page";
    break;
  case P_OVERFLOW:
    VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages);
    return;
  case P_META:
    VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno,
            unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a));
    return;
  default:
    VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags);
    return;
  }

  nkeys = page_numkeys(mp);
  VERBOSE("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys);

  for (i = 0; i < nkeys; i++) {
    if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
      key.iov_len = nsize = mp->mp_leaf2_ksize;
      key.iov_base = page_leaf2key(mp, i, nsize);
      total += nsize;
      VERBOSE("key %u: nsize %u, %s\n", i, nsize, DKEY(&key));
      continue;
    }
    node = page_node(mp, i);
    key.iov_len = node_ks(node);
    key.iov_base = node->mn_data;
    nsize = (unsigned)(NODESIZE + key.iov_len);
    if (IS_BRANCH(mp)) {
      VERBOSE("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node),
              DKEY(&key));
      total += nsize;
    } else {
      /* Leaf node: account for the data payload (or the pgno_t reference
       * for big-data nodes) plus the mp_ptrs[] slot. */
      if (node_flags(node) & F_BIGDATA)
        nsize += sizeof(pgno_t);
      else
        nsize += (unsigned)node_ds(node);
      total += nsize;
      nsize += sizeof(indx_t);
      VERBOSE("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key),
              leafnode_type(node));
    }
    /* Nodes are stored with even alignment. */
    total = EVEN(total);
  }
  VERBOSE("Total: header %u + contents %u + unused %u\n",
          IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total,
          page_room(mp));
}
  7126  
  7127  /*----------------------------------------------------------------------------*/
  7128  
/* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */
#define XCURSOR_INITED(mc)                                                     \
  ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))

/* Update sub-page pointer, if any, in mc->mc_xcursor.
 * Needed when the node which contains the sub-page may have moved.
 * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top].
 * Only nodes flagged F_DUPDATA without F_SUBDATA embed a sub-page inside
 * the parent node (and thus move with it); a sub-DB is a separate page. */
#define XCURSOR_REFRESH(mc, mp, ki)                                            \
  do {                                                                         \
    MDBX_page *xr_pg = (mp);                                                   \
    MDBX_node *xr_node = page_node(xr_pg, ki);                                 \
    if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)          \
      (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node);               \
  } while (0)
  7143  
  7144  MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) {
  7145    for (MDBX_cursor *scan = mc->mc_txn->mt_cursors[mc->mc_dbi]; scan;
  7146         scan = scan->mc_next)
  7147      if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan))
  7148        return true;
  7149    return false;
  7150  }
  7151  
  7152  /* Perform act while tracking temporary cursor mn */
/* Perform act while tracking temporary cursor mn.
 * The cursor is temporarily linked into the txn's per-DBI cursor list so
 * that page relocations during `act` keep it consistent; for a sub-cursor
 * (C_SUB) a dummy parent is fabricated, since only top-level cursors may
 * live in the tracking list.  The list head is restored afterwards. */
#define WITH_CURSOR_TRACKING(mn, act)                                          \
  do {                                                                         \
    cASSERT(&(mn),                                                             \
            mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */);       \
    cASSERT(&(mn), !cursor_is_tracked(&(mn)));                                 \
    MDBX_cursor mc_dummy;                                                      \
    MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi];         \
    MDBX_cursor *tracked = &(mn);                                              \
    if ((mn).mc_flags & C_SUB) {                                               \
      mc_dummy.mc_flags = C_INITIALIZED;                                       \
      mc_dummy.mc_top = 0;                                                     \
      mc_dummy.mc_snum = 0;                                                    \
      mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn);                             \
      tracked = &mc_dummy;                                                     \
    }                                                                          \
    tracked->mc_next = *tracking_head;                                         \
    *tracking_head = tracked;                                                  \
    { act; }                                                                   \
    *tracking_head = tracked->mc_next;                                         \
  } while (0)
  7173  
/* Compare two keys using the key comparator of the given table.
 * NOTE(review): dbi is not validated here - presumably the caller must
 * pass a live handle; confirm against the public API contract. */
int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
             const MDBX_val *b) {
  eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE);
  return txn->mt_dbxs[dbi].md_cmp(a, b);
}
  7179  
/* Compare two data items using the data (dupsort) comparator of the
 * given table.  Same unvalidated-dbi caveat as mdbx_cmp(). */
int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
              const MDBX_val *b) {
  eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE);
  return txn->mt_dbxs[dbi].md_dcmp(a, b);
}
  7185  
  7186  /* Allocate memory for a page.
  7187   * Re-use old malloc'ed pages first for singletons, otherwise just malloc.
  7188   * Set MDBX_TXN_ERROR on failure. */
/* Allocate memory for a page.
 * Re-use old malloc'ed pages first for singletons, otherwise just malloc.
 * Set MDBX_TXN_ERROR on failure. */
static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) {
  MDBX_env *env = txn->mt_env;
  MDBX_page *np = env->me_dp_reserve;
  size_t size = env->me_psize;
  if (likely(num == 1 && np)) {
    /* Fast path: pop a single page off the env's reserve free-list. */
    eASSERT(env, env->me_dp_reserve_len > 0);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size);
    VALGRIND_MEMPOOL_ALLOC(env, np, size);
    /* mp_next was the free-list link and must be readable before use. */
    VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next));
    env->me_dp_reserve = np->mp_next;
    env->me_dp_reserve_len -= 1;
  } else {
    size = pgno2bytes(env, num);
    np = osal_malloc(size);
    if (unlikely(!np)) {
      txn->mt_flags |= MDBX_TXN_ERROR;
      return np;
    }
    VALGRIND_MEMPOOL_ALLOC(env, np, size);
  }

  if ((env->me_flags & MDBX_NOMEMINIT) == 0) {
    /* For a single page alloc, we init everything after the page header.
     * For multi-page, we init the final page; if the caller needed that
     * many pages they will be filling in at least up to the last page. */
    size_t skip = PAGEHDRSZ;
    if (num > 1)
      skip += pgno2bytes(env, num - 1);
    memset((char *)np + skip, 0, size - skip);
  }
#if MDBX_DEBUG
  np->mp_pgno = 0;
#endif
  /* Mark the whole page undefined for valgrind; the header fields set
   * below (and by the caller) re-define what is actually initialized. */
  VALGRIND_MAKE_MEM_UNDEFINED(np, size);
  np->mp_flags = 0;
  np->mp_pages = num;
  return np;
}
  7227  
  7228  /* Free a shadow dirty page */
/* Free a shadow dirty page: single pages go back to the env's reserve
 * free-list (up to dp_reserve_limit), larger chunks are freed outright. */
static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) {
  VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
  MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
  /* In debug (or with MDBX_PAGEPERTURB) scribble over the page to catch
   * use-after-free of dirty pages. */
  if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB))
    memset(dp, -1, pgno2bytes(env, npages));
  if (npages == 1 &&
      env->me_dp_reserve_len < env->me_options.dp_reserve_limit) {
    /* Keep only the mp_next link readable; poison the rest while the
     * page sits on the reserve list. */
    MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next),
                                   pgno2bytes(env, npages) -
                                       sizeof(dp->mp_next));
    dp->mp_next = env->me_dp_reserve;
    VALGRIND_MEMPOOL_FREE(env, dp);
    env->me_dp_reserve = dp;
    env->me_dp_reserve_len += 1;
  } else {
    /* large pages just get freed directly */
    VALGRIND_MEMPOOL_FREE(env, dp);
    osal_free(dp);
  }
}
  7249  
  7250  /* Return all dirty pages to dpage list */
  7251  static void dlist_free(MDBX_txn *txn) {
  7252    MDBX_env *env = txn->mt_env;
  7253    MDBX_dpl *const dl = txn->tw.dirtylist;
  7254  
  7255    for (unsigned i = 1; i <= dl->length; i++)
  7256      dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i));
  7257  
  7258    dpl_clear(dl);
  7259  }
  7260  
/* Given a sub-cursor (C_SUB), recover the MDBX_db of the enclosing outer
 * cursor by walking back through the containing xcursor and couple via
 * container_of(); the asserts verify the reconstruction is coherent. */
static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) {
  cASSERT(mc, (mc->mc_flags & C_SUB) != 0);
  MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
  MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner);
  cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db);
  cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx);
  return couple->outer.mc_db;
}
  7269  
/* Audit the transaction's dirty-list invariants.  Each check appears twice:
 * a tASSERT for debug builds, then a runtime `if (unlikely(...)) return
 * false` so release builds with auditing enabled also report failure. */
MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) {
  const MDBX_dpl *const dl = txn->tw.dirtylist;
  /* Sentinels: items[0] and items[length+1] bracket the list. */
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  /* Dirty-room accounting must balance against the parent txn or the
   * environment-wide dirty-page limit. */
  tASSERT(txn, txn->tw.dirtyroom + dl->length ==
                   (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                   : txn->mt_env->me_options.dp_limit));

  if (!AUDIT_ENABLED())
    return true;

  unsigned loose = 0, pages = 0;
  for (unsigned i = dl->length; i > 0; --i) {
    const MDBX_page *const dp = dl->items[i].ptr;
    if (!dp)
      continue;

    /* The cached pgno must match the page header. */
    tASSERT(txn, dp->mp_pgno == dl->items[i].pgno);
    if (unlikely(dp->mp_pgno != dl->items[i].pgno))
      return false;

    const uint32_t age = dpl_age(txn, i);
    tASSERT(txn, age < UINT32_MAX / 3);
    if (unlikely(age > UINT32_MAX / 3))
      return false;

    /* Every entry is either a loose page or modifiable in this txn. */
    tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
    if (dp->mp_flags == P_LOOSE) {
      loose += 1;
    } else if (unlikely(!IS_MODIFIABLE(txn, dp)))
      return false;

    /* Page range must lie below the allocation frontier. */
    const unsigned num = dpl_npages(dl, i);
    pages += num;
    tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num);
    if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num))
      return false;

    /* Within the sorted prefix, entries must not overlap. */
    if (i < dl->sorted) {
      tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num);
      if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num))
        return false;
    }

    /* A dirty page must not also appear in the reclaimed (GC) list. */
    const unsigned rpa =
        pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno);
    tASSERT(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) ||
                     txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno);
    if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
        unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno))
      return false;
    if (num > 1) {
      /* Multi-page run: its last page must fall in the same gap of the
       * reclaimed list as its first, i.e. no reclaimed page inside it. */
      const unsigned rpb = pnl_search(txn->tw.reclaimed_pglist,
                                      dp->mp_pgno + num - 1, txn->mt_next_pgno);
      tASSERT(txn, rpa == rpb);
      if (unlikely(rpa != rpb))
        return false;
    }
  }

  /* Totals must agree with the cached counters. */
  tASSERT(txn, loose == txn->tw.loose_count);
  if (unlikely(loose != txn->tw.loose_count))
    return false;

  tASSERT(txn, pages == dl->pages_including_loose);
  if (unlikely(pages != dl->pages_including_loose))
    return false;

  /* No retired page may still be present in the dirty list. */
  for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) {
    const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
    tASSERT(txn, !dp);
    if (unlikely(dp))
      return false;
  }

  return true;
}
  7346  
  7347  #if MDBX_ENABLE_REFUND
/* Shrink the allocation frontier (mt_next_pgno) by trimming the run of
 * consecutively-numbered pages at the very end of the reclaimed list.
 * Precondition: the highest reclaimed page is exactly next_pgno - 1. */
static void refund_reclaimed(MDBX_txn *txn) {
  /* Scanning in descend order */
  pgno_t next_pgno = txn->mt_next_pgno;
  const MDBX_PNL pnl = txn->tw.reclaimed_pglist;
  tASSERT(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1);
#if MDBX_PNL_ASCENDING
  /* Ascending layout: the run to trim sits at the tail of the list;
   * just shorten the list. */
  unsigned i = MDBX_PNL_SIZE(pnl);
  tASSERT(txn, pnl[i] == next_pgno - 1);
  while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1)
    ;
  MDBX_PNL_SIZE(pnl) = i;
#else
  /* Descending layout: the run sits at the head; shorten the list and
   * slide the remaining entries down to index 1. */
  unsigned i = 1;
  tASSERT(txn, pnl[i] == next_pgno - 1);
  unsigned len = MDBX_PNL_SIZE(pnl);
  while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1)
    ;
  MDBX_PNL_SIZE(pnl) = len -= i - 1;
  for (unsigned move = 0; move < len; ++move)
    pnl[1 + move] = pnl[i + move];
#endif
  VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
          txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno);
  txn->mt_next_pgno = next_pgno;
  tASSERT(txn,
          pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1));
}
  7375  
  7376  static void refund_loose(MDBX_txn *txn) {
  7377    tASSERT(txn, txn->tw.loose_pages != nullptr);
  7378    tASSERT(txn, txn->tw.loose_count > 0);
  7379  
  7380    MDBX_dpl *const dl = txn->tw.dirtylist;
  7381    tASSERT(txn, dl->length >= txn->tw.loose_count);
  7382  
  7383    pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
  7384    MDBX_PNL suitable = onstack;
  7385  
  7386    if (dl->length - dl->sorted > txn->tw.loose_count) {
  7387      /* Dirty list is useless since unsorted. */
  7388      if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) {
  7389        suitable = pnl_alloc(txn->tw.loose_count);
  7390        if (unlikely(!suitable))
  7391          return /* this is not a reason for transaction fail */;
  7392      }
  7393  
  7394      /* Collect loose-pages which may be refunded. */
  7395      tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count);
  7396      pgno_t most = MIN_PAGENO;
  7397      unsigned w = 0;
  7398      for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) {
  7399        tASSERT(txn, lp->mp_flags == P_LOOSE);
  7400        tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno);
  7401        if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) {
  7402          tASSERT(txn,
  7403                  w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack))
  7404                                             : MDBX_PNL_ALLOCLEN(suitable)));
  7405          suitable[++w] = lp->mp_pgno;
  7406          most = (lp->mp_pgno > most) ? lp->mp_pgno : most;
  7407        }
  7408      }
  7409  
  7410      if (most + 1 == txn->mt_next_pgno) {
  7411        /* Sort suitable list and refund pages at the tail. */
  7412        MDBX_PNL_SIZE(suitable) = w;
  7413        pnl_sort(suitable, MAX_PAGENO + 1);
  7414  
  7415        /* Scanning in descend order */
  7416        const int step = MDBX_PNL_ASCENDING ? -1 : 1;
  7417        const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1;
  7418        const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1;
  7419        tASSERT(txn, suitable[begin] >= suitable[end - step]);
  7420        tASSERT(txn, most == suitable[begin]);
  7421  
  7422        for (int i = begin + step; i != end; i += step) {
  7423          if (suitable[i] != most - 1)
  7424            break;
  7425          most -= 1;
  7426        }
  7427        const unsigned refunded = txn->mt_next_pgno - most;
  7428        DEBUG("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded,
  7429              most, txn->mt_next_pgno);
  7430        txn->tw.loose_count -= refunded;
  7431        txn->tw.dirtyroom += refunded;
  7432        dl->pages_including_loose -= refunded;
  7433        assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
  7434        txn->mt_next_pgno = most;
  7435  
  7436        /* Filter-out dirty list */
  7437        unsigned r = 0;
  7438        w = 0;
  7439        if (dl->sorted) {
  7440          do {
  7441            if (dl->items[++r].pgno < most) {
  7442              if (++w != r)
  7443                dl->items[w] = dl->items[r];
  7444            }
  7445          } while (r < dl->sorted);
  7446          dl->sorted = w;
  7447        }
  7448        while (r < dl->length) {
  7449          if (dl->items[++r].pgno < most) {
  7450            if (++w != r)
  7451              dl->items[w] = dl->items[r];
  7452          }
  7453        }
  7454        dpl_setlen(dl, w);
  7455        tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
  7456                         (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
  7457                                         : txn->mt_env->me_options.dp_limit));
  7458  
  7459        goto unlink_loose;
  7460      }
  7461    } else {
  7462      /* Dirtylist is mostly sorted, just refund loose pages at the end. */
  7463      dpl_sort(txn);
  7464      tASSERT(txn,
  7465              dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno);
  7466      tASSERT(txn, dl->sorted == dl->length);
  7467  
  7468      /* Scan dirtylist tail-forward and cutoff suitable pages. */
  7469      unsigned n;
  7470      for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 &&
  7471                           dl->items[n].ptr->mp_flags == P_LOOSE;
  7472           --n) {
  7473        tASSERT(txn, n > 0);
  7474        MDBX_page *dp = dl->items[n].ptr;
  7475        DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno);
  7476        tASSERT(txn, dp->mp_pgno == dl->items[n].pgno);
  7477        txn->mt_next_pgno -= 1;
  7478      }
  7479      dpl_setlen(dl, n);
  7480  
  7481      if (dl->sorted != dl->length) {
  7482        const unsigned refunded = dl->sorted - dl->length;
  7483        dl->sorted = dl->length;
  7484        txn->tw.loose_count -= refunded;
  7485        txn->tw.dirtyroom += refunded;
  7486        dl->pages_including_loose -= refunded;
  7487        tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
  7488                         (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
  7489                                         : txn->mt_env->me_options.dp_limit));
  7490  
  7491        /* Filter-out loose chain & dispose refunded pages. */
  7492      unlink_loose:
  7493        for (MDBX_page **link = &txn->tw.loose_pages; *link;) {
  7494          MDBX_page *dp = *link;
  7495          tASSERT(txn, dp->mp_flags == P_LOOSE);
  7496          if (txn->mt_next_pgno > dp->mp_pgno) {
  7497            link = &dp->mp_next;
  7498          } else {
  7499            *link = dp->mp_next;
  7500            if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
  7501              dpage_free(txn->mt_env, dp, 1);
  7502          }
  7503        }
  7504      }
  7505    }
  7506  
  7507    tASSERT(txn, dirtylist_check(txn));
  7508    if (suitable != onstack)
  7509      pnl_free(suitable);
  7510    txn->tw.loose_refund_wl = txn->mt_next_pgno;
  7511  }
  7512  
  7513  static bool txn_refund(MDBX_txn *txn) {
  7514    const pgno_t before = txn->mt_next_pgno;
  7515  
  7516    if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno)
  7517      refund_loose(txn);
  7518  
  7519    while (true) {
  7520      if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 ||
  7521          MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1)
  7522        break;
  7523  
  7524      refund_reclaimed(txn);
  7525      if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno)
  7526        break;
  7527  
  7528      const pgno_t memo = txn->mt_next_pgno;
  7529      refund_loose(txn);
  7530      if (memo == txn->mt_next_pgno)
  7531        break;
  7532    }
  7533  
  7534    if (before == txn->mt_next_pgno)
  7535      return false;
  7536  
  7537    if (txn->tw.spill_pages)
  7538      /* Squash deleted pagenums if we refunded any */
  7539      spill_purge(txn);
  7540  
  7541    return true;
  7542  }
  7543  #else  /* MDBX_ENABLE_REFUND */
/* Stub used when refund support is compiled out (MDBX_ENABLE_REFUND == 0):
 * never refunds anything, always reports "no pages refunded". */
static __inline bool txn_refund(MDBX_txn *txn) {
  (void)txn;
  /* No online auto-compactification. */
  return false;
}
  7549  #endif /* MDBX_ENABLE_REFUND */
  7550  
/* Overwrite npages page(s) starting at pgno with a garbage pattern so stale
 * content cannot be read back (page-perturb / debug invalidation). */
__cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno,
                             unsigned npages) {
  MDBX_env *const env = txn->mt_env;
  DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno);
  eASSERT(env, pgno >= NUM_METAS && npages);
  if (!IS_FROZEN(txn, mp)) {
    /* The page is modifiable in this txn: poison it in memory directly,
     * then push to disk unless the map itself is writable. */
    const size_t bytes = pgno2bytes(env, npages);
    memset(mp, -1, bytes);
    mp->mp_pgno = pgno; /* restore the pgno clobbered by memset */
    if ((env->me_flags & MDBX_WRITEMAP) == 0)
      osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno));
  } else {
    /* Frozen page: not written to in memory here; instead a poison buffer
     * is written over it on disk, one page-sized vector per page.
     * NOTE(review): assumes env->me_pbuf holds a fill pattern at offset
     * me_psize — confirm at the buffer's initialization site. */
    struct iovec iov[MDBX_COMMIT_PAGES];
    iov[0].iov_len = env->me_psize;
    iov[0].iov_base = (char *)env->me_pbuf + env->me_psize;
    size_t iov_off = pgno2bytes(env, pgno);
    unsigned n = 1;
    while (--npages) {
      iov[n] = iov[0];
      if (++n == MDBX_COMMIT_PAGES) {
        /* Batch full: flush MDBX_COMMIT_PAGES vectors and restart. */
        osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off,
                     pgno2bytes(env, MDBX_COMMIT_PAGES));
        iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
        n = 0;
      }
    }
    /* Write out the remaining n vectors. */
    osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n));
  }
}
  7580  
/* Remove page from dirty list, invalidate its header and dispose of (or
 * poison) its memory. di must be mp's index in the dirty list. */
static __inline void page_wash(MDBX_txn *txn, const unsigned di,
                               MDBX_page *const mp, const unsigned npages) {
  tASSERT(txn, di && di <= txn->tw.dirtylist->length &&
                   txn->tw.dirtylist->items[di].ptr == mp);
  dpl_remove_ex(txn, di, npages);
  txn->tw.dirtyroom++;
  /* Invariant: dirtyroom + dirtylist-length is constant per txn level. */
  tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                   (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                   : txn->mt_env->me_options.dp_limit));
  /* Mark the header invalid so any late use trips checks/assertions. */
  mp->mp_txnid = INVALID_TXNID;
  mp->mp_flags = P_BAD;
  VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
  if (txn->mt_flags & MDBX_WRITEMAP) {
    /* Page lives inside the writable map: poison it for the tools
     * instead of freeing. */
    VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
                               pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
    MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
                                   pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
  } else
    /* Heap-allocated shadow copy: release it. */
    dpage_free(txn->mt_env, mp, npages);
}
  7602  
/* Retire, loosen or free a single page.
 *
 * For dirty pages, saves single pages to a list for future reuse in this same
 * txn. It has been pulled from the GC and already resides on the dirty list,
 * but has been deleted. Use these pages first before pulling again from the GC.
 *
 * If the page wasn't dirtied in this txn, just add it
 * to this txn's free list.
 *
 * mc        - cursor identifying the txn/tree the page belongs to;
 * pgno      - number of the page to dispose of;
 * mp        - in-memory page, may be NULL if not yet read;
 * pageflags - page-type flags, may be zero when unknown.
 *
 * Returns MDBX_SUCCESS or an error code. */
static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
                          MDBX_page *mp /* maybe null */,
                          unsigned pageflags /* maybe unknown/zero */) {
  int rc;
  MDBX_txn *const txn = mc->mc_txn;
  tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags));

  /* During deleting entire subtrees, it is reasonable and possible to avoid
   * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs:
   *  - mp is null, i.e. the page has not yet been read;
   *  - pagetype is known and the P_LEAF bit is set;
   *  - we can determine the page status via scanning the lists
   *    of dirty and spilled pages.
   *
   *  On the other hand, this could be suboptimal for WRITEMAP mode, since
   *  requires support the list of dirty pages and avoid explicit spilling.
   *  So for flexibility and avoid extra internal dependencies we just
   *  fallback to reading if dirty list was not allocated yet. */
  /* di/si: index of the page in this txn's dirty/spill list, 0 = absent. */
  unsigned di = 0, si = 0, npages = 1;
  bool is_frozen = false, is_spilled = false, is_shadowed = false;
  if (unlikely(!mp)) {
    /* Page not read yet: try to classify it via the lists alone. */
    if (ASSERT_ENABLED() && pageflags) {
      pgr_t check;
      check = page_get_any(mc, pgno, txn->mt_front);
      if (unlikely(check.err != MDBX_SUCCESS))
        return check.err;
      tASSERT(txn,
              (check.page->mp_flags & ~P_LEAF2) == (pageflags & ~P_FROZEN));
      tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page));
    }
    if (pageflags & P_FROZEN) {
      is_frozen = true;
      if (ASSERT_ENABLED()) {
        /* A frozen page must appear in no dirty/spill list of any level. */
        for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) {
          tASSERT(txn, !search_spilled(scan, pgno));
          tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
        }
      }
      goto status_done;
    } else if (pageflags && txn->tw.dirtylist) {
      if ((di = dpl_exist(txn, pgno)) != 0) {
        /* Dirty in this txn: the list holds the in-memory copy. */
        mp = txn->tw.dirtylist->items[di].ptr;
        tASSERT(txn, IS_MODIFIABLE(txn, mp));
        goto status_done;
      }
      if ((si = search_spilled(txn, pgno)) != 0) {
        is_spilled = true;
        goto status_done;
      }
      /* Not ours: check the ancestors' dirty and spill lists. */
      for (MDBX_txn *parent = txn->mt_parent; parent;
           parent = parent->mt_parent) {
        if (dpl_exist(parent, pgno)) {
          is_shadowed = true;
          goto status_done;
        }
        if (search_spilled(parent, pgno)) {
          is_spilled = true;
          goto status_done;
        }
      }
      /* In no list anywhere: the page belongs to a committed snapshot. */
      is_frozen = true;
      goto status_done;
    }

    /* Classification via lists was not possible: fall back to reading. */
    pgr_t pg = page_get_any(mc, pgno, txn->mt_front);
    if (unlikely(pg.err != MDBX_SUCCESS))
      return pg.err;
    mp = pg.page;
    tASSERT(txn, !pageflags || mp->mp_flags == pageflags);
    pageflags = mp->mp_flags;
  }

  /* Classify the in-memory page by its header. */
  is_frozen = IS_FROZEN(txn, mp);
  if (!is_frozen) {
    const bool is_dirty = IS_MODIFIABLE(txn, mp);
    is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP);
    is_shadowed = IS_SHADOWED(txn, mp);
    if (is_dirty) {
      tASSERT(txn, !is_spilled);
      tASSERT(txn, !search_spilled(txn, pgno));
      tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent ||
                       (txn->mt_flags & MDBX_WRITEMAP));
    } else {
      tASSERT(txn, !debug_dpl_find(txn, pgno));
    }

    di = is_dirty ? dpl_exist(txn, pgno) : 0;
    si = is_spilled ? search_spilled(txn, pgno) : 0;
    tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP));
  } else {
    tASSERT(txn, !IS_MODIFIABLE(txn, mp));
    tASSERT(txn, !IS_SPILLED(txn, mp));
    tASSERT(txn, !IS_SHADOWED(txn, mp));
  }

status_done:
  /* Update the per-tree page accounting before disposing of the page. */
  if (likely((pageflags & P_OVERFLOW) == 0)) {
    STATIC_ASSERT(P_BRANCH == 1);
    const bool is_branch = pageflags & P_BRANCH;
    if (unlikely(mc->mc_flags & C_SUB)) {
      /* Sub-database cursor: also adjust the outer tree's counters. */
      MDBX_db *outer = outer_db(mc);
      cASSERT(mc, !is_branch || outer->md_branch_pages > 0);
      outer->md_branch_pages -= is_branch;
      cASSERT(mc, is_branch || outer->md_leaf_pages > 0);
      outer->md_leaf_pages -= 1 - is_branch;
    }
    cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0);
    mc->mc_db->md_branch_pages -= is_branch;
    cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0);
    mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0;
  } else {
    npages = mp->mp_pages;
    cASSERT(mc, mc->mc_db->md_overflow_pages >= npages);
    mc->mc_db->md_overflow_pages -= npages;
  }

  if (is_frozen) {
  retire:
    DEBUG("retire %u page %" PRIaPGNO, npages, pgno);
    rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages);
    tASSERT(txn, dirtylist_check(txn));
    return rc;
  }

  /* Return pages to the unallocated "tail" of the database.
   * The page content is not destroyed, and for nested transactions the
   * boundary of the unallocated "tail" moves only when they commit. */
  if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) {
    const char *kind = nullptr;
    if (di) {
      /* The page was dirtied in this transaction, but before that it could
       * have been allocated, dirtied and spilled in one of the parent
       * transactions. It CAN be pushed back into the unallocated tail. */
      kind = "dirty";
      /* Remove from dirty list */
      page_wash(txn, di, mp, npages);
    } else if (si) {
      /* The page was spilled in this transaction, i.e. it was allocated
       * and dirtied in this or one of the parent transactions.
       * It CAN be pushed back into the unallocated tail. */
      kind = "spilled";
      spill_remove(txn, si, npages);
    } else if ((txn->mt_flags & MDBX_WRITEMAP)) {
      kind = "writemap";
      tASSERT(txn, mp && IS_MODIFIABLE(txn, mp));
    } else {
      /* The page was allocated, dirtied and possibly spilled in one
       * of the parent transactions.
       * It CAN be pushed back into the unallocated tail. */
      kind = "parent's";
      if (ASSERT_ENABLED() && mp) {
        kind = nullptr;
        for (MDBX_txn *parent = txn->mt_parent; parent;
             parent = parent->mt_parent) {
          if (search_spilled(parent, pgno)) {
            kind = "parent-spilled";
            tASSERT(txn, is_spilled);
            break;
          }
          if (mp == debug_dpl_find(parent, pgno)) {
            kind = "parent-dirty";
            tASSERT(txn, !is_spilled);
            break;
          }
        }
        tASSERT(txn, kind != nullptr);
      }
      tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp)));
    }
    DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno);
    txn->mt_next_pgno = pgno;
    txn_refund(txn);
    return MDBX_SUCCESS;
  }

  if (di) {
    /* Dirty page from this transaction */
    /* If suitable we can reuse it through loose list */
    if (likely(npages == 1 &&
               txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit &&
               (!MDBX_ENABLE_REFUND ||
                /* skip pages near to the end in favor of compactification */
                txn->mt_next_pgno >
                    pgno + txn->mt_env->me_options.dp_loose_limit ||
                txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) {
      DEBUG("loosen dirty page %" PRIaPGNO, pgno);
      mp->mp_flags = P_LOOSE;
      mp->mp_next = txn->tw.loose_pages;
      txn->tw.loose_pages = mp;
      txn->tw.loose_count++;
#if MDBX_ENABLE_REFUND
      /* Raise the refund watermark so a later txn_refund() inspects
       * this page. */
      txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
                                    ? pgno + 2
                                    : txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
      if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
        memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ);
      VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
                                 txn->mt_env->me_psize - PAGEHDRSZ);
      MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
                                     txn->mt_env->me_psize - PAGEHDRSZ);
      return MDBX_SUCCESS;
    }

#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__)
    if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
#endif
    {
      /* The page could have been modified in one of the parent transactions,
       * including being spilled and then loaded and modified again.
       * In both cases it must be neither wiped on disk nor marked
       * inaccessible for asan and/or valgrind. */
      for (MDBX_txn *parent = txn->mt_parent;
           parent && (parent->mt_flags & MDBX_TXN_SPILLS);
           parent = parent->mt_parent) {
        if (intersect_spilled(parent, pgno, npages))
          goto skip_invalidate;
        if (dpl_intersect(parent, pgno, npages))
          goto skip_invalidate;
      }

#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
      if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
#endif
        kill_page(txn, mp, pgno, npages);
      if (!(txn->mt_flags & MDBX_WRITEMAP)) {
        VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)),
                                   pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
        MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)),
                                       pgno2bytes(txn->mt_env, npages) -
                                           PAGEHDRSZ);
      }
    }
  skip_invalidate:
    /* Remove from dirty list */
    page_wash(txn, di, mp, npages);

  reclaim:
    DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno);
    rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages);
    tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
    tASSERT(txn, dirtylist_check(txn));
    return rc;
  }

  if (si) {
    /* Page was spilled in this txn */
    spill_remove(txn, si, npages);
    /* The page could have been allocated and then spilled in this
     * transaction; in that case it must go into the reclaimed list.
     * Or it could have been allocated in one of the parent transactions
     * and then spilled in this one; then it must go into the retired list
     * for subsequent filtering at commit. */
    for (MDBX_txn *parent = txn->mt_parent; parent;
         parent = parent->mt_parent) {
      if (dpl_exist(parent, pgno))
        goto retire;
    }
    /* The page was definitely allocated in this transaction
     * and can now be reused. */
    goto reclaim;
  }

  if (is_shadowed) {
    /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
    if (ASSERT_ENABLED()) {
      const MDBX_page *parent_dp = nullptr;
      /* Check parent(s)'s dirty lists. */
      for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
           parent = parent->mt_parent) {
        tASSERT(txn, !search_spilled(parent, pgno));
        parent_dp = debug_dpl_find(parent, pgno);
      }
      tASSERT(txn, parent_dp && (!mp || parent_dp == mp));
    }
    /* The page was allocated in a parent transaction and can now be
     * reused, but only within this transaction or its children. */
    goto reclaim;
  }

  /* The page may belong to an MVCC snapshot visible to readers, or it could
   * have been allocated and then spilled in one of the parent transactions.
   * Therefore, for now, place it into the retired list, which will be
   * filtered against the dirty- and spilled-lists of parent transactions
   * when child transactions commit, or else be written to the GC unchanged. */
  goto retire;
}
  7901  
  7902  static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) {
  7903    return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags);
  7904  }
  7905  
/* Accumulator for batched page writes: gathers contiguous dirty pages into
 * a single vectored write and records the overall range touched. */
struct iov_ctx {
  unsigned iov_items; /* number of filled entries in iov[] */
  size_t iov_bytes;   /* total payload bytes queued in iov[] */
  size_t iov_off;     /* file offset where the queued run begins */
  pgno_t flush_begin; /* lowest pgno seen since iov_init() */
  pgno_t flush_end;   /* highest pgno+npages seen since iov_init() */
  struct iovec iov[MDBX_COMMIT_PAGES];
};
  7914  
  7915  static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) {
  7916    ctx->flush_begin = MAX_PAGENO;
  7917    ctx->flush_end = MIN_PAGENO;
  7918    ctx->iov_items = 0;
  7919    ctx->iov_bytes = 0;
  7920    ctx->iov_off = 0;
  7921    (void)txn;
  7922  }
  7923  
/* Finish a write batch (which must already be drained): on old Linux
 * kernels additionally flush the whole touched pgno range from the cache. */
static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) {
  tASSERT(txn, ctx->iov_items == 0);
#if defined(__linux__) || defined(__gnu_linux__)
  MDBX_env *const env = txn->mt_env;
  if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00)
    /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
     * arguments, making this function fairly expensive. Therefore, the
     * whole cache is always flushed. */
    osal_flush_incoherent_mmap(
        env->me_map + pgno2bytes(env, ctx->flush_begin),
        pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize);
#endif /* Linux */
}
  7937  
/* Write the accumulated batch to disk, wait for the mapped view to become
 * coherent with what was written, then free the shadow pages.
 * Returns MDBX_SUCCESS, an I/O error, or MDBX_CORRUPTED on coherency
 * timeout. Only for non-WRITEMAP mode. */
static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) {
  tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  tASSERT(txn, ctx->iov_items > 0);

  MDBX_env *const env = txn->mt_env;
  int rc;
  if (likely(ctx->iov_items == 1)) {
    /* Single chunk: plain pwrite avoids the vectored-write overhead. */
    eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len);
    rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len,
                     ctx->iov_off);
  } else {
    rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off,
                      ctx->iov_bytes);
  }

  if (unlikely(rc != MDBX_SUCCESS))
    ERROR("Write error: %s", mdbx_strerror(rc));
  else {
    /* The written range of the map now carries defined data for the tools. */
    VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off,
                              ctx->iov_bytes);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off,
                                     ctx->iov_bytes);
  }

  /* Reset the batch counters before the verification loop below, which may
   * call iov_done() and asserts iov_items == 0. The iov[] entries themselves
   * are still read afterwards. */
  unsigned iov_items = ctx->iov_items;
#if MDBX_ENABLE_PGOP_STAT
  txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items;
#endif /* MDBX_ENABLE_PGOP_STAT */
  ctx->iov_items = 0;
  ctx->iov_bytes = 0;

  uint64_t timestamp = 0;
  for (unsigned i = 0; i < iov_items; i++) {
    MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base;
    const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno);
    /* check with timeout as the workaround
     * for todo4recovery://erased_by_github/libmdbx/issues/269 */
    while (likely(rc == MDBX_SUCCESS) &&
           unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) {
      if (!timestamp) {
        timestamp = osal_monotime();
        iov_done(txn, ctx);
        WARNING(
            "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno,
            "(workaround for incoherent flaw of unified page/buffer cache)");
      } else if (unlikely(osal_monotime() - timestamp > 65536 / 10)) {
        /* NOTE(review): 65536/10 is in osal_monotime() units — confirm it
         * matches the intended wait budget. */
        ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno,
              "(workaround for incoherent flaw of unified page/buffer cache)");
        rc = MDBX_CORRUPTED;
      }
      /* Yield the CPU between coherency re-checks. */
#if defined(_WIN32) || defined(_WIN64)
      SwitchToThread();
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
      sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
      pthread_yield();
#else
      usleep(42);
#endif
    }
    /* The shadow copy is no longer needed once the mapped page matches. */
    dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len));
  }
  return rc;
}
  8002  
/* Queue a shadowed dirty page into the write batch (non-WRITEMAP), or just
 * account for it (WRITEMAP). Flushes the pending batch first whenever this
 * page would break contiguity, overflow iov[], or exceed MAX_WRITE. */
static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
                    unsigned npages) {
  MDBX_env *const env = txn->mt_env;
  tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno);
  tASSERT(txn, IS_MODIFIABLE(txn, dp));
  tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)));

  /* Widen the overall range used by iov_done() for cache invalidation. */
  ctx->flush_begin =
      (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
  ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
                       ? ctx->flush_end
                       : dp->mp_pgno + npages;
  env->me_lck->mti_unsynced_pages.weak += npages;

  if (IS_SHADOWED(txn, dp)) {
    tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
    /* Stamp the page with this txn's id before it goes to disk. */
    dp->mp_txnid = txn->mt_txnid;
    tASSERT(txn, IS_SPILLED(txn, dp));
    const size_t size = pgno2bytes(env, npages);
    /* Flush first unless this page directly extends the queued run. */
    if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) ||
        ctx->iov_items == ARRAY_LENGTH(ctx->iov) ||
        ctx->iov_bytes + size > MAX_WRITE) {
      if (ctx->iov_items) {
        int err = iov_write(txn, ctx);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
#if defined(__linux__) || defined(__gnu_linux__)
        if (linux_kernel_version >= 0x02060b00)
        /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
         * arguments, making this function fairly expensive. Therefore, the
         * whole cache is always flushed. */
#endif /* Linux */
          osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes,
                                     env->me_os_psize);
      }
      /* Start a fresh contiguous run at this page's offset. */
      ctx->iov_off = pgno2bytes(env, dp->mp_pgno);
    }
    ctx->iov[ctx->iov_items].iov_base = (void *)dp;
    ctx->iov[ctx->iov_items].iov_len = size;
    ctx->iov_items += 1;
    ctx->iov_bytes += size;
  } else {
    /* WRITEMAP mode: the page is modified in-place in the map, nothing
     * to queue. */
    tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP);
  }
  return MDBX_SUCCESS;
}
  8049  
  8050  static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp,
  8051                        unsigned npages) {
  8052    tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  8053    pgno_t pgno = dp->mp_pgno;
  8054    int err = iov_page(txn, ctx, dp, npages);
  8055    if (likely(err == MDBX_SUCCESS)) {
  8056      err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages);
  8057  #if MDBX_ENABLE_PGOP_STAT
  8058      if (likely(err == MDBX_SUCCESS))
  8059        txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages;
  8060  #endif /* MDBX_ENABLE_PGOP_STAT */
  8061    }
  8062    return err;
  8063  }
  8064  
  8065  /* Set unspillable LRU-label for dirty pages watched by txn.
  8066   * Returns the number of pages marked as unspillable. */
  8067  static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) {
  8068    unsigned keep = 0;
  8069    while (mc->mc_flags & C_INITIALIZED) {
  8070      for (unsigned i = 0; i < mc->mc_snum; ++i) {
  8071        const MDBX_page *mp = mc->mc_pg[i];
  8072        if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) {
  8073          unsigned const n = dpl_search(txn, mp->mp_pgno);
  8074          if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno &&
  8075              dpl_age(txn, n)) {
  8076            txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru;
  8077            ++keep;
  8078          }
  8079        }
  8080      }
  8081      if (!mc->mc_xcursor)
  8082        break;
  8083      mc = &mc->mc_xcursor->mx_cursor;
  8084    }
  8085    return keep;
  8086  }
  8087  
  8088  static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
  8089    unsigned keep = m0 ? cursor_keep(txn, m0) : 0;
  8090    for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i)
  8091      if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) &&
  8092          txn->mt_dbs[i].md_root != P_INVALID)
  8093        for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next)
  8094          if (mc != m0)
  8095            keep += cursor_keep(txn, mc);
  8096    return keep;
  8097  }
  8098  
  8099  /* Returns the spilling priority (0..255) for a dirty page:
  8100   *      0 = should be spilled;
  8101   *    ...
  8102   *  > 255 = must not be spilled. */
  8103  static unsigned spill_prio(const MDBX_txn *txn, const unsigned i,
  8104                             const uint32_t reciprocal) {
  8105    MDBX_dpl *const dl = txn->tw.dirtylist;
  8106    const uint32_t age = dpl_age(txn, i);
  8107    const unsigned npages = dpl_npages(dl, i);
  8108    const pgno_t pgno = dl->items[i].pgno;
  8109    if (age == 0) {
  8110      DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno);
  8111      return 256;
  8112    }
  8113  
  8114    MDBX_page *const dp = dl->items[i].ptr;
  8115    if (dp->mp_flags & (P_LOOSE | P_SPILLED)) {
  8116      DEBUG("skip %s %u page %" PRIaPGNO,
  8117            (dp->mp_flags & P_LOOSE)   ? "loose"
  8118            : (dp->mp_flags & P_LOOSE) ? "loose"
  8119                                       : "parent-spilled",
  8120            npages, pgno);
  8121      return 256;
  8122    }
  8123  
  8124    /* Can't spill twice,
  8125     * make sure it's not already in a parent's spill list(s). */
  8126    MDBX_txn *parent = txn->mt_parent;
  8127    if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) {
  8128      do
  8129        if (intersect_spilled(parent, pgno, npages)) {
  8130          DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno);
  8131          dp->mp_flags |= P_SPILLED;
  8132          return 256;
  8133        }
  8134      while ((parent = parent->mt_parent) != nullptr);
  8135    }
  8136  
  8137    tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX);
  8138    unsigned prio = age * reciprocal >> 24;
  8139    tASSERT(txn, prio < 256);
  8140    if (likely(npages == 1))
  8141      return prio = 256 - prio;
  8142  
  8143    /* make a large/overflow pages be likely to spill */
  8144    uint32_t factor = npages | npages >> 1;
  8145    factor |= factor >> 2;
  8146    factor |= factor >> 4;
  8147    factor |= factor >> 8;
  8148    factor |= factor >> 16;
  8149    factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
  8150    factor = (factor < 256) ? 255 - factor : 0;
  8151    tASSERT(txn, factor < 256 && factor < (256 - prio));
  8152    return prio = factor;
  8153  }
  8154  
  8155  /* Spill pages from the dirty list back to disk.
  8156   * This is intended to prevent running into MDBX_TXN_FULL situations,
  8157   * but note that they may still occur in a few cases:
  8158   *
  8159   * 1) our estimate of the txn size could be too small. Currently this
  8160   *  seems unlikely, except with a large number of MDBX_MULTIPLE items.
  8161   *
  8162   * 2) child txns may run out of space if their parents dirtied a
  8163   *  lot of pages and never spilled them. TODO: we probably should do
  8164   *  a preemptive spill during mdbx_txn_begin() of a child txn, if
  8165   *  the parent's dirtyroom is below a given threshold.
  8166   *
  8167   * Otherwise, if not using nested txns, it is expected that apps will
  8168   * not run into MDBX_TXN_FULL any more. The pages are flushed to disk
  8169   * the same way as for a txn commit, e.g. their dirty status is cleared.
  8170   * If the txn never references them again, they can be left alone.
  8171   * If the txn only reads them, they can be used without any fuss.
  8172   * If the txn writes them again, they can be dirtied immediately without
  8173   * going thru all of the work of page_touch(). Such references are
  8174   * handled by page_unspill().
  8175   *
  8176   * Also note, we never spill DB root pages, nor pages of active cursors,
  8177   * because we'll need these back again soon anyway. And in nested txns,
  8178   * we can't spill a page in a child txn if it was already spilled in a
  8179   * parent txn. That would alter the parent txns' data even though
  8180   * the child hasn't committed yet, and we'd have no way to undo it if
  8181   * the child aborted. */
static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
                     const unsigned need) {
#if xMDBX_DEBUG_SPILLING != 1
  /* production mode */
  if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
    return MDBX_SUCCESS;
  unsigned wanna_spill = need - txn->tw.dirtyroom;
#else
  /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */
  unsigned wanna_spill =
      (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1;
#endif /* xMDBX_DEBUG_SPILLING */

  /* clamp the amount to spill into [spill_min..spill_max], both derived
   * from the tunable denominators of the current dirty-list length */
  const unsigned dirty = txn->tw.dirtylist->length;
  const unsigned spill_min =
      txn->mt_env->me_options.spill_min_denominator
          ? dirty / txn->mt_env->me_options.spill_min_denominator
          : 0;
  const unsigned spill_max =
      dirty - (txn->mt_env->me_options.spill_max_denominator
                   ? dirty / txn->mt_env->me_options.spill_max_denominator
                   : 0);
  wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min;
  wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max;
  if (!wanna_spill)
    return MDBX_SUCCESS;

  NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill,
         txn->tw.dirtyroom, need);
  tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill);

  struct iov_ctx ctx;
  iov_init(txn, &ctx);
  int rc = MDBX_SUCCESS;
  if (txn->mt_flags & MDBX_WRITEMAP) {
    /* WRITEMAP mode: dirty pages live in the mapping itself, so "spilling"
     * means flushing everything except loose pages and compacting the
     * dirty-list down to just the loose entries. */
    MDBX_dpl *const dl = txn->tw.dirtylist;
    const unsigned span = dl->length - txn->tw.loose_count;
    txn->tw.dirtyroom += span;
    unsigned r, w;
    for (w = 0, r = 1; r <= dl->length; ++r) {
      MDBX_page *dp = dl->items[r].ptr;
      if (dp->mp_flags & P_LOOSE)
        dl->items[++w] = dl->items[r];
      else if (!MDBX_FAKE_SPILL_WRITEMAP) {
        rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r));
        tASSERT(txn, rc == MDBX_SUCCESS);
      }
    }

    tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count);
    dl->sorted = (dl->sorted == dl->length) ? w : 0;
    dpl_setlen(dl, w);
    tASSERT(txn, dirtylist_check(txn));

    if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) {
      MDBX_env *const env = txn->mt_env;
#if MDBX_ENABLE_PGOP_STAT
      env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      rc = osal_msync(&env->me_dxb_mmap,
                      pgno_align2os_bytes(env, ctx.flush_begin),
                      pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin),
                      MDBX_SYNC_NONE);
    }
    return rc;
  }

  tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  if (!txn->tw.spill_pages) {
    txn->tw.spill_least_removed = INT_MAX;
    txn->tw.spill_pages = pnl_alloc(wanna_spill);
    if (unlikely(!txn->tw.spill_pages)) {
      rc = MDBX_ENOMEM;
    bailout:
      txn->mt_flags |= MDBX_TXN_ERROR;
      return rc;
    }
  } else {
    /* purge deleted slots */
    spill_purge(txn);
    rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill);
    (void)rc /* ignore since the resulting list may be shorter
     and pnl_append() will increase pnl on demand */
        ;
  }

  /* Sort by pgno so that writing to disk is more sequential */
  MDBX_dpl *const dl = dpl_sort(txn);

  /* Preserve pages which may soon be dirtied again */
  const unsigned unspillable = txn_keep(txn, m0);
  if (unspillable + txn->tw.loose_count >= dl->length) {
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode  */
    if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
      return MDBX_SUCCESS;
#endif /* xMDBX_DEBUG_SPILLING */
    ERROR("all %u dirty pages are unspillable  since referenced "
          "by a cursor(s), use fewer cursors or increase "
          "MDBX_opt_txn_dp_limit",
          unspillable);
    goto done;
  }

  /* Subtask: push some pages out to disk in LRU order, but with important
   * adjustments:
   *  - prefer to evict old large/overflow pages, since that frees more
   *    memory, and also because (as currently understood) they are much
   *    less likely to be modified again;
   *  - other things being equal, prefer to evict adjacent pages, as this
   *    reduces the number of I/O operations;
   *  - it is desirable to spend less time on this than
   *    std::partial_sort_copy would;
   *
   * Solution:
   *  - Quantize the whole range of lru-labels down to 256 values and use
   *    a single pass of an 8-bit radix-sort. The result is 256 levels of
   *    "freshness", including the lru-label value older than which pages
   *    must be evicted;
   *  - Walk sequentially in the direction of increasing page numbers and
   *    evict pages whose lru-label is older than the cutoff value, until
   *    enough have been evicted;
   *  - When encountering pages adjacent to ones being evicted, evict them
   *    too (to reduce the number of I/O operations) if they fall into the
   *    first half between the evicted and the freshest lru-labels;
   *  - additionally, during sorting, large/overflow pages are deliberately
   *    aged, thereby increasing their chances of being evicted. */

  /* get min/max of LRU-labels */
  uint32_t age_max = 0;
  for (unsigned i = 1; i <= dl->length; ++i) {
    const uint32_t age = dpl_age(txn, i);
    age_max = (age_max >= age) ? age_max : age;
  }

  VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);

  /* half of 8-bit radix-sort */
  unsigned radix_counters[256], spillable = 0, spilled = 0;
  memset(&radix_counters, 0, sizeof(radix_counters));
  const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
  for (unsigned i = 1; i <= dl->length; ++i) {
    unsigned prio = spill_prio(txn, i, reciprocal);
    if (prio < 256) {
      radix_counters[prio] += 1;
      spillable += 1;
    }
  }

  if (likely(spillable > 0)) {
    /* walk the radix histogram to pick the cutoff priorities:
     * pages with prio <= prio2spill are evicted outright, pages with
     * prio < prio2adjacent are co-evicted when adjacent to evicted ones */
    unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0];
    for (unsigned i = 1; i < 256; i++) {
      if (amount < wanna_spill) {
        prio2spill = i;
        prio2adjacent = i + (257 - i) / 2;
        amount += radix_counters[i];
      } else if (amount + amount < spillable + wanna_spill
                 /* EQUIVALENT to: amount - wanna_spill < spillable - amount */) {
        prio2adjacent = i;
        amount += radix_counters[i];
      } else
        break;
    }

    VERBOSE("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, "
            "wanna_spill %u",
            prio2spill, prio2adjacent, amount, spillable, wanna_spill);
    tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);

    unsigned prev_prio = 256;
    unsigned r, w, prio;
    for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
         prev_prio = prio, ++r) {
      prio = spill_prio(txn, r, reciprocal);
      MDBX_page *const dp = dl->items[r].ptr;
      if (prio < prio2adjacent) {
        const pgno_t pgno = dl->items[r].pgno;
        const unsigned npages = dpl_npages(dl, r);
        if (prio <= prio2spill) {
          /* the previous (kept) entry is contiguous with this one and
           * "old enough" — co-spill it too to merge the I/O */
          if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
              dpl_endpgno(dl, r - 1) == pgno) {
            DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO
                  " (age %d, prio %u)",
                  dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1),
                  prev_prio);
            --w;
            rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
                            dpl_npages(dl, r - 1));
            if (unlikely(rc != MDBX_SUCCESS))
              break;
            ++spilled;
          }

          DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages,
                dp->mp_pgno, dpl_age(txn, r), prio);
          rc = spill_page(txn, &ctx, dp, npages);
          if (unlikely(rc != MDBX_SUCCESS))
            break;
          ++spilled;
          continue;
        }

        /* not old enough by itself, but contiguous with a just-spilled
         * entry — co-spill to keep the write sequential */
        if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
          DEBUG("co-spill %u next-adjacent page %" PRIaPGNO
                " (age %d, prio %u)",
                npages, dp->mp_pgno, dpl_age(txn, r), prio);
          rc = spill_page(txn, &ctx, dp, npages);
          if (unlikely(rc != MDBX_SUCCESS))
            break;
          prio = prev_prio /* to continue co-spilling next adjacent pages */;
          ++spilled;
          continue;
        }
      }
      /* keep this entry: compact it towards the front of the list */
      dl->items[++w] = dl->items[r];
    }

    tASSERT(txn, spillable == 0 || spilled > 0);

    /* move the unprocessed tail (if the loop broke early) */
    while (r <= dl->length)
      dl->items[++w] = dl->items[r++];
    tASSERT(txn, r - 1 - w == spilled);

    dl->sorted = dpl_setlen(dl, w);
    txn->tw.dirtyroom += spilled;
    tASSERT(txn, dirtylist_check(txn));

    if (ctx.iov_items) {
      /* iov_page() frees dirty-pages and reset iov_items in case of failure. */
      tASSERT(txn, rc == MDBX_SUCCESS);
      rc = iov_write(txn, &ctx);
    }

    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1);
    txn->mt_flags |= MDBX_TXN_SPILLS;
    NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled,
           txn->tw.dirtyroom);
    iov_done(txn, &ctx);
  } else {
    /* nothing spillable at all: dump the dirty-list for diagnostics */
    tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
    for (unsigned i = 1; i <= dl->length; ++i) {
      MDBX_page *dp = dl->items[i].ptr;
      NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
             i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i),
             spill_prio(txn, i, reciprocal));
    }
  }

#if xMDBX_DEBUG_SPILLING == 2
  if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
    ERROR("dirty-list length: before %u, after %u, parent %i, loose %u; "
          "needed %u, spillable %u; "
          "spilled %u dirty-entries, now have %u dirty-room",
          dl->length + spilled, dl->length,
          (txn->mt_parent && txn->mt_parent->tw.dirtylist)
              ? (int)txn->mt_parent->tw.dirtylist->length
              : -1,
          txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom);
  ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
#endif /* xMDBX_DEBUG_SPILLING */

done:
  return likely(txn->tw.dirtyroom + txn->tw.loose_count >
                ((need > CURSOR_STACK) ? CURSOR_STACK : need))
             ? MDBX_SUCCESS
             : MDBX_TXN_FULL;
}
  8450  
  8451  static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
  8452                          const MDBX_val *data) {
  8453    MDBX_txn *txn = mc->mc_txn;
  8454    /* Estimate how much space this operation will take: */
  8455    /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */
  8456    unsigned need = CURSOR_STACK + 3;
  8457    /* 2) GC/FreeDB for any payload */
  8458    if (mc->mc_dbi > FREE_DBI) {
  8459      need += txn->mt_dbs[FREE_DBI].md_depth + 3;
  8460      /* 3) Named DBs also dirty the main DB */
  8461      if (mc->mc_dbi > MAIN_DBI)
  8462        need += txn->mt_dbs[MAIN_DBI].md_depth + 3;
  8463    }
  8464  #if xMDBX_DEBUG_SPILLING != 2
  8465    /* production mode */
  8466    /* 4) Double the page chain estimation
  8467     * for extensively splitting, rebalance and merging */
  8468    need += need;
  8469    /* 5) Factor the key+data which to be put in */
  8470    need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
  8471  #else
  8472    /* debug mode */
  8473    (void)key;
  8474    (void)data;
  8475    mc->mc_txn->mt_env->debug_dirtied_est = ++need;
  8476    mc->mc_txn->mt_env->debug_dirtied_act = 0;
  8477  #endif /* xMDBX_DEBUG_SPILLING == 2 */
  8478  
  8479    return txn_spill(txn, mc, need);
  8480  }
  8481  
  8482  /*----------------------------------------------------------------------------*/
  8483  
  8484  static bool meta_bootid_match(const MDBX_meta *meta) {
  8485    return memcmp(&meta->mm_bootid, &bootid, 16) == 0 &&
  8486           (bootid.x | bootid.y) != 0;
  8487  }
  8488  
  8489  static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
  8490                                   const int lck_exclusive) {
  8491    return lck_exclusive
  8492               ? /* exclusive lock */ meta_bootid_match(meta)
  8493               : /* db already opened */ env->me_lck_mmap.lck &&
  8494                     (env->me_lck_mmap.lck->mti_envmode.weak & MDBX_RDONLY) == 0;
  8495  }
  8496  
/* Address of the n-th meta-page within the mapped DB file,
 * and the first byte past the last one. */
#define METAPAGE(env, n) page_meta(pgno2page(env, n))
#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
  8499  
  8500  MDBX_NOTHROW_PURE_FUNCTION static txnid_t
  8501  constmeta_txnid(const MDBX_meta *meta) {
  8502    const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a);
  8503    const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b);
  8504    return likely(a == b) ? a : 0;
  8505  }
  8506  
/* A coherent sample of a meta-page: its txnid and whether the meta
 * was steady at the moment of sampling (0/0 if the read was torn). */
typedef struct {
  uint64_t txnid;
  size_t is_steady;
} meta_snap_t;
  8511  
/* Loads a 64-bit txnid stored as a pair of 32-bit atomics.
 * Where unaligned 64-bit atomic loads are safe, a single load is used;
 * otherwise the value is assembled from two 32-bit halves (torn reads
 * are detected by the caller via the a/b double-copy, see meta_snap). */
static __always_inline txnid_t
atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) {
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr,
                       mo_AcquireRelease);
#else
  /* the boolean index picks the word holding the low 32 bits first,
   * regardless of the host byte order */
  const uint32_t l = atomic_load32(
      &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
  const uint32_t h = atomic_load32(
      &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
  return (uint64_t)h << 32 | l;
#endif
}
  8526  
/* Takes a coherent snapshot of a meta-page's txnid and steadiness.
 * The txnid is stored in two copies (mm_txnid_a / mm_txnid_b); a writer
 * zeroes the second copy before updating the first (meta_update_begin),
 * so if the copies disagree here the meta is mid-update and the snapshot
 * is reported invalid (txnid = 0, not steady). */
static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) {
  txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a);
  jitter4testing(true);
  size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID;
  jitter4testing(true);
  if (unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b)))
    txnid = is_steady = 0;
  meta_snap_t r = {txnid, is_steady};
  return r;
}
  8537  
  8538  static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) {
  8539    return meta_snap(meta).txnid;
  8540  }
  8541  
/* Begins an in-place update of a live meta-page: first invalidates the
 * second txnid copy (mm_txnid_b = 0), then publishes the new txnid into
 * the first copy — concurrent readers observe the mismatch via
 * meta_snap() and treat the meta as torn until meta_update_end(). */
static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta,
                                       txnid_t txnid) {
  eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid &&
                   unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
  (void)env;
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0,
                 mo_AcquireRelease);
  atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid,
                 mo_AcquireRelease);
#else
  /* no 64-bit atomics here: store each half separately, b-copy first */
  atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
                 0, mo_AcquireRelease);
  atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
                 0, mo_AcquireRelease);
  atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)txnid, mo_AcquireRelease);
  atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)(txnid >> 32), mo_AcquireRelease);
#endif
}
  8565  
/* Completes an in-place meta-page update started by meta_update_begin():
 * refreshes the stored boot-id and writes the txnid into the second copy,
 * making both copies agree again so readers see a coherent meta. */
static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta,
                                     txnid_t txnid) {
  eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid);
  eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
  (void)env;
  jitter4testing(true);
  memcpy(&meta->mm_bootid, &bootid, 16);
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) &&      \
    MDBX_UNALIGNED_OK >= 8
  atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid,
                 mo_AcquireRelease);
#else
  /* no 64-bit atomics here: store each half separately */
  atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)txnid, mo_AcquireRelease);
  atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
                 (uint32_t)(txnid >> 32), mo_AcquireRelease);
#endif
}
  8585  
/* Stores a txnid into a private meta-IMAGE being prepared for writing.
 * Must NOT be used on a live mapped meta-page (the assert checks that
 * the target lies outside the mapped meta area). */
static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
                                    const txnid_t txnid) {
  eASSERT(env,
          !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env));
  (void)env;
  /* update inconsistently since this function used ONLY for filling meta-image
   * for writing, but not the actual meta-page */
  memcpy(&meta->mm_bootid, &bootid, 16);
  unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
  unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
}
  8597  
/* Computes the "steady" signature for a meta-page.
 * The real content hash is not implemented yet (see the #if 0 TODO), so
 * the result is currently a constant derived from MDBX_DATASIGN_NONE. */
static __inline uint64_t meta_sign(const MDBX_meta *meta) {
  uint64_t sign = MDBX_DATASIGN_NONE;
#if 0 /* TODO */
  sign = hippeus_hash64(...);
#else
  (void)meta;
#endif
  /* LY: never returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */
  return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign;
}
  8608  
/* A meta-page reference bundled with a coherent txnid snapshot and its
 * steadiness flag at the moment of sampling. */
typedef struct {
  txnid_t txnid;
  union {
    const volatile MDBX_meta *ptr_v; /* live (volatile) view */
    const MDBX_meta *ptr_c;          /* quiescent (const) view */
  };
  size_t is_steady;
} meta_ptr_t;
  8617  
  8618  static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) {
  8619    eASSERT(env, n < NUM_METAS);
  8620    meta_ptr_t r;
  8621    meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n));
  8622    r.txnid = snap.txnid;
  8623    r.is_steady = snap.is_steady;
  8624    return r;
  8625  }
  8626  
  8627  static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) {
  8628    return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s;
  8629  }
  8630  
  8631  static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int,
  8632                                                 bool a_steady, bool b_steady) {
  8633    assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
  8634    return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady);
  8635  }
  8636  
  8637  static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int,
  8638                                                 bool a_steady, bool b_steady) {
  8639    assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */);
  8640    return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1);
  8641  }
  8642  
  8643  static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady,
  8644                                          txnid_t b_txnid, bool b_steady) {
  8645    return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
  8646  }
  8647  
  8648  static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady,
  8649                                          txnid_t b_txnid, bool b_steady) {
  8650    return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
  8651  }
  8652  
/* Packs the pairwise comparison results of the three meta-pages into a
 * single byte: bits 0-1 = tail index, bits 2-3 = most-recent index,
 * bits 4-5 = preferred-steady index, bit 6 = strictly ordered,
 * bit 7 = valid. Used to pre-compute the troika_fsm_map[] table;
 * decoded by meta_troika_unpack(). */
MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02,
                                               uint8_t c12, bool s0, bool s1,
                                               bool s2) {
  assert(c01 < 3 && c02 < 3 && c12 < 3);
  /* assert(s0 < 2 && s1 < 2 && s2 < 2); */
  const uint8_t recent = meta_cmp2recent(c01, s0, s1)
                             ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
                             : (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
  const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1)
                                    ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
                                    : (meta_cmp2steady(c12, s1, s2) ? 1 : 2);

  /* the tail is the "least steady-preferable" of the two non-recent metas */
  uint8_t tail;
  if (recent == 0)
    tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
  else if (recent == 1)
    tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
  else
    tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0;

  /* valid: not all three metas are indistinguishable;
   * strict: no two metas are indistinguishable */
  const bool valid =
      c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
  const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
                      (c12 != 1 || s1 != s2);
  return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7;
}
  8679  
  8680  static __inline void meta_troika_unpack(meta_troika_t *troika,
  8681                                          const uint8_t packed) {
  8682    troika->recent = (packed >> 2) & 3;
  8683    troika->prefer_steady = (packed >> 4) & 3;
  8684    troika->tail_and_flags = packed & 0xC3;
  8685  }
  8686  
/* Pre-computed lookup table resolving the raw comparison state built by
 * meta_tap() — three steadiness bits plus three 3-way txnid comparisons
 * (2*2*2 * 3*3*3 = 216 states) — into the packed byte decoded by
 * meta_troika_unpack() (recent/prefer_steady/tail and validity flags). */
static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = {
    232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232,
    168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169,
    232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233,
    169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152,
    168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212,
    214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137,
    216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40,
    129, 148, 150, 168, 169, 168, 40,  169, 129, 152, 194, 233, 169, 168, 169,
    168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168,
    168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228,
    212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233,
    233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150,
    164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214,
    214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194,
    210, 194, 225, 193, 210, 194};
  8703  
/* Samples all three meta-pages and computes the "troika": which meta is
 * the most recent, which is the preferred steady one and which is the
 * tail. The fsm byte accumulates the three steadiness bits (weights 1/2/4)
 * and the three pairwise 3-way txnid comparisons (weights 8/24/72), then
 * troika_fsm_map[] resolves the whole state with a single lookup. */
__hot static meta_troika_t meta_tap(const MDBX_env *env) {
  meta_snap_t snap;
  meta_troika_t troika;
  snap = meta_snap(METAPAGE(env, 0));
  troika.txnid[0] = snap.txnid;
  troika.fsm = (uint8_t)snap.is_steady << 0;
  snap = meta_snap(METAPAGE(env, 1));
  troika.txnid[1] = snap.txnid;
  troika.fsm += (uint8_t)snap.is_steady << 1;
  troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8);
  snap = meta_snap(METAPAGE(env, 2));
  troika.txnid[2] = snap.txnid;
  troika.fsm += (uint8_t)snap.is_steady << 2;
  troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3);
  troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3);

  meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]);
  return troika;
}
  8723  
  8724  static txnid_t recent_committed_txnid(const MDBX_env *env) {
  8725    const txnid_t m0 = meta_txnid(METAPAGE(env, 0));
  8726    const txnid_t m1 = meta_txnid(METAPAGE(env, 1));
  8727    const txnid_t m2 = meta_txnid(METAPAGE(env, 2));
  8728    return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? m1 : m2);
  8729  }
  8730  
  8731  static __inline bool meta_eq(const meta_troika_t *troika, unsigned a,
  8732                               unsigned b) {
  8733    assert(a < NUM_METAS && b < NUM_METAS);
  8734    return troika->txnid[a] == troika->txnid[b] &&
  8735           (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 &&
  8736           troika->txnid[a];
  8737  }
  8738  
  8739  static unsigned meta_eq_mask(const meta_troika_t *troika) {
  8740    return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 |
  8741           meta_eq(troika, 2, 0) << 2;
  8742  }
  8743  
  8744  __hot static bool meta_should_retry(const MDBX_env *env,
  8745                                      meta_troika_t *troika) {
  8746    const meta_troika_t prev = *troika;
  8747    *troika = meta_tap(env);
  8748    return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] ||
  8749           prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2];
  8750  }
  8751  
  8752  static __always_inline meta_ptr_t meta_recent(const MDBX_env *env,
  8753                                                const meta_troika_t *troika) {
  8754    meta_ptr_t r;
  8755    r.txnid = troika->txnid[troika->recent];
  8756    r.ptr_v = METAPAGE(env, troika->recent);
  8757    r.is_steady = (troika->fsm >> troika->recent) & 1;
  8758    return r;
  8759  }
  8760  
  8761  static __always_inline meta_ptr_t
  8762  meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) {
  8763    meta_ptr_t r;
  8764    r.txnid = troika->txnid[troika->prefer_steady];
  8765    r.ptr_v = METAPAGE(env, troika->prefer_steady);
  8766    r.is_steady = (troika->fsm >> troika->prefer_steady) & 1;
  8767    return r;
  8768  }
  8769  
  8770  static __always_inline meta_ptr_t meta_tail(const MDBX_env *env,
  8771                                              const meta_troika_t *troika) {
  8772    const uint8_t tail = troika->tail_and_flags & 3;
  8773    meta_ptr_t r;
  8774    r.txnid = troika->txnid[tail];
  8775    r.ptr_v = METAPAGE(env, tail);
  8776    r.is_steady = (troika->fsm >> tail) & 1;
  8777    return r;
  8778  }
  8779  
  8780  static const char *durable_caption(const volatile MDBX_meta *const meta) {
  8781    if (META_IS_STEADY(meta))
  8782      return (unaligned_peek_u64_volatile(4, meta->mm_sign) ==
  8783              meta_sign((const MDBX_meta *)meta))
  8784                 ? "Steady"
  8785                 : "Tainted";
  8786    return "Weak";
  8787  }
  8788  
/* Logs the full troika state for diagnostics: per-meta txnid and
 * steadiness, the raw fsm byte, and the resolved head/base/tail choices
 * with their validity flags. */
__cold static void meta_troika_dump(const MDBX_env *env,
                                    const meta_troika_t *troika) {
  const meta_ptr_t recent = meta_recent(env, troika);
  const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika);
  const meta_ptr_t tail = meta_tail(env, troika);
  NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, "
         "head=%d-%" PRIaTXN ".%c, "
         "base=%d-%" PRIaTXN ".%c, "
         "tail=%d-%" PRIaTXN ".%c, "
         "valid %c, strict %c",
         troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1],
         (troika->fsm & 2) ? 's' : 'w', troika->txnid[2],
         (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent,
         recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady,
         prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w',
         troika->tail_and_flags % NUM_METAS, tail.txnid,
         tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N',
         TROIKA_STRICT_VALID(troika) ? 'Y' : 'N');
}
  8808  
  8809  /*----------------------------------------------------------------------------*/
  8810  
/* Find oldest txnid still referenced.
 *
 * Scans the shared readers table for the smallest live reader txnid, capped
 * above by `steady` (the prefer-steady meta txnid). The outer loop re-runs the
 * scan whenever the refresh flag shows the table changed underneath us, so the
 * returned value is a coherent snapshot. Also updates the cached
 * mti_oldest_reader in the lock area when the value advanced. */
static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) {
  const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
  eASSERT(env, steady <= env->me_txn0->mt_txnid);

  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (unlikely(lck == NULL /* exclusive without-lck mode */)) {
    /* No shared readers table: the steady txnid is trivially the oldest. */
    eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub);
    return env->me_lck->mti_oldest_reader.weak = steady;
  }

  const txnid_t prev_oldest =
      atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease);
  eASSERT(env, steady >= prev_oldest);

  txnid_t new_oldest = prev_oldest;
  /* Repeat the scan until either the answer can't improve (== steady) or
   * the refresh flag proves no reader slot changed during the pass. */
  while (new_oldest != steady &&
         nothing_changed !=
             atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) {
    /* Mark the table as "unchanged"; any concurrent slot update clears it. */
    lck->mti_readers_refresh_flag.weak = nothing_changed;
    jitter4testing(false);
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    new_oldest = steady;

    for (unsigned i = 0; i < snap_nreaders; ++i) {
      const mdbx_pid_t pid =
          atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
      if (!pid)
        continue; /* empty slot */
      jitter4testing(true);

      const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid);
      if (unlikely(rtxn < prev_oldest)) {
        /* A reader older than the previously published oldest is stale
         * (stuck/dead); try to reset its slot, but only if the table was
         * quiescent during this pass, to avoid kicking a live reader. */
        if (unlikely(nothing_changed ==
                     atomic_load32(&lck->mti_readers_refresh_flag,
                                   mo_AcquireRelease)) &&
            safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) {
          NOTICE("kick stuck reader[%u of %u].pid_%u %" PRIaTXN
                 " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
                 i, snap_nreaders, pid, rtxn, prev_oldest, steady);
        }
        continue;
      }

      if (rtxn < new_oldest) {
        new_oldest = rtxn;
        /* In release builds, reaching prev_oldest means no smaller value
         * is possible — stop early. Debug builds scan all slots. */
        if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest)
          break;
      }
    }
  }

  if (new_oldest != prev_oldest) {
    VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
    eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak);
    atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed);
  }
  return new_oldest;
}
  8871  
  8872  static txnid_t txn_oldest_reader(const MDBX_txn *const txn) {
  8873    return find_oldest_reader(txn->mt_env,
  8874                              txn->tw.troika.txnid[txn->tw.troika.prefer_steady]);
  8875  }
  8876  
/* Find largest mvcc-snapshot still referenced.
 *
 * Returns the maximum of `last_used_page` and the snapshot-pages-used value of
 * every live reader whose txnid is not ahead of the current write txn. Each
 * slot's (pages, txnid) pair is re-read after the first read; a mismatch means
 * the slot changed mid-read, so the whole scan restarts for coherency. */
__cold static pgno_t find_largest_snapshot(const MDBX_env *env,
                                           pgno_t last_used_page) {
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
  retry:;
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; ++i) {
      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
        /* jitter4testing(true); */
        const pgno_t snap_pages = atomic_load32(
            &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        /* torn read check: both fields must be unchanged on re-read */
        if (unlikely(
                snap_pages !=
                    atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used,
                                  mo_AcquireRelease) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
          goto retry;
        if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid)
          last_used_page = snap_pages;
      }
    }
  }

  return last_used_page;
}
  8905  
/* Add a page to the txn's dirty list.
 *
 * Stamps `mp` with the txn's front txnid and appends it (spanning `npages`)
 * to the dirty-page list. When the dirtyroom budget is exhausted, first tries
 * to free room by purging one loose page (moving its pgno into the reclaimed
 * list); otherwise fails with MDBX_TXN_FULL. On any failure the txn is marked
 * MDBX_TXN_ERROR and, in non-WRITEMAP mode, `mp` is released. */
__hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp,
                                                unsigned npages) {
#if xMDBX_DEBUG_SPILLING == 2
  txn->mt_env->debug_dirtied_act += 1;
  ENSURE(txn->mt_env,
         txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est);
  ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
#endif /* xMDBX_DEBUG_SPILLING == 2 */

  int rc;
  mp->mp_txnid = txn->mt_front;
  if (unlikely(txn->tw.dirtyroom == 0)) {
    if (txn->tw.loose_count) {
      /* Reclaim one loose page to make room: its pgno goes back to the
       * reclaimed list and its dirty-list slot is freed for `mp`. */
      MDBX_page *loose = txn->tw.loose_pages;
      DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno);
      rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      unsigned di = dpl_search(txn, loose->mp_pgno);
      tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose);
      dpl_remove(txn, di);
      txn->tw.loose_pages = loose->mp_next;
      txn->tw.loose_count--;
      txn->tw.dirtyroom++;
      /* In WRITEMAP mode the page lives in the mapping and isn't malloc'ed. */
      if (!(txn->mt_flags & MDBX_WRITEMAP))
        dpage_free(txn->mt_env, loose, 1);
    } else {
      ERROR("Dirtyroom is depleted, DPL length %u", txn->tw.dirtylist->length);
      if (!(txn->mt_flags & MDBX_WRITEMAP))
        dpage_free(txn->mt_env, mp, npages);
      return MDBX_TXN_FULL;
    }
  }

  rc = dpl_append(txn, mp->mp_pgno, mp, npages);
  if (unlikely(rc != MDBX_SUCCESS)) {
  bailout:
    txn->mt_flags |= MDBX_TXN_ERROR;
    return rc;
  }
  txn->tw.dirtyroom--;
  tASSERT(txn, dirtylist_check(txn));
  return MDBX_SUCCESS;
}
  8951  
#if !(defined(_WIN32) || defined(_WIN64))
/* Maps "operation not supported/implemented" errnos to MDBX_RESULT_TRUE so
 * callers can treat an unavailable optional syscall (madvise, fadvise,
 * sync_file_range, ...) as a benign no-op; any other errno is passed through.
 * NOTE: kept as an if-chain rather than a switch because several of these
 * constants may alias on some platforms (e.g. ENOTSUP == EOPNOTSUPP on
 * Linux), which would make duplicate case labels. */
MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) {
#ifdef ENOSYS
  if (err == ENOSYS)
    return MDBX_RESULT_TRUE;
#endif /* ENOSYS */
#ifdef ENOIMPL
  if (err == ENOIMPL)
    return MDBX_RESULT_TRUE;
#endif /* ENOIMPL */
#ifdef ENOTSUP
  if (err == ENOTSUP)
    return MDBX_RESULT_TRUE;
#endif /* ENOTSUP */
#ifdef ENOSUPP
  if (err == ENOSUPP)
    return MDBX_RESULT_TRUE;
#endif /* ENOSUPP */
#ifdef EOPNOTSUPP
  if (err == EOPNOTSUPP)
    return MDBX_RESULT_TRUE;
#endif /* EOPNOTSUPP */
  if (err == EAGAIN)
    return MDBX_RESULT_TRUE;
  return err;
}
#endif /* defined(_WIN32) || defined(_WIN64) */
  8979  
#if MDBX_ENABLE_MADVISE
/* Turn on/off readahead. It's harmful when the DB is larger than RAM.
 *
 * `edge` is the page number up to which the advice applies; `enable` selects
 * WILLNEED/NORMAL vs RANDOM advice; `force_whole` re-advises the entire
 * region instead of just the delta since the previous call. The previous
 * state is packed in me_lck->mti_readahead_anchor: bit0 = enabled flag,
 * bits 1.. = previous edge pgno. Returns MDBX_SUCCESS or an errno/MDBX code. */
__cold static int set_readahead(MDBX_env *env, const pgno_t edge,
                                const bool enable, const bool force_whole) {
  eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1);
  eASSERT(env, (enable & 1) == (enable != 0));
  /* Re-advise the whole region when forced, when the on/off direction
   * changed, or when there is no recorded prior state. */
  const bool toggle = force_whole ||
                      ((enable ^ env->me_lck->mti_readahead_anchor) & 1) ||
                      !env->me_lck->mti_readahead_anchor;
  const pgno_t prev_edge = env->me_lck->mti_readahead_anchor >> 1;
  const size_t limit = env->me_dxb_mmap.limit;
  /* When not toggling, only the [min(prev,new), max(prev,new)) delta needs
   * advising; offset/length are clamped to the mapping limit. */
  size_t offset =
      toggle ? 0
             : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge);
  offset = (offset < limit) ? offset : limit;

  size_t length =
      pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge);
  length = (length < limit) ? length : limit;
  length -= offset;

  eASSERT(env, 0 <= (intptr_t)length);
  if (length == 0)
    return MDBX_SUCCESS;

  NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset),
         bytes2pgno(env, offset + length));

#if defined(F_RDAHEAD)
  /* macOS/FreeBSD: per-descriptor readahead switch. */
  if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1))
    return errno;
#endif /* F_RDAHEAD */

  int err;
  if (enable) {
    /* Reset advice to NORMAL first, via whichever API this platform has. */
#if defined(MADV_NORMAL)
    err = madvise(env->me_map + offset, length, MADV_NORMAL)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_MADV_NORMAL)
    err = ignore_enosys(
        posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED)
    err = ignore_enosys(
        posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(_WIN32) || defined(_WIN64)
    /* no madvise on Windows */
#else
#warning "FIXME"
#endif
    if (toggle) {
      /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel,
       * because MADV_WILLNEED with offset != 0 may cause SIGBUS
       * on following access to the hinted region.
       * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021;
       * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */
#if defined(F_RDADVISE)
      struct radvisory hint;
      hint.ra_offset = offset;
      /* ra_count is an int: clamp when length could exceed INT_MAX */
      hint.ra_count =
          unlikely(length > INT_MAX && sizeof(length) > sizeof(hint.ra_count))
              ? INT_MAX
              : (int)length;
      (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl(
          env->me_lazy_fd, F_RDADVISE, &hint);
#elif defined(MADV_WILLNEED)
      err = madvise(env->me_map + offset, length, MADV_WILLNEED)
                ? ignore_enosys(errno)
                : MDBX_SUCCESS;
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#elif defined(POSIX_MADV_WILLNEED)
      err = ignore_enosys(
          posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED));
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#elif defined(_WIN32) || defined(_WIN64)
      if (mdbx_PrefetchVirtualMemory) {
        WIN32_MEMORY_RANGE_ENTRY hint;
        hint.VirtualAddress = env->me_map + offset;
        hint.NumberOfBytes = length;
        (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0);
      }
#elif defined(POSIX_FADV_WILLNEED)
      err = ignore_enosys(
          posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED));
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#else
#warning "FIXME"
#endif
    }
  } else {
    /* Disable readahead: advise RANDOM access for the region. */
#if defined(MADV_RANDOM)
    err = madvise(env->me_map + offset, length, MADV_RANDOM)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_MADV_RANDOM)
    err = ignore_enosys(
        posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_FADV_RANDOM)
    err = ignore_enosys(
        posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(_WIN32) || defined(_WIN64)
    /* no madvise on Windows */
#else
#warning "FIXME"
#endif /* MADV_RANDOM */
  }

  /* Record the new state for the next delta-advise call. */
  env->me_lck->mti_readahead_anchor = (enable & 1) + (edge << 1);
  err = MDBX_SUCCESS;
  return err;
}
#endif /* MDBX_ENABLE_MADVISE */
  9107  
/* Resizes the datafile and its memory mapping.
 *
 * `used_pgno` — highest page in use (must be flushed before shrink);
 * `size_pgno`/`limit_pgno` — new present size and mapping limit, in pages;
 * `implicit` — true when invoked on behalf of an ordinary txn rather than an
 * explicit geometry change, which forbids moving the mapping base.
 * Serialized via me_remap_guard; on Windows additionally suspends local
 * threads since a mapped view can't be safely remapped under them.
 * Returns MDBX_SUCCESS, an MDBX/errno code, or MDBX_PANIC on fatal loss of
 * the mapping. */
__cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
                             const pgno_t size_pgno, const pgno_t limit_pgno,
                             const bool implicit) {
  const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno);
  const size_t size_bytes = pgno_align2os_bytes(env, size_pgno);
  const size_t prev_size = env->me_dxb_mmap.current;
  const size_t prev_limit = env->me_dxb_mmap.limit;
#if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND)
  const void *const prev_addr = env->me_map;
#endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */

  VERBOSE("resize datafile/mapping: "
          "present %" PRIuPTR " -> %" PRIuPTR ", "
          "limit %" PRIuPTR " -> %" PRIuPTR,
          prev_size, size_bytes, prev_limit, limit_bytes);

  eASSERT(env, limit_bytes >= size_bytes);
  eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno);
  eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno);

  unsigned mresize_flags =
      env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC);
#if defined(_WIN32) || defined(_WIN64)
  /* Acquire guard in exclusive mode for:
   *   - to avoid collision between read and write txns around env->me_dbgeo;
   *   - to avoid attachment of new reading threads (see osal_rdt_lock); */
  osal_srwlock_AcquireExclusive(&env->me_remap_guard);
  mdbx_handle_array_t *suspended = NULL;
  mdbx_handle_array_t array_onstack;
  int rc = MDBX_SUCCESS;
  /* Nothing to do if size, limit and file size already match. */
  if (limit_bytes == env->me_dxb_mmap.limit &&
      size_bytes == env->me_dxb_mmap.current &&
      size_bytes == env->me_dxb_mmap.filesize)
    goto bailout;

  if ((env->me_flags & MDBX_NOTLS) == 0) {
    /* 1) Windows allows only extending a read-write section, but not a
     *    corresponding mapped view. Therefore in other cases we must suspend
     *    the local threads for safe remap.
     * 2) At least on Windows 10 1803 the entire mapped section is unavailable
     *    for short time during NtExtendSection() or VirtualAlloc() execution.
     * 3) Under Wine runtime environment on Linux a section extending is not
     *    supported.
     *
     * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */
    array_onstack.limit = ARRAY_LENGTH(array_onstack.handles);
    array_onstack.count = 0;
    suspended = &array_onstack;
    rc = osal_suspend_threads_before_remap(env, &suspended);
    if (rc != MDBX_SUCCESS) {
      ERROR("failed suspend-for-remap: errcode %d", rc);
      goto bailout;
    }
    mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP
                              : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
  }
#else  /* Windows */
  /* Acquire guard to avoid collision between read and write txns
   * around env->me_dbgeo */
  int rc = osal_fastmutex_acquire(&env->me_remap_guard);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (limit_bytes == env->me_dxb_mmap.limit &&
      size_bytes == env->me_dxb_mmap.current)
    goto bailout;

  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) &&
      lck && !implicit) {
    int err = osal_rdt_lock(env) /* lock readers table until remap done */;
    if (unlikely(MDBX_IS_ERROR(err))) {
      rc = err;
      goto bailout;
    }

    /* looking for readers from this process */
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    eASSERT(env, !implicit);
    mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
    for (unsigned i = 0; i < snap_nreaders; ++i) {
      if (lck->mti_readers[i].mr_pid.weak == env->me_pid &&
          lck->mti_readers[i].mr_tid.weak != osal_thread_self()) {
        /* the base address of the mapping can't be changed since
         * the other reader thread from this process exists. */
        osal_rdt_unlock(env);
        mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE);
        break;
      }
    }
  }
#endif /* ! Windows */

  /* Flush dirty pages before the mapping may be shrunk/moved. */
  if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno),
                    MDBX_SYNC_NONE);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
  }

#if MDBX_ENABLE_MADVISE
  /* On shrink, tell the kernel to discard the tail region. */
  if (size_bytes < prev_size) {
    NOTICE("resize-MADV_%s %u..%u",
           (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno,
           bytes2pgno(env, prev_size));
    rc = MDBX_RESULT_TRUE;
#if defined(MADV_REMOVE)
    if (env->me_flags & MDBX_WRITEMAP)
      rc =
          madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
#endif /* MADV_REMOVE */
#if defined(MADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = madvise(env->me_map + size_bytes, prev_size - size_bytes,
                   MADV_DONTNEED)
               ? ignore_enosys(errno)
               : MDBX_SUCCESS;
#elif defined(POSIX_MADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = ignore_enosys(posix_madvise(env->me_map + size_bytes,
                                       prev_size - size_bytes,
                                       POSIX_MADV_DONTNEED));
#elif defined(POSIX_FADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes,
                                       prev_size - size_bytes,
                                       POSIX_FADV_DONTNEED));
#endif /* MADV_DONTNEED */
    if (unlikely(MDBX_IS_ERROR(rc)))
      goto bailout;
    if (env->me_lck->mti_discarded_tail.weak > size_pgno)
      env->me_lck->mti_discarded_tail.weak = size_pgno;
  }
#endif /* MDBX_ENABLE_MADVISE */

  rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);

#if MDBX_ENABLE_MADVISE
  if (rc == MDBX_SUCCESS) {
    env->me_lck->mti_discarded_tail.weak = size_pgno;
    const bool readahead =
        !(env->me_flags & MDBX_NORDAHEAD) &&
        mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size);
    /* force whole-region re-advise when the limit or base address changed */
    const bool force = limit_bytes != prev_limit ||
                       env->me_dxb_mmap.address != prev_addr
#if defined(_WIN32) || defined(_WIN64)
                       || prev_size > size_bytes
#endif /* Windows */
        ;
    rc = set_readahead(env, size_pgno, readahead, force);
  }
#endif /* MDBX_ENABLE_MADVISE */

bailout:
  if (rc == MDBX_SUCCESS) {
    eASSERT(env, size_bytes == env->me_dxb_mmap.current);
    eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize);
    eASSERT(env, limit_bytes == env->me_dxb_mmap.limit);
#ifdef MDBX_USE_VALGRIND
    if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) {
      VALGRIND_DISCARD(env->me_valgrind_handle);
      env->me_valgrind_handle = 0;
      if (env->me_dxb_mmap.limit)
        env->me_valgrind_handle =
            VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
    }
#endif /* MDBX_USE_VALGRIND */
  } else {
    if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) {
      ERROR("failed resize datafile/mapping: "
            "present %" PRIuPTR " -> %" PRIuPTR ", "
            "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
            prev_size, size_bytes, prev_limit, limit_bytes, rc);
    } else {
      WARNING("unable resize datafile/mapping: "
              "present %" PRIuPTR " -> %" PRIuPTR ", "
              "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
              prev_size, size_bytes, prev_limit, limit_bytes, rc);
    }
    /* Losing the mapping itself is unrecoverable. */
    if (!env->me_dxb_mmap.address) {
      env->me_flags |= MDBX_FATAL_ERROR;
      if (env->me_txn)
        env->me_txn->mt_flags |= MDBX_TXN_ERROR;
      rc = MDBX_PANIC;
    }
  }

#if defined(_WIN32) || defined(_WIN64)
  int err = MDBX_SUCCESS;
  osal_srwlock_ReleaseExclusive(&env->me_remap_guard);
  if (suspended) {
    err = osal_resume_threads_after_remap(suspended);
    if (suspended != &array_onstack)
      osal_free(suspended);
  }
#else
  /* release the readers-table lock if it was taken above */
  if (env->me_lck_mmap.lck &&
      (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0)
    osal_rdt_unlock(env);
  int err = osal_fastmutex_release(&env->me_remap_guard);
#endif /* Windows */
  if (err != MDBX_SUCCESS) {
    FATAL("failed resume-after-remap: errcode %d", err);
    return MDBX_PANIC;
  }
  return rc;
}
  9320  
  9321  __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno,
  9322                                        const pgno_t size_pgno,
  9323                                        const pgno_t limit_pgno) {
  9324    const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit);
  9325    eASSERT(env, mapped_pgno >= used_pgno);
  9326    return map_resize(
  9327        env, used_pgno, size_pgno,
  9328        (size_pgno > mapped_pgno)
  9329            ? limit_pgno
  9330            : /* The actual mapsize may be less since the geo.upper may be changed
  9331                 by other process. So, avoids remapping until it necessary. */
  9332            mapped_pgno,
  9333        true);
  9334  }
  9335  
  9336  static int meta_unsteady(MDBX_env *env, const txnid_t last_steady,
  9337                           MDBX_meta *const meta, mdbx_filehandle_t fd) {
  9338    const uint64_t wipe = MDBX_DATASIGN_NONE;
  9339    if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) {
  9340      WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
  9341              data_page(meta)->mp_pgno);
  9342      if (env->me_flags & MDBX_WRITEMAP)
  9343        unaligned_poke_u64(4, meta->mm_sign, wipe);
  9344      else
  9345        return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign),
  9346                           (uint8_t *)&meta->mm_sign - env->me_map);
  9347    }
  9348    return MDBX_SUCCESS;
  9349  }
  9350  
/* Wipes the signatures of all steady meta-pages with txnid <= last_steady,
 * then durably syncs the meta area and refreshes the troika snapshot in this
 * txn and all its parents/children, so no wiped meta is treated as steady. */
__cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) {
  MDBX_env *const env = txn->mt_env;
#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* prefer the dsync descriptor so non-WRITEMAP writes land durably */
  const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                   ? env->me_dsync_fd
                                   : env->me_lazy_fd;
  int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (env->me_flags & MDBX_WRITEMAP) {
    /* wipes went through the mapping: flush CPU writeback then msync metas */
    osal_flush_incoherent_cpu_writeback();
    err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                     MDBX_SYNC_DATA);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  } else {
    if (fd == env->me_lazy_fd) {
#if MDBX_USE_SYNCFILERANGE
      /* try sync_file_range() once; fall back to fsync() forever if the
       * kernel reports it unsupported (sticky static flag) */
      static bool syncfilerange_unavailable;
      if (!syncfilerange_unavailable &&
          sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
                          SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
        err = errno;
        if (ignore_enosys(err) == MDBX_RESULT_TRUE)
          syncfilerange_unavailable = true;
      }
      if (syncfilerange_unavailable)
#endif /* MDBX_USE_SYNCFILERANGE */
        err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
    }
    /* file was written bypassing the mapping: drop possibly-stale pages */
    osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
                               env->me_os_psize);
  }

  /* force oldest refresh */
  atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
  tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
  /* re-snapshot the metas and propagate to the nested-txn chain */
  txn->tw.troika = meta_tap(env);
  for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child)
    if (scan != txn)
      scan->tw.troika = txn->tw.troika;
  return MDBX_SUCCESS;
}
  9405  
  9406  //------------------------------------------------------------------------------
  9407  
/* Portable fallback scanner: finds a run of `seq` consecutive page numbers
 * inside the sorted PNL region of `len` entries, returning a pointer to the
 * run's starting slot or nullptr. The main loop is unrolled by four so the
 * compiler can issue the loads/subtractions in parallel before branching. */
MDBX_MAYBE_UNUSED __hot static pgno_t *
scan4seq_fallback(pgno_t *range, const size_t len, const unsigned seq) {
  assert(seq > 0 && len > seq);
#if MDBX_PNL_ASCENDING
  assert(range[-1] == len);
  const pgno_t *const detent = range + len - seq;
  const ptrdiff_t offset = (ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  if (likely(len > seq + 3)) {
    do {
      /* Computing all four differences before any branch lets the compiler
       * load and evaluate the values in parallel. */
      const pgno_t diff0 = range[offset + 0] - range[0];
      const pgno_t diff1 = range[offset + 1] - range[1];
      const pgno_t diff2 = range[offset + 2] - range[2];
      const pgno_t diff3 = range[offset + 3] - range[3];
      if (diff0 == target)
        return range + 0;
      if (diff1 == target)
        return range + 1;
      if (diff2 == target)
        return range + 2;
      if (diff3 == target)
        return range + 3;
      range += 4;
    } while (range + 3 < detent);
    if (range == detent)
      return nullptr;
  }
  /* scalar tail: fewer than four candidates left */
  do
    if (range[offset] - *range == target)
      return range;
  while (++range < detent);
#else
  assert(range[-(ptrdiff_t)len] == len);
  const pgno_t *const detent = range - len + seq;
  const ptrdiff_t offset = -(ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  if (likely(len > seq + 3)) {
    do {
      const pgno_t diff0 = range[-0] - range[offset - 0];
      const pgno_t diff1 = range[-1] - range[offset - 1];
      const pgno_t diff2 = range[-2] - range[offset - 2];
      const pgno_t diff3 = range[-3] - range[offset - 3];
      /* The point of computing everything before the branches is to let the
       * compiler load and evaluate all the values in parallel. */
      if (diff0 == target)
        return range - 0;
      if (diff1 == target)
        return range - 1;
      if (diff2 == target)
        return range - 2;
      if (diff3 == target)
        return range - 3;
      range -= 4;
    } while (range > detent + 3);
    if (range == detent)
      return nullptr;
  }
  /* scalar tail: fewer than four candidates left */
  do
    if (*range - range[offset] == target)
      return range;
  while (--range > detent);
#endif /* MDBX_PNL sort-order */
  return nullptr;
}
  9472  
  9473  MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl,
  9474                                                            const unsigned seq) {
  9475    size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pnl);
  9476  #if MDBX_PNL_ASCENDING
  9477    while (seq <= MDBX_PNL_SIZE(pnl) - begin) {
  9478      if (pnl[begin + seq] - pnl[begin] == seq)
  9479        return pnl + begin;
  9480      ++begin;
  9481    }
  9482  #else
  9483    while (begin > seq) {
  9484      if (pnl[begin - seq] - pnl[begin] == seq)
  9485        return pnl + begin;
  9486      --begin;
  9487    }
  9488  #endif /* MDBX_PNL sort-order */
  9489    return nullptr;
  9490  }
  9491  
  9492  #if defined(_MSC_VER) && !defined(__builtin_clz) &&                            \
  9493      !__has_builtin(__builtin_clz)
  9494  MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) {
  9495    unsigned long index;
  9496    _BitScanReverse(&index, value);
  9497    return index;
  9498  }
  9499  #endif /* _MSC_VER */
  9500  
  9501  #if defined(_MSC_VER) && !defined(__builtin_clzl) &&                           \
  9502      !__has_builtin(__builtin_clzl)
  9503  #define __builtin_clzl(value) __builtin_clz(value)
  9504  #endif /* _MSC_VER */
  9505  
  9506  #if !defined(MDBX_ATTRIBUTE_TARGET) &&                                         \
  9507      (__has_attribute(__target__) || __GNUC_PREREQ(5, 0))
  9508  #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target)))
  9509  #endif /* MDBX_ATTRIBUTE_TARGET */
  9510  
  9511  #if defined(__SSE2__)
  9512  #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
  9513  #elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__)
  9514  #define __SSE2__
  9515  #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
  9516  #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__)
  9517  #define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2")
  9518  #endif /* __SSE2__ */
  9519  
  9520  #if defined(__AVX2__)
  9521  #define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */
  9522  #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__)
  9523  #define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2")
  9524  #endif /* __AVX2__ */
  9525  
  9526  #if defined(__AVX512BW__)
  9527  #define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */
  9528  #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) &&                   \
  9529      (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0))
  9530  #define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw")
  9531  #endif /* __AVX512BW__ */
  9532  
  9533  #ifdef MDBX_ATTRIBUTE_TARGET_SSE2
/* Compares four pgno_t at `ptr` with the four located `offset` elements away
 * and returns a 4-bit mask: bit i is set when ptr[i] - ptr[offset + i]
 * equals the corresponding 32-bit lane of `pattern`.
 * The *(const __m128 *)&cmp cast only reinterprets the integer vector as
 * float for _mm_movemask_ps(); vector types may alias this way on the
 * compilers this path is built with. */
MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned
diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset,
                  const __m128i pattern) {
  const __m128i f = _mm_loadu_si128((const __m128i *)ptr);
  const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset));
  const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern);
  return _mm_movemask_ps(*(const __m128 *)&cmp);
}
  9542  
/* SSE2 scan for a run of `seq`+1 consecutive page numbers inside a
 * descending-sorted PNL (only the descending order is implemented, see the
 * #error below). `range` enters pointing at the last (smallest) element;
 * the scan walks backwards testing whether the element `seq` slots before
 * exceeds the current one by exactly `seq` — since the list is sorted and
 * unique this proves the run is contiguous. The difference is compared
 * against `target` = (pgno_t)-seq, relying on unsigned wrap-around.
 * Returns a pointer to the run's smallest-pgno element, or nullptr. */
MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t *
scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) {
  assert(seq > 0 && len > seq);
#if MDBX_PNL_ASCENDING
#error "FIXME: Not implemented"
#endif /* MDBX_PNL_ASCENDING */
  assert(range[-(ptrdiff_t)len] == len);
  /* first position that can no longer start a long-enough run */
  pgno_t *const detent = range - len + seq;
  const ptrdiff_t offset = -(ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  const __m128i pattern = _mm_set1_epi32(target);
  uint8_t mask;
  if (likely(len > seq + 3)) {
    /* main loop: probe 4 candidate positions per iteration */
    do {
      mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern);
      if (mask) {
#ifndef __SANITIZE_ADDRESS__
      found:
#endif /* __SANITIZE_ADDRESS__ */
        /* pick the match closest to `range`: for mask's highest set bit i,
         * 28 - clz(mask) == i - 3, i.e. element (range - 3) + i */
        return range + 28 - __builtin_clz(mask);
      }
      range -= 4;
    } while (range > detent + 3);
    if (range == detent)
      return nullptr;
  }

  /* The vectorized tail below reads 4 to 12 extra bytes, which may lie not
   * only outside the region allocated for the PNL but may also cross a
   * memory-page boundary, which can cause ASAN errors or even a crash.
   * Therefore check the offset within the page, and always play safe when
   * built with ASAN. */
#ifndef __SANITIZE_ADDRESS__
  const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
  if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
      !RUNNING_ON_VALGRIND) {
    const unsigned extra = (unsigned)(detent + 4 - range);
    assert(extra > 0 && extra < 4);
    /* mask off the lanes that fell at or below the detent */
    mask = 0xF << extra;
    mask &= diffcmp2mask_sse2(range - 3, offset, pattern);
    if (mask)
      goto found;
    return nullptr;
  }
#endif /* __SANITIZE_ADDRESS__ */
  /* scalar fallback for the remaining (fewer than 4) candidates */
  do
    if (*range - range[offset] == target)
      return range;
  while (--range != detent);
  return nullptr;
}
  9593  #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
  9594  
  9595  #ifdef MDBX_ATTRIBUTE_TARGET_AVX2
/* AVX2 analogue of diffcmp2mask_sse2(): compares eight pgno_t at `ptr` with
 * the eight located `offset` elements away and returns an 8-bit mask of the
 * lanes where the difference equals `pattern`.
 * The *(const __m256 *)&cmp cast only reinterprets the integer vector for
 * _mm256_movemask_ps(). */
MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned
diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset,
                  const __m256i pattern) {
  const __m256i f = _mm256_loadu_si256((const __m256i *)ptr);
  const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset));
  const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern);
  return _mm256_movemask_ps(*(const __m256 *)&cmp);
}
  9604  
/* AVX2 scan for a run of `seq`+1 consecutive page numbers inside a
 * descending-sorted PNL; same contract and approach as scan4seq_sse2() but
 * probing 8 candidate positions per iteration, with an SSE2 half-width step
 * and a scalar loop as the tail. Returns a pointer to the run's
 * smallest-pgno element, or nullptr. */
MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t *
scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) {
  assert(seq > 0 && len > seq);
#if MDBX_PNL_ASCENDING
#error "FIXME: Not implemented"
#endif /* MDBX_PNL_ASCENDING */
  assert(range[-(ptrdiff_t)len] == len);
  /* first position that can no longer start a long-enough run */
  pgno_t *const detent = range - len + seq;
  const ptrdiff_t offset = -(ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  const __m256i pattern = _mm256_set1_epi32(target);
  uint8_t mask;
  if (likely(len > seq + 7)) {
    /* main loop: probe 8 candidate positions per iteration */
    do {
      mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern);
      if (mask) {
#ifndef __SANITIZE_ADDRESS__
      found:
#endif /* __SANITIZE_ADDRESS__ */
        /* for mask's highest set bit i: 24 - clz(mask) == i - 7 */
        return range + 24 - __builtin_clz(mask);
      }
      range -= 8;
    } while (range > detent + 7);
    if (range == detent)
      return nullptr;
  }

  /* The vectorized tail below reads 4 to 28 extra bytes, which may lie not
   * only outside the region allocated for the PNL but may also cross a
   * memory-page boundary, which can cause ASAN errors or even a crash.
   * Therefore check the offset within the page, and always play safe when
   * built with ASAN. */
#ifndef __SANITIZE_ADDRESS__
  const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */;
  if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
      !RUNNING_ON_VALGRIND) {
    const unsigned extra = (unsigned)(detent + 8 - range);
    assert(extra > 0 && extra < 8);
    /* mask off the lanes that fell at or below the detent */
    mask = 0xFF << extra;
    mask &= diffcmp2mask_avx2(range - 7, offset, pattern);
    if (mask)
      goto found;
    return nullptr;
  }
#endif /* __SANITIZE_ADDRESS__ */
  /* half-width SSE2 step (reuses the low 128 bits of `pattern`),
   * then a scalar loop for the final stragglers */
  if (range - 3 > detent) {
    mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern);
    if (mask)
      return range + 28 - __builtin_clz(mask);
    range -= 4;
  }
  while (range > detent) {
    if (*range - range[offset] == target)
      return range;
    --range;
  }
  return nullptr;
}
  9662  #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
  9663  
  9664  #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
/* AVX-512BW analogue of diffcmp2mask_sse2(): compares sixteen pgno_t at
 * `ptr` with the sixteen located `offset` elements away and returns a 16-bit
 * lane mask directly from the compare instruction (no movemask needed). */
MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned
diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset,
                      const __m512i pattern) {
  const __m512i f = _mm512_loadu_si512((const __m512i *)ptr);
  const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset));
  return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern);
}
  9672  
/* AVX-512BW scan for a run of `seq`+1 consecutive page numbers inside a
 * descending-sorted PNL; same contract and approach as scan4seq_sse2() but
 * probing 16 candidate positions per iteration, stepping down through AVX2
 * and SSE2 half-width probes and finally a scalar loop for the tail.
 * Returns a pointer to the run's smallest-pgno element, or nullptr. */
MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t *
scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) {
  assert(seq > 0 && len > seq);
#if MDBX_PNL_ASCENDING
#error "FIXME: Not implemented"
#endif /* MDBX_PNL_ASCENDING */
  assert(range[-(ptrdiff_t)len] == len);
  /* first position that can no longer start a long-enough run */
  pgno_t *const detent = range - len + seq;
  const ptrdiff_t offset = -(ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  const __m512i pattern = _mm512_set1_epi32(target);
  unsigned mask;
  if (likely(len > seq + 15)) {
    /* main loop: probe 16 candidate positions per iteration */
    do {
      mask = diffcmp2mask_avx512bw(range - 15, offset, pattern);
      if (mask) {
#ifndef __SANITIZE_ADDRESS__
      found:
#endif /* __SANITIZE_ADDRESS__ */
        /* for mask's highest set bit i: 16 - clz(mask) == i - 15 */
        return range + 16 - __builtin_clz(mask);
      }
      range -= 16;
    } while (range > detent + 15);
    if (range == detent)
      return nullptr;
  }

  /* The vectorized tail below reads 4 to 60 extra bytes, which may lie not
   * only outside the region allocated for the PNL but may also cross a
   * memory-page boundary, which can cause ASAN errors or even a crash.
   * Therefore check the offset within the page, and always play safe when
   * built with ASAN. */
#ifndef __SANITIZE_ADDRESS__
  const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */;
  if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
      !RUNNING_ON_VALGRIND) {
    const unsigned extra = (unsigned)(detent + 16 - range);
    assert(extra > 0 && extra < 16);
    /* mask off the lanes that fell at or below the detent */
    mask = 0xFFFF << extra;
    mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern);
    if (mask)
      goto found;
    return nullptr;
  }
#endif /* __SANITIZE_ADDRESS__ */
  /* narrowing steps reuse the low 256/128 bits of `pattern` */
  if (range - 7 > detent) {
    mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern);
    if (mask)
      return range + 24 - __builtin_clz(mask);
    range -= 8;
  }
  if (range - 3 > detent) {
    mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern);
    if (mask)
      return range + 28 - __builtin_clz(mask);
    range -= 4;
  }
  while (range > detent) {
    if (*range - range[offset] == target)
      return range;
    --range;
  }
  return nullptr;
}
  9736  #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
  9737  
  9738  #if (defined(__ARM_NEON) || defined(__ARM_NEON__)) &&                          \
  9739      (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
/* NEON analogue of diffcmp2mask_sse2(): compares four pgno_t at `ptr` with
 * the four located `offset` elements away. Unlike the x86 variants the
 * result is not a 1-bit-per-lane mask: each matching lane yields a run of
 * set bits — 16 bits per lane on 64-bit targets, 8 bits per lane (after the
 * extra narrowing) on 32-bit targets. Callers compensate via the
 * clz-shift arithmetic in scan4seq_neon(). Zero means no lane matched. */
static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr,
                                                const ptrdiff_t offset,
                                                const uint32x4_t pattern) {
  const uint32x4_t f = vld1q_u32(ptr);
  const uint32x4_t l = vld1q_u32(ptr + offset);
  const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern));
  if (sizeof(size_t) > 7)
    return vget_lane_u64(vreinterpret_u64_u16(cmp), 0);
  else
    return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))),
                         0);
}
  9752  
/* NEON scan for a run of `seq`+1 consecutive page numbers inside a
 * descending-sorted PNL (little-endian only); same contract and approach as
 * scan4seq_sse2(), probing 4 candidate positions per iteration.
 * Returns a pointer to the run's smallest-pgno element, or nullptr. */
__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len,
                                   const unsigned seq) {
  assert(seq > 0 && len > seq);
#if MDBX_PNL_ASCENDING
#error "FIXME: Not implemented"
#endif /* MDBX_PNL_ASCENDING */
  assert(range[-(ptrdiff_t)len] == len);
  /* first position that can no longer start a long-enough run */
  pgno_t *const detent = range - len + seq;
  const ptrdiff_t offset = -(ptrdiff_t)seq;
  const pgno_t target = (pgno_t)offset;
  const uint32x4_t pattern = vmovq_n_u32(target);
  size_t mask;
  if (likely(len > seq + 3)) {
    /* main loop: probe 4 candidate positions per iteration */
    do {
      mask = diffcmp2mask_neon(range - 3, offset, pattern);
      if (mask) {
#ifndef __SANITIZE_ADDRESS__
      found:
#endif /* __SANITIZE_ADDRESS__ */
        /* convert the clz of the wide per-lane mask into a byte offset:
         * lanes are 16 (64-bit size_t) or 8 (32-bit) bits wide, so dividing
         * the clz by 4 resp. 2 yields the byte distance from `range` to the
         * highest matching 4-byte element */
        return (pgno_t *)((char *)range -
                          (__builtin_clzl(mask) >> sizeof(size_t) / 4));
      }
      range -= 4;
    } while (range > detent + 3);
    if (range == detent)
      return nullptr;
  }

  /* The vectorized tail below reads 4 to 12 extra bytes, which may lie not
   * only outside the region allocated for the PNL but may also cross a
   * memory-page boundary, which can cause ASAN errors or even a crash.
   * Therefore check the offset within the page, and always play safe when
   * built with ASAN. */
#ifndef __SANITIZE_ADDRESS__
  const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
  if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
      !RUNNING_ON_VALGRIND) {
    const unsigned extra = (unsigned)(detent + 4 - range);
    assert(extra > 0 && extra < 4);
    /* drop the lanes at or below the detent; lane width in bits is
     * sizeof(size_t) * 2 (16 on 64-bit targets, 8 on 32-bit) */
    mask = (~(size_t)0) << (extra * sizeof(size_t) * 2);
    mask &= diffcmp2mask_neon(range - 3, offset, pattern);
    if (mask)
      goto found;
    return nullptr;
  }
#endif /* __SANITIZE_ADDRESS__ */
  /* scalar fallback for the remaining (fewer than 4) candidates */
  do
    if (*range - range[offset] == target)
      return range;
  while (--range != detent);
  return nullptr;
}
  9804  #endif /* __ARM_NEON || __ARM_NEON__ */
  9805  
  9806  #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW)
  9807  #define scan4seq_default scan4seq_avx512bw
  9808  #define scan4seq scan4seq_default
  9809  #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2)
  9810  #define scan4seq_default scan4seq_avx2
  9811  #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2)
  9812  #define scan4seq_default scan4seq_sse2
  9813  #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) &&                        \
  9814      (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  9815  #define scan4seq_default scan4seq_neon
  9816  /* Choosing of another variants should be added here. */
  9817  #endif /* scan4seq_default */
  9818  
  9819  #ifndef scan4seq_default
  9820  #define scan4seq_default scan4seq_fallback
  9821  #endif /* scan4seq_default */
  9822  
  9823  #ifdef scan4seq
  9824  /* The scan4seq() is the best or no alternatives */
  9825  #else
  9826  #if !(__has_builtin(__builtin_cpu_supports) ||                                 \
  9827        defined(__BUILTIN_CPU_SUPPORTS__) ||                                     \
  9828        (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23)))
  9829  /* The scan4seq_default() will be used  since no cpu-features detection support
  9830   * from compiler. Please don't ask to implement cpuid-based detection and don't
  9831   * make such PRs. */
  9832  #define scan4seq scan4seq_default
  9833  #else
  9834  /* Selecting the most appropriate implementation at runtime,
  9835   * depending on the available CPU features. */
static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
                                 const unsigned seq);
/* Dispatch pointer: initially aimed at the resolver, which rebinds it to the
 * best implementation for the running CPU on first use (ifunc-style lazy
 * binding without requiring linker/loader support). */
static pgno_t *(*scan4seq)(pgno_t *range, const size_t len,
                           const unsigned seq) = scan4seq_resolver;

/* One-shot resolver: probes CPU features from weakest to strongest so the
 * last successful probe wins, rebinds the `scan4seq` pointer, then forwards
 * this first request to the chosen implementation.
 * NOTE(review): the pointer store is a plain (non-atomic) write; concurrent
 * first calls would race benignly, each storing the same value — presumably
 * acceptable here, verify against the project's threading expectations. */
static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
                                 const unsigned seq) {
  pgno_t *(*choice)(pgno_t * range, const size_t len, const unsigned seq) =
      nullptr;
#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) ||      \
    __GNUC_PREREQ(4, 8)
  __builtin_cpu_init();
#endif /* __builtin_cpu_init() */
#ifdef MDBX_ATTRIBUTE_TARGET_SSE2
  if (__builtin_cpu_supports("sse2"))
    choice = scan4seq_sse2;
#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
#ifdef MDBX_ATTRIBUTE_TARGET_AVX2
  if (__builtin_cpu_supports("avx2"))
    choice = scan4seq_avx2;
#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
  if (__builtin_cpu_supports("avx512bw"))
    choice = scan4seq_avx512bw;
#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
  /* Choosing of another variants should be added here. */
  scan4seq = choice ? choice : scan4seq_default;
  return scan4seq(range, len, seq);
}
  9865  #endif /* __has_builtin(__builtin_cpu_supports */
  9866  #endif /* scan4seq */
  9867  
  9868  //------------------------------------------------------------------------------
  9869  
  9870  /* Allocate page numbers and memory for writing.  Maintain mt_last_reclaimed,
  9871   * mt_reclaimed_pglist and mt_next_pgno.  Set MDBX_TXN_ERROR on failure.
  9872   *
  9873   * If there are free pages available from older transactions, they
  9874   * are re-used first. Otherwise allocate a new page at mt_next_pgno.
  9875   * Do not modify the GC, just merge GC records into mt_reclaimed_pglist
  9876   * and move mt_last_reclaimed to say which records were consumed.  Only this
  9877   * function can create mt_reclaimed_pglist and move
  9878   * mt_last_reclaimed/mt_next_pgno.
  9879   *
  9880   * [in] mc    cursor A cursor handle identifying the transaction and
  9881   *            database for which we are allocating.
  9882   * [in] num   the number of pages to allocate.
  9883   *
  9884   * Returns 0 on success, non-zero on failure.*/
  9885  
  9886  #define MDBX_ALLOC_GC 1
  9887  #define MDBX_ALLOC_NEW 2
  9888  #define MDBX_ALLOC_COALESCE 4
  9889  #define MDBX_ALLOC_SLOT 8
  9890  #define MDBX_ALLOC_FAKE 16
  9891  #define MDBX_ALLOC_NOLOG 32
  9892  #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
  9893  
  9894  static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
  9895    pgr_t ret;
  9896    MDBX_txn *const txn = mc->mc_txn;
  9897    MDBX_env *const env = txn->mt_env;
  9898    eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT));
  9899    eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW));
  9900  
  9901    const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2;
  9902    if (likely(flags & MDBX_ALLOC_GC)) {
  9903      flags |= env->me_flags & MDBX_LIFORECLAIM;
  9904      if (txn->mt_dbs[FREE_DBI].md_branch_pages &&
  9905          MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold)
  9906        flags |= MDBX_ALLOC_COALESCE;
  9907      if (unlikely(
  9908              /* If mc is updating the GC, then the retired-list cannot play
  9909                 catch-up with itself by growing while trying to save it. */
  9910              (mc->mc_flags & C_RECLAIMING) ||
  9911              /* avoid (recursive) search inside empty tree and while tree is
  9912                 updating, todo4recovery://erased_by_github/libmdbx/issues/31 */
  9913              txn->mt_dbs[FREE_DBI].md_entries == 0 ||
  9914              /* If our dirty list is already full, we can't touch GC */
  9915              (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth &&
  9916               !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY))))
  9917        flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE);
  9918    }
  9919  
  9920    eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist,
  9921                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
  9922    pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
  9923    unsigned re_len = MDBX_PNL_SIZE(re_list);
  9924    pgno_t *range = nullptr;
  9925    txnid_t detent = 0, last = 0;
  9926  #if MDBX_ENABLE_PGOP_STAT
  9927    uint64_t timestamp = 0;
  9928  #endif /* MDBX_ENABLE_PGOP_STAT */
  9929  
  9930    while (true) { /* hsr-kick retry loop */
  9931      MDBX_cursor_couple recur;
  9932      for (MDBX_cursor_op op = MDBX_FIRST;;
  9933           op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) {
  9934        MDBX_val key, data;
  9935  
  9936        /* Seek a big enough contiguous page range.
  9937         * Prefer pages with lower pgno. */
  9938        eASSERT(env,
  9939                pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
  9940        if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) {
  9941          eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno &&
  9942                           MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno);
  9943          range = re_list + (MDBX_PNL_ASCENDING ? 1 : re_len);
  9944          pgno = *range;
  9945          if (num == 1)
  9946            goto done;
  9947          range = scan4seq(range, re_len, num - 1);
  9948          tASSERT(txn, range == scan4range_checker(re_list, num - 1));
  9949          if (likely(range)) {
  9950            pgno = *range;
  9951            goto done;
  9952          }
  9953        }
  9954  
  9955        if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */
  9956          if (unlikely(!(flags & MDBX_ALLOC_GC)))
  9957            break /* reclaiming is prohibited for now */;
  9958  
  9959            /* Prepare to fetch and coalesce */
  9960  #if MDBX_ENABLE_PGOP_STAT
  9961          if (likely(timestamp == 0))
  9962            timestamp = osal_monotime();
  9963  #endif /* MDBX_ENABLE_PGOP_STAT */
  9964          detent = txn_oldest_reader(txn) + 1;
  9965  
  9966          ret.err = cursor_init(&recur.outer, txn, FREE_DBI);
  9967          if (unlikely(ret.err != MDBX_SUCCESS))
  9968            goto fail;
  9969          if (flags & MDBX_LIFORECLAIM) {
  9970            /* Begin from oldest reader if any */
  9971            if (detent > MIN_TXNID) {
  9972              last = detent - 1;
  9973              op = MDBX_SET_RANGE;
  9974            }
  9975          } else if (txn->tw.last_reclaimed) {
  9976            /* Continue lookup from txn->tw.last_reclaimed to oldest reader */
  9977            last = txn->tw.last_reclaimed;
  9978            op = MDBX_SET_RANGE;
  9979          }
  9980  
  9981          key.iov_base = &last;
  9982          key.iov_len = sizeof(last);
  9983        }
  9984  
  9985        if (!(flags & MDBX_LIFORECLAIM)) {
  9986          /* Do not try fetch more if the record will be too recent */
  9987          if (op != MDBX_FIRST && ++last >= detent) {
  9988            detent = txn_oldest_reader(txn) + 1;
  9989            if (detent <= last)
  9990              break;
  9991          }
  9992        }
  9993  
  9994        ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
  9995        if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) {
  9996          if (op == MDBX_SET_RANGE)
  9997            continue;
  9998          const txnid_t snap = txn_oldest_reader(txn);
  9999          if (unlikely(detent <= snap)) {
 10000            detent = snap + 1;
 10001            last = snap;
 10002            key.iov_base = &last;
 10003            key.iov_len = sizeof(last);
 10004            op = MDBX_SET_RANGE;
 10005            ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
 10006          }
 10007        }
 10008        if (unlikely(ret.err)) {
 10009          if (ret.err == MDBX_NOTFOUND)
 10010            break;
 10011          goto fail;
 10012        }
 10013  
 10014        if (unlikely(key.iov_len != sizeof(txnid_t))) {
 10015          ret.err = MDBX_CORRUPTED;
 10016          goto fail;
 10017        }
 10018        last = unaligned_peek_u64(4, key.iov_base);
 10019        if (detent <= last) {
 10020          detent = txn_oldest_reader(txn) + 1;
 10021          if (detent <= last) {
 10022            if (flags & MDBX_LIFORECLAIM)
 10023              continue;
 10024            break;
 10025          }
 10026        }
 10027  
 10028        if (flags & MDBX_LIFORECLAIM) {
 10029          /* skip IDs of records that already reclaimed */
 10030          if (txn->tw.lifo_reclaimed) {
 10031            size_t i;
 10032            for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i)
 10033              if (txn->tw.lifo_reclaimed[i] == last)
 10034                break;
 10035            if (i)
 10036              continue;
 10037          }
 10038        }
 10039  
 10040        /* Reading next GC record */
 10041        MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top];
 10042        if (unlikely((ret.err = node_read(
 10043                          &recur.outer,
 10044                          page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]),
 10045                          &data, mp)) != MDBX_SUCCESS))
 10046          goto fail;
 10047  
 10048        if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) {
 10049          txn->tw.lifo_reclaimed = txl_alloc();
 10050          if (unlikely(!txn->tw.lifo_reclaimed)) {
 10051            ret.err = MDBX_ENOMEM;
 10052            goto fail;
 10053          }
 10054        }
 10055  
 10056        /* Append PNL from GC record to tw.reclaimed_pglist */
 10057        cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0);
 10058        pgno_t *gc_pnl = (pgno_t *)data.iov_base;
 10059        tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl));
 10060        if (unlikely(data.iov_len % sizeof(pgno_t) ||
 10061                     data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
 10062                     !pnl_check(gc_pnl, txn->mt_next_pgno))) {
 10063          ret.err = MDBX_CORRUPTED;
 10064          goto fail;
 10065        }
 10066        const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl);
 10067        if (unlikely(/* list is too long already */ MDBX_PNL_SIZE(
 10068                         txn->tw.reclaimed_pglist) >=
 10069                     env->me_options.rp_augment_limit) &&
 10070            ((/* not a slot-request from gc-update */
 10071              (flags & MDBX_ALLOC_SLOT) == 0 &&
 10072              /* have enough unallocated space */ txn->mt_geo.upper >=
 10073                  txn->mt_next_pgno + (size_t)num) ||
 10074             gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >=
 10075                 MDBX_PGL_LIMIT)) {
 10076          /* Stop reclaiming to avoid large/overflow the page list.
 10077           * This is a rare case while search for a continuously multi-page region
 10078           * in a large database.
 10079           * todo4recovery://erased_by_github/libmdbx/issues/123 */
 10080          NOTICE("stop reclaiming to avoid PNL overflow: %u (current) + %u "
 10081                 "(chunk) -> %u",
 10082                 MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len,
 10083                 gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
 10084          flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE);
 10085          break;
 10086        }
 10087        ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len);
 10088        if (unlikely(ret.err != MDBX_SUCCESS))
 10089          goto fail;
 10090        re_list = txn->tw.reclaimed_pglist;
 10091  
 10092        /* Remember ID of GC record */
 10093        if (flags & MDBX_LIFORECLAIM) {
 10094          ret.err = txl_append(&txn->tw.lifo_reclaimed, last);
 10095          if (unlikely(ret.err != MDBX_SUCCESS))
 10096            goto fail;
 10097        }
 10098        txn->tw.last_reclaimed = last;
 10099  
 10100        if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
 10101          DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL",
 10102                      last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
 10103          for (unsigned i = gc_len; i; i--)
 10104            DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
 10105          DEBUG_EXTRA_PRINT("%s\n", ".");
 10106        }
 10107  
 10108        /* Merge in descending sorted order */
 10109        pnl_merge(re_list, gc_pnl);
 10110        if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) {
 10111          ret.err = MDBX_CORRUPTED;
 10112          goto fail;
 10113        }
 10114        tASSERT(txn, dirtylist_check(txn));
 10115  
 10116        re_len = MDBX_PNL_SIZE(re_list);
 10117        tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno);
 10118        if (MDBX_ENABLE_REFUND && re_len &&
 10119            unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
 10120          /* Refund suitable pages into "unallocated" space */
 10121          txn_refund(txn);
 10122          re_list = txn->tw.reclaimed_pglist;
 10123          re_len = MDBX_PNL_SIZE(re_list);
 10124        }
 10125  
 10126        /* Done for a kick-reclaim mode, actually no page needed */
 10127        if (unlikely(flags & MDBX_ALLOC_SLOT)) {
 10128          DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT");
 10129  #if MDBX_ENABLE_PGOP_STAT
 10130          eASSERT(env, timestamp != 0);
 10131          env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp;
 10132  #endif /* MDBX_ENABLE_PGOP_STAT */
 10133          ret.err = MDBX_SUCCESS;
 10134          ret.page = NULL;
 10135          return ret;
 10136        }
 10137  
 10138        /* Don't try to coalesce too much. */
 10139        if (re_len /* current size */ > coalesce_threshold) {
 10140          if (flags & MDBX_ALLOC_COALESCE)
 10141            TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold");
 10142          flags &= ~MDBX_ALLOC_COALESCE;
 10143        }
 10144      }
 10145  
 10146      if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) {
 10147        DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE");
 10148        flags &= ~MDBX_ALLOC_COALESCE;
 10149        continue;
 10150      }
 10151  
 10152      /* There is no suitable pages in the GC and to be able to allocate
 10153       * we should CHOICE one of:
 10154       *  - make a new steady checkpoint if reclaiming was stopped by
 10155       *    the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
 10156       *  - kick lagging reader(s) if reclaiming was stopped by ones of it.
 10157       *  - extend the database file. */
 10158  
 10159      /* Will use new pages from the map if nothing is suitable in the GC. */
 10160      range = nullptr;
 10161      pgno = txn->mt_next_pgno;
 10162      const size_t next = (size_t)pgno + num;
 10163  
 10164      if (flags & MDBX_ALLOC_GC) {
 10165        const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
 10166        const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
 10167        /* does reclaiming stopped at the last steady point? */
 10168        if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
 10169            detent == prefer_steady.txnid + 1) {
 10170          DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
 10171                "-%s, detent %" PRIaTXN,
 10172                recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
 10173                durable_caption(prefer_steady.ptr_c), detent);
 10174          ret.err = MDBX_RESULT_TRUE;
 10175          const pgno_t autosync_threshold =
 10176              atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
 10177          const uint64_t autosync_period =
 10178              atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
 10179          /* wipe the last steady-point if one of:
 10180           *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
 10181           *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
 10182           * otherwise, make a new steady-point if one of:
 10183           *  - auto-sync threshold is specified and reached;
 10184           *  - upper limit of database size is reached;
 10185           *  - database is full (with the current file size)
 10186           *       AND auto-sync threshold it NOT specified */
 10187          if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
 10188              ((autosync_threshold | autosync_period) == 0 ||
 10189               next >= prefer_steady.ptr_c->mm_geo.now)) {
 10190            /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
 10191             * without any auto-sync threshold(s). */
 10192            ret.err = wipe_steady(txn, detent);
 10193            DEBUG("gc-wipe-steady, rc %d", ret.err);
 10194            eASSERT(env, prefer_steady.ptr_c !=
 10195                             meta_prefer_steady(env, &txn->tw.troika).ptr_c);
 10196          } else if ((flags & MDBX_ALLOC_NEW) == 0 ||
 10197                     (autosync_threshold &&
 10198                      atomic_load32(&env->me_lck->mti_unsynced_pages,
 10199                                    mo_Relaxed) >= autosync_threshold) ||
 10200                     (autosync_period &&
 10201                      osal_monotime() -
 10202                              atomic_load64(&env->me_lck->mti_sync_timestamp,
 10203                                            mo_Relaxed) >=
 10204                          autosync_period) ||
 10205                     next >= txn->mt_geo.upper ||
 10206                     (next >= txn->mt_end_pgno &&
 10207                      (autosync_threshold | autosync_period) == 0)) {
 10208            /* make steady checkpoint. */
 10209            MDBX_meta meta = *recent.ptr_c;
 10210            ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
 10211                                  &txn->tw.troika);
 10212            DEBUG("gc-make-steady, rc %d", ret.err);
 10213            eASSERT(env, prefer_steady.ptr_c !=
 10214                             meta_prefer_steady(env, &txn->tw.troika).ptr_c);
 10215          }
 10216          if (likely(ret.err != MDBX_RESULT_TRUE)) {
 10217            if (unlikely(ret.err != MDBX_SUCCESS))
 10218              goto fail;
 10219            continue;
 10220          }
 10221        }
 10222      }
 10223  
 10224      /* don't kick lagging reader(s) if is enough unallocated space
 10225       * at the end of database file. */
 10226      if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno)
 10227        goto done;
 10228  
 10229      if (flags & MDBX_ALLOC_GC) {
 10230        const txnid_t laggard = txn_oldest_reader(txn);
 10231        if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP &&
 10232                                  kick_longlived_readers(env, laggard) >= detent))
 10233          continue;
 10234      }
 10235  
 10236      ret.err = MDBX_NOTFOUND;
 10237      if (flags & MDBX_ALLOC_NEW) {
 10238        ret.err = MDBX_MAP_FULL;
 10239        if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) {
 10240          eASSERT(env, next > txn->mt_end_pgno);
 10241          const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv);
 10242          size_t aligned = pgno_align2os_pgno(
 10243              env, (pgno_t)(next + grow_step - next % grow_step));
 10244  
 10245          if (aligned > txn->mt_geo.upper)
 10246            aligned = txn->mt_geo.upper;
 10247          eASSERT(env, aligned > txn->mt_end_pgno);
 10248  
 10249          VERBOSE("try growth datafile to %zu pages (+%zu)", aligned,
 10250                  aligned - txn->mt_end_pgno);
 10251          ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned,
 10252                                        txn->mt_geo.upper);
 10253          if (ret.err == MDBX_SUCCESS) {
 10254            env->me_txn->mt_end_pgno = (pgno_t)aligned;
 10255            goto done;
 10256          }
 10257  
 10258          ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned,
 10259                aligned - txn->mt_end_pgno, ret.err);
 10260        } else {
 10261          NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next,
 10262                 txn->mt_geo.upper);
 10263        }
 10264      }
 10265  
 10266    fail:
 10267  #if MDBX_ENABLE_PGOP_STAT
 10268      if (timestamp)
 10269        env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp;
 10270  #endif /* MDBX_ENABLE_PGOP_STAT */
 10271      eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist,
 10272                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 10273      int level;
 10274      const char *what;
 10275      if (likely(!(flags & MDBX_ALLOC_FAKE))) {
 10276        txn->mt_flags |= MDBX_TXN_ERROR;
 10277        level = MDBX_LOG_ERROR;
 10278        what = "pages";
 10279      } else {
 10280        level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
 10281        what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages";
 10282      }
 10283      if (LOG_ENABLED(level))
 10284        debug_log(level, __func__, __LINE__,
 10285                  "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what,
 10286                  flags, ret.err);
 10287  
 10288      eASSERT(env, ret.err != MDBX_SUCCESS);
 10289      ret.page = NULL;
 10290      return ret;
 10291    }
 10292  
 10293  done:
 10294    eASSERT(env, !(flags & MDBX_ALLOC_SLOT));
 10295    ENSURE(env, pgno >= NUM_METAS);
 10296  #if MDBX_ENABLE_PGOP_STAT
 10297    if (likely(timestamp))
 10298      env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp;
 10299  #endif /* MDBX_ENABLE_PGOP_STAT */
 10300    if (unlikely(flags & MDBX_ALLOC_FAKE)) {
 10301      DEBUG("return NULL-page for %u pages %s allocation", num,
 10302            "gc-slot/backlog");
 10303      ret.page = NULL;
 10304      ret.err = MDBX_SUCCESS;
 10305      return ret;
 10306    }
 10307  
 10308    if (env->me_flags & MDBX_WRITEMAP) {
 10309      ret.page = pgno2page(env, pgno);
 10310      VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
 10311      MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
 10312    } else {
 10313      ret.page = page_malloc(txn, num);
 10314      if (unlikely(!ret.page)) {
 10315        ret.err = MDBX_ENOMEM;
 10316        goto fail;
 10317      }
 10318    }
 10319  
 10320    if (range) {
 10321      cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0);
 10322      tASSERT(txn, pgno < txn->mt_next_pgno);
 10323      tASSERT(txn, pgno == *range);
 10324      /* Cutoff allocated pages from tw.reclaimed_pglist */
 10325  #if MDBX_PNL_ASCENDING
 10326      for (const pgno_t *const end = re_list + re_len - num; range <= end;
 10327           ++range)
 10328        *range = range[num];
 10329  #else
 10330      for (const pgno_t *const end = re_list + re_len; ++range <= end;)
 10331        range[-(ptrdiff_t)num] = *range;
 10332  #endif
 10333      MDBX_PNL_SIZE(re_list) = re_len -= num;
 10334      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 10335                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 10336    } else {
 10337      txn->mt_next_pgno = pgno + num;
 10338      eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno);
 10339    }
 10340  
 10341    if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
 10342      memset(ret.page, -1, pgno2bytes(env, num));
 10343    VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
 10344  
 10345    ret.page->mp_pgno = pgno;
 10346    ret.page->mp_leaf2_ksize = 0;
 10347    ret.page->mp_flags = 0;
 10348    if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) {
 10349      ret.page->mp_pages = num;
 10350      ret.page->mp_flags = P_OVERFLOW;
 10351    }
 10352    ret.err = page_dirty(txn, ret.page, num);
 10353    if (unlikely(ret.err != MDBX_SUCCESS))
 10354      goto fail;
 10355  
 10356    tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 10357                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 10358    return ret;
 10359  }
 10360  
/* Allocate a single page for the cursor's txn.
 *
 * Fast paths, tried in order:
 *  1) reuse a page from the txn's loose-pages list;
 *  2) pop a page number from the reclaimed GC list (skipped while the
 *     cursor is frozen for GC update, i.e. C_GCFREEZE is set);
 *  3) fall back to page_alloc_slowpath() with MDBX_ALLOC_ALL. */
__hot static pgr_t page_alloc(MDBX_cursor *mc) {
  MDBX_txn *const txn = mc->mc_txn;

  /* If there are any loose pages, just use them */
  while (likely(txn->tw.loose_pages)) {
#if MDBX_ENABLE_REFUND
    /* Refunding tail pages back to unallocated space may drain the
     * loose list, hence the re-check and possible break. */
    if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) {
      txn_refund(txn);
      if (!txn->tw.loose_pages)
        break;
    }
#endif /* MDBX_ENABLE_REFUND */

    /* Unlink the head of the loose list and hand it out as-is. */
    MDBX_page *page = txn->tw.loose_pages;
    txn->tw.loose_pages = page->mp_next;
    txn->tw.loose_count--;
    DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno);
    tASSERT(txn, page->mp_pgno < txn->mt_next_pgno);
    tASSERT(txn, page->mp_pgno >= NUM_METAS);
    VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env));
    page->mp_txnid = txn->mt_front;
    pgr_t ret = {page, MDBX_SUCCESS};
    return ret;
  }

  if (likely(!(mc->mc_flags & C_GCFREEZE))) {
    /* Take one page number off the reclaimed-from-GC list. */
    MDBX_PNL pnl = txn->tw.reclaimed_pglist;
    const unsigned len = MDBX_PNL_SIZE(pnl);
    if (likely(len > 0)) {
      MDBX_PNL_SIZE(pnl) = len - 1;
#if MDBX_PNL_ASCENDING
      /* ascending PNL: pop the first element and shift the rest down */
      const pgno_t pgno = pnl[1];
      for (unsigned i = 1; i < len; ++i)
        pnl[i] = pnl[i + 1];
#else
      /* descending PNL: pop the last element, O(1) */
      const pgno_t pgno = pnl[len];
#endif

      MDBX_env *const env = txn->mt_env;
      pgr_t ret;
      if (env->me_flags & MDBX_WRITEMAP) {
        /* writable map: use the mapped page in place */
        ret.page = pgno2page(env, pgno);
        MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize);
      } else {
        /* otherwise allocate a heap shadow for the dirty page */
        ret.page = page_malloc(txn, 1);
        if (unlikely(!ret.page)) {
          ret.err = MDBX_ENOMEM;
          return ret;
        }
      }

      VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize);
      ret.page->mp_pgno = pgno;
      ret.page->mp_leaf2_ksize = 0;
      ret.page->mp_flags = 0;
      tASSERT(txn, ret.page->mp_pgno >= NUM_METAS);

      ret.err = page_dirty(txn, ret.page, 1);
      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
      return ret;
    }
  }

  /* Nothing cheap available: go through GC reclaiming / file growth. */
  return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL);
}
 10428  
 10429  /* Copy the used portions of a non-large/overflow page. */
 10430  __hot static void page_copy(MDBX_page *dst, const MDBX_page *src,
 10431                              size_t psize) {
 10432    STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ);
 10433    STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4);
 10434    if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) {
 10435      size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower;
 10436  
 10437      /* If page isn't full, just copy the used portion. Adjust
 10438       * alignment so memcpy may copy words instead of bytes. */
 10439      if (unused >= MDBX_CACHELINE_SIZE * 2) {
 10440        lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *));
 10441        upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *));
 10442        memcpy(dst, src, lower);
 10443        dst = (void *)((char *)dst + upper);
 10444        src = (void *)((char *)src + upper);
 10445        psize -= upper;
 10446      }
 10447    }
 10448    memcpy(dst, src, psize);
 10449  }
 10450  
 10451  /* Pull a page off the txn's spill list, if present.
 10452   *
 10453   * If a page being referenced was spilled to disk in this txn, bring
 10454   * it back and make it dirty/writable again. */
static pgr_t __must_check_result page_unspill(MDBX_txn *const txn,
                                              const MDBX_page *const mp) {
  VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno);
  tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0);
  tASSERT(txn, IS_SPILLED(txn, mp));
  /* Walk this txn and then each ancestor that has spills, looking for
   * the page in its spill list. */
  const MDBX_txn *scan = txn;
  pgr_t ret;
  do {
    tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0);
    const unsigned si = search_spilled(scan, mp->mp_pgno);
    if (!si)
      continue;
    /* Found: make a private writable copy; large/overflow pages span
     * mp_pages pages, ordinary pages just one. */
    const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
    ret.page = page_malloc(txn, npages);
    if (unlikely(!ret.page)) {
      ret.err = MDBX_ENOMEM;
      return ret;
    }
    page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages));
    if (scan == txn) {
      /* If in current txn, this page is no longer spilled.
       * If it happens to be the last page, truncate the spill list.
       * Otherwise mark it as deleted by setting the LSB. */
      spill_remove(txn, si, npages);
    } /* otherwise, if belonging to a parent txn, the
       * page remains spilled until child commits */

    ret.err = page_dirty(txn, ret.page, npages);
    if (unlikely(ret.err != MDBX_SUCCESS))
      return ret;
#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* Keep P_SPILLED set while the page is still spilled in a parent. */
    ret.page->mp_flags |= (scan == txn) ? 0 : P_SPILLED;
    ret.err = MDBX_SUCCESS;
    return ret;
  } while (likely((scan = scan->mt_parent) != nullptr &&
                  (scan->mt_flags & MDBX_TXN_SPILLS) != 0));
  /* Not found in any spill list although IS_SPILLED() claimed it was:
   * internal inconsistency. */
  ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN
        " not found in the spill-list(s), current txn %" PRIaTXN
        " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN,
        mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front,
        txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front);
  ret.err = MDBX_PROBLEM;
  ret.page = NULL;
  return ret;
}
 10502  
 10503  /* Touch a page: make it dirty and re-insert into tree with updated pgno.
 10504   * Set MDBX_TXN_ERROR on failure.
 10505   *
 10506   * [in] mc  cursor pointing to the page to be touched
 10507   *
 10508   * Returns 0 on success, non-zero on failure. */
__hot static int page_touch(MDBX_cursor *mc) {
  const MDBX_page *const mp = mc->mc_pg[mc->mc_top];
  MDBX_page *np;
  MDBX_txn *txn = mc->mc_txn;
  int rc;

  if (ASSERT_ENABLED()) {
    /* Sanity: the cursor's DB (for a sub-cursor, its outer DB) must
     * already be marked dirty, and mp must not be a large page. */
    if (mc->mc_flags & C_SUB) {
      MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
      MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner);
      tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db);
      tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx);
      tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY);
    } else {
      tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY);
    }
    tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
    tASSERT(txn, !IS_OVERFLOW(mp));
    tASSERT(txn, dirtylist_check(txn));
  }

  /* Already writable in this txn, or a sub-page (patched in place). */
  if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp))
    return MDBX_SUCCESS;

  if (IS_FROZEN(txn, mp)) {
    /* CoW the page */
    /* Reserve the retired-list slot first so the append below can't fail. */
    rc = pnl_need(&txn->tw.retired_pages, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    const pgr_t par = page_alloc(mc);
    rc = par.err;
    np = par.page;
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;

    const pgno_t pgno = np->mp_pgno;
    DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc),
          mp->mp_pgno, pgno);
    tASSERT(txn, mp->mp_pgno != pgno);
    /* The old copy becomes garbage once this txn commits. */
    pnl_xappend(txn->tw.retired_pages, mp->mp_pgno);
    /* Update the parent page, if any, to point to the new page */
    if (mc->mc_top) {
      MDBX_page *parent = mc->mc_pg[mc->mc_top - 1];
      MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]);
      node_set_pgno(node, pgno);
    } else {
      mc->mc_db->md_root = pgno;
    }

#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    page_copy(np, mp, txn->mt_env->me_psize);
    np->mp_pgno = pgno;
    np->mp_txnid = txn->mt_front;
  } else if (IS_SPILLED(txn, mp)) {
    /* Page was spilled to disk earlier in this txn (or a parent):
     * bring it back as a dirty copy; pgno stays the same. */
    pgr_t pur = page_unspill(txn, mp);
    np = pur.page;
    rc = pur.err;
    if (likely(rc == MDBX_SUCCESS)) {
      tASSERT(txn, np != nullptr);
      goto done;
    }
    goto fail;
  } else {
    /* Remaining case: page is dirty in a parent txn (shadowed);
     * without a parent this state is impossible. */
    if (unlikely(!txn->mt_parent)) {
      ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
            "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
            " without parent transaction, current txn %" PRIaTXN
            " front %" PRIaTXN,
            IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid,
            mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
      rc = MDBX_PROBLEM;
      goto fail;
    }

    DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno);
    tASSERT(txn,
            txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
    /* No - copy it */
    np = page_malloc(txn, 1);
    if (unlikely(!np)) {
      rc = MDBX_ENOMEM;
      goto fail;
    }
    page_copy(np, mp, txn->mt_env->me_psize);

    /* insert a clone of parent's dirty page, so don't touch dirtyroom */
    rc = page_dirty(txn, np, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;

#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.clone.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }

done:
  /* Adjust cursors pointing to mp */
  mc->mc_pg[mc->mc_top] = np;
  MDBX_cursor *m2 = txn->mt_cursors[mc->mc_dbi];
  if (mc->mc_flags & C_SUB) {
    /* For sub-cursors the tracked list holds the outer cursors; fix up
     * each one's inner xcursor. */
    for (; m2; m2 = m2->mc_next) {
      MDBX_cursor *m3 = &m2->mc_xcursor->mx_cursor;
      if (m3->mc_snum < mc->mc_snum)
        continue;
      if (m3->mc_pg[mc->mc_top] == mp)
        m3->mc_pg[mc->mc_top] = np;
    }
  } else {
    for (; m2; m2 = m2->mc_next) {
      if (m2->mc_snum < mc->mc_snum)
        continue;
      if (m2 == mc)
        continue;
      if (m2->mc_pg[mc->mc_top] == mp) {
        m2->mc_pg[mc->mc_top] = np;
        /* A moved leaf may invalidate the nested xcursor's page. */
        if (XCURSOR_INITED(m2) && IS_LEAF(np))
          XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]);
      }
    }
  }
  return MDBX_SUCCESS;

fail:
  txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
 10637  
/* Flush dirty data and/or meta-pages to durable storage.
 *
 * [in] env       the environment.
 * [in] force     sync even if thresholds are not reached.
 * [in] nonblock  fail with a lock error rather than wait for the writer.
 *
 * Returns MDBX_SUCCESS when something was synced, MDBX_RESULT_TRUE when
 * nothing needed syncing, or an error code. */
__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) {
  bool locked = false;
  int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;

retry:;
  unsigned flags = env->me_flags & ~(MDBX_NOMETASYNC | MDBX_SHRINK_ALLOWED);
  /* Refuse when read-only, broken, or not yet opened. */
  if (unlikely((flags & (MDBX_RDONLY | MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) !=
               MDBX_ENV_ACTIVE)) {
    rc = MDBX_EACCESS;
    if (!(flags & MDBX_ENV_ACTIVE))
      rc = MDBX_EPERM;
    if (flags & MDBX_FATAL_ERROR)
      rc = MDBX_PANIC;
    goto bailout;
  }

  /* A consistent meta-troika snapshot: reuse txn0's copy when we hold
   * the write lock (or are inside the write txn), otherwise take a
   * fresh volatile tap. */
  const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self());
  meta_ptr_t head;
  if (inside_txn | locked)
    head = meta_recent(env, &env->me_txn0->tw.troika);
  else {
    const meta_troika_t troika = meta_tap(env);
    head = meta_recent(env, &troika);
  }
  const pgno_t unsynced_pages =
      atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
  /* Fully synced already (steady head and metas synced): nothing to do. */
  if (unsynced_pages == 0) {
    const uint32_t synched_meta_txnid_u32 =
        atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed);
    if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady)
      goto bailout;
  }

  /* Promote to a full steady sync when forced or an auto-sync
   * threshold/period is reached. */
  const pgno_t autosync_threshold =
      atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
  const uint64_t autosync_period =
      atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
  if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
      (autosync_period &&
       osal_monotime() -
               atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
           autosync_period))
    flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;

  if (!inside_txn) {
    if (!locked) {
#if MDBX_ENABLE_PGOP_STAT
      unsigned wops = 0;
#endif /* MDBX_ENABLE_PGOP_STAT */

      int err;
      /* pre-sync to avoid latency for writer */
      if (unsynced_pages > /* FIXME: define threshold */ 16 &&
          (flags & MDBX_SAFE_NOSYNC) == 0) {
        eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
        if (flags & MDBX_WRITEMAP) {
          /* Acquire guard to avoid collision with remap */
#if defined(_WIN32) || defined(_WIN64)
          osal_srwlock_AcquireShared(&env->me_remap_guard);
#else
          err = osal_fastmutex_acquire(&env->me_remap_guard);
          if (unlikely(err != MDBX_SUCCESS))
            return err;
#endif
          const size_t usedbytes =
              pgno_align2os_bytes(env, head.ptr_c->mm_geo.next);
          err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA);
#if defined(_WIN32) || defined(_WIN64)
          osal_srwlock_ReleaseShared(&env->me_remap_guard);
#else
          int unlock_err = osal_fastmutex_release(&env->me_remap_guard);
          if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS)
            err = unlock_err;
#endif
        } else
          err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);

        if (unlikely(err != MDBX_SUCCESS))
          return err;

#if MDBX_ENABLE_PGOP_STAT
        wops = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
        /* pre-sync done */
        rc = MDBX_SUCCESS /* means "some data was synced" */;
      }

      /* Take the write lock and re-evaluate from the top, since the
       * state may have changed while we were pre-syncing. */
      err = mdbx_txn_lock(env, nonblock);
      if (unlikely(err != MDBX_SUCCESS))
        return err;

      locked = true;
#if MDBX_ENABLE_PGOP_STAT
      env->me_lck->mti_pgop_stat.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
      env->me_txn0->tw.troika = meta_tap(env);
      eASSERT(env, !env->me_txn && !env->me_txn0->mt_child);
      goto retry;
    }
    /* Locked but not inside a user write-txn: drive the sync through
     * txn0, and allow the file to shrink. */
    eASSERT(env, head.txnid == recent_committed_txnid(env));
    env->me_txn0->mt_txnid = head.txnid;
    txn_oldest_reader(env->me_txn0);
    flags |= MDBX_SHRINK_ALLOWED;
  }

  eASSERT(env, inside_txn || locked);
  eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0);

  /* Write a (steady) meta if the head isn't steady yet or data needs
   * a durable sync. */
  if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
    DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO,
          data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c),
          unsynced_pages);
    MDBX_meta meta = *head.ptr_c;
    rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
  }

  /* LY: sync meta-pages if MDBX_NOMETASYNC enabled
   *     and someone was not synced above. */
  if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
      (uint32_t)head.txnid) {
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = (flags & MDBX_WRITEMAP)
             ? osal_msync(&env->me_dxb_mmap, 0,
                          pgno_align2os_bytes(env, NUM_METAS),
                          MDBX_SYNC_DATA | MDBX_SYNC_IODQ)
             : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (likely(rc == MDBX_SUCCESS))
      atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid,
                     mo_Relaxed);
  }

bailout:
  if (locked)
    mdbx_txn_unlock(env);
  return rc;
}
 10778  
 10779  static __inline int check_env(const MDBX_env *env, const bool wanna_active) {
 10780    if (unlikely(!env))
 10781      return MDBX_EINVAL;
 10782  
 10783    if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE))
 10784      return MDBX_EBADSIGN;
 10785  
 10786  #if MDBX_ENV_CHECKPID
 10787    if (unlikely(env->me_pid != osal_getpid())) {
 10788      ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR;
 10789      return MDBX_PANIC;
 10790    }
 10791  #endif /* MDBX_ENV_CHECKPID */
 10792  
 10793    if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
 10794      return MDBX_PANIC;
 10795  
 10796    if (wanna_active) {
 10797      if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0))
 10798        return MDBX_EPERM;
 10799      eASSERT(env, env->me_map != nullptr);
 10800    }
 10801  
 10802    return MDBX_SUCCESS;
 10803  }
 10804  
 10805  __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) {
 10806    int rc = check_env(env, true);
 10807    if (unlikely(rc != MDBX_SUCCESS))
 10808      return rc;
 10809  
 10810    return env_sync(env, force, nonblock);
 10811  }
 10812  
 10813  #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
 10814  __cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); }
 10815  
 10816  __cold int mdbx_env_sync_poll(MDBX_env *env) {
 10817    return __inline_mdbx_env_sync_poll(env);
 10818  }
 10819  #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 10820  
 10821  /* Back up parent txn's cursors, then grab the originals for tracking */
static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) {
  /* For each DBI, snapshot every live parent cursor into a malloc'ed
   * backup and retarget the original at the nested txn.
   * Returns MDBX_SUCCESS or MDBX_ENOMEM. */
  for (int i = parent->mt_numdbs; --i >= 0;) {
    nested->mt_cursors[i] = NULL;
    MDBX_cursor *mc = parent->mt_cursors[i];
    if (mc != NULL) {
      /* The backup must also hold the xcursor when the cursor has one. */
      size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor)
                                   : sizeof(MDBX_cursor);
      /* NOTE: `bk` doubles as the walk pointer. The backup copy keeps the
       * original mc_next (taken before mc->mc_next is relinked below), so
       * `mc = bk->mc_next` continues down the parent's chain; for a
       * skipped (non-live) cursor bk == mc, and the walk still advances. */
      for (MDBX_cursor *bk; mc; mc = bk->mc_next) {
        bk = mc;
        if (mc->mc_signature != MDBX_MC_LIVE)
          continue;
        bk = osal_malloc(size);
        if (unlikely(!bk))
          return MDBX_ENOMEM;
#if MDBX_DEBUG
        memset(bk, 0xCD, size);
        VALGRIND_MAKE_MEM_UNDEFINED(bk, size);
#endif /* MDBX_DEBUG */
        *bk = *mc;
        mc->mc_backup = bk;
        /* Kill pointers into src to reduce abuse: The
         * user may not use mc until dst ends. But we need a valid
         * txn pointer here for cursor fixups to keep working. */
        mc->mc_txn = nested;
        mc->mc_db = &nested->mt_dbs[i];
        mc->mc_dbistate = &nested->mt_dbistate[i];
        MDBX_xcursor *mx = mc->mc_xcursor;
        if (mx != NULL) {
          /* xcursor backup lives immediately behind the cursor backup */
          *(MDBX_xcursor *)(bk + 1) = *mx;
          mx->mx_cursor.mc_txn = nested;
        }
        /* Push the retargeted original onto the nested txn's list. */
        mc->mc_next = nested->mt_cursors[i];
        nested->mt_cursors[i] = mc;
      }
    }
  }
  return MDBX_SUCCESS;
}
 10860  
/* Close this txn's cursors, give parent txn's cursors back to parent.
 *
 * [in] txn     the transaction handle.
 * [in] merge   true to keep changes to parent cursors, false to revert.
 *
 * Returns nothing; internal inconsistencies are fatal via ENSURE(). */
static void cursors_eot(MDBX_txn *txn, const bool merge) {
  for (int i = txn->mt_numdbs; --i >= 0;) {
    MDBX_cursor *next, *mc = txn->mt_cursors[i];
    if (!mc)
      continue;
    txn->mt_cursors[i] = NULL;
    do {
      const unsigned stage = mc->mc_signature;
      MDBX_cursor *bk = mc->mc_backup;
      /* Save the chain link before mc may be overwritten below. */
      next = mc->mc_next;
      ENSURE(txn->mt_env,
             stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk));
      cASSERT(mc, mc->mc_dbi == (unsigned)i);
      if (bk) {
        /* The cursor was shadowed for a nested txn by cursor_shadow(). */
        MDBX_xcursor *mx = mc->mc_xcursor;
        cASSERT(mc, mx == bk->mc_xcursor);
        tASSERT(txn, txn->mt_parent != NULL);
        ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE);
        if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */)
          mc->mc_signature = stage /* Promote closed state to parent txn */;
        else if (merge) {
          /* Restore pointers to parent txn */
          mc->mc_next = bk->mc_next;
          mc->mc_backup = bk->mc_backup;
          mc->mc_txn = bk->mc_txn;
          mc->mc_db = bk->mc_db;
          mc->mc_dbistate = bk->mc_dbistate;
          if (mx) {
            if (mx != bk->mc_xcursor) {
              *bk->mc_xcursor = *mx;
              mx = bk->mc_xcursor;
            }
            mx->mx_cursor.mc_txn = bk->mc_txn;
          }
        } else {
          /* Restore from backup, i.e. rollback/abort nested txn */
          *mc = *bk;
          if (mx)
            *mx = *(MDBX_xcursor *)(bk + 1);
        }
        /* The backup is consumed either way; free it. */
        bk->mc_signature = 0;
        osal_free(bk);
      } else {
        /* No backup: a cursor opened inside this txn itself. */
        ENSURE(txn->mt_env, stage == MDBX_MC_LIVE);
        mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */;
        mc->mc_flags = 0 /* reset C_UNTRACK */;
      }
    } while ((mc = next) != NULL);
  }
}
 10917  
 10918  #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
 10919  /* Find largest mvcc-snapshot still referenced by this process. */
static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) {
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck != NULL /* exclusive mode */)) {
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; ++i) {
    retry:
      /* Only slots owned by this process matter here. */
      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) ==
          env->me_pid) {
        /* jitter4testing(true); */
        const pgno_t snap_pages = atomic_load32(
            &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        /* Lock-free consistency check: re-read both fields and retry if
         * the slot changed underneath us, so (pages, txnid) is a
         * coherent pair. */
        if (unlikely(
                snap_pages !=
                    atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used,
                                  mo_AcquireRelease) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
          goto retry;
        /* Count only valid snapshots not older than the oldest reader. */
        if (largest < snap_pages &&
            atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <=
                /* ignore pending updates */ snap_txnid &&
            snap_txnid <= MAX_TXNID)
          largest = snap_pages;
      }
    }
  }
  return largest;
}
 10949  
/* Maintain the Valgrind/ASAN "poison edge" of the memory map.
 *
 * txn != NULL: a transaction is starting — mark the map up to mt_next_pgno
 * as defined/unpoisoned so the snapshot may be read.
 * txn == NULL: a transaction ended — re-poison the tail of the map beyond
 * the largest page still reachable by this process, to catch stray reads. */
static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) {
#if !defined(__SANITIZE_ADDRESS__)
  if (!RUNNING_ON_VALGRIND)
    return;
#endif

  if (txn) { /* transaction start */
    if (env->me_poison_edge < txn->mt_next_pgno)
      env->me_poison_edge = txn->mt_next_pgno;
    VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map,
                                     pgno2bytes(env, txn->mt_next_pgno));
    /* don't touch more, it should be already poisoned */
  } else { /* transaction end */
    bool should_unlock = false;
    /* `last` = first page that MAY be poisoned; start past the whole db. */
    pgno_t last = MAX_PAGENO + 1;
    if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) {
      /* inside write-txn */
      last = meta_recent(env, &env->me_txn0->troika).ptr_v->mm_geo.next;
    } else if (env->me_flags & MDBX_RDONLY) {
      /* read-only mode, no write-txn, no wlock mutex */
      last = NUM_METAS;
    } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) {
      /* no write-txn */
      last = NUM_METAS;
      should_unlock = true;
    } else {
      /* write txn is running, therefore shouldn't poison any memory range */
      return;
    }

    /* Never poison below pages pinned by this process's live readers. */
    last = find_largest_this(env, last);
    const pgno_t edge = env->me_poison_edge;
    if (edge > last) {
      eASSERT(env, last >= NUM_METAS);
      env->me_poison_edge = last;
      VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last),
                                 pgno2bytes(env, edge - last));
      MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last),
                                     pgno2bytes(env, edge - last));
    }
    if (should_unlock)
      mdbx_txn_unlock(env);
  }
}
 10995  #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
 10996  
/* Outcome of bind_rslot(): an MDBX error code plus, on success, the
 * claimed reader-table slot (nullptr on failure). */
typedef struct {
  int err;            /* MDBX_SUCCESS or the reason the bind failed */
  MDBX_reader *rslot; /* the claimed slot, valid only when err == MDBX_SUCCESS */
} bind_rslot_result;
 11001  
/* Acquire and publish a reader-table slot for thread `tid`.
 *
 * Takes the reader-table mutex, ensures this process's pid is registered,
 * finds (or creates, after purging dead readers) a free slot, and claims it.
 * On success the slot is also stored in the thread-local rthc key (unless
 * MDBX_NOTLS).  Returns the error code and the slot via bind_rslot_result. */
static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) {
  eASSERT(env, env->me_lck_mmap.lck);
  eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC);
  eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT);

  bind_rslot_result result = {osal_rdt_lock(env), nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result;
  if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
    osal_rdt_unlock(env);
    result.err = MDBX_PANIC;
    return result;
  }
  if (unlikely(!env->me_map)) {
    osal_rdt_unlock(env);
    result.err = MDBX_EPERM;
    return result;
  }

  /* First reader of this process: register our pid in the lock file. */
  if (unlikely(env->me_live_reader != env->me_pid)) {
    result.err = osal_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS)) {
      osal_rdt_unlock(env);
      return result;
    }
    env->me_live_reader = env->me_pid;
  }

  result.err = MDBX_SUCCESS;
  unsigned slot, nreaders;
  while (1) {
    /* Look for a vacant slot (mr_pid == 0) among the published readers. */
    nreaders = env->me_lck->mti_numreaders.weak;
    for (slot = 0; slot < nreaders; slot++)
      if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid,
                         mo_AcquireRelease))
        break;

    if (likely(slot < env->me_maxreaders))
      break;

    /* Table full: try reclaiming slots of dead processes, else give up. */
    result.err = cleanup_dead_readers(env, true, NULL);
    if (result.err != MDBX_RESULT_TRUE) {
      osal_rdt_unlock(env);
      result.err =
          (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err;
      return result;
    }
  }

  result.rslot = &env->me_lck->mti_readers[slot];
  /* Claim the reader slot, carefully since other code
   * uses the reader table un-mutexed: First reset the
   * slot, next publish it in lck->mti_numreaders.  After
   * that, it is safe for mdbx_env_close() to touch it.
   * When it will be closed, we can finally claim it. */
  atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease);
  safe64_reset(&result.rslot->mr_txnid, true);
  if (slot == nreaders)
    env->me_lck->mti_numreaders.weak = ++nreaders;
  result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid;
  /* Publishing mr_pid last makes the slot visible as claimed. */
  atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease);
  osal_rdt_unlock(env);

  if (likely(env->me_flags & MDBX_ENV_TXKEY)) {
    eASSERT(env, env->me_live_reader == env->me_pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;
}
 11071  
 11072  __cold int mdbx_thread_register(const MDBX_env *env) {
 11073    int rc = check_env(env, true);
 11074    if (unlikely(rc != MDBX_SUCCESS))
 11075      return rc;
 11076  
 11077    if (unlikely(!env->me_lck_mmap.lck))
 11078      return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM;
 11079  
 11080    if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) {
 11081      eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
 11082      return MDBX_EINVAL /* MDBX_NOTLS mode */;
 11083    }
 11084  
 11085    eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY |
 11086                                   MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY);
 11087    MDBX_reader *r = thread_rthc_get(env->me_txkey);
 11088    if (unlikely(r != NULL)) {
 11089      eASSERT(env, r->mr_pid.weak == env->me_pid);
 11090      eASSERT(env, r->mr_tid.weak == osal_thread_self());
 11091      if (unlikely(r->mr_pid.weak != env->me_pid))
 11092        return MDBX_BAD_RSLOT;
 11093      return MDBX_RESULT_TRUE /* already registered */;
 11094    }
 11095  
 11096    const uintptr_t tid = osal_thread_self();
 11097    if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid))
 11098      return MDBX_TXN_OVERLAPPING;
 11099    return bind_rslot((MDBX_env *)env, tid).err;
 11100  }
 11101  
 11102  __cold int mdbx_thread_unregister(const MDBX_env *env) {
 11103    int rc = check_env(env, true);
 11104    if (unlikely(rc != MDBX_SUCCESS))
 11105      return rc;
 11106  
 11107    if (unlikely(!env->me_lck_mmap.lck))
 11108      return MDBX_RESULT_TRUE;
 11109  
 11110    if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) {
 11111      eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
 11112      return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */;
 11113    }
 11114  
 11115    eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY |
 11116                                   MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY);
 11117    MDBX_reader *r = thread_rthc_get(env->me_txkey);
 11118    if (unlikely(r == NULL))
 11119      return MDBX_RESULT_TRUE /* not registered */;
 11120  
 11121    eASSERT(env, r->mr_pid.weak == env->me_pid);
 11122    eASSERT(env, r->mr_tid.weak == osal_thread_self());
 11123    if (unlikely(r->mr_pid.weak != env->me_pid ||
 11124                 r->mr_tid.weak != osal_thread_self()))
 11125      return MDBX_BAD_RSLOT;
 11126  
 11127    eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
 11128    if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD))
 11129      return MDBX_BUSY /* transaction is still active */;
 11130  
 11131    atomic_store32(&r->mr_pid, 0, mo_Relaxed);
 11132    atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
 11133                   mo_AcquireRelease);
 11134    thread_rthc_set(env->me_txkey, nullptr);
 11135    return MDBX_SUCCESS;
 11136  }
 11137  
 11138  /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */
 11139  static bool coherency_check(const MDBX_env *env, const txnid_t txnid,
 11140                              const volatile MDBX_db *dbs,
 11141                              const volatile MDBX_meta *meta, bool report) {
 11142    const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid;
 11143    const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid;
 11144  
 11145    const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root;
 11146    const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno != P_INVALID)
 11147                                       ? pgno2page(env, freedb_root_pgno)
 11148                                       : nullptr;
 11149  
 11150    const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root;
 11151    const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID)
 11152                                       ? pgno2page(env, maindb_root_pgno)
 11153                                       : nullptr;
 11154    const uint64_t magic_and_version =
 11155        unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version);
 11156  
 11157    bool ok = true;
 11158    if (unlikely(txnid < freedb_mod_txnid ||
 11159                 (!freedb_mod_txnid && freedb_root &&
 11160                  likely(magic_and_version == MDBX_DATA_MAGIC)))) {
 11161      if (report)
 11162        WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN
 11163                " for meta_txnid %" PRIaTXN " %s",
 11164                "free", freedb_mod_txnid, txnid,
 11165                "(workaround for incoherent flaw of unified page/buffer cache)");
 11166      ok = false;
 11167    }
 11168    if (unlikely(txnid < maindb_mod_txnid ||
 11169                 (!maindb_mod_txnid && maindb_root &&
 11170                  likely(magic_and_version == MDBX_DATA_MAGIC)))) {
 11171      if (report)
 11172        WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN
 11173                " for meta_txnid %" PRIaTXN " %s",
 11174                "main", maindb_mod_txnid, txnid,
 11175                "(workaround for incoherent flaw of unified page/buffer cache)");
 11176      ok = false;
 11177    }
 11178    if (likely(freedb_root && freedb_mod_txnid)) {
 11179      VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->mp_txnid));
 11180      MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root,
 11181                                       sizeof(freedb_root->mp_txnid));
 11182      const txnid_t root_txnid = freedb_root->mp_txnid;
 11183      if (unlikely(root_txnid != freedb_mod_txnid)) {
 11184        if (report)
 11185          WARNING(
 11186              "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
 11187              " for %sdb.mod_txnid %" PRIaTXN " %s",
 11188              freedb_root_pgno, root_txnid, "free", freedb_mod_txnid,
 11189              "(workaround for incoherent flaw of unified page/buffer cache)");
 11190        ok = false;
 11191      }
 11192    }
 11193    if (likely(maindb_root && maindb_mod_txnid)) {
 11194      VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->mp_txnid));
 11195      MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root,
 11196                                       sizeof(maindb_root->mp_txnid));
 11197      const txnid_t root_txnid = maindb_root->mp_txnid;
 11198      if (unlikely(root_txnid != maindb_mod_txnid)) {
 11199        if (report)
 11200          WARNING(
 11201              "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN
 11202              " for %sdb.mod_txnid %" PRIaTXN " %s",
 11203              maindb_root_pgno, root_txnid, "main", maindb_mod_txnid,
 11204              "(workaround for incoherent flaw of unified page/buffer cache)");
 11205        ok = false;
 11206      }
 11207    }
 11208    return ok;
 11209  }
 11210  
 11211  __cold static int coherency_timeout(uint64_t *timestamp) {
 11212    if (likely(timestamp && *timestamp == 0))
 11213      *timestamp = osal_monotime();
 11214    else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) {
 11215      ERROR("bailout waiting for valid snapshot (%s)",
 11216            "workaround for incoherent flaw of unified page/buffer cache");
 11217      return MDBX_CORRUPTED;
 11218    }
 11219  
 11220    osal_memory_fence(mo_AcquireRelease, true);
 11221  #if defined(_WIN32) || defined(_WIN64)
 11222    SwitchToThread();
 11223  #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
 11224    sched_yield();
 11225  #elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
 11226    pthread_yield();
 11227  #else
 11228    usleep(42);
 11229  #endif
 11230    return MDBX_RESULT_TRUE;
 11231  }
 11232  
 11233  /* check with timeout as the workaround
 11234   * for todo4recovery://erased_by_github/libmdbx/issues/269 */
 11235  __hot static int coherency_check_readed(const MDBX_env *env,
 11236                                          const txnid_t txnid,
 11237                                          const volatile MDBX_db *dbs,
 11238                                          const volatile MDBX_meta *meta,
 11239                                          uint64_t *timestamp) {
 11240    const bool report = !(timestamp && *timestamp);
 11241    if (unlikely(!coherency_check(env, txnid, dbs, meta, report)))
 11242      return coherency_timeout(timestamp);
 11243    return MDBX_SUCCESS;
 11244  }
 11245  
 11246  static int coherency_check_written(const MDBX_env *env, const txnid_t txnid,
 11247                                     const volatile MDBX_meta *meta,
 11248                                     uint64_t *timestamp) {
 11249    const bool report = !(timestamp && *timestamp);
 11250    const txnid_t head_txnid = meta_txnid(meta);
 11251    if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) {
 11252      if (report)
 11253        WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s",
 11254                (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid,
 11255                bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb),
 11256                "(workaround for incoherent flaw of unified page/buffer cache)");
 11257      return coherency_timeout(timestamp);
 11258    }
 11259    return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp);
 11260  }
 11261  
 11262  static bool coherency_check_meta(const MDBX_env *env,
 11263                                   const volatile MDBX_meta *meta, bool report) {
 11264    uint64_t timestamp = 0;
 11265    return coherency_check_written(env, 0, meta, report ? &timestamp : nullptr) ==
 11266           MDBX_SUCCESS;
 11267  }
 11268  
 11269  /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */
/* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */
/* Arms `txn` as a live transaction:
 *  - read-only (flags & MDBX_TXN_RDONLY): binds a reader slot, pins the
 *    most recent coherent meta snapshot into the slot, and copies the
 *    snapshot's geometry/db-records into the txn;
 *  - read-write: takes the writer lock, validates the recent meta, and
 *    initializes the dirty/retired/loose page bookkeeping.
 * On success returns MDBX_SUCCESS with mt_owner set (write path) or left
 * for the caller (read path via mdbx_txn_renew).  On failure the txn is
 * finalized via txn_end(MDBX_END_SLOT | MDBX_END_FAIL_BEGIN). */
static int txn_renew(MDBX_txn *txn, const unsigned flags) {
  MDBX_env *env = txn->mt_env;
  int rc;

#if MDBX_ENV_CHECKPID
  /* Using an env inherited over fork() is fatal. */
  if (unlikely(env->me_pid != osal_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    return MDBX_PANIC;
  }
#endif /* MDBX_ENV_CHECKPID */

  STATIC_ASSERT(sizeof(MDBX_reader) == 32);
#if MDBX_LOCKING > 0
  STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0);
  STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0);
#else
  STATIC_ASSERT(
      offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0);
  STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
                0);
#endif /* MDBX_LOCKING */
  STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
                0);

  const uintptr_t tid = osal_thread_self();
  if (flags & MDBX_TXN_RDONLY) {
    eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0);
    txn->mt_flags =
        MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP));
    /* Reuse a previously bound reader slot when possible. */
    MDBX_reader *r = txn->to.reader;
    STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid));
    if (likely(env->me_flags & MDBX_ENV_TXKEY)) {
      eASSERT(env, !(env->me_flags & MDBX_NOTLS));
      r = thread_rthc_get(env->me_txkey);
      if (likely(r)) {
        if (unlikely(!r->mr_pid.weak) &&
            (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) {
          /* stale thread-local slot left by a closed env: discard it */
          thread_rthc_set(env->me_txkey, nullptr);
          r = nullptr;
        } else {
          eASSERT(env, r->mr_pid.weak == env->me_pid);
          eASSERT(env, r->mr_tid.weak == osal_thread_self());
        }
      }
    } else {
      eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
    }

    if (likely(r)) {
      if (unlikely(r->mr_pid.weak != env->me_pid ||
                   r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD))
        return MDBX_BAD_RSLOT;
    } else if (env->me_lck_mmap.lck) {
      /* no slot yet: claim one from the shared reader table */
      bind_rslot_result brs = bind_rslot(env, tid);
      if (unlikely(brs.err != MDBX_SUCCESS))
        return brs.err;
      r = brs.rslot;
    }
    txn->to.reader = r;
    if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
      /* MDBX_TXN_RDONLY_PREPARE: bind the slot only, don't pin a snapshot */
      eASSERT(env, txn->mt_txnid == 0);
      eASSERT(env, txn->mt_owner == 0);
      eASSERT(env, txn->mt_numdbs == 0);
      if (likely(r)) {
        eASSERT(env, r->mr_snapshot_pages_used.weak == 0);
        eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
        atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed);
      }
      txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
      return MDBX_SUCCESS;
    }

    /* Seek & fetch the last meta */
    uint64_t timestamp = 0;
    unsigned loop = 0;
    meta_troika_t troika = meta_tap(env);
    while (1) {
      const meta_ptr_t head =
          likely(env->me_stuck_meta < 0)
              ? /* regular */ meta_recent(env, &troika)
              : /* recovery mode */ meta_ptr(env, env->me_stuck_meta);
      if (likely(r)) {
        /* Publish the candidate snapshot in the reader slot BEFORE
         * validating it, so the GC cannot recycle its pages meanwhile. */
        safe64_reset(&r->mr_txnid, false);
        atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next,
                       mo_Relaxed);
        atomic_store64(
            &r->mr_snapshot_pages_retired,
            unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired),
            mo_Relaxed);
        safe64_write(&r->mr_txnid, head.txnid);
        eASSERT(env, r->mr_pid.weak == osal_getpid());
        eASSERT(env,
                r->mr_tid.weak ==
                    ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self()));
        eASSERT(env, r->mr_txnid.weak == head.txnid ||
                         (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD &&
                          head.txnid < env->me_lck->mti_oldest_reader.weak));
        atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
                       mo_AcquireRelease);
      } else {
        /* exclusive mode without lck */
        eASSERT(env, !env->me_lck_mmap.lck &&
                         env->me_lck == (void *)&env->x_lckless_stub);
      }
      jitter4testing(true);

      /* Snap the state from current meta-head */
      txn->mt_txnid = head.txnid;
      txn->mt_geo = head.ptr_v->mm_geo;
      memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db));
      txn->mt_canary = head.ptr_v->mm_canary;

      if (unlikely(env->me_stuck_meta >= 0))
        break;
      /* Retry if the metas changed under us or the pinned txnid already
       * fell behind the oldest-reader watermark (slot publish lost). */
      if (unlikely(meta_should_retry(env, &troika) ||
                   head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader,
                                              mo_AcquireRelease))) {
        if (unlikely(++loop > 42)) {
          ERROR("bailout waiting for valid snapshot (%s)",
                "metapages are too volatile");
          rc = MDBX_PROBLEM;
          txn->mt_txnid = INVALID_TXNID;
          if (likely(r))
            safe64_reset(&r->mr_txnid, false);
          goto bailout;
        }
        timestamp = 0;
        continue;
      }

      rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v,
                                  &timestamp);
      jitter4testing(false);
      if (likely(rc == MDBX_SUCCESS))
        break;

      if (unlikely(rc != MDBX_RESULT_TRUE)) {
        txn->mt_txnid = INVALID_TXNID;
        if (likely(r))
          safe64_reset(&r->mr_txnid, false);
        goto bailout;
      }
    }

    if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) {
      ERROR("%s", "environment corrupted by died writer, must shutdown!");
      if (likely(r))
        safe64_reset(&r->mr_txnid, false);
      txn->mt_txnid = INVALID_TXNID;
      rc = MDBX_CORRUPTED;
      goto bailout;
    }
    eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak);
    txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
    ENSURE(env, txn->mt_txnid >=
                    /* paranoia is appropriate here */ env->me_lck
                        ->mti_oldest_reader.weak);
    txn->mt_numdbs = env->me_numdbs;
  } else {
    /* Read-write path. */
    eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS |
                            MDBX_WRITEMAP)) == 0);
    if (unlikely(txn->mt_owner == tid ||
                 /* not recovery mode */ env->me_stuck_meta >= 0))
      return MDBX_BUSY;
    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
    if (lck && (env->me_flags & MDBX_NOTLS) == 0 &&
        (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
      /* Reject a write txn on a thread that already holds a read txn. */
      const unsigned snap_nreaders =
          atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
      for (unsigned i = 0; i < snap_nreaders; ++i) {
        if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) ==
                env->me_pid &&
            unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) ==
                     tid)) {
          const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
          if (txnid >= MIN_TXNID && txnid <= MAX_TXNID)
            return MDBX_TXN_OVERLAPPING;
        }
      }
    }

    /* Not yet touching txn == env->me_txn0, it may be active */
    jitter4testing(false);
    rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY));
    if (unlikely(rc))
      return rc;
    if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
      mdbx_txn_unlock(env);
      return MDBX_PANIC;
    }
#if defined(_WIN32) || defined(_WIN64)
    if (unlikely(!env->me_map)) {
      mdbx_txn_unlock(env);
      return MDBX_EPERM;
    }
#endif /* Windows */

    txn->tw.troika = meta_tap(env);
    const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
    uint64_t timestamp = 0;
    while (
        "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") {
      rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs,
                                  head.ptr_v, &timestamp);
      if (likely(rc == MDBX_SUCCESS))
        break;
      if (unlikely(rc != MDBX_RESULT_TRUE))
        goto bailout;
    }
    txn->mt_canary = head.ptr_c->mm_canary;
    eASSERT(env, meta_txnid(head.ptr_v) == head.txnid);
    txn->mt_txnid = safe64_txnid_next(head.txnid);
    if (unlikely(txn->mt_txnid > MAX_TXNID)) {
      rc = MDBX_TXN_FULL;
      ERROR("txnid overflow, raise %d", rc);
      goto bailout;
    }

    /* Reset per-write-txn page bookkeeping. */
    txn->mt_flags = flags;
    txn->mt_child = NULL;
    txn->tw.loose_pages = NULL;
    txn->tw.loose_count = 0;
#if MDBX_ENABLE_REFUND
    txn->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
    MDBX_PNL_SIZE(txn->tw.retired_pages) = 0;
    txn->tw.spill_pages = NULL;
    txn->tw.spill_least_removed = 0;
    txn->tw.last_reclaimed = 0;
    if (txn->tw.lifo_reclaimed)
      MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0;
    env->me_txn = txn;
    txn->mt_numdbs = env->me_numdbs;
    memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned));
    /* Copy the DB info and flags */
    memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db));
    /* Moved to here to avoid a data race in read TXNs */
    txn->mt_geo = head.ptr_c->mm_geo;

    rc = dpl_alloc(txn);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
    txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit;
    txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0;
  }

  /* Setup db info */
  osal_compiler_barrier();
  memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs);
  for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) {
    const unsigned db_flags = env->me_dbflags[i];
    txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS;
    txn->mt_dbistate[i] =
        (db_flags & DB_VALID) ? DBI_VALID | DBI_USRVALID | DBI_STALE : 0;
  }
  txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID;
  txn->mt_dbistate[FREE_DBI] = DBI_VALID;
  txn->mt_front =
      txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0);

  if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
    WARNING("%s", "environment had fatal error, must shutdown!");
    rc = MDBX_PANIC;
  } else {
    /* Grow the mapping if the snapshot's geometry exceeds it. */
    const size_t size =
        pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno
                                                          : txn->mt_end_pgno);
    if (unlikely(size > env->me_dxb_mmap.limit)) {
      if (txn->mt_geo.upper > MAX_PAGENO + 1 ||
          bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) !=
              txn->mt_geo.upper) {
        rc = MDBX_UNABLE_EXTEND_MAPSIZE;
        goto bailout;
      }
      rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno,
                      txn->mt_geo.upper,
                      (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false);
      if (rc != MDBX_SUCCESS)
        goto bailout;
    } else {
      env->me_dxb_mmap.current = size;
      env->me_dxb_mmap.filesize =
          (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize;
    }
    if (txn->mt_flags & MDBX_TXN_RDONLY) {
#if defined(_WIN32) || defined(_WIN64)
      if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) ||
           (mdbx_RunningUnderWine() &&
            /* under Wine acquisition of remap_guard is always required,
             * since Wine don't support section extending,
             * i.e. in both cases unmap+map are required. */
            size < env->me_dbgeo.upper && env->me_dbgeo.grow)) &&
          /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) {
        txn->mt_flags |= MDBX_SHRINK_ALLOWED;
        osal_srwlock_AcquireShared(&env->me_remap_guard);
      }
#endif /* Windows */
    }
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    txn_valgrind(env, txn);
#endif
    txn->mt_owner = tid;
    return MDBX_SUCCESS;
  }
bailout:
  tASSERT(txn, rc != MDBX_SUCCESS);
  txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN);
  return rc;
}
 11579  
/* Validate a transaction handle before use.
 * Returns MDBX_EINVAL for NULL, MDBX_EBADSIGN for a clobbered handle,
 * MDBX_BAD_TXN when any of `bad_bits` is set in mt_flags, and (optionally)
 * a thread-ownership error.  MDBX_SUCCESS otherwise. */
static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) {
  if (unlikely(!txn))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDBX_EBADSIGN;

  if (unlikely(txn->mt_flags & bad_bits))
    return MDBX_BAD_TXN;

  tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) ==
                   ((txn->mt_flags & MDBX_TXN_RDONLY)
                        ? txn->mt_env->me_flags & MDBX_NOTLS
                        : 0));
#if MDBX_TXN_CHECKOWNER
  STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY);
  /* Ownership is enforced unless the txn is a finished read-only one or
   * runs in MDBX_NOTLS mode: since the NOTLS bit alone exceeds
   * FINISHED+RDONLY (see STATIC_ASSERT above), the `<` comparison is true
   * only when NOTLS is clear and FINISHED|RDONLY are not both set. */
  if (unlikely(txn->mt_owner != osal_thread_self()) &&
      (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) <
          (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY))
    return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN;
#endif /* MDBX_TXN_CHECKOWNER */

  if (bad_bits && unlikely(!txn->mt_env->me_map))
    return MDBX_EPERM;

  return MDBX_SUCCESS;
}
 11607  
 11608  static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
 11609    int err = check_txn(txn, bad_bits);
 11610    if (unlikely(err))
 11611      return err;
 11612  
 11613    if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
 11614      return MDBX_EACCESS;
 11615  
 11616    return MDBX_SUCCESS;
 11617  }
 11618  
 11619  int mdbx_txn_renew(MDBX_txn *txn) {
 11620    if (unlikely(!txn))
 11621      return MDBX_EINVAL;
 11622  
 11623    if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
 11624      return MDBX_EBADSIGN;
 11625  
 11626    if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0))
 11627      return MDBX_EINVAL;
 11628  
 11629    int rc;
 11630    if (unlikely(txn->mt_owner != 0 || !(txn->mt_flags & MDBX_TXN_FINISHED))) {
 11631      rc = mdbx_txn_reset(txn);
 11632      if (unlikely(rc != MDBX_SUCCESS))
 11633        return rc;
 11634    }
 11635  
 11636    rc = txn_renew(txn, MDBX_TXN_RDONLY);
 11637    if (rc == MDBX_SUCCESS) {
 11638      txn->mt_owner = osal_thread_self();
 11639      DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
 11640            "/%" PRIaPGNO,
 11641            txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w',
 11642            (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root,
 11643            txn->mt_dbs[FREE_DBI].md_root);
 11644    }
 11645    return rc;
 11646  }
 11647  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: exported mdbx_txn_begin() simply forwards to the
 * inline wrapper around mdbx_txn_begin_ex() with a NULL context. */
int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
                   MDBX_txn **ret) {
  return __inline_mdbx_txn_begin(env, parent, flags, ret);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 11654  
 11655  int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
 11656    int rc = check_txn(txn, MDBX_TXN_FINISHED);
 11657    if (unlikely(rc != MDBX_SUCCESS))
 11658      return rc;
 11659  
 11660    txn->mt_userctx = ctx;
 11661    return MDBX_SUCCESS;
 11662  }
 11663  
 11664  void *mdbx_txn_get_userctx(const MDBX_txn *txn) {
 11665    return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->mt_userctx;
 11666  }
 11667  
/* Begins a new transaction: read-only, top-level write, or nested write.
 *
 * [in]  env      environment handle (validated via check_env)
 * [in]  parent   parent txn for a nested write transaction, or NULL
 * [in]  flags    begin-flags; must fit entirely within either the RW or
 *                the RO begin-flag set
 * [out] ret      receives the new transaction handle on success
 * [in]  context  user context stored into the new txn (mt_userctx)
 *
 * The preallocated env->me_txn0 is reused for top-level write txns; a fresh
 * allocation is made for read-only and nested txns. */
int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
                      MDBX_txn **ret, void *context) {
  MDBX_txn *txn;
  unsigned size, tsize;

  if (unlikely(!ret))
    return MDBX_EINVAL;
  *ret = NULL;

  /* Reject flag combinations that belong to neither the RW nor RO set. */
  if (unlikely((flags & ~MDBX_TXN_RW_BEGIN_FLAGS) &&
               (flags & ~MDBX_TXN_RO_BEGIN_FLAGS)))
    return MDBX_EINVAL;

  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(env->me_flags & MDBX_RDONLY &
               ~flags)) /* write txn in RDONLY env */
    return MDBX_EACCESS;

  /* Inherit writemap mode from the environment. */
  flags |= env->me_flags & MDBX_WRITEMAP;

  if (parent) {
    /* Nested transactions: Max 1 child, write txns only, no writemap */
    rc = check_txn_rw(parent,
                      MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    if (env->me_options.spill_parent4child_denominator) {
      /* Spill dirty-pages of parent to provide dirtyroom for child txn */
      rc = txn_spill(parent, nullptr,
                     parent->tw.dirtylist->length /
                         env->me_options.spill_parent4child_denominator);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
    tASSERT(parent, audit_ex(parent, 0, false) == 0);

    flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS);
  } else if (flags & MDBX_TXN_RDONLY) {
    /* Refuse a read txn on the thread that owns the write txn, unless the
     * legacy-overlap debug flag permits it. */
    if (env->me_txn0 &&
        unlikely(env->me_txn0->mt_owner == osal_thread_self()) &&
        (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0)
      return MDBX_TXN_OVERLAPPING;
  } else {
    /* Reuse preallocated write txn. However, do not touch it until
     * txn_renew() succeeds, since it currently may be active. */
    txn = env->me_txn0;
    goto renew;
  }

  /* Single allocation: MDBX_txn header, then mt_dbs array, then mt_cursors
   * array, then one dbistate byte per dbi slot. */
  size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1);
  size += tsize = sizeof(MDBX_txn);
  if (unlikely((txn = osal_malloc(size)) == NULL)) {
    DEBUG("calloc: %s", "failed");
    return MDBX_ENOMEM;
  }
#if MDBX_DEBUG
  memset(txn, 0xCD, size);
  VALGRIND_MAKE_MEM_UNDEFINED(txn, size);
#endif /* MDBX_DEBUG */
  memset(txn, 0, tsize);
  txn->mt_dbxs = env->me_dbxs; /* static */
  txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
  txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
  txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs;
  txn->mt_flags = flags;
  txn->mt_env = env;

  if (parent) {
    /* Nested write txn: shadow the parent's state into the child. */
    tASSERT(parent, dirtylist_check(parent));
    txn->mt_dbiseqs = parent->mt_dbiseqs;
    txn->mt_geo = parent->mt_geo;
    rc = dpl_alloc(txn);
    if (likely(rc == MDBX_SUCCESS)) {
      const unsigned len =
          MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count;
      txn->tw.reclaimed_pglist =
          pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
      if (unlikely(!txn->tw.reclaimed_pglist))
        rc = MDBX_ENOMEM;
    }
    if (unlikely(rc != MDBX_SUCCESS)) {
    nested_failed:
      pnl_free(txn->tw.reclaimed_pglist);
      dpl_free(txn);
      osal_free(txn);
      return rc;
    }

    /* Move loose pages to reclaimed list */
    if (parent->tw.loose_count) {
      do {
        MDBX_page *lp = parent->tw.loose_pages;
        const unsigned di = dpl_exist(parent, lp->mp_pgno);
        tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp);
        tASSERT(parent, lp->mp_flags == P_LOOSE);
        rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1);
        if (unlikely(rc != MDBX_SUCCESS))
          goto nested_failed;
        parent->tw.loose_pages = lp->mp_next;
        /* Remove from dirty list */
        page_wash(parent, di, lp, 1);
      } while (parent->tw.loose_pages);
      parent->tw.loose_count = 0;
#if MDBX_ENABLE_REFUND
      parent->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
      tASSERT(parent, dirtylist_check(parent));
    }
    txn->tw.dirtyroom = parent->tw.dirtyroom;
    txn->tw.dirtylru = parent->tw.dirtylru;

    dpl_sort(parent);
    if (parent->tw.spill_pages)
      spill_purge(parent);

    /* Copy the parent's reclaimed-page list into the child's copy. */
    tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >=
                     MDBX_PNL_SIZE(parent->tw.reclaimed_pglist));
    memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist,
           MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist));
    eASSERT(env, pnl_check_allocated(
                     txn->tw.reclaimed_pglist,
                     (txn->mt_next_pgno /* LY: intentional assignment here,
                                               only for assertion */
                      = parent->mt_next_pgno) -
                         MDBX_ENABLE_REFUND));

    txn->tw.last_reclaimed = parent->tw.last_reclaimed;
    /* Ownership of lifo_reclaimed/retired_pages moves to the child; the
     * parent keeps only the previous element count (as a tagged integer)
     * so txn_end() can restore it. */
    if (parent->tw.lifo_reclaimed) {
      txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed;
      parent->tw.lifo_reclaimed =
          (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed);
    }

    txn->tw.retired_pages = parent->tw.retired_pages;
    parent->tw.retired_pages =
        (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages);

    txn->mt_txnid = parent->mt_txnid;
    txn->mt_front = parent->mt_front + 1;
#if MDBX_ENABLE_REFUND
    txn->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
    txn->mt_canary = parent->mt_canary;
    parent->mt_flags |= MDBX_TXN_HAS_CHILD;
    parent->mt_child = txn;
    txn->mt_parent = parent;
    txn->mt_numdbs = parent->mt_numdbs;
    txn->mt_owner = parent->mt_owner;
    memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db));
    txn->tw.troika = parent->tw.troika;
    /* Copy parent's mt_dbistate, but clear DB_NEW */
    for (unsigned i = 0; i < txn->mt_numdbs; i++)
      txn->mt_dbistate[i] =
          parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
    tASSERT(parent,
            parent->tw.dirtyroom + parent->tw.dirtylist->length ==
                (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
                                   : parent->mt_env->me_options.dp_limit));
    tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                     (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                     : txn->mt_env->me_options.dp_limit));
    env->me_txn = txn;
    rc = cursor_shadow(parent, txn);
    if (AUDIT_ENABLED() && ASSERT_ENABLED()) {
      txn->mt_signature = MDBX_MT_SIGNATURE;
      tASSERT(txn, audit_ex(txn, 0, false) == 0);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      txn_end(txn, MDBX_END_FAIL_BEGINCHILD);
  } else { /* MDBX_TXN_RDONLY */
    txn->mt_dbiseqs = env->me_dbiseqs;
  renew:
    rc = txn_renew(txn, flags);
  }

  if (unlikely(rc != MDBX_SUCCESS)) {
    /* Never free the preallocated write txn. */
    if (txn != env->me_txn0)
      osal_free(txn);
  } else {
    /* Sanity-check the resulting flags for each begin mode. */
    if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
      eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
    else if (flags & MDBX_TXN_RDONLY)
      eASSERT(env, (txn->mt_flags &
                    ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
                      /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0);
    else {
      eASSERT(env, (txn->mt_flags &
                    ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC |
                      MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
      assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed);
    }
    txn->mt_signature = MDBX_MT_SIGNATURE;
    txn->mt_userctx = context;
    *ret = txn;
    DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
          "/%" PRIaPGNO,
          txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn,
          (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
          txn->mt_dbs[FREE_DBI].md_root);
  }

  return rc;
}
 11875  
/* Fills `info` with runtime details of the given transaction.
 *
 * [in]  txn       transaction handle (read-only or write)
 * [out] info      destination structure
 * [in]  scan_rlt  when true, additionally scan the reader-lock table to
 *                 compute lag/space figures (more expensive). */
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
  int rc = check_txn(txn, MDBX_TXN_FINISHED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!info))
    return MDBX_EINVAL;

  MDBX_env *const env = txn->mt_env;
#if MDBX_ENV_CHECKPID
  if (unlikely(env->me_pid != osal_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    return MDBX_PANIC;
  }
#endif /* MDBX_ENV_CHECKPID */

  info->txn_id = txn->mt_txnid;
  info->txn_space_used = pgno2bytes(env, txn->mt_geo.next);

  if (txn->mt_flags & MDBX_TXN_RDONLY) {
    meta_ptr_t head;
    uint64_t head_retired;
    /* The meta page may change under us; re-read until the troika confirms
     * a consistent snapshot. */
    meta_troika_t troika = meta_tap(env);
    do {
      /* fetch info from volatile head */
      head = meta_recent(env, &troika);
      head_retired =
          unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired);
      info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now);
      info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper);
      info->txn_space_leftover =
          pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next);
    } while (unlikely(meta_should_retry(env, &troika)));

    info->txn_reader_lag = head.txnid - info->txn_id;
    info->txn_space_dirty = info->txn_space_retired = 0;
    uint64_t reader_snapshot_pages_retired;
    if (txn->to.reader &&
        head_retired >
            (reader_snapshot_pages_retired = atomic_load64(
                 &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) {
      info->txn_space_dirty = info->txn_space_retired = pgno2bytes(
          env, (pgno_t)(head_retired - reader_snapshot_pages_retired));

      size_t retired_next_reader = 0;
      MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
      if (scan_rlt && info->txn_reader_lag > 1 && lck) {
        /* find next more recent reader */
        txnid_t next_reader = head.txnid;
        const unsigned snap_nreaders =
            atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
        for (unsigned i = 0; i < snap_nreaders; ++i) {
        retry:
          if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
            jitter4testing(true);
            /* Re-read until txnid and retired-counter are mutually
             * consistent (the slot may be updated concurrently). */
            const txnid_t snap_txnid =
                safe64_read(&lck->mti_readers[i].mr_txnid);
            const uint64_t snap_retired =
                atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired,
                              mo_AcquireRelease);
            if (unlikely(snap_retired !=
                         atomic_load64(
                             &lck->mti_readers[i].mr_snapshot_pages_retired,
                             mo_Relaxed)) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))
              goto retry;
            if (snap_txnid <= txn->mt_txnid) {
              retired_next_reader = 0;
              break;
            }
            if (snap_txnid < next_reader) {
              next_reader = snap_txnid;
              retired_next_reader = pgno2bytes(
                  env, (pgno_t)(snap_retired -
                                atomic_load64(
                                    &txn->to.reader->mr_snapshot_pages_retired,
                                    mo_Relaxed)));
            }
          }
        }
      }
      info->txn_space_dirty = retired_next_reader;
    }
  } else {
    /* Write transaction: geometry and dirty/retired counters come straight
     * from the txn itself. */
    info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now);
    info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper);
    info->txn_space_retired = pgno2bytes(
        env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages
                           : MDBX_PNL_SIZE(txn->tw.retired_pages));
    info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
    info->txn_space_dirty =
        pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom);
    info->txn_reader_lag = INT64_MAX;
    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
    if (scan_rlt && lck) {
      txnid_t oldest_snapshot = txn->mt_txnid;
      const unsigned snap_nreaders =
          atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
      if (snap_nreaders) {
        oldest_snapshot = txn_oldest_reader(txn);
        if (oldest_snapshot == txn->mt_txnid - 1) {
          /* check if there is at least one reader */
          bool exists = false;
          for (unsigned i = 0; i < snap_nreaders; ++i) {
            if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) &&
                txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) {
              exists = true;
              break;
            }
          }
          oldest_snapshot += !exists;
        }
      }
      info->txn_reader_lag = txn->mt_txnid - oldest_snapshot;
    }
  }

  return MDBX_SUCCESS;
}
 11995  
 11996  MDBX_env *mdbx_txn_env(const MDBX_txn *txn) {
 11997    if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE ||
 11998                 txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE))
 11999      return NULL;
 12000    return txn->mt_env;
 12001  }
 12002  
 12003  uint64_t mdbx_txn_id(const MDBX_txn *txn) {
 12004    if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE))
 12005      return 0;
 12006    return txn->mt_txnid;
 12007  }
 12008  
 12009  int mdbx_txn_flags(const MDBX_txn *txn) {
 12010    if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) {
 12011      assert((-1 & (int)MDBX_TXN_INVALID) != 0);
 12012      return -1;
 12013    }
 12014    assert(0 == (int)(txn->mt_flags & MDBX_TXN_INVALID));
 12015    return txn->mt_flags;
 12016  }
 12017  
 12018  /* Check for misused dbi handles */
 12019  static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) {
 12020    if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs)
 12021      return false;
 12022    if (likely(
 12023            txn->mt_dbiseqs[dbi].weak ==
 12024            atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi],
 12025                          mo_AcquireRelease)))
 12026      return false;
 12027    return true;
 12028  }
 12029  
 12030  static __inline unsigned dbi_seq(const MDBX_env *const env, unsigned slot) {
 12031    unsigned v = env->me_dbiseqs[slot].weak + 1;
 12032    return v + (v == 0);
 12033  }
 12034  
/* Synchronizes the txn's per-dbi tables with the environment's current set
 * of named sub-databases. Caller must hold env->me_dbi_lock. */
static void dbi_import_locked(MDBX_txn *txn) {
  const MDBX_env *const env = txn->mt_env;
  unsigned n = env->me_numdbs;
  for (unsigned i = CORE_DBS; i < n; ++i) {
    /* Slots beyond what the txn knew about start out cleared. */
    if (i >= txn->mt_numdbs) {
      txn->mt_cursors[i] = NULL;
      if (txn->mt_dbiseqs != env->me_dbiseqs)
        txn->mt_dbiseqs[i].weak = 0;
      txn->mt_dbistate[i] = 0;
    }
    /* Re-import a slot that changed in the env (and was not created/dirtied
     * by this txn), or that is valid env-side but not txn-side. */
    if ((dbi_changed(txn, i) &&
         (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) ||
        ((env->me_dbflags[i] & DB_VALID) &&
         !(txn->mt_dbistate[i] & DBI_VALID))) {
      tASSERT(txn,
              (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0);
      txn->mt_dbiseqs[i] = env->me_dbiseqs[i];
      txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS;
      txn->mt_dbistate[i] = 0;
      if (env->me_dbflags[i] & DB_VALID) {
        /* DBI_STALE forces the md_root etc. to be re-fetched on first use. */
        txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE;
        tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL);
        tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL);
      }
    }
  }
  /* The txn may know more dbis than the env: trim trailing unused slots,
   * and clear (but keep) any in-use slot that is not user-valid. */
  while (unlikely(n < txn->mt_numdbs))
    if (txn->mt_cursors[txn->mt_numdbs - 1] == NULL &&
        (txn->mt_dbistate[txn->mt_numdbs - 1] & DBI_USRVALID) == 0)
      txn->mt_numdbs -= 1;
    else {
      if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) {
        if (txn->mt_dbiseqs != env->me_dbiseqs)
          txn->mt_dbiseqs[n].weak = 0;
        txn->mt_dbistate[n] = 0;
      }
      ++n;
    }
  txn->mt_numdbs = n;
}
 12075  
 12076  /* Import DBI which opened after txn started into context */
 12077  __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) {
 12078    if (dbi < CORE_DBS ||
 12079        (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs))
 12080      return false;
 12081  
 12082    ENSURE(txn->mt_env,
 12083           osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS);
 12084    dbi_import_locked(txn);
 12085    ENSURE(txn->mt_env,
 12086           osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS);
 12087    return txn->mt_dbistate[dbi] & DBI_USRVALID;
 12088  }
 12089  
/* Export or close DBI handles opened in this txn.
 * [in] txn   the top-level write transaction (must be env->me_txn0)
 * [in] keep  non-zero to publish (export) handles created by this txn to
 *            the env; zero to discard them (free names, bump sequences). */
static void dbi_update(MDBX_txn *txn, int keep) {
  tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0);
  MDBX_dbi n = txn->mt_numdbs;
  if (n) {
    bool locked = false;
    MDBX_env *const env = txn->mt_env;

    /* Process only slots this txn created (DBI_CREAT), lazily taking the
     * dbi lock the first time one is found. */
    for (unsigned i = n; --i >= CORE_DBS;) {
      if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0))
        continue;
      if (!locked) {
        ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
        locked = true;
      }
      if (env->me_numdbs <= i ||
          txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak)
        continue /* dbi explicitly closed and/or then re-opened by other txn */;
      if (keep) {
        env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID;
      } else {
        /* Discard: clear the name and bump the sequence so stale handles
         * in other txns are detectable via dbi_changed(). */
        char *ptr = env->me_dbxs[i].md_name.iov_base;
        if (ptr) {
          env->me_dbxs[i].md_name.iov_len = 0;
          eASSERT(env, env->me_dbflags[i] == 0);
          atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i),
                         mo_AcquireRelease);
          env->me_dbxs[i].md_name.iov_base = NULL;
          osal_free(ptr);
        }
      }
    }

    /* Shrink env->me_numdbs past any trailing invalid slots. */
    n = env->me_numdbs;
    if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) {
      if (!locked) {
        ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
        locked = true;
      }

      /* Re-read under the lock, then trim. */
      n = env->me_numdbs;
      while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID))
        --n;
      env->me_numdbs = n;
    }

    if (unlikely(locked))
      ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  }
}
 12140  
/* Filter-out pgno list from transaction's dirty-page list.
 * [in] txn      transaction whose dirty-list is filtered in place
 * [in] pl       sorted pgno list to remove
 * [in] spilled  when true, pl entries are spill-encoded (pgno << 1), so
 *               they are shifted right by one bit before comparison. */
static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) {
  if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) {
    tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled));
    MDBX_dpl *dl = dpl_sort(txn);

    /* Scanning in ascend order */
    const int step = MDBX_PNL_ASCENDING ? 1 : -1;
    const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl);
    const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0;
    tASSERT(txn, pl[begin] <= pl[end - step]);

    /* Two-pointer merge: r walks the dirty-list, i walks pl. */
    unsigned r = dpl_search(txn, pl[begin] >> spilled);
    tASSERT(txn, dl->sorted == dl->length);
    for (int i = begin; r <= dl->length;) { /* scan loop */
      assert(i != end);
      tASSERT(txn, !spilled || (pl[i] & 1) == 0);
      pgno_t pl_pgno = pl[i] >> spilled;
      pgno_t dp_pgno = dl->items[r].pgno;
      if (likely(dp_pgno != pl_pgno)) {
        /* No match: advance whichever cursor is behind. */
        const bool cmp = dp_pgno < pl_pgno;
        r += cmp;
        i += cmp ? 0 : step;
        if (likely(i != end))
          continue;
        return;
      }

      /* update loop: first match found; from here on dirty-list entries are
       * compacted via a write cursor w while matches are dropped. */
      unsigned npages, w = r;
    remove_dl:
      npages = dpl_npages(dl, r);
      dl->pages_including_loose -= npages;
      if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0)
        dpage_free(txn->mt_env, dl->items[r].ptr, npages);
      ++r;
    next_i:
      i += step;
      if (unlikely(i == end)) {
        /* pl exhausted: keep the remaining dirty-list tail. */
        while (r <= dl->length)
          dl->items[w++] = dl->items[r++];
      } else {
        while (r <= dl->length) {
          assert(i != end);
          tASSERT(txn, !spilled || (pl[i] & 1) == 0);
          pl_pgno = pl[i] >> spilled;
          dp_pgno = dl->items[r].pgno;
          if (dp_pgno < pl_pgno)
            dl->items[w++] = dl->items[r++];
          else if (dp_pgno > pl_pgno)
            goto next_i;
          else
            goto remove_dl;
        }
      }
      /* Account for the removed entries. */
      dl->sorted = dpl_setlen(dl, w - 1);
      txn->tw.dirtyroom += r - w;
      tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                       : txn->mt_env->me_options.dp_limit));
      return;
    }
  }
}
 12205  
/* End a transaction, except successful commit of a nested transaction.
 * May be called twice for readonly txns: First reset it, then abort.
 * [in] txn   the transaction handle to end
 * [in] mode  why and how to end the transaction (MDBX_END_* opcode plus
 *            MDBX_END_SLOT/UPDATE/FREE/EOTDONE modifier bits) */
static int txn_end(MDBX_txn *txn, const unsigned mode) {
  MDBX_env *env = txn->mt_env;
  static const char *const names[] = MDBX_END_NAMES;

#if MDBX_ENV_CHECKPID
  if (unlikely(txn->mt_env->me_pid != osal_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    return MDBX_PANIC;
  }
#endif /* MDBX_ENV_CHECKPID */

  DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO
        "/%" PRIaPGNO,
        names[mode & MDBX_END_OPMASK], txn->mt_txnid,
        (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env,
        txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root);

  ENSURE(env, txn->mt_txnid >=
                  /* paranoia is appropriate here */ env->me_lck
                      ->mti_oldest_reader.weak);

  if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */
    cursors_eot(txn, false);

  int rc = MDBX_SUCCESS;
  if (txn->mt_flags & MDBX_TXN_RDONLY) {
    /* Read-only: release the reader slot (or just its txnid binding). */
    if (txn->to.reader) {
      MDBX_reader *slot = txn->to.reader;
      eASSERT(env, slot->mr_pid.weak == env->me_pid);
      if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) {
        eASSERT(env,
                txn->mt_txnid == slot->mr_txnid.weak &&
                    slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak);
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
        txn_valgrind(env, nullptr);
#endif
        atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed);
        safe64_reset(&slot->mr_txnid, false);
        atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
                       mo_Relaxed);
      } else {
        eASSERT(env, slot->mr_pid.weak == env->me_pid);
        eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
      }
      if (mode & MDBX_END_SLOT) {
        if ((env->me_flags & MDBX_ENV_TXKEY) == 0)
          atomic_store32(&slot->mr_pid, 0, mo_Relaxed);
        txn->to.reader = NULL;
      }
    }
#if defined(_WIN32) || defined(_WIN64)
    if (txn->mt_flags & MDBX_SHRINK_ALLOWED)
      osal_srwlock_ReleaseShared(&env->me_remap_guard);
#endif
    txn->mt_numdbs = 0; /* prevent further DBI activity */
    txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
    txn->mt_owner = 0;
  } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) {
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    if (txn == env->me_txn0)
      txn_valgrind(env, nullptr);
#endif

    txn->mt_flags = MDBX_TXN_FINISHED;
    txn->mt_owner = 0;
    env->me_txn = txn->mt_parent;
    pnl_free(txn->tw.spill_pages);
    txn->tw.spill_pages = nullptr;
    if (txn == env->me_txn0) {
      /* Top-level write txn teardown. */
      eASSERT(env, txn->mt_parent == NULL);
      /* Export or close DBI handles created in this txn */
      dbi_update(txn, mode & MDBX_END_UPDATE);
      pnl_shrink(&txn->tw.retired_pages);
      pnl_shrink(&txn->tw.reclaimed_pglist);
      if (!(env->me_flags & MDBX_WRITEMAP))
        dlist_free(txn);
      /* The writer mutex was locked in mdbx_txn_begin. */
      mdbx_txn_unlock(env);
    } else {
      /* Nested write txn teardown: give back what was shadowed from the
       * parent in mdbx_txn_begin_ex(). */
      eASSERT(env, txn->mt_parent != NULL);
      MDBX_txn *const parent = txn->mt_parent;
      eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE);
      eASSERT(env, parent->mt_child == txn &&
                       (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
      eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist,
                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
      eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika,
                          sizeof(meta_troika_t)) == 0);

      /* Restore the lists whose ownership moved to the child: the parent
       * held only the previous element count as a tagged integer. */
      if (txn->tw.lifo_reclaimed) {
        eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >=
                         (unsigned)(uintptr_t)parent->tw.lifo_reclaimed);
        MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) =
            (unsigned)(uintptr_t)parent->tw.lifo_reclaimed;
        parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed;
      }

      if (txn->tw.retired_pages) {
        eASSERT(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >=
                         (unsigned)(uintptr_t)parent->tw.retired_pages);
        MDBX_PNL_SIZE(txn->tw.retired_pages) =
            (unsigned)(uintptr_t)parent->tw.retired_pages;
        parent->tw.retired_pages = txn->tw.retired_pages;
      }

      parent->mt_child = nullptr;
      parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
      parent->tw.dirtylru = txn->tw.dirtylru;
      tASSERT(parent, dirtylist_check(parent));
      tASSERT(parent, audit_ex(parent, 0, false) == 0);
      if (!(env->me_flags & MDBX_WRITEMAP))
        dlist_free(txn);
      dpl_free(txn);
      pnl_free(txn->tw.reclaimed_pglist);

      if (parent->mt_geo.upper != txn->mt_geo.upper ||
          parent->mt_geo.now != txn->mt_geo.now) {
        /* undo resize performed by child txn */
        rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now,
                                 parent->mt_geo.upper);
        if (rc == MDBX_EPERM) {
          /* unable undo resize (it is regular for Windows),
           * therefore promote size changes from child to the parent txn */
          WARNING("unable undo resize performed by child txn, promote to "
                  "the parent (%u->%u, %u->%u)",
                  txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper,
                  parent->mt_geo.upper);
          parent->mt_geo.now = txn->mt_geo.now;
          parent->mt_geo.upper = txn->mt_geo.upper;
          parent->mt_flags |= MDBX_TXN_DIRTY;
          rc = MDBX_SUCCESS;
        } else if (unlikely(rc != MDBX_SUCCESS)) {
          ERROR("error %d while undo resize performed by child txn, fail "
                "the parent",
                rc);
          parent->mt_flags |= MDBX_TXN_ERROR;
          if (!env->me_dxb_mmap.address)
            env->me_flags |= MDBX_FATAL_ERROR;
        }
      }
    }
  }

  eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0);
  /* Free the handle itself, but never the preallocated env->me_txn0. */
  if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) {
    txn->mt_signature = 0;
    osal_free(txn);
  }

  return rc;
}
 12361  
 12362  int mdbx_txn_reset(MDBX_txn *txn) {
 12363    int rc = check_txn(txn, 0);
 12364    if (unlikely(rc != MDBX_SUCCESS))
 12365      return rc;
 12366  
 12367    /* This call is only valid for read-only txns */
 12368    if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0))
 12369      return MDBX_EINVAL;
 12370  
 12371    /* LY: don't close DBI-handles */
 12372    rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE);
 12373    if (rc == MDBX_SUCCESS) {
 12374      tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE);
 12375      tASSERT(txn, txn->mt_owner == 0);
 12376    }
 12377    return rc;
 12378  }
 12379  
 12380  int mdbx_txn_break(MDBX_txn *txn) {
 12381    do {
 12382      int rc = check_txn(txn, 0);
 12383      if (unlikely(rc != MDBX_SUCCESS))
 12384        return rc;
 12385      txn->mt_flags |= MDBX_TXN_ERROR;
 12386      if (txn->mt_flags & MDBX_TXN_RDONLY)
 12387        break;
 12388      txn = txn->mt_child;
 12389    } while (txn);
 12390    return MDBX_SUCCESS;
 12391  }
 12392  
 12393  int mdbx_txn_abort(MDBX_txn *txn) {
 12394    int rc = check_txn(txn, 0);
 12395    if (unlikely(rc != MDBX_SUCCESS))
 12396      return rc;
 12397  
 12398    if (txn->mt_flags & MDBX_TXN_RDONLY)
 12399      /* LY: don't close DBI-handles */
 12400      return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT |
 12401                              MDBX_END_FREE);
 12402  
 12403    if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED))
 12404      return MDBX_BAD_TXN;
 12405  
 12406    if (txn->mt_child)
 12407      mdbx_txn_abort(txn->mt_child);
 12408  
 12409    tASSERT(txn, dirtylist_check(txn));
 12410    return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE);
 12411  }
 12412  
 12413  /* Count all the pages in each DB and in the GC and make sure
 12414   * it matches the actual number of pages being used. */
 12415  __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored,
 12416                             bool dont_filter_gc) {
 12417    pgno_t pending = 0;
 12418    if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
 12419      pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) +
 12420                (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored);
 12421    }
 12422  
 12423    MDBX_cursor_couple cx;
 12424    int rc = cursor_init(&cx.outer, txn, FREE_DBI);
 12425    if (unlikely(rc != MDBX_SUCCESS))
 12426      return rc;
 12427  
 12428    pgno_t gc = 0;
 12429    MDBX_val key, data;
 12430    while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) {
 12431      if (!dont_filter_gc) {
 12432        if (unlikely(key.iov_len != sizeof(txnid_t)))
 12433          return MDBX_CORRUPTED;
 12434        txnid_t id = unaligned_peek_u64(4, key.iov_base);
 12435        if (txn->tw.lifo_reclaimed) {
 12436          for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i)
 12437            if (id == txn->tw.lifo_reclaimed[i])
 12438              goto skip;
 12439        } else if (id <= txn->tw.last_reclaimed)
 12440          goto skip;
 12441      }
 12442  
 12443      gc += *(pgno_t *)data.iov_base;
 12444    skip:;
 12445    }
 12446    tASSERT(txn, rc == MDBX_NOTFOUND);
 12447  
 12448    for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++)
 12449      txn->mt_dbistate[i] &= ~DBI_AUDITED;
 12450  
 12451    pgno_t used = NUM_METAS;
 12452    for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) {
 12453      if (!(txn->mt_dbistate[i] & DBI_VALID))
 12454        continue;
 12455      rc = cursor_init(&cx.outer, txn, i);
 12456      if (unlikely(rc != MDBX_SUCCESS))
 12457        return rc;
 12458      txn->mt_dbistate[i] |= DBI_AUDITED;
 12459      if (txn->mt_dbs[i].md_root == P_INVALID)
 12460        continue;
 12461      used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages +
 12462              txn->mt_dbs[i].md_overflow_pages;
 12463  
 12464      if (i != MAIN_DBI)
 12465        continue;
 12466      rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST);
 12467      while (rc == MDBX_SUCCESS) {
 12468        MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top];
 12469        for (unsigned j = 0; j < page_numkeys(mp); j++) {
 12470          MDBX_node *node = page_node(mp, j);
 12471          if (node_flags(node) == F_SUBDATA) {
 12472            if (unlikely(node_ds(node) != sizeof(MDBX_db)))
 12473              return MDBX_CORRUPTED;
 12474            MDBX_db db_copy, *db;
 12475            memcpy(db = &db_copy, node_data(node), sizeof(db_copy));
 12476            if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
 12477              for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) {
 12478                if ((txn->mt_dbistate[k] & DBI_VALID) &&
 12479                    /* txn->mt_dbxs[k].md_name.iov_len > 0 && */
 12480                    node_ks(node) == txn->mt_dbxs[k].md_name.iov_len &&
 12481                    memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base,
 12482                           node_ks(node)) == 0) {
 12483                  txn->mt_dbistate[k] |= DBI_AUDITED;
 12484                  if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE))
 12485                    db = txn->mt_dbs + k;
 12486                  break;
 12487                }
 12488              }
 12489            }
 12490            used +=
 12491                db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages;
 12492          }
 12493        }
 12494        rc = cursor_sibling(&cx.outer, SIBLING_RIGHT);
 12495      }
 12496      tASSERT(txn, rc == MDBX_NOTFOUND);
 12497    }
 12498  
 12499    for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) {
 12500      if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) !=
 12501          DBI_VALID)
 12502        continue;
 12503      for (MDBX_txn *t = txn; t; t = t->mt_parent)
 12504        if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) {
 12505          used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages +
 12506                  t->mt_dbs[i].md_overflow_pages;
 12507          txn->mt_dbistate[i] |= DBI_AUDITED;
 12508          break;
 12509        }
 12510      if (!(txn->mt_dbistate[i] & DBI_AUDITED)) {
 12511        WARNING("audit %s@%" PRIaTXN
 12512                ": unable account dbi %d / \"%*s\", state 0x%02x",
 12513                txn->mt_parent ? "nested-" : "", txn->mt_txnid, i,
 12514                (int)txn->mt_dbxs[i].md_name.iov_len,
 12515                (const char *)txn->mt_dbxs[i].md_name.iov_base,
 12516                txn->mt_dbistate[i]);
 12517      }
 12518    }
 12519  
 12520    if (pending + gc + used == txn->mt_next_pgno)
 12521      return MDBX_SUCCESS;
 12522  
 12523    if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0)
 12524      ERROR("audit @%" PRIaTXN ": %u(pending) = %u(loose) + "
 12525            "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)",
 12526            txn->mt_txnid, pending, txn->tw.loose_count,
 12527            MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
 12528            txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0,
 12529            retired_stored);
 12530    ERROR("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO
 12531          "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO
 12532          "(allocated)",
 12533          txn->mt_txnid, pending, gc, used, pending + gc + used,
 12534          txn->mt_next_pgno);
 12535    return MDBX_PROBLEM;
 12536  }
 12537  
/* Bookkeeping state of the GC update, carried across the retry-loop
 * iterations of update_gc(). Everything up to `cursor` is zeroed by
 * gcu_context_init(). */
typedef struct gc_update_context {
  /* retired_stored: number of retired pages already written into the GC;
   * loop: retry counter used as a paranoia bailout limit */
  unsigned retired_stored, loop;
  /* slot-reservation bookkeeping for storing the reclaimed page-list */
  unsigned settled, cleaned_slot, reused_slot, filled_slot;
  /* cleaned_id: GC record id being cleaned; rid: reclaiming txn-id cursor */
  txnid_t cleaned_id, rid;
  /* lifo: MDBX_LIFORECLAIM mode is on; dense: GC is in a degraded/dense
   * state requiring extra cleanup on each retry */
  bool lifo, dense;
#if MDBX_ENABLE_BIGFOOT
  /* txnid used as the key when splitting the retired-list into chunks */
  txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
  /* cursor over the GC (FREE_DBI) tree; intentionally NOT zeroed by
   * gcu_context_init() — it is set up via cursor_init() instead */
  MDBX_cursor_couple cursor;
} gcu_context_t;
 12548  
 12549  static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) {
 12550    memset(ctx, 0, offsetof(gcu_context_t, cursor));
 12551    ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0;
 12552  #if MDBX_ENABLE_BIGFOOT
 12553    ctx->bigfoot = txn->mt_txnid;
 12554  #endif /* MDBX_ENABLE_BIGFOOT */
 12555    return cursor_init(&ctx->cursor.outer, txn, FREE_DBI);
 12556  }
 12557  
 12558  static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) {
 12559    return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count;
 12560  }
 12561  
/* Removes the already-stored retired-list record(s) from the GC, so they can
 * be re-saved later. With MDBX_ENABLE_BIGFOOT the retired list may have been
 * split into several chunks keyed by consecutive txnids starting at
 * txn->mt_txnid, so all of them are deleted by walking ctx->bigfoot back
 * down to txn->mt_txnid; otherwise there is a single record keyed by the
 * txnid itself. Returns MDBX_SUCCESS or an error from the cursor delete. */
static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) {
  int err = MDBX_SUCCESS;
  if (ctx->retired_stored)
    do {
      MDBX_val key, val;
#if MDBX_ENABLE_BIGFOOT
      key.iov_base = &ctx->bigfoot;
#else
      key.iov_base = &txn->mt_txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
      key.iov_len = sizeof(txnid_t);
      const struct cursor_set_result csr =
          cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET);
      if (csr.err == MDBX_SUCCESS && csr.exact) {
        /* record found: forget the stored count and delete it */
        ctx->retired_stored = 0;
        err = mdbx_cursor_del(&ctx->cursor.outer, 0);
        TRACE("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn),
              err);
      }
    }
#if MDBX_ENABLE_BIGFOOT
    /* step back through the chunk keys until reaching the base txnid */
    while (!err && --ctx->bigfoot >= txn->mt_txnid);
#else
    while (0);
#endif /* MDBX_ENABLE_BIGFOOT */
  return err;
}
 12589  
/* Prepare a backlog of pages to modify GC itself, while reclaiming is
 * prohibited. It should be enough to prevent search in page_alloc_slowpath()
 * during a deleting, when GC tree is unbalanced.
 *
 * When reserve4retired is set, additionally ensures enough room for an
 * overflow-page run large enough to hold the whole retired page-list.
 * Temporarily clears C_RECLAIMING on the GC cursor while touching/allocating,
 * then restores it. Returns MDBX_SUCCESS (MDBX_NOTFOUND from the allocator
 * is treated as success) or an error code. */
static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
                               const bool reserve4retired) {
  /* number of overflow pages needed to store the retired list as one run */
  const unsigned pages4retiredlist =
      reserve4retired ? number_of_ovpages(
                            txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages))
                      : 0;
  /* worst-case pages for copy-on-write of the GC path, plus rebalancing */
  const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth;
  const unsigned backlog4rebalance = backlog4cow + 1;

  /* fast path: the backlog is already big enough */
  if (likely(pages4retiredlist < 2 &&
             gcu_backlog_size(txn) > (reserve4retired
                                          ? backlog4rebalance
                                          : (backlog4cow + backlog4rebalance))))
    return MDBX_SUCCESS;

  TRACE(">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u",
        reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist,
        backlog4cow, backlog4rebalance);

  int err;
  if (unlikely(pages4retiredlist > 2)) {
    /* a large retired-list is coming: spill dirty pages to make room,
     * sizing the request by the key/value that will be stored */
    MDBX_val key, val;
    key.iov_base = val.iov_base = nullptr;
    key.iov_len = sizeof(txnid_t);
    val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
    err = cursor_spill(&ctx->cursor.outer, &key, &val);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }

  /* allow allocation from GC while touching the GC path itself */
  ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
  err = cursor_touch(&ctx->cursor.outer);
  TRACE("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err);

  if (unlikely(pages4retiredlist > 1) &&
      MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored &&
      err == MDBX_SUCCESS) {
    /* reserve a contiguous (linear) run for the retired list; the stale
     * stored copy must be removed first */
    tASSERT(txn, reserve4retired);
    err = gcu_clean_stored_retired(txn, ctx);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist,
                              MDBX_ALLOC_GC | MDBX_ALLOC_FAKE)
              .err;
    TRACE("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err);
    cASSERT(&ctx->cursor.outer,
            gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS);
  }

  /* top up the backlog page-by-page until it covers the worst case */
  while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist &&
         err == MDBX_SUCCESS)
    err = page_alloc_slowpath(&ctx->cursor.outer, 0,
                              MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
                                  MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG)
              .err;

  ctx->cursor.outer.mc_flags |= C_RECLAIMING;
  TRACE("<< backlog %u, err %d", gcu_backlog_size(txn), err);
  return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS;
}
 12653  
 12654  static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) {
 12655    /* PNL is initially empty, zero out at least the length */
 12656    memset(pnl.iov_base, 0, sizeof(pgno_t));
 12657    if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0)
 12658      /* zero out to avoid leaking values from uninitialized malloc'ed memory
 12659       * to the file in non-writemap mode if length of the saving page-list
 12660       * was changed during space reservation. */
 12661      memset(pnl.iov_base, 0, pnl.iov_len);
 12662  }
 12663  
/* Cleans up reclaimed GC (aka freeDB) records, saves the retired-list (aka
 * freelist) of the current transaction to GC, and puts the leftover of the
 * reclaimed pages back into GC in chunks. This recursively changes the
 * reclaimed-list, loose-list and retired-list, so keep retrying until the
 * lists stabilize.
 *
 * NOTE: This code is the result of many iterations of adding crutches (aka
 * "checks and balances") to partially bypass the fundamental design problems
 * inherited from LMDB. So do not try to understand it completely, for the
 * sake of your sanity. */
 12673  static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) {
 12674    TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid);
 12675    MDBX_env *const env = txn->mt_env;
 12676    const char *const dbg_prefix_mode = ctx->lifo ? "    lifo" : "    fifo";
 12677    (void)dbg_prefix_mode;
 12678    ctx->cursor.outer.mc_flags |= C_RECLAIMING;
 12679    ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI];
 12680    txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer;
 12681  
 12682    /* txn->tw.reclaimed_pglist[] can grow and shrink during this call.
 12683     * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow.
 12684     * Page numbers cannot disappear from txn->tw.retired_pages[]. */
 12685  
 12686  retry:
 12687    ++ctx->loop;
 12688    TRACE("%s", " >> restart");
 12689    int rc = MDBX_SUCCESS;
 12690    tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 12691                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 12692    tASSERT(txn, dirtylist_check(txn));
 12693    if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) {
 12694      ERROR("too more loops %u, bailout", ctx->loop);
 12695      rc = MDBX_PROBLEM;
 12696      goto bailout;
 12697    }
 12698  
 12699    if (unlikely(ctx->dense)) {
 12700      rc = gcu_clean_stored_retired(txn, ctx);
 12701      if (unlikely(rc != MDBX_SUCCESS))
 12702        goto bailout;
 12703    }
 12704  
 12705    ctx->settled = 0;
 12706    ctx->cleaned_slot = 0;
 12707    ctx->reused_slot = 0;
 12708    ctx->filled_slot = ~0u;
 12709    ctx->cleaned_id = 0;
 12710    ctx->rid = txn->tw.last_reclaimed;
 12711    while (true) {
 12712      /* Come back here after each Put() in case retired-list changed */
 12713      MDBX_val key, data;
 12714      TRACE("%s", " >> continue");
 12715  
 12716      if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) &&
 12717          (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page ||
 12718           ctx->retired_stored > env->me_maxgc_ov1page)) {
 12719        rc = gcu_prepare_backlog(txn, ctx, true);
 12720        if (unlikely(rc != MDBX_SUCCESS))
 12721          goto bailout;
 12722      }
 12723  
 12724      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 12725                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 12726      if (ctx->lifo) {
 12727        if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed
 12728                                     ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 12729                                     : 0)) {
 12730          ctx->settled = 0;
 12731          ctx->cleaned_slot = 0;
 12732          ctx->reused_slot = 0;
 12733          ctx->filled_slot = ~0u;
 12734          /* LY: cleanup reclaimed records. */
 12735          do {
 12736            ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot];
 12737            tASSERT(txn,
 12738                    ctx->cleaned_slot > 0 &&
 12739                        ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak);
 12740            key.iov_base = &ctx->cleaned_id;
 12741            key.iov_len = sizeof(ctx->cleaned_id);
 12742            rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET);
 12743            if (rc == MDBX_NOTFOUND)
 12744              continue;
 12745            if (unlikely(rc != MDBX_SUCCESS))
 12746              goto bailout;
 12747            if (likely(!ctx->dense)) {
 12748              rc = gcu_prepare_backlog(txn, ctx, false);
 12749              if (unlikely(rc != MDBX_SUCCESS))
 12750                goto bailout;
 12751            }
 12752            tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak);
 12753            TRACE("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
 12754                  ctx->cleaned_slot, ctx->cleaned_id);
 12755            tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer);
 12756            rc = mdbx_cursor_del(&ctx->cursor.outer, 0);
 12757            if (unlikely(rc != MDBX_SUCCESS))
 12758              goto bailout;
 12759          } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
 12760          txl_sort(txn->tw.lifo_reclaimed);
 12761        }
 12762      } else {
 12763        /* If using records from GC which we have not yet deleted,
 12764         * now delete them and any we reserved for tw.reclaimed_pglist. */
 12765        while (ctx->cleaned_id <= txn->tw.last_reclaimed) {
 12766          rc = cursor_first(&ctx->cursor.outer, &key, NULL);
 12767          if (rc == MDBX_NOTFOUND)
 12768            break;
 12769          if (unlikely(rc != MDBX_SUCCESS))
 12770            goto bailout;
 12771          if (!MDBX_DISABLE_VALIDATION &&
 12772              unlikely(key.iov_len != sizeof(txnid_t))) {
 12773            rc = MDBX_CORRUPTED;
 12774            goto bailout;
 12775          }
 12776          ctx->rid = ctx->cleaned_id;
 12777          ctx->settled = 0;
 12778          ctx->reused_slot = 0;
 12779          ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base);
 12780          if (ctx->cleaned_id > txn->tw.last_reclaimed)
 12781            break;
 12782          if (likely(!ctx->dense)) {
 12783            rc = gcu_prepare_backlog(txn, ctx, false);
 12784            if (unlikely(rc != MDBX_SUCCESS))
 12785              goto bailout;
 12786          }
 12787          tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed);
 12788          tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak);
 12789          TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
 12790                ctx->cleaned_id);
 12791          tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer);
 12792          rc = mdbx_cursor_del(&ctx->cursor.outer, 0);
 12793          if (unlikely(rc != MDBX_SUCCESS))
 12794            goto bailout;
 12795        }
 12796      }
 12797  
 12798      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 12799                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 12800      tASSERT(txn, dirtylist_check(txn));
 12801      if (AUDIT_ENABLED()) {
 12802        rc = audit_ex(txn, ctx->retired_stored, false);
 12803        if (unlikely(rc != MDBX_SUCCESS))
 12804          goto bailout;
 12805      }
 12806  
 12807      /* return suitable into unallocated space */
 12808      if (txn_refund(txn)) {
 12809        tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 12810                                         txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 12811        if (AUDIT_ENABLED()) {
 12812          rc = audit_ex(txn, ctx->retired_stored, false);
 12813          if (unlikely(rc != MDBX_SUCCESS))
 12814            goto bailout;
 12815        }
 12816      }
 12817  
 12818      /* handle loose pages - put ones into the reclaimed- or retired-list */
 12819      if (txn->tw.loose_pages) {
 12820        /* Return loose page numbers to tw.reclaimed_pglist,
 12821         * though usually none are left at this point.
 12822         * The pages themselves remain in dirtylist. */
 12823        if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) {
 12824          if (txn->tw.loose_count > 0) {
 12825            TRACE("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode,
 12826                  txn->tw.loose_count);
 12827            rc = page_alloc_slowpath(&ctx->cursor.outer, 0,
 12828                                     MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
 12829                                         MDBX_ALLOC_FAKE)
 12830                     .err;
 12831            if (rc == MDBX_SUCCESS) {
 12832              TRACE("%s: retry since gc-slot for %u loose-pages available",
 12833                    dbg_prefix_mode, txn->tw.loose_count);
 12834              continue;
 12835            }
 12836  
 12837            /* Put loose page numbers in tw.retired_pages,
 12838             * since unable to return them to tw.reclaimed_pglist. */
 12839            if (unlikely((rc = pnl_need(&txn->tw.retired_pages,
 12840                                        txn->tw.loose_count)) != 0))
 12841              goto bailout;
 12842            for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next)
 12843              pnl_xappend(txn->tw.retired_pages, mp->mp_pgno);
 12844            TRACE("%s: append %u loose-pages to retired-pages", dbg_prefix_mode,
 12845                  txn->tw.loose_count);
 12846          }
 12847        } else {
 12848          /* Room for loose pages + temp PNL with same */
 12849          rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2);
 12850          if (unlikely(rc != MDBX_SUCCESS))
 12851            goto bailout;
 12852          MDBX_PNL loose = txn->tw.reclaimed_pglist +
 12853                           MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) -
 12854                           txn->tw.loose_count - 1;
 12855          unsigned count = 0;
 12856          for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) {
 12857            tASSERT(txn, mp->mp_flags == P_LOOSE);
 12858            loose[++count] = mp->mp_pgno;
 12859          }
 12860          tASSERT(txn, count == txn->tw.loose_count);
 12861          MDBX_PNL_SIZE(loose) = count;
 12862          pnl_sort(loose, txn->mt_next_pgno);
 12863          pnl_merge(txn->tw.reclaimed_pglist, loose);
 12864          TRACE("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode,
 12865                txn->tw.loose_count);
 12866        }
 12867  
 12868        /* filter-out list of dirty-pages from loose-pages */
 12869        MDBX_dpl *const dl = txn->tw.dirtylist;
 12870        unsigned w = 0;
 12871        for (unsigned r = w; ++r <= dl->length;) {
 12872          MDBX_page *dp = dl->items[r].ptr;
 12873          tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
 12874          tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno);
 12875          if ((dp->mp_flags & P_LOOSE) == 0) {
 12876            if (++w != r)
 12877              dl->items[w] = dl->items[r];
 12878          } else {
 12879            tASSERT(txn, dp->mp_flags == P_LOOSE);
 12880            if ((env->me_flags & MDBX_WRITEMAP) == 0)
 12881              dpage_free(env, dp, 1);
 12882          }
 12883        }
 12884        TRACE("%s: filtered-out loose-pages from %u -> %u dirty-pages",
 12885              dbg_prefix_mode, dl->length, w);
 12886        tASSERT(txn, txn->tw.loose_count == dl->length - w);
 12887        dpl_setlen(dl, w);
 12888        dl->sorted = 0;
 12889        dl->pages_including_loose -= txn->tw.loose_count;
 12890        txn->tw.dirtyroom += txn->tw.loose_count;
 12891        tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
 12892                         (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
 12893                                         : txn->mt_env->me_options.dp_limit));
 12894        txn->tw.loose_pages = NULL;
 12895        txn->tw.loose_count = 0;
 12896  #if MDBX_ENABLE_REFUND
 12897        txn->tw.loose_refund_wl = 0;
 12898  #endif /* MDBX_ENABLE_REFUND */
 12899      }
 12900  
 12901      const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
 12902      /* handle retired-list - store ones into single gc-record */
 12903      if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) {
 12904        if (unlikely(!ctx->retired_stored)) {
 12905          /* Make sure last page of GC is touched and on retired-list */
 12906          ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
 12907          rc = page_search(&ctx->cursor.outer, NULL,
 12908                           MDBX_PS_LAST | MDBX_PS_MODIFY);
 12909          ctx->cursor.outer.mc_flags |= C_RECLAIMING;
 12910          if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND)
 12911            goto bailout;
 12912        }
 12913  
 12914  #if MDBX_ENABLE_BIGFOOT
 12915        unsigned retired_pages_before;
 12916        do {
 12917          if (ctx->bigfoot > txn->mt_txnid) {
 12918            rc = gcu_clean_stored_retired(txn, ctx);
 12919            tASSERT(txn, ctx->bigfoot <= txn->mt_txnid);
 12920          }
 12921  
 12922          retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages);
 12923          rc = gcu_prepare_backlog(txn, ctx, true);
 12924          if (unlikely(rc != MDBX_SUCCESS))
 12925            goto bailout;
 12926  
 12927          pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno);
 12928          ctx->retired_stored = 0;
 12929          ctx->bigfoot = txn->mt_txnid;
 12930          do {
 12931            key.iov_len = sizeof(txnid_t);
 12932            key.iov_base = &ctx->bigfoot;
 12933            const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) -
 12934                                  ctx->retired_stored;
 12935            const unsigned chunk =
 12936                (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID)
 12937                    ? env->me_maxgc_ov1page
 12938                    : left;
 12939            data.iov_len = (chunk + 1) * sizeof(pgno_t);
 12940            rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE);
 12941            if (unlikely(rc != MDBX_SUCCESS))
 12942              goto bailout;
 12943  
 12944            if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) {
 12945              const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING)
 12946                                      ? left - chunk
 12947                                      : ctx->retired_stored;
 12948              pgno_t *const begin = txn->tw.retired_pages + at;
 12949              /* MDBX_PNL_ASCENDING == false && LIFO == false:
 12950               *  - the larger pgno is at the beginning of retired list
 12951               *    and should be placed with the larger txnid.
 12952               * MDBX_PNL_ASCENDING == true && LIFO == true:
 12953               *  - the larger pgno is at the ending of retired list
 12954               *    and should be placed with the smaller txnid.
 12955               */
 12956              const pgno_t save = *begin;
 12957              *begin = chunk;
 12958              memcpy(data.iov_base, begin, data.iov_len);
 12959              *begin = save;
 12960              TRACE("%s: put-retired/bigfoot @ %" PRIaTXN
 12961                    " (slice #%u) #%u [%u..%u] of %u",
 12962                    dbg_prefix_mode, ctx->bigfoot,
 12963                    (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at,
 12964                    at + chunk, retired_pages_before);
 12965            }
 12966            ctx->retired_stored += chunk;
 12967          } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) &&
 12968                   (++ctx->bigfoot, true));
 12969        } while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages));
 12970  #else
 12971        /* Write to last page of GC */
 12972        key.iov_len = sizeof(txnid_t);
 12973        key.iov_base = &txn->mt_txnid;
 12974        do {
 12975          gcu_prepare_backlog(txn, ctx, true);
 12976          data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
 12977          rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE);
 12978          if (unlikely(rc != MDBX_SUCCESS))
 12979            goto bailout;
 12980          /* Retry if tw.retired_pages[] grew during the Put() */
 12981        } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages));
 12982  
 12983        ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages);
 12984        pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno);
 12985        eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages));
 12986        memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len);
 12987  
 12988        TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode,
 12989              ctx->retired_stored, txn->mt_txnid);
 12990  #endif /* MDBX_ENABLE_BIGFOOT */
 12991        if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
 12992          unsigned i = ctx->retired_stored;
 12993          DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %u, retired-PNL",
 12994                      txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
 12995          for (; i; i--)
 12996            DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]);
 12997          DEBUG_EXTRA_PRINT("%s\n", ".");
 12998        }
 12999        if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
 13000                     ctx->settled)) {
 13001          TRACE("%s: reclaimed-list changed %u -> %u, retry", dbg_prefix_mode,
 13002                amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
 13003          goto retry /* rare case, but avoids GC fragmentation
 13004                                  and one cycle. */
 13005              ;
 13006        }
 13007        continue;
 13008      }
 13009  
 13010      /* handle reclaimed and lost pages - merge and store both into gc */
 13011      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 13012                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 13013      tASSERT(txn, txn->tw.loose_count == 0);
 13014  
 13015      TRACE("%s", " >> reserving");
 13016      if (AUDIT_ENABLED()) {
 13017        rc = audit_ex(txn, ctx->retired_stored, false);
 13018        if (unlikely(rc != MDBX_SUCCESS))
 13019          goto bailout;
 13020      }
 13021      const unsigned left = amount - ctx->settled;
 13022      TRACE("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, "
 13023            "reused-gc-slots %u",
 13024            dbg_prefix_mode, amount, ctx->settled, (int)left,
 13025            txn->tw.lifo_reclaimed
 13026                ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 13027                : 0,
 13028            ctx->reused_slot);
 13029      if (0 >= (int)left)
 13030        break;
 13031  
 13032      const unsigned prefer_max_scatter = 257;
 13033      txnid_t reservation_gc_id;
 13034      if (ctx->lifo) {
 13035        if (txn->tw.lifo_reclaimed == nullptr) {
 13036          txn->tw.lifo_reclaimed = txl_alloc();
 13037          if (unlikely(!txn->tw.lifo_reclaimed)) {
 13038            rc = MDBX_ENOMEM;
 13039            goto bailout;
 13040          }
 13041        }
 13042        if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
 13043                prefer_max_scatter &&
 13044            left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
 13045                    ctx->reused_slot) *
 13046                       env->me_maxgc_ov1page &&
 13047            !ctx->dense) {
 13048          /* LY: need just a txn-id for save page list. */
 13049          bool need_cleanup = false;
 13050          txnid_t snap_oldest;
 13051        retry_rid:
 13052          ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
 13053          do {
 13054            snap_oldest = txn_oldest_reader(txn);
 13055            rc = page_alloc_slowpath(&ctx->cursor.outer, 0,
 13056                                     MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
 13057                                         MDBX_ALLOC_FAKE)
 13058                     .err;
 13059            if (likely(rc == MDBX_SUCCESS)) {
 13060              TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode,
 13061                    MDBX_PNL_LAST(txn->tw.lifo_reclaimed));
 13062              need_cleanup = true;
 13063            }
 13064          } while (rc == MDBX_SUCCESS &&
 13065                   (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
 13066                       prefer_max_scatter &&
 13067                   left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
 13068                           ctx->reused_slot) *
 13069                              env->me_maxgc_ov1page);
 13070          ctx->cursor.outer.mc_flags |= C_RECLAIMING;
 13071  
 13072          if (likely(rc == MDBX_SUCCESS)) {
 13073            TRACE("%s: got enough from GC.", dbg_prefix_mode);
 13074            continue;
 13075          } else if (unlikely(rc != MDBX_NOTFOUND))
 13076            /* LY: some troubles... */
 13077            goto bailout;
 13078  
 13079          if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
 13080            if (need_cleanup) {
 13081              txl_sort(txn->tw.lifo_reclaimed);
 13082              ctx->cleaned_slot = 0;
 13083            }
 13084            ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed);
 13085          } else {
 13086            tASSERT(txn, txn->tw.last_reclaimed == 0);
 13087            if (unlikely(txn_oldest_reader(txn) != snap_oldest))
 13088              /* should retry page_alloc_slowpath(MDBX_ALLOC_GC)
 13089               * if the oldest reader changes since the last attempt */
 13090              goto retry_rid;
 13091            /* no reclaimable GC entries,
 13092             * therefore no entries with ID < mdbx_find_oldest(txn) */
 13093            txn->tw.last_reclaimed = ctx->rid = snap_oldest;
 13094            TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode,
 13095                  ctx->rid);
 13096          }
 13097  
 13098          /* LY: GC is empty, will look any free txn-id in high2low order. */
 13099          while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter &&
 13100                 left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
 13101                         ctx->reused_slot) *
 13102                            env->me_maxgc_ov1page) {
 13103            if (unlikely(ctx->rid <= MIN_TXNID)) {
 13104              if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <=
 13105                           ctx->reused_slot)) {
 13106                NOTICE("** restart: reserve depleted (reused_gc_slot %u >= "
 13107                       "lifo_reclaimed %u" PRIaTXN,
 13108                       ctx->reused_slot,
 13109                       (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
 13110                goto retry;
 13111              }
 13112              break;
 13113            }
 13114  
 13115            tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID);
 13116            --ctx->rid;
 13117            key.iov_base = &ctx->rid;
 13118            key.iov_len = sizeof(ctx->rid);
 13119            rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY);
 13120            if (unlikely(rc == MDBX_SUCCESS)) {
 13121              DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search",
 13122                    dbg_prefix_mode, ctx->rid);
 13123              ++ctx->rid;
 13124              rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST);
 13125              if (rc == MDBX_NOTFOUND) {
 13126                DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode);
 13127                ctx->dense = true;
 13128                break;
 13129              }
 13130              if (unlikely(rc != MDBX_SUCCESS ||
 13131                           key.iov_len != sizeof(txnid_t))) {
 13132                rc = MDBX_CORRUPTED;
 13133                goto bailout;
 13134              }
 13135              txnid_t gc_first = unaligned_peek_u64(4, key.iov_base);
 13136              if (gc_first <= MIN_TXNID) {
 13137                DEBUG("%s: no free GC's id(s) less than %" PRIaTXN
 13138                      " (going dense-mode)",
 13139                      dbg_prefix_mode, ctx->rid);
 13140                ctx->dense = true;
 13141                break;
 13142              }
 13143              ctx->rid = gc_first - 1;
 13144            }
 13145  
 13146            eASSERT(env, !ctx->dense);
 13147            rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid);
 13148            if (unlikely(rc != MDBX_SUCCESS))
 13149              goto bailout;
 13150  
 13151            if (ctx->reused_slot)
 13152              /* rare case, but it is better to clear and re-create GC entries
 13153               * with less fragmentation. */
 13154              need_cleanup = true;
 13155            else
 13156              ctx->cleaned_slot +=
 13157                  1 /* mark cleanup is not needed for added slot. */;
 13158  
 13159            TRACE("%s: append @%" PRIaTXN
 13160                  " to lifo-reclaimed, cleaned-gc-slot = %u",
 13161                  dbg_prefix_mode, ctx->rid, ctx->cleaned_slot);
 13162          }
 13163  
 13164          if (need_cleanup || ctx->dense) {
 13165            if (ctx->cleaned_slot)
 13166              TRACE("%s: restart inner-loop to clear and re-create GC entries",
 13167                    dbg_prefix_mode);
 13168            ctx->cleaned_slot = 0;
 13169            continue;
 13170          }
 13171        }
 13172  
 13173        const unsigned i =
 13174            (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot;
 13175        tASSERT(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
 13176        reservation_gc_id = txn->tw.lifo_reclaimed[i];
 13177        TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode,
 13178              reservation_gc_id, i);
 13179      } else {
 13180        tASSERT(txn, txn->tw.lifo_reclaimed == NULL);
 13181        if (unlikely(ctx->rid == 0)) {
 13182          ctx->rid = txn_oldest_reader(txn);
 13183          rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST);
 13184          if (rc == MDBX_SUCCESS) {
 13185            if (unlikely(key.iov_len != sizeof(txnid_t))) {
 13186              rc = MDBX_CORRUPTED;
 13187              goto bailout;
 13188            }
 13189            txnid_t gc_first = unaligned_peek_u64(4, key.iov_base);
 13190            if (ctx->rid >= gc_first)
 13191              ctx->rid = gc_first - 1;
 13192            if (unlikely(ctx->rid == 0)) {
 13193              ERROR("%s", "** no GC tail-space to store (going dense-mode)");
 13194              ctx->dense = true;
 13195              goto retry;
 13196            }
 13197          } else if (rc != MDBX_NOTFOUND)
 13198            goto bailout;
 13199          txn->tw.last_reclaimed = ctx->rid;
 13200          ctx->cleaned_id = ctx->rid + 1;
 13201        }
 13202        reservation_gc_id = ctx->rid--;
 13203        TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode,
 13204              reservation_gc_id);
 13205      }
 13206      ++ctx->reused_slot;
 13207  
 13208      unsigned chunk = left;
 13209      if (unlikely(chunk > env->me_maxgc_ov1page)) {
 13210        const unsigned avail_gc_slots =
 13211            txn->tw.lifo_reclaimed
 13212                ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
 13213                      ctx->reused_slot + 1
 13214            : (ctx->rid < INT16_MAX) ? (unsigned)ctx->rid
 13215                                     : INT16_MAX;
 13216        if (avail_gc_slots > 1) {
 13217          if (chunk < env->me_maxgc_ov1page * 2)
 13218            chunk /= 2;
 13219          else {
 13220            const unsigned threshold =
 13221                env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter)
 13222                                             ? avail_gc_slots
 13223                                             : prefer_max_scatter);
 13224            if (left < threshold)
 13225              chunk = env->me_maxgc_ov1page;
 13226            else {
 13227              const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1;
 13228              unsigned span = 1;
 13229              unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
 13230                                          sizeof(pgno_t)) /* - 1 + span */;
 13231              if (tail > avail) {
 13232                for (unsigned i = amount - span; i > 0; --i) {
 13233                  if (MDBX_PNL_ASCENDING
 13234                          ? (txn->tw.reclaimed_pglist[i] + span)
 13235                          : (txn->tw.reclaimed_pglist[i] - span) ==
 13236                                txn->tw.reclaimed_pglist[i + span]) {
 13237                    span += 1;
 13238                    avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
 13239                                       sizeof(pgno_t)) -
 13240                            1 + span;
 13241                    if (avail >= tail)
 13242                      break;
 13243                  }
 13244                }
 13245              }
 13246  
 13247              chunk = (avail >= tail) ? tail - span
 13248                      : (avail_gc_slots > 3 &&
 13249                         ctx->reused_slot < prefer_max_scatter - 3)
 13250                          ? avail - span
 13251                          : tail;
 13252            }
 13253          }
 13254        }
 13255      }
 13256      tASSERT(txn, chunk > 0);
 13257  
 13258      TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id "
 13259            "%" PRIaTXN,
 13260            dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id);
 13261  
 13262      TRACE("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk,
 13263            env->me_maxgc_ov1page);
 13264  
 13265      tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak);
 13266      if (unlikely(
 13267              reservation_gc_id < MIN_TXNID ||
 13268              reservation_gc_id >
 13269                  atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) {
 13270        ERROR("** internal error (reservation_gc_id %" PRIaTXN ")",
 13271              reservation_gc_id);
 13272        rc = MDBX_PROBLEM;
 13273        goto bailout;
 13274      }
 13275  
 13276      key.iov_len = sizeof(reservation_gc_id);
 13277      key.iov_base = &reservation_gc_id;
 13278      data.iov_len = (chunk + 1) * sizeof(pgno_t);
 13279      TRACE("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk,
 13280            ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id);
 13281      gcu_prepare_backlog(txn, ctx, true);
 13282      rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data,
 13283                           MDBX_RESERVE | MDBX_NOOVERWRITE);
 13284      tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 13285                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 13286      if (unlikely(rc != MDBX_SUCCESS))
 13287        goto bailout;
 13288  
 13289      gcu_clean_reserved(env, data);
 13290      ctx->settled += chunk;
 13291      TRACE("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled,
 13292            chunk);
 13293  
 13294      if (txn->tw.lifo_reclaimed &&
 13295          unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) &&
 13296          (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount >
 13297                                env->me_maxgc_ov1page)) {
 13298        NOTICE("** restart: reclaimed-list growth %u -> %u", amount,
 13299               (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
 13300        goto retry;
 13301      }
 13302  
 13303      continue;
 13304    }
 13305  
 13306    tASSERT(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed
 13307                                           ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 13308                                           : 0));
 13309  
 13310    TRACE("%s", " >> filling");
 13311    /* Fill in the reserved records */
 13312    ctx->filled_slot =
 13313        txn->tw.lifo_reclaimed
 13314            ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot
 13315            : ctx->reused_slot;
 13316    rc = MDBX_SUCCESS;
 13317    tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
 13318                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
 13319    tASSERT(txn, dirtylist_check(txn));
 13320    if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) {
 13321      MDBX_val key, data;
 13322      key.iov_len = data.iov_len = 0; /* avoid MSVC warning */
 13323      key.iov_base = data.iov_base = NULL;
 13324  
 13325      const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
 13326      unsigned left = amount;
 13327      if (txn->tw.lifo_reclaimed == nullptr) {
 13328        tASSERT(txn, ctx->lifo == 0);
 13329        rc = cursor_first(&ctx->cursor.outer, &key, &data);
 13330        if (unlikely(rc != MDBX_SUCCESS))
 13331          goto bailout;
 13332      } else {
 13333        tASSERT(txn, ctx->lifo != 0);
 13334      }
 13335  
 13336      while (true) {
 13337        txnid_t fill_gc_id;
 13338        TRACE("%s: left %u of %u", dbg_prefix_mode, left,
 13339              (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
 13340        if (txn->tw.lifo_reclaimed == nullptr) {
 13341          tASSERT(txn, ctx->lifo == 0);
 13342          fill_gc_id = unaligned_peek_u64(4, key.iov_base);
 13343          if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) {
 13344            NOTICE(
 13345                "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN
 13346                " > last_reclaimed %" PRIaTXN,
 13347                ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed);
 13348            goto retry;
 13349          }
 13350        } else {
 13351          tASSERT(txn, ctx->lifo != 0);
 13352          if (++ctx->filled_slot >
 13353              (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
 13354            NOTICE("** restart: reserve depleted (filled_gc_slot %u > "
 13355                   "lifo_reclaimed %u" PRIaTXN,
 13356                   ctx->filled_slot,
 13357                   (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
 13358            goto retry;
 13359          }
 13360          fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot];
 13361          TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]",
 13362                dbg_prefix_mode, fill_gc_id, ctx->filled_slot);
 13363          key.iov_base = &fill_gc_id;
 13364          key.iov_len = sizeof(fill_gc_id);
 13365          rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY);
 13366          if (unlikely(rc != MDBX_SUCCESS))
 13367            goto bailout;
 13368        }
 13369        tASSERT(txn,
 13370                ctx->cleaned_slot == (txn->tw.lifo_reclaimed
 13371                                          ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 13372                                          : 0));
 13373        tASSERT(txn, fill_gc_id > 0 &&
 13374                         fill_gc_id <= env->me_lck->mti_oldest_reader.weak);
 13375        key.iov_base = &fill_gc_id;
 13376        key.iov_len = sizeof(fill_gc_id);
 13377  
 13378        tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2);
 13379        ctx->cursor.outer.mc_flags |= C_GCFREEZE;
 13380        unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1;
 13381        if (unlikely(chunk > left)) {
 13382          TRACE("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk,
 13383                left, fill_gc_id);
 13384          if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) ||
 13385              chunk - left > env->me_maxgc_ov1page) {
 13386            data.iov_len = (left + 1) * sizeof(pgno_t);
 13387            if (ctx->loop < 7)
 13388              ctx->cursor.outer.mc_flags &= ~C_GCFREEZE;
 13389          }
 13390          chunk = left;
 13391        }
 13392        rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data,
 13393                             MDBX_CURRENT | MDBX_RESERVE);
 13394        ctx->cursor.outer.mc_flags &= ~C_GCFREEZE;
 13395        if (unlikely(rc != MDBX_SUCCESS))
 13396          goto bailout;
 13397        gcu_clean_reserved(env, data);
 13398  
 13399        if (unlikely(txn->tw.loose_count ||
 13400                     amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
 13401          NOTICE("** restart: reclaimed-list growth (%u -> %u, loose +%u)",
 13402                 amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
 13403                 txn->tw.loose_count);
 13404          goto retry;
 13405        }
 13406        if (unlikely(txn->tw.lifo_reclaimed
 13407                         ? ctx->cleaned_slot <
 13408                               MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 13409                         : ctx->cleaned_id < txn->tw.last_reclaimed)) {
 13410          NOTICE("%s", "** restart: reclaimed-slots changed");
 13411          goto retry;
 13412        }
 13413        if (unlikely(ctx->retired_stored !=
 13414                     MDBX_PNL_SIZE(txn->tw.retired_pages))) {
 13415          tASSERT(txn,
 13416                  ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages));
 13417          NOTICE("** restart: retired-list growth (%u -> %u)",
 13418                 ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages));
 13419          goto retry;
 13420        }
 13421  
 13422        pgno_t *dst = data.iov_base;
 13423        *dst++ = chunk;
 13424        pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk;
 13425        memcpy(dst, src, chunk * sizeof(pgno_t));
 13426        pgno_t *from = src, *to = src + chunk;
 13427        TRACE("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN,
 13428              dbg_prefix_mode, chunk, (unsigned)(from - txn->tw.reclaimed_pglist),
 13429              from[0], (unsigned)(to - txn->tw.reclaimed_pglist), to[-1],
 13430              fill_gc_id);
 13431  
 13432        left -= chunk;
 13433        if (AUDIT_ENABLED()) {
 13434          rc = audit_ex(txn, ctx->retired_stored + amount - left, true);
 13435          if (unlikely(rc != MDBX_SUCCESS))
 13436            goto bailout;
 13437        }
 13438        if (left == 0) {
 13439          rc = MDBX_SUCCESS;
 13440          break;
 13441        }
 13442  
 13443        if (txn->tw.lifo_reclaimed == nullptr) {
 13444          tASSERT(txn, ctx->lifo == 0);
 13445          rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT);
 13446          if (unlikely(rc != MDBX_SUCCESS))
 13447            goto bailout;
 13448        } else {
 13449          tASSERT(txn, ctx->lifo != 0);
 13450        }
 13451      }
 13452    }
 13453  
 13454    tASSERT(txn, rc == MDBX_SUCCESS);
 13455    if (unlikely(txn->tw.loose_count != 0)) {
 13456      NOTICE("** restart: got %u loose pages", txn->tw.loose_count);
 13457      goto retry;
 13458    }
 13459    if (unlikely(ctx->filled_slot !=
 13460                 (txn->tw.lifo_reclaimed
 13461                      ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
 13462                      : 0))) {
 13463  
 13464      const bool will_retry = ctx->loop < 9;
 13465      NOTICE("** %s: reserve excess (filled-slot %u, loop %u)",
 13466             will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop);
 13467      if (will_retry)
 13468        goto retry;
 13469    }
 13470  
 13471    tASSERT(txn, txn->tw.lifo_reclaimed == NULL ||
 13472                     ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
 13473  
 13474  bailout:
 13475    txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next;
 13476  
 13477    MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0;
 13478    TRACE("<<< %u loops, rc = %d", ctx->loop, rc);
 13479    return rc;
 13480  }
 13481  
 13482  static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) {
 13483    MDBX_dpl *const dl =
 13484        (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn);
 13485    int rc = MDBX_SUCCESS;
 13486    unsigned r, w;
 13487    for (w = 0, r = 1; r <= dl->length; ++r) {
 13488      MDBX_page *dp = dl->items[r].ptr;
 13489      if (dp->mp_flags & P_LOOSE) {
 13490        dl->items[++w] = dl->items[r];
 13491        continue;
 13492      }
 13493      unsigned npages = dpl_npages(dl, r);
 13494      rc = iov_page(txn, ctx, dp, npages);
 13495      if (unlikely(rc != MDBX_SUCCESS))
 13496        break;
 13497    }
 13498  
 13499    if (ctx->iov_items) {
 13500      /* iov_page() frees dirty-pages and reset iov_items in case of failure. */
 13501      tASSERT(txn, rc == MDBX_SUCCESS);
 13502      rc = iov_write(txn, ctx);
 13503    }
 13504  
 13505    while (r <= dl->length)
 13506      dl->items[++w] = dl->items[r++];
 13507  
 13508    dl->sorted = dpl_setlen(dl, w);
 13509    txn->tw.dirtyroom += r - 1 - w;
 13510    tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
 13511                     (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
 13512                                     : txn->mt_env->me_options.dp_limit));
 13513    return rc;
 13514  }
 13515  
 13516  /* Check txn and dbi arguments to a function */
 13517  static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi,
 13518                                        unsigned validity) {
 13519    if (likely(dbi < txn->mt_numdbs)) {
 13520      if (likely(!dbi_changed(txn, dbi))) {
 13521        if (likely(txn->mt_dbistate[dbi] & validity))
 13522          return true;
 13523        if (likely(dbi < CORE_DBS ||
 13524                   (txn->mt_env->me_dbflags[dbi] & DB_VALID) == 0))
 13525          return false;
 13526      }
 13527    }
 13528    return dbi_import(txn, dbi);
 13529  }
 13530  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy exported ABI entry point: simply forwards to the inline
 * implementation (kept out-of-line only for binary compatibility). */
int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); }
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 13534  
 13535  /* Merge child txn into parent */
 13536  static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn,
 13537                                 const unsigned parent_retired_len) {
 13538    MDBX_dpl *const src = dpl_sort(txn);
 13539  
 13540    /* Remove refunded pages from parent's dirty list */
 13541    MDBX_dpl *const dst = dpl_sort(parent);
 13542    if (MDBX_ENABLE_REFUND) {
 13543      unsigned n = dst->length;
 13544      while (n && dst->items[n].pgno >= parent->mt_next_pgno) {
 13545        if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) {
 13546          unsigned npages = dpl_npages(dst, n);
 13547          dpage_free(txn->mt_env, dst->items[n].ptr, npages);
 13548        }
 13549        --n;
 13550      }
 13551      parent->tw.dirtyroom += dst->sorted - n;
 13552      dst->sorted = dpl_setlen(dst, n);
 13553      tASSERT(parent,
 13554              parent->tw.dirtyroom + parent->tw.dirtylist->length ==
 13555                  (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
 13556                                     : parent->mt_env->me_options.dp_limit));
 13557    }
 13558  
 13559    /* Remove reclaimed pages from parent's dirty list */
 13560    const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist;
 13561    dpl_sift(parent, reclaimed_list, false);
 13562  
 13563    /* Move retired pages from parent's dirty & spilled list to reclaimed */
 13564    unsigned r, w, d, s, l;
 13565    for (r = w = parent_retired_len;
 13566         ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) {
 13567      const pgno_t pgno = parent->tw.retired_pages[r];
 13568      const unsigned di = dpl_exist(parent, pgno);
 13569      const unsigned si = !di ? search_spilled(parent, pgno) : 0;
 13570      unsigned npages;
 13571      const char *kind;
 13572      if (di) {
 13573        MDBX_page *dp = dst->items[di].ptr;
 13574        tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH |
 13575                                          P_OVERFLOW | P_SPILLED)) == 0);
 13576        npages = dpl_npages(dst, di);
 13577        page_wash(parent, di, dp, npages);
 13578        kind = "dirty";
 13579        l = 1;
 13580        if (unlikely(npages > l)) {
 13581          /* OVERFLOW-страница могла быть переиспользована по частям. Тогда
 13582           * в retired-списке может быть только начало последовательности,
 13583           * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому
 13584           * переносим в reclaimed с проверкой на обрыв последовательности.
 13585           * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если
 13586           * страница была разбита на части, то важно удалить dirty-элемент,
 13587           * а все осколки будут учтены отдельно. */
 13588  
 13589          /* Список retired страниц не сортирован, но для ускорения сортировки
 13590           * дополняется в соответствии с MDBX_PNL_ASCENDING */
 13591  #if MDBX_PNL_ASCENDING
 13592          const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages);
 13593          while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) {
 13594            ++r;
 13595            if (++l == npages)
 13596              break;
 13597          }
 13598  #else
 13599          while (w > parent_retired_len &&
 13600                 parent->tw.retired_pages[w - 1] == pgno + l) {
 13601            --w;
 13602            if (++l == npages)
 13603              break;
 13604          }
 13605  #endif
 13606        }
 13607      } else if (unlikely(si)) {
 13608        l = npages = 1;
 13609        spill_remove(parent, si, 1);
 13610        kind = "spilled";
 13611      } else {
 13612        parent->tw.retired_pages[++w] = pgno;
 13613        continue;
 13614      }
 13615  
 13616      DEBUG("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, kind,
 13617            pgno);
 13618      int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l);
 13619      ENSURE(txn->mt_env, err == MDBX_SUCCESS);
 13620    }
 13621    MDBX_PNL_SIZE(parent->tw.retired_pages) = w;
 13622  
 13623    /* Filter-out parent spill list */
 13624    if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) {
 13625      const MDBX_PNL sl = spill_purge(parent);
 13626      unsigned len = MDBX_PNL_SIZE(sl);
 13627      if (len) {
 13628        /* Remove refunded pages from parent's spill list */
 13629        if (MDBX_ENABLE_REFUND &&
 13630            MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) {
 13631  #if MDBX_PNL_ASCENDING
 13632          unsigned i = MDBX_PNL_SIZE(sl);
 13633          assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
 13634          do {
 13635            if ((sl[i] & 1) == 0)
 13636              DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
 13637            i -= 1;
 13638          } while (i && sl[i] >= (parent->mt_next_pgno << 1));
 13639          MDBX_PNL_SIZE(sl) = i;
 13640  #else
 13641          assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
 13642          unsigned i = 0;
 13643          do {
 13644            ++i;
 13645            if ((sl[i] & 1) == 0)
 13646              DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
 13647          } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1));
 13648          MDBX_PNL_SIZE(sl) = len -= i;
 13649          memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
 13650  #endif
 13651        }
 13652        tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1));
 13653  
 13654        /* Remove reclaimed pages from parent's spill list */
 13655        s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list);
 13656        /* Scanning from end to begin */
 13657        while (s && r) {
 13658          if (sl[s] & 1) {
 13659            --s;
 13660            continue;
 13661          }
 13662          const pgno_t spilled_pgno = sl[s] >> 1;
 13663          const pgno_t reclaimed_pgno = reclaimed_list[r];
 13664          if (reclaimed_pgno != spilled_pgno) {
 13665            const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
 13666            s -= !cmp;
 13667            r -= cmp;
 13668          } else {
 13669            DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO,
 13670                  reclaimed_pgno);
 13671            spill_remove(parent, s, 1);
 13672            --s;
 13673            --r;
 13674          }
 13675        }
 13676  
 13677        /* Remove anything in our dirty list from parent's spill list */
 13678        /* Scanning spill list in descend order */
 13679        const int step = MDBX_PNL_ASCENDING ? -1 : 1;
 13680        s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1;
 13681        d = src->length;
 13682        while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) {
 13683          if (sl[s] & 1) {
 13684            s += step;
 13685            continue;
 13686          }
 13687          const pgno_t spilled_pgno = sl[s] >> 1;
 13688          const pgno_t dirty_pgno_form = src->items[d].pgno;
 13689          const unsigned npages = dpl_npages(src, d);
 13690          const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
 13691          if (dirty_pgno_form > spilled_pgno) {
 13692            --d;
 13693            continue;
 13694          }
 13695          if (dirty_pgno_to <= spilled_pgno) {
 13696            s += step;
 13697            continue;
 13698          }
 13699  
 13700          DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages,
 13701                dirty_pgno_form);
 13702          spill_remove(parent, s, 1);
 13703          s += step;
 13704        }
 13705  
 13706        /* Squash deleted pagenums if we deleted any */
 13707        spill_purge(parent);
 13708      }
 13709    }
 13710  
 13711    /* Remove anything in our spill list from parent's dirty list */
 13712    if (txn->tw.spill_pages) {
 13713      tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages,
 13714                                       (size_t)parent->mt_next_pgno << 1));
 13715      dpl_sift(parent, txn->tw.spill_pages, true);
 13716      tASSERT(parent,
 13717              parent->tw.dirtyroom + parent->tw.dirtylist->length ==
 13718                  (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
 13719                                     : parent->mt_env->me_options.dp_limit));
 13720    }
 13721  
 13722    /* Find length of merging our dirty list with parent's and release
 13723     * filter-out pages */
 13724    for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
 13725      MDBX_page *sp = src->items[s].ptr;
 13726      tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
 13727                                        P_LOOSE | P_SPILLED)) == 0);
 13728      const unsigned s_npages = dpl_npages(src, s);
 13729      const pgno_t s_pgno = src->items[s].pgno;
 13730  
 13731      MDBX_page *dp = dst->items[d].ptr;
 13732      tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
 13733                                        P_SPILLED)) == 0);
 13734      const unsigned d_npages = dpl_npages(dst, d);
 13735      const pgno_t d_pgno = dst->items[d].pgno;
 13736  
 13737      if (d_pgno >= s_pgno + s_npages) {
 13738        --d;
 13739        ++l;
 13740      } else if (d_pgno + d_npages <= s_pgno) {
 13741        if (sp->mp_flags != P_LOOSE) {
 13742          sp->mp_txnid = parent->mt_front;
 13743          sp->mp_flags &= ~P_SPILLED;
 13744        }
 13745        --s;
 13746        ++l;
 13747      } else {
 13748        dst->items[d--].ptr = nullptr;
 13749        if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
 13750          dpage_free(txn->mt_env, dp, d_npages);
 13751      }
 13752    }
 13753    assert(dst->sorted == dst->length);
 13754    tASSERT(parent, dst->detent >= l + d + s);
 13755    dst->sorted = l + d + s; /* the merged length */
 13756  
 13757    while (s > 0) {
 13758      MDBX_page *sp = src->items[s].ptr;
 13759      tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
 13760                                        P_LOOSE | P_SPILLED)) == 0);
 13761      if (sp->mp_flags != P_LOOSE) {
 13762        sp->mp_txnid = parent->mt_front;
 13763        sp->mp_flags &= ~P_SPILLED;
 13764      }
 13765      --s;
 13766    }
 13767  
 13768    /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
 13769    if (dst->sorted >= dst->length) {
 13770      /* from end to begin with dst extending */
 13771      for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
 13772        if (unlikely(l <= d)) {
 13773          /* squash to get a gap of free space for merge */
 13774          for (r = w = 1; r <= d; ++r)
 13775            if (dst->items[r].ptr) {
 13776              if (w != r) {
 13777                dst->items[w] = dst->items[r];
 13778                dst->items[r].ptr = nullptr;
 13779              }
 13780              ++w;
 13781            }
 13782          NOTICE("squash to begin for extending-merge %u -> %u", d, w - 1);
 13783          d = w - 1;
 13784          continue;
 13785        }
 13786        assert(l > d);
 13787        if (dst->items[d].ptr) {
 13788          dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno)
 13789                                ? dst->items[d--]
 13790                                : src->items[s--];
 13791        } else
 13792          --d;
 13793      }
 13794      if (s > 0) {
 13795        assert(l == s);
 13796        while (d > 0) {
 13797          assert(dst->items[d].ptr == nullptr);
 13798          --d;
 13799        }
 13800        do {
 13801          assert(l > 0);
 13802          dst->items[l--] = src->items[s--];
 13803        } while (s > 0);
 13804      } else {
 13805        assert(l == d);
 13806        while (l > 0) {
 13807          assert(dst->items[l].ptr != nullptr);
 13808          --l;
 13809        }
 13810      }
 13811    } else {
 13812      /* from begin to end with shrinking (a lot of new large/overflow pages) */
 13813      for (l = s = d = 1; s <= src->length && d <= dst->length;) {
 13814        if (unlikely(l >= d)) {
 13815          /* squash to get a gap of free space for merge */
 13816          for (r = w = dst->length; r >= d; --r)
 13817            if (dst->items[r].ptr) {
 13818              if (w != r) {
 13819                dst->items[w] = dst->items[r];
 13820                dst->items[r].ptr = nullptr;
 13821              }
 13822              --w;
 13823            }
 13824          NOTICE("squash to end for shrinking-merge %u -> %u", d, w + 1);
 13825          d = w + 1;
 13826          continue;
 13827        }
 13828        assert(l < d);
 13829        if (dst->items[d].ptr) {
 13830          dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno)
 13831                                ? dst->items[d++]
 13832                                : src->items[s++];
 13833        } else
 13834          ++d;
 13835      }
 13836      if (s <= src->length) {
 13837        assert(dst->sorted - l == src->length - s);
 13838        while (d <= dst->length) {
 13839          assert(dst->items[d].ptr == nullptr);
 13840          --d;
 13841        }
 13842        do {
 13843          assert(l <= dst->sorted);
 13844          dst->items[l++] = src->items[s++];
 13845        } while (s <= src->length);
 13846      } else {
 13847        assert(dst->sorted - l == dst->length - d);
 13848        while (l <= dst->sorted) {
 13849          assert(l <= d && d <= dst->length && dst->items[d].ptr);
 13850          dst->items[l++] = dst->items[d++];
 13851        }
 13852      }
 13853    }
 13854    parent->tw.dirtyroom -= dst->sorted - dst->length;
 13855    assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit);
 13856    dpl_setlen(dst, dst->sorted);
 13857    parent->tw.dirtylru = txn->tw.dirtylru;
 13858  
 13859    /* В текущем понимании выгоднее пересчитать кол-во страниц,
 13860     * чем подмешивать лишние ветвления и вычисления в циклы выше. */
 13861    dst->pages_including_loose = 0;
 13862    for (r = 1; r <= dst->length; ++r)
 13863      dst->pages_including_loose += dpl_npages(dst, r);
 13864  
 13865    tASSERT(parent, dirtylist_check(parent));
 13866    dpl_free(txn);
 13867  
 13868    if (txn->tw.spill_pages) {
 13869      if (parent->tw.spill_pages) {
 13870        /* Must not fail since space was preserved above. */
 13871        pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages);
 13872        pnl_free(txn->tw.spill_pages);
 13873      } else {
 13874        parent->tw.spill_pages = txn->tw.spill_pages;
 13875        parent->tw.spill_least_removed = txn->tw.spill_least_removed;
 13876      }
 13877      tASSERT(parent, dirtylist_check(parent));
 13878    }
 13879  
 13880    parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
 13881    if (parent->tw.spill_pages) {
 13882      assert(pnl_check_allocated(parent->tw.spill_pages,
 13883                                 (size_t)parent->mt_next_pgno << 1));
 13884      if (MDBX_PNL_SIZE(parent->tw.spill_pages))
 13885        parent->mt_flags |= MDBX_TXN_SPILLS;
 13886    }
 13887  }
 13888  
/* Commits a transaction (the engine behind mdbx_txn_commit()).
 *
 * Four distinct paths are visible below:
 *  - read-only txn: nothing to write, finished via txn_end();
 *  - nested write txn: dirty/spill/retired page lists are merged into the
 *    parent purely in memory, without any disk I/O;
 *  - empty top-level write txn: completed without touching the file;
 *  - non-empty top-level write txn: sub-DB roots are written back into
 *    MainDB, the GC is updated, dirty pages are flushed and a new meta is
 *    published via sync_locked().
 *
 * \param txn      Transaction to commit; on failure it is aborted here
 *                 (see the `fail` label).
 * \param latency  Optional out-param receiving per-stage timings in 16.16
 *                 fixed-point seconds; pass NULL to skip the measurement.
 * \returns MDBX_SUCCESS, MDBX_RESULT_TRUE (commit of an already-errored
 *          txn, or an empty commit when MDBX_NOSUCCESS_EMPTY_COMMIT is
 *          enabled), or another error code. */
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
  STATIC_ASSERT(MDBX_TXN_FINISHED ==
                MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR);
  /* ts_0..ts_4 bracket the commit stages for the optional latency report;
   * all stay zero (and are never sampled) when latency == NULL. */
  const uint64_t ts_0 = latency ? osal_monotime() : 0;
  uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0;
  uint32_t audit_duration = 0;

  int rc = check_txn(txn, MDBX_TXN_FINISHED);
  if (unlikely(rc != MDBX_SUCCESS))
    goto provide_latency;

  /* A txn already marked as errored cannot be committed: abort it and
   * report MDBX_RESULT_TRUE to distinguish this from a hard failure. */
  if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) {
    rc = MDBX_RESULT_TRUE;
    goto fail;
  }

  MDBX_env *env = txn->mt_env;
#if MDBX_ENV_CHECKPID
  if (unlikely(env->me_pid != osal_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    rc = MDBX_PANIC;
    goto provide_latency;
  }
#endif /* MDBX_ENV_CHECKPID */

  /* txn_end() mode for a commit which writes nothing */
  unsigned end_mode =
      MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE;
  if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
    goto done;

  /* Commit any pending child first (recursively); its failure aborts
   * this transaction as well. */
  if (txn->mt_child) {
    rc = mdbx_txn_commit_ex(txn->mt_child, NULL);
    tASSERT(txn, txn->mt_child == NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  }

  if (unlikely(txn != env->me_txn)) {
    DEBUG("%s", "attempt to commit unknown transaction");
    rc = MDBX_EINVAL;
    goto fail;
  }

  /* Nested write transaction: merge its changes into the parent purely
   * in memory; no disk I/O happens on this path. */
  if (txn->mt_parent) {
    tASSERT(txn, audit_ex(txn, 0, false) == 0);
    eASSERT(env, txn != env->me_txn0);
    MDBX_txn *const parent = txn->mt_parent;
    eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE);
    eASSERT(env, parent->mt_child == txn &&
                     (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
    eASSERT(env, dirtylist_check(txn));

    if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) &&
        parent->mt_numdbs == txn->mt_numdbs) {
      for (int i = txn->mt_numdbs; --i >= 0;) {
        tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
        if ((txn->mt_dbistate[i] & DBI_STALE) &&
            !(parent->mt_dbistate[i] & DBI_STALE))
          tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i],
                              sizeof(MDBX_db)) == 0);
      }

      tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo,
                          sizeof(parent->mt_geo)) == 0);
      tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary,
                          sizeof(parent->mt_canary)) == 0);
      tASSERT(txn,
              !txn->tw.spill_pages || MDBX_PNL_SIZE(txn->tw.spill_pages) == 0);
      tASSERT(txn, txn->tw.loose_count == 0);

      /* fast completion of pure nested transaction */
      end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE;
      goto done;
    }

    /* Preserve space for spill list to avoid parent's state corruption
     * if allocation fails. */
    /* NOTE(review): while a child txn is active, the parent's
     * tw.retired_pages field apparently holds the saved list length as an
     * integer (hence the cast below) — confirm against nested txn_begin. */
    const unsigned parent_retired_len =
        (unsigned)(uintptr_t)parent->tw.retired_pages;
    tASSERT(txn, parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages));
    const unsigned retired_delta =
        MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len;
    if (retired_delta) {
      rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta);
      if (unlikely(rc != MDBX_SUCCESS))
        goto fail;
    }

    if (txn->tw.spill_pages) {
      if (parent->tw.spill_pages) {
        rc = pnl_need(&parent->tw.spill_pages,
                      MDBX_PNL_SIZE(txn->tw.spill_pages));
        if (unlikely(rc != MDBX_SUCCESS))
          goto fail;
      }
      spill_purge(txn);
    }

    if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length >
                     parent->tw.dirtylist->detent &&
                 !dpl_reserve(parent, txn->tw.dirtylist->length +
                                          parent->tw.dirtylist->length))) {
      rc = MDBX_ENOMEM;
      goto fail;
    }

    //-------------------------------------------------------------------------

    /* All fallible allocations are done above; from here on ownership of
     * the page lists is handed over from the child to the parent. */
    parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed;
    txn->tw.lifo_reclaimed = NULL;

    parent->tw.retired_pages = txn->tw.retired_pages;
    txn->tw.retired_pages = NULL;

    pnl_free(parent->tw.reclaimed_pglist);
    parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist;
    txn->tw.reclaimed_pglist = NULL;
    parent->tw.last_reclaimed = txn->tw.last_reclaimed;

    parent->mt_geo = txn->mt_geo;
    parent->mt_canary = txn->mt_canary;
    parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY;

    /* Move loose pages to parent */
#if MDBX_ENABLE_REFUND
    parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
    parent->tw.loose_count = txn->tw.loose_count;
    parent->tw.loose_pages = txn->tw.loose_pages;

    /* Merge our cursors into parent's and close them */
    cursors_eot(txn, true);
    end_mode |= MDBX_END_EOTDONE;

    /* Update parent's DBs array */
    memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db));
    parent->mt_numdbs = txn->mt_numdbs;
    for (unsigned i = 0; i < txn->mt_numdbs; i++) {
      /* preserve parent's status */
      const uint8_t state =
          txn->mt_dbistate[i] |
          (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
      DEBUG("db %u dbi-state %s 0x%02x -> 0x%02x", i,
            (parent->mt_dbistate[i] != state) ? "update" : "still",
            parent->mt_dbistate[i], state);
      parent->mt_dbistate[i] = state;
    }

    ts_1 = latency ? osal_monotime() : 0;
    txn_merge(parent, txn, parent_retired_len);
    ts_2 = latency ? osal_monotime() : 0;
    env->me_txn = parent;
    parent->mt_child = NULL;
    tASSERT(parent, dirtylist_check(parent));

#if MDBX_ENABLE_REFUND
    txn_refund(parent);
    if (ASSERT_ENABLED()) {
      /* Check parent's loose pages not suitable for refund */
      for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next)
        tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl &&
                            lp->mp_pgno + 1 < parent->mt_next_pgno);
      /* Check parent's reclaimed pages not suitable for refund */
      if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist))
        tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 <
                            parent->mt_next_pgno);
    }
#endif /* MDBX_ENABLE_REFUND */

    /* The child txn object is no longer needed; the parent carries on. */
    ts_4 = ts_3 = latency ? osal_monotime() : 0;
    txn->mt_signature = 0;
    osal_free(txn);
    tASSERT(parent, audit_ex(parent, 0, false) == 0);
    rc = MDBX_SUCCESS;
    goto provide_latency;
  }

  tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                   (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                   : txn->mt_env->me_options.dp_limit));
  cursors_eot(txn, false);
  end_mode |= MDBX_END_EOTDONE;

  /* Top-level txn with no dirty pages and no spills: nothing to flush. */
  if (txn->tw.dirtylist->length == 0 &&
      (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
    for (int i = txn->mt_numdbs; --i >= 0;)
      tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT
    rc = txn_end(txn, end_mode);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    rc = MDBX_RESULT_TRUE;
    goto provide_latency;
#else
    goto done;
#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */
  }

  DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO
        "/%" PRIaPGNO,
        txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
        txn->mt_dbs[FREE_DBI].md_root);

  /* Update DB root pointers */
  if (txn->mt_numdbs > CORE_DBS) {
    MDBX_cursor_couple couple;
    MDBX_val data;
    data.iov_len = sizeof(MDBX_db);

    rc = cursor_init(&couple.outer, txn, MAIN_DBI);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) {
      if (txn->mt_dbistate[i] & DBI_DIRTY) {
        MDBX_db *db = &txn->mt_dbs[i];
        DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN
              " -> %" PRIaTXN,
              i, db->md_mod_txnid, txn->mt_txnid);
        /* mod_txnid may exceed front after a nested transaction commit */
        db->md_mod_txnid = txn->mt_txnid;
        data.iov_base = db;
        WITH_CURSOR_TRACKING(couple.outer,
                             rc = mdbx_cursor_put(&couple.outer,
                                                  &txn->mt_dbxs[i].md_name,
                                                  &data, F_SUBDATA));
        if (unlikely(rc != MDBX_SUCCESS))
          goto fail;
      }
    }
  }

  ts_1 = latency ? osal_monotime() : 0;
  gcu_context_t gcu_ctx;
  rc = gcu_context_init(txn, &gcu_ctx);
  if (unlikely(rc != MDBX_SUCCESS))
    goto fail;
  rc = update_gc(txn, &gcu_ctx);
  if (unlikely(rc != MDBX_SUCCESS))
    goto fail;

  txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)
                                           ? txn->mt_txnid
                                           : txn->mt_dbs[FREE_DBI].md_mod_txnid;

  txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY)
                                           ? txn->mt_txnid
                                           : txn->mt_dbs[MAIN_DBI].md_mod_txnid;

  ts_2 = latency ? osal_monotime() : 0;
  if (AUDIT_ENABLED()) {
    rc = audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true);
    const uint64_t audit_end = osal_monotime();
    audit_duration = osal_monotime_to_16dot16(audit_end - ts_2);
    ts_2 = audit_end;
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  }

  /* Write the dirty pages out. */
  struct iov_ctx write_ctx;
  iov_init(txn, &write_ctx);
  rc = txn_write(txn, &write_ctx);
  if (likely(rc == MDBX_SUCCESS))
    iov_done(txn, &write_ctx);
  /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
  ts_3 = latency ? osal_monotime() : 0;

  /* Prepare the new meta from the most recent one and publish it via
   * sync_locked(); a failure past this point is fatal for the env. */
  if (likely(rc == MDBX_SUCCESS)) {
    const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
    MDBX_meta meta;
    memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8);
    meta.mm_extra_flags = head.ptr_c->mm_extra_flags;
    meta.mm_validator_id = head.ptr_c->mm_validator_id;
    meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr;
    unaligned_poke_u64(4, meta.mm_pages_retired,
                       unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) +
                           MDBX_PNL_SIZE(txn->tw.retired_pages));
    meta.mm_geo = txn->mt_geo;
    meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
    meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
    meta.mm_canary = txn->mt_canary;

    txnid_t commit_txnid = txn->mt_txnid;
#if MDBX_ENABLE_BIGFOOT
    if (gcu_ctx.bigfoot > txn->mt_txnid) {
      commit_txnid = gcu_ctx.bigfoot;
      TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
            (unsigned)(commit_txnid - txn->mt_txnid));
    }
#endif
    meta_set_txnid(env, &meta, commit_txnid);

    rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED,
                     &meta, &txn->tw.troika);
  }
  ts_4 = latency ? osal_monotime() : 0;
  if (unlikely(rc != MDBX_SUCCESS)) {
    env->me_flags |= MDBX_FATAL_ERROR;
    goto fail;
  }

  end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE;

done:
  rc = txn_end(txn, end_mode);

provide_latency:
  /* Convert the raw monotonic timestamps into the 16.16 latency report. */
  if (latency) {
    latency->audit = audit_duration;
    latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0;
    latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0;
    latency->write = (ts_2 && ts_3) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0;
    latency->sync = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0;
    const uint64_t ts_5 = osal_monotime();
    latency->ending = ts_4 ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0;
    latency->whole = osal_monotime_to_16dot16(ts_5 - ts_0);
  }
  return rc;

fail:
  mdbx_txn_abort(txn);
  goto provide_latency;
}
 14212  
/* Validates a candidate meta-page and sanitizes its geometry.
 *
 * \param env             Environment (used for mmap filesize and OS psize).
 * \param meta            Meta contents to validate; the mm_geo.lower/upper/now
 *                        fields may be patched in place to bring them into
 *                        the supported range.
 * \param page            The page the meta was read from (pgno/flags checks).
 * \param meta_number     Expected meta index, must equal page->mp_pgno.
 * \param guess_pagesize  Optional in/out pagesize guess, updated when this
 *                        meta carries a different (valid) pagesize.
 *
 * \returns MDBX_SUCCESS when the meta is usable; MDBX_RESULT_TRUE for a
 *          recoverable "skip this meta" condition (torn update, bad
 *          steady-checksum, out-of-range txnid); otherwise a hard error:
 *          MDBX_INVALID, MDBX_VERSION_MISMATCH, MDBX_CORRUPTED or
 *          MDBX_TOO_LARGE. */
static int validate_meta(MDBX_env *env, MDBX_meta *const meta,
                         const MDBX_page *const page,
                         const unsigned meta_number, unsigned *guess_pagesize) {
  const uint64_t magic_and_version =
      unaligned_peek_u64(4, &meta->mm_magic_and_version);
  if (unlikely(magic_and_version != MDBX_DATA_MAGIC &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) {
    ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number,
          magic_and_version);
    /* A wrong magic means "not an mdbx DB at all"; a right magic with a
     * wrong version means a format mismatch. */
    return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
                                                    : MDBX_VERSION_MISMATCH;
  }

  if (unlikely(page->mp_pgno != meta_number)) {
    ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno);
    return MDBX_INVALID;
  }

  if (unlikely(page->mp_flags != P_META)) {
    ERROR("page #%u not a meta-page", meta_number);
    return MDBX_INVALID;
  }

  /* LY: check pagesize */
  if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE ||
               meta->mm_psize > MAX_PAGESIZE)) {
    WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number,
            meta->mm_psize);
    return is_powerof2(meta->mm_psize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID;
  }

  /* Remember the first plausible pagesize so the caller (read_header)
   * can compute meta-page offsets on subsequent read attempts. */
  if (guess_pagesize && *guess_pagesize != meta->mm_psize) {
    *guess_pagesize = meta->mm_psize;
    VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize);
  }

  /* A mismatch of the doubled txnid stamps means a torn/incomplete
   * meta update. */
  const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a);
  if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) {
    WARNING("meta[%u] not completely updated, skip it", meta_number);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check signature as a checksum */
  if (META_IS_STEADY(meta) &&
      unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) {
    WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64
            "), skip it",
            meta_number, unaligned_peek_u64(4, &meta->mm_sign),
            meta_sign(meta));
    return MDBX_RESULT_TRUE;
  }

  DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
        ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
        " +%u -%u, txn_id %" PRIaTXN ", %s",
        page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root,
        meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next,
        meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv),
        pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta));

  if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) {
    WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number,
            txnid);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check min-pages value */
  if (unlikely(meta->mm_geo.lower < MIN_PAGENO ||
               meta->mm_geo.lower > MAX_PAGENO + 1)) {
    WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->mm_geo.lower);
    return MDBX_INVALID;
  }

  /* LY: check max-pages value */
  if (unlikely(meta->mm_geo.upper < MIN_PAGENO ||
               meta->mm_geo.upper > MAX_PAGENO + 1 ||
               meta->mm_geo.upper < meta->mm_geo.lower)) {
    WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->mm_geo.upper);
    return MDBX_INVALID;
  }

  /* LY: check last_pgno */
  if (unlikely(meta->mm_geo.next < MIN_PAGENO ||
               meta->mm_geo.next - 1 > MAX_PAGENO)) {
    WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it",
            meta_number, meta->mm_geo.next);
    return MDBX_CORRUPTED;
  }

  /* LY: check filesize & used_bytes */
  const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize;
  if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) {
    /* Here could be a race with DB-shrinking performed by other process */
    int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) {
      WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64
              "), skip it",
              meta_number, used_bytes, env->me_dxb_mmap.filesize);
      return MDBX_CORRUPTED;
    }
  }
  if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO ||
               used_bytes > MAX_MAPSIZE)) {
    WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it",
            meta_number, used_bytes);
    return MDBX_TOO_LARGE;
  }

  /* LY: check mapsize limits */
  pgno_t geo_lower = meta->mm_geo.lower;
  uint64_t mapsize_min = geo_lower * (uint64_t)meta->mm_psize;
  STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE);
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MIN_PAGESIZE % (4ul << 20) == 0);
  if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) {
    /* Allow opening a DB whose lower bound exceeds this build's
     * MAX_MAPSIZE (e.g. 64-bit DB on 32-bit build) as long as the
     * actually used space fits; clamp geo.lower accordingly. */
    if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE &&
        mapsize_min <= MAX_MAPSIZE64) {
      eASSERT(env,
              meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE);
      WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), "
              "but size of used space still acceptable (%" PRIu64 ")",
              meta_number, mapsize_min, used_bytes);
      geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize);
      if (geo_lower > MAX_PAGENO + 1) {
        geo_lower = MAX_PAGENO + 1;
        mapsize_min = geo_lower * (uint64_t)meta->mm_psize;
      }
      WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
              " instead of wrong %" PRIaPGNO
              ", will be corrected on next commit(s)",
              meta_number, "lower", geo_lower, meta->mm_geo.lower);
      meta->mm_geo.lower = geo_lower;
    } else {
      WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_min);
      return MDBX_VERSION_MISMATCH;
    }
  }

  pgno_t geo_upper = meta->mm_geo.upper;
  uint64_t mapsize_max = geo_upper * (uint64_t)meta->mm_psize;
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  if (unlikely(mapsize_max > MAX_MAPSIZE ||
               (MAX_PAGENO + 1) <
                   ceil_powerof2((size_t)mapsize_max, env->me_os_psize) /
                       (size_t)meta->mm_psize)) {
    if (mapsize_max > MAX_MAPSIZE64) {
      WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_max);
      return MDBX_VERSION_MISMATCH;
    }
    /* allow to open large DB from a 32-bit environment */
    eASSERT(env,
            meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE);
    WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), "
            "but size of used space still acceptable (%" PRIu64 ")",
            meta_number, mapsize_max, used_bytes);
    geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize);
    if (geo_upper > MAX_PAGENO + 1) {
      geo_upper = MAX_PAGENO + 1;
      mapsize_max = geo_upper * (uint64_t)meta->mm_psize;
    }
    WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "upper", geo_upper, meta->mm_geo.upper);
    meta->mm_geo.upper = geo_upper;
  }

  /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper].
   *
   * Copy-with-compaction by previous version of libmdbx could produce DB-file
   * less than meta.geo.lower bound, in case actual filling is low or no data
   * at all. This is not a problem as there is no damage or loss of data.
   * Therefore it is better not to consider such situation as an error, but
   * silently correct it. */
  pgno_t geo_now = meta->mm_geo.now;
  if (geo_now < geo_lower)
    geo_now = geo_lower;
  if (geo_now > geo_upper && meta->mm_geo.next <= geo_upper)
    geo_now = geo_upper;

  if (unlikely(meta->mm_geo.next > geo_now)) {
    WARNING("meta[%u] next-pageno (%" PRIaPGNO
            ") is beyond end-pgno (%" PRIaPGNO "), skip it",
            meta_number, meta->mm_geo.next, geo_now);
    return MDBX_CORRUPTED;
  }
  if (meta->mm_geo.now != geo_now) {
    WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "now", geo_now, meta->mm_geo.now);
    meta->mm_geo.now = geo_now;
  }

  /* GC */
  if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) {
    /* An empty GC tree must have all-zero statistics. */
    if (unlikely(meta->mm_dbs[FREE_DBI].md_branch_pages ||
                 meta->mm_dbs[FREE_DBI].md_depth ||
                 meta->mm_dbs[FREE_DBI].md_entries ||
                 meta->mm_dbs[FREE_DBI].md_leaf_pages ||
                 meta->mm_dbs[FREE_DBI].md_overflow_pages)) {
      WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "GC", meta->mm_dbs[FREE_DBI].md_root);
    return MDBX_CORRUPTED;
  }

  /* MainDB */
  if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) {
    /* An empty MainDB tree must have all-zero statistics. */
    if (unlikely(meta->mm_dbs[MAIN_DBI].md_branch_pages ||
                 meta->mm_dbs[MAIN_DBI].md_depth ||
                 meta->mm_dbs[MAIN_DBI].md_entries ||
                 meta->mm_dbs[MAIN_DBI].md_leaf_pages ||
                 meta->mm_dbs[MAIN_DBI].md_overflow_pages)) {
      WARNING("meta[%u] has false-empty %s", meta_number, "MainDB");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "MainDB", meta->mm_dbs[MAIN_DBI].md_root);
    return MDBX_CORRUPTED;
  }

  /* A sub-DB cannot have been modified by a txn newer than the meta's own. */
  if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC");
    return MDBX_CORRUPTED;
  }

  if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB");
    return MDBX_CORRUPTED;
  }

  return MDBX_SUCCESS;
}
 14460  
 14461  static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta,
 14462                                MDBX_meta *dest) {
 14463    *dest = *meta;
 14464    return validate_meta(env, dest, data_page(meta),
 14465                         bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr);
 14466  }
 14467  
 14468  /* Read the environment parameters of a DB environment
 14469   * before mapping it into memory. */
 14470  __cold static int read_header(MDBX_env *env, MDBX_meta *dest,
 14471                                const int lck_exclusive,
 14472                                const mdbx_mode_t mode_bits) {
 14473    int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
 14474    if (unlikely(rc != MDBX_SUCCESS))
 14475      return rc;
 14476  
 14477    memset(dest, 0, sizeof(MDBX_meta));
 14478    unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK);
 14479    rc = MDBX_CORRUPTED;
 14480  
 14481    /* Read twice all meta pages so we can find the latest one. */
 14482    unsigned loop_limit = NUM_METAS * 2;
 14483    /* We don't know the page size on first time. So, just guess it. */
 14484    unsigned guess_pagesize = 0;
 14485    for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) {
 14486      const unsigned meta_number = loop_count % NUM_METAS;
 14487      const unsigned offset = (guess_pagesize             ? guess_pagesize
 14488                               : (loop_count > NUM_METAS) ? env->me_psize
 14489                                                          : env->me_os_psize) *
 14490                              meta_number;
 14491  
 14492      char buffer[MIN_PAGESIZE];
 14493      unsigned retryleft = 42;
 14494      while (1) {
 14495        TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number,
 14496              offset, MIN_PAGESIZE, retryleft);
 14497        int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset);
 14498        if (err != MDBX_SUCCESS) {
 14499          if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 &&
 14500              env->me_dxb_mmap.filesize == 0 &&
 14501              mode_bits /* non-zero for DB creation */ != 0)
 14502            NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err));
 14503          else
 14504            ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err,
 14505                  mdbx_strerror(err));
 14506          return err;
 14507        }
 14508  
 14509        char again[MIN_PAGESIZE];
 14510        err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset);
 14511        if (err != MDBX_SUCCESS) {
 14512          ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err,
 14513                mdbx_strerror(err));
 14514          return err;
 14515        }
 14516  
 14517        if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0)
 14518          break;
 14519  
 14520        VERBOSE("meta[%u] was updated, re-read it", meta_number);
 14521      }
 14522  
 14523      if (!retryleft) {
 14524        ERROR("meta[%u] is too volatile, skip it", meta_number);
 14525        continue;
 14526      }
 14527  
 14528      MDBX_page *const page = (MDBX_page *)buffer;
 14529      MDBX_meta *const meta = page_meta(page);
 14530      rc = validate_meta(env, meta, page, meta_number, &guess_pagesize);
 14531      if (rc != MDBX_SUCCESS)
 14532        continue;
 14533  
 14534      bool latch;
 14535      if (env->me_stuck_meta >= 0)
 14536        latch = (meta_number == (unsigned)env->me_stuck_meta);
 14537      else if (meta_bootid_match(meta))
 14538        latch = meta_choice_recent(
 14539            meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign),
 14540            dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign));
 14541      else
 14542        latch = meta_choice_steady(
 14543            meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign),
 14544            dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign));
 14545      if (latch) {
 14546        *dest = *meta;
 14547        if (!lck_exclusive && !META_IS_STEADY(dest))
 14548          loop_limit += 1; /* LY: should re-read to hush race with update */
 14549        VERBOSE("latch meta[%u]", meta_number);
 14550      }
 14551    }
 14552  
 14553    if (dest->mm_psize == 0 ||
 14554        (env->me_stuck_meta < 0 &&
 14555         !(META_IS_STEADY(dest) ||
 14556           meta_weak_acceptable(env, dest, lck_exclusive)))) {
 14557      ERROR("%s", "no usable meta-pages, database is corrupted");
 14558      if (rc == MDBX_SUCCESS) {
 14559        /* TODO: try to restore the database by fully checking b-tree structure
 14560         * for the each meta page, if the corresponding option was given */
 14561        return MDBX_CORRUPTED;
 14562      }
 14563      return rc;
 14564    }
 14565  
 14566    return MDBX_SUCCESS;
 14567  }
 14568  
/* Build one model meta-page (number `num`) inside `model`, stamping it with
 * the environment's geometry and a minimal txnid (MIN_TXNID + num), and
 * return the address of the next page in the buffer. Used while formatting
 * a fresh (empty) database. */
__cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model,
                                    unsigned num) {
  /* Sanity-check the environment's pagesize and geometry before stamping. */
  ENSURE(env, is_powerof2(env->me_psize));
  ENSURE(env, env->me_psize >= MIN_PAGESIZE);
  ENSURE(env, env->me_psize <= MAX_PAGESIZE);
  ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE);
  ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE);
  ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower);
  ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper);

  memset(model, 0, env->me_psize);
  model->mp_pgno = num;
  model->mp_flags = P_META;
  MDBX_meta *const model_meta = page_meta(model);
  unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC);

  /* Convert the byte-based geometry into pages; grow/shrink are stored in
   * packed (pv) form. The first usable page follows the meta-pages. */
  model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
  model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
  model_meta->mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow));
  model_meta->mm_geo.shrink_pv =
      pages2pv(bytes2pgno(env, env->me_dbgeo.shrink));
  model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now);
  model_meta->mm_geo.next = NUM_METAS;

  ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO);
  ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1);
  ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower);
  ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper);
  ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO);
  ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now);
  /* grow/shrink must round-trip through the packed representation. */
  ENSURE(env, model_meta->mm_geo.grow_pv ==
                  pages2pv(pv2pages(model_meta->mm_geo.grow_pv)));
  ENSURE(env, model_meta->mm_geo.shrink_pv ==
                  pages2pv(pv2pages(model_meta->mm_geo.shrink_pv)));

  /* Both core trees start empty; the GC tree uses integer keys (txnids). */
  model_meta->mm_psize = env->me_psize;
  model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY;
  model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
  model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
  /* Distinct txnids (MIN_TXNID + num) make the highest-numbered meta the
   * most recent one; sign it as steady. */
  meta_set_txnid(env, model_meta, MIN_TXNID + num);
  unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta));
  eASSERT(env, coherency_check_meta(env, model_meta, true));
  return (MDBX_page *)((uint8_t *)model + env->me_psize);
}
 14613  
 14614  /* Fill in most of the zeroed meta-pages for an empty database environment.
 14615   * Return pointer to recently (head) meta-page. */
 14616  __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) {
 14617    MDBX_page *page0 = (MDBX_page *)buffer;
 14618    MDBX_page *page1 = meta_model(env, page0, 0);
 14619    MDBX_page *page2 = meta_model(env, page1, 1);
 14620    meta_model(env, page2, 2);
 14621    return page_meta(page2);
 14622  }
 14623  
 14624  #if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64))
 14625  static size_t madvise_threshold(const MDBX_env *env,
 14626                                  const size_t largest_bytes) {
 14627    /* TODO: use options */
 14628    const unsigned factor = 9;
 14629    const size_t threshold = (largest_bytes < (65536ul << factor))
 14630                                 ? 65536 /* minimal threshold */
 14631                             : (largest_bytes > (MEGABYTE * 4 << factor))
 14632                                 ? MEGABYTE * 4 /* maximal threshold */
 14633                                 : largest_bytes >> factor;
 14634    return bytes_align2os_bytes(env, threshold);
 14635  }
 14636  #endif /* MDBX_ENABLE_MADVISE */
 14637  
/* Durably publish the `pending` meta-page, making it the new recent (and,
 * when synced, steady) database snapshot. Must be called with the writer
 * lock held.
 *
 * Steps:
 *  1) optionally discard (madvise) and/or shrink the unused tail of the
 *     datafile when MDBX_SHRINK_ALLOWED;
 *  2) sync previously written data pages (msync/fsync), unless running in
 *     MDBX_SAFE_NOSYNC mode and no auto-sync condition has triggered;
 *  3) write `pending` into the appropriate meta-page slot (the troika's
 *     tail, or in-place re-signing of the head for the weak->steady case)
 *     and verify coherency of what reached the mapping.
 *
 * On success *troika is refreshed and propagated down the transaction
 * chain; on failure the environment is poisoned with MDBX_FATAL_ERROR. */
static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
                       meta_troika_t *const troika) {
  eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
  const MDBX_meta *const meta0 = METAPAGE(env, 0);
  const MDBX_meta *const meta1 = METAPAGE(env, 1);
  const MDBX_meta *const meta2 = METAPAGE(env, 2);
  const meta_ptr_t head = meta_recent(env, troika);
  int rc;

  /* `pending` must be a scratch copy, not one of the mapped meta-pages. */
  eASSERT(env,
          pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS));
  eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0);
  eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now);

  if (flags & MDBX_SAFE_NOSYNC) {
    /* Check auto-sync conditions */
    const pgno_t autosync_threshold =
        atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
    const uint64_t autosync_period =
        atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
    if ((autosync_threshold &&
         atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
             autosync_threshold) ||
        (autosync_period &&
         osal_monotime() -
                 atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
             autosync_period))
      flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
  }

  pgno_t shrink = 0;
  if (flags & MDBX_SHRINK_ALLOWED) {
    /* LY: check conditions to discard unused pages */
    const pgno_t largest_pgno = find_largest_snapshot(
        env, (head.ptr_c->mm_geo.next > pending->mm_geo.next)
                 ? head.ptr_c->mm_geo.next
                 : pending->mm_geo.next);
    eASSERT(env, largest_pgno >= NUM_METAS);
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    /* Poison the now-unused tail so stray accesses are caught. */
    const pgno_t edge = env->me_poison_edge;
    if (edge > largest_pgno) {
      env->me_poison_edge = largest_pgno;
      VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno),
                                 pgno2bytes(env, edge - largest_pgno));
      MDBX_ASAN_POISON_MEMORY_REGION(env->me_map +
                                         pgno2bytes(env, largest_pgno),
                                     pgno2bytes(env, edge - largest_pgno));
    }
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
#if MDBX_ENABLE_MADVISE &&                                                     \
    (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
    const size_t largest_bytes = pgno2bytes(env, largest_pgno);
    /* threshold to avoid unreasonable frequent madvise() calls */
    const size_t threshold = madvise_threshold(env, largest_bytes);
    const size_t discard_edge_bytes = bytes_align2os_bytes(
        env, ((MDBX_RDONLY &
               (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak
                                     : env->me_flags))
                  ? largest_bytes
                  : largest_bytes + threshold));
    const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes);
    const pgno_t prev_discarded_pgno =
        atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
    if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) {
      NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno,
             prev_discarded_pgno);
      atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
                     mo_Relaxed);
      const size_t prev_discarded_bytes =
          ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize);
      ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
#if defined(MADV_DONTNEED)
      int advise = MADV_DONTNEED;
#if defined(MADV_FREE) &&                                                      \
    0 /* MADV_FREE works for only anonymous vma at the moment */
      if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000)
        advise = MADV_FREE;
#endif /* MADV_FREE */
      int err = madvise(env->me_map + discard_edge_bytes,
                        prev_discarded_bytes - discard_edge_bytes, advise)
                    ? ignore_enosys(errno)
                    : MDBX_SUCCESS;
#else
      int err = ignore_enosys(posix_madvise(
          env->me_map + discard_edge_bytes,
          prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
#endif
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
    }
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */

    /* LY: check conditions to shrink datafile */
    const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
    pgno_t shrink_step = 0;
    if (pending->mm_geo.shrink_pv &&
        pending->mm_geo.now - pending->mm_geo.next >
            (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
      if (pending->mm_geo.now > largest_pgno &&
          pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
        pgno_t grow_step = 0;
        const pgno_t aligner =
            pending->mm_geo.grow_pv
                ? (grow_step = pv2pages(pending->mm_geo.grow_pv))
                : shrink_step;
        const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
        const pgno_t aligned = pgno_align2os_pgno(
            env, with_backlog_gap + aligner - with_backlog_gap % aligner);
        const pgno_t bottom =
            (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
        if (pending->mm_geo.now > bottom) {
          if (TROIKA_HAVE_STEADY(troika))
            /* force steady, but only if steady-checkpoint is present */
            flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
          shrink = pending->mm_geo.now - bottom;
          pending->mm_geo.now = bottom;
          if (unlikely(head.txnid == pending->unsafe_txnid)) {
            const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid);
            NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN,
                   pending->unsafe_txnid, txnid);
            ENSURE(env, !env->me_txn0 ||
                            (env->me_txn0->mt_owner != osal_thread_self() &&
                             !env->me_txn));
            if (unlikely(txnid > MAX_TXNID)) {
              rc = MDBX_TXN_FULL;
              ERROR("txnid overflow, raise %d", rc);
              goto fail;
            }
            meta_set_txnid(env, pending, txnid);
            eASSERT(env, coherency_check_meta(env, pending, true));
          }
        }
      }
    }
  }

  /* LY: step#1 - sync previously written/updated data-pages */
  rc = MDBX_RESULT_FALSE /* carry steady */;
  if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
    eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
    enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE;
    if ((flags & MDBX_SAFE_NOSYNC) == 0) {
      mode_bits = MDBX_SYNC_DATA;
      if (pending->mm_geo.next >
          meta_prefer_steady(env, troika).ptr_c->mm_geo.now)
        mode_bits |= MDBX_SYNC_SIZE;
      if (flags & MDBX_NOMETASYNC)
        mode_bits |= MDBX_SYNC_IODQ;
    }
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    if (flags & MDBX_WRITEMAP)
      rc =
          osal_msync(&env->me_dxb_mmap, 0,
                     pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits);
    else
      rc = osal_fsync(env->me_lazy_fd, mode_bits);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */
                                    : MDBX_RESULT_FALSE /* carry steady */;
  }
  eASSERT(env, coherency_check_meta(env, pending, true));

  /* Steady or Weak */
  if (rc == MDBX_RESULT_FALSE /* carry steady */) {
    atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(),
                   mo_Relaxed);
    unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending));
    atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
  } else {
    assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
    unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK);
  }

  /* The head may be overwritten in-place only when nothing but the
   * signature changes (weak -> steady promotion of the same txn). */
  const bool legal4overwrite =
      head.txnid == pending->unsafe_txnid &&
      memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) ==
          0 &&
      memcmp(&head.ptr_c->mm_canary, &pending->mm_canary,
             sizeof(pending->mm_canary)) == 0 &&
      memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) ==
          0;
  MDBX_meta *target = nullptr;
  if (head.txnid == pending->unsafe_txnid) {
    ENSURE(env, legal4overwrite);
    if (!head.is_steady && META_IS_STEADY(pending))
      target = (MDBX_meta *)head.ptr_c;
    else {
      WARNING("%s", "skip update meta");
      return MDBX_SUCCESS;
    }
  } else {
    /* Normal case: write into the troika's tail slot. */
    const unsigned troika_tail = troika->tail_and_flags & 3;
    ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent &&
                    troika_tail != troika->prefer_steady);
    target = (MDBX_meta *)meta_tail(env, troika).ptr_c;
  }

  /* LY: step#2 - update meta-page. */
  DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
        ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
        " +%u -%u, txn_id %" PRIaTXN ", %s",
        data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root,
        pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower,
        pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper,
        pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv),
        pending->unsafe_txnid, durable_caption(pending));

  DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO,
        (meta0 == head.ptr_c) ? "head"
        : (meta0 == target)   ? "tail"
                              : "stay",
        durable_caption(meta0), constmeta_txnid(meta0),
        meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root);
  DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO,
        (meta1 == head.ptr_c) ? "head"
        : (meta1 == target)   ? "tail"
                              : "stay",
        durable_caption(meta1), constmeta_txnid(meta1),
        meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root);
  DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO,
        (meta2 == head.ptr_c) ? "head"
        : (meta2 == target)   ? "tail"
                              : "stay",
        durable_caption(meta2), constmeta_txnid(meta2),
        meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root);

  eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) ||
                   (META_IS_STEADY(pending) && !META_IS_STEADY(meta0)));
  eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) ||
                   (META_IS_STEADY(pending) && !META_IS_STEADY(meta1)));
  eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) ||
                   (META_IS_STEADY(pending) && !META_IS_STEADY(meta2)));

  eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
  ENSURE(env, target == head.ptr_c ||
                  constmeta_txnid(target) < pending->unsafe_txnid);
#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  if (flags & MDBX_WRITEMAP) {
    jitter4testing(true);
    if (likely(target != head.ptr_c)) {
      /* LY: 'invalidate' the meta. */
      meta_update_begin(env, target, pending->unsafe_txnid);
      unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK);
#ifndef NDEBUG
      /* debug: provoke failure to catch a violators, but don't touch mm_psize
       * to allow readers catch actual pagesize. */
      uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root;
      uint8_t *provoke_end = (uint8_t *)&target->mm_sign;
      memset(provoke_begin, 0xCC, provoke_end - provoke_begin);
      jitter4testing(false);
#endif

      /* LY: update info */
      target->mm_geo = pending->mm_geo;
      target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI];
      target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI];
      target->mm_canary = pending->mm_canary;
      memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8);
      jitter4testing(true);

      /* LY: 'commit' the meta */
      meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b));
      jitter4testing(true);
      eASSERT(env, coherency_check_meta(env, target, true));
    } else {
      /* dangerous case (target == head), only mm_sign could
       * me updated, check assertions once again */
      eASSERT(env,
              legal4overwrite && !head.is_steady && META_IS_STEADY(pending));
    }
    memcpy(target->mm_sign, pending->mm_sign, 8);
    osal_flush_incoherent_cpu_writeback();
    jitter4testing(true);
    /* sync meta-pages */
    rc =
        osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                   (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE
                                             : MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  } else {
    /* Non-WRITEMAP path: pwrite() the whole meta and keep a copy of the
     * previous content so a failed write can be rolled back. */
    const MDBX_meta undo_meta = *target;
    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                     ? env->me_dsync_fd
                                     : env->me_lazy_fd;
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = osal_pwrite(fd, pending, sizeof(MDBX_meta),
                     (uint8_t *)target - env->me_map);
    if (unlikely(rc != MDBX_SUCCESS)) {
    undo:
      DEBUG("%s", "write failed, disk error?");
      /* On a failure, the pagecache still contains the new data.
       * Try write some old data back, to prevent it from being used. */
      osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta),
                  (uint8_t *)target - env->me_map);
      goto fail;
    }
    osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
    /* sync meta-pages */
    if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) {
      rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
      if (rc != MDBX_SUCCESS)
        goto undo;
    }
  }

  /* Wait until the just-written meta becomes visible through the mapping
   * (guards against incoherent unified page/buffer cache). */
  uint64_t timestamp = 0;
  while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") {
    rc =
        coherency_check_written(env, pending->unsafe_txnid, target, &timestamp);
    if (likely(rc == MDBX_SUCCESS))
      break;
    if (unlikely(rc != MDBX_RESULT_TRUE))
      goto fail;
  }
  env->me_lck->mti_meta_sync_txnid.weak =
      (uint32_t)pending->unsafe_txnid -
      ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0);

  /* Refresh the troika and propagate it down the nested-txn chain. */
  *troika = meta_tap(env);
  for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child)
    if (troika != &txn->tw.troika)
      txn->tw.troika = *troika;

  /* LY: shrink datafile if needed */
  if (unlikely(shrink)) {
    VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")",
            pending->mm_geo.now, shrink);
    rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now,
                             pending->mm_geo.upper);
    if (rc != MDBX_SUCCESS && rc != MDBX_EPERM)
      goto fail;
    eASSERT(env, coherency_check_meta(env, target, true));
  }

  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck))
    /* toggle oldest refresh */
    atomic_store32(&lck->mti_readers_refresh_flag, false, mo_Relaxed);

  return MDBX_SUCCESS;

fail:
  env->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
 14991  
 14992  static void recalculate_merge_threshold(MDBX_env *env) {
 14993    const unsigned bytes = page_space(env);
 14994    env->me_merge_threshold =
 14995        (uint16_t)(bytes -
 14996                   (bytes * env->me_options.merge_threshold_16dot16_percent >>
 14997                    16));
 14998    env->me_merge_threshold_gc =
 14999        (uint16_t)(bytes -
 15000                   ((env->me_options.merge_threshold_16dot16_percent > 19005)
 15001                        ? bytes / 3 /* 33 % */
 15002                        : bytes / 4 /* 25 % */));
 15003  }
 15004  
/* Configure the environment for the given database page size: derives all
 * page-size-dependent limits (GC overflow capacity, node-size maxima,
 * psize log2, merge thresholds) and auto-tunes the dirty-page limit from
 * available RAM when the user hasn't set one explicitly. */
__cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) {
  STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE);
  STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta));
  ENSURE(env, is_powerof2(pagesize));
  ENSURE(env, pagesize >= MIN_PAGESIZE);
  ENSURE(env, pagesize <= MAX_PAGESIZE);
  env->me_psize = (unsigned)pagesize;
  /* Drop any scratch buffer sized for the previous pagesize. */
  if (env->me_pbuf) {
    osal_memalign_free(env->me_pbuf);
    env->me_pbuf = nullptr;
  }

  /* Max number of page numbers a single GC overflow page can hold. */
  STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4);
  STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT);
  const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
  ENSURE(env,
         maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4);
  env->me_maxgc_ov1page = (unsigned)maxgc_ov1page;

  /* Largest node that fits a branch/leaf page of this size. */
  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42);
  STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX);
  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) >= BRANCH_NODE_MAX(MIN_PAGESIZE));
  STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) > NODESIZE + 42);
  STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX);
  const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize);
  const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize);
  ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) &&
                  branch_nodemax % 2 == 0 &&
                  leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) &&
                  leaf_nodemax >= branch_nodemax &&
                  leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0);
  env->me_leaf_nodemax = (unsigned)leaf_nodemax;
  env->me_psize2log = (uint8_t)log2n_powerof2(pagesize);
  eASSERT(env, pgno2bytes(env, 1) == pagesize);
  eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2);
  recalculate_merge_threshold(env);

  const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE);
  if (!env->me_options.flags.non_auto.dp_limit) {
    /* auto-setup dp_limit by "The42" ;-) */
    intptr_t total_ram_pages, avail_ram_pages;
    int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages);
    if (unlikely(err != MDBX_SUCCESS))
      ERROR("mdbx_get_sysraminfo(), rc %d", err);
    else {
      /* Budget ~1/42 of (total + available) RAM, scaled from OS pages to
       * DB pages, then clamped into [CURSOR_STACK*4 .. MDBX_PGL_LIMIT]. */
      size_t reasonable_dpl_limit =
          (size_t)(total_ram_pages + avail_ram_pages) / 42;
      if (pagesize > env->me_os_psize)
        reasonable_dpl_limit /= pagesize / env->me_os_psize;
      else if (pagesize < env->me_os_psize)
        reasonable_dpl_limit *= env->me_os_psize / pagesize;
      reasonable_dpl_limit = (reasonable_dpl_limit < MDBX_PGL_LIMIT)
                                 ? reasonable_dpl_limit
                                 : MDBX_PGL_LIMIT;
      reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK * 4)
                                 ? reasonable_dpl_limit
                                 : CURSOR_STACK * 4;
      env->me_options.dp_limit = (unsigned)reasonable_dpl_limit;
    }
  }
  /* Keep dp_limit/dp_initial consistent with the addressable page range. */
  if (env->me_options.dp_limit > max_pgno - NUM_METAS)
    env->me_options.dp_limit = max_pgno - NUM_METAS;
  if (env->me_options.dp_initial > env->me_options.dp_limit)
    env->me_options.dp_initial = env->me_options.dp_limit;
}
 15070  
 15071  static __inline MDBX_CONST_FUNCTION MDBX_lockinfo *
 15072  lckless_stub(const MDBX_env *env) {
 15073    uintptr_t stub = (uintptr_t)&env->x_lckless_stub;
 15074    /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */
 15075    stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1);
 15076    return (MDBX_lockinfo *)stub;
 15077  }
 15078  
/* Allocate and initialize a new MDBX_env handle with default options.
 * On success stores the handle into *penv and returns MDBX_SUCCESS;
 * on failure frees everything, sets *penv to nullptr and returns an error.
 * Note the cleanup ordering on the failure paths below: each later-acquired
 * mutex is destroyed before the earlier ones. */
__cold int mdbx_env_create(MDBX_env **penv) {
  MDBX_env *env = osal_calloc(1, sizeof(MDBX_env));
  if (unlikely(!env))
    return MDBX_ENOMEM;

  /* baseline defaults; file descriptors start invalid until env_open */
  env->me_maxreaders = DEFAULT_READERS;
  env->me_maxdbs = env->me_numdbs = CORE_DBS;
  env->me_lazy_fd = INVALID_HANDLE_VALUE;
  env->me_dsync_fd = INVALID_HANDLE_VALUE;
  env->me_lfd = INVALID_HANDLE_VALUE;
  env->me_pid = osal_getpid();
  env->me_stuck_meta = -1; /* -1 == not in recovery mode */

  /* default tunables; dp_limit is clamped so it can never exceed the
   * addressable page range, and dp_initial never exceeds dp_limit */
  env->me_options.dp_reserve_limit = 1024;
  env->me_options.rp_augment_limit = 256 * 1024;
  env->me_options.dp_limit = 64 * 1024;
  if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS)
    env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS;
  env->me_options.dp_initial = MDBX_PNL_INITIAL;
  if (env->me_options.dp_initial > env->me_options.dp_limit)
    env->me_options.dp_initial = env->me_options.dp_limit;
  env->me_options.spill_max_denominator = 8;
  env->me_options.spill_min_denominator = 8;
  env->me_options.spill_parent4child_denominator = 0;
  env->me_options.dp_loose_limit = 64;
  env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */;

  int rc;
  /* the system page size must be a power of two and at least MIN_PAGESIZE,
   * otherwise the mmap arithmetic below cannot work */
  const size_t os_psize = osal_syspagesize();
  if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) {
    ERROR("unsuitable system pagesize %" PRIuPTR, os_psize);
    rc = MDBX_INCOMPATIBLE;
    goto bailout;
  }
  env->me_os_psize = (unsigned)os_psize;
  setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize
                                                        : MAX_PAGESIZE);

  rc = osal_fastmutex_init(&env->me_dbi_lock);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;

#if defined(_WIN32) || defined(_WIN64)
  /* Windows primitives cannot fail here, so no extra cleanup paths */
  osal_srwlock_Init(&env->me_remap_guard);
  InitializeCriticalSection(&env->me_windowsbug_lock);
#else
  rc = osal_fastmutex_init(&env->me_remap_guard);
  if (unlikely(rc != MDBX_SUCCESS)) {
    osal_fastmutex_destroy(&env->me_dbi_lock);
    goto bailout;
  }

#if MDBX_LOCKING > MDBX_LOCKING_SYSV
  /* pre-initialize the lockless-stub IPC lock; when this #if is disabled,
   * rc is still MDBX_SUCCESS from the init above, so the check is a no-op */
  MDBX_lockinfo *const stub = lckless_stub(env);
  rc = osal_ipclock_stub(&stub->mti_wlock);
#endif /* MDBX_LOCKING */
  if (unlikely(rc != MDBX_SUCCESS)) {
    osal_fastmutex_destroy(&env->me_remap_guard);
    osal_fastmutex_destroy(&env->me_dbi_lock);
    goto bailout;
  }
#endif /* Windows */

  VALGRIND_CREATE_MEMPOOL(env, 0, 0);
  /* the signature marks the handle as valid for check_env() */
  env->me_signature.weak = MDBX_ME_SIGNATURE;
  *penv = env;
  return MDBX_SUCCESS;

bailout:
  osal_free(env);
  *penv = nullptr;
  return rc;
}
 15152  
/* Compute (once) a reasonable upper bound for the database size, derived
 * from the amount of system RAM, and cache it in *cached_result.
 * A zero *cached_result means "not computed yet"; subsequent calls return
 * the cached value unchanged.  Falls back to MAX_MAPSIZE32 when RAM info
 * is unavailable. */
__cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) {
  if (*cached_result == 0) {
    intptr_t pagesize, total_ram_pages;
    if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) !=
                 MDBX_SUCCESS))
      return *cached_result = MAX_MAPSIZE32 /* the 32-bit limit is good enough
                                               for fallback */
          ;

    /* if twice the RAM would overflow MAX_MAPSIZE, just use MAX_MAPSIZE */
    if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize))
      return *cached_result = MAX_MAPSIZE;
    assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2));

    /* Suggesting should not be more than golden ratio of the size of RAM.
     * (207 >> 7 == 207/128 ≈ 1.617, an integer approximation of φ) */
    *cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize;

    /* Round to the nearest human-readable granulation: try progressively
     * coarser units (1MiB, 32MiB, 1GiB, ...) while the rounding error stays
     * under 1/16 of the value, preferring to round down when that is closer
     * or when rounding up would exceed MAX_MAPSIZE. */
    for (size_t unit = MEGABYTE; unit; unit <<= 5) {
      const size_t floor = floor_powerof2(*cached_result, unit);
      const size_t ceil = ceil_powerof2(*cached_result, unit);
      const size_t threshold = (size_t)*cached_result >> 4;
      const bool down =
          *cached_result - floor < ceil - *cached_result || ceil > MAX_MAPSIZE;
      if (threshold < (down ? *cached_result - floor : ceil - *cached_result))
        break;
      *cached_result = down ? floor : ceil;
    }
  }
  return *cached_result;
}
 15183  
/* Configure (or re-configure) the database geometry: the lower/current/upper
 * map sizes, growth step, shrink threshold and page size.
 *
 * Argument conventions (visible from the normalization code below):
 *   - negative value: keep the current/default setting;
 *   - 0: use the minimal allowed value;
 *   - >= INTPTR_MAX (or INT_MAX for pagesize): use the maximal value.
 * Works both before the env is mapped (parameters are saved for the future
 * open/create) and on an opened env (the mapping is resized and the meta
 * page is updated).  Returns MDBX_SUCCESS or an error code. */
__cold LIBMDBX_API int
mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
                      intptr_t size_upper, intptr_t growth_step,
                      intptr_t shrink_threshold, intptr_t pagesize) {
  int rc = check_env(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* true when the calling thread owns the env's write transaction */
  const bool inside_txn =
      (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self());

#if MDBX_DEBUG
  if (growth_step < 0) {
    growth_step = 1;
    if (shrink_threshold < 0)
      shrink_threshold = 1;
  }
#endif /* MDBX_DEBUG */

  intptr_t reasonable_maxsize = 0; /* lazy cache for get_reasonable_db_maxsize */
  bool need_unlock = false;
  if (env->me_map) {
    /* env already mapped */
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;

    if (!inside_txn) {
      /* take the write-txn lock ourselves to get a stable view of metas */
      int err = mdbx_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      need_unlock = true;
      env->me_txn0->tw.troika = meta_tap(env);
      eASSERT(env, !env->me_txn && !env->me_txn0->mt_child);
      env->me_txn0->mt_txnid =
          env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent];
      txn_oldest_reader(env->me_txn0);
    }

    /* get untouched params from current TXN or DB */
    if (pagesize <= 0 || pagesize >= INT_MAX)
      pagesize = env->me_psize;
    const MDBX_geo *const geo =
        inside_txn ? &env->me_txn->mt_geo
                   : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo;
    if (size_lower < 0)
      size_lower = pgno2bytes(env, geo->lower);
    if (size_now < 0)
      size_now = pgno2bytes(env, geo->now);
    if (size_upper < 0)
      size_upper = pgno2bytes(env, geo->upper);
    if (growth_step < 0)
      growth_step = pgno2bytes(env, pv2pages(geo->grow_pv));
    if (shrink_threshold < 0)
      shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv));

    /* the page size of an opened database cannot be changed */
    if (pagesize != (intptr_t)env->me_psize) {
      rc = MDBX_EINVAL;
      goto bailout;
    }
    /* never shrink below the space actually in use by any live snapshot */
    const size_t usedbytes =
        pgno2bytes(env, find_largest_snapshot(env, geo->next));
    if ((size_t)size_upper < usedbytes) {
      rc = MDBX_MAP_FULL;
      goto bailout;
    }
    if ((size_t)size_now < usedbytes)
      size_now = usedbytes;
  } else {
    /* env NOT yet mapped */
    if (unlikely(inside_txn))
      return MDBX_PANIC;

    /* is requested some auto-value for pagesize ? */
    if (pagesize >= INT_MAX /* maximal */)
      pagesize = MAX_PAGESIZE;
    else if (pagesize <= 0) {
      if (pagesize < 0 /* default */) {
        pagesize = env->me_os_psize;
        if ((uintptr_t)pagesize > MAX_PAGESIZE)
          pagesize = MAX_PAGESIZE;
        eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE);
      } else if (pagesize == 0 /* minimal */)
        pagesize = MIN_PAGESIZE;

      /* choose pagesize: grow it until the requested maximum size fits
       * within MAX_PAGENO + 1 pages */
      intptr_t max_size = (size_now > size_lower) ? size_now : size_lower;
      max_size = (size_upper > max_size) ? size_upper : max_size;
      if (max_size < 0 /* default */)
        max_size = DEFAULT_MAPSIZE;
      else if (max_size == 0 /* minimal */)
        max_size = MIN_MAPSIZE;
      else if (max_size >= (intptr_t)MAX_MAPSIZE /* maximal */)
        max_size = get_reasonable_db_maxsize(&reasonable_maxsize);

      while (max_size > pagesize * (int64_t)(MAX_PAGENO + 1) &&
             pagesize < MAX_PAGESIZE)
        pagesize <<= 1;
    }
  }

  if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE ||
      !is_powerof2(pagesize)) {
    rc = MDBX_EINVAL;
    goto bailout;
  }

  /* normalize size_lower: minimal / maximal requests */
  if (size_lower <= 0) {
    size_lower = MIN_MAPSIZE;
    if (MIN_MAPSIZE / pagesize < MIN_PAGENO)
      size_lower = MIN_PAGENO * pagesize;
  }
  if (size_lower >= INTPTR_MAX) {
    size_lower = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_lower / pagesize > MAX_PAGENO + 1)
      size_lower = pagesize * (MAX_PAGENO + 1);
  }

  /* normalize size_now: defaults to size_lower, clamped to size_upper */
  if (size_now <= 0) {
    size_now = size_lower;
    if (size_upper >= size_lower && size_now > size_upper)
      size_now = size_upper;
  }
  if (size_now >= INTPTR_MAX) {
    size_now = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_now / pagesize > MAX_PAGENO + 1)
      size_now = pagesize * (MAX_PAGENO + 1);
  }

  /* normalize size_upper: pick a default with headroom above size_now */
  if (size_upper <= 0) {
    if (size_now >= get_reasonable_db_maxsize(&reasonable_maxsize) / 2)
      size_upper = get_reasonable_db_maxsize(&reasonable_maxsize);
    else if (MAX_MAPSIZE != MAX_MAPSIZE32 &&
             (size_t)size_now >= MAX_MAPSIZE32 / 2 &&
             (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3)
      size_upper = MAX_MAPSIZE32;
    else {
      size_upper = size_now + size_now;
      if ((size_t)size_upper < DEFAULT_MAPSIZE * 2)
        size_upper = DEFAULT_MAPSIZE * 2;
    }
    if ((size_t)size_upper / pagesize > (MAX_PAGENO + 1))
      size_upper = pagesize * (MAX_PAGENO + 1);
  } else if (size_upper >= INTPTR_MAX) {
    size_upper = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_upper / pagesize > MAX_PAGENO + 1)
      size_upper = pagesize * (MAX_PAGENO + 1);
  }

  if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) {
    rc = MDBX_EINVAL;
    goto bailout;
  }

  if ((uint64_t)size_lower / pagesize < MIN_PAGENO) {
    size_lower = pagesize * MIN_PAGENO;
    if (unlikely(size_lower > size_upper)) {
      rc = MDBX_EINVAL;
      goto bailout;
    }
    if (size_now < size_lower)
      size_now = size_lower;
  }

  if (unlikely((size_t)size_upper > MAX_MAPSIZE ||
               (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) {
    rc = MDBX_TOO_LARGE;
    goto bailout;
  }

  /* round all sizes up to the larger of the OS page and the DB page */
  const size_t unit = (env->me_os_psize > (size_t)pagesize) ? env->me_os_psize
                                                            : (size_t)pagesize;
  size_lower = ceil_powerof2(size_lower, unit);
  size_upper = ceil_powerof2(size_upper, unit);
  size_now = ceil_powerof2(size_now, unit);

  /* LY: adjust the size_upper value:
   *  - a multiple of the page size
   *  - without violating MAX_MAPSIZE and MAX_PAGENO */
  while (unlikely((size_t)size_upper > MAX_MAPSIZE ||
                  (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) {
    if ((size_t)size_upper < unit + MIN_MAPSIZE ||
        (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) {
      /* paranoia in case of overflow with improbable values */
      rc = MDBX_EINVAL;
      goto bailout;
    }
    size_upper -= unit;
    if ((size_t)size_upper < (size_t)size_lower)
      size_lower = size_upper;
  }
  eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0);

  if (size_now < size_lower)
    size_now = size_lower;
  if (size_now > size_upper)
    size_now = size_upper;

  /* default growth step: ~1/42 of the geometry span, clamped to
   * [64KiB, MAX_MAPSIZE/64] */
  if (growth_step < 0) {
    growth_step = ((size_t)(size_upper - size_lower)) / 42;
    if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE)
      growth_step = size_lower;
    if (growth_step < 65536)
      growth_step = 65536;
    if ((size_t)growth_step > MAX_MAPSIZE / 64)
      growth_step = MAX_MAPSIZE / 64;
  }
  if (growth_step == 0 && shrink_threshold > 0)
    growth_step = 1;
  growth_step = ceil_powerof2(growth_step, unit);

  if (shrink_threshold < 0)
    shrink_threshold = growth_step + growth_step;
  shrink_threshold = ceil_powerof2(shrink_threshold, unit);

  //----------------------------------------------------------------------------

  if (!env->me_map) {
    /* save user's geo-params for future open/create */
    if (pagesize != (intptr_t)env->me_psize)
      setup_pagesize(env, pagesize);
    env->me_dbgeo.lower = size_lower;
    env->me_dbgeo.now = size_now;
    env->me_dbgeo.upper = size_upper;
    /* round-trip through the packed page-value form so the stored bytes
     * match what will be persisted in the meta page */
    env->me_dbgeo.grow =
        pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step))));
    env->me_dbgeo.shrink =
        pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold))));

    ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE);
    ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO);
    ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0);
    ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0);

    ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE);
    ENSURE(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1);
    ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0);
    ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0);

    ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower);
    ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper);
    ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0);
    ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0);

    ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0);
    ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0);
    ENSURE(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0);
    ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0);

    rc = MDBX_SUCCESS;
  } else {
    /* apply new params to opened environment */
    ENSURE(env, pagesize == (intptr_t)env->me_psize);
    MDBX_meta meta;
    memset(&meta, 0, sizeof(meta));
    const MDBX_geo *current_geo;
    if (!inside_txn) {
      eASSERT(env, need_unlock);
      const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika);

      /* re-read the recent meta until it passes the coherency check */
      uint64_t timestamp = 0;
      while ("workaround for "
             "todo4recovery://erased_by_github/libmdbx/issues/269") {
        meta = *head.ptr_c;
        rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta,
                                    &timestamp);
        if (likely(rc == MDBX_SUCCESS))
          break;
        if (unlikely(rc != MDBX_RESULT_TRUE))
          goto bailout;
      }
      const txnid_t txnid = safe64_txnid_next(head.txnid);
      if (unlikely(txnid > MAX_TXNID)) {
        rc = MDBX_TXN_FULL;
        ERROR("txnid overflow, raise %d", rc);
        goto bailout;
      }
      meta_set_txnid(env, &meta, txnid);
      current_geo = &meta.mm_geo;
    } else {
      current_geo = &env->me_txn->mt_geo;
    }

    /* translate the byte sizes into a page-number geometry */
    MDBX_geo new_geo;
    new_geo.lower = bytes2pgno(env, size_lower);
    new_geo.now = bytes2pgno(env, size_now);
    new_geo.upper = bytes2pgno(env, size_upper);
    new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step));
    new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold));
    new_geo.next = current_geo->next;

    ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower);
    ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper);
    ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now);
    ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv)));
    ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv)));

    ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE);
    ENSURE(env, new_geo.lower >= MIN_PAGENO);
    ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE);
    ENSURE(env, new_geo.upper <= MAX_PAGENO + 1);
    ENSURE(env, new_geo.now >= new_geo.next);
    ENSURE(env, new_geo.upper >= new_geo.now);
    ENSURE(env, new_geo.now >= new_geo.lower);

    if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) {
#if defined(_WIN32) || defined(_WIN64)
      /* Was DB shrinking disabled before and now it will be enabled? */
      if (new_geo.lower < new_geo.upper && new_geo.shrink_pv &&
          !(current_geo->lower < current_geo->upper &&
            current_geo->shrink_pv)) {
        if (!env->me_lck_mmap.lck) {
          rc = MDBX_EPERM;
          goto bailout;
        }
        int err = osal_rdt_lock(env);
        if (unlikely(MDBX_IS_ERROR(err))) {
          rc = err;
          goto bailout;
        }

        /* Check if there are any reading threads that do not use the SRWL */
        const size_t CurrentTid = GetCurrentThreadId();
        const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
        const MDBX_reader *const end =
            begin + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders,
                                  mo_AcquireRelease);
        for (const MDBX_reader *reader = begin; reader < end; ++reader) {
          if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak &&
              reader->mr_tid.weak != CurrentTid) {
            /* At least one thread may don't use SRWL */
            rc = MDBX_EPERM;
            break;
          }
        }

        osal_rdt_unlock(env);
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      }
#endif

      if (new_geo.now != current_geo->now ||
          new_geo.upper != current_geo->upper) {
        rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper,
                        false);
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      }
      if (inside_txn) {
        /* defer persisting: the txn commit will write the new geometry */
        env->me_txn->mt_geo = new_geo;
        env->me_txn->mt_flags |= MDBX_TXN_DIRTY;
      } else {
        meta.mm_geo = new_geo;
        rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika);
      }

      if (likely(rc == MDBX_SUCCESS)) {
        /* store new geo to env to avoid influences */
        env->me_dbgeo.now = pgno2bytes(env, new_geo.now);
        env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower);
        env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper);
        env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv));
        env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv));
      }
    }
  }

bailout:
  if (need_unlock)
    mdbx_txn_unlock(env);
  return rc;
}
 15556  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Out-of-line exports for the legacy API: each function simply forwards to
 * the corresponding __inline_* implementation so that older binaries which
 * link these symbols keep working. */
__cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
  return __inline_mdbx_env_set_mapsize(env, size);
}

__cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
  return __inline_mdbx_env_set_maxdbs(env, dbs);
}

__cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
  return __inline_mdbx_env_get_maxdbs(env, dbs);
}

__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) {
  return __inline_mdbx_env_set_maxreaders(env, readers);
}

__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) {
  return __inline_mdbx_env_get_maxreaders(env, readers);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 15578  
 15579  __cold static int alloc_page_buf(MDBX_env *env) {
 15580    return env->me_pbuf
 15581               ? MDBX_SUCCESS
 15582               : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS,
 15583                                     &env->me_pbuf);
 15584  }
 15585  
 15586  /* Further setup required for opening an MDBX environment */
 15587  __cold static int setup_dxb(MDBX_env *env, const int lck_rc,
 15588                              const mdbx_mode_t mode_bits) {
 15589    MDBX_meta header;
 15590    int rc = MDBX_RESULT_FALSE;
 15591    int err = read_header(env, &header, lck_rc, mode_bits);
 15592    if (unlikely(err != MDBX_SUCCESS)) {
 15593      if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
 15594          (env->me_flags & MDBX_RDONLY) != 0 ||
 15595          /* recovery mode */ env->me_stuck_meta >= 0)
 15596        return err;
 15597  
 15598      DEBUG("%s", "create new database");
 15599      rc = /* new database */ MDBX_RESULT_TRUE;
 15600  
 15601      if (!env->me_dbgeo.now) {
 15602        /* set defaults if not configured */
 15603        err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1);
 15604        if (unlikely(err != MDBX_SUCCESS))
 15605          return err;
 15606      }
 15607  
 15608      err = alloc_page_buf(env);
 15609      if (unlikely(err != MDBX_SUCCESS))
 15610        return err;
 15611  
 15612      header = *init_metas(env, env->me_pbuf);
 15613      err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS,
 15614                        0);
 15615      if (unlikely(err != MDBX_SUCCESS))
 15616        return err;
 15617  
 15618      err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize =
 15619                                                env->me_dxb_mmap.current =
 15620                                                    env->me_dbgeo.now);
 15621      if (unlikely(err != MDBX_SUCCESS))
 15622        return err;
 15623  
 15624  #ifndef NDEBUG /* just for checking */
 15625      err = read_header(env, &header, lck_rc, mode_bits);
 15626      if (unlikely(err != MDBX_SUCCESS))
 15627        return err;
 15628  #endif
 15629    }
 15630  
 15631    VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO
 15632            "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN
 15633            ", %s",
 15634            header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root,
 15635            header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now,
 15636            header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv),
 15637            pv2pages(header.mm_geo.shrink_pv),
 15638            unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header));
 15639  
 15640    if (env->me_psize != header.mm_psize)
 15641      setup_pagesize(env, header.mm_psize);
 15642    const size_t used_bytes = pgno2bytes(env, header.mm_geo.next);
 15643    const size_t used_aligned2os_bytes =
 15644        ceil_powerof2(used_bytes, env->me_os_psize);
 15645    if ((env->me_flags & MDBX_RDONLY) /* readonly */
 15646        || lck_rc != MDBX_RESULT_TRUE /* not exclusive */
 15647        || /* recovery mode */ env->me_stuck_meta >= 0) {
 15648      /* use present params from db */
 15649      const size_t pagesize = header.mm_psize;
 15650      err = mdbx_env_set_geometry(
 15651          env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize,
 15652          header.mm_geo.upper * pagesize,
 15653          pv2pages(header.mm_geo.grow_pv) * pagesize,
 15654          pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize);
 15655      if (unlikely(err != MDBX_SUCCESS)) {
 15656        ERROR("%s: err %d", "could not apply preconfigured geometry from db",
 15657              err);
 15658        return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
 15659      }
 15660    } else if (env->me_dbgeo.now) {
 15661      /* silently growth to last used page */
 15662      if (env->me_dbgeo.now < used_aligned2os_bytes)
 15663        env->me_dbgeo.now = used_aligned2os_bytes;
 15664      if (env->me_dbgeo.upper < used_aligned2os_bytes)
 15665        env->me_dbgeo.upper = used_aligned2os_bytes;
 15666  
 15667      /* apply preconfigured params, but only if substantial changes:
 15668       *  - upper or lower limit changes
 15669       *  - shrink threshold or growth step
 15670       * But ignore change just a 'now/current' size. */
 15671      if (bytes_align2os_bytes(env, env->me_dbgeo.upper) !=
 15672              pgno2bytes(env, header.mm_geo.upper) ||
 15673          bytes_align2os_bytes(env, env->me_dbgeo.lower) !=
 15674              pgno2bytes(env, header.mm_geo.lower) ||
 15675          bytes_align2os_bytes(env, env->me_dbgeo.shrink) !=
 15676              pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) ||
 15677          bytes_align2os_bytes(env, env->me_dbgeo.grow) !=
 15678              pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) {
 15679  
 15680        if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes)
 15681          /* pre-shrink if enabled */
 15682          env->me_dbgeo.now = used_bytes + env->me_dbgeo.shrink -
 15683                              used_bytes % env->me_dbgeo.shrink;
 15684  
 15685        err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now,
 15686                                    env->me_dbgeo.upper, env->me_dbgeo.grow,
 15687                                    env->me_dbgeo.shrink, header.mm_psize);
 15688        if (unlikely(err != MDBX_SUCCESS)) {
 15689          ERROR("%s: err %d", "could not apply preconfigured db-geometry", err);
 15690          return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
 15691        }
 15692  
 15693        /* update meta fields */
 15694        header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now);
 15695        header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
 15696        header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
 15697        header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow));
 15698        header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink));
 15699  
 15700        VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO
 15701                "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
 15702                " +%u -%u, txn_id %" PRIaTXN ", %s",
 15703                header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root,
 15704                header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now,
 15705                header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv),
 15706                pv2pages(header.mm_geo.shrink_pv),
 15707                unaligned_peek_u64(4, header.mm_txnid_a),
 15708                durable_caption(&header));
 15709      } else {
 15710        /* fetch back 'now/current' size, since it was ignored during comparison
 15711         * and may differ. */
 15712        env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now);
 15713      }
 15714      ENSURE(env, header.mm_geo.now >= header.mm_geo.next);
 15715    } else {
 15716      /* geo-params are not pre-configured by user,
 15717       * get current values from the meta. */
 15718      env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now);
 15719      env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower);
 15720      env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper);
 15721      env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv));
 15722      env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv));
 15723    }
 15724  
 15725    ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now);
 15726    ENSURE(env, env->me_dbgeo.now >= used_bytes);
 15727    const uint64_t filesize_before = env->me_dxb_mmap.filesize;
 15728    if (unlikely(filesize_before != env->me_dbgeo.now)) {
 15729      if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
 15730        VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO
 15731                "p, have %" PRIu64 "b/%" PRIaPGNO "p), "
 15732                "assume other process working",
 15733                env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now),
 15734                filesize_before, bytes2pgno(env, (size_t)filesize_before));
 15735      } else {
 15736        WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO
 15737                "p, have %" PRIu64 "b/%" PRIaPGNO "p)",
 15738                env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now),
 15739                filesize_before, bytes2pgno(env, (size_t)filesize_before));
 15740        if (filesize_before < used_bytes) {
 15741          ERROR("last-page beyond end-of-file (last %" PRIaPGNO
 15742                ", have %" PRIaPGNO ")",
 15743                header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before));
 15744          return MDBX_CORRUPTED;
 15745        }
 15746  
 15747        if (env->me_flags & MDBX_RDONLY) {
 15748          if (filesize_before & (env->me_os_psize - 1)) {
 15749            ERROR("%s", "filesize should be rounded-up to system page");
 15750            return MDBX_WANNA_RECOVERY;
 15751          }
 15752          WARNING("%s", "ignore filesize mismatch in readonly-mode");
 15753        } else {
 15754          VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO
 15755                  " pages",
 15756                  env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now));
 15757        }
 15758      }
 15759    }
 15760  
 15761    VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x,
 15762            bootid.y, (bootid.x | bootid.y) ? "" : "not-");
 15763  
 15764  #if MDBX_ENABLE_MADVISE
 15765    /* calculate readahead hint before mmap with zero redundant pages */
 15766    const bool readahead =
 15767        !(env->me_flags & MDBX_NORDAHEAD) &&
 15768        mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE;
 15769  #endif /* MDBX_ENABLE_MADVISE */
 15770  
 15771    err = osal_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
 15772                    env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0);
 15773    if (unlikely(err != MDBX_SUCCESS))
 15774      return err;
 15775  
 15776  #if MDBX_ENABLE_MADVISE
 15777  #if defined(MADV_DONTDUMP)
 15778    err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP)
 15779              ? ignore_enosys(errno)
 15780              : MDBX_SUCCESS;
 15781    if (unlikely(MDBX_IS_ERROR(err)))
 15782      return err;
 15783  #endif /* MADV_DONTDUMP */
 15784  #if defined(MADV_DODUMP)
 15785    if (runtime_flags & MDBX_DBG_DUMP) {
 15786      const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS);
 15787      err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP)
 15788                ? ignore_enosys(errno)
 15789                : MDBX_SUCCESS;
 15790      if (unlikely(MDBX_IS_ERROR(err)))
 15791        return err;
 15792    }
 15793  #endif /* MADV_DODUMP */
 15794  #endif /* MDBX_ENABLE_MADVISE */
 15795  
 15796  #ifdef MDBX_USE_VALGRIND
 15797    env->me_valgrind_handle =
 15798        VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
 15799  #endif /* MDBX_USE_VALGRIND */
 15800  
 15801    eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) &&
 15802                     used_bytes <= env->me_dxb_mmap.limit);
 15803  #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
 15804    if (env->me_dxb_mmap.filesize > used_bytes &&
 15805        env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) {
 15806      VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes,
 15807                                 env->me_dxb_mmap.filesize - used_bytes);
 15808      MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes,
 15809                                     env->me_dxb_mmap.filesize - used_bytes);
 15810    }
 15811    env->me_poison_edge =
 15812        bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit)
 15813                            ? env->me_dxb_mmap.filesize
 15814                            : env->me_dxb_mmap.limit);
 15815  #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
 15816  
 15817    meta_troika_t troika = meta_tap(env);
 15818  #if MDBX_DEBUG
 15819    meta_troika_dump(env, &troika);
 15820  #endif
 15821    eASSERT(env, !env->me_txn && !env->me_txn0);
 15822    //-------------------------------- validate/rollback head & steady meta-pages
 15823    if (unlikely(env->me_stuck_meta >= 0)) {
 15824      /* recovery mode */
 15825      MDBX_meta clone;
 15826      MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta);
 15827      err = validate_meta_copy(env, target, &clone);
 15828      if (unlikely(err != MDBX_SUCCESS)) {
 15829        ERROR("target meta[%u] is corrupted",
 15830              bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map));
 15831        meta_troika_dump(env, &troika);
 15832        return MDBX_CORRUPTED;
 15833      }
 15834    } else /* not recovery mode */
 15835      while (1) {
 15836        const unsigned meta_clash_mask = meta_eq_mask(&troika);
 15837        if (unlikely(meta_clash_mask)) {
 15838          ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask);
 15839          meta_troika_dump(env, &troika);
 15840          return MDBX_CORRUPTED;
 15841        }
 15842  
 15843        if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
 15844          /* non-exclusive mode,
 15845           * meta-pages should be validated by a first process opened the DB */
 15846          if (troika.recent == troika.prefer_steady)
 15847            break;
 15848  
 15849          if (!env->me_lck_mmap.lck) {
 15850            /* LY: without-lck (read-only) mode, so it is impossible that other
 15851             * process made weak checkpoint. */
 15852            ERROR("%s", "without-lck, unable recovery/rollback");
 15853            meta_troika_dump(env, &troika);
 15854            return MDBX_WANNA_RECOVERY;
 15855          }
 15856  
 15857          /* LY: assume just have a collision with other running process,
 15858           *     or someone make a weak checkpoint */
 15859          VERBOSE("%s", "assume collision or online weak checkpoint");
 15860          break;
 15861        }
 15862        eASSERT(env, lck_rc == MDBX_RESULT_TRUE);
 15863        /* exclusive mode */
 15864  
 15865        const meta_ptr_t recent = meta_recent(env, &troika);
 15866        const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika);
 15867        MDBX_meta clone;
 15868        if (prefer_steady.is_steady) {
 15869          err = validate_meta_copy(env, prefer_steady.ptr_c, &clone);
 15870          if (unlikely(err != MDBX_SUCCESS)) {
 15871            ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed",
 15872                  bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map),
 15873                  "steady", prefer_steady.txnid, "manual recovery");
 15874            meta_troika_dump(env, &troika);
 15875            return MDBX_CORRUPTED;
 15876          }
 15877          if (prefer_steady.ptr_c == recent.ptr_c)
 15878            break;
 15879        }
 15880  
 15881        const pgno_t pgno =
 15882            bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map);
 15883        const bool last_valid =
 15884            validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS;
 15885        eASSERT(env,
 15886                !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid);
 15887        if (unlikely(!last_valid)) {
 15888          if (unlikely(!prefer_steady.is_steady)) {
 15889            ERROR("%s for open or automatic rollback, %s",
 15890                  "there are no suitable meta-pages",
 15891                  "manual recovery is required");
 15892            meta_troika_dump(env, &troika);
 15893            return MDBX_CORRUPTED;
 15894          }
 15895          WARNING("meta[%u] with last txnid %" PRIaTXN
 15896                  " is corrupted, rollback needed",
 15897                  pgno, recent.txnid);
 15898          meta_troika_dump(env, &troika);
 15899          goto purge_meta_head;
 15900        }
 15901  
 15902        if (meta_bootid_match(recent.ptr_c)) {
 15903          if (env->me_flags & MDBX_RDONLY) {
 15904            ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
 15905                  "rollback NOT needed, steady-sync NEEDED%s",
 15906                  "opening after an unclean shutdown", bootid.x, bootid.y,
 15907                  ", but unable in read-only mode");
 15908            meta_troika_dump(env, &troika);
 15909            return MDBX_WANNA_RECOVERY;
 15910          }
 15911          WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
 15912                  "rollback NOT needed, steady-sync NEEDED%s",
 15913                  "opening after an unclean shutdown", bootid.x, bootid.y, "");
 15914          header = clone;
 15915          atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next,
 15916                         mo_Relaxed);
 15917          break;
 15918        }
 15919        if (unlikely(!prefer_steady.is_steady)) {
 15920          ERROR("%s, but %s for automatic rollback: %s",
 15921                "opening after an unclean shutdown",
 15922                "there are no suitable meta-pages",
 15923                "manual recovery is required");
 15924          meta_troika_dump(env, &troika);
 15925          return MDBX_CORRUPTED;
 15926        }
 15927        if (env->me_flags & MDBX_RDONLY) {
 15928          ERROR("%s and rollback needed: (from head %" PRIaTXN
 15929                " to steady %" PRIaTXN ")%s",
 15930                "opening after an unclean shutdown", recent.txnid,
 15931                prefer_steady.txnid, ", but unable in read-only mode");
 15932          meta_troika_dump(env, &troika);
 15933          return MDBX_WANNA_RECOVERY;
 15934        }
 15935  
 15936      purge_meta_head:
 15937        NOTICE("%s and doing automatic rollback: "
 15938               "purge%s meta[%u] with%s txnid %" PRIaTXN,
 15939               "opening after an unclean shutdown", last_valid ? "" : " invalid",
 15940               pgno, last_valid ? " weak" : "", recent.txnid);
 15941        meta_troika_dump(env, &troika);
 15942        ENSURE(env, prefer_steady.is_steady);
 15943        err = override_meta(env, pgno, 0,
 15944                            last_valid ? recent.ptr_c : prefer_steady.ptr_c);
 15945        if (err) {
 15946          ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d",
 15947                pgno, recent.txnid, err);
 15948          return err;
 15949        }
 15950        troika = meta_tap(env);
 15951        ENSURE(env, 0 == meta_txnid(recent.ptr_v));
 15952        ENSURE(env, 0 == meta_eq_mask(&troika));
 15953      }
 15954  
 15955    if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
 15956      //-------------------------------------------------- shrink DB & update geo
 15957      /* re-check size after mmap */
 15958      if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 ||
 15959          env->me_dxb_mmap.current < used_bytes) {
 15960        ERROR("unacceptable/unexpected datafile size %" PRIuPTR,
 15961              env->me_dxb_mmap.current);
 15962        return MDBX_PROBLEM;
 15963      }
 15964      if (env->me_dxb_mmap.current != env->me_dbgeo.now) {
 15965        header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current);
 15966        NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO
 15967               " pages",
 15968               env->me_dxb_mmap.current, header.mm_geo.now);
 15969      }
 15970  
 15971      const meta_ptr_t recent = meta_recent(env, &troika);
 15972      if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, sizeof(header.mm_geo))) {
 15973        if ((env->me_flags & MDBX_RDONLY) != 0 ||
 15974            /* recovery mode */ env->me_stuck_meta >= 0) {
 15975          WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO
 15976                  "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO
 15977                  "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u",
 15978                  (env->me_stuck_meta < 0) ? "read-only" : "recovery",
 15979                  recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now,
 15980                  recent.ptr_c->mm_geo.upper,
 15981                  pv2pages(recent.ptr_c->mm_geo.shrink_pv),
 15982                  pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower,
 15983                  header.mm_geo.now, header.mm_geo.upper,
 15984                  pv2pages(header.mm_geo.shrink_pv),
 15985                  pv2pages(header.mm_geo.grow_pv));
 15986        } else {
 15987          const txnid_t next_txnid = safe64_txnid_next(recent.txnid);
 15988          if (unlikely(next_txnid > MAX_TXNID)) {
 15989            ERROR("txnid overflow, raise %d", MDBX_TXN_FULL);
 15990            return MDBX_TXN_FULL;
 15991          }
 15992          NOTICE("updating meta.geo: "
 15993                 "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
 15994                 "/s%u-g%u (txn#%" PRIaTXN "), "
 15995                 "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
 15996                 "/s%u-g%u (txn#%" PRIaTXN ")",
 15997                 recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now,
 15998                 recent.ptr_c->mm_geo.upper,
 15999                 pv2pages(recent.ptr_c->mm_geo.shrink_pv),
 16000                 pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid,
 16001                 header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper,
 16002                 pv2pages(header.mm_geo.shrink_pv),
 16003                 pv2pages(header.mm_geo.grow_pv), next_txnid);
 16004  
 16005          ENSURE(env, header.unsafe_txnid == recent.txnid);
 16006          meta_set_txnid(env, &header, next_txnid);
 16007          err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header,
 16008                            &troika);
 16009          if (err) {
 16010            ERROR("error %d, while updating meta.geo: "
 16011                  "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
 16012                  "/s%u-g%u (txn#%" PRIaTXN "), "
 16013                  "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
 16014                  "/s%u-g%u (txn#%" PRIaTXN ")",
 16015                  err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now,
 16016                  recent.ptr_c->mm_geo.upper,
 16017                  pv2pages(recent.ptr_c->mm_geo.shrink_pv),
 16018                  pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid,
 16019                  header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper,
 16020                  pv2pages(header.mm_geo.shrink_pv),
 16021                  pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid);
 16022            return err;
 16023          }
 16024        }
 16025      }
 16026  
 16027      atomic_store32(&env->me_lck->mti_discarded_tail,
 16028                     bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed);
 16029  
 16030      if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 &&
 16031          (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) {
 16032        for (int n = 0; n < NUM_METAS; ++n) {
 16033          MDBX_meta *const meta = METAPAGE(env, n);
 16034          if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) !=
 16035                       MDBX_DATA_MAGIC)) {
 16036            const txnid_t txnid = constmeta_txnid(meta);
 16037            NOTICE("%s %s"
 16038                   "meta[%u], txnid %" PRIaTXN,
 16039                   "updating db-format signature for",
 16040                   META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid);
 16041            err = override_meta(env, n, txnid, meta);
 16042            if (unlikely(err != MDBX_SUCCESS) &&
 16043                /* Just ignore the MDBX_PROBLEM error, since here it is
 16044                 * returned only in case of the attempt to upgrade an obsolete
 16045                 * meta-page that is invalid for current state of a DB,
 16046                 * e.g. after shrinking DB file */
 16047                err != MDBX_PROBLEM) {
 16048              ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d",
 16049                    "updating db-format signature for", n, txnid, err);
 16050              return err;
 16051            }
 16052            troika = meta_tap(env);
 16053          }
 16054        }
 16055      }
 16056    } /* lck exclusive, lck_rc == MDBX_RESULT_TRUE */
 16057  
 16058    //---------------------------------------------------- setup madvise/readahead
 16059  #if MDBX_ENABLE_MADVISE
 16060    if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
 16061  #if defined(MADV_REMOVE)
 16062      if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 &&
 16063          /* not recovery mode */ env->me_stuck_meta < 0) {
 16064        NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)",
 16065               env->me_lck->mti_discarded_tail.weak,
 16066               bytes2pgno(env, env->me_dxb_mmap.current));
 16067        err =
 16068            madvise(env->me_map + used_aligned2os_bytes,
 16069                    env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE)
 16070                ? ignore_enosys(errno)
 16071                : MDBX_SUCCESS;
 16072        if (unlikely(MDBX_IS_ERROR(err)))
 16073          return err;
 16074      }
 16075  #endif /* MADV_REMOVE */
 16076  #if defined(MADV_DONTNEED)
 16077      NOTICE("open-MADV_%s %u..%u", "DONTNEED",
 16078             env->me_lck->mti_discarded_tail.weak,
 16079             bytes2pgno(env, env->me_dxb_mmap.current));
 16080      err =
 16081          madvise(env->me_map + used_aligned2os_bytes,
 16082                  env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED)
 16083              ? ignore_enosys(errno)
 16084              : MDBX_SUCCESS;
 16085      if (unlikely(MDBX_IS_ERROR(err)))
 16086        return err;
 16087  #elif defined(POSIX_MADV_DONTNEED)
 16088      err = ignore_enosys(posix_madvise(
 16089          env->me_map + used_aligned2os_bytes,
 16090          env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED));
 16091      if (unlikely(MDBX_IS_ERROR(err)))
 16092        return err;
 16093  #elif defined(POSIX_FADV_DONTNEED)
 16094      err = ignore_enosys(posix_fadvise(
 16095          env->me_lazy_fd, used_aligned2os_bytes,
 16096          env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED));
 16097      if (unlikely(MDBX_IS_ERROR(err)))
 16098        return err;
 16099  #endif /* MADV_DONTNEED */
 16100    }
 16101  
 16102    err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true);
 16103    if (unlikely(err != MDBX_SUCCESS))
 16104      return err;
 16105  #endif /* MDBX_ENABLE_MADVISE */
 16106  
 16107    return rc;
 16108  }
 16109  
 16110  /******************************************************************************/
 16111  
/* Open and/or initialize the lock region for the environment.
 *
 * lck_pathname - path of the lck-file to open/create.
 * mode         - file mode used if the lck-file must be created.
 *
 * Returns MDBX_RESULT_TRUE when this process seized the exclusive lock and
 * (re)initialized the lock region, MDBX_RESULT_FALSE for cooperative mode,
 * or an MDBX error code. When the lck-file cannot be opened at all (e.g.
 * read-only filesystem or exclusive mode), falls back to "lck-less"
 * operation using an in-process stub lockinfo. */
__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname,
                            mdbx_mode_t mode) {
  eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE);
  eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE);

  int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode);
  if (err != MDBX_SUCCESS) {
    /* Decide whether the open failure is tolerable: only specific errors in
     * read-only or exclusive modes may proceed without a usable lck-file. */
    switch (err) {
    default:
      return err;
    case MDBX_ENOFILE:
    case MDBX_EACCESS:
    case MDBX_EPERM:
      if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE))
        return err;
      break;
    case MDBX_EROFS:
      if ((env->me_flags & MDBX_RDONLY) == 0)
        return err;
      break;
    }

    if (err != MDBX_ENOFILE) {
      /* ENSURE the file system is read-only */
      err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err);
      if (err != MDBX_SUCCESS &&
          /* ignore ERROR_NOT_SUPPORTED for exclusive mode */
          !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE)))
        return err;
    }

    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
    /* beginning of a locked section ---------------------------------------- */
    lcklist_lock();
    eASSERT(env, env->me_lcklist_next == nullptr);
    env->me_lfd = INVALID_HANDLE_VALUE;
    const int rc = osal_lck_seize(env);
    if (MDBX_IS_ERROR(rc)) {
      /* Calling lcklist_detach_locked() is required to restore POSIX-filelock
       * and this job will be done by env_close(). */
      lcklist_unlock();
      return rc;
    }
    /* insert into inprocess lck-list */
    env->me_lcklist_next = inprocess_lcklist_head;
    inprocess_lcklist_head = env;
    lcklist_unlock();
    /* end of a locked section ---------------------------------------------- */

    /* lck-less mode: use a per-process stub lockinfo and no reader limit */
    env->me_lck = lckless_stub(env);
    env->me_maxreaders = UINT_MAX;
    DEBUG("lck-setup:%s%s%s", " lck-less",
          (env->me_flags & MDBX_RDONLY) ? " readonly" : "",
          (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
    return rc;
  }

  /* beginning of a locked section ------------------------------------------ */
  lcklist_lock();
  eASSERT(env, env->me_lcklist_next == nullptr);

  /* Try to get exclusive lock. If we succeed, then
   * nobody is using the lock region and we should initialize it. */
  err = osal_lck_seize(env);
  if (MDBX_IS_ERROR(err)) {
  bailout:
    /* Calling lcklist_detach_locked() is required to restore POSIX-filelock
     * and this job will be done by env_close(). */
    lcklist_unlock();
    return err;
  }

  /* In exclusive mode, check no other MDBX_env in this process has the same
   * DB open (unless legacy multi-open debugging is explicitly allowed). */
  MDBX_env *inprocess_neighbor = nullptr;
  if (err == MDBX_RESULT_TRUE) {
    err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor);
    if (MDBX_IS_ERROR(err))
      goto bailout;
    if (inprocess_neighbor &&
        ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 ||
         (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) {
      err = MDBX_BUSY;
      goto bailout;
    }
  }
  const int lck_seize_rc = err;

  DEBUG("lck-setup:%s%s%s", " with-lck",
        (env->me_flags & MDBX_RDONLY) ? " readonly" : "",
        (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");

  uint64_t size = 0;
  err = osal_filesize(env->me_lfd, &size);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* exclusive: we will (re)create the region, so compute its size from
     * the configured maxreaders, rounded up to the OS page size */
    size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) +
                             sizeof(MDBX_lockinfo),
                         env->me_os_psize);
    jitter4testing(false);
  } else {
    if (env->me_flags & MDBX_EXCLUSIVE) {
      err = MDBX_BUSY;
      goto bailout;
    }
    /* cooperative: the existing lck-file size must be sane and page-aligned */
    if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 ||
        size < env->me_os_psize) {
      ERROR("lck-file has invalid size %" PRIu64 " bytes", size);
      err = MDBX_PROBLEM;
      goto bailout;
    }
  }

  /* derive the usable reader-slot count from the region size */
  const size_t maxreaders =
      ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader);
  if (maxreaders < 4) {
    ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders);
    err = MDBX_PROBLEM;
    goto bailout;
  }
  env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT)
                           ? (unsigned)maxreaders
                           : (unsigned)MDBX_READERS_LIMIT;

  /* map the lock region writable; truncate it only when we own it */
  err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP,
                  &env->me_lck_mmap, (size_t)size, (size_t)size,
                  lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE
                               : MMAP_OPTION_SEMAPHORE);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DODUMP
  /* keep the lock region in core dumps; ENOSYS-like errors are ignored */
  err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno)
                                                         : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#endif /* MADV_DODUMP */

#ifdef MADV_WILLNEED
  /* hint the kernel to pre-fault the lock region */
  err = madvise(env->me_lck_mmap.lck, size, MADV_WILLNEED)
            ? ignore_enosys(errno)
            : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#elif defined(POSIX_MADV_WILLNEED)
  err = ignore_enosys(
      posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED));
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#endif /* MADV_WILLNEED */
#endif /* MDBX_ENABLE_MADVISE */

  struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* LY: exclusive mode, check and reset lck content */
    memset(lck, 0, (size_t)size);
    jitter4testing(false);
    lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
    lck->mti_os_and_format = MDBX_LOCK_FORMAT;
#if MDBX_ENABLE_PGOP_STAT
    lck->mti_pgop_stat.wops.weak = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* persist the freshly initialized region before anyone else may map it */
    err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE);
    if (unlikely(err != MDBX_SUCCESS)) {
      ERROR("initial-%s for lck-file failed", "msync");
      goto bailout;
    }
    err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE);
    if (unlikely(err != MDBX_SUCCESS)) {
      ERROR("initial-%s for lck-file failed", "fsync");
      goto bailout;
    }
  } else {
    /* cooperative mode: validate the region written by another process */
    if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
      const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC;
      ERROR("lock region has %s",
            invalid
                ? "invalid magic"
                : "incompatible version (only applications with nearly or the "
                  "same versions of libmdbx can share the same database)");
      err = invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH;
      goto bailout;
    }
    if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) {
      ERROR("lock region has os/format signature 0x%" PRIx32
            ", expected 0x%" PRIx32,
            lck->mti_os_and_format, MDBX_LOCK_FORMAT);
      err = MDBX_VERSION_MISMATCH;
      goto bailout;
    }
  }

  err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc);
  if (MDBX_IS_ERROR(err))
    goto bailout;

  ENSURE(env, env->me_lcklist_next == nullptr);
  /* insert into inprocess lck-list */
  env->me_lcklist_next = inprocess_lcklist_head;
  inprocess_lcklist_head = env;
  lcklist_unlock();
  /* end of a locked section ------------------------------------------------ */

  eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc));
  env->me_lck = lck;
  return lck_seize_rc;
}
 16321  
 16322  __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
 16323    if (volume <= 1024 * 1024 * 4ul)
 16324      return MDBX_RESULT_TRUE;
 16325  
 16326    intptr_t pagesize, total_ram_pages;
 16327    int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr);
 16328    if (unlikely(err != MDBX_SUCCESS))
 16329      return err;
 16330  
 16331    const int log2page = log2n_powerof2(pagesize);
 16332    const intptr_t volume_pages = (volume + pagesize - 1) >> log2page;
 16333    const intptr_t redundancy_pages =
 16334        (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page)
 16335                         : (intptr_t)(redundancy + pagesize - 1) >> log2page;
 16336    if (volume_pages >= total_ram_pages ||
 16337        volume_pages + redundancy_pages >= total_ram_pages)
 16338      return MDBX_RESULT_FALSE;
 16339  
 16340    intptr_t avail_ram_pages;
 16341    err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages);
 16342    if (unlikely(err != MDBX_SUCCESS))
 16343      return err;
 16344  
 16345    return (volume_pages + redundancy_pages >= avail_ram_pages)
 16346               ? MDBX_RESULT_FALSE
 16347               : MDBX_RESULT_TRUE;
 16348  }
 16349  
 16350  /* Merge sync flags */
 16351  static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) {
 16352    uint32_t r = a | b;
 16353  
 16354    /* avoid false MDBX_UTTERLY_NOSYNC */
 16355    if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
 16356        !F_ISSET(b, MDBX_UTTERLY_NOSYNC))
 16357      r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
 16358  
 16359    /* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
 16360    if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) ==
 16361            (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC) &&
 16362        !F_ISSET(r, MDBX_UTTERLY_NOSYNC))
 16363      r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
 16364  
 16365    /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */
 16366    if (r & MDBX_SAFE_NOSYNC)
 16367      r |= MDBX_NOMETASYNC;
 16368  
 16369    assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) &&
 16370             !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
 16371             !F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
 16372    return r;
 16373  }
 16374  
/* Forcibly (over)write meta-page `target` with a freshly synthesized meta.
 *
 * Used by recovery/rollback paths: builds a pristine meta-page model stamped
 * with `txnid`, optionally clones geometry, sub-db records, canary and
 * retired-pages counters from `shape`, validates it, and writes it to the
 * datafile — via the write-map or via pwrite+fsync.
 *
 * env    - the environment; per the comment below, callers hold an
 *          exclusive lock on the DB file.
 * target - index of the meta-page to overwrite.
 * txnid  - transaction id for the new meta (0 when purging a meta-page).
 * shape  - optional template meta to clone content from (may be NULL).
 *
 * Returns MDBX_SUCCESS, MDBX_PROBLEM when coherency/validation checks fail,
 * or an I/O error code. */
__cold static int __must_check_result override_meta(MDBX_env *env,
                                                    unsigned target,
                                                    txnid_t txnid,
                                                    const MDBX_meta *shape) {
  /* ensure the scratch page buffer (env->me_pbuf) is allocated */
  int rc = alloc_page_buf(env);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  MDBX_page *const page = env->me_pbuf;
  /* start from a pristine model of meta-page `target` */
  meta_model(env, page, target);
  MDBX_meta *const model = page_meta(page);
  meta_set_txnid(env, model, txnid);
  eASSERT(env, coherency_check_meta(env, model, true));
  if (shape) {
    /* pre-check: refuse to clone from an incoherent template */
    if (txnid && unlikely(!coherency_check_meta(env, shape, false))) {
      ERROR("bailout overriding meta-%u since model failed "
            "freedb/maindb %s-check for txnid #%" PRIaTXN,
            target, "pre", constmeta_txnid(shape));
      return MDBX_PROBLEM;
    }
    if (runtime_flags & MDBX_DBG_DONT_UPGRADE)
      memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version,
             sizeof(model->mm_magic_and_version));
    /* clone payload fields from the template meta */
    model->mm_extra_flags = shape->mm_extra_flags;
    model->mm_validator_id = shape->mm_validator_id;
    model->mm_extra_pagehdr = shape->mm_extra_pagehdr;
    memcpy(&model->mm_geo, &shape->mm_geo, sizeof(model->mm_geo));
    memcpy(&model->mm_dbs, &shape->mm_dbs, sizeof(model->mm_dbs));
    memcpy(&model->mm_canary, &shape->mm_canary, sizeof(model->mm_canary));
    memcpy(&model->mm_pages_retired, &shape->mm_pages_retired,
           sizeof(model->mm_pages_retired));
    if (txnid) {
      /* keep the template's magic/version when a sub-db has a root but no
       * mod-txnid — presumably an older on-disk format; TODO confirm */
      if ((!model->mm_dbs[FREE_DBI].md_mod_txnid &&
           model->mm_dbs[FREE_DBI].md_root != P_INVALID) ||
          (!model->mm_dbs[MAIN_DBI].md_mod_txnid &&
           model->mm_dbs[MAIN_DBI].md_root != P_INVALID))
        memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version,
               sizeof(model->mm_magic_and_version));
      /* post-check: the assembled model itself must be coherent */
      if (unlikely(!coherency_check_meta(env, model, false))) {
        ERROR("bailout overriding meta-%u since model failed "
              "freedb/maindb %s-check for txnid #%" PRIaTXN,
              target, "post", txnid);
        return MDBX_PROBLEM;
      }
    }
  }
  /* seal the model with its signature, then validate the whole page */
  unaligned_poke_u64(4, model->mm_sign, meta_sign(model));
  rc = validate_meta(env, model, page, target, nullptr);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return MDBX_PROBLEM;

  /* nothing to write if the model already equals the template byte-for-byte */
  if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0)
    return MDBX_SUCCESS;

#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  if (env->me_flags & MDBX_WRITEMAP) {
    /* flush data pages up to the model's end-of-used-space before the new
     * meta becomes visible */
    rc = osal_msync(&env->me_dxb_mmap, 0,
                    pgno_align2os_bytes(env, model->mm_geo.next),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* override_meta() called only while current process have exclusive
     * lock of a DB file. So meta-page could be updated directly without
     * clearing consistency flag by mdbx_meta_update_begin() */
    memcpy(pgno2page(env, target), page, env->me_psize);
    osal_flush_incoherent_cpu_writeback();
    rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  } else {
    /* no write-map: write via the dsync fd when available, otherwise via
     * the lazy fd followed by an explicit fsync */
    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                     ? env->me_dsync_fd
                                     : env->me_lazy_fd;
    rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target));
    if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd)
      rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  }
  /* defeat possible mmap incoherence for the meta-pages region */
  osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
                             env->me_os_psize);
  eASSERT(env, !env->me_txn && !env->me_txn0);
  return rc;
}
 16457  
 16458  __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) {
 16459    if (unlikely(target >= NUM_METAS))
 16460      return MDBX_EINVAL;
 16461    int rc = check_env(env, true);
 16462    if (unlikely(rc != MDBX_SUCCESS))
 16463      return rc;
 16464  
 16465    if (unlikely((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) !=
 16466                 MDBX_EXCLUSIVE))
 16467      return MDBX_EPERM;
 16468  
 16469    const MDBX_meta *target_meta = METAPAGE(env, target);
 16470    txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta));
 16471    for (unsigned n = 0; n < NUM_METAS; ++n) {
 16472      if (n == target)
 16473        continue;
 16474      MDBX_meta meta = *METAPAGE(env, target);
 16475      if (validate_meta(env, &meta, pgno2page(env, n), n, nullptr) !=
 16476          MDBX_SUCCESS) {
 16477        int err = override_meta(env, n, 0, nullptr);
 16478        if (unlikely(err != MDBX_SUCCESS))
 16479          return err;
 16480      } else {
 16481        txnid_t txnid = constmeta_txnid(&meta);
 16482        if (new_txnid <= txnid)
 16483          new_txnid = safe64_txnid_next(txnid);
 16484      }
 16485    }
 16486  
 16487    if (unlikely(new_txnid > MAX_TXNID)) {
 16488      ERROR("txnid overflow, raise %d", MDBX_TXN_FULL);
 16489      return MDBX_TXN_FULL;
 16490    }
 16491    return override_meta(env, target, new_txnid, target_meta);
 16492  }
 16493  
/* Open the environment for recovery, pinning the given meta-page number
 * as the one to use; must be called on a not-yet-opened env and forces
 * MDBX_EXCLUSIVE (plus MDBX_RDONLY unless `writeable`). */
__cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
                                      unsigned target_meta, bool writeable) {
#if defined(_WIN32) || defined(_WIN64)
  /* Windows: convert the multibyte path and delegate to the wide-char API */
  const wchar_t *pathnameW = nullptr;
  OSAL_MB2WIDE(pathname, pathnameW);
  return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable);
}

__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname,
                                       unsigned target_meta, bool writeable) {
#endif /* Windows */

  if (unlikely(target_meta >= NUM_METAS))
    return MDBX_EINVAL;
  int rc = check_env(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* refuse if the env is already mapped/open */
  if (unlikely(env->me_map))
    return MDBX_EPERM;

  /* remember the pinned meta-page; mdbx_env_open() checks this field
   * and enforces exclusive mode for recovery */
  env->me_stuck_meta = (int8_t)target_meta;
  return
#if defined(_WIN32) || defined(_WIN64)
      mdbx_env_openW
#else
      mdbx_env_open
#endif /* Windows */
      (env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY,
       0);
}
 16524  
/* Scratch context filled by handle_env_pathname(): the derived data-file
 * (dxb) and lock-file (lck) pathnames, both carved out of one heap
 * allocation that the caller must release via osal_free(). */
typedef struct {
  void *buffer_for_free; /* the single allocation backing both paths */
  pathchar_t *lck, *dxb; /* lock-file and data-file pathnames */
  size_t ent_len;        /* length of the base path, in characters */
} MDBX_handle_env_pathname;
 16530  
 16531  static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) {
 16532  #if defined(_WIN32) || defined(_WIN64)
 16533    while (len > 0) {
 16534      pathchar_t a = *l++;
 16535      pathchar_t b = *r++;
 16536      a = (a == '\\') ? '/' : a;
 16537      b = (b == '\\') ? '/' : b;
 16538      if (a != b)
 16539        return false;
 16540    }
 16541    return true;
 16542  #else
 16543    return memcmp(l, r, len * sizeof(pathchar_t)) == 0;
 16544  #endif
 16545  }
 16546  
/* Derive the data-file (dxb) and lock-file (lck) pathnames from the
 * user-supplied path, auto-creating the env directory when allowed and
 * adjusting MDBX_NOSUBDIR in *flags to match the actual filesystem object.
 * On success ctx->buffer_for_free holds a single allocation backing both
 * ctx->dxb and ctx->lck; the caller must osal_free() it. */
__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx,
                                      const pathchar_t *pathname,
                                      MDBX_env_flags_t *flags,
                                      const mdbx_mode_t mode) {
  memset(ctx, 0, sizeof(*ctx));
  if (unlikely(!pathname || !*pathname))
    return MDBX_EINVAL;

  int rc;
#if defined(_WIN32) || defined(_WIN64)
  const DWORD dwAttrib = GetFileAttributesW(pathname);
  if (dwAttrib == INVALID_FILE_ATTRIBUTES) {
    rc = GetLastError();
    if (rc != MDBX_ENOFILE)
      return rc;
    if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
      /* can't open existing */
      return rc;

    /* auto-create directory if requested */
    if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) {
      rc = GetLastError();
      if (rc != ERROR_ALREADY_EXISTS)
        return rc;
    }
  } else {
    /* ignore passed MDBX_NOSUBDIR flag and set it automatically */
    *flags |= MDBX_NOSUBDIR;
    if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY)
      *flags -= MDBX_NOSUBDIR;
  }
#else
  struct stat st;
  if (stat(pathname, &st)) {
    rc = errno;
    if (rc != MDBX_ENOFILE)
      return rc;
    if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
      /* can't open existing */
      return rc;

    /* auto-create directory if requested */
    const mdbx_mode_t dir_mode =
        (/* inherit read/write permissions for group and others */ mode &
         (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
        /* always add read/write/search for owner */ S_IRWXU |
        ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) |
        ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0);
    if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) {
      rc = errno;
      if (rc != EEXIST)
        return rc;
    }
  } else {
    /* ignore passed MDBX_NOSUBDIR flag and set it automatically */
    *flags |= MDBX_NOSUBDIR;
    if (S_ISDIR(st.st_mode))
      *flags -= MDBX_NOSUBDIR;
  }
#endif

  static const pathchar_t dxb_name[] = MDBX_DATANAME;
  static const pathchar_t lck_name[] = MDBX_LOCKNAME;
  static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX;

#if defined(_WIN32) || defined(_WIN64)
  assert(dxb_name[0] == '\\' && lck_name[0] == '\\');
  const size_t pathname_len = wcslen(pathname);
#else
  assert(dxb_name[0] == '/' && lck_name[0] == '/');
  const size_t pathname_len = strlen(pathname);
#endif
  assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/');
  ctx->ent_len = pathname_len;
  static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1;
  /* if the path already ends with the data-file name, strip it and
   * treat the remainder as the environment directory */
  if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len &&
      path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name,
                 dxb_name_len)) {
    *flags -= MDBX_NOSUBDIR;
    ctx->ent_len -= dxb_name_len;
  }

  /* one allocation sized for both pathnames, suffixes included */
  const size_t bytes_needed =
      sizeof(pathchar_t) * ctx->ent_len * 2 +
      ((*flags & MDBX_NOSUBDIR) ? sizeof(lock_suffix) + sizeof(pathchar_t)
                                : sizeof(lck_name) + sizeof(dxb_name));
  ctx->buffer_for_free = osal_malloc(bytes_needed);
  if (!ctx->buffer_for_free)
    return MDBX_ENOMEM;

  ctx->dxb = ctx->buffer_for_free;
  ctx->lck = ctx->dxb + ctx->ent_len + 1;
  memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1));
  if (*flags & MDBX_NOSUBDIR) {
    /* NOSUBDIR: lck-pathname is the dxb-pathname plus the lock suffix */
    memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix));
  } else {
    ctx->lck += dxb_name_len;
    memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name));
    memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name));
  }
  memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len);

  return MDBX_SUCCESS;
}
 16651  
/* Delete an environment's files, optionally locking them first to ensure
 * (or wait until) the DB is unused. Returns MDBX_SUCCESS when something
 * was removed, MDBX_RESULT_TRUE when nothing existed, else an error. */
__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) {
#if defined(_WIN32) || defined(_WIN64)
  /* Windows: convert the multibyte path and delegate to the wide-char API */
  const wchar_t *pathnameW = nullptr;
  OSAL_MB2WIDE(pathname, pathnameW);
  return mdbx_env_deleteW(pathnameW, mode);
}

__cold int mdbx_env_deleteW(const wchar_t *pathname,
                            MDBX_env_delete_mode_t mode) {
#endif /* Windows */

  switch (mode) {
  default:
    return MDBX_EINVAL;
  case MDBX_ENV_JUST_DELETE:
  case MDBX_ENV_ENSURE_UNUSED:
  case MDBX_ENV_WAIT_FOR_UNUSED:
    break;
  }

  /* a zeroed stub env supplies page sizes & flags to the path/open helpers */
#ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */
  MDBX_env *const dummy_env = alloca(sizeof(MDBX_env));
#else
  MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo;
#endif
  memset(dummy_env, 0, sizeof(*dummy_env));
  dummy_env->me_flags =
      (mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS;
  dummy_env->me_os_psize = (unsigned)osal_syspagesize();
  dummy_env->me_psize = (unsigned)mdbx_default_pagesize();
  dummy_env->me_pathname = (pathchar_t *)pathname;

  MDBX_handle_env_pathname env_pathname;
  STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t));
  int rc = MDBX_RESULT_TRUE,
      err = handle_env_pathname(&env_pathname, pathname,
                                (MDBX_env_flags_t *)&dummy_env->me_flags, 0);
  if (likely(err == MDBX_SUCCESS)) {
    mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE,
                      dxb_handle = INVALID_HANDLE_VALUE;
    if (mode > MDBX_ENV_JUST_DELETE) {
      /* open & lock both files to ensure/wait-for exclusive access;
       * a missing file (MDBX_ENOFILE) is not an error here */
      err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb,
                          &dxb_handle, 0);
      err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err;
      if (err == MDBX_SUCCESS) {
        err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck,
                            &clk_handle, 0);
        err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err;
      }
      if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE)
        err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED);
      if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE)
        err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED);
    }

    /* remove data file; rc records that at least one object was removed */
    if (err == MDBX_SUCCESS) {
      err = osal_removefile(env_pathname.dxb);
      if (err == MDBX_SUCCESS)
        rc = MDBX_SUCCESS;
      else if (err == MDBX_ENOFILE)
        err = MDBX_SUCCESS;
    }

    if (err == MDBX_SUCCESS) {
      err = osal_removefile(env_pathname.lck);
      if (err == MDBX_SUCCESS)
        rc = MDBX_SUCCESS;
      else if (err == MDBX_ENOFILE)
        err = MDBX_SUCCESS;
    }

    /* remove the containing directory unless a NOSUBDIR layout */
    if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) {
      err = osal_removedirectory(pathname);
      if (err == MDBX_SUCCESS)
        rc = MDBX_SUCCESS;
      else if (err == MDBX_ENOFILE)
        err = MDBX_SUCCESS;
    }

    if (dxb_handle != INVALID_HANDLE_VALUE)
      osal_closefile(dxb_handle);
    if (clk_handle != INVALID_HANDLE_VALUE)
      osal_closefile(clk_handle);
  } else if (err == MDBX_ENOFILE)
    err = MDBX_SUCCESS;

  osal_free(env_pathname.buffer_for_free);
  return (err == MDBX_SUCCESS) ? rc : err;
}
 16741  
/* Open an environment: resolve pathnames, open the data file (dxb), the
 * optional dsync descriptor and the lock file (lck), negotiate mode-flags
 * with other processes, map the DB, and pre-allocate the reusable write
 * transaction. On any failure the env is torn down and its flags restored. */
__cold int mdbx_env_open(MDBX_env *env, const char *pathname,
                         MDBX_env_flags_t flags, mdbx_mode_t mode) {
#if defined(_WIN32) || defined(_WIN64)
  /* Windows: convert the multibyte path and delegate to the wide-char API */
  const wchar_t *pathnameW = nullptr;
  OSAL_MB2WIDE(pathname, pathnameW);
  return mdbx_env_openW(env, pathnameW, flags, mode);
}

__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
                          MDBX_env_flags_t flags, mdbx_mode_t mode) {
#endif /* Windows */

  int rc = check_env(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(flags & ~ENV_USABLE_FLAGS))
    return MDBX_EINVAL;

  /* refuse to re-open an environment that is already active/mapped */
  if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE ||
               (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map))
    return MDBX_EPERM;

  /* Pickup previously mdbx_env_set_flags(),
   * but avoid MDBX_UTTERLY_NOSYNC by disjunction */
  const uint32_t saved_me_flags = env->me_flags;
  flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags);

  if (flags & MDBX_RDONLY) {
    /* Silently ignore irrelevant flags when we're only getting read access */
    flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC |
               MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM |
               MDBX_NOMEMINIT | MDBX_ACCEDE);
    mode = 0;
  } else {
#if MDBX_MMAP_INCOHERENT_FILE_WRITE
    /* Temporary `workaround` for OpenBSD kernel's flaw.
     * See todo4recovery://erased_by_github/libmdbx/issues/67 */
    if ((flags & MDBX_WRITEMAP) == 0) {
      if (flags & MDBX_ACCEDE)
        flags |= MDBX_WRITEMAP;
      else {
        debug_log(MDBX_LOG_ERROR, __func__, __LINE__,
                  "System (i.e. OpenBSD) requires MDBX_WRITEMAP because "
                  "of an internal flaw(s) in a file/buffer/page cache.\n");
        return 42 /* ENOPROTOOPT */;
      }
    }
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
  }

  /* derive dxb/lck pathnames (may auto-create the env directory) */
  MDBX_handle_env_pathname env_pathname;
  rc = handle_env_pathname(&env_pathname, pathname, &flags, mode);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;

  env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE;
  env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t));
  env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx));
  env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0]));
  env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0]));
  if (!(env->me_dbxs && env->me_pathname && env->me_dbflags &&
        env->me_dbiseqs)) {
    rc = MDBX_ENOMEM;
    goto bailout;
  }
  memcpy(env->me_pathname, env_pathname.dxb,
         env_pathname.ent_len * sizeof(pathchar_t));
  env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */
  env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast;

  rc = osal_openfile((flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ
                                           : MDBX_OPEN_DXB_LAZY,
                     env, env_pathname.dxb, &env->me_lazy_fd, mode);
  if (rc != MDBX_SUCCESS)
    goto bailout;

  /* a second descriptor opened with O_DSYNC-like semantics, used for
   * durable writes unless nosync modes make it pointless */
  eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
  if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
    rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
                       &env->me_dsync_fd, 0);
    ENSURE(env,
           (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE));
  }

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42);
  if (env->me_sysv_ipc.key == -1) {
    rc = errno;
    goto bailout;
  }
#endif /* MDBX_LOCKING */

#if !(defined(_WIN32) || defined(_WIN64))
  if (mode == 0) {
    /* pickup mode for lck-file */
    struct stat st;
    if (fstat(env->me_lazy_fd, &st)) {
      rc = errno;
      goto bailout;
    }
    mode = st.st_mode;
  }
  mode = (/* inherit read permissions for group and others */ mode &
          (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
         /* always add read/write for owner */ S_IRUSR | S_IWUSR |
         ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) |
         ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0);
#endif /* !Windows */
  /* lck_rc == MDBX_RESULT_TRUE means we are the first/exclusive opener */
  const int lck_rc = setup_lck(env, env_pathname.lck, mode);
  if (MDBX_IS_ERROR(lck_rc)) {
    rc = lck_rc;
    goto bailout;
  }

  /* Set the position in files outside of the data to avoid corruption
   * due to erroneous use of file descriptors in the application code. */
  osal_fseek(env->me_lfd, UINT64_C(1) << 63);
  osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63);
  if (env->me_dsync_fd != INVALID_HANDLE_VALUE)
    osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63);

  const MDBX_env_flags_t rigorous_flags =
      MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC;
  const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC |
                                      MDBX_LIFORECLAIM |
                                      MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD;

  /* negotiate mode-flags with other processes already using the DB */
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
    while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) {
      if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY,
                       env->me_flags & mode_flags)) {
        /* The case:
         *  - let's assume that for some reason the DB file is smaller
         *    than it should be according to the geometry,
         *    but not smaller than the last page used;
         *  - the first process that opens the database (lck_rc == RESULT_TRUE)
         *    does this in readonly mode and therefore cannot bring
         *    the file size back to normal;
         *  - some next process (lck_rc != RESULT_TRUE) opens the DB in
         *    read-write mode and now is here.
         *
         * FIXME: Should we re-check and set the size of DB-file right here? */
        break;
      }
      atomic_yield();
    }

    if (env->me_flags & MDBX_ACCEDE) {
      /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */
      const unsigned diff =
          (lck->mti_envmode.weak ^ env->me_flags) & mode_flags;
      NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags,
             env->me_flags ^ diff);
      env->me_flags ^= diff;
    }

    if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) {
      ERROR("%s", "current mode/flags incompatible with requested");
      rc = MDBX_INCOMPATIBLE;
      goto bailout;
    }
  }

  const int dxb_rc = setup_dxb(env, lck_rc, mode);
  if (MDBX_IS_ERROR(dxb_rc)) {
    rc = dxb_rc;
    goto bailout;
  }

  if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) &&
      (lck_rc != /* exclusive */ MDBX_RESULT_TRUE ||
       (flags & MDBX_EXCLUSIVE) == 0)) {
    ERROR("%s", "recovery requires exclusive mode");
    rc = MDBX_BUSY;
    goto bailout;
  }

  DEBUG("opened dbenv %p", (void *)env);
  if (lck) {
    if (lck_rc == MDBX_RESULT_TRUE) {
      /* first/exclusive opener: publish the mode and downgrade the lock */
      lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY);
      lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env);
      lck->mti_reader_check_timestamp.weak = osal_monotime();
      rc = osal_lck_downgrade(env);
      DEBUG("lck-downgrade-%s: rc %i",
            (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
      if (rc != MDBX_SUCCESS)
        goto bailout;
    } else {
      rc = cleanup_dead_readers(env, false, NULL);
      if (MDBX_IS_ERROR(rc))
        goto bailout;
    }

    if ((env->me_flags & MDBX_NOTLS) == 0) {
      rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0],
                      &lck->mti_readers[env->me_maxreaders]);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      env->me_flags |= MDBX_ENV_TXKEY;
    }
  } else {
    env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY);
    env->me_lck->mti_meta_sync_txnid.weak =
        (uint32_t)recent_committed_txnid(env);
    env->me_lck->mti_reader_check_timestamp.weak = osal_monotime();
  }

  if ((flags & MDBX_RDONLY) == 0) {
    /* pre-allocate the reusable write transaction together with its
     * per-DBI tables in a single calloc'ed chunk */
    const size_t tsize = sizeof(MDBX_txn),
                 size = tsize + env->me_maxdbs *
                                    (sizeof(MDBX_db) + sizeof(MDBX_cursor *) +
                                     sizeof(MDBX_atomic_uint32_t) + 1);
    rc = alloc_page_buf(env);
    if (rc == MDBX_SUCCESS) {
      memset(env->me_pbuf, -1, env->me_psize * 2);
      MDBX_txn *txn = osal_calloc(1, size);
      if (txn) {
        txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
        txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
        txn->mt_dbiseqs =
            (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs);
        txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs);
        txn->mt_env = env;
        txn->mt_dbxs = env->me_dbxs;
        txn->mt_flags = MDBX_TXN_FINISHED;
        env->me_txn0 = txn;
        txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL);
        txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL);
        if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist))
          rc = MDBX_ENOMEM;
      } else
        rc = MDBX_ENOMEM;
    }
  }

#if MDBX_DEBUG
  if (rc == MDBX_SUCCESS) {
    const meta_troika_t troika = meta_tap(env);
    const meta_ptr_t head = meta_recent(env, &troika);
    const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI];

    DEBUG("opened database version %u, pagesize %u",
          (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version),
          env->me_psize);
    DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN,
          data_page(head.ptr_c)->mp_pgno, head.txnid);
    DEBUG("depth: %u", db->md_depth);
    DEBUG("entries: %" PRIu64, db->md_entries);
    DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages);
    DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages);
    DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages);
    DEBUG("root: %" PRIaPGNO, db->md_root);
    DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid);
  }
#endif

bailout:
  if (rc != MDBX_SUCCESS) {
    /* failure: tear everything down and restore the pre-open flags,
     * marking the env as broken if env_close() itself failed */
    rc = env_close(env) ? MDBX_PANIC : rc;
    env->me_flags =
        saved_me_flags | ((rc != MDBX_PANIC) ? 0 : MDBX_FATAL_ERROR);
  } else {
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    txn_valgrind(env, nullptr);
#endif
  }
  osal_free(env_pathname.buffer_for_free);
  return rc;
}
 17014  
/* Destroy resources from mdbx_env_open(), clear our readers & DBIs.
 * Idempotent for a never-activated env; otherwise unmaps, closes every
 * descriptor and frees every allocation made during open. */
__cold static int env_close(MDBX_env *env) {
  const unsigned flags = env->me_flags;
  if (!(flags & MDBX_ENV_ACTIVE)) {
    /* never opened (or already closed): nothing to release */
    ENSURE(env, env->me_lcklist_next == nullptr);
    return MDBX_SUCCESS;
  }

  env->me_flags &= ~ENV_INTERNAL_FLAGS;
  env->me_lck = nullptr;
  if (flags & MDBX_ENV_TXKEY) {
    rthc_remove(env->me_txkey);
    env->me_txkey = (osal_thread_key_t)0;
  }

  lcklist_lock();
  const int rc = lcklist_detach_locked(env);
  lcklist_unlock();

  if (env->me_map) {
    osal_munmap(&env->me_dxb_mmap);
#ifdef MDBX_USE_VALGRIND
    VALGRIND_DISCARD(env->me_valgrind_handle);
    env->me_valgrind_handle = -1;
#endif
  }

  if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->me_dsync_fd);
    env->me_dsync_fd = INVALID_HANDLE_VALUE;
  }

  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->me_lazy_fd);
    env->me_lazy_fd = INVALID_HANDLE_VALUE;
  }

  if (env->me_lck_mmap.lck)
    osal_munmap(&env->me_lck_mmap);

  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    (void)osal_closefile(env->me_lfd);
    env->me_lfd = INVALID_HANDLE_VALUE;
  }

  if (env->me_dbxs) {
    /* free named sub-database entries (slots below CORE_DBS have no name) */
    for (unsigned i = env->me_numdbs; --i >= CORE_DBS;)
      osal_free(env->me_dbxs[i].md_name.iov_base);
    osal_free(env->me_dbxs);
    env->me_dbxs = nullptr;
  }
  if (env->me_pbuf) {
    osal_memalign_free(env->me_pbuf);
    env->me_pbuf = nullptr;
  }
  if (env->me_dbiseqs) {
    osal_free(env->me_dbiseqs);
    env->me_dbiseqs = nullptr;
  }
  if (env->me_dbflags) {
    osal_free(env->me_dbflags);
    env->me_dbflags = nullptr;
  }
  if (env->me_pathname) {
    osal_free(env->me_pathname);
    env->me_pathname = nullptr;
  }
  if (env->me_txn0) {
    /* release the pre-allocated write transaction and its page lists */
    dpl_free(env->me_txn0);
    txl_free(env->me_txn0->tw.lifo_reclaimed);
    pnl_free(env->me_txn0->tw.retired_pages);
    pnl_free(env->me_txn0->tw.spill_pages);
    pnl_free(env->me_txn0->tw.reclaimed_pglist);
    osal_free(env->me_txn0);
    env->me_txn0 = nullptr;
  }
  env->me_stuck_meta = -1;
  return rc;
}
 17094  
/* Close the environment: optionally sync pending data first, then tear
 * down all resources, destroy mutexes and free the env object itself. */
__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
  MDBX_page *dp;
  int rc = MDBX_SUCCESS;

  if (unlikely(!env))
    return MDBX_EINVAL;

  if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE))
    return MDBX_EBADSIGN;

#if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64))
  /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows
   * platforms (i.e. where fork() is available).
   * This is required to legitimize a call after fork()
   * from a child process, that should be allowed to free resources. */
  if (unlikely(env->me_pid != osal_getpid()))
    env->me_flags |= MDBX_FATAL_ERROR;
#endif /* MDBX_ENV_CHECKPID */

  if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 &&
      env->me_txn0) {
    /* refuse to close while another thread owns the write transaction */
    if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self())
      return MDBX_BUSY;
  } else
    dont_sync = true;

  /* invalidate the signature first so concurrent calls bail out */
  if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, 0))
    return MDBX_EBADSIGN;

  if (!dont_sync) {
#if defined(_WIN32) || defined(_WIN64)
    /* On windows, without blocking is impossible to determine whether another
     * process is running a writing transaction or not.
     * Because in the "owner died" condition kernel don't release
     * file lock immediately. */
    rc = env_sync(env, true, false);
    rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
#else
    struct stat st;
    if (unlikely(fstat(env->me_lazy_fd, &st)))
      rc = errno;
    else if (st.st_nlink > 0 /* don't sync deleted files */) {
      rc = env_sync(env, true, true);
      rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY ||
            rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE)
               ? MDBX_SUCCESS
               : rc;
    }
#endif
  }

  eASSERT(env, env->me_signature.weak == 0);
  rc = env_close(env) ? MDBX_PANIC : rc;
  ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS);
#if defined(_WIN32) || defined(_WIN64)
  /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */
  DeleteCriticalSection(&env->me_windowsbug_lock);
#else
  ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS);
#endif /* Windows */

#if MDBX_LOCKING > MDBX_LOCKING_SYSV
  MDBX_lockinfo *const stub = lckless_stub(env);
  ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0);
#endif /* MDBX_LOCKING */

  /* drain the reserve list of dirty-page buffers */
  while ((dp = env->me_dp_reserve) != NULL) {
    MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
    VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
    env->me_dp_reserve = dp->mp_next;
    osal_free(dp);
  }
  VALGRIND_DESTROY_MEMPOOL(env);
  ENSURE(env, env->me_lcklist_next == nullptr);
  env->me_pid = 0;
  osal_free(env);

  return rc;
}
 17174  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI entry point: forwards to the inline implementation. */
__cold int mdbx_env_close(MDBX_env *env) {
  return __inline_mdbx_env_close(env);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 17180  
 17181  /* Compare two items pointing at aligned unsigned int's. */
 17182  __hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) {
 17183    eASSERT(NULL, a->iov_len == b->iov_len);
 17184    switch (a->iov_len) {
 17185    case 4:
 17186      return CMP2INT(unaligned_peek_u32(4, a->iov_base),
 17187                     unaligned_peek_u32(4, b->iov_base));
 17188    case 8:
 17189      return CMP2INT(unaligned_peek_u64(4, a->iov_base),
 17190                     unaligned_peek_u64(4, b->iov_base));
 17191    default:
 17192      mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
 17193                       __LINE__);
 17194      return 0;
 17195    }
 17196  }
 17197  
 17198  /* Compare two items pointing at 2-byte aligned unsigned int's. */
 17199  __hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) {
 17200    eASSERT(NULL, a->iov_len == b->iov_len);
 17201    switch (a->iov_len) {
 17202    case 4:
 17203      return CMP2INT(unaligned_peek_u32(2, a->iov_base),
 17204                     unaligned_peek_u32(2, b->iov_base));
 17205    case 8:
 17206      return CMP2INT(unaligned_peek_u64(2, a->iov_base),
 17207                     unaligned_peek_u64(2, b->iov_base));
 17208    default:
 17209      mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
 17210                       __LINE__);
 17211      return 0;
 17212    }
 17213  }
 17214  
 17215  /* Compare two items pointing at unsigned values with unknown alignment.
 17216   *
 17217   * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */
 17218  __hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) {
 17219    eASSERT(NULL, a->iov_len == b->iov_len);
 17220    switch (a->iov_len) {
 17221    case 4:
 17222      return CMP2INT(unaligned_peek_u32(1, a->iov_base),
 17223                     unaligned_peek_u32(1, b->iov_base));
 17224    case 8:
 17225      return CMP2INT(unaligned_peek_u64(1, a->iov_base),
 17226                     unaligned_peek_u64(1, b->iov_base));
 17227    default:
 17228      mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
 17229                       __LINE__);
 17230      return 0;
 17231    }
 17232  }
 17233  
 17234  /* Compare two items lexically */
 17235  __hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) {
 17236    if (a->iov_len == b->iov_len)
 17237      return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0;
 17238  
 17239    const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1;
 17240    const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
 17241    int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0;
 17242    return likely(diff_data) ? diff_data : diff_len;
 17243  }
 17244  
/* Compare two items in reverse byte order: bytes are compared from the
 * tail of each item toward the head; ties over the shared suffix are
 * broken by length. */
__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) {
  const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
  if (likely(shortest)) {
    /* start just past the last byte of each item and walk backwards */
    const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len;
    const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len;
    const uint8_t *const end = pa - shortest;
    do {
      int diff = *--pa - *--pb;
      if (likely(diff))
        return diff;
    } while (pa != end);
  }
  /* shared suffix is equal (or both empty): shorter item sorts first */
  return CMP2INT(a->iov_len, b->iov_len);
}
 17260  
 17261  /* Fast non-lexically comparator */
 17262  __hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) {
 17263    int diff = CMP2INT(a->iov_len, b->iov_len);
 17264    return likely(diff) || a->iov_len == 0
 17265               ? diff
 17266               : memcmp(a->iov_base, b->iov_base, a->iov_len);
 17267  }
 17268  
/* Decide whether two items that compared equal under `cmp` can be trusted
 * to be truly equal: either `cmp` is one of the known-good built-in
 * comparators, or a full byte-to-byte comparison confirms the match. */
static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a,
                         const MDBX_val *b) {
  /* checking for the use of a known good comparator
   * or/otherwise for a full byte-to-byte match */
  return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse ||
         cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0;
}
 17276  
/* Search for key within a page, using binary search.
 * Returns the smallest entry larger or equal to the key.
 * Updates the cursor index with the index of the found entry.
 * If no entry larger or equal to the key is found, returns NULL. */
__hot static struct node_result node_search(MDBX_cursor *mc,
                                            const MDBX_val *key) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  const int nkeys = page_numkeys(mp);
  DKBUF_DEBUG;

  DEBUG("searching %u keys in %s %spage %" PRIaPGNO, nkeys,
        IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
        mp->mp_pgno);

  struct node_result ret;
  ret.exact = false;
  STATIC_ASSERT(P_BRANCH == 1);
  /* P_BRANCH == 1 lets the flag bit serve directly as the lower bound:
   * the search starts at index 1 on branch pages, index 0 on leaves. */
  int low = mp->mp_flags & P_BRANCH;
  int high = nkeys - 1;
  if (unlikely(high < low)) {
    /* no searchable keys on this page */
    mc->mc_ki[mc->mc_top] = 0;
    ret.node = NULL;
    return ret;
  }

  int i;
  MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp;
  MDBX_val nodekey;
  if (unlikely(IS_LEAF2(mp))) {
    /* LEAF2 page: fixed-size keys are stored directly (no node headers),
     * so keys are addressed by index arithmetic via page_leaf2key(). */
    cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize);
    nodekey.iov_len = mp->mp_leaf2_ksize;
    do {
      i = (low + high) >> 1;
      nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len);
      /* the key must not extend past the end of the page */
      cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >=
                      (char *)nodekey.iov_base + nodekey.iov_len);
      int cr = cmp(key, &nodekey);
      DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
      if (cr > 0)
        /* Found entry is less than the key. */
        /* Skip to get the smallest entry larger than key. */
        low = ++i;
      else if (cr < 0)
        high = i - 1;
      else {
        ret.exact = true;
        break;
      }
    } while (likely(low <= high));

    /* store the key index */
    mc->mc_ki[mc->mc_top] = (indx_t)i;
    ret.node = (i < nkeys)
                   ? /* fake for LEAF2 */ (MDBX_node *)(intptr_t)-1
                   : /* There is no entry larger or equal to the key. */ NULL;
    return ret;
  }

  if (IS_BRANCH(mp) && cmp == cmp_int_align2)
    /* Branch pages have no data, so if using integer keys,
     * alignment is guaranteed. Use faster cmp_int_align4(). */
    cmp = cmp_int_align4;

  MDBX_node *node;
  do {
    i = (low + high) >> 1;
    node = page_node(mp, i);
    nodekey.iov_len = node_ks(node);
    nodekey.iov_base = node_key(node);
    /* the key must not extend past the end of the page */
    cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >=
                    (char *)nodekey.iov_base + nodekey.iov_len);
    int cr = cmp(key, &nodekey);
    if (IS_LEAF(mp))
      DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr);
    else
      DEBUG("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i,
            DKEY_DEBUG(&nodekey), node_pgno(node), cr);
    if (cr > 0)
      /* Found entry is less than the key. */
      /* Skip to get the smallest entry larger than key. */
      low = ++i;
    else if (cr < 0)
      high = i - 1;
    else {
      ret.exact = true;
      break;
    }
  } while (likely(low <= high));

  /* store the key index */
  mc->mc_ki[mc->mc_top] = (indx_t)i;
  ret.node = (i < nkeys)
                 ? page_node(mp, i)
                 : /* There is no entry larger or equal to the key. */ NULL;
  return ret;
}
 17373  
/* Pop a page off the top of the cursor's stack.
 * A no-op on an already-empty stack. */
static __inline void cursor_pop(MDBX_cursor *mc) {
  if (likely(mc->mc_snum)) {
    DEBUG("popped page %" PRIaPGNO " off db %d cursor %p",
          mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc);
    if (likely(--mc->mc_snum)) {
      mc->mc_top--;
    } else {
      /* stack became empty: the cursor no longer points at anything */
      mc->mc_flags &= ~C_INITIALIZED;
    }
  }
}
 17386  
/* Push a page onto the top of the cursor's stack.
 * Set MDBX_TXN_ERROR on failure.
 * Returns MDBX_CURSOR_FULL when the fixed-depth stack is exhausted,
 * otherwise MDBX_SUCCESS with the cursor positioned at entry 0 of mp. */
static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) {
  DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc),
        (void *)mc);

  if (unlikely(mc->mc_snum >= CURSOR_STACK)) {
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return MDBX_CURSOR_FULL;
  }

  mc->mc_top = mc->mc_snum++;
  mc->mc_pg[mc->mc_top] = mp;
  mc->mc_ki[mc->mc_top] = 0;
  return MDBX_SUCCESS;
}
 17403  
 17404  __hot static __always_inline int page_get_checker_lite(const uint16_t ILL,
 17405                                                         const MDBX_page *page,
 17406                                                         MDBX_txn *const txn,
 17407                                                         const txnid_t front) {
 17408    if (unlikely(page->mp_flags & ILL)) {
 17409      if (ILL == P_ILL_BITS || (page->mp_flags & P_ILL_BITS))
 17410        return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags);
 17411      else if (ILL & P_OVERFLOW) {
 17412        assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0);
 17413        assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2));
 17414        return bad_page(page, "unexpected %s instead of %s (%u)\n",
 17415                        "large/overlow", "branch/leaf/leaf2", page->mp_flags);
 17416      } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) {
 17417        assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2));
 17418        assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2));
 17419        return bad_page(page, "unexpected %s instead of %s (%u)\n",
 17420                        "branch/leaf/leaf2", "large/overlow", page->mp_flags);
 17421      } else {
 17422        assert(false);
 17423      }
 17424    }
 17425  
 17426    if (unlikely(page->mp_txnid > front) &&
 17427        unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid))
 17428      return bad_page(
 17429          page,
 17430          "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n",
 17431          page->mp_txnid,
 17432          (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn"
 17433                                                             : "parent-page",
 17434          front);
 17435  
 17436    if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) &&
 17437        (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) {
 17438      if (unlikely(page->mp_upper < page->mp_lower ||
 17439                   ((page->mp_lower | page->mp_upper) & 1) ||
 17440                   PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize))
 17441        return bad_page(page, "invalid page' lower(%u)/upper(%u) with limit %u\n",
 17442                        page->mp_lower, page->mp_upper, page_space(txn->mt_env));
 17443  
 17444    } else if ((ILL & P_OVERFLOW) == 0) {
 17445      const pgno_t npages = page->mp_pages;
 17446      if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2))
 17447        return bad_page(page, "invalid n-pages (%u) for large-page\n", npages);
 17448      if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno))
 17449        return bad_page(
 17450            page,
 17451            "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
 17452            page->mp_pgno + npages, txn->mt_next_pgno);
 17453    } else {
 17454      assert(false);
 17455    }
 17456    return MDBX_SUCCESS;
 17457  }
 17458  
 17459  __cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL,
 17460                                                       MDBX_page *page,
 17461                                                       MDBX_cursor *const mc,
 17462                                                       const txnid_t front) {
 17463    pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)};
 17464    if (likely(r.err == MDBX_SUCCESS))
 17465      r.err = page_check(mc, page);
 17466    if (unlikely(r.err != MDBX_SUCCESS))
 17467      mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
 17468    return r;
 17469  }
 17470  
/* Fetch the page with number `pgno` for this cursor's txn and validate it.
 * For write-txns without MDBX_WRITEMAP, the dirty copy from this txn (or a
 * parent) takes precedence over the memory-mapped one. `ILL` is the bitmask
 * of page-type flags that are invalid for the caller; `front` bounds the
 * acceptable page txnid. On any failure the txn is marked MDBX_TXN_ERROR. */
__hot static __always_inline pgr_t page_get_inline(const uint16_t ILL,
                                                   MDBX_cursor *const mc,
                                                   const pgno_t pgno,
                                                   const txnid_t front) {
  MDBX_txn *const txn = mc->mc_txn;
  tASSERT(txn, front <= txn->mt_front);

  pgr_t r;
  if (unlikely(pgno >= txn->mt_next_pgno)) {
    ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno);
    r.page = nullptr;
    r.err = MDBX_PAGE_NOTFOUND;
  bailout:
    txn->mt_flags |= MDBX_TXN_ERROR;
    return r;
  }

  eASSERT(txn->mt_env,
          ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0);
  /* default: the page as seen through the memory map */
  r.page = pgno2page(txn->mt_env, pgno);
  if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) {
    /* walk this txn and its parents looking for a dirty copy */
    const MDBX_txn *spiller = txn;
    do {
      /* Spilled pages were dirtied in this txn and flushed
       * because the dirty list got full. Bring this page
       * back in from the map (but don't unspill it here,
       * leave that unless page_touch happens again). */
      if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) &&
          search_spilled(spiller, pgno))
        break;

      const unsigned i = dpl_search(spiller, pgno);
      tASSERT(txn, (int)i > 0);
      if (spiller->tw.dirtylist->items[i].pgno == pgno) {
        /* found a dirty copy: refresh its LRU stamp and use it */
        spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++;
        r.page = spiller->tw.dirtylist->items[i].ptr;
        break;
      }

      spiller = spiller->mt_parent;
    } while (spiller);
  }

  if (unlikely(r.page->mp_pgno != pgno)) {
    r.err = bad_page(
        r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n",
        r.page->mp_pgno, pgno);
    goto bailout;
  }

  if (unlikely(mc->mc_checking & CC_PAGECHECK))
    return page_get_checker_full(ILL, r.page, mc, front);

#if MDBX_DISABLE_VALIDATION
  r.err = MDBX_SUCCESS;
#else
  r.err = page_get_checker_lite(ILL, r.page, txn, front);
  if (unlikely(r.err != MDBX_SUCCESS))
    goto bailout;
#endif /* MDBX_DISABLE_VALIDATION */
  return r;
}
 17533  
/* Finish mdbx_page_search() / mdbx_page_search_lowest().
 * The cursor is at the root page, set up the rest of it.
 * Descends from the current (root) page to a leaf, choosing the child by
 * `key` (or the first/last child for MDBX_PS_FIRST/MDBX_PS_LAST), touching
 * pages along the way when MDBX_PS_MODIFY is set. */
__hot __noinline static int page_search_root(MDBX_cursor *mc,
                                             const MDBX_val *key, int flags) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  int rc;
  DKBUF_DEBUG;

  while (IS_BRANCH(mp)) {
    MDBX_node *node;
    int i;

    DEBUG("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno,
          page_numkeys(mp));
    /* Don't assert on branch pages in the GC. We can get here
     * while in the process of rebalancing a GC branch page; we must
     * let that proceed. ITS#8336 */
    cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1);
    DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0)));

    if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) {
      i = 0;
      if (flags & MDBX_PS_LAST) {
        i = page_numkeys(mp) - 1;
        /* if already init'd, see if we're already in right place */
        if (mc->mc_flags & C_INITIALIZED) {
          if (mc->mc_ki[mc->mc_top] == i) {
            /* reuse the already-loaded child page below us */
            mc->mc_top = mc->mc_snum++;
            mp = mc->mc_pg[mc->mc_top];
            goto ready;
          }
        }
      }
    } else {
      /* pick the child whose subtree may contain `key` */
      const struct node_result nsr = node_search(mc, key);
      if (likely(nsr.node))
        i = mc->mc_ki[mc->mc_top] + nsr.exact - 1;
      else
        i = page_numkeys(mp) - 1;
      DEBUG("following index %u for key [%s]", i, DKEY_DEBUG(key));
    }

    cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp));
    node = page_node(mp, i);

    rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    mc->mc_ki[mc->mc_top] = (indx_t)i;
    if (unlikely(rc = cursor_push(mc, mp)))
      return rc;

  ready:
    if (flags & MDBX_PS_MODIFY) {
      if (unlikely((rc = page_touch(mc)) != 0))
        return rc;
      mp = mc->mc_pg[mc->mc_top];
    }
  }

  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    return MDBX_CORRUPTED;
  }

  DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno,
        DKEY_DEBUG(key));
  mc->mc_flags |= C_INITIALIZED;
  mc->mc_flags &= ~C_EOF;

  return MDBX_SUCCESS;
}
 17608  
/* Derive per-DB runtime parameters (comparators and key/value length
 * limits) from the persistent flags and the environment's pagesize.
 * Returns MDBX_CORRUPTED if a stored fixed value-size contradicts the
 * computed limits, otherwise MDBX_SUCCESS. */
static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db,
                     const unsigned pagesize) {
  /* keep user-supplied comparators; only fill in defaults */
  if (unlikely(!dbx->md_cmp)) {
    dbx->md_cmp = get_default_keycmp(db->md_flags);
    dbx->md_dcmp = get_default_datacmp(db->md_flags);
  }

  dbx->md_klen_min =
      (db->md_flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0;
  dbx->md_klen_max = keysize_max(pagesize, db->md_flags);
  assert(dbx->md_klen_max != (unsigned)-1);

  dbx->md_vlen_min = (db->md_flags & MDBX_INTEGERDUP)
                         ? 4 /* sizeof(uint32_t) */
                         : ((db->md_flags & MDBX_DUPFIXED) ? 1 : 0);
  dbx->md_vlen_max = valsize_max(pagesize, db->md_flags);
  assert(dbx->md_vlen_max != (unsigned)-1);

  if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) {
    /* DUPFIXED/INTEGERDUP: all values share the single stored size */
    if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min ||
                                             db->md_xsize > dbx->md_vlen_max)) {
      ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize,
            dbx->md_vlen_min, dbx->md_vlen_max);
      return MDBX_CORRUPTED;
    }
    dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize;
  }
  return MDBX_SUCCESS;
}
 17638  
/* Refresh a stale named-subDB record: look the subDB's name up in the
 * main DB, validate the stored MDBX_db record against the txn's cached
 * flags, copy it into txn->mt_dbs[dbi], and clear DBI_STALE.
 * Returns MDBX_BAD_DBI / MDBX_INCOMPATIBLE / MDBX_CORRUPTED on mismatch. */
static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor_couple couple;
  if (unlikely(dbi_changed(txn, dbi))) {
    NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid);
    return MDBX_BAD_DBI;
  }
  int rc = cursor_init(&couple.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* find the leaf page that should hold the subDB's name record */
  MDBX_dbx *const dbx = &txn->mt_dbxs[dbi];
  rc = page_search(&couple.outer, &dbx->md_name, 0);
  if (unlikely(rc != MDBX_SUCCESS)) {
  notfound:
    NOTICE("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN
           " (err %d)",
           dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base,
           txn->mt_txnid, rc);
    return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc;
  }

  MDBX_val data;
  struct node_result nsr = node_search(&couple.outer, &dbx->md_name);
  if (unlikely(!nsr.exact)) {
    rc = MDBX_NOTFOUND;
    goto notfound;
  }
  /* the record must be a named subDB, not a plain/dup value */
  if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) {
    NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
           dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base,
           txn->mt_txnid, "wrong flags");
    return MDBX_INCOMPATIBLE; /* not a named DB */
  }

  rc = node_read(&couple.outer, nsr.node, &data,
                 couple.outer.mc_pg[couple.outer.mc_top]);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(data.iov_len != sizeof(MDBX_db))) {
    NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)",
           dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base,
           txn->mt_txnid, "wrong rec-size");
    return MDBX_INCOMPATIBLE; /* not a named DB */
  }

  uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags);
  /* The txn may not know this DBI, or another process may
   * have dropped and recreated the DB with other flags. */
  MDBX_db *const db = &txn->mt_dbs[dbi];
  if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) {
    NOTICE("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN
           " with different flags (present 0x%X != wanna 0x%X)",
           dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base,
           txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags);
    return MDBX_INCOMPATIBLE;
  }

  memcpy(db, data.iov_base, sizeof(MDBX_db));
#if !MDBX_DISABLE_VALIDATION
  /* the record must not claim to be newer than the page holding it */
  const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid;
  tASSERT(txn, txn->mt_front >= pp_txnid);
  if (unlikely(db->md_mod_txnid > pp_txnid)) {
    ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")",
          db->md_mod_txnid, pp_txnid);
    return MDBX_CORRUPTED;
  }
#endif /* !MDBX_DISABLE_VALIDATION */
  rc = setup_dbx(dbx, db, txn->mt_env->me_psize);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  txn->mt_dbistate[dbi] &= ~DBI_STALE;
  return MDBX_SUCCESS;
}
 17714  
/* Search for the lowest key under the current branch page.
 * This just bypasses a numkeys check in the current page
 * before calling mdbx_page_search_root(), because the callers
 * are all in situations where the current page is known to
 * be underfilled. */
__hot static int page_search_lowest(MDBX_cursor *mc) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  cASSERT(mc, IS_BRANCH(mp));
  /* descend through the leftmost child */
  MDBX_node *node = page_node(mp, 0);

  int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  mc->mc_ki[mc->mc_top] = 0;
  if (unlikely(rc = cursor_push(mc, mp)))
    return rc;
  return page_search_root(mc, NULL, MDBX_PS_FIRST);
}
 17734  
/* Search for the page a given key should be in.
 * Push it and its parent pages on the cursor stack.
 *
 * [in,out] mc  the cursor for this operation.
 * [in] key     the key to search for, or NULL for first/last page.
 * [in] flags   If MDBX_PS_MODIFY is set, visited pages in the DB
 *              are touched (updated with new page numbers).
 *              If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last
 * leaf.
 *              This is used by mdbx_cursor_first() and mdbx_cursor_last().
 *              If MDBX_PS_ROOTONLY set, just fetch root node, no further
 *              lookups.
 *
 * Returns 0 on success, non-zero on failure. */
__hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) {
  int rc;
  pgno_t root;

  /* Make sure the txn is still viable, then find the root from
   * the txn's db table and set it as the root of the cursor's stack. */
  if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) {
    DEBUG("%s", "transaction has failed, must abort");
    return MDBX_BAD_TXN;
  }

  /* Make sure we're using an up-to-date root */
  if (unlikely(*mc->mc_dbistate & DBI_STALE)) {
    rc = fetch_sdb(mc->mc_txn, mc->mc_dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  root = mc->mc_db->md_root;

  if (unlikely(root == P_INVALID)) { /* Tree is empty. */
    DEBUG("%s", "tree is empty");
    return MDBX_NOTFOUND;
  }

  cASSERT(mc, root >= NUM_METAS);
  if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) {
    /* determine the txnid bound to validate the root page against */
    txnid_t pp_txnid = mc->mc_db->md_mod_txnid;
    pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid
                   ? pp_txnid
                   : mc->mc_txn->mt_txnid;
    if ((mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
      MDBX_txn *scan = mc->mc_txn;
      do
        if ((scan->mt_flags & MDBX_TXN_DIRTY) &&
            (mc->mc_dbi == MAIN_DBI ||
             (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) {
          /* After committing nested transactions, mod_txnid may
           * exceed front */
          pp_txnid = scan->mt_front;
          break;
        }
      while (unlikely((scan = scan->mt_parent) != nullptr));
    }
    if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0))
      return rc;
  }

  mc->mc_snum = 1;
  mc->mc_top = 0;

  DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root,
        mc->mc_pg[0]->mp_flags);

  if (flags & MDBX_PS_MODIFY) {
    if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc)))
      return rc;
    if (unlikely(rc = page_touch(mc)))
      return rc;
  }

  if (flags & MDBX_PS_ROOTONLY)
    return MDBX_SUCCESS;

  return page_search_root(mc, key, flags);
}
 17813  
/* Read large/overflow node data: resolve the node's large-page reference
 * and point `data->iov_base` at the page's payload. Also validates, when
 * enabled, that the node really required a large page and that the page
 * spans the expected number of pages for `data->iov_len` bytes. */
static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node,
                                        MDBX_val *data, const MDBX_page *mp) {
  /* precondition: caller filled data->iov_len from node_ds() */
  cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node));

  pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid);
  if (unlikely((lp.err != MDBX_SUCCESS))) {
    DEBUG("read large/overflow page %" PRIaPGNO " failed",
          node_largedata_pgno(node));
    return lp.err;
  }

  cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
  data->iov_base = page_data(lp.page);
  if (!MDBX_DISABLE_VALIDATION) {
    const MDBX_env *env = mc->mc_txn->mt_env;
    const size_t dsize = data->iov_len;
    /* data small enough to be stored inline should not be in a big-node */
    if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax))
      poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
    const unsigned npages = number_of_ovpages(env, dsize);
    if (unlikely(lp.page->mp_pages != npages)) {
      /* a short page is fatal (data would be truncated); an oversized
       * one is only wasteful, so downgrade to a warning */
      if (lp.page->mp_pages < npages)
        return bad_page(lp.page,
                        "too less n-pages %u for bigdata-node (%zu bytes)",
                        lp.page->mp_pages, dsize);
      else
        poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)",
                  lp.page->mp_pages, dsize);
    }
  }
  return MDBX_SUCCESS;
}
 17846  
 17847  /* Return the data associated with a given node. */
 17848  static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node,
 17849                                       MDBX_val *data, const MDBX_page *mp) {
 17850    data->iov_len = node_ds(node);
 17851    data->iov_base = node_data(node);
 17852    if (likely(node_flags(node) != F_BIGDATA))
 17853      return MDBX_SUCCESS;
 17854    return node_read_bigdata(mc, node, data, mp);
 17855  }
 17856  
/* Public API: exact-match lookup of `key` in `dbi`; on success `data`
 * points into the database (valid until the txn ends or is written). */
int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) {
  DKBUF_DEBUG;
  DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  /* a throwaway stack cursor is enough for a single point lookup */
  MDBX_cursor_couple cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err;
}
 17878  
 17879  int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
 17880                              MDBX_val *data) {
 17881    int rc = check_txn(txn, MDBX_TXN_BLOCKED);
 17882    if (unlikely(rc != MDBX_SUCCESS))
 17883      return rc;
 17884  
 17885    if (unlikely(!key || !data))
 17886      return MDBX_EINVAL;
 17887  
 17888    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 17889      return MDBX_BAD_DBI;
 17890  
 17891    if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED))
 17892      return MDBX_BAD_TXN;
 17893  
 17894    MDBX_cursor_couple cx;
 17895    rc = cursor_init(&cx.outer, txn, dbi);
 17896    if (unlikely(rc != MDBX_SUCCESS))
 17897      return rc;
 17898  
 17899    return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND);
 17900  }
 17901  
/* Public API: lookup like mdbx_get(), but additionally report via
 * `values_count` how many values (duplicates) are stored under the key
 * (0 when the key is absent, 1 for a non-dup record). */
int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
                size_t *values_count) {
  DKBUF_DEBUG;
  DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* absent key is reported as zero values rather than an error count */
    if (rc == MDBX_NOTFOUND && values_count)
      *values_count = 0;
    return rc;
  }

  if (values_count) {
    *values_count = 1;
    if (cx.outer.mc_xcursor != NULL) {
      MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
                                  cx.outer.mc_ki[cx.outer.mc_top]);
      if (node_flags(node) & F_DUPDATA) {
        /* dup record: the nested (inner) cursor holds the dup-count */
        // coverity[uninit_use : FALSE]
        tASSERT(txn, cx.outer.mc_xcursor == &cx.inner &&
                         (cx.inner.mx_cursor.mc_flags & C_INITIALIZED));
        // coverity[uninit_use : FALSE]
        /* clamp to PTRDIFF_MAX when size_t is narrower than md_entries */
        *values_count =
            (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) ||
             cx.inner.mx_db.md_entries <= PTRDIFF_MAX)
                ? (size_t)cx.inner.mx_db.md_entries
                : PTRDIFF_MAX;
      }
    }
  }
  return MDBX_SUCCESS;
}
 17949  
/* Find a sibling for a page.
 * Replaces the page at the top of the cursor's stack with the specified
 * sibling, if one exists, and positions the parent index onto it.
 *
 * [in] mc    The cursor for this operation.
 * [in] dir   SIBLING_LEFT or SIBLING_RIGHT.
 *
 * Returns 0 on success, non-zero on failure (MDBX_NOTFOUND when the
 * cursor is already at the edge of the tree). */
static int cursor_sibling(MDBX_cursor *mc, int dir) {
  int rc;
  MDBX_node *node;
  MDBX_page *mp;
  assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT);

  if (unlikely(mc->mc_snum < 2))
    return MDBX_NOTFOUND; /* root has no siblings */

  /* Step up to the parent page; the pop is undone below if no sibling
   * exists on this level. */
  cursor_pop(mc);
  DEBUG("parent page is page %" PRIaPGNO ", index %u",
        mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);

  /* At the edge of the parent page? Then recurse so the parent itself
   * moves to its sibling before we descend again. */
  if ((dir == SIBLING_RIGHT)
          ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top]))
          : (mc->mc_ki[mc->mc_top] == 0)) {
    DEBUG("no more keys aside, moving to next %s sibling",
          dir ? "right" : "left");
    if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) {
      /* undo cursor_pop before returning */
      mc->mc_top++;
      mc->mc_snum++;
      return rc;
    }
  } else {
    /* The two asserts together imply SIBLING_LEFT == 0 and
     * SIBLING_RIGHT == 2, so (dir - 1) steps the parent index by +/-1. */
    assert((dir - 1) == -1 || (dir - 1) == 1);
    mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1);
    DEBUG("just moving to %s index key %u",
          (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]);
  }
  cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));

  /* Fetch the sibling page referenced by the (now updated) parent slot. */
  node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
  rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* mc will be inconsistent if caller does mc_snum++ as above */
    mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
    return rc;
  }

  rc = cursor_push(mc, mp);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Enter the sibling at the end nearest to the page we came from:
   * last key when moving left, first key when moving right. */
  mc->mc_ki[mc->mc_top] =
      (dir == SIBLING_LEFT) ? (indx_t)page_numkeys(mp) - 1 : 0;
  return MDBX_SUCCESS;
}
 18006  
/* Move the cursor to the next data item.
 *
 * [in,out] mc  The cursor to advance.
 * [out] key    Receives the key at the new position (optional).
 * [out] data   Receives the data at the new position (optional).
 * [in] op      MDBX_NEXT, MDBX_NEXT_DUP or MDBX_NEXT_NODUP.
 *
 * Returns MDBX_SUCCESS, MDBX_NOTFOUND past the end, or an error code. */
static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                       MDBX_cursor_op op) {
  MDBX_page *mp;
  MDBX_node *node;
  int rc;

  /* After a delete there is no current duplicate to step from. */
  if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP)
    return MDBX_NOTFOUND;

  /* An uninitialized cursor starts from the first item. */
  if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
    return cursor_first(mc, key, data);

  mp = mc->mc_pg[mc->mc_top];
  if (unlikely(mc->mc_flags & C_EOF)) {
    /* Still past the last key on this page: nothing further. */
    if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp))
      return MDBX_NOTFOUND;
    mc->mc_flags ^= C_EOF;
  }

  if (mc->mc_db->md_flags & MDBX_DUPSORT) {
    node = page_node(mp, mc->mc_ki[mc->mc_top]);
    if (node_flags(node) & F_DUPDATA) {
      if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) {
        /* Advance inside the nested dup-tree first; for MDBX_NEXT fall
         * through to the next key only once duplicates are exhausted. */
        rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT);
        if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) {
          if (likely(rc == MDBX_SUCCESS))
            get_key_optional(node, key);
          return rc;
        }
      }
    } else {
      /* Current entry has no duplicates: invalidate the nested cursor. */
      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
      if (op == MDBX_NEXT_DUP)
        return MDBX_NOTFOUND;
    }
  }

  DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno,
        (void *)mc);
  /* After a delete the cursor already points at the successor entry,
   * so consume the flag and skip the index increment. */
  if (mc->mc_flags & C_DEL) {
    mc->mc_flags ^= C_DEL;
    goto skip;
  }

  int ki = mc->mc_ki[mc->mc_top];
  mc->mc_ki[mc->mc_top] = (indx_t)++ki;
  const int numkeys = page_numkeys(mp);
  if (unlikely(ki >= numkeys)) {
    DEBUG("%s", "=====> move to next sibling page");
    /* Park on the last valid slot so EOF state stays consistent if the
     * sibling move fails. */
    mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1);
    rc = cursor_sibling(mc, SIBLING_RIGHT);
    if (unlikely(rc != MDBX_SUCCESS)) {
      mc->mc_flags |= C_EOF;
      return rc;
    }
    mp = mc->mc_pg[mc->mc_top];
    DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
          mc->mc_ki[mc->mc_top]);
  }

skip:
  DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u",
        mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]);

  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    return MDBX_CORRUPTED;
  }

  /* LEAF2 pages hold fixed-size keys only; the key is read in place. */
  if (IS_LEAF2(mp)) {
    if (likely(key)) {
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    }
    return MDBX_SUCCESS;
  }

  node = page_node(mp, mc->mc_ki[mc->mc_top]);
  if (node_flags(node) & F_DUPDATA) {
    /* New key with duplicates: (re)start the nested cursor at its first. */
    rc = cursor_xinit1(mc, node, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  } else if (likely(data)) {
    rc = node_read(mc, node, data, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  get_key_optional(node, key);
  return MDBX_SUCCESS;
}
 18103  
/* Move the cursor to the previous data item.
 *
 * [in,out] mc  The cursor to step back.
 * [out] key    Receives the key at the new position (optional).
 * [out] data   Receives the data at the new position (optional).
 * [in] op      MDBX_PREV, MDBX_PREV_DUP or MDBX_PREV_NODUP.
 *
 * Returns MDBX_SUCCESS, MDBX_NOTFOUND before the start, or an error. */
static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                       MDBX_cursor_op op) {
  MDBX_page *mp;
  MDBX_node *node;
  int rc;

  /* After a delete there is no current duplicate to step from. */
  if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP)
    return MDBX_NOTFOUND;

  if (unlikely(!(mc->mc_flags & C_INITIALIZED))) {
    /* Start from the last item, then bump the index so the decrement
     * below lands back on it. */
    rc = cursor_last(mc, key, data);
    if (unlikely(rc))
      return rc;
    mc->mc_ki[mc->mc_top]++;
  }

  mp = mc->mc_pg[mc->mc_top];
  if ((mc->mc_db->md_flags & MDBX_DUPSORT) &&
      mc->mc_ki[mc->mc_top] < page_numkeys(mp)) {
    node = page_node(mp, mc->mc_ki[mc->mc_top]);
    if (node_flags(node) & F_DUPDATA) {
      if (op == MDBX_PREV || op == MDBX_PREV_DUP) {
        /* Step back inside the nested dup-tree first; for MDBX_PREV fall
         * through to the previous key only once duplicates are exhausted. */
        rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV);
        if (op != MDBX_PREV || rc != MDBX_NOTFOUND) {
          if (likely(rc == MDBX_SUCCESS)) {
            get_key_optional(node, key);
            mc->mc_flags &= ~C_EOF;
          }
          return rc;
        }
      }
    } else {
      /* Current entry has no duplicates: invalidate the nested cursor. */
      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
      if (op == MDBX_PREV_DUP)
        return MDBX_NOTFOUND;
    }
  }

  DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno,
        (void *)mc);

  mc->mc_flags &= ~(C_EOF | C_DEL);

  int ki = mc->mc_ki[mc->mc_top];
  mc->mc_ki[mc->mc_top] = (indx_t)--ki;
  if (unlikely(ki < 0)) {
    /* Fell off the front of this page: try the left sibling. */
    mc->mc_ki[mc->mc_top] = 0;
    DEBUG("%s", "=====> move to prev sibling page");
    if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS)
      return rc;
    mp = mc->mc_pg[mc->mc_top];
    DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
          mc->mc_ki[mc->mc_top]);
  }
  DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u",
        mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]);

  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    return MDBX_CORRUPTED;
  }

  /* LEAF2 pages hold fixed-size keys only; the key is read in place. */
  if (IS_LEAF2(mp)) {
    if (likely(key)) {
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    }
    return MDBX_SUCCESS;
  }

  node = page_node(mp, mc->mc_ki[mc->mc_top]);

  if (node_flags(node) & F_DUPDATA) {
    /* New key with duplicates: (re)start the nested cursor at its last. */
    rc = cursor_xinit1(mc, node, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  } else if (likely(data)) {
    rc = node_read(mc, node, data, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  get_key_optional(node, key);
  return MDBX_SUCCESS;
}
 18194  
/* Set the cursor on a specific data item.
 *
 * Handles MDBX_SET, MDBX_SET_KEY, MDBX_SET_RANGE, MDBX_GET_BOTH and
 * MDBX_GET_BOTH_RANGE (see the op checks below). Returns the positioning
 * error in .err and whether the key matched exactly in .exact. */
__hot static struct cursor_set_result
cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) {
  MDBX_page *mp;
  MDBX_node *node = NULL;
  DKBUF_DEBUG;

  struct cursor_set_result ret;
  ret.exact = false;
  /* Reject keys outside the configured size bounds for this dbi. */
  if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
               key->iov_len > mc->mc_dbx->md_klen_max)) {
    cASSERT(mc, !"Invalid key-size");
    ret.err = MDBX_BAD_VALSIZE;
    return ret;
  }

  /* For MDBX_INTEGERKEY the key must be a naturally-aligned 4- or 8-byte
   * integer; misaligned caller buffers are copied to a local. */
  MDBX_val aligned_key = *key;
  uint64_t aligned_keybytes;
  if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
    switch (aligned_key.iov_len) {
    default:
      cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY");
      ret.err = MDBX_BAD_VALSIZE;
      return ret;
    case 4:
      if (unlikely(3 & (uintptr_t)aligned_key.iov_base))
        /* copy instead of return error to avoid break compatibility */
        aligned_key.iov_base =
            memcpy(&aligned_keybytes, aligned_key.iov_base, 4);
      break;
    case 8:
      if (unlikely(7 & (uintptr_t)aligned_key.iov_base))
        /* copy instead of return error to avoid break compatibility */
        aligned_key.iov_base =
            memcpy(&aligned_keybytes, aligned_key.iov_base, 8);
      break;
    }
  }

  /* Any reposition of the outer cursor invalidates the nested one. */
  if (mc->mc_xcursor)
    mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);

  /* See if we're already on the right page */
  if (mc->mc_flags & C_INITIALIZED) {
    MDBX_val nodekey;

    cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
    mp = mc->mc_pg[mc->mc_top];
    if (unlikely(!page_numkeys(mp))) {
      mc->mc_ki[mc->mc_top] = 0;
      mc->mc_flags |= C_EOF;
      ret.err = MDBX_NOTFOUND;
      return ret;
    }
    /* Compare against the first key of the current page. */
    if (IS_LEAF2(mp)) {
      nodekey.iov_len = mc->mc_db->md_xsize;
      nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len);
    } else {
      node = page_node(mp, 0);
      get_key(node, &nodekey);
    }
    int cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
    if (unlikely(cmp == 0)) {
      /* Probably happens rarely, but first node on the page
       * was the one we wanted. */
      mc->mc_ki[mc->mc_top] = 0;
      ret.exact = true;
      cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
                      (mc->mc_flags & C_EOF));
      goto got_node;
    }
    if (cmp > 0) {
      /* Sought key is beyond the first key: check whether it still falls
       * within this page before doing a full tree search. */
      const unsigned nkeys = page_numkeys(mp);
      if (nkeys > 1) {
        if (IS_LEAF2(mp)) {
          nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len);
        } else {
          node = page_node(mp, nkeys - 1);
          get_key(node, &nodekey);
        }
        cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
        if (cmp == 0) {
          /* last node was the one we wanted */
          cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1);
          mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1);
          ret.exact = true;
          cASSERT(mc,
                  mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
                      (mc->mc_flags & C_EOF));
          goto got_node;
        }
        if (cmp < 0) {
          /* Key is between first and last of this page. */
          if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) {
            /* This is definitely the right page, skip search_page */
            if (IS_LEAF2(mp)) {
              nodekey.iov_base =
                  page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len);
            } else {
              node = page_node(mp, mc->mc_ki[mc->mc_top]);
              get_key(node, &nodekey);
            }
            cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
            if (cmp == 0) {
              /* current node was the one we wanted */
              ret.exact = true;
              cASSERT(mc, mc->mc_ki[mc->mc_top] <
                                  page_numkeys(mc->mc_pg[mc->mc_top]) ||
                              (mc->mc_flags & C_EOF));
              goto got_node;
            }
          }
          mc->mc_flags &= ~C_EOF;
          goto search_node;
        }
      }
      /* If any parents have right-sibs, search.
       * Otherwise, there's nothing further. */
      unsigned i;
      for (i = 0; i < mc->mc_top; i++)
        if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1)
          break;
      if (i == mc->mc_top) {
        /* There are no other pages */
        cASSERT(mc, nkeys <= UINT16_MAX);
        mc->mc_ki[mc->mc_top] = (uint16_t)nkeys;
        mc->mc_flags |= C_EOF;
        ret.err = MDBX_NOTFOUND;
        return ret;
      }
    }
    if (!mc->mc_top) {
      /* There are no other pages */
      mc->mc_ki[mc->mc_top] = 0;
      if (op == MDBX_SET_RANGE)
        goto got_node;

      cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
                      (mc->mc_flags & C_EOF));
      ret.err = MDBX_NOTFOUND;
      return ret;
    }
  } else {
    mc->mc_pg[0] = 0;
  }

  /* Fast path failed: do a full top-down search for the key. */
  ret.err = page_search(mc, &aligned_key, 0);
  if (unlikely(ret.err != MDBX_SUCCESS))
    return ret;

  mp = mc->mc_pg[mc->mc_top];
  cASSERT(mc, IS_LEAF(mp));

search_node:;
  struct node_result nsr = node_search(mc, &aligned_key);
  node = nsr.node;
  ret.exact = nsr.exact;
  if (!ret.exact) {
    if (op != MDBX_SET_RANGE) {
      /* MDBX_SET specified and not an exact match. */
      if (unlikely(mc->mc_ki[mc->mc_top] >=
                   page_numkeys(mc->mc_pg[mc->mc_top])))
        mc->mc_flags |= C_EOF;
      ret.err = MDBX_NOTFOUND;
      return ret;
    }

    if (node == NULL) {
      /* MDBX_SET_RANGE: all keys on this leaf are smaller, so the answer
       * is the first key of the right sibling (if any). */
      DEBUG("%s", "===> inexact leaf not found, goto sibling");
      ret.err = cursor_sibling(mc, SIBLING_RIGHT);
      if (unlikely(ret.err != MDBX_SUCCESS)) {
        mc->mc_flags |= C_EOF;
        return ret; /* no entries matched */
      }
      mp = mc->mc_pg[mc->mc_top];
      cASSERT(mc, IS_LEAF(mp));
      if (!IS_LEAF2(mp))
        node = page_node(mp, 0);
    }
  }
  cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
                  (mc->mc_flags & C_EOF));

got_node:
  mc->mc_flags |= C_INITIALIZED;
  mc->mc_flags &= ~C_EOF;

  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    ret.err = MDBX_CORRUPTED;
    return ret;
  }

  /* LEAF2 pages hold fixed-size keys only; return the found key for the
   * ops that report it back to the caller. */
  if (IS_LEAF2(mp)) {
    if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) {
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    }
    ret.err = MDBX_SUCCESS;
    return ret;
  }

  if (node_flags(node) & F_DUPDATA) {
    /* The key owns a nested dup-tree: set up and position the nested
     * cursor (first duplicate for SET-ops, search for GET_BOTH-ops). */
    ret.err = cursor_xinit1(mc, node, mp);
    if (unlikely(ret.err != MDBX_SUCCESS))
      return ret;
    if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) {
      ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
      if (unlikely(ret.err != MDBX_SUCCESS))
        return ret;
    } else {
      ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE);
      if (unlikely(ret.err != MDBX_SUCCESS))
        return ret;
      if (op == MDBX_GET_BOTH && !ret.exact) {
        ret.err = MDBX_NOTFOUND;
        return ret;
      }
    }
  } else if (likely(data)) {
    if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) {
      /* Plain (non-dup) entry: compare the stored value against the
       * caller-provided one. */
      if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
                   data->iov_len > mc->mc_dbx->md_vlen_max)) {
        cASSERT(mc, !"Invalid data-size");
        ret.err = MDBX_BAD_VALSIZE;
        return ret;
      }
      /* Same alignment fixup as for keys, but for MDBX_INTEGERDUP data. */
      MDBX_val aligned_data = *data;
      uint64_t aligned_databytes;
      if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
        switch (aligned_data.iov_len) {
        default:
          cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP");
          ret.err = MDBX_BAD_VALSIZE;
          return ret;
        case 4:
          if (unlikely(3 & (uintptr_t)aligned_data.iov_base))
            /* copy instead of return error to avoid break compatibility */
            aligned_data.iov_base =
                memcpy(&aligned_databytes, aligned_data.iov_base, 4);
          break;
        case 8:
          if (unlikely(7 & (uintptr_t)aligned_data.iov_base))
            /* copy instead of return error to avoid break compatibility */
            aligned_data.iov_base =
                memcpy(&aligned_databytes, aligned_data.iov_base, 8);
          break;
        }
      }
      MDBX_val actual_data;
      ret.err = node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]);
      if (unlikely(ret.err != MDBX_SUCCESS))
        return ret;
      const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data);
      if (cmp) {
        cASSERT(mc,
                mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
                    (mc->mc_flags & C_EOF));
        /* GET_BOTH_RANGE accepts a stored value greater than the sought
         * one; everything else is a miss. */
        if (op != MDBX_GET_BOTH_RANGE || cmp > 0) {
          ret.err = MDBX_NOTFOUND;
          return ret;
        }
      }
      *data = actual_data;
    } else {
      ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]);
      if (unlikely(ret.err != MDBX_SUCCESS))
        return ret;
    }
  }

  /* The key already matches in all other cases */
  if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY)
    get_key_optional(node, key);

  DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key),
        DVAL_DEBUG(data));
  ret.err = MDBX_SUCCESS;
  return ret;
}
 18475  
 18476  /* Move the cursor to the first item in the database. */
 18477  static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) {
 18478    int rc;
 18479  
 18480    if (mc->mc_xcursor)
 18481      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
 18482  
 18483    if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
 18484      rc = page_search(mc, NULL, MDBX_PS_FIRST);
 18485      if (unlikely(rc != MDBX_SUCCESS))
 18486        return rc;
 18487    }
 18488  
 18489    const MDBX_page *mp = mc->mc_pg[mc->mc_top];
 18490    if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
 18491      ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
 18492            mp->mp_pgno, mp->mp_flags);
 18493      return MDBX_CORRUPTED;
 18494    }
 18495  
 18496    mc->mc_flags |= C_INITIALIZED;
 18497    mc->mc_flags &= ~C_EOF;
 18498    mc->mc_ki[mc->mc_top] = 0;
 18499  
 18500    if (IS_LEAF2(mp)) {
 18501      if (likely(key)) {
 18502        key->iov_len = mc->mc_db->md_xsize;
 18503        key->iov_base = page_leaf2key(mp, 0, key->iov_len);
 18504      }
 18505      return MDBX_SUCCESS;
 18506    }
 18507  
 18508    MDBX_node *node = page_node(mp, 0);
 18509    if (node_flags(node) & F_DUPDATA) {
 18510      rc = cursor_xinit1(mc, node, mp);
 18511      if (unlikely(rc != MDBX_SUCCESS))
 18512        return rc;
 18513      rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
 18514      if (unlikely(rc))
 18515        return rc;
 18516    } else if (likely(data)) {
 18517      rc = node_read(mc, node, data, mp);
 18518      if (unlikely(rc != MDBX_SUCCESS))
 18519        return rc;
 18520    }
 18521  
 18522    get_key_optional(node, key);
 18523    return MDBX_SUCCESS;
 18524  }
 18525  
 18526  /* Move the cursor to the last item in the database. */
 18527  static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) {
 18528    int rc;
 18529  
 18530    if (mc->mc_xcursor)
 18531      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
 18532  
 18533    if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
 18534      rc = page_search(mc, NULL, MDBX_PS_LAST);
 18535      if (unlikely(rc != MDBX_SUCCESS))
 18536        return rc;
 18537    }
 18538  
 18539    const MDBX_page *mp = mc->mc_pg[mc->mc_top];
 18540    if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
 18541      ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
 18542            mp->mp_pgno, mp->mp_flags);
 18543      return MDBX_CORRUPTED;
 18544    }
 18545  
 18546    mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1;
 18547    mc->mc_flags |= C_INITIALIZED | C_EOF;
 18548  
 18549    if (IS_LEAF2(mp)) {
 18550      if (likely(key)) {
 18551        key->iov_len = mc->mc_db->md_xsize;
 18552        key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
 18553      }
 18554      return MDBX_SUCCESS;
 18555    }
 18556  
 18557    MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
 18558    if (node_flags(node) & F_DUPDATA) {
 18559      rc = cursor_xinit1(mc, node, mp);
 18560      if (unlikely(rc != MDBX_SUCCESS))
 18561        return rc;
 18562      rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
 18563      if (unlikely(rc))
 18564        return rc;
 18565    } else if (likely(data)) {
 18566      rc = node_read(mc, node, data, mp);
 18567      if (unlikely(rc != MDBX_SUCCESS))
 18568        return rc;
 18569    }
 18570  
 18571    get_key_optional(node, key);
 18572    return MDBX_SUCCESS;
 18573  }
 18574  
 18575  __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
 18576                            MDBX_cursor_op op) {
 18577    if (unlikely(mc == NULL))
 18578      return MDBX_EINVAL;
 18579  
 18580    if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
 18581      return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
 18582                                                       : MDBX_EBADSIGN;
 18583  
 18584    int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED);
 18585    if (unlikely(rc != MDBX_SUCCESS))
 18586      return rc;
 18587  
 18588    int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data);
 18589    switch (op) {
 18590    case MDBX_GET_CURRENT: {
 18591      if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
 18592        return MDBX_ENODATA;
 18593      const MDBX_page *mp = mc->mc_pg[mc->mc_top];
 18594      if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
 18595        ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
 18596              mp->mp_pgno, mp->mp_flags);
 18597        return MDBX_CORRUPTED;
 18598      }
 18599      const unsigned nkeys = page_numkeys(mp);
 18600      if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) {
 18601        cASSERT(mc, nkeys <= UINT16_MAX);
 18602        if (mc->mc_flags & C_EOF)
 18603          return MDBX_ENODATA;
 18604        mc->mc_ki[mc->mc_top] = (uint16_t)nkeys;
 18605        mc->mc_flags |= C_EOF;
 18606        return MDBX_NOTFOUND;
 18607      }
 18608      cASSERT(mc, nkeys > 0);
 18609  
 18610      rc = MDBX_SUCCESS;
 18611      if (IS_LEAF2(mp)) {
 18612        key->iov_len = mc->mc_db->md_xsize;
 18613        key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
 18614      } else {
 18615        MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
 18616        get_key_optional(node, key);
 18617        if (data) {
 18618          if (node_flags(node) & F_DUPDATA) {
 18619            if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) {
 18620              rc = cursor_xinit1(mc, node, mp);
 18621              if (unlikely(rc != MDBX_SUCCESS))
 18622                return rc;
 18623              rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
 18624              if (unlikely(rc))
 18625                return rc;
 18626            } else {
 18627              rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL,
 18628                                   MDBX_GET_CURRENT);
 18629              if (unlikely(rc))
 18630                return rc;
 18631            }
 18632          } else {
 18633            rc = node_read(mc, node, data, mp);
 18634            if (unlikely(rc))
 18635              return rc;
 18636          }
 18637        }
 18638      }
 18639      break;
 18640    }
 18641    case MDBX_GET_BOTH:
 18642    case MDBX_GET_BOTH_RANGE:
 18643      if (unlikely(data == NULL))
 18644        return MDBX_EINVAL;
 18645      if (unlikely(mc->mc_xcursor == NULL))
 18646        return MDBX_INCOMPATIBLE;
 18647      /* fall through */
 18648      __fallthrough;
 18649    case MDBX_SET:
 18650    case MDBX_SET_KEY:
 18651    case MDBX_SET_RANGE:
 18652      if (unlikely(key == NULL))
 18653        return MDBX_EINVAL;
 18654      rc = cursor_set(mc, key, data, op).err;
 18655      if (mc->mc_flags & C_INITIALIZED) {
 18656        cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum);
 18657        cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
 18658                        (mc->mc_flags & C_EOF));
 18659      }
 18660      break;
 18661    case MDBX_GET_MULTIPLE:
 18662      if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED)))
 18663        return MDBX_EINVAL;
 18664      if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED)))
 18665        return MDBX_INCOMPATIBLE;
 18666      rc = MDBX_SUCCESS;
 18667      if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) !=
 18668          C_INITIALIZED)
 18669        break;
 18670      goto fetchm;
 18671    case MDBX_NEXT_MULTIPLE:
 18672      if (unlikely(data == NULL))
 18673        return MDBX_EINVAL;
 18674      if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED)))
 18675        return MDBX_INCOMPATIBLE;
 18676      rc = cursor_next(mc, key, data, MDBX_NEXT_DUP);
 18677      if (rc == MDBX_SUCCESS) {
 18678        if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
 18679          MDBX_cursor *mx;
 18680        fetchm:
 18681          mx = &mc->mc_xcursor->mx_cursor;
 18682          data->iov_len =
 18683              page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize;
 18684          data->iov_base = page_data(mx->mc_pg[mx->mc_top]);
 18685          mx->mc_ki[mx->mc_top] = (indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1;
 18686        } else {
 18687          rc = MDBX_NOTFOUND;
 18688        }
 18689      }
 18690      break;
 18691    case MDBX_PREV_MULTIPLE:
 18692      if (data == NULL)
 18693        return MDBX_EINVAL;
 18694      if (!(mc->mc_db->md_flags & MDBX_DUPFIXED))
 18695        return MDBX_INCOMPATIBLE;
 18696      rc = MDBX_SUCCESS;
 18697      if (!(mc->mc_flags & C_INITIALIZED))
 18698        rc = cursor_last(mc, key, data);
 18699      if (rc == MDBX_SUCCESS) {
 18700        MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor;
 18701        if (mx->mc_flags & C_INITIALIZED) {
 18702          rc = cursor_sibling(mx, SIBLING_LEFT);
 18703          if (rc == MDBX_SUCCESS)
 18704            goto fetchm;
 18705        } else {
 18706          rc = MDBX_NOTFOUND;
 18707        }
 18708      }
 18709      break;
 18710    case MDBX_NEXT:
 18711    case MDBX_NEXT_DUP:
 18712    case MDBX_NEXT_NODUP:
 18713      rc = cursor_next(mc, key, data, op);
 18714      break;
 18715    case MDBX_PREV:
 18716    case MDBX_PREV_DUP:
 18717    case MDBX_PREV_NODUP:
 18718      rc = cursor_prev(mc, key, data, op);
 18719      break;
 18720    case MDBX_FIRST:
 18721      rc = cursor_first(mc, key, data);
 18722      break;
 18723    case MDBX_FIRST_DUP:
 18724      mfunc = cursor_first;
 18725    move:
 18726      if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED)))
 18727        return MDBX_EINVAL;
 18728      if (unlikely(mc->mc_xcursor == NULL))
 18729        return MDBX_INCOMPATIBLE;
 18730      if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) {
 18731        mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]);
 18732        mc->mc_flags |= C_EOF;
 18733        return MDBX_NOTFOUND;
 18734      }
 18735      {
 18736        MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 18737        if (!(node_flags(node) & F_DUPDATA)) {
 18738          get_key_optional(node, key);
 18739          rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]);
 18740          break;
 18741        }
 18742      }
 18743      if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)))
 18744        return MDBX_EINVAL;
 18745      rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
 18746      break;
 18747    case MDBX_LAST:
 18748      rc = cursor_last(mc, key, data);
 18749      break;
 18750    case MDBX_LAST_DUP:
 18751      mfunc = cursor_last;
 18752      goto move;
 18753    case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */
 18754    case MDBX_SET_LOWERBOUND: {
 18755      if (unlikely(key == NULL || data == NULL))
 18756        return MDBX_EINVAL;
 18757      MDBX_val save_data = *data;
 18758      struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE);
 18759      rc = csr.err;
 18760      if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) {
 18761        mc->mc_flags &= ~C_DEL;
 18762        csr.exact = false;
 18763        if (!save_data.iov_base && (mc->mc_db->md_flags & MDBX_DUPFIXED)) {
 18764          /* Avoiding search nested dupfixed hive if no data provided.
 18765           * This is changes the semantic of MDBX_SET_LOWERBOUND but avoid
 18766           * returning MDBX_BAD_VALSIZE. */
 18767        } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
 18768          *data = save_data;
 18769          csr =
 18770              cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE);
 18771          rc = csr.err;
 18772          if (rc == MDBX_NOTFOUND) {
 18773            cASSERT(mc, !csr.exact);
 18774            rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP);
 18775          }
 18776        } else {
 18777          int cmp = mc->mc_dbx->md_dcmp(&save_data, data);
 18778          csr.exact = (cmp == 0);
 18779          if (cmp > 0)
 18780            rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP);
 18781        }
 18782      }
 18783      if (rc == MDBX_SUCCESS && !csr.exact)
 18784        rc = MDBX_RESULT_TRUE;
 18785      if (unlikely(op == MDBX_SET_UPPERBOUND)) {
 18786        /* minor fixups for MDBX_SET_UPPERBOUND */
 18787        if (rc == MDBX_RESULT_TRUE)
 18788          /* already at great-than by MDBX_SET_LOWERBOUND */
 18789          rc = MDBX_SUCCESS;
 18790        else if (rc == MDBX_SUCCESS)
 18791          /* exactly match, going next */
 18792          rc = cursor_next(mc, key, data, MDBX_NEXT);
 18793      }
 18794      break;
 18795    }
 18796    default:
 18797      DEBUG("unhandled/unimplemented cursor operation %u", op);
 18798      return MDBX_EINVAL;
 18799    }
 18800  
 18801    mc->mc_flags &= ~C_DEL;
 18802    return rc;
 18803  }
 18804  
 18805  static int cursor_first_batch(MDBX_cursor *mc) {
 18806    if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
 18807      int err = page_search(mc, NULL, MDBX_PS_FIRST);
 18808      if (unlikely(err != MDBX_SUCCESS))
 18809        return err;
 18810    }
 18811    cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
 18812  
 18813    mc->mc_flags |= C_INITIALIZED;
 18814    mc->mc_flags &= ~C_EOF;
 18815    mc->mc_ki[mc->mc_top] = 0;
 18816    return MDBX_SUCCESS;
 18817  }
 18818  
 18819  static int cursor_next_batch(MDBX_cursor *mc) {
 18820    if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
 18821      return cursor_first_batch(mc);
 18822  
 18823    MDBX_page *mp = mc->mc_pg[mc->mc_top];
 18824    if (unlikely(mc->mc_flags & C_EOF)) {
 18825      if ((unsigned)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp))
 18826        return MDBX_NOTFOUND;
 18827      mc->mc_flags ^= C_EOF;
 18828    }
 18829  
 18830    int ki = mc->mc_ki[mc->mc_top];
 18831    mc->mc_ki[mc->mc_top] = (indx_t)++ki;
 18832    const int numkeys = page_numkeys(mp);
 18833    if (likely(ki >= numkeys)) {
 18834      DEBUG("%s", "=====> move to next sibling page");
 18835      mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1);
 18836      int err = cursor_sibling(mc, SIBLING_RIGHT);
 18837      if (unlikely(err != MDBX_SUCCESS)) {
 18838        mc->mc_flags |= C_EOF;
 18839        return err;
 18840      }
 18841      mp = mc->mc_pg[mc->mc_top];
 18842      DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
 18843            mc->mc_ki[mc->mc_top]);
 18844      if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
 18845        ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
 18846              mp->mp_pgno, mp->mp_flags);
 18847        return MDBX_CORRUPTED;
 18848      }
 18849    }
 18850    return MDBX_SUCCESS;
 18851  }
 18852  
/* Public API: bulk-fetch key/value pairs from a non-dupsort subDB.
 *
 * Fills `pairs` with interleaved key/value items (pairs[0]=key0,
 * pairs[1]=val0, pairs[2]=key1, ...) taken from the leaf page the cursor
 * lands on, and stores the number of filled slots (2 per item) in *count.
 * `limit` is the capacity of `pairs` in slots; values below 4 (room for
 * at least two items) are rejected as MDBX_EINVAL.
 * Supported ops: MDBX_FIRST, MDBX_NEXT, MDBX_GET_CURRENT.
 * Returns MDBX_RESULT_TRUE when `limit` was exhausted before the page,
 * i.e. more data remains on the current page. */
int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs,
                          size_t limit, MDBX_cursor_op op) {
  if (unlikely(mc == NULL || count == NULL || limit < 4))
    return MDBX_EINVAL;

  if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
    return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                     : MDBX_EBADSIGN;

  int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT))
    return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */;

  /* Position the cursor; the copy loop below then reads from the
   * resulting leaf page. */
  switch (op) {
  case MDBX_FIRST:
    rc = cursor_first_batch(mc);
    break;
  case MDBX_NEXT:
    rc = cursor_next_batch(mc);
    break;
  case MDBX_GET_CURRENT:
    rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA;
    break;
  default:
    DEBUG("unhandled/unimplemented cursor operation %u", op);
    rc = MDBX_EINVAL;
    break;
  }

  if (unlikely(rc != MDBX_SUCCESS)) {
    *count = 0;
    return rc;
  }

  const MDBX_page *const mp = mc->mc_pg[mc->mc_top];
  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    return MDBX_CORRUPTED;
  }
  const unsigned nkeys = page_numkeys(mp);
  unsigned i = mc->mc_ki[mc->mc_top], n = 0;
  if (unlikely(i >= nkeys)) {
    /* The cursor index is past the page: only possible for a repeated
     * MDBX_GET_CURRENT after the page was fully consumed.  Report
     * end-of-data once (MDBX_NOTFOUND, setting C_EOF), then MDBX_ENODATA
     * on subsequent calls. */
    cASSERT(mc, op == MDBX_GET_CURRENT);
    cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE);
    *count = 0;
    if (mc->mc_flags & C_EOF) {
      cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE);
      return MDBX_ENODATA;
    }
    if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE)
      return MDBX_EINVAL /* again MDBX_GET_CURRENT after MDBX_GET_CURRENT */;
    mc->mc_flags |= C_EOF;
    return MDBX_NOTFOUND;
  }

  /* Copy key/value pairs until the page or the output buffer runs out;
   * each item occupies two slots of `pairs`. */
  do {
    if (unlikely(n + 2 > limit)) {
      rc = MDBX_RESULT_TRUE; /* buffer full, page not exhausted */
      break;
    }
    const MDBX_node *leaf = page_node(mp, i);
    get_key(leaf, &pairs[n]);
    rc = node_read(mc, leaf, &pairs[n + 1], mp);
    if (unlikely(rc != MDBX_SUCCESS))
      break;
    n += 2;
  } while (++i < nkeys);

  /* Remember how far we got so the next MDBX_NEXT resumes correctly. */
  mc->mc_ki[mc->mc_top] = (indx_t)i;
  *count = n;
  return rc;
}
 18929  
 18930  static int touch_dbi(MDBX_cursor *mc) {
 18931    cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0);
 18932    *mc->mc_dbistate |= DBI_DIRTY;
 18933    mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
 18934    if (mc->mc_dbi >= CORE_DBS) {
 18935      cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0);
 18936      /* Touch DB record of named DB */
 18937      MDBX_cursor_couple cx;
 18938      int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI);
 18939      if (unlikely(rc != MDBX_SUCCESS))
 18940        return rc;
 18941      mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY;
 18942      rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY);
 18943      if (unlikely(rc != MDBX_SUCCESS))
 18944        return rc;
 18945    }
 18946    return MDBX_SUCCESS;
 18947  }
 18948  
 18949  /* Touch all the pages in the cursor stack. Set mc_top.
 18950   * Makes sure all the pages are writable, before attempting a write operation.
 18951   * [in] mc The cursor to operate on. */
 18952  static int cursor_touch(MDBX_cursor *mc) {
 18953    int rc = MDBX_SUCCESS;
 18954    if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) {
 18955      rc = touch_dbi(mc);
 18956      if (unlikely(rc != MDBX_SUCCESS))
 18957        return rc;
 18958    }
 18959    if (likely(mc->mc_snum)) {
 18960      mc->mc_top = 0;
 18961      do {
 18962        rc = page_touch(mc);
 18963      } while (!rc && ++(mc->mc_top) < mc->mc_snum);
 18964      mc->mc_top = mc->mc_snum - 1;
 18965    }
 18966    return rc;
 18967  }
 18968  
 18969  __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
 18970                            unsigned flags) {
 18971    MDBX_env *env;
 18972    MDBX_page *sub_root = NULL;
 18973    MDBX_val xdata, *rdata, dkey, olddata;
 18974    MDBX_db nested_dupdb;
 18975    int err;
 18976    DKBUF_DEBUG;
 18977  
 18978    if (unlikely(mc == NULL || key == NULL || data == NULL))
 18979      return MDBX_EINVAL;
 18980  
 18981    if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
 18982      return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
 18983                                                       : MDBX_EBADSIGN;
 18984  
 18985    int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED);
 18986    if (unlikely(rc != MDBX_SUCCESS))
 18987      return rc;
 18988  
 18989    if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi)))
 18990      return MDBX_BAD_DBI;
 18991  
 18992    cASSERT(mc, cursor_is_tracked(mc));
 18993    env = mc->mc_txn->mt_env;
 18994  
 18995    /* Check this first so counter will always be zero on any early failures. */
 18996    size_t mcount = 0, dcount = 0;
 18997    if (unlikely(flags & MDBX_MULTIPLE)) {
 18998      if (unlikely(flags & MDBX_RESERVE))
 18999        return MDBX_EINVAL;
 19000      if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED)))
 19001        return MDBX_INCOMPATIBLE;
 19002      dcount = data[1].iov_len;
 19003      if (unlikely(dcount < 2 || data->iov_len == 0))
 19004        return MDBX_BAD_VALSIZE;
 19005      if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize)
 19006        return MDBX_BAD_VALSIZE;
 19007      if (unlikely(dcount > MAX_MAPSIZE / 2 /
 19008                                (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) {
 19009        /* checking for multiplication overflow */
 19010        if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len))
 19011          return MDBX_TOO_LARGE;
 19012      }
 19013      data[1].iov_len = 0 /* reset done item counter */;
 19014    }
 19015  
 19016    if (flags & MDBX_RESERVE) {
 19017      if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP |
 19018                                          MDBX_INTEGERDUP | MDBX_DUPFIXED)))
 19019        return MDBX_INCOMPATIBLE;
 19020      data->iov_base = nullptr;
 19021    }
 19022  
 19023    const unsigned nospill = flags & MDBX_NOSPILL;
 19024    flags -= nospill;
 19025  
 19026    if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
 19027      return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS
 19028                                                      : MDBX_BAD_TXN;
 19029  
 19030    uint64_t aligned_keybytes, aligned_databytes;
 19031    MDBX_val aligned_key, aligned_data;
 19032    if (likely((mc->mc_flags & C_SUB) == 0)) {
 19033      if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
 19034                   key->iov_len > mc->mc_dbx->md_klen_max)) {
 19035        cASSERT(mc, !"Invalid key-size");
 19036        return MDBX_BAD_VALSIZE;
 19037      }
 19038      if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
 19039                   data->iov_len > mc->mc_dbx->md_vlen_max)) {
 19040        cASSERT(mc, !"Invalid data-size");
 19041        return MDBX_BAD_VALSIZE;
 19042      }
 19043  
 19044      if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
 19045        switch (key->iov_len) {
 19046        default:
 19047          cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY");
 19048          return MDBX_BAD_VALSIZE;
 19049        case 4:
 19050          if (unlikely(3 & (uintptr_t)key->iov_base)) {
 19051            /* copy instead of return error to avoid break compatibility */
 19052            aligned_key.iov_base =
 19053                memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4);
 19054            key = &aligned_key;
 19055          }
 19056          break;
 19057        case 8:
 19058          if (unlikely(7 & (uintptr_t)key->iov_base)) {
 19059            /* copy instead of return error to avoid break compatibility */
 19060            aligned_key.iov_base =
 19061                memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8);
 19062            key = &aligned_key;
 19063          }
 19064          break;
 19065        }
 19066      }
 19067      if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
 19068        switch (data->iov_len) {
 19069        default:
 19070          cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY");
 19071          return MDBX_BAD_VALSIZE;
 19072        case 4:
 19073          if (unlikely(3 & (uintptr_t)data->iov_base)) {
 19074            if (unlikely(flags & MDBX_MULTIPLE))
 19075              return MDBX_BAD_VALSIZE;
 19076            /* copy instead of return error to avoid break compatibility */
 19077            aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
 19078                                           aligned_data.iov_len = 4);
 19079            data = &aligned_data;
 19080          }
 19081          break;
 19082        case 8:
 19083          if (unlikely(7 & (uintptr_t)data->iov_base)) {
 19084            if (unlikely(flags & MDBX_MULTIPLE))
 19085              return MDBX_BAD_VALSIZE;
 19086            /* copy instead of return error to avoid break compatibility */
 19087            aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
 19088                                           aligned_data.iov_len = 8);
 19089            data = &aligned_data;
 19090          }
 19091          break;
 19092        }
 19093      }
 19094    }
 19095  
 19096    DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR,
 19097          DDBI(mc), DKEY_DEBUG(key), key->iov_len,
 19098          DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len);
 19099  
 19100    int dupdata_flag = 0;
 19101    if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) {
 19102      if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE)))
 19103        return MDBX_EINVAL;
 19104      /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи,
 19105       * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает
 19106       * со значением в текущей позиции курсора.
 19107       * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц
 19108       * с MDBX_DUPSORT также требуется текущий размер данных. */
 19109      MDBX_val current_key, current_data;
 19110      rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT);
 19111      if (unlikely(rc != MDBX_SUCCESS))
 19112        return rc;
 19113      if (mc->mc_dbx->md_cmp(key, &current_key) != 0)
 19114        return MDBX_EKEYMISMATCH;
 19115  
 19116      if (unlikely((flags & MDBX_MULTIPLE)))
 19117        goto drop_current;
 19118  
 19119      if (mc->mc_db->md_flags & MDBX_DUPSORT) {
 19120        MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 19121        if (node_flags(node) & F_DUPDATA) {
 19122          cASSERT(mc, mc->mc_xcursor != NULL &&
 19123                          (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
 19124          /* Если за ключом более одного значения, либо если размер данных
 19125           * отличается, то вместо обновления требуется удаление и
 19126           * последующая вставка. */
 19127          if (mc->mc_xcursor->mx_db.md_entries > 1 ||
 19128              current_data.iov_len != data->iov_len) {
 19129          drop_current:
 19130            rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS);
 19131            if (unlikely(rc != MDBX_SUCCESS))
 19132              return rc;
 19133            flags -= MDBX_CURRENT;
 19134            goto skip_check_samedata;
 19135          }
 19136        } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) {
 19137          rc = mdbx_cursor_del(mc, 0);
 19138          if (unlikely(rc != MDBX_SUCCESS))
 19139            return rc;
 19140          flags -= MDBX_CURRENT;
 19141          goto skip_check_samedata;
 19142        }
 19143      }
 19144      if (!(flags & MDBX_RESERVE) &&
 19145          unlikely(cmp_lenfast(&current_data, data) == 0))
 19146        return MDBX_SUCCESS /* the same data, nothing to update */;
 19147    skip_check_samedata:;
 19148    }
 19149  
 19150    if (mc->mc_db->md_root == P_INVALID) {
 19151      /* new database, cursor has nothing to point to */
 19152      mc->mc_snum = 0;
 19153      mc->mc_top = 0;
 19154      mc->mc_flags &= ~C_INITIALIZED;
 19155      rc = MDBX_NO_ROOT;
 19156    } else if ((flags & MDBX_CURRENT) == 0) {
 19157      bool exact = false;
 19158      if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) {
 19159        rc = cursor_last(mc, &dkey, &olddata);
 19160        if (likely(rc == MDBX_SUCCESS)) {
 19161          rc = mc->mc_dbx->md_cmp(key, &dkey);
 19162          if (likely(rc > 0)) {
 19163            mc->mc_ki[mc->mc_top]++; /* step forward for appending */
 19164            rc = MDBX_NOTFOUND;
 19165          } else {
 19166            if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP)))
 19167              /* new-key < last-key
 19168               * or new-key == last-key without MDBX_APPENDDUP */
 19169              return MDBX_EKEYMISMATCH;
 19170            exact = true;
 19171          }
 19172        }
 19173      } else {
 19174        struct cursor_set_result csr =
 19175            /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */
 19176            cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET);
 19177        rc = csr.err;
 19178        exact = csr.exact;
 19179      }
 19180      if (likely(rc == MDBX_SUCCESS)) {
 19181        if (exact) {
 19182          if (unlikely(flags & MDBX_NOOVERWRITE)) {
 19183            DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
 19184            *data = olddata;
 19185            return MDBX_KEYEXIST;
 19186          }
 19187          if (unlikely(mc->mc_flags & C_SUB)) {
 19188            /* nested subtree of DUPSORT-database with the same key,
 19189             * nothing to update */
 19190            eASSERT(env, data->iov_len == 0 &&
 19191                             (olddata.iov_len == 0 ||
 19192                              /* olddata may not be updated in case LEAF2-page
 19193                                 of dupfixed-subDB */
 19194                              (mc->mc_db->md_flags & MDBX_DUPFIXED)));
 19195            return MDBX_SUCCESS;
 19196          }
 19197          if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor &&
 19198              (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
 19199            rc = mdbx_cursor_del(mc, MDBX_ALLDUPS);
 19200            if (unlikely(rc != MDBX_SUCCESS))
 19201              return rc;
 19202            flags -= MDBX_ALLDUPS;
 19203            rc = MDBX_NOTFOUND;
 19204            exact = false;
 19205          } else /* checking for early exit without dirtying pages */
 19206            if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) &&
 19207                unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) {
 19208              if (!mc->mc_xcursor)
 19209                /* the same data, nothing to update */
 19210                return MDBX_SUCCESS;
 19211              if (flags & MDBX_NODUPDATA)
 19212                return MDBX_KEYEXIST;
 19213              if (flags & MDBX_APPENDDUP)
 19214                return MDBX_EKEYMISMATCH;
 19215              if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata)))
 19216                /* data is match exactly byte-to-byte, nothing to update */
 19217                return MDBX_SUCCESS;
 19218              else {
 19219                /* The data has differences, but the user-provided comparator
 19220                 * considers them equal. So continue update since called without.
 19221                 * Continue to update since was called without MDBX_NODUPDATA. */
 19222              }
 19223            }
 19224        }
 19225      } else if (unlikely(rc != MDBX_NOTFOUND))
 19226        return rc;
 19227    }
 19228  
 19229    mc->mc_flags &= ~C_DEL;
 19230  
 19231    /* Cursor is positioned, check for room in the dirty list */
 19232    if (!nospill) {
 19233      rdata = data;
 19234      if (unlikely(flags & MDBX_MULTIPLE)) {
 19235        rdata = &xdata;
 19236        xdata.iov_len = data->iov_len * dcount;
 19237      }
 19238      if (unlikely(err = cursor_spill(mc, key, rdata)))
 19239        return err;
 19240    }
 19241  
 19242    if (unlikely(rc == MDBX_NO_ROOT)) {
 19243      /* new database, write a root leaf page */
 19244      DEBUG("%s", "allocating new root leaf page");
 19245      if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) {
 19246        err = touch_dbi(mc);
 19247        if (unlikely(err != MDBX_SUCCESS))
 19248          return err;
 19249      }
 19250      pgr_t npr = page_new(mc, P_LEAF);
 19251      if (unlikely(npr.err != MDBX_SUCCESS))
 19252        return npr.err;
 19253      npr.err = cursor_push(mc, npr.page);
 19254      if (unlikely(npr.err != MDBX_SUCCESS))
 19255        return npr.err;
 19256      mc->mc_db->md_root = npr.page->mp_pgno;
 19257      mc->mc_db->md_depth++;
 19258      if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
 19259        assert(key->iov_len >= mc->mc_dbx->md_klen_min &&
 19260               key->iov_len <= mc->mc_dbx->md_klen_max);
 19261        mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = key->iov_len;
 19262      }
 19263      if (mc->mc_db->md_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) {
 19264        assert(data->iov_len >= mc->mc_dbx->md_vlen_min &&
 19265               data->iov_len <= mc->mc_dbx->md_vlen_max);
 19266        assert(mc->mc_xcursor != NULL);
 19267        mc->mc_db->md_xsize = mc->mc_xcursor->mx_db.md_xsize =
 19268            (unsigned)(mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max =
 19269                           mc->mc_xcursor->mx_dbx.md_klen_min =
 19270                               mc->mc_xcursor->mx_dbx.md_klen_max =
 19271                                   data->iov_len);
 19272      }
 19273      if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED)
 19274        npr.page->mp_flags |= P_LEAF2;
 19275      mc->mc_flags |= C_INITIALIZED;
 19276    } else {
 19277      /* make sure all cursor pages are writable */
 19278      err = cursor_touch(mc);
 19279      if (unlikely(err))
 19280        return err;
 19281    }
 19282  
 19283    bool insert_key, insert_data, do_sub = false;
 19284    insert_key = insert_data = (rc != MDBX_SUCCESS);
 19285    uint16_t fp_flags = P_LEAF;
 19286    MDBX_page *fp = env->me_pbuf;
 19287    fp->mp_txnid = mc->mc_txn->mt_front;
 19288    if (insert_key) {
 19289      /* The key does not exist */
 19290      DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]);
 19291      if ((mc->mc_db->md_flags & MDBX_DUPSORT) &&
 19292          node_size(key, data) > env->me_leaf_nodemax) {
 19293        /* Too big for a node, insert in sub-DB.  Set up an empty
 19294         * "old sub-page" for prep_subDB to expand to a full page. */
 19295        fp->mp_leaf2_ksize =
 19296            (mc->mc_db->md_flags & MDBX_DUPFIXED) ? (uint16_t)data->iov_len : 0;
 19297        fp->mp_lower = fp->mp_upper = 0;
 19298        olddata.iov_len = PAGEHDRSZ;
 19299        goto prep_subDB;
 19300      }
 19301    } else {
 19302      /* there's only a key anyway, so this is a no-op */
 19303      if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
 19304        char *ptr;
 19305        unsigned ksize = mc->mc_db->md_xsize;
 19306        if (unlikely(key->iov_len != ksize))
 19307          return MDBX_BAD_VALSIZE;
 19308        ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
 19309        memcpy(ptr, key->iov_base, ksize);
 19310      fix_parent:
 19311        /* if overwriting slot 0 of leaf, need to
 19312         * update branch key if there is a parent page */
 19313        if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
 19314          unsigned dtop = 1;
 19315          mc->mc_top--;
 19316          /* slot 0 is always an empty key, find real slot */
 19317          while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
 19318            mc->mc_top--;
 19319            dtop++;
 19320          }
 19321          err = MDBX_SUCCESS;
 19322          if (mc->mc_ki[mc->mc_top])
 19323            err = update_key(mc, key);
 19324          cASSERT(mc, mc->mc_top + dtop < UINT16_MAX);
 19325          mc->mc_top += (uint8_t)dtop;
 19326          if (unlikely(err != MDBX_SUCCESS))
 19327            return err;
 19328        }
 19329  
 19330        if (AUDIT_ENABLED()) {
 19331          err = cursor_check(mc);
 19332          if (unlikely(err != MDBX_SUCCESS))
 19333            return err;
 19334        }
 19335        return MDBX_SUCCESS;
 19336      }
 19337  
 19338    more:;
 19339      if (AUDIT_ENABLED()) {
 19340        err = cursor_check(mc);
 19341        if (unlikely(err != MDBX_SUCCESS))
 19342          return err;
 19343      }
 19344      MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 19345  
 19346      /* Large/Overflow page overwrites need special handling */
 19347      if (unlikely(node_flags(node) & F_BIGDATA)) {
 19348        int dpages = (node_size(key, data) > env->me_leaf_nodemax)
 19349                         ? number_of_ovpages(env, data->iov_len)
 19350                         : 0;
 19351  
 19352        const pgno_t pgno = node_largedata_pgno(node);
 19353        pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid);
 19354        if (unlikely(lp.err != MDBX_SUCCESS))
 19355          return lp.err;
 19356        cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
 19357  
 19358        /* Is the ov page from this txn (or a parent) and big enough? */
 19359        int ovpages = lp.page->mp_pages;
 19360        if (!IS_FROZEN(mc->mc_txn, lp.page) &&
 19361            (unlikely(mc->mc_flags & C_GCFREEZE)
 19362                 ? (ovpages >= dpages)
 19363                 : (ovpages ==
 19364                    /* LY: add configurable threshold to keep reserve space */
 19365                    dpages))) {
 19366          /* yes, overwrite it. */
 19367          if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) {
 19368            if (IS_SPILLED(mc->mc_txn, lp.page)) {
 19369              lp = /* TODO: avoid search and get txn & spill-index from
 19370                       page_result */
 19371                  page_unspill(mc->mc_txn, lp.page);
 19372              if (unlikely(lp.err))
 19373                return lp.err;
 19374            } else {
 19375              if (unlikely(!mc->mc_txn->mt_parent)) {
 19376                ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
 19377                      "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
 19378                      " without parent transaction, current txn %" PRIaTXN
 19379                      " front %" PRIaTXN,
 19380                      "overflow/large", pgno, lp.page->mp_txnid,
 19381                      mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
 19382                return MDBX_PROBLEM;
 19383              }
 19384  
 19385              /* It is writable only in a parent txn */
 19386              MDBX_page *np = page_malloc(mc->mc_txn, ovpages);
 19387              if (unlikely(!np))
 19388                return MDBX_ENOMEM;
 19389  
 19390              memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */
 19391              err = page_dirty(mc->mc_txn, lp.page = np, ovpages);
 19392              if (unlikely(err != MDBX_SUCCESS))
 19393                return err;
 19394  
 19395  #if MDBX_ENABLE_PGOP_STAT
 19396              mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages;
 19397  #endif /* MDBX_ENABLE_PGOP_STAT */
 19398              cASSERT(mc, dirtylist_check(mc->mc_txn));
 19399            }
 19400          }
 19401          node_set_ds(node, data->iov_len);
 19402          if (flags & MDBX_RESERVE)
 19403            data->iov_base = page_data(lp.page);
 19404          else
 19405            memcpy(page_data(lp.page), data->iov_base, data->iov_len);
 19406  
 19407          if (AUDIT_ENABLED()) {
 19408            err = cursor_check(mc);
 19409            if (unlikely(err != MDBX_SUCCESS))
 19410              return err;
 19411          }
 19412          return MDBX_SUCCESS;
 19413        }
 19414  
 19415        if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS)
 19416          return err;
 19417      } else {
 19418        olddata.iov_len = node_ds(node);
 19419        olddata.iov_base = node_data(node);
 19420        cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <=
 19421                        (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
 19422  
 19423        /* DB has dups? */
 19424        if (mc->mc_db->md_flags & MDBX_DUPSORT) {
 19425          /* Prepare (sub-)page/sub-DB to accept the new item, if needed.
 19426           * fp: old sub-page or a header faking it.
 19427           * mp: new (sub-)page.  offset: growth in page size.
 19428           * xdata: node data with new page or DB. */
 19429          unsigned i;
 19430          size_t offset = 0;
 19431          MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf;
 19432          mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
 19433  
 19434          /* Was a single item before, must convert now */
 19435          if (!(node_flags(node) & F_DUPDATA)) {
 19436  
 19437            /* does data match? */
 19438            const int cmp = mc->mc_dbx->md_dcmp(data, &olddata);
 19439            if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0))
 19440              return MDBX_EKEYMISMATCH;
 19441            if (cmp == 0) {
 19442              if (flags & MDBX_NODUPDATA)
 19443                return MDBX_KEYEXIST;
 19444              if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) {
 19445                /* data is match exactly byte-to-byte, nothing to update */
 19446                if (unlikely(flags & MDBX_MULTIPLE)) {
 19447                  rc = MDBX_SUCCESS;
 19448                  goto continue_multiple;
 19449                }
 19450                return MDBX_SUCCESS;
 19451              } else {
 19452                /* The data has differences, but the user-provided comparator
 19453                 * considers them equal. So continue update since called without.
 19454                 * Continue to update since was called without MDBX_NODUPDATA. */
 19455              }
 19456              cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax);
 19457              goto current;
 19458            }
 19459  
 19460            /* Just overwrite the current item */
 19461            if (flags & MDBX_CURRENT) {
 19462              cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax);
 19463              goto current;
 19464            }
 19465  
 19466            /* Back up original data item */
 19467            memcpy(dkey.iov_base = fp + 1, olddata.iov_base,
 19468                   dkey.iov_len = olddata.iov_len);
 19469            dupdata_flag = 1;
 19470  
 19471            /* Make sub-page header for the dup items, with dummy body */
 19472            fp->mp_flags = P_LEAF | P_SUBP;
 19473            fp->mp_lower = 0;
 19474            xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len;
 19475            if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
 19476              fp->mp_flags |= P_LEAF2;
 19477              fp->mp_leaf2_ksize = (uint16_t)data->iov_len;
 19478              xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */
 19479              cASSERT(mc, xdata.iov_len <= env->me_psize);
 19480            } else {
 19481              xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) +
 19482                               (dkey.iov_len & 1) + (data->iov_len & 1);
 19483              cASSERT(mc, xdata.iov_len <= env->me_psize);
 19484            }
 19485            fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ);
 19486            olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */
 19487          } else if (node_flags(node) & F_SUBDATA) {
 19488            /* Data is on sub-DB, just store it */
 19489            flags |= F_DUPDATA | F_SUBDATA;
 19490            goto put_sub;
 19491          } else {
 19492            /* Data is on sub-page */
 19493            fp = olddata.iov_base;
 19494            switch (flags) {
 19495            default:
 19496              if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) {
 19497                offset = node_size(data, nullptr) + sizeof(indx_t);
 19498                break;
 19499              }
 19500              offset = fp->mp_leaf2_ksize;
 19501              if (page_room(fp) < offset) {
 19502                offset *= 4; /* space for 4 more */
 19503                break;
 19504              }
 19505              /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */
 19506              __fallthrough;
 19507            case MDBX_CURRENT | MDBX_NODUPDATA:
 19508            case MDBX_CURRENT:
 19509              fp->mp_txnid = mc->mc_txn->mt_front;
 19510              fp->mp_pgno = mp->mp_pgno;
 19511              mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
 19512              flags |= F_DUPDATA;
 19513              goto put_sub;
 19514            }
 19515            xdata.iov_len = olddata.iov_len + offset;
 19516          }
 19517  
 19518          fp_flags = fp->mp_flags;
 19519          if (node_size_len(node_ks(node), xdata.iov_len) >
 19520              env->me_leaf_nodemax) {
 19521            /* Too big for a sub-page, convert to sub-DB */
 19522            fp_flags &= ~P_SUBP;
 19523          prep_subDB:
 19524            nested_dupdb.md_xsize = 0;
 19525            nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags);
 19526            if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
 19527              fp_flags |= P_LEAF2;
 19528              nested_dupdb.md_xsize = fp->mp_leaf2_ksize;
 19529            }
 19530            nested_dupdb.md_depth = 1;
 19531            nested_dupdb.md_branch_pages = 0;
 19532            nested_dupdb.md_leaf_pages = 1;
 19533            nested_dupdb.md_overflow_pages = 0;
 19534            nested_dupdb.md_entries = page_numkeys(fp);
 19535            xdata.iov_len = sizeof(nested_dupdb);
 19536            xdata.iov_base = &nested_dupdb;
 19537            const pgr_t par = page_alloc(mc);
 19538            mp = par.page;
 19539            if (unlikely(par.err != MDBX_SUCCESS))
 19540              return par.err;
 19541            mc->mc_db->md_leaf_pages += 1;
 19542            cASSERT(mc, env->me_psize > olddata.iov_len);
 19543            offset = env->me_psize - (unsigned)olddata.iov_len;
 19544            flags |= F_DUPDATA | F_SUBDATA;
 19545            nested_dupdb.md_root = mp->mp_pgno;
 19546            nested_dupdb.md_seq = 0;
 19547            nested_dupdb.md_mod_txnid = mc->mc_txn->mt_txnid;
 19548            sub_root = mp;
 19549          }
 19550          if (mp != fp) {
 19551            mp->mp_flags = fp_flags;
 19552            mp->mp_txnid = mc->mc_txn->mt_front;
 19553            mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
 19554            mp->mp_lower = fp->mp_lower;
 19555            cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX);
 19556            mp->mp_upper = (indx_t)(fp->mp_upper + offset);
 19557            if (unlikely(fp_flags & P_LEAF2)) {
 19558              memcpy(page_data(mp), page_data(fp),
 19559                     page_numkeys(fp) * fp->mp_leaf2_ksize);
 19560            } else {
 19561              memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ,
 19562                     (char *)fp + fp->mp_upper + PAGEHDRSZ,
 19563                     olddata.iov_len - fp->mp_upper - PAGEHDRSZ);
 19564              memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs),
 19565                     page_numkeys(fp) * sizeof(mp->mp_ptrs[0]));
 19566              for (i = 0; i < page_numkeys(fp); i++) {
 19567                cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX);
 19568                mp->mp_ptrs[i] += (indx_t)offset;
 19569              }
 19570            }
 19571          }
 19572  
 19573          rdata = &xdata;
 19574          flags |= F_DUPDATA;
 19575          do_sub = true;
 19576          if (!insert_key)
 19577            node_del(mc, 0);
 19578          goto new_sub;
 19579        }
 19580  
 19581        /* MDBX passes F_SUBDATA in 'flags' to write a DB record */
 19582        if (unlikely((node_flags(node) ^ flags) & F_SUBDATA))
 19583          return MDBX_INCOMPATIBLE;
 19584  
 19585      current:
 19586        if (data->iov_len == olddata.iov_len) {
 19587          cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node)));
 19588          /* same size, just replace it. Note that we could
 19589           * also reuse this node if the new data is smaller,
 19590           * but instead we opt to shrink the node in that case. */
 19591          if (flags & MDBX_RESERVE)
 19592            data->iov_base = olddata.iov_base;
 19593          else if (!(mc->mc_flags & C_SUB))
 19594            memcpy(olddata.iov_base, data->iov_base, data->iov_len);
 19595          else {
 19596            cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1);
 19597            cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF);
 19598            cASSERT(mc, node_ds(node) == 0);
 19599            cASSERT(mc, node_flags(node) == 0);
 19600            cASSERT(mc, key->iov_len < UINT16_MAX);
 19601            node_set_ks(node, key->iov_len);
 19602            memcpy(node_key(node), key->iov_base, key->iov_len);
 19603            cASSERT(mc, (char *)node_key(node) + node_ds(node) <
 19604                            (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
 19605            goto fix_parent;
 19606          }
 19607  
 19608          if (AUDIT_ENABLED()) {
 19609            err = cursor_check(mc);
 19610            if (unlikely(err != MDBX_SUCCESS))
 19611              return err;
 19612          }
 19613          return MDBX_SUCCESS;
 19614        }
 19615      }
 19616      node_del(mc, 0);
 19617    }
 19618  
 19619    rdata = data;
 19620  
 19621  new_sub:;
 19622    const unsigned naf = flags & NODE_ADD_FLAGS;
 19623    size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len
 19624                                                   : leaf_size(env, key, rdata);
 19625    if (page_room(mc->mc_pg[mc->mc_top]) < nsize) {
 19626      rc = page_split(mc, key, rdata, P_INVALID,
 19627                      insert_key ? naf : naf | MDBX_SPLIT_REPLACE);
 19628      if (rc == MDBX_SUCCESS && AUDIT_ENABLED())
 19629        rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc);
 19630    } else {
 19631      /* There is room already in this leaf page. */
 19632      if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
 19633        cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) &&
 19634                        rdata->iov_len == 0);
 19635        rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key);
 19636      } else
 19637        rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf);
 19638      if (likely(rc == 0)) {
 19639        /* Adjust other cursors pointing to mp */
 19640        const MDBX_dbi dbi = mc->mc_dbi;
 19641        const unsigned i = mc->mc_top;
 19642        MDBX_page *const mp = mc->mc_pg[i];
 19643        for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2;
 19644             m2 = m2->mc_next) {
 19645          MDBX_cursor *m3 =
 19646              (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
 19647          if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp)
 19648            continue;
 19649          if (m3->mc_ki[i] >= mc->mc_ki[i])
 19650            m3->mc_ki[i] += insert_key;
 19651          if (XCURSOR_INITED(m3))
 19652            XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]);
 19653        }
 19654      }
 19655    }
 19656  
 19657    if (likely(rc == MDBX_SUCCESS)) {
 19658      /* Now store the actual data in the child DB. Note that we're
 19659       * storing the user data in the keys field, so there are strict
 19660       * size limits on dupdata. The actual data fields of the child
 19661       * DB are all zero size. */
 19662      if (do_sub) {
 19663        int xflags;
 19664        size_t ecount;
 19665      put_sub:
 19666        xdata.iov_len = 0;
 19667        xdata.iov_base = nullptr;
 19668        MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 19669  #define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1
 19670        STATIC_ASSERT(
 19671            (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) ==
 19672            MDBX_NOOVERWRITE);
 19673        xflags = MDBX_CURRENT | MDBX_NOSPILL |
 19674                 ((flags & MDBX_NODUPDATA) >>
 19675                  SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE);
 19676        if ((flags & MDBX_CURRENT) == 0) {
 19677          xflags -= MDBX_CURRENT;
 19678          err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]);
 19679          if (unlikely(err != MDBX_SUCCESS))
 19680            return err;
 19681        }
 19682        if (sub_root)
 19683          mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
 19684        /* converted, write the original data first */
 19685        if (dupdata_flag) {
 19686          rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
 19687          if (unlikely(rc))
 19688            goto bad_sub;
 19689          /* we've done our job */
 19690          dkey.iov_len = 0;
 19691        }
 19692        if (!(node_flags(node) & F_SUBDATA) || sub_root) {
 19693          /* Adjust other cursors pointing to mp */
 19694          MDBX_cursor *m2;
 19695          MDBX_xcursor *mx = mc->mc_xcursor;
 19696          unsigned i = mc->mc_top;
 19697          MDBX_page *mp = mc->mc_pg[i];
 19698          const int nkeys = page_numkeys(mp);
 19699  
 19700          for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
 19701            if (m2 == mc || m2->mc_snum < mc->mc_snum)
 19702              continue;
 19703            if (!(m2->mc_flags & C_INITIALIZED))
 19704              continue;
 19705            if (m2->mc_pg[i] == mp) {
 19706              if (m2->mc_ki[i] == mc->mc_ki[i]) {
 19707                err = cursor_xinit2(m2, mx, dupdata_flag);
 19708                if (unlikely(err != MDBX_SUCCESS))
 19709                  return err;
 19710              } else if (!insert_key && m2->mc_ki[i] < nkeys) {
 19711                XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]);
 19712              }
 19713            }
 19714          }
 19715        }
 19716        cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX);
 19717        ecount = (size_t)mc->mc_xcursor->mx_db.md_entries;
 19718  #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1
 19719        STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) ==
 19720                      MDBX_APPEND);
 19721        xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND;
 19722        rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
 19723        if (flags & F_SUBDATA) {
 19724          void *db = node_data(node);
 19725          mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid;
 19726          memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db));
 19727        }
 19728        insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries);
 19729      }
 19730      /* Increment count unless we just replaced an existing item. */
 19731      if (insert_data)
 19732        mc->mc_db->md_entries++;
 19733      if (insert_key) {
 19734        /* Invalidate txn if we created an empty sub-DB */
 19735        if (unlikely(rc))
 19736          goto bad_sub;
 19737        /* If we succeeded and the key didn't exist before,
 19738         * make sure the cursor is marked valid. */
 19739        mc->mc_flags |= C_INITIALIZED;
 19740      }
 19741      if (unlikely(flags & MDBX_MULTIPLE)) {
 19742        if (likely(rc == MDBX_SUCCESS)) {
 19743        continue_multiple:
 19744          mcount++;
 19745          /* let caller know how many succeeded, if any */
 19746          data[1].iov_len = mcount;
 19747          if (mcount < dcount) {
 19748            data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len;
 19749            insert_key = insert_data = false;
 19750            goto more;
 19751          }
 19752        }
 19753      }
 19754      if (rc == MDBX_SUCCESS && AUDIT_ENABLED())
 19755        rc = cursor_check(mc);
 19756      return rc;
 19757    bad_sub:
 19758      if (unlikely(rc == MDBX_KEYEXIST)) {
 19759        /* should not happen, we deleted that item */
 19760        ERROR("Unexpected %i error while put to nested dupsort's hive", rc);
 19761        rc = MDBX_PROBLEM;
 19762      }
 19763    }
 19764    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
 19765    return rc;
 19766  }
 19767  
/* Delete the key/data pair at the cursor's current position.
 *
 * For dupsort databases the deletion may target a single duplicate (via the
 * nested xcursor) or, with MDBX_ALLDUPS/MDBX_NODUPDATA, the whole set of
 * duplicates for the current key. Large/overflow data pages and nested
 * sub-DB pages are returned to the free list. On an internal failure the
 * transaction is flagged with MDBX_TXN_ERROR. */
__hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
  if (unlikely(!mc))
    return MDBX_EINVAL;

  /* A stale-but-closable cursor is a usage error; any other signature means
   * the handle is corrupt or was never a cursor. */
  if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
    return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                     : MDBX_EBADSIGN;

  /* Deletion requires a writable, non-blocked transaction. */
  int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* The DBI sequence must match, otherwise the handle was re-opened. */
  if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi)))
    return MDBX_BAD_DBI;

  if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
    return MDBX_ENODATA;

  /* The cursor must point at an existing entry on the top page. */
  if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])))
    return MDBX_NOTFOUND;

  /* Spill dirty pages if needed, unless the caller (e.g. the recursive
   * xcursor call below) already did so and passed MDBX_NOSPILL. */
  if (likely((flags & MDBX_NOSPILL) == 0) &&
      unlikely(rc = cursor_spill(mc, NULL, NULL)))
    return rc;

  /* Make the pages along the cursor's path writable (copy-on-write). */
  rc = cursor_touch(mc);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
          mp->mp_pgno, mp->mp_flags);
    return MDBX_CORRUPTED;
  }
  /* LEAF2 pages hold bare fixed-size keys: nothing extra to release. */
  if (IS_LEAF2(mp))
    goto del_key;

  MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
  if (node_flags(node) & F_DUPDATA) {
    if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) {
      /* cursor_del() will subtract the final entry */
      mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
      mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
    } else {
      /* Delete just the current duplicate through the nested cursor. For an
       * embedded sub-page the xcursor's fake page pointer must be refreshed
       * first, since the node may have moved. */
      if (!(node_flags(node) & F_SUBDATA))
        mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
      rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL);
      if (unlikely(rc))
        return rc;
      /* If sub-DB still has entries, we're done */
      if (mc->mc_xcursor->mx_db.md_entries) {
        if (node_flags(node) & F_SUBDATA) {
          /* update subDB info */
          void *db = node_data(node);
          mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid;
          memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db));
        } else {
          MDBX_cursor *m2;
          /* shrink fake page */
          node_shrink(mp, mc->mc_ki[mc->mc_top]);
          node = page_node(mp, mc->mc_ki[mc->mc_top]);
          mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
          /* fix other sub-DB cursors pointed at fake pages on this page */
          for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
            if (m2 == mc || m2->mc_snum < mc->mc_snum)
              continue;
            if (!(m2->mc_flags & C_INITIALIZED))
              continue;
            if (m2->mc_pg[mc->mc_top] == mp) {
              MDBX_node *inner = node;
              if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp))
                continue;
              if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) {
                inner = page_node(mp, m2->mc_ki[mc->mc_top]);
                if (node_flags(inner) & F_SUBDATA)
                  continue;
              }
              m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner);
            }
          }
        }
        mc->mc_db->md_entries--;
        cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 &&
                        mc->mc_db->md_root != P_INVALID);
        return rc;
      } else {
        mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
      }
      /* otherwise fall thru and delete the sub-DB */
    }

    if (node_flags(node) & F_SUBDATA) {
      /* add all the child DB's pages to the free list */
      rc = drop_tree(&mc->mc_xcursor->mx_cursor, false);
      if (unlikely(rc))
        goto fail;
    }
  }
  /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */
  else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA))
    return MDBX_INCOMPATIBLE;

  /* add large/overflow pages to free list */
  if (node_flags(node) & F_BIGDATA) {
    pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid);
    if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page))))
      goto fail;
  }

del_key:
  return cursor_del(mc);

fail:
  mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
 19885  
 19886  /* Allocate and initialize new pages for a database.
 19887   * Set MDBX_TXN_ERROR on failure. */
 19888  static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) {
 19889    cASSERT(mc, (flags & P_OVERFLOW) == 0);
 19890    pgr_t ret = page_alloc(mc);
 19891    if (unlikely(ret.err != MDBX_SUCCESS))
 19892      return ret;
 19893  
 19894    DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno);
 19895    ret.page->mp_flags = (uint16_t)flags;
 19896    ret.page->mp_txnid = mc->mc_txn->mt_front;
 19897    cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY);
 19898    cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
 19899  #if MDBX_ENABLE_PGOP_STAT
 19900    mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1;
 19901  #endif /* MDBX_ENABLE_PGOP_STAT */
 19902  
 19903    STATIC_ASSERT(P_BRANCH == 1);
 19904    const unsigned is_branch = flags & P_BRANCH;
 19905  
 19906    ret.page->mp_lower = 0;
 19907    ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ);
 19908    mc->mc_db->md_branch_pages += is_branch;
 19909    mc->mc_db->md_leaf_pages += 1 - is_branch;
 19910    if (unlikely(mc->mc_flags & C_SUB)) {
 19911      MDBX_db *outer = outer_db(mc);
 19912      outer->md_branch_pages += is_branch;
 19913      outer->md_leaf_pages += 1 - is_branch;
 19914    }
 19915    return ret;
 19916  }
 19917  
 19918  static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) {
 19919    pgr_t ret = likely(npages == 1)
 19920                    ? page_alloc(mc)
 19921                    : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL);
 19922    if (unlikely(ret.err != MDBX_SUCCESS))
 19923      return ret;
 19924  
 19925    DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi,
 19926          ret.page->mp_pgno, npages);
 19927    ret.page->mp_flags = P_OVERFLOW;
 19928    ret.page->mp_txnid = mc->mc_txn->mt_front;
 19929    cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY);
 19930    cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
 19931  #if MDBX_ENABLE_PGOP_STAT
 19932    mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages;
 19933  #endif /* MDBX_ENABLE_PGOP_STAT */
 19934  
 19935    mc->mc_db->md_overflow_pages += npages;
 19936    ret.page->mp_pages = npages;
 19937    cASSERT(mc, !(mc->mc_flags & C_SUB));
 19938    return ret;
 19939  }
 19940  
 19941  __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc,
 19942                                                      unsigned indx,
 19943                                                      const MDBX_val *key) {
 19944    MDBX_page *mp = mc->mc_pg[mc->mc_top];
 19945    DKBUF_DEBUG;
 19946    DEBUG("add to leaf2-%spage %" PRIaPGNO " index %i, "
 19947          " key size %" PRIuPTR " [%s]",
 19948          IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? key->iov_len : 0,
 19949          DKEY_DEBUG(key));
 19950  
 19951    cASSERT(mc, key);
 19952    cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2));
 19953    const unsigned ksize = mc->mc_db->md_xsize;
 19954    cASSERT(mc, ksize == key->iov_len);
 19955    const unsigned nkeys = page_numkeys(mp);
 19956  
 19957    /* Just using these for counting */
 19958    const intptr_t lower = mp->mp_lower + sizeof(indx_t);
 19959    const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t));
 19960    if (unlikely(lower > upper)) {
 19961      mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
 19962      return MDBX_PAGE_FULL;
 19963    }
 19964    mp->mp_lower = (indx_t)lower;
 19965    mp->mp_upper = (indx_t)upper;
 19966  
 19967    char *const ptr = page_leaf2key(mp, indx, ksize);
 19968    cASSERT(mc, nkeys >= indx);
 19969    const unsigned diff = nkeys - indx;
 19970    if (likely(diff > 0))
 19971      /* Move higher keys up one slot. */
 19972      memmove(ptr + ksize, ptr, diff * ksize);
 19973    /* insert new key */
 19974    memcpy(ptr, key->iov_base, ksize);
 19975    return MDBX_SUCCESS;
 19976  }
 19977  
/* Insert a branch node (separator key plus child page number) at slot
 * `indx` of the branch page under the cursor's top.
 * Returns MDBX_PAGE_FULL and flags the transaction broken if the page
 * lacks room; note the slot array has already been shifted at that point,
 * which is acceptable only because the errored transaction is abandoned. */
static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx,
                                               const MDBX_val *key,
                                               pgno_t pgno) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  DKBUF_DEBUG;
  DEBUG("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO
        " key size %" PRIuPTR " [%s]",
        IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno,
        key ? key->iov_len : 0, DKEY_DEBUG(key));

  cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH);
  STATIC_ASSERT(NODESIZE % 2 == 0);

  /* Move higher pointers up one slot. */
  const unsigned nkeys = page_numkeys(mp);
  cASSERT(mc, nkeys >= indx);
  for (unsigned i = nkeys; i > indx; --i)
    mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

  /* Adjust free space offsets. */
  const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key);
  const intptr_t lower = mp->mp_lower + sizeof(indx_t);
  const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->mp_lower = (indx_t)lower;
  /* The node body lives at the new upper bound; its offset goes into the
   * freed index slot. */
  mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper;

  /* Write the node data. */
  MDBX_node *node = page_node(mp, indx);
  node_set_pgno(node, pgno);
  node_set_flags(node, 0);
  UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0);
  node_set_ks(node, 0);
  /* A NULL key is legal for the leftmost branch node, which carries only a
   * child pointer (its key length stays zero). */
  if (likely(key != NULL)) {
    node_set_ks(node, key->iov_len);
    memcpy(node_key(node), key->iov_base, key->iov_len);
  }
  return MDBX_SUCCESS;
}
 20020  
/* Insert a leaf node (key + data) at slot `indx` of the leaf page under the
 * cursor's top.
 *
 * Three storage layouts are handled:
 *  - F_BIGDATA passed by the caller: `data` already holds the pgno of an
 *    existing large/overflow page, only the pgno is stored in the node;
 *  - data too big for an in-page node: a fresh large/overflow page is
 *    allocated here and F_BIGDATA is set;
 *  - otherwise the data bytes are stored inline in the node.
 * With MDBX_RESERVE, data->iov_base is redirected to the reserved space
 * instead of copying. Returns MDBX_PAGE_FULL (txn flagged broken) on lack
 * of room, MDBX_PROBLEM on invalid flag combinations for large data. */
__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc,
                                                   unsigned indx,
                                                   const MDBX_val *key,
                                                   MDBX_val *data,
                                                   unsigned flags) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  DKBUF_DEBUG;
  DEBUG("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR
        " key size %" PRIuPTR " [%s]",
        IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0,
        key ? key->iov_len : 0, DKEY_DEBUG(key));
  cASSERT(mc, key != NULL && data != NULL);
  cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF);
  cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
  MDBX_page *largepage = NULL;

  size_t node_bytes;
  if (unlikely(flags & F_BIGDATA)) {
    /* Data already on large/overflow page. */
    STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
  } else if (unlikely(node_size(key, data) >
                      mc->mc_txn->mt_env->me_leaf_nodemax)) {
    /* Put data on large/overflow page. */
    /* Large data cannot appear inside dupsort trees or sub-DB nodes:
     * both cases indicate corruption or a logic error upstream. */
    if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) {
      ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db",
            mc->mc_db->md_flags);
      return MDBX_PROBLEM;
    }
    if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) {
      ERROR("Unexpected target %s flags 0x%x for large data-item", "node",
            flags);
      return MDBX_PROBLEM;
    }
    const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
    const pgr_t npr = page_new_large(mc, ovpages);
    if (unlikely(npr.err != MDBX_SUCCESS))
      return npr.err;
    largepage = npr.page;
    DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR
          " data bytes",
          largepage->mp_pages, largepage->mp_pgno, data->iov_len);
    flags |= F_BIGDATA;
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
  } else {
    /* Small data: stored inline right after the key. */
    node_bytes = node_size(key, data) + sizeof(indx_t);
  }
  cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));

  /* Move higher pointers up one slot. */
  const unsigned nkeys = page_numkeys(mp);
  cASSERT(mc, nkeys >= indx);
  for (unsigned i = nkeys; i > indx; --i)
    mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

  /* Adjust free space offsets. */
  const intptr_t lower = mp->mp_lower + sizeof(indx_t);
  const intptr_t upper = mp->mp_upper - (node_bytes - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->mp_lower = (indx_t)lower;
  mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper;

  /* Write the node data. */
  MDBX_node *node = page_node(mp, indx);
  node_set_ks(node, key->iov_len);
  node_set_flags(node, (uint8_t)flags);
  UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0);
  node_set_ds(node, data->iov_len);
  memcpy(node_key(node), key->iov_base, key->iov_len);

  void *nodedata = node_data(node);
  if (likely(largepage == NULL)) {
    if (unlikely(flags & F_BIGDATA)) {
      /* Caller-supplied large page: the node stores only the pgno. */
      memcpy(nodedata, data->iov_base, sizeof(pgno_t));
      return MDBX_SUCCESS;
    }
  } else {
    /* Freshly-allocated large page: store its pgno in the node and
     * redirect the payload destination to the page body. */
    poke_pgno(nodedata, largepage->mp_pgno);
    nodedata = page_data(largepage);
  }
  if (unlikely(flags & MDBX_RESERVE))
    data->iov_base = nodedata;
  else if (likely(nodedata != data->iov_base &&
                  data->iov_len /* to avoid UBSAN traps*/ != 0))
    memcpy(nodedata, data->iov_base, data->iov_len);
  return MDBX_SUCCESS;
}
 20113  
/* Delete the node the cursor points at from its page, compacting the page.
 * [in] mc    Cursor pointing to the node to delete.
 * [in] ksize The fixed per-key slot size; used only when the page is a
 *            LEAF2 page (MDBX_DUPFIXED layout), ignored otherwise. */
__hot static void node_del(MDBX_cursor *mc, size_t ksize) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  const unsigned hole = mc->mc_ki[mc->mc_top];
  const unsigned nkeys = page_numkeys(mp);

  DEBUG("delete node %u on %s page %" PRIaPGNO, hole,
        IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno);
  cASSERT(mc, hole < nkeys);

  if (IS_LEAF2(mp)) {
    /* LEAF2: keys are a dense array of ksize-byte slots; close the gap by
     * shifting the tail down and give the space back via lower/upper. */
    cASSERT(mc, ksize >= sizeof(indx_t));
    unsigned diff = nkeys - 1 - hole;
    char *base = page_leaf2key(mp, hole, ksize);
    if (diff)
      memmove(base, base + ksize, diff * ksize);
    cASSERT(mc, mp->mp_lower >= sizeof(indx_t));
    mp->mp_lower -= sizeof(indx_t);
    cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t));
    mp->mp_upper += (indx_t)(ksize - sizeof(indx_t));
    return;
  }

  /* Regular page: compute the byte size of the node being removed. For leaf
   * nodes the payload is either an inline data blob or, with F_BIGDATA,
   * just the stored page number. */
  MDBX_node *node = page_node(mp, hole);
  cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0);
  size_t hole_size = NODESIZE + node_ks(node);
  if (IS_LEAF(mp))
    hole_size +=
        (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) : node_ds(node);
  hole_size = EVEN(hole_size);

  /* Drop the hole's index slot, and rebase the offsets of all nodes that
   * live below the hole (they will move up by hole_size). */
  const indx_t hole_offset = mp->mp_ptrs[hole];
  unsigned r, w;
  for (r = w = 0; r < nkeys; r++)
    if (r != hole)
      mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset)
                             ? mp->mp_ptrs[r] + (indx_t)hole_size
                             : mp->mp_ptrs[r];

  /* Physically close the gap: slide everything between the heap top and
   * the hole upward by hole_size bytes. */
  char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
  memmove(base + hole_size, base, hole_offset - mp->mp_upper);

  cASSERT(mc, mp->mp_lower >= sizeof(indx_t));
  mp->mp_lower -= sizeof(indx_t);
  cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size);
  mp->mp_upper += (indx_t)hole_size;

  /* Under auditing, verify the page survived the surgery; CC_UPDATING
   * relaxes checks that don't hold mid-update. */
  if (AUDIT_ENABLED()) {
    const uint8_t checking = mc->mc_checking;
    mc->mc_checking |= CC_UPDATING;
    const int page_check_err = page_check(mc, mp);
    mc->mc_checking = checking;
    cASSERT(mc, page_check_err == MDBX_SUCCESS);
  }
}
 20172  
 20173  /* Compact the main page after deleting a node on a subpage.
 20174   * [in] mp The main page to operate on.
 20175   * [in] indx The index of the subpage on the main page. */
 20176  static void node_shrink(MDBX_page *mp, unsigned indx) {
 20177    MDBX_node *node;
 20178    MDBX_page *sp, *xp;
 20179    char *base;
 20180    size_t nsize, delta, len, ptr;
 20181    int i;
 20182  
 20183    node = page_node(mp, indx);
 20184    sp = (MDBX_page *)node_data(node);
 20185    delta = page_room(sp);
 20186    assert(delta > 0);
 20187  
 20188    /* Prepare to shift upward, set len = length(subpage part to shift) */
 20189    if (IS_LEAF2(sp)) {
 20190      delta &= /* do not make the node uneven-sized */ ~(size_t)1;
 20191      if (unlikely(delta) == 0)
 20192        return;
 20193      nsize = node_ds(node) - delta;
 20194      assert(nsize % 1 == 0);
 20195      len = nsize;
 20196    } else {
 20197      xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */
 20198      for (i = page_numkeys(sp); --i >= 0;) {
 20199        assert(sp->mp_ptrs[i] >= delta);
 20200        xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta);
 20201      }
 20202      nsize = node_ds(node) - delta;
 20203      len = PAGEHDRSZ;
 20204    }
 20205    sp->mp_upper = sp->mp_lower;
 20206    sp->mp_pgno = mp->mp_pgno;
 20207    node_set_ds(node, nsize);
 20208  
 20209    /* Shift <lower nodes...initial part of subpage> upward */
 20210    base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
 20211    memmove(base + delta, base, (char *)sp + len - base);
 20212  
 20213    ptr = mp->mp_ptrs[indx];
 20214    for (i = page_numkeys(mp); --i >= 0;) {
 20215      if (mp->mp_ptrs[i] <= ptr) {
 20216        assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta);
 20217        mp->mp_ptrs[i] += (indx_t)delta;
 20218      }
 20219    }
 20220    assert((size_t)UINT16_MAX - mp->mp_upper >= delta);
 20221    mp->mp_upper += (indx_t)delta;
 20222  }
 20223  
 20224  /* Initial setup of a sorted-dups cursor.
 20225   *
 20226   * Sorted duplicates are implemented as a sub-database for the given key.
 20227   * The duplicate data items are actually keys of the sub-database.
 20228   * Operations on the duplicate data items are performed using a sub-cursor
 20229   * initialized when the sub-database is first accessed. This function does
 20230   * the preliminary setup of the sub-cursor, filling in the fields that
 20231   * depend only on the parent DB.
 20232   *
 20233   * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */
 20234  static int cursor_xinit0(MDBX_cursor *mc) {
 20235    MDBX_xcursor *mx = mc->mc_xcursor;
 20236    if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) {
 20237      ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
 20238            mc->mc_dbi);
 20239      return MDBX_CORRUPTED;
 20240    }
 20241  
 20242    mx->mx_cursor.mc_xcursor = NULL;
 20243    mx->mx_cursor.mc_next = NULL;
 20244    mx->mx_cursor.mc_txn = mc->mc_txn;
 20245    mx->mx_cursor.mc_db = &mx->mx_db;
 20246    mx->mx_cursor.mc_dbx = &mx->mx_dbx;
 20247    mx->mx_cursor.mc_dbi = mc->mc_dbi;
 20248    mx->mx_cursor.mc_dbistate = mc->mc_dbistate;
 20249    mx->mx_cursor.mc_snum = 0;
 20250    mx->mx_cursor.mc_top = 0;
 20251    mx->mx_cursor.mc_flags = C_SUB;
 20252    STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2);
 20253    cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF);
 20254    mx->mx_cursor.mc_checking =
 20255        mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1);
 20256    mx->mx_dbx.md_name.iov_len = 0;
 20257    mx->mx_dbx.md_name.iov_base = NULL;
 20258    mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
 20259    mx->mx_dbx.md_dcmp = NULL;
 20260    mx->mx_dbx.md_klen_min = INT_MAX;
 20261    mx->mx_dbx.md_vlen_min = mx->mx_dbx.md_klen_max = mx->mx_dbx.md_vlen_max = 0;
 20262    return MDBX_SUCCESS;
 20263  }
 20264  
 20265  /* Final setup of a sorted-dups cursor.
 20266   * Sets up the fields that depend on the data from the main cursor.
 20267   * [in] mc The main cursor whose sorted-dups cursor is to be initialized.
 20268   * [in] node The data containing the MDBX_db record for the sorted-dup database.
 20269   */
static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node,
                         const MDBX_page *mp) {
  MDBX_xcursor *mx = mc->mc_xcursor;
  if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) {
    ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
          mc->mc_dbi);
    return MDBX_CORRUPTED;
  }

  /* A dupsort value is stored either as a full nested sub-database
   * (F_DUPDATA|F_SUBDATA) or as an inline sub-page (F_DUPDATA). */
  const uint8_t flags = node_flags(node);
  switch (flags) {
  default:
    ERROR("invalid node flags %u", flags);
    return MDBX_CORRUPTED;
  case F_DUPDATA | F_SUBDATA:
    /* Nested sub-database: the node payload is an MDBX_db record. */
    if (!MDBX_DISABLE_VALIDATION &&
        unlikely(node_ds(node) != sizeof(MDBX_db))) {
      ERROR("invalid nested-db record size %zu", node_ds(node));
      return MDBX_CORRUPTED;
    }
    memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db));
    /* The nested DB cannot have been modified later than the page that
     * holds its record. */
    const txnid_t pp_txnid = mp->mp_txnid;
    if (!MDBX_DISABLE_VALIDATION &&
        unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) {
      ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")",
            mx->mx_db.md_mod_txnid, pp_txnid);
      return MDBX_CORRUPTED;
    }
    /* Not positioned yet: a real page-search of the sub-tree is needed. */
    mx->mx_cursor.mc_pg[0] = 0;
    mx->mx_cursor.mc_snum = 0;
    mx->mx_cursor.mc_top = 0;
    mx->mx_cursor.mc_flags = C_SUB;
    break;
  case F_DUPDATA:
    /* Inline sub-page: the node payload is itself a one-page tree. */
    if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) {
      ERROR("invalid nested-page size %zu", node_ds(node));
      return MDBX_CORRUPTED;
    }
    MDBX_page *fp = node_data(node);
    /* Synthesize an MDBX_db describing the single-page sub-tree. */
    mx->mx_db.md_depth = 1;
    mx->mx_db.md_branch_pages = 0;
    mx->mx_db.md_leaf_pages = 1;
    mx->mx_db.md_overflow_pages = 0;
    mx->mx_db.md_entries = page_numkeys(fp);
    mx->mx_db.md_root = fp->mp_pgno;
    mx->mx_db.md_mod_txnid = mp->mp_txnid;
    /* Position the sub-cursor directly at the in-node page. */
    mx->mx_cursor.mc_snum = 1;
    mx->mx_cursor.mc_top = 0;
    mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED;
    mx->mx_cursor.mc_pg[0] = fp;
    mx->mx_cursor.mc_ki[0] = 0;
    mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags);
    mx->mx_db.md_xsize =
        (mc->mc_db->md_flags & MDBX_DUPFIXED) ? fp->mp_leaf2_ksize : 0;
    break;
  }

  /* Propagate a first-seen fixed item size (DUPFIXED) up to the parent
   * DB, after validating it against the configured value-length limits. */
  if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) {
    if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) {
      ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize);
      return MDBX_CORRUPTED;
    }
    if (!MDBX_DISABLE_VALIDATION &&
        unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) {
      ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags);
      return MDBX_CORRUPTED;
    }
    if (!MDBX_DISABLE_VALIDATION &&
        unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min ||
                 mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) {
      ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length "
            "(%zu/%zu)",
            mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min,
            mc->mc_dbx->md_vlen_max);
      return MDBX_CORRUPTED;
    }
    mc->mc_db->md_xsize = mx->mx_db.md_xsize;
    mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = mx->mx_db.md_xsize;
  }
  /* Sub-database keys are the parent's values: inherit the length limits. */
  mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min;
  mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max;

  DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi,
        mx->mx_db.md_root);
  return MDBX_SUCCESS;
}
 20356  
 20357  /* Fixup a sorted-dups cursor due to underlying update.
 20358   * Sets up some fields that depend on the data from the main cursor.
 20359   * Almost the same as init1, but skips initialization steps if the
 20360   * xcursor had already been used.
 20361   * [in] mc The main cursor whose sorted-dups cursor is to be fixed up.
 20362   * [in] src_mx The xcursor of an up-to-date cursor.
 20363   * [in] new_dupdata True if converting from a non-F_DUPDATA item. */
 20364  static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx,
 20365                           bool new_dupdata) {
 20366    MDBX_xcursor *mx = mc->mc_xcursor;
 20367    if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) {
 20368      ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
 20369            mc->mc_dbi);
 20370      return MDBX_CORRUPTED;
 20371    }
 20372  
 20373    if (new_dupdata) {
 20374      mx->mx_cursor.mc_snum = 1;
 20375      mx->mx_cursor.mc_top = 0;
 20376      mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED;
 20377      mx->mx_cursor.mc_ki[0] = 0;
 20378    }
 20379  
 20380    mx->mx_dbx.md_klen_min = src_mx->mx_dbx.md_klen_min;
 20381    mx->mx_dbx.md_klen_max = src_mx->mx_dbx.md_klen_max;
 20382    mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
 20383    mx->mx_db = src_mx->mx_db;
 20384    mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0];
 20385    if (mx->mx_cursor.mc_flags & C_INITIALIZED) {
 20386      DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi,
 20387            mx->mx_db.md_root);
 20388    }
 20389    return MDBX_SUCCESS;
 20390  }
 20391  
/* (Re)initialize a cursor couple: bind the outer cursor to (txn, dbi)
 * in an unpositioned state and, for dupsort databases, prepare the inner
 * sub-cursor as well. Returns MDBX_SUCCESS or an error code. */
static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi,
                                MDBX_txn *const txn, MDBX_db *const db,
                                MDBX_dbx *const dbx, uint8_t *const dbstate) {
  couple->outer.mc_signature = MDBX_MC_LIVE;
  couple->outer.mc_next = NULL;
  couple->outer.mc_backup = NULL;
  couple->outer.mc_dbi = dbi;
  couple->outer.mc_txn = txn;
  couple->outer.mc_db = db;
  couple->outer.mc_dbx = dbx;
  couple->outer.mc_dbistate = dbstate;
  couple->outer.mc_snum = 0;
  couple->outer.mc_top = 0;
  couple->outer.mc_pg[0] = 0;
  couple->outer.mc_flags = 0;
  STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF &&
                CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2);
  /* Enable extra page checking when auditing or MDBX_VALIDATION is on. */
  couple->outer.mc_checking =
      (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION))
          ? CC_PAGECHECK | CC_LEAF
          : CC_LEAF;
  couple->outer.mc_ki[0] = 0;
  couple->outer.mc_xcursor = NULL;

  int rc = MDBX_SUCCESS;
  if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) {
    /* Refresh the stale DB record; an absent root is not an error here. */
    rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY);
    rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS;
  } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) {
    /* Lazily compute the key/value length limits for this DB. */
    rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db,
                   txn->mt_env->me_psize);
  }

  if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) {
    couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE;
    couple->outer.mc_xcursor = &couple->inner;
    /* NOTE(review): an error rc from page_search/setup_dbx above is
     * overwritten here by cursor_xinit0's result — confirm intended. */
    rc = cursor_xinit0(&couple->outer);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* Sub-db keys are the parent's values: inherit the length limits. */
    couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min;
    couple->inner.mx_dbx.md_klen_max = couple->outer.mc_dbx->md_vlen_max;
  }
  return rc;
}
 20436  
 20437  /* Initialize a cursor for a given transaction and database. */
 20438  static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) {
 20439    STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0);
 20440    return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn,
 20441                       &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi],
 20442                       &txn->mt_dbistate[dbi]);
 20443  }
 20444  
 20445  MDBX_cursor *mdbx_cursor_create(void *context) {
 20446    MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple));
 20447    if (unlikely(!couple))
 20448      return nullptr;
 20449  
 20450    couple->outer.mc_signature = MDBX_MC_READY4CLOSE;
 20451    couple->outer.mc_dbi = UINT_MAX;
 20452    couple->mc_userctx = context;
 20453    return &couple->outer;
 20454  }
 20455  
 20456  int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) {
 20457    if (unlikely(!mc))
 20458      return MDBX_EINVAL;
 20459  
 20460    if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE &&
 20461                 mc->mc_signature != MDBX_MC_LIVE))
 20462      return MDBX_EBADSIGN;
 20463  
 20464    MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer);
 20465    couple->mc_userctx = ctx;
 20466    return MDBX_SUCCESS;
 20467  }
 20468  
 20469  void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) {
 20470    if (unlikely(!mc))
 20471      return nullptr;
 20472  
 20473    if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE &&
 20474                 mc->mc_signature != MDBX_MC_LIVE))
 20475      return nullptr;
 20476  
 20477    MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer);
 20478    return couple->mc_userctx;
 20479  }
 20480  
/* Bind a cursor to the given transaction and DBI, detaching it first
 * from any previous binding. Returns MDBX_SUCCESS or an error code. */
int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
  if (unlikely(!mc))
    return MDBX_EINVAL;

  /* Accept both live and closed-but-reusable cursors. */
  if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE &&
               mc->mc_signature != MDBX_MC_LIVE))
    return MDBX_EBADSIGN;

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  /* The GC/freelist table is writable only by the engine itself. */
  if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY)))
    return MDBX_EACCESS;

  if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ {
    cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE);
    if (unlikely(mc->mc_dbi != dbi ||
                 /* paranoia */ mc->mc_signature != MDBX_MC_LIVE ||
                 mc->mc_txn != txn))
      return MDBX_EINVAL;

    assert(mc->mc_db == &txn->mt_dbs[dbi]);
    assert(mc->mc_dbx == &txn->mt_dbxs[dbi]);
    assert(mc->mc_dbi == dbi);
    assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]);
    return likely(mc->mc_dbi == dbi &&
                  /* paranoia */ mc->mc_signature == MDBX_MC_LIVE &&
                  mc->mc_txn == txn)
               ? MDBX_SUCCESS
               : MDBX_EINVAL /* Disallow change DBI in nested transactions */;
  }

  if (mc->mc_signature == MDBX_MC_LIVE) {
    /* Detach the cursor from its current transaction before re-binding. */
    if (unlikely(!mc->mc_txn ||
                 mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) {
      ERROR("Wrong cursor's transaction %p 0x%x",
            __Wpedantic_format_voidptr(mc->mc_txn),
            mc->mc_txn ? mc->mc_txn->mt_signature : 0);
      return MDBX_PROBLEM;
    }
    if (mc->mc_flags & C_UNTRACK) {
      /* Unlink from the txn's singly-linked per-DBI cursor list. */
      MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
      while (*prev && *prev != mc)
        prev = &(*prev)->mc_next;
      cASSERT(mc, *prev == mc);
      *prev = mc->mc_next;
    }
    mc->mc_signature = MDBX_MC_READY4CLOSE;
    mc->mc_flags = 0;
    mc->mc_dbi = UINT_MAX;
    mc->mc_next = NULL;
    mc->mc_db = NULL;
    mc->mc_dbx = NULL;
    mc->mc_dbistate = NULL;
  }
  cASSERT(mc, !(mc->mc_flags & C_UNTRACK));

  rc = cursor_init(mc, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Track the cursor in the new transaction so updates can adjust it. */
  mc->mc_next = txn->mt_cursors[dbi];
  txn->mt_cursors[dbi] = mc;
  mc->mc_flags |= C_UNTRACK;

  return MDBX_SUCCESS;
}
 20552  
 20553  int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) {
 20554    if (unlikely(!ret))
 20555      return MDBX_EINVAL;
 20556    *ret = NULL;
 20557  
 20558    MDBX_cursor *const mc = mdbx_cursor_create(nullptr);
 20559    if (unlikely(!mc))
 20560      return MDBX_ENOMEM;
 20561  
 20562    int rc = mdbx_cursor_bind(txn, mc, dbi);
 20563    if (unlikely(rc != MDBX_SUCCESS)) {
 20564      mdbx_cursor_close(mc);
 20565      return rc;
 20566    }
 20567  
 20568    *ret = mc;
 20569    return MDBX_SUCCESS;
 20570  }
 20571  
 20572  int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) {
 20573    return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL;
 20574  }
 20575  
/* Copy the position and state of `src` into `dest`, re-binding `dest`
 * to the same transaction and DBI first. */
int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) {
  if (unlikely(!src))
    return MDBX_EINVAL;
  if (unlikely(src->mc_signature != MDBX_MC_LIVE))
    return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                      : MDBX_EBADSIGN;

  int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  assert(dest->mc_db == src->mc_db);
  assert(dest->mc_dbi == src->mc_dbi);
  assert(dest->mc_dbx == src->mc_dbx);
  assert(dest->mc_dbistate == src->mc_dbistate);
/* First pass copies the outer cursor; for dupsort items a second pass
 * (via the goto below) copies the nested sub-cursor the same way. */
again:
  assert(dest->mc_txn == src->mc_txn);
  /* Take src's flags except C_UNTRACK, which reflects dest's own tracking. */
  dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK;
  dest->mc_top = src->mc_top;
  dest->mc_snum = src->mc_snum;
  for (unsigned i = 0; i < src->mc_snum; ++i) {
    dest->mc_ki[i] = src->mc_ki[i];
    dest->mc_pg[i] = src->mc_pg[i];
  }

  if (src->mc_xcursor) {
    dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db;
    dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx;
    /* Descend into the sub-cursors and repeat the stack copy. */
    src = &src->mc_xcursor->mx_cursor;
    dest = &dest->mc_xcursor->mx_cursor;
    goto again;
  }

  return MDBX_SUCCESS;
}
 20611  
/* Close a cursor: unlink it from its transaction (if tracked) and free
 * it, unless it shadows a parent-transaction cursor — then merely mark
 * it to be reaped when the nested transaction ends. */
void mdbx_cursor_close(MDBX_cursor *mc) {
  if (likely(mc)) {
    ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE ||
                     mc->mc_signature == MDBX_MC_READY4CLOSE);
    MDBX_txn *const txn = mc->mc_txn;
    if (!mc->mc_backup) {
      mc->mc_txn = NULL;
      /* Unlink from txn, if tracked. */
      if (mc->mc_flags & C_UNTRACK) {
        ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS);
        MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi];
        while (*prev && *prev != mc)
          prev = &(*prev)->mc_next;
        tASSERT(txn, *prev == mc);
        *prev = mc->mc_next;
      }
      /* Poison the handle before freeing to catch use-after-close. */
      mc->mc_signature = 0;
      mc->mc_next = mc;
      osal_free(mc);
    } else {
      /* Cursor closed before nested txn ends */
      tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE);
      ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
      mc->mc_signature = MDBX_MC_WAIT4EOT;
    }
  }
}
 20639  
 20640  MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) {
 20641    if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE))
 20642      return NULL;
 20643    MDBX_txn *txn = mc->mc_txn;
 20644    if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE))
 20645      return NULL;
 20646    if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED))
 20647      return NULL;
 20648    return txn;
 20649  }
 20650  
 20651  MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) {
 20652    if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE))
 20653      return UINT_MAX;
 20654    return mc->mc_dbi;
 20655  }
 20656  
/* Return the count of duplicate data items for the current key.
 * [in]  mc     A positioned (C_INITIALIZED) cursor.
 * [out] countp Receives the count; set to 0 on MDBX_NOTFOUND. */
int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) {
  if (unlikely(mc == NULL))
    return MDBX_EINVAL;

  if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
    return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                     : MDBX_EBADSIGN;

  int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED)))
    return MDBX_EINVAL;

  /* Empty page stack means the tree is empty: nothing to count. */
  if (!mc->mc_snum) {
    *countp = 0;
    return MDBX_NOTFOUND;
  }

  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  /* A cursor parked past the last item has no current key. */
  if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) {
    *countp = 0;
    return MDBX_NOTFOUND;
  }

  /* Non-dupsort (or plain) items count as exactly one. */
  *countp = 1;
  if (mc->mc_xcursor != NULL) {
    MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
    if (node_flags(node) & F_DUPDATA) {
      cASSERT(mc, mc->mc_xcursor &&
                      (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
      /* Clamp to PTRDIFF_MAX so the value fits callers' signed usage. */
      *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX)
                    ? PTRDIFF_MAX
                    : (size_t)mc->mc_xcursor->mx_db.md_entries;
    }
  }
  return MDBX_SUCCESS;
}
 20697  
 20698  /* Replace the key for a branch node with a new key.
 20699   * Set MDBX_TXN_ERROR on failure.
 20700   * [in] mc Cursor pointing to the node to operate on.
 20701   * [in] key The new key to use.
 20702   * Returns 0 on success, non-zero on failure. */
static int update_key(MDBX_cursor *mc, const MDBX_val *key) {
  MDBX_page *mp;
  MDBX_node *node;
  char *base;
  size_t len;
  ptrdiff_t delta, ksize, oksize;
  int ptr, i, nkeys, indx;
  DKBUF_DEBUG;

  cASSERT(mc, cursor_is_tracked(mc));
  indx = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  node = page_node(mp, indx);
  ptr = mp->mp_ptrs[indx];
#if MDBX_DEBUG
  MDBX_val k2;
  k2.iov_base = node_key(node);
  k2.iov_len = node_ks(node);
  DEBUG("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, ptr,
        DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno);
#endif /* MDBX_DEBUG */

  /* Sizes must be 2-byte aligned. */
  ksize = EVEN(key->iov_len);
  oksize = EVEN(node_ks(node));
  /* delta > 0 means the node grows; delta < 0 means it shrinks. */
  delta = ksize - oksize;

  /* Shift node contents if EVEN(key length) changed. */
  if (delta) {
    if (delta > (int)page_room(mp)) {
      /* not enough space left, do a delete and split */
      DEBUG("Not enough room, delta = %zd, splitting...", delta);
      pgno_t pgno = node_pgno(node);
      node_del(mc, 0);
      int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE);
      if (err == MDBX_SUCCESS && AUDIT_ENABLED())
        err = cursor_check_updating(mc);
      return err;
    }

    /* Slots at-or-below this node's offset move down by delta, since
     * node bodies grow downward from mp_upper toward the slot array. */
    nkeys = page_numkeys(mp);
    for (i = 0; i < nkeys; i++) {
      if (mp->mp_ptrs[i] <= ptr) {
        cASSERT(mc, mp->mp_ptrs[i] >= delta);
        mp->mp_ptrs[i] -= (indx_t)delta;
      }
    }

    /* Move everything between the heap top and this node's end. */
    base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
    len = ptr - mp->mp_upper + NODESIZE;
    memmove(base - delta, base, len);
    cASSERT(mc, mp->mp_upper >= delta);
    mp->mp_upper -= (indx_t)delta;

    /* Node location may have changed: re-resolve the pointer. */
    node = page_node(mp, indx);
  }

  /* But even if no shift was needed, update ksize */
  node_set_ks(node, key->iov_len);

  if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0))
    memcpy(node_key(node), key->iov_base, key->iov_len);
  return MDBX_SUCCESS;
}
 20767  
 20768  /* Move a node from csrc to cdst. */
 20769  static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) {
 20770    int rc;
 20771    DKBUF_DEBUG;
 20772  
 20773    MDBX_page *psrc = csrc->mc_pg[csrc->mc_top];
 20774    MDBX_page *pdst = cdst->mc_pg[cdst->mc_top];
 20775    cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst));
 20776    cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi);
 20777    cASSERT(csrc, csrc->mc_top == cdst->mc_top);
 20778    if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) {
 20779    bailout:
 20780      ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node",
 20781            PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst));
 20782      csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
 20783      return MDBX_PROBLEM;
 20784    }
 20785  
 20786    MDBX_val key4move;
 20787    switch (PAGETYPE_WHOLE(psrc)) {
 20788    case P_BRANCH: {
 20789      const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]);
 20790      cASSERT(csrc, node_flags(srcnode) == 0);
 20791      const pgno_t srcpg = node_pgno(srcnode);
 20792      key4move.iov_len = node_ks(srcnode);
 20793      key4move.iov_base = node_key(srcnode);
 20794  
 20795      if (csrc->mc_ki[csrc->mc_top] == 0) {
 20796        const unsigned snum = csrc->mc_snum;
 20797        cASSERT(csrc, snum > 0);
 20798        /* must find the lowest key below src */
 20799        rc = page_search_lowest(csrc);
 20800        MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top];
 20801        if (unlikely(rc))
 20802          return rc;
 20803        cASSERT(csrc, IS_LEAF(lowest_page));
 20804        if (unlikely(!IS_LEAF(lowest_page)))
 20805          goto bailout;
 20806        if (IS_LEAF2(lowest_page)) {
 20807          key4move.iov_len = csrc->mc_db->md_xsize;
 20808          key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len);
 20809        } else {
 20810          const MDBX_node *lowest_node = page_node(lowest_page, 0);
 20811          key4move.iov_len = node_ks(lowest_node);
 20812          key4move.iov_base = node_key(lowest_node);
 20813        }
 20814  
 20815        /* restore cursor after mdbx_page_search_lowest() */
 20816        csrc->mc_snum = (uint8_t)snum;
 20817        csrc->mc_top = (uint8_t)snum - 1;
 20818        csrc->mc_ki[csrc->mc_top] = 0;
 20819  
 20820        /* paranoia */
 20821        cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
 20822        cASSERT(csrc, IS_BRANCH(psrc));
 20823        if (unlikely(!IS_BRANCH(psrc)))
 20824          goto bailout;
 20825      }
 20826  
 20827      if (cdst->mc_ki[cdst->mc_top] == 0) {
 20828        const unsigned snum = cdst->mc_snum;
 20829        cASSERT(csrc, snum > 0);
 20830        MDBX_cursor mn;
 20831        cursor_copy(cdst, &mn);
 20832        /* must find the lowest key below dst */
 20833        rc = page_search_lowest(&mn);
 20834        if (unlikely(rc))
 20835          return rc;
 20836        MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top];
 20837        cASSERT(cdst, IS_LEAF(lowest_page));
 20838        if (unlikely(!IS_LEAF(lowest_page)))
 20839          goto bailout;
 20840        MDBX_val key;
 20841        if (IS_LEAF2(lowest_page)) {
 20842          key.iov_len = mn.mc_db->md_xsize;
 20843          key.iov_base = page_leaf2key(lowest_page, 0, key.iov_len);
 20844        } else {
 20845          MDBX_node *lowest_node = page_node(lowest_page, 0);
 20846          key.iov_len = node_ks(lowest_node);
 20847          key.iov_base = node_key(lowest_node);
 20848        }
 20849  
 20850        /* restore cursor after mdbx_page_search_lowest() */
 20851        mn.mc_snum = (uint8_t)snum;
 20852        mn.mc_top = (uint8_t)snum - 1;
 20853        mn.mc_ki[mn.mc_top] = 0;
 20854  
 20855        const intptr_t delta =
 20856            EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0)));
 20857        const intptr_t needed =
 20858            branch_size(cdst->mc_txn->mt_env, &key4move) + delta;
 20859        const intptr_t have = page_room(pdst);
 20860        if (unlikely(needed > have))
 20861          return MDBX_RESULT_TRUE;
 20862  
 20863        if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
 20864          return rc;
 20865        psrc = csrc->mc_pg[csrc->mc_top];
 20866        pdst = cdst->mc_pg[cdst->mc_top];
 20867  
 20868        WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key));
 20869        if (unlikely(rc))
 20870          return rc;
 20871      } else {
 20872        const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move);
 20873        const size_t have = page_room(pdst);
 20874        if (unlikely(needed > have))
 20875          return MDBX_RESULT_TRUE;
 20876  
 20877        if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
 20878          return rc;
 20879        psrc = csrc->mc_pg[csrc->mc_top];
 20880        pdst = cdst->mc_pg[cdst->mc_top];
 20881      }
 20882  
 20883      DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
 20884            " to node %u on page %" PRIaPGNO,
 20885            "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
 20886            psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
 20887      /* Add the node to the destination page. */
 20888      rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg);
 20889    } break;
 20890  
 20891    case P_LEAF: {
 20892      /* Mark src and dst as dirty. */
 20893      if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
 20894        return rc;
 20895      psrc = csrc->mc_pg[csrc->mc_top];
 20896      pdst = cdst->mc_pg[cdst->mc_top];
 20897      const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]);
 20898      MDBX_val data;
 20899      data.iov_len = node_ds(srcnode);
 20900      data.iov_base = node_data(srcnode);
 20901      key4move.iov_len = node_ks(srcnode);
 20902      key4move.iov_base = node_key(srcnode);
 20903      DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
 20904            " to node %u on page %" PRIaPGNO,
 20905            "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
 20906            psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
 20907      /* Add the node to the destination page. */
 20908      rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data,
 20909                         node_flags(srcnode));
 20910    } break;
 20911  
 20912    case P_LEAF | P_LEAF2: {
 20913      /* Mark src and dst as dirty. */
 20914      if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst))))
 20915        return rc;
 20916      psrc = csrc->mc_pg[csrc->mc_top];
 20917      pdst = cdst->mc_pg[cdst->mc_top];
 20918      key4move.iov_len = csrc->mc_db->md_xsize;
 20919      key4move.iov_base =
 20920          page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len);
 20921      DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO
 20922            " to node %u on page %" PRIaPGNO,
 20923            "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
 20924            psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
 20925      /* Add the node to the destination page. */
 20926      rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move);
 20927    } break;
 20928  
 20929    default:
 20930      assert(false);
 20931      goto bailout;
 20932    }
 20933  
 20934    if (unlikely(rc != MDBX_SUCCESS))
 20935      return rc;
 20936  
 20937    /* Delete the node from the source page. */
 20938    node_del(csrc, key4move.iov_len);
 20939  
 20940    cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
 20941    cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]);
 20942    cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst));
 20943  
 20944    {
 20945      /* Adjust other cursors pointing to mp */
 20946      MDBX_cursor *m2, *m3;
 20947      const MDBX_dbi dbi = csrc->mc_dbi;
 20948      cASSERT(csrc, csrc->mc_top == cdst->mc_top);
 20949      if (fromleft) {
 20950        /* If we're adding on the left, bump others up */
 20951        for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
 20952          m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
 20953          if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
 20954            continue;
 20955          if (m3 != cdst && m3->mc_pg[csrc->mc_top] == pdst &&
 20956              m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
 20957            m3->mc_ki[csrc->mc_top]++;
 20958          }
 20959          if (m3 != csrc && m3->mc_pg[csrc->mc_top] == psrc &&
 20960              m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
 20961            m3->mc_pg[csrc->mc_top] = pdst;
 20962            m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
 20963            cASSERT(csrc, csrc->mc_top > 0);
 20964            m3->mc_ki[csrc->mc_top - 1]++;
 20965          }
 20966          if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
 20967            XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
 20968        }
 20969      } else {
 20970        /* Adding on the right, bump others down */
 20971        for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
 20972          m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
 20973          if (m3 == csrc)
 20974            continue;
 20975          if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
 20976            continue;
 20977          if (m3->mc_pg[csrc->mc_top] == psrc) {
 20978            if (!m3->mc_ki[csrc->mc_top]) {
 20979              m3->mc_pg[csrc->mc_top] = pdst;
 20980              m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
 20981              cASSERT(csrc, csrc->mc_top > 0);
 20982              m3->mc_ki[csrc->mc_top - 1]--;
 20983            } else {
 20984              m3->mc_ki[csrc->mc_top]--;
 20985            }
 20986            if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
 20987              XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top],
 20988                              m3->mc_ki[csrc->mc_top]);
 20989          }
 20990        }
 20991      }
 20992    }
 20993  
 20994    /* Update the parent separators. */
 20995    if (csrc->mc_ki[csrc->mc_top] == 0) {
 20996      cASSERT(csrc, csrc->mc_top > 0);
 20997      if (csrc->mc_ki[csrc->mc_top - 1] != 0) {
 20998        MDBX_val key;
 20999        if (IS_LEAF2(psrc)) {
 21000          key.iov_len = psrc->mp_leaf2_ksize;
 21001          key.iov_base = page_leaf2key(psrc, 0, key.iov_len);
 21002        } else {
 21003          MDBX_node *srcnode = page_node(psrc, 0);
 21004          key.iov_len = node_ks(srcnode);
 21005          key.iov_base = node_key(srcnode);
 21006        }
 21007        DEBUG("update separator for source page %" PRIaPGNO " to [%s]",
 21008              psrc->mp_pgno, DKEY_DEBUG(&key));
 21009        MDBX_cursor mn;
 21010        cursor_copy(csrc, &mn);
 21011        cASSERT(csrc, mn.mc_snum > 0);
 21012        mn.mc_snum--;
 21013        mn.mc_top--;
 21014        /* We want rebalance to find mn when doing fixups */
 21015        WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key));
 21016        if (unlikely(rc != MDBX_SUCCESS))
 21017          return rc;
 21018      }
 21019      if (IS_BRANCH(psrc)) {
 21020        const MDBX_val nullkey = {0, 0};
 21021        const indx_t ix = csrc->mc_ki[csrc->mc_top];
 21022        csrc->mc_ki[csrc->mc_top] = 0;
 21023        rc = update_key(csrc, &nullkey);
 21024        csrc->mc_ki[csrc->mc_top] = ix;
 21025        cASSERT(csrc, rc == MDBX_SUCCESS);
 21026      }
 21027    }
 21028  
 21029    if (cdst->mc_ki[cdst->mc_top] == 0) {
 21030      cASSERT(cdst, cdst->mc_top > 0);
 21031      if (cdst->mc_ki[cdst->mc_top - 1] != 0) {
 21032        MDBX_val key;
 21033        if (IS_LEAF2(pdst)) {
 21034          key.iov_len = pdst->mp_leaf2_ksize;
 21035          key.iov_base = page_leaf2key(pdst, 0, key.iov_len);
 21036        } else {
 21037          MDBX_node *srcnode = page_node(pdst, 0);
 21038          key.iov_len = node_ks(srcnode);
 21039          key.iov_base = node_key(srcnode);
 21040        }
 21041        DEBUG("update separator for destination page %" PRIaPGNO " to [%s]",
 21042              pdst->mp_pgno, DKEY_DEBUG(&key));
 21043        MDBX_cursor mn;
 21044        cursor_copy(cdst, &mn);
 21045        cASSERT(cdst, mn.mc_snum > 0);
 21046        mn.mc_snum--;
 21047        mn.mc_top--;
 21048        /* We want rebalance to find mn when doing fixups */
 21049        WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key));
 21050        if (unlikely(rc != MDBX_SUCCESS))
 21051          return rc;
 21052      }
 21053      if (IS_BRANCH(pdst)) {
 21054        const MDBX_val nullkey = {0, 0};
 21055        const indx_t ix = cdst->mc_ki[cdst->mc_top];
 21056        cdst->mc_ki[cdst->mc_top] = 0;
 21057        rc = update_key(cdst, &nullkey);
 21058        cdst->mc_ki[cdst->mc_top] = ix;
 21059        cASSERT(cdst, rc == MDBX_SUCCESS);
 21060      }
 21061    }
 21062  
 21063    return MDBX_SUCCESS;
 21064  }
 21065  
/* Merge one page into another.
 *
 * The nodes from the page pointed to by csrc will be copied to the page
 * pointed to by cdst and then the csrc page will be freed.
 *
 * [in] csrc Cursor pointing to the source page.
 * [in] cdst Cursor pointing to the destination page.
 *
 * Returns 0 on success;
 * MDBX_RESULT_TRUE when merging branch pages and the destination lacks
 * room for the widened separator key (the merge is not performed);
 * MDBX_CURSOR_FULL when the cursor stack could not be restored after the
 * post-merge rebalance (cdst is de-initialized in that case);
 * otherwise a non-zero error code. */
static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
  MDBX_val key;
  int rc;

  cASSERT(csrc, csrc != cdst);
  cASSERT(csrc, cursor_is_tracked(csrc));
  cASSERT(cdst, cursor_is_tracked(cdst));
  const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top];
  MDBX_page *pdst = cdst->mc_pg[cdst->mc_top];
  DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno,
        pdst->mp_pgno);

  /* Both pages must be of the same type, belong to the same (sub)db,
   * and neither may be the root of the tree. */
  cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst));
  cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db);
  cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */
  cASSERT(cdst, cdst->mc_snum > 1);
  cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                    IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
  cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth ||
                    IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1]));
  /* Caller guarantees the destination has room for the whole source. */
  cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc));
  const int pagetype = PAGETYPE_WHOLE(psrc);

  /* Move all nodes from src to dst */
  const unsigned dst_nkeys = page_numkeys(pdst);
  const unsigned src_nkeys = page_numkeys(psrc);
  cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u));
  if (likely(src_nkeys)) {
    unsigned j = dst_nkeys; /* append position on the destination page */
    if (unlikely(pagetype & P_LEAF2)) {
      /* LEAF2 (dupfixed): keys are a packed array of fixed-size items,
       * so just append them one after another. */
      /* Mark dst as dirty. */
      if (unlikely(rc = page_touch(cdst)))
        return rc;

      key.iov_len = csrc->mc_db->md_xsize;
      key.iov_base = page_data(psrc);
      unsigned i = 0;
      do {
        rc = node_add_leaf2(cdst, j++, &key);
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;
        key.iov_base = (char *)key.iov_base + key.iov_len;
      } while (++i != src_nkeys);
    } else {
      MDBX_node *srcnode = page_node(psrc, 0);
      key.iov_len = node_ks(srcnode);
      key.iov_base = node_key(srcnode);
      if (pagetype & P_BRANCH) {
        /* A branch page's node 0 carries no key; the separator for the
         * merged node must be the lowest key found beneath src. */
        MDBX_cursor mn;
        cursor_copy(csrc, &mn);
        /* must find the lowest key below src */
        rc = page_search_lowest(&mn);
        if (unlikely(rc))
          return rc;

        const MDBX_page *mp = mn.mc_pg[mn.mc_top];
        if (likely(!IS_LEAF2(mp))) {
          cASSERT(&mn, IS_LEAF(mp));
          const MDBX_node *lowest = page_node(mp, 0);
          key.iov_len = node_ks(lowest);
          key.iov_base = node_key(lowest);
        } else {
          cASSERT(&mn, mn.mc_top > csrc->mc_top);
          key.iov_len = mp->mp_leaf2_ksize;
          key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len);
        }
        cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min);
        cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max);

        /* The fetched separator may be longer than the (empty) key of
         * branch node 0; re-check that dst still has enough room. */
        const size_t dst_room = page_room(pdst);
        const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc);
        const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len;
        if (unlikely(space_needed > dst_room))
          return MDBX_RESULT_TRUE;
      }

      /* Mark dst as dirty. */
      if (unlikely(rc = page_touch(cdst)))
        return rc;

      /* Copy the nodes one by one, appending to dst. */
      unsigned i = 0;
      while (true) {
        if (pagetype & P_LEAF) {
          MDBX_val data;
          data.iov_len = node_ds(srcnode);
          data.iov_base = node_data(srcnode);
          rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode));
        } else {
          cASSERT(csrc, node_flags(srcnode) == 0);
          rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode));
        }
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;

        if (++i == src_nkeys)
          break;
        srcnode = page_node(psrc, i);
        key.iov_len = node_ks(srcnode);
        key.iov_base = node_key(srcnode);
      }
    }

    pdst = cdst->mc_pg[cdst->mc_top];
    DEBUG("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)",
          pdst->mp_pgno, page_numkeys(pdst),
          page_fill(cdst->mc_txn->mt_env, pdst));

    cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
    cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]);
  }

  /* Unlink the src page from parent and add to free list.
   * Temporarily step the cursor up to the parent branch page to delete
   * the node that referenced src. */
  csrc->mc_top--;
  node_del(csrc, 0);
  if (csrc->mc_ki[csrc->mc_top] == 0) {
    /* Deleted the leftmost node: the new node 0 of the branch page must
     * carry an empty key. */
    const MDBX_val nullkey = {0, 0};
    rc = update_key(csrc, &nullkey);
    if (unlikely(rc)) {
      csrc->mc_top++;
      return rc;
    }
  }
  csrc->mc_top++;

  cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
  cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]);

  {
    /* Adjust other cursors pointing to mp */
    MDBX_cursor *m2, *m3;
    const MDBX_dbi dbi = csrc->mc_dbi;
    const unsigned top = csrc->mc_top;

    for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
      m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
      if (m3 == csrc || top >= m3->mc_snum)
        continue;
      if (m3->mc_pg[top] == psrc) {
        /* Cursor was on the (now freed) src page: retarget it to dst,
         * shifting its index past the nodes dst already held. */
        m3->mc_pg[top] = pdst;
        cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX);
        m3->mc_ki[top] += (indx_t)dst_nkeys;
        m3->mc_ki[top - 1] = cdst->mc_ki[top - 1];
      } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] &&
                 m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) {
        /* Cursor sits right of the deleted parent node: shift it left. */
        m3->mc_ki[top - 1]--;
      }
      if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
        XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]);
    }
  }

  /* If not operating on GC, allow this page to be reused
   * in this txn. Otherwise just add to free list. */
  rc = page_retire(csrc, (MDBX_page *)psrc);
  if (unlikely(rc))
    return rc;

  cASSERT(cdst, cdst->mc_db->md_entries > 0);
  cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth);
  cASSERT(cdst, cdst->mc_top > 0);
  cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1);
  /* Snapshot the top of the stack before the recursive rebalance of the
   * parent, so the stack can be restored (or detected as unrestorable)
   * afterwards. */
  MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top];
  const indx_t top_indx = cdst->mc_ki[cdst->mc_top];
  const unsigned save_snum = cdst->mc_snum;
  const uint16_t save_depth = cdst->mc_db->md_depth;
  cursor_pop(cdst);
  rc = rebalance(cdst);
  if (unlikely(rc))
    return rc;

  cASSERT(cdst, cdst->mc_db->md_entries > 0);
  cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth);
  cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1);

#if MDBX_ENABLE_PGOP_STAT
  cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */

  if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) {
    /* LY: don't touch cursor if top-page is a LEAF */
    cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                      PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys);

  if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) {
    /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->mc_pg[cdst->mc_top]) {
    /* LY: don't touch cursor if prev top-page already on the top */
    cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx);
    cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                      PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* The tree depth may have shrunk during rebalance; compute what the
   * stack height should now be for the saved top-page. */
  const int new_snum = save_snum - save_depth + cdst->mc_db->md_depth;
  if (unlikely(new_snum < 1 || new_snum > cdst->mc_db->md_depth)) {
    /* LY: out of range, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->mc_pg[new_snum - 1]) {
    cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx);
    /* LY: restore cursor stack */
    cdst->mc_snum = (uint8_t)new_snum;
    cdst->mc_top = (uint8_t)new_snum - 1;
    cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                      IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
    cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                      PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* A bit-inverted pointer/index is used as a "poisoned" stub to detect
   * stale stack slots left over from a deeper tree. */
  MDBX_page *const stub_page = (MDBX_page *)(~(uintptr_t)top_page);
  const indx_t stub_indx = top_indx;
  if (save_depth > cdst->mc_db->md_depth &&
      ((cdst->mc_pg[save_snum - 1] == top_page &&
        cdst->mc_ki[save_snum - 1] == top_indx) ||
       (cdst->mc_pg[save_snum - 1] == stub_page &&
        cdst->mc_ki[save_snum - 1] == stub_indx))) {
    /* LY: restore cursor stack */
    cdst->mc_pg[new_snum - 1] = top_page;
    cdst->mc_ki[new_snum - 1] = top_indx;
    /* Poison the now-unused slot above the new top (debug aid). */
    cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]);
    cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum];
    cdst->mc_snum = (uint8_t)new_snum;
    cdst->mc_top = (uint8_t)new_snum - 1;
    cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                      IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
    cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                      PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

bailout:
  /* LY: unable restore cursor's stack */
  cdst->mc_flags &= ~C_INITIALIZED;
  return MDBX_CURSOR_FULL;
}
 21319  
 21320  static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
 21321    cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi);
 21322    cASSERT(cdst, cdst->mc_txn == csrc->mc_txn);
 21323    cASSERT(cdst, cdst->mc_db == csrc->mc_db);
 21324    cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx);
 21325    cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate);
 21326    cdst->mc_snum = csrc->mc_snum;
 21327    cdst->mc_top = csrc->mc_top;
 21328    cdst->mc_flags = csrc->mc_flags;
 21329    cdst->mc_checking = csrc->mc_checking;
 21330  
 21331    for (unsigned i = 0; i < csrc->mc_snum; i++) {
 21332      cdst->mc_pg[i] = csrc->mc_pg[i];
 21333      cdst->mc_ki[i] = csrc->mc_ki[i];
 21334    }
 21335  }
 21336  
 21337  /* Copy the contents of a cursor.
 21338   * [in] csrc The cursor to copy from.
 21339   * [out] cdst The cursor to copy to. */
 21340  static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
 21341    cASSERT(csrc, csrc->mc_txn->mt_txnid >=
 21342                      csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak);
 21343    cdst->mc_dbi = csrc->mc_dbi;
 21344    cdst->mc_next = NULL;
 21345    cdst->mc_backup = NULL;
 21346    cdst->mc_xcursor = NULL;
 21347    cdst->mc_txn = csrc->mc_txn;
 21348    cdst->mc_db = csrc->mc_db;
 21349    cdst->mc_dbx = csrc->mc_dbx;
 21350    cdst->mc_dbistate = csrc->mc_dbistate;
 21351    cursor_restore(csrc, cdst);
 21352  }
 21353  
/* Rebalance the tree after a delete operation.
 *
 * Strategy: if the current page has too few keys or too much free room,
 * first try to merge it with a sibling, then try to move a node from a
 * sibling; as a last resort retry the merges with a zero room-threshold.
 *
 * [in] mc Cursor pointing to the page where rebalancing should begin.
 * Returns 0 on success, non-zero on failure. */
static int rebalance(MDBX_cursor *mc) {
  cASSERT(mc, cursor_is_tracked(mc));
  cASSERT(mc, mc->mc_snum > 0);
  cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth ||
                  IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
  const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]);

  STATIC_ASSERT(P_BRANCH == 1);
  /* Minimum keys a page must keep: 2 for branch pages (P_BRANCH == 1),
   * 1 for leaf pages. */
  const unsigned minkeys = (pagetype & P_BRANCH) + 1;

  /* Pages emptier than this are candidates for merging. */
  unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI)
                                ? mc->mc_txn->mt_env->me_merge_threshold
                                : mc->mc_txn->mt_env->me_merge_threshold_gc;

  const MDBX_page *const tp = mc->mc_pg[mc->mc_top];
  const unsigned numkeys = page_numkeys(tp);
  const unsigned room = page_room(tp);
  DEBUG("rebalancing %s page %" PRIaPGNO
        " (has %u keys, full %.1f%%, used %u, room %u bytes )",
        (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
        page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp),
        room);

  if (unlikely(numkeys < minkeys)) {
    DEBUG("page %" PRIaPGNO " must be merged due keys < %u threshold",
          tp->mp_pgno, minkeys);
  } else if (unlikely(room > room_threshold)) {
    DEBUG("page %" PRIaPGNO " should be merged due room %u > %u threshold",
          tp->mp_pgno, room, room_threshold);
  } else {
    /* Page is adequately filled: nothing to do. */
    DEBUG("no need to rebalance page %" PRIaPGNO ", room %u < %u threshold",
          tp->mp_pgno, room, room_threshold);
    cASSERT(mc, mc->mc_db->md_entries > 0);
    return MDBX_SUCCESS;
  }

  int rc;
  if (mc->mc_snum < 2) {
    /* The current page is the root: it cannot borrow from siblings,
     * only be emptied or collapsed. */
    MDBX_page *const mp = mc->mc_pg[0];
    const unsigned nkeys = page_numkeys(mp);
    cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0));
    if (IS_SUBP(mp)) {
      DEBUG("%s", "Can't rebalance a subpage, ignoring");
      cASSERT(mc, pagetype & P_LEAF);
      return MDBX_SUCCESS;
    }
    if (nkeys == 0) {
      /* Last entry deleted: the whole (sub)tree becomes empty. */
      cASSERT(mc, IS_LEAF(mp));
      DEBUG("%s", "tree is completely empty");
      cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0);
      mc->mc_db->md_root = P_INVALID;
      mc->mc_db->md_depth = 0;
      cASSERT(mc, mc->mc_db->md_branch_pages == 0 &&
                      mc->mc_db->md_overflow_pages == 0 &&
                      mc->mc_db->md_leaf_pages == 1);
      /* Adjust cursors pointing to mp */
      for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2;
           m2 = m2->mc_next) {
        MDBX_cursor *m3 =
            (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
        if (m3 == mc || !(m3->mc_flags & C_INITIALIZED))
          continue;
        if (m3->mc_pg[0] == mp) {
          /* De-initialize cursors that pointed into the freed root. */
          m3->mc_snum = 0;
          m3->mc_top = 0;
          m3->mc_flags &= ~C_INITIALIZED;
        }
      }
      mc->mc_snum = 0;
      mc->mc_top = 0;
      mc->mc_flags &= ~C_INITIALIZED;

      rc = page_retire(mc, mp);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    } else if (IS_BRANCH(mp) && nkeys == 1) {
      /* Branch root with a single child: promote the child to root
       * and shrink the tree depth by one. */
      DEBUG("%s", "collapsing root page!");
      mc->mc_db->md_root = node_pgno(page_node(mp, 0));
      rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      mc->mc_db->md_depth--;
      /* Shift this cursor's own stack down one level. */
      mc->mc_ki[0] = mc->mc_ki[1];
      for (int i = 1; i < mc->mc_db->md_depth; i++) {
        mc->mc_pg[i] = mc->mc_pg[i + 1];
        mc->mc_ki[i] = mc->mc_ki[i + 1];
      }

      /* Adjust other cursors pointing to mp */
      for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2;
           m2 = m2->mc_next) {
        MDBX_cursor *m3 =
            (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
        if (m3 == mc || !(m3->mc_flags & C_INITIALIZED))
          continue;
        if (m3->mc_pg[0] == mp) {
          for (int i = 0; i < mc->mc_db->md_depth; i++) {
            m3->mc_pg[i] = m3->mc_pg[i + 1];
            m3->mc_ki[i] = m3->mc_ki[i + 1];
          }
          m3->mc_snum--;
          m3->mc_top--;
        }
      }
      cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
                      PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype);
      cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth ||
                      IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));

      rc = page_retire(mc, mp);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    } else {
      DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)",
            mp->mp_pgno, mp->mp_flags);
    }
    return MDBX_SUCCESS;
  }

  /* The parent (branch page) must have at least 2 pointers,
   * otherwise the tree is invalid. */
  const unsigned pre_top = mc->mc_top - 1;
  cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top]));
  cASSERT(mc, !IS_SUBP(mc->mc_pg[0]));
  cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1);

  /* Leaf page fill factor is below the threshold.
   * Try to move keys from left or right neighbor, or
   * merge with a neighbor page. */

  /* Find neighbors. */
  MDBX_cursor mn;
  cursor_copy(mc, &mn);

  MDBX_page *left = nullptr, *right = nullptr;
  if (mn.mc_ki[pre_top] > 0) {
    rc = page_get(
        &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)),
        &left, mc->mc_pg[mc->mc_top]->mp_txnid);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]));
  }
  if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) {
    rc = page_get(
        &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)),
        &right, mc->mc_pg[mc->mc_top]->mp_txnid);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]));
  }
  cASSERT(mc, left || right);

  /* Snapshot positions/sizes before any attempt mutates the cursors. */
  const unsigned ki_top = mc->mc_ki[mc->mc_top];
  const unsigned ki_pre_top = mn.mc_ki[pre_top];
  const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]);

  const unsigned left_room = left ? page_room(left) : 0;
  const unsigned right_room = right ? page_room(right) : 0;
  const unsigned left_nkeys = left ? page_numkeys(left) : 0;
  const unsigned right_nkeys = right ? page_numkeys(right) : 0;
/* Each attempt below returns on success or hard error; MDBX_RESULT_TRUE
 * means "didn't fit, try the next strategy". On the retry pass the
 * room_threshold is zero (see the bottom of the function). */
retry:
  if (left_room > room_threshold && left_room >= right_room) {
    /* try merge with left */
    cASSERT(mc, left_nkeys >= minkeys);
    mn.mc_pg[mn.mc_top] = left;
    mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
    mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1);
    mc->mc_ki[mc->mc_top] = 0;
    const unsigned new_ki = ki_top + left_nkeys;
    /* NOTE(review): mc->mc_ki[mn.mc_top] was just zeroed above, so this
     * adds exactly 1 — written this way presumably for generality. */
    mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
    /* We want rebalance to find mn when doing fixups */
    WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn));
    if (likely(rc != MDBX_RESULT_TRUE)) {
      cursor_restore(&mn, mc);
      mc->mc_ki[mc->mc_top] = (indx_t)new_ki;
      cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
      return rc;
    }
  }
  if (right_room > room_threshold) {
    /* try merge with right */
    cASSERT(mc, right_nkeys >= minkeys);
    mn.mc_pg[mn.mc_top] = right;
    mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
    mn.mc_ki[mn.mc_top] = 0;
    mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
    WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc));
    if (likely(rc != MDBX_RESULT_TRUE)) {
      mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
      cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
      return rc;
    }
  }

  if (left_nkeys > minkeys &&
      (right_nkeys <= left_nkeys || right_room >= left_room)) {
    /* try move from left */
    mn.mc_pg[mn.mc_top] = left;
    mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
    mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1);
    mc->mc_ki[mc->mc_top] = 0;
    WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true));
    if (likely(rc != MDBX_RESULT_TRUE)) {
      mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1);
      cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
      return rc;
    }
  }
  if (right_nkeys > minkeys) {
    /* try move from right */
    mn.mc_pg[mn.mc_top] = right;
    mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
    mn.mc_ki[mn.mc_top] = 0;
    mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
    WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false));
    if (likely(rc != MDBX_RESULT_TRUE)) {
      mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
      cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
      return rc;
    }
  }

  if (nkeys >= minkeys) {
    /* All attempts failed but the page still satisfies the minimum-keys
     * invariant: acceptable, leave it as-is. */
    mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
    if (AUDIT_ENABLED())
      return cursor_check_updating(mc);
    return MDBX_SUCCESS;
  }

  if (likely(room_threshold > 0)) {
    /* Last chance: retry all strategies with no room requirement. */
    room_threshold = 0;
    goto retry;
  }
  ERROR("Unable to merge/rebalance %s page %" PRIaPGNO
        " (has %u keys, full %.1f%%, used %u, room %u bytes )",
        (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
        page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp),
        room);
  return MDBX_PROBLEM;
}
 21599  
 21600  __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) {
 21601    DKBUF;
 21602    int rc = MDBX_SUCCESS;
 21603    if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO))
 21604      rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno);
 21605  
 21606    MDBX_env *const env = mc->mc_txn->mt_env;
 21607    const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb;
 21608    unsigned flags_mask = P_ILL_BITS;
 21609    unsigned flags_expected = 0;
 21610    if (offset < 0 ||
 21611        offset > (ptrdiff_t)(env->me_dxb_mmap.current - ((mp->mp_flags & P_SUBP)
 21612                                                             ? PAGEHDRSZ + 1
 21613                                                             : env->me_psize))) {
 21614      /* should be dirty page without MDBX_WRITEMAP, or a subpage of. */
 21615      flags_mask -= P_SUBP;
 21616      if ((env->me_flags & MDBX_WRITEMAP) != 0 ||
 21617          (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP)))
 21618        rc = bad_page(mp, "invalid page-address %p, offset %zi\n",
 21619                      __Wpedantic_format_voidptr(mp), offset);
 21620    } else if (offset & (env->me_psize - 1))
 21621      flags_expected = P_SUBP;
 21622  
 21623    if (unlikely((mp->mp_flags & flags_mask) != flags_expected))
 21624      rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n",
 21625                    mp->mp_flags & flags_mask, flags_expected);
 21626  
 21627    cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0);
 21628    const uint8_t type = PAGETYPE_WHOLE(mp);
 21629    switch (type) {
 21630    default:
 21631      return bad_page(mp, "invalid type (%u)\n", type);
 21632    case P_OVERFLOW:
 21633      if (unlikely(mc->mc_flags & C_SUB))
 21634        rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large",
 21635                      "nested dupsort tree", mc->mc_db->md_flags);
 21636      const pgno_t npages = mp->mp_pages;
 21637      if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2))
 21638        rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages);
 21639      if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno))
 21640        rc = bad_page(
 21641            mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
 21642            mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno);
 21643      return rc; //-------------------------- end of large/overflow page handling
 21644    case P_LEAF | P_SUBP:
 21645      if (unlikely(mc->mc_db->md_depth != 1))
 21646        rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
 21647                      "leaf-sub", "nested dupsort db", mc->mc_db->md_flags);
 21648      /* fall through */
 21649      __fallthrough;
 21650    case P_LEAF:
 21651      if (unlikely((mc->mc_checking & CC_LEAF2) != 0))
 21652        rc = bad_page(
 21653            mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n",
 21654            mc->mc_db->md_flags);
 21655      break;
 21656    case P_LEAF | P_LEAF2 | P_SUBP:
 21657      if (unlikely(mc->mc_db->md_depth != 1))
 21658        rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n",
 21659                      "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags);
 21660      /* fall through */
 21661      __fallthrough;
 21662    case P_LEAF | P_LEAF2:
 21663      if (unlikely((mc->mc_checking & CC_LEAF2) == 0))
 21664        rc = bad_page(
 21665            mp,
 21666            "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n",
 21667            mc->mc_db->md_flags);
 21668      break;
 21669    case P_BRANCH:
 21670      break;
 21671    }
 21672  
 21673    if (unlikely(mp->mp_upper < mp->mp_lower ||
 21674                 ((mp->mp_lower | mp->mp_upper) & 1) ||
 21675                 PAGEHDRSZ + mp->mp_upper > env->me_psize))
 21676      rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %u\n",
 21677                    mp->mp_lower, mp->mp_upper, page_space(env));
 21678  
 21679    char *const end_of_page = (char *)mp + env->me_psize;
 21680    const unsigned nkeys = page_numkeys(mp);
 21681    STATIC_ASSERT(P_BRANCH == 1);
 21682    if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) {
 21683      if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) &&
 21684          (!(mc->mc_checking & CC_UPDATING) ||
 21685           !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP))))
 21686        rc =
 21687            bad_page(mp, "%s-page nkeys (%u) < %u\n",
 21688                     IS_BRANCH(mp) ? "branch" : "leaf", nkeys, 1 + IS_BRANCH(mp));
 21689    }
 21690    if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper +
 21691                                      nkeys * sizeof(MDBX_node) + nkeys - 1 >
 21692                                  env->me_psize))
 21693      rc = bad_page(mp, "invalid page upper (%u) for nkeys %u with limit %u\n",
 21694                    mp->mp_upper, nkeys, page_space(env));
 21695  
 21696    const size_t ksize_max = keysize_max(env->me_psize, 0);
 21697    const size_t leaf2_ksize = mp->mp_leaf2_ksize;
 21698    if (IS_LEAF2(mp)) {
 21699      if (unlikely((mc->mc_flags & C_SUB) == 0 ||
 21700                   (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0))
 21701        rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n",
 21702                      mc->mc_db->md_flags);
 21703      if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max))
 21704        rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize);
 21705    }
 21706  
 21707    MDBX_val here, prev = {0, 0};
 21708    for (unsigned i = 0; i < nkeys; ++i) {
 21709      if (IS_LEAF2(mp)) {
 21710        char *const key = page_leaf2key(mp, i, leaf2_ksize);
 21711        if (unlikely(end_of_page < key + leaf2_ksize)) {
 21712          rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n",
 21713                        key + leaf2_ksize - end_of_page);
 21714          continue;
 21715        }
 21716  
 21717        if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) {
 21718          if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min ||
 21719                       leaf2_ksize > mc->mc_dbx->md_klen_max))
 21720            rc = bad_page(
 21721                mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n",
 21722                leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max);
 21723          else
 21724            mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize;
 21725        }
 21726        if ((mc->mc_checking & CC_SKIPORD) == 0) {
 21727          here.iov_len = leaf2_ksize;
 21728          here.iov_base = key;
 21729          if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0))
 21730            rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i,
 21731                          DKEY(&prev), DVAL(&here));
 21732          prev = here;
 21733        }
 21734      } else {
 21735        const MDBX_node *const node = page_node(mp, i);
 21736        const char *node_end = (char *)node + NODESIZE;
 21737        if (unlikely(node_end > end_of_page)) {
 21738          rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i,
 21739                        node_end - end_of_page);
 21740          continue;
 21741        }
 21742        const size_t ksize = node_ks(node);
 21743        if (unlikely(ksize > ksize_max))
 21744          rc = bad_page(mp, "node[%u] too long key (%zu)\n", i, ksize);
 21745        char *key = node_key(node);
 21746        if (unlikely(end_of_page < key + ksize)) {
 21747          rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i,
 21748                        key + ksize - end_of_page);
 21749          continue;
 21750        }
 21751        if ((IS_LEAF(mp) || i > 0)) {
 21752          if (unlikely(ksize < mc->mc_dbx->md_klen_min ||
 21753                       ksize > mc->mc_dbx->md_klen_max))
 21754            rc = bad_page(
 21755                mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n",
 21756                i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max);
 21757          if ((mc->mc_checking & CC_SKIPORD) == 0) {
 21758            here.iov_base = key;
 21759            here.iov_len = ksize;
 21760            if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0))
 21761              rc = bad_page(mp, "node[%u] key wrong order (%s >= %s)\n", i,
 21762                            DKEY(&prev), DVAL(&here));
 21763            prev = here;
 21764          }
 21765        }
 21766        if (IS_BRANCH(mp)) {
 21767          if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 &&
 21768              unlikely(ksize != 0))
 21769            rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n",
 21770                          i, ksize);
 21771          const pgno_t ref = node_pgno(node);
 21772          if (unlikely(ref < MIN_PAGENO) ||
 21773              (unlikely(ref >= mc->mc_txn->mt_next_pgno) &&
 21774               (unlikely(ref >= mc->mc_txn->mt_geo.now) ||
 21775                !(mc->mc_checking & CC_RETIRING))))
 21776            rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref);
 21777          if (unlikely(node_flags(node)))
 21778            rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i,
 21779                          node_flags(node));
 21780          continue;
 21781        }
 21782  
 21783        switch (node_flags(node)) {
 21784        default:
 21785          rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node));
 21786          break;
 21787        case F_BIGDATA /* data on large-page */:
 21788        case 0 /* usual */:
 21789        case F_SUBDATA /* sub-db */:
 21790        case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
 21791        case F_DUPDATA /* short sub-page */:
 21792          break;
 21793        }
 21794  
 21795        const size_t dsize = node_ds(node);
 21796        const char *const data = node_data(node);
 21797        if (node_flags(node) & F_BIGDATA) {
 21798          if (unlikely(end_of_page < data + sizeof(pgno_t))) {
 21799            rc = bad_page(
 21800                mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n",
 21801                "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page);
 21802            continue;
 21803          }
 21804          if (unlikely(dsize <= mc->mc_dbx->md_vlen_min ||
 21805                       dsize > mc->mc_dbx->md_vlen_max))
 21806            rc = bad_page(
 21807                mp,
 21808                "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n",
 21809                dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max);
 21810          if (unlikely(node_size_len(node_ks(node), dsize) <=
 21811                       mc->mc_txn->mt_env->me_leaf_nodemax))
 21812            poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
 21813  
 21814          if ((mc->mc_checking & CC_RETIRING) == 0) {
 21815            const pgr_t lp =
 21816                page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid);
 21817            if (unlikely(lp.err != MDBX_SUCCESS))
 21818              return lp.err;
 21819            cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
 21820            const unsigned npages = number_of_ovpages(env, dsize);
 21821            if (unlikely(lp.page->mp_pages != npages)) {
 21822              if (lp.page->mp_pages < npages)
 21823                rc = bad_page(lp.page,
 21824                              "too less n-pages %u for bigdata-node (%zu bytes)",
 21825                              lp.page->mp_pages, dsize);
 21826              else
 21827                poor_page(lp.page,
 21828                          "extra n-pages %u for bigdata-node (%zu bytes)",
 21829                          lp.page->mp_pages, dsize);
 21830            }
 21831          }
 21832          continue;
 21833        }
 21834  
 21835        if (unlikely(end_of_page < data + dsize)) {
 21836          rc =
 21837              bad_page(mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n",
 21838                       "data", i, nkeys, dsize, data + dsize - end_of_page);
 21839          continue;
 21840        }
 21841  
 21842        switch (node_flags(node)) {
 21843        default:
 21844          /* wrong, but already handled */
 21845          continue;
 21846        case 0 /* usual */:
 21847          if (unlikely(dsize < mc->mc_dbx->md_vlen_min ||
 21848                       dsize > mc->mc_dbx->md_vlen_max)) {
 21849            rc = bad_page(
 21850                mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n",
 21851                dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max);
 21852            continue;
 21853          }
 21854          break;
 21855        case F_SUBDATA /* sub-db */:
 21856          if (unlikely(dsize != sizeof(MDBX_db))) {
 21857            rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize);
 21858            continue;
 21859          }
 21860          break;
 21861        case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
 21862          if (unlikely(dsize != sizeof(MDBX_db))) {
 21863            rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize);
 21864            continue;
 21865          }
 21866          break;
 21867        case F_DUPDATA /* short sub-page */:
 21868          if (unlikely(dsize <= PAGEHDRSZ)) {
 21869            rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n",
 21870                          dsize);
 21871            continue;
 21872          } else {
 21873            const MDBX_page *const sp = (MDBX_page *)data;
 21874            switch (sp->mp_flags &
 21875                    /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
 21876            case P_LEAF | P_SUBP:
 21877            case P_LEAF | P_LEAF2 | P_SUBP:
 21878              break;
 21879            default:
 21880              rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n",
 21881                            sp->mp_flags);
 21882              continue;
 21883            }
 21884  
 21885            const char *const end_of_subpage = data + dsize;
 21886            const int nsubkeys = page_numkeys(sp);
 21887            if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) &&
 21888                mc->mc_db->md_entries)
 21889              rc = bad_page(mp, "no keys on a %s-page\n",
 21890                            IS_LEAF2(sp) ? "leaf2-sub" : "leaf-sub");
 21891  
 21892            MDBX_val sub_here, sub_prev = {0, 0};
 21893            for (int j = 0; j < nsubkeys; j++) {
 21894              if (IS_LEAF2(sp)) {
 21895                /* LEAF2 pages have no mp_ptrs[] or node headers */
 21896                size_t sub_ksize = sp->mp_leaf2_ksize;
 21897                char *sub_key = page_leaf2key(sp, j, sub_ksize);
 21898                if (unlikely(end_of_subpage < sub_key + sub_ksize)) {
 21899                  rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n",
 21900                                sub_key + sub_ksize - end_of_subpage);
 21901                  continue;
 21902                }
 21903  
 21904                if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) {
 21905                  if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min ||
 21906                               sub_ksize > mc->mc_dbx->md_vlen_max))
 21907                    rc = bad_page(mp,
 21908                                  "nested-leaf2-key size (%zu) <> min/max "
 21909                                  "value-length (%zu/%zu)\n",
 21910                                  sub_ksize, mc->mc_dbx->md_vlen_min,
 21911                                  mc->mc_dbx->md_vlen_max);
 21912                  else
 21913                    mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize;
 21914                }
 21915                if ((mc->mc_checking & CC_SKIPORD) == 0) {
 21916                  sub_here.iov_len = sub_ksize;
 21917                  sub_here.iov_base = sub_key;
 21918                  if (sub_prev.iov_base &&
 21919                      unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0))
 21920                    rc = bad_page(mp,
 21921                                  "nested-leaf2-key #%u wrong order (%s >= %s)\n",
 21922                                  j, DKEY(&sub_prev), DVAL(&sub_here));
 21923                  sub_prev = sub_here;
 21924                }
 21925              } else {
 21926                const MDBX_node *const sub_node = page_node(sp, j);
 21927                const char *sub_node_end = (char *)sub_node + NODESIZE;
 21928                if (unlikely(sub_node_end > end_of_subpage)) {
 21929                  rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n",
 21930                                end_of_subpage - sub_node_end);
 21931                  continue;
 21932                }
 21933                if (unlikely(node_flags(sub_node) != 0))
 21934                  rc = bad_page(mp, "nested-node invalid flags (%u)\n",
 21935                                node_flags(sub_node));
 21936  
 21937                size_t sub_ksize = node_ks(sub_node);
 21938                char *sub_key = node_key(sub_node);
 21939                size_t sub_dsize = node_ds(sub_node);
 21940                /* char *sub_data = node_data(sub_node); */
 21941  
 21942                if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min ||
 21943                             sub_ksize > mc->mc_dbx->md_vlen_max))
 21944                  rc = bad_page(mp,
 21945                                "nested-node-key size (%zu) <> min/max "
 21946                                "value-length (%zu/%zu)\n",
 21947                                sub_ksize, mc->mc_dbx->md_vlen_min,
 21948                                mc->mc_dbx->md_vlen_max);
 21949                if ((mc->mc_checking & CC_SKIPORD) == 0) {
 21950                  sub_here.iov_len = sub_ksize;
 21951                  sub_here.iov_base = sub_key;
 21952                  if (sub_prev.iov_base &&
 21953                      unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0))
 21954                    rc = bad_page(mp,
 21955                                  "nested-node-key #%u wrong order (%s >= %s)\n",
 21956                                  j, DKEY(&sub_prev), DVAL(&sub_here));
 21957                  sub_prev = sub_here;
 21958                }
 21959                if (unlikely(sub_dsize != 0))
 21960                  rc = bad_page(mp, "nested-node non-empty data size (%zu)\n",
 21961                                sub_dsize);
 21962                if (unlikely(end_of_subpage < sub_key + sub_ksize))
 21963                  rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n",
 21964                                sub_key + sub_ksize - end_of_subpage);
 21965              }
 21966            }
 21967          }
 21968          break;
 21969        }
 21970      }
 21971    }
 21972    return rc;
 21973  }
 21974  
/* Audit helper: validate a cursor's stack and the (sub)tree beneath it.
 *
 * Checks that the transaction's dirty-page accounting balances, that the
 * cursor stack depth/top agree with the db depth (relaxed while CC_UPDATING
 * is set), and then applies page_check() to every page on the stack plus
 * the immediate children of each traversed branch page.
 *
 * Returns MDBX_SUCCESS, MDBX_CURSOR_FULL on a structural inconsistency,
 * or an error propagated from page_check()/page_get(). */
__cold static int cursor_check(MDBX_cursor *mc) {
  /* dirtyroom + dirtylist length must equal the parent txn's dirtyroom
   * (nested txn) or the environment's dp_limit (top-level txn). */
  cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length ==
                  (mc->mc_txn->mt_parent
                       ? mc->mc_txn->mt_parent->tw.dirtyroom
                       : mc->mc_txn->mt_env->me_options.dp_limit));
  /* Outside of an update the top entry must be the last stack slot;
   * during an update (CC_UPDATING) a mismatch is tolerated. */
  cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING));
  if (unlikely(mc->mc_top != mc->mc_snum - 1) &&
      (mc->mc_checking & CC_UPDATING) == 0)
    return MDBX_CURSOR_FULL;
  /* While updating, the stack may be shallower than the db depth;
   * otherwise it must match exactly. */
  cASSERT(mc, (mc->mc_checking & CC_UPDATING)
                  ? mc->mc_snum <= mc->mc_db->md_depth
                  : mc->mc_snum == mc->mc_db->md_depth);
  if (unlikely((mc->mc_checking & CC_UPDATING)
                   ? mc->mc_snum > mc->mc_db->md_depth
                   : mc->mc_snum != mc->mc_db->md_depth))
    return MDBX_CURSOR_FULL;

  /* Walk the stack from the root down, checking each level. */
  for (int n = 0; n < (int)mc->mc_snum; ++n) {
    MDBX_page *mp = mc->mc_pg[n];
    const unsigned nkeys = page_numkeys(mp);
    /* every level but the deepest must be a branch page */
    const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false;
    const bool expect_nested_leaf =
        (n + 1 == mc->mc_db->md_depth - 1) ? true : false;
    const bool branch = IS_BRANCH(mp) ? true : false;
    cASSERT(mc, branch == expect_branch);
    if (unlikely(branch != expect_branch))
      return MDBX_CURSOR_FULL;
    if ((mc->mc_checking & CC_UPDATING) == 0) {
      /* a positioned cursor's index must be inside the page, except that
       * on a leaf it may sit one-past-the-end with C_EOF set */
      cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] &&
                                           (mc->mc_flags & C_EOF) != 0));
      if (unlikely(nkeys <= mc->mc_ki[n] &&
                   !(!branch && nkeys == mc->mc_ki[n] &&
                     (mc->mc_flags & C_EOF) != 0)))
        return MDBX_CURSOR_FULL;
    } else {
      /* during an update a slightly looser bound applies */
      cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]);
      if (unlikely(nkeys + 1 < mc->mc_ki[n]))
        return MDBX_CURSOR_FULL;
    }

    int err = page_check(mc, mp);
    if (unlikely(err != MDBX_SUCCESS))
      return err;

    /* For a branch page, also fetch and check every child page. */
    for (unsigned i = 0; i < nkeys; ++i) {
      if (branch) {
        MDBX_node *node = page_node(mp, i);
        /* branch nodes must not carry any node flags */
        cASSERT(mc, node_flags(node) == 0);
        if (unlikely(node_flags(node) != 0))
          return MDBX_CURSOR_FULL;
        pgno_t pgno = node_pgno(node);
        MDBX_page *np;
        err = page_get(mc, pgno, &np, mp->mp_txnid);
        cASSERT(mc, err == MDBX_SUCCESS);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
        /* the child must be a leaf exactly when it sits on the last level */
        const bool nested_leaf = IS_LEAF(np) ? true : false;
        cASSERT(mc, nested_leaf == expect_nested_leaf);
        if (unlikely(nested_leaf != expect_nested_leaf))
          return MDBX_CURSOR_FULL;
        err = page_check(mc, np);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
      }
    }
  }
  return MDBX_SUCCESS;
}
 22043  
 22044  __cold static int cursor_check_updating(MDBX_cursor *mc) {
 22045    const uint8_t checking = mc->mc_checking;
 22046    mc->mc_checking |= CC_UPDATING;
 22047    const int rc = cursor_check(mc);
 22048    mc->mc_checking = checking;
 22049    return rc;
 22050  }
 22051  
/* Complete a delete operation started by mdbx_cursor_del().
 *
 * Removes the node the cursor points at, fixes up every other tracked
 * cursor on the same page, rebalances the tree, and then re-adjusts the
 * cursors once more since rebalance may have moved nodes between pages.
 * Sets MDBX_TXN_ERROR in the txn flags on failure. */
static int cursor_del(MDBX_cursor *mc) {
  int rc;
  MDBX_page *mp;
  indx_t ki;
  unsigned nkeys;
  MDBX_dbi dbi = mc->mc_dbi;

  cASSERT(mc, cursor_is_tracked(mc));
  cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
  ki = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  node_del(mc, mc->mc_db->md_xsize);
  mc->mc_db->md_entries--;

  /* Adjust other cursors pointing to mp */
  /* NOTE(review): when mc is a sub-cursor (C_SUB), the peers to fix up are
   * the corresponding sub-cursors of the tracked cursors — presumably
   * because only they share the nested page; confirm against mx usage. */
  for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
    MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
    if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
      continue;
    if (m3->mc_snum < mc->mc_snum)
      continue;
    if (m3->mc_pg[mc->mc_top] == mp) {
      if (m3->mc_ki[mc->mc_top] == ki) {
        /* peer stood exactly on the deleted entry */
        m3->mc_flags |= C_DEL;
        if (mc->mc_db->md_flags & MDBX_DUPSORT) {
          /* Sub-cursor referred into dataset which is gone */
          m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
        }
        continue;
      } else if (m3->mc_ki[mc->mc_top] > ki) {
        /* entries after the deleted one shifted left by one */
        m3->mc_ki[mc->mc_top]--;
      }
      if (XCURSOR_INITED(m3))
        XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
    }
  }

  /* The leaf may now be underfilled; rebalance may merge/move nodes
   * and even empty the tree entirely. */
  rc = rebalance(mc);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;

  if (unlikely(!mc->mc_snum)) {
    /* DB is totally empty now, just bail out.
     * Other cursors adjustments were already done
     * by rebalance and aren't needed here. */
    cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 &&
                    mc->mc_db->md_root == P_INVALID);
    mc->mc_flags |= C_EOF;
    return MDBX_SUCCESS;
  }

  /* re-read position: rebalance may have replaced the top page */
  ki = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
  nkeys = page_numkeys(mp);
  cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) ||
                  ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 &&
                   nkeys == 0));

  /* Adjust this and other cursors pointing to mp */
  for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) {
    MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
    if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
      continue;
    if (m3->mc_snum < mc->mc_snum)
      continue;
    if (m3->mc_pg[mc->mc_top] == mp) {
      /* if m3 points past last node in page, find next sibling */
      if (m3->mc_ki[mc->mc_top] >= nkeys) {
        rc = cursor_sibling(m3, SIBLING_RIGHT);
        if (rc == MDBX_NOTFOUND) {
          /* no right sibling: the peer is at end-of-data, not an error */
          m3->mc_flags |= C_EOF;
          rc = MDBX_SUCCESS;
          continue;
        }
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      }
      if (m3->mc_ki[mc->mc_top] >= ki ||
          /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) {
        if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) {
          MDBX_node *node =
              page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
          /* If this node has dupdata, it may need to be reinited
           * because its data has moved.
           * If the xcursor was not inited it must be reinited.
           * Else if node points to a subDB, nothing is needed. */
          if (node_flags(node) & F_DUPDATA) {
            if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
              if (!(node_flags(node) & F_SUBDATA))
                m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
            } else {
              rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]);
              if (unlikely(rc != MDBX_SUCCESS))
                goto bailout;
              rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL);
              if (unlikely(rc != MDBX_SUCCESS))
                goto bailout;
            }
          }
          m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
        }
        m3->mc_flags |= C_DEL;
      }
    }
  }

  cASSERT(mc, rc == MDBX_SUCCESS);
  if (AUDIT_ENABLED())
    rc = cursor_check(mc);
  return rc;

bailout:
  mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
 22169  
 22170  int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
 22171               const MDBX_val *data) {
 22172    int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
 22173    if (unlikely(rc != MDBX_SUCCESS))
 22174      return rc;
 22175  
 22176    if (unlikely(!key))
 22177      return MDBX_EINVAL;
 22178  
 22179    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 22180      return MDBX_BAD_DBI;
 22181  
 22182    if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
 22183      return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
 22184  
 22185    return delete (txn, dbi, key, data, 0);
 22186  }
 22187  
 22188  static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
 22189                     const MDBX_val *data, unsigned flags) {
 22190    MDBX_cursor_couple cx;
 22191    MDBX_cursor_op op;
 22192    MDBX_val rdata;
 22193    int rc;
 22194    DKBUF_DEBUG;
 22195  
 22196    DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key),
 22197          DVAL_DEBUG(data));
 22198  
 22199    rc = cursor_init(&cx.outer, txn, dbi);
 22200    if (unlikely(rc != MDBX_SUCCESS))
 22201      return rc;
 22202  
 22203    if (data) {
 22204      op = MDBX_GET_BOTH;
 22205      rdata = *data;
 22206      data = &rdata;
 22207    } else {
 22208      op = MDBX_SET;
 22209      flags |= MDBX_ALLDUPS;
 22210    }
 22211    rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err;
 22212    if (likely(rc == MDBX_SUCCESS)) {
 22213      /* let mdbx_page_split know about this cursor if needed:
 22214       * delete will trigger a rebalance; if it needs to move
 22215       * a node from one page to another, it will have to
 22216       * update the parent's separator key(s). If the new sepkey
 22217       * is larger than the current one, the parent page may
 22218       * run out of space, triggering a split. We need this
 22219       * cursor to be consistent until the end of the rebalance. */
 22220      cx.outer.mc_next = txn->mt_cursors[dbi];
 22221      txn->mt_cursors[dbi] = &cx.outer;
 22222      rc = mdbx_cursor_del(&cx.outer, flags);
 22223      txn->mt_cursors[dbi] = cx.outer.mc_next;
 22224    }
 22225    return rc;
 22226  }
 22227  
 22228  /* Split a page and insert a new node.
 22229   * Set MDBX_TXN_ERROR on failure.
 22230   * [in,out] mc Cursor pointing to the page and desired insertion index.
 22231   * The cursor will be updated to point to the actual page and index where
 22232   * the node got inserted after the split.
 22233   * [in] newkey The key for the newly inserted node.
 22234   * [in] newdata The data for the newly inserted node.
 22235   * [in] newpgno The page number, if the new node is a branch node.
 22236   * [in] naf The NODE_ADD_FLAGS for the new node.
 22237   * Returns 0 on success, non-zero on failure. */
 22238  static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
 22239                        MDBX_val *const newdata, pgno_t newpgno,
 22240                        const unsigned naf) {
 22241    unsigned flags;
 22242    int rc = MDBX_SUCCESS, foliage = 0;
 22243    unsigned i, ptop;
 22244    MDBX_env *const env = mc->mc_txn->mt_env;
 22245    MDBX_val sepkey, rkey, xdata;
 22246    MDBX_page *tmp_ki_copy = NULL;
 22247    DKBUF;
 22248  
 22249    MDBX_page *const mp = mc->mc_pg[mc->mc_top];
 22250    const unsigned newindx = mc->mc_ki[mc->mc_top];
 22251    unsigned nkeys = page_numkeys(mp);
 22252    if (AUDIT_ENABLED()) {
 22253      rc = cursor_check_updating(mc);
 22254      if (unlikely(rc != MDBX_SUCCESS))
 22255        return rc;
 22256    }
 22257    STATIC_ASSERT(P_BRANCH == 1);
 22258    const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1;
 22259  
 22260    DEBUG(">> splitting %s-page %" PRIaPGNO
 22261          " and adding %zu+%zu [%s] at %i, nkeys %i",
 22262          IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len,
 22263          newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey),
 22264          mc->mc_ki[mc->mc_top], nkeys);
 22265    cASSERT(mc, nkeys + 1 >= minkeys * 2);
 22266  
 22267    /* Create a new sibling page. */
 22268    pgr_t npr = page_new(mc, mp->mp_flags);
 22269    if (unlikely(npr.err != MDBX_SUCCESS))
 22270      return npr.err;
 22271    MDBX_page *const sister = npr.page;
 22272    sister->mp_leaf2_ksize = mp->mp_leaf2_ksize;
 22273    DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno);
 22274  
 22275    /* Usually when splitting the root page, the cursor
 22276     * height is 1. But when called from update_key,
 22277     * the cursor height may be greater because it walks
 22278     * up the stack while finding the branch slot to update. */
 22279    if (mc->mc_top < 1) {
 22280      npr = page_new(mc, P_BRANCH);
 22281      rc = npr.err;
 22282      if (unlikely(rc != MDBX_SUCCESS))
 22283        goto done;
 22284      MDBX_page *const pp = npr.page;
 22285      /* shift current top to make room for new parent */
 22286      cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0);
 22287  #if MDBX_DEBUG
 22288      memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3);
 22289      memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3);
 22290  #endif
 22291      mc->mc_pg[2] = mc->mc_pg[1];
 22292      mc->mc_ki[2] = mc->mc_ki[1];
 22293      mc->mc_pg[1] = mc->mc_pg[0];
 22294      mc->mc_ki[1] = mc->mc_ki[0];
 22295      mc->mc_pg[0] = pp;
 22296      mc->mc_ki[0] = 0;
 22297      mc->mc_db->md_root = pp->mp_pgno;
 22298      DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno);
 22299      foliage = mc->mc_db->md_depth++;
 22300  
 22301      /* Add left (implicit) pointer. */
 22302      rc = node_add_branch(mc, 0, NULL, mp->mp_pgno);
 22303      if (unlikely(rc != MDBX_SUCCESS)) {
 22304        /* undo the pre-push */
 22305        mc->mc_pg[0] = mc->mc_pg[1];
 22306        mc->mc_ki[0] = mc->mc_ki[1];
 22307        mc->mc_db->md_root = mp->mp_pgno;
 22308        mc->mc_db->md_depth--;
 22309        goto done;
 22310      }
 22311      mc->mc_snum++;
 22312      mc->mc_top++;
 22313      ptop = 0;
 22314      if (AUDIT_ENABLED()) {
 22315        rc = cursor_check_updating(mc);
 22316        if (unlikely(rc != MDBX_SUCCESS))
 22317          goto done;
 22318      }
 22319    } else {
 22320      ptop = mc->mc_top - 1;
 22321      DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno);
 22322    }
 22323  
 22324    MDBX_cursor mn;
 22325    cursor_copy(mc, &mn);
 22326    mn.mc_pg[mn.mc_top] = sister;
 22327    mn.mc_ki[mn.mc_top] = 0;
 22328    mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1;
 22329  
 22330    unsigned split_indx =
 22331        (newindx < nkeys)
 22332            ? /* split at the middle */ (nkeys + 1) >> 1
 22333            : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1;
 22334    eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1);
 22335  
 22336    cASSERT(mc, !IS_BRANCH(mp) || newindx > 0);
 22337    /* It is reasonable and possible to split the page at the begin */
 22338    if (unlikely(newindx < minkeys)) {
 22339      split_indx = minkeys;
 22340      if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) {
 22341        split_indx = 0;
 22342        /* Checking for ability of splitting by the left-side insertion
 22343         * of a pure page with the new key */
 22344        for (i = 0; i < mc->mc_top; ++i)
 22345          if (mc->mc_ki[i]) {
 22346            get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey);
 22347            if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0)
 22348              split_indx = minkeys;
 22349            break;
 22350          }
 22351        if (split_indx == 0) {
 22352          /* Save the current first key which was omitted on the parent branch
 22353           * page and should be updated if the new first entry will be added */
 22354          if (IS_LEAF2(mp)) {
 22355            sepkey.iov_len = mp->mp_leaf2_ksize;
 22356            sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len);
 22357          } else
 22358            get_key(page_node(mp, 0), &sepkey);
 22359          cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0);
 22360          /* Avoiding rare complex cases of split the parent page */
 22361          if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey))
 22362            split_indx = minkeys;
 22363        }
 22364      }
 22365    }
 22366  
 22367    const bool pure_right = split_indx == nkeys;
 22368    const bool pure_left = split_indx == 0;
 22369    if (unlikely(pure_right)) {
 22370      /* newindx == split_indx == nkeys */
 22371      TRACE("no-split, but add new pure page at the %s", "right/after");
 22372      cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1);
 22373      sepkey = *newkey;
 22374    } else if (unlikely(pure_left)) {
 22375      /* newindx == split_indx == 0 */
 22376      TRACE("no-split, but add new pure page at the %s", "left/before");
 22377      cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1);
 22378      TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey));
 22379    } else {
 22380      if (IS_LEAF2(sister)) {
 22381        char *split, *ins;
 22382        unsigned lsize, rsize, ksize;
 22383        /* Move half of the keys to the right sibling */
 22384        const int distance = mc->mc_ki[mc->mc_top] - split_indx;
 22385        ksize = mc->mc_db->md_xsize;
 22386        split = page_leaf2key(mp, split_indx, ksize);
 22387        rsize = (nkeys - split_indx) * ksize;
 22388        lsize = (nkeys - split_indx) * sizeof(indx_t);
 22389        cASSERT(mc, mp->mp_lower >= lsize);
 22390        mp->mp_lower -= (indx_t)lsize;
 22391        cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX);
 22392        sister->mp_lower += (indx_t)lsize;
 22393        cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX);
 22394        mp->mp_upper += (indx_t)(rsize - lsize);
 22395        cASSERT(mc, sister->mp_upper >= rsize - lsize);
 22396        sister->mp_upper -= (indx_t)(rsize - lsize);
 22397        sepkey.iov_len = ksize;
 22398        sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base;
 22399        if (distance < 0) {
 22400          cASSERT(mc, ksize >= sizeof(indx_t));
 22401          ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize);
 22402          memcpy(sister->mp_ptrs, split, rsize);
 22403          sepkey.iov_base = sister->mp_ptrs;
 22404          memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
 22405          memcpy(ins, newkey->iov_base, ksize);
 22406          cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t));
 22407          mp->mp_lower += sizeof(indx_t);
 22408          cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t));
 22409          mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
 22410        } else {
 22411          memcpy(sister->mp_ptrs, split, distance * ksize);
 22412          ins = page_leaf2key(sister, distance, ksize);
 22413          memcpy(ins, newkey->iov_base, ksize);
 22414          memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize);
 22415          cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t));
 22416          sister->mp_lower += sizeof(indx_t);
 22417          cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t));
 22418          sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
 22419          cASSERT(mc, distance <= (int)UINT16_MAX);
 22420          mc->mc_ki[mc->mc_top] = (indx_t)distance;
 22421        }
 22422  
 22423        if (AUDIT_ENABLED()) {
 22424          rc = cursor_check_updating(mc);
 22425          if (unlikely(rc != MDBX_SUCCESS))
 22426            goto done;
 22427          rc = cursor_check_updating(&mn);
 22428          if (unlikely(rc != MDBX_SUCCESS))
 22429            goto done;
 22430        }
 22431      } else {
 22432        /* grab a page to hold a temporary copy */
 22433        tmp_ki_copy = page_malloc(mc->mc_txn, 1);
 22434        if (unlikely(tmp_ki_copy == NULL)) {
 22435          rc = MDBX_ENOMEM;
 22436          goto done;
 22437        }
 22438  
 22439        const unsigned max_space = page_space(env);
 22440        const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata)
 22441                                            : branch_size(env, newkey);
 22442  
 22443        /* prepare to insert */
 22444        for (i = 0; i < newindx; ++i)
 22445          tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i];
 22446        tmp_ki_copy->mp_ptrs[i] = (indx_t)-1;
 22447        while (++i <= nkeys)
 22448          tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1];
 22449        tmp_ki_copy->mp_pgno = mp->mp_pgno;
 22450        tmp_ki_copy->mp_flags = mp->mp_flags;
 22451        tmp_ki_copy->mp_txnid = INVALID_TXNID;
 22452        tmp_ki_copy->mp_lower = 0;
 22453        tmp_ki_copy->mp_upper = (indx_t)max_space;
 22454  
 22455        /* Добавляемый узел может не поместиться в страницу-половину вместе
 22456         * с количественной половиной узлов из исходной страницы. В худшем случае,
 22457         * в страницу-половину с добавляемым узлом могут попасть самые больше узлы
 22458         * из исходной страницы, а другую половину только узлы с самыми короткими
 22459         * ключами и с пустыми данными. Поэтому, чтобы найти подходящую границу
 22460         * разреза требуется итерировать узлы и считая их объем.
 22461         *
 22462         * Однако, при простом количественном делении (без учета размера ключей
 22463         * и данных) на страницах-половинах будет примерно вдвое меньше узлов.
 22464         * Поэтому добавляемый узел точно поместится, если его размер не больше
 22465         * чем место "освобождающееся" от заголовков узлов, которые переедут
 22466         * в другую страницу-половину. Кроме этого, как минимум по одному байту
 22467         * будет в каждом ключе, в худшем случае кроме одного, который может быть
 22468         * нулевого размера. */
 22469  
 22470        if (newindx == split_indx && nkeys >= 5) {
 22471          STATIC_ASSERT(P_BRANCH == 1);
 22472          split_indx += mp->mp_flags & P_BRANCH;
 22473        }
 22474        eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys);
 22475        const unsigned dim_nodes =
 22476            (newindx >= split_indx) ? split_indx : nkeys - split_indx;
 22477        const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes;
 22478        if (new_size >= dim_used) {
 22479          /* Search for best acceptable split point */
 22480          i = (newindx < split_indx) ? 0 : nkeys;
 22481          int dir = (newindx < split_indx) ? 1 : -1;
 22482          size_t before = 0, after = new_size + page_used(env, mp);
 22483          unsigned best_split = split_indx;
 22484          unsigned best_shift = INT_MAX;
 22485  
 22486          TRACE("seek separator from %u, step %i, default %u, new-idx %u, "
 22487                "new-size %zu",
 22488                i, dir, split_indx, newindx, new_size);
 22489          do {
 22490            cASSERT(mc, i <= nkeys);
 22491            size_t size = new_size;
 22492            if (i != newindx) {
 22493              MDBX_node *node =
 22494                  (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
 22495              size = NODESIZE + node_ks(node) + sizeof(indx_t);
 22496              if (IS_LEAF(mp))
 22497                size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t)
 22498                                                       : node_ds(node);
 22499              size = EVEN(size);
 22500            }
 22501  
 22502            before += size;
 22503            after -= size;
 22504            TRACE("step %u, size %zu, before %zu, after %zu, max %u", i, size,
 22505                  before, after, max_space);
 22506  
 22507            if (before <= max_space && after <= max_space) {
 22508              const unsigned split = i + (dir > 0);
 22509              if (split >= minkeys && split <= nkeys + 1 - minkeys) {
 22510                const unsigned shift = branchless_abs(split_indx - split);
 22511                if (shift >= best_shift)
 22512                  break;
 22513                best_shift = shift;
 22514                best_split = split;
 22515                if (!best_shift)
 22516                  break;
 22517              }
 22518            }
 22519            i += dir;
 22520          } while (i < nkeys);
 22521  
 22522          split_indx = best_split;
 22523          TRACE("chosen %u", split_indx);
 22524        }
 22525        eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys);
 22526  
 22527        sepkey = *newkey;
 22528        if (split_indx != newindx) {
 22529          MDBX_node *node =
 22530              (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] +
 22531                            PAGEHDRSZ);
 22532          sepkey.iov_len = node_ks(node);
 22533          sepkey.iov_base = node_key(node);
 22534        }
 22535      }
 22536    }
 22537    DEBUG("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey));
 22538  
 22539    bool did_split_parent = false;
 22540    /* Copy separator key to the parent. */
 22541    if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) {
 22542      TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey));
 22543      cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2);
 22544      cASSERT(mc, !pure_left);
 22545      const int snum = mc->mc_snum;
 22546      const int depth = mc->mc_db->md_depth;
 22547      mn.mc_snum--;
 22548      mn.mc_top--;
 22549      did_split_parent = true;
 22550      /* We want other splits to find mn when doing fixups */
 22551      WITH_CURSOR_TRACKING(
 22552          mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0));
 22553      if (unlikely(rc != MDBX_SUCCESS))
 22554        goto done;
 22555      cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth);
 22556      if (AUDIT_ENABLED()) {
 22557        rc = cursor_check_updating(mc);
 22558        if (unlikely(rc != MDBX_SUCCESS))
 22559          goto done;
 22560      }
 22561  
 22562      /* root split? */
 22563      ptop += mc->mc_snum - snum;
 22564  
 22565      /* Right page might now have changed parent.
 22566       * Check if left page also changed parent. */
 22567      if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
 22568          mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
 22569        for (i = 0; i < ptop; i++) {
 22570          mc->mc_pg[i] = mn.mc_pg[i];
 22571          mc->mc_ki[i] = mn.mc_ki[i];
 22572        }
 22573        mc->mc_pg[ptop] = mn.mc_pg[ptop];
 22574        if (mn.mc_ki[ptop]) {
 22575          mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
 22576        } else {
 22577          /* find right page's left sibling */
 22578          mc->mc_ki[ptop] = mn.mc_ki[ptop];
 22579          rc = cursor_sibling(mc, SIBLING_LEFT);
 22580          if (unlikely(rc != MDBX_SUCCESS)) {
 22581            if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
 22582              ERROR("unexpected %i error going left sibling", rc);
 22583              rc = MDBX_PROBLEM;
 22584            }
 22585            goto done;
 22586          }
 22587        }
 22588      }
 22589    } else if (unlikely(pure_left)) {
 22590      MDBX_page *ptop_page = mc->mc_pg[ptop];
 22591      DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s",
 22592            ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno,
 22593            DKEY(mc->mc_ki[ptop] ? newkey : NULL));
 22594      mc->mc_top--;
 22595      rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? newkey : NULL,
 22596                           sister->mp_pgno);
 22597      cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] &&
 22598                      ptop == mc->mc_top);
 22599  
 22600      if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) {
 22601        DEBUG("update prev-first key on parent %s", DKEY(&sepkey));
 22602        MDBX_node *node = page_node(mc->mc_pg[ptop], 1);
 22603        cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno);
 22604        cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0);
 22605        mc->mc_ki[ptop] = 1;
 22606        rc = update_key(mc, &sepkey);
 22607        cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1);
 22608        cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]);
 22609        mc->mc_ki[ptop] = 0;
 22610      }
 22611  
 22612      mc->mc_top++;
 22613      if (unlikely(rc != MDBX_SUCCESS))
 22614        goto done;
 22615  
 22616      MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1);
 22617      cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page);
 22618    } else {
 22619      mn.mc_top--;
 22620      TRACE("add-to-parent the right-entry[%u] for new sibling-page",
 22621            mn.mc_ki[ptop]);
 22622      rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno);
 22623      mn.mc_top++;
 22624      if (unlikely(rc != MDBX_SUCCESS))
 22625        goto done;
 22626    }
 22627  
 22628    if (unlikely(pure_left | pure_right)) {
 22629      mc->mc_pg[mc->mc_top] = sister;
 22630      mc->mc_ki[mc->mc_top] = 0;
 22631      switch (PAGETYPE_WHOLE(sister)) {
 22632      case P_LEAF: {
 22633        cASSERT(mc, newpgno == 0 || newpgno == P_INVALID);
 22634        rc = node_add_leaf(mc, 0, newkey, newdata, naf);
 22635      } break;
 22636      case P_LEAF | P_LEAF2: {
 22637        cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
 22638        cASSERT(mc, newpgno == 0 || newpgno == P_INVALID);
 22639        rc = node_add_leaf2(mc, 0, newkey);
 22640      } break;
 22641      default:
 22642        rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister));
 22643      }
 22644      if (unlikely(rc != MDBX_SUCCESS))
 22645        goto done;
 22646  
 22647      if (pure_right) {
 22648        for (i = 0; i < mc->mc_top; i++)
 22649          mc->mc_ki[i] = mn.mc_ki[i];
 22650      } else if (mc->mc_ki[mc->mc_top - 1] == 0) {
 22651        for (i = 2; i <= mc->mc_top; ++i)
 22652          if (mc->mc_ki[mc->mc_top - i]) {
 22653            get_key(
 22654                page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]),
 22655                &sepkey);
 22656            if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) {
 22657              mc->mc_top -= (uint8_t)i;
 22658              DEBUG("update new-first on parent [%i] page %u key %s",
 22659                    mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno,
 22660                    DKEY(newkey));
 22661              rc = update_key(mc, newkey);
 22662              mc->mc_top += (uint8_t)i;
 22663              if (unlikely(rc != MDBX_SUCCESS))
 22664                goto done;
 22665            }
 22666            break;
 22667          }
 22668      }
 22669    } else if (!IS_LEAF2(mp)) {
 22670      /* Move nodes */
 22671      mc->mc_pg[mc->mc_top] = sister;
 22672      i = split_indx;
 22673      unsigned n = 0;
 22674      do {
 22675        TRACE("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno);
 22676        pgno_t pgno = 0;
 22677        MDBX_val *rdata = NULL;
 22678        if (i == newindx) {
 22679          rkey = *newkey;
 22680          if (IS_LEAF(mp))
 22681            rdata = newdata;
 22682          else
 22683            pgno = newpgno;
 22684          flags = naf;
 22685          /* Update index for the new key. */
 22686          mc->mc_ki[mc->mc_top] = (indx_t)n;
 22687        } else {
 22688          MDBX_node *node =
 22689              (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
 22690          rkey.iov_base = node_key(node);
 22691          rkey.iov_len = node_ks(node);
 22692          if (IS_LEAF(mp)) {
 22693            xdata.iov_base = node_data(node);
 22694            xdata.iov_len = node_ds(node);
 22695            rdata = &xdata;
 22696          } else
 22697            pgno = node_pgno(node);
 22698          flags = node_flags(node);
 22699        }
 22700  
 22701        switch (PAGETYPE_WHOLE(sister)) {
 22702        case P_BRANCH: {
 22703          cASSERT(mc, 0 == (uint16_t)flags);
 22704          /* First branch index doesn't need key data. */
 22705          rc = node_add_branch(mc, n, n ? &rkey : NULL, pgno);
 22706        } break;
 22707        case P_LEAF: {
 22708          cASSERT(mc, pgno == 0);
 22709          cASSERT(mc, rdata != NULL);
 22710          rc = node_add_leaf(mc, n, &rkey, rdata, flags);
 22711        } break;
 22712        /* case P_LEAF | P_LEAF2: {
 22713          cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
 22714          cASSERT(mc, gno == 0);
 22715          rc = mdbx_node_add_leaf2(mc, n, &rkey);
 22716        } break; */
 22717        default:
 22718          rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister));
 22719        }
 22720        if (unlikely(rc != MDBX_SUCCESS))
 22721          goto done;
 22722  
 22723        ++n;
 22724        if (++i > nkeys) {
 22725          i = 0;
 22726          n = 0;
 22727          mc->mc_pg[mc->mc_top] = tmp_ki_copy;
 22728          TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno);
 22729        }
 22730      } while (i != split_indx);
 22731  
 22732      TRACE("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n,
 22733            mc->mc_pg[mc->mc_top]->mp_pgno);
 22734  
 22735      nkeys = page_numkeys(tmp_ki_copy);
 22736      for (i = 0; i < nkeys; i++)
 22737        mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i];
 22738      mp->mp_lower = tmp_ki_copy->mp_lower;
 22739      mp->mp_upper = tmp_ki_copy->mp_upper;
 22740      memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1),
 22741             env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ);
 22742  
 22743      /* reset back to original page */
 22744      if (newindx < split_indx) {
 22745        mc->mc_pg[mc->mc_top] = mp;
 22746      } else {
 22747        mc->mc_pg[mc->mc_top] = sister;
 22748        mc->mc_ki[ptop]++;
 22749        /* Make sure mc_ki is still valid. */
 22750        if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
 22751            mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
 22752          for (i = 0; i <= ptop; i++) {
 22753            mc->mc_pg[i] = mn.mc_pg[i];
 22754            mc->mc_ki[i] = mn.mc_ki[i];
 22755          }
 22756        }
 22757      }
 22758    } else if (newindx >= split_indx) {
 22759      mc->mc_pg[mc->mc_top] = sister;
 22760      mc->mc_ki[ptop]++;
 22761      /* Make sure mc_ki is still valid. */
 22762      if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
 22763          mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
 22764        for (i = 0; i <= ptop; i++) {
 22765          mc->mc_pg[i] = mn.mc_pg[i];
 22766          mc->mc_ki[i] = mn.mc_ki[i];
 22767        }
 22768      }
 22769    }
 22770  
 22771    /* Adjust other cursors pointing to mp and/or to parent page */
 22772    nkeys = page_numkeys(mp);
 22773    for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2;
 22774         m2 = m2->mc_next) {
 22775      MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
 22776      if (m3 == mc)
 22777        continue;
 22778      if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
 22779        continue;
 22780      if (foliage) {
 22781        /* sub cursors may be on different DB */
 22782        if (m3->mc_pg[0] != mp)
 22783          continue;
 22784        /* root split */
 22785        for (int k = foliage; k >= 0; k--) {
 22786          m3->mc_ki[k + 1] = m3->mc_ki[k];
 22787          m3->mc_pg[k + 1] = m3->mc_pg[k];
 22788        }
 22789        m3->mc_ki[0] = m3->mc_ki[0] >= nkeys;
 22790        m3->mc_pg[0] = mc->mc_pg[0];
 22791        m3->mc_snum++;
 22792        m3->mc_top++;
 22793      }
 22794  
 22795      if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) {
 22796        if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE))
 22797          m3->mc_ki[mc->mc_top]++;
 22798        if (m3->mc_ki[mc->mc_top] >= nkeys) {
 22799          m3->mc_pg[mc->mc_top] = sister;
 22800          cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys);
 22801          m3->mc_ki[mc->mc_top] -= (indx_t)nkeys;
 22802          for (i = 0; i < mc->mc_top; i++) {
 22803            m3->mc_ki[i] = mn.mc_ki[i];
 22804            m3->mc_pg[i] = mn.mc_pg[i];
 22805          }
 22806        }
 22807      } else if (!did_split_parent && m3->mc_top >= ptop &&
 22808                 m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
 22809                 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
 22810        m3->mc_ki[ptop]++; /* also for the `pure-left` case */
 22811      }
 22812      if (XCURSOR_INITED(m3) && IS_LEAF(mp))
 22813        XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
 22814    }
 22815    TRACE("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp),
 22816          sister->mp_pgno, page_room(sister));
 22817  
 22818  done:
 22819    if (tmp_ki_copy)
 22820      dpage_free(env, tmp_ki_copy, 1);
 22821  
 22822    if (unlikely(rc != MDBX_SUCCESS))
 22823      mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
 22824    else {
 22825      if (AUDIT_ENABLED())
 22826        rc = cursor_check_updating(mc);
 22827      if (unlikely(naf & MDBX_RESERVE)) {
 22828        MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
 22829        if (!(node_flags(node) & F_BIGDATA))
 22830          newdata->iov_base = node_data(node);
 22831      }
 22832  #if MDBX_ENABLE_PGOP_STAT
 22833      env->me_lck->mti_pgop_stat.split.weak += 1;
 22834  #endif /* MDBX_ENABLE_PGOP_STAT */
 22835    }
 22836  
 22837    DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc);
 22838    return rc;
 22839  }
 22840  
 22841  int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
 22842               unsigned flags) {
 22843    int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
 22844    if (unlikely(rc != MDBX_SUCCESS))
 22845      return rc;
 22846  
 22847    if (unlikely(!key || !data))
 22848      return MDBX_EINVAL;
 22849  
 22850    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 22851      return MDBX_BAD_DBI;
 22852  
 22853    if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
 22854                           MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND |
 22855                           MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE)))
 22856      return MDBX_EINVAL;
 22857  
 22858    if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
 22859      return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
 22860  
 22861    MDBX_cursor_couple cx;
 22862    rc = cursor_init(&cx.outer, txn, dbi);
 22863    if (unlikely(rc != MDBX_SUCCESS))
 22864      return rc;
 22865    cx.outer.mc_next = txn->mt_cursors[dbi];
 22866    txn->mt_cursors[dbi] = &cx.outer;
 22867  
 22868    /* LY: support for update (explicit overwrite) */
 22869    if (flags & MDBX_CURRENT) {
 22870      rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET);
 22871      if (likely(rc == MDBX_SUCCESS) &&
 22872          (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) &&
 22873          (flags & MDBX_ALLDUPS) == 0) {
 22874        /* LY: allows update (explicit overwrite) only for unique keys */
 22875        MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
 22876                                    cx.outer.mc_ki[cx.outer.mc_top]);
 22877        if (node_flags(node) & F_DUPDATA) {
 22878          tASSERT(txn, XCURSOR_INITED(&cx.outer) &&
 22879                           cx.outer.mc_xcursor->mx_db.md_entries > 1);
 22880          rc = MDBX_EMULTIVAL;
 22881        }
 22882      }
 22883    }
 22884  
 22885    if (likely(rc == MDBX_SUCCESS))
 22886      rc = mdbx_cursor_put(&cx.outer, key, data, flags);
 22887    txn->mt_cursors[dbi] = cx.outer.mc_next;
 22888  
 22889    return rc;
 22890  }
 22891  
 22892  /**** COPYING *****************************************************************/
 22893  
/* State needed for a double-buffering compacting copy.
 *
 * Two write buffers are filled by the walking/compacting thread and drained
 * by a dedicated writer thread.  `mc_head` and `mc_tail` are monotonically
 * increasing producer/consumer counters; `counter & 1` selects which of the
 * two buffers is in play, so `mc_head - mc_tail` is the number of buffers
 * currently handed to the writer (0..2). */
typedef struct mdbx_compacting_ctx {
  MDBX_env *mc_env;            /* environment being copied */
  MDBX_txn *mc_txn;            /* txn the copy runs under (supplies mt_txnid) */
  osal_condpair_t mc_condpair; /* lock + condition pair for buffer hand-off */
  uint8_t *mc_wbuf[2];         /* the two ping-pong write buffers */
  size_t mc_wlen[2];           /* bytes currently used in each buffer */
  mdbx_filehandle_t mc_fd;     /* destination file handle for osal_write() */
  /* Error code.  Never cleared if set.  Both threads can set nonzero
   * to fail the copy.  Not mutex-protected, MDBX expects atomic int. */
  volatile int mc_error;
  pgno_t mc_next_pgno;       /* next page number to assign in the output */
  volatile unsigned mc_head; /* producer counter: buffers published */
  volatile unsigned mc_tail; /* consumer counter: buffers written out */
} mdbx_compacting_ctx;
 22909  
/* Dedicated writer thread for compacting copy.
 *
 * Drains buffers produced by the compacting walker: waits until mc_head gets
 * ahead of mc_tail, writes the corresponding buffer to mc_fd, then advances
 * mc_tail and signals the producer.  A published buffer with zero length is
 * the EOF marker.  Any failure is latched into ctx->mc_error. */
__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
  mdbx_compacting_ctx *const ctx = arg;

#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
  /* Block SIGPIPE so a write to a closed pipe surfaces as an EPIPE error
   * from osal_write() instead of terminating the process. */
  sigset_t sigset;
  sigemptyset(&sigset);
  sigaddset(&sigset, SIGPIPE);
  ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
#endif /* EPIPE */

  osal_condpair_lock(&ctx->mc_condpair);
  while (!ctx->mc_error) {
    /* Wait for the producer to publish a buffer (or report an error). */
    while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) {
      int err = osal_condpair_wait(&ctx->mc_condpair, true);
      if (err != MDBX_SUCCESS) {
        ctx->mc_error = err;
        goto bailout;
      }
    }
    const unsigned toggle = ctx->mc_tail & 1;
    size_t wsize = ctx->mc_wlen[toggle];
    if (wsize == 0) {
      /* Zero-length buffer == end-of-stream; consume it and stop. */
      ctx->mc_tail += 1;
      break /* EOF */;
    }
    ctx->mc_wlen[toggle] = 0;
    uint8_t *ptr = ctx->mc_wbuf[toggle];
    if (!ctx->mc_error) {
      int err = osal_write(ctx->mc_fd, ptr, wsize);
      if (err != MDBX_SUCCESS) {
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
        if (err == EPIPE) {
          /* Collect the pending SIGPIPE,
           * otherwise at least OS X gives it to the process on thread-exit. */
          int unused;
          sigwait(&sigset, &unused);
        }
#endif /* EPIPE */
        ctx->mc_error = err;
        goto bailout;
      }
    }
    /* Buffer drained: advance the consumer counter and wake the producer
     * which may be blocked waiting for a free buffer. */
    ctx->mc_tail += 1;
    osal_condpair_signal(&ctx->mc_condpair, false);
  }
bailout:
  osal_condpair_unlock(&ctx->mc_condpair);
  return (THREAD_RESULT)0;
}
 22960  
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer.
 *
 * Publishes the currently filled buffer by advancing mc_head, wakes the
 * writer thread, then blocks while both buffers are still in flight
 * (mc_head - mc_tail == 2).  Returns the sticky ctx->mc_error, which is
 * MDBX_SUCCESS when everything is fine. */
__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) {
  osal_condpair_lock(&ctx->mc_condpair);
  /* At most two buffers may ever be outstanding (unless already failed). */
  eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error);
  ctx->mc_head += 1; /* hand the current buffer over to the writer */
  osal_condpair_signal(&ctx->mc_condpair, true);
  while (!ctx->mc_error &&
         ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) {
    int err = osal_condpair_wait(&ctx->mc_condpair, false);
    if (err != MDBX_SUCCESS)
      ctx->mc_error = err;
  }
  osal_condpair_unlock(&ctx->mc_condpair);
  return ctx->mc_error;
}
 22976  
 22977  __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb);
 22978  
 22979  static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src,
 22980                                  size_t bytes, pgno_t pgno, pgno_t npages) {
 22981    assert(pgno == 0 || bytes > PAGEHDRSZ);
 22982    while (bytes > 0) {
 22983      const unsigned side = ctx->mc_head & 1;
 22984      const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side];
 22985      if (left < (pgno ? PAGEHDRSZ : 1)) {
 22986        int err = compacting_toggle_write_buffers(ctx);
 22987        if (unlikely(err != MDBX_SUCCESS))
 22988          return err;
 22989        continue;
 22990      }
 22991      const size_t chunk = (bytes < left) ? bytes : left;
 22992      void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side];
 22993      if (src) {
 22994        memcpy(dst, src, chunk);
 22995        if (pgno) {
 22996          assert(chunk > PAGEHDRSZ);
 22997          MDBX_page *mp = dst;
 22998          mp->mp_pgno = pgno;
 22999          if (mp->mp_txnid == 0)
 23000            mp->mp_txnid = ctx->mc_txn->mt_txnid;
 23001          if (mp->mp_flags == P_OVERFLOW) {
 23002            assert(bytes <= pgno2bytes(ctx->mc_env, npages));
 23003            mp->mp_pages = npages;
 23004          }
 23005          pgno = 0;
 23006        }
 23007        src = (const char *)src + chunk;
 23008      } else
 23009        memset(dst, 0, chunk);
 23010      bytes -= chunk;
 23011      ctx->mc_wlen[side] += chunk;
 23012    }
 23013    return MDBX_SUCCESS;
 23014  }
 23015  
 23016  static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp,
 23017                                 const size_t head_bytes, const size_t tail_bytes,
 23018                                 const pgno_t npages) {
 23019    if (tail_bytes) {
 23020      assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize);
 23021      assert(npages == 1 &&
 23022             (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF));
 23023    } else {
 23024      assert(head_bytes <= pgno2bytes(ctx->mc_env, npages));
 23025      assert((npages == 1 && PAGETYPE_WHOLE(mp) == (P_LEAF | P_LEAF2)) ||
 23026             PAGETYPE_WHOLE(mp) == P_OVERFLOW);
 23027    }
 23028  
 23029    const pgno_t pgno = ctx->mc_next_pgno;
 23030    ctx->mc_next_pgno += npages;
 23031    int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages);
 23032    if (unlikely(err != MDBX_SUCCESS))
 23033      return err;
 23034    err = compacting_put_bytes(
 23035        ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes),
 23036        0, 0);
 23037    if (unlikely(err != MDBX_SUCCESS))
 23038      return err;
 23039    return compacting_put_bytes(
 23040        ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0,
 23041        0);
 23042  }
 23043  
/* Depth-first, bottom-up walk over one b-tree that re-writes every page of
 * the tree into the compacting output stream via compacting_put_page().
 * Child pages are always emitted before their parent, so the parent's child
 * pgno can be patched (node_set_pgno) to the new location just before the
 * parent itself is written.  On success *root is updated to the root's new
 * page number.
 *
 * \param ctx           compaction context (destination page allocator/buffers).
 * \param mc            throw-away cursor over the tree being copied.
 * \param root          in: current root pgno; out: relocated root pgno.
 * \param parent_txnid  txnid used for page validation of the root read.
 * \returns MDBX_SUCCESS or an error code (MDBX_ENOMEM, MDBX_CORRUPTED, ...). */
__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
                                       MDBX_cursor *mc, pgno_t *root,
                                       txnid_t parent_txnid) {
  mc->mc_snum = 1;
  int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* descend to the leftmost leaf; fills mc_pg[]/mc_ki[] down the path */
  rc = page_search_root(mc, nullptr, MDBX_PS_FIRST);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Make cursor pages writable */
  /* one scratch page per level: mc_snum-1 branch copies + 1 leaf slot */
  char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum));
  if (buf == NULL)
    return MDBX_ENOMEM;

  /* copy every page on the path except the leaf into writable scratch
   * space, so child-pgno fixups can be applied in place */
  char *ptr = buf;
  for (unsigned i = 0; i < mc->mc_top; i++) {
    page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize);
    mc->mc_pg[i] = (MDBX_page *)ptr;
    ptr += ctx->mc_env->me_psize;
  }
  /* This is writable space for a leaf page. Usually not needed. */
  MDBX_page *const leaf = (MDBX_page *)ptr;

  while (mc->mc_snum > 0) {
    MDBX_page *mp = mc->mc_pg[mc->mc_top];
    unsigned n = page_numkeys(mp);

    if (IS_LEAF(mp)) {
      if (!(mc->mc_flags &
            C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) {
        for (unsigned i = 0; i < n; i++) {
          MDBX_node *node = page_node(mp, i);
          if (node_flags(node) == F_BIGDATA) {
            /* Need writable leaf */
            /* the big-data pgno inside the node must be rewritten,
             * so copy the leaf into scratch space lazily, once */
            if (mp != leaf) {
              mc->mc_pg[mc->mc_top] = leaf;
              page_copy(leaf, mp, ctx->mc_env->me_psize);
              mp = leaf;
              node = page_node(mp, i);
            }

            const pgr_t lp =
                page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid);
            if (unlikely((rc = lp.err) != MDBX_SUCCESS))
              goto done;
            const size_t datasize = node_ds(node);
            const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize);
            /* point the node at the overflow chain's new location */
            poke_pgno(node_data(node), ctx->mc_next_pgno);
            rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0,
                                     npages);
            if (unlikely(rc != MDBX_SUCCESS))
              goto done;
          } else if (node_flags(node) & F_SUBDATA) {
            if (!MDBX_DISABLE_VALIDATION &&
                unlikely(node_ds(node) != sizeof(MDBX_db))) {
              rc = MDBX_CORRUPTED;
              goto done;
            }

            /* Need writable leaf */
            if (mp != leaf) {
              mc->mc_pg[mc->mc_top] = leaf;
              page_copy(leaf, mp, ctx->mc_env->me_psize);
              mp = leaf;
              node = page_node(mp, i);
            }

            /* recurse into the nested tree, then write the updated
             * MDBX_db record (with its relocated root) back into the node */
            MDBX_db *nested = nullptr;
            if (node_flags(node) & F_DUPDATA) {
              rc = cursor_xinit1(mc, node, mp);
              if (likely(rc == MDBX_SUCCESS)) {
                nested = &mc->mc_xcursor->mx_db;
                rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor,
                                          &nested->md_root, mp->mp_txnid);
              }
            } else {
              cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0);
              MDBX_cursor_couple *couple =
                  container_of(mc, MDBX_cursor_couple, outer);
              cASSERT(mc,
                      couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE &&
                          !couple->inner.mx_cursor.mc_flags &&
                          !couple->inner.mx_cursor.mc_db &&
                          !couple->inner.mx_cursor.mc_dbx);
              nested = &couple->inner.mx_db;
              memcpy(nested, node_data(node), sizeof(MDBX_db));
              rc = compacting_walk_sdb(ctx, nested);
            }
            if (unlikely(rc != MDBX_SUCCESS))
              goto done;
            memcpy(node_data(node), nested, sizeof(MDBX_db));
          }
        }
      }
    } else {
      /* branch page: advance to the next child, descending to its
       * leftmost leaf before emitting anything */
      mc->mc_ki[mc->mc_top]++;
      if (mc->mc_ki[mc->mc_top] < n) {
        while (1) {
          const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
          rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
          if (unlikely(rc != MDBX_SUCCESS))
            goto done;
          mc->mc_top++;
          mc->mc_snum++;
          mc->mc_ki[mc->mc_top] = 0;
          if (!IS_BRANCH(mp)) {
            mc->mc_pg[mc->mc_top] = mp;
            break;
          }
          /* Whenever we advance to a sibling branch page,
           * we must proceed all the way down to its first leaf. */
          page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize);
        }
        continue;
      }
    }

    /* emit the finished page: branch/leaf pages have a head (header +
     * node offsets up to mp_lower) and a tail (nodes from mp_upper up),
     * leaf2 pages are a single contiguous run of fixed-size keys */
    const pgno_t pgno = ctx->mc_next_pgno;
    if (likely(!IS_LEAF2(mp))) {
      rc = compacting_put_page(
          ctx, mp, PAGEHDRSZ + mp->mp_lower,
          ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1);
    } else {
      rc = compacting_put_page(
          ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;

    if (mc->mc_top) {
      /* Update parent if there is one */
      node_set_pgno(
          page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]),
          pgno);
      cursor_pop(mc);
    } else {
      /* Otherwise we're done */
      *root = pgno;
      break;
    }
  }
done:
  osal_free(buf);
  return rc;
}
 23192  
/* Compact one sub-database: set up a throw-away cursor couple over `sdb`
 * and rewrite its whole b-tree into the output stream via
 * compacting_walk_tree().  On success sdb->md_root holds the relocated
 * root page number.  Mutually recursive with compacting_walk_tree() for
 * nested (non-dupsort) sub-databases. */
__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) {
  if (unlikely(sdb->md_root == P_INVALID))
    return MDBX_SUCCESS; /* empty db */

  MDBX_cursor_couple couple;
  memset(&couple, 0, sizeof(couple));
  /* mark the inner cursor as not-live; compacting_walk_tree asserts this
   * before re-using the couple's inner slot for nested sub-databases */
  couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE;
  MDBX_dbx dbx = {.md_klen_min = INT_MAX};
  uint8_t dbistate = DBI_VALID | DBI_AUDITED;
  int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* pages are visited in copy order, not key order, so relax the
   * cursor's ordering and page checks */
  couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
  couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
  if (!sdb->md_mod_txnid)
    sdb->md_mod_txnid = ctx->mc_txn->mt_txnid;
  return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root,
                              sdb->md_mod_txnid);
}
 23213  
 23214  __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
 23215    eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid ||
 23216                     meta->mm_dbs[FREE_DBI].md_root == P_INVALID);
 23217    eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid ||
 23218                     meta->mm_dbs[MAIN_DBI].md_root == P_INVALID);
 23219  
 23220    /* Calculate filesize taking in account shrink/growing thresholds */
 23221    if (meta->mm_geo.next != meta->mm_geo.now) {
 23222      meta->mm_geo.now = meta->mm_geo.next;
 23223      const pgno_t aligner = pv2pages(
 23224          meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv);
 23225      if (aligner) {
 23226        const pgno_t aligned = pgno_align2os_pgno(
 23227            env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner);
 23228        meta->mm_geo.now = aligned;
 23229      }
 23230    }
 23231  
 23232    if (meta->mm_geo.now < meta->mm_geo.lower)
 23233      meta->mm_geo.now = meta->mm_geo.lower;
 23234    if (meta->mm_geo.now > meta->mm_geo.upper)
 23235      meta->mm_geo.now = meta->mm_geo.upper;
 23236  
 23237    /* Update signature */
 23238    assert(meta->mm_geo.now >= meta->mm_geo.next);
 23239    unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta));
 23240  }
 23241  
 23242  /* Make resizeable */
 23243  __cold static void meta_make_sizeable(MDBX_meta *meta) {
 23244    meta->mm_geo.lower = MIN_PAGENO;
 23245    if (meta->mm_geo.grow_pv == 0) {
 23246      const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
 23247      meta->mm_geo.grow_pv = pages2pv(step);
 23248    }
 23249    if (meta->mm_geo.shrink_pv == 0) {
 23250      const pgno_t step = pv2pages(meta->mm_geo.grow_pv) << 1;
 23251      meta->mm_geo.shrink_pv = pages2pv(step);
 23252    }
 23253  }
 23254  
/* Copy environment with compaction. */
/* Produces a compacted copy of `env` into `fd`: builds fresh meta pages in
 * `buffer`, rewrites the whole MAIN_DBI tree (dropping GC/free pages) with
 * a background writer thread double-buffering the output, then pads the
 * file up to the computed geometry.  When `dest_is_pipe`, meta pages must
 * be streamed first and cannot be fixed up afterwards. */
__cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn,
                              mdbx_filehandle_t fd, uint8_t *buffer,
                              const bool dest_is_pipe, const int flags) {
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  /* data buffers follow the meta snapshot, OS-page aligned */
  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, env->me_os_psize);
  MDBX_meta *const meta = init_metas(env, buffer);
  meta_set_txnid(env, meta, read_txn->mt_txnid);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    meta_make_sizeable(meta);

  /* copy canary sequences if present */
  if (read_txn->mt_canary.v) {
    meta->mm_canary = read_txn->mt_canary;
    meta->mm_canary.v = constmeta_txnid(meta);
  }

  if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) {
    /* When the DB is empty, handle it specially to
     * fix any breakage like page leaks from ITS#8174. */
    meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags;
    compacting_fixup_meta(env, meta);
    if (dest_is_pipe) {
      int rc = osal_write(fd, buffer, meta_bytes);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
  } else {
    /* Count free pages + GC pages. */
    MDBX_cursor_couple couple;
    int rc = cursor_init(&couple.outer, read_txn, FREE_DBI);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* pages occupied by the GC tree itself */
    pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages +
                read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
                read_txn->mt_dbs[FREE_DBI].md_overflow_pages;
    MDBX_val key, data;
    /* plus every page listed inside the GC records */
    while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) ==
           MDBX_SUCCESS) {
      const MDBX_PNL pnl = data.iov_base;
      if (unlikely(data.iov_len % sizeof(pgno_t) ||
                   data.iov_len < MDBX_PNL_SIZEOF(pnl) ||
                   !(pnl_check(pnl, read_txn->mt_next_pgno))))
        return MDBX_CORRUPTED;
      gc += MDBX_PNL_SIZE(pnl);
    }
    if (unlikely(rc != MDBX_NOTFOUND))
      return rc;

    /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. */
    meta->mm_geo.next = read_txn->mt_next_pgno - gc;
    /* Set with current main DB */
    meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI];

    mdbx_compacting_ctx ctx;
    memset(&ctx, 0, sizeof(ctx));
    rc = osal_condpair_init(&ctx.mc_condpair);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    /* two write buffers toggled between the walker and the writer thread */
    memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
    ctx.mc_wbuf[0] = data_buffer;
    ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
    ctx.mc_next_pgno = NUM_METAS;
    ctx.mc_env = env;
    ctx.mc_fd = fd;
    ctx.mc_txn = read_txn;

    osal_thread_t thread;
    int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx);
    if (likely(thread_err == MDBX_SUCCESS)) {
      if (dest_is_pipe) {
        /* a pipe cannot be rewound: finalize and emit the metas now */
        if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid)
          meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid;
        compacting_fixup_meta(env, meta);
        rc = osal_write(fd, buffer, meta_bytes);
      }
      if (likely(rc == MDBX_SUCCESS))
        rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]);
      if (ctx.mc_wlen[ctx.mc_head & 1])
        /* toggle to flush non-empty buffers */
        compacting_toggle_write_buffers(&ctx);

      /* cross-check the page count written against the GC accounting */
      if (likely(rc == MDBX_SUCCESS) &&
          unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) {
        if (ctx.mc_next_pgno > meta->mm_geo.next) {
          ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO
                " %c expected %" PRIaPGNO,
                "has double-used pages or other corruption", ctx.mc_next_pgno,
                '>', meta->mm_geo.next);
          rc = MDBX_CORRUPTED; /* corrupted DB */
        }
        if (ctx.mc_next_pgno < meta->mm_geo.next) {
          WARNING(
              "the source DB %s: post-compactification used pages %" PRIaPGNO
              " %c expected %" PRIaPGNO,
              "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next);
          if (dest_is_pipe)
            /* the root within already written meta-pages is wrong */
            rc = MDBX_CORRUPTED;
        }
        /* fixup meta */
        meta->mm_geo.next = ctx.mc_next_pgno;
      }

      /* toggle with empty buffers to exit thread's loop */
      eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0);
      compacting_toggle_write_buffers(&ctx);
      thread_err = osal_thread_join(thread);
      eASSERT(env, (ctx.mc_tail == ctx.mc_head &&
                    ctx.mc_wlen[ctx.mc_head & 1] == 0) ||
                       ctx.mc_error);
      osal_condpair_destroy(&ctx.mc_condpair);
    }
    if (unlikely(thread_err != MDBX_SUCCESS))
      return thread_err;
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    if (unlikely(ctx.mc_error != MDBX_SUCCESS))
      return ctx.mc_error;
    if (!dest_is_pipe)
      compacting_fixup_meta(env, meta);
  }

  /* Extend file if required */
  if (meta->mm_geo.now != meta->mm_geo.next) {
    const size_t whole_size = pgno2bytes(env, meta->mm_geo.now);
    if (!dest_is_pipe)
      return osal_ftruncate(fd, whole_size);

    /* a pipe cannot be truncated/extended: write explicit zero padding */
    const size_t used_size = pgno2bytes(env, meta->mm_geo.next);
    memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
    for (size_t offset = used_size; offset < whole_size;) {
      const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
                               ? (size_t)MDBX_ENVCOPY_WRITEBUF
                               : whole_size - offset;
      /* copy to avoid EFAULT in case swapped-out */
      int rc = osal_write(fd, data_buffer, chunk);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      offset += chunk;
    }
  }
  return MDBX_SUCCESS;
}
 23402  
/* Copy environment as-is. */
/* Byte-for-byte copy of `env` into `fd`: snapshots the meta pages under the
 * writer lock, marks the recent meta steady in the snapshot, then streams
 * the data region — preferring sendfile(2) for pipes and copy_file_range(2)
 * for regular files, with a plain buffered-write fallback — and finally
 * extends the destination to the full geometry. */
__cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
                                mdbx_filehandle_t fd, uint8_t *buffer,
                                const bool dest_is_pipe, const int flags) {
  /* We must start the actual read txn after blocking writers */
  int rc = txn_end(read_txn, MDBX_END_RESET_TMP);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Temporarily block writers until we snapshot the meta pages */
  rc = mdbx_txn_lock(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = txn_renew(read_txn, MDBX_TXN_RDONLY);
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_txn_unlock(env);
    return rc;
  }

  jitter4testing(false);
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  const meta_troika_t troika = meta_tap(env);
  /* Make a snapshot of meta-pages,
   * but writing ones after the data was flushed */
  memcpy(buffer, env->me_map, meta_bytes);
  MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */
      (MDBX_meta *)(buffer +
                    ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map));
  mdbx_txn_unlock(env);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    meta_make_sizeable(headcopy);
  /* Update signature to steady */
  unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy));

  /* Copy the data */
  const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno);
  const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno);
  jitter4testing(false);

  if (dest_is_pipe)
    rc = osal_write(fd, buffer, meta_bytes);

  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, env->me_os_psize);
#if MDBX_USE_COPYFILERANGE
  /* remembered across calls: once the syscall reports ENOSYS-like
   * failure it is never retried in this process */
  static bool copyfilerange_unavailable;
  bool not_the_same_filesystem = false;
#endif /* MDBX_USE_COPYFILERANGE */
  for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) {
#if MDBX_USE_SENDFILE
    static bool sendfile_unavailable;
    if (dest_is_pipe && likely(!sendfile_unavailable)) {
      off_t in_offset = offset;
      const ssize_t written =
          sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset);
      if (likely(written > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE)
        break;
      /* ENOSYS-like failure: fall through to the portable path */
      sendfile_unavailable = true;
    }
#endif /* MDBX_USE_SENDFILE */

#if MDBX_USE_COPYFILERANGE
    if (!dest_is_pipe && !not_the_same_filesystem &&
        likely(!copyfilerange_unavailable)) {
      off_t in_offset = offset, out_offset = offset;
      ssize_t bytes_copied = copy_file_range(
          env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
      if (likely(bytes_copied > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (bytes_copied == 0)
        break;
      rc = errno;
      if (rc == EXDEV)
        /* source and destination on different filesystems:
         * keep using the portable path for the rest of this copy */
        not_the_same_filesystem = true;
      else if (ignore_enosys(rc) == MDBX_RESULT_TRUE)
        copyfilerange_unavailable = true;
      else
        break;
    }
#endif /* MDBX_USE_COPYFILERANGE */

    /* fallback to portable */
    const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset)
                             ? (size_t)MDBX_ENVCOPY_WRITEBUF
                             : used_size - offset;
    /* copy to avoid EFAULT in case swapped-out */
    memcpy(data_buffer, env->me_map + offset, chunk);
    rc = osal_write(fd, data_buffer, chunk);
    offset += chunk;
  }

  /* Extend file if required */
  if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
    if (!dest_is_pipe)
      rc = osal_ftruncate(fd, whole_size);
    else {
      /* a pipe cannot be extended: write explicit zero padding */
      memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
      for (size_t offset = used_size;
           rc == MDBX_SUCCESS && offset < whole_size;) {
        const size_t chunk =
            ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
                ? (size_t)MDBX_ENVCOPY_WRITEBUF
                : whole_size - offset;
        /* copy to avoid EFAULT in case swapped-out */
        rc = osal_write(fd, data_buffer, chunk);
        offset += chunk;
      }
    }
  }

  return rc;
}
 23525  
/* Copy the environment into an already-opened file descriptor.
 * For seekable destinations a deliberately invalid meta stub is written
 * first, so a partially completed copy can never be mistaken for a valid
 * database; the real meta pages are written (and synced) only after the
 * data copy succeeded.  Pipes get the real metas streamed in-line instead. */
__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
                            unsigned flags) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const int dest_is_pipe = osal_is_pipe(fd);
  if (MDBX_IS_ERROR(dest_is_pipe))
    return dest_is_pipe;

  if (!dest_is_pipe) {
    rc = osal_fseek(fd, 0);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  /* room for the meta snapshot plus one (as-is) or two (compacting,
   * double-buffered) write buffers */
  const size_t buffer_size =
      pgno_align2os_bytes(env, NUM_METAS) +
      ceil_powerof2(((flags & MDBX_CP_COMPACT)
                         ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF
                         : (size_t)MDBX_ENVCOPY_WRITEBUF),
                    env->me_os_psize);

  uint8_t *buffer = NULL;
  rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_txn *read_txn = NULL;
  /* Do the lock/unlock of the reader mutex before starting the
   * write txn. Otherwise other read txns could block writers. */
  rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn);
  if (unlikely(rc != MDBX_SUCCESS)) {
    osal_memalign_free(buffer);
    return rc;
  }

  if (!dest_is_pipe) {
    /* Firstly write a stub to meta-pages.
     * Now we sure to incomplete copy will not be used. */
    memset(buffer, -1, pgno2bytes(env, NUM_METAS));
    rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS));
  }

  if (likely(rc == MDBX_SUCCESS)) {
    memset(buffer, 0, pgno2bytes(env, NUM_METAS));
    rc = ((flags & MDBX_CP_COMPACT) ? env_compact : env_copy_asis)(
        env, read_txn, fd, buffer, dest_is_pipe, flags);
  }
  mdbx_txn_abort(read_txn);

  if (!dest_is_pipe) {
    /* sync data before replacing the stub, then sync the metas:
     * ensures the file is never valid-looking but incomplete */
    if (likely(rc == MDBX_SUCCESS))
      rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);

    /* Write actual meta */
    if (likely(rc == MDBX_SUCCESS))
      rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);

    if (likely(rc == MDBX_SUCCESS))
      rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  }

  osal_memalign_free(buffer);
  return rc;
}
 23592  
/* Copy the environment to a new file at `dest_path`.
 * NOTE: on Windows the narrow-char entry converts the path and forwards to
 * mdbx_env_copyW(); the #if below splices the two signatures onto one
 * shared body.  The destination file is created, exclusively locked to
 * guard against concurrent copies, filled via mdbx_env_copy2fd(), and
 * removed again on any failure. */
__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path,
                         MDBX_copy_flags_t flags) {
#if defined(_WIN32) || defined(_WIN64)
  const wchar_t *dest_pathW = nullptr;
  OSAL_MB2WIDE(dest_path, dest_pathW);
  return mdbx_env_copyW(env, dest_pathW, flags);
}

LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path,
                               MDBX_copy_flags_t flags) {
#endif /* Windows */

  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!dest_path))
    return MDBX_EINVAL;

  /* The destination path must exist, but the destination file must not.
   * We don't want the OS to cache the writes, since the source data is
   * already in the OS cache. */
  mdbx_filehandle_t newfd;
  rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd,
#if defined(_WIN32) || defined(_WIN64)
                     (mdbx_mode_t)-1
#else
                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
#endif
  );

  /* take an exclusive lock on the new file so a concurrent copy (or an
   * accidental open of the half-written file) fails fast */
  if (rc == MDBX_SUCCESS) {
#if defined(_WIN32) || defined(_WIN64)
    OVERLAPPED ov;
    memset(&ov, 0, sizeof(ov));
    if (!LockFileEx(newfd, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY,
                    0, 0, INT32_MAX, &ov))
      rc = GetLastError();
#else
    struct flock lock_op;
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = F_WRLCK;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = 0;
    lock_op.l_len =
        (sizeof(lock_op.l_len) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff;
    if (fcntl(newfd, F_SETLK, &lock_op)
#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) &&      \
    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24)
        || flock(newfd, LOCK_EX | LOCK_NB)
#endif /* Linux */
    )
      rc = errno;
#endif /* Windows / POSIX */
  }

  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_copy2fd(env, newfd, flags);

  if (newfd != INVALID_HANDLE_VALUE) {
    int err = osal_closefile(newfd);
    if (rc == MDBX_SUCCESS && err != rc)
      rc = err;
    /* never leave a partial/invalid copy behind */
    if (rc != MDBX_SUCCESS)
      (void)osal_removefile(dest_path);
  }

  return rc;
}
 23662  
 23663  /******************************************************************************/
 23664  
 23665  __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
 23666                                bool onoff) {
 23667    int rc = check_env(env, false);
 23668    if (unlikely(rc != MDBX_SUCCESS))
 23669      return rc;
 23670  
 23671    if (unlikely(flags &
 23672                 ((env->me_flags & MDBX_ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS
 23673                                                    : ~ENV_USABLE_FLAGS)))
 23674      return MDBX_EPERM;
 23675  
 23676    if (unlikely(env->me_flags & MDBX_RDONLY))
 23677      return MDBX_EACCESS;
 23678  
 23679    if ((env->me_flags & MDBX_ENV_ACTIVE) &&
 23680        unlikely(env->me_txn0->mt_owner == osal_thread_self()))
 23681      return MDBX_BUSY;
 23682  
 23683    const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) &&
 23684                             env->me_txn0->mt_owner != osal_thread_self();
 23685    bool should_unlock = false;
 23686    if (lock_needed) {
 23687      rc = mdbx_txn_lock(env, false);
 23688      if (unlikely(rc))
 23689        return rc;
 23690      should_unlock = true;
 23691    }
 23692  
 23693    if (onoff)
 23694      env->me_flags = merge_sync_flags(env->me_flags, flags);
 23695    else
 23696      env->me_flags &= ~flags;
 23697  
 23698    if (should_unlock)
 23699      mdbx_txn_unlock(env);
 23700    return MDBX_SUCCESS;
 23701  }
 23702  
 23703  __cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) {
 23704    int rc = check_env(env, false);
 23705    if (unlikely(rc != MDBX_SUCCESS))
 23706      return rc;
 23707  
 23708    if (unlikely(!arg))
 23709      return MDBX_EINVAL;
 23710  
 23711    *arg = env->me_flags & ENV_USABLE_FLAGS;
 23712    return MDBX_SUCCESS;
 23713  }
 23714  
 23715  __cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
 23716    int rc = check_env(env, false);
 23717    if (unlikely(rc != MDBX_SUCCESS))
 23718      return rc;
 23719  
 23720    env->me_userctx = ctx;
 23721    return MDBX_SUCCESS;
 23722  }
 23723  
 23724  __cold void *mdbx_env_get_userctx(const MDBX_env *env) {
 23725    return env ? env->me_userctx : NULL;
 23726  }
 23727  
 23728  __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
 23729    int rc = check_env(env, false);
 23730    if (unlikely(rc != MDBX_SUCCESS))
 23731      return rc;
 23732  
 23733  #if MDBX_DEBUG
 23734    env->me_assert_func = func;
 23735    return MDBX_SUCCESS;
 23736  #else
 23737    (void)func;
 23738    return MDBX_ENOSYS;
 23739  #endif
 23740  }
 23741  
 23742  #if !(defined(_WIN32) || defined(_WIN64))
 23743  __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) {
 23744    int rc = check_env(env, true);
 23745    if (unlikely(rc != MDBX_SUCCESS))
 23746      return rc;
 23747  
 23748    if (unlikely(!arg))
 23749      return MDBX_EINVAL;
 23750  
 23751    *arg = env->me_pathname;
 23752    return MDBX_SUCCESS;
 23753  }
 23754  #else
 23755  __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) {
 23756    int rc = check_env(env, true);
 23757    if (unlikely(rc != MDBX_SUCCESS))
 23758      return rc;
 23759  
 23760    if (unlikely(!arg))
 23761      return MDBX_EINVAL;
 23762  
 23763    *arg = env->me_pathname;
 23764    return MDBX_SUCCESS;
 23765  }
 23766  #endif /* Windows */
 23767  
 23768  __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) {
 23769    int rc = check_env(env, true);
 23770    if (unlikely(rc != MDBX_SUCCESS))
 23771      return rc;
 23772  
 23773    if (unlikely(!arg))
 23774      return MDBX_EINVAL;
 23775  
 23776    *arg = env->me_lazy_fd;
 23777    return MDBX_SUCCESS;
 23778  }
 23779  
 23780  #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI entry point: forwards to the inline implementation. */
__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) {
  return __inline_mdbx_env_stat(env, stat, bytes);
}
 23784  #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 23785  
 23786  static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) {
 23787    st->ms_depth = db->md_depth;
 23788    st->ms_branch_pages = db->md_branch_pages;
 23789    st->ms_leaf_pages = db->md_leaf_pages;
 23790    st->ms_overflow_pages = db->md_overflow_pages;
 23791    st->ms_entries = db->md_entries;
 23792    if (likely(bytes >=
 23793               offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
 23794      st->ms_mod_txnid = db->md_mod_txnid;
 23795  }
 23796  
 23797  static void stat_add(const MDBX_db *db, MDBX_stat *const st,
 23798                       const size_t bytes) {
 23799    st->ms_depth += db->md_depth;
 23800    st->ms_branch_pages += db->md_branch_pages;
 23801    st->ms_leaf_pages += db->md_leaf_pages;
 23802    st->ms_overflow_pages += db->md_overflow_pages;
 23803    st->ms_entries += db->md_entries;
 23804    if (likely(bytes >=
 23805               offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
 23806      st->ms_mod_txnid = (st->ms_mod_txnid > db->md_mod_txnid) ? st->ms_mod_txnid
 23807                                                               : db->md_mod_txnid;
 23808  }
 23809  
/* Accumulate statistics for the whole environment into `st`:
 * the MAIN_DBI tree, every opened named subDB, and — when the main table
 * can hold subDBs — every not-yet-opened named subDB found by scanning
 * the main table's leaf pages. Requires a usable (non-blocked) txn. */
__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) {
  int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  st->ms_psize = txn->mt_env->me_psize;
#if 1
  /* assuming GC is internal and not subject for accounting */
  stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes);
#else
  stat_get(&txn->mt_dbs[FREE_DBI], st, bytes);
  stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes);
#endif

  /* account opened named subDBs */
  for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++)
    if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID)
      stat_add(txn->mt_dbs + dbi, st, bytes);

  /* The main table can only contain subDBs when it is neither DUPSORT
   * nor INTEGERKEY; skip the scan entirely when it is empty. */
  if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) &&
      txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) {
    MDBX_cursor_couple cx;
    err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI);
    if (unlikely(err != MDBX_SUCCESS))
      return err;

    /* scan and account not opened named subDBs */
    err = page_search(&cx.outer, NULL, MDBX_PS_FIRST);
    while (err == MDBX_SUCCESS) {
      const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top];
      for (unsigned i = 0; i < page_numkeys(mp); i++) {
        const MDBX_node *node = page_node(mp, i);
        /* only pure F_SUBDATA nodes are named subDBs */
        if (node_flags(node) != F_SUBDATA)
          continue;
        /* a subDB record must be exactly an MDBX_db, else corruption */
        if (unlikely(node_ds(node) != sizeof(MDBX_db)))
          return MDBX_CORRUPTED;

        /* skip opened and already accounted */
        for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++)
          if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID &&
              node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len &&
              memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base,
                     node_ks(node)) == 0) {
            node = NULL; /* mark as already accounted via txn->mt_dbs */
            break;
          }

        if (node) {
          /* copy out the MDBX_db record (may be unaligned in the page) */
          MDBX_db db;
          memcpy(&db, node_data(node), sizeof(db));
          stat_add(&db, st, bytes);
        }
      }
      err = cursor_sibling(&cx.outer, SIBLING_RIGHT);
    }
    /* MDBX_NOTFOUND is the normal end-of-scan condition */
    if (unlikely(err != MDBX_NOTFOUND))
      return err;
  }

  return MDBX_SUCCESS;
}
 23871  
 23872  __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn,
 23873                              MDBX_stat *dest, size_t bytes) {
 23874    if (unlikely(!dest))
 23875      return MDBX_EINVAL;
 23876    const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
 23877    if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
 23878      return MDBX_EINVAL;
 23879  
 23880    if (likely(txn)) {
 23881      if (env && unlikely(txn->mt_env != env))
 23882        return MDBX_EINVAL;
 23883      return stat_acc(txn, dest, bytes);
 23884    }
 23885  
 23886    int err = check_env(env, true);
 23887    if (unlikely(err != MDBX_SUCCESS))
 23888      return err;
 23889  
 23890    if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self())
 23891      /* inside write-txn */
 23892      return stat_acc(env->me_txn, dest, bytes);
 23893  
 23894    MDBX_txn *tmp_txn;
 23895    err = mdbx_txn_begin((MDBX_env *)env, NULL, MDBX_TXN_RDONLY, &tmp_txn);
 23896    if (unlikely(err != MDBX_SUCCESS))
 23897      return err;
 23898  
 23899    const int rc = stat_acc(tmp_txn, dest, bytes);
 23900    err = mdbx_txn_abort(tmp_txn);
 23901    if (unlikely(err != MDBX_SUCCESS))
 23902      return err;
 23903    return rc;
 23904  }
 23905  
/* For a DUPSORT table, build a bitmask of the depths of all per-key
 * duplicate structures: bit 0 = single value, bit 1 = inline sub-page,
 * bit N = nested sub-tree of depth N. Returns MDBX_RESULT_TRUE when the
 * table is not DUPSORT, MDBX_CORRUPTED on invalid node flags. */
__cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi,
                                      uint32_t *mask) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!mask))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple cx;
  rc = cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* a non-DUPSORT table has no duplicates to measure */
  if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0)
    return MDBX_RESULT_TRUE;

  MDBX_val key, data;
  rc = cursor_first(&cx.outer, &key, &data);
  *mask = 0;
  /* walk distinct keys only (NEXT_NODUP), classifying each key's
   * duplicate container by its node flags */
  while (rc == MDBX_SUCCESS) {
    const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
                                      cx.outer.mc_ki[cx.outer.mc_top]);
    const MDBX_db *db = node_data(node);
    const unsigned flags = node_flags(node);
    switch (flags) {
    case F_BIGDATA:
    case 0:
      /* single-value entry, deep = 0 */
      *mask |= 1 << 0;
      break;
    case F_DUPDATA:
      /* single sub-page, deep = 1 */
      *mask |= 1 << 1;
      break;
    case F_DUPDATA | F_SUBDATA:
      /* sub-tree */
      *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth);
      break;
    default:
      ERROR("wrong node-flags %u", flags);
      return MDBX_CORRUPTED;
    }
    rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP);
  }

  /* MDBX_NOTFOUND is the normal end-of-iteration condition */
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}
 23956  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: exported out-of-line body for the deprecated
 * mdbx_env_info(), which newer headers provide as an inline wrapper
 * around mdbx_env_info_ex(). Kept only for old binaries. */
__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info,
                         size_t bytes) {
  return __inline_mdbx_env_info(env, info, bytes);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 23963  
/* Fill `arg` with a snapshot of environment information. The caller has
 * already validated `bytes` (full struct, or truncated before mi_bootid
 * or mi_pgop_stat). When no txn is given (or it is read-only), the meta
 * pages are sampled lock-free via meta_tap(); the caller may need to
 * retry on concurrent change — see mdbx_env_info_ex(). */
__cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
                                   MDBX_envinfo *arg, const size_t bytes) {

  const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
  const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);

  /* is the environment open?
   * (todo4recovery://erased_by_github/libmdbx/issues/171) */
  if (unlikely(!env->me_map)) {
    /* environment not yet opened */
#if 1
    /* default behavior: returns the available info but zeroed the rest */
    memset(arg, 0, bytes);
    arg->mi_geo.lower = env->me_dbgeo.lower;
    arg->mi_geo.upper = env->me_dbgeo.upper;
    arg->mi_geo.shrink = env->me_dbgeo.shrink;
    arg->mi_geo.grow = env->me_dbgeo.grow;
    arg->mi_geo.current = env->me_dbgeo.now;
    arg->mi_maxreaders = env->me_maxreaders;
    arg->mi_dxb_pagesize = env->me_psize;
    arg->mi_sys_pagesize = env->me_os_psize;
    if (likely(bytes > size_before_bootid)) {
      arg->mi_bootid.current.x = bootid.x;
      arg->mi_bootid.current.y = bootid.y;
    }
    return MDBX_SUCCESS;
#else
    /* some users may prefer this behavior: return appropriate error */
    return MDBX_EPERM;
#endif
  }

  const MDBX_meta *const meta0 = METAPAGE(env, 0);
  const MDBX_meta *const meta1 = METAPAGE(env, 1);
  const MDBX_meta *const meta2 = METAPAGE(env, 2);
  if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
    return MDBX_PANIC;

  /* A write-txn carries its own consistent meta-troika; otherwise take a
   * fresh lock-free sample of the three meta pages. */
  meta_troika_t holder;
  meta_troika_t const *troika;
  if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY))
    troika = &txn->tw.troika;
  else {
    holder = meta_tap(env);
    troika = &holder;
  }

  const meta_ptr_t head = meta_recent(env, troika);
  arg->mi_recent_txnid = head.txnid;
  arg->mi_meta0_txnid = troika->txnid[0];
  arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign);
  arg->mi_meta1_txnid = troika->txnid[1];
  arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign);
  arg->mi_meta2_txnid = troika->txnid[2];
  arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign);
  if (likely(bytes > size_before_bootid)) {
    memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16);
    memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16);
    memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16);
  }

  /* Default to the most recent meta; a given txn overrides with its own
   * view, and picks the meta page matching the txn's expected txnid. */
  const volatile MDBX_meta *txn_meta = head.ptr_v;
  arg->mi_last_pgno = txn_meta->mm_geo.next - 1;
  arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now);
  if (txn) {
    arg->mi_last_pgno = txn->mt_next_pgno - 1;
    arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno);

    /* a write-txn's data lives one xMDBX_TXNID_STEP before commit */
    const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY)
                                         ? txn->mt_txnid
                                         : txn->mt_txnid - xMDBX_TXNID_STEP;
    txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta;
    txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta;
    txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta;
  }
  arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower);
  arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper);
  arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv));
  arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv));
  /* count an extra unsynced page when the meta itself isn't synced for
   * the recent txnid (boolean folded into the sum) */
  const pgno_t unsynced_pages =
      atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) +
      (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
       (uint32_t)arg->mi_recent_txnid);

  arg->mi_mapsize = env->me_dxb_mmap.limit;

  const MDBX_lockinfo *const lck = env->me_lck;
  arg->mi_maxreaders = env->me_maxreaders;
  /* INT32_MAX signals "unknown" when there is no LCK (exclusive mode) */
  arg->mi_numreaders = env->me_lck_mmap.lck
                           ? atomic_load32(&lck->mti_numreaders, mo_Relaxed)
                           : INT32_MAX;
  arg->mi_dxb_pagesize = env->me_psize;
  arg->mi_sys_pagesize = env->me_os_psize;

  if (likely(bytes > size_before_bootid)) {
    arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages);
    const uint64_t monotime_now = osal_monotime();
    uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed);
    arg->mi_since_sync_seconds16dot16 =
        ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0;
    ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed);
    arg->mi_since_reader_check_seconds16dot16 =
        ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0;
    arg->mi_autosync_threshold = pgno2bytes(
        env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed));
    arg->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16(
        atomic_load64(&lck->mti_autosync_period, mo_Relaxed));
    arg->mi_bootid.current.x = bootid.x;
    arg->mi_bootid.current.y = bootid.y;
    arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags;
  }

  if (likely(bytes > size_before_pgop_stat)) {
#if MDBX_ENABLE_PGOP_STAT
    arg->mi_pgop_stat.newly =
        atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed);
    arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed);
    arg->mi_pgop_stat.clone =
        atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed);
    arg->mi_pgop_stat.split =
        atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed);
    arg->mi_pgop_stat.merge =
        atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed);
    arg->mi_pgop_stat.spill =
        atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed);
    arg->mi_pgop_stat.unspill =
        atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
    arg->mi_pgop_stat.wops =
        atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
    arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16(
        atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed));
#else
    memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat));
#endif /* MDBX_ENABLE_PGOP_STAT*/
  }

  /* Scan the reader table for the oldest reader txnid, overall and for
   * this process only; both default to the recent txnid. */
  arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid =
      arg->mi_recent_txnid;
  if (env->me_lck_mmap.lck) {
    for (unsigned i = 0; i < arg->mi_numreaders; ++i) {
      const uint32_t pid =
          atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
      if (pid) {
        const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        if (arg->mi_latter_reader_txnid > txnid)
          arg->mi_latter_reader_txnid = txnid;
        if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid)
          arg->mi_self_latter_reader_txnid = txnid;
      }
    }
  }

  osal_compiler_barrier();
  return MDBX_SUCCESS;
}
 24119  
/* Public entry: fetch environment info into `arg`. Validates env/txn and
 * the caller's buffer size, then snapshots twice (or more) until two
 * consecutive lock-free snapshots match, which guarantees consistency
 * against concurrent writers. */
__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
                            MDBX_envinfo *arg, size_t bytes) {
  if (unlikely((env == NULL && txn == NULL) || arg == NULL))
    return MDBX_EINVAL;

  if (txn) {
    int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }
  if (env) {
    int err = check_env(env, false);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (txn && unlikely(txn->mt_env != env))
      return MDBX_EINVAL;
  } else {
    /* no env given: derive it from the txn */
    env = txn->mt_env;
  }

  /* only three buffer layouts are accepted: full, truncated before
   * mi_bootid, or truncated before mi_pgop_stat */
  const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
  const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
  if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid &&
      bytes != size_before_pgop_stat)
    return MDBX_EINVAL;

  MDBX_envinfo snap;
  int rc = fetch_envinfo_ex(env, txn, &snap, sizeof(snap));
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  while (1) {
    rc = fetch_envinfo_ex(env, txn, arg, bytes);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* timestamps move between snapshots by design; copy them over so
     * they don't defeat the stability comparison below */
    snap.mi_since_sync_seconds16dot16 = arg->mi_since_sync_seconds16dot16;
    snap.mi_since_reader_check_seconds16dot16 =
        arg->mi_since_reader_check_seconds16dot16;
    if (likely(memcmp(&snap, arg, bytes) == 0))
      return MDBX_SUCCESS;
    /* snapshot changed under us: retry with the latest as baseline */
    memcpy(&snap, arg, bytes);
  }
}
 24163  
 24164  static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags) {
 24165    return (flags & MDBX_REVERSEKEY)   ? cmp_reverse
 24166           : (flags & MDBX_INTEGERKEY) ? cmp_int_align2
 24167                                       : cmp_lexical;
 24168  }
 24169  
 24170  static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) {
 24171    return !(flags & MDBX_DUPSORT)
 24172               ? cmp_lenfast
 24173               : ((flags & MDBX_INTEGERDUP)
 24174                      ? cmp_int_unaligned
 24175                      : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical));
 24176  }
 24177  
/* Reconcile the caller's requested flags/comparators with the table's
 * persistent flags, then install the comparators into mt_dbxs[dbi].
 * Returns MDBX_INCOMPATIBLE on a flag mismatch that can't be resolved,
 * MDBX_EACCESS for a re-create attempt inside a read-only txn, and
 * MDBX_EINVAL when trying to replace an already-set comparator. */
static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags,
                    MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  /* LY: so, accepting only three cases for the table's flags:
   * 1) user_flags and both comparators are zero
   *    = assume that a by-default mode/flags is requested for reading;
   * 2) user_flags exactly the same
   *    = assume that the target mode/flags are requested properly;
   * 3) user_flags differs, but table is empty and MDBX_CREATE is provided
   *    = assume that a properly create request with custom flags;
   */
  if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) {
    /* flags are differs, check other conditions */
    if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) &&
         (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) ||
        user_flags == MDBX_ACCEDE) {
      /* no comparators were provided and flags are zero,
       * seems that is case #1 above */
      user_flags = txn->mt_dbs[dbi].md_flags;
    } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) {
      if (txn->mt_flags & MDBX_TXN_RDONLY)
        return /* FIXME: return extended info */ MDBX_EACCESS;
      /* make sure flags changes get committed */
      txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS;
      txn->mt_flags |= MDBX_TXN_DIRTY;
    } else {
      return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
    }
  }

  /* install the key comparator: default it from the (resolved) flags,
   * and refuse to silently replace a different one already installed */
  if (!keycmp)
    keycmp = txn->mt_dbxs[dbi].md_cmp ? txn->mt_dbxs[dbi].md_cmp
                                      : get_default_keycmp(user_flags);
  if (txn->mt_dbxs[dbi].md_cmp != keycmp) {
    if (txn->mt_dbxs[dbi].md_cmp)
      return MDBX_EINVAL;
    txn->mt_dbxs[dbi].md_cmp = keycmp;
  }

  /* same policy for the data (duplicate-value) comparator */
  if (!datacmp)
    datacmp = txn->mt_dbxs[dbi].md_dcmp ? txn->mt_dbxs[dbi].md_dcmp
                                        : get_default_datacmp(user_flags);
  if (txn->mt_dbxs[dbi].md_dcmp != datacmp) {
    if (txn->mt_dbxs[dbi].md_dcmp)
      return MDBX_EINVAL;
    txn->mt_dbxs[dbi].md_dcmp = datacmp;
  }

  return MDBX_SUCCESS;
}
 24227  
/* Open (or create with MDBX_CREATE) the table `table_name` within `txn`,
 * binding flags and optional custom comparators, and store the handle in
 * *dbi. NULL name means the main table. The lookup runs twice: once
 * lock-free, then again under me_dbi_lock after importing handles from
 * the env, so a handle opened concurrently by another txn is reused. */
static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags,
                    MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
                    MDBX_cmp_func *datacmp) {
  int rc = MDBX_EINVAL;
  if (unlikely(!dbi))
    return rc;

  if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) {
  early_bailout:
    /* on any failure before registration the out-handle is zeroed */
    *dbi = 0;
    return rc;
  }

  rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    goto early_bailout;

  /* only whitelisted combinations of the dup-related flags are valid */
  switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT |
                        MDBX_REVERSEDUP | MDBX_ACCEDE)) {
  case MDBX_ACCEDE:
    if ((user_flags & MDBX_CREATE) == 0)
      break;
    __fallthrough /* fall through */;
  default:
    rc = MDBX_EINVAL;
    goto early_bailout;

  case MDBX_DUPSORT:
  case MDBX_DUPSORT | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
  case 0:
    break;
  }

  /* main table? */
  if (!table_name) {
    rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp);
    if (unlikely(rc != MDBX_SUCCESS))
      goto early_bailout;
    *dbi = MAIN_DBI;
    return rc;
  }

  /* the name must fit into a main-table leaf node alongside an MDBX_db */
  MDBX_env *env = txn->mt_env;
  size_t len = strlen(table_name);
  if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db))
    return MDBX_EINVAL;

  /* lazily install default comparators for the main table */
  if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
    txn->mt_dbxs[MAIN_DBI].md_cmp =
        get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags);
    txn->mt_dbxs[MAIN_DBI].md_dcmp =
        get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags);
  }

  /* Is the DB already open? */
  MDBX_dbi scan, slot;
  for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) {
    if (!txn->mt_dbxs[scan].md_name.iov_len) {
      /* Remember this free slot */
      slot = scan;
      continue;
    }
    if (len == txn->mt_dbxs[scan].md_name.iov_len &&
        !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) {
      rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp);
      if (unlikely(rc != MDBX_SUCCESS))
        goto early_bailout;
      *dbi = scan;
      return rc;
    }
  }

  /* Fail, if no free slot and max hit */
  if (unlikely(slot >= env->me_maxdbs)) {
    rc = MDBX_DBS_FULL;
    goto early_bailout;
  }

  /* Cannot mix named table with some main-table flags */
  if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags &
               (MDBX_DUPSORT | MDBX_INTEGERKEY))) {
    rc = (user_flags & MDBX_CREATE) ? MDBX_INCOMPATIBLE : MDBX_NOTFOUND;
    goto early_bailout;
  }

  /* Find the DB info */
  MDBX_val key, data;
  key.iov_len = len;
  key.iov_base = (void *)table_name;
  MDBX_cursor_couple couple;
  rc = cursor_init(&couple.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    goto early_bailout;
  rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* only a not-found with MDBX_CREATE may proceed (to creation) */
    if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
      goto early_bailout;
  } else {
    /* make sure this is actually a table */
    MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top],
                                couple.outer.mc_ki[couple.outer.mc_top]);
    if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) {
      rc = MDBX_INCOMPATIBLE;
      goto early_bailout;
    }
    if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) {
      rc = MDBX_CORRUPTED;
      goto early_bailout;
    }
  }

  /* creation requires a writable transaction */
  if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) {
    rc = MDBX_EACCESS;
    goto early_bailout;
  }

  /* Done here so we cannot fail after creating a new DB */
  char *namedup = osal_strdup(table_name);
  if (unlikely(!namedup)) {
    rc = MDBX_ENOMEM;
    goto early_bailout;
  }

  int err = osal_fastmutex_acquire(&env->me_dbi_lock);
  if (unlikely(err != MDBX_SUCCESS)) {
    rc = err;
    osal_free(namedup);
    goto early_bailout;
  }

  /* Import handles from env */
  dbi_import_locked(txn);

  /* Rescan after mutex acquisition & import handles */
  for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) {
    if (!txn->mt_dbxs[scan].md_name.iov_len) {
      /* Remember this free slot */
      slot = scan;
      continue;
    }
    if (len == txn->mt_dbxs[scan].md_name.iov_len &&
        !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) {
      rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp);
      if (unlikely(rc != MDBX_SUCCESS))
        goto later_bailout;
      *dbi = scan;
      goto later_exit;
    }
  }

  if (unlikely(slot >= env->me_maxdbs)) {
    rc = MDBX_DBS_FULL;
    goto later_bailout;
  }

  unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID;
  MDBX_db db_dummy;
  if (unlikely(rc)) {
    /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
    tASSERT(txn, rc == MDBX_NOTFOUND);
    memset(&db_dummy, 0, sizeof(db_dummy));
    db_dummy.md_root = P_INVALID;
    db_dummy.md_mod_txnid = txn->mt_txnid;
    db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS;
    data.iov_len = sizeof(db_dummy);
    data.iov_base = &db_dummy;
    WITH_CURSOR_TRACKING(couple.outer,
                         rc = mdbx_cursor_put(&couple.outer, &key, &data,
                                              F_SUBDATA | MDBX_NOOVERWRITE));

    if (unlikely(rc != MDBX_SUCCESS))
      goto later_bailout;

    dbiflags |= DBI_DIRTY | DBI_CREAT;
    txn->mt_flags |= MDBX_TXN_DIRTY;
    tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0);
  }

  /* Got info, register DBI in this txn */
  memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx));
  memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db));
  env->me_dbflags[slot] = 0;
  rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp);
  if (unlikely(rc != MDBX_SUCCESS)) {
    tASSERT(txn, (dbiflags & DBI_CREAT) == 0);
  later_bailout:
    *dbi = 0;
  later_exit:
    /* the duplicated name was not consumed */
    osal_free(namedup);
  } else {
    txn->mt_dbistate[slot] = (uint8_t)dbiflags;
    txn->mt_dbxs[slot].md_name.iov_base = namedup;
    txn->mt_dbxs[slot].md_name.iov_len = len;
    txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak =
        dbi_seq(env, slot);
    if (!(dbiflags & DBI_CREAT))
      env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID;
    if (txn->mt_numdbs == slot) {
      /* publish the new slot to this txn; the barrier keeps the cursor
       * reset visible before the count is bumped */
      txn->mt_cursors[slot] = NULL;
      osal_compiler_barrier();
      txn->mt_numdbs = slot + 1;
    }
    if (env->me_numdbs <= slot) {
      /* publish the new slot to the whole env */
      osal_memory_fence(mo_AcquireRelease, true);
      env->me_numdbs = slot + 1;
    }
    *dbi = slot;
  }

  ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  return rc;
}
 24444  
/* Public entry: open/create a table with default comparators. */
int mdbx_dbi_open(MDBX_txn *txn, const char *table_name,
                  MDBX_db_flags_t table_flags, MDBX_dbi *dbi) {
  return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr);
}
 24449  
/* Public entry: open/create a table with user-supplied comparators
 * (either may be NULL to get the flag-derived default). */
int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name,
                     MDBX_db_flags_t table_flags, MDBX_dbi *dbi,
                     MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp);
}
 24455  
/* Fetch statistics for a single table handle into `dest`; refreshes the
 * in-txn MDBX_db record first when the handle is stale. */
__cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest,
                         size_t bytes) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!dest))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  /* accept only the full MDBX_stat or the legacy layout that ends just
   * before the ms_mod_txnid field */
  const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
  if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
    return MDBX_EINVAL;

  /* NOTE(review): looks redundant — check_txn(txn, MDBX_TXN_BLOCKED)
   * above should already have rejected a blocked txn; confirm whether
   * this re-check guards some path check_txn does not cover. */
  if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED))
    return MDBX_BAD_TXN;

  /* a stale handle's MDBX_db must be re-read from the main table */
  if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
    rc = fetch_sdb(txn, dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  dest->ms_psize = txn->mt_env->me_psize;
  stat_get(&txn->mt_dbs[dbi], dest, bytes);
  return MDBX_SUCCESS;
}
 24485  
/* Release the env-level slot of `dbi`; the caller must hold
 * env->me_dbi_lock. Returns MDBX_BAD_DBI when the slot is out of range
 * or already closed. */
static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
  eASSERT(env, dbi >= CORE_DBS);
  if (unlikely(dbi >= env->me_numdbs))
    return MDBX_BAD_DBI;

  char *ptr = env->me_dbxs[dbi].md_name.iov_base;
  /* If there was no name, this was already closed */
  if (unlikely(!ptr))
    return MDBX_BAD_DBI;

  /* invalidate the slot: clear flags and length first, fence, then clear
   * the name pointer, so lock-free readers never see a stale-valid mix */
  env->me_dbflags[dbi] = 0;
  env->me_dbxs[dbi].md_name.iov_len = 0;
  osal_memory_fence(mo_AcquireRelease, true);
  env->me_dbxs[dbi].md_name.iov_base = NULL;
  osal_free(ptr);

  /* if this was the last used slot, shrink me_numdbs past any trailing
   * closed slots (but never below CORE_DBS) */
  if (env->me_numdbs == dbi + 1) {
    unsigned i = env->me_numdbs;
    do
      --i;
    while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base);
    env->me_numdbs = i;
  }

  return MDBX_SUCCESS;
}
 24512  
/* Public entry: close a named-table handle. Takes me_dbi_lock and
 * re-validates the handle under the lock before releasing its slot. */
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* the two core handles (GC and main) can never be closed */
  if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs))
    return MDBX_BAD_DBI;

  rc = osal_fastmutex_acquire(&env->me_dbi_lock);
  if (likely(rc == MDBX_SUCCESS)) {
    /* re-check validity under the lock to avoid racing a concurrent
     * close/open of the same slot */
    rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID))
             ? dbi_close_locked(env, dbi)
             : MDBX_BAD_DBI;
    ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  }
  return rc;
}
 24530  
 24531  int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags,
 24532                        unsigned *state) {
 24533    int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
 24534    if (unlikely(rc != MDBX_SUCCESS))
 24535      return rc;
 24536  
 24537    if (unlikely(!flags || !state))
 24538      return MDBX_EINVAL;
 24539  
 24540    if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
 24541      return MDBX_BAD_DBI;
 24542  
 24543    *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS;
 24544    *state =
 24545        txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE);
 24546  
 24547    return MDBX_SUCCESS;
 24548  }
 24549  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: exported out-of-line body for the deprecated
 * mdbx_dbi_flags(), which newer headers provide as an inline wrapper
 * around mdbx_dbi_flags_ex(). Kept only for old binaries. */
int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) {
  return __inline_mdbx_dbi_flags(txn, dbi, flags);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 24555  
/* Retire every page of the (sub-)tree addressed by the cursor `mc`,
 * accumulating them into txn->tw.retired_pages.
 *
 * mc               - cursor over the tree to drop; its stack is consumed and
 *                    the cursor is de-initialized on return.
 * may_have_subDBs  - true when leaf nodes may reference nested trees
 *                    (named sub-DBs or DUPSORT sub-trees), so leaves must be
 *                    scanned; when false and there are no overflow pages,
 *                    leaf scanning is skipped entirely.
 *
 * Returns MDBX_SUCCESS, or an error code after setting MDBX_TXN_ERROR
 * on the transaction. */
static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) {
  int rc = page_search(mc, NULL, MDBX_PS_FIRST);
  if (likely(rc == MDBX_SUCCESS)) {
    MDBX_txn *txn = mc->mc_txn;

    /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
     * This also avoids any P_LEAF2 pages, which have no nodes.
     * Also if the DB doesn't have sub-DBs and has no large/overflow
     * pages, omit scanning leaves. */
    if (!(may_have_subDBs | mc->mc_db->md_overflow_pages))
      cursor_pop(mc);

    /* Reserve room up-front for every page this tree can retire, so the
     * loop below cannot fail on list growth mid-way. */
    rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages +
                                              mc->mc_db->md_leaf_pages +
                                              mc->mc_db->md_overflow_pages);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    /* Keep a pristine copy of the cursor stack: after exhausting one level
     * the pages of the previous level are restored from `mx`. */
    MDBX_cursor mx;
    cursor_copy(mc, &mx);
    while (mc->mc_snum > 0) {
      MDBX_page *const mp = mc->mc_pg[mc->mc_top];
      const unsigned nkeys = page_numkeys(mp);
      if (IS_LEAF(mp)) {
        cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth);
        for (unsigned i = 0; i < nkeys; i++) {
          MDBX_node *node = page_node(mp, i);
          if (node_flags(node) & F_BIGDATA) {
            /* retire the chain of large/overflow pages for this node */
            rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            if (!(may_have_subDBs | mc->mc_db->md_overflow_pages))
              goto pop;
          } else if (node_flags(node) & F_SUBDATA) {
            if (unlikely((node_flags(node) & F_DUPDATA) == 0)) {
              rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE;
              goto bailout;
            }
            /* recurse into the DUPSORT sub-tree via the nested cursor */
            rc = cursor_xinit1(mc, node, mp);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            rc = drop_tree(&mc->mc_xcursor->mx_cursor, false);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
          }
        }
      } else {
        cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth);
        mc->mc_checking |= CC_RETIRING;
        const unsigned pagetype =
            (IS_FROZEN(txn, mp) ? P_FROZEN : 0) +
            ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH);
        for (unsigned i = 0; i < nkeys; i++) {
          MDBX_node *node = page_node(mp, i);
          tASSERT(txn, (node_flags(node) &
                        (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
          const pgno_t pgno = node_pgno(node);
          rc = page_retire_ex(mc, pgno, nullptr, pagetype);
          if (unlikely(rc != MDBX_SUCCESS))
            goto bailout;
        }
        mc->mc_checking -= CC_RETIRING;
      }
      if (!mc->mc_top)
        break;
      cASSERT(mc, nkeys > 0);
      mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
      rc = cursor_sibling(mc, SIBLING_RIGHT);
      if (unlikely(rc != MDBX_SUCCESS)) {
        if (unlikely(rc != MDBX_NOTFOUND))
          goto bailout;
      /* no more siblings, go back to beginning
       * of previous level. */
      pop:
        cursor_pop(mc);
        mc->mc_ki[0] = 0;
        for (unsigned i = 1; i < mc->mc_snum; i++) {
          mc->mc_ki[i] = 0;
          mc->mc_pg[i] = mx.mc_pg[i];
        }
      }
    }
    /* finally retire the root page itself */
    rc = page_retire(mc, mc->mc_pg[0]);
  bailout:
    if (unlikely(rc != MDBX_SUCCESS))
      txn->mt_flags |= MDBX_TXN_ERROR;
  } else if (rc == MDBX_NOTFOUND) {
    /* an empty tree (no root page) is not an error */
    rc = MDBX_SUCCESS;
  }
  /* the cursor's stack was consumed above; mark it uninitialized */
  mc->mc_flags &= ~C_INITIALIZED;
  return rc;
}
 24648  
/* Empty a database, and optionally delete-and-close its handle.
 *
 * Retires the whole tree via drop_tree(), invalidates every cursor bound
 * to `dbi` in this transaction, then either removes the sub-DB record from
 * MAIN_DBI and closes the handle (del == true, user DBs only), or resets
 * the in-txn MDBX_db record to an empty tree. */
int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_cursor *mc;
  rc = mdbx_cursor_open(txn, dbi, &mc);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* MAIN_DBI and DUPSORT trees may reference sub-trees or large pages,
   * so their leaves must be scanned by drop_tree(). */
  rc = drop_tree(mc,
                 dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0);
  /* Invalidate the dropped DB's cursors */
  for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
    m2->mc_flags &= ~(C_INITIALIZED | C_EOF);
  if (unlikely(rc))
    goto bailout;

  /* Can't delete the main DB */
  if (del && dbi >= CORE_DBS) {
    /* remove the named sub-DB record from the main DB */
    rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
    if (likely(rc == MDBX_SUCCESS)) {
      tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY);
      tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY);
      /* mark handle stale, then close it under the env's dbi lock */
      txn->mt_dbistate[dbi] = DBI_STALE;
      MDBX_env *env = txn->mt_env;
      rc = osal_fastmutex_acquire(&env->me_dbi_lock);
      if (unlikely(rc != MDBX_SUCCESS)) {
        txn->mt_flags |= MDBX_TXN_ERROR;
        goto bailout;
      }
      dbi_close_locked(env, dbi);
      ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
    } else {
      txn->mt_flags |= MDBX_TXN_ERROR;
    }
  } else {
    /* reset the DB record, mark it dirty */
    txn->mt_dbistate[dbi] |= DBI_DIRTY;
    txn->mt_dbs[dbi].md_depth = 0;
    txn->mt_dbs[dbi].md_branch_pages = 0;
    txn->mt_dbs[dbi].md_leaf_pages = 0;
    txn->mt_dbs[dbi].md_overflow_pages = 0;
    txn->mt_dbs[dbi].md_entries = 0;
    txn->mt_dbs[dbi].md_root = P_INVALID;
    txn->mt_dbs[dbi].md_seq = 0;
    txn->mt_flags |= MDBX_TXN_DIRTY;
  }

bailout:
  mdbx_cursor_close(mc);
  return rc;
}
 24702  
 24703  int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) {
 24704    int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
 24705    if (unlikely(rc != MDBX_SUCCESS))
 24706      return rc;
 24707  
 24708    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 24709      return MDBX_BAD_DBI;
 24710  
 24711    txn->mt_dbxs[dbi].md_cmp = cmp;
 24712    return MDBX_SUCCESS;
 24713  }
 24714  
 24715  int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) {
 24716    int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
 24717    if (unlikely(rc != MDBX_SUCCESS))
 24718      return rc;
 24719  
 24720    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 24721      return MDBX_BAD_DBI;
 24722  
 24723    txn->mt_dbxs[dbi].md_dcmp = cmp;
 24724    return MDBX_SUCCESS;
 24725  }
 24726  
/* Enumerate the registered reader slots, calling `func` once per live slot
 * with (ctx, serial, slot-index, pid, tid, txnid, lag, bytes_used,
 * bytes_retained).
 *
 * Returns MDBX_RESULT_TRUE when there is no reader table or no live reader,
 * otherwise the last value returned by `func` (enumeration stops on the
 * first non-MDBX_SUCCESS result). */
__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func,
                            void *ctx) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!func))
    return MDBX_EINVAL;

  rc = MDBX_RESULT_TRUE;
  int serial = 0;
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck)) {
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; i++) {
      const MDBX_reader *r = lck->mti_readers + i;
    retry_reader:;
      /* Lock-free consistent snapshot of the slot: read every field, then
       * re-read and retry if anything changed in between. */
      const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease);
      if (!pid)
        continue;
      txnid_t txnid = safe64_read(&r->mr_txnid);
      const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed);
      const pgno_t pages_used =
          atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed);
      const uint64_t reader_pages_retired =
          atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed);
      if (unlikely(
              txnid != safe64_read(&r->mr_txnid) ||
              pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) ||
              tid != atomic_load64(&r->mr_tid, mo_Relaxed) ||
              pages_used !=
                  atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) ||
              reader_pages_retired !=
                  atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed)))
        goto retry_reader;

      eASSERT(env, txnid > 0);
      /* a value above the threshold means "no transaction" */
      if (txnid >= SAFE64_INVALID_THRESHOLD)
        txnid = 0;

      size_t bytes_used = 0;
      size_t bytes_retained = 0;
      uint64_t lag = 0;
      if (txnid) {
        /* Consistent snapshot of the recent meta-page to compute lag and
         * retained-space figures; retry while it is being updated. */
        meta_troika_t troika = meta_tap(env);
      retry_header:;
        const meta_ptr_t head = meta_recent(env, &troika);
        const uint64_t head_pages_retired =
            unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired);
        if (unlikely(meta_should_retry(env, &troika) ||
                     head_pages_retired !=
                         unaligned_peek_u64_volatile(
                             4, head.ptr_v->mm_pages_retired)))
          goto retry_header;

        lag = (head.txnid - txnid) / xMDBX_TXNID_STEP;
        bytes_used = pgno2bytes(env, pages_used);
        bytes_retained = (head_pages_retired > reader_pages_retired)
                             ? pgno2bytes(env, (pgno_t)(head_pages_retired -
                                                        reader_pages_retired))
                             : 0;
      }
      rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used,
                bytes_retained);
      if (unlikely(rc != MDBX_SUCCESS))
        break;
    }
  }

  return rc;
}
 24799  
 24800  /* Insert pid into list if not already present.
 24801   * return -1 if already present. */
 24802  __cold static bool pid_insert(uint32_t *ids, uint32_t pid) {
 24803    /* binary search of pid in list */
 24804    unsigned base = 0;
 24805    unsigned cursor = 1;
 24806    int val = 0;
 24807    unsigned n = ids[0];
 24808  
 24809    while (n > 0) {
 24810      unsigned pivot = n >> 1;
 24811      cursor = base + pivot + 1;
 24812      val = pid - ids[cursor];
 24813  
 24814      if (val < 0) {
 24815        n = pivot;
 24816      } else if (val > 0) {
 24817        base = cursor;
 24818        n -= pivot + 1;
 24819      } else {
 24820        /* found, so it's a duplicate */
 24821        return false;
 24822      }
 24823    }
 24824  
 24825    if (val > 0)
 24826      ++cursor;
 24827  
 24828    ids[0]++;
 24829    for (n = ids[0]; n > cursor; n--)
 24830      ids[n] = ids[n - 1];
 24831    ids[n] = pid;
 24832    return true;
 24833  }
 24834  
 24835  __cold int mdbx_reader_check(MDBX_env *env, int *dead) {
 24836    if (dead)
 24837      *dead = 0;
 24838    return cleanup_dead_readers(env, false, dead);
 24839  }
 24840  
/* Scan the reader lock-table and clear the slots of processes that no
 * longer exist. `rdt_locked` tells whether the caller already holds the
 * reader-table lock; when not, the lock is taken lazily on the first stale
 * reader found. `*dead` (when non-NULL) receives the number of cleared
 * slots.
 *
 * Return:
 *  MDBX_RESULT_TRUE - done and mutex recovered
 *  MDBX_SUCCESS     - done
 *  Otherwise errcode. */
__cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env,
                                                   int rdt_locked, int *dead) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  eASSERT(env, rdt_locked >= 0);
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (unlikely(lck == NULL)) {
    /* exclusive mode */
    if (dead)
      *dead = 0;
    return MDBX_SUCCESS;
  }

  const unsigned snap_nreaders =
      atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
  /* Sorted dedup-list of pids already checked; pids[0] is the count.
   * A small on-stack buffer covers the common case, otherwise heap. */
  uint32_t pidsbuf_onstask[142];
  uint32_t *const pids =
      (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
          ? pidsbuf_onstask
          : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
  if (unlikely(!pids))
    return MDBX_ENOMEM;

  pids[0] = 0;
  int count = 0;
  for (unsigned i = 0; i < snap_nreaders; i++) {
    const uint32_t pid =
        atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
    if (pid == 0)
      continue /* skip empty */;
    if (pid == env->me_pid)
      continue /* skip self */;
    if (!pid_insert(pids, pid))
      continue /* such pid already processed */;

    int err = osal_rpid_check(env, pid);
    if (err == MDBX_RESULT_TRUE)
      continue /* reader is live */;

    if (err != MDBX_SUCCESS) {
      rc = err;
      break /* osal_rpid_check() failed */;
    }

    /* stale reader found */
    if (!rdt_locked) {
      err = osal_rdt_lock(env);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      /* remember that we took the lock, so it is released on exit */
      rdt_locked = -1;
      if (err == MDBX_RESULT_TRUE) {
        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
        rc = MDBX_RESULT_TRUE;
        break;
      }

      /* a other process may have clean and reused slot, recheck */
      if (lck->mti_readers[i].mr_pid.weak != pid)
        continue;

      /* re-verify under the lock that the pid is still dead */
      err = osal_rpid_check(env, pid);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      if (err != MDBX_SUCCESS)
        continue /* the race with other process, slot reused */;
    }

    /* clean it: clear every slot belonging to this dead pid */
    for (unsigned j = i; j < snap_nreaders; j++) {
      if (lck->mti_readers[j].mr_pid.weak == pid) {
        DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid,
              lck->mti_readers[j].mr_txnid.weak);
        atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed);
        atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease);
        count++;
      }
    }
  }

  /* record when the table was last checked (unless we hit a hard error) */
  if (likely(!MDBX_IS_ERROR(rc)))
    atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(),
                   mo_Relaxed);

  if (rdt_locked < 0)
    osal_rdt_unlock(env);

  if (pids != pidsbuf_onstask)
    osal_free(pids);

  if (dead)
    *dead = count;
  return rc;
}
 24946  
 24947  __cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) {
 24948    const int rc = runtime_flags | (loglevel << 16);
 24949  
 24950    if (level != MDBX_LOG_DONTCHANGE)
 24951      loglevel = (uint8_t)level;
 24952  
 24953    if (flags != MDBX_DBG_DONTCHANGE) {
 24954      flags &=
 24955  #if MDBX_DEBUG
 24956          MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
 24957  #endif
 24958          MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP |
 24959          MDBX_DBG_DONT_UPGRADE;
 24960      runtime_flags = (uint8_t)flags;
 24961    }
 24962  
 24963    if (logger != MDBX_LOGGER_DONTCHANGE)
 24964      debug_logger = logger;
 24965    return rc;
 24966  }
 24967  
/* Handle-Slow-Readers driver, invoked when growth is blocked because the
 * oldest reader pins the `laggard` snapshot. Refreshes the oldest-reader
 * value, reaps dead readers, then repeatedly calls the user's hsr-callback
 * for the stuck reader slot until the laggard is released, the callback
 * gives up (rc < 0), or the slot is reset. Returns the (possibly advanced)
 * oldest reader txnid. */
__cold static txnid_t kick_longlived_readers(MDBX_env *env,
                                             const txnid_t laggard) {
  DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard);
  osal_memory_fence(mo_AcquireRelease, false);
  MDBX_hsr_func *const callback = env->me_hsr_callback;
  txnid_t oldest = 0;
  bool notify_eof_of_loop = false;
  int retry = 0;
  do {
    /* txnid of the prefer-steady meta of the current write txn */
    const txnid_t steady =
        env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady];
    env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true;
    oldest = find_oldest_reader(env, steady);
    eASSERT(env, oldest < env->me_txn0->mt_txnid);
    eASSERT(env, oldest >= laggard);
    eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak);

    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
    if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck)
      break;

    if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL)))
      break;

    if (!callback)
      break;

    /* find the live reader slot pinning exactly the laggard txnid */
    MDBX_reader *stucked = nullptr;
    uint64_t hold_retired = 0;
    for (unsigned i = 0; i < lck->mti_numreaders.weak; ++i) {
      const uint64_t snap_retired = atomic_load64(
          &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed);
      const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid);
      if (rtxn == laggard &&
          atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
        hold_retired = snap_retired;
        stucked = &lck->mti_readers[i];
      }
    }

    if (!stucked)
      break;

    /* re-validate the slot: it must still belong to the same stuck reader,
     * otherwise retry the whole loop */
    uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease);
    uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease);
    if (safe64_read(&stucked->mr_txnid) != laggard || !pid ||
        stucked->mr_snapshot_pages_retired.weak != hold_retired)
      continue;

    /* estimate the lag (in txn steps) and space retained by the reader */
    const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika);
    const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP;
    const uint64_t head_retired =
        unaligned_peek_u64(4, head.ptr_c->mm_pages_retired);
    const size_t space =
        (head_retired > hold_retired)
            ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired))
            : 0;
    int rc =
        callback(env, env->me_txn, pid, (mdbx_tid_t)tid, laggard,
                 (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
    if (rc < 0)
      /* hsr returned error and/or agree MDBX_MAP_FULL error */
      break;

    if (rc > 0) {
      if (rc == 1) {
        /* hsr reported transaction (will be) aborted asynchronous */
        safe64_reset_compare(&stucked->mr_txnid, laggard);
      } else {
        /* hsr reported reader process was killed and slot should be cleared */
        safe64_reset(&stucked->mr_txnid, true);
        atomic_store64(&stucked->mr_tid, 0, mo_Relaxed);
        atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease);
      }
    } else
      notify_eof_of_loop = true;

  } while (++retry < INT_MAX);

  if (notify_eof_of_loop) {
    /* notify end of hsr-loop */
    const txnid_t turn = oldest - laggard;
    if (turn)
      NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN,
             laggard, oldest, turn);
    callback(env, env->me_txn, 0, 0, laggard,
             (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
  }
  return oldest;
}
 25058  
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shims: export the historical symbols by forwarding to the
 * inline implementations declared in the public header. */
__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
  return __inline_mdbx_env_set_syncbytes(env, threshold);
}

__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
  return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 25068  
 25069  __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
 25070    int rc = check_env(env, false);
 25071    if (unlikely(rc != MDBX_SUCCESS))
 25072      return rc;
 25073  
 25074    env->me_hsr_callback = hsr;
 25075    return MDBX_SUCCESS;
 25076  }
 25077  
 25078  __cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) {
 25079    return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE)
 25080               ? env->me_hsr_callback
 25081               : NULL;
 25082  }
 25083  
#ifdef __SANITIZE_THREAD__
/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */
__attribute__((__no_sanitize_thread__, __noinline__))
#endif
/* Report how far a transaction lags behind the most recent commit.
 * For a write transaction: returns 0, and *percent (when non-NULL) gets the
 * fill percentage derived from this txn's own next/end page numbers.
 * For a read transaction: returns (head_txnid - txn_txnid)/xMDBX_TXNID_STEP
 * clamped to INT_MAX, and *percent gets the fill of the recent head meta.
 * On an unusable txn returns a negative error value. */
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc > 0) ? -rc : rc;

  MDBX_env *env = txn->mt_env;
  if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) {
    /* a write txn is never behind: compute fill percentage only */
    if (percent)
      *percent =
          (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) /
                txn->mt_end_pgno);
    return 0;
  }

  /* snapshot the recent meta consistently, retrying while it changes */
  txnid_t lag;
  meta_troika_t troika = meta_tap(env);
  do {
    const meta_ptr_t head = meta_recent(env, &troika);
    if (percent) {
      const pgno_t maxpg = head.ptr_v->mm_geo.now;
      *percent =
          (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg);
    }
    lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP;
  } while (unlikely(meta_should_retry(env, &troika)));

  return (lag > INT_MAX) ? INT_MAX : (int)lag;
}
 25117  
/* Context threaded through the depth-first page walk (walk_tree/walk_sdb). */
typedef struct mdbx_walk_ctx {
  void *mw_user;                    /* opaque cookie passed to the visitor */
  MDBX_pgvisitor_func *mw_visitor;  /* callback invoked for every page */
  MDBX_txn *mw_txn;                 /* transaction the walk runs inside */
  MDBX_cursor *mw_cursor;           /* innermost cursor of the walk chain */
  bool mw_dont_check_keys_ordering; /* when true, CC_SKIPORD is set on cursors */
} mdbx_walk_ctx_t;
 25125  
/* Forward declaration: walk one (sub-)database; defined below. */
__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb,
                           const char *name, int deep);
 25128  
 25129  static MDBX_page_type_t walk_page_type(const MDBX_page *mp) {
 25130    if (mp)
 25131      switch (mp->mp_flags) {
 25132      case P_BRANCH:
 25133        return MDBX_page_branch;
 25134      case P_LEAF:
 25135        return MDBX_page_leaf;
 25136      case P_LEAF | P_LEAF2:
 25137        return MDBX_page_dupfixed_leaf;
 25138      case P_OVERFLOW:
 25139        return MDBX_page_large;
 25140      case P_META:
 25141        return MDBX_page_meta;
 25142      }
 25143    return MDBX_page_broken;
 25144  }
 25145  
/* Depth-first tree traversal.
 * Visits page `pgno`: accounts its header/payload/unused/padding bytes,
 * reports it (and any large pages or sub-pages it references) via
 * ctx->mw_visitor, then recurses into branch children, named sub-DBs and
 * dupsort sub-trees. A visitor returning MDBX_RESULT_TRUE stops the walk
 * without an error. Structural inconsistencies are surfaced to the visitor
 * as err == MDBX_CORRUPTED rather than aborting the traversal. */
__cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno,
                            const char *name, int deep, txnid_t parent_txnid) {
  assert(pgno != P_INVALID);
  MDBX_page *mp = nullptr;
  int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid);

  MDBX_page_type_t type = walk_page_type(mp);
  const unsigned nentries = mp ? page_numkeys(mp) : 0;
  unsigned npages = 1;
  size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages);
  size_t header_size =
      (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ;
  size_t payload_size = 0;
  size_t unused_size =
      (mp ? page_room(mp) : pagesize - header_size) - payload_size;
  size_t align_bytes = 0;

  /* first pass: sum payload bytes of every node; the loop-increment adds a
   * pad byte whenever the running payload size is odd (even-byte alignment) */
  for (unsigned i = 0; i < nentries && err == MDBX_SUCCESS; // note: see below
       i++) {
    /* (kept byte-identical — see original loop header) */
    (void)0;
    break;
  }
  for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries;
       align_bytes += ((payload_size + align_bytes) & 1), ++i) {
    if (type == MDBX_page_dupfixed_leaf) {
      /* LEAF2 pages have no mp_ptrs[] or node headers */
      payload_size += mp->mp_leaf2_ksize;
      continue;
    }

    MDBX_node *node = page_node(mp, i);
    payload_size += NODESIZE + node_ks(node);

    if (type == MDBX_page_branch) {
      assert(i > 0 || node_ks(node) == 0);
      continue;
    }

    assert(type == MDBX_page_leaf);
    switch (node_flags(node)) {
    case 0 /* usual node */:
      payload_size += node_ds(node);
      break;

    case F_BIGDATA /* long data on the large/overflow page */: {
      payload_size += sizeof(pgno_t);
      const pgno_t large_pgno = node_largedata_pgno(node);
      const size_t over_payload = node_ds(node);
      const size_t over_header = PAGEHDRSZ;
      npages = 1;

      assert(err == MDBX_SUCCESS);
      pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid);
      err = lp.err;
      if (err == MDBX_SUCCESS) {
        cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
        npages = lp.page->mp_pages;
      }

      /* report the large page itself right away */
      pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages);
      const size_t over_unused = pagesize - over_payload - over_header;
      const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep,
                                     name, pagesize, MDBX_page_large, err, 1,
                                     over_payload, over_header, over_unused);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
    } break;

    case F_SUBDATA /* sub-db */: {
      const size_t namelen = node_ks(node);
      payload_size += node_ds(node);
      /* a sub-db record must be a named MDBX_db structure */
      if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) {
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      }
    } break;

    case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
      payload_size += sizeof(MDBX_db);
      if (unlikely(node_ds(node) != sizeof(MDBX_db))) {
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      }
      break;

    case F_DUPDATA /* short sub-page */: {
      if (unlikely(node_ds(node) <= PAGEHDRSZ)) {
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
        break;
      }

      /* account and report the embedded sub-page's contents */
      MDBX_page *sp = node_data(node);
      const unsigned nsubkeys = page_numkeys(sp);
      size_t subheader_size =
          IS_LEAF2(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower;
      size_t subunused_size = page_room(sp);
      size_t subpayload_size = 0;
      size_t subalign_bytes = 0;
      MDBX_page_type_t subtype;

      switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) {
      case P_LEAF | P_SUBP:
        subtype = MDBX_subpage_leaf;
        break;
      case P_LEAF | P_LEAF2 | P_SUBP:
        subtype = MDBX_subpage_dupfixed_leaf;
        break;
      default:
        assert(err == MDBX_CORRUPTED);
        subtype = MDBX_subpage_broken;
        err = MDBX_CORRUPTED;
      }

      for (unsigned j = 0; err == MDBX_SUCCESS && j < nsubkeys;
           subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) {

        if (subtype == MDBX_subpage_dupfixed_leaf) {
          /* LEAF2 pages have no mp_ptrs[] or node headers */
          subpayload_size += sp->mp_leaf2_ksize;
        } else {
          assert(subtype == MDBX_subpage_leaf);
          MDBX_node *subnode = page_node(sp, j);
          subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode);
          if (unlikely(node_flags(subnode) != 0)) {
            assert(err == MDBX_CORRUPTED);
            err = MDBX_CORRUPTED;
          }
        }
      }

      const int rc =
          ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node),
                          subtype, err, nsubkeys, subpayload_size,
                          subheader_size, subunused_size + subalign_bytes);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
      header_size += subheader_size;
      unused_size += subunused_size;
      payload_size += subpayload_size;
      align_bytes += subalign_bytes;
    } break;

    default:
      assert(err == MDBX_CORRUPTED);
      err = MDBX_CORRUPTED;
    }
  }

  /* report this page with the totals accumulated above */
  const int rc = ctx->mw_visitor(
      pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type,
      err, nentries, payload_size, header_size, unused_size + align_bytes);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;

  /* second pass: recurse into children, named sub-DBs and dupsort trees */
  for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; ++i) {
    if (type == MDBX_page_dupfixed_leaf)
      continue;

    MDBX_node *node = page_node(mp, i);
    if (type == MDBX_page_branch) {
      assert(err == MDBX_SUCCESS);
      err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid);
      if (unlikely(err != MDBX_SUCCESS)) {
        if (err == MDBX_RESULT_TRUE)
          break;
        return err;
      }
      continue;
    }

    assert(type == MDBX_page_leaf);
    MDBX_db db;
    switch (node_flags(node)) {
    default:
      continue;

    case F_SUBDATA /* sub-db */: {
      const size_t namelen = node_ks(node);
      if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) {
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
        break;
      }

      /* copy the sub-DB name into a NUL-terminated buffer
       * (on-stack for short names, heap otherwise) */
      char namebuf_onstask[64];
      char *const sub_name = (namelen < sizeof(namebuf_onstask))
                                 ? namebuf_onstask
                                 : osal_malloc(namelen + 1);
      if (unlikely(!sub_name))
        return MDBX_ENOMEM;
      memcpy(sub_name, node_key(node), namelen);
      sub_name[namelen] = 0;
      memcpy(&db, node_data(node), sizeof(db));
      assert(err == MDBX_SUCCESS);
      err = walk_sdb(ctx, &db, sub_name, deep + 1);
      if (sub_name != namebuf_onstask)
        osal_free(sub_name);
    } break;

    case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
      if (unlikely(node_ds(node) != sizeof(MDBX_db) ||
                   ctx->mw_cursor->mc_xcursor == NULL)) {
        assert(err == MDBX_CORRUPTED);
        err = MDBX_CORRUPTED;
      } else {
        memcpy(&db, node_data(node), sizeof(db));
        assert(ctx->mw_cursor->mc_xcursor ==
               &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner);
        assert(err == MDBX_SUCCESS);
        /* descend into the nested cursor for the sub-tree, then restore
         * the outer cursor afterwards */
        err = cursor_xinit1(ctx->mw_cursor, node, mp);
        if (likely(err == MDBX_SUCCESS)) {
          ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor;
          err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid);
          MDBX_xcursor *inner_xcursor =
              container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor);
          MDBX_cursor_couple *couple =
              container_of(inner_xcursor, MDBX_cursor_couple, inner);
          ctx->mw_cursor = &couple->outer;
        }
      }
      break;
    }
  }

  return MDBX_SUCCESS;
}
 25369  
 25370  __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb,
 25371                             const char *name, int deep) {
 25372    if (unlikely(sdb->md_root == P_INVALID))
 25373      return MDBX_SUCCESS; /* empty db */
 25374  
 25375    MDBX_cursor_couple couple;
 25376    MDBX_dbx dbx = {.md_klen_min = INT_MAX};
 25377    uint8_t dbistate = DBI_VALID | DBI_AUDITED;
 25378    int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate);
 25379    if (unlikely(rc != MDBX_SUCCESS))
 25380      return rc;
 25381  
 25382    couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering
 25383                                    ? CC_SKIPORD | CC_PAGECHECK
 25384                                    : CC_PAGECHECK;
 25385    couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering
 25386                                              ? CC_SKIPORD | CC_PAGECHECK
 25387                                              : CC_PAGECHECK;
 25388    couple.outer.mc_next = ctx->mw_cursor;
 25389    ctx->mw_cursor = &couple.outer;
 25390    rc = walk_tree(ctx, sdb->md_root, name, deep,
 25391                   sdb->md_mod_txnid ? sdb->md_mod_txnid : ctx->mw_txn->mt_txnid);
 25392    ctx->mw_cursor = couple.outer.mc_next;
 25393    return rc;
 25394  }
 25395  
 25396  __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
 25397                             void *user, bool dont_check_keys_ordering) {
 25398    int rc = check_txn(txn, MDBX_TXN_BLOCKED);
 25399    if (unlikely(rc != MDBX_SUCCESS))
 25400      return rc;
 25401  
 25402    mdbx_walk_ctx_t ctx;
 25403    memset(&ctx, 0, sizeof(ctx));
 25404    ctx.mw_txn = txn;
 25405    ctx.mw_user = user;
 25406    ctx.mw_visitor = visitor;
 25407    ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering;
 25408  
 25409    rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META,
 25410                 pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS,
 25411                 NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS,
 25412                 (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) *
 25413                     NUM_METAS);
 25414    if (!MDBX_IS_ERROR(rc))
 25415      rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0);
 25416    if (!MDBX_IS_ERROR(rc))
 25417      rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0);
 25418    return rc;
 25419  }
 25420  
 25421  int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) {
 25422    int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
 25423    if (unlikely(rc != MDBX_SUCCESS))
 25424      return rc;
 25425  
 25426    if (likely(canary)) {
 25427      if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y &&
 25428          txn->mt_canary.z == canary->z)
 25429        return MDBX_SUCCESS;
 25430      txn->mt_canary.x = canary->x;
 25431      txn->mt_canary.y = canary->y;
 25432      txn->mt_canary.z = canary->z;
 25433    }
 25434    txn->mt_canary.v = txn->mt_txnid;
 25435    txn->mt_flags |= MDBX_TXN_DIRTY;
 25436  
 25437    return MDBX_SUCCESS;
 25438  }
 25439  
 25440  int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) {
 25441    int rc = check_txn(txn, MDBX_TXN_BLOCKED);
 25442    if (unlikely(rc != MDBX_SUCCESS))
 25443      return rc;
 25444  
 25445    if (unlikely(canary == NULL))
 25446      return MDBX_EINVAL;
 25447  
 25448    *canary = txn->mt_canary;
 25449    return MDBX_SUCCESS;
 25450  }
 25451  
 25452  int mdbx_cursor_on_first(const MDBX_cursor *mc) {
 25453    if (unlikely(mc == NULL))
 25454      return MDBX_EINVAL;
 25455  
 25456    if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
 25457      return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
 25458                                                       : MDBX_EBADSIGN;
 25459  
 25460    if (!(mc->mc_flags & C_INITIALIZED))
 25461      return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
 25462  
 25463    for (unsigned i = 0; i < mc->mc_snum; ++i) {
 25464      if (mc->mc_ki[i])
 25465        return MDBX_RESULT_FALSE;
 25466    }
 25467  
 25468    return MDBX_RESULT_TRUE;
 25469  }
 25470  
 25471  int mdbx_cursor_on_last(const MDBX_cursor *mc) {
 25472    if (unlikely(mc == NULL))
 25473      return MDBX_EINVAL;
 25474  
 25475    if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
 25476      return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
 25477                                                       : MDBX_EBADSIGN;
 25478  
 25479    if (!(mc->mc_flags & C_INITIALIZED))
 25480      return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
 25481  
 25482    for (unsigned i = 0; i < mc->mc_snum; ++i) {
 25483      unsigned nkeys = page_numkeys(mc->mc_pg[i]);
 25484      if (mc->mc_ki[i] < nkeys - 1)
 25485        return MDBX_RESULT_FALSE;
 25486    }
 25487  
 25488    return MDBX_RESULT_TRUE;
 25489  }
 25490  
 25491  int mdbx_cursor_eof(const MDBX_cursor *mc) {
 25492    if (unlikely(mc == NULL))
 25493      return MDBX_EINVAL;
 25494  
 25495    if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
 25496      return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
 25497                                                       : MDBX_EBADSIGN;
 25498  
 25499    return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED &&
 25500            mc->mc_snum &&
 25501            mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]))
 25502               ? MDBX_RESULT_FALSE
 25503               : MDBX_RESULT_TRUE;
 25504  }
 25505  
 25506  //------------------------------------------------------------------------------
 25507  
/* Result of cursor_diff(): the raw positional difference between two cursors
 * within the same b-tree, later scaled by estimate(). */
struct diff_result {
  ptrdiff_t diff;   /* signed index distance at the divergence level */
  unsigned level;   /* page-stack level (0 == root) where the cursors diverge */
  int root_nkeys;   /* number of keys on the root page (0 if never visited) */
};
 25513  
/* calculates: r = x - y
 *
 * Both cursors must be live, initialized, and belong to the same transaction
 * and DBI.  Walks the two page stacks top-down while they reference the same
 * pages; at the first level where the indices differ, records the signed
 * (clamped) index difference, then refines adjacent (+/-1) differences one
 * level deeper.  On success r->level holds the divergence level and
 * r->root_nkeys the root page's key count, both for later scaling by
 * estimate(). */
__hot static int cursor_diff(const MDBX_cursor *const __restrict x,
                             const MDBX_cursor *const __restrict y,
                             struct diff_result *const __restrict r) {
  r->diff = 0;
  r->level = 0;
  r->root_nkeys = 0;

  if (unlikely(x->mc_signature != MDBX_MC_LIVE))
    return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                    : MDBX_EBADSIGN;

  if (unlikely(y->mc_signature != MDBX_MC_LIVE))
    return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                    : MDBX_EBADSIGN;

  int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(x->mc_txn != y->mc_txn))
    return MDBX_BAD_TXN;

  if (unlikely(y->mc_dbi != x->mc_dbi))
    return MDBX_EINVAL;

  if (unlikely(!(y->mc_flags & x->mc_flags & C_INITIALIZED)))
    return MDBX_ENODATA;

  while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) {
    /* cursors over the same tree must share pages down to the divergence */
    if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) {
      ERROR("Mismatch cursors's pages at %u level", r->level);
      return MDBX_PROBLEM;
    }

    int nkeys = page_numkeys(y->mc_pg[r->level]);
    assert(nkeys > 0);
    if (r->level == 0)
      r->root_nkeys = nkeys;

    /* clamp both indices to the last valid slot before subtracting */
    const int limit_ki = nkeys - 1;
    const int x_ki = x->mc_ki[r->level];
    const int y_ki = y->mc_ki[r->level];
    r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) -
              ((y_ki < limit_ki) ? y_ki : limit_ki);
    if (r->diff == 0) {
      /* same slot at this level: descend and compare the next one */
      r->level += 1;
      continue;
    }

    /* adjacent at this level (y just before x): refine one level deeper by
     * counting entries from y to the end of its page plus x's offset */
    while (unlikely(r->diff == 1) &&
           likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) {
      r->level += 1;
      /*   DB'PAGEs: 0------------------>MAX
       *
       *    CURSORs:       y < x
       *  STACK[i ]:         |
       *  STACK[+1]:  ...y++N|0++x...
       */
      nkeys = page_numkeys(y->mc_pg[r->level]);
      r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level];
      assert(r->diff > 0);
    }

    /* mirror case: x just before y */
    while (unlikely(r->diff == -1) &&
           likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) {
      r->level += 1;
      /*   DB'PAGEs: 0------------------>MAX
       *
       *    CURSORs:       x < y
       *  STACK[i ]:         |
       *  STACK[+1]:  ...x--N|0--y...
       */
      nkeys = page_numkeys(x->mc_pg[r->level]);
      r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level];
      assert(r->diff < 0);
    }

    return MDBX_SUCCESS;
  }

  /* stacks agree on every shared level: distinguish only by EOF status */
  r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF);
  return MDBX_SUCCESS;
}
 25598  
/* Scales the raw cursor_diff() result into an estimated number of entries,
 * multiplying by the average branch-page fan-out once per tree level between
 * the divergence level and the leaves, using fixed-point arithmetic.  The
 * final estimate is clamped to +/- md_entries. */
__hot static ptrdiff_t estimate(const MDBX_db *db,
                                struct diff_result *const __restrict dr) {
  /*        root: branch-page    => scale = leaf-factor * branch-factor^(N-1)
   *     level-1: branch-page(s) => scale = leaf-factor * branch-factor^2
   *     level-2: branch-page(s) => scale = leaf-factor * branch-factor
   *     level-N: branch-page(s) => scale = leaf-factor
   *  leaf-level: leaf-page(s)   => scale = 1
   */
  ptrdiff_t btree_power = (ptrdiff_t)db->md_depth - 2 - (ptrdiff_t)dr->level;
  if (btree_power < 0)
    /* diverged at (or below) the leaf level: the diff is already in entries */
    return dr->diff;

  /* seed with diff scaled by average entries-per-leaf-page */
  ptrdiff_t estimated =
      (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)db->md_leaf_pages;
  if (btree_power == 0)
    return estimated;

  if (db->md_depth < 4) {
    assert(dr->level == 0 && btree_power == 1);
    /* shallow tree: scale directly by the root fan-out */
    return (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)dr->root_nkeys;
  }

  /* average_branchpage_fillfactor = total(branch_entries) / branch_pages
     total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
  /* NOTE(review): log2_fixedpoint is sizeof(size_t)-1 (i.e. 7 on LP64) and is
   * used as a shift count for the fixed-point scale; presumably a deliberate
   * low-precision scale rather than a bit-width — confirm against upstream. */
  const size_t log2_fixedpoint = sizeof(size_t) - 1;
  const size_t half = UINT64_C(1) << (log2_fixedpoint - 1);
  const size_t factor =
      ((db->md_leaf_pages + db->md_branch_pages - 1) << log2_fixedpoint) /
      db->md_branch_pages;
  /* multiply `estimated` by factor^btree_power using fixed-point steps,
   * taking 4 levels at a time via factor^4 while btree_power >= 4 */
  while (1) {
    switch ((size_t)btree_power) {
    default: {
      const size_t square = (factor * factor + half) >> log2_fixedpoint;
      const size_t quad = (square * square + half) >> log2_fixedpoint;
      do {
        estimated = estimated * quad + half;
        estimated >>= log2_fixedpoint;
        btree_power -= 4;
      } while (btree_power >= 4);
      continue;
    }
    case 3:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 2:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 1:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 0:
      /* clamp to the number of entries actually present */
      if (unlikely(estimated > (ptrdiff_t)db->md_entries))
        return (ptrdiff_t)db->md_entries;
      if (unlikely(estimated < -(ptrdiff_t)db->md_entries))
        return -(ptrdiff_t)db->md_entries;
      return estimated;
    }
  }
}
 25661  
/* Estimates the number of entries between two cursor positions (last - first)
 * and stores it in *distance_items.  When the outer cursors coincide on a
 * dupsort database, falls through to diffing the nested (duplicate) cursors. */
int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last,
                           ptrdiff_t *distance_items) {
  if (unlikely(first == NULL || last == NULL || distance_items == NULL))
    return MDBX_EINVAL;

  *distance_items = 0;
  struct diff_result dr;
  int rc = cursor_diff(last, first, &dr);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* NOTE(review): this mask mixes the db-flag MDBX_DUPSORT with the
   * cursor-flag C_INITIALIZED while testing md_flags of both DBs; presumably
   * the bit values make this work as intended — confirm against upstream. */
  if (unlikely(dr.diff == 0) &&
      F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags,
              MDBX_DUPSORT | C_INITIALIZED)) {
    /* same outer position on a dupsort tree: diff the inner cursors */
    first = &first->mc_xcursor->mx_cursor;
    last = &last->mc_xcursor->mx_cursor;
    rc = cursor_diff(first, last, &dr);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  if (likely(dr.diff != 0))
    *distance_items = estimate(first->mc_db, &dr);

  return MDBX_SUCCESS;
}
 25688  
/* Estimates how many entries a cursor would traverse if moved by move_op from
 * its current position, without disturbing the original cursor: a stack copy
 * is moved instead, then the distance between the two positions is estimated. */
int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data,
                       MDBX_cursor_op move_op, ptrdiff_t *distance_items) {
  if (unlikely(cursor == NULL || distance_items == NULL ||
               move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
    return MDBX_EINVAL;

  if (unlikely(cursor->mc_signature != MDBX_MC_LIVE))
    return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                         : MDBX_EBADSIGN;

  int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (!(cursor->mc_flags & C_INITIALIZED))
    return MDBX_ENODATA;

  /* clone the cursor (and, for dupsort, its nested cursor) onto the stack */
  MDBX_cursor_couple next;
  cursor_copy(cursor, &next.outer);
  if (cursor->mc_db->md_flags & MDBX_DUPSORT) {
    next.outer.mc_xcursor = &next.inner;
    rc = cursor_xinit0(&next.outer);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner;
    cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor);
  }

  /* substitute a stub for absent key/data, but only for ops that allow it */
  MDBX_val stub = {0, 0};
  if (data == NULL) {
    const unsigned mask =
        1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    data = &stub;
  }

  if (key == NULL) {
    const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE |
                          1 << MDBX_SET_KEY | 1 << MDBX_SET |
                          1 << MDBX_SET_RANGE;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    key = &stub;
  }

  next.outer.mc_signature = MDBX_MC_LIVE;
  rc = mdbx_cursor_get(&next.outer, key, data, move_op);
  /* NOTFOUND with an initialized cursor still yields a usable end position */
  if (unlikely(rc != MDBX_SUCCESS &&
               (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED))))
    return rc;

  return mdbx_estimate_distance(cursor, &next.outer, distance_items);
}
 25743  
/* Estimates the number of entries in the half-open range [begin, end) of a
 * database.  NULL begin/end keys denote FIRST/LAST; the MDBX_EPSILON sentinel
 * denotes "one step" from the opposite bound.  A negative result (inverted
 * range) is returned as-is — see the disabled block at the bottom. */
int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key,
                        MDBX_val *begin_data, MDBX_val *end_key,
                        MDBX_val *end_data, ptrdiff_t *size_items) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!size_items))
    return MDBX_EINVAL;

  /* data bounds are only meaningful alongside a real (non-epsilon) key */
  if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON)))
    return MDBX_EINVAL;

  if (unlikely(end_data && (end_key == NULL || end_key == MDBX_EPSILON)))
    return MDBX_EINVAL;

  if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple begin;
  /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */
  rc = cursor_init(&begin.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(begin.outer.mc_db->md_entries == 0)) {
    *size_items = 0;
    return MDBX_SUCCESS;
  }

  if (!begin_key) {
    if (unlikely(!end_key)) {
      /* LY: FIRST..LAST case */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
      return MDBX_SUCCESS;
    }
    MDBX_val stub = {0, 0};
    rc = cursor_first(&begin.outer, &stub, &stub);
    if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: FIRST..+epsilon case */
      return (rc == MDBX_SUCCESS)
                 ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                 : rc;
    }
  } else {
    if (unlikely(begin_key == MDBX_EPSILON)) {
      if (end_key == NULL) {
        /* LY: -epsilon..LAST case */
        MDBX_val stub = {0, 0};
        rc = cursor_last(&begin.outer, &stub, &stub);
        return (rc == MDBX_SUCCESS)
                   ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                   : rc;
      }
      /* LY: -epsilon..value case */
      assert(end_key != MDBX_EPSILON);
      begin_key = end_key;
    } else if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: value..+epsilon case */
      assert(begin_key != MDBX_EPSILON);
      end_key = begin_key;
    }
    if (end_key && !begin_data && !end_data &&
        (begin_key == end_key ||
         begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) {
      /* LY: single key case */
      rc = cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err;
      if (unlikely(rc != MDBX_SUCCESS)) {
        *size_items = 0;
        /* absent single key means an empty (zero-sized) range, not an error */
        return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
      }
      *size_items = 1;
      if (begin.outer.mc_xcursor != NULL) {
        MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top],
                                    begin.outer.mc_ki[begin.outer.mc_top]);
        if (node_flags(node) & F_DUPDATA) {
          /* LY: return the number of duplicates for given key */
          tASSERT(txn, begin.outer.mc_xcursor == &begin.inner &&
                           (begin.inner.mx_cursor.mc_flags & C_INITIALIZED));
          *size_items =
              (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) ||
               begin.inner.mx_db.md_entries <= PTRDIFF_MAX)
                  ? (size_t)begin.inner.mx_db.md_entries
                  : PTRDIFF_MAX;
        }
      }
      return MDBX_SUCCESS;
    } else {
      rc = cursor_set(&begin.outer, begin_key, begin_data,
                      begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
               .err;
    }
  }

  /* NOTFOUND with an initialized cursor still marks a usable boundary */
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  MDBX_cursor_couple end;
  rc = cursor_init(&end.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (!end_key) {
    MDBX_val stub = {0, 0};
    rc = cursor_last(&end.outer, &stub, &stub);
  } else {
    rc = cursor_set(&end.outer, end_key, end_data,
                    end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
             .err;
  }
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);

#if 0 /* LY: Was decided to returns as-is (i.e. negative) the estimation       \
       * results for an inverted ranges. */

  /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
     Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */

  if (*size_items < 0) {
    /* LY: inverted range case */
    *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries;
  } else if (*size_items == 0 && begin_key && end_key) {
    int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key);
    if (cmp == 0 && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED) &&
        begin_data && end_data)
      cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data);
    if (cmp > 0) {
      /* LY: inverted range case with empty scope */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
    }
  }
  assert(*size_items >= 0 &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
#endif

  return MDBX_SUCCESS;
}
 25894  
 25895  //------------------------------------------------------------------------------
 25896  
 25897  /* Позволяет обновить или удалить существующую запись с получением
 25898   * в old_data предыдущего значения данных. При этом если new_data равен
 25899   * нулю, то выполняется удаление, иначе обновление/вставка.
 25900   *
 25901   * Текущее значение может находиться в уже измененной (грязной) странице.
 25902   * В этом случае страница будет перезаписана при обновлении, а само старое
 25903   * значение утрачено. Поэтому исходно в old_data должен быть передан
 25904   * дополнительный буфер для копирования старого значения.
 25905   * Если переданный буфер слишком мал, то функция вернет -1, установив
 25906   * old_data->iov_len в соответствующее значение.
 25907   *
 25908   * Для не-уникальных ключей также возможен второй сценарий использования,
 25909   * когда посредством old_data из записей с одинаковым ключом для
 25910   * удаления/обновления выбирается конкретная. Для выбора этого сценария
 25911   * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE.
 25912   * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет
 25913   * идентифицировать запрос такого сценария.
 25914   *
 25915   * Функция может быть замещена соответствующими операциями с курсорами
 25916   * после двух доработок (TODO):
 25917   *  - внешняя аллокация курсоров, в том числе на стеке (без malloc).
 25918   *  - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE).
 25919   */
 25920  
 25921  int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
 25922                      MDBX_val *new_data, MDBX_val *old_data,
 25923                      MDBX_put_flags_t flags, MDBX_preserve_func preserver,
 25924                      void *preserver_context) {
 25925    int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
 25926    if (unlikely(rc != MDBX_SUCCESS))
 25927      return rc;
 25928  
 25929    if (unlikely(!key || !old_data || old_data == new_data))
 25930      return MDBX_EINVAL;
 25931  
 25932    if (unlikely(old_data->iov_base == NULL && old_data->iov_len))
 25933      return MDBX_EINVAL;
 25934  
 25935    if (unlikely(new_data == NULL &&
 25936                 (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
 25937      return MDBX_EINVAL;
 25938  
 25939    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 25940      return MDBX_BAD_DBI;
 25941  
 25942    if (unlikely(flags &
 25943                 ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
 25944                   MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT)))
 25945      return MDBX_EINVAL;
 25946  
 25947    MDBX_cursor_couple cx;
 25948    rc = cursor_init(&cx.outer, txn, dbi);
 25949    if (unlikely(rc != MDBX_SUCCESS))
 25950      return rc;
 25951    cx.outer.mc_next = txn->mt_cursors[dbi];
 25952    txn->mt_cursors[dbi] = &cx.outer;
 25953  
 25954    MDBX_val present_key = *key;
 25955    if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
 25956      /* в old_data значение для выбора конкретного дубликата */
 25957      if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) {
 25958        rc = MDBX_EINVAL;
 25959        goto bailout;
 25960      }
 25961  
 25962      /* убираем лишний бит, он был признаком запрошенного режима */
 25963      flags -= MDBX_NOOVERWRITE;
 25964  
 25965      rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH);
 25966      if (rc != MDBX_SUCCESS)
 25967        goto bailout;
 25968    } else {
 25969      /* в old_data буфер для сохранения предыдущего значения */
 25970      if (unlikely(new_data && old_data->iov_base == new_data->iov_base))
 25971        return MDBX_EINVAL;
 25972      MDBX_val present_data;
 25973      rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY);
 25974      if (unlikely(rc != MDBX_SUCCESS)) {
 25975        old_data->iov_base = NULL;
 25976        old_data->iov_len = 0;
 25977        if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
 25978          goto bailout;
 25979      } else if (flags & MDBX_NOOVERWRITE) {
 25980        rc = MDBX_KEYEXIST;
 25981        *old_data = present_data;
 25982        goto bailout;
 25983      } else {
 25984        MDBX_page *page = cx.outer.mc_pg[cx.outer.mc_top];
 25985        if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) {
 25986          if (flags & MDBX_CURRENT) {
 25987            /* disallow update/delete for multi-values */
 25988            MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]);
 25989            if (node_flags(node) & F_DUPDATA) {
 25990              tASSERT(txn, XCURSOR_INITED(&cx.outer) &&
 25991                               cx.outer.mc_xcursor->mx_db.md_entries > 1);
 25992              if (cx.outer.mc_xcursor->mx_db.md_entries > 1) {
 25993                rc = MDBX_EMULTIVAL;
 25994                goto bailout;
 25995              }
 25996            }
 25997            /* В оригинальной LMDB флажок MDBX_CURRENT здесь приведет
 25998             * к замене данных без учета MDBX_DUPSORT сортировки,
 25999             * но здесь это в любом случае допустимо, так как мы
 26000             * проверили что для ключа есть только одно значение. */
 26001          }
 26002        }
 26003  
 26004        if (IS_MODIFIABLE(txn, page)) {
 26005          if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
 26006            /* если данные совпадают, то ничего делать не надо */
 26007            *old_data = *new_data;
 26008            goto bailout;
 26009          }
 26010          rc = preserver ? preserver(preserver_context, old_data,
 26011                                     present_data.iov_base, present_data.iov_len)
 26012                         : MDBX_SUCCESS;
 26013          if (unlikely(rc != MDBX_SUCCESS))
 26014            goto bailout;
 26015        } else {
 26016          *old_data = present_data;
 26017        }
 26018        flags |= MDBX_CURRENT;
 26019      }
 26020    }
 26021  
 26022    if (likely(new_data))
 26023      rc = mdbx_cursor_put(&cx.outer, key, new_data, flags);
 26024    else
 26025      rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
 26026  
 26027  bailout:
 26028    txn->mt_cursors[dbi] = cx.outer.mc_next;
 26029    return rc;
 26030  }
 26031  
 26032  static int default_value_preserver(void *context, MDBX_val *target,
 26033                                     const void *src, size_t bytes) {
 26034    (void)context;
 26035    if (unlikely(target->iov_len < bytes)) {
 26036      target->iov_base = nullptr;
 26037      target->iov_len = bytes;
 26038      return MDBX_RESULT_TRUE;
 26039    }
 26040    memcpy(target->iov_base, src, target->iov_len = bytes);
 26041    return MDBX_SUCCESS;
 26042  }
 26043  
/* Convenience wrapper for mdbx_replace_ex() using the default preserver,
 * which copies the previous value into the buffer supplied via old_data
 * (or reports the required size when the buffer is too small). */
int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                 MDBX_val *new_data, MDBX_val *old_data,
                 MDBX_put_flags_t flags) {
  return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
                         default_value_preserver, nullptr);
}
 26050  
/* Reports whether the given address lies in a "dirty" page of the specified
 * write transaction.  Ultimately this allows avoiding redundant copying of
 * data from non-dirty pages.
 *
 * "Dirty" pages are those already modified during the write transaction.
 * Accordingly, any further changes may overwrite such pages.  Therefore all
 * modifying functions must NOT receive pointers to data inside such pages as
 * arguments.  Non-dirty pages, in turn, will be copied before modification.
 *
 * In other words, data from dirty pages must either be copied before being
 * passed as arguments for further modifications, or rejected at the
 * argument-validation stage.
 *
 * Thus the function allows both eliminating unnecessary copying and
 * performing a more complete validation of arguments.
 *
 * IMPORTANT: the passed pointer must point to the beginning of the data.
 * Only then is it guaranteed that the actual page header is physically
 * located in the same memory page, including for multi-page P_OVERFLOW pages
 * with long data. */
int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_env *env = txn->mt_env;
  const ptrdiff_t offset = (uint8_t *)ptr - env->me_map;
  if (offset >= 0) {
    const pgno_t pgno = bytes2pgno(env, offset);
    if (likely(pgno < txn->mt_next_pgno)) {
      const MDBX_page *page = pgno2page(env, pgno);
      if (unlikely(page->mp_pgno != pgno ||
                   (page->mp_flags & P_ILL_BITS) != 0)) {
        /* The ptr pointed into middle of a large page,
         * not to the beginning of a data. */
        return MDBX_EINVAL;
      }
      return ((txn->mt_flags & MDBX_TXN_RDONLY) || !IS_MODIFIABLE(txn, page))
                 ? MDBX_RESULT_FALSE
                 : MDBX_RESULT_TRUE;
    }
    if ((size_t)offset < env->me_dxb_mmap.limit) {
      /* The pointer addresses something within the mmap but beyond the
       * allocated pages.  This can happen if mdbx_is_dirty() is called
       * after an operation during which a dirty page was returned to the
       * unallocated space. */
      return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
    }
  }

  /* The page is outside the used mmap range, i.e. either an invalid address
   * was passed to the function, or the address is in a shadow page that was
   * allocated via malloc().
   *
   * For MDBX_WRITE_MAP mode the page is definitely "not dirty", while for
   * modes without MDBX_WRITE_MAP it is definitely "not clean". */
  return (txn->mt_flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
                                                             : MDBX_RESULT_TRUE;
}
 26112  
 26113  int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
 26114                        uint64_t increment) {
 26115    int rc = check_txn(txn, MDBX_TXN_BLOCKED);
 26116    if (unlikely(rc != MDBX_SUCCESS))
 26117      return rc;
 26118  
 26119    if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
 26120      return MDBX_BAD_DBI;
 26121  
 26122    if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
 26123      rc = fetch_sdb(txn, dbi);
 26124      if (unlikely(rc != MDBX_SUCCESS))
 26125        return rc;
 26126    }
 26127  
 26128    MDBX_db *dbs = &txn->mt_dbs[dbi];
 26129    if (likely(result))
 26130      *result = dbs->md_seq;
 26131  
 26132    if (likely(increment > 0)) {
 26133      if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
 26134        return MDBX_EACCESS;
 26135  
 26136      uint64_t new = dbs->md_seq + increment;
 26137      if (unlikely(new < increment))
 26138        return MDBX_RESULT_TRUE;
 26139  
 26140      tASSERT(txn, new > dbs->md_seq);
 26141      dbs->md_seq = new;
 26142      txn->mt_flags |= MDBX_TXN_DIRTY;
 26143      txn->mt_dbistate[dbi] |= DBI_DIRTY;
 26144    }
 26145  
 26146    return MDBX_SUCCESS;
 26147  }
 26148  
 26149  /*----------------------------------------------------------------------------*/
 26150  
 26151  #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Exported wrappers around the inline implementations, kept so that the
 * symbols remain available for binary (ABI) compatibility. */
__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) {
  return __inline_mdbx_limits_pgsize_min();
}

__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) {
  return __inline_mdbx_limits_pgsize_max();
}
 26159  #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 26160  
 26161  __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
 26162    if (pagesize < 1)
 26163      pagesize = (intptr_t)mdbx_default_pagesize();
 26164    else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
 26165                      pagesize > (intptr_t)MAX_PAGESIZE ||
 26166                      !is_powerof2((size_t)pagesize)))
 26167      return -1;
 26168  
 26169    return MIN_PAGENO * pagesize;
 26170  }
 26171  
 26172  __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
 26173    if (pagesize < 1)
 26174      pagesize = (intptr_t)mdbx_default_pagesize();
 26175    else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
 26176                      pagesize > (intptr_t)MAX_PAGESIZE ||
 26177                      !is_powerof2((size_t)pagesize)))
 26178      return -1;
 26179  
 26180    STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
 26181    const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
 26182    return (limit < MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE;
 26183  }
 26184  
 26185  __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
 26186    if (pagesize < 1)
 26187      pagesize = (intptr_t)mdbx_default_pagesize();
 26188    else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
 26189                      pagesize > (intptr_t)MAX_PAGESIZE ||
 26190                      !is_powerof2((size_t)pagesize)))
 26191      return -1;
 26192  
 26193    STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
 26194    const uint64_t pgl_limit =
 26195        pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482);
 26196    const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482);
 26197    return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit;
 26198  }
 26199  
 26200  /*** Key-making functions to avoid custom comparators *************************/
 26201  
 26202  static __always_inline double key2double(const int64_t key) {
 26203    union {
 26204      uint64_t u;
 26205      double f;
 26206    } casting;
 26207  
 26208    casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000)
 26209                          : UINT64_C(0xffffFFFFffffFFFF) - key;
 26210    return casting.f;
 26211  }
 26212  
 26213  static __always_inline uint64_t double2key(const double *const ptr) {
 26214    STATIC_ASSERT(sizeof(double) == sizeof(int64_t));
 26215    const int64_t i = *(const int64_t *)ptr;
 26216    const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i
 26217                               : i + UINT64_C(0x8000000000000000);
 26218    if (ASSERT_ENABLED()) {
 26219      const double f = key2double(u);
 26220      assert(memcmp(&f, ptr, 8) == 0);
 26221    }
 26222    return u;
 26223  }
 26224  
 26225  static __always_inline float key2float(const int32_t key) {
 26226    union {
 26227      uint32_t u;
 26228      float f;
 26229    } casting;
 26230  
 26231    casting.u =
 26232        (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key;
 26233    return casting.f;
 26234  }
 26235  
 26236  static __always_inline uint32_t float2key(const float *const ptr) {
 26237    STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 26238    const int32_t i = *(const int32_t *)ptr;
 26239    const uint32_t u =
 26240        (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000);
 26241    if (ASSERT_ENABLED()) {
 26242      const float f = key2float(u);
 26243      assert(memcmp(&f, ptr, 4) == 0);
 26244    }
 26245    return u;
 26246  }
 26247  
/* Public wrappers: order-preserving key from an IEEE-754 double,
 * by value and by pointer. */
uint64_t mdbx_key_from_double(const double ieee754_64bit) {
  return double2key(&ieee754_64bit);
}

uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) {
  return double2key(ieee754_64bit);
}

/* Public wrappers: order-preserving key from an IEEE-754 float,
 * by value and by pointer. */
uint32_t mdbx_key_from_float(const float ieee754_32bit) {
  return float2key(&ieee754_32bit);
}

uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) {
  return float2key(ieee754_32bit);
}
 26263  
 26264  #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Exported wrappers around the inline implementations, kept so that the
 * symbols remain available for binary (ABI) compatibility. */
MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) {
  return __inline_mdbx_key_from_int64(i64);
}

MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) {
  return __inline_mdbx_key_from_int32(i32);
}
 26272  #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
 26273  
 26274  #define IEEE754_DOUBLE_MANTISSA_SIZE 52
 26275  #define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF
 26276  #define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF
 26277  #define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000)
 26278  #define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF)
 26279  #define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF)
 26280  
/* Counts the leading zero bits of a 64-bit value.
 * Precondition: callers pass a non-zero value (the GCC builtins and
 * _BitScanReverse* are undefined for zero; the de Bruijn fallback would
 * return 63 in that case). */
static __inline int clz64(uint64_t value) {
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl)
  /* Pick the builtin whose operand width matches uint64_t; the sizeof
   * comparisons are compile-time constants, so dead branches vanish. */
  if (sizeof(value) == sizeof(int))
    return __builtin_clz(value);
  if (sizeof(value) == sizeof(long))
    return __builtin_clzl(value);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) ||            \
    __has_builtin(__builtin_clzll)
  return __builtin_clzll(value);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */

#if defined(_MSC_VER)
  unsigned long index;
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
  /* _BitScanReverse64 yields the position of the highest set bit. */
  _BitScanReverse64(&index, value);
  return 63 - index;
#else
  /* 32-bit MSVC targets: scan the halves separately. */
  if (value > UINT32_MAX) {
    /* Highest set bit is in the upper word at absolute position 32+index,
     * hence 63 - (32 + index) == 31 - index leading zeros. */
    _BitScanReverse(&index, (uint32_t)(value >> 32));
    return 31 - index;
  }
  _BitScanReverse(&index, (uint32_t)value);
  return 63 - index;
#endif
#endif /* MSVC */

  /* Portable fallback: smear the highest set bit into all lower positions
   * (producing a 2^k - 1 pattern), then map it to the answer with a
   * de Bruijn-style multiply-and-lookup. */
  value |= value >> 1;
  value |= value >> 2;
  value |= value >> 4;
  value |= value >> 8;
  value |= value >> 16;
  value |= value >> 32;
  static const uint8_t debruijn_clz64[64] = {
      63, 16, 62, 7,  15, 36, 61, 3,  6,  14, 22, 26, 35, 47, 60, 2,
      9,  5,  28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1,
      17, 8,  37, 4,  23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18,
      38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0};
  return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58];
}
 26321  
 26322  static __inline uint64_t round_mantissa(const uint64_t u64, int shift) {
 26323    assert(shift < 0 && u64 > 0);
 26324    shift = -shift;
 26325    const unsigned half = 1 << (shift - 1);
 26326    const unsigned lsb = 1 & (unsigned)(u64 >> shift);
 26327    const unsigned tie2even = 1 ^ lsb;
 26328    return (u64 + half - tie2even) >> shift;
 26329  }
 26330  
 26331  uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
 26332    const uint64_t bias = UINT64_C(0x8000000000000000);
 26333    if (json_integer > 0) {
 26334      const uint64_t u64 = json_integer;
 26335      int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
 26336      uint64_t mantissa = u64 << shift;
 26337      if (unlikely(shift < 0)) {
 26338        mantissa = round_mantissa(u64, shift);
 26339        if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
 26340          mantissa = round_mantissa(u64, --shift);
 26341      }
 26342  
 26343      assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
 26344             mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
 26345      const uint64_t exponent =
 26346          IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
 26347      assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
 26348      const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) +
 26349                           (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
 26350  #if !defined(_MSC_VER) ||                                                      \
 26351      defined(                                                                   \
 26352          _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external      \
 26353                     symbol __except1 referenced in function __ftol3_except */
 26354      assert(key == mdbx_key_from_double((double)json_integer));
 26355  #endif /* Workaround for MSVC */
 26356      return key;
 26357    }
 26358  
 26359    if (json_integer < 0) {
 26360      const uint64_t u64 = -json_integer;
 26361      int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
 26362      uint64_t mantissa = u64 << shift;
 26363      if (unlikely(shift < 0)) {
 26364        mantissa = round_mantissa(u64, shift);
 26365        if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
 26366          mantissa = round_mantissa(u64, --shift);
 26367      }
 26368  
 26369      assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
 26370             mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
 26371      const uint64_t exponent =
 26372          IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
 26373      assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
 26374      const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) -
 26375                           (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
 26376  #if !defined(_MSC_VER) ||                                                      \
 26377      defined(                                                                   \
 26378          _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external      \
 26379                     symbol __except1 referenced in function __ftol3_except */
 26380      assert(key == mdbx_key_from_double((double)json_integer));
 26381  #endif /* Workaround for MSVC */
 26382      return key;
 26383    }
 26384  
 26385    return bias;
 26386  }
 26387  
/* Decodes a 64-bit key (as produced by mdbx_key_from_jsonInteger() or
 * mdbx_key_from_double()) back into an integer: fractional magnitudes decay
 * to 0, out-of-range magnitudes saturate to INT64_MIN/INT64_MAX. */
int64_t mdbx_jsonInteger_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  /* NOTE(review): unaligned_peek_u64(2, ...) presumably reads a native
   * 64-bit value from a 2-byte-aligned location — confirm against helper. */
  const uint64_t key = unaligned_peek_u64(2, v.iov_base);
  const uint64_t bias = UINT64_C(0x8000000000000000);
  /* Fold the sign away: `covalent` is the magnitude part, since positive
   * keys live above the bias midpoint and negative ones are mirrored
   * below it. */
  const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1;
  /* Right-shift amount that scales the (implicit-lead-restored) mantissa
   * down to the integer value. */
  const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 -
                    (IEEE754_DOUBLE_EXPONENTA_MAX &
                     (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE));
  if (unlikely(shift < 1))
    /* Magnitude exceeds the int64 range: saturate by sign. */
    return (key < bias) ? INT64_MIN : INT64_MAX;
  if (unlikely(shift > 63))
    /* Magnitude is below 1: decays to zero. */
    return 0;

  /* Move the mantissa to the top of the word and restore the implicit
   * leading one (bias == 1 << 63). */
  const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK)
                             << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) +
                            bias;
  const int64_t absolute = unscaled >> shift;
  const int64_t value = (key < bias) ? -absolute : absolute;
  /* Self-check: the result must round-trip, or at least the key must fall
   * strictly between the keys of the neighbouring integers. */
  assert(key == mdbx_key_from_jsonInteger(value) ||
         (mdbx_key_from_jsonInteger(value - 1) < key &&
          key < mdbx_key_from_jsonInteger(value + 1)));
  return value;
}
 26411  
/* Decodes an 8-byte order-preserving key back into a double. */
double mdbx_double_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  return key2double(unaligned_peek_u64(2, v.iov_base));
}

/* Decodes a 4-byte order-preserving key back into a float. */
float mdbx_float_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  return key2float(unaligned_peek_u32(2, v.iov_base));
}

/* Decodes a 4-byte key into an int32 by removing the sign-flip bias
 * (inverse of mdbx_key_from_int32()). */
int32_t mdbx_int32_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000));
}

/* Decodes an 8-byte key into an int64 by removing the sign-flip bias
 * (inverse of mdbx_key_from_int64()). */
int64_t mdbx_int64_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  return (int64_t)(unaligned_peek_u64(2, v.iov_base) -
                   UINT64_C(0x8000000000000000));
}
 26432  
/* Returns the built-in key comparator corresponding to the given
 * database flags. */
__cold MDBX_cmp_func *mdbx_get_keycmp(unsigned flags) {
  return get_default_keycmp(flags);
}

/* Returns the built-in data (dupsort) comparator corresponding to the
 * given database flags. */
__cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) {
  return get_default_datacmp(flags);
}
 26440  
/* Sets a run-time option of the environment.
 *
 * Passing UINT64_MAX as `value` selects the maximal/limit value where a
 * case normalizes it. Some options require the environment to be open
 * (MDBX_ENV_ACTIVE) and/or writable; mutations that race with a writer
 * owned by another thread are serialized by temporarily taking the
 * writer lock. */
__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
                               uint64_t value) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* Locking is needed only when the env is active and its write-txn slot
   * currently belongs to some other thread. */
  const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 &&
                            env->me_txn0->mt_owner != osal_thread_self());
  bool should_unlock = false;
  switch (option) {
  case MDBX_opt_sync_bytes:
    /* Auto-sync threshold, given in bytes but stored as a page count
     * (rounded up to a whole page). */
    if (value == UINT64_MAX)
      value = SIZE_MAX - 65536;
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > SIZE_MAX - 65536))
      return MDBX_TOO_LARGE;
    /* NOTE(review): atomic_store32() appears to return the stored value;
     * a non-zero threshold triggers an immediate sync poll — confirm. */
    if (atomic_store32(&env->me_lck->mti_autosync_threshold,
                       bytes2pgno(env, (size_t)value + env->me_psize - 1),
                       mo_Relaxed) != 0 &&
        (env->me_flags & MDBX_ENV_ACTIVE)) {
      err = mdbx_env_sync_poll(env);
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
      err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_sync_period:
    /* Auto-sync period, given in 16.16 fixed-point seconds but stored in
     * monotonic-clock units. */
    if (value == UINT64_MAX)
      value = UINT32_MAX;
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > UINT32_MAX))
      return MDBX_TOO_LARGE;
    /* NOTE(review): same return-value assumption as for the threshold. */
    if (atomic_store64(&env->me_lck->mti_autosync_period,
                       osal_16dot16_to_monotime((uint32_t)value),
                       mo_Relaxed) != 0 &&
        (env->me_flags & MDBX_ENV_ACTIVE)) {
      err = mdbx_env_sync_poll(env);
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
      err = MDBX_SUCCESS;
    }
    break;

  case MDBX_opt_max_db:
    /* May be changed only before the environment is mapped/opened. */
    if (value == UINT64_MAX)
      value = MDBX_MAX_DBI;
    if (unlikely(value > MDBX_MAX_DBI))
      return MDBX_EINVAL;
    if (unlikely(env->me_map))
      return MDBX_EPERM;
    env->me_maxdbs = (unsigned)value + CORE_DBS;
    break;

  case MDBX_opt_max_readers:
    /* May be changed only before the environment is mapped/opened. */
    if (value == UINT64_MAX)
      value = MDBX_READERS_LIMIT;
    if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
      return MDBX_EINVAL;
    if (unlikely(env->me_map))
      return MDBX_EPERM;
    env->me_maxreaders = (unsigned)value;
    break;

  case MDBX_opt_dp_reserve_limit:
    if (value == UINT64_MAX)
      value = INT_MAX;
    if (unlikely(value > INT_MAX))
      return MDBX_EINVAL;
    if (env->me_options.dp_reserve_limit != (unsigned)value) {
      if (lock_needed) {
        err = mdbx_txn_lock(env, false);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
        should_unlock = true;
      }
      env->me_options.dp_reserve_limit = (unsigned)value;
      /* Shrink the reserve list down to the new limit, returning surplus
       * shadow pages to the system allocator. */
      while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) {
        eASSERT(env, env->me_dp_reserve != NULL);
        MDBX_page *dp = env->me_dp_reserve;
        MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
        VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
        env->me_dp_reserve = dp->mp_next;
        VALGRIND_MEMPOOL_FREE(env, dp);
        osal_free(dp);
        env->me_dp_reserve_len -= 1;
      }
    }
    break;

  case MDBX_opt_rp_augment_limit:
    if (value == UINT64_MAX)
      value = MDBX_PGL_LIMIT;
    if (unlikely(value > MDBX_PGL_LIMIT))
      return MDBX_EINVAL;
    env->me_options.rp_augment_limit = (unsigned)value;
    break;

  case MDBX_opt_txn_dp_limit:
  case MDBX_opt_txn_dp_initial:
    /* Both dirty-page options share validation and must not change while
     * a write transaction is running. */
    if (value == UINT64_MAX)
      value = MDBX_PGL_LIMIT;
    if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4))
      return MDBX_EINVAL;
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (lock_needed) {
      err = mdbx_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      should_unlock = true;
    }
    if (env->me_txn)
      err = MDBX_EPERM /* unable change during transaction */;
    else {
      const pgno_t value32 = (pgno_t)value;
      /* Keep the invariant dp_initial <= dp_limit by adjusting the
       * counterpart option when needed. */
      if (option == MDBX_opt_txn_dp_initial &&
          env->me_options.dp_initial != value32) {
        env->me_options.dp_initial = value32;
        if (env->me_options.dp_limit < value32) {
          env->me_options.dp_limit = value32;
          env->me_options.flags.non_auto.dp_limit = 1;
        }
      }
      if (option == MDBX_opt_txn_dp_limit &&
          env->me_options.dp_limit != value32) {
        env->me_options.dp_limit = value32;
        env->me_options.flags.non_auto.dp_limit = 1;
        if (env->me_options.dp_initial > value32)
          env->me_options.dp_initial = value32;
      }
    }
    break;

  case MDBX_opt_spill_max_denominator:
    if (value == UINT64_MAX)
      value = 255;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_max_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_min_denominator:
    /* NOTE(review): unlike the max-denominator case there is no
     * UINT64_MAX normalization here — presumably intentional; confirm. */
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_min_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_parent4child_denominator = (uint8_t)value;
    break;

  case MDBX_opt_loose_limit:
    if (value == UINT64_MAX)
      value = 255;
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.dp_loose_limit = (uint8_t)value;
    break;

  case MDBX_opt_merge_threshold_16dot16_percent:
    /* Accepted range is 12.5%..50% in 16.16 fixed-point; the derived
     * per-pagesize byte threshold is refreshed immediately. */
    if (value == UINT64_MAX)
      value = 32768;
    if (unlikely(value < 8192 || value > 32768))
      return MDBX_EINVAL;
    env->me_options.merge_threshold_16dot16_percent = (unsigned)value;
    recalculate_merge_threshold(env);
    break;

  default:
    return MDBX_EINVAL;
  }

  if (should_unlock)
    mdbx_txn_unlock(env);
  return err;
}
 26624  
 26625  __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
 26626                                 uint64_t *pvalue) {
 26627    int err = check_env(env, false);
 26628    if (unlikely(err != MDBX_SUCCESS))
 26629      return err;
 26630    if (unlikely(!pvalue))
 26631      return MDBX_EINVAL;
 26632  
 26633    switch (option) {
 26634    case MDBX_opt_sync_bytes:
 26635      if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
 26636        return MDBX_EPERM;
 26637      *pvalue = pgno2bytes(
 26638          env, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed));
 26639      break;
 26640  
 26641    case MDBX_opt_sync_period:
 26642      if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
 26643        return MDBX_EPERM;
 26644      *pvalue = osal_monotime_to_16dot16(
 26645          atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed));
 26646      break;
 26647  
 26648    case MDBX_opt_max_db:
 26649      *pvalue = env->me_maxdbs - CORE_DBS;
 26650      break;
 26651  
 26652    case MDBX_opt_max_readers:
 26653      *pvalue = env->me_maxreaders;
 26654      break;
 26655  
 26656    case MDBX_opt_dp_reserve_limit:
 26657      *pvalue = env->me_options.dp_reserve_limit;
 26658      break;
 26659  
 26660    case MDBX_opt_rp_augment_limit:
 26661      *pvalue = env->me_options.rp_augment_limit;
 26662      break;
 26663  
 26664    case MDBX_opt_txn_dp_limit:
 26665      *pvalue = env->me_options.dp_limit;
 26666      break;
 26667    case MDBX_opt_txn_dp_initial:
 26668      *pvalue = env->me_options.dp_initial;
 26669      break;
 26670  
 26671    case MDBX_opt_spill_max_denominator:
 26672      *pvalue = env->me_options.spill_max_denominator;
 26673      break;
 26674    case MDBX_opt_spill_min_denominator:
 26675      *pvalue = env->me_options.spill_min_denominator;
 26676      break;
 26677    case MDBX_opt_spill_parent4child_denominator:
 26678      *pvalue = env->me_options.spill_parent4child_denominator;
 26679      break;
 26680  
 26681    case MDBX_opt_loose_limit:
 26682      *pvalue = env->me_options.dp_loose_limit;
 26683      break;
 26684  
 26685    case MDBX_opt_merge_threshold_16dot16_percent:
 26686      *pvalue = env->me_options.merge_threshold_16dot16_percent;
 26687      break;
 26688  
 26689    default:
 26690      return MDBX_EINVAL;
 26691    }
 26692  
 26693    return MDBX_SUCCESS;
 26694  }
 26695  
/* One-time process-global initialization: sets up the reader-thread
 * registry (rthc), self-checks the time-conversion helpers, caches the
 * boot-id, and in debug builds exhaustively verifies the meta-troika
 * packing tables. */
__cold void global_ctor(void) {
  rthc_limit = RTHC_INITIAL_LIMIT;
  rthc_table = rthc_table_static;
#if defined(_WIN32) || defined(_WIN64)
  InitializeCriticalSection(&rthc_critical_section);
  InitializeCriticalSection(&lcklist_critical_section);
#else
  /* Register thread_dtor() to run at exit of every thread using the key. */
  ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0);
  TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(),
        __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
#endif
  /* checking time conversion, this also avoids racing on 32-bit architectures
   * during storing calculated 64-bit ratio(s) into memory. */
  uint32_t proba = UINT32_MAX;
  while (true) {
    unsigned time_conversion_checkup =
        osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba));
    unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba;
    unsigned one_less = (proba > 0) ? proba - 1 : proba;
    /* The round-trip must stay within one unit of the original value. */
    ENSURE(nullptr, time_conversion_checkup >= one_less &&
                        time_conversion_checkup <= one_more);
    if (proba == 0)
      break;
    proba >>= 1;
  }

  bootid = osal_bootid();

#if MDBX_DEBUG
  /* Enumerate all combinations of three sign bits and three trinary
   * comparison results, and check that meta_cmp2pack()/meta_troika_unpack()
   * agree with a brute-force recomputation of recent/prefer-steady/tail
   * and the validity predicates. */
  for (unsigned i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) {
    const bool s0 = (i >> 0) & 1;
    const bool s1 = (i >> 1) & 1;
    const bool s2 = (i >> 2) & 1;
    const uint8_t c01 = (i / (8 * 1)) % 3;
    const uint8_t c02 = (i / (8 * 3)) % 3;
    const uint8_t c12 = (i / (8 * 9)) % 3;

    const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2);
    meta_troika_t troika;
    troika.fsm = (uint8_t)i;
    meta_troika_unpack(&troika, packed);

    const uint8_t tail = TROIKA_TAIL(&troika);
    const bool strict = TROIKA_STRICT_VALID(&troika);
    const bool valid = TROIKA_VALID(&troika);

    const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1)
                                   ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
                                   : (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
    const uint8_t prefer_steady_chk =
        meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
                                     : (meta_cmp2steady(c12, s1, s2) ? 1 : 2);

    uint8_t tail_chk;
    if (recent_chk == 0)
      tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
    else if (recent_chk == 1)
      tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
    else
      tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0;

    const bool valid_chk =
        c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2;
    const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) &&
                            (c12 != 1 || s1 != s2);
    assert(troika.recent == recent_chk);
    assert(troika.prefer_steady == prefer_steady_chk);
    assert(tail == tail_chk);
    assert(valid == valid_chk);
    assert(strict == strict_chk);
    // printf(" %d, ", packed);
    assert(troika_fsm_map[troika.fsm] == packed);
  }
#endif /* MDBX_DEBUG*/

#if 0  /* debug */
  for (unsigned i = 0; i < 65536; ++i) {
    size_t pages = pv2pages(i);
    unsigned x = pages2pv(pages);
    size_t xp = pv2pages(x);
    if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp)
      printf("%u => %zu => %u => %zu\n", i, pages, x, xp);
    assert(pages == xp);
  }
  fflush(stdout);
#endif /* #if 0 */
}
 26783  
 26784  /******************************************************************************/
 26785  
 26786  __dll_export
 26787  #ifdef __attribute_used__
 26788      __attribute_used__
 26789  #elif defined(__GNUC__) || __has_attribute(__used__)
 26790      __attribute__((__used__))
 26791  #endif
 26792  #ifdef __attribute_externally_visible__
 26793          __attribute_externally_visible__
 26794  #elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
 26795      __has_attribute(__externally_visible__)
 26796      __attribute__((__externally_visible__))
 26797  #endif
 26798      const struct MDBX_build_info mdbx_build = {
 26799  #ifdef MDBX_BUILD_TIMESTAMP
 26800      MDBX_BUILD_TIMESTAMP
 26801  #else
 26802      "\"" __DATE__ " " __TIME__ "\""
 26803  #endif /* MDBX_BUILD_TIMESTAMP */
 26804  
 26805      ,
 26806  #ifdef MDBX_BUILD_TARGET
 26807      MDBX_BUILD_TARGET
 26808  #else
 26809    #if defined(__ANDROID_API__)
 26810      "Android" MDBX_STRINGIFY(__ANDROID_API__)
 26811    #elif defined(__linux__) || defined(__gnu_linux__)
 26812      "Linux"
 26813    #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__)
 26814      "webassembly"
 26815    #elif defined(__CYGWIN__)
 26816      "CYGWIN"
 26817    #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \
 26818        || defined(__WINDOWS__)
 26819      "Windows"
 26820    #elif defined(__APPLE__)
 26821      #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \
 26822        || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR)
 26823        "iOS"
 26824      #else
 26825        "MacOS"
 26826      #endif
 26827    #elif defined(__FreeBSD__)
 26828      "FreeBSD"
 26829    #elif defined(__DragonFly__)
 26830      "DragonFlyBSD"
 26831    #elif defined(__NetBSD__)
 26832      "NetBSD"
 26833    #elif defined(__OpenBSD__)
 26834      "OpenBSD"
 26835    #elif defined(__bsdi__)
 26836      "UnixBSDI"
 26837    #elif defined(__MACH__)
 26838      "MACH"
 26839    #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC))
 26840      "HPUX"
 26841    #elif defined(_AIX)
 26842      "AIX"
 26843    #elif defined(__sun) && defined(__SVR4)
 26844      "Solaris"
 26845    #elif defined(__BSD__) || defined(BSD)
 26846      "UnixBSD"
 26847    #elif defined(__unix__) || defined(UNIX) || defined(__unix) \
 26848        || defined(__UNIX) || defined(__UNIX__)
 26849      "UNIX"
 26850    #elif defined(_POSIX_VERSION)
 26851      "POSIX" MDBX_STRINGIFY(_POSIX_VERSION)
 26852    #else
 26853      "UnknownOS"
 26854    #endif /* Target OS */
 26855  
 26856      "-"
 26857  
 26858    #if defined(__amd64__)
 26859      "AMD64"
 26860    #elif defined(__ia32__)
 26861      "IA32"
 26862    #elif defined(__e2k__) || defined(__elbrus__)
 26863      "Elbrus"
 26864    #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
 26865      "Alpha"
 26866    #elif defined(__aarch64__) || defined(_M_ARM64)
 26867      "ARM64"
 26868    #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \
 26869        || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \
 26870        || defined(_M_ARMT) || defined(__arm)
 26871      "ARM"
 26872    #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64))
 26873      "MIPS64"
 26874    #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__)
 26875      "MIPS"
 26876    #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64)
 26877      "PARISC64"
 26878    #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
 26879      "PARISC"
 26880    #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \
 26881        || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__)
 26882      "Itanium"
 26883    #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \
 26884        || defined(__powerpc64) || defined(_ARCH_PPC64)
 26885      "PowerPC64"
 26886    #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \
 26887        || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__)
 26888      "PowerPC"
 26889    #elif defined(__sparc64__) || defined(__sparc64)
 26890      "SPARC64"
 26891    #elif defined(__sparc__) || defined(__sparc)
 26892      "SPARC"
 26893    #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch)
 26894      "S390"
 26895    #else
 26896      "UnknownARCH"
 26897    #endif
 26898  #endif /* MDBX_BUILD_TARGET */
 26899  
 26900  #ifdef MDBX_BUILD_TYPE
 26901  # if defined(_MSC_VER)
 26902  #   pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE)
 26903  # endif
 26904      "-" MDBX_BUILD_TYPE
 26905  #endif /* MDBX_BUILD_TYPE */
 26906      ,
 26907      "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG)
 26908  #ifdef ENABLE_GPROF
 26909      " ENABLE_GPROF"
 26910  #endif /* ENABLE_GPROF */
 26911      " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS)
 26912      " BYTE_ORDER="
 26913  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 26914      "LITTLE_ENDIAN"
 26915  #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 26916      "BIG_ENDIAN"
 26917  #else
 26918      #error "FIXME: Unsupported byte order"
 26919  #endif /* __BYTE_ORDER__ */
 26920      " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT)
 26921      " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
 26922      " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
 26923      " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
 26924      " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
 26925      " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
 26926      " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
 26927      " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
 26928  #if MDBX_DISABLE_VALIDATION
 26929      " MDBX_DISABLE_VALIDATION=YES"
 26930  #endif /* MDBX_DISABLE_VALIDATION */
 26931  #ifdef __SANITIZE_ADDRESS__
 26932      " SANITIZE_ADDRESS=YES"
 26933  #endif /* __SANITIZE_ADDRESS__ */
 26934  #ifdef MDBX_USE_VALGRIND
 26935      " MDBX_USE_VALGRIND=YES"
 26936  #endif /* MDBX_USE_VALGRIND */
 26937  #if MDBX_FORCE_ASSERTIONS
 26938      " MDBX_FORCE_ASSERTIONS=YES"
 26939  #endif /* MDBX_FORCE_ASSERTIONS */
 26940  #ifdef _GNU_SOURCE
 26941      " _GNU_SOURCE=YES"
 26942  #else
 26943      " _GNU_SOURCE=NO"
 26944  #endif /* _GNU_SOURCE */
 26945  #ifdef __APPLE__
 26946      " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY)
 26947  #endif /* MacOS */
 26948  #if defined(_WIN32) || defined(_WIN64)
 26949      " MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT)
 26950      " MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY)
 26951  #if !MDBX_BUILD_SHARED_LIBRARY
 26952      " MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER)
 26953  #endif
 26954      " WINVER=" MDBX_STRINGIFY(WINVER)
 26955  #else /* Windows */
 26956      " MDBX_LOCKING=" MDBX_LOCKING_CONFIG
 26957      " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
 26958  #endif /* !Windows */
 26959      " MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
 26960      " MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)
 26961      " MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE)
 26962      " MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE)
 26963      " MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK)
 26964      " MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING)
 26965      ,
 26966  #ifdef MDBX_BUILD_COMPILER
 26967      MDBX_BUILD_COMPILER
 26968  #else
 26969    #ifdef __INTEL_COMPILER
 26970      "Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER)
 26971    #elif defined(__apple_build_version__)
 26972      "Apple clang " MDBX_STRINGIFY(__apple_build_version__)
 26973    #elif defined(__ibmxl__)
 26974      "IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__)
 26975      "." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__)
 26976    #elif defined(__clang__)
 26977      "clang " MDBX_STRINGIFY(__clang_version__)
 26978    #elif defined(__MINGW64__)
 26979      "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION)
 26980    #elif defined(__MINGW32__)
 26981      "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION)
 26982    #elif defined(__IBMC__)
 26983      "IBM C " MDBX_STRINGIFY(__IBMC__)
 26984    #elif defined(__GNUC__)
 26985      "GNU C/C++ "
 26986      #ifdef __VERSION__
 26987        __VERSION__
 26988      #else
 26989        MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__)
 26990      #endif
 26991    #elif defined(_MSC_VER)
 26992      "MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD)
 26993    #else
 26994      "Unknown compiler"
 26995    #endif
 26996  #endif /* MDBX_BUILD_COMPILER */
 26997      ,
 26998  #ifdef MDBX_BUILD_FLAGS_CONFIG
 26999      MDBX_BUILD_FLAGS_CONFIG
 27000  #endif /* MDBX_BUILD_FLAGS_CONFIG */
 27001  #ifdef MDBX_BUILD_FLAGS
 27002      MDBX_BUILD_FLAGS
 27003  #endif /* MDBX_BUILD_FLAGS */
 27004  #if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
 27005      "undefined (please use correct build script)"
 27006  #ifdef _MSC_VER
 27007  #pragma message("warning: Build flags undefined. Please use correct build script")
 27008  #else
 27009  #warning "Build flags undefined. Please use correct build script"
 27010  #endif // _MSC_VER
 27011  #endif
 27012  };
 27013  
 27014  #ifdef __SANITIZE_ADDRESS__
/* Tuned runtime defaults for AddressSanitizer. Declared weak so that a user's
 * own __asan_default_options() definition takes precedence at link time. */
LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() {
  return "symbolize=1:allow_addr2line=1:"
#if MDBX_DEBUG
         "debug=1:"
         "verbosity=2:"
#endif /* MDBX_DEBUG */
         "log_threads=1:"
         "report_globals=1:"
         "replace_str=1:replace_intrin=1:"
         "malloc_context_size=9:"
#if !defined(__APPLE__)
         /* presumably omitted on macOS because LeakSanitizer support there is
          * limited — TODO confirm */
         "detect_leaks=1:"
#endif
         "check_printf=1:"
         "detect_deadlocks=1:"
#ifndef LTO_ENABLED
         /* init-order checking is skipped under LTO builds */
         "check_initialization_order=1:"
#endif
         "detect_stack_use_after_return=1:"
         "intercept_tls_get_addr=1:"
         "decorate_proc_maps=1:"
         "abort_on_error=1";
}
 27038  #endif /* __SANITIZE_ADDRESS__ */
 27039  
 27040  /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
 27041  
 27042  /*
 27043   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 27044   * and other libmdbx authors: please see AUTHORS file.
 27045   * All rights reserved.
 27046   *
 27047   * Redistribution and use in source and binary forms, with or without
 27048   * modification, are permitted only as authorized by the OpenLDAP
 27049   * Public License.
 27050   *
 27051   * A copy of this license is available in the file LICENSE in the
 27052   * top-level directory of the distribution or, alternatively, at
 27053   * <http://www.OpenLDAP.org/license.html>.
 27054   */
 27055  
 27056  
 27057  #if defined(_WIN32) || defined(_WIN64)
 27058  
 27059  #include <winioctl.h>
 27060  
 27061  static int waitstatus2errcode(DWORD result) {
 27062    switch (result) {
 27063    case WAIT_OBJECT_0:
 27064      return MDBX_SUCCESS;
 27065    case WAIT_FAILED:
 27066      return (int)GetLastError();
 27067    case WAIT_ABANDONED:
 27068      return ERROR_ABANDONED_WAIT_0;
 27069    case WAIT_IO_COMPLETION:
 27070      return ERROR_USER_APC;
 27071    case WAIT_TIMEOUT:
 27072      return ERROR_TIMEOUT;
 27073    default:
 27074      return ERROR_UNHANDLED_ERROR;
 27075    }
 27076  }
 27077  
/* Map a result from an NTAPI call to WIN32 error code. */
static int ntstatus2errcode(NTSTATUS status) {
  DWORD dummy;
  OVERLAPPED ov;
  memset(&ov, 0, sizeof(ov));
  /* OVERLAPPED.Internal carries an NTSTATUS; feeding it to
   * GetOverlappedResult() makes the kernel32 routine convert it to the
   * corresponding Win32 error, apparently avoiding a direct dependency on
   * RtlNtStatusToDosError() from ntdll — NOTE(review): confirm this mapping
   * matches for all status values of interest. */
  ov.Internal = status;
  return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS
                                                       : (int)GetLastError();
}
 27087  
 27088  /* We use native NT APIs to setup the memory map, so that we can
 27089   * let the DB file grow incrementally instead of always preallocating
 27090   * the full size. These APIs are defined in <wdm.h> and <ntifs.h>
 27091   * but those headers are meant for driver-level development and
 27092   * conflict with the regular user-level headers, so we explicitly
 27093   * declare them here. Using these APIs also means we must link to
 27094   * ntdll.dll, which is not linked by default in user code. */
 27095  
 27096  extern NTSTATUS NTAPI NtCreateSection(
 27097      OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess,
 27098      IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes,
 27099      IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection,
 27100      IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle);
 27101  
 27102  typedef struct _SECTION_BASIC_INFORMATION {
 27103    ULONG Unknown;
 27104    ULONG SectionAttributes;
 27105    LARGE_INTEGER SectionSize;
 27106  } SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION;
 27107  
 27108  extern NTSTATUS NTAPI NtMapViewOfSection(
 27109      IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress,
 27110      IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize,
 27111      IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize,
 27112      IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType,
 27113      IN ULONG Win32Protect);
 27114  
 27115  extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle,
 27116                                             IN OPTIONAL PVOID BaseAddress);
 27117  
 27118  extern NTSTATUS NTAPI NtClose(HANDLE Handle);
 27119  
 27120  extern NTSTATUS NTAPI NtAllocateVirtualMemory(
 27121      IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits,
 27122      IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect);
 27123  
 27124  extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle,
 27125                                            IN PVOID *BaseAddress,
 27126                                            IN OUT PSIZE_T RegionSize,
 27127                                            IN ULONG FreeType);
 27128  
 27129  #ifndef WOF_CURRENT_VERSION
 27130  typedef struct _WOF_EXTERNAL_INFO {
 27131    DWORD Version;
 27132    DWORD Provider;
 27133  } WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO;
 27134  #endif /* WOF_CURRENT_VERSION */
 27135  
 27136  #ifndef WIM_PROVIDER_CURRENT_VERSION
 27137  #define WIM_PROVIDER_HASH_SIZE 20
 27138  
 27139  typedef struct _WIM_PROVIDER_EXTERNAL_INFO {
 27140    DWORD Version;
 27141    DWORD Flags;
 27142    LARGE_INTEGER DataSourceId;
 27143    BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE];
 27144  } WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO;
 27145  #endif /* WIM_PROVIDER_CURRENT_VERSION */
 27146  
 27147  #ifndef FILE_PROVIDER_CURRENT_VERSION
 27148  typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 {
 27149    ULONG Version;
 27150    ULONG Algorithm;
 27151    ULONG Flags;
 27152  } FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1;
 27153  #endif /* FILE_PROVIDER_CURRENT_VERSION */
 27154  
 27155  #ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED
 27156  #define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL)
 27157  #endif
 27158  #ifndef STATUS_INVALID_DEVICE_REQUEST
 27159  #define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L)
 27160  #endif
 27161  #ifndef STATUS_NOT_SUPPORTED
 27162  #define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL)
 27163  #endif
 27164  
 27165  #ifndef FILE_DEVICE_FILE_SYSTEM
 27166  #define FILE_DEVICE_FILE_SYSTEM 0x00000009
 27167  #endif
 27168  
 27169  #ifndef FSCTL_GET_EXTERNAL_BACKING
 27170  #define FSCTL_GET_EXTERNAL_BACKING                                             \
 27171    CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS)
 27172  #endif
 27173  
 27174  #ifndef ERROR_NOT_CAPABLE
 27175  #define ERROR_NOT_CAPABLE 775L
 27176  #endif
 27177  
 27178  #endif /* _WIN32 || _WIN64 */
 27179  
 27180  /*----------------------------------------------------------------------------*/
 27181  
 27182  #if defined(__ANDROID_API__)
 27183  __extern_C void __assert2(const char *file, int line, const char *function,
 27184                            const char *msg) __noreturn;
 27185  #define __assert_fail(assertion, file, line, function)                         \
 27186    __assert2(file, line, function, assertion)
 27187  
 27188  #elif defined(__UCLIBC__)
 27189  __extern_C void __assert(const char *, const char *, unsigned int, const char *)
 27190  #ifdef __THROW
 27191      __THROW
 27192  #else
 27193      __nothrow
 27194  #endif /* __THROW */
 27195      MDBX_NORETURN;
 27196  #define __assert_fail(assertion, file, line, function)                         \
 27197    __assert(assertion, file, line, function)
 27198  
 27199  #elif _POSIX_C_SOURCE > 200212 &&                                              \
 27200      /* workaround for avoid musl libc wrong prototype */ (                     \
 27201          defined(__GLIBC__) || defined(__GNU_LIBRARY__))
 27202  /* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */
 27203  __extern_C void __assert_fail(const char *assertion, const char *file,
 27204                                unsigned line, const char *function)
 27205  #ifdef __THROW
 27206      __THROW
 27207  #else
 27208      __nothrow
 27209  #endif /* __THROW */
 27210      MDBX_NORETURN;
 27211  
 27212  #elif defined(__APPLE__) || defined(__MACH__)
 27213  __extern_C void __assert_rtn(const char *function, const char *file, int line,
 27214                               const char *assertion) /* __nothrow */
 27215  #ifdef __dead2
 27216      __dead2
 27217  #else
 27218      MDBX_NORETURN
 27219  #endif /* __dead2 */
 27220  #ifdef __disable_tail_calls
 27221      __disable_tail_calls
 27222  #endif /* __disable_tail_calls */
 27223      ;
 27224  
 27225  #define __assert_fail(assertion, file, line, function)                         \
 27226    __assert_rtn(function, file, line, assertion)
 27227  #elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
 27228  __extern_C void __assert_c99(const char *assection, const char *file, int line,
 27229                               const char *function) MDBX_NORETURN;
 27230  #define __assert_fail(assertion, file, line, function)                         \
 27231    __assert_c99(assertion, file, line, function)
 27232  #elif defined(__OpenBSD__)
 27233  __extern_C __dead void __assert2(const char *file, int line,
 27234                                   const char *function,
 27235                                   const char *assertion) /* __nothrow */;
 27236  #define __assert_fail(assertion, file, line, function)                         \
 27237    __assert2(file, line, function, assertion)
 27238  #elif defined(__NetBSD__)
 27239  __extern_C __dead void __assert13(const char *file, int line,
 27240                                    const char *function,
 27241                                    const char *assertion) /* __nothrow */;
 27242  #define __assert_fail(assertion, file, line, function)                         \
 27243    __assert13(file, line, function, assertion)
 27244  #elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) ||         \
 27245      defined(__DragonFly__)
 27246  __extern_C void __assert(const char *function, const char *file, int line,
 27247                           const char *assertion) /* __nothrow */
 27248  #ifdef __dead2
 27249      __dead2
 27250  #else
 27251      MDBX_NORETURN
 27252  #endif /* __dead2 */
 27253  #ifdef __disable_tail_calls
 27254      __disable_tail_calls
 27255  #endif /* __disable_tail_calls */
 27256      ;
 27257  #define __assert_fail(assertion, file, line, function)                         \
 27258    __assert(function, file, line, assertion)
 27259  
 27260  #endif /* __assert_fail */
 27261  
/* Report a failed internal assertion and terminate the process.
 * Dispatch order: a user-installed assert hook (debug builds only, which may
 * return), else the debug logger, else the platform assertion machinery;
 * finally the process is killed via FatalExit()/abort(). */
__cold void mdbx_assert_fail(const MDBX_env *env, const char *msg,
                             const char *func, unsigned line) {
#if MDBX_DEBUG
  if (env && env->me_assert_func) {
    /* user hook decides what to do; returning here skips termination */
    env->me_assert_func(env, msg, func, line);
    return;
  }
#else
  (void)env;
#endif /* MDBX_DEBUG */

  if (debug_logger)
    debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg);
  else {
#if defined(_WIN32) || defined(_WIN64)
    char *message = nullptr;
    const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u",
                                  msg, func ? func : "unknown", line);
    if (num < 1 || !message)
      /* formatting failed (likely OOM) — fall back to a static string */
      message = "<troubles with assertion-message preparation>";
    OutputDebugStringA(message);
    if (IsDebuggerPresent())
      DebugBreak();
#else
    __assert_fail(msg, "mdbx", line, func);
#endif
  }

#if defined(_WIN32) || defined(_WIN64)
  FatalExit(ERROR_UNHANDLED_ERROR);
#else
  abort();
#endif
}
 27296  
/* Format a printf-style panic message, report it via the platform's
 * debugging/assertion facilities, and terminate the process. Never returns. */
__cold void mdbx_panic(const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);

  char *message = nullptr;
  const int num = osal_vasprintf(&message, fmt, ap);
  va_end(ap);
  /* fall back to a static string when formatting failed (e.g. OOM) */
  const char *const const_message =
      (num < 1 || !message) ? "<troubles with panic-message preparation>"
                            : message;

#if defined(_WIN32) || defined(_WIN64)
  OutputDebugStringA("\r\nMDBX-PANIC: ");
  OutputDebugStringA(const_message);
  if (IsDebuggerPresent())
    DebugBreak();
  FatalExit(ERROR_UNHANDLED_ERROR);
#else
  __assert_fail(const_message, "mdbx", 0, "panic");
  abort();
#endif
}
 27319  
 27320  /*----------------------------------------------------------------------------*/
 27321  
 27322  #ifndef osal_vasprintf
/* Portable vasprintf() replacement: formats into a buffer allocated with
 * osal_malloc(); on success the caller owns *strp and must free it.
 * Returns the number of characters written (excluding the terminator), or a
 * negative value on error — in which case *strp is set to NULL. */
MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt,
                                      va_list ap) {
  /* the sizing pass below consumes `ap`, so keep a copy for the real
   * formatting pass */
  va_list ones;
  va_copy(ones, ap);
  int needed = vsnprintf(nullptr, 0, fmt, ap);

  if (unlikely(needed < 0 || needed >= INT_MAX)) {
    *strp = nullptr;
    va_end(ones);
    return needed;
  }

  *strp = osal_malloc(needed + 1);
  if (unlikely(*strp == nullptr)) {
    va_end(ones);
#if defined(_WIN32) || defined(_WIN64)
    SetLastError(MDBX_ENOMEM);
#else
    errno = MDBX_ENOMEM;
#endif
    return -1;
  }

  /* second pass renders into the buffer; must produce the same length */
  int actual = vsnprintf(*strp, needed + 1, fmt, ones);
  va_end(ones);

  assert(actual == needed);
  if (unlikely(actual < 0)) {
    osal_free(*strp);
    *strp = nullptr;
  }
  return actual;
}
 27356  #endif /* osal_vasprintf */
 27357  
 27358  #ifndef osal_asprintf
 27359  MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) {
 27360    va_list ap;
 27361    va_start(ap, fmt);
 27362    int rc = osal_vasprintf(strp, fmt, ap);
 27363    va_end(ap);
 27364    return rc;
 27365  }
 27366  #endif /* osal_asprintf */
 27367  
 27368  #ifndef osal_memalign_alloc
/* Allocate `bytes` of memory aligned to `alignment` (must be a power of two
 * and at least sizeof(void*)); the pointer is stored into *result.
 * Returns MDBX_SUCCESS or an errno-style error code. */
MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes,
                                           void **result) {
  assert(is_powerof2(alignment) && alignment >= sizeof(void *));
#if defined(_WIN32) || defined(_WIN64)
  /* VirtualAlloc() regions are page-granular, which satisfies any alignment
   * the assert above permits */
  (void)alignment;
  *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
  return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */;
#elif defined(_ISOC11_SOURCE)
  /* C11 aligned_alloc() requires the size to be a multiple of alignment */
  *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment));
  return *result ? MDBX_SUCCESS : errno;
#elif _POSIX_VERSION >= 200112L &&                                             \
    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17)
  *result = nullptr;
  return posix_memalign(result, alignment, bytes);
#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L
  *result = memalign(alignment, bytes);
  return *result ? MDBX_SUCCESS : errno;
#else
#error FIXME
#endif
}
 27390  #endif /* osal_memalign_alloc */
 27391  
 27392  #ifndef osal_memalign_free
/* Release memory obtained from osal_memalign_alloc(), matching the
 * allocator used by the corresponding platform branch there. */
MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) {
#if defined(_WIN32) || defined(_WIN64)
  VirtualFree(ptr, 0, MEM_RELEASE);
#else
  osal_free(ptr);
#endif
}
 27400  #endif /* osal_memalign_free */
 27401  
 27402  #ifndef osal_strdup
 27403  char *osal_strdup(const char *str) {
 27404    if (!str)
 27405      return NULL;
 27406    size_t bytes = strlen(str) + 1;
 27407    char *dup = osal_malloc(bytes);
 27408    if (dup)
 27409      memcpy(dup, str, bytes);
 27410    return dup;
 27411  }
 27412  #endif /* osal_strdup */
 27413  
 27414  /*----------------------------------------------------------------------------*/
 27415  
/* Initialize a condpair: one mutex plus a pair of condition variables
 * (auto-reset events on Windows). On any failure the objects created so far
 * are torn down via the goto-cleanup ladder and the structure is wiped.
 * Returns MDBX_SUCCESS or a system error code. */
MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) {
  int rc;
  memset(condpair, 0, sizeof(osal_condpair_t));
#if defined(_WIN32) || defined(_WIN64)
  if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) {
    rc = (int)GetLastError();
    goto bailout_mutex;
  }
  /* auto-reset events, initially non-signaled */
  if ((condpair->event[0] = CreateEventW(NULL, FALSE, FALSE, NULL)) == NULL) {
    rc = (int)GetLastError();
    goto bailout_event;
  }
  if ((condpair->event[1] = CreateEventW(NULL, FALSE, FALSE, NULL)) != NULL)
    return MDBX_SUCCESS;

  rc = (int)GetLastError();
  (void)CloseHandle(condpair->event[0]);
bailout_event:
  (void)CloseHandle(condpair->mutex);
#else
  rc = pthread_mutex_init(&condpair->mutex, NULL);
  if (unlikely(rc != 0))
    goto bailout_mutex;
  rc = pthread_cond_init(&condpair->cond[0], NULL);
  if (unlikely(rc != 0))
    goto bailout_cond;
  rc = pthread_cond_init(&condpair->cond[1], NULL);
  if (likely(rc == 0))
    return MDBX_SUCCESS;

  (void)pthread_cond_destroy(&condpair->cond[0]);
bailout_cond:
  (void)pthread_mutex_destroy(&condpair->mutex);
#endif
bailout_mutex:
  /* wipe partially-initialized state so a later destroy is harmless */
  memset(condpair, 0, sizeof(osal_condpair_t));
  return rc;
}
 27454  
/* Destroy all three condpair objects unconditionally, wipe the structure,
 * and return MDBX_SUCCESS or the error from the last failing destroy step. */
MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
  int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
  rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError();
  rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError();
#else
  int err, rc = pthread_mutex_destroy(&condpair->mutex);
  rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc;
  rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc;
#endif
  memset(condpair, 0, sizeof(osal_condpair_t));
  return rc;
}
 27468  
/* Acquire the condpair's mutex, blocking indefinitely.
 * Returns MDBX_SUCCESS or a system error code. */
MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
  DWORD code = WaitForSingleObject(condpair->mutex, INFINITE);
  return waitstatus2errcode(code);
#else
  return osal_pthread_mutex_lock(&condpair->mutex);
#endif
}
 27477  
/* Release the condpair's mutex previously taken by osal_condpair_lock(). */
MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
  return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
#else
  return pthread_mutex_unlock(&condpair->mutex);
#endif
}
 27485  
/* Wake one waiter on the condition selected by `part` (0 or 1). */
MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
                                            bool part) {
#if defined(_WIN32) || defined(_WIN64)
  return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError();
#else
  return pthread_cond_signal(&condpair->cond[part]);
#endif
}
 27494  
/* Condition-variable wait on the side selected by `part`: the caller must
 * hold the condpair mutex; it is released while waiting and re-acquired
 * before returning, mirroring pthread_cond_wait() semantics. */
MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair,
                                          bool part) {
#if defined(_WIN32) || defined(_WIN64)
  /* SignalObjectAndWait() atomically releases the mutex and waits on the
   * event, emulating pthread_cond_wait(); afterwards the mutex is re-taken */
  DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part],
                                   INFINITE, FALSE);
  if (code == WAIT_OBJECT_0) {
    code = WaitForSingleObject(condpair->mutex, INFINITE);
    if (code == WAIT_OBJECT_0)
      return MDBX_SUCCESS;
  }
  return waitstatus2errcode(code);
#else
  return pthread_cond_wait(&condpair->cond[part], &condpair->mutex);
#endif
}
 27510  
 27511  /*----------------------------------------------------------------------------*/
 27512  
/* Initialize a process-private fast mutex (CRITICAL_SECTION on Windows,
 * pthread mutex elsewhere). Returns MDBX_SUCCESS or an errno value. */
MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) {
#if defined(_WIN32) || defined(_WIN64)
  InitializeCriticalSection(fastmutex);
  return MDBX_SUCCESS;
#else
  return pthread_mutex_init(fastmutex, NULL);
#endif
}
 27521  
/* Destroy a fast mutex created by osal_fastmutex_init(). */
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) {
#if defined(_WIN32) || defined(_WIN64)
  DeleteCriticalSection(fastmutex);
  return MDBX_SUCCESS;
#else
  return pthread_mutex_destroy(fastmutex);
#endif
}
 27530  
/* Acquire a fast mutex. On Windows, EnterCriticalSection() may raise
 * EXCEPTION_POSSIBLE_DEADLOCK (e.g. for an orphaned critical section); the
 * SEH filter converts exactly that exception into ERROR_POSSIBLE_DEADLOCK
 * while letting any other exception propagate. */
MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) {
#if defined(_WIN32) || defined(_WIN64)
  __try {
    EnterCriticalSection(fastmutex);
  } __except (
      (GetExceptionCode() ==
       0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
          ? EXCEPTION_EXECUTE_HANDLER
          : EXCEPTION_CONTINUE_SEARCH) {
    return ERROR_POSSIBLE_DEADLOCK;
  }
  return MDBX_SUCCESS;
#else
  return osal_pthread_mutex_lock(fastmutex);
#endif
}
 27547  
/* Release a fast mutex previously taken by osal_fastmutex_acquire(). */
MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(fastmutex);
  return MDBX_SUCCESS;
#else
  return pthread_mutex_unlock(fastmutex);
#endif
}
 27556  
 27557  /*----------------------------------------------------------------------------*/
 27558  
 27559  #if defined(_WIN32) || defined(_WIN64)
 27560  
 27561  #ifndef WC_ERR_INVALID_CHARS
/* The WC_ERR_INVALID_CHARS flag (0x80) is honoured only since Windows Vista
 * (major version 6); resolve it to 0 on older versions at runtime. */
static const DWORD WC_ERR_INVALID_CHARS =
    (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion())))
        ? 0x00000080
        : 0;
 27566  #endif /* WC_ERR_INVALID_CHARS */
 27567  
/* Convert a multi-byte string (current thread's ANSI code page) to a wide
 * string; returns the number of wide characters produced, or 0 on failure
 * (MultiByteToWideChar convention — invalid input bytes are rejected). */
MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
                                    size_t src_n) {
  return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src,
                             (int)src_n, dst, (int)dst_n);
}
 27573  
 27574  #endif /* Windows */
 27575  
 27576  /*----------------------------------------------------------------------------*/
 27577  
 27578  MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) {
 27579  #if defined(_WIN32) || defined(_WIN64)
 27580    return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
 27581  #else
 27582    return unlink(pathname) ? errno : MDBX_SUCCESS;
 27583  #endif
 27584  }
 27585  
 27586  #if !(defined(_WIN32) || defined(_WIN64))
/* Best-effort probe whether `fd` is an open descriptor: isatty() failing
 * with EBADF indicates the descriptor is not open.
 * NOTE(review): POSIX specifies that isatty() returns 0 — not a negative
 * value — on error, so the `< 0` comparison may never match on common libcs;
 * confirm the intended semantics against the supported platforms. */
static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); }
 27588  #endif /*! Windows */
 27589  
 27590  MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) {
 27591  #if defined(_WIN32) || defined(_WIN64)
 27592    return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
 27593  #else
 27594    return rmdir(pathname) ? errno : MDBX_SUCCESS;
 27595  #endif
 27596  }
 27597  
 27598  MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
 27599                                       const MDBX_env *env,
 27600                                       const pathchar_t *pathname,
 27601                                       mdbx_filehandle_t *fd,
 27602                                       mdbx_mode_t unix_mode_bits) {
 27603    *fd = INVALID_HANDLE_VALUE;
 27604  
 27605  #if defined(_WIN32) || defined(_WIN64)
 27606    DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING;
 27607    DWORD FlagsAndAttributes =
 27608        FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
 27609    DWORD DesiredAccess = FILE_READ_ATTRIBUTES;
 27610    DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE)
 27611                          ? 0
 27612                          : (FILE_SHARE_READ | FILE_SHARE_WRITE);
 27613  
 27614    switch (purpose) {
 27615    default:
 27616      return ERROR_INVALID_PARAMETER;
 27617    case MDBX_OPEN_LCK:
 27618      CreationDisposition = OPEN_ALWAYS;
 27619      DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
 27620      FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY;
 27621      break;
 27622    case MDBX_OPEN_DXB_READ:
 27623      CreationDisposition = OPEN_EXISTING;
 27624      DesiredAccess |= GENERIC_READ;
 27625      ShareMode |= FILE_SHARE_READ;
 27626      break;
 27627    case MDBX_OPEN_DXB_LAZY:
 27628      DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
 27629      break;
 27630    case MDBX_OPEN_DXB_DSYNC:
 27631      CreationDisposition = OPEN_EXISTING;
 27632      DesiredAccess |= GENERIC_WRITE;
 27633      FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH;
 27634      break;
 27635    case MDBX_OPEN_COPY:
 27636      CreationDisposition = CREATE_NEW;
 27637      ShareMode = 0;
 27638      DesiredAccess |= GENERIC_WRITE;
 27639      FlagsAndAttributes |=
 27640          (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING;
 27641      break;
 27642    case MDBX_OPEN_DELETE:
 27643      CreationDisposition = OPEN_EXISTING;
 27644      ShareMode |= FILE_SHARE_DELETE;
 27645      DesiredAccess =
 27646          FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE;
 27647      break;
 27648    }
 27649  
 27650    *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL,
 27651                      CreationDisposition, FlagsAndAttributes, NULL);
 27652    if (*fd == INVALID_HANDLE_VALUE) {
 27653      int err = (int)GetLastError();
 27654      if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) {
 27655        if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES &&
 27656            GetLastError() == ERROR_FILE_NOT_FOUND)
 27657          err = ERROR_FILE_NOT_FOUND;
 27658      }
 27659      return err;
 27660    }
 27661  
 27662    BY_HANDLE_FILE_INFORMATION info;
 27663    if (!GetFileInformationByHandle(*fd, &info)) {
 27664      int err = (int)GetLastError();
 27665      CloseHandle(*fd);
 27666      *fd = INVALID_HANDLE_VALUE;
 27667      return err;
 27668    }
 27669    const DWORD AttributesDiff =
 27670        (info.dwFileAttributes ^ FlagsAndAttributes) &
 27671        (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED |
 27672         FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED);
 27673    if (AttributesDiff)
 27674      (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff);
 27675  
 27676  #else
 27677    int flags = unix_mode_bits ? O_CREAT : 0;
 27678    switch (purpose) {
 27679    default:
 27680      return EINVAL;
 27681    case MDBX_OPEN_LCK:
 27682      flags |= O_RDWR;
 27683      break;
 27684    case MDBX_OPEN_DXB_READ:
 27685      flags = O_RDONLY;
 27686      break;
 27687    case MDBX_OPEN_DXB_LAZY:
 27688      flags |= O_RDWR;
 27689      break;
 27690    case MDBX_OPEN_COPY:
 27691      flags = O_CREAT | O_WRONLY | O_EXCL;
 27692      break;
 27693    case MDBX_OPEN_DXB_DSYNC:
 27694      flags |= O_WRONLY;
 27695  #if defined(O_DSYNC)
 27696      flags |= O_DSYNC;
 27697  #elif defined(O_SYNC)
 27698      flags |= O_SYNC;
 27699  #elif defined(O_FSYNC)
 27700      flags |= O_FSYNC;
 27701  #endif
 27702      break;
 27703    case MDBX_OPEN_DELETE:
 27704      flags = O_RDWR;
 27705      break;
 27706    }
 27707  
 27708    const bool direct_nocache_for_copy =
 27709        env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY;
 27710    if (direct_nocache_for_copy) {
 27711  #if defined(O_DIRECT)
 27712      flags |= O_DIRECT;
 27713  #endif /* O_DIRECT */
 27714  #if defined(O_NOCACHE)
 27715      flags |= O_NOCACHE;
 27716  #endif /* O_NOCACHE */
 27717    }
 27718  
 27719  #ifdef O_CLOEXEC
 27720    flags |= O_CLOEXEC;
 27721  #endif /* O_CLOEXEC */
 27722  
 27723    /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */
 27724  #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
 27725    int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1;
 27726    static const char dev_null[] = "/dev/null";
 27727    if (!is_valid_fd(STDIN_FILENO)) {
 27728      WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN",
 27729              STDIN_FILENO, dev_null);
 27730      stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY);
 27731    }
 27732    if (!is_valid_fd(STDOUT_FILENO)) {
 27733      WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT",
 27734              STDOUT_FILENO, dev_null);
 27735      stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY);
 27736    }
 27737    if (!is_valid_fd(STDERR_FILENO)) {
 27738      WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR",
 27739              STDERR_FILENO, dev_null);
 27740      stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY);
 27741    }
 27742  #else
 27743  #error "Unexpected or unsupported UNIX or POSIX system"
 27744  #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
 27745  
 27746    *fd = open(pathname, flags, unix_mode_bits);
 27747  #if defined(O_DIRECT)
 27748    if (*fd < 0 && (flags & O_DIRECT) &&
 27749        (errno == EINVAL || errno == EAFNOSUPPORT)) {
 27750      flags &= ~(O_DIRECT | O_EXCL);
 27751      *fd = open(pathname, flags, unix_mode_bits);
 27752    }
 27753  #endif /* O_DIRECT */
 27754  
 27755    if (*fd < 0 && errno == EACCES && purpose == MDBX_OPEN_LCK) {
 27756      struct stat unused;
 27757      if (stat(pathname, &unused) == 0 || errno != ENOENT)
 27758        errno = EACCES /* restore errno if file exists */;
 27759    }
 27760  
 27761    /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */
 27762  #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
 27763    if (*fd == STDIN_FILENO) {
 27764      WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN",
 27765              STDIN_FILENO);
 27766      assert(stub_fd0 == -1);
 27767      *fd = dup(stub_fd0 = *fd);
 27768    }
 27769    if (*fd == STDOUT_FILENO) {
 27770      WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT",
 27771              STDOUT_FILENO);
 27772      assert(stub_fd1 == -1);
 27773      *fd = dup(stub_fd1 = *fd);
 27774    }
 27775    if (*fd == STDERR_FILENO) {
 27776      WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR",
 27777              STDERR_FILENO);
 27778      assert(stub_fd2 == -1);
 27779      *fd = dup(stub_fd2 = *fd);
 27780    }
 27781    if (stub_fd0 != -1)
 27782      close(stub_fd0);
 27783    if (stub_fd1 != -1)
 27784      close(stub_fd1);
 27785    if (stub_fd2 != -1)
 27786      close(stub_fd2);
 27787    if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) {
 27788      ERROR("Rejecting the use of a FD in the range "
 27789            "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption",
 27790            STDIN_FILENO, STDERR_FILENO);
 27791      close(*fd);
 27792      return EBADF;
 27793    }
 27794  #else
 27795  #error "Unexpected or unsupported UNIX or POSIX system"
 27796  #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
 27797  
 27798    if (*fd < 0)
 27799      return errno;
 27800  
 27801  #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
 27802    const int fd_flags = fcntl(*fd, F_GETFD);
 27803    if (fd_flags != -1)
 27804      (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC);
 27805  #endif /* FD_CLOEXEC && !O_CLOEXEC */
 27806  
 27807    if (direct_nocache_for_copy) {
 27808  #if defined(F_NOCACHE) && !defined(O_NOCACHE)
 27809      (void)fcntl(*fd, F_NOCACHE, 1);
 27810  #endif /* F_NOCACHE */
 27811    }
 27812  
 27813  #endif
 27814    return MDBX_SUCCESS;
 27815  }
 27816  
 27817  MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) {
 27818  #if defined(_WIN32) || defined(_WIN64)
 27819    return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError();
 27820  #else
 27821    assert(fd > STDERR_FILENO);
 27822    return (close(fd) == 0) ? MDBX_SUCCESS : errno;
 27823  #endif
 27824  }
 27825  
/* Reads exactly `bytes` at absolute `offset` without changing the file
 * position; a short read is reported as MDBX_ENODATA. */
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes,
                                  uint64_t offset) {
  if (bytes > MAX_WRITE)
    return MDBX_EINVAL;
#if defined(_WIN32) || defined(_WIN64)
  /* emulate pread() via ReadFile() with an explicit OVERLAPPED offset */
  OVERLAPPED ov;
  ov.hEvent = 0;
  ov.Offset = (DWORD)offset;
  ov.OffsetHigh = HIGH_DWORD(offset);

  DWORD read = 0;
  if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) {
    int rc = (int)GetLastError();
    /* never let a failed call report "success" */
    return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc;
  }
#else
  STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                    "libmdbx requires 64-bit file I/O on 64-bit systems");
  intptr_t read = pread(fd, buf, bytes, offset);
  if (read < 0) {
    int rc = errno;
    return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc;
  }
#endif
  return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA;
}
 27852  
/* Writes `bytes` at absolute `offset`, retrying on EINTR and resuming
 * after partial writes until everything is written or a hard error occurs.
 * Requests larger than MAX_WRITE are issued in MAX_WRITE-sized chunks. */
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
                                   size_t bytes, uint64_t offset) {
  while (true) {
#if defined(_WIN32) || defined(_WIN64)
    /* emulate pwrite() via WriteFile() with an explicit OVERLAPPED offset */
    OVERLAPPED ov;
    ov.hEvent = 0;
    ov.Offset = (DWORD)offset;
    ov.OffsetHigh = HIGH_DWORD(offset);

    DWORD written;
    if (unlikely(!WriteFile(
            fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
            &written, &ov)))
      return (int)GetLastError();
    if (likely(bytes == written))
      return MDBX_SUCCESS;
#else
    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                      "libmdbx requires 64-bit file I/O on 64-bit systems");
    const intptr_t written =
        pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset);
    if (likely(bytes == (size_t)written))
      return MDBX_SUCCESS;
    if (written < 0) {
      const int rc = errno;
      if (rc != EINTR)
        return rc;
      /* interrupted by a signal: retry the same chunk */
      continue;
    }
#endif
    /* partial write: advance past the written prefix and continue */
    bytes -= written;
    offset += written;
    buf = (char *)buf + written;
  }
}
 27888  
/* Writes `bytes` at the current file position, retrying on EINTR and
 * resuming after partial writes; requests larger than MAX_WRITE are
 * issued in MAX_WRITE-sized chunks. */
MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
                                  size_t bytes) {
  while (true) {
#if defined(_WIN32) || defined(_WIN64)
    DWORD written;
    if (unlikely(!WriteFile(
            fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
            &written, nullptr)))
      return (int)GetLastError();
    if (likely(bytes == written))
      return MDBX_SUCCESS;
#else
    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                      "libmdbx requires 64-bit file I/O on 64-bit systems");
    const intptr_t written =
        write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE);
    if (likely(bytes == (size_t)written))
      return MDBX_SUCCESS;
    if (written < 0) {
      const int rc = errno;
      if (rc != EINTR)
        return rc;
      /* interrupted by a signal: retry */
      continue;
    }
#endif
    /* partial write: advance past the written prefix and continue */
    bytes -= written;
    buf = (char *)buf + written;
  }
}
 27918  
/* Positional gathered write: writes `iovcnt` buffers starting at absolute
 * `offset`. On platforms without pwritev() (Windows, macOS, Android < 24)
 * it is emulated with a sequence of osal_pwrite() calls. Returns MDBX_EIO
 * when the total written differs from `expected_written`. */
int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
                 uint64_t offset, size_t expected_written) {
#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) ||                \
    (defined(__ANDROID_API__) && __ANDROID_API__ < 24)
  size_t written = 0;
  for (int i = 0; i < iovcnt; ++i) {
    int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    written += iov[i].iov_len;
    offset += iov[i].iov_len;
  }
  return (expected_written == written) ? MDBX_SUCCESS
                                       : MDBX_EIO /* ERROR_WRITE_FAULT */;
#else
  int rc;
  intptr_t written;
  do {
    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                      "libmdbx requires 64-bit file I/O on 64-bit systems");
    written = pwritev(fd, iov, iovcnt, offset);
    if (likely(expected_written == (size_t)written))
      return MDBX_SUCCESS;
    /* NOTE(review): on a partial (non-negative) pwritev() the errno read
     * here is stale; a leftover EINTR would re-issue the whole request at
     * the same offset, which is idempotent but worth confirming. */
    rc = errno;
  } while (rc == EINTR);
  return (written < 0) ? rc : MDBX_EIO /* Use which error code? */;
#endif
}
 27947  
/* Flushes file data and/or metadata according to the combination of
 * MDBX_SYNC_DATA / MDBX_SYNC_SIZE / MDBX_SYNC_IODQ in `mode_bits`,
 * retrying on EINTR. */
MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
                                  enum osal_syncmode_bits mode_bits) {
#if defined(_WIN32) || defined(_WIN64)
  if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd))
    return (int)GetLastError();
  return MDBX_SUCCESS;
#else

#if defined(__APPLE__) &&                                                      \
    MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
  /* on macOS plain fsync() does not force the drive cache; F_FULLFSYNC
   * is required for real durability down to the platter */
  if (mode_bits & MDBX_SYNC_IODQ)
    return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
#endif /* MacOS */

  /* LY: This approach is always safe and without appreciable performance
   * degradation, even on a kernel with fdatasync's bug.
   *
   * For more info about of a corresponding fdatasync() bug
   * see http://www.spinics.net/lists/linux-ext4/msg33714.html */
  while (1) {
    switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) {
    case MDBX_SYNC_NONE:
      return MDBX_SUCCESS /* nothing to do */;
#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
    case MDBX_SYNC_DATA:
      if (fdatasync(fd) == 0)
        return MDBX_SUCCESS;
      break /* error */;
#if defined(__linux__) || defined(__gnu_linux__)
    case MDBX_SYNC_SIZE:
      /* on Linux >= 3.6 the file size is committed together with the data,
       * so a size-only sync needs no syscall; older kernels fall back to
       * the full fsync() below */
      if (linux_kernel_version >= 0x03060000)
        return MDBX_SUCCESS;
      __fallthrough /* fall through */;
#endif /* Linux */
#endif /* _POSIX_SYNCHRONIZED_IO > 0 */
    default:
      if (fsync(fd) == 0)
        return MDBX_SUCCESS;
    }

    /* the sync call failed: bail out unless merely interrupted */
    int rc = errno;
    if (rc != EINTR)
      return rc;
  }
#endif
}
 27994  
 27995  int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) {
 27996  #if defined(_WIN32) || defined(_WIN64)
 27997    BY_HANDLE_FILE_INFORMATION info;
 27998    if (!GetFileInformationByHandle(fd, &info))
 27999      return (int)GetLastError();
 28000    *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
 28001  #else
 28002    struct stat st;
 28003  
 28004    STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
 28005                      "libmdbx requires 64-bit file I/O on 64-bit systems");
 28006    if (fstat(fd, &st))
 28007      return errno;
 28008  
 28009    *length = st.st_size;
 28010  #endif
 28011    return MDBX_SUCCESS;
 28012  }
 28013  
 28014  MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) {
 28015  #if defined(_WIN32) || defined(_WIN64)
 28016    switch (GetFileType(fd)) {
 28017    case FILE_TYPE_DISK:
 28018      return MDBX_RESULT_FALSE;
 28019    case FILE_TYPE_CHAR:
 28020    case FILE_TYPE_PIPE:
 28021      return MDBX_RESULT_TRUE;
 28022    default:
 28023      return (int)GetLastError();
 28024    }
 28025  #else
 28026    struct stat info;
 28027    if (fstat(fd, &info))
 28028      return errno;
 28029    switch (info.st_mode & S_IFMT) {
 28030    case S_IFBLK:
 28031    case S_IFREG:
 28032      return MDBX_RESULT_FALSE;
 28033    case S_IFCHR:
 28034    case S_IFIFO:
 28035    case S_IFSOCK:
 28036      return MDBX_RESULT_TRUE;
 28037    case S_IFDIR:
 28038    case S_IFLNK:
 28039    default:
 28040      return MDBX_INCOMPATIBLE;
 28041    }
 28042  #endif
 28043  }
 28044  
/* Sets the size of the file behind `fd` to exactly `length` bytes
 * (growing or shrinking as needed). */
MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
#if defined(_WIN32) || defined(_WIN64)
  if (mdbx_SetFileInformationByHandle) {
    /* Vista+: set the end-of-file mark directly, without disturbing
     * the file pointer */
    FILE_END_OF_FILE_INFO EndOfFileInfo;
    EndOfFileInfo.EndOfFile.QuadPart = length;
    return mdbx_SetFileInformationByHandle(fd, FileEndOfFileInfo,
                                           &EndOfFileInfo,
                                           sizeof(FILE_END_OF_FILE_INFO))
               ? MDBX_SUCCESS
               : (int)GetLastError();
  } else {
    /* legacy fallback: seek to the target offset, then truncate there */
    LARGE_INTEGER li;
    li.QuadPart = length;
    return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd))
               ? MDBX_SUCCESS
               : (int)GetLastError();
  }
#else
  STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                    "libmdbx requires 64-bit file I/O on 64-bit systems");
  return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno;
#endif
}
 28068  
 28069  MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) {
 28070  #if defined(_WIN32) || defined(_WIN64)
 28071    LARGE_INTEGER li;
 28072    li.QuadPart = pos;
 28073    return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? MDBX_SUCCESS
 28074                                                      : (int)GetLastError();
 28075  #else
 28076    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
 28077                      "libmdbx requires 64-bit file I/O on 64-bit systems");
 28078    return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS;
 28079  #endif
 28080  }
 28081  
 28082  /*----------------------------------------------------------------------------*/
 28083  
/* Spawns a new thread executing start_routine(arg).
 * Returns MDBX_SUCCESS/0, or a system/pthread error code. */
MDBX_INTERNAL_FUNC int
osal_thread_create(osal_thread_t *thread,
                   THREAD_RESULT(THREAD_CALL *start_routine)(void *),
                   void *arg) {
#if defined(_WIN32) || defined(_WIN64)
  *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL);
  return *thread ? MDBX_SUCCESS : (int)GetLastError();
#else
  return pthread_create(thread, NULL, start_routine, arg);
#endif
}
 28095  
/* Waits for `thread` to terminate; its exit value is discarded. */
MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) {
#if defined(_WIN32) || defined(_WIN64)
  DWORD code = WaitForSingleObject(thread, INFINITE);
  return waitstatus2errcode(code);
#else
  /* self-initialized so the pointer is never "uninitialized" for
   * compilers/analyzers; the retval written by pthread_join() is
   * intentionally ignored */
  void *unused_retval = &unused_retval;
  return pthread_join(thread, &unused_retval);
#endif
}
 28105  
 28106  /*----------------------------------------------------------------------------*/
 28107  
/* Flushes dirty pages of the mapped range [offset, offset+length) and
 * then syncs the backing file as requested by `mode_bits`. */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum osal_syncmode_bits mode_bits) {
  uint8_t *ptr = (uint8_t *)map->address + offset;
#if defined(_WIN32) || defined(_WIN64)
  if (!FlushViewOfFile(ptr, length))
    return (int)GetLastError();
#else
#if defined(__linux__) || defined(__gnu_linux__)
  if (mode_bits == MDBX_SYNC_NONE && linux_kernel_version > 0x02061300)
    /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly
     * tracks dirty pages and flushes them to storage as necessary. */
    return MDBX_SUCCESS;
#endif /* Linux */
  if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC))
    return errno;
  /* data was flushed by msync(MS_SYNC) above; only size/IODQ bits remain
   * for the fsync step below */
  mode_bits &= ~MDBX_SYNC_DATA;
#endif
  return osal_fsync(map->fd, mode_bits);
}
 28128  
/* Checks whether the filesystem holding `pathname`/`handle` is mounted
 * read-only: returns MDBX_SUCCESS if it is, otherwise propagates the
 * original error `err` (or MDBX_EACCESS) so the caller can distinguish
 * "read-only volume" from a genuine access failure. */
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
                                            const pathchar_t *pathname,
                                            int err) {
#if defined(_WIN32) || defined(_WIN64)
  (void)pathname;
  (void)err;
  if (!mdbx_GetVolumeInformationByHandleW)
    return MDBX_ENOSYS;
  DWORD unused, flags;
  if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused,
                                          &flags, nullptr, 0))
    return (int)GetLastError();
  if ((flags & FILE_READ_ONLY_VOLUME) == 0)
    return MDBX_EACCESS;
#else
  struct statvfs info;
  if (err != MDBX_ENOFILE) {
    /* the file exists: a writable filesystem means the original error
     * was genuine, so hand it back unchanged */
    if (statvfs(pathname, &info) == 0 && (info.f_flag & ST_RDONLY) == 0)
      return err;
    if (errno != MDBX_ENOFILE)
      return errno;
  }
  if (fstatvfs(handle, &info))
    return errno;
  if ((info.f_flag & ST_RDONLY) == 0)
    return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err;
#endif /* !Windows */
  return MDBX_SUCCESS;
}
 28158  
/* Verifies that the filesystem behind `handle` is local and safe for
 * memory-mapped I/O: network/remote/offline/externally-backed mounts are
 * rejected (MDBX_EREMOTE or a platform error) unless the environment is
 * opened in MDBX_EXCLUSIVE mode. */
static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) {
#if defined(_WIN32) || defined(_WIN64)
  if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE))
    return ERROR_NOT_CAPABLE /* workaround for Wine */;

  if (GetFileType(handle) != FILE_TYPE_DISK)
    return ERROR_FILE_OFFLINE;

  if (mdbx_GetFileInformationByHandleEx) {
    FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo;
    if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo,
                                          &RemoteProtocolInfo,
                                          sizeof(RemoteProtocolInfo))) {
      if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) &&
          !(flags & MDBX_RDONLY))
        return ERROR_FILE_OFFLINE;
      if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) &&
          !(flags & MDBX_EXCLUSIVE))
        return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
    }
  }

  if (mdbx_NtFsControlFile) {
    /* reject files with external backing (e.g. WIM-backed/overlay files)
     * unless opened exclusively */
    NTSTATUS rc;
    struct {
      WOF_EXTERNAL_INFO wof_info;
      union {
        WIM_PROVIDER_EXTERNAL_INFO wim_info;
        FILE_PROVIDER_EXTERNAL_INFO_V1 file_info;
      };
      size_t reserved_for_microsoft_madness[42];
    } GetExternalBacking_OutputBuffer;
    IO_STATUS_BLOCK StatusBlock;
    rc = mdbx_NtFsControlFile(handle, NULL, NULL, NULL, &StatusBlock,
                              FSCTL_GET_EXTERNAL_BACKING, NULL, 0,
                              &GetExternalBacking_OutputBuffer,
                              sizeof(GetExternalBacking_OutputBuffer));
    if (NT_SUCCESS(rc)) {
      if (!(flags & MDBX_EXCLUSIVE))
        return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
    } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED &&
               rc != STATUS_INVALID_DEVICE_REQUEST &&
               rc != STATUS_NOT_SUPPORTED)
      return ntstatus2errcode(rc);
  }

  if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) {
    WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX);
    if (!PathBuffer)
      return MDBX_ENOMEM;

    int rc = MDBX_SUCCESS;
    DWORD VolumeSerialNumber, FileSystemFlags;
    if (!mdbx_GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX,
                                            &VolumeSerialNumber, NULL,
                                            &FileSystemFlags, NULL, 0)) {
      rc = (int)GetLastError();
      goto bailout;
    }

    if ((flags & MDBX_RDONLY) == 0) {
      /* writing requires an ordinary writable, non-compressed volume */
      if (FileSystemFlags &
          (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME |
           FILE_VOLUME_IS_COMPRESSED)) {
        rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
        goto bailout;
      }
    }

    if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
                                       FILE_NAME_NORMALIZED | VOLUME_NAME_NT)) {
      /* \Device\Mup\ is the Multiple UNC Provider, i.e. a network path */
      if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) {
        if (!(flags & MDBX_EXCLUSIVE)) {
          rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
          goto bailout;
        }
      }
    }

    if (F_ISSET(flags, MDBX_RDONLY | MDBX_EXCLUSIVE) &&
        (FileSystemFlags & FILE_READ_ONLY_VOLUME)) {
      /* without-LCK (exclusive readonly) mode for DB on a read-only volume */
      goto bailout;
    }

    if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
                                       FILE_NAME_NORMALIZED |
                                           VOLUME_NAME_DOS)) {
      UINT DriveType = GetDriveTypeW(PathBuffer);
      if (DriveType == DRIVE_NO_ROOT_DIR &&
          _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 &&
          _wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) {
        /* retry with the bare "X:\" drive root extracted from \\?\X:\... */
        PathBuffer[7] = 0;
        DriveType = GetDriveTypeW(PathBuffer + 4);
      }
      switch (DriveType) {
      case DRIVE_CDROM:
        if (flags & MDBX_RDONLY)
          break;
      // fall through
      case DRIVE_UNKNOWN:
      case DRIVE_NO_ROOT_DIR:
      case DRIVE_REMOTE:
      default:
        if (!(flags & MDBX_EXCLUSIVE))
          rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
      // fall through
      case DRIVE_REMOVABLE:
      case DRIVE_FIXED:
      case DRIVE_RAMDISK:
        break;
      }
    }

  bailout:
    osal_free(PathBuffer);
    return rc;
  }

#else

  struct statvfs statvfs_info;
  if (fstatvfs(handle, &statvfs_info))
    return errno;
#if defined(ST_LOCAL) || defined(ST_EXPORTED)
  const unsigned long st_flags = statvfs_info.f_flag;
#endif /* ST_LOCAL || ST_EXPORTED */

  /* per-platform: obtain the filesystem's type name and/or magic number */
#if defined(__NetBSD__)
  const unsigned type = 0;
  const char *const name = statvfs_info.f_fstypename;
  const size_t name_len = VFS_NAMELEN;
#elif defined(_AIX) || defined(__OS400__)
  const char *const name = statvfs_info.f_basetype;
  const size_t name_len = sizeof(statvfs_info.f_basetype);
  struct stat st;
  if (fstat(handle, &st))
    return errno;
  const unsigned type = st.st_vfstype;
  if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#elif defined(FSTYPSZ) || defined(_FSTYPSZ)
  const unsigned type = 0;
  const char *const name = statvfs_info.f_basetype;
  const size_t name_len = sizeof(statvfs_info.f_basetype);
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) ||                \
    defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ)
  const unsigned type = 0;
  struct stat st;
  if (fstat(handle, &st))
    return errno;
  const char *const name = st.st_fstype;
  const size_t name_len = strlen(name);
#else
  struct statfs statfs_info;
  if (fstatfs(handle, &statfs_info))
    return errno;
#if defined(__OpenBSD__)
  const unsigned type = 0;
#else
  const unsigned type = statfs_info.f_type;
#endif
#if defined(MNT_LOCAL) || defined(MNT_EXPORTED)
  const unsigned long mnt_flags = statfs_info.f_flags;
#endif /* MNT_LOCAL || MNT_EXPORTED */
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
    defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) ||         \
    defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) ||          \
    defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN)
  const char *const name = statfs_info.f_fstypename;
  const size_t name_len = sizeof(statfs_info.f_fstypename);
#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21
  const char *const name = "";
  const unsigned name_len = 0;
#else

  /* last resort (e.g. glibc Linux): scan the mount table for the entry
   * whose directory lives on the same device as the file */
  const char *name = "";
  unsigned name_len = 0;

  struct stat st;
  if (fstat(handle, &st))
    return errno;

  char pathbuf[PATH_MAX];
  FILE *mounted = nullptr;
#if defined(__linux__) || defined(__gnu_linux__)
  mounted = setmntent("/proc/mounts", "r");
#endif /* Linux */
  if (!mounted)
    mounted = setmntent("/etc/mtab", "r");
  if (mounted) {
    const struct mntent *ent;
#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) ||    \
    (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19))
    struct mntent entbuf;
    const bool should_copy = false;
    while (nullptr !=
           (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf))))
#else
    /* plain getmntent() returns static storage, so the name must be
     * copied out before the next iteration */
    const bool should_copy = true;
    while (nullptr != (ent = getmntent(mounted)))
#endif
    {
      struct stat mnt;
      if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) {
        if (should_copy) {
          name =
              strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1);
          pathbuf[name_len] = 0;
        } else {
          name = ent->mnt_fsname;
          name_len = strlen(name);
        }
        break;
      }
    }
    endmntent(mounted);
  }
#endif /* !xBSD && !Android/Bionic */
#endif

  if (name_len) {
    /* well-known network filesystems identified by type name */
    if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) ||
         strncasecmp("cifs", name, name_len) == 0 ||
         strncasecmp("ncpfs", name, name_len) == 0 ||
         strncasecmp("smbfs", name, name_len) == 0 ||
         strcasecmp("9P" /* WSL2 */, name) == 0 ||
         ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) &&
          strncasecmp("fuseblk", name, name_len) != 0)) &&
        !(flags & MDBX_EXCLUSIVE))
      return MDBX_EREMOTE;
    /* these are remote even in exclusive mode */
    if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 ||
        strcasecmp("sshfs", name) == 0)
      return MDBX_EREMOTE;
  }

#ifdef ST_LOCAL
  if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#elif defined(MNT_LOCAL)
  if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#endif /* ST/MNT_LOCAL */

#ifdef ST_EXPORTED
  if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
    return MDBX_EREMOTE;
#elif defined(MNT_EXPORTED)
  if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
    return MDBX_EREMOTE;
#endif /* ST/MNT_EXPORTED */

  /* well-known network filesystems identified by superblock magic */
  switch (type) {
  case 0xFF534D42 /* CIFS_MAGIC_NUMBER */:
  case 0x6969 /* NFS_SUPER_MAGIC */:
  case 0x564c /* NCP_SUPER_MAGIC */:
  case 0x517B /* SMB_SUPER_MAGIC */:
#if defined(__digital__) || defined(__osf__) || defined(__osf)
  case 0x0E /* Tru64 NFS */:
#endif
#ifdef ST_FST_NFS
  case ST_FST_NFS:
#endif
    if ((flags & MDBX_EXCLUSIVE) == 0)
      return MDBX_EREMOTE;
    /* fall through: remote but exclusive is acceptable */
  case 0:
  default:
    break;
  }
#endif /* Unix */

  return MDBX_SUCCESS;
}
 28432  
 28433  static int check_mmap_limit(const size_t limit) {
 28434    const bool should_check =
 28435  #if defined(__SANITIZE_ADDRESS__)
 28436        true;
 28437  #else
 28438        RUNNING_ON_VALGRIND;
 28439  #endif /* __SANITIZE_ADDRESS__ */
 28440  
 28441    if (should_check) {
 28442      intptr_t pagesize, total_ram_pages, avail_ram_pages;
 28443      int err =
 28444          mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages);
 28445      if (unlikely(err != MDBX_SUCCESS))
 28446        return err;
 28447  
 28448      const int log2page = log2n_powerof2(pagesize);
 28449      if ((limit >> (log2page + 7)) > (size_t)total_ram_pages ||
 28450          (limit >> (log2page + 6)) > (size_t)avail_ram_pages) {
 28451        ERROR("%s (%zu pages) is too large for available (%zu pages) or total "
 28452              "(%zu pages) system RAM",
 28453              "database upper size limit", limit >> log2page, avail_ram_pages,
 28454              total_ram_pages);
 28455        return MDBX_TOO_LARGE;
 28456      }
 28457    }
 28458  
 28459    return MDBX_SUCCESS;
 28460  }
 28461  
/* Maps the file referenced by map->fd into memory.
 *
 * `size` is the initially visible amount of data and `limit` is the reserved
 * upper bound for later growth (asserted: size <= limit).  `flags` carries
 * MDBX_RDONLY / MDBX_WRITEMAP / MDBX_EXCLUSIVE and sync-mode bits; `options`
 * selects MMAP_OPTION_TRUNCATE (pre-size the file) and MMAP_OPTION_SEMAPHORE.
 *
 * On Windows a pageable section of `size` bytes is created and a view of up
 * to `limit` bytes is reserved; on POSIX the whole `limit` range is mmap'ed
 * in one call.  On success fills map->{address,current,limit,filesize}
 * and returns MDBX_SUCCESS; otherwise returns a system/MDBX error code
 * with the map fields left zeroed. */
MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map,
                                 const size_t size, const size_t limit,
                                 const unsigned options) {
  assert(size <= limit);
  /* reset the descriptor so any early-return leaves it in a clean state */
  map->limit = 0;
  map->current = 0;
  map->address = nullptr;
  map->filesize = 0;
#if defined(_WIN32) || defined(_WIN64)
  map->section = NULL;
#endif /* Windows */

  /* refuse mapping on network/remote filesystems unless allowed by flags */
  int err = osal_check_fs_local(map->fd, flags);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* under ASAN/Valgrind reject mappings that dwarf the system RAM */
  err = check_mmap_limit(limit);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) {
    /* writable mapping with explicit truncation: force the file to `size` */
    err = osal_ftruncate(map->fd, size);
    if (err != MDBX_SUCCESS)
      return err;
    map->filesize = size;
#if !(defined(_WIN32) || defined(_WIN64))
    map->current = size;
#endif /* !Windows */
  } else {
    /* otherwise take the file as-is; current is clamped to `limit` */
    err = osal_filesize(map->fd, &map->filesize);
    if (err != MDBX_SUCCESS)
      return err;
#if !(defined(_WIN32) || defined(_WIN64))
    map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
#endif /* !Windows */
  }

#if defined(_WIN32) || defined(_WIN64)
  /* Windows: create a SEC_RESERVE section of `size` bytes and map a view.
   * NOTE(review): `err` holds an NTSTATUS in this branch, checked via
   * NT_SUCCESS rather than against MDBX_SUCCESS. */
  LARGE_INTEGER SectionSize;
  SectionSize.QuadPart = size;
  err = NtCreateSection(
      &map->section,
      /* DesiredAccess */
      (flags & MDBX_WRITEMAP)
          ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE |
                SECTION_MAP_WRITE
          : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE,
      /* ObjectAttributes */ NULL, /* MaximumSize (InitialSize) */ &SectionSize,
      /* SectionPageProtection */
      (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
      /* AllocationAttributes */ SEC_RESERVE, map->fd);
  if (!NT_SUCCESS(err))
    return ntstatus2errcode(err);

  /* read-only views track the section size (ViewSize = 0 maps it all);
   * Wine cannot extend a mapped view, so reserve only `size` there */
  SIZE_T ViewSize = (flags & MDBX_RDONLY)     ? 0
                    : mdbx_RunningUnderWine() ? size
                                              : limit;
  err = NtMapViewOfSection(
      map->section, GetCurrentProcess(), &map->address,
      /* ZeroBits */ 0,
      /* CommitSize */ 0,
      /* SectionOffset */ NULL, &ViewSize,
      /* InheritDisposition */ ViewUnmap,
      /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
      /* Win32Protect */
      (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);
  if (!NT_SUCCESS(err)) {
    NtClose(map->section);
    map->section = 0;
    map->address = nullptr;
    return ntstatus2errcode(err);
  }
  assert(map->address != MAP_FAILED);

  map->current = (size_t)SectionSize.QuadPart;
  map->limit = ViewSize;

#else /* Windows */

  /* Normalize optional BSD/Linux mmap flags to no-ops where unsupported */
#ifndef MAP_TRYFIXED
#define MAP_TRYFIXED 0
#endif

#ifndef MAP_HASSEMAPHORE
#define MAP_HASSEMAPHORE 0
#endif

#ifndef MAP_CONCEAL
#define MAP_CONCEAL 0
#endif

#ifndef MAP_NOSYNC
#define MAP_NOSYNC 0
#endif

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0
#endif

#ifndef MAP_NORESERVE
#define MAP_NORESERVE 0
#endif

  /* POSIX: map the entire `limit` range at once; growth is done later by
   * mresize without moving the base address when possible */
  map->address = mmap(
      NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ,
      MAP_SHARED | MAP_FILE | MAP_NORESERVE |
          (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) |
          ((options & MMAP_OPTION_SEMAPHORE) ? MAP_HASSEMAPHORE | MAP_NOSYNC
                                             : MAP_CONCEAL),
      map->fd, 0);

  if (unlikely(map->address == MAP_FAILED)) {
    map->limit = 0;
    map->current = 0;
    map->address = nullptr;
    return errno;
  }
  map->limit = limit;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
  /* the mapping must not be inherited by forked children */
  if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
    return errno;
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
  /* best-effort: transparent huge pages hurt page-level I/O patterns */
  (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */

#endif /* ! Windows */

  VALGRIND_MAKE_MEM_DEFINED(map->address, map->current);
  MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current);
  return MDBX_SUCCESS;
}
 28597  
 28598  MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) {
 28599    VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
 28600    /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
 28601     * when this memory will re-used by malloc or another mmapping.
 28602     * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203
 28603     */
 28604    MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address,
 28605                                     (map->filesize && map->filesize < map->limit)
 28606                                         ? map->filesize
 28607                                         : map->limit);
 28608  #if defined(_WIN32) || defined(_WIN64)
 28609    if (map->section)
 28610      NtClose(map->section);
 28611    NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
 28612    if (!NT_SUCCESS(rc))
 28613      ntstatus2errcode(rc);
 28614  #else
 28615    if (unlikely(munmap(map->address, map->limit)))
 28616      return errno;
 28617  #endif /* ! Windows */
 28618  
 28619    map->limit = 0;
 28620    map->current = 0;
 28621    map->address = nullptr;
 28622    return MDBX_SUCCESS;
 28623  }
 28624  
/* Resizes an existing mapping to a new `size` (visible data) and `limit`
 * (reserved address space), size <= limit.
 *
 * `flags` may include MDBX_MRESIZE_MAY_UNMAP (the whole region may be
 * unmapped and remapped) and MDBX_MRESIZE_MAY_MOVE (the base address may
 * change), plus MDBX_RDONLY / MDBX_WRITEMAP.
 *
 * Windows strategy: grow an rw-section in place via NtExtendSection when
 * possible; otherwise unmap the view, resize the file, recreate the section
 * and remap — reserving the old address range meanwhile so other threads
 * cannot occupy it.  POSIX strategy: ftruncate, then munmap the excess for
 * shrinks, and mremap / adjacent-mmap / full remap for growth.
 *
 * Returns MDBX_SUCCESS, MDBX_UNABLE_EXTEND_MAPSIZE (growth impossible
 * without a forbidden unmap/move; the previous mapping is kept), MDBX_EPERM,
 * or a system error code.  On unrecoverable errors the descriptor is
 * zeroed. */
MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
                                    size_t size, size_t limit) {
  assert(size <= limit);
#if defined(_WIN32) || defined(_WIN64)
  assert(size != map->current || limit != map->limit || size < map->filesize);

  NTSTATUS status;
  LARGE_INTEGER SectionSize;
  int err, rc = MDBX_SUCCESS;

  if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current &&
      /* workaround for Wine */ mdbx_NtExtendSection) {
    /* growth rw-section */
    SectionSize.QuadPart = size;
    status = mdbx_NtExtendSection(map->section, &SectionSize);
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);
    map->current = size;
    if (map->filesize < size)
      map->filesize = size;
    return MDBX_SUCCESS;
  }

  if (limit > map->limit) {
    err = check_mmap_limit(limit);
    if (unlikely(err != MDBX_SUCCESS))
      return err;

    /* check ability of address space for growth before unmap */
    PVOID BaseAddress = (PBYTE)map->address + map->limit;
    SIZE_T RegionSize = limit - map->limit;
    status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0,
                                     &RegionSize, MEM_RESERVE, PAGE_NOACCESS);
    if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);

    /* the probe succeeded — release it; the real reservation happens below */
    status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize,
                                 MEM_RELEASE);
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);
  }

  /* Windows unable:
   *  - shrink a mapped file;
   *  - change size of mapped view;
   *  - extend read-only mapping;
   * Therefore we should unmap/map entire section. */
  if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0)
    return MDBX_EPERM;

  /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
   * when this memory will re-used by malloc or another mmapping.
   * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203
   */
  MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit);
  status = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
  if (!NT_SUCCESS(status))
    return ntstatus2errcode(status);
  status = NtClose(map->section);
  map->section = NULL;
  PVOID ReservedAddress = NULL;
  SIZE_T ReservedSize = limit;

  /* from here on the view is gone: any failure is routed through the
   * bailout labels below, which zero the descriptor and release any
   * still-held address-space reservation */
  if (!NT_SUCCESS(status)) {
  bailout_ntstatus:
    err = ntstatus2errcode(status);
  bailout:
    map->address = NULL;
    map->current = map->limit = 0;
    if (ReservedAddress) {
      ReservedSize = 0;
      status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
                                   &ReservedSize, MEM_RELEASE);
      assert(NT_SUCCESS(status));
      (void)status;
    }
    return err;
  }

retry_file_and_section:
  /* resizing of the file may take a while,
   * therefore we reserve address space to avoid occupy it by other threads */
  ReservedAddress = map->address;
  status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0,
                                   &ReservedSize, MEM_RESERVE, PAGE_NOACCESS);
  if (!NT_SUCCESS(status)) {
    ReservedAddress = NULL;
    if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
      goto bailout_ntstatus /* no way to recovery */;

    if (flags & MDBX_MRESIZE_MAY_MOVE)
      /* the base address could be changed */
      map->address = NULL;
  }

  err = osal_filesize(map->fd, &map->filesize);
  if (err != MDBX_SUCCESS)
    goto bailout;

  if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) {
    err = osal_ftruncate(map->fd, size);
    if (err == MDBX_SUCCESS)
      map->filesize = size;
    /* ignore error, because Windows unable shrink file
     * that already mapped (by another process) */
  }

  SectionSize.QuadPart = size;
  status = NtCreateSection(
      &map->section,
      /* DesiredAccess */
      (flags & MDBX_WRITEMAP)
          ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE |
                SECTION_MAP_WRITE
          : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE,
      /* ObjectAttributes */ NULL,
      /* MaximumSize (InitialSize) */ &SectionSize,
      /* SectionPageProtection */
      (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
      /* AllocationAttributes */ SEC_RESERVE, map->fd);

  if (!NT_SUCCESS(status))
    goto bailout_ntstatus;

  if (ReservedAddress) {
    /* release reserved address space */
    ReservedSize = 0;
    status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
                                 &ReservedSize, MEM_RELEASE);
    ReservedAddress = NULL;
    if (!NT_SUCCESS(status))
      goto bailout_ntstatus;
  }

retry_mapview:;
  SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit;
  status = NtMapViewOfSection(
      map->section, GetCurrentProcess(), &map->address,
      /* ZeroBits */ 0,
      /* CommitSize */ 0,
      /* SectionOffset */ NULL, &ViewSize,
      /* InheritDisposition */ ViewUnmap,
      /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
      /* Win32Protect */
      (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);

  if (!NT_SUCCESS(status)) {
    if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 &&
        map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) {
      /* try remap at another base address */
      map->address = NULL;
      goto retry_mapview;
    }
    NtClose(map->section);
    map->section = NULL;

    if (map->address && (size != map->current || limit != map->limit)) {
      /* try remap with previously size and limit,
       * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */
      rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM;
      size = map->current;
      ReservedSize = limit = map->limit;
      goto retry_file_and_section;
    }

    /* no way to recovery */
    goto bailout_ntstatus;
  }
  assert(map->address != MAP_FAILED);

  map->current = (size_t)SectionSize.QuadPart;
  map->limit = ViewSize;

#else /* Windows */

  map->filesize = 0;
  int rc = osal_filesize(map->fd, &map->filesize);
  if (rc != MDBX_SUCCESS)
    return rc;

  if (flags & MDBX_RDONLY) {
    /* read-only: cannot change the file; just report whether the visible
     * size already matches the request */
    map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
    if (map->current != size)
      rc = (size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM;
  } else {
    if (map->filesize != size) {
      rc = osal_ftruncate(map->fd, size);
      if (rc != MDBX_SUCCESS)
        return rc;
      map->filesize = size;
    }

    if (map->current > size) {
      /* Clearing asan's bitmask for the region which released in shrinking,
       * since:
       *  - after the shrinking we will get an exception when accessing
       *    this region and (therefore) do not need the help of ASAN.
       *  - this allows us to clear the mask only within the file size
       *    when closing the mapping. */
      MDBX_ASAN_UNPOISON_MEMORY_REGION(
          (char *)map->address + size,
          ((map->current < map->limit) ? map->current : map->limit) - size);
    }
    map->current = size;
  }

  if (limit == map->limit)
    return rc;

  if (limit < map->limit) {
    /* unmap an excess at end of mapping. */
    // coverity[offset_free : FALSE]
    if (unlikely(munmap(map->dxb + limit, map->limit - limit)))
      return errno;
    map->limit = limit;
    return rc;
  }

  int err = check_mmap_limit(limit);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* growth path: try mremap, then an adjacent mmap, then a full remap */
  assert(limit > map->limit);
  uint8_t *ptr = MAP_FAILED;

#if defined(MREMAP_MAYMOVE)
  ptr = mremap(map->address, map->limit, limit,
               (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0);
  if (ptr == MAP_FAILED) {
    err = errno;
    switch (err) {
    default:
      return err;
    case EAGAIN:
    case ENOMEM:
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */:
      break;
    }
  }
#endif /* MREMAP_MAYMOVE */

  const unsigned mmap_flags =
      MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE |
      (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0);
  const unsigned mmap_prot =
      (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ;

  if (ptr == MAP_FAILED) {
    /* Try to mmap additional space beyond the end of mapping. */
    ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot,
               mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit);
    if (ptr == map->dxb + map->limit)
      ptr = map->dxb;
    else if (ptr != MAP_FAILED) {
      /* the desired address is busy, unmap unsuitable one */
      if (unlikely(munmap(ptr, limit - map->limit)))
        return errno;
      ptr = MAP_FAILED;
    } else {
      err = errno;
      switch (err) {
      default:
        return err;
      case EAGAIN:
      case ENOMEM:
        return MDBX_UNABLE_EXTEND_MAPSIZE;
      case EEXIST: /* address busy */
      case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */
        break;
      }
    }
  }

  if (ptr == MAP_FAILED) {
    /* unmap and map again whole region */
    if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) {
      /* TODO: Perhaps here it is worth to implement suspend/resume threads
       * and perform unmap/map as like for Windows. */
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    }

    if (unlikely(munmap(map->address, map->limit)))
      return errno;

    // coverity[pass_freed_arg : FALSE]
    ptr = mmap(map->address, limit, mmap_prot,
               (flags & MDBX_MRESIZE_MAY_MOVE)
                   ? mmap_flags
                   : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
                                                       : MAP_FIXED),
               map->fd, 0);
    if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
        unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
        errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
      // coverity[pass_freed_arg : FALSE]
      ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED,
                 map->fd, 0);

    if (unlikely(ptr == MAP_FAILED)) {
      /* try to restore prev mapping */
      // coverity[pass_freed_arg : FALSE]
      ptr = mmap(map->address, map->limit, mmap_prot,
                 (flags & MDBX_MRESIZE_MAY_MOVE)
                     ? mmap_flags
                     : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
                                                         : MAP_FIXED),
                 map->fd, 0);
      if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
          unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
          errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
        // coverity[pass_freed_arg : FALSE]
        ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED,
                   map->fd, 0);
      if (unlikely(ptr == MAP_FAILED)) {
        VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
        /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
         * when this memory will re-used by malloc or another mmapping.
         * See
         * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203
         */
        MDBX_ASAN_UNPOISON_MEMORY_REGION(
            map->address,
            (map->current < map->limit) ? map->current : map->limit);
        map->limit = 0;
        map->current = 0;
        map->address = nullptr;
        return errno;
      }
      /* old mapping restored at the old size: report the growth failure */
      rc = MDBX_UNABLE_EXTEND_MAPSIZE;
      limit = map->limit;
    }
  }

  assert(ptr && ptr != MAP_FAILED);
  if (map->address != ptr) {
    /* the mapping moved: transfer Valgrind/ASAN annotations to the new base */
    VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
    /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
     * when this memory will re-used by malloc or another mmapping.
     * See
     * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203
     */
    MDBX_ASAN_UNPOISON_MEMORY_REGION(
        map->address, (map->current < map->limit) ? map->current : map->limit);

    VALGRIND_MAKE_MEM_DEFINED(ptr, map->current);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current);
    map->address = ptr;
  }
  map->limit = limit;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
  /* re-apply advices: a fresh mmap/mremap region loses them */
  if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
    return errno;
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
  (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */

#endif /* POSIX / Windows */

  return rc;
}
 28992  
 28993  /*----------------------------------------------------------------------------*/
 28994  
/* Injects a small pseudo-random delay to de-synchronize contending
 * threads/processes (e.g. to break lock-step retry storms).  With `tiny`
 * the expected delay is shorter.  Roughly a third of iterations exit
 * immediately; otherwise the thread yields, and with probability about
 * one third additionally sleeps for up to a few dozen microseconds. */
__cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) {
  for (;;) {
    /* cheap entropy: TSC on x86, RtlRandomEx or rand() elsewhere */
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) ||                \
    defined(__x86_64__)
    const unsigned salt = 277u * (unsigned)__rdtsc();
#elif (defined(_WIN32) || defined(_WIN64)) && MDBX_WITHOUT_MSVC_CRT
    static ULONG state;
    const unsigned salt = (unsigned)RtlRandomEx(&state);
#else
    const unsigned salt = rand();
#endif

    /* `tiny` narrows the range, raising the chance of an immediate exit */
    const unsigned coin = salt % (tiny ? 29u : 43u);
    if (coin < 43 / 3)
      break;
#if defined(_WIN32) || defined(_WIN64)
    SwitchToThread();
    if (coin > 43 * 2 / 3)
      Sleep(1);
#else
    sched_yield();
    if (coin > 43 * 2 / 3)
      usleep(coin);
#endif
  }
}
 29021  
 29022  #if defined(_WIN32) || defined(_WIN64)
 29023  #elif defined(__APPLE__) || defined(__MACH__)
 29024  #include <mach/mach_time.h>
 29025  #elif defined(__linux__) || defined(__gnu_linux__)
 29026  __cold static clockid_t choice_monoclock(void) {
 29027    struct timespec probe;
 29028  #if defined(CLOCK_BOOTTIME)
 29029    if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0)
 29030      return CLOCK_BOOTTIME;
 29031  #elif defined(CLOCK_MONOTONIC_RAW)
 29032    if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0)
 29033      return CLOCK_MONOTONIC_RAW;
 29034  #elif defined(CLOCK_MONOTONIC_COARSE)
 29035    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0)
 29036      return CLOCK_MONOTONIC_COARSE;
 29037  #endif
 29038    return CLOCK_MONOTONIC;
 29039  }
 29040  #endif
 29041  
 29042  /*----------------------------------------------------------------------------*/
 29043  
 29044  #if defined(_WIN32) || defined(_WIN64)
 29045  static LARGE_INTEGER performance_frequency;
 29046  #elif defined(__APPLE__) || defined(__MACH__)
 29047  static uint64_t ratio_16dot16_to_monotine;
 29048  #endif
 29049  
/* Converts a duration given in 16.16 fixed-point seconds into the platform's
 * monotonic-clock units: QueryPerformanceCounter ticks on Windows,
 * mach_absolute_time() units on Darwin, nanoseconds elsewhere.  The platform
 * ratio is obtained lazily and cached in a file-scope static.  A non-zero
 * input never yields zero: results rounded down to zero are bumped to 1 so a
 * tiny timeout cannot degenerate into "no timeout". */
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) {
#if defined(_WIN32) || defined(_WIN64)
  if (unlikely(performance_frequency.QuadPart == 0))
    QueryPerformanceFrequency(&performance_frequency);
  const uint64_t ratio = performance_frequency.QuadPart;
#elif defined(__APPLE__) || defined(__MACH__)
  if (unlikely(ratio_16dot16_to_monotine == 0)) {
    mach_timebase_info_data_t ti;
    mach_timebase_info(&ti);
    /* mach units -> ns is numer/denom, so ns -> mach units is denom/numer */
    ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer;
  }
  const uint64_t ratio = ratio_16dot16_to_monotine;
#else
  const uint64_t ratio = UINT64_C(1000000000);
#endif
  /* scale and drop the 16 fractional bits; +32768 rounds to nearest */
  const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16;
  return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1;
}
 29068  
/* Inverse of osal_16dot16_to_monotime(): converts monotonic-clock units into
 * 16.16 fixed-point seconds, saturating at UINT32_MAX and never returning 0
 * for a non-zero input.  The first call initializes the saturation threshold
 * via osal_16dot16_to_monotime(), which also fills the platform ratio
 * statics used below. */
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) {
  /* lazily-computed saturation threshold (monotime for UINT32_MAX-1) */
  static uint64_t limit;
  if (unlikely(monotime > limit)) {
    if (likely(limit != 0))
      return UINT32_MAX;
    limit = osal_16dot16_to_monotime(UINT32_MAX - 1);
    if (unlikely(monotime > limit))
      return UINT32_MAX;
  }
  const uint32_t ret =
#if defined(_WIN32) || defined(_WIN64)
      (uint32_t)((monotime << 16) / performance_frequency.QuadPart);
#elif defined(__APPLE__) || defined(__MACH__)
      (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine);
#else
      /* 128/1953125 == 2^16/10^9: nanoseconds to 16.16 seconds */
      (uint32_t)(monotime * 128 / 1953125);
#endif
  if (likely(ret > 0))
    return ret;
  /* a non-zero duration must not collapse to zero */
  return monotime > 0 /* fix underflow */;
}
 29090  
/* Returns the current monotonic time in platform-specific units
 * (QPC ticks on Windows, mach units on Darwin, nanoseconds elsewhere;
 * matching the units of osal_16dot16_to_monotime()).  On POSIX a
 * clock_gettime() failure yields 0 rather than aborting. */
MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) {
#if defined(_WIN32) || defined(_WIN64)
  LARGE_INTEGER counter;
  counter.QuadPart = 0;
  QueryPerformanceCounter(&counter);
  return counter.QuadPart;
#elif defined(__APPLE__) || defined(__MACH__)
  return mach_absolute_time();
#else

#if defined(__linux__) || defined(__gnu_linux__)
  /* pick the clock id once; benign race — all threads compute the same id */
  static clockid_t posix_clockid = -1;
  if (unlikely(posix_clockid < 0))
    posix_clockid = choice_monoclock();
#elif defined(CLOCK_MONOTONIC)
#define posix_clockid CLOCK_MONOTONIC
#else
#define posix_clockid CLOCK_REALTIME
#endif

  struct timespec ts;
  if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) {
    ts.tv_nsec = 0;
    ts.tv_sec = 0;
  }
  return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;
#endif
}
 29119  
 29120  /*----------------------------------------------------------------------------*/
 29121  
 29122  static void bootid_shake(bin128_t *p) {
 29123    /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */
 29124    const uint32_t e = p->a - (p->b << 23 | p->b >> 9);
 29125    p->a = p->b ^ (p->c << 16 | p->c >> 16);
 29126    p->b = p->c + (p->d << 11 | p->d >> 21);
 29127    p->c = p->d + e;
 29128    p->d = e + p->a;
 29129  }
 29130  
 29131  __cold static void bootid_collect(bin128_t *p, const void *s, size_t n) {
 29132    p->y += UINT64_C(64526882297375213);
 29133    bootid_shake(p);
 29134    for (size_t i = 0; i < n; ++i) {
 29135      bootid_shake(p);
 29136      p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i];
 29137      bootid_shake(p);
 29138      p->y += 14621231;
 29139    }
 29140    bootid_shake(p);
 29141  
 29142    /* minor non-linear tomfoolery */
 29143    const unsigned z = p->x % 61;
 29144    p->y = p->y << z | p->y >> (64 - z);
 29145    bootid_shake(p);
 29146    bootid_shake(p);
 29147    const unsigned q = p->x % 59;
 29148    p->y = p->y << q | p->y >> (64 - q);
 29149    bootid_shake(p);
 29150    bootid_shake(p);
 29151    bootid_shake(p);
 29152  }
 29153  
 29154  #if defined(_WIN32) || defined(_WIN64)
 29155  
 29156  static uint64_t windows_systemtime_ms() {
 29157    FILETIME ft;
 29158    GetSystemTimeAsFileTime(&ft);
 29159    return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul;
 29160  }
 29161  
/* Estimates the system boot timestamp (milliseconds since the FILETIME
 * epoch) as the difference between the wall clock and the uptime counter.
 * Because the two clocks cannot be read atomically, a sample is accepted
 * only when both clocks are observed unchanged across consecutive reads and
 * the same difference is confirmed more than four times.  Gives up and
 * returns 0 after roughly 1000000/42 ms of wall-clock progress. */
static uint64_t windows_bootime(void) {
  unsigned confirmed = 0;
  uint64_t boottime = 0;
  uint64_t up0 = mdbx_GetTickCount64();
  uint64_t st0 = windows_systemtime_ms();
  for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) {
    YieldProcessor();
    const uint64_t up1 = mdbx_GetTickCount64();
    const uint64_t st1 = windows_systemtime_ms();
    /* only trust a sample when neither clock ticked between readings */
    if (st1 > fuse && st1 == st0 && up1 == up0) {
      uint64_t diff = st1 - up1;
      if (boottime == diff) {
        if (++confirmed > 4)
          return boottime;
      } else {
        /* difference changed (a clock ticked mid-sample): start over */
        confirmed = 0;
        boottime = diff;
      }
      fuse = st1;
      Sleep(1);
    }
    st0 = st1;
    up0 = up1;
  }
  return 0;
}
 29188  
/* Reads a registry value, portably across Windows versions.
 *
 * When the dynamically-resolved RegGetValueA is unavailable (Windows
 * 2000/XP), falls back to RegOpenKeyA + RegQueryValueExA.  Otherwise the
 * lookup is retried under both WOW64 registry views (RRF_SUBKEY_WOW6464KEY,
 * then RRF_SUBKEY_WOW6432KEY — numeric values used since old SDKs lack the
 * names) when the value is not found in the default view. */
static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue,
                                PVOID pvData, LPDWORD pcbData) {
  LSTATUS rc;
  if (!mdbx_RegGetValueA) {
    /* an old Windows 2000/XP */
    HKEY hSubKey;
    rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey);
    if (rc == ERROR_SUCCESS) {
      rc = RegQueryValueExA(hSubKey, lpValue, NULL, NULL, pvData, pcbData);
      RegCloseKey(hSubKey);
    }
    return rc;
  }

  rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, NULL, pvData,
                         pcbData);
  if (rc != ERROR_FILE_NOT_FOUND)
    return rc;

  rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue,
                         RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */,
                         NULL, pvData, pcbData);
  if (rc != ERROR_FILE_NOT_FOUND)
    return rc;
  return mdbx_RegGetValueA(hKey, lpSubKey, lpValue,
                           RRF_RT_ANY | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */,
                           NULL, pvData, pcbData);
}
 29217  #endif
 29218  
/* Absorbs a boot-id blob (`p`, `n` bytes) into the accumulator `s`.
 *
 * First tries to interpret the blob as a textual UUID: every hex digit
 * found is shifted into the 128-bit state; success requires more than 126
 * significant bits collected (i.e. a full UUID's worth).  Failing that, a
 * blob of at least 16 bytes is folded in as binary and also counts as
 * success.  Shorter non-empty blobs still contribute entropy via
 * bootid_collect() but return false. */
__cold MDBX_MAYBE_UNUSED static bool
bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) {
  if (n > 31) {
    unsigned bits = 0;
    for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ {
      uint8_t c = ((const uint8_t *)p)[i];
      if (c >= '0' && c <= '9')
        c -= '0';
      else if (c >= 'a' && c <= 'f')
        c -= 'a' - 10;
      else if (c >= 'A' && c <= 'F')
        c -= 'A' - 10;
      else
        continue; /* separators and other characters are skipped */
      assert(c <= 15);
      /* fold the nibble shifted out of the top back into the new one,
       * then shift the 128-bit state (y:x) left by 4 and insert it */
      c ^= s->y >> 60;
      s->y = s->y << 4 | s->x >> 60;
      s->x = s->x << 4 | c;
      bits += 4;
    }
    if (bits > 42 * 3)
      /* UUID parsed successfully */
      return true;
  }

  if (n > 15) /* is enough handle it as a binary? */ {
    if (n == sizeof(bin128_t)) {
      /* exactly 128 bits: add it in directly (via an aligned copy) */
      bin128_t aligned;
      memcpy(&aligned, p, sizeof(bin128_t));
      s->x += aligned.x;
      s->y += aligned.y;
    } else
      bootid_collect(s, p, n);
    return true;
  }

  if (n)
    bootid_collect(s, p, n);
  return false;
}
 29259  
 29260  __cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) {
 29261    bin128_t bin = {{0, 0}};
 29262    bool got_machineid = false, got_boottime = false, got_bootseq = false;
 29263  
 29264  #if defined(__linux__) || defined(__gnu_linux__)
 29265    {
 29266      const int fd =
 29267          open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW);
 29268      if (fd != -1) {
 29269        struct statfs fs;
 29270        char buf[42];
 29271        const ssize_t len =
 29272            (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0)
 29273                ? read(fd, buf, sizeof(buf))
 29274                : -1;
 29275        const int err = close(fd);
 29276        assert(err == 0);
 29277        (void)err;
 29278        if (len > 0 && bootid_parse_uuid(&bin, buf, len))
 29279          return bin;
 29280      }
 29281    }
 29282  #endif /* Linux */
 29283  
 29284  #if defined(__APPLE__) || defined(__MACH__)
 29285    {
 29286      char buf[42];
 29287      size_t len = sizeof(buf);
 29288      if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) &&
 29289          bootid_parse_uuid(&bin, buf, len))
 29290        return bin;
 29291  
 29292  #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) &&                                \
 29293      __MAC_OS_X_VERSION_MIN_REQUIRED > 1050
 29294      uuid_t uuid;
 29295      struct timespec wait = {0, 1000000000u / 42};
 29296      if (!gethostuuid(uuid, &wait) &&
 29297          bootid_parse_uuid(&bin, uuid, sizeof(uuid)))
 29298        got_machineid = true;
 29299  #endif /* > 10.5 */
 29300  
 29301      struct timeval boottime;
 29302      len = sizeof(boottime);
 29303      if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) &&
 29304          len == sizeof(boottime) && boottime.tv_sec)
 29305        got_boottime = true;
 29306    }
 29307  #endif /* Apple/Darwin */
 29308  
 29309  #if defined(_WIN32) || defined(_WIN64)
 29310    {
 29311      union buf {
 29312        DWORD BootId;
 29313        DWORD BaseTime;
 29314        SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo;
 29315        struct {
 29316          LARGE_INTEGER BootTime;
 29317          LARGE_INTEGER CurrentTime;
 29318          LARGE_INTEGER TimeZoneBias;
 29319          ULONG TimeZoneId;
 29320          ULONG Reserved;
 29321          ULONGLONG BootTimeBias;
 29322          ULONGLONG SleepTimeBias;
 29323        } SysTimeOfDayInfoHacked;
 29324        wchar_t MachineGuid[42];
 29325        char DigitalProductId[248];
 29326      } buf;
 29327  
 29328      static const char HKLM_MicrosoftCryptography[] =
 29329          "SOFTWARE\\Microsoft\\Cryptography";
 29330      DWORD len = sizeof(buf);
 29331      /* Windows is madness and must die */
 29332      if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography,
 29333                           "MachineGuid", &buf.MachineGuid,
 29334                           &len) == ERROR_SUCCESS &&
 29335          len < sizeof(buf))
 29336        got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len);
 29337  
 29338      if (!got_machineid) {
 29339        /* again, Windows is madness */
 29340        static const char HKLM_WindowsNT[] =
 29341            "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion";
 29342        static const char HKLM_WindowsNT_DPK[] =
 29343            "SOFTWARE\\Microsoft\\Windows "
 29344            "NT\\CurrentVersion\\DefaultProductKey";
 29345        static const char HKLM_WindowsNT_DPK2[] =
 29346            "SOFTWARE\\Microsoft\\Windows "
 29347            "NT\\CurrentVersion\\DefaultProductKey2";
 29348  
 29349        len = sizeof(buf);
 29350        if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT,
 29351                             "DigitalProductId", &buf.DigitalProductId,
 29352                             &len) == ERROR_SUCCESS &&
 29353            len > 42 && len < sizeof(buf)) {
 29354          bootid_collect(&bin, &buf.DigitalProductId, len);
 29355          got_machineid = true;
 29356        }
 29357        len = sizeof(buf);
 29358        if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK,
 29359                             "DigitalProductId", &buf.DigitalProductId,
 29360                             &len) == ERROR_SUCCESS &&
 29361            len > 42 && len < sizeof(buf)) {
 29362          bootid_collect(&bin, &buf.DigitalProductId, len);
 29363          got_machineid = true;
 29364        }
 29365        len = sizeof(buf);
 29366        if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2,
 29367                             "DigitalProductId", &buf.DigitalProductId,
 29368                             &len) == ERROR_SUCCESS &&
 29369            len > 42 && len < sizeof(buf)) {
 29370          bootid_collect(&bin, &buf.DigitalProductId, len);
 29371          got_machineid = true;
 29372        }
 29373      }
 29374  
 29375      static const char HKLM_PrefetcherParams[] =
 29376          "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory "
 29377          "Management\\PrefetchParameters";
 29378      len = sizeof(buf);
 29379      if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId",
 29380                           &buf.BootId, &len) == ERROR_SUCCESS &&
 29381          len > 1 && len < sizeof(buf)) {
 29382        bootid_collect(&bin, &buf.BootId, len);
 29383        got_bootseq = true;
 29384      }
 29385  
 29386      len = sizeof(buf);
 29387      if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime",
 29388                           &buf.BaseTime, &len) == ERROR_SUCCESS &&
 29389          len >= sizeof(buf.BaseTime) && buf.BaseTime) {
 29390        bootid_collect(&bin, &buf.BaseTime, len);
 29391        got_boottime = true;
 29392      }
 29393  
 29394      /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */
 29395      NTSTATUS status = NtQuerySystemInformation(
 29396          0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo,
 29397          sizeof(buf.SysTimeOfDayInfo), &len);
 29398      if (NT_SUCCESS(status) &&
 29399          len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) +
 29400                     sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) &&
 29401          buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) {
 29402        const uint64_t UnbiasedBootTime =
 29403            buf.SysTimeOfDayInfoHacked.BootTime.QuadPart -
 29404            buf.SysTimeOfDayInfoHacked.BootTimeBias;
 29405        if (UnbiasedBootTime) {
 29406          bootid_collect(&bin, &UnbiasedBootTime, sizeof(UnbiasedBootTime));
 29407          got_boottime = true;
 29408        }
 29409      }
 29410  
 29411      if (!got_boottime) {
 29412        uint64_t boottime = windows_bootime();
 29413        if (boottime) {
 29414          bootid_collect(&bin, &boottime, sizeof(boottime));
 29415          got_boottime = true;
 29416        }
 29417      }
 29418    }
 29419  #endif /* Windows */
 29420  
 29421  #if defined(CTL_HW) && defined(HW_UUID)
 29422    if (!got_machineid) {
 29423      static const int mib[] = {CTL_HW, HW_UUID};
 29424      char buf[42];
 29425      size_t len = sizeof(buf);
 29426      if (sysctl(
 29427  #ifdef SYSCTL_LEGACY_NONCONST_MIB
 29428              (int *)
 29429  #endif
 29430                  mib,
 29431              ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0)
 29432        got_machineid = bootid_parse_uuid(&bin, buf, len);
 29433    }
 29434  #endif /* CTL_HW && HW_UUID */
 29435  
 29436  #if defined(CTL_KERN) && defined(KERN_HOSTUUID)
 29437    if (!got_machineid) {
 29438      static const int mib[] = {CTL_KERN, KERN_HOSTUUID};
 29439      char buf[42];
 29440      size_t len = sizeof(buf);
 29441      if (sysctl(
 29442  #ifdef SYSCTL_LEGACY_NONCONST_MIB
 29443              (int *)
 29444  #endif
 29445                  mib,
 29446              ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0)
 29447        got_machineid = bootid_parse_uuid(&bin, buf, len);
 29448    }
 29449  #endif /* CTL_KERN && KERN_HOSTUUID */
 29450  
 29451  #if defined(__NetBSD__)
 29452    if (!got_machineid) {
 29453      char buf[42];
 29454      size_t len = sizeof(buf);
 29455      if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0)
 29456        got_machineid = bootid_parse_uuid(&bin, buf, len);
 29457    }
 29458  #endif /* __NetBSD__ */
 29459  
 29460  #if _XOPEN_SOURCE_EXTENDED
 29461    if (!got_machineid) {
 29462      const int hostid = gethostid();
 29463      if (hostid > 0) {
 29464        bootid_collect(&bin, &hostid, sizeof(hostid));
 29465        got_machineid = true;
 29466      }
 29467    }
 29468  #endif /* _XOPEN_SOURCE_EXTENDED */
 29469  
 29470    if (!got_machineid) {
 29471    lack:
 29472      bin.x = bin.y = 0;
 29473      return bin;
 29474    }
 29475  
 29476    /*--------------------------------------------------------------------------*/
 29477  
 29478  #if defined(CTL_KERN) && defined(KERN_BOOTTIME)
 29479    if (!got_boottime) {
 29480      static const int mib[] = {CTL_KERN, KERN_BOOTTIME};
 29481      struct timeval boottime;
 29482      size_t len = sizeof(boottime);
 29483      if (sysctl(
 29484  #ifdef SYSCTL_LEGACY_NONCONST_MIB
 29485              (int *)
 29486  #endif
 29487                  mib,
 29488              ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 &&
 29489          len == sizeof(boottime) && boottime.tv_sec) {
 29490        bootid_collect(&bin, &boottime, len);
 29491        got_boottime = true;
 29492      }
 29493    }
 29494  #endif /* CTL_KERN && KERN_BOOTTIME */
 29495  
 29496  #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
 29497    if (!got_boottime) {
 29498      kstat_ctl_t *kc = kstat_open();
 29499      if (kc) {
 29500        kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc");
 29501        if (kp && kstat_read(kc, kp, 0) != -1) {
 29502          kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time");
 29503          if (kn) {
 29504            switch (kn->data_type) {
 29505            case KSTAT_DATA_INT32:
 29506            case KSTAT_DATA_UINT32:
 29507              bootid_collect(&bin, &kn->value, sizeof(int32_t));
 29508              got_boottime = true;
 29509            case KSTAT_DATA_INT64:
 29510            case KSTAT_DATA_UINT64:
 29511              bootid_collect(&bin, &kn->value, sizeof(int64_t));
 29512              got_boottime = true;
 29513            }
 29514          }
 29515        }
 29516        kstat_close(kc);
 29517      }
 29518    }
 29519  #endif /* SunOS / Solaris */
 29520  
 29521  #if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME)
 29522    if (!got_boottime) {
 29523      setutxent();
 29524      const struct utmpx id = {.ut_type = BOOT_TIME};
 29525      const struct utmpx *entry = getutxid(&id);
 29526      if (entry) {
 29527        bootid_collect(&bin, entry, sizeof(*entry));
 29528        got_boottime = true;
 29529        while (unlikely((entry = getutxid(&id)) != nullptr)) {
 29530          /* have multiple reboot records, assuming we can distinguish next
 29531           * bootsession even if RTC is wrong or absent */
 29532          bootid_collect(&bin, entry, sizeof(*entry));
 29533          got_bootseq = true;
 29534        }
 29535      }
 29536      endutxent();
 29537    }
 29538  #endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */
 29539  
 29540    if (!got_bootseq) {
 29541      if (!got_boottime || !MDBX_TRUST_RTC)
 29542        goto lack;
 29543  
 29544  #if defined(_WIN32) || defined(_WIN64)
 29545      FILETIME now;
 29546      GetSystemTimeAsFileTime(&now);
 29547      if (0x1CCCCCC > now.dwHighDateTime)
 29548  #else
 29549      struct timespec mono, real;
 29550      if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
 29551          clock_gettime(CLOCK_REALTIME, &real) ||
 29552          /* wrong time, RTC is mad or absent */
 29553          1555555555l > real.tv_sec ||
 29554          /* seems no adjustment by RTC/NTP, i.e. a fake time */
 29555          real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec ||
 29556          (real.tv_sec - mono.tv_sec) % 900u == 0)
 29557  #endif
 29558        goto lack;
 29559    }
 29560  
 29561    return bin;
 29562  }
 29563  
 29564  __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
 29565                                 intptr_t *avail_pages) {
 29566    if (!page_size && !total_pages && !avail_pages)
 29567      return MDBX_EINVAL;
 29568    if (total_pages)
 29569      *total_pages = -1;
 29570    if (avail_pages)
 29571      *avail_pages = -1;
 29572  
 29573    const intptr_t pagesize = osal_syspagesize();
 29574    if (page_size)
 29575      *page_size = pagesize;
 29576    if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize)))
 29577      return MDBX_INCOMPATIBLE;
 29578  
 29579    MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize);
 29580    assert(pagesize == (INT64_C(1) << log2page));
 29581    (void)log2page;
 29582  
 29583  #if defined(_WIN32) || defined(_WIN64)
 29584    MEMORYSTATUSEX info;
 29585    memset(&info, 0, sizeof(info));
 29586    info.dwLength = sizeof(info);
 29587    if (!GlobalMemoryStatusEx(&info))
 29588      return (int)GetLastError();
 29589  #endif
 29590  
 29591    if (total_pages) {
 29592  #if defined(_WIN32) || defined(_WIN64)
 29593      const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page);
 29594  #elif defined(_SC_PHYS_PAGES)
 29595      const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES);
 29596      if (total_ram_pages == -1)
 29597        return errno;
 29598  #elif defined(_SC_AIX_REALMEM)
 29599      const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM);
 29600      if (total_ram_Kb == -1)
 29601        return errno;
 29602      const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page;
 29603  #elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) ||   \
 29604      defined(HW_PHYSMEM)
 29605      size_t ram, len = sizeof(ram);
 29606      static const int mib[] = {
 29607        CTL_HW,
 29608  #if defined(HW_USERMEM)
 29609        HW_USERMEM
 29610  #elif defined(HW_PHYSMEM64)
 29611        HW_PHYSMEM64
 29612  #elif defined(HW_MEMSIZE)
 29613        HW_MEMSIZE
 29614  #else
 29615        HW_PHYSMEM
 29616  #endif
 29617      };
 29618      if (sysctl(
 29619  #ifdef SYSCTL_LEGACY_NONCONST_MIB
 29620              (int *)
 29621  #endif
 29622                  mib,
 29623              ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0)
 29624        return errno;
 29625      if (len != sizeof(ram))
 29626        return MDBX_ENOSYS;
 29627      const intptr_t total_ram_pages = (intptr_t)(ram >> log2page);
 29628  #else
 29629  #error "FIXME: Get User-accessible or physical RAM"
 29630  #endif
 29631      *total_pages = total_ram_pages;
 29632      if (total_ram_pages < 1)
 29633        return MDBX_ENOSYS;
 29634    }
 29635  
 29636    if (avail_pages) {
 29637  #if defined(_WIN32) || defined(_WIN64)
 29638      const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page);
 29639  #elif defined(_SC_AVPHYS_PAGES)
 29640      const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES);
 29641      if (avail_ram_pages == -1)
 29642        return errno;
 29643  #elif defined(__MACH__)
 29644      mach_msg_type_number_t count = HOST_VM_INFO_COUNT;
 29645      vm_statistics_data_t vmstat;
 29646      mach_port_t mport = mach_host_self();
 29647      kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO,
 29648                                           (host_info_t)&vmstat, &count);
 29649      mach_port_deallocate(mach_task_self(), mport);
 29650      if (unlikely(kerr != KERN_SUCCESS))
 29651        return MDBX_ENOSYS;
 29652      const intptr_t avail_ram_pages = vmstat.free_count;
 29653  #elif defined(VM_TOTAL) || defined(VM_METER)
 29654      struct vmtotal info;
 29655      size_t len = sizeof(info);
 29656      static const int mib[] = {
 29657        CTL_VM,
 29658  #if defined(VM_TOTAL)
 29659        VM_TOTAL
 29660  #elif defined(VM_METER)
 29661        VM_METER
 29662  #endif
 29663      };
 29664      if (sysctl(
 29665  #ifdef SYSCTL_LEGACY_NONCONST_MIB
 29666              (int *)
 29667  #endif
 29668                  mib,
 29669              ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0)
 29670        return errno;
 29671      if (len != sizeof(info))
 29672        return MDBX_ENOSYS;
 29673      const intptr_t avail_ram_pages = info.t_free;
 29674  #else
 29675  #error "FIXME: Get Available RAM"
 29676  #endif
 29677      *avail_pages = avail_ram_pages;
 29678      if (avail_ram_pages < 1)
 29679        return MDBX_ENOSYS;
 29680    }
 29681  
 29682    return MDBX_SUCCESS;
 29683  }
 29684  /* This is CMake-template for libmdbx's version.c
 29685   ******************************************************************************/
 29686  
 29687  
#if MDBX_VERSION_MAJOR != 0 ||                             \
    MDBX_VERSION_MINOR != 12
#error "API version mismatch! Had `git fetch --tags` done?"
#endif

/* Source-tree fingerprint (see MDBX_BUILD_SOURCERY above), used to verify
 * that the headers and the library binary come from the same revision. */
static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);

/* Exported version info: major.minor.release.revision plus git details.
 * Marked "used"/"externally visible" so the linker keeps the symbol even
 * when nothing in-process references it. */
__dll_export
#ifdef __attribute_used__
    __attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
    __attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
        __attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
    __has_attribute(__externally_visible__)
    __attribute__((__externally_visible__))
#endif
    const struct MDBX_version_info mdbx_version = {
        0,
        12,
        1,
        0,
        {"2022-08-24T16:24:22+03:00", "0803c79d2d94f2d1496166a9a86bd47da18c7eed", "b36a07a512c1412d5753219aa8fc66cab75a012a",
         "v0.12.1-0-gb36a07a5"},
        sourcery};

/* Exported anchor keeping the sourcery string alive in the binary. */
__dll_export
#ifdef __attribute_used__
    __attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
    __attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
        __attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
    __has_attribute(__externally_visible__)
    __attribute__((__externally_visible__))
#endif
    const char *const mdbx_sourcery_anchor = sourcery;
 29729  /*
 29730   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 29731   * and other libmdbx authors: please see AUTHORS file.
 29732   * All rights reserved.
 29733   *
 29734   * Redistribution and use in source and binary forms, with or without
 29735   * modification, are permitted only as authorized by the OpenLDAP
 29736   * Public License.
 29737   *
 29738   * A copy of this license is available in the file LICENSE in the
 29739   * top-level directory of the distribution or, alternatively, at
 29740   * <http://www.OpenLDAP.org/license.html>.
 29741   */
 29742  
 29743  #if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */
 29744  
 29745  /* PREAMBLE FOR WINDOWS:
 29746   *
 29747   * We are not concerned for performance here.
 29748   * If you are running Windows a performance could NOT be the goal.
 29749   * Otherwise please use Linux. */
 29750  
 29751  
 29752  static void mdbx_winnt_import(void);
 29753  
#if MDBX_BUILD_SHARED_LIBRARY
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
 *
 * Define dll's entry point only for Release build when NDEBUG is defined and
 * MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will
 * automatically use DllMainCRTStartup() from CRT library, which also
 * automatically call DllMain() from our mdbx.dll */
#pragma comment(linker, "/ENTRY:DllMain")
#endif /* MDBX_WITHOUT_MSVC_CRT */

/* Module life-cycle handler: routes DLL (or TLS-callback, for static builds)
 * notifications to the library's global/thread constructors and destructors. */
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#else
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
    void NTAPI
    mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
  (void)reserved;
  switch (reason) {
  case DLL_PROCESS_ATTACH:
    /* resolve dynamically-imported WinNT entry points, then run global init */
    mdbx_winnt_import();
    global_ctor();
    break;
  case DLL_PROCESS_DETACH:
    global_dtor();
    break;

  case DLL_THREAD_ATTACH:
    break;
  case DLL_THREAD_DETACH:
    /* release this thread's per-thread resources (reader slot, etc.) */
    thread_dtor(module);
    break;
  }
#if MDBX_BUILD_SHARED_LIBRARY
  return TRUE;
#endif
}
 29794  
#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
/* Static-library build: register mdbx_module_handler as a TLS callback so
 * the loader delivers process/thread attach/detach notifications (the same
 * events a DLL receives via DllMain). The callback pointer must be placed
 * into the ".CRT$XLB" section referenced by the image's TLS directory. */
#if defined(_MSC_VER)
#  pragma const_seg(push)
#  pragma data_seg(push)

#  ifndef _M_IX86
     /* kick a linker to create the TLS directory if not already done */
#    pragma comment(linker, "/INCLUDE:_tls_used")
     /* Force some symbol references. */
#    pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
     /* specific const-segment for WIN64 */
#    pragma const_seg(".CRT$XLB")
     const
#  else
     /* kick a linker to create the TLS directory if not already done */
#    pragma comment(linker, "/INCLUDE:__tls_used")
     /* Force some symbol references. */
#    pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
     /* specific data-segment for WIN32 */
#    pragma data_seg(".CRT$XLB")
#  endif

   __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
#  pragma data_seg(pop)
#  pragma const_seg(pop)

#elif defined(__GNUC__)
#  ifndef _M_IX86
     const
#  endif
   PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
#else
#  error FIXME
#endif
#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
 29830  
 29831  /*----------------------------------------------------------------------------*/
 29832  
 29833  #define LCK_SHARED 0
 29834  #define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK
 29835  #define LCK_WAITFOR 0
 29836  #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
 29837  
 29838  static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
 29839                             size_t bytes) {
 29840    OVERLAPPED ov;
 29841    ov.hEvent = 0;
 29842    ov.Offset = (DWORD)offset;
 29843    ov.OffsetHigh = HIGH_DWORD(offset);
 29844    return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
 29845  }
 29846  
 29847  static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
 29848                               size_t bytes) {
 29849    return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
 29850                      HIGH_DWORD(bytes));
 29851  }
 29852  
 29853  /*----------------------------------------------------------------------------*/
/* global `write` lock for write-txn processing,
 * exclusive locking both meta-pages */
 29856  
 29857  #define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
 29858  #define LCK_META_OFFSET 0
 29859  #define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
 29860  #define LCK_BODY_OFFSET LCK_META_LEN
 29861  #define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
 29862  #define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
 29863  #define LCK_WHOLE 0, LCK_MAXLEN
 29864  
/* Takes the global write-transaction lock: a process-local critical section
 * (serializing writers within this process) plus an exclusive byte-range
 * lock on the DB body (serializing writers across processes).
 * With `dontwait` returns MDBX_BUSY instead of blocking. */
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
  if (dontwait) {
    if (!TryEnterCriticalSection(&env->me_windowsbug_lock))
      return MDBX_BUSY;
  } else {
    /* EnterCriticalSection may raise EXCEPTION_POSSIBLE_DEADLOCK; convert
     * it into an error code instead of terminating the process. */
    __try {
      EnterCriticalSection(&env->me_windowsbug_lock);
    }
    __except ((GetExceptionCode() ==
                 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
                    ? EXCEPTION_EXECUTE_HANDLER
                    : EXCEPTION_CONTINUE_SEARCH) {
      return ERROR_POSSIBLE_DEADLOCK;
    }
  }

  /* MDBX_EXCLUSIVE means the whole file is already locked exclusively,
   * so no per-transaction file lock is needed */
  if ((env->me_flags & MDBX_EXCLUSIVE) ||
      flock(env->me_lazy_fd,
            dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
                     : (LCK_EXCLUSIVE | LCK_WAITFOR),
            LCK_BODY))
    return MDBX_SUCCESS;
  /* file-lock failed: undo the critical section before reporting */
  int rc = (int)GetLastError();
  LeaveCriticalSection(&env->me_windowsbug_lock);
  return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
}
 29891  
 29892  void mdbx_txn_unlock(MDBX_env *env) {
 29893    int rc = (env->me_flags & MDBX_EXCLUSIVE)
 29894                 ? TRUE
 29895                 : funlock(env->me_lazy_fd, LCK_BODY);
 29896    LeaveCriticalSection(&env->me_windowsbug_lock);
 29897    if (!rc)
 29898      mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
 29899  }
 29900  
 29901  /*----------------------------------------------------------------------------*/
 29902  /* global `read` lock for readers registration,
 29903   * exclusive locking `mti_numreaders` (second) cacheline */
 29904  
 29905  #define LCK_LO_OFFSET 0
 29906  #define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
 29907  #define LCK_UP_OFFSET LCK_LO_LEN
 29908  #define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
 29909  #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
 29910  #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
 29911  
 29912  MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
 29913    osal_srwlock_AcquireShared(&env->me_remap_guard);
 29914    if (env->me_lfd == INVALID_HANDLE_VALUE)
 29915      return MDBX_SUCCESS; /* readonly database in readonly filesystem */
 29916  
 29917    /* transition from S-? (used) to S-E (locked),
 29918     * e.g. exclusive lock upper-part */
 29919    if ((env->me_flags & MDBX_EXCLUSIVE) ||
 29920        flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER))
 29921      return MDBX_SUCCESS;
 29922  
 29923    int rc = (int)GetLastError();
 29924    osal_srwlock_ReleaseShared(&env->me_remap_guard);
 29925    return rc;
 29926  }
 29927  
 29928  MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
 29929    if (env->me_lfd != INVALID_HANDLE_VALUE) {
 29930      /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
 29931      if ((env->me_flags & MDBX_EXCLUSIVE) == 0 &&
 29932          !funlock(env->me_lfd, LCK_UPPER))
 29933        mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
 29934    }
 29935    osal_srwlock_ReleaseShared(&env->me_remap_guard);
 29936  }
 29937  
 29938  MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
 29939    return flock(fd,
 29940                 wait ? LCK_EXCLUSIVE | LCK_WAITFOR
 29941                      : LCK_EXCLUSIVE | LCK_DONTWAIT,
 29942                 0, LCK_MAXLEN)
 29943               ? MDBX_SUCCESS
 29944               : (int)GetLastError();
 29945  }
 29946  
/* Opens and suspends the thread with the given id, appending its handle to
 * the dynamically-growing `*array` so it can be resumed later.
 * Returns MDBX_SUCCESS, MDBX_ENOMEM, or a Windows error code. */
static int suspend_and_append(mdbx_handle_array_t **array,
                              const DWORD ThreadId) {
  const unsigned limit = (*array)->limit;
  if ((*array)->count == limit) {
    /* Grow the handle array by doubling. The initial array lives on the
     * caller's stack, so it must be copied into the fresh heap block
     * rather than realloc'ed. */
    void *ptr = osal_realloc(
        (limit > ARRAY_LENGTH((*array)->handles))
            ? *array
            : /* don't free initial array on the stack */ NULL,
        sizeof(mdbx_handle_array_t) +
            sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles)));
    if (!ptr)
      return MDBX_ENOMEM;
    if (limit == ARRAY_LENGTH((*array)->handles))
      memcpy(ptr, *array, sizeof(mdbx_handle_array_t));
    *array = (mdbx_handle_array_t *)ptr;
    (*array)->limit = limit * 2;
  }

  HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION,
                              FALSE, ThreadId);
  if (hThread == NULL)
    return (int)GetLastError();

  if (SuspendThread(hThread) == (DWORD)-1) {
    int err = (int)GetLastError();
    DWORD ExitCode;
    /* a thread that has already terminated is not treated as an error */
    if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED ||
        !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE)
      err = MDBX_SUCCESS;
    CloseHandle(hThread);
    return err;
  }

  (*array)->handles[(*array)->count++] = hThread;
  return MDBX_SUCCESS;
}
 29983  
/* Suspends all other threads of the current process which might touch the
 * memory-mapped database, so the mapping can be safely moved or resized.
 * Suspended thread handles are collected into `*array` for a later
 * osal_resume_threads_after_remap(). On any failure, the threads suspended
 * so far are resumed before returning the error. */
MDBX_INTERNAL_FUNC int
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
  eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0);
  const uintptr_t CurrentTid = GetCurrentThreadId();
  int rc;
  if (env->me_lck_mmap.lck) {
    /* Scan LCK for threads of the current process */
    const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
    const MDBX_reader *const end =
        begin +
        atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease);
    const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0;
    for (const MDBX_reader *reader = begin; reader < end; ++reader) {
      if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) {
      skip_lck:
        continue;
      }
      /* never suspend the current thread nor the writer */
      if (reader->mr_tid.weak == CurrentTid ||
          reader->mr_tid.weak == WriteTxnOwner)
        goto skip_lck;

      rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
      if (rc != MDBX_SUCCESS) {
      bailout_lck:
        /* resume everything suspended so far before reporting the error */
        (void)osal_resume_threads_after_remap(*array);
        return rc;
      }
    }
    if (WriteTxnOwner && WriteTxnOwner != CurrentTid) {
      rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner);
      if (rc != MDBX_SUCCESS)
        goto bailout_lck;
    }
  } else {
    /* Without LCK (i.e. read-only mode).
     * Walk through a snapshot of all running threads */
    eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
    const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
    if (hSnapshot == INVALID_HANDLE_VALUE)
      return (int)GetLastError();

    THREADENTRY32 entry;
    entry.dwSize = sizeof(THREADENTRY32);

    if (!Thread32First(hSnapshot, &entry)) {
      rc = (int)GetLastError();
    bailout_toolhelp:
      CloseHandle(hSnapshot);
      (void)osal_resume_threads_after_remap(*array);
      return rc;
    }

    do {
      /* only threads of this process, excluding the current one */
      if (entry.th32OwnerProcessID != env->me_pid ||
          entry.th32ThreadID == CurrentTid)
        continue;

      rc = suspend_and_append(array, entry.th32ThreadID);
      if (rc != MDBX_SUCCESS)
        goto bailout_toolhelp;

    } while (Thread32Next(hSnapshot, &entry));

    rc = (int)GetLastError();
    if (rc != ERROR_NO_MORE_FILES)
      goto bailout_toolhelp;
    CloseHandle(hSnapshot);
  }

  return MDBX_SUCCESS;
}
 30055  
 30056  MDBX_INTERNAL_FUNC int
 30057  osal_resume_threads_after_remap(mdbx_handle_array_t *array) {
 30058    int rc = MDBX_SUCCESS;
 30059    for (unsigned i = 0; i < array->count; ++i) {
 30060      const HANDLE hThread = array->handles[i];
 30061      if (ResumeThread(hThread) == (DWORD)-1) {
 30062        const int err = (int)GetLastError();
 30063        DWORD ExitCode;
 30064        if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED &&
 30065            GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE)
 30066          rc = err;
 30067      }
 30068      CloseHandle(hThread);
 30069    }
 30070    return rc;
 30071  }
 30072  
 30073  /*----------------------------------------------------------------------------*/
 30074  /* global `initial` lock for lockfile initialization,
 30075   * exclusive/shared locking first cacheline */
 30076  
/* Brief description of the locking schema/algorithm:
 30078   *  - Windows does not support upgrading or downgrading for file locking.
 30079   *  - Therefore upgrading/downgrading is emulated by shared and exclusive
 30080   *    locking of upper and lower halves.
 *  - In other words, we have an FSM with 9 possible states,
 *    i.e. free/shared/exclusive x free/shared/exclusive == 9.
 *    Only 6 states of the FSM are used, 2 of which are transitive.
 30084   *
 30085   * States:
 30086   *   ?-?  = free, i.e. unlocked
 30087   *   S-?  = used, i.e. shared lock
 30088   *   E-?  = exclusive-read, i.e. operational exclusive
 30089   *   ?-S
 30090   *   ?-E  = middle (transitive state)
 30091   *   S-S
 30092   *   S-E  = locked (transitive state)
 30093   *   E-S
 30094   *   E-E  = exclusive-write, i.e. exclusive due (re)initialization
 30095   *
 30096   *  The osal_lck_seize() moves the locking-FSM from the initial free/unlocked
 30097   *  state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
 30098   *  or to the "used" (and returns MDBX_RESULT_FALSE).
 30099   *
 30100   *  The osal_lck_downgrade() moves the locking-FSM from "exclusive write"
 30101   *  state to the "used" (i.e. shared) state.
 30102   *
 30103   *  The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
 30104   *  state to the "exclusive write" state.
 30105   */
 30106  
/* Releases all file locks held on the lck-file (lower and upper halves) and
 * the dxb-file (body and whole ranges). Used on close and on failure paths. */
static void lck_unlock(MDBX_env *env) {
  int err;

  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    /* double `unlock` for robustly remove overlapped shared/exclusive locks */
    while (funlock(env->me_lfd, LCK_LOWER))
      ;
    /* The loop ends when funlock() fails; the only expected failure is
     * "nothing left to unlock" (Wine reports ERROR_LOCK_VIOLATION instead). */
    err = (int)GetLastError();
    assert(err == ERROR_NOT_LOCKED ||
           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
    (void)err;
    SetLastError(ERROR_SUCCESS);

    while (funlock(env->me_lfd, LCK_UPPER))
      ;
    err = (int)GetLastError();
    assert(err == ERROR_NOT_LOCKED ||
           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
    (void)err;
    SetLastError(ERROR_SUCCESS);
  }

  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
    /* explicitly unlock to avoid latency for other processes (windows kernel
     * releases such locks via deferred queues) */
    while (funlock(env->me_lazy_fd, LCK_BODY))
      ;
    err = (int)GetLastError();
    assert(err == ERROR_NOT_LOCKED ||
           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
    (void)err;
    SetLastError(ERROR_SUCCESS);

    while (funlock(env->me_lazy_fd, LCK_WHOLE))
      ;
    err = (int)GetLastError();
    assert(err == ERROR_NOT_LOCKED ||
           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
    (void)err;
    SetLastError(ERROR_SUCCESS);
  }
}
 30149  
/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE)
 * or as 'used' (S-? and returns MDBX_RESULT_FALSE).
 * Otherwise returns an error.
 * See the state diagram in the "Briefly description" comment above. */
static int internal_seize_lck(HANDLE lfd) {
  int rc;
  assert(lfd != INVALID_HANDLE_VALUE);

  /* 1) now on ?-? (free), get ?-E (middle) */
  jitter4testing(false);
  if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
    rc = (int)GetLastError() /* 2) something went wrong, give up */;
    ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
    return rc;
  }

  /* 3) now on ?-E (middle), try E-E (exclusive-write) */
  jitter4testing(false);
  if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
    return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;

  /* 5) still on ?-E (middle) */
  rc = (int)GetLastError();
  jitter4testing(false);
  if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
    /* 6) something went wrong, give up */
    /* failing to release a lock we provably hold is unrecoverable */
    if (!funlock(lfd, LCK_UPPER))
      mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
                 (int)GetLastError());
    return rc;
  }

  /* 7) still on ?-E (middle), try S-E (locked) */
  jitter4testing(false);
  rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
                                                        : (int)GetLastError();

  jitter4testing(false);
  if (rc != MDBX_RESULT_FALSE)
    ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);

  /* 8) now on S-E (locked) or still on ?-E (middle),
   *    transition to S-? (used) or ?-? (free) */
  if (!funlock(lfd, LCK_UPPER))
    mdbx_panic("%s(%s) failed: err %u", __func__,
               "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());

  /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
  return rc;
}
 30199  
/* Seizes the lock-state for the environment: MDBX_RESULT_TRUE for the
 * exclusive-write state, MDBX_RESULT_FALSE for the shared "used" state,
 * otherwise an error code. */
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
  int rc;

  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_RESULT_TRUE /* nothing to do: the files must have been opened
                               non-shareable, so exclusivity is already ours */
        ;

  if (env->me_lfd == INVALID_HANDLE_VALUE) {
    /* LY: without-lck mode (e.g. on read-only filesystem) */
    jitter4testing(false);
    if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
      rc = (int)GetLastError();
      ERROR("%s, err %u", "without-lck", rc);
      return rc;
    }
    return MDBX_RESULT_FALSE;
  }

  rc = internal_seize_lck(env->me_lfd);
  jitter4testing(false);
  if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
    /* Check that no other process operates in without-lck mode.
     * Doing such check by exclusive locking the body-part of db. Should be
     * noted:
     *  - we need an exclusive lock for do so;
     *  - we can't lock meta-pages, otherwise other process could get an error
     *    while opening db in valid (non-conflict) mode. */
    if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
      rc = (int)GetLastError();
      ERROR("%s, err %u", "lock-against-without-lck", rc);
      jitter4testing(false);
      lck_unlock(env);
    } else {
      jitter4testing(false);
      /* the probe succeeded: drop it right away, exclusivity is already
       * guaranteed by the lck-file locks taken by internal_seize_lck() */
      if (!funlock(env->me_lazy_fd, LCK_BODY))
        mdbx_panic("%s(%s) failed: err %u", __func__,
                   "unlock-against-without-lck", (int)GetLastError());
    }
  }

  return rc;
}
 30244  
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
  /* Transition from exclusive-write state (E-E) to used (S-?) */
  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
  assert(env->me_lfd != INVALID_HANDLE_VALUE);

  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_SUCCESS /* nothing to do: the files must have been opened
                           non-shareable */
        ;
  /* 1) now at E-E (exclusive-write), transition to ?-E (middle) */
  if (!funlock(env->me_lfd, LCK_LOWER))
    mdbx_panic("%s(%s) failed: err %u", __func__,
               "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError());

  /* 2) now at ?-E (middle), transition to S-E (locked) */
  if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) {
    int rc = (int)GetLastError() /* 3) something went wrong, give up */;
    ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
    return rc;
  }

  /* 4) got S-E (locked), continue transition to S-? (used) */
  if (!funlock(env->me_lfd, LCK_UPPER))
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
               (int)GetLastError());

  return MDBX_SUCCESS /* 5) now at S-? (used), done */;
}
 30272  
MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
  /* Transition from used state (S-?) to exclusive-write (E-E) */
  assert(env->me_lfd != INVALID_HANDLE_VALUE);

  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_SUCCESS /* nothing to do: the files must have been opened
                           non-shareable */
        ;

  int rc;
  /* 1) now on S-? (used), try S-E (locked) */
  jitter4testing(false);
  if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) {
    rc = (int)GetLastError() /* 2) something went wrong, give up */;
    VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc);
    return rc;
  }

  /* 3) now on S-E (locked), transition to ?-E (middle) */
  if (!funlock(env->me_lfd, LCK_LOWER))
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
               (int)GetLastError());

  /* 4) now on ?-E (middle), try E-E (exclusive-write) */
  jitter4testing(false);
  if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) {
    rc = (int)GetLastError() /* 5) something went wrong, give up */;
    VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
    /* NOTE: the FSM is left in the transitive ?-E (middle) state here;
     * callers are expected to recover, e.g. via lck_unlock(). */
    return rc;
  }

  return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */;
}
 30305  
 30306  MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
 30307                                       MDBX_env *inprocess_neighbor,
 30308                                       int global_uniqueness_flag) {
 30309    (void)env;
 30310    (void)inprocess_neighbor;
 30311    (void)global_uniqueness_flag;
 30312    return MDBX_SUCCESS;
 30313  }
 30314  
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
                                        MDBX_env *inprocess_neighbor) {
  /* LY: should unmap before releasing the locks to avoid race condition and
   * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */
  if (env->me_map)
    osal_munmap(&env->me_dxb_mmap);
  if (env->me_lck_mmap.lck) {
    /* remember whether all pages were synced before the mapping is gone */
    const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
    osal_munmap(&env->me_lck_mmap);
    /* Shrink the lck-file to zero only when safe: everything synced, no other
     * env instance in this process, and the exclusive-write lock obtainable */
    if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE &&
        mdbx_lck_upgrade(env) == MDBX_SUCCESS)
      /* this will fail if LCK is used/mmapped by other process(es) */
      osal_ftruncate(env->me_lfd, 0);
  }
  lck_unlock(env);
  return MDBX_SUCCESS;
}
 30332  
 30333  /*----------------------------------------------------------------------------*/
 30334  /* reader checking (by pid) */
 30335  
/* No-op on Windows: reader liveness is probed via OpenProcess() by pid
 * (see osal_rpid_check below), so nothing needs to be registered here. */
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
  (void)env;
  return MDBX_SUCCESS;
}
 30340  
/* No-op on Windows: the counterpart of osal_rpid_set() above. */
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
  (void)env;
  return MDBX_SUCCESS;
}
 30345  
 30346  /* Checks reader by pid.
 30347   *
 30348   * Returns:
 30349   *   MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
 30350   *   MDBX_RESULT_FALSE, if pid is dead (lock acquired)
 30351   *   or otherwise the errcode. */
 30352  MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
 30353    (void)env;
 30354    HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid);
 30355    int rc;
 30356    if (likely(hProcess)) {
 30357      rc = WaitForSingleObject(hProcess, 0);
 30358      if (unlikely(rc == (int)WAIT_FAILED))
 30359        rc = (int)GetLastError();
 30360      CloseHandle(hProcess);
 30361    } else {
 30362      rc = (int)GetLastError();
 30363    }
 30364  
 30365    switch (rc) {
 30366    case ERROR_INVALID_PARAMETER:
 30367      /* pid seems invalid */
 30368      return MDBX_RESULT_FALSE;
 30369    case WAIT_OBJECT_0:
 30370      /* process just exited */
 30371      return MDBX_RESULT_FALSE;
 30372    case ERROR_ACCESS_DENIED:
 30373      /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc.
 30374       * assume pid exists */
 30375      return MDBX_RESULT_TRUE;
 30376    case WAIT_TIMEOUT:
 30377      /* pid running */
 30378      return MDBX_RESULT_TRUE;
 30379    default:
 30380      /* failure */
 30381      return rc;
 30382    }
 30383  }
 30384  
 30385  //----------------------------------------------------------------------------
 30386  // Stub for slim read-write lock
 30387  // Copyright (C) 1995-2002 Brad Wilson
 30388  
 30389  static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) {
 30390    srwl->readerCount = srwl->writerCount = 0;
 30391  }
 30392  
 30393  static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) {
 30394    while (true) {
 30395      assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
 30396  
 30397      //  If there's a writer already, spin without unnecessarily
 30398      //  interlocking the CPUs
 30399      if (srwl->writerCount != 0) {
 30400        YieldProcessor();
 30401        continue;
 30402      }
 30403  
 30404      //  Add to the readers list
 30405      _InterlockedIncrement(&srwl->readerCount);
 30406  
 30407      // Check for writers again (we may have been preempted). If
 30408      // there are no writers writing or waiting, then we're done.
 30409      if (srwl->writerCount == 0)
 30410        break;
 30411  
 30412      // Remove from the readers list, spin, try again
 30413      _InterlockedDecrement(&srwl->readerCount);
 30414      YieldProcessor();
 30415    }
 30416  }
 30417  
/* Drops one reader registration taken by stub_srwlock_AcquireShared(). */
static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) {
  assert(srwl->readerCount > 0);
  _InterlockedDecrement(&srwl->readerCount);
}
 30422  
 30423  static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) {
 30424    while (true) {
 30425      assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
 30426  
 30427      //  If there's a writer already, spin without unnecessarily
 30428      //  interlocking the CPUs
 30429      if (srwl->writerCount != 0) {
 30430        YieldProcessor();
 30431        continue;
 30432      }
 30433  
 30434      // See if we can become the writer (expensive, because it inter-
 30435      // locks the CPUs, so writing should be an infrequent process)
 30436      if (_InterlockedExchange(&srwl->writerCount, 1) == 0)
 30437        break;
 30438    }
 30439  
 30440    // Now we're the writer, but there may be outstanding readers.
 30441    // Spin until there aren't any more; new readers will wait now
 30442    // that we're the writer.
 30443    while (srwl->readerCount != 0) {
 30444      assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
 30445      YieldProcessor();
 30446    }
 30447  }
 30448  
/* Gives up writer ownership taken by stub_srwlock_AcquireExclusive(). */
static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) {
  assert(srwl->writerCount == 1 && srwl->readerCount >= 0);
  srwl->writerCount = 0;
}
 30453  
 30454  static uint64_t WINAPI stub_GetTickCount64(void) {
 30455    LARGE_INTEGER Counter, Frequency;
 30456    return (QueryPerformanceFrequency(&Frequency) &&
 30457            QueryPerformanceCounter(&Counter))
 30458               ? Counter.QuadPart * 1000ul / Frequency.QuadPart
 30459               : 0;
 30460  }
 30461  
 30462  /*----------------------------------------------------------------------------*/
 30463  
#ifndef xMDBX_ALLOY
/* Function pointers filled in by mdbx_winnt_import(): either the native
 * SRW-lock API entry points or the stub_srwlock_* fallbacks above. */
osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared,
    osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive,
    osal_srwlock_ReleaseExclusive;

/* Optional Windows/NT entry points resolved at runtime via GetProcAddress()
 * by mdbx_winnt_import(); any of these may remain NULL. */
MDBX_NtExtendSection mdbx_NtExtendSection;
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
#endif /* xMDBX_ALLOY */
 30479  
 30480  #if __GNUC_PREREQ(8, 0)
 30481  #pragma GCC diagnostic push
 30482  #pragma GCC diagnostic ignored "-Wcast-function-type"
 30483  #endif /* GCC/MINGW */
 30484  
/* Resolves optional Windows/NT entry points at runtime and selects either
 * the native SRW-lock API or the stub implementation above. */
static void mdbx_winnt_import(void) {
  const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");

#define GET_PROC_ADDR(dll, ENTRY)                                              \
  mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)

  if (GetProcAddress(hNtdll, "wine_get_version")) {
    /* Wine detected: leave the ntdll entry points unset. */
    assert(mdbx_RunningUnderWine());
  } else {
    GET_PROC_ADDR(hNtdll, NtFsControlFile);
    GET_PROC_ADDR(hNtdll, NtExtendSection);
    assert(!mdbx_RunningUnderWine());
  }

  const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
  GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
  GET_PROC_ADDR(hKernel32dll, GetTickCount64);
  /* GetTickCount64 is absent on pre-Vista Windows: substitute the stub. */
  if (!mdbx_GetTickCount64)
    mdbx_GetTickCount64 = stub_GetTickCount64;
  if (!mdbx_RunningUnderWine()) {
    GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
    GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
    GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
    GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
  }

  const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
  GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
#undef GET_PROC_ADDR

  /* Prefer the native slim reader/writer locks when available; otherwise
   * fall back to the portable stub implementation. */
  const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress(
      hKernel32dll, "InitializeSRWLock");
  if (init != NULL) {
    osal_srwlock_Init = init;
    osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockShared");
    osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockShared");
    osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockExclusive");
    osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockExclusive");
  } else {
    osal_srwlock_Init = stub_srwlock_Init;
    osal_srwlock_AcquireShared = stub_srwlock_AcquireShared;
    osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
    osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
    osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
  }
}
 30535  
 30536  #if __GNUC_PREREQ(8, 0)
 30537  #pragma GCC diagnostic pop
 30538  #endif /* GCC/MINGW */
 30539  
 30540  #endif /* Windows LCK-implementation */
 30541  /*
 30542   * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 30543   * and other libmdbx authors: please see AUTHORS file.
 30544   * All rights reserved.
 30545   *
 30546   * Redistribution and use in source and binary forms, with or without
 30547   * modification, are permitted only as authorized by the OpenLDAP
 30548   * Public License.
 30549   *
 30550   * A copy of this license is available in the file LICENSE in the
 30551   * top-level directory of the distribution or, alternatively, at
 30552   * <http://www.OpenLDAP.org/license.html>.
 30553   */
 30554  
 30555  #if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */
 30556  
 30557  
 30558  #if MDBX_LOCKING == MDBX_LOCKING_SYSV
 30559  #include <sys/sem.h>
 30560  #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */
 30561  
 30562  /*----------------------------------------------------------------------------*/
 30563  /* global constructor/destructor */
 30564  
 30565  #if defined(__linux__) || defined(__gnu_linux__)
 30566  
 30567  #include <sys/utsname.h>
 30568  
#ifndef xMDBX_ALLOY
/* Packed kernel version (one byte per component, filled by the constructor
 * below) and the WSL1 detection flag derived from uname(). */
uint32_t linux_kernel_version;
bool mdbx_RunningOnWSL1;
#endif /* xMDBX_ALLOY */
 30573  
 30574  MDBX_EXCLUDE_FOR_GPROF
 30575  __cold static uint8_t probe_for_WSL(const char *tag) {
 30576    const char *const WSL = strstr(tag, "WSL");
 30577    if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
 30578      return WSL[3] - '0';
 30579    const char *const wsl = strstr(tag, "wsl");
 30580    if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
 30581      return wsl[3] - '0';
 30582    if (WSL || wsl || strcasestr(tag, "Microsoft"))
 30583      /* Expecting no new kernel within WSL1, either it will explicitly
 30584       * marked by an appropriate WSL-version hint. */
 30585      return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
 30586    return 0;
 30587  }
 30588  
 30589  #endif /* Linux */
 30590  
 30591  #ifdef ENABLE_GPROF
 30592  extern void _mcleanup(void);
 30593  extern void monstartup(unsigned long, unsigned long);
 30594  extern void _init(void);
 30595  extern void _fini(void);
 30596  extern void __gmon_start__(void) __attribute__((__weak__));
 30597  #endif /* ENABLE_GPROF */
 30598  
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
#ifdef ENABLE_GPROF
  /* Start profiling manually unless the gmon startup hook is present. */
  if (!&__gmon_start__)
    monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
#endif /* ENABLE_GPROF */

#if defined(__linux__) || defined(__gnu_linux__)
  struct utsname buffer;
  if (uname(&buffer) == 0) {
    int i = 0;
    char *p = buffer.release;
    /* Pack up to the first 4 numeric components of the kernel release into
     * linux_kernel_version, one byte per component, each clamped to 255:
     * e.g. "5.15.0-91" becomes 0x050f005b. */
    while (*p && i < 4) {
      if (*p >= '0' && *p <= '9') {
        long number = strtol(p, &p, 10);
        if (number > 0) {
          if (number > 255)
            number = 255;
          linux_kernel_version += number << (24 - i * 8);
        }
        ++i;
      } else {
        ++p;
      }
    }
    /* "Official" way of detecting WSL1 but not WSL2
     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
     *
     * WARNING: False negative detection of WSL1 will result in DATA LOSS!
     * So, the REQUIREMENTS for this code:
     *  1. MUST detect WSL1 without false-negatives.
     *  2. DESIRABLE detect WSL2 but without the risk of violating the first. */
    mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
                         probe_for_WSL(buffer.sysname) == 1 ||
                         probe_for_WSL(buffer.release) == 1;
  }
#endif /* Linux */

  global_ctor();
}
 30640  
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
  /* Library-wide teardown first, then stop profiling if it was started
   * manually by mdbx_global_constructor(). */
  global_dtor();
#ifdef ENABLE_GPROF
  if (!&__gmon_start__)
    _mcleanup();
#endif /* ENABLE_GPROF */
}
 30650  
 30651  /*----------------------------------------------------------------------------*/
 30652  /* lck */
 30653  
 30654  /* Описание реализации блокировок для POSIX & Linux:
 30655   *
 30656   * lck-файл отображается в память, в нём организуется таблица читателей и
 30657   * размещаются совместно используемые posix-мьютексы (futex). Посредством
 30658   * этих мьютексов (см struct MDBX_lockinfo) реализуются:
 30659   *  - Блокировка таблицы читателей для регистрации,
 30660   *    т.е. функции osal_rdt_lock() и osal_rdt_unlock().
 30661   *  - Блокировка БД для пишущих транзакций,
 30662   *    т.е. функции mdbx_txn_lock() и mdbx_txn_unlock().
 30663   *
 30664   * Остальной функционал реализуется отдельно посредством файловых блокировок:
 30665   *  - Первоначальный захват БД в режиме exclusive/shared и последующий перевод
 30666   *    в операционный режим, функции osal_lck_seize() и osal_lck_downgrade().
 30667   *  - Проверка присутствие процессов-читателей,
 30668   *    т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check().
 30669   *
 30670   * Для блокировки файлов используется fcntl(F_SETLK), так как:
 30671   *  - lockf() оперирует только эксклюзивной блокировкой и требует
 30672   *    открытия файла в RW-режиме.
 30673   *  - flock() не гарантирует атомарности при смене блокировок
 30674   *    и оперирует только всем файлом целиком.
 30675   *  - Для контроля процессов-читателей используются однобайтовые
 30676   *    range-блокировки lck-файла посредством fcntl(F_SETLK). При этом
 30677   *    в качестве позиции используется pid процесса-читателя.
 30678   *  - Для первоначального захвата и shared/exclusive выполняется блокировка
 30679   *    основного файла БД и при успехе lck-файла.
 30680   *
 30681   * ----------------------------------------------------------------------------
 30682   * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ
 30683   *
 30684   * Эксклюзивный режим без lck-файла:
 30685   *   = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK,
 30686   *     в зависимости от MDBX_RDONLY.
 30687   *
 30688   * Не-операционный режим на время пере-инициализации и разрушении lck-файла:
 30689   *   = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её
 30690   *     снятия при получении F_RDLCK через F_SETLKW.
 30691   *   - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки
 30692   *    lck-файла:
 30693   *       + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
 30694   *         посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
 30695   *       + для ЭКСКЛЮЗИВНОГО режима блокировка всего dxb-файла
 30696   *         посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
 30697   *
 30698   * ОПЕРАЦИОННЫЙ режим с lck-файлом:
 30699   *   = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут
 30700   *     получить F_WRLCK и таким образом видят что БД используется.
 30701   *   + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения.
 30702   *   + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
 30703   *     посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
 30704   *   + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла
 30705   *     посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
 30706   */
 30707  
#if MDBX_USE_OFDLOCKS
/* The fcntl() commands actually in use: either the OFD (open file
 * description) flavors or the classic POSIX ones, chosen once at runtime. */
static int op_setlk, op_setlkw, op_getlk;
/* Selects OFD locks when the kernel is new enough and legacy multi-open
 * debugging is not requested; otherwise falls back to classic locks. */
__cold static void choice_fcntl(void) {
  assert(!op_setlk && !op_setlkw && !op_getlk);
  if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
#if defined(__linux__) || defined(__gnu_linux__)
      && linux_kernel_version >
             0x030f0000 /* OFD locks are available since 3.15, but engages here
                           only for 3.16 and later kernels (i.e. LTS) because
                           of reliability reasons */
#endif                  /* linux */
  ) {
    op_setlk = F_OFD_SETLK;
    op_setlkw = F_OFD_SETLKW;
    op_getlk = F_OFD_GETLK;
    return;
  }
  op_setlk = F_SETLK;
  op_setlkw = F_SETLKW;
  op_getlk = F_GETLK;
}
#else
#define op_setlk F_SETLK
#define op_setlkw F_SETLKW
#define op_getlk F_GETLK
#endif /* MDBX_USE_OFDLOCKS */
 30734  
#ifndef OFF_T_MAX
/* Upper bound used for whole-file lock ranges: the top of the off_t range
 * with the low 16 bits masked off. */
#define OFF_T_MAX                                                              \
  (((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
#endif
 30739  
/* Performs a single fcntl() file-locking operation, retrying on EINTR for
 * non-waiting commands and transparently falling back from OFD to classic
 * POSIX locks when the kernel rejects the OFD commands with EINVAL.
 *
 * For op_getlk returns MDBX_RESULT_TRUE (a holder exists) or
 * MDBX_RESULT_FALSE (the range could be locked); for other commands
 * returns MDBX_SUCCESS or an errno value. */
static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
                  const off_t offset, off_t len) {
  STATIC_ASSERT(sizeof(off_t) >= sizeof(void *) &&
                sizeof(off_t) >= sizeof(size_t));
#ifdef __ANDROID_API__
  STATIC_ASSERT_MSG((sizeof(off_t) * 8 == MDBX_WORDBITS),
                    "The bitness of system `off_t` type is mismatch. Please "
                    "fix build and/or NDK configuration.");
#endif /* Android */
  jitter4testing(true);
  assert(offset >= 0 && len > 0);
  assert((uint64_t)offset < (uint64_t)INT64_MAX &&
         (uint64_t)len < (uint64_t)INT64_MAX &&
         (uint64_t)(offset + len) > (uint64_t)offset);

  assert((uint64_t)offset < (uint64_t)OFF_T_MAX &&
         (uint64_t)len <= (uint64_t)OFF_T_MAX &&
         (uint64_t)(offset + len) <= (uint64_t)OFF_T_MAX);

  assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) ==
         ((uint64_t)offset + (uint64_t)len));
  for (;;) {
    struct flock lock_op;
    STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) &&
                          sizeof(off_t) <= sizeof(lock_op.l_len) &&
                          OFF_T_MAX == (off_t)OFF_T_MAX,
                      "Support for large/64-bit-sized files is misconfigured "
                      "for the target system and/or toolchain. "
                      "Please fix it or at least disable it completely.");
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = lck;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = offset;
    lock_op.l_len = len;
    int rc = fcntl(fd, cmd, &lock_op);
    jitter4testing(true);
    if (rc != -1) {
      if (cmd == op_getlk) {
        /* Checks reader by pid. Returns:
         *   MDBX_RESULT_TRUE   - if pid is live (reader holds a lock).
         *   MDBX_RESULT_FALSE  - if pid is dead (a lock could be placed). */
        return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
                                           : MDBX_RESULT_TRUE;
      }
      return MDBX_SUCCESS;
    }
    rc = errno;
#if MDBX_USE_OFDLOCKS
    if (rc == EINVAL &&
        (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) {
      /* fallback to non-OFD locks */
      if (cmd == F_OFD_SETLK)
        cmd = F_SETLK;
      else if (cmd == F_OFD_SETLKW)
        cmd = F_SETLKW;
      else
        cmd = F_GETLK;
      op_setlk = F_SETLK;
      op_setlkw = F_SETLKW;
      op_getlk = F_GETLK;
      continue;
    }
#endif /* MDBX_USE_OFDLOCKS */
    /* Retry on EINTR, but not for waiting commands: an interrupted wait
     * is reported to the caller instead of being restarted. */
    if (rc != EINTR || cmd == op_setlkw) {
      assert(MDBX_IS_ERROR(rc));
      return rc;
    }
  }
}
 30809  
 30810  MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
 30811  #if MDBX_USE_OFDLOCKS
 30812    if (unlikely(op_setlk == 0))
 30813      choice_fcntl();
 30814  #endif /* MDBX_USE_OFDLOCKS */
 30815    return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX);
 30816  }
 30817  
 30818  MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
 30819    assert(env->me_lfd != INVALID_HANDLE_VALUE);
 30820    assert(env->me_pid > 0);
 30821    if (unlikely(osal_getpid() != env->me_pid))
 30822      return MDBX_PANIC;
 30823    return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
 30824  }
 30825  
 30826  MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
 30827    assert(env->me_lfd != INVALID_HANDLE_VALUE);
 30828    assert(env->me_pid > 0);
 30829    return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
 30830  }
 30831  
 30832  MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
 30833    assert(env->me_lfd != INVALID_HANDLE_VALUE);
 30834    assert(pid > 0);
 30835    return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
 30836  }
 30837  
 30838  /*---------------------------------------------------------------------------*/
 30839  
#if MDBX_LOCKING > MDBX_LOCKING_SYSV
/* Initializes an IPC lock object in its plain (non-shared) form.
 * Returns 0 on success, otherwise an errno value. */
MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  /* unnamed semaphore: pshared=false, initial value 1 (unlocked) */
  return sem_init(ipc, false, 1) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  /* default mutex attributes (contrast with osal_lck_init(), which sets
   * process-shared/robust attributes for the real lck-file mutexes) */
  return pthread_mutex_init(ipc, nullptr);
#else
#error "FIXME"
#endif
}

/* Destroys an IPC lock object created by osal_ipclock_stub() or
 * osal_lck_init(). Returns 0 on success, otherwise an errno value. */
MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  return sem_destroy(ipc) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  return pthread_mutex_destroy(ipc);
#else
#error "FIXME"
#endif
}
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
 30863  
 30864  static int check_fstat(MDBX_env *env) {
 30865    struct stat st;
 30866  
 30867    int rc = MDBX_SUCCESS;
 30868    if (fstat(env->me_lazy_fd, &st)) {
 30869      rc = errno;
 30870      ERROR("fstat(%s), err %d", "DXB", rc);
 30871      return rc;
 30872    }
 30873  
 30874    if (!S_ISREG(st.st_mode) || st.st_nlink < 1) {
 30875  #ifdef EBADFD
 30876      rc = EBADFD;
 30877  #else
 30878      rc = EPERM;
 30879  #endif
 30880      ERROR("%s %s, err %d", "DXB",
 30881            (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc);
 30882      return rc;
 30883    }
 30884  
 30885    if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) {
 30886      VERBOSE("dxb-file is too short (%u), exclusive-lock needed",
 30887              (unsigned)st.st_size);
 30888      rc = MDBX_RESULT_TRUE;
 30889    }
 30890  
 30891    //----------------------------------------------------------------------------
 30892  
 30893    if (fstat(env->me_lfd, &st)) {
 30894      rc = errno;
 30895      ERROR("fstat(%s), err %d", "LCK", rc);
 30896      return rc;
 30897    }
 30898  
 30899    if (!S_ISREG(st.st_mode) || st.st_nlink < 1) {
 30900  #ifdef EBADFD
 30901      rc = EBADFD;
 30902  #else
 30903      rc = EPERM;
 30904  #endif
 30905      ERROR("%s %s, err %d", "LCK",
 30906            (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc);
 30907      return rc;
 30908    }
 30909  
 30910    /* Checking file size for detect the situation when we got the shared lock
 30911     * immediately after osal_lck_destroy(). */
 30912    if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) {
 30913      VERBOSE("lck-file is too short (%u), exclusive-lock needed",
 30914              (unsigned)st.st_size);
 30915      rc = MDBX_RESULT_TRUE;
 30916    }
 30917  
 30918    return rc;
 30919  }
 30920  
/* Seizes the file locks for the environment, trying exclusive first and
 * falling back to shared.
 *
 * Returns:
 *   MDBX_RESULT_TRUE  - locks acquired in exclusive mode;
 *   MDBX_RESULT_FALSE - locks acquired in shared mode;
 *   otherwise         - an error code (MDBX_PANIC after fork()). */
__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
  /* refuse to operate on a stale env inherited across fork() */
  if (unlikely(osal_getpid() != env->me_pid))
    return MDBX_PANIC;
#if MDBX_USE_OFDLOCKS
  /* lazily select OFD- vs plain-POSIX fcntl lock commands on first use */
  if (unlikely(op_setlk == 0))
    choice_fcntl();
#endif /* MDBX_USE_OFDLOCKS */

  int rc = MDBX_SUCCESS;
#if defined(__linux__) || defined(__gnu_linux__)
  /* WSL1 file locking is considered unreliable; fail early on purpose */
  if (unlikely(mdbx_RunningOnWSL1)) {
    rc = ENOLCK /* No record locks available */;
    ERROR("%s, err %u",
          "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, "
          "injecting failure to avoid data loss",
          rc);
    return rc;
  }
#endif /* Linux */

  if (env->me_lfd == INVALID_HANDLE_VALUE) {
    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
    rc =
        lck_op(env->me_lazy_fd, op_setlk,
               (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
    if (rc != MDBX_SUCCESS) {
      ERROR("%s, err %u", "without-lck", rc);
      eASSERT(env, MDBX_IS_ERROR(rc));
      return rc;
    }
    return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
  }
#if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0
  sched_yield();
#endif

retry:
  /* when retrying after check_fstat() signalled MDBX_RESULT_TRUE,
   * drop our byte-0 lock on the lck-file before trying again */
  if (rc == MDBX_RESULT_TRUE) {
    rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1);
    if (rc != MDBX_SUCCESS) {
      ERROR("%s, err %u", "unlock-before-retry", rc);
      eASSERT(env, MDBX_IS_ERROR(rc));
      return rc;
    }
  }

  /* Firstly try to get exclusive locking.  */
  rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
  if (rc == MDBX_SUCCESS) {
    rc = check_fstat(env);
    if (MDBX_IS_ERROR(rc))
      return rc;

  continue_dxb_exclusive:
    /* holding the lck-file byte-0 write-lock: try to also lock the whole
     * dxb-file, which completes exclusive ownership */
    rc =
        lck_op(env->me_lazy_fd, op_setlk,
               (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
    if (rc == MDBX_SUCCESS)
      return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;

    int err = check_fstat(env);
    if (MDBX_IS_ERROR(err))
      return err;

    /* the cause may be a collision with POSIX's file-lock recovery. */
    if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
          rc == EDEADLK)) {
      ERROR("%s, err %u", "dxb-exclusive", rc);
      eASSERT(env, MDBX_IS_ERROR(rc));
      return rc;
    }

    /* Fallback to lck-shared */
  } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY ||
               rc == EWOULDBLOCK || rc == EDEADLK)) {
    ERROR("%s, err %u", "try-exclusive", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Here could be one of two:
   *  - osal_lck_destroy() from the another process was hold the lock
   *    during a destruction.
   *  - either osal_lck_seize() from the another process was got the exclusive
   *    lock and doing initialization.
   * For distinguish these cases will use size of the lck-file later. */

  /* Wait for lck-shared now. */
  /* Here may be await during transient processes, for instance until another
   * competing process doesn't call lck_downgrade(). */
  rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "try-shared", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  rc = check_fstat(env);
  if (rc == MDBX_RESULT_TRUE)
    goto retry;
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "lck_fstat", rc);
    return rc;
  }

  /* got shared, retry exclusive */
  rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
  if (rc == MDBX_SUCCESS)
    goto continue_dxb_exclusive;

  if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
        rc == EDEADLK)) {
    ERROR("%s, err %u", "try-exclusive", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Lock against another process operating in without-lck or exclusive mode. */
  rc =
      lck_op(env->me_lazy_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "lock-against-without-lck", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Done: return with shared locking. */
  return MDBX_RESULT_FALSE;
}
 31052  
 31053  MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
 31054    assert(env->me_lfd != INVALID_HANDLE_VALUE);
 31055    if (unlikely(osal_getpid() != env->me_pid))
 31056      return MDBX_PANIC;
 31057  
 31058    int rc = MDBX_SUCCESS;
 31059    if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
 31060      rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
 31061      if (rc == MDBX_SUCCESS)
 31062        rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
 31063                    OFF_T_MAX - env->me_pid - 1);
 31064    }
 31065    if (rc == MDBX_SUCCESS)
 31066      rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
 31067    if (unlikely(rc != 0)) {
 31068      ERROR("%s, err %u", "lck", rc);
 31069      assert(MDBX_IS_ERROR(rc));
 31070    }
 31071    return rc;
 31072  }
 31073  
/* Tears down the environment's lock state and closes its descriptors.
 * If exclusive access can be obtained (and no in-process neighbor shares
 * the env), the IPC locks are destroyed and the lck-file truncated; then
 * descriptors are closed in a specific order, restoring POSIX fcntl locks
 * for `inprocess_neighbor` where closing a descriptor would have dropped
 * them. Returns MDBX_SUCCESS, MDBX_PANIC after fork(), or an errno. */
__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
                                               MDBX_env *inprocess_neighbor) {
  if (unlikely(osal_getpid() != env->me_pid))
    return MDBX_PANIC;

  int rc = MDBX_SUCCESS;
  struct stat lck_info;
  MDBX_lockinfo *lck = env->me_lck_mmap.lck;
  if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck &&
      /* try get exclusive access */
      lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
      /* if LCK was not removed */
      fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
      lck_op(env->me_lazy_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
             OFF_T_MAX) == 0) {

    VERBOSE("%p got exclusive, drown locks", (void *)env);
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
    if (env->me_sysv_ipc.semid != -1)
      rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
#else
    rc = osal_ipclock_destroy(&lck->mti_rlock);
    if (rc == 0)
      rc = osal_ipclock_destroy(&lck->mti_wlock);
#endif /* MDBX_LOCKING */

    eASSERT(env, rc == 0);
    if (rc == 0) {
      /* truncate the lck-file only when all pages were synced, so a later
       * osal_lck_seize() can distinguish a clean state by the file size */
      const bool synced = lck->mti_unsynced_pages.weak == 0;
      osal_munmap(&env->me_lck_mmap);
      if (synced)
        rc = ftruncate(env->me_lfd, 0) ? errno : 0;
    }

    jitter4testing(false);
  }

  /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored
   * after file was closed.
   *
   * 2) File locks would be released (by kernel) while the file-descriptors will
   * be closed. But to avoid false-positive EACCESS and EDEADLK from the kernel,
   * locks should be released here explicitly with properly order. */

  /* close dxb and restore lock */
  if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_dsync_fd = INVALID_HANDLE_VALUE;
  }
  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lazy_fd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-lock */
      rc = lck_op(
          inprocess_neighbor->me_lazy_fd, F_SETLKW,
          (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
              ? 0
              : inprocess_neighbor->me_pid,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
    }
  }

  /* close clk and restore locks */
  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lfd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-locks */
      rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
      if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
        rc = osal_rpid_set(inprocess_neighbor);
    }
  }

  if (inprocess_neighbor && rc != MDBX_SUCCESS)
    inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
 31158  
 31159  /*---------------------------------------------------------------------------*/
 31160  
/* Initializes the inter-process synchronization primitives for the env.
 * `global_uniqueness_flag == MDBX_RESULT_TRUE` means this process holds
 * exclusive access and must (re)create the primitives; otherwise they are
 * expected to exist already. The implementation is selected at compile
 * time by MDBX_LOCKING (SysV semaphores, futexes, POSIX-1988 semaphores,
 * or POSIX-2001/2008 process-shared mutexes). */
__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
                                            MDBX_env *inprocess_neighbor,
                                            int global_uniqueness_flag) {
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  int semid = -1;
  /* don't initialize semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    struct stat st;
    if (fstat(env->me_lazy_fd, &st))
      return errno;
  sysv_retry_create:
    /* create a fresh 2-semaphore set with the dxb-file's access mode */
    semid = semget(env->me_sysv_ipc.key, 2,
                   IPC_CREAT | IPC_EXCL |
                       (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
    if (unlikely(semid == -1)) {
      int err = errno;
      if (err != EEXIST)
        return err;

      /* remove and re-create semaphore set */
      semid = semget(env->me_sysv_ipc.key, 2, 0);
      if (semid == -1) {
        err = errno;
        if (err != ENOENT)
          return err;
        goto sysv_retry_create;
      }
      if (semctl(semid, 2, IPC_RMID)) {
        err = errno;
        if (err != EIDRM)
          return err;
      }
      goto sysv_retry_create;
    }

    /* both semaphores start at 1 (unlocked) */
    unsigned short val_array[2] = {1, 1};
    if (semctl(semid, 2, SETALL, val_array))
      return errno;
  } else {
    semid = semget(env->me_sysv_ipc.key, 2, 0);
    if (semid == -1)
      return errno;

    /* check read & write access */
    struct semid_ds data[2];
    if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data))
      return errno;
  }

  env->me_sysv_ipc.semid = semid;
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX
  (void)inprocess_neighbor;
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#error "FIXME: Not implemented"
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988

  /* don't initialize semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    /* process-shared (pshared=true) semaphores, initial value 1 */
    if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1))
      return errno;
    if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1))
      return errno;
  }
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  if (inprocess_neighbor)
    return MDBX_SUCCESS /* don't need any initialization for mutexes
      if LCK already opened/used inside current process */
        ;

    /* FIXME: Unfortunately, there is no other reliable way but to long testing
     * on each platform. On the other hand, behavior like FreeBSD is incorrect
     * and we can expect it to be rare. Moreover, even on FreeBSD without
     * additional in-process initialization, the probability of an problem
     * occurring is vanishingly small, and the symptom is a return of EINVAL
     * while locking a mutex. In other words, in the worst case, the problem
     * results in an EINVAL error at the start of the transaction, but NOT data
     * loss, nor database corruption, nor other fatal troubles. Thus, the code
     * below I am inclined to think the workaround for erroneous platforms (like
     * FreeBSD), rather than a defect of libmdbx. */
#if defined(__FreeBSD__)
  /* seems that shared mutexes on FreeBSD required in-process initialization */
  (void)global_uniqueness_flag;
#else
  /* shared mutexes on many other platforms (including Darwin and Linux's
   * futexes) doesn't need any addition in-process initialization */
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#endif

  pthread_mutexattr_t ma;
  int rc = pthread_mutexattr_init(&ma);
  if (rc)
    return rc;

  rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
  if (rc)
    goto bailout;

#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust)
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#elif defined(PTHREAD_MUTEX_ROBUST_NP) ||                                      \
    defined(pthread_mutexattr_setrobust_np)
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#else
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#endif
  if (rc)
    goto bailout;
#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */

#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 &&  \
    !defined(MDBX_SAFE4QEMU)
  /* priority inheritance is preferred but optional */
  rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
  if (rc == ENOTSUP)
    rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
  if (rc && rc != ENOTSUP)
    goto bailout;
#endif /* PTHREAD_PRIO_INHERIT */

  rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
  if (rc && rc != ENOTSUP)
    goto bailout;

  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
  if (rc)
    goto bailout;
  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);

bailout:
  pthread_mutexattr_destroy(&ma);
  return rc;
#else
#error "FIXME"
#endif /* MDBX_LOCKING > 0 */
}
 31307  
/* Handles a failed IPC-lock (un)lock operation `err`.
 * For robust mutexes / SysV semaphores, EOWNERDEAD triggers recovery:
 * dead readers are cleaned up and the mutex is marked consistent.
 * Any other error marks the environment with MDBX_FATAL_ERROR
 * (except EDEADLK). Returns the (possibly updated) error code. */
__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc,
                                      const int err) {
  int rc = err;
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV
  if (err == EOWNERDEAD) {
    /* We own the mutex. Clean up after dead previous owner. */

    const bool rlocked = ipc == &env->me_lck->mti_rlock;
    rc = MDBX_SUCCESS;
    if (!rlocked) {
      if (unlikely(env->me_txn)) {
        /* env is hosed if the dead thread was ours */
        env->me_flags |= MDBX_FATAL_ERROR;
        env->me_txn = NULL;
        rc = MDBX_PANIC;
      }
    }
    WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'),
            (rc ? "this process' env is hosed" : "recovering"));

    int check_rc = cleanup_dead_readers(env, rlocked, NULL);
    check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
    rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
#else
    /* mark the robust mutex consistent again, using whichever API variant
     * the platform provides */
#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent)
    int mreco_rc = pthread_mutex_consistent(ipc);
#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np)
    int mreco_rc = pthread_mutex_consistent_np(ipc);
#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
    int mreco_rc = pthread_mutex_consistent_np(ipc);
#else
    int mreco_rc = pthread_mutex_consistent(ipc);
#endif
    check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;

    if (unlikely(mreco_rc))
      ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc));

    rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
    if (MDBX_IS_ERROR(rc))
      pthread_mutex_unlock(ipc);
#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */
    return rc;
  }
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001
  (void)ipc;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  (void)ipc;
#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX
#ifdef _MSC_VER
#pragma message("warning: TODO")
#else
#warning "TODO"
#endif
  (void)ipc;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

  ERROR("mutex (un)lock failed, %s", mdbx_strerror(err));
  if (rc != EDEADLK)
    env->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
 31374  
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
/* Works around a 32-bit Bionic pthread bug: when pthread_mutex_t stores
 * the owner tid in 16 bits, a tid wider than 0xffff would hang the mutex.
 * Returns ENOSYS to refuse operation in that case, 0 otherwise. */
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) {
  /* avoid 32-bit Bionic bug/hang with 32-pit TID */
  if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) {
    pid_t tid = gettid();
    if (unlikely(tid > 0xffff)) {
      FATAL("Raise the ENOSYS(%d) error to avoid hang due "
            "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) "
            "that don’t fit in 16 bits, see "
            "https://android.googlesource.com/platform/bionic/+/master/"
            "docs/32-bit-abi.md#is-too-small-for-large-pids",
            ENOSYS, tid, tid);
      return ENOSYS;
    }
  }
  return 0;
}
#endif /* __ANDROID_API__ || ANDROID) || BIONIC */
 31393  
/* Acquires the given IPC lock.
 * With `dont_wait` set, a contended lock yields MDBX_BUSY instead of
 * blocking. Any other failure is routed through mdbx_ipclock_failed()
 * (which performs EOWNERDEAD recovery where supported). */
static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc,
                             const bool dont_wait) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                  \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  int rc = osal_check_tid4bionic();
  if (likely(rc == 0))
    rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc);
  rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  int rc = MDBX_SUCCESS;
  if (dont_wait) {
    if (sem_trywait(ipc)) {
      rc = errno;
      if (rc == EAGAIN)
        rc = MDBX_BUSY;
    }
  } else if (sem_wait(ipc))
    rc = errno;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
  /* semaphore #0 is the write-lock, #1 the read-lock; SEM_UNDO lets the
   * kernel roll the operation back if this process dies */
  struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
                      .sem_op = -1,
                      .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO};
  int rc;
  if (semop(env->me_sysv_ipc.semid, &op, 1)) {
    rc = errno;
    if (dont_wait && rc == EAGAIN)
      rc = MDBX_BUSY;
  } else {
    /* a non-zero stored pid means the previous owner died while holding
     * the lock — report EOWNERDEAD for recovery */
    rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS;
    *ipc = env->me_pid;
  }
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

  if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY))
    rc = mdbx_ipclock_failed(env, ipc, rc);
  return rc;
}
 31433  
/* Releases the given IPC lock; returns MDBX_SUCCESS or an errno value
 * (EPERM when the SysV variant detects the lock isn't owned by us). */
static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                  \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  int rc = pthread_mutex_unlock(ipc);
  (void)env;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  int rc = sem_post(ipc) ? errno : MDBX_SUCCESS;
  (void)env;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
  /* ownership check: the stored pid must be ours (set by the lock path) */
  if (unlikely(*ipc != (pid_t)env->me_pid))
    return EPERM;
  *ipc = 0;
  struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
                      .sem_op = 1,
                      .sem_flg = SEM_UNDO};
  int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */
  return rc;
}
 31455  
 31456  MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
 31457    TRACE("%s", ">>");
 31458    jitter4testing(true);
 31459    int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false);
 31460    TRACE("<< rc %d", rc);
 31461    return rc;
 31462  }
 31463  
 31464  MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
 31465    TRACE("%s", ">>");
 31466    int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock);
 31467    TRACE("<< rc %d", rc);
 31468    if (unlikely(rc != MDBX_SUCCESS))
 31469      mdbx_panic("%s() failed: err %d\n", __func__, rc);
 31470    jitter4testing(true);
 31471  }
 31472  
 31473  int mdbx_txn_lock(MDBX_env *env, bool dont_wait) {
 31474    TRACE("%swait %s", dont_wait ? "dont-" : "", ">>");
 31475    jitter4testing(true);
 31476    int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait);
 31477    TRACE("<< rc %d", rc);
 31478    return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS;
 31479  }
 31480  
 31481  void mdbx_txn_unlock(MDBX_env *env) {
 31482    TRACE("%s", ">>");
 31483    int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock);
 31484    TRACE("<< rc %d", rc);
 31485    if (unlikely(rc != MDBX_SUCCESS))
 31486      mdbx_panic("%s() failed: err %d\n", __func__, rc);
 31487    jitter4testing(true);
 31488  }
 31489  
 31490  #else
 31491  #ifdef _MSC_VER
 31492  #pragma warning(disable : 4206) /* nonstandard extension used: translation     \
 31493                                     unit is empty */
 31494  #endif                          /* _MSC_VER (warnings) */
 31495  #endif                          /* !Windows LCK-implementation */