/*
 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>. */

#define xMDBX_ALLOY 1
#define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif

#define LIBMDBX_INTERNALS
#ifdef xMDBX_TOOLS
#define MDBX_DEPRECATED
#endif /* xMDBX_TOOLS */

#ifdef xMDBX_ALLOY
/* Amalgamated build */
#define MDBX_INTERNAL_FUNC static
#define MDBX_INTERNAL_VAR static
#else
/* Non-amalgamated build */
#define MDBX_INTERNAL_FUNC
#define MDBX_INTERNAL_VAR extern
#endif /* xMDBX_ALLOY */

/*----------------------------------------------------------------------------*/

/** Disables using GNU/Linux libc extensions.
 * \ingroup build_option
 * \note This option can't be moved to options.h since the dependent
 * control macros must be defined before options.h is included. */
#ifndef MDBX_DISABLE_GNU_SOURCE
#define MDBX_DISABLE_GNU_SOURCE 0
#endif
#if MDBX_DISABLE_GNU_SOURCE
#undef _GNU_SOURCE
#elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
#define _GNU_SOURCE
#endif /* MDBX_DISABLE_GNU_SOURCE */

/* Should be defined before any includes */
#if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) && \
    !defined(ANDROID)
#define _FILE_OFFSET_BITS 64
#endif

#ifdef __APPLE__
#define _DARWIN_C_SOURCE
#endif

#ifdef _MSC_VER
#if _MSC_FULL_VER < 190024234
/* libmdbx was actually not tested with compilers older than 19.00.24234
 * (Visual Studio 2015 Update 3). You could remove this #error and try to
 * continue at your own risk; in that case please don't raise issues that
 * are related ONLY to old compilers.
 *
 * NOTE:
 * Unfortunately, there are several different builds of "Visual Studio"
 * that are all called "Visual Studio 2015 Update 3".
 *
 * 190024234 is used here because it is the minimal version of Visual
 * Studio that was used for building and testing libmdbx in recent years.
 * Soon this value will be increased to 19.0.24241.7, since building and
 * testing with "Visual Studio 2015" will be performed only at
 * https://ci.appveyor.com.
 *
 * Please ask Microsoft (but not us) for information about version
 * differences, and how and where you can obtain the latest "Visual Studio
 * 2015" build with all fixes.
 */
#error \
    "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
#endif
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif /* _CRT_SECURE_NO_WARNINGS */
#if _MSC_VER > 1800
#pragma warning(disable : 4464) /* relative include path contains '..' */
#endif
#if _MSC_VER > 1913
#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... */
#endif
#if _MSC_VER > 1914
#pragma warning(disable : 5105) /* winbase.h(9531): warning C5105: macro \
                                   expansion producing 'defined' has \
                                   undefined behavior */
#endif
#pragma warning(disable : 4710) /* 'xyz': function not inlined */
#pragma warning(disable : 4711) /* function 'xyz' selected for automatic \
                                   inline expansion */
#pragma warning(disable : 4201) /* nonstandard extension used: nameless \
                                   struct/union */
#pragma warning(disable : 4702) /* unreachable code */
#pragma warning(disable : 4706) /* assignment within conditional expression */
#pragma warning(disable : 4127) /* conditional expression is constant */
#pragma warning(disable : 4324) /* 'xyz': structure was padded due to \
                                   alignment specifier */
#pragma warning(disable : 4310) /* cast truncates constant value */
#pragma warning(disable : 4820) /* bytes padding added after data member for \
                                   alignment */
#pragma warning(disable : 4548) /* expression before comma has no effect; \
                                   expected expression with side-effect */
#pragma warning(disable : 4366) /* the result of the unary '&' operator may \
                                   be unaligned */
#pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \
                                   array in struct/union */
#pragma warning(disable : 4204) /* nonstandard extension used: non-constant \
                                   aggregate initializer */
#pragma warning(disable : 4505) /* unreferenced local function has been \
                                   removed */
#endif /* _MSC_VER (warnings) */

#if defined(__GNUC__) && __GNUC__ < 9
#pragma GCC diagnostic ignored "-Wattributes"
#endif /* GCC < 9 */

#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \
    !defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* __USE_MINGW_ANSI_STDIO */

#include "mdbx.h"
/*
 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */

/*----------------------------------------------------------------------------*/
/* Microsoft compiler generates a lot of warnings for self-includes... */

#ifdef _MSC_VER
#pragma warning(push, 1)
#pragma warning(disable : 4548) /* expression before comma has no effect; \
                                   expected expression with side-effect */
#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \
                                 * semantics are not enabled. Specify /EHsc */
#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \
                                 * mode specified; termination on exception is \
                                 * not guaranteed. Specify /EHsc */
#endif /* _MSC_VER (warnings) */

#if defined(_WIN32) || defined(_WIN64)
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif /* _CRT_SECURE_NO_WARNINGS */
#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
    !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
#define _NO_CRT_STDIO_INLINE
#endif
#elif !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 200809L
#endif /* Windows */

/*----------------------------------------------------------------------------*/
/* basic C99 includes */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#include <assert.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
#error \
    "Sanity checking failed: Two's complement, reasonably sized integer types"
#endif

#ifndef SSIZE_MAX
#define SSIZE_MAX INTPTR_MAX
#endif

#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */

/*----------------------------------------------------------------------------*/
/* feature testing */

#ifndef __has_warning
#define __has_warning(x) (0)
#endif

#ifndef __has_include
#define __has_include(x) (0)
#endif

#ifndef __has_feature
#define __has_feature(x) (0)
#endif

#ifndef __has_extension
#define __has_extension(x) (0)
#endif

#if __has_feature(thread_sanitizer)
#define __SANITIZE_THREAD__ 1
#endif

#if __has_feature(address_sanitizer)
#define __SANITIZE_ADDRESS__ 1
#endif

#ifndef __GNUC_PREREQ
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
#define __GNUC_PREREQ(maj, min) \
  ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __GNUC_PREREQ(maj, min) (0)
#endif
#endif /* __GNUC_PREREQ */

#ifndef __CLANG_PREREQ
#ifdef __clang__
#define __CLANG_PREREQ(maj, min) \
  ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
#else
#define __CLANG_PREREQ(maj, min) (0)
#endif
#endif /* __CLANG_PREREQ */

#ifndef __GLIBC_PREREQ
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
#define __GLIBC_PREREQ(maj, min) \
  ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __GLIBC_PREREQ(maj, min) (0)
#endif
#endif /* __GLIBC_PREREQ */

/*----------------------------------------------------------------------------*/
/* C11 alignas() */

#if __has_include(<stdalign.h>)
#include <stdalign.h>
#endif
#if defined(alignas) || defined(__cplusplus)
#define MDBX_ALIGNAS(N) alignas(N)
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#define MDBX_ALIGNAS(N) _Alignas(N)
#elif defined(_MSC_VER)
#define MDBX_ALIGNAS(N) __declspec(align(N))
#elif __has_attribute(__aligned__) || defined(__GNUC__)
#define MDBX_ALIGNAS(N) __attribute__((__aligned__(N)))
#else
#error "FIXME: Required alignas() or equivalent."
#endif /* MDBX_ALIGNAS */
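
/* Illustrative sketch, not part of libmdbx: whichever branch above was
 * chosen, MDBX_ALIGNAS() has the same effect, e.g. a 64-byte aligned
 * (cache-line sized, on common CPUs) structure. Kept under #if 0 so it
 * stays out of the build. */
#if 0
typedef struct {
  MDBX_ALIGNAS(64) unsigned counter; /* placed on a 64-byte boundary */
} example_cacheline_t;
#endif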

/*----------------------------------------------------------------------------*/
/* Systems macros and includes */

#ifndef __extern_C
#ifdef __cplusplus
#define __extern_C extern "C"
#else
#define __extern_C
#endif
#endif /* __extern_C */

#if !defined(nullptr) && !defined(__cplusplus) || \
    (__cplusplus < 201103L && !defined(_MSC_VER))
#define nullptr NULL
#endif

#if defined(__APPLE__) || defined(_DARWIN_C_SOURCE)
#include <AvailabilityMacros.h>
#include <TargetConditionals.h>
#ifndef MAC_OS_X_VERSION_MIN_REQUIRED
#define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */
#endif
#endif /* Apple OSX & iOS */

#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
    defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \
    defined(__APPLE__) || defined(__MACH__)
#include <sys/cdefs.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#if defined(__FreeBSD__) || defined(__DragonFly__)
#include <vm/vm_param.h>
#elif defined(__OpenBSD__) || defined(__NetBSD__)
#include <uvm/uvm_param.h>
#else
#define SYSCTL_LEGACY_NONCONST_MIB
#endif
#ifndef __MACH__
#include <sys/vmmeter.h>
#endif
#else
#include <malloc.h>
#if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) || \
      defined(_WIN32) || defined(_WIN64))
#include <mntent.h>
#endif /* !Solaris */
#endif /* !xBSD */

#if defined(__FreeBSD__) || __has_include(<malloc_np.h>)
#include <malloc_np.h>
#endif

#if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>)
#include <malloc/malloc.h>
#endif /* MacOS */

#if defined(__MACH__)
#include <mach/host_info.h>
#include <mach/mach_host.h>
#include <mach/mach_port.h>
#include <uuid/uuid.h>
#endif

#if defined(__linux__) || defined(__gnu_linux__)
#include <sched.h>
#include <sys/sendfile.h>
#include <sys/statfs.h>
#endif /* Linux */

#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 0
#endif

#ifndef _XOPEN_SOURCE_EXTENDED
#define _XOPEN_SOURCE_EXTENDED 0
#else
#include <utmpx.h>
#endif /* _XOPEN_SOURCE_EXTENDED */

#if defined(__sun) || defined(__SVR4) || defined(__svr4__)
#include <kstat.h>
#include <sys/mnttab.h>
/* On Solaris, it's easier to add a missing prototype rather than find a
 * combination of #defines that break nothing. */
__extern_C key_t ftok(const char *, int);
#endif /* SunOS/Solaris */

#if defined(_WIN32) || defined(_WIN64) /*-------------------------------------*/

#ifndef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 /* Windows 7 */
#elif _WIN32_WINNT < 0x0500
#error At least 'Windows 2000' API is required for libmdbx.
#endif /* _WIN32_WINNT */
#if (defined(__MINGW32__) || defined(__MINGW64__)) && \
    !defined(__USE_MINGW_ANSI_STDIO)
#define __USE_MINGW_ANSI_STDIO 1
#endif /* MinGW */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif /* WIN32_LEAN_AND_MEAN */
#include <excpt.h>
#include <tlhelp32.h>
#include <windows.h>
#include <winnt.h>
#include <winternl.h>

#else /*----------------------------------------------------------------------*/

#include <unistd.h>
#if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1
#error "libmdbx requires the _POSIX_MAPPED_FILES feature"
#endif /* _POSIX_MAPPED_FILES */

#include <pthread.h>
#include <semaphore.h>
#include <signal.h>
#include <sys/file.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/uio.h>

#endif /*---------------------------------------------------------------------*/

#if defined(__ANDROID_API__) || defined(ANDROID)
#include <android/log.h>
#if __ANDROID_API__ >= 21
#include <sys/sendfile.h>
#endif
#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS != MDBX_WORDBITS
#error "_FILE_OFFSET_BITS != MDBX_WORDBITS" (_FILE_OFFSET_BITS != MDBX_WORDBITS)
#elif defined(__FILE_OFFSET_BITS) && __FILE_OFFSET_BITS != MDBX_WORDBITS
#error "__FILE_OFFSET_BITS != MDBX_WORDBITS" (__FILE_OFFSET_BITS != MDBX_WORDBITS)
#endif
#endif /* Android */

#if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>)
#include <sys/stat.h>
#endif
#if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>)
#include <sys/types.h>
#endif
#if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>)
#include <sys/file.h>
#endif

/*----------------------------------------------------------------------------*/
/* Byteorder */

#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
    defined(i486) || defined(__i486) || defined(__i486__) || \
    defined(i586) || defined(__i586) || defined(__i586__) || defined(i686) || \
    defined(__i686) || defined(__i686__) || defined(_M_IX86) || \
    defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \
    defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \
    defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \
    defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__)
#ifndef __ia32__
/* LY: define neutral __ia32__ for x86 and x86-64 */
#define __ia32__ 1
#endif /* __ia32__ */
#if !defined(__amd64__) && \
    (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \
     defined(_M_X64) || defined(_M_AMD64))
/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
#define __amd64__ 1
#endif /* __amd64__ */
#endif /* all x86 */

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
    !defined(__ORDER_BIG_ENDIAN__)

#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || \
    defined(__ANDROID_API__) || defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
#include <endian.h>
#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \
    defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
#include <machine/endian.h>
#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
#include <sys/isa_defs.h>
#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \
    (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
#include <sys/endian.h>
#include <sys/types.h>
#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \
    defined(__NetBSD__) || defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
#include <sys/param.h>
#endif /* OS */

#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
#define __BYTE_ORDER__ __BYTE_ORDER
#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
#define __BYTE_ORDER__ _BYTE_ORDER
#else
#define __ORDER_LITTLE_ENDIAN__ 1234
#define __ORDER_BIG_ENDIAN__ 4321

#if defined(__LITTLE_ENDIAN__) || \
    (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \
    defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \
    defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \
    defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \
    defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \
    defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \
    defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \
    defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \
    defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \
    defined(__WINDOWS__)
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__

#elif defined(__BIG_ENDIAN__) || \
    (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \
    defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \
    defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \
    defined(__m68k__) || defined(M68000) || defined(__hppa__) || \
    defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \
    defined(__sparc) || defined(__370__) || defined(__THW_370__) || \
    defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__

#else
#error __BYTE_ORDER__ should be defined.
#endif /* Arch */

#endif
#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */
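
/* Illustrative sketch, not part of libmdbx: after the probing above, code
 * can branch on endianness uniformly at compile time, e.g. converting a
 * 32-bit value to little-endian on-disk order (__builtin_bswap32() here
 * assumes GCC/Clang). Kept under #if 0 so it stays out of the build. */
#if 0
static unsigned example_to_le32(unsigned v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return v; /* host order is already little-endian */
#else
  return __builtin_bswap32(v); /* swap bytes on big-endian hosts */
#endif
}
#endif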

/*----------------------------------------------------------------------------*/
/* Availability of CMOV or equivalent */

#ifndef MDBX_HAVE_CMOV
#if defined(__e2k__)
#define MDBX_HAVE_CMOV 1
#elif defined(__thumb2__) || defined(__thumb2)
#define MDBX_HAVE_CMOV 1
#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB)
#define MDBX_HAVE_CMOV 0
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \
    defined(__aarch64) || defined(__arm__) || defined(__arm) || \
    defined(__CC_ARM)
#define MDBX_HAVE_CMOV 1
#elif (defined(__riscv__) || defined(__riscv64)) && \
    (defined(__riscv_b) || defined(__riscv_bitmanip))
#define MDBX_HAVE_CMOV 1
#elif defined(i686) || defined(__i686) || defined(__i686__) || \
    (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \
    defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \
    defined(_M_X64) || defined(_M_AMD64)
#define MDBX_HAVE_CMOV 1
#else
#define MDBX_HAVE_CMOV 0
#endif
#endif /* MDBX_HAVE_CMOV */
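
/* Illustrative sketch, not part of libmdbx: MDBX_HAVE_CMOV marks targets
 * where a conditional move is cheap, so hot code can prefer a branchless
 * select there and an explicit branch elsewhere. Kept under #if 0. */
#if 0
static unsigned example_select(int cond, unsigned a, unsigned b) {
#if MDBX_HAVE_CMOV
  return cond ? a : b; /* the compiler may emit a cmov-like instruction */
#else
  if (cond)            /* a plain branch on cores without cheap cmov */
    return a;
  return b;
#endif
}
#endif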

/*----------------------------------------------------------------------------*/
/* Compiler's includes for builtins/intrinsics */

#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#include <intrin.h>
#elif __GNUC_PREREQ(4, 4) || defined(__clang__)
#if defined(__e2k__)
#include <e2kintrin.h>
#include <x86intrin.h>
#endif /* __e2k__ */
#if defined(__ia32__)
#include <cpuid.h>
#include <x86intrin.h>
#endif /* __ia32__ */
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
#include <mbarrier.h>
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
    (defined(HP_IA64) || defined(__ia64))
#include <machine/sys/inline.h>
#elif defined(__IBMC__) && defined(__powerpc)
#include <atomic.h>
#elif defined(_AIX)
#include <builtins.h>
#include <sys/atomic_op.h>
#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha)
#include <c_asm.h>
#include <machine/builtins.h>
#elif defined(__MWERKS__)
/* CodeWarrior - troubles ? */
#pragma gcc_extensions
#elif defined(__SNC__)
/* Sony PS3 - troubles ? */
#elif defined(__hppa__) || defined(__hppa)
#include <machine/inline.h>
#else
#error Unsupported C compiler, please use GNU C 4.4 or newer
#endif /* Compiler */

#if !defined(__noop) && !defined(_MSC_VER)
#define __noop \
  do { \
  } while (0)
#endif /* __noop */

#if defined(__fallthrough) && \
    (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__))
#undef __fallthrough
#endif /* __fallthrough workaround for MinGW */

#ifndef __fallthrough
#if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) && \
    (!defined(__clang__) || __clang__ > 4)) || \
    __cplusplus >= 201703L
#define __fallthrough [[fallthrough]]
#elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L
#define __fallthrough [[fallthrough]]
#elif __GNUC_PREREQ(7, 0) && \
    (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) || \
     (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126))
#define __fallthrough __attribute__((__fallthrough__))
#elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L && \
    __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
#define __fallthrough [[clang::fallthrough]]
#else
#define __fallthrough
#endif
#endif /* __fallthrough */

#ifndef __unreachable
#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable)
#define __unreachable() __builtin_unreachable()
#elif defined(_MSC_VER)
#define __unreachable() __assume(0)
#else
#define __unreachable() \
  do { \
  } while (1)
#endif
#endif /* __unreachable */

#ifndef __prefetch
#if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
#define __prefetch(ptr) __builtin_prefetch(ptr)
#else
#define __prefetch(ptr) \
  do { \
    (void)(ptr); \
  } while (0)
#endif
#endif /* __prefetch */

#ifndef offsetof
#define offsetof(type, member) __builtin_offsetof(type, member)
#endif /* offsetof */

#ifndef container_of
#define container_of(ptr, type, member) \
  ((type *)((char *)(ptr) - offsetof(type, member)))
#endif /* container_of */
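
/* Illustrative sketch, not part of libmdbx: container_of() recovers the
 * enclosing structure from a pointer to one of its members, the classic
 * intrusive-container idiom. Kept under #if 0. */
#if 0
struct example_link {
  struct example_link *next;
};
struct example_node {
  int value;
  struct example_link link; /* embedded in the owner */
};
static struct example_node *example_owner(struct example_link *l) {
  return container_of(l, struct example_node, link);
}
#endif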

/*----------------------------------------------------------------------------*/

#ifndef __always_inline
#if defined(__GNUC__) || __has_attribute(__always_inline__)
#define __always_inline __inline __attribute__((__always_inline__))
#elif defined(_MSC_VER)
#define __always_inline __forceinline
#else
#define __always_inline
#endif
#endif /* __always_inline */

#ifndef __noinline
#if defined(__GNUC__) || __has_attribute(__noinline__)
#define __noinline __attribute__((__noinline__))
#elif defined(_MSC_VER)
#define __noinline __declspec(noinline)
#else
#define __noinline
#endif
#endif /* __noinline */

#ifndef __must_check_result
#if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
#define __must_check_result __attribute__((__warn_unused_result__))
#else
#define __must_check_result
#endif
#endif /* __must_check_result */

#ifndef __nothrow
#if defined(__cplusplus)
#if __cplusplus < 201703L
#define __nothrow throw()
#else
#define __nothrow noexcept(true)
#endif /* __cplusplus */
#elif defined(__GNUC__) || __has_attribute(__nothrow__)
#define __nothrow __attribute__((__nothrow__))
#elif defined(_MSC_VER) && defined(__cplusplus)
#define __nothrow __declspec(nothrow)
#else
#define __nothrow
#endif
#endif /* __nothrow */

#ifndef __hidden
#if defined(__GNUC__) || __has_attribute(__visibility__)
#define __hidden __attribute__((__visibility__("hidden")))
#else
#define __hidden
#endif
#endif /* __hidden */

#ifndef __optimize
#if defined(__OPTIMIZE__)
#if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
#define __optimize(ops) __attribute__((__optimize__(ops)))
#else
#define __optimize(ops)
#endif
#else
#define __optimize(ops)
#endif
#endif /* __optimize */

#ifndef __hot
#if defined(__OPTIMIZE__)
#if defined(__e2k__)
#define __hot __attribute__((__hot__)) __optimize(3)
#elif defined(__clang__) && !__has_attribute(__hot__) && \
    __has_attribute(__section__) && \
    (defined(__linux__) || defined(__gnu_linux__))
/* just put frequently used functions in a separate section */
#define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
#elif defined(__LCC__)
#define __hot __attribute__((__hot__, __optimize__("Ofast,O4")))
#elif defined(__GNUC__) || __has_attribute(__hot__)
#define __hot __attribute__((__hot__)) __optimize("O3")
#else
#define __hot __optimize("O3")
#endif
#else
#define __hot
#endif
#endif /* __hot */

#ifndef __cold
#if defined(__OPTIMIZE__)
#if defined(__e2k__)
#define __cold __attribute__((__cold__)) __optimize(1)
#elif defined(__clang__) && !__has_attribute(cold) && \
    __has_attribute(__section__) && \
    (defined(__linux__) || defined(__gnu_linux__))
/* just put infrequently used functions in a separate section */
#define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
#elif defined(__LCC__)
#define __cold __attribute__((__cold__, __optimize__("Osize")))
#elif defined(__GNUC__) || __has_attribute(cold)
#define __cold __attribute__((__cold__)) __optimize("Os")
#else
#define __cold __optimize("Os")
#endif
#else
#define __cold
#endif
#endif /* __cold */

#ifndef __flatten
#if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
#define __flatten __attribute__((__flatten__))
#else
#define __flatten
#endif
#endif /* __flatten */

#ifndef likely
#if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && \
    !defined(__COVERITY__)
#define likely(cond) __builtin_expect(!!(cond), 1)
#else
#define likely(x) (!!(x))
#endif
#endif /* likely */

#ifndef unlikely
#if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && \
    !defined(__COVERITY__)
#define unlikely(cond) __builtin_expect(!!(cond), 0)
#else
#define unlikely(x) (!!(x))
#endif
#endif /* unlikely */

#ifndef __anonymous_struct_extension__
#if defined(__GNUC__)
#define __anonymous_struct_extension__ __extension__
#else
#define __anonymous_struct_extension__
#endif
#endif /* __anonymous_struct_extension__ */

#ifndef expect_with_probability
#if defined(__builtin_expect_with_probability) || \
    __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0)
#define expect_with_probability(expr, value, prob) \
  __builtin_expect_with_probability(expr, value, prob)
#else
#define expect_with_probability(expr, value, prob) (expr)
#endif
#endif /* expect_with_probability */
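
/* Illustrative sketch, not part of libmdbx: likely()/unlikely() are meant
 * for hot paths where one outcome dominates, keeping the rare path out of
 * the straight-line code the compiler lays out. Kept under #if 0. */
#if 0
static int example_check(int err) {
  if (unlikely(err != 0))
    return err; /* rare error path */
  return 0;     /* common fast path */
}
#endif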

#ifndef MDBX_WEAK_IMPORT_ATTRIBUTE
#ifdef WEAK_IMPORT_ATTRIBUTE
#define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE
#elif __has_attribute(__weak__) && __has_attribute(__weak_import__)
#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__))
#elif __has_attribute(__weak__) || \
    (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__))
#define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__))
#else
#define MDBX_WEAK_IMPORT_ATTRIBUTE
#endif
#endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */

/*----------------------------------------------------------------------------*/

#if defined(MDBX_USE_VALGRIND)
#include <valgrind/memcheck.h>
#ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
/* LY: available since Valgrind 3.10 */
#define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
#define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
#endif
#elif !defined(RUNNING_ON_VALGRIND)
#define VALGRIND_CREATE_MEMPOOL(h, r, z)
#define VALGRIND_DESTROY_MEMPOOL(h)
#define VALGRIND_MEMPOOL_TRIM(h, a, s)
#define VALGRIND_MEMPOOL_ALLOC(h, a, s)
#define VALGRIND_MEMPOOL_FREE(h, a)
#define VALGRIND_MEMPOOL_CHANGE(h, a, b, s)
#define VALGRIND_MAKE_MEM_NOACCESS(a, s)
#define VALGRIND_MAKE_MEM_DEFINED(a, s)
#define VALGRIND_MAKE_MEM_UNDEFINED(a, s)
#define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
#define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s)
#define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0)
#define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0)
#define RUNNING_ON_VALGRIND (0)
#endif /* MDBX_USE_VALGRIND */

#ifdef __SANITIZE_ADDRESS__
#include <sanitizer/asan_interface.h>
#elif !defined(ASAN_POISON_MEMORY_REGION)
#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
#endif /* __SANITIZE_ADDRESS__ */

/*----------------------------------------------------------------------------*/

#ifndef ARRAY_LENGTH
#ifdef __cplusplus
template <typename T, size_t N> char (&__ArraySizeHelper(T (&array)[N]))[N];
#define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
#else
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
#endif
#endif /* ARRAY_LENGTH */

#ifndef ARRAY_END
#define ARRAY_END(array) (&array[ARRAY_LENGTH(array)])
#endif /* ARRAY_END */

#define CONCAT(a, b) a##b
#define XCONCAT(a, b) CONCAT(a, b)

#define MDBX_TETRAD(a, b, c, d) \
  ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d))

#define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3])

#define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__)
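
/* Worked example (illustrative, not part of libmdbx): MDBX_TETRAD packs
 * four bytes into one big-endian 32-bit tag, so
 *   MDBX_STRING_TETRAD("MDBX") == 0x4D444258
 * ('M' = 0x4D, 'D' = 0x44, 'B' = 0x42, 'X' = 0x58). */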

#ifndef STATIC_ASSERT_MSG
#if defined(static_assert)
#define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
#elif defined(_STATIC_ASSERT)
#define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
#elif defined(_MSC_VER)
#include <crtdbg.h>
#define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
#elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
    __has_feature(c_static_assert)
#define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
#else
#define STATIC_ASSERT_MSG(expr, msg) \
  switch (0) { \
  case 0: \
  case (expr):; \
  }
#endif
#endif /* STATIC_ASSERT */

#ifndef STATIC_ASSERT
#define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
#endif

#ifndef __Wpedantic_format_voidptr
MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void *
__Wpedantic_format_voidptr(const void *ptr) {
  return ptr;
}
#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */

#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
/* libmdbx was actually not tested with compilers older than GCC 4.2.
 * You could ignore this warning at your own risk; in that case please
 * don't raise issues that are related ONLY to old compilers. */
#warning "libmdbx requires GCC >= 4.2"
#endif

#if defined(__clang__) && !__CLANG_PREREQ(3, 8)
/* libmdbx was actually not tested with CLANG older than 3.8.
 * You could ignore this warning at your own risk; in that case please
 * don't raise issues that are related ONLY to old compilers. */
#warning "libmdbx requires CLANG >= 3.8"
#endif

#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
/* libmdbx was actually not tested with anything older than glibc 2.12.
 * You could ignore this warning at your own risk; in that case please
 * don't raise issues that are related ONLY to old systems. */
#warning "libmdbx was only tested with GLIBC >= 2.12."
#endif

#ifdef __SANITIZE_THREAD__
#warning \
    "libmdbx is not compatible with ThreadSanitizer, you will get a lot of false-positive issues."
#endif /* __SANITIZE_THREAD__ */

#if __has_warning("-Wnested-anon-types")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wnested-anon-types"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wnested-anon-types"
#else
#pragma warning disable "nested-anon-types"
#endif
#endif /* -Wnested-anon-types */

#if __has_warning("-Wconstant-logical-operand")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconstant-logical-operand"
#else
#pragma warning disable "constant-logical-operand"
#endif
#endif /* -Wconstant-logical-operand */

#if defined(__LCC__) && (__LCC__ <= 121)
/* bug #2798 */
#pragma diag_suppress alignment_reduction_ignored
#elif defined(__ICC)
#pragma warning(disable : 3453 1366)
#elif __has_warning("-Walignment-reduction-ignored")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Walignment-reduction-ignored"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
#else
#pragma warning disable "alignment-reduction-ignored"
#endif
#endif /* -Walignment-reduction-ignored */

#ifndef MDBX_EXCLUDE_FOR_GPROF
#ifdef ENABLE_GPROF
#define MDBX_EXCLUDE_FOR_GPROF \
  __attribute__((__no_instrument_function__, \
                 __no_profile_instrument_function__))
#else
#define MDBX_EXCLUDE_FOR_GPROF
#endif /* ENABLE_GPROF */
#endif /* MDBX_EXCLUDE_FOR_GPROF */

#ifdef __cplusplus
extern "C" {
#endif

/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */

/*
 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */

/*----------------------------------------------------------------------------*/
/* C11 Atomics */

#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) && \
    (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
    !defined(__STDC_NO_ATOMICS__) && \
    (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
     !(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
                                   'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
                                   'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
                                   'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif

/*----------------------------------------------------------------------------*/
/* Memory/Compiler barriers, cache coherence */

#if __has_include(<sys/cachectl.h>)
#include <sys/cachectl.h>
#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \
    defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \
    defined(__MWERKS__) || defined(__sgi)
/* MIPS should have explicit cache control */
#include <sys/cachectl.h>
#endif

MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) {
#if defined(__clang__) || defined(__GNUC__)
  __asm__ __volatile__("" ::: "memory");
#elif defined(_MSC_VER)
  _ReadWriteBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
  __memory_barrier();
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __compiler_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 0x3D3D */);
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \
    defined(__ppc64__) || defined(__powerpc64__)
  __fence();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}
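
/* Illustrative sketch, not part of libmdbx: osal_compiler_barrier() only
 * stops the *compiler* from reordering memory accesses across it, while
 * osal_memory_barrier() below also orders the *CPU*. A publish/consume
 * handshake between threads therefore needs the latter:
 *
 *   payload = 42;           // store the data
 *   osal_memory_barrier();  // make the data visible before the flag
 *   ready = 1;              // publish the flag
 */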

MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) {
#ifdef MDBX_HAVE_C11ATOMICS
  atomic_thread_fence(memory_order_seq_cst);
#elif defined(__ATOMIC_SEQ_CST)
#ifdef __clang__
  __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
#elif defined(__clang__) || defined(__GNUC__)
  __sync_synchronize();
#elif defined(_WIN32) || defined(_WIN64)
  MemoryBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
#if defined(__ia32__)
  _mm_mfence();
#else
  __mf();
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __machine_rw_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_mf();
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \
    defined(__ppc64__) || defined(__powerpc64__)
  __lwsync();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}

/*----------------------------------------------------------------------------*/
/* system-dependent definitions */

#if defined(_WIN32) || defined(_WIN64)
#define HAVE_SYS_STAT_H
#define HAVE_SYS_TYPES_H
typedef HANDLE osal_thread_t;
typedef unsigned osal_thread_key_t;
#define MAP_FAILED NULL
#define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
#define THREAD_CALL WINAPI
#define THREAD_RESULT DWORD
typedef struct {
  HANDLE mutex;
  HANDLE event[2];
} osal_condpair_t;
typedef CRITICAL_SECTION osal_fastmutex_t;

#if !defined(_MSC_VER) && !defined(__try)
#define __try
#define __except(COND) if (false)
#endif /* stub for MSVC's __try/__except */

#if MDBX_WITHOUT_MSVC_CRT

#ifndef osal_malloc
static inline void *osal_malloc(size_t bytes) {
  return HeapAlloc(GetProcessHeap(), 0, bytes);
}
#endif /* osal_malloc */

#ifndef osal_calloc
static inline void *osal_calloc(size_t nelem, size_t size) {
  return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size);
}
#endif /* osal_calloc */

#ifndef osal_realloc
static inline void *osal_realloc(void *ptr, size_t bytes) {
  return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes)
             : HeapAlloc(GetProcessHeap(), 0, bytes);
}
#endif /* osal_realloc */

#ifndef osal_free
static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
#endif /* osal_free */

#else /* MDBX_WITHOUT_MSVC_CRT */

#define osal_malloc malloc
#define osal_calloc calloc
#define osal_realloc realloc
#define osal_free free
#define osal_strdup _strdup

#endif /* MDBX_WITHOUT_MSVC_CRT */

#ifndef snprintf
#define snprintf _snprintf /* ntdll */
#endif

#ifndef vsnprintf
#define vsnprintf _vsnprintf /* ntdll */
#endif

MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
                                    size_t src_n);

#else /*----------------------------------------------------------------------*/

typedef pthread_t osal_thread_t;
typedef pthread_key_t osal_thread_key_t;
#define INVALID_HANDLE_VALUE (-1)
#define THREAD_CALL
#define THREAD_RESULT void *
typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond[2];
} osal_condpair_t;
typedef pthread_mutex_t osal_fastmutex_t;
#define osal_malloc malloc
#define osal_calloc calloc
#define osal_realloc realloc
#define osal_free free
#define osal_strdup strdup
#endif /* Platform */

#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
/* malloc_usable_size() already provided */
#elif defined(__APPLE__)
#define malloc_usable_size(ptr) malloc_size(ptr)
#elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT
#define malloc_usable_size(ptr) _msize(ptr)
#endif /* malloc_usable_size */

/*----------------------------------------------------------------------------*/
/* OS abstraction layer stuff */

/* Get the size of a memory page for the system.
 * This is the basic size that the platform's memory manager uses, and is
 * fundamental to the use of memory-mapped files. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
osal_syspagesize(void) {
#if defined(_WIN32) || defined(_WIN64)
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwPageSize;
#else
  return sysconf(_SC_PAGE_SIZE);
#endif
}

#if defined(_WIN32) || defined(_WIN64)
typedef wchar_t pathchar_t;
#else
typedef char pathchar_t;
#endif
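
/* Illustrative sketch, not part of libmdbx: mapping offsets and sizes must
 * be multiples of the page size reported by osal_syspagesize(); since the
 * page size is a power of two, rounding up is a mask operation. Kept
 * under #if 0. */
#if 0
static size_t example_roundup_to_page(size_t bytes) {
  const size_t page = osal_syspagesize();
  return (bytes + page - 1) & ~(page - 1);
}
#endif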

typedef struct osal_mmap_param {
  union {
    void *address;
    uint8_t *dxb;
    struct MDBX_lockinfo *lck;
  };
  mdbx_filehandle_t fd;
  size_t limit;   /* mapping length, but NOT a size of file nor DB */
  size_t current; /* mapped region size, i.e. the size of file and DB */
  uint64_t filesize /* in-process cache of a file size */;
#if defined(_WIN32) || defined(_WIN64)
  HANDLE section; /* memory-mapped section handle */
#endif
} osal_mmap_t;

typedef union bin128 {
  __anonymous_struct_extension__ struct { uint64_t x, y; };
  __anonymous_struct_extension__ struct { uint32_t a, b, c, d; };
} bin128_t;

#if defined(_WIN32) || defined(_WIN64)
typedef union osal_srwlock {
  __anonymous_struct_extension__ struct {
    long volatile readerCount;
    long volatile writerCount;
  };
  RTL_SRWLOCK native;
} osal_srwlock_t;
#endif /* Windows */

#ifndef __cplusplus

/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */

#if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \
    (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
#define osal_asprintf asprintf
#define osal_vasprintf vasprintf
#else
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
    MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...);
MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap);
#endif

#if !defined(MADV_DODUMP) && defined(MADV_CORE)
#define MADV_DODUMP MADV_CORE
#endif /* MADV_CORE -> MADV_DODUMP */

#if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
#define MADV_DONTDUMP MADV_NOCORE
#endif /* MADV_NOCORE -> MADV_DONTDUMP */

MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny);

/* max bytes to write in one call */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#endif

#if defined(__linux__) || defined(__gnu_linux__)
MDBX_INTERNAL_VAR uint32_t linux_kernel_version;
MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
#endif /* Linux */

#ifndef osal_strdup
LIBMDBX_API char *osal_strdup(const char *str);
#endif

MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) {
#if defined(_WIN32) || defined(_WIN64)
  DWORD rc = GetLastError();
#else
  int rc = errno;
#endif
  return rc;
}

#ifndef osal_memalign_alloc
MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes,
                                           void **result);
#endif
#ifndef osal_memalign_free
MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr);
#endif

MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair);
MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
                                            bool part);
MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part);
MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair);

MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex);
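
/* Illustrative sketch, not part of libmdbx: osal_fastmutex_t maps to a
 * CRITICAL_SECTION on Windows and a pthread_mutex_t elsewhere, so a guarded
 * section written against this API is portable. Kept under #if 0. */
#if 0
static int example_guarded_increment(osal_fastmutex_t *mx, int *shared) {
  int rc = osal_fastmutex_acquire(mx);
  if (rc == 0 /* success */) {
    *shared += 1; /* the critical section */
    rc = osal_fastmutex_release(mx);
  }
  return rc;
}
#endif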

MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
                                    int iovcnt, uint64_t offset,
                                    size_t expected_written);
MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count,
                                  uint64_t offset);
MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
                                   size_t count, uint64_t offset);
MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
                                  size_t count);

MDBX_INTERNAL_FUNC int
osal_thread_create(osal_thread_t *thread,
                   THREAD_RESULT(THREAD_CALL *start_routine)(void *),
                   void *arg);
MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread);

enum osal_syncmode_bits {
  MDBX_SYNC_NONE = 0,
  MDBX_SYNC_DATA = 1,
  MDBX_SYNC_SIZE = 2,
  MDBX_SYNC_IODQ = 4
};

MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
                                  const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);

enum osal_openfile_purpose {
  MDBX_OPEN_DXB_READ = 0,
  MDBX_OPEN_DXB_LAZY = 1,
  MDBX_OPEN_DXB_DSYNC = 2,
  MDBX_OPEN_LCK = 3,
  MDBX_OPEN_COPY = 4,
  MDBX_OPEN_DELETE = 5
};

MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose,
                                     const MDBX_env *env,
                                     const pathchar_t *pathname,
                                     mdbx_filehandle_t *fd,
                                     mdbx_mode_t unix_mode_bits);
MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname);
MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait);

#define MMAP_OPTION_TRUNCATE 1
#define MMAP_OPTION_SEMAPHORE 2
MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map,
                                 const size_t must, const size_t limit,
                                 const unsigned options);
MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map);
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
                                    size_t size, size_t limit);
#if defined(_WIN32) || defined(_WIN64)
typedef struct {
  unsigned limit, count;
  HANDLE handles[31];
} mdbx_handle_array_t;
MDBX_INTERNAL_FUNC int
osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
osal_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
                                            const pathchar_t *pathname,
                                            int err);

MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) {
  STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
#if defined(_WIN32) || defined(_WIN64)
  return GetCurrentProcessId();
#else
  STATIC_ASSERT(sizeof(pid_t) <= sizeof(uint32_t));
  return getpid();
#endif
}
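
/* Illustrative sketch, not part of libmdbx: the osal_syncmode_bits above
 * are OR-combinable flags, e.g. a flush of both the data and the file
 * size together with the MDBX_SYNC_IODQ bit. Kept under #if 0. */
#if 0
static int example_durable_flush(mdbx_filehandle_t fd) {
  return osal_fsync(fd, (enum osal_syncmode_bits)(MDBX_SYNC_DATA |
                                                  MDBX_SYNC_SIZE |
                                                  MDBX_SYNC_IODQ));
}
#endif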

MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) {
  mdbx_tid_t thunk;
  STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
#if defined(_WIN32) || defined(_WIN64)
  thunk = GetCurrentThreadId();
#else
  thunk = pthread_self();
#endif
  return (uintptr_t)thunk;
}

#if !defined(_WIN32) && !defined(_WIN64)
#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC)
MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void);
#else
static __inline int osal_check_tid4bionic(void) { return 0; }
#endif /* __ANDROID_API__ || ANDROID || BIONIC */

MDBX_MAYBE_UNUSED static __inline int
osal_pthread_mutex_lock(pthread_mutex_t *mutex) {
  int err = osal_check_tid4bionic();
  return unlikely(err) ? err : pthread_mutex_lock(mutex);
}
#endif /* !Windows */

MDBX_INTERNAL_FUNC uint64_t osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime);

MDBX_INTERNAL_FUNC bin128_t osal_bootid(void);
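
/* Worked example (illustrative, not part of libmdbx): the 16dot16 helpers
 * above use 16.16 fixed point, i.e. the value is seconds * 65536; so
 * 1.5 seconds is represented as (1 << 16) + (1 << 15) == 98304. */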
/*----------------------------------------------------------------------------*/
/* lck stuff */

/// \brief Initialization of synchronization primitives linked with MDBX_env
///   instance both in LCK-file and within the current process.
/// \param
///   global_uniqueness_flag = true - denotes that there are no other processes
///     working with DB and LCK-file. Thus the function MUST initialize
///     shared synchronization objects in memory-mapped LCK-file.
///   global_uniqueness_flag = false - denotes that at least one process is
///     already working with DB and LCK-file, including the case when DB
///     has already been opened in the current process. Thus the function
///     MUST NOT initialize shared synchronization objects in memory-mapped
///     LCK-file that are already in use.
/// \return Error code or zero on success.
MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
                                     MDBX_env *inprocess_neighbor,
                                     int global_uniqueness_flag);

/// \brief Disconnects from shared interprocess objects and destructs
///   synchronization objects linked with MDBX_env instance
///   within the current process.
/// \param
///   inprocess_neighbor = NULL - if the current process does not have other
///     instances of MDBX_env linked with the DB being closed.
///     Thus the function MUST check for other processes working with DB or
///     LCK-file, and keep or destroy shared synchronization objects in
///     memory-mapped LCK-file depending on the result.
///   inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
///     (any one, if there are several) working with DB or LCK-file within the
///     current process. Thus the function MUST NOT try to acquire an exclusive
///     lock and/or try to destruct shared synchronization objects linked with
///     DB or LCK-file. Moreover, the implementation MUST ensure correct work
///     of other instances of MDBX_env within the current process, e.g.
///     restore POSIX-fcntl locks after the closing of file descriptors.
/// \return Error code (MDBX_PANIC) or zero on success.
MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
                                        MDBX_env *inprocess_neighbor);

/// \brief Connects to shared interprocess locking objects and tries to acquire
///   the maximum lock level (shared if exclusive is not available).
///   Depending on implementation or/and platform (Windows) this function may
///   acquire the non-OS super-level lock (e.g. for shared synchronization
///   objects initialization), which will be downgraded to OS-exclusive or
///   shared via explicit calling of osal_lck_downgrade().
/// \return
///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
///     the current process is the first and only after the last use of DB.
///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
///     DB has already been opened and now is used by other processes.
///   Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env);

/// \brief Downgrades the level of the initially acquired lock to the
///   operational level specified by the environment flags. The reasons for
///   such a downgrade:
///    - unblocking of other processes that are waiting for access, i.e.
///      if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
///      should be made aware that access is unavailable rather than
///      wait for it;
///    - freeing locks that interfere with file operations (especially
///      for Windows).
///   (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to a shared lock.
///   (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to an exclusive
///   operational lock.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env);

/// \brief Locks LCK-file or/and table of readers for (de)registering.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env);

/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env);

/// \brief Acquires the lock for DB change (on writing transaction start).
///   Reading transactions will not be blocked.
///   Declared as LIBMDBX_API because it is used in mdbx_chk.
/// \return Error code or zero on success
LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);

/// \brief Releases the lock once DB changes are made (after a writing
///   transaction has finished).
///   Declared as LIBMDBX_API because it is used in mdbx_chk.
LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);

/// \brief Sets the alive-flag of reader presence (indicative lock) for PID of
///   the current process. The function does no more than needed for
///   the correct working of osal_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env);

/// \brief Resets the alive-flag of reader presence (indicative lock)
///   for PID of the current process. The function does no more than needed
///   for the correct working of osal_rpid_check() in other processes.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env);
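
/* Illustrative sketch, not part of libmdbx: callers of osal_lck_seize()
 * above distinguish the three documented outcomes. Kept under #if 0. */
#if 0
static int example_open_shared_or_exclusive(MDBX_env *env) {
  int rc = osal_lck_seize(env);
  if (rc == MDBX_RESULT_TRUE) {
    /* exclusive: first opener, must initialize the shared LCK objects */
  } else if (rc == MDBX_RESULT_FALSE) {
    /* shared: the DB is already in use by other processes */
  } else {
    /* neither -1 nor 0: an error code */
  }
  return rc;
}
#endif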
1517 /// \brief Checks the status of the reader process with the given pid, using 1518 /// the alive-flag of presence (indicative lock) or another way. 1519 /// \return 1520 /// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive 1521 /// and working with DB (indicative lock is present). 1522 /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent 1523 /// or not working with DB (indicative lock is not present). 1524 /// Otherwise (not 0 and not -1) - error code. 1525 MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); 1526 1527 #if defined(_WIN32) || defined(_WIN64) 1528 1529 #define OSAL_MB2WIDE(FROM, TO) \ 1530 do { \ 1531 const char *const from_tmp = (FROM); \ 1532 const size_t from_mblen = strlen(from_tmp); \ 1533 const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ 1534 if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ 1535 return ERROR_INVALID_NAME; \ 1536 wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ 1537 if (to_wlen + 1 != \ 1538 osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ 1539 return ERROR_INVALID_NAME; \ 1540 (TO) = to_tmp; \ 1541 } while (0) 1542 1543 typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); 1544 MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, 1545 osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, 1546 osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; 1547 1548 #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ 1549 typedef enum _FILE_INFO_BY_HANDLE_CLASS { 1550 FileBasicInfo, 1551 FileStandardInfo, 1552 FileNameInfo, 1553 FileRenameInfo, 1554 FileDispositionInfo, 1555 FileAllocationInfo, 1556 FileEndOfFileInfo, 1557 FileStreamInfo, 1558 FileCompressionInfo, 1559 FileAttributeTagInfo, 1560 FileIdBothDirectoryInfo, 1561 FileIdBothDirectoryRestartInfo, 1562 FileIoPriorityHintInfo, 1563 FileRemoteProtocolInfo, 1564 MaximumFileInfoByHandleClass 1565 } FILE_INFO_BY_HANDLE_CLASS, 1566 *PFILE_INFO_BY_HANDLE_CLASS; 1567 1568 typedef struct _FILE_END_OF_FILE_INFO { 1569 LARGE_INTEGER EndOfFile; 1570 } FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO; 1571 1572 #define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001 1573 #define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002 1574 1575 typedef struct _FILE_REMOTE_PROTOCOL_INFO { 1576 USHORT StructureVersion; 1577 USHORT StructureSize; 1578 DWORD Protocol; 1579 USHORT ProtocolMajorVersion; 1580 USHORT ProtocolMinorVersion; 1581 USHORT ProtocolRevision; 1582 USHORT Reserved; 1583 DWORD Flags; 1584 struct { 1585 DWORD Reserved[8]; 1586 } GenericReserved; 1587 struct { 1588 DWORD Reserved[16]; 1589 } ProtocolSpecificReserved; 1590 } FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO; 1591 1592 #endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */ 1593 1594 typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( 1595 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, 1596 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); 1597 MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx 1598 mdbx_GetFileInformationByHandleEx; 1599 1600 typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( 1601 _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, 1602 _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber, 1603 _Out_opt_ LPDWORD lpMaximumComponentLength, 1604 _Out_opt_ LPDWORD lpFileSystemFlags, 1605 _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); 1606 MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW 1607 mdbx_GetVolumeInformationByHandleW; 1608 1609 typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, 1610 _Out_ LPWSTR lpszFilePath, 1611 _In_ DWORD cchFilePath, 1612 _In_ DWORD dwFlags); 1613 MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; 1614 1615 typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( 1616 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, 1617 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); 1618 MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle 1619 mdbx_SetFileInformationByHandle; 1620
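/* Editor's note: an illustrative, hedged sketch (not part of the build) of
 * how the OSAL_MB2WIDE() helper above is meant to be used: it makes a
 * wide-char copy of a multibyte string on the stack via _alloca() and does
 * `return ERROR_INVALID_NAME` on conversion failure, so it may only appear
 * inside a function returning a Windows error code. The function below is
 * hypothetical. */
#if 0
static int example_delete_file(const char *pathname_utf8) {
  const wchar_t *pathname_wide = NULL;
  OSAL_MB2WIDE(pathname_utf8, pathname_wide);
  return DeleteFileW(pathname_wide) ? MDBX_SUCCESS : (int)GetLastError();
}
#endif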
1621 typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( 1622 IN HANDLE FileHandle, IN OUT HANDLE Event, 1623 IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext, 1624 OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, 1625 IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, 1626 OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); 1627 MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; 1628 1629 typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); 1630 MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; 1631 1632 #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 1633 typedef struct _WIN32_MEMORY_RANGE_ENTRY { 1634 PVOID VirtualAddress; 1635 SIZE_T NumberOfBytes; 1636 } WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; 1637 #endif /* Windows 8.x */ 1638 1639 typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( 1640 HANDLE hProcess, ULONG_PTR NumberOfEntries, 1641 PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); 1642 MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; 1643 1644 typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; 1645 1646 typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, 1647 IN PLARGE_INTEGER NewSectionSize); 1648 MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; 1649 1650 static __inline bool mdbx_RunningUnderWine(void) { 1651 return !mdbx_NtExtendSection; 1652 } 1653 1654 typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, 1655 LPCSTR lpValue, DWORD dwFlags, 1656 LPDWORD pdwType, PVOID pvData, 1657 LPDWORD pcbData); 1658 MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; 1659 1660 NTSYSAPI ULONG RtlRandomEx(PULONG Seed); 1661 1662 #endif /* Windows */ 1663 1664 #endif /* !__cplusplus */ 1665 1666 /*----------------------------------------------------------------------------*/ 1667 1668 #if defined(_MSC_VER) && _MSC_VER >= 1900 1669 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros 1670 * for internal format-args checker. */ 1671 #undef PRIuPTR 1672 #undef PRIiPTR 1673 #undef PRIdPTR 1674 #undef PRIxPTR 1675 #define PRIuPTR "Iu" 1676 #define PRIiPTR "Ii" 1677 #define PRIdPTR "Id" 1678 #define PRIxPTR "Ix" 1679 #define PRIuSIZE "zu" 1680 #define PRIiSIZE "zi" 1681 #define PRIdSIZE "zd" 1682 #define PRIxSIZE "zx" 1683 #endif /* fix PRI*PTR for _MSC_VER */ 1684 1685 #ifndef PRIuSIZE 1686 #define PRIuSIZE PRIuPTR 1687 #define PRIiSIZE PRIiPTR 1688 #define PRIdSIZE PRIdPTR 1689 #define PRIxSIZE PRIxPTR 1690 #endif /* PRI*SIZE macros for MSVC */ 1691 1692 #ifdef _MSC_VER 1693 #pragma warning(pop) 1694 #endif 1695 1696 #define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY) 1697 #if defined(xMDBX_TOOLS) 1698 extern LIBMDBX_API const char *const mdbx_sourcery_anchor; 1699 #endif 1700 1701 /******************************************************************************* 1702 ******************************************************************************* 1703 ******************************************************************************* 1704 * 1705 * 1706 * #### ##### ##### # #### # # #### 1707 * # # # # # # # # ## # # 1708 * # # # # # # # # # # # #### 1709 * # # ##### # # # # # # # # 1710 * # # # # # # # # ## # # 1711 * #### # # # #### # # #### 1712 * 1713 * 1714 */ 1715 1716 /** \defgroup build_option Build options 1717 * The libmdbx build options.
1718 @{ */ 1719 1720 /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ 1721 #define MDBX_OSX_WANNA_DURABILITY 0 1722 /** Using fsync() with chance of data lost on power failure */ 1723 #define MDBX_OSX_WANNA_SPEED 1 1724 1725 #ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY 1726 /** Choices \ref MDBX_OSX_WANNA_DURABILITY or \ref MDBX_OSX_WANNA_SPEED 1727 * for OSX & iOS */ 1728 #define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY 1729 #endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */ 1730 1731 /** Controls checking PID against reuse DB environment after the fork() */ 1732 #ifndef MDBX_ENV_CHECKPID 1733 #if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64) 1734 /* PID check could be omitted: 1735 * - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork() 1736 * mapped pages will not be available for child process. 1737 * - in Windows where fork() not available. */ 1738 #define MDBX_ENV_CHECKPID 0 1739 #else 1740 #define MDBX_ENV_CHECKPID 1 1741 #endif 1742 #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) 1743 #else 1744 #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) 1745 #endif /* MDBX_ENV_CHECKPID */ 1746 1747 /** Controls checking transaction owner thread against misuse transactions from 1748 * other threads. */ 1749 #ifndef MDBX_TXN_CHECKOWNER 1750 #define MDBX_TXN_CHECKOWNER 1 1751 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) 1752 #else 1753 #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) 1754 #endif /* MDBX_TXN_CHECKOWNER */ 1755 1756 /** Does a system have battery-backed Real-Time Clock or just a fake. */ 1757 #ifndef MDBX_TRUST_RTC 1758 #if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) || \ 1759 defined(__OpenBSD__) 1760 #define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */ 1761 #else 1762 #define MDBX_TRUST_RTC 1 1763 #endif 1764 #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) 1765 #else 1766 #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) 1767 #endif /* MDBX_TRUST_RTC */ 1768 1769 /** Controls online database auto-compactification during write-transactions. */ 1770 #ifndef MDBX_ENABLE_REFUND 1771 #define MDBX_ENABLE_REFUND 1 1772 #elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) 1773 #error MDBX_ENABLE_REFUND must be defined as 0 or 1 1774 #endif /* MDBX_ENABLE_REFUND */ 1775 1776 /** Controls gathering statistics for page operations. */ 1777 #ifndef MDBX_ENABLE_PGOP_STAT 1778 #define MDBX_ENABLE_PGOP_STAT 1 1779 #elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1) 1780 #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 1781 #endif /* MDBX_ENABLE_PGOP_STAT */ 1782 1783 /** Enables chunking long list of retired pages during huge transactions commit 1784 * to avoid use sequences of pages. */ 1785 #ifndef MDBX_ENABLE_BIGFOOT 1786 #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) 1787 #define MDBX_ENABLE_BIGFOOT 1 1788 #else 1789 #define MDBX_ENABLE_BIGFOOT 0 1790 #endif 1791 #elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) 1792 #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 1793 #endif /* MDBX_ENABLE_BIGFOOT */ 1794 1795 /** Controls use of POSIX madvise() hints and friends. 
1795 /** Controls use of POSIX madvise() hints and friends. */ 1796 #ifndef MDBX_ENABLE_MADVISE 1797 #define MDBX_ENABLE_MADVISE 1 1798 #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1) 1799 #error MDBX_ENABLE_MADVISE must be defined as 0 or 1 1800 #endif /* MDBX_ENABLE_MADVISE */ 1801 1802 /** Disables some checks to reduce overhead, lowering the probability of 1803 * detecting database corruption to values closer to LMDB. */ 1804 #ifndef MDBX_DISABLE_VALIDATION 1805 #define MDBX_DISABLE_VALIDATION 0 1806 #elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) 1807 #error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 1808 #endif /* MDBX_DISABLE_VALIDATION */ 1809 1810 #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT 1811 #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 1812 #elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 || \ 1813 MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1) 1814 #error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 1815 #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ 1816 1817 #ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT 1818 #define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1 1819 #elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 || \ 1820 MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1) 1821 #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 1822 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ 1823 1824 /** Basically, this build-option is a TODO. It should probably be replaced 1825 * with MDBX_ENABLE_WRITEMAP_SPILLING with three variants: 1826 * 0/OFF = Don't track dirty pages at all and don't spill them. 1827 * This should be the default on Linux and maybe other systems 1828 * (not sure: Darwin/OSX, FreeBSD, Windows 10) where the kernel provides 1829 * proper LRU tracking and async writing on demand. 1830 * 1/ON = Light tracking of dirty pages, but with LRU labels and explicit 1831 * spilling with msync(MS_ASYNC). */ 1832 #ifndef MDBX_FAKE_SPILL_WRITEMAP 1833 #if defined(__linux__) || defined(__gnu_linux__) 1834 #define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is a no-op on Linux */ 1835 #else 1836 #define MDBX_FAKE_SPILL_WRITEMAP 0 1837 #endif 1838 #elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1) 1839 #error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1 1840 #endif /* MDBX_FAKE_SPILL_WRITEMAP */ 1841 1842 /** Controls the sort order of internal page-number lists. 1843 * This is a mostly experimental/advanced option, not for regular MDBX users. 1844 * \warning The database format depends on this option, and libmdbx builds with 1845 * different option values are incompatible. */ 1846 #ifndef MDBX_PNL_ASCENDING 1847 #define MDBX_PNL_ASCENDING 0 1848 #elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) 1849 #error MDBX_PNL_ASCENDING must be defined as 0 or 1 1850 #endif /* MDBX_PNL_ASCENDING */ 1851 1852 /** Avoids dependence on the MSVC CRT by using ntdll.dll instead. */ 1853 #ifndef MDBX_WITHOUT_MSVC_CRT 1854 #define MDBX_WITHOUT_MSVC_CRT 1 1855 #elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1) 1856 #error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1 1857 #endif /* MDBX_WITHOUT_MSVC_CRT */ 1858 1859 /** Size of the buffer used while copying an environment/database file.
*/ 1860 #ifndef MDBX_ENVCOPY_WRITEBUF 1861 #define MDBX_ENVCOPY_WRITEBUF 1048576u 1862 #elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \ 1863 MDBX_ENVCOPY_WRITEBUF % 65536u 1864 #error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536 1865 #endif /* MDBX_ENVCOPY_WRITEBUF */ 1866 1867 /** Forces assertion checking */ 1868 #ifndef MDBX_FORCE_ASSERTIONS 1869 #define MDBX_FORCE_ASSERTIONS 0 1870 #elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1) 1871 #error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1 1872 #endif /* MDBX_FORCE_ASSERTIONS */ 1873 1874 /** Presumed malloc size overhead for each allocation, used 1875 * to adjust allocations to be more aligned. */ 1876 #ifndef MDBX_ASSUME_MALLOC_OVERHEAD 1877 #ifdef __SIZEOF_POINTER__ 1878 #define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u) 1879 #else 1880 #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u) 1881 #endif 1882 #elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 || \ 1883 MDBX_ASSUME_MALLOC_OVERHEAD % 4 1884 #error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4 1885 #endif /* MDBX_ASSUME_MALLOC_OVERHEAD */ 1886 1887 /** If defined, then enables integration with Valgrind, 1888 * a memory analyzing tool. */ 1889 #ifndef MDBX_USE_VALGRIND 1890 #endif /* MDBX_USE_VALGRIND */ 1891 1892 /** If defined, then enables use of C11 atomics; 1893 * otherwise their availability is detected automatically. */ 1894 #ifndef MDBX_HAVE_C11ATOMICS 1895 #endif /* MDBX_HAVE_C11ATOMICS */ 1896 1897 //------------------------------------------------------------------------------ 1898 1899 /** Win32 File Locking API for \ref MDBX_LOCKING */ 1900 #define MDBX_LOCKING_WIN32FILES -1 1901 1902 /** SystemV IPC semaphores for \ref MDBX_LOCKING */ 1903 #define MDBX_LOCKING_SYSV 5 1904 1905 /** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */ 1906 #define MDBX_LOCKING_POSIX1988 1988 1907 1908 /** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */ 1909 #define MDBX_LOCKING_POSIX2001 2001 1910 1911 /** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */ 1912 #define MDBX_LOCKING_POSIX2008 2008 1913 1914 /** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */ 1915 #define MDBX_LOCKING_BENAPHORE 1995 1916 1917 /** Advanced: Chooses the locking implementation (autodetection by default). */ 1918 #if defined(_WIN32) || defined(_WIN64) 1919 #define MDBX_LOCKING MDBX_LOCKING_WIN32FILES 1920 #else 1921 #ifndef MDBX_LOCKING 1922 #if defined(_POSIX_THREAD_PROCESS_SHARED) && \ 1923 _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__) 1924
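/* Editor's note: a hedged, illustrative sketch (not part of the build) of the
 * process-shared robust-mutex pattern that the MDBX_LOCKING_POSIX2008 case
 * selected just below relies upon: the death of a previous owner is reported
 * via EOWNERDEAD, after which the new owner may repair the shared state and
 * mark the mutex consistent again. The function name is hypothetical. */
#if 0
#include <errno.h>
#include <pthread.h>
static int example_robust_lock(pthread_mutex_t *ipc) {
  int err = pthread_mutex_lock(ipc);
  if (err == EOWNERDEAD) {
    /* the previous owner died while holding the lock:
     * recover/validate the shared state here, then... */
    err = pthread_mutex_consistent(ipc);
  }
  return err;
}
#endif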
1925 /* Some platforms define the EOWNERDEAD error code even though they don't 1926 * support Robust Mutexes. If in doubt, compile with -DMDBX_LOCKING=2001. */ 1927 #if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L && \ 1928 ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) && \ 1929 _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) || \ 1930 (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) && \ 1931 _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) || \ 1932 defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) && \ 1933 (!defined(__GLIBC__) || \ 1934 __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */) 1935 #define MDBX_LOCKING MDBX_LOCKING_POSIX2008 1936 #else 1937 #define MDBX_LOCKING MDBX_LOCKING_POSIX2001 1938 #endif 1939 #elif defined(__sun) || defined(__SVR4) || defined(__svr4__) 1940 #define MDBX_LOCKING MDBX_LOCKING_POSIX1988 1941 #else 1942 #define MDBX_LOCKING MDBX_LOCKING_SYSV 1943 #endif 1944 #define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING) 1945 #else 1946 #define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING) 1947 #endif /* MDBX_LOCKING */ 1948 #endif /* !Windows */ 1949 1950 /** Advanced: Using POSIX OFD-locks (autodetection by default). */ 1951 #ifndef MDBX_USE_OFDLOCKS 1952 #if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \ 1953 !defined(MDBX_SAFE4QEMU) && \ 1954 !defined(__sun) /* OFD-locks are broken on Solaris */ 1955 #define MDBX_USE_OFDLOCKS 1 1956 #else 1957 #define MDBX_USE_OFDLOCKS 0 1958 #endif 1959 #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) 1960 #else 1961 #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) 1962 #endif /* MDBX_USE_OFDLOCKS */ 1963 1964 /** Advanced: Using the sendfile() syscall (autodetection by default). */ 1965 #ifndef MDBX_USE_SENDFILE 1966 #if ((defined(__linux__) || defined(__gnu_linux__)) && \ 1967 !defined(__ANDROID_API__)) || \ 1968 (defined(__ANDROID_API__) && __ANDROID_API__ >= 21) 1969 #define MDBX_USE_SENDFILE 1 1970 #else 1971 #define MDBX_USE_SENDFILE 0 1972 #endif 1973 #endif /* MDBX_USE_SENDFILE */ 1974 1975 /** Advanced: Using the copy_file_range() syscall (autodetection by default). */ 1976 #ifndef MDBX_USE_COPYFILERANGE 1977 #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) 1978 #define MDBX_USE_COPYFILERANGE 1 1979 #else 1980 #define MDBX_USE_COPYFILERANGE 0 1981 #endif 1982 #endif /* MDBX_USE_COPYFILERANGE */ 1983
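/* Editor's note: a hedged sketch (not part of the build) of the
 * copy_file_range() pattern which the MDBX_USE_COPYFILERANGE option above
 * enables: the kernel copies between two file descriptors without bouncing
 * the data through userspace buffers. Function name and the error choice for
 * an unexpected EOF are hypothetical. */
#if 0
#include <errno.h>
#include <unistd.h>
static int example_copy_range(int fd_in, int fd_out, size_t length) {
  while (length) {
    const ssize_t moved =
        copy_file_range(fd_in, NULL, fd_out, NULL, length, 0);
    if (moved < 0)
      return errno; /* caller may fall back to plain read()/write() */
    if (moved == 0)
      return EINVAL; /* unexpected EOF */
    length -= (size_t)moved;
  }
  return 0;
}
#endif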
1984 /** Advanced: Using the sync_file_range() syscall (autodetection by default). */ 1985 #ifndef MDBX_USE_SYNCFILERANGE 1986 #if ((defined(__linux__) || defined(__gnu_linux__)) && \ 1987 defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) || \ 1988 (defined(__ANDROID_API__) && __ANDROID_API__ >= 26) 1989 #define MDBX_USE_SYNCFILERANGE 1 1990 #else 1991 #define MDBX_USE_SYNCFILERANGE 0 1992 #endif 1993 #endif /* MDBX_USE_SYNCFILERANGE */ 1994 1995 //------------------------------------------------------------------------------ 1996 1997 #ifndef MDBX_CPU_WRITEBACK_INCOHERENT 1998 #if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ 1999 defined(__hppa__) || defined(DOXYGEN) 2000 #define MDBX_CPU_WRITEBACK_INCOHERENT 0 2001 #else 2002 #define MDBX_CPU_WRITEBACK_INCOHERENT 1 2003 #endif 2004 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ 2005 2006 #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE 2007 #ifdef __OpenBSD__ 2008 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 1 2009 #else 2010 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 2011 #endif 2012 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ 2013 2014 #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE 2015 #if defined(__mips) || defined(__mips__) || defined(__mips64) || \ 2016 defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ 2017 defined(__MWERKS__) || defined(__sgi) 2018 /* MIPS has cache coherency issues. */ 2019 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 1 2020 #else 2021 /* LY: assume no relevant mmap/dcache issues. */ 2022 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 2023 #endif 2024 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ 2025 2026 #ifndef MDBX_64BIT_ATOMIC 2027 #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) 2028 #define MDBX_64BIT_ATOMIC 1 2029 #else 2030 #define MDBX_64BIT_ATOMIC 0 2031 #endif 2032 #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) 2033 #else 2034 #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) 2035 #endif /* MDBX_64BIT_ATOMIC */ 2036 2037 #ifndef MDBX_64BIT_CAS 2038 #if defined(ATOMIC_LLONG_LOCK_FREE) 2039 #if ATOMIC_LLONG_LOCK_FREE > 1 2040 #define MDBX_64BIT_CAS 1 2041 #else 2042 #define MDBX_64BIT_CAS 0 2043 #endif 2044 #elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE) 2045 #if __GCC_ATOMIC_LLONG_LOCK_FREE > 1 2046 #define MDBX_64BIT_CAS 1 2047 #else 2048 #define MDBX_64BIT_CAS 0 2049 #endif 2050 #elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE) 2051 #if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1 2052 #define MDBX_64BIT_CAS 1 2053 #else 2054 #define MDBX_64BIT_CAS 0 2055 #endif 2056 #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) 2057 #define MDBX_64BIT_CAS 1 2058 #else 2059 #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC 2060 #endif 2061 #define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS) 2062 #else 2063 #define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS) 2064 #endif /* MDBX_64BIT_CAS */ 2065 2066 #ifndef MDBX_UNALIGNED_OK 2067 #if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ 2068 defined(ENABLE_UBSAN) 2069 #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ 2070 #elif defined(__ARM_FEATURE_UNALIGNED) 2071 #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ 2072 #elif defined(__e2k__) || defined(__elbrus__) 2073 #if __iset__ > 4 2074 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ 2075 #else 2076 #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ 2077 #endif 2078 #elif defined(__ia32__) 2079 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ 2080 #elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) 2081 /* expecting the optimization will be done well; this also
2082 * hushes false-positives from UBSAN (undefined behaviour sanitizer). */ 2083 #define MDBX_UNALIGNED_OK 0 2084 #else 2085 #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ 2086 #endif 2087 #elif MDBX_UNALIGNED_OK == 1 2088 #undef MDBX_UNALIGNED_OK 2089 #define MDBX_UNALIGNED_OK 32 /* any unaligned access allowed */ 2090 #endif /* MDBX_UNALIGNED_OK */ 2091 2092 #ifndef MDBX_CACHELINE_SIZE 2093 #if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) 2094 #define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE 2095 #elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) 2096 #define MDBX_CACHELINE_SIZE 128 2097 #else 2098 #define MDBX_CACHELINE_SIZE 64 2099 #endif 2100 #endif /* MDBX_CACHELINE_SIZE */ 2101 2102 /** @} end of build options */ 2103 /******************************************************************************* 2104 ******************************************************************************* 2105 ******************************************************************************/ 2106 2107 #ifndef DOXYGEN 2108 2109 /* If MDBX_DEBUG is undefined, set it according to NDEBUG */ 2110 #ifndef MDBX_DEBUG 2111 #ifdef NDEBUG 2112 #define MDBX_DEBUG 0 2113 #else 2114 #define MDBX_DEBUG 1 2115 #endif 2116 #endif /* MDBX_DEBUG */ 2117 2118 #else 2119 2120 /* !!! Actually these are fake definitions for Doxygen !!! */ 2121 2122 /** Controls enabling of debugging features. 2123 * 2124 * - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all, 2125 * including logging and assertion controls. 2126 * Changing the logging level and corresponding debug flags 2127 * via \ref mdbx_setup_debug() will have no effect. 2128 * - `MDBX_DEBUG > 0` Enables code for the debugging features (logging, 2129 * assertion checking and internal audit). 2130 * Simultaneously sets the default logging level 2131 * to the `MDBX_DEBUG` value. 2132 * Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`. 2133 * 2134 * \ingroup build_option */ 2135 #define MDBX_DEBUG 0...7 2136
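/* Editor's note: a hedged usage sketch (not part of the build) for the
 * MDBX_DEBUG description above: in builds with MDBX_DEBUG > 0 the logging
 * level and debug flags may be adjusted at runtime via the public
 * mdbx_setup_debug(); with MDBX_DEBUG = 0 such calls have no effect on
 * assertions. The flag combination below is just an example. */
#if 0
static void example_enable_debug(void) {
  mdbx_setup_debug(MDBX_LOG_VERBOSE, MDBX_DBG_ASSERT | MDBX_DBG_AUDIT,
                   MDBX_LOGGER_DONTCHANGE /* keep the current logger */);
}
#endif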
2137 /** Disables use of GNU libc extensions. */ 2138 #define MDBX_DISABLE_GNU_SOURCE 0 or 1 2139 2140 #endif /* DOXYGEN */ 2141 2142 /* Undefine NDEBUG if debugging is enforced by MDBX_DEBUG */ 2143 #if MDBX_DEBUG 2144 #undef NDEBUG 2145 #endif 2146 2147 /*----------------------------------------------------------------------------*/ 2148 /* Atomics */ 2149 2150 enum MDBX_memory_order { 2151 mo_Relaxed, 2152 mo_AcquireRelease 2153 /* , mo_SequentialConsistency */ 2154 }; 2155 2156 typedef union { 2157 volatile uint32_t weak; 2158 #ifdef MDBX_HAVE_C11ATOMICS 2159 volatile _Atomic uint32_t c11a; 2160 #endif /* MDBX_HAVE_C11ATOMICS */ 2161 } MDBX_atomic_uint32_t; 2162 2163 typedef union { 2164 volatile uint64_t weak; 2165 #if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) 2166 volatile _Atomic uint64_t c11a; 2167 #endif 2168 #if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC 2169 __anonymous_struct_extension__ struct { 2170 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 2171 MDBX_atomic_uint32_t low, high; 2172 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 2173 MDBX_atomic_uint32_t high, low; 2174 #else 2175 #error "FIXME: Unsupported byte order" 2176 #endif /* __BYTE_ORDER__ */ 2177 }; 2178 #endif 2179 } MDBX_atomic_uint64_t; 2180 2181 #ifdef MDBX_HAVE_C11ATOMICS 2182 2183 /* Crutches for C11 atomic compilers' bugs */ 2184 #if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127 2185 #define MDBX_c11a_ro(type, ptr) (&(ptr)->weak) 2186 #define MDBX_c11a_rw(type, ptr) (&(ptr)->weak) 2187 #elif defined(__clang__) && __clang__ < 8 2188 #define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a) 2189 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) 2190 #else 2191 #define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a) 2192 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) 2193 #endif /* Crutches for C11 atomic compilers' bugs */ 2194 2195 #define mo_c11_store(fence) \ 2196 (((fence) == mo_Relaxed) ? memory_order_relaxed \ 2197 : ((fence) == mo_AcquireRelease) ? memory_order_release \ 2198 : memory_order_seq_cst) 2199 #define mo_c11_load(fence) \ 2200 (((fence) == mo_Relaxed) ? memory_order_relaxed \ 2201 : ((fence) == mo_AcquireRelease) ? memory_order_acquire \ 2202 : memory_order_seq_cst) 2203 2204 #endif /* MDBX_HAVE_C11ATOMICS */ 2205 2206 #ifndef __cplusplus 2207 2208 #ifdef MDBX_HAVE_C11ATOMICS 2209 #define osal_memory_fence(order, write) \ 2210 atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) 2211 #else /* MDBX_HAVE_C11ATOMICS */ 2212 #define osal_memory_fence(order, write) \ 2213 do { \ 2214 osal_compiler_barrier(); \ 2215 if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ?
mo_Relaxed \ 2216 : mo_AcquireRelease)) \ 2217 osal_memory_barrier(); \ 2218 } while (0) 2219 #endif /* MDBX_HAVE_C11ATOMICS */ 2220 2221 #if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) 2222 #define atomic_store32(p, value, order) \ 2223 ({ \ 2224 const uint32_t value_to_store = (value); \ 2225 atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, \ 2226 mo_c11_store(order)); \ 2227 value_to_store; \ 2228 }) 2229 #define atomic_load32(p, order) \ 2230 atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)) 2231 #define atomic_store64(p, value, order) \ 2232 ({ \ 2233 const uint64_t value_to_store = (value); \ 2234 atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, \ 2235 mo_c11_store(order)); \ 2236 value_to_store; \ 2237 }) 2238 #define atomic_load64(p, order) \ 2239 atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)) 2240 #endif /* LCC && MDBX_HAVE_C11ATOMICS */ 2241 2242 #ifndef atomic_store32 2243 MDBX_MAYBE_UNUSED static __always_inline uint32_t 2244 atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, 2245 enum MDBX_memory_order order) { 2246 STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); 2247 #ifdef MDBX_HAVE_C11ATOMICS 2248 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); 2249 atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); 2250 #else /* MDBX_HAVE_C11ATOMICS */ 2251 if (order != mo_Relaxed) 2252 osal_compiler_barrier(); 2253 p->weak = value; 2254 osal_memory_fence(order, true); 2255 #endif /* MDBX_HAVE_C11ATOMICS */ 2256 return value; 2257 } 2258 #endif /* atomic_store32 */ 2259 2260 #ifndef atomic_load32 2261 MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( 2262 const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { 2263 STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); 2264 #ifdef MDBX_HAVE_C11ATOMICS 2265 assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); 2266 return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); 2267 #else /* MDBX_HAVE_C11ATOMICS */ 2268 osal_memory_fence(order, false); 2269 const uint32_t value = p->weak; 2270 if (order != mo_Relaxed) 2271 osal_compiler_barrier(); 2272 return value; 2273 #endif /* MDBX_HAVE_C11ATOMICS */ 2274 } 2275 #endif /* atomic_load32 */ 2276 2277 #endif /* !__cplusplus */ 2278 2279 /*----------------------------------------------------------------------------*/ 2280 /* Basic constants and types */ 2281 2282 /* A stamp that identifies a file as an MDBX file. 2283 * There's nothing special about this value other than that it is easily 2284 * recognizable, and it will reflect any byte order mismatches. */ 2285 #define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11) 2286 2287 /* FROZEN: The version number for a database's datafile format. */ 2288 #define MDBX_DATA_VERSION 3 2289 /* The version number for a database's lockfile format. */ 2290 #define MDBX_LOCK_VERSION 4 2291 2292 /* handle for the DB used to track free pages. */ 2293 #define FREE_DBI 0 2294 /* handle for the default DB. */ 2295 #define MAIN_DBI 1 2296 /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ 2297 #define CORE_DBS 2 2298 2299 /* Number of meta pages - also hardcoded elsewhere */ 2300 #define NUM_METAS 3 2301
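/* Editor's note: a hedged worked example (not part of the build) for the
 * page-number limits stated just below: a 32-bit pgno can address up to
 * 2^32 pages, i.e. 2^32 * 4KiB = 2^44 bytes = 16 TiB; the actual MAX_PAGENO
 * (0x7FFFffff, defined below) halves this to ~8 TiB with 4KiB pages. */
#if 0
static uint64_t example_db_size_bound(const size_t pagesize /* e.g. 4096 */) {
  return ((uint64_t)MAX_PAGENO + 1) * pagesize;
}
#endif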
2302 /* A page number in the database. 2303 * 2304 * MDBX uses 32 bits for page numbers. This limits the database 2305 * size to 2^44 bytes with 4K pages. */ 2306 typedef uint32_t pgno_t; 2307 typedef MDBX_atomic_uint32_t atomic_pgno_t; 2308 #define PRIaPGNO PRIu32 2309 #define MAX_PAGENO UINT32_C(0x7FFFffff) 2310 #define MIN_PAGENO NUM_METAS 2311 2312 #define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000) 2313 2314 /* A transaction ID. */ 2315 typedef uint64_t txnid_t; 2316 typedef MDBX_atomic_uint64_t atomic_txnid_t; 2317 #define PRIaTXN PRIi64 2318 #define MIN_TXNID UINT64_C(1) 2319 #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) 2320 #define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1) 2321 #define INVALID_TXNID UINT64_MAX 2322 /* LY: for testing non-atomic 64-bit txnid on 32-bit arches. 2323 * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */ 2324 #ifndef xMDBX_TXNID_STEP 2325 #if MDBX_64BIT_CAS 2326 #define xMDBX_TXNID_STEP 1u 2327 #else 2328 #define xMDBX_TXNID_STEP 2u 2329 #endif 2330 #endif /* xMDBX_TXNID_STEP */ 2331 2332 /* Used for offsets within a single page. 2333 * Since memory pages are typically 4 or 8KB in size, 2334 * 12-13 bits are plenty. */ 2335 typedef uint16_t indx_t; 2336 2337 #define MEGABYTE ((size_t)1 << 20) 2338 2339 /*----------------------------------------------------------------------------*/ 2340 /* Core structures for database and shared memory (i.e. format definition) */ 2341 #pragma pack(push, 4) 2342 2343 /* Information about a single database in the environment. */ 2344 typedef struct MDBX_db { 2345 uint16_t md_flags; /* see mdbx_dbi_open */ 2346 uint16_t md_depth; /* depth of this tree */ 2347 uint32_t md_xsize; /* key-size for MDBX_DUPFIXED (LEAF2 pages) */ 2348 pgno_t md_root; /* the root page of this tree */ 2349 pgno_t md_branch_pages; /* number of internal pages */ 2350 pgno_t md_leaf_pages; /* number of leaf pages */ 2351 pgno_t md_overflow_pages; /* number of overflow pages */ 2352 uint64_t md_seq; /* table sequence counter */ 2353 uint64_t md_entries; /* number of data items */ 2354 uint64_t md_mod_txnid; /* txnid of last committed modification */ 2355 } MDBX_db; 2356 2357 /* database size-related parameters */ 2358 typedef struct MDBX_geo { 2359 uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential 2360 quantized) value */ 2361 uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed 2362 (exponential quantized) value */ 2363 pgno_t lower; /* minimal size of datafile in pages */ 2364 pgno_t upper; /* maximal size of datafile in pages */ 2365 pgno_t now; /* current size of datafile in pages */ 2366 pgno_t next; /* first unused page in the datafile, 2367 but actually the file may be shorter. */ 2368 } MDBX_geo; 2369 2370 /* Meta page content. 2371 * A meta page is the start point for accessing a database snapshot. 2372 * Pages 0..NUM_METAS-1 are meta pages. Transaction N writes meta page (N % NUM_METAS). */ 2373 typedef struct MDBX_meta { 2374 /* Stamp identifying this as an MDBX file. 2375 * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION.
*/ 2376 uint32_t mm_magic_and_version[2]; 2377 2378 /* txnid that committed this page, the first of a two-phase-update pair */ 2379 union { 2380 MDBX_atomic_uint32_t mm_txnid_a[2]; 2381 uint64_t unsafe_txnid; 2382 }; 2383 2384 uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ 2385 uint8_t mm_validator_id; /* ID of checksum and page validation method, 2386 * zero (nothing) for now */ 2387 uint8_t mm_extra_pagehdr; /* extra bytes in the page header, 2388 * zero (nothing) for now */ 2389 2390 MDBX_geo mm_geo; /* database size-related parameters */ 2391 2392 MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ 2393 /* The size of pages used in this DB */ 2394 #define mm_psize mm_dbs[FREE_DBI].md_xsize 2395 MDBX_canary mm_canary; 2396 2397 #define MDBX_DATASIGN_NONE 0u 2398 #define MDBX_DATASIGN_WEAK 1u 2399 #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) 2400 #define META_IS_STEADY(meta) \ 2401 SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) 2402 union { 2403 uint32_t mm_sign[2]; 2404 uint64_t unsafe_sign; 2405 }; 2406 2407 /* txnid that committed this page, the second of a two-phase-update pair */ 2408 MDBX_atomic_uint32_t mm_txnid_b[2]; 2409 2410 /* Number of non-meta pages which were put into the GC after CoW. May be zero 2411 * if the DB was previously handled by a libmdbx version without the 2412 * corresponding feature. Together with mr_snapshot_pages_retired this value 2413 * allows fast estimation of how much a reader restrains GC recycling. */ 2414 uint32_t mm_pages_retired[2]; 2415 2416 /* The analogue of /proc/sys/kernel/random/boot_id or similar, used to 2417 * determine whether the system was rebooted after the last use of the 2418 * database files. If there was no reboot, then there is no need to roll back 2419 * to the last steady sync point. Zeros mean that no relevant information is 2420 * available from the system. */ 2421 bin128_t mm_bootid; 2422 2423 } MDBX_meta; 2424 2425 #pragma pack(1) 2426
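/* Editor's note: a hedged sketch (not part of the build) of the two-phase
 * txnid scheme in MDBX_meta above: the txnid is stored twice, as mm_txnid_a
 * before the payload and mm_txnid_b after it, so a reader loads both halves
 * of both fields and retries while they disagree, since a mismatch means a
 * writer is concurrently updating this meta page. Which 32-bit half is high
 * or low depends on endianness and is elided here. */
#if 0
static uint64_t example_meta_txnid(const volatile MDBX_meta *meta) {
  for (;;) {
    const uint32_t a0 = atomic_load32(&meta->mm_txnid_a[0], mo_AcquireRelease);
    const uint32_t a1 = atomic_load32(&meta->mm_txnid_a[1], mo_AcquireRelease);
    const uint32_t b0 = atomic_load32(&meta->mm_txnid_b[0], mo_AcquireRelease);
    const uint32_t b1 = atomic_load32(&meta->mm_txnid_b[1], mo_AcquireRelease);
    if (a0 == b0 && a1 == b1)
      return (uint64_t)a1 << 32 | a0;
  }
}
#endif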
2427 /* Common header for all page types. The page type depends on mp_flags. 2428 * 2429 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with 2430 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages 2431 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header. 2432 * 2433 * P_OVERFLOW records occupy one or more contiguous pages where only the 2434 * first has a page header. They hold the real data of F_BIGDATA nodes. 2435 * 2436 * P_SUBP sub-pages are small leaf "pages" with duplicate data. 2437 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. 2438 * (Duplicate data can also go in sub-databases, which use normal pages.) 2439 * 2440 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. 2441 * 2442 * Each non-meta page up to MDBX_meta.mm_geo.next is reachable exactly once 2443 * in the snapshot: either used by a database or listed in a GC record. */ 2444 typedef struct MDBX_page { 2445 union { 2446 #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) 2447 #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) 2448 #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) 2449 #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) 2450 #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) 2451 uint64_t 2452 mp_txnid; /* txnid which created this page, may be zero in a legacy DB */ 2453 struct MDBX_page *mp_next; /* for in-memory list of freed pages */ 2454 }; 2455 uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ 2456 #define P_BRANCH 0x01u /* branch page */ 2457 #define P_LEAF 0x02u /* leaf page */ 2458 #define P_OVERFLOW 0x04u /* overflow page */ 2459 #define P_META 0x08u /* meta page */ 2460 #define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ 2461 #define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ 2462 #define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ 2463 #define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ 2464 #define P_SPILLED 0x2000u /* spilled in parent txn */ 2465 #define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ 2466 #define P_FROZEN 0x8000u /* used for retire page with known status */ 2467 #define P_ILL_BITS \ 2468 ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) 2469 uint16_t mp_flags; 2470 union { 2471 uint32_t mp_pages; /* number of overflow pages */ 2472 __anonymous_struct_extension__ struct { 2473 indx_t mp_lower; /* lower bound of free space */ 2474 indx_t mp_upper; /* upper bound of free space */ 2475 }; 2476 }; 2477 pgno_t mp_pgno; /* page number */ 2478 2479 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2480 (!defined(__cplusplus) && defined(_MSC_VER)) 2481 indx_t mp_ptrs[] /* dynamic size */; 2482 #endif /* C99 */ 2483 } MDBX_page; 2484 2485 #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) 2486 2487 /* Drop legacy P_DIRTY flag for sub-pages for compatibility */ 2488 #define PAGETYPE_COMPAT(p) \ 2489 (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ 2490 ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ 2491 : PAGETYPE_WHOLE(p)) 2492 2493 /* Size of the page header, excluding dynamic data at the end */ 2494 #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) 2495 2496 #pragma pack(pop) 2497 2498 #if MDBX_ENABLE_PGOP_STAT 2499 /* Statistics of page operations across all (running, completed and aborted) 2500 * transactions */ 2501 typedef struct { 2502 MDBX_atomic_uint64_t newly; /* Quantity of new pages added */ 2503 MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */ 2504 MDBX_atomic_uint64_t clone; /* Quantity of clones of parent's dirty pages 2505 for nested transactions */ 2506 MDBX_atomic_uint64_t split; /* Page splits */ 2507 MDBX_atomic_uint64_t merge; /* Page merges */ 2508 MDBX_atomic_uint64_t spill; /* Quantity of spilled dirty pages */ 2509 MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ 2510 MDBX_atomic_uint64_t 2511 wops; /* Number of explicit write operations (not pages) to disk */ 2512 MDBX_atomic_uint64_t 2513 gcrtime; /* Time spent reading/searching the GC (aka FreeDB). The 2514 unit/scale is platform-dependent, see osal_monotime().
*/ 2515 } MDBX_pgop_stat_t; 2516 #endif /* MDBX_ENABLE_PGOP_STAT */ 2517 2518 #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES 2519 #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) 2520 typedef void osal_ipclock_t; 2521 #elif MDBX_LOCKING == MDBX_LOCKING_SYSV 2522 2523 #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) 2524 typedef mdbx_pid_t osal_ipclock_t; 2525 #ifndef EOWNERDEAD 2526 #define EOWNERDEAD MDBX_RESULT_TRUE 2527 #endif 2528 2529 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ 2530 MDBX_LOCKING == MDBX_LOCKING_POSIX2008 2531 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) 2532 typedef pthread_mutex_t osal_ipclock_t; 2533 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 2534 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) 2535 typedef sem_t osal_ipclock_t; 2536 #else 2537 #error "FIXME" 2538 #endif /* MDBX_LOCKING */ 2539 2540 #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) 2541 MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); 2542 MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); 2543 #endif /* MDBX_LOCKING */ 2544 2545 /* Reader Lock Table 2546 * 2547 * Readers don't acquire any locks for their data access. Instead, they 2548 * simply record their transaction ID in the reader table. The reader 2549 * mutex is needed just to find an empty slot in the reader table. The 2550 * slot's address is saved in thread-specific data so that subsequent 2551 * read transactions started by the same thread need no further locking to 2552 * proceed. 2553 * 2554 * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. 2555 * No reader table is used if the database is on a read-only filesystem. 2556 * 2557 * Since the database uses multi-version concurrency control, readers don't 2558 * actually need any locking. This table is used to keep track of which 2559 * readers are using data from which old transactions, so that we'll know 2560 * when a particular old transaction is no longer in use. Old transactions 2561 * that have discarded any data pages can then have those pages reclaimed 2562 * for use by a later write transaction. 2563 * 2564 * The lock table is constructed such that reader slots are aligned with the 2565 * processor's cache line size. Any slot is only ever used by one thread. 2566 * This alignment guarantees that there will be no contention or cache 2567 * thrashing as threads update their own slot info, and also eliminates 2568 * any need for locking when accessing a slot. 2569 * 2570 * A writer thread will scan every slot in the table to determine the oldest 2571 * outstanding reader transaction. Any freed pages older than this will be 2572 * reclaimed by the writer. The writer doesn't use any locks when scanning 2573 * this table. This means that there's no guarantee that the writer will 2574 * see the most up-to-date reader info, but that's not required for correct 2575 * operation - all we need is to know the upper bound on the oldest reader, 2576 * we don't care at all about the newest reader. So the only consequence of 2577 * reading stale information here is that old pages might hang around a 2578 * while longer before being reclaimed. That's actually good anyway, because 2579 * the longer we delay reclaiming old pages, the more likely it is that a 2580 * string of contiguous pages can be found after coalescing old pages from 2581 * many old transactions together. */ 2582 2583 /* The actual reader record, with cacheline padding. */ 2584 typedef struct MDBX_reader { 2585 /* Current Transaction ID when this transaction began, or (txnid_t)-1. 
2586 * Multiple readers that start at the same time will probably have the 2587 * same ID here. Again, it's not important to exclude them from 2588 * anything; all we need to know is which version of the DB they 2589 * started from so we can avoid overwriting any data used in that 2590 * particular version. */ 2591 MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; 2592 2593 /* The information we store in a single slot of the reader table. 2594 * In addition to a transaction ID, we also record the process and 2595 * thread ID that owns a slot, so that we can detect stale information, 2596 * e.g. threads or processes that went away without cleaning up. 2597 * 2598 * NOTE: We currently don't check for stale records. 2599 * We simply re-init the table when we know that we're the only process 2600 * opening the lock file. */ 2601 2602 /* The thread ID of the thread owning this txn. */ 2603 MDBX_atomic_uint64_t mr_tid; 2604 2605 /* The process ID of the process owning this reader txn. */ 2606 MDBX_atomic_uint32_t mr_pid; 2607 2608 /* The number of pages used in the reader's MVCC snapshot, 2609 * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ 2610 atomic_pgno_t mr_snapshot_pages_used; 2611 /* Number of retired pages at the time this reader starts its transaction. 2612 * So, at any time the difference mm_pages_retired - mr_snapshot_pages_retired 2613 * gives the number of pages which this reader restrains from reuse. */ 2614 MDBX_atomic_uint64_t mr_snapshot_pages_retired; 2615 } MDBX_reader; 2616 2617 /* The header for the reader table (a memory-mapped lock file). */ 2618 typedef struct MDBX_lockinfo { 2619 /* Stamp identifying this as an MDBX file. 2620 * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */ 2621 uint64_t mti_magic_and_version; 2622 2623 /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ 2624 uint32_t mti_os_and_format; 2625 2626 /* Flags with which the environment was opened. */ 2627 MDBX_atomic_uint32_t mti_envmode; 2628 2629 /* Threshold of un-synced-with-disk pages for the auto-sync feature; 2630 * zero means no-threshold, i.e. auto-sync is disabled. */ 2631 atomic_pgno_t mti_autosync_threshold; 2632 2633 /* Low 32 bits of the txnid with which meta-pages were synced, 2634 * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ 2635 MDBX_atomic_uint32_t mti_meta_sync_txnid; 2636 2637 /* Period for the timed auto-sync feature, i.e. at every steady checkpoint 2638 * mti_unsynced_timeout is set to current_time + mti_autosync_period. 2639 * The time value is represented in a suitable system-dependent form, for 2640 * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). 2641 * Zero means the timed auto-sync is disabled. */ 2642 MDBX_atomic_uint64_t mti_autosync_period; 2643 2644 /* Marker to distinguish uniqueness of DB/CLK. */ 2645 MDBX_atomic_uint64_t mti_bait_uniqueness; 2646 2647 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2648 2649 #if MDBX_ENABLE_PGOP_STAT 2650 /* Statistics of costly ops across all (running, completed and aborted) 2651 * transactions */ 2652 MDBX_pgop_stat_t mti_pgop_stat; 2653 #endif /* MDBX_ENABLE_PGOP_STAT*/ 2654 2655 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2656 2657 /* Write transaction lock. */ 2658 #if MDBX_LOCKING > 0 2659 osal_ipclock_t mti_wlock; 2660 #endif /* MDBX_LOCKING > 0 */ 2661 2662 atomic_txnid_t mti_oldest_reader; 2663 2664 /* Timestamp of the last steady sync.
Value is represented in a suitable 2665 * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or 2666 * clock_gettime(CLOCK_MONOTONIC). */ 2667 MDBX_atomic_uint64_t mti_sync_timestamp; 2668 2669 /* Number of un-synced-with-disk pages for the auto-sync feature. */ 2670 atomic_pgno_t mti_unsynced_pages; 2671 2672 /* Number of pages which were discarded last time by madvise(MADV_FREE). */ 2673 atomic_pgno_t mti_discarded_tail; 2674 2675 /* Timestamp of the last readers check. */ 2676 MDBX_atomic_uint64_t mti_reader_check_timestamp; 2677 2678 /* Shared anchor for tracking readahead edge and enabled/disabled status. */ 2679 pgno_t mti_readahead_anchor; 2680 2681 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2682 2683 /* Readers' registration lock. */ 2684 #if MDBX_LOCKING > 0 2685 osal_ipclock_t mti_rlock; 2686 #endif /* MDBX_LOCKING > 0 */ 2687 2688 /* The number of slots that have been used in the reader table. 2689 * This always records the maximum count; it is not decremented 2690 * when readers release their slots. */ 2691 MDBX_atomic_uint32_t mti_numreaders; 2692 MDBX_atomic_uint32_t mti_readers_refresh_flag; 2693 2694 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2695 (!defined(__cplusplus) && defined(_MSC_VER)) 2696 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2697 MDBX_reader mti_readers[] /* dynamic size */; 2698 #endif /* C99 */ 2699 } MDBX_lockinfo; 2700 2701 /* Lockfile format signature: version, features and field layout */ 2702 #define MDBX_LOCK_FORMAT \ 2703 (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ 2704 (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \ 2705 (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \ 2706 (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ 2707 (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) 2708 2709 #define MDBX_DATA_MAGIC \ 2710 ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) 2711 2712 #define MDBX_DATA_MAGIC_LEGACY_COMPAT \ 2713 ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2) 2714 2715 #define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255) 2716 2717 #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) 2718
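/* Editor's note: a hedged sketch (not part of the build) of the lock-free
 * reader-table scan described above: a writer walks every used slot and takes
 * the minimum mr_txnid as the oldest outstanding reader; stale values only
 * delay page reclamation, they never break correctness. safe64_read() stands
 * for a torn-read-safe 64-bit load and is an assumption here, as is the
 * function name. */
#if 0
static txnid_t example_oldest_reader(MDBX_lockinfo *lck, txnid_t current) {
  txnid_t oldest = current;
  const uint32_t n = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
  for (uint32_t i = 0; i < n; ++i)
    if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
      const txnid_t snapshot = safe64_read(&lck->mti_readers[i].mr_txnid);
      if (oldest > snapshot)
        oldest = snapshot;
    }
  return oldest;
}
#endif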
2719 /* The maximum size of a database page. 2720 * 2721 * It is 64K, but (value - PAGEHDRSZ) must fit in MDBX_page.mp_upper. 2722 * 2723 * MDBX will use database pages < OS pages if needed. 2724 * That causes more I/O in write transactions: the OS must 2725 * know (read) the whole page before writing a partial page. 2726 * 2727 * Note that we don't currently support Huge pages. On Linux, 2728 * regular data files cannot use Huge pages, and in general 2729 * Huge pages aren't actually pageable. We rely on the OS 2730 * demand-pager to read our data and page it out when memory 2731 * pressure from other processes is high. So until OSs have 2732 * actual paging support for Huge pages, they're not viable. */ 2733 #define MAX_PAGESIZE MDBX_MAX_PAGESIZE 2734 #define MIN_PAGESIZE MDBX_MIN_PAGESIZE 2735 2736 #define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) 2737 #if defined(_WIN32) || defined(_WIN64) 2738 #define MAX_MAPSIZE32 UINT32_C(0x38000000) 2739 #else 2740 #define MAX_MAPSIZE32 UINT32_C(0x7f000000) 2741 #endif 2742 #define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MAX_PAGESIZE) 2743 2744 #if MDBX_WORDBITS >= 64 2745 #define MAX_MAPSIZE MAX_MAPSIZE64 2746 #define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO) 2747 #else 2748 #define MAX_MAPSIZE MAX_MAPSIZE32 2749 #define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) 2750 #endif /* MDBX_WORDBITS */ 2751 2752 #define MDBX_READERS_LIMIT 32767 2753 #define MDBX_RADIXSORT_THRESHOLD 333 2754 2755 /*----------------------------------------------------------------------------*/ 2756 2757 /* A PNL is a Page Number List, a sorted array of IDs. 2758 * The first element of the array is a counter for how many actual page-numbers 2759 * are in the list. By default PNLs are sorted in descending order, which allows 2760 * cutting off the page with the lowest pgno (at the tail) by just truncating the 2761 * list. The sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ 2762 typedef pgno_t *MDBX_PNL; 2763 2764 #if MDBX_PNL_ASCENDING 2765 #define MDBX_PNL_ORDERED(first, last) ((first) < (last)) 2766 #define MDBX_PNL_DISORDERED(first, last) ((first) >= (last)) 2767 #else 2768 #define MDBX_PNL_ORDERED(first, last) ((first) > (last)) 2769 #define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) 2770 #endif 2771 2772 /* List of txnids, only for MDBX_txn.tw.lifo_reclaimed */ 2773 typedef txnid_t *MDBX_TXL; 2774 2775 /* A Dirty-Page list item is a pgno/pointer pair. */ 2776 typedef struct MDBX_dp { 2777 MDBX_page *ptr; 2778 pgno_t pgno; 2779 union { 2780 unsigned extra; 2781 __anonymous_struct_extension__ struct { 2782 unsigned multi : 1; 2783 unsigned lru : 31; 2784 }; 2785 }; 2786 } MDBX_dp; 2787 2788 /* A DPL (dirty-page list) is a sorted array of MDBX_DPs. */ 2789 typedef struct MDBX_dpl { 2790 unsigned sorted; 2791 unsigned length; 2792 unsigned pages_including_loose; /* number of pages, not entries.
*/ 2793 unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ 2794 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2795 (!defined(__cplusplus) && defined(_MSC_VER)) 2796 MDBX_dp items[] /* dynamic size with holes at zero and after the last */; 2797 #endif 2798 } MDBX_dpl; 2799 2800 /* PNL sizes */ 2801 #define MDBX_PNL_GRANULATE 1024 2802 #define MDBX_PNL_INITIAL \ 2803 (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) 2804 2805 #define MDBX_TXL_GRANULATE 32 2806 #define MDBX_TXL_INITIAL \ 2807 (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) 2808 #define MDBX_TXL_MAX \ 2809 ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) 2810 2811 #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) 2812 #define MDBX_PNL_SIZE(pl) ((pl)[0]) 2813 #define MDBX_PNL_FIRST(pl) ((pl)[1]) 2814 #define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)]) 2815 #define MDBX_PNL_BEGIN(pl) (&(pl)[1]) 2816 #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1]) 2817 2818 #if MDBX_PNL_ASCENDING 2819 #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) 2820 #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) 2821 #else 2822 #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) 2823 #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) 2824 #endif 2825 2826 #define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t)) 2827 #define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0) 2828 2829 /*----------------------------------------------------------------------------*/ 2830 /* Internal structures */ 2831 2832 /* Auxiliary DB info. 2833 * The information here is mostly static/read-only. There is 2834 * only a single copy of this record in the environment. */ 2835 typedef struct MDBX_dbx { 2836 MDBX_val md_name; /* name of the database */ 2837 MDBX_cmp_func *md_cmp; /* function for comparing keys */ 2838 MDBX_cmp_func *md_dcmp; /* function for comparing data items */ 2839 size_t md_klen_min, md_klen_max; /* min/max key length for the database */ 2840 size_t md_vlen_min, 2841 md_vlen_max; /* min/max value/data length for the database */ 2842 } MDBX_dbx; 2843 2844 typedef struct troika { 2845 uint8_t fsm, recent, prefer_steady, tail_and_flags; 2846 #define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) 2847 #define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) 2848 #define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) 2849 #define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) 2850 txnid_t txnid[NUM_METAS]; 2851 } meta_troika_t; 2852
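/* Editor's note: a hedged sketch (not part of the build) of the PNL layout
 * defined above: element [-1] holds the allocated length, element [0] the
 * count, and the page numbers live in [1..count], so iteration uses the
 * MDBX_PNL_BEGIN()/MDBX_PNL_END() accessors. The function is hypothetical. */
#if 0
static pgno_t example_pnl_max(const MDBX_PNL pnl) {
  pgno_t max = 0;
  for (const pgno_t *it = MDBX_PNL_BEGIN(pnl); it != MDBX_PNL_END(pnl); ++it)
    if (max < *it)
      max = *it;
  return max;
}
#endif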
2853 /* A database transaction. 2854 * Every operation requires a transaction handle. */ 2855 struct MDBX_txn { 2856 #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) 2857 uint32_t mt_signature; 2858 2859 /* Transaction Flags */ 2860 /* mdbx_txn_begin() flags */ 2861 #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) 2862 #define MDBX_TXN_RW_BEGIN_FLAGS \ 2863 (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) 2864 /* Additional flag for sync_locked() */ 2865 #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) 2866 2867 #define TXN_FLAGS \ 2868 (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ 2869 MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) 2870 2871 #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ 2872 ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ 2873 MDBX_SHRINK_ALLOWED) 2874 #error "Oops, some txn flags overlapped or wrong" 2875 #endif 2876 uint32_t mt_flags; 2877 2878 MDBX_txn *mt_parent; /* parent of a nested txn */ 2879 /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ 2880 MDBX_txn *mt_child; 2881 MDBX_geo mt_geo; 2882 /* next unallocated page */ 2883 #define mt_next_pgno mt_geo.next 2884 /* corresponding to the current size of datafile */ 2885 #define mt_end_pgno mt_geo.now 2886 2887 /* The ID of this transaction. IDs are integers incrementing from 2888 * INITIAL_TXNID. Only committed write transactions increment the ID. If a 2889 * transaction aborts, the ID may be re-used by the next writer. */ 2890 txnid_t mt_txnid; 2891 txnid_t mt_front; 2892 2893 MDBX_env *mt_env; /* the DB environment */ 2894 /* Array of records for each DB known in the environment. */ 2895 MDBX_dbx *mt_dbxs; 2896 /* Array of MDBX_db records for each known DB */ 2897 MDBX_db *mt_dbs; 2898 /* Array of sequence numbers for each DB handle */ 2899 MDBX_atomic_uint32_t *mt_dbiseqs; 2900 2901 /* Transaction DBI Flags */ 2902 #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ 2903 #define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ 2904 #define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ 2905 #define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ 2906 #define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ 2907 #define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ 2908 #define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ 2909 /* Array of flags for each DB */ 2910 uint8_t *mt_dbistate; 2911 /* Number of DB records in use, or 0 when the txn is finished. 2912 * This number only ever increments until the txn finishes; we 2913 * don't decrement it when individual DB handles are closed. */ 2914 MDBX_dbi mt_numdbs; 2915 size_t mt_owner; /* thread ID that owns this transaction */ 2916 MDBX_canary mt_canary; 2917 void *mt_userctx; /* User-settable context */ 2918 MDBX_cursor **mt_cursors; 2919 2920 union { 2921 struct { 2922 /* For read txns: This thread/txn's reader table slot, or NULL. */ 2923 MDBX_reader *reader; 2924 } to; 2925 struct { 2926 meta_troika_t troika; 2927 2928 pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ 2929 txnid_t last_reclaimed; /* ID of last used record */ 2930 #if MDBX_ENABLE_REFUND 2931 pgno_t loose_refund_wl /* FIXME: describe */; 2932 #endif /* MDBX_ENABLE_REFUND */ 2933 /* dirtylist room: Dirty array size - dirty pages visible to this txn. 2934 * Includes ancestor txns' dirty pages not hidden by other txns' 2935 * dirty/spilled pages.
Thus commit(nested txn) has room to merge 2936 * dirtylist into mt_parent after freeing hidden mt_parent pages. */ 2937 unsigned dirtyroom; 2938 /* a sequence to spilling dirty page with LRU policy */ 2939 unsigned dirtylru; 2940 /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ 2941 MDBX_dpl *dirtylist; 2942 /* The list of reclaimed txns from GC */ 2943 MDBX_TXL lifo_reclaimed; 2944 /* The list of pages that became unused during this transaction. */ 2945 MDBX_PNL retired_pages; 2946 /* The list of loose pages that became unused and may be reused 2947 * in this transaction, linked through `mp_next`. */ 2948 MDBX_page *loose_pages; 2949 /* Number of loose pages (tw.loose_pages) */ 2950 unsigned loose_count; 2951 unsigned spill_least_removed; 2952 /* The sorted list of dirty pages we temporarily wrote to disk 2953 * because the dirty list was full. page numbers in here are 2954 * shifted left by 1, deleted slots have the LSB set. */ 2955 MDBX_PNL spill_pages; 2956 } tw; 2957 }; 2958 }; 2959 2960 #if MDBX_WORDBITS >= 64 2961 #define CURSOR_STACK 32 2962 #else 2963 #define CURSOR_STACK 24 2964 #endif 2965 2966 struct MDBX_xcursor; 2967 2968 /* Cursors are used for all DB operations. 2969 * A cursor holds a path of (page pointer, key index) from the DB 2970 * root to a position in the DB, plus other state. MDBX_DUPSORT 2971 * cursors include an xcursor to the current data item. Write txns 2972 * track their cursors and keep them up to date when data moves. 2973 * Exception: An xcursor's pointer to a P_SUBP page can be stale. 2974 * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ 2975 struct MDBX_cursor { 2976 #define MDBX_MC_LIVE UINT32_C(0xFE05D5B1) 2977 #define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) 2978 #define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) 2979 uint32_t mc_signature; 2980 /* The database handle this cursor operates on */ 2981 MDBX_dbi mc_dbi; 2982 /* Next cursor on this DB in this txn */ 2983 MDBX_cursor *mc_next; 2984 /* Backup of the original cursor if this cursor is a shadow */ 2985 MDBX_cursor *mc_backup; 2986 /* Context used for databases with MDBX_DUPSORT, otherwise NULL */ 2987 struct MDBX_xcursor *mc_xcursor; 2988 /* The transaction that owns this cursor */ 2989 MDBX_txn *mc_txn; 2990 /* The database record for this cursor */ 2991 MDBX_db *mc_db; 2992 /* The database auxiliary record for this cursor */ 2993 MDBX_dbx *mc_dbx; 2994 /* The mt_dbistate for this database */ 2995 uint8_t *mc_dbistate; 2996 uint8_t mc_snum; /* number of pushed pages */ 2997 uint8_t mc_top; /* index of top page, normally mc_snum-1 */ 2998 2999 /* Cursor state flags. */ 3000 #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ 3001 #define C_EOF 0x02 /* No more data */ 3002 #define C_SUB 0x04 /* Cursor is a sub-cursor */ 3003 #define C_DEL 0x08 /* last op was a cursor_del */ 3004 #define C_UNTRACK 0x10 /* Un-track cursor when closing */ 3005 #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ 3006 #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ 3007 uint8_t mc_flags; /* see mdbx_cursor */ 3008 3009 /* Cursor checking flags. 
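 * The low CC_* bits intentionally mirror the corresponding P_* page-type
 * bits (see the "same as P_..." notes below), so that CHECK_LEAF_TYPE()
 * can validate a page against the cursor's expectation with a single
 * XOR-and-mask, roughly:
 *
 *   ok = ((PAGETYPE_WHOLE(mp) ^ mc->mc_checking) &
 *         (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0;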
*/ 3010 #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ 3011 #define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ 3012 #define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ 3013 #define CC_UPDATING 0x08 /* update/rebalance pending */ 3014 #define CC_SKIPORD 0x10 /* don't check keys ordering */ 3015 #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ 3016 #define CC_RETIRING 0x40 /* refs to child pages may be invalid */ 3017 #define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ 3018 uint8_t mc_checking; /* page checking level */ 3019 3020 MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ 3021 indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ 3022 }; 3023 3024 #define CHECK_LEAF_TYPE(mc, mp) \ 3025 (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ 3026 (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) 3027 3028 /* Context for sorted-dup records. 3029 * We could have gone to a fully recursive design, with arbitrarily 3030 * deep nesting of sub-databases. But for now we only handle these 3031 * levels - main DB, optional sub-DB, sorted-duplicate DB. */ 3032 typedef struct MDBX_xcursor { 3033 /* A sub-cursor for traversing the Dup DB */ 3034 MDBX_cursor mx_cursor; 3035 /* The database record for this Dup DB */ 3036 MDBX_db mx_db; 3037 /* The auxiliary DB record for this Dup DB */ 3038 MDBX_dbx mx_dbx; 3039 } MDBX_xcursor; 3040 3041 typedef struct MDBX_cursor_couple { 3042 MDBX_cursor outer; 3043 void *mc_userctx; /* User-settable context */ 3044 MDBX_xcursor inner; 3045 } MDBX_cursor_couple; 3046 3047 /* The database environment. */ 3048 struct MDBX_env { 3049 /* ----------------------------------------------------- mostly static part */ 3050 #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) 3051 MDBX_atomic_uint32_t me_signature; 3052 /* Failed to update the meta page. Probably an I/O error. */ 3053 #define MDBX_FATAL_ERROR UINT32_C(0x80000000) 3054 /* Some fields are initialized. 
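 * (An illustrative reading, inferred from ENV_INTERNAL_FLAGS below: the
 * MDBX_ENV_ACTIVE bit marks a handle whose runtime fields were set up,
 * so teardown code may test
 *   if (env->me_flags & MDBX_ENV_ACTIVE) ...
 * before releasing the corresponding resources.)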
*/
3055 #define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
3056 /* me_txkey is set */
3057 #define MDBX_ENV_TXKEY UINT32_C(0x10000000)
3058 /* Legacy MDBX_MAPASYNC (prior v0.9) */
3059 #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
3060 /* Legacy MDBX_COALESCE (prior v0.12) */
3061 #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000)
3062 #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
3063 uint32_t me_flags;
3064 osal_mmap_t me_dxb_mmap; /* The main data file */
3065 #define me_map me_dxb_mmap.dxb
3066 #define me_lazy_fd me_dxb_mmap.fd
3067 mdbx_filehandle_t me_dsync_fd;
3068 osal_mmap_t me_lck_mmap; /* The lock file */
3069 #define me_lfd me_lck_mmap.fd
3070 struct MDBX_lockinfo *me_lck;
3071
3072 unsigned me_psize; /* DB page size, initialized from me_os_psize */
3073 unsigned me_leaf_nodemax; /* max size of a leaf-node */
3074 uint8_t me_psize2log; /* log2 of DB page size */
3075 int8_t me_stuck_meta; /* recovery-only: target meta page, or less than zero */
3076 uint16_t me_merge_threshold,
3077 me_merge_threshold_gc; /* pages emptier than this are candidates for
3078 merging */
3079 unsigned me_os_psize; /* OS page size, from osal_syspagesize() */
3080 unsigned me_maxreaders; /* size of the reader table */
3081 MDBX_dbi me_maxdbs; /* size of the DB table */
3082 uint32_t me_pid; /* process ID of this env */
3083 osal_thread_key_t me_txkey; /* thread-key for readers */
3084 pathchar_t *me_pathname; /* path to the DB files */
3085 void *me_pbuf; /* scratch area for DUPSORT put() */
3086 MDBX_txn *me_txn0; /* preallocated write transaction */
3087
3088 MDBX_dbx *me_dbxs; /* array of static DB info */
3089 uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
3090 MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */
3091 unsigned
3092 me_maxgc_ov1page; /* Number of pgno_t that fit in a single overflow page */
3093 uint32_t me_live_reader; /* has a liveness lock in the reader table */
3094 void *me_userctx; /* User-settable context */
3095 MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
3096
3097 struct {
3098 unsigned dp_reserve_limit;
3099 unsigned rp_augment_limit;
3100 unsigned dp_limit;
3101 unsigned dp_initial;
3102 uint8_t dp_loose_limit;
3103 uint8_t spill_max_denominator;
3104 uint8_t spill_min_denominator;
3105 uint8_t spill_parent4child_denominator;
3106 unsigned merge_threshold_16dot16_percent;
3107 union {
3108 unsigned all;
3109 /* tracks options with non-auto values but tuned by user */
3110 struct {
3111 unsigned dp_limit : 1;
3112 } non_auto;
3113 } flags;
3114 } me_options;
3115
3116 /* The me_dbgeo struct accepts db-geometry parameters from the user for
3117 * new database creation, i.e. when mdbx_env_set_geometry() is called
3118 * before mdbx_env_open().
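 *
 * A hedged sketch of such a call (the values are illustrative only):
 *
 *   mdbx_env_set_geometry(env,
 *       1ul << 20,  // size_lower: 1 MiB
 *       -1,         // size_now: keep default
 *       1ul << 30,  // size_upper: 1 GiB
 *       1ul << 20,  // growth_step
 *       1ul << 21,  // shrink_threshold
 *       -1);        // pagesize: default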
*/ 3119 struct { 3120 size_t lower; /* minimal size of datafile */ 3121 size_t upper; /* maximal size of datafile */ 3122 size_t now; /* current size of datafile */ 3123 size_t grow; /* step to grow datafile */ 3124 size_t shrink; /* threshold to shrink datafile */ 3125 } me_dbgeo; 3126 3127 #if MDBX_LOCKING == MDBX_LOCKING_SYSV 3128 union { 3129 key_t key; 3130 int semid; 3131 } me_sysv_ipc; 3132 #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ 3133 3134 MDBX_env *me_lcklist_next; 3135 3136 /* --------------------------------------------------- mostly volatile part */ 3137 3138 MDBX_txn *me_txn; /* current write transaction */ 3139 osal_fastmutex_t me_dbi_lock; 3140 MDBX_dbi me_numdbs; /* number of DBs opened */ 3141 3142 MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ 3143 unsigned me_dp_reserve_len; 3144 /* PNL of pages that became unused in a write txn */ 3145 MDBX_PNL me_retired_pages; 3146 3147 #if defined(_WIN32) || defined(_WIN64) 3148 osal_srwlock_t me_remap_guard; 3149 /* Workaround for LockFileEx and WriteFile multithread bug */ 3150 CRITICAL_SECTION me_windowsbug_lock; 3151 #else 3152 osal_fastmutex_t me_remap_guard; 3153 #endif 3154 3155 /* -------------------------------------------------------------- debugging */ 3156 3157 #if MDBX_DEBUG 3158 MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ 3159 #endif 3160 #ifdef MDBX_USE_VALGRIND 3161 int me_valgrind_handle; 3162 #endif 3163 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 3164 pgno_t me_poison_edge; 3165 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ 3166 3167 #ifndef xMDBX_DEBUG_SPILLING 3168 #define xMDBX_DEBUG_SPILLING 0 3169 #endif 3170 #if xMDBX_DEBUG_SPILLING == 2 3171 unsigned debug_dirtied_est, debug_dirtied_act; 3172 #endif /* xMDBX_DEBUG_SPILLING */ 3173 3174 /* ------------------------------------------------- stub for lck-less mode */ 3175 MDBX_atomic_uint64_t 3176 x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) / 3177 sizeof(MDBX_atomic_uint64_t)]; 3178 }; 3179 3180 #ifndef __cplusplus 3181 /*----------------------------------------------------------------------------*/ 3182 /* Debug and Logging stuff */ 3183 3184 #define MDBX_RUNTIME_FLAGS_INIT \ 3185 ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT 3186 3187 extern uint8_t runtime_flags; 3188 extern uint8_t loglevel; 3189 extern MDBX_debug_func *debug_logger; 3190 3191 MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { 3192 #if MDBX_DEBUG 3193 if (MDBX_DBG_JITTER & runtime_flags) 3194 osal_jitter(tiny); 3195 #else 3196 (void)tiny; 3197 #endif 3198 } 3199 3200 MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) 3201 debug_log(int level, const char *function, int line, const char *fmt, ...) 3202 MDBX_PRINTF_ARGS(4, 5); 3203 MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, 3204 const char *fmt, va_list args); 3205 3206 #if MDBX_DEBUG 3207 #define LOG_ENABLED(msg) unlikely(msg <= loglevel) 3208 #define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) 3209 #else /* MDBX_DEBUG */ 3210 #define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) 3211 #define AUDIT_ENABLED() (0) 3212 #endif /* MDBX_DEBUG */ 3213 3214 #if MDBX_FORCE_ASSERTIONS 3215 #define ASSERT_ENABLED() (1) 3216 #elif MDBX_DEBUG 3217 #define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) 3218 #else 3219 #define ASSERT_ENABLED() (0) 3220 #endif /* assertions */ 3221 3222 #define DEBUG_EXTRA(fmt, ...) 
\ 3223 do { \ 3224 if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ 3225 debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ 3226 } while (0) 3227 3228 #define DEBUG_EXTRA_PRINT(fmt, ...) \ 3229 do { \ 3230 if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ 3231 debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ 3232 } while (0) 3233 3234 #define TRACE(fmt, ...) \ 3235 do { \ 3236 if (LOG_ENABLED(MDBX_LOG_TRACE)) \ 3237 debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3238 } while (0) 3239 3240 #define DEBUG(fmt, ...) \ 3241 do { \ 3242 if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ 3243 debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3244 } while (0) 3245 3246 #define VERBOSE(fmt, ...) \ 3247 do { \ 3248 if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ 3249 debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3250 } while (0) 3251 3252 #define NOTICE(fmt, ...) \ 3253 do { \ 3254 if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ 3255 debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3256 } while (0) 3257 3258 #define WARNING(fmt, ...) \ 3259 do { \ 3260 if (LOG_ENABLED(MDBX_LOG_WARN)) \ 3261 debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3262 } while (0) 3263 3264 #undef ERROR /* wingdi.h \ 3265 Yeah, morons from M$ put such definition to the public header. */ 3266 3267 #define ERROR(fmt, ...) \ 3268 do { \ 3269 if (LOG_ENABLED(MDBX_LOG_ERROR)) \ 3270 debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3271 } while (0) 3272 3273 #define FATAL(fmt, ...) \ 3274 debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); 3275 3276 #define ENSURE_MSG(env, expr, msg) \ 3277 do { \ 3278 if (unlikely(!(expr))) \ 3279 mdbx_assert_fail(env, msg, __func__, __LINE__); \ 3280 } while (0) 3281 3282 #define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) 3283 3284 /* assert(3) variant in environment context */ 3285 #define eASSERT(env, expr) \ 3286 do { \ 3287 if (ASSERT_ENABLED()) \ 3288 ENSURE(env, expr); \ 3289 } while (0) 3290 3291 /* assert(3) variant in cursor context */ 3292 #define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) 3293 3294 /* assert(3) variant in transaction context */ 3295 #define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) 3296 3297 #ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ 3298 #undef assert 3299 #define assert(expr) eASSERT(NULL, expr) 3300 #endif 3301 3302 /*----------------------------------------------------------------------------*/ 3303 /* Cache coherence and mmap invalidation */ 3304 3305 #if MDBX_CPU_WRITEBACK_INCOHERENT 3306 #define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() 3307 #else 3308 #define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() 3309 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ 3310 3311 MDBX_MAYBE_UNUSED static __inline void 3312 osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { 3313 #if MDBX_MMAP_INCOHERENT_FILE_WRITE 3314 char *const begin = (char *)(-pagesize & (intptr_t)addr); 3315 char *const end = 3316 (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); 3317 int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; 3318 eASSERT(nullptr, err == 0); 3319 (void)err; 3320 #else 3321 (void)pagesize; 3322 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ 3323 3324 #if MDBX_MMAP_INCOHERENT_CPU_CACHE 3325 #ifdef DCACHE 3326 /* MIPS has cache coherency issues. 3327 * Note: for any nbytes >= on-chip cache size, entire is flushed. 
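 * (For reference: cacheflush() below is the Linux/MIPS syscall taking an
 * address range plus a cache selector, DCACHE meaning the data cache.
 * The begin/end arithmetic above rounds [addr, addr + nbytes) outward to
 * pagesize boundaries; e.g. with pagesize 0x1000 and addr 0x12345 the
 * msync() span would start at 0x12000 -- illustrative values only.)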
*/ 3328 cacheflush(addr, nbytes, DCACHE); 3329 #else 3330 #error "Oops, cacheflush() not available" 3331 #endif /* DCACHE */ 3332 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ 3333 3334 #if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE 3335 (void)addr; 3336 (void)nbytes; 3337 #endif 3338 } 3339 3340 /*----------------------------------------------------------------------------*/ 3341 /* Internal prototypes */ 3342 3343 MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, 3344 int *dead); 3345 MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, 3346 MDBX_reader *end); 3347 MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); 3348 3349 MDBX_INTERNAL_FUNC void global_ctor(void); 3350 MDBX_INTERNAL_FUNC void global_dtor(void); 3351 MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); 3352 3353 #endif /* !__cplusplus */ 3354 3355 #define MDBX_IS_ERROR(rc) \ 3356 ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) 3357 3358 /* Internal error codes, not exposed outside libmdbx */ 3359 #define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) 3360 3361 /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ 3362 #define DDBI(mc) \ 3363 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) 3364 3365 /* Key size which fits in a DKBUF (debug key buffer). */ 3366 #define DKBUF_MAX 511 3367 #define DKBUF char _kbuf[DKBUF_MAX * 4 + 2] 3368 #define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1) 3369 #define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1) 3370 3371 #if MDBX_DEBUG 3372 #define DKBUF_DEBUG DKBUF 3373 #define DKEY_DEBUG(x) DKEY(x) 3374 #define DVAL_DEBUG(x) DVAL(x) 3375 #else 3376 #define DKBUF_DEBUG ((void)(0)) 3377 #define DKEY_DEBUG(x) ("-") 3378 #define DVAL_DEBUG(x) ("-") 3379 #endif 3380 3381 /* An invalid page number. 3382 * Mainly used to denote an empty tree. */ 3383 #define P_INVALID (~(pgno_t)0) 3384 3385 /* Test if the flags f are set in a flag word w. */ 3386 #define F_ISSET(w, f) (((w) & (f)) == (f)) 3387 3388 /* Round n up to an even number. */ 3389 #define EVEN(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */ 3390 3391 /* Default size of memory map. 3392 * This is certainly too small for any actual applications. Apps should 3393 * always set the size explicitly using mdbx_env_set_geometry(). */ 3394 #define DEFAULT_MAPSIZE MEGABYTE 3395 3396 /* Number of slots in the reader table. 3397 * This value was chosen somewhat arbitrarily. The 61 is a prime number, 3398 * and such readers plus a couple mutexes fit into single 4KB page. 3399 * Applications should set the table size using mdbx_env_set_maxreaders(). */ 3400 #define DEFAULT_READERS 61 3401 3402 /* Test if a page is a leaf page */ 3403 #define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0) 3404 /* Test if a page is a LEAF2 page */ 3405 #define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0) 3406 /* Test if a page is a branch page */ 3407 #define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0) 3408 /* Test if a page is an overflow page */ 3409 #define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0) 3410 /* Test if a page is a sub page */ 3411 #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) 3412 3413 /* Header for a single key/data pair within a page. 3414 * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. 3415 * We guarantee 2-byte alignment for 'MDBX_node's. 3416 * 3417 * Leaf node flags describe node contents. 
F_BIGDATA says the node's 3418 * data part is the page number of an overflow page with actual data. 3419 * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in 3420 * a sub-page/sub-database, and named databases (just F_SUBDATA). */ 3421 typedef struct MDBX_node { 3422 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 3423 union { 3424 uint32_t mn_dsize; 3425 uint32_t mn_pgno32; 3426 }; 3427 uint8_t mn_flags; /* see mdbx_node flags */ 3428 uint8_t mn_extra; 3429 uint16_t mn_ksize; /* key size */ 3430 #else 3431 uint16_t mn_ksize; /* key size */ 3432 uint8_t mn_extra; 3433 uint8_t mn_flags; /* see mdbx_node flags */ 3434 union { 3435 uint32_t mn_pgno32; 3436 uint32_t mn_dsize; 3437 }; 3438 #endif /* __BYTE_ORDER__ */ 3439 3440 /* mdbx_node Flags */ 3441 #define F_BIGDATA 0x01 /* data put on overflow page */ 3442 #define F_SUBDATA 0x02 /* data is a sub-database */ 3443 #define F_DUPDATA 0x04 /* data has duplicates */ 3444 3445 /* valid flags for mdbx_node_add() */ 3446 #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) 3447 3448 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 3449 (!defined(__cplusplus) && defined(_MSC_VER)) 3450 uint8_t mn_data[] /* key and data are appended here */; 3451 #endif /* C99 */ 3452 } MDBX_node; 3453 3454 #define DB_PERSISTENT_FLAGS \ 3455 (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ 3456 MDBX_INTEGERDUP | MDBX_REVERSEDUP) 3457 3458 /* mdbx_dbi_open() flags */ 3459 #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) 3460 3461 #define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ 3462 #define DB_INTERNAL_FLAGS DB_VALID 3463 3464 #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS 3465 #error "Oops, some flags overlapped or wrong" 3466 #endif 3467 #if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS 3468 #error "Oops, some flags overlapped or wrong" 3469 #endif 3470 3471 /* max number of pages to commit in one writev() call */ 3472 #define MDBX_COMMIT_PAGES 64 3473 #if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ 3474 #undef MDBX_COMMIT_PAGES 3475 #define MDBX_COMMIT_PAGES IOV_MAX 3476 #endif 3477 3478 /* 3479 * / 3480 * | -1, a < b 3481 * CMP2INT(a,b) = < 0, a == b 3482 * | 1, a > b 3483 * \ 3484 */ 3485 #ifndef __e2k__ 3486 /* LY: fast enough on most systems */ 3487 #define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) 3488 #else 3489 /* LY: more parallelable on VLIW Elbrus */ 3490 #define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) 3491 #endif 3492 3493 /* Do not spill pages to disk if txn is getting full, may fail instead */ 3494 #define MDBX_NOSPILL 0x8000 3495 3496 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3497 int64pgno(int64_t i64) { 3498 if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1)) 3499 return (pgno_t)i64; 3500 return (i64 < (int64_t)MIN_PAGENO) ? 
MIN_PAGENO : MAX_PAGENO; 3501 } 3502 3503 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3504 pgno_add(size_t base, size_t augend) { 3505 assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); 3506 return int64pgno(base + augend); 3507 } 3508 3509 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3510 pgno_sub(size_t base, size_t subtrahend) { 3511 assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && 3512 subtrahend < MAX_PAGENO); 3513 return int64pgno(base - subtrahend); 3514 } 3515 3516 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool 3517 is_powerof2(size_t x) { 3518 return (x & (x - 1)) == 0; 3519 } 3520 3521 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t 3522 floor_powerof2(size_t value, size_t granularity) { 3523 assert(is_powerof2(granularity)); 3524 return value & ~(granularity - 1); 3525 } 3526 3527 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t 3528 ceil_powerof2(size_t value, size_t granularity) { 3529 return floor_powerof2(value + granularity - 1, granularity); 3530 } 3531 3532 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned 3533 log2n_powerof2(size_t value) { 3534 assert(value > 0 && value < INT32_MAX && is_powerof2(value)); 3535 assert((value & -(int32_t)value) == value); 3536 #if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) 3537 return __builtin_ctzl(value); 3538 #elif defined(_MSC_VER) 3539 unsigned long index; 3540 _BitScanForward(&index, (unsigned long)value); 3541 return index; 3542 #else 3543 static const uint8_t debruijn_ctz32[32] = { 3544 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 3545 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; 3546 return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; 3547 #endif 3548 } 3549 3550 /* Only a subset of the mdbx_env flags can be changed 3551 * at runtime. Changing other flags requires closing the 3552 * environment and re-opening it with the new flags. 
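 * For instance (a sketch; error handling elided), a runtime-changeable
 * flag can be toggled on a live environment via the public API:
 *
 *   mdbx_env_set_flags(env, MDBX_SAFE_NOSYNC, true);
 *
 * whereas a changeless flag such as MDBX_WRITEMAP may only be passed
 * to mdbx_env_open().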
*/ 3553 #define ENV_CHANGEABLE_FLAGS \ 3554 (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ 3555 MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ 3556 MDBX_VALIDATION) 3557 #define ENV_CHANGELESS_FLAGS \ 3558 (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ 3559 MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) 3560 #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) 3561 3562 #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS 3563 MDBX_MAYBE_UNUSED static void static_checks(void) { 3564 STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, 3565 "Oops, MDBX_MAX_DBI or CORE_DBS?"); 3566 STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == 3567 ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) & 3568 (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)), 3569 "Oops, some flags overlapped or wrong"); 3570 STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0, 3571 "Oops, some flags overlapped or wrong"); 3572 } 3573 #endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */ 3574 3575 #ifdef __cplusplus 3576 } 3577 #endif 3578 3579 #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ 3580 do { \ 3581 TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ 3582 (size_t)(size), __LINE__); \ 3583 ASAN_POISON_MEMORY_REGION(addr, size); \ 3584 } while (0) 3585 3586 #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ 3587 do { \ 3588 TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ 3589 (size_t)(size), __LINE__); \ 3590 ASAN_UNPOISON_MEMORY_REGION(addr, size); \ 3591 } while (0) 3592 /* 3593 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>. 3594 * and other libmdbx authors: please see AUTHORS file. 3595 * All rights reserved. 3596 * 3597 * This code is derived from "LMDB engine" written by 3598 * Howard Chu (Symas Corporation), which itself derived from btree.c 3599 * written by Martin Hedenfalk. 3600 * 3601 * --- 3602 * 3603 * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. 3604 * 3605 * Redistribution and use in source and binary forms, with or without 3606 * modification, are permitted only as authorized by the OpenLDAP 3607 * Public License. 3608 * 3609 * A copy of this license is available in the file LICENSE in the 3610 * top-level directory of the distribution or, alternatively, at 3611 * <http://www.OpenLDAP.org/license.html>. 3612 * 3613 * --- 3614 * 3615 * Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 3616 * 3617 * Permission to use, copy, modify, and distribute this software for any 3618 * purpose with or without fee is hereby granted, provided that the above 3619 * copyright notice and this permission notice appear in all copies. 3620 * 3621 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 3622 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 3623 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 3624 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 3625 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 3626 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 3627 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ 3628 3629 3630 /*------------------------------------------------------------------------------ 3631 * Internal inline functions */ 3632 3633 MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) { 3634 assert(value > INT_MIN); 3635 const unsigned expanded_sign = 3636 (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1)); 3637 return ((unsigned)value + expanded_sign) ^ expanded_sign; 3638 } 3639 3640 /* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ 3641 MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m, 3642 unsigned e) { 3643 assert(m < 2048 && e < 8); 3644 return (pgno_t)(32768 + ((m + 1) << (e + 8))); 3645 } 3646 3647 MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, 3648 unsigned e) { 3649 assert(v > (e ? me2v(2047, e - 1) : 32768)); 3650 assert(v <= me2v(2047, e)); 3651 size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); 3652 m -= m > 0; 3653 assert(m < 2048 && e < 8); 3654 // f e d c b a 9 8 7 6 5 4 3 2 1 0 3655 // 1 e e e m m m m m m m m m m m 1 3656 const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1)); 3657 assert(pv != 65535); 3658 return pv; 3659 } 3660 3661 /* Convert 16-bit packed (exponential quantized) value to number of pages */ 3662 MDBX_NOTHROW_CONST_FUNCTION static pgno_t pv2pages(uint16_t pv) { 3663 if ((pv & 0x8001) != 0x8001) 3664 return pv; 3665 if (pv == 65535) 3666 return 65536; 3667 // f e d c b a 9 8 7 6 5 4 3 2 1 0 3668 // 1 e e e m m m m m m m m m m m 1 3669 return me2v((pv >> 1) & 2047, (pv >> 12) & 7); 3670 } 3671 3672 /* Convert number of pages to 16-bit packed (exponential quantized) value */ 3673 MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { 3674 if (pages < 32769 || (pages < 65536 && (pages & 1) == 0)) 3675 return (uint16_t)pages; 3676 if (pages <= me2v(2047, 0)) 3677 return v2me(pages, 0); 3678 if (pages <= me2v(2047, 1)) 3679 return v2me(pages, 1); 3680 if (pages <= me2v(2047, 2)) 3681 return v2me(pages, 2); 3682 if (pages <= me2v(2047, 3)) 3683 return v2me(pages, 3); 3684 if (pages <= me2v(2047, 4)) 3685 return v2me(pages, 4); 3686 if (pages <= me2v(2047, 5)) 3687 return v2me(pages, 5); 3688 if (pages <= me2v(2047, 6)) 3689 return v2me(pages, 6); 3690 return (pages < me2v(2046, 7)) ? 
v2me(pages, 7) : 65533; 3691 } 3692 3693 /*------------------------------------------------------------------------------ 3694 * Unaligned access */ 3695 3696 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned 3697 field_alignment(unsigned alignment_baseline, size_t field_offset) { 3698 unsigned merge = alignment_baseline | (unsigned)field_offset; 3699 return merge & -(int)merge; 3700 } 3701 3702 /* read-thunk for UB-sanitizer */ 3703 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t 3704 peek_u8(const uint8_t *const __restrict ptr) { 3705 return *ptr; 3706 } 3707 3708 /* write-thunk for UB-sanitizer */ 3709 static __always_inline void poke_u8(uint8_t *const __restrict ptr, 3710 const uint8_t v) { 3711 *ptr = v; 3712 } 3713 3714 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t 3715 unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { 3716 assert((uintptr_t)ptr % expected_alignment == 0); 3717 if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0) 3718 return *(const uint16_t *)ptr; 3719 else { 3720 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3721 defined(_M_X64) || defined(_M_IA64) 3722 return *(const __unaligned uint16_t *)ptr; 3723 #else 3724 uint16_t v; 3725 memcpy(&v, ptr, sizeof(v)); 3726 return v; 3727 #endif /* _MSC_VER || __unaligned */ 3728 } 3729 } 3730 3731 static __always_inline void 3732 unaligned_poke_u16(const unsigned expected_alignment, 3733 void *const __restrict ptr, const uint16_t v) { 3734 assert((uintptr_t)ptr % expected_alignment == 0); 3735 if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0) 3736 *(uint16_t *)ptr = v; 3737 else { 3738 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3739 defined(_M_X64) || defined(_M_IA64) 3740 *((uint16_t __unaligned *)ptr) = v; 3741 #else 3742 memcpy(ptr, &v, sizeof(v)); 3743 #endif /* _MSC_VER || __unaligned */ 3744 } 3745 } 3746 3747 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( 3748 const unsigned expected_alignment, const void *const __restrict ptr) { 3749 assert((uintptr_t)ptr % expected_alignment == 0); 3750 if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0) 3751 return *(const uint32_t *)ptr; 3752 else if ((expected_alignment % sizeof(uint16_t)) == 0) { 3753 const uint16_t lo = 3754 ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; 3755 const uint16_t hi = 3756 ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; 3757 return lo | (uint32_t)hi << 16; 3758 } else { 3759 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3760 defined(_M_X64) || defined(_M_IA64) 3761 return *(const __unaligned uint32_t *)ptr; 3762 #else 3763 uint32_t v; 3764 memcpy(&v, ptr, sizeof(v)); 3765 return v; 3766 #endif /* _MSC_VER || __unaligned */ 3767 } 3768 } 3769 3770 static __always_inline void 3771 unaligned_poke_u32(const unsigned expected_alignment, 3772 void *const __restrict ptr, const uint32_t v) { 3773 assert((uintptr_t)ptr % expected_alignment == 0); 3774 if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0) 3775 *(uint32_t *)ptr = v; 3776 else if ((expected_alignment % sizeof(uint16_t)) == 0) { 3777 ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v; 3778 ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = 3779 (uint16_t)(v >> 16); 3780 } else { 3781 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3782 
defined(_M_X64) || defined(_M_IA64) 3783 *((uint32_t __unaligned *)ptr) = v; 3784 #else 3785 memcpy(ptr, &v, sizeof(v)); 3786 #endif /* _MSC_VER || __unaligned */ 3787 } 3788 } 3789 3790 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( 3791 const unsigned expected_alignment, const void *const __restrict ptr) { 3792 assert((uintptr_t)ptr % expected_alignment == 0); 3793 if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) 3794 return *(const uint64_t *)ptr; 3795 else if ((expected_alignment % sizeof(uint32_t)) == 0) { 3796 const uint32_t lo = 3797 ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; 3798 const uint32_t hi = 3799 ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; 3800 return lo | (uint64_t)hi << 32; 3801 } else { 3802 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3803 defined(_M_X64) || defined(_M_IA64) 3804 return *(const __unaligned uint64_t *)ptr; 3805 #else 3806 uint64_t v; 3807 memcpy(&v, ptr, sizeof(v)); 3808 return v; 3809 #endif /* _MSC_VER || __unaligned */ 3810 } 3811 } 3812 3813 static __always_inline uint64_t 3814 unaligned_peek_u64_volatile(const unsigned expected_alignment, 3815 const volatile void *const __restrict ptr) { 3816 assert((uintptr_t)ptr % expected_alignment == 0); 3817 assert(expected_alignment % sizeof(uint32_t) == 0); 3818 if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) 3819 return *(const volatile uint64_t *)ptr; 3820 else { 3821 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3822 defined(_M_X64) || defined(_M_IA64) 3823 return *(const volatile __unaligned uint64_t *)ptr; 3824 #else 3825 const uint32_t lo = ((const volatile uint32_t *) 3826 ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; 3827 const uint32_t hi = ((const volatile uint32_t *) 3828 ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; 3829 return lo | (uint64_t)hi << 32; 3830 #endif /* _MSC_VER || __unaligned */ 3831 } 3832 } 3833 3834 static __always_inline void 3835 unaligned_poke_u64(const unsigned expected_alignment, 3836 void *const __restrict ptr, const uint64_t v) { 3837 assert((uintptr_t)ptr % expected_alignment == 0); 3838 if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0) 3839 *(uint64_t *)ptr = v; 3840 else if ((expected_alignment % sizeof(uint32_t)) == 0) { 3841 ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v; 3842 ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = 3843 (uint32_t)(v >> 32); 3844 } else { 3845 #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ 3846 defined(_M_X64) || defined(_M_IA64) 3847 *((uint64_t __unaligned *)ptr) = v; 3848 #else 3849 memcpy(ptr, &v, sizeof(v)); 3850 #endif /* _MSC_VER || __unaligned */ 3851 } 3852 } 3853 3854 #define UNALIGNED_PEEK_8(ptr, struct, field) \ 3855 peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) 3856 #define UNALIGNED_POKE_8(ptr, struct, field, value) \ 3857 poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) 3858 3859 #define UNALIGNED_PEEK_16(ptr, struct, field) \ 3860 unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) 3861 #define UNALIGNED_POKE_16(ptr, struct, field, value) \ 3862 unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) 3863 3864 #define UNALIGNED_PEEK_32(ptr, struct, field) \ 3865 unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) 3866 #define UNALIGNED_POKE_32(ptr, struct, field, value) \ 3867 
unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) 3868 3869 #define UNALIGNED_PEEK_64(ptr, struct, field) \ 3870 unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) 3871 #define UNALIGNED_POKE_64(ptr, struct, field, value) \ 3872 unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) 3873 3874 /* Get the page number pointed to by a branch node */ 3875 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t 3876 node_pgno(const MDBX_node *const __restrict node) { 3877 pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32); 3878 if (sizeof(pgno) > 4) 3879 pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32; 3880 return pgno; 3881 } 3882 3883 /* Set the page number in a branch node */ 3884 static __always_inline void node_set_pgno(MDBX_node *const __restrict node, 3885 pgno_t pgno) { 3886 assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); 3887 3888 UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno); 3889 if (sizeof(pgno) > 4) 3890 UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 3891 (uint8_t)((uint64_t)pgno >> 32)); 3892 } 3893 3894 /* Get the size of the data in a leaf node */ 3895 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t 3896 node_ds(const MDBX_node *const __restrict node) { 3897 return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize); 3898 } 3899 3900 /* Set the size of the data for a leaf node */ 3901 static __always_inline void node_set_ds(MDBX_node *const __restrict node, 3902 size_t size) { 3903 assert(size < INT_MAX); 3904 UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size); 3905 } 3906 3907 /* The size of a key in a node */ 3908 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t 3909 node_ks(const MDBX_node *const __restrict node) { 3910 return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize); 3911 } 3912 3913 /* Set the size of the key for a leaf node */ 3914 static __always_inline void node_set_ks(MDBX_node *const __restrict node, 3915 size_t size) { 3916 assert(size < INT16_MAX); 3917 UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size); 3918 } 3919 3920 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t 3921 node_flags(const MDBX_node *const __restrict node) { 3922 return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags); 3923 } 3924 3925 static __always_inline void node_set_flags(MDBX_node *const __restrict node, 3926 uint8_t flags) { 3927 UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags); 3928 } 3929 3930 /* Size of the node header, excluding dynamic data at the end */ 3931 #define NODESIZE offsetof(MDBX_node, mn_data) 3932 3933 /* Address of the key for the node */ 3934 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * 3935 node_key(const MDBX_node *const __restrict node) { 3936 return (char *)node + NODESIZE; 3937 } 3938 3939 /* Address of the data for a node */ 3940 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * 3941 node_data(const MDBX_node *const __restrict node) { 3942 return (char *)node_key(node) + node_ks(node); 3943 } 3944 3945 /* Size of a node in a leaf page with a given key and data. 3946 * This is node header plus key plus data size. */ 3947 MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t 3948 node_size_len(const size_t key_len, const size_t value_len) { 3949 return NODESIZE + EVEN(key_len + value_len); 3950 } 3951 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t 3952 node_size(const MDBX_val *key, const MDBX_val *value) { 3953 return node_size_len(key ? key->iov_len : 0, value ? 
value->iov_len : 0);
3954 }
3955
3956 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
3957 peek_pgno(const void *const __restrict ptr) {
3958 if (sizeof(pgno_t) == sizeof(uint32_t))
3959 return (pgno_t)unaligned_peek_u32(1, ptr);
3960 else if (sizeof(pgno_t) == sizeof(uint64_t))
3961 return (pgno_t)unaligned_peek_u64(1, ptr);
3962 else {
3963 pgno_t pgno;
3964 memcpy(&pgno, ptr, sizeof(pgno));
3965 return pgno;
3966 }
3967 }
3968
3969 static __always_inline void poke_pgno(void *const __restrict ptr,
3970 const pgno_t pgno) {
3971 if (sizeof(pgno) == sizeof(uint32_t))
3972 unaligned_poke_u32(1, ptr, pgno);
3973 else if (sizeof(pgno) == sizeof(uint64_t))
3974 unaligned_poke_u64(1, ptr, pgno);
3975 else
3976 memcpy(ptr, &pgno, sizeof(pgno));
3977 }
3978
3979 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
3980 node_largedata_pgno(const MDBX_node *const __restrict node) {
3981 assert(node_flags(node) & F_BIGDATA);
3982 return peek_pgno(node_data(node));
3983 }
3984
3985 /*------------------------------------------------------------------------------
3986 * Nodes, Keys & Values length limitation factors:
3987 *
3988 * BRANCH_NODE_MAX
3989 * A branch-page must contain at least two nodes, each with a key and a child
3990 * page number. But a page can't be split if it contains fewer than 4 keys,
3991 * i.e. a page should not overflow before adding the fourth key. Therefore,
3992 * at least 3 branch-nodes should fit in a single branch-page. Further, the
3993 * first node of a branch-page doesn't contain a key, i.e. the first node
3994 * always requires space just for itself. Thus:
3995 * PAGEROOM = pagesize - page_hdr_len;
3996 * BRANCH_NODE_MAX = even_floor(
3997 * (PAGEROOM - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
3998 * KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
3999 *
4000 * LEAF_NODE_MAX
4001 * A leaf-node must fit into a single leaf-page, whereas a value may be placed
4002 * on a large/overflow page. However, it may be necessary to insert a nearly
4003 * page-sized node between two large nodes that already fill up a page. In
4004 * this case the page must be split in two if some pair of nodes fits on one
4005 * page, or otherwise split into THREE pages with a single node on each of
4006 * them. Such 1-into-3 page splitting is costly and complex since it
4007 * requires TWO insertions into the parent page, which could cause it to
4008 * split as well, and so on up to the root. Therefore double-splitting is
4009 * avoided here and the maximum node size is half of the leaf page space:
4010 * LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t));
4011 * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX;
4012 *
4013 * - A SubDatabase-node must fit into one leaf-page:
4014 * SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db);
4015 *
4016 * - Dupsort values are themselves keys in a dupsort-subdb and can't be longer
4017 * than KEYLEN_MAX. But a dupsort node must not be greater than LEAF_NODE_MAX,
4018 * since a dupsort value can't be placed on a large/overflow page:
4019 * DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
4020 * max(DATALEN_NO_OVERFLOW, sizeof(MDBX_db)));
4021 */
4022
4023 #define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ)
4024 #define EVEN_FLOOR(n) ((n) & ~(size_t)1)
4025 #define BRANCH_NODE_MAX(pagesize) \
4026 (EVEN_FLOOR((PAGEROOM(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \
4027 sizeof(indx_t)))
4028 #define LEAF_NODE_MAX(pagesize) \
4029 (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t))
4030 #define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1)
4031
4032 static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
4033 assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
4034 is_powerof2(pagesize));
4035 STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8);
4036 if (flags & MDBX_INTEGERKEY)
4037 return 8 /* sizeof(uint64_t) */;
4038
4039 const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
4040 STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) - NODESIZE -
4041 /* sizeof(uint64) as a key */ 8 >
4042 sizeof(MDBX_db));
4043 if (flags &
4044 (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
4045 const intptr_t max_dupsort_leaf_key =
4046 LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db);
4047 return (max_branch_key < max_dupsort_leaf_key)
4048 ? (unsigned)max_branch_key
4049 : (unsigned)max_dupsort_leaf_key;
4050 }
4051 return (unsigned)max_branch_key;
4052 }
4053
4054 static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
4055 assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
4056 is_powerof2(pagesize));
4057
4058 if (flags & MDBX_INTEGERDUP)
4059 return 8 /* sizeof(uint64_t) */;
4060
4061 if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
4062 return keysize_max(pagesize, 0);
4063
4064 const unsigned page_ln2 = log2n_powerof2(pagesize);
4065 const size_t hard = 0x7FF00000ul;
4066 const size_t hard_pages = hard >> page_ln2;
4067 STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO);
4068 const size_t pages_limit = MDBX_PGL_LIMIT / 4;
4069 const size_t limit =
4070 (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
4071 return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
4072 }
4073
4074 __cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
4075 return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
4076 }
4077
4078 __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
4079 MDBX_db_flags_t flags) {
4080 if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
4081 return -1;
4082
4083 return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags);
4084 }
4085
4086 size_t mdbx_default_pagesize(void) {
4087 size_t pagesize = osal_syspagesize();
4088 ENSURE(nullptr, is_powerof2(pagesize));
4089 pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE;
4090 pagesize = (pagesize <= MAX_PAGESIZE) ?
pagesize : MAX_PAGESIZE;
4091 return pagesize;
4092 }
4093
4094 __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize,
4095 MDBX_db_flags_t flags) {
4096 if (pagesize < 1)
4097 pagesize = (intptr_t)mdbx_default_pagesize();
4098 if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
4099 pagesize > (intptr_t)MAX_PAGESIZE ||
4100 !is_powerof2((size_t)pagesize)))
4101 return -1;
4102
4103 return keysize_max(pagesize, flags);
4104 }
4105
4106 __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env,
4107 MDBX_db_flags_t flags) {
4108 if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
4109 return -1;
4110
4111 return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags);
4112 }
4113
4114 __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
4115 MDBX_db_flags_t flags) {
4116 if (pagesize < 1)
4117 pagesize = (intptr_t)mdbx_default_pagesize();
4118 if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
4119 pagesize > (intptr_t)MAX_PAGESIZE ||
4120 !is_powerof2((size_t)pagesize)))
4121 return -1;
4122
4123 return valsize_max(pagesize, flags);
4124 }
4125
4126 /* Calculate the size of a leaf node.
4127 *
4128 * The size depends on the environment's page size; if a data item
4129 * is too large it will be put onto a large/overflow page and the node
4130 * size will only include the key and not the data. Sizes are always
4131 * rounded up to an even number of bytes, to guarantee 2-byte alignment
4132 * of the MDBX_node headers. */
4133 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
4134 leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) {
4135 size_t node_bytes = node_size(key, data);
4136 if (node_bytes > env->me_leaf_nodemax) {
4137 /* put on large/overflow page */
4138 node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);
4139 }
4140
4141 return node_bytes + sizeof(indx_t);
4142 }
4143
4144 /* Calculate the size of a branch node.
4145 *
4146 * The size would depend on the environment's page size, but since
4147 * we currently don't support spilling large keys onto large/overflow
4148 * pages, it's simply the size of the MDBX_node header plus the
4149 * size of the key. Sizes are always rounded up to an even number
4150 * of bytes, to guarantee 2-byte alignment of the MDBX_node headers.
4151 *
4152 * [in] env The environment handle.
4153 * [in] key The key for the node.
4154 *
4155 * Returns the number of bytes needed to store the node. */
4156 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
4157 branch_size(const MDBX_env *env, const MDBX_val *key) {
4158 /* Size of a node in a branch page with a given key.
4159 * This is just the node header plus the key, there is no data.
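 *
 * A worked example, assuming NODESIZE == 8 and sizeof(indx_t) == 2:
 * a 10-byte key costs node_size = NODESIZE + EVEN(10) = 18 bytes, so
 * branch_size() accounts 18 + sizeof(indx_t) = 20 bytes within the page.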
*/ 4160 size_t node_bytes = node_size(key, nullptr); 4161 if (unlikely(node_bytes > env->me_leaf_nodemax)) { 4162 /* put on large/overflow page */ 4163 /* not implemented */ 4164 mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, 4165 __LINE__); 4166 node_bytes = node_size(key, nullptr) + sizeof(pgno_t); 4167 } 4168 4169 return node_bytes + sizeof(indx_t); 4170 } 4171 4172 MDBX_NOTHROW_CONST_FUNCTION static __always_inline uint16_t 4173 flags_db2sub(uint16_t db_flags) { 4174 uint16_t sub_flags = db_flags & MDBX_DUPFIXED; 4175 4176 /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */ 4177 #define SHIFT_INTEGERDUP_TO_INTEGERKEY 2 4178 STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) == 4179 MDBX_INTEGERKEY); 4180 sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY; 4181 4182 /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */ 4183 #define SHIFT_REVERSEDUP_TO_REVERSEKEY 5 4184 STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) == 4185 MDBX_REVERSEKEY); 4186 sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY; 4187 4188 return sub_flags; 4189 } 4190 4191 /*----------------------------------------------------------------------------*/ 4192 4193 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t 4194 pgno2bytes(const MDBX_env *env, pgno_t pgno) { 4195 eASSERT(env, (1u << env->me_psize2log) == env->me_psize); 4196 return ((size_t)pgno) << env->me_psize2log; 4197 } 4198 4199 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * 4200 pgno2page(const MDBX_env *env, pgno_t pgno) { 4201 return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); 4202 } 4203 4204 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t 4205 bytes2pgno(const MDBX_env *env, size_t bytes) { 4206 eASSERT(env, (env->me_psize >> env->me_psize2log) == 1); 4207 return (pgno_t)(bytes >> env->me_psize2log); 4208 } 4209 4210 MDBX_NOTHROW_PURE_FUNCTION static size_t 4211 pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { 4212 return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize); 4213 } 4214 4215 MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env, 4216 pgno_t pgno) { 4217 return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); 4218 } 4219 4220 MDBX_NOTHROW_PURE_FUNCTION static size_t 4221 bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { 4222 return ceil_powerof2(ceil_powerof2(bytes, env->me_psize), env->me_os_psize); 4223 } 4224 4225 /* Address of first usable data byte in a page, after the header */ 4226 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * 4227 page_data(const MDBX_page *mp) { 4228 return (char *)mp + PAGEHDRSZ; 4229 } 4230 4231 MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page * 4232 data_page(const void *data) { 4233 return container_of(data, MDBX_page, mp_ptrs); 4234 } 4235 4236 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_meta * 4237 page_meta(MDBX_page *mp) { 4238 return (MDBX_meta *)page_data(mp); 4239 } 4240 4241 /* Number of nodes on a page */ 4242 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned 4243 page_numkeys(const MDBX_page *mp) { 4244 return mp->mp_lower >> 1; 4245 } 4246 4247 /* The amount of space remaining in the page */ 4248 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned 4249 page_room(const MDBX_page *mp) { 4250 return mp->mp_upper - mp->mp_lower; 4251 } 4252 4253 /* Maximum free space in an empty page */ 4254 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned 4255 page_space(const MDBX_env *env) { 
4256 STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
4257 return env->me_psize - PAGEHDRSZ;
4258 }
4259
4260 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
4261 page_used(const MDBX_env *env, const MDBX_page *mp) {
4262 return page_space(env) - page_room(mp);
4263 }
4264
4265 /* The portion of the page that is used, expressed as a percentage. */
4266 MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double
4267 page_fill(const MDBX_env *env, const MDBX_page *mp) {
4268 return page_used(env, mp) * 100.0 / page_space(env);
4269 }
4270
4271 /* The number of large/overflow pages needed to store the given size. */
4272 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
4273 number_of_ovpages(const MDBX_env *env, size_t bytes) {
4274 return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
4275 }
4276
4277 __cold static const char *pagetype_caption(const uint8_t type,
4278 char buf4unknown[16]) {
4279 switch (type) {
4280 case P_BRANCH:
4281 return "branch";
4282 case P_LEAF:
4283 return "leaf";
4284 case P_LEAF | P_SUBP:
4285 return "subleaf";
4286 case P_LEAF | P_LEAF2:
4287 return "dupfixed-leaf";
4288 case P_LEAF | P_LEAF2 | P_SUBP:
4289 return "dupfixed-subleaf";
4290 case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY:
4291 return "dupfixed-subleaf.legacy-dirty";
4292 case P_OVERFLOW:
4293 return "large";
4294 default:
4295 snprintf(buf4unknown, 16, "unknown_0x%x", type);
4296 return buf4unknown;
4297 }
4298 }
4299
4300 __cold static __must_check_result int MDBX_PRINTF_ARGS(2, 3)
4301 bad_page(const MDBX_page *mp, const char *fmt, ...) {
4302 if (LOG_ENABLED(MDBX_LOG_ERROR)) {
4303 static const MDBX_page *prev;
4304 if (prev != mp) {
4305 char buf4unknown[16];
4306 prev = mp;
4307 debug_log(MDBX_LOG_ERROR, "badpage", 0,
4308 "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n",
4309 pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno,
4310 mp->mp_txnid);
4311 }
4312
4313 va_list args;
4314 va_start(args, fmt);
4315 debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args);
4316 va_end(args);
4317 }
4318 return MDBX_CORRUPTED;
4319 }
4320
4321 __cold static void MDBX_PRINTF_ARGS(2, 3)
4322 poor_page(const MDBX_page *mp, const char *fmt, ...) {
4323 if (LOG_ENABLED(MDBX_LOG_NOTICE)) {
4324 static const MDBX_page *prev;
4325 if (prev != mp) {
4326 char buf4unknown[16];
4327 prev = mp;
4328 debug_log(MDBX_LOG_NOTICE, "poorpage", 0,
4329 "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n",
4330 pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno,
4331 mp->mp_txnid);
4332 }
4333
4334 va_list args;
4335 va_start(args, fmt);
4336 debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args);
4337 va_end(args);
4338 }
4339 }
4340
4341 /* Address of node i in page p */
4342 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node *
4343 page_node(const MDBX_page *mp, unsigned i) {
4344 assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH);
4345 assert(page_numkeys(mp) > (unsigned)(i));
4346 assert(mp->mp_ptrs[i] % 2 == 0);
4347 return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ);
4348 }
4349
4350 /* The address of a key in a LEAF2 page.
4351 * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
4352 * There are no node headers, keys are stored contiguously.
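 *
 * For instance, with mp_leaf2_ksize == 8 the i-th key simply lives at
 *   (char *)mp + PAGEHDRSZ + i * 8,
 * so a lookup is plain pointer arithmetic without per-key node headers.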
*/ 4353 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * 4354 page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { 4355 assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); 4356 assert(mp->mp_leaf2_ksize == keysize); 4357 (void)keysize; 4358 return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); 4359 } 4360 4361 /* Set the node's key into keyptr. */ 4362 static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) { 4363 keyptr->iov_len = node_ks(node); 4364 keyptr->iov_base = node_key(node); 4365 } 4366 4367 /* Set the node's key into keyptr, if requested. */ 4368 static __always_inline void 4369 get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { 4370 if (keyptr) 4371 get_key(node, keyptr); 4372 } 4373 4374 /*------------------------------------------------------------------------------ 4375 * safe read/write volatile 64-bit fields on 32-bit architectures. */ 4376 4377 #ifndef atomic_store64 4378 MDBX_MAYBE_UNUSED static __always_inline uint64_t 4379 atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, 4380 enum MDBX_memory_order order) { 4381 STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); 4382 #if MDBX_64BIT_ATOMIC 4383 #ifdef MDBX_HAVE_C11ATOMICS 4384 assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); 4385 atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); 4386 #else /* MDBX_HAVE_C11ATOMICS */ 4387 if (order != mo_Relaxed) 4388 osal_compiler_barrier(); 4389 p->weak = value; 4390 osal_memory_fence(order, true); 4391 #endif /* MDBX_HAVE_C11ATOMICS */ 4392 #else /* !MDBX_64BIT_ATOMIC */ 4393 osal_compiler_barrier(); 4394 atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); 4395 jitter4testing(true); 4396 atomic_store32(&p->high, (uint32_t)(value >> 32), order); 4397 jitter4testing(true); 4398 #endif /* !MDBX_64BIT_ATOMIC */ 4399 return value; 4400 } 4401 #endif /* atomic_store64 */ 4402 4403 #ifndef atomic_load64 4404 MDBX_MAYBE_UNUSED static 4405 #if MDBX_64BIT_ATOMIC 4406 __always_inline 4407 #endif /* MDBX_64BIT_ATOMIC */ 4408 uint64_t 4409 atomic_load64(const volatile MDBX_atomic_uint64_t *p, 4410 enum MDBX_memory_order order) { 4411 STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); 4412 #if MDBX_64BIT_ATOMIC 4413 #ifdef MDBX_HAVE_C11ATOMICS 4414 assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); 4415 return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); 4416 #else /* MDBX_HAVE_C11ATOMICS */ 4417 osal_memory_fence(order, false); 4418 const uint64_t value = p->weak; 4419 if (order != mo_Relaxed) 4420 osal_compiler_barrier(); 4421 return value; 4422 #endif /* MDBX_HAVE_C11ATOMICS */ 4423 #else /* !MDBX_64BIT_ATOMIC */ 4424 osal_compiler_barrier(); 4425 uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; 4426 jitter4testing(true); 4427 value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed 4428 : mo_AcquireRelease); 4429 jitter4testing(true); 4430 for (;;) { 4431 osal_compiler_barrier(); 4432 uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; 4433 jitter4testing(true); 4434 again |= atomic_load32(&p->low, (order == mo_Relaxed) ? 
mo_Relaxed 4435 : mo_AcquireRelease); 4436 jitter4testing(true); 4437 if (likely(value == again)) 4438 return value; 4439 value = again; 4440 } 4441 #endif /* !MDBX_64BIT_ATOMIC */ 4442 } 4443 #endif /* atomic_load64 */ 4444 4445 static __always_inline void atomic_yield(void) { 4446 #if defined(_WIN32) || defined(_WIN64) 4447 YieldProcessor(); 4448 #elif defined(__ia32__) || defined(__e2k__) 4449 __builtin_ia32_pause(); 4450 #elif defined(__ia64__) 4451 #if defined(__HP_cc__) || defined(__HP_aCC__) 4452 _Asm_hint(_HINT_PAUSE); 4453 #else 4454 __asm__ __volatile__("hint @pause"); 4455 #endif 4456 #elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || \ 4457 defined(__ARM_ARCH_6K__) 4458 #ifdef __CC_ARM 4459 __yield(); 4460 #else 4461 __asm__ __volatile__("yield"); 4462 #endif 4463 #elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \ 4464 __mips_isa_rev >= 2 4465 __asm__ __volatile__("pause"); 4466 #elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ 4467 defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ 4468 defined(__MWERKS__) || defined(__sgi) 4469 __asm__ __volatile__(".word 0x00000140"); 4470 #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) 4471 sched_yield(); 4472 #elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) 4473 pthread_yield(); 4474 #endif 4475 } 4476 4477 #if MDBX_64BIT_CAS 4478 static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c, 4479 uint64_t v) { 4480 #ifdef MDBX_HAVE_C11ATOMICS 4481 STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); 4482 #ifdef ATOMIC_LLONG_LOCK_FREE 4483 STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); 4484 #if ATOMIC_LLONG_LOCK_FREE < 2 4485 assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); 4486 #endif /* ATOMIC_LLONG_LOCK_FREE < 2 */ 4487 #else /* defined(ATOMIC_LLONG_LOCK_FREE) */ 4488 assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); 4489 #endif 4490 return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v); 4491 #elif defined(__GNUC__) || defined(__clang__) 4492 return __sync_bool_compare_and_swap(&p->weak, c, v); 4493 #elif defined(_MSC_VER) 4494 return c == (uint64_t)_InterlockedCompareExchange64( 4495 (volatile __int64 *)&p->weak, v, c); 4496 #elif defined(__APPLE__) 4497 return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak); 4498 #else 4499 #error FIXME: Unsupported compiler 4500 #endif 4501 } 4502 #endif /* MDBX_64BIT_CAS */ 4503 4504 static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c, 4505 uint32_t v) { 4506 #ifdef MDBX_HAVE_C11ATOMICS 4507 STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); 4508 #ifdef ATOMIC_INT_LOCK_FREE 4509 STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); 4510 #if ATOMIC_INT_LOCK_FREE < 2 4511 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); 4512 #endif 4513 #else 4514 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); 4515 #endif 4516 return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v); 4517 #elif defined(__GNUC__) || defined(__clang__) 4518 return __sync_bool_compare_and_swap(&p->weak, c, v); 4519 #elif defined(_MSC_VER) 4520 STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); 4521 return c == 4522 (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c); 4523 #elif defined(__APPLE__) 4524 return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak); 4525 #else 4526 #error FIXME: Unsupported compiler 4527 #endif 4528 } 4529 4530 static __always_inline uint32_t 
atomic_add32(MDBX_atomic_uint32_t *p, uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
#ifdef ATOMIC_INT_LOCK_FREE
  STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0);
#if ATOMIC_INT_LOCK_FREE < 2
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
#else
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
  return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_fetch_and_add(&p->weak, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
#elif defined(__APPLE__)
  return OSAtomicAdd32Barrier(v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}

#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))

static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) {
  txnid += xMDBX_TXNID_STEP;
#if !MDBX_64BIT_CAS
  /* avoid overflow of the low part in safe64_reset() */
  txnid += (UINT32_MAX == (uint32_t)txnid);
#endif
  return txnid;
}

/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */
static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p,
                                         bool single_writer) {
  if (single_writer) {
#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#else
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */
  } else {
#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
    atomic_store64(p, UINT64_MAX, mo_AcquireRelease);
#elif MDBX_64BIT_CAS
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
#else
    /* it is safe to increment the low part to avoid ABA, since
     * xMDBX_TXNID_STEP > 1 and low-part overflow was already taken care of
     * in safe64_txnid_next() */
    STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
    atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease);
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */
  }
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  jitter4testing(true);
}

static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p,
                                                 txnid_t compare) {
  /* LY: This function is used to reset `mr_txnid` from the hsr-handler in
   * case of asynchronous cancellation of a read transaction. Therefore,
   * there may be a collision between the cleanup performed here and the
   * asynchronous termination and restarting of the read transaction
   * in another process/thread. In general we MUST NOT reset the `mr_txnid`
   * if a new transaction was started (i.e. if `mr_txnid` was changed). */
#if MDBX_64BIT_CAS
  bool rc = atomic_cas64(p, compare, UINT64_MAX);
#else
  /* LY: There is no golden ratio here, since a shared mutex would be too
   * costly: that way we would have to acquire/release it for every update
   * of mr_txnid, i.e. twice for each read transaction.
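   *
   * For contrast (example only), the MDBX_64BIT_CAS branch above performs
   * the whole reset as a single conditional exchange:
   *
   *   rc = atomic_cas64(p, compare, UINT64_MAX);
   *
   * while the 32-bit fallback below emulates the same "replace only if the
   * slot still equals `compare`" contract with two halves: CAS the high
   * part, then re-check the low part and roll the high part back if a
   * racing writer slipped in between.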
*/ 4609 bool rc = false; 4610 if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare && 4611 atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) { 4612 if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) != 4613 (uint32_t)compare)) 4614 atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32)); 4615 else 4616 rc = true; 4617 } 4618 #endif /* MDBX_64BIT_CAS */ 4619 jitter4testing(true); 4620 return rc; 4621 } 4622 4623 static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, 4624 const uint64_t v) { 4625 assert(p->weak >= SAFE64_INVALID_THRESHOLD); 4626 #if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS 4627 atomic_store64(p, v, mo_AcquireRelease); 4628 #else /* MDBX_64BIT_ATOMIC */ 4629 osal_compiler_barrier(); 4630 /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ 4631 atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); 4632 assert(p->weak >= SAFE64_INVALID_THRESHOLD); 4633 jitter4testing(true); 4634 /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ 4635 atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); 4636 #endif /* MDBX_64BIT_ATOMIC */ 4637 assert(p->weak == v); 4638 jitter4testing(true); 4639 } 4640 4641 static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { 4642 jitter4testing(true); 4643 uint64_t v; 4644 do 4645 v = atomic_load64(p, mo_AcquireRelease); 4646 while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); 4647 return v; 4648 } 4649 4650 #if 0 /* unused for now */ 4651 MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) { 4652 #if MDBX_WORDBITS >= 64 4653 return v < SAFE64_INVALID_THRESHOLD; 4654 #else 4655 return (v >> 32) != UINT32_MAX; 4656 #endif /* MDBX_WORDBITS */ 4657 } 4658 4659 MDBX_MAYBE_UNUSED static __always_inline bool 4660 safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) { 4661 #if MDBX_64BIT_ATOMIC 4662 return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; 4663 #else 4664 return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX; 4665 #endif /* MDBX_64BIT_ATOMIC */ 4666 } 4667 #endif /* unused for now */ 4668 4669 /* non-atomic write with safety for reading a half-updated value */ 4670 static __always_inline void safe64_update(MDBX_atomic_uint64_t *p, 4671 const uint64_t v) { 4672 #if MDBX_64BIT_ATOMIC 4673 atomic_store64(p, v, mo_Relaxed); 4674 #else 4675 safe64_reset(p, true); 4676 safe64_write(p, v); 4677 #endif /* MDBX_64BIT_ATOMIC */ 4678 } 4679 4680 /* non-atomic increment with safety for reading a half-updated value */ 4681 MDBX_MAYBE_UNUSED static 4682 #if MDBX_64BIT_ATOMIC 4683 __always_inline 4684 #endif /* MDBX_64BIT_ATOMIC */ 4685 void 4686 safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { 4687 assert(v > 0); 4688 safe64_update(p, safe64_read(p) + v); 4689 } 4690 4691 /*----------------------------------------------------------------------------*/ 4692 /* rthc (tls keys and destructors) */ 4693 4694 typedef struct rthc_entry_t { 4695 MDBX_reader *begin; 4696 MDBX_reader *end; 4697 osal_thread_key_t thr_tls_key; 4698 } rthc_entry_t; 4699 4700 #if MDBX_DEBUG 4701 #define RTHC_INITIAL_LIMIT 1 4702 #else 4703 #define RTHC_INITIAL_LIMIT 16 4704 #endif 4705 4706 static bin128_t bootid; 4707 4708 #if defined(_WIN32) || defined(_WIN64) 4709 static CRITICAL_SECTION rthc_critical_section; 4710 static CRITICAL_SECTION lcklist_critical_section; 4711 #else 4712 4713 static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; 4714 static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; 4715 
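/* A minimal usage sketch (example only, not part of the build) for the
 * safe64_* helpers defined earlier in this file: a writer first invalidates
 * the slot, then publishes the new value, while on 32-bit targets readers
 * re-read until they observe a consistent, untorn value. */
#if 0 /* example only */
static void safe64_publish_example(MDBX_atomic_uint64_t *slot,
                                   uint64_t txnid) {
  safe64_reset(slot, true /* single writer */); /* slot becomes "invalid" */
  safe64_write(slot, txnid); /* low half first, then the high half */
}

static uint64_t safe64_snapshot_example(const MDBX_atomic_uint64_t *slot) {
  return safe64_read(slot); /* loops until a torn read is ruled out */
}
#endif /* example only */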
static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; 4716 static osal_thread_key_t rthc_key; 4717 static MDBX_atomic_uint32_t rthc_pending; 4718 4719 static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { 4720 uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ 4721 UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); 4722 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 4723 return salt << 8 | kind; 4724 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 4725 return (uint64_t)kind << 56 | salt >> 8; 4726 #else 4727 #error "FIXME: Unsupported byte order" 4728 #endif /* __BYTE_ORDER__ */ 4729 } 4730 4731 #define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) 4732 #define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) 4733 static __thread uint64_t rthc_thread_state; 4734 4735 #if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ 4736 !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) 4737 /* Avoid ASAN-trap due the target TLS-variable feed by Darwin's tlv_free() */ 4738 #define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS \ 4739 __attribute__((__no_sanitize_address__, __noinline__)) 4740 #else 4741 #define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS __inline 4742 #endif 4743 4744 MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) { 4745 return *(volatile uint64_t *)rthc; 4746 } 4747 4748 MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t 4749 rthc_compare_and_clean(const void *rthc, const uint64_t signature) { 4750 #if MDBX_64BIT_CAS 4751 return atomic_cas64((MDBX_atomic_uint64_t *)rthc, signature, 0); 4752 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 4753 return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)signature, 0); 4754 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 4755 return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32), 4756 0); 4757 #else 4758 #error "FIXME: Unsupported byte order" 4759 #endif 4760 } 4761 4762 static __inline int rthc_atexit(void (*dtor)(void *), void *obj, 4763 void *dso_symbol) { 4764 #ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 4765 #if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ 4766 defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ 4767 defined(ANDROID) 4768 #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 4769 #else 4770 #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 4771 #endif 4772 #endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */ 4773 4774 #ifndef MDBX_HAVE_CXA_THREAD_ATEXIT 4775 #if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) || \ 4776 defined(HAVE___CXA_THREAD_ATEXIT) 4777 #define MDBX_HAVE_CXA_THREAD_ATEXIT 1 4778 #elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && \ 4779 (defined(__linux__) || defined(__gnu_linux__)) 4780 #define MDBX_HAVE_CXA_THREAD_ATEXIT 1 4781 #else 4782 #define MDBX_HAVE_CXA_THREAD_ATEXIT 0 4783 #endif 4784 #endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */ 4785 4786 int rc = MDBX_ENOSYS; 4787 #if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT 4788 #define __cxa_thread_atexit __cxa_thread_atexit_impl 4789 #endif 4790 #if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit) 4791 extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj, 4792 void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE; 4793 if (&__cxa_thread_atexit) 4794 rc = __cxa_thread_atexit(dtor, obj, dso_symbol); 4795 #elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE) 4796 extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr) 4797 MDBX_WEAK_IMPORT_ATTRIBUTE; 4798 if (&_tlv_atexit) { 4799 (void)dso_symbol; 4800 _tlv_atexit(dtor, obj); 4801 
    rc = 0;
  }
#else
  (void)dtor;
  (void)obj;
  (void)dso_symbol;
#endif
  return rc;
}

__cold static void workaround_glibc_bug21031(void) {
  /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
   *
   * Due to a race between pthread_key_delete() and __nptl_deallocate_tsd(),
   * the destructor(s) of thread-local-storage object(s) may still be
   * running in other thread(s), blocked or not yet finished.
   * In such a case we would get a SEGFAULT after unloading this library's
   * DSO.
   *
   * So, just by yielding a few timeslices, we give such destructor(s) a
   * chance to complete and thereby avoid the segfault. */
  sched_yield();
  sched_yield();
  sched_yield();
}
#endif

static unsigned rthc_count, rthc_limit;
static rthc_entry_t *rthc_table;
static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];

static __inline void rthc_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0);
#endif
}

static __inline void rthc_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&rthc_critical_section);
#else
  ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
#endif
}

static __inline int thread_key_create(osal_thread_key_t *key) {
  int rc;
#if defined(_WIN32) || defined(_WIN64)
  *key = TlsAlloc();
  rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
#else
  rc = pthread_key_create(key, nullptr);
#endif
  TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key),
        (uintptr_t)*key, rc);
  return rc;
}

static __inline void thread_key_delete(osal_thread_key_t key) {
  TRACE("key = %" PRIuPTR, (uintptr_t)key);
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsFree(key));
#else
  ENSURE(nullptr, pthread_key_delete(key) == 0);
  workaround_glibc_bug21031();
#endif
}

static __inline void *thread_rthc_get(osal_thread_key_t key) {
#if defined(_WIN32) || defined(_WIN64)
  return TlsGetValue(key);
#else
  return pthread_getspecific(key);
#endif
}

static void thread_rthc_set(osal_thread_key_t key, const void *value) {
#if defined(_WIN32) || defined(_WIN64)
  ENSURE(nullptr, TlsSetValue(key, (void *)value));
#else
  const uint64_t sign_registered =
      MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state);
  const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state);
  if (value && unlikely(rthc_thread_state != sign_registered &&
                        rthc_thread_state != sign_counted)) {
    rthc_thread_state = sign_registered;
    TRACE("thread registered 0x%" PRIxPTR, osal_thread_self());
    if (rthc_atexit(thread_dtor, &rthc_thread_state,
                    (void *)&mdbx_version /* dso_anchor */)) {
      ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0);
      rthc_thread_state = sign_counted;
      const unsigned count_before = atomic_add32(&rthc_pending, 1);
      ENSURE(nullptr, count_before < INT_MAX);
      NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
             (uintptr_t)rthc_key, count_before);
      (void)count_before;
    }
  }
  ENSURE(nullptr, pthread_setspecific(key, value) == 0);
#endif
}

/* dtor called for thread, i.e.
for all mdbx's environment objects */ 4905 __cold void thread_dtor(void *rthc) { 4906 rthc_lock(); 4907 TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), 4908 osal_thread_self(), rthc); 4909 4910 const uint32_t self_pid = osal_getpid(); 4911 for (unsigned i = 0; i < rthc_count; ++i) { 4912 const osal_thread_key_t key = rthc_table[i].thr_tls_key; 4913 MDBX_reader *const reader = thread_rthc_get(key); 4914 if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) 4915 continue; 4916 #if !defined(_WIN32) && !defined(_WIN64) 4917 if (pthread_setspecific(key, nullptr) != 0) { 4918 TRACE("== thread 0x%" PRIxPTR 4919 ", rthc %p: ignore race with tsd-key deletion", 4920 osal_thread_self(), __Wpedantic_format_voidptr(reader)); 4921 continue /* ignore race with tsd-key deletion by mdbx_env_close() */; 4922 } 4923 #endif 4924 4925 TRACE("== thread 0x%" PRIxPTR 4926 ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " 4927 "current-pid %i", 4928 osal_thread_self(), __Wpedantic_format_voidptr(reader), i, 4929 __Wpedantic_format_voidptr(rthc_table[i].begin), 4930 __Wpedantic_format_voidptr(rthc_table[i].end), 4931 (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); 4932 if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { 4933 TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), 4934 __Wpedantic_format_voidptr(reader)); 4935 atomic_cas32(&reader->mr_pid, self_pid, 0); 4936 } 4937 } 4938 4939 #if defined(_WIN32) || defined(_WIN64) 4940 TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); 4941 rthc_unlock(); 4942 #else 4943 const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); 4944 const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); 4945 const uint64_t state = rthc_read(rthc); 4946 if (state == sign_registered && 4947 rthc_compare_and_clean(rthc, sign_registered)) { 4948 TRACE("== thread 0x%" PRIxPTR 4949 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 4950 osal_thread_self(), rthc, osal_getpid(), "registered", state); 4951 } else if (state == sign_counted && 4952 rthc_compare_and_clean(rthc, sign_counted)) { 4953 TRACE("== thread 0x%" PRIxPTR 4954 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 4955 osal_thread_self(), rthc, osal_getpid(), "counted", state); 4956 ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); 4957 } else { 4958 WARNING("thread 0x%" PRIxPTR 4959 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 4960 osal_thread_self(), rthc, osal_getpid(), "wrong", state); 4961 } 4962 4963 if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { 4964 TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), 4965 rthc, osal_getpid()); 4966 ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); 4967 } 4968 4969 TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); 4970 /* Allow tail call optimization, i.e. gcc should generate the jmp instruction 4971 * instead of a call for pthread_mutex_unlock() and therefore CPU could not 4972 * return to current DSO's code section, which may be unloaded immediately 4973 * after the mutex got released. 
*/ 4974 pthread_mutex_unlock(&rthc_mutex); 4975 #endif 4976 } 4977 4978 MDBX_EXCLUDE_FOR_GPROF 4979 __cold void global_dtor(void) { 4980 TRACE(">> pid %d", osal_getpid()); 4981 4982 rthc_lock(); 4983 #if !defined(_WIN32) && !defined(_WIN64) 4984 uint64_t *rthc = pthread_getspecific(rthc_key); 4985 TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 4986 ", left %d", 4987 osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), 4988 rthc ? rthc_read(rthc) : ~UINT64_C(0), 4989 atomic_load32(&rthc_pending, mo_Relaxed)); 4990 if (rthc) { 4991 const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); 4992 const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); 4993 const uint64_t state = rthc_read(rthc); 4994 if (state == sign_registered && 4995 rthc_compare_and_clean(rthc, sign_registered)) { 4996 TRACE("== thread 0x%" PRIxPTR 4997 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 4998 osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), 4999 "registered", state); 5000 } else if (state == sign_counted && 5001 rthc_compare_and_clean(rthc, sign_counted)) { 5002 TRACE("== thread 0x%" PRIxPTR 5003 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 5004 osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), 5005 "counted", state); 5006 ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); 5007 } else { 5008 WARNING("thread 0x%" PRIxPTR 5009 ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", 5010 osal_thread_self(), __Wpedantic_format_voidptr(rthc), 5011 osal_getpid(), "wrong", state); 5012 } 5013 } 5014 5015 struct timespec abstime; 5016 ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); 5017 abstime.tv_nsec += 1000000000l / 10; 5018 if (abstime.tv_nsec >= 1000000000l) { 5019 abstime.tv_nsec -= 1000000000l; 5020 abstime.tv_sec += 1; 5021 } 5022 #if MDBX_DEBUG > 0 5023 abstime.tv_sec += 600; 5024 #endif 5025 5026 for (unsigned left; 5027 (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { 5028 NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); 5029 const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); 5030 if (rc && rc != EINTR) 5031 break; 5032 } 5033 thread_key_delete(rthc_key); 5034 #endif 5035 5036 const uint32_t self_pid = osal_getpid(); 5037 for (unsigned i = 0; i < rthc_count; ++i) { 5038 const osal_thread_key_t key = rthc_table[i].thr_tls_key; 5039 thread_key_delete(key); 5040 for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; 5041 ++rthc) { 5042 TRACE("== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " 5043 "rthc-pid %i, current-pid %i", 5044 i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), 5045 __Wpedantic_format_voidptr(rthc_table[i].end), 5046 __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), 5047 rthc->mr_pid.weak, self_pid); 5048 if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { 5049 atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); 5050 TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); 5051 } 5052 } 5053 } 5054 5055 rthc_limit = rthc_count = 0; 5056 if (rthc_table != rthc_table_static) 5057 osal_free(rthc_table); 5058 rthc_table = nullptr; 5059 rthc_unlock(); 5060 5061 #if defined(_WIN32) || defined(_WIN64) 5062 DeleteCriticalSection(&lcklist_critical_section); 5063 DeleteCriticalSection(&rthc_critical_section); 5064 #else 5065 /* LY: yielding a few timeslices to give a more chance 5066 * to racing destructor(s) for completion. 
*/ 5067 workaround_glibc_bug21031(); 5068 #endif 5069 5070 TRACE("<< pid %d\n", osal_getpid()); 5071 } 5072 5073 __cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, 5074 MDBX_reader *end) { 5075 assert(pkey != NULL); 5076 #ifndef NDEBUG 5077 *pkey = (osal_thread_key_t)0xBADBADBAD; 5078 #endif /* NDEBUG */ 5079 5080 rthc_lock(); 5081 TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); 5082 int rc; 5083 if (rthc_count == rthc_limit) { 5084 rthc_entry_t *new_table = 5085 osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, 5086 sizeof(rthc_entry_t) * rthc_limit * 2); 5087 if (new_table == nullptr) { 5088 rc = MDBX_ENOMEM; 5089 goto bailout; 5090 } 5091 if (rthc_table == rthc_table_static) 5092 memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); 5093 rthc_table = new_table; 5094 rthc_limit *= 2; 5095 } 5096 5097 rc = thread_key_create(&rthc_table[rthc_count].thr_tls_key); 5098 if (rc != MDBX_SUCCESS) 5099 goto bailout; 5100 5101 *pkey = rthc_table[rthc_count].thr_tls_key; 5102 TRACE("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, (uintptr_t)*pkey, 5103 __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); 5104 5105 rthc_table[rthc_count].begin = begin; 5106 rthc_table[rthc_count].end = end; 5107 ++rthc_count; 5108 TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, 5109 rthc_count, rthc_limit); 5110 rthc_unlock(); 5111 return MDBX_SUCCESS; 5112 5113 bailout: 5114 rthc_unlock(); 5115 return rc; 5116 } 5117 5118 __cold void rthc_remove(const osal_thread_key_t key) { 5119 thread_key_delete(key); 5120 rthc_lock(); 5121 TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, 5122 rthc_limit); 5123 5124 for (unsigned i = 0; i < rthc_count; ++i) { 5125 if (key == rthc_table[i].thr_tls_key) { 5126 const uint32_t self_pid = osal_getpid(); 5127 TRACE("== [%i], %p ...%p, current-pid %d", i, 5128 __Wpedantic_format_voidptr(rthc_table[i].begin), 5129 __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); 5130 5131 for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; 5132 ++rthc) { 5133 if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { 5134 atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); 5135 TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); 5136 } 5137 } 5138 if (--rthc_count > 0) 5139 rthc_table[i] = rthc_table[rthc_count]; 5140 else if (rthc_table != rthc_table_static) { 5141 osal_free(rthc_table); 5142 rthc_table = rthc_table_static; 5143 rthc_limit = RTHC_INITIAL_LIMIT; 5144 } 5145 break; 5146 } 5147 } 5148 5149 TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, 5150 rthc_limit); 5151 rthc_unlock(); 5152 } 5153 5154 //------------------------------------------------------------------------------ 5155 5156 #define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459)) 5157 static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END; 5158 5159 static __inline void lcklist_lock(void) { 5160 #if defined(_WIN32) || defined(_WIN64) 5161 EnterCriticalSection(&lcklist_critical_section); 5162 #else 5163 ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); 5164 #endif 5165 } 5166 5167 static __inline void lcklist_unlock(void) { 5168 #if defined(_WIN32) || defined(_WIN64) 5169 LeaveCriticalSection(&lcklist_critical_section); 5170 #else 5171 ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); 5172 #endif 5173 } 5174 5175 MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { 5176 /* Pelle 
Evensen's mixer, https://bit.ly/2HOfynt */ 5177 v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); 5178 v *= UINT64_C(0xA24BAED4963EE407); 5179 v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49); 5180 v *= UINT64_C(0x9FB21C651E98DF25); 5181 return v ^ v >> 28; 5182 } 5183 5184 static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { 5185 int rc; 5186 uint64_t bait; 5187 MDBX_lockinfo *const pending_lck = pending->lck; 5188 MDBX_lockinfo *const scan_lck = scan->lck; 5189 if (pending_lck) { 5190 bait = atomic_load64(&pending_lck->mti_bait_uniqueness, mo_AcquireRelease); 5191 rc = MDBX_SUCCESS; 5192 } else { 5193 bait = 0 /* hush MSVC warning */; 5194 rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); 5195 if (rc == MDBX_SUCCESS) 5196 rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), 5197 offsetof(MDBX_lockinfo, mti_bait_uniqueness)); 5198 } 5199 if (likely(rc == MDBX_SUCCESS) && 5200 bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) 5201 rc = MDBX_RESULT_TRUE; 5202 5203 TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", 5204 pending_lck ? "mem" : "file", bait, 5205 (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); 5206 return rc; 5207 } 5208 5209 static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, 5210 uint64_t *abra) { 5211 if (*abra == 0) { 5212 const uintptr_t tid = osal_thread_self(); 5213 uintptr_t uit = 0; 5214 memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); 5215 *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); 5216 } 5217 const uint64_t cadabra = 5218 rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) 5219 << 24 | 5220 *abra >> 40; 5221 MDBX_lockinfo *const scan_lck = scan->lck; 5222 atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease); 5223 *abra = *abra * UINT64_C(6364136223846793005) + 1; 5224 return uniq_peek(pending, scan); 5225 } 5226 5227 __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { 5228 *found = nullptr; 5229 uint64_t salt = 0; 5230 for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; 5231 scan = scan->me_lcklist_next) { 5232 MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck; 5233 int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease) 5234 ? uniq_peek(pending, &scan->me_lck_mmap) 5235 : uniq_poke(pending, &scan->me_lck_mmap, &salt); 5236 if (err == MDBX_ENODATA) { 5237 uint64_t length; 5238 if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && 5239 length == 0)) { 5240 /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ 5241 DEBUG("uniq-probe: %s", "unique (new/empty lck)"); 5242 return MDBX_RESULT_TRUE; 5243 } 5244 } 5245 if (err == MDBX_RESULT_TRUE) 5246 err = uniq_poke(pending, &scan->me_lck_mmap, &salt); 5247 if (err == MDBX_RESULT_TRUE) { 5248 (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), 5249 MDBX_SYNC_NONE); 5250 err = uniq_poke(pending, &scan->me_lck_mmap, &salt); 5251 } 5252 if (err == MDBX_RESULT_TRUE) { 5253 err = uniq_poke(pending, &scan->me_lck_mmap, &salt); 5254 *found = scan; 5255 DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); 5256 return MDBX_RESULT_FALSE; 5257 } 5258 if (unlikely(err != MDBX_SUCCESS)) { 5259 DEBUG("uniq-probe: failed rc %d", err); 5260 return err; 5261 } 5262 } 5263 5264 DEBUG("uniq-probe: %s", "unique"); 5265 return MDBX_RESULT_TRUE; 5266 } 5267 5268 static int lcklist_detach_locked(MDBX_env *env) { 5269 MDBX_env *inprocess_neighbor = nullptr; 5270 int rc = MDBX_SUCCESS; 5271 if (env->me_lcklist_next != nullptr) { 5272 ENSURE(env, env->me_lcklist_next != nullptr); 5273 ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); 5274 for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; 5275 ptr = &(*ptr)->me_lcklist_next) { 5276 if (*ptr == env) { 5277 *ptr = env->me_lcklist_next; 5278 env->me_lcklist_next = nullptr; 5279 break; 5280 } 5281 } 5282 ENSURE(env, env->me_lcklist_next == nullptr); 5283 } 5284 5285 rc = likely(osal_getpid() == env->me_pid) 5286 ? uniq_check(&env->me_lck_mmap, &inprocess_neighbor) 5287 : MDBX_PANIC; 5288 if (!inprocess_neighbor && env->me_live_reader) 5289 (void)osal_rpid_clear(env); 5290 if (!MDBX_IS_ERROR(rc)) 5291 rc = osal_lck_destroy(env, inprocess_neighbor); 5292 return rc; 5293 } 5294 5295 /*------------------------------------------------------------------------------ 5296 * LY: State of the art quicksort-based sorting, with internal stack 5297 * and network-sort for small chunks. 5298 * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ 5299 5300 #if MDBX_HAVE_CMOV 5301 #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ 5302 do { \ 5303 const TYPE swap_tmp = (a); \ 5304 const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ 5305 (a) = swap_cmp ? swap_tmp : b; \ 5306 (b) = swap_cmp ? 
b : swap_tmp; \ 5307 } while (0) 5308 #else 5309 #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ 5310 do \ 5311 if (expect_with_probability(!CMP(a, b), 0, .5)) { \ 5312 const TYPE swap_tmp = (a); \ 5313 (a) = (b); \ 5314 (b) = swap_tmp; \ 5315 } \ 5316 while (0) 5317 #endif 5318 5319 // 3 comparators, 3 parallel operations 5320 // o-----^--^--o 5321 // | | 5322 // o--^--|--v--o 5323 // | | 5324 // o--v--v-----o 5325 // 5326 // [[1,2]] 5327 // [[0,2]] 5328 // [[0,1]] 5329 #define SORT_NETWORK_3(TYPE, CMP, begin) \ 5330 do { \ 5331 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5332 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5333 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5334 } while (0) 5335 5336 // 5 comparators, 3 parallel operations 5337 // o--^--^--------o 5338 // | | 5339 // o--v--|--^--^--o 5340 // | | | 5341 // o--^--v--|--v--o 5342 // | | 5343 // o--v-----v-----o 5344 // 5345 // [[0,1],[2,3]] 5346 // [[0,2],[1,3]] 5347 // [[1,2]] 5348 #define SORT_NETWORK_4(TYPE, CMP, begin) \ 5349 do { \ 5350 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5351 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ 5352 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5353 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ 5354 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5355 } while (0) 5356 5357 // 9 comparators, 5 parallel operations 5358 // o--^--^-----^-----------o 5359 // | | | 5360 // o--|--|--^--v-----^--^--o 5361 // | | | | | 5362 // o--|--v--|--^--^--|--v--o 5363 // | | | | | 5364 // o--|-----v--|--v--|--^--o 5365 // | | | | 5366 // o--v--------v-----v--v--o 5367 // 5368 // [[0,4],[1,3]] 5369 // [[0,2]] 5370 // [[2,4],[0,1]] 5371 // [[2,3],[1,4]] 5372 // [[1,2],[3,4]] 5373 #define SORT_NETWORK_5(TYPE, CMP, begin) \ 5374 do { \ 5375 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ 5376 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ 5377 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5378 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ 5379 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5380 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ 5381 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ 5382 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5383 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ 5384 } while (0) 5385 5386 // 12 comparators, 6 parallel operations 5387 // o-----^--^--^-----------------o 5388 // | | | 5389 // o--^--|--v--|--^--------^-----o 5390 // | | | | | 5391 // o--v--v-----|--|--^--^--|--^--o 5392 // | | | | | | 5393 // o-----^--^--v--|--|--|--v--v--o 5394 // | | | | | 5395 // o--^--|--v-----v--|--v--------o 5396 // | | | 5397 // o--v--v-----------v-----------o 5398 // 5399 // [[1,2],[4,5]] 5400 // [[0,2],[3,5]] 5401 // [[0,1],[3,4],[2,5]] 5402 // [[0,3],[1,4]] 5403 // [[2,4],[1,3]] 5404 // [[2,3]] 5405 #define SORT_NETWORK_6(TYPE, CMP, begin) \ 5406 do { \ 5407 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5408 SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ 5409 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5410 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ 5411 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5412 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ 5413 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ 5414 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ 5415 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ 5416 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ 5417 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ 5418 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ 5419 } while (0) 5420 5421 // 16 comparators, 6 parallel operations 5422 // 
o--^--------^-----^-----------------o 5423 // | | | 5424 // o--|--^-----|--^--v--------^--^-----o 5425 // | | | | | | 5426 // o--|--|--^--v--|--^-----^--|--v-----o 5427 // | | | | | | | 5428 // o--|--|--|-----v--|--^--v--|--^--^--o 5429 // | | | | | | | | 5430 // o--v--|--|--^-----v--|--^--v--|--v--o 5431 // | | | | | | 5432 // o-----v--|--|--------v--v-----|--^--o 5433 // | | | | 5434 // o--------v--v-----------------v--v--o 5435 // 5436 // [[0,4],[1,5],[2,6]] 5437 // [[0,2],[1,3],[4,6]] 5438 // [[2,4],[3,5],[0,1]] 5439 // [[2,3],[4,5]] 5440 // [[1,4],[3,6]] 5441 // [[1,2],[3,4],[5,6]] 5442 #define SORT_NETWORK_7(TYPE, CMP, begin) \ 5443 do { \ 5444 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ 5445 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ 5446 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ 5447 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5448 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ 5449 SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ 5450 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ 5451 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ 5452 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5453 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ 5454 SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ 5455 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ 5456 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ 5457 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5458 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ 5459 SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ 5460 } while (0) 5461 5462 // 19 comparators, 6 parallel operations 5463 // o--^--------^-----^-----------------o 5464 // | | | 5465 // o--|--^-----|--^--v--------^--^-----o 5466 // | | | | | | 5467 // o--|--|--^--v--|--^-----^--|--v-----o 5468 // | | | | | | | 5469 // o--|--|--|--^--v--|--^--v--|--^--^--o 5470 // | | | | | | | | | 5471 // o--v--|--|--|--^--v--|--^--v--|--v--o 5472 // | | | | | | | 5473 // o-----v--|--|--|--^--v--v-----|--^--o 5474 // | | | | | | 5475 // o--------v--|--v--|--^--------v--v--o 5476 // | | | 5477 // o-----------v-----v--v--------------o 5478 // 5479 // [[0,4],[1,5],[2,6],[3,7]] 5480 // [[0,2],[1,3],[4,6],[5,7]] 5481 // [[2,4],[3,5],[0,1],[6,7]] 5482 // [[2,3],[4,5]] 5483 // [[1,4],[3,6]] 5484 // [[1,2],[3,4],[5,6]] 5485 #define SORT_NETWORK_8(TYPE, CMP, begin) \ 5486 do { \ 5487 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ 5488 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ 5489 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ 5490 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ 5491 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ 5492 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ 5493 SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ 5494 SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ 5495 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ 5496 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ 5497 SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5498 SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ 5499 SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ 5500 SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ 5501 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ 5502 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ 5503 SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ 5504 SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ 5505 SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ 5506 } while (0) 5507 5508 #define SORT_INNER(TYPE, CMP, begin, end, len) \ 5509 switch (len) { \ 5510 default: \ 5511 assert(false); \ 5512 __unreachable(); \ 5513 case 0: \ 5514 case 1: \ 5515 break; \ 5516 case 2: \ 5517 
SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ 5518 break; \ 5519 case 3: \ 5520 SORT_NETWORK_3(TYPE, CMP, begin); \ 5521 break; \ 5522 case 4: \ 5523 SORT_NETWORK_4(TYPE, CMP, begin); \ 5524 break; \ 5525 case 5: \ 5526 SORT_NETWORK_5(TYPE, CMP, begin); \ 5527 break; \ 5528 case 6: \ 5529 SORT_NETWORK_6(TYPE, CMP, begin); \ 5530 break; \ 5531 case 7: \ 5532 SORT_NETWORK_7(TYPE, CMP, begin); \ 5533 break; \ 5534 case 8: \ 5535 SORT_NETWORK_8(TYPE, CMP, begin); \ 5536 break; \ 5537 } 5538 5539 #define SORT_SWAP(TYPE, a, b) \ 5540 do { \ 5541 const TYPE swap_tmp = (a); \ 5542 (a) = (b); \ 5543 (b) = swap_tmp; \ 5544 } while (0) 5545 5546 #define SORT_PUSH(low, high) \ 5547 do { \ 5548 top->lo = (low); \ 5549 top->hi = (high); \ 5550 ++top; \ 5551 } while (0) 5552 5553 #define SORT_POP(low, high) \ 5554 do { \ 5555 --top; \ 5556 low = top->lo; \ 5557 high = top->hi; \ 5558 } while (0) 5559 5560 #define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \ 5561 \ 5562 static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ 5563 while (++first <= last) \ 5564 if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ 5565 return false; \ 5566 return true; \ 5567 } \ 5568 \ 5569 typedef struct { \ 5570 TYPE *lo, *hi; \ 5571 } NAME##_stack; \ 5572 \ 5573 __hot static void NAME(TYPE *const __restrict begin, \ 5574 TYPE *const __restrict end) { \ 5575 NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *__restrict top = stack; \ 5576 \ 5577 TYPE *__restrict hi = end - 1; \ 5578 TYPE *__restrict lo = begin; \ 5579 while (true) { \ 5580 const ptrdiff_t len = hi - lo; \ 5581 if (len < 8) { \ 5582 SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ 5583 if (unlikely(top == stack)) \ 5584 break; \ 5585 SORT_POP(lo, hi); \ 5586 continue; \ 5587 } \ 5588 \ 5589 TYPE *__restrict mid = lo + (len >> 1); \ 5590 SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ 5591 SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ 5592 SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ 5593 \ 5594 TYPE *right = hi - 1; \ 5595 TYPE *left = lo + 1; \ 5596 while (1) { \ 5597 while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ 5598 ++left; \ 5599 while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ 5600 --right; \ 5601 if (unlikely(left > right)) { \ 5602 if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ 5603 if (NAME##_is_sorted(lo, right)) \ 5604 lo = right + 1; \ 5605 if (NAME##_is_sorted(left, hi)) \ 5606 hi = left; \ 5607 } \ 5608 break; \ 5609 } \ 5610 SORT_SWAP(TYPE, *left, *right); \ 5611 mid = (mid == left) ? right : (mid == right) ? 
left : mid; \ 5612 ++left; \ 5613 --right; \ 5614 } \ 5615 \ 5616 if (right - lo > hi - left) { \ 5617 SORT_PUSH(lo, right); \ 5618 lo = left; \ 5619 } else { \ 5620 SORT_PUSH(left, hi); \ 5621 hi = right; \ 5622 } \ 5623 } \ 5624 \ 5625 if (AUDIT_ENABLED()) { \ 5626 for (TYPE *scan = begin + 1; scan < end; ++scan) \ 5627 assert(CMP(scan[-1], scan[0])); \ 5628 } \ 5629 } 5630 5631 /*------------------------------------------------------------------------------ 5632 * LY: radix sort for large chunks */ 5633 5634 #define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ 5635 \ 5636 __hot static bool NAME##_radixsort(TYPE *const begin, \ 5637 const unsigned length) { \ 5638 TYPE *tmp; \ 5639 if (BUFFER_PREALLOCATED) { \ 5640 tmp = begin + length + END_GAP; \ 5641 /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ 5642 } else { \ 5643 tmp = osal_malloc(sizeof(TYPE) * length); \ 5644 if (unlikely(!tmp)) \ 5645 return false; \ 5646 } \ 5647 \ 5648 unsigned key_shift = 0, key_diff_mask; \ 5649 do { \ 5650 struct { \ 5651 unsigned a[256], b[256]; \ 5652 } counters; \ 5653 memset(&counters, 0, sizeof(counters)); \ 5654 \ 5655 key_diff_mask = 0; \ 5656 unsigned prev_key = EXTRACT_KEY(begin) >> key_shift; \ 5657 TYPE *r = begin, *end = begin + length; \ 5658 do { \ 5659 const unsigned key = EXTRACT_KEY(r) >> key_shift; \ 5660 counters.a[key & 255]++; \ 5661 counters.b[(key >> 8) & 255]++; \ 5662 key_diff_mask |= prev_key ^ key; \ 5663 prev_key = key; \ 5664 } while (++r != end); \ 5665 \ 5666 unsigned ta = 0, tb = 0; \ 5667 for (unsigned i = 0; i < 256; ++i) { \ 5668 const unsigned ia = counters.a[i]; \ 5669 counters.a[i] = ta; \ 5670 ta += ia; \ 5671 const unsigned ib = counters.b[i]; \ 5672 counters.b[i] = tb; \ 5673 tb += ib; \ 5674 } \ 5675 \ 5676 r = begin; \ 5677 do { \ 5678 const unsigned key = EXTRACT_KEY(r) >> key_shift; \ 5679 tmp[counters.a[key & 255]++] = *r; \ 5680 } while (++r != end); \ 5681 \ 5682 if (unlikely(key_diff_mask < 256)) { \ 5683 memcpy(begin, tmp, (char *)end - (char *)begin); \ 5684 break; \ 5685 } \ 5686 end = (r = tmp) + length; \ 5687 do { \ 5688 const unsigned key = EXTRACT_KEY(r) >> key_shift; \ 5689 begin[counters.b[(key >> 8) & 255]++] = *r; \ 5690 } while (++r != end); \ 5691 \ 5692 key_shift += 16; \ 5693 } while (key_diff_mask >> 16); \ 5694 \ 5695 if (!(BUFFER_PREALLOCATED)) \ 5696 osal_free(tmp); \ 5697 return true; \ 5698 } 5699 5700 /*------------------------------------------------------------------------------ 5701 * LY: Binary search */ 5702 5703 #if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) 5704 #define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ 5705 do \ 5706 __asm __volatile("" \ 5707 : "+r"(size) \ 5708 : "r" /* the `b` constraint is more suitable here, but \ 5709 cause CLANG to allocate and push/pop an one more \ 5710 register, so using the `r` which avoids this. 
                                 */ \
                     (flag)); \
  while (0)
#else
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \
  do { \
    /* nope for non-clang or non-x86 */; \
  } while (0)
#endif /* Workaround for CLANG */

#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key) \
  do { \
  } while (0)

#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \
  static __always_inline const TYPE_LIST *NAME( \
      const TYPE_LIST *it, unsigned length, const TYPE_ARG item) { \
    const TYPE_LIST *const begin = it, *const end = begin + length; \
 \
    if (MDBX_HAVE_CMOV) \
      do { \
        /* An adaptively simplified binary-search step: \
         *  - branchless when cmov or an equivalent is available; \
         *  - a few extra iterations are permitted; \
         *  - but it only narrows while length > 2, so the search must be \
         *    finished among the remaining 0-1-2 elements. */ \
        const TYPE_LIST *const middle = it + (length >> 1); \
        length = (length + 1) >> 1; \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
        WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \
        it = flag ? middle : it; \
      } while (length > 2); \
    else \
      while (length > 2) { \
        /* A variant using a conditional branch. The main difference is \
         * that on "not equal" (i.e. true from the comparator) the step \
         * lands one element closer to the end of the array. This is \
         * algorithmically correct and converges slightly faster, but costs \
         * more computation when the comparator returns true. Also, \
         * IMPORTANT(!), speculative execution must not be allowed when \
         * size == 0. */ \
        const TYPE_LIST *const middle = it + (length >> 1); \
        length = (length + 1) >> 1; \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
        if (flag) { \
          it = middle + 1; \
          length -= 1; \
        } \
      } \
    it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \
    it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \
 \
    if (AUDIT_ENABLED()) { \
      for (const TYPE_LIST *scan = begin; scan < it; ++scan) \
        assert(CMP(*scan, item)); \
      for (const TYPE_LIST *scan = it; scan < end; ++scan) \
        assert(!CMP(*scan, item)); \
      (void)begin, (void)end; \
    } \
 \
    return it; \
  }

/*----------------------------------------------------------------------------*/

static __always_inline size_t pnl_size2bytes(size_t size) {
  assert(size > 0 && size <= MDBX_PGL_LIMIT);
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  size += size;
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD +
                    (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) +
                     MDBX_PNL_GRANULATE + 2) *
                        sizeof(pgno_t) <
                SIZE_MAX / 4 * 3);
  size_t bytes =
      ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2),
                    MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
      MDBX_ASSUME_MALLOC_OVERHEAD;
  return bytes;
}

static __always_inline pgno_t pnl_bytes2size(const size_t bytes) {
  size_t size = bytes / sizeof(pgno_t);
  assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536);
  size -= 2;
#if MDBX_PNL_PREALLOC_FOR_RADIXSORT
  size >>= 1;
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
  return (pgno_t)size;
}
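/* A small worked example (example only, not part of the build): the pair
 * above rounds a requested element count up to a whole allocation granule
 * and maps the usable bytes back, reserving two hidden header slots, so it
 * always satisfies pnl_bytes2size(pnl_size2bytes(n)) >= n -- the very
 * invariant pnl_alloc() asserts below. */
#if 0 /* example only */
static void pnl_roundtrip_example(void) {
  const size_t wanted = 1000;
  const size_t bytes = pnl_size2bytes(wanted);
  assert(pnl_bytes2size(bytes) >= wanted);
}
#endif /* example only */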
static MDBX_PNL pnl_alloc(size_t size) {
  size_t bytes = pnl_size2bytes(size);
  MDBX_PNL pl = osal_malloc(bytes);
  if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
    pl[0] = pnl_bytes2size(bytes);
    assert(pl[0] >= size);
    pl[1] = 0;
    pl += 1;
  }
  return pl;
}

static void pnl_free(MDBX_PNL pl) {
  if (likely(pl))
    osal_free(pl - 1);
}

/* Shrink the PNL to the default size if it has grown larger */
static void pnl_shrink(MDBX_PNL *ppl) {
  assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
         pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) <
             MDBX_PNL_INITIAL * 3 / 2);
  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  MDBX_PNL_SIZE(*ppl) = 0;
  if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) >
               MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) {
    size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL);
    MDBX_PNL pl = osal_realloc(*ppl - 1, bytes);
    if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
      bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
      *pl = pnl_bytes2size(bytes);
      *ppl = pl + 1;
    }
  }
}

/* Grow the PNL so that it can hold at least the given size */
static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
  const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl);
  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  if (likely(allocated >= wanna))
    return MDBX_SUCCESS;

  if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) {
    ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT);
    return MDBX_TXN_FULL;
  }

  const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT)
                          ? wanna + wanna - allocated
                          : MDBX_PGL_LIMIT;
  size_t bytes = pnl_size2bytes(size);
  MDBX_PNL pl = osal_realloc(*ppl - 1, bytes);
  if (likely(pl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    bytes = malloc_usable_size(pl);
#endif /* malloc_usable_size */
    *pl = pnl_bytes2size(bytes);
    assert(*pl >= wanna);
    *ppl = pl + 1;
    return MDBX_SUCCESS;
  }
  return MDBX_ENOMEM;
}

/* Make room for num additional elements in a PNL */
static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl,
                                                        size_t num) {
  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
  assert(num <= MDBX_PGL_LIMIT);
  const size_t wanna = MDBX_PNL_SIZE(*ppl) + num;
  return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? MDBX_SUCCESS
                                                  : pnl_reserve(ppl, wanna);
}
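/* A usage sketch (example only, not part of the build): the intended
 * pattern is to reserve space once via pnl_need() and then fill with the
 * unchecked pnl_xappend() defined just below, which is exactly what
 * pnl_append_range()/pnl_insert_range() do. */
#if 0 /* example only */
static int pnl_fill_example(MDBX_PNL *ppl, const pgno_t *pages, unsigned n) {
  int rc = pnl_need(ppl, n); /* grow once, amortized */
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  for (unsigned i = 0; i < n; ++i)
    pnl_xappend(*ppl, pages[i]); /* no per-item bounds checks */
  return MDBX_SUCCESS;
}
#endif /* example only */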
static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) {
  assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl));
  if (AUDIT_ENABLED()) {
    for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i)
      assert(pgno != pl[i]);
  }
  MDBX_PNL_SIZE(pl) += 1;
  MDBX_PNL_LAST(pl) = pgno;
}

/* Append a pgno range onto an unsorted PNL */
__always_inline static int __must_check_result pnl_append_range(bool spilled,
                                                                MDBX_PNL *ppl,
                                                                pgno_t pgno,
                                                                unsigned n) {
  assert(n > 0);
  int rc = pnl_need(ppl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_PNL pnl = *ppl;
#if MDBX_PNL_ASCENDING
  unsigned w = MDBX_PNL_SIZE(pnl);
  do {
    pnl[++w] = pgno;
    pgno += spilled ? 2 : 1;
  } while (--n);
  MDBX_PNL_SIZE(pnl) = w;
#else
  unsigned w = MDBX_PNL_SIZE(pnl) + n;
  MDBX_PNL_SIZE(pnl) = w;
  do {
    pnl[w--] = pgno;
    pgno += spilled ? 2 : 1;
  } while (--n);
#endif

  return MDBX_SUCCESS;
}

/* Append a pgno range into the sorted PNL */
__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl,
                                                      pgno_t pgno, unsigned n) {
  assert(n > 0);
  int rc = pnl_need(ppl, n);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_PNL pnl = *ppl;
  unsigned r = MDBX_PNL_SIZE(pnl), w = r + n;
  MDBX_PNL_SIZE(pnl) = w;
  while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
    pnl[w--] = pnl[r--];

  for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
    pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;

  return MDBX_SUCCESS;
}

__hot static bool pnl_check(const pgno_t *pl, const size_t limit) {
  assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND);
  if (likely(MDBX_PNL_SIZE(pl))) {
    if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT))
      return false;
    if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO))
      return false;
    if (unlikely(MDBX_PNL_MOST(pl) >= limit))
      return false;

    if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) &&
        likely(MDBX_PNL_SIZE(pl) > 1)) {
      const pgno_t *scan = MDBX_PNL_BEGIN(pl);
      const pgno_t *const end = MDBX_PNL_END(pl);
      pgno_t prev = *scan++;
      do {
        if (unlikely(!MDBX_PNL_ORDERED(prev, *scan)))
          return false;
        prev = *scan;
      } while (likely(++scan != end));
    }
  }
  return true;
}

static __always_inline bool pnl_check_allocated(const pgno_t *pl,
                                                const size_t limit) {
  return pl == nullptr ||
         (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl) && pnl_check(pl, limit));
}

static __always_inline void
pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a,
                const pgno_t *__restrict src_b,
                const pgno_t *__restrict const src_b_detent) {
  do {
#if MDBX_HAVE_CMOV
    const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a);
#if defined(__LCC__) || __CLANG_PREREQ(13, 0)
    // lcc 1.26: 13 wide-instructions (setup and the first iteration)
    //           + 7 wide-instructions per loop, WITHOUT loop-mode
    // gcc>=7: cmp+jmp with a jump back into the loop body (WTF?)
    // gcc<=6: cmov×3
    // clang<=12: cmov×3
    // clang>=13: cmov, set+add/sub
    *dst = flag ? *src_a-- : *src_b--;
#else
    // gcc: cmov, cmp+set+add/sub
    // clang<=5: cmov×2, set+add/sub
    // clang>=6: cmov, set+add/sub
    *dst = flag ?
*src_a : *src_b; 5994 src_b += flag - 1; 5995 src_a -= flag; 5996 #endif 5997 --dst; 5998 #else /* MDBX_HAVE_CMOV */ 5999 while (MDBX_PNL_ORDERED(*src_b, *src_a)) 6000 *dst-- = *src_a--; 6001 *dst-- = *src_b--; 6002 #endif /* !MDBX_HAVE_CMOV */ 6003 } while (likely(src_b > src_b_detent)); 6004 } 6005 6006 /* Merge a PNL onto a PNL. The destination PNL must be big enough */ 6007 __hot static void pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { 6008 assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); 6009 assert(pnl_check(src, MAX_PAGENO + 1)); 6010 const pgno_t src_len = MDBX_PNL_SIZE(src); 6011 const pgno_t dst_len = MDBX_PNL_SIZE(dst); 6012 if (likely(src_len > 0)) { 6013 const pgno_t total = dst_len + src_len; 6014 assert(MDBX_PNL_ALLOCLEN(dst) >= total); 6015 dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); 6016 pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); 6017 MDBX_PNL_SIZE(dst) = total; 6018 } 6019 assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); 6020 } 6021 6022 static void spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { 6023 tASSERT(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && 6024 txn->tw.spill_least_removed > 0); 6025 txn->tw.spill_least_removed = 6026 (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; 6027 txn->tw.spill_pages[idx] |= 1; 6028 MDBX_PNL_SIZE(txn->tw.spill_pages) -= 6029 (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); 6030 6031 while (unlikely(npages > 1)) { 6032 const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; 6033 if (MDBX_PNL_ASCENDING) { 6034 if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) || 6035 (txn->tw.spill_pages[idx] >> 1) != pgno) 6036 return; 6037 } else { 6038 if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) 6039 return; 6040 txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) 6041 ? idx 6042 : txn->tw.spill_least_removed; 6043 } 6044 txn->tw.spill_pages[idx] |= 1; 6045 MDBX_PNL_SIZE(txn->tw.spill_pages) -= 6046 (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); 6047 --npages; 6048 } 6049 } 6050 6051 static MDBX_PNL spill_purge(MDBX_txn *txn) { 6052 tASSERT(txn, txn->tw.spill_least_removed > 0); 6053 const MDBX_PNL sl = txn->tw.spill_pages; 6054 if (txn->tw.spill_least_removed != INT_MAX) { 6055 unsigned len = MDBX_PNL_SIZE(sl), r, w; 6056 for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { 6057 sl[w] = sl[r]; 6058 w += 1 - (sl[r] & 1); 6059 } 6060 for (size_t i = 1; i < w; ++i) 6061 tASSERT(txn, (sl[i] & 1) == 0); 6062 MDBX_PNL_SIZE(sl) = w - 1; 6063 txn->tw.spill_least_removed = INT_MAX; 6064 } else { 6065 for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) 6066 tASSERT(txn, (sl[i] & 1) == 0); 6067 } 6068 return sl; 6069 } 6070 6071 #if MDBX_PNL_ASCENDING 6072 #define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr)) 6073 #else 6074 #define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr)) 6075 #endif 6076 RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, 6077 MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0) 6078 6079 SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) 6080 6081 __hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { 6082 if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || 6083 unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) 6084 pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); 6085 } 6086 6087 static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { 6088 pnl_sort_nochk(pnl); 6089 assert(pnl_check(pnl, limit4check)); 6090 (void)limit4check; 6091 } 6092 6093 /* Search for an pgno in an PNL. 
6095 SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) 6096 6097 __hot __noinline static unsigned pnl_search_nochk(const MDBX_PNL pnl, 6098 pgno_t pgno) { 6099 const pgno_t *begin = MDBX_PNL_BEGIN(pnl); 6100 const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); 6101 const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); 6102 assert(it >= begin && it <= end); 6103 if (it != begin) 6104 assert(MDBX_PNL_ORDERED(it[-1], pgno)); 6105 if (it != end) 6106 assert(!MDBX_PNL_ORDERED(it[0], pgno)); 6107 return (unsigned)(it - begin + 1); 6108 } 6109 6110 static __inline unsigned pnl_search(const MDBX_PNL pnl, pgno_t pgno, 6111 size_t limit) { 6112 assert(pnl_check_allocated(pnl, limit)); 6113 assert(pgno < limit); 6114 (void)limit; 6115 return pnl_search_nochk(pnl, pgno); 6116 } 6117 6118 static __inline unsigned search_spilled(const MDBX_txn *txn, pgno_t pgno) { 6119 const MDBX_PNL pnl = txn->tw.spill_pages; 6120 if (likely(!pnl)) 6121 return 0; 6122 pgno <<= 1; 6123 unsigned n = pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); 6124 return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0; 6125 } 6126 6127 static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, 6128 unsigned npages) { 6129 const MDBX_PNL pnl = txn->tw.spill_pages; 6130 if (likely(!pnl)) 6131 return false; 6132 const unsigned len = MDBX_PNL_SIZE(pnl); 6133 if (LOG_ENABLED(MDBX_LOG_EXTRA)) { 6134 DEBUG_EXTRA("PNL len %u [", len); 6135 for (unsigned i = 1; i <= len; ++i) 6136 DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) 6137 : (long)(pnl[i] >> 1)); 6138 DEBUG_EXTRA_PRINT("%s\n", "]"); 6139 } 6140 const pgno_t spilled_range_begin = pgno << 1; 6141 const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; 6142 #if MDBX_PNL_ASCENDING 6143 const unsigned n = 6144 pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); 6145 assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); 6146 const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last; 6147 #else 6148 const unsigned n = 6149 pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); 6150 assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n])); 6151 const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin; 6152 #endif 6153 if (ASSERT_ENABLED()) { 6154 bool check = false; 6155 for (unsigned i = 0; i < npages; ++i) 6156 check |= search_spilled(txn, pgno + i) != 0; 6157 assert(check == rc); 6158 } 6159 return rc; 6160 } 6161 6162 /*----------------------------------------------------------------------------*/ 6163 6164 static __always_inline size_t txl_size2bytes(const size_t size) { 6165 assert(size > 0 && size <= MDBX_TXL_MAX * 2); 6166 size_t bytes = 6167 ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), 6168 MDBX_TXL_GRANULATE * sizeof(txnid_t)) - 6169 MDBX_ASSUME_MALLOC_OVERHEAD; 6170 return bytes; 6171 } 6172 6173 static __always_inline size_t txl_bytes2size(const size_t bytes) { 6174 size_t size = bytes / sizeof(txnid_t); 6175 assert(size > 2 && size <= MDBX_TXL_MAX * 2); 6176 return size - 2; 6177 } 6178 6179 static MDBX_TXL txl_alloc(void) { 6180 size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL); 6181 MDBX_TXL tl = osal_malloc(bytes); 6182 if (likely(tl)) { 6183 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) 6184 bytes = malloc_usable_size(tl); 6185 #endif /* malloc_usable_size */
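/* Layout note (an illustrative sketch, not upstream docs): a TXL mirrors the
 * PNL layout, so the shared macros work on both kinds of lists. The word
 * stored via tl[0] just below holds the allocated capacity and tl[1] the
 * current length; after the tl += 1 shift the user-visible list keeps its
 * capacity at tl[-1] (MDBX_PNL_ALLOCLEN) and its length at tl[0]
 * (MDBX_PNL_SIZE). */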
6186 tl[0] = txl_bytes2size(bytes); 6187 assert(tl[0] >= MDBX_TXL_INITIAL); 6188 tl[1] = 0; 6189 tl += 1; 6190 } 6191 return tl; 6192 } 6193 6194 static void txl_free(MDBX_TXL tl) { 6195 if (likely(tl)) 6196 osal_free(tl - 1); 6197 } 6198 6199 static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { 6200 const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); 6201 assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && 6202 MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); 6203 if (likely(allocated >= wanna)) 6204 return MDBX_SUCCESS; 6205 6206 if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { 6207 ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); 6208 return MDBX_TXN_FULL; 6209 } 6210 6211 const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) 6212 ? wanna + wanna - allocated 6213 : MDBX_TXL_MAX; 6214 size_t bytes = txl_size2bytes(size); 6215 MDBX_TXL tl = osal_realloc(*ptl - 1, bytes); 6216 if (likely(tl)) { 6217 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) 6218 bytes = malloc_usable_size(tl); 6219 #endif /* malloc_usable_size */ 6220 *tl = txl_bytes2size(bytes); 6221 assert(*tl >= wanna); 6222 *ptl = tl + 1; 6223 return MDBX_SUCCESS; 6224 } 6225 return MDBX_ENOMEM; 6226 } 6227 6228 static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, 6229 size_t num) { 6230 assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && 6231 MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); 6232 assert(num <= MDBX_PGL_LIMIT); 6233 const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; 6234 return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS 6235 : txl_reserve(ptl, wanna); 6236 } 6237 6238 static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { 6239 assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); 6240 MDBX_PNL_SIZE(tl) += 1; 6241 MDBX_PNL_LAST(tl) = id; 6242 } 6243 6244 #define TXNID_SORT_CMP(first, last) ((first) > (last)) 6245 SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) 6246 static void txl_sort(MDBX_TXL tl) { 6247 txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); 6248 } 6249 6250 static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { 6251 if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { 6252 int rc = txl_need(ptl, MDBX_TXL_GRANULATE); 6253 if (unlikely(rc != MDBX_SUCCESS)) 6254 return rc; 6255 } 6256 txl_xappend(*ptl, id); 6257 return MDBX_SUCCESS; 6258 } 6259 6260 /*----------------------------------------------------------------------------*/ 6261 6262 #define MDBX_DPL_UNSORTED_BACKLOG 16 6263 #define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG 6264 #define MDBX_DPL_GAP_FOR_EDGING 2 6265 #define MDBX_DPL_RESERVE_GAP \ 6266 (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) 6267 6268 static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { 6269 assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); 6270 #if MDBX_DPL_PREALLOC_FOR_RADIXSORT 6271 size += size; 6272 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ 6273 STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + 6274 (MDBX_PGL_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1) + 6275 MDBX_DPL_RESERVE_GAP) * 6276 sizeof(MDBX_dp) + 6277 MDBX_PNL_GRANULATE * sizeof(void *) * 2 < 6278 SIZE_MAX / 4 * 3); 6279 size_t bytes = 6280 ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + 6281 ((size_t)size + MDBX_DPL_RESERVE_GAP) * sizeof(MDBX_dp), 6282 MDBX_PNL_GRANULATE * sizeof(void *) * 2) - 6283 MDBX_ASSUME_MALLOC_OVERHEAD; 6284 return bytes; 6285 } 6286 6287 static __always_inline unsigned
dpl_bytes2size(const ptrdiff_t bytes) { 6288 size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); 6289 assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && 6290 size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); 6291 size -= MDBX_DPL_RESERVE_GAP; 6292 #if MDBX_DPL_PREALLOC_FOR_RADIXSORT 6293 size >>= 1; 6294 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ 6295 return (unsigned)size; 6296 } 6297 6298 static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) { 6299 static const MDBX_page dpl_stub_pageE = { 6300 {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; 6301 assert(dpl_stub_pageE.mp_flags == P_BAD && 6302 dpl_stub_pageE.mp_pgno == P_INVALID); 6303 dl->length = len; 6304 dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; 6305 dl->items[len + 1].pgno = P_INVALID; 6306 dl->items[len + 1].extra = 0; 6307 return len; 6308 } 6309 6310 static __always_inline void dpl_clear(MDBX_dpl *dl) { 6311 static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0}; 6312 assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); 6313 dl->sorted = dpl_setlen(dl, 0); 6314 dl->pages_including_loose = 0; 6315 dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; 6316 dl->items[0].pgno = 0; 6317 dl->items[0].extra = 0; 6318 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6319 } 6320 6321 static void dpl_free(MDBX_txn *txn) { 6322 if (likely(txn->tw.dirtylist)) { 6323 osal_free(txn->tw.dirtylist); 6324 txn->tw.dirtylist = NULL; 6325 } 6326 } 6327 6328 static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { 6329 size_t bytes = 6330 dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); 6331 MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); 6332 if (likely(dl)) { 6333 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) 6334 bytes = malloc_usable_size(dl); 6335 #endif /* malloc_usable_size */ 6336 dl->detent = dpl_bytes2size(bytes); 6337 tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); 6338 txn->tw.dirtylist = dl; 6339 } 6340 return dl; 6341 } 6342 6343 static int dpl_alloc(MDBX_txn *txn) { 6344 tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); 6345 const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) 6346 ? 
txn->mt_env->me_options.dp_initial 6347 : txn->mt_geo.upper; 6348 if (txn->tw.dirtylist) { 6349 dpl_clear(txn->tw.dirtylist); 6350 const int realloc_threshold = 64; 6351 if (likely( 6352 !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold || 6353 (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) 6354 return MDBX_SUCCESS; 6355 } 6356 if (unlikely(!dpl_reserve(txn, wanna))) 6357 return MDBX_ENOMEM; 6358 dpl_clear(txn->tw.dirtylist); 6359 return MDBX_SUCCESS; 6360 } 6361 6362 #define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno) 6363 RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, 6364 MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1) 6365 6366 #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) 6367 SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) 6368 6369 __hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { 6370 MDBX_dpl *dl = txn->tw.dirtylist; 6371 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6372 const unsigned unsorted = dl->length - dl->sorted; 6373 if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || 6374 unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { 6375 if (dl->sorted > unsorted / 4 + 4 && 6376 (MDBX_DPL_PREALLOC_FOR_RADIXSORT || 6377 dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) { 6378 MDBX_dp *const sorted_begin = dl->items + 1; 6379 MDBX_dp *const sorted_end = sorted_begin + dl->sorted; 6380 MDBX_dp *const end = 6381 dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT 6382 ? dl->length + dl->length + 1 6383 : dl->detent + MDBX_DPL_RESERVE_GAP); 6384 MDBX_dp *const tmp = end - unsorted; 6385 assert(dl->items + dl->length + 1 < tmp); 6386 /* copy unsorted to the end of allocated space and sort it */ 6387 memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); 6388 dp_sort(tmp, tmp + unsorted); 6389 /* merge two parts from end to begin */ 6390 MDBX_dp *__restrict w = dl->items + dl->length; 6391 MDBX_dp *__restrict l = dl->items + dl->sorted; 6392 MDBX_dp *__restrict r = end - 1; 6393 do { 6394 const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); 6395 #if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV 6396 *w = cmp ? *l-- : *r--; 6397 #else 6398 *w = cmp ? *l : *r; 6399 l -= cmp; 6400 r += cmp - 1; 6401 #endif 6402 } while (likely(--w > l)); 6403 assert(r == tmp - 1); 6404 assert(dl->items[0].pgno == 0 && 6405 dl->items[dl->length + 1].pgno == P_INVALID); 6406 if (ASSERT_ENABLED()) 6407 for (unsigned i = 0; i <= dl->length; ++i) 6408 assert(dl->items[i].pgno < dl->items[i + 1].pgno); 6409 } else { 6410 dp_sort(dl->items + 1, dl->items + dl->length + 1); 6411 assert(dl->items[0].pgno == 0 && 6412 dl->items[dl->length + 1].pgno == P_INVALID); 6413 } 6414 } else { 6415 assert(dl->items[0].pgno == 0 && 6416 dl->items[dl->length + 1].pgno == P_INVALID); 6417 } 6418 dl->sorted = dl->length; 6419 return dl; 6420 } 6421 6422 static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { 6423 MDBX_dpl *dl = txn->tw.dirtylist; 6424 assert(dl->length <= MDBX_PGL_LIMIT); 6425 assert(dl->sorted <= dl->length); 6426 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6427 return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn); 6428 } 6429 6430 /* Returns the index of the first dirty-page whose pgno 6431 * member is greater than or equal to id. 
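 * For instance (an illustrative note, not upstream docs): with sorted items
 * whose pgno values are {3, 7, 9}, searching for 7 yields a pointer to the
 * {7} entry and searching for 8 yields the {9} entry; dpl_search() below
 * converts such a pointer into a 1-based index within dl->items.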
*/ 6432 #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) 6433 SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) 6434 6435 __hot __noinline static unsigned dpl_search(const MDBX_txn *txn, pgno_t pgno) { 6436 MDBX_dpl *dl = txn->tw.dirtylist; 6437 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6438 if (AUDIT_ENABLED()) { 6439 for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { 6440 assert(ptr[0].pgno < ptr[1].pgno); 6441 assert(ptr[0].pgno >= NUM_METAS); 6442 } 6443 } 6444 6445 switch (dl->length - dl->sorted) { 6446 default: 6447 /* sort the whole list */ 6448 dpl_sort_slowpath(txn); 6449 break; 6450 case 0: 6451 /* wholly-sorted case */ 6452 break; 6453 6454 #define LINEAR_SEARCH_CASE(N) \ 6455 case N: \ 6456 if (dl->items[dl->length - N + 1].pgno == pgno) \ 6457 return dl->length - N + 1; \ 6458 __fallthrough 6459 6460 /* use linear scan until the threshold */ 6461 LINEAR_SEARCH_CASE(7); /* fall through */ 6462 LINEAR_SEARCH_CASE(6); /* fall through */ 6463 LINEAR_SEARCH_CASE(5); /* fall through */ 6464 LINEAR_SEARCH_CASE(4); /* fall through */ 6465 LINEAR_SEARCH_CASE(3); /* fall through */ 6466 LINEAR_SEARCH_CASE(2); /* fall through */ 6467 case 1: 6468 if (dl->items[dl->length].pgno == pgno) 6469 return dl->length; 6470 /* continue bsearch on the sorted part */ 6471 break; 6472 } 6473 return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); 6474 } 6475 6476 MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned 6477 dpl_npages(const MDBX_dpl *dl, unsigned i) { 6478 assert(0 <= (int)i && i <= dl->length); 6479 unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; 6480 assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); 6481 return n; 6482 } 6483 6484 MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned 6485 dpl_endpgno(const MDBX_dpl *dl, unsigned i) { 6486 return dpl_npages(dl, i) + dl->items[i].pgno; 6487 } 6488 6489 static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, 6490 unsigned npages) { 6491 MDBX_dpl *dl = txn->tw.dirtylist; 6492 assert(dl->sorted == dl->length); 6493 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6494 unsigned const n = dpl_search(txn, pgno); 6495 assert(n >= 1 && n <= dl->length + 1); 6496 assert(pgno <= dl->items[n].pgno); 6497 assert(pgno > dl->items[n - 1].pgno); 6498 const bool rc = 6499 /* intersection with the found item */ pgno + npages > dl->items[n].pgno || 6500 /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; 6501 if (ASSERT_ENABLED()) { 6502 bool check = false; 6503 for (unsigned i = 1; i <= dl->length; ++i) { 6504 const MDBX_page *const dp = dl->items[i].ptr; 6505 if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || 6506 dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) 6507 check |= true; 6508 } 6509 assert(check == rc); 6510 } 6511 return rc; 6512 } 6513 6514 static __always_inline unsigned dpl_exist(MDBX_txn *txn, pgno_t pgno) { 6515 MDBX_dpl *dl = txn->tw.dirtylist; 6516 unsigned i = dpl_search(txn, pgno); 6517 assert((int)i > 0); 6518 return (dl->items[i].pgno == pgno) ?
i : 0; 6519 } 6520 6521 MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, 6522 const pgno_t pgno) { 6523 const MDBX_dpl *dl = txn->tw.dirtylist; 6524 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6525 for (unsigned i = dl->length; i > dl->sorted; --i) 6526 if (dl->items[i].pgno == pgno) 6527 return dl->items[i].ptr; 6528 6529 if (dl->sorted) { 6530 const unsigned i = 6531 (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); 6532 if (dl->items[i].pgno == pgno) 6533 return dl->items[i].ptr; 6534 } 6535 return nullptr; 6536 } 6537 6538 static void dpl_remove_ex(const MDBX_txn *txn, unsigned i, unsigned npages) { 6539 MDBX_dpl *dl = txn->tw.dirtylist; 6540 assert((int)i > 0 && i <= dl->length); 6541 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6542 dl->pages_including_loose -= npages; 6543 dl->sorted -= dl->sorted >= i; 6544 dl->length -= 1; 6545 memmove(dl->items + i, dl->items + i + 1, 6546 (dl->length - i + 2) * sizeof(dl->items[0])); 6547 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6548 } 6549 6550 static void dpl_remove(const MDBX_txn *txn, unsigned i) { 6551 dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); 6552 } 6553 6554 static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, 6555 pgno_t pgno, 6556 MDBX_page *page, 6557 unsigned npages) { 6558 MDBX_dpl *dl = txn->tw.dirtylist; 6559 assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); 6560 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6561 if (AUDIT_ENABLED()) { 6562 for (unsigned i = dl->length; i > 0; --i) { 6563 assert(dl->items[i].pgno != pgno); 6564 if (unlikely(dl->items[i].pgno == pgno)) { 6565 ERROR("Page %u already exists in the DPL at %u", pgno, i); 6566 return MDBX_PROBLEM; 6567 } 6568 } 6569 } 6570 6571 const unsigned length = dl->length + 1; 6572 const unsigned sorted = 6573 (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) 6574 ? length 6575 : dl->sorted; 6576 6577 if (unlikely(dl->length == dl->detent)) { 6578 if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { 6579 ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); 6580 return MDBX_TXN_FULL; 6581 } 6582 const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) 6583 ?
dl->detent + dl->detent 6584 : dl->detent + dl->detent / 2; 6585 dl = dpl_reserve(txn, size); 6586 if (unlikely(!dl)) 6587 return MDBX_ENOMEM; 6588 tASSERT(txn, dl->length < dl->detent); 6589 } 6590 6591 /* copy the stub beyond the end */ 6592 dl->items[length + 1] = dl->items[length]; 6593 /* append page */ 6594 dl->items[length].ptr = page; 6595 dl->items[length].pgno = pgno; 6596 dl->items[length].multi = npages > 1; 6597 dl->items[length].lru = txn->tw.dirtylru++; 6598 dl->length = length; 6599 dl->sorted = sorted; 6600 dl->pages_including_loose += npages; 6601 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 6602 return MDBX_SUCCESS; 6603 } 6604 6605 static __inline uint32_t dpl_age(const MDBX_txn *txn, unsigned i) { 6606 const MDBX_dpl *dl = txn->tw.dirtylist; 6607 assert((int)i > 0 && i <= dl->length); 6608 /* overflow could be here */ 6609 return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); 6610 } 6611 6612 /*----------------------------------------------------------------------------*/ 6613 6614 uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; 6615 uint8_t loglevel = MDBX_LOG_FATAL; 6616 MDBX_debug_func *debug_logger; 6617 6618 static __must_check_result __inline int page_retire(MDBX_cursor *mc, 6619 MDBX_page *mp); 6620 6621 static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, 6622 unsigned npages); 6623 typedef struct page_result { 6624 MDBX_page *page; 6625 int err; 6626 } pgr_t; 6627 6628 static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); 6629 6630 static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); 6631 static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages); 6632 static int page_touch(MDBX_cursor *mc); 6633 static int cursor_touch(MDBX_cursor *mc); 6634 static int touch_dbi(MDBX_cursor *mc); 6635 6636 #define MDBX_END_NAMES \ 6637 { \ 6638 "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ 6639 "fail-beginchild" \ 6640 } 6641 enum { 6642 /* txn_end operation number, for logging */ 6643 MDBX_END_COMMITTED, 6644 MDBX_END_PURE_COMMIT, 6645 MDBX_END_ABORT, 6646 MDBX_END_RESET, 6647 MDBX_END_RESET_TMP, 6648 MDBX_END_FAIL_BEGIN, 6649 MDBX_END_FAIL_BEGINCHILD 6650 }; 6651 #define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ 6652 #define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ 6653 #define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ 6654 #define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ 6655 #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ 6656 static int txn_end(MDBX_txn *txn, const unsigned mode); 6657 6658 static __always_inline pgr_t page_get_inline(const uint16_t ILL, 6659 MDBX_cursor *const mc, 6660 const pgno_t pgno, 6661 const txnid_t front); 6662 6663 static pgr_t page_get_any(MDBX_cursor *const mc, const pgno_t pgno, 6664 const txnid_t front) { 6665 return page_get_inline(P_ILL_BITS, mc, pgno, front); 6666 } 6667 6668 __hot static pgr_t page_get_three(MDBX_cursor *const mc, const pgno_t pgno, 6669 const txnid_t front) { 6670 return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); 6671 } 6672 6673 static pgr_t page_get_large(MDBX_cursor *const mc, const pgno_t pgno, 6674 const txnid_t front) { 6675 return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, 6676 front); 6677 } 6678 6679 static __always_inline int __must_check_result page_get(MDBX_cursor *mc, 6680 const pgno_t pgno, 6681 MDBX_page **mp, 6682 const txnid_t front) 
{ 6683 pgr_t ret = page_get_three(mc, pgno, front); 6684 *mp = ret.page; 6685 return ret.err; 6686 } 6687 6688 static int __must_check_result page_search_root(MDBX_cursor *mc, 6689 const MDBX_val *key, int flags); 6690 6691 #define MDBX_PS_MODIFY 1 6692 #define MDBX_PS_ROOTONLY 2 6693 #define MDBX_PS_FIRST 4 6694 #define MDBX_PS_LAST 8 6695 static int __must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key, 6696 int flags); 6697 static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); 6698 6699 #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ 6700 static int __must_check_result page_split(MDBX_cursor *mc, 6701 const MDBX_val *const newkey, 6702 MDBX_val *const newdata, 6703 pgno_t newpgno, const unsigned naf); 6704 6705 static bool coherency_check_meta(const MDBX_env *env, 6706 const volatile MDBX_meta *meta, bool report); 6707 static int __must_check_result validate_meta_copy(MDBX_env *env, 6708 const MDBX_meta *meta, 6709 MDBX_meta *dest); 6710 static int __must_check_result override_meta(MDBX_env *env, unsigned target, 6711 txnid_t txnid, 6712 const MDBX_meta *shape); 6713 static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, 6714 const int lck_exclusive, 6715 const mdbx_mode_t mode_bits); 6716 static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, 6717 MDBX_meta *const pending, 6718 meta_troika_t *const troika); 6719 static int env_close(MDBX_env *env); 6720 6721 struct node_result { 6722 MDBX_node *node; 6723 bool exact; 6724 }; 6725 6726 static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key); 6727 6728 static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, 6729 const MDBX_val *key, 6730 pgno_t pgno); 6731 static int __must_check_result node_add_leaf(MDBX_cursor *mc, unsigned indx, 6732 const MDBX_val *key, 6733 MDBX_val *data, unsigned flags); 6734 static int __must_check_result node_add_leaf2(MDBX_cursor *mc, unsigned indx, 6735 const MDBX_val *key); 6736 6737 static void node_del(MDBX_cursor *mc, size_t ksize); 6738 static void node_shrink(MDBX_page *mp, unsigned indx); 6739 static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, 6740 bool fromleft); 6741 static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, 6742 MDBX_val *data, const MDBX_page *mp); 6743 static int __must_check_result rebalance(MDBX_cursor *mc); 6744 static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); 6745 6746 static void cursor_pop(MDBX_cursor *mc); 6747 static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); 6748 6749 static int __must_check_result audit_ex(MDBX_txn *txn, unsigned retired_stored, 6750 bool dont_filter_gc); 6751 6752 static int __must_check_result page_check(MDBX_cursor *const mc, 6753 const MDBX_page *const mp); 6754 static int __must_check_result cursor_check(MDBX_cursor *mc); 6755 static int __must_check_result cursor_check_updating(MDBX_cursor *mc); 6756 static int __must_check_result cursor_del(MDBX_cursor *mc); 6757 static int __must_check_result delete (MDBX_txn *txn, MDBX_dbi dbi, 6758 const MDBX_val *key, 6759 const MDBX_val *data, unsigned flags); 6760 #define SIBLING_LEFT 0 6761 #define SIBLING_RIGHT 2 6762 static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); 6763 static int __must_check_result cursor_next(MDBX_cursor *mc, MDBX_val *key, 6764 MDBX_val *data, MDBX_cursor_op op); 6765 static int __must_check_result 
cursor_prev(MDBX_cursor *mc, MDBX_val *key, 6766 MDBX_val *data, MDBX_cursor_op op); 6767 struct cursor_set_result { 6768 int err; 6769 bool exact; 6770 }; 6771 6772 static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key, 6773 MDBX_val *data, MDBX_cursor_op op); 6774 static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key, 6775 MDBX_val *data); 6776 static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, 6777 MDBX_val *data); 6778 6779 static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn, 6780 MDBX_dbi dbi); 6781 static int __must_check_result cursor_xinit0(MDBX_cursor *mc); 6782 static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, 6783 const MDBX_page *mp); 6784 static int __must_check_result cursor_xinit2(MDBX_cursor *mc, 6785 MDBX_xcursor *src_mx, 6786 bool new_dupdata); 6787 static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); 6788 6789 static int __must_check_result drop_tree(MDBX_cursor *mc, 6790 const bool may_have_subDBs); 6791 static int __must_check_result fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); 6792 static int __must_check_result setup_dbx(MDBX_dbx *const dbx, 6793 const MDBX_db *const db, 6794 const unsigned pagesize); 6795 6796 static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, 6797 cmp_int_unaligned, cmp_lenfast; 6798 6799 static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags); 6800 static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags); 6801 6802 __cold const char *mdbx_liberr2str(int errnum) { 6803 /* Table of descriptions for MDBX errors */ 6804 static const char *const tbl[] = { 6805 "MDBX_KEYEXIST: Key/data pair already exists", 6806 "MDBX_NOTFOUND: No matching key/data pair found", 6807 "MDBX_PAGE_NOTFOUND: Requested page not found", 6808 "MDBX_CORRUPTED: Database is corrupted", 6809 "MDBX_PANIC: Environment had a fatal error", 6810 "MDBX_VERSION_MISMATCH: DB version mismatch with libmdbx", 6811 "MDBX_INVALID: File is not an MDBX file", 6812 "MDBX_MAP_FULL: Environment mapsize limit reached", 6813 "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)", 6814 "MDBX_READERS_FULL: Too many readers (maxreaders reached)", 6815 NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */, 6816 "MDBX_TXN_FULL: Transaction has too many dirty pages," 6817 " i.e. transaction is too big", 6818 "MDBX_CURSOR_FULL: Cursor stack limit reached - this usually indicates" 6819 " corruption, i.e. a branch-pages loop", 6820 "MDBX_PAGE_FULL: Internal error - Page has no more space", 6821 "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend" 6822 " mapping, e.g. since address space is unavailable or busy," 6823 " or the operating system does not support such operations", 6824 "MDBX_INCOMPATIBLE: Environment or database is not compatible" 6825 " with the requested operation or the specified flags", 6826 "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot," 6827 " e.g. a read-transaction is already running for the current thread", 6828 "MDBX_BAD_TXN: Transaction is not valid for requested operation," 6829 " e.g.
it has errored and must be aborted, has a child, or is invalid", 6830 "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data" 6831 " for target database, or an invalid subDB name", 6832 "MDBX_BAD_DBI: The specified DBI-handle is invalid" 6833 " or changed by another thread/transaction", 6834 "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted", 6835 "MDBX_BUSY: Another write transaction is running," 6836 " or environment is already used while opening with MDBX_EXCLUSIVE flag", 6837 }; 6838 6839 if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) { 6840 int i = errnum - MDBX_KEYEXIST; 6841 return tbl[i]; 6842 } 6843 6844 switch (errnum) { 6845 case MDBX_SUCCESS: 6846 return "MDBX_SUCCESS: Successful"; 6847 case MDBX_EMULTIVAL: 6848 return "MDBX_EMULTIVAL: The specified key has" 6849 " more than one associated value"; 6850 case MDBX_EBADSIGN: 6851 return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)," 6852 " e.g. memory corruption or double-free"; 6853 case MDBX_WANNA_RECOVERY: 6854 return "MDBX_WANNA_RECOVERY: Database should be recovered," 6855 " but this could NOT be done automatically for now" 6856 " since it was opened in read-only mode"; 6857 case MDBX_EKEYMISMATCH: 6858 return "MDBX_EKEYMISMATCH: The given key value does not match the" 6859 " current cursor position"; 6860 case MDBX_TOO_LARGE: 6861 return "MDBX_TOO_LARGE: Database is too large for current system," 6862 " e.g. could NOT be mapped into RAM"; 6863 case MDBX_THREAD_MISMATCH: 6864 return "MDBX_THREAD_MISMATCH: A thread has attempted to use an object" 6865 " it does not own, e.g. a transaction started by another thread"; 6866 case MDBX_TXN_OVERLAPPING: 6867 return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" 6868 " the current thread"; 6869 default: 6870 return NULL; 6871 } 6872 } 6873 6874 __cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { 6875 const char *msg = mdbx_liberr2str(errnum); 6876 if (!msg && buflen > 0 && buflen < INT_MAX) { 6877 #if defined(_WIN32) || defined(_WIN64) 6878 const DWORD size = FormatMessageA( 6879 FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, 6880 errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, 6881 NULL); 6882 return size ?
buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; 6883 #elif defined(_GNU_SOURCE) && defined(__GLIBC__) 6884 /* GNU-specific */ 6885 if (errnum > 0) 6886 msg = strerror_r(errnum, buf, buflen); 6887 #elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) 6888 /* XSI-compliant */ 6889 if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0) 6890 msg = buf; 6891 #else 6892 if (errnum > 0) { 6893 msg = strerror(errnum); 6894 if (msg) { 6895 strncpy(buf, msg, buflen); 6896 msg = buf; 6897 } 6898 } 6899 #endif 6900 if (!msg) { 6901 (void)snprintf(buf, buflen, "error %d", errnum); 6902 msg = buf; 6903 } 6904 buf[buflen - 1] = '\0'; 6905 } 6906 return msg; 6907 } 6908 6909 __cold const char *mdbx_strerror(int errnum) { 6910 #if defined(_WIN32) || defined(_WIN64) 6911 static char buf[1024]; 6912 return mdbx_strerror_r(errnum, buf, sizeof(buf)); 6913 #else 6914 const char *msg = mdbx_liberr2str(errnum); 6915 if (!msg) { 6916 if (errnum > 0) 6917 msg = strerror(errnum); 6918 if (!msg) { 6919 static char buf[32]; 6920 (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum); 6921 msg = buf; 6922 } 6923 } 6924 return msg; 6925 #endif 6926 } 6927 6928 #if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */ 6929 const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) { 6930 const char *msg = mdbx_liberr2str(errnum); 6931 if (!msg && buflen > 0 && buflen < INT_MAX) { 6932 const DWORD size = FormatMessageA( 6933 FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, 6934 errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, 6935 NULL); 6936 if (!size) 6937 msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; 6938 else if (!CharToOemBuffA(buf, buf, size)) 6939 msg = "CharToOemBuffA() failed"; 6940 else 6941 msg = buf; 6942 } 6943 return msg; 6944 } 6945 6946 const char *mdbx_strerror_ANSI2OEM(int errnum) { 6947 static char buf[1024]; 6948 return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf)); 6949 } 6950 #endif /* Bit of madness for Windows */ 6951 6952 __cold void debug_log_va(int level, const char *function, int line, 6953 const char *fmt, va_list args) { 6954 if (debug_logger) 6955 debug_logger(level, function, line, fmt, args); 6956 else { 6957 #if defined(_WIN32) || defined(_WIN64) 6958 if (IsDebuggerPresent()) { 6959 int prefix_len = 0; 6960 char *prefix = nullptr; 6961 if (function && line > 0) 6962 prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); 6963 else if (function) 6964 prefix_len = osal_asprintf(&prefix, "%s: ", function); 6965 else if (line > 0) 6966 prefix_len = osal_asprintf(&prefix, "%d: ", line); 6967 if (prefix_len > 0 && prefix) { 6968 OutputDebugStringA(prefix); 6969 osal_free(prefix); 6970 } 6971 char *msg = nullptr; 6972 int msg_len = osal_vasprintf(&msg, fmt, args); 6973 if (msg_len > 0 && msg) { 6974 OutputDebugStringA(msg); 6975 osal_free(msg); 6976 } 6977 } 6978 #else 6979 if (function && line > 0) 6980 fprintf(stderr, "%s:%d ", function, line); 6981 else if (function) 6982 fprintf(stderr, "%s: ", function); 6983 else if (line > 0) 6984 fprintf(stderr, "%d: ", line); 6985 vfprintf(stderr, fmt, args); 6986 fflush(stderr); 6987 #endif 6988 } 6989 } 6990 6991 __cold void debug_log(int level, const char *function, int line, 6992 const char *fmt, ...) { 6993 va_list args; 6994 va_start(args, fmt); 6995 debug_log_va(level, function, line, fmt, args); 6996 va_end(args); 6997 } 6998 6999 /* Dump a key in ascii or hexadecimal. 
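 *
 * A usage sketch (illustrative only; any MDBX_val works the same way):
 *   char buf[128];
 *   MDBX_val key = { .iov_base = "example", .iov_len = 7 };
 *   puts(mdbx_dump_val(&key, buf, sizeof(buf)));   // prints: example
 * Non-printable data is rendered as hex between angle brackets, so the two
 * bytes {0x01, 0xff} come out as <01ff>.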
*/ 7000 const char *mdbx_dump_val(const MDBX_val *key, char *const buf, 7001 const size_t bufsize) { 7002 if (!key) 7003 return "<null>"; 7004 if (!key->iov_len) 7005 return "<empty>"; 7006 if (!buf || bufsize < 4) 7007 return nullptr; 7008 7009 bool is_ascii = true; 7010 const uint8_t *const data = key->iov_base; 7011 for (unsigned i = 0; i < key->iov_len; i++) 7012 if (data[i] < ' ' || data[i] > '~') { 7013 is_ascii = false; 7014 break; 7015 } 7016 7017 if (is_ascii) { 7018 int len = 7019 snprintf(buf, bufsize, "%.*s", 7020 (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data); 7021 assert(len > 0 && (unsigned)len < bufsize); 7022 (void)len; 7023 } else { 7024 char *const detent = buf + bufsize - 2; 7025 char *ptr = buf; 7026 *ptr++ = '<'; 7027 for (unsigned i = 0; i < key->iov_len; i++) { 7028 const ptrdiff_t left = detent - ptr; 7029 assert(left > 0); 7030 int len = snprintf(ptr, left, "%02x", data[i]); 7031 if (len < 0 || len >= left) 7032 break; 7033 ptr += len; 7034 } 7035 if (ptr < detent) { 7036 ptr[0] = '>'; 7037 ptr[1] = '\0'; 7038 } 7039 } 7040 return buf; 7041 } 7042 7043 /*------------------------------------------------------------------------------ 7044 LY: debug stuff */ 7045 7046 static const char *leafnode_type(MDBX_node *n) { 7047 static const char *const tp[2][2] = {{"", ": DB"}, 7048 {": sub-page", ": sub-DB"}}; 7049 return (node_flags(n) & F_BIGDATA) 7050 ? ": large page" 7051 : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)]; 7052 } 7053 7054 /* Display all the keys in the page. */ 7055 MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { 7056 pgno_t pgno = mp->mp_pgno; 7057 const char *type; 7058 MDBX_node *node; 7059 unsigned i, nkeys, nsize, total = 0; 7060 MDBX_val key; 7061 DKBUF; 7062 7063 switch (PAGETYPE_WHOLE(mp)) { 7064 case P_BRANCH: 7065 type = "Branch page"; 7066 break; 7067 case P_LEAF: 7068 type = "Leaf page"; 7069 break; 7070 case P_LEAF | P_SUBP: 7071 type = "Leaf sub-page"; 7072 break; 7073 case P_LEAF | P_LEAF2: 7074 type = "Leaf2 page"; 7075 break; 7076 case P_LEAF | P_LEAF2 | P_SUBP: 7077 type = "Leaf2 sub-page"; 7078 break; 7079 case P_OVERFLOW: 7080 VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); 7081 return; 7082 case P_META: 7083 VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, 7084 unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); 7085 return; 7086 default: 7087 VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); 7088 return; 7089 } 7090 7091 nkeys = page_numkeys(mp); 7092 VERBOSE("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); 7093 7094 for (i = 0; i < nkeys; i++) { 7095 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ 7096 key.iov_len = nsize = mp->mp_leaf2_ksize; 7097 key.iov_base = page_leaf2key(mp, i, nsize); 7098 total += nsize; 7099 VERBOSE("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); 7100 continue; 7101 } 7102 node = page_node(mp, i); 7103 key.iov_len = node_ks(node); 7104 key.iov_base = node->mn_data; 7105 nsize = (unsigned)(NODESIZE + key.iov_len); 7106 if (IS_BRANCH(mp)) { 7107 VERBOSE("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), 7108 DKEY(&key)); 7109 total += nsize; 7110 } else { 7111 if (node_flags(node) & F_BIGDATA) 7112 nsize += sizeof(pgno_t); 7113 else 7114 nsize += (unsigned)node_ds(node); 7115 total += nsize; 7116 nsize += sizeof(indx_t); 7117 VERBOSE("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), 7118 leafnode_type(node)); 7119 } 7120 total = EVEN(total); 7121 } 7122 VERBOSE("Total: header %u + 
contents %u + unused %u\n", 7123 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, 7124 page_room(mp)); 7125 } 7126 7127 /*----------------------------------------------------------------------------*/ 7128 7129 /* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */ 7130 #define XCURSOR_INITED(mc) \ 7131 ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 7132 7133 /* Update sub-page pointer, if any, in mc->mc_xcursor. 7134 * Needed when the node which contains the sub-page may have moved. 7135 * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */ 7136 #define XCURSOR_REFRESH(mc, mp, ki) \ 7137 do { \ 7138 MDBX_page *xr_pg = (mp); \ 7139 MDBX_node *xr_node = page_node(xr_pg, ki); \ 7140 if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ 7141 (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ 7142 } while (0) 7143 7144 MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { 7145 for (MDBX_cursor *scan = mc->mc_txn->mt_cursors[mc->mc_dbi]; scan; 7146 scan = scan->mc_next) 7147 if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan)) 7148 return true; 7149 return false; 7150 } 7151 7152 /* Perform act while tracking temporary cursor mn */ 7153 #define WITH_CURSOR_TRACKING(mn, act) \ 7154 do { \ 7155 cASSERT(&(mn), \ 7156 mn.mc_txn->mt_cursors != NULL /* must not be a rdonly txn */); \ 7157 cASSERT(&(mn), !cursor_is_tracked(&(mn))); \ 7158 MDBX_cursor mc_dummy; \ 7159 MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ 7160 MDBX_cursor *tracked = &(mn); \ 7161 if ((mn).mc_flags & C_SUB) { \ 7162 mc_dummy.mc_flags = C_INITIALIZED; \ 7163 mc_dummy.mc_top = 0; \ 7164 mc_dummy.mc_snum = 0; \ 7165 mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ 7166 tracked = &mc_dummy; \ 7167 } \ 7168 tracked->mc_next = *tracking_head; \ 7169 *tracking_head = tracked; \ 7170 { act; } \ 7171 *tracking_head = tracked->mc_next; \ 7172 } while (0) 7173 7174 int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, 7175 const MDBX_val *b) { 7176 eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); 7177 return txn->mt_dbxs[dbi].md_cmp(a, b); 7178 } 7179 7180 int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, 7181 const MDBX_val *b) { 7182 eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); 7183 return txn->mt_dbxs[dbi].md_dcmp(a, b); 7184 } 7185 7186 /* Allocate memory for a page. 7187 * Re-use old malloc'ed pages first for singletons, otherwise just malloc. 7188 * Set MDBX_TXN_ERROR on failure. */ 7189 static MDBX_page *page_malloc(MDBX_txn *txn, unsigned num) { 7190 MDBX_env *env = txn->mt_env; 7191 MDBX_page *np = env->me_dp_reserve; 7192 size_t size = env->me_psize; 7193 if (likely(num == 1 && np)) { 7194 eASSERT(env, env->me_dp_reserve_len > 0); 7195 MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); 7196 VALGRIND_MEMPOOL_ALLOC(env, np, size); 7197 VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); 7198 env->me_dp_reserve = np->mp_next; 7199 env->me_dp_reserve_len -= 1; 7200 } else { 7201 size = pgno2bytes(env, num); 7202 np = osal_malloc(size); 7203 if (unlikely(!np)) { 7204 txn->mt_flags |= MDBX_TXN_ERROR; 7205 return np; 7206 } 7207 VALGRIND_MEMPOOL_ALLOC(env, np, size); 7208 } 7209 7210 if ((env->me_flags & MDBX_NOMEMINIT) == 0) { 7211 /* For a single page alloc, we init everything after the page header.
7212 * For multi-page, we init the final page; if the caller needed that 7213 * many pages they will be filling in at least up to the last page. */ 7214 size_t skip = PAGEHDRSZ; 7215 if (num > 1) 7216 skip += pgno2bytes(env, num - 1); 7217 memset((char *)np + skip, 0, size - skip); 7218 } 7219 #if MDBX_DEBUG 7220 np->mp_pgno = 0; 7221 #endif 7222 VALGRIND_MAKE_MEM_UNDEFINED(np, size); 7223 np->mp_flags = 0; 7224 np->mp_pages = num; 7225 return np; 7226 } 7227 7228 /* Free a shadow dirty page */ 7229 static void dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { 7230 VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); 7231 MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); 7232 if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) 7233 memset(dp, -1, pgno2bytes(env, npages)); 7234 if (npages == 1 && 7235 env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { 7236 MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), 7237 pgno2bytes(env, npages) - 7238 sizeof(dp->mp_next)); 7239 dp->mp_next = env->me_dp_reserve; 7240 VALGRIND_MEMPOOL_FREE(env, dp); 7241 env->me_dp_reserve = dp; 7242 env->me_dp_reserve_len += 1; 7243 } else { 7244 /* large pages just get freed directly */ 7245 VALGRIND_MEMPOOL_FREE(env, dp); 7246 osal_free(dp); 7247 } 7248 } 7249 7250 /* Return all dirty pages to dpage list */ 7251 static void dlist_free(MDBX_txn *txn) { 7252 MDBX_env *env = txn->mt_env; 7253 MDBX_dpl *const dl = txn->tw.dirtylist; 7254 7255 for (unsigned i = 1; i <= dl->length; i++) 7256 dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); 7257 7258 dpl_clear(dl); 7259 } 7260 7261 static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) { 7262 cASSERT(mc, (mc->mc_flags & C_SUB) != 0); 7263 MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); 7264 MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); 7265 cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); 7266 cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); 7267 return couple->outer.mc_db; 7268 } 7269 7270 MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { 7271 const MDBX_dpl *const dl = txn->tw.dirtylist; 7272 assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); 7273 tASSERT(txn, txn->tw.dirtyroom + dl->length == 7274 (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom 7275 : txn->mt_env->me_options.dp_limit)); 7276 7277 if (!AUDIT_ENABLED()) 7278 return true; 7279 7280 unsigned loose = 0, pages = 0; 7281 for (unsigned i = dl->length; i > 0; --i) { 7282 const MDBX_page *const dp = dl->items[i].ptr; 7283 if (!dp) 7284 continue; 7285 7286 tASSERT(txn, dp->mp_pgno == dl->items[i].pgno); 7287 if (unlikely(dp->mp_pgno != dl->items[i].pgno)) 7288 return false; 7289 7290 const uint32_t age = dpl_age(txn, i); 7291 tASSERT(txn, age < UINT32_MAX / 3); 7292 if (unlikely(age > UINT32_MAX / 3)) 7293 return false; 7294 7295 tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); 7296 if (dp->mp_flags == P_LOOSE) { 7297 loose += 1; 7298 } else if (unlikely(!IS_MODIFIABLE(txn, dp))) 7299 return false; 7300 7301 const unsigned num = dpl_npages(dl, i); 7302 pages += num; 7303 tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); 7304 if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) 7305 return false; 7306 7307 if (i < dl->sorted) { 7308 tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); 7309 if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) 7310 return false; 7311 } 7312 7313 const unsigned rpa = 7314 pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, txn->mt_next_pgno); 7315 tASSERT(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || 7316 txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); 7317 if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && 7318 unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) 7319 return false; 7320 if (num > 1) { 7321 const unsigned rpb = pnl_search(txn->tw.reclaimed_pglist, 7322 dp->mp_pgno + num - 1, txn->mt_next_pgno); 7323 tASSERT(txn, rpa == rpb); 7324 if (unlikely(rpa != rpb)) 7325 return false; 7326 } 7327 } 7328 7329 tASSERT(txn, loose == txn->tw.loose_count); 7330 if (unlikely(loose != txn->tw.loose_count)) 7331 return false; 7332 7333 tASSERT(txn, pages == dl->pages_including_loose); 7334 if (unlikely(pages != dl->pages_including_loose)) 7335 return false; 7336 7337 for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { 7338 const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); 7339 tASSERT(txn, !dp); 7340 if (unlikely(dp)) 7341 return false; 7342 } 7343 7344 return true; 7345 } 7346 7347 #if MDBX_ENABLE_REFUND 7348 static void refund_reclaimed(MDBX_txn *txn) { 7349 /* Scanning in descending order */ 7350 pgno_t next_pgno = txn->mt_next_pgno; 7351 const MDBX_PNL pnl = txn->tw.reclaimed_pglist; 7352 tASSERT(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); 7353 #if MDBX_PNL_ASCENDING 7354 unsigned i = MDBX_PNL_SIZE(pnl); 7355 tASSERT(txn, pnl[i] == next_pgno - 1); 7356 while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) 7357 ; 7358 MDBX_PNL_SIZE(pnl) = i; 7359 #else 7360 unsigned i = 1; 7361 tASSERT(txn, pnl[i] == next_pgno - 1); 7362 unsigned len = MDBX_PNL_SIZE(pnl); 7363 while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) 7364 ; 7365 MDBX_PNL_SIZE(pnl) = len -= i - 1; 7366 for (unsigned move = 0; move < len; ++move) 7367 pnl[1 + move] = pnl[i + move]; 7368 #endif 7369 VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, 7370 txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); 7371 txn->mt_next_pgno = next_pgno; 7372 tASSERT(txn, 7373 pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno - 1)); 7374 } 7375
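/* A worked example for refund_reclaimed() above (illustrative): with
 * mt_next_pgno == 100 and reclaimed_pglist ending in the contiguous run
 * {..., 97, 98, 99}, the scan strips those three items from the list and
 * rewinds mt_next_pgno to 97, i.e. the pages are "refunded" to the
 * unallocated tail of the database instead of lingering in the GC. */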
7376 static void refund_loose(MDBX_txn *txn) { 7377 tASSERT(txn, txn->tw.loose_pages != nullptr); 7378 tASSERT(txn, txn->tw.loose_count > 0); 7379 7380 MDBX_dpl *const dl = txn->tw.dirtylist; 7381 tASSERT(txn, dl->length >= txn->tw.loose_count); 7382 7383 pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; 7384 MDBX_PNL suitable = onstack; 7385 7386 if (dl->length - dl->sorted > txn->tw.loose_count) { 7387 /* The dirty list is unsorted here, hence useless for a direct scan. */ 7388 if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { 7389 suitable = pnl_alloc(txn->tw.loose_count); 7390 if (unlikely(!suitable)) 7391 return /* this is not a reason to fail the transaction */; 7392 } 7393 7394 /* Collect loose-pages which may be refunded. */ 7395 tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); 7396 pgno_t most = MIN_PAGENO; 7397 unsigned w = 0; 7398 for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { 7399 tASSERT(txn, lp->mp_flags == P_LOOSE); 7400 tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); 7401 if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { 7402 tASSERT(txn, 7403 w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) 7404 : MDBX_PNL_ALLOCLEN(suitable))); 7405 suitable[++w] = lp->mp_pgno; 7406 most = (lp->mp_pgno > most) ? lp->mp_pgno : most; 7407 } 7408 } 7409 7410 if (most + 1 == txn->mt_next_pgno) { 7411 /* Sort suitable list and refund pages at the tail. */ 7412 MDBX_PNL_SIZE(suitable) = w; 7413 pnl_sort(suitable, MAX_PAGENO + 1); 7414 7415 /* Scanning in descending order */ 7416 const int step = MDBX_PNL_ASCENDING ? -1 : 1; 7417 const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; 7418 const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1; 7419 tASSERT(txn, suitable[begin] >= suitable[end - step]); 7420 tASSERT(txn, most == suitable[begin]); 7421 7422 for (int i = begin + step; i != end; i += step) { 7423 if (suitable[i] != most - 1) 7424 break; 7425 most -= 1; 7426 } 7427 const unsigned refunded = txn->mt_next_pgno - most; 7428 DEBUG("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, 7429 most, txn->mt_next_pgno); 7430 txn->tw.loose_count -= refunded; 7431 txn->tw.dirtyroom += refunded; 7432 dl->pages_including_loose -= refunded; 7433 assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); 7434 txn->mt_next_pgno = most; 7435 7436 /* Filter-out dirty list */ 7437 unsigned r = 0; 7438 w = 0; 7439 if (dl->sorted) { 7440 do { 7441 if (dl->items[++r].pgno < most) { 7442 if (++w != r) 7443 dl->items[w] = dl->items[r]; 7444 } 7445 } while (r < dl->sorted); 7446 dl->sorted = w; 7447 } 7448 while (r < dl->length) { 7449 if (dl->items[++r].pgno < most) { 7450 if (++w != r) 7451 dl->items[w] = dl->items[r]; 7452 } 7453 } 7454 dpl_setlen(dl, w); 7455 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 7456 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom 7457 : txn->mt_env->me_options.dp_limit)); 7458 7459 goto unlink_loose; 7460 } 7461 } else { 7462 /* Dirtylist is mostly sorted, just refund loose pages at the end. */ 7463 dpl_sort(txn); 7464 tASSERT(txn, 7465 dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); 7466 tASSERT(txn, dl->sorted == dl->length); 7467 7468 /* Scan the dirtylist tail-forward and cut off suitable pages.
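 * For instance (illustrative): with mt_next_pgno == 13 and a sorted dirty
 * tail of loose pages {11, 12}, the loop below pops both items and leaves
 * mt_next_pgno == 11, after which the refunded pages are accounted for and
 * unlinked from the loose-pages chain.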
*/ 7469 unsigned n; 7470 for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && 7471 dl->items[n].ptr->mp_flags == P_LOOSE; 7472 --n) { 7473 tASSERT(txn, n > 0); 7474 MDBX_page *dp = dl->items[n].ptr; 7475 DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno); 7476 tASSERT(txn, dp->mp_pgno == dl->items[n].pgno); 7477 txn->mt_next_pgno -= 1; 7478 } 7479 dpl_setlen(dl, n); 7480 7481 if (dl->sorted != dl->length) { 7482 const unsigned refunded = dl->sorted - dl->length; 7483 dl->sorted = dl->length; 7484 txn->tw.loose_count -= refunded; 7485 txn->tw.dirtyroom += refunded; 7486 dl->pages_including_loose -= refunded; 7487 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 7488 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom 7489 : txn->mt_env->me_options.dp_limit)); 7490 7491 /* Filter-out loose chain & dispose refunded pages. */ 7492 unlink_loose: 7493 for (MDBX_page **link = &txn->tw.loose_pages; *link;) { 7494 MDBX_page *dp = *link; 7495 tASSERT(txn, dp->mp_flags == P_LOOSE); 7496 if (txn->mt_next_pgno > dp->mp_pgno) { 7497 link = &dp->mp_next; 7498 } else { 7499 *link = dp->mp_next; 7500 if ((txn->mt_flags & MDBX_WRITEMAP) == 0) 7501 dpage_free(txn->mt_env, dp, 1); 7502 } 7503 } 7504 } 7505 } 7506 7507 tASSERT(txn, dirtylist_check(txn)); 7508 if (suitable != onstack) 7509 pnl_free(suitable); 7510 txn->tw.loose_refund_wl = txn->mt_next_pgno; 7511 } 7512 7513 static bool txn_refund(MDBX_txn *txn) { 7514 const pgno_t before = txn->mt_next_pgno; 7515 7516 if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) 7517 refund_loose(txn); 7518 7519 while (true) { 7520 if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || 7521 MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) 7522 break; 7523 7524 refund_reclaimed(txn); 7525 if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) 7526 break; 7527 7528 const pgno_t memo = txn->mt_next_pgno; 7529 refund_loose(txn); 7530 if (memo == txn->mt_next_pgno) 7531 break; 7532 } 7533 7534 if (before == txn->mt_next_pgno) 7535 return false; 7536 7537 if (txn->tw.spill_pages) 7538 /* Squash deleted pagenums if we refunded any */ 7539 spill_purge(txn); 7540 7541 return true; 7542 } 7543 #else /* MDBX_ENABLE_REFUND */ 7544 static __inline bool txn_refund(MDBX_txn *txn) { 7545 (void)txn; 7546 /* No online auto-compactification. 
*/ 7547 return false; 7548 } 7549 #endif /* MDBX_ENABLE_REFUND */ 7550 7551 __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, 7552 unsigned npages) { 7553 MDBX_env *const env = txn->mt_env; 7554 DEBUG("kill %u page(s) %" PRIaPGNO, npages, pgno); 7555 eASSERT(env, pgno >= NUM_METAS && npages); 7556 if (!IS_FROZEN(txn, mp)) { 7557 const size_t bytes = pgno2bytes(env, npages); 7558 memset(mp, -1, bytes); 7559 mp->mp_pgno = pgno; 7560 if ((env->me_flags & MDBX_WRITEMAP) == 0) 7561 osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); 7562 } else { 7563 struct iovec iov[MDBX_COMMIT_PAGES]; 7564 iov[0].iov_len = env->me_psize; 7565 iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; 7566 size_t iov_off = pgno2bytes(env, pgno); 7567 unsigned n = 1; 7568 while (--npages) { 7569 iov[n] = iov[0]; 7570 if (++n == MDBX_COMMIT_PAGES) { 7571 osal_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, 7572 pgno2bytes(env, MDBX_COMMIT_PAGES)); 7573 iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); 7574 n = 0; 7575 } 7576 } 7577 osal_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); 7578 } 7579 } 7580 7581 /* Remove page from dirty list */ 7582 static __inline void page_wash(MDBX_txn *txn, const unsigned di, 7583 MDBX_page *const mp, const unsigned npages) { 7584 tASSERT(txn, di && di <= txn->tw.dirtylist->length && 7585 txn->tw.dirtylist->items[di].ptr == mp); 7586 dpl_remove_ex(txn, di, npages); 7587 txn->tw.dirtyroom++; 7588 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 7589 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom 7590 : txn->mt_env->me_options.dp_limit)); 7591 mp->mp_txnid = INVALID_TXNID; 7592 mp->mp_flags = P_BAD; 7593 VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); 7594 if (txn->mt_flags & MDBX_WRITEMAP) { 7595 VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), 7596 pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); 7597 MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), 7598 pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); 7599 } else 7600 dpage_free(txn->mt_env, mp, npages); 7601 } 7602 7603 /* Retire, loosen or free a single page. 7604 * 7605 * For dirty pages, single pages are saved to a list for future reuse in this 7606 * same txn: such a page has been pulled from the GC and already resides on the 7607 * dirty list, but has been deleted. Use these pages first before pulling again from the GC. 7608 * 7609 * If the page wasn't dirtied in this txn, just add it 7610 * to this txn's free list. */ 7611 static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, 7612 MDBX_page *mp /* maybe null */, 7613 unsigned pageflags /* maybe unknown/zero */) { 7614 int rc; 7615 MDBX_txn *const txn = mc->mc_txn; 7616 tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); 7617 7618 /* While deleting entire subtrees it is reasonable and possible to avoid 7619 * reading leaf pages, i.e. to significantly reduce hard page-faults & IOPs: 7620 * - mp is null, i.e. the page has not yet been read; 7621 * - the pagetype is known and the P_LEAF bit is set; 7622 * - we can determine the page status by scanning the lists 7623 * of dirty and spilled pages. 7624 * 7625 * On the other hand, this could be suboptimal in WRITEMAP mode, since it 7626 * requires maintaining the list of dirty pages and avoiding explicit spilling. 7627 * So, for flexibility and to avoid extra internal dependencies, we just 7628 * fall back to reading the page if the dirty list has not been allocated yet.
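 *
 * A sketch of the page-state taxonomy used below (paraphrased, not upstream
 * docs): a page is "frozen" when it belongs to a committed snapshot and was
 * not touched by this txn or its parents, "dirty" when it sits on this txn's
 * dirty list, "spilled" when it was written out to the DB file during this
 * txn to reclaim memory, and "shadowed" when a parent txn owns the modified
 * copy; di/si below are the matching 1-based positions in the dirty and
 * spill lists (0 means absent).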
*/ 7629 unsigned di = 0, si = 0, npages = 1; 7630 bool is_frozen = false, is_spilled = false, is_shadowed = false; 7631 if (unlikely(!mp)) { 7632 if (ASSERT_ENABLED() && pageflags) { 7633 pgr_t check; 7634 check = page_get_any(mc, pgno, txn->mt_front); 7635 if (unlikely(check.err != MDBX_SUCCESS)) 7636 return check.err; 7637 tASSERT(txn, 7638 (check.page->mp_flags & ~P_LEAF2) == (pageflags & ~P_FROZEN)); 7639 tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); 7640 } 7641 if (pageflags & P_FROZEN) { 7642 is_frozen = true; 7643 if (ASSERT_ENABLED()) { 7644 for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { 7645 tASSERT(txn, !search_spilled(scan, pgno)); 7646 tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); 7647 } 7648 } 7649 goto status_done; 7650 } else if (pageflags && txn->tw.dirtylist) { 7651 if ((di = dpl_exist(txn, pgno)) != 0) { 7652 mp = txn->tw.dirtylist->items[di].ptr; 7653 tASSERT(txn, IS_MODIFIABLE(txn, mp)); 7654 goto status_done; 7655 } 7656 if ((si = search_spilled(txn, pgno)) != 0) { 7657 is_spilled = true; 7658 goto status_done; 7659 } 7660 for (MDBX_txn *parent = txn->mt_parent; parent; 7661 parent = parent->mt_parent) { 7662 if (dpl_exist(parent, pgno)) { 7663 is_shadowed = true; 7664 goto status_done; 7665 } 7666 if (search_spilled(parent, pgno)) { 7667 is_spilled = true; 7668 goto status_done; 7669 } 7670 } 7671 is_frozen = true; 7672 goto status_done; 7673 } 7674 7675 pgr_t pg = page_get_any(mc, pgno, txn->mt_front); 7676 if (unlikely(pg.err != MDBX_SUCCESS)) 7677 return pg.err; 7678 mp = pg.page; 7679 tASSERT(txn, !pageflags || mp->mp_flags == pageflags); 7680 pageflags = mp->mp_flags; 7681 } 7682 7683 is_frozen = IS_FROZEN(txn, mp); 7684 if (!is_frozen) { 7685 const bool is_dirty = IS_MODIFIABLE(txn, mp); 7686 is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); 7687 is_shadowed = IS_SHADOWED(txn, mp); 7688 if (is_dirty) { 7689 tASSERT(txn, !is_spilled); 7690 tASSERT(txn, !search_spilled(txn, pgno)); 7691 tASSERT(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || 7692 (txn->mt_flags & MDBX_WRITEMAP)); 7693 } else { 7694 tASSERT(txn, !debug_dpl_find(txn, pgno)); 7695 } 7696 7697 di = is_dirty ? dpl_exist(txn, pgno) : 0; 7698 si = is_spilled ? 
search_spilled(txn, pgno) : 0;
7699     tASSERT(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP));
7700   } else {
7701     tASSERT(txn, !IS_MODIFIABLE(txn, mp));
7702     tASSERT(txn, !IS_SPILLED(txn, mp));
7703     tASSERT(txn, !IS_SHADOWED(txn, mp));
7704   }
7705 
7706 status_done:
7707   if (likely((pageflags & P_OVERFLOW) == 0)) {
7708     STATIC_ASSERT(P_BRANCH == 1);
7709     const bool is_branch = pageflags & P_BRANCH;
7710     if (unlikely(mc->mc_flags & C_SUB)) {
7711       MDBX_db *outer = outer_db(mc);
7712       cASSERT(mc, !is_branch || outer->md_branch_pages > 0);
7713       outer->md_branch_pages -= is_branch;
7714       cASSERT(mc, is_branch || outer->md_leaf_pages > 0);
7715       outer->md_leaf_pages -= 1 - is_branch;
7716     }
7717     cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0);
7718     mc->mc_db->md_branch_pages -= is_branch;
7719     cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0);
7720     mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0;
7721   } else {
7722     npages = mp->mp_pages;
7723     cASSERT(mc, mc->mc_db->md_overflow_pages >= npages);
7724     mc->mc_db->md_overflow_pages -= npages;
7725   }
7726 
7727   if (is_frozen) {
7728   retire:
7729     DEBUG("retire %u page %" PRIaPGNO, npages, pgno);
7730     rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages);
7731     tASSERT(txn, dirtylist_check(txn));
7732     return rc;
7733   }
7734 
7735   /* Returning pages to the unallocated "tail" of the database.
7736    * The content of the pages is not destroyed, and for nested transactions
7737    * the boundary of the unallocated "tail" is moved only on their commit. */
7738   if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) {
7739     const char *kind = nullptr;
7740     if (di) {
7741       /* The page was dirtied in this transaction, but before that it could
7742        * have been allocated, dirtied and spilled in one of the parent
7743        * transactions. It MAY be pushed back into the unallocated tail. */
7744       kind = "dirty";
7745       /* Remove from dirty list */
7746       page_wash(txn, di, mp, npages);
7747     } else if (si) {
7748       /* The page was spilled in this transaction, i.e. it was allocated
7749        * and dirtied in this or one of the parent transactions.
7750        * It MAY be pushed back into the unallocated tail. */
7751       kind = "spilled";
7752       spill_remove(txn, si, npages);
7753     } else if ((txn->mt_flags & MDBX_WRITEMAP)) {
7754       kind = "writemap";
7755       tASSERT(txn, mp && IS_MODIFIABLE(txn, mp));
7756     } else {
7757       /* The page was allocated, dirtied and possibly spilled in one of
7758        * the parent transactions.
7759        * It MAY be pushed back into the unallocated tail.
 */
7760       kind = "parent's";
7761       if (ASSERT_ENABLED() && mp) {
7762         kind = nullptr;
7763         for (MDBX_txn *parent = txn->mt_parent; parent;
7764              parent = parent->mt_parent) {
7765           if (search_spilled(parent, pgno)) {
7766             kind = "parent-spilled";
7767             tASSERT(txn, is_spilled);
7768             break;
7769           }
7770           if (mp == debug_dpl_find(parent, pgno)) {
7771             kind = "parent-dirty";
7772             tASSERT(txn, !is_spilled);
7773             break;
7774           }
7775         }
7776         tASSERT(txn, kind != nullptr);
7777       }
7778       tASSERT(txn, is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp)));
7779     }
7780     DEBUG("refunded %u %s page %" PRIaPGNO, npages, kind, pgno);
7781     txn->mt_next_pgno = pgno;
7782     txn_refund(txn);
7783     return MDBX_SUCCESS;
7784   }
7785 
7786   if (di) {
7787     /* Dirty page from this transaction */
7788     /* If suitable we can reuse it through loose list */
7789     if (likely(npages == 1 &&
7790                txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit &&
7791                (!MDBX_ENABLE_REFUND ||
7792                 /* skip pages near to the end in favor of compactification */
7793                 txn->mt_next_pgno >
7794                     pgno + txn->mt_env->me_options.dp_loose_limit ||
7795                 txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) {
7796       DEBUG("loosen dirty page %" PRIaPGNO, pgno);
7797       mp->mp_flags = P_LOOSE;
7798       mp->mp_next = txn->tw.loose_pages;
7799       txn->tw.loose_pages = mp;
7800       txn->tw.loose_count++;
7801 #if MDBX_ENABLE_REFUND
7802       txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
7803                                     ? pgno + 2
7804                                     : txn->tw.loose_refund_wl;
7805 #endif /* MDBX_ENABLE_REFUND */
7806       if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
7807         memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ);
7808       VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
7809                                  txn->mt_env->me_psize - PAGEHDRSZ);
7810       MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
7811                                      txn->mt_env->me_psize - PAGEHDRSZ);
7812       return MDBX_SUCCESS;
7813     }
7814 
7815 #if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__)
7816     if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
7817 #endif
7818     {
7819       /* The page could have been modified in one of the parent transactions,
7820        * including being spilled out later and then loaded and modified again.
7821      * In both cases it must not be wiped on disk nor marked as inaccessible
7822      * for asan and/or valgrind */
7823     for (MDBX_txn *parent = txn->mt_parent;
7824          parent && (parent->mt_flags & MDBX_TXN_SPILLS);
7825          parent = parent->mt_parent) {
7826       if (intersect_spilled(parent, pgno, npages))
7827         goto skip_invalidate;
7828       if (dpl_intersect(parent, pgno, npages))
7829         goto skip_invalidate;
7830     }
7831 
7832 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
7833     if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
7834 #endif
7835       kill_page(txn, mp, pgno, npages);
7836     if (!(txn->mt_flags & MDBX_WRITEMAP)) {
7837       VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)),
7838                                  pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
7839       MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)),
7840                                      pgno2bytes(txn->mt_env, npages) -
7841                                          PAGEHDRSZ);
7842     }
7843   }
7844 skip_invalidate:
7845   /* Remove from dirty list */
7846   page_wash(txn, di, mp, npages);
7847 
7848 reclaim:
7849   DEBUG("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno);
7850   rc = pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages);
7851   tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist,
7852                                    txn->mt_next_pgno - MDBX_ENABLE_REFUND));
7853   tASSERT(txn, dirtylist_check(txn));
7854   return rc;
7855   }
7856 
7857   if (si) {
7858     /* Page was spilled in this txn */
7859     spill_remove(txn, si, npages);
7860     /* The page could have been allocated and then spilled in this
7861      * transaction, in which case it must go into the reclaimed list.
7862      * Or it could have been allocated in one of the parent transactions and
7863      * then spilled in this one, in which case it must go into the retired
7864      * list for subsequent filtering at commit time. */
7865     for (MDBX_txn *parent = txn->mt_parent; parent;
7866          parent = parent->mt_parent) {
7867       if (dpl_exist(parent, pgno))
7868         goto retire;
7869     }
7870     /* The page was definitely allocated in this transaction
7871      * and now can be reused. */
7872     goto reclaim;
7873   }
7874 
7875   if (is_shadowed) {
7876     /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
7877     if (ASSERT_ENABLED()) {
7878       const MDBX_page *parent_dp = nullptr;
7879       /* Check parent(s)'s dirty lists. */
7880       for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
7881            parent = parent->mt_parent) {
7882         tASSERT(txn, !search_spilled(parent, pgno));
7883         parent_dp = debug_dpl_find(parent, pgno);
7884       }
7885       tASSERT(txn, parent_dp && (!mp || parent_dp == mp));
7886     }
7887     /* The page was allocated in a parent transaction and now can be
7888      * reused, but only within this transaction or its children.
7889      */
7890     goto reclaim;
7891   }
7892 
7893   /* The page may be part of an MVCC snapshot that is still visible to
7894    * readers, or it could have been allocated and then spilled in one of
7895    * the parent transactions. So for now we put it into the retired list,
7896    * which will be filtered against the dirty and spilled lists of parent
7897    * transactions at the commit of child transactions, or else will be
7898    * written to the GC unchanged.
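   *
   * (Put differently, as an informal gloss: tw.reclaimed_pglist collects
   * pages that are provably invisible to every reader and parent
   * transaction, so they may be reused immediately within this transaction,
   * while tw.retired_pages collects pages that may still be visible to
   * somebody and therefore can be recycled only via the GC after commit.)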
*/ 7899 goto retire; 7900 } 7901 7902 static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { 7903 return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); 7904 } 7905 7906 struct iov_ctx { 7907 unsigned iov_items; 7908 size_t iov_bytes; 7909 size_t iov_off; 7910 pgno_t flush_begin; 7911 pgno_t flush_end; 7912 struct iovec iov[MDBX_COMMIT_PAGES]; 7913 }; 7914 7915 static __inline void iov_init(MDBX_txn *const txn, struct iov_ctx *ctx) { 7916 ctx->flush_begin = MAX_PAGENO; 7917 ctx->flush_end = MIN_PAGENO; 7918 ctx->iov_items = 0; 7919 ctx->iov_bytes = 0; 7920 ctx->iov_off = 0; 7921 (void)txn; 7922 } 7923 7924 static __inline void iov_done(MDBX_txn *const txn, struct iov_ctx *ctx) { 7925 tASSERT(txn, ctx->iov_items == 0); 7926 #if defined(__linux__) || defined(__gnu_linux__) 7927 MDBX_env *const env = txn->mt_env; 7928 if (!(txn->mt_flags & MDBX_WRITEMAP) && linux_kernel_version < 0x02060b00) 7929 /* Linux kernels older than version 2.6.11 ignore the addr and nbytes 7930 * arguments, making this function fairly expensive. Therefore, the 7931 * whole cache is always flushed. */ 7932 osal_flush_incoherent_mmap( 7933 env->me_map + pgno2bytes(env, ctx->flush_begin), 7934 pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); 7935 #endif /* Linux */ 7936 } 7937 7938 static int iov_write(MDBX_txn *const txn, struct iov_ctx *ctx) { 7939 tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); 7940 tASSERT(txn, ctx->iov_items > 0); 7941 7942 MDBX_env *const env = txn->mt_env; 7943 int rc; 7944 if (likely(ctx->iov_items == 1)) { 7945 eASSERT(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); 7946 rc = osal_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, 7947 ctx->iov_off); 7948 } else { 7949 rc = osal_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, 7950 ctx->iov_bytes); 7951 } 7952 7953 if (unlikely(rc != MDBX_SUCCESS)) 7954 ERROR("Write error: %s", mdbx_strerror(rc)); 7955 else { 7956 VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, 7957 ctx->iov_bytes); 7958 MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off, 7959 ctx->iov_bytes); 7960 } 7961 7962 unsigned iov_items = ctx->iov_items; 7963 #if MDBX_ENABLE_PGOP_STAT 7964 txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items; 7965 #endif /* MDBX_ENABLE_PGOP_STAT */ 7966 ctx->iov_items = 0; 7967 ctx->iov_bytes = 0; 7968 7969 uint64_t timestamp = 0; 7970 for (unsigned i = 0; i < iov_items; i++) { 7971 MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base; 7972 const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno); 7973 /* check with timeout as the workaround 7974 * for todo4recovery://erased_by_github/libmdbx/issues/269 */ 7975 while (likely(rc == MDBX_SUCCESS) && 7976 unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { 7977 if (!timestamp) { 7978 timestamp = osal_monotime(); 7979 iov_done(txn, ctx); 7980 WARNING( 7981 "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, 7982 "(workaround for incoherent flaw of unified page/buffer cache)"); 7983 } else if (unlikely(osal_monotime() - timestamp > 65536 / 10)) { 7984 ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, 7985 "(workaround for incoherent flaw of unified page/buffer cache)"); 7986 rc = MDBX_CORRUPTED; 7987 } 7988 #if defined(_WIN32) || defined(_WIN64) 7989 SwitchToThread(); 7990 #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) 7991 sched_yield(); 7992 #elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) 7993 
pthread_yield(); 7994 #else 7995 usleep(42); 7996 #endif 7997 } 7998 dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); 7999 } 8000 return rc; 8001 } 8002 8003 static int iov_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, 8004 unsigned npages) { 8005 MDBX_env *const env = txn->mt_env; 8006 tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); 8007 tASSERT(txn, IS_MODIFIABLE(txn, dp)); 8008 tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); 8009 8010 ctx->flush_begin = 8011 (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; 8012 ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) 8013 ? ctx->flush_end 8014 : dp->mp_pgno + npages; 8015 env->me_lck->mti_unsynced_pages.weak += npages; 8016 8017 if (IS_SHADOWED(txn, dp)) { 8018 tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); 8019 dp->mp_txnid = txn->mt_txnid; 8020 tASSERT(txn, IS_SPILLED(txn, dp)); 8021 const size_t size = pgno2bytes(env, npages); 8022 if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || 8023 ctx->iov_items == ARRAY_LENGTH(ctx->iov) || 8024 ctx->iov_bytes + size > MAX_WRITE) { 8025 if (ctx->iov_items) { 8026 int err = iov_write(txn, ctx); 8027 if (unlikely(err != MDBX_SUCCESS)) 8028 return err; 8029 #if defined(__linux__) || defined(__gnu_linux__) 8030 if (linux_kernel_version >= 0x02060b00) 8031 /* Linux kernels older than version 2.6.11 ignore the addr and nbytes 8032 * arguments, making this function fairly expensive. Therefore, the 8033 * whole cache is always flushed. */ 8034 #endif /* Linux */ 8035 osal_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, 8036 env->me_os_psize); 8037 } 8038 ctx->iov_off = pgno2bytes(env, dp->mp_pgno); 8039 } 8040 ctx->iov[ctx->iov_items].iov_base = (void *)dp; 8041 ctx->iov[ctx->iov_items].iov_len = size; 8042 ctx->iov_items += 1; 8043 ctx->iov_bytes += size; 8044 } else { 8045 tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); 8046 } 8047 return MDBX_SUCCESS; 8048 } 8049 8050 static int spill_page(MDBX_txn *txn, struct iov_ctx *ctx, MDBX_page *dp, 8051 unsigned npages) { 8052 tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); 8053 pgno_t pgno = dp->mp_pgno; 8054 int err = iov_page(txn, ctx, dp, npages); 8055 if (likely(err == MDBX_SUCCESS)) { 8056 err = pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); 8057 #if MDBX_ENABLE_PGOP_STAT 8058 if (likely(err == MDBX_SUCCESS)) 8059 txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; 8060 #endif /* MDBX_ENABLE_PGOP_STAT */ 8061 } 8062 return err; 8063 } 8064 8065 /* Set unspillable LRU-label for dirty pages watched by txn. 8066 * Returns the number of pages marked as unspillable. */ 8067 static unsigned cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { 8068 unsigned keep = 0; 8069 while (mc->mc_flags & C_INITIALIZED) { 8070 for (unsigned i = 0; i < mc->mc_snum; ++i) { 8071 const MDBX_page *mp = mc->mc_pg[i]; 8072 if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { 8073 unsigned const n = dpl_search(txn, mp->mp_pgno); 8074 if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && 8075 dpl_age(txn, n)) { 8076 txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; 8077 ++keep; 8078 } 8079 } 8080 } 8081 if (!mc->mc_xcursor) 8082 break; 8083 mc = &mc->mc_xcursor->mx_cursor; 8084 } 8085 return keep; 8086 } 8087 8088 static unsigned txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { 8089 unsigned keep = m0 ? 
cursor_keep(txn, m0) : 0;
8090   for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i)
8091     if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) &&
8092         txn->mt_dbs[i].md_root != P_INVALID)
8093       for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next)
8094         if (mc != m0)
8095           keep += cursor_keep(txn, mc);
8096   return keep;
8097 }
8098 
8099 /* Returns the spilling priority (0..255) for a dirty page:
8100  *   0 = should be spilled;
8101  *   ...
8102  *   > 255 = must not be spilled. */
8103 static unsigned spill_prio(const MDBX_txn *txn, const unsigned i,
8104                            const uint32_t reciprocal) {
8105   MDBX_dpl *const dl = txn->tw.dirtylist;
8106   const uint32_t age = dpl_age(txn, i);
8107   const unsigned npages = dpl_npages(dl, i);
8108   const pgno_t pgno = dl->items[i].pgno;
8109   if (age == 0) {
8110     DEBUG("skip %s %u page %" PRIaPGNO, "keep", npages, pgno);
8111     return 256;
8112   }
8113 
8114   MDBX_page *const dp = dl->items[i].ptr;
8115   if (dp->mp_flags & (P_LOOSE | P_SPILLED)) {
8116     DEBUG("skip %s %u page %" PRIaPGNO,
8117           (dp->mp_flags & P_LOOSE)
8118               ? "loose"
8119               : "parent-spilled",
8120           npages, pgno);
8121     return 256;
8122   }
8123 
8124   /* Can't spill twice,
8125    * make sure it's not already in a parent's spill list(s). */
8126   MDBX_txn *parent = txn->mt_parent;
8127   if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) {
8128     do
8129       if (intersect_spilled(parent, pgno, npages)) {
8130         DEBUG("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno);
8131         dp->mp_flags |= P_SPILLED;
8132         return 256;
8133       }
8134     while ((parent = parent->mt_parent) != nullptr);
8135   }
8136 
8137   tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX);
8138   unsigned prio = age * reciprocal >> 24;
8139   tASSERT(txn, prio < 256);
8140   if (likely(npages == 1))
8141     return prio = 256 - prio;
8142 
8143   /* make large/overflow pages be more likely to spill */
8144   uint32_t factor = npages | npages >> 1;
8145   factor |= factor >> 2;
8146   factor |= factor >> 4;
8147   factor |= factor >> 8;
8148   factor |= factor >> 16;
8149   factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
8150   factor = (factor < 256) ? 255 - factor : 0;
8151   tASSERT(txn, factor < 256 && factor < (256 - prio));
8152   return prio = factor;
8153 }
8154 
8155 /* Spill pages from the dirty list back to disk.
8156  * This is intended to prevent running into MDBX_TXN_FULL situations,
8157  * but note that they may still occur in a few cases:
8158  *
8159  * 1) our estimate of the txn size could be too small. Currently this
8160  *    seems unlikely, except with a large number of MDBX_MULTIPLE items.
8161  *
8162  * 2) child txns may run out of space if their parents dirtied a
8163  *    lot of pages and never spilled them. TODO: we probably should do
8164  *    a preemptive spill during mdbx_txn_begin() of a child txn, if
8165  *    the parent's dirtyroom is below a given threshold.
8166  *
8167  * Otherwise, if not using nested txns, it is expected that apps will
8168  * not run into MDBX_TXN_FULL any more. The pages are flushed to disk
8169  * the same way as for a txn commit, e.g. their dirty status is cleared.
8170  * If the txn never references them again, they can be left alone.
8171  * If the txn only reads them, they can be used without any fuss.
8172  * If the txn writes them again, they can be dirtied immediately without
8173  * going thru all of the work of page_touch(). Such references are
8174  * handled by page_unspill().
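 *
 * (A worked example of the age quantization done by spill_prio() above,
 * with illustrative numbers only: for age_max = 9 the caller passes
 * reciprocal = (255 << 24) / (9 + 1) = 427819008, so a page of age 5 gets
 * prio = (5 * 427819008) >> 24 = 127, which is inverted to 256 - 127 = 129
 * for an ordinary single page; age 0 short-circuits to 256, i.e. "must not
 * be spilled", while the oldest pages end up near 0, i.e. "spill first".
 * Large/overflow pages are additionally aged in proportion to log2 of their
 * page count, which makes them more likely to be spilled.)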
8175 * 8176 * Also note, we never spill DB root pages, nor pages of active cursors, 8177 * because we'll need these back again soon anyway. And in nested txns, 8178 * we can't spill a page in a child txn if it was already spilled in a 8179 * parent txn. That would alter the parent txns' data even though 8180 * the child hasn't committed yet, and we'd have no way to undo it if 8181 * the child aborted. */ 8182 static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, 8183 const unsigned need) { 8184 #if xMDBX_DEBUG_SPILLING != 1 8185 /* production mode */ 8186 if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) 8187 return MDBX_SUCCESS; 8188 unsigned wanna_spill = need - txn->tw.dirtyroom; 8189 #else 8190 /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */ 8191 unsigned wanna_spill = 8192 (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; 8193 #endif /* xMDBX_DEBUG_SPILLING */ 8194 8195 const unsigned dirty = txn->tw.dirtylist->length; 8196 const unsigned spill_min = 8197 txn->mt_env->me_options.spill_min_denominator 8198 ? dirty / txn->mt_env->me_options.spill_min_denominator 8199 : 0; 8200 const unsigned spill_max = 8201 dirty - (txn->mt_env->me_options.spill_max_denominator 8202 ? dirty / txn->mt_env->me_options.spill_max_denominator 8203 : 0); 8204 wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min; 8205 wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max; 8206 if (!wanna_spill) 8207 return MDBX_SUCCESS; 8208 8209 NOTICE("spilling %u dirty-entries (have %u dirty-room, need %u)", wanna_spill, 8210 txn->tw.dirtyroom, need); 8211 tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill); 8212 8213 struct iov_ctx ctx; 8214 iov_init(txn, &ctx); 8215 int rc = MDBX_SUCCESS; 8216 if (txn->mt_flags & MDBX_WRITEMAP) { 8217 MDBX_dpl *const dl = txn->tw.dirtylist; 8218 const unsigned span = dl->length - txn->tw.loose_count; 8219 txn->tw.dirtyroom += span; 8220 unsigned r, w; 8221 for (w = 0, r = 1; r <= dl->length; ++r) { 8222 MDBX_page *dp = dl->items[r].ptr; 8223 if (dp->mp_flags & P_LOOSE) 8224 dl->items[++w] = dl->items[r]; 8225 else if (!MDBX_FAKE_SPILL_WRITEMAP) { 8226 rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r)); 8227 tASSERT(txn, rc == MDBX_SUCCESS); 8228 } 8229 } 8230 8231 tASSERT(txn, span == r - 1 - w && w == txn->tw.loose_count); 8232 dl->sorted = (dl->sorted == dl->length) ? 
w : 0;
8233     dpl_setlen(dl, w);
8234     tASSERT(txn, dirtylist_check(txn));
8235 
8236     if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) {
8237       MDBX_env *const env = txn->mt_env;
8238 #if MDBX_ENABLE_PGOP_STAT
8239       env->me_lck->mti_pgop_stat.wops.weak += 1;
8240 #endif /* MDBX_ENABLE_PGOP_STAT */
8241       rc = osal_msync(&env->me_dxb_mmap,
8242                       pgno_align2os_bytes(env, ctx.flush_begin),
8243                       pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin),
8244                       MDBX_SYNC_NONE);
8245     }
8246     return rc;
8247   }
8248 
8249   tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP));
8250   if (!txn->tw.spill_pages) {
8251     txn->tw.spill_least_removed = INT_MAX;
8252     txn->tw.spill_pages = pnl_alloc(wanna_spill);
8253     if (unlikely(!txn->tw.spill_pages)) {
8254       rc = MDBX_ENOMEM;
8255     bailout:
8256       txn->mt_flags |= MDBX_TXN_ERROR;
8257       return rc;
8258     }
8259   } else {
8260     /* purge deleted slots */
8261     spill_purge(txn);
8262     rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill);
8263     (void)rc /* ignore since the resulting list may be shorter
8264                 and pnl_append() will increase pnl on demand */
8265         ;
8266   }
8267 
8268   /* Sort the dirty list so that writes to disk are more sequential */
8269   MDBX_dpl *const dl = dpl_sort(txn);
8270 
8271   /* Preserve pages which may soon be dirtied again */
8272   const unsigned unspillable = txn_keep(txn, m0);
8273   if (unspillable + txn->tw.loose_count >= dl->length) {
8274 #if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
8275     if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
8276       return MDBX_SUCCESS;
8277 #endif /* xMDBX_DEBUG_SPILLING */
8278     ERROR("all %u dirty pages are unspillable since referenced "
8279           "by a cursor(s), use fewer cursors or increase "
8280           "MDBX_opt_txn_dp_limit",
8281           unspillable);
8282     goto done;
8283   }
8284 
8285   /* Subtask: push some of the pages out to disk in accordance with LRU,
8286    * but take into account these important refinements:
8287    *  - it is better to push out old large/overflow pages, as this frees
8288    *    more memory and (in the current understanding) they are re-modified
8289    *    much more rarely;
8290    *  - other things being equal, it is better to push out adjacent pages,
8291    *    as this results in fewer I/O operations;
8292    *  - it is desirable to spend less time on this than std::partial_sort_copy;
8293    *
8294    * Solution:
8295    *  - Quantize the whole range of lru-labels down to 256 values and use a
8296    *    single pass of 8-bit radix-sort. This yields 256 levels of
8297    *    "freshness", including the lru-label value such that pages older
8298    *    than it must be pushed out;
8299    *  - Move sequentially towards increasing page numbers and push out
8300    *    pages whose lru-label is older than the cut-off value, until enough
8301    *    have been pushed out;
8302    *  - To reduce the number of I/O operations, also push out pages adjacent
8303    *    to the ones being spilled, if they fall into the first half between
8304    *    the spilled and the freshest lru-labels;
8305    *  - additionally, the sorting deliberately ages large/overflow pages,
8306    *    thereby increasing their chances of being pushed out. */
8307 
8308   /* get min/max of LRU-labels */
8309   uint32_t age_max = 0;
8310   for (unsigned i = 1; i <= dl->length; ++i) {
8311     const uint32_t age = dpl_age(txn, i);
8312     age_max = (age_max >= age) ?
age_max : age;
8313   }
8314 
8315   VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
8316 
8317   /* half of 8-bit radix-sort */
8318   unsigned radix_counters[256], spillable = 0, spilled = 0;
8319   memset(&radix_counters, 0, sizeof(radix_counters));
8320   const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
8321   for (unsigned i = 1; i <= dl->length; ++i) {
8322     unsigned prio = spill_prio(txn, i, reciprocal);
8323     if (prio < 256) {
8324       radix_counters[prio] += 1;
8325       spillable += 1;
8326     }
8327   }
8328 
8329   if (likely(spillable > 0)) {
8330     unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0];
8331     for (unsigned i = 1; i < 256; i++) {
8332       if (amount < wanna_spill) {
8333         prio2spill = i;
8334         prio2adjacent = i + (257 - i) / 2;
8335         amount += radix_counters[i];
8336       } else if (amount + amount < spillable + wanna_spill
8337                  /* EQUIVALENTLY: amount - wanna_spill < spillable - amount */) {
8338         prio2adjacent = i;
8339         amount += radix_counters[i];
8340       } else
8341         break;
8342     }
8343 
8344     VERBOSE("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, "
8345             "wanna_spill %u",
8346             prio2spill, prio2adjacent, amount, spillable, wanna_spill);
8347     tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
8348 
8349     unsigned prev_prio = 256;
8350     unsigned r, w, prio;
8351     for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
8352          prev_prio = prio, ++r) {
8353       prio = spill_prio(txn, r, reciprocal);
8354       MDBX_page *const dp = dl->items[r].ptr;
8355       if (prio < prio2adjacent) {
8356         const pgno_t pgno = dl->items[r].pgno;
8357         const unsigned npages = dpl_npages(dl, r);
8358         if (prio <= prio2spill) {
8359           if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
8360               dpl_endpgno(dl, r - 1) == pgno) {
8361             DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO
8362                   " (age %d, prio %u)",
8363                   dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1),
8364                   prev_prio);
8365             --w;
8366             rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
8367                             dpl_npages(dl, r - 1));
8368             if (unlikely(rc != MDBX_SUCCESS))
8369               break;
8370             ++spilled;
8371           }
8372 
8373           DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages,
8374                 dp->mp_pgno, dpl_age(txn, r), prio);
8375           rc = spill_page(txn, &ctx, dp, npages);
8376           if (unlikely(rc != MDBX_SUCCESS))
8377             break;
8378           ++spilled;
8379           continue;
8380         }
8381 
8382         if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
8383           DEBUG("co-spill %u next-adjacent page %" PRIaPGNO
8384                 " (age %d, prio %u)",
8385                 npages, dp->mp_pgno, dpl_age(txn, r), prio);
8386           rc = spill_page(txn, &ctx, dp, npages);
8387           if (unlikely(rc != MDBX_SUCCESS))
8388             break;
8389           prio = prev_prio /* to continue co-spilling next adjacent pages */;
8390           ++spilled;
8391           continue;
8392         }
8393       }
8394       dl->items[++w] = dl->items[r];
8395     }
8396 
8397     tASSERT(txn, spillable == 0 || spilled > 0);
8398 
8399     while (r <= dl->length)
8400       dl->items[++w] = dl->items[r++];
8401     tASSERT(txn, r - 1 - w == spilled);
8402 
8403     dl->sorted = dpl_setlen(dl, w);
8404     txn->tw.dirtyroom += spilled;
8405     tASSERT(txn, dirtylist_check(txn));
8406 
8407     if (ctx.iov_items) {
8408       /* iov_page() frees dirty-pages and resets iov_items in case of failure.
 */
8409       tASSERT(txn, rc == MDBX_SUCCESS);
8410       rc = iov_write(txn, &ctx);
8411     }
8412 
8413     if (unlikely(rc != MDBX_SUCCESS))
8414       goto bailout;
8415 
8416     pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1);
8417     txn->mt_flags |= MDBX_TXN_SPILLS;
8418     NOTICE("spilled %u dirty-entries, now have %u dirty-room", spilled,
8419            txn->tw.dirtyroom);
8420     iov_done(txn, &ctx);
8421   } else {
8422     tASSERT(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
8423     for (unsigned i = 1; i <= dl->length; ++i) {
8424       MDBX_page *dp = dl->items[i].ptr;
8425       NOTICE("dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
8426              i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i),
8427              spill_prio(txn, i, reciprocal));
8428     }
8429   }
8430 
8431 #if xMDBX_DEBUG_SPILLING == 2
8432   if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
8433     ERROR("dirty-list length: before %u, after %u, parent %i, loose %u; "
8434           "needed %u, spillable %u; "
8435           "spilled %u dirty-entries, now have %u dirty-room",
8436           dl->length + spilled, dl->length,
8437           (txn->mt_parent && txn->mt_parent->tw.dirtylist)
8438               ? (int)txn->mt_parent->tw.dirtylist->length
8439               : -1,
8440           txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom);
8441   ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
8442 #endif /* xMDBX_DEBUG_SPILLING */
8443 
8444 done:
8445   return likely(txn->tw.dirtyroom + txn->tw.loose_count >
8446                 ((need > CURSOR_STACK) ? CURSOR_STACK : need))
8447              ? MDBX_SUCCESS
8448              : MDBX_TXN_FULL;
8449 }
8450 
8451 static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
8452                         const MDBX_val *data) {
8453   MDBX_txn *txn = mc->mc_txn;
8454   /* Estimate how much space this operation will take: */
8455   /* 1) Max b-tree height, reasonable enough including the sub-tree for dups */
8456   unsigned need = CURSOR_STACK + 3;
8457   /* 2) GC/FreeDB for any payload */
8458   if (mc->mc_dbi > FREE_DBI) {
8459     need += txn->mt_dbs[FREE_DBI].md_depth + 3;
8460     /* 3) Named DBs also dirty the main DB */
8461     if (mc->mc_dbi > MAIN_DBI)
8462       need += txn->mt_dbs[MAIN_DBI].md_depth + 3;
8463   }
8464 #if xMDBX_DEBUG_SPILLING != 2
8465   /* production mode */
8466   /* 4) Double the page chain estimation
8467    *    to allow for extensive splitting, rebalancing and merging */
8468   need += need;
8469   /* 5) Factor in the key+data which are to be put */
8470   need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
8471 #else
8472   /* debug mode */
8473   (void)key;
8474   (void)data;
8475   mc->mc_txn->mt_env->debug_dirtied_est = ++need;
8476   mc->mc_txn->mt_env->debug_dirtied_act = 0;
8477 #endif /* xMDBX_DEBUG_SPILLING == 2 */
8478 
8479   return txn_spill(txn, mc, need);
8480 }
8481 
8482 /*----------------------------------------------------------------------------*/
8483 
8484 static bool meta_bootid_match(const MDBX_meta *meta) {
8485   return memcmp(&meta->mm_bootid, &bootid, 16) == 0 &&
8486          (bootid.x | bootid.y) != 0;
8487 }
8488 
8489 static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
8490                                  const int lck_exclusive) {
8491   return lck_exclusive
8492              ?
/* exclusive lock */ meta_bootid_match(meta) 8493 : /* db already opened */ env->me_lck_mmap.lck && 8494 (env->me_lck_mmap.lck->mti_envmode.weak & MDBX_RDONLY) == 0; 8495 } 8496 8497 #define METAPAGE(env, n) page_meta(pgno2page(env, n)) 8498 #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) 8499 8500 MDBX_NOTHROW_PURE_FUNCTION static txnid_t 8501 constmeta_txnid(const MDBX_meta *meta) { 8502 const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); 8503 const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); 8504 return likely(a == b) ? a : 0; 8505 } 8506 8507 typedef struct { 8508 uint64_t txnid; 8509 size_t is_steady; 8510 } meta_snap_t; 8511 8512 static __always_inline txnid_t 8513 atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) { 8514 #if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ 8515 MDBX_UNALIGNED_OK >= 8 8516 return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr, 8517 mo_AcquireRelease); 8518 #else 8519 const uint32_t l = atomic_load32( 8520 &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); 8521 const uint32_t h = atomic_load32( 8522 &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); 8523 return (uint64_t)h << 32 | l; 8524 #endif 8525 } 8526 8527 static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) { 8528 txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a); 8529 jitter4testing(true); 8530 size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID; 8531 jitter4testing(true); 8532 if (unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b))) 8533 txnid = is_steady = 0; 8534 meta_snap_t r = {txnid, is_steady}; 8535 return r; 8536 } 8537 8538 static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) { 8539 return meta_snap(meta).txnid; 8540 } 8541 8542 static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, 8543 txnid_t txnid) { 8544 eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); 8545 eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && 8546 unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); 8547 (void)env; 8548 #if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ 8549 MDBX_UNALIGNED_OK >= 8 8550 atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, 8551 mo_AcquireRelease); 8552 atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid, 8553 mo_AcquireRelease); 8554 #else 8555 atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], 8556 0, mo_AcquireRelease); 8557 atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], 8558 0, mo_AcquireRelease); 8559 atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], 8560 (uint32_t)txnid, mo_AcquireRelease); 8561 atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], 8562 (uint32_t)(txnid >> 32), mo_AcquireRelease); 8563 #endif 8564 } 8565 8566 static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, 8567 txnid_t txnid) { 8568 eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); 8569 eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); 8570 eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); 8571 (void)env; 8572 jitter4testing(true); 8573 memcpy(&meta->mm_bootid, &bootid, 16); 8574 #if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ 8575 MDBX_UNALIGNED_OK >= 8 8576 atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, 8577 mo_AcquireRelease); 8578 #else 8579 
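  /* Without a native 64-bit atomic store the txnid is published as two
   * 32-bit halves. Paired with meta_update_begin() this forms a simple
   * protocol: "begin" zeroes mm_txnid_b and then writes mm_txnid_a, while
   * "end" (below) writes mm_txnid_b; meta_snap() trusts a meta-page only
   * when both fields agree, so a torn or in-progress update is observed as
   * txnid == 0 and skipped. */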
atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__],
8580                  (uint32_t)txnid, mo_AcquireRelease);
8581   atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__],
8582                  (uint32_t)(txnid >> 32), mo_AcquireRelease);
8583 #endif
8584 }
8585 
8586 static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
8587                                     const txnid_t txnid) {
8588   eASSERT(env,
8589           !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env));
8590   (void)env;
8591   /* update inconsistently since this function is used ONLY for filling a
8592    * meta-image to be written, but not the actual meta-page */
8593   memcpy(&meta->mm_bootid, &bootid, 16);
8594   unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
8595   unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
8596 }
8597 
8598 static __inline uint64_t meta_sign(const MDBX_meta *meta) {
8599   uint64_t sign = MDBX_DATASIGN_NONE;
8600 #if 0 /* TODO */
8601   sign = hippeus_hash64(...);
8602 #else
8603   (void)meta;
8604 #endif
8605   /* LY: never returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */
8606   return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign;
8607 }
8608 
8609 typedef struct {
8610   txnid_t txnid;
8611   union {
8612     const volatile MDBX_meta *ptr_v;
8613     const MDBX_meta *ptr_c;
8614   };
8615   size_t is_steady;
8616 } meta_ptr_t;
8617 
8618 static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) {
8619   eASSERT(env, n < NUM_METAS);
8620   meta_ptr_t r;
8621   meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n));
8622   r.txnid = snap.txnid;
8623   r.is_steady = snap.is_steady;
8624   return r;
8625 }
8626 
8627 static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) {
8628   return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s;
8629 }
8630 
8631 static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int,
8632                                                bool a_steady, bool b_steady) {
8633   assert(ab_cmp2int < 3 /* && a_steady < 2 && b_steady < 2 */);
8634   return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady);
8635 }
8636 
8637 static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int,
8638                                                bool a_steady, bool b_steady) {
8639   assert(ab_cmp2int < 3 /* && a_steady < 2 && b_steady < 2 */);
8640   return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1);
8641 }
8642 
8643 static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady,
8644                                         txnid_t b_txnid, bool b_steady) {
8645   return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
8646 }
8647 
8648 static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady,
8649                                         txnid_t b_txnid, bool b_steady) {
8650   return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady);
8651 }
8652 
8653 MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02,
8654                                                uint8_t c12, bool s0, bool s1,
8655                                                bool s2) {
8656   assert(c01 < 3 && c02 < 3 && c12 < 3);
8657   /* assert(s0 < 2 && s1 < 2 && s2 < 2); */
8658   const uint8_t recent = meta_cmp2recent(c01, s0, s1)
8659                              ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2)
8660                              : (meta_cmp2recent(c12, s1, s2) ? 1 : 2);
8661   const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1)
8662                                     ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2)
8663                                     : (meta_cmp2steady(c12, s1, s2) ? 1 : 2);
8664 
8665   uint8_t tail;
8666   if (recent == 0)
8667     tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
8668   else if (recent == 1)
8669     tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
8670   else
8671     tail = meta_cmp2steady(c01, s0, s1) ?
1 : 0; 8672 8673 const bool valid = 8674 c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; 8675 const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && 8676 (c12 != 1 || s1 != s2); 8677 return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; 8678 } 8679 8680 static __inline void meta_troika_unpack(meta_troika_t *troika, 8681 const uint8_t packed) { 8682 troika->recent = (packed >> 2) & 3; 8683 troika->prefer_steady = (packed >> 4) & 3; 8684 troika->tail_and_flags = packed & 0xC3; 8685 } 8686 8687 static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { 8688 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, 8689 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, 8690 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, 8691 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, 8692 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, 8693 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, 8694 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, 8695 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, 8696 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, 8697 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, 8698 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, 8699 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, 8700 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, 8701 214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194, 8702 210, 194, 225, 193, 210, 194}; 8703 8704 __hot static meta_troika_t meta_tap(const MDBX_env *env) { 8705 meta_snap_t snap; 8706 meta_troika_t troika; 8707 snap = meta_snap(METAPAGE(env, 0)); 8708 troika.txnid[0] = snap.txnid; 8709 troika.fsm = (uint8_t)snap.is_steady << 0; 8710 snap = meta_snap(METAPAGE(env, 1)); 8711 troika.txnid[1] = snap.txnid; 8712 troika.fsm += (uint8_t)snap.is_steady << 1; 8713 troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); 8714 snap = meta_snap(METAPAGE(env, 2)); 8715 troika.txnid[2] = snap.txnid; 8716 troika.fsm += (uint8_t)snap.is_steady << 2; 8717 troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); 8718 troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); 8719 8720 meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); 8721 return troika; 8722 } 8723 8724 static txnid_t recent_committed_txnid(const MDBX_env *env) { 8725 const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); 8726 const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); 8727 const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); 8728 return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); 8729 } 8730 8731 static __inline bool meta_eq(const meta_troika_t *troika, unsigned a, 8732 unsigned b) { 8733 assert(a < NUM_METAS && b < NUM_METAS); 8734 return troika->txnid[a] == troika->txnid[b] && 8735 (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && 8736 troika->txnid[a]; 8737 } 8738 8739 static unsigned meta_eq_mask(const meta_troika_t *troika) { 8740 return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | 8741 meta_eq(troika, 2, 0) << 2; 8742 } 8743 8744 __hot static bool meta_should_retry(const MDBX_env *env, 8745 meta_troika_t *troika) { 8746 const meta_troika_t prev = *troika; 8747 *troika = meta_tap(env); 8748 return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || 8749 prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; 8750 } 8751 8752 static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, 8753 const meta_troika_t *troika) { 8754 meta_ptr_t r; 8755 r.txnid = troika->txnid[troika->recent]; 8756 r.ptr_v = METAPAGE(env, troika->recent); 8757 r.is_steady = (troika->fsm >> troika->recent) & 1; 8758 return r; 8759 } 8760 8761 static __always_inline meta_ptr_t 8762 meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { 8763 meta_ptr_t r; 8764 r.txnid = troika->txnid[troika->prefer_steady]; 8765 r.ptr_v = METAPAGE(env, troika->prefer_steady); 8766 r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; 8767 return r; 8768 } 8769 8770 static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, 8771 const meta_troika_t *troika) { 8772 const uint8_t tail = troika->tail_and_flags & 3; 8773 meta_ptr_t r; 8774 r.txnid = troika->txnid[tail]; 8775 r.ptr_v = METAPAGE(env, tail); 8776 r.is_steady = (troika->fsm >> tail) & 1; 8777 return r; 8778 } 8779 8780 static const char *durable_caption(const volatile MDBX_meta *const meta) { 8781 if (META_IS_STEADY(meta)) 8782 return (unaligned_peek_u64_volatile(4, meta->mm_sign) == 8783 meta_sign((const MDBX_meta *)meta)) 8784 ? "Steady" 8785 : "Tainted"; 8786 return "Weak"; 8787 } 8788 8789 __cold static void meta_troika_dump(const MDBX_env *env, 8790 const meta_troika_t *troika) { 8791 const meta_ptr_t recent = meta_recent(env, troika); 8792 const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); 8793 const meta_ptr_t tail = meta_tail(env, troika); 8794 NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " 8795 "head=%d-%" PRIaTXN ".%c, " 8796 "base=%d-%" PRIaTXN ".%c, " 8797 "tail=%d-%" PRIaTXN ".%c, " 8798 "valid %c, strict %c", 8799 troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], 8800 (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], 8801 (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, 8802 recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, 8803 prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', 8804 troika->tail_and_flags % NUM_METAS, tail.txnid, 8805 tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', 8806 TROIKA_STRICT_VALID(troika) ? 'Y' : 'N'); 8807 } 8808 8809 /*----------------------------------------------------------------------------*/ 8810 8811 /* Find oldest txnid still referenced. 
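 * Scans the reader lock-table and returns the smallest transaction-id that
 * may still be in use by a live reader, clamped from above by the given
 * steady txnid. Loosely speaking, no MVCC snapshot older than the returned
 * value can be referenced anymore, so pages retired before that point are
 * eligible for recycling.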
*/ 8812 static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { 8813 const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); 8814 eASSERT(env, steady <= env->me_txn0->mt_txnid); 8815 8816 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 8817 if (unlikely(lck == NULL /* exclusive without-lck mode */)) { 8818 eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); 8819 return env->me_lck->mti_oldest_reader.weak = steady; 8820 } 8821 8822 const txnid_t prev_oldest = 8823 atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); 8824 eASSERT(env, steady >= prev_oldest); 8825 8826 txnid_t new_oldest = prev_oldest; 8827 while (new_oldest != steady && 8828 nothing_changed != 8829 atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { 8830 lck->mti_readers_refresh_flag.weak = nothing_changed; 8831 jitter4testing(false); 8832 const unsigned snap_nreaders = 8833 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 8834 new_oldest = steady; 8835 8836 for (unsigned i = 0; i < snap_nreaders; ++i) { 8837 const mdbx_pid_t pid = 8838 atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); 8839 if (!pid) 8840 continue; 8841 jitter4testing(true); 8842 8843 const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); 8844 if (unlikely(rtxn < prev_oldest)) { 8845 if (unlikely(nothing_changed == 8846 atomic_load32(&lck->mti_readers_refresh_flag, 8847 mo_AcquireRelease)) && 8848 safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { 8849 NOTICE("kick stuck reader[%u of %u].pid_%u %" PRIaTXN 8850 " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, 8851 i, snap_nreaders, pid, rtxn, prev_oldest, steady); 8852 } 8853 continue; 8854 } 8855 8856 if (rtxn < new_oldest) { 8857 new_oldest = rtxn; 8858 if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) 8859 break; 8860 } 8861 } 8862 } 8863 8864 if (new_oldest != prev_oldest) { 8865 VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest); 8866 eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak); 8867 atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); 8868 } 8869 return new_oldest; 8870 } 8871 8872 static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { 8873 return find_oldest_reader(txn->mt_env, 8874 txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); 8875 } 8876 8877 /* Find largest mvcc-snapshot still referenced. 
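 * Returns the maximum of last_used_page and the snapshot-pages-used values
 * published in the reader lock-table. Note the retry: the pages-used and
 * txnid fields of each slot are read twice, and the whole scan restarts if
 * a concurrent change is detected in between, so a torn pair is never used.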
*/ 8878 __cold static pgno_t find_largest_snapshot(const MDBX_env *env, 8879 pgno_t last_used_page) { 8880 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 8881 if (likely(lck != NULL /* check for exclusive without-lck mode */)) { 8882 retry:; 8883 const unsigned snap_nreaders = 8884 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 8885 for (unsigned i = 0; i < snap_nreaders; ++i) { 8886 if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { 8887 /* jitter4testing(true); */ 8888 const pgno_t snap_pages = atomic_load32( 8889 &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); 8890 const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); 8891 if (unlikely( 8892 snap_pages != 8893 atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, 8894 mo_AcquireRelease) || 8895 snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) 8896 goto retry; 8897 if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid) 8898 last_used_page = snap_pages; 8899 } 8900 } 8901 } 8902 8903 return last_used_page; 8904 } 8905 8906 /* Add a page to the txn's dirty list */ 8907 __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, 8908 unsigned npages) { 8909 #if xMDBX_DEBUG_SPILLING == 2 8910 txn->mt_env->debug_dirtied_act += 1; 8911 ENSURE(txn->mt_env, 8912 txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); 8913 ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); 8914 #endif /* xMDBX_DEBUG_SPILLING == 2 */ 8915 8916 int rc; 8917 mp->mp_txnid = txn->mt_front; 8918 if (unlikely(txn->tw.dirtyroom == 0)) { 8919 if (txn->tw.loose_count) { 8920 MDBX_page *loose = txn->tw.loose_pages; 8921 DEBUG("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); 8922 rc = pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); 8923 if (unlikely(rc != MDBX_SUCCESS)) 8924 goto bailout; 8925 unsigned di = dpl_search(txn, loose->mp_pgno); 8926 tASSERT(txn, txn->tw.dirtylist->items[di].ptr == loose); 8927 dpl_remove(txn, di); 8928 txn->tw.loose_pages = loose->mp_next; 8929 txn->tw.loose_count--; 8930 txn->tw.dirtyroom++; 8931 if (!(txn->mt_flags & MDBX_WRITEMAP)) 8932 dpage_free(txn->mt_env, loose, 1); 8933 } else { 8934 ERROR("Dirtyroom is depleted, DPL length %u", txn->tw.dirtylist->length); 8935 if (!(txn->mt_flags & MDBX_WRITEMAP)) 8936 dpage_free(txn->mt_env, mp, npages); 8937 return MDBX_TXN_FULL; 8938 } 8939 } 8940 8941 rc = dpl_append(txn, mp->mp_pgno, mp, npages); 8942 if (unlikely(rc != MDBX_SUCCESS)) { 8943 bailout: 8944 txn->mt_flags |= MDBX_TXN_ERROR; 8945 return rc; 8946 } 8947 txn->tw.dirtyroom--; 8948 tASSERT(txn, dirtylist_check(txn)); 8949 return MDBX_SUCCESS; 8950 } 8951 8952 #if !(defined(_WIN32) || defined(_WIN64)) 8953 MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { 8954 #ifdef ENOSYS 8955 if (err == ENOSYS) 8956 return MDBX_RESULT_TRUE; 8957 #endif /* ENOSYS */ 8958 #ifdef ENOIMPL 8959 if (err == ENOIMPL) 8960 return MDBX_RESULT_TRUE; 8961 #endif /* ENOIMPL */ 8962 #ifdef ENOTSUP 8963 if (err == ENOTSUP) 8964 return MDBX_RESULT_TRUE; 8965 #endif /* ENOTSUP */ 8966 #ifdef ENOSUPP 8967 if (err == ENOSUPP) 8968 return MDBX_RESULT_TRUE; 8969 #endif /* ENOSUPP */ 8970 #ifdef EOPNOTSUPP 8971 if (err == EOPNOTSUPP) 8972 return MDBX_RESULT_TRUE; 8973 #endif /* EOPNOTSUPP */ 8974 if (err == EAGAIN) 8975 return MDBX_RESULT_TRUE; 8976 return err; 8977 } 8978 #endif /* defined(_WIN32) || defined(_WIN64) */ 8979 8980 #if MDBX_ENABLE_MADVISE 8981 /* Turn on/off readahead. 
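 * Depending on the platform, the code below relies on fcntl(F_RDAHEAD) and
 * fcntl(F_RDADVISE) on Darwin, on the madvise(), posix_madvise() or
 * posix_fadvise() families elsewhere, and on PrefetchVirtualMemory() on
 * Windows.
 * As for when readahead is worth enabling: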
It's harmful when the DB is larger than RAM. */ 8982 __cold static int set_readahead(MDBX_env *env, const pgno_t edge, 8983 const bool enable, const bool force_whole) { 8984 eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); 8985 eASSERT(env, (enable & 1) == (enable != 0)); 8986 const bool toggle = force_whole || 8987 ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || 8988 !env->me_lck->mti_readahead_anchor; 8989 const pgno_t prev_edge = env->me_lck->mti_readahead_anchor >> 1; 8990 const size_t limit = env->me_dxb_mmap.limit; 8991 size_t offset = 8992 toggle ? 0 8993 : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge); 8994 offset = (offset < limit) ? offset : limit; 8995 8996 size_t length = 8997 pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge); 8998 length = (length < limit) ? length : limit; 8999 length -= offset; 9000 9001 eASSERT(env, 0 <= (intptr_t)length); 9002 if (length == 0) 9003 return MDBX_SUCCESS; 9004 9005 NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), 9006 bytes2pgno(env, offset + length)); 9007 9008 #if defined(F_RDAHEAD) 9009 if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) 9010 return errno; 9011 #endif /* F_RDAHEAD */ 9012 9013 int err; 9014 if (enable) { 9015 #if defined(MADV_NORMAL) 9016 err = madvise(env->me_map + offset, length, MADV_NORMAL) 9017 ? ignore_enosys(errno) 9018 : MDBX_SUCCESS; 9019 if (unlikely(MDBX_IS_ERROR(err))) 9020 return err; 9021 #elif defined(POSIX_MADV_NORMAL) 9022 err = ignore_enosys( 9023 posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); 9024 if (unlikely(MDBX_IS_ERROR(err))) 9025 return err; 9026 #elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) 9027 err = ignore_enosys( 9028 posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL)); 9029 if (unlikely(MDBX_IS_ERROR(err))) 9030 return err; 9031 #elif defined(_WIN32) || defined(_WIN64) 9032 /* no madvise on Windows */ 9033 #else 9034 #warning "FIXME" 9035 #endif 9036 if (toggle) { 9037 /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, 9038 * because MADV_WILLNEED with offset != 0 may cause SIGBUS 9039 * on following access to the hinted region. 9040 * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; 9041 * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */ 9042 #if defined(F_RDADVISE) 9043 struct radvisory hint; 9044 hint.ra_offset = offset; 9045 hint.ra_count = 9046 unlikely(length > INT_MAX && sizeof(length) > sizeof(hint.ra_count)) 9047 ? INT_MAX 9048 : (int)length; 9049 (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( 9050 env->me_lazy_fd, F_RDADVISE, &hint); 9051 #elif defined(MADV_WILLNEED) 9052 err = madvise(env->me_map + offset, length, MADV_WILLNEED) 9053 ? 
ignore_enosys(errno) 9054 : MDBX_SUCCESS; 9055 if (unlikely(MDBX_IS_ERROR(err))) 9056 return err; 9057 #elif defined(POSIX_MADV_WILLNEED) 9058 err = ignore_enosys( 9059 posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); 9060 if (unlikely(MDBX_IS_ERROR(err))) 9061 return err; 9062 #elif defined(_WIN32) || defined(_WIN64) 9063 if (mdbx_PrefetchVirtualMemory) { 9064 WIN32_MEMORY_RANGE_ENTRY hint; 9065 hint.VirtualAddress = env->me_map + offset; 9066 hint.NumberOfBytes = length; 9067 (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); 9068 } 9069 #elif defined(POSIX_FADV_WILLNEED) 9070 err = ignore_enosys( 9071 posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); 9072 if (unlikely(MDBX_IS_ERROR(err))) 9073 return err; 9074 #else 9075 #warning "FIXME" 9076 #endif 9077 } 9078 } else { 9079 #if defined(MADV_RANDOM) 9080 err = madvise(env->me_map + offset, length, MADV_RANDOM) 9081 ? ignore_enosys(errno) 9082 : MDBX_SUCCESS; 9083 if (unlikely(MDBX_IS_ERROR(err))) 9084 return err; 9085 #elif defined(POSIX_MADV_RANDOM) 9086 err = ignore_enosys( 9087 posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); 9088 if (unlikely(MDBX_IS_ERROR(err))) 9089 return err; 9090 #elif defined(POSIX_FADV_RANDOM) 9091 err = ignore_enosys( 9092 posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM)); 9093 if (unlikely(MDBX_IS_ERROR(err))) 9094 return err; 9095 #elif defined(_WIN32) || defined(_WIN64) 9096 /* no madvise on Windows */ 9097 #else 9098 #warning "FIXME" 9099 #endif /* MADV_RANDOM */ 9100 } 9101 9102 env->me_lck->mti_readahead_anchor = (enable & 1) + (edge << 1); 9103 err = MDBX_SUCCESS; 9104 return err; 9105 } 9106 #endif /* MDBX_ENABLE_MADVISE */ 9107 9108 __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, 9109 const pgno_t size_pgno, const pgno_t limit_pgno, 9110 const bool implicit) { 9111 const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); 9112 const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); 9113 const size_t prev_size = env->me_dxb_mmap.current; 9114 const size_t prev_limit = env->me_dxb_mmap.limit; 9115 #if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) 9116 const void *const prev_addr = env->me_map; 9117 #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ 9118 9119 VERBOSE("resize datafile/mapping: " 9120 "present %" PRIuPTR " -> %" PRIuPTR ", " 9121 "limit %" PRIuPTR " -> %" PRIuPTR, 9122 prev_size, size_bytes, prev_limit, limit_bytes); 9123 9124 eASSERT(env, limit_bytes >= size_bytes); 9125 eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); 9126 eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); 9127 9128 unsigned mresize_flags = 9129 env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); 9130 #if defined(_WIN32) || defined(_WIN64) 9131 /* Acquire guard in exclusive mode for: 9132 * - to avoid collision between read and write txns around env->me_dbgeo; 9133 * - to avoid attachment of new reading threads (see osal_rdt_lock); */ 9134 osal_srwlock_AcquireExclusive(&env->me_remap_guard); 9135 mdbx_handle_array_t *suspended = NULL; 9136 mdbx_handle_array_t array_onstack; 9137 int rc = MDBX_SUCCESS; 9138 if (limit_bytes == env->me_dxb_mmap.limit && 9139 size_bytes == env->me_dxb_mmap.current && 9140 size_bytes == env->me_dxb_mmap.filesize) 9141 goto bailout; 9142 9143 if ((env->me_flags & MDBX_NOTLS) == 0) { 9144 /* 1) Windows allows only extending a read-write section, but not a 9145 * corresponding mapped view. 
Therefore in other cases we must suspend 9146 * the local threads for safe remap. 9147 * 2) At least on Windows 10 1803 the entire mapped section is unavailable 9148 * for short time during NtExtendSection() or VirtualAlloc() execution. 9149 * 3) Under Wine runtime environment on Linux a section extending is not 9150 * supported. 9151 * 9152 * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ 9153 array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); 9154 array_onstack.count = 0; 9155 suspended = &array_onstack; 9156 rc = osal_suspend_threads_before_remap(env, &suspended); 9157 if (rc != MDBX_SUCCESS) { 9158 ERROR("failed suspend-for-remap: errcode %d", rc); 9159 goto bailout; 9160 } 9161 mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP 9162 : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; 9163 } 9164 #else /* Windows */ 9165 /* Acquire guard to avoid collision between read and write txns 9166 * around env->me_dbgeo */ 9167 int rc = osal_fastmutex_acquire(&env->me_remap_guard); 9168 if (unlikely(rc != MDBX_SUCCESS)) 9169 return rc; 9170 if (limit_bytes == env->me_dxb_mmap.limit && 9171 size_bytes == env->me_dxb_mmap.current) 9172 goto bailout; 9173 9174 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 9175 if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && 9176 lck && !implicit) { 9177 int err = osal_rdt_lock(env) /* lock readers table until remap done */; 9178 if (unlikely(MDBX_IS_ERROR(err))) { 9179 rc = err; 9180 goto bailout; 9181 } 9182 9183 /* looking for readers from this process */ 9184 const unsigned snap_nreaders = 9185 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 9186 eASSERT(env, !implicit); 9187 mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; 9188 for (unsigned i = 0; i < snap_nreaders; ++i) { 9189 if (lck->mti_readers[i].mr_pid.weak == env->me_pid && 9190 lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { 9191 /* the base address of the mapping can't be changed since 9192 * the other reader thread from this process exists. */ 9193 osal_rdt_unlock(env); 9194 mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); 9195 break; 9196 } 9197 } 9198 } 9199 #endif /* ! Windows */ 9200 9201 if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { 9202 #if MDBX_ENABLE_PGOP_STAT 9203 env->me_lck->mti_pgop_stat.wops.weak += 1; 9204 #endif /* MDBX_ENABLE_PGOP_STAT */ 9205 rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), 9206 MDBX_SYNC_NONE); 9207 if (unlikely(rc != MDBX_SUCCESS)) 9208 goto bailout; 9209 } 9210 9211 #if MDBX_ENABLE_MADVISE 9212 if (size_bytes < prev_size) { 9213 NOTICE("resize-MADV_%s %u..%u", 9214 (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, 9215 bytes2pgno(env, prev_size)); 9216 rc = MDBX_RESULT_TRUE; 9217 #if defined(MADV_REMOVE) 9218 if (env->me_flags & MDBX_WRITEMAP) 9219 rc = 9220 madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) 9221 ? ignore_enosys(errno) 9222 : MDBX_SUCCESS; 9223 #endif /* MADV_REMOVE */ 9224 #if defined(MADV_DONTNEED) 9225 if (rc == MDBX_RESULT_TRUE) 9226 rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, 9227 MADV_DONTNEED) 9228 ? 
ignore_enosys(errno) 9229 : MDBX_SUCCESS; 9230 #elif defined(POSIX_MADV_DONTNEED) 9231 if (rc == MDBX_RESULT_TRUE) 9232 rc = ignore_enosys(posix_madvise(env->me_map + size_bytes, 9233 prev_size - size_bytes, 9234 POSIX_MADV_DONTNEED)); 9235 #elif defined(POSIX_FADV_DONTNEED) 9236 if (rc == MDBX_RESULT_TRUE) 9237 rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes, 9238 prev_size - size_bytes, 9239 POSIX_FADV_DONTNEED)); 9240 #endif /* MADV_DONTNEED */ 9241 if (unlikely(MDBX_IS_ERROR(rc))) 9242 goto bailout; 9243 if (env->me_lck->mti_discarded_tail.weak > size_pgno) 9244 env->me_lck->mti_discarded_tail.weak = size_pgno; 9245 } 9246 #endif /* MDBX_ENABLE_MADVISE */ 9247 9248 rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); 9249 9250 #if MDBX_ENABLE_MADVISE 9251 if (rc == MDBX_SUCCESS) { 9252 env->me_lck->mti_discarded_tail.weak = size_pgno; 9253 const bool readahead = 9254 !(env->me_flags & MDBX_NORDAHEAD) && 9255 mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); 9256 const bool force = limit_bytes != prev_limit || 9257 env->me_dxb_mmap.address != prev_addr 9258 #if defined(_WIN32) || defined(_WIN64) 9259 || prev_size > size_bytes 9260 #endif /* Windows */ 9261 ; 9262 rc = set_readahead(env, size_pgno, readahead, force); 9263 } 9264 #endif /* MDBX_ENABLE_MADVISE */ 9265 9266 bailout: 9267 if (rc == MDBX_SUCCESS) { 9268 eASSERT(env, size_bytes == env->me_dxb_mmap.current); 9269 eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); 9270 eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); 9271 #ifdef MDBX_USE_VALGRIND 9272 if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { 9273 VALGRIND_DISCARD(env->me_valgrind_handle); 9274 env->me_valgrind_handle = 0; 9275 if (env->me_dxb_mmap.limit) 9276 env->me_valgrind_handle = 9277 VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); 9278 } 9279 #endif /* MDBX_USE_VALGRIND */ 9280 } else { 9281 if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { 9282 ERROR("failed resize datafile/mapping: " 9283 "present %" PRIuPTR " -> %" PRIuPTR ", " 9284 "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", 9285 prev_size, size_bytes, prev_limit, limit_bytes, rc); 9286 } else { 9287 WARNING("unable resize datafile/mapping: " 9288 "present %" PRIuPTR " -> %" PRIuPTR ", " 9289 "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", 9290 prev_size, size_bytes, prev_limit, limit_bytes, rc); 9291 } 9292 if (!env->me_dxb_mmap.address) { 9293 env->me_flags |= MDBX_FATAL_ERROR; 9294 if (env->me_txn) 9295 env->me_txn->mt_flags |= MDBX_TXN_ERROR; 9296 rc = MDBX_PANIC; 9297 } 9298 } 9299 9300 #if defined(_WIN32) || defined(_WIN64) 9301 int err = MDBX_SUCCESS; 9302 osal_srwlock_ReleaseExclusive(&env->me_remap_guard); 9303 if (suspended) { 9304 err = osal_resume_threads_after_remap(suspended); 9305 if (suspended != &array_onstack) 9306 osal_free(suspended); 9307 } 9308 #else 9309 if (env->me_lck_mmap.lck && 9310 (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) 9311 osal_rdt_unlock(env); 9312 int err = osal_fastmutex_release(&env->me_remap_guard); 9313 #endif /* Windows */ 9314 if (err != MDBX_SUCCESS) { 9315 FATAL("failed resume-after-remap: errcode %d", err); 9316 return MDBX_PANIC; 9317 } 9318 return rc; 9319 } 9320 9321 __cold static int map_resize_implicit(MDBX_env *env, const pgno_t used_pgno, 9322 const pgno_t size_pgno, 9323 const pgno_t limit_pgno) { 9324 const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); 9325 eASSERT(env, mapped_pgno >= 
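/* For context, a minimal sketch (hedged: the path, sizes and flags below are
 * illustrative assumptions, not taken from this file) of how an application
 * normally drives this resize machinery through the public geometry API:
 *
 *   MDBX_env *env;
 *   int rc = mdbx_env_create(&env);
 *   // lower 1 MiB, initial 16 MiB, upper 1 GiB, grow step 1 MiB,
 *   // shrink threshold 4 MiB, default page size (-1).
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_set_geometry(env, 1ul << 20, 16ul << 20, 1ul << 30,
 *                                1ul << 20, 4ul << 20, -1);
 *   if (rc == MDBX_SUCCESS)
 *     rc = mdbx_env_open(env, "./example-db", MDBX_NOSUBDIR, 0644);
 *
 * With a non-zero growth step, a write transaction that runs out of file
 * space ends up in map_resize_implicit() below, which funnels into
 * map_resize(). */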
used_pgno);
9326   return map_resize(
9327       env, used_pgno, size_pgno,
9328       (size_pgno > mapped_pgno)
9329           ? limit_pgno
9330           : /* The actual mapsize may be less, since geo.upper may be changed
9331                by another process; so avoid remapping until it is necessary. */
9332           mapped_pgno,
9333       true);
9334 }
9335
9336 static int meta_unsteady(MDBX_env *env, const txnid_t last_steady,
9337                          MDBX_meta *const meta, mdbx_filehandle_t fd) {
9338   const uint64_t wipe = MDBX_DATASIGN_NONE;
9339   if (unlikely(META_IS_STEADY(meta)) && constmeta_txnid(meta) <= last_steady) {
9340     WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
9341             data_page(meta)->mp_pgno);
9342     if (env->me_flags & MDBX_WRITEMAP)
9343       unaligned_poke_u64(4, meta->mm_sign, wipe);
9344     else
9345       return osal_pwrite(fd, &wipe, sizeof(meta->mm_sign),
9346                          (uint8_t *)&meta->mm_sign - env->me_map);
9347   }
9348   return MDBX_SUCCESS;
9349 }
9350
9351 __cold static int wipe_steady(MDBX_txn *txn, const txnid_t last_steady) {
9352   MDBX_env *const env = txn->mt_env;
9353 #if MDBX_ENABLE_PGOP_STAT
9354   env->me_lck->mti_pgop_stat.wops.weak += 1;
9355 #endif /* MDBX_ENABLE_PGOP_STAT */
9356   const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
9357                                    ? env->me_dsync_fd
9358                                    : env->me_lazy_fd;
9359   int err = meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
9360   if (unlikely(err != MDBX_SUCCESS))
9361     return err;
9362   err = meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
9363   if (unlikely(err != MDBX_SUCCESS))
9364     return err;
9365   err = meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
9366   if (unlikely(err != MDBX_SUCCESS))
9367     return err;
9368
9369   if (env->me_flags & MDBX_WRITEMAP) {
9370     osal_flush_incoherent_cpu_writeback();
9371     err = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
9372                      MDBX_SYNC_DATA);
9373     if (unlikely(err != MDBX_SUCCESS))
9374       return err;
9375   } else {
9376     if (fd == env->me_lazy_fd) {
9377 #if MDBX_USE_SYNCFILERANGE
9378       static bool syncfilerange_unavailable;
9379       if (!syncfilerange_unavailable &&
9380           sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
9381                           SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
9382         err = errno;
9383         if (ignore_enosys(err) == MDBX_RESULT_TRUE)
9384           syncfilerange_unavailable = true;
9385       }
9386       if (syncfilerange_unavailable)
9387 #endif /* MDBX_USE_SYNCFILERANGE */
9388         err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
9389       if (unlikely(err != MDBX_SUCCESS))
9390         return err;
9391     }
9392     osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
9393                                env->me_os_psize);
9394   }
9395
9396   /* force oldest refresh */
9397   atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
9398   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
9399   txn->tw.troika = meta_tap(env);
9400   for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child)
9401     if (scan != txn)
9402       scan->tw.troika = txn->tw.troika;
9403   return MDBX_SUCCESS;
9404 }
9405
9406 //------------------------------------------------------------------------------
9407
9408 MDBX_MAYBE_UNUSED __hot static pgno_t *
9409 scan4seq_fallback(pgno_t *range, const size_t len, const unsigned seq) {
9410   assert(seq > 0 && len > seq);
9411 #if MDBX_PNL_ASCENDING
9412   assert(range[-1] == len);
9413   const pgno_t *const detent = range + len - seq;
9414   const ptrdiff_t offset = (ptrdiff_t)seq;
9415   const pgno_t target = (pgno_t)offset;
9416   if (likely(len > seq + 3)) {
9417     do {
9418       const pgno_t diff0 = range[offset + 0] - range[0];
9419       const pgno_t diff1 = range[offset + 1] - range[1];
9420       const pgno_t diff2 = range[offset + 2] - range[2];
9421       const pgno_t diff3 = range[offset + 3] - range[3];
9422       if (diff0 == target)
9423         return range + 0;
9424       if (diff1 == target)
9425         return range + 1;
9426       if (diff2 == target)
9427         return range + 2;
9428       if (diff3 == target)
9429         return range + 3;
9430       range += 4;
9431     } while (range + 3 < detent);
9432     if (range == detent)
9433       return nullptr;
9434   }
9435   do
9436     if (range[offset] - *range == target)
9437       return range;
9438   while (++range < detent);
9439 #else
9440   assert(range[-(ptrdiff_t)len] == len);
9441   const pgno_t *const detent = range - len + seq;
9442   const ptrdiff_t offset = -(ptrdiff_t)seq;
9443   const pgno_t target = (pgno_t)offset;
9444   if (likely(len > seq + 3)) {
9445     do {
9446       const pgno_t diff0 = range[-0] - range[offset - 0];
9447       const pgno_t diff1 = range[-1] - range[offset - 1];
9448       const pgno_t diff2 = range[-2] - range[offset - 2];
9449       const pgno_t diff3 = range[-3] - range[offset - 3];
9450       /* The point of doing the calculations before the branches is to let
9451        * the compiler load and compute all the values in parallel. */
9452       if (diff0 == target)
9453         return range - 0;
9454       if (diff1 == target)
9455         return range - 1;
9456       if (diff2 == target)
9457         return range - 2;
9458       if (diff3 == target)
9459         return range - 3;
9460       range -= 4;
9461     } while (range > detent + 3);
9462     if (range == detent)
9463       return nullptr;
9464   }
9465   do
9466     if (*range - range[offset] == target)
9467       return range;
9468   while (--range > detent);
9469 #endif /* MDBX_PNL sort-order */
9470   return nullptr;
9471 }
9472
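/* A hedged illustration of the invariant scan4seq_fallback() relies on (this
 * block is an explanatory sketch, not part of the library): in a sorted PNL
 * of unique page numbers, the pages range[i], range[i]+1, ..., range[i]+seq
 * are all present iff the element seq slots away differs from range[i] by
 * exactly seq. E.g. for the ascending list {3, 4, 5, 8, 9} and seq = 2:
 * 5 - 3 == 2, so {3, 4, 5} is a contiguous run of 3 pages starting at 3.
 * A scalar model of the same predicate:
 *
 *   // Returns the start of a run of (seq + 1) consecutive ascending values,
 *   // or NULL; assumes list[0..len-1] is sorted ascending without duplicates.
 *   static const uint32_t *find_run(const uint32_t *list, size_t len,
 *                                   unsigned seq) {
 *     for (size_t i = 0; i + seq < len; ++i)
 *       if (list[i + seq] - list[i] == seq)
 *         return list + i;
 *     return NULL;
 *   }
 *
 * The descending branch above computes the same predicate with negated
 * offsets, and is unrolled 4x so the subtractions can issue in parallel. */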
9473 MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl,
9474                                                            const unsigned seq) {
9475   size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pnl);
9476 #if MDBX_PNL_ASCENDING
9477   while (seq <= MDBX_PNL_SIZE(pnl) - begin) {
9478     if (pnl[begin + seq] - pnl[begin] == seq)
9479       return pnl + begin;
9480     ++begin;
9481   }
9482 #else
9483   while (begin > seq) {
9484     if (pnl[begin - seq] - pnl[begin] == seq)
9485       return pnl + begin;
9486     --begin;
9487   }
9488 #endif /* MDBX_PNL sort-order */
9489   return nullptr;
9490 }
9491
9492 #if defined(_MSC_VER) && !defined(__builtin_clz) && \
9493     !__has_builtin(__builtin_clz)
9494 MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(unsigned value) {
9495   unsigned long index;
9496   _BitScanReverse(&index, value);
9497   return 31 - index; /* the count of leading zeros, not the MSB index */
9498 }
9499 #endif /* _MSC_VER */
9500
9501 #if defined(_MSC_VER) && !defined(__builtin_clzl) && \
9502     !__has_builtin(__builtin_clzl)
9503 #define __builtin_clzl(value) __builtin_clz(value)
9504 #endif /* _MSC_VER */
9505
9506 #if !defined(MDBX_ATTRIBUTE_TARGET) && \
9507     (__has_attribute(__target__) || __GNUC_PREREQ(5, 0))
9508 #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target)))
9509 #endif /* MDBX_ATTRIBUTE_TARGET */
9510
9511 #if defined(__SSE2__)
9512 #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
9513 #elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__)
9514 #define __SSE2__
9515 #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
9516 #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__)
9517 #define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2")
9518 #endif /* __SSE2__ */
9519
9520 #if defined(__AVX2__)
9521 #define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */
9522 #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__)
9523 #define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2")
9524 #endif /* __AVX2__ */
9525
9526 #if defined(__AVX512BW__)
9527 #define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */
9528 #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \
9529     (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0))
9530 #define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw")
9531 #endif /* __AVX512BW__ */
9532
9533 #ifdef MDBX_ATTRIBUTE_TARGET_SSE2
9534 MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned
9535 diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset,
9536                   const __m128i pattern) {
9537   const __m128i f = _mm_loadu_si128((const __m128i *)ptr);
9538   const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset));
9539   const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern);
9540   return _mm_movemask_ps(*(const __m128 *)&cmp);
9541 }
9542
9543 MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t *
9544 scan4seq_sse2(pgno_t *range, const size_t len, const unsigned seq) {
9545   assert(seq > 0 && len > seq);
9546 #if MDBX_PNL_ASCENDING
9547 #error "FIXME: Not implemented"
9548 #endif /* MDBX_PNL_ASCENDING */
9549   assert(range[-(ptrdiff_t)len] == len);
9550   pgno_t *const detent = range - len + seq;
9551   const ptrdiff_t offset = -(ptrdiff_t)seq;
9552   const pgno_t target = (pgno_t)offset;
9553   const __m128i pattern = _mm_set1_epi32(target);
9554   uint8_t mask;
9555   if (likely(len > seq + 3)) {
9556     do {
9557       mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern);
9558       if (mask) {
9559 #ifndef __SANITIZE_ADDRESS__
9560       found:
9561 #endif /* __SANITIZE_ADDRESS__ */
9562         return range + 28 - __builtin_clz(mask);
9563       }
9564       range -= 4;
9565     } while (range > detent + 3);
9566     if (range == detent)
9567       return nullptr;
9568   }
9569
9570   /* From here on, between 4 and 12 extra bytes are read, which may lie
9571    * not only beyond the region allocated for the PNL but may also cross a
9572    * memory-page boundary, which can lead to ASAN errors as well as a crash.
9573    * Therefore we check the offset within the page, and always play it safe under ASAN. */
9574 #ifndef __SANITIZE_ADDRESS__
9575   const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
9576   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
9577       !RUNNING_ON_VALGRIND) {
9578     const unsigned extra = (unsigned)(detent + 4 - range);
9579     assert(extra > 0 && extra < 4);
9580     mask = 0xF << extra;
9581     mask &= diffcmp2mask_sse2(range - 3, offset, pattern);
9582     if (mask)
9583       goto found;
9584     return nullptr;
9585   }
9586 #endif /* __SANITIZE_ADDRESS__ */
9587   do
9588     if (*range - range[offset] == target)
9589       return range;
9590   while (--range != detent);
9591   return nullptr;
9592 }
9593 #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
9594
9595 #ifdef MDBX_ATTRIBUTE_TARGET_AVX2
9596 MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned
9597 diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset,
9598                   const __m256i pattern) {
9599   const __m256i f = _mm256_loadu_si256((const __m256i *)ptr);
9600   const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset));
9601   const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern);
9602   return _mm256_movemask_ps(*(const __m256 *)&cmp);
9603 }
9604
9605 MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t *
9606 scan4seq_avx2(pgno_t *range, const size_t len, const unsigned seq) {
9607   assert(seq > 0 && len > seq);
9608 #if MDBX_PNL_ASCENDING
9609 #error "FIXME: Not implemented"
9610 #endif /* MDBX_PNL_ASCENDING */
9611   assert(range[-(ptrdiff_t)len] == len);
9612   pgno_t *const detent = range - len + seq;
9613   const ptrdiff_t offset = -(ptrdiff_t)seq;
9614   const pgno_t target = (pgno_t)offset;
9615   const __m256i pattern = _mm256_set1_epi32(target);
9616   uint8_t mask;
9617   if (likely(len > seq + 7)) {
9618     do {
9619       mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern);
9620       if (mask) {
9621 #ifndef __SANITIZE_ADDRESS__
9622       found:
9623 #endif /* __SANITIZE_ADDRESS__ */
9624         return range + 24 - __builtin_clz(mask);
9625       }
9626       range -= 8;
9627     } while (range > detent + 7);
9628     if (range == detent)
9629       return nullptr;
9630   }
9631
9632   /* From here on, between 4 and 28 extra bytes are read, which may lie
9633    * not only beyond the region allocated for the PNL but may also cross a
9634    * memory-page boundary, which can lead to ASAN errors as well as a crash.
9635    * Therefore we check the offset within the page, and always play it safe under ASAN. */
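/* A hedged explanatory note on the mask arithmetic above (commentary added
 * for clarity, not upstream text): diffcmp2mask_sse2() packs the four lane
 * comparisons into bits 0..3 of the result, bit j standing for the element
 * (range - 3) + j. The highest set bit is j = 31 - __builtin_clz(mask), so
 *
 *   range + 28 - __builtin_clz(mask) == (range - 3) + j,
 *
 * i.e. the match with the highest address. The tail code relies on the same
 * packing: elements are valid only while they lie above `detent`, which for
 * `extra` out-of-range lanes means keeping only bits j >= extra, and that is
 * precisely what `mask = 0xF << extra` leaves after the bitwise AND. */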
9636 #ifndef __SANITIZE_ADDRESS__
9637   const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */;
9638   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
9639       !RUNNING_ON_VALGRIND) {
9640     const unsigned extra = (unsigned)(detent + 8 - range);
9641     assert(extra > 0 && extra < 8);
9642     mask = 0xFF << extra;
9643     mask &= diffcmp2mask_avx2(range - 7, offset, pattern);
9644     if (mask)
9645       goto found;
9646     return nullptr;
9647   }
9648 #endif /* __SANITIZE_ADDRESS__ */
9649   if (range - 3 > detent) {
9650     mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern);
9651     if (mask)
9652       return range + 28 - __builtin_clz(mask);
9653     range -= 4;
9654   }
9655   while (range > detent) {
9656     if (*range - range[offset] == target)
9657       return range;
9658     --range;
9659   }
9660   return nullptr;
9661 }
9662 #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
9663
9664 #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
9665 MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned
9666 diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset,
9667                       const __m512i pattern) {
9668   const __m512i f = _mm512_loadu_si512((const __m512i *)ptr);
9669   const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset));
9670   return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern);
9671 }
9672
9673 MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t *
9674 scan4seq_avx512bw(pgno_t *range, const size_t len, const unsigned seq) {
9675   assert(seq > 0 && len > seq);
9676 #if MDBX_PNL_ASCENDING
9677 #error "FIXME: Not implemented"
9678 #endif /* MDBX_PNL_ASCENDING */
9679   assert(range[-(ptrdiff_t)len] == len);
9680   pgno_t *const detent = range - len + seq;
9681   const ptrdiff_t offset = -(ptrdiff_t)seq;
9682   const pgno_t target = (pgno_t)offset;
9683   const __m512i pattern = _mm512_set1_epi32(target);
9684   unsigned mask;
9685   if (likely(len > seq + 15)) {
9686     do {
9687       mask = diffcmp2mask_avx512bw(range - 15, offset, pattern);
9688       if (mask) {
9689 #ifndef __SANITIZE_ADDRESS__
9690       found:
9691 #endif /* __SANITIZE_ADDRESS__ */
9692         return range + 16 - __builtin_clz(mask);
9693       }
9694       range -= 16;
9695     } while (range > detent + 15);
9696     if (range == detent)
9697       return nullptr;
9698   }
9699
9700   /* From here on, between 4 and 60 extra bytes are read, which may lie
9701    * not only beyond the region allocated for the PNL but may also cross a
9702    * memory-page boundary, which can lead to ASAN errors as well as a crash.
9703    * Therefore we check the offset within the page, and always play it safe under ASAN. */
9704 #ifndef __SANITIZE_ADDRESS__
9705   const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */;
9706   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
9707       !RUNNING_ON_VALGRIND) {
9708     const unsigned extra = (unsigned)(detent + 16 - range);
9709     assert(extra > 0 && extra < 16);
9710     mask = 0xFFFF << extra;
9711     mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern);
9712     if (mask)
9713       goto found;
9714     return nullptr;
9715   }
9716 #endif /* __SANITIZE_ADDRESS__ */
9717   if (range - 7 > detent) {
9718     mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern);
9719     if (mask)
9720       return range + 24 - __builtin_clz(mask);
9721     range -= 8;
9722   }
9723   if (range - 3 > detent) {
9724     mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern);
9725     if (mask)
9726       return range + 28 - __builtin_clz(mask);
9727     range -= 4;
9728   }
9729   while (range > detent) {
9730     if (*range - range[offset] == target)
9731       return range;
9732     --range;
9733   }
9734   return nullptr;
9735 }
9736 #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
9737
9738 #if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \
9739     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
9740 static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr,
9741                                                 const ptrdiff_t offset,
9742                                                 const uint32x4_t pattern) {
9743   const uint32x4_t f = vld1q_u32(ptr);
9744   const uint32x4_t l = vld1q_u32(ptr + offset);
9745   const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern));
9746   if (sizeof(size_t) > 7)
9747     return vget_lane_u64(vreinterpret_u64_u16(cmp), 0);
9748   else
9749     return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))),
9750                          0);
9751 }
9752
9753 __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len,
9754                                    const unsigned seq) {
9755   assert(seq > 0 && len > seq);
9756 #if MDBX_PNL_ASCENDING
9757 #error "FIXME: Not implemented"
9758 #endif /* MDBX_PNL_ASCENDING */
9759   assert(range[-(ptrdiff_t)len] == len);
9760   pgno_t *const detent = range - len + seq;
9761   const ptrdiff_t offset = -(ptrdiff_t)seq;
9762   const pgno_t target = (pgno_t)offset;
9763   const uint32x4_t pattern = vmovq_n_u32(target);
9764   size_t mask;
9765   if (likely(len > seq + 3)) {
9766     do {
9767       mask = diffcmp2mask_neon(range - 3, offset, pattern);
9768       if (mask) {
9769 #ifndef __SANITIZE_ADDRESS__
9770       found:
9771 #endif /* __SANITIZE_ADDRESS__ */
9772         return (pgno_t *)((char *)range -
9773                           (__builtin_clzl(mask) >> sizeof(size_t) / 4));
9774       }
9775       range -= 4;
9776     } while (range > detent + 3);
9777     if (range == detent)
9778       return nullptr;
9779   }
9780
9781   /* From here on, between 4 and 12 extra bytes are read, which may lie
9782    * not only beyond the region allocated for the PNL but may also cross a
9783    * memory-page boundary, which can lead to ASAN errors as well as a crash.
9784    * Therefore we check the offset within the page, and always play it safe under ASAN.
*/
9785 #ifndef __SANITIZE_ADDRESS__
9786   const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
9787   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
9788       !RUNNING_ON_VALGRIND) {
9789     const unsigned extra = (unsigned)(detent + 4 - range);
9790     assert(extra > 0 && extra < 4);
9791     mask = (~(size_t)0) << (extra * sizeof(size_t) * 2);
9792     mask &= diffcmp2mask_neon(range - 3, offset, pattern);
9793     if (mask)
9794       goto found;
9795     return nullptr;
9796   }
9797 #endif /* __SANITIZE_ADDRESS__ */
9798   do
9799     if (*range - range[offset] == target)
9800       return range;
9801   while (--range != detent);
9802   return nullptr;
9803 }
9804 #endif /* __ARM_NEON || __ARM_NEON__ */
9805
9806 #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW)
9807 #define scan4seq_default scan4seq_avx512bw
9808 #define scan4seq scan4seq_default
9809 #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2)
9810 #define scan4seq_default scan4seq_avx2
9811 #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2)
9812 #define scan4seq_default scan4seq_sse2
9813 #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \
9814     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
9815 #define scan4seq_default scan4seq_neon
9816 /* Selection of additional variants should be added here. */
9817 #endif /* scan4seq_default */
9818
9819 #ifndef scan4seq_default
9820 #define scan4seq_default scan4seq_fallback
9821 #endif /* scan4seq_default */
9822
9823 #ifdef scan4seq
9824 /* The scan4seq() chosen above is either the best option, or there are no
   alternatives. */
9825 #else
9826 #if !(__has_builtin(__builtin_cpu_supports) || \
9827       defined(__BUILTIN_CPU_SUPPORTS__) || \
9828       (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23)))
9829 /* The scan4seq_default() will be used, since the compiler provides no support
9830  * for CPU-feature detection. Please don't ask to implement cpuid-based
9831  * detection and don't make such PRs. */
9832 #define scan4seq scan4seq_default
9833 #else
9834 /* Selecting the most appropriate implementation at runtime,
9835  * depending on the available CPU features. */
9836 static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
9837                                  const unsigned seq);
9838 static pgno_t *(*scan4seq)(pgno_t *range, const size_t len,
9839                            const unsigned seq) = scan4seq_resolver;
9840
9841 static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
9842                                  const unsigned seq) {
9843   pgno_t *(*choice)(pgno_t * range, const size_t len, const unsigned seq) =
9844       nullptr;
9845 #if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \
9846     __GNUC_PREREQ(4, 8)
9847   __builtin_cpu_init();
9848 #endif /* __builtin_cpu_init() */
9849 #ifdef MDBX_ATTRIBUTE_TARGET_SSE2
9850   if (__builtin_cpu_supports("sse2"))
9851     choice = scan4seq_sse2;
9852 #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
9853 #ifdef MDBX_ATTRIBUTE_TARGET_AVX2
9854   if (__builtin_cpu_supports("avx2"))
9855     choice = scan4seq_avx2;
9856 #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
9857 #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
9858   if (__builtin_cpu_supports("avx512bw"))
9859     choice = scan4seq_avx512bw;
9860 #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
9861   /* Selection of additional variants should be added here. */
9862   scan4seq = choice ? choice : scan4seq_default;
9863   return scan4seq(range, len, seq);
9864 }
9865 #endif /* __has_builtin(__builtin_cpu_supports */
9866 #endif /* scan4seq */
9867
9868 //------------------------------------------------------------------------------
9869
9870 /* Allocate page numbers and memory for writing.
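 *
 * As a hedged illustration (the number of pages is an arbitrary example),
 * a typical internal request for four contiguous pages looks like:
 *
 *   pgr_t r = page_alloc_slowpath(mc, 4, MDBX_ALLOC_ALL);
 *   if (r.err == MDBX_SUCCESS)
 *     ... use r.page, which is already marked dirty for this txn ...
 *
 * where MDBX_ALLOC_ALL (defined below) permits both reclaiming from the GC
 * and extending the datafile.
 *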
Maintain mt_last_reclaimed,
9871  * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure.
9872  *
9873  * If there are free pages available from older transactions, they
9874  * are re-used first. Otherwise allocate a new page at mt_next_pgno.
9875  * Do not modify the GC, just merge GC records into mt_reclaimed_pglist
9876  * and move mt_last_reclaimed to say which records were consumed. Only this
9877  * function can create mt_reclaimed_pglist and move
9878  * mt_last_reclaimed/mt_next_pgno.
9879  *
9880  * [in] mc   A cursor handle identifying the transaction and
9881  *           database for which we are allocating.
9882  * [in] num  The number of pages to allocate.
9883  *
9884  * Returns 0 on success, non-zero on failure. */
9885
9886 #define MDBX_ALLOC_GC 1
9887 #define MDBX_ALLOC_NEW 2
9888 #define MDBX_ALLOC_COALESCE 4
9889 #define MDBX_ALLOC_SLOT 8
9890 #define MDBX_ALLOC_FAKE 16
9891 #define MDBX_ALLOC_NOLOG 32
9892 #define MDBX_ALLOC_ALL (MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
9893
9894 static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
9895   pgr_t ret;
9896   MDBX_txn *const txn = mc->mc_txn;
9897   MDBX_env *const env = txn->mt_env;
9898   eASSERT(env, num == 0 || !(flags & MDBX_ALLOC_SLOT));
9899   eASSERT(env, num > 0 || !(flags & MDBX_ALLOC_NEW));
9900
9901   const unsigned coalesce_threshold = env->me_maxgc_ov1page >> 2;
9902   if (likely(flags & MDBX_ALLOC_GC)) {
9903     flags |= env->me_flags & MDBX_LIFORECLAIM;
9904     if (txn->mt_dbs[FREE_DBI].md_branch_pages &&
9905         MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) < coalesce_threshold)
9906       flags |= MDBX_ALLOC_COALESCE;
9907     if (unlikely(
9908             /* If mc is updating the GC, then the retired-list cannot play
9909                catch-up with itself by growing while trying to save it. */
9910             (mc->mc_flags & C_RECLAIMING) ||
9911             /* avoid (recursive) search inside empty tree and while tree is
9912                updating, todo4recovery://erased_by_github/libmdbx/issues/31 */
9913             txn->mt_dbs[FREE_DBI].md_entries == 0 ||
9914             /* If our dirty list is already full, we can't touch GC */
9915             (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth &&
9916              !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY))))
9917       flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE);
9918   }
9919
9920   eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist,
9921                                    txn->mt_next_pgno - MDBX_ENABLE_REFUND));
9922   pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
9923   unsigned re_len = MDBX_PNL_SIZE(re_list);
9924   pgno_t *range = nullptr;
9925   txnid_t detent = 0, last = 0;
9926 #if MDBX_ENABLE_PGOP_STAT
9927   uint64_t timestamp = 0;
9928 #endif /* MDBX_ENABLE_PGOP_STAT */
9929
9930   while (true) { /* hsr-kick retry loop */
9931     MDBX_cursor_couple recur;
9932     for (MDBX_cursor_op op = MDBX_FIRST;;
9933          op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) {
9934       MDBX_val key, data;
9935
9936       /* Seek a big enough contiguous page range.
9937        * Prefer pages with lower pgno. */
9938       eASSERT(env,
9939               pnl_check_allocated(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
9940       if (!(flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) {
9941         eASSERT(env, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno &&
9942                          MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno);
9943         range = re_list + (MDBX_PNL_ASCENDING ?
1 : re_len);
9944         pgno = *range;
9945         if (num == 1)
9946           goto done;
9947         range = scan4seq(range, re_len, num - 1);
9948         tASSERT(txn, range == scan4range_checker(re_list, num - 1));
9949         if (likely(range)) {
9950           pgno = *range;
9951           goto done;
9952         }
9953       }
9954
9955       if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */
9956         if (unlikely(!(flags & MDBX_ALLOC_GC)))
9957           break /* reclaiming is prohibited for now */;
9958
9959         /* Prepare to fetch and coalesce */
9960 #if MDBX_ENABLE_PGOP_STAT
9961         if (likely(timestamp == 0))
9962           timestamp = osal_monotime();
9963 #endif /* MDBX_ENABLE_PGOP_STAT */
9964         detent = txn_oldest_reader(txn) + 1;
9965
9966         ret.err = cursor_init(&recur.outer, txn, FREE_DBI);
9967         if (unlikely(ret.err != MDBX_SUCCESS))
9968           goto fail;
9969         if (flags & MDBX_LIFORECLAIM) {
9970           /* Begin from oldest reader if any */
9971           if (detent > MIN_TXNID) {
9972             last = detent - 1;
9973             op = MDBX_SET_RANGE;
9974           }
9975         } else if (txn->tw.last_reclaimed) {
9976           /* Continue lookup from txn->tw.last_reclaimed to oldest reader */
9977           last = txn->tw.last_reclaimed;
9978           op = MDBX_SET_RANGE;
9979         }
9980
9981         key.iov_base = &last;
9982         key.iov_len = sizeof(last);
9983       }
9984
9985       if (!(flags & MDBX_LIFORECLAIM)) {
9986         /* Do not try to fetch more if the record would be too recent */
9987         if (op != MDBX_FIRST && ++last >= detent) {
9988           detent = txn_oldest_reader(txn) + 1;
9989           if (detent <= last)
9990             break;
9991         }
9992       }
9993
9994       ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
9995       if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) {
9996         if (op == MDBX_SET_RANGE)
9997           continue;
9998         const txnid_t snap = txn_oldest_reader(txn);
9999         if (unlikely(detent <= snap)) {
10000           detent = snap + 1;
10001           last = snap;
10002           key.iov_base = &last;
10003           key.iov_len = sizeof(last);
10004           op = MDBX_SET_RANGE;
10005           ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
10006         }
10007       }
10008       if (unlikely(ret.err)) {
10009         if (ret.err == MDBX_NOTFOUND)
10010           break;
10011         goto fail;
10012       }
10013
10014       if (unlikely(key.iov_len != sizeof(txnid_t))) {
10015         ret.err = MDBX_CORRUPTED;
10016         goto fail;
10017       }
10018       last = unaligned_peek_u64(4, key.iov_base);
10019       if (detent <= last) {
10020         detent = txn_oldest_reader(txn) + 1;
10021         if (detent <= last) {
10022           if (flags & MDBX_LIFORECLAIM)
10023             continue;
10024           break;
10025         }
10026       }
10027
10028       if (flags & MDBX_LIFORECLAIM) {
10029         /* skip IDs of records that were already reclaimed */
10030         if (txn->tw.lifo_reclaimed) {
10031           size_t i;
10032           for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i)
10033             if (txn->tw.lifo_reclaimed[i] == last)
10034               break;
10035           if (i)
10036             continue;
10037         }
10038       }
10039
10040       /* Reading next GC record */
10041       MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top];
10042       if (unlikely((ret.err = node_read(
10043                         &recur.outer,
10044                         page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]),
10045                         &data, mp)) != MDBX_SUCCESS))
10046         goto fail;
10047
10048       if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) {
10049         txn->tw.lifo_reclaimed = txl_alloc();
10050         if (unlikely(!txn->tw.lifo_reclaimed)) {
10051           ret.err = MDBX_ENOMEM;
10052           goto fail;
10053         }
10054       }
10055
10056       /* Append PNL from GC record to tw.reclaimed_pglist */
10057       cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0);
10058       pgno_t *gc_pnl = (pgno_t *)data.iov_base;
10059       tASSERT(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl));
10060       if (unlikely(data.iov_len % sizeof(pgno_t) ||
10061                    data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
!pnl_check(gc_pnl, txn->mt_next_pgno))) { 10063 ret.err = MDBX_CORRUPTED; 10064 goto fail; 10065 } 10066 const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); 10067 if (unlikely(/* list is too long already */ MDBX_PNL_SIZE( 10068 txn->tw.reclaimed_pglist) >= 10069 env->me_options.rp_augment_limit) && 10070 ((/* not a slot-request from gc-update */ 10071 (flags & MDBX_ALLOC_SLOT) == 0 && 10072 /* have enough unallocated space */ txn->mt_geo.upper >= 10073 txn->mt_next_pgno + (size_t)num) || 10074 gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= 10075 MDBX_PGL_LIMIT)) { 10076 /* Stop reclaiming to avoid large/overflow the page list. 10077 * This is a rare case while search for a continuously multi-page region 10078 * in a large database. 10079 * todo4recovery://erased_by_github/libmdbx/issues/123 */ 10080 NOTICE("stop reclaiming to avoid PNL overflow: %u (current) + %u " 10081 "(chunk) -> %u", 10082 MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, 10083 gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); 10084 flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_COALESCE); 10085 break; 10086 } 10087 ret.err = pnl_need(&txn->tw.reclaimed_pglist, gc_len); 10088 if (unlikely(ret.err != MDBX_SUCCESS)) 10089 goto fail; 10090 re_list = txn->tw.reclaimed_pglist; 10091 10092 /* Remember ID of GC record */ 10093 if (flags & MDBX_LIFORECLAIM) { 10094 ret.err = txl_append(&txn->tw.lifo_reclaimed, last); 10095 if (unlikely(ret.err != MDBX_SUCCESS)) 10096 goto fail; 10097 } 10098 txn->tw.last_reclaimed = last; 10099 10100 if (LOG_ENABLED(MDBX_LOG_EXTRA)) { 10101 DEBUG_EXTRA("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", 10102 last, txn->mt_dbs[FREE_DBI].md_root, gc_len); 10103 for (unsigned i = gc_len; i; i--) 10104 DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); 10105 DEBUG_EXTRA_PRINT("%s\n", "."); 10106 } 10107 10108 /* Merge in descending sorted order */ 10109 pnl_merge(re_list, gc_pnl); 10110 if (AUDIT_ENABLED() && unlikely(!pnl_check(re_list, txn->mt_next_pgno))) { 10111 ret.err = MDBX_CORRUPTED; 10112 goto fail; 10113 } 10114 tASSERT(txn, dirtylist_check(txn)); 10115 10116 re_len = MDBX_PNL_SIZE(re_list); 10117 tASSERT(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); 10118 if (MDBX_ENABLE_REFUND && re_len && 10119 unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { 10120 /* Refund suitable pages into "unallocated" space */ 10121 txn_refund(txn); 10122 re_list = txn->tw.reclaimed_pglist; 10123 re_len = MDBX_PNL_SIZE(re_list); 10124 } 10125 10126 /* Done for a kick-reclaim mode, actually no page needed */ 10127 if (unlikely(flags & MDBX_ALLOC_SLOT)) { 10128 DEBUG("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); 10129 #if MDBX_ENABLE_PGOP_STAT 10130 eASSERT(env, timestamp != 0); 10131 env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; 10132 #endif /* MDBX_ENABLE_PGOP_STAT */ 10133 ret.err = MDBX_SUCCESS; 10134 ret.page = NULL; 10135 return ret; 10136 } 10137 10138 /* Don't try to coalesce too much. 
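 * The threshold here is me_maxgc_ov1page >> 2, i.e. about a quarter of the
 * number of pgno_t items that fit in a single overflow page. A rough, hedged
 * estimate (approximate, for illustration only): with 4 KiB pages roughly a
 * thousand 4-byte page numbers fit per page, so coalescing is switched off
 * once the reclaimed list holds on the order of 250 entries.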
*/
10139     if (re_len /* current size */ > coalesce_threshold) {
10140       if (flags & MDBX_ALLOC_COALESCE)
10141         TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold");
10142       flags &= ~MDBX_ALLOC_COALESCE;
10143     }
10144   }
10145
10146   if (F_ISSET(flags, MDBX_ALLOC_COALESCE | MDBX_ALLOC_GC)) {
10147     DEBUG_EXTRA("clear %s and continue", "MDBX_ALLOC_COALESCE");
10148     flags &= ~MDBX_ALLOC_COALESCE;
10149     continue;
10150   }
10151
10152   /* There are no suitable pages in the GC, so to be able to allocate
10153    * we should CHOOSE one of:
10154    *  - make a new steady checkpoint if reclaiming was stopped by
10155    *    the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
10156    *  - kick lagging reader(s) if reclaiming was stopped by one of them;
10157    *  - extend the database file. */
10158
10159   /* Will use new pages from the map if nothing is suitable in the GC. */
10160   range = nullptr;
10161   pgno = txn->mt_next_pgno;
10162   const size_t next = (size_t)pgno + num;
10163
10164   if (flags & MDBX_ALLOC_GC) {
10165     const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
10166     const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
10167     /* did reclaiming stop at the last steady point? */
10168     if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
10169         detent == prefer_steady.txnid + 1) {
10170       DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
10171             "-%s, detent %" PRIaTXN,
10172             recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
10173             durable_caption(prefer_steady.ptr_c), detent);
10174       ret.err = MDBX_RESULT_TRUE;
10175       const pgno_t autosync_threshold =
10176           atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
10177       const uint64_t autosync_period =
10178           atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
10179       /* wipe the last steady-point if one of:
10180        *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
10181        *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
10182        * otherwise, make a new steady-point if one of:
10183        *  - auto-sync threshold is specified and reached;
10184        *  - upper limit of database size is reached;
10185        *  - database is full (with the current file size)
10186        *    AND auto-sync threshold is NOT specified */
10187       if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
10188           ((autosync_threshold | autosync_period) == 0 ||
10189            next >= prefer_steady.ptr_c->mm_geo.now)) {
10190         /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
10191          * without any auto-sync threshold(s). */
10192         ret.err = wipe_steady(txn, detent);
10193         DEBUG("gc-wipe-steady, rc %d", ret.err);
10194         eASSERT(env, prefer_steady.ptr_c !=
10195                          meta_prefer_steady(env, &txn->tw.troika).ptr_c);
10196       } else if ((flags & MDBX_ALLOC_NEW) == 0 ||
10197                  (autosync_threshold &&
10198                   atomic_load32(&env->me_lck->mti_unsynced_pages,
10199                                 mo_Relaxed) >= autosync_threshold) ||
10200                  (autosync_period &&
10201                   osal_monotime() -
10202                           atomic_load64(&env->me_lck->mti_sync_timestamp,
10203                                         mo_Relaxed) >=
10204                       autosync_period) ||
10205                  next >= txn->mt_geo.upper ||
10206                  (next >= txn->mt_end_pgno &&
10207                   (autosync_threshold | autosync_period) == 0)) {
10208         /* make steady checkpoint.
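 * The autosync_threshold / autosync_period values consulted above come from
 * the public API. A hedged usage sketch (the values are arbitrary examples):
 *
 *   mdbx_env_set_syncbytes(env, 1u << 20);   // steady sync after ~1 MiB
 *   mdbx_env_set_syncperiod(env, 5 * 65536); // or every 5 s (16.16 fixed point)
 *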
*/
10209         MDBX_meta meta = *recent.ptr_c;
10210         ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta,
10211                               &txn->tw.troika);
10212         DEBUG("gc-make-steady, rc %d", ret.err);
10213         eASSERT(env, prefer_steady.ptr_c !=
10214                          meta_prefer_steady(env, &txn->tw.troika).ptr_c);
10215       }
10216       if (likely(ret.err != MDBX_RESULT_TRUE)) {
10217         if (unlikely(ret.err != MDBX_SUCCESS))
10218           goto fail;
10219         continue;
10220       }
10221     }
10222   }
10223
10224   /* don't kick lagging reader(s) if there is enough unallocated space
10225    * at the end of the database file. */
10226   if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno)
10227     goto done;
10228
10229   if (flags & MDBX_ALLOC_GC) {
10230     const txnid_t laggard = txn_oldest_reader(txn);
10231     if (laggard >= detent || (laggard < txn->mt_txnid - xMDBX_TXNID_STEP &&
10232                               kick_longlived_readers(env, laggard) >= detent))
10233       continue;
10234   }
10235
10236   ret.err = MDBX_NOTFOUND;
10237   if (flags & MDBX_ALLOC_NEW) {
10238     ret.err = MDBX_MAP_FULL;
10239     if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) {
10240       eASSERT(env, next > txn->mt_end_pgno);
10241       const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv);
10242       size_t aligned = pgno_align2os_pgno(
10243           env, (pgno_t)(next + grow_step - next % grow_step));
10244
10245       if (aligned > txn->mt_geo.upper)
10246         aligned = txn->mt_geo.upper;
10247       eASSERT(env, aligned > txn->mt_end_pgno);
10248
10249       VERBOSE("try growth datafile to %zu pages (+%zu)", aligned,
10250               aligned - txn->mt_end_pgno);
10251       ret.err = map_resize_implicit(env, txn->mt_next_pgno, (pgno_t)aligned,
10252                                     txn->mt_geo.upper);
10253       if (ret.err == MDBX_SUCCESS) {
10254         env->me_txn->mt_end_pgno = (pgno_t)aligned;
10255         goto done;
10256       }
10257
10258       ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned,
10259             aligned - txn->mt_end_pgno, ret.err);
10260     } else {
10261       NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, next,
10262              txn->mt_geo.upper);
10263     }
10264   }
10265
10266 fail:
10267 #if MDBX_ENABLE_PGOP_STAT
10268   if (timestamp)
10269     env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp;
10270 #endif /* MDBX_ENABLE_PGOP_STAT */
10271   eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist,
10272                                    txn->mt_next_pgno - MDBX_ENABLE_REFUND));
10273   int level;
10274   const char *what;
10275   if (likely(!(flags & MDBX_ALLOC_FAKE))) {
10276     txn->mt_flags |= MDBX_TXN_ERROR;
10277     level = MDBX_LOG_ERROR;
10278     what = "pages";
10279   } else {
10280     level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
10281     what = (flags & MDBX_ALLOC_SLOT) ?
"gc-slot/backlog" : "backlog-pages"; 10282 } 10283 if (LOG_ENABLED(level)) 10284 debug_log(level, __func__, __LINE__, 10285 "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, 10286 flags, ret.err); 10287 10288 eASSERT(env, ret.err != MDBX_SUCCESS); 10289 ret.page = NULL; 10290 return ret; 10291 } 10292 10293 done: 10294 eASSERT(env, !(flags & MDBX_ALLOC_SLOT)); 10295 ENSURE(env, pgno >= NUM_METAS); 10296 #if MDBX_ENABLE_PGOP_STAT 10297 if (likely(timestamp)) 10298 env->me_lck->mti_pgop_stat.gcrtime.weak += osal_monotime() - timestamp; 10299 #endif /* MDBX_ENABLE_PGOP_STAT */ 10300 if (unlikely(flags & MDBX_ALLOC_FAKE)) { 10301 DEBUG("return NULL-page for %u pages %s allocation", num, 10302 "gc-slot/backlog"); 10303 ret.page = NULL; 10304 ret.err = MDBX_SUCCESS; 10305 return ret; 10306 } 10307 10308 if (env->me_flags & MDBX_WRITEMAP) { 10309 ret.page = pgno2page(env, pgno); 10310 VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); 10311 MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); 10312 } else { 10313 ret.page = page_malloc(txn, num); 10314 if (unlikely(!ret.page)) { 10315 ret.err = MDBX_ENOMEM; 10316 goto fail; 10317 } 10318 } 10319 10320 if (range) { 10321 cASSERT(mc, (mc->mc_flags & C_GCFREEZE) == 0); 10322 tASSERT(txn, pgno < txn->mt_next_pgno); 10323 tASSERT(txn, pgno == *range); 10324 /* Cutoff allocated pages from tw.reclaimed_pglist */ 10325 #if MDBX_PNL_ASCENDING 10326 for (const pgno_t *const end = re_list + re_len - num; range <= end; 10327 ++range) 10328 *range = range[num]; 10329 #else 10330 for (const pgno_t *const end = re_list + re_len; ++range <= end;) 10331 range[-(ptrdiff_t)num] = *range; 10332 #endif 10333 MDBX_PNL_SIZE(re_list) = re_len -= num; 10334 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 10335 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 10336 } else { 10337 txn->mt_next_pgno = pgno + num; 10338 eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); 10339 } 10340 10341 if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) 10342 memset(ret.page, -1, pgno2bytes(env, num)); 10343 VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); 10344 10345 ret.page->mp_pgno = pgno; 10346 ret.page->mp_leaf2_ksize = 0; 10347 ret.page->mp_flags = 0; 10348 if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { 10349 ret.page->mp_pages = num; 10350 ret.page->mp_flags = P_OVERFLOW; 10351 } 10352 ret.err = page_dirty(txn, ret.page, num); 10353 if (unlikely(ret.err != MDBX_SUCCESS)) 10354 goto fail; 10355 10356 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 10357 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 10358 return ret; 10359 } 10360 10361 __hot static pgr_t page_alloc(MDBX_cursor *mc) { 10362 MDBX_txn *const txn = mc->mc_txn; 10363 10364 /* If there are any loose pages, just use them */ 10365 while (likely(txn->tw.loose_pages)) { 10366 #if MDBX_ENABLE_REFUND 10367 if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { 10368 txn_refund(txn); 10369 if (!txn->tw.loose_pages) 10370 break; 10371 } 10372 #endif /* MDBX_ENABLE_REFUND */ 10373 10374 MDBX_page *page = txn->tw.loose_pages; 10375 txn->tw.loose_pages = page->mp_next; 10376 txn->tw.loose_count--; 10377 DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), page->mp_pgno); 10378 tASSERT(txn, page->mp_pgno < txn->mt_next_pgno); 10379 tASSERT(txn, page->mp_pgno >= NUM_METAS); 10380 VALGRIND_MAKE_MEM_UNDEFINED(page_data(page), page_space(txn->mt_env)); 10381 MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(page), page_space(txn->mt_env)); 10382 page->mp_txnid = 
txn->mt_front; 10383 pgr_t ret = {page, MDBX_SUCCESS}; 10384 return ret; 10385 } 10386 10387 if (likely(!(mc->mc_flags & C_GCFREEZE))) { 10388 MDBX_PNL pnl = txn->tw.reclaimed_pglist; 10389 const unsigned len = MDBX_PNL_SIZE(pnl); 10390 if (likely(len > 0)) { 10391 MDBX_PNL_SIZE(pnl) = len - 1; 10392 #if MDBX_PNL_ASCENDING 10393 const pgno_t pgno = pnl[1]; 10394 for (unsigned i = 1; i < len; ++i) 10395 pnl[i] = pnl[i + 1]; 10396 #else 10397 const pgno_t pgno = pnl[len]; 10398 #endif 10399 10400 MDBX_env *const env = txn->mt_env; 10401 pgr_t ret; 10402 if (env->me_flags & MDBX_WRITEMAP) { 10403 ret.page = pgno2page(env, pgno); 10404 MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, env->me_psize); 10405 } else { 10406 ret.page = page_malloc(txn, 1); 10407 if (unlikely(!ret.page)) { 10408 ret.err = MDBX_ENOMEM; 10409 return ret; 10410 } 10411 } 10412 10413 VALGRIND_MAKE_MEM_UNDEFINED(ret.page, env->me_psize); 10414 ret.page->mp_pgno = pgno; 10415 ret.page->mp_leaf2_ksize = 0; 10416 ret.page->mp_flags = 0; 10417 tASSERT(txn, ret.page->mp_pgno >= NUM_METAS); 10418 10419 ret.err = page_dirty(txn, ret.page, 1); 10420 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 10421 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 10422 return ret; 10423 } 10424 } 10425 10426 return page_alloc_slowpath(mc, 1, MDBX_ALLOC_ALL); 10427 } 10428 10429 /* Copy the used portions of a non-large/overflow page. */ 10430 __hot static void page_copy(MDBX_page *dst, const MDBX_page *src, 10431 size_t psize) { 10432 STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); 10433 STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); 10434 if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { 10435 size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; 10436 10437 /* If page isn't full, just copy the used portion. Adjust 10438 * alignment so memcpy may copy words instead of bytes. */ 10439 if (unused >= MDBX_CACHELINE_SIZE * 2) { 10440 lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); 10441 upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); 10442 memcpy(dst, src, lower); 10443 dst = (void *)((char *)dst + upper); 10444 src = (void *)((char *)src + upper); 10445 psize -= upper; 10446 } 10447 } 10448 memcpy(dst, src, psize); 10449 } 10450 10451 /* Pull a page off the txn's spill list, if present. 10452 * 10453 * If a page being referenced was spilled to disk in this txn, bring 10454 * it back and make it dirty/writable again. */ 10455 static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, 10456 const MDBX_page *const mp) { 10457 VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); 10458 tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); 10459 tASSERT(txn, IS_SPILLED(txn, mp)); 10460 const MDBX_txn *scan = txn; 10461 pgr_t ret; 10462 do { 10463 tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); 10464 const unsigned si = search_spilled(scan, mp->mp_pgno); 10465 if (!si) 10466 continue; 10467 const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; 10468 ret.page = page_malloc(txn, npages); 10469 if (unlikely(!ret.page)) { 10470 ret.err = MDBX_ENOMEM; 10471 return ret; 10472 } 10473 page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); 10474 if (scan == txn) { 10475 /* If in current txn, this page is no longer spilled. 10476 * If it happens to be the last page, truncate the spill list. 10477 * Otherwise mark it as deleted by setting the LSB. 
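 * (A hedged note on the encoding, added for clarity: the spill list keeps
 * page numbers shifted left by one bit, so e.g. pgno 0x1234 is stored as
 * 0x2468, and setting the low bit, giving 0x2469, tombstones the entry
 * without disturbing the list's sort order.)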
*/ 10478 spill_remove(txn, si, npages); 10479 } /* otherwise, if belonging to a parent txn, the 10480 * page remains spilled until child commits */ 10481 10482 ret.err = page_dirty(txn, ret.page, npages); 10483 if (unlikely(ret.err != MDBX_SUCCESS)) 10484 return ret; 10485 #if MDBX_ENABLE_PGOP_STAT 10486 txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += npages; 10487 #endif /* MDBX_ENABLE_PGOP_STAT */ 10488 ret.page->mp_flags |= (scan == txn) ? 0 : P_SPILLED; 10489 ret.err = MDBX_SUCCESS; 10490 return ret; 10491 } while (likely((scan = scan->mt_parent) != nullptr && 10492 (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); 10493 ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN 10494 " not found in the spill-list(s), current txn %" PRIaTXN 10495 " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, 10496 mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, 10497 txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); 10498 ret.err = MDBX_PROBLEM; 10499 ret.page = NULL; 10500 return ret; 10501 } 10502 10503 /* Touch a page: make it dirty and re-insert into tree with updated pgno. 10504 * Set MDBX_TXN_ERROR on failure. 10505 * 10506 * [in] mc cursor pointing to the page to be touched 10507 * 10508 * Returns 0 on success, non-zero on failure. */ 10509 __hot static int page_touch(MDBX_cursor *mc) { 10510 const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; 10511 MDBX_page *np; 10512 MDBX_txn *txn = mc->mc_txn; 10513 int rc; 10514 10515 if (ASSERT_ENABLED()) { 10516 if (mc->mc_flags & C_SUB) { 10517 MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); 10518 MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); 10519 tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); 10520 tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); 10521 tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); 10522 } else { 10523 tASSERT(txn, *mc->mc_dbistate & DBI_DIRTY); 10524 } 10525 tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); 10526 tASSERT(txn, !IS_OVERFLOW(mp)); 10527 tASSERT(txn, dirtylist_check(txn)); 10528 } 10529 10530 if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) 10531 return MDBX_SUCCESS; 10532 10533 if (IS_FROZEN(txn, mp)) { 10534 /* CoW the page */ 10535 rc = pnl_need(&txn->tw.retired_pages, 1); 10536 if (unlikely(rc != MDBX_SUCCESS)) 10537 goto fail; 10538 const pgr_t par = page_alloc(mc); 10539 rc = par.err; 10540 np = par.page; 10541 if (unlikely(rc != MDBX_SUCCESS)) 10542 goto fail; 10543 10544 const pgno_t pgno = np->mp_pgno; 10545 DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), 10546 mp->mp_pgno, pgno); 10547 tASSERT(txn, mp->mp_pgno != pgno); 10548 pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); 10549 /* Update the parent page, if any, to point to the new page */ 10550 if (mc->mc_top) { 10551 MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; 10552 MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]); 10553 node_set_pgno(node, pgno); 10554 } else { 10555 mc->mc_db->md_root = pgno; 10556 } 10557 10558 #if MDBX_ENABLE_PGOP_STAT 10559 txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1; 10560 #endif /* MDBX_ENABLE_PGOP_STAT */ 10561 page_copy(np, mp, txn->mt_env->me_psize); 10562 np->mp_pgno = pgno; 10563 np->mp_txnid = txn->mt_front; 10564 } else if (IS_SPILLED(txn, mp)) { 10565 pgr_t pur = page_unspill(txn, mp); 10566 np = pur.page; 10567 rc = pur.err; 10568 if (likely(rc == MDBX_SUCCESS)) { 10569 tASSERT(txn, np != nullptr); 10570 goto done; 10571 } 10572 goto fail; 10573 } else { 10574 if 
(unlikely(!txn->mt_parent)) { 10575 ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " 10576 "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," 10577 " without parent transaction, current txn %" PRIaTXN 10578 " front %" PRIaTXN, 10579 IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, 10580 mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); 10581 rc = MDBX_PROBLEM; 10582 goto fail; 10583 } 10584 10585 DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); 10586 tASSERT(txn, 10587 txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); 10588 /* No - copy it */ 10589 np = page_malloc(txn, 1); 10590 if (unlikely(!np)) { 10591 rc = MDBX_ENOMEM; 10592 goto fail; 10593 } 10594 page_copy(np, mp, txn->mt_env->me_psize); 10595 10596 /* insert a clone of parent's dirty page, so don't touch dirtyroom */ 10597 rc = page_dirty(txn, np, 1); 10598 if (unlikely(rc != MDBX_SUCCESS)) 10599 goto fail; 10600 10601 #if MDBX_ENABLE_PGOP_STAT 10602 txn->mt_env->me_lck->mti_pgop_stat.clone.weak += 1; 10603 #endif /* MDBX_ENABLE_PGOP_STAT */ 10604 } 10605 10606 done: 10607 /* Adjust cursors pointing to mp */ 10608 mc->mc_pg[mc->mc_top] = np; 10609 MDBX_cursor *m2 = txn->mt_cursors[mc->mc_dbi]; 10610 if (mc->mc_flags & C_SUB) { 10611 for (; m2; m2 = m2->mc_next) { 10612 MDBX_cursor *m3 = &m2->mc_xcursor->mx_cursor; 10613 if (m3->mc_snum < mc->mc_snum) 10614 continue; 10615 if (m3->mc_pg[mc->mc_top] == mp) 10616 m3->mc_pg[mc->mc_top] = np; 10617 } 10618 } else { 10619 for (; m2; m2 = m2->mc_next) { 10620 if (m2->mc_snum < mc->mc_snum) 10621 continue; 10622 if (m2 == mc) 10623 continue; 10624 if (m2->mc_pg[mc->mc_top] == mp) { 10625 m2->mc_pg[mc->mc_top] = np; 10626 if (XCURSOR_INITED(m2) && IS_LEAF(np)) 10627 XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); 10628 } 10629 } 10630 } 10631 return MDBX_SUCCESS; 10632 10633 fail: 10634 txn->mt_flags |= MDBX_TXN_ERROR; 10635 return rc; 10636 } 10637 10638 __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { 10639 bool locked = false; 10640 int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; 10641 10642 retry:; 10643 unsigned flags = env->me_flags & ~(MDBX_NOMETASYNC | MDBX_SHRINK_ALLOWED); 10644 if (unlikely((flags & (MDBX_RDONLY | MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != 10645 MDBX_ENV_ACTIVE)) { 10646 rc = MDBX_EACCESS; 10647 if (!(flags & MDBX_ENV_ACTIVE)) 10648 rc = MDBX_EPERM; 10649 if (flags & MDBX_FATAL_ERROR) 10650 rc = MDBX_PANIC; 10651 goto bailout; 10652 } 10653 10654 const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); 10655 meta_ptr_t head; 10656 if (inside_txn | locked) 10657 head = meta_recent(env, &env->me_txn0->tw.troika); 10658 else { 10659 const meta_troika_t troika = meta_tap(env); 10660 head = meta_recent(env, &troika); 10661 } 10662 const pgno_t unsynced_pages = 10663 atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); 10664 if (unsynced_pages == 0) { 10665 const uint32_t synched_meta_txnid_u32 = 10666 atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); 10667 if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) 10668 goto bailout; 10669 } 10670 10671 const pgno_t autosync_threshold = 10672 atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); 10673 const uint64_t autosync_period = 10674 atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); 10675 if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || 10676 (autosync_period && 10677 osal_monotime() - 10678 atomic_load64(&env->me_lck->mti_sync_timestamp, 
mo_Relaxed) >= 10679 autosync_period)) 10680 flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; 10681 10682 if (!inside_txn) { 10683 if (!locked) { 10684 #if MDBX_ENABLE_PGOP_STAT 10685 unsigned wops = 0; 10686 #endif /* MDBX_ENABLE_PGOP_STAT */ 10687 10688 int err; 10689 /* pre-sync to avoid latency for writer */ 10690 if (unsynced_pages > /* FIXME: define threshold */ 16 && 10691 (flags & MDBX_SAFE_NOSYNC) == 0) { 10692 eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); 10693 if (flags & MDBX_WRITEMAP) { 10694 /* Acquire guard to avoid collision with remap */ 10695 #if defined(_WIN32) || defined(_WIN64) 10696 osal_srwlock_AcquireShared(&env->me_remap_guard); 10697 #else 10698 err = osal_fastmutex_acquire(&env->me_remap_guard); 10699 if (unlikely(err != MDBX_SUCCESS)) 10700 return err; 10701 #endif 10702 const size_t usedbytes = 10703 pgno_align2os_bytes(env, head.ptr_c->mm_geo.next); 10704 err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); 10705 #if defined(_WIN32) || defined(_WIN64) 10706 osal_srwlock_ReleaseShared(&env->me_remap_guard); 10707 #else 10708 int unlock_err = osal_fastmutex_release(&env->me_remap_guard); 10709 if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) 10710 err = unlock_err; 10711 #endif 10712 } else 10713 err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); 10714 10715 if (unlikely(err != MDBX_SUCCESS)) 10716 return err; 10717 10718 #if MDBX_ENABLE_PGOP_STAT 10719 wops = 1; 10720 #endif /* MDBX_ENABLE_PGOP_STAT */ 10721 /* pre-sync done */ 10722 rc = MDBX_SUCCESS /* means "some data was synced" */; 10723 } 10724 10725 err = mdbx_txn_lock(env, nonblock); 10726 if (unlikely(err != MDBX_SUCCESS)) 10727 return err; 10728 10729 locked = true; 10730 #if MDBX_ENABLE_PGOP_STAT 10731 env->me_lck->mti_pgop_stat.wops.weak += wops; 10732 #endif /* MDBX_ENABLE_PGOP_STAT */ 10733 env->me_txn0->tw.troika = meta_tap(env); 10734 eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); 10735 goto retry; 10736 } 10737 eASSERT(env, head.txnid == recent_committed_txnid(env)); 10738 env->me_txn0->mt_txnid = head.txnid; 10739 txn_oldest_reader(env->me_txn0); 10740 flags |= MDBX_SHRINK_ALLOWED; 10741 } 10742 10743 eASSERT(env, inside_txn || locked); 10744 eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); 10745 10746 if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { 10747 DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, 10748 data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), 10749 unsynced_pages); 10750 MDBX_meta meta = *head.ptr_c; 10751 rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika); 10752 if (unlikely(rc != MDBX_SUCCESS)) 10753 goto bailout; 10754 } 10755 10756 /* LY: sync meta-pages if MDBX_NOMETASYNC enabled 10757 * and someone was not synced above. */ 10758 if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != 10759 (uint32_t)head.txnid) { 10760 #if MDBX_ENABLE_PGOP_STAT 10761 env->me_lck->mti_pgop_stat.wops.weak += 1; 10762 #endif /* MDBX_ENABLE_PGOP_STAT */ 10763 rc = (flags & MDBX_WRITEMAP) 10764 ? 
osal_msync(&env->me_dxb_mmap, 0,
10765                     pgno_align2os_bytes(env, NUM_METAS),
10766                     MDBX_SYNC_DATA | MDBX_SYNC_IODQ)
10767            : osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
10768     if (likely(rc == MDBX_SUCCESS))
10769       atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head.txnid,
10770                      mo_Relaxed);
10771   }
10772
10773 bailout:
10774   if (locked)
10775     mdbx_txn_unlock(env);
10776   return rc;
10777 }
10778
10779 static __inline int check_env(const MDBX_env *env, const bool wanna_active) {
10780   if (unlikely(!env))
10781     return MDBX_EINVAL;
10782
10783   if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE))
10784     return MDBX_EBADSIGN;
10785
10786 #if MDBX_ENV_CHECKPID
10787   if (unlikely(env->me_pid != osal_getpid())) {
10788     ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR;
10789     return MDBX_PANIC;
10790   }
10791 #endif /* MDBX_ENV_CHECKPID */
10792
10793   if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
10794     return MDBX_PANIC;
10795
10796   if (wanna_active) {
10797     if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0))
10798       return MDBX_EPERM;
10799     eASSERT(env, env->me_map != nullptr);
10800   }
10801
10802   return MDBX_SUCCESS;
10803 }
10804
10805 __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) {
10806   int rc = check_env(env, true);
10807   if (unlikely(rc != MDBX_SUCCESS))
10808     return rc;
10809
10810   return env_sync(env, force, nonblock);
10811 }
10812
10813 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
10814 __cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); }
10815
10816 __cold int mdbx_env_sync_poll(MDBX_env *env) {
10817   return __inline_mdbx_env_sync_poll(env);
10818 }
10819 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
10820
10821 /* Back up parent txn's cursors, then grab the originals for tracking */
10822 static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) {
10823   for (int i = parent->mt_numdbs; --i >= 0;) {
10824     nested->mt_cursors[i] = NULL;
10825     MDBX_cursor *mc = parent->mt_cursors[i];
10826     if (mc != NULL) {
10827       size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor)
10828                                    : sizeof(MDBX_cursor);
10829       for (MDBX_cursor *bk; mc; mc = bk->mc_next) {
10830         bk = mc;
10831         if (mc->mc_signature != MDBX_MC_LIVE)
10832           continue;
10833         bk = osal_malloc(size);
10834         if (unlikely(!bk))
10835           return MDBX_ENOMEM;
10836 #if MDBX_DEBUG
10837         memset(bk, 0xCD, size);
10838         VALGRIND_MAKE_MEM_UNDEFINED(bk, size);
10839 #endif /* MDBX_DEBUG */
10840         *bk = *mc;
10841         mc->mc_backup = bk;
10842         /* Kill pointers into the parent txn to reduce abuse: the user may
10843          * not use mc until the nested txn ends. But we need a valid txn
10844          * pointer here for cursor fixups to keep working. */
10845         mc->mc_txn = nested;
10846         mc->mc_db = &nested->mt_dbs[i];
10847         mc->mc_dbistate = &nested->mt_dbistate[i];
10848         MDBX_xcursor *mx = mc->mc_xcursor;
10849         if (mx != NULL) {
10850           *(MDBX_xcursor *)(bk + 1) = *mx;
10851           mx->mx_cursor.mc_txn = nested;
10852         }
10853         mc->mc_next = nested->mt_cursors[i];
10854         nested->mt_cursors[i] = mc;
10855       }
10856     }
10857   }
10858   return MDBX_SUCCESS;
10859 }
10860
10861 /* Close this txn's cursors, give parent txn's cursors back to parent.
10862  *
10863  * [in] txn    the transaction handle.
10864  * [in] merge  true to keep changes to parent cursors, false to revert.
10865  *
10866  * Returns nothing (the function is void); invariant violations are caught
 * by the internal ENSURE() checks.
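 *
 * A hedged usage sketch of the externally visible behavior (illustrative
 * only, and assuming a non-WRITEMAP environment, where nested transactions
 * are supported):
 *
 *   MDBX_txn *parent, *nested;
 *   mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &parent);
 *   MDBX_cursor *cur;
 *   mdbx_cursor_open(parent, dbi, &cur);
 *   mdbx_txn_begin(env, parent, MDBX_TXN_READWRITE, &nested); // cur shadowed
 *   ... changes made within the nested txn ...
 *   mdbx_txn_abort(nested);  // merge == false: cur restored from its backup
 *   // cur is valid again and belongs to `parent` as before
 *
 * Committing the nested txn instead takes the merge == true path, keeping
 * cursor positions adjusted to the merged state.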
*/ 10867 static void cursors_eot(MDBX_txn *txn, const bool merge) { 10868 for (int i = txn->mt_numdbs; --i >= 0;) { 10869 MDBX_cursor *next, *mc = txn->mt_cursors[i]; 10870 if (!mc) 10871 continue; 10872 txn->mt_cursors[i] = NULL; 10873 do { 10874 const unsigned stage = mc->mc_signature; 10875 MDBX_cursor *bk = mc->mc_backup; 10876 next = mc->mc_next; 10877 ENSURE(txn->mt_env, 10878 stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); 10879 cASSERT(mc, mc->mc_dbi == (unsigned)i); 10880 if (bk) { 10881 MDBX_xcursor *mx = mc->mc_xcursor; 10882 cASSERT(mc, mx == bk->mc_xcursor); 10883 tASSERT(txn, txn->mt_parent != NULL); 10884 ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); 10885 if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) 10886 mc->mc_signature = stage /* Promote closed state to parent txn */; 10887 else if (merge) { 10888 /* Restore pointers to parent txn */ 10889 mc->mc_next = bk->mc_next; 10890 mc->mc_backup = bk->mc_backup; 10891 mc->mc_txn = bk->mc_txn; 10892 mc->mc_db = bk->mc_db; 10893 mc->mc_dbistate = bk->mc_dbistate; 10894 if (mx) { 10895 if (mx != bk->mc_xcursor) { 10896 *bk->mc_xcursor = *mx; 10897 mx = bk->mc_xcursor; 10898 } 10899 mx->mx_cursor.mc_txn = bk->mc_txn; 10900 } 10901 } else { 10902 /* Restore from backup, i.e. rollback/abort nested txn */ 10903 *mc = *bk; 10904 if (mx) 10905 *mx = *(MDBX_xcursor *)(bk + 1); 10906 } 10907 bk->mc_signature = 0; 10908 osal_free(bk); 10909 } else { 10910 ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); 10911 mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; 10912 mc->mc_flags = 0 /* reset C_UNTRACK */; 10913 } 10914 } while ((mc = next) != NULL); 10915 } 10916 } 10917 10918 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 10919 /* Find largest mvcc-snapshot still referenced by this process. 
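 * In exclusive lck-less mode there is no lck mapping (lck == NULL) and
 * hence no reader table to scan. Otherwise each slot is sampled without
 * the rdt lock in a seqlock-like manner, roughly:
 *
 *   do {
 *     pages = load(slot->mr_snapshot_pages_used);
 *     txnid = safe64_read(&slot->mr_txnid);
 *   } while (pages or txnid appear changed on a second read);
 *
 * so concurrent updates of the reader table are tolerated and merely
 * cause a re-read.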
*/
10920 static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) {
10921 MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
10922 if (likely(lck != NULL /* NULL means exclusive lck-less mode */)) {
10923 const unsigned snap_nreaders =
10924 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
10925 for (unsigned i = 0; i < snap_nreaders; ++i) {
10926 retry:
10927 if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) ==
10928 env->me_pid) {
10929 /* jitter4testing(true); */
10930 const pgno_t snap_pages = atomic_load32(
10931 &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed);
10932 const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
10933 if (unlikely(
10934 snap_pages !=
10935 atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used,
10936 mo_AcquireRelease) ||
10937 snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
10938 goto retry;
10939 if (largest < snap_pages &&
10940 atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <=
10941 /* ignore pending updates */ snap_txnid &&
10942 snap_txnid <= MAX_TXNID)
10943 largest = snap_pages;
10944 }
10945 }
10946 }
10947 return largest;
10948 }
10949
10950 static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) {
10951 #if !defined(__SANITIZE_ADDRESS__)
10952 if (!RUNNING_ON_VALGRIND)
10953 return;
10954 #endif
10955
10956 if (txn) { /* transaction start */
10957 if (env->me_poison_edge < txn->mt_next_pgno)
10958 env->me_poison_edge = txn->mt_next_pgno;
10959 VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno));
10960 MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map,
10961 pgno2bytes(env, txn->mt_next_pgno));
10962 /* don't touch more, it should be already poisoned */
10963 } else { /* transaction end */
10964 bool should_unlock = false;
10965 pgno_t last = MAX_PAGENO + 1;
10966 if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) {
10967 /* inside write-txn */
10968 last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next;
10969 } else if (env->me_flags & MDBX_RDONLY) {
10970 /* read-only mode, no write-txn, no wlock mutex */
10971 last = NUM_METAS;
10972 } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) {
10973 /* no write-txn */
10974 last = NUM_METAS;
10975 should_unlock = true;
10976 } else {
10977 /* write txn is running, therefore shouldn't poison any memory range */
10978 return;
10979 }
10980
10981 last = find_largest_this(env, last);
10982 const pgno_t edge = env->me_poison_edge;
10983 if (edge > last) {
10984 eASSERT(env, last >= NUM_METAS);
10985 env->me_poison_edge = last;
10986 VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last),
10987 pgno2bytes(env, edge - last));
10988 MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last),
10989 pgno2bytes(env, edge - last));
10990 }
10991 if (should_unlock)
10992 mdbx_txn_unlock(env);
10993 }
10994 }
10995 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
10996
10997 typedef struct {
10998 int err;
10999 MDBX_reader *rslot;
11000 } bind_rslot_result;
11001
11002 static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) {
11003 eASSERT(env, env->me_lck_mmap.lck);
11004 eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC);
11005 eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT);
11006
11007 bind_rslot_result result = {osal_rdt_lock(env), nullptr};
11008 if (unlikely(MDBX_IS_ERROR(result.err)))
11009 return result;
11010 if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
11011 osal_rdt_unlock(env);
11012 result.err = MDBX_PANIC;
11013 return
result; 11014 } 11015 if (unlikely(!env->me_map)) { 11016 osal_rdt_unlock(env); 11017 result.err = MDBX_EPERM; 11018 return result; 11019 } 11020 11021 if (unlikely(env->me_live_reader != env->me_pid)) { 11022 result.err = osal_rpid_set(env); 11023 if (unlikely(result.err != MDBX_SUCCESS)) { 11024 osal_rdt_unlock(env); 11025 return result; 11026 } 11027 env->me_live_reader = env->me_pid; 11028 } 11029 11030 result.err = MDBX_SUCCESS; 11031 unsigned slot, nreaders; 11032 while (1) { 11033 nreaders = env->me_lck->mti_numreaders.weak; 11034 for (slot = 0; slot < nreaders; slot++) 11035 if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, 11036 mo_AcquireRelease)) 11037 break; 11038 11039 if (likely(slot < env->me_maxreaders)) 11040 break; 11041 11042 result.err = cleanup_dead_readers(env, true, NULL); 11043 if (result.err != MDBX_RESULT_TRUE) { 11044 osal_rdt_unlock(env); 11045 result.err = 11046 (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err; 11047 return result; 11048 } 11049 } 11050 11051 result.rslot = &env->me_lck->mti_readers[slot]; 11052 /* Claim the reader slot, carefully since other code 11053 * uses the reader table un-mutexed: First reset the 11054 * slot, next publish it in lck->mti_numreaders. After 11055 * that, it is safe for mdbx_env_close() to touch it. 11056 * When it will be closed, we can finally claim it. */ 11057 atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); 11058 safe64_reset(&result.rslot->mr_txnid, true); 11059 if (slot == nreaders) 11060 env->me_lck->mti_numreaders.weak = ++nreaders; 11061 result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; 11062 atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); 11063 osal_rdt_unlock(env); 11064 11065 if (likely(env->me_flags & MDBX_ENV_TXKEY)) { 11066 eASSERT(env, env->me_live_reader == env->me_pid); 11067 thread_rthc_set(env->me_txkey, result.rslot); 11068 } 11069 return result; 11070 } 11071 11072 __cold int mdbx_thread_register(const MDBX_env *env) { 11073 int rc = check_env(env, true); 11074 if (unlikely(rc != MDBX_SUCCESS)) 11075 return rc; 11076 11077 if (unlikely(!env->me_lck_mmap.lck)) 11078 return (env->me_flags & MDBX_EXCLUSIVE) ? 
MDBX_EINVAL : MDBX_EPERM; 11079 11080 if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { 11081 eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); 11082 return MDBX_EINVAL /* MDBX_NOTLS mode */; 11083 } 11084 11085 eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | 11086 MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); 11087 MDBX_reader *r = thread_rthc_get(env->me_txkey); 11088 if (unlikely(r != NULL)) { 11089 eASSERT(env, r->mr_pid.weak == env->me_pid); 11090 eASSERT(env, r->mr_tid.weak == osal_thread_self()); 11091 if (unlikely(r->mr_pid.weak != env->me_pid)) 11092 return MDBX_BAD_RSLOT; 11093 return MDBX_RESULT_TRUE /* already registered */; 11094 } 11095 11096 const uintptr_t tid = osal_thread_self(); 11097 if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) 11098 return MDBX_TXN_OVERLAPPING; 11099 return bind_rslot((MDBX_env *)env, tid).err; 11100 } 11101 11102 __cold int mdbx_thread_unregister(const MDBX_env *env) { 11103 int rc = check_env(env, true); 11104 if (unlikely(rc != MDBX_SUCCESS)) 11105 return rc; 11106 11107 if (unlikely(!env->me_lck_mmap.lck)) 11108 return MDBX_RESULT_TRUE; 11109 11110 if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { 11111 eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); 11112 return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; 11113 } 11114 11115 eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | 11116 MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); 11117 MDBX_reader *r = thread_rthc_get(env->me_txkey); 11118 if (unlikely(r == NULL)) 11119 return MDBX_RESULT_TRUE /* not registered */; 11120 11121 eASSERT(env, r->mr_pid.weak == env->me_pid); 11122 eASSERT(env, r->mr_tid.weak == osal_thread_self()); 11123 if (unlikely(r->mr_pid.weak != env->me_pid || 11124 r->mr_tid.weak != osal_thread_self())) 11125 return MDBX_BAD_RSLOT; 11126 11127 eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); 11128 if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) 11129 return MDBX_BUSY /* transaction is still active */; 11130 11131 atomic_store32(&r->mr_pid, 0, mo_Relaxed); 11132 atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, 11133 mo_AcquireRelease); 11134 thread_rthc_set(env->me_txkey, nullptr); 11135 return MDBX_SUCCESS; 11136 } 11137 11138 /* check against todo4recovery://erased_by_github/libmdbx/issues/269 */ 11139 static bool coherency_check(const MDBX_env *env, const txnid_t txnid, 11140 const volatile MDBX_db *dbs, 11141 const volatile MDBX_meta *meta, bool report) { 11142 const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid; 11143 const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid; 11144 11145 const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root; 11146 const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno != P_INVALID) 11147 ? pgno2page(env, freedb_root_pgno) 11148 : nullptr; 11149 11150 const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root; 11151 const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID) 11152 ? 
pgno2page(env, maindb_root_pgno) 11153 : nullptr; 11154 const uint64_t magic_and_version = 11155 unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); 11156 11157 bool ok = true; 11158 if (unlikely(txnid < freedb_mod_txnid || 11159 (!freedb_mod_txnid && freedb_root && 11160 likely(magic_and_version == MDBX_DATA_MAGIC)))) { 11161 if (report) 11162 WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN 11163 " for meta_txnid %" PRIaTXN " %s", 11164 "free", freedb_mod_txnid, txnid, 11165 "(workaround for incoherent flaw of unified page/buffer cache)"); 11166 ok = false; 11167 } 11168 if (unlikely(txnid < maindb_mod_txnid || 11169 (!maindb_mod_txnid && maindb_root && 11170 likely(magic_and_version == MDBX_DATA_MAGIC)))) { 11171 if (report) 11172 WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN 11173 " for meta_txnid %" PRIaTXN " %s", 11174 "main", maindb_mod_txnid, txnid, 11175 "(workaround for incoherent flaw of unified page/buffer cache)"); 11176 ok = false; 11177 } 11178 if (likely(freedb_root && freedb_mod_txnid)) { 11179 VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->mp_txnid)); 11180 MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, 11181 sizeof(freedb_root->mp_txnid)); 11182 const txnid_t root_txnid = freedb_root->mp_txnid; 11183 if (unlikely(root_txnid != freedb_mod_txnid)) { 11184 if (report) 11185 WARNING( 11186 "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN 11187 " for %sdb.mod_txnid %" PRIaTXN " %s", 11188 freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, 11189 "(workaround for incoherent flaw of unified page/buffer cache)"); 11190 ok = false; 11191 } 11192 } 11193 if (likely(maindb_root && maindb_mod_txnid)) { 11194 VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->mp_txnid)); 11195 MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, 11196 sizeof(maindb_root->mp_txnid)); 11197 const txnid_t root_txnid = maindb_root->mp_txnid; 11198 if (unlikely(root_txnid != maindb_mod_txnid)) { 11199 if (report) 11200 WARNING( 11201 "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN 11202 " for %sdb.mod_txnid %" PRIaTXN " %s", 11203 maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, 11204 "(workaround for incoherent flaw of unified page/buffer cache)"); 11205 ok = false; 11206 } 11207 } 11208 return ok; 11209 } 11210 11211 __cold static int coherency_timeout(uint64_t *timestamp) { 11212 if (likely(timestamp && *timestamp == 0)) 11213 *timestamp = osal_monotime(); 11214 else if (unlikely(!timestamp || osal_monotime() - *timestamp > 65536 / 10)) { 11215 ERROR("bailout waiting for valid snapshot (%s)", 11216 "workaround for incoherent flaw of unified page/buffer cache"); 11217 return MDBX_CORRUPTED; 11218 } 11219 11220 osal_memory_fence(mo_AcquireRelease, true); 11221 #if defined(_WIN32) || defined(_WIN64) 11222 SwitchToThread(); 11223 #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) 11224 sched_yield(); 11225 #elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) 11226 pthread_yield(); 11227 #else 11228 usleep(42); 11229 #endif 11230 return MDBX_RESULT_TRUE; 11231 } 11232 11233 /* check with timeout as the workaround 11234 * for todo4recovery://erased_by_github/libmdbx/issues/269 */ 11235 __hot static int coherency_check_readed(const MDBX_env *env, 11236 const txnid_t txnid, 11237 const volatile MDBX_db *dbs, 11238 const volatile MDBX_meta *meta, 11239 uint64_t *timestamp) { 11240 const bool report = !(timestamp && *timestamp); 11241 if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) 11242 
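/* The first failure merely arms *timestamp; each failure yields the
 * CPU and returns MDBX_RESULT_TRUE (i.e. retry) until the time budget
 * in coherency_timeout() is exhausted, then gives up with
 * MDBX_CORRUPTED. */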
return coherency_timeout(timestamp);
11243 return MDBX_SUCCESS;
11244 }
11245
11246 static int coherency_check_written(const MDBX_env *env, const txnid_t txnid,
11247 const volatile MDBX_meta *meta,
11248 uint64_t *timestamp) {
11249 const bool report = !(timestamp && *timestamp);
11250 const txnid_t head_txnid = meta_txnid(meta);
11251 if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) {
11252 if (report)
11253 WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s",
11254 (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid,
11255 bytes2pgno(env, (const uint8_t *)meta - env->me_dxb_mmap.dxb),
11256 "(workaround for incoherent flaw of unified page/buffer cache)");
11257 return coherency_timeout(timestamp);
11258 }
11259 return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp);
11260 }
11261
11262 static bool coherency_check_meta(const MDBX_env *env,
11263 const volatile MDBX_meta *meta, bool report) {
11264 uint64_t timestamp = 0;
11265 return coherency_check_written(env, 0, meta, report ? &timestamp : nullptr) ==
11266 MDBX_SUCCESS;
11267 }
11268
11269 /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */
11270 static int txn_renew(MDBX_txn *txn, const unsigned flags) {
11271 MDBX_env *env = txn->mt_env;
11272 int rc;
11273
11274 #if MDBX_ENV_CHECKPID
11275 if (unlikely(env->me_pid != osal_getpid())) {
11276 env->me_flags |= MDBX_FATAL_ERROR;
11277 return MDBX_PANIC;
11278 }
11279 #endif /* MDBX_ENV_CHECKPID */
11280
11281 STATIC_ASSERT(sizeof(MDBX_reader) == 32);
11282 #if MDBX_LOCKING > 0
11283 STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0);
11284 STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0);
11285 #else
11286 STATIC_ASSERT(
11287 offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0);
11288 STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
11289 0);
11290 #endif /* MDBX_LOCKING */
11291 STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
11292 0);
11293
11294 const uintptr_t tid = osal_thread_self();
11295 if (flags & MDBX_TXN_RDONLY) {
11296 eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0);
11297 txn->mt_flags =
11298 MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP));
11299 MDBX_reader *r = txn->to.reader;
11300 STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid));
11301 if (likely(env->me_flags & MDBX_ENV_TXKEY)) {
11302 eASSERT(env, !(env->me_flags & MDBX_NOTLS));
11303 r = thread_rthc_get(env->me_txkey);
11304 if (likely(r)) {
11305 if (unlikely(!r->mr_pid.weak) &&
11306 (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) {
11307 thread_rthc_set(env->me_txkey, nullptr);
11308 r = nullptr;
11309 } else {
11310 eASSERT(env, r->mr_pid.weak == env->me_pid);
11311 eASSERT(env, r->mr_tid.weak == osal_thread_self());
11312 }
11313 }
11314 } else {
11315 eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
11316 }
11317
11318 if (likely(r)) {
11319 if (unlikely(r->mr_pid.weak != env->me_pid ||
11320 r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD))
11321 return MDBX_BAD_RSLOT;
11322 } else if (env->me_lck_mmap.lck) {
11323 bind_rslot_result brs = bind_rslot(env, tid);
11324 if (unlikely(brs.err != MDBX_SUCCESS))
11325 return brs.err;
11326 r = brs.rslot;
11327 }
11328 txn->to.reader = r;
11329 if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
11330 eASSERT(env, txn->mt_txnid == 0);
11331 eASSERT(env, txn->mt_owner == 0);
11332 eASSERT(env, txn->mt_numdbs == 0);
11333 if (likely(r)) {
11334 eASSERT(env, r->mr_snapshot_pages_used.weak == 0);
11335 eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
11336 atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed);
11337 }
11338 txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
11339 return MDBX_SUCCESS;
11340 }
11341
11342 /* Seek & fetch the last meta */
11343 uint64_t timestamp = 0;
11344 unsigned loop = 0;
11345 meta_troika_t troika = meta_tap(env);
11346 while (1) {
11347 const meta_ptr_t head =
11348 likely(env->me_stuck_meta < 0)
11349 ? /* regular */ meta_recent(env, &troika)
11350 : /* recovery mode */ meta_ptr(env, env->me_stuck_meta);
11351 if (likely(r)) {
11352 safe64_reset(&r->mr_txnid, false);
11353 atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next,
11354 mo_Relaxed);
11355 atomic_store64(
11356 &r->mr_snapshot_pages_retired,
11357 unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired),
11358 mo_Relaxed);
11359 safe64_write(&r->mr_txnid, head.txnid);
11360 eASSERT(env, r->mr_pid.weak == osal_getpid());
11361 eASSERT(env,
11362 r->mr_tid.weak ==
11363 ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self()));
11364 eASSERT(env, r->mr_txnid.weak == head.txnid ||
11365 (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD &&
11366 head.txnid < env->me_lck->mti_oldest_reader.weak));
11367 atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
11368 mo_AcquireRelease);
11369 } else {
11370 /* exclusive mode without lck */
11371 eASSERT(env, !env->me_lck_mmap.lck &&
11372 env->me_lck == (void *)&env->x_lckless_stub);
11373 }
11374 jitter4testing(true);
11375
11376 /* Snap the state from current meta-head */
11377 txn->mt_txnid = head.txnid;
11378 txn->mt_geo = head.ptr_v->mm_geo;
11379 memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db));
11380 txn->mt_canary = head.ptr_v->mm_canary;
11381
11382 if (unlikely(env->me_stuck_meta >= 0))
11383 break;
11384 if (unlikely(meta_should_retry(env, &troika) ||
11385 head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader,
11386 mo_AcquireRelease))) {
11387 if (unlikely(++loop > 42)) {
11388 ERROR("bailout waiting for valid snapshot (%s)",
11389 "metapages are too volatile");
11390 rc = MDBX_PROBLEM;
11391 txn->mt_txnid = INVALID_TXNID;
11392 if (likely(r))
11393 safe64_reset(&r->mr_txnid, false);
11394 goto bailout;
11395 }
11396 timestamp = 0;
11397 continue;
11398 }
11399
11400 rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v,
11401 &timestamp);
11402 jitter4testing(false);
11403 if (likely(rc == MDBX_SUCCESS))
11404 break;
11405
11406 if (unlikely(rc != MDBX_RESULT_TRUE)) {
11407 txn->mt_txnid = INVALID_TXNID;
11408 if (likely(r))
11409 safe64_reset(&r->mr_txnid, false);
11410 goto bailout;
11411 }
11412 }
11413
11414 if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) {
11415 ERROR("%s", "environment corrupted by died writer, must shutdown!");
11416 if (likely(r))
11417 safe64_reset(&r->mr_txnid, false);
11418 txn->mt_txnid = INVALID_TXNID;
11419 rc = MDBX_CORRUPTED;
11420 goto bailout;
11421 }
11422 eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak);
11423 txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
11424 ENSURE(env, txn->mt_txnid >=
11425 /* paranoia is appropriate here */ env->me_lck
11426 ->mti_oldest_reader.weak);
11427 txn->mt_numdbs = env->me_numdbs;
11428 } else {
11429 eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS |
11430 MDBX_WRITEMAP)) == 0);
11431 if (unlikely(txn->mt_owner == tid ||
11432 /* not recovery mode */ env->me_stuck_meta >= 0))
11433 return MDBX_BUSY;
11434 MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
11435 if (lck && (env->me_flags & MDBX_NOTLS) == 0 &&
11436 (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
11437 const unsigned snap_nreaders =
11438 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
11439 for (unsigned i = 0; i < snap_nreaders; ++i) {
11440 if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) ==
11441 env->me_pid &&
11442 unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) ==
11443 tid)) {
11444 const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
11445 if (txnid >= MIN_TXNID && txnid <= MAX_TXNID)
11446 return MDBX_TXN_OVERLAPPING;
11447 }
11448 }
11449 }
11450
11451 /* Not yet touching txn == env->me_txn0, it may be active */
11452 jitter4testing(false);
11453 rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY));
11454 if (unlikely(rc))
11455 return rc;
11456 if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
11457 mdbx_txn_unlock(env);
11458 return MDBX_PANIC;
11459 }
11460 #if defined(_WIN32) || defined(_WIN64)
11461 if (unlikely(!env->me_map)) {
11462 mdbx_txn_unlock(env);
11463 return MDBX_EPERM;
11464 }
11465 #endif /* Windows */
11466
11467 txn->tw.troika = meta_tap(env);
11468 const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
11469 uint64_t timestamp = 0;
11470 while (
11471 "workaround for todo4recovery://erased_by_github/libmdbx/issues/269") {
11472 rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs,
11473 head.ptr_v, &timestamp);
11474 if (likely(rc == MDBX_SUCCESS))
11475 break;
11476 if (unlikely(rc != MDBX_RESULT_TRUE))
11477 goto bailout;
11478 }
11479 txn->mt_canary = head.ptr_c->mm_canary;
11480 eASSERT(env, meta_txnid(head.ptr_v) == head.txnid);
11481 txn->mt_txnid = safe64_txnid_next(head.txnid);
11482 if (unlikely(txn->mt_txnid > MAX_TXNID)) {
11483 rc = MDBX_TXN_FULL;
11484 ERROR("txnid overflow, raise %d", rc);
11485 goto bailout;
11486 }
11487
11488 txn->mt_flags = flags;
11489 txn->mt_child = NULL;
11490 txn->tw.loose_pages = NULL;
11491 txn->tw.loose_count = 0;
11492 #if MDBX_ENABLE_REFUND
11493 txn->tw.loose_refund_wl = 0;
11494 #endif /* MDBX_ENABLE_REFUND */
11495 MDBX_PNL_SIZE(txn->tw.retired_pages) = 0;
11496 txn->tw.spill_pages = NULL;
11497 txn->tw.spill_least_removed = 0;
11498 txn->tw.last_reclaimed = 0;
11499 if (txn->tw.lifo_reclaimed)
11500 MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0;
11501 env->me_txn = txn;
11502 txn->mt_numdbs = env->me_numdbs;
11503 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned));
11504 /* Copy the DB info and flags */
11505 memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db));
11506 /* Moved to here to avoid a data race in read TXNs */
11507 txn->mt_geo = head.ptr_c->mm_geo;
11508
11509 rc = dpl_alloc(txn);
11510 if (unlikely(rc != MDBX_SUCCESS))
11511 goto bailout;
11512 txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit;
11513 txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0;
11514 }
11515
11516 /* Setup db info */
11517 osal_compiler_barrier();
11518 memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs);
11519 for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) {
11520 const unsigned db_flags = env->me_dbflags[i];
11521 txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS;
11522 txn->mt_dbistate[i] =
11523 (db_flags & DB_VALID) ?
DBI_VALID | DBI_USRVALID | DBI_STALE : 0; 11524 } 11525 txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; 11526 txn->mt_dbistate[FREE_DBI] = DBI_VALID; 11527 txn->mt_front = 11528 txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); 11529 11530 if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { 11531 WARNING("%s", "environment had fatal error, must shutdown!"); 11532 rc = MDBX_PANIC; 11533 } else { 11534 const size_t size = 11535 pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno 11536 : txn->mt_end_pgno); 11537 if (unlikely(size > env->me_dxb_mmap.limit)) { 11538 if (txn->mt_geo.upper > MAX_PAGENO + 1 || 11539 bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != 11540 txn->mt_geo.upper) { 11541 rc = MDBX_UNABLE_EXTEND_MAPSIZE; 11542 goto bailout; 11543 } 11544 rc = map_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, 11545 txn->mt_geo.upper, 11546 (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); 11547 if (rc != MDBX_SUCCESS) 11548 goto bailout; 11549 } else { 11550 env->me_dxb_mmap.current = size; 11551 env->me_dxb_mmap.filesize = 11552 (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; 11553 } 11554 if (txn->mt_flags & MDBX_TXN_RDONLY) { 11555 #if defined(_WIN32) || defined(_WIN64) 11556 if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || 11557 (mdbx_RunningUnderWine() && 11558 /* under Wine acquisition of remap_guard is always required, 11559 * since Wine don't support section extending, 11560 * i.e. in both cases unmap+map are required. */ 11561 size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && 11562 /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { 11563 txn->mt_flags |= MDBX_SHRINK_ALLOWED; 11564 osal_srwlock_AcquireShared(&env->me_remap_guard); 11565 } 11566 #endif /* Windows */ 11567 } 11568 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 11569 txn_valgrind(env, txn); 11570 #endif 11571 txn->mt_owner = tid; 11572 return MDBX_SUCCESS; 11573 } 11574 bailout: 11575 tASSERT(txn, rc != MDBX_SUCCESS); 11576 txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); 11577 return rc; 11578 } 11579 11580 static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { 11581 if (unlikely(!txn)) 11582 return MDBX_EINVAL; 11583 11584 if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) 11585 return MDBX_EBADSIGN; 11586 11587 if (unlikely(txn->mt_flags & bad_bits)) 11588 return MDBX_BAD_TXN; 11589 11590 tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == 11591 ((txn->mt_flags & MDBX_TXN_RDONLY) 11592 ? txn->mt_env->me_flags & MDBX_NOTLS 11593 : 0)); 11594 #if MDBX_TXN_CHECKOWNER 11595 STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); 11596 if (unlikely(txn->mt_owner != osal_thread_self()) && 11597 (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) < 11598 (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) 11599 return txn->mt_owner ? 
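/* a non-zero owner means the txn is live on another thread */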
MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; 11600 #endif /* MDBX_TXN_CHECKOWNER */ 11601 11602 if (bad_bits && unlikely(!txn->mt_env->me_map)) 11603 return MDBX_EPERM; 11604 11605 return MDBX_SUCCESS; 11606 } 11607 11608 static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { 11609 int err = check_txn(txn, bad_bits); 11610 if (unlikely(err)) 11611 return err; 11612 11613 if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) 11614 return MDBX_EACCESS; 11615 11616 return MDBX_SUCCESS; 11617 } 11618 11619 int mdbx_txn_renew(MDBX_txn *txn) { 11620 if (unlikely(!txn)) 11621 return MDBX_EINVAL; 11622 11623 if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) 11624 return MDBX_EBADSIGN; 11625 11626 if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) 11627 return MDBX_EINVAL; 11628 11629 int rc; 11630 if (unlikely(txn->mt_owner != 0 || !(txn->mt_flags & MDBX_TXN_FINISHED))) { 11631 rc = mdbx_txn_reset(txn); 11632 if (unlikely(rc != MDBX_SUCCESS)) 11633 return rc; 11634 } 11635 11636 rc = txn_renew(txn, MDBX_TXN_RDONLY); 11637 if (rc == MDBX_SUCCESS) { 11638 txn->mt_owner = osal_thread_self(); 11639 DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO 11640 "/%" PRIaPGNO, 11641 txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', 11642 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, 11643 txn->mt_dbs[FREE_DBI].md_root); 11644 } 11645 return rc; 11646 } 11647 11648 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 11649 int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, 11650 MDBX_txn **ret) { 11651 return __inline_mdbx_txn_begin(env, parent, flags, ret); 11652 } 11653 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 11654 11655 int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { 11656 int rc = check_txn(txn, MDBX_TXN_FINISHED); 11657 if (unlikely(rc != MDBX_SUCCESS)) 11658 return rc; 11659 11660 txn->mt_userctx = ctx; 11661 return MDBX_SUCCESS; 11662 } 11663 11664 void *mdbx_txn_get_userctx(const MDBX_txn *txn) { 11665 return check_txn(txn, MDBX_TXN_FINISHED) ? 
nullptr : txn->mt_userctx; 11666 } 11667 11668 int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, 11669 MDBX_txn **ret, void *context) { 11670 MDBX_txn *txn; 11671 unsigned size, tsize; 11672 11673 if (unlikely(!ret)) 11674 return MDBX_EINVAL; 11675 *ret = NULL; 11676 11677 if (unlikely((flags & ~MDBX_TXN_RW_BEGIN_FLAGS) && 11678 (flags & ~MDBX_TXN_RO_BEGIN_FLAGS))) 11679 return MDBX_EINVAL; 11680 11681 int rc = check_env(env, true); 11682 if (unlikely(rc != MDBX_SUCCESS)) 11683 return rc; 11684 11685 if (unlikely(env->me_flags & MDBX_RDONLY & 11686 ~flags)) /* write txn in RDONLY env */ 11687 return MDBX_EACCESS; 11688 11689 flags |= env->me_flags & MDBX_WRITEMAP; 11690 11691 if (parent) { 11692 /* Nested transactions: Max 1 child, write txns only, no writemap */ 11693 rc = check_txn_rw(parent, 11694 MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); 11695 if (unlikely(rc != MDBX_SUCCESS)) 11696 return rc; 11697 11698 if (env->me_options.spill_parent4child_denominator) { 11699 /* Spill dirty-pages of parent to provide dirtyroom for child txn */ 11700 rc = txn_spill(parent, nullptr, 11701 parent->tw.dirtylist->length / 11702 env->me_options.spill_parent4child_denominator); 11703 if (unlikely(rc != MDBX_SUCCESS)) 11704 return rc; 11705 } 11706 tASSERT(parent, audit_ex(parent, 0, false) == 0); 11707 11708 flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); 11709 } else if (flags & MDBX_TXN_RDONLY) { 11710 if (env->me_txn0 && 11711 unlikely(env->me_txn0->mt_owner == osal_thread_self()) && 11712 (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) 11713 return MDBX_TXN_OVERLAPPING; 11714 } else { 11715 /* Reuse preallocated write txn. However, do not touch it until 11716 * txn_renew() succeeds, since it currently may be active. */ 11717 txn = env->me_txn0; 11718 goto renew; 11719 } 11720 11721 size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); 11722 size += tsize = sizeof(MDBX_txn); 11723 if (unlikely((txn = osal_malloc(size)) == NULL)) { 11724 DEBUG("calloc: %s", "failed"); 11725 return MDBX_ENOMEM; 11726 } 11727 #if MDBX_DEBUG 11728 memset(txn, 0xCD, size); 11729 VALGRIND_MAKE_MEM_UNDEFINED(txn, size); 11730 #endif /* MDBX_DEBUG */ 11731 memset(txn, 0, tsize); 11732 txn->mt_dbxs = env->me_dbxs; /* static */ 11733 txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); 11734 txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); 11735 txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs; 11736 txn->mt_flags = flags; 11737 txn->mt_env = env; 11738 11739 if (parent) { 11740 tASSERT(parent, dirtylist_check(parent)); 11741 txn->mt_dbiseqs = parent->mt_dbiseqs; 11742 txn->mt_geo = parent->mt_geo; 11743 rc = dpl_alloc(txn); 11744 if (likely(rc == MDBX_SUCCESS)) { 11745 const unsigned len = 11746 MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; 11747 txn->tw.reclaimed_pglist = 11748 pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); 11749 if (unlikely(!txn->tw.reclaimed_pglist)) 11750 rc = MDBX_ENOMEM; 11751 } 11752 if (unlikely(rc != MDBX_SUCCESS)) { 11753 nested_failed: 11754 pnl_free(txn->tw.reclaimed_pglist); 11755 dpl_free(txn); 11756 osal_free(txn); 11757 return rc; 11758 } 11759 11760 /* Move loose pages to reclaimed list */ 11761 if (parent->tw.loose_count) { 11762 do { 11763 MDBX_page *lp = parent->tw.loose_pages; 11764 const unsigned di = dpl_exist(parent, lp->mp_pgno); 11765 tASSERT(parent, di && parent->tw.dirtylist->items[di].ptr == lp); 11766 tASSERT(parent, lp->mp_flags == P_LOOSE); 11767 rc = pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); 11768 if (unlikely(rc != MDBX_SUCCESS)) 11769 goto nested_failed; 11770 parent->tw.loose_pages = lp->mp_next; 11771 /* Remove from dirty list */ 11772 page_wash(parent, di, lp, 1); 11773 } while (parent->tw.loose_pages); 11774 parent->tw.loose_count = 0; 11775 #if MDBX_ENABLE_REFUND 11776 parent->tw.loose_refund_wl = 0; 11777 #endif /* MDBX_ENABLE_REFUND */ 11778 tASSERT(parent, dirtylist_check(parent)); 11779 } 11780 txn->tw.dirtyroom = parent->tw.dirtyroom; 11781 txn->tw.dirtylru = parent->tw.dirtylru; 11782 11783 dpl_sort(parent); 11784 if (parent->tw.spill_pages) 11785 spill_purge(parent); 11786 11787 tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= 11788 MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); 11789 memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, 11790 MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); 11791 eASSERT(env, pnl_check_allocated( 11792 txn->tw.reclaimed_pglist, 11793 (txn->mt_next_pgno /* LY: intentional assignment here, 11794 only for assertion */ 11795 = parent->mt_next_pgno) - 11796 MDBX_ENABLE_REFUND)); 11797 11798 txn->tw.last_reclaimed = parent->tw.last_reclaimed; 11799 if (parent->tw.lifo_reclaimed) { 11800 txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; 11801 parent->tw.lifo_reclaimed = 11802 (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed); 11803 } 11804 11805 txn->tw.retired_pages = parent->tw.retired_pages; 11806 parent->tw.retired_pages = 11807 (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); 11808 11809 txn->mt_txnid = parent->mt_txnid; 11810 txn->mt_front = parent->mt_front + 1; 11811 #if MDBX_ENABLE_REFUND 11812 txn->tw.loose_refund_wl = 0; 11813 #endif /* MDBX_ENABLE_REFUND */ 11814 txn->mt_canary = parent->mt_canary; 11815 parent->mt_flags |= MDBX_TXN_HAS_CHILD; 11816 parent->mt_child = txn; 11817 txn->mt_parent = parent; 11818 txn->mt_numdbs = parent->mt_numdbs; 11819 txn->mt_owner = parent->mt_owner; 11820 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); 11821 txn->tw.troika = parent->tw.troika; 11822 /* Copy parent's mt_dbistate, but clear DB_NEW */ 11823 for (unsigned i = 0; i < txn->mt_numdbs; i++) 11824 txn->mt_dbistate[i] = 11825 parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); 11826 tASSERT(parent, 11827 parent->tw.dirtyroom + parent->tw.dirtylist->length == 11828 (parent->mt_parent ? parent->mt_parent->tw.dirtyroom 11829 : parent->mt_env->me_options.dp_limit)); 11830 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 11831 (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom 11832 : txn->mt_env->me_options.dp_limit)); 11833 env->me_txn = txn; 11834 rc = cursor_shadow(parent, txn); 11835 if (AUDIT_ENABLED() && ASSERT_ENABLED()) { 11836 txn->mt_signature = MDBX_MT_SIGNATURE; 11837 tASSERT(txn, audit_ex(txn, 0, false) == 0); 11838 } 11839 if (unlikely(rc != MDBX_SUCCESS)) 11840 txn_end(txn, MDBX_END_FAIL_BEGINCHILD); 11841 } else { /* MDBX_TXN_RDONLY */ 11842 txn->mt_dbiseqs = env->me_dbiseqs; 11843 renew: 11844 rc = txn_renew(txn, flags); 11845 } 11846 11847 if (unlikely(rc != MDBX_SUCCESS)) { 11848 if (txn != env->me_txn0) 11849 osal_free(txn); 11850 } else { 11851 if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) 11852 eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); 11853 else if (flags & MDBX_TXN_RDONLY) 11854 eASSERT(env, (txn->mt_flags & 11855 ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | 11856 /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); 11857 else { 11858 eASSERT(env, (txn->mt_flags & 11859 ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | 11860 MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); 11861 assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); 11862 } 11863 txn->mt_signature = MDBX_MT_SIGNATURE; 11864 txn->mt_userctx = context; 11865 *ret = txn; 11866 DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO 11867 "/%" PRIaPGNO, 11868 txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, 11869 (void *)env, txn->mt_dbs[MAIN_DBI].md_root, 11870 txn->mt_dbs[FREE_DBI].md_root); 11871 } 11872 11873 return rc; 11874 } 11875 11876 int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { 11877 int rc = check_txn(txn, MDBX_TXN_FINISHED); 11878 if (unlikely(rc != MDBX_SUCCESS)) 11879 return rc; 11880 11881 if (unlikely(!info)) 11882 return MDBX_EINVAL; 11883 11884 MDBX_env *const env = txn->mt_env; 11885 #if MDBX_ENV_CHECKPID 11886 if (unlikely(env->me_pid != osal_getpid())) { 11887 env->me_flags |= MDBX_FATAL_ERROR; 11888 return MDBX_PANIC; 11889 } 11890 #endif /* MDBX_ENV_CHECKPID */ 11891 11892 info->txn_id = txn->mt_txnid; 11893 info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); 11894 11895 if (txn->mt_flags & MDBX_TXN_RDONLY) { 11896 meta_ptr_t head; 11897 uint64_t head_retired; 11898 meta_troika_t troika = meta_tap(env); 11899 do { 11900 /* fetch info from volatile head */ 11901 head = meta_recent(env, &troika); 11902 head_retired = 11903 unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); 11904 info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); 11905 info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); 11906 info->txn_space_leftover = 11907 pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); 11908 } while (unlikely(meta_should_retry(env, &troika))); 11909 11910 info->txn_reader_lag = head.txnid - info->txn_id; 11911 info->txn_space_dirty = info->txn_space_retired = 0; 11912 uint64_t reader_snapshot_pages_retired; 11913 if (txn->to.reader && 11914 head_retired > 11915 (reader_snapshot_pages_retired = atomic_load64( 11916 &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) { 11917 info->txn_space_dirty = info->txn_space_retired = pgno2bytes( 11918 env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); 11919 11920 size_t retired_next_reader = 0; 11921 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 11922 if (scan_rlt && info->txn_reader_lag > 1 && lck) { 11923 /* find next more recent reader */ 11924 txnid_t next_reader = head.txnid; 11925 const 
unsigned snap_nreaders = 11926 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 11927 for (unsigned i = 0; i < snap_nreaders; ++i) { 11928 retry: 11929 if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { 11930 jitter4testing(true); 11931 const txnid_t snap_txnid = 11932 safe64_read(&lck->mti_readers[i].mr_txnid); 11933 const uint64_t snap_retired = 11934 atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired, 11935 mo_AcquireRelease); 11936 if (unlikely(snap_retired != 11937 atomic_load64( 11938 &lck->mti_readers[i].mr_snapshot_pages_retired, 11939 mo_Relaxed)) || 11940 snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)) 11941 goto retry; 11942 if (snap_txnid <= txn->mt_txnid) { 11943 retired_next_reader = 0; 11944 break; 11945 } 11946 if (snap_txnid < next_reader) { 11947 next_reader = snap_txnid; 11948 retired_next_reader = pgno2bytes( 11949 env, (pgno_t)(snap_retired - 11950 atomic_load64( 11951 &txn->to.reader->mr_snapshot_pages_retired, 11952 mo_Relaxed))); 11953 } 11954 } 11955 } 11956 } 11957 info->txn_space_dirty = retired_next_reader; 11958 } 11959 } else { 11960 info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); 11961 info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); 11962 info->txn_space_retired = pgno2bytes( 11963 env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages 11964 : MDBX_PNL_SIZE(txn->tw.retired_pages)); 11965 info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); 11966 info->txn_space_dirty = 11967 pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); 11968 info->txn_reader_lag = INT64_MAX; 11969 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 11970 if (scan_rlt && lck) { 11971 txnid_t oldest_snapshot = txn->mt_txnid; 11972 const unsigned snap_nreaders = 11973 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 11974 if (snap_nreaders) { 11975 oldest_snapshot = txn_oldest_reader(txn); 11976 if (oldest_snapshot == txn->mt_txnid - 1) { 11977 /* check if there is at least one reader */ 11978 bool exists = false; 11979 for (unsigned i = 0; i < snap_nreaders; ++i) { 11980 if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && 11981 txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { 11982 exists = true; 11983 break; 11984 } 11985 } 11986 oldest_snapshot += !exists; 11987 } 11988 } 11989 info->txn_reader_lag = txn->mt_txnid - oldest_snapshot; 11990 } 11991 } 11992 11993 return MDBX_SUCCESS; 11994 } 11995 11996 MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { 11997 if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE || 11998 txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE)) 11999 return NULL; 12000 return txn->mt_env; 12001 } 12002 12003 uint64_t mdbx_txn_id(const MDBX_txn *txn) { 12004 if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) 12005 return 0; 12006 return txn->mt_txnid; 12007 } 12008 12009 int mdbx_txn_flags(const MDBX_txn *txn) { 12010 if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) { 12011 assert((-1 & (int)MDBX_TXN_INVALID) != 0); 12012 return -1; 12013 } 12014 assert(0 == (int)(txn->mt_flags & MDBX_TXN_INVALID)); 12015 return txn->mt_flags; 12016 } 12017 12018 /* Check for misused dbi handles */ 12019 static __inline bool dbi_changed(MDBX_txn *txn, MDBX_dbi dbi) { 12020 if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) 12021 return false; 12022 if (likely( 12023 txn->mt_dbiseqs[dbi].weak == 12024 atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], 12025 mo_AcquireRelease))) 12026 return false; 
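/* The sequence numbers diverge once the dbi slot was closed and/or
 * re-opened after this txn captured its mt_dbiseqs snapshot; such a
 * handle must be re-imported (see dbi_import() below) before use. */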
12027 return true; 12028 } 12029 12030 static __inline unsigned dbi_seq(const MDBX_env *const env, unsigned slot) { 12031 unsigned v = env->me_dbiseqs[slot].weak + 1; 12032 return v + (v == 0); 12033 } 12034 12035 static void dbi_import_locked(MDBX_txn *txn) { 12036 const MDBX_env *const env = txn->mt_env; 12037 unsigned n = env->me_numdbs; 12038 for (unsigned i = CORE_DBS; i < n; ++i) { 12039 if (i >= txn->mt_numdbs) { 12040 txn->mt_cursors[i] = NULL; 12041 if (txn->mt_dbiseqs != env->me_dbiseqs) 12042 txn->mt_dbiseqs[i].weak = 0; 12043 txn->mt_dbistate[i] = 0; 12044 } 12045 if ((dbi_changed(txn, i) && 12046 (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || 12047 ((env->me_dbflags[i] & DB_VALID) && 12048 !(txn->mt_dbistate[i] & DBI_VALID))) { 12049 tASSERT(txn, 12050 (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); 12051 txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; 12052 txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; 12053 txn->mt_dbistate[i] = 0; 12054 if (env->me_dbflags[i] & DB_VALID) { 12055 txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; 12056 tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); 12057 tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); 12058 } 12059 } 12060 } 12061 while (unlikely(n < txn->mt_numdbs)) 12062 if (txn->mt_cursors[txn->mt_numdbs - 1] == NULL && 12063 (txn->mt_dbistate[txn->mt_numdbs - 1] & DBI_USRVALID) == 0) 12064 txn->mt_numdbs -= 1; 12065 else { 12066 if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { 12067 if (txn->mt_dbiseqs != env->me_dbiseqs) 12068 txn->mt_dbiseqs[n].weak = 0; 12069 txn->mt_dbistate[n] = 0; 12070 } 12071 ++n; 12072 } 12073 txn->mt_numdbs = n; 12074 } 12075 12076 /* Import DBI which opened after txn started into context */ 12077 __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { 12078 if (dbi < CORE_DBS || 12079 (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) 12080 return false; 12081 12082 ENSURE(txn->mt_env, 12083 osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); 12084 dbi_import_locked(txn); 12085 ENSURE(txn->mt_env, 12086 osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); 12087 return txn->mt_dbistate[dbi] & DBI_USRVALID; 12088 } 12089 12090 /* Export or close DBI handles opened in this txn. 
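 *
 * On commit (keep != 0) handles created by this txn are published by
 * setting DB_VALID in env->me_dbflags, making them visible to other
 * transactions; on abort the name slots are freed and the per-slot
 * sequence is bumped via dbi_seq(), which invalidates any stale
 * MDBX_dbi values still held elsewhere.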
*/ 12091 static void dbi_update(MDBX_txn *txn, int keep) { 12092 tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); 12093 MDBX_dbi n = txn->mt_numdbs; 12094 if (n) { 12095 bool locked = false; 12096 MDBX_env *const env = txn->mt_env; 12097 12098 for (unsigned i = n; --i >= CORE_DBS;) { 12099 if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) 12100 continue; 12101 if (!locked) { 12102 ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); 12103 locked = true; 12104 } 12105 if (env->me_numdbs <= i || 12106 txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) 12107 continue /* dbi explicitly closed and/or then re-opened by other txn */; 12108 if (keep) { 12109 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; 12110 } else { 12111 char *ptr = env->me_dbxs[i].md_name.iov_base; 12112 if (ptr) { 12113 env->me_dbxs[i].md_name.iov_len = 0; 12114 eASSERT(env, env->me_dbflags[i] == 0); 12115 atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), 12116 mo_AcquireRelease); 12117 env->me_dbxs[i].md_name.iov_base = NULL; 12118 osal_free(ptr); 12119 } 12120 } 12121 } 12122 12123 n = env->me_numdbs; 12124 if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { 12125 if (!locked) { 12126 ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); 12127 locked = true; 12128 } 12129 12130 n = env->me_numdbs; 12131 while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID)) 12132 --n; 12133 env->me_numdbs = n; 12134 } 12135 12136 if (unlikely(locked)) 12137 ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); 12138 } 12139 } 12140 12141 /* Filter-out pgno list from transaction's dirty-page list */ 12142 static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { 12143 if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { 12144 tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); 12145 MDBX_dpl *dl = dpl_sort(txn); 12146 12147 /* Scanning in ascend order */ 12148 const int step = MDBX_PNL_ASCENDING ? 1 : -1; 12149 const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); 12150 const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; 12151 tASSERT(txn, pl[begin] <= pl[end - step]); 12152 12153 unsigned r = dpl_search(txn, pl[begin] >> spilled); 12154 tASSERT(txn, dl->sorted == dl->length); 12155 for (int i = begin; r <= dl->length;) { /* scan loop */ 12156 assert(i != end); 12157 tASSERT(txn, !spilled || (pl[i] & 1) == 0); 12158 pgno_t pl_pgno = pl[i] >> spilled; 12159 pgno_t dp_pgno = dl->items[r].pgno; 12160 if (likely(dp_pgno != pl_pgno)) { 12161 const bool cmp = dp_pgno < pl_pgno; 12162 r += cmp; 12163 i += cmp ? 
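/* when the dirty-list entry lags behind (cmp), advance r only and hold i */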
0 : step; 12164 if (likely(i != end)) 12165 continue; 12166 return; 12167 } 12168 12169 /* update loop */ 12170 unsigned npages, w = r; 12171 remove_dl: 12172 npages = dpl_npages(dl, r); 12173 dl->pages_including_loose -= npages; 12174 if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) 12175 dpage_free(txn->mt_env, dl->items[r].ptr, npages); 12176 ++r; 12177 next_i: 12178 i += step; 12179 if (unlikely(i == end)) { 12180 while (r <= dl->length) 12181 dl->items[w++] = dl->items[r++]; 12182 } else { 12183 while (r <= dl->length) { 12184 assert(i != end); 12185 tASSERT(txn, !spilled || (pl[i] & 1) == 0); 12186 pl_pgno = pl[i] >> spilled; 12187 dp_pgno = dl->items[r].pgno; 12188 if (dp_pgno < pl_pgno) 12189 dl->items[w++] = dl->items[r++]; 12190 else if (dp_pgno > pl_pgno) 12191 goto next_i; 12192 else 12193 goto remove_dl; 12194 } 12195 } 12196 dl->sorted = dpl_setlen(dl, w - 1); 12197 txn->tw.dirtyroom += r - w; 12198 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 12199 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom 12200 : txn->mt_env->me_options.dp_limit)); 12201 return; 12202 } 12203 } 12204 } 12205 12206 /* End a transaction, except successful commit of a nested transaction. 12207 * May be called twice for readonly txns: First reset it, then abort. 12208 * [in] txn the transaction handle to end 12209 * [in] mode why and how to end the transaction */ 12210 static int txn_end(MDBX_txn *txn, const unsigned mode) { 12211 MDBX_env *env = txn->mt_env; 12212 static const char *const names[] = MDBX_END_NAMES; 12213 12214 #if MDBX_ENV_CHECKPID 12215 if (unlikely(txn->mt_env->me_pid != osal_getpid())) { 12216 env->me_flags |= MDBX_FATAL_ERROR; 12217 return MDBX_PANIC; 12218 } 12219 #endif /* MDBX_ENV_CHECKPID */ 12220 12221 DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO 12222 "/%" PRIaPGNO, 12223 names[mode & MDBX_END_OPMASK], txn->mt_txnid, 12224 (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, 12225 txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); 12226 12227 ENSURE(env, txn->mt_txnid >= 12228 /* paranoia is appropriate here */ env->me_lck 12229 ->mti_oldest_reader.weak); 12230 12231 if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ 12232 cursors_eot(txn, false); 12233 12234 int rc = MDBX_SUCCESS; 12235 if (txn->mt_flags & MDBX_TXN_RDONLY) { 12236 if (txn->to.reader) { 12237 MDBX_reader *slot = txn->to.reader; 12238 eASSERT(env, slot->mr_pid.weak == env->me_pid); 12239 if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { 12240 eASSERT(env, 12241 txn->mt_txnid == slot->mr_txnid.weak && 12242 slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); 12243 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 12244 txn_valgrind(env, nullptr); 12245 #endif 12246 atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); 12247 safe64_reset(&slot->mr_txnid, false); 12248 atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, 12249 mo_Relaxed); 12250 } else { 12251 eASSERT(env, slot->mr_pid.weak == env->me_pid); 12252 eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); 12253 } 12254 if (mode & MDBX_END_SLOT) { 12255 if ((env->me_flags & MDBX_ENV_TXKEY) == 0) 12256 atomic_store32(&slot->mr_pid, 0, mo_Relaxed); 12257 txn->to.reader = NULL; 12258 } 12259 } 12260 #if defined(_WIN32) || defined(_WIN64) 12261 if (txn->mt_flags & MDBX_SHRINK_ALLOWED) 12262 osal_srwlock_ReleaseShared(&env->me_remap_guard); 12263 #endif 12264 txn->mt_numdbs = 0; /* prevent further DBI activity */ 12265 txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; 12266 txn->mt_owner = 0; 12267 } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { 12268 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 12269 if (txn == env->me_txn0) 12270 txn_valgrind(env, nullptr); 12271 #endif 12272 12273 txn->mt_flags = MDBX_TXN_FINISHED; 12274 txn->mt_owner = 0; 12275 env->me_txn = txn->mt_parent; 12276 pnl_free(txn->tw.spill_pages); 12277 txn->tw.spill_pages = nullptr; 12278 if (txn == env->me_txn0) { 12279 eASSERT(env, txn->mt_parent == NULL); 12280 /* Export or close DBI handles created in this txn */ 12281 dbi_update(txn, mode & MDBX_END_UPDATE); 12282 pnl_shrink(&txn->tw.retired_pages); 12283 pnl_shrink(&txn->tw.reclaimed_pglist); 12284 if (!(env->me_flags & MDBX_WRITEMAP)) 12285 dlist_free(txn); 12286 /* The writer mutex was locked in mdbx_txn_begin. 
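 * Releasing it here keeps lock/unlock strictly paired with the
 * lifetime of the top-level write transaction (env->me_txn0).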
*/ 12287 mdbx_txn_unlock(env); 12288 } else { 12289 eASSERT(env, txn->mt_parent != NULL); 12290 MDBX_txn *const parent = txn->mt_parent; 12291 eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); 12292 eASSERT(env, parent->mt_child == txn && 12293 (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); 12294 eASSERT(env, pnl_check_allocated(txn->tw.reclaimed_pglist, 12295 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 12296 eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, 12297 sizeof(meta_troika_t)) == 0); 12298 12299 if (txn->tw.lifo_reclaimed) { 12300 eASSERT(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= 12301 (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); 12302 MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 12303 (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; 12304 parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; 12305 } 12306 12307 if (txn->tw.retired_pages) { 12308 eASSERT(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= 12309 (unsigned)(uintptr_t)parent->tw.retired_pages); 12310 MDBX_PNL_SIZE(txn->tw.retired_pages) = 12311 (unsigned)(uintptr_t)parent->tw.retired_pages; 12312 parent->tw.retired_pages = txn->tw.retired_pages; 12313 } 12314 12315 parent->mt_child = nullptr; 12316 parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; 12317 parent->tw.dirtylru = txn->tw.dirtylru; 12318 tASSERT(parent, dirtylist_check(parent)); 12319 tASSERT(parent, audit_ex(parent, 0, false) == 0); 12320 if (!(env->me_flags & MDBX_WRITEMAP)) 12321 dlist_free(txn); 12322 dpl_free(txn); 12323 pnl_free(txn->tw.reclaimed_pglist); 12324 12325 if (parent->mt_geo.upper != txn->mt_geo.upper || 12326 parent->mt_geo.now != txn->mt_geo.now) { 12327 /* undo resize performed by child txn */ 12328 rc = map_resize_implicit(env, parent->mt_next_pgno, parent->mt_geo.now, 12329 parent->mt_geo.upper); 12330 if (rc == MDBX_EPERM) { 12331 /* unable undo resize (it is regular for Windows), 12332 * therefore promote size changes from child to the parent txn */ 12333 WARNING("unable undo resize performed by child txn, promote to " 12334 "the parent (%u->%u, %u->%u)", 12335 txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, 12336 parent->mt_geo.upper); 12337 parent->mt_geo.now = txn->mt_geo.now; 12338 parent->mt_geo.upper = txn->mt_geo.upper; 12339 parent->mt_flags |= MDBX_TXN_DIRTY; 12340 rc = MDBX_SUCCESS; 12341 } else if (unlikely(rc != MDBX_SUCCESS)) { 12342 ERROR("error %d while undo resize performed by child txn, fail " 12343 "the parent", 12344 rc); 12345 parent->mt_flags |= MDBX_TXN_ERROR; 12346 if (!env->me_dxb_mmap.address) 12347 env->me_flags |= MDBX_FATAL_ERROR; 12348 } 12349 } 12350 } 12351 } 12352 12353 eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); 12354 if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { 12355 txn->mt_signature = 0; 12356 osal_free(txn); 12357 } 12358 12359 return rc; 12360 } 12361 12362 int mdbx_txn_reset(MDBX_txn *txn) { 12363 int rc = check_txn(txn, 0); 12364 if (unlikely(rc != MDBX_SUCCESS)) 12365 return rc; 12366 12367 /* This call is only valid for read-only txns */ 12368 if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) 12369 return MDBX_EINVAL; 12370 12371 /* LY: don't close DBI-handles */ 12372 rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); 12373 if (rc == MDBX_SUCCESS) { 12374 tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); 12375 tASSERT(txn, txn->mt_owner == 0); 12376 } 12377 return rc; 12378 } 12379 12380 int mdbx_txn_break(MDBX_txn *txn) { 12381 do { 12382 int rc = check_txn(txn, 0); 12383 if (unlikely(rc != MDBX_SUCCESS)) 12384 return rc; 12385 txn->mt_flags |= 
MDBX_TXN_ERROR; 12386 if (txn->mt_flags & MDBX_TXN_RDONLY) 12387 break; 12388 txn = txn->mt_child; 12389 } while (txn); 12390 return MDBX_SUCCESS; 12391 } 12392 12393 int mdbx_txn_abort(MDBX_txn *txn) { 12394 int rc = check_txn(txn, 0); 12395 if (unlikely(rc != MDBX_SUCCESS)) 12396 return rc; 12397 12398 if (txn->mt_flags & MDBX_TXN_RDONLY) 12399 /* LY: don't close DBI-handles */ 12400 return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | 12401 MDBX_END_FREE); 12402 12403 if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) 12404 return MDBX_BAD_TXN; 12405 12406 if (txn->mt_child) 12407 mdbx_txn_abort(txn->mt_child); 12408 12409 tASSERT(txn, dirtylist_check(txn)); 12410 return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); 12411 } 12412 12413 /* Count all the pages in each DB and in the GC and make sure 12414 * it matches the actual number of pages being used. */ 12415 __cold static int audit_ex(MDBX_txn *txn, unsigned retired_stored, 12416 bool dont_filter_gc) { 12417 pgno_t pending = 0; 12418 if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { 12419 pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + 12420 (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored); 12421 } 12422 12423 MDBX_cursor_couple cx; 12424 int rc = cursor_init(&cx.outer, txn, FREE_DBI); 12425 if (unlikely(rc != MDBX_SUCCESS)) 12426 return rc; 12427 12428 pgno_t gc = 0; 12429 MDBX_val key, data; 12430 while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { 12431 if (!dont_filter_gc) { 12432 if (unlikely(key.iov_len != sizeof(txnid_t))) 12433 return MDBX_CORRUPTED; 12434 txnid_t id = unaligned_peek_u64(4, key.iov_base); 12435 if (txn->tw.lifo_reclaimed) { 12436 for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i) 12437 if (id == txn->tw.lifo_reclaimed[i]) 12438 goto skip; 12439 } else if (id <= txn->tw.last_reclaimed) 12440 goto skip; 12441 } 12442 12443 gc += *(pgno_t *)data.iov_base; 12444 skip:; 12445 } 12446 tASSERT(txn, rc == MDBX_NOTFOUND); 12447 12448 for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) 12449 txn->mt_dbistate[i] &= ~DBI_AUDITED; 12450 12451 pgno_t used = NUM_METAS; 12452 for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { 12453 if (!(txn->mt_dbistate[i] & DBI_VALID)) 12454 continue; 12455 rc = cursor_init(&cx.outer, txn, i); 12456 if (unlikely(rc != MDBX_SUCCESS)) 12457 return rc; 12458 txn->mt_dbistate[i] |= DBI_AUDITED; 12459 if (txn->mt_dbs[i].md_root == P_INVALID) 12460 continue; 12461 used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + 12462 txn->mt_dbs[i].md_overflow_pages; 12463 12464 if (i != MAIN_DBI) 12465 continue; 12466 rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); 12467 while (rc == MDBX_SUCCESS) { 12468 MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; 12469 for (unsigned j = 0; j < page_numkeys(mp); j++) { 12470 MDBX_node *node = page_node(mp, j); 12471 if (node_flags(node) == F_SUBDATA) { 12472 if (unlikely(node_ds(node) != sizeof(MDBX_db))) 12473 return MDBX_CORRUPTED; 12474 MDBX_db db_copy, *db; 12475 memcpy(db = &db_copy, node_data(node), sizeof(db_copy)); 12476 if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { 12477 for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { 12478 if ((txn->mt_dbistate[k] & DBI_VALID) && 12479 /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ 12480 node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && 12481 memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, 12482 node_ks(node)) == 0) { 12483 txn->mt_dbistate[k] |= DBI_AUDITED; 12484 if 
(!(txn->mt_dbistate[k] & MDBX_DBI_STALE)) 12485 db = txn->mt_dbs + k; 12486 break; 12487 } 12488 } 12489 } 12490 used += 12491 db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; 12492 } 12493 } 12494 rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); 12495 } 12496 tASSERT(txn, rc == MDBX_NOTFOUND); 12497 } 12498 12499 for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { 12500 if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != 12501 DBI_VALID) 12502 continue; 12503 for (MDBX_txn *t = txn; t; t = t->mt_parent) 12504 if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { 12505 used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + 12506 t->mt_dbs[i].md_overflow_pages; 12507 txn->mt_dbistate[i] |= DBI_AUDITED; 12508 break; 12509 } 12510 if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { 12511 WARNING("audit %s@%" PRIaTXN 12512 ": unable account dbi %d / \"%*s\", state 0x%02x", 12513 txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, 12514 (int)txn->mt_dbxs[i].md_name.iov_len, 12515 (const char *)txn->mt_dbxs[i].md_name.iov_base, 12516 txn->mt_dbistate[i]); 12517 } 12518 } 12519 12520 if (pending + gc + used == txn->mt_next_pgno) 12521 return MDBX_SUCCESS; 12522 12523 if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) 12524 ERROR("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " 12525 "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", 12526 txn->mt_txnid, pending, txn->tw.loose_count, 12527 MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), 12528 txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, 12529 retired_stored); 12530 ERROR("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO 12531 "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO 12532 "(allocated)", 12533 txn->mt_txnid, pending, gc, used, pending + gc + used, 12534 txn->mt_next_pgno); 12535 return MDBX_PROBLEM; 12536 } 12537 12538 typedef struct gc_update_context { 12539 unsigned retired_stored, loop; 12540 unsigned settled, cleaned_slot, reused_slot, filled_slot; 12541 txnid_t cleaned_id, rid; 12542 bool lifo, dense; 12543 #if MDBX_ENABLE_BIGFOOT 12544 txnid_t bigfoot; 12545 #endif /* MDBX_ENABLE_BIGFOOT */ 12546 MDBX_cursor_couple cursor; 12547 } gcu_context_t; 12548 12549 static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { 12550 memset(ctx, 0, offsetof(gcu_context_t, cursor)); 12551 ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; 12552 #if MDBX_ENABLE_BIGFOOT 12553 ctx->bigfoot = txn->mt_txnid; 12554 #endif /* MDBX_ENABLE_BIGFOOT */ 12555 return cursor_init(&ctx->cursor.outer, txn, FREE_DBI); 12556 } 12557 12558 static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) { 12559 return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; 12560 } 12561 12562 static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { 12563 int err = MDBX_SUCCESS; 12564 if (ctx->retired_stored) 12565 do { 12566 MDBX_val key, val; 12567 #if MDBX_ENABLE_BIGFOOT 12568 key.iov_base = &ctx->bigfoot; 12569 #else 12570 key.iov_base = &txn->mt_txnid; 12571 #endif /* MDBX_ENABLE_BIGFOOT */ 12572 key.iov_len = sizeof(txnid_t); 12573 const struct cursor_set_result csr = 12574 cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET); 12575 if (csr.err == MDBX_SUCCESS && csr.exact) { 12576 ctx->retired_stored = 0; 12577 err = mdbx_cursor_del(&ctx->cursor.outer, 0); 12578 TRACE("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn), 12579 err); 12580 } 12581 } 12582 #if MDBX_ENABLE_BIGFOOT 12583 while (!err && --ctx->bigfoot >= 
txn->mt_txnid); 12584 #else 12585 while (0); 12586 #endif /* MDBX_ENABLE_BIGFOOT */ 12587 return err; 12588 } 12589 12590 /* Prepare a backlog of pages to modify the GC itself, while reclaiming is 12591 * prohibited. It should be enough to prevent a search in page_alloc_slowpath() 12592 * during deletion, when the GC tree is unbalanced. */ 12593 static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, 12594 const bool reserve4retired) { 12595 const unsigned pages4retiredlist = 12596 reserve4retired ? number_of_ovpages( 12597 txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) 12598 : 0; 12599 const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; 12600 const unsigned backlog4rebalance = backlog4cow + 1; 12601 12602 if (likely(pages4retiredlist < 2 && 12603 gcu_backlog_size(txn) > (reserve4retired 12604 ? backlog4rebalance 12605 : (backlog4cow + backlog4rebalance)))) 12606 return MDBX_SUCCESS; 12607 12608 TRACE(">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u", 12609 reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, 12610 backlog4cow, backlog4rebalance); 12611 12612 int err; 12613 if (unlikely(pages4retiredlist > 2)) { 12614 MDBX_val key, val; 12615 key.iov_base = val.iov_base = nullptr; 12616 key.iov_len = sizeof(txnid_t); 12617 val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); 12618 err = cursor_spill(&ctx->cursor.outer, &key, &val); 12619 if (unlikely(err != MDBX_SUCCESS)) 12620 return err; 12621 } 12622 12623 ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; 12624 err = cursor_touch(&ctx->cursor.outer); 12625 TRACE("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err); 12626 12627 if (unlikely(pages4retiredlist > 1) && 12628 MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored && 12629 err == MDBX_SUCCESS) { 12630 tASSERT(txn, reserve4retired); 12631 err = gcu_clean_stored_retired(txn, ctx); 12632 if (unlikely(err != MDBX_SUCCESS)) 12633 return err; 12634 err = page_alloc_slowpath(&ctx->cursor.outer, pages4retiredlist, 12635 MDBX_ALLOC_GC | MDBX_ALLOC_FAKE) 12636 .err; 12637 TRACE("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn), err); 12638 cASSERT(&ctx->cursor.outer, 12639 gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); 12640 } 12641 12642 while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && 12643 err == MDBX_SUCCESS) 12644 err = page_alloc_slowpath(&ctx->cursor.outer, 0, 12645 MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | 12646 MDBX_ALLOC_FAKE | MDBX_ALLOC_NOLOG) 12647 .err; 12648 12649 ctx->cursor.outer.mc_flags |= C_RECLAIMING; 12650 TRACE("<< backlog %u, err %d", gcu_backlog_size(txn), err); 12651 return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; 12652 } 12653 12654 static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { 12655 /* PNL is initially empty, zero out at least the length */ 12656 memset(pnl.iov_base, 0, sizeof(pgno_t)); 12657 if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) 12658 /* zero out to avoid leaking values from uninitialized malloc'ed memory 12659 * to the file in non-writemap mode if the length of the page-list being 12660 * saved changed during space reservation. */ 12661 memset(pnl.iov_base, 0, pnl.iov_len); 12662 } 12663 12664 /* Cleans up reclaimed GC (aka freeDB) records, saves the retired-list (aka 12665 * freelist) of the current transaction to the GC, and puts the leftover 12666 * reclaimed pages back into the GC with chunking. This recursively changes the 12667 * reclaimed-list, loose-list and retired-list. Keep trying until it stabilizes.
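 *
 * A rough sketch of the overall shape (hypothetical helper names, added here
 * only for orientation; the real work is inlined in the retry loop below):
 *
 *   retry:                                  // the label really exists below
 *     clean_consumed_gc_records(txn, ctx);  // delete GC entries we reclaimed
 *     store_retired_list(txn, ctx);         // one record, or bigfoot slices
 *     reserve_gc_chunks(txn, ctx);          // slots for leftover reclaimed
 *     fill_reserved_chunks(txn, ctx);       // copy page numbers into them
 *     if (lists_changed_meanwhile(txn))     // loose/retired/reclaimed grew
 *       goto retry;                         // iterate until a fixpoint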
12668 * 12669 * NOTE: This code is a consequence of many iterations of adding crutches (aka 12670 * "checks and balances") to partially bypass the fundamental design problems 12671 * inherited from LMDB. So do not try to understand it completely, for the 12672 * sake of your own sanity. */ 12673 static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { 12674 TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid); 12675 MDBX_env *const env = txn->mt_env; 12676 const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; 12677 (void)dbg_prefix_mode; 12678 ctx->cursor.outer.mc_flags |= C_RECLAIMING; 12679 ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI]; 12680 txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer; 12681 12682 /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. 12683 * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. 12684 * Page numbers cannot disappear from txn->tw.retired_pages[]. */ 12685 12686 retry: 12687 ++ctx->loop; 12688 TRACE("%s", " >> restart"); 12689 int rc = MDBX_SUCCESS; 12690 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 12691 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 12692 tASSERT(txn, dirtylist_check(txn)); 12693 if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { 12694 ERROR("too many loops %u, bailout", ctx->loop); 12695 rc = MDBX_PROBLEM; 12696 goto bailout; 12697 } 12698 12699 if (unlikely(ctx->dense)) { 12700 rc = gcu_clean_stored_retired(txn, ctx); 12701 if (unlikely(rc != MDBX_SUCCESS)) 12702 goto bailout; 12703 } 12704 12705 ctx->settled = 0; 12706 ctx->cleaned_slot = 0; 12707 ctx->reused_slot = 0; 12708 ctx->filled_slot = ~0u; 12709 ctx->cleaned_id = 0; 12710 ctx->rid = txn->tw.last_reclaimed; 12711 while (true) { 12712 /* Come back here after each Put() in case retired-list changed */ 12713 MDBX_val key, data; 12714 TRACE("%s", " >> continue"); 12715 12716 if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && 12717 (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || 12718 ctx->retired_stored > env->me_maxgc_ov1page)) { 12719 rc = gcu_prepare_backlog(txn, ctx, true); 12720 if (unlikely(rc != MDBX_SUCCESS)) 12721 goto bailout; 12722 } 12723 12724 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 12725 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 12726 if (ctx->lifo) { 12727 if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed 12728 ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 12729 : 0)) { 12730 ctx->settled = 0; 12731 ctx->cleaned_slot = 0; 12732 ctx->reused_slot = 0; 12733 ctx->filled_slot = ~0u; 12734 /* LY: cleanup reclaimed records.
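 * For illustration (values are made up): in LIFO mode tw.lifo_reclaimed
 * holds the txn-ids whose GC records this transaction has consumed, e.g.
 * a list like { 42, 41, 40 }. Each id is looked up with MDBX_SET and its
 * record deleted, so a fresh chunk can be re-created under the same id
 * during the reservation phase further down.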
*/ 12735 do { 12736 ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; 12737 tASSERT(txn, 12738 ctx->cleaned_slot > 0 && 12739 ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); 12740 key.iov_base = &ctx->cleaned_id; 12741 key.iov_len = sizeof(ctx->cleaned_id); 12742 rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET); 12743 if (rc == MDBX_NOTFOUND) 12744 continue; 12745 if (unlikely(rc != MDBX_SUCCESS)) 12746 goto bailout; 12747 if (likely(!ctx->dense)) { 12748 rc = gcu_prepare_backlog(txn, ctx, false); 12749 if (unlikely(rc != MDBX_SUCCESS)) 12750 goto bailout; 12751 } 12752 tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); 12753 TRACE("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, 12754 ctx->cleaned_slot, ctx->cleaned_id); 12755 tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); 12756 rc = mdbx_cursor_del(&ctx->cursor.outer, 0); 12757 if (unlikely(rc != MDBX_SUCCESS)) 12758 goto bailout; 12759 } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); 12760 txl_sort(txn->tw.lifo_reclaimed); 12761 } 12762 } else { 12763 /* If we are using records from the GC which we have not yet deleted, 12764 * delete them now, along with any we reserved for tw.reclaimed_pglist. */ 12765 while (ctx->cleaned_id <= txn->tw.last_reclaimed) { 12766 rc = cursor_first(&ctx->cursor.outer, &key, NULL); 12767 if (rc == MDBX_NOTFOUND) 12768 break; 12769 if (unlikely(rc != MDBX_SUCCESS)) 12770 goto bailout; 12771 if (!MDBX_DISABLE_VALIDATION && 12772 unlikely(key.iov_len != sizeof(txnid_t))) { 12773 rc = MDBX_CORRUPTED; 12774 goto bailout; 12775 } 12776 ctx->rid = ctx->cleaned_id; 12777 ctx->settled = 0; 12778 ctx->reused_slot = 0; 12779 ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); 12780 if (ctx->cleaned_id > txn->tw.last_reclaimed) 12781 break; 12782 if (likely(!ctx->dense)) { 12783 rc = gcu_prepare_backlog(txn, ctx, false); 12784 if (unlikely(rc != MDBX_SUCCESS)) 12785 goto bailout; 12786 } 12787 tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); 12788 tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); 12789 TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, 12790 ctx->cleaned_id); 12791 tASSERT(txn, *txn->mt_cursors == &ctx->cursor.outer); 12792 rc = mdbx_cursor_del(&ctx->cursor.outer, 0); 12793 if (unlikely(rc != MDBX_SUCCESS)) 12794 goto bailout; 12795 } 12796 } 12797 12798 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 12799 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 12800 tASSERT(txn, dirtylist_check(txn)); 12801 if (AUDIT_ENABLED()) { 12802 rc = audit_ex(txn, ctx->retired_stored, false); 12803 if (unlikely(rc != MDBX_SUCCESS)) 12804 goto bailout; 12805 } 12806 12807 /* return suitable pages back into the unallocated tail space */ 12808 if (txn_refund(txn)) { 12809 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 12810 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 12811 if (AUDIT_ENABLED()) { 12812 rc = audit_ex(txn, ctx->retired_stored, false); 12813 if (unlikely(rc != MDBX_SUCCESS)) 12814 goto bailout; 12815 } 12816 } 12817 12818 /* handle loose pages - put them into the reclaimed- or retired-list */ 12819 if (txn->tw.loose_pages) { 12820 /* Return loose page numbers to tw.reclaimed_pglist, 12821 * though usually none are left at this point. 12822 * The pages themselves remain in dirtylist.
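 * As an example (page numbers are illustrative): given loose pages
 * {7, 12, 9} and an available GC slot, the code below copies the pgnos
 * into scratch space at the tail of tw.reclaimed_pglist, pnl_sort()s
 * them, and pnl_merge()s them in; the loose MDBX_page structs themselves
 * are then filtered out of the dirty list, since only the page numbers
 * need recycling.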
*/ 12823 if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { 12824 if (txn->tw.loose_count > 0) { 12825 TRACE("%s: try allocate gc-slot for %u loose-pages", dbg_prefix_mode, 12826 txn->tw.loose_count); 12827 rc = page_alloc_slowpath(&ctx->cursor.outer, 0, 12828 MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | 12829 MDBX_ALLOC_FAKE) 12830 .err; 12831 if (rc == MDBX_SUCCESS) { 12832 TRACE("%s: retry since gc-slot for %u loose-pages available", 12833 dbg_prefix_mode, txn->tw.loose_count); 12834 continue; 12835 } 12836 12837 /* Put loose page numbers in tw.retired_pages, 12838 * since we are unable to return them to tw.reclaimed_pglist. */ 12839 if (unlikely((rc = pnl_need(&txn->tw.retired_pages, 12840 txn->tw.loose_count)) != 0)) 12841 goto bailout; 12842 for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) 12843 pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); 12844 TRACE("%s: append %u loose-pages to retired-pages", dbg_prefix_mode, 12845 txn->tw.loose_count); 12846 } 12847 } else { 12848 /* Make room for the loose pages plus a temporary PNL of the same size */ 12849 rc = pnl_need(&txn->tw.reclaimed_pglist, 2 * txn->tw.loose_count + 2); 12850 if (unlikely(rc != MDBX_SUCCESS)) 12851 goto bailout; 12852 MDBX_PNL loose = txn->tw.reclaimed_pglist + 12853 MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - 12854 txn->tw.loose_count - 1; 12855 unsigned count = 0; 12856 for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { 12857 tASSERT(txn, mp->mp_flags == P_LOOSE); 12858 loose[++count] = mp->mp_pgno; 12859 } 12860 tASSERT(txn, count == txn->tw.loose_count); 12861 MDBX_PNL_SIZE(loose) = count; 12862 pnl_sort(loose, txn->mt_next_pgno); 12863 pnl_merge(txn->tw.reclaimed_pglist, loose); 12864 TRACE("%s: append %u loose-pages to reclaimed-pages", dbg_prefix_mode, 12865 txn->tw.loose_count); 12866 } 12867 12868 /* filter the loose pages out of the dirty-pages list */ 12869 MDBX_dpl *const dl = txn->tw.dirtylist; 12870 unsigned w = 0; 12871 for (unsigned r = w; ++r <= dl->length;) { 12872 MDBX_page *dp = dl->items[r].ptr; 12873 tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); 12874 tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); 12875 if ((dp->mp_flags & P_LOOSE) == 0) { 12876 if (++w != r) 12877 dl->items[w] = dl->items[r]; 12878 } else { 12879 tASSERT(txn, dp->mp_flags == P_LOOSE); 12880 if ((env->me_flags & MDBX_WRITEMAP) == 0) 12881 dpage_free(env, dp, 1); 12882 } 12883 } 12884 TRACE("%s: filtered-out loose-pages from %u -> %u dirty-pages", 12885 dbg_prefix_mode, dl->length, w); 12886 tASSERT(txn, txn->tw.loose_count == dl->length - w); 12887 dpl_setlen(dl, w); 12888 dl->sorted = 0; 12889 dl->pages_including_loose -= txn->tw.loose_count; 12890 txn->tw.dirtyroom += txn->tw.loose_count; 12891 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 12892 (txn->mt_parent ?
txn->mt_parent->tw.dirtyroom 12893 : txn->mt_env->me_options.dp_limit)); 12894 txn->tw.loose_pages = NULL; 12895 txn->tw.loose_count = 0; 12896 #if MDBX_ENABLE_REFUND 12897 txn->tw.loose_refund_wl = 0; 12898 #endif /* MDBX_ENABLE_REFUND */ 12899 } 12900 12901 const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); 12902 /* handle retired-list - store ones into single gc-record */ 12903 if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { 12904 if (unlikely(!ctx->retired_stored)) { 12905 /* Make sure last page of GC is touched and on retired-list */ 12906 ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; 12907 rc = page_search(&ctx->cursor.outer, NULL, 12908 MDBX_PS_LAST | MDBX_PS_MODIFY); 12909 ctx->cursor.outer.mc_flags |= C_RECLAIMING; 12910 if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) 12911 goto bailout; 12912 } 12913 12914 #if MDBX_ENABLE_BIGFOOT 12915 unsigned retired_pages_before; 12916 do { 12917 if (ctx->bigfoot > txn->mt_txnid) { 12918 rc = gcu_clean_stored_retired(txn, ctx); 12919 tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); 12920 } 12921 12922 retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages); 12923 rc = gcu_prepare_backlog(txn, ctx, true); 12924 if (unlikely(rc != MDBX_SUCCESS)) 12925 goto bailout; 12926 12927 pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); 12928 ctx->retired_stored = 0; 12929 ctx->bigfoot = txn->mt_txnid; 12930 do { 12931 key.iov_len = sizeof(txnid_t); 12932 key.iov_base = &ctx->bigfoot; 12933 const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) - 12934 ctx->retired_stored; 12935 const unsigned chunk = 12936 (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) 12937 ? env->me_maxgc_ov1page 12938 : left; 12939 data.iov_len = (chunk + 1) * sizeof(pgno_t); 12940 rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); 12941 if (unlikely(rc != MDBX_SUCCESS)) 12942 goto bailout; 12943 12944 if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) { 12945 const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING) 12946 ? left - chunk 12947 : ctx->retired_stored; 12948 pgno_t *const begin = txn->tw.retired_pages + at; 12949 /* MDBX_PNL_ASCENDING == false && LIFO == false: 12950 * - the larger pgno is at the beginning of retired list 12951 * and should be placed with the larger txnid. 12952 * MDBX_PNL_ASCENDING == true && LIFO == true: 12953 * - the larger pgno is at the ending of retired list 12954 * and should be placed with the smaller txnid. 
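 * A worked example (numbers are illustrative only): with 5 retired pgnos
 * and me_maxgc_ov1page == 2, the loop stores slices of at most 2 pgnos
 * under consecutive "bigfoot" keys txnid, txnid+1, txnid+2, so that no
 * single GC record outgrows one overflow page; the 'at' index picks which
 * end of the retired list feeds the current slice, per the rules above.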
12955 */ 12956 const pgno_t save = *begin; 12957 *begin = chunk; 12958 memcpy(data.iov_base, begin, data.iov_len); 12959 *begin = save; 12960 TRACE("%s: put-retired/bigfoot @ %" PRIaTXN 12961 " (slice #%u) #%u [%u..%u] of %u", 12962 dbg_prefix_mode, ctx->bigfoot, 12963 (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, 12964 at + chunk, retired_pages_before); 12965 } 12966 ctx->retired_stored += chunk; 12967 } while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) && 12968 (++ctx->bigfoot, true)); 12969 } while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages)); 12970 #else 12971 /* Write to last page of GC */ 12972 key.iov_len = sizeof(txnid_t); 12973 key.iov_base = &txn->mt_txnid; 12974 do { 12975 gcu_prepare_backlog(txn, ctx, true); 12976 data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); 12977 rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE); 12978 if (unlikely(rc != MDBX_SUCCESS)) 12979 goto bailout; 12980 /* Retry if tw.retired_pages[] grew during the Put() */ 12981 } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); 12982 12983 ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); 12984 pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); 12985 eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); 12986 memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); 12987 12988 TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, 12989 ctx->retired_stored, txn->mt_txnid); 12990 #endif /* MDBX_ENABLE_BIGFOOT */ 12991 if (LOG_ENABLED(MDBX_LOG_EXTRA)) { 12992 unsigned i = ctx->retired_stored; 12993 DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %u, retired-PNL", 12994 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); 12995 for (; i; i--) 12996 DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); 12997 DEBUG_EXTRA_PRINT("%s\n", "."); 12998 } 12999 if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && 13000 ctx->settled)) { 13001 TRACE("%s: reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, 13002 amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); 13003 goto retry /* rare case, but avoids GC fragmentation 13004 and one cycle. */ 13005 ; 13006 } 13007 continue; 13008 } 13009 13010 /* handle reclaimed and lost pages - merge and store both into gc */ 13011 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 13012 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 13013 tASSERT(txn, txn->tw.loose_count == 0); 13014 13015 TRACE("%s", " >> reserving"); 13016 if (AUDIT_ENABLED()) { 13017 rc = audit_ex(txn, ctx->retired_stored, false); 13018 if (unlikely(rc != MDBX_SUCCESS)) 13019 goto bailout; 13020 } 13021 const unsigned left = amount - ctx->settled; 13022 TRACE("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " 13023 "reused-gc-slots %u", 13024 dbg_prefix_mode, amount, ctx->settled, (int)left, 13025 txn->tw.lifo_reclaimed 13026 ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 13027 : 0, 13028 ctx->reused_slot); 13029 if (0 >= (int)left) 13030 break; 13031 13032 const unsigned prefer_max_scatter = 257; 13033 txnid_t reservation_gc_id; 13034 if (ctx->lifo) { 13035 if (txn->tw.lifo_reclaimed == nullptr) { 13036 txn->tw.lifo_reclaimed = txl_alloc(); 13037 if (unlikely(!txn->tw.lifo_reclaimed)) { 13038 rc = MDBX_ENOMEM; 13039 goto bailout; 13040 } 13041 } 13042 if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < 13043 prefer_max_scatter && 13044 left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - 13045 ctx->reused_slot) * 13046 env->me_maxgc_ov1page && 13047 !ctx->dense) { 13048 /* LY: need just a txn-id for save page list. */ 13049 bool need_cleanup = false; 13050 txnid_t snap_oldest; 13051 retry_rid: 13052 ctx->cursor.outer.mc_flags &= ~C_RECLAIMING; 13053 do { 13054 snap_oldest = txn_oldest_reader(txn); 13055 rc = page_alloc_slowpath(&ctx->cursor.outer, 0, 13056 MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | 13057 MDBX_ALLOC_FAKE) 13058 .err; 13059 if (likely(rc == MDBX_SUCCESS)) { 13060 TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, 13061 MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); 13062 need_cleanup = true; 13063 } 13064 } while (rc == MDBX_SUCCESS && 13065 (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < 13066 prefer_max_scatter && 13067 left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - 13068 ctx->reused_slot) * 13069 env->me_maxgc_ov1page); 13070 ctx->cursor.outer.mc_flags |= C_RECLAIMING; 13071 13072 if (likely(rc == MDBX_SUCCESS)) { 13073 TRACE("%s: got enough from GC.", dbg_prefix_mode); 13074 continue; 13075 } else if (unlikely(rc != MDBX_NOTFOUND)) 13076 /* LY: some troubles... */ 13077 goto bailout; 13078 13079 if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { 13080 if (need_cleanup) { 13081 txl_sort(txn->tw.lifo_reclaimed); 13082 ctx->cleaned_slot = 0; 13083 } 13084 ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); 13085 } else { 13086 tASSERT(txn, txn->tw.last_reclaimed == 0); 13087 if (unlikely(txn_oldest_reader(txn) != snap_oldest)) 13088 /* should retry page_alloc_slowpath(MDBX_ALLOC_GC) 13089 * if the oldest reader changes since the last attempt */ 13090 goto retry_rid; 13091 /* no reclaimable GC entries, 13092 * therefore no entries with ID < mdbx_find_oldest(txn) */ 13093 txn->tw.last_reclaimed = ctx->rid = snap_oldest; 13094 TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, 13095 ctx->rid); 13096 } 13097 13098 /* LY: GC is empty, will look any free txn-id in high2low order. 
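 * Sketch of the scan below (names are those of the surrounding code):
 * starting at ctx->rid, each candidate id is probed via
 *
 *   --ctx->rid;
 *   rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY);
 *
 * a hit means the id is occupied, so the scan restarts just below the
 * smallest existing GC key (gc_first - 1); when nothing above MIN_TXNID
 * is free, the code gives up and switches to dense mode.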
*/ 13099 while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && 13100 left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - 13101 ctx->reused_slot) * 13102 env->me_maxgc_ov1page) { 13103 if (unlikely(ctx->rid <= MIN_TXNID)) { 13104 if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= 13105 ctx->reused_slot)) { 13106 NOTICE("** restart: reserve depleted (reused_gc_slot %u >= " 13107 "lifo_reclaimed %u" PRIaTXN, 13108 ctx->reused_slot, 13109 (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); 13110 goto retry; 13111 } 13112 break; 13113 } 13114 13115 tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); 13116 --ctx->rid; 13117 key.iov_base = &ctx->rid; 13118 key.iov_len = sizeof(ctx->rid); 13119 rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); 13120 if (unlikely(rc == MDBX_SUCCESS)) { 13121 DEBUG("%s: GC's id %" PRIaTXN " is used, continue bottom-up search", 13122 dbg_prefix_mode, ctx->rid); 13123 ++ctx->rid; 13124 rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST); 13125 if (rc == MDBX_NOTFOUND) { 13126 DEBUG("%s: GC is empty (going dense-mode)", dbg_prefix_mode); 13127 ctx->dense = true; 13128 break; 13129 } 13130 if (unlikely(rc != MDBX_SUCCESS || 13131 key.iov_len != sizeof(txnid_t))) { 13132 rc = MDBX_CORRUPTED; 13133 goto bailout; 13134 } 13135 txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); 13136 if (gc_first <= MIN_TXNID) { 13137 DEBUG("%s: no free GC's id(s) less than %" PRIaTXN 13138 " (going dense-mode)", 13139 dbg_prefix_mode, ctx->rid); 13140 ctx->dense = true; 13141 break; 13142 } 13143 ctx->rid = gc_first - 1; 13144 } 13145 13146 eASSERT(env, !ctx->dense); 13147 rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid); 13148 if (unlikely(rc != MDBX_SUCCESS)) 13149 goto bailout; 13150 13151 if (ctx->reused_slot) 13152 /* rare case, but it is better to clear and re-create GC entries 13153 * with less fragmentation. */ 13154 need_cleanup = true; 13155 else 13156 ctx->cleaned_slot += 13157 1 /* mark cleanup is not needed for added slot. 
*/; 13158 13159 TRACE("%s: append @%" PRIaTXN 13160 " to lifo-reclaimed, cleaned-gc-slot = %u", 13161 dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); 13162 } 13163 13164 if (need_cleanup || ctx->dense) { 13165 if (ctx->cleaned_slot) 13166 TRACE("%s: restart inner-loop to clear and re-create GC entries", 13167 dbg_prefix_mode); 13168 ctx->cleaned_slot = 0; 13169 continue; 13170 } 13171 } 13172 13173 const unsigned i = 13174 (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; 13175 tASSERT(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); 13176 reservation_gc_id = txn->tw.lifo_reclaimed[i]; 13177 TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, 13178 reservation_gc_id, i); 13179 } else { 13180 tASSERT(txn, txn->tw.lifo_reclaimed == NULL); 13181 if (unlikely(ctx->rid == 0)) { 13182 ctx->rid = txn_oldest_reader(txn); 13183 rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST); 13184 if (rc == MDBX_SUCCESS) { 13185 if (unlikely(key.iov_len != sizeof(txnid_t))) { 13186 rc = MDBX_CORRUPTED; 13187 goto bailout; 13188 } 13189 txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); 13190 if (ctx->rid >= gc_first) 13191 ctx->rid = gc_first - 1; 13192 if (unlikely(ctx->rid == 0)) { 13193 ERROR("%s", "** no GC tail-space to store (going dense-mode)"); 13194 ctx->dense = true; 13195 goto retry; 13196 } 13197 } else if (rc != MDBX_NOTFOUND) 13198 goto bailout; 13199 txn->tw.last_reclaimed = ctx->rid; 13200 ctx->cleaned_id = ctx->rid + 1; 13201 } 13202 reservation_gc_id = ctx->rid--; 13203 TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, 13204 reservation_gc_id); 13205 } 13206 ++ctx->reused_slot; 13207 13208 unsigned chunk = left; 13209 if (unlikely(chunk > env->me_maxgc_ov1page)) { 13210 const unsigned avail_gc_slots = 13211 txn->tw.lifo_reclaimed 13212 ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - 13213 ctx->reused_slot + 1 13214 : (ctx->rid < INT16_MAX) ? (unsigned)ctx->rid 13215 : INT16_MAX; 13216 if (avail_gc_slots > 1) { 13217 if (chunk < env->me_maxgc_ov1page * 2) 13218 chunk /= 2; 13219 else { 13220 const unsigned threshold = 13221 env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) 13222 ? avail_gc_slots 13223 : prefer_max_scatter); 13224 if (left < threshold) 13225 chunk = env->me_maxgc_ov1page; 13226 else { 13227 const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; 13228 unsigned span = 1; 13229 unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / 13230 sizeof(pgno_t)) /* - 1 + span */; 13231 if (tail > avail) { 13232 for (unsigned i = amount - span; i > 0; --i) { 13233 if (MDBX_PNL_ASCENDING 13234 ? (txn->tw.reclaimed_pglist[i] + span) 13235 : (txn->tw.reclaimed_pglist[i] - span) == 13236 txn->tw.reclaimed_pglist[i + span]) { 13237 span += 1; 13238 avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / 13239 sizeof(pgno_t)) - 13240 1 + span; 13241 if (avail >= tail) 13242 break; 13243 } 13244 } 13245 } 13246 13247 chunk = (avail >= tail) ? tail - span 13248 : (avail_gc_slots > 3 && 13249 ctx->reused_slot < prefer_max_scatter - 3) 13250 ? 
avail - span 13251 : tail; 13252 } 13253 } 13254 } 13255 } 13256 tASSERT(txn, chunk > 0); 13257 13258 TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " 13259 "%" PRIaTXN, 13260 dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); 13261 13262 TRACE("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, 13263 env->me_maxgc_ov1page); 13264 13265 tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); 13266 if (unlikely( 13267 reservation_gc_id < MIN_TXNID || 13268 reservation_gc_id > 13269 atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { 13270 ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", 13271 reservation_gc_id); 13272 rc = MDBX_PROBLEM; 13273 goto bailout; 13274 } 13275 13276 key.iov_len = sizeof(reservation_gc_id); 13277 key.iov_base = &reservation_gc_id; 13278 data.iov_len = (chunk + 1) * sizeof(pgno_t); 13279 TRACE("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, 13280 ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); 13281 gcu_prepare_backlog(txn, ctx, true); 13282 rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, 13283 MDBX_RESERVE | MDBX_NOOVERWRITE); 13284 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 13285 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 13286 if (unlikely(rc != MDBX_SUCCESS)) 13287 goto bailout; 13288 13289 gcu_clean_reserved(env, data); 13290 ctx->settled += chunk; 13291 TRACE("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled, 13292 chunk); 13293 13294 if (txn->tw.lifo_reclaimed && 13295 unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && 13296 (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > 13297 env->me_maxgc_ov1page)) { 13298 NOTICE("** restart: reclaimed-list growth %u -> %u", amount, 13299 (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); 13300 goto retry; 13301 } 13302 13303 continue; 13304 } 13305 13306 tASSERT(txn, ctx->cleaned_slot == (txn->tw.lifo_reclaimed 13307 ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 13308 : 0)); 13309 13310 TRACE("%s", " >> filling"); 13311 /* Fill in the reserved records */ 13312 ctx->filled_slot = 13313 txn->tw.lifo_reclaimed 13314 ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot 13315 : ctx->reused_slot; 13316 rc = MDBX_SUCCESS; 13317 tASSERT(txn, pnl_check_allocated(txn->tw.reclaimed_pglist, 13318 txn->mt_next_pgno - MDBX_ENABLE_REFUND)); 13319 tASSERT(txn, dirtylist_check(txn)); 13320 if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { 13321 MDBX_val key, data; 13322 key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ 13323 key.iov_base = data.iov_base = NULL; 13324 13325 const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); 13326 unsigned left = amount; 13327 if (txn->tw.lifo_reclaimed == nullptr) { 13328 tASSERT(txn, ctx->lifo == 0); 13329 rc = cursor_first(&ctx->cursor.outer, &key, &data); 13330 if (unlikely(rc != MDBX_SUCCESS)) 13331 goto bailout; 13332 } else { 13333 tASSERT(txn, ctx->lifo != 0); 13334 } 13335 13336 while (true) { 13337 txnid_t fill_gc_id; 13338 TRACE("%s: left %u of %u", dbg_prefix_mode, left, 13339 (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); 13340 if (txn->tw.lifo_reclaimed == nullptr) { 13341 tASSERT(txn, ctx->lifo == 0); 13342 fill_gc_id = unaligned_peek_u64(4, key.iov_base); 13343 if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { 13344 NOTICE( 13345 "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN 13346 " > last_reclaimed %" PRIaTXN, 13347 ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); 13348 goto retry; 13349 } 13350 } else { 13351 tASSERT(txn, ctx->lifo != 0); 13352 if (++ctx->filled_slot > 13353 (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { 13354 NOTICE("** restart: reserve depleted (filled_gc_slot %u > " 13355 "lifo_reclaimed %u" PRIaTXN, 13356 ctx->filled_slot, 13357 (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); 13358 goto retry; 13359 } 13360 fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; 13361 TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", 13362 dbg_prefix_mode, fill_gc_id, ctx->filled_slot); 13363 key.iov_base = &fill_gc_id; 13364 key.iov_len = sizeof(fill_gc_id); 13365 rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY); 13366 if (unlikely(rc != MDBX_SUCCESS)) 13367 goto bailout; 13368 } 13369 tASSERT(txn, 13370 ctx->cleaned_slot == (txn->tw.lifo_reclaimed 13371 ? 
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 13372 : 0)); 13373 tASSERT(txn, fill_gc_id > 0 && 13374 fill_gc_id <= env->me_lck->mti_oldest_reader.weak); 13375 key.iov_base = &fill_gc_id; 13376 key.iov_len = sizeof(fill_gc_id); 13377 13378 tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); 13379 ctx->cursor.outer.mc_flags |= C_GCFREEZE; 13380 unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; 13381 if (unlikely(chunk > left)) { 13382 TRACE("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, 13383 left, fill_gc_id); 13384 if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || 13385 chunk - left > env->me_maxgc_ov1page) { 13386 data.iov_len = (left + 1) * sizeof(pgno_t); 13387 if (ctx->loop < 7) 13388 ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; 13389 } 13390 chunk = left; 13391 } 13392 rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, 13393 MDBX_CURRENT | MDBX_RESERVE); 13394 ctx->cursor.outer.mc_flags &= ~C_GCFREEZE; 13395 if (unlikely(rc != MDBX_SUCCESS)) 13396 goto bailout; 13397 gcu_clean_reserved(env, data); 13398 13399 if (unlikely(txn->tw.loose_count || 13400 amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { 13401 NOTICE("** restart: reclaimed-list growth (%u -> %u, loose +%u)", 13402 amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), 13403 txn->tw.loose_count); 13404 goto retry; 13405 } 13406 if (unlikely(txn->tw.lifo_reclaimed 13407 ? ctx->cleaned_slot < 13408 MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 13409 : ctx->cleaned_id < txn->tw.last_reclaimed)) { 13410 NOTICE("%s", "** restart: reclaimed-slots changed"); 13411 goto retry; 13412 } 13413 if (unlikely(ctx->retired_stored != 13414 MDBX_PNL_SIZE(txn->tw.retired_pages))) { 13415 tASSERT(txn, 13416 ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); 13417 NOTICE("** restart: retired-list growth (%u -> %u)", 13418 ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); 13419 goto retry; 13420 } 13421 13422 pgno_t *dst = data.iov_base; 13423 *dst++ = chunk; 13424 pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; 13425 memcpy(dst, src, chunk * sizeof(pgno_t)); 13426 pgno_t *from = src, *to = src + chunk; 13427 TRACE("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN, 13428 dbg_prefix_mode, chunk, (unsigned)(from - txn->tw.reclaimed_pglist), 13429 from[0], (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], 13430 fill_gc_id); 13431 13432 left -= chunk; 13433 if (AUDIT_ENABLED()) { 13434 rc = audit_ex(txn, ctx->retired_stored + amount - left, true); 13435 if (unlikely(rc != MDBX_SUCCESS)) 13436 goto bailout; 13437 } 13438 if (left == 0) { 13439 rc = MDBX_SUCCESS; 13440 break; 13441 } 13442 13443 if (txn->tw.lifo_reclaimed == nullptr) { 13444 tASSERT(txn, ctx->lifo == 0); 13445 rc = cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT); 13446 if (unlikely(rc != MDBX_SUCCESS)) 13447 goto bailout; 13448 } else { 13449 tASSERT(txn, ctx->lifo != 0); 13450 } 13451 } 13452 } 13453 13454 tASSERT(txn, rc == MDBX_SUCCESS); 13455 if (unlikely(txn->tw.loose_count != 0)) { 13456 NOTICE("** restart: got %u loose pages", txn->tw.loose_count); 13457 goto retry; 13458 } 13459 if (unlikely(ctx->filled_slot != 13460 (txn->tw.lifo_reclaimed 13461 ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) 13462 : 0))) { 13463 13464 const bool will_retry = ctx->loop < 9; 13465 NOTICE("** %s: reserve excess (filled-slot %u, loop %u)", 13466 will_retry ? 
"restart" : "ignore", ctx->filled_slot, ctx->loop); 13467 if (will_retry) 13468 goto retry; 13469 } 13470 13471 tASSERT(txn, txn->tw.lifo_reclaimed == NULL || 13472 ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); 13473 13474 bailout: 13475 txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next; 13476 13477 MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; 13478 TRACE("<<< %u loops, rc = %d", ctx->loop, rc); 13479 return rc; 13480 } 13481 13482 static int txn_write(MDBX_txn *txn, struct iov_ctx *ctx) { 13483 MDBX_dpl *const dl = 13484 (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : dpl_sort(txn); 13485 int rc = MDBX_SUCCESS; 13486 unsigned r, w; 13487 for (w = 0, r = 1; r <= dl->length; ++r) { 13488 MDBX_page *dp = dl->items[r].ptr; 13489 if (dp->mp_flags & P_LOOSE) { 13490 dl->items[++w] = dl->items[r]; 13491 continue; 13492 } 13493 unsigned npages = dpl_npages(dl, r); 13494 rc = iov_page(txn, ctx, dp, npages); 13495 if (unlikely(rc != MDBX_SUCCESS)) 13496 break; 13497 } 13498 13499 if (ctx->iov_items) { 13500 /* iov_page() frees dirty-pages and reset iov_items in case of failure. */ 13501 tASSERT(txn, rc == MDBX_SUCCESS); 13502 rc = iov_write(txn, ctx); 13503 } 13504 13505 while (r <= dl->length) 13506 dl->items[++w] = dl->items[r++]; 13507 13508 dl->sorted = dpl_setlen(dl, w); 13509 txn->tw.dirtyroom += r - 1 - w; 13510 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == 13511 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom 13512 : txn->mt_env->me_options.dp_limit)); 13513 return rc; 13514 } 13515 13516 /* Check txn and dbi arguments to a function */ 13517 static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, 13518 unsigned validity) { 13519 if (likely(dbi < txn->mt_numdbs)) { 13520 if (likely(!dbi_changed(txn, dbi))) { 13521 if (likely(txn->mt_dbistate[dbi] & validity)) 13522 return true; 13523 if (likely(dbi < CORE_DBS || 13524 (txn->mt_env->me_dbflags[dbi] & DB_VALID) == 0)) 13525 return false; 13526 } 13527 } 13528 return dbi_import(txn, dbi); 13529 } 13530 13531 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 13532 int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } 13533 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 13534 13535 /* Merge child txn into parent */ 13536 static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, 13537 const unsigned parent_retired_len) { 13538 MDBX_dpl *const src = dpl_sort(txn); 13539 13540 /* Remove refunded pages from parent's dirty list */ 13541 MDBX_dpl *const dst = dpl_sort(parent); 13542 if (MDBX_ENABLE_REFUND) { 13543 unsigned n = dst->length; 13544 while (n && dst->items[n].pgno >= parent->mt_next_pgno) { 13545 if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { 13546 unsigned npages = dpl_npages(dst, n); 13547 dpage_free(txn->mt_env, dst->items[n].ptr, npages); 13548 } 13549 --n; 13550 } 13551 parent->tw.dirtyroom += dst->sorted - n; 13552 dst->sorted = dpl_setlen(dst, n); 13553 tASSERT(parent, 13554 parent->tw.dirtyroom + parent->tw.dirtylist->length == 13555 (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom 13556 : parent->mt_env->me_options.dp_limit)); 13557 } 13558 13559 /* Remove reclaimed pages from parent's dirty list */ 13560 const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; 13561 dpl_sift(parent, reclaimed_list, false); 13562 13563 /* Move retired pages from parent's dirty & spilled list to reclaimed */ 13564 unsigned r, w, d, s, l; 13565 for (r = w = parent_retired_len; 13566 ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { 13567 const pgno_t pgno = parent->tw.retired_pages[r]; 13568 const unsigned di = dpl_exist(parent, pgno); 13569 const unsigned si = !di ? search_spilled(parent, pgno) : 0; 13570 unsigned npages; 13571 const char *kind; 13572 if (di) { 13573 MDBX_page *dp = dst->items[di].ptr; 13574 tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | 13575 P_OVERFLOW | P_SPILLED)) == 0); 13576 npages = dpl_npages(dst, di); 13577 page_wash(parent, di, dp, npages); 13578 kind = "dirty"; 13579 l = 1; 13580 if (unlikely(npages > l)) { 13581 /* An OVERFLOW page could have been reused in parts. In that case 13582 * the retired list may contain only the start of the sequence, 13583 * while the remainder is scattered across the dirty, spilled and 13584 * reclaimed lists. So move it to reclaimed, checking for a break 13585 * in the sequence. Either way all the fragments get accounted and 13586 * filtered out, i.e. if a page was split into parts, the point is 13587 * to remove the dirty entry; the fragments are counted separately. */ 13588 13589 /* The retired-pages list is not sorted, but to speed up sorting 13590 * it is appended in accordance with MDBX_PNL_ASCENDING */ 13591 #if MDBX_PNL_ASCENDING 13592 const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages); 13593 while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { 13594 ++r; 13595 if (++l == npages) 13596 break; 13597 } 13598 #else 13599 while (w > parent_retired_len && 13600 parent->tw.retired_pages[w - 1] == pgno + l) { 13601 --w; 13602 if (++l == npages) 13603 break; 13604 } 13605 #endif 13606 } 13607 } else if (unlikely(si)) { 13608 l = npages = 1; 13609 spill_remove(parent, si, 1); 13610 kind = "spilled"; 13611 } else { 13612 parent->tw.retired_pages[++w] = pgno; 13613 continue; 13614 } 13615 13616 DEBUG("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, kind, 13617 pgno); 13618 int err = pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); 13619 ENSURE(txn->mt_env, err == MDBX_SUCCESS); 13620 } 13621 MDBX_PNL_SIZE(parent->tw.retired_pages) = w; 13622 13623 /* Filter-out parent spill list */ 13624 if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { 13625 const MDBX_PNL sl = spill_purge(parent); 13626 unsigned len = MDBX_PNL_SIZE(sl); 13627 if (len) { 13628 /* Remove refunded pages from parent's spill list */ 13629 if (MDBX_ENABLE_REFUND && 13630 MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { 13631 #if MDBX_PNL_ASCENDING 13632 unsigned i = MDBX_PNL_SIZE(sl); 13633 assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); 13634 do { 13635 if ((sl[i] & 1) == 0) 13636 DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); 13637 i -= 1; 13638 } while (i && sl[i] >= (parent->mt_next_pgno << 1)); 13639 MDBX_PNL_SIZE(sl) = i; 13640 #else 13641 assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); 13642 unsigned i = 0; 13643 do { 13644 ++i; 13645 if ((sl[i] & 1) == 0) 13646 DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); 13647 } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); 13648 MDBX_PNL_SIZE(sl) = len -= i; 13649
memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); 13650 #endif 13651 } 13652 tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); 13653 13654 /* Remove reclaimed pages from parent's spill list */ 13655 s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); 13656 /* Scanning from end to begin */ 13657 while (s && r) { 13658 if (sl[s] & 1) { 13659 --s; 13660 continue; 13661 } 13662 const pgno_t spilled_pgno = sl[s] >> 1; 13663 const pgno_t reclaimed_pgno = reclaimed_list[r]; 13664 if (reclaimed_pgno != spilled_pgno) { 13665 const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno); 13666 s -= !cmp; 13667 r -= cmp; 13668 } else { 13669 DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, 13670 reclaimed_pgno); 13671 spill_remove(parent, s, 1); 13672 --s; 13673 --r; 13674 } 13675 } 13676 13677 /* Remove anything in our dirty list from parent's spill list */ 13678 /* Scanning spill list in descend order */ 13679 const int step = MDBX_PNL_ASCENDING ? -1 : 1; 13680 s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1; 13681 d = src->length; 13682 while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) { 13683 if (sl[s] & 1) { 13684 s += step; 13685 continue; 13686 } 13687 const pgno_t spilled_pgno = sl[s] >> 1; 13688 const pgno_t dirty_pgno_form = src->items[d].pgno; 13689 const unsigned npages = dpl_npages(src, d); 13690 const pgno_t dirty_pgno_to = dirty_pgno_form + npages; 13691 if (dirty_pgno_form > spilled_pgno) { 13692 --d; 13693 continue; 13694 } 13695 if (dirty_pgno_to <= spilled_pgno) { 13696 s += step; 13697 continue; 13698 } 13699 13700 DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, 13701 dirty_pgno_form); 13702 spill_remove(parent, s, 1); 13703 s += step; 13704 } 13705 13706 /* Squash deleted pagenums if we deleted any */ 13707 spill_purge(parent); 13708 } 13709 } 13710 13711 /* Remove anything in our spill list from parent's dirty list */ 13712 if (txn->tw.spill_pages) { 13713 tASSERT(txn, pnl_check_allocated(txn->tw.spill_pages, 13714 (size_t)parent->mt_next_pgno << 1)); 13715 dpl_sift(parent, txn->tw.spill_pages, true); 13716 tASSERT(parent, 13717 parent->tw.dirtyroom + parent->tw.dirtylist->length == 13718 (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom 13719 : parent->mt_env->me_options.dp_limit)); 13720 } 13721 13722 /* Find length of merging our dirty list with parent's and release 13723 * filter-out pages */ 13724 for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { 13725 MDBX_page *sp = src->items[s].ptr; 13726 tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | 13727 P_LOOSE | P_SPILLED)) == 0); 13728 const unsigned s_npages = dpl_npages(src, s); 13729 const pgno_t s_pgno = src->items[s].pgno; 13730 13731 MDBX_page *dp = dst->items[d].ptr; 13732 tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | 13733 P_SPILLED)) == 0); 13734 const unsigned d_npages = dpl_npages(dst, d); 13735 const pgno_t d_pgno = dst->items[d].pgno; 13736 13737 if (d_pgno >= s_pgno + s_npages) { 13738 --d; 13739 ++l; 13740 } else if (d_pgno + d_npages <= s_pgno) { 13741 if (sp->mp_flags != P_LOOSE) { 13742 sp->mp_txnid = parent->mt_front; 13743 sp->mp_flags &= ~P_SPILLED; 13744 } 13745 --s; 13746 ++l; 13747 } else { 13748 dst->items[d--].ptr = nullptr; 13749 if ((txn->mt_flags & MDBX_WRITEMAP) == 0) 13750 dpage_free(txn->mt_env, dp, d_npages); 13751 } 13752 } 13753 assert(dst->sorted == dst->length); 13754 tASSERT(parent, dst->detent >= l + d + s); 13755 dst->sorted = l + d + s; /* the merged length */ 13756 13757 while (s > 0) { 13758 MDBX_page *sp = src->items[s].ptr; 13759 tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | 13760 P_LOOSE | P_SPILLED)) == 0); 13761 if (sp->mp_flags != P_LOOSE) { 13762 sp->mp_txnid = parent->mt_front; 13763 sp->mp_flags &= ~P_SPILLED; 13764 } 13765 --s; 13766 } 13767 13768 /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */ 13769 if (dst->sorted >= dst->length) { 13770 /* from end to begin with dst extending */ 13771 for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) { 13772 if (unlikely(l <= d)) { 13773 /* squash to get a gap of free space for merge */ 13774 for (r = w = 1; r <= d; ++r) 13775 if (dst->items[r].ptr) { 13776 if (w != r) { 13777 dst->items[w] = dst->items[r]; 13778 dst->items[r].ptr = nullptr; 13779 } 13780 ++w; 13781 } 13782 NOTICE("squash to begin for extending-merge %u -> %u", d, w - 1); 13783 d = w - 1; 13784 continue; 13785 } 13786 assert(l > d); 13787 if (dst->items[d].ptr) { 13788 dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) 13789 ? dst->items[d--] 13790 : src->items[s--]; 13791 } else 13792 --d; 13793 } 13794 if (s > 0) { 13795 assert(l == s); 13796 while (d > 0) { 13797 assert(dst->items[d].ptr == nullptr); 13798 --d; 13799 } 13800 do { 13801 assert(l > 0); 13802 dst->items[l--] = src->items[s--]; 13803 } while (s > 0); 13804 } else { 13805 assert(l == d); 13806 while (l > 0) { 13807 assert(dst->items[l].ptr != nullptr); 13808 --l; 13809 } 13810 } 13811 } else { 13812 /* from begin to end with shrinking (a lot of new large/overflow pages) */ 13813 for (l = s = d = 1; s <= src->length && d <= dst->length;) { 13814 if (unlikely(l >= d)) { 13815 /* squash to get a gap of free space for merge */ 13816 for (r = w = dst->length; r >= d; --r) 13817 if (dst->items[r].ptr) { 13818 if (w != r) { 13819 dst->items[w] = dst->items[r]; 13820 dst->items[r].ptr = nullptr; 13821 } 13822 --w; 13823 } 13824 NOTICE("squash to end for shrinking-merge %u -> %u", d, w + 1); 13825 d = w + 1; 13826 continue; 13827 } 13828 assert(l < d); 13829 if (dst->items[d].ptr) { 13830 dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) 13831 ? 
dst->items[d++] 13832 : src->items[s++]; 13833 } else 13834 ++d; 13835 } 13836 if (s <= src->length) { 13837 assert(dst->sorted - l == src->length - s); 13838 while (d <= dst->length) { 13839 assert(dst->items[d].ptr == nullptr); 13840 --d; 13841 } 13842 do { 13843 assert(l <= dst->sorted); 13844 dst->items[l++] = src->items[s++]; 13845 } while (s <= src->length); 13846 } else { 13847 assert(dst->sorted - l == dst->length - d); 13848 while (l <= dst->sorted) { 13849 assert(l <= d && d <= dst->length && dst->items[d].ptr); 13850 dst->items[l++] = dst->items[d++]; 13851 } 13852 } 13853 } 13854 parent->tw.dirtyroom -= dst->sorted - dst->length; 13855 assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); 13856 dpl_setlen(dst, dst->sorted); 13857 parent->tw.dirtylru = txn->tw.dirtylru; 13858 13859 /* As currently understood, it is cheaper to recount the number of pages 13860 * than to mix extra branching and arithmetic into the loops above. */ 13861 dst->pages_including_loose = 0; 13862 for (r = 1; r <= dst->length; ++r) 13863 dst->pages_including_loose += dpl_npages(dst, r); 13864 13865 tASSERT(parent, dirtylist_check(parent)); 13866 dpl_free(txn); 13867 13868 if (txn->tw.spill_pages) { 13869 if (parent->tw.spill_pages) { 13870 /* Must not fail since space was preserved above. */ 13871 pnl_merge(parent->tw.spill_pages, txn->tw.spill_pages); 13872 pnl_free(txn->tw.spill_pages); 13873 } else { 13874 parent->tw.spill_pages = txn->tw.spill_pages; 13875 parent->tw.spill_least_removed = txn->tw.spill_least_removed; 13876 } 13877 tASSERT(parent, dirtylist_check(parent)); 13878 } 13879 13880 parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; 13881 if (parent->tw.spill_pages) { 13882 assert(pnl_check_allocated(parent->tw.spill_pages, 13883 (size_t)parent->mt_next_pgno << 1)); 13884 if (MDBX_PNL_SIZE(parent->tw.spill_pages)) 13885 parent->mt_flags |= MDBX_TXN_SPILLS; 13886 } 13887 } 13888 13889 int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { 13890 STATIC_ASSERT(MDBX_TXN_FINISHED == 13891 MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); 13892 const uint64_t ts_0 = latency ?
osal_monotime() : 0; 13893 uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; 13894 uint32_t audit_duration = 0; 13895 13896 int rc = check_txn(txn, MDBX_TXN_FINISHED); 13897 if (unlikely(rc != MDBX_SUCCESS)) 13898 goto provide_latency; 13899 13900 if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { 13901 rc = MDBX_RESULT_TRUE; 13902 goto fail; 13903 } 13904 13905 MDBX_env *env = txn->mt_env; 13906 #if MDBX_ENV_CHECKPID 13907 if (unlikely(env->me_pid != osal_getpid())) { 13908 env->me_flags |= MDBX_FATAL_ERROR; 13909 rc = MDBX_PANIC; 13910 goto provide_latency; 13911 } 13912 #endif /* MDBX_ENV_CHECKPID */ 13913 13914 /* txn_end() mode for a commit which writes nothing */ 13915 unsigned end_mode = 13916 MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; 13917 if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) 13918 goto done; 13919 13920 if (txn->mt_child) { 13921 rc = mdbx_txn_commit_ex(txn->mt_child, NULL); 13922 tASSERT(txn, txn->mt_child == NULL); 13923 if (unlikely(rc != MDBX_SUCCESS)) 13924 goto fail; 13925 } 13926 13927 if (unlikely(txn != env->me_txn)) { 13928 DEBUG("%s", "attempt to commit unknown transaction"); 13929 rc = MDBX_EINVAL; 13930 goto fail; 13931 } 13932 13933 if (txn->mt_parent) { 13934 tASSERT(txn, audit_ex(txn, 0, false) == 0); 13935 eASSERT(env, txn != env->me_txn0); 13936 MDBX_txn *const parent = txn->mt_parent; 13937 eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); 13938 eASSERT(env, parent->mt_child == txn && 13939 (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); 13940 eASSERT(env, dirtylist_check(txn)); 13941 13942 if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && 13943 parent->mt_numdbs == txn->mt_numdbs) { 13944 for (int i = txn->mt_numdbs; --i >= 0;) { 13945 tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); 13946 if ((txn->mt_dbistate[i] & DBI_STALE) && 13947 !(parent->mt_dbistate[i] & DBI_STALE)) 13948 tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], 13949 sizeof(MDBX_db)) == 0); 13950 } 13951 13952 tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo, 13953 sizeof(parent->mt_geo)) == 0); 13954 tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, 13955 sizeof(parent->mt_canary)) == 0); 13956 tASSERT(txn, 13957 !txn->tw.spill_pages || MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); 13958 tASSERT(txn, txn->tw.loose_count == 0); 13959 13960 /* fast completion of pure nested transaction */ 13961 end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; 13962 goto done; 13963 } 13964 13965 /* Preserve space for spill list to avoid parent's state corruption 13966 * if allocation fails. 
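 * In other words, the nested-commit path below follows a reserve-then-merge
 * pattern: pnl_need() and dpl_reserve() grow the parent's lists up front,
 * so that the later pnl_merge() inside txn_merge() cannot fail midway and
 * leave the parent with partially moved page lists.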
*/ 13967 const unsigned parent_retired_len = 13968 (unsigned)(uintptr_t)parent->tw.retired_pages; 13969 tASSERT(txn, parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); 13970 const unsigned retired_delta = 13971 MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; 13972 if (retired_delta) { 13973 rc = pnl_need(&txn->tw.reclaimed_pglist, retired_delta); 13974 if (unlikely(rc != MDBX_SUCCESS)) 13975 goto fail; 13976 } 13977 13978 if (txn->tw.spill_pages) { 13979 if (parent->tw.spill_pages) { 13980 rc = pnl_need(&parent->tw.spill_pages, 13981 MDBX_PNL_SIZE(txn->tw.spill_pages)); 13982 if (unlikely(rc != MDBX_SUCCESS)) 13983 goto fail; 13984 } 13985 spill_purge(txn); 13986 } 13987 13988 if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > 13989 parent->tw.dirtylist->detent && 13990 !dpl_reserve(parent, txn->tw.dirtylist->length + 13991 parent->tw.dirtylist->length))) { 13992 rc = MDBX_ENOMEM; 13993 goto fail; 13994 } 13995 13996 //------------------------------------------------------------------------- 13997 13998 parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; 13999 txn->tw.lifo_reclaimed = NULL; 14000 14001 parent->tw.retired_pages = txn->tw.retired_pages; 14002 txn->tw.retired_pages = NULL; 14003 14004 pnl_free(parent->tw.reclaimed_pglist); 14005 parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; 14006 txn->tw.reclaimed_pglist = NULL; 14007 parent->tw.last_reclaimed = txn->tw.last_reclaimed; 14008 14009 parent->mt_geo = txn->mt_geo; 14010 parent->mt_canary = txn->mt_canary; 14011 parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; 14012 14013 /* Move loose pages to parent */ 14014 #if MDBX_ENABLE_REFUND 14015 parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; 14016 #endif /* MDBX_ENABLE_REFUND */ 14017 parent->tw.loose_count = txn->tw.loose_count; 14018 parent->tw.loose_pages = txn->tw.loose_pages; 14019 14020 /* Merge our cursors into parent's and close them */ 14021 cursors_eot(txn, true); 14022 end_mode |= MDBX_END_EOTDONE; 14023 14024 /* Update parent's DBs array */ 14025 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); 14026 parent->mt_numdbs = txn->mt_numdbs; 14027 for (unsigned i = 0; i < txn->mt_numdbs; i++) { 14028 /* preserve parent's status */ 14029 const uint8_t state = 14030 txn->mt_dbistate[i] | 14031 (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); 14032 DEBUG("db %u dbi-state %s 0x%02x -> 0x%02x", i, 14033 (parent->mt_dbistate[i] != state) ? "update" : "still", 14034 parent->mt_dbistate[i], state); 14035 parent->mt_dbistate[i] = state; 14036 } 14037 14038 ts_1 = latency ? osal_monotime() : 0; 14039 txn_merge(parent, txn, parent_retired_len); 14040 ts_2 = latency ? osal_monotime() : 0; 14041 env->me_txn = parent; 14042 parent->mt_child = NULL; 14043 tASSERT(parent, dirtylist_check(parent)); 14044 14045 #if MDBX_ENABLE_REFUND 14046 txn_refund(parent); 14047 if (ASSERT_ENABLED()) { 14048 /* Check parent's loose pages not suitable for refund */ 14049 for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) 14050 tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && 14051 lp->mp_pgno + 1 < parent->mt_next_pgno); 14052 /* Check parent's reclaimed pages not suitable for refund */ 14053 if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) 14054 tASSERT(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < 14055 parent->mt_next_pgno); 14056 } 14057 #endif /* MDBX_ENABLE_REFUND */ 14058 14059 ts_4 = ts_3 = latency ? 
osal_monotime() : 0;
14060 txn->mt_signature = 0;
14061 osal_free(txn);
14062 tASSERT(parent, audit_ex(parent, 0, false) == 0);
14063 rc = MDBX_SUCCESS;
14064 goto provide_latency;
14065 }
14066
14067 tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
14068 (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
14069 : txn->mt_env->me_options.dp_limit));
14070 cursors_eot(txn, false);
14071 end_mode |= MDBX_END_EOTDONE;
14072
14073 if (txn->tw.dirtylist->length == 0 &&
14074 (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
14075 for (int i = txn->mt_numdbs; --i >= 0;)
14076 tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
14077 #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT
14078 rc = txn_end(txn, end_mode);
14079 if (unlikely(rc != MDBX_SUCCESS))
14080 goto fail;
14081 rc = MDBX_RESULT_TRUE;
14082 goto provide_latency;
14083 #else
14084 goto done;
14085 #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */
14086 }
14087
14088 DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO
14089 "/%" PRIaPGNO,
14090 txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
14091 txn->mt_dbs[FREE_DBI].md_root);
14092
14093 /* Update DB root pointers */
14094 if (txn->mt_numdbs > CORE_DBS) {
14095 MDBX_cursor_couple couple;
14096 MDBX_val data;
14097 data.iov_len = sizeof(MDBX_db);
14098
14099 rc = cursor_init(&couple.outer, txn, MAIN_DBI);
14100 if (unlikely(rc != MDBX_SUCCESS))
14101 goto fail;
14102 for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) {
14103 if (txn->mt_dbistate[i] & DBI_DIRTY) {
14104 MDBX_db *db = &txn->mt_dbs[i];
14105 DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN
14106 " -> %" PRIaTXN,
14107 i, db->md_mod_txnid, txn->mt_txnid);
14108 /* mod_txnid may be greater than front after committing nested transactions */
14109 db->md_mod_txnid = txn->mt_txnid;
14110 data.iov_base = db;
14111 WITH_CURSOR_TRACKING(couple.outer,
14112 rc = mdbx_cursor_put(&couple.outer,
14113 &txn->mt_dbxs[i].md_name,
14114 &data, F_SUBDATA));
14115 if (unlikely(rc != MDBX_SUCCESS))
14116 goto fail;
14117 }
14118 }
14119 }
14120
14121 ts_1 = latency ? osal_monotime() : 0;
14122 gcu_context_t gcu_ctx;
14123 rc = gcu_context_init(txn, &gcu_ctx);
14124 if (unlikely(rc != MDBX_SUCCESS))
14125 goto fail;
14126 rc = update_gc(txn, &gcu_ctx);
14127 if (unlikely(rc != MDBX_SUCCESS))
14128 goto fail;
14129
14130 txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)
14131 ? txn->mt_txnid
14132 : txn->mt_dbs[FREE_DBI].md_mod_txnid;
14133
14134 txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY)
14135 ? txn->mt_txnid
14136 : txn->mt_dbs[MAIN_DBI].md_mod_txnid;
14137
14138 ts_2 = latency ? osal_monotime() : 0;
14139 if (AUDIT_ENABLED()) {
14140 rc = audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true);
14141 const uint64_t audit_end = osal_monotime();
14142 audit_duration = osal_monotime_to_16dot16(audit_end - ts_2);
14143 ts_2 = audit_end;
14144 if (unlikely(rc != MDBX_SUCCESS))
14145 goto fail;
14146 }
14147
14148 struct iov_ctx write_ctx;
14149 iov_init(txn, &write_ctx);
14150 rc = txn_write(txn, &write_ctx);
14151 if (likely(rc == MDBX_SUCCESS))
14152 iov_done(txn, &write_ctx);
14153 /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
14154 ts_3 = latency ?
osal_monotime() : 0; 14155 14156 if (likely(rc == MDBX_SUCCESS)) { 14157 const meta_ptr_t head = meta_recent(env, &txn->tw.troika); 14158 MDBX_meta meta; 14159 memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); 14160 meta.mm_extra_flags = head.ptr_c->mm_extra_flags; 14161 meta.mm_validator_id = head.ptr_c->mm_validator_id; 14162 meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; 14163 unaligned_poke_u64(4, meta.mm_pages_retired, 14164 unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + 14165 MDBX_PNL_SIZE(txn->tw.retired_pages)); 14166 meta.mm_geo = txn->mt_geo; 14167 meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; 14168 meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; 14169 meta.mm_canary = txn->mt_canary; 14170 14171 txnid_t commit_txnid = txn->mt_txnid; 14172 #if MDBX_ENABLE_BIGFOOT 14173 if (gcu_ctx.bigfoot > txn->mt_txnid) { 14174 commit_txnid = gcu_ctx.bigfoot; 14175 TRACE("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid, 14176 (unsigned)(commit_txnid - txn->mt_txnid)); 14177 } 14178 #endif 14179 meta_set_txnid(env, &meta, commit_txnid); 14180 14181 rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, 14182 &meta, &txn->tw.troika); 14183 } 14184 ts_4 = latency ? osal_monotime() : 0; 14185 if (unlikely(rc != MDBX_SUCCESS)) { 14186 env->me_flags |= MDBX_FATAL_ERROR; 14187 goto fail; 14188 } 14189 14190 end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; 14191 14192 done: 14193 rc = txn_end(txn, end_mode); 14194 14195 provide_latency: 14196 if (latency) { 14197 latency->audit = audit_duration; 14198 latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; 14199 latency->gc = (ts_1 && ts_2) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; 14200 latency->write = (ts_2 && ts_3) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; 14201 latency->sync = (ts_3 && ts_4) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; 14202 const uint64_t ts_5 = osal_monotime(); 14203 latency->ending = ts_4 ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; 14204 latency->whole = osal_monotime_to_16dot16(ts_5 - ts_0); 14205 } 14206 return rc; 14207 14208 fail: 14209 mdbx_txn_abort(txn); 14210 goto provide_latency; 14211 } 14212 14213 static int validate_meta(MDBX_env *env, MDBX_meta *const meta, 14214 const MDBX_page *const page, 14215 const unsigned meta_number, unsigned *guess_pagesize) { 14216 const uint64_t magic_and_version = 14217 unaligned_peek_u64(4, &meta->mm_magic_and_version); 14218 if (unlikely(magic_and_version != MDBX_DATA_MAGIC && 14219 magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && 14220 magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { 14221 ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number, 14222 magic_and_version); 14223 return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID 14224 : MDBX_VERSION_MISMATCH; 14225 } 14226 14227 if (unlikely(page->mp_pgno != meta_number)) { 14228 ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno); 14229 return MDBX_INVALID; 14230 } 14231 14232 if (unlikely(page->mp_flags != P_META)) { 14233 ERROR("page #%u not a meta-page", meta_number); 14234 return MDBX_INVALID; 14235 } 14236 14237 /* LY: check pagesize */ 14238 if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || 14239 meta->mm_psize > MAX_PAGESIZE)) { 14240 WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, 14241 meta->mm_psize); 14242 return is_powerof2(meta->mm_psize) ? 
MDBX_VERSION_MISMATCH : MDBX_INVALID; 14243 } 14244 14245 if (guess_pagesize && *guess_pagesize != meta->mm_psize) { 14246 *guess_pagesize = meta->mm_psize; 14247 VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize); 14248 } 14249 14250 const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a); 14251 if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) { 14252 WARNING("meta[%u] not completely updated, skip it", meta_number); 14253 return MDBX_RESULT_TRUE; 14254 } 14255 14256 /* LY: check signature as a checksum */ 14257 if (META_IS_STEADY(meta) && 14258 unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) { 14259 WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 14260 "), skip it", 14261 meta_number, unaligned_peek_u64(4, &meta->mm_sign), 14262 meta_sign(meta)); 14263 return MDBX_RESULT_TRUE; 14264 } 14265 14266 DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO 14267 ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO 14268 " +%u -%u, txn_id %" PRIaTXN ", %s", 14269 page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, 14270 meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, 14271 meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv), 14272 pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta)); 14273 14274 if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { 14275 WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, 14276 txnid); 14277 return MDBX_RESULT_TRUE; 14278 } 14279 14280 /* LY: check min-pages value */ 14281 if (unlikely(meta->mm_geo.lower < MIN_PAGENO || 14282 meta->mm_geo.lower > MAX_PAGENO + 1)) { 14283 WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", 14284 meta_number, meta->mm_geo.lower); 14285 return MDBX_INVALID; 14286 } 14287 14288 /* LY: check max-pages value */ 14289 if (unlikely(meta->mm_geo.upper < MIN_PAGENO || 14290 meta->mm_geo.upper > MAX_PAGENO + 1 || 14291 meta->mm_geo.upper < meta->mm_geo.lower)) { 14292 WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", 14293 meta_number, meta->mm_geo.upper); 14294 return MDBX_INVALID; 14295 } 14296 14297 /* LY: check last_pgno */ 14298 if (unlikely(meta->mm_geo.next < MIN_PAGENO || 14299 meta->mm_geo.next - 1 > MAX_PAGENO)) { 14300 WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", 14301 meta_number, meta->mm_geo.next); 14302 return MDBX_CORRUPTED; 14303 } 14304 14305 /* LY: check filesize & used_bytes */ 14306 const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; 14307 if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { 14308 /* Here could be a race with DB-shrinking performed by other process */ 14309 int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); 14310 if (unlikely(err != MDBX_SUCCESS)) 14311 return err; 14312 if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { 14313 WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 14314 "), skip it", 14315 meta_number, used_bytes, env->me_dxb_mmap.filesize); 14316 return MDBX_CORRUPTED; 14317 } 14318 } 14319 if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO || 14320 used_bytes > MAX_MAPSIZE)) { 14321 WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", 14322 meta_number, used_bytes); 14323 return MDBX_TOO_LARGE; 14324 } 14325 14326 /* LY: check mapsize limits */ 14327 pgno_t geo_lower = meta->mm_geo.lower; 14328 uint64_t mapsize_min = geo_lower * (uint64_t)meta->mm_psize; 14329 
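/* Worked example (assumes a 4 KiB page size purely for illustration): with
 * mm_geo.lower == 16 pages, mapsize_min = 16 * 4096 = 65536 bytes. The
 * checks below reject a meta whose minimal mapping falls outside the
 * build-time [MIN_MAPSIZE, MAX_MAPSIZE] window, except that an oversized
 * lower bound may be clamped to fit a 32-bit build (the MAX_MAPSIZE64
 * branch). */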
STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE); 14330 STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); 14331 STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MIN_PAGESIZE % (4ul << 20) == 0); 14332 if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { 14333 if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && 14334 mapsize_min <= MAX_MAPSIZE64) { 14335 eASSERT(env, 14336 meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); 14337 WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " 14338 "but size of used space still acceptable (%" PRIu64 ")", 14339 meta_number, mapsize_min, used_bytes); 14340 geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize); 14341 if (geo_lower > MAX_PAGENO + 1) { 14342 geo_lower = MAX_PAGENO + 1; 14343 mapsize_min = geo_lower * (uint64_t)meta->mm_psize; 14344 } 14345 WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO 14346 " instead of wrong %" PRIaPGNO 14347 ", will be corrected on next commit(s)", 14348 meta_number, "lower", geo_lower, meta->mm_geo.lower); 14349 meta->mm_geo.lower = geo_lower; 14350 } else { 14351 WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", 14352 meta_number, mapsize_min); 14353 return MDBX_VERSION_MISMATCH; 14354 } 14355 } 14356 14357 pgno_t geo_upper = meta->mm_geo.upper; 14358 uint64_t mapsize_max = geo_upper * (uint64_t)meta->mm_psize; 14359 STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); 14360 if (unlikely(mapsize_max > MAX_MAPSIZE || 14361 (MAX_PAGENO + 1) < 14362 ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / 14363 (size_t)meta->mm_psize)) { 14364 if (mapsize_max > MAX_MAPSIZE64) { 14365 WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", 14366 meta_number, mapsize_max); 14367 return MDBX_VERSION_MISMATCH; 14368 } 14369 /* allow to open large DB from a 32-bit environment */ 14370 eASSERT(env, 14371 meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); 14372 WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), " 14373 "but size of used space still acceptable (%" PRIu64 ")", 14374 meta_number, mapsize_max, used_bytes); 14375 geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize); 14376 if (geo_upper > MAX_PAGENO + 1) { 14377 geo_upper = MAX_PAGENO + 1; 14378 mapsize_max = geo_upper * (uint64_t)meta->mm_psize; 14379 } 14380 WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO 14381 " instead of wrong %" PRIaPGNO 14382 ", will be corrected on next commit(s)", 14383 meta_number, "upper", geo_upper, meta->mm_geo.upper); 14384 meta->mm_geo.upper = geo_upper; 14385 } 14386 14387 /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. 14388 * 14389 * Copy-with-compaction by previous version of libmdbx could produce DB-file 14390 * less than meta.geo.lower bound, in case actual filling is low or no data 14391 * at all. This is not a problem as there is no damage or loss of data. 14392 * Therefore it is better not to consider such situation as an error, but 14393 * silently correct it. 
*/ 14394 pgno_t geo_now = meta->mm_geo.now; 14395 if (geo_now < geo_lower) 14396 geo_now = geo_lower; 14397 if (geo_now > geo_upper && meta->mm_geo.next <= geo_upper) 14398 geo_now = geo_upper; 14399 14400 if (unlikely(meta->mm_geo.next > geo_now)) { 14401 WARNING("meta[%u] next-pageno (%" PRIaPGNO 14402 ") is beyond end-pgno (%" PRIaPGNO "), skip it", 14403 meta_number, meta->mm_geo.next, geo_now); 14404 return MDBX_CORRUPTED; 14405 } 14406 if (meta->mm_geo.now != geo_now) { 14407 WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO 14408 " instead of wrong %" PRIaPGNO 14409 ", will be corrected on next commit(s)", 14410 meta_number, "now", geo_now, meta->mm_geo.now); 14411 meta->mm_geo.now = geo_now; 14412 } 14413 14414 /* GC */ 14415 if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) { 14416 if (unlikely(meta->mm_dbs[FREE_DBI].md_branch_pages || 14417 meta->mm_dbs[FREE_DBI].md_depth || 14418 meta->mm_dbs[FREE_DBI].md_entries || 14419 meta->mm_dbs[FREE_DBI].md_leaf_pages || 14420 meta->mm_dbs[FREE_DBI].md_overflow_pages)) { 14421 WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); 14422 return MDBX_CORRUPTED; 14423 } 14424 } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) { 14425 WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, 14426 "GC", meta->mm_dbs[FREE_DBI].md_root); 14427 return MDBX_CORRUPTED; 14428 } 14429 14430 /* MainDB */ 14431 if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) { 14432 if (unlikely(meta->mm_dbs[MAIN_DBI].md_branch_pages || 14433 meta->mm_dbs[MAIN_DBI].md_depth || 14434 meta->mm_dbs[MAIN_DBI].md_entries || 14435 meta->mm_dbs[MAIN_DBI].md_leaf_pages || 14436 meta->mm_dbs[MAIN_DBI].md_overflow_pages)) { 14437 WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); 14438 return MDBX_CORRUPTED; 14439 } 14440 } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) { 14441 WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, 14442 "MainDB", meta->mm_dbs[MAIN_DBI].md_root); 14443 return MDBX_CORRUPTED; 14444 } 14445 14446 if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) { 14447 WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", 14448 meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); 14449 return MDBX_CORRUPTED; 14450 } 14451 14452 if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) { 14453 WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", 14454 meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); 14455 return MDBX_CORRUPTED; 14456 } 14457 14458 return MDBX_SUCCESS; 14459 } 14460 14461 static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, 14462 MDBX_meta *dest) { 14463 *dest = *meta; 14464 return validate_meta(env, dest, data_page(meta), 14465 bytes2pgno(env, (uint8_t *)meta - env->me_map), nullptr); 14466 } 14467 14468 /* Read the environment parameters of a DB environment 14469 * before mapping it into memory. */ 14470 __cold static int read_header(MDBX_env *env, MDBX_meta *dest, 14471 const int lck_exclusive, 14472 const mdbx_mode_t mode_bits) { 14473 int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); 14474 if (unlikely(rc != MDBX_SUCCESS)) 14475 return rc; 14476 14477 memset(dest, 0, sizeof(MDBX_meta)); 14478 unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); 14479 rc = MDBX_CORRUPTED; 14480 14481 /* Read twice all meta pages so we can find the latest one. 
*/ 14482 unsigned loop_limit = NUM_METAS * 2; 14483 /* We don't know the page size on first time. So, just guess it. */ 14484 unsigned guess_pagesize = 0; 14485 for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { 14486 const unsigned meta_number = loop_count % NUM_METAS; 14487 const unsigned offset = (guess_pagesize ? guess_pagesize 14488 : (loop_count > NUM_METAS) ? env->me_psize 14489 : env->me_os_psize) * 14490 meta_number; 14491 14492 char buffer[MIN_PAGESIZE]; 14493 unsigned retryleft = 42; 14494 while (1) { 14495 TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, 14496 offset, MIN_PAGESIZE, retryleft); 14497 int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); 14498 if (err != MDBX_SUCCESS) { 14499 if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && 14500 env->me_dxb_mmap.filesize == 0 && 14501 mode_bits /* non-zero for DB creation */ != 0) 14502 NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); 14503 else 14504 ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, 14505 mdbx_strerror(err)); 14506 return err; 14507 } 14508 14509 char again[MIN_PAGESIZE]; 14510 err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); 14511 if (err != MDBX_SUCCESS) { 14512 ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, 14513 mdbx_strerror(err)); 14514 return err; 14515 } 14516 14517 if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) 14518 break; 14519 14520 VERBOSE("meta[%u] was updated, re-read it", meta_number); 14521 } 14522 14523 if (!retryleft) { 14524 ERROR("meta[%u] is too volatile, skip it", meta_number); 14525 continue; 14526 } 14527 14528 MDBX_page *const page = (MDBX_page *)buffer; 14529 MDBX_meta *const meta = page_meta(page); 14530 rc = validate_meta(env, meta, page, meta_number, &guess_pagesize); 14531 if (rc != MDBX_SUCCESS) 14532 continue; 14533 14534 bool latch; 14535 if (env->me_stuck_meta >= 0) 14536 latch = (meta_number == (unsigned)env->me_stuck_meta); 14537 else if (meta_bootid_match(meta)) 14538 latch = meta_choice_recent( 14539 meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), 14540 dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); 14541 else 14542 latch = meta_choice_steady( 14543 meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), 14544 dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); 14545 if (latch) { 14546 *dest = *meta; 14547 if (!lck_exclusive && !META_IS_STEADY(dest)) 14548 loop_limit += 1; /* LY: should re-read to hush race with update */ 14549 VERBOSE("latch meta[%u]", meta_number); 14550 } 14551 } 14552 14553 if (dest->mm_psize == 0 || 14554 (env->me_stuck_meta < 0 && 14555 !(META_IS_STEADY(dest) || 14556 meta_weak_acceptable(env, dest, lck_exclusive)))) { 14557 ERROR("%s", "no usable meta-pages, database is corrupted"); 14558 if (rc == MDBX_SUCCESS) { 14559 /* TODO: try to restore the database by fully checking b-tree structure 14560 * for the each meta page, if the corresponding option was given */ 14561 return MDBX_CORRUPTED; 14562 } 14563 return rc; 14564 } 14565 14566 return MDBX_SUCCESS; 14567 } 14568 14569 __cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, 14570 unsigned num) { 14571 ENSURE(env, is_powerof2(env->me_psize)); 14572 ENSURE(env, env->me_psize >= MIN_PAGESIZE); 14573 ENSURE(env, env->me_psize <= MAX_PAGESIZE); 14574 ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); 14575 ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); 14576 ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); 
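/* Taken together, this ENSURE chain pins the requested geometry to
 * MIN_MAPSIZE <= lower <= now <= upper <= MAX_MAPSIZE, with a power-of-2
 * pagesize within [MIN_PAGESIZE, MAX_PAGESIZE], so the meta-page model
 * below is always built from a mappable configuration. */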
14577 ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); 14578 14579 memset(model, 0, env->me_psize); 14580 model->mp_pgno = num; 14581 model->mp_flags = P_META; 14582 MDBX_meta *const model_meta = page_meta(model); 14583 unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); 14584 14585 model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); 14586 model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); 14587 model_meta->mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); 14588 model_meta->mm_geo.shrink_pv = 14589 pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); 14590 model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); 14591 model_meta->mm_geo.next = NUM_METAS; 14592 14593 ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO); 14594 ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); 14595 ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); 14596 ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); 14597 ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO); 14598 ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); 14599 ENSURE(env, model_meta->mm_geo.grow_pv == 14600 pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); 14601 ENSURE(env, model_meta->mm_geo.shrink_pv == 14602 pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); 14603 14604 model_meta->mm_psize = env->me_psize; 14605 model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY; 14606 model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; 14607 model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; 14608 meta_set_txnid(env, model_meta, MIN_TXNID + num); 14609 unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); 14610 eASSERT(env, coherency_check_meta(env, model_meta, true)); 14611 return (MDBX_page *)((uint8_t *)model + env->me_psize); 14612 } 14613 14614 /* Fill in most of the zeroed meta-pages for an empty database environment. 14615 * Return pointer to recently (head) meta-page. */ 14616 __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { 14617 MDBX_page *page0 = (MDBX_page *)buffer; 14618 MDBX_page *page1 = meta_model(env, page0, 0); 14619 MDBX_page *page2 = meta_model(env, page1, 1); 14620 meta_model(env, page2, 2); 14621 return page_meta(page2); 14622 } 14623 14624 #if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) 14625 static size_t madvise_threshold(const MDBX_env *env, 14626 const size_t largest_bytes) { 14627 /* TODO: use options */ 14628 const unsigned factor = 9; 14629 const size_t threshold = (largest_bytes < (65536ul << factor)) 14630 ? 65536 /* minimal threshold */ 14631 : (largest_bytes > (MEGABYTE * 4 << factor)) 14632 ? 
MEGABYTE * 4 /* maximal threshold */ 14633 : largest_bytes >> factor; 14634 return bytes_align2os_bytes(env, threshold); 14635 } 14636 #endif /* MDBX_ENABLE_MADVISE */ 14637 14638 static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, 14639 meta_troika_t *const troika) { 14640 eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); 14641 const MDBX_meta *const meta0 = METAPAGE(env, 0); 14642 const MDBX_meta *const meta1 = METAPAGE(env, 1); 14643 const MDBX_meta *const meta2 = METAPAGE(env, 2); 14644 const meta_ptr_t head = meta_recent(env, troika); 14645 int rc; 14646 14647 eASSERT(env, 14648 pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); 14649 eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); 14650 eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now); 14651 14652 if (flags & MDBX_SAFE_NOSYNC) { 14653 /* Check auto-sync conditions */ 14654 const pgno_t autosync_threshold = 14655 atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); 14656 const uint64_t autosync_period = 14657 atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); 14658 if ((autosync_threshold && 14659 atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= 14660 autosync_threshold) || 14661 (autosync_period && 14662 osal_monotime() - 14663 atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= 14664 autosync_period)) 14665 flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ 14666 } 14667 14668 pgno_t shrink = 0; 14669 if (flags & MDBX_SHRINK_ALLOWED) { 14670 /* LY: check conditions to discard unused pages */ 14671 const pgno_t largest_pgno = find_largest_snapshot( 14672 env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) 14673 ? head.ptr_c->mm_geo.next 14674 : pending->mm_geo.next); 14675 eASSERT(env, largest_pgno >= NUM_METAS); 14676 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 14677 const pgno_t edge = env->me_poison_edge; 14678 if (edge > largest_pgno) { 14679 env->me_poison_edge = largest_pgno; 14680 VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), 14681 pgno2bytes(env, edge - largest_pgno)); 14682 MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + 14683 pgno2bytes(env, largest_pgno), 14684 pgno2bytes(env, edge - largest_pgno)); 14685 } 14686 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ 14687 #if MDBX_ENABLE_MADVISE && \ 14688 (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) 14689 const size_t largest_bytes = pgno2bytes(env, largest_pgno); 14690 /* threshold to avoid unreasonable frequent madvise() calls */ 14691 const size_t threshold = madvise_threshold(env, largest_bytes); 14692 const size_t discard_edge_bytes = bytes_align2os_bytes( 14693 env, ((MDBX_RDONLY & 14694 (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak 14695 : env->me_flags)) 14696 ? 
largest_bytes 14697 : largest_bytes + threshold)); 14698 const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); 14699 const pgno_t prev_discarded_pgno = 14700 atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); 14701 if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, threshold)) { 14702 NOTICE("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, 14703 prev_discarded_pgno); 14704 atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, 14705 mo_Relaxed); 14706 const size_t prev_discarded_bytes = 14707 ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); 14708 ENSURE(env, prev_discarded_bytes > discard_edge_bytes); 14709 #if defined(MADV_DONTNEED) 14710 int advise = MADV_DONTNEED; 14711 #if defined(MADV_FREE) && \ 14712 0 /* MADV_FREE works for only anonymous vma at the moment */ 14713 if ((env->me_flags & MDBX_WRITEMAP) && linux_kernel_version > 0x04050000) 14714 advise = MADV_FREE; 14715 #endif /* MADV_FREE */ 14716 int err = madvise(env->me_map + discard_edge_bytes, 14717 prev_discarded_bytes - discard_edge_bytes, advise) 14718 ? ignore_enosys(errno) 14719 : MDBX_SUCCESS; 14720 #else 14721 int err = ignore_enosys(posix_madvise( 14722 env->me_map + discard_edge_bytes, 14723 prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); 14724 #endif 14725 if (unlikely(MDBX_IS_ERROR(err))) 14726 return err; 14727 } 14728 #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ 14729 14730 /* LY: check conditions to shrink datafile */ 14731 const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; 14732 pgno_t shrink_step = 0; 14733 if (pending->mm_geo.shrink_pv && 14734 pending->mm_geo.now - pending->mm_geo.next > 14735 (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { 14736 if (pending->mm_geo.now > largest_pgno && 14737 pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { 14738 pgno_t grow_step = 0; 14739 const pgno_t aligner = 14740 pending->mm_geo.grow_pv 14741 ? (grow_step = pv2pages(pending->mm_geo.grow_pv)) 14742 : shrink_step; 14743 const pgno_t with_backlog_gap = largest_pgno + backlog_gap; 14744 const pgno_t aligned = pgno_align2os_pgno( 14745 env, with_backlog_gap + aligner - with_backlog_gap % aligner); 14746 const pgno_t bottom = 14747 (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; 14748 if (pending->mm_geo.now > bottom) { 14749 if (TROIKA_HAVE_STEADY(troika)) 14750 /* force steady, but only if steady-checkpoint is present */ 14751 flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; 14752 shrink = pending->mm_geo.now - bottom; 14753 pending->mm_geo.now = bottom; 14754 if (unlikely(head.txnid == pending->unsafe_txnid)) { 14755 const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); 14756 NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, 14757 pending->unsafe_txnid, txnid); 14758 ENSURE(env, !env->me_txn0 || 14759 (env->me_txn0->mt_owner != osal_thread_self() && 14760 !env->me_txn)); 14761 if (unlikely(txnid > MAX_TXNID)) { 14762 rc = MDBX_TXN_FULL; 14763 ERROR("txnid overflow, raise %d", rc); 14764 goto fail; 14765 } 14766 meta_set_txnid(env, pending, txnid); 14767 eASSERT(env, coherency_check_meta(env, pending, true)); 14768 } 14769 } 14770 } 14771 } 14772 } 14773 14774 /* LY: step#1 - sync previously written/updated data-pages */ 14775 rc = MDBX_RESULT_FALSE /* carry steady */; 14776 if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { 14777 eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); 14778 enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; 14779 if ((flags & MDBX_SAFE_NOSYNC) == 0) { 14780 mode_bits = MDBX_SYNC_DATA; 14781 if (pending->mm_geo.next > 14782 meta_prefer_steady(env, troika).ptr_c->mm_geo.now) 14783 mode_bits |= MDBX_SYNC_SIZE; 14784 if (flags & MDBX_NOMETASYNC) 14785 mode_bits |= MDBX_SYNC_IODQ; 14786 } 14787 #if MDBX_ENABLE_PGOP_STAT 14788 env->me_lck->mti_pgop_stat.wops.weak += 1; 14789 #endif /* MDBX_ENABLE_PGOP_STAT */ 14790 if (flags & MDBX_WRITEMAP) 14791 rc = 14792 osal_msync(&env->me_dxb_mmap, 0, 14793 pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); 14794 else 14795 rc = osal_fsync(env->me_lazy_fd, mode_bits); 14796 if (unlikely(rc != MDBX_SUCCESS)) 14797 goto fail; 14798 rc = (flags & MDBX_SAFE_NOSYNC) ? 
MDBX_RESULT_TRUE /* carry non-steady */ 14799 : MDBX_RESULT_FALSE /* carry steady */; 14800 } 14801 eASSERT(env, coherency_check_meta(env, pending, true)); 14802 14803 /* Steady or Weak */ 14804 if (rc == MDBX_RESULT_FALSE /* carry steady */) { 14805 atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), 14806 mo_Relaxed); 14807 unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); 14808 atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); 14809 } else { 14810 assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); 14811 unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); 14812 } 14813 14814 const bool legal4overwrite = 14815 head.txnid == pending->unsafe_txnid && 14816 memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) == 14817 0 && 14818 memcmp(&head.ptr_c->mm_canary, &pending->mm_canary, 14819 sizeof(pending->mm_canary)) == 0 && 14820 memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) == 14821 0; 14822 MDBX_meta *target = nullptr; 14823 if (head.txnid == pending->unsafe_txnid) { 14824 ENSURE(env, legal4overwrite); 14825 if (!head.is_steady && META_IS_STEADY(pending)) 14826 target = (MDBX_meta *)head.ptr_c; 14827 else { 14828 WARNING("%s", "skip update meta"); 14829 return MDBX_SUCCESS; 14830 } 14831 } else { 14832 const unsigned troika_tail = troika->tail_and_flags & 3; 14833 ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent && 14834 troika_tail != troika->prefer_steady); 14835 target = (MDBX_meta *)meta_tail(env, troika).ptr_c; 14836 } 14837 14838 /* LY: step#2 - update meta-page. */ 14839 DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO 14840 ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO 14841 " +%u -%u, txn_id %" PRIaTXN ", %s", 14842 data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, 14843 pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, 14844 pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, 14845 pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), 14846 pending->unsafe_txnid, durable_caption(pending)); 14847 14848 DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, 14849 (meta0 == head.ptr_c) ? "head" 14850 : (meta0 == target) ? "tail" 14851 : "stay", 14852 durable_caption(meta0), constmeta_txnid(meta0), 14853 meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); 14854 DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, 14855 (meta1 == head.ptr_c) ? "head" 14856 : (meta1 == target) ? "tail" 14857 : "stay", 14858 durable_caption(meta1), constmeta_txnid(meta1), 14859 meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); 14860 DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, 14861 (meta2 == head.ptr_c) ? "head" 14862 : (meta2 == target) ? 
"tail" 14863 : "stay", 14864 durable_caption(meta2), constmeta_txnid(meta2), 14865 meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); 14866 14867 eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || 14868 (META_IS_STEADY(pending) && !META_IS_STEADY(meta0))); 14869 eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || 14870 (META_IS_STEADY(pending) && !META_IS_STEADY(meta1))); 14871 eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || 14872 (META_IS_STEADY(pending) && !META_IS_STEADY(meta2))); 14873 14874 eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); 14875 ENSURE(env, target == head.ptr_c || 14876 constmeta_txnid(target) < pending->unsafe_txnid); 14877 #if MDBX_ENABLE_PGOP_STAT 14878 env->me_lck->mti_pgop_stat.wops.weak += 1; 14879 #endif /* MDBX_ENABLE_PGOP_STAT */ 14880 if (flags & MDBX_WRITEMAP) { 14881 jitter4testing(true); 14882 if (likely(target != head.ptr_c)) { 14883 /* LY: 'invalidate' the meta. */ 14884 meta_update_begin(env, target, pending->unsafe_txnid); 14885 unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK); 14886 #ifndef NDEBUG 14887 /* debug: provoke failure to catch a violators, but don't touch mm_psize 14888 * to allow readers catch actual pagesize. */ 14889 uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; 14890 uint8_t *provoke_end = (uint8_t *)&target->mm_sign; 14891 memset(provoke_begin, 0xCC, provoke_end - provoke_begin); 14892 jitter4testing(false); 14893 #endif 14894 14895 /* LY: update info */ 14896 target->mm_geo = pending->mm_geo; 14897 target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; 14898 target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; 14899 target->mm_canary = pending->mm_canary; 14900 memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); 14901 jitter4testing(true); 14902 14903 /* LY: 'commit' the meta */ 14904 meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); 14905 jitter4testing(true); 14906 eASSERT(env, coherency_check_meta(env, target, true)); 14907 } else { 14908 /* dangerous case (target == head), only mm_sign could 14909 * me updated, check assertions once again */ 14910 eASSERT(env, 14911 legal4overwrite && !head.is_steady && META_IS_STEADY(pending)); 14912 } 14913 memcpy(target->mm_sign, pending->mm_sign, 8); 14914 osal_flush_incoherent_cpu_writeback(); 14915 jitter4testing(true); 14916 /* sync meta-pages */ 14917 rc = 14918 osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), 14919 (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE 14920 : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); 14921 if (unlikely(rc != MDBX_SUCCESS)) 14922 goto fail; 14923 } else { 14924 const MDBX_meta undo_meta = *target; 14925 const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) 14926 ? env->me_dsync_fd 14927 : env->me_lazy_fd; 14928 #if MDBX_ENABLE_PGOP_STAT 14929 env->me_lck->mti_pgop_stat.wops.weak += 1; 14930 #endif /* MDBX_ENABLE_PGOP_STAT */ 14931 rc = osal_pwrite(fd, pending, sizeof(MDBX_meta), 14932 (uint8_t *)target - env->me_map); 14933 if (unlikely(rc != MDBX_SUCCESS)) { 14934 undo: 14935 DEBUG("%s", "write failed, disk error?"); 14936 /* On a failure, the pagecache still contains the new data. 14937 * Try write some old data back, to prevent it from being used. 
*/ 14938 osal_pwrite(fd, &undo_meta, sizeof(MDBX_meta), 14939 (uint8_t *)target - env->me_map); 14940 goto fail; 14941 } 14942 osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); 14943 /* sync meta-pages */ 14944 if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { 14945 rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); 14946 if (rc != MDBX_SUCCESS) 14947 goto undo; 14948 } 14949 } 14950 14951 uint64_t timestamp = 0; 14952 while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") { 14953 rc = 14954 coherency_check_written(env, pending->unsafe_txnid, target, ×tamp); 14955 if (likely(rc == MDBX_SUCCESS)) 14956 break; 14957 if (unlikely(rc != MDBX_RESULT_TRUE)) 14958 goto fail; 14959 } 14960 env->me_lck->mti_meta_sync_txnid.weak = 14961 (uint32_t)pending->unsafe_txnid - 14962 ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); 14963 14964 *troika = meta_tap(env); 14965 for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) 14966 if (troika != &txn->tw.troika) 14967 txn->tw.troika = *troika; 14968 14969 /* LY: shrink datafile if needed */ 14970 if (unlikely(shrink)) { 14971 VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", 14972 pending->mm_geo.now, shrink); 14973 rc = map_resize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, 14974 pending->mm_geo.upper); 14975 if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) 14976 goto fail; 14977 eASSERT(env, coherency_check_meta(env, target, true)); 14978 } 14979 14980 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 14981 if (likely(lck)) 14982 /* toggle oldest refresh */ 14983 atomic_store32(&lck->mti_readers_refresh_flag, false, mo_Relaxed); 14984 14985 return MDBX_SUCCESS; 14986 14987 fail: 14988 env->me_flags |= MDBX_FATAL_ERROR; 14989 return rc; 14990 } 14991 14992 static void recalculate_merge_threshold(MDBX_env *env) { 14993 const unsigned bytes = page_space(env); 14994 env->me_merge_threshold = 14995 (uint16_t)(bytes - 14996 (bytes * env->me_options.merge_threshold_16dot16_percent >> 14997 16)); 14998 env->me_merge_threshold_gc = 14999 (uint16_t)(bytes - 15000 ((env->me_options.merge_threshold_16dot16_percent > 19005) 15001 ? 
bytes / 3 /* 33 % */ 15002 : bytes / 4 /* 25 % */)); 15003 } 15004 15005 __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { 15006 STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); 15007 STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); 15008 ENSURE(env, is_powerof2(pagesize)); 15009 ENSURE(env, pagesize >= MIN_PAGESIZE); 15010 ENSURE(env, pagesize <= MAX_PAGESIZE); 15011 env->me_psize = (unsigned)pagesize; 15012 if (env->me_pbuf) { 15013 osal_memalign_free(env->me_pbuf); 15014 env->me_pbuf = nullptr; 15015 } 15016 15017 STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); 15018 STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT); 15019 const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; 15020 ENSURE(env, 15021 maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); 15022 env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; 15023 15024 STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); 15025 STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); 15026 STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) >= BRANCH_NODE_MAX(MIN_PAGESIZE)); 15027 STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) > NODESIZE + 42); 15028 STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); 15029 const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); 15030 const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); 15031 ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && 15032 branch_nodemax % 2 == 0 && 15033 leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && 15034 leaf_nodemax >= branch_nodemax && 15035 leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); 15036 env->me_leaf_nodemax = (unsigned)leaf_nodemax; 15037 env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); 15038 eASSERT(env, pgno2bytes(env, 1) == pagesize); 15039 eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); 15040 recalculate_merge_threshold(env); 15041 15042 const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); 15043 if (!env->me_options.flags.non_auto.dp_limit) { 15044 /* auto-setup dp_limit by "The42" ;-) */ 15045 intptr_t total_ram_pages, avail_ram_pages; 15046 int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); 15047 if (unlikely(err != MDBX_SUCCESS)) 15048 ERROR("mdbx_get_sysraminfo(), rc %d", err); 15049 else { 15050 size_t reasonable_dpl_limit = 15051 (size_t)(total_ram_pages + avail_ram_pages) / 42; 15052 if (pagesize > env->me_os_psize) 15053 reasonable_dpl_limit /= pagesize / env->me_os_psize; 15054 else if (pagesize < env->me_os_psize) 15055 reasonable_dpl_limit *= env->me_os_psize / pagesize; 15056 reasonable_dpl_limit = (reasonable_dpl_limit < MDBX_PGL_LIMIT) 15057 ? reasonable_dpl_limit 15058 : MDBX_PGL_LIMIT; 15059 reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK * 4) 15060 ? 
reasonable_dpl_limit 15061 : CURSOR_STACK * 4; 15062 env->me_options.dp_limit = (unsigned)reasonable_dpl_limit; 15063 } 15064 } 15065 if (env->me_options.dp_limit > max_pgno - NUM_METAS) 15066 env->me_options.dp_limit = max_pgno - NUM_METAS; 15067 if (env->me_options.dp_initial > env->me_options.dp_limit) 15068 env->me_options.dp_initial = env->me_options.dp_limit; 15069 } 15070 15071 static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * 15072 lckless_stub(const MDBX_env *env) { 15073 uintptr_t stub = (uintptr_t)&env->x_lckless_stub; 15074 /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ 15075 stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); 15076 return (MDBX_lockinfo *)stub; 15077 } 15078 15079 __cold int mdbx_env_create(MDBX_env **penv) { 15080 MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); 15081 if (unlikely(!env)) 15082 return MDBX_ENOMEM; 15083 15084 env->me_maxreaders = DEFAULT_READERS; 15085 env->me_maxdbs = env->me_numdbs = CORE_DBS; 15086 env->me_lazy_fd = INVALID_HANDLE_VALUE; 15087 env->me_dsync_fd = INVALID_HANDLE_VALUE; 15088 env->me_lfd = INVALID_HANDLE_VALUE; 15089 env->me_pid = osal_getpid(); 15090 env->me_stuck_meta = -1; 15091 15092 env->me_options.dp_reserve_limit = 1024; 15093 env->me_options.rp_augment_limit = 256 * 1024; 15094 env->me_options.dp_limit = 64 * 1024; 15095 if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS) 15096 env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS; 15097 env->me_options.dp_initial = MDBX_PNL_INITIAL; 15098 if (env->me_options.dp_initial > env->me_options.dp_limit) 15099 env->me_options.dp_initial = env->me_options.dp_limit; 15100 env->me_options.spill_max_denominator = 8; 15101 env->me_options.spill_min_denominator = 8; 15102 env->me_options.spill_parent4child_denominator = 0; 15103 env->me_options.dp_loose_limit = 64; 15104 env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; 15105 15106 int rc; 15107 const size_t os_psize = osal_syspagesize(); 15108 if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { 15109 ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); 15110 rc = MDBX_INCOMPATIBLE; 15111 goto bailout; 15112 } 15113 env->me_os_psize = (unsigned)os_psize; 15114 setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize
15115 : MAX_PAGESIZE);
15116
15117 rc = osal_fastmutex_init(&env->me_dbi_lock);
15118 if (unlikely(rc != MDBX_SUCCESS))
15119 goto bailout;
15120
15121 #if defined(_WIN32) || defined(_WIN64)
15122 osal_srwlock_Init(&env->me_remap_guard);
15123 InitializeCriticalSection(&env->me_windowsbug_lock);
15124 #else
15125 rc = osal_fastmutex_init(&env->me_remap_guard);
15126 if (unlikely(rc != MDBX_SUCCESS)) {
15127 osal_fastmutex_destroy(&env->me_dbi_lock);
15128 goto bailout;
15129 }
15130
15131 #if MDBX_LOCKING > MDBX_LOCKING_SYSV
15132 MDBX_lockinfo *const stub = lckless_stub(env);
15133 rc = osal_ipclock_stub(&stub->mti_wlock);
15134 #endif /* MDBX_LOCKING */
15135 if (unlikely(rc != MDBX_SUCCESS)) {
15136 osal_fastmutex_destroy(&env->me_remap_guard);
15137 osal_fastmutex_destroy(&env->me_dbi_lock);
15138 goto bailout;
15139 }
15140 #endif /* Windows */
15141
15142 VALGRIND_CREATE_MEMPOOL(env, 0, 0);
15143 env->me_signature.weak = MDBX_ME_SIGNATURE;
15144 *penv = env;
15145 return MDBX_SUCCESS;
15146
15147 bailout:
15148 osal_free(env);
15149 *penv = nullptr;
15150 return rc;
15151 }
15152
15153 __cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) {
15154 if (*cached_result == 0) {
15155 intptr_t pagesize, total_ram_pages;
15156 if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) !=
15157 MDBX_SUCCESS))
15158 return *cached_result = MAX_MAPSIZE32 /* the 32-bit limit is good enough
15159 for fallback */
15160 ;
15161
15162 if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize))
15163 return *cached_result = MAX_MAPSIZE;
15164 assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2));
15165
15166 /* The suggested size should not be more than the golden ratio of the RAM size. */
15167 *cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize;
15168
15169 /* Round to the nearest human-readable granularity. */
15170 for (size_t unit = MEGABYTE; unit; unit <<= 5) {
15171 const size_t floor = floor_powerof2(*cached_result, unit);
15172 const size_t ceil = ceil_powerof2(*cached_result, unit);
15173 const size_t threshold = (size_t)*cached_result >> 4;
15174 const bool down =
15175 *cached_result - floor < ceil - *cached_result || ceil > MAX_MAPSIZE;
15176 if (threshold < (down ? *cached_result - floor : ceil - *cached_result))
15177 break;
15178 *cached_result = down ?
floor : ceil; 15179 } 15180 } 15181 return *cached_result; 15182 } 15183 15184 __cold LIBMDBX_API int 15185 mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, 15186 intptr_t size_upper, intptr_t growth_step, 15187 intptr_t shrink_threshold, intptr_t pagesize) { 15188 int rc = check_env(env, false); 15189 if (unlikely(rc != MDBX_SUCCESS)) 15190 return rc; 15191 15192 const bool inside_txn = 15193 (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); 15194 15195 #if MDBX_DEBUG 15196 if (growth_step < 0) { 15197 growth_step = 1; 15198 if (shrink_threshold < 0) 15199 shrink_threshold = 1; 15200 } 15201 #endif /* MDBX_DEBUG */ 15202 15203 intptr_t reasonable_maxsize = 0; 15204 bool need_unlock = false; 15205 if (env->me_map) { 15206 /* env already mapped */ 15207 if (unlikely(env->me_flags & MDBX_RDONLY)) 15208 return MDBX_EACCESS; 15209 15210 if (!inside_txn) { 15211 int err = mdbx_txn_lock(env, false); 15212 if (unlikely(err != MDBX_SUCCESS)) 15213 return err; 15214 need_unlock = true; 15215 env->me_txn0->tw.troika = meta_tap(env); 15216 eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); 15217 env->me_txn0->mt_txnid = 15218 env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent]; 15219 txn_oldest_reader(env->me_txn0); 15220 } 15221 15222 /* get untouched params from current TXN or DB */ 15223 if (pagesize <= 0 || pagesize >= INT_MAX) 15224 pagesize = env->me_psize; 15225 const MDBX_geo *const geo = 15226 inside_txn ? &env->me_txn->mt_geo 15227 : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo; 15228 if (size_lower < 0) 15229 size_lower = pgno2bytes(env, geo->lower); 15230 if (size_now < 0) 15231 size_now = pgno2bytes(env, geo->now); 15232 if (size_upper < 0) 15233 size_upper = pgno2bytes(env, geo->upper); 15234 if (growth_step < 0) 15235 growth_step = pgno2bytes(env, pv2pages(geo->grow_pv)); 15236 if (shrink_threshold < 0) 15237 shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv)); 15238 15239 if (pagesize != (intptr_t)env->me_psize) { 15240 rc = MDBX_EINVAL; 15241 goto bailout; 15242 } 15243 const size_t usedbytes = 15244 pgno2bytes(env, find_largest_snapshot(env, geo->next)); 15245 if ((size_t)size_upper < usedbytes) { 15246 rc = MDBX_MAP_FULL; 15247 goto bailout; 15248 } 15249 if ((size_t)size_now < usedbytes) 15250 size_now = usedbytes; 15251 } else { 15252 /* env NOT yet mapped */ 15253 if (unlikely(inside_txn)) 15254 return MDBX_PANIC; 15255 15256 /* is requested some auto-value for pagesize ? */ 15257 if (pagesize >= INT_MAX /* maximal */) 15258 pagesize = MAX_PAGESIZE; 15259 else if (pagesize <= 0) { 15260 if (pagesize < 0 /* default */) { 15261 pagesize = env->me_os_psize; 15262 if ((uintptr_t)pagesize > MAX_PAGESIZE) 15263 pagesize = MAX_PAGESIZE; 15264 eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE); 15265 } else if (pagesize == 0 /* minimal */) 15266 pagesize = MIN_PAGESIZE; 15267 15268 /* choose pagesize */ 15269 intptr_t max_size = (size_now > size_lower) ? size_now : size_lower; 15270 max_size = (size_upper > max_size) ? 
size_upper : max_size; 15271 if (max_size < 0 /* default */) 15272 max_size = DEFAULT_MAPSIZE; 15273 else if (max_size == 0 /* minimal */) 15274 max_size = MIN_MAPSIZE; 15275 else if (max_size >= (intptr_t)MAX_MAPSIZE /* maximal */) 15276 max_size = get_reasonable_db_maxsize(&reasonable_maxsize); 15277 15278 while (max_size > pagesize * (int64_t)(MAX_PAGENO + 1) && 15279 pagesize < MAX_PAGESIZE) 15280 pagesize <<= 1; 15281 } 15282 } 15283 15284 if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || 15285 !is_powerof2(pagesize)) { 15286 rc = MDBX_EINVAL; 15287 goto bailout; 15288 } 15289 15290 if (size_lower <= 0) { 15291 size_lower = MIN_MAPSIZE; 15292 if (MIN_MAPSIZE / pagesize < MIN_PAGENO) 15293 size_lower = MIN_PAGENO * pagesize; 15294 } 15295 if (size_lower >= INTPTR_MAX) { 15296 size_lower = get_reasonable_db_maxsize(&reasonable_maxsize); 15297 if ((size_t)size_lower / pagesize > MAX_PAGENO + 1) 15298 size_lower = pagesize * (MAX_PAGENO + 1); 15299 } 15300 15301 if (size_now <= 0) { 15302 size_now = size_lower; 15303 if (size_upper >= size_lower && size_now > size_upper) 15304 size_now = size_upper; 15305 } 15306 if (size_now >= INTPTR_MAX) { 15307 size_now = get_reasonable_db_maxsize(&reasonable_maxsize); 15308 if ((size_t)size_now / pagesize > MAX_PAGENO + 1) 15309 size_now = pagesize * (MAX_PAGENO + 1); 15310 } 15311 15312 if (size_upper <= 0) { 15313 if (size_now >= get_reasonable_db_maxsize(&reasonable_maxsize) / 2) 15314 size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); 15315 else if (MAX_MAPSIZE != MAX_MAPSIZE32 && 15316 (size_t)size_now >= MAX_MAPSIZE32 / 2 && 15317 (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3) 15318 size_upper = MAX_MAPSIZE32; 15319 else { 15320 size_upper = size_now + size_now; 15321 if ((size_t)size_upper < DEFAULT_MAPSIZE * 2) 15322 size_upper = DEFAULT_MAPSIZE * 2; 15323 } 15324 if ((size_t)size_upper / pagesize > (MAX_PAGENO + 1)) 15325 size_upper = pagesize * (MAX_PAGENO + 1); 15326 } else if (size_upper >= INTPTR_MAX) { 15327 size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); 15328 if ((size_t)size_upper / pagesize > MAX_PAGENO + 1) 15329 size_upper = pagesize * (MAX_PAGENO + 1); 15330 } 15331 15332 if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) { 15333 rc = MDBX_EINVAL; 15334 goto bailout; 15335 } 15336 15337 if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { 15338 size_lower = pagesize * MIN_PAGENO; 15339 if (unlikely(size_lower > size_upper)) { 15340 rc = MDBX_EINVAL; 15341 goto bailout; 15342 } 15343 if (size_now < size_lower) 15344 size_now = size_lower; 15345 } 15346 15347 if (unlikely((size_t)size_upper > MAX_MAPSIZE || 15348 (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) { 15349 rc = MDBX_TOO_LARGE; 15350 goto bailout; 15351 } 15352 15353 const size_t unit = (env->me_os_psize > (size_t)pagesize) ? 
env->me_os_psize
15354 : (size_t)pagesize;
15355 size_lower = ceil_powerof2(size_lower, unit);
15356 size_upper = ceil_powerof2(size_upper, unit);
15357 size_now = ceil_powerof2(size_now, unit);
15358
15359 /* LY: pick a suitable size_upper value:
15360 * - a multiple of the page size
15361 * - without violating MAX_MAPSIZE and MAX_PAGENO */
15362 while (unlikely((size_t)size_upper > MAX_MAPSIZE ||
15363 (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) {
15364 if ((size_t)size_upper < unit + MIN_MAPSIZE ||
15365 (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) {
15366 /* paranoia, in case of overflow on implausible values */
15367 rc = MDBX_EINVAL;
15368 goto bailout;
15369 }
15370 size_upper -= unit;
15371 if ((size_t)size_upper < (size_t)size_lower)
15372 size_lower = size_upper;
15373 }
15374 eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0);
15375
15376 if (size_now < size_lower)
15377 size_now = size_lower;
15378 if (size_now > size_upper)
15379 size_now = size_upper;
15380
15381 if (growth_step < 0) {
15382 growth_step = ((size_t)(size_upper - size_lower)) / 42;
15383 if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE)
15384 growth_step = size_lower;
15385 if (growth_step < 65536)
15386 growth_step = 65536;
15387 if ((size_t)growth_step > MAX_MAPSIZE / 64)
15388 growth_step = MAX_MAPSIZE / 64;
15389 }
15390 if (growth_step == 0 && shrink_threshold > 0)
15391 growth_step = 1;
15392 growth_step = ceil_powerof2(growth_step, unit);
15393
15394 if (shrink_threshold < 0)
15395 shrink_threshold = growth_step + growth_step;
15396 shrink_threshold = ceil_powerof2(shrink_threshold, unit);
15397
15398 //----------------------------------------------------------------------------
15399
15400 if (!env->me_map) {
15401 /* save user's geo-params for future open/create */
15402 if (pagesize != (intptr_t)env->me_psize)
15403 setup_pagesize(env, pagesize);
15404 env->me_dbgeo.lower = size_lower;
15405 env->me_dbgeo.now = size_now;
15406 env->me_dbgeo.upper = size_upper;
15407 env->me_dbgeo.grow =
15408 pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step))));
15409 env->me_dbgeo.shrink =
15410 pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold))));
15411
15412 ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE);
15413 ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO);
15414 ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0);
15415 ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0);
15416
15417 ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE);
15418 ENSURE(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1);
15419 ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0);
15420 ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0);
15421
15422 ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower);
15423 ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper);
15424 ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0);
15425 ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0);
15426
15427 ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0);
15428 ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0);
15429 ENSURE(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0);
15430 ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0);
15431
15432 rc = MDBX_SUCCESS;
15433 } else {
15434 /* apply the new params to the opened environment */
15435 ENSURE(env, pagesize == (intptr_t)env->me_psize);
15436 MDBX_meta meta;
15437 memset(&meta, 0, sizeof(meta));
15438 const MDBX_geo *current_geo;
15439 if
(!inside_txn) { 15440 eASSERT(env, need_unlock); 15441 const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); 15442 15443 uint64_t timestamp = 0; 15444 while ("workaround for " 15445 "todo4recovery://erased_by_github/libmdbx/issues/269") { 15446 meta = *head.ptr_c; 15447 rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, 15448 ×tamp); 15449 if (likely(rc == MDBX_SUCCESS)) 15450 break; 15451 if (unlikely(rc != MDBX_RESULT_TRUE)) 15452 goto bailout; 15453 } 15454 const txnid_t txnid = safe64_txnid_next(head.txnid); 15455 if (unlikely(txnid > MAX_TXNID)) { 15456 rc = MDBX_TXN_FULL; 15457 ERROR("txnid overflow, raise %d", rc); 15458 goto bailout; 15459 } 15460 meta_set_txnid(env, &meta, txnid); 15461 current_geo = &meta.mm_geo; 15462 } else { 15463 current_geo = &env->me_txn->mt_geo; 15464 } 15465 15466 MDBX_geo new_geo; 15467 new_geo.lower = bytes2pgno(env, size_lower); 15468 new_geo.now = bytes2pgno(env, size_now); 15469 new_geo.upper = bytes2pgno(env, size_upper); 15470 new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step)); 15471 new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); 15472 new_geo.next = current_geo->next; 15473 15474 ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); 15475 ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); 15476 ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); 15477 ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); 15478 ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); 15479 15480 ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); 15481 ENSURE(env, new_geo.lower >= MIN_PAGENO); 15482 ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); 15483 ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); 15484 ENSURE(env, new_geo.now >= new_geo.next); 15485 ENSURE(env, new_geo.upper >= new_geo.now); 15486 ENSURE(env, new_geo.now >= new_geo.lower); 15487 15488 if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { 15489 #if defined(_WIN32) || defined(_WIN64) 15490 /* Was DB shrinking disabled before and now it will be enabled? 
15491 if (new_geo.lower < new_geo.upper && new_geo.shrink_pv &&
15492 !(current_geo->lower < current_geo->upper &&
15493 current_geo->shrink_pv)) {
15494 if (!env->me_lck_mmap.lck) {
15495 rc = MDBX_EPERM;
15496 goto bailout;
15497 }
15498 int err = osal_rdt_lock(env);
15499 if (unlikely(MDBX_IS_ERROR(err))) {
15500 rc = err;
15501 goto bailout;
15502 }
15503
15504 /* Check if there are any reading threads that do not use the SRWL */
15505 const size_t CurrentTid = GetCurrentThreadId();
15506 const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
15507 const MDBX_reader *const end =
15508 begin + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders,
15509 mo_AcquireRelease);
15510 for (const MDBX_reader *reader = begin; reader < end; ++reader) {
15511 if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak &&
15512 reader->mr_tid.weak != CurrentTid) {
15513 /* At least one thread may not be using the SRWL */
15514 rc = MDBX_EPERM;
15515 break;
15516 }
15517 }
15518
15519 osal_rdt_unlock(env);
15520 if (unlikely(rc != MDBX_SUCCESS))
15521 goto bailout;
15522 }
15523 #endif
15524
15525 if (new_geo.now != current_geo->now ||
15526 new_geo.upper != current_geo->upper) {
15527 rc = map_resize(env, current_geo->next, new_geo.now, new_geo.upper,
15528 false);
15529 if (unlikely(rc != MDBX_SUCCESS))
15530 goto bailout;
15531 }
15532 if (inside_txn) {
15533 env->me_txn->mt_geo = new_geo;
15534 env->me_txn->mt_flags |= MDBX_TXN_DIRTY;
15535 } else {
15536 meta.mm_geo = new_geo;
15537 rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika);
15538 }
15539
15540 if (likely(rc == MDBX_SUCCESS)) {
15541 /* store the new geo into env to avoid influences */
15542 env->me_dbgeo.now = pgno2bytes(env, new_geo.now);
15543 env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower);
15544 env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper);
15545 env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv));
15546 env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv));
15547 }
15548 }
15549 }
15550
15551 bailout:
15552 if (need_unlock)
15553 mdbx_txn_unlock(env);
15554 return rc;
15555 }
15556
15557 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
15558 __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
15559 return __inline_mdbx_env_set_mapsize(env, size);
15560 }
15561
15562 __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
15563 return __inline_mdbx_env_set_maxdbs(env, dbs);
15564 }
15565
15566 __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
15567 return __inline_mdbx_env_get_maxdbs(env, dbs);
15568 }
15569
15570 __cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) {
15571 return __inline_mdbx_env_set_maxreaders(env, readers);
15572 }
15573
15574 __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) {
15575 return __inline_mdbx_env_get_maxreaders(env, readers);
15576 }
15577 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
15578
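/* Usage sketch for mdbx_env_set_geometry() (illustrative only, not part
 * of the upstream sources): configure a database that starts at 1 MiB,
 * may grow up to 1 GiB, and let the heuristic above pick the growth
 * step and the shrink threshold:
 *
 *   int err = mdbx_env_set_geometry(env,
 *                                   1024 * 1024,        // size_lower
 *                                   -1,                 // size_now: keep
 *                                   1024 * 1024 * 1024, // size_upper
 *                                   -1,                 // growth_step: auto
 *                                   -1,                 // shrink_threshold: auto
 *                                   -1);                // pagesize: keep
 *
 * With such bounds the auto-heuristic picks a growth step of roughly
 * (1 GiB - 1 MiB) / 42, i.e. about 24 MiB (clamped to the range from
 * 64 KiB up to MAX_MAPSIZE/64 and rounded up to the page granularity),
 * and a shrink threshold of twice the growth step. */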
15579 __cold static int alloc_page_buf(MDBX_env *env) {
15580 return env->me_pbuf
15581 ? MDBX_SUCCESS
15582 : osal_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS,
15583 &env->me_pbuf);
15584 }
15585
15586 /* Further setup required for opening an MDBX environment */
15587 __cold static int setup_dxb(MDBX_env *env, const int lck_rc,
15588 const mdbx_mode_t mode_bits) {
15589 MDBX_meta header;
15590 int rc = MDBX_RESULT_FALSE;
15591 int err = read_header(env, &header, lck_rc, mode_bits);
15592 if (unlikely(err != MDBX_SUCCESS)) {
15593 if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
15594 (env->me_flags & MDBX_RDONLY) != 0 ||
15595 /* recovery mode */ env->me_stuck_meta >= 0)
15596 return err;
15597
15598 DEBUG("%s", "create new database");
15599 rc = /* new database */ MDBX_RESULT_TRUE;
15600
15601 if (!env->me_dbgeo.now) {
15602 /* set defaults if not configured */
15603 err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1);
15604 if (unlikely(err != MDBX_SUCCESS))
15605 return err;
15606 }
15607
15608 err = alloc_page_buf(env);
15609 if (unlikely(err != MDBX_SUCCESS))
15610 return err;
15611
15612 header = *init_metas(env, env->me_pbuf);
15613 err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS,
15614 0);
15615 if (unlikely(err != MDBX_SUCCESS))
15616 return err;
15617
15618 err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize =
15619 env->me_dxb_mmap.current =
15620 env->me_dbgeo.now);
15621 if (unlikely(err != MDBX_SUCCESS))
15622 return err;
15623
15624 #ifndef NDEBUG /* just for checking */
15625 err = read_header(env, &header, lck_rc, mode_bits);
15626 if (unlikely(err != MDBX_SUCCESS))
15627 return err;
15628 #endif
15629 }
15630
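/* Explanatory note (added commentary, not from the upstream sources):
 * from here on the geometry stored in the database header is reconciled
 * with whatever the application preconfigured via mdbx_env_set_geometry().
 * In read-only, non-exclusive and recovery modes the header always wins;
 * otherwise the preconfigured values are applied only if the limits,
 * growth step or shrink threshold actually differ, while a change of
 * just the 'now/current' size is ignored. */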
15631 VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO
15632 "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN
15633 ", %s",
15634 header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root,
15635 header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now,
15636 header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv),
15637 pv2pages(header.mm_geo.shrink_pv),
15638 unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header));
15639
15640 if (env->me_psize != header.mm_psize)
15641 setup_pagesize(env, header.mm_psize);
15642 const size_t used_bytes = pgno2bytes(env, header.mm_geo.next);
15643 const size_t used_aligned2os_bytes =
15644 ceil_powerof2(used_bytes, env->me_os_psize);
15645 if ((env->me_flags & MDBX_RDONLY) /* readonly */
15646 || lck_rc != MDBX_RESULT_TRUE /* not exclusive */
15647 || /* recovery mode */ env->me_stuck_meta >= 0) {
15648 /* use the present params from the db */
15649 const size_t pagesize = header.mm_psize;
15650 err = mdbx_env_set_geometry(
15651 env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize,
15652 header.mm_geo.upper * pagesize,
15653 pv2pages(header.mm_geo.grow_pv) * pagesize,
15654 pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize);
15655 if (unlikely(err != MDBX_SUCCESS)) {
15656 ERROR("%s: err %d", "could not apply preconfigured geometry from db",
15657 err);
15658 return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
15659 }
15660 } else if (env->me_dbgeo.now) {
15661 /* silently grow to the last used page */
15662 if (env->me_dbgeo.now < used_aligned2os_bytes)
15663 env->me_dbgeo.now = used_aligned2os_bytes;
15664 if (env->me_dbgeo.upper < used_aligned2os_bytes)
15665 env->me_dbgeo.upper = used_aligned2os_bytes;
15666
15667 /* apply the preconfigured params, but only on substantial changes:
15668 * - the upper or lower limit changes,
15669 * - the shrink threshold or the growth step changes.
15670 * But ignore a change of just the 'now/current' size. */
15671 if (bytes_align2os_bytes(env, env->me_dbgeo.upper) !=
15672 pgno2bytes(env, header.mm_geo.upper) ||
15673 bytes_align2os_bytes(env, env->me_dbgeo.lower) !=
15674 pgno2bytes(env, header.mm_geo.lower) ||
15675 bytes_align2os_bytes(env, env->me_dbgeo.shrink) !=
15676 pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) ||
15677 bytes_align2os_bytes(env, env->me_dbgeo.grow) !=
15678 pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) {
15679
15680 if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes)
15681 /* pre-shrink if enabled */
15682 env->me_dbgeo.now = used_bytes + env->me_dbgeo.shrink -
15683 used_bytes % env->me_dbgeo.shrink;
15684
15685 err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now,
15686 env->me_dbgeo.upper, env->me_dbgeo.grow,
15687 env->me_dbgeo.shrink, header.mm_psize);
15688 if (unlikely(err != MDBX_SUCCESS)) {
15689 ERROR("%s: err %d", "could not apply preconfigured db-geometry", err);
15690 return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
15691 }
15692
15693 /* update meta fields */
15694 header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now);
15695 header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
15696 header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
15697 header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow));
15698 header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink));
15699
15700 VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO
15701 "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
15702 " +%u -%u, txn_id %" PRIaTXN ", %s",
15703 header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root,
15704 header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now,
15705 header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv),
15706 pv2pages(header.mm_geo.shrink_pv),
15707 unaligned_peek_u64(4, header.mm_txnid_a),
15708 durable_caption(&header));
15709 } else {
15710 /* fetch back the 'now/current' size, since it was ignored during the
15711 * comparison and may differ. */
15712 env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now);
15713 }
15714 ENSURE(env, header.mm_geo.now >= header.mm_geo.next);
15715 } else {
15716 /* geo-params were not pre-configured by the user,
15717 * so take the current values from the meta.
*/ 15718 env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now); 15719 env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower); 15720 env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper); 15721 env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv)); 15722 env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)); 15723 } 15724 15725 ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now); 15726 ENSURE(env, env->me_dbgeo.now >= used_bytes); 15727 const uint64_t filesize_before = env->me_dxb_mmap.filesize; 15728 if (unlikely(filesize_before != env->me_dbgeo.now)) { 15729 if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { 15730 VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO 15731 "p, have %" PRIu64 "b/%" PRIaPGNO "p), " 15732 "assume other process working", 15733 env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), 15734 filesize_before, bytes2pgno(env, (size_t)filesize_before)); 15735 } else { 15736 WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO 15737 "p, have %" PRIu64 "b/%" PRIaPGNO "p)", 15738 env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), 15739 filesize_before, bytes2pgno(env, (size_t)filesize_before)); 15740 if (filesize_before < used_bytes) { 15741 ERROR("last-page beyond end-of-file (last %" PRIaPGNO 15742 ", have %" PRIaPGNO ")", 15743 header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); 15744 return MDBX_CORRUPTED; 15745 } 15746 15747 if (env->me_flags & MDBX_RDONLY) { 15748 if (filesize_before & (env->me_os_psize - 1)) { 15749 ERROR("%s", "filesize should be rounded-up to system page"); 15750 return MDBX_WANNA_RECOVERY; 15751 } 15752 WARNING("%s", "ignore filesize mismatch in readonly-mode"); 15753 } else { 15754 VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO 15755 " pages", 15756 env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); 15757 } 15758 } 15759 } 15760 15761 VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, 15762 bootid.y, (bootid.x | bootid.y) ? "" : "not-"); 15763 15764 #if MDBX_ENABLE_MADVISE 15765 /* calculate readahead hint before mmap with zero redundant pages */ 15766 const bool readahead = 15767 !(env->me_flags & MDBX_NORDAHEAD) && 15768 mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; 15769 #endif /* MDBX_ENABLE_MADVISE */ 15770 15771 err = osal_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, 15772 env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); 15773 if (unlikely(err != MDBX_SUCCESS)) 15774 return err; 15775 15776 #if MDBX_ENABLE_MADVISE 15777 #if defined(MADV_DONTDUMP) 15778 err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP) 15779 ? ignore_enosys(errno) 15780 : MDBX_SUCCESS; 15781 if (unlikely(MDBX_IS_ERROR(err))) 15782 return err; 15783 #endif /* MADV_DONTDUMP */ 15784 #if defined(MADV_DODUMP) 15785 if (runtime_flags & MDBX_DBG_DUMP) { 15786 const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); 15787 err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) 15788 ? 
ignore_enosys(errno)
15789 : MDBX_SUCCESS;
15790 if (unlikely(MDBX_IS_ERROR(err)))
15791 return err;
15792 }
15793 #endif /* MADV_DODUMP */
15794 #endif /* MDBX_ENABLE_MADVISE */
15795
15796 #ifdef MDBX_USE_VALGRIND
15797 env->me_valgrind_handle =
15798 VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
15799 #endif /* MDBX_USE_VALGRIND */
15800
15801 eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) &&
15802 used_bytes <= env->me_dxb_mmap.limit);
15803 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
15804 if (env->me_dxb_mmap.filesize > used_bytes &&
15805 env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) {
15806 VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes,
15807 env->me_dxb_mmap.filesize - used_bytes);
15808 MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes,
15809 env->me_dxb_mmap.filesize - used_bytes);
15810 }
15811 env->me_poison_edge =
15812 bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit)
15813 ? env->me_dxb_mmap.filesize
15814 : env->me_dxb_mmap.limit);
15815 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
15816
15817 meta_troika_t troika = meta_tap(env);
15818 #if MDBX_DEBUG
15819 meta_troika_dump(env, &troika);
15820 #endif
15821 eASSERT(env, !env->me_txn && !env->me_txn0);
15822 //-------------------------------- validate/rollback head & steady meta-pages
15823 if (unlikely(env->me_stuck_meta >= 0)) {
15824 /* recovery mode */
15825 MDBX_meta clone;
15826 MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta);
15827 err = validate_meta_copy(env, target, &clone);
15828 if (unlikely(err != MDBX_SUCCESS)) {
15829 ERROR("target meta[%u] is corrupted",
15830 bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map));
15831 meta_troika_dump(env, &troika);
15832 return MDBX_CORRUPTED;
15833 }
15834 } else /* not recovery mode */
15835 while (1) {
15836 const unsigned meta_clash_mask = meta_eq_mask(&troika);
15837 if (unlikely(meta_clash_mask)) {
15838 ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask);
15839 meta_troika_dump(env, &troika);
15840 return MDBX_CORRUPTED;
15841 }
15842
15843 if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
15844 /* non-exclusive mode, the meta-pages should have been validated
15845 * by the first process that opened the DB */
15846 if (troika.recent == troika.prefer_steady)
15847 break;
15848
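/* Explanatory note (added commentary, not from the upstream sources):
 * 'recent' denotes the meta-page with the greatest txnid, while
 * 'prefer_steady' is the newest one whose signature marks it as durably
 * synced to disk. When the two differ, either another process is
 * writing concurrently or the last commit was a weak (non-steady)
 * checkpoint that may have to be rolled back. */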
15849 if (!env->me_lck_mmap.lck) {
15850 /* LY: without-lck (read-only) mode, so it is impossible that another
15851 * process made a weak checkpoint. */
15852 ERROR("%s", "without-lck, unable recovery/rollback");
15853 meta_troika_dump(env, &troika);
15854 return MDBX_WANNA_RECOVERY;
15855 }
15856
15857 /* LY: assume we just have a collision with another running process,
15858 * or someone made a weak checkpoint */
15859 VERBOSE("%s", "assume collision or online weak checkpoint");
15860 break;
15861 }
15862 eASSERT(env, lck_rc == MDBX_RESULT_TRUE);
15863 /* exclusive mode */
15864
15865 const meta_ptr_t recent = meta_recent(env, &troika);
15866 const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika);
15867 MDBX_meta clone;
15868 if (prefer_steady.is_steady) {
15869 err = validate_meta_copy(env, prefer_steady.ptr_c, &clone);
15870 if (unlikely(err != MDBX_SUCCESS)) {
15871 ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed",
15872 bytes2pgno(env, (uint8_t *)prefer_steady.ptr_c - env->me_map),
15873 "steady", prefer_steady.txnid, "manual recovery");
15874 meta_troika_dump(env, &troika);
15875 return MDBX_CORRUPTED;
15876 }
15877 if (prefer_steady.ptr_c == recent.ptr_c)
15878 break;
15879 }
15880
15881 const pgno_t pgno =
15882 bytes2pgno(env, (uint8_t *)recent.ptr_c - env->me_map);
15883 const bool last_valid =
15884 validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS;
15885 eASSERT(env,
15886 !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid);
15887 if (unlikely(!last_valid)) {
15888 if (unlikely(!prefer_steady.is_steady)) {
15889 ERROR("%s for open or automatic rollback, %s",
15890 "there are no suitable meta-pages",
15891 "manual recovery is required");
15892 meta_troika_dump(env, &troika);
15893 return MDBX_CORRUPTED;
15894 }
15895 WARNING("meta[%u] with last txnid %" PRIaTXN
15896 " is corrupted, rollback needed",
15897 pgno, recent.txnid);
15898 meta_troika_dump(env, &troika);
15899 goto purge_meta_head;
15900 }
15901
15902 if (meta_bootid_match(recent.ptr_c)) {
15903 if (env->me_flags & MDBX_RDONLY) {
15904 ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
15905 "rollback NOT needed, steady-sync NEEDED%s",
15906 "opening after an unclean shutdown", bootid.x, bootid.y,
15907 ", but unable in read-only mode");
15908 meta_troika_dump(env, &troika);
15909 return MDBX_WANNA_RECOVERY;
15910 }
15911 WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
15912 "rollback NOT needed, steady-sync NEEDED%s",
15913 "opening after an unclean shutdown", bootid.x, bootid.y, "");
15914 header = clone;
15915 atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next,
15916 mo_Relaxed);
15917 break;
15918 }
15919 if (unlikely(!prefer_steady.is_steady)) {
15920 ERROR("%s, but %s for automatic rollback: %s",
15921 "opening after an unclean shutdown",
15922 "there are no suitable meta-pages",
15923 "manual recovery is required");
15924 meta_troika_dump(env, &troika);
15925 return MDBX_CORRUPTED;
15926 }
15927 if (env->me_flags & MDBX_RDONLY) {
15928 ERROR("%s and rollback needed: (from head %" PRIaTXN
15929 " to steady %" PRIaTXN ")%s",
15930 "opening after an unclean shutdown", recent.txnid,
15931 prefer_steady.txnid, ", but unable in read-only mode");
15932 meta_troika_dump(env, &troika);
15933 return MDBX_WANNA_RECOVERY;
15934 }
15935
15936 purge_meta_head:
15937 NOTICE("%s and doing automatic rollback: "
15938 "purge%s meta[%u] with%s txnid %" PRIaTXN,
15939 "opening after an unclean shutdown", last_valid ? "" : " invalid",
15940 pgno, last_valid ?
" weak" : "", recent.txnid); 15941 meta_troika_dump(env, &troika); 15942 ENSURE(env, prefer_steady.is_steady); 15943 err = override_meta(env, pgno, 0, 15944 last_valid ? recent.ptr_c : prefer_steady.ptr_c); 15945 if (err) { 15946 ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", 15947 pgno, recent.txnid, err); 15948 return err; 15949 } 15950 troika = meta_tap(env); 15951 ENSURE(env, 0 == meta_txnid(recent.ptr_v)); 15952 ENSURE(env, 0 == meta_eq_mask(&troika)); 15953 } 15954 15955 if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { 15956 //-------------------------------------------------- shrink DB & update geo 15957 /* re-check size after mmap */ 15958 if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || 15959 env->me_dxb_mmap.current < used_bytes) { 15960 ERROR("unacceptable/unexpected datafile size %" PRIuPTR, 15961 env->me_dxb_mmap.current); 15962 return MDBX_PROBLEM; 15963 } 15964 if (env->me_dxb_mmap.current != env->me_dbgeo.now) { 15965 header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); 15966 NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO 15967 " pages", 15968 env->me_dxb_mmap.current, header.mm_geo.now); 15969 } 15970 15971 const meta_ptr_t recent = meta_recent(env, &troika); 15972 if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, sizeof(header.mm_geo))) { 15973 if ((env->me_flags & MDBX_RDONLY) != 0 || 15974 /* recovery mode */ env->me_stuck_meta >= 0) { 15975 WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO 15976 "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO 15977 "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", 15978 (env->me_stuck_meta < 0) ? "read-only" : "recovery", 15979 recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, 15980 recent.ptr_c->mm_geo.upper, 15981 pv2pages(recent.ptr_c->mm_geo.shrink_pv), 15982 pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower, 15983 header.mm_geo.now, header.mm_geo.upper, 15984 pv2pages(header.mm_geo.shrink_pv), 15985 pv2pages(header.mm_geo.grow_pv)); 15986 } else { 15987 const txnid_t next_txnid = safe64_txnid_next(recent.txnid); 15988 if (unlikely(next_txnid > MAX_TXNID)) { 15989 ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); 15990 return MDBX_TXN_FULL; 15991 } 15992 NOTICE("updating meta.geo: " 15993 "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO 15994 "/s%u-g%u (txn#%" PRIaTXN "), " 15995 "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO 15996 "/s%u-g%u (txn#%" PRIaTXN ")", 15997 recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, 15998 recent.ptr_c->mm_geo.upper, 15999 pv2pages(recent.ptr_c->mm_geo.shrink_pv), 16000 pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, 16001 header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, 16002 pv2pages(header.mm_geo.shrink_pv), 16003 pv2pages(header.mm_geo.grow_pv), next_txnid); 16004 16005 ENSURE(env, header.unsafe_txnid == recent.txnid); 16006 meta_set_txnid(env, &header, next_txnid); 16007 err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header, 16008 &troika); 16009 if (err) { 16010 ERROR("error %d, while updating meta.geo: " 16011 "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO 16012 "/s%u-g%u (txn#%" PRIaTXN "), " 16013 "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO 16014 "/s%u-g%u (txn#%" PRIaTXN ")", 16015 err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, 16016 recent.ptr_c->mm_geo.upper, 16017 pv2pages(recent.ptr_c->mm_geo.shrink_pv), 16018 pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, 16019 header.mm_geo.lower, header.mm_geo.now, 
header.mm_geo.upper, 16020 pv2pages(header.mm_geo.shrink_pv), 16021 pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid); 16022 return err; 16023 } 16024 } 16025 } 16026 16027 atomic_store32(&env->me_lck->mti_discarded_tail, 16028 bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); 16029 16030 if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && 16031 (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { 16032 for (int n = 0; n < NUM_METAS; ++n) { 16033 MDBX_meta *const meta = METAPAGE(env, n); 16034 if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != 16035 MDBX_DATA_MAGIC)) { 16036 const txnid_t txnid = constmeta_txnid(meta); 16037 NOTICE("%s %s" 16038 "meta[%u], txnid %" PRIaTXN, 16039 "updating db-format signature for", 16040 META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid); 16041 err = override_meta(env, n, txnid, meta); 16042 if (unlikely(err != MDBX_SUCCESS) && 16043 /* Just ignore the MDBX_PROBLEM error, since here it is 16044 * returned only in case of the attempt to upgrade an obsolete 16045 * meta-page that is invalid for current state of a DB, 16046 * e.g. after shrinking DB file */ 16047 err != MDBX_PROBLEM) { 16048 ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", 16049 "updating db-format signature for", n, txnid, err); 16050 return err; 16051 } 16052 troika = meta_tap(env); 16053 } 16054 } 16055 } 16056 } /* lck exclusive, lck_rc == MDBX_RESULT_TRUE */ 16057 16058 //---------------------------------------------------- setup madvise/readahead 16059 #if MDBX_ENABLE_MADVISE 16060 if (used_aligned2os_bytes < env->me_dxb_mmap.current) { 16061 #if defined(MADV_REMOVE) 16062 if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && 16063 /* not recovery mode */ env->me_stuck_meta < 0) { 16064 NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", 16065 env->me_lck->mti_discarded_tail.weak, 16066 bytes2pgno(env, env->me_dxb_mmap.current)); 16067 err = 16068 madvise(env->me_map + used_aligned2os_bytes, 16069 env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) 16070 ? ignore_enosys(errno) 16071 : MDBX_SUCCESS; 16072 if (unlikely(MDBX_IS_ERROR(err))) 16073 return err; 16074 } 16075 #endif /* MADV_REMOVE */ 16076 #if defined(MADV_DONTNEED) 16077 NOTICE("open-MADV_%s %u..%u", "DONTNEED", 16078 env->me_lck->mti_discarded_tail.weak, 16079 bytes2pgno(env, env->me_dxb_mmap.current)); 16080 err = 16081 madvise(env->me_map + used_aligned2os_bytes, 16082 env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) 16083 ? 
ignore_enosys(errno) 16084 : MDBX_SUCCESS; 16085 if (unlikely(MDBX_IS_ERROR(err))) 16086 return err; 16087 #elif defined(POSIX_MADV_DONTNEED) 16088 err = ignore_enosys(posix_madvise( 16089 env->me_map + used_aligned2os_bytes, 16090 env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); 16091 if (unlikely(MDBX_IS_ERROR(err))) 16092 return err; 16093 #elif defined(POSIX_FADV_DONTNEED) 16094 err = ignore_enosys(posix_fadvise( 16095 env->me_lazy_fd, used_aligned2os_bytes, 16096 env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); 16097 if (unlikely(MDBX_IS_ERROR(err))) 16098 return err; 16099 #endif /* MADV_DONTNEED */ 16100 } 16101 16102 err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); 16103 if (unlikely(err != MDBX_SUCCESS)) 16104 return err; 16105 #endif /* MDBX_ENABLE_MADVISE */ 16106 16107 return rc; 16108 } 16109 16110 /******************************************************************************/ 16111 16112 /* Open and/or initialize the lock region for the environment. */ 16113 __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, 16114 mdbx_mode_t mode) { 16115 eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); 16116 eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); 16117 16118 int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); 16119 if (err != MDBX_SUCCESS) { 16120 switch (err) { 16121 default: 16122 return err; 16123 case MDBX_ENOFILE: 16124 case MDBX_EACCESS: 16125 case MDBX_EPERM: 16126 if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) 16127 return err; 16128 break; 16129 case MDBX_EROFS: 16130 if ((env->me_flags & MDBX_RDONLY) == 0) 16131 return err; 16132 break; 16133 } 16134 16135 if (err != MDBX_ENOFILE) { 16136 /* ENSURE the file system is read-only */ 16137 err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); 16138 if (err != MDBX_SUCCESS && 16139 /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ 16140 !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) 16141 return err; 16142 } 16143 16144 /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ 16145 /* beginning of a locked section ---------------------------------------- */ 16146 lcklist_lock(); 16147 eASSERT(env, env->me_lcklist_next == nullptr); 16148 env->me_lfd = INVALID_HANDLE_VALUE; 16149 const int rc = osal_lck_seize(env); 16150 if (MDBX_IS_ERROR(rc)) { 16151 /* Calling lcklist_detach_locked() is required to restore POSIX-filelock 16152 * and this job will be done by env_close(). */ 16153 lcklist_unlock(); 16154 return rc; 16155 } 16156 /* insert into inprocess lck-list */ 16157 env->me_lcklist_next = inprocess_lcklist_head; 16158 inprocess_lcklist_head = env; 16159 lcklist_unlock(); 16160 /* end of a locked section ---------------------------------------------- */ 16161 16162 env->me_lck = lckless_stub(env); 16163 env->me_maxreaders = UINT_MAX; 16164 DEBUG("lck-setup:%s%s%s", " lck-less", 16165 (env->me_flags & MDBX_RDONLY) ? " readonly" : "", 16166 (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); 16167 return rc; 16168 } 16169 16170 /* beginning of a locked section ------------------------------------------ */ 16171 lcklist_lock(); 16172 eASSERT(env, env->me_lcklist_next == nullptr); 16173 16174 /* Try to get exclusive lock. If we succeed, then 16175 * nobody is using the lock region and we should initialize it. 
*/ 16176 err = osal_lck_seize(env); 16177 if (MDBX_IS_ERROR(err)) { 16178 bailout: 16179 /* Calling lcklist_detach_locked() is required to restore POSIX-filelock 16180 * and this job will be done by env_close(). */ 16181 lcklist_unlock(); 16182 return err; 16183 } 16184 16185 MDBX_env *inprocess_neighbor = nullptr; 16186 if (err == MDBX_RESULT_TRUE) { 16187 err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); 16188 if (MDBX_IS_ERROR(err)) 16189 goto bailout; 16190 if (inprocess_neighbor && 16191 ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || 16192 (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { 16193 err = MDBX_BUSY; 16194 goto bailout; 16195 } 16196 } 16197 const int lck_seize_rc = err; 16198 16199 DEBUG("lck-setup:%s%s%s", " with-lck", 16200 (env->me_flags & MDBX_RDONLY) ? " readonly" : "", 16201 (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); 16202 16203 uint64_t size = 0; 16204 err = osal_filesize(env->me_lfd, &size); 16205 if (unlikely(err != MDBX_SUCCESS)) 16206 goto bailout; 16207 16208 if (lck_seize_rc == MDBX_RESULT_TRUE) { 16209 size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + 16210 sizeof(MDBX_lockinfo), 16211 env->me_os_psize); 16212 jitter4testing(false); 16213 } else { 16214 if (env->me_flags & MDBX_EXCLUSIVE) { 16215 err = MDBX_BUSY; 16216 goto bailout; 16217 } 16218 if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || 16219 size < env->me_os_psize) { 16220 ERROR("lck-file has invalid size %" PRIu64 " bytes", size); 16221 err = MDBX_PROBLEM; 16222 goto bailout; 16223 } 16224 } 16225 16226 const size_t maxreaders = 16227 ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); 16228 if (maxreaders < 4) { 16229 ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); 16230 err = MDBX_PROBLEM; 16231 goto bailout; 16232 } 16233 env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) 16234 ? (unsigned)maxreaders 16235 : (unsigned)MDBX_READERS_LIMIT; 16236 16237 err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, 16238 &env->me_lck_mmap, (size_t)size, (size_t)size, 16239 lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE 16240 : MMAP_OPTION_SEMAPHORE); 16241 if (unlikely(err != MDBX_SUCCESS)) 16242 goto bailout; 16243 16244 #if MDBX_ENABLE_MADVISE 16245 #ifdef MADV_DODUMP 16246 err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) 16247 : MDBX_SUCCESS; 16248 if (unlikely(MDBX_IS_ERROR(err))) 16249 goto bailout; 16250 #endif /* MADV_DODUMP */ 16251 16252 #ifdef MADV_WILLNEED 16253 err = madvise(env->me_lck_mmap.lck, size, MADV_WILLNEED) 16254 ? 
ignore_enosys(errno) 16255 : MDBX_SUCCESS; 16256 if (unlikely(MDBX_IS_ERROR(err))) 16257 goto bailout; 16258 #elif defined(POSIX_MADV_WILLNEED) 16259 err = ignore_enosys( 16260 posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED)); 16261 if (unlikely(MDBX_IS_ERROR(err))) 16262 goto bailout; 16263 #endif /* MADV_WILLNEED */ 16264 #endif /* MDBX_ENABLE_MADVISE */ 16265 16266 struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 16267 if (lck_seize_rc == MDBX_RESULT_TRUE) { 16268 /* LY: exclusive mode, check and reset lck content */ 16269 memset(lck, 0, (size_t)size); 16270 jitter4testing(false); 16271 lck->mti_magic_and_version = MDBX_LOCK_MAGIC; 16272 lck->mti_os_and_format = MDBX_LOCK_FORMAT; 16273 #if MDBX_ENABLE_PGOP_STAT 16274 lck->mti_pgop_stat.wops.weak = 1; 16275 #endif /* MDBX_ENABLE_PGOP_STAT */ 16276 err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); 16277 if (unlikely(err != MDBX_SUCCESS)) { 16278 ERROR("initial-%s for lck-file failed", "msync"); 16279 goto bailout; 16280 } 16281 err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); 16282 if (unlikely(err != MDBX_SUCCESS)) { 16283 ERROR("initial-%s for lck-file failed", "fsync"); 16284 goto bailout; 16285 } 16286 } else { 16287 if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { 16288 const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC; 16289 ERROR("lock region has %s", 16290 invalid 16291 ? "invalid magic" 16292 : "incompatible version (only applications with nearly or the " 16293 "same versions of libmdbx can share the same database)"); 16294 err = invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; 16295 goto bailout; 16296 } 16297 if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { 16298 ERROR("lock region has os/format signature 0x%" PRIx32 16299 ", expected 0x%" PRIx32, 16300 lck->mti_os_and_format, MDBX_LOCK_FORMAT); 16301 err = MDBX_VERSION_MISMATCH; 16302 goto bailout; 16303 } 16304 } 16305 16306 err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); 16307 if (MDBX_IS_ERROR(err)) 16308 goto bailout; 16309 16310 ENSURE(env, env->me_lcklist_next == nullptr); 16311 /* insert into inprocess lck-list */ 16312 env->me_lcklist_next = inprocess_lcklist_head; 16313 inprocess_lcklist_head = env; 16314 lcklist_unlock(); 16315 /* end of a locked section ------------------------------------------------ */ 16316 16317 eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); 16318 env->me_lck = lck; 16319 return lck_seize_rc; 16320 } 16321 16322 __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { 16323 if (volume <= 1024 * 1024 * 4ul) 16324 return MDBX_RESULT_TRUE; 16325 16326 intptr_t pagesize, total_ram_pages; 16327 int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr); 16328 if (unlikely(err != MDBX_SUCCESS)) 16329 return err; 16330 16331 const int log2page = log2n_powerof2(pagesize); 16332 const intptr_t volume_pages = (volume + pagesize - 1) >> log2page; 16333 const intptr_t redundancy_pages = 16334 (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page) 16335 : (intptr_t)(redundancy + pagesize - 1) >> log2page; 16336 if (volume_pages >= total_ram_pages || 16337 volume_pages + redundancy_pages >= total_ram_pages) 16338 return MDBX_RESULT_FALSE; 16339 16340 intptr_t avail_ram_pages; 16341 err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages); 16342 if (unlikely(err != MDBX_SUCCESS)) 16343 return err; 16344 16345 return (volume_pages + redundancy_pages >= avail_ram_pages) 16346 ? 
MDBX_RESULT_FALSE 16347 : MDBX_RESULT_TRUE; 16348 } 16349 16350 /* Merge sync flags */ 16351 static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { 16352 uint32_t r = a | b; 16353 16354 /* avoid false MDBX_UTTERLY_NOSYNC */ 16355 if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && 16356 !F_ISSET(b, MDBX_UTTERLY_NOSYNC)) 16357 r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC; 16358 16359 /* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */ 16360 if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) == 16361 (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC) && 16362 !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) 16363 r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; 16364 16365 /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ 16366 if (r & MDBX_SAFE_NOSYNC) 16367 r |= MDBX_NOMETASYNC; 16368 16369 assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && 16370 !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && 16371 !F_ISSET(b, MDBX_UTTERLY_NOSYNC))); 16372 return r; 16373 } 16374 16375 __cold static int __must_check_result override_meta(MDBX_env *env, 16376 unsigned target, 16377 txnid_t txnid, 16378 const MDBX_meta *shape) { 16379 int rc = alloc_page_buf(env); 16380 if (unlikely(rc != MDBX_SUCCESS)) 16381 return rc; 16382 MDBX_page *const page = env->me_pbuf; 16383 meta_model(env, page, target); 16384 MDBX_meta *const model = page_meta(page); 16385 meta_set_txnid(env, model, txnid); 16386 eASSERT(env, coherency_check_meta(env, model, true)); 16387 if (shape) { 16388 if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { 16389 ERROR("bailout overriding meta-%u since model failed " 16390 "freedb/maindb %s-check for txnid #%" PRIaTXN, 16391 target, "pre", constmeta_txnid(shape)); 16392 return MDBX_PROBLEM; 16393 } 16394 if (runtime_flags & MDBX_DBG_DONT_UPGRADE) 16395 memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, 16396 sizeof(model->mm_magic_and_version)); 16397 model->mm_extra_flags = shape->mm_extra_flags; 16398 model->mm_validator_id = shape->mm_validator_id; 16399 model->mm_extra_pagehdr = shape->mm_extra_pagehdr; 16400 memcpy(&model->mm_geo, &shape->mm_geo, sizeof(model->mm_geo)); 16401 memcpy(&model->mm_dbs, &shape->mm_dbs, sizeof(model->mm_dbs)); 16402 memcpy(&model->mm_canary, &shape->mm_canary, sizeof(model->mm_canary)); 16403 memcpy(&model->mm_pages_retired, &shape->mm_pages_retired, 16404 sizeof(model->mm_pages_retired)); 16405 if (txnid) { 16406 if ((!model->mm_dbs[FREE_DBI].md_mod_txnid && 16407 model->mm_dbs[FREE_DBI].md_root != P_INVALID) || 16408 (!model->mm_dbs[MAIN_DBI].md_mod_txnid && 16409 model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) 16410 memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, 16411 sizeof(model->mm_magic_and_version)); 16412 if (unlikely(!coherency_check_meta(env, model, false))) { 16413 ERROR("bailout overriding meta-%u since model failed " 16414 "freedb/maindb %s-check for txnid #%" PRIaTXN, 16415 target, "post", txnid); 16416 return MDBX_PROBLEM; 16417 } 16418 } 16419 } 16420 unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); 16421 rc = validate_meta(env, model, page, target, nullptr); 16422 if (unlikely(MDBX_IS_ERROR(rc))) 16423 return MDBX_PROBLEM; 16424 16425 if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0) 16426 return MDBX_SUCCESS; 16427 16428 #if MDBX_ENABLE_PGOP_STAT 16429 env->me_lck->mti_pgop_stat.wops.weak += 1; 16430 #endif /* MDBX_ENABLE_PGOP_STAT */ 16431 if (env->me_flags & MDBX_WRITEMAP) { 16432 rc = osal_msync(&env->me_dxb_mmap, 0, 16433 pgno_align2os_bytes(env, 
model->mm_geo.next),
16434 MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
16435 if (unlikely(rc != MDBX_SUCCESS))
16436 return rc;
16437 /* override_meta() is called only while the current process holds an
16438 * exclusive lock on the DB file, so the meta-page can be updated directly
16439 * without clearing the consistency flag by mdbx_meta_update_begin() */
16440 memcpy(pgno2page(env, target), page, env->me_psize);
16441 osal_flush_incoherent_cpu_writeback();
16442 rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
16443 MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
16444 } else {
16445 const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
16446 ? env->me_dsync_fd
16447 : env->me_lazy_fd;
16448 rc = osal_pwrite(fd, page, env->me_psize, pgno2bytes(env, target));
16449 if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd)
16450 rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
16451 }
16452 osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
16453 env->me_os_psize);
16454 eASSERT(env, !env->me_txn && !env->me_txn0);
16455 return rc;
16456 }
16457
16458 __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) {
16459 if (unlikely(target >= NUM_METAS))
16460 return MDBX_EINVAL;
16461 int rc = check_env(env, true);
16462 if (unlikely(rc != MDBX_SUCCESS))
16463 return rc;
16464
16465 if (unlikely((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) !=
16466 MDBX_EXCLUSIVE))
16467 return MDBX_EPERM;
16468
16469 const MDBX_meta *target_meta = METAPAGE(env, target);
16470 txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta));
16471 for (unsigned n = 0; n < NUM_METAS; ++n) {
16472 if (n == target)
16473 continue;
16474 MDBX_meta meta = *METAPAGE(env, n);
16475 if (validate_meta(env, &meta, pgno2page(env, n), n, nullptr) !=
16476 MDBX_SUCCESS) {
16477 int err = override_meta(env, n, 0, nullptr);
16478 if (unlikely(err != MDBX_SUCCESS))
16479 return err;
16480 } else {
16481 txnid_t txnid = constmeta_txnid(&meta);
16482 if (new_txnid <= txnid)
16483 new_txnid = safe64_txnid_next(txnid);
16484 }
16485 }
16486
16487 if (unlikely(new_txnid > MAX_TXNID)) {
16488 ERROR("txnid overflow, raise %d", MDBX_TXN_FULL);
16489 return MDBX_TXN_FULL;
16490 }
16491 return override_meta(env, target, new_txnid, target_meta);
16492 }
16493
16494 __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
16495 unsigned target_meta, bool writeable) {
16496 #if defined(_WIN32) || defined(_WIN64)
16497 const wchar_t *pathnameW = nullptr;
16498 OSAL_MB2WIDE(pathname, pathnameW);
16499 return mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable);
16500 }
16501
16502 __cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname,
16503 unsigned target_meta, bool writeable) {
16504 #endif /* Windows */
16505
16506 if (unlikely(target_meta >= NUM_METAS))
16507 return MDBX_EINVAL;
16508 int rc = check_env(env, false);
16509 if (unlikely(rc != MDBX_SUCCESS))
16510 return rc;
16511 if (unlikely(env->me_map))
16512 return MDBX_EPERM;
16513
16514 env->me_stuck_meta = (int8_t)target_meta;
16515 return
16516 #if defined(_WIN32) || defined(_WIN64)
16517 mdbx_env_openW
16518 #else
16519 mdbx_env_open
16520 #endif /* Windows */
16521 (env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY,
16522 0);
16523 }
16524
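/* Usage sketch for the recovery API (illustrative only, not part of the
 * upstream sources): open a damaged database pinned to a particular
 * meta-page and then promote that meta-page to be the most recent one:
 *
 *   MDBX_env *env = NULL;
 *   int err = mdbx_env_create(&env);
 *   if (err == MDBX_SUCCESS)
 *     // writable, hence opened in exclusive mode (see above)
 *     err = mdbx_env_open_for_recovery(env, "/path/to/db", 1, true);
 *   if (err == MDBX_SUCCESS)
 *     err = mdbx_env_turn_for_recovery(env, 1);
 *   mdbx_env_close(env);
 */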
16525 typedef struct {
16526 void *buffer_for_free;
16527 pathchar_t *lck, *dxb;
16528 size_t ent_len;
16529 } MDBX_handle_env_pathname;
16530
16531 static bool path_equal(const pathchar_t *l, const pathchar_t *r, size_t len) {
16532 #if defined(_WIN32) || defined(_WIN64)
16533 while (len-- > 0) {
16534 pathchar_t a = *l++;
16535 pathchar_t b = *r++;
16536 a = (a == '\\') ? '/' : a;
16537 b = (b == '\\') ? '/' : b;
16538 if (a != b)
16539 return false;
16540 }
16541 return true;
16542 #else
16543 return memcmp(l, r, len * sizeof(pathchar_t)) == 0;
16544 #endif
16545 }
16546
16547 __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx,
16548 const pathchar_t *pathname,
16549 MDBX_env_flags_t *flags,
16550 const mdbx_mode_t mode) {
16551 memset(ctx, 0, sizeof(*ctx));
16552 if (unlikely(!pathname || !*pathname))
16553 return MDBX_EINVAL;
16554
16555 int rc;
16556 #if defined(_WIN32) || defined(_WIN64)
16557 const DWORD dwAttrib = GetFileAttributesW(pathname);
16558 if (dwAttrib == INVALID_FILE_ATTRIBUTES) {
16559 rc = GetLastError();
16560 if (rc != MDBX_ENOFILE)
16561 return rc;
16562 if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
16563 /* can't open existing */
16564 return rc;
16565
16566 /* auto-create directory if requested */
16567 if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) {
16568 rc = GetLastError();
16569 if (rc != ERROR_ALREADY_EXISTS)
16570 return rc;
16571 }
16572 } else {
16573 /* ignore the passed MDBX_NOSUBDIR flag and set it automatically */
16574 *flags |= MDBX_NOSUBDIR;
16575 if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY)
16576 *flags -= MDBX_NOSUBDIR;
16577 }
16578 #else
16579 struct stat st;
16580 if (stat(pathname, &st)) {
16581 rc = errno;
16582 if (rc != MDBX_ENOFILE)
16583 return rc;
16584 if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
16585 /* can't open existing */
16586 return rc;
16587
16588 /* auto-create directory if requested */
16589 const mdbx_mode_t dir_mode =
16590 (/* inherit read/write permissions for group and others */ mode &
16591 (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
16592 /* always add read/write/search for owner */ S_IRWXU |
16593 ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) |
16594 ((mode & S_IROTH) ?
/* +search if readable by others */ S_IXOTH : 0); 16595 if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { 16596 rc = errno; 16597 if (rc != EEXIST) 16598 return rc; 16599 } 16600 } else { 16601 /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ 16602 *flags |= MDBX_NOSUBDIR; 16603 if (S_ISDIR(st.st_mode)) 16604 *flags -= MDBX_NOSUBDIR; 16605 } 16606 #endif 16607 16608 static const pathchar_t dxb_name[] = MDBX_DATANAME; 16609 static const pathchar_t lck_name[] = MDBX_LOCKNAME; 16610 static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; 16611 16612 #if defined(_WIN32) || defined(_WIN64) 16613 assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); 16614 const size_t pathname_len = wcslen(pathname); 16615 #else 16616 assert(dxb_name[0] == '/' && lck_name[0] == '/'); 16617 const size_t pathname_len = strlen(pathname); 16618 #endif 16619 assert(lock_suffix[0] != '\\' && lock_suffix[0] != '/'); 16620 ctx->ent_len = pathname_len; 16621 static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; 16622 if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len > dxb_name_len && 16623 path_equal(pathname + ctx->ent_len - dxb_name_len, dxb_name, 16624 dxb_name_len)) { 16625 *flags -= MDBX_NOSUBDIR; 16626 ctx->ent_len -= dxb_name_len; 16627 } 16628 16629 const size_t bytes_needed = 16630 sizeof(pathchar_t) * ctx->ent_len * 2 + 16631 ((*flags & MDBX_NOSUBDIR) ? sizeof(lock_suffix) + sizeof(pathchar_t) 16632 : sizeof(lck_name) + sizeof(dxb_name)); 16633 ctx->buffer_for_free = osal_malloc(bytes_needed); 16634 if (!ctx->buffer_for_free) 16635 return MDBX_ENOMEM; 16636 16637 ctx->dxb = ctx->buffer_for_free; 16638 ctx->lck = ctx->dxb + ctx->ent_len + 1; 16639 memcpy(ctx->dxb, pathname, sizeof(pathchar_t) * (ctx->ent_len + 1)); 16640 if (*flags & MDBX_NOSUBDIR) { 16641 memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); 16642 } else { 16643 ctx->lck += dxb_name_len; 16644 memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); 16645 memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); 16646 } 16647 memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); 16648 16649 return MDBX_SUCCESS; 16650 } 16651 16652 __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { 16653 #if defined(_WIN32) || defined(_WIN64) 16654 const wchar_t *pathnameW = nullptr; 16655 OSAL_MB2WIDE(pathname, pathnameW); 16656 return mdbx_env_deleteW(pathnameW, mode); 16657 } 16658 16659 __cold int mdbx_env_deleteW(const wchar_t *pathname, 16660 MDBX_env_delete_mode_t mode) { 16661 #endif /* Windows */ 16662 16663 switch (mode) { 16664 default: 16665 return MDBX_EINVAL; 16666 case MDBX_ENV_JUST_DELETE: 16667 case MDBX_ENV_ENSURE_UNUSED: 16668 case MDBX_ENV_WAIT_FOR_UNUSED: 16669 break; 16670 } 16671 16672 #ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */ 16673 MDBX_env *const dummy_env = alloca(sizeof(MDBX_env)); 16674 #else 16675 MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo; 16676 #endif 16677 memset(dummy_env, 0, sizeof(*dummy_env)); 16678 dummy_env->me_flags = 16679 (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; 16680 dummy_env->me_os_psize = (unsigned)osal_syspagesize(); 16681 dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); 16682 dummy_env->me_pathname = (pathchar_t *)pathname; 16683 16684 MDBX_handle_env_pathname env_pathname; 16685 STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); 16686 int rc = MDBX_RESULT_TRUE, 16687 err = handle_env_pathname(&env_pathname, pathname, 16688 (MDBX_env_flags_t *)&dummy_env->me_flags, 0); 16689 if (likely(err == MDBX_SUCCESS)) { 16690 mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, 16691 dxb_handle = INVALID_HANDLE_VALUE; 16692 if (mode > MDBX_ENV_JUST_DELETE) { 16693 err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, 16694 &dxb_handle, 0); 16695 err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; 16696 if (err == MDBX_SUCCESS) { 16697 err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, 16698 &clk_handle, 0); 16699 err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; 16700 } 16701 if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) 16702 err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); 16703 if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) 16704 err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); 16705 } 16706 16707 if (err == MDBX_SUCCESS) { 16708 err = osal_removefile(env_pathname.dxb); 16709 if (err == MDBX_SUCCESS) 16710 rc = MDBX_SUCCESS; 16711 else if (err == MDBX_ENOFILE) 16712 err = MDBX_SUCCESS; 16713 } 16714 16715 if (err == MDBX_SUCCESS) { 16716 err = osal_removefile(env_pathname.lck); 16717 if (err == MDBX_SUCCESS) 16718 rc = MDBX_SUCCESS; 16719 else if (err == MDBX_ENOFILE) 16720 err = MDBX_SUCCESS; 16721 } 16722 16723 if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { 16724 err = osal_removedirectory(pathname); 16725 if (err == MDBX_SUCCESS) 16726 rc = MDBX_SUCCESS; 16727 else if (err == MDBX_ENOFILE) 16728 err = MDBX_SUCCESS; 16729 } 16730 16731 if (dxb_handle != INVALID_HANDLE_VALUE) 16732 osal_closefile(dxb_handle); 16733 if (clk_handle != INVALID_HANDLE_VALUE) 16734 osal_closefile(clk_handle); 16735 } else if (err == MDBX_ENOFILE) 16736 err = MDBX_SUCCESS; 16737 16738 osal_free(env_pathname.buffer_for_free); 16739 return (err == MDBX_SUCCESS) ? 
rc : err;
16740 }
16741
16742 __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
16743 MDBX_env_flags_t flags, mdbx_mode_t mode) {
16744 #if defined(_WIN32) || defined(_WIN64)
16745 const wchar_t *pathnameW = nullptr;
16746 OSAL_MB2WIDE(pathname, pathnameW);
16747 return mdbx_env_openW(env, pathnameW, flags, mode);
16748 }
16749
16750 __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
16751 MDBX_env_flags_t flags, mdbx_mode_t mode) {
16752 #endif /* Windows */
16753
16754 int rc = check_env(env, false);
16755 if (unlikely(rc != MDBX_SUCCESS))
16756 return rc;
16757
16758 if (unlikely(flags & ~ENV_USABLE_FLAGS))
16759 return MDBX_EINVAL;
16760
16761 if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE ||
16762 (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map))
16763 return MDBX_EPERM;
16764
16765 /* Pick up the flags previously set via mdbx_env_set_flags(),
16766 * but avoid getting MDBX_UTTERLY_NOSYNC by mere disjunction */
16767 const uint32_t saved_me_flags = env->me_flags;
16768 flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags);
16769
16770 if (flags & MDBX_RDONLY) {
16771 /* Silently ignore irrelevant flags when we're only getting read access */
16772 flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC |
16773 MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM |
16774 MDBX_NOMEMINIT | MDBX_ACCEDE);
16775 mode = 0;
16776 } else {
16777 #if MDBX_MMAP_INCOHERENT_FILE_WRITE
16778 /* Temporary workaround for a flaw in the OpenBSD kernel.
16779 * See todo4recovery://erased_by_github/libmdbx/issues/67 */
16780 if ((flags & MDBX_WRITEMAP) == 0) {
16781 if (flags & MDBX_ACCEDE)
16782 flags |= MDBX_WRITEMAP;
16783 else {
16784 debug_log(MDBX_LOG_ERROR, __func__, __LINE__,
16785 "System (i.e. OpenBSD) requires MDBX_WRITEMAP because "
16786 "of an internal flaw(s) in a file/buffer/page cache.\n");
16787 return 42 /* ENOPROTOOPT */;
16788 }
16789 }
16790 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
16791 }
16792
16793 MDBX_handle_env_pathname env_pathname;
16794 rc = handle_env_pathname(&env_pathname, pathname, &flags, mode);
16795 if (unlikely(rc != MDBX_SUCCESS))
16796 goto bailout;
16797
16798 env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE;
16799 env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t));
16800 env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx));
16801 env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0]));
16802 env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0]));
16803 if (!(env->me_dbxs && env->me_pathname && env->me_dbflags &&
16804 env->me_dbiseqs)) {
16805 rc = MDBX_ENOMEM;
16806 goto bailout;
16807 }
16808 memcpy(env->me_pathname, env_pathname.dxb,
16809 env_pathname.ent_len * sizeof(pathchar_t));
16810 env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */
16811 env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast;
16812
16813 rc = osal_openfile((flags & MDBX_RDONLY) ?
MDBX_OPEN_DXB_READ 16814 : MDBX_OPEN_DXB_LAZY, 16815 env, env_pathname.dxb, &env->me_lazy_fd, mode); 16816 if (rc != MDBX_SUCCESS) 16817 goto bailout; 16818 16819 eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); 16820 if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { 16821 rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, 16822 &env->me_dsync_fd, 0); 16823 ENSURE(env, 16824 (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); 16825 } 16826 16827 #if MDBX_LOCKING == MDBX_LOCKING_SYSV 16828 env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); 16829 if (env->me_sysv_ipc.key == -1) { 16830 rc = errno; 16831 goto bailout; 16832 } 16833 #endif /* MDBX_LOCKING */ 16834 16835 #if !(defined(_WIN32) || defined(_WIN64)) 16836 if (mode == 0) { 16837 /* pickup mode for lck-file */ 16838 struct stat st; 16839 if (fstat(env->me_lazy_fd, &st)) { 16840 rc = errno; 16841 goto bailout; 16842 } 16843 mode = st.st_mode; 16844 } 16845 mode = (/* inherit read permissions for group and others */ mode & 16846 (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | 16847 /* always add read/write for owner */ S_IRUSR | S_IWUSR | 16848 ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | 16849 ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); 16850 #endif /* !Windows */ 16851 const int lck_rc = setup_lck(env, env_pathname.lck, mode); 16852 if (MDBX_IS_ERROR(lck_rc)) { 16853 rc = lck_rc; 16854 goto bailout; 16855 } 16856 16857 /* Set the position in files outside of the data to avoid corruption 16858 * due to erroneous use of file descriptors in the application code. */ 16859 osal_fseek(env->me_lfd, UINT64_C(1) << 63); 16860 osal_fseek(env->me_lazy_fd, UINT64_C(1) << 63); 16861 if (env->me_dsync_fd != INVALID_HANDLE_VALUE) 16862 osal_fseek(env->me_dsync_fd, UINT64_C(1) << 63); 16863 16864 const MDBX_env_flags_t rigorous_flags = 16865 MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; 16866 const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | 16867 MDBX_LIFORECLAIM | 16868 MDBX_DEPRECATED_COALESCE | MDBX_NORDAHEAD; 16869 16870 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 16871 if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { 16872 while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { 16873 if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, 16874 env->me_flags & mode_flags)) { 16875 /* The case: 16876 * - let's assume that for some reason the DB file is smaller 16877 * than it should be according to the geometry, 16878 * but not smaller than the last page used; 16879 * - the first process that opens the database (lck_rc == RESULT_TRUE) 16880 * does this in readonly mode and therefore cannot bring 16881 * the file size back to normal; 16882 * - some next process (lck_rc != RESULT_TRUE) opens the DB in 16883 * read-write mode and now is here. 16884 * 16885 * FIXME: Should we re-check and set the size of DB-file right here? */ 16886 break; 16887 } 16888 atomic_yield(); 16889 } 16890 16891 if (env->me_flags & MDBX_ACCEDE) { 16892 /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). 
*/ 16893 const unsigned diff = 16894 (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; 16895 NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, 16896 env->me_flags ^ diff); 16897 env->me_flags ^= diff; 16898 } 16899 16900 if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { 16901 ERROR("%s", "current mode/flags incompatible with requested"); 16902 rc = MDBX_INCOMPATIBLE; 16903 goto bailout; 16904 } 16905 } 16906 16907 const int dxb_rc = setup_dxb(env, lck_rc, mode); 16908 if (MDBX_IS_ERROR(dxb_rc)) { 16909 rc = dxb_rc; 16910 goto bailout; 16911 } 16912 16913 if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && 16914 (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || 16915 (flags & MDBX_EXCLUSIVE) == 0)) { 16916 ERROR("%s", "recovery requires exclusive mode"); 16917 rc = MDBX_BUSY; 16918 goto bailout; 16919 } 16920 16921 DEBUG("opened dbenv %p", (void *)env); 16922 if (lck) { 16923 if (lck_rc == MDBX_RESULT_TRUE) { 16924 lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); 16925 lck->mti_meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); 16926 lck->mti_reader_check_timestamp.weak = osal_monotime(); 16927 rc = osal_lck_downgrade(env); 16928 DEBUG("lck-downgrade-%s: rc %i", 16929 (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); 16930 if (rc != MDBX_SUCCESS) 16931 goto bailout; 16932 } else { 16933 rc = cleanup_dead_readers(env, false, NULL); 16934 if (MDBX_IS_ERROR(rc)) 16935 goto bailout; 16936 } 16937 16938 if ((env->me_flags & MDBX_NOTLS) == 0) { 16939 rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], 16940 &lck->mti_readers[env->me_maxreaders]); 16941 if (unlikely(rc != MDBX_SUCCESS)) 16942 goto bailout; 16943 env->me_flags |= MDBX_ENV_TXKEY; 16944 } 16945 } else { 16946 env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); 16947 env->me_lck->mti_meta_sync_txnid.weak = 16948 (uint32_t)recent_committed_txnid(env); 16949 env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); 16950 } 16951 16952 if ((flags & MDBX_RDONLY) == 0) { 16953 const size_t tsize = sizeof(MDBX_txn), 16954 size = tsize + env->me_maxdbs * 16955 (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 16956 sizeof(MDBX_atomic_uint32_t) + 1); 16957 rc = alloc_page_buf(env); 16958 if (rc == MDBX_SUCCESS) { 16959 memset(env->me_pbuf, -1, env->me_psize * 2); 16960 MDBX_txn *txn = osal_calloc(1, size); 16961 if (txn) { 16962 txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); 16963 txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); 16964 txn->mt_dbiseqs = 16965 (MDBX_atomic_uint32_t *)(txn->mt_cursors + env->me_maxdbs); 16966 txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); 16967 txn->mt_env = env; 16968 txn->mt_dbxs = env->me_dbxs; 16969 txn->mt_flags = MDBX_TXN_FINISHED; 16970 env->me_txn0 = txn; 16971 txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); 16972 txn->tw.reclaimed_pglist = pnl_alloc(MDBX_PNL_INITIAL); 16973 if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) 16974 rc = MDBX_ENOMEM; 16975 } else 16976 rc = MDBX_ENOMEM; 16977 } 16978 } 16979 16980 #if MDBX_DEBUG 16981 if (rc == MDBX_SUCCESS) { 16982 const meta_troika_t troika = meta_tap(env); 16983 const meta_ptr_t head = meta_recent(env, &troika); 16984 const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; 16985 16986 DEBUG("opened database version %u, pagesize %u", 16987 (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), 16988 env->me_psize); 16989 DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, 16990 
data_page(head.ptr_c)->mp_pgno, head.txnid); 16991 DEBUG("depth: %u", db->md_depth); 16992 DEBUG("entries: %" PRIu64, db->md_entries); 16993 DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); 16994 DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); 16995 DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); 16996 DEBUG("root: %" PRIaPGNO, db->md_root); 16997 DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); 16998 } 16999 #endif 17000 17001 bailout: 17002 if (rc != MDBX_SUCCESS) { 17003 rc = env_close(env) ? MDBX_PANIC : rc; 17004 env->me_flags = 17005 saved_me_flags | ((rc != MDBX_PANIC) ? 0 : MDBX_FATAL_ERROR); 17006 } else { 17007 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 17008 txn_valgrind(env, nullptr); 17009 #endif 17010 } 17011 osal_free(env_pathname.buffer_for_free); 17012 return rc; 17013 } 17014 17015 /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ 17016 __cold static int env_close(MDBX_env *env) { 17017 const unsigned flags = env->me_flags; 17018 if (!(flags & MDBX_ENV_ACTIVE)) { 17019 ENSURE(env, env->me_lcklist_next == nullptr); 17020 return MDBX_SUCCESS; 17021 } 17022 17023 env->me_flags &= ~ENV_INTERNAL_FLAGS; 17024 env->me_lck = nullptr; 17025 if (flags & MDBX_ENV_TXKEY) { 17026 rthc_remove(env->me_txkey); 17027 env->me_txkey = (osal_thread_key_t)0; 17028 } 17029 17030 lcklist_lock(); 17031 const int rc = lcklist_detach_locked(env); 17032 lcklist_unlock(); 17033 17034 if (env->me_map) { 17035 osal_munmap(&env->me_dxb_mmap); 17036 #ifdef MDBX_USE_VALGRIND 17037 VALGRIND_DISCARD(env->me_valgrind_handle); 17038 env->me_valgrind_handle = -1; 17039 #endif 17040 } 17041 17042 if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { 17043 (void)osal_closefile(env->me_dsync_fd); 17044 env->me_dsync_fd = INVALID_HANDLE_VALUE; 17045 } 17046 17047 if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { 17048 (void)osal_closefile(env->me_lazy_fd); 17049 env->me_lazy_fd = INVALID_HANDLE_VALUE; 17050 } 17051 17052 if (env->me_lck_mmap.lck) 17053 osal_munmap(&env->me_lck_mmap); 17054 17055 if (env->me_lfd != INVALID_HANDLE_VALUE) { 17056 (void)osal_closefile(env->me_lfd); 17057 env->me_lfd = INVALID_HANDLE_VALUE; 17058 } 17059 17060 if (env->me_dbxs) { 17061 for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) 17062 osal_free(env->me_dbxs[i].md_name.iov_base); 17063 osal_free(env->me_dbxs); 17064 env->me_dbxs = nullptr; 17065 } 17066 if (env->me_pbuf) { 17067 osal_memalign_free(env->me_pbuf); 17068 env->me_pbuf = nullptr; 17069 } 17070 if (env->me_dbiseqs) { 17071 osal_free(env->me_dbiseqs); 17072 env->me_dbiseqs = nullptr; 17073 } 17074 if (env->me_dbflags) { 17075 osal_free(env->me_dbflags); 17076 env->me_dbflags = nullptr; 17077 } 17078 if (env->me_pathname) { 17079 osal_free(env->me_pathname); 17080 env->me_pathname = nullptr; 17081 } 17082 if (env->me_txn0) { 17083 dpl_free(env->me_txn0); 17084 txl_free(env->me_txn0->tw.lifo_reclaimed); 17085 pnl_free(env->me_txn0->tw.retired_pages); 17086 pnl_free(env->me_txn0->tw.spill_pages); 17087 pnl_free(env->me_txn0->tw.reclaimed_pglist); 17088 osal_free(env->me_txn0); 17089 env->me_txn0 = nullptr; 17090 } 17091 env->me_stuck_meta = -1; 17092 return rc; 17093 } 17094 17095 __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { 17096 MDBX_page *dp; 17097 int rc = MDBX_SUCCESS; 17098 17099 if (unlikely(!env)) 17100 return MDBX_EINVAL; 17101 17102 if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) 17103 return MDBX_EBADSIGN; 17104 17105 #if MDBX_ENV_CHECKPID || !(defined(_WIN32) || 
defined(_WIN64))
17106   /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows
17107    * platforms (i.e. where fork() is available).
17108    * This is required to legitimize a call after fork()
17109    * from a child process, which should be allowed to free resources. */
17110   if (unlikely(env->me_pid != osal_getpid()))
17111     env->me_flags |= MDBX_FATAL_ERROR;
17112 #endif /* MDBX_ENV_CHECKPID */
17113
17114   if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 &&
17115       env->me_txn0) {
17116     if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self())
17117       return MDBX_BUSY;
17118   } else
17119     dont_sync = true;
17120
17121   if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, 0))
17122     return MDBX_EBADSIGN;
17123
17124   if (!dont_sync) {
17125 #if defined(_WIN32) || defined(_WIN64)
17126     /* On Windows it is impossible to determine, without blocking, whether
17127      * another process is running a write transaction, because in the
17128      * "owner died" condition the kernel does not release the file lock
17129      * immediately. */
17130     rc = env_sync(env, true, false);
17131     rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
17132 #else
17133     struct stat st;
17134     if (unlikely(fstat(env->me_lazy_fd, &st)))
17135       rc = errno;
17136     else if (st.st_nlink > 0 /* don't sync deleted files */) {
17137       rc = env_sync(env, true, true);
17138       rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY ||
17139             rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE)
17140                ? MDBX_SUCCESS
17141                : rc;
17142     }
17143 #endif
17144   }
17145
17146   eASSERT(env, env->me_signature.weak == 0);
17147   rc = env_close(env) ? MDBX_PANIC : rc;
17148   ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS);
17149 #if defined(_WIN32) || defined(_WIN64)
17150   /* me_remap_guard has no destructor (Slim Reader/Writer Lock) */
17151   DeleteCriticalSection(&env->me_windowsbug_lock);
17152 #else
17153   ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS);
17154 #endif /* Windows */
17155
17156 #if MDBX_LOCKING > MDBX_LOCKING_SYSV
17157   MDBX_lockinfo *const stub = lckless_stub(env);
17158   ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0);
17159 #endif /* MDBX_LOCKING */
17160
17161   while ((dp = env->me_dp_reserve) != NULL) {
17162     MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
17163     VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
17164     env->me_dp_reserve = dp->mp_next;
17165     osal_free(dp);
17166   }
17167   VALGRIND_DESTROY_MEMPOOL(env);
17168   ENSURE(env, env->me_lcklist_next == nullptr);
17169   env->me_pid = 0;
17170   osal_free(env);
17171
17172   return rc;
17173 }
17174
17175 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
17176 __cold int mdbx_env_close(MDBX_env *env) {
17177   return __inline_mdbx_env_close(env);
17178 }
17179 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
17180
17181 /* Compare two items pointing at aligned unsigned int's. */
17182 __hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) {
17183   eASSERT(NULL, a->iov_len == b->iov_len);
17184   switch (a->iov_len) {
17185   case 4:
17186     return CMP2INT(unaligned_peek_u32(4, a->iov_base),
17187                    unaligned_peek_u32(4, b->iov_base));
17188   case 8:
17189     return CMP2INT(unaligned_peek_u64(4, a->iov_base),
17190                    unaligned_peek_u64(4, b->iov_base));
17191   default:
17192     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
17193                      __LINE__);
17194     return 0;
17195   }
17196 }
17197
17198 /* Compare two items pointing at 2-byte aligned unsigned int's.
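 *
 * A hedged note: assuming CMP2INT is the usual three-way compare, each of
 * these integer comparators boils down to something like
 *
 *   uint64_t va = unaligned_peek_u64(A, a->iov_base);  // A = assumed alignment
 *   uint64_t vb = unaligned_peek_u64(A, b->iov_base);
 *   return (va > vb) ? 1 : ((va < vb) ? -1 : 0);
 *
 * they differ only in the alignment A they may rely on (4, 2 or 1 byte)
 * and in the operand width selected by iov_len.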
*/
17199 __hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) {
17200   eASSERT(NULL, a->iov_len == b->iov_len);
17201   switch (a->iov_len) {
17202   case 4:
17203     return CMP2INT(unaligned_peek_u32(2, a->iov_base),
17204                    unaligned_peek_u32(2, b->iov_base));
17205   case 8:
17206     return CMP2INT(unaligned_peek_u64(2, a->iov_base),
17207                    unaligned_peek_u64(2, b->iov_base));
17208   default:
17209     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
17210                      __LINE__);
17211     return 0;
17212   }
17213 }
17214
17215 /* Compare two items pointing at unsigned values with unknown alignment.
17216  *
17217  * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */
17218 __hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) {
17219   eASSERT(NULL, a->iov_len == b->iov_len);
17220   switch (a->iov_len) {
17221   case 4:
17222     return CMP2INT(unaligned_peek_u32(1, a->iov_base),
17223                    unaligned_peek_u32(1, b->iov_base));
17224   case 8:
17225     return CMP2INT(unaligned_peek_u64(1, a->iov_base),
17226                    unaligned_peek_u64(1, b->iov_base));
17227   default:
17228     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
17229                      __LINE__);
17230     return 0;
17231   }
17232 }
17233
17234 /* Compare two items lexically */
17235 __hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) {
17236   if (a->iov_len == b->iov_len)
17237     return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0;
17238
17239   const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1;
17240   const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
17241   int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0;
17242   return likely(diff_data) ? diff_data : diff_len;
17243 }
17244
17245 /* Compare two items in reverse byte order */
17246 __hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) {
17247   const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
17248   if (likely(shortest)) {
17249     const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len;
17250     const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len;
17251     const uint8_t *const end = pa - shortest;
17252     do {
17253       int diff = *--pa - *--pb;
17254       if (likely(diff))
17255         return diff;
17256     } while (pa != end);
17257   }
17258   return CMP2INT(a->iov_len, b->iov_len);
17259 }
17260
17261 /* Fast non-lexical (length-first) comparator */
17262 __hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) {
17263   int diff = CMP2INT(a->iov_len, b->iov_len);
17264   return likely(diff) || a->iov_len == 0
17265              ? diff
17266              : memcmp(a->iov_base, b->iov_base, a->iov_len);
17267 }
17268
17269 static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a,
17270                          const MDBX_val *b) {
17271   /* check for the use of a known-good comparator,
17272    * or otherwise for a full byte-to-byte match */
17273   return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse ||
17274          cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0;
17275 }
17276
17277 /* Search for key within a page, using binary search.
17278  * Returns the smallest entry larger than or equal to the key.
17279  * Updates the cursor index with the index of the found entry.
17280  * If no entry larger than or equal to the key is found, returns NULL.
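 *
 * A worked example with made-up keys: on a leaf holding {2, 5, 9},
 * searching for 6 converges to index 2 (the node for 9, exact == false);
 * searching for 9 stops with exact == true; searching for 10 runs past
 * the end, leaves the cursor index at 3 (== nkeys) and returns NULL.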
*/ 17281 __hot static struct node_result node_search(MDBX_cursor *mc, 17282 const MDBX_val *key) { 17283 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 17284 const int nkeys = page_numkeys(mp); 17285 DKBUF_DEBUG; 17286 17287 DEBUG("searching %u keys in %s %spage %" PRIaPGNO, nkeys, 17288 IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", 17289 mp->mp_pgno); 17290 17291 struct node_result ret; 17292 ret.exact = false; 17293 STATIC_ASSERT(P_BRANCH == 1); 17294 int low = mp->mp_flags & P_BRANCH; 17295 int high = nkeys - 1; 17296 if (unlikely(high < low)) { 17297 mc->mc_ki[mc->mc_top] = 0; 17298 ret.node = NULL; 17299 return ret; 17300 } 17301 17302 int i; 17303 MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; 17304 MDBX_val nodekey; 17305 if (unlikely(IS_LEAF2(mp))) { 17306 cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); 17307 nodekey.iov_len = mp->mp_leaf2_ksize; 17308 do { 17309 i = (low + high) >> 1; 17310 nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); 17311 cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= 17312 (char *)nodekey.iov_base + nodekey.iov_len); 17313 int cr = cmp(key, &nodekey); 17314 DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); 17315 if (cr > 0) 17316 /* Found entry is less than the key. */ 17317 /* Skip to get the smallest entry larger than key. */ 17318 low = ++i; 17319 else if (cr < 0) 17320 high = i - 1; 17321 else { 17322 ret.exact = true; 17323 break; 17324 } 17325 } while (likely(low <= high)); 17326 17327 /* store the key index */ 17328 mc->mc_ki[mc->mc_top] = (indx_t)i; 17329 ret.node = (i < nkeys) 17330 ? /* fake for LEAF2 */ (MDBX_node *)(intptr_t)-1 17331 : /* There is no entry larger or equal to the key. */ NULL; 17332 return ret; 17333 } 17334 17335 if (IS_BRANCH(mp) && cmp == cmp_int_align2) 17336 /* Branch pages have no data, so if using integer keys, 17337 * alignment is guaranteed. Use faster cmp_int_align4(). */ 17338 cmp = cmp_int_align4; 17339 17340 MDBX_node *node; 17341 do { 17342 i = (low + high) >> 1; 17343 node = page_node(mp, i); 17344 nodekey.iov_len = node_ks(node); 17345 nodekey.iov_base = node_key(node); 17346 cASSERT(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= 17347 (char *)nodekey.iov_base + nodekey.iov_len); 17348 int cr = cmp(key, &nodekey); 17349 if (IS_LEAF(mp)) 17350 DEBUG("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); 17351 else 17352 DEBUG("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, 17353 DKEY_DEBUG(&nodekey), node_pgno(node), cr); 17354 if (cr > 0) 17355 /* Found entry is less than the key. */ 17356 /* Skip to get the smallest entry larger than key. */ 17357 low = ++i; 17358 else if (cr < 0) 17359 high = i - 1; 17360 else { 17361 ret.exact = true; 17362 break; 17363 } 17364 } while (likely(low <= high)); 17365 17366 /* store the key index */ 17367 mc->mc_ki[mc->mc_top] = (indx_t)i; 17368 ret.node = (i < nkeys) 17369 ? page_node(mp, i) 17370 : /* There is no entry larger or equal to the key. */ NULL; 17371 return ret; 17372 } 17373 17374 /* Pop a page off the top of the cursor's stack. */ 17375 static __inline void cursor_pop(MDBX_cursor *mc) { 17376 if (likely(mc->mc_snum)) { 17377 DEBUG("popped page %" PRIaPGNO " off db %d cursor %p", 17378 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); 17379 if (likely(--mc->mc_snum)) { 17380 mc->mc_top--; 17381 } else { 17382 mc->mc_flags &= ~C_INITIALIZED; 17383 } 17384 } 17385 } 17386 17387 /* Push a page onto the top of the cursor's stack. 17388 * Set MDBX_TXN_ERROR on failure. 
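 *
 * Depth is bounded by CURSOR_STACK. A sketch of the expected pairing in
 * a hypothetical caller (error handling elided):
 *
 *   int err = cursor_push(mc, child);   // descend one level
 *   if (unlikely(err != MDBX_SUCCESS))
 *     return err;                       // txn is already flagged as broken
 *   ...
 *   cursor_pop(mc);                     // back to the parent page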
*/
17389 static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) {
17390   DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc),
17391         (void *)mc);
17392
17393   if (unlikely(mc->mc_snum >= CURSOR_STACK)) {
17394     mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
17395     return MDBX_CURSOR_FULL;
17396   }
17397
17398   mc->mc_top = mc->mc_snum++;
17399   mc->mc_pg[mc->mc_top] = mp;
17400   mc->mc_ki[mc->mc_top] = 0;
17401   return MDBX_SUCCESS;
17402 }
17403
17404 __hot static __always_inline int page_get_checker_lite(const uint16_t ILL,
17405                                                        const MDBX_page *page,
17406                                                        MDBX_txn *const txn,
17407                                                        const txnid_t front) {
17408   if (unlikely(page->mp_flags & ILL)) {
17409     if (ILL == P_ILL_BITS || (page->mp_flags & P_ILL_BITS))
17410       return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags);
17411     else if (ILL & P_OVERFLOW) {
17412       assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0);
17413       assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2));
17414       return bad_page(page, "unexpected %s instead of %s (%u)\n",
17415                       "large/overflow", "branch/leaf/leaf2", page->mp_flags);
17416     } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) {
17417       assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2));
17418       assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2));
17419       return bad_page(page, "unexpected %s instead of %s (%u)\n",
17420                       "branch/leaf/leaf2", "large/overflow", page->mp_flags);
17421     } else {
17422       assert(false);
17423     }
17424   }
17425
17426   if (unlikely(page->mp_txnid > front) &&
17427       unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid))
17428     return bad_page(
17429         page,
17430         "invalid page's txnid (%" PRIaTXN ") for %s's txnid (%" PRIaTXN ")\n",
17431         page->mp_txnid,
17432         (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn"
17433                                                            : "parent-page",
17434         front);
17435
17436   if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) &&
17437       (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) {
17438     if (unlikely(page->mp_upper < page->mp_lower ||
17439                  ((page->mp_lower | page->mp_upper) & 1) ||
17440                  PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize))
17441       return bad_page(page, "invalid page's lower(%u)/upper(%u) with limit %u\n",
17442                       page->mp_lower, page->mp_upper, page_space(txn->mt_env));
17443
17444   } else if ((ILL & P_OVERFLOW) == 0) {
17445     const pgno_t npages = page->mp_pages;
17446     if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2))
17447       return bad_page(page, "invalid n-pages (%u) for large-page\n", npages);
17448     if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno))
17449       return bad_page(
17450           page,
17451           "end of large-page beyond (%u) allocated space (%u next-pgno)\n",
17452           page->mp_pgno + npages, txn->mt_next_pgno);
17453   } else {
17454     assert(false);
17455   }
17456   return MDBX_SUCCESS;
17457 }
17458
17459 __cold static __noinline pgr_t page_get_checker_full(const uint16_t ILL,
17460                                                      MDBX_page *page,
17461                                                      MDBX_cursor *const mc,
17462                                                      const txnid_t front) {
17463   pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)};
17464   if (likely(r.err == MDBX_SUCCESS))
17465     r.err = page_check(mc, page);
17466   if (unlikely(r.err != MDBX_SUCCESS))
17467     mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
17468   return r;
17469 }
17470
17471 __hot static __always_inline pgr_t page_get_inline(const uint16_t ILL,
17472                                                    MDBX_cursor *const mc,
17473                                                    const pgno_t pgno,
17474                                                    const txnid_t front) {
17475   MDBX_txn *const txn = mc->mc_txn;
17476   tASSERT(txn, front <= txn->mt_front);
17477
17478   pgr_t r;
17479   if (unlikely(pgno >= txn->mt_next_pgno)) {
17480     ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno);
17481     r.page = nullptr;
17482     r.err = MDBX_PAGE_NOTFOUND;
17483   bailout:
17484     txn->mt_flags |= MDBX_TXN_ERROR;
17485     return r;
17486   }
17487
17488   eASSERT(txn->mt_env,
17489           ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0);
17490   r.page = pgno2page(txn->mt_env, pgno);
17491   if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) {
17492     const MDBX_txn *spiller = txn;
17493     do {
17494       /* Spilled pages were dirtied in this txn and flushed
17495        * because the dirty list got full. Bring this page
17496        * back in from the map (but don't unspill it here,
17497        * leave that until page_touch happens again.
*/ 17498 if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && 17499 search_spilled(spiller, pgno)) 17500 break; 17501 17502 const unsigned i = dpl_search(spiller, pgno); 17503 tASSERT(txn, (int)i > 0); 17504 if (spiller->tw.dirtylist->items[i].pgno == pgno) { 17505 spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; 17506 r.page = spiller->tw.dirtylist->items[i].ptr; 17507 break; 17508 } 17509 17510 spiller = spiller->mt_parent; 17511 } while (spiller); 17512 } 17513 17514 if (unlikely(r.page->mp_pgno != pgno)) { 17515 r.err = bad_page( 17516 r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", 17517 r.page->mp_pgno, pgno); 17518 goto bailout; 17519 } 17520 17521 if (unlikely(mc->mc_checking & CC_PAGECHECK)) 17522 return page_get_checker_full(ILL, r.page, mc, front); 17523 17524 #if MDBX_DISABLE_VALIDATION 17525 r.err = MDBX_SUCCESS; 17526 #else 17527 r.err = page_get_checker_lite(ILL, r.page, txn, front); 17528 if (unlikely(r.err != MDBX_SUCCESS)) 17529 goto bailout; 17530 #endif /* MDBX_DISABLE_VALIDATION */ 17531 return r; 17532 } 17533 17534 /* Finish mdbx_page_search() / mdbx_page_search_lowest(). 17535 * The cursor is at the root page, set up the rest of it. */ 17536 __hot __noinline static int page_search_root(MDBX_cursor *mc, 17537 const MDBX_val *key, int flags) { 17538 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 17539 int rc; 17540 DKBUF_DEBUG; 17541 17542 while (IS_BRANCH(mp)) { 17543 MDBX_node *node; 17544 int i; 17545 17546 DEBUG("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, 17547 page_numkeys(mp)); 17548 /* Don't assert on branch pages in the GC. We can get here 17549 * while in the process of rebalancing a GC branch page; we must 17550 * let that proceed. ITS#8336 */ 17551 cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1); 17552 DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); 17553 17554 if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { 17555 i = 0; 17556 if (flags & MDBX_PS_LAST) { 17557 i = page_numkeys(mp) - 1; 17558 /* if already init'd, see if we're already in right place */ 17559 if (mc->mc_flags & C_INITIALIZED) { 17560 if (mc->mc_ki[mc->mc_top] == i) { 17561 mc->mc_top = mc->mc_snum++; 17562 mp = mc->mc_pg[mc->mc_top]; 17563 goto ready; 17564 } 17565 } 17566 } 17567 } else { 17568 const struct node_result nsr = node_search(mc, key); 17569 if (likely(nsr.node)) 17570 i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; 17571 else 17572 i = page_numkeys(mp) - 1; 17573 DEBUG("following index %u for key [%s]", i, DKEY_DEBUG(key)); 17574 } 17575 17576 cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); 17577 node = page_node(mp, i); 17578 17579 rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); 17580 if (unlikely(rc != MDBX_SUCCESS)) 17581 return rc; 17582 17583 mc->mc_ki[mc->mc_top] = (indx_t)i; 17584 if (unlikely(rc = cursor_push(mc, mp))) 17585 return rc; 17586 17587 ready: 17588 if (flags & MDBX_PS_MODIFY) { 17589 if (unlikely((rc = page_touch(mc)) != 0)) 17590 return rc; 17591 mp = mc->mc_pg[mc->mc_top]; 17592 } 17593 } 17594 17595 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 17596 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 17597 mp->mp_pgno, mp->mp_flags); 17598 return MDBX_CORRUPTED; 17599 } 17600 17601 DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, 17602 DKEY_DEBUG(key)); 17603 mc->mc_flags |= C_INITIALIZED; 17604 mc->mc_flags &= ~C_EOF; 17605 17606 return MDBX_SUCCESS; 17607 } 17608 17609 static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, 
17610 const unsigned pagesize) { 17611 if (unlikely(!dbx->md_cmp)) { 17612 dbx->md_cmp = get_default_keycmp(db->md_flags); 17613 dbx->md_dcmp = get_default_datacmp(db->md_flags); 17614 } 17615 17616 dbx->md_klen_min = 17617 (db->md_flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; 17618 dbx->md_klen_max = keysize_max(pagesize, db->md_flags); 17619 assert(dbx->md_klen_max != (unsigned)-1); 17620 17621 dbx->md_vlen_min = (db->md_flags & MDBX_INTEGERDUP) 17622 ? 4 /* sizeof(uint32_t) */ 17623 : ((db->md_flags & MDBX_DUPFIXED) ? 1 : 0); 17624 dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); 17625 assert(dbx->md_vlen_max != (unsigned)-1); 17626 17627 if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { 17628 if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || 17629 db->md_xsize > dbx->md_vlen_max)) { 17630 ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, 17631 dbx->md_vlen_min, dbx->md_vlen_max); 17632 return MDBX_CORRUPTED; 17633 } 17634 dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; 17635 } 17636 return MDBX_SUCCESS; 17637 } 17638 17639 static int fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { 17640 MDBX_cursor_couple couple; 17641 if (unlikely(dbi_changed(txn, dbi))) { 17642 NOTICE("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); 17643 return MDBX_BAD_DBI; 17644 } 17645 int rc = cursor_init(&couple.outer, txn, MAIN_DBI); 17646 if (unlikely(rc != MDBX_SUCCESS)) 17647 return rc; 17648 17649 MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; 17650 rc = page_search(&couple.outer, &dbx->md_name, 0); 17651 if (unlikely(rc != MDBX_SUCCESS)) { 17652 notfound: 17653 NOTICE("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN 17654 " (err %d)", 17655 dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, 17656 txn->mt_txnid, rc); 17657 return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; 17658 } 17659 17660 MDBX_val data; 17661 struct node_result nsr = node_search(&couple.outer, &dbx->md_name); 17662 if (unlikely(!nsr.exact)) { 17663 rc = MDBX_NOTFOUND; 17664 goto notfound; 17665 } 17666 if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { 17667 NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", 17668 dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, 17669 txn->mt_txnid, "wrong flags"); 17670 return MDBX_INCOMPATIBLE; /* not a named DB */ 17671 } 17672 17673 rc = node_read(&couple.outer, nsr.node, &data, 17674 couple.outer.mc_pg[couple.outer.mc_top]); 17675 if (unlikely(rc != MDBX_SUCCESS)) 17676 return rc; 17677 17678 if (unlikely(data.iov_len != sizeof(MDBX_db))) { 17679 NOTICE("dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", 17680 dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, 17681 txn->mt_txnid, "wrong rec-size"); 17682 return MDBX_INCOMPATIBLE; /* not a named DB */ 17683 } 17684 17685 uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags); 17686 /* The txn may not know this DBI, or another process may 17687 * have dropped and recreated the DB with other flags. 
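 *
 * Hence the persistent flags stored under the subDB's name must still
 * match what this txn recorded; the check below amounts to
 *
 *   if ((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)
 *     return MDBX_INCOMPATIBLE;  // re-created with different flags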
*/ 17688 MDBX_db *const db = &txn->mt_dbs[dbi]; 17689 if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { 17690 NOTICE("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN 17691 " with different flags (present 0x%X != wanna 0x%X)", 17692 dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, 17693 txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); 17694 return MDBX_INCOMPATIBLE; 17695 } 17696 17697 memcpy(db, data.iov_base, sizeof(MDBX_db)); 17698 #if !MDBX_DISABLE_VALIDATION 17699 const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; 17700 tASSERT(txn, txn->mt_front >= pp_txnid); 17701 if (unlikely(db->md_mod_txnid > pp_txnid)) { 17702 ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", 17703 db->md_mod_txnid, pp_txnid); 17704 return MDBX_CORRUPTED; 17705 } 17706 #endif /* !MDBX_DISABLE_VALIDATION */ 17707 rc = setup_dbx(dbx, db, txn->mt_env->me_psize); 17708 if (unlikely(rc != MDBX_SUCCESS)) 17709 return rc; 17710 17711 txn->mt_dbistate[dbi] &= ~DBI_STALE; 17712 return MDBX_SUCCESS; 17713 } 17714 17715 /* Search for the lowest key under the current branch page. 17716 * This just bypasses a numkeys check in the current page 17717 * before calling mdbx_page_search_root(), because the callers 17718 * are all in situations where the current page is known to 17719 * be underfilled. */ 17720 __hot static int page_search_lowest(MDBX_cursor *mc) { 17721 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 17722 cASSERT(mc, IS_BRANCH(mp)); 17723 MDBX_node *node = page_node(mp, 0); 17724 17725 int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); 17726 if (unlikely(rc != MDBX_SUCCESS)) 17727 return rc; 17728 17729 mc->mc_ki[mc->mc_top] = 0; 17730 if (unlikely(rc = cursor_push(mc, mp))) 17731 return rc; 17732 return page_search_root(mc, NULL, MDBX_PS_FIRST); 17733 } 17734 17735 /* Search for the page a given key should be in. 17736 * Push it and its parent pages on the cursor stack. 17737 * 17738 * [in,out] mc the cursor for this operation. 17739 * [in] key the key to search for, or NULL for first/last page. 17740 * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB 17741 * are touched (updated with new page numbers). 17742 * If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last 17743 * leaf. 17744 * This is used by mdbx_cursor_first() and mdbx_cursor_last(). 17745 * If MDBX_PS_ROOTONLY set, just fetch root node, no further 17746 * lookups. 17747 * 17748 * Returns 0 on success, non-zero on failure. */ 17749 __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { 17750 int rc; 17751 pgno_t root; 17752 17753 /* Make sure the txn is still viable, then find the root from 17754 * the txn's db table and set it as the root of the cursor's stack. */ 17755 if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { 17756 DEBUG("%s", "transaction has failed, must abort"); 17757 return MDBX_BAD_TXN; 17758 } 17759 17760 /* Make sure we're using an up-to-date root */ 17761 if (unlikely(*mc->mc_dbistate & DBI_STALE)) { 17762 rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); 17763 if (unlikely(rc != MDBX_SUCCESS)) 17764 return rc; 17765 } 17766 root = mc->mc_db->md_root; 17767 17768 if (unlikely(root == P_INVALID)) { /* Tree is empty. 
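 * (i.e. md_root == P_INVALID: a freshly created subDB has no root page
 * yet, so every lookup short-circuits to MDBX_NOTFOUND until the first
 * put allocates a leaf.)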
*/
17769     DEBUG("%s", "tree is empty");
17770     return MDBX_NOTFOUND;
17771   }
17772
17773   cASSERT(mc, root >= NUM_METAS);
17774   if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) {
17775     txnid_t pp_txnid = mc->mc_db->md_mod_txnid;
17776     pp_txnid = /* mc->mc_db->md_mod_txnid may be zero in a legacy DB */ pp_txnid
17777                    ? pp_txnid
17778                    : mc->mc_txn->mt_txnid;
17779     if ((mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
17780       MDBX_txn *scan = mc->mc_txn;
17781       do
17782         if ((scan->mt_flags & MDBX_TXN_DIRTY) &&
17783             (mc->mc_dbi == MAIN_DBI ||
17784              (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) {
17785           /* After a nested transaction commits, mod_txnid may exceed front */
17786           pp_txnid = scan->mt_front;
17787           break;
17788         }
17789       while (unlikely((scan = scan->mt_parent) != nullptr));
17790     }
17791     if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0))
17792       return rc;
17793   }
17794
17795   mc->mc_snum = 1;
17796   mc->mc_top = 0;
17797
17798   DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root,
17799         mc->mc_pg[0]->mp_flags);
17800
17801   if (flags & MDBX_PS_MODIFY) {
17802     if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = touch_dbi(mc)))
17803       return rc;
17804     if (unlikely(rc = page_touch(mc)))
17805       return rc;
17806   }
17807
17808   if (flags & MDBX_PS_ROOTONLY)
17809     return MDBX_SUCCESS;
17810
17811   return page_search_root(mc, key, flags);
17812 }
17813
17814 /* Read large/overflow node data. */
17815 static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node,
17816                                         MDBX_val *data, const MDBX_page *mp) {
17817   cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node));
17818
17819   pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid);
17820   if (unlikely(lp.err != MDBX_SUCCESS)) {
17821     DEBUG("read large/overflow page %" PRIaPGNO " failed",
17822           node_largedata_pgno(node));
17823     return lp.err;
17824   }
17825
17826   cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW);
17827   data->iov_base = page_data(lp.page);
17828   if (!MDBX_DISABLE_VALIDATION) {
17829     const MDBX_env *env = mc->mc_txn->mt_env;
17830     const size_t dsize = data->iov_len;
17831     if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax))
17832       poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize);
17833     const unsigned npages = number_of_ovpages(env, dsize);
17834     if (unlikely(lp.page->mp_pages != npages)) {
17835       if (lp.page->mp_pages < npages)
17836         return bad_page(lp.page,
17837                         "too few n-pages %u for bigdata-node (%zu bytes)",
17838                         lp.page->mp_pages, dsize);
17839       else
17840         poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)",
17841                   lp.page->mp_pages, dsize);
17842     }
17843   }
17844   return MDBX_SUCCESS;
17845 }
17846
17847 /* Return the data associated with a given node.
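 *
 * Inline values are returned in O(1); F_BIGDATA values cost one extra
 * page_get_large() lookup. A hedged caller-side sketch:
 *
 *   MDBX_val data;
 *   int err = node_read(mc, node, &data, mp);
 *   // on success data.iov_base points into the memory-mapped page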
*/ 17848 static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node, 17849 MDBX_val *data, const MDBX_page *mp) { 17850 data->iov_len = node_ds(node); 17851 data->iov_base = node_data(node); 17852 if (likely(node_flags(node) != F_BIGDATA)) 17853 return MDBX_SUCCESS; 17854 return node_read_bigdata(mc, node, data, mp); 17855 } 17856 17857 int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { 17858 DKBUF_DEBUG; 17859 DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); 17860 17861 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 17862 if (unlikely(rc != MDBX_SUCCESS)) 17863 return rc; 17864 17865 if (unlikely(!key || !data)) 17866 return MDBX_EINVAL; 17867 17868 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 17869 return MDBX_BAD_DBI; 17870 17871 MDBX_cursor_couple cx; 17872 rc = cursor_init(&cx.outer, txn, dbi); 17873 if (unlikely(rc != MDBX_SUCCESS)) 17874 return rc; 17875 17876 return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; 17877 } 17878 17879 int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, 17880 MDBX_val *data) { 17881 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 17882 if (unlikely(rc != MDBX_SUCCESS)) 17883 return rc; 17884 17885 if (unlikely(!key || !data)) 17886 return MDBX_EINVAL; 17887 17888 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 17889 return MDBX_BAD_DBI; 17890 17891 if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) 17892 return MDBX_BAD_TXN; 17893 17894 MDBX_cursor_couple cx; 17895 rc = cursor_init(&cx.outer, txn, dbi); 17896 if (unlikely(rc != MDBX_SUCCESS)) 17897 return rc; 17898 17899 return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); 17900 } 17901 17902 int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, 17903 size_t *values_count) { 17904 DKBUF_DEBUG; 17905 DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); 17906 17907 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 17908 if (unlikely(rc != MDBX_SUCCESS)) 17909 return rc; 17910 17911 if (unlikely(!key || !data)) 17912 return MDBX_EINVAL; 17913 17914 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 17915 return MDBX_BAD_DBI; 17916 17917 MDBX_cursor_couple cx; 17918 rc = cursor_init(&cx.outer, txn, dbi); 17919 if (unlikely(rc != MDBX_SUCCESS)) 17920 return rc; 17921 17922 rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; 17923 if (unlikely(rc != MDBX_SUCCESS)) { 17924 if (rc == MDBX_NOTFOUND && values_count) 17925 *values_count = 0; 17926 return rc; 17927 } 17928 17929 if (values_count) { 17930 *values_count = 1; 17931 if (cx.outer.mc_xcursor != NULL) { 17932 MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], 17933 cx.outer.mc_ki[cx.outer.mc_top]); 17934 if (node_flags(node) & F_DUPDATA) { 17935 // coverity[uninit_use : FALSE] 17936 tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && 17937 (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); 17938 // coverity[uninit_use : FALSE] 17939 *values_count = 17940 (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || 17941 cx.inner.mx_db.md_entries <= PTRDIFF_MAX) 17942 ? (size_t)cx.inner.mx_db.md_entries 17943 : PTRDIFF_MAX; 17944 } 17945 } 17946 } 17947 return MDBX_SUCCESS; 17948 } 17949 17950 /* Find a sibling for a page. 17951 * Replaces the page at the top of the cursor's stack with the specified 17952 * sibling, if one exists. 17953 * 17954 * [in] mc The cursor for this operation. 17955 * [in] dir SIBLING_LEFT or SIBLING_RIGHT. 17956 * 17957 * Returns 0 on success, non-zero on failure. 
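 *
 * Note the recursion: when the parent is already at its edge the function
 * pops again and recurses upward before descending, so stepping right off
 * the last leaf of a subtree may climb several levels, e.g. (hypothetical
 * 3-level tree):
 *
 *   leaf exhausted -> parent at last key -> recurse to grandparent,
 *   step its ki right, then push back down to the first page of the
 *   next subtree.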
*/ 17958 static int cursor_sibling(MDBX_cursor *mc, int dir) { 17959 int rc; 17960 MDBX_node *node; 17961 MDBX_page *mp; 17962 assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); 17963 17964 if (unlikely(mc->mc_snum < 2)) 17965 return MDBX_NOTFOUND; /* root has no siblings */ 17966 17967 cursor_pop(mc); 17968 DEBUG("parent page is page %" PRIaPGNO ", index %u", 17969 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); 17970 17971 if ((dir == SIBLING_RIGHT) 17972 ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) 17973 : (mc->mc_ki[mc->mc_top] == 0)) { 17974 DEBUG("no more keys aside, moving to next %s sibling", 17975 dir ? "right" : "left"); 17976 if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { 17977 /* undo cursor_pop before returning */ 17978 mc->mc_top++; 17979 mc->mc_snum++; 17980 return rc; 17981 } 17982 } else { 17983 assert((dir - 1) == -1 || (dir - 1) == 1); 17984 mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); 17985 DEBUG("just moving to %s index key %u", 17986 (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); 17987 } 17988 cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); 17989 17990 node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 17991 rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); 17992 if (unlikely(rc != MDBX_SUCCESS)) { 17993 /* mc will be inconsistent if caller does mc_snum++ as above */ 17994 mc->mc_flags &= ~(C_INITIALIZED | C_EOF); 17995 return rc; 17996 } 17997 17998 rc = cursor_push(mc, mp); 17999 if (unlikely(rc != MDBX_SUCCESS)) 18000 return rc; 18001 18002 mc->mc_ki[mc->mc_top] = 18003 (dir == SIBLING_LEFT) ? (indx_t)page_numkeys(mp) - 1 : 0; 18004 return MDBX_SUCCESS; 18005 } 18006 18007 /* Move the cursor to the next data item. */ 18008 static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, 18009 MDBX_cursor_op op) { 18010 MDBX_page *mp; 18011 MDBX_node *node; 18012 int rc; 18013 18014 if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) 18015 return MDBX_NOTFOUND; 18016 18017 if (unlikely(!(mc->mc_flags & C_INITIALIZED))) 18018 return cursor_first(mc, key, data); 18019 18020 mp = mc->mc_pg[mc->mc_top]; 18021 if (unlikely(mc->mc_flags & C_EOF)) { 18022 if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) 18023 return MDBX_NOTFOUND; 18024 mc->mc_flags ^= C_EOF; 18025 } 18026 18027 if (mc->mc_db->md_flags & MDBX_DUPSORT) { 18028 node = page_node(mp, mc->mc_ki[mc->mc_top]); 18029 if (node_flags(node) & F_DUPDATA) { 18030 if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { 18031 rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); 18032 if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { 18033 if (likely(rc == MDBX_SUCCESS)) 18034 get_key_optional(node, key); 18035 return rc; 18036 } 18037 } 18038 } else { 18039 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); 18040 if (op == MDBX_NEXT_DUP) 18041 return MDBX_NOTFOUND; 18042 } 18043 } 18044 18045 DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, 18046 (void *)mc); 18047 if (mc->mc_flags & C_DEL) { 18048 mc->mc_flags ^= C_DEL; 18049 goto skip; 18050 } 18051 18052 int ki = mc->mc_ki[mc->mc_top]; 18053 mc->mc_ki[mc->mc_top] = (indx_t)++ki; 18054 const int numkeys = page_numkeys(mp); 18055 if (unlikely(ki >= numkeys)) { 18056 DEBUG("%s", "=====> move to next sibling page"); 18057 mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); 18058 rc = cursor_sibling(mc, SIBLING_RIGHT); 18059 if (unlikely(rc != MDBX_SUCCESS)) { 18060 mc->mc_flags |= C_EOF; 18061 return rc; 18062 } 18063 mp = 
mc->mc_pg[mc->mc_top]; 18064 DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, 18065 mc->mc_ki[mc->mc_top]); 18066 } 18067 18068 skip: 18069 DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", 18070 mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); 18071 18072 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18073 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18074 mp->mp_pgno, mp->mp_flags); 18075 return MDBX_CORRUPTED; 18076 } 18077 18078 if (IS_LEAF2(mp)) { 18079 if (likely(key)) { 18080 key->iov_len = mc->mc_db->md_xsize; 18081 key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); 18082 } 18083 return MDBX_SUCCESS; 18084 } 18085 18086 node = page_node(mp, mc->mc_ki[mc->mc_top]); 18087 if (node_flags(node) & F_DUPDATA) { 18088 rc = cursor_xinit1(mc, node, mp); 18089 if (unlikely(rc != MDBX_SUCCESS)) 18090 return rc; 18091 rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 18092 if (unlikely(rc != MDBX_SUCCESS)) 18093 return rc; 18094 } else if (likely(data)) { 18095 rc = node_read(mc, node, data, mp); 18096 if (unlikely(rc != MDBX_SUCCESS)) 18097 return rc; 18098 } 18099 18100 get_key_optional(node, key); 18101 return MDBX_SUCCESS; 18102 } 18103 18104 /* Move the cursor to the previous data item. */ 18105 static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, 18106 MDBX_cursor_op op) { 18107 MDBX_page *mp; 18108 MDBX_node *node; 18109 int rc; 18110 18111 if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) 18112 return MDBX_NOTFOUND; 18113 18114 if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { 18115 rc = cursor_last(mc, key, data); 18116 if (unlikely(rc)) 18117 return rc; 18118 mc->mc_ki[mc->mc_top]++; 18119 } 18120 18121 mp = mc->mc_pg[mc->mc_top]; 18122 if ((mc->mc_db->md_flags & MDBX_DUPSORT) && 18123 mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { 18124 node = page_node(mp, mc->mc_ki[mc->mc_top]); 18125 if (node_flags(node) & F_DUPDATA) { 18126 if (op == MDBX_PREV || op == MDBX_PREV_DUP) { 18127 rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); 18128 if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { 18129 if (likely(rc == MDBX_SUCCESS)) { 18130 get_key_optional(node, key); 18131 mc->mc_flags &= ~C_EOF; 18132 } 18133 return rc; 18134 } 18135 } 18136 } else { 18137 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); 18138 if (op == MDBX_PREV_DUP) 18139 return MDBX_NOTFOUND; 18140 } 18141 } 18142 18143 DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, 18144 (void *)mc); 18145 18146 mc->mc_flags &= ~(C_EOF | C_DEL); 18147 18148 int ki = mc->mc_ki[mc->mc_top]; 18149 mc->mc_ki[mc->mc_top] = (indx_t)--ki; 18150 if (unlikely(ki < 0)) { 18151 mc->mc_ki[mc->mc_top] = 0; 18152 DEBUG("%s", "=====> move to prev sibling page"); 18153 if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) 18154 return rc; 18155 mp = mc->mc_pg[mc->mc_top]; 18156 DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, 18157 mc->mc_ki[mc->mc_top]); 18158 } 18159 DEBUG("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", 18160 mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); 18161 18162 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18163 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18164 mp->mp_pgno, mp->mp_flags); 18165 return MDBX_CORRUPTED; 18166 } 18167 18168 if (IS_LEAF2(mp)) { 18169 if (likely(key)) { 18170 key->iov_len = mc->mc_db->md_xsize; 18171 
key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
18172     }
18173     return MDBX_SUCCESS;
18174   }
18175
18176   node = page_node(mp, mc->mc_ki[mc->mc_top]);
18177
18178   if (node_flags(node) & F_DUPDATA) {
18179     rc = cursor_xinit1(mc, node, mp);
18180     if (unlikely(rc != MDBX_SUCCESS))
18181       return rc;
18182     rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
18183     if (unlikely(rc != MDBX_SUCCESS))
18184       return rc;
18185   } else if (likely(data)) {
18186     rc = node_read(mc, node, data, mp);
18187     if (unlikely(rc != MDBX_SUCCESS))
18188       return rc;
18189   }
18190
18191   get_key_optional(node, key);
18192   return MDBX_SUCCESS;
18193 }
18194
18195 /* Set the cursor on a specific data item. */
18196 __hot static struct cursor_set_result
18197 cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) {
18198   MDBX_page *mp;
18199   MDBX_node *node = NULL;
18200   DKBUF_DEBUG;
18201
18202   struct cursor_set_result ret;
18203   ret.exact = false;
18204   if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
18205                key->iov_len > mc->mc_dbx->md_klen_max)) {
18206     cASSERT(mc, !"Invalid key-size");
18207     ret.err = MDBX_BAD_VALSIZE;
18208     return ret;
18209   }
18210
18211   MDBX_val aligned_key = *key;
18212   uint64_t aligned_keybytes;
18213   if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
18214     switch (aligned_key.iov_len) {
18215     default:
18216       cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY");
18217       ret.err = MDBX_BAD_VALSIZE;
18218       return ret;
18219     case 4:
18220       if (unlikely(3 & (uintptr_t)aligned_key.iov_base))
18221         /* copy instead of returning an error, to avoid breaking compatibility */
18222         aligned_key.iov_base =
18223             memcpy(&aligned_keybytes, aligned_key.iov_base, 4);
18224       break;
18225     case 8:
18226       if (unlikely(7 & (uintptr_t)aligned_key.iov_base))
18227         /* copy instead of returning an error, to avoid breaking compatibility */
18228         aligned_key.iov_base =
18229             memcpy(&aligned_keybytes, aligned_key.iov_base, 8);
18230       break;
18231     }
18232   }
18233
18234   if (mc->mc_xcursor)
18235     mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
18236
18237   /* See if we're already on the right page */
18238   if (mc->mc_flags & C_INITIALIZED) {
18239     MDBX_val nodekey;
18240
18241     cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
18242     mp = mc->mc_pg[mc->mc_top];
18243     if (unlikely(!page_numkeys(mp))) {
18244       mc->mc_ki[mc->mc_top] = 0;
18245       mc->mc_flags |= C_EOF;
18246       ret.err = MDBX_NOTFOUND;
18247       return ret;
18248     }
18249     if (IS_LEAF2(mp)) {
18250       nodekey.iov_len = mc->mc_db->md_xsize;
18251       nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len);
18252     } else {
18253       node = page_node(mp, 0);
18254       get_key(node, &nodekey);
18255     }
18256     int cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
18257     if (unlikely(cmp == 0)) {
18258       /* Probably happens rarely, but the first node on the page
18259        * was the one we wanted.
*/ 18260 mc->mc_ki[mc->mc_top] = 0; 18261 ret.exact = true; 18262 cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || 18263 (mc->mc_flags & C_EOF)); 18264 goto got_node; 18265 } 18266 if (cmp > 0) { 18267 const unsigned nkeys = page_numkeys(mp); 18268 if (nkeys > 1) { 18269 if (IS_LEAF2(mp)) { 18270 nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); 18271 } else { 18272 node = page_node(mp, nkeys - 1); 18273 get_key(node, &nodekey); 18274 } 18275 cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); 18276 if (cmp == 0) { 18277 /* last node was the one we wanted */ 18278 cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); 18279 mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); 18280 ret.exact = true; 18281 cASSERT(mc, 18282 mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || 18283 (mc->mc_flags & C_EOF)); 18284 goto got_node; 18285 } 18286 if (cmp < 0) { 18287 if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { 18288 /* This is definitely the right page, skip search_page */ 18289 if (IS_LEAF2(mp)) { 18290 nodekey.iov_base = 18291 page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); 18292 } else { 18293 node = page_node(mp, mc->mc_ki[mc->mc_top]); 18294 get_key(node, &nodekey); 18295 } 18296 cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); 18297 if (cmp == 0) { 18298 /* current node was the one we wanted */ 18299 ret.exact = true; 18300 cASSERT(mc, mc->mc_ki[mc->mc_top] < 18301 page_numkeys(mc->mc_pg[mc->mc_top]) || 18302 (mc->mc_flags & C_EOF)); 18303 goto got_node; 18304 } 18305 } 18306 mc->mc_flags &= ~C_EOF; 18307 goto search_node; 18308 } 18309 } 18310 /* If any parents have right-sibs, search. 18311 * Otherwise, there's nothing further. */ 18312 unsigned i; 18313 for (i = 0; i < mc->mc_top; i++) 18314 if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) 18315 break; 18316 if (i == mc->mc_top) { 18317 /* There are no other pages */ 18318 cASSERT(mc, nkeys <= UINT16_MAX); 18319 mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; 18320 mc->mc_flags |= C_EOF; 18321 ret.err = MDBX_NOTFOUND; 18322 return ret; 18323 } 18324 } 18325 if (!mc->mc_top) { 18326 /* There are no other pages */ 18327 mc->mc_ki[mc->mc_top] = 0; 18328 if (op == MDBX_SET_RANGE) 18329 goto got_node; 18330 18331 cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || 18332 (mc->mc_flags & C_EOF)); 18333 ret.err = MDBX_NOTFOUND; 18334 return ret; 18335 } 18336 } else { 18337 mc->mc_pg[0] = 0; 18338 } 18339 18340 ret.err = page_search(mc, &aligned_key, 0); 18341 if (unlikely(ret.err != MDBX_SUCCESS)) 18342 return ret; 18343 18344 mp = mc->mc_pg[mc->mc_top]; 18345 cASSERT(mc, IS_LEAF(mp)); 18346 18347 search_node:; 18348 struct node_result nsr = node_search(mc, &aligned_key); 18349 node = nsr.node; 18350 ret.exact = nsr.exact; 18351 if (!ret.exact) { 18352 if (op != MDBX_SET_RANGE) { 18353 /* MDBX_SET specified and not an exact match. 
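 *
 * (Here every op except MDBX_SET_RANGE demands an exact key match, so
 * the miss is final; only MDBX_SET_RANGE may continue below to a nearby
 * greater key or hop to the right sibling page.)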
*/
18354       if (unlikely(mc->mc_ki[mc->mc_top] >=
18355                    page_numkeys(mc->mc_pg[mc->mc_top])))
18356         mc->mc_flags |= C_EOF;
18357       ret.err = MDBX_NOTFOUND;
18358       return ret;
18359     }
18360
18361     if (node == NULL) {
18362       DEBUG("%s", "===> inexact leaf not found, goto sibling");
18363       ret.err = cursor_sibling(mc, SIBLING_RIGHT);
18364       if (unlikely(ret.err != MDBX_SUCCESS)) {
18365         mc->mc_flags |= C_EOF;
18366         return ret; /* no entries matched */
18367       }
18368       mp = mc->mc_pg[mc->mc_top];
18369       cASSERT(mc, IS_LEAF(mp));
18370       if (!IS_LEAF2(mp))
18371         node = page_node(mp, 0);
18372     }
18373   }
18374   cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
18375                   (mc->mc_flags & C_EOF));
18376
18377 got_node:
18378   mc->mc_flags |= C_INITIALIZED;
18379   mc->mc_flags &= ~C_EOF;
18380
18381   if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
18382     ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
18383           mp->mp_pgno, mp->mp_flags);
18384     ret.err = MDBX_CORRUPTED;
18385     return ret;
18386   }
18387
18388   if (IS_LEAF2(mp)) {
18389     if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) {
18390       key->iov_len = mc->mc_db->md_xsize;
18391       key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
18392     }
18393     ret.err = MDBX_SUCCESS;
18394     return ret;
18395   }
18396
18397   if (node_flags(node) & F_DUPDATA) {
18398     ret.err = cursor_xinit1(mc, node, mp);
18399     if (unlikely(ret.err != MDBX_SUCCESS))
18400       return ret;
18401     if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) {
18402       ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
18403       if (unlikely(ret.err != MDBX_SUCCESS))
18404         return ret;
18405     } else {
18406       ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE);
18407       if (unlikely(ret.err != MDBX_SUCCESS))
18408         return ret;
18409       if (op == MDBX_GET_BOTH && !ret.exact) {
18410         ret.err = MDBX_NOTFOUND;
18411         return ret;
18412       }
18413     }
18414   } else if (likely(data)) {
18415     if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) {
18416       if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
18417                    data->iov_len > mc->mc_dbx->md_vlen_max)) {
18418         cASSERT(mc, !"Invalid data-size");
18419         ret.err = MDBX_BAD_VALSIZE;
18420         return ret;
18421       }
18422       MDBX_val aligned_data = *data;
18423       uint64_t aligned_databytes;
18424       if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
18425         switch (aligned_data.iov_len) {
18426         default:
18427           cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP");
18428           ret.err = MDBX_BAD_VALSIZE;
18429           return ret;
18430         case 4:
18431           if (unlikely(3 & (uintptr_t)aligned_data.iov_base))
18432             /* copy instead of returning an error, to avoid breaking compatibility */
18433             aligned_data.iov_base =
18434                 memcpy(&aligned_databytes, aligned_data.iov_base, 4);
18435           break;
18436         case 8:
18437           if (unlikely(7 & (uintptr_t)aligned_data.iov_base))
18438             /* copy instead of returning an error, to avoid breaking compatibility */
18439             aligned_data.iov_base =
18440                 memcpy(&aligned_databytes, aligned_data.iov_base, 8);
18441           break;
18442         }
18443       }
18444       MDBX_val actual_data;
18445       ret.err = node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]);
18446       if (unlikely(ret.err != MDBX_SUCCESS))
18447         return ret;
18448       const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data);
18449       if (cmp) {
18450         cASSERT(mc,
18451                 mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
18452                     (mc->mc_flags & C_EOF));
18453         if (op != MDBX_GET_BOTH_RANGE || cmp > 0) {
18454           ret.err = MDBX_NOTFOUND;
18455           return ret;
18456         }
18457       }
18458       *data =
actual_data; 18459 } else { 18460 ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); 18461 if (unlikely(ret.err != MDBX_SUCCESS)) 18462 return ret; 18463 } 18464 } 18465 18466 /* The key already matches in all other cases */ 18467 if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) 18468 get_key_optional(node, key); 18469 18470 DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), 18471 DVAL_DEBUG(data)); 18472 ret.err = MDBX_SUCCESS; 18473 return ret; 18474 } 18475 18476 /* Move the cursor to the first item in the database. */ 18477 static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { 18478 int rc; 18479 18480 if (mc->mc_xcursor) 18481 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); 18482 18483 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 18484 rc = page_search(mc, NULL, MDBX_PS_FIRST); 18485 if (unlikely(rc != MDBX_SUCCESS)) 18486 return rc; 18487 } 18488 18489 const MDBX_page *mp = mc->mc_pg[mc->mc_top]; 18490 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18491 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18492 mp->mp_pgno, mp->mp_flags); 18493 return MDBX_CORRUPTED; 18494 } 18495 18496 mc->mc_flags |= C_INITIALIZED; 18497 mc->mc_flags &= ~C_EOF; 18498 mc->mc_ki[mc->mc_top] = 0; 18499 18500 if (IS_LEAF2(mp)) { 18501 if (likely(key)) { 18502 key->iov_len = mc->mc_db->md_xsize; 18503 key->iov_base = page_leaf2key(mp, 0, key->iov_len); 18504 } 18505 return MDBX_SUCCESS; 18506 } 18507 18508 MDBX_node *node = page_node(mp, 0); 18509 if (node_flags(node) & F_DUPDATA) { 18510 rc = cursor_xinit1(mc, node, mp); 18511 if (unlikely(rc != MDBX_SUCCESS)) 18512 return rc; 18513 rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 18514 if (unlikely(rc)) 18515 return rc; 18516 } else if (likely(data)) { 18517 rc = node_read(mc, node, data, mp); 18518 if (unlikely(rc != MDBX_SUCCESS)) 18519 return rc; 18520 } 18521 18522 get_key_optional(node, key); 18523 return MDBX_SUCCESS; 18524 } 18525 18526 /* Move the cursor to the last item in the database. 
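 *
 * Reached via the public API as mdbx_cursor_get(..., MDBX_LAST). A hedged
 * caller-side sketch (error handling elided):
 *
 *   MDBX_val key, data;
 *   int err = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST);
 *   // MDBX_NOTFOUND on an empty tree, MDBX_SUCCESS otherwise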
*/ 18527 static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { 18528 int rc; 18529 18530 if (mc->mc_xcursor) 18531 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); 18532 18533 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 18534 rc = page_search(mc, NULL, MDBX_PS_LAST); 18535 if (unlikely(rc != MDBX_SUCCESS)) 18536 return rc; 18537 } 18538 18539 const MDBX_page *mp = mc->mc_pg[mc->mc_top]; 18540 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18541 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18542 mp->mp_pgno, mp->mp_flags); 18543 return MDBX_CORRUPTED; 18544 } 18545 18546 mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; 18547 mc->mc_flags |= C_INITIALIZED | C_EOF; 18548 18549 if (IS_LEAF2(mp)) { 18550 if (likely(key)) { 18551 key->iov_len = mc->mc_db->md_xsize; 18552 key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); 18553 } 18554 return MDBX_SUCCESS; 18555 } 18556 18557 MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); 18558 if (node_flags(node) & F_DUPDATA) { 18559 rc = cursor_xinit1(mc, node, mp); 18560 if (unlikely(rc != MDBX_SUCCESS)) 18561 return rc; 18562 rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 18563 if (unlikely(rc)) 18564 return rc; 18565 } else if (likely(data)) { 18566 rc = node_read(mc, node, data, mp); 18567 if (unlikely(rc != MDBX_SUCCESS)) 18568 return rc; 18569 } 18570 18571 get_key_optional(node, key); 18572 return MDBX_SUCCESS; 18573 } 18574 18575 __hot int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, 18576 MDBX_cursor_op op) { 18577 if (unlikely(mc == NULL)) 18578 return MDBX_EINVAL; 18579 18580 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 18581 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL 18582 : MDBX_EBADSIGN; 18583 18584 int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); 18585 if (unlikely(rc != MDBX_SUCCESS)) 18586 return rc; 18587 18588 int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); 18589 switch (op) { 18590 case MDBX_GET_CURRENT: { 18591 if (unlikely(!(mc->mc_flags & C_INITIALIZED))) 18592 return MDBX_ENODATA; 18593 const MDBX_page *mp = mc->mc_pg[mc->mc_top]; 18594 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18595 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18596 mp->mp_pgno, mp->mp_flags); 18597 return MDBX_CORRUPTED; 18598 } 18599 const unsigned nkeys = page_numkeys(mp); 18600 if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { 18601 cASSERT(mc, nkeys <= UINT16_MAX); 18602 if (mc->mc_flags & C_EOF) 18603 return MDBX_ENODATA; 18604 mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; 18605 mc->mc_flags |= C_EOF; 18606 return MDBX_NOTFOUND; 18607 } 18608 cASSERT(mc, nkeys > 0); 18609 18610 rc = MDBX_SUCCESS; 18611 if (IS_LEAF2(mp)) { 18612 key->iov_len = mc->mc_db->md_xsize; 18613 key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); 18614 } else { 18615 MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); 18616 get_key_optional(node, key); 18617 if (data) { 18618 if (node_flags(node) & F_DUPDATA) { 18619 if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { 18620 rc = cursor_xinit1(mc, node, mp); 18621 if (unlikely(rc != MDBX_SUCCESS)) 18622 return rc; 18623 rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 18624 if (unlikely(rc)) 18625 return rc; 18626 } else { 18627 rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, 18628 MDBX_GET_CURRENT); 18629 if (unlikely(rc)) 18630 return rc; 18631 } 18632 } else { 18633 rc = node_read(mc, node, data, mp); 18634 if (unlikely(rc)) 18635 return rc; 18636 } 18637 } 18638 } 18639 break; 18640 } 18641 case MDBX_GET_BOTH: 18642 case MDBX_GET_BOTH_RANGE: 18643 if (unlikely(data == NULL)) 18644 return MDBX_EINVAL; 18645 if (unlikely(mc->mc_xcursor == NULL)) 18646 return MDBX_INCOMPATIBLE; 18647 /* fall through */ 18648 __fallthrough; 18649 case MDBX_SET: 18650 case MDBX_SET_KEY: 18651 case MDBX_SET_RANGE: 18652 if (unlikely(key == NULL)) 18653 return MDBX_EINVAL; 18654 rc = cursor_set(mc, key, data, op).err; 18655 if (mc->mc_flags & C_INITIALIZED) { 18656 cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); 18657 cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || 18658 (mc->mc_flags & C_EOF)); 18659 } 18660 break; 18661 case MDBX_GET_MULTIPLE: 18662 if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) 18663 return MDBX_EINVAL; 18664 if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) 18665 return MDBX_INCOMPATIBLE; 18666 rc = MDBX_SUCCESS; 18667 if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) != 18668 C_INITIALIZED) 18669 break; 18670 goto fetchm; 18671 case MDBX_NEXT_MULTIPLE: 18672 if (unlikely(data == NULL)) 18673 return MDBX_EINVAL; 18674 if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) 18675 return MDBX_INCOMPATIBLE; 18676 rc = cursor_next(mc, key, data, MDBX_NEXT_DUP); 18677 if (rc == MDBX_SUCCESS) { 18678 if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 18679 MDBX_cursor *mx; 18680 fetchm: 18681 mx = &mc->mc_xcursor->mx_cursor; 18682 data->iov_len = 18683 page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; 18684 data->iov_base = page_data(mx->mc_pg[mx->mc_top]); 18685 mx->mc_ki[mx->mc_top] = 
(indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1;
18686 } else {
18687 rc = MDBX_NOTFOUND;
18688 }
18689 }
18690 break;
18691 case MDBX_PREV_MULTIPLE:
18692 if (data == NULL)
18693 return MDBX_EINVAL;
18694 if (!(mc->mc_db->md_flags & MDBX_DUPFIXED))
18695 return MDBX_INCOMPATIBLE;
18696 rc = MDBX_SUCCESS;
18697 if (!(mc->mc_flags & C_INITIALIZED))
18698 rc = cursor_last(mc, key, data);
18699 if (rc == MDBX_SUCCESS) {
18700 MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor;
18701 if (mx->mc_flags & C_INITIALIZED) {
18702 rc = cursor_sibling(mx, SIBLING_LEFT);
18703 if (rc == MDBX_SUCCESS)
18704 goto fetchm;
18705 } else {
18706 rc = MDBX_NOTFOUND;
18707 }
18708 }
18709 break;
18710 case MDBX_NEXT:
18711 case MDBX_NEXT_DUP:
18712 case MDBX_NEXT_NODUP:
18713 rc = cursor_next(mc, key, data, op);
18714 break;
18715 case MDBX_PREV:
18716 case MDBX_PREV_DUP:
18717 case MDBX_PREV_NODUP:
18718 rc = cursor_prev(mc, key, data, op);
18719 break;
18720 case MDBX_FIRST:
18721 rc = cursor_first(mc, key, data);
18722 break;
18723 case MDBX_FIRST_DUP:
18724 mfunc = cursor_first;
18725 move:
18726 if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED)))
18727 return MDBX_EINVAL;
18728 if (unlikely(mc->mc_xcursor == NULL))
18729 return MDBX_INCOMPATIBLE;
18730 if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) {
18731 mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]);
18732 mc->mc_flags |= C_EOF;
18733 return MDBX_NOTFOUND;
18734 }
18735 {
18736 MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
18737 if (!(node_flags(node) & F_DUPDATA)) {
18738 get_key_optional(node, key);
18739 rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]);
18740 break;
18741 }
18742 }
18743 if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)))
18744 return MDBX_EINVAL;
18745 rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
18746 break;
18747 case MDBX_LAST:
18748 rc = cursor_last(mc, key, data);
18749 break;
18750 case MDBX_LAST_DUP:
18751 mfunc = cursor_last;
18752 goto move;
18753 case MDBX_SET_UPPERBOUND: /* mostly the same as MDBX_SET_LOWERBOUND */
18754 case MDBX_SET_LOWERBOUND: {
18755 if (unlikely(key == NULL || data == NULL))
18756 return MDBX_EINVAL;
18757 MDBX_val save_data = *data;
18758 struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE);
18759 rc = csr.err;
18760 if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) {
18761 mc->mc_flags &= ~C_DEL;
18762 csr.exact = false;
18763 if (!save_data.iov_base && (mc->mc_db->md_flags & MDBX_DUPFIXED)) {
18764 /* Avoid searching the nested dupfixed hive if no data is provided.
18765 * This changes the semantics of MDBX_SET_LOWERBOUND, but avoids
18766 * returning MDBX_BAD_VALSIZE. */
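/* An illustrative sketch (not part of the library), assuming an open
 * cursor 'cur' over a dupfixed table: passing an empty data item yields
 * key-only lower-bound positioning, e.g.
 *
 *   MDBX_val k = {(void *)"foo", 3}, v = {NULL, 0};
 *   int err = mdbx_cursor_get(cur, &k, &v, MDBX_SET_LOWERBOUND);
 *   // err == MDBX_SUCCESS on an exact match, MDBX_RESULT_TRUE when
 *   // positioned at the next greater pair, MDBX_NOTFOUND past the end.
 */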
18767 } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
18768 *data = save_data;
18769 csr =
18770 cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE);
18771 rc = csr.err;
18772 if (rc == MDBX_NOTFOUND) {
18773 cASSERT(mc, !csr.exact);
18774 rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP);
18775 }
18776 } else {
18777 int cmp = mc->mc_dbx->md_dcmp(&save_data, data);
18778 csr.exact = (cmp == 0);
18779 if (cmp > 0)
18780 rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP);
18781 }
18782 }
18783 if (rc == MDBX_SUCCESS && !csr.exact)
18784 rc = MDBX_RESULT_TRUE;
18785 if (unlikely(op == MDBX_SET_UPPERBOUND)) {
18786 /* minor fixups for MDBX_SET_UPPERBOUND */
18787 if (rc == MDBX_RESULT_TRUE)
18788 /* already at greater-than by MDBX_SET_LOWERBOUND */
18789 rc = MDBX_SUCCESS;
18790 else if (rc == MDBX_SUCCESS)
18791 /* exact match, go to the next pair */
18792 rc = cursor_next(mc, key, data, MDBX_NEXT);
18793 }
18794 break;
18795 }
18796 default:
18797 DEBUG("unhandled/unimplemented cursor operation %u", op);
18798 return MDBX_EINVAL;
18799 }
18800
18801 mc->mc_flags &= ~C_DEL;
18802 return rc;
18803 }
18804
18805 static int cursor_first_batch(MDBX_cursor *mc) {
18806 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
18807 int err = page_search(mc, NULL, MDBX_PS_FIRST);
18808 if (unlikely(err != MDBX_SUCCESS))
18809 return err;
18810 }
18811 cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
18812
18813 mc->mc_flags |= C_INITIALIZED;
18814 mc->mc_flags &= ~C_EOF;
18815 mc->mc_ki[mc->mc_top] = 0;
18816 return MDBX_SUCCESS;
18817 }
18818
18819 static int cursor_next_batch(MDBX_cursor *mc) {
18820 if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
18821 return cursor_first_batch(mc);
18822
18823 MDBX_page *mp = mc->mc_pg[mc->mc_top];
18824 if (unlikely(mc->mc_flags & C_EOF)) {
18825 if ((unsigned)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp))
18826 return MDBX_NOTFOUND;
18827 mc->mc_flags ^= C_EOF;
18828 }
18829
18830 int ki = mc->mc_ki[mc->mc_top];
18831 mc->mc_ki[mc->mc_top] = (indx_t)++ki;
18832 const int numkeys = page_numkeys(mp);
18833 if (likely(ki >= numkeys)) {
18834 DEBUG("%s", "=====> move to next sibling page");
18835 mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1);
18836 int err = cursor_sibling(mc, SIBLING_RIGHT);
18837 if (unlikely(err != MDBX_SUCCESS)) {
18838 mc->mc_flags |= C_EOF;
18839 return err;
18840 }
18841 mp = mc->mc_pg[mc->mc_top];
18842 DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
18843 mc->mc_ki[mc->mc_top]);
18844 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
18845 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
18846 mp->mp_pgno, mp->mp_flags);
18847 return MDBX_CORRUPTED;
18848 }
18849 }
18850 return MDBX_SUCCESS;
18851 }
18852
18853 int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs,
18854 size_t limit, MDBX_cursor_op op) {
18855 if (unlikely(mc == NULL || count == NULL || limit < 4))
18856 return MDBX_EINVAL;
18857
18858 if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
18859 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ?
MDBX_EINVAL 18860 : MDBX_EBADSIGN; 18861 18862 int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); 18863 if (unlikely(rc != MDBX_SUCCESS)) 18864 return rc; 18865 18866 if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) 18867 return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */; 18868 18869 switch (op) { 18870 case MDBX_FIRST: 18871 rc = cursor_first_batch(mc); 18872 break; 18873 case MDBX_NEXT: 18874 rc = cursor_next_batch(mc); 18875 break; 18876 case MDBX_GET_CURRENT: 18877 rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA; 18878 break; 18879 default: 18880 DEBUG("unhandled/unimplemented cursor operation %u", op); 18881 rc = MDBX_EINVAL; 18882 break; 18883 } 18884 18885 if (unlikely(rc != MDBX_SUCCESS)) { 18886 *count = 0; 18887 return rc; 18888 } 18889 18890 const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; 18891 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 18892 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 18893 mp->mp_pgno, mp->mp_flags); 18894 return MDBX_CORRUPTED; 18895 } 18896 const unsigned nkeys = page_numkeys(mp); 18897 unsigned i = mc->mc_ki[mc->mc_top], n = 0; 18898 if (unlikely(i >= nkeys)) { 18899 cASSERT(mc, op == MDBX_GET_CURRENT); 18900 cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); 18901 *count = 0; 18902 if (mc->mc_flags & C_EOF) { 18903 cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); 18904 return MDBX_ENODATA; 18905 } 18906 if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) 18907 return MDBX_EINVAL /* again MDBX_GET_CURRENT after MDBX_GET_CURRENT */; 18908 mc->mc_flags |= C_EOF; 18909 return MDBX_NOTFOUND; 18910 } 18911 18912 do { 18913 if (unlikely(n + 2 > limit)) { 18914 rc = MDBX_RESULT_TRUE; 18915 break; 18916 } 18917 const MDBX_node *leaf = page_node(mp, i); 18918 get_key(leaf, &pairs[n]); 18919 rc = node_read(mc, leaf, &pairs[n + 1], mp); 18920 if (unlikely(rc != MDBX_SUCCESS)) 18921 break; 18922 n += 2; 18923 } while (++i < nkeys); 18924 18925 mc->mc_ki[mc->mc_top] = (indx_t)i; 18926 *count = n; 18927 return rc; 18928 } 18929 18930 static int touch_dbi(MDBX_cursor *mc) { 18931 cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); 18932 *mc->mc_dbistate |= DBI_DIRTY; 18933 mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; 18934 if (mc->mc_dbi >= CORE_DBS) { 18935 cASSERT(mc, (mc->mc_flags & C_RECLAIMING) == 0); 18936 /* Touch DB record of named DB */ 18937 MDBX_cursor_couple cx; 18938 int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); 18939 if (unlikely(rc != MDBX_SUCCESS)) 18940 return rc; 18941 mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; 18942 rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); 18943 if (unlikely(rc != MDBX_SUCCESS)) 18944 return rc; 18945 } 18946 return MDBX_SUCCESS; 18947 } 18948 18949 /* Touch all the pages in the cursor stack. Set mc_top. 18950 * Makes sure all the pages are writable, before attempting a write operation. 18951 * [in] mc The cursor to operate on. 
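* Here "touch" means copy-on-write: page_touch() is applied from the root
* down to the current leaf, so each page on the path is replaced by a
* dirty, writable copy owned by the current transaction.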
*/ 18952 static int cursor_touch(MDBX_cursor *mc) { 18953 int rc = MDBX_SUCCESS; 18954 if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { 18955 rc = touch_dbi(mc); 18956 if (unlikely(rc != MDBX_SUCCESS)) 18957 return rc; 18958 } 18959 if (likely(mc->mc_snum)) { 18960 mc->mc_top = 0; 18961 do { 18962 rc = page_touch(mc); 18963 } while (!rc && ++(mc->mc_top) < mc->mc_snum); 18964 mc->mc_top = mc->mc_snum - 1; 18965 } 18966 return rc; 18967 } 18968 18969 __hot int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, 18970 unsigned flags) { 18971 MDBX_env *env; 18972 MDBX_page *sub_root = NULL; 18973 MDBX_val xdata, *rdata, dkey, olddata; 18974 MDBX_db nested_dupdb; 18975 int err; 18976 DKBUF_DEBUG; 18977 18978 if (unlikely(mc == NULL || key == NULL || data == NULL)) 18979 return MDBX_EINVAL; 18980 18981 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 18982 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 18983 : MDBX_EBADSIGN; 18984 18985 int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); 18986 if (unlikely(rc != MDBX_SUCCESS)) 18987 return rc; 18988 18989 if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) 18990 return MDBX_BAD_DBI; 18991 18992 cASSERT(mc, cursor_is_tracked(mc)); 18993 env = mc->mc_txn->mt_env; 18994 18995 /* Check this first so counter will always be zero on any early failures. */ 18996 size_t mcount = 0, dcount = 0; 18997 if (unlikely(flags & MDBX_MULTIPLE)) { 18998 if (unlikely(flags & MDBX_RESERVE)) 18999 return MDBX_EINVAL; 19000 if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) 19001 return MDBX_INCOMPATIBLE; 19002 dcount = data[1].iov_len; 19003 if (unlikely(dcount < 2 || data->iov_len == 0)) 19004 return MDBX_BAD_VALSIZE; 19005 if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) 19006 return MDBX_BAD_VALSIZE; 19007 if (unlikely(dcount > MAX_MAPSIZE / 2 / 19008 (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { 19009 /* checking for multiplication overflow */ 19010 if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) 19011 return MDBX_TOO_LARGE; 19012 } 19013 data[1].iov_len = 0 /* reset done item counter */; 19014 } 19015 19016 if (flags & MDBX_RESERVE) { 19017 if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | 19018 MDBX_INTEGERDUP | MDBX_DUPFIXED))) 19019 return MDBX_INCOMPATIBLE; 19020 data->iov_base = nullptr; 19021 } 19022 19023 const unsigned nospill = flags & MDBX_NOSPILL; 19024 flags -= nospill; 19025 19026 if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) 19027 return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS
19028 : MDBX_BAD_TXN;
19029
19030 uint64_t aligned_keybytes, aligned_databytes;
19031 MDBX_val aligned_key, aligned_data;
19032 if (likely((mc->mc_flags & C_SUB) == 0)) {
19033 if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
19034 key->iov_len > mc->mc_dbx->md_klen_max)) {
19035 cASSERT(mc, !"Invalid key-size");
19036 return MDBX_BAD_VALSIZE;
19037 }
19038 if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
19039 data->iov_len > mc->mc_dbx->md_vlen_max)) {
19040 cASSERT(mc, !"Invalid data-size");
19041 return MDBX_BAD_VALSIZE;
19042 }
19043
19044 if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
19045 switch (key->iov_len) {
19046 default:
19047 cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY");
19048 return MDBX_BAD_VALSIZE;
19049 case 4:
19050 if (unlikely(3 & (uintptr_t)key->iov_base)) {
19051 /* copy instead of returning an error, to avoid breaking compatibility */
19052 aligned_key.iov_base =
19053 memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4);
19054 key = &aligned_key;
19055 }
19056 break;
19057 case 8:
19058 if (unlikely(7 & (uintptr_t)key->iov_base)) {
19059 /* copy instead of returning an error, to avoid breaking compatibility */
19060 aligned_key.iov_base =
19061 memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8);
19062 key = &aligned_key;
19063 }
19064 break;
19065 }
19066 }
19067 if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
19068 switch (data->iov_len) {
19069 default:
19070 cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP");
19071 return MDBX_BAD_VALSIZE;
19072 case 4:
19073 if (unlikely(3 & (uintptr_t)data->iov_base)) {
19074 if (unlikely(flags & MDBX_MULTIPLE))
19075 return MDBX_BAD_VALSIZE;
19076 /* copy instead of returning an error, to avoid breaking compatibility */
19077 aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
19078 aligned_data.iov_len = 4);
19079 data = &aligned_data;
19080 }
19081 break;
19082 case 8:
19083 if (unlikely(7 & (uintptr_t)data->iov_base)) {
19084 if (unlikely(flags & MDBX_MULTIPLE))
19085 return MDBX_BAD_VALSIZE;
19086 /* copy instead of returning an error, to avoid breaking compatibility */
19087 aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
19088 aligned_data.iov_len = 8);
19089 data = &aligned_data;
19090 }
19091 break;
19092 }
19093 }
19094 }
19095
19096 DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR,
19097 DDBI(mc), DKEY_DEBUG(key), key->iov_len,
19098 DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len);
19099
19100 int dupdata_flag = 0;
19101 if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) {
19102 if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE)))
19103 return MDBX_EINVAL;
19104 /* The MDBX_CURRENT option means that an update of the current record,
19105 * i.e. the one the cursor is currently positioned at, is requested.
19106 * Check that the supplied key matches the value at the current cursor
19107 * position. It is simpler to call mdbx_cursor_get() here, since serving
19108 * tables with MDBX_DUPSORT also requires the current data size. */
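/* An illustrative sketch (not part of the library) of such an in-place
 * update, assuming an open cursor 'cur' and a caller-provided buffer
 * 'buf' of length 'len':
 *
 *   MDBX_val k, v;
 *   int err = mdbx_cursor_get(cur, &k, &v, MDBX_GET_CURRENT);
 *   if (err == MDBX_SUCCESS) {
 *     MDBX_val nv = {buf, len};
 *     err = mdbx_cursor_put(cur, &k, &nv, MDBX_CURRENT);
 *   }
 */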
19109 MDBX_val current_key, current_data;
19110 rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT);
19111 if (unlikely(rc != MDBX_SUCCESS))
19112 return rc;
19113 if (mc->mc_dbx->md_cmp(key, &current_key) != 0)
19114 return MDBX_EKEYMISMATCH;
19115
19116 if (unlikely((flags & MDBX_MULTIPLE)))
19117 goto drop_current;
19118
19119 if (mc->mc_db->md_flags & MDBX_DUPSORT) {
19120 MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
19121 if (node_flags(node) & F_DUPDATA) {
19122 cASSERT(mc, mc->mc_xcursor != NULL &&
19123 (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
19124 /* If the key holds more than one value, or if the data size
19125 * differs, then instead of an update a deletion followed by
19126 * an insertion is required. */
19127 if (mc->mc_xcursor->mx_db.md_entries > 1 ||
19128 current_data.iov_len != data->iov_len) {
19129 drop_current:
19130 rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS);
19131 if (unlikely(rc != MDBX_SUCCESS))
19132 return rc;
19133 flags -= MDBX_CURRENT;
19134 goto skip_check_samedata;
19135 }
19136 } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) {
19137 rc = mdbx_cursor_del(mc, 0);
19138 if (unlikely(rc != MDBX_SUCCESS))
19139 return rc;
19140 flags -= MDBX_CURRENT;
19141 goto skip_check_samedata;
19142 }
19143 }
19144 if (!(flags & MDBX_RESERVE) &&
19145 unlikely(cmp_lenfast(&current_data, data) == 0))
19146 return MDBX_SUCCESS /* the same data, nothing to update */;
19147 skip_check_samedata:;
19148 }
19149
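/* An illustrative sketch (not part of the library) of the MDBX_APPEND
 * fast-path handled just below, with 'cur', 'keys', 'vals' and 'count'
 * as caller-side placeholders and the keys supplied in ascending order:
 *
 *   for (size_t n = 0; n < count; ++n) {
 *     MDBX_val k = keys[n], v = vals[n];
 *     int err = mdbx_cursor_put(cur, &k, &v, MDBX_APPEND);
 *     if (err != MDBX_SUCCESS)
 *       break;  // MDBX_EKEYMISMATCH when the ordering is violated
 *   }
 */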
19150 if (mc->mc_db->md_root == P_INVALID) {
19151 /* new database, cursor has nothing to point to */
19152 mc->mc_snum = 0;
19153 mc->mc_top = 0;
19154 mc->mc_flags &= ~C_INITIALIZED;
19155 rc = MDBX_NO_ROOT;
19156 } else if ((flags & MDBX_CURRENT) == 0) {
19157 bool exact = false;
19158 if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) {
19159 rc = cursor_last(mc, &dkey, &olddata);
19160 if (likely(rc == MDBX_SUCCESS)) {
19161 rc = mc->mc_dbx->md_cmp(key, &dkey);
19162 if (likely(rc > 0)) {
19163 mc->mc_ki[mc->mc_top]++; /* step forward for appending */
19164 rc = MDBX_NOTFOUND;
19165 } else {
19166 if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP)))
19167 /* new-key < last-key
19168 * or new-key == last-key without MDBX_APPENDDUP */
19169 return MDBX_EKEYMISMATCH;
19170 exact = true;
19171 }
19172 }
19173 } else {
19174 struct cursor_set_result csr =
19175 /* olddata may not be updated in the case of a LEAF2-page of a dupfixed subDB */
19176 cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET);
19177 rc = csr.err;
19178 exact = csr.exact;
19179 }
19180 if (likely(rc == MDBX_SUCCESS)) {
19181 if (exact) {
19182 if (unlikely(flags & MDBX_NOOVERWRITE)) {
19183 DEBUG("duplicate key [%s]", DKEY_DEBUG(key));
19184 *data = olddata;
19185 return MDBX_KEYEXIST;
19186 }
19187 if (unlikely(mc->mc_flags & C_SUB)) {
19188 /* nested subtree of DUPSORT-database with the same key,
19189 * nothing to update */
19190 eASSERT(env, data->iov_len == 0 &&
19191 (olddata.iov_len == 0 ||
19192 /* olddata may not be updated in the case of a
19193 LEAF2-page of a dupfixed subDB */
19194 (mc->mc_db->md_flags & MDBX_DUPFIXED)));
19195 return MDBX_SUCCESS;
19196 }
19197 if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor &&
19198 (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
19199 rc = mdbx_cursor_del(mc, MDBX_ALLDUPS);
19200 if (unlikely(rc != MDBX_SUCCESS))
19201 return rc;
19202 flags -= MDBX_ALLDUPS;
19203 rc = MDBX_NOTFOUND;
19204 exact = false;
19205 } else /* checking for early exit without dirtying pages */
19206 if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) &&
19207 unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) {
19208 if (!mc->mc_xcursor)
19209 /* the same data, nothing to update */
19210 return MDBX_SUCCESS;
19211 if (flags & MDBX_NODUPDATA)
19212 return MDBX_KEYEXIST;
19213 if (flags & MDBX_APPENDDUP)
19214 return MDBX_EKEYMISMATCH;
19215 if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata)))
19216 /* data matches exactly byte-to-byte, nothing to update */
19217 return MDBX_SUCCESS;
19218 else {
19219 /* The data differs, but the user-provided comparator
19220 * considers it equal. Continue the update, since we were
19221 * called without MDBX_NODUPDATA. */
19222 }
19223 }
19224 }
19225 } else if (unlikely(rc != MDBX_NOTFOUND))
19226 return rc;
19227 }
19228
19229 mc->mc_flags &= ~C_DEL;
19230
19231 /* Cursor is positioned, check for room in the dirty list */
19232 if (!nospill) {
19233 rdata = data;
19234 if (unlikely(flags & MDBX_MULTIPLE)) {
19235 rdata = &xdata;
19236 xdata.iov_len = data->iov_len * dcount;
19237 }
19238 if (unlikely(err = cursor_spill(mc, key, rdata)))
19239 return err;
19240 }
19241
19242 if (unlikely(rc == MDBX_NO_ROOT)) {
19243 /* new database, write a root leaf page */
19244 DEBUG("%s", "allocating new root leaf page");
19245 if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) {
19246 err = touch_dbi(mc);
19247 if (unlikely(err != MDBX_SUCCESS))
19248 return err;
19249 }
19250 pgr_t npr = page_new(mc, P_LEAF);
19251 if (unlikely(npr.err != MDBX_SUCCESS))
19252 return npr.err;
19253 npr.err = cursor_push(mc, npr.page);
19254 if (unlikely(npr.err != MDBX_SUCCESS))
19255 return npr.err;
19256 mc->mc_db->md_root = npr.page->mp_pgno;
19257 mc->mc_db->md_depth++;
19258 if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
19259 assert(key->iov_len >= mc->mc_dbx->md_klen_min &&
19260 key->iov_len <= mc->mc_dbx->md_klen_max);
19261 mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = key->iov_len;
19262 }
19263 if (mc->mc_db->md_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) {
19264 assert(data->iov_len >= mc->mc_dbx->md_vlen_min &&
19265 data->iov_len <= mc->mc_dbx->md_vlen_max);
19266 assert(mc->mc_xcursor != NULL);
19267 mc->mc_db->md_xsize = mc->mc_xcursor->mx_db.md_xsize =
19268 (unsigned)(mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max =
19269 mc->mc_xcursor->mx_dbx.md_klen_min =
19270 mc->mc_xcursor->mx_dbx.md_klen_max =
19271 data->iov_len);
19272 }
19273 if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED)
19274 npr.page->mp_flags |= P_LEAF2;
19275 mc->mc_flags |= C_INITIALIZED;
19276 } else {
19277 /* make sure all cursor pages are writable */
19278 err = cursor_touch(mc);
19279 if (unlikely(err))
19280 return err;
19281 }
19282
19283 bool insert_key, insert_data, do_sub = false;
19284 insert_key = insert_data = (rc != MDBX_SUCCESS);
19285 uint16_t fp_flags = P_LEAF;
19286 MDBX_page *fp = env->me_pbuf;
19287 fp->mp_txnid = mc->mc_txn->mt_front;
19288 if (insert_key) {
19289 /* The key does not exist */
19290 DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]);
19291 if ((mc->mc_db->md_flags & MDBX_DUPSORT) &&
19292 node_size(key, data) > env->me_leaf_nodemax) {
19293 /* Too big for a node, insert in sub-DB. Set up an empty
19294 * "old sub-page" for prep_subDB to expand to a full page. */
19295 fp->mp_leaf2_ksize =
19296 (mc->mc_db->md_flags & MDBX_DUPFIXED) ?
(uint16_t)data->iov_len : 0; 19297 fp->mp_lower = fp->mp_upper = 0; 19298 olddata.iov_len = PAGEHDRSZ; 19299 goto prep_subDB; 19300 } 19301 } else { 19302 /* there's only a key anyway, so this is a no-op */ 19303 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 19304 char *ptr; 19305 unsigned ksize = mc->mc_db->md_xsize; 19306 if (unlikely(key->iov_len != ksize)) 19307 return MDBX_BAD_VALSIZE; 19308 ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); 19309 memcpy(ptr, key->iov_base, ksize); 19310 fix_parent: 19311 /* if overwriting slot 0 of leaf, need to 19312 * update branch key if there is a parent page */ 19313 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 19314 unsigned dtop = 1; 19315 mc->mc_top--; 19316 /* slot 0 is always an empty key, find real slot */ 19317 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 19318 mc->mc_top--; 19319 dtop++; 19320 } 19321 err = MDBX_SUCCESS; 19322 if (mc->mc_ki[mc->mc_top]) 19323 err = update_key(mc, key); 19324 cASSERT(mc, mc->mc_top + dtop < UINT16_MAX); 19325 mc->mc_top += (uint8_t)dtop; 19326 if (unlikely(err != MDBX_SUCCESS)) 19327 return err; 19328 } 19329 19330 if (AUDIT_ENABLED()) { 19331 err = cursor_check(mc); 19332 if (unlikely(err != MDBX_SUCCESS)) 19333 return err; 19334 } 19335 return MDBX_SUCCESS; 19336 } 19337 19338 more:; 19339 if (AUDIT_ENABLED()) { 19340 err = cursor_check(mc); 19341 if (unlikely(err != MDBX_SUCCESS)) 19342 return err; 19343 } 19344 MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 19345 19346 /* Large/Overflow page overwrites need special handling */ 19347 if (unlikely(node_flags(node) & F_BIGDATA)) { 19348 int dpages = (node_size(key, data) > env->me_leaf_nodemax) 19349 ? number_of_ovpages(env, data->iov_len) 19350 : 0; 19351 19352 const pgno_t pgno = node_largedata_pgno(node); 19353 pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); 19354 if (unlikely(lp.err != MDBX_SUCCESS)) 19355 return lp.err; 19356 cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); 19357 19358 /* Is the ov page from this txn (or a parent) and big enough? */ 19359 int ovpages = lp.page->mp_pages; 19360 if (!IS_FROZEN(mc->mc_txn, lp.page) && 19361 (unlikely(mc->mc_flags & C_GCFREEZE) 19362 ? (ovpages >= dpages) 19363 : (ovpages == 19364 /* LY: add configurable threshold to keep reserve space */ 19365 dpages))) { 19366 /* yes, overwrite it. 
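* (the overflow page is not frozen and its size fits: an exact page-count
* match in the ordinary case, or merely large enough when C_GCFREEZE is
* set)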
*/
19367 if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) {
19368 if (IS_SPILLED(mc->mc_txn, lp.page)) {
19369 lp = /* TODO: avoid search and get txn & spill-index from
19370 page_result */
19371 page_unspill(mc->mc_txn, lp.page);
19372 if (unlikely(lp.err))
19373 return lp.err;
19374 } else {
19375 if (unlikely(!mc->mc_txn->mt_parent)) {
19376 ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s "
19377 "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
19378 " without parent transaction, current txn %" PRIaTXN
19379 " front %" PRIaTXN,
19380 "overflow/large", pgno, lp.page->mp_txnid,
19381 mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
19382 return MDBX_PROBLEM;
19383 }
19384
19385 /* It is writable only in a parent txn */
19386 MDBX_page *np = page_malloc(mc->mc_txn, ovpages);
19387 if (unlikely(!np))
19388 return MDBX_ENOMEM;
19389
19390 memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */
19391 err = page_dirty(mc->mc_txn, lp.page = np, ovpages);
19392 if (unlikely(err != MDBX_SUCCESS))
19393 return err;
19394
19395 #if MDBX_ENABLE_PGOP_STAT
19396 mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages;
19397 #endif /* MDBX_ENABLE_PGOP_STAT */
19398 cASSERT(mc, dirtylist_check(mc->mc_txn));
19399 }
19400 }
19401 node_set_ds(node, data->iov_len);
19402 if (flags & MDBX_RESERVE)
19403 data->iov_base = page_data(lp.page);
19404 else
19405 memcpy(page_data(lp.page), data->iov_base, data->iov_len);
19406
19407 if (AUDIT_ENABLED()) {
19408 err = cursor_check(mc);
19409 if (unlikely(err != MDBX_SUCCESS))
19410 return err;
19411 }
19412 return MDBX_SUCCESS;
19413 }
19414
19415 if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS)
19416 return err;
19417 } else {
19418 olddata.iov_len = node_ds(node);
19419 olddata.iov_base = node_data(node);
19420 cASSERT(mc, (char *)olddata.iov_base + olddata.iov_len <=
19421 (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
19422
19423 /* DB has dups? */
19424 if (mc->mc_db->md_flags & MDBX_DUPSORT) {
19425 /* Prepare (sub-)page/sub-DB to accept the new item, if needed.
19426 * fp: old sub-page or a header faking it.
19427 * mp: new (sub-)page. offset: growth in page size.
19428 * xdata: node data with new page or DB. */
19429 unsigned i;
19430 size_t offset = 0;
19431 MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf;
19432 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
19433
19434 /* Was a single item before, must convert now */
19435 if (!(node_flags(node) & F_DUPDATA)) {
19436
19437 /* does data match? */
19438 const int cmp = mc->mc_dbx->md_dcmp(data, &olddata);
19439 if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0))
19440 return MDBX_EKEYMISMATCH;
19441 if (cmp == 0) {
19442 if (flags & MDBX_NODUPDATA)
19443 return MDBX_KEYEXIST;
19444 if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) {
19445 /* data matches exactly byte-to-byte, nothing to update */
19446 if (unlikely(flags & MDBX_MULTIPLE)) {
19447 rc = MDBX_SUCCESS;
19448 goto continue_multiple;
19449 }
19450 return MDBX_SUCCESS;
19451 } else {
19452 /* The data differs, but the user-provided comparator
19453 * considers it equal. Continue the update, since we were
19454 * called without MDBX_NODUPDATA.
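* (unsure_equal() appears to hold only for a byte-exact match; when a
* custom comparator reports equality for different bytes, the update
* proceeds so the stored bytes are refreshed)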
*/ 19455 } 19456 cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); 19457 goto current; 19458 } 19459 19460 /* Just overwrite the current item */ 19461 if (flags & MDBX_CURRENT) { 19462 cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); 19463 goto current; 19464 } 19465 19466 /* Back up original data item */ 19467 memcpy(dkey.iov_base = fp + 1, olddata.iov_base, 19468 dkey.iov_len = olddata.iov_len); 19469 dupdata_flag = 1; 19470 19471 /* Make sub-page header for the dup items, with dummy body */ 19472 fp->mp_flags = P_LEAF | P_SUBP; 19473 fp->mp_lower = 0; 19474 xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; 19475 if (mc->mc_db->md_flags & MDBX_DUPFIXED) { 19476 fp->mp_flags |= P_LEAF2; 19477 fp->mp_leaf2_ksize = (uint16_t)data->iov_len; 19478 xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ 19479 cASSERT(mc, xdata.iov_len <= env->me_psize); 19480 } else { 19481 xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + 19482 (dkey.iov_len & 1) + (data->iov_len & 1); 19483 cASSERT(mc, xdata.iov_len <= env->me_psize); 19484 } 19485 fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); 19486 olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ 19487 } else if (node_flags(node) & F_SUBDATA) { 19488 /* Data is on sub-DB, just store it */ 19489 flags |= F_DUPDATA | F_SUBDATA; 19490 goto put_sub; 19491 } else { 19492 /* Data is on sub-page */ 19493 fp = olddata.iov_base; 19494 switch (flags) { 19495 default: 19496 if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { 19497 offset = node_size(data, nullptr) + sizeof(indx_t); 19498 break; 19499 } 19500 offset = fp->mp_leaf2_ksize; 19501 if (page_room(fp) < offset) { 19502 offset *= 4; /* space for 4 more */ 19503 break; 19504 } 19505 /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ 19506 __fallthrough; 19507 case MDBX_CURRENT | MDBX_NODUPDATA: 19508 case MDBX_CURRENT: 19509 fp->mp_txnid = mc->mc_txn->mt_front; 19510 fp->mp_pgno = mp->mp_pgno; 19511 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 19512 flags |= F_DUPDATA; 19513 goto put_sub; 19514 } 19515 xdata.iov_len = olddata.iov_len + offset; 19516 } 19517 19518 fp_flags = fp->mp_flags; 19519 if (node_size_len(node_ks(node), xdata.iov_len) > 19520 env->me_leaf_nodemax) { 19521 /* Too big for a sub-page, convert to sub-DB */ 19522 fp_flags &= ~P_SUBP; 19523 prep_subDB: 19524 nested_dupdb.md_xsize = 0; 19525 nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags); 19526 if (mc->mc_db->md_flags & MDBX_DUPFIXED) { 19527 fp_flags |= P_LEAF2; 19528 nested_dupdb.md_xsize = fp->mp_leaf2_ksize; 19529 } 19530 nested_dupdb.md_depth = 1; 19531 nested_dupdb.md_branch_pages = 0; 19532 nested_dupdb.md_leaf_pages = 1; 19533 nested_dupdb.md_overflow_pages = 0; 19534 nested_dupdb.md_entries = page_numkeys(fp); 19535 xdata.iov_len = sizeof(nested_dupdb); 19536 xdata.iov_base = &nested_dupdb; 19537 const pgr_t par = page_alloc(mc); 19538 mp = par.page; 19539 if (unlikely(par.err != MDBX_SUCCESS)) 19540 return par.err; 19541 mc->mc_db->md_leaf_pages += 1; 19542 cASSERT(mc, env->me_psize > olddata.iov_len); 19543 offset = env->me_psize - (unsigned)olddata.iov_len; 19544 flags |= F_DUPDATA | F_SUBDATA; 19545 nested_dupdb.md_root = mp->mp_pgno; 19546 nested_dupdb.md_seq = 0; 19547 nested_dupdb.md_mod_txnid = mc->mc_txn->mt_txnid; 19548 sub_root = mp; 19549 } 19550 if (mp != fp) { 19551 mp->mp_flags = fp_flags; 19552 mp->mp_txnid = mc->mc_txn->mt_front; 19553 mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; 19554 mp->mp_lower = fp->mp_lower; 19555 cASSERT(mc, fp->mp_upper + offset <= 
UINT16_MAX); 19556 mp->mp_upper = (indx_t)(fp->mp_upper + offset); 19557 if (unlikely(fp_flags & P_LEAF2)) { 19558 memcpy(page_data(mp), page_data(fp), 19559 page_numkeys(fp) * fp->mp_leaf2_ksize); 19560 } else { 19561 memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, 19562 (char *)fp + fp->mp_upper + PAGEHDRSZ, 19563 olddata.iov_len - fp->mp_upper - PAGEHDRSZ); 19564 memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), 19565 page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); 19566 for (i = 0; i < page_numkeys(fp); i++) { 19567 cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); 19568 mp->mp_ptrs[i] += (indx_t)offset; 19569 } 19570 } 19571 } 19572 19573 rdata = &xdata; 19574 flags |= F_DUPDATA; 19575 do_sub = true; 19576 if (!insert_key) 19577 node_del(mc, 0); 19578 goto new_sub; 19579 } 19580 19581 /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ 19582 if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) 19583 return MDBX_INCOMPATIBLE; 19584 19585 current: 19586 if (data->iov_len == olddata.iov_len) { 19587 cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); 19588 /* same size, just replace it. Note that we could 19589 * also reuse this node if the new data is smaller, 19590 * but instead we opt to shrink the node in that case. */ 19591 if (flags & MDBX_RESERVE) 19592 data->iov_base = olddata.iov_base; 19593 else if (!(mc->mc_flags & C_SUB)) 19594 memcpy(olddata.iov_base, data->iov_base, data->iov_len); 19595 else { 19596 cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); 19597 cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); 19598 cASSERT(mc, node_ds(node) == 0); 19599 cASSERT(mc, node_flags(node) == 0); 19600 cASSERT(mc, key->iov_len < UINT16_MAX); 19601 node_set_ks(node, key->iov_len); 19602 memcpy(node_key(node), key->iov_base, key->iov_len); 19603 cASSERT(mc, (char *)node_key(node) + node_ds(node) < 19604 (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); 19605 goto fix_parent; 19606 } 19607 19608 if (AUDIT_ENABLED()) { 19609 err = cursor_check(mc); 19610 if (unlikely(err != MDBX_SUCCESS)) 19611 return err; 19612 } 19613 return MDBX_SUCCESS; 19614 } 19615 } 19616 node_del(mc, 0); 19617 } 19618 19619 rdata = data; 19620 19621 new_sub:; 19622 const unsigned naf = flags & NODE_ADD_FLAGS; 19623 size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len 19624 : leaf_size(env, key, rdata); 19625 if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { 19626 rc = page_split(mc, key, rdata, P_INVALID, 19627 insert_key ? naf : naf | MDBX_SPLIT_REPLACE); 19628 if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) 19629 rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); 19630 } else { 19631 /* There is room already in this leaf page. */ 19632 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 19633 cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && 19634 rdata->iov_len == 0); 19635 rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); 19636 } else 19637 rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); 19638 if (likely(rc == 0)) { 19639 /* Adjust other cursors pointing to mp */ 19640 const MDBX_dbi dbi = mc->mc_dbi; 19641 const unsigned i = mc->mc_top; 19642 MDBX_page *const mp = mc->mc_pg[i]; 19643 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; 19644 m2 = m2->mc_next) { 19645 MDBX_cursor *m3 = 19646 (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; 19647 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) 19648 continue; 19649 if (m3->mc_ki[i] >= mc->mc_ki[i]) 19650 m3->mc_ki[i] += insert_key; 19651 if (XCURSOR_INITED(m3)) 19652 XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); 19653 } 19654 } 19655 } 19656 19657 if (likely(rc == MDBX_SUCCESS)) { 19658 /* Now store the actual data in the child DB. Note that we're 19659 * storing the user data in the keys field, so there are strict 19660 * size limits on dupdata. The actual data fields of the child 19661 * DB are all zero size. */ 19662 if (do_sub) { 19663 int xflags; 19664 size_t ecount; 19665 put_sub: 19666 xdata.iov_len = 0; 19667 xdata.iov_base = nullptr; 19668 MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 19669 #define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 19670 STATIC_ASSERT( 19671 (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == 19672 MDBX_NOOVERWRITE); 19673 xflags = MDBX_CURRENT | MDBX_NOSPILL | 19674 ((flags & MDBX_NODUPDATA) >> 19675 SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); 19676 if ((flags & MDBX_CURRENT) == 0) { 19677 xflags -= MDBX_CURRENT; 19678 err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); 19679 if (unlikely(err != MDBX_SUCCESS)) 19680 return err; 19681 } 19682 if (sub_root) 19683 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; 19684 /* converted, write the original data first */ 19685 if (dupdata_flag) { 19686 rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); 19687 if (unlikely(rc)) 19688 goto bad_sub; 19689 /* we've done our job */ 19690 dkey.iov_len = 0; 19691 } 19692 if (!(node_flags(node) & F_SUBDATA) || sub_root) { 19693 /* Adjust other cursors pointing to mp */ 19694 MDBX_cursor *m2; 19695 MDBX_xcursor *mx = mc->mc_xcursor; 19696 unsigned i = mc->mc_top; 19697 MDBX_page *mp = mc->mc_pg[i]; 19698 const int nkeys = page_numkeys(mp); 19699 19700 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { 19701 if (m2 == mc || m2->mc_snum < mc->mc_snum) 19702 continue; 19703 if (!(m2->mc_flags & C_INITIALIZED)) 19704 continue; 19705 if (m2->mc_pg[i] == mp) { 19706 if (m2->mc_ki[i] == mc->mc_ki[i]) { 19707 err = cursor_xinit2(m2, mx, dupdata_flag); 19708 if (unlikely(err != MDBX_SUCCESS)) 19709 return err; 19710 } else if (!insert_key && m2->mc_ki[i] < nkeys) { 19711 XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); 19712 } 19713 } 19714 } 19715 } 19716 cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); 19717 ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; 19718 #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 19719 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == 19720 MDBX_APPEND); 19721 xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; 19722 rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); 19723 if (flags & F_SUBDATA) { 19724 void *db = node_data(node); 19725 mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; 19726 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); 19727 } 19728 insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); 19729 } 19730 /* Increment count unless we just replaced an existing item. */ 19731 if (insert_data) 19732 mc->mc_db->md_entries++; 19733 if (insert_key) { 19734 /* Invalidate txn if we created an empty sub-DB */ 19735 if (unlikely(rc)) 19736 goto bad_sub; 19737 /* If we succeeded and the key didn't exist before, 19738 * make sure the cursor is marked valid. 
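* (C_INITIALIZED is what MDBX_GET_CURRENT and the relative moves check,
* so without it the freshly inserted pair would be invisible to this
* cursor).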
*/ 19739 mc->mc_flags |= C_INITIALIZED; 19740 } 19741 if (unlikely(flags & MDBX_MULTIPLE)) { 19742 if (likely(rc == MDBX_SUCCESS)) { 19743 continue_multiple: 19744 mcount++; 19745 /* let caller know how many succeeded, if any */ 19746 data[1].iov_len = mcount; 19747 if (mcount < dcount) { 19748 data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; 19749 insert_key = insert_data = false; 19750 goto more; 19751 } 19752 } 19753 } 19754 if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) 19755 rc = cursor_check(mc); 19756 return rc; 19757 bad_sub: 19758 if (unlikely(rc == MDBX_KEYEXIST)) { 19759 /* should not happen, we deleted that item */ 19760 ERROR("Unexpected %i error while put to nested dupsort's hive", rc); 19761 rc = MDBX_PROBLEM; 19762 } 19763 } 19764 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 19765 return rc; 19766 } 19767 19768 __hot int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { 19769 if (unlikely(!mc)) 19770 return MDBX_EINVAL; 19771 19772 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 19773 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 19774 : MDBX_EBADSIGN; 19775 19776 int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); 19777 if (unlikely(rc != MDBX_SUCCESS)) 19778 return rc; 19779 19780 if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) 19781 return MDBX_BAD_DBI; 19782 19783 if (unlikely(!(mc->mc_flags & C_INITIALIZED))) 19784 return MDBX_ENODATA; 19785 19786 if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) 19787 return MDBX_NOTFOUND; 19788 19789 if (likely((flags & MDBX_NOSPILL) == 0) && 19790 unlikely(rc = cursor_spill(mc, NULL, NULL))) 19791 return rc; 19792 19793 rc = cursor_touch(mc); 19794 if (unlikely(rc != MDBX_SUCCESS)) 19795 return rc; 19796 19797 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 19798 if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { 19799 ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", 19800 mp->mp_pgno, mp->mp_flags); 19801 return MDBX_CORRUPTED; 19802 } 19803 if (IS_LEAF2(mp)) 19804 goto del_key; 19805 19806 MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); 19807 if (node_flags(node) & F_DUPDATA) { 19808 if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { 19809 /* cursor_del() will subtract the final entry */ 19810 mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; 19811 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; 19812 } else { 19813 if (!(node_flags(node) & F_SUBDATA)) 19814 mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); 19815 rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); 19816 if (unlikely(rc)) 19817 return rc; 19818 /* If sub-DB still has entries, we're done */ 19819 if (mc->mc_xcursor->mx_db.md_entries) { 19820 if (node_flags(node) & F_SUBDATA) { 19821 /* update subDB info */ 19822 void *db = node_data(node); 19823 mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; 19824 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); 19825 } else { 19826 MDBX_cursor *m2; 19827 /* shrink fake page */ 19828 node_shrink(mp, mc->mc_ki[mc->mc_top]); 19829 node = page_node(mp, mc->mc_ki[mc->mc_top]); 19830 mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); 19831 /* fix other sub-DB cursors pointed at fake pages on this page */ 19832 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { 19833 if (m2 == mc || m2->mc_snum < mc->mc_snum) 19834 continue; 19835 if (!(m2->mc_flags & C_INITIALIZED)) 19836 continue; 19837 if (m2->mc_pg[mc->mc_top] == mp) { 19838 MDBX_node *inner = 
node; 19839 if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp)) 19840 continue; 19841 if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { 19842 inner = page_node(mp, m2->mc_ki[mc->mc_top]); 19843 if (node_flags(inner) & F_SUBDATA) 19844 continue; 19845 } 19846 m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner); 19847 } 19848 } 19849 } 19850 mc->mc_db->md_entries--; 19851 cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && 19852 mc->mc_db->md_root != P_INVALID); 19853 return rc; 19854 } else { 19855 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; 19856 } 19857 /* otherwise fall thru and delete the sub-DB */ 19858 } 19859 19860 if (node_flags(node) & F_SUBDATA) { 19861 /* add all the child DB's pages to the free list */ 19862 rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); 19863 if (unlikely(rc)) 19864 goto fail; 19865 } 19866 } 19867 /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */ 19868 else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) 19869 return MDBX_INCOMPATIBLE; 19870 19871 /* add large/overflow pages to free list */ 19872 if (node_flags(node) & F_BIGDATA) { 19873 pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); 19874 if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) 19875 goto fail; 19876 } 19877 19878 del_key: 19879 return cursor_del(mc); 19880 19881 fail: 19882 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 19883 return rc; 19884 } 19885 19886 /* Allocate and initialize new pages for a database. 19887 * Set MDBX_TXN_ERROR on failure. */ 19888 static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { 19889 cASSERT(mc, (flags & P_OVERFLOW) == 0); 19890 pgr_t ret = page_alloc(mc); 19891 if (unlikely(ret.err != MDBX_SUCCESS)) 19892 return ret; 19893 19894 DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); 19895 ret.page->mp_flags = (uint16_t)flags; 19896 ret.page->mp_txnid = mc->mc_txn->mt_front; 19897 cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); 19898 cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); 19899 #if MDBX_ENABLE_PGOP_STAT 19900 mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; 19901 #endif /* MDBX_ENABLE_PGOP_STAT */ 19902 19903 STATIC_ASSERT(P_BRANCH == 1); 19904 const unsigned is_branch = flags & P_BRANCH; 19905 19906 ret.page->mp_lower = 0; 19907 ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); 19908 mc->mc_db->md_branch_pages += is_branch; 19909 mc->mc_db->md_leaf_pages += 1 - is_branch; 19910 if (unlikely(mc->mc_flags & C_SUB)) { 19911 MDBX_db *outer = outer_db(mc); 19912 outer->md_branch_pages += is_branch; 19913 outer->md_leaf_pages += 1 - is_branch; 19914 } 19915 return ret; 19916 } 19917 19918 static pgr_t page_new_large(MDBX_cursor *mc, const unsigned npages) { 19919 pgr_t ret = likely(npages == 1) 19920 ? 
page_alloc(mc) 19921 : page_alloc_slowpath(mc, npages, MDBX_ALLOC_ALL); 19922 if (unlikely(ret.err != MDBX_SUCCESS)) 19923 return ret; 19924 19925 DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %u", mc->mc_dbi, 19926 ret.page->mp_pgno, npages); 19927 ret.page->mp_flags = P_OVERFLOW; 19928 ret.page->mp_txnid = mc->mc_txn->mt_front; 19929 cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); 19930 cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); 19931 #if MDBX_ENABLE_PGOP_STAT 19932 mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; 19933 #endif /* MDBX_ENABLE_PGOP_STAT */ 19934 19935 mc->mc_db->md_overflow_pages += npages; 19936 ret.page->mp_pages = npages; 19937 cASSERT(mc, !(mc->mc_flags & C_SUB)); 19938 return ret; 19939 } 19940 19941 __hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, 19942 unsigned indx, 19943 const MDBX_val *key) { 19944 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 19945 DKBUF_DEBUG; 19946 DEBUG("add to leaf2-%spage %" PRIaPGNO " index %i, " 19947 " key size %" PRIuPTR " [%s]", 19948 IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? key->iov_len : 0, 19949 DKEY_DEBUG(key)); 19950 19951 cASSERT(mc, key); 19952 cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); 19953 const unsigned ksize = mc->mc_db->md_xsize; 19954 cASSERT(mc, ksize == key->iov_len); 19955 const unsigned nkeys = page_numkeys(mp); 19956 19957 /* Just using these for counting */ 19958 const intptr_t lower = mp->mp_lower + sizeof(indx_t); 19959 const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t)); 19960 if (unlikely(lower > upper)) { 19961 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 19962 return MDBX_PAGE_FULL; 19963 } 19964 mp->mp_lower = (indx_t)lower; 19965 mp->mp_upper = (indx_t)upper; 19966 19967 char *const ptr = page_leaf2key(mp, indx, ksize); 19968 cASSERT(mc, nkeys >= indx); 19969 const unsigned diff = nkeys - indx; 19970 if (likely(diff > 0)) 19971 /* Move higher keys up one slot. */ 19972 memmove(ptr + ksize, ptr, diff * ksize); 19973 /* insert new key */ 19974 memcpy(ptr, key->iov_base, ksize); 19975 return MDBX_SUCCESS; 19976 } 19977 19978 static int __must_check_result node_add_branch(MDBX_cursor *mc, unsigned indx, 19979 const MDBX_val *key, 19980 pgno_t pgno) { 19981 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 19982 DKBUF_DEBUG; 19983 DEBUG("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO 19984 " key size %" PRIuPTR " [%s]", 19985 IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, 19986 key ? key->iov_len : 0, DKEY_DEBUG(key)); 19987 19988 cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); 19989 STATIC_ASSERT(NODESIZE % 2 == 0); 19990 19991 /* Move higher pointers up one slot. */ 19992 const unsigned nkeys = page_numkeys(mp); 19993 cASSERT(mc, nkeys >= indx); 19994 for (unsigned i = nkeys; i > indx; --i) 19995 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; 19996 19997 /* Adjust free space offsets. */ 19998 const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key); 19999 const intptr_t lower = mp->mp_lower + sizeof(indx_t); 20000 const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t)); 20001 if (unlikely(lower > upper)) { 20002 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 20003 return MDBX_PAGE_FULL; 20004 } 20005 mp->mp_lower = (indx_t)lower; 20006 mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; 20007 20008 /* Write the node data. 
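* A branch node carries only the child page number and the separator key;
* the flags byte and mn_extra are zeroed here since only leaf nodes use
* them.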
*/
20009 MDBX_node *node = page_node(mp, indx);
20010 node_set_pgno(node, pgno);
20011 node_set_flags(node, 0);
20012 UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0);
20013 node_set_ks(node, 0);
20014 if (likely(key != NULL)) {
20015 node_set_ks(node, key->iov_len);
20016 memcpy(node_key(node), key->iov_base, key->iov_len);
20017 }
20018 return MDBX_SUCCESS;
20019 }
20020
20021 __hot static int __must_check_result node_add_leaf(MDBX_cursor *mc,
20022 unsigned indx,
20023 const MDBX_val *key,
20024 MDBX_val *data,
20025 unsigned flags) {
20026 MDBX_page *mp = mc->mc_pg[mc->mc_top];
20027 DKBUF_DEBUG;
20028 DEBUG("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR
20029 " key size %" PRIuPTR " [%s]",
20030 IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0,
20031 key ? key->iov_len : 0, DKEY_DEBUG(key));
20032 cASSERT(mc, key != NULL && data != NULL);
20033 cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF);
20034 cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
20035 MDBX_page *largepage = NULL;
20036
20037 size_t node_bytes;
20038 if (unlikely(flags & F_BIGDATA)) {
20039 /* Data already on large/overflow page. */
20040 STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
20041 node_bytes =
20042 node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
20043 } else if (unlikely(node_size(key, data) >
20044 mc->mc_txn->mt_env->me_leaf_nodemax)) {
20045 /* Put data on large/overflow page. */
20046 if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) {
20047 ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db",
20048 mc->mc_db->md_flags);
20049 return MDBX_PROBLEM;
20050 }
20051 if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) {
20052 ERROR("Unexpected target %s flags 0x%x for large data-item", "node",
20053 flags);
20054 return MDBX_PROBLEM;
20055 }
20056 const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
20057 const pgr_t npr = page_new_large(mc, ovpages);
20058 if (unlikely(npr.err != MDBX_SUCCESS))
20059 return npr.err;
20060 largepage = npr.page;
20061 DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO " for %" PRIuPTR
20062 " data bytes",
20063 largepage->mp_pages, largepage->mp_pgno, data->iov_len);
20064 flags |= F_BIGDATA;
20065 node_bytes =
20066 node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
20067 } else {
20068 node_bytes = node_size(key, data) + sizeof(indx_t);
20069 }
20070 cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));
20071
20072 /* Move higher pointers up one slot. */
20073 const unsigned nkeys = page_numkeys(mp);
20074 cASSERT(mc, nkeys >= indx);
20075 for (unsigned i = nkeys; i > indx; --i)
20076 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
20077
20078 /* Adjust free space offsets. */
20079 const intptr_t lower = mp->mp_lower + sizeof(indx_t);
20080 const intptr_t upper = mp->mp_upper - (node_bytes - sizeof(indx_t));
20081 if (unlikely(lower > upper)) {
20082 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
20083 return MDBX_PAGE_FULL;
20084 }
20085 mp->mp_lower = (indx_t)lower;
20086 mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper;
20087
20088 /* Write the node data.
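* A leaf node stores the key bytes followed either by the data bytes
* themselves or, for F_BIGDATA, by the page number of the large/overflow
* page chain that holds them.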
*/ 20089 MDBX_node *node = page_node(mp, indx); 20090 node_set_ks(node, key->iov_len); 20091 node_set_flags(node, (uint8_t)flags); 20092 UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); 20093 node_set_ds(node, data->iov_len); 20094 memcpy(node_key(node), key->iov_base, key->iov_len); 20095 20096 void *nodedata = node_data(node); 20097 if (likely(largepage == NULL)) { 20098 if (unlikely(flags & F_BIGDATA)) { 20099 memcpy(nodedata, data->iov_base, sizeof(pgno_t)); 20100 return MDBX_SUCCESS; 20101 } 20102 } else { 20103 poke_pgno(nodedata, largepage->mp_pgno); 20104 nodedata = page_data(largepage); 20105 } 20106 if (unlikely(flags & MDBX_RESERVE)) 20107 data->iov_base = nodedata; 20108 else if (likely(nodedata != data->iov_base && 20109 data->iov_len /* to avoid UBSAN traps*/ != 0)) 20110 memcpy(nodedata, data->iov_base, data->iov_len); 20111 return MDBX_SUCCESS; 20112 } 20113 20114 /* Delete the specified node from a page. 20115 * [in] mc Cursor pointing to the node to delete. 20116 * [in] ksize The size of a node. Only used if the page is 20117 * part of a MDBX_DUPFIXED database. */ 20118 __hot static void node_del(MDBX_cursor *mc, size_t ksize) { 20119 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 20120 const unsigned hole = mc->mc_ki[mc->mc_top]; 20121 const unsigned nkeys = page_numkeys(mp); 20122 20123 DEBUG("delete node %u on %s page %" PRIaPGNO, hole, 20124 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); 20125 cASSERT(mc, hole < nkeys); 20126 20127 if (IS_LEAF2(mp)) { 20128 cASSERT(mc, ksize >= sizeof(indx_t)); 20129 unsigned diff = nkeys - 1 - hole; 20130 char *base = page_leaf2key(mp, hole, ksize); 20131 if (diff) 20132 memmove(base, base + ksize, diff * ksize); 20133 cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); 20134 mp->mp_lower -= sizeof(indx_t); 20135 cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); 20136 mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); 20137 return; 20138 } 20139 20140 MDBX_node *node = page_node(mp, hole); 20141 cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); 20142 size_t hole_size = NODESIZE + node_ks(node); 20143 if (IS_LEAF(mp)) 20144 hole_size += 20145 (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) : node_ds(node); 20146 hole_size = EVEN(hole_size); 20147 20148 const indx_t hole_offset = mp->mp_ptrs[hole]; 20149 unsigned r, w; 20150 for (r = w = 0; r < nkeys; r++) 20151 if (r != hole) 20152 mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) 20153 ? mp->mp_ptrs[r] + (indx_t)hole_size 20154 : mp->mp_ptrs[r]; 20155 20156 char *base = (char *)mp + mp->mp_upper + PAGEHDRSZ; 20157 memmove(base + hole_size, base, hole_offset - mp->mp_upper); 20158 20159 cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); 20160 mp->mp_lower -= sizeof(indx_t); 20161 cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); 20162 mp->mp_upper += (indx_t)hole_size; 20163 20164 if (AUDIT_ENABLED()) { 20165 const uint8_t checking = mc->mc_checking; 20166 mc->mc_checking |= CC_UPDATING; 20167 const int page_check_err = page_check(mc, mp); 20168 mc->mc_checking = checking; 20169 cASSERT(mc, page_check_err == MDBX_SUCCESS); 20170 } 20171 } 20172 20173 /* Compact the main page after deleting a node on a subpage. 20174 * [in] mp The main page to operate on. 20175 * [in] indx The index of the subpage on the main page. 
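* In effect the unused gap inside the sub-page is squeezed out: the node
* shrinks by page_room(sp) bytes and every affected mp_ptrs[] offset on
* the main page is rebased by the same delta.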
*/
20176 static void node_shrink(MDBX_page *mp, unsigned indx) {
20177 MDBX_node *node;
20178 MDBX_page *sp, *xp;
20179 char *base;
20180 size_t nsize, delta, len, ptr;
20181 int i;
20182
20183 node = page_node(mp, indx);
20184 sp = (MDBX_page *)node_data(node);
20185 delta = page_room(sp);
20186 assert(delta > 0);
20187
20188 /* Prepare to shift upward, set len = length(subpage part to shift) */
20189 if (IS_LEAF2(sp)) {
20190 delta &= /* do not make the node uneven-sized */ ~(size_t)1;
20191 if (unlikely(delta == 0))
20192 return;
20193 nsize = node_ds(node) - delta;
20194 assert(nsize % 2 == 0);
20195 len = nsize;
20196 } else {
20197 xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */
20198 for (i = page_numkeys(sp); --i >= 0;) {
20199 assert(sp->mp_ptrs[i] >= delta);
20200 xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta);
20201 }
20202 nsize = node_ds(node) - delta;
20203 len = PAGEHDRSZ;
20204 }
20205 sp->mp_upper = sp->mp_lower;
20206 sp->mp_pgno = mp->mp_pgno;
20207 node_set_ds(node, nsize);
20208
20209 /* Shift <lower nodes...initial part of subpage> upward */
20210 base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
20211 memmove(base + delta, base, (char *)sp + len - base);
20212
20213 ptr = mp->mp_ptrs[indx];
20214 for (i = page_numkeys(mp); --i >= 0;) {
20215 if (mp->mp_ptrs[i] <= ptr) {
20216 assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta);
20217 mp->mp_ptrs[i] += (indx_t)delta;
20218 }
20219 }
20220 assert((size_t)UINT16_MAX - mp->mp_upper >= delta);
20221 mp->mp_upper += (indx_t)delta;
20222 }
20223
20224 /* Initial setup of a sorted-dups cursor.
20225 *
20226 * Sorted duplicates are implemented as a sub-database for the given key.
20227 * The duplicate data items are actually keys of the sub-database.
20228 * Operations on the duplicate data items are performed using a sub-cursor
20229 * initialized when the sub-database is first accessed. This function does
20230 * the preliminary setup of the sub-cursor, filling in the fields that
20231 * depend only on the parent DB.
20232 *
20233 * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */
20234 static int cursor_xinit0(MDBX_cursor *mc) {
20235 MDBX_xcursor *mx = mc->mc_xcursor;
20236 if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) {
20237 ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
20238 mc->mc_dbi);
20239 return MDBX_CORRUPTED;
20240 }
20241
20242 mx->mx_cursor.mc_xcursor = NULL;
20243 mx->mx_cursor.mc_next = NULL;
20244 mx->mx_cursor.mc_txn = mc->mc_txn;
20245 mx->mx_cursor.mc_db = &mx->mx_db;
20246 mx->mx_cursor.mc_dbx = &mx->mx_dbx;
20247 mx->mx_cursor.mc_dbi = mc->mc_dbi;
20248 mx->mx_cursor.mc_dbistate = mc->mc_dbistate;
20249 mx->mx_cursor.mc_snum = 0;
20250 mx->mx_cursor.mc_top = 0;
20251 mx->mx_cursor.mc_flags = C_SUB;
20252 STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2);
20253 cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF);
20254 mx->mx_cursor.mc_checking =
20255 mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1);
20256 mx->mx_dbx.md_name.iov_len = 0;
20257 mx->mx_dbx.md_name.iov_base = NULL;
20258 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
20259 mx->mx_dbx.md_dcmp = NULL;
20260 mx->mx_dbx.md_klen_min = INT_MAX;
20261 mx->mx_dbx.md_vlen_min = mx->mx_dbx.md_klen_max = mx->mx_dbx.md_vlen_max = 0;
20262 return MDBX_SUCCESS;
20263 }
20264
20265 /* Final setup of a sorted-dups cursor.
20266 * Sets up the fields that depend on the data from the main cursor.
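* Two on-disk shapes are handled here: an F_DUPDATA|F_SUBDATA node holds
* a full MDBX_db record describing a nested tree, while a plain F_DUPDATA
* node embeds a single sub-page directly in the node data.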
20267 * [in] mc The main cursor whose sorted-dups cursor is to be initialized. 20268 * [in] node The data containing the MDBX_db record for the sorted-dup database. 20269 */ 20270 static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, 20271 const MDBX_page *mp) { 20272 MDBX_xcursor *mx = mc->mc_xcursor; 20273 if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { 20274 ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", 20275 mc->mc_dbi); 20276 return MDBX_CORRUPTED; 20277 } 20278 20279 const uint8_t flags = node_flags(node); 20280 switch (flags) { 20281 default: 20282 ERROR("invalid node flags %u", flags); 20283 return MDBX_CORRUPTED; 20284 case F_DUPDATA | F_SUBDATA: 20285 if (!MDBX_DISABLE_VALIDATION && 20286 unlikely(node_ds(node) != sizeof(MDBX_db))) { 20287 ERROR("invalid nested-db record size %zu", node_ds(node)); 20288 return MDBX_CORRUPTED; 20289 } 20290 memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); 20291 const txnid_t pp_txnid = mp->mp_txnid; 20292 if (!MDBX_DISABLE_VALIDATION && 20293 unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { 20294 ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", 20295 mx->mx_db.md_mod_txnid, pp_txnid); 20296 return MDBX_CORRUPTED; 20297 } 20298 mx->mx_cursor.mc_pg[0] = 0; 20299 mx->mx_cursor.mc_snum = 0; 20300 mx->mx_cursor.mc_top = 0; 20301 mx->mx_cursor.mc_flags = C_SUB; 20302 break; 20303 case F_DUPDATA: 20304 if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { 20305 ERROR("invalid nested-page size %zu", node_ds(node)); 20306 return MDBX_CORRUPTED; 20307 } 20308 MDBX_page *fp = node_data(node); 20309 mx->mx_db.md_depth = 1; 20310 mx->mx_db.md_branch_pages = 0; 20311 mx->mx_db.md_leaf_pages = 1; 20312 mx->mx_db.md_overflow_pages = 0; 20313 mx->mx_db.md_entries = page_numkeys(fp); 20314 mx->mx_db.md_root = fp->mp_pgno; 20315 mx->mx_db.md_mod_txnid = mp->mp_txnid; 20316 mx->mx_cursor.mc_snum = 1; 20317 mx->mx_cursor.mc_top = 0; 20318 mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; 20319 mx->mx_cursor.mc_pg[0] = fp; 20320 mx->mx_cursor.mc_ki[0] = 0; 20321 mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); 20322 mx->mx_db.md_xsize = 20323 (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
fp->mp_leaf2_ksize : 0; 20324 break; 20325 } 20326 20327 if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { 20328 if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { 20329 ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); 20330 return MDBX_CORRUPTED; 20331 } 20332 if (!MDBX_DISABLE_VALIDATION && 20333 unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { 20334 ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags); 20335 return MDBX_CORRUPTED; 20336 } 20337 if (!MDBX_DISABLE_VALIDATION && 20338 unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || 20339 mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { 20340 ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length " 20341 "(%zu/%zu)", 20342 mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, 20343 mc->mc_dbx->md_vlen_max); 20344 return MDBX_CORRUPTED; 20345 } 20346 mc->mc_db->md_xsize = mx->mx_db.md_xsize; 20347 mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = mx->mx_db.md_xsize; 20348 } 20349 mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; 20350 mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; 20351 20352 DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, 20353 mx->mx_db.md_root); 20354 return MDBX_SUCCESS; 20355 } 20356 20357 /* Fixup a sorted-dups cursor due to underlying update. 20358 * Sets up some fields that depend on the data from the main cursor. 20359 * Almost the same as init1, but skips initialization steps if the 20360 * xcursor had already been used. 20361 * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. 20362 * [in] src_mx The xcursor of an up-to-date cursor. 20363 * [in] new_dupdata True if converting from a non-F_DUPDATA item. */ 20364 static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, 20365 bool new_dupdata) { 20366 MDBX_xcursor *mx = mc->mc_xcursor; 20367 if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { 20368 ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", 20369 mc->mc_dbi); 20370 return MDBX_CORRUPTED; 20371 } 20372 20373 if (new_dupdata) { 20374 mx->mx_cursor.mc_snum = 1; 20375 mx->mx_cursor.mc_top = 0; 20376 mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; 20377 mx->mx_cursor.mc_ki[0] = 0; 20378 } 20379 20380 mx->mx_dbx.md_klen_min = src_mx->mx_dbx.md_klen_min; 20381 mx->mx_dbx.md_klen_max = src_mx->mx_dbx.md_klen_max; 20382 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; 20383 mx->mx_db = src_mx->mx_db; 20384 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; 20385 if (mx->mx_cursor.mc_flags & C_INITIALIZED) { 20386 DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, 20387 mx->mx_db.md_root); 20388 } 20389 return MDBX_SUCCESS; 20390 } 20391 20392 static __inline int couple_init(MDBX_cursor_couple *couple, const MDBX_dbi dbi, 20393 MDBX_txn *const txn, MDBX_db *const db, 20394 MDBX_dbx *const dbx, uint8_t *const dbstate) { 20395 couple->outer.mc_signature = MDBX_MC_LIVE; 20396 couple->outer.mc_next = NULL; 20397 couple->outer.mc_backup = NULL; 20398 couple->outer.mc_dbi = dbi; 20399 couple->outer.mc_txn = txn; 20400 couple->outer.mc_db = db; 20401 couple->outer.mc_dbx = dbx; 20402 couple->outer.mc_dbistate = dbstate; 20403 couple->outer.mc_snum = 0; 20404 couple->outer.mc_top = 0; 20405 couple->outer.mc_pg[0] = 0; 20406 couple->outer.mc_flags = 0; 20407 STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && 20408 CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); 20409 couple->outer.mc_checking = 20410 (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION)) 
20411 ? CC_PAGECHECK | CC_LEAF 20412 : CC_LEAF; 20413 couple->outer.mc_ki[0] = 0; 20414 couple->outer.mc_xcursor = NULL; 20415 20416 int rc = MDBX_SUCCESS; 20417 if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { 20418 rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); 20419 rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS; 20420 } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { 20421 rc = setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, 20422 txn->mt_env->me_psize); 20423 } 20424 20425 if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { 20426 couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; 20427 couple->outer.mc_xcursor = &couple->inner; 20428 rc = cursor_xinit0(&couple->outer); 20429 if (unlikely(rc != MDBX_SUCCESS)) 20430 return rc; 20431 couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; 20432 couple->inner.mx_dbx.md_klen_max = couple->outer.mc_dbx->md_vlen_max; 20433 } 20434 return rc; 20435 } 20436 20437 /* Initialize a cursor for a given transaction and database. */ 20438 static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { 20439 STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); 20440 return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, 20441 &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], 20442 &txn->mt_dbistate[dbi]); 20443 } 20444 20445 MDBX_cursor *mdbx_cursor_create(void *context) { 20446 MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple)); 20447 if (unlikely(!couple)) 20448 return nullptr; 20449 20450 couple->outer.mc_signature = MDBX_MC_READY4CLOSE; 20451 couple->outer.mc_dbi = UINT_MAX; 20452 couple->mc_userctx = context; 20453 return &couple->outer; 20454 } 20455 20456 int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { 20457 if (unlikely(!mc)) 20458 return MDBX_EINVAL; 20459 20460 if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && 20461 mc->mc_signature != MDBX_MC_LIVE)) 20462 return MDBX_EBADSIGN; 20463 20464 MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); 20465 couple->mc_userctx = ctx; 20466 return MDBX_SUCCESS; 20467 } 20468 20469 void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { 20470 if (unlikely(!mc)) 20471 return nullptr; 20472 20473 if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && 20474 mc->mc_signature != MDBX_MC_LIVE)) 20475 return nullptr; 20476 20477 MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); 20478 return couple->mc_userctx; 20479 } 20480 20481 int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { 20482 if (unlikely(!mc)) 20483 return MDBX_EINVAL; 20484 20485 if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && 20486 mc->mc_signature != MDBX_MC_LIVE)) 20487 return MDBX_EBADSIGN; 20488 20489 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 20490 if (unlikely(rc != MDBX_SUCCESS)) 20491 return rc; 20492 20493 if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) 20494 return MDBX_BAD_DBI; 20495 20496 if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) 20497 return MDBX_EACCESS; 20498 20499 if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { 20500 cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); 20501 if (unlikely(mc->mc_dbi != dbi || 20502 /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || 20503 mc->mc_txn != txn)) 20504 return MDBX_EINVAL; 20505 20506 assert(mc->mc_db == &txn->mt_dbs[dbi]); 20507 assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); 20508 assert(mc->mc_dbi == dbi); 20509 assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); 20510 return 
likely(mc->mc_dbi == dbi && 20511 /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && 20512 mc->mc_txn == txn) 20513 ? MDBX_SUCCESS 20514 : MDBX_EINVAL /* Disallow change DBI in nested transactions */; 20515 } 20516 20517 if (mc->mc_signature == MDBX_MC_LIVE) { 20518 if (unlikely(!mc->mc_txn || 20519 mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { 20520 ERROR("Wrong cursor's transaction %p 0x%x", 20521 __Wpedantic_format_voidptr(mc->mc_txn), 20522 mc->mc_txn ? mc->mc_txn->mt_signature : 0); 20523 return MDBX_PROBLEM; 20524 } 20525 if (mc->mc_flags & C_UNTRACK) { 20526 MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; 20527 while (*prev && *prev != mc) 20528 prev = &(*prev)->mc_next; 20529 cASSERT(mc, *prev == mc); 20530 *prev = mc->mc_next; 20531 } 20532 mc->mc_signature = MDBX_MC_READY4CLOSE; 20533 mc->mc_flags = 0; 20534 mc->mc_dbi = UINT_MAX; 20535 mc->mc_next = NULL; 20536 mc->mc_db = NULL; 20537 mc->mc_dbx = NULL; 20538 mc->mc_dbistate = NULL; 20539 } 20540 cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); 20541 20542 rc = cursor_init(mc, txn, dbi); 20543 if (unlikely(rc != MDBX_SUCCESS)) 20544 return rc; 20545 20546 mc->mc_next = txn->mt_cursors[dbi]; 20547 txn->mt_cursors[dbi] = mc; 20548 mc->mc_flags |= C_UNTRACK; 20549 20550 return MDBX_SUCCESS; 20551 } 20552 20553 int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { 20554 if (unlikely(!ret)) 20555 return MDBX_EINVAL; 20556 *ret = NULL; 20557 20558 MDBX_cursor *const mc = mdbx_cursor_create(nullptr); 20559 if (unlikely(!mc)) 20560 return MDBX_ENOMEM; 20561 20562 int rc = mdbx_cursor_bind(txn, mc, dbi); 20563 if (unlikely(rc != MDBX_SUCCESS)) { 20564 mdbx_cursor_close(mc); 20565 return rc; 20566 } 20567 20568 *ret = mc; 20569 return MDBX_SUCCESS; 20570 } 20571 20572 int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { 20573 return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; 20574 } 20575 20576 int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { 20577 if (unlikely(!src)) 20578 return MDBX_EINVAL; 20579 if (unlikely(src->mc_signature != MDBX_MC_LIVE)) 20580 return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 20581 : MDBX_EBADSIGN; 20582 20583 int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi); 20584 if (unlikely(rc != MDBX_SUCCESS)) 20585 return rc; 20586 20587 assert(dest->mc_db == src->mc_db); 20588 assert(dest->mc_dbi == src->mc_dbi); 20589 assert(dest->mc_dbx == src->mc_dbx); 20590 assert(dest->mc_dbistate == src->mc_dbistate); 20591 again: 20592 assert(dest->mc_txn == src->mc_txn); 20593 dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; 20594 dest->mc_top = src->mc_top; 20595 dest->mc_snum = src->mc_snum; 20596 for (unsigned i = 0; i < src->mc_snum; ++i) { 20597 dest->mc_ki[i] = src->mc_ki[i]; 20598 dest->mc_pg[i] = src->mc_pg[i]; 20599 } 20600 20601 if (src->mc_xcursor) { 20602 dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db; 20603 dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx; 20604 src = &src->mc_xcursor->mx_cursor; 20605 dest = &dest->mc_xcursor->mx_cursor; 20606 goto again; 20607 } 20608 20609 return MDBX_SUCCESS; 20610 } 20611 20612 void mdbx_cursor_close(MDBX_cursor *mc) { 20613 if (likely(mc)) { 20614 ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE || 20615 mc->mc_signature == MDBX_MC_READY4CLOSE); 20616 MDBX_txn *const txn = mc->mc_txn; 20617 if (!mc->mc_backup) { 20618 mc->mc_txn = NULL; 20619 /* Unlink from txn, if tracked. 
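 * A transaction tracks its open cursors in a singly-linked list per dbi
 * (mt_cursors[dbi], chained through mc_next), so untracking just walks
 * that list to the cursor and splices it out.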
*/ 20620 if (mc->mc_flags & C_UNTRACK) { 20621 ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); 20622 MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi]; 20623 while (*prev && *prev != mc) 20624 prev = &(*prev)->mc_next; 20625 tASSERT(txn, *prev == mc); 20626 *prev = mc->mc_next; 20627 } 20628 mc->mc_signature = 0; 20629 mc->mc_next = mc; 20630 osal_free(mc); 20631 } else { 20632 /* Cursor closed before nested txn ends */ 20633 tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE); 20634 ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); 20635 mc->mc_signature = MDBX_MC_WAIT4EOT; 20636 } 20637 } 20638 } 20639 20640 MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { 20641 if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) 20642 return NULL; 20643 MDBX_txn *txn = mc->mc_txn; 20644 if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) 20645 return NULL; 20646 if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) 20647 return NULL; 20648 return txn; 20649 } 20650 20651 MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { 20652 if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) 20653 return UINT_MAX; 20654 return mc->mc_dbi; 20655 } 20656 20657 /* Return the count of duplicate data items for the current key */ 20658 int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { 20659 if (unlikely(mc == NULL)) 20660 return MDBX_EINVAL; 20661 20662 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 20663 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 20664 : MDBX_EBADSIGN; 20665 20666 int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); 20667 if (unlikely(rc != MDBX_SUCCESS)) 20668 return rc; 20669 20670 if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) 20671 return MDBX_EINVAL; 20672 20673 if (!mc->mc_snum) { 20674 *countp = 0; 20675 return MDBX_NOTFOUND; 20676 } 20677 20678 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 20679 if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { 20680 *countp = 0; 20681 return MDBX_NOTFOUND; 20682 } 20683 20684 *countp = 1; 20685 if (mc->mc_xcursor != NULL) { 20686 MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); 20687 if (node_flags(node) & F_DUPDATA) { 20688 cASSERT(mc, mc->mc_xcursor && 20689 (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); 20690 *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) 20691 ? PTRDIFF_MAX 20692 : (size_t)mc->mc_xcursor->mx_db.md_entries; 20693 } 20694 } 20695 return MDBX_SUCCESS; 20696 } 20697 20698 /* Replace the key for a branch node with a new key. 20699 * Set MDBX_TXN_ERROR on failure. 20700 * [in] mc Cursor pointing to the node to operate on. 20701 * [in] key The new key to use. 20702 * Returns 0 on success, non-zero on failure. */ 20703 static int update_key(MDBX_cursor *mc, const MDBX_val *key) { 20704 MDBX_page *mp; 20705 MDBX_node *node; 20706 char *base; 20707 size_t len; 20708 ptrdiff_t delta, ksize, oksize; 20709 int ptr, i, nkeys, indx; 20710 DKBUF_DEBUG; 20711 20712 cASSERT(mc, cursor_is_tracked(mc)); 20713 indx = mc->mc_ki[mc->mc_top]; 20714 mp = mc->mc_pg[mc->mc_top]; 20715 node = page_node(mp, indx); 20716 ptr = mp->mp_ptrs[indx]; 20717 #if MDBX_DEBUG 20718 MDBX_val k2; 20719 k2.iov_base = node_key(node); 20720 k2.iov_len = node_ks(node); 20721 DEBUG("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, ptr, 20722 DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); 20723 #endif /* MDBX_DEBUG */ 20724 20725 /* Sizes must be 2-byte aligned. 
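 * (node offsets in mp_ptrs[] are 16-bit indx_t values and nodes are kept
 * even-sized, hence the EVEN() rounding of both the old and the new key
 * length below).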
*/ 20726 ksize = EVEN(key->iov_len); 20727 oksize = EVEN(node_ks(node)); 20728 delta = ksize - oksize; 20729 20730 /* Shift node contents if EVEN(key length) changed. */ 20731 if (delta) { 20732 if (delta > (int)page_room(mp)) { 20733 /* not enough space left, do a delete and split */ 20734 DEBUG("Not enough room, delta = %zd, splitting...", delta); 20735 pgno_t pgno = node_pgno(node); 20736 node_del(mc, 0); 20737 int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); 20738 if (err == MDBX_SUCCESS && AUDIT_ENABLED()) 20739 err = cursor_check_updating(mc); 20740 return err; 20741 } 20742 20743 nkeys = page_numkeys(mp); 20744 for (i = 0; i < nkeys; i++) { 20745 if (mp->mp_ptrs[i] <= ptr) { 20746 cASSERT(mc, mp->mp_ptrs[i] >= delta); 20747 mp->mp_ptrs[i] -= (indx_t)delta; 20748 } 20749 } 20750 20751 base = (char *)mp + mp->mp_upper + PAGEHDRSZ; 20752 len = ptr - mp->mp_upper + NODESIZE; 20753 memmove(base - delta, base, len); 20754 cASSERT(mc, mp->mp_upper >= delta); 20755 mp->mp_upper -= (indx_t)delta; 20756 20757 node = page_node(mp, indx); 20758 } 20759 20760 /* But even if no shift was needed, update ksize */ 20761 node_set_ks(node, key->iov_len); 20762 20763 if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0)) 20764 memcpy(node_key(node), key->iov_base, key->iov_len); 20765 return MDBX_SUCCESS; 20766 } 20767 20768 /* Move a node from csrc to cdst. */ 20769 static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { 20770 int rc; 20771 DKBUF_DEBUG; 20772 20773 MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; 20774 MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; 20775 cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); 20776 cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi); 20777 cASSERT(csrc, csrc->mc_top == cdst->mc_top); 20778 if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { 20779 bailout: 20780 ERROR("Wrong or mismatched page types (src %d, dst %d) to move node", 20781 PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); 20782 csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 20783 return MDBX_PROBLEM; 20784 } 20785 20786 MDBX_val key4move; 20787 switch (PAGETYPE_WHOLE(psrc)) { 20788 case P_BRANCH: { 20789 const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); 20790 cASSERT(csrc, node_flags(srcnode) == 0); 20791 const pgno_t srcpg = node_pgno(srcnode); 20792 key4move.iov_len = node_ks(srcnode); 20793 key4move.iov_base = node_key(srcnode); 20794 20795 if (csrc->mc_ki[csrc->mc_top] == 0) { 20796 const unsigned snum = csrc->mc_snum; 20797 cASSERT(csrc, snum > 0); 20798 /* must find the lowest key below src */ 20799 rc = page_search_lowest(csrc); 20800 MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; 20801 if (unlikely(rc)) 20802 return rc; 20803 cASSERT(csrc, IS_LEAF(lowest_page)); 20804 if (unlikely(!IS_LEAF(lowest_page))) 20805 goto bailout; 20806 if (IS_LEAF2(lowest_page)) { 20807 key4move.iov_len = csrc->mc_db->md_xsize; 20808 key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len); 20809 } else { 20810 const MDBX_node *lowest_node = page_node(lowest_page, 0); 20811 key4move.iov_len = node_ks(lowest_node); 20812 key4move.iov_base = node_key(lowest_node); 20813 } 20814 20815 /* restore cursor after mdbx_page_search_lowest() */ 20816 csrc->mc_snum = (uint8_t)snum; 20817 csrc->mc_top = (uint8_t)snum - 1; 20818 csrc->mc_ki[csrc->mc_top] = 0; 20819 20820 /* paranoia */ 20821 cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); 20822 cASSERT(csrc, IS_BRANCH(psrc)); 20823 if (unlikely(!IS_BRANCH(psrc))) 20824 goto bailout; 20825 } 20826
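    /* The same care is needed on the destination side: when inserting at
     * the leftmost position of pdst, the separator key kept in the parent
     * must be recomputed, and the room required on pdst is checked before
     * any page is touched. */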
20827 if (cdst->mc_ki[cdst->mc_top] == 0) { 20828 const unsigned snum = cdst->mc_snum; 20829 cASSERT(csrc, snum > 0); 20830 MDBX_cursor mn; 20831 cursor_copy(cdst, &mn); 20832 /* must find the lowest key below dst */ 20833 rc = page_search_lowest(&mn); 20834 if (unlikely(rc)) 20835 return rc; 20836 MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; 20837 cASSERT(cdst, IS_LEAF(lowest_page)); 20838 if (unlikely(!IS_LEAF(lowest_page))) 20839 goto bailout; 20840 MDBX_val key; 20841 if (IS_LEAF2(lowest_page)) { 20842 key.iov_len = mn.mc_db->md_xsize; 20843 key.iov_base = page_leaf2key(lowest_page, 0, key.iov_len); 20844 } else { 20845 MDBX_node *lowest_node = page_node(lowest_page, 0); 20846 key.iov_len = node_ks(lowest_node); 20847 key.iov_base = node_key(lowest_node); 20848 } 20849 20850 /* restore cursor after mdbx_page_search_lowest() */ 20851 mn.mc_snum = (uint8_t)snum; 20852 mn.mc_top = (uint8_t)snum - 1; 20853 mn.mc_ki[mn.mc_top] = 0; 20854 20855 const intptr_t delta = 20856 EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0))); 20857 const intptr_t needed = 20858 branch_size(cdst->mc_txn->mt_env, &key4move) + delta; 20859 const intptr_t have = page_room(pdst); 20860 if (unlikely(needed > have)) 20861 return MDBX_RESULT_TRUE; 20862 20863 if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) 20864 return rc; 20865 psrc = csrc->mc_pg[csrc->mc_top]; 20866 pdst = cdst->mc_pg[cdst->mc_top]; 20867 20868 WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); 20869 if (unlikely(rc)) 20870 return rc; 20871 } else { 20872 const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move); 20873 const size_t have = page_room(pdst); 20874 if (unlikely(needed > have)) 20875 return MDBX_RESULT_TRUE; 20876 20877 if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) 20878 return rc; 20879 psrc = csrc->mc_pg[csrc->mc_top]; 20880 pdst = cdst->mc_pg[cdst->mc_top]; 20881 } 20882 20883 DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO 20884 " to node %u on page %" PRIaPGNO, 20885 "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), 20886 psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); 20887 /* Add the node to the destination page. */ 20888 rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); 20889 } break; 20890 20891 case P_LEAF: { 20892 /* Mark src and dst as dirty. */ 20893 if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) 20894 return rc; 20895 psrc = csrc->mc_pg[csrc->mc_top]; 20896 pdst = cdst->mc_pg[cdst->mc_top]; 20897 const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); 20898 MDBX_val data; 20899 data.iov_len = node_ds(srcnode); 20900 data.iov_base = node_data(srcnode); 20901 key4move.iov_len = node_ks(srcnode); 20902 key4move.iov_base = node_key(srcnode); 20903 DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO 20904 " to node %u on page %" PRIaPGNO, 20905 "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), 20906 psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); 20907 /* Add the node to the destination page. */ 20908 rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, 20909 node_flags(srcnode)); 20910 } break; 20911 20912 case P_LEAF | P_LEAF2: { 20913 /* Mark src and dst as dirty. 
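 * (a LEAF2 page stores nothing but fixed-size keys, so the node to move
 * is just md_xsize key bytes and has no data part).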
*/ 20914 if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) 20915 return rc; 20916 psrc = csrc->mc_pg[csrc->mc_top]; 20917 pdst = cdst->mc_pg[cdst->mc_top]; 20918 key4move.iov_len = csrc->mc_db->md_xsize; 20919 key4move.iov_base = 20920 page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); 20921 DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO 20922 " to node %u on page %" PRIaPGNO, 20923 "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), 20924 psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); 20925 /* Add the node to the destination page. */ 20926 rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); 20927 } break; 20928 20929 default: 20930 assert(false); 20931 goto bailout; 20932 } 20933 20934 if (unlikely(rc != MDBX_SUCCESS)) 20935 return rc; 20936 20937 /* Delete the node from the source page. */ 20938 node_del(csrc, key4move.iov_len); 20939 20940 cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); 20941 cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); 20942 cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); 20943 20944 { 20945 /* Adjust other cursors pointing to mp */ 20946 MDBX_cursor *m2, *m3; 20947 const MDBX_dbi dbi = csrc->mc_dbi; 20948 cASSERT(csrc, csrc->mc_top == cdst->mc_top); 20949 if (fromleft) { 20950 /* If we're adding on the left, bump others up */ 20951 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 20952 m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 20953 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) 20954 continue; 20955 if (m3 != cdst && m3->mc_pg[csrc->mc_top] == pdst && 20956 m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { 20957 m3->mc_ki[csrc->mc_top]++; 20958 } 20959 if (m3 != csrc && m3->mc_pg[csrc->mc_top] == psrc && 20960 m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { 20961 m3->mc_pg[csrc->mc_top] = pdst; 20962 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; 20963 cASSERT(csrc, csrc->mc_top > 0); 20964 m3->mc_ki[csrc->mc_top - 1]++; 20965 } 20966 if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) 20967 XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); 20968 } 20969 } else { 20970 /* Adding on the right, bump others down */ 20971 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 20972 m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 20973 if (m3 == csrc) 20974 continue; 20975 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) 20976 continue; 20977 if (m3->mc_pg[csrc->mc_top] == psrc) { 20978 if (!m3->mc_ki[csrc->mc_top]) { 20979 m3->mc_pg[csrc->mc_top] = pdst; 20980 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; 20981 cASSERT(csrc, csrc->mc_top > 0); 20982 m3->mc_ki[csrc->mc_top - 1]--; 20983 } else { 20984 m3->mc_ki[csrc->mc_top]--; 20985 } 20986 if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) 20987 XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], 20988 m3->mc_ki[csrc->mc_top]); 20989 } 20990 } 20991 } 20992 } 20993 20994 /* Update the parent separators. 
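 * Moving the leftmost node changes the first key of the affected page, so
 * the separator stored at the parent level must be rewritten from the new
 * leftmost key; for a branch page the leftmost key is additionally reset
 * to zero length, since branch pages keep no key in slot 0.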
*/ 20995 if (csrc->mc_ki[csrc->mc_top] == 0) { 20996 cASSERT(csrc, csrc->mc_top > 0); 20997 if (csrc->mc_ki[csrc->mc_top - 1] != 0) { 20998 MDBX_val key; 20999 if (IS_LEAF2(psrc)) { 21000 key.iov_len = psrc->mp_leaf2_ksize; 21001 key.iov_base = page_leaf2key(psrc, 0, key.iov_len); 21002 } else { 21003 MDBX_node *srcnode = page_node(psrc, 0); 21004 key.iov_len = node_ks(srcnode); 21005 key.iov_base = node_key(srcnode); 21006 } 21007 DEBUG("update separator for source page %" PRIaPGNO " to [%s]", 21008 psrc->mp_pgno, DKEY_DEBUG(&key)); 21009 MDBX_cursor mn; 21010 cursor_copy(csrc, &mn); 21011 cASSERT(csrc, mn.mc_snum > 0); 21012 mn.mc_snum--; 21013 mn.mc_top--; 21014 /* We want rebalance to find mn when doing fixups */ 21015 WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); 21016 if (unlikely(rc != MDBX_SUCCESS)) 21017 return rc; 21018 } 21019 if (IS_BRANCH(psrc)) { 21020 const MDBX_val nullkey = {0, 0}; 21021 const indx_t ix = csrc->mc_ki[csrc->mc_top]; 21022 csrc->mc_ki[csrc->mc_top] = 0; 21023 rc = update_key(csrc, &nullkey); 21024 csrc->mc_ki[csrc->mc_top] = ix; 21025 cASSERT(csrc, rc == MDBX_SUCCESS); 21026 } 21027 } 21028 21029 if (cdst->mc_ki[cdst->mc_top] == 0) { 21030 cASSERT(cdst, cdst->mc_top > 0); 21031 if (cdst->mc_ki[cdst->mc_top - 1] != 0) { 21032 MDBX_val key; 21033 if (IS_LEAF2(pdst)) { 21034 key.iov_len = pdst->mp_leaf2_ksize; 21035 key.iov_base = page_leaf2key(pdst, 0, key.iov_len); 21036 } else { 21037 MDBX_node *srcnode = page_node(pdst, 0); 21038 key.iov_len = node_ks(srcnode); 21039 key.iov_base = node_key(srcnode); 21040 } 21041 DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", 21042 pdst->mp_pgno, DKEY_DEBUG(&key)); 21043 MDBX_cursor mn; 21044 cursor_copy(cdst, &mn); 21045 cASSERT(cdst, mn.mc_snum > 0); 21046 mn.mc_snum--; 21047 mn.mc_top--; 21048 /* We want rebalance to find mn when doing fixups */ 21049 WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); 21050 if (unlikely(rc != MDBX_SUCCESS)) 21051 return rc; 21052 } 21053 if (IS_BRANCH(pdst)) { 21054 const MDBX_val nullkey = {0, 0}; 21055 const indx_t ix = cdst->mc_ki[cdst->mc_top]; 21056 cdst->mc_ki[cdst->mc_top] = 0; 21057 rc = update_key(cdst, &nullkey); 21058 cdst->mc_ki[cdst->mc_top] = ix; 21059 cASSERT(cdst, rc == MDBX_SUCCESS); 21060 } 21061 } 21062 21063 return MDBX_SUCCESS; 21064 } 21065 21066 /* Merge one page into another. 21067 * 21068 * The nodes from the page pointed to by csrc will be copied to the page 21069 * pointed to by cdst and then the csrc page will be freed. 21070 * 21071 * [in] csrc Cursor pointing to the source page. 21072 * [in] cdst Cursor pointing to the destination page. 21073 * 21074 * Returns 0 on success, non-zero on failure. 
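 *
 * May return MDBX_RESULT_TRUE when the destination page turns out to have
 * not enough room for the updated separator key; rebalance() treats this
 * as a soft failure and tries moving a single node instead, eventually
 * retrying with a zero merge-threshold.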
*/ 21075 static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { 21076 MDBX_val key; 21077 int rc; 21078 21079 cASSERT(csrc, csrc != cdst); 21080 cASSERT(csrc, cursor_is_tracked(csrc)); 21081 cASSERT(cdst, cursor_is_tracked(cdst)); 21082 const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; 21083 MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; 21084 DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, 21085 pdst->mp_pgno); 21086 21087 cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); 21088 cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); 21089 cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */ 21090 cASSERT(cdst, cdst->mc_snum > 1); 21091 cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || 21092 IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); 21093 cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || 21094 IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); 21095 cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); 21096 const int pagetype = PAGETYPE_WHOLE(psrc); 21097 21098 /* Move all nodes from src to dst */ 21099 const unsigned dst_nkeys = page_numkeys(pdst); 21100 const unsigned src_nkeys = page_numkeys(psrc); 21101 cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); 21102 if (likely(src_nkeys)) { 21103 unsigned j = dst_nkeys; 21104 if (unlikely(pagetype & P_LEAF2)) { 21105 /* Mark dst as dirty. */ 21106 if (unlikely(rc = page_touch(cdst))) 21107 return rc; 21108 21109 key.iov_len = csrc->mc_db->md_xsize; 21110 key.iov_base = page_data(psrc); 21111 unsigned i = 0; 21112 do { 21113 rc = node_add_leaf2(cdst, j++, &key); 21114 if (unlikely(rc != MDBX_SUCCESS)) 21115 return rc; 21116 key.iov_base = (char *)key.iov_base + key.iov_len; 21117 } while (++i != src_nkeys); 21118 } else { 21119 MDBX_node *srcnode = page_node(psrc, 0); 21120 key.iov_len = node_ks(srcnode); 21121 key.iov_base = node_key(srcnode); 21122 if (pagetype & P_BRANCH) { 21123 MDBX_cursor mn; 21124 cursor_copy(csrc, &mn); 21125 /* must find the lowest key below src */ 21126 rc = page_search_lowest(&mn); 21127 if (unlikely(rc)) 21128 return rc; 21129 21130 const MDBX_page *mp = mn.mc_pg[mn.mc_top]; 21131 if (likely(!IS_LEAF2(mp))) { 21132 cASSERT(&mn, IS_LEAF(mp)); 21133 const MDBX_node *lowest = page_node(mp, 0); 21134 key.iov_len = node_ks(lowest); 21135 key.iov_base = node_key(lowest); 21136 } else { 21137 cASSERT(&mn, mn.mc_top > csrc->mc_top); 21138 key.iov_len = mp->mp_leaf2_ksize; 21139 key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); 21140 } 21141 cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); 21142 cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); 21143 21144 const size_t dst_room = page_room(pdst); 21145 const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); 21146 const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len; 21147 if (unlikely(space_needed > dst_room)) 21148 return MDBX_RESULT_TRUE; 21149 } 21150 21151 /* Mark dst as dirty. 
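 * (page_touch() makes the page privately writable, i.e. dirty, within the
 * current transaction, copying it beforehand if necessary).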
*/ 21152 if (unlikely(rc = page_touch(cdst))) 21153 return rc; 21154 21155 unsigned i = 0; 21156 while (true) { 21157 if (pagetype & P_LEAF) { 21158 MDBX_val data; 21159 data.iov_len = node_ds(srcnode); 21160 data.iov_base = node_data(srcnode); 21161 rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); 21162 } else { 21163 cASSERT(csrc, node_flags(srcnode) == 0); 21164 rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); 21165 } 21166 if (unlikely(rc != MDBX_SUCCESS)) 21167 return rc; 21168 21169 if (++i == src_nkeys) 21170 break; 21171 srcnode = page_node(psrc, i); 21172 key.iov_len = node_ks(srcnode); 21173 key.iov_base = node_key(srcnode); 21174 } 21175 } 21176 21177 pdst = cdst->mc_pg[cdst->mc_top]; 21178 DEBUG("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", 21179 pdst->mp_pgno, page_numkeys(pdst), 21180 page_fill(cdst->mc_txn->mt_env, pdst)); 21181 21182 cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); 21183 cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); 21184 } 21185 21186 /* Unlink the src page from parent and add to free list. */ 21187 csrc->mc_top--; 21188 node_del(csrc, 0); 21189 if (csrc->mc_ki[csrc->mc_top] == 0) { 21190 const MDBX_val nullkey = {0, 0}; 21191 rc = update_key(csrc, &nullkey); 21192 if (unlikely(rc)) { 21193 csrc->mc_top++; 21194 return rc; 21195 } 21196 } 21197 csrc->mc_top++; 21198 21199 cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); 21200 cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); 21201 21202 { 21203 /* Adjust other cursors pointing to mp */ 21204 MDBX_cursor *m2, *m3; 21205 const MDBX_dbi dbi = csrc->mc_dbi; 21206 const unsigned top = csrc->mc_top; 21207 21208 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 21209 m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 21210 if (m3 == csrc || top >= m3->mc_snum) 21211 continue; 21212 if (m3->mc_pg[top] == psrc) { 21213 m3->mc_pg[top] = pdst; 21214 cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); 21215 m3->mc_ki[top] += (indx_t)dst_nkeys; 21216 m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; 21217 } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && 21218 m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { 21219 m3->mc_ki[top - 1]--; 21220 } 21221 if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) 21222 XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); 21223 } 21224 } 21225 21226 /* If not operating on GC, allow this page to be reused 21227 * in this txn. Otherwise just add to free list. 
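 * Both cases are handled inside page_retire() below.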
*/ 21228 rc = page_retire(csrc, (MDBX_page *)psrc); 21229 if (unlikely(rc)) 21230 return rc; 21231 21232 cASSERT(cdst, cdst->mc_db->md_entries > 0); 21233 cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); 21234 cASSERT(cdst, cdst->mc_top > 0); 21235 cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); 21236 MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; 21237 const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; 21238 const unsigned save_snum = cdst->mc_snum; 21239 const uint16_t save_depth = cdst->mc_db->md_depth; 21240 cursor_pop(cdst); 21241 rc = rebalance(cdst); 21242 if (unlikely(rc)) 21243 return rc; 21244 21245 cASSERT(cdst, cdst->mc_db->md_entries > 0); 21246 cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); 21247 cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); 21248 21249 #if MDBX_ENABLE_PGOP_STAT 21250 cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1; 21251 #endif /* MDBX_ENABLE_PGOP_STAT */ 21252 21253 if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { 21254 /* LY: don't touch cursor if top-page is a LEAF */ 21255 cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || 21256 PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); 21257 return MDBX_SUCCESS; 21258 } 21259 21260 cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); 21261 21262 if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { 21263 /* LY: LEAF-page becomes BRANCH, unable to restore cursor's stack */ 21264 goto bailout; 21265 } 21266 21267 if (top_page == cdst->mc_pg[cdst->mc_top]) { 21268 /* LY: don't touch cursor if prev top-page already on the top */ 21269 cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); 21270 cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || 21271 PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); 21272 return MDBX_SUCCESS; 21273 } 21274 21275 const int new_snum = save_snum - save_depth + cdst->mc_db->md_depth; 21276 if (unlikely(new_snum < 1 || new_snum > cdst->mc_db->md_depth)) { 21277 /* LY: out of range, unable to restore cursor's stack */ 21278 goto bailout; 21279 } 21280 21281 if (top_page == cdst->mc_pg[new_snum - 1]) { 21282 cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx); 21283 /* LY: restore cursor stack */ 21284 cdst->mc_snum = (uint8_t)new_snum; 21285 cdst->mc_top = (uint8_t)new_snum - 1; 21286 cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || 21287 IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); 21288 cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || 21289 PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); 21290 return MDBX_SUCCESS; 21291 } 21292 21293 MDBX_page *const stub_page = (MDBX_page *)(~(uintptr_t)top_page); 21294 const indx_t stub_indx = top_indx; 21295 if (save_depth > cdst->mc_db->md_depth && 21296 ((cdst->mc_pg[save_snum - 1] == top_page && 21297 cdst->mc_ki[save_snum - 1] == top_indx) || 21298 (cdst->mc_pg[save_snum - 1] == stub_page && 21299 cdst->mc_ki[save_snum - 1] == stub_indx))) { 21300 /* LY: restore cursor stack */ 21301 cdst->mc_pg[new_snum - 1] = top_page; 21302 cdst->mc_ki[new_snum - 1] = top_indx; 21303 cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); 21304 cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; 21305 cdst->mc_snum = (uint8_t)new_snum; 21306 cdst->mc_top = (uint8_t)new_snum - 1; 21307 cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || 21308 IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); 21309 cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || 21310 PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); 21311 return MDBX_SUCCESS; 21312 } 21313 21314 bailout: 21315
/* LY: unable to restore cursor's stack */ 21316 cdst->mc_flags &= ~C_INITIALIZED; 21317 return MDBX_CURSOR_FULL; 21318 } 21319 21320 static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { 21321 cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi); 21322 cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); 21323 cASSERT(cdst, cdst->mc_db == csrc->mc_db); 21324 cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); 21325 cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); 21326 cdst->mc_snum = csrc->mc_snum; 21327 cdst->mc_top = csrc->mc_top; 21328 cdst->mc_flags = csrc->mc_flags; 21329 cdst->mc_checking = csrc->mc_checking; 21330 21331 for (unsigned i = 0; i < csrc->mc_snum; i++) { 21332 cdst->mc_pg[i] = csrc->mc_pg[i]; 21333 cdst->mc_ki[i] = csrc->mc_ki[i]; 21334 } 21335 } 21336 21337 /* Copy the contents of a cursor. 21338 * [in] csrc The cursor to copy from. 21339 * [out] cdst The cursor to copy to. */ 21340 static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { 21341 cASSERT(csrc, csrc->mc_txn->mt_txnid >= 21342 csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); 21343 cdst->mc_dbi = csrc->mc_dbi; 21344 cdst->mc_next = NULL; 21345 cdst->mc_backup = NULL; 21346 cdst->mc_xcursor = NULL; 21347 cdst->mc_txn = csrc->mc_txn; 21348 cdst->mc_db = csrc->mc_db; 21349 cdst->mc_dbx = csrc->mc_dbx; 21350 cdst->mc_dbistate = csrc->mc_dbistate; 21351 cursor_restore(csrc, cdst); 21352 } 21353 21354 /* Rebalance the tree after a delete operation. 21355 * [in] mc Cursor pointing to the page where rebalancing should begin. 21356 * Returns 0 on success, non-zero on failure. */ 21357 static int rebalance(MDBX_cursor *mc) { 21358 cASSERT(mc, cursor_is_tracked(mc)); 21359 cASSERT(mc, mc->mc_snum > 0); 21360 cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || 21361 IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); 21362 const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); 21363 21364 STATIC_ASSERT(P_BRANCH == 1); 21365 const unsigned minkeys = (pagetype & P_BRANCH) + 1; 21366 21367 /* Pages emptier than this are candidates for merging. */ 21368 unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI) 21369 ? mc->mc_txn->mt_env->me_merge_threshold 21370 : mc->mc_txn->mt_env->me_merge_threshold_gc; 21371 21372 const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; 21373 const unsigned numkeys = page_numkeys(tp); 21374 const unsigned room = page_room(tp); 21375 DEBUG("rebalancing %s page %" PRIaPGNO 21376 " (has %u keys, full %.1f%%, used %u, room %u bytes )", 21377 (pagetype & P_LEAF) ?
"leaf" : "branch", tp->mp_pgno, numkeys, 21378 page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), 21379 room); 21380 21381 if (unlikely(numkeys < minkeys)) { 21382 DEBUG("page %" PRIaPGNO " must be merged due keys < %u threshold", 21383 tp->mp_pgno, minkeys); 21384 } else if (unlikely(room > room_threshold)) { 21385 DEBUG("page %" PRIaPGNO " should be merged due room %u > %u threshold", 21386 tp->mp_pgno, room, room_threshold); 21387 } else { 21388 DEBUG("no need to rebalance page %" PRIaPGNO ", room %u < %u threshold", 21389 tp->mp_pgno, room, room_threshold); 21390 cASSERT(mc, mc->mc_db->md_entries > 0); 21391 return MDBX_SUCCESS; 21392 } 21393 21394 int rc; 21395 if (mc->mc_snum < 2) { 21396 MDBX_page *const mp = mc->mc_pg[0]; 21397 const unsigned nkeys = page_numkeys(mp); 21398 cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); 21399 if (IS_SUBP(mp)) { 21400 DEBUG("%s", "Can't rebalance a subpage, ignoring"); 21401 cASSERT(mc, pagetype & P_LEAF); 21402 return MDBX_SUCCESS; 21403 } 21404 if (nkeys == 0) { 21405 cASSERT(mc, IS_LEAF(mp)); 21406 DEBUG("%s", "tree is completely empty"); 21407 cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); 21408 mc->mc_db->md_root = P_INVALID; 21409 mc->mc_db->md_depth = 0; 21410 cASSERT(mc, mc->mc_db->md_branch_pages == 0 && 21411 mc->mc_db->md_overflow_pages == 0 && 21412 mc->mc_db->md_leaf_pages == 1); 21413 /* Adjust cursors pointing to mp */ 21414 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; 21415 m2 = m2->mc_next) { 21416 MDBX_cursor *m3 = 21417 (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 21418 if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) 21419 continue; 21420 if (m3->mc_pg[0] == mp) { 21421 m3->mc_snum = 0; 21422 m3->mc_top = 0; 21423 m3->mc_flags &= ~C_INITIALIZED; 21424 } 21425 } 21426 mc->mc_snum = 0; 21427 mc->mc_top = 0; 21428 mc->mc_flags &= ~C_INITIALIZED; 21429 21430 rc = page_retire(mc, mp); 21431 if (unlikely(rc != MDBX_SUCCESS)) 21432 return rc; 21433 } else if (IS_BRANCH(mp) && nkeys == 1) { 21434 DEBUG("%s", "collapsing root page!"); 21435 mc->mc_db->md_root = node_pgno(page_node(mp, 0)); 21436 rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); 21437 if (unlikely(rc != MDBX_SUCCESS)) 21438 return rc; 21439 mc->mc_db->md_depth--; 21440 mc->mc_ki[0] = mc->mc_ki[1]; 21441 for (int i = 1; i < mc->mc_db->md_depth; i++) { 21442 mc->mc_pg[i] = mc->mc_pg[i + 1]; 21443 mc->mc_ki[i] = mc->mc_ki[i + 1]; 21444 } 21445 21446 /* Adjust other cursors pointing to mp */ 21447 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; 21448 m2 = m2->mc_next) { 21449 MDBX_cursor *m3 = 21450 (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; 21451 if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) 21452 continue; 21453 if (m3->mc_pg[0] == mp) { 21454 for (int i = 0; i < mc->mc_db->md_depth; i++) { 21455 m3->mc_pg[i] = m3->mc_pg[i + 1]; 21456 m3->mc_ki[i] = m3->mc_ki[i + 1]; 21457 } 21458 m3->mc_snum--; 21459 m3->mc_top--; 21460 } 21461 } 21462 cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || 21463 PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); 21464 cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || 21465 IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); 21466 21467 rc = page_retire(mc, mp); 21468 if (unlikely(rc != MDBX_SUCCESS)) 21469 return rc; 21470 } else { 21471 DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", 21472 mp->mp_pgno, mp->mp_flags); 21473 } 21474 return MDBX_SUCCESS; 21475 } 21476 21477 /* The parent (branch page) must have at least 2 pointers, 21478 * otherwise the tree is invalid. */ 21479 const unsigned pre_top = mc->mc_top - 1; 21480 cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); 21481 cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); 21482 cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); 21483 21484 /* Leaf page fill factor is below the threshold. 21485 * Try to move keys from left or right neighbor, or 21486 * merge with a neighbor page. */ 21487 21488 /* Find neighbors. */ 21489 MDBX_cursor mn; 21490 cursor_copy(mc, &mn); 21491 21492 MDBX_page *left = nullptr, *right = nullptr; 21493 if (mn.mc_ki[pre_top] > 0) { 21494 rc = page_get( 21495 &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), 21496 &left, mc->mc_pg[mc->mc_top]->mp_txnid); 21497 if (unlikely(rc != MDBX_SUCCESS)) 21498 return rc; 21499 cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); 21500 } 21501 if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { 21502 rc = page_get( 21503 &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), 21504 &right, mc->mc_pg[mc->mc_top]->mp_txnid); 21505 if (unlikely(rc != MDBX_SUCCESS)) 21506 return rc; 21507 cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); 21508 } 21509 cASSERT(mc, left || right); 21510 21511 const unsigned ki_top = mc->mc_ki[mc->mc_top]; 21512 const unsigned ki_pre_top = mn.mc_ki[pre_top]; 21513 const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); 21514 21515 const unsigned left_room = left ? page_room(left) : 0; 21516 const unsigned right_room = right ? page_room(right) : 0; 21517 const unsigned left_nkeys = left ? page_numkeys(left) : 0; 21518 const unsigned right_nkeys = right ? 
page_numkeys(right) : 0; 21519 retry: 21520 if (left_room > room_threshold && left_room >= right_room) { 21521 /* try merge with left */ 21522 cASSERT(mc, left_nkeys >= minkeys); 21523 mn.mc_pg[mn.mc_top] = left; 21524 mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); 21525 mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); 21526 mc->mc_ki[mc->mc_top] = 0; 21527 const unsigned new_ki = ki_top + left_nkeys; 21528 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; 21529 /* We want rebalance to find mn when doing fixups */ 21530 WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); 21531 if (likely(rc != MDBX_RESULT_TRUE)) { 21532 cursor_restore(&mn, mc); 21533 mc->mc_ki[mc->mc_top] = (indx_t)new_ki; 21534 cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); 21535 return rc; 21536 } 21537 } 21538 if (right_room > room_threshold) { 21539 /* try merge with right */ 21540 cASSERT(mc, right_nkeys >= minkeys); 21541 mn.mc_pg[mn.mc_top] = right; 21542 mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); 21543 mn.mc_ki[mn.mc_top] = 0; 21544 mc->mc_ki[mc->mc_top] = (indx_t)nkeys; 21545 WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc)); 21546 if (likely(rc != MDBX_RESULT_TRUE)) { 21547 mc->mc_ki[mc->mc_top] = (indx_t)ki_top; 21548 cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); 21549 return rc; 21550 } 21551 } 21552 21553 if (left_nkeys > minkeys && 21554 (right_nkeys <= left_nkeys || right_room >= left_room)) { 21555 /* try move from left */ 21556 mn.mc_pg[mn.mc_top] = left; 21557 mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); 21558 mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); 21559 mc->mc_ki[mc->mc_top] = 0; 21560 WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true)); 21561 if (likely(rc != MDBX_RESULT_TRUE)) { 21562 mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); 21563 cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); 21564 return rc; 21565 } 21566 } 21567 if (right_nkeys > minkeys) { 21568 /* try move from right */ 21569 mn.mc_pg[mn.mc_top] = right; 21570 mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); 21571 mn.mc_ki[mn.mc_top] = 0; 21572 mc->mc_ki[mc->mc_top] = (indx_t)nkeys; 21573 WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false)); 21574 if (likely(rc != MDBX_RESULT_TRUE)) { 21575 mc->mc_ki[mc->mc_top] = (indx_t)ki_top; 21576 cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); 21577 return rc; 21578 } 21579 } 21580 21581 if (nkeys >= minkeys) { 21582 mc->mc_ki[mc->mc_top] = (indx_t)ki_top; 21583 if (AUDIT_ENABLED()) 21584 return cursor_check_updating(mc); 21585 return MDBX_SUCCESS; 21586 } 21587 21588 if (likely(room_threshold > 0)) { 21589 room_threshold = 0; 21590 goto retry; 21591 } 21592 ERROR("Unable to merge/rebalance %s page %" PRIaPGNO 21593 " (has %u keys, full %.1f%%, used %u, room %u bytes )", 21594 (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, 21595 page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), 21596 room); 21597 return MDBX_PROBLEM; 21598 } 21599 21600 __cold static int page_check(MDBX_cursor *const mc, const MDBX_page *const mp) { 21601 DKBUF; 21602 int rc = MDBX_SUCCESS; 21603 if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) 21604 rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); 21605 21606 MDBX_env *const env = mc->mc_txn->mt_env; 21607 const ptrdiff_t offset = (uint8_t *)mp - env->me_dxb_mmap.dxb; 21608 unsigned flags_mask = P_ILL_BITS; 21609 unsigned flags_expected = 0; 21610 if (offset < 0 || 21611 offset > (ptrdiff_t)(env->me_dxb_mmap.current - ((mp->mp_flags & P_SUBP) 21612 ? PAGEHDRSZ + 1 21613 : env->me_psize))) { 21614 /* should be dirty page without MDBX_WRITEMAP, or a subpage of. */ 21615 flags_mask -= P_SUBP; 21616 if ((env->me_flags & MDBX_WRITEMAP) != 0 || 21617 (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP))) 21618 rc = bad_page(mp, "invalid page-address %p, offset %zi\n", 21619 __Wpedantic_format_voidptr(mp), offset); 21620 } else if (offset & (env->me_psize - 1)) 21621 flags_expected = P_SUBP; 21622 21623 if (unlikely((mp->mp_flags & flags_mask) != flags_expected)) 21624 rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", 21625 mp->mp_flags & flags_mask, flags_expected); 21626 21627 cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); 21628 const uint8_t type = PAGETYPE_WHOLE(mp); 21629 switch (type) { 21630 default: 21631 return bad_page(mp, "invalid type (%u)\n", type); 21632 case P_OVERFLOW: 21633 if (unlikely(mc->mc_flags & C_SUB)) 21634 rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", 21635 "nested dupsort tree", mc->mc_db->md_flags); 21636 const pgno_t npages = mp->mp_pages; 21637 if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) 21638 rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); 21639 if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) 21640 rc = bad_page( 21641 mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", 21642 mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno); 21643 return rc; //-------------------------- end of large/overflow page handling 21644 case P_LEAF | P_SUBP: 21645 if (unlikely(mc->mc_db->md_depth != 1)) 21646 rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", 21647 "leaf-sub", "nested dupsort db", mc->mc_db->md_flags); 21648 /* fall through */ 21649 __fallthrough; 21650 case P_LEAF: 21651 if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) 21652 rc = bad_page( 21653 mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", 21654 mc->mc_db->md_flags); 21655 break; 21656 case P_LEAF | P_LEAF2 | P_SUBP: 21657 if (unlikely(mc->mc_db->md_depth != 1)) 21658 rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", 21659 "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags); 21660 /* fall through */ 21661 __fallthrough; 21662 case P_LEAF | P_LEAF2: 21663 if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) 21664 rc = bad_page( 21665 mp, 21666 "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n", 21667 mc->mc_db->md_flags); 21668 break; 21669 case P_BRANCH: 21670 break; 21671 } 21672 21673 if (unlikely(mp->mp_upper < mp->mp_lower || 21674 ((mp->mp_lower | mp->mp_upper) & 1) || 21675 PAGEHDRSZ + mp->mp_upper > env->me_psize)) 21676 rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %u\n", 21677 mp->mp_lower, 
mp->mp_upper, page_space(env)); 21678 21679 char *const end_of_page = (char *)mp + env->me_psize; 21680 const unsigned nkeys = page_numkeys(mp); 21681 STATIC_ASSERT(P_BRANCH == 1); 21682 if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { 21683 if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && 21684 (!(mc->mc_checking & CC_UPDATING) || 21685 !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) 21686 rc = 21687 bad_page(mp, "%s-page nkeys (%u) < %u\n", 21688 IS_BRANCH(mp) ? "branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); 21689 } 21690 if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + 21691 nkeys * sizeof(MDBX_node) + nkeys - 1 > 21692 env->me_psize)) 21693 rc = bad_page(mp, "invalid page upper (%u) for nkeys %u with limit %u\n", 21694 mp->mp_upper, nkeys, page_space(env)); 21695 21696 const size_t ksize_max = keysize_max(env->me_psize, 0); 21697 const size_t leaf2_ksize = mp->mp_leaf2_ksize; 21698 if (IS_LEAF2(mp)) { 21699 if (unlikely((mc->mc_flags & C_SUB) == 0 || 21700 (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) 21701 rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", 21702 mc->mc_db->md_flags); 21703 if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) 21704 rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); 21705 } 21706 21707 MDBX_val here, prev = {0, 0}; 21708 for (unsigned i = 0; i < nkeys; ++i) { 21709 if (IS_LEAF2(mp)) { 21710 char *const key = page_leaf2key(mp, i, leaf2_ksize); 21711 if (unlikely(end_of_page < key + leaf2_ksize)) { 21712 rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", 21713 key + leaf2_ksize - end_of_page); 21714 continue; 21715 } 21716 21717 if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { 21718 if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || 21719 leaf2_ksize > mc->mc_dbx->md_klen_max)) 21720 rc = bad_page( 21721 mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", 21722 leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); 21723 else 21724 mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; 21725 } 21726 if ((mc->mc_checking & CC_SKIPORD) == 0) { 21727 here.iov_len = leaf2_ksize; 21728 here.iov_base = key; 21729 if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) 21730 rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, 21731 DKEY(&prev), DVAL(&here)); 21732 prev = here; 21733 } 21734 } else { 21735 const MDBX_node *const node = page_node(mp, i); 21736 const char *node_end = (char *)node + NODESIZE; 21737 if (unlikely(node_end > end_of_page)) { 21738 rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i, 21739 node_end - end_of_page); 21740 continue; 21741 } 21742 const size_t ksize = node_ks(node); 21743 if (unlikely(ksize > ksize_max)) 21744 rc = bad_page(mp, "node[%u] too long key (%zu)\n", i, ksize); 21745 char *key = node_key(node); 21746 if (unlikely(end_of_page < key + ksize)) { 21747 rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, 21748 key + ksize - end_of_page); 21749 continue; 21750 } 21751 if ((IS_LEAF(mp) || i > 0)) { 21752 if (unlikely(ksize < mc->mc_dbx->md_klen_min || 21753 ksize > mc->mc_dbx->md_klen_max)) 21754 rc = bad_page( 21755 mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", 21756 i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); 21757 if ((mc->mc_checking & CC_SKIPORD) == 0) { 21758 here.iov_base = key; 21759 here.iov_len = ksize; 21760 if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) 21761 rc = bad_page(mp, "node[%u] key wrong 
order (%s >= %s)\n", i, 21762 DKEY(&prev), DVAL(&here)); 21763 prev = here; 21764 } 21765 } 21766 if (IS_BRANCH(mp)) { 21767 if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && 21768 unlikely(ksize != 0)) 21769 rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", 21770 i, ksize); 21771 const pgno_t ref = node_pgno(node); 21772 if (unlikely(ref < MIN_PAGENO) || 21773 (unlikely(ref >= mc->mc_txn->mt_next_pgno) && 21774 (unlikely(ref >= mc->mc_txn->mt_geo.now) || 21775 !(mc->mc_checking & CC_RETIRING)))) 21776 rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); 21777 if (unlikely(node_flags(node))) 21778 rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, 21779 node_flags(node)); 21780 continue; 21781 } 21782 21783 switch (node_flags(node)) { 21784 default: 21785 rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node)); 21786 break; 21787 case F_BIGDATA /* data on large-page */: 21788 case 0 /* usual */: 21789 case F_SUBDATA /* sub-db */: 21790 case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: 21791 case F_DUPDATA /* short sub-page */: 21792 break; 21793 } 21794 21795 const size_t dsize = node_ds(node); 21796 const char *const data = node_data(node); 21797 if (node_flags(node) & F_BIGDATA) { 21798 if (unlikely(end_of_page < data + sizeof(pgno_t))) { 21799 rc = bad_page( 21800 mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", 21801 "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); 21802 continue; 21803 } 21804 if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || 21805 dsize > mc->mc_dbx->md_vlen_max)) 21806 rc = bad_page( 21807 mp, 21808 "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", 21809 dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); 21810 if (unlikely(node_size_len(node_ks(node), dsize) <= 21811 mc->mc_txn->mt_env->me_leaf_nodemax)) 21812 poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); 21813 21814 if ((mc->mc_checking & CC_RETIRING) == 0) { 21815 const pgr_t lp = 21816 page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); 21817 if (unlikely(lp.err != MDBX_SUCCESS)) 21818 return lp.err; 21819 cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); 21820 const unsigned npages = number_of_ovpages(env, dsize); 21821 if (unlikely(lp.page->mp_pages != npages)) { 21822 if (lp.page->mp_pages < npages) 21823 rc = bad_page(lp.page, 21824 "too few n-pages %u for bigdata-node (%zu bytes)", 21825 lp.page->mp_pages, dsize); 21826 else 21827 poor_page(lp.page, 21828 "extra n-pages %u for bigdata-node (%zu bytes)", 21829 lp.page->mp_pages, dsize); 21830 } 21831 } 21832 continue; 21833 } 21834 21835 if (unlikely(end_of_page < data + dsize)) { 21836 rc = 21837 bad_page(mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", 21838 "data", i, nkeys, dsize, data + dsize - end_of_page); 21839 continue; 21840 } 21841 21842 switch (node_flags(node)) { 21843 default: 21844 /* wrong, but already handled */ 21845 continue; 21846 case 0 /* usual */: 21847 if (unlikely(dsize < mc->mc_dbx->md_vlen_min || 21848 dsize > mc->mc_dbx->md_vlen_max)) { 21849 rc = bad_page( 21850 mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", 21851 dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); 21852 continue; 21853 } 21854 break; 21855 case F_SUBDATA /* sub-db */: 21856 if (unlikely(dsize != sizeof(MDBX_db))) { 21857 rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize); 21858 continue; 21859 } 21860 break; 21861 case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: 21862 if
(unlikely(dsize != sizeof(MDBX_db))) { 21863 rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize); 21864 continue; 21865 } 21866 break; 21867 case F_DUPDATA /* short sub-page */: 21868 if (unlikely(dsize <= PAGEHDRSZ)) { 21869 rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n", 21870 dsize); 21871 continue; 21872 } else { 21873 const MDBX_page *const sp = (MDBX_page *)data; 21874 switch (sp->mp_flags & 21875 /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { 21876 case P_LEAF | P_SUBP: 21877 case P_LEAF | P_LEAF2 | P_SUBP: 21878 break; 21879 default: 21880 rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n", 21881 sp->mp_flags); 21882 continue; 21883 } 21884 21885 const char *const end_of_subpage = data + dsize; 21886 const int nsubkeys = page_numkeys(sp); 21887 if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && 21888 mc->mc_db->md_entries) 21889 rc = bad_page(mp, "no keys on a %s-page\n", 21890 IS_LEAF2(sp) ? "leaf2-sub" : "leaf-sub"); 21891 21892 MDBX_val sub_here, sub_prev = {0, 0}; 21893 for (int j = 0; j < nsubkeys; j++) { 21894 if (IS_LEAF2(sp)) { 21895 /* LEAF2 pages have no mp_ptrs[] or node headers */ 21896 size_t sub_ksize = sp->mp_leaf2_ksize; 21897 char *sub_key = page_leaf2key(sp, j, sub_ksize); 21898 if (unlikely(end_of_subpage < sub_key + sub_ksize)) { 21899 rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", 21900 sub_key + sub_ksize - end_of_subpage); 21901 continue; 21902 } 21903 21904 if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { 21905 if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || 21906 sub_ksize > mc->mc_dbx->md_vlen_max)) 21907 rc = bad_page(mp, 21908 "nested-leaf2-key size (%zu) <> min/max " 21909 "value-length (%zu/%zu)\n", 21910 sub_ksize, mc->mc_dbx->md_vlen_min, 21911 mc->mc_dbx->md_vlen_max); 21912 else 21913 mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; 21914 } 21915 if ((mc->mc_checking & CC_SKIPORD) == 0) { 21916 sub_here.iov_len = sub_ksize; 21917 sub_here.iov_base = sub_key; 21918 if (sub_prev.iov_base && 21919 unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) 21920 rc = bad_page(mp, 21921 "nested-leaf2-key #%u wrong order (%s >= %s)\n", 21922 j, DKEY(&sub_prev), DVAL(&sub_here)); 21923 sub_prev = sub_here; 21924 } 21925 } else { 21926 const MDBX_node *const sub_node = page_node(sp, j); 21927 const char *sub_node_end = (char *)sub_node + NODESIZE; 21928 if (unlikely(sub_node_end > end_of_subpage)) { 21929 rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", 21930 sub_node_end - end_of_subpage); 21931 continue; 21932 } 21933 if (unlikely(node_flags(sub_node) != 0)) 21934 rc = bad_page(mp, "nested-node invalid flags (%u)\n", 21935 node_flags(sub_node)); 21936 21937 size_t sub_ksize = node_ks(sub_node); 21938 char *sub_key = node_key(sub_node); 21939 size_t sub_dsize = node_ds(sub_node); 21940 /* char *sub_data = node_data(sub_node); */ 21941 21942 if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || 21943 sub_ksize > mc->mc_dbx->md_vlen_max)) 21944 rc = bad_page(mp, 21945 "nested-node-key size (%zu) <> min/max " 21946 "value-length (%zu/%zu)\n", 21947 sub_ksize, mc->mc_dbx->md_vlen_min, 21948 mc->mc_dbx->md_vlen_max); 21949 if ((mc->mc_checking & CC_SKIPORD) == 0) { 21950 sub_here.iov_len = sub_ksize; 21951 sub_here.iov_base = sub_key; 21952 if (sub_prev.iov_base && 21953 unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) 21954 rc = bad_page(mp, 21955 "nested-node-key #%u wrong order (%s >= %s)\n", 21956 j, DKEY(&sub_prev), DVAL(&sub_here));
21957 sub_prev = sub_here; 21958 } 21959 if (unlikely(sub_dsize != 0)) 21960 rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", 21961 sub_dsize); 21962 if (unlikely(end_of_subpage < sub_key + sub_ksize)) 21963 rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n", 21964 sub_key + sub_ksize - end_of_subpage); 21965 } 21966 } 21967 } 21968 break; 21969 } 21970 } 21971 } 21972 return rc; 21973 } 21974 21975 __cold static int cursor_check(MDBX_cursor *mc) { 21976 cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == 21977 (mc->mc_txn->mt_parent 21978 ? mc->mc_txn->mt_parent->tw.dirtyroom 21979 : mc->mc_txn->mt_env->me_options.dp_limit)); 21980 cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); 21981 if (unlikely(mc->mc_top != mc->mc_snum - 1) && 21982 (mc->mc_checking & CC_UPDATING) == 0) 21983 return MDBX_CURSOR_FULL; 21984 cASSERT(mc, (mc->mc_checking & CC_UPDATING) 21985 ? mc->mc_snum <= mc->mc_db->md_depth 21986 : mc->mc_snum == mc->mc_db->md_depth); 21987 if (unlikely((mc->mc_checking & CC_UPDATING) 21988 ? mc->mc_snum > mc->mc_db->md_depth 21989 : mc->mc_snum != mc->mc_db->md_depth)) 21990 return MDBX_CURSOR_FULL; 21991 21992 for (int n = 0; n < (int)mc->mc_snum; ++n) { 21993 MDBX_page *mp = mc->mc_pg[n]; 21994 const unsigned nkeys = page_numkeys(mp); 21995 const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false; 21996 const bool expect_nested_leaf = 21997 (n + 1 == mc->mc_db->md_depth - 1) ? true : false; 21998 const bool branch = IS_BRANCH(mp) ? true : false; 21999 cASSERT(mc, branch == expect_branch); 22000 if (unlikely(branch != expect_branch)) 22001 return MDBX_CURSOR_FULL; 22002 if ((mc->mc_checking & CC_UPDATING) == 0) { 22003 cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && 22004 (mc->mc_flags & C_EOF) != 0)); 22005 if (unlikely(nkeys <= mc->mc_ki[n] && 22006 !(!branch && nkeys == mc->mc_ki[n] && 22007 (mc->mc_flags & C_EOF) != 0))) 22008 return MDBX_CURSOR_FULL; 22009 } else { 22010 cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]); 22011 if (unlikely(nkeys + 1 < mc->mc_ki[n])) 22012 return MDBX_CURSOR_FULL; 22013 } 22014 22015 int err = page_check(mc, mp); 22016 if (unlikely(err != MDBX_SUCCESS)) 22017 return err; 22018 22019 for (unsigned i = 0; i < nkeys; ++i) { 22020 if (branch) { 22021 MDBX_node *node = page_node(mp, i); 22022 cASSERT(mc, node_flags(node) == 0); 22023 if (unlikely(node_flags(node) != 0)) 22024 return MDBX_CURSOR_FULL; 22025 pgno_t pgno = node_pgno(node); 22026 MDBX_page *np; 22027 err = page_get(mc, pgno, &np, mp->mp_txnid); 22028 cASSERT(mc, err == MDBX_SUCCESS); 22029 if (unlikely(err != MDBX_SUCCESS)) 22030 return err; 22031 const bool nested_leaf = IS_LEAF(np) ? true : false; 22032 cASSERT(mc, nested_leaf == expect_nested_leaf); 22033 if (unlikely(nested_leaf != expect_nested_leaf)) 22034 return MDBX_CURSOR_FULL; 22035 err = page_check(mc, np); 22036 if (unlikely(err != MDBX_SUCCESS)) 22037 return err; 22038 } 22039 } 22040 } 22041 return MDBX_SUCCESS; 22042 } 22043 22044 __cold static int cursor_check_updating(MDBX_cursor *mc) { 22045 const uint8_t checking = mc->mc_checking; 22046 mc->mc_checking |= CC_UPDATING; 22047 const int rc = cursor_check(mc); 22048 mc->mc_checking = checking; 22049 return rc; 22050 } 22051 22052 /* Complete a delete operation started by mdbx_cursor_del(). 
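 * In outline (see the body below): the node is removed via node_del() and
 * md_entries is decremented; any other tracked cursors standing on the same
 * page are marked C_DEL or shifted one slot left; then rebalance() restores
 * the B-tree invariants, after which cursors that ran past the last node of
 * the page are moved to the right sibling and their sub-cursors refreshed.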
*/ 22053 static int cursor_del(MDBX_cursor *mc) { 22054 int rc; 22055 MDBX_page *mp; 22056 indx_t ki; 22057 unsigned nkeys; 22058 MDBX_dbi dbi = mc->mc_dbi; 22059 22060 cASSERT(mc, cursor_is_tracked(mc)); 22061 cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 22062 ki = mc->mc_ki[mc->mc_top]; 22063 mp = mc->mc_pg[mc->mc_top]; 22064 node_del(mc, mc->mc_db->md_xsize); 22065 mc->mc_db->md_entries--; 22066 22067 /* Adjust other cursors pointing to mp */ 22068 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 22069 MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 22070 if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 22071 continue; 22072 if (m3->mc_snum < mc->mc_snum) 22073 continue; 22074 if (m3->mc_pg[mc->mc_top] == mp) { 22075 if (m3->mc_ki[mc->mc_top] == ki) { 22076 m3->mc_flags |= C_DEL; 22077 if (mc->mc_db->md_flags & MDBX_DUPSORT) { 22078 /* Sub-cursor referred into dataset which is gone */ 22079 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); 22080 } 22081 continue; 22082 } else if (m3->mc_ki[mc->mc_top] > ki) { 22083 m3->mc_ki[mc->mc_top]--; 22084 } 22085 if (XCURSOR_INITED(m3)) 22086 XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); 22087 } 22088 } 22089 22090 rc = rebalance(mc); 22091 if (unlikely(rc != MDBX_SUCCESS)) 22092 goto bailout; 22093 22094 if (unlikely(!mc->mc_snum)) { 22095 /* DB is totally empty now, just bail out. 22096 * Other cursors adjustments were already done 22097 * by rebalance and aren't needed here. */ 22098 cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && 22099 mc->mc_db->md_root == P_INVALID); 22100 mc->mc_flags |= C_EOF; 22101 return MDBX_SUCCESS; 22102 } 22103 22104 ki = mc->mc_ki[mc->mc_top]; 22105 mp = mc->mc_pg[mc->mc_top]; 22106 cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 22107 nkeys = page_numkeys(mp); 22108 cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || 22109 ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && 22110 nkeys == 0)); 22111 22112 /* Adjust this and other cursors pointing to mp */ 22113 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 22114 MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 22115 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 22116 continue; 22117 if (m3->mc_snum < mc->mc_snum) 22118 continue; 22119 if (m3->mc_pg[mc->mc_top] == mp) { 22120 /* if m3 points past last node in page, find next sibling */ 22121 if (m3->mc_ki[mc->mc_top] >= nkeys) { 22122 rc = cursor_sibling(m3, SIBLING_RIGHT); 22123 if (rc == MDBX_NOTFOUND) { 22124 m3->mc_flags |= C_EOF; 22125 rc = MDBX_SUCCESS; 22126 continue; 22127 } 22128 if (unlikely(rc != MDBX_SUCCESS)) 22129 goto bailout; 22130 } 22131 if (m3->mc_ki[mc->mc_top] >= ki || 22132 /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { 22133 if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { 22134 MDBX_node *node = 22135 page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); 22136 /* If this node has dupdata, it may need to be reinited 22137 * because its data has moved. 22138 * If the xcursor was not inited it must be reinited. 22139 * Else if node points to a subDB, nothing is needed. 
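 * (For a plain F_DUPDATA node the sub-page lives inline in the leaf node's
 * data, so mc_pg[0] of the sub-cursor is re-pointed at node_data(node)
 * below; for F_DUPDATA|F_SUBDATA the duplicates form a separate page tree
 * and no re-pointing is required.)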
*/ 22140 if (node_flags(node) & F_DUPDATA) { 22141 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 22142 if (!(node_flags(node) & F_SUBDATA)) 22143 m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); 22144 } else { 22145 rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); 22146 if (unlikely(rc != MDBX_SUCCESS)) 22147 goto bailout; 22148 rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); 22149 if (unlikely(rc != MDBX_SUCCESS)) 22150 goto bailout; 22151 } 22152 } 22153 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; 22154 } 22155 m3->mc_flags |= C_DEL; 22156 } 22157 } 22158 } 22159 22160 cASSERT(mc, rc == MDBX_SUCCESS); 22161 if (AUDIT_ENABLED()) 22162 rc = cursor_check(mc); 22163 return rc; 22164 22165 bailout: 22166 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 22167 return rc; 22168 } 22169 22170 int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, 22171 const MDBX_val *data) { 22172 int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); 22173 if (unlikely(rc != MDBX_SUCCESS)) 22174 return rc; 22175 22176 if (unlikely(!key)) 22177 return MDBX_EINVAL; 22178 22179 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 22180 return MDBX_BAD_DBI; 22181 22182 if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) 22183 return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; 22184 22185 return delete (txn, dbi, key, data, 0); 22186 } 22187 22188 static int delete (MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, 22189 const MDBX_val *data, unsigned flags) { 22190 MDBX_cursor_couple cx; 22191 MDBX_cursor_op op; 22192 MDBX_val rdata; 22193 int rc; 22194 DKBUF_DEBUG; 22195 22196 DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), 22197 DVAL_DEBUG(data)); 22198 22199 rc = cursor_init(&cx.outer, txn, dbi); 22200 if (unlikely(rc != MDBX_SUCCESS)) 22201 return rc; 22202 22203 if (data) { 22204 op = MDBX_GET_BOTH; 22205 rdata = *data; 22206 data = &rdata; 22207 } else { 22208 op = MDBX_SET; 22209 flags |= MDBX_ALLDUPS; 22210 } 22211 rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; 22212 if (likely(rc == MDBX_SUCCESS)) { 22213 /* let mdbx_page_split know about this cursor if needed: 22214 * delete will trigger a rebalance; if it needs to move 22215 * a node from one page to another, it will have to 22216 * update the parent's separator key(s). If the new sepkey 22217 * is larger than the current one, the parent page may 22218 * run out of space, triggering a split. We need this 22219 * cursor to be consistent until the end of the rebalance. */ 22220 cx.outer.mc_next = txn->mt_cursors[dbi]; 22221 txn->mt_cursors[dbi] = &cx.outer; 22222 rc = mdbx_cursor_del(&cx.outer, flags); 22223 txn->mt_cursors[dbi] = cx.outer.mc_next; 22224 } 22225 return rc; 22226 } 22227 22228 /* Split a page and insert a new node. 22229 * Set MDBX_TXN_ERROR on failure. 22230 * [in,out] mc Cursor pointing to the page and desired insertion index. 22231 * The cursor will be updated to point to the actual page and index where 22232 * the node got inserted after the split. 22233 * [in] newkey The key for the newly inserted node. 22234 * [in] newdata The data for the newly inserted node. 22235 * [in] newpgno The page number, if the new node is a branch node. 22236 * [in] naf The NODE_ADD_FLAGS for the new node. 22237 * Returns 0 on success, non-zero on failure. 
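 *
 * A hypothetical call-site sketch (illustrative only; the actual callers
 * are cursor_put and update_key elsewhere in this file): when the target
 * leaf has no room for the new node, the insert path hands the cursor
 * over to page_split, e.g.
 *
 *   if (page_room(mc->mc_pg[mc->mc_top]) < leaf_size(env, &key, &data))
 *     rc = page_split(mc, &key, &data, P_INVALID, 0);
 *
 * where P_INVALID as newpgno marks a leaf-level insertion (see the
 * newpgno checks for the pure-left/pure-right cases below).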
*/ 22238 static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, 22239 MDBX_val *const newdata, pgno_t newpgno, 22240 const unsigned naf) { 22241 unsigned flags; 22242 int rc = MDBX_SUCCESS, foliage = 0; 22243 unsigned i, ptop; 22244 MDBX_env *const env = mc->mc_txn->mt_env; 22245 MDBX_val sepkey, rkey, xdata; 22246 MDBX_page *tmp_ki_copy = NULL; 22247 DKBUF; 22248 22249 MDBX_page *const mp = mc->mc_pg[mc->mc_top]; 22250 const unsigned newindx = mc->mc_ki[mc->mc_top]; 22251 unsigned nkeys = page_numkeys(mp); 22252 if (AUDIT_ENABLED()) { 22253 rc = cursor_check_updating(mc); 22254 if (unlikely(rc != MDBX_SUCCESS)) 22255 return rc; 22256 } 22257 STATIC_ASSERT(P_BRANCH == 1); 22258 const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; 22259 22260 DEBUG(">> splitting %s-page %" PRIaPGNO 22261 " and adding %zu+%zu [%s] at %i, nkeys %i", 22262 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, 22263 newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), 22264 mc->mc_ki[mc->mc_top], nkeys); 22265 cASSERT(mc, nkeys + 1 >= minkeys * 2); 22266 22267 /* Create a new sibling page. */ 22268 pgr_t npr = page_new(mc, mp->mp_flags); 22269 if (unlikely(npr.err != MDBX_SUCCESS)) 22270 return npr.err; 22271 MDBX_page *const sister = npr.page; 22272 sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; 22273 DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno); 22274 22275 /* Usually when splitting the root page, the cursor 22276 * height is 1. But when called from update_key, 22277 * the cursor height may be greater because it walks 22278 * up the stack while finding the branch slot to update. */ 22279 if (mc->mc_top < 1) { 22280 npr = page_new(mc, P_BRANCH); 22281 rc = npr.err; 22282 if (unlikely(rc != MDBX_SUCCESS)) 22283 goto done; 22284 MDBX_page *const pp = npr.page; 22285 /* shift current top to make room for new parent */ 22286 cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); 22287 #if MDBX_DEBUG 22288 memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); 22289 memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); 22290 #endif 22291 mc->mc_pg[2] = mc->mc_pg[1]; 22292 mc->mc_ki[2] = mc->mc_ki[1]; 22293 mc->mc_pg[1] = mc->mc_pg[0]; 22294 mc->mc_ki[1] = mc->mc_ki[0]; 22295 mc->mc_pg[0] = pp; 22296 mc->mc_ki[0] = 0; 22297 mc->mc_db->md_root = pp->mp_pgno; 22298 DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno); 22299 foliage = mc->mc_db->md_depth++; 22300 22301 /* Add left (implicit) pointer. */ 22302 rc = node_add_branch(mc, 0, NULL, mp->mp_pgno); 22303 if (unlikely(rc != MDBX_SUCCESS)) { 22304 /* undo the pre-push */ 22305 mc->mc_pg[0] = mc->mc_pg[1]; 22306 mc->mc_ki[0] = mc->mc_ki[1]; 22307 mc->mc_db->md_root = mp->mp_pgno; 22308 mc->mc_db->md_depth--; 22309 goto done; 22310 } 22311 mc->mc_snum++; 22312 mc->mc_top++; 22313 ptop = 0; 22314 if (AUDIT_ENABLED()) { 22315 rc = cursor_check_updating(mc); 22316 if (unlikely(rc != MDBX_SUCCESS)) 22317 goto done; 22318 } 22319 } else { 22320 ptop = mc->mc_top - 1; 22321 DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); 22322 } 22323 22324 MDBX_cursor mn; 22325 cursor_copy(mc, &mn); 22326 mn.mc_pg[mn.mc_top] = sister; 22327 mn.mc_ki[mn.mc_top] = 0; 22328 mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; 22329 22330 unsigned split_indx = 22331 (newindx < nkeys) 22332 ? /* split at the middle */ (nkeys + 1) >> 1 22333 : /* split at the end (i.e. 
like append-mode) */ nkeys - minkeys + 1; 22334 eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); 22335 22336 cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); 22337 /* It is reasonable and possible to split the page at the beginning */ 22338 if (unlikely(newindx < minkeys)) { 22339 split_indx = minkeys; 22340 if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { 22341 split_indx = 0; 22342 /* Check whether the split can be done as a left-side insertion 22343 * of a pure page with the new key */ 22344 for (i = 0; i < mc->mc_top; ++i) 22345 if (mc->mc_ki[i]) { 22346 get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey); 22347 if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0) 22348 split_indx = minkeys; 22349 break; 22350 } 22351 if (split_indx == 0) { 22352 /* Save the current first key, which is omitted on the parent branch 22353 * page and must be updated if a new first entry is added */ 22354 if (IS_LEAF2(mp)) { 22355 sepkey.iov_len = mp->mp_leaf2_ksize; 22356 sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); 22357 } else 22358 get_key(page_node(mp, 0), &sepkey); 22359 cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); 22360 /* Avoid rare complex cases of splitting the parent page */ 22361 if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) 22362 split_indx = minkeys; 22363 } 22364 } 22365 } 22366 22367 const bool pure_right = split_indx == nkeys; 22368 const bool pure_left = split_indx == 0; 22369 if (unlikely(pure_right)) { 22370 /* newindx == split_indx == nkeys */ 22371 TRACE("no-split, but add new pure page at the %s", "right/after"); 22372 cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); 22373 sepkey = *newkey; 22374 } else if (unlikely(pure_left)) { 22375 /* newindx == split_indx == 0 */ 22376 TRACE("no-split, but add new pure page at the %s", "left/before"); 22377 cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); 22378 TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); 22379 } else { 22380 if (IS_LEAF2(sister)) { 22381 char *split, *ins; 22382 unsigned lsize, rsize, ksize; 22383 /* Move half of the keys to the right sibling */ 22384 const int distance = mc->mc_ki[mc->mc_top] - split_indx; 22385 ksize = mc->mc_db->md_xsize; 22386 split = page_leaf2key(mp, split_indx, ksize); 22387 rsize = (nkeys - split_indx) * ksize; 22388 lsize = (nkeys - split_indx) * sizeof(indx_t); 22389 cASSERT(mc, mp->mp_lower >= lsize); 22390 mp->mp_lower -= (indx_t)lsize; 22391 cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); 22392 sister->mp_lower += (indx_t)lsize; 22393 cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); 22394 mp->mp_upper += (indx_t)(rsize - lsize); 22395 cASSERT(mc, sister->mp_upper >= rsize - lsize); 22396 sister->mp_upper -= (indx_t)(rsize - lsize); 22397 sepkey.iov_len = ksize; 22398 sepkey.iov_base = (newindx != split_indx) ?
split : newkey->iov_base; 22399 if (distance < 0) { 22400 cASSERT(mc, ksize >= sizeof(indx_t)); 22401 ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); 22402 memcpy(sister->mp_ptrs, split, rsize); 22403 sepkey.iov_base = sister->mp_ptrs; 22404 memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); 22405 memcpy(ins, newkey->iov_base, ksize); 22406 cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); 22407 mp->mp_lower += sizeof(indx_t); 22408 cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); 22409 mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); 22410 } else { 22411 memcpy(sister->mp_ptrs, split, distance * ksize); 22412 ins = page_leaf2key(sister, distance, ksize); 22413 memcpy(ins, newkey->iov_base, ksize); 22414 memcpy(ins + ksize, split + distance * ksize, rsize - distance * ksize); 22415 cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); 22416 sister->mp_lower += sizeof(indx_t); 22417 cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); 22418 sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); 22419 cASSERT(mc, distance <= (int)UINT16_MAX); 22420 mc->mc_ki[mc->mc_top] = (indx_t)distance; 22421 } 22422 22423 if (AUDIT_ENABLED()) { 22424 rc = cursor_check_updating(mc); 22425 if (unlikely(rc != MDBX_SUCCESS)) 22426 goto done; 22427 rc = cursor_check_updating(&mn); 22428 if (unlikely(rc != MDBX_SUCCESS)) 22429 goto done; 22430 } 22431 } else { 22432 /* grab a page to hold a temporary copy */ 22433 tmp_ki_copy = page_malloc(mc->mc_txn, 1); 22434 if (unlikely(tmp_ki_copy == NULL)) { 22435 rc = MDBX_ENOMEM; 22436 goto done; 22437 } 22438 22439 const unsigned max_space = page_space(env); 22440 const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) 22441 : branch_size(env, newkey); 22442 22443 /* prepare to insert */ 22444 for (i = 0; i < newindx; ++i) 22445 tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i]; 22446 tmp_ki_copy->mp_ptrs[i] = (indx_t)-1; 22447 while (++i <= nkeys) 22448 tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1]; 22449 tmp_ki_copy->mp_pgno = mp->mp_pgno; 22450 tmp_ki_copy->mp_flags = mp->mp_flags; 22451 tmp_ki_copy->mp_txnid = INVALID_TXNID; 22452 tmp_ki_copy->mp_lower = 0; 22453 tmp_ki_copy->mp_upper = (indx_t)max_space; 22454 22455 /* The node being added may not fit into a half-page together with 22456 * a count-based half of the nodes from the source page. In the worst 22457 * case, the half-page receiving the new node may get the largest nodes 22458 * from the source page, while the other half-page gets only the nodes 22459 * with the shortest keys and empty data. So, to find a suitable cut 22460 * point, the nodes have to be iterated while summing up their sizes. 22461 * 22462 * However, with a simple split by count (ignoring the size of keys 22463 * and data) each half-page ends up with roughly half as many nodes. 22464 * Therefore the node being added is guaranteed to fit if its size is 22465 * no larger than the space "freed" by the node headers which move to 22466 * the other half-page. Besides that, each key takes at least one byte, 22467 * in the worst case all but one, which may be 22468 * of zero size. */ 22469 22470 if (newindx == split_indx && nkeys >= 5) { 22471 STATIC_ASSERT(P_BRANCH == 1); 22472 split_indx += mp->mp_flags & P_BRANCH; 22473 } 22474 eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); 22475 const unsigned dim_nodes = 22476 (newindx >= split_indx) ?
split_indx : nkeys - split_indx; 22477 const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; 22478 if (new_size >= dim_used) { 22479 /* Search for best acceptable split point */ 22480 i = (newindx < split_indx) ? 0 : nkeys; 22481 int dir = (newindx < split_indx) ? 1 : -1; 22482 size_t before = 0, after = new_size + page_used(env, mp); 22483 unsigned best_split = split_indx; 22484 unsigned best_shift = INT_MAX; 22485 22486 TRACE("seek separator from %u, step %i, default %u, new-idx %u, " 22487 "new-size %zu", 22488 i, dir, split_indx, newindx, new_size); 22489 do { 22490 cASSERT(mc, i <= nkeys); 22491 size_t size = new_size; 22492 if (i != newindx) { 22493 MDBX_node *node = 22494 (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); 22495 size = NODESIZE + node_ks(node) + sizeof(indx_t); 22496 if (IS_LEAF(mp)) 22497 size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) 22498 : node_ds(node); 22499 size = EVEN(size); 22500 } 22501 22502 before += size; 22503 after -= size; 22504 TRACE("step %u, size %zu, before %zu, after %zu, max %u", i, size, 22505 before, after, max_space); 22506 22507 if (before <= max_space && after <= max_space) { 22508 const unsigned split = i + (dir > 0); 22509 if (split >= minkeys && split <= nkeys + 1 - minkeys) { 22510 const unsigned shift = branchless_abs(split_indx - split); 22511 if (shift >= best_shift) 22512 break; 22513 best_shift = shift; 22514 best_split = split; 22515 if (!best_shift) 22516 break; 22517 } 22518 } 22519 i += dir; 22520 } while (i < nkeys); 22521 22522 split_indx = best_split; 22523 TRACE("chosen %u", split_indx); 22524 } 22525 eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); 22526 22527 sepkey = *newkey; 22528 if (split_indx != newindx) { 22529 MDBX_node *node = 22530 (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + 22531 PAGEHDRSZ); 22532 sepkey.iov_len = node_ks(node); 22533 sepkey.iov_base = node_key(node); 22534 } 22535 } 22536 } 22537 DEBUG("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); 22538 22539 bool did_split_parent = false; 22540 /* Copy separator key to the parent. */ 22541 if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { 22542 TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); 22543 cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2); 22544 cASSERT(mc, !pure_left); 22545 const int snum = mc->mc_snum; 22546 const int depth = mc->mc_db->md_depth; 22547 mn.mc_snum--; 22548 mn.mc_top--; 22549 did_split_parent = true; 22550 /* We want other splits to find mn when doing fixups */ 22551 WITH_CURSOR_TRACKING( 22552 mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); 22553 if (unlikely(rc != MDBX_SUCCESS)) 22554 goto done; 22555 cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); 22556 if (AUDIT_ENABLED()) { 22557 rc = cursor_check_updating(mc); 22558 if (unlikely(rc != MDBX_SUCCESS)) 22559 goto done; 22560 } 22561 22562 /* root split? */ 22563 ptop += mc->mc_snum - snum; 22564 22565 /* Right page might now have changed parent. 22566 * Check if left page also changed parent. 
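 * (The recursive page_split above may have moved the separator into a new
 * parent page: mn was kept up to date via WITH_CURSOR_TRACKING, so when
 * mc's slot index no longer fits its recorded parent, mc adopts mn's path
 * and then steps one entry, or one sibling, to the left.)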
*/ 22567 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 22568 mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { 22569 for (i = 0; i < ptop; i++) { 22570 mc->mc_pg[i] = mn.mc_pg[i]; 22571 mc->mc_ki[i] = mn.mc_ki[i]; 22572 } 22573 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 22574 if (mn.mc_ki[ptop]) { 22575 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 22576 } else { 22577 /* find right page's left sibling */ 22578 mc->mc_ki[ptop] = mn.mc_ki[ptop]; 22579 rc = cursor_sibling(mc, SIBLING_LEFT); 22580 if (unlikely(rc != MDBX_SUCCESS)) { 22581 if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { 22582 ERROR("unexpected %i error going left sibling", rc); 22583 rc = MDBX_PROBLEM; 22584 } 22585 goto done; 22586 } 22587 } 22588 } 22589 } else if (unlikely(pure_left)) { 22590 MDBX_page *ptop_page = mc->mc_pg[ptop]; 22591 DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", 22592 ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, 22593 DKEY(mc->mc_ki[ptop] ? newkey : NULL)); 22594 mc->mc_top--; 22595 rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? newkey : NULL, 22596 sister->mp_pgno); 22597 cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && 22598 ptop == mc->mc_top); 22599 22600 if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { 22601 DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); 22602 MDBX_node *node = page_node(mc->mc_pg[ptop], 1); 22603 cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); 22604 cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); 22605 mc->mc_ki[ptop] = 1; 22606 rc = update_key(mc, &sepkey); 22607 cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); 22608 cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); 22609 mc->mc_ki[ptop] = 0; 22610 } 22611 22612 mc->mc_top++; 22613 if (unlikely(rc != MDBX_SUCCESS)) 22614 goto done; 22615 22616 MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); 22617 cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); 22618 } else { 22619 mn.mc_top--; 22620 TRACE("add-to-parent the right-entry[%u] for new sibling-page", 22621 mn.mc_ki[ptop]); 22622 rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); 22623 mn.mc_top++; 22624 if (unlikely(rc != MDBX_SUCCESS)) 22625 goto done; 22626 } 22627 22628 if (unlikely(pure_left | pure_right)) { 22629 mc->mc_pg[mc->mc_top] = sister; 22630 mc->mc_ki[mc->mc_top] = 0; 22631 switch (PAGETYPE_WHOLE(sister)) { 22632 case P_LEAF: { 22633 cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); 22634 rc = node_add_leaf(mc, 0, newkey, newdata, naf); 22635 } break; 22636 case P_LEAF | P_LEAF2: { 22637 cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); 22638 cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); 22639 rc = node_add_leaf2(mc, 0, newkey); 22640 } break; 22641 default: 22642 rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); 22643 } 22644 if (unlikely(rc != MDBX_SUCCESS)) 22645 goto done; 22646 22647 if (pure_right) { 22648 for (i = 0; i < mc->mc_top; i++) 22649 mc->mc_ki[i] = mn.mc_ki[i]; 22650 } else if (mc->mc_ki[mc->mc_top - 1] == 0) { 22651 for (i = 2; i <= mc->mc_top; ++i) 22652 if (mc->mc_ki[mc->mc_top - i]) { 22653 get_key( 22654 page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), 22655 &sepkey); 22656 if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { 22657 mc->mc_top -= (uint8_t)i; 22658 DEBUG("update new-first on parent [%i] page %u key %s", 22659 mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, 
22660 DKEY(newkey)); 22661 rc = update_key(mc, newkey); 22662 mc->mc_top += (uint8_t)i; 22663 if (unlikely(rc != MDBX_SUCCESS)) 22664 goto done; 22665 } 22666 break; 22667 } 22668 } 22669 } else if (!IS_LEAF2(mp)) { 22670 /* Move nodes */ 22671 mc->mc_pg[mc->mc_top] = sister; 22672 i = split_indx; 22673 unsigned n = 0; 22674 do { 22675 TRACE("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, sister->mp_pgno); 22676 pgno_t pgno = 0; 22677 MDBX_val *rdata = NULL; 22678 if (i == newindx) { 22679 rkey = *newkey; 22680 if (IS_LEAF(mp)) 22681 rdata = newdata; 22682 else 22683 pgno = newpgno; 22684 flags = naf; 22685 /* Update index for the new key. */ 22686 mc->mc_ki[mc->mc_top] = (indx_t)n; 22687 } else { 22688 MDBX_node *node = 22689 (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); 22690 rkey.iov_base = node_key(node); 22691 rkey.iov_len = node_ks(node); 22692 if (IS_LEAF(mp)) { 22693 xdata.iov_base = node_data(node); 22694 xdata.iov_len = node_ds(node); 22695 rdata = &xdata; 22696 } else 22697 pgno = node_pgno(node); 22698 flags = node_flags(node); 22699 } 22700 22701 switch (PAGETYPE_WHOLE(sister)) { 22702 case P_BRANCH: { 22703 cASSERT(mc, 0 == (uint16_t)flags); 22704 /* First branch index doesn't need key data. */ 22705 rc = node_add_branch(mc, n, n ? &rkey : NULL, pgno); 22706 } break; 22707 case P_LEAF: { 22708 cASSERT(mc, pgno == 0); 22709 cASSERT(mc, rdata != NULL); 22710 rc = node_add_leaf(mc, n, &rkey, rdata, flags); 22711 } break; 22712 /* case P_LEAF | P_LEAF2: { 22713 cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); 22714 cASSERT(mc, gno == 0); 22715 rc = mdbx_node_add_leaf2(mc, n, &rkey); 22716 } break; */ 22717 default: 22718 rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); 22719 } 22720 if (unlikely(rc != MDBX_SUCCESS)) 22721 goto done; 22722 22723 ++n; 22724 if (++i > nkeys) { 22725 i = 0; 22726 n = 0; 22727 mc->mc_pg[mc->mc_top] = tmp_ki_copy; 22728 TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno); 22729 } 22730 } while (i != split_indx); 22731 22732 TRACE("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, 22733 mc->mc_pg[mc->mc_top]->mp_pgno); 22734 22735 nkeys = page_numkeys(tmp_ki_copy); 22736 for (i = 0; i < nkeys; i++) 22737 mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i]; 22738 mp->mp_lower = tmp_ki_copy->mp_lower; 22739 mp->mp_upper = tmp_ki_copy->mp_upper; 22740 memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1), 22741 env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ); 22742 22743 /* reset back to original page */ 22744 if (newindx < split_indx) { 22745 mc->mc_pg[mc->mc_top] = mp; 22746 } else { 22747 mc->mc_pg[mc->mc_top] = sister; 22748 mc->mc_ki[ptop]++; 22749 /* Make sure mc_ki is still valid. */ 22750 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 22751 mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { 22752 for (i = 0; i <= ptop; i++) { 22753 mc->mc_pg[i] = mn.mc_pg[i]; 22754 mc->mc_ki[i] = mn.mc_ki[i]; 22755 } 22756 } 22757 } 22758 } else if (newindx >= split_indx) { 22759 mc->mc_pg[mc->mc_top] = sister; 22760 mc->mc_ki[ptop]++; 22761 /* Make sure mc_ki is still valid. 
*/ 22762 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 22763 mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { 22764 for (i = 0; i <= ptop; i++) { 22765 mc->mc_pg[i] = mn.mc_pg[i]; 22766 mc->mc_ki[i] = mn.mc_ki[i]; 22767 } 22768 } 22769 } 22770 22771 /* Adjust other cursors pointing to mp and/or to parent page */ 22772 nkeys = page_numkeys(mp); 22773 for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; 22774 m2 = m2->mc_next) { 22775 MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 22776 if (m3 == mc) 22777 continue; 22778 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 22779 continue; 22780 if (foliage) { 22781 /* sub cursors may be on different DB */ 22782 if (m3->mc_pg[0] != mp) 22783 continue; 22784 /* root split */ 22785 for (int k = foliage; k >= 0; k--) { 22786 m3->mc_ki[k + 1] = m3->mc_ki[k]; 22787 m3->mc_pg[k + 1] = m3->mc_pg[k]; 22788 } 22789 m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; 22790 m3->mc_pg[0] = mc->mc_pg[0]; 22791 m3->mc_snum++; 22792 m3->mc_top++; 22793 } 22794 22795 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { 22796 if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE)) 22797 m3->mc_ki[mc->mc_top]++; 22798 if (m3->mc_ki[mc->mc_top] >= nkeys) { 22799 m3->mc_pg[mc->mc_top] = sister; 22800 cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys); 22801 m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; 22802 for (i = 0; i < mc->mc_top; i++) { 22803 m3->mc_ki[i] = mn.mc_ki[i]; 22804 m3->mc_pg[i] = mn.mc_pg[i]; 22805 } 22806 } 22807 } else if (!did_split_parent && m3->mc_top >= ptop && 22808 m3->mc_pg[ptop] == mc->mc_pg[ptop] && 22809 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { 22810 m3->mc_ki[ptop]++; /* also for the `pure-left` case */ 22811 } 22812 if (XCURSOR_INITED(m3) && IS_LEAF(mp)) 22813 XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); 22814 } 22815 TRACE("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), 22816 sister->mp_pgno, page_room(sister)); 22817 22818 done: 22819 if (tmp_ki_copy) 22820 dpage_free(env, tmp_ki_copy, 1); 22821 22822 if (unlikely(rc != MDBX_SUCCESS)) 22823 mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; 22824 else { 22825 if (AUDIT_ENABLED()) 22826 rc = cursor_check_updating(mc); 22827 if (unlikely(naf & MDBX_RESERVE)) { 22828 MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 22829 if (!(node_flags(node) & F_BIGDATA)) 22830 newdata->iov_base = node_data(node); 22831 } 22832 #if MDBX_ENABLE_PGOP_STAT 22833 env->me_lck->mti_pgop_stat.split.weak += 1; 22834 #endif /* MDBX_ENABLE_PGOP_STAT */ 22835 } 22836 22837 DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc); 22838 return rc; 22839 } 22840 22841 int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, 22842 unsigned flags) { 22843 int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); 22844 if (unlikely(rc != MDBX_SUCCESS)) 22845 return rc; 22846 22847 if (unlikely(!key || !data)) 22848 return MDBX_EINVAL; 22849 22850 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 22851 return MDBX_BAD_DBI; 22852 22853 if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | 22854 MDBX_RESERVE | MDBX_APPEND | 22855 MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) 22856 return MDBX_EINVAL; 22857 22858 if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) 22859 return (txn->mt_flags & MDBX_TXN_RDONLY) ?
MDBX_EACCESS : MDBX_BAD_TXN; 22860 22861 MDBX_cursor_couple cx; 22862 rc = cursor_init(&cx.outer, txn, dbi); 22863 if (unlikely(rc != MDBX_SUCCESS)) 22864 return rc; 22865 cx.outer.mc_next = txn->mt_cursors[dbi]; 22866 txn->mt_cursors[dbi] = &cx.outer; 22867 22868 /* LY: support for update (explicit overwrite) */ 22869 if (flags & MDBX_CURRENT) { 22870 rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET); 22871 if (likely(rc == MDBX_SUCCESS) && 22872 (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && 22873 (flags & MDBX_ALLDUPS) == 0) { 22874 /* LY: allows update (explicit overwrite) only for unique keys */ 22875 MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], 22876 cx.outer.mc_ki[cx.outer.mc_top]); 22877 if (node_flags(node) & F_DUPDATA) { 22878 tASSERT(txn, XCURSOR_INITED(&cx.outer) && 22879 cx.outer.mc_xcursor->mx_db.md_entries > 1); 22880 rc = MDBX_EMULTIVAL; 22881 } 22882 } 22883 } 22884 22885 if (likely(rc == MDBX_SUCCESS)) 22886 rc = mdbx_cursor_put(&cx.outer, key, data, flags); 22887 txn->mt_cursors[dbi] = cx.outer.mc_next; 22888 22889 return rc; 22890 } 22891 22892 /**** COPYING *****************************************************************/ 22893 22894 /* State needed for a double-buffering compacting copy. */ 22895 typedef struct mdbx_compacting_ctx { 22896 MDBX_env *mc_env; 22897 MDBX_txn *mc_txn; 22898 osal_condpair_t mc_condpair; 22899 uint8_t *mc_wbuf[2]; 22900 size_t mc_wlen[2]; 22901 mdbx_filehandle_t mc_fd; 22902 /* Error code. Never cleared if set. Both threads can set nonzero 22903 * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ 22904 volatile int mc_error; 22905 pgno_t mc_next_pgno; 22906 volatile unsigned mc_head; 22907 volatile unsigned mc_tail; 22908 } mdbx_compacting_ctx; 22909 22910 /* Dedicated writer thread for compacting copy. */ 22911 __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { 22912 mdbx_compacting_ctx *const ctx = arg; 22913 22914 #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) 22915 sigset_t sigset; 22916 sigemptyset(&sigset); 22917 sigaddset(&sigset, SIGPIPE); 22918 ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); 22919 #endif /* EPIPE */ 22920 22921 osal_condpair_lock(&ctx->mc_condpair); 22922 while (!ctx->mc_error) { 22923 while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { 22924 int err = osal_condpair_wait(&ctx->mc_condpair, true); 22925 if (err != MDBX_SUCCESS) { 22926 ctx->mc_error = err; 22927 goto bailout; 22928 } 22929 } 22930 const unsigned toggle = ctx->mc_tail & 1; 22931 size_t wsize = ctx->mc_wlen[toggle]; 22932 if (wsize == 0) { 22933 ctx->mc_tail += 1; 22934 break /* EOF */; 22935 } 22936 ctx->mc_wlen[toggle] = 0; 22937 uint8_t *ptr = ctx->mc_wbuf[toggle]; 22938 if (!ctx->mc_error) { 22939 int err = osal_write(ctx->mc_fd, ptr, wsize); 22940 if (err != MDBX_SUCCESS) { 22941 #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) 22942 if (err == EPIPE) { 22943 /* Collect the pending SIGPIPE, 22944 * otherwise at least OS X gives it to the process on thread-exit. */ 22945 int unused; 22946 sigwait(&sigset, &unused); 22947 } 22948 #endif /* EPIPE */ 22949 ctx->mc_error = err; 22950 goto bailout; 22951 } 22952 } 22953 ctx->mc_tail += 1; 22954 osal_condpair_signal(&ctx->mc_condpair, false); 22955 } 22956 bailout: 22957 osal_condpair_unlock(&ctx->mc_condpair); 22958 return (THREAD_RESULT)0; 22959 } 22960 22961 /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. 
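 *
 * The two buffers form a tiny ring: mc_head counts buffers handed to the
 * writer thread, mc_tail counts buffers it has consumed, so
 * (mc_head - mc_tail) is the number of buffers in flight and stays in the
 * range 0..2, while the parity (mc_head & 1, mc_tail & 1) selects which of
 * mc_wbuf[0]/mc_wbuf[1] each side touches next. Toggling with an empty
 * buffer (mc_wlen == 0) is the EOF signal that stops the writer loop.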
*/ 22962 __cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { 22963 osal_condpair_lock(&ctx->mc_condpair); 22964 eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); 22965 ctx->mc_head += 1; 22966 osal_condpair_signal(&ctx->mc_condpair, true); 22967 while (!ctx->mc_error && 22968 ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { 22969 int err = osal_condpair_wait(&ctx->mc_condpair, false); 22970 if (err != MDBX_SUCCESS) 22971 ctx->mc_error = err; 22972 } 22973 osal_condpair_unlock(&ctx->mc_condpair); 22974 return ctx->mc_error; 22975 } 22976 22977 __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); 22978 22979 static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, 22980 size_t bytes, pgno_t pgno, pgno_t npages) { 22981 assert(pgno == 0 || bytes > PAGEHDRSZ); 22982 while (bytes > 0) { 22983 const unsigned side = ctx->mc_head & 1; 22984 const size_t left = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; 22985 if (left < (pgno ? PAGEHDRSZ : 1)) { 22986 int err = compacting_toggle_write_buffers(ctx); 22987 if (unlikely(err != MDBX_SUCCESS)) 22988 return err; 22989 continue; 22990 } 22991 const size_t chunk = (bytes < left) ? bytes : left; 22992 void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side]; 22993 if (src) { 22994 memcpy(dst, src, chunk); 22995 if (pgno) { 22996 assert(chunk > PAGEHDRSZ); 22997 MDBX_page *mp = dst; 22998 mp->mp_pgno = pgno; 22999 if (mp->mp_txnid == 0) 23000 mp->mp_txnid = ctx->mc_txn->mt_txnid; 23001 if (mp->mp_flags == P_OVERFLOW) { 23002 assert(bytes <= pgno2bytes(ctx->mc_env, npages)); 23003 mp->mp_pages = npages; 23004 } 23005 pgno = 0; 23006 } 23007 src = (const char *)src + chunk; 23008 } else 23009 memset(dst, 0, chunk); 23010 bytes -= chunk; 23011 ctx->mc_wlen[side] += chunk; 23012 } 23013 return MDBX_SUCCESS; 23014 } 23015 23016 static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, 23017 const size_t head_bytes, const size_t tail_bytes, 23018 const pgno_t npages) { 23019 if (tail_bytes) { 23020 assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); 23021 assert(npages == 1 && 23022 (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF)); 23023 } else { 23024 assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); 23025 assert((npages == 1 && PAGETYPE_WHOLE(mp) == (P_LEAF | P_LEAF2)) || 23026 PAGETYPE_WHOLE(mp) == P_OVERFLOW); 23027 } 23028 23029 const pgno_t pgno = ctx->mc_next_pgno; 23030 ctx->mc_next_pgno += npages; 23031 int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); 23032 if (unlikely(err != MDBX_SUCCESS)) 23033 return err; 23034 err = compacting_put_bytes( 23035 ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes), 23036 0, 0); 23037 if (unlikely(err != MDBX_SUCCESS)) 23038 return err; 23039 return compacting_put_bytes( 23040 ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0, 23041 0); 23042 } 23043 23044 __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, 23045 MDBX_cursor *mc, pgno_t *root, 23046 txnid_t parent_txnid) { 23047 mc->mc_snum = 1; 23048 int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid); 23049 if (unlikely(rc != MDBX_SUCCESS)) 23050 return rc; 23051 23052 rc = page_search_root(mc, nullptr, MDBX_PS_FIRST); 23053 if (unlikely(rc != MDBX_SUCCESS)) 23054 return rc; 23055 23056 /* Make cursor pages writable */ 23057 char *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); 23058 if (buf == NULL) 23059 return 
MDBX_ENOMEM; 23060 23061 char *ptr = buf; 23062 for (unsigned i = 0; i < mc->mc_top; i++) { 23063 page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize); 23064 mc->mc_pg[i] = (MDBX_page *)ptr; 23065 ptr += ctx->mc_env->me_psize; 23066 } 23067 /* This is writable space for a leaf page. Usually not needed. */ 23068 MDBX_page *const leaf = (MDBX_page *)ptr; 23069 23070 while (mc->mc_snum > 0) { 23071 MDBX_page *mp = mc->mc_pg[mc->mc_top]; 23072 unsigned n = page_numkeys(mp); 23073 23074 if (IS_LEAF(mp)) { 23075 if (!(mc->mc_flags & 23076 C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { 23077 for (unsigned i = 0; i < n; i++) { 23078 MDBX_node *node = page_node(mp, i); 23079 if (node_flags(node) == F_BIGDATA) { 23080 /* Need writable leaf */ 23081 if (mp != leaf) { 23082 mc->mc_pg[mc->mc_top] = leaf; 23083 page_copy(leaf, mp, ctx->mc_env->me_psize); 23084 mp = leaf; 23085 node = page_node(mp, i); 23086 } 23087 23088 const pgr_t lp = 23089 page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); 23090 if (unlikely((rc = lp.err) != MDBX_SUCCESS)) 23091 goto done; 23092 const size_t datasize = node_ds(node); 23093 const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize); 23094 poke_pgno(node_data(node), ctx->mc_next_pgno); 23095 rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, 23096 npages); 23097 if (unlikely(rc != MDBX_SUCCESS)) 23098 goto done; 23099 } else if (node_flags(node) & F_SUBDATA) { 23100 if (!MDBX_DISABLE_VALIDATION && 23101 unlikely(node_ds(node) != sizeof(MDBX_db))) { 23102 rc = MDBX_CORRUPTED; 23103 goto done; 23104 } 23105 23106 /* Need writable leaf */ 23107 if (mp != leaf) { 23108 mc->mc_pg[mc->mc_top] = leaf; 23109 page_copy(leaf, mp, ctx->mc_env->me_psize); 23110 mp = leaf; 23111 node = page_node(mp, i); 23112 } 23113 23114 MDBX_db *nested = nullptr; 23115 if (node_flags(node) & F_DUPDATA) { 23116 rc = cursor_xinit1(mc, node, mp); 23117 if (likely(rc == MDBX_SUCCESS)) { 23118 nested = &mc->mc_xcursor->mx_db; 23119 rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, 23120 &nested->md_root, mp->mp_txnid); 23121 } 23122 } else { 23123 cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); 23124 MDBX_cursor_couple *couple = 23125 container_of(mc, MDBX_cursor_couple, outer); 23126 cASSERT(mc, 23127 couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE && 23128 !couple->inner.mx_cursor.mc_flags && 23129 !couple->inner.mx_cursor.mc_db && 23130 !couple->inner.mx_cursor.mc_dbx); 23131 nested = &couple->inner.mx_db; 23132 memcpy(nested, node_data(node), sizeof(MDBX_db)); 23133 rc = compacting_walk_sdb(ctx, nested); 23134 } 23135 if (unlikely(rc != MDBX_SUCCESS)) 23136 goto done; 23137 memcpy(node_data(node), nested, sizeof(MDBX_db)); 23138 } 23139 } 23140 } 23141 } else { 23142 mc->mc_ki[mc->mc_top]++; 23143 if (mc->mc_ki[mc->mc_top] < n) { 23144 while (1) { 23145 const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); 23146 rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); 23147 if (unlikely(rc != MDBX_SUCCESS)) 23148 goto done; 23149 mc->mc_top++; 23150 mc->mc_snum++; 23151 mc->mc_ki[mc->mc_top] = 0; 23152 if (!IS_BRANCH(mp)) { 23153 mc->mc_pg[mc->mc_top] = mp; 23154 break; 23155 } 23156 /* Whenever we advance to a sibling branch page, 23157 * we must proceed all the way down to its first leaf. 
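 * The page_copy below refreshes the private writable copy kept in the
 * cursor stack: pages are emitted bottom-up with new sequential page
 * numbers, and the parent's copy must stay mutable so node_set_pgno()
 * can rewrite the child's pgno once the child has been flushed.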
*/ 23158 page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); 23159 } 23160 continue; 23161 } 23162 } 23163 23164 const pgno_t pgno = ctx->mc_next_pgno; 23165 if (likely(!IS_LEAF2(mp))) { 23166 rc = compacting_put_page( 23167 ctx, mp, PAGEHDRSZ + mp->mp_lower, 23168 ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1); 23169 } else { 23170 rc = compacting_put_page( 23171 ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1); 23172 } 23173 if (unlikely(rc != MDBX_SUCCESS)) 23174 goto done; 23175 23176 if (mc->mc_top) { 23177 /* Update parent if there is one */ 23178 node_set_pgno( 23179 page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), 23180 pgno); 23181 cursor_pop(mc); 23182 } else { 23183 /* Otherwise we're done */ 23184 *root = pgno; 23185 break; 23186 } 23187 } 23188 done: 23189 osal_free(buf); 23190 return rc; 23191 } 23192 23193 __cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { 23194 if (unlikely(sdb->md_root == P_INVALID)) 23195 return MDBX_SUCCESS; /* empty db */ 23196 23197 MDBX_cursor_couple couple; 23198 memset(&couple, 0, sizeof(couple)); 23199 couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; 23200 MDBX_dbx dbx = {.md_klen_min = INT_MAX}; 23201 uint8_t dbistate = DBI_VALID | DBI_AUDITED; 23202 int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); 23203 if (unlikely(rc != MDBX_SUCCESS)) 23204 return rc; 23205 23206 couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; 23207 couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; 23208 if (!sdb->md_mod_txnid) 23209 sdb->md_mod_txnid = ctx->mc_txn->mt_txnid; 23210 return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, 23211 sdb->md_mod_txnid); 23212 } 23213 23214 __cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { 23215 eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || 23216 meta->mm_dbs[FREE_DBI].md_root == P_INVALID); 23217 eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || 23218 meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); 23219 23220 /* Calculate filesize taking into account shrink/growing thresholds */ 23221 if (meta->mm_geo.next != meta->mm_geo.now) { 23222 meta->mm_geo.now = meta->mm_geo.next; 23223 const pgno_t aligner = pv2pages( 23224 meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv); 23225 if (aligner) { 23226 const pgno_t aligned = pgno_align2os_pgno( 23227 env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner); 23228 meta->mm_geo.now = aligned; 23229 } 23230 } 23231 23232 if (meta->mm_geo.now < meta->mm_geo.lower) 23233 meta->mm_geo.now = meta->mm_geo.lower; 23234 if (meta->mm_geo.now > meta->mm_geo.upper) 23235 meta->mm_geo.now = meta->mm_geo.upper; 23236 23237 /* Update signature */ 23238 assert(meta->mm_geo.now >= meta->mm_geo.next); 23239 unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); 23240 } 23241 23242 /* Make resizeable */ 23243 __cold static void meta_make_sizeable(MDBX_meta *meta) { 23244 meta->mm_geo.lower = MIN_PAGENO; 23245 if (meta->mm_geo.grow_pv == 0) { 23246 const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42; 23247 meta->mm_geo.grow_pv = pages2pv(step); 23248 } 23249 if (meta->mm_geo.shrink_pv == 0) { 23250 const pgno_t step = pv2pages(meta->mm_geo.grow_pv) << 1; 23251 meta->mm_geo.shrink_pv = pages2pv(step); 23252 } 23253 } 23254 23255 /* Copy environment with compaction.
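 *
 * The snapshot of read_txn is walked depth-first; every page is renumbered
 * sequentially starting from NUM_METAS and streamed through the
 * double-buffered writer thread, so the destination contains neither gaps
 * nor GC records. From the public API this path is taken for example by
 * an (illustrative) call such as
 *
 *   rc = mdbx_env_copy(env, "/tmp/compacted.mdbx", MDBX_CP_COMPACT);
 *
 * see mdbx_env_copy() and mdbx_env_copy2fd() below.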
*/ 23256 __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, 23257 mdbx_filehandle_t fd, uint8_t *buffer, 23258 const bool dest_is_pipe, const int flags) { 23259 const size_t meta_bytes = pgno2bytes(env, NUM_METAS); 23260 uint8_t *const data_buffer = 23261 buffer + ceil_powerof2(meta_bytes, env->me_os_psize); 23262 MDBX_meta *const meta = init_metas(env, buffer); 23263 meta_set_txnid(env, meta, read_txn->mt_txnid); 23264 23265 if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) 23266 meta_make_sizeable(meta); 23267 23268 /* copy canary sequences if present */ 23269 if (read_txn->mt_canary.v) { 23270 meta->mm_canary = read_txn->mt_canary; 23271 meta->mm_canary.v = constmeta_txnid(meta); 23272 } 23273 23274 if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { 23275 /* When the DB is empty, handle it specially to 23276 * fix any breakage like page leaks from ITS#8174. */ 23277 meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; 23278 compacting_fixup_meta(env, meta); 23279 if (dest_is_pipe) { 23280 int rc = osal_write(fd, buffer, meta_bytes); 23281 if (unlikely(rc != MDBX_SUCCESS)) 23282 return rc; 23283 } 23284 } else { 23285 /* Count free pages + GC pages. */ 23286 MDBX_cursor_couple couple; 23287 int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); 23288 if (unlikely(rc != MDBX_SUCCESS)) 23289 return rc; 23290 pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + 23291 read_txn->mt_dbs[FREE_DBI].md_leaf_pages + 23292 read_txn->mt_dbs[FREE_DBI].md_overflow_pages; 23293 MDBX_val key, data; 23294 while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 23295 MDBX_SUCCESS) { 23296 const MDBX_PNL pnl = data.iov_base; 23297 if (unlikely(data.iov_len % sizeof(pgno_t) || 23298 data.iov_len < MDBX_PNL_SIZEOF(pnl) || 23299 !(pnl_check(pnl, read_txn->mt_next_pgno)))) 23300 return MDBX_CORRUPTED; 23301 gc += MDBX_PNL_SIZE(pnl); 23302 } 23303 if (unlikely(rc != MDBX_NOTFOUND)) 23304 return rc; 23305 23306 /* Subtract GC-pages from mt_next_pgno to find the new mt_next_pgno.
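 * Here `gc` counts both the pages of the GC/FREE_DBI tree itself
 * (branch + leaf + overflow) and every page number recorded in its PNL
 * values. E.g. with illustrative numbers: if mt_next_pgno is 1000 and
 * gc totals 250, the compacted copy needs only 750 pages.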
*/ 23307 meta->mm_geo.next = read_txn->mt_next_pgno - gc; 23308 /* Set with current main DB */ 23309 meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; 23310 23311 mdbx_compacting_ctx ctx; 23312 memset(&ctx, 0, sizeof(ctx)); 23313 rc = osal_condpair_init(&ctx.mc_condpair); 23314 if (unlikely(rc != MDBX_SUCCESS)) 23315 return rc; 23316 23317 memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); 23318 ctx.mc_wbuf[0] = data_buffer; 23319 ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; 23320 ctx.mc_next_pgno = NUM_METAS; 23321 ctx.mc_env = env; 23322 ctx.mc_fd = fd; 23323 ctx.mc_txn = read_txn; 23324 23325 osal_thread_t thread; 23326 int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); 23327 if (likely(thread_err == MDBX_SUCCESS)) { 23328 if (dest_is_pipe) { 23329 if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) 23330 meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; 23331 compacting_fixup_meta(env, meta); 23332 rc = osal_write(fd, buffer, meta_bytes); 23333 } 23334 if (likely(rc == MDBX_SUCCESS)) 23335 rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); 23336 if (ctx.mc_wlen[ctx.mc_head & 1]) 23337 /* toggle to flush non-empty buffers */ 23338 compacting_toggle_write_buffers(&ctx); 23339 23340 if (likely(rc == MDBX_SUCCESS) && 23341 unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { 23342 if (ctx.mc_next_pgno > meta->mm_geo.next) { 23343 ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO 23344 " %c expected %" PRIaPGNO, 23345 "has double-used pages or other corruption", ctx.mc_next_pgno, 23346 '>', meta->mm_geo.next); 23347 rc = MDBX_CORRUPTED; /* corrupted DB */ 23348 } 23349 if (ctx.mc_next_pgno < meta->mm_geo.next) { 23350 WARNING( 23351 "the source DB %s: post-compactification used pages %" PRIaPGNO 23352 " %c expected %" PRIaPGNO, 23353 "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next); 23354 if (dest_is_pipe) 23355 /* the root within already written meta-pages is wrong */ 23356 rc = MDBX_CORRUPTED; 23357 } 23358 /* fixup meta */ 23359 meta->mm_geo.next = ctx.mc_next_pgno; 23360 } 23361 23362 /* toggle with empty buffers to exit thread's loop */ 23363 eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); 23364 compacting_toggle_write_buffers(&ctx); 23365 thread_err = osal_thread_join(thread); 23366 eASSERT(env, (ctx.mc_tail == ctx.mc_head && 23367 ctx.mc_wlen[ctx.mc_head & 1] == 0) || 23368 ctx.mc_error); 23369 osal_condpair_destroy(&ctx.mc_condpair); 23370 } 23371 if (unlikely(thread_err != MDBX_SUCCESS)) 23372 return thread_err; 23373 if (unlikely(rc != MDBX_SUCCESS)) 23374 return rc; 23375 if (unlikely(ctx.mc_error != MDBX_SUCCESS)) 23376 return ctx.mc_error; 23377 if (!dest_is_pipe) 23378 compacting_fixup_meta(env, meta); 23379 } 23380 23381 /* Extend file if required */ 23382 if (meta->mm_geo.now != meta->mm_geo.next) { 23383 const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); 23384 if (!dest_is_pipe) 23385 return osal_ftruncate(fd, whole_size); 23386 23387 const size_t used_size = pgno2bytes(env, meta->mm_geo.next); 23388 memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); 23389 for (size_t offset = used_size; offset < whole_size;) { 23390 const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) 23391 ? 
(size_t)MDBX_ENVCOPY_WRITEBUF 23392 : whole_size - offset; 23393 /* copy to avoid EFAULT in case swapped-out */ 23394 int rc = osal_write(fd, data_buffer, chunk); 23395 if (unlikely(rc != MDBX_SUCCESS)) 23396 return rc; 23397 offset += chunk; 23398 } 23399 } 23400 return MDBX_SUCCESS; 23401 } 23402 23403 /* Copy environment as-is. */ 23404 __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, 23405 mdbx_filehandle_t fd, uint8_t *buffer, 23406 const bool dest_is_pipe, const int flags) { 23407 /* We must start the actual read txn after blocking writers */ 23408 int rc = txn_end(read_txn, MDBX_END_RESET_TMP); 23409 if (unlikely(rc != MDBX_SUCCESS)) 23410 return rc; 23411 23412 /* Temporarily block writers until we snapshot the meta pages */ 23413 rc = mdbx_txn_lock(env, false); 23414 if (unlikely(rc != MDBX_SUCCESS)) 23415 return rc; 23416 23417 rc = txn_renew(read_txn, MDBX_TXN_RDONLY); 23418 if (unlikely(rc != MDBX_SUCCESS)) { 23419 mdbx_txn_unlock(env); 23420 return rc; 23421 } 23422 23423 jitter4testing(false); 23424 const size_t meta_bytes = pgno2bytes(env, NUM_METAS); 23425 const meta_troika_t troika = meta_tap(env); 23426 /* Make a snapshot of meta-pages, 23427 * but writing ones after the data was flushed */ 23428 memcpy(buffer, env->me_map, meta_bytes); 23429 MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ 23430 (MDBX_meta *)(buffer + 23431 ((uint8_t *)meta_recent(env, &troika).ptr_c - env->me_map)); 23432 mdbx_txn_unlock(env); 23433 23434 if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) 23435 meta_make_sizeable(headcopy); 23436 /* Update signature to steady */ 23437 unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy)); 23438 23439 /* Copy the data */ 23440 const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); 23441 const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); 23442 jitter4testing(false); 23443 23444 if (dest_is_pipe) 23445 rc = osal_write(fd, buffer, meta_bytes); 23446 23447 uint8_t *const data_buffer = 23448 buffer + ceil_powerof2(meta_bytes, env->me_os_psize); 23449 #if MDBX_USE_COPYFILERANGE 23450 static bool copyfilerange_unavailable; 23451 bool not_the_same_filesystem = false; 23452 #endif /* MDBX_USE_COPYFILERANGE */ 23453 for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { 23454 #if MDBX_USE_SENDFILE 23455 static bool sendfile_unavailable; 23456 if (dest_is_pipe && likely(!sendfile_unavailable)) { 23457 off_t in_offset = offset; 23458 const ssize_t written = 23459 sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset); 23460 if (likely(written > 0)) { 23461 offset = in_offset; 23462 continue; 23463 } 23464 rc = MDBX_ENODATA; 23465 if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) 23466 break; 23467 sendfile_unavailable = true; 23468 } 23469 #endif /* MDBX_USE_SENDFILE */ 23470 23471 #if MDBX_USE_COPYFILERANGE 23472 if (!dest_is_pipe && !not_the_same_filesystem && 23473 likely(!copyfilerange_unavailable)) { 23474 off_t in_offset = offset, out_offset = offset; 23475 ssize_t bytes_copied = copy_file_range( 23476 env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); 23477 if (likely(bytes_copied > 0)) { 23478 offset = in_offset; 23479 continue; 23480 } 23481 rc = MDBX_ENODATA; 23482 if (bytes_copied == 0) 23483 break; 23484 rc = errno; 23485 if (rc == EXDEV) 23486 not_the_same_filesystem = true; 23487 else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) 23488 copyfilerange_unavailable = true; 23489 else 23490 break; 23491 } 23492 #endif /* 
MDBX_USE_COPYFILERANGE */ 23493 23494 /* fallback to portable */ 23495 const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) 23496 ? (size_t)MDBX_ENVCOPY_WRITEBUF 23497 : used_size - offset; 23498 /* copy to avoid EFAULT in case swapped-out */ 23499 memcpy(data_buffer, env->me_map + offset, chunk); 23500 rc = osal_write(fd, data_buffer, chunk); 23501 offset += chunk; 23502 } 23503 23504 /* Extend file if required */ 23505 if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { 23506 if (!dest_is_pipe) 23507 rc = osal_ftruncate(fd, whole_size); 23508 else { 23509 memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); 23510 for (size_t offset = used_size; 23511 rc == MDBX_SUCCESS && offset < whole_size;) { 23512 const size_t chunk = 23513 ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) 23514 ? (size_t)MDBX_ENVCOPY_WRITEBUF 23515 : whole_size - offset; 23516 /* copy to avoid EFAULT in case swapped-out */ 23517 rc = osal_write(fd, data_buffer, chunk); 23518 offset += chunk; 23519 } 23520 } 23521 } 23522 23523 return rc; 23524 } 23525 23526 __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, 23527 unsigned flags) { 23528 int rc = check_env(env, true); 23529 if (unlikely(rc != MDBX_SUCCESS)) 23530 return rc; 23531 23532 const int dest_is_pipe = osal_is_pipe(fd); 23533 if (MDBX_IS_ERROR(dest_is_pipe)) 23534 return dest_is_pipe; 23535 23536 if (!dest_is_pipe) { 23537 rc = osal_fseek(fd, 0); 23538 if (unlikely(rc != MDBX_SUCCESS)) 23539 return rc; 23540 } 23541 23542 const size_t buffer_size = 23543 pgno_align2os_bytes(env, NUM_METAS) + 23544 ceil_powerof2(((flags & MDBX_CP_COMPACT) 23545 ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF 23546 : (size_t)MDBX_ENVCOPY_WRITEBUF), 23547 env->me_os_psize); 23548 23549 uint8_t *buffer = NULL; 23550 rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); 23551 if (unlikely(rc != MDBX_SUCCESS)) 23552 return rc; 23553 23554 MDBX_txn *read_txn = NULL; 23555 /* Do the lock/unlock of the reader mutex before starting the 23556 * write txn. Otherwise other read txns could block writers. */ 23557 rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn); 23558 if (unlikely(rc != MDBX_SUCCESS)) { 23559 osal_memalign_free(buffer); 23560 return rc; 23561 } 23562 23563 if (!dest_is_pipe) { 23564 /* First, write a stub into the meta-pages, 23565 * so we are sure an incomplete copy will never be usable. */ 23566 memset(buffer, -1, pgno2bytes(env, NUM_METAS)); 23567 rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); 23568 } 23569 23570 if (likely(rc == MDBX_SUCCESS)) { 23571 memset(buffer, 0, pgno2bytes(env, NUM_METAS)); 23572 rc = ((flags & MDBX_CP_COMPACT) ?
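/* Dispatch: with MDBX_CP_COMPACT the copy is rebuilt page-by-page
 * (renumbering pages and dropping freed ones), otherwise the snapshot is
 * streamed as-is. A minimal backup sketch using only the public API
 * (the make_backup() wrapper is hypothetical, not part of libmdbx):
 *
 *   static int make_backup(MDBX_env *env, const char *path) {
 *     return mdbx_env_copy(env, path, MDBX_CP_COMPACT);
 *   }
 */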
env_compact : env_copy_asis)( 23573 env, read_txn, fd, buffer, dest_is_pipe, flags); 23574 } 23575 mdbx_txn_abort(read_txn); 23576 23577 if (!dest_is_pipe) { 23578 if (likely(rc == MDBX_SUCCESS)) 23579 rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); 23580 23581 /* Write actual meta */ 23582 if (likely(rc == MDBX_SUCCESS)) 23583 rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); 23584 23585 if (likely(rc == MDBX_SUCCESS)) 23586 rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); 23587 } 23588 23589 osal_memalign_free(buffer); 23590 return rc; 23591 } 23592 23593 __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, 23594 MDBX_copy_flags_t flags) { 23595 #if defined(_WIN32) || defined(_WIN64) 23596 const wchar_t *dest_pathW = nullptr; 23597 OSAL_MB2WIDE(dest_path, dest_pathW); 23598 return mdbx_env_copyW(env, dest_pathW, flags); 23599 } 23600 23601 LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, 23602 MDBX_copy_flags_t flags) { 23603 #endif /* Windows */ 23604 23605 int rc = check_env(env, true); 23606 if (unlikely(rc != MDBX_SUCCESS)) 23607 return rc; 23608 23609 if (unlikely(!dest_path)) 23610 return MDBX_EINVAL; 23611 23612 /* The destination path must exist, but the destination file must not. 23613 * We don't want the OS to cache the writes, since the source data is 23614 * already in the OS cache. */ 23615 mdbx_filehandle_t newfd; 23616 rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, 23617 #if defined(_WIN32) || defined(_WIN64) 23618 (mdbx_mode_t)-1 23619 #else 23620 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP 23621 #endif 23622 ); 23623 23624 if (rc == MDBX_SUCCESS) { 23625 #if defined(_WIN32) || defined(_WIN64) 23626 OVERLAPPED ov; 23627 memset(&ov, 0, sizeof(ov)); 23628 if (!LockFileEx(newfd, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 23629 0, 0, INT32_MAX, &ov)) 23630 rc = GetLastError(); 23631 #else 23632 struct flock lock_op; 23633 memset(&lock_op, 0, sizeof(lock_op)); 23634 lock_op.l_type = F_WRLCK; 23635 lock_op.l_whence = SEEK_SET; 23636 lock_op.l_start = 0; 23637 lock_op.l_len = 23638 (sizeof(lock_op.l_len) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff; 23639 if (fcntl(newfd, F_SETLK, &lock_op) 23640 #if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ 23641 (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) 23642 || flock(newfd, LOCK_EX | LOCK_NB) 23643 #endif /* Linux */ 23644 ) 23645 rc = errno; 23646 #endif /* Windows / POSIX */ 23647 } 23648 23649 if (rc == MDBX_SUCCESS) 23650 rc = mdbx_env_copy2fd(env, newfd, flags); 23651 23652 if (newfd != INVALID_HANDLE_VALUE) { 23653 int err = osal_closefile(newfd); 23654 if (rc == MDBX_SUCCESS && err != rc) 23655 rc = err; 23656 if (rc != MDBX_SUCCESS) 23657 (void)osal_removefile(dest_path); 23658 } 23659 23660 return rc; 23661 } 23662 23663 /******************************************************************************/ 23664 23665 __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, 23666 bool onoff) { 23667 int rc = check_env(env, false); 23668 if (unlikely(rc != MDBX_SUCCESS)) 23669 return rc; 23670 23671 if (unlikely(flags & 23672 ((env->me_flags & MDBX_ENV_ACTIVE) ? 
~ENV_CHANGEABLE_FLAGS 23673 : ~ENV_USABLE_FLAGS))) 23674 return MDBX_EPERM; 23675 23676 if (unlikely(env->me_flags & MDBX_RDONLY)) 23677 return MDBX_EACCESS; 23678 23679 if ((env->me_flags & MDBX_ENV_ACTIVE) && 23680 unlikely(env->me_txn0->mt_owner == osal_thread_self())) 23681 return MDBX_BUSY; 23682 23683 const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && 23684 env->me_txn0->mt_owner != osal_thread_self(); 23685 bool should_unlock = false; 23686 if (lock_needed) { 23687 rc = mdbx_txn_lock(env, false); 23688 if (unlikely(rc)) 23689 return rc; 23690 should_unlock = true; 23691 } 23692 23693 if (onoff) 23694 env->me_flags = merge_sync_flags(env->me_flags, flags); 23695 else 23696 env->me_flags &= ~flags; 23697 23698 if (should_unlock) 23699 mdbx_txn_unlock(env); 23700 return MDBX_SUCCESS; 23701 } 23702 23703 __cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { 23704 int rc = check_env(env, false); 23705 if (unlikely(rc != MDBX_SUCCESS)) 23706 return rc; 23707 23708 if (unlikely(!arg)) 23709 return MDBX_EINVAL; 23710 23711 *arg = env->me_flags & ENV_USABLE_FLAGS; 23712 return MDBX_SUCCESS; 23713 } 23714 23715 __cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { 23716 int rc = check_env(env, false); 23717 if (unlikely(rc != MDBX_SUCCESS)) 23718 return rc; 23719 23720 env->me_userctx = ctx; 23721 return MDBX_SUCCESS; 23722 } 23723 23724 __cold void *mdbx_env_get_userctx(const MDBX_env *env) { 23725 return env ? env->me_userctx : NULL; 23726 } 23727 23728 __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { 23729 int rc = check_env(env, false); 23730 if (unlikely(rc != MDBX_SUCCESS)) 23731 return rc; 23732 23733 #if MDBX_DEBUG 23734 env->me_assert_func = func; 23735 return MDBX_SUCCESS; 23736 #else 23737 (void)func; 23738 return MDBX_ENOSYS; 23739 #endif 23740 } 23741 23742 #if !(defined(_WIN32) || defined(_WIN64)) 23743 __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { 23744 int rc = check_env(env, true); 23745 if (unlikely(rc != MDBX_SUCCESS)) 23746 return rc; 23747 23748 if (unlikely(!arg)) 23749 return MDBX_EINVAL; 23750 23751 *arg = env->me_pathname; 23752 return MDBX_SUCCESS; 23753 } 23754 #else 23755 __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { 23756 int rc = check_env(env, true); 23757 if (unlikely(rc != MDBX_SUCCESS)) 23758 return rc; 23759 23760 if (unlikely(!arg)) 23761 return MDBX_EINVAL; 23762 23763 *arg = env->me_pathname; 23764 return MDBX_SUCCESS; 23765 } 23766 #endif /* Windows */ 23767 23768 __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { 23769 int rc = check_env(env, true); 23770 if (unlikely(rc != MDBX_SUCCESS)) 23771 return rc; 23772 23773 if (unlikely(!arg)) 23774 return MDBX_EINVAL; 23775 23776 *arg = env->me_lazy_fd; 23777 return MDBX_SUCCESS; 23778 } 23779 23780 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 23781 __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) { 23782 return __inline_mdbx_env_stat(env, stat, bytes); 23783 } 23784 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 23785 23786 static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) { 23787 st->ms_depth = db->md_depth; 23788 st->ms_branch_pages = db->md_branch_pages; 23789 st->ms_leaf_pages = db->md_leaf_pages; 23790 st->ms_overflow_pages = db->md_overflow_pages; 23791 st->ms_entries = db->md_entries; 23792 if (likely(bytes >= 23793 offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) 23794 st->ms_mod_txnid = db->md_mod_txnid; 23795 } 23796 23797 
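/* The `bytes` argument versions the MDBX_stat ABI: legacy callers pass
 * offsetof(MDBX_stat, ms_mod_txnid), newer ones pass sizeof(MDBX_stat),
 * and stat_get()/stat_add() fill only as much as the caller has room for.
 * A minimal caller sketch (assuming an open env and txn):
 *
 *   MDBX_stat st;
 *   int err = mdbx_env_stat_ex(env, txn, &st, sizeof(st));
 *   if (err == MDBX_SUCCESS)
 *     printf("depth %u, entries %" PRIu64 "\n", st.ms_depth,
 *            st.ms_entries);
 */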
static void stat_add(const MDBX_db *db, MDBX_stat *const st, 23798 const size_t bytes) { 23799 st->ms_depth += db->md_depth; 23800 st->ms_branch_pages += db->md_branch_pages; 23801 st->ms_leaf_pages += db->md_leaf_pages; 23802 st->ms_overflow_pages += db->md_overflow_pages; 23803 st->ms_entries += db->md_entries; 23804 if (likely(bytes >= 23805 offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) 23806 st->ms_mod_txnid = (st->ms_mod_txnid > db->md_mod_txnid) ? st->ms_mod_txnid 23807 : db->md_mod_txnid; 23808 } 23809 23810 __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { 23811 int err = check_txn(txn, MDBX_TXN_BLOCKED); 23812 if (unlikely(err != MDBX_SUCCESS)) 23813 return err; 23814 23815 st->ms_psize = txn->mt_env->me_psize; 23816 #if 1 23817 /* assuming GC is internal and not subject for accounting */ 23818 stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes); 23819 #else 23820 stat_get(&txn->mt_dbs[FREE_DBI], st, bytes); 23821 stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes); 23822 #endif 23823 23824 /* account opened named subDBs */ 23825 for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) 23826 if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) 23827 stat_add(txn->mt_dbs + dbi, st, bytes); 23828 23829 if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && 23830 txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { 23831 MDBX_cursor_couple cx; 23832 err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); 23833 if (unlikely(err != MDBX_SUCCESS)) 23834 return err; 23835 23836 /* scan and account not opened named subDBs */ 23837 err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); 23838 while (err == MDBX_SUCCESS) { 23839 const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; 23840 for (unsigned i = 0; i < page_numkeys(mp); i++) { 23841 const MDBX_node *node = page_node(mp, i); 23842 if (node_flags(node) != F_SUBDATA) 23843 continue; 23844 if (unlikely(node_ds(node) != sizeof(MDBX_db))) 23845 return MDBX_CORRUPTED; 23846 23847 /* skip opened and already accounted */ 23848 for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) 23849 if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && 23850 node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len && 23851 memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base, 23852 node_ks(node)) == 0) { 23853 node = NULL; 23854 break; 23855 } 23856 23857 if (node) { 23858 MDBX_db db; 23859 memcpy(&db, node_data(node), sizeof(db)); 23860 stat_add(&db, st, bytes); 23861 } 23862 } 23863 err = cursor_sibling(&cx.outer, SIBLING_RIGHT); 23864 } 23865 if (unlikely(err != MDBX_NOTFOUND)) 23866 return err; 23867 } 23868 23869 return MDBX_SUCCESS; 23870 } 23871 23872 __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, 23873 MDBX_stat *dest, size_t bytes) { 23874 if (unlikely(!dest)) 23875 return MDBX_EINVAL; 23876 const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); 23877 if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) 23878 return MDBX_EINVAL; 23879 23880 if (likely(txn)) { 23881 if (env && unlikely(txn->mt_env != env)) 23882 return MDBX_EINVAL; 23883 return stat_acc(txn, dest, bytes); 23884 } 23885 23886 int err = check_env(env, true); 23887 if (unlikely(err != MDBX_SUCCESS)) 23888 return err; 23889 23890 if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) 23891 /* inside write-txn */ 23892 return stat_acc(env->me_txn, dest, bytes); 23893 23894 MDBX_txn *tmp_txn; 23895 err = 
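/* no suitable txn was given and we are not inside the write txn:
 * sample via a throwaway read-only txn so the accounting below
 * sees one consistent snapshot */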
mdbx_txn_begin((MDBX_env *)env, NULL, MDBX_TXN_RDONLY, &tmp_txn); 23896 if (unlikely(err != MDBX_SUCCESS)) 23897 return err; 23898 23899 const int rc = stat_acc(tmp_txn, dest, bytes); 23900 err = mdbx_txn_abort(tmp_txn); 23901 if (unlikely(err != MDBX_SUCCESS)) 23902 return err; 23903 return rc; 23904 } 23905 23906 __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, 23907 uint32_t *mask) { 23908 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 23909 if (unlikely(rc != MDBX_SUCCESS)) 23910 return rc; 23911 23912 if (unlikely(!mask)) 23913 return MDBX_EINVAL; 23914 23915 if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) 23916 return MDBX_BAD_DBI; 23917 23918 MDBX_cursor_couple cx; 23919 rc = cursor_init(&cx.outer, txn, dbi); 23920 if (unlikely(rc != MDBX_SUCCESS)) 23921 return rc; 23922 if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) 23923 return MDBX_RESULT_TRUE; 23924 23925 MDBX_val key, data; 23926 rc = cursor_first(&cx.outer, &key, &data); 23927 *mask = 0; 23928 while (rc == MDBX_SUCCESS) { 23929 const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], 23930 cx.outer.mc_ki[cx.outer.mc_top]); 23931 const MDBX_db *db = node_data(node); 23932 const unsigned flags = node_flags(node); 23933 switch (flags) { 23934 case F_BIGDATA: 23935 case 0: 23936 /* single-value entry, deep = 0 */ 23937 *mask |= 1 << 0; 23938 break; 23939 case F_DUPDATA: 23940 /* single sub-page, deep = 1 */ 23941 *mask |= 1 << 1; 23942 break; 23943 case F_DUPDATA | F_SUBDATA: 23944 /* sub-tree */ 23945 *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); 23946 break; 23947 default: 23948 ERROR("wrong node-flags %u", flags); 23949 return MDBX_CORRUPTED; 23950 } 23951 rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); 23952 } 23953 23954 return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; 23955 } 23956 23957 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 23958 __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, 23959 size_t bytes) { 23960 return __inline_mdbx_env_info(env, info, bytes); 23961 } 23962 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 23963 23964 __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, 23965 MDBX_envinfo *arg, const size_t bytes) { 23966 23967 const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); 23968 const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); 23969 23970 /* is the environment open? 
23971 * (todo4recovery://erased_by_github/libmdbx/issues/171) */ 23972 if (unlikely(!env->me_map)) { 23973 /* environment not yet opened */ 23974 #if 1 23975 /* default behavior: returns the available info but zeroed the rest */ 23976 memset(arg, 0, bytes); 23977 arg->mi_geo.lower = env->me_dbgeo.lower; 23978 arg->mi_geo.upper = env->me_dbgeo.upper; 23979 arg->mi_geo.shrink = env->me_dbgeo.shrink; 23980 arg->mi_geo.grow = env->me_dbgeo.grow; 23981 arg->mi_geo.current = env->me_dbgeo.now; 23982 arg->mi_maxreaders = env->me_maxreaders; 23983 arg->mi_dxb_pagesize = env->me_psize; 23984 arg->mi_sys_pagesize = env->me_os_psize; 23985 if (likely(bytes > size_before_bootid)) { 23986 arg->mi_bootid.current.x = bootid.x; 23987 arg->mi_bootid.current.y = bootid.y; 23988 } 23989 return MDBX_SUCCESS; 23990 #else 23991 /* some users may prefer this behavior: return appropriate error */ 23992 return MDBX_EPERM; 23993 #endif 23994 } 23995 23996 const MDBX_meta *const meta0 = METAPAGE(env, 0); 23997 const MDBX_meta *const meta1 = METAPAGE(env, 1); 23998 const MDBX_meta *const meta2 = METAPAGE(env, 2); 23999 if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) 24000 return MDBX_PANIC; 24001 24002 meta_troika_t holder; 24003 meta_troika_t const *troika; 24004 if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) 24005 troika = &txn->tw.troika; 24006 else { 24007 holder = meta_tap(env); 24008 troika = &holder; 24009 } 24010 24011 const meta_ptr_t head = meta_recent(env, troika); 24012 arg->mi_recent_txnid = head.txnid; 24013 arg->mi_meta0_txnid = troika->txnid[0]; 24014 arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); 24015 arg->mi_meta1_txnid = troika->txnid[1]; 24016 arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); 24017 arg->mi_meta2_txnid = troika->txnid[2]; 24018 arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); 24019 if (likely(bytes > size_before_bootid)) { 24020 memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); 24021 memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); 24022 memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); 24023 } 24024 24025 const volatile MDBX_meta *txn_meta = head.ptr_v; 24026 arg->mi_last_pgno = txn_meta->mm_geo.next - 1; 24027 arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); 24028 if (txn) { 24029 arg->mi_last_pgno = txn->mt_next_pgno - 1; 24030 arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); 24031 24032 const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY) 24033 ? txn->mt_txnid 24034 : txn->mt_txnid - xMDBX_TXNID_STEP; 24035 txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; 24036 txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; 24037 txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta; 24038 } 24039 arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); 24040 arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); 24041 arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); 24042 arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); 24043 const pgno_t unsynced_pages = 24044 atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + 24045 (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != 24046 (uint32_t)arg->mi_recent_txnid); 24047 24048 arg->mi_mapsize = env->me_dxb_mmap.limit; 24049 24050 const MDBX_lockinfo *const lck = env->me_lck; 24051 arg->mi_maxreaders = env->me_maxreaders; 24052 arg->mi_numreaders = env->me_lck_mmap.lck 24053 ? 
atomic_load32(&lck->mti_numreaders, mo_Relaxed) 24054 : INT32_MAX; 24055 arg->mi_dxb_pagesize = env->me_psize; 24056 arg->mi_sys_pagesize = env->me_os_psize; 24057 24058 if (likely(bytes > size_before_bootid)) { 24059 arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); 24060 const uint64_t monotime_now = osal_monotime(); 24061 uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); 24062 arg->mi_since_sync_seconds16dot16 = 24063 ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; 24064 ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); 24065 arg->mi_since_reader_check_seconds16dot16 = 24066 ts ? osal_monotime_to_16dot16(monotime_now - ts) : 0; 24067 arg->mi_autosync_threshold = pgno2bytes( 24068 env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); 24069 arg->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16( 24070 atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); 24071 arg->mi_bootid.current.x = bootid.x; 24072 arg->mi_bootid.current.y = bootid.y; 24073 arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags; 24074 } 24075 24076 if (likely(bytes > size_before_pgop_stat)) { 24077 #if MDBX_ENABLE_PGOP_STAT 24078 arg->mi_pgop_stat.newly = 24079 atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed); 24080 arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); 24081 arg->mi_pgop_stat.clone = 24082 atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed); 24083 arg->mi_pgop_stat.split = 24084 atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed); 24085 arg->mi_pgop_stat.merge = 24086 atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed); 24087 arg->mi_pgop_stat.spill = 24088 atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed); 24089 arg->mi_pgop_stat.unspill = 24090 atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); 24091 arg->mi_pgop_stat.wops = 24092 atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); 24093 arg->mi_pgop_stat.gcrtime_seconds16dot16 = osal_monotime_to_16dot16( 24094 atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed)); 24095 #else 24096 memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); 24097 #endif /* MDBX_ENABLE_PGOP_STAT*/ 24098 } 24099 24100 arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = 24101 arg->mi_recent_txnid; 24102 if (env->me_lck_mmap.lck) { 24103 for (unsigned i = 0; i < arg->mi_numreaders; ++i) { 24104 const uint32_t pid = 24105 atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); 24106 if (pid) { 24107 const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); 24108 if (arg->mi_latter_reader_txnid > txnid) 24109 arg->mi_latter_reader_txnid = txnid; 24110 if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid) 24111 arg->mi_self_latter_reader_txnid = txnid; 24112 } 24113 } 24114 } 24115 24116 osal_compiler_barrier(); 24117 return MDBX_SUCCESS; 24118 } 24119 24120 __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, 24121 MDBX_envinfo *arg, size_t bytes) { 24122 if (unlikely((env == NULL && txn == NULL) || arg == NULL)) 24123 return MDBX_EINVAL; 24124 24125 if (txn) { 24126 int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); 24127 if (unlikely(err != MDBX_SUCCESS)) 24128 return err; 24129 } 24130 if (env) { 24131 int err = check_env(env, false); 24132 if (unlikely(err != MDBX_SUCCESS)) 24133 return err; 24134 if (txn && unlikely(txn->mt_env != env)) 24135 return MDBX_EINVAL; 24136 } else { 24137 env = txn->mt_env; 24138 } 24139 24140 const size_t size_before_bootid = 
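/* Three MDBX_envinfo ABI sizes are accepted below: the full struct and
 * the two historical prefixes ending before mi_bootid and mi_pgop_stat.
 * A minimal caller sketch:
 *
 *   MDBX_envinfo info;
 *   int err = mdbx_env_info_ex(env, NULL, &info, sizeof(info));
 *   if (err == MDBX_SUCCESS)
 *     printf("mapsize %" PRIu64 "\n", info.mi_mapsize);
 */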
offsetof(MDBX_envinfo, mi_bootid); 24141 const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); 24142 if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && 24143 bytes != size_before_pgop_stat) 24144 return MDBX_EINVAL; 24145 24146 MDBX_envinfo snap; 24147 int rc = fetch_envinfo_ex(env, txn, &snap, sizeof(snap)); 24148 if (unlikely(rc != MDBX_SUCCESS)) 24149 return rc; 24150 24151 while (1) { 24152 rc = fetch_envinfo_ex(env, txn, arg, bytes); 24153 if (unlikely(rc != MDBX_SUCCESS)) 24154 return rc; 24155 snap.mi_since_sync_seconds16dot16 = arg->mi_since_sync_seconds16dot16; 24156 snap.mi_since_reader_check_seconds16dot16 = 24157 arg->mi_since_reader_check_seconds16dot16; 24158 if (likely(memcmp(&snap, arg, bytes) == 0)) 24159 return MDBX_SUCCESS; 24160 memcpy(&snap, arg, bytes); 24161 } 24162 } 24163 24164 static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags) { 24165 return (flags & MDBX_REVERSEKEY) ? cmp_reverse 24166 : (flags & MDBX_INTEGERKEY) ? cmp_int_align2 24167 : cmp_lexical; 24168 } 24169 24170 static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { 24171 return !(flags & MDBX_DUPSORT) 24172 ? cmp_lenfast 24173 : ((flags & MDBX_INTEGERDUP) 24174 ? cmp_int_unaligned 24175 : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); 24176 } 24177 24178 static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, 24179 MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { 24180 /* LY: so, accepting only three cases for the table's flags: 24181 * 1) user_flags and both comparators are zero 24182 * = assume that the by-default mode/flags are requested for reading; 24183 * 2) user_flags exactly the same 24184 * = assume that the target mode/flags are requested properly; 24185 * 3) user_flags differs, but the table is empty and MDBX_CREATE is provided 24186 * = assume that this is a proper create request with custom flags; 24187 */ 24188 if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) { 24189 /* flags differ, check other conditions */ 24190 if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) && 24191 (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) || 24192 user_flags == MDBX_ACCEDE) { 24193 /* no comparators were provided and flags are zero, 24194 * seems that this is case #1 above */ 24195 user_flags = txn->mt_dbs[dbi].md_flags; 24196 } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { 24197 if (txn->mt_flags & MDBX_TXN_RDONLY) 24198 return /* FIXME: return extended info */ MDBX_EACCESS; 24199 /* make sure flags changes get committed */ 24200 txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS; 24201 txn->mt_flags |= MDBX_TXN_DIRTY; 24202 } else { 24203 return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; 24204 } 24205 } 24206 24207 if (!keycmp) 24208 keycmp = txn->mt_dbxs[dbi].md_cmp ? txn->mt_dbxs[dbi].md_cmp 24209 : get_default_keycmp(user_flags); 24210 if (txn->mt_dbxs[dbi].md_cmp != keycmp) { 24211 if (txn->mt_dbxs[dbi].md_cmp) 24212 return MDBX_EINVAL; 24213 txn->mt_dbxs[dbi].md_cmp = keycmp; 24214 } 24215 24216 if (!datacmp) 24217 datacmp = txn->mt_dbxs[dbi].md_dcmp ?
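/* As with the key comparator just above: a comparator binds once for the
 * lifetime of the handle, and rebinding to a different function yields
 * MDBX_EINVAL. Custom comparators are supplied via mdbx_dbi_open_ex(),
 * e.g. (my_keycmp is a caller-provided MDBX_cmp_func, shown here as an
 * assumption):
 *
 *   MDBX_dbi dbi;
 *   int err = mdbx_dbi_open_ex(txn, "by-rank", MDBX_CREATE, &dbi,
 *                              my_keycmp, NULL);
 */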
txn->mt_dbxs[dbi].md_dcmp 24218 : get_default_datacmp(user_flags); 24219 if (txn->mt_dbxs[dbi].md_dcmp != datacmp) { 24220 if (txn->mt_dbxs[dbi].md_dcmp) 24221 return MDBX_EINVAL; 24222 txn->mt_dbxs[dbi].md_dcmp = datacmp; 24223 } 24224 24225 return MDBX_SUCCESS; 24226 } 24227 24228 static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, 24229 MDBX_dbi *dbi, MDBX_cmp_func *keycmp, 24230 MDBX_cmp_func *datacmp) { 24231 int rc = MDBX_EINVAL; 24232 if (unlikely(!dbi)) 24233 return rc; 24234 24235 if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { 24236 early_bailout: 24237 *dbi = 0; 24238 return rc; 24239 } 24240 24241 rc = check_txn(txn, MDBX_TXN_BLOCKED); 24242 if (unlikely(rc != MDBX_SUCCESS)) 24243 goto early_bailout; 24244 24245 switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | 24246 MDBX_REVERSEDUP | MDBX_ACCEDE)) { 24247 case MDBX_ACCEDE: 24248 if ((user_flags & MDBX_CREATE) == 0) 24249 break; 24250 __fallthrough /* fall through */; 24251 default: 24252 rc = MDBX_EINVAL; 24253 goto early_bailout; 24254 24255 case MDBX_DUPSORT: 24256 case MDBX_DUPSORT | MDBX_REVERSEDUP: 24257 case MDBX_DUPSORT | MDBX_DUPFIXED: 24258 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: 24259 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: 24260 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: 24261 case 0: 24262 break; 24263 } 24264 24265 /* main table? */ 24266 if (!table_name) { 24267 rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); 24268 if (unlikely(rc != MDBX_SUCCESS)) 24269 goto early_bailout; 24270 *dbi = MAIN_DBI; 24271 return rc; 24272 } 24273 24274 MDBX_env *env = txn->mt_env; 24275 size_t len = strlen(table_name); 24276 if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) 24277 return MDBX_EINVAL; 24278 24279 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { 24280 txn->mt_dbxs[MAIN_DBI].md_cmp = 24281 get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); 24282 txn->mt_dbxs[MAIN_DBI].md_dcmp = 24283 get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); 24284 } 24285 24286 /* Is the DB already open? */ 24287 MDBX_dbi scan, slot; 24288 for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { 24289 if (!txn->mt_dbxs[scan].md_name.iov_len) { 24290 /* Remember this free slot */ 24291 slot = scan; 24292 continue; 24293 } 24294 if (len == txn->mt_dbxs[scan].md_name.iov_len && 24295 !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { 24296 rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); 24297 if (unlikely(rc != MDBX_SUCCESS)) 24298 goto early_bailout; 24299 *dbi = scan; 24300 return rc; 24301 } 24302 } 24303 24304 /* Fail, if no free slot and max hit */ 24305 if (unlikely(slot >= env->me_maxdbs)) { 24306 rc = MDBX_DBS_FULL; 24307 goto early_bailout; 24308 } 24309 24310 /* Cannot mix named table with some main-table flags */ 24311 if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & 24312 (MDBX_DUPSORT | MDBX_INTEGERKEY))) { 24313 rc = (user_flags & MDBX_CREATE) ? 
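/* named tables live as F_SUBDATA records inside the main table, therefore
 * a main table flagged MDBX_DUPSORT or MDBX_INTEGERKEY cannot host them:
 * a create attempt gets MDBX_INCOMPATIBLE, a lookup gets MDBX_NOTFOUND */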
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; 24314 goto early_bailout; 24315 } 24316 24317 /* Find the DB info */ 24318 MDBX_val key, data; 24319 key.iov_len = len; 24320 key.iov_base = (void *)table_name; 24321 MDBX_cursor_couple couple; 24322 rc = cursor_init(&couple.outer, txn, MAIN_DBI); 24323 if (unlikely(rc != MDBX_SUCCESS)) 24324 goto early_bailout; 24325 rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; 24326 if (unlikely(rc != MDBX_SUCCESS)) { 24327 if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) 24328 goto early_bailout; 24329 } else { 24330 /* make sure this is actually a table */ 24331 MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], 24332 couple.outer.mc_ki[couple.outer.mc_top]); 24333 if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { 24334 rc = MDBX_INCOMPATIBLE; 24335 goto early_bailout; 24336 } 24337 if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { 24338 rc = MDBX_CORRUPTED; 24339 goto early_bailout; 24340 } 24341 } 24342 24343 if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { 24344 rc = MDBX_EACCESS; 24345 goto early_bailout; 24346 } 24347 24348 /* Done here so we cannot fail after creating a new DB */ 24349 char *namedup = osal_strdup(table_name); 24350 if (unlikely(!namedup)) { 24351 rc = MDBX_ENOMEM; 24352 goto early_bailout; 24353 } 24354 24355 int err = osal_fastmutex_acquire(&env->me_dbi_lock); 24356 if (unlikely(err != MDBX_SUCCESS)) { 24357 rc = err; 24358 osal_free(namedup); 24359 goto early_bailout; 24360 } 24361 24362 /* Import handles from env */ 24363 dbi_import_locked(txn); 24364 24365 /* Rescan after mutex acquisition & import handles */ 24366 for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { 24367 if (!txn->mt_dbxs[scan].md_name.iov_len) { 24368 /* Remember this free slot */ 24369 slot = scan; 24370 continue; 24371 } 24372 if (len == txn->mt_dbxs[scan].md_name.iov_len && 24373 !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { 24374 rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); 24375 if (unlikely(rc != MDBX_SUCCESS)) 24376 goto later_bailout; 24377 *dbi = scan; 24378 goto later_exit; 24379 } 24380 } 24381 24382 if (unlikely(slot >= env->me_maxdbs)) { 24383 rc = MDBX_DBS_FULL; 24384 goto later_bailout; 24385 } 24386 24387 unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; 24388 MDBX_db db_dummy; 24389 if (unlikely(rc)) { 24390 /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ 24391 tASSERT(txn, rc == MDBX_NOTFOUND); 24392 memset(&db_dummy, 0, sizeof(db_dummy)); 24393 db_dummy.md_root = P_INVALID; 24394 db_dummy.md_mod_txnid = txn->mt_txnid; 24395 db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; 24396 data.iov_len = sizeof(db_dummy); 24397 data.iov_base = &db_dummy; 24398 WITH_CURSOR_TRACKING(couple.outer, 24399 rc = mdbx_cursor_put(&couple.outer, &key, &data, 24400 F_SUBDATA | MDBX_NOOVERWRITE)); 24401 24402 if (unlikely(rc != MDBX_SUCCESS)) 24403 goto later_bailout; 24404 24405 dbiflags |= DBI_DIRTY | DBI_CREAT; 24406 txn->mt_flags |= MDBX_TXN_DIRTY; 24407 tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); 24408 } 24409 24410 /* Got info, register DBI in this txn */ 24411 memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); 24412 memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); 24413 env->me_dbflags[slot] = 0; 24414 rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); 24415 if (unlikely(rc != MDBX_SUCCESS)) { 24416 tASSERT(txn, (dbiflags & DBI_CREAT) == 0); 24417 later_bailout: 24418 *dbi = 0; 
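/* fall through: both the bailout and the found-while-rescanning paths
 * must free the duplicated name; only the success branch below adopts
 * namedup as md_name */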
24419 later_exit: 24420 osal_free(namedup); 24421 } else { 24422 txn->mt_dbistate[slot] = (uint8_t)dbiflags; 24423 txn->mt_dbxs[slot].md_name.iov_base = namedup; 24424 txn->mt_dbxs[slot].md_name.iov_len = len; 24425 txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = 24426 dbi_seq(env, slot); 24427 if (!(dbiflags & DBI_CREAT)) 24428 env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; 24429 if (txn->mt_numdbs == slot) { 24430 txn->mt_cursors[slot] = NULL; 24431 osal_compiler_barrier(); 24432 txn->mt_numdbs = slot + 1; 24433 } 24434 if (env->me_numdbs <= slot) { 24435 osal_memory_fence(mo_AcquireRelease, true); 24436 env->me_numdbs = slot + 1; 24437 } 24438 *dbi = slot; 24439 } 24440 24441 ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); 24442 return rc; 24443 } 24444 24445 int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, 24446 MDBX_db_flags_t table_flags, MDBX_dbi *dbi) { 24447 return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr); 24448 } 24449 24450 int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, 24451 MDBX_db_flags_t table_flags, MDBX_dbi *dbi, 24452 MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { 24453 return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); 24454 } 24455 24456 __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, 24457 size_t bytes) { 24458 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 24459 if (unlikely(rc != MDBX_SUCCESS)) 24460 return rc; 24461 24462 if (unlikely(!dest)) 24463 return MDBX_EINVAL; 24464 24465 if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) 24466 return MDBX_BAD_DBI; 24467 24468 const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); 24469 if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) 24470 return MDBX_EINVAL; 24471 24472 if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) 24473 return MDBX_BAD_TXN; 24474 24475 if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { 24476 rc = fetch_sdb(txn, dbi); 24477 if (unlikely(rc != MDBX_SUCCESS)) 24478 return rc; 24479 } 24480 24481 dest->ms_psize = txn->mt_env->me_psize; 24482 stat_get(&txn->mt_dbs[dbi], dest, bytes); 24483 return MDBX_SUCCESS; 24484 } 24485 24486 static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { 24487 eASSERT(env, dbi >= CORE_DBS); 24488 if (unlikely(dbi >= env->me_numdbs)) 24489 return MDBX_BAD_DBI; 24490 24491 char *ptr = env->me_dbxs[dbi].md_name.iov_base; 24492 /* If there was no name, this was already closed */ 24493 if (unlikely(!ptr)) 24494 return MDBX_BAD_DBI; 24495 24496 env->me_dbflags[dbi] = 0; 24497 env->me_dbxs[dbi].md_name.iov_len = 0; 24498 osal_memory_fence(mo_AcquireRelease, true); 24499 env->me_dbxs[dbi].md_name.iov_base = NULL; 24500 osal_free(ptr); 24501 24502 if (env->me_numdbs == dbi + 1) { 24503 unsigned i = env->me_numdbs; 24504 do 24505 --i; 24506 while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); 24507 env->me_numdbs = i; 24508 } 24509 24510 return MDBX_SUCCESS; 24511 } 24512 24513 int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { 24514 int rc = check_env(env, true); 24515 if (unlikely(rc != MDBX_SUCCESS)) 24516 return rc; 24517 24518 if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) 24519 return MDBX_BAD_DBI; 24520 24521 rc = osal_fastmutex_acquire(&env->me_dbi_lock); 24522 if (likely(rc == MDBX_SUCCESS)) { 24523 rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) 24524 ? 
dbi_close_locked(env, dbi) 24525 : MDBX_BAD_DBI; 24526 ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); 24527 } 24528 return rc; 24529 } 24530 24531 int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, 24532 unsigned *state) { 24533 int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); 24534 if (unlikely(rc != MDBX_SUCCESS)) 24535 return rc; 24536 24537 if (unlikely(!flags || !state)) 24538 return MDBX_EINVAL; 24539 24540 if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) 24541 return MDBX_BAD_DBI; 24542 24543 *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS; 24544 *state = 24545 txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); 24546 24547 return MDBX_SUCCESS; 24548 } 24549 24550 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 24551 int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { 24552 return __inline_mdbx_dbi_flags(txn, dbi, flags); 24553 } 24554 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 24555 24556 static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { 24557 int rc = page_search(mc, NULL, MDBX_PS_FIRST); 24558 if (likely(rc == MDBX_SUCCESS)) { 24559 MDBX_txn *txn = mc->mc_txn; 24560 24561 /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. 24562 * This also avoids any P_LEAF2 pages, which have no nodes. 24563 * Also if the DB doesn't have sub-DBs and has no large/overflow 24564 * pages, omit scanning leaves. */ 24565 if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) 24566 cursor_pop(mc); 24567 24568 rc = pnl_need(&txn->tw.retired_pages, mc->mc_db->md_branch_pages + 24569 mc->mc_db->md_leaf_pages + 24570 mc->mc_db->md_overflow_pages); 24571 if (unlikely(rc != MDBX_SUCCESS)) 24572 goto bailout; 24573 24574 MDBX_cursor mx; 24575 cursor_copy(mc, &mx); 24576 while (mc->mc_snum > 0) { 24577 MDBX_page *const mp = mc->mc_pg[mc->mc_top]; 24578 const unsigned nkeys = page_numkeys(mp); 24579 if (IS_LEAF(mp)) { 24580 cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); 24581 for (unsigned i = 0; i < nkeys; i++) { 24582 MDBX_node *node = page_node(mp, i); 24583 if (node_flags(node) & F_BIGDATA) { 24584 rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); 24585 if (unlikely(rc != MDBX_SUCCESS)) 24586 goto bailout; 24587 if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) 24588 goto pop; 24589 } else if (node_flags(node) & F_SUBDATA) { 24590 if (unlikely((node_flags(node) & F_DUPDATA) == 0)) { 24591 rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; 24592 goto bailout; 24593 } 24594 rc = cursor_xinit1(mc, node, mp); 24595 if (unlikely(rc != MDBX_SUCCESS)) 24596 goto bailout; 24597 rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); 24598 if (unlikely(rc != MDBX_SUCCESS)) 24599 goto bailout; 24600 } 24601 } 24602 } else { 24603 cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); 24604 mc->mc_checking |= CC_RETIRING; 24605 const unsigned pagetype = 24606 (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + 24607 ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? 
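/* children of the last branch level are leaf pages,
 * anything higher fans out to further branch pages */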
P_LEAF : P_BRANCH); 24608 for (unsigned i = 0; i < nkeys; i++) { 24609 MDBX_node *node = page_node(mp, i); 24610 tASSERT(txn, (node_flags(node) & 24611 (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); 24612 const pgno_t pgno = node_pgno(node); 24613 rc = page_retire_ex(mc, pgno, nullptr, pagetype); 24614 if (unlikely(rc != MDBX_SUCCESS)) 24615 goto bailout; 24616 } 24617 mc->mc_checking -= CC_RETIRING; 24618 } 24619 if (!mc->mc_top) 24620 break; 24621 cASSERT(mc, nkeys > 0); 24622 mc->mc_ki[mc->mc_top] = (indx_t)nkeys; 24623 rc = cursor_sibling(mc, SIBLING_RIGHT); 24624 if (unlikely(rc != MDBX_SUCCESS)) { 24625 if (unlikely(rc != MDBX_NOTFOUND)) 24626 goto bailout; 24627 /* no more siblings, go back to beginning 24628 * of previous level. */ 24629 pop: 24630 cursor_pop(mc); 24631 mc->mc_ki[0] = 0; 24632 for (unsigned i = 1; i < mc->mc_snum; i++) { 24633 mc->mc_ki[i] = 0; 24634 mc->mc_pg[i] = mx.mc_pg[i]; 24635 } 24636 } 24637 } 24638 rc = page_retire(mc, mc->mc_pg[0]); 24639 bailout: 24640 if (unlikely(rc != MDBX_SUCCESS)) 24641 txn->mt_flags |= MDBX_TXN_ERROR; 24642 } else if (rc == MDBX_NOTFOUND) { 24643 rc = MDBX_SUCCESS; 24644 } 24645 mc->mc_flags &= ~C_INITIALIZED; 24646 return rc; 24647 } 24648 24649 int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { 24650 int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); 24651 if (unlikely(rc != MDBX_SUCCESS)) 24652 return rc; 24653 24654 MDBX_cursor *mc; 24655 rc = mdbx_cursor_open(txn, dbi, &mc); 24656 if (unlikely(rc != MDBX_SUCCESS)) 24657 return rc; 24658 24659 rc = drop_tree(mc, 24660 dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); 24661 /* Invalidate the dropped DB's cursors */ 24662 for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) 24663 m2->mc_flags &= ~(C_INITIALIZED | C_EOF); 24664 if (unlikely(rc)) 24665 goto bailout; 24666 24667 /* Can't delete the main DB */ 24668 if (del && dbi >= CORE_DBS) { 24669 rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); 24670 if (likely(rc == MDBX_SUCCESS)) { 24671 tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); 24672 tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); 24673 txn->mt_dbistate[dbi] = DBI_STALE; 24674 MDBX_env *env = txn->mt_env; 24675 rc = osal_fastmutex_acquire(&env->me_dbi_lock); 24676 if (unlikely(rc != MDBX_SUCCESS)) { 24677 txn->mt_flags |= MDBX_TXN_ERROR; 24678 goto bailout; 24679 } 24680 dbi_close_locked(env, dbi); 24681 ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); 24682 } else { 24683 txn->mt_flags |= MDBX_TXN_ERROR; 24684 } 24685 } else { 24686 /* reset the DB record, mark it dirty */ 24687 txn->mt_dbistate[dbi] |= DBI_DIRTY; 24688 txn->mt_dbs[dbi].md_depth = 0; 24689 txn->mt_dbs[dbi].md_branch_pages = 0; 24690 txn->mt_dbs[dbi].md_leaf_pages = 0; 24691 txn->mt_dbs[dbi].md_overflow_pages = 0; 24692 txn->mt_dbs[dbi].md_entries = 0; 24693 txn->mt_dbs[dbi].md_root = P_INVALID; 24694 txn->mt_dbs[dbi].md_seq = 0; 24695 txn->mt_flags |= MDBX_TXN_DIRTY; 24696 } 24697 24698 bailout: 24699 mdbx_cursor_close(mc); 24700 return rc; 24701 } 24702 24703 int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { 24704 int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); 24705 if (unlikely(rc != MDBX_SUCCESS)) 24706 return rc; 24707 24708 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 24709 return MDBX_BAD_DBI; 24710 24711 txn->mt_dbxs[dbi].md_cmp = cmp; 24712 return MDBX_SUCCESS; 24713 } 24714 24715 int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { 24716 int rc = check_txn(txn, 
MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); 24717 if (unlikely(rc != MDBX_SUCCESS)) 24718 return rc; 24719 24720 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 24721 return MDBX_BAD_DBI; 24722 24723 txn->mt_dbxs[dbi].md_dcmp = cmp; 24724 return MDBX_SUCCESS; 24725 } 24726 24727 __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, 24728 void *ctx) { 24729 int rc = check_env(env, true); 24730 if (unlikely(rc != MDBX_SUCCESS)) 24731 return rc; 24732 24733 if (unlikely(!func)) 24734 return MDBX_EINVAL; 24735 24736 rc = MDBX_RESULT_TRUE; 24737 int serial = 0; 24738 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 24739 if (likely(lck)) { 24740 const unsigned snap_nreaders = 24741 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 24742 for (unsigned i = 0; i < snap_nreaders; i++) { 24743 const MDBX_reader *r = lck->mti_readers + i; 24744 retry_reader:; 24745 const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); 24746 if (!pid) 24747 continue; 24748 txnid_t txnid = safe64_read(&r->mr_txnid); 24749 const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed); 24750 const pgno_t pages_used = 24751 atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed); 24752 const uint64_t reader_pages_retired = 24753 atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed); 24754 if (unlikely( 24755 txnid != safe64_read(&r->mr_txnid) || 24756 pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) || 24757 tid != atomic_load64(&r->mr_tid, mo_Relaxed) || 24758 pages_used != 24759 atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) || 24760 reader_pages_retired != 24761 atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) 24762 goto retry_reader; 24763 24764 eASSERT(env, txnid > 0); 24765 if (txnid >= SAFE64_INVALID_THRESHOLD) 24766 txnid = 0; 24767 24768 size_t bytes_used = 0; 24769 size_t bytes_retained = 0; 24770 uint64_t lag = 0; 24771 if (txnid) { 24772 meta_troika_t troika = meta_tap(env); 24773 retry_header:; 24774 const meta_ptr_t head = meta_recent(env, &troika); 24775 const uint64_t head_pages_retired = 24776 unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); 24777 if (unlikely(meta_should_retry(env, &troika) || 24778 head_pages_retired != 24779 unaligned_peek_u64_volatile( 24780 4, head.ptr_v->mm_pages_retired))) 24781 goto retry_header; 24782 24783 lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; 24784 bytes_used = pgno2bytes(env, pages_used); 24785 bytes_retained = (head_pages_retired > reader_pages_retired) 24786 ? pgno2bytes(env, (pgno_t)(head_pages_retired - 24787 reader_pages_retired)) 24788 : 0; 24789 } 24790 rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used, 24791 bytes_retained); 24792 if (unlikely(rc != MDBX_SUCCESS)) 24793 break; 24794 } 24795 } 24796 24797 return rc; 24798 } 24799 24800 /* Insert pid into the list if not already present. 24801 * Returns false if already present. 
*/ 24802 __cold static bool pid_insert(uint32_t *ids, uint32_t pid) { 24803 /* binary search of pid in list */ 24804 unsigned base = 0; 24805 unsigned cursor = 1; 24806 int val = 0; 24807 unsigned n = ids[0]; 24808 24809 while (n > 0) { 24810 unsigned pivot = n >> 1; 24811 cursor = base + pivot + 1; 24812 val = pid - ids[cursor]; 24813 24814 if (val < 0) { 24815 n = pivot; 24816 } else if (val > 0) { 24817 base = cursor; 24818 n -= pivot + 1; 24819 } else { 24820 /* found, so it's a duplicate */ 24821 return false; 24822 } 24823 } 24824 24825 if (val > 0) 24826 ++cursor; 24827 24828 ids[0]++; 24829 for (n = ids[0]; n > cursor; n--) 24830 ids[n] = ids[n - 1]; 24831 ids[n] = pid; 24832 return true; 24833 } 24834 24835 __cold int mdbx_reader_check(MDBX_env *env, int *dead) { 24836 if (dead) 24837 *dead = 0; 24838 return cleanup_dead_readers(env, false, dead); 24839 } 24840 24841 /* Return: 24842 * MDBX_RESULT_TRUE - done and mutex recovered 24843 * MDBX_SUCCESS - done 24844 * Otherwise errcode. */ 24845 __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, 24846 int rdt_locked, int *dead) { 24847 int rc = check_env(env, true); 24848 if (unlikely(rc != MDBX_SUCCESS)) 24849 return rc; 24850 24851 eASSERT(env, rdt_locked >= 0); 24852 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 24853 if (unlikely(lck == NULL)) { 24854 /* exclusive mode */ 24855 if (dead) 24856 *dead = 0; 24857 return MDBX_SUCCESS; 24858 } 24859 24860 const unsigned snap_nreaders = 24861 atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); 24862 uint32_t pidsbuf_onstask[142]; 24863 uint32_t *const pids = 24864 (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) 24865 ? pidsbuf_onstask 24866 : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); 24867 if (unlikely(!pids)) 24868 return MDBX_ENOMEM; 24869 24870 pids[0] = 0; 24871 int count = 0; 24872 for (unsigned i = 0; i < snap_nreaders; i++) { 24873 const uint32_t pid = 24874 atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); 24875 if (pid == 0) 24876 continue /* skip empty */; 24877 if (pid == env->me_pid) 24878 continue /* skip self */; 24879 if (!pid_insert(pids, pid)) 24880 continue /* this pid was already processed */; 24881 24882 int err = osal_rpid_check(env, pid); 24883 if (err == MDBX_RESULT_TRUE) 24884 continue /* reader is live */; 24885 24886 if (err != MDBX_SUCCESS) { 24887 rc = err; 24888 break /* osal_rpid_check() failed */; 24889 } 24890 24891 /* stale reader found */ 24892 if (!rdt_locked) { 24893 err = osal_rdt_lock(env); 24894 if (MDBX_IS_ERROR(err)) { 24895 rc = err; 24896 break; 24897 } 24898 24899 rdt_locked = -1; 24900 if (err == MDBX_RESULT_TRUE) { 24901 /* mutex recovered, the mdbx_ipclock_failed() checked all readers */ 24902 rc = MDBX_RESULT_TRUE; 24903 break; 24904 } 24905 24906 /* another process may have cleaned and reused the slot, recheck */ 24907 if (lck->mti_readers[i].mr_pid.weak != pid) 24908 continue; 24909 24910 err = osal_rpid_check(env, pid); 24911 if (MDBX_IS_ERROR(err)) { 24912 rc = err; 24913 break; 24914 } 24915 24916 if (err != MDBX_SUCCESS) 24917 continue /* lost the race with another process, the slot was reused */; 24918 } 24919 24920 /* clean it */ 24921 for (unsigned j = i; j < snap_nreaders; j++) { 24922 if (lck->mti_readers[j].mr_pid.weak == pid) { 24923 DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, 24924 lck->mti_readers[j].mr_txnid.weak); 24925 atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed); 24926 atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); 24927 count++;
24928 } 24929 } 24930 } 24931 24932 if (likely(!MDBX_IS_ERROR(rc))) 24933 atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(), 24934 mo_Relaxed); 24935 24936 if (rdt_locked < 0) 24937 osal_rdt_unlock(env); 24938 24939 if (pids != pidsbuf_onstask) 24940 osal_free(pids); 24941 24942 if (dead) 24943 *dead = count; 24944 return rc; 24945 } 24946 24947 __cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { 24948 const int rc = runtime_flags | (loglevel << 16); 24949 24950 if (level != MDBX_LOG_DONTCHANGE) 24951 loglevel = (uint8_t)level; 24952 24953 if (flags != MDBX_DBG_DONTCHANGE) { 24954 flags &= 24955 #if MDBX_DEBUG 24956 MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | 24957 #endif 24958 MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | 24959 MDBX_DBG_DONT_UPGRADE; 24960 runtime_flags = (uint8_t)flags; 24961 } 24962 24963 if (logger != MDBX_LOGGER_DONTCHANGE) 24964 debug_logger = logger; 24965 return rc; 24966 } 24967 24968 __cold static txnid_t kick_longlived_readers(MDBX_env *env, 24969 const txnid_t laggard) { 24970 DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); 24971 osal_memory_fence(mo_AcquireRelease, false); 24972 MDBX_hsr_func *const callback = env->me_hsr_callback; 24973 txnid_t oldest = 0; 24974 bool notify_eof_of_loop = false; 24975 int retry = 0; 24976 do { 24977 const txnid_t steady = 24978 env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady]; 24979 env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; 24980 oldest = find_oldest_reader(env, steady); 24981 eASSERT(env, oldest < env->me_txn0->mt_txnid); 24982 eASSERT(env, oldest >= laggard); 24983 eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); 24984 24985 MDBX_lockinfo *const lck = env->me_lck_mmap.lck; 24986 if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) 24987 break; 24988 24989 if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL))) 24990 break; 24991 24992 if (!callback) 24993 break; 24994 24995 MDBX_reader *stucked = nullptr; 24996 uint64_t hold_retired = 0; 24997 for (unsigned i = 0; i < lck->mti_numreaders.weak; ++i) { 24998 const uint64_t snap_retired = atomic_load64( 24999 &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); 25000 const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); 25001 if (rtxn == laggard && 25002 atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { 25003 hold_retired = snap_retired; 25004 stucked = &lck->mti_readers[i]; 25005 } 25006 } 25007 25008 if (!stucked) 25009 break; 25010 25011 uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease); 25012 uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease); 25013 if (safe64_read(&stucked->mr_txnid) != laggard || !pid || 25014 stucked->mr_snapshot_pages_retired.weak != hold_retired) 25015 continue; 25016 25017 const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika); 25018 const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; 25019 const uint64_t head_retired = 25020 unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); 25021 const size_t space = 25022 (head_retired > hold_retired) 25023 ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) 25024 : 0; 25025 int rc = 25026 callback(env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, 25027 (gap < UINT_MAX) ? 
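/* The Handle-Slow-Readers callback decides the laggard's fate: a negative
 * result cancels the kick (the writer will get MDBX_MAP_FULL), 0 asks to
 * retry, 1 reports the reader txn aborted asynchronously, >1 clears the
 * whole reader slot. A minimal callback sketch (my_hsr is hypothetical),
 * installed via mdbx_env_set_hsr(env, my_hsr):
 *
 *   static int my_hsr(const MDBX_env *e, const MDBX_txn *t, mdbx_pid_t pid,
 *                     mdbx_tid_t tid, uint64_t laggard, unsigned gap,
 *                     size_t space, int retry) {
 *     (void)e; (void)t; (void)pid; (void)tid; (void)laggard; (void)space;
 *     return (gap > 1000 && retry > 3) ? 2 : 0;
 *   }
 */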
(unsigned)gap : UINT_MAX, space, retry); 25028 if (rc < 0) 25029 /* hsr returned an error and/or agreed to the MDBX_MAP_FULL error */ 25030 break; 25031 25032 if (rc > 0) { 25033 if (rc == 1) { 25034 /* hsr reported the transaction is (or will be) aborted asynchronously */ 25035 safe64_reset_compare(&stucked->mr_txnid, laggard); 25036 } else { 25037 /* hsr reported reader process was killed and slot should be cleared */ 25038 safe64_reset(&stucked->mr_txnid, true); 25039 atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); 25040 atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); 25041 } 25042 } else 25043 notify_eof_of_loop = true; 25044 25045 } while (++retry < INT_MAX); 25046 25047 if (notify_eof_of_loop) { 25048 /* notify end of hsr-loop */ 25049 const txnid_t turn = oldest - laggard; 25050 if (turn) 25051 NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, 25052 laggard, oldest, turn); 25053 callback(env, env->me_txn, 0, 0, laggard, 25054 (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry); 25055 } 25056 return oldest; 25057 } 25058 25059 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 25060 __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { 25061 return __inline_mdbx_env_set_syncbytes(env, threshold); 25062 } 25063 25064 __cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { 25065 return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); 25066 } 25067 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 25068 25069 __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { 25070 int rc = check_env(env, false); 25071 if (unlikely(rc != MDBX_SUCCESS)) 25072 return rc; 25073 25074 env->me_hsr_callback = hsr; 25075 return MDBX_SUCCESS; 25076 } 25077 25078 __cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) { 25079 return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE) 25080 ? env->me_hsr_callback 25081 : NULL; 25082 } 25083 25084 #ifdef __SANITIZE_THREAD__ 25085 /* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ 25086 __attribute__((__no_sanitize_thread__, __noinline__)) 25087 #endif 25088 int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) 25089 { 25090 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 25091 if (unlikely(rc != MDBX_SUCCESS)) 25092 return (rc > 0) ? -rc : rc; 25093 25094 MDBX_env *env = txn->mt_env; 25095 if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) { 25096 if (percent) 25097 *percent = 25098 (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) / 25099 txn->mt_end_pgno); 25100 return 0; 25101 } 25102 25103 txnid_t lag; 25104 meta_troika_t troika = meta_tap(env); 25105 do { 25106 const meta_ptr_t head = meta_recent(env, &troika); 25107 if (percent) { 25108 const pgno_t maxpg = head.ptr_v->mm_geo.now; 25109 *percent = 25110 (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); 25111 } 25112 lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; 25113 } while (unlikely(meta_should_retry(env, &troika))); 25114 25115 return (lag > INT_MAX) ?
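/* A minimal monitoring sketch: a non-negative result is the lag of the
 * read txn in transactions, a negative one is an error code:
 *
 *   int percent;
 *   const int lag = mdbx_txn_straggler(txn, &percent);
 *   if (lag > 0)
 *     fprintf(stderr, "reader %d txns behind, DB %d%% filled\n",
 *             lag, percent);
 */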
INT_MAX : (int)lag; 25116 } 25117 25118 typedef struct mdbx_walk_ctx { 25119 void *mw_user; 25120 MDBX_pgvisitor_func *mw_visitor; 25121 MDBX_txn *mw_txn; 25122 MDBX_cursor *mw_cursor; 25123 bool mw_dont_check_keys_ordering; 25124 } mdbx_walk_ctx_t; 25125 25126 __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, 25127 const char *name, int deep); 25128 25129 static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { 25130 if (mp) 25131 switch (mp->mp_flags) { 25132 case P_BRANCH: 25133 return MDBX_page_branch; 25134 case P_LEAF: 25135 return MDBX_page_leaf; 25136 case P_LEAF | P_LEAF2: 25137 return MDBX_page_dupfixed_leaf; 25138 case P_OVERFLOW: 25139 return MDBX_page_large; 25140 case P_META: 25141 return MDBX_page_meta; 25142 } 25143 return MDBX_page_broken; 25144 } 25145 25146 /* Depth-first tree traversal. */ 25147 __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, 25148 const char *name, int deep, txnid_t parent_txnid) { 25149 assert(pgno != P_INVALID); 25150 MDBX_page *mp = nullptr; 25151 int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); 25152 25153 MDBX_page_type_t type = walk_page_type(mp); 25154 const unsigned nentries = mp ? page_numkeys(mp) : 0; 25155 unsigned npages = 1; 25156 size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); 25157 size_t header_size = 25158 (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ; 25159 size_t payload_size = 0; 25160 size_t unused_size = 25161 (mp ? page_room(mp) : pagesize - header_size) - payload_size; 25162 size_t align_bytes = 0; 25163 25164 for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; 25165 align_bytes += ((payload_size + align_bytes) & 1), ++i) { 25166 if (type == MDBX_page_dupfixed_leaf) { 25167 /* LEAF2 pages have no mp_ptrs[] or node headers */ 25168 payload_size += mp->mp_leaf2_ksize; 25169 continue; 25170 } 25171 25172 MDBX_node *node = page_node(mp, i); 25173 payload_size += NODESIZE + node_ks(node); 25174 25175 if (type == MDBX_page_branch) { 25176 assert(i > 0 || node_ks(node) == 0); 25177 continue; 25178 } 25179 25180 assert(type == MDBX_page_leaf); 25181 switch (node_flags(node)) { 25182 case 0 /* usual node */: 25183 payload_size += node_ds(node); 25184 break; 25185 25186 case F_BIGDATA /* long data on the large/overflow page */: { 25187 payload_size += sizeof(pgno_t); 25188 const pgno_t large_pgno = node_largedata_pgno(node); 25189 const size_t over_payload = node_ds(node); 25190 const size_t over_header = PAGEHDRSZ; 25191 npages = 1; 25192 25193 assert(err == MDBX_SUCCESS); 25194 pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); 25195 err = lp.err; 25196 if (err == MDBX_SUCCESS) { 25197 cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); 25198 npages = lp.page->mp_pages; 25199 } 25200 25201 pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); 25202 const size_t over_unused = pagesize - over_payload - over_header; 25203 const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, 25204 name, pagesize, MDBX_page_large, err, 1, 25205 over_payload, over_header, over_unused); 25206 if (unlikely(rc != MDBX_SUCCESS)) 25207 return (rc == MDBX_RESULT_TRUE) ? 
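/* MDBX_RESULT_TRUE from the visitor means "stop the walk", not a failure,
 * hence it maps to MDBX_SUCCESS here and below */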
MDBX_SUCCESS : rc; 25208 } break; 25209 25210 case F_SUBDATA /* sub-db */: { 25211 const size_t namelen = node_ks(node); 25212 payload_size += node_ds(node); 25213 if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { 25214 assert(err == MDBX_CORRUPTED); 25215 err = MDBX_CORRUPTED; 25216 } 25217 } break; 25218 25219 case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: 25220 payload_size += sizeof(MDBX_db); 25221 if (unlikely(node_ds(node) != sizeof(MDBX_db))) { 25222 assert(err == MDBX_CORRUPTED); 25223 err = MDBX_CORRUPTED; 25224 } 25225 break; 25226 25227 case F_DUPDATA /* short sub-page */: { 25228 if (unlikely(node_ds(node) <= PAGEHDRSZ)) { 25229 assert(err == MDBX_CORRUPTED); 25230 err = MDBX_CORRUPTED; 25231 break; 25232 } 25233 25234 MDBX_page *sp = node_data(node); 25235 const unsigned nsubkeys = page_numkeys(sp); 25236 size_t subheader_size = 25237 IS_LEAF2(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; 25238 size_t subunused_size = page_room(sp); 25239 size_t subpayload_size = 0; 25240 size_t subalign_bytes = 0; 25241 MDBX_page_type_t subtype; 25242 25243 switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { 25244 case P_LEAF | P_SUBP: 25245 subtype = MDBX_subpage_leaf; 25246 break; 25247 case P_LEAF | P_LEAF2 | P_SUBP: 25248 subtype = MDBX_subpage_dupfixed_leaf; 25249 break; 25250 default: 25251 assert(err == MDBX_CORRUPTED); 25252 subtype = MDBX_subpage_broken; 25253 err = MDBX_CORRUPTED; 25254 } 25255 25256 for (unsigned j = 0; err == MDBX_SUCCESS && j < nsubkeys; 25257 subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { 25258 25259 if (subtype == MDBX_subpage_dupfixed_leaf) { 25260 /* LEAF2 pages have no mp_ptrs[] or node headers */ 25261 subpayload_size += sp->mp_leaf2_ksize; 25262 } else { 25263 assert(subtype == MDBX_subpage_leaf); 25264 MDBX_node *subnode = page_node(sp, j); 25265 subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); 25266 if (unlikely(node_flags(subnode) != 0)) { 25267 assert(err == MDBX_CORRUPTED); 25268 err = MDBX_CORRUPTED; 25269 } 25270 } 25271 } 25272 25273 const int rc = 25274 ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), 25275 subtype, err, nsubkeys, subpayload_size, 25276 subheader_size, subunused_size + subalign_bytes); 25277 if (unlikely(rc != MDBX_SUCCESS)) 25278 return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; 25279 header_size += subheader_size; 25280 unused_size += subunused_size; 25281 payload_size += subpayload_size; 25282 align_bytes += subalign_bytes; 25283 } break; 25284 25285 default: 25286 assert(err == MDBX_CORRUPTED); 25287 err = MDBX_CORRUPTED; 25288 } 25289 } 25290 25291 const int rc = ctx->mw_visitor( 25292 pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, 25293 err, nentries, payload_size, header_size, unused_size + align_bytes); 25294 if (unlikely(rc != MDBX_SUCCESS)) 25295 return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; 25296 25297 for (unsigned i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { 25298 if (type == MDBX_page_dupfixed_leaf) 25299 continue; 25300 25301 MDBX_node *node = page_node(mp, i); 25302 if (type == MDBX_page_branch) { 25303 assert(err == MDBX_SUCCESS); 25304 err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); 25305 if (unlikely(err != MDBX_SUCCESS)) { 25306 if (err == MDBX_RESULT_TRUE) 25307 break; 25308 return err; 25309 } 25310 continue; 25311 } 25312 25313 assert(type == MDBX_page_leaf); 25314 MDBX_db db; 25315 switch (node_flags(node)) { 25316 default: 25317 continue; 25318 25319 case F_SUBDATA /* sub-db */: { 25320 const size_t namelen = node_ks(node); 25321 if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { 25322 assert(err == MDBX_CORRUPTED); 25323 err = MDBX_CORRUPTED; 25324 break; 25325 } 25326 25327 char namebuf_onstask[64]; 25328 char *const sub_name = (namelen < sizeof(namebuf_onstask)) 25329 ? namebuf_onstask 25330 : osal_malloc(namelen + 1); 25331 if (unlikely(!sub_name)) 25332 return MDBX_ENOMEM; 25333 memcpy(sub_name, node_key(node), namelen); 25334 sub_name[namelen] = 0; 25335 memcpy(&db, node_data(node), sizeof(db)); 25336 assert(err == MDBX_SUCCESS); 25337 err = walk_sdb(ctx, &db, sub_name, deep + 1); 25338 if (sub_name != namebuf_onstask) 25339 osal_free(sub_name); 25340 } break; 25341 25342 case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: 25343 if (unlikely(node_ds(node) != sizeof(MDBX_db) || 25344 ctx->mw_cursor->mc_xcursor == NULL)) { 25345 assert(err == MDBX_CORRUPTED); 25346 err = MDBX_CORRUPTED; 25347 } else { 25348 memcpy(&db, node_data(node), sizeof(db)); 25349 assert(ctx->mw_cursor->mc_xcursor == 25350 &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); 25351 assert(err == MDBX_SUCCESS); 25352 err = cursor_xinit1(ctx->mw_cursor, node, mp); 25353 if (likely(err == MDBX_SUCCESS)) { 25354 ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; 25355 err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); 25356 MDBX_xcursor *inner_xcursor = 25357 container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); 25358 MDBX_cursor_couple *couple = 25359 container_of(inner_xcursor, MDBX_cursor_couple, inner); 25360 ctx->mw_cursor = &couple->outer; 25361 } 25362 } 25363 break; 25364 } 25365 } 25366 25367 return MDBX_SUCCESS; 25368 } 25369 25370 __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, 25371 const char *name, int deep) { 25372 if (unlikely(sdb->md_root == P_INVALID)) 25373 return MDBX_SUCCESS; /* empty db */ 25374 25375 MDBX_cursor_couple couple; 25376 MDBX_dbx dbx = {.md_klen_min = INT_MAX}; 25377 uint8_t dbistate = DBI_VALID | DBI_AUDITED; 25378 int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); 25379 if (unlikely(rc != MDBX_SUCCESS)) 25380 return rc; 25381 25382 couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering 25383 ? CC_SKIPORD | CC_PAGECHECK 25384 : CC_PAGECHECK; 25385 couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering 25386 ? CC_SKIPORD | CC_PAGECHECK 25387 : CC_PAGECHECK; 25388 couple.outer.mc_next = ctx->mw_cursor; 25389 ctx->mw_cursor = &couple.outer; 25390 rc = walk_tree(ctx, sdb->md_root, name, deep, 25391 sdb->md_mod_txnid ? 
sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); 25392 ctx->mw_cursor = couple.outer.mc_next; 25393 return rc; 25394 } 25395 25396 __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, 25397 void *user, bool dont_check_keys_ordering) { 25398 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 25399 if (unlikely(rc != MDBX_SUCCESS)) 25400 return rc; 25401 25402 mdbx_walk_ctx_t ctx; 25403 memset(&ctx, 0, sizeof(ctx)); 25404 ctx.mw_txn = txn; 25405 ctx.mw_user = user; 25406 ctx.mw_visitor = visitor; 25407 ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering; 25408 25409 rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META, 25410 pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS, 25411 NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, 25412 (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * 25413 NUM_METAS); 25414 if (!MDBX_IS_ERROR(rc)) 25415 rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); 25416 if (!MDBX_IS_ERROR(rc)) 25417 rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); 25418 return rc; 25419 } 25420 25421 int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { 25422 int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); 25423 if (unlikely(rc != MDBX_SUCCESS)) 25424 return rc; 25425 25426 if (likely(canary)) { 25427 if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && 25428 txn->mt_canary.z == canary->z) 25429 return MDBX_SUCCESS; 25430 txn->mt_canary.x = canary->x; 25431 txn->mt_canary.y = canary->y; 25432 txn->mt_canary.z = canary->z; 25433 } 25434 txn->mt_canary.v = txn->mt_txnid; 25435 txn->mt_flags |= MDBX_TXN_DIRTY; 25436 25437 return MDBX_SUCCESS; 25438 } 25439 25440 int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { 25441 int rc = check_txn(txn, MDBX_TXN_BLOCKED); 25442 if (unlikely(rc != MDBX_SUCCESS)) 25443 return rc; 25444 25445 if (unlikely(canary == NULL)) 25446 return MDBX_EINVAL; 25447 25448 *canary = txn->mt_canary; 25449 return MDBX_SUCCESS; 25450 } 25451 25452 int mdbx_cursor_on_first(const MDBX_cursor *mc) { 25453 if (unlikely(mc == NULL)) 25454 return MDBX_EINVAL; 25455 25456 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 25457 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 25458 : MDBX_EBADSIGN; 25459 25460 if (!(mc->mc_flags & C_INITIALIZED)) 25461 return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; 25462 25463 for (unsigned i = 0; i < mc->mc_snum; ++i) { 25464 if (mc->mc_ki[i]) 25465 return MDBX_RESULT_FALSE; 25466 } 25467 25468 return MDBX_RESULT_TRUE; 25469 } 25470 25471 int mdbx_cursor_on_last(const MDBX_cursor *mc) { 25472 if (unlikely(mc == NULL)) 25473 return MDBX_EINVAL; 25474 25475 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 25476 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 25477 : MDBX_EBADSIGN; 25478 25479 if (!(mc->mc_flags & C_INITIALIZED)) 25480 return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; 25481 25482 for (unsigned i = 0; i < mc->mc_snum; ++i) { 25483 unsigned nkeys = page_numkeys(mc->mc_pg[i]); 25484 if (mc->mc_ki[i] < nkeys - 1) 25485 return MDBX_RESULT_FALSE; 25486 } 25487 25488 return MDBX_RESULT_TRUE; 25489 } 25490 25491 int mdbx_cursor_eof(const MDBX_cursor *mc) { 25492 if (unlikely(mc == NULL)) 25493 return MDBX_EINVAL; 25494 25495 if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) 25496 return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL 25497 : MDBX_EBADSIGN; 25498 25499 return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED && 25500 mc->mc_snum && 25501 mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])) 25502 ? MDBX_RESULT_FALSE 25503 : MDBX_RESULT_TRUE; 25504 } 25505 25506 //------------------------------------------------------------------------------ 25507 25508 struct diff_result { 25509 ptrdiff_t diff; 25510 unsigned level; 25511 int root_nkeys; 25512 }; 25513 25514 /* calculates: r = x - y */ 25515 __hot static int cursor_diff(const MDBX_cursor *const __restrict x, 25516 const MDBX_cursor *const __restrict y, 25517 struct diff_result *const __restrict r) { 25518 r->diff = 0; 25519 r->level = 0; 25520 r->root_nkeys = 0; 25521 25522 if (unlikely(x->mc_signature != MDBX_MC_LIVE)) 25523 return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 25524 : MDBX_EBADSIGN; 25525 25526 if (unlikely(y->mc_signature != MDBX_MC_LIVE)) 25527 return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL 25528 : MDBX_EBADSIGN; 25529 25530 int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); 25531 if (unlikely(rc != MDBX_SUCCESS)) 25532 return rc; 25533 25534 if (unlikely(x->mc_txn != y->mc_txn)) 25535 return MDBX_BAD_TXN; 25536 25537 if (unlikely(y->mc_dbi != x->mc_dbi)) 25538 return MDBX_EINVAL; 25539 25540 if (unlikely(!(y->mc_flags & x->mc_flags & C_INITIALIZED))) 25541 return MDBX_ENODATA; 25542 25543 while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { 25544 if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { 25545 ERROR("Mismatch cursors's pages at %u level", r->level); 25546 return MDBX_PROBLEM; 25547 } 25548 25549 int nkeys = page_numkeys(y->mc_pg[r->level]); 25550 assert(nkeys > 0); 25551 if (r->level == 0) 25552 r->root_nkeys = nkeys; 25553 25554 const int limit_ki = nkeys - 1; 25555 const int x_ki = x->mc_ki[r->level]; 25556 const int y_ki = y->mc_ki[r->level]; 25557 r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) - 25558 ((y_ki < limit_ki) ? y_ki : limit_ki); 25559 if (r->diff == 0) { 25560 r->level += 1; 25561 continue; 25562 } 25563 25564 while (unlikely(r->diff == 1) && 25565 likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { 25566 r->level += 1; 25567 /* DB'PAGEs: 0------------------>MAX 25568 * 25569 * CURSORs: y < x 25570 * STACK[i ]: | 25571 * STACK[+1]: ...y++N|0++x... 25572 */ 25573 nkeys = page_numkeys(y->mc_pg[r->level]); 25574 r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level]; 25575 assert(r->diff > 0); 25576 } 25577 25578 while (unlikely(r->diff == -1) && 25579 likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { 25580 r->level += 1; 25581 /* DB'PAGEs: 0------------------>MAX 25582 * 25583 * CURSORs: x < y 25584 * STACK[i ]: | 25585 * STACK[+1]: ...x--N|0--y... 
25586 */ 25587 nkeys = page_numkeys(x->mc_pg[r->level]); 25588 r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level]; 25589 assert(r->diff < 0); 25590 } 25591 25592 return MDBX_SUCCESS; 25593 } 25594 25595 r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF); 25596 return MDBX_SUCCESS; 25597 } 25598 25599 __hot static ptrdiff_t estimate(const MDBX_db *db, 25600 struct diff_result *const __restrict dr) { 25601 /* root: branch-page => scale = leaf-factor * branch-factor^(N-1) 25602 * level-1: branch-page(s) => scale = leaf-factor * branch-factor^2 25603 * level-2: branch-page(s) => scale = leaf-factor * branch-factor 25604 * level-N: branch-page(s) => scale = leaf-factor 25605 * leaf-level: leaf-page(s) => scale = 1 25606 */ 25607 ptrdiff_t btree_power = (ptrdiff_t)db->md_depth - 2 - (ptrdiff_t)dr->level; 25608 if (btree_power < 0) 25609 return dr->diff; 25610 25611 ptrdiff_t estimated = 25612 (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)db->md_leaf_pages; 25613 if (btree_power == 0) 25614 return estimated; 25615 25616 if (db->md_depth < 4) { 25617 assert(dr->level == 0 && btree_power == 1); 25618 return (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)dr->root_nkeys; 25619 } 25620 25621 /* average_branchpage_fillfactor = total(branch_entries) / branch_pages 25622 total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */ 25623 const size_t log2_fixedpoint = sizeof(size_t) - 1; 25624 const size_t half = UINT64_C(1) << (log2_fixedpoint - 1); 25625 const size_t factor = 25626 ((db->md_leaf_pages + db->md_branch_pages - 1) << log2_fixedpoint) / 25627 db->md_branch_pages; 25628 while (1) { 25629 switch ((size_t)btree_power) { 25630 default: { 25631 const size_t square = (factor * factor + half) >> log2_fixedpoint; 25632 const size_t quad = (square * square + half) >> log2_fixedpoint; 25633 do { 25634 estimated = estimated * quad + half; 25635 estimated >>= log2_fixedpoint; 25636 btree_power -= 4; 25637 } while (btree_power >= 4); 25638 continue; 25639 } 25640 case 3: 25641 estimated = estimated * factor + half; 25642 estimated >>= log2_fixedpoint; 25643 __fallthrough /* fall through */; 25644 case 2: 25645 estimated = estimated * factor + half; 25646 estimated >>= log2_fixedpoint; 25647 __fallthrough /* fall through */; 25648 case 1: 25649 estimated = estimated * factor + half; 25650 estimated >>= log2_fixedpoint; 25651 __fallthrough /* fall through */; 25652 case 0: 25653 if (unlikely(estimated > (ptrdiff_t)db->md_entries)) 25654 return (ptrdiff_t)db->md_entries; 25655 if (unlikely(estimated < -(ptrdiff_t)db->md_entries)) 25656 return -(ptrdiff_t)db->md_entries; 25657 return estimated; 25658 } 25659 } 25660 } 25661 25662 int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, 25663 ptrdiff_t *distance_items) { 25664 if (unlikely(first == NULL || last == NULL || distance_items == NULL)) 25665 return MDBX_EINVAL; 25666 25667 *distance_items = 0; 25668 struct diff_result dr; 25669 int rc = cursor_diff(last, first, &dr); 25670 if (unlikely(rc != MDBX_SUCCESS)) 25671 return rc; 25672 25673 if (unlikely(dr.diff == 0) && 25674 F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags, 25675 MDBX_DUPSORT | C_INITIALIZED)) { 25676 first = &first->mc_xcursor->mx_cursor; 25677 last = &last->mc_xcursor->mx_cursor; 25678 rc = cursor_diff(first, last, &dr); 25679 if (unlikely(rc != MDBX_SUCCESS)) 25680 return rc; 25681 } 25682 25683 if (likely(dr.diff != 0)) 25684 *distance_items = estimate(first->mc_db, &dr); 25685 25686 return MDBX_SUCCESS; 25687 } 25688 
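/* A minimal usage sketch (disabled, for illustration only): estimate how many
 * items lie between two positions of the same table, e.g. for query planning.
 * The guess_range_width() helper, its parameters and the MDBX_SET_RANGE
 * positioning are assumptions of this example, not part of the library. */
#if 0 /* example */
static int guess_range_width(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *lo,
                             MDBX_val *hi, ptrdiff_t *width) {
  MDBX_cursor *first, *last;
  int rc = mdbx_cursor_open(txn, dbi, &first);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_cursor_open(txn, dbi, &last);
  if (rc == MDBX_SUCCESS) {
    MDBX_val stub = {0, 0};
    /* position the cursors at the lower/upper boundary keys */
    rc = mdbx_cursor_get(first, lo, &stub, MDBX_SET_RANGE);
    if (rc == MDBX_SUCCESS)
      rc = mdbx_cursor_get(last, hi, &stub, MDBX_SET_RANGE);
    if (rc == MDBX_SUCCESS)
      /* the result is a b-tree based estimation, not an exact count */
      rc = mdbx_estimate_distance(first, last, width);
    mdbx_cursor_close(last);
  }
  mdbx_cursor_close(first);
  return rc;
}
#endif /* example */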
25689 int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data,
25690                        MDBX_cursor_op move_op, ptrdiff_t *distance_items) {
25691   if (unlikely(cursor == NULL || distance_items == NULL ||
25692                move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
25693     return MDBX_EINVAL;
25694 
25695   if (unlikely(cursor->mc_signature != MDBX_MC_LIVE))
25696     return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
25697                                                          : MDBX_EBADSIGN;
25698 
25699   int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED);
25700   if (unlikely(rc != MDBX_SUCCESS))
25701     return rc;
25702 
25703   if (!(cursor->mc_flags & C_INITIALIZED))
25704     return MDBX_ENODATA;
25705 
25706   MDBX_cursor_couple next;
25707   cursor_copy(cursor, &next.outer);
25708   if (cursor->mc_db->md_flags & MDBX_DUPSORT) {
25709     next.outer.mc_xcursor = &next.inner;
25710     rc = cursor_xinit0(&next.outer);
25711     if (unlikely(rc != MDBX_SUCCESS))
25712       return rc;
25713     MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner;
25714     cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor);
25715   }
25716 
25717   MDBX_val stub = {0, 0};
25718   if (data == NULL) {
25719     const unsigned mask =
25720         1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
25721     if (unlikely(mask & (1 << move_op)))
25722       return MDBX_EINVAL;
25723     data = &stub;
25724   }
25725 
25726   if (key == NULL) {
25727     const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE |
25728                           1 << MDBX_SET_KEY | 1 << MDBX_SET |
25729                           1 << MDBX_SET_RANGE;
25730     if (unlikely(mask & (1 << move_op)))
25731       return MDBX_EINVAL;
25732     key = &stub;
25733   }
25734 
25735   next.outer.mc_signature = MDBX_MC_LIVE;
25736   rc = mdbx_cursor_get(&next.outer, key, data, move_op);
25737   if (unlikely(rc != MDBX_SUCCESS &&
25738                (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED))))
25739     return rc;
25740 
25741   return mdbx_estimate_distance(cursor, &next.outer, distance_items);
25742 }
25743 
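/* A minimal sketch (disabled, for illustration only) of mdbx_estimate_move():
 * gauge how far a cursor step would travel without actually moving the
 * cursor. The helper name and the choice of MDBX_NEXT_NODUP are assumptions
 * of this example. */
#if 0 /* example */
static int estimate_next_key_distance(const MDBX_cursor *cursor,
                                      ptrdiff_t *distance) {
  /* distance from the current position to the first item of the next key;
   * for a MDBX_DUPSORT table this approximates the number of remaining
   * duplicates of the current key */
  MDBX_val key, data;
  return mdbx_estimate_move(cursor, &key, &data, MDBX_NEXT_NODUP, distance);
}
#endif /* example */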
25744 int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key,
25745                         MDBX_val *begin_data, MDBX_val *end_key,
25746                         MDBX_val *end_data, ptrdiff_t *size_items) {
25747   int rc = check_txn(txn, MDBX_TXN_BLOCKED);
25748   if (unlikely(rc != MDBX_SUCCESS))
25749     return rc;
25750 
25751   if (unlikely(!size_items))
25752     return MDBX_EINVAL;
25753 
25754   if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON)))
25755     return MDBX_EINVAL;
25756 
25757   if (unlikely(end_data && (end_key == NULL || end_key == MDBX_EPSILON)))
25758     return MDBX_EINVAL;
25759 
25760   if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
25761     return MDBX_EINVAL;
25762 
25763   if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
25764     return MDBX_BAD_DBI;
25765 
25766   MDBX_cursor_couple begin;
25767   /* LY: first, initialize cursor to refresh a DB in case it has DB_STALE */
25768   rc = cursor_init(&begin.outer, txn, dbi);
25769   if (unlikely(rc != MDBX_SUCCESS))
25770     return rc;
25771 
25772   if (unlikely(begin.outer.mc_db->md_entries == 0)) {
25773     *size_items = 0;
25774     return MDBX_SUCCESS;
25775   }
25776 
25777   if (!begin_key) {
25778     if (unlikely(!end_key)) {
25779       /* LY: FIRST..LAST case */
25780       *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
25781       return MDBX_SUCCESS;
25782     }
25783     MDBX_val stub = {0, 0};
25784     rc = cursor_first(&begin.outer, &stub, &stub);
25785     if (unlikely(end_key == MDBX_EPSILON)) {
25786       /* LY: FIRST..+epsilon case */
25787       return (rc == MDBX_SUCCESS)
25788                  ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
25789                  : rc;
25790     }
25791   } else {
25792     if (unlikely(begin_key == MDBX_EPSILON)) {
25793       if (end_key == NULL) {
25794         /* LY: -epsilon..LAST case */
25795         MDBX_val stub = {0, 0};
25796         rc = cursor_last(&begin.outer, &stub, &stub);
25797         return (rc == MDBX_SUCCESS)
25798                    ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
25799                    : rc;
25800       }
25801       /* LY: -epsilon..value case */
25802       assert(end_key != MDBX_EPSILON);
25803       begin_key = end_key;
25804     } else if (unlikely(end_key == MDBX_EPSILON)) {
25805       /* LY: value..+epsilon case */
25806       assert(begin_key != MDBX_EPSILON);
25807       end_key = begin_key;
25808     }
25809     if (end_key && !begin_data && !end_data &&
25810         (begin_key == end_key ||
25811          begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) {
25812       /* LY: single key case */
25813       rc = cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err;
25814       if (unlikely(rc != MDBX_SUCCESS)) {
25815         *size_items = 0;
25816         return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
25817       }
25818       *size_items = 1;
25819       if (begin.outer.mc_xcursor != NULL) {
25820         MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top],
25821                                     begin.outer.mc_ki[begin.outer.mc_top]);
25822         if (node_flags(node) & F_DUPDATA) {
25823           /* LY: return the number of duplicates for given key */
25824           tASSERT(txn, begin.outer.mc_xcursor == &begin.inner &&
25825                            (begin.inner.mx_cursor.mc_flags & C_INITIALIZED));
25826           *size_items =
25827               (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) ||
25828                begin.inner.mx_db.md_entries <= PTRDIFF_MAX)
25829                   ? (size_t)begin.inner.mx_db.md_entries
25830                   : PTRDIFF_MAX;
25831         }
25832       }
25833       return MDBX_SUCCESS;
25834     } else {
25835       rc = cursor_set(&begin.outer, begin_key, begin_data,
25836                       begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
25837                .err;
25838     }
25839   }
25840 
25841   if (unlikely(rc != MDBX_SUCCESS)) {
25842     if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED))
25843       return rc;
25844   }
25845 
25846   MDBX_cursor_couple end;
25847   rc = cursor_init(&end.outer, txn, dbi);
25848   if (unlikely(rc != MDBX_SUCCESS))
25849     return rc;
25850   if (!end_key) {
25851     MDBX_val stub = {0, 0};
25852     rc = cursor_last(&end.outer, &stub, &stub);
25853   } else {
25854     rc = cursor_set(&end.outer, end_key, end_data,
25855                     end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
25856              .err;
25857   }
25858   if (unlikely(rc != MDBX_SUCCESS)) {
25859     if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED))
25860       return rc;
25861   }
25862 
25863   rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
25864   if (unlikely(rc != MDBX_SUCCESS))
25865     return rc;
25866   assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries &&
25867          *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
25868 
25869 #if 0 /* LY: It was decided to return the estimation results as-is \
25870        * (i.e. negative) for inverted ranges.
*/
25871 
25872   /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
25873      Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */
25874 
25875   if (*size_items < 0) {
25876     /* LY: inverted range case */
25877     *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries;
25878   } else if (*size_items == 0 && begin_key && end_key) {
25879     int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key);
25880     if (cmp == 0 && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED) &&
25881         begin_data && end_data)
25882       cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data);
25883     if (cmp > 0) {
25884       /* LY: inverted range case with empty scope */
25885       *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
25886     }
25887   }
25888   assert(*size_items >= 0 &&
25889          *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
25890 #endif
25891 
25892   return MDBX_SUCCESS;
25893 }
25894 
25895 //------------------------------------------------------------------------------
25896 
25897 /* Allows updating or deleting an existing record while fetching its
25898  * previous data value into old_data. If new_data is NULL the record is
25899  * deleted, otherwise it is updated/inserted.
25900  *
25901  * The current value may reside in a page that has already been modified
25902  * (i.e. is dirty). In that case the page will be overwritten during the
25903  * update and the old value itself lost. Therefore old_data must initially
25904  * provide an auxiliary buffer into which the old value can be copied.
25905  * If the given buffer is too small, the function returns -1 and sets
25906  * old_data->iov_len to the required size.
25907  *
25908  * For non-unique keys a second usage scenario is also possible, where
25909  * old_data selects which particular record among those with the same key
25910  * is to be deleted/updated. To request this scenario, specify both
25911  * MDBX_CURRENT and MDBX_NOOVERWRITE in flags. This combination was chosen
25912  * because it is otherwise meaningless, which makes such requests
25913  * unambiguous to identify.
25914  *
25915  * The function could be superseded by the corresponding cursor operations
25916  * after two improvements (TODO):
25917  *  - external allocation of cursors, including on the stack (no malloc);
25918  *  - querying the dirty status of a page by address (to know about
 *    MUTABLE/WRITEABLE). */
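/* A minimal usage sketch (disabled, for illustration only) of the dual-buffer
 * scenario described above, via the public mdbx_replace(): update a record
 * while capturing its previous value, growing the caller's buffer when the
 * function reports it is too small (MDBX_RESULT_TRUE, i.e. -1). The helper
 * name and the initial buffer size are assumptions of this example. */
#if 0 /* example */
static int replace_keeping_old(MDBX_txn *txn, MDBX_dbi dbi,
                               const MDBX_val *key, MDBX_val *new_data) {
  size_t guess = 256; /* assumed initial guess */
  void *buf = malloc(guess);
  if (!buf)
    return MDBX_ENOMEM;
  MDBX_val old_data = {buf, guess};
  int rc = mdbx_replace(txn, dbi, key, new_data, &old_data, MDBX_UPSERT);
  if (rc == MDBX_RESULT_TRUE) {
    /* too small: old_data.iov_len now holds the required size */
    void *bigger = realloc(buf, old_data.iov_len);
    if (bigger) {
      old_data.iov_base = buf = bigger;
      rc = mdbx_replace(txn, dbi, key, new_data, &old_data, MDBX_UPSERT);
    } else
      rc = MDBX_ENOMEM;
  }
  /* on success old_data holds a copy of the previous value, if any */
  free(buf);
  return rc;
}
#endif /* example */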
25919 */ 25920 25921 int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, 25922 MDBX_val *new_data, MDBX_val *old_data, 25923 MDBX_put_flags_t flags, MDBX_preserve_func preserver, 25924 void *preserver_context) { 25925 int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); 25926 if (unlikely(rc != MDBX_SUCCESS)) 25927 return rc; 25928 25929 if (unlikely(!key || !old_data || old_data == new_data)) 25930 return MDBX_EINVAL; 25931 25932 if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) 25933 return MDBX_EINVAL; 25934 25935 if (unlikely(new_data == NULL && 25936 (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) 25937 return MDBX_EINVAL; 25938 25939 if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) 25940 return MDBX_BAD_DBI; 25941 25942 if (unlikely(flags & 25943 ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | 25944 MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) 25945 return MDBX_EINVAL; 25946 25947 MDBX_cursor_couple cx; 25948 rc = cursor_init(&cx.outer, txn, dbi); 25949 if (unlikely(rc != MDBX_SUCCESS)) 25950 return rc; 25951 cx.outer.mc_next = txn->mt_cursors[dbi]; 25952 txn->mt_cursors[dbi] = &cx.outer; 25953 25954 MDBX_val present_key = *key; 25955 if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { 25956 /* в old_data значение для выбора конкретного дубликата */ 25957 if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) { 25958 rc = MDBX_EINVAL; 25959 goto bailout; 25960 } 25961 25962 /* убираем лишний бит, он был признаком запрошенного режима */ 25963 flags -= MDBX_NOOVERWRITE; 25964 25965 rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH); 25966 if (rc != MDBX_SUCCESS) 25967 goto bailout; 25968 } else { 25969 /* в old_data буфер для сохранения предыдущего значения */ 25970 if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) 25971 return MDBX_EINVAL; 25972 MDBX_val present_data; 25973 rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY); 25974 if (unlikely(rc != MDBX_SUCCESS)) { 25975 old_data->iov_base = NULL; 25976 old_data->iov_len = 0; 25977 if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) 25978 goto bailout; 25979 } else if (flags & MDBX_NOOVERWRITE) { 25980 rc = MDBX_KEYEXIST; 25981 *old_data = present_data; 25982 goto bailout; 25983 } else { 25984 MDBX_page *page = cx.outer.mc_pg[cx.outer.mc_top]; 25985 if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) { 25986 if (flags & MDBX_CURRENT) { 25987 /* disallow update/delete for multi-values */ 25988 MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); 25989 if (node_flags(node) & F_DUPDATA) { 25990 tASSERT(txn, XCURSOR_INITED(&cx.outer) && 25991 cx.outer.mc_xcursor->mx_db.md_entries > 1); 25992 if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { 25993 rc = MDBX_EMULTIVAL; 25994 goto bailout; 25995 } 25996 } 25997 /* В оригинальной LMDB флажок MDBX_CURRENT здесь приведет 25998 * к замене данных без учета MDBX_DUPSORT сортировки, 25999 * но здесь это в любом случае допустимо, так как мы 26000 * проверили что для ключа есть только одно значение. */ 26001 } 26002 } 26003 26004 if (IS_MODIFIABLE(txn, page)) { 26005 if (new_data && cmp_lenfast(&present_data, new_data) == 0) { 26006 /* если данные совпадают, то ничего делать не надо */ 26007 *old_data = *new_data; 26008 goto bailout; 26009 } 26010 rc = preserver ? 
preserver(preserver_context, old_data,
26011                               present_data.iov_base, present_data.iov_len)
26012                        : MDBX_SUCCESS;
26013         if (unlikely(rc != MDBX_SUCCESS))
26014           goto bailout;
26015       } else {
26016         *old_data = present_data;
26017       }
26018       flags |= MDBX_CURRENT;
26019     }
26020   }
26021 
26022   if (likely(new_data))
26023     rc = mdbx_cursor_put(&cx.outer, key, new_data, flags);
26024   else
26025     rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
26026 
26027 bailout:
26028   txn->mt_cursors[dbi] = cx.outer.mc_next;
26029   return rc;
26030 }
26031 
26032 static int default_value_preserver(void *context, MDBX_val *target,
26033                                    const void *src, size_t bytes) {
26034   (void)context;
26035   if (unlikely(target->iov_len < bytes)) {
26036     target->iov_base = nullptr;
26037     target->iov_len = bytes;
26038     return MDBX_RESULT_TRUE;
26039   }
26040   memcpy(target->iov_base, src, target->iov_len = bytes);
26041   return MDBX_SUCCESS;
26042 }
26043 
26044 int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
26045                  MDBX_val *new_data, MDBX_val *old_data,
26046                  MDBX_put_flags_t flags) {
26047   return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
26048                          default_value_preserver, nullptr);
26049 }
26050 
26051 /* Reports whether the given address is located in a "dirty" page of the
26052  * specified write transaction. Ultimately this allows avoiding needless
26053  * copying of data from NOT-dirty pages.
26054  *
26055  * "Dirty" pages are those that have already been changed during the write
26056  * transaction. Accordingly, any further changes may lead to such pages
26057  * being overwritten. Therefore all modifying functions must NOT receive
26058  * pointers to data in those pages as arguments. In turn, "NOT dirty"
26059  * pages will be copied before modification.
26060  *
26061  * In other words, data from dirty pages must either be copied before
26062  * being passed as arguments for further modifications, or rejected at
26063  * the argument-validation stage.
26064  *
26065  * Thus the function allows both getting rid of needless copying and
26066  * performing a more complete validation of arguments.
26067  *
26068  * IMPORTANT: The passed pointer must point to the beginning of the data.
26069  * Only this guarantees that the actual page header is physically
26070  * located in the same memory page, including for multi-page
26071  * P_OVERFLOW pages with long data. */
26072 
26073 int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
26074   int rc = check_txn(txn, MDBX_TXN_BLOCKED);
26075   if (unlikely(rc != MDBX_SUCCESS))
26076     return rc;
26077 
26078   const MDBX_env *env = txn->mt_env;
26079   const ptrdiff_t offset = (uint8_t *)ptr - env->me_map;
26080   if (offset >= 0) {
26081     const pgno_t pgno = bytes2pgno(env, offset);
26082     if (likely(pgno < txn->mt_next_pgno)) {
26083       const MDBX_page *page = pgno2page(env, pgno);
26084       if (unlikely(page->mp_pgno != pgno ||
26085                    (page->mp_flags & P_ILL_BITS) != 0)) {
26086         /* The ptr pointed into middle of a large page,
26087          * not to the beginning of a data. */
26088         return MDBX_EINVAL;
26089       }
26090       return ((txn->mt_flags & MDBX_TXN_RDONLY) || !IS_MODIFIABLE(txn, page))
26091                  ? MDBX_RESULT_FALSE
26092                  : MDBX_RESULT_TRUE;
26093     }
26094     if ((size_t)offset < env->me_dxb_mmap.limit) {
26095       /* The pointer addresses something within the mmap, but beyond the
26096        * allocated pages. This can happen if mdbx_is_dirty() is called
26097        * after an operation during which a dirty page was returned to the
26098        * unallocated space. */
26099       return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
26100     }
26101   }
26102 
26103   /* The page is outside the used mmap range, i.e. either an invalid
26104    * address was passed to the function, or the address is within a shadow
26105    * page that was allocated via malloc().
26106    *
26107    * For the MDBX_WRITE_MAP mode the page is definitely "not dirty", while
26108    * for modes without MDBX_WRITE_MAP it is definitely "not clean". */
26109   return (txn->mt_flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
26110                                                              : MDBX_RESULT_TRUE;
26111 }
26112 
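/* A minimal sketch (disabled, for illustration only) of the check described
 * above: decide whether a value just read may be handed back into a modifying
 * operation directly, or must be copied into a caller-owned buffer first.
 * The helper name and its parameters are assumptions of this example. */
#if 0 /* example */
static int fetch_stable_value(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                              void *buf, size_t bufsize, MDBX_val *out) {
  MDBX_val data;
  int rc = mdbx_get(txn, dbi, key, &data);
  if (rc != MDBX_SUCCESS)
    return rc;
  if (mdbx_is_dirty(txn, data.iov_base) == MDBX_RESULT_TRUE) {
    /* the value lives in a dirty page and may be overwritten by further
     * modifications within this transaction: take a private copy */
    if (data.iov_len > bufsize)
      return MDBX_RESULT_TRUE; /* signal: caller's buffer is too small */
    memcpy(buf, data.iov_base, data.iov_len);
    out->iov_base = buf;
  } else
    out->iov_base = data.iov_base; /* safe to use in-place */
  out->iov_len = data.iov_len;
  return MDBX_SUCCESS;
}
#endif /* example */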
26113 int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
26114                       uint64_t increment) {
26115   int rc = check_txn(txn, MDBX_TXN_BLOCKED);
26116   if (unlikely(rc != MDBX_SUCCESS))
26117     return rc;
26118 
26119   if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
26120     return MDBX_BAD_DBI;
26121 
26122   if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
26123     rc = fetch_sdb(txn, dbi);
26124     if (unlikely(rc != MDBX_SUCCESS))
26125       return rc;
26126   }
26127 
26128   MDBX_db *dbs = &txn->mt_dbs[dbi];
26129   if (likely(result))
26130     *result = dbs->md_seq;
26131 
26132   if (likely(increment > 0)) {
26133     if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
26134       return MDBX_EACCESS;
26135 
26136     uint64_t new = dbs->md_seq + increment;
26137     if (unlikely(new < increment))
26138       return MDBX_RESULT_TRUE;
26139 
26140     tASSERT(txn, new > dbs->md_seq);
26141     dbs->md_seq = new;
26142     txn->mt_flags |= MDBX_TXN_DIRTY;
26143     txn->mt_dbistate[dbi] |= DBI_DIRTY;
26144   }
26145 
26146   return MDBX_SUCCESS;
26147 }
26148 
26149 /*----------------------------------------------------------------------------*/
26150 
26151 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
26152 __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) {
26153   return __inline_mdbx_limits_pgsize_min();
26154 }
26155 
26156 __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) {
26157   return __inline_mdbx_limits_pgsize_max();
26158 }
26159 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
26160 
26161 __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
26162   if (pagesize < 1)
26163     pagesize = (intptr_t)mdbx_default_pagesize();
26164   else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
26165                     pagesize > (intptr_t)MAX_PAGESIZE ||
26166                     !is_powerof2((size_t)pagesize)))
26167     return -1;
26168 
26169   return MIN_PAGENO * pagesize;
26170 }
26171 
26172 __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
26173   if (pagesize < 1)
26174     pagesize = (intptr_t)mdbx_default_pagesize();
26175   else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
26176                     pagesize > (intptr_t)MAX_PAGESIZE ||
26177                     !is_powerof2((size_t)pagesize)))
26178     return -1;
26179 
26180   STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
26181   const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
26182   return (limit < MAX_MAPSIZE) ?
(intptr_t)limit : (intptr_t)MAX_MAPSIZE; 26183 } 26184 26185 __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { 26186 if (pagesize < 1) 26187 pagesize = (intptr_t)mdbx_default_pagesize(); 26188 else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || 26189 pagesize > (intptr_t)MAX_PAGESIZE || 26190 !is_powerof2((size_t)pagesize))) 26191 return -1; 26192 26193 STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); 26194 const uint64_t pgl_limit = 26195 pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482); 26196 const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482); 26197 return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit; 26198 } 26199 26200 /*** Key-making functions to avoid custom comparators *************************/ 26201 26202 static __always_inline double key2double(const int64_t key) { 26203 union { 26204 uint64_t u; 26205 double f; 26206 } casting; 26207 26208 casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000) 26209 : UINT64_C(0xffffFFFFffffFFFF) - key; 26210 return casting.f; 26211 } 26212 26213 static __always_inline uint64_t double2key(const double *const ptr) { 26214 STATIC_ASSERT(sizeof(double) == sizeof(int64_t)); 26215 const int64_t i = *(const int64_t *)ptr; 26216 const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i 26217 : i + UINT64_C(0x8000000000000000); 26218 if (ASSERT_ENABLED()) { 26219 const double f = key2double(u); 26220 assert(memcmp(&f, ptr, 8) == 0); 26221 } 26222 return u; 26223 } 26224 26225 static __always_inline float key2float(const int32_t key) { 26226 union { 26227 uint32_t u; 26228 float f; 26229 } casting; 26230 26231 casting.u = 26232 (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key; 26233 return casting.f; 26234 } 26235 26236 static __always_inline uint32_t float2key(const float *const ptr) { 26237 STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); 26238 const int32_t i = *(const int32_t *)ptr; 26239 const uint32_t u = 26240 (i < 0) ? 
UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); 26241 if (ASSERT_ENABLED()) { 26242 const float f = key2float(u); 26243 assert(memcmp(&f, ptr, 4) == 0); 26244 } 26245 return u; 26246 } 26247 26248 uint64_t mdbx_key_from_double(const double ieee754_64bit) { 26249 return double2key(&ieee754_64bit); 26250 } 26251 26252 uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) { 26253 return double2key(ieee754_64bit); 26254 } 26255 26256 uint32_t mdbx_key_from_float(const float ieee754_32bit) { 26257 return float2key(&ieee754_32bit); 26258 } 26259 26260 uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { 26261 return float2key(ieee754_32bit); 26262 } 26263 26264 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API 26265 MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) { 26266 return __inline_mdbx_key_from_int64(i64); 26267 } 26268 26269 MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) { 26270 return __inline_mdbx_key_from_int32(i32); 26271 } 26272 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ 26273 26274 #define IEEE754_DOUBLE_MANTISSA_SIZE 52 26275 #define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF 26276 #define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF 26277 #define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000) 26278 #define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF) 26279 #define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF) 26280 26281 static __inline int clz64(uint64_t value) { 26282 #if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl) 26283 if (sizeof(value) == sizeof(int)) 26284 return __builtin_clz(value); 26285 if (sizeof(value) == sizeof(long)) 26286 return __builtin_clzl(value); 26287 #if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ 26288 __has_builtin(__builtin_clzll) 26289 return __builtin_clzll(value); 26290 #endif /* have(long long) && long long == uint64_t */ 26291 #endif /* GNU C */ 26292 26293 #if defined(_MSC_VER) 26294 unsigned long index; 26295 #if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) 26296 _BitScanReverse64(&index, value); 26297 return 63 - index; 26298 #else 26299 if (value > UINT32_MAX) { 26300 _BitScanReverse(&index, (uint32_t)(value >> 32)); 26301 return 31 - index; 26302 } 26303 _BitScanReverse(&index, (uint32_t)value); 26304 return 63 - index; 26305 #endif 26306 #endif /* MSVC */ 26307 26308 value |= value >> 1; 26309 value |= value >> 2; 26310 value |= value >> 4; 26311 value |= value >> 8; 26312 value |= value >> 16; 26313 value |= value >> 32; 26314 static const uint8_t debruijn_clz64[64] = { 26315 63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2, 26316 9, 5, 28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1, 26317 17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18, 26318 38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0}; 26319 return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58]; 26320 } 26321 26322 static __inline uint64_t round_mantissa(const uint64_t u64, int shift) { 26323 assert(shift < 0 && u64 > 0); 26324 shift = -shift; 26325 const unsigned half = 1 << (shift - 1); 26326 const unsigned lsb = 1 & (unsigned)(u64 >> shift); 26327 const unsigned tie2even = 1 ^ lsb; 26328 return (u64 + half - tie2even) >> shift; 26329 } 26330 26331 uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { 26332 const uint64_t bias = UINT64_C(0x8000000000000000); 26333 if (json_integer > 0) { 26334 const uint64_t u64 = json_integer; 26335 int shift = clz64(u64) - (64 - 
IEEE754_DOUBLE_MANTISSA_SIZE - 1); 26336 uint64_t mantissa = u64 << shift; 26337 if (unlikely(shift < 0)) { 26338 mantissa = round_mantissa(u64, shift); 26339 if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) 26340 mantissa = round_mantissa(u64, --shift); 26341 } 26342 26343 assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && 26344 mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); 26345 const uint64_t exponent = 26346 IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; 26347 assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); 26348 const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + 26349 (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); 26350 #if !defined(_MSC_VER) || \ 26351 defined( \ 26352 _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ 26353 symbol __except1 referenced in function __ftol3_except */ 26354 assert(key == mdbx_key_from_double((double)json_integer)); 26355 #endif /* Workaround for MSVC */ 26356 return key; 26357 } 26358 26359 if (json_integer < 0) { 26360 const uint64_t u64 = -json_integer; 26361 int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); 26362 uint64_t mantissa = u64 << shift; 26363 if (unlikely(shift < 0)) { 26364 mantissa = round_mantissa(u64, shift); 26365 if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) 26366 mantissa = round_mantissa(u64, --shift); 26367 } 26368 26369 assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && 26370 mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); 26371 const uint64_t exponent = 26372 IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; 26373 assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); 26374 const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - 26375 (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); 26376 #if !defined(_MSC_VER) || \ 26377 defined( \ 26378 _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ 26379 symbol __except1 referenced in function __ftol3_except */ 26380 assert(key == mdbx_key_from_double((double)json_integer)); 26381 #endif /* Workaround for MSVC */ 26382 return key; 26383 } 26384 26385 return bias; 26386 } 26387 26388 int64_t mdbx_jsonInteger_from_key(const MDBX_val v) { 26389 assert(v.iov_len == 8); 26390 const uint64_t key = unaligned_peek_u64(2, v.iov_base); 26391 const uint64_t bias = UINT64_C(0x8000000000000000); 26392 const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1; 26393 const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 - 26394 (IEEE754_DOUBLE_EXPONENTA_MAX & 26395 (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE)); 26396 if (unlikely(shift < 1)) 26397 return (key < bias) ? INT64_MIN : INT64_MAX; 26398 if (unlikely(shift > 63)) 26399 return 0; 26400 26401 const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK) 26402 << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) + 26403 bias; 26404 const int64_t absolute = unscaled >> shift; 26405 const int64_t value = (key < bias) ? 
-absolute : absolute; 26406 assert(key == mdbx_key_from_jsonInteger(value) || 26407 (mdbx_key_from_jsonInteger(value - 1) < key && 26408 key < mdbx_key_from_jsonInteger(value + 1))); 26409 return value; 26410 } 26411 26412 double mdbx_double_from_key(const MDBX_val v) { 26413 assert(v.iov_len == 8); 26414 return key2double(unaligned_peek_u64(2, v.iov_base)); 26415 } 26416 26417 float mdbx_float_from_key(const MDBX_val v) { 26418 assert(v.iov_len == 4); 26419 return key2float(unaligned_peek_u32(2, v.iov_base)); 26420 } 26421 26422 int32_t mdbx_int32_from_key(const MDBX_val v) { 26423 assert(v.iov_len == 4); 26424 return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000)); 26425 } 26426 26427 int64_t mdbx_int64_from_key(const MDBX_val v) { 26428 assert(v.iov_len == 8); 26429 return (int64_t)(unaligned_peek_u64(2, v.iov_base) - 26430 UINT64_C(0x8000000000000000)); 26431 } 26432 26433 __cold MDBX_cmp_func *mdbx_get_keycmp(unsigned flags) { 26434 return get_default_keycmp(flags); 26435 } 26436 26437 __cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) { 26438 return get_default_datacmp(flags); 26439 } 26440 26441 __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, 26442 uint64_t value) { 26443 int err = check_env(env, false); 26444 if (unlikely(err != MDBX_SUCCESS)) 26445 return err; 26446 26447 const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && 26448 env->me_txn0->mt_owner != osal_thread_self()); 26449 bool should_unlock = false; 26450 switch (option) { 26451 case MDBX_opt_sync_bytes: 26452 if (value == UINT64_MAX) 26453 value = SIZE_MAX - 65536; 26454 if (unlikely(env->me_flags & MDBX_RDONLY)) 26455 return MDBX_EACCESS; 26456 if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) 26457 return MDBX_EPERM; 26458 if (unlikely(value > SIZE_MAX - 65536)) 26459 return MDBX_TOO_LARGE; 26460 if (atomic_store32(&env->me_lck->mti_autosync_threshold, 26461 bytes2pgno(env, (size_t)value + env->me_psize - 1), 26462 mo_Relaxed) != 0 && 26463 (env->me_flags & MDBX_ENV_ACTIVE)) { 26464 err = mdbx_env_sync_poll(env); 26465 if (unlikely(MDBX_IS_ERROR(err))) 26466 return err; 26467 err = MDBX_SUCCESS; 26468 } 26469 break; 26470 26471 case MDBX_opt_sync_period: 26472 if (value == UINT64_MAX) 26473 value = UINT32_MAX; 26474 if (unlikely(env->me_flags & MDBX_RDONLY)) 26475 return MDBX_EACCESS; 26476 if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) 26477 return MDBX_EPERM; 26478 if (unlikely(value > UINT32_MAX)) 26479 return MDBX_TOO_LARGE; 26480 if (atomic_store64(&env->me_lck->mti_autosync_period, 26481 osal_16dot16_to_monotime((uint32_t)value), 26482 mo_Relaxed) != 0 && 26483 (env->me_flags & MDBX_ENV_ACTIVE)) { 26484 err = mdbx_env_sync_poll(env); 26485 if (unlikely(MDBX_IS_ERROR(err))) 26486 return err; 26487 err = MDBX_SUCCESS; 26488 } 26489 break; 26490 26491 case MDBX_opt_max_db: 26492 if (value == UINT64_MAX) 26493 value = MDBX_MAX_DBI; 26494 if (unlikely(value > MDBX_MAX_DBI)) 26495 return MDBX_EINVAL; 26496 if (unlikely(env->me_map)) 26497 return MDBX_EPERM; 26498 env->me_maxdbs = (unsigned)value + CORE_DBS; 26499 break; 26500 26501 case MDBX_opt_max_readers: 26502 if (value == UINT64_MAX) 26503 value = MDBX_READERS_LIMIT; 26504 if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) 26505 return MDBX_EINVAL; 26506 if (unlikely(env->me_map)) 26507 return MDBX_EPERM; 26508 env->me_maxreaders = (unsigned)value; 26509 break; 26510 26511 case MDBX_opt_dp_reserve_limit: 26512 if (value == UINT64_MAX) 26513 value = INT_MAX; 26514 if (unlikely(value 
> INT_MAX))
26515       return MDBX_EINVAL;
26516     if (env->me_options.dp_reserve_limit != (unsigned)value) {
26517       if (lock_needed) {
26518         err = mdbx_txn_lock(env, false);
26519         if (unlikely(err != MDBX_SUCCESS))
26520           return err;
26521         should_unlock = true;
26522       }
26523       env->me_options.dp_reserve_limit = (unsigned)value;
26524       while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) {
26525         eASSERT(env, env->me_dp_reserve != NULL);
26526         MDBX_page *dp = env->me_dp_reserve;
26527         MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
26528         VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
26529         env->me_dp_reserve = dp->mp_next;
26530         VALGRIND_MEMPOOL_FREE(env, dp);
26531         osal_free(dp);
26532         env->me_dp_reserve_len -= 1;
26533       }
26534     }
26535     break;
26536 
26537   case MDBX_opt_rp_augment_limit:
26538     if (value == UINT64_MAX)
26539       value = MDBX_PGL_LIMIT;
26540     if (unlikely(value > MDBX_PGL_LIMIT))
26541       return MDBX_EINVAL;
26542     env->me_options.rp_augment_limit = (unsigned)value;
26543     break;
26544 
26545   case MDBX_opt_txn_dp_limit:
26546   case MDBX_opt_txn_dp_initial:
26547     if (value == UINT64_MAX)
26548       value = MDBX_PGL_LIMIT;
26549     if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4))
26550       return MDBX_EINVAL;
26551     if (unlikely(env->me_flags & MDBX_RDONLY))
26552       return MDBX_EACCESS;
26553     if (lock_needed) {
26554       err = mdbx_txn_lock(env, false);
26555       if (unlikely(err != MDBX_SUCCESS))
26556         return err;
26557       should_unlock = true;
26558     }
26559     if (env->me_txn)
26560       err = MDBX_EPERM /* unable to change during a transaction */;
26561     else {
26562       const pgno_t value32 = (pgno_t)value;
26563       if (option == MDBX_opt_txn_dp_initial &&
26564           env->me_options.dp_initial != value32) {
26565         env->me_options.dp_initial = value32;
26566         if (env->me_options.dp_limit < value32) {
26567           env->me_options.dp_limit = value32;
26568           env->me_options.flags.non_auto.dp_limit = 1;
26569         }
26570       }
26571       if (option == MDBX_opt_txn_dp_limit &&
26572           env->me_options.dp_limit != value32) {
26573         env->me_options.dp_limit = value32;
26574         env->me_options.flags.non_auto.dp_limit = 1;
26575         if (env->me_options.dp_initial > value32)
26576           env->me_options.dp_initial = value32;
26577       }
26578     }
26579     break;
26580 
26581   case MDBX_opt_spill_max_denominator:
26582     if (value == UINT64_MAX)
26583       value = 255;
26584     if (unlikely(value > 255))
26585       return MDBX_EINVAL;
26586     env->me_options.spill_max_denominator = (uint8_t)value;
26587     break;
26588   case MDBX_opt_spill_min_denominator:
26589     if (unlikely(value > 255))
26590       return MDBX_EINVAL;
26591     env->me_options.spill_min_denominator = (uint8_t)value;
26592     break;
26593   case MDBX_opt_spill_parent4child_denominator:
26594     if (unlikely(value > 255))
26595       return MDBX_EINVAL;
26596     env->me_options.spill_parent4child_denominator = (uint8_t)value;
26597     break;
26598 
26599   case MDBX_opt_loose_limit:
26600     if (value == UINT64_MAX)
26601       value = 255;
26602     if (unlikely(value > 255))
26603       return MDBX_EINVAL;
26604     env->me_options.dp_loose_limit = (uint8_t)value;
26605     break;
26606 
26607   case MDBX_opt_merge_threshold_16dot16_percent:
26608     if (value == UINT64_MAX)
26609       value = 32768;
26610     if (unlikely(value < 8192 || value > 32768))
26611       return MDBX_EINVAL;
26612     env->me_options.merge_threshold_16dot16_percent = (unsigned)value;
26613     recalculate_merge_threshold(env);
26614     break;
26615 
26616   default:
26617     return MDBX_EINVAL;
26618   }
26619 
26620   if (should_unlock)
26621     mdbx_txn_unlock(env);
26622   return err;
26623 }
26624 
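/* A minimal sketch (disabled, for illustration only): tuning the auto-sync
 * watermarks through the unified option interface. Note MDBX_opt_sync_bytes
 * and MDBX_opt_sync_period require an already-opened environment (see the
 * MDBX_ENV_ACTIVE checks above). The chosen values are arbitrary assumptions
 * of this example, not recommendations. */
#if 0 /* example */
static int tune_autosync(MDBX_env *env) {
  /* flush once 16 MiB of dirty data have accumulated... */
  int rc = mdbx_env_set_option(env, MDBX_opt_sync_bytes, 16ul << 20);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* ...or at least every 2.5 seconds (16.16 fixed point: 2.5 * 65536) */
  return mdbx_env_set_option(env, MDBX_opt_sync_period, 163840);
}
#endif /* example */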
26625 __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
26626                                uint64_t *pvalue) {
26627   int err = check_env(env, false);
26628   if (unlikely(err != MDBX_SUCCESS))
26629     return err;
26630   if (unlikely(!pvalue))
26631     return MDBX_EINVAL;
26632 
26633   switch (option) {
26634   case MDBX_opt_sync_bytes:
26635     if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
26636       return MDBX_EPERM;
26637     *pvalue = pgno2bytes(
26638         env, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed));
26639     break;
26640 
26641   case MDBX_opt_sync_period:
26642     if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
26643       return MDBX_EPERM;
26644     *pvalue = osal_monotime_to_16dot16(
26645         atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed));
26646     break;
26647 
26648   case MDBX_opt_max_db:
26649     *pvalue = env->me_maxdbs - CORE_DBS;
26650     break;
26651 
26652   case MDBX_opt_max_readers:
26653     *pvalue = env->me_maxreaders;
26654     break;
26655 
26656   case MDBX_opt_dp_reserve_limit:
26657     *pvalue = env->me_options.dp_reserve_limit;
26658     break;
26659 
26660   case MDBX_opt_rp_augment_limit:
26661     *pvalue = env->me_options.rp_augment_limit;
26662     break;
26663 
26664   case MDBX_opt_txn_dp_limit:
26665     *pvalue = env->me_options.dp_limit;
26666     break;
26667   case MDBX_opt_txn_dp_initial:
26668     *pvalue = env->me_options.dp_initial;
26669     break;
26670 
26671   case MDBX_opt_spill_max_denominator:
26672     *pvalue = env->me_options.spill_max_denominator;
26673     break;
26674   case MDBX_opt_spill_min_denominator:
26675     *pvalue = env->me_options.spill_min_denominator;
26676     break;
26677   case MDBX_opt_spill_parent4child_denominator:
26678     *pvalue = env->me_options.spill_parent4child_denominator;
26679     break;
26680 
26681   case MDBX_opt_loose_limit:
26682     *pvalue = env->me_options.dp_loose_limit;
26683     break;
26684 
26685   case MDBX_opt_merge_threshold_16dot16_percent:
26686     *pvalue = env->me_options.merge_threshold_16dot16_percent;
26687     break;
26688 
26689   default:
26690     return MDBX_EINVAL;
26691   }
26692 
26693   return MDBX_SUCCESS;
26694 }
26695 
26696 __cold void global_ctor(void) {
26697   rthc_limit = RTHC_INITIAL_LIMIT;
26698   rthc_table = rthc_table_static;
26699 #if defined(_WIN32) || defined(_WIN64)
26700   InitializeCriticalSection(&rthc_critical_section);
26701   InitializeCriticalSection(&lcklist_critical_section);
26702 #else
26703   ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0);
26704   TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(),
26705         __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
26706 #endif
26707   /* check time conversion; this also avoids racing on 32-bit architectures
26708    * while storing the calculated 64-bit ratio(s) into memory. */
26709   uint32_t proba = UINT32_MAX;
26710   while (true) {
26711     unsigned time_conversion_checkup =
26712         osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba));
26713     unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba;
26714     unsigned one_less = (proba > 0) ?
proba - 1 : proba; 26715 ENSURE(nullptr, time_conversion_checkup >= one_less && 26716 time_conversion_checkup <= one_more); 26717 if (proba == 0) 26718 break; 26719 proba >>= 1; 26720 } 26721 26722 bootid = osal_bootid(); 26723 26724 #if MDBX_DEBUG 26725 for (unsigned i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { 26726 const bool s0 = (i >> 0) & 1; 26727 const bool s1 = (i >> 1) & 1; 26728 const bool s2 = (i >> 2) & 1; 26729 const uint8_t c01 = (i / (8 * 1)) % 3; 26730 const uint8_t c02 = (i / (8 * 3)) % 3; 26731 const uint8_t c12 = (i / (8 * 9)) % 3; 26732 26733 const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); 26734 meta_troika_t troika; 26735 troika.fsm = (uint8_t)i; 26736 meta_troika_unpack(&troika, packed); 26737 26738 const uint8_t tail = TROIKA_TAIL(&troika); 26739 const bool strict = TROIKA_STRICT_VALID(&troika); 26740 const bool valid = TROIKA_VALID(&troika); 26741 26742 const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) 26743 ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) 26744 : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); 26745 const uint8_t prefer_steady_chk = 26746 meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) 26747 : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); 26748 26749 uint8_t tail_chk; 26750 if (recent_chk == 0) 26751 tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; 26752 else if (recent_chk == 1) 26753 tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; 26754 else 26755 tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0; 26756 26757 const bool valid_chk = 26758 c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; 26759 const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && 26760 (c12 != 1 || s1 != s2); 26761 assert(troika.recent == recent_chk); 26762 assert(troika.prefer_steady == prefer_steady_chk); 26763 assert(tail == tail_chk); 26764 assert(valid == valid_chk); 26765 assert(strict == strict_chk); 26766 // printf(" %d, ", packed); 26767 assert(troika_fsm_map[troika.fsm] == packed); 26768 } 26769 #endif /* MDBX_DEBUG*/ 26770 26771 #if 0 /* debug */ 26772 for (unsigned i = 0; i < 65536; ++i) { 26773 size_t pages = pv2pages(i); 26774 unsigned x = pages2pv(pages); 26775 size_t xp = pv2pages(x); 26776 if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) 26777 printf("%u => %zu => %u => %zu\n", i, pages, x, xp); 26778 assert(pages == xp); 26779 } 26780 fflush(stdout); 26781 #endif /* #if 0 */ 26782 } 26783 26784 /******************************************************************************/ 26785 26786 __dll_export 26787 #ifdef __attribute_used__ 26788 __attribute_used__ 26789 #elif defined(__GNUC__) || __has_attribute(__used__) 26790 __attribute__((__used__)) 26791 #endif 26792 #ifdef __attribute_externally_visible__ 26793 __attribute_externally_visible__ 26794 #elif (defined(__GNUC__) && !defined(__clang__)) || \ 26795 __has_attribute(__externally_visible__) 26796 __attribute__((__externally_visible__)) 26797 #endif 26798 const struct MDBX_build_info mdbx_build = { 26799 #ifdef MDBX_BUILD_TIMESTAMP 26800 MDBX_BUILD_TIMESTAMP 26801 #else 26802 "\"" __DATE__ " " __TIME__ "\"" 26803 #endif /* MDBX_BUILD_TIMESTAMP */ 26804 26805 , 26806 #ifdef MDBX_BUILD_TARGET 26807 MDBX_BUILD_TARGET 26808 #else 26809 #if defined(__ANDROID_API__) 26810 "Android" MDBX_STRINGIFY(__ANDROID_API__) 26811 #elif defined(__linux__) || defined(__gnu_linux__) 26812 "Linux" 26813 #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__) 26814 "webassembly" 26815 #elif defined(__CYGWIN__) 26816 "CYGWIN" 26817 #elif defined(_WIN64) || 
defined(_WIN32) || defined(__TOS_WIN__) \ 26818 || defined(__WINDOWS__) 26819 "Windows" 26820 #elif defined(__APPLE__) 26821 #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \ 26822 || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) 26823 "iOS" 26824 #else 26825 "MacOS" 26826 #endif 26827 #elif defined(__FreeBSD__) 26828 "FreeBSD" 26829 #elif defined(__DragonFly__) 26830 "DragonFlyBSD" 26831 #elif defined(__NetBSD__) 26832 "NetBSD" 26833 #elif defined(__OpenBSD__) 26834 "OpenBSD" 26835 #elif defined(__bsdi__) 26836 "UnixBSDI" 26837 #elif defined(__MACH__) 26838 "MACH" 26839 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) 26840 "HPUX" 26841 #elif defined(_AIX) 26842 "AIX" 26843 #elif defined(__sun) && defined(__SVR4) 26844 "Solaris" 26845 #elif defined(__BSD__) || defined(BSD) 26846 "UnixBSD" 26847 #elif defined(__unix__) || defined(UNIX) || defined(__unix) \ 26848 || defined(__UNIX) || defined(__UNIX__) 26849 "UNIX" 26850 #elif defined(_POSIX_VERSION) 26851 "POSIX" MDBX_STRINGIFY(_POSIX_VERSION) 26852 #else 26853 "UnknownOS" 26854 #endif /* Target OS */ 26855 26856 "-" 26857 26858 #if defined(__amd64__) 26859 "AMD64" 26860 #elif defined(__ia32__) 26861 "IA32" 26862 #elif defined(__e2k__) || defined(__elbrus__) 26863 "Elbrus" 26864 #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) 26865 "Alpha" 26866 #elif defined(__aarch64__) || defined(_M_ARM64) 26867 "ARM64" 26868 #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \ 26869 || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \ 26870 || defined(_M_ARMT) || defined(__arm) 26871 "ARM" 26872 #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64)) 26873 "MIPS64" 26874 #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__) 26875 "MIPS" 26876 #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64) 26877 "PARISC64" 26878 #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) 26879 "PARISC" 26880 #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \ 26881 || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__) 26882 "Itanium" 26883 #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \ 26884 || defined(__powerpc64) || defined(_ARCH_PPC64) 26885 "PowerPC64" 26886 #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \ 26887 || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__) 26888 "PowerPC" 26889 #elif defined(__sparc64__) || defined(__sparc64) 26890 "SPARC64" 26891 #elif defined(__sparc__) || defined(__sparc) 26892 "SPARC" 26893 #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch) 26894 "S390" 26895 #else 26896 "UnknownARCH" 26897 #endif 26898 #endif /* MDBX_BUILD_TARGET */ 26899 26900 #ifdef MDBX_BUILD_TYPE 26901 # if defined(_MSC_VER) 26902 # pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE) 26903 # endif 26904 "-" MDBX_BUILD_TYPE 26905 #endif /* MDBX_BUILD_TYPE */ 26906 , 26907 "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) 26908 #ifdef ENABLE_GPROF 26909 " ENABLE_GPROF" 26910 #endif /* ENABLE_GPROF */ 26911 " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) 26912 " BYTE_ORDER=" 26913 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 26914 "LITTLE_ENDIAN" 26915 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 26916 "BIG_ENDIAN" 26917 #else 26918 #error "FIXME: Unsupported byte order" 26919 #endif /* __BYTE_ORDER__ */ 26920 " MDBX_ENABLE_BIGFOOT=" 
MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) 26921 " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG 26922 " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG 26923 " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG 26924 " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG 26925 " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG 26926 " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) 26927 " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) 26928 #if MDBX_DISABLE_VALIDATION 26929 " MDBX_DISABLE_VALIDATION=YES" 26930 #endif /* MDBX_DISABLE_VALIDATION */ 26931 #ifdef __SANITIZE_ADDRESS__ 26932 " SANITIZE_ADDRESS=YES" 26933 #endif /* __SANITIZE_ADDRESS__ */ 26934 #ifdef MDBX_USE_VALGRIND 26935 " MDBX_USE_VALGRIND=YES" 26936 #endif /* MDBX_USE_VALGRIND */ 26937 #if MDBX_FORCE_ASSERTIONS 26938 " MDBX_FORCE_ASSERTIONS=YES" 26939 #endif /* MDBX_FORCE_ASSERTIONS */ 26940 #ifdef _GNU_SOURCE 26941 " _GNU_SOURCE=YES" 26942 #else 26943 " _GNU_SOURCE=NO" 26944 #endif /* _GNU_SOURCE */ 26945 #ifdef __APPLE__ 26946 " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) 26947 #endif /* MacOS */ 26948 #if defined(_WIN32) || defined(_WIN64) 26949 " MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT) 26950 " MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY) 26951 #if !MDBX_BUILD_SHARED_LIBRARY 26952 " MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER) 26953 #endif 26954 " WINVER=" MDBX_STRINGIFY(WINVER) 26955 #else /* Windows */ 26956 " MDBX_LOCKING=" MDBX_LOCKING_CONFIG 26957 " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG 26958 #endif /* !Windows */ 26959 " MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE) 26960 " MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT) 26961 " MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE) 26962 " MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE) 26963 " MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK) 26964 " MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING) 26965 , 26966 #ifdef MDBX_BUILD_COMPILER 26967 MDBX_BUILD_COMPILER 26968 #else 26969 #ifdef __INTEL_COMPILER 26970 "Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER) 26971 #elif defined(__apple_build_version__) 26972 "Apple clang " MDBX_STRINGIFY(__apple_build_version__) 26973 #elif defined(__ibmxl__) 26974 "IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__) 26975 "." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__) 26976 #elif defined(__clang__) 26977 "clang " MDBX_STRINGIFY(__clang_version__) 26978 #elif defined(__MINGW64__) 26979 "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION) 26980 #elif defined(__MINGW32__) 26981 "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION) 26982 #elif defined(__IBMC__) 26983 "IBM C " MDBX_STRINGIFY(__IBMC__) 26984 #elif defined(__GNUC__) 26985 "GNU C/C++ " 26986 #ifdef __VERSION__ 26987 __VERSION__ 26988 #else 26989 MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." 
MDBX_STRINGIFY(__GNUC_PATCHLEVEL__) 26990 #endif 26991 #elif defined(_MSC_VER) 26992 "MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD) 26993 #else 26994 "Unknown compiler" 26995 #endif 26996 #endif /* MDBX_BUILD_COMPILER */ 26997 , 26998 #ifdef MDBX_BUILD_FLAGS_CONFIG 26999 MDBX_BUILD_FLAGS_CONFIG 27000 #endif /* MDBX_BUILD_FLAGS_CONFIG */ 27001 #ifdef MDBX_BUILD_FLAGS 27002 MDBX_BUILD_FLAGS 27003 #endif /* MDBX_BUILD_FLAGS */ 27004 #if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS)) 27005 "undefined (please use correct build script)" 27006 #ifdef _MSC_VER 27007 #pragma message("warning: Build flags undefined. Please use correct build script") 27008 #else 27009 #warning "Build flags undefined. Please use correct build script" 27010 #endif // _MSC_VER 27011 #endif 27012 }; 27013 27014 #ifdef __SANITIZE_ADDRESS__ 27015 LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { 27016 return "symbolize=1:allow_addr2line=1:" 27017 #if MDBX_DEBUG 27018 "debug=1:" 27019 "verbosity=2:" 27020 #endif /* MDBX_DEBUG */ 27021 "log_threads=1:" 27022 "report_globals=1:" 27023 "replace_str=1:replace_intrin=1:" 27024 "malloc_context_size=9:" 27025 #if !defined(__APPLE__) 27026 "detect_leaks=1:" 27027 #endif 27028 "check_printf=1:" 27029 "detect_deadlocks=1:" 27030 #ifndef LTO_ENABLED 27031 "check_initialization_order=1:" 27032 #endif 27033 "detect_stack_use_after_return=1:" 27034 "intercept_tls_get_addr=1:" 27035 "decorate_proc_maps=1:" 27036 "abort_on_error=1"; 27037 } 27038 #endif /* __SANITIZE_ADDRESS__ */ 27039 27040 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ 27041 27042 /* 27043 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 27044 * and other libmdbx authors: please see AUTHORS file. 27045 * All rights reserved. 27046 * 27047 * Redistribution and use in source and binary forms, with or without 27048 * modification, are permitted only as authorized by the OpenLDAP 27049 * Public License. 27050 * 27051 * A copy of this license is available in the file LICENSE in the 27052 * top-level directory of the distribution or, alternatively, at 27053 * <http://www.OpenLDAP.org/license.html>. 27054 */ 27055 27056 27057 #if defined(_WIN32) || defined(_WIN64) 27058 27059 #include <winioctl.h> 27060 27061 static int waitstatus2errcode(DWORD result) { 27062 switch (result) { 27063 case WAIT_OBJECT_0: 27064 return MDBX_SUCCESS; 27065 case WAIT_FAILED: 27066 return (int)GetLastError(); 27067 case WAIT_ABANDONED: 27068 return ERROR_ABANDONED_WAIT_0; 27069 case WAIT_IO_COMPLETION: 27070 return ERROR_USER_APC; 27071 case WAIT_TIMEOUT: 27072 return ERROR_TIMEOUT; 27073 default: 27074 return ERROR_UNHANDLED_ERROR; 27075 } 27076 } 27077 27078 /* Map a result from an NTAPI call to WIN32 error code. */ 27079 static int ntstatus2errcode(NTSTATUS status) { 27080 DWORD dummy; 27081 OVERLAPPED ov; 27082 memset(&ov, 0, sizeof(ov)); 27083 ov.Internal = status; 27084 return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS 27085 : (int)GetLastError(); 27086 } 27087 27088 /* We use native NT APIs to setup the memory map, so that we can 27089 * let the DB file grow incrementally instead of always preallocating 27090 * the full size. These APIs are defined in <wdm.h> and <ntifs.h> 27091 * but those headers are meant for driver-level development and 27092 * conflict with the regular user-level headers, so we explicitly 27093 * declare them here. 
Using these APIs also means we must link to 27094 * ntdll.dll, which is not linked by default in user code. */ 27095 27096 extern NTSTATUS NTAPI NtCreateSection( 27097 OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess, 27098 IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes, 27099 IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection, 27100 IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle); 27101 27102 typedef struct _SECTION_BASIC_INFORMATION { 27103 ULONG Unknown; 27104 ULONG SectionAttributes; 27105 LARGE_INTEGER SectionSize; 27106 } SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION; 27107 27108 extern NTSTATUS NTAPI NtMapViewOfSection( 27109 IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, 27110 IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize, 27111 IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize, 27112 IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType, 27113 IN ULONG Win32Protect); 27114 27115 extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, 27116 IN OPTIONAL PVOID BaseAddress); 27117 27118 extern NTSTATUS NTAPI NtClose(HANDLE Handle); 27119 27120 extern NTSTATUS NTAPI NtAllocateVirtualMemory( 27121 IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits, 27122 IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect); 27123 27124 extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle, 27125 IN PVOID *BaseAddress, 27126 IN OUT PSIZE_T RegionSize, 27127 IN ULONG FreeType); 27128 27129 #ifndef WOF_CURRENT_VERSION 27130 typedef struct _WOF_EXTERNAL_INFO { 27131 DWORD Version; 27132 DWORD Provider; 27133 } WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO; 27134 #endif /* WOF_CURRENT_VERSION */ 27135 27136 #ifndef WIM_PROVIDER_CURRENT_VERSION 27137 #define WIM_PROVIDER_HASH_SIZE 20 27138 27139 typedef struct _WIM_PROVIDER_EXTERNAL_INFO { 27140 DWORD Version; 27141 DWORD Flags; 27142 LARGE_INTEGER DataSourceId; 27143 BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE]; 27144 } WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO; 27145 #endif /* WIM_PROVIDER_CURRENT_VERSION */ 27146 27147 #ifndef FILE_PROVIDER_CURRENT_VERSION 27148 typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { 27149 ULONG Version; 27150 ULONG Algorithm; 27151 ULONG Flags; 27152 } FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1; 27153 #endif /* FILE_PROVIDER_CURRENT_VERSION */ 27154 27155 #ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED 27156 #define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL) 27157 #endif 27158 #ifndef STATUS_INVALID_DEVICE_REQUEST 27159 #define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) 27160 #endif 27161 #ifndef STATUS_NOT_SUPPORTED 27162 #define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL) 27163 #endif 27164 27165 #ifndef FILE_DEVICE_FILE_SYSTEM 27166 #define FILE_DEVICE_FILE_SYSTEM 0x00000009 27167 #endif 27168 27169 #ifndef FSCTL_GET_EXTERNAL_BACKING 27170 #define FSCTL_GET_EXTERNAL_BACKING \ 27171 CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS) 27172 #endif 27173 27174 #ifndef ERROR_NOT_CAPABLE 27175 #define ERROR_NOT_CAPABLE 775L 27176 #endif 27177 27178 #endif /* _WIN32 || _WIN64 */ 27179 27180 /*----------------------------------------------------------------------------*/ 27181 27182 #if defined(__ANDROID_API__) 27183 __extern_C void __assert2(const char *file, int line, const char *function, 27184 const char *msg) __noreturn; 27185 #define __assert_fail(assertion, file, line, function) \ 
27186   __assert2(file, line, function, assertion)
27187
27188 #elif defined(__UCLIBC__)
27189 __extern_C void __assert(const char *, const char *, unsigned int, const char *)
27190 #ifdef __THROW
27191     __THROW
27192 #else
27193     __nothrow
27194 #endif /* __THROW */
27195     MDBX_NORETURN;
27196 #define __assert_fail(assertion, file, line, function) \
27197   __assert(assertion, file, line, function)
27198
27199 #elif _POSIX_C_SOURCE > 200212 && \
27200     /* workaround to avoid musl libc's wrong prototype */ ( \
27201         defined(__GLIBC__) || defined(__GNU_LIBRARY__))
27202 /* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */
27203 __extern_C void __assert_fail(const char *assertion, const char *file,
27204                               unsigned line, const char *function)
27205 #ifdef __THROW
27206     __THROW
27207 #else
27208     __nothrow
27209 #endif /* __THROW */
27210     MDBX_NORETURN;
27211
27212 #elif defined(__APPLE__) || defined(__MACH__)
27213 __extern_C void __assert_rtn(const char *function, const char *file, int line,
27214                              const char *assertion) /* __nothrow */
27215 #ifdef __dead2
27216     __dead2
27217 #else
27218     MDBX_NORETURN
27219 #endif /* __dead2 */
27220 #ifdef __disable_tail_calls
27221     __disable_tail_calls
27222 #endif /* __disable_tail_calls */
27223     ;
27224
27225 #define __assert_fail(assertion, file, line, function) \
27226   __assert_rtn(function, file, line, assertion)
27227 #elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
27228 __extern_C void __assert_c99(const char *assertion, const char *file, int line,
27229                              const char *function) MDBX_NORETURN;
27230 #define __assert_fail(assertion, file, line, function) \
27231   __assert_c99(assertion, file, line, function)
27232 #elif defined(__OpenBSD__)
27233 __extern_C __dead void __assert2(const char *file, int line,
27234                                  const char *function,
27235                                  const char *assertion) /* __nothrow */;
27236 #define __assert_fail(assertion, file, line, function) \
27237   __assert2(file, line, function, assertion)
27238 #elif defined(__NetBSD__)
27239 __extern_C __dead void __assert13(const char *file, int line,
27240                                   const char *function,
27241                                   const char *assertion) /* __nothrow */;
27242 #define __assert_fail(assertion, file, line, function) \
27243   __assert13(file, line, function, assertion)
27244 #elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \
27245     defined(__DragonFly__)
27246 __extern_C void __assert(const char *function, const char *file, int line,
27247                          const char *assertion) /* __nothrow */
27248 #ifdef __dead2
27249     __dead2
27250 #else
27251     MDBX_NORETURN
27252 #endif /* __dead2 */
27253 #ifdef __disable_tail_calls
27254     __disable_tail_calls
27255 #endif /* __disable_tail_calls */
27256     ;
27257 #define __assert_fail(assertion, file, line, function) \
27258   __assert(function, file, line, assertion)
27259
27260 #endif /* __assert_fail */
27261
27262 __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg,
27263                              const char *func, unsigned line) {
27264 #if MDBX_DEBUG
27265   if (env && env->me_assert_func) {
27266     env->me_assert_func(env, msg, func, line);
27267     return;
27268   }
27269 #else
27270   (void)env;
27271 #endif /* MDBX_DEBUG */
27272
27273   if (debug_logger)
27274     debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg);
27275   else {
27276 #if defined(_WIN32) || defined(_WIN64)
27277     char *message = nullptr;
27278     const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u",
27279                                   msg, func ? func : "unknown", line);
func : "unknown", line); 27280 if (num < 1 || !message) 27281 message = "<troubles with assertion-message preparation>"; 27282 OutputDebugStringA(message); 27283 if (IsDebuggerPresent()) 27284 DebugBreak(); 27285 #else 27286 __assert_fail(msg, "mdbx", line, func); 27287 #endif 27288 } 27289 27290 #if defined(_WIN32) || defined(_WIN64) 27291 FatalExit(ERROR_UNHANDLED_ERROR); 27292 #else 27293 abort(); 27294 #endif 27295 } 27296 27297 __cold void mdbx_panic(const char *fmt, ...) { 27298 va_list ap; 27299 va_start(ap, fmt); 27300 27301 char *message = nullptr; 27302 const int num = osal_vasprintf(&message, fmt, ap); 27303 va_end(ap); 27304 const char *const const_message = 27305 (num < 1 || !message) ? "<troubles with panic-message preparation>" 27306 : message; 27307 27308 #if defined(_WIN32) || defined(_WIN64) 27309 OutputDebugStringA("\r\nMDBX-PANIC: "); 27310 OutputDebugStringA(const_message); 27311 if (IsDebuggerPresent()) 27312 DebugBreak(); 27313 FatalExit(ERROR_UNHANDLED_ERROR); 27314 #else 27315 __assert_fail(const_message, "mdbx", 0, "panic"); 27316 abort(); 27317 #endif 27318 } 27319 27320 /*----------------------------------------------------------------------------*/ 27321 27322 #ifndef osal_vasprintf 27323 MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, 27324 va_list ap) { 27325 va_list ones; 27326 va_copy(ones, ap); 27327 int needed = vsnprintf(nullptr, 0, fmt, ap); 27328 27329 if (unlikely(needed < 0 || needed >= INT_MAX)) { 27330 *strp = nullptr; 27331 va_end(ones); 27332 return needed; 27333 } 27334 27335 *strp = osal_malloc(needed + 1); 27336 if (unlikely(*strp == nullptr)) { 27337 va_end(ones); 27338 #if defined(_WIN32) || defined(_WIN64) 27339 SetLastError(MDBX_ENOMEM); 27340 #else 27341 errno = MDBX_ENOMEM; 27342 #endif 27343 return -1; 27344 } 27345 27346 int actual = vsnprintf(*strp, needed + 1, fmt, ones); 27347 va_end(ones); 27348 27349 assert(actual == needed); 27350 if (unlikely(actual < 0)) { 27351 osal_free(*strp); 27352 *strp = nullptr; 27353 } 27354 return actual; 27355 } 27356 #endif /* osal_vasprintf */ 27357 27358 #ifndef osal_asprintf 27359 MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) { 27360 va_list ap; 27361 va_start(ap, fmt); 27362 int rc = osal_vasprintf(strp, fmt, ap); 27363 va_end(ap); 27364 return rc; 27365 } 27366 #endif /* osal_asprintf */ 27367 27368 #ifndef osal_memalign_alloc 27369 MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, 27370 void **result) { 27371 assert(is_powerof2(alignment) && alignment >= sizeof(void *)); 27372 #if defined(_WIN32) || defined(_WIN64) 27373 (void)alignment; 27374 *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); 27375 return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; 27376 #elif defined(_ISOC11_SOURCE) 27377 *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment)); 27378 return *result ? MDBX_SUCCESS : errno; 27379 #elif _POSIX_VERSION >= 200112L && \ 27380 (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17) 27381 *result = nullptr; 27382 return posix_memalign(result, alignment, bytes); 27383 #elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L 27384 *result = memalign(alignment, bytes); 27385 return *result ? 
MDBX_SUCCESS : errno; 27386 #else 27387 #error FIXME 27388 #endif 27389 } 27390 #endif /* osal_memalign_alloc */ 27391 27392 #ifndef osal_memalign_free 27393 MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) { 27394 #if defined(_WIN32) || defined(_WIN64) 27395 VirtualFree(ptr, 0, MEM_RELEASE); 27396 #else 27397 osal_free(ptr); 27398 #endif 27399 } 27400 #endif /* osal_memalign_free */ 27401 27402 #ifndef osal_strdup 27403 char *osal_strdup(const char *str) { 27404 if (!str) 27405 return NULL; 27406 size_t bytes = strlen(str) + 1; 27407 char *dup = osal_malloc(bytes); 27408 if (dup) 27409 memcpy(dup, str, bytes); 27410 return dup; 27411 } 27412 #endif /* osal_strdup */ 27413 27414 /*----------------------------------------------------------------------------*/ 27415 27416 MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) { 27417 int rc; 27418 memset(condpair, 0, sizeof(osal_condpair_t)); 27419 #if defined(_WIN32) || defined(_WIN64) 27420 if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { 27421 rc = (int)GetLastError(); 27422 goto bailout_mutex; 27423 } 27424 if ((condpair->event[0] = CreateEventW(NULL, FALSE, FALSE, NULL)) == NULL) { 27425 rc = (int)GetLastError(); 27426 goto bailout_event; 27427 } 27428 if ((condpair->event[1] = CreateEventW(NULL, FALSE, FALSE, NULL)) != NULL) 27429 return MDBX_SUCCESS; 27430 27431 rc = (int)GetLastError(); 27432 (void)CloseHandle(condpair->event[0]); 27433 bailout_event: 27434 (void)CloseHandle(condpair->mutex); 27435 #else 27436 rc = pthread_mutex_init(&condpair->mutex, NULL); 27437 if (unlikely(rc != 0)) 27438 goto bailout_mutex; 27439 rc = pthread_cond_init(&condpair->cond[0], NULL); 27440 if (unlikely(rc != 0)) 27441 goto bailout_cond; 27442 rc = pthread_cond_init(&condpair->cond[1], NULL); 27443 if (likely(rc == 0)) 27444 return MDBX_SUCCESS; 27445 27446 (void)pthread_cond_destroy(&condpair->cond[0]); 27447 bailout_cond: 27448 (void)pthread_mutex_destroy(&condpair->mutex); 27449 #endif 27450 bailout_mutex: 27451 memset(condpair, 0, sizeof(osal_condpair_t)); 27452 return rc; 27453 } 27454 27455 MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) { 27456 #if defined(_WIN32) || defined(_WIN64) 27457 int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); 27458 rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); 27459 rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError(); 27460 #else 27461 int err, rc = pthread_mutex_destroy(&condpair->mutex); 27462 rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; 27463 rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; 27464 #endif 27465 memset(condpair, 0, sizeof(osal_condpair_t)); 27466 return rc; 27467 } 27468 27469 MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) { 27470 #if defined(_WIN32) || defined(_WIN64) 27471 DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); 27472 return waitstatus2errcode(code); 27473 #else 27474 return osal_pthread_mutex_lock(&condpair->mutex); 27475 #endif 27476 } 27477 27478 MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) { 27479 #if defined(_WIN32) || defined(_WIN64) 27480 return ReleaseMutex(condpair->mutex) ? 
MDBX_SUCCESS : (int)GetLastError();
27481 #else
27482   return pthread_mutex_unlock(&condpair->mutex);
27483 #endif
27484 }
27485
27486 MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair,
27487                                             bool part) {
27488 #if defined(_WIN32) || defined(_WIN64)
27489   return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError();
27490 #else
27491   return pthread_cond_signal(&condpair->cond[part]);
27492 #endif
27493 }
27494
27495 MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair,
27496                                           bool part) {
27497 #if defined(_WIN32) || defined(_WIN64)
27498   DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part],
27499                                    INFINITE, FALSE);
27500   if (code == WAIT_OBJECT_0) {
27501     code = WaitForSingleObject(condpair->mutex, INFINITE);
27502     if (code == WAIT_OBJECT_0)
27503       return MDBX_SUCCESS;
27504   }
27505   return waitstatus2errcode(code);
27506 #else
27507   return pthread_cond_wait(&condpair->cond[part], &condpair->mutex);
27508 #endif
27509 }
27510
27511 /*----------------------------------------------------------------------------*/
27512
27513 MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) {
27514 #if defined(_WIN32) || defined(_WIN64)
27515   InitializeCriticalSection(fastmutex);
27516   return MDBX_SUCCESS;
27517 #else
27518   return pthread_mutex_init(fastmutex, NULL);
27519 #endif
27520 }
27521
27522 MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) {
27523 #if defined(_WIN32) || defined(_WIN64)
27524   DeleteCriticalSection(fastmutex);
27525   return MDBX_SUCCESS;
27526 #else
27527   return pthread_mutex_destroy(fastmutex);
27528 #endif
27529 }
27530
27531 MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) {
27532 #if defined(_WIN32) || defined(_WIN64)
27533   __try {
27534     EnterCriticalSection(fastmutex);
27535   } __except (
27536       (GetExceptionCode() ==
27537        0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
27538           ? EXCEPTION_EXECUTE_HANDLER
27539           : EXCEPTION_CONTINUE_SEARCH) {
27540     return ERROR_POSSIBLE_DEADLOCK;
27541   }
27542   return MDBX_SUCCESS;
27543 #else
27544   return osal_pthread_mutex_lock(fastmutex);
27545 #endif
27546 }
27547
27548 MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) {
27549 #if defined(_WIN32) || defined(_WIN64)
27550   LeaveCriticalSection(fastmutex);
27551   return MDBX_SUCCESS;
27552 #else
27553   return pthread_mutex_unlock(fastmutex);
27554 #endif
27555 }
27556
27557 /*----------------------------------------------------------------------------*/
27558
27559 #if defined(_WIN32) || defined(_WIN64)
27560
27561 #ifndef WC_ERR_INVALID_CHARS
27562 static const DWORD WC_ERR_INVALID_CHARS =
27563     (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion())))
27564         ? 0x00000080
27565         : 0;
27566 #endif /* WC_ERR_INVALID_CHARS */
27567
27568 MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src,
27569                                     size_t src_n) {
27570   return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src,
27571                              (int)src_n, dst, (int)dst_n);
27572 }
27573
27574 #endif /* Windows */
27575
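/* Editorial illustration (not part of the libmdbx sources): how the
 * osal_condpair_*() primitives above are meant to be paired.
 * osal_condpair_wait() must be called with the pair's mutex held; it
 * atomically releases the mutex while waiting and re-acquires it before
 * returning, on both the pthread and the Windows code paths. */
#if 0 /* usage sketch */
static osal_condpair_t pair;

static void example_waiter(void) {
  if (osal_condpair_lock(&pair) == MDBX_SUCCESS) {
    (void)osal_condpair_wait(&pair, false /* wait on half #0 */);
    (void)osal_condpair_unlock(&pair);
  }
}

static void example_signaler(void) {
  (void)osal_condpair_signal(&pair, false /* wake the waiter of half #0 */);
}
#endif /* usage sketch */

27576 /*----------------------------------------------------------------------------*/
27577
27578 MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) {
27579 #if defined(_WIN32) || defined(_WIN64)
27580   return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
27581 #else
27582   return unlink(pathname) ?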
errno : MDBX_SUCCESS; 27583 #endif 27584 } 27585 27586 #if !(defined(_WIN32) || defined(_WIN64)) 27587 static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } 27588 #endif /*! Windows */ 27589 27590 MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { 27591 #if defined(_WIN32) || defined(_WIN64) 27592 return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); 27593 #else 27594 return rmdir(pathname) ? errno : MDBX_SUCCESS; 27595 #endif 27596 } 27597 27598 MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, 27599 const MDBX_env *env, 27600 const pathchar_t *pathname, 27601 mdbx_filehandle_t *fd, 27602 mdbx_mode_t unix_mode_bits) { 27603 *fd = INVALID_HANDLE_VALUE; 27604 27605 #if defined(_WIN32) || defined(_WIN64) 27606 DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; 27607 DWORD FlagsAndAttributes = 27608 FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; 27609 DWORD DesiredAccess = FILE_READ_ATTRIBUTES; 27610 DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE) 27611 ? 0 27612 : (FILE_SHARE_READ | FILE_SHARE_WRITE); 27613 27614 switch (purpose) { 27615 default: 27616 return ERROR_INVALID_PARAMETER; 27617 case MDBX_OPEN_LCK: 27618 CreationDisposition = OPEN_ALWAYS; 27619 DesiredAccess |= GENERIC_READ | GENERIC_WRITE; 27620 FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY; 27621 break; 27622 case MDBX_OPEN_DXB_READ: 27623 CreationDisposition = OPEN_EXISTING; 27624 DesiredAccess |= GENERIC_READ; 27625 ShareMode |= FILE_SHARE_READ; 27626 break; 27627 case MDBX_OPEN_DXB_LAZY: 27628 DesiredAccess |= GENERIC_READ | GENERIC_WRITE; 27629 break; 27630 case MDBX_OPEN_DXB_DSYNC: 27631 CreationDisposition = OPEN_EXISTING; 27632 DesiredAccess |= GENERIC_WRITE; 27633 FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; 27634 break; 27635 case MDBX_OPEN_COPY: 27636 CreationDisposition = CREATE_NEW; 27637 ShareMode = 0; 27638 DesiredAccess |= GENERIC_WRITE; 27639 FlagsAndAttributes |= 27640 (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING; 27641 break; 27642 case MDBX_OPEN_DELETE: 27643 CreationDisposition = OPEN_EXISTING; 27644 ShareMode |= FILE_SHARE_DELETE; 27645 DesiredAccess = 27646 FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; 27647 break; 27648 } 27649 27650 *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL, 27651 CreationDisposition, FlagsAndAttributes, NULL); 27652 if (*fd == INVALID_HANDLE_VALUE) { 27653 int err = (int)GetLastError(); 27654 if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { 27655 if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && 27656 GetLastError() == ERROR_FILE_NOT_FOUND) 27657 err = ERROR_FILE_NOT_FOUND; 27658 } 27659 return err; 27660 } 27661 27662 BY_HANDLE_FILE_INFORMATION info; 27663 if (!GetFileInformationByHandle(*fd, &info)) { 27664 int err = (int)GetLastError(); 27665 CloseHandle(*fd); 27666 *fd = INVALID_HANDLE_VALUE; 27667 return err; 27668 } 27669 const DWORD AttributesDiff = 27670 (info.dwFileAttributes ^ FlagsAndAttributes) & 27671 (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | 27672 FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); 27673 if (AttributesDiff) 27674 (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); 27675 27676 #else 27677 int flags = unix_mode_bits ? 
O_CREAT : 0; 27678 switch (purpose) { 27679 default: 27680 return EINVAL; 27681 case MDBX_OPEN_LCK: 27682 flags |= O_RDWR; 27683 break; 27684 case MDBX_OPEN_DXB_READ: 27685 flags = O_RDONLY; 27686 break; 27687 case MDBX_OPEN_DXB_LAZY: 27688 flags |= O_RDWR; 27689 break; 27690 case MDBX_OPEN_COPY: 27691 flags = O_CREAT | O_WRONLY | O_EXCL; 27692 break; 27693 case MDBX_OPEN_DXB_DSYNC: 27694 flags |= O_WRONLY; 27695 #if defined(O_DSYNC) 27696 flags |= O_DSYNC; 27697 #elif defined(O_SYNC) 27698 flags |= O_SYNC; 27699 #elif defined(O_FSYNC) 27700 flags |= O_FSYNC; 27701 #endif 27702 break; 27703 case MDBX_OPEN_DELETE: 27704 flags = O_RDWR; 27705 break; 27706 } 27707 27708 const bool direct_nocache_for_copy = 27709 env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY; 27710 if (direct_nocache_for_copy) { 27711 #if defined(O_DIRECT) 27712 flags |= O_DIRECT; 27713 #endif /* O_DIRECT */ 27714 #if defined(O_NOCACHE) 27715 flags |= O_NOCACHE; 27716 #endif /* O_NOCACHE */ 27717 } 27718 27719 #ifdef O_CLOEXEC 27720 flags |= O_CLOEXEC; 27721 #endif /* O_CLOEXEC */ 27722 27723 /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ 27724 #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 27725 int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; 27726 static const char dev_null[] = "/dev/null"; 27727 if (!is_valid_fd(STDIN_FILENO)) { 27728 WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", 27729 STDIN_FILENO, dev_null); 27730 stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); 27731 } 27732 if (!is_valid_fd(STDOUT_FILENO)) { 27733 WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", 27734 STDOUT_FILENO, dev_null); 27735 stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); 27736 } 27737 if (!is_valid_fd(STDERR_FILENO)) { 27738 WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", 27739 STDERR_FILENO, dev_null); 27740 stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); 27741 } 27742 #else 27743 #error "Unexpected or unsupported UNIX or POSIX system" 27744 #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ 27745 27746 *fd = open(pathname, flags, unix_mode_bits); 27747 #if defined(O_DIRECT) 27748 if (*fd < 0 && (flags & O_DIRECT) && 27749 (errno == EINVAL || errno == EAFNOSUPPORT)) { 27750 flags &= ~(O_DIRECT | O_EXCL); 27751 *fd = open(pathname, flags, unix_mode_bits); 27752 } 27753 #endif /* O_DIRECT */ 27754 27755 if (*fd < 0 && errno == EACCES && purpose == MDBX_OPEN_LCK) { 27756 struct stat unused; 27757 if (stat(pathname, &unused) == 0 || errno != ENOENT) 27758 errno = EACCES /* restore errno if file exists */; 27759 } 27760 27761 /* Safeguard for todo4recovery://erased_by_github/libmdbx/issues/144 */ 27762 #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 27763 if (*fd == STDIN_FILENO) { 27764 WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", 27765 STDIN_FILENO); 27766 assert(stub_fd0 == -1); 27767 *fd = dup(stub_fd0 = *fd); 27768 } 27769 if (*fd == STDOUT_FILENO) { 27770 WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", 27771 STDOUT_FILENO); 27772 assert(stub_fd1 == -1); 27773 *fd = dup(stub_fd1 = *fd); 27774 } 27775 if (*fd == STDERR_FILENO) { 27776 WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", 27777 STDERR_FILENO); 27778 assert(stub_fd2 == -1); 27779 *fd = dup(stub_fd2 = *fd); 27780 } 27781 if (stub_fd0 != -1) 27782 close(stub_fd0); 27783 if (stub_fd1 != -1) 27784 close(stub_fd1); 27785 if (stub_fd2 != -1) 27786 close(stub_fd2); 27787 if (*fd >= 
STDIN_FILENO && *fd <= STDERR_FILENO) {
27788     ERROR("Rejecting the use of a FD in the range "
27789           "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption",
27790           STDIN_FILENO, STDERR_FILENO);
27791     close(*fd);
27792     return EBADF;
27793   }
27794 #else
27795 #error "Unexpected or unsupported UNIX or POSIX system"
27796 #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
27797
27798   if (*fd < 0)
27799     return errno;
27800
27801 #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
27802   const int fd_flags = fcntl(*fd, F_GETFD);
27803   if (fd_flags != -1)
27804     (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC);
27805 #endif /* FD_CLOEXEC && !O_CLOEXEC */
27806
27807   if (direct_nocache_for_copy) {
27808 #if defined(F_NOCACHE) && !defined(O_NOCACHE)
27809     (void)fcntl(*fd, F_NOCACHE, 1);
27810 #endif /* F_NOCACHE */
27811   }
27812
27813 #endif
27814   return MDBX_SUCCESS;
27815 }
27816
27817 MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) {
27818 #if defined(_WIN32) || defined(_WIN64)
27819   return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError();
27820 #else
27821   assert(fd > STDERR_FILENO);
27822   return (close(fd) == 0) ? MDBX_SUCCESS : errno;
27823 #endif
27824 }
27825
27826 MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes,
27827                                   uint64_t offset) {
27828   if (bytes > MAX_WRITE)
27829     return MDBX_EINVAL;
27830 #if defined(_WIN32) || defined(_WIN64)
27831   OVERLAPPED ov;
27832   ov.hEvent = 0;
27833   ov.Offset = (DWORD)offset;
27834   ov.OffsetHigh = HIGH_DWORD(offset);
27835
27836   DWORD read = 0;
27837   if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) {
27838     int rc = (int)GetLastError();
27839     return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc;
27840   }
27841 #else
27842   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
27843                     "libmdbx requires 64-bit file I/O on 64-bit systems");
27844   intptr_t read = pread(fd, buf, bytes, offset);
27845   if (read < 0) {
27846     int rc = errno;
27847     return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc;
27848   }
27849 #endif
27850   return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA;
27851 }
27852
27853 MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
27854                                    size_t bytes, uint64_t offset) {
27855   while (true) {
27856 #if defined(_WIN32) || defined(_WIN64)
27857     OVERLAPPED ov;
27858     ov.hEvent = 0;
27859     ov.Offset = (DWORD)offset;
27860     ov.OffsetHigh = HIGH_DWORD(offset);
27861
27862     DWORD written;
27863     if (unlikely(!WriteFile(
27864             fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
27865             &written, &ov)))
27866       return (int)GetLastError();
27867     if (likely(bytes == written))
27868       return MDBX_SUCCESS;
27869 #else
27870     STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
27871                       "libmdbx requires 64-bit file I/O on 64-bit systems");
27872     const intptr_t written =
27873         pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset);
27874     if (likely(bytes == (size_t)written))
27875       return MDBX_SUCCESS;
27876     if (written < 0) {
27877       const int rc = errno;
27878       if (rc != EINTR)
27879         return rc;
27880       continue;
27881     }
27882 #endif
27883     bytes -= written;
27884     offset += written;
27885     buf = (char *)buf + written;
27886   }
27887 }
27888
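/* Editorial illustration (not part of the libmdbx sources): the contract of
 * the positional I/O helpers above. osal_pwrite() loops internally until the
 * whole buffer is written, retrying short writes and EINTR, whereas
 * osal_pread() reports a short read as MDBX_ENODATA. */
#if 0 /* usage sketch */
static int example_rewrite_page(mdbx_filehandle_t fd) {
  char page[4096];
  int rc = osal_pread(fd, page, sizeof(page), 0 /* offset */);
  if (rc != MDBX_SUCCESS)
    return rc; /* including MDBX_ENODATA for a too-short file */
  page[0] ^= 1; /* modify something */
  return osal_pwrite(fd, page, sizeof(page), 0 /* offset */);
}
#endif /* usage sketch */

27889 MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf,
27890                                   size_t bytes) {
27891   while (true) {
27892 #if defined(_WIN32) || defined(_WIN64)
27893     DWORD written;
27894     if (unlikely(!WriteFile(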
27895             fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
27896             &written, nullptr)))
27897       return (int)GetLastError();
27898     if (likely(bytes == written))
27899       return MDBX_SUCCESS;
27900 #else
27901     STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
27902                       "libmdbx requires 64-bit file I/O on 64-bit systems");
27903     const intptr_t written =
27904         write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE);
27905     if (likely(bytes == (size_t)written))
27906       return MDBX_SUCCESS;
27907     if (written < 0) {
27908       const int rc = errno;
27909       if (rc != EINTR)
27910         return rc;
27911       continue;
27912     }
27913 #endif
27914     bytes -= written;
27915     buf = (char *)buf + written;
27916   }
27917 }
27918
27919 int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
27920                  uint64_t offset, size_t expected_written) {
27921 #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \
27922     (defined(__ANDROID_API__) && __ANDROID_API__ < 24)
27923   size_t written = 0;
27924   for (int i = 0; i < iovcnt; ++i) {
27925     int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
27926     if (unlikely(rc != MDBX_SUCCESS))
27927       return rc;
27928     written += iov[i].iov_len;
27929     offset += iov[i].iov_len;
27930   }
27931   return (expected_written == written) ? MDBX_SUCCESS
27932                                        : MDBX_EIO /* ERROR_WRITE_FAULT */;
27933 #else
27934   int rc;
27935   intptr_t written;
27936   do {
27937     STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
27938                       "libmdbx requires 64-bit file I/O on 64-bit systems");
27939     written = pwritev(fd, iov, iovcnt, offset);
27940     if (likely(expected_written == (size_t)written))
27941       return MDBX_SUCCESS;
27942     rc = errno;
27943   } while (rc == EINTR);
27944   return (written < 0) ? rc : MDBX_EIO /* Use which error code? */;
27945 #endif
27946 }
27947
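/* Editorial illustration (not part of the libmdbx sources): writing two
 * discontiguous buffers with a single positional gather-write via the
 * osal_pwritev() helper above; expected_written lets it detect a short
 * vectored write. */
#if 0 /* usage sketch */
static int example_gather_write(mdbx_filehandle_t fd, void *a, void *b) {
  struct iovec iov[2] = {{a, 4096}, {b, 4096}};
  return osal_pwritev(fd, iov, 2, 0 /* offset */, 8192 /* expected */);
}
#endif /* usage sketch */

27948 MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd,
27949                                   enum osal_syncmode_bits mode_bits) {
27950 #if defined(_WIN32) || defined(_WIN64)
27951   if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd))
27952     return (int)GetLastError();
27953   return MDBX_SUCCESS;
27954 #else
27955
27956 #if defined(__APPLE__) && \
27957     MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
27958   if (mode_bits & MDBX_SYNC_IODQ)
27959     return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
27960 #endif /* MacOS */
27961
27962   /* LY: This approach is always safe and without appreciable performance
27963    * degradation, even on a kernel with fdatasync's bug.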
27964    *
27965    * For more info about the corresponding fdatasync() bug
27966    * see http://www.spinics.net/lists/linux-ext4/msg33714.html */
27967   while (1) {
27968     switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) {
27969     case MDBX_SYNC_NONE:
27970       return MDBX_SUCCESS /* nothing to do */;
27971 #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
27972     case MDBX_SYNC_DATA:
27973       if (fdatasync(fd) == 0)
27974         return MDBX_SUCCESS;
27975       break /* error */;
27976 #if defined(__linux__) || defined(__gnu_linux__)
27977     case MDBX_SYNC_SIZE:
27978       if (linux_kernel_version >= 0x03060000)
27979         return MDBX_SUCCESS;
27980       __fallthrough /* fall through */;
27981 #endif /* Linux */
27982 #endif /* _POSIX_SYNCHRONIZED_IO > 0 */
27983     default:
27984       if (fsync(fd) == 0)
27985         return MDBX_SUCCESS;
27986     }
27987
27988     int rc = errno;
27989     if (rc != EINTR)
27990       return rc;
27991   }
27992 #endif
27993 }
27994
27995 int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) {
27996 #if defined(_WIN32) || defined(_WIN64)
27997   BY_HANDLE_FILE_INFORMATION info;
27998   if (!GetFileInformationByHandle(fd, &info))
27999     return (int)GetLastError();
28000   *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
28001 #else
28002   struct stat st;
28003
28004   STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
28005                     "libmdbx requires 64-bit file I/O on 64-bit systems");
28006   if (fstat(fd, &st))
28007     return errno;
28008
28009   *length = st.st_size;
28010 #endif
28011   return MDBX_SUCCESS;
28012 }
28013
28014 MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) {
28015 #if defined(_WIN32) || defined(_WIN64)
28016   switch (GetFileType(fd)) {
28017   case FILE_TYPE_DISK:
28018     return MDBX_RESULT_FALSE;
28019   case FILE_TYPE_CHAR:
28020   case FILE_TYPE_PIPE:
28021     return MDBX_RESULT_TRUE;
28022   default:
28023     return (int)GetLastError();
28024   }
28025 #else
28026   struct stat info;
28027   if (fstat(fd, &info))
28028     return errno;
28029   switch (info.st_mode & S_IFMT) {
28030   case S_IFBLK:
28031   case S_IFREG:
28032     return MDBX_RESULT_FALSE;
28033   case S_IFCHR:
28034   case S_IFIFO:
28035   case S_IFSOCK:
28036     return MDBX_RESULT_TRUE;
28037   case S_IFDIR:
28038   case S_IFLNK:
28039   default:
28040     return MDBX_INCOMPATIBLE;
28041   }
28042 #endif
28043 }
28044
28045 MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
28046 #if defined(_WIN32) || defined(_WIN64)
28047   if (mdbx_SetFileInformationByHandle) {
28048     FILE_END_OF_FILE_INFO EndOfFileInfo;
28049     EndOfFileInfo.EndOfFile.QuadPart = length;
28050     return mdbx_SetFileInformationByHandle(fd, FileEndOfFileInfo,
28051                                            &EndOfFileInfo,
28052                                            sizeof(FILE_END_OF_FILE_INFO))
28053                ? MDBX_SUCCESS
28054                : (int)GetLastError();
28055   } else {
28056     LARGE_INTEGER li;
28057     li.QuadPart = length;
28058     return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd))
28059                ? MDBX_SUCCESS
28060                : (int)GetLastError();
28061   }
28062 #else
28063   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
28064                     "libmdbx requires 64-bit file I/O on 64-bit systems");
28065   return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno;
28066 #endif
28067 }
28068
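/* Editorial illustration (not part of the libmdbx sources): osal_ftruncate()
 * both grows and shrinks a file, and osal_filesize() reads the size back
 * through the same 64-bit-clean path on every platform. */
#if 0 /* usage sketch */
static int example_grow_file(mdbx_filehandle_t fd) {
  int rc = osal_ftruncate(fd, UINT64_C(1) << 20 /* 1 MiB */);
  if (rc != MDBX_SUCCESS)
    return rc;
  uint64_t size = 0;
  rc = osal_filesize(fd, &size);
  assert(rc != MDBX_SUCCESS || size == UINT64_C(1) << 20);
  return rc;
}
#endif /* usage sketch */

28069 MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) {
28070 #if defined(_WIN32) || defined(_WIN64)
28071   LARGE_INTEGER li;
28072   li.QuadPart = pos;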
28073   return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? MDBX_SUCCESS
28074                                                     : (int)GetLastError();
28075 #else
28076   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
28077                     "libmdbx requires 64-bit file I/O on 64-bit systems");
28078   return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS;
28079 #endif
28080 }
28081
28082 /*----------------------------------------------------------------------------*/
28083
28084 MDBX_INTERNAL_FUNC int
28085 osal_thread_create(osal_thread_t *thread,
28086                    THREAD_RESULT(THREAD_CALL *start_routine)(void *),
28087                    void *arg) {
28088 #if defined(_WIN32) || defined(_WIN64)
28089   *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL);
28090   return *thread ? MDBX_SUCCESS : (int)GetLastError();
28091 #else
28092   return pthread_create(thread, NULL, start_routine, arg);
28093 #endif
28094 }
28095
28096 MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) {
28097 #if defined(_WIN32) || defined(_WIN64)
28098   DWORD code = WaitForSingleObject(thread, INFINITE);
28099   return waitstatus2errcode(code);
28100 #else
28101   void *unused_retval = &unused_retval;
28102   return pthread_join(thread, &unused_retval);
28103 #endif
28104 }
28105
28106 /*----------------------------------------------------------------------------*/
28107
28108 MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset,
28109                                   size_t length,
28110                                   enum osal_syncmode_bits mode_bits) {
28111   uint8_t *ptr = (uint8_t *)map->address + offset;
28112 #if defined(_WIN32) || defined(_WIN64)
28113   if (!FlushViewOfFile(ptr, length))
28114     return (int)GetLastError();
28115 #else
28116 #if defined(__linux__) || defined(__gnu_linux__)
28117   if (mode_bits == MDBX_SYNC_NONE && linux_kernel_version > 0x02061300)
28118     /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly
28119      * tracks dirty pages and flushes them to storage as necessary. */
28120     return MDBX_SUCCESS;
28121 #endif /* Linux */
28122   if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC))
28123     return errno;
28124   mode_bits &= ~MDBX_SYNC_DATA;
28125 #endif
28126   return osal_fsync(map->fd, mode_bits);
28127 }
28128
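/* Editorial illustration (not part of the libmdbx sources): flushing a dirty
 * region of a writable mapping with osal_msync(). MDBX_SYNC_DATA requests a
 * synchronous flush of the pages; with MDBX_SYNC_NONE the call is a no-op on
 * modern Linux, as noted above. */
#if 0 /* usage sketch */
static int example_flush_first_page(osal_mmap_t *map) {
  return osal_msync(map, 0 /* offset */, 4096 /* length */, MDBX_SYNC_DATA);
}
#endif /* usage sketch */

28129 MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle,
28130                                             const pathchar_t *pathname,
28131                                             int err) {
28132 #if defined(_WIN32) || defined(_WIN64)
28133   (void)pathname;
28134   (void)err;
28135   if (!mdbx_GetVolumeInformationByHandleW)
28136     return MDBX_ENOSYS;
28137   DWORD unused, flags;
28138   if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused,
28139                                           &flags, nullptr, 0))
28140     return (int)GetLastError();
28141   if ((flags & FILE_READ_ONLY_VOLUME) == 0)
28142     return MDBX_EACCESS;
28143 #else
28144   struct statvfs info;
28145   if (err != MDBX_ENOFILE) {
28146     if (statvfs(pathname, &info) == 0 && (info.f_flag & ST_RDONLY) == 0)
28147       return err;
28148     if (errno != MDBX_ENOFILE)
28149       return errno;
28150   }
28151   if (fstatvfs(handle, &info))
28152     return errno;
28153   if ((info.f_flag & ST_RDONLY) == 0)
28154     return (err == MDBX_ENOFILE) ?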
MDBX_EACCESS : err; 28155 #endif /* !Windows */ 28156 return MDBX_SUCCESS; 28157 } 28158 28159 static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { 28160 #if defined(_WIN32) || defined(_WIN64) 28161 if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) 28162 return ERROR_NOT_CAPABLE /* workaround for Wine */; 28163 28164 if (GetFileType(handle) != FILE_TYPE_DISK) 28165 return ERROR_FILE_OFFLINE; 28166 28167 if (mdbx_GetFileInformationByHandleEx) { 28168 FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo; 28169 if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo, 28170 &RemoteProtocolInfo, 28171 sizeof(RemoteProtocolInfo))) { 28172 if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) && 28173 !(flags & MDBX_RDONLY)) 28174 return ERROR_FILE_OFFLINE; 28175 if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) && 28176 !(flags & MDBX_EXCLUSIVE)) 28177 return ERROR_REMOTE_STORAGE_MEDIA_ERROR; 28178 } 28179 } 28180 28181 if (mdbx_NtFsControlFile) { 28182 NTSTATUS rc; 28183 struct { 28184 WOF_EXTERNAL_INFO wof_info; 28185 union { 28186 WIM_PROVIDER_EXTERNAL_INFO wim_info; 28187 FILE_PROVIDER_EXTERNAL_INFO_V1 file_info; 28188 }; 28189 size_t reserved_for_microsoft_madness[42]; 28190 } GetExternalBacking_OutputBuffer; 28191 IO_STATUS_BLOCK StatusBlock; 28192 rc = mdbx_NtFsControlFile(handle, NULL, NULL, NULL, &StatusBlock, 28193 FSCTL_GET_EXTERNAL_BACKING, NULL, 0, 28194 &GetExternalBacking_OutputBuffer, 28195 sizeof(GetExternalBacking_OutputBuffer)); 28196 if (NT_SUCCESS(rc)) { 28197 if (!(flags & MDBX_EXCLUSIVE)) 28198 return ERROR_REMOTE_STORAGE_MEDIA_ERROR; 28199 } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && 28200 rc != STATUS_INVALID_DEVICE_REQUEST && 28201 rc != STATUS_NOT_SUPPORTED) 28202 return ntstatus2errcode(rc); 28203 } 28204 28205 if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) { 28206 WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX); 28207 if (!PathBuffer) 28208 return MDBX_ENOMEM; 28209 28210 int rc = MDBX_SUCCESS; 28211 DWORD VolumeSerialNumber, FileSystemFlags; 28212 if (!mdbx_GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX, 28213 &VolumeSerialNumber, NULL, 28214 &FileSystemFlags, NULL, 0)) { 28215 rc = (int)GetLastError(); 28216 goto bailout; 28217 } 28218 28219 if ((flags & MDBX_RDONLY) == 0) { 28220 if (FileSystemFlags & 28221 (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME | 28222 FILE_VOLUME_IS_COMPRESSED)) { 28223 rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; 28224 goto bailout; 28225 } 28226 } 28227 28228 if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, 28229 FILE_NAME_NORMALIZED | VOLUME_NAME_NT)) { 28230 if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) { 28231 if (!(flags & MDBX_EXCLUSIVE)) { 28232 rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; 28233 goto bailout; 28234 } 28235 } 28236 } 28237 28238 if (F_ISSET(flags, MDBX_RDONLY | MDBX_EXCLUSIVE) && 28239 (FileSystemFlags & FILE_READ_ONLY_VOLUME)) { 28240 /* without-LCK (exclusive readonly) mode for DB on a read-only volume */ 28241 goto bailout; 28242 } 28243 28244 if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, 28245 FILE_NAME_NORMALIZED | 28246 VOLUME_NAME_DOS)) { 28247 UINT DriveType = GetDriveTypeW(PathBuffer); 28248 if (DriveType == DRIVE_NO_ROOT_DIR && 28249 _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 && 28250 _wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) { 28251 PathBuffer[7] = 0; 28252 DriveType = GetDriveTypeW(PathBuffer + 4); 28253 } 28254 switch (DriveType) { 28255 
case DRIVE_CDROM: 28256 if (flags & MDBX_RDONLY) 28257 break; 28258 // fall through 28259 case DRIVE_UNKNOWN: 28260 case DRIVE_NO_ROOT_DIR: 28261 case DRIVE_REMOTE: 28262 default: 28263 if (!(flags & MDBX_EXCLUSIVE)) 28264 rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; 28265 // fall through 28266 case DRIVE_REMOVABLE: 28267 case DRIVE_FIXED: 28268 case DRIVE_RAMDISK: 28269 break; 28270 } 28271 } 28272 28273 bailout: 28274 osal_free(PathBuffer); 28275 return rc; 28276 } 28277 28278 #else 28279 28280 struct statvfs statvfs_info; 28281 if (fstatvfs(handle, &statvfs_info)) 28282 return errno; 28283 #if defined(ST_LOCAL) || defined(ST_EXPORTED) 28284 const unsigned long st_flags = statvfs_info.f_flag; 28285 #endif /* ST_LOCAL || ST_EXPORTED */ 28286 28287 #if defined(__NetBSD__) 28288 const unsigned type = 0; 28289 const char *const name = statvfs_info.f_fstypename; 28290 const size_t name_len = VFS_NAMELEN; 28291 #elif defined(_AIX) || defined(__OS400__) 28292 const char *const name = statvfs_info.f_basetype; 28293 const size_t name_len = sizeof(statvfs_info.f_basetype); 28294 struct stat st; 28295 if (fstat(handle, &st)) 28296 return errno; 28297 const unsigned type = st.st_vfstype; 28298 if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE)) 28299 return MDBX_EREMOTE; 28300 #elif defined(FSTYPSZ) || defined(_FSTYPSZ) 28301 const unsigned type = 0; 28302 const char *const name = statvfs_info.f_basetype; 28303 const size_t name_len = sizeof(statvfs_info.f_basetype); 28304 #elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ 28305 defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ) 28306 const unsigned type = 0; 28307 struct stat st; 28308 if (fstat(handle, &st)) 28309 return errno; 28310 const char *const name = st.st_fstype; 28311 const size_t name_len = strlen(name); 28312 #else 28313 struct statfs statfs_info; 28314 if (fstatfs(handle, &statfs_info)) 28315 return errno; 28316 #if defined(__OpenBSD__) 28317 const unsigned type = 0; 28318 #else 28319 const unsigned type = statfs_info.f_type; 28320 #endif 28321 #if defined(MNT_LOCAL) || defined(MNT_EXPORTED) 28322 const unsigned long mnt_flags = statfs_info.f_flags; 28323 #endif /* MNT_LOCAL || MNT_EXPORTED */ 28324 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ 28325 defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ 28326 defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ 28327 defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) 28328 const char *const name = statfs_info.f_fstypename; 28329 const size_t name_len = sizeof(statfs_info.f_fstypename); 28330 #elif defined(__ANDROID_API__) && __ANDROID_API__ < 21 28331 const char *const name = ""; 28332 const unsigned name_len = 0; 28333 #else 28334 28335 const char *name = ""; 28336 unsigned name_len = 0; 28337 28338 struct stat st; 28339 if (fstat(handle, &st)) 28340 return errno; 28341 28342 char pathbuf[PATH_MAX]; 28343 FILE *mounted = nullptr; 28344 #if defined(__linux__) || defined(__gnu_linux__) 28345 mounted = setmntent("/proc/mounts", "r"); 28346 #endif /* Linux */ 28347 if (!mounted) 28348 mounted = setmntent("/etc/mtab", "r"); 28349 if (mounted) { 28350 const struct mntent *ent; 28351 #if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) || \ 28352 (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19)) 28353 struct mntent entbuf; 28354 const bool should_copy = false; 28355 while (nullptr != 28356 (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf)))) 28357 #else 28358 const bool should_copy = 
true;
28359     while (nullptr != (ent = getmntent(mounted)))
28360 #endif
28361     {
28362       struct stat mnt;
28363       if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) {
28364         if (should_copy) {
28365           name =
28366               strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1);
28367           pathbuf[name_len] = 0;
28368         } else {
28369           name = ent->mnt_fsname;
28370           name_len = strlen(name);
28371         }
28372         break;
28373       }
28374     }
28375     endmntent(mounted);
28376   }
28377 #endif /* !xBSD && !Android/Bionic */
28378 #endif
28379
28380   if (name_len) {
28381     if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) ||
28382          strncasecmp("cifs", name, name_len) == 0 ||
28383          strncasecmp("ncpfs", name, name_len) == 0 ||
28384          strncasecmp("smbfs", name, name_len) == 0 ||
28385          strcasecmp("9P" /* WSL2 */, name) == 0 ||
28386          ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) &&
28387           strncasecmp("fuseblk", name, name_len) != 0)) &&
28388         !(flags & MDBX_EXCLUSIVE))
28389       return MDBX_EREMOTE;
28390     if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 ||
28391         strcasecmp("sshfs", name) == 0)
28392       return MDBX_EREMOTE;
28393   }
28394
28395 #ifdef ST_LOCAL
28396   if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
28397     return MDBX_EREMOTE;
28398 #elif defined(MNT_LOCAL)
28399   if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
28400     return MDBX_EREMOTE;
28401 #endif /* ST/MNT_LOCAL */
28402
28403 #ifdef ST_EXPORTED
28404   if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
28405     return MDBX_EREMOTE;
28406 #elif defined(MNT_EXPORTED)
28407   if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
28408     return MDBX_EREMOTE;
28409 #endif /* ST/MNT_EXPORTED */
28410
28411   switch (type) {
28412   case 0xFF534D42 /* CIFS_MAGIC_NUMBER */:
28413   case 0x6969 /* NFS_SUPER_MAGIC */:
28414   case 0x564c /* NCP_SUPER_MAGIC */:
28415   case 0x517B /* SMB_SUPER_MAGIC */:
28416 #if defined(__digital__) || defined(__osf__) || defined(__osf)
28417   case 0x0E /* Tru64 NFS */:
28418 #endif
28419 #ifdef ST_FST_NFS
28420   case ST_FST_NFS:
28421 #endif
28422     if ((flags & MDBX_EXCLUSIVE) == 0)
28423       return MDBX_EREMOTE;
28424   case 0:
28425   default:
28426     break;
28427   }
28428 #endif /* Unix */
28429
28430   return MDBX_SUCCESS;
28431 }
28432
28433 static int check_mmap_limit(const size_t limit) {
28434   const bool should_check =
28435 #if defined(__SANITIZE_ADDRESS__)
28436       true;
28437 #else
28438       RUNNING_ON_VALGRIND;
28439 #endif /* __SANITIZE_ADDRESS__ */
28440
28441   if (should_check) {
28442     intptr_t pagesize, total_ram_pages, avail_ram_pages;
28443     int err =
28444         mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages);
28445     if (unlikely(err != MDBX_SUCCESS))
28446       return err;
28447
28448     const int log2page = log2n_powerof2(pagesize);
28449     if ((limit >> (log2page + 7)) > (size_t)total_ram_pages ||
28450         (limit >> (log2page + 6)) > (size_t)avail_ram_pages) {
28451       ERROR("%s (%zu pages) is too large for available (%zu pages) or total "
28452             "(%zu pages) system RAM",
28453             "database upper size limit", limit >> log2page, avail_ram_pages,
28454             total_ram_pages);
28455       return MDBX_TOO_LARGE;
28456     }
28457   }
28458
28459   return MDBX_SUCCESS;
28460 }
28461
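/* Editorial note (not part of the libmdbx sources): a worked example of the
 * heuristic above. limit >> (log2page + 7) equals limit / (pagesize * 128),
 * so with 4 KiB pages and 16 GiB of total RAM (2^22 pages) the check rejects
 * a database upper limit above 2^(12 + 7 + 22) bytes = 2 TiB, i.e. 128x the
 * total RAM (and, likewise, 64x the available RAM). The guard is active only
 * under ASAN or Valgrind, presumably because shadowing/tracking mappings of
 * such size is prohibitively expensive there. */

28462 MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map,
28463                                  const size_t size, const size_t limit,
28464                                  const unsigned options) {
28465   assert(size <= limit);
28466   map->limit = 0;
28467   map->current = 0;
28468   map->address = nullptr;
28469   map->filesize = 0;
28470 #if defined(_WIN32) || defined(_WIN64)
28471   map->section =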
NULL; 28472 #endif /* Windows */ 28473 28474 int err = osal_check_fs_local(map->fd, flags); 28475 if (unlikely(err != MDBX_SUCCESS)) 28476 return err; 28477 28478 err = check_mmap_limit(limit); 28479 if (unlikely(err != MDBX_SUCCESS)) 28480 return err; 28481 28482 if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { 28483 err = osal_ftruncate(map->fd, size); 28484 if (err != MDBX_SUCCESS) 28485 return err; 28486 map->filesize = size; 28487 #if !(defined(_WIN32) || defined(_WIN64)) 28488 map->current = size; 28489 #endif /* !Windows */ 28490 } else { 28491 err = osal_filesize(map->fd, &map->filesize); 28492 if (err != MDBX_SUCCESS) 28493 return err; 28494 #if !(defined(_WIN32) || defined(_WIN64)) 28495 map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; 28496 #endif /* !Windows */ 28497 } 28498 28499 #if defined(_WIN32) || defined(_WIN64) 28500 LARGE_INTEGER SectionSize; 28501 SectionSize.QuadPart = size; 28502 err = NtCreateSection( 28503 &map->section, 28504 /* DesiredAccess */ 28505 (flags & MDBX_WRITEMAP) 28506 ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | 28507 SECTION_MAP_WRITE 28508 : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, 28509 /* ObjectAttributes */ NULL, /* MaximumSize (InitialSize) */ &SectionSize, 28510 /* SectionPageProtection */ 28511 (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, 28512 /* AllocationAttributes */ SEC_RESERVE, map->fd); 28513 if (!NT_SUCCESS(err)) 28514 return ntstatus2errcode(err); 28515 28516 SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 0 28517 : mdbx_RunningUnderWine() ? size 28518 : limit; 28519 err = NtMapViewOfSection( 28520 map->section, GetCurrentProcess(), &map->address, 28521 /* ZeroBits */ 0, 28522 /* CommitSize */ 0, 28523 /* SectionOffset */ NULL, &ViewSize, 28524 /* InheritDisposition */ ViewUnmap, 28525 /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, 28526 /* Win32Protect */ 28527 (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY); 28528 if (!NT_SUCCESS(err)) { 28529 NtClose(map->section); 28530 map->section = 0; 28531 map->address = nullptr; 28532 return ntstatus2errcode(err); 28533 } 28534 assert(map->address != MAP_FAILED); 28535 28536 map->current = (size_t)SectionSize.QuadPart; 28537 map->limit = ViewSize; 28538 28539 #else /* Windows */ 28540 28541 #ifndef MAP_TRYFIXED 28542 #define MAP_TRYFIXED 0 28543 #endif 28544 28545 #ifndef MAP_HASSEMAPHORE 28546 #define MAP_HASSEMAPHORE 0 28547 #endif 28548 28549 #ifndef MAP_CONCEAL 28550 #define MAP_CONCEAL 0 28551 #endif 28552 28553 #ifndef MAP_NOSYNC 28554 #define MAP_NOSYNC 0 28555 #endif 28556 28557 #ifndef MAP_FIXED_NOREPLACE 28558 #define MAP_FIXED_NOREPLACE 0 28559 #endif 28560 28561 #ifndef MAP_NORESERVE 28562 #define MAP_NORESERVE 0 28563 #endif 28564 28565 map->address = mmap( 28566 NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, 28567 MAP_SHARED | MAP_FILE | MAP_NORESERVE | 28568 (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | 28569 ((options & MMAP_OPTION_SEMAPHORE) ? 
MAP_HASSEMAPHORE | MAP_NOSYNC
28570                            : MAP_CONCEAL),
28571       map->fd, 0);
28572
28573   if (unlikely(map->address == MAP_FAILED)) {
28574     map->limit = 0;
28575     map->current = 0;
28576     map->address = nullptr;
28577     return errno;
28578   }
28579   map->limit = limit;
28580
28581 #if MDBX_ENABLE_MADVISE
28582 #ifdef MADV_DONTFORK
28583   if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
28584     return errno;
28585 #endif /* MADV_DONTFORK */
28586 #ifdef MADV_NOHUGEPAGE
28587   (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
28588 #endif /* MADV_NOHUGEPAGE */
28589 #endif /* MDBX_ENABLE_MADVISE */
28590
28591 #endif /* ! Windows */
28592
28593   VALGRIND_MAKE_MEM_DEFINED(map->address, map->current);
28594   MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current);
28595   return MDBX_SUCCESS;
28596 }
28597
28598 MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) {
28599   VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
28600   /* Unpoisoning is required for ASAN to avoid a false-positive diagnostic
28601    * when this memory is re-used by malloc or another mmapping.
28602    * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203
28603    */
28604   MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address,
28605                                    (map->filesize && map->filesize < map->limit)
28606                                        ? map->filesize
28607                                        : map->limit);
28608 #if defined(_WIN32) || defined(_WIN64)
28609   if (map->section)
28610     NtClose(map->section);
28611   NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
28612   if (!NT_SUCCESS(rc))
28613     ntstatus2errcode(rc);
28614 #else
28615   if (unlikely(munmap(map->address, map->limit)))
28616     return errno;
28617 #endif /* ! Windows */
28618
28619   map->limit = 0;
28620   map->current = 0;
28621   map->address = nullptr;
28622   return MDBX_SUCCESS;
28623 }
28624
28625 MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map,
28626                                     size_t size, size_t limit) {
28627   assert(size <= limit);
28628 #if defined(_WIN32) || defined(_WIN64)
28629   assert(size != map->current || limit != map->limit || size < map->filesize);
28630
28631   NTSTATUS status;
28632   LARGE_INTEGER SectionSize;
28633   int err, rc = MDBX_SUCCESS;
28634
28635   if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current &&
28636       /* workaround for Wine */ mdbx_NtExtendSection) {
28637     /* grow the rw-section */
28638     SectionSize.QuadPart = size;
28639     status = mdbx_NtExtendSection(map->section, &SectionSize);
28640     if (!NT_SUCCESS(status))
28641       return ntstatus2errcode(status);
28642     map->current = size;
28643     if (map->filesize < size)
28644       map->filesize = size;
28645     return MDBX_SUCCESS;
28646   }
28647
28648   if (limit > map->limit) {
28649     err = check_mmap_limit(limit);
28650     if (unlikely(err != MDBX_SUCCESS))
28651       return err;
28652
28653     /* check that the address space is able to grow before unmapping */
28654     PVOID BaseAddress = (PBYTE)map->address + map->limit;
28655     SIZE_T RegionSize = limit - map->limit;
28656     status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0,
28657                                      &RegionSize, MEM_RESERVE, PAGE_NOACCESS);
28658     if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
28659       return MDBX_UNABLE_EXTEND_MAPSIZE;
28660     if (!NT_SUCCESS(status))
28661       return ntstatus2errcode(status);
28662
28663     status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize,
28664                                  MEM_RELEASE);
28665     if (!NT_SUCCESS(status))
28666       return ntstatus2errcode(status);
28667   }
28668
28669   /* Windows is unable to:
28670    *  - shrink a mapped file;
28671    *  - change the size of a mapped view;
28672 * - extend a read-only mapping; 28673 * Therefore we must unmap and remap the entire section. */ 28674 if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) 28675 return MDBX_EPERM; 28676 28677 /* Unpoisoning is required for ASAN to avoid false-positive diagnostic 28678 * when this memory will be re-used by malloc or another mmapping. 28679 * See todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 28680 */ 28681 MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); 28682 status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); 28683 if (!NT_SUCCESS(status)) 28684 return ntstatus2errcode(status); 28685 status = NtClose(map->section); 28686 map->section = NULL; 28687 PVOID ReservedAddress = NULL; 28688 SIZE_T ReservedSize = limit; 28689 28690 if (!NT_SUCCESS(status)) { 28691 bailout_ntstatus: 28692 err = ntstatus2errcode(status); 28693 bailout: 28694 map->address = NULL; 28695 map->current = map->limit = 0; 28696 if (ReservedAddress) { 28697 ReservedSize = 0; 28698 status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, 28699 &ReservedSize, MEM_RELEASE); 28700 assert(NT_SUCCESS(status)); 28701 (void)status; 28702 } 28703 return err; 28704 } 28705 28706 retry_file_and_section: 28707 /* resizing of the file may take a while, 28708 * therefore we reserve the address space to avoid it being occupied by other threads */ 28709 ReservedAddress = map->address; 28710 status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, 28711 &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); 28712 if (!NT_SUCCESS(status)) { 28713 ReservedAddress = NULL; 28714 if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) 28715 goto bailout_ntstatus /* no way to recover */; 28716 28717 if (flags & MDBX_MRESIZE_MAY_MOVE) 28718 /* the base address could be changed */ 28719 map->address = NULL; 28720 } 28721 28722 err = osal_filesize(map->fd, &map->filesize); 28723 if (err != MDBX_SUCCESS) 28724 goto bailout; 28725 28726 if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { 28727 err = osal_ftruncate(map->fd, size); 28728 if (err == MDBX_SUCCESS) 28729 map->filesize = size; 28730 /* ignore the error, because Windows cannot shrink a file 28731 * that is already mapped (by another process) */ 28732 } 28733 28734 SectionSize.QuadPart = size; 28735 status = NtCreateSection( 28736 &map->section, 28737 /* DesiredAccess */ 28738 (flags & MDBX_WRITEMAP) 28739 ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | 28740 SECTION_MAP_WRITE 28741 : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, 28742 /* ObjectAttributes */ NULL, 28743 /* MaximumSize (InitialSize) */ &SectionSize, 28744 /* SectionPageProtection */ 28745 (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, 28746 /* AllocationAttributes */ SEC_RESERVE, map->fd); 28747 28748 if (!NT_SUCCESS(status)) 28749 goto bailout_ntstatus; 28750 28751 if (ReservedAddress) { 28752 /* release reserved address space */ 28753 ReservedSize = 0; 28754 status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, 28755 &ReservedSize, MEM_RELEASE); 28756 ReservedAddress = NULL; 28757 if (!NT_SUCCESS(status)) 28758 goto bailout_ntstatus; 28759 } 28760 28761 retry_mapview:; 28762 SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit; 28763 status = NtMapViewOfSection( 28764 map->section, GetCurrentProcess(), &map->address, 28765 /* ZeroBits */ 0, 28766 /* CommitSize */ 0, 28767 /* SectionOffset */ NULL, &ViewSize, 28768 /* InheritDisposition */ ViewUnmap, 28769 /* AllocationType */ (flags & MDBX_RDONLY) ?
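/* Note (an editorial gloss, not upstream commentary): SEC_RESERVE creates
 * the section with reserved-but-uncommitted pages, and MEM_RESERVE on the
 * writable view likewise reserves address space up to `limit` while only
 * the file-backed part is materialized, which is what later allows the
 * section to be extended without moving the view. */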
0 : MEM_RESERVE, 28770 /* Win32Protect */ 28771 (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY); 28772 28773 if (!NT_SUCCESS(status)) { 28774 if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && 28775 map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { 28776 /* try to remap at another base address */ 28777 map->address = NULL; 28778 goto retry_mapview; 28779 } 28780 NtClose(map->section); 28781 map->section = NULL; 28782 28783 if (map->address && (size != map->current || limit != map->limit)) { 28784 /* try to remap with the previous size and limit, 28785 * but return MDBX_UNABLE_EXTEND_MAPSIZE on success */ 28786 rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; 28787 size = map->current; 28788 ReservedSize = limit = map->limit; 28789 goto retry_file_and_section; 28790 } 28791 28792 /* no way to recover */ 28793 goto bailout_ntstatus; 28794 } 28795 assert(map->address != MAP_FAILED); 28796 28797 map->current = (size_t)SectionSize.QuadPart; 28798 map->limit = ViewSize; 28799 28800 #else /* Windows */ 28801 28802 map->filesize = 0; 28803 int rc = osal_filesize(map->fd, &map->filesize); 28804 if (rc != MDBX_SUCCESS) 28805 return rc; 28806 28807 if (flags & MDBX_RDONLY) { 28808 map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; 28809 if (map->current != size) 28810 rc = (size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; 28811 } else { 28812 if (map->filesize != size) { 28813 rc = osal_ftruncate(map->fd, size); 28814 if (rc != MDBX_SUCCESS) 28815 return rc; 28816 map->filesize = size; 28817 } 28818 28819 if (map->current > size) { 28820 /* Clear ASAN's bitmask for the region released by the shrinking, 28821 * since: 28822 * - after the shrinking we will get an exception when accessing 28823 * this region and (therefore) do not need the help of ASAN; 28824 * - this allows us to clear the mask only within the file size 28825 * when closing the mapping. */ 28826 MDBX_ASAN_UNPOISON_MEMORY_REGION( 28827 (char *)map->address + size, 28828 ((map->current < map->limit) ? map->current : map->limit) - size); 28829 } 28830 map->current = size; 28831 } 28832 28833 if (limit == map->limit) 28834 return rc; 28835 28836 if (limit < map->limit) { 28837 /* unmap the excess at the end of the mapping. */ 28838 // coverity[offset_free : FALSE] 28839 if (unlikely(munmap(map->dxb + limit, map->limit - limit))) 28840 return errno; 28841 map->limit = limit; 28842 return rc; 28843 } 28844 28845 int err = check_mmap_limit(limit); 28846 if (unlikely(err != MDBX_SUCCESS)) 28847 return err; 28848 28849 assert(limit > map->limit); 28850 uint8_t *ptr = MAP_FAILED; 28851 28852 #if defined(MREMAP_MAYMOVE) 28853 ptr = mremap(map->address, map->limit, limit, 28854 (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0); 28855 if (ptr == MAP_FAILED) { 28856 err = errno; 28857 switch (err) { 28858 default: 28859 return err; 28860 case EAGAIN: 28861 case ENOMEM: 28862 return MDBX_UNABLE_EXTEND_MAPSIZE; 28863 case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */: 28864 break; 28865 } 28866 } 28867 #endif /* MREMAP_MAYMOVE */ 28868 28869 const unsigned mmap_flags = 28870 MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE | 28871 (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0); 28872 const unsigned mmap_prot = 28873 (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ; 28874 28875 if (ptr == MAP_FAILED) { 28876 /* Try to mmap additional space beyond the end of mapping.
*/ 28877 ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, 28878 mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); 28879 if (ptr == map->dxb + map->limit) 28880 ptr = map->dxb; 28881 else if (ptr != MAP_FAILED) { 28882 /* the desired address is busy, unmap the unsuitable one */ 28883 if (unlikely(munmap(ptr, limit - map->limit))) 28884 return errno; 28885 ptr = MAP_FAILED; 28886 } else { 28887 err = errno; 28888 switch (err) { 28889 default: 28890 return err; 28891 case EAGAIN: 28892 case ENOMEM: 28893 return MDBX_UNABLE_EXTEND_MAPSIZE; 28894 case EEXIST: /* address busy */ 28895 case EINVAL: /* kernel doesn't support MAP_FIXED_NOREPLACE */ 28896 break; 28897 } 28898 } 28899 } 28900 28901 if (ptr == MAP_FAILED) { 28902 /* unmap and remap the whole region */ 28903 if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { 28904 /* TODO: Perhaps it is worth implementing thread suspend/resume here 28905 * and performing unmap/remap as is done for Windows. */ 28906 return MDBX_UNABLE_EXTEND_MAPSIZE; 28907 } 28908 28909 if (unlikely(munmap(map->address, map->limit))) 28910 return errno; 28911 28912 // coverity[pass_freed_arg : FALSE] 28913 ptr = mmap(map->address, limit, mmap_prot, 28914 (flags & MDBX_MRESIZE_MAY_MOVE) 28915 ? mmap_flags 28916 : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE 28917 : MAP_FIXED), 28918 map->fd, 0); 28919 if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && 28920 unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && 28921 errno == /* kernel doesn't support MAP_FIXED_NOREPLACE */ EINVAL) 28922 // coverity[pass_freed_arg : FALSE] 28923 ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, 28924 map->fd, 0); 28925 28926 if (unlikely(ptr == MAP_FAILED)) { 28927 /* try to restore the previous mapping */ 28928 // coverity[pass_freed_arg : FALSE] 28929 ptr = mmap(map->address, map->limit, mmap_prot, 28930 (flags & MDBX_MRESIZE_MAY_MOVE) 28931 ? mmap_flags 28932 : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE 28933 : MAP_FIXED), 28934 map->fd, 0); 28935 if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && 28936 unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && 28937 errno == /* kernel doesn't support MAP_FIXED_NOREPLACE */ EINVAL) 28938 // coverity[pass_freed_arg : FALSE] 28939 ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, 28940 map->fd, 0); 28941 if (unlikely(ptr == MAP_FAILED)) { 28942 VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); 28943 /* Unpoisoning is required for ASAN to avoid false-positive diagnostic 28944 * when this memory will be re-used by malloc or another mmapping. 28945 * See 28946 * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 28947 */ 28948 MDBX_ASAN_UNPOISON_MEMORY_REGION( 28949 map->address, 28950 (map->current < map->limit) ? map->current : map->limit); 28951 map->limit = 0; 28952 map->current = 0; 28953 map->address = nullptr; 28954 return errno; 28955 } 28956 rc = MDBX_UNABLE_EXTEND_MAPSIZE; 28957 limit = map->limit; 28958 } 28959 } 28960 28961 assert(ptr && ptr != MAP_FAILED); 28962 if (map->address != ptr) { 28963 VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); 28964 /* Unpoisoning is required for ASAN to avoid false-positive diagnostic 28965 * when this memory will be re-used by malloc or another mmapping. 28966 * See 28967 * todo4recovery://erased_by_github/libmdbx/pull/93#issuecomment-613687203 28968 */ 28969 MDBX_ASAN_UNPOISON_MEMORY_REGION( 28970 map->address, (map->current < map->limit) ?
map->current : map->limit); 28971 28972 VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); 28973 MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); 28974 map->address = ptr; 28975 } 28976 map->limit = limit; 28977 28978 #if MDBX_ENABLE_MADVISE 28979 #ifdef MADV_DONTFORK 28980 if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) 28981 return errno; 28982 #endif /* MADV_DONTFORK */ 28983 #ifdef MADV_NOHUGEPAGE 28984 (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); 28985 #endif /* MADV_NOHUGEPAGE */ 28986 #endif /* MDBX_ENABLE_MADVISE */ 28987 28988 #endif /* POSIX / Windows */ 28989 28990 return rc; 28991 } 28992 28993 /*----------------------------------------------------------------------------*/ 28994 28995 __cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { 28996 for (;;) { 28997 #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ 28998 defined(__x86_64__) 28999 const unsigned salt = 277u * (unsigned)__rdtsc(); 29000 #elif (defined(_WIN32) || defined(_WIN64)) && MDBX_WITHOUT_MSVC_CRT 29001 static ULONG state; 29002 const unsigned salt = (unsigned)RtlRandomEx(&state); 29003 #else 29004 const unsigned salt = rand(); 29005 #endif 29006 29007 const unsigned coin = salt % (tiny ? 29u : 43u); 29008 if (coin < 43 / 3) 29009 break; 29010 #if defined(_WIN32) || defined(_WIN64) 29011 SwitchToThread(); 29012 if (coin > 43 * 2 / 3) 29013 Sleep(1); 29014 #else 29015 sched_yield(); 29016 if (coin > 43 * 2 / 3) 29017 usleep(coin); 29018 #endif 29019 } 29020 } 29021 29022 #if defined(_WIN32) || defined(_WIN64) 29023 #elif defined(__APPLE__) || defined(__MACH__) 29024 #include <mach/mach_time.h> 29025 #elif defined(__linux__) || defined(__gnu_linux__) 29026 __cold static clockid_t choice_monoclock(void) { 29027 struct timespec probe; 29028 #if defined(CLOCK_BOOTTIME) 29029 if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0) 29030 return CLOCK_BOOTTIME; 29031 #elif defined(CLOCK_MONOTONIC_RAW) 29032 if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0) 29033 return CLOCK_MONOTONIC_RAW; 29034 #elif defined(CLOCK_MONOTONIC_COARSE) 29035 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0) 29036 return CLOCK_MONOTONIC_COARSE; 29037 #endif 29038 return CLOCK_MONOTONIC; 29039 } 29040 #endif 29041 29042 /*----------------------------------------------------------------------------*/ 29043 29044 #if defined(_WIN32) || defined(_WIN64) 29045 static LARGE_INTEGER performance_frequency; 29046 #elif defined(__APPLE__) || defined(__MACH__) 29047 static uint64_t ratio_16dot16_to_monotine; 29048 #endif 29049 29050 MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { 29051 #if defined(_WIN32) || defined(_WIN64) 29052 if (unlikely(performance_frequency.QuadPart == 0)) 29053 QueryPerformanceFrequency(&performance_frequency); 29054 const uint64_t ratio = performance_frequency.QuadPart; 29055 #elif defined(__APPLE__) || defined(__MACH__) 29056 if (unlikely(ratio_16dot16_to_monotine == 0)) { 29057 mach_timebase_info_data_t ti; 29058 mach_timebase_info(&ti); 29059 ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; 29060 } 29061 const uint64_t ratio = ratio_16dot16_to_monotine; 29062 #else 29063 const uint64_t ratio = UINT64_C(1000000000); 29064 #endif 29065 const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16; 29066 return likely(ret || seconds_16dot16 == 0) ? 
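/* Worked example (illustrative): on POSIX the ratio is 1e9 ns/s, so for
 * seconds_16dot16 = 0x18000 (i.e. 1.5 in 16.16 fixed-point, 98304 decimal)
 * the result is (1000000000 * 98304 + 32768) >> 16 == 1500000000 ns; the
 * added 32768 (half of 2^16) makes the shift round to nearest rather than
 * truncate. */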
ret : /* fix underflow */ 1; 29067 } 29068 29069 MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { 29070 static uint64_t limit; 29071 if (unlikely(monotime > limit)) { 29072 if (likely(limit != 0)) 29073 return UINT32_MAX; 29074 limit = osal_16dot16_to_monotime(UINT32_MAX - 1); 29075 if (unlikely(monotime > limit)) 29076 return UINT32_MAX; 29077 } 29078 const uint32_t ret = 29079 #if defined(_WIN32) || defined(_WIN64) 29080 (uint32_t)((monotime << 16) / performance_frequency.QuadPart); 29081 #elif defined(__APPLE__) || defined(__MACH__) 29082 (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); 29083 #else 29084 (uint32_t)(monotime * 128 / 1953125); 29085 #endif 29086 if (likely(ret > 0)) 29087 return ret; 29088 return monotime > 0 /* fix underflow */; 29089 } 29090 29091 MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { 29092 #if defined(_WIN32) || defined(_WIN64) 29093 LARGE_INTEGER counter; 29094 counter.QuadPart = 0; 29095 QueryPerformanceCounter(&counter); 29096 return counter.QuadPart; 29097 #elif defined(__APPLE__) || defined(__MACH__) 29098 return mach_absolute_time(); 29099 #else 29100 29101 #if defined(__linux__) || defined(__gnu_linux__) 29102 static clockid_t posix_clockid = -1; 29103 if (unlikely(posix_clockid < 0)) 29104 posix_clockid = choice_monoclock(); 29105 #elif defined(CLOCK_MONOTONIC) 29106 #define posix_clockid CLOCK_MONOTONIC 29107 #else 29108 #define posix_clockid CLOCK_REALTIME 29109 #endif 29110 29111 struct timespec ts; 29112 if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) { 29113 ts.tv_nsec = 0; 29114 ts.tv_sec = 0; 29115 } 29116 return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; 29117 #endif 29118 } 29119 29120 /*----------------------------------------------------------------------------*/ 29121 29122 static void bootid_shake(bin128_t *p) { 29123 /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */ 29124 const uint32_t e = p->a - (p->b << 23 | p->b >> 9); 29125 p->a = p->b ^ (p->c << 16 | p->c >> 16); 29126 p->b = p->c + (p->d << 11 | p->d >> 21); 29127 p->c = p->d + e; 29128 p->d = e + p->a; 29129 } 29130 29131 __cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { 29132 p->y += UINT64_C(64526882297375213); 29133 bootid_shake(p); 29134 for (size_t i = 0; i < n; ++i) { 29135 bootid_shake(p); 29136 p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i]; 29137 bootid_shake(p); 29138 p->y += 14621231; 29139 } 29140 bootid_shake(p); 29141 29142 /* minor non-linear tomfoolery */ 29143 const unsigned z = p->x % 61; 29144 p->y = p->y << z | p->y >> (64 - z); 29145 bootid_shake(p); 29146 bootid_shake(p); 29147 const unsigned q = p->x % 59; 29148 p->y = p->y << q | p->y >> (64 - q); 29149 bootid_shake(p); 29150 bootid_shake(p); 29151 bootid_shake(p); 29152 } 29153 29154 #if defined(_WIN32) || defined(_WIN64) 29155 29156 static uint64_t windows_systemtime_ms() { 29157 FILETIME ft; 29158 GetSystemTimeAsFileTime(&ft); 29159 return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; 29160 } 29161 29162 static uint64_t windows_bootime(void) { 29163 unsigned confirmed = 0; 29164 uint64_t boottime = 0; 29165 uint64_t up0 = mdbx_GetTickCount64(); 29166 uint64_t st0 = windows_systemtime_ms(); 29167 for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) { 29168 YieldProcessor(); 29169 const uint64_t up1 = mdbx_GetTickCount64(); 29170 const uint64_t st1 = windows_systemtime_ms(); 29171 if (st1 > fuse && st1 == st0 && up1 == up0) { 29172 uint64_t diff = st1 - up1; 29173 
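/* Here st1 is wall-clock milliseconds since the Windows epoch and up1 is
 * milliseconds since boot, so their difference estimates the absolute
 * boot timestamp (illustrative numbers: st1 = 1660000000000 with one hour
 * of uptime, up1 = 3600000, give diff = 1659996400000). The surrounding
 * loop only accepts a value after observing the same difference several
 * times in a row, filtering out the differing granularities of the two
 * timers. */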
if (boottime == diff) { 29174 if (++confirmed > 4) 29175 return boottime; 29176 } else { 29177 confirmed = 0; 29178 boottime = diff; 29179 } 29180 fuse = st1; 29181 Sleep(1); 29182 } 29183 st0 = st1; 29184 up0 = up1; 29185 } 29186 return 0; 29187 } 29188 29189 static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue, 29190 PVOID pvData, LPDWORD pcbData) { 29191 LSTATUS rc; 29192 if (!mdbx_RegGetValueA) { 29193 /* an old Windows 2000/XP */ 29194 HKEY hSubKey; 29195 rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey); 29196 if (rc == ERROR_SUCCESS) { 29197 rc = RegQueryValueExA(hSubKey, lpValue, NULL, NULL, pvData, pcbData); 29198 RegCloseKey(hSubKey); 29199 } 29200 return rc; 29201 } 29202 29203 rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, NULL, pvData, 29204 pcbData); 29205 if (rc != ERROR_FILE_NOT_FOUND) 29206 return rc; 29207 29208 rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, 29209 RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */, 29210 NULL, pvData, pcbData); 29211 if (rc != ERROR_FILE_NOT_FOUND) 29212 return rc; 29213 return mdbx_RegGetValueA(hKey, lpSubKey, lpValue, 29214 RRF_RT_ANY | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */, 29215 NULL, pvData, pcbData); 29216 } 29217 #endif 29218 29219 __cold MDBX_MAYBE_UNUSED static bool 29220 bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { 29221 if (n > 31) { 29222 unsigned bits = 0; 29223 for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ { 29224 uint8_t c = ((const uint8_t *)p)[i]; 29225 if (c >= '0' && c <= '9') 29226 c -= '0'; 29227 else if (c >= 'a' && c <= 'f') 29228 c -= 'a' - 10; 29229 else if (c >= 'A' && c <= 'F') 29230 c -= 'A' - 10; 29231 else 29232 continue; 29233 assert(c <= 15); 29234 c ^= s->y >> 60; 29235 s->y = s->y << 4 | s->x >> 60; 29236 s->x = s->x << 4 | c; 29237 bits += 4; 29238 } 29239 if (bits > 42 * 3) 29240 /* UUID parsed successfully */ 29241 return true; 29242 } 29243 29244 if (n > 15) /* is enough handle it as a binary? */ { 29245 if (n == sizeof(bin128_t)) { 29246 bin128_t aligned; 29247 memcpy(&aligned, p, sizeof(bin128_t)); 29248 s->x += aligned.x; 29249 s->y += aligned.y; 29250 } else 29251 bootid_collect(s, p, n); 29252 return true; 29253 } 29254 29255 if (n) 29256 bootid_collect(s, p, n); 29257 return false; 29258 } 29259 29260 __cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) { 29261 bin128_t bin = {{0, 0}}; 29262 bool got_machineid = false, got_boottime = false, got_bootseq = false; 29263 29264 #if defined(__linux__) || defined(__gnu_linux__) 29265 { 29266 const int fd = 29267 open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW); 29268 if (fd != -1) { 29269 struct statfs fs; 29270 char buf[42]; 29271 const ssize_t len = 29272 (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0) 29273 ? 
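/* 0x9FA0 is Linux's PROC_SUPER_MAGIC: the fstatfs() check verifies that
 * the opened boot_id really resides on procfs (rather than on some
 * look-alike file) before its content, a textual UUID such as
 * "3281f812-e43e-4a07-8a95-f99cbf6e1b0f" (an illustrative value), is
 * trusted and parsed by bootid_parse_uuid() below. */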
read(fd, buf, sizeof(buf)) 29274 : -1; 29275 const int err = close(fd); 29276 assert(err == 0); 29277 (void)err; 29278 if (len > 0 && bootid_parse_uuid(&bin, buf, len)) 29279 return bin; 29280 } 29281 } 29282 #endif /* Linux */ 29283 29284 #if defined(__APPLE__) || defined(__MACH__) 29285 { 29286 char buf[42]; 29287 size_t len = sizeof(buf); 29288 if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) && 29289 bootid_parse_uuid(&bin, buf, len)) 29290 return bin; 29291 29292 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ 29293 __MAC_OS_X_VERSION_MIN_REQUIRED > 1050 29294 uuid_t uuid; 29295 struct timespec wait = {0, 1000000000u / 42}; 29296 if (!gethostuuid(uuid, &wait) && 29297 bootid_parse_uuid(&bin, uuid, sizeof(uuid))) 29298 got_machineid = true; 29299 #endif /* > 10.5 */ 29300 29301 struct timeval boottime; 29302 len = sizeof(boottime); 29303 if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) && 29304 len == sizeof(boottime) && boottime.tv_sec) 29305 got_boottime = true; 29306 } 29307 #endif /* Apple/Darwin */ 29308 29309 #if defined(_WIN32) || defined(_WIN64) 29310 { 29311 union buf { 29312 DWORD BootId; 29313 DWORD BaseTime; 29314 SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo; 29315 struct { 29316 LARGE_INTEGER BootTime; 29317 LARGE_INTEGER CurrentTime; 29318 LARGE_INTEGER TimeZoneBias; 29319 ULONG TimeZoneId; 29320 ULONG Reserved; 29321 ULONGLONG BootTimeBias; 29322 ULONGLONG SleepTimeBias; 29323 } SysTimeOfDayInfoHacked; 29324 wchar_t MachineGuid[42]; 29325 char DigitalProductId[248]; 29326 } buf; 29327 29328 static const char HKLM_MicrosoftCryptography[] = 29329 "SOFTWARE\\Microsoft\\Cryptography"; 29330 DWORD len = sizeof(buf); 29331 /* Windows is madness and must die */ 29332 if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography, 29333 "MachineGuid", &buf.MachineGuid, 29334 &len) == ERROR_SUCCESS && 29335 len < sizeof(buf)) 29336 got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len); 29337 29338 if (!got_machineid) { 29339 /* again, Windows is madness */ 29340 static const char HKLM_WindowsNT[] = 29341 "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion"; 29342 static const char HKLM_WindowsNT_DPK[] = 29343 "SOFTWARE\\Microsoft\\Windows " 29344 "NT\\CurrentVersion\\DefaultProductKey"; 29345 static const char HKLM_WindowsNT_DPK2[] = 29346 "SOFTWARE\\Microsoft\\Windows " 29347 "NT\\CurrentVersion\\DefaultProductKey2"; 29348 29349 len = sizeof(buf); 29350 if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT, 29351 "DigitalProductId", &buf.DigitalProductId, 29352 &len) == ERROR_SUCCESS && 29353 len > 42 && len < sizeof(buf)) { 29354 bootid_collect(&bin, &buf.DigitalProductId, len); 29355 got_machineid = true; 29356 } 29357 len = sizeof(buf); 29358 if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK, 29359 "DigitalProductId", &buf.DigitalProductId, 29360 &len) == ERROR_SUCCESS && 29361 len > 42 && len < sizeof(buf)) { 29362 bootid_collect(&bin, &buf.DigitalProductId, len); 29363 got_machineid = true; 29364 } 29365 len = sizeof(buf); 29366 if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2, 29367 "DigitalProductId", &buf.DigitalProductId, 29368 &len) == ERROR_SUCCESS && 29369 len > 42 && len < sizeof(buf)) { 29370 bootid_collect(&bin, &buf.DigitalProductId, len); 29371 got_machineid = true; 29372 } 29373 } 29374 29375 static const char HKLM_PrefetcherParams[] = 29376 "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory " 29377 "Management\\PrefetchParameters"; 29378 len = sizeof(buf); 29379 if 
(mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId", 29380 &buf.BootId, &len) == ERROR_SUCCESS && 29381 len > 1 && len < sizeof(buf)) { 29382 bootid_collect(&bin, &buf.BootId, len); 29383 got_bootseq = true; 29384 } 29385 29386 len = sizeof(buf); 29387 if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime", 29388 &buf.BaseTime, &len) == ERROR_SUCCESS && 29389 len >= sizeof(buf.BaseTime) && buf.BaseTime) { 29390 bootid_collect(&bin, &buf.BaseTime, len); 29391 got_boottime = true; 29392 } 29393 29394 /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */ 29395 NTSTATUS status = NtQuerySystemInformation( 29396 0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo, 29397 sizeof(buf.SysTimeOfDayInfo), &len); 29398 if (NT_SUCCESS(status) && 29399 len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) + 29400 sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) && 29401 buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) { 29402 const uint64_t UnbiasedBootTime = 29403 buf.SysTimeOfDayInfoHacked.BootTime.QuadPart - 29404 buf.SysTimeOfDayInfoHacked.BootTimeBias; 29405 if (UnbiasedBootTime) { 29406 bootid_collect(&bin, &UnbiasedBootTime, sizeof(UnbiasedBootTime)); 29407 got_boottime = true; 29408 } 29409 } 29410 29411 if (!got_boottime) { 29412 uint64_t boottime = windows_bootime(); 29413 if (boottime) { 29414 bootid_collect(&bin, &boottime, sizeof(boottime)); 29415 got_boottime = true; 29416 } 29417 } 29418 } 29419 #endif /* Windows */ 29420 29421 #if defined(CTL_HW) && defined(HW_UUID) 29422 if (!got_machineid) { 29423 static const int mib[] = {CTL_HW, HW_UUID}; 29424 char buf[42]; 29425 size_t len = sizeof(buf); 29426 if (sysctl( 29427 #ifdef SYSCTL_LEGACY_NONCONST_MIB 29428 (int *) 29429 #endif 29430 mib, 29431 ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) 29432 got_machineid = bootid_parse_uuid(&bin, buf, len); 29433 } 29434 #endif /* CTL_HW && HW_UUID */ 29435 29436 #if defined(CTL_KERN) && defined(KERN_HOSTUUID) 29437 if (!got_machineid) { 29438 static const int mib[] = {CTL_KERN, KERN_HOSTUUID}; 29439 char buf[42]; 29440 size_t len = sizeof(buf); 29441 if (sysctl( 29442 #ifdef SYSCTL_LEGACY_NONCONST_MIB 29443 (int *) 29444 #endif 29445 mib, 29446 ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) 29447 got_machineid = bootid_parse_uuid(&bin, buf, len); 29448 } 29449 #endif /* CTL_KERN && KERN_HOSTUUID */ 29450 29451 #if defined(__NetBSD__) 29452 if (!got_machineid) { 29453 char buf[42]; 29454 size_t len = sizeof(buf); 29455 if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0) 29456 got_machineid = bootid_parse_uuid(&bin, buf, len); 29457 } 29458 #endif /* __NetBSD__ */ 29459 29460 #if _XOPEN_SOURCE_EXTENDED 29461 if (!got_machineid) { 29462 const int hostid = gethostid(); 29463 if (hostid > 0) { 29464 bootid_collect(&bin, &hostid, sizeof(hostid)); 29465 got_machineid = true; 29466 } 29467 } 29468 #endif /* _XOPEN_SOURCE_EXTENDED */ 29469 29470 if (!got_machineid) { 29471 lack: 29472 bin.x = bin.y = 0; 29473 return bin; 29474 } 29475 29476 /*--------------------------------------------------------------------------*/ 29477 29478 #if defined(CTL_KERN) && defined(KERN_BOOTTIME) 29479 if (!got_boottime) { 29480 static const int mib[] = {CTL_KERN, KERN_BOOTTIME}; 29481 struct timeval boottime; 29482 size_t len = sizeof(boottime); 29483 if (sysctl( 29484 #ifdef SYSCTL_LEGACY_NONCONST_MIB 29485 (int *) 29486 #endif 29487 mib, 29488 ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 && 29489 len == sizeof(boottime) && boottime.tv_sec) { 29490 
bootid_collect(&bin, &boottime, len); 29491 got_boottime = true; 29492 } 29493 } 29494 #endif /* CTL_KERN && KERN_BOOTTIME */ 29495 29496 #if defined(__sun) || defined(__SVR4) || defined(__svr4__) 29497 if (!got_boottime) { 29498 kstat_ctl_t *kc = kstat_open(); 29499 if (kc) { 29500 kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc"); 29501 if (kp && kstat_read(kc, kp, 0) != -1) { 29502 kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time"); 29503 if (kn) { 29504 switch (kn->data_type) { 29505 case KSTAT_DATA_INT32: 29506 case KSTAT_DATA_UINT32: 29507 bootid_collect(&bin, &kn->value, sizeof(int32_t)); 29508 got_boottime = true; break; 29509 case KSTAT_DATA_INT64: 29510 case KSTAT_DATA_UINT64: 29511 bootid_collect(&bin, &kn->value, sizeof(int64_t)); 29512 got_boottime = true; break; 29513 } 29514 } 29515 } 29516 kstat_close(kc); 29517 } 29518 } 29519 #endif /* SunOS / Solaris */ 29520 29521 #if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME) 29522 if (!got_boottime) { 29523 setutxent(); 29524 const struct utmpx id = {.ut_type = BOOT_TIME}; 29525 const struct utmpx *entry = getutxid(&id); 29526 if (entry) { 29527 bootid_collect(&bin, entry, sizeof(*entry)); 29528 got_boottime = true; 29529 while (unlikely((entry = getutxid(&id)) != nullptr)) { 29530 /* there are multiple reboot records; assume we can distinguish the next 29531 * boot session even if the RTC is wrong or absent */ 29532 bootid_collect(&bin, entry, sizeof(*entry)); 29533 got_bootseq = true; 29534 } 29535 } 29536 endutxent(); 29537 } 29538 #endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */ 29539 29540 if (!got_bootseq) { 29541 if (!got_boottime || !MDBX_TRUST_RTC) 29542 goto lack; 29543 29544 #if defined(_WIN32) || defined(_WIN64) 29545 FILETIME now; 29546 GetSystemTimeAsFileTime(&now); 29547 if (0x1CCCCCC > now.dwHighDateTime) 29548 #else 29549 struct timespec mono, real; 29550 if (clock_gettime(CLOCK_MONOTONIC, &mono) || 29551 clock_gettime(CLOCK_REALTIME, &real) || 29552 /* wrong time, RTC is mad or absent */ 29553 1555555555l > real.tv_sec || 29554 /* seems to be no adjustment by RTC/NTP, i.e.
a fake time */ 29555 real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec || 29556 (real.tv_sec - mono.tv_sec) % 900u == 0) 29557 #endif 29558 goto lack; 29559 } 29560 29561 return bin; 29562 } 29563 29564 __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, 29565 intptr_t *avail_pages) { 29566 if (!page_size && !total_pages && !avail_pages) 29567 return MDBX_EINVAL; 29568 if (total_pages) 29569 *total_pages = -1; 29570 if (avail_pages) 29571 *avail_pages = -1; 29572 29573 const intptr_t pagesize = osal_syspagesize(); 29574 if (page_size) 29575 *page_size = pagesize; 29576 if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) 29577 return MDBX_INCOMPATIBLE; 29578 29579 MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize); 29580 assert(pagesize == (INT64_C(1) << log2page)); 29581 (void)log2page; 29582 29583 #if defined(_WIN32) || defined(_WIN64) 29584 MEMORYSTATUSEX info; 29585 memset(&info, 0, sizeof(info)); 29586 info.dwLength = sizeof(info); 29587 if (!GlobalMemoryStatusEx(&info)) 29588 return (int)GetLastError(); 29589 #endif 29590 29591 if (total_pages) { 29592 #if defined(_WIN32) || defined(_WIN64) 29593 const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page); 29594 #elif defined(_SC_PHYS_PAGES) 29595 const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES); 29596 if (total_ram_pages == -1) 29597 return errno; 29598 #elif defined(_SC_AIX_REALMEM) 29599 const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM); 29600 if (total_ram_Kb == -1) 29601 return errno; 29602 const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page; 29603 #elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \ 29604 defined(HW_PHYSMEM) 29605 size_t ram, len = sizeof(ram); 29606 static const int mib[] = { 29607 CTL_HW, 29608 #if defined(HW_USERMEM) 29609 HW_USERMEM 29610 #elif defined(HW_PHYSMEM64) 29611 HW_PHYSMEM64 29612 #elif defined(HW_MEMSIZE) 29613 HW_MEMSIZE 29614 #else 29615 HW_PHYSMEM 29616 #endif 29617 }; 29618 if (sysctl( 29619 #ifdef SYSCTL_LEGACY_NONCONST_MIB 29620 (int *) 29621 #endif 29622 mib, 29623 ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0) 29624 return errno; 29625 if (len != sizeof(ram)) 29626 return MDBX_ENOSYS; 29627 const intptr_t total_ram_pages = (intptr_t)(ram >> log2page); 29628 #else 29629 #error "FIXME: Get User-accessible or physical RAM" 29630 #endif 29631 *total_pages = total_ram_pages; 29632 if (total_ram_pages < 1) 29633 return MDBX_ENOSYS; 29634 } 29635 29636 if (avail_pages) { 29637 #if defined(_WIN32) || defined(_WIN64) 29638 const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page); 29639 #elif defined(_SC_AVPHYS_PAGES) 29640 const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES); 29641 if (avail_ram_pages == -1) 29642 return errno; 29643 #elif defined(__MACH__) 29644 mach_msg_type_number_t count = HOST_VM_INFO_COUNT; 29645 vm_statistics_data_t vmstat; 29646 mach_port_t mport = mach_host_self(); 29647 kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO, 29648 (host_info_t)&vmstat, &count); 29649 mach_port_deallocate(mach_task_self(), mport); 29650 if (unlikely(kerr != KERN_SUCCESS)) 29651 return MDBX_ENOSYS; 29652 const intptr_t avail_ram_pages = vmstat.free_count; 29653 #elif defined(VM_TOTAL) || defined(VM_METER) 29654 struct vmtotal info; 29655 size_t len = sizeof(info); 29656 static const int mib[] = { 29657 CTL_VM, 29658 #if defined(VM_TOTAL) 29659 VM_TOTAL 29660 #elif defined(VM_METER) 29661 VM_METER 29662 #endif 29663 }; 
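/* A minimal standalone sketch of the same two-step sysctl idiom (assuming
 * a BSD that provides CTL_VM/VM_TOTAL; illustrative only, not part of the
 * build):
 *
 *   #include <stdio.h>
 *   #include <sys/types.h>
 *   #include <sys/sysctl.h>
 *   #include <sys/vmmeter.h>
 *
 *   struct vmtotal vm;
 *   size_t len = sizeof(vm);
 *   int mib[2] = {CTL_VM, VM_TOTAL};
 *   if (sysctl(mib, 2, &vm, &len, NULL, 0) == 0 && len == sizeof(vm))
 *     printf("free pages: %ld\n", (long)vm.t_free);
 *
 * The kernel reports the actual payload size in `len`, which is then
 * validated against the expected structure size before the value is used,
 * exactly as in the call that follows. */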
29664 if (sysctl( 29665 #ifdef SYSCTL_LEGACY_NONCONST_MIB 29666 (int *) 29667 #endif 29668 mib, 29669 ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0) 29670 return errno; 29671 if (len != sizeof(info)) 29672 return MDBX_ENOSYS; 29673 const intptr_t avail_ram_pages = info.t_free; 29674 #else 29675 #error "FIXME: Get Available RAM" 29676 #endif 29677 *avail_pages = avail_ram_pages; 29678 if (avail_ram_pages < 1) 29679 return MDBX_ENOSYS; 29680 } 29681 29682 return MDBX_SUCCESS; 29683 } 29684 /* This is CMake-template for libmdbx's version.c 29685 ******************************************************************************/ 29686 29687 29688 #if MDBX_VERSION_MAJOR != 0 || \ 29689 MDBX_VERSION_MINOR != 12 29690 #error "API version mismatch! Had `git fetch --tags` done?" 29691 #endif 29692 29693 static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY); 29694 29695 __dll_export 29696 #ifdef __attribute_used__ 29697 __attribute_used__ 29698 #elif defined(__GNUC__) || __has_attribute(__used__) 29699 __attribute__((__used__)) 29700 #endif 29701 #ifdef __attribute_externally_visible__ 29702 __attribute_externally_visible__ 29703 #elif (defined(__GNUC__) && !defined(__clang__)) || \ 29704 __has_attribute(__externally_visible__) 29705 __attribute__((__externally_visible__)) 29706 #endif 29707 const struct MDBX_version_info mdbx_version = { 29708 0, 29709 12, 29710 1, 29711 0, 29712 {"2022-08-24T16:24:22+03:00", "0803c79d2d94f2d1496166a9a86bd47da18c7eed", "b36a07a512c1412d5753219aa8fc66cab75a012a", 29713 "v0.12.1-0-gb36a07a5"}, 29714 sourcery}; 29715 29716 __dll_export 29717 #ifdef __attribute_used__ 29718 __attribute_used__ 29719 #elif defined(__GNUC__) || __has_attribute(__used__) 29720 __attribute__((__used__)) 29721 #endif 29722 #ifdef __attribute_externally_visible__ 29723 __attribute_externally_visible__ 29724 #elif (defined(__GNUC__) && !defined(__clang__)) || \ 29725 __has_attribute(__externally_visible__) 29726 __attribute__((__externally_visible__)) 29727 #endif 29728 const char *const mdbx_sourcery_anchor = sourcery; 29729 /* 29730 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 29731 * and other libmdbx authors: please see AUTHORS file. 29732 * All rights reserved. 29733 * 29734 * Redistribution and use in source and binary forms, with or without 29735 * modification, are permitted only as authorized by the OpenLDAP 29736 * Public License. 29737 * 29738 * A copy of this license is available in the file LICENSE in the 29739 * top-level directory of the distribution or, alternatively, at 29740 * <http://www.OpenLDAP.org/license.html>. 29741 */ 29742 29743 #if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */ 29744 29745 /* PREAMBLE FOR WINDOWS: 29746 * 29747 * We are not concerned for performance here. 29748 * If you are running Windows a performance could NOT be the goal. 29749 * Otherwise please use Linux. */ 29750 29751 29752 static void mdbx_winnt_import(void); 29753 29754 #if MDBX_BUILD_SHARED_LIBRARY 29755 #if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG) 29756 /* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks. 29757 * 29758 * Define dll's entry point only for Release build when NDEBUG is defined and 29759 * MDBX_WITHOUT_MSVC_CRT=ON. 
If the entry point isn't defined then MSVC's linker will 29760 * automatically use DllMainCRTStartup() from the CRT library, which also 29761 * automatically calls DllMain() from our mdbx.dll */ 29762 #pragma comment(linker, "/ENTRY:DllMain") 29763 #endif /* MDBX_WITHOUT_MSVC_CRT */ 29764 29765 BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved) 29766 #else 29767 #if !MDBX_MANUAL_MODULE_HANDLER 29768 static 29769 #endif /* !MDBX_MANUAL_MODULE_HANDLER */ 29770 void NTAPI 29771 mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved) 29772 #endif /* MDBX_BUILD_SHARED_LIBRARY */ 29773 { 29774 (void)reserved; 29775 switch (reason) { 29776 case DLL_PROCESS_ATTACH: 29777 mdbx_winnt_import(); 29778 global_ctor(); 29779 break; 29780 case DLL_PROCESS_DETACH: 29781 global_dtor(); 29782 break; 29783 29784 case DLL_THREAD_ATTACH: 29785 break; 29786 case DLL_THREAD_DETACH: 29787 thread_dtor(module); 29788 break; 29789 } 29790 #if MDBX_BUILD_SHARED_LIBRARY 29791 return TRUE; 29792 #endif 29793 } 29794 29795 #if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER 29796 #if defined(_MSC_VER) 29797 # pragma const_seg(push) 29798 # pragma data_seg(push) 29799 29800 # ifndef _M_IX86 29801 /* kick the linker to create the TLS directory if not already done */ 29802 # pragma comment(linker, "/INCLUDE:_tls_used") 29803 /* Force some symbol references. */ 29804 # pragma comment(linker, "/INCLUDE:mdbx_tls_anchor") 29805 /* specific const-segment for WIN64 */ 29806 # pragma const_seg(".CRT$XLB") 29807 const 29808 # else 29809 /* kick the linker to create the TLS directory if not already done */ 29810 # pragma comment(linker, "/INCLUDE:__tls_used") 29811 /* Force some symbol references. */ 29812 # pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor") 29813 /* specific data-segment for WIN32 */ 29814 # pragma data_seg(".CRT$XLB") 29815 # endif 29816 29817 __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler; 29818 # pragma data_seg(pop) 29819 # pragma const_seg(pop) 29820 29821 #elif defined(__GNUC__) 29822 # ifndef _M_IX86 29823 const 29824 # endif 29825 PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler; 29826 #else 29827 # error FIXME 29828 #endif 29829 #endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */ 29830 29831 /*----------------------------------------------------------------------------*/ 29832 29833 #define LCK_SHARED 0 29834 #define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK 29835 #define LCK_WAITFOR 0 29836 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY 29837 29838 static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset, 29839 size_t bytes) { 29840 OVERLAPPED ov; 29841 ov.hEvent = 0; 29842 ov.Offset = (DWORD)offset; 29843 ov.OffsetHigh = HIGH_DWORD(offset); 29844 return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); 29845 } 29846 29847 static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, 29848 size_t bytes) { 29849 return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, 29850 HIGH_DWORD(bytes)); 29851 } 29852 29853 /*----------------------------------------------------------------------------*/ 29854 /* global `write` lock for write-txn processing, 29855 * (exclusive locking of both meta-pages) */ 29856 29857 #define LCK_MAXLEN (1u + ((~(size_t)0) >> 1)) 29858 #define LCK_META_OFFSET 0 29859 #define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS) 29860 #define LCK_BODY_OFFSET LCK_META_LEN 29861 #define LCK_BODY_LEN (LCK_MAXLEN -
LCK_BODY_OFFSET) 29862 #define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN 29863 #define LCK_WHOLE 0, LCK_MAXLEN 29864 29865 int mdbx_txn_lock(MDBX_env *env, bool dontwait) { 29866 if (dontwait) { 29867 if (!TryEnterCriticalSection(&env->me_windowsbug_lock)) 29868 return MDBX_BUSY; 29869 } else { 29870 __try { 29871 EnterCriticalSection(&env->me_windowsbug_lock); 29872 } 29873 __except ((GetExceptionCode() == 29874 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) 29875 ? EXCEPTION_EXECUTE_HANDLER 29876 : EXCEPTION_CONTINUE_SEARCH) { 29877 return ERROR_POSSIBLE_DEADLOCK; 29878 } 29879 } 29880 29881 if ((env->me_flags & MDBX_EXCLUSIVE) || 29882 flock(env->me_lazy_fd, 29883 dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) 29884 : (LCK_EXCLUSIVE | LCK_WAITFOR), 29885 LCK_BODY)) 29886 return MDBX_SUCCESS; 29887 int rc = (int)GetLastError(); 29888 LeaveCriticalSection(&env->me_windowsbug_lock); 29889 return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; 29890 } 29891 29892 void mdbx_txn_unlock(MDBX_env *env) { 29893 int rc = (env->me_flags & MDBX_EXCLUSIVE) 29894 ? TRUE 29895 : funlock(env->me_lazy_fd, LCK_BODY); 29896 LeaveCriticalSection(&env->me_windowsbug_lock); 29897 if (!rc) 29898 mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); 29899 } 29900 29901 /*----------------------------------------------------------------------------*/ 29902 /* global `read` lock for readers registration, 29903 * exclusive locking `mti_numreaders` (second) cacheline */ 29904 29905 #define LCK_LO_OFFSET 0 29906 #define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders) 29907 #define LCK_UP_OFFSET LCK_LO_LEN 29908 #define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET) 29909 #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN 29910 #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN 29911 29912 MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { 29913 osal_srwlock_AcquireShared(&env->me_remap_guard); 29914 if (env->me_lfd == INVALID_HANDLE_VALUE) 29915 return MDBX_SUCCESS; /* readonly database in readonly filesystem */ 29916 29917 /* transition from S-? (used) to S-E (locked), 29918 * e.g. exclusive lock upper-part */ 29919 if ((env->me_flags & MDBX_EXCLUSIVE) || 29920 flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) 29921 return MDBX_SUCCESS; 29922 29923 int rc = (int)GetLastError(); 29924 osal_srwlock_ReleaseShared(&env->me_remap_guard); 29925 return rc; 29926 } 29927 29928 MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { 29929 if (env->me_lfd != INVALID_HANDLE_VALUE) { 29930 /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ 29931 if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && 29932 !funlock(env->me_lfd, LCK_UPPER)) 29933 mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); 29934 } 29935 osal_srwlock_ReleaseShared(&env->me_remap_guard); 29936 } 29937 29938 MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { 29939 return flock(fd, 29940 wait ? LCK_EXCLUSIVE | LCK_WAITFOR 29941 : LCK_EXCLUSIVE | LCK_DONTWAIT, 29942 0, LCK_MAXLEN) 29943 ? MDBX_SUCCESS 29944 : (int)GetLastError(); 29945 } 29946 29947 static int suspend_and_append(mdbx_handle_array_t **array, 29948 const DWORD ThreadId) { 29949 const unsigned limit = (*array)->limit; 29950 if ((*array)->count == limit) { 29951 void *ptr = osal_realloc( 29952 (limit > ARRAY_LENGTH((*array)->handles)) 29953 ? 
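/* Growth note: the initial mdbx_handle_array_t usually lives on the
 * caller's stack with ARRAY_LENGTH(handles) inline slots, so the first
 * growth must pass NULL here (a plain allocation) and copy the stack
 * contents via the memcpy below, while every later growth reallocates the
 * heap block in place; `limit` doubles on each growth. */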
*array 29954 : /* don't free initial array on the stack */ NULL, 29955 sizeof(mdbx_handle_array_t) + 29956 sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles))); 29957 if (!ptr) 29958 return MDBX_ENOMEM; 29959 if (limit == ARRAY_LENGTH((*array)->handles)) 29960 memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); 29961 *array = (mdbx_handle_array_t *)ptr; 29962 (*array)->limit = limit * 2; 29963 } 29964 29965 HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, 29966 FALSE, ThreadId); 29967 if (hThread == NULL) 29968 return (int)GetLastError(); 29969 29970 if (SuspendThread(hThread) == (DWORD)-1) { 29971 int err = (int)GetLastError(); 29972 DWORD ExitCode; 29973 if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED || 29974 !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE) 29975 err = MDBX_SUCCESS; 29976 CloseHandle(hThread); 29977 return err; 29978 } 29979 29980 (*array)->handles[(*array)->count++] = hThread; 29981 return MDBX_SUCCESS; 29982 } 29983 29984 MDBX_INTERNAL_FUNC int 29985 osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { 29986 eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); 29987 const uintptr_t CurrentTid = GetCurrentThreadId(); 29988 int rc; 29989 if (env->me_lck_mmap.lck) { 29990 /* Scan LCK for threads of the current process */ 29991 const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers; 29992 const MDBX_reader *const end = 29993 begin + 29994 atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease); 29995 const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0; 29996 for (const MDBX_reader *reader = begin; reader < end; ++reader) { 29997 if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) { 29998 skip_lck: 29999 continue; 30000 } 30001 if (reader->mr_tid.weak == CurrentTid || 30002 reader->mr_tid.weak == WriteTxnOwner) 30003 goto skip_lck; 30004 30005 rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); 30006 if (rc != MDBX_SUCCESS) { 30007 bailout_lck: 30008 (void)osal_resume_threads_after_remap(*array); 30009 return rc; 30010 } 30011 } 30012 if (WriteTxnOwner && WriteTxnOwner != CurrentTid) { 30013 rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner); 30014 if (rc != MDBX_SUCCESS) 30015 goto bailout_lck; 30016 } 30017 } else { 30018 /* Without LCK (i.e. read-only mode). 
30019 * Walk through a snapshot of all running threads */ 30020 eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); 30021 const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); 30022 if (hSnapshot == INVALID_HANDLE_VALUE) 30023 return (int)GetLastError(); 30024 30025 THREADENTRY32 entry; 30026 entry.dwSize = sizeof(THREADENTRY32); 30027 30028 if (!Thread32First(hSnapshot, &entry)) { 30029 rc = (int)GetLastError(); 30030 bailout_toolhelp: 30031 CloseHandle(hSnapshot); 30032 (void)osal_resume_threads_after_remap(*array); 30033 return rc; 30034 } 30035 30036 do { 30037 if (entry.th32OwnerProcessID != env->me_pid || 30038 entry.th32ThreadID == CurrentTid) 30039 continue; 30040 30041 rc = suspend_and_append(array, entry.th32ThreadID); 30042 if (rc != MDBX_SUCCESS) 30043 goto bailout_toolhelp; 30044 30045 } while (Thread32Next(hSnapshot, &entry)); 30046 30047 rc = (int)GetLastError(); 30048 if (rc != ERROR_NO_MORE_FILES) 30049 goto bailout_toolhelp; 30050 CloseHandle(hSnapshot); 30051 } 30052 30053 return MDBX_SUCCESS; 30054 } 30055 30056 MDBX_INTERNAL_FUNC int 30057 osal_resume_threads_after_remap(mdbx_handle_array_t *array) { 30058 int rc = MDBX_SUCCESS; 30059 for (unsigned i = 0; i < array->count; ++i) { 30060 const HANDLE hThread = array->handles[i]; 30061 if (ResumeThread(hThread) == (DWORD)-1) { 30062 const int err = (int)GetLastError(); 30063 DWORD ExitCode; 30064 if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED && 30065 GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE) 30066 rc = err; 30067 } 30068 CloseHandle(hThread); 30069 } 30070 return rc; 30071 } 30072 30073 /*----------------------------------------------------------------------------*/ 30074 /* global `initial` lock for lockfile initialization, 30075 * exclusive/shared locking of the first cacheline */ 30076 30077 /* Brief description of the locking schema/algorithm: 30078 * - Windows does not support upgrading or downgrading for file locking. 30079 * - Therefore upgrading/downgrading is emulated by shared and exclusive 30080 * locking of upper and lower halves. 30081 * - In other words, we have an FSM with 9 possible states, 30082 * i.e. free/shared/exclusive x free/shared/exclusive == 9. 30083 * Only 6 of the FSM states are used, 2 of which are transitive. 30084 * 30085 * States: 30086 * ?-? = free, i.e. unlocked 30087 * S-? = used, i.e. shared lock 30088 * E-? = exclusive-read, i.e. operational exclusive 30089 * ?-S 30090 * ?-E = middle (transitive state) 30091 * S-S 30092 * S-E = locked (transitive state) 30093 * E-S 30094 * E-E = exclusive-write, i.e. exclusive due to (re)initialization 30095 * 30096 * The osal_lck_seize() moves the locking-FSM from the initial free/unlocked 30097 * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, 30098 * or to the "used" (and returns MDBX_RESULT_FALSE). 30099 * 30100 * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" 30101 * state to the "used" (i.e. shared) state. 30102 * 30103 * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) 30104 * state to the "exclusive write" state.
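 *
 * Illustrative trace (derived from the steps coded below, not normative):
 * the first process to call osal_lck_seize() walks ?-? >> ?-E >> E-E,
 * returns MDBX_RESULT_TRUE and may (re)initialize the lockfile, then
 * osal_lck_downgrade() walks E-E >> ?-E >> S-E >> S-? (used). A second
 * process seizing concurrently waits at the ?-E step while the first one
 * holds the upper half, then fails the E-E attempt since the lower half
 * is shared, falls back to S-E and settles at S-? with MDBX_RESULT_FALSE.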
30105 */ 30106 30107 static void lck_unlock(MDBX_env *env) { 30108 int err; 30109 30110 if (env->me_lfd != INVALID_HANDLE_VALUE) { 30111 /* double `unlock` for robustly remove overlapped shared/exclusive locks */ 30112 while (funlock(env->me_lfd, LCK_LOWER)) 30113 ; 30114 err = (int)GetLastError(); 30115 assert(err == ERROR_NOT_LOCKED || 30116 (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); 30117 (void)err; 30118 SetLastError(ERROR_SUCCESS); 30119 30120 while (funlock(env->me_lfd, LCK_UPPER)) 30121 ; 30122 err = (int)GetLastError(); 30123 assert(err == ERROR_NOT_LOCKED || 30124 (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); 30125 (void)err; 30126 SetLastError(ERROR_SUCCESS); 30127 } 30128 30129 if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { 30130 /* explicitly unlock to avoid latency for other processes (windows kernel 30131 * releases such locks via deferred queues) */ 30132 while (funlock(env->me_lazy_fd, LCK_BODY)) 30133 ; 30134 err = (int)GetLastError(); 30135 assert(err == ERROR_NOT_LOCKED || 30136 (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); 30137 (void)err; 30138 SetLastError(ERROR_SUCCESS); 30139 30140 while (funlock(env->me_lazy_fd, LCK_WHOLE)) 30141 ; 30142 err = (int)GetLastError(); 30143 assert(err == ERROR_NOT_LOCKED || 30144 (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); 30145 (void)err; 30146 SetLastError(ERROR_SUCCESS); 30147 } 30148 } 30149 30150 /* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) 30151 * or as 'used' (S-? and returns MDBX_RESULT_FALSE). 30152 * Otherwise returns an error. */ 30153 static int internal_seize_lck(HANDLE lfd) { 30154 int rc; 30155 assert(lfd != INVALID_HANDLE_VALUE); 30156 30157 /* 1) now on ?-? (free), get ?-E (middle) */ 30158 jitter4testing(false); 30159 if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { 30160 rc = (int)GetLastError() /* 2) something went wrong, give up */; 30161 ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); 30162 return rc; 30163 } 30164 30165 /* 3) now on ?-E (middle), try E-E (exclusive-write) */ 30166 jitter4testing(false); 30167 if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) 30168 return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; 30169 30170 /* 5) still on ?-E (middle) */ 30171 rc = (int)GetLastError(); 30172 jitter4testing(false); 30173 if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { 30174 /* 6) something went wrong, give up */ 30175 if (!funlock(lfd, LCK_UPPER)) 30176 mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", 30177 (int)GetLastError()); 30178 return rc; 30179 } 30180 30181 /* 7) still on ?-E (middle), try S-E (locked) */ 30182 jitter4testing(false); 30183 rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE 30184 : (int)GetLastError(); 30185 30186 jitter4testing(false); 30187 if (rc != MDBX_RESULT_FALSE) 30188 ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); 30189 30190 /* 8) now on S-E (locked) or still on ?-E (middle), 30191 * transition to S-? (used) or ?-? (free) */ 30192 if (!funlock(lfd, LCK_UPPER)) 30193 mdbx_panic("%s(%s) failed: err %u", __func__, 30194 "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError()); 30195 30196 /* 9) now on S-? (used, DONE) or ?-? 
(free, FAILURE) */ 30197 return rc; 30198 } 30199 30200 MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { 30201 int rc; 30202 30203 assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); 30204 if (env->me_flags & MDBX_EXCLUSIVE) 30205 return MDBX_RESULT_TRUE /* nothing to do since the files must have been 30206 opened non-shareable */ 30207 ; 30208 30209 if (env->me_lfd == INVALID_HANDLE_VALUE) { 30210 /* LY: without-lck mode (e.g. on read-only filesystem) */ 30211 jitter4testing(false); 30212 if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { 30213 rc = (int)GetLastError(); 30214 ERROR("%s, err %u", "without-lck", rc); 30215 return rc; 30216 } 30217 return MDBX_RESULT_FALSE; 30218 } 30219 30220 rc = internal_seize_lck(env->me_lfd); 30221 jitter4testing(false); 30222 if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { 30223 /* Check that no other process operates in without-lck mode. 30224 * This is done by exclusively locking the body-part of the db. It should be 30225 * noted: 30226 * - an exclusive lock is needed to do so; 30227 * - we can't lock the meta-pages, otherwise another process could get an error 30228 * while opening the db in a valid (non-conflicting) mode. */ 30229 if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { 30230 rc = (int)GetLastError(); 30231 ERROR("%s, err %u", "lock-against-without-lck", rc); 30232 jitter4testing(false); 30233 lck_unlock(env); 30234 } else { 30235 jitter4testing(false); 30236 if (!funlock(env->me_lazy_fd, LCK_BODY)) 30237 mdbx_panic("%s(%s) failed: err %u", __func__, 30238 "unlock-against-without-lck", (int)GetLastError()); 30239 } 30240 } 30241 30242 return rc; 30243 } 30244 30245 MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { 30246 /* Transition from the exclusive-write state (E-E) to used (S-?) */ 30247 assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); 30248 assert(env->me_lfd != INVALID_HANDLE_VALUE); 30249 30250 if (env->me_flags & MDBX_EXCLUSIVE) 30251 return MDBX_SUCCESS /* nothing to do since the files must have been opened non-shareable */ 30252 ; 30253 /* 1) now at E-E (exclusive-write), transition to ?-E (middle) */ 30254 if (!funlock(env->me_lfd, LCK_LOWER)) 30255 mdbx_panic("%s(%s) failed: err %u", __func__, 30256 "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError()); 30257 30258 /* 2) now at ?-E (middle), transition to S-E (locked) */ 30259 if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { 30260 int rc = (int)GetLastError() /* 3) something went wrong, give up */; 30261 ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); 30262 return rc; 30263 } 30264 30265 /* 4) got S-E (locked), continue transition to S-? (used) */ 30266 if (!funlock(env->me_lfd, LCK_UPPER)) 30267 mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", 30268 (int)GetLastError()); 30269 30270 return MDBX_SUCCESS /* 5) now at S-? (used), done */; 30271 } 30272 30273 MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { 30274 /* Transition from the used state (S-?) to exclusive-write (E-E) */ 30275 assert(env->me_lfd != INVALID_HANDLE_VALUE); 30276 30277 if (env->me_flags & MDBX_EXCLUSIVE) 30278 return MDBX_SUCCESS /* nothing to do since the files must have been opened non-shareable */ 30279 ; 30280 30281 int rc; 30282 /* 1) now on S-?
(used), try S-E (locked) */ 30283 jitter4testing(false); 30284 if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { 30285 rc = (int)GetLastError() /* 2) something went wrong, give up */; 30286 VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); 30287 return rc; 30288 } 30289 30290 /* 3) now on S-E (locked), transition to ?-E (middle) */ 30291 if (!funlock(env->me_lfd, LCK_LOWER)) 30292 mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", 30293 (int)GetLastError()); 30294 30295 /* 4) now on ?-E (middle), try E-E (exclusive-write) */ 30296 jitter4testing(false); 30297 if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { 30298 rc = (int)GetLastError() /* 5) something went wrong, give up */; 30299 VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); 30300 return rc; 30301 } 30302 30303 return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; 30304 } 30305 30306 MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, 30307 MDBX_env *inprocess_neighbor, 30308 int global_uniqueness_flag) { 30309 (void)env; 30310 (void)inprocess_neighbor; 30311 (void)global_uniqueness_flag; 30312 return MDBX_SUCCESS; 30313 } 30314 30315 MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, 30316 MDBX_env *inprocess_neighbor) { 30317 /* LY: should unmap before releasing the locks to avoid race condition and 30318 * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ 30319 if (env->me_map) 30320 osal_munmap(&env->me_dxb_mmap); 30321 if (env->me_lck_mmap.lck) { 30322 const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; 30323 osal_munmap(&env->me_lck_mmap); 30324 if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && 30325 mdbx_lck_upgrade(env) == MDBX_SUCCESS) 30326 /* this will fail if LCK is used/mmapped by other process(es) */ 30327 osal_ftruncate(env->me_lfd, 0); 30328 } 30329 lck_unlock(env); 30330 return MDBX_SUCCESS; 30331 } 30332 30333 /*----------------------------------------------------------------------------*/ 30334 /* reader checking (by pid) */ 30335 30336 MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { 30337 (void)env; 30338 return MDBX_SUCCESS; 30339 } 30340 30341 MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { 30342 (void)env; 30343 return MDBX_SUCCESS; 30344 } 30345 30346 /* Checks reader by pid. 30347 * 30348 * Returns: 30349 * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) 30350 * MDBX_RESULT_FALSE, if pid is dead (lock acquired) 30351 * or otherwise the errcode. */ 30352 MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { 30353 (void)env; 30354 HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); 30355 int rc; 30356 if (likely(hProcess)) { 30357 rc = WaitForSingleObject(hProcess, 0); 30358 if (unlikely(rc == (int)WAIT_FAILED)) 30359 rc = (int)GetLastError(); 30360 CloseHandle(hProcess); 30361 } else { 30362 rc = (int)GetLastError(); 30363 } 30364 30365 switch (rc) { 30366 case ERROR_INVALID_PARAMETER: 30367 /* pid seems invalid */ 30368 return MDBX_RESULT_FALSE; 30369 case WAIT_OBJECT_0: 30370 /* process just exited */ 30371 return MDBX_RESULT_FALSE; 30372 case ERROR_ACCESS_DENIED: 30373 /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc. 
//----------------------------------------------------------------------------
// Stub for slim read-write lock
// Copyright (C) 1995-2002 Brad Wilson

static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) {
  srwl->readerCount = srwl->writerCount = 0;
}

static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) {
  while (true) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);

    // If there's a writer already, spin without unnecessarily
    // interlocking the CPUs
    if (srwl->writerCount != 0) {
      YieldProcessor();
      continue;
    }

    // Add to the readers list
    _InterlockedIncrement(&srwl->readerCount);

    // Check for writers again (we may have been preempted). If
    // there are no writers writing or waiting, then we're done.
    if (srwl->writerCount == 0)
      break;

    // Remove from the readers list, spin, try again
    _InterlockedDecrement(&srwl->readerCount);
    YieldProcessor();
  }
}

static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) {
  assert(srwl->readerCount > 0);
  _InterlockedDecrement(&srwl->readerCount);
}

static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) {
  while (true) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);

    // If there's a writer already, spin without unnecessarily
    // interlocking the CPUs
    if (srwl->writerCount != 0) {
      YieldProcessor();
      continue;
    }

    // See if we can become the writer (expensive, because it inter-
    // locks the CPUs, so writing should be an infrequent process)
    if (_InterlockedExchange(&srwl->writerCount, 1) == 0)
      break;
  }

  // Now we're the writer, but there may be outstanding readers.
  // Spin until there aren't any more; new readers will wait now
  // that we're the writer.
  while (srwl->readerCount != 0) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
    YieldProcessor();
  }
}

static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) {
  assert(srwl->writerCount == 1 && srwl->readerCount >= 0);
  srwl->writerCount = 0;
}

static uint64_t WINAPI stub_GetTickCount64(void) {
  LARGE_INTEGER Counter, Frequency;
  return (QueryPerformanceFrequency(&Frequency) &&
          QueryPerformanceCounter(&Counter))
             ? Counter.QuadPart * 1000ul / Frequency.QuadPart
             : 0;
}
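/* The QPC-based fallback above computes `Counter * 1000 / Frequency` with
 * 64-bit intermediates, which is fine for realistic uptimes but multiplies
 * before dividing. A split form (a sketch only, not a part of libmdbx)
 * avoids any risk of overflowing the multiplication while keeping full
 * millisecond precision: */
#if 0 /* illustrative sketch, excluded from the build */
static uint64_t qpc_to_milliseconds(const LARGE_INTEGER Counter,
                                    const LARGE_INTEGER Frequency) {
  /* whole seconds first, then the sub-second remainder */
  const uint64_t ticks = (uint64_t)Counter.QuadPart;
  const uint64_t freq = (uint64_t)Frequency.QuadPart;
  return (ticks / freq) * UINT64_C(1000) +
         (ticks % freq) * UINT64_C(1000) / freq;
}
#endif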
/*----------------------------------------------------------------------------*/

#ifndef xMDBX_ALLOY
osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared,
    osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive,
    osal_srwlock_ReleaseExclusive;

MDBX_NtExtendSection mdbx_NtExtendSection;
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
#endif /* xMDBX_ALLOY */

#if __GNUC_PREREQ(8, 0)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-function-type"
#endif /* GCC/MINGW */

static void mdbx_winnt_import(void) {
  const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");

#define GET_PROC_ADDR(dll, ENTRY)                                             \
  mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)

  if (GetProcAddress(hNtdll, "wine_get_version")) {
    assert(mdbx_RunningUnderWine());
  } else {
    GET_PROC_ADDR(hNtdll, NtFsControlFile);
    GET_PROC_ADDR(hNtdll, NtExtendSection);
    assert(!mdbx_RunningUnderWine());
  }

  const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
  GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
  GET_PROC_ADDR(hKernel32dll, GetTickCount64);
  if (!mdbx_GetTickCount64)
    mdbx_GetTickCount64 = stub_GetTickCount64;
  if (!mdbx_RunningUnderWine()) {
    GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
    GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
    GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
    GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
  }

  const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
  GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
#undef GET_PROC_ADDR

  const osal_srwlock_t_function init = (osal_srwlock_t_function)GetProcAddress(
      hKernel32dll, "InitializeSRWLock");
  if (init != NULL) {
    osal_srwlock_Init = init;
    osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockShared");
    osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockShared");
    osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockExclusive");
    osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockExclusive");
  } else {
    osal_srwlock_Init = stub_srwlock_Init;
    osal_srwlock_AcquireShared = stub_srwlock_AcquireShared;
    osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
    osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
    osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
  }
}

#if __GNUC_PREREQ(8, 0)
#pragma GCC diagnostic pop
#endif /* GCC/MINGW */

#endif /* Windows LCK-implementation */
/*
 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru>
 * and other libmdbx authors: please see AUTHORS file.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */

#if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
#include <sys/sem.h>
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */

/*----------------------------------------------------------------------------*/
/* global constructor/destructor */

#if defined(__linux__) || defined(__gnu_linux__)

#include <sys/utsname.h>

#ifndef xMDBX_ALLOY
uint32_t linux_kernel_version;
bool mdbx_RunningOnWSL1;
#endif /* xMDBX_ALLOY */

MDBX_EXCLUDE_FOR_GPROF
__cold static uint8_t probe_for_WSL(const char *tag) {
  const char *const WSL = strstr(tag, "WSL");
  if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
    return WSL[3] - '0';
  const char *const wsl = strstr(tag, "wsl");
  if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
    return wsl[3] - '0';
  if (WSL || wsl || strcasestr(tag, "Microsoft"))
    /* No new kernels are expected within WSL1; otherwise the kernel will be
     * explicitly marked by an appropriate WSL-version hint. */
    return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
  return 0;
}

#endif /* Linux */

#ifdef ENABLE_GPROF
extern void _mcleanup(void);
extern void monstartup(unsigned long, unsigned long);
extern void _init(void);
extern void _fini(void);
extern void __gmon_start__(void) __attribute__((__weak__));
#endif /* ENABLE_GPROF */
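/* The constructor below packs the `uname -r` release string into
 * linux_kernel_version with one byte per component, so versions compare
 * correctly as plain integers. A worked example (a sketch only, not a part
 * of libmdbx): "5.15.2-generic" yields 0x050F0200, and the 4.19 threshold
 * used by probe_for_WSL() above is 0x04130000. */
#if 0 /* illustrative sketch, excluded from the build */
static uint32_t pack_kernel_version(unsigned major, unsigned minor,
                                    unsigned patch, unsigned tweak) {
  /* mirrors the parsing loop: each component is clamped to 255 */
  if (major > 255) major = 255;
  if (minor > 255) minor = 255;
  if (patch > 255) patch = 255;
  if (tweak > 255) tweak = 255;
  return (major << 24) | (minor << 16) | (patch << 8) | tweak;
}
/* pack_kernel_version(5, 15, 2, 0) == 0x050F0200 */
#endif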
MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
#ifdef ENABLE_GPROF
  if (!&__gmon_start__)
    monstartup((uintptr_t)&_init, (uintptr_t)&_fini);
#endif /* ENABLE_GPROF */

#if defined(__linux__) || defined(__gnu_linux__)
  struct utsname buffer;
  if (uname(&buffer) == 0) {
    int i = 0;
    char *p = buffer.release;
    while (*p && i < 4) {
      if (*p >= '0' && *p <= '9') {
        long number = strtol(p, &p, 10);
        if (number > 0) {
          if (number > 255)
            number = 255;
          linux_kernel_version += number << (24 - i * 8);
        }
        ++i;
      } else {
        ++p;
      }
    }
    /* The "official" way of detecting WSL1 but not WSL2
     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
     *
     * WARNING: A false-negative detection of WSL1 will result in DATA LOSS!
     * So, the REQUIREMENTS for this code:
     *  1. MUST detect WSL1 without false-negatives.
     *  2. DESIRABLE to detect WSL2, but without the risk of violating the
     *     first requirement. */
    mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
                         probe_for_WSL(buffer.sysname) == 1 ||
                         probe_for_WSL(buffer.release) == 1;
  }
#endif /* Linux */

  global_ctor();
}

MDBX_EXCLUDE_FOR_GPROF
__cold static __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
  global_dtor();
#ifdef ENABLE_GPROF
  if (!&__gmon_start__)
    _mcleanup();
#endif /* ENABLE_GPROF */
}

/*----------------------------------------------------------------------------*/
/* lck */

/* Description of the locking implementation for POSIX & Linux:
 *
 * The lck-file is mapped into memory; it hosts the table of readers and
 * the shared posix-mutexes (futexes). By means of these mutexes
 * (see struct MDBX_lockinfo) the following is implemented:
 *  - locking of the readers table for registration,
 *    i.e. the osal_rdt_lock() and osal_rdt_unlock() functions;
 *  - locking of the DB for write transactions,
 *    i.e. the mdbx_txn_lock() and mdbx_txn_unlock() functions.
 *
 * The rest of the functionality is implemented via separate file locks:
 *  - the initial seizure of the DB in exclusive/shared mode with the
 *    subsequent transition to the operational mode, i.e. the
 *    osal_lck_seize() and osal_lck_downgrade() functions;
 *  - checking for the presence of reader processes, i.e. the
 *    osal_rpid_set(), osal_rpid_clear() and osal_rpid_check() functions.
 *
 * fcntl(F_SETLK) is used for the file locking, because:
 *  - lockf() provides only exclusive locking and requires the file to be
 *    opened in RW-mode;
 *  - flock() does not guarantee atomicity when changing locks and operates
 *    only on the file as a whole;
 *  - to track reader processes, one-byte range-locks of the lck-file are
 *    used via fcntl(F_SETLK), with the pid of a reader process as the lock
 *    position;
 *  - for the initial shared/exclusive seizure the main DB file is locked,
 *    and on success the lck-file as well.
 *
 * ----------------------------------------------------------------------------
 * LOCKS HELD DEPENDING ON THE MODE AND STATE
 *
 * Exclusive mode without the lck-file:
 *  = the whole dxb-file is locked via F_RDLCK or F_WRLCK, depending on
 *    MDBX_RDONLY.
 *
 * Non-operational mode, for the duration of re-initialization or destruction
 * of the lck-file:
 *  = an F_WRLCK lock of the first byte of the lck-file; other processes wait
 *    for it to be released while acquiring F_RDLCK via F_SETLKW.
 *  - the locks of the dxb-file may change until the exclusive lock of the
 *    lck-file is released:
 *    + for the NON-exclusive mode, a lock of the pid-byte in the dxb-file
 *      via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY;
 *    + for the EXCLUSIVE mode, a lock of the whole dxb-file
 *      via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
 *
 * OPERATIONAL mode with the lck-file:
 *  = an F_RDLCK lock of the first byte of the lck-file; other processes
 *    cannot acquire F_WRLCK and thus see that the DB is in use;
 *  + an F_WRLCK lock of the pid-byte in the lck-file after the first read
 *    transaction;
 *  + for the NON-exclusive mode, a lock of the pid-byte in the dxb-file
 *    via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY;
 *  + for the EXCLUSIVE mode, a lock of the whole dxb-file
 *    via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
 */
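/* A condensed illustration of the pid-byte scheme described above (a sketch
 * only, not a part of libmdbx): a reader process registers itself by
 * write-locking the single byte at offset pid of the lck-file. The kernel
 * drops the lock automatically when the process dies, so any other process
 * can later probe the slot's liveness without blocking. */
#if 0 /* illustrative sketch, excluded from the build */
static int register_reader_pid(int lck_fd, pid_t pid) {
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_type = F_WRLCK;  /* exclusive: only the owner itself may hold it */
  fl.l_whence = SEEK_SET;
  fl.l_start = pid;     /* the pid doubles as the lock position */
  fl.l_len = 1;         /* one byte per reader process */
  return fcntl(lck_fd, F_SETLK, &fl) ? errno : 0;
}
#endif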
#if MDBX_USE_OFDLOCKS
static int op_setlk, op_setlkw, op_getlk;
__cold static void choice_fcntl(void) {
  assert(!op_setlk && !op_setlkw && !op_getlk);
  if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
#if defined(__linux__) || defined(__gnu_linux__)
      && linux_kernel_version >
             0x030f0000 /* OFD locks are available since 3.15, but are engaged
                           here only for 3.16 and later kernels (i.e. LTS)
                           for reliability reasons */
#endif /* linux */
  ) {
    op_setlk = F_OFD_SETLK;
    op_setlkw = F_OFD_SETLKW;
    op_getlk = F_OFD_GETLK;
    return;
  }
  op_setlk = F_SETLK;
  op_setlkw = F_SETLKW;
  op_getlk = F_GETLK;
}
#else
#define op_setlk F_SETLK
#define op_setlkw F_SETLKW
#define op_getlk F_GETLK
#endif /* MDBX_USE_OFDLOCKS */

#ifndef OFF_T_MAX
#define OFF_T_MAX                                                             \
  (((sizeof(off_t) > 4) ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
#endif

static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck,
                  const off_t offset, off_t len) {
  STATIC_ASSERT(sizeof(off_t) >= sizeof(void *) &&
                sizeof(off_t) >= sizeof(size_t));
#ifdef __ANDROID_API__
  STATIC_ASSERT_MSG((sizeof(off_t) * 8 == MDBX_WORDBITS),
                    "The bitness of the system `off_t` type is mismatched. "
                    "Please fix the build and/or NDK configuration.");
#endif /* Android */
  jitter4testing(true);
  assert(offset >= 0 && len > 0);
  assert((uint64_t)offset < (uint64_t)INT64_MAX &&
         (uint64_t)len < (uint64_t)INT64_MAX &&
         (uint64_t)(offset + len) > (uint64_t)offset);

  assert((uint64_t)offset < (uint64_t)OFF_T_MAX &&
         (uint64_t)len <= (uint64_t)OFF_T_MAX &&
         (uint64_t)(offset + len) <= (uint64_t)OFF_T_MAX);

  assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) ==
         ((uint64_t)offset + (uint64_t)len));
  for (;;) {
    struct flock lock_op;
    STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) &&
                          sizeof(off_t) <= sizeof(lock_op.l_len) &&
                          OFF_T_MAX == (off_t)OFF_T_MAX,
                      "Support for large/64-bit-sized files is misconfigured "
                      "for the target system and/or toolchain. "
                      "Please fix it or at least disable it completely.");
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = lck;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = offset;
    lock_op.l_len = len;
    int rc = fcntl(fd, cmd, &lock_op);
    jitter4testing(true);
    if (rc != -1) {
      if (cmd == op_getlk) {
        /* Checks a reader by pid. Returns:
         *   MDBX_RESULT_TRUE  - if pid is live (a reader holds the lock),
         *   MDBX_RESULT_FALSE - if pid is dead (the lock could be placed). */
        return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
                                           : MDBX_RESULT_TRUE;
      }
      return MDBX_SUCCESS;
    }
    rc = errno;
#if MDBX_USE_OFDLOCKS
    if (rc == EINVAL &&
        (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) {
      /* fallback to non-OFD locks */
      if (cmd == F_OFD_SETLK)
        cmd = F_SETLK;
      else if (cmd == F_OFD_SETLKW)
        cmd = F_SETLKW;
      else
        cmd = F_GETLK;
      op_setlk = F_SETLK;
      op_setlkw = F_SETLKW;
      op_getlk = F_GETLK;
      continue;
    }
#endif /* MDBX_USE_OFDLOCKS */
    if (rc != EINTR || cmd == op_setlkw) {
      assert(MDBX_IS_ERROR(rc));
      return rc;
    }
  }
}
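/* Why OFD locks are preferred when available (a note with a sketch, not a
 * part of libmdbx): classic POSIX record locks are owned by the process, so
 * closing ANY descriptor of a file silently drops all of the process' locks
 * on it; open-file-description (F_OFD_*) locks are instead owned by the open
 * file description and survive such closes. The EINVAL branch in lck_op()
 * above transparently downgrades to the classic commands on kernels without
 * OFD support. */
#if 0 /* illustrative sketch, excluded from the build */
  /* With classic F_SETLK this close() releases the locks acquired through
   * any other descriptor of the same file; with F_OFD_SETLK it does not
   * (`some_path` is a hypothetical name for the same lck-file): */
  int second_fd = open(some_path, O_RDWR);
  close(second_fd);
#endif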
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
#if MDBX_USE_OFDLOCKS
  if (unlikely(op_setlk == 0))
    choice_fcntl();
#endif /* MDBX_USE_OFDLOCKS */
  return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX);
}

MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(env->me_pid > 0);
  if (unlikely(osal_getpid() != env->me_pid))
    return MDBX_PANIC;
  return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
}

MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(env->me_pid > 0);
  return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
}

MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(pid > 0);
  return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
}

/*---------------------------------------------------------------------------*/

#if MDBX_LOCKING > MDBX_LOCKING_SYSV
MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  return sem_init(ipc, false, 1) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                               \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  return pthread_mutex_init(ipc, nullptr);
#else
#error "FIXME"
#endif
}

MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  return sem_destroy(ipc) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                               \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  return pthread_mutex_destroy(ipc);
#else
#error "FIXME"
#endif
}
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
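/* osal_rpid_check() above relies on F_GETLK semantics: the call does not
 * place a lock, it only reports whether the probe WOULD succeed. A sketch of
 * the mapping performed inside lck_op() (not a part of libmdbx): */
#if 0 /* illustrative sketch, excluded from the build */
  struct flock probe = {.l_type = F_WRLCK, .l_whence = SEEK_SET,
                        .l_start = pid, .l_len = 1};
  if (fcntl(lck_fd, F_GETLK, &probe) == 0)
    result = (probe.l_type == F_UNLCK)
                 ? MDBX_RESULT_FALSE  /* nobody holds the byte: pid is dead */
                 : MDBX_RESULT_TRUE;  /* still locked: the reader is alive */
#endif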
"file was removed" : "not a regular file", rc); 30882 return rc; 30883 } 30884 30885 if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { 30886 VERBOSE("dxb-file is too short (%u), exclusive-lock needed", 30887 (unsigned)st.st_size); 30888 rc = MDBX_RESULT_TRUE; 30889 } 30890 30891 //---------------------------------------------------------------------------- 30892 30893 if (fstat(env->me_lfd, &st)) { 30894 rc = errno; 30895 ERROR("fstat(%s), err %d", "LCK", rc); 30896 return rc; 30897 } 30898 30899 if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { 30900 #ifdef EBADFD 30901 rc = EBADFD; 30902 #else 30903 rc = EPERM; 30904 #endif 30905 ERROR("%s %s, err %d", "LCK", 30906 (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); 30907 return rc; 30908 } 30909 30910 /* Checking file size for detect the situation when we got the shared lock 30911 * immediately after osal_lck_destroy(). */ 30912 if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { 30913 VERBOSE("lck-file is too short (%u), exclusive-lock needed", 30914 (unsigned)st.st_size); 30915 rc = MDBX_RESULT_TRUE; 30916 } 30917 30918 return rc; 30919 } 30920 30921 __cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { 30922 assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); 30923 if (unlikely(osal_getpid() != env->me_pid)) 30924 return MDBX_PANIC; 30925 #if MDBX_USE_OFDLOCKS 30926 if (unlikely(op_setlk == 0)) 30927 choice_fcntl(); 30928 #endif /* MDBX_USE_OFDLOCKS */ 30929 30930 int rc = MDBX_SUCCESS; 30931 #if defined(__linux__) || defined(__gnu_linux__) 30932 if (unlikely(mdbx_RunningOnWSL1)) { 30933 rc = ENOLCK /* No record locks available */; 30934 ERROR("%s, err %u", 30935 "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " 30936 "injecting failure to avoid data loss", 30937 rc); 30938 return rc; 30939 } 30940 #endif /* Linux */ 30941 30942 if (env->me_lfd == INVALID_HANDLE_VALUE) { 30943 /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ 30944 rc = 30945 lck_op(env->me_lazy_fd, op_setlk, 30946 (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); 30947 if (rc != MDBX_SUCCESS) { 30948 ERROR("%s, err %u", "without-lck", rc); 30949 eASSERT(env, MDBX_IS_ERROR(rc)); 30950 return rc; 30951 } 30952 return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; 30953 } 30954 #if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0 30955 sched_yield(); 30956 #endif 30957 30958 retry: 30959 if (rc == MDBX_RESULT_TRUE) { 30960 rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); 30961 if (rc != MDBX_SUCCESS) { 30962 ERROR("%s, err %u", "unlock-before-retry", rc); 30963 eASSERT(env, MDBX_IS_ERROR(rc)); 30964 return rc; 30965 } 30966 } 30967 30968 /* Firstly try to get exclusive locking. */ 30969 rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); 30970 if (rc == MDBX_SUCCESS) { 30971 rc = check_fstat(env); 30972 if (MDBX_IS_ERROR(rc)) 30973 return rc; 30974 30975 continue_dxb_exclusive: 30976 rc = 30977 lck_op(env->me_lazy_fd, op_setlk, 30978 (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); 30979 if (rc == MDBX_SUCCESS) 30980 return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; 30981 30982 int err = check_fstat(env); 30983 if (MDBX_IS_ERROR(err)) 30984 return err; 30985 30986 /* the cause may be a collision with POSIX's file-lock recovery. 
    if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
          rc == EDEADLK)) {
      ERROR("%s, err %u", "dxb-exclusive", rc);
      eASSERT(env, MDBX_IS_ERROR(rc));
      return rc;
    }

    /* Fallback to lck-shared */
  } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY ||
               rc == EWOULDBLOCK || rc == EDEADLK)) {
    ERROR("%s, err %u", "try-exclusive", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Here one of two cases is possible:
   *  - either osal_lck_destroy() in another process was holding the lock
   *    during destruction,
   *  - or osal_lck_seize() in another process got the exclusive lock and is
   *    doing the initialization.
   * To distinguish these cases, the size of the lck-file will be used
   * later. */

  /* Wait for lck-shared now. */
  /* We may have to wait here through transient states, for instance until
   * another competing process calls lck_downgrade(). */
  rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "try-shared", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  rc = check_fstat(env);
  if (rc == MDBX_RESULT_TRUE)
    goto retry;
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "lck_fstat", rc);
    return rc;
  }

  /* got shared, retry exclusive */
  rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
  if (rc == MDBX_SUCCESS)
    goto continue_dxb_exclusive;

  if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
        rc == EDEADLK)) {
    ERROR("%s, err %u", "try-exclusive", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Lock against another process operating in without-lck or exclusive
   * mode. */
  rc =
      lck_op(env->me_lazy_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
  if (rc != MDBX_SUCCESS) {
    ERROR("%s, err %u", "lock-against-without-lck", rc);
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  /* Done: return with shared locking. */
  return MDBX_RESULT_FALSE;
}
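/* A summary of osal_lck_seize() outcomes for a hypothetical caller (a usage
 * sketch, not a part of libmdbx):
 *   MDBX_RESULT_TRUE  - exclusive locking was obtained (the first opener,
 *                       or the without-lck mode), so (re)initialization of
 *                       the lck-file is safe;
 *   MDBX_RESULT_FALSE - shared locking was obtained, the lck-file is alive;
 *   otherwise         - an error code. */
#if 0 /* illustrative sketch, excluded from the build */
  int rc = osal_lck_seize(env);
  if (rc == MDBX_RESULT_TRUE) {
    /* we are alone: (re)create the reader table, then downgrade */
  } else if (rc == MDBX_RESULT_FALSE) {
    /* join the already initialized lck-file */
  } else {
    /* propagate the error */
  }
#endif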
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  if (unlikely(osal_getpid() != env->me_pid))
    return MDBX_PANIC;

  int rc = MDBX_SUCCESS;
  if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
    rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
    if (rc == MDBX_SUCCESS)
      rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
                  OFF_T_MAX - env->me_pid - 1);
  }
  if (rc == MDBX_SUCCESS)
    rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
  if (unlikely(rc != 0)) {
    ERROR("%s, err %u", "lck", rc);
    assert(MDBX_IS_ERROR(rc));
  }
  return rc;
}
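/* A worked example of the unlock arithmetic above (a sketch, not a part of
 * libmdbx): for me_pid == 1234 the two F_UNLCK calls release the byte-ranges
 * [0, 1234) and [1235, OFF_T_MAX), so exactly one byte - the pid-byte at
 * offset 1234 - stays locked in the dxb-file, keeping this process visible
 * to the others. */
#if 0 /* illustrative sketch, excluded from the build */
  lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, 1234);
  lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 1235, OFF_T_MAX - 1235);
  /* the byte at offset 1234 remains read- or write-locked */
#endif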
__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env,
                                               MDBX_env *inprocess_neighbor) {
  if (unlikely(osal_getpid() != env->me_pid))
    return MDBX_PANIC;

  int rc = MDBX_SUCCESS;
  struct stat lck_info;
  MDBX_lockinfo *lck = env->me_lck_mmap.lck;
  if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck &&
      /* try to get exclusive access */
      lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
      /* if LCK was not removed */
      fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
      lck_op(env->me_lazy_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
             OFF_T_MAX) == 0) {

    VERBOSE("%p got exclusive, drop locks", (void *)env);
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
    if (env->me_sysv_ipc.semid != -1)
      rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
#else
    rc = osal_ipclock_destroy(&lck->mti_rlock);
    if (rc == 0)
      rc = osal_ipclock_destroy(&lck->mti_wlock);
#endif /* MDBX_LOCKING */

    eASSERT(env, rc == 0);
    if (rc == 0) {
      const bool synced = lck->mti_unsynced_pages.weak == 0;
      osal_munmap(&env->me_lck_mmap);
      if (synced)
        rc = ftruncate(env->me_lfd, 0) ? errno : 0;
    }

    jitter4testing(false);
  }

  /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be
   * restored after the file is closed.
   *
   * 2) File locks would be released (by the kernel) when the file descriptors
   * are closed. But to avoid false-positive EACCES and EDEADLK from the
   * kernel, the locks should be released here explicitly and in the proper
   * order. */

  /* close dxb and restore the lock */
  if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_dsync_fd = INVALID_HANDLE_VALUE;
  }
  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lazy_fd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore the file-lock */
      rc = lck_op(
          inprocess_neighbor->me_lazy_fd, F_SETLKW,
          (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
              ? 0
              : inprocess_neighbor->me_pid,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
    }
  }

  /* close lck and restore the locks */
  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lfd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore the file-locks */
      rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
      if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
        rc = osal_rpid_set(inprocess_neighbor);
    }
  }

  if (inprocess_neighbor && rc != MDBX_SUCCESS)
    inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}

/*---------------------------------------------------------------------------*/

__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
                                            MDBX_env *inprocess_neighbor,
                                            int global_uniqueness_flag) {
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  int semid = -1;
  /* don't initialize the semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    struct stat st;
    if (fstat(env->me_lazy_fd, &st))
      return errno;
  sysv_retry_create:
    semid = semget(env->me_sysv_ipc.key, 2,
                   IPC_CREAT | IPC_EXCL |
                       (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
    if (unlikely(semid == -1)) {
      int err = errno;
      if (err != EEXIST)
        return err;

      /* remove and re-create the semaphore set */
      semid = semget(env->me_sysv_ipc.key, 2, 0);
      if (semid == -1) {
        err = errno;
        if (err != ENOENT)
          return err;
        goto sysv_retry_create;
      }
      if (semctl(semid, 2, IPC_RMID)) {
        err = errno;
        if (err != EIDRM)
          return err;
      }
      goto sysv_retry_create;
    }

    unsigned short val_array[2] = {1, 1};
    if (semctl(semid, 2, SETALL, val_array))
      return errno;
  } else {
    semid = semget(env->me_sysv_ipc.key, 2, 0);
    if (semid == -1)
      return errno;

    /* check read & write access */
    struct semid_ds data[2];
    if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data))
      return errno;
  }

  env->me_sysv_ipc.semid = semid;
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX
  (void)inprocess_neighbor;
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#error "FIXME: Not implemented"
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988

  /* don't initialize the semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1))
      return errno;
    if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1))
      return errno;
  }
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                               \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  if (inprocess_neighbor)
    return MDBX_SUCCESS /* no initialization is needed for the mutexes if LCK
                           is already opened/used inside the current process */
        ;

  /* FIXME: Unfortunately, there is no other reliable way but long testing on
   * each platform. On the other hand, behavior like FreeBSD's is incorrect
   * and we can expect it to be rare.
   * Moreover, even on FreeBSD without the additional in-process
   * initialization, the probability of a problem occurring is vanishingly
   * small, and the symptom is a return of EINVAL while locking a mutex. In
   * other words, in the worst case the problem results in an EINVAL error at
   * the start of a transaction, but NOT in data loss, nor in database
   * corruption, nor in other fatal troubles. Thus, I am inclined to consider
   * the code below a workaround for erroneous platforms (like FreeBSD),
   * rather than a defect of libmdbx. */
#if defined(__FreeBSD__)
  /* It seems that shared mutexes on FreeBSD require in-process
   * initialization */
  (void)global_uniqueness_flag;
#else
  /* Shared mutexes on many other platforms (including Darwin and Linux's
   * futexes) don't need any additional in-process initialization */
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#endif

  pthread_mutexattr_t ma;
  int rc = pthread_mutexattr_init(&ma);
  if (rc)
    return rc;

  rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
  if (rc)
    goto bailout;

#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust)
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#elif defined(PTHREAD_MUTEX_ROBUST_NP) ||                                     \
    defined(pthread_mutexattr_setrobust_np)
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#else
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#endif
  if (rc)
    goto bailout;
#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */

#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 && \
    !defined(MDBX_SAFE4QEMU)
  rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
  if (rc == ENOTSUP)
    rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
  if (rc && rc != ENOTSUP)
    goto bailout;
#endif /* PTHREAD_PRIO_INHERIT */

  rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
  if (rc && rc != ENOTSUP)
    goto bailout;

  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
  if (rc)
    goto bailout;
  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);

bailout:
  pthread_mutexattr_destroy(&ma);
  return rc;
#else
#error "FIXME"
#endif /* MDBX_LOCKING > 0 */
}
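/* The robust-mutex attributes configured above pay off in
 * mdbx_ipclock_failed() below: when a lock owner dies, the next locker gets
 * EOWNERDEAD instead of deadlocking and must repair the shared state. The
 * canonical POSIX pattern is (a sketch, not a part of libmdbx): */
#if 0 /* illustrative sketch, excluded from the build */
  int err = pthread_mutex_lock(mutex);
  if (err == EOWNERDEAD) {
    /* we DO hold the mutex now: fix the application state, then mark the
     * mutex consistent so subsequent owners don't get EOWNERDEAD again */
    if (pthread_mutex_consistent(mutex) == 0)
      err = 0;
  }
#endif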
"this process' env is hosed" : "recovering")); 31327 31328 int check_rc = cleanup_dead_readers(env, rlocked, NULL); 31329 check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; 31330 31331 #if MDBX_LOCKING == MDBX_LOCKING_SYSV 31332 rc = (rc == MDBX_SUCCESS) ? check_rc : rc; 31333 #else 31334 #if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) 31335 int mreco_rc = pthread_mutex_consistent(ipc); 31336 #elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) 31337 int mreco_rc = pthread_mutex_consistent_np(ipc); 31338 #elif _POSIX_THREAD_PROCESS_SHARED < 200809L 31339 int mreco_rc = pthread_mutex_consistent_np(ipc); 31340 #else 31341 int mreco_rc = pthread_mutex_consistent(ipc); 31342 #endif 31343 check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; 31344 31345 if (unlikely(mreco_rc)) 31346 ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); 31347 31348 rc = (rc == MDBX_SUCCESS) ? check_rc : rc; 31349 if (MDBX_IS_ERROR(rc)) 31350 pthread_mutex_unlock(ipc); 31351 #endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ 31352 return rc; 31353 } 31354 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 31355 (void)ipc; 31356 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 31357 (void)ipc; 31358 #elif MDBX_LOCKING == MDBX_LOCKING_FUTEX 31359 #ifdef _MSC_VER 31360 #pragma message("warning: TODO") 31361 #else 31362 #warning "TODO" 31363 #endif 31364 (void)ipc; 31365 #else 31366 #error "FIXME" 31367 #endif /* MDBX_LOCKING */ 31368 31369 ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); 31370 if (rc != EDEADLK) 31371 env->me_flags |= MDBX_FATAL_ERROR; 31372 return rc; 31373 } 31374 31375 #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) 31376 MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { 31377 /* avoid 32-bit Bionic bug/hang with 32-pit TID */ 31378 if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { 31379 pid_t tid = gettid(); 31380 if (unlikely(tid > 0xffff)) { 31381 FATAL("Raise the ENOSYS(%d) error to avoid hang due " 31382 "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " 31383 "that don’t fit in 16 bits, see " 31384 "https://android.googlesource.com/platform/bionic/+/master/" 31385 "docs/32-bit-abi.md#is-too-small-for-large-pids", 31386 ENOSYS, tid, tid); 31387 return ENOSYS; 31388 } 31389 } 31390 return 0; 31391 } 31392 #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ 31393 31394 static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, 31395 const bool dont_wait) { 31396 #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ 31397 MDBX_LOCKING == MDBX_LOCKING_POSIX2008 31398 int rc = osal_check_tid4bionic(); 31399 if (likely(rc == 0)) 31400 rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); 31401 rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; 31402 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 31403 int rc = MDBX_SUCCESS; 31404 if (dont_wait) { 31405 if (sem_trywait(ipc)) { 31406 rc = errno; 31407 if (rc == EAGAIN) 31408 rc = MDBX_BUSY; 31409 } 31410 } else if (sem_wait(ipc)) 31411 rc = errno; 31412 #elif MDBX_LOCKING == MDBX_LOCKING_SYSV 31413 struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), 31414 .sem_op = -1, 31415 .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; 31416 int rc; 31417 if (semop(env->me_sysv_ipc.semid, &op, 1)) { 31418 rc = errno; 31419 if (dont_wait && rc == EAGAIN) 31420 rc = MDBX_BUSY; 31421 } else { 31422 rc = *ipc ? 
EOWNERDEAD : MDBX_SUCCESS; 31423 *ipc = env->me_pid; 31424 } 31425 #else 31426 #error "FIXME" 31427 #endif /* MDBX_LOCKING */ 31428 31429 if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) 31430 rc = mdbx_ipclock_failed(env, ipc, rc); 31431 return rc; 31432 } 31433 31434 static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { 31435 #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ 31436 MDBX_LOCKING == MDBX_LOCKING_POSIX2008 31437 int rc = pthread_mutex_unlock(ipc); 31438 (void)env; 31439 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 31440 int rc = sem_post(ipc) ? errno : MDBX_SUCCESS; 31441 (void)env; 31442 #elif MDBX_LOCKING == MDBX_LOCKING_SYSV 31443 if (unlikely(*ipc != (pid_t)env->me_pid)) 31444 return EPERM; 31445 *ipc = 0; 31446 struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), 31447 .sem_op = 1, 31448 .sem_flg = SEM_UNDO}; 31449 int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; 31450 #else 31451 #error "FIXME" 31452 #endif /* MDBX_LOCKING */ 31453 return rc; 31454 } 31455 31456 MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { 31457 TRACE("%s", ">>"); 31458 jitter4testing(true); 31459 int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); 31460 TRACE("<< rc %d", rc); 31461 return rc; 31462 } 31463 31464 MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { 31465 TRACE("%s", ">>"); 31466 int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); 31467 TRACE("<< rc %d", rc); 31468 if (unlikely(rc != MDBX_SUCCESS)) 31469 mdbx_panic("%s() failed: err %d\n", __func__, rc); 31470 jitter4testing(true); 31471 } 31472 31473 int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { 31474 TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); 31475 jitter4testing(true); 31476 int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); 31477 TRACE("<< rc %d", rc); 31478 return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; 31479 } 31480 31481 void mdbx_txn_unlock(MDBX_env *env) { 31482 TRACE("%s", ">>"); 31483 int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); 31484 TRACE("<< rc %d", rc); 31485 if (unlikely(rc != MDBX_SUCCESS)) 31486 mdbx_panic("%s() failed: err %d\n", __func__, rc); 31487 jitter4testing(true); 31488 } 31489 31490 #else 31491 #ifdef _MSC_VER 31492 #pragma warning(disable : 4206) /* nonstandard extension used: translation \ 31493 unit is empty */ 31494 #endif /* _MSC_VER (warnings) */ 31495 #endif /* !Windows LCK-implementation */