github.com/moontrade/mdbx-go@v0.4.0/mdbx_chk.c (about) 1 /* mdbx_chk.c - memory-mapped database check tool */ 2 3 /* 4 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 5 * and other libmdbx authors: please see AUTHORS file. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted only as authorized by the OpenLDAP 10 * Public License. 11 * 12 * A copy of this license is available in the file LICENSE in the 13 * top-level directory of the distribution or, alternatively, at 14 * <http://www.OpenLDAP.org/license.html>. */ 15 16 #ifdef _MSC_VER 17 #if _MSC_VER > 1800 18 #pragma warning(disable : 4464) /* relative include path contains '..' */ 19 #endif 20 #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ 21 #endif /* _MSC_VER (warnings) */ 22 23 #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ 24 /* 25 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 26 * and other libmdbx authors: please see AUTHORS file. 27 * All rights reserved. 28 * 29 * Redistribution and use in source and binary forms, with or without 30 * modification, are permitted only as authorized by the OpenLDAP 31 * Public License. 32 * 33 * A copy of this license is available in the file LICENSE in the 34 * top-level directory of the distribution or, alternatively, at 35 * <http://www.OpenLDAP.org/license.html>. */ 36 37 #define MDBX_BUILD_SOURCERY 86a8d6c403a2023fc2df0ab38f71339b78e82f0aa786f480a1cb166c05497134_v0_12_1_0_gb36a07a5 38 #ifdef MDBX_CONFIG_H 39 #include MDBX_CONFIG_H 40 #endif 41 42 #define LIBMDBX_INTERNALS 43 #ifdef xMDBX_TOOLS 44 #define MDBX_DEPRECATED 45 #endif /* xMDBX_TOOLS */ 46 47 #ifdef xMDBX_ALLOY 48 /* Amalgamated build */ 49 #define MDBX_INTERNAL_FUNC static 50 #define MDBX_INTERNAL_VAR static 51 #else 52 /* Non-amalgamated build */ 53 #define MDBX_INTERNAL_FUNC 54 #define MDBX_INTERNAL_VAR extern 55 #endif /* xMDBX_ALLOY */ 56 57 /*----------------------------------------------------------------------------*/ 58 59 /** Disables using GNU/Linux libc extensions. 60 * \ingroup build_option 61 * \note This option couldn't be moved to the options.h since dependant 62 * control macros/defined should be prepared before include the options.h */ 63 #ifndef MDBX_DISABLE_GNU_SOURCE 64 #define MDBX_DISABLE_GNU_SOURCE 0 65 #endif 66 #if MDBX_DISABLE_GNU_SOURCE 67 #undef _GNU_SOURCE 68 #elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE) 69 #define _GNU_SOURCE 70 #endif /* MDBX_DISABLE_GNU_SOURCE */ 71 72 /* Should be defined before any includes */ 73 #if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) && \ 74 !defined(ANDROID) 75 #define _FILE_OFFSET_BITS 64 76 #endif 77 78 #ifdef __APPLE__ 79 #define _DARWIN_C_SOURCE 80 #endif 81 82 #ifdef _MSC_VER 83 #if _MSC_FULL_VER < 190024234 84 /* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual 85 * Studio 2015 Update 3). But you could remove this #error and try to continue 86 * at your own risk. In such case please don't rise up an issues related ONLY to 87 * old compilers. 88 * 89 * NOTE: 90 * Unfortunately, there are several different builds of "Visual Studio" that 91 * are called "Visual Studio 2015 Update 3". 92 * 93 * The 190024234 is used here because it is minimal version of Visual Studio 94 * that was used for build and testing libmdbx in recent years. Soon this 95 * value will be increased to 19.0.24241.7, since build and testing using 96 * "Visual Studio 2015" will be performed only at https://ci.appveyor.com. 97 * 98 * Please ask Microsoft (but not us) for information about version differences 99 * and how to and where you can obtain the latest "Visual Studio 2015" build 100 * with all fixes. 101 */ 102 #error \ 103 "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required." 104 #endif 105 #ifndef _CRT_SECURE_NO_WARNINGS 106 #define _CRT_SECURE_NO_WARNINGS 107 #endif /* _CRT_SECURE_NO_WARNINGS */ 108 #if _MSC_VER > 1800 109 #pragma warning(disable : 4464) /* relative include path contains '..' */ 110 #endif 111 #if _MSC_VER > 1913 112 #pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ 113 */ 114 #endif 115 #if _MSC_VER > 1914 116 #pragma warning( \ 117 disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ 118 producing 'defined' has undefined behavior */ 119 #endif 120 #pragma warning(disable : 4710) /* 'xyz': function not inlined */ 121 #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ 122 inline expansion */ 123 #pragma warning( \ 124 disable : 4201) /* nonstandard extension used : nameless struct / union */ 125 #pragma warning(disable : 4702) /* unreachable code */ 126 #pragma warning(disable : 4706) /* assignment within conditional expression */ 127 #pragma warning(disable : 4127) /* conditional expression is constant */ 128 #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ 129 alignment specifier */ 130 #pragma warning(disable : 4310) /* cast truncates constant value */ 131 #pragma warning( \ 132 disable : 4820) /* bytes padding added after data member for alignment */ 133 #pragma warning(disable : 4548) /* expression before comma has no effect; \ 134 expected expression with side - effect */ 135 #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ 136 unaligned */ 137 #pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \ 138 array in struct/union */ 139 #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ 140 aggregate initializer */ 141 #pragma warning( \ 142 disable : 4505) /* unreferenced local function has been removed */ 143 #endif /* _MSC_VER (warnings) */ 144 145 #if defined(__GNUC__) && __GNUC__ < 9 146 #pragma GCC diagnostic ignored "-Wattributes" 147 #endif /* GCC < 9 */ 148 149 #if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ 150 !defined(__USE_MINGW_ANSI_STDIO) 151 #define __USE_MINGW_ANSI_STDIO 1 152 #endif /* __USE_MINGW_ANSI_STDIO */ 153 154 #include "mdbx.h" 155 /* 156 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 157 * and other libmdbx authors: please see AUTHORS file. 158 * All rights reserved. 159 * 160 * Redistribution and use in source and binary forms, with or without 161 * modification, are permitted only as authorized by the OpenLDAP 162 * Public License. 163 * 164 * A copy of this license is available in the file LICENSE in the 165 * top-level directory of the distribution or, alternatively, at 166 * <http://www.OpenLDAP.org/license.html>. 167 */ 168 169 170 /*----------------------------------------------------------------------------*/ 171 /* Microsoft compiler generates a lot of warning for self includes... */ 172 173 #ifdef _MSC_VER 174 #pragma warning(push, 1) 175 #pragma warning(disable : 4548) /* expression before comma has no effect; \ 176 expected expression with side - effect */ 177 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ 178 * semantics are not enabled. Specify /EHsc */ 179 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ 180 * mode specified; termination on exception is \ 181 * not guaranteed. Specify /EHsc */ 182 #endif /* _MSC_VER (warnings) */ 183 184 #if defined(_WIN32) || defined(_WIN64) 185 #if !defined(_CRT_SECURE_NO_WARNINGS) 186 #define _CRT_SECURE_NO_WARNINGS 187 #endif /* _CRT_SECURE_NO_WARNINGS */ 188 #if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \ 189 !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT 190 #define _NO_CRT_STDIO_INLINE 191 #endif 192 #elif !defined(_POSIX_C_SOURCE) 193 #define _POSIX_C_SOURCE 200809L 194 #endif /* Windows */ 195 196 /*----------------------------------------------------------------------------*/ 197 /* basic C99 includes */ 198 #include <inttypes.h> 199 #include <stddef.h> 200 #include <stdint.h> 201 #include <stdlib.h> 202 203 #include <assert.h> 204 #include <fcntl.h> 205 #include <limits.h> 206 #include <stdio.h> 207 #include <string.h> 208 #include <time.h> 209 210 #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF 211 #error \ 212 "Sanity checking failed: Two's complement, reasonably sized integer types" 213 #endif 214 215 #ifndef SSIZE_MAX 216 #define SSIZE_MAX INTPTR_MAX 217 #endif 218 219 #if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul 220 #define MDBX_WORDBITS 64 221 #else 222 #define MDBX_WORDBITS 32 223 #endif /* MDBX_WORDBITS */ 224 225 /*----------------------------------------------------------------------------*/ 226 /* feature testing */ 227 228 #ifndef __has_warning 229 #define __has_warning(x) (0) 230 #endif 231 232 #ifndef __has_include 233 #define __has_include(x) (0) 234 #endif 235 236 #ifndef __has_feature 237 #define __has_feature(x) (0) 238 #endif 239 240 #ifndef __has_extension 241 #define __has_extension(x) (0) 242 #endif 243 244 #if __has_feature(thread_sanitizer) 245 #define __SANITIZE_THREAD__ 1 246 #endif 247 248 #if __has_feature(address_sanitizer) 249 #define __SANITIZE_ADDRESS__ 1 250 #endif 251 252 #ifndef __GNUC_PREREQ 253 #if defined(__GNUC__) && defined(__GNUC_MINOR__) 254 #define __GNUC_PREREQ(maj, min) \ 255 ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) 256 #else 257 #define __GNUC_PREREQ(maj, min) (0) 258 #endif 259 #endif /* __GNUC_PREREQ */ 260 261 #ifndef __CLANG_PREREQ 262 #ifdef __clang__ 263 #define __CLANG_PREREQ(maj, min) \ 264 ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) 265 #else 266 #define __CLANG_PREREQ(maj, min) (0) 267 #endif 268 #endif /* __CLANG_PREREQ */ 269 270 #ifndef __GLIBC_PREREQ 271 #if defined(__GLIBC__) && defined(__GLIBC_MINOR__) 272 #define __GLIBC_PREREQ(maj, min) \ 273 ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) 274 #else 275 #define __GLIBC_PREREQ(maj, min) (0) 276 #endif 277 #endif /* __GLIBC_PREREQ */ 278 279 /*----------------------------------------------------------------------------*/ 280 /* C11' alignas() */ 281 282 #if __has_include(<stdalign.h>) 283 #include <stdalign.h> 284 #endif 285 #if defined(alignas) || defined(__cplusplus) 286 #define MDBX_ALIGNAS(N) alignas(N) 287 #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L 288 #define MDBX_ALIGNAS(N) _Alignas(N) 289 #elif defined(_MSC_VER) 290 #define MDBX_ALIGNAS(N) __declspec(align(N)) 291 #elif __has_attribute(__aligned__) || defined(__GNUC__) 292 #define MDBX_ALIGNAS(N) __attribute__((__aligned__(N))) 293 #else 294 #error "FIXME: Required alignas() or equivalent." 295 #endif /* MDBX_ALIGNAS */ 296 297 /*----------------------------------------------------------------------------*/ 298 /* Systems macros and includes */ 299 300 #ifndef __extern_C 301 #ifdef __cplusplus 302 #define __extern_C extern "C" 303 #else 304 #define __extern_C 305 #endif 306 #endif /* __extern_C */ 307 308 #if !defined(nullptr) && !defined(__cplusplus) || \ 309 (__cplusplus < 201103L && !defined(_MSC_VER)) 310 #define nullptr NULL 311 #endif 312 313 #if defined(__APPLE__) || defined(_DARWIN_C_SOURCE) 314 #include <AvailabilityMacros.h> 315 #include <TargetConditionals.h> 316 #ifndef MAC_OS_X_VERSION_MIN_REQUIRED 317 #define MAC_OS_X_VERSION_MIN_REQUIRED 1070 /* Mac OS X 10.7, 2011 */ 318 #endif 319 #endif /* Apple OSX & iOS */ 320 321 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ 322 defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ 323 defined(__APPLE__) || defined(__MACH__) 324 #include <sys/cdefs.h> 325 #include <sys/mount.h> 326 #include <sys/sysctl.h> 327 #include <sys/types.h> 328 #if defined(__FreeBSD__) || defined(__DragonFly__) 329 #include <vm/vm_param.h> 330 #elif defined(__OpenBSD__) || defined(__NetBSD__) 331 #include <uvm/uvm_param.h> 332 #else 333 #define SYSCTL_LEGACY_NONCONST_MIB 334 #endif 335 #ifndef __MACH__ 336 #include <sys/vmmeter.h> 337 #endif 338 #else 339 #include <malloc.h> 340 #if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ 341 defined(_WIN32) || defined(_WIN64)) 342 #include <mntent.h> 343 #endif /* !Solaris */ 344 #endif /* !xBSD */ 345 346 #if defined(__FreeBSD__) || __has_include(<malloc_np.h>) 347 #include <malloc_np.h> 348 #endif 349 350 #if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>) 351 #include <malloc/malloc.h> 352 #endif /* MacOS */ 353 354 #if defined(__MACH__) 355 #include <mach/host_info.h> 356 #include <mach/mach_host.h> 357 #include <mach/mach_port.h> 358 #include <uuid/uuid.h> 359 #endif 360 361 #if defined(__linux__) || defined(__gnu_linux__) 362 #include <sched.h> 363 #include <sys/sendfile.h> 364 #include <sys/statfs.h> 365 #endif /* Linux */ 366 367 #ifndef _XOPEN_SOURCE 368 #define _XOPEN_SOURCE 0 369 #endif 370 371 #ifndef _XOPEN_SOURCE_EXTENDED 372 #define _XOPEN_SOURCE_EXTENDED 0 373 #else 374 #include <utmpx.h> 375 #endif /* _XOPEN_SOURCE_EXTENDED */ 376 377 #if defined(__sun) || defined(__SVR4) || defined(__svr4__) 378 #include <kstat.h> 379 #include <sys/mnttab.h> 380 /* On Solaris, it's easier to add a missing prototype rather than find a 381 * combination of #defines that break nothing. */ 382 __extern_C key_t ftok(const char *, int); 383 #endif /* SunOS/Solaris */ 384 385 #if defined(_WIN32) || defined(_WIN64) /*-------------------------------------*/ 386 387 #ifndef _WIN32_WINNT 388 #define _WIN32_WINNT 0x0601 /* Windows 7 */ 389 #elif _WIN32_WINNT < 0x0500 390 #error At least 'Windows 2000' API is required for libmdbx. 391 #endif /* _WIN32_WINNT */ 392 #if (defined(__MINGW32__) || defined(__MINGW64__)) && \ 393 !defined(__USE_MINGW_ANSI_STDIO) 394 #define __USE_MINGW_ANSI_STDIO 1 395 #endif /* MinGW */ 396 #ifndef WIN32_LEAN_AND_MEAN 397 #define WIN32_LEAN_AND_MEAN 398 #endif /* WIN32_LEAN_AND_MEAN */ 399 #include <excpt.h> 400 #include <tlhelp32.h> 401 #include <windows.h> 402 #include <winnt.h> 403 #include <winternl.h> 404 405 #else /*----------------------------------------------------------------------*/ 406 407 #include <unistd.h> 408 #if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1 409 #error "libmdbx requires the _POSIX_MAPPED_FILES feature" 410 #endif /* _POSIX_MAPPED_FILES */ 411 412 #include <pthread.h> 413 #include <semaphore.h> 414 #include <signal.h> 415 #include <sys/file.h> 416 #include <sys/ipc.h> 417 #include <sys/mman.h> 418 #include <sys/param.h> 419 #include <sys/stat.h> 420 #include <sys/statvfs.h> 421 #include <sys/uio.h> 422 423 #endif /*---------------------------------------------------------------------*/ 424 425 #if defined(__ANDROID_API__) || defined(ANDROID) 426 #include <android/log.h> 427 #if __ANDROID_API__ >= 21 428 #include <sys/sendfile.h> 429 #endif 430 #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS != MDBX_WORDBITS 431 #error "_FILE_OFFSET_BITS != MDBX_WORDBITS" (_FILE_OFFSET_BITS != MDBX_WORDBITS) 432 #elif defined(__FILE_OFFSET_BITS) && __FILE_OFFSET_BITS != MDBX_WORDBITS 433 #error "__FILE_OFFSET_BITS != MDBX_WORDBITS" (__FILE_OFFSET_BITS != MDBX_WORDBITS) 434 #endif 435 #endif /* Android */ 436 437 #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>) 438 #include <sys/stat.h> 439 #endif 440 #if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>) 441 #include <sys/types.h> 442 #endif 443 #if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>) 444 #include <sys/file.h> 445 #endif 446 447 /*----------------------------------------------------------------------------*/ 448 /* Byteorder */ 449 450 #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ 451 defined(i486) || defined(__i486) || defined(__i486__) || \ 452 defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ 453 defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ 454 defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ 455 defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ 456 defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ 457 defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) 458 #ifndef __ia32__ 459 /* LY: define neutral __ia32__ for x86 and x86-64 */ 460 #define __ia32__ 1 461 #endif /* __ia32__ */ 462 #if !defined(__amd64__) && \ 463 (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ 464 defined(_M_X64) || defined(_M_AMD64)) 465 /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ 466 #define __amd64__ 1 467 #endif /* __amd64__ */ 468 #endif /* all x86 */ 469 470 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ 471 !defined(__ORDER_BIG_ENDIAN__) 472 473 #if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || \ 474 defined(__ANDROID_API__) || defined(HAVE_ENDIAN_H) || __has_include(<endian.h>) 475 #include <endian.h> 476 #elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ 477 defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>) 478 #include <machine/endian.h> 479 #elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>) 480 #include <sys/isa_defs.h> 481 #elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ 482 (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>)) 483 #include <sys/endian.h> 484 #include <sys/types.h> 485 #elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ 486 defined(__NetBSD__) || defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>) 487 #include <sys/param.h> 488 #endif /* OS */ 489 490 #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) 491 #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN 492 #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN 493 #define __BYTE_ORDER__ __BYTE_ORDER 494 #elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) 495 #define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN 496 #define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN 497 #define __BYTE_ORDER__ _BYTE_ORDER 498 #else 499 #define __ORDER_LITTLE_ENDIAN__ 1234 500 #define __ORDER_BIG_ENDIAN__ 4321 501 502 #if defined(__LITTLE_ENDIAN__) || \ 503 (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ 504 defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ 505 defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ 506 defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ 507 defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ 508 defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ 509 defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ 510 defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ 511 defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ 512 defined(__WINDOWS__) 513 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ 514 515 #elif defined(__BIG_ENDIAN__) || \ 516 (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ 517 defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ 518 defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ 519 defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ 520 defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ 521 defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ 522 defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) 523 #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ 524 525 #else 526 #error __BYTE_ORDER__ should be defined. 527 #endif /* Arch */ 528 529 #endif 530 #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ 531 532 /*----------------------------------------------------------------------------*/ 533 /* Availability of CMOV or equivalent */ 534 535 #ifndef MDBX_HAVE_CMOV 536 #if defined(__e2k__) 537 #define MDBX_HAVE_CMOV 1 538 #elif defined(__thumb2__) || defined(__thumb2) 539 #define MDBX_HAVE_CMOV 1 540 #elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) 541 #define MDBX_HAVE_CMOV 0 542 #elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ 543 defined(__aarch64) || defined(__arm__) || defined(__arm) || \ 544 defined(__CC_ARM) 545 #define MDBX_HAVE_CMOV 1 546 #elif (defined(__riscv__) || defined(__riscv64)) && \ 547 (defined(__riscv_b) || defined(__riscv_bitmanip)) 548 #define MDBX_HAVE_CMOV 1 549 #elif defined(i686) || defined(__i686) || defined(__i686__) || \ 550 (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ 551 defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ 552 defined(_M_X64) || defined(_M_AMD64) 553 #define MDBX_HAVE_CMOV 1 554 #else 555 #define MDBX_HAVE_CMOV 0 556 #endif 557 #endif /* MDBX_HAVE_CMOV */ 558 559 /*----------------------------------------------------------------------------*/ 560 /* Compiler's includes for builtins/intrinsics */ 561 562 #if defined(_MSC_VER) || defined(__INTEL_COMPILER) 563 #include <intrin.h> 564 #elif __GNUC_PREREQ(4, 4) || defined(__clang__) 565 #if defined(__e2k__) 566 #include <e2kintrin.h> 567 #include <x86intrin.h> 568 #endif /* __e2k__ */ 569 #if defined(__ia32__) 570 #include <cpuid.h> 571 #include <x86intrin.h> 572 #endif /* __ia32__ */ 573 #ifdef __ARM_NEON 574 #include <arm_neon.h> 575 #endif 576 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) 577 #include <mbarrier.h> 578 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ 579 (defined(HP_IA64) || defined(__ia64)) 580 #include <machine/sys/inline.h> 581 #elif defined(__IBMC__) && defined(__powerpc) 582 #include <atomic.h> 583 #elif defined(_AIX) 584 #include <builtins.h> 585 #include <sys/atomic_op.h> 586 #elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) 587 #include <c_asm.h> 588 #include <machine/builtins.h> 589 #elif defined(__MWERKS__) 590 /* CodeWarrior - troubles ? */ 591 #pragma gcc_extensions 592 #elif defined(__SNC__) 593 /* Sony PS3 - troubles ? */ 594 #elif defined(__hppa__) || defined(__hppa) 595 #include <machine/inline.h> 596 #else 597 #error Unsupported C compiler, please use GNU C 4.4 or newer 598 #endif /* Compiler */ 599 600 #if !defined(__noop) && !defined(_MSC_VER) 601 #define __noop \ 602 do { \ 603 } while (0) 604 #endif /* __noop */ 605 606 #if defined(__fallthrough) && \ 607 (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) 608 #undef __fallthrough 609 #endif /* __fallthrough workaround for MinGW */ 610 611 #ifndef __fallthrough 612 #if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) && \ 613 (!defined(__clang__) || __clang__ > 4)) || \ 614 __cplusplus >= 201703L 615 #define __fallthrough [[fallthrough]] 616 #elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L 617 #define __fallthrough [[fallthrough]] 618 #elif __GNUC_PREREQ(7, 0) && \ 619 (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) || \ 620 (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126)) 621 #define __fallthrough __attribute__((__fallthrough__)) 622 #elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L && \ 623 __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") 624 #define __fallthrough [[clang::fallthrough]] 625 #else 626 #define __fallthrough 627 #endif 628 #endif /* __fallthrough */ 629 630 #ifndef __unreachable 631 #if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) 632 #define __unreachable() __builtin_unreachable() 633 #elif defined(_MSC_VER) 634 #define __unreachable() __assume(0) 635 #else 636 #define __unreachable() \ 637 do { \ 638 } while (1) 639 #endif 640 #endif /* __unreachable */ 641 642 #ifndef __prefetch 643 #if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch) 644 #define __prefetch(ptr) __builtin_prefetch(ptr) 645 #else 646 #define __prefetch(ptr) \ 647 do { \ 648 (void)(ptr); \ 649 } while (0) 650 #endif 651 #endif /* __prefetch */ 652 653 #ifndef offsetof 654 #define offsetof(type, member) __builtin_offsetof(type, member) 655 #endif /* offsetof */ 656 657 #ifndef container_of 658 #define container_of(ptr, type, member) \ 659 ((type *)((char *)(ptr)-offsetof(type, member))) 660 #endif /* container_of */ 661 662 /*----------------------------------------------------------------------------*/ 663 664 #ifndef __always_inline 665 #if defined(__GNUC__) || __has_attribute(__always_inline__) 666 #define __always_inline __inline __attribute__((__always_inline__)) 667 #elif defined(_MSC_VER) 668 #define __always_inline __forceinline 669 #else 670 #define __always_inline 671 #endif 672 #endif /* __always_inline */ 673 674 #ifndef __noinline 675 #if defined(__GNUC__) || __has_attribute(__noinline__) 676 #define __noinline __attribute__((__noinline__)) 677 #elif defined(_MSC_VER) 678 #define __noinline __declspec(noinline) 679 #else 680 #define __noinline 681 #endif 682 #endif /* __noinline */ 683 684 #ifndef __must_check_result 685 #if defined(__GNUC__) || __has_attribute(__warn_unused_result__) 686 #define __must_check_result __attribute__((__warn_unused_result__)) 687 #else 688 #define __must_check_result 689 #endif 690 #endif /* __must_check_result */ 691 692 #ifndef __nothrow 693 #if defined(__cplusplus) 694 #if __cplusplus < 201703L 695 #define __nothrow throw() 696 #else 697 #define __nothrow noexcept(true) 698 #endif /* __cplusplus */ 699 #elif defined(__GNUC__) || __has_attribute(__nothrow__) 700 #define __nothrow __attribute__((__nothrow__)) 701 #elif defined(_MSC_VER) && defined(__cplusplus) 702 #define __nothrow __declspec(nothrow) 703 #else 704 #define __nothrow 705 #endif 706 #endif /* __nothrow */ 707 708 #ifndef __hidden 709 #if defined(__GNUC__) || __has_attribute(__visibility__) 710 #define __hidden __attribute__((__visibility__("hidden"))) 711 #else 712 #define __hidden 713 #endif 714 #endif /* __hidden */ 715 716 #ifndef __optimize 717 #if defined(__OPTIMIZE__) 718 #if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__) 719 #define __optimize(ops) __attribute__((__optimize__(ops))) 720 #else 721 #define __optimize(ops) 722 #endif 723 #else 724 #define __optimize(ops) 725 #endif 726 #endif /* __optimize */ 727 728 #ifndef __hot 729 #if defined(__OPTIMIZE__) 730 #if defined(__e2k__) 731 #define __hot __attribute__((__hot__)) __optimize(3) 732 #elif defined(__clang__) && !__has_attribute(__hot_) && \ 733 __has_attribute(__section__) && \ 734 (defined(__linux__) || defined(__gnu_linux__)) 735 /* just put frequently used functions in separate section */ 736 #define __hot __attribute__((__section__("text.hot"))) __optimize("O3") 737 #elif defined(__LCC__) 738 #define __hot __attribute__((__hot__, __optimize__("Ofast,O4"))) 739 #elif defined(__GNUC__) || __has_attribute(__hot__) 740 #define __hot __attribute__((__hot__)) __optimize("O3") 741 #else 742 #define __hot __optimize("O3") 743 #endif 744 #else 745 #define __hot 746 #endif 747 #endif /* __hot */ 748 749 #ifndef __cold 750 #if defined(__OPTIMIZE__) 751 #if defined(__e2k__) 752 #define __cold __attribute__((__cold__)) __optimize(1) 753 #elif defined(__clang__) && !__has_attribute(cold) && \ 754 __has_attribute(__section__) && \ 755 (defined(__linux__) || defined(__gnu_linux__)) 756 /* just put infrequently used functions in separate section */ 757 #define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") 758 #elif defined(__LCC__) 759 #define __hot __attribute__((__cold__, __optimize__("Osize"))) 760 #elif defined(__GNUC__) || __has_attribute(cold) 761 #define __cold __attribute__((__cold__)) __optimize("Os") 762 #else 763 #define __cold __optimize("Os") 764 #endif 765 #else 766 #define __cold 767 #endif 768 #endif /* __cold */ 769 770 #ifndef __flatten 771 #if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__)) 772 #define __flatten __attribute__((__flatten__)) 773 #else 774 #define __flatten 775 #endif 776 #endif /* __flatten */ 777 778 #ifndef likely 779 #if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && \ 780 !defined(__COVERITY__) 781 #define likely(cond) __builtin_expect(!!(cond), 1) 782 #else 783 #define likely(x) (!!(x)) 784 #endif 785 #endif /* likely */ 786 787 #ifndef unlikely 788 #if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && \ 789 !defined(__COVERITY__) 790 #define unlikely(cond) __builtin_expect(!!(cond), 0) 791 #else 792 #define unlikely(x) (!!(x)) 793 #endif 794 #endif /* unlikely */ 795 796 #ifndef __anonymous_struct_extension__ 797 #if defined(__GNUC__) 798 #define __anonymous_struct_extension__ __extension__ 799 #else 800 #define __anonymous_struct_extension__ 801 #endif 802 #endif /* __anonymous_struct_extension__ */ 803 804 #ifndef expect_with_probability 805 #if defined(__builtin_expect_with_probability) || \ 806 __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) 807 #define expect_with_probability(expr, value, prob) \ 808 __builtin_expect_with_probability(expr, value, prob) 809 #else 810 #define expect_with_probability(expr, value, prob) (expr) 811 #endif 812 #endif /* expect_with_probability */ 813 814 #ifndef MDBX_WEAK_IMPORT_ATTRIBUTE 815 #ifdef WEAK_IMPORT_ATTRIBUTE 816 #define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE 817 #elif __has_attribute(__weak__) && __has_attribute(__weak_import__) 818 #define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__, __weak_import__)) 819 #elif __has_attribute(__weak__) || \ 820 (defined(__GNUC__) && __GNUC__ >= 4 && defined(__ELF__)) 821 #define MDBX_WEAK_IMPORT_ATTRIBUTE __attribute__((__weak__)) 822 #else 823 #define MDBX_WEAK_IMPORT_ATTRIBUTE 824 #endif 825 #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ 826 827 /*----------------------------------------------------------------------------*/ 828 829 #if defined(MDBX_USE_VALGRIND) 830 #include <valgrind/memcheck.h> 831 #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE 832 /* LY: available since Valgrind 3.10 */ 833 #define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s) 834 #define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s) 835 #endif 836 #elif !defined(RUNNING_ON_VALGRIND) 837 #define VALGRIND_CREATE_MEMPOOL(h, r, z) 838 #define VALGRIND_DESTROY_MEMPOOL(h) 839 #define VALGRIND_MEMPOOL_TRIM(h, a, s) 840 #define VALGRIND_MEMPOOL_ALLOC(h, a, s) 841 #define VALGRIND_MEMPOOL_FREE(h, a) 842 #define VALGRIND_MEMPOOL_CHANGE(h, a, b, s) 843 #define VALGRIND_MAKE_MEM_NOACCESS(a, s) 844 #define VALGRIND_MAKE_MEM_DEFINED(a, s) 845 #define VALGRIND_MAKE_MEM_UNDEFINED(a, s) 846 #define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s) 847 #define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a, s) 848 #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) 849 #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) 850 #define RUNNING_ON_VALGRIND (0) 851 #endif /* MDBX_USE_VALGRIND */ 852 853 #ifdef __SANITIZE_ADDRESS__ 854 #include <sanitizer/asan_interface.h> 855 #elif !defined(ASAN_POISON_MEMORY_REGION) 856 #define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) 857 #define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) 858 #endif /* __SANITIZE_ADDRESS__ */ 859 860 /*----------------------------------------------------------------------------*/ 861 862 #ifndef ARRAY_LENGTH 863 #ifdef __cplusplus 864 template <typename T, size_t N> char (&__ArraySizeHelper(T (&array)[N]))[N]; 865 #define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) 866 #else 867 #define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) 868 #endif 869 #endif /* ARRAY_LENGTH */ 870 871 #ifndef ARRAY_END 872 #define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) 873 #endif /* ARRAY_END */ 874 875 #define CONCAT(a, b) a##b 876 #define XCONCAT(a, b) CONCAT(a, b) 877 878 #define MDBX_TETRAD(a, b, c, d) \ 879 ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) 880 881 #define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3]) 882 883 #define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__) 884 885 #ifndef STATIC_ASSERT_MSG 886 #if defined(static_assert) 887 #define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg) 888 #elif defined(_STATIC_ASSERT) 889 #define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr) 890 #elif defined(_MSC_VER) 891 #include <crtdbg.h> 892 #define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr) 893 #elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ 894 __has_feature(c_static_assert) 895 #define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg) 896 #else 897 #define STATIC_ASSERT_MSG(expr, msg) \ 898 switch (0) { \ 899 case 0: \ 900 case (expr):; \ 901 } 902 #endif 903 #endif /* STATIC_ASSERT */ 904 905 #ifndef STATIC_ASSERT 906 #define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr) 907 #endif 908 909 #ifndef __Wpedantic_format_voidptr 910 MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void * 911 __Wpedantic_format_voidptr(const void *ptr) { 912 return ptr; 913 } 914 #define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) 915 #endif /* __Wpedantic_format_voidptr */ 916 917 #if defined(__GNUC__) && !__GNUC_PREREQ(4, 2) 918 /* Actually libmdbx was not tested with compilers older than GCC 4.2. 919 * But you could ignore this warning at your own risk. 920 * In such case please don't rise up an issues related ONLY to old compilers. 921 */ 922 #warning "libmdbx required GCC >= 4.2" 923 #endif 924 925 #if defined(__clang__) && !__CLANG_PREREQ(3, 8) 926 /* Actually libmdbx was not tested with CLANG older than 3.8. 927 * But you could ignore this warning at your own risk. 928 * In such case please don't rise up an issues related ONLY to old compilers. 929 */ 930 #warning "libmdbx required CLANG >= 3.8" 931 #endif 932 933 #if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) 934 /* Actually libmdbx was not tested with something older than glibc 2.12. 935 * But you could ignore this warning at your own risk. 936 * In such case please don't rise up an issues related ONLY to old systems. 937 */ 938 #warning "libmdbx was only tested with GLIBC >= 2.12." 939 #endif 940 941 #ifdef __SANITIZE_THREAD__ 942 #warning \ 943 "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." 944 #endif /* __SANITIZE_THREAD__ */ 945 946 #if __has_warning("-Wnested-anon-types") 947 #if defined(__clang__) 948 #pragma clang diagnostic ignored "-Wnested-anon-types" 949 #elif defined(__GNUC__) 950 #pragma GCC diagnostic ignored "-Wnested-anon-types" 951 #else 952 #pragma warning disable "nested-anon-types" 953 #endif 954 #endif /* -Wnested-anon-types */ 955 956 #if __has_warning("-Wconstant-logical-operand") 957 #if defined(__clang__) 958 #pragma clang diagnostic ignored "-Wconstant-logical-operand" 959 #elif defined(__GNUC__) 960 #pragma GCC diagnostic ignored "-Wconstant-logical-operand" 961 #else 962 #pragma warning disable "constant-logical-operand" 963 #endif 964 #endif /* -Wconstant-logical-operand */ 965 966 #if defined(__LCC__) && (__LCC__ <= 121) 967 /* bug #2798 */ 968 #pragma diag_suppress alignment_reduction_ignored 969 #elif defined(__ICC) 970 #pragma warning(disable : 3453 1366) 971 #elif __has_warning("-Walignment-reduction-ignored") 972 #if defined(__clang__) 973 #pragma clang diagnostic ignored "-Walignment-reduction-ignored" 974 #elif defined(__GNUC__) 975 #pragma GCC diagnostic ignored "-Walignment-reduction-ignored" 976 #else 977 #pragma warning disable "alignment-reduction-ignored" 978 #endif 979 #endif /* -Walignment-reduction-ignored */ 980 981 #ifndef MDBX_EXCLUDE_FOR_GPROF 982 #ifdef ENABLE_GPROF 983 #define MDBX_EXCLUDE_FOR_GPROF \ 984 __attribute__((__no_instrument_function__, \ 985 __no_profile_instrument_function__)) 986 #else 987 #define MDBX_EXCLUDE_FOR_GPROF 988 #endif /* ENABLE_GPROF */ 989 #endif /* MDBX_EXCLUDE_FOR_GPROF */ 990 991 #ifdef __cplusplus 992 extern "C" { 993 #endif 994 995 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ 996 997 /* 998 * Copyright 2015-2022 Leonid Yuriev <leo@yuriev.ru> 999 * and other libmdbx authors: please see AUTHORS file. 1000 * All rights reserved. 1001 * 1002 * Redistribution and use in source and binary forms, with or without 1003 * modification, are permitted only as authorized by the OpenLDAP 1004 * Public License. 1005 * 1006 * A copy of this license is available in the file LICENSE in the 1007 * top-level directory of the distribution or, alternatively, at 1008 * <http://www.OpenLDAP.org/license.html>. 1009 */ 1010 1011 1012 /*----------------------------------------------------------------------------*/ 1013 /* C11 Atomics */ 1014 1015 #if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) 1016 #include <cstdatomic> 1017 #define MDBX_HAVE_C11ATOMICS 1018 #elif !defined(__cplusplus) && \ 1019 (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ 1020 !defined(__STDC_NO_ATOMICS__) && \ 1021 (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ 1022 !(defined(__GNUC__) || defined(__clang__))) 1023 #include <stdatomic.h> 1024 #define MDBX_HAVE_C11ATOMICS 1025 #elif defined(__GNUC__) || defined(__clang__) 1026 #elif defined(_MSC_VER) 1027 #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ 1028 #pragma warning(disable : 4133) /* 'function': incompatible types - from \ 1029 'size_t' to 'LONGLONG' */ 1030 #pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ 1031 'std::size_t', possible loss of data */ 1032 #pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ 1033 'long', possible loss of data */ 1034 #pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) 1035 #pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) 1036 #elif defined(__APPLE__) 1037 #include <libkern/OSAtomic.h> 1038 #else 1039 #error FIXME atomic-ops 1040 #endif 1041 1042 /*----------------------------------------------------------------------------*/ 1043 /* Memory/Compiler barriers, cache coherence */ 1044 1045 #if __has_include(<sys/cachectl.h>) 1046 #include <sys/cachectl.h> 1047 #elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ 1048 defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ 1049 defined(__MWERKS__) || defined(__sgi) 1050 /* MIPS should have explicit cache control */ 1051 #include <sys/cachectl.h> 1052 #endif 1053 1054 MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { 1055 #if defined(__clang__) || defined(__GNUC__) 1056 __asm__ __volatile__("" ::: "memory"); 1057 #elif defined(_MSC_VER) 1058 _ReadWriteBarrier(); 1059 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ 1060 __memory_barrier(); 1061 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) 1062 __compiler_barrier(); 1063 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ 1064 (defined(HP_IA64) || defined(__ia64)) 1065 _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 0x3D3D */); 1066 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ 1067 defined(__ppc64__) || defined(__powerpc64__) 1068 __fence(); 1069 #else 1070 #error "Could not guess the kind of compiler, please report to us." 1071 #endif 1072 } 1073 1074 MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { 1075 #ifdef MDBX_HAVE_C11ATOMICS 1076 atomic_thread_fence(memory_order_seq_cst); 1077 #elif defined(__ATOMIC_SEQ_CST) 1078 #ifdef __clang__ 1079 __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); 1080 #else 1081 __atomic_thread_fence(__ATOMIC_SEQ_CST); 1082 #endif 1083 #elif defined(__clang__) || defined(__GNUC__) 1084 __sync_synchronize(); 1085 #elif defined(_WIN32) || defined(_WIN64) 1086 MemoryBarrier(); 1087 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ 1088 #if defined(__ia32__) 1089 _mm_mfence(); 1090 #else 1091 __mf(); 1092 #endif 1093 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) 1094 __machine_rw_barrier(); 1095 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ 1096 (defined(HP_IA64) || defined(__ia64)) 1097 _Asm_mf(); 1098 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ 1099 defined(__ppc64__) || defined(__powerpc64__) 1100 __lwsync(); 1101 #else 1102 #error "Could not guess the kind of compiler, please report to us." 1103 #endif 1104 } 1105 1106 /*----------------------------------------------------------------------------*/ 1107 /* system-depended definitions */ 1108 1109 #if defined(_WIN32) || defined(_WIN64) 1110 #define HAVE_SYS_STAT_H 1111 #define HAVE_SYS_TYPES_H 1112 typedef HANDLE osal_thread_t; 1113 typedef unsigned osal_thread_key_t; 1114 #define MAP_FAILED NULL 1115 #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) 1116 #define THREAD_CALL WINAPI 1117 #define THREAD_RESULT DWORD 1118 typedef struct { 1119 HANDLE mutex; 1120 HANDLE event[2]; 1121 } osal_condpair_t; 1122 typedef CRITICAL_SECTION osal_fastmutex_t; 1123 1124 #if !defined(_MSC_VER) && !defined(__try) 1125 #define __try 1126 #define __except(COND) if (false) 1127 #endif /* stub for MSVC's __try/__except */ 1128 1129 #if MDBX_WITHOUT_MSVC_CRT 1130 1131 #ifndef osal_malloc 1132 static inline void *osal_malloc(size_t bytes) { 1133 return HeapAlloc(GetProcessHeap(), 0, bytes); 1134 } 1135 #endif /* osal_malloc */ 1136 1137 #ifndef osal_calloc 1138 static inline void *osal_calloc(size_t nelem, size_t size) { 1139 return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); 1140 } 1141 #endif /* osal_calloc */ 1142 1143 #ifndef osal_realloc 1144 static inline void *osal_realloc(void *ptr, size_t bytes) { 1145 return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) 1146 : HeapAlloc(GetProcessHeap(), 0, bytes); 1147 } 1148 #endif /* osal_realloc */ 1149 1150 #ifndef osal_free 1151 static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } 1152 #endif /* osal_free */ 1153 1154 #else /* MDBX_WITHOUT_MSVC_CRT */ 1155 1156 #define osal_malloc malloc 1157 #define osal_calloc calloc 1158 #define osal_realloc realloc 1159 #define osal_free free 1160 #define osal_strdup _strdup 1161 1162 #endif /* MDBX_WITHOUT_MSVC_CRT */ 1163 1164 #ifndef snprintf 1165 #define snprintf _snprintf /* ntdll */ 1166 #endif 1167 1168 #ifndef vsnprintf 1169 #define vsnprintf _vsnprintf /* ntdll */ 1170 #endif 1171 1172 MDBX_INTERNAL_FUNC size_t osal_mb2w(wchar_t *dst, size_t dst_n, const char *src, 1173 size_t src_n); 1174 1175 #else /*----------------------------------------------------------------------*/ 1176 1177 typedef pthread_t osal_thread_t; 1178 typedef pthread_key_t osal_thread_key_t; 1179 #define INVALID_HANDLE_VALUE (-1) 1180 #define THREAD_CALL 1181 #define THREAD_RESULT void * 1182 typedef struct { 1183 pthread_mutex_t mutex; 1184 pthread_cond_t cond[2]; 1185 } osal_condpair_t; 1186 typedef pthread_mutex_t osal_fastmutex_t; 1187 #define osal_malloc malloc 1188 #define osal_calloc calloc 1189 #define osal_realloc realloc 1190 #define osal_free free 1191 #define osal_strdup strdup 1192 #endif /* Platform */ 1193 1194 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) 1195 /* malloc_usable_size() already provided */ 1196 #elif defined(__APPLE__) 1197 #define malloc_usable_size(ptr) malloc_size(ptr) 1198 #elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT 1199 #define malloc_usable_size(ptr) _msize(ptr) 1200 #endif /* malloc_usable_size */ 1201 1202 /*----------------------------------------------------------------------------*/ 1203 /* OS abstraction layer stuff */ 1204 1205 /* Get the size of a memory page for the system. 1206 * This is the basic size that the platform's memory manager uses, and is 1207 * fundamental to the use of memory-mapped files. */ 1208 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t 1209 osal_syspagesize(void) { 1210 #if defined(_WIN32) || defined(_WIN64) 1211 SYSTEM_INFO si; 1212 GetSystemInfo(&si); 1213 return si.dwPageSize; 1214 #else 1215 return sysconf(_SC_PAGE_SIZE); 1216 #endif 1217 } 1218 1219 #if defined(_WIN32) || defined(_WIN64) 1220 typedef wchar_t pathchar_t; 1221 #else 1222 typedef char pathchar_t; 1223 #endif 1224 1225 typedef struct osal_mmap_param { 1226 union { 1227 void *address; 1228 uint8_t *dxb; 1229 struct MDBX_lockinfo *lck; 1230 }; 1231 mdbx_filehandle_t fd; 1232 size_t limit; /* mapping length, but NOT a size of file nor DB */ 1233 size_t current; /* mapped region size, i.e. the size of file and DB */ 1234 uint64_t filesize /* in-process cache of a file size */; 1235 #if defined(_WIN32) || defined(_WIN64) 1236 HANDLE section; /* memory-mapped section handle */ 1237 #endif 1238 } osal_mmap_t; 1239 1240 typedef union bin128 { 1241 __anonymous_struct_extension__ struct { uint64_t x, y; }; 1242 __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; 1243 } bin128_t; 1244 1245 #if defined(_WIN32) || defined(_WIN64) 1246 typedef union osal_srwlock { 1247 __anonymous_struct_extension__ struct { 1248 long volatile readerCount; 1249 long volatile writerCount; 1250 }; 1251 RTL_SRWLOCK native; 1252 } osal_srwlock_t; 1253 #endif /* Windows */ 1254 1255 #ifndef __cplusplus 1256 1257 /*----------------------------------------------------------------------------*/ 1258 /* libc compatibility stuff */ 1259 1260 #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ 1261 (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) 1262 #define osal_asprintf asprintf 1263 #define osal_vasprintf vasprintf 1264 #else 1265 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC 1266 MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); 1267 MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); 1268 #endif 1269 1270 #if !defined(MADV_DODUMP) && defined(MADV_CORE) 1271 #define MADV_DODUMP MADV_CORE 1272 #endif /* MADV_CORE -> MADV_DODUMP */ 1273 1274 #if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE) 1275 #define MADV_DONTDUMP MADV_NOCORE 1276 #endif /* MADV_NOCORE -> MADV_DONTDUMP */ 1277 1278 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); 1279 MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); 1280 1281 /* max bytes to write in one call */ 1282 #if defined(_WIN32) || defined(_WIN64) 1283 #define MAX_WRITE UINT32_C(0x01000000) 1284 #else 1285 #define MAX_WRITE UINT32_C(0x3fff0000) 1286 #endif 1287 1288 #if defined(__linux__) || defined(__gnu_linux__) 1289 MDBX_INTERNAL_VAR uint32_t linux_kernel_version; 1290 MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; 1291 #endif /* Linux */ 1292 1293 #ifndef osal_strdup 1294 LIBMDBX_API char *osal_strdup(const char *str); 1295 #endif 1296 1297 MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { 1298 #if defined(_WIN32) || defined(_WIN64) 1299 DWORD rc = GetLastError(); 1300 #else 1301 int rc = errno; 1302 #endif 1303 return rc; 1304 } 1305 1306 #ifndef osal_memalign_alloc 1307 MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, 1308 void **result); 1309 #endif 1310 #ifndef osal_memalign_free 1311 MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); 1312 #endif 1313 1314 MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); 1315 MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); 1316 MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); 1317 MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, 1318 bool part); 1319 MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); 1320 MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); 1321 1322 MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); 1323 MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); 1324 MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); 1325 MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); 1326 1327 MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, 1328 int iovcnt, uint64_t offset, 1329 size_t expected_written); 1330 MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, 1331 uint64_t offset); 1332 MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, 1333 size_t count, uint64_t offset); 1334 MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, 1335 size_t count); 1336 1337 MDBX_INTERNAL_FUNC int 1338 osal_thread_create(osal_thread_t *thread, 1339 THREAD_RESULT(THREAD_CALL *start_routine)(void *), 1340 void *arg); 1341 MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); 1342 1343 enum osal_syncmode_bits { 1344 MDBX_SYNC_NONE = 0, 1345 MDBX_SYNC_DATA = 1, 1346 MDBX_SYNC_SIZE = 2, 1347 MDBX_SYNC_IODQ = 4 1348 }; 1349 1350 MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, 1351 const enum osal_syncmode_bits mode_bits); 1352 MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); 1353 MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); 1354 MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); 1355 1356 enum osal_openfile_purpose { 1357 MDBX_OPEN_DXB_READ = 0, 1358 MDBX_OPEN_DXB_LAZY = 1, 1359 MDBX_OPEN_DXB_DSYNC = 2, 1360 MDBX_OPEN_LCK = 3, 1361 MDBX_OPEN_COPY = 4, 1362 MDBX_OPEN_DELETE = 5 1363 }; 1364 1365 MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, 1366 const MDBX_env *env, 1367 const pathchar_t *pathname, 1368 mdbx_filehandle_t *fd, 1369 mdbx_mode_t unix_mode_bits); 1370 MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); 1371 MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); 1372 MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); 1373 MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); 1374 MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); 1375 1376 #define MMAP_OPTION_TRUNCATE 1 1377 #define MMAP_OPTION_SEMAPHORE 2 1378 MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, 1379 const size_t must, const size_t limit, 1380 const unsigned options); 1381 MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); 1382 #define MDBX_MRESIZE_MAY_MOVE 0x00000100 1383 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 1384 MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, 1385 size_t size, size_t limit); 1386 #if defined(_WIN32) || defined(_WIN64) 1387 typedef struct { 1388 unsigned limit, count; 1389 HANDLE handles[31]; 1390 } mdbx_handle_array_t; 1391 MDBX_INTERNAL_FUNC int 1392 osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); 1393 MDBX_INTERNAL_FUNC int 1394 osal_resume_threads_after_remap(mdbx_handle_array_t *array); 1395 #endif /* Windows */ 1396 MDBX_INTERNAL_FUNC int osal_msync(osal_mmap_t *map, size_t offset, 1397 size_t length, 1398 enum osal_syncmode_bits mode_bits); 1399 MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, 1400 const pathchar_t *pathname, 1401 int err); 1402 1403 MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { 1404 STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); 1405 #if defined(_WIN32) || defined(_WIN64) 1406 return GetCurrentProcessId(); 1407 #else 1408 STATIC_ASSERT(sizeof(pid_t) <= sizeof(uint32_t)); 1409 return getpid(); 1410 #endif 1411 } 1412 1413 MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { 1414 mdbx_tid_t thunk; 1415 STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); 1416 #if defined(_WIN32) || defined(_WIN64) 1417 thunk = GetCurrentThreadId(); 1418 #else 1419 thunk = pthread_self(); 1420 #endif 1421 return (uintptr_t)thunk; 1422 } 1423 1424 #if !defined(_WIN32) && !defined(_WIN64) 1425 #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) 1426 MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); 1427 #else 1428 static __inline int osal_check_tid4bionic(void) { return 0; } 1429 #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ 1430 1431 MDBX_MAYBE_UNUSED static __inline int 1432 osal_pthread_mutex_lock(pthread_mutex_t *mutex) { 1433 int err = osal_check_tid4bionic(); 1434 return unlikely(err) ? err : pthread_mutex_lock(mutex); 1435 } 1436 #endif /* !Windows */ 1437 1438 MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); 1439 MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); 1440 MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); 1441 1442 MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); 1443 /*----------------------------------------------------------------------------*/ 1444 /* lck stuff */ 1445 1446 /// \brief Initialization of synchronization primitives linked with MDBX_env 1447 /// instance both in LCK-file and within the current process. 1448 /// \param 1449 /// global_uniqueness_flag = true - denotes that there are no other processes 1450 /// working with DB and LCK-file. Thus the function MUST initialize 1451 /// shared synchronization objects in memory-mapped LCK-file. 1452 /// global_uniqueness_flag = false - denotes that at least one process is 1453 /// already working with DB and LCK-file, including the case when DB 1454 /// has already been opened in the current process. Thus the function 1455 /// MUST NOT initialize shared synchronization objects in memory-mapped 1456 /// LCK-file that are already in use. 1457 /// \return Error code or zero on success. 1458 MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, 1459 MDBX_env *inprocess_neighbor, 1460 int global_uniqueness_flag); 1461 1462 /// \brief Disconnects from shared interprocess objects and destructs 1463 /// synchronization objects linked with MDBX_env instance 1464 /// within the current process. 1465 /// \param 1466 /// inprocess_neighbor = NULL - if the current process does not have other 1467 /// instances of MDBX_env linked with the DB being closed. 1468 /// Thus the function MUST check for other processes working with DB or 1469 /// LCK-file, and keep or destroy shared synchronization objects in 1470 /// memory-mapped LCK-file depending on the result. 1471 /// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env 1472 /// (anyone of there is several) working with DB or LCK-file within the 1473 /// current process. Thus the function MUST NOT try to acquire exclusive 1474 /// lock and/or try to destruct shared synchronization objects linked with 1475 /// DB or LCK-file. Moreover, the implementation MUST ensure correct work 1476 /// of other instances of MDBX_env within the current process, e.g. 1477 /// restore POSIX-fcntl locks after the closing of file descriptors. 1478 /// \return Error code (MDBX_PANIC) or zero on success. 1479 MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, 1480 MDBX_env *inprocess_neighbor); 1481 1482 /// \brief Connects to shared interprocess locking objects and tries to acquire 1483 /// the maximum lock level (shared if exclusive is not available) 1484 /// Depending on implementation or/and platform (Windows) this function may 1485 /// acquire the non-OS super-level lock (e.g. for shared synchronization 1486 /// objects initialization), which will be downgraded to OS-exclusive or 1487 /// shared via explicit calling of osal_lck_downgrade(). 1488 /// \return 1489 /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus 1490 /// the current process is the first and only after the last use of DB. 1491 /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus 1492 /// DB has already been opened and now is used by other processes. 1493 /// Otherwise (not 0 and not -1) - error code. 1494 MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); 1495 1496 /// \brief Downgrades the level of initially acquired lock to 1497 /// operational level specified by argument. The reson for such downgrade: 1498 /// - unblocking of other processes that are waiting for access, i.e. 1499 /// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes 1500 /// should be made aware that access is unavailable rather than 1501 /// wait for it. 1502 /// - freeing locks that interfere file operation (especially for Windows) 1503 /// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock. 1504 /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive 1505 /// operational lock. 1506 /// \return Error code or zero on success 1507 MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); 1508 1509 /// \brief Locks LCK-file or/and table of readers for (de)registering. 1510 /// \return Error code or zero on success 1511 MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); 1512 1513 /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 1514 MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); 1515 1516 /// \brief Acquires lock for DB change (on writing transaction start) 1517 /// Reading transactions will not be blocked. 1518 /// Declared as LIBMDBX_API because it is used in mdbx_chk. 1519 /// \return Error code or zero on success 1520 LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); 1521 1522 /// \brief Releases lock once DB changes is made (after writing transaction 1523 /// has finished). 1524 /// Declared as LIBMDBX_API because it is used in mdbx_chk. 1525 LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); 1526 1527 /// \brief Sets alive-flag of reader presence (indicative lock) for PID of 1528 /// the current process. The function does no more than needed for 1529 /// the correct working of osal_rpid_check() in other processes. 1530 /// \return Error code or zero on success 1531 MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); 1532 1533 /// \brief Resets alive-flag of reader presence (indicative lock) 1534 /// for PID of the current process. The function does no more than needed 1535 /// for the correct working of osal_rpid_check() in other processes. 1536 /// \return Error code or zero on success 1537 MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); 1538 1539 /// \brief Checks for reading process status with the given pid with help of 1540 /// alive-flag of presence (indicative lock) or using another way. 1541 /// \return 1542 /// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive 1543 /// and working with DB (indicative lock is present). 1544 /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent 1545 /// or not working with DB (indicative lock is not present). 1546 /// Otherwise (not 0 and not -1) - error code. 1547 MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); 1548 1549 #if defined(_WIN32) || defined(_WIN64) 1550 1551 #define OSAL_MB2WIDE(FROM, TO) \ 1552 do { \ 1553 const char *const from_tmp = (FROM); \ 1554 const size_t from_mblen = strlen(from_tmp); \ 1555 const size_t to_wlen = osal_mb2w(nullptr, 0, from_tmp, from_mblen); \ 1556 if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ 1557 return ERROR_INVALID_NAME; \ 1558 wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ 1559 if (to_wlen + 1 != \ 1560 osal_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ 1561 return ERROR_INVALID_NAME; \ 1562 (TO) = to_tmp; \ 1563 } while (0) 1564 1565 typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); 1566 MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, 1567 osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, 1568 osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; 1569 1570 #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ 1571 typedef enum _FILE_INFO_BY_HANDLE_CLASS { 1572 FileBasicInfo, 1573 FileStandardInfo, 1574 FileNameInfo, 1575 FileRenameInfo, 1576 FileDispositionInfo, 1577 FileAllocationInfo, 1578 FileEndOfFileInfo, 1579 FileStreamInfo, 1580 FileCompressionInfo, 1581 FileAttributeTagInfo, 1582 FileIdBothDirectoryInfo, 1583 FileIdBothDirectoryRestartInfo, 1584 FileIoPriorityHintInfo, 1585 FileRemoteProtocolInfo, 1586 MaximumFileInfoByHandleClass 1587 } FILE_INFO_BY_HANDLE_CLASS, 1588 *PFILE_INFO_BY_HANDLE_CLASS; 1589 1590 typedef struct _FILE_END_OF_FILE_INFO { 1591 LARGE_INTEGER EndOfFile; 1592 } FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO; 1593 1594 #define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001 1595 #define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002 1596 1597 typedef struct _FILE_REMOTE_PROTOCOL_INFO { 1598 USHORT StructureVersion; 1599 USHORT StructureSize; 1600 DWORD Protocol; 1601 USHORT ProtocolMajorVersion; 1602 USHORT ProtocolMinorVersion; 1603 USHORT ProtocolRevision; 1604 USHORT Reserved; 1605 DWORD Flags; 1606 struct { 1607 DWORD Reserved[8]; 1608 } GenericReserved; 1609 struct { 1610 DWORD Reserved[16]; 1611 } ProtocolSpecificReserved; 1612 } FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO; 1613 1614 #endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */ 1615 1616 typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( 1617 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, 1618 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); 1619 MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx 1620 mdbx_GetFileInformationByHandleEx; 1621 1622 typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( 1623 _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, 1624 _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber, 1625 _Out_opt_ LPDWORD lpMaximumComponentLength, 1626 _Out_opt_ LPDWORD lpFileSystemFlags, 1627 _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); 1628 MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW 1629 mdbx_GetVolumeInformationByHandleW; 1630 1631 typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, 1632 _Out_ LPWSTR lpszFilePath, 1633 _In_ DWORD cchFilePath, 1634 _In_ DWORD dwFlags); 1635 MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; 1636 1637 typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( 1638 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, 1639 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); 1640 MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle 1641 mdbx_SetFileInformationByHandle; 1642 1643 typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( 1644 IN HANDLE FileHandle, IN OUT HANDLE Event, 1645 IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext, 1646 OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, 1647 IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, 1648 OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); 1649 MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; 1650 1651 typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); 1652 MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; 1653 1654 #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 1655 typedef struct _WIN32_MEMORY_RANGE_ENTRY { 1656 PVOID VirtualAddress; 1657 SIZE_T NumberOfBytes; 1658 } WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; 1659 #endif /* Windows 8.x */ 1660 1661 typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( 1662 HANDLE hProcess, ULONG_PTR NumberOfEntries, 1663 PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); 1664 MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; 1665 1666 typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; 1667 1668 typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, 1669 IN PLARGE_INTEGER NewSectionSize); 1670 MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; 1671 1672 static __inline bool mdbx_RunningUnderWine(void) { 1673 return !mdbx_NtExtendSection; 1674 } 1675 1676 typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, 1677 LPCSTR lpValue, DWORD dwFlags, 1678 LPDWORD pdwType, PVOID pvData, 1679 LPDWORD pcbData); 1680 MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; 1681 1682 NTSYSAPI ULONG RtlRandomEx(PULONG Seed); 1683 1684 #endif /* Windows */ 1685 1686 #endif /* !__cplusplus */ 1687 1688 /*----------------------------------------------------------------------------*/ 1689 1690 #if defined(_MSC_VER) && _MSC_VER >= 1900 1691 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros 1692 * for internal format-args checker. */ 1693 #undef PRIuPTR 1694 #undef PRIiPTR 1695 #undef PRIdPTR 1696 #undef PRIxPTR 1697 #define PRIuPTR "Iu" 1698 #define PRIiPTR "Ii" 1699 #define PRIdPTR "Id" 1700 #define PRIxPTR "Ix" 1701 #define PRIuSIZE "zu" 1702 #define PRIiSIZE "zi" 1703 #define PRIdSIZE "zd" 1704 #define PRIxSIZE "zx" 1705 #endif /* fix PRI*PTR for _MSC_VER */ 1706 1707 #ifndef PRIuSIZE 1708 #define PRIuSIZE PRIuPTR 1709 #define PRIiSIZE PRIiPTR 1710 #define PRIdSIZE PRIdPTR 1711 #define PRIxSIZE PRIxPTR 1712 #endif /* PRI*SIZE macros for MSVC */ 1713 1714 #ifdef _MSC_VER 1715 #pragma warning(pop) 1716 #endif 1717 1718 #define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY) 1719 #if defined(xMDBX_TOOLS) 1720 extern LIBMDBX_API const char *const mdbx_sourcery_anchor; 1721 #endif 1722 1723 /******************************************************************************* 1724 ******************************************************************************* 1725 ******************************************************************************* 1726 * 1727 * 1728 * #### ##### ##### # #### # # #### 1729 * # # # # # # # # ## # # 1730 * # # # # # # # # # # # #### 1731 * # # ##### # # # # # # # # 1732 * # # # # # # # # ## # # 1733 * #### # # # #### # # #### 1734 * 1735 * 1736 */ 1737 1738 /** \defgroup build_option Build options 1739 * The libmdbx build options. 1740 @{ */ 1741 1742 /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ 1743 #define MDBX_OSX_WANNA_DURABILITY 0 1744 /** Using fsync() with chance of data lost on power failure */ 1745 #define MDBX_OSX_WANNA_SPEED 1 1746 1747 #ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY 1748 /** Choices \ref MDBX_OSX_WANNA_DURABILITY or \ref MDBX_OSX_WANNA_SPEED 1749 * for OSX & iOS */ 1750 #define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY 1751 #endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */ 1752 1753 /** Controls checking PID against reuse DB environment after the fork() */ 1754 #ifndef MDBX_ENV_CHECKPID 1755 #if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64) 1756 /* PID check could be omitted: 1757 * - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork() 1758 * mapped pages will not be available for child process. 1759 * - in Windows where fork() not available. */ 1760 #define MDBX_ENV_CHECKPID 0 1761 #else 1762 #define MDBX_ENV_CHECKPID 1 1763 #endif 1764 #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) 1765 #else 1766 #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) 1767 #endif /* MDBX_ENV_CHECKPID */ 1768 1769 /** Controls checking transaction owner thread against misuse transactions from 1770 * other threads. */ 1771 #ifndef MDBX_TXN_CHECKOWNER 1772 #define MDBX_TXN_CHECKOWNER 1 1773 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) 1774 #else 1775 #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) 1776 #endif /* MDBX_TXN_CHECKOWNER */ 1777 1778 /** Does a system have battery-backed Real-Time Clock or just a fake. */ 1779 #ifndef MDBX_TRUST_RTC 1780 #if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) || \ 1781 defined(__OpenBSD__) 1782 #define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */ 1783 #else 1784 #define MDBX_TRUST_RTC 1 1785 #endif 1786 #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) 1787 #else 1788 #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) 1789 #endif /* MDBX_TRUST_RTC */ 1790 1791 /** Controls online database auto-compactification during write-transactions. */ 1792 #ifndef MDBX_ENABLE_REFUND 1793 #define MDBX_ENABLE_REFUND 1 1794 #elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) 1795 #error MDBX_ENABLE_REFUND must be defined as 0 or 1 1796 #endif /* MDBX_ENABLE_REFUND */ 1797 1798 /** Controls gathering statistics for page operations. */ 1799 #ifndef MDBX_ENABLE_PGOP_STAT 1800 #define MDBX_ENABLE_PGOP_STAT 1 1801 #elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1) 1802 #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 1803 #endif /* MDBX_ENABLE_PGOP_STAT */ 1804 1805 /** Enables chunking long list of retired pages during huge transactions commit 1806 * to avoid use sequences of pages. */ 1807 #ifndef MDBX_ENABLE_BIGFOOT 1808 #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) 1809 #define MDBX_ENABLE_BIGFOOT 1 1810 #else 1811 #define MDBX_ENABLE_BIGFOOT 0 1812 #endif 1813 #elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) 1814 #error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 1815 #endif /* MDBX_ENABLE_BIGFOOT */ 1816 1817 /** Controls use of POSIX madvise() hints and friends. */ 1818 #ifndef MDBX_ENABLE_MADVISE 1819 #define MDBX_ENABLE_MADVISE 1 1820 #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1) 1821 #error MDBX_ENABLE_MADVISE must be defined as 0 or 1 1822 #endif /* MDBX_ENABLE_MADVISE */ 1823 1824 /** Disable some checks to reduce an overhead and detection probability of 1825 * database corruption to a values closer to the LMDB. */ 1826 #ifndef MDBX_DISABLE_VALIDATION 1827 #define MDBX_DISABLE_VALIDATION 0 1828 #elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) 1829 #error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 1830 #endif /* MDBX_DISABLE_VALIDATION */ 1831 1832 #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT 1833 #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 1834 #elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 || \ 1835 MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1) 1836 #error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 1837 #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ 1838 1839 #ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT 1840 #define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1 1841 #elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 || \ 1842 MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1) 1843 #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 1844 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ 1845 1846 /** Basically, this build-option is for TODO. Guess it should be replaced 1847 * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants: 1848 * 0/OFF = Don't track dirty pages at all and don't spilling ones. 1849 * This should be by-default on Linux and may-be other systems 1850 * (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides 1851 * properly LRU tracking and async writing on-demand. 1852 * 1/ON = Lite tracking of dirty pages but with LRU labels and explicit 1853 * spilling with msync(MS_ASYNC). */ 1854 #ifndef MDBX_FAKE_SPILL_WRITEMAP 1855 #if defined(__linux__) || defined(__gnu_linux__) 1856 #define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */ 1857 #else 1858 #define MDBX_FAKE_SPILL_WRITEMAP 0 1859 #endif 1860 #elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1) 1861 #error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1 1862 #endif /* MDBX_FAKE_SPILL_WRITEMAP */ 1863 1864 /** Controls sort order of internal page number lists. 1865 * This mostly experimental/advanced option with not for regular MDBX users. 1866 * \warning The database format depend on this option and libmdbx builded with 1867 * different option value are incompatible. */ 1868 #ifndef MDBX_PNL_ASCENDING 1869 #define MDBX_PNL_ASCENDING 0 1870 #elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) 1871 #error MDBX_PNL_ASCENDING must be defined as 0 or 1 1872 #endif /* MDBX_PNL_ASCENDING */ 1873 1874 /** Avoid dependence from MSVC CRT and use ntdll.dll instead. */ 1875 #ifndef MDBX_WITHOUT_MSVC_CRT 1876 #define MDBX_WITHOUT_MSVC_CRT 1 1877 #elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1) 1878 #error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1 1879 #endif /* MDBX_WITHOUT_MSVC_CRT */ 1880 1881 /** Size of buffer used during copying a environment/database file. */ 1882 #ifndef MDBX_ENVCOPY_WRITEBUF 1883 #define MDBX_ENVCOPY_WRITEBUF 1048576u 1884 #elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \ 1885 MDBX_ENVCOPY_WRITEBUF % 65536u 1886 #error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536 1887 #endif /* MDBX_ENVCOPY_WRITEBUF */ 1888 1889 /** Forces assertion checking */ 1890 #ifndef MDBX_FORCE_ASSERTIONS 1891 #define MDBX_FORCE_ASSERTIONS 0 1892 #elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1) 1893 #error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1 1894 #endif /* MDBX_FORCE_ASSERTIONS */ 1895 1896 /** Presumed malloc size overhead for each allocation 1897 * to adjust allocations to be more aligned. */ 1898 #ifndef MDBX_ASSUME_MALLOC_OVERHEAD 1899 #ifdef __SIZEOF_POINTER__ 1900 #define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u) 1901 #else 1902 #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u) 1903 #endif 1904 #elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 || \ 1905 MDBX_ASSUME_MALLOC_OVERHEAD % 4 1906 #error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4 1907 #endif /* MDBX_ASSUME_MALLOC_OVERHEAD */ 1908 1909 /** If defined then enables integration with Valgrind, 1910 * a memory analyzing tool. */ 1911 #ifndef MDBX_USE_VALGRIND 1912 #endif /* MDBX_USE_VALGRIND */ 1913 1914 /** If defined then enables use C11 atomics, 1915 * otherwise detects ones availability automatically. */ 1916 #ifndef MDBX_HAVE_C11ATOMICS 1917 #endif /* MDBX_HAVE_C11ATOMICS */ 1918 1919 //------------------------------------------------------------------------------ 1920 1921 /** Win32 File Locking API for \ref MDBX_LOCKING */ 1922 #define MDBX_LOCKING_WIN32FILES -1 1923 1924 /** SystemV IPC semaphores for \ref MDBX_LOCKING */ 1925 #define MDBX_LOCKING_SYSV 5 1926 1927 /** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */ 1928 #define MDBX_LOCKING_POSIX1988 1988 1929 1930 /** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */ 1931 #define MDBX_LOCKING_POSIX2001 2001 1932 1933 /** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */ 1934 #define MDBX_LOCKING_POSIX2008 2008 1935 1936 /** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */ 1937 #define MDBX_LOCKING_BENAPHORE 1995 1938 1939 /** Advanced: Choices the locking implementation (autodetection by default). */ 1940 #if defined(_WIN32) || defined(_WIN64) 1941 #define MDBX_LOCKING MDBX_LOCKING_WIN32FILES 1942 #else 1943 #ifndef MDBX_LOCKING 1944 #if defined(_POSIX_THREAD_PROCESS_SHARED) && \ 1945 _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__) 1946 1947 /* Some platforms define the EOWNERDEAD error code even though they 1948 * don't support Robust Mutexes. If doubt compile with -MDBX_LOCKING=2001. */ 1949 #if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L && \ 1950 ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) && \ 1951 _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) || \ 1952 (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) && \ 1953 _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) || \ 1954 defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) && \ 1955 (!defined(__GLIBC__) || \ 1956 __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */) 1957 #define MDBX_LOCKING MDBX_LOCKING_POSIX2008 1958 #else 1959 #define MDBX_LOCKING MDBX_LOCKING_POSIX2001 1960 #endif 1961 #elif defined(__sun) || defined(__SVR4) || defined(__svr4__) 1962 #define MDBX_LOCKING MDBX_LOCKING_POSIX1988 1963 #else 1964 #define MDBX_LOCKING MDBX_LOCKING_SYSV 1965 #endif 1966 #define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING) 1967 #else 1968 #define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING) 1969 #endif /* MDBX_LOCKING */ 1970 #endif /* !Windows */ 1971 1972 /** Advanced: Using POSIX OFD-locks (autodetection by default). */ 1973 #ifndef MDBX_USE_OFDLOCKS 1974 #if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \ 1975 !defined(MDBX_SAFE4QEMU) && \ 1976 !defined(__sun) /* OFD-lock are broken on Solaris */ 1977 #define MDBX_USE_OFDLOCKS 1 1978 #else 1979 #define MDBX_USE_OFDLOCKS 0 1980 #endif 1981 #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) 1982 #else 1983 #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) 1984 #endif /* MDBX_USE_OFDLOCKS */ 1985 1986 /** Advanced: Using sendfile() syscall (autodetection by default). */ 1987 #ifndef MDBX_USE_SENDFILE 1988 #if ((defined(__linux__) || defined(__gnu_linux__)) && \ 1989 !defined(__ANDROID_API__)) || \ 1990 (defined(__ANDROID_API__) && __ANDROID_API__ >= 21) 1991 #define MDBX_USE_SENDFILE 1 1992 #else 1993 #define MDBX_USE_SENDFILE 0 1994 #endif 1995 #endif /* MDBX_USE_SENDFILE */ 1996 1997 /** Advanced: Using copy_file_range() syscall (autodetection by default). */ 1998 #ifndef MDBX_USE_COPYFILERANGE 1999 #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) 2000 #define MDBX_USE_COPYFILERANGE 1 2001 #else 2002 #define MDBX_USE_COPYFILERANGE 0 2003 #endif 2004 #endif /* MDBX_USE_COPYFILERANGE */ 2005 2006 /** Advanced: Using sync_file_range() syscall (autodetection by default). */ 2007 #ifndef MDBX_USE_SYNCFILERANGE 2008 #if ((defined(__linux__) || defined(__gnu_linux__)) && \ 2009 defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) || \ 2010 (defined(__ANDROID_API__) && __ANDROID_API__ >= 26) 2011 #define MDBX_USE_SYNCFILERANGE 1 2012 #else 2013 #define MDBX_USE_SYNCFILERANGE 0 2014 #endif 2015 #endif /* MDBX_USE_SYNCFILERANGE */ 2016 2017 //------------------------------------------------------------------------------ 2018 2019 #ifndef MDBX_CPU_WRITEBACK_INCOHERENT 2020 #if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ 2021 defined(__hppa__) || defined(DOXYGEN) 2022 #define MDBX_CPU_WRITEBACK_INCOHERENT 0 2023 #else 2024 #define MDBX_CPU_WRITEBACK_INCOHERENT 1 2025 #endif 2026 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ 2027 2028 #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE 2029 #ifdef __OpenBSD__ 2030 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 1 2031 #else 2032 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 2033 #endif 2034 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ 2035 2036 #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE 2037 #if defined(__mips) || defined(__mips__) || defined(__mips64) || \ 2038 defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ 2039 defined(__MWERKS__) || defined(__sgi) 2040 /* MIPS has cache coherency issues. */ 2041 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 1 2042 #else 2043 /* LY: assume no relevant mmap/dcache issues. */ 2044 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 2045 #endif 2046 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ 2047 2048 #ifndef MDBX_64BIT_ATOMIC 2049 #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) 2050 #define MDBX_64BIT_ATOMIC 1 2051 #else 2052 #define MDBX_64BIT_ATOMIC 0 2053 #endif 2054 #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) 2055 #else 2056 #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) 2057 #endif /* MDBX_64BIT_ATOMIC */ 2058 2059 #ifndef MDBX_64BIT_CAS 2060 #if defined(ATOMIC_LLONG_LOCK_FREE) 2061 #if ATOMIC_LLONG_LOCK_FREE > 1 2062 #define MDBX_64BIT_CAS 1 2063 #else 2064 #define MDBX_64BIT_CAS 0 2065 #endif 2066 #elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE) 2067 #if __GCC_ATOMIC_LLONG_LOCK_FREE > 1 2068 #define MDBX_64BIT_CAS 1 2069 #else 2070 #define MDBX_64BIT_CAS 0 2071 #endif 2072 #elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE) 2073 #if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1 2074 #define MDBX_64BIT_CAS 1 2075 #else 2076 #define MDBX_64BIT_CAS 0 2077 #endif 2078 #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) 2079 #define MDBX_64BIT_CAS 1 2080 #else 2081 #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC 2082 #endif 2083 #define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS) 2084 #else 2085 #define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS) 2086 #endif /* MDBX_64BIT_CAS */ 2087 2088 #ifndef MDBX_UNALIGNED_OK 2089 #if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ 2090 defined(ENABLE_UBSAN) 2091 #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ 2092 #elif defined(__ARM_FEATURE_UNALIGNED) 2093 #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ 2094 #elif defined(__e2k__) || defined(__elbrus__) 2095 #if __iset__ > 4 2096 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ 2097 #else 2098 #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ 2099 #endif 2100 #elif defined(__ia32__) 2101 #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ 2102 #elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) 2103 /* expecting an optimization will well done, also this 2104 * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ 2105 #define MDBX_UNALIGNED_OK 0 2106 #else 2107 #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ 2108 #endif 2109 #elif MDBX_UNALIGNED_OK == 1 2110 #undef MDBX_UNALIGNED_OK 2111 #define MDBX_UNALIGNED_OK 32 /* any unaligned access allowed */ 2112 #endif /* MDBX_UNALIGNED_OK */ 2113 2114 #ifndef MDBX_CACHELINE_SIZE 2115 #if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) 2116 #define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE 2117 #elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) 2118 #define MDBX_CACHELINE_SIZE 128 2119 #else 2120 #define MDBX_CACHELINE_SIZE 64 2121 #endif 2122 #endif /* MDBX_CACHELINE_SIZE */ 2123 2124 /** @} end of build options */ 2125 /******************************************************************************* 2126 ******************************************************************************* 2127 ******************************************************************************/ 2128 2129 #ifndef DOXYGEN 2130 2131 /* In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */ 2132 #ifndef MDBX_DEBUG 2133 #ifdef NDEBUG 2134 #define MDBX_DEBUG 0 2135 #else 2136 #define MDBX_DEBUG 1 2137 #endif 2138 #endif /* MDBX_DEBUG */ 2139 2140 #else 2141 2142 /* !!! Actually this is a fake definitions for Doxygen !!! */ 2143 2144 /** Controls enabling of debugging features. 2145 * 2146 * - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all, 2147 * including logging and assertion controls. 2148 * Logging level and corresponding debug flags changing 2149 * by \ref mdbx_setup_debug() will not have effect. 2150 * - `MDBX_DEBUG > 0` Enables code for the debugging features (logging, 2151 * assertions checking and internal audit). 2152 * Simultaneously sets the default logging level 2153 * to the `MDBX_DEBUG` value. 2154 * Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`. 2155 * 2156 * \ingroup build_option */ 2157 #define MDBX_DEBUG 0...7 2158 2159 /** Disables using of GNU libc extensions. */ 2160 #define MDBX_DISABLE_GNU_SOURCE 0 or 1 2161 2162 #endif /* DOXYGEN */ 2163 2164 /* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ 2165 #if MDBX_DEBUG 2166 #undef NDEBUG 2167 #endif 2168 2169 /*----------------------------------------------------------------------------*/ 2170 /* Atomics */ 2171 2172 enum MDBX_memory_order { 2173 mo_Relaxed, 2174 mo_AcquireRelease 2175 /* , mo_SequentialConsistency */ 2176 }; 2177 2178 typedef union { 2179 volatile uint32_t weak; 2180 #ifdef MDBX_HAVE_C11ATOMICS 2181 volatile _Atomic uint32_t c11a; 2182 #endif /* MDBX_HAVE_C11ATOMICS */ 2183 } MDBX_atomic_uint32_t; 2184 2185 typedef union { 2186 volatile uint64_t weak; 2187 #if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) 2188 volatile _Atomic uint64_t c11a; 2189 #endif 2190 #if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC 2191 __anonymous_struct_extension__ struct { 2192 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 2193 MDBX_atomic_uint32_t low, high; 2194 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 2195 MDBX_atomic_uint32_t high, low; 2196 #else 2197 #error "FIXME: Unsupported byte order" 2198 #endif /* __BYTE_ORDER__ */ 2199 }; 2200 #endif 2201 } MDBX_atomic_uint64_t; 2202 2203 #ifdef MDBX_HAVE_C11ATOMICS 2204 2205 /* Crutches for C11 atomic compiler's bugs */ 2206 #if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127 2207 #define MDBX_c11a_ro(type, ptr) (&(ptr)->weak) 2208 #define MDBX_c11a_rw(type, ptr) (&(ptr)->weak) 2209 #elif defined(__clang__) && __clang__ < 8 2210 #define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a) 2211 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) 2212 #else 2213 #define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a) 2214 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) 2215 #endif /* Crutches for C11 atomic compiler's bugs */ 2216 2217 #define mo_c11_store(fence) \ 2218 (((fence) == mo_Relaxed) ? memory_order_relaxed \ 2219 : ((fence) == mo_AcquireRelease) ? memory_order_release \ 2220 : memory_order_seq_cst) 2221 #define mo_c11_load(fence) \ 2222 (((fence) == mo_Relaxed) ? memory_order_relaxed \ 2223 : ((fence) == mo_AcquireRelease) ? memory_order_acquire \ 2224 : memory_order_seq_cst) 2225 2226 #endif /* MDBX_HAVE_C11ATOMICS */ 2227 2228 #ifndef __cplusplus 2229 2230 #ifdef MDBX_HAVE_C11ATOMICS 2231 #define osal_memory_fence(order, write) \ 2232 atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) 2233 #else /* MDBX_HAVE_C11ATOMICS */ 2234 #define osal_memory_fence(order, write) \ 2235 do { \ 2236 osal_compiler_barrier(); \ 2237 if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed \ 2238 : mo_AcquireRelease)) \ 2239 osal_memory_barrier(); \ 2240 } while (0) 2241 #endif /* MDBX_HAVE_C11ATOMICS */ 2242 2243 #if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) 2244 #define atomic_store32(p, value, order) \ 2245 ({ \ 2246 const uint32_t value_to_store = (value); \ 2247 atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, \ 2248 mo_c11_store(order)); \ 2249 value_to_store; \ 2250 }) 2251 #define atomic_load32(p, order) \ 2252 atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)) 2253 #define atomic_store64(p, value, order) \ 2254 ({ \ 2255 const uint64_t value_to_store = (value); \ 2256 atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, \ 2257 mo_c11_store(order)); \ 2258 value_to_store; \ 2259 }) 2260 #define atomic_load64(p, order) \ 2261 atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)) 2262 #endif /* LCC && MDBX_HAVE_C11ATOMICS */ 2263 2264 #ifndef atomic_store32 2265 MDBX_MAYBE_UNUSED static __always_inline uint32_t 2266 atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, 2267 enum MDBX_memory_order order) { 2268 STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); 2269 #ifdef MDBX_HAVE_C11ATOMICS 2270 assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); 2271 atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); 2272 #else /* MDBX_HAVE_C11ATOMICS */ 2273 if (order != mo_Relaxed) 2274 osal_compiler_barrier(); 2275 p->weak = value; 2276 osal_memory_fence(order, true); 2277 #endif /* MDBX_HAVE_C11ATOMICS */ 2278 return value; 2279 } 2280 #endif /* atomic_store32 */ 2281 2282 #ifndef atomic_load32 2283 MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( 2284 const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { 2285 STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); 2286 #ifdef MDBX_HAVE_C11ATOMICS 2287 assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); 2288 return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); 2289 #else /* MDBX_HAVE_C11ATOMICS */ 2290 osal_memory_fence(order, false); 2291 const uint32_t value = p->weak; 2292 if (order != mo_Relaxed) 2293 osal_compiler_barrier(); 2294 return value; 2295 #endif /* MDBX_HAVE_C11ATOMICS */ 2296 } 2297 #endif /* atomic_load32 */ 2298 2299 #endif /* !__cplusplus */ 2300 2301 /*----------------------------------------------------------------------------*/ 2302 /* Basic constants and types */ 2303 2304 /* A stamp that identifies a file as an MDBX file. 2305 * There's nothing special about this value other than that it is easily 2306 * recognizable, and it will reflect any byte order mismatches. */ 2307 #define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11) 2308 2309 /* FROZEN: The version number for a database's datafile format. */ 2310 #define MDBX_DATA_VERSION 3 2311 /* The version number for a database's lockfile format. */ 2312 #define MDBX_LOCK_VERSION 4 2313 2314 /* handle for the DB used to track free pages. */ 2315 #define FREE_DBI 0 2316 /* handle for the default DB. */ 2317 #define MAIN_DBI 1 2318 /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ 2319 #define CORE_DBS 2 2320 2321 /* Number of meta pages - also hardcoded elsewhere */ 2322 #define NUM_METAS 3 2323 2324 /* A page number in the database. 2325 * 2326 * MDBX uses 32 bit for page numbers. This limits database 2327 * size up to 2^44 bytes, in case of 4K pages. */ 2328 typedef uint32_t pgno_t; 2329 typedef MDBX_atomic_uint32_t atomic_pgno_t; 2330 #define PRIaPGNO PRIu32 2331 #define MAX_PAGENO UINT32_C(0x7FFFffff) 2332 #define MIN_PAGENO NUM_METAS 2333 2334 #define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000) 2335 2336 /* A transaction ID. */ 2337 typedef uint64_t txnid_t; 2338 typedef MDBX_atomic_uint64_t atomic_txnid_t; 2339 #define PRIaTXN PRIi64 2340 #define MIN_TXNID UINT64_C(1) 2341 #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) 2342 #define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1) 2343 #define INVALID_TXNID UINT64_MAX 2344 /* LY: for testing non-atomic 64-bit txnid on 32-bit arches. 2345 * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */ 2346 #ifndef xMDBX_TXNID_STEP 2347 #if MDBX_64BIT_CAS 2348 #define xMDBX_TXNID_STEP 1u 2349 #else 2350 #define xMDBX_TXNID_STEP 2u 2351 #endif 2352 #endif /* xMDBX_TXNID_STEP */ 2353 2354 /* Used for offsets within a single page. 2355 * Since memory pages are typically 4 or 8KB in size, 12-13 bits, 2356 * this is plenty. */ 2357 typedef uint16_t indx_t; 2358 2359 #define MEGABYTE ((size_t)1 << 20) 2360 2361 /*----------------------------------------------------------------------------*/ 2362 /* Core structures for database and shared memory (i.e. format definition) */ 2363 #pragma pack(push, 4) 2364 2365 /* Information about a single database in the environment. */ 2366 typedef struct MDBX_db { 2367 uint16_t md_flags; /* see mdbx_dbi_open */ 2368 uint16_t md_depth; /* depth of this tree */ 2369 uint32_t md_xsize; /* key-size for MDBX_DUPFIXED (LEAF2 pages) */ 2370 pgno_t md_root; /* the root page of this tree */ 2371 pgno_t md_branch_pages; /* number of internal pages */ 2372 pgno_t md_leaf_pages; /* number of leaf pages */ 2373 pgno_t md_overflow_pages; /* number of overflow pages */ 2374 uint64_t md_seq; /* table sequence counter */ 2375 uint64_t md_entries; /* number of data items */ 2376 uint64_t md_mod_txnid; /* txnid of last committed modification */ 2377 } MDBX_db; 2378 2379 /* database size-related parameters */ 2380 typedef struct MDBX_geo { 2381 uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential 2382 quantized) value */ 2383 uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed 2384 (exponential quantized) value */ 2385 pgno_t lower; /* minimal size of datafile in pages */ 2386 pgno_t upper; /* maximal size of datafile in pages */ 2387 pgno_t now; /* current size of datafile in pages */ 2388 pgno_t next; /* first unused page in the datafile, 2389 but actually the file may be shorter. */ 2390 } MDBX_geo; 2391 2392 /* Meta page content. 2393 * A meta page is the start point for accessing a database snapshot. 2394 * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ 2395 typedef struct MDBX_meta { 2396 /* Stamp identifying this as an MDBX file. 2397 * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ 2398 uint32_t mm_magic_and_version[2]; 2399 2400 /* txnid that committed this page, the first of a two-phase-update pair */ 2401 union { 2402 MDBX_atomic_uint32_t mm_txnid_a[2]; 2403 uint64_t unsafe_txnid; 2404 }; 2405 2406 uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ 2407 uint8_t mm_validator_id; /* ID of checksum and page validation method, 2408 * zero (nothing) for now */ 2409 uint8_t mm_extra_pagehdr; /* extra bytes in the page header, 2410 * zero (nothing) for now */ 2411 2412 MDBX_geo mm_geo; /* database size-related parameters */ 2413 2414 MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ 2415 /* The size of pages used in this DB */ 2416 #define mm_psize mm_dbs[FREE_DBI].md_xsize 2417 MDBX_canary mm_canary; 2418 2419 #define MDBX_DATASIGN_NONE 0u 2420 #define MDBX_DATASIGN_WEAK 1u 2421 #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) 2422 #define META_IS_STEADY(meta) \ 2423 SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) 2424 union { 2425 uint32_t mm_sign[2]; 2426 uint64_t unsafe_sign; 2427 }; 2428 2429 /* txnid that committed this page, the second of a two-phase-update pair */ 2430 MDBX_atomic_uint32_t mm_txnid_b[2]; 2431 2432 /* Number of non-meta pages which were put in GC after COW. May be 0 in case 2433 * DB was previously handled by libmdbx without corresponding feature. 2434 * This value in couple with mr_snapshot_pages_retired allows fast estimation 2435 * of "how much reader is restraining GC recycling". */ 2436 uint32_t mm_pages_retired[2]; 2437 2438 /* The analogue /proc/sys/kernel/random/boot_id or similar to determine 2439 * whether the system was rebooted after the last use of the database files. 2440 * If there was no reboot, but there is no need to rollback to the last 2441 * steady sync point. Zeros mean that no relevant information is available 2442 * from the system. */ 2443 bin128_t mm_bootid; 2444 2445 } MDBX_meta; 2446 2447 #pragma pack(1) 2448 2449 /* Common header for all page types. The page type depends on mp_flags. 2450 * 2451 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with 2452 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages 2453 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header. 2454 * 2455 * P_OVERFLOW records occupy one or more contiguous pages where only the 2456 * first has a page header. They hold the real data of F_BIGDATA nodes. 2457 * 2458 * P_SUBP sub-pages are small leaf "pages" with duplicate data. 2459 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. 2460 * (Duplicate data can also go in sub-databases, which use normal pages.) 2461 * 2462 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. 2463 * 2464 * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once 2465 * in the snapshot: Either used by a database or listed in a GC record. */ 2466 typedef struct MDBX_page { 2467 union { 2468 #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) 2469 #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) 2470 #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) 2471 #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) 2472 #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) 2473 uint64_t 2474 mp_txnid; /* txnid which created this page, maybe zero in legacy DB */ 2475 struct MDBX_page *mp_next; /* for in-memory list of freed pages */ 2476 }; 2477 uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ 2478 #define P_BRANCH 0x01u /* branch page */ 2479 #define P_LEAF 0x02u /* leaf page */ 2480 #define P_OVERFLOW 0x04u /* overflow page */ 2481 #define P_META 0x08u /* meta page */ 2482 #define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ 2483 #define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ 2484 #define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ 2485 #define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ 2486 #define P_SPILLED 0x2000u /* spilled in parent txn */ 2487 #define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ 2488 #define P_FROZEN 0x8000u /* used for retire page with known status */ 2489 #define P_ILL_BITS \ 2490 ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) 2491 uint16_t mp_flags; 2492 union { 2493 uint32_t mp_pages; /* number of overflow pages */ 2494 __anonymous_struct_extension__ struct { 2495 indx_t mp_lower; /* lower bound of free space */ 2496 indx_t mp_upper; /* upper bound of free space */ 2497 }; 2498 }; 2499 pgno_t mp_pgno; /* page number */ 2500 2501 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2502 (!defined(__cplusplus) && defined(_MSC_VER)) 2503 indx_t mp_ptrs[] /* dynamic size */; 2504 #endif /* C99 */ 2505 } MDBX_page; 2506 2507 #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) 2508 2509 /* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ 2510 #define PAGETYPE_COMPAT(p) \ 2511 (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ 2512 ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ 2513 : PAGETYPE_WHOLE(p)) 2514 2515 /* Size of the page header, excluding dynamic data at the end */ 2516 #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) 2517 2518 #pragma pack(pop) 2519 2520 #if MDBX_ENABLE_PGOP_STAT 2521 /* Statistics of page operations overall of all (running, completed and aborted) 2522 * transactions */ 2523 typedef struct { 2524 MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */ 2525 MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */ 2526 MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones 2527 for nested transactions */ 2528 MDBX_atomic_uint64_t split; /* Page splits */ 2529 MDBX_atomic_uint64_t merge; /* Page merges */ 2530 MDBX_atomic_uint64_t spill; /* Quantity of spilled dirty pages */ 2531 MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ 2532 MDBX_atomic_uint64_t 2533 wops; /* Number of explicit write operations (not a pages) to a disk */ 2534 MDBX_atomic_uint64_t 2535 gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The 2536 unit/scale is platform-depended, see osal_monotime(). */ 2537 } MDBX_pgop_stat_t; 2538 #endif /* MDBX_ENABLE_PGOP_STAT */ 2539 2540 #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES 2541 #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) 2542 typedef void osal_ipclock_t; 2543 #elif MDBX_LOCKING == MDBX_LOCKING_SYSV 2544 2545 #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) 2546 typedef mdbx_pid_t osal_ipclock_t; 2547 #ifndef EOWNERDEAD 2548 #define EOWNERDEAD MDBX_RESULT_TRUE 2549 #endif 2550 2551 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ 2552 MDBX_LOCKING == MDBX_LOCKING_POSIX2008 2553 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) 2554 typedef pthread_mutex_t osal_ipclock_t; 2555 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 2556 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) 2557 typedef sem_t osal_ipclock_t; 2558 #else 2559 #error "FIXME" 2560 #endif /* MDBX_LOCKING */ 2561 2562 #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) 2563 MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); 2564 MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); 2565 #endif /* MDBX_LOCKING */ 2566 2567 /* Reader Lock Table 2568 * 2569 * Readers don't acquire any locks for their data access. Instead, they 2570 * simply record their transaction ID in the reader table. The reader 2571 * mutex is needed just to find an empty slot in the reader table. The 2572 * slot's address is saved in thread-specific data so that subsequent 2573 * read transactions started by the same thread need no further locking to 2574 * proceed. 2575 * 2576 * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. 2577 * No reader table is used if the database is on a read-only filesystem. 2578 * 2579 * Since the database uses multi-version concurrency control, readers don't 2580 * actually need any locking. This table is used to keep track of which 2581 * readers are using data from which old transactions, so that we'll know 2582 * when a particular old transaction is no longer in use. Old transactions 2583 * that have discarded any data pages can then have those pages reclaimed 2584 * for use by a later write transaction. 2585 * 2586 * The lock table is constructed such that reader slots are aligned with the 2587 * processor's cache line size. Any slot is only ever used by one thread. 2588 * This alignment guarantees that there will be no contention or cache 2589 * thrashing as threads update their own slot info, and also eliminates 2590 * any need for locking when accessing a slot. 2591 * 2592 * A writer thread will scan every slot in the table to determine the oldest 2593 * outstanding reader transaction. Any freed pages older than this will be 2594 * reclaimed by the writer. The writer doesn't use any locks when scanning 2595 * this table. This means that there's no guarantee that the writer will 2596 * see the most up-to-date reader info, but that's not required for correct 2597 * operation - all we need is to know the upper bound on the oldest reader, 2598 * we don't care at all about the newest reader. So the only consequence of 2599 * reading stale information here is that old pages might hang around a 2600 * while longer before being reclaimed. That's actually good anyway, because 2601 * the longer we delay reclaiming old pages, the more likely it is that a 2602 * string of contiguous pages can be found after coalescing old pages from 2603 * many old transactions together. */ 2604 2605 /* The actual reader record, with cacheline padding. */ 2606 typedef struct MDBX_reader { 2607 /* Current Transaction ID when this transaction began, or (txnid_t)-1. 2608 * Multiple readers that start at the same time will probably have the 2609 * same ID here. Again, it's not important to exclude them from 2610 * anything; all we need to know is which version of the DB they 2611 * started from so we can avoid overwriting any data used in that 2612 * particular version. */ 2613 MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; 2614 2615 /* The information we store in a single slot of the reader table. 2616 * In addition to a transaction ID, we also record the process and 2617 * thread ID that owns a slot, so that we can detect stale information, 2618 * e.g. threads or processes that went away without cleaning up. 2619 * 2620 * NOTE: We currently don't check for stale records. 2621 * We simply re-init the table when we know that we're the only process 2622 * opening the lock file. */ 2623 2624 /* The thread ID of the thread owning this txn. */ 2625 MDBX_atomic_uint64_t mr_tid; 2626 2627 /* The process ID of the process owning this reader txn. */ 2628 MDBX_atomic_uint32_t mr_pid; 2629 2630 /* The number of pages used in the reader's MVCC snapshot, 2631 * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ 2632 atomic_pgno_t mr_snapshot_pages_used; 2633 /* Number of retired pages at the time this reader starts transaction. So, 2634 * at any time the difference mm_pages_retired - mr_snapshot_pages_retired 2635 * will give the number of pages which this reader restraining from reuse. */ 2636 MDBX_atomic_uint64_t mr_snapshot_pages_retired; 2637 } MDBX_reader; 2638 2639 /* The header for the reader table (a memory-mapped lock file). */ 2640 typedef struct MDBX_lockinfo { 2641 /* Stamp identifying this as an MDBX file. 2642 * It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */ 2643 uint64_t mti_magic_and_version; 2644 2645 /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ 2646 uint32_t mti_os_and_format; 2647 2648 /* Flags which environment was opened. */ 2649 MDBX_atomic_uint32_t mti_envmode; 2650 2651 /* Threshold of un-synced-with-disk pages for auto-sync feature, 2652 * zero means no-threshold, i.e. auto-sync is disabled. */ 2653 atomic_pgno_t mti_autosync_threshold; 2654 2655 /* Low 32-bit of txnid with which meta-pages was synced, 2656 * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ 2657 MDBX_atomic_uint32_t mti_meta_sync_txnid; 2658 2659 /* Period for timed auto-sync feature, i.e. at the every steady checkpoint 2660 * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. 2661 * The time value is represented in a suitable system-dependent form, for 2662 * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). 2663 * Zero means timed auto-sync is disabled. */ 2664 MDBX_atomic_uint64_t mti_autosync_period; 2665 2666 /* Marker to distinguish uniqueness of DB/CLK. */ 2667 MDBX_atomic_uint64_t mti_bait_uniqueness; 2668 2669 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2670 2671 #if MDBX_ENABLE_PGOP_STAT 2672 /* Statistics of costly ops of all (running, completed and aborted) 2673 * transactions */ 2674 MDBX_pgop_stat_t mti_pgop_stat; 2675 #endif /* MDBX_ENABLE_PGOP_STAT*/ 2676 2677 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2678 2679 /* Write transaction lock. */ 2680 #if MDBX_LOCKING > 0 2681 osal_ipclock_t mti_wlock; 2682 #endif /* MDBX_LOCKING > 0 */ 2683 2684 atomic_txnid_t mti_oldest_reader; 2685 2686 /* Timestamp of the last steady sync. Value is represented in a suitable 2687 * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or 2688 * clock_gettime(CLOCK_MONOTONIC). */ 2689 MDBX_atomic_uint64_t mti_sync_timestamp; 2690 2691 /* Number un-synced-with-disk pages for auto-sync feature. */ 2692 atomic_pgno_t mti_unsynced_pages; 2693 2694 /* Number of page which was discarded last time by madvise(MADV_FREE). */ 2695 atomic_pgno_t mti_discarded_tail; 2696 2697 /* Timestamp of the last readers check. */ 2698 MDBX_atomic_uint64_t mti_reader_check_timestamp; 2699 2700 /* Shared anchor for tracking readahead edge and enabled/disabled status. */ 2701 pgno_t mti_readahead_anchor; 2702 2703 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2704 2705 /* Readeaders registration lock. */ 2706 #if MDBX_LOCKING > 0 2707 osal_ipclock_t mti_rlock; 2708 #endif /* MDBX_LOCKING > 0 */ 2709 2710 /* The number of slots that have been used in the reader table. 2711 * This always records the maximum count, it is not decremented 2712 * when readers release their slots. */ 2713 MDBX_atomic_uint32_t mti_numreaders; 2714 MDBX_atomic_uint32_t mti_readers_refresh_flag; 2715 2716 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2717 (!defined(__cplusplus) && defined(_MSC_VER)) 2718 MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ 2719 MDBX_reader mti_readers[] /* dynamic size */; 2720 #endif /* C99 */ 2721 } MDBX_lockinfo; 2722 2723 /* Lockfile format signature: version, features and field layout */ 2724 #define MDBX_LOCK_FORMAT \ 2725 (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ 2726 (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \ 2727 (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \ 2728 (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ 2729 (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) 2730 2731 #define MDBX_DATA_MAGIC \ 2732 ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) 2733 2734 #define MDBX_DATA_MAGIC_LEGACY_COMPAT \ 2735 ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2) 2736 2737 #define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255) 2738 2739 #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) 2740 2741 /* The maximum size of a database page. 2742 * 2743 * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. 2744 * 2745 * MDBX will use database pages < OS pages if needed. 2746 * That causes more I/O in write transactions: The OS must 2747 * know (read) the whole page before writing a partial page. 2748 * 2749 * Note that we don't currently support Huge pages. On Linux, 2750 * regular data files cannot use Huge pages, and in general 2751 * Huge pages aren't actually pageable. We rely on the OS 2752 * demand-pager to read our data and page it out when memory 2753 * pressure from other processes is high. So until OSs have 2754 * actual paging support for Huge pages, they're not viable. */ 2755 #define MAX_PAGESIZE MDBX_MAX_PAGESIZE 2756 #define MIN_PAGESIZE MDBX_MIN_PAGESIZE 2757 2758 #define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) 2759 #if defined(_WIN32) || defined(_WIN64) 2760 #define MAX_MAPSIZE32 UINT32_C(0x38000000) 2761 #else 2762 #define MAX_MAPSIZE32 UINT32_C(0x7f000000) 2763 #endif 2764 #define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MAX_PAGESIZE) 2765 2766 #if MDBX_WORDBITS >= 64 2767 #define MAX_MAPSIZE MAX_MAPSIZE64 2768 #define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO) 2769 #else 2770 #define MAX_MAPSIZE MAX_MAPSIZE32 2771 #define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) 2772 #endif /* MDBX_WORDBITS */ 2773 2774 #define MDBX_READERS_LIMIT 32767 2775 #define MDBX_RADIXSORT_THRESHOLD 333 2776 2777 /*----------------------------------------------------------------------------*/ 2778 2779 /* An PNL is an Page Number List, a sorted array of IDs. 2780 * The first element of the array is a counter for how many actual page-numbers 2781 * are in the list. By default PNLs are sorted in descending order, this allow 2782 * cut off a page with lowest pgno (at the tail) just truncating the list. The 2783 * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ 2784 typedef pgno_t *MDBX_PNL; 2785 2786 #if MDBX_PNL_ASCENDING 2787 #define MDBX_PNL_ORDERED(first, last) ((first) < (last)) 2788 #define MDBX_PNL_DISORDERED(first, last) ((first) >= (last)) 2789 #else 2790 #define MDBX_PNL_ORDERED(first, last) ((first) > (last)) 2791 #define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) 2792 #endif 2793 2794 /* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */ 2795 typedef txnid_t *MDBX_TXL; 2796 2797 /* An Dirty-Page list item is an pgno/pointer pair. */ 2798 typedef struct MDBX_dp { 2799 MDBX_page *ptr; 2800 pgno_t pgno; 2801 union { 2802 unsigned extra; 2803 __anonymous_struct_extension__ struct { 2804 unsigned multi : 1; 2805 unsigned lru : 31; 2806 }; 2807 }; 2808 } MDBX_dp; 2809 2810 /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ 2811 typedef struct MDBX_dpl { 2812 unsigned sorted; 2813 unsigned length; 2814 unsigned pages_including_loose; /* number of pages, but not an entries. */ 2815 unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ 2816 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 2817 (!defined(__cplusplus) && defined(_MSC_VER)) 2818 MDBX_dp items[] /* dynamic size with holes at zero and after the last */; 2819 #endif 2820 } MDBX_dpl; 2821 2822 /* PNL sizes */ 2823 #define MDBX_PNL_GRANULATE 1024 2824 #define MDBX_PNL_INITIAL \ 2825 (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) 2826 2827 #define MDBX_TXL_GRANULATE 32 2828 #define MDBX_TXL_INITIAL \ 2829 (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) 2830 #define MDBX_TXL_MAX \ 2831 ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) 2832 2833 #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) 2834 #define MDBX_PNL_SIZE(pl) ((pl)[0]) 2835 #define MDBX_PNL_FIRST(pl) ((pl)[1]) 2836 #define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)]) 2837 #define MDBX_PNL_BEGIN(pl) (&(pl)[1]) 2838 #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1]) 2839 2840 #if MDBX_PNL_ASCENDING 2841 #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) 2842 #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) 2843 #else 2844 #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) 2845 #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) 2846 #endif 2847 2848 #define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t)) 2849 #define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0) 2850 2851 /*----------------------------------------------------------------------------*/ 2852 /* Internal structures */ 2853 2854 /* Auxiliary DB info. 2855 * The information here is mostly static/read-only. There is 2856 * only a single copy of this record in the environment. */ 2857 typedef struct MDBX_dbx { 2858 MDBX_val md_name; /* name of the database */ 2859 MDBX_cmp_func *md_cmp; /* function for comparing keys */ 2860 MDBX_cmp_func *md_dcmp; /* function for comparing data items */ 2861 size_t md_klen_min, md_klen_max; /* min/max key length for the database */ 2862 size_t md_vlen_min, 2863 md_vlen_max; /* min/max value/data length for the database */ 2864 } MDBX_dbx; 2865 2866 typedef struct troika { 2867 uint8_t fsm, recent, prefer_steady, tail_and_flags; 2868 #define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) 2869 #define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) 2870 #define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) 2871 #define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) 2872 txnid_t txnid[NUM_METAS]; 2873 } meta_troika_t; 2874 2875 /* A database transaction. 2876 * Every operation requires a transaction handle. */ 2877 struct MDBX_txn { 2878 #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) 2879 uint32_t mt_signature; 2880 2881 /* Transaction Flags */ 2882 /* mdbx_txn_begin() flags */ 2883 #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) 2884 #define MDBX_TXN_RW_BEGIN_FLAGS \ 2885 (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) 2886 /* Additional flag for sync_locked() */ 2887 #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) 2888 2889 #define TXN_FLAGS \ 2890 (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ 2891 MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) 2892 2893 #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ 2894 ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ 2895 MDBX_SHRINK_ALLOWED) 2896 #error "Oops, some txn flags overlapped or wrong" 2897 #endif 2898 uint32_t mt_flags; 2899 2900 MDBX_txn *mt_parent; /* parent of a nested txn */ 2901 /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ 2902 MDBX_txn *mt_child; 2903 MDBX_geo mt_geo; 2904 /* next unallocated page */ 2905 #define mt_next_pgno mt_geo.next 2906 /* corresponding to the current size of datafile */ 2907 #define mt_end_pgno mt_geo.now 2908 2909 /* The ID of this transaction. IDs are integers incrementing from 2910 * INITIAL_TXNID. Only committed write transactions increment the ID. If a 2911 * transaction aborts, the ID may be re-used by the next writer. */ 2912 txnid_t mt_txnid; 2913 txnid_t mt_front; 2914 2915 MDBX_env *mt_env; /* the DB environment */ 2916 /* Array of records for each DB known in the environment. */ 2917 MDBX_dbx *mt_dbxs; 2918 /* Array of MDBX_db records for each known DB */ 2919 MDBX_db *mt_dbs; 2920 /* Array of sequence numbers for each DB handle */ 2921 MDBX_atomic_uint32_t *mt_dbiseqs; 2922 2923 /* Transaction DBI Flags */ 2924 #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ 2925 #define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ 2926 #define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ 2927 #define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ 2928 #define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ 2929 #define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ 2930 #define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ 2931 /* Array of flags for each DB */ 2932 uint8_t *mt_dbistate; 2933 /* Number of DB records in use, or 0 when the txn is finished. 2934 * This number only ever increments until the txn finishes; we 2935 * don't decrement it when individual DB handles are closed. */ 2936 MDBX_dbi mt_numdbs; 2937 size_t mt_owner; /* thread ID that owns this transaction */ 2938 MDBX_canary mt_canary; 2939 void *mt_userctx; /* User-settable context */ 2940 MDBX_cursor **mt_cursors; 2941 2942 union { 2943 struct { 2944 /* For read txns: This thread/txn's reader table slot, or NULL. */ 2945 MDBX_reader *reader; 2946 } to; 2947 struct { 2948 meta_troika_t troika; 2949 /* In write txns, array of cursors for each DB */ 2950 pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ 2951 txnid_t last_reclaimed; /* ID of last used record */ 2952 #if MDBX_ENABLE_REFUND 2953 pgno_t loose_refund_wl /* FIXME: describe */; 2954 #endif /* MDBX_ENABLE_REFUND */ 2955 /* dirtylist room: Dirty array size - dirty pages visible to this txn. 2956 * Includes ancestor txns' dirty pages not hidden by other txns' 2957 * dirty/spilled pages. Thus commit(nested txn) has room to merge 2958 * dirtylist into mt_parent after freeing hidden mt_parent pages. */ 2959 unsigned dirtyroom; 2960 /* a sequence to spilling dirty page with LRU policy */ 2961 unsigned dirtylru; 2962 /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ 2963 MDBX_dpl *dirtylist; 2964 /* The list of reclaimed txns from GC */ 2965 MDBX_TXL lifo_reclaimed; 2966 /* The list of pages that became unused during this transaction. */ 2967 MDBX_PNL retired_pages; 2968 /* The list of loose pages that became unused and may be reused 2969 * in this transaction, linked through `mp_next`. */ 2970 MDBX_page *loose_pages; 2971 /* Number of loose pages (tw.loose_pages) */ 2972 unsigned loose_count; 2973 unsigned spill_least_removed; 2974 /* The sorted list of dirty pages we temporarily wrote to disk 2975 * because the dirty list was full. page numbers in here are 2976 * shifted left by 1, deleted slots have the LSB set. */ 2977 MDBX_PNL spill_pages; 2978 } tw; 2979 }; 2980 }; 2981 2982 #if MDBX_WORDBITS >= 64 2983 #define CURSOR_STACK 32 2984 #else 2985 #define CURSOR_STACK 24 2986 #endif 2987 2988 struct MDBX_xcursor; 2989 2990 /* Cursors are used for all DB operations. 2991 * A cursor holds a path of (page pointer, key index) from the DB 2992 * root to a position in the DB, plus other state. MDBX_DUPSORT 2993 * cursors include an xcursor to the current data item. Write txns 2994 * track their cursors and keep them up to date when data moves. 2995 * Exception: An xcursor's pointer to a P_SUBP page can be stale. 2996 * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ 2997 struct MDBX_cursor { 2998 #define MDBX_MC_LIVE UINT32_C(0xFE05D5B1) 2999 #define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) 3000 #define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) 3001 uint32_t mc_signature; 3002 /* The database handle this cursor operates on */ 3003 MDBX_dbi mc_dbi; 3004 /* Next cursor on this DB in this txn */ 3005 MDBX_cursor *mc_next; 3006 /* Backup of the original cursor if this cursor is a shadow */ 3007 MDBX_cursor *mc_backup; 3008 /* Context used for databases with MDBX_DUPSORT, otherwise NULL */ 3009 struct MDBX_xcursor *mc_xcursor; 3010 /* The transaction that owns this cursor */ 3011 MDBX_txn *mc_txn; 3012 /* The database record for this cursor */ 3013 MDBX_db *mc_db; 3014 /* The database auxiliary record for this cursor */ 3015 MDBX_dbx *mc_dbx; 3016 /* The mt_dbistate for this database */ 3017 uint8_t *mc_dbistate; 3018 uint8_t mc_snum; /* number of pushed pages */ 3019 uint8_t mc_top; /* index of top page, normally mc_snum-1 */ 3020 3021 /* Cursor state flags. */ 3022 #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ 3023 #define C_EOF 0x02 /* No more data */ 3024 #define C_SUB 0x04 /* Cursor is a sub-cursor */ 3025 #define C_DEL 0x08 /* last op was a cursor_del */ 3026 #define C_UNTRACK 0x10 /* Un-track cursor when closing */ 3027 #define C_RECLAIMING 0x20 /* GC lookup is prohibited */ 3028 #define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ 3029 uint8_t mc_flags; /* see mdbx_cursor */ 3030 3031 /* Cursor checking flags. */ 3032 #define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ 3033 #define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ 3034 #define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ 3035 #define CC_UPDATING 0x08 /* update/rebalance pending */ 3036 #define CC_SKIPORD 0x10 /* don't check keys ordering */ 3037 #define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ 3038 #define CC_RETIRING 0x40 /* refs to child pages may be invalid */ 3039 #define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ 3040 uint8_t mc_checking; /* page checking level */ 3041 3042 MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ 3043 indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ 3044 }; 3045 3046 #define CHECK_LEAF_TYPE(mc, mp) \ 3047 (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ 3048 (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) 3049 3050 /* Context for sorted-dup records. 3051 * We could have gone to a fully recursive design, with arbitrarily 3052 * deep nesting of sub-databases. But for now we only handle these 3053 * levels - main DB, optional sub-DB, sorted-duplicate DB. */ 3054 typedef struct MDBX_xcursor { 3055 /* A sub-cursor for traversing the Dup DB */ 3056 MDBX_cursor mx_cursor; 3057 /* The database record for this Dup DB */ 3058 MDBX_db mx_db; 3059 /* The auxiliary DB record for this Dup DB */ 3060 MDBX_dbx mx_dbx; 3061 } MDBX_xcursor; 3062 3063 typedef struct MDBX_cursor_couple { 3064 MDBX_cursor outer; 3065 void *mc_userctx; /* User-settable context */ 3066 MDBX_xcursor inner; 3067 } MDBX_cursor_couple; 3068 3069 /* The database environment. */ 3070 struct MDBX_env { 3071 /* ----------------------------------------------------- mostly static part */ 3072 #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) 3073 MDBX_atomic_uint32_t me_signature; 3074 /* Failed to update the meta page. Probably an I/O error. */ 3075 #define MDBX_FATAL_ERROR UINT32_C(0x80000000) 3076 /* Some fields are initialized. */ 3077 #define MDBX_ENV_ACTIVE UINT32_C(0x20000000) 3078 /* me_txkey is set */ 3079 #define MDBX_ENV_TXKEY UINT32_C(0x10000000) 3080 /* Legacy MDBX_MAPASYNC (prior v0.9) */ 3081 #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) 3082 /* Legacy MDBX_COALESCE (prior v0.12) */ 3083 #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) 3084 #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) 3085 uint32_t me_flags; 3086 osal_mmap_t me_dxb_mmap; /* The main data file */ 3087 #define me_map me_dxb_mmap.dxb 3088 #define me_lazy_fd me_dxb_mmap.fd 3089 mdbx_filehandle_t me_dsync_fd; 3090 osal_mmap_t me_lck_mmap; /* The lock file */ 3091 #define me_lfd me_lck_mmap.fd 3092 struct MDBX_lockinfo *me_lck; 3093 3094 unsigned me_psize; /* DB page size, initialized from me_os_psize */ 3095 unsigned me_leaf_nodemax; /* max size of a leaf-node */ 3096 uint8_t me_psize2log; /* log2 of DB page size */ 3097 int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ 3098 uint16_t me_merge_threshold, 3099 me_merge_threshold_gc; /* pages emptier than this are candidates for 3100 merging */ 3101 unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ 3102 unsigned me_maxreaders; /* size of the reader table */ 3103 MDBX_dbi me_maxdbs; /* size of the DB table */ 3104 uint32_t me_pid; /* process ID of this env */ 3105 osal_thread_key_t me_txkey; /* thread-key for readers */ 3106 pathchar_t *me_pathname; /* path to the DB files */ 3107 void *me_pbuf; /* scratch area for DUPSORT put() */ 3108 MDBX_txn *me_txn0; /* preallocated write transaction */ 3109 3110 MDBX_dbx *me_dbxs; /* array of static DB info */ 3111 uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ 3112 MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ 3113 unsigned 3114 me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ 3115 uint32_t me_live_reader; /* have liveness lock in reader table */ 3116 void *me_userctx; /* User-settable context */ 3117 MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ 3118 3119 struct { 3120 unsigned dp_reserve_limit; 3121 unsigned rp_augment_limit; 3122 unsigned dp_limit; 3123 unsigned dp_initial; 3124 uint8_t dp_loose_limit; 3125 uint8_t spill_max_denominator; 3126 uint8_t spill_min_denominator; 3127 uint8_t spill_parent4child_denominator; 3128 unsigned merge_threshold_16dot16_percent; 3129 union { 3130 unsigned all; 3131 /* tracks options with non-auto values but tuned by user */ 3132 struct { 3133 unsigned dp_limit : 1; 3134 } non_auto; 3135 } flags; 3136 } me_options; 3137 3138 /* struct me_dbgeo used for accepting db-geo params from user for the new 3139 * database creation, i.e. when mdbx_env_set_geometry() was called before 3140 * mdbx_env_open(). */ 3141 struct { 3142 size_t lower; /* minimal size of datafile */ 3143 size_t upper; /* maximal size of datafile */ 3144 size_t now; /* current size of datafile */ 3145 size_t grow; /* step to grow datafile */ 3146 size_t shrink; /* threshold to shrink datafile */ 3147 } me_dbgeo; 3148 3149 #if MDBX_LOCKING == MDBX_LOCKING_SYSV 3150 union { 3151 key_t key; 3152 int semid; 3153 } me_sysv_ipc; 3154 #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ 3155 3156 MDBX_env *me_lcklist_next; 3157 3158 /* --------------------------------------------------- mostly volatile part */ 3159 3160 MDBX_txn *me_txn; /* current write transaction */ 3161 osal_fastmutex_t me_dbi_lock; 3162 MDBX_dbi me_numdbs; /* number of DBs opened */ 3163 3164 MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ 3165 unsigned me_dp_reserve_len; 3166 /* PNL of pages that became unused in a write txn */ 3167 MDBX_PNL me_retired_pages; 3168 3169 #if defined(_WIN32) || defined(_WIN64) 3170 osal_srwlock_t me_remap_guard; 3171 /* Workaround for LockFileEx and WriteFile multithread bug */ 3172 CRITICAL_SECTION me_windowsbug_lock; 3173 #else 3174 osal_fastmutex_t me_remap_guard; 3175 #endif 3176 3177 /* -------------------------------------------------------------- debugging */ 3178 3179 #if MDBX_DEBUG 3180 MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ 3181 #endif 3182 #ifdef MDBX_USE_VALGRIND 3183 int me_valgrind_handle; 3184 #endif 3185 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) 3186 pgno_t me_poison_edge; 3187 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ 3188 3189 #ifndef xMDBX_DEBUG_SPILLING 3190 #define xMDBX_DEBUG_SPILLING 0 3191 #endif 3192 #if xMDBX_DEBUG_SPILLING == 2 3193 unsigned debug_dirtied_est, debug_dirtied_act; 3194 #endif /* xMDBX_DEBUG_SPILLING */ 3195 3196 /* ------------------------------------------------- stub for lck-less mode */ 3197 MDBX_atomic_uint64_t 3198 x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) / 3199 sizeof(MDBX_atomic_uint64_t)]; 3200 }; 3201 3202 #ifndef __cplusplus 3203 /*----------------------------------------------------------------------------*/ 3204 /* Debug and Logging stuff */ 3205 3206 #define MDBX_RUNTIME_FLAGS_INIT \ 3207 ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT 3208 3209 extern uint8_t runtime_flags; 3210 extern uint8_t loglevel; 3211 extern MDBX_debug_func *debug_logger; 3212 3213 MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { 3214 #if MDBX_DEBUG 3215 if (MDBX_DBG_JITTER & runtime_flags) 3216 osal_jitter(tiny); 3217 #else 3218 (void)tiny; 3219 #endif 3220 } 3221 3222 MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) 3223 debug_log(int level, const char *function, int line, const char *fmt, ...) 3224 MDBX_PRINTF_ARGS(4, 5); 3225 MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, 3226 const char *fmt, va_list args); 3227 3228 #if MDBX_DEBUG 3229 #define LOG_ENABLED(msg) unlikely(msg <= loglevel) 3230 #define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) 3231 #else /* MDBX_DEBUG */ 3232 #define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) 3233 #define AUDIT_ENABLED() (0) 3234 #endif /* MDBX_DEBUG */ 3235 3236 #if MDBX_FORCE_ASSERTIONS 3237 #define ASSERT_ENABLED() (1) 3238 #elif MDBX_DEBUG 3239 #define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) 3240 #else 3241 #define ASSERT_ENABLED() (0) 3242 #endif /* assertions */ 3243 3244 #define DEBUG_EXTRA(fmt, ...) \ 3245 do { \ 3246 if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ 3247 debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ 3248 } while (0) 3249 3250 #define DEBUG_EXTRA_PRINT(fmt, ...) \ 3251 do { \ 3252 if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ 3253 debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ 3254 } while (0) 3255 3256 #define TRACE(fmt, ...) \ 3257 do { \ 3258 if (LOG_ENABLED(MDBX_LOG_TRACE)) \ 3259 debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3260 } while (0) 3261 3262 #define DEBUG(fmt, ...) \ 3263 do { \ 3264 if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ 3265 debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3266 } while (0) 3267 3268 #define VERBOSE(fmt, ...) \ 3269 do { \ 3270 if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ 3271 debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3272 } while (0) 3273 3274 #define NOTICE(fmt, ...) \ 3275 do { \ 3276 if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ 3277 debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3278 } while (0) 3279 3280 #define WARNING(fmt, ...) \ 3281 do { \ 3282 if (LOG_ENABLED(MDBX_LOG_WARN)) \ 3283 debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3284 } while (0) 3285 3286 #undef ERROR /* wingdi.h \ 3287 Yeah, morons from M$ put such definition to the public header. */ 3288 3289 #define ERROR(fmt, ...) \ 3290 do { \ 3291 if (LOG_ENABLED(MDBX_LOG_ERROR)) \ 3292 debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ 3293 } while (0) 3294 3295 #define FATAL(fmt, ...) \ 3296 debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); 3297 3298 #define ENSURE_MSG(env, expr, msg) \ 3299 do { \ 3300 if (unlikely(!(expr))) \ 3301 mdbx_assert_fail(env, msg, __func__, __LINE__); \ 3302 } while (0) 3303 3304 #define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) 3305 3306 /* assert(3) variant in environment context */ 3307 #define eASSERT(env, expr) \ 3308 do { \ 3309 if (ASSERT_ENABLED()) \ 3310 ENSURE(env, expr); \ 3311 } while (0) 3312 3313 /* assert(3) variant in cursor context */ 3314 #define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) 3315 3316 /* assert(3) variant in transaction context */ 3317 #define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) 3318 3319 #ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ 3320 #undef assert 3321 #define assert(expr) eASSERT(NULL, expr) 3322 #endif 3323 3324 /*----------------------------------------------------------------------------*/ 3325 /* Cache coherence and mmap invalidation */ 3326 3327 #if MDBX_CPU_WRITEBACK_INCOHERENT 3328 #define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() 3329 #else 3330 #define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() 3331 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ 3332 3333 MDBX_MAYBE_UNUSED static __inline void 3334 osal_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { 3335 #if MDBX_MMAP_INCOHERENT_FILE_WRITE 3336 char *const begin = (char *)(-pagesize & (intptr_t)addr); 3337 char *const end = 3338 (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); 3339 int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; 3340 eASSERT(nullptr, err == 0); 3341 (void)err; 3342 #else 3343 (void)pagesize; 3344 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ 3345 3346 #if MDBX_MMAP_INCOHERENT_CPU_CACHE 3347 #ifdef DCACHE 3348 /* MIPS has cache coherency issues. 3349 * Note: for any nbytes >= on-chip cache size, entire is flushed. */ 3350 cacheflush(addr, nbytes, DCACHE); 3351 #else 3352 #error "Oops, cacheflush() not available" 3353 #endif /* DCACHE */ 3354 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ 3355 3356 #if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE 3357 (void)addr; 3358 (void)nbytes; 3359 #endif 3360 } 3361 3362 /*----------------------------------------------------------------------------*/ 3363 /* Internal prototypes */ 3364 3365 MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, 3366 int *dead); 3367 MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, 3368 MDBX_reader *end); 3369 MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); 3370 3371 MDBX_INTERNAL_FUNC void global_ctor(void); 3372 MDBX_INTERNAL_FUNC void global_dtor(void); 3373 MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); 3374 3375 #endif /* !__cplusplus */ 3376 3377 #define MDBX_IS_ERROR(rc) \ 3378 ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) 3379 3380 /* Internal error codes, not exposed outside libmdbx */ 3381 #define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) 3382 3383 /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ 3384 #define DDBI(mc) \ 3385 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) 3386 3387 /* Key size which fits in a DKBUF (debug key buffer). */ 3388 #define DKBUF_MAX 511 3389 #define DKBUF char _kbuf[DKBUF_MAX * 4 + 2] 3390 #define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1) 3391 #define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1) 3392 3393 #if MDBX_DEBUG 3394 #define DKBUF_DEBUG DKBUF 3395 #define DKEY_DEBUG(x) DKEY(x) 3396 #define DVAL_DEBUG(x) DVAL(x) 3397 #else 3398 #define DKBUF_DEBUG ((void)(0)) 3399 #define DKEY_DEBUG(x) ("-") 3400 #define DVAL_DEBUG(x) ("-") 3401 #endif 3402 3403 /* An invalid page number. 3404 * Mainly used to denote an empty tree. */ 3405 #define P_INVALID (~(pgno_t)0) 3406 3407 /* Test if the flags f are set in a flag word w. */ 3408 #define F_ISSET(w, f) (((w) & (f)) == (f)) 3409 3410 /* Round n up to an even number. */ 3411 #define EVEN(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */ 3412 3413 /* Default size of memory map. 3414 * This is certainly too small for any actual applications. Apps should 3415 * always set the size explicitly using mdbx_env_set_geometry(). */ 3416 #define DEFAULT_MAPSIZE MEGABYTE 3417 3418 /* Number of slots in the reader table. 3419 * This value was chosen somewhat arbitrarily. The 61 is a prime number, 3420 * and such readers plus a couple mutexes fit into single 4KB page. 3421 * Applications should set the table size using mdbx_env_set_maxreaders(). */ 3422 #define DEFAULT_READERS 61 3423 3424 /* Test if a page is a leaf page */ 3425 #define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0) 3426 /* Test if a page is a LEAF2 page */ 3427 #define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0) 3428 /* Test if a page is a branch page */ 3429 #define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0) 3430 /* Test if a page is an overflow page */ 3431 #define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0) 3432 /* Test if a page is a sub page */ 3433 #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) 3434 3435 /* Header for a single key/data pair within a page. 3436 * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. 3437 * We guarantee 2-byte alignment for 'MDBX_node's. 3438 * 3439 * Leaf node flags describe node contents. F_BIGDATA says the node's 3440 * data part is the page number of an overflow page with actual data. 3441 * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in 3442 * a sub-page/sub-database, and named databases (just F_SUBDATA). */ 3443 typedef struct MDBX_node { 3444 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 3445 union { 3446 uint32_t mn_dsize; 3447 uint32_t mn_pgno32; 3448 }; 3449 uint8_t mn_flags; /* see mdbx_node flags */ 3450 uint8_t mn_extra; 3451 uint16_t mn_ksize; /* key size */ 3452 #else 3453 uint16_t mn_ksize; /* key size */ 3454 uint8_t mn_extra; 3455 uint8_t mn_flags; /* see mdbx_node flags */ 3456 union { 3457 uint32_t mn_pgno32; 3458 uint32_t mn_dsize; 3459 }; 3460 #endif /* __BYTE_ORDER__ */ 3461 3462 /* mdbx_node Flags */ 3463 #define F_BIGDATA 0x01 /* data put on overflow page */ 3464 #define F_SUBDATA 0x02 /* data is a sub-database */ 3465 #define F_DUPDATA 0x04 /* data has duplicates */ 3466 3467 /* valid flags for mdbx_node_add() */ 3468 #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) 3469 3470 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ 3471 (!defined(__cplusplus) && defined(_MSC_VER)) 3472 uint8_t mn_data[] /* key and data are appended here */; 3473 #endif /* C99 */ 3474 } MDBX_node; 3475 3476 #define DB_PERSISTENT_FLAGS \ 3477 (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ 3478 MDBX_INTEGERDUP | MDBX_REVERSEDUP) 3479 3480 /* mdbx_dbi_open() flags */ 3481 #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) 3482 3483 #define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ 3484 #define DB_INTERNAL_FLAGS DB_VALID 3485 3486 #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS 3487 #error "Oops, some flags overlapped or wrong" 3488 #endif 3489 #if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS 3490 #error "Oops, some flags overlapped or wrong" 3491 #endif 3492 3493 /* max number of pages to commit in one writev() call */ 3494 #define MDBX_COMMIT_PAGES 64 3495 #if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ 3496 #undef MDBX_COMMIT_PAGES 3497 #define MDBX_COMMIT_PAGES IOV_MAX 3498 #endif 3499 3500 /* 3501 * / 3502 * | -1, a < b 3503 * CMP2INT(a,b) = < 0, a == b 3504 * | 1, a > b 3505 * \ 3506 */ 3507 #ifndef __e2k__ 3508 /* LY: fast enough on most systems */ 3509 #define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) 3510 #else 3511 /* LY: more parallelable on VLIW Elbrus */ 3512 #define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) 3513 #endif 3514 3515 /* Do not spill pages to disk if txn is getting full, may fail instead */ 3516 #define MDBX_NOSPILL 0x8000 3517 3518 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3519 int64pgno(int64_t i64) { 3520 if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1)) 3521 return (pgno_t)i64; 3522 return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO; 3523 } 3524 3525 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3526 pgno_add(size_t base, size_t augend) { 3527 assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); 3528 return int64pgno(base + augend); 3529 } 3530 3531 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t 3532 pgno_sub(size_t base, size_t subtrahend) { 3533 assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && 3534 subtrahend < MAX_PAGENO); 3535 return int64pgno(base - subtrahend); 3536 } 3537 3538 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool 3539 is_powerof2(size_t x) { 3540 return (x & (x - 1)) == 0; 3541 } 3542 3543 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t 3544 floor_powerof2(size_t value, size_t granularity) { 3545 assert(is_powerof2(granularity)); 3546 return value & ~(granularity - 1); 3547 } 3548 3549 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t 3550 ceil_powerof2(size_t value, size_t granularity) { 3551 return floor_powerof2(value + granularity - 1, granularity); 3552 } 3553 3554 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned 3555 log2n_powerof2(size_t value) { 3556 assert(value > 0 && value < INT32_MAX && is_powerof2(value)); 3557 assert((value & -(int32_t)value) == value); 3558 #if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) 3559 return __builtin_ctzl(value); 3560 #elif defined(_MSC_VER) 3561 unsigned long index; 3562 _BitScanForward(&index, (unsigned long)value); 3563 return index; 3564 #else 3565 static const uint8_t debruijn_ctz32[32] = { 3566 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 3567 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; 3568 return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; 3569 #endif 3570 } 3571 3572 /* Only a subset of the mdbx_env flags can be changed 3573 * at runtime. Changing other flags requires closing the 3574 * environment and re-opening it with the new flags. */ 3575 #define ENV_CHANGEABLE_FLAGS \ 3576 (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ 3577 MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ 3578 MDBX_VALIDATION) 3579 #define ENV_CHANGELESS_FLAGS \ 3580 (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ 3581 MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) 3582 #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) 3583 3584 #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS 3585 MDBX_MAYBE_UNUSED static void static_checks(void) { 3586 STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, 3587 "Oops, MDBX_MAX_DBI or CORE_DBS?"); 3588 STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == 3589 ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) & 3590 (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)), 3591 "Oops, some flags overlapped or wrong"); 3592 STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0, 3593 "Oops, some flags overlapped or wrong"); 3594 } 3595 #endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */ 3596 3597 #ifdef __cplusplus 3598 } 3599 #endif 3600 3601 #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ 3602 do { \ 3603 TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ 3604 (size_t)(size), __LINE__); \ 3605 ASAN_POISON_MEMORY_REGION(addr, size); \ 3606 } while (0) 3607 3608 #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ 3609 do { \ 3610 TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ 3611 (size_t)(size), __LINE__); \ 3612 ASAN_UNPOISON_MEMORY_REGION(addr, size); \ 3613 } while (0) 3614 3615 typedef struct flagbit { 3616 int bit; 3617 const char *name; 3618 } flagbit; 3619 3620 const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, 3621 {MDBX_INTEGERKEY, "integerkey"}, 3622 {MDBX_REVERSEKEY, "reversekey"}, 3623 {MDBX_DUPFIXED, "dupfixed"}, 3624 {MDBX_REVERSEDUP, "reversedup"}, 3625 {MDBX_INTEGERDUP, "integerdup"}, 3626 {0, nullptr}}; 3627 3628 #if defined(_WIN32) || defined(_WIN64) 3629 /* 3630 * POSIX getopt for Windows 3631 * 3632 * AT&T Public License 3633 * 3634 * Code given out at the 1985 UNIFORUM conference in Dallas. 3635 */ 3636 3637 /*----------------------------------------------------------------------------*/ 3638 /* Microsoft compiler generates a lot of warning for self includes... */ 3639 3640 #ifdef _MSC_VER 3641 #pragma warning(push, 1) 3642 #pragma warning(disable : 4548) /* expression before comma has no effect; \ 3643 expected expression with side - effect */ 3644 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ 3645 * semantics are not enabled. Specify /EHsc */ 3646 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ 3647 * mode specified; termination on exception is \ 3648 * not guaranteed. Specify /EHsc */ 3649 #if !defined(_CRT_SECURE_NO_WARNINGS) 3650 #define _CRT_SECURE_NO_WARNINGS 3651 #endif 3652 #endif /* _MSC_VER (warnings) */ 3653 3654 #include <stdio.h> 3655 #include <string.h> 3656 3657 #ifdef _MSC_VER 3658 #pragma warning(pop) 3659 #endif 3660 /*----------------------------------------------------------------------------*/ 3661 3662 #ifndef NULL 3663 #define NULL 0 3664 #endif 3665 3666 #ifndef EOF 3667 #define EOF (-1) 3668 #endif 3669 3670 int optind = 1; 3671 int optopt; 3672 char *optarg; 3673 3674 int getopt(int argc, char *const argv[], const char *opts) { 3675 static int sp = 1; 3676 int c; 3677 const char *cp; 3678 3679 if (sp == 1) { 3680 if (optind >= argc || argv[optind][0] != '-' || argv[optind][1] == '\0') 3681 return EOF; 3682 else if (strcmp(argv[optind], "--") == 0) { 3683 optind++; 3684 return EOF; 3685 } 3686 } 3687 optopt = c = argv[optind][sp]; 3688 if (c == ':' || (cp = strchr(opts, c)) == NULL) { 3689 fprintf(stderr, "%s: %s -- %c\n", argv[0], "illegal option", c); 3690 if (argv[optind][++sp] == '\0') { 3691 optind++; 3692 sp = 1; 3693 } 3694 return '?'; 3695 } 3696 if (*++cp == ':') { 3697 if (argv[optind][sp + 1] != '\0') 3698 optarg = &argv[optind++][sp + 1]; 3699 else if (++optind >= argc) { 3700 fprintf(stderr, "%s: %s -- %c\n", argv[0], "option requires an argument", 3701 c); 3702 sp = 1; 3703 return '?'; 3704 } else 3705 optarg = argv[optind++]; 3706 sp = 1; 3707 } else { 3708 if (argv[optind][++sp] == '\0') { 3709 sp = 1; 3710 optind++; 3711 } 3712 optarg = NULL; 3713 } 3714 return c; 3715 } 3716 3717 static volatile BOOL user_break; 3718 static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { 3719 (void)dwCtrlType; 3720 user_break = 1; 3721 return true; 3722 } 3723 3724 static uint64_t GetMilliseconds(void) { 3725 LARGE_INTEGER Counter, Frequency; 3726 return (QueryPerformanceFrequency(&Frequency) && 3727 QueryPerformanceCounter(&Counter)) 3728 ? Counter.QuadPart * 1000ul / Frequency.QuadPart 3729 : 0; 3730 } 3731 3732 #else /* WINDOWS */ 3733 3734 static volatile sig_atomic_t user_break; 3735 static void signal_handler(int sig) { 3736 (void)sig; 3737 user_break = 1; 3738 } 3739 3740 #endif /* !WINDOWS */ 3741 3742 #define EXIT_INTERRUPTED (EXIT_FAILURE + 4) 3743 #define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) 3744 #define EXIT_FAILURE_MDBX (EXIT_FAILURE + 2) 3745 #define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) 3746 #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE 3747 3748 typedef struct { 3749 const char *name; 3750 struct { 3751 uint64_t branch, large_count, large_volume, leaf; 3752 uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; 3753 uint64_t total, empty, other; 3754 } pages; 3755 uint64_t payload_bytes; 3756 uint64_t lost_bytes; 3757 } walk_dbi_t; 3758 3759 struct { 3760 short *pagemap; 3761 uint64_t total_payload_bytes; 3762 uint64_t pgcount; 3763 walk_dbi_t 3764 dbi[MDBX_MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1]; 3765 } walk; 3766 3767 #define dbi_free walk.dbi[FREE_DBI] 3768 #define dbi_main walk.dbi[MAIN_DBI] 3769 #define dbi_meta walk.dbi[CORE_DBS] 3770 3771 int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; 3772 MDBX_env *env; 3773 MDBX_txn *txn; 3774 MDBX_envinfo envinfo; 3775 size_t userdb_count, skipped_subdb; 3776 uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, 3777 unused_pages, backed_pages; 3778 unsigned verbose; 3779 bool ignore_wrong_order, quiet, dont_traversal; 3780 const char *only_subdb; 3781 int stuck_meta = -1; 3782 3783 struct problem { 3784 struct problem *pr_next; 3785 size_t count; 3786 const char *caption; 3787 }; 3788 3789 struct problem *problems_list; 3790 unsigned total_problems, data_tree_problems, gc_tree_problems; 3791 3792 static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { 3793 if (!quiet) { 3794 va_list args; 3795 3796 fflush(stderr); 3797 va_start(args, msg); 3798 vfprintf(stdout, msg, args); 3799 va_end(args); 3800 } 3801 } 3802 3803 static void va_log(MDBX_log_level_t level, const char *function, int line, 3804 const char *msg, va_list args) { 3805 static const char *const prefixes[] = { 3806 "!!!fatal: ", " ! " /* error */, " ~ " /* warning */, 3807 " " /* notice */, " // " /* verbose */, " //// " /* debug */, 3808 " ////// " /* trace */ 3809 }; 3810 3811 FILE *out = stdout; 3812 if (level <= MDBX_LOG_ERROR) { 3813 total_problems++; 3814 out = stderr; 3815 } 3816 3817 if (!quiet && verbose + 1 >= (unsigned)level && 3818 (unsigned)level < ARRAY_LENGTH(prefixes)) { 3819 fflush(nullptr); 3820 fputs(prefixes[level], out); 3821 vfprintf(out, msg, args); 3822 3823 const bool have_lf = msg[strlen(msg) - 1] == '\n'; 3824 if (level == MDBX_LOG_FATAL && function && line) 3825 fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", 3826 function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); 3827 else if (!have_lf) 3828 fputc('\n', out); 3829 fflush(nullptr); 3830 } 3831 3832 if (level == MDBX_LOG_FATAL) { 3833 #if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS 3834 exit(EXIT_FAILURE_MDBX); 3835 #endif 3836 abort(); 3837 } 3838 } 3839 3840 static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) { 3841 va_list args; 3842 va_start(args, msg); 3843 va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); 3844 va_end(args); 3845 } 3846 3847 static void logger(MDBX_log_level_t level, const char *function, int line, 3848 const char *msg, va_list args) { 3849 (void)line; 3850 (void)function; 3851 if (level < MDBX_LOG_EXTRA) 3852 va_log(level, function, line, msg, args); 3853 } 3854 3855 static int check_user_break(void) { 3856 switch (user_break) { 3857 case 0: 3858 return MDBX_SUCCESS; 3859 case 1: 3860 print(" - interrupted by signal\n"); 3861 fflush(nullptr); 3862 user_break = 2; 3863 } 3864 return MDBX_EINTR; 3865 } 3866 3867 static void pagemap_cleanup(void) { 3868 for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; 3869 i < ARRAY_LENGTH(walk.dbi); ++i) { 3870 if (walk.dbi[i].name) { 3871 osal_free((void *)walk.dbi[i].name); 3872 walk.dbi[i].name = nullptr; 3873 } 3874 } 3875 3876 osal_free(walk.pagemap); 3877 walk.pagemap = nullptr; 3878 } 3879 3880 static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { 3881 static walk_dbi_t *last; 3882 3883 if (dbi_name == MDBX_PGWALK_MAIN) 3884 return &dbi_main; 3885 if (dbi_name == MDBX_PGWALK_GC) 3886 return &dbi_free; 3887 if (dbi_name == MDBX_PGWALK_META) 3888 return &dbi_meta; 3889 3890 if (last && strcmp(last->name, dbi_name) == 0) 3891 return last; 3892 3893 walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; 3894 for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { 3895 if (strcmp(dbi->name, dbi_name) == 0) 3896 return last = dbi; 3897 } 3898 3899 if (verbose > 0 && !silent) { 3900 print(" - found '%s' area\n", dbi_name); 3901 fflush(nullptr); 3902 } 3903 3904 if (dbi == ARRAY_END(walk.dbi)) 3905 return nullptr; 3906 3907 dbi->name = osal_strdup(dbi_name); 3908 return last = dbi; 3909 } 3910 3911 static void MDBX_PRINTF_ARGS(4, 5) 3912 problem_add(const char *object, uint64_t entry_number, const char *msg, 3913 const char *extra, ...) { 3914 total_problems++; 3915 3916 if (!quiet) { 3917 int need_fflush = 0; 3918 struct problem *p; 3919 3920 for (p = problems_list; p; p = p->pr_next) 3921 if (p->caption == msg) 3922 break; 3923 3924 if (!p) { 3925 p = osal_calloc(1, sizeof(*p)); 3926 if (unlikely(!p)) 3927 return; 3928 p->caption = msg; 3929 p->pr_next = problems_list; 3930 problems_list = p; 3931 need_fflush = 1; 3932 } 3933 3934 p->count++; 3935 if (verbose > 1) { 3936 print(" %s #%" PRIu64 ": %s", object, entry_number, msg); 3937 if (extra) { 3938 va_list args; 3939 printf(" ("); 3940 va_start(args, extra); 3941 vfprintf(stdout, extra, args); 3942 va_end(args); 3943 printf(")"); 3944 } 3945 printf("\n"); 3946 if (need_fflush) 3947 fflush(nullptr); 3948 } 3949 } 3950 } 3951 3952 static struct problem *problems_push(void) { 3953 struct problem *p = problems_list; 3954 problems_list = nullptr; 3955 return p; 3956 } 3957 3958 static size_t problems_pop(struct problem *list) { 3959 size_t count = 0; 3960 3961 if (problems_list) { 3962 int i; 3963 3964 print(" - problems: "); 3965 for (i = 0; problems_list; ++i) { 3966 struct problem *p = problems_list->pr_next; 3967 count += problems_list->count; 3968 print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, 3969 problems_list->count); 3970 osal_free(problems_list); 3971 problems_list = p; 3972 } 3973 print("\n"); 3974 fflush(nullptr); 3975 } 3976 3977 problems_list = list; 3978 return count; 3979 } 3980 3981 static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, 3982 void *const ctx, const int deep, 3983 const char *const dbi_name_or_tag, const size_t page_size, 3984 const MDBX_page_type_t pagetype, const MDBX_error_t err, 3985 const size_t nentries, const size_t payload_bytes, 3986 const size_t header_bytes, const size_t unused_bytes) { 3987 (void)ctx; 3988 const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC; 3989 if (deep > 42) { 3990 problem_add("deep", deep, "too large", nullptr); 3991 data_tree_problems += !is_gc_tree; 3992 gc_tree_problems += is_gc_tree; 3993 return MDBX_CORRUPTED /* avoid infinite loop/recursion */; 3994 } 3995 3996 walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false); 3997 if (!dbi) { 3998 data_tree_problems += !is_gc_tree; 3999 gc_tree_problems += is_gc_tree; 4000 return MDBX_ENOMEM; 4001 } 4002 4003 const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; 4004 walk.pgcount += pgnumber; 4005 4006 const char *pagetype_caption; 4007 bool branch = false; 4008 switch (pagetype) { 4009 default: 4010 problem_add("page", pgno, "unknown page-type", "type %u, deep %i", 4011 (unsigned)pagetype, deep); 4012 pagetype_caption = "unknown"; 4013 dbi->pages.other += pgnumber; 4014 data_tree_problems += !is_gc_tree; 4015 gc_tree_problems += is_gc_tree; 4016 break; 4017 case MDBX_page_broken: 4018 pagetype_caption = "broken"; 4019 dbi->pages.other += pgnumber; 4020 data_tree_problems += !is_gc_tree; 4021 gc_tree_problems += is_gc_tree; 4022 break; 4023 case MDBX_subpage_broken: 4024 pagetype_caption = "broken-subpage"; 4025 data_tree_problems += !is_gc_tree; 4026 gc_tree_problems += is_gc_tree; 4027 break; 4028 case MDBX_page_meta: 4029 pagetype_caption = "meta"; 4030 dbi->pages.other += pgnumber; 4031 break; 4032 case MDBX_page_large: 4033 pagetype_caption = "large"; 4034 dbi->pages.large_volume += pgnumber; 4035 dbi->pages.large_count += 1; 4036 break; 4037 case MDBX_page_branch: 4038 pagetype_caption = "branch"; 4039 dbi->pages.branch += pgnumber; 4040 branch = true; 4041 break; 4042 case MDBX_page_leaf: 4043 pagetype_caption = "leaf"; 4044 dbi->pages.leaf += pgnumber; 4045 break; 4046 case MDBX_page_dupfixed_leaf: 4047 pagetype_caption = "leaf-dupfixed"; 4048 dbi->pages.leaf_dupfixed += pgnumber; 4049 break; 4050 case MDBX_subpage_leaf: 4051 pagetype_caption = "subleaf-dupsort"; 4052 dbi->pages.subleaf_dupsort += 1; 4053 break; 4054 case MDBX_subpage_dupfixed_leaf: 4055 pagetype_caption = "subleaf-dupfixed"; 4056 dbi->pages.subleaf_dupfixed += 1; 4057 break; 4058 } 4059 4060 if (pgnumber) { 4061 if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) { 4062 if (pgnumber == 1) 4063 print(" %s-page %" PRIu64, pagetype_caption, pgno); 4064 else 4065 print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); 4066 print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR 4067 ", unused %" PRIiPTR ", deep %i\n", 4068 dbi->name, header_bytes, 4069 (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, 4070 payload_bytes, unused_bytes, deep); 4071 } 4072 4073 bool already_used = false; 4074 for (unsigned n = 0; n < pgnumber; ++n) { 4075 uint64_t spanpgno = pgno + n; 4076 if (spanpgno >= alloc_pages) { 4077 problem_add("page", spanpgno, "wrong page-no", 4078 "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i", 4079 pagetype_caption, spanpgno, alloc_pages, deep); 4080 data_tree_problems += !is_gc_tree; 4081 gc_tree_problems += is_gc_tree; 4082 } else if (walk.pagemap[spanpgno]) { 4083 walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; 4084 problem_add("page", spanpgno, 4085 (branch && coll_dbi == dbi) ? "loop" : "already used", 4086 "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, 4087 deep); 4088 already_used = true; 4089 data_tree_problems += !is_gc_tree; 4090 gc_tree_problems += is_gc_tree; 4091 } else { 4092 walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1); 4093 dbi->pages.total += 1; 4094 } 4095 } 4096 4097 if (already_used) 4098 return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ 4099 : MDBX_SUCCESS; 4100 } 4101 4102 if (MDBX_IS_ERROR(err)) { 4103 problem_add("page", pgno, "invalid/corrupted", "%s-page", pagetype_caption); 4104 data_tree_problems += !is_gc_tree; 4105 gc_tree_problems += is_gc_tree; 4106 } else { 4107 if (unused_bytes > page_size) { 4108 problem_add("page", pgno, "illegal unused-bytes", 4109 "%s-page: %u < %" PRIuPTR " < %u", pagetype_caption, 0, 4110 unused_bytes, envinfo.mi_dxb_pagesize); 4111 data_tree_problems += !is_gc_tree; 4112 gc_tree_problems += is_gc_tree; 4113 } 4114 4115 if (header_bytes < (int)sizeof(long) || 4116 (size_t)header_bytes >= envinfo.mi_dxb_pagesize - sizeof(long)) { 4117 problem_add("page", pgno, "illegal header-length", 4118 "%s-page: %" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR, 4119 pagetype_caption, sizeof(long), header_bytes, 4120 envinfo.mi_dxb_pagesize - sizeof(long)); 4121 data_tree_problems += !is_gc_tree; 4122 gc_tree_problems += is_gc_tree; 4123 } 4124 if (payload_bytes < 1) { 4125 if (nentries > 1) { 4126 problem_add("page", pgno, "zero size-of-entry", 4127 "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries", 4128 pagetype_caption, payload_bytes, nentries); 4129 /* if ((size_t)header_bytes + unused_bytes < page_size) { 4130 // LY: hush a misuse error 4131 page_bytes = page_size; 4132 } */ 4133 data_tree_problems += !is_gc_tree; 4134 gc_tree_problems += is_gc_tree; 4135 } else { 4136 problem_add("page", pgno, "empty", 4137 "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR 4138 " entries, deep %i", 4139 pagetype_caption, payload_bytes, nentries, deep); 4140 dbi->pages.empty += 1; 4141 data_tree_problems += !is_gc_tree; 4142 gc_tree_problems += is_gc_tree; 4143 } 4144 } 4145 4146 if (pgnumber) { 4147 if (page_bytes != page_size) { 4148 problem_add("page", pgno, "misused", 4149 "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR 4150 "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", 4151 pagetype_caption, page_size, page_bytes, header_bytes, 4152 payload_bytes, unused_bytes, deep); 4153 if (page_size > page_bytes) 4154 dbi->lost_bytes += page_size - page_bytes; 4155 data_tree_problems += !is_gc_tree; 4156 gc_tree_problems += is_gc_tree; 4157 } else { 4158 dbi->payload_bytes += payload_bytes + header_bytes; 4159 walk.total_payload_bytes += payload_bytes + header_bytes; 4160 } 4161 } 4162 } 4163 4164 return check_user_break(); 4165 } 4166 4167 typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, 4168 const MDBX_val *data); 4169 static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, 4170 bool silent); 4171 4172 static int handle_userdb(const uint64_t record_number, const MDBX_val *key, 4173 const MDBX_val *data) { 4174 (void)record_number; 4175 (void)key; 4176 (void)data; 4177 return check_user_break(); 4178 } 4179 4180 static int handle_freedb(const uint64_t record_number, const MDBX_val *key, 4181 const MDBX_val *data) { 4182 char *bad = ""; 4183 pgno_t *iptr = data->iov_base; 4184 4185 if (key->iov_len != sizeof(txnid_t)) 4186 problem_add("entry", record_number, "wrong txn-id size", 4187 "key-size %" PRIiPTR, key->iov_len); 4188 else { 4189 txnid_t txnid; 4190 memcpy(&txnid, key->iov_base, sizeof(txnid)); 4191 if (txnid < 1 || txnid > envinfo.mi_recent_txnid) 4192 problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid); 4193 else { 4194 if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) 4195 problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, 4196 data->iov_len); 4197 size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; 4198 if (number < 1 || number > MDBX_PGL_LIMIT) 4199 problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); 4200 else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { 4201 problem_add("entry", txnid, "trimmed idl", 4202 "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", 4203 (number + 1) * sizeof(pgno_t), data->iov_len); 4204 number = data->iov_len / sizeof(pgno_t) - 1; 4205 } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= 4206 /* LY: allow gap up to one page. it is ok 4207 * and better than shink-and-retry inside update_gc() */ 4208 envinfo.mi_dxb_pagesize) 4209 problem_add("entry", txnid, "extra idl space", 4210 "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", 4211 (number + 1) * sizeof(pgno_t), data->iov_len); 4212 4213 gc_pages += number; 4214 if (envinfo.mi_latter_reader_txnid > txnid) 4215 reclaimable_pages += number; 4216 4217 pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno; 4218 pgno_t span = 1; 4219 for (unsigned i = 0; i < number; ++i) { 4220 if (check_user_break()) 4221 return MDBX_EINTR; 4222 const pgno_t pgno = iptr[i]; 4223 if (pgno < NUM_METAS) 4224 problem_add("entry", txnid, "wrong idl entry", 4225 "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); 4226 else if (pgno >= backed_pages) 4227 problem_add("entry", txnid, "wrong idl entry", 4228 "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, 4229 backed_pages); 4230 else if (pgno >= alloc_pages) 4231 problem_add("entry", txnid, "wrong idl entry", 4232 "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, 4233 alloc_pages - 1); 4234 else { 4235 if (MDBX_PNL_DISORDERED(prev, pgno)) { 4236 bad = " [bad sequence]"; 4237 problem_add("entry", txnid, "bad sequence", 4238 "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, 4239 (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), 4240 i, pgno); 4241 } 4242 if (walk.pagemap) { 4243 int idx = walk.pagemap[pgno]; 4244 if (idx == 0) 4245 walk.pagemap[pgno] = -1; 4246 else if (idx > 0) 4247 problem_add("page", pgno, "already used", "by %s", 4248 walk.dbi[idx - 1].name); 4249 else 4250 problem_add("page", pgno, "already listed in GC", nullptr); 4251 } 4252 } 4253 prev = pgno; 4254 while (i + span < number && 4255 iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) 4256 : pgno_sub(pgno, span))) 4257 ++span; 4258 } 4259 if (verbose > 3 && !only_subdb) { 4260 print(" transaction %" PRIaTXN ", %" PRIuPTR 4261 " pages, maxspan %" PRIaPGNO "%s\n", 4262 txnid, number, span, bad); 4263 if (verbose > 4) { 4264 for (unsigned i = 0; i < number; i += span) { 4265 const pgno_t pgno = iptr[i]; 4266 for (span = 1; 4267 i + span < number && 4268 iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) 4269 : pgno_sub(pgno, span)); 4270 ++span) 4271 ; 4272 if (span > 1) { 4273 print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span); 4274 } else 4275 print(" %9" PRIaPGNO "\n", pgno); 4276 } 4277 } 4278 } 4279 } 4280 } 4281 4282 return check_user_break(); 4283 } 4284 4285 static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { 4286 return (a->iov_len == b->iov_len && 4287 memcmp(a->iov_base, b->iov_base, a->iov_len) == 0) 4288 ? 0 4289 : 1; 4290 } 4291 4292 static int handle_maindb(const uint64_t record_number, const MDBX_val *key, 4293 const MDBX_val *data) { 4294 char *name; 4295 int rc; 4296 size_t i; 4297 4298 name = key->iov_base; 4299 for (i = 0; i < key->iov_len; ++i) { 4300 if (name[i] < ' ') 4301 return handle_userdb(record_number, key, data); 4302 } 4303 4304 name = osal_malloc(key->iov_len + 1); 4305 if (unlikely(!name)) 4306 return MDBX_ENOMEM; 4307 memcpy(name, key->iov_base, key->iov_len); 4308 name[key->iov_len] = '\0'; 4309 userdb_count++; 4310 4311 rc = process_db(~0u, name, handle_userdb, false); 4312 osal_free(name); 4313 if (rc != MDBX_INCOMPATIBLE) 4314 return rc; 4315 4316 return handle_userdb(record_number, key, data); 4317 } 4318 4319 static const char *db_flags2keymode(unsigned flags) { 4320 flags &= (MDBX_REVERSEKEY | MDBX_INTEGERKEY); 4321 switch (flags) { 4322 case 0: 4323 return "usual"; 4324 case MDBX_REVERSEKEY: 4325 return "reserve"; 4326 case MDBX_INTEGERKEY: 4327 return "ordinal"; 4328 case MDBX_REVERSEKEY | MDBX_INTEGERKEY: 4329 return "msgpack"; 4330 default: 4331 assert(false); 4332 __unreachable(); 4333 } 4334 } 4335 4336 static const char *db_flags2valuemode(unsigned flags) { 4337 flags &= (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP); 4338 switch (flags) { 4339 case 0: 4340 return "single"; 4341 case MDBX_DUPSORT: 4342 return "multi"; 4343 case MDBX_REVERSEDUP: 4344 case MDBX_DUPSORT | MDBX_REVERSEDUP: 4345 return "multi-reverse"; 4346 case MDBX_DUPFIXED: 4347 case MDBX_DUPSORT | MDBX_DUPFIXED: 4348 return "multi-samelength"; 4349 case MDBX_DUPFIXED | MDBX_REVERSEDUP: 4350 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: 4351 return "multi-reverse-samelength"; 4352 case MDBX_INTEGERDUP: 4353 case MDBX_DUPSORT | MDBX_INTEGERDUP: 4354 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: 4355 case MDBX_DUPFIXED | MDBX_INTEGERDUP: 4356 return "multi-ordinal"; 4357 case MDBX_INTEGERDUP | MDBX_REVERSEDUP: 4358 case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: 4359 return "multi-msgpack"; 4360 case MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: 4361 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: 4362 return "reserved"; 4363 default: 4364 assert(false); 4365 __unreachable(); 4366 } 4367 } 4368 4369 static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, 4370 bool silent) { 4371 MDBX_cursor *mc; 4372 MDBX_stat ms; 4373 MDBX_val key, data; 4374 MDBX_val prev_key, prev_data; 4375 unsigned flags; 4376 int rc, i; 4377 struct problem *saved_list; 4378 uint64_t problems_count; 4379 4380 uint64_t record_count = 0, dups = 0; 4381 uint64_t key_bytes = 0, data_bytes = 0; 4382 4383 if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { 4384 print(" ! abort processing '%s' due to a previous error\n", 4385 dbi_name ? dbi_name : "@MAIN"); 4386 return MDBX_BAD_TXN; 4387 } 4388 4389 if (dbi_handle == ~0u) { 4390 rc = mdbx_dbi_open_ex( 4391 txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, 4392 (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, 4393 (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr); 4394 if (rc) { 4395 if (!dbi_name || 4396 rc != 4397 MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { 4398 error("mdbx_dbi_open('%s') failed, error %d %s\n", 4399 dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc)); 4400 } 4401 return rc; 4402 } 4403 } 4404 4405 if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && 4406 strcmp(only_subdb, dbi_name) != 0) { 4407 if (verbose) { 4408 print("Skip processing '%s'...\n", dbi_name); 4409 fflush(nullptr); 4410 } 4411 skipped_subdb++; 4412 return MDBX_SUCCESS; 4413 } 4414 4415 if (!silent && verbose) { 4416 print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN"); 4417 fflush(nullptr); 4418 } 4419 4420 rc = mdbx_dbi_flags(txn, dbi_handle, &flags); 4421 if (rc) { 4422 error("mdbx_dbi_flags() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4423 return rc; 4424 } 4425 4426 rc = mdbx_dbi_stat(txn, dbi_handle, &ms, sizeof(ms)); 4427 if (rc) { 4428 error("mdbx_dbi_stat() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4429 return rc; 4430 } 4431 4432 if (!silent && verbose) { 4433 print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), 4434 db_flags2valuemode(flags)); 4435 if (verbose > 1) { 4436 print(", flags:"); 4437 if (!flags) 4438 print(" none"); 4439 else { 4440 for (i = 0; dbflags[i].bit; i++) 4441 if (flags & dbflags[i].bit) 4442 print(" %s", dbflags[i].name); 4443 } 4444 if (verbose > 2) 4445 print(" (0x%02X), dbi-id %d", flags, dbi_handle); 4446 } 4447 print("\n"); 4448 if (ms.ms_mod_txnid) 4449 print(" - last modification txn#%" PRIu64 "\n", ms.ms_mod_txnid); 4450 if (verbose > 1) { 4451 print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize, 4452 ms.ms_entries); 4453 print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64 4454 ", overflow %" PRIu64 "\n", 4455 ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, 4456 ms.ms_overflow_pages); 4457 } 4458 } 4459 4460 walk_dbi_t *dbi = (dbi_handle < CORE_DBS) 4461 ? &walk.dbi[dbi_handle] 4462 : pagemap_lookup_dbi(dbi_name, true); 4463 if (!dbi) { 4464 error("too many DBIs or out of memory\n"); 4465 return MDBX_ENOMEM; 4466 } 4467 if (!dont_traversal) { 4468 const uint64_t subtotal_pages = 4469 ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages; 4470 if (subtotal_pages != dbi->pages.total) 4471 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", 4472 "subtotal", subtotal_pages, dbi->pages.total); 4473 if (ms.ms_branch_pages != dbi->pages.branch) 4474 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch", 4475 ms.ms_branch_pages, dbi->pages.branch); 4476 const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed; 4477 if (ms.ms_leaf_pages != allleaf_pages) 4478 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", 4479 "all-leaf", ms.ms_leaf_pages, allleaf_pages); 4480 if (ms.ms_overflow_pages != dbi->pages.large_volume) 4481 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", 4482 "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume); 4483 } 4484 rc = mdbx_cursor_open(txn, dbi_handle, &mc); 4485 if (rc) { 4486 error("mdbx_cursor_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4487 return rc; 4488 } 4489 4490 if (ignore_wrong_order) { /* for debugging with enabled assertions */ 4491 mc->mc_checking |= CC_SKIPORD; 4492 if (mc->mc_xcursor) 4493 mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; 4494 } 4495 4496 const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); 4497 saved_list = problems_push(); 4498 prev_key.iov_base = nullptr; 4499 prev_key.iov_len = 0; 4500 prev_data.iov_base = nullptr; 4501 prev_data.iov_len = 0; 4502 rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); 4503 while (rc == MDBX_SUCCESS) { 4504 rc = check_user_break(); 4505 if (rc) 4506 goto bailout; 4507 4508 bool bad_key = false; 4509 if (key.iov_len > maxkeysize) { 4510 problem_add("entry", record_count, "key length exceeds max-key-size", 4511 "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); 4512 bad_key = true; 4513 } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && 4514 key.iov_len != sizeof(uint32_t)) { 4515 problem_add("entry", record_count, "wrong key length", 4516 "%" PRIuPTR " != 4or8", key.iov_len); 4517 bad_key = true; 4518 } 4519 4520 bool bad_data = false; 4521 if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && 4522 data.iov_len != sizeof(uint32_t)) { 4523 problem_add("entry", record_count, "wrong data length", 4524 "%" PRIuPTR " != 4or8", data.iov_len); 4525 bad_data = true; 4526 } 4527 4528 if (prev_key.iov_base) { 4529 if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && 4530 prev_data.iov_len != data.iov_len) { 4531 problem_add("entry", record_count, "different data length", 4532 "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, 4533 data.iov_len); 4534 bad_data = true; 4535 } 4536 4537 if (!bad_key) { 4538 int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); 4539 if (cmp == 0) { 4540 ++dups; 4541 if ((flags & MDBX_DUPSORT) == 0) { 4542 problem_add("entry", record_count, "duplicated entries", nullptr); 4543 if (prev_data.iov_base && data.iov_len == prev_data.iov_len && 4544 memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) { 4545 problem_add("entry", record_count, "complete duplicate", nullptr); 4546 } 4547 } else if (!bad_data && prev_data.iov_base) { 4548 cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); 4549 if (cmp == 0) { 4550 problem_add("entry", record_count, "complete duplicate", nullptr); 4551 } else if (cmp < 0 && !ignore_wrong_order) { 4552 problem_add("entry", record_count, "wrong order of multi-values", 4553 nullptr); 4554 } 4555 } 4556 } else if (cmp < 0 && !ignore_wrong_order) { 4557 problem_add("entry", record_count, "wrong order of entries", nullptr); 4558 } 4559 } 4560 } 4561 4562 if (handler) { 4563 rc = handler(record_count, &key, &data); 4564 if (MDBX_IS_ERROR(rc)) 4565 goto bailout; 4566 } 4567 4568 record_count++; 4569 key_bytes += key.iov_len; 4570 data_bytes += data.iov_len; 4571 4572 if (!bad_key) { 4573 if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) 4574 print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); 4575 prev_key = key; 4576 } 4577 if (!bad_data) { 4578 if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && 4579 !prev_data.iov_base) 4580 print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); 4581 prev_data = data; 4582 } 4583 rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); 4584 } 4585 if (rc != MDBX_NOTFOUND) 4586 error("mdbx_cursor_get() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4587 else 4588 rc = 0; 4589 4590 if (record_count != ms.ms_entries) 4591 problem_add("entry", record_count, "different number of entries", 4592 "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); 4593 bailout: 4594 problems_count = problems_pop(saved_list); 4595 if (!silent && verbose) { 4596 print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 4597 " key's bytes, %" PRIu64 " data's " 4598 "bytes, %" PRIu64 " problems\n", 4599 record_count, dups, key_bytes, data_bytes, problems_count); 4600 fflush(nullptr); 4601 } 4602 4603 mdbx_cursor_close(mc); 4604 return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS; 4605 } 4606 4607 static void usage(char *prog) { 4608 fprintf(stderr, 4609 "usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] " 4610 "dbpath\n" 4611 " -V\t\tprint version and exit\n" 4612 " -v\t\tmore verbose, could be used multiple times\n" 4613 " -q\t\tbe quiet\n" 4614 " -c\t\tforce cooperative mode (don't try exclusive)\n" 4615 " -w\t\twrite-mode checking\n" 4616 " -d\t\tdisable page-by-page traversal of B-tree\n" 4617 " -i\t\tignore wrong order errors (for custom comparators case)\n" 4618 " -s subdb\tprocess a specific subdatabase only\n" 4619 " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" 4620 " -t\t\tturn to a specified meta-page on successful check\n" 4621 " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n", 4622 prog); 4623 exit(EXIT_INTERRUPTED); 4624 } 4625 4626 static bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, 4627 uint64_t sign_b, const bool wanna_steady) { 4628 if (txn_a == txn_b) 4629 return SIGN_IS_STEADY(sign_b); 4630 4631 if (wanna_steady && SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) 4632 return SIGN_IS_STEADY(sign_b); 4633 4634 return txn_a < txn_b; 4635 } 4636 4637 static bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, 4638 uint64_t sign_b) { 4639 if (!txn_a || txn_a != txn_b) 4640 return false; 4641 4642 if (SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) 4643 return false; 4644 4645 return true; 4646 } 4647 4648 static int meta_recent(const bool wanna_steady) { 4649 if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 4650 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady)) 4651 return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, 4652 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady) 4653 ? 1 4654 : 2; 4655 else 4656 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 4657 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, wanna_steady) 4658 ? 2 4659 : 0; 4660 } 4661 4662 static int meta_tail(int head) { 4663 switch (head) { 4664 case 0: 4665 return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, 4666 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) 4667 ? 1 4668 : 2; 4669 case 1: 4670 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 4671 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) 4672 ? 0 4673 : 2; 4674 case 2: 4675 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 4676 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true) 4677 ? 0 4678 : 1; 4679 default: 4680 assert(false); 4681 return -1; 4682 } 4683 } 4684 4685 static int meta_head(void) { return meta_recent(false); } 4686 4687 void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_x, 4688 uint64_t bootid_y) { 4689 const bool have_bootid = (bootid_x | bootid_y) != 0; 4690 const bool bootid_match = bootid_x == envinfo.mi_bootid.current.x && 4691 bootid_y == envinfo.mi_bootid.current.y; 4692 4693 print(" - meta-%d: ", num); 4694 switch (sign) { 4695 case MDBX_DATASIGN_NONE: 4696 print("no-sync/legacy"); 4697 break; 4698 case MDBX_DATASIGN_WEAK: 4699 print("weak-%s", bootid_match ? (have_bootid ? "intact (same boot-id)" 4700 : "unknown (no boot-id") 4701 : "dead"); 4702 break; 4703 default: 4704 print("steady"); 4705 break; 4706 } 4707 print(" txn#%" PRIu64, txnid); 4708 4709 const int head = meta_head(); 4710 if (num == head) 4711 print(", head"); 4712 else if (num == meta_tail(head)) 4713 print(", tail"); 4714 else 4715 print(", stay"); 4716 4717 if (stuck_meta >= 0) { 4718 if (num == stuck_meta) 4719 print(", forced for checking"); 4720 } else if (txnid > envinfo.mi_recent_txnid && 4721 (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE) 4722 print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", 4723 txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid); 4724 print("\n"); 4725 } 4726 4727 static uint64_t get_meta_txnid(const unsigned meta_id) { 4728 switch (meta_id) { 4729 default: 4730 assert(false); 4731 error("unexpected meta_id %u\n", meta_id); 4732 return 0; 4733 case 0: 4734 return envinfo.mi_meta0_txnid; 4735 case 1: 4736 return envinfo.mi_meta1_txnid; 4737 case 2: 4738 return envinfo.mi_meta2_txnid; 4739 } 4740 } 4741 4742 static void print_size(const char *prefix, const uint64_t value, 4743 const char *suffix) { 4744 const char sf[] = 4745 "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ 4746 double k = 1024.0; 4747 size_t i; 4748 for (i = 0; sf[i + 1] && value / k > 1000.0; ++i) 4749 k *= 1024; 4750 print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / k, sf[i], suffix); 4751 } 4752 4753 int mdbx_chk(int argc, char *argv[]) { 4754 int rc; 4755 char *prog = argv[0]; 4756 char *envname; 4757 unsigned problems_maindb = 0, problems_freedb = 0, problems_meta = 0; 4758 bool write_locked = false; 4759 bool turn_meta = false; 4760 bool force_turn_meta = false; 4761 4762 double elapsed; 4763 #if defined(_WIN32) || defined(_WIN64) 4764 uint64_t timestamp_start, timestamp_finish; 4765 timestamp_start = GetMilliseconds(); 4766 #else 4767 struct timespec timestamp_start, timestamp_finish; 4768 if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { 4769 rc = errno; 4770 error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4771 return EXIT_FAILURE_SYS; 4772 } 4773 #endif 4774 4775 dbi_meta.name = "@META"; 4776 dbi_free.name = "@GC"; 4777 dbi_main.name = "@MAIN"; 4778 atexit(pagemap_cleanup); 4779 4780 if (argc < 2) 4781 usage(prog); 4782 4783 for (int i; (i = getopt(argc, argv, 4784 "0" 4785 "1" 4786 "2" 4787 "T" 4788 "V" 4789 "v" 4790 "q" 4791 "n" 4792 "w" 4793 "c" 4794 "t" 4795 "d" 4796 "i" 4797 "s:")) != EOF;) { 4798 switch (i) { 4799 case 'V': 4800 printf("mdbx_chk version %d.%d.%d.%d\n" 4801 " - source: %s %s, commit %s, tree %s\n" 4802 " - anchor: %s\n" 4803 " - build: %s for %s by %s\n" 4804 " - flags: %s\n" 4805 " - options: %s\n", 4806 mdbx_version.major, mdbx_version.minor, mdbx_version.release, 4807 mdbx_version.revision, mdbx_version.git.describe, 4808 mdbx_version.git.datetime, mdbx_version.git.commit, 4809 mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, 4810 mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, 4811 mdbx_build.options); 4812 return EXIT_SUCCESS; 4813 case 'v': 4814 verbose++; 4815 break; 4816 case '0': 4817 stuck_meta = 0; 4818 break; 4819 case '1': 4820 stuck_meta = 1; 4821 break; 4822 case '2': 4823 stuck_meta = 2; 4824 break; 4825 case 't': 4826 turn_meta = true; 4827 break; 4828 case 'T': 4829 turn_meta = force_turn_meta = true; 4830 quiet = false; 4831 if (verbose < 2) 4832 verbose = 2; 4833 break; 4834 case 'q': 4835 quiet = true; 4836 break; 4837 case 'n': 4838 break; 4839 case 'w': 4840 envflags &= ~MDBX_RDONLY; 4841 #if MDBX_MMAP_INCOHERENT_FILE_WRITE 4842 /* Temporary `workaround` for OpenBSD kernel's flaw. 4843 * See todo4recovery://erased_by_github/libmdbx/issues/67 */ 4844 envflags |= MDBX_WRITEMAP; 4845 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ 4846 break; 4847 case 'c': 4848 envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE; 4849 break; 4850 case 'd': 4851 dont_traversal = true; 4852 break; 4853 case 's': 4854 if (only_subdb && strcmp(only_subdb, optarg)) 4855 usage(prog); 4856 only_subdb = optarg; 4857 break; 4858 case 'i': 4859 ignore_wrong_order = true; 4860 break; 4861 default: 4862 usage(prog); 4863 } 4864 } 4865 4866 if (optind != argc - 1) 4867 usage(prog); 4868 4869 rc = MDBX_SUCCESS; 4870 if (stuck_meta >= 0 && (envflags & MDBX_EXCLUSIVE) == 0) { 4871 error("exclusive mode is required to using specific meta-page(%d) for " 4872 "checking.\n", 4873 stuck_meta); 4874 rc = EXIT_INTERRUPTED; 4875 } 4876 if (turn_meta) { 4877 if (stuck_meta < 0) { 4878 error("meta-page must be specified (by -0, -1 or -2 options) to turn to " 4879 "it.\n"); 4880 rc = EXIT_INTERRUPTED; 4881 } 4882 if (envflags & MDBX_RDONLY) { 4883 error("write-mode must be enabled to turn to the specified meta-page.\n"); 4884 rc = EXIT_INTERRUPTED; 4885 } 4886 if (only_subdb || dont_traversal) { 4887 error("whole database checking with tree-traversal are required to turn " 4888 "to the specified meta-page.\n"); 4889 rc = EXIT_INTERRUPTED; 4890 } 4891 } 4892 if (rc) 4893 exit(rc); 4894 4895 #if defined(_WIN32) || defined(_WIN64) 4896 SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); 4897 #else 4898 #ifdef SIGPIPE 4899 signal(SIGPIPE, signal_handler); 4900 #endif 4901 #ifdef SIGHUP 4902 signal(SIGHUP, signal_handler); 4903 #endif 4904 signal(SIGINT, signal_handler); 4905 signal(SIGTERM, signal_handler); 4906 #endif /* !WINDOWS */ 4907 4908 envname = argv[optind]; 4909 print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n", 4910 mdbx_version.git.describe, mdbx_version.git.datetime, 4911 mdbx_version.git.tree, envname, 4912 (envflags & MDBX_RDONLY) ? "only" : "write"); 4913 fflush(nullptr); 4914 mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) 4915 ? (MDBX_log_level_t)(verbose + 1) 4916 : MDBX_LOG_TRACE, 4917 MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | 4918 MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, 4919 logger); 4920 4921 rc = mdbx_env_create(&env); 4922 if (rc) { 4923 error("mdbx_env_create() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4924 return rc < 0 ? EXIT_FAILURE_MDBX : EXIT_FAILURE_SYS; 4925 } 4926 4927 rc = mdbx_env_set_maxdbs(env, MDBX_MAX_DBI); 4928 if (rc) { 4929 error("mdbx_env_set_maxdbs() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4930 goto bailout; 4931 } 4932 4933 if (stuck_meta >= 0) { 4934 rc = mdbx_env_open_for_recovery(env, envname, stuck_meta, 4935 (envflags & MDBX_RDONLY) ? false : true); 4936 } else { 4937 rc = mdbx_env_open(env, envname, envflags, 0); 4938 if ((envflags & MDBX_EXCLUSIVE) && 4939 (rc == MDBX_BUSY || 4940 #if defined(_WIN32) || defined(_WIN64) 4941 rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION 4942 #else 4943 rc == EBUSY || rc == EAGAIN 4944 #endif 4945 )) { 4946 envflags &= ~MDBX_EXCLUSIVE; 4947 rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0); 4948 } 4949 } 4950 4951 if (rc) { 4952 error("mdbx_env_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4953 if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY)) 4954 print("Please run %s in the read-write mode (with '-w' option).\n", prog); 4955 goto bailout; 4956 } 4957 if (verbose) 4958 print(" - %s mode\n", 4959 (envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative"); 4960 4961 if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) { 4962 rc = mdbx_txn_lock(env, false); 4963 if (rc != MDBX_SUCCESS) { 4964 error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4965 goto bailout; 4966 } 4967 write_locked = true; 4968 } 4969 4970 rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); 4971 if (rc) { 4972 error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4973 goto bailout; 4974 } 4975 4976 rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo)); 4977 if (rc) { 4978 error("mdbx_env_info_ex() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4979 goto bailout; 4980 } 4981 if (verbose) { 4982 print(" - current boot-id "); 4983 if (envinfo.mi_bootid.current.x | envinfo.mi_bootid.current.y) 4984 print("%016" PRIx64 "-%016" PRIx64 "\n", envinfo.mi_bootid.current.x, 4985 envinfo.mi_bootid.current.y); 4986 else 4987 print("unavailable\n"); 4988 } 4989 4990 mdbx_filehandle_t dxb_fd; 4991 rc = mdbx_env_get_fd(env, &dxb_fd); 4992 if (rc) { 4993 error("mdbx_env_get_fd() failed, error %d %s\n", rc, mdbx_strerror(rc)); 4994 goto bailout; 4995 } 4996 4997 uint64_t dxb_filesize = 0; 4998 #if defined(_WIN32) || defined(_WIN64) 4999 { 5000 BY_HANDLE_FILE_INFORMATION info; 5001 if (!GetFileInformationByHandle(dxb_fd, &info)) 5002 rc = GetLastError(); 5003 else 5004 dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; 5005 } 5006 #else 5007 { 5008 struct stat st; 5009 STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), 5010 "libmdbx requires 64-bit file I/O on 64-bit systems"); 5011 if (fstat(dxb_fd, &st)) 5012 rc = errno; 5013 else 5014 dxb_filesize = st.st_size; 5015 } 5016 #endif 5017 if (rc) { 5018 error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); 5019 goto bailout; 5020 } 5021 5022 errno = 0; 5023 const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize; 5024 alloc_pages = txn->mt_next_pgno; 5025 backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; 5026 if (backed_pages > dxbfile_pages) { 5027 print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", 5028 backed_pages, dxbfile_pages); 5029 ++problems_meta; 5030 } 5031 if (dxbfile_pages < NUM_METAS) 5032 print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS); 5033 if (backed_pages < NUM_METAS) 5034 print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS); 5035 if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS) 5036 goto bailout; 5037 if (backed_pages > MAX_PAGENO + 1) { 5038 print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n", 5039 backed_pages, MAX_PAGENO + 1); 5040 ++problems_meta; 5041 backed_pages = MAX_PAGENO + 1; 5042 } 5043 5044 if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { 5045 if (backed_pages > dxbfile_pages) { 5046 print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", 5047 backed_pages, dxbfile_pages); 5048 ++problems_meta; 5049 backed_pages = dxbfile_pages; 5050 } 5051 if (alloc_pages > backed_pages) { 5052 print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", 5053 alloc_pages, backed_pages); 5054 ++problems_meta; 5055 alloc_pages = backed_pages; 5056 } 5057 } else { 5058 /* LY: DB may be shrinked by writer down to the allocated pages. */ 5059 if (alloc_pages > backed_pages) { 5060 print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", 5061 alloc_pages, backed_pages); 5062 ++problems_meta; 5063 alloc_pages = backed_pages; 5064 } 5065 if (alloc_pages > dxbfile_pages) { 5066 print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n", 5067 alloc_pages, dxbfile_pages); 5068 ++problems_meta; 5069 alloc_pages = dxbfile_pages; 5070 } 5071 if (backed_pages > dxbfile_pages) 5072 backed_pages = dxbfile_pages; 5073 } 5074 5075 if (verbose) { 5076 print(" - pagesize %u (%u system), max keysize %d..%d" 5077 ", max readers %u\n", 5078 envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, 5079 mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), 5080 mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders); 5081 print_size(" - mapsize ", envinfo.mi_mapsize, "\n"); 5082 if (envinfo.mi_geo.lower == envinfo.mi_geo.upper) 5083 print_size(" - fixed datafile: ", envinfo.mi_geo.current, ""); 5084 else { 5085 print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, ""); 5086 print_size(" .. ", envinfo.mi_geo.upper, ", "); 5087 print_size("+", envinfo.mi_geo.grow, ", "); 5088 print_size("-", envinfo.mi_geo.shrink, "\n"); 5089 print_size(" - current datafile: ", envinfo.mi_geo.current, ""); 5090 } 5091 printf(", %" PRIu64 " pages\n", 5092 envinfo.mi_geo.current / envinfo.mi_dxb_pagesize); 5093 #if defined(_WIN32) || defined(_WIN64) 5094 if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper) 5095 print( 5096 " WARNING: Due Windows system limitations a " 5097 "file couldn't\n be truncated while the database " 5098 "is opened. So, the size\n database file " 5099 "of may by large than the database itself,\n " 5100 "until it will be closed or reopened in read-write mode.\n"); 5101 #endif 5102 verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 5103 envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y); 5104 verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, 5105 envinfo.mi_bootid.meta1.x, envinfo.mi_bootid.meta1.y); 5106 verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, 5107 envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y); 5108 } 5109 5110 if (stuck_meta >= 0) { 5111 if (verbose) { 5112 print(" - skip checking meta-pages since the %u" 5113 " is selected for verification\n", 5114 stuck_meta); 5115 print(" - transactions: recent %" PRIu64 5116 ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n", 5117 envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta), 5118 envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta)); 5119 } 5120 } else { 5121 if (verbose > 1) 5122 print(" - performs check for meta-pages clashes\n"); 5123 if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, 5124 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { 5125 print(" ! meta-%d and meta-%d are clashed\n", 0, 1); 5126 ++problems_meta; 5127 } 5128 if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, 5129 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { 5130 print(" ! meta-%d and meta-%d are clashed\n", 1, 2); 5131 ++problems_meta; 5132 } 5133 if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, 5134 envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { 5135 print(" ! meta-%d and meta-%d are clashed\n", 2, 0); 5136 ++problems_meta; 5137 } 5138 5139 const unsigned steady_meta_id = meta_recent(true); 5140 const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); 5141 const unsigned weak_meta_id = meta_recent(false); 5142 const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); 5143 if (envflags & MDBX_EXCLUSIVE) { 5144 if (verbose > 1) 5145 print(" - performs full check recent-txn-id with meta-pages\n"); 5146 if (steady_meta_txnid != envinfo.mi_recent_txnid) { 5147 print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 5148 " != %" PRIi64 ")\n", 5149 steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); 5150 ++problems_meta; 5151 } 5152 } else if (write_locked) { 5153 if (verbose > 1) 5154 print(" - performs lite check recent-txn-id with meta-pages (not a " 5155 "monopolistic mode)\n"); 5156 if (weak_meta_txnid != envinfo.mi_recent_txnid) { 5157 print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 5158 " != %" PRIi64 ")\n", 5159 weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); 5160 ++problems_meta; 5161 } 5162 } else if (verbose) { 5163 print(" - skip check recent-txn-id with meta-pages (monopolistic or " 5164 "read-write mode only)\n"); 5165 } 5166 total_problems += problems_meta; 5167 5168 if (verbose) 5169 print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 5170 ", lag %" PRIi64 "\n", 5171 envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, 5172 envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); 5173 } 5174 5175 if (!dont_traversal) { 5176 struct problem *saved_list; 5177 size_t traversal_problems; 5178 uint64_t empty_pages, lost_bytes; 5179 5180 print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); 5181 fflush(nullptr); 5182 walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); 5183 if (!walk.pagemap) { 5184 rc = errno ? errno : MDBX_ENOMEM; 5185 error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); 5186 goto bailout; 5187 } 5188 5189 saved_list = problems_push(); 5190 rc = mdbx_env_pgwalk(txn, pgvisitor, nullptr, 5191 true /* always skip key ordering checking to avoid 5192 MDBX_CORRUPTED when using custom comparators */); 5193 traversal_problems = problems_pop(saved_list); 5194 5195 if (rc) { 5196 if (rc != MDBX_EINTR || !check_user_break()) 5197 error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); 5198 goto bailout; 5199 } 5200 5201 for (uint64_t n = 0; n < alloc_pages; ++n) 5202 if (!walk.pagemap[n]) 5203 unused_pages += 1; 5204 5205 empty_pages = lost_bytes = 0; 5206 for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name; 5207 ++dbi) { 5208 empty_pages += dbi->pages.empty; 5209 lost_bytes += dbi->lost_bytes; 5210 } 5211 5212 if (verbose) { 5213 uint64_t total_page_bytes = walk.pgcount * envinfo.mi_dxb_pagesize; 5214 print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", 5215 walk.pgcount, unused_pages); 5216 if (verbose > 1) { 5217 for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; 5218 ++dbi) { 5219 print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total); 5220 if (dbi->pages.other && dbi->pages.other != dbi->pages.total) 5221 print(", other %" PRIu64, dbi->pages.other); 5222 if (dbi->pages.branch) 5223 print(", branch %" PRIu64, dbi->pages.branch); 5224 if (dbi->pages.large_count) 5225 print(", large %" PRIu64, dbi->pages.large_count); 5226 uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed; 5227 if (all_leaf) { 5228 print(", leaf %" PRIu64, all_leaf); 5229 if (verbose > 2 && 5230 (dbi->pages.subleaf_dupsort | dbi->pages.leaf_dupfixed | 5231 dbi->pages.subleaf_dupfixed)) 5232 print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64 5233 ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")", 5234 dbi->pages.leaf, dbi->pages.subleaf_dupsort, 5235 dbi->pages.leaf_dupfixed, dbi->pages.subleaf_dupfixed); 5236 } 5237 print("\n"); 5238 } 5239 } 5240 5241 if (verbose > 1) 5242 print(" - usage: total %" PRIu64 " bytes, payload %" PRIu64 5243 " (%.1f%%), unused " 5244 "%" PRIu64 " (%.1f%%)\n", 5245 total_page_bytes, walk.total_payload_bytes, 5246 walk.total_payload_bytes * 100.0 / total_page_bytes, 5247 total_page_bytes - walk.total_payload_bytes, 5248 (total_page_bytes - walk.total_payload_bytes) * 100.0 / 5249 total_page_bytes); 5250 if (verbose > 2) { 5251 for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; 5252 ++dbi) 5253 if (dbi->pages.total) { 5254 uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; 5255 print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," 5256 " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", 5257 dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, 5258 dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, 5259 dbi_bytes - dbi->payload_bytes, 5260 (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); 5261 if (dbi->pages.empty) 5262 print(", %" PRIu64 " empty pages", dbi->pages.empty); 5263 if (dbi->lost_bytes) 5264 print(", %" PRIu64 " bytes lost", dbi->lost_bytes); 5265 print("\n"); 5266 } else 5267 print(" %s: empty\n", dbi->name); 5268 } 5269 print(" - summary: average fill %.1f%%", 5270 walk.total_payload_bytes * 100.0 / total_page_bytes); 5271 if (empty_pages) 5272 print(", %" PRIu64 " empty pages", empty_pages); 5273 if (lost_bytes) 5274 print(", %" PRIu64 " bytes lost", lost_bytes); 5275 print(", %" PRIuPTR " problems\n", traversal_problems); 5276 } 5277 } else if (verbose) { 5278 print("Skipping b-tree walk...\n"); 5279 fflush(nullptr); 5280 } 5281 5282 if (!verbose) 5283 print("Iterating DBIs...\n"); 5284 if (data_tree_problems) { 5285 print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN", 5286 data_tree_problems); 5287 problems_maindb = data_tree_problems; 5288 } else 5289 problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); 5290 5291 if (gc_tree_problems) { 5292 print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC", 5293 gc_tree_problems); 5294 problems_freedb = gc_tree_problems; 5295 } else 5296 problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false); 5297 5298 if (verbose) { 5299 uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; 5300 double percent = value / 100.0; 5301 print(" - space: %" PRIu64 " total pages", value); 5302 print(", backed %" PRIu64 " (%.1f%%)", backed_pages, 5303 backed_pages / percent); 5304 print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages, 5305 alloc_pages / percent); 5306 5307 if (verbose > 1) { 5308 value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages; 5309 print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); 5310 5311 value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount; 5312 print(", used %" PRIu64 " (%.1f%%)", value, value / percent); 5313 5314 print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent); 5315 5316 value = gc_pages - reclaimable_pages; 5317 print(", detained %" PRIu64 " (%.1f%%)", value, value / percent); 5318 5319 print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages, 5320 reclaimable_pages / percent); 5321 } 5322 5323 value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages + 5324 reclaimable_pages; 5325 print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); 5326 } 5327 5328 if (problems_maindb == 0 && problems_freedb == 0) { 5329 if (!dont_traversal && 5330 (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { 5331 if (walk.pgcount != alloc_pages - gc_pages) { 5332 error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64 5333 "(allocated - GC))\n", 5334 walk.pgcount, alloc_pages - gc_pages); 5335 } 5336 if (unused_pages != gc_pages) { 5337 error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", 5338 unused_pages, gc_pages); 5339 } 5340 } else if (verbose) { 5341 print(" - skip check used and gc pages (btree-traversal with " 5342 "monopolistic or read-write mode only)\n"); 5343 } 5344 5345 if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { 5346 if (!userdb_count && verbose) 5347 print(" - does not contain multiple databases\n"); 5348 } 5349 } 5350 5351 if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && 5352 (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 && 5353 get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { 5354 print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 5355 "\n", 5356 envinfo.mi_recent_txnid); 5357 fflush(nullptr); 5358 if (write_locked) { 5359 mdbx_txn_unlock(env); 5360 write_locked = false; 5361 } 5362 rc = mdbx_env_sync_ex(env, true, false); 5363 if (rc != MDBX_SUCCESS) 5364 error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); 5365 else { 5366 total_problems -= 1; 5367 problems_meta -= 1; 5368 } 5369 } 5370 5371 if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb && 5372 (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { 5373 const bool successful_check = (rc | total_problems | problems_meta) == 0; 5374 if (successful_check || force_turn_meta) { 5375 fflush(nullptr); 5376 print(" = Performing turn to the specified meta-page (%d) due to %s!\n", 5377 stuck_meta, 5378 successful_check ? "successful check" : "the -T option was given"); 5379 fflush(nullptr); 5380 rc = mdbx_env_turn_for_recovery(env, stuck_meta); 5381 if (rc != MDBX_SUCCESS) 5382 error("mdbx_env_turn_for_recovery() failed, error %d %s\n", rc, 5383 mdbx_strerror(rc)); 5384 } else { 5385 print(" = Skipping turn to the specified meta-page (%d) due to " 5386 "unsuccessful check!\n", 5387 stuck_meta); 5388 } 5389 } 5390 5391 bailout: 5392 if (txn) 5393 mdbx_txn_abort(txn); 5394 if (write_locked) { 5395 mdbx_txn_unlock(env); 5396 write_locked = false; 5397 } 5398 if (env) { 5399 const bool dont_sync = rc != 0 || total_problems; 5400 mdbx_env_close_ex(env, dont_sync); 5401 } 5402 fflush(nullptr); 5403 if (rc) { 5404 if (rc < 0) 5405 return user_break ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; 5406 return EXIT_FAILURE_MDBX; 5407 } 5408 5409 #if defined(_WIN32) || defined(_WIN64) 5410 timestamp_finish = GetMilliseconds(); 5411 elapsed = (timestamp_finish - timestamp_start) * 1e-3; 5412 #else 5413 if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { 5414 rc = errno; 5415 error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); 5416 return EXIT_FAILURE_SYS; 5417 } 5418 elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + 5419 (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; 5420 #endif /* !WINDOWS */ 5421 5422 if (total_problems) { 5423 print("Total %u error%s detected, elapsed %.3f seconds.\n", total_problems, 5424 (total_problems > 1) ? "s are" : " is", elapsed); 5425 if (problems_meta || problems_maindb || problems_freedb) 5426 return EXIT_FAILURE_CHECK_MAJOR; 5427 return EXIT_FAILURE_CHECK_MINOR; 5428 } 5429 print("No error is detected, elapsed %.3f seconds\n", elapsed); 5430 return EXIT_SUCCESS; 5431 }