mdb.c (294997B)
1 /** @file mdb.c 2 * @brief Lightning memory-mapped database library 3 * 4 * A Btree-based database management library modeled loosely on the 5 * BerkeleyDB API, but much simplified. 6 */ 7 /* 8 * Copyright 2011-2021 Howard Chu, Symas Corp. 9 * All rights reserved. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted only as authorized by the OpenLDAP 13 * Public License. 14 * 15 * A copy of this license is available in the file LICENSE in the 16 * top-level directory of the distribution or, alternatively, at 17 * <http://www.OpenLDAP.org/license.html>. 18 * 19 * This code is derived from btree.c written by Martin Hedenfalk. 20 * 21 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 22 * 23 * Permission to use, copy, modify, and distribute this software for any 24 * purpose with or without fee is hereby granted, provided that the above 25 * copyright notice and this permission notice appear in all copies. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 28 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 29 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 30 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 31 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 32 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 33 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 34 */ 35 #ifndef _GNU_SOURCE 36 #define _GNU_SOURCE 1 37 #endif 38 #if defined(__WIN64__) 39 #define _FILE_OFFSET_BITS 64 40 #endif 41 #ifdef _WIN32 42 #include <malloc.h> 43 #include <windows.h> 44 #include <wchar.h> /* get wcscpy() */ 45 46 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it 47 * as int64 which is wrong. MSVC doesn't define it at all, so just 48 * don't use it. 
49 */ 50 #define MDB_PID_T int 51 #define MDB_THR_T DWORD 52 #include <sys/types.h> 53 #include <sys/stat.h> 54 #ifdef __GNUC__ 55 # include <sys/param.h> 56 #else 57 # define LITTLE_ENDIAN 1234 58 # define BIG_ENDIAN 4321 59 # define BYTE_ORDER LITTLE_ENDIAN 60 # ifndef SSIZE_MAX 61 # define SSIZE_MAX INT_MAX 62 # endif 63 #endif 64 #else 65 #include <sys/types.h> 66 #include <sys/stat.h> 67 #define MDB_PID_T pid_t 68 #define MDB_THR_T pthread_t 69 #include <sys/param.h> 70 #include <sys/uio.h> 71 #include <sys/mman.h> 72 #ifdef HAVE_SYS_FILE_H 73 #include <sys/file.h> 74 #endif 75 #include <fcntl.h> 76 #endif 77 78 #if defined(__mips) && defined(__linux) 79 /* MIPS has cache coherency issues, requires explicit cache control */ 80 #include <sys/cachectl.h> 81 #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) 82 #else 83 #define CACHEFLUSH(addr, bytes, cache) 84 #endif 85 86 #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) 87 /** fdatasync is broken on ext3/ext4fs on older kernels, see 88 * description in #mdb_env_open2 comments. You can safely 89 * define MDB_FDATASYNC_WORKS if this code will only be run 90 * on kernels 3.6 and newer. 
91 */ 92 #define BROKEN_FDATASYNC 93 #endif 94 95 #include <errno.h> 96 #include <limits.h> 97 #include <stddef.h> 98 #include <inttypes.h> 99 #include <stdio.h> 100 #include <stdlib.h> 101 #include <string.h> 102 #include <time.h> 103 104 #ifdef _MSC_VER 105 #include <io.h> 106 typedef SSIZE_T ssize_t; 107 #else 108 #include <unistd.h> 109 #endif 110 111 #if defined(__sun) || defined(ANDROID) 112 /* Most platforms have posix_memalign, older may only have memalign */ 113 #define HAVE_MEMALIGN 1 114 #include <malloc.h> 115 /* On Solaris, we need the POSIX sigwait function */ 116 #if defined (__sun) 117 # define _POSIX_PTHREAD_SEMANTICS 1 118 #endif 119 #endif 120 121 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) 122 #include <netinet/in.h> 123 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ 124 #endif 125 126 #if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110 127 # define MDB_USE_POSIX_MUTEX 1 128 # define MDB_USE_ROBUST 1 129 #elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) 130 # define MDB_USE_POSIX_SEM 1 131 # define MDB_FDATASYNC fsync 132 #elif defined(ANDROID) 133 # define MDB_FDATASYNC fsync 134 #endif 135 136 #ifndef _WIN32 137 #include <pthread.h> 138 #include <signal.h> 139 #ifdef MDB_USE_POSIX_SEM 140 # define MDB_USE_HASH 1 141 #include <semaphore.h> 142 #else 143 #define MDB_USE_POSIX_MUTEX 1 144 #endif 145 #endif 146 147 #if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ 148 + defined(MDB_USE_POSIX_MUTEX) != 1 149 # error "Ambiguous shared-lock implementation" 150 #endif 151 152 #ifdef USE_VALGRIND 153 #include <valgrind/memcheck.h> 154 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) 155 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) 156 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) 157 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) 158 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) 159 #else 160 #define 
VGMEMP_CREATE(h,r,z) 161 #define VGMEMP_ALLOC(h,a,s) 162 #define VGMEMP_FREE(h,a) 163 #define VGMEMP_DESTROY(h) 164 #define VGMEMP_DEFINED(a,s) 165 #endif 166 167 #ifndef BYTE_ORDER 168 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) 169 /* Solaris just defines one or the other */ 170 # define LITTLE_ENDIAN 1234 171 # define BIG_ENDIAN 4321 172 # ifdef _LITTLE_ENDIAN 173 # define BYTE_ORDER LITTLE_ENDIAN 174 # else 175 # define BYTE_ORDER BIG_ENDIAN 176 # endif 177 # else 178 # define BYTE_ORDER __BYTE_ORDER 179 # endif 180 #endif 181 182 #ifndef LITTLE_ENDIAN 183 #define LITTLE_ENDIAN __LITTLE_ENDIAN 184 #endif 185 #ifndef BIG_ENDIAN 186 #define BIG_ENDIAN __BIG_ENDIAN 187 #endif 188 189 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) 190 #define MISALIGNED_OK 1 191 #endif 192 193 #include "lmdb.h" 194 #include "midl.h" 195 196 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) 197 # error "Unknown or unsupported endianness (BYTE_ORDER)" 198 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF 199 # error "Two's complement, reasonably sized integer types, please" 200 #endif 201 202 #if (((__clang_major__ << 8) | __clang_minor__) >= 0x0302) || (((__GNUC__ << 8) | __GNUC_MINOR__) >= 0x0403) 203 /** Mark infrequently used env functions as cold. 
This puts them in a separate 204 * section, and optimizes them for size */ 205 #define ESECT __attribute__ ((cold)) 206 #else 207 /* On older compilers, use a separate section */ 208 # ifdef __GNUC__ 209 # ifdef __APPLE__ 210 # define ESECT __attribute__ ((section("__TEXT,text_env"))) 211 # else 212 # define ESECT __attribute__ ((section("text_env"))) 213 # endif 214 # else /* __GNUC__ is not defined: ESECT expands to nothing */ 215 # define ESECT 216 # endif 217 #endif 218 /** Calling-convention keyword: WINAPI on Windows, empty on every other platform. */ 219 #ifdef _WIN32 220 #define CALL_CONV WINAPI 221 #else 222 #define CALL_CONV 223 #endif 224 225 /** @defgroup internal LMDB Internals 226 * @{ 227 */ 228 /** @defgroup compat Compatibility Macros 229 * A bunch of macros to minimize the amount of platform-specific ifdefs 230 * needed throughout the rest of the code. When the features this library 231 * needs are similar enough to POSIX to be hidden in a one-or-two line 232 * replacement, this macro approach is used. 233 * @{ 234 */ 235 236 /** Features under development; 0 unless overridden at compile time */ 237 #ifndef MDB_DEVEL 238 #define MDB_DEVEL 0 239 #endif 240 241 /** Wrapper around __func__, which is a C99 feature */ 242 #if __STDC_VERSION__ >= 199901L 243 # define mdb_func_ __func__ 244 #elif __GNUC__ >= 2 || _MSC_VER >= 1300 245 # define mdb_func_ __FUNCTION__ 246 #else 247 /* If a debug message says <mdb_unknown>(), update the #if statements above */ 248 # define mdb_func_ "<mdb_unknown>" 249 #endif 250 251 /* Internal error codes, not exposed outside liblmdb */ 252 #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) 253 #ifdef _WIN32 254 #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) 255 #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) 256 #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ 257 #endif 258 /** glibc version packed as (major << 16) | minor, so it can be compared against hex constants. */ 259 #ifdef __GLIBC__ 260 #define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) 261 #endif 262 /** Some platforms define the EOWNERDEAD error code 263 * even though they don't support Robust Mutexes.
264 * Compile with -DMDB_USE_ROBUST=0, or use some other 265 * mechanism like -DMDB_USE_POSIX_SEM instead of 266 * -DMDB_USE_POSIX_MUTEX. 267 * (Posix semaphores are not robust.) 268 */ 269 #ifndef MDB_USE_ROBUST 270 /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ 271 # if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ 272 (defined(__GLIBC__) && GLIBC_VER < 0x020004)) 273 # define MDB_USE_ROBUST 0 274 # else 275 # define MDB_USE_ROBUST 1 276 # endif 277 #endif /* !MDB_USE_ROBUST */ 278 279 #if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) 280 /* glibc < 2.12 only provided _np API; map the standard names onto the _np ones */ 281 # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ 282 (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) 283 # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP 284 # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) 285 # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) 286 # endif 287 #endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ 288 /** Defined only when an owner-died lock result (MDB_OWNERDEAD) exists and MDB_USE_ROBUST is nonzero. */ 289 #if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) 290 #define MDB_ROBUST_SUPPORTED 1 291 #endif 292 /* Windows: map the pthread names used in this file onto Win32 HANDLE, TLS and thread primitives. */ 293 #ifdef _WIN32 294 #define MDB_USE_HASH 1 295 #define MDB_PIDLOCK 0 296 #define THREAD_RET DWORD 297 #define pthread_t HANDLE 298 #define pthread_mutex_t HANDLE 299 #define pthread_cond_t HANDLE 300 typedef HANDLE mdb_mutex_t, mdb_mutexref_t; 301 #define pthread_key_t DWORD 302 #define pthread_self() GetCurrentThreadId() /* Key creation through TlsAlloc(); the second argument is not used by this mapping. */ 303 #define pthread_key_create(x,y) \ 304 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) 305 #define pthread_key_delete(x) TlsFree(x) 306 #define pthread_getspecific(x) TlsGetValue(x) 307 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ?
0 : ErrCode()) 308 #define pthread_mutex_unlock(x) ReleaseMutex(*x) 309 #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) 310 #define pthread_cond_signal(x) SetEvent(*x) /* Hands the mutex and the event to one SignalObjectAndWait() call, then waits for the mutex again. */ 311 #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) /* Stores the new thread handle in thr; yields 0 when CreateThread() returned a handle, otherwise ErrCode(). */ 312 #define THREAD_CREATE(thr,start,arg) \ 313 (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) /* Waits on the thread handle; a nonzero wait result is reported as ErrCode(). */ 314 #define THREAD_FINISH(thr) \ 315 (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) 316 #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) 317 #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) /* No repair call is made in the Windows mapping; the macro is the constant 0. */ 318 #define mdb_mutex_consistent(mutex) 0 319 #define getpid() GetCurrentProcessId() /* The Win32 flush results are negated, so both macros yield 0 when the call reports success. */ 320 #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) 321 #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) 322 #define ErrCode() GetLastError() 323 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} 324 #define close(fd) (CloseHandle(fd) ?
0 : -1) 325 #define munmap(ptr,len) UnmapViewOfFile(ptr) 326 #ifdef PROCESS_QUERY_LIMITED_INFORMATION 327 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION 328 #else 329 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 330 #endif 331 #define Z "I" /**< printf format modifier for size_t in the Windows build */ 332 #else /* POSIX: the thread helpers map directly onto pthread_create() and pthread_join(). */ 333 #define THREAD_RET void * 334 #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) 335 #define THREAD_FINISH(thr) pthread_join(thr,NULL) 336 #define Z "z" /**< printf format modifier for size_t */ 337 338 /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ 339 #define MDB_PIDLOCK 1 340 341 #ifdef MDB_USE_POSIX_SEM 342 /** With POSIX semaphores both the lock object and its reference are sem_t pointers. */ 343 typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; 344 #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) 345 #define UNLOCK_MUTEX(mutex) sem_post(mutex) 346 /** Wait on a semaphore, retrying for as long as sem_wait() fails with EINTR. * @param[in] sem The semaphore to wait on. * @return 0 on success, otherwise the errno value left by the failing sem_wait() call. */ 347 static int 348 mdb_sem_wait(sem_t *sem) 349 { 350 int rc; /* rc becomes errno only when sem_wait() returned nonzero; any error other than EINTR ends the loop */ 351 while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; 352 return rc; 353 } 354 355 #else /* MDB_USE_POSIX_MUTEX: */ 356 /** Shared mutex/semaphore as the original is stored. 357 * 358 * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. 359 * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it 360 * is array[size 1] so it can be assigned to the pointer. 361 */ 362 typedef pthread_mutex_t mdb_mutex_t[1]; 363 /** Reference to an #mdb_mutex_t */ 364 typedef pthread_mutex_t *mdb_mutexref_t; 365 /** Lock the reader or writer mutex. 366 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). 367 */ 368 #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) 369 /** Unlock the reader or writer mutex. 370 */ 371 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) 372 /** Mark mutex-protected data as repaired, after death of previous owner. 373 */ 374 #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) 375 #endif /* MDB_USE_POSIX_SEM */ 376 377 /** Get the error code for the last failed system function.
378 */ 379 #define ErrCode() errno 380 381 /** An abstraction for a file handle. 382 * On POSIX systems file handles are small integers. On Windows 383 * they're opaque pointers. 384 */ 385 #define HANDLE int 386 387 /** A value for an invalid file handle. 388 * Mainly used to initialize file variables and signify that they are 389 * unused. 390 */ 391 #define INVALID_HANDLE_VALUE (-1) 392 393 /** Get the size of a memory page for the system. 394 * This is the basic size that the platform's memory manager uses, and is 395 * fundamental to the use of memory-mapped files. 396 */ 397 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) 398 #endif 399 /** Bytes reserved for a lock: a 32 byte name buffer on Windows and with POSIX semaphores, otherwise the size of a pthread_mutex_t. */ 400 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 401 #define MNAME_LEN 32 402 #else 403 #define MNAME_LEN (sizeof(pthread_mutex_t)) 404 #endif 405 406 /** @} */ 407 408 #ifdef MDB_ROBUST_SUPPORTED 409 /** Lock mutex, handle any error, set rc = result. 410 * Return 0 on success, nonzero (not rc) on error. 411 * mdb_mutex_failed() is only called when LOCK_MUTEX0() returned nonzero. 411 */ 412 #define LOCK_MUTEX(rc, env, mutex) \ 413 (((rc) = LOCK_MUTEX0(mutex)) && \ 414 ((rc) = mdb_mutex_failed(env, mutex, rc))) /* Forward declaration, needed because LOCK_MUTEX() expands to a call of this function. */ 415 static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); 416 #else /* Without robust mutex support the raw LOCK_MUTEX0() result is used as is. */ 417 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) 418 #define mdb_mutex_failed(env, mutex, rc) (rc) 419 #endif 420 421 #ifndef _WIN32 422 /** A flag for opening a file and requesting synchronous data writes. 423 * This is only used when writing a meta page. It's not strictly needed; 424 * we could just do a normal write and then immediately perform a flush. 425 * But if this flag is available it saves us an extra system call. 426 * 427 * @note If O_DSYNC is undefined but exists in /usr/include, 428 * preferably set some compiler flag to get the definition. 429 * Falls back to O_SYNC when O_DSYNC is not defined. 429 */ 430 #ifndef MDB_DSYNC 431 # ifdef O_DSYNC 432 # define MDB_DSYNC O_DSYNC 433 # else 434 # define MDB_DSYNC O_SYNC 435 # endif 436 #endif 437 #endif 438 439 /** Function for flushing the data of a file.
Define this to fsync 440 * if fdatasync() is not supported. 441 */ 442 #ifndef MDB_FDATASYNC 443 # define MDB_FDATASYNC fdatasync 444 #endif 445 /** Function for flushing a mapped memory range; defaults to msync() unless already defined. */ 446 #ifndef MDB_MSYNC 447 # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) 448 #endif 449 /* Fallback values used when no definition of the msync() flags is in scope. */ 450 #ifndef MS_SYNC 451 #define MS_SYNC 1 452 #endif 453 454 #ifndef MS_ASYNC 455 #define MS_ASYNC 0 456 #endif 457 458 /** A page number in the database. 459 * Note that 64 bit page numbers are overkill, since pages themselves 460 * already represent 12-13 bits of addressable memory, and the OS will 461 * always limit applications to a maximum of 63 bits of address space. 462 * 463 * @note In the #MDB_node structure, we only store 48 bits of this value, 464 * which thus limits us to only 60 bits of addressable data. 465 */ 466 typedef MDB_ID pgno_t; 467 468 /** A transaction ID. 469 * See struct MDB_txn.mt_txnid for details. 470 */ 471 typedef MDB_ID txnid_t; 472 473 /** @defgroup debug Debug Macros 474 * @{ 475 */ 476 #ifndef MDB_DEBUG 477 /** Enable debug output. Needs variable argument macros (a C99 feature). 478 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs 479 * read from and written to the database (used for free space management). 480 */ 481 #define MDB_DEBUG 0 482 #endif 483 /** Bit flags tested against mdb_debug by DPRINTF() and MDB_TRACE() respectively. */ 484 #define MDB_DBG_INFO 1 485 #define MDB_DBG_TRACE 2 486 487 #if MDB_DEBUG /** Mask of enabled debug output. Initially only #MDB_DBG_TRACE is set, so DPRINTF() prints nothing until #MDB_DBG_INFO is added to the mask. */ 488 static int mdb_debug = MDB_DBG_TRACE; 489 static txnid_t mdb_debug_start; 490 491 /** Print a debug message with printf formatting. 492 * Requires double parentheses around 2 or more args. 493 */ 494 # define DPRINTF(args) ((void) ((mdb_debug & MDB_DBG_INFO) && DPRINTF0 args)) 495 # define DPRINTF0(fmt, ...) \ 496 fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) 497 /** Trace info for replaying */ 498 # define MDB_TRACE(args) ((void) ((mdb_debug & MDB_DBG_TRACE) && DPRINTF1 args)) 499 # define DPRINTF1(fmt, ...)
\ 500 fprintf(stderr, ">%d:%s: " fmt "\n", getpid(), mdb_func_, __VA_ARGS__) 501 #else /* MDB_DEBUG is 0: both macros expand to a no-op expression */ 502 # define DPRINTF(args) ((void) 0) 503 # define MDB_TRACE(args) ((void) 0) 504 #endif 505 /** Print a debug string. 506 * The string is printed literally, with no format processing. 507 */ 508 #define DPUTS(arg) DPRINTF(("%s", arg)) 509 /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ 510 #define DDBI(mc) \ 511 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) 512 /** @} */ 513 514 /** @brief The maximum size of a database page. 515 * 516 * It is 32k or 64k, since value-PAGEBASE must fit in 517 * #MDB_page.%mp_upper. The 64k limit applies only when #PAGEBASE is nonzero. 518 * 519 * LMDB will use database pages < OS pages if needed. 520 * That causes more I/O in write transactions: The OS must 521 * know (read) the whole page before writing a partial page. 522 * 523 * Note that we don't currently support Huge pages. On Linux, 524 * regular data files cannot use Huge pages, and in general 525 * Huge pages aren't actually pageable. We rely on the OS 526 * demand-pager to read our data and page it out when memory 527 * pressure from other processes is high. So until OSs have 528 * actual paging support for Huge pages, they're not viable. 529 */ 530 #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) 531 532 /** The minimum number of keys required in a database page. 533 * Setting this to a larger value will place a smaller bound on the 534 * maximum size of a data item. Data items larger than this size will 535 * be pushed into overflow pages instead of being stored directly in 536 * the B-tree node. This value used to default to 4. With a page size 537 * of 4096 bytes that meant that any item larger than 1024 bytes would 538 * go into an overflow page. That also meant that on average 2-3KB of 539 * each overflow page was wasted space. The value cannot be lower than 540 * 2 because then there would no longer be a tree structure.
With this 541 * value, items larger than 2KB will go into overflow pages, and on 542 * average only 1KB will be wasted. 543 */ 544 #define MDB_MINKEYS 2 545 546 /** A stamp that identifies a file as an LMDB file. 547 * There's nothing special about this value other than that it is easily 548 * recognizable, and it will reflect any byte order mismatches. 549 */ 550 #define MDB_MAGIC 0xBEEFC0DE 551 552 /** The version number for a database's datafile format: 999 when #MDB_DEVEL is nonzero, otherwise 1. */ 553 #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) 554 /** The version number for a database's lockfile format. */ 555 #define MDB_LOCK_VERSION 1 556 557 /** @brief The max size of a key we can write, or 0 for computed max. 558 * 559 * This macro should normally be left alone or set to 0. 560 * Note that a database with big keys or dupsort data cannot be 561 * reliably modified by a liblmdb which uses a smaller max. 562 * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. 563 * 564 * Other values are allowed, for backwards compat. However: 565 * A value bigger than the computed max can break if you do not 566 * know what you are doing, and liblmdb <= 0.9.10 can break when 567 * modifying a DB with keys/dupsort data bigger than its max. 568 * 569 * Data items in an #MDB_DUPSORT database are also limited to 570 * this size, since they're actually keys of a sub-DB. Keys and 571 * #MDB_DUPSORT data items must fit on a node in a regular page. 572 */ 573 #ifndef MDB_MAXKEYSIZE 574 #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) 575 #endif 576 577 /** The maximum size of a key we can write to the environment: the compile-time #MDB_MAXKEYSIZE when that is nonzero, otherwise the me_maxkey field of \b env. */ 578 #if MDB_MAXKEYSIZE 579 #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) 580 #else 581 #define ENV_MAXKEY(env) ((env)->me_maxkey) 582 #endif 583 584 /** @brief The maximum size of a data item. 585 * 586 * We only store a 32 bit value for node sizes. 587 */ 588 #define MAXDATASIZE 0xffffffffUL 589 590 #if MDB_DEBUG 591 /** Key size which fits in a #DKBUF.
592 * @ingroup debug 593 */ 594 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) 595 /** A key buffer. 596 * @ingroup debug 597 * This is used for printing a hex dump of a key's contents. 598 * It is sized for two characters per key byte plus one extra byte. 598 */ 599 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] 600 /** A data value buffer. 601 * @ingroup debug 602 * This is used for printing a hex dump of a #MDB_DUPSORT value's contents. 603 */ 604 #define DDBUF char dbuf[DKBUF_MAXKEYSIZE*2+1+2] 605 /** Display a key in hex. 606 * @ingroup debug 607 * Invoke a function to display a key in hex. 608 * Expands to a use of kbuf, so #DKBUF must be declared in the same scope. 608 */ 609 #define DKEY(x) mdb_dkey(x, kbuf) 610 #else /* MDB_DEBUG is 0: the buffer macros expand to nothing and DKEY() to 0 */ 611 #define DKBUF 612 #define DDBUF 613 #define DKEY(x) 0 614 #endif 615 616 /** An invalid page number. 617 * Mainly used to denote an empty tree. 618 */ 619 #define P_INVALID (~(pgno_t)0) 620 621 /** Test if all of the flags \b f are set in a flag word \b w. */ 622 #define F_ISSET(w, f) (((w) & (f)) == (f)) 623 624 /** Round \b n up to an even number. */ 625 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ 626 627 /** Used for offsets within a single page. 628 * Since memory pages are typically 4 or 8KB in size, 12-13 bits, 629 * this is plenty. 630 */ 631 typedef uint16_t indx_t; 632 633 /** Default size of memory map. 634 * This is certainly too small for any actual applications. Apps should always set 635 * the size explicitly using #mdb_env_set_mapsize(). 636 */ 637 #define DEFAULT_MAPSIZE 1048576 638 639 /** @defgroup readers Reader Lock Table 640 * Readers don't acquire any locks for their data access. Instead, they 641 * simply record their transaction ID in the reader table. The reader 642 * mutex is needed just to find an empty slot in the reader table. The 643 * slot's address is saved in thread-specific data so that subsequent read 644 * transactions started by the same thread need no further locking to proceed. 645 * 646 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
647 * 648 * No reader table is used if the database is on a read-only filesystem, or 649 * if #MDB_NOLOCK is set. 650 * 651 * Since the database uses multi-version concurrency control, readers don't 652 * actually need any locking. This table is used to keep track of which 653 * readers are using data from which old transactions, so that we'll know 654 * when a particular old transaction is no longer in use. Old transactions 655 * that have discarded any data pages can then have those pages reclaimed 656 * for use by a later write transaction. 657 * 658 * The lock table is constructed such that reader slots are aligned with the 659 * processor's cache line size. Any slot is only ever used by one thread. 660 * This alignment guarantees that there will be no contention or cache 661 * thrashing as threads update their own slot info, and also eliminates 662 * any need for locking when accessing a slot. 663 * 664 * A writer thread will scan every slot in the table to determine the oldest 665 * outstanding reader transaction. Any freed pages older than this will be 666 * reclaimed by the writer. The writer doesn't use any locks when scanning 667 * this table. This means that there's no guarantee that the writer will 668 * see the most up-to-date reader info, but that's not required for correct 669 * operation - all we need is to know the upper bound on the oldest reader, 670 * we don't care at all about the newest reader. So the only consequence of 671 * reading stale information here is that old pages might hang around a 672 * while longer before being reclaimed. That's actually good anyway, because 673 * the longer we delay reclaiming old pages, the more likely it is that a 674 * string of contiguous pages can be found after coalescing old pages from 675 * many old transactions together. 676 * @{ 677 */ 678 /** Number of slots in the reader table. 679 * This value was chosen somewhat arbitrarily. 
126 readers plus a 680 * couple mutexes fit exactly into 8KB on my development machine. 681 * Applications should set the table size using #mdb_env_set_maxreaders(). 682 */ 683 #define DEFAULT_READERS 126 684 685 /** The size of a CPU cache line in bytes. We want our lock structures 686 * aligned to this size to avoid false cache line sharing in the 687 * lock table. 688 * This value works for most CPUs. For Itanium this should be 128. 689 * It can be overridden by defining CACHELINE at compile time. 689 */ 690 #ifndef CACHELINE 691 #define CACHELINE 64 692 #endif 693 694 /** The information we store in a single slot of the reader table. 695 * In addition to a transaction ID, we also record the process and 696 * thread ID that owns a slot, so that we can detect stale information, 697 * e.g. threads or processes that went away without cleaning up. 698 * @note We currently don't check for stale records. We simply re-init 699 * the table when we know that we're the only process opening the 700 * lock file. 701 * All three fields are declared volatile. 701 */ 702 typedef struct MDB_rxbody { 703 /** Current Transaction ID when this transaction began, or (txnid_t)-1. 704 * Multiple readers that start at the same time will probably have the 705 * same ID here. Again, it's not important to exclude them from 706 * anything; all we need to know is which version of the DB they 707 * started from so we can avoid overwriting any data used in that 708 * particular version. 709 */ 710 volatile txnid_t mrb_txnid; 711 /** The process ID of the process owning this reader txn. */ 712 volatile MDB_PID_T mrb_pid; 713 /** The thread ID of the thread owning this txn. */ 714 volatile MDB_THR_T mrb_tid; 715 } MDB_rxbody; 716 717 /** The actual reader record, with cacheline padding.
*/ 718 typedef struct MDB_reader { 719 union { /** the reader slot contents */ 720 MDB_rxbody mrx; 721 /** shorthands for the #MDB_rxbody fields inside the union */ 722 #define mr_txnid mru.mrx.mrb_txnid 723 #define mr_pid mru.mrx.mrb_pid 724 #define mr_tid mru.mrx.mrb_tid 725 /** cache line alignment: rounds the record size up to a multiple of #CACHELINE */ 726 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; 727 } mru; 728 } MDB_reader; 729 730 /** The header for the reader table. 731 * The table resides in a memory-mapped file. (This is a different file 732 * than is used for the main database.) 733 * 734 * For POSIX the actual mutexes reside in the shared memory of this 735 * mapped file. On Windows, mutexes are named objects allocated by the 736 * kernel; we store the mutex names in this mapped file so that other 737 * processes can grab them. This same approach is also used on 738 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support 739 * process-shared POSIX mutexes. For these cases where a named object 740 * is used, the object name is derived from a 64 bit FNV hash of the 741 * environment pathname. As such, naming collisions are extremely 742 * unlikely. If a collision occurs, the results are unpredictable. 743 */ 744 typedef struct MDB_txbody { 745 /** Stamp identifying this as an LMDB file. It must be set 746 * to #MDB_MAGIC. */ 747 uint32_t mtb_magic; 748 /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ 749 uint32_t mtb_format; 750 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) /** Name of the named lock object used for this table, #MNAME_LEN bytes. */ 751 char mtb_rmname[MNAME_LEN]; 752 #else 753 /** Mutex protecting access to this table. 754 * This is the reader table lock used with LOCK_MUTEX(). 755 */ 756 mdb_mutex_t mtb_rmutex; 757 #endif 758 /** The ID of the last transaction committed to the database. 759 * This is recorded here only for convenience; the value can always 760 * be determined by reading the main database meta pages. 761 */ 762 volatile txnid_t mtb_txnid; 763 /** The number of slots that have been used in the reader table.
764 * This always records the maximum count, it is not decremented 765 * when readers release their slots. 766 */ 767 volatile unsigned mtb_numreaders; 768 } MDB_txbody; 769 770 /** The actual reader table definition. */ 771 typedef struct MDB_txninfo { 772 union { /** table header; the union pads it to a multiple of #CACHELINE bytes */ 773 MDB_txbody mtb; /** shorthands for the #MDB_txbody fields */ 774 #define mti_magic mt1.mtb.mtb_magic 775 #define mti_format mt1.mtb.mtb_format 776 #define mti_rmutex mt1.mtb.mtb_rmutex 777 #define mti_rmname mt1.mtb.mtb_rmname 778 #define mti_txnid mt1.mtb.mtb_txnid 779 #define mti_numreaders mt1.mtb.mtb_numreaders 780 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; 781 } mt1; /** second lock object, stored as a name or as a mutex (presumably the writer lock, judging by the field names); padded to a multiple of #CACHELINE bytes */ 782 union { 783 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 784 char mt2_wmname[MNAME_LEN]; 785 #define mti_wmname mt2.mt2_wmname 786 #else 787 mdb_mutex_t mt2_wmutex; 788 #define mti_wmutex mt2.mt2_wmutex 789 #endif 790 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; 791 } mt2; /** reader slots; declared here with a single element */ 792 MDB_reader mti_readers[1]; 793 } MDB_txninfo; 794 795 /** Lockfile format signature: version, features and field layout. * The version sits in the low bits and bit 16 is set when #MDB_PIDLOCK is nonzero. */ 796 #define MDB_LOCK_FORMAT \ 797 ((uint32_t) \ 798 ((MDB_LOCK_VERSION) \ 799 /* Flags which describe functionality */ \ 800 + (((MDB_PIDLOCK) != 0) << 16))) 801 /** @} */ 802 803 /** Common header for all page types. The page type depends on #mp_flags. 804 * 805 * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with 806 * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages 807 * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. 808 * 809 * #P_OVERFLOW records occupy one or more contiguous pages where only the 810 * first has a page header. They hold the real data of #F_BIGDATA nodes. 811 * 812 * #P_SUBP sub-pages are small leaf "pages" with duplicate data. 813 * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. 814 * (Duplicate data can also go in sub-databases, which use normal pages.) 815 * 816 * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
817 * 818 * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once 819 * in the snapshot: Either used by a database or listed in a freeDB record. 820 */ 821 typedef struct MDB_page { /** shorthands for the members of the mp_p union */ 822 #define mp_pgno mp_p.p_pgno 823 #define mp_next mp_p.p_next 824 union { 825 pgno_t p_pgno; /**< page number */ 826 struct MDB_page *p_next; /**< for in-memory list of freed pages */ 827 } mp_p; 828 uint16_t mp_pad; /**< key size if this is a LEAF2 page */ 829 /** @defgroup mdb_page Page Flags 830 * @ingroup internal 831 * Flags for the page headers. 832 * @{ 833 */ 834 #define P_BRANCH 0x01 /**< branch page */ 835 #define P_LEAF 0x02 /**< leaf page */ 836 #define P_OVERFLOW 0x04 /**< overflow page */ 837 #define P_META 0x08 /**< meta page */ 838 #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ 839 #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ 840 #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ 841 #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ 842 #define P_KEEP 0x8000 /**< leave this page alone during spill */ 843 /** @} */ 844 uint16_t mp_flags; /**< @ref mdb_page */ /** shorthands for the members of the mp_pb union */ 845 #define mp_lower mp_pb.pb.pb_lower 846 #define mp_upper mp_pb.pb.pb_upper 847 #define mp_pages mp_pb.pb_pages 848 union { 849 struct { 850 indx_t pb_lower; /**< lower bound of free space */ 851 indx_t pb_upper; /**< upper bound of free space */ 852 } pb; 853 uint32_t pb_pages; /**< number of overflow pages */ 854 } mp_pb; 855 indx_t mp_ptrs[0]; /**< dynamic size */ 856 } MDB_page; 857 858 /** Alternate page header, for 2-byte aligned access. * Every member is a 16-bit type or an array of 16-bit words. */ 859 typedef struct MDB_page2 { 860 uint16_t mp2_p[sizeof(pgno_t)/2]; /**< page number, held as 16-bit words */ 861 uint16_t mp2_pad; /**< counterpart of #MDB_page.mp_pad */ 862 uint16_t mp2_flags; /**< counterpart of #MDB_page.mp_flags */ 863 indx_t mp2_lower; /**< counterpart of mp_lower */ 864 indx_t mp2_upper; /**< counterpart of mp_upper */ 865 indx_t mp2_ptrs[0]; /**< counterpart of mp_ptrs, dynamic size */ 866 } MDB_page2; 867 /** Accessors that reach the page header fields of \b p through the #MDB_page2 layout. */ 868 #define MP_PGNO(p) (((MDB_page2 *)(void *)(p))->mp2_p) 869 #define MP_PAD(p) (((MDB_page2 *)(void *)(p))->mp2_pad) 870 #define MP_FLAGS(p) (((MDB_page2 *)(void *)(p))->mp2_flags) 871 #define
MP_LOWER(p) (((MDB_page2 *)(void *)(p))->mp2_lower) 872 #define MP_UPPER(p) (((MDB_page2 *)(void *)(p))->mp2_upper) 873 #define MP_PTRS(p) (((MDB_page2 *)(void *)(p))->mp2_ptrs) 874 875 /** Size of the page header, excluding dynamic data at the end */ 876 #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) 877 878 /** Address of first usable data byte in a page, after the header */ 879 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) 880 881 /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ 882 #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) 883 884 /** Number of nodes on a page */ 885 #define NUMKEYS(p) ((MP_LOWER(p) - (PAGEHDRSZ-PAGEBASE)) >> 1) 886 887 /** The amount of space remaining in the page */ 888 #define SIZELEFT(p) (indx_t)(MP_UPPER(p) - MP_LOWER(p)) 889 890 /** The percentage of space used in the page, in tenths of a percent. */ 891 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ 892 ((env)->me_psize - PAGEHDRSZ)) 893 /** The minimum page fill factor, in tenths of a percent. 894 * Pages emptier than this are candidates for merging. 895 */ 896 #define FILL_THRESHOLD 250 897 898 /** Test if a page is a leaf page */ 899 #define IS_LEAF(p) F_ISSET(MP_FLAGS(p), P_LEAF) 900 /** Test if a page is a LEAF2 page */ 901 #define IS_LEAF2(p) F_ISSET(MP_FLAGS(p), P_LEAF2) 902 /** Test if a page is a branch page */ 903 #define IS_BRANCH(p) F_ISSET(MP_FLAGS(p), P_BRANCH) 904 /** Test if a page is an overflow page */ 905 #define IS_OVERFLOW(p) F_ISSET(MP_FLAGS(p), P_OVERFLOW) 906 /** Test if a page is a sub page */ 907 #define IS_SUBP(p) F_ISSET(MP_FLAGS(p), P_SUBP) 908 909 /** The number of overflow pages needed to store the given size. */ 910 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) 911 912 /** Link in #MDB_txn.%mt_loose_pgs list. 913 * Kept outside the page header, which is needed when reusing the page. 
914 */ 915 #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) 916 917 /** Header for a single key/data pair within a page. 918 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. 919 * We guarantee 2-byte alignment for 'MDB_node's. 920 * 921 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child 922 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used 923 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte 924 * order in case some accesses can be optimized to 32-bit word access. 925 * 926 * Leaf node flags describe node contents. #F_BIGDATA says the node's 927 * data part is the page number of an overflow page with actual data. 928 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in 929 * a sub-page/sub-database, and named databases (just #F_SUBDATA). 930 */ 931 typedef struct MDB_node { 932 /** part of data size or pgno 933 * @{ */ 934 #if BYTE_ORDER == LITTLE_ENDIAN 935 unsigned short mn_lo, mn_hi; 936 #else 937 unsigned short mn_hi, mn_lo; 938 #endif 939 /** @} */ 940 /** @defgroup mdb_node Node Flags 941 * @ingroup internal 942 * Flags for node headers. 943 * @{ 944 */ 945 #define F_BIGDATA 0x01 /**< data put on overflow page */ 946 #define F_SUBDATA 0x02 /**< data is a sub-database */ 947 #define F_DUPDATA 0x04 /**< data has duplicates */ 948 949 /** valid flags for #mdb_node_add() */ 950 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) 951 952 /** @} */ 953 unsigned short mn_flags; /**< @ref mdb_node */ 954 unsigned short mn_ksize; /**< key size */ 955 char mn_data[1]; /**< key and data are appended here */ 956 } MDB_node; 957 958 /** Size of the node header, excluding dynamic data at the end */ 959 #define NODESIZE offsetof(MDB_node, mn_data) 960 961 /** Bit position of top word in page number, for shifting mn_flags */ 962 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) 963 964 /** Size of a node in a branch page with a given key. 
965 * This is just the node header plus the key, there is no data. 966 */ 967 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) 968 969 /** Size of a node in a leaf page with a given key and data. 970 * This is node header plus key plus data size. 971 */ 972 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) 973 974 /** Address of node \b i in page \b p */ 975 #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + MP_PTRS(p)[i] + PAGEBASE)) 976 977 /** Address of the key for the node */ 978 #define NODEKEY(node) (void *)((node)->mn_data) 979 980 /** Address of the data for a node */ 981 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) 982 983 /** Get the page number pointed to by a branch node */ 984 #define NODEPGNO(node) \ 985 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ 986 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) 987 /** Set the page number in a branch node */ 988 #define SETPGNO(node,pgno) do { \ 989 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ 990 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) 991 992 /** Get the size of the data in a leaf node */ 993 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) 994 /** Set the size of the data for a leaf node */ 995 #define SETDSZ(node,size) do { \ 996 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) 997 /** The size of a key in a node */ 998 #define NODEKSZ(node) ((node)->mn_ksize) 999 1000 /** Copy a page number from src to dst */ 1001 #ifdef MISALIGNED_OK 1002 #define COPY_PGNO(dst,src) dst = src 1003 #undef MP_PGNO 1004 #define MP_PGNO(p) ((p)->mp_pgno) 1005 #else 1006 #if SIZE_MAX > 4294967295UL 1007 #define COPY_PGNO(dst,src) do { \ 1008 unsigned short *s, *d; \ 1009 s = (unsigned short *)&(src); \ 1010 d = (unsigned short *)&(dst); \ 1011 *d++ = *s++; \ 1012 *d++ = *s++; \ 1013 *d++ = *s++; \ 1014 *d = *s; \ 1015 } while (0) 1016 #else 1017 
#define COPY_PGNO(dst,src) do { \ 1018 unsigned short *s, *d; \ 1019 s = (unsigned short *)&(src); \ 1020 d = (unsigned short *)&(dst); \ 1021 *d++ = *s++; \ 1022 *d = *s; \ 1023 } while (0) 1024 #endif 1025 #endif 1026 /** The address of a key in a LEAF2 page. 1027 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. 1028 * There are no node headers, keys are stored contiguously. 1029 */ 1030 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) 1031 1032 /** Set the \b node's key into \b keyptr, if requested. */ 1033 #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ 1034 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } 1035 1036 /** Set the \b node's key into \b key. */ 1037 #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } 1038 1039 /** Information about a single database in the environment. */ 1040 typedef struct MDB_db { 1041 uint32_t md_pad; /**< also ksize for LEAF2 pages */ 1042 uint16_t md_flags; /**< @ref mdb_dbi_open */ 1043 uint16_t md_depth; /**< depth of this tree */ 1044 pgno_t md_branch_pages; /**< number of internal pages */ 1045 pgno_t md_leaf_pages; /**< number of leaf pages */ 1046 pgno_t md_overflow_pages; /**< number of overflow pages */ 1047 size_t md_entries; /**< number of data items */ 1048 pgno_t md_root; /**< the root page of this tree */ 1049 } MDB_db; 1050 1051 #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ 1052 #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) 1053 /** #mdb_dbi_open() flags */ 1054 #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ 1055 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) 1056 1057 /** Handle for the DB used to track free pages. */ 1058 #define FREE_DBI 0 1059 /** Handle for the default DB. 
*/ 1060 #define MAIN_DBI 1 1061 /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ 1062 #define CORE_DBS 2 1063 1064 /** Number of meta pages - also hardcoded elsewhere */ 1065 #define NUM_METAS 2 1066 1067 /** Meta page content. 1068 * A meta page is the start point for accessing a database snapshot. 1069 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). 1070 */ 1071 typedef struct MDB_meta { 1072 /** Stamp identifying this as an LMDB file. It must be set 1073 * to #MDB_MAGIC. */ 1074 uint32_t mm_magic; 1075 /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ 1076 uint32_t mm_version; 1077 void *mm_address; /**< address for fixed mapping */ 1078 size_t mm_mapsize; /**< size of mmap region */ 1079 MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ 1080 /** The size of pages used in this DB */ 1081 #define mm_psize mm_dbs[FREE_DBI].md_pad 1082 /** Any persistent environment flags. @ref mdb_env */ 1083 #define mm_flags mm_dbs[FREE_DBI].md_flags 1084 /** Last used page in the datafile. 1085 * Actually the file may be shorter if the freeDB lists the final pages. 1086 */ 1087 pgno_t mm_last_pg; 1088 volatile txnid_t mm_txnid; /**< txnid that committed this page */ 1089 } MDB_meta; 1090 1091 /** Buffer for a stack-allocated meta page. 1092 * The members define size and alignment, and silence type 1093 * aliasing warnings. They are not used directly; that could 1094 * mean incorrectly using several union members in parallel. 1095 */ 1096 typedef union MDB_metabuf { 1097 MDB_page mb_page; 1098 struct { 1099 char mm_pad[PAGEHDRSZ]; 1100 MDB_meta mm_meta; 1101 } mb_metabuf; 1102 } MDB_metabuf; 1103 1104 /** Auxiliary DB info. 1105 * The information here is mostly static/read-only. There is 1106 * only a single copy of this record in the environment. 
1107 */ 1108 typedef struct MDB_dbx { 1109 MDB_val md_name; /**< name of the database */ 1110 MDB_cmp_func *md_cmp; /**< function for comparing keys */ 1111 MDB_cmp_func *md_dcmp; /**< function for comparing data items */ 1112 MDB_rel_func *md_rel; /**< user relocate function */ 1113 void *md_relctx; /**< user-provided context for md_rel */ 1114 } MDB_dbx; 1115 1116 /** A database transaction. 1117 * Every operation requires a transaction handle. 1118 */ 1119 struct MDB_txn { 1120 MDB_txn *mt_parent; /**< parent of a nested txn */ 1121 /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ 1122 MDB_txn *mt_child; 1123 pgno_t mt_next_pgno; /**< next unallocated page */ 1124 /** The ID of this transaction. IDs are integers incrementing from 1. 1125 * Only committed write transactions increment the ID. If a transaction 1126 * aborts, the ID may be re-used by the next writer. 1127 */ 1128 txnid_t mt_txnid; 1129 MDB_env *mt_env; /**< the DB environment */ 1130 /** The list of pages that became unused during this transaction. 1131 */ 1132 MDB_IDL mt_free_pgs; 1133 /** The list of loose pages that became unused and may be reused 1134 * in this transaction, linked through #NEXT_LOOSE_PAGE(page). 1135 */ 1136 MDB_page *mt_loose_pgs; 1137 /** Number of loose pages (#mt_loose_pgs) */ 1138 int mt_loose_count; 1139 /** The sorted list of dirty pages we temporarily wrote to disk 1140 * because the dirty list was full. page numbers in here are 1141 * shifted left by 1, deleted slots have the LSB set. 1142 */ 1143 MDB_IDL mt_spill_pgs; 1144 union { 1145 /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ 1146 MDB_ID2L dirty_list; 1147 /** For read txns: This thread/txn's reader table slot, or NULL. */ 1148 MDB_reader *reader; 1149 } mt_u; 1150 /** Array of records for each DB known in the environment. 
*/ 1151 MDB_dbx *mt_dbxs; 1152 /** Array of MDB_db records for each known DB */ 1153 MDB_db *mt_dbs; 1154 /** Array of sequence numbers for each DB handle */ 1155 unsigned int *mt_dbiseqs; 1156 /** @defgroup mt_dbflag Transaction DB Flags 1157 * @ingroup internal 1158 * @{ 1159 */ 1160 #define DB_DIRTY 0x01 /**< DB was written in this txn */ 1161 #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ 1162 #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ 1163 #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ 1164 #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ 1165 #define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ 1166 /** @} */ 1167 /** In write txns, array of cursors for each DB */ 1168 MDB_cursor **mt_cursors; 1169 /** Array of flags for each DB */ 1170 unsigned char *mt_dbflags; 1171 /** Number of DB records in use, or 0 when the txn is finished. 1172 * This number only ever increments until the txn finishes; we 1173 * don't decrement it when individual DB handles are closed. 
1174 */ 1175 MDB_dbi mt_numdbs; 1176 1177 /** @defgroup mdb_txn Transaction Flags 1178 * @ingroup internal 1179 * @{ 1180 */ 1181 /** #mdb_txn_begin() flags */ 1182 #define MDB_TXN_BEGIN_FLAGS MDB_RDONLY 1183 #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ 1184 /* internal txn flags */ 1185 #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ 1186 #define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ 1187 #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ 1188 #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ 1189 #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ 1190 #define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ 1191 /** most operations on the txn are currently illegal */ 1192 #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) 1193 /** @} */ 1194 unsigned int mt_flags; /**< @ref mdb_txn */ 1195 /** #dirty_list room: Array size - \#dirty pages visible to this txn. 1196 * Includes ancestor txns' dirty pages not hidden by other txns' 1197 * dirty/spilled pages. Thus commit(nested txn) has room to merge 1198 * dirty_list into mt_parent after freeing hidden mt_parent pages. 1199 */ 1200 unsigned int mt_dirty_room; 1201 }; 1202 1203 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. 1204 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to 1205 * raise this on a 64 bit machine. 1206 */ 1207 #define CURSOR_STACK 32 1208 1209 struct MDB_xcursor; 1210 1211 /** Cursors are used for all DB operations. 1212 * A cursor holds a path of (page pointer, key index) from the DB 1213 * root to a position in the DB, plus other state. #MDB_DUPSORT 1214 * cursors include an xcursor to the current data item. Write txns 1215 * track their cursors and keep them up to date when data moves. 1216 * Exception: An xcursor's pointer to a #P_SUBP page can be stale. 
1217 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 1218 */ 1219 struct MDB_cursor { 1220 /** Next cursor on this DB in this txn */ 1221 MDB_cursor *mc_next; 1222 /** Backup of the original cursor if this cursor is a shadow */ 1223 MDB_cursor *mc_backup; 1224 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ 1225 struct MDB_xcursor *mc_xcursor; 1226 /** The transaction that owns this cursor */ 1227 MDB_txn *mc_txn; 1228 /** The database handle this cursor operates on */ 1229 MDB_dbi mc_dbi; 1230 /** The database record for this cursor */ 1231 MDB_db *mc_db; 1232 /** The database auxiliary record for this cursor */ 1233 MDB_dbx *mc_dbx; 1234 /** The @ref mt_dbflag for this database */ 1235 unsigned char *mc_dbflag; 1236 unsigned short mc_snum; /**< number of pushed pages */ 1237 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ 1238 /** @defgroup mdb_cursor Cursor Flags 1239 * @ingroup internal 1240 * Cursor state flags. 1241 * @{ 1242 */ 1243 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ 1244 #define C_EOF 0x02 /**< No more data */ 1245 #define C_SUB 0x04 /**< Cursor is a sub-cursor */ 1246 #define C_DEL 0x08 /**< last op was a cursor_del */ 1247 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ 1248 /** @} */ 1249 unsigned int mc_flags; /**< @ref mdb_cursor */ 1250 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ 1251 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ 1252 }; 1253 1254 /** Context for sorted-dup records. 1255 * We could have gone to a fully recursive design, with arbitrarily 1256 * deep nesting of sub-databases. But for now we only handle these 1257 * levels - main DB, optional sub-DB, sorted-duplicate DB. 
1258 */ 1259 typedef struct MDB_xcursor { 1260 /** A sub-cursor for traversing the Dup DB */ 1261 MDB_cursor mx_cursor; 1262 /** The database record for this Dup DB */ 1263 MDB_db mx_db; 1264 /** The auxiliary DB record for this Dup DB */ 1265 MDB_dbx mx_dbx; 1266 /** The @ref mt_dbflag for this Dup DB */ 1267 unsigned char mx_dbflag; 1268 } MDB_xcursor; 1269 1270 /** Check if there is an inited xcursor */ 1271 #define XCURSOR_INITED(mc) \ 1272 ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 1273 1274 /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed 1275 * when the node which contains the sub-page may have moved. Called 1276 * with leaf page \b mp = mc->mc_pg[\b top]. 1277 */ 1278 #define XCURSOR_REFRESH(mc, top, mp) do { \ 1279 MDB_page *xr_pg = (mp); \ 1280 MDB_node *xr_node; \ 1281 if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ 1282 xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ 1283 if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ 1284 (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ 1285 } while (0) 1286 1287 /** State of FreeDB old pages, stored in the MDB_env */ 1288 typedef struct MDB_pgstate { 1289 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ 1290 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ 1291 } MDB_pgstate; 1292 1293 /** The database environment. */ 1294 struct MDB_env { 1295 HANDLE me_fd; /**< The main data file */ 1296 HANDLE me_lfd; /**< The lock file */ 1297 HANDLE me_mfd; /**< For writing and syncing the meta pages */ 1298 /** Failed to update the meta page. Probably an I/O error. */ 1299 #define MDB_FATAL_ERROR 0x80000000U 1300 /** Some fields are initialized. 
*/ 1301 #define MDB_ENV_ACTIVE 0x20000000U 1302 /** me_txkey is set */ 1303 #define MDB_ENV_TXKEY 0x10000000U 1304 /** fdatasync is unreliable */ 1305 #define MDB_FSYNCONLY 0x08000000U 1306 uint32_t me_flags; /**< @ref mdb_env */ 1307 unsigned int me_psize; /**< DB page size, inited from me_os_psize */ 1308 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ 1309 unsigned int me_maxreaders; /**< size of the reader table */ 1310 /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ 1311 volatile int me_close_readers; 1312 MDB_dbi me_numdbs; /**< number of DBs opened */ 1313 MDB_dbi me_maxdbs; /**< size of the DB table */ 1314 MDB_PID_T me_pid; /**< process ID of this env */ 1315 char *me_path; /**< path to the DB files */ 1316 char *me_map; /**< the memory map of the data file */ 1317 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ 1318 MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ 1319 void *me_pbuf; /**< scratch area for DUPSORT put() */ 1320 MDB_txn *me_txn; /**< current write transaction */ 1321 MDB_txn *me_txn0; /**< prealloc'd write transaction */ 1322 size_t me_mapsize; /**< size of the data memory map */ 1323 off_t me_size; /**< current file size */ 1324 pgno_t me_maxpg; /**< me_mapsize / me_psize */ 1325 MDB_dbx *me_dbxs; /**< array of static DB info */ 1326 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ 1327 unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ 1328 pthread_key_t me_txkey; /**< thread-key for readers */ 1329 txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ 1330 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ 1331 # define me_pglast me_pgstate.mf_pglast 1332 # define me_pghead me_pgstate.mf_pghead 1333 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ 1334 /** IDL of pages that became unused in a write txn */ 1335 MDB_IDL me_free_pgs; 1336 /** ID2L of pages written during a write txn. 
Length MDB_IDL_UM_SIZE. */ 1337 MDB_ID2L me_dirty_list; 1338 /** Max number of freelist items that can fit in a single overflow page */ 1339 int me_maxfree_1pg; 1340 /** Max size of a node on a page */ 1341 unsigned int me_nodemax; 1342 #if !(MDB_MAXKEYSIZE) 1343 unsigned int me_maxkey; /**< max size of a key */ 1344 #endif 1345 int me_live_reader; /**< have liveness lock in reader table */ 1346 #ifdef _WIN32 1347 int me_pidquery; /**< Used in OpenProcess */ 1348 #endif 1349 #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ 1350 # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ 1351 # define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ 1352 #else 1353 mdb_mutex_t me_rmutex; 1354 mdb_mutex_t me_wmutex; 1355 #endif 1356 void *me_userctx; /**< User-settable context */ 1357 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ 1358 }; 1359 1360 /** Nested transaction */ 1361 typedef struct MDB_ntxn { 1362 MDB_txn mnt_txn; /**< the transaction */ 1363 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ 1364 } MDB_ntxn; 1365 1366 /** max number of pages to commit in one writev() call */ 1367 #define MDB_COMMIT_PAGES 64 1368 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES 1369 #undef MDB_COMMIT_PAGES 1370 #define MDB_COMMIT_PAGES IOV_MAX 1371 #endif 1372 1373 /** max bytes to write in one call */ 1374 #define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) 1375 1376 /** Check \b txn and \b dbi arguments to a function */ 1377 #define TXN_DBI_EXIST(txn, dbi, validity) \ 1378 ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) 1379 1380 /** Check for misused \b dbi handles */ 1381 #define TXN_DBI_CHANGED(txn, dbi) \ 1382 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) 1383 1384 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); 1385 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); 1386 static int 
mdb_page_touch(MDB_cursor *mc); 1387 1388 #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ 1389 "reset-tmp", "fail-begin", "fail-beginchild"} 1390 enum { 1391 /* mdb_txn_end operation number, for logging */ 1392 MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, 1393 MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD 1394 }; 1395 #define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ 1396 #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ 1397 #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ 1398 #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ 1399 static void mdb_txn_end(MDB_txn *txn, unsigned mode); 1400 1401 static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); 1402 static int mdb_page_search_root(MDB_cursor *mc, 1403 MDB_val *key, int modify); 1404 #define MDB_PS_MODIFY 1 1405 #define MDB_PS_ROOTONLY 2 1406 #define MDB_PS_FIRST 4 1407 #define MDB_PS_LAST 8 1408 static int mdb_page_search(MDB_cursor *mc, 1409 MDB_val *key, int flags); 1410 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); 1411 1412 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ 1413 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, 1414 pgno_t newpgno, unsigned int nflags); 1415 1416 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); 1417 static MDB_meta *mdb_env_pick_meta(const MDB_env *env); 1418 static int mdb_env_write_meta(MDB_txn *txn); 1419 #if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */ 1420 # define mdb_env_close0(env, excl) mdb_env_close1(env) 1421 #endif 1422 static void mdb_env_close0(MDB_env *env, int excl); 1423 1424 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); 1425 static int mdb_node_add(MDB_cursor *mc, indx_t indx, 1426 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned 
int flags); 1427 static void mdb_node_del(MDB_cursor *mc, int ksize); 1428 static void mdb_node_shrink(MDB_page *mp, indx_t indx); 1429 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); 1430 static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); 1431 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); 1432 static size_t mdb_branch_size(MDB_env *env, MDB_val *key); 1433 1434 static int mdb_rebalance(MDB_cursor *mc); 1435 static int mdb_update_key(MDB_cursor *mc, MDB_val *key); 1436 1437 static void mdb_cursor_pop(MDB_cursor *mc); 1438 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); 1439 1440 static int _mdb_cursor_del(MDB_cursor *mc, unsigned int flags); 1441 static int _mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags); 1442 1443 static int mdb_cursor_del0(MDB_cursor *mc); 1444 static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); 1445 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); 1446 static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1447 static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1448 static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, 1449 int *exactp); 1450 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1451 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1452 1453 static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); 1454 static void mdb_xcursor_init0(MDB_cursor *mc); 1455 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); 1456 static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); 1457 1458 static int mdb_drop0(MDB_cursor *mc, int subs); 1459 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); 1460 static int mdb_reader_check0(MDB_env *env, int rlocked, 
int *dead); 1461 1462 /** @cond */ 1463 static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; 1464 /** @endcond */ 1465 1466 /** Compare two items pointing at size_t's of unknown alignment. */ 1467 #ifdef MISALIGNED_OK 1468 # define mdb_cmp_clong mdb_cmp_long 1469 #else 1470 # define mdb_cmp_clong mdb_cmp_cint 1471 #endif 1472 1473 #ifdef _WIN32 1474 static SECURITY_DESCRIPTOR mdb_null_sd; 1475 static SECURITY_ATTRIBUTES mdb_all_sa; 1476 static int mdb_sec_inited; 1477 1478 struct MDB_name; 1479 static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); 1480 #endif 1481 1482 /** Return the library version info. */ 1483 char * ESECT 1484 mdb_version(int *major, int *minor, int *patch) 1485 { 1486 if (major) *major = MDB_VERSION_MAJOR; 1487 if (minor) *minor = MDB_VERSION_MINOR; 1488 if (patch) *patch = MDB_VERSION_PATCH; 1489 return MDB_VERSION_STRING; 1490 } 1491 1492 /** Table of descriptions for LMDB @ref errors */ 1493 static char *const mdb_errstr[] = { 1494 "MDB_KEYEXIST: Key/data pair already exists", 1495 "MDB_NOTFOUND: No matching key/data pair found", 1496 "MDB_PAGE_NOTFOUND: Requested page not found", 1497 "MDB_CORRUPTED: Located page was wrong type", 1498 "MDB_PANIC: Update of meta page failed or environment had fatal error", 1499 "MDB_VERSION_MISMATCH: Database environment version mismatch", 1500 "MDB_INVALID: File is not an LMDB file", 1501 "MDB_MAP_FULL: Environment mapsize limit reached", 1502 "MDB_DBS_FULL: Environment maxdbs limit reached", 1503 "MDB_READERS_FULL: Environment maxreaders limit reached", 1504 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", 1505 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", 1506 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", 1507 "MDB_PAGE_FULL: Internal error - page has no more space", 1508 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", 1509 "MDB_INCOMPATIBLE: 
Operation and DB incompatible, or DB flags changed", 1510 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", 1511 "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", 1512 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", 1513 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", 1514 }; 1515 1516 char * 1517 mdb_strerror(int err) 1518 { 1519 #ifdef _WIN32 1520 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. 1521 * This works as long as no function between the call to mdb_strerror 1522 * and the actual use of the message uses more than 4K of stack. 1523 */ 1524 #define MSGSIZE 1024 1525 #define PADSIZE 4096 1526 char buf[MSGSIZE+PADSIZE], *ptr = buf; 1527 #endif 1528 int i; 1529 if (!err) 1530 return ("Successful return: 0"); 1531 1532 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { 1533 i = err - MDB_KEYEXIST; 1534 return mdb_errstr[i]; 1535 } 1536 1537 #ifdef _WIN32 1538 /* These are the C-runtime error codes we use. The comment indicates 1539 * their numeric value, and the Win32 error they would correspond to 1540 * if the error actually came from a Win32 API. A major mess, we should 1541 * have used LMDB-specific error codes for everything. 
1542 */ 1543 switch(err) { 1544 case ENOENT: /* 2, FILE_NOT_FOUND */ 1545 case EIO: /* 5, ACCESS_DENIED */ 1546 case ENOMEM: /* 12, INVALID_ACCESS */ 1547 case EACCES: /* 13, INVALID_DATA */ 1548 case EBUSY: /* 16, CURRENT_DIRECTORY */ 1549 case EINVAL: /* 22, BAD_COMMAND */ 1550 case ENOSPC: /* 28, OUT_OF_PAPER */ 1551 return strerror(err); 1552 default: 1553 ; 1554 } 1555 buf[0] = 0; 1556 FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | 1557 FORMAT_MESSAGE_IGNORE_INSERTS, 1558 NULL, err, 0, ptr, MSGSIZE, NULL); 1559 return ptr; 1560 #else 1561 if (err < 0) 1562 return "Invalid error code"; 1563 return strerror(err); 1564 #endif 1565 } 1566 1567 /** assert(3) variant in cursor context */ 1568 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) 1569 /** assert(3) variant in transaction context */ 1570 #define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) 1571 /** assert(3) variant in environment context */ 1572 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) 1573 1574 #ifndef NDEBUG 1575 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ 1576 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) 1577 1578 static void ESECT 1579 mdb_assert_fail(MDB_env *env, const char *expr_txt, 1580 const char *func, const char *file, int line) 1581 { 1582 char buf[400]; 1583 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", 1584 file, line, expr_txt, func); 1585 if (env->me_assert_func) 1586 env->me_assert_func(env, buf); 1587 fprintf(stderr, "%s\n", buf); 1588 abort(); 1589 } 1590 #else 1591 # define mdb_assert0(env, expr, expr_txt) ((void) 0) 1592 #endif /* NDEBUG */ 1593 1594 #if MDB_DEBUG 1595 /** Return the page number of \b mp which may be sub-page, for debug output */ 1596 static pgno_t 1597 mdb_dbg_pgno(MDB_page *mp) 1598 { 1599 pgno_t ret; 1600 COPY_PGNO(ret, MP_PGNO(mp)); 1601 return ret; 1602 } 1603 1604 /** Display a key in hexadecimal and return the address of the result. 
1605 * @param[in] key the key to display 1606 * @param[in] buf the buffer to write into. Should always be #DKBUF. 1607 * @return The key in hexadecimal form. 1608 */ 1609 char * 1610 mdb_dkey(MDB_val *key, char *buf) 1611 { 1612 char *ptr = buf; 1613 unsigned char *c = key->mv_data; 1614 unsigned int i; 1615 1616 if (!key) 1617 return ""; 1618 1619 if (key->mv_size > DKBUF_MAXKEYSIZE) 1620 return "MDB_MAXKEYSIZE"; 1621 /* may want to make this a dynamic check: if the key is mostly 1622 * printable characters, print it as-is instead of converting to hex. 1623 */ 1624 #if 1 1625 buf[0] = '\0'; 1626 for (i=0; i<key->mv_size; i++) 1627 ptr += sprintf(ptr, "%02x", *c++); 1628 #else 1629 sprintf(buf, "%.*s", key->mv_size, key->mv_data); 1630 #endif 1631 return buf; 1632 } 1633 1634 static char * 1635 mdb_dval(MDB_txn *txn, MDB_dbi dbi, MDB_val *data, char *buf) 1636 { 1637 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 1638 mdb_dkey(data, buf+1); 1639 *buf = '['; 1640 strcpy(buf + data->mv_size * 2 + 1, "]"); 1641 } else 1642 *buf = '\0'; 1643 return buf; 1644 } 1645 1646 static const char * 1647 mdb_leafnode_type(MDB_node *n) 1648 { 1649 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; 1650 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : 1651 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; 1652 } 1653 1654 /** Display all the keys in the page. */ 1655 void 1656 mdb_page_list(MDB_page *mp) 1657 { 1658 pgno_t pgno = mdb_dbg_pgno(mp); 1659 const char *type, *state = (MP_FLAGS(mp) & P_DIRTY) ? 
", dirty" : ""; 1660 MDB_node *node; 1661 unsigned int i, nkeys, nsize, total = 0; 1662 MDB_val key; 1663 DKBUF; 1664 1665 switch (MP_FLAGS(mp) & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { 1666 case P_BRANCH: type = "Branch page"; break; 1667 case P_LEAF: type = "Leaf page"; break; 1668 case P_LEAF|P_SUBP: type = "Sub-page"; break; 1669 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; 1670 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; 1671 case P_OVERFLOW: 1672 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", 1673 pgno, mp->mp_pages, state); 1674 return; 1675 case P_META: 1676 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", 1677 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); 1678 return; 1679 default: 1680 fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, MP_FLAGS(mp)); 1681 return; 1682 } 1683 1684 nkeys = NUMKEYS(mp); 1685 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); 1686 1687 for (i=0; i<nkeys; i++) { 1688 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ 1689 key.mv_size = nsize = mp->mp_pad; 1690 key.mv_data = LEAF2KEY(mp, i, nsize); 1691 total += nsize; 1692 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); 1693 continue; 1694 } 1695 node = NODEPTR(mp, i); 1696 key.mv_size = node->mn_ksize; 1697 key.mv_data = node->mn_data; 1698 nsize = NODESIZE + key.mv_size; 1699 if (IS_BRANCH(mp)) { 1700 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), 1701 DKEY(&key)); 1702 total += nsize; 1703 } else { 1704 if (F_ISSET(node->mn_flags, F_BIGDATA)) 1705 nsize += sizeof(pgno_t); 1706 else 1707 nsize += NODEDSZ(node); 1708 total += nsize; 1709 nsize += sizeof(indx_t); 1710 fprintf(stderr, "key %d: nsize %d, %s%s\n", 1711 i, nsize, DKEY(&key), mdb_leafnode_type(node)); 1712 } 1713 total = EVEN(total); 1714 } 1715 fprintf(stderr, "Total: header %d + contents %d + unused %d\n", 1716 IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + MP_LOWER(mp), total, SIZELEFT(mp)); 1717 } 1718 1719 void 1720 mdb_cursor_chk(MDB_cursor *mc) 1721 { 1722 unsigned int i; 1723 MDB_node *node; 1724 MDB_page *mp; 1725 1726 if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; 1727 for (i=0; i<mc->mc_top; i++) { 1728 mp = mc->mc_pg[i]; 1729 node = NODEPTR(mp, mc->mc_ki[i]); 1730 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) 1731 printf("oops!\n"); 1732 } 1733 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) 1734 printf("ack!\n"); 1735 if (XCURSOR_INITED(mc)) { 1736 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 1737 if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && 1738 mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { 1739 printf("blah!\n"); 1740 } 1741 } 1742 } 1743 #endif 1744 1745 #if (MDB_DEBUG) > 2 1746 /** Count all the pages in each DB and in the freelist 1747 * and make sure it matches the actual number of pages 1748 * being used. 1749 * All named DBs must be open for a correct count. 
1750 */ 1751 static void mdb_audit(MDB_txn *txn) 1752 { 1753 MDB_cursor mc; 1754 MDB_val key, data; 1755 MDB_ID freecount, count; 1756 MDB_dbi i; 1757 int rc; 1758 1759 freecount = 0; 1760 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 1761 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) 1762 freecount += *(MDB_ID *)data.mv_data; 1763 mdb_tassert(txn, rc == MDB_NOTFOUND); 1764 1765 count = 0; 1766 for (i = 0; i<txn->mt_numdbs; i++) { 1767 MDB_xcursor mx; 1768 if (!(txn->mt_dbflags[i] & DB_VALID)) 1769 continue; 1770 mdb_cursor_init(&mc, txn, i, &mx); 1771 if (txn->mt_dbs[i].md_root == P_INVALID) 1772 continue; 1773 count += txn->mt_dbs[i].md_branch_pages + 1774 txn->mt_dbs[i].md_leaf_pages + 1775 txn->mt_dbs[i].md_overflow_pages; 1776 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { 1777 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); 1778 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { 1779 unsigned j; 1780 MDB_page *mp; 1781 mp = mc.mc_pg[mc.mc_top]; 1782 for (j=0; j<NUMKEYS(mp); j++) { 1783 MDB_node *leaf = NODEPTR(mp, j); 1784 if (leaf->mn_flags & F_SUBDATA) { 1785 MDB_db db; 1786 memcpy(&db, NODEDATA(leaf), sizeof(db)); 1787 count += db.md_branch_pages + db.md_leaf_pages + 1788 db.md_overflow_pages; 1789 } 1790 } 1791 } 1792 mdb_tassert(txn, rc == MDB_NOTFOUND); 1793 } 1794 } 1795 if (freecount + count + NUM_METAS != txn->mt_next_pgno) { 1796 fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", 1797 txn->mt_txnid, freecount, count+NUM_METAS, 1798 freecount+count+NUM_METAS, txn->mt_next_pgno); 1799 } 1800 } 1801 #endif 1802 1803 int 1804 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1805 { 1806 return txn->mt_dbxs[dbi].md_cmp(a, b); 1807 } 1808 1809 int 1810 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1811 { 1812 MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; 1813 #if UINT_MAX < SIZE_MAX 1814 if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) 
1815 dcmp = mdb_cmp_clong; 1816 #endif 1817 return dcmp(a, b); 1818 } 1819 1820 /** Allocate memory for a page. 1821 * Re-use old malloc'd pages first for singletons, otherwise just malloc. 1822 * Set #MDB_TXN_ERROR on failure. 1823 */ 1824 static MDB_page * 1825 mdb_page_malloc(MDB_txn *txn, unsigned num) 1826 { 1827 MDB_env *env = txn->mt_env; 1828 MDB_page *ret = env->me_dpages; 1829 size_t psize = env->me_psize, sz = psize, off; 1830 /* For ! #MDB_NOMEMINIT, psize counts how much to init. 1831 * For a single page alloc, we init everything after the page header. 1832 * For multi-page, we init the final page; if the caller needed that 1833 * many pages they will be filling in at least up to the last page. 1834 */ 1835 if (num == 1) { 1836 if (ret) { 1837 VGMEMP_ALLOC(env, ret, sz); 1838 VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); 1839 env->me_dpages = ret->mp_next; 1840 return ret; 1841 } 1842 psize -= off = PAGEHDRSZ; 1843 } else { 1844 sz *= num; 1845 off = sz - psize; 1846 } 1847 if ((ret = malloc(sz)) != NULL) { 1848 VGMEMP_ALLOC(env, ret, sz); 1849 if (!(env->me_flags & MDB_NOMEMINIT)) { 1850 memset((char *)ret + off, 0, psize); 1851 ret->mp_pad = 0; 1852 } 1853 } else { 1854 txn->mt_flags |= MDB_TXN_ERROR; 1855 } 1856 return ret; 1857 } 1858 /** Free a single page. 1859 * Saves single pages to a list, for future reuse. 1860 * (This is not used for multi-page overflow pages.) 
1861 */ 1862 static void 1863 mdb_page_free(MDB_env *env, MDB_page *mp) 1864 { 1865 mp->mp_next = env->me_dpages; 1866 VGMEMP_FREE(env, mp); 1867 env->me_dpages = mp; 1868 } 1869 1870 /** Free a dirty page */ 1871 static void 1872 mdb_dpage_free(MDB_env *env, MDB_page *dp) 1873 { 1874 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { 1875 mdb_page_free(env, dp); 1876 } else { 1877 /* large pages just get freed directly */ 1878 VGMEMP_FREE(env, dp); 1879 free(dp); 1880 } 1881 } 1882 1883 /** Return all dirty pages to dpage list */ 1884 static void 1885 mdb_dlist_free(MDB_txn *txn) 1886 { 1887 MDB_env *env = txn->mt_env; 1888 MDB_ID2L dl = txn->mt_u.dirty_list; 1889 unsigned i, n = dl[0].mid; 1890 1891 for (i = 1; i <= n; i++) { 1892 mdb_dpage_free(env, dl[i].mptr); 1893 } 1894 dl[0].mid = 0; 1895 } 1896 1897 /** Loosen or free a single page. 1898 * Saves single pages to a list for future reuse 1899 * in this same txn. It has been pulled from the freeDB 1900 * and already resides on the dirty list, but has been 1901 * deleted. Use these pages first before pulling again 1902 * from the freeDB. 1903 * 1904 * If the page wasn't dirtied in this txn, just add it 1905 * to this txn's free list. 1906 */ 1907 static int 1908 mdb_page_loose(MDB_cursor *mc, MDB_page *mp) 1909 { 1910 int loose = 0; 1911 pgno_t pgno = mp->mp_pgno; 1912 MDB_txn *txn = mc->mc_txn; 1913 1914 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { 1915 if (txn->mt_parent) { 1916 MDB_ID2 *dl = txn->mt_u.dirty_list; 1917 /* If txn has a parent, make sure the page is in our 1918 * dirty list. 1919 */ 1920 if (dl[0].mid) { 1921 unsigned x = mdb_mid2l_search(dl, pgno); 1922 if (x <= dl[0].mid && dl[x].mid == pgno) { 1923 if (mp != dl[x].mptr) { /* bad cursor? 
*/ 1924 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 1925 txn->mt_flags |= MDB_TXN_ERROR; 1926 return MDB_CORRUPTED; 1927 } 1928 /* ok, it's ours */ 1929 loose = 1; 1930 } 1931 } 1932 } else { 1933 /* no parent txn, so it's just ours */ 1934 loose = 1; 1935 } 1936 } 1937 if (loose) { 1938 DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), 1939 mp->mp_pgno)); 1940 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; 1941 txn->mt_loose_pgs = mp; 1942 txn->mt_loose_count++; 1943 mp->mp_flags |= P_LOOSE; 1944 } else { 1945 int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); 1946 if (rc) 1947 return rc; 1948 } 1949 1950 return MDB_SUCCESS; 1951 } 1952 1953 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. 1954 * @param[in] mc A cursor handle for the current operation. 1955 * @param[in] pflags Flags of the pages to update: 1956 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. 1957 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). 1958 * @return 0 on success, non-zero on failure. 1959 */ 1960 static int 1961 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) 1962 { 1963 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; 1964 MDB_txn *txn = mc->mc_txn; 1965 MDB_cursor *m3, *m0 = mc; 1966 MDB_xcursor *mx; 1967 MDB_page *dp, *mp; 1968 MDB_node *leaf; 1969 unsigned i, j; 1970 int rc = MDB_SUCCESS, level; 1971 1972 /* Mark pages seen by cursors */ 1973 if (mc->mc_flags & C_UNTRACK) 1974 mc = NULL; /* will find mc in mt_cursors */ 1975 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { 1976 for (; mc; mc=mc->mc_next) { 1977 if (!(mc->mc_flags & C_INITIALIZED)) 1978 continue; 1979 for (m3 = mc;; m3 = &mx->mx_cursor) { 1980 mp = NULL; 1981 for (j=0; j<m3->mc_snum; j++) { 1982 mp = m3->mc_pg[j]; 1983 if ((mp->mp_flags & Mask) == pflags) 1984 mp->mp_flags ^= P_KEEP; 1985 } 1986 mx = m3->mc_xcursor; 1987 /* Proceed to mx if it is at a sub-database */ 1988 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) 1989 break; 1990 if (! 
(mp && (mp->mp_flags & P_LEAF))) 1991 break; 1992 leaf = NODEPTR(mp, m3->mc_ki[j-1]); 1993 if (!(leaf->mn_flags & F_SUBDATA)) 1994 break; 1995 } 1996 } 1997 if (i == 0) 1998 break; 1999 } 2000 2001 if (all) { 2002 /* Mark dirty root pages */ 2003 for (i=0; i<txn->mt_numdbs; i++) { 2004 if (txn->mt_dbflags[i] & DB_DIRTY) { 2005 pgno_t pgno = txn->mt_dbs[i].md_root; 2006 if (pgno == P_INVALID) 2007 continue; 2008 if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) 2009 break; 2010 if ((dp->mp_flags & Mask) == pflags && level <= 1) 2011 dp->mp_flags ^= P_KEEP; 2012 } 2013 } 2014 } 2015 2016 return rc; 2017 } 2018 2019 static int mdb_page_flush(MDB_txn *txn, int keep); 2020 2021 /** Spill pages from the dirty list back to disk. 2022 * This is intended to prevent running into #MDB_TXN_FULL situations, 2023 * but note that they may still occur in a few cases: 2024 * 1) our estimate of the txn size could be too small. Currently this 2025 * seems unlikely, except with a large number of #MDB_MULTIPLE items. 2026 * 2) child txns may run out of space if their parents dirtied a 2027 * lot of pages and never spilled them. TODO: we probably should do 2028 * a preemptive spill during #mdb_txn_begin() of a child txn, if 2029 * the parent's dirty_room is below a given threshold. 2030 * 2031 * Otherwise, if not using nested txns, it is expected that apps will 2032 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk 2033 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. 2034 * If the txn never references them again, they can be left alone. 2035 * If the txn only reads them, they can be used without any fuss. 2036 * If the txn writes them again, they can be dirtied immediately without 2037 * going thru all of the work of #mdb_page_touch(). Such references are 2038 * handled by #mdb_page_unspill(). 2039 * 2040 * Also note, we never spill DB root pages, nor pages of active cursors, 2041 * because we'll need these back again soon anyway. 
And in nested txns, 2042 * we can't spill a page in a child txn if it was already spilled in a 2043 * parent txn. That would alter the parent txns' data even though 2044 * the child hasn't committed yet, and we'd have no way to undo it if 2045 * the child aborted. 2046 * 2047 * @param[in] m0 cursor A cursor handle identifying the transaction and 2048 * database for which we are checking space. 2049 * @param[in] key For a put operation, the key being stored. 2050 * @param[in] data For a put operation, the data being stored. 2051 * @return 0 on success, non-zero on failure. 2052 */ 2053 static int 2054 mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) 2055 { 2056 MDB_txn *txn = m0->mc_txn; 2057 MDB_page *dp; 2058 MDB_ID2L dl = txn->mt_u.dirty_list; 2059 unsigned int i, j, need; 2060 int rc; 2061 2062 if (m0->mc_flags & C_SUB) 2063 return MDB_SUCCESS; 2064 2065 /* Estimate how much space this op will take */ 2066 i = m0->mc_db->md_depth; 2067 /* Named DBs also dirty the main DB */ 2068 if (m0->mc_dbi >= CORE_DBS) 2069 i += txn->mt_dbs[MAIN_DBI].md_depth; 2070 /* For puts, roughly factor in the key+data size */ 2071 if (key) 2072 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; 2073 i += i; /* double it for good measure */ 2074 need = i; 2075 2076 if (txn->mt_dirty_room > i) 2077 return MDB_SUCCESS; 2078 2079 if (!txn->mt_spill_pgs) { 2080 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); 2081 if (!txn->mt_spill_pgs) 2082 return ENOMEM; 2083 } else { 2084 /* purge deleted slots */ 2085 MDB_IDL sl = txn->mt_spill_pgs; 2086 unsigned int num = sl[0]; 2087 j=0; 2088 for (i=1; i<=num; i++) { 2089 if (!(sl[i] & 1)) 2090 sl[++j] = sl[i]; 2091 } 2092 sl[0] = j; 2093 } 2094 2095 /* Preserve pages which may soon be dirtied again */ 2096 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) 2097 goto done; 2098 2099 /* Less aggressive spill - we originally spilled the entire dirty list, 2100 * with a few exceptions for cursor pages and 
DB root pages. But this 2101 * turns out to be a lot of wasted effort because in a large txn many 2102 * of those pages will need to be used again. So now we spill only 1/8th 2103 * of the dirty pages. Testing revealed this to be a good tradeoff, 2104 * better than 1/2, 1/4, or 1/10. 2105 */ 2106 if (need < MDB_IDL_UM_MAX / 8) 2107 need = MDB_IDL_UM_MAX / 8; 2108 2109 /* Save the page IDs of all the pages we're flushing */ 2110 /* flush from the tail forward, this saves a lot of shifting later on. */ 2111 for (i=dl[0].mid; i && need; i--) { 2112 MDB_ID pn = dl[i].mid << 1; 2113 dp = dl[i].mptr; 2114 if (dp->mp_flags & (P_LOOSE|P_KEEP)) 2115 continue; 2116 /* Can't spill twice, make sure it's not already in a parent's 2117 * spill list. 2118 */ 2119 if (txn->mt_parent) { 2120 MDB_txn *tx2; 2121 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { 2122 if (tx2->mt_spill_pgs) { 2123 j = mdb_midl_search(tx2->mt_spill_pgs, pn); 2124 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { 2125 dp->mp_flags |= P_KEEP; 2126 break; 2127 } 2128 } 2129 } 2130 if (tx2) 2131 continue; 2132 } 2133 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) 2134 goto done; 2135 need--; 2136 } 2137 mdb_midl_sort(txn->mt_spill_pgs); 2138 2139 /* Flush the spilled part of dirty list */ 2140 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) 2141 goto done; 2142 2143 /* Reset any dirty pages we kept that page_flush didn't see */ 2144 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); 2145 2146 done: 2147 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; 2148 return rc; 2149 } 2150 2151 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. 
*/ 2152 static txnid_t 2153 mdb_find_oldest(MDB_txn *txn) 2154 { 2155 int i; 2156 txnid_t mr, oldest = txn->mt_txnid - 1; 2157 if (txn->mt_env->me_txns) { 2158 MDB_reader *r = txn->mt_env->me_txns->mti_readers; 2159 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { 2160 if (r[i].mr_pid) { 2161 mr = r[i].mr_txnid; 2162 if (oldest > mr) 2163 oldest = mr; 2164 } 2165 } 2166 } 2167 return oldest; 2168 } 2169 2170 /** Add a page to the txn's dirty list */ 2171 static void 2172 mdb_page_dirty(MDB_txn *txn, MDB_page *mp) 2173 { 2174 MDB_ID2 mid; 2175 int rc, (*insert)(MDB_ID2L, MDB_ID2 *); 2176 2177 if (txn->mt_flags & MDB_TXN_WRITEMAP) { 2178 insert = mdb_mid2l_append; 2179 } else { 2180 insert = mdb_mid2l_insert; 2181 } 2182 mid.mid = mp->mp_pgno; 2183 mid.mptr = mp; 2184 rc = insert(txn->mt_u.dirty_list, &mid); 2185 mdb_tassert(txn, rc == 0); 2186 txn->mt_dirty_room--; 2187 } 2188 2189 /** Allocate page numbers and memory for writing. Maintain me_pglast, 2190 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. 2191 * 2192 * If there are free pages available from older transactions, they 2193 * are re-used first. Otherwise allocate a new page at mt_next_pgno. 2194 * Do not modify the freedB, just merge freeDB records into me_pghead[] 2195 * and move me_pglast to say which records were consumed. Only this 2196 * function can create me_pghead and move me_pglast/mt_next_pgno. 2197 * @param[in] mc cursor A cursor handle identifying the transaction and 2198 * database for which we are allocating. 2199 * @param[in] num the number of pages to allocate. 2200 * @param[out] mp Address of the allocated page(s). Requests for multiple pages 2201 * will always be satisfied by a single contiguous chunk of memory. 2202 * @return 0 on success, non-zero on failure. 
2203 */ 2204 static int 2205 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) 2206 { 2207 #ifdef MDB_PARANOID /* Seems like we can ignore this now */ 2208 /* Get at most <Max_retries> more freeDB records once me_pghead 2209 * has enough pages. If not enough, use new pages from the map. 2210 * If <Paranoid> and mc is updating the freeDB, only get new 2211 * records if me_pghead is empty. Then the freelist cannot play 2212 * catch-up with itself by growing while trying to save it. 2213 */ 2214 enum { Paranoid = 1, Max_retries = 500 }; 2215 #else 2216 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; 2217 #endif 2218 int rc, retry = num * 60; 2219 MDB_txn *txn = mc->mc_txn; 2220 MDB_env *env = txn->mt_env; 2221 pgno_t pgno, *mop = env->me_pghead; 2222 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; 2223 MDB_page *np; 2224 txnid_t oldest = 0, last; 2225 MDB_cursor_op op; 2226 MDB_cursor m2; 2227 int found_old = 0; 2228 2229 /* If there are any loose pages, just use them */ 2230 if (num == 1 && txn->mt_loose_pgs) { 2231 np = txn->mt_loose_pgs; 2232 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); 2233 txn->mt_loose_count--; 2234 DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), 2235 np->mp_pgno)); 2236 *mp = np; 2237 return MDB_SUCCESS; 2238 } 2239 2240 *mp = NULL; 2241 2242 /* If our dirty list is already full, we can't do anything */ 2243 if (txn->mt_dirty_room == 0) { 2244 rc = MDB_TXN_FULL; 2245 goto fail; 2246 } 2247 2248 for (op = MDB_FIRST;; op = MDB_NEXT) { 2249 MDB_val key, data; 2250 MDB_node *leaf; 2251 pgno_t *idl; 2252 2253 /* Seek a big enough contiguous page range. Prefer 2254 * pages at the tail, just truncating the list. 
2255 */ 2256 if (mop_len > n2) { 2257 i = mop_len; 2258 do { 2259 pgno = mop[i]; 2260 if (mop[i-n2] == pgno+n2) 2261 goto search_done; 2262 } while (--i > n2); 2263 if (--retry < 0) 2264 break; 2265 } 2266 2267 if (op == MDB_FIRST) { /* 1st iteration */ 2268 /* Prepare to fetch more and coalesce */ 2269 last = env->me_pglast; 2270 oldest = env->me_pgoldest; 2271 mdb_cursor_init(&m2, txn, FREE_DBI, NULL); 2272 if (last) { 2273 op = MDB_SET_RANGE; 2274 key.mv_data = &last; /* will look up last+1 */ 2275 key.mv_size = sizeof(last); 2276 } 2277 if (Paranoid && mc->mc_dbi == FREE_DBI) 2278 retry = -1; 2279 } 2280 if (Paranoid && retry < 0 && mop_len) 2281 break; 2282 2283 last++; 2284 /* Do not fetch more if the record will be too recent */ 2285 if (oldest <= last) { 2286 if (!found_old) { 2287 oldest = mdb_find_oldest(txn); 2288 env->me_pgoldest = oldest; 2289 found_old = 1; 2290 } 2291 if (oldest <= last) 2292 break; 2293 } 2294 rc = mdb_cursor_get(&m2, &key, NULL, op); 2295 if (rc) { 2296 if (rc == MDB_NOTFOUND) 2297 break; 2298 goto fail; 2299 } 2300 last = *(txnid_t*)key.mv_data; 2301 if (oldest <= last) { 2302 if (!found_old) { 2303 oldest = mdb_find_oldest(txn); 2304 env->me_pgoldest = oldest; 2305 found_old = 1; 2306 } 2307 if (oldest <= last) 2308 break; 2309 } 2310 np = m2.mc_pg[m2.mc_top]; 2311 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); 2312 if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) 2313 goto fail; 2314 2315 idl = (MDB_ID *) data.mv_data; 2316 i = idl[0]; 2317 if (!mop) { 2318 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { 2319 rc = ENOMEM; 2320 goto fail; 2321 } 2322 } else { 2323 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) 2324 goto fail; 2325 mop = env->me_pghead; 2326 } 2327 env->me_pglast = last; 2328 #if (MDB_DEBUG) > 1 2329 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", 2330 last, txn->mt_dbs[FREE_DBI].md_root, i)); 2331 for (j = i; j; j--) 2332 DPRINTF(("IDL %"Z"u", idl[j])); 2333 #endif 2334 /* Merge in descending sorted 
order */ 2335 mdb_midl_xmerge(mop, idl); 2336 mop_len = mop[0]; 2337 } 2338 2339 /* Use new pages from the map when nothing suitable in the freeDB */ 2340 i = 0; 2341 pgno = txn->mt_next_pgno; 2342 if (pgno + num >= env->me_maxpg) { 2343 DPUTS("DB size maxed out"); 2344 rc = MDB_MAP_FULL; 2345 goto fail; 2346 } 2347 2348 search_done: 2349 if (env->me_flags & MDB_WRITEMAP) { 2350 np = (MDB_page *)(env->me_map + env->me_psize * pgno); 2351 } else { 2352 if (!(np = mdb_page_malloc(txn, num))) { 2353 rc = ENOMEM; 2354 goto fail; 2355 } 2356 } 2357 if (i) { 2358 mop[0] = mop_len -= num; 2359 /* Move any stragglers down */ 2360 for (j = i-num; j < mop_len; ) 2361 mop[++j] = mop[++i]; 2362 } else { 2363 txn->mt_next_pgno = pgno + num; 2364 } 2365 np->mp_pgno = pgno; 2366 mdb_page_dirty(txn, np); 2367 *mp = np; 2368 2369 return MDB_SUCCESS; 2370 2371 fail: 2372 txn->mt_flags |= MDB_TXN_ERROR; 2373 return rc; 2374 } 2375 2376 /** Copy the used portions of a non-overflow page. 2377 * @param[in] dst page to copy into 2378 * @param[in] src page to copy from 2379 * @param[in] psize size of a page 2380 */ 2381 static void 2382 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) 2383 { 2384 enum { Align = sizeof(pgno_t) }; 2385 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; 2386 2387 /* If page isn't full, just copy the used portion. Adjust 2388 * alignment so memcpy may copy words instead of bytes. 2389 */ 2390 if ((unused &= -Align) && !IS_LEAF2(src)) { 2391 upper = (upper + PAGEBASE) & -Align; 2392 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); 2393 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), 2394 psize - upper); 2395 } else { 2396 memcpy(dst, src, psize - unused); 2397 } 2398 } 2399 2400 /** Pull a page off the txn's spill list, if present. 2401 * If a page being referenced was spilled to disk in this txn, bring 2402 * it back and make it dirty/writable again. 
2403 * @param[in] txn the transaction handle. 2404 * @param[in] mp the page being referenced. It must not be dirty. 2405 * @param[out] ret the writable page, if any. ret is unchanged if 2406 * mp wasn't spilled. 2407 */ 2408 static int 2409 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) 2410 { 2411 MDB_env *env = txn->mt_env; 2412 const MDB_txn *tx2; 2413 unsigned x; 2414 pgno_t pgno = mp->mp_pgno, pn = pgno << 1; 2415 2416 for (tx2 = txn; tx2; tx2=tx2->mt_parent) { 2417 if (!tx2->mt_spill_pgs) 2418 continue; 2419 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 2420 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 2421 MDB_page *np; 2422 int num; 2423 if (txn->mt_dirty_room == 0) 2424 return MDB_TXN_FULL; 2425 if (IS_OVERFLOW(mp)) 2426 num = mp->mp_pages; 2427 else 2428 num = 1; 2429 if (env->me_flags & MDB_WRITEMAP) { 2430 np = mp; 2431 } else { 2432 np = mdb_page_malloc(txn, num); 2433 if (!np) 2434 return ENOMEM; 2435 if (num > 1) 2436 memcpy(np, mp, num * env->me_psize); 2437 else 2438 mdb_page_copy(np, mp, env->me_psize); 2439 } 2440 if (tx2 == txn) { 2441 /* If in current txn, this page is no longer spilled. 2442 * If it happens to be the last page, truncate the spill list. 2443 * Otherwise mark it as deleted by setting the LSB. 2444 */ 2445 if (x == txn->mt_spill_pgs[0]) 2446 txn->mt_spill_pgs[0]--; 2447 else 2448 txn->mt_spill_pgs[x] |= 1; 2449 } /* otherwise, if belonging to a parent txn, the 2450 * page remains spilled until child commits 2451 */ 2452 2453 mdb_page_dirty(txn, np); 2454 np->mp_flags |= P_DIRTY; 2455 *ret = np; 2456 break; 2457 } 2458 } 2459 return MDB_SUCCESS; 2460 } 2461 2462 /** Touch a page: make it dirty and re-insert into tree with updated pgno. 2463 * Set #MDB_TXN_ERROR on failure. 2464 * @param[in] mc cursor pointing to the page to be touched 2465 * @return 0 on success, non-zero on failure. 
2466 */ 2467 static int 2468 mdb_page_touch(MDB_cursor *mc) 2469 { 2470 MDB_page *mp = mc->mc_pg[mc->mc_top], *np; 2471 MDB_txn *txn = mc->mc_txn; 2472 MDB_cursor *m2, *m3; 2473 pgno_t pgno; 2474 int rc; 2475 2476 if (!F_ISSET(MP_FLAGS(mp), P_DIRTY)) { 2477 if (txn->mt_flags & MDB_TXN_SPILLS) { 2478 np = NULL; 2479 rc = mdb_page_unspill(txn, mp, &np); 2480 if (rc) 2481 goto fail; 2482 if (np) 2483 goto done; 2484 } 2485 if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || 2486 (rc = mdb_page_alloc(mc, 1, &np))) 2487 goto fail; 2488 pgno = np->mp_pgno; 2489 DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), 2490 mp->mp_pgno, pgno)); 2491 mdb_cassert(mc, mp->mp_pgno != pgno); 2492 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); 2493 /* Update the parent page, if any, to point to the new page */ 2494 if (mc->mc_top) { 2495 MDB_page *parent = mc->mc_pg[mc->mc_top-1]; 2496 MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); 2497 SETPGNO(node, pgno); 2498 } else { 2499 mc->mc_db->md_root = pgno; 2500 } 2501 } else if (txn->mt_parent && !IS_SUBP(mp)) { 2502 MDB_ID2 mid, *dl = txn->mt_u.dirty_list; 2503 pgno = mp->mp_pgno; 2504 /* If txn has a parent, make sure the page is in our 2505 * dirty list. 2506 */ 2507 if (dl[0].mid) { 2508 unsigned x = mdb_mid2l_search(dl, pgno); 2509 if (x <= dl[0].mid && dl[x].mid == pgno) { 2510 if (mp != dl[x].mptr) { /* bad cursor? 
*/ 2511 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 2512 txn->mt_flags |= MDB_TXN_ERROR; 2513 return MDB_CORRUPTED; 2514 } 2515 return 0; 2516 } 2517 } 2518 mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); 2519 /* No - copy it */ 2520 np = mdb_page_malloc(txn, 1); 2521 if (!np) 2522 return ENOMEM; 2523 mid.mid = pgno; 2524 mid.mptr = np; 2525 rc = mdb_mid2l_insert(dl, &mid); 2526 mdb_cassert(mc, rc == 0); 2527 } else { 2528 return 0; 2529 } 2530 2531 mdb_page_copy(np, mp, txn->mt_env->me_psize); 2532 np->mp_pgno = pgno; 2533 np->mp_flags |= P_DIRTY; 2534 2535 done: 2536 /* Adjust cursors pointing to mp */ 2537 mc->mc_pg[mc->mc_top] = np; 2538 m2 = txn->mt_cursors[mc->mc_dbi]; 2539 if (mc->mc_flags & C_SUB) { 2540 for (; m2; m2=m2->mc_next) { 2541 m3 = &m2->mc_xcursor->mx_cursor; 2542 if (m3->mc_snum < mc->mc_snum) continue; 2543 if (m3->mc_pg[mc->mc_top] == mp) 2544 m3->mc_pg[mc->mc_top] = np; 2545 } 2546 } else { 2547 for (; m2; m2=m2->mc_next) { 2548 if (m2->mc_snum < mc->mc_snum) continue; 2549 if (m2 == mc) continue; 2550 if (m2->mc_pg[mc->mc_top] == mp) { 2551 m2->mc_pg[mc->mc_top] = np; 2552 if (IS_LEAF(np)) 2553 XCURSOR_REFRESH(m2, mc->mc_top, np); 2554 } 2555 } 2556 } 2557 return 0; 2558 2559 fail: 2560 txn->mt_flags |= MDB_TXN_ERROR; 2561 return rc; 2562 } 2563 2564 int 2565 mdb_env_sync(MDB_env *env, int force) 2566 { 2567 int rc = 0; 2568 if (env->me_flags & MDB_RDONLY) 2569 return EACCES; 2570 if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { 2571 if (env->me_flags & MDB_WRITEMAP) { 2572 int flags = ((env->me_flags & MDB_MAPASYNC) && !force) 2573 ? 
MS_ASYNC : MS_SYNC; 2574 if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) 2575 rc = ErrCode(); 2576 #ifdef _WIN32 2577 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) 2578 rc = ErrCode(); 2579 #endif 2580 } else { 2581 #ifdef BROKEN_FDATASYNC 2582 if (env->me_flags & MDB_FSYNCONLY) { 2583 if (fsync(env->me_fd)) 2584 rc = ErrCode(); 2585 } else 2586 #endif 2587 if (MDB_FDATASYNC(env->me_fd)) 2588 rc = ErrCode(); 2589 } 2590 } 2591 return rc; 2592 } 2593 2594 /** Back up parent txn's cursors, then grab the originals for tracking */ 2595 static int 2596 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) 2597 { 2598 MDB_cursor *mc, *bk; 2599 MDB_xcursor *mx; 2600 size_t size; 2601 int i; 2602 2603 for (i = src->mt_numdbs; --i >= 0; ) { 2604 if ((mc = src->mt_cursors[i]) != NULL) { 2605 size = sizeof(MDB_cursor); 2606 if (mc->mc_xcursor) 2607 size += sizeof(MDB_xcursor); 2608 for (; mc; mc = bk->mc_next) { 2609 bk = malloc(size); 2610 if (!bk) 2611 return ENOMEM; 2612 *bk = *mc; 2613 mc->mc_backup = bk; 2614 mc->mc_db = &dst->mt_dbs[i]; 2615 /* Kill pointers into src to reduce abuse: The 2616 * user may not use mc until dst ends. But we need a valid 2617 * txn pointer here for cursor fixups to keep working. 2618 */ 2619 mc->mc_txn = dst; 2620 mc->mc_dbflag = &dst->mt_dbflags[i]; 2621 if ((mx = mc->mc_xcursor) != NULL) { 2622 *(MDB_xcursor *)(bk+1) = *mx; 2623 mx->mx_cursor.mc_txn = dst; 2624 } 2625 mc->mc_next = dst->mt_cursors[i]; 2626 dst->mt_cursors[i] = mc; 2627 } 2628 } 2629 } 2630 return MDB_SUCCESS; 2631 } 2632 2633 /** Close this write txn's cursors, give parent txn's cursors back to parent. 2634 * @param[in] txn the transaction handle. 2635 * @param[in] merge true to keep changes to parent cursors, false to revert. 2636 * @return 0 on success, non-zero on failure. 
2637 */ 2638 static void 2639 mdb_cursors_close(MDB_txn *txn, unsigned merge) 2640 { 2641 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; 2642 MDB_xcursor *mx; 2643 int i; 2644 2645 for (i = txn->mt_numdbs; --i >= 0; ) { 2646 for (mc = cursors[i]; mc; mc = next) { 2647 next = mc->mc_next; 2648 if ((bk = mc->mc_backup) != NULL) { 2649 if (merge) { 2650 /* Commit changes to parent txn */ 2651 mc->mc_next = bk->mc_next; 2652 mc->mc_backup = bk->mc_backup; 2653 mc->mc_txn = bk->mc_txn; 2654 mc->mc_db = bk->mc_db; 2655 mc->mc_dbflag = bk->mc_dbflag; 2656 if ((mx = mc->mc_xcursor) != NULL) 2657 mx->mx_cursor.mc_txn = bk->mc_txn; 2658 } else { 2659 /* Abort nested txn */ 2660 *mc = *bk; 2661 if ((mx = mc->mc_xcursor) != NULL) 2662 *mx = *(MDB_xcursor *)(bk+1); 2663 } 2664 mc = bk; 2665 } 2666 /* Only malloced cursors are permanently tracked. */ 2667 free(mc); 2668 } 2669 cursors[i] = NULL; 2670 } 2671 } 2672 2673 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2674 enum Pidlock_op { 2675 Pidset, Pidcheck 2676 }; 2677 #else 2678 enum Pidlock_op { 2679 Pidset = F_SETLK, Pidcheck = F_GETLK 2680 }; 2681 #endif 2682 2683 /** Set or check a pid lock. Set returns 0 on success. 2684 * Check returns 0 if the process is certainly dead, nonzero if it may 2685 * be alive (the lock exists or an error happened so we do not know). 2686 * 2687 * On Windows Pidset is a no-op, we merely check for the existence 2688 * of the process with the given pid. On POSIX we use a single byte 2689 * lock on the lockfile, set at an offset equal to the pid. 
2690 */ 2691 static int 2692 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) 2693 { 2694 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2695 int ret = 0; 2696 HANDLE h; 2697 if (op == Pidcheck) { 2698 h = OpenProcess(env->me_pidquery, FALSE, pid); 2699 /* No documented "no such process" code, but other program use this: */ 2700 if (!h) 2701 return ErrCode() != ERROR_INVALID_PARAMETER; 2702 /* A process exists until all handles to it close. Has it exited? */ 2703 ret = WaitForSingleObject(h, 0) != 0; 2704 CloseHandle(h); 2705 } 2706 return ret; 2707 #else 2708 for (;;) { 2709 int rc; 2710 struct flock lock_info; 2711 memset(&lock_info, 0, sizeof(lock_info)); 2712 lock_info.l_type = F_WRLCK; 2713 lock_info.l_whence = SEEK_SET; 2714 lock_info.l_start = pid; 2715 lock_info.l_len = 1; 2716 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { 2717 if (op == F_GETLK && lock_info.l_type != F_UNLCK) 2718 rc = -1; 2719 } else if ((rc = ErrCode()) == EINTR) { 2720 continue; 2721 } 2722 return rc; 2723 } 2724 #endif 2725 } 2726 2727 /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). 2728 * @param[in] txn the transaction handle to initialize 2729 * @return 0 on success, non-zero on failure. 2730 */ 2731 static int 2732 mdb_txn_renew0(MDB_txn *txn) 2733 { 2734 MDB_env *env = txn->mt_env; 2735 MDB_txninfo *ti = env->me_txns; 2736 MDB_meta *meta; 2737 unsigned int i, nr, flags = txn->mt_flags; 2738 uint16_t x; 2739 int rc, new_notls = 0; 2740 2741 if ((flags &= MDB_TXN_RDONLY) != 0) { 2742 if (!ti) { 2743 meta = mdb_env_pick_meta(env); 2744 txn->mt_txnid = meta->mm_txnid; 2745 txn->mt_u.reader = NULL; 2746 } else { 2747 MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
txn->mt_u.reader : 2748 pthread_getspecific(env->me_txkey); 2749 if (r) { 2750 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) 2751 return MDB_BAD_RSLOT; 2752 } else { 2753 MDB_PID_T pid = env->me_pid; 2754 MDB_THR_T tid = pthread_self(); 2755 mdb_mutexref_t rmutex = env->me_rmutex; 2756 2757 if (!env->me_live_reader) { 2758 rc = mdb_reader_pid(env, Pidset, pid); 2759 if (rc) 2760 return rc; 2761 env->me_live_reader = 1; 2762 } 2763 2764 if (LOCK_MUTEX(rc, env, rmutex)) 2765 return rc; 2766 nr = ti->mti_numreaders; 2767 for (i=0; i<nr; i++) 2768 if (ti->mti_readers[i].mr_pid == 0) 2769 break; 2770 if (i == env->me_maxreaders) { 2771 UNLOCK_MUTEX(rmutex); 2772 return MDB_READERS_FULL; 2773 } 2774 r = &ti->mti_readers[i]; 2775 /* Claim the reader slot, carefully since other code 2776 * uses the reader table un-mutexed: First reset the 2777 * slot, next publish it in mti_numreaders. After 2778 * that, it is safe for mdb_env_close() to touch it. 2779 * When it will be closed, we can finally claim it. 2780 */ 2781 r->mr_pid = 0; 2782 r->mr_txnid = (txnid_t)-1; 2783 r->mr_tid = tid; 2784 if (i == nr) 2785 ti->mti_numreaders = ++nr; 2786 env->me_close_readers = nr; 2787 r->mr_pid = pid; 2788 UNLOCK_MUTEX(rmutex); 2789 2790 new_notls = (env->me_flags & MDB_NOTLS); 2791 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { 2792 r->mr_pid = 0; 2793 return rc; 2794 } 2795 } 2796 do /* LY: Retry on a race, ITS#7970. 
*/ 2797 r->mr_txnid = ti->mti_txnid; 2798 while(r->mr_txnid != ti->mti_txnid); 2799 if (!r->mr_txnid && (env->me_flags & MDB_RDONLY)) { 2800 meta = mdb_env_pick_meta(env); 2801 r->mr_txnid = meta->mm_txnid; 2802 } else { 2803 meta = env->me_metas[r->mr_txnid & 1]; 2804 } 2805 txn->mt_txnid = r->mr_txnid; 2806 txn->mt_u.reader = r; 2807 } 2808 2809 } else { 2810 /* Not yet touching txn == env->me_txn0, it may be active */ 2811 if (ti) { 2812 if (LOCK_MUTEX(rc, env, env->me_wmutex)) 2813 return rc; 2814 txn->mt_txnid = ti->mti_txnid; 2815 meta = env->me_metas[txn->mt_txnid & 1]; 2816 } else { 2817 meta = mdb_env_pick_meta(env); 2818 txn->mt_txnid = meta->mm_txnid; 2819 } 2820 txn->mt_txnid++; 2821 #if MDB_DEBUG 2822 if (txn->mt_txnid == mdb_debug_start) 2823 mdb_debug = MDB_DBG_INFO; 2824 #endif 2825 txn->mt_child = NULL; 2826 txn->mt_loose_pgs = NULL; 2827 txn->mt_loose_count = 0; 2828 txn->mt_dirty_room = MDB_IDL_UM_MAX; 2829 txn->mt_u.dirty_list = env->me_dirty_list; 2830 txn->mt_u.dirty_list[0].mid = 0; 2831 txn->mt_free_pgs = env->me_free_pgs; 2832 txn->mt_free_pgs[0] = 0; 2833 txn->mt_spill_pgs = NULL; 2834 env->me_txn = txn; 2835 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); 2836 } 2837 2838 /* Copy the DB info and flags */ 2839 memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); 2840 2841 /* Moved to here to avoid a data race in read TXNs */ 2842 txn->mt_next_pgno = meta->mm_last_pg+1; 2843 2844 txn->mt_flags = flags; 2845 2846 /* Setup db info */ 2847 txn->mt_numdbs = env->me_numdbs; 2848 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 2849 x = env->me_dbflags[i]; 2850 txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; 2851 txn->mt_dbflags[i] = (x & MDB_VALID) ? 
DB_VALID|DB_USRVALID|DB_STALE : 0; 2852 } 2853 txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; 2854 txn->mt_dbflags[FREE_DBI] = DB_VALID; 2855 2856 if (env->me_flags & MDB_FATAL_ERROR) { 2857 DPUTS("environment had fatal error, must shutdown!"); 2858 rc = MDB_PANIC; 2859 } else if (env->me_maxpg < txn->mt_next_pgno) { 2860 rc = MDB_MAP_RESIZED; 2861 } else { 2862 return MDB_SUCCESS; 2863 } 2864 mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); 2865 return rc; 2866 } 2867 2868 int 2869 mdb_txn_renew(MDB_txn *txn) 2870 { 2871 int rc; 2872 2873 if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) 2874 return EINVAL; 2875 2876 rc = mdb_txn_renew0(txn); 2877 if (rc == MDB_SUCCESS) { 2878 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2879 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 2880 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); 2881 } 2882 return rc; 2883 } 2884 2885 int 2886 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) 2887 { 2888 MDB_txn *txn; 2889 MDB_ntxn *ntxn; 2890 int rc, size, tsize; 2891 2892 flags &= MDB_TXN_BEGIN_FLAGS; 2893 flags |= env->me_flags & MDB_WRITEMAP; 2894 2895 if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ 2896 return EACCES; 2897 2898 if (parent) { 2899 /* Nested transactions: Max 1 child, write txns only, no writemap */ 2900 flags |= parent->mt_flags; 2901 if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { 2902 return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; 2903 } 2904 /* Child txns save MDB_pgstate and use own copy of cursors */ 2905 size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); 2906 size += tsize = sizeof(MDB_ntxn); 2907 } else if (flags & MDB_RDONLY) { 2908 size = env->me_maxdbs * (sizeof(MDB_db)+1); 2909 size += tsize = sizeof(MDB_txn); 2910 } else { 2911 /* Reuse preallocated write txn. 
However, do not touch it until 2912 * mdb_txn_renew0() succeeds, since it currently may be active. 2913 */ 2914 txn = env->me_txn0; 2915 goto renew; 2916 } 2917 if ((txn = calloc(1, size)) == NULL) { 2918 DPRINTF(("calloc: %s", strerror(errno))); 2919 return ENOMEM; 2920 } 2921 txn->mt_dbxs = env->me_dbxs; /* static */ 2922 txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); 2923 txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; 2924 txn->mt_flags = flags; 2925 txn->mt_env = env; 2926 2927 if (parent) { 2928 unsigned int i; 2929 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 2930 txn->mt_dbiseqs = parent->mt_dbiseqs; 2931 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); 2932 if (!txn->mt_u.dirty_list || 2933 !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) 2934 { 2935 free(txn->mt_u.dirty_list); 2936 free(txn); 2937 return ENOMEM; 2938 } 2939 txn->mt_txnid = parent->mt_txnid; 2940 txn->mt_dirty_room = parent->mt_dirty_room; 2941 txn->mt_u.dirty_list[0].mid = 0; 2942 txn->mt_spill_pgs = NULL; 2943 txn->mt_next_pgno = parent->mt_next_pgno; 2944 parent->mt_flags |= MDB_TXN_HAS_CHILD; 2945 parent->mt_child = txn; 2946 txn->mt_parent = parent; 2947 txn->mt_numdbs = parent->mt_numdbs; 2948 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 2949 /* Copy parent's mt_dbflags, but clear DB_NEW */ 2950 for (i=0; i<txn->mt_numdbs; i++) 2951 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; 2952 rc = 0; 2953 ntxn = (MDB_ntxn *)txn; 2954 ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ 2955 if (env->me_pghead) { 2956 size = MDB_IDL_SIZEOF(env->me_pghead); 2957 env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); 2958 if (env->me_pghead) 2959 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); 2960 else 2961 rc = ENOMEM; 2962 } 2963 if (!rc) 2964 rc = mdb_cursor_shadow(parent, txn); 2965 if (rc) 2966 mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); 2967 } else { /* MDB_RDONLY */ 2968 
txn->mt_dbiseqs = env->me_dbiseqs; 2969 renew: 2970 rc = mdb_txn_renew0(txn); 2971 } 2972 if (rc) { 2973 if (txn != env->me_txn0) 2974 free(txn); 2975 } else { 2976 txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ 2977 *ret = txn; 2978 DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2979 txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', 2980 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); 2981 } 2982 MDB_TRACE(("%p, %p, %u = %p", env, parent, flags, txn)); 2983 2984 return rc; 2985 } 2986 2987 MDB_env * 2988 mdb_txn_env(MDB_txn *txn) 2989 { 2990 if(!txn) return NULL; 2991 return txn->mt_env; 2992 } 2993 2994 size_t 2995 mdb_txn_id(MDB_txn *txn) 2996 { 2997 if(!txn) return 0; 2998 return txn->mt_txnid; 2999 } 3000 3001 /** Export or close DBI handles opened in this txn. */ 3002 static void 3003 mdb_dbis_update(MDB_txn *txn, int keep) 3004 { 3005 int i; 3006 MDB_dbi n = txn->mt_numdbs; 3007 MDB_env *env = txn->mt_env; 3008 unsigned char *tdbflags = txn->mt_dbflags; 3009 3010 for (i = n; --i >= CORE_DBS;) { 3011 if (tdbflags[i] & DB_NEW) { 3012 if (keep) { 3013 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; 3014 } else { 3015 char *ptr = env->me_dbxs[i].md_name.mv_data; 3016 if (ptr) { 3017 env->me_dbxs[i].md_name.mv_data = NULL; 3018 env->me_dbxs[i].md_name.mv_size = 0; 3019 env->me_dbflags[i] = 0; 3020 env->me_dbiseqs[i]++; 3021 free(ptr); 3022 } 3023 } 3024 } 3025 } 3026 if (keep && env->me_numdbs < n) 3027 env->me_numdbs = n; 3028 } 3029 3030 /** End a transaction, except successful commit of a nested transaction. 3031 * May be called twice for readonly txns: First reset it, then abort. 
3032 * @param[in] txn the transaction handle to end 3033 * @param[in] mode why and how to end the transaction 3034 */ 3035 static void 3036 mdb_txn_end(MDB_txn *txn, unsigned mode) 3037 { 3038 MDB_env *env = txn->mt_env; 3039 #if MDB_DEBUG 3040 static const char *const names[] = MDB_END_NAMES; 3041 #endif 3042 3043 /* Export or close DBI handles opened in this txn */ 3044 mdb_dbis_update(txn, mode & MDB_END_UPDATE); 3045 3046 DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 3047 names[mode & MDB_END_OPMASK], 3048 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 3049 (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); 3050 3051 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 3052 if (txn->mt_u.reader) { 3053 txn->mt_u.reader->mr_txnid = (txnid_t)-1; 3054 if (!(env->me_flags & MDB_NOTLS)) { 3055 txn->mt_u.reader = NULL; /* txn does not own reader */ 3056 } else if (mode & MDB_END_SLOT) { 3057 txn->mt_u.reader->mr_pid = 0; 3058 txn->mt_u.reader = NULL; 3059 } /* else txn owns the slot until it does MDB_END_SLOT */ 3060 } 3061 txn->mt_numdbs = 0; /* prevent further DBI activity */ 3062 txn->mt_flags |= MDB_TXN_FINISHED; 3063 3064 } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { 3065 pgno_t *pghead = env->me_pghead; 3066 3067 if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ 3068 mdb_cursors_close(txn, 0); 3069 if (!(env->me_flags & MDB_WRITEMAP)) { 3070 mdb_dlist_free(txn); 3071 } 3072 3073 txn->mt_numdbs = 0; 3074 txn->mt_flags = MDB_TXN_FINISHED; 3075 3076 if (!txn->mt_parent) { 3077 mdb_midl_shrink(&txn->mt_free_pgs); 3078 env->me_free_pgs = txn->mt_free_pgs; 3079 /* me_pgstate: */ 3080 env->me_pghead = NULL; 3081 env->me_pglast = 0; 3082 3083 env->me_txn = NULL; 3084 mode = 0; /* txn == env->me_txn0, do not free() it */ 3085 3086 /* The writer mutex was locked in mdb_txn_begin. 
*/ 3087 if (env->me_txns) 3088 UNLOCK_MUTEX(env->me_wmutex); 3089 } else { 3090 txn->mt_parent->mt_child = NULL; 3091 txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; 3092 env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; 3093 mdb_midl_free(txn->mt_free_pgs); 3094 free(txn->mt_u.dirty_list); 3095 } 3096 mdb_midl_free(txn->mt_spill_pgs); 3097 3098 mdb_midl_free(pghead); 3099 } 3100 3101 if (mode & MDB_END_FREE) 3102 free(txn); 3103 } 3104 3105 void 3106 mdb_txn_reset(MDB_txn *txn) 3107 { 3108 if (txn == NULL) 3109 return; 3110 3111 /* This call is only valid for read-only txns */ 3112 if (!(txn->mt_flags & MDB_TXN_RDONLY)) 3113 return; 3114 3115 mdb_txn_end(txn, MDB_END_RESET); 3116 } 3117 3118 static void 3119 _mdb_txn_abort(MDB_txn *txn) 3120 { 3121 if (txn == NULL) 3122 return; 3123 3124 if (txn->mt_child) 3125 _mdb_txn_abort(txn->mt_child); 3126 3127 mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); 3128 } 3129 3130 void 3131 mdb_txn_abort(MDB_txn *txn) 3132 { 3133 MDB_TRACE(("%p", txn)); 3134 _mdb_txn_abort(txn); 3135 } 3136 3137 /** Save the freelist as of this transaction to the freeDB. 3138 * This changes the freelist. Keep trying until it stabilizes. 3139 */ 3140 static int 3141 mdb_freelist_save(MDB_txn *txn) 3142 { 3143 /* env->me_pghead[] can grow and shrink during this call. 3144 * env->me_pglast and txn->mt_free_pgs[] can only grow. 3145 * Page numbers cannot disappear from txn->mt_free_pgs[]. 
3146 */ 3147 MDB_cursor mc; 3148 MDB_env *env = txn->mt_env; 3149 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; 3150 txnid_t pglast = 0, head_id = 0; 3151 pgno_t freecnt = 0, *free_pgs, *mop; 3152 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; 3153 3154 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 3155 3156 if (env->me_pghead) { 3157 /* Make sure first page of freeDB is touched and on freelist */ 3158 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); 3159 if (rc && rc != MDB_NOTFOUND) 3160 return rc; 3161 } 3162 3163 if (!env->me_pghead && txn->mt_loose_pgs) { 3164 /* Put loose page numbers in mt_free_pgs, since 3165 * we may be unable to return them to me_pghead. 3166 */ 3167 MDB_page *mp = txn->mt_loose_pgs; 3168 MDB_ID2 *dl = txn->mt_u.dirty_list; 3169 unsigned x; 3170 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) 3171 return rc; 3172 for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { 3173 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); 3174 /* must also remove from dirty list */ 3175 if (txn->mt_flags & MDB_TXN_WRITEMAP) { 3176 for (x=1; x<=dl[0].mid; x++) 3177 if (dl[x].mid == mp->mp_pgno) 3178 break; 3179 mdb_tassert(txn, x <= dl[0].mid); 3180 } else { 3181 x = mdb_mid2l_search(dl, mp->mp_pgno); 3182 mdb_tassert(txn, dl[x].mid == mp->mp_pgno); 3183 mdb_dpage_free(env, mp); 3184 } 3185 dl[x].mptr = NULL; 3186 } 3187 { 3188 /* squash freed slots out of the dirty list */ 3189 unsigned y; 3190 for (y=1; dl[y].mptr && y <= dl[0].mid; y++); 3191 if (y <= dl[0].mid) { 3192 for(x=y, y++;;) { 3193 while (!dl[y].mptr && y <= dl[0].mid) y++; 3194 if (y > dl[0].mid) break; 3195 dl[x++] = dl[y++]; 3196 } 3197 dl[0].mid = x-1; 3198 } else { 3199 /* all slots freed */ 3200 dl[0].mid = 0; 3201 } 3202 } 3203 txn->mt_loose_pgs = NULL; 3204 txn->mt_loose_count = 0; 3205 } 3206 3207 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ 3208 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) 3209 ? 
SSIZE_MAX : maxfree_1pg; 3210 3211 for (;;) { 3212 /* Come back here after each Put() in case freelist changed */ 3213 MDB_val key, data; 3214 pgno_t *pgs; 3215 ssize_t j; 3216 3217 /* If using records from freeDB which we have not yet 3218 * deleted, delete them and any we reserved for me_pghead. 3219 */ 3220 while (pglast < env->me_pglast) { 3221 rc = mdb_cursor_first(&mc, &key, NULL); 3222 if (rc) 3223 return rc; 3224 pglast = head_id = *(txnid_t *)key.mv_data; 3225 total_room = head_room = 0; 3226 mdb_tassert(txn, pglast <= env->me_pglast); 3227 rc = _mdb_cursor_del(&mc, 0); 3228 if (rc) 3229 return rc; 3230 } 3231 3232 /* Save the IDL of pages freed by this txn, to a single record */ 3233 if (freecnt < txn->mt_free_pgs[0]) { 3234 if (!freecnt) { 3235 /* Make sure last page of freeDB is touched and on freelist */ 3236 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); 3237 if (rc && rc != MDB_NOTFOUND) 3238 return rc; 3239 } 3240 free_pgs = txn->mt_free_pgs; 3241 /* Write to last page of freeDB */ 3242 key.mv_size = sizeof(txn->mt_txnid); 3243 key.mv_data = &txn->mt_txnid; 3244 do { 3245 freecnt = free_pgs[0]; 3246 data.mv_size = MDB_IDL_SIZEOF(free_pgs); 3247 rc = _mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3248 if (rc) 3249 return rc; 3250 /* Retry if mt_free_pgs[] grew during the Put() */ 3251 free_pgs = txn->mt_free_pgs; 3252 } while (freecnt < free_pgs[0]); 3253 mdb_midl_sort(free_pgs); 3254 memcpy(data.mv_data, free_pgs, data.mv_size); 3255 #if (MDB_DEBUG) > 1 3256 { 3257 unsigned int i = free_pgs[0]; 3258 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", 3259 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); 3260 for (; i; i--) 3261 DPRINTF(("IDL %"Z"u", free_pgs[i])); 3262 } 3263 #endif 3264 continue; 3265 } 3266 3267 mop = env->me_pghead; 3268 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; 3269 3270 /* Reserve records for me_pghead[]. Split it if multi-page, 3271 * to avoid searching freeDB for a page range. 
Use keys in 3272 * range [1,me_pglast]: Smaller than txnid of oldest reader. 3273 */ 3274 if (total_room >= mop_len) { 3275 if (total_room == mop_len || --more < 0) 3276 break; 3277 } else if (head_room >= maxfree_1pg && head_id > 1) { 3278 /* Keep current record (overflow page), add a new one */ 3279 head_id--; 3280 head_room = 0; 3281 } 3282 /* (Re)write {key = head_id, IDL length = head_room} */ 3283 total_room -= head_room; 3284 head_room = mop_len - total_room; 3285 if (head_room > maxfree_1pg && head_id > 1) { 3286 /* Overflow multi-page for part of me_pghead */ 3287 head_room /= head_id; /* amortize page sizes */ 3288 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); 3289 } else if (head_room < 0) { 3290 /* Rare case, not bothering to delete this record */ 3291 head_room = 0; 3292 } 3293 key.mv_size = sizeof(head_id); 3294 key.mv_data = &head_id; 3295 data.mv_size = (head_room + 1) * sizeof(pgno_t); 3296 rc = _mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3297 if (rc) 3298 return rc; 3299 /* IDL is initially empty, zero out at least the length */ 3300 pgs = (pgno_t *)data.mv_data; 3301 j = head_room > clean_limit ? head_room : 0; 3302 do { 3303 pgs[j] = 0; 3304 } while (--j >= 0); 3305 total_room += head_room; 3306 } 3307 3308 /* Return loose page numbers to me_pghead, though usually none are 3309 * left at this point. The pages themselves remain in dirty_list. 
3310 */ 3311 if (txn->mt_loose_pgs) { 3312 MDB_page *mp = txn->mt_loose_pgs; 3313 unsigned count = txn->mt_loose_count; 3314 MDB_IDL loose; 3315 /* Room for loose pages + temp IDL with same */ 3316 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) 3317 return rc; 3318 mop = env->me_pghead; 3319 loose = mop + MDB_IDL_ALLOCLEN(mop) - count; 3320 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) 3321 loose[ ++count ] = mp->mp_pgno; 3322 loose[0] = count; 3323 mdb_midl_sort(loose); 3324 mdb_midl_xmerge(mop, loose); 3325 txn->mt_loose_pgs = NULL; 3326 txn->mt_loose_count = 0; 3327 mop_len = mop[0]; 3328 } 3329 3330 /* Fill in the reserved me_pghead records */ 3331 rc = MDB_SUCCESS; 3332 if (mop_len) { 3333 MDB_val key, data; 3334 3335 mop += mop_len; 3336 rc = mdb_cursor_first(&mc, &key, &data); 3337 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { 3338 txnid_t id = *(txnid_t *)key.mv_data; 3339 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; 3340 MDB_ID save; 3341 3342 mdb_tassert(txn, len >= 0 && id <= env->me_pglast); 3343 key.mv_data = &id; 3344 if (len > mop_len) { 3345 len = mop_len; 3346 data.mv_size = (len + 1) * sizeof(MDB_ID); 3347 } 3348 data.mv_data = mop -= len; 3349 save = mop[0]; 3350 mop[0] = len; 3351 rc = _mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); 3352 mop[0] = save; 3353 if (rc || !(mop_len -= len)) 3354 break; 3355 } 3356 } 3357 return rc; 3358 } 3359 3360 /** Flush (some) dirty pages to the map, after clearing their dirty flag. 3361 * @param[in] txn the transaction that's being committed 3362 * @param[in] keep number of initial pages in dirty_list to keep dirty. 3363 * @return 0 on success, non-zero on failure. 
3364 */ 3365 static int 3366 mdb_page_flush(MDB_txn *txn, int keep) 3367 { 3368 MDB_env *env = txn->mt_env; 3369 MDB_ID2L dl = txn->mt_u.dirty_list; 3370 unsigned psize = env->me_psize, j; 3371 int i, pagecount = dl[0].mid, rc; 3372 size_t size = 0, pos = 0; 3373 pgno_t pgno = 0; 3374 MDB_page *dp = NULL; 3375 #ifdef _WIN32 3376 OVERLAPPED ov; 3377 #else 3378 struct iovec iov[MDB_COMMIT_PAGES]; 3379 ssize_t wpos = 0, wsize = 0, wres; 3380 size_t next_pos = 1; /* impossible pos, so pos != next_pos */ 3381 int n = 0; 3382 #endif 3383 3384 j = i = keep; 3385 3386 if (env->me_flags & MDB_WRITEMAP) { 3387 /* Clear dirty flags */ 3388 while (++i <= pagecount) { 3389 dp = dl[i].mptr; 3390 /* Don't flush this page yet */ 3391 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3392 dp->mp_flags &= ~P_KEEP; 3393 dl[++j] = dl[i]; 3394 continue; 3395 } 3396 dp->mp_flags &= ~P_DIRTY; 3397 } 3398 goto done; 3399 } 3400 3401 /* Write the pages */ 3402 for (;;) { 3403 if (++i <= pagecount) { 3404 dp = dl[i].mptr; 3405 /* Don't flush this page yet */ 3406 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3407 dp->mp_flags &= ~P_KEEP; 3408 dl[i].mid = 0; 3409 continue; 3410 } 3411 pgno = dl[i].mid; 3412 /* clear dirty flag */ 3413 dp->mp_flags &= ~P_DIRTY; 3414 pos = pgno * psize; 3415 size = psize; 3416 if (IS_OVERFLOW(dp)) size *= dp->mp_pages; 3417 } 3418 #ifdef _WIN32 3419 else break; 3420 3421 /* Windows actually supports scatter/gather I/O, but only on 3422 * unbuffered file handles. Since we're relying on the OS page 3423 * cache for all our data, that's self-defeating. So we just 3424 * write pages one at a time. We use the ov structure to set 3425 * the write offset, to at least save the overhead of a Seek 3426 * system call. 
3427 */ 3428 DPRINTF(("committing page %"Z"u", pgno)); 3429 memset(&ov, 0, sizeof(ov)); 3430 ov.Offset = pos & 0xffffffff; 3431 ov.OffsetHigh = pos >> 16 >> 16; 3432 if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { 3433 rc = ErrCode(); 3434 DPRINTF(("WriteFile: %d", rc)); 3435 return rc; 3436 } 3437 #else 3438 /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ 3439 if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { 3440 if (n) { 3441 retry_write: 3442 /* Write previous page(s) */ 3443 #ifdef MDB_USE_PWRITEV 3444 wres = pwritev(env->me_fd, iov, n, wpos); 3445 #else 3446 if (n == 1) { 3447 wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); 3448 } else { 3449 retry_seek: 3450 if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { 3451 rc = ErrCode(); 3452 if (rc == EINTR) 3453 goto retry_seek; 3454 DPRINTF(("lseek: %s", strerror(rc))); 3455 return rc; 3456 } 3457 wres = writev(env->me_fd, iov, n); 3458 } 3459 #endif 3460 if (wres != wsize) { 3461 if (wres < 0) { 3462 rc = ErrCode(); 3463 if (rc == EINTR) 3464 goto retry_write; 3465 DPRINTF(("Write error: %s", strerror(rc))); 3466 } else { 3467 rc = EIO; /* TODO: Use which error code? */ 3468 DPUTS("short write, filesystem full?"); 3469 } 3470 return rc; 3471 } 3472 n = 0; 3473 } 3474 if (i > pagecount) 3475 break; 3476 wpos = pos; 3477 wsize = 0; 3478 } 3479 DPRINTF(("committing page %"Z"u", pgno)); 3480 next_pos = pos + size; 3481 iov[n].iov_len = size; 3482 iov[n].iov_base = (char *)dp; 3483 wsize += size; 3484 n++; 3485 #endif /* _WIN32 */ 3486 } 3487 3488 /* MIPS has cache coherency issues, this is a no-op everywhere else 3489 * Note: for any size >= on-chip cache size, entire on-chip cache is 3490 * flushed. 
3491 */ 3492 CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); 3493 3494 for (i = keep; ++i <= pagecount; ) { 3495 dp = dl[i].mptr; 3496 /* This is a page we skipped above */ 3497 if (!dl[i].mid) { 3498 dl[++j] = dl[i]; 3499 dl[j].mid = dp->mp_pgno; 3500 continue; 3501 } 3502 mdb_dpage_free(env, dp); 3503 } 3504 3505 done: 3506 i--; 3507 txn->mt_dirty_room += i - j; 3508 dl[0].mid = j; 3509 return MDB_SUCCESS; 3510 } 3511 3512 static int 3513 _mdb_txn_commit(MDB_txn *txn) 3514 { 3515 int rc; 3516 unsigned int i, end_mode; 3517 MDB_env *env; 3518 3519 if (txn == NULL) 3520 return EINVAL; 3521 3522 /* mdb_txn_end() mode for a commit which writes nothing */ 3523 end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; 3524 3525 if (txn->mt_child) { 3526 rc = _mdb_txn_commit(txn->mt_child); 3527 if (rc) 3528 goto fail; 3529 } 3530 3531 env = txn->mt_env; 3532 3533 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 3534 goto done; 3535 } 3536 3537 if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { 3538 DPUTS("txn has failed/finished, can't commit"); 3539 if (txn->mt_parent) 3540 txn->mt_parent->mt_flags |= MDB_TXN_ERROR; 3541 rc = MDB_BAD_TXN; 3542 goto fail; 3543 } 3544 3545 if (txn->mt_parent) { 3546 MDB_txn *parent = txn->mt_parent; 3547 MDB_page **lp; 3548 MDB_ID2L dst, src; 3549 MDB_IDL pspill; 3550 unsigned x, y, len, ps_len; 3551 3552 /* Append our free list to parent's */ 3553 rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); 3554 if (rc) 3555 goto fail; 3556 mdb_midl_free(txn->mt_free_pgs); 3557 /* Failures after this must either undo the changes 3558 * to the parent or set MDB_TXN_ERROR in the parent. 3559 */ 3560 3561 parent->mt_next_pgno = txn->mt_next_pgno; 3562 parent->mt_flags = txn->mt_flags; 3563 3564 /* Merge our cursors into parent's and close them */ 3565 mdb_cursors_close(txn, 1); 3566 3567 /* Update parent's DB table. 
*/ 3568 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 3569 parent->mt_numdbs = txn->mt_numdbs; 3570 parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; 3571 parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; 3572 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 3573 /* preserve parent's DB_NEW status */ 3574 x = parent->mt_dbflags[i] & DB_NEW; 3575 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; 3576 } 3577 3578 dst = parent->mt_u.dirty_list; 3579 src = txn->mt_u.dirty_list; 3580 /* Remove anything in our dirty list from parent's spill list */ 3581 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { 3582 x = y = ps_len; 3583 pspill[0] = (pgno_t)-1; 3584 /* Mark our dirty pages as deleted in parent spill list */ 3585 for (i=0, len=src[0].mid; ++i <= len; ) { 3586 MDB_ID pn = src[i].mid << 1; 3587 while (pn > pspill[x]) 3588 x--; 3589 if (pn == pspill[x]) { 3590 pspill[x] = 1; 3591 y = --x; 3592 } 3593 } 3594 /* Squash deleted pagenums if we deleted any */ 3595 for (x=y; ++x <= ps_len; ) 3596 if (!(pspill[x] & 1)) 3597 pspill[++y] = pspill[x]; 3598 pspill[0] = y; 3599 } 3600 3601 /* Remove anything in our spill list from parent's dirty list */ 3602 if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { 3603 for (i=1; i<=txn->mt_spill_pgs[0]; i++) { 3604 MDB_ID pn = txn->mt_spill_pgs[i]; 3605 if (pn & 1) 3606 continue; /* deleted spillpg */ 3607 pn >>= 1; 3608 y = mdb_mid2l_search(dst, pn); 3609 if (y <= dst[0].mid && dst[y].mid == pn) { 3610 free(dst[y].mptr); 3611 while (y < dst[0].mid) { 3612 dst[y] = dst[y+1]; 3613 y++; 3614 } 3615 dst[0].mid--; 3616 } 3617 } 3618 } 3619 3620 /* Find len = length of merging our dirty list with parent's */ 3621 x = dst[0].mid; 3622 dst[0].mid = 0; /* simplify loops */ 3623 if (parent->mt_parent) { 3624 len = x + src[0].mid; 3625 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; 3626 for (i = x; y && i; y--) { 3627 pgno_t yp = src[y].mid; 3628 while (yp < dst[i].mid) 3629 i--; 3630 if (yp == 
dst[i].mid) { 3631 i--; 3632 len--; 3633 } 3634 } 3635 } else { /* Simplify the above for single-ancestor case */ 3636 len = MDB_IDL_UM_MAX - txn->mt_dirty_room; 3637 } 3638 /* Merge our dirty list with parent's */ 3639 y = src[0].mid; 3640 for (i = len; y; dst[i--] = src[y--]) { 3641 pgno_t yp = src[y].mid; 3642 while (yp < dst[x].mid) 3643 dst[i--] = dst[x--]; 3644 if (yp == dst[x].mid) 3645 free(dst[x--].mptr); 3646 } 3647 mdb_tassert(txn, i == x); 3648 dst[0].mid = len; 3649 free(txn->mt_u.dirty_list); 3650 parent->mt_dirty_room = txn->mt_dirty_room; 3651 if (txn->mt_spill_pgs) { 3652 if (parent->mt_spill_pgs) { 3653 /* TODO: Prevent failure here, so parent does not fail */ 3654 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); 3655 if (rc) 3656 parent->mt_flags |= MDB_TXN_ERROR; 3657 mdb_midl_free(txn->mt_spill_pgs); 3658 mdb_midl_sort(parent->mt_spill_pgs); 3659 } else { 3660 parent->mt_spill_pgs = txn->mt_spill_pgs; 3661 } 3662 } 3663 3664 /* Append our loose page list to parent's */ 3665 for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) 3666 ; 3667 *lp = txn->mt_loose_pgs; 3668 parent->mt_loose_count += txn->mt_loose_count; 3669 3670 parent->mt_child = NULL; 3671 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); 3672 free(txn); 3673 return rc; 3674 } 3675 3676 if (txn != env->me_txn) { 3677 DPUTS("attempt to commit unknown transaction"); 3678 rc = EINVAL; 3679 goto fail; 3680 } 3681 3682 mdb_cursors_close(txn, 0); 3683 3684 if (!txn->mt_u.dirty_list[0].mid && 3685 !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) 3686 goto done; 3687 3688 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", 3689 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); 3690 3691 /* Update DB root pointers */ 3692 if (txn->mt_numdbs > CORE_DBS) { 3693 MDB_cursor mc; 3694 MDB_dbi i; 3695 MDB_val data; 3696 data.mv_size = sizeof(MDB_db); 3697 3698 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 3699 for (i = 
CORE_DBS; i < txn->mt_numdbs; i++) { 3700 if (txn->mt_dbflags[i] & DB_DIRTY) { 3701 if (TXN_DBI_CHANGED(txn, i)) { 3702 rc = MDB_BAD_DBI; 3703 goto fail; 3704 } 3705 data.mv_data = &txn->mt_dbs[i]; 3706 rc = _mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 3707 F_SUBDATA); 3708 if (rc) 3709 goto fail; 3710 } 3711 } 3712 } 3713 3714 rc = mdb_freelist_save(txn); 3715 if (rc) 3716 goto fail; 3717 3718 mdb_midl_free(env->me_pghead); 3719 env->me_pghead = NULL; 3720 mdb_midl_shrink(&txn->mt_free_pgs); 3721 3722 #if (MDB_DEBUG) > 2 3723 mdb_audit(txn); 3724 #endif 3725 3726 if ((rc = mdb_page_flush(txn, 0)) || 3727 (rc = mdb_env_sync(env, 0)) || 3728 (rc = mdb_env_write_meta(txn))) 3729 goto fail; 3730 end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; 3731 3732 done: 3733 mdb_txn_end(txn, end_mode); 3734 return MDB_SUCCESS; 3735 3736 fail: 3737 _mdb_txn_abort(txn); 3738 return rc; 3739 } 3740 3741 int 3742 mdb_txn_commit(MDB_txn *txn) 3743 { 3744 MDB_TRACE(("%p", txn)); 3745 return _mdb_txn_commit(txn); 3746 } 3747 3748 /** Read the environment parameters of a DB environment before 3749 * mapping it into memory. 3750 * @param[in] env the environment handle 3751 * @param[out] meta address of where to store the meta information 3752 * @return 0 on success, non-zero on failure. 3753 */ 3754 static int ESECT 3755 mdb_env_read_header(MDB_env *env, MDB_meta *meta) 3756 { 3757 MDB_metabuf pbuf; 3758 MDB_page *p; 3759 MDB_meta *m; 3760 int i, rc, off; 3761 enum { Size = sizeof(pbuf) }; 3762 3763 /* We don't know the page size yet, so use a minimum value. 3764 * Read both meta pages so we can use the latest one. 3765 */ 3766 3767 for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { 3768 #ifdef _WIN32 3769 DWORD len; 3770 OVERLAPPED ov; 3771 memset(&ov, 0, sizeof(ov)); 3772 ov.Offset = off; 3773 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? 
(int)len : -1; 3774 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) 3775 rc = 0; 3776 #else 3777 rc = pread(env->me_fd, &pbuf, Size, off); 3778 #endif 3779 if (rc != Size) { 3780 if (rc == 0 && off == 0) 3781 return ENOENT; 3782 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; 3783 DPRINTF(("read: %s", mdb_strerror(rc))); 3784 return rc; 3785 } 3786 3787 p = (MDB_page *)&pbuf; 3788 3789 if (!F_ISSET(p->mp_flags, P_META)) { 3790 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); 3791 return MDB_INVALID; 3792 } 3793 3794 m = METADATA(p); 3795 if (m->mm_magic != MDB_MAGIC) { 3796 DPUTS("meta has invalid magic"); 3797 return MDB_INVALID; 3798 } 3799 3800 if (m->mm_version != MDB_DATA_VERSION) { 3801 DPRINTF(("database is version %u, expected version %u", 3802 m->mm_version, MDB_DATA_VERSION)); 3803 return MDB_VERSION_MISMATCH; 3804 } 3805 3806 if (off == 0 || m->mm_txnid > meta->mm_txnid) 3807 *meta = *m; 3808 } 3809 return 0; 3810 } 3811 3812 /** Fill in most of the zeroed #MDB_meta for an empty database environment */ 3813 static void ESECT 3814 mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) 3815 { 3816 meta->mm_magic = MDB_MAGIC; 3817 meta->mm_version = MDB_DATA_VERSION; 3818 meta->mm_mapsize = env->me_mapsize; 3819 meta->mm_psize = env->me_psize; 3820 meta->mm_last_pg = NUM_METAS-1; 3821 meta->mm_flags = env->me_flags & 0xffff; 3822 meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ 3823 meta->mm_dbs[FREE_DBI].md_root = P_INVALID; 3824 meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; 3825 } 3826 3827 /** Write the environment parameters of a freshly created DB environment. 3828 * @param[in] env the environment handle 3829 * @param[in] meta the #MDB_meta to write 3830 * @return 0 on success, non-zero on failure. 
3831 */ 3832 static int ESECT 3833 mdb_env_init_meta(MDB_env *env, MDB_meta *meta) 3834 { 3835 MDB_page *p, *q; 3836 int rc; 3837 unsigned int psize; 3838 #ifdef _WIN32 3839 DWORD len; 3840 OVERLAPPED ov; 3841 memset(&ov, 0, sizeof(ov)); 3842 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3843 ov.Offset = pos; \ 3844 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) 3845 #else 3846 int len; 3847 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3848 len = pwrite(fd, ptr, size, pos); \ 3849 if (len == -1 && ErrCode() == EINTR) continue; \ 3850 rc = (len >= 0); break; } while(1) 3851 #endif 3852 3853 DPUTS("writing new meta page"); 3854 3855 psize = env->me_psize; 3856 3857 p = calloc(NUM_METAS, psize); 3858 if (!p) 3859 return ENOMEM; 3860 3861 p->mp_pgno = 0; 3862 p->mp_flags = P_META; 3863 *(MDB_meta *)METADATA(p) = *meta; 3864 3865 q = (MDB_page *)((char *)p + psize); 3866 q->mp_pgno = 1; 3867 q->mp_flags = P_META; 3868 *(MDB_meta *)METADATA(q) = *meta; 3869 3870 DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); 3871 if (!rc) 3872 rc = ErrCode(); 3873 else if ((unsigned) len == psize * NUM_METAS) 3874 rc = MDB_SUCCESS; 3875 else 3876 rc = ENOSPC; 3877 free(p); 3878 return rc; 3879 } 3880 3881 /** Update the environment info to commit a transaction. 3882 * @param[in] txn the transaction that's being committed 3883 * @return 0 on success, non-zero on failure. 
*/
static int
mdb_env_write_meta(MDB_txn *txn)
{
	MDB_env *env;
	MDB_meta	meta, metab, *mp;
	unsigned flags;
	size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* The low bit of the txnid selects which of the two meta pages
	 * this commit overwrites.
	 */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %"Z"u",
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = env->me_flags;
	mp = env->me_metas[toggle];
	/* Start from the mapsize recorded in the other meta page */
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Persist any increases of mapsize config */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & MDB_WRITEMAP) {
		/* Writable map: store the fields straight into the mapped
		 * meta page, with mm_txnid written last.
		 */
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \
	!(defined(__i386__) || defined(__x86_64__))
		/* LY: issue a memory barrier, if not x86. ITS#7969 */
		__sync_synchronize();
#endif
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			/* rc temporarily carries the msync mode flag */
			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			/* Step back from the meta struct to the start of its page */
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (MDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				/* "fail" sits inside the write-error branch below */
				goto fail;
			}
		}
		goto done;
	}
	/* Remember the values currently in the target page so they can be
	 * written back if the update fails.
	 */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Only the part of MDB_meta from mm_mapsize to the end is written.
	 * off starts as the offset inside the struct and is then turned
	 * into the file offset of that part of the target meta page.
	 */
	off = offsetof(MDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(MDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
	 * (me_mfd goes to the same file as me_fd, but writing to it
	 * also syncs to disk.  Avoids a separate fdatasync() call.)
	 */
	mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		/* The byte count is stored directly into rc */
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		/* A short write is reported as EIO */
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?");
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2;	/* Silence warnings. We don't care about pwrite's return value */
#endif
fail:
		/* Reached from here and from the msync failure above */
		env->me_flags |= MDB_FATAL_ERROR;
		return rc;
	}
	/* MIPS has cache coherency issues, this is a no-op everywhere else */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Memory ordering issues are irrelevant; since the entire writer
	 * is wrapped by wmutex, all of these changes will become visible
	 * after the wmutex is unlocked. Since the DB is multi-version,
	 * readers will get consistent data regardless of how fresh or
	 * how stale their view of these values is.
	 */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return MDB_SUCCESS;
}

/** Check both meta pages to see which one is newer.
 * The comparison result (0 or 1) is used directly as the index, so
 * meta 1 is chosen only when its txnid is strictly greater.
 * @param[in] env the environment handle
 * @return newest #MDB_meta.
 */
static MDB_meta *
mdb_env_pick_meta(const MDB_env *env)
{
	MDB_meta *const *metas = env->me_metas;
	return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ];
}

/** Allocate an environment handle and set its defaults.
 * The handle is zero-filled, then given the default reader and database
 * counts, invalid file handles, the current process id and the OS page
 * size.
 * @param[out] env receives the new handle
 * @return #MDB_SUCCESS, or ENOMEM if the allocation fails (in which case
 * \b *env is not written).
 */
int ESECT
mdb_env_create(MDB_env **env)
{
	MDB_env *e;

	e = calloc(1, sizeof(MDB_env));
	if (!e)
		return ENOMEM;

	e->me_maxreaders = DEFAULT_READERS;
	e->me_maxdbs = e->me_numdbs = CORE_DBS;
	e->me_fd = INVALID_HANDLE_VALUE;
	e->me_lfd = INVALID_HANDLE_VALUE;
	e->me_mfd = INVALID_HANDLE_VALUE;
#ifdef MDB_USE_POSIX_SEM
	e->me_rmutex = SEM_FAILED;
	e->me_wmutex = SEM_FAILED;
#endif
	e->me_pid = getpid();
	GET_PAGESIZE(e->me_os_psize);
	VGMEMP_CREATE(e,0,0);
	*env = e;
	MDB_TRACE(("%p", e));
	return MDB_SUCCESS;
}

/** Map the data file into memory and locate the two meta pages in the map.
 * @param[in] env the environment handle; me_fd, me_mapsize, me_psize and
 * me_flags are read, me_map and me_metas[] are set
 * @param[in] addr requested map address, or NULL to let the OS choose
 * @return #MDB_SUCCESS, an OS error code, or EBUSY when \b addr was
 * requested but the mapping landed elsewhere.
 */
static int ESECT
mdb_env_map(MDB_env *env, void *addr)
{
	MDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	HANDLE mh;
	LONG sizelo, sizehi;
	size_t msize;

	if (flags & MDB_RDONLY) {
		/* Don't set explicit map size, use whatever exists */
		msize = 0;
		sizelo = 0;
		sizehi = 0;
	} else {
		msize = env->me_mapsize;
		/* Split the size into the two 32-bit halves the API takes */
		sizelo = msize & 0xffffffff;
		sizehi = msize >> 16 >> 16; /* only needed on Win64 */

		/* Windows won't create mappings for zero length files.
		 * and won't map more than the file size.
		 * Just set the maxsize right now.
		 */
		if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
			|| !SetEndOfFile(env->me_fd)
			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0))
			return ErrCode();
	}

	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
		PAGE_READWRITE : PAGE_READONLY,
		sizehi, sizelo, NULL);
	if (!mh)
		return ErrCode();
	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
		FILE_MAP_WRITE : FILE_MAP_READ,
		0, 0, msize, addr);
	/* Capture the error before CloseHandle() can replace it */
	rc = env->me_map ? 0 : ErrCode();
	CloseHandle(mh);
	if (rc)
		return rc;
#else
	int mmap_flags = MAP_SHARED;
	int prot = PROT_READ;
#ifdef MAP_NOSYNC	/* Used on FreeBSD */
	if (flags & MDB_NOSYNC)
		mmap_flags |= MAP_NOSYNC;
#endif
	if (flags & MDB_WRITEMAP) {
		prot |= PROT_WRITE;
		/* With a writable map the file is sized to the full map */
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & MDB_NORDAHEAD) {
		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* POSIX_MADV_RANDOM */
#endif /* MADV_RANDOM */
	}
#endif /* _WIN32 */

	/* Can happen because the address argument to mmap() is just a
	 * hint.  mmap() can pick another, e.g. if the range is in use.
	 * The MAP_FIXED flag would prevent that, but then mmap could
	 * instead unmap existing pages to make room for the new map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY;	/* TODO: Make a new MDB_* error code? */

	/* Meta 0 is in the first page, meta 1 exactly one page later */
	p = (MDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return MDB_SUCCESS;
}

/** Set the size of the memory map.
 * Before the environment is mapped this only records the size. Once it is
 * mapped, the existing map is removed and a new one of the requested size
 * is created.
 * @param[in] env the environment handle
 * @param[in] size the new map size; on an open environment 0 means "use
 * the size recorded in the newest meta page"
 * @return #MDB_SUCCESS, EINVAL if env->me_txn is set, or the error
 * from mdb_env_map().
 */
int ESECT
mdb_env_set_mapsize(MDB_env *env, size_t size)
{
	/* If env is already open, caller is responsible for making
	 * sure there are no active txns.
	 */
	if (env->me_map) {
		int rc;
		MDB_meta *meta;
		void *old;
		if (env->me_txn)
			return EINVAL;
		meta = mdb_env_pick_meta(env);
		if (!size)
			size = meta->mm_mapsize;
		{
			/* Silently round up to minimum if the size is too small */
			size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
			if (size < minsize)
				size = minsize;
		}
		munmap(env->me_map, env->me_mapsize);
		env->me_mapsize = size;
		/* With MDB_FIXEDMAP the new map must land at the old address */
		old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
		rc = mdb_env_map(env, old);
		if (rc)
			return rc;
	}
	env->me_mapsize = size;
	/* me_psize is still 0 until a page size has been established */
	if (env->me_psize)
		env->me_maxpg = env->me_mapsize / env->me_psize;
	MDB_TRACE(("%p, %"Yu"", env, size));
	return MDB_SUCCESS;
}

/** Set the maximum number of named databases.
 * @param[in] env the environment handle
 * @param[in] dbs number of named databases; #CORE_DBS is added on top
 * @return #MDB_SUCCESS, or EINVAL if the environment is already mapped.
 */
int ESECT
mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
{
	if (env->me_map)
		return EINVAL;
	env->me_maxdbs = dbs + CORE_DBS;
	MDB_TRACE(("%p, %u", env, dbs));
	return MDB_SUCCESS;
}

/** Set the maximum number of reader slots.
 * @param[in] env the environment handle
 * @param[in] readers the slot count, at least 1
 * @return #MDB_SUCCESS, or EINVAL if the environment is already mapped
 * or \b readers is 0.
 */
int ESECT
mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
{
	if (env->me_map || readers < 1)
		return EINVAL;
	env->me_maxreaders = readers;
	MDB_TRACE(("%p, %u", env, readers));
	return MDB_SUCCESS;
}

/** Get the maximum number of reader slots.
 * @param[in] env the environment handle
 * @param[out] readers receives the slot count
 * @return #MDB_SUCCESS, or EINVAL if either argument is NULL.
 */
int ESECT
mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
{
	if (!env || !readers)
		return EINVAL;
	*readers = env->me_maxreaders;
	return MDB_SUCCESS;
}

/** Get the size of an open file.
 * @param[in] fd the file handle
 * @param[out] size receives the file size in bytes
 * @return #MDB_SUCCESS or the OS error code.
 */
static int ESECT
mdb_fsize(HANDLE fd, size_t *size)
{
#ifdef _WIN32
	LARGE_INTEGER fsize;

	if (!GetFileSizeEx(fd, &fsize))
		return ErrCode();

	*size = fsize.QuadPart;
#else
	struct stat st;

	if (fstat(fd, &st))
		return ErrCode();

	*size = st.st_size;
#endif
	return MDB_SUCCESS;
}


#ifdef _WIN32
typedef wchar_t	mdb_nchar_t;
# define MDB_NAME(str)	L##str
# define mdb_name_cpy	wcscpy
#else
/** Character type for file names: char on Unix, wchar_t on Windows */
typedef char	mdb_nchar_t;
# define MDB_NAME(str)	str		/**< #mdb_nchar_t[] string literal */
# define mdb_name_cpy	strcpy	/**< Copy name (#mdb_nchar_t string) */
#endif

/** Filename - string of #mdb_nchar_t[] */
typedef struct MDB_name {
	int mn_len;					/**< Length  */
	int mn_alloced;				/**< True if #mn_val was malloced */
	mdb_nchar_t	*mn_val;		/**< Contents */
} MDB_name;

/**
Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ 4242 static const mdb_nchar_t *const mdb_suffixes[2][2] = { 4243 { MDB_NAME("/data.mdb"), MDB_NAME("") }, 4244 { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } 4245 }; 4246 4247 #define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ 4248 4249 /** Set up filename + scratch area for filename suffix, for opening files. 4250 * It should be freed with #mdb_fname_destroy(). 4251 * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. 4252 * 4253 * @param[in] path Pathname for #mdb_env_open(). 4254 * @param[in] envflags Whether a subdir and/or lockfile will be used. 4255 * @param[out] fname Resulting filename, with room for a suffix if necessary. 4256 */ 4257 static int ESECT 4258 mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) 4259 { 4260 int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); 4261 fname->mn_alloced = 0; 4262 #ifdef _WIN32 4263 return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); 4264 #else 4265 fname->mn_len = strlen(path); 4266 if (no_suffix) 4267 fname->mn_val = (char *) path; 4268 else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { 4269 fname->mn_alloced = 1; 4270 strcpy(fname->mn_val, path); 4271 } 4272 else 4273 return ENOMEM; 4274 return MDB_SUCCESS; 4275 #endif 4276 } 4277 4278 /** Destroy \b fname from #mdb_fname_init() */ 4279 #define mdb_fname_destroy(fname) \ 4280 do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) 4281 4282 #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ 4283 # define MDB_CLOEXEC O_CLOEXEC 4284 #else 4285 # define MDB_CLOEXEC 0 4286 #endif 4287 4288 /** File type, access mode etc. for #mdb_fopen() */ 4289 enum mdb_fopen_type { 4290 #ifdef _WIN32 4291 MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS 4292 #else 4293 /* A comment in mdb_fopen() explains some O_* flag choices. 
*/ 4294 MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ 4295 MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ 4296 MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ 4297 MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ 4298 /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits 4299 * distinguish otherwise-equal MDB_O_* constants from each other. 4300 */ 4301 MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, 4302 MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ 4303 #endif 4304 }; 4305 4306 /** Open an LMDB file. 4307 * @param[in] env The LMDB environment. 4308 * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is 4309 * appended if necessary to create the filename, without changing mn_len. 4310 * @param[in] which Determines file type, access mode, etc. 4311 * @param[in] mode The Unix permissions for the file, if we create it. 4312 * @param[out] res Resulting file handle. 4313 * @return 0 on success, non-zero on failure. 4314 */ 4315 static int ESECT 4316 mdb_fopen(const MDB_env *env, MDB_name *fname, 4317 enum mdb_fopen_type which, mdb_mode_t mode, 4318 HANDLE *res) 4319 { 4320 int rc = MDB_SUCCESS; 4321 HANDLE fd; 4322 #ifdef _WIN32 4323 DWORD acc, share, disp, attrs; 4324 #else 4325 int flags; 4326 #endif 4327 4328 if (fname->mn_alloced) /* modifiable copy */ 4329 mdb_name_cpy(fname->mn_val + fname->mn_len, 4330 mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); 4331 4332 /* The directory must already exist. Usually the file need not. 4333 * MDB_O_META requires the file because we already created it using 4334 * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. 4335 * 4336 * With MDB_O_COPY we do not want the OS to cache the writes, since 4337 * the source data is already in the OS cache. 
4338 * 4339 * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) 4340 * to avoid the flock() issues noted under Caveats in lmdb.h. 4341 * Also set it for other filehandles which the user cannot get at 4342 * and close himself, which he may need after fork(). I.e. all but 4343 * me_fd, which programs do use via mdb_env_get_fd(). 4344 */ 4345 4346 #ifdef _WIN32 4347 acc = GENERIC_READ|GENERIC_WRITE; 4348 share = FILE_SHARE_READ|FILE_SHARE_WRITE; 4349 disp = OPEN_ALWAYS; 4350 attrs = FILE_ATTRIBUTE_NORMAL; 4351 switch (which) { 4352 case MDB_O_RDONLY: /* read-only datafile */ 4353 acc = GENERIC_READ; 4354 disp = OPEN_EXISTING; 4355 break; 4356 case MDB_O_META: /* for writing metapages */ 4357 acc = GENERIC_WRITE; 4358 disp = OPEN_EXISTING; 4359 attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; 4360 break; 4361 case MDB_O_COPY: /* mdb_env_copy() & co */ 4362 acc = GENERIC_WRITE; 4363 share = 0; 4364 disp = CREATE_NEW; 4365 attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; 4366 break; 4367 default: break; /* silence gcc -Wswitch (not all enum values handled) */ 4368 } 4369 fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); 4370 #else 4371 fd = open(fname->mn_val, which & MDB_O_MASK, mode); 4372 #endif 4373 4374 if (fd == INVALID_HANDLE_VALUE) 4375 rc = ErrCode(); 4376 #ifndef _WIN32 4377 else { 4378 if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { 4379 /* Set CLOEXEC if we could not pass it to open() */ 4380 if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) 4381 (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); 4382 } 4383 if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { 4384 /* This may require buffer alignment. There is no portable 4385 * way to ask how much, so we require OS pagesize alignment. 4386 */ 4387 # ifdef F_NOCACHE /* __APPLE__ */ 4388 (void) fcntl(fd, F_NOCACHE, 1); 4389 # elif defined O_DIRECT 4390 /* open(...O_DIRECT...) would break on filesystems without 4391 * O_DIRECT support (ITS#7682). 
Try to set it here instead. 4392 */ 4393 if ((flags = fcntl(fd, F_GETFL)) != -1) 4394 (void) fcntl(fd, F_SETFL, flags | O_DIRECT); 4395 # endif 4396 } 4397 } 4398 #endif /* !_WIN32 */ 4399 4400 *res = fd; 4401 return rc; 4402 } 4403 4404 4405 #ifdef BROKEN_FDATASYNC 4406 #include <sys/utsname.h> 4407 #include <sys/vfs.h> 4408 #endif 4409 4410 /** Further setup required for opening an LMDB environment 4411 */ 4412 static int ESECT 4413 mdb_env_open2(MDB_env *env) 4414 { 4415 unsigned int flags = env->me_flags; 4416 int i, newenv = 0, rc; 4417 MDB_meta meta; 4418 4419 #ifdef _WIN32 4420 /* See if we should use QueryLimited */ 4421 rc = GetVersion(); 4422 if ((rc & 0xff) > 5) 4423 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; 4424 else 4425 env->me_pidquery = PROCESS_QUERY_INFORMATION; 4426 #endif /* _WIN32 */ 4427 4428 #ifdef BROKEN_FDATASYNC 4429 /* ext3/ext4 fdatasync is broken on some older Linux kernels. 4430 * https://lkml.org/lkml/2012/9/3/83 4431 * Kernels after 3.6-rc6 are known good. 4432 * https://lkml.org/lkml/2012/9/10/556 4433 * See if the DB is on ext3/ext4, then check for new enough kernel 4434 * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known 4435 * to be patched. 
4436 */ 4437 { 4438 struct statfs st; 4439 fstatfs(env->me_fd, &st); 4440 while (st.f_type == 0xEF53) { 4441 struct utsname uts; 4442 int i; 4443 uname(&uts); 4444 if (uts.release[0] < '3') { 4445 if (!strncmp(uts.release, "2.6.32.", 7)) { 4446 i = atoi(uts.release+7); 4447 if (i >= 60) 4448 break; /* 2.6.32.60 and newer is OK */ 4449 } else if (!strncmp(uts.release, "2.6.34.", 7)) { 4450 i = atoi(uts.release+7); 4451 if (i >= 15) 4452 break; /* 2.6.34.15 and newer is OK */ 4453 } 4454 } else if (uts.release[0] == '3') { 4455 i = atoi(uts.release+2); 4456 if (i > 5) 4457 break; /* 3.6 and newer is OK */ 4458 if (i == 5) { 4459 i = atoi(uts.release+4); 4460 if (i >= 4) 4461 break; /* 3.5.4 and newer is OK */ 4462 } else if (i == 2) { 4463 i = atoi(uts.release+4); 4464 if (i >= 30) 4465 break; /* 3.2.30 and newer is OK */ 4466 } 4467 } else { /* 4.x and newer is OK */ 4468 break; 4469 } 4470 env->me_flags |= MDB_FSYNCONLY; 4471 break; 4472 } 4473 } 4474 #endif 4475 4476 if ((i = mdb_env_read_header(env, &meta)) != 0) { 4477 if (i != ENOENT) 4478 return i; 4479 DPUTS("new mdbenv"); 4480 newenv = 1; 4481 env->me_psize = env->me_os_psize; 4482 if (env->me_psize > MAX_PAGESIZE) 4483 env->me_psize = MAX_PAGESIZE; 4484 memset(&meta, 0, sizeof(meta)); 4485 mdb_env_init_meta0(env, &meta); 4486 meta.mm_mapsize = DEFAULT_MAPSIZE; 4487 } else { 4488 env->me_psize = meta.mm_psize; 4489 } 4490 4491 /* Was a mapsize configured? */ 4492 if (!env->me_mapsize) { 4493 env->me_mapsize = meta.mm_mapsize; 4494 } 4495 { 4496 /* Make sure mapsize >= committed data size. Even when using 4497 * mm_mapsize, which could be broken in old files (ITS#7789). 4498 */ 4499 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; 4500 if (env->me_mapsize < minsize) 4501 env->me_mapsize = minsize; 4502 } 4503 meta.mm_mapsize = env->me_mapsize; 4504 4505 if (newenv && !(flags & MDB_FIXEDMAP)) { 4506 /* mdb_env_map() may grow the datafile. 
Write the metapages 4507 * first, so the file will be valid if initialization fails. 4508 * Except with FIXEDMAP, since we do not yet know mm_address. 4509 * We could fill in mm_address later, but then a different 4510 * program might end up doing that - one with a memory layout 4511 * and map address which does not suit the main program. 4512 */ 4513 rc = mdb_env_init_meta(env, &meta); 4514 if (rc) 4515 return rc; 4516 newenv = 0; 4517 } 4518 4519 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); 4520 if (rc) 4521 return rc; 4522 4523 if (newenv) { 4524 if (flags & MDB_FIXEDMAP) 4525 meta.mm_address = env->me_map; 4526 i = mdb_env_init_meta(env, &meta); 4527 if (i != MDB_SUCCESS) { 4528 return i; 4529 } 4530 } 4531 4532 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; 4533 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) 4534 - sizeof(indx_t); 4535 #if !(MDB_MAXKEYSIZE) 4536 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); 4537 #endif 4538 env->me_maxpg = env->me_mapsize / env->me_psize; 4539 4540 #if MDB_DEBUG 4541 { 4542 MDB_meta *meta = mdb_env_pick_meta(env); 4543 MDB_db *db = &meta->mm_dbs[MAIN_DBI]; 4544 4545 DPRINTF(("opened database version %u, pagesize %u", 4546 meta->mm_version, env->me_psize)); 4547 DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); 4548 DPRINTF(("depth: %u", db->md_depth)); 4549 DPRINTF(("entries: %"Z"u", db->md_entries)); 4550 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); 4551 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); 4552 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); 4553 DPRINTF(("root: %"Z"u", db->md_root)); 4554 } 4555 #endif 4556 4557 return MDB_SUCCESS; 4558 } 4559 4560 4561 /** Release a reader thread's slot in the reader lock table. 4562 * This function is called automatically when a thread exits. 4563 * @param[in] ptr This points to the slot in the reader lock table. 
*/
static void
mdb_env_reader_dest(void *ptr)
{
	MDB_reader *reader = ptr;

	/* The "if" below guards the single assignment that follows it;
	 * on Windows the assignment is unconditional.
	 */
#ifndef _WIN32
	if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */
#endif
		/* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */
		reader->mr_pid = 0;
}

#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 *	necessarily platform and compiler-specific. Windows supports up
 *	to 1088 keys. Let's assume nobody opens more than 64 environments
 *	in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
static int mdb_tls_nkeys;

/** TLS callback: when a thread detaches, release the reader slot stored
 * under each registered key. All other notifications are ignored.
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			if (r) {
				mdb_env_reader_dest(r);
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
/* Place a pointer to the callback in the .CRT$XLB section, using the
 * syntax each compiler requires.
 */
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp  __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* _WIN32 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
#endif

/** Downgrade the exclusive lock on the region back to shared.
 * Also copies the txnid of the newest meta page into the lock region
 * before the lock is changed.
 * @param[in] env the environment handle
 * @param[out] excl set to 0 once the lock is shared; on POSIX set to -1
 * if the downgrade fails
 * @return 0 on success, otherwise the OS error code.
 */
static int ESECT
mdb_env_share_locks(MDB_env *env, int *excl)
{
	int rc = 0;
	MDB_meta *meta = mdb_env_pick_meta(env);

	env->me_txns->mti_txnid = meta->mm_txnid;

#ifdef _WIN32
	{
		OVERLAPPED ov;
		/* First acquire a shared lock. The Unlock will
		 * then release the existing exclusive lock.
		 */
		memset(&ov, 0, sizeof(ov));
		if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
			rc = ErrCode();
		} else {
			UnlockFile(env->me_lfd, 0, 0, 1, 0);
			*excl = 0;
		}
	}
#else
	{
		struct flock lock_info;
		/* The shared lock replaces the existing lock */
		memset((void *)&lock_info, 0, sizeof(lock_info));
		lock_info.l_type = F_RDLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = 0;
		lock_info.l_len = 1;
		/* Retry for as long as the call is interrupted (EINTR) */
		while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
				(rc = ErrCode()) == EINTR) ;
		*excl = rc ? -1 : 0;	/* error may mean we lost the lock */
	}
#endif

	return rc;
}

/** Try to get exclusive lock, otherwise shared.
 *	Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4677 */ 4678 static int ESECT 4679 mdb_env_excl_lock(MDB_env *env, int *excl) 4680 { 4681 int rc = 0; 4682 #ifdef _WIN32 4683 if (LockFile(env->me_lfd, 0, 0, 1, 0)) { 4684 *excl = 1; 4685 } else { 4686 OVERLAPPED ov; 4687 memset(&ov, 0, sizeof(ov)); 4688 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 4689 *excl = 0; 4690 } else { 4691 rc = ErrCode(); 4692 } 4693 } 4694 #else 4695 struct flock lock_info; 4696 memset((void *)&lock_info, 0, sizeof(lock_info)); 4697 lock_info.l_type = F_WRLCK; 4698 lock_info.l_whence = SEEK_SET; 4699 lock_info.l_start = 0; 4700 lock_info.l_len = 1; 4701 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 4702 (rc = ErrCode()) == EINTR) ; 4703 if (!rc) { 4704 *excl = 1; 4705 } else 4706 # ifndef MDB_USE_POSIX_MUTEX 4707 if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ 4708 # endif 4709 { 4710 lock_info.l_type = F_RDLCK; 4711 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && 4712 (rc = ErrCode()) == EINTR) ; 4713 if (rc == 0) 4714 *excl = 0; 4715 } 4716 #endif 4717 return rc; 4718 } 4719 4720 #ifdef MDB_USE_HASH 4721 /* 4722 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code 4723 * 4724 * @(#) $Revision: 5.1 $ 4725 * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ 4726 * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ 4727 * 4728 * http://www.isthe.com/chongo/tech/comp/fnv/index.html 4729 * 4730 *** 4731 * 4732 * Please do not copyright this code. This code is in the public domain. 4733 * 4734 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 4735 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO 4736 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR 4737 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 4738 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 4739 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 4740 * PERFORMANCE OF THIS SOFTWARE. 
4741 * 4742 * By: 4743 * chongo <Landon Curt Noll> /\oo/\ 4744 * http://www.isthe.com/chongo/ 4745 * 4746 * Share and Enjoy! :-) 4747 */ 4748 4749 typedef unsigned long long mdb_hash_t; 4750 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) 4751 4752 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer 4753 * @param[in] val value to hash 4754 * @param[in] hval initial value for hash 4755 * @return 64 bit hash 4756 * 4757 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the 4758 * hval arg on the first call. 4759 */ 4760 static mdb_hash_t 4761 mdb_hash_val(MDB_val *val, mdb_hash_t hval) 4762 { 4763 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ 4764 unsigned char *end = s + val->mv_size; 4765 /* 4766 * FNV-1a hash each octet of the string 4767 */ 4768 while (s < end) { 4769 /* xor the bottom with the current octet */ 4770 hval ^= (mdb_hash_t)*s++; 4771 4772 /* multiply by the 64 bit FNV magic prime mod 2^64 */ 4773 hval += (hval << 1) + (hval << 4) + (hval << 5) + 4774 (hval << 7) + (hval << 8) + (hval << 40); 4775 } 4776 /* return our new hash value */ 4777 return hval; 4778 } 4779 4780 /** Hash the string and output the encoded hash. 4781 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with 4782 * very short name limits. We don't care about the encoding being reversible, 4783 * we just want to preserve as many bits of the input as possible in a 4784 * small printable string. 
* @param[in] val value to hash (see mdb_hash_enc() below)
 * @param[out] encbuf an array of 11 chars to hold the hash
 */
static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";

/** Write \b l as five base-85 digits taken from #mdb_a85, least
 * significant digit first. No terminator is written.
 * @param[in] l the number to encode
 * @param[out] out receives exactly 5 characters
 */
static void ESECT
mdb_pack85(unsigned long l, char *out)
{
	int i;

	for (i=0; i<5; i++) {
		*out++ = mdb_a85[l % 85];
		l /= 85;
	}
}

/** Hash \b val with mdb_hash_val() and encode the result as a
 * 10 character, NUL-terminated string.
 * @param[in] val value to hash
 * @param[out] encbuf an array of 11 chars to hold the hash
 */
static void ESECT
mdb_hash_enc(MDB_val *val, char *encbuf)
{
	mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);

	/* First five characters from h, next five from its upper 32 bits */
	mdb_pack85(h, encbuf);
	mdb_pack85(h>>32, encbuf+5);
	encbuf[10] = '\0';
}
#endif

/** Open and/or initialize the lock region for the environment.
 * @param[in] env The LMDB environment.
 * @param[in] fname Filename + scratch area, from #mdb_fname_init().
 * @param[in] mode The Unix permissions for the file, if we create it.
 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
{
#ifdef _WIN32
#	define MDB_ERRCODE_ROFS	ERROR_WRITE_PROTECT
#else
#	define MDB_ERRCODE_ROFS	EROFS
#endif
	int rc;
	off_t size, rsize;

	rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
	if (rc) {
		/* Omit lockfile if read-only env on read-only filesystem */
		if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
			return MDB_SUCCESS;
		}
		goto fail;
	}

	if (!(env->me_flags & MDB_NOTLS)) {
		rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
		if (rc)
			goto fail;
		env->me_flags |= MDB_ENV_TXKEY;
#ifdef _WIN32
		/* Windows TLS callbacks need help finding their TLS info.
*/ 4846 if (mdb_tls_nkeys >= MAX_TLS_KEYS) { 4847 rc = MDB_TLS_FULL; 4848 goto fail; 4849 } 4850 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; 4851 #endif 4852 } 4853 4854 /* Try to get exclusive lock. If we succeed, then 4855 * nobody is using the lock region and we should initialize it. 4856 */ 4857 if ((rc = mdb_env_excl_lock(env, excl))) goto fail; 4858 4859 #ifdef _WIN32 4860 size = GetFileSize(env->me_lfd, NULL); 4861 #else 4862 size = lseek(env->me_lfd, 0, SEEK_END); 4863 if (size == -1) goto fail_errno; 4864 #endif 4865 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); 4866 if (size < rsize && *excl > 0) { 4867 #ifdef _WIN32 4868 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize 4869 || !SetEndOfFile(env->me_lfd)) 4870 goto fail_errno; 4871 #else 4872 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; 4873 #endif 4874 } else { 4875 rsize = size; 4876 size = rsize - sizeof(MDB_txninfo); 4877 env->me_maxreaders = size/sizeof(MDB_reader) + 1; 4878 } 4879 { 4880 #ifdef _WIN32 4881 HANDLE mh; 4882 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, 4883 0, 0, NULL); 4884 if (!mh) goto fail_errno; 4885 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); 4886 CloseHandle(mh); 4887 if (!env->me_txns) goto fail_errno; 4888 #else 4889 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, 4890 env->me_lfd, 0); 4891 if (m == MAP_FAILED) goto fail_errno; 4892 env->me_txns = m; 4893 #endif 4894 } 4895 if (*excl > 0) { 4896 #ifdef _WIN32 4897 BY_HANDLE_FILE_INFORMATION stbuf; 4898 struct { 4899 DWORD volume; 4900 DWORD nhigh; 4901 DWORD nlow; 4902 } idbuf; 4903 MDB_val val; 4904 char encbuf[11]; 4905 4906 if (!mdb_sec_inited) { 4907 InitializeSecurityDescriptor(&mdb_null_sd, 4908 SECURITY_DESCRIPTOR_REVISION); 4909 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); 4910 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); 4911 mdb_all_sa.bInheritHandle = FALSE; 4912 
mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; 4913 mdb_sec_inited = 1; 4914 } 4915 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; 4916 idbuf.volume = stbuf.dwVolumeSerialNumber; 4917 idbuf.nhigh = stbuf.nFileIndexHigh; 4918 idbuf.nlow = stbuf.nFileIndexLow; 4919 val.mv_data = &idbuf; 4920 val.mv_size = sizeof(idbuf); 4921 mdb_hash_enc(&val, encbuf); 4922 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); 4923 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); 4924 env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); 4925 if (!env->me_rmutex) goto fail_errno; 4926 env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); 4927 if (!env->me_wmutex) goto fail_errno; 4928 #elif defined(MDB_USE_POSIX_SEM) 4929 struct stat stbuf; 4930 struct { 4931 dev_t dev; 4932 ino_t ino; 4933 } idbuf; 4934 MDB_val val; 4935 char encbuf[11]; 4936 4937 #if defined(__NetBSD__) 4938 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ 4939 #endif 4940 if (fstat(env->me_lfd, &stbuf)) goto fail_errno; 4941 idbuf.dev = stbuf.st_dev; 4942 idbuf.ino = stbuf.st_ino; 4943 val.mv_data = &idbuf; 4944 val.mv_size = sizeof(idbuf); 4945 mdb_hash_enc(&val, encbuf); 4946 #ifdef MDB_SHORT_SEMNAMES 4947 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ 4948 #endif 4949 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); 4950 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); 4951 /* Clean up after a previous run, if needed: Try to 4952 * remove both semaphores before doing anything else. 
4953 */ 4954 sem_unlink(env->me_txns->mti_rmname); 4955 sem_unlink(env->me_txns->mti_wmname); 4956 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 4957 O_CREAT|O_EXCL, mode, 1); 4958 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4959 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 4960 O_CREAT|O_EXCL, mode, 1); 4961 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 4962 #else /* MDB_USE_POSIX_MUTEX: */ 4963 pthread_mutexattr_t mattr; 4964 4965 /* Solaris needs this before initing a robust mutex. Otherwise 4966 * it may skip the init and return EBUSY "seems someone already 4967 * inited" or EINVAL "it was inited differently". 4968 */ 4969 memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); 4970 memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); 4971 4972 if ((rc = pthread_mutexattr_init(&mattr))) 4973 goto fail; 4974 4975 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); 4976 #ifdef MDB_ROBUST_SUPPORTED 4977 if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); 4978 #endif 4979 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); 4980 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); 4981 pthread_mutexattr_destroy(&mattr); 4982 if (rc) 4983 goto fail; 4984 #endif /* _WIN32 || MDB_USE_POSIX_SEM */ 4985 4986 env->me_txns->mti_magic = MDB_MAGIC; 4987 env->me_txns->mti_format = MDB_LOCK_FORMAT; 4988 env->me_txns->mti_txnid = 0; 4989 env->me_txns->mti_numreaders = 0; 4990 4991 } else { 4992 if (env->me_txns->mti_magic != MDB_MAGIC) { 4993 DPUTS("lock region has invalid magic"); 4994 rc = MDB_INVALID; 4995 goto fail; 4996 } 4997 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { 4998 DPRINTF(("lock region has format+version 0x%x, expected 0x%x", 4999 env->me_txns->mti_format, MDB_LOCK_FORMAT)); 5000 rc = MDB_VERSION_MISMATCH; 5001 goto fail; 5002 } 5003 rc = ErrCode(); 5004 if (rc && rc != EACCES && rc != EAGAIN) { 5005 goto fail; 5006 } 5007 #ifdef 
_WIN32 5008 env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); 5009 if (!env->me_rmutex) goto fail_errno; 5010 env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); 5011 if (!env->me_wmutex) goto fail_errno; 5012 #elif defined(MDB_USE_POSIX_SEM) 5013 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); 5014 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 5015 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); 5016 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 5017 #endif 5018 } 5019 return MDB_SUCCESS; 5020 5021 fail_errno: 5022 rc = ErrCode(); 5023 fail: 5024 return rc; 5025 } 5026 5027 /** Only a subset of the @ref mdb_env flags can be changed 5028 * at runtime. Changing other flags requires closing the 5029 * environment and re-opening it with the new flags. 5030 */ 5031 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) 5032 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ 5033 MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) 5034 5035 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) 5036 # error "Persistent DB flags & env flags overlap, but both go in mm_flags" 5037 #endif 5038 5039 int ESECT 5040 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) 5041 { 5042 int rc, excl = -1; 5043 MDB_name fname; 5044 5045 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) 5046 return EINVAL; 5047 5048 flags |= env->me_flags; 5049 5050 rc = mdb_fname_init(path, flags, &fname); 5051 if (rc) 5052 return rc; 5053 5054 if (flags & MDB_RDONLY) { 5055 /* silently ignore WRITEMAP when we're only getting read access */ 5056 flags &= ~MDB_WRITEMAP; 5057 } else { 5058 if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && 5059 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) 5060 rc = ENOMEM; 5061 } 5062 env->me_flags = flags |= MDB_ENV_ACTIVE; 5063 if (rc) 5064 goto leave; 5065 5066 env->me_path = 
strdup(path); 5067 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); 5068 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); 5069 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); 5070 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { 5071 rc = ENOMEM; 5072 goto leave; 5073 } 5074 env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ 5075 5076 /* For RDONLY, get lockfile after we know datafile exists */ 5077 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { 5078 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 5079 if (rc) 5080 goto leave; 5081 } 5082 5083 rc = mdb_fopen(env, &fname, 5084 (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, 5085 mode, &env->me_fd); 5086 if (rc) 5087 goto leave; 5088 5089 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { 5090 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 5091 if (rc) 5092 goto leave; 5093 } 5094 5095 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { 5096 if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { 5097 /* Synchronous fd for meta writes. Needed even with 5098 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
5099 */ 5100 rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); 5101 if (rc) 5102 goto leave; 5103 } 5104 DPRINTF(("opened dbenv %p", (void *) env)); 5105 if (excl > 0) { 5106 rc = mdb_env_share_locks(env, &excl); 5107 if (rc) 5108 goto leave; 5109 } 5110 if (!(flags & MDB_RDONLY)) { 5111 MDB_txn *txn; 5112 int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * 5113 (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); 5114 if ((env->me_pbuf = calloc(1, env->me_psize)) && 5115 (txn = calloc(1, size))) 5116 { 5117 txn->mt_dbs = (MDB_db *)((char *)txn + tsize); 5118 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 5119 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); 5120 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); 5121 txn->mt_env = env; 5122 txn->mt_dbxs = env->me_dbxs; 5123 txn->mt_flags = MDB_TXN_FINISHED; 5124 env->me_txn0 = txn; 5125 } else { 5126 rc = ENOMEM; 5127 } 5128 } 5129 } 5130 5131 leave: 5132 MDB_TRACE(("%p, %s, %u, %04o", env, path, flags & (CHANGEABLE|CHANGELESS), mode)); 5133 if (rc) { 5134 mdb_env_close0(env, excl); 5135 } 5136 mdb_fname_destroy(fname); 5137 return rc; 5138 } 5139 5140 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ 5141 static void ESECT 5142 mdb_env_close0(MDB_env *env, int excl) 5143 { 5144 int i; 5145 5146 if (!(env->me_flags & MDB_ENV_ACTIVE)) 5147 return; 5148 5149 /* Doing this here since me_dbxs may not exist during mdb_env_close */ 5150 if (env->me_dbxs) { 5151 for (i = env->me_maxdbs; --i >= CORE_DBS; ) 5152 free(env->me_dbxs[i].md_name.mv_data); 5153 free(env->me_dbxs); 5154 } 5155 5156 free(env->me_pbuf); 5157 free(env->me_dbiseqs); 5158 free(env->me_dbflags); 5159 free(env->me_path); 5160 free(env->me_dirty_list); 5161 free(env->me_txn0); 5162 mdb_midl_free(env->me_free_pgs); 5163 5164 if (env->me_flags & MDB_ENV_TXKEY) { 5165 pthread_key_delete(env->me_txkey); 5166 #ifdef _WIN32 5167 /* Delete our key from 
the global list */ 5168 for (i=0; i<mdb_tls_nkeys; i++) 5169 if (mdb_tls_keys[i] == env->me_txkey) { 5170 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; 5171 mdb_tls_nkeys--; 5172 break; 5173 } 5174 #endif 5175 } 5176 5177 if (env->me_map) { 5178 munmap(env->me_map, env->me_mapsize); 5179 } 5180 if (env->me_mfd != INVALID_HANDLE_VALUE) 5181 (void) close(env->me_mfd); 5182 if (env->me_fd != INVALID_HANDLE_VALUE) 5183 (void) close(env->me_fd); 5184 if (env->me_txns) { 5185 MDB_PID_T pid = getpid(); 5186 /* Clearing readers is done in this function because 5187 * me_txkey with its destructor must be disabled first. 5188 * 5189 * We skip the the reader mutex, so we touch only 5190 * data owned by this process (me_close_readers and 5191 * our readers), and clear each reader atomically. 5192 */ 5193 for (i = env->me_close_readers; --i >= 0; ) 5194 if (env->me_txns->mti_readers[i].mr_pid == pid) 5195 env->me_txns->mti_readers[i].mr_pid = 0; 5196 #ifdef _WIN32 5197 if (env->me_rmutex) { 5198 CloseHandle(env->me_rmutex); 5199 if (env->me_wmutex) CloseHandle(env->me_wmutex); 5200 } 5201 /* Windows automatically destroys the mutexes when 5202 * the last handle closes. 5203 */ 5204 #elif defined(MDB_USE_POSIX_SEM) 5205 if (env->me_rmutex != SEM_FAILED) { 5206 sem_close(env->me_rmutex); 5207 if (env->me_wmutex != SEM_FAILED) 5208 sem_close(env->me_wmutex); 5209 /* If we have the filelock: If we are the 5210 * only remaining user, clean up semaphores. 5211 */ 5212 if (excl == 0) 5213 mdb_env_excl_lock(env, &excl); 5214 if (excl > 0) { 5215 sem_unlink(env->me_txns->mti_rmname); 5216 sem_unlink(env->me_txns->mti_wmname); 5217 } 5218 } 5219 #endif 5220 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); 5221 } 5222 if (env->me_lfd != INVALID_HANDLE_VALUE) { 5223 #ifdef _WIN32 5224 if (excl >= 0) { 5225 /* Unlock the lockfile. Windows would have unlocked it 5226 * after closing anyway, but not necessarily at once. 
5227 */ 5228 UnlockFile(env->me_lfd, 0, 0, 1, 0); 5229 } 5230 #endif 5231 (void) close(env->me_lfd); 5232 } 5233 5234 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); 5235 } 5236 5237 void ESECT 5238 mdb_env_close(MDB_env *env) 5239 { 5240 MDB_page *dp; 5241 5242 if (env == NULL) 5243 return; 5244 5245 MDB_TRACE(("%p", env)); 5246 VGMEMP_DESTROY(env); 5247 while ((dp = env->me_dpages) != NULL) { 5248 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); 5249 env->me_dpages = dp->mp_next; 5250 free(dp); 5251 } 5252 5253 mdb_env_close0(env, 0); 5254 free(env); 5255 } 5256 5257 /** Compare two items pointing at aligned size_t's */ 5258 static int 5259 mdb_cmp_long(const MDB_val *a, const MDB_val *b) 5260 { 5261 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : 5262 *(size_t *)a->mv_data > *(size_t *)b->mv_data; 5263 } 5264 5265 /** Compare two items pointing at aligned unsigned int's. 5266 * 5267 * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, 5268 * but #mdb_cmp_clong() is called instead if the data type is size_t. 5269 */ 5270 static int 5271 mdb_cmp_int(const MDB_val *a, const MDB_val *b) 5272 { 5273 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : 5274 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; 5275 } 5276 5277 /** Compare two items pointing at unsigned ints of unknown alignment. 5278 * Nodes and keys are guaranteed to be 2-byte aligned. 
5279 */ 5280 static int 5281 mdb_cmp_cint(const MDB_val *a, const MDB_val *b) 5282 { 5283 #if BYTE_ORDER == LITTLE_ENDIAN 5284 unsigned short *u, *c; 5285 int x; 5286 5287 u = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5288 c = (unsigned short *) ((char *) b->mv_data + a->mv_size); 5289 do { 5290 x = *--u - *--c; 5291 } while(!x && u > (unsigned short *)a->mv_data); 5292 return x; 5293 #else 5294 unsigned short *u, *c, *end; 5295 int x; 5296 5297 end = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5298 u = (unsigned short *)a->mv_data; 5299 c = (unsigned short *)b->mv_data; 5300 do { 5301 x = *u++ - *c++; 5302 } while(!x && u < end); 5303 return x; 5304 #endif 5305 } 5306 5307 /** Compare two items lexically */ 5308 static int 5309 mdb_cmp_memn(const MDB_val *a, const MDB_val *b) 5310 { 5311 int diff; 5312 ssize_t len_diff; 5313 unsigned int len; 5314 5315 len = a->mv_size; 5316 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5317 if (len_diff > 0) { 5318 len = b->mv_size; 5319 len_diff = 1; 5320 } 5321 5322 diff = memcmp(a->mv_data, b->mv_data, len); 5323 return diff ? diff : len_diff<0 ? -1 : len_diff; 5324 } 5325 5326 /** Compare two items in reverse byte order */ 5327 static int 5328 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) 5329 { 5330 const unsigned char *p1, *p2, *p1_lim; 5331 ssize_t len_diff; 5332 int diff; 5333 5334 p1_lim = (const unsigned char *)a->mv_data; 5335 p1 = (const unsigned char *)a->mv_data + a->mv_size; 5336 p2 = (const unsigned char *)b->mv_data + b->mv_size; 5337 5338 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5339 if (len_diff > 0) { 5340 p1_lim += len_diff; 5341 len_diff = 1; 5342 } 5343 5344 while (p1 > p1_lim) { 5345 diff = *--p1 - *--p2; 5346 if (diff) 5347 return diff; 5348 } 5349 return len_diff<0 ? -1 : len_diff; 5350 } 5351 5352 /** Search for key within a page, using binary search. 5353 * Returns the smallest entry larger or equal to the key. 
5354 * If exactp is non-null, stores whether the found entry was an exact match 5355 * in *exactp (1 or 0). 5356 * Updates the cursor index with the index of the found entry. 5357 * If no entry larger or equal to the key is found, returns NULL. 5358 */ 5359 static MDB_node * 5360 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) 5361 { 5362 unsigned int i = 0, nkeys; 5363 int low, high; 5364 int rc = 0; 5365 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5366 MDB_node *node = NULL; 5367 MDB_val nodekey; 5368 MDB_cmp_func *cmp; 5369 DKBUF; 5370 5371 nkeys = NUMKEYS(mp); 5372 5373 DPRINTF(("searching %u keys in %s %spage %"Z"u", 5374 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", 5375 mdb_dbg_pgno(mp))); 5376 5377 low = IS_LEAF(mp) ? 0 : 1; 5378 high = nkeys - 1; 5379 cmp = mc->mc_dbx->md_cmp; 5380 5381 /* Branch pages have no data, so if using integer keys, 5382 * alignment is guaranteed. Use faster mdb_cmp_int. 5383 */ 5384 if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { 5385 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) 5386 cmp = mdb_cmp_long; 5387 else 5388 cmp = mdb_cmp_int; 5389 } 5390 5391 if (IS_LEAF2(mp)) { 5392 nodekey.mv_size = mc->mc_db->md_pad; 5393 node = NODEPTR(mp, 0); /* fake */ 5394 while (low <= high) { 5395 i = (low + high) >> 1; 5396 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); 5397 rc = cmp(key, &nodekey); 5398 DPRINTF(("found leaf index %u [%s], rc = %i", 5399 i, DKEY(&nodekey), rc)); 5400 if (rc == 0) 5401 break; 5402 if (rc > 0) 5403 low = i + 1; 5404 else 5405 high = i - 1; 5406 } 5407 } else { 5408 while (low <= high) { 5409 i = (low + high) >> 1; 5410 5411 node = NODEPTR(mp, i); 5412 nodekey.mv_size = NODEKSZ(node); 5413 nodekey.mv_data = NODEKEY(node); 5414 5415 rc = cmp(key, &nodekey); 5416 #if MDB_DEBUG 5417 if (IS_LEAF(mp)) 5418 DPRINTF(("found leaf index %u [%s], rc = %i", 5419 i, DKEY(&nodekey), rc)); 5420 else 5421 DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", 5422 i, DKEY(&nodekey), 
NODEPGNO(node), rc)); 5423 #endif 5424 if (rc == 0) 5425 break; 5426 if (rc > 0) 5427 low = i + 1; 5428 else 5429 high = i - 1; 5430 } 5431 } 5432 5433 if (rc > 0) { /* Found entry is less than the key. */ 5434 i++; /* Skip to get the smallest entry larger than key. */ 5435 if (!IS_LEAF2(mp)) 5436 node = NODEPTR(mp, i); 5437 } 5438 if (exactp) 5439 *exactp = (rc == 0 && nkeys > 0); 5440 /* store the key index */ 5441 mc->mc_ki[mc->mc_top] = i; 5442 if (i >= nkeys) 5443 /* There is no entry larger or equal to the key. */ 5444 return NULL; 5445 5446 /* nodeptr is fake for LEAF2 */ 5447 return node; 5448 } 5449 5450 #if 0 5451 static void 5452 mdb_cursor_adjust(MDB_cursor *mc, func) 5453 { 5454 MDB_cursor *m2; 5455 5456 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 5457 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { 5458 func(mc, m2); 5459 } 5460 } 5461 } 5462 #endif 5463 5464 /** Pop a page off the top of the cursor's stack. */ 5465 static void 5466 mdb_cursor_pop(MDB_cursor *mc) 5467 { 5468 if (mc->mc_snum) { 5469 DPRINTF(("popping page %"Z"u off db %d cursor %p", 5470 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); 5471 5472 mc->mc_snum--; 5473 if (mc->mc_snum) { 5474 mc->mc_top--; 5475 } else { 5476 mc->mc_flags &= ~C_INITIALIZED; 5477 } 5478 } 5479 } 5480 5481 /** Push a page onto the top of the cursor's stack. 5482 * Set #MDB_TXN_ERROR on failure. 5483 */ 5484 static int 5485 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) 5486 { 5487 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, 5488 DDBI(mc), (void *) mc)); 5489 5490 if (mc->mc_snum >= CURSOR_STACK) { 5491 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 5492 return MDB_CURSOR_FULL; 5493 } 5494 5495 mc->mc_top = mc->mc_snum++; 5496 mc->mc_pg[mc->mc_top] = mp; 5497 mc->mc_ki[mc->mc_top] = 0; 5498 5499 return MDB_SUCCESS; 5500 } 5501 5502 /** Find the address of the page corresponding to a given page number. 5503 * Set #MDB_TXN_ERROR on failure. 
5504 * @param[in] mc the cursor accessing the page. 5505 * @param[in] pgno the page number for the page to retrieve. 5506 * @param[out] ret address of a pointer where the page's address will be stored. 5507 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. 5508 * @return 0 on success, non-zero on failure. 5509 */ 5510 static int 5511 mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) 5512 { 5513 MDB_txn *txn = mc->mc_txn; 5514 MDB_env *env = txn->mt_env; 5515 MDB_page *p = NULL; 5516 int level; 5517 5518 if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { 5519 MDB_txn *tx2 = txn; 5520 level = 1; 5521 do { 5522 MDB_ID2L dl = tx2->mt_u.dirty_list; 5523 unsigned x; 5524 /* Spilled pages were dirtied in this txn and flushed 5525 * because the dirty list got full. Bring this page 5526 * back in from the map (but don't unspill it here, 5527 * leave that unless page_touch happens again). 5528 */ 5529 if (tx2->mt_spill_pgs) { 5530 MDB_ID pn = pgno << 1; 5531 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 5532 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 5533 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5534 goto done; 5535 } 5536 } 5537 if (dl[0].mid) { 5538 unsigned x = mdb_mid2l_search(dl, pgno); 5539 if (x <= dl[0].mid && dl[x].mid == pgno) { 5540 p = dl[x].mptr; 5541 goto done; 5542 } 5543 } 5544 level++; 5545 } while ((tx2 = tx2->mt_parent) != NULL); 5546 } 5547 5548 if (pgno < txn->mt_next_pgno) { 5549 level = 0; 5550 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5551 } else { 5552 DPRINTF(("page %"Z"u not found", pgno)); 5553 txn->mt_flags |= MDB_TXN_ERROR; 5554 return MDB_PAGE_NOTFOUND; 5555 } 5556 5557 done: 5558 *ret = p; 5559 if (lvl) 5560 *lvl = level; 5561 return MDB_SUCCESS; 5562 } 5563 5564 /** Finish #mdb_page_search() / #mdb_page_search_lowest(). 5565 * The cursor is at the root page, set up the rest of it. 
5566 */ 5567 static int 5568 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) 5569 { 5570 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5571 int rc; 5572 DKBUF; 5573 5574 while (IS_BRANCH(mp)) { 5575 MDB_node *node; 5576 indx_t i; 5577 5578 DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); 5579 /* Don't assert on branch pages in the FreeDB. We can get here 5580 * while in the process of rebalancing a FreeDB branch page; we must 5581 * let that proceed. ITS#8336 5582 */ 5583 mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); 5584 DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); 5585 5586 if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { 5587 i = 0; 5588 if (flags & MDB_PS_LAST) { 5589 i = NUMKEYS(mp) - 1; 5590 /* if already init'd, see if we're already in right place */ 5591 if (mc->mc_flags & C_INITIALIZED) { 5592 if (mc->mc_ki[mc->mc_top] == i) { 5593 mc->mc_top = mc->mc_snum++; 5594 mp = mc->mc_pg[mc->mc_top]; 5595 goto ready; 5596 } 5597 } 5598 } 5599 } else { 5600 int exact; 5601 node = mdb_node_search(mc, key, &exact); 5602 if (node == NULL) 5603 i = NUMKEYS(mp) - 1; 5604 else { 5605 i = mc->mc_ki[mc->mc_top]; 5606 if (!exact) { 5607 mdb_cassert(mc, i > 0); 5608 i--; 5609 } 5610 } 5611 DPRINTF(("following index %u for key [%s]", i, DKEY(key))); 5612 } 5613 5614 mdb_cassert(mc, i < NUMKEYS(mp)); 5615 node = NODEPTR(mp, i); 5616 5617 if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) 5618 return rc; 5619 5620 mc->mc_ki[mc->mc_top] = i; 5621 if ((rc = mdb_cursor_push(mc, mp))) 5622 return rc; 5623 5624 ready: 5625 if (flags & MDB_PS_MODIFY) { 5626 if ((rc = mdb_page_touch(mc)) != 0) 5627 return rc; 5628 mp = mc->mc_pg[mc->mc_top]; 5629 } 5630 } 5631 5632 if (!IS_LEAF(mp)) { 5633 DPRINTF(("internal error, index points to a %02X page!?", 5634 mp->mp_flags)); 5635 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 5636 return MDB_CORRUPTED; 5637 } 5638 5639 DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, 5640 key ? 
DKEY(key) : "null")); 5641 mc->mc_flags |= C_INITIALIZED; 5642 mc->mc_flags &= ~C_EOF; 5643 5644 return MDB_SUCCESS; 5645 } 5646 5647 /** Search for the lowest key under the current branch page. 5648 * This just bypasses a NUMKEYS check in the current page 5649 * before calling mdb_page_search_root(), because the callers 5650 * are all in situations where the current page is known to 5651 * be underfilled. 5652 */ 5653 static int 5654 mdb_page_search_lowest(MDB_cursor *mc) 5655 { 5656 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5657 MDB_node *node = NODEPTR(mp, 0); 5658 int rc; 5659 5660 if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) 5661 return rc; 5662 5663 mc->mc_ki[mc->mc_top] = 0; 5664 if ((rc = mdb_cursor_push(mc, mp))) 5665 return rc; 5666 return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); 5667 } 5668 5669 /** Search for the page a given key should be in. 5670 * Push it and its parent pages on the cursor stack. 5671 * @param[in,out] mc the cursor for this operation. 5672 * @param[in] key the key to search for, or NULL for first/last page. 5673 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB 5674 * are touched (updated with new page numbers). 5675 * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. 5676 * This is used by #mdb_cursor_first() and #mdb_cursor_last(). 5677 * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. 5678 * @return 0 on success, non-zero on failure. 5679 */ 5680 static int 5681 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) 5682 { 5683 int rc; 5684 pgno_t root; 5685 5686 /* Make sure the txn is still viable, then find the root from 5687 * the txn's db table and set it as the root of the cursor's stack. 
5688 */ 5689 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { 5690 DPUTS("transaction may not be used now"); 5691 return MDB_BAD_TXN; 5692 } else { 5693 /* Make sure we're using an up-to-date root */ 5694 if (*mc->mc_dbflag & DB_STALE) { 5695 MDB_cursor mc2; 5696 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) 5697 return MDB_BAD_DBI; 5698 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); 5699 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); 5700 if (rc) 5701 return rc; 5702 { 5703 MDB_val data; 5704 int exact = 0; 5705 uint16_t flags; 5706 MDB_node *leaf = mdb_node_search(&mc2, 5707 &mc->mc_dbx->md_name, &exact); 5708 if (!exact) 5709 return MDB_BAD_DBI; 5710 if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) 5711 return MDB_INCOMPATIBLE; /* not a named DB */ 5712 rc = mdb_node_read(&mc2, leaf, &data); 5713 if (rc) 5714 return rc; 5715 memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), 5716 sizeof(uint16_t)); 5717 /* The txn may not know this DBI, or another process may 5718 * have dropped and recreated the DB with other flags. 5719 */ 5720 if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) 5721 return MDB_INCOMPATIBLE; 5722 memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); 5723 } 5724 *mc->mc_dbflag &= ~DB_STALE; 5725 } 5726 root = mc->mc_db->md_root; 5727 5728 if (root == P_INVALID) { /* Tree is empty. 
*/ 5729 DPUTS("tree is empty"); 5730 return MDB_NOTFOUND; 5731 } 5732 } 5733 5734 mdb_cassert(mc, root > 1); 5735 if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) 5736 if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) 5737 return rc; 5738 5739 mc->mc_snum = 1; 5740 mc->mc_top = 0; 5741 5742 DPRINTF(("db %d root page %"Z"u has flags 0x%X", 5743 DDBI(mc), root, mc->mc_pg[0]->mp_flags)); 5744 5745 if (flags & MDB_PS_MODIFY) { 5746 if ((rc = mdb_page_touch(mc))) 5747 return rc; 5748 } 5749 5750 if (flags & MDB_PS_ROOTONLY) 5751 return MDB_SUCCESS; 5752 5753 return mdb_page_search_root(mc, key, flags); 5754 } 5755 5756 static int 5757 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) 5758 { 5759 MDB_txn *txn = mc->mc_txn; 5760 pgno_t pg = mp->mp_pgno; 5761 unsigned x = 0, ovpages = mp->mp_pages; 5762 MDB_env *env = txn->mt_env; 5763 MDB_IDL sl = txn->mt_spill_pgs; 5764 MDB_ID pn = pg << 1; 5765 int rc; 5766 5767 DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); 5768 /* If the page is dirty or on the spill list we just acquired it, 5769 * so we should give it back to our current free list, if any. 5770 * Otherwise put it onto the list of pages we freed in this txn. 5771 * 5772 * Won't create me_pghead: me_pglast must be inited along with it. 5773 * Unsupported in nested txns: They would need to hide the page 5774 * range in ancestor txns' dirty and spilled lists. 
5775 */ 5776 if (env->me_pghead && 5777 !txn->mt_parent && 5778 ((mp->mp_flags & P_DIRTY) || 5779 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) 5780 { 5781 unsigned i, j; 5782 pgno_t *mop; 5783 MDB_ID2 *dl, ix, iy; 5784 rc = mdb_midl_need(&env->me_pghead, ovpages); 5785 if (rc) 5786 return rc; 5787 if (!(mp->mp_flags & P_DIRTY)) { 5788 /* This page is no longer spilled */ 5789 if (x == sl[0]) 5790 sl[0]--; 5791 else 5792 sl[x] |= 1; 5793 goto release; 5794 } 5795 /* Remove from dirty list */ 5796 dl = txn->mt_u.dirty_list; 5797 x = dl[0].mid--; 5798 for (ix = dl[x]; ix.mptr != mp; ix = iy) { 5799 if (x > 1) { 5800 x--; 5801 iy = dl[x]; 5802 dl[x] = ix; 5803 } else { 5804 mdb_cassert(mc, x > 1); 5805 j = ++(dl[0].mid); 5806 dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ 5807 txn->mt_flags |= MDB_TXN_ERROR; 5808 return MDB_CORRUPTED; 5809 } 5810 } 5811 txn->mt_dirty_room++; 5812 if (!(env->me_flags & MDB_WRITEMAP)) 5813 mdb_dpage_free(env, mp); 5814 release: 5815 /* Insert in me_pghead */ 5816 mop = env->me_pghead; 5817 j = mop[0] + ovpages; 5818 for (i = mop[0]; i && mop[i] < pg; i--) 5819 mop[j--] = mop[i]; 5820 while (j>i) 5821 mop[j--] = pg++; 5822 mop[0] += ovpages; 5823 } else { 5824 rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); 5825 if (rc) 5826 return rc; 5827 } 5828 mc->mc_db->md_overflow_pages -= ovpages; 5829 return 0; 5830 } 5831 5832 /** Return the data associated with a given node. 5833 * @param[in] mc The cursor for this operation. 5834 * @param[in] leaf The node being read. 5835 * @param[out] data Updated to point to the node's data. 5836 * @return 0 on success, non-zero on failure. 
5837 */ 5838 static int 5839 mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) 5840 { 5841 MDB_page *omp; /* overflow page */ 5842 pgno_t pgno; 5843 int rc; 5844 5845 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { 5846 data->mv_size = NODEDSZ(leaf); 5847 data->mv_data = NODEDATA(leaf); 5848 return MDB_SUCCESS; 5849 } 5850 5851 /* Read overflow data. 5852 */ 5853 data->mv_size = NODEDSZ(leaf); 5854 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); 5855 if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { 5856 DPRINTF(("read overflow page %"Z"u failed", pgno)); 5857 return rc; 5858 } 5859 data->mv_data = METADATA(omp); 5860 5861 return MDB_SUCCESS; 5862 } 5863 5864 int 5865 mdb_get(MDB_txn *txn, MDB_dbi dbi, 5866 MDB_val *key, MDB_val *data) 5867 { 5868 MDB_cursor mc; 5869 MDB_xcursor mx; 5870 int exact = 0; 5871 DKBUF; 5872 5873 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); 5874 5875 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 5876 return EINVAL; 5877 5878 if (txn->mt_flags & MDB_TXN_BLOCKED) 5879 return MDB_BAD_TXN; 5880 5881 mdb_cursor_init(&mc, txn, dbi, &mx); 5882 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); 5883 } 5884 5885 /** Find a sibling for a page. 5886 * Replaces the page at the top of the cursor's stack with the 5887 * specified sibling, if one exists. 5888 * @param[in] mc The cursor for this operation. 5889 * @param[in] move_right Non-zero if the right sibling is requested, 5890 * otherwise the left sibling. 5891 * @return 0 on success, non-zero on failure. 5892 */ 5893 static int 5894 mdb_cursor_sibling(MDB_cursor *mc, int move_right) 5895 { 5896 int rc; 5897 MDB_node *indx; 5898 MDB_page *mp; 5899 5900 if (mc->mc_snum < 2) { 5901 return MDB_NOTFOUND; /* root has no siblings */ 5902 } 5903 5904 mdb_cursor_pop(mc); 5905 DPRINTF(("parent page is page %"Z"u, index %u", 5906 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); 5907 5908 if (move_right ? 
(mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) 5909 : (mc->mc_ki[mc->mc_top] == 0)) { 5910 DPRINTF(("no more keys left, moving to %s sibling", 5911 move_right ? "right" : "left")); 5912 if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { 5913 /* undo cursor_pop before returning */ 5914 mc->mc_top++; 5915 mc->mc_snum++; 5916 return rc; 5917 } 5918 } else { 5919 if (move_right) 5920 mc->mc_ki[mc->mc_top]++; 5921 else 5922 mc->mc_ki[mc->mc_top]--; 5923 DPRINTF(("just moving to %s index key %u", 5924 move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); 5925 } 5926 mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); 5927 5928 indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 5929 if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { 5930 /* mc will be inconsistent if caller does mc_snum++ as above */ 5931 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 5932 return rc; 5933 } 5934 5935 mdb_cursor_push(mc, mp); 5936 if (!move_right) 5937 mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; 5938 5939 return MDB_SUCCESS; 5940 } 5941 5942 /** Move the cursor to the next data item. 
*/ 5943 static int 5944 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) 5945 { 5946 MDB_page *mp; 5947 MDB_node *leaf; 5948 int rc; 5949 5950 if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) 5951 return MDB_NOTFOUND; 5952 5953 if (!(mc->mc_flags & C_INITIALIZED)) 5954 return mdb_cursor_first(mc, key, data); 5955 5956 mp = mc->mc_pg[mc->mc_top]; 5957 5958 if (mc->mc_flags & C_EOF) { 5959 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) 5960 return MDB_NOTFOUND; 5961 mc->mc_flags ^= C_EOF; 5962 } 5963 5964 if (mc->mc_db->md_flags & MDB_DUPSORT) { 5965 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5966 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5967 if (op == MDB_NEXT || op == MDB_NEXT_DUP) { 5968 rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); 5969 if (op != MDB_NEXT || rc != MDB_NOTFOUND) { 5970 if (rc == MDB_SUCCESS) 5971 MDB_GET_KEY(leaf, key); 5972 return rc; 5973 } 5974 } 5975 } else { 5976 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5977 if (op == MDB_NEXT_DUP) 5978 return MDB_NOTFOUND; 5979 } 5980 } 5981 5982 DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", 5983 mdb_dbg_pgno(mp), (void *) mc)); 5984 if (mc->mc_flags & C_DEL) { 5985 mc->mc_flags ^= C_DEL; 5986 goto skip; 5987 } 5988 5989 if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { 5990 DPUTS("=====> move to next sibling page"); 5991 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { 5992 mc->mc_flags |= C_EOF; 5993 return rc; 5994 } 5995 mp = mc->mc_pg[mc->mc_top]; 5996 DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); 5997 } else 5998 mc->mc_ki[mc->mc_top]++; 5999 6000 skip: 6001 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", 6002 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); 6003 6004 if (IS_LEAF2(mp)) { 6005 key->mv_size = mc->mc_db->md_pad; 6006 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6007 return MDB_SUCCESS; 6008 } 6009 6010 
mdb_cassert(mc, IS_LEAF(mp)); 6011 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6012 6013 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6014 mdb_xcursor_init1(mc, leaf); 6015 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6016 if (rc != MDB_SUCCESS) 6017 return rc; 6018 } else if (data) { 6019 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6020 return rc; 6021 } 6022 6023 MDB_GET_KEY(leaf, key); 6024 return MDB_SUCCESS; 6025 } 6026 6027 /** Move the cursor to the previous data item. */ 6028 static int 6029 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) 6030 { 6031 MDB_page *mp; 6032 MDB_node *leaf; 6033 int rc; 6034 6035 if (!(mc->mc_flags & C_INITIALIZED)) { 6036 rc = mdb_cursor_last(mc, key, data); 6037 if (rc) 6038 return rc; 6039 mc->mc_ki[mc->mc_top]++; 6040 } 6041 6042 mp = mc->mc_pg[mc->mc_top]; 6043 6044 if ((mc->mc_db->md_flags & MDB_DUPSORT) && 6045 mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { 6046 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6047 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6048 if (op == MDB_PREV || op == MDB_PREV_DUP) { 6049 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); 6050 if (op != MDB_PREV || rc != MDB_NOTFOUND) { 6051 if (rc == MDB_SUCCESS) { 6052 MDB_GET_KEY(leaf, key); 6053 mc->mc_flags &= ~C_EOF; 6054 } 6055 return rc; 6056 } 6057 } 6058 } else { 6059 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6060 if (op == MDB_PREV_DUP) 6061 return MDB_NOTFOUND; 6062 } 6063 } 6064 6065 DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", 6066 mdb_dbg_pgno(mp), (void *) mc)); 6067 6068 mc->mc_flags &= ~(C_EOF|C_DEL); 6069 6070 if (mc->mc_ki[mc->mc_top] == 0) { 6071 DPUTS("=====> move to prev sibling page"); 6072 if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { 6073 return rc; 6074 } 6075 mp = mc->mc_pg[mc->mc_top]; 6076 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; 6077 DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); 
6078 } else 6079 mc->mc_ki[mc->mc_top]--; 6080 6081 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", 6082 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); 6083 6084 if (!IS_LEAF(mp)) 6085 return MDB_CORRUPTED; 6086 6087 if (IS_LEAF2(mp)) { 6088 key->mv_size = mc->mc_db->md_pad; 6089 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6090 return MDB_SUCCESS; 6091 } 6092 6093 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6094 6095 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6096 mdb_xcursor_init1(mc, leaf); 6097 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 6098 if (rc != MDB_SUCCESS) 6099 return rc; 6100 } else if (data) { 6101 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6102 return rc; 6103 } 6104 6105 MDB_GET_KEY(leaf, key); 6106 return MDB_SUCCESS; 6107 } 6108 6109 /** Set the cursor on a specific data item. */ 6110 static int 6111 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6112 MDB_cursor_op op, int *exactp) 6113 { 6114 int rc; 6115 MDB_page *mp; 6116 MDB_node *leaf = NULL; 6117 DKBUF; 6118 6119 if (key->mv_size == 0) 6120 return MDB_BAD_VALSIZE; 6121 6122 if (mc->mc_xcursor) 6123 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6124 6125 /* See if we're already on the right page */ 6126 if (mc->mc_flags & C_INITIALIZED) { 6127 MDB_val nodekey; 6128 6129 mp = mc->mc_pg[mc->mc_top]; 6130 if (!NUMKEYS(mp)) { 6131 mc->mc_ki[mc->mc_top] = 0; 6132 return MDB_NOTFOUND; 6133 } 6134 if (MP_FLAGS(mp) & P_LEAF2) { 6135 nodekey.mv_size = mc->mc_db->md_pad; 6136 nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); 6137 } else { 6138 leaf = NODEPTR(mp, 0); 6139 MDB_GET_KEY2(leaf, nodekey); 6140 } 6141 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6142 if (rc == 0) { 6143 /* Probably happens rarely, but first node on the page 6144 * was the one we wanted. 
6145 */ 6146 mc->mc_ki[mc->mc_top] = 0; 6147 if (exactp) 6148 *exactp = 1; 6149 goto set1; 6150 } 6151 if (rc > 0) { 6152 unsigned int i; 6153 unsigned int nkeys = NUMKEYS(mp); 6154 if (nkeys > 1) { 6155 if (MP_FLAGS(mp) & P_LEAF2) { 6156 nodekey.mv_data = LEAF2KEY(mp, 6157 nkeys-1, nodekey.mv_size); 6158 } else { 6159 leaf = NODEPTR(mp, nkeys-1); 6160 MDB_GET_KEY2(leaf, nodekey); 6161 } 6162 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6163 if (rc == 0) { 6164 /* last node was the one we wanted */ 6165 mc->mc_ki[mc->mc_top] = nkeys-1; 6166 if (exactp) 6167 *exactp = 1; 6168 goto set1; 6169 } 6170 if (rc < 0) { 6171 if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { 6172 /* This is definitely the right page, skip search_page */ 6173 if (MP_FLAGS(mp) & P_LEAF2) { 6174 nodekey.mv_data = LEAF2KEY(mp, 6175 mc->mc_ki[mc->mc_top], nodekey.mv_size); 6176 } else { 6177 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6178 MDB_GET_KEY2(leaf, nodekey); 6179 } 6180 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6181 if (rc == 0) { 6182 /* current node was the one we wanted */ 6183 if (exactp) 6184 *exactp = 1; 6185 goto set1; 6186 } 6187 } 6188 rc = 0; 6189 mc->mc_flags &= ~C_EOF; 6190 goto set2; 6191 } 6192 } 6193 /* If any parents have right-sibs, search. 6194 * Otherwise, there's nothing further. 
6195 */ 6196 for (i=0; i<mc->mc_top; i++) 6197 if (mc->mc_ki[i] < 6198 NUMKEYS(mc->mc_pg[i])-1) 6199 break; 6200 if (i == mc->mc_top) { 6201 /* There are no other pages */ 6202 mc->mc_ki[mc->mc_top] = nkeys; 6203 return MDB_NOTFOUND; 6204 } 6205 } 6206 if (!mc->mc_top) { 6207 /* There are no other pages */ 6208 mc->mc_ki[mc->mc_top] = 0; 6209 if (op == MDB_SET_RANGE && !exactp) { 6210 rc = 0; 6211 goto set1; 6212 } else 6213 return MDB_NOTFOUND; 6214 } 6215 } else { 6216 mc->mc_pg[0] = 0; 6217 } 6218 6219 rc = mdb_page_search(mc, key, 0); 6220 if (rc != MDB_SUCCESS) 6221 return rc; 6222 6223 mp = mc->mc_pg[mc->mc_top]; 6224 mdb_cassert(mc, IS_LEAF(mp)); 6225 6226 set2: 6227 leaf = mdb_node_search(mc, key, exactp); 6228 if (exactp != NULL && !*exactp) { 6229 /* MDB_SET specified and not an exact match. */ 6230 return MDB_NOTFOUND; 6231 } 6232 6233 if (leaf == NULL) { 6234 DPUTS("===> inexact leaf not found, goto sibling"); 6235 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { 6236 mc->mc_flags |= C_EOF; 6237 return rc; /* no entries matched */ 6238 } 6239 mp = mc->mc_pg[mc->mc_top]; 6240 mdb_cassert(mc, IS_LEAF(mp)); 6241 leaf = NODEPTR(mp, 0); 6242 } 6243 6244 set1: 6245 mc->mc_flags |= C_INITIALIZED; 6246 mc->mc_flags &= ~C_EOF; 6247 6248 if (IS_LEAF2(mp)) { 6249 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { 6250 key->mv_size = mc->mc_db->md_pad; 6251 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6252 } 6253 return MDB_SUCCESS; 6254 } 6255 6256 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6257 mdb_xcursor_init1(mc, leaf); 6258 if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { 6259 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6260 } else { 6261 int ex2, *ex2p; 6262 if (op == MDB_GET_BOTH) { 6263 ex2p = &ex2; 6264 ex2 = 0; 6265 } else { 6266 ex2p = NULL; 6267 } 6268 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); 6269 if (rc != MDB_SUCCESS) 6270 return rc; 6271 } 6272 } else 
if (data) { 6273 if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { 6274 MDB_val olddata; 6275 MDB_cmp_func *dcmp; 6276 if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) 6277 return rc; 6278 dcmp = mc->mc_dbx->md_dcmp; 6279 #if UINT_MAX < SIZE_MAX 6280 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6281 dcmp = mdb_cmp_clong; 6282 #endif 6283 rc = dcmp(data, &olddata); 6284 if (rc) { 6285 if (op == MDB_GET_BOTH || rc > 0) 6286 return MDB_NOTFOUND; 6287 rc = 0; 6288 } 6289 *data = olddata; 6290 6291 } else { 6292 if (mc->mc_xcursor) 6293 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6294 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6295 return rc; 6296 } 6297 } 6298 6299 /* The key already matches in all other cases */ 6300 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) 6301 MDB_GET_KEY(leaf, key); 6302 DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); 6303 6304 return rc; 6305 } 6306 6307 /** Move the cursor to the first item in the database. 
*/ 6308 static int 6309 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) 6310 { 6311 int rc; 6312 MDB_node *leaf; 6313 6314 if (mc->mc_xcursor) 6315 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6316 6317 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 6318 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); 6319 if (rc != MDB_SUCCESS) 6320 return rc; 6321 } 6322 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 6323 6324 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); 6325 mc->mc_flags |= C_INITIALIZED; 6326 mc->mc_flags &= ~C_EOF; 6327 6328 mc->mc_ki[mc->mc_top] = 0; 6329 6330 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6331 if ( key ) { 6332 key->mv_size = mc->mc_db->md_pad; 6333 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); 6334 } 6335 return MDB_SUCCESS; 6336 } 6337 6338 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6339 mdb_xcursor_init1(mc, leaf); 6340 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6341 if (rc) 6342 return rc; 6343 } else if (data) { 6344 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6345 return rc; 6346 } 6347 6348 MDB_GET_KEY(leaf, key); 6349 return MDB_SUCCESS; 6350 } 6351 6352 /** Move the cursor to the last item in the database. 
*/ 6353 static int 6354 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) 6355 { 6356 int rc; 6357 MDB_node *leaf; 6358 6359 if (mc->mc_xcursor) 6360 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6361 6362 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 6363 rc = mdb_page_search(mc, NULL, MDB_PS_LAST); 6364 if (rc != MDB_SUCCESS) 6365 return rc; 6366 } 6367 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 6368 6369 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; 6370 mc->mc_flags |= C_INITIALIZED|C_EOF; 6371 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6372 6373 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6374 if (key) { 6375 key->mv_size = mc->mc_db->md_pad; 6376 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); 6377 } 6378 return MDB_SUCCESS; 6379 } 6380 6381 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6382 mdb_xcursor_init1(mc, leaf); 6383 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 6384 if (rc) 6385 return rc; 6386 } else if (data) { 6387 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6388 return rc; 6389 } 6390 6391 MDB_GET_KEY(leaf, key); 6392 return MDB_SUCCESS; 6393 } 6394 6395 int 6396 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6397 MDB_cursor_op op) 6398 { 6399 int rc; 6400 int exact = 0; 6401 int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); 6402 6403 if (mc == NULL) 6404 return EINVAL; 6405 6406 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 6407 return MDB_BAD_TXN; 6408 6409 switch (op) { 6410 case MDB_GET_CURRENT: 6411 if (!(mc->mc_flags & C_INITIALIZED)) { 6412 rc = EINVAL; 6413 } else { 6414 MDB_page *mp = mc->mc_pg[mc->mc_top]; 6415 int nkeys = NUMKEYS(mp); 6416 if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { 6417 mc->mc_ki[mc->mc_top] = nkeys; 6418 rc = MDB_NOTFOUND; 6419 break; 6420 } 6421 rc = MDB_SUCCESS; 6422 if (IS_LEAF2(mp)) { 6423 key->mv_size = mc->mc_db->md_pad; 6424 key->mv_data = LEAF2KEY(mp, 
mc->mc_ki[mc->mc_top], key->mv_size); 6425 } else { 6426 MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6427 MDB_GET_KEY(leaf, key); 6428 if (data) { 6429 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6430 rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); 6431 } else { 6432 rc = mdb_node_read(mc, leaf, data); 6433 } 6434 } 6435 } 6436 } 6437 break; 6438 case MDB_GET_BOTH: 6439 case MDB_GET_BOTH_RANGE: 6440 if (data == NULL) { 6441 rc = EINVAL; 6442 break; 6443 } 6444 if (mc->mc_xcursor == NULL) { 6445 rc = MDB_INCOMPATIBLE; 6446 break; 6447 } 6448 /* FALLTHRU */ 6449 case MDB_SET: 6450 case MDB_SET_KEY: 6451 case MDB_SET_RANGE: 6452 if (key == NULL) { 6453 rc = EINVAL; 6454 } else { 6455 rc = mdb_cursor_set(mc, key, data, op, 6456 op == MDB_SET_RANGE ? NULL : &exact); 6457 } 6458 break; 6459 case MDB_GET_MULTIPLE: 6460 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6461 rc = EINVAL; 6462 break; 6463 } 6464 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6465 rc = MDB_INCOMPATIBLE; 6466 break; 6467 } 6468 rc = MDB_SUCCESS; 6469 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || 6470 (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) 6471 break; 6472 goto fetchm; 6473 case MDB_NEXT_MULTIPLE: 6474 if (data == NULL) { 6475 rc = EINVAL; 6476 break; 6477 } 6478 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6479 rc = MDB_INCOMPATIBLE; 6480 break; 6481 } 6482 rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); 6483 if (rc == MDB_SUCCESS) { 6484 if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 6485 MDB_cursor *mx; 6486 fetchm: 6487 mx = &mc->mc_xcursor->mx_cursor; 6488 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * 6489 mx->mc_db->md_pad; 6490 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); 6491 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; 6492 } else { 6493 rc = MDB_NOTFOUND; 6494 } 6495 } 6496 break; 6497 case MDB_PREV_MULTIPLE: 6498 if (data == NULL) { 6499 rc = EINVAL; 6500 break; 6501 } 6502 if 
(!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6503 rc = MDB_INCOMPATIBLE; 6504 break; 6505 } 6506 if (!(mc->mc_flags & C_INITIALIZED)) 6507 rc = mdb_cursor_last(mc, key, data); 6508 else 6509 rc = MDB_SUCCESS; 6510 if (rc == MDB_SUCCESS) { 6511 MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; 6512 if (mx->mc_flags & C_INITIALIZED) { 6513 rc = mdb_cursor_sibling(mx, 0); 6514 if (rc == MDB_SUCCESS) 6515 goto fetchm; 6516 } else { 6517 rc = MDB_NOTFOUND; 6518 } 6519 } 6520 break; 6521 case MDB_NEXT: 6522 case MDB_NEXT_DUP: 6523 case MDB_NEXT_NODUP: 6524 rc = mdb_cursor_next(mc, key, data, op); 6525 break; 6526 case MDB_PREV: 6527 case MDB_PREV_DUP: 6528 case MDB_PREV_NODUP: 6529 rc = mdb_cursor_prev(mc, key, data, op); 6530 break; 6531 case MDB_FIRST: 6532 rc = mdb_cursor_first(mc, key, data); 6533 break; 6534 case MDB_FIRST_DUP: 6535 mfunc = mdb_cursor_first; 6536 mmove: 6537 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6538 rc = EINVAL; 6539 break; 6540 } 6541 if (mc->mc_xcursor == NULL) { 6542 rc = MDB_INCOMPATIBLE; 6543 break; 6544 } 6545 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { 6546 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); 6547 rc = MDB_NOTFOUND; 6548 break; 6549 } 6550 mc->mc_flags &= ~C_EOF; 6551 { 6552 MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6553 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6554 MDB_GET_KEY(leaf, key); 6555 rc = mdb_node_read(mc, leaf, data); 6556 break; 6557 } 6558 } 6559 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { 6560 rc = EINVAL; 6561 break; 6562 } 6563 rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); 6564 break; 6565 case MDB_LAST: 6566 rc = mdb_cursor_last(mc, key, data); 6567 break; 6568 case MDB_LAST_DUP: 6569 mfunc = mdb_cursor_last; 6570 goto mmove; 6571 default: 6572 DPRINTF(("unhandled/unimplemented cursor operation %u", op)); 6573 rc = EINVAL; 6574 break; 6575 } 6576 6577 if (mc->mc_flags & C_DEL) 6578 mc->mc_flags ^= C_DEL; 6579 6580 return 
rc; 6581 } 6582 6583 /** Touch all the pages in the cursor stack. Set mc_top. 6584 * Makes sure all the pages are writable, before attempting a write operation. 6585 * @param[in] mc The cursor to operate on. 6586 */ 6587 static int 6588 mdb_cursor_touch(MDB_cursor *mc) 6589 { 6590 int rc = MDB_SUCCESS; 6591 6592 if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { 6593 /* Touch DB record of named DB */ 6594 MDB_cursor mc2; 6595 MDB_xcursor mcx; 6596 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) 6597 return MDB_BAD_DBI; 6598 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); 6599 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); 6600 if (rc) 6601 return rc; 6602 *mc->mc_dbflag |= DB_DIRTY; 6603 } 6604 mc->mc_top = 0; 6605 if (mc->mc_snum) { 6606 do { 6607 rc = mdb_page_touch(mc); 6608 } while (!rc && ++(mc->mc_top) < mc->mc_snum); 6609 mc->mc_top = mc->mc_snum-1; 6610 } 6611 return rc; 6612 } 6613 6614 /** Do not spill pages to disk if txn is getting full, may fail instead */ 6615 #define MDB_NOSPILL 0x8000 6616 6617 static int 6618 _mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6619 unsigned int flags) 6620 { 6621 MDB_env *env; 6622 MDB_node *leaf = NULL; 6623 MDB_page *fp, *mp, *sub_root = NULL; 6624 uint16_t fp_flags; 6625 MDB_val xdata, *rdata, dkey, olddata; 6626 MDB_db dummy; 6627 int do_sub = 0, insert_key, insert_data; 6628 unsigned int mcount = 0, dcount = 0, nospill; 6629 size_t nsize; 6630 int rc, rc2; 6631 unsigned int nflags; 6632 DKBUF; 6633 6634 if (mc == NULL || key == NULL) 6635 return EINVAL; 6636 6637 env = mc->mc_txn->mt_env; 6638 6639 /* Check this first so counter will always be zero on any 6640 * early failures. 
6641 */ 6642 if (flags & MDB_MULTIPLE) { 6643 dcount = data[1].mv_size; 6644 data[1].mv_size = 0; 6645 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) 6646 return MDB_INCOMPATIBLE; 6647 } 6648 6649 nospill = flags & MDB_NOSPILL; 6650 flags &= ~MDB_NOSPILL; 6651 6652 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 6653 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 6654 6655 if (key->mv_size-1 >= ENV_MAXKEY(env)) 6656 return MDB_BAD_VALSIZE; 6657 6658 #if SIZE_MAX > MAXDATASIZE 6659 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) 6660 return MDB_BAD_VALSIZE; 6661 #else 6662 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) 6663 return MDB_BAD_VALSIZE; 6664 #endif 6665 6666 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", 6667 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); 6668 6669 dkey.mv_size = 0; 6670 6671 if (flags & MDB_CURRENT) { 6672 if (!(mc->mc_flags & C_INITIALIZED)) 6673 return EINVAL; 6674 rc = MDB_SUCCESS; 6675 } else if (mc->mc_db->md_root == P_INVALID) { 6676 /* new database, cursor has nothing to point to */ 6677 mc->mc_snum = 0; 6678 mc->mc_top = 0; 6679 mc->mc_flags &= ~C_INITIALIZED; 6680 rc = MDB_NO_ROOT; 6681 } else { 6682 int exact = 0; 6683 MDB_val d2; 6684 if (flags & MDB_APPEND) { 6685 MDB_val k2; 6686 rc = mdb_cursor_last(mc, &k2, &d2); 6687 if (rc == 0) { 6688 rc = mc->mc_dbx->md_cmp(key, &k2); 6689 if (rc > 0) { 6690 rc = MDB_NOTFOUND; 6691 mc->mc_ki[mc->mc_top]++; 6692 } else { 6693 /* new key is <= last key */ 6694 rc = MDB_KEYEXIST; 6695 } 6696 } 6697 } else { 6698 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); 6699 } 6700 if ((flags & MDB_NOOVERWRITE) && rc == 0) { 6701 DPRINTF(("duplicate key [%s]", DKEY(key))); 6702 *data = d2; 6703 return MDB_KEYEXIST; 6704 } 6705 if (rc && rc != MDB_NOTFOUND) 6706 return rc; 6707 } 6708 6709 if (mc->mc_flags & C_DEL) 6710 mc->mc_flags ^= C_DEL; 6711 6712 /* 
Cursor is positioned, check for room in the dirty list */ 6713 if (!nospill) { 6714 if (flags & MDB_MULTIPLE) { 6715 rdata = &xdata; 6716 xdata.mv_size = data->mv_size * dcount; 6717 } else { 6718 rdata = data; 6719 } 6720 if ((rc2 = mdb_page_spill(mc, key, rdata))) 6721 return rc2; 6722 } 6723 6724 if (rc == MDB_NO_ROOT) { 6725 MDB_page *np; 6726 /* new database, write a root leaf page */ 6727 DPUTS("allocating new root leaf page"); 6728 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { 6729 return rc2; 6730 } 6731 mdb_cursor_push(mc, np); 6732 mc->mc_db->md_root = np->mp_pgno; 6733 mc->mc_db->md_depth++; 6734 *mc->mc_dbflag |= DB_DIRTY; 6735 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) 6736 == MDB_DUPFIXED) 6737 MP_FLAGS(np) |= P_LEAF2; 6738 mc->mc_flags |= C_INITIALIZED; 6739 } else { 6740 /* make sure all cursor pages are writable */ 6741 rc2 = mdb_cursor_touch(mc); 6742 if (rc2) 6743 return rc2; 6744 } 6745 6746 insert_key = insert_data = rc; 6747 if (insert_key) { 6748 /* The key does not exist */ 6749 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); 6750 if ((mc->mc_db->md_flags & MDB_DUPSORT) && 6751 LEAFSIZE(key, data) > env->me_nodemax) 6752 { 6753 /* Too big for a node, insert in sub-DB. Set up an empty 6754 * "old sub-page" for prep_subDB to expand to a full page. 
6755 */ 6756 fp_flags = P_LEAF|P_DIRTY; 6757 fp = env->me_pbuf; 6758 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ 6759 MP_LOWER(fp) = MP_UPPER(fp) = (PAGEHDRSZ-PAGEBASE); 6760 olddata.mv_size = PAGEHDRSZ; 6761 goto prep_subDB; 6762 } 6763 } else { 6764 /* there's only a key anyway, so this is a no-op */ 6765 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6766 char *ptr; 6767 unsigned int ksize = mc->mc_db->md_pad; 6768 if (key->mv_size != ksize) 6769 return MDB_BAD_VALSIZE; 6770 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); 6771 memcpy(ptr, key->mv_data, ksize); 6772 fix_parent: 6773 /* if overwriting slot 0 of leaf, need to 6774 * update branch key if there is a parent page 6775 */ 6776 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6777 unsigned short dtop = 1; 6778 mc->mc_top--; 6779 /* slot 0 is always an empty key, find real slot */ 6780 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6781 mc->mc_top--; 6782 dtop++; 6783 } 6784 if (mc->mc_ki[mc->mc_top]) 6785 rc2 = mdb_update_key(mc, key); 6786 else 6787 rc2 = MDB_SUCCESS; 6788 mc->mc_top += dtop; 6789 if (rc2) 6790 return rc2; 6791 } 6792 return MDB_SUCCESS; 6793 } 6794 6795 more: 6796 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6797 olddata.mv_size = NODEDSZ(leaf); 6798 olddata.mv_data = NODEDATA(leaf); 6799 6800 /* DB has dups? */ 6801 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { 6802 /* Prepare (sub-)page/sub-DB to accept the new item, 6803 * if needed. fp: old sub-page or a header faking 6804 * it. mp: new (sub-)page. offset: growth in page 6805 * size. xdata: node data with new page or DB. 
6806 */ 6807 unsigned i, offset = 0; 6808 mp = fp = xdata.mv_data = env->me_pbuf; 6809 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; 6810 6811 /* Was a single item before, must convert now */ 6812 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6813 MDB_cmp_func *dcmp; 6814 /* Just overwrite the current item */ 6815 if (flags == MDB_CURRENT) 6816 goto current; 6817 dcmp = mc->mc_dbx->md_dcmp; 6818 #if UINT_MAX < SIZE_MAX 6819 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6820 dcmp = mdb_cmp_clong; 6821 #endif 6822 /* does data match? */ 6823 if (!dcmp(data, &olddata)) { 6824 if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) 6825 return MDB_KEYEXIST; 6826 /* overwrite it */ 6827 goto current; 6828 } 6829 6830 /* Back up original data item */ 6831 dkey.mv_size = olddata.mv_size; 6832 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); 6833 6834 /* Make sub-page header for the dup items, with dummy body */ 6835 MP_FLAGS(fp) = P_LEAF|P_DIRTY|P_SUBP; 6836 MP_LOWER(fp) = (PAGEHDRSZ-PAGEBASE); 6837 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; 6838 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6839 MP_FLAGS(fp) |= P_LEAF2; 6840 fp->mp_pad = data->mv_size; 6841 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ 6842 } else { 6843 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + 6844 (dkey.mv_size & 1) + (data->mv_size & 1); 6845 } 6846 MP_UPPER(fp) = xdata.mv_size - PAGEBASE; 6847 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ 6848 } else if (leaf->mn_flags & F_SUBDATA) { 6849 /* Data is on sub-DB, just store it */ 6850 flags |= F_DUPDATA|F_SUBDATA; 6851 goto put_sub; 6852 } else { 6853 /* Data is on sub-page */ 6854 fp = olddata.mv_data; 6855 switch (flags) { 6856 default: 6857 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6858 offset = EVEN(NODESIZE + sizeof(indx_t) + 6859 data->mv_size); 6860 break; 6861 } 6862 offset = fp->mp_pad; 6863 if (SIZELEFT(fp) < offset) { 6864 offset *= 4; /* space for 4 more */ 6865 break; 
6866 } 6867 /* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */ 6868 case MDB_CURRENT: 6869 MP_FLAGS(fp) |= P_DIRTY; 6870 COPY_PGNO(MP_PGNO(fp), MP_PGNO(mp)); 6871 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 6872 flags |= F_DUPDATA; 6873 goto put_sub; 6874 } 6875 xdata.mv_size = olddata.mv_size + offset; 6876 } 6877 6878 fp_flags = MP_FLAGS(fp); 6879 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { 6880 /* Too big for a sub-page, convert to sub-DB */ 6881 fp_flags &= ~P_SUBP; 6882 prep_subDB: 6883 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6884 fp_flags |= P_LEAF2; 6885 dummy.md_pad = fp->mp_pad; 6886 dummy.md_flags = MDB_DUPFIXED; 6887 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 6888 dummy.md_flags |= MDB_INTEGERKEY; 6889 } else { 6890 dummy.md_pad = 0; 6891 dummy.md_flags = 0; 6892 } 6893 dummy.md_depth = 1; 6894 dummy.md_branch_pages = 0; 6895 dummy.md_leaf_pages = 1; 6896 dummy.md_overflow_pages = 0; 6897 dummy.md_entries = NUMKEYS(fp); 6898 xdata.mv_size = sizeof(MDB_db); 6899 xdata.mv_data = &dummy; 6900 if ((rc = mdb_page_alloc(mc, 1, &mp))) 6901 return rc; 6902 offset = env->me_psize - olddata.mv_size; 6903 flags |= F_DUPDATA|F_SUBDATA; 6904 dummy.md_root = mp->mp_pgno; 6905 sub_root = mp; 6906 } 6907 if (mp != fp) { 6908 MP_FLAGS(mp) = fp_flags | P_DIRTY; 6909 MP_PAD(mp) = MP_PAD(fp); 6910 MP_LOWER(mp) = MP_LOWER(fp); 6911 MP_UPPER(mp) = MP_UPPER(fp) + offset; 6912 if (fp_flags & P_LEAF2) { 6913 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); 6914 } else { 6915 memcpy((char *)mp + MP_UPPER(mp) + PAGEBASE, (char *)fp + MP_UPPER(fp) + PAGEBASE, 6916 olddata.mv_size - MP_UPPER(fp) - PAGEBASE); 6917 memcpy((char *)MP_PTRS(mp), (char *)MP_PTRS(fp), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); 6918 for (i=0; i<NUMKEYS(fp); i++) 6919 mp->mp_ptrs[i] += offset; 6920 } 6921 } 6922 6923 rdata = &xdata; 6924 flags |= F_DUPDATA; 6925 do_sub = 1; 6926 if (!insert_key) 6927 mdb_node_del(mc, 0); 6928 goto new_sub; 6929 } 6930 current: 6931 /* LMDB 
passes F_SUBDATA in 'flags' to write a DB record */ 6932 if ((leaf->mn_flags ^ flags) & F_SUBDATA) 6933 return MDB_INCOMPATIBLE; 6934 /* overflow page overwrites need special handling */ 6935 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { 6936 MDB_page *omp; 6937 pgno_t pg; 6938 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); 6939 6940 memcpy(&pg, olddata.mv_data, sizeof(pg)); 6941 if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) 6942 return rc2; 6943 ovpages = omp->mp_pages; 6944 6945 /* Is the ov page large enough? */ 6946 if (ovpages >= dpages) { 6947 if (!(omp->mp_flags & P_DIRTY) && 6948 (level || (env->me_flags & MDB_WRITEMAP))) 6949 { 6950 rc = mdb_page_unspill(mc->mc_txn, omp, &omp); 6951 if (rc) 6952 return rc; 6953 level = 0; /* dirty in this txn or clean */ 6954 } 6955 /* Is it dirty? */ 6956 if (omp->mp_flags & P_DIRTY) { 6957 /* yes, overwrite it. Note in this case we don't 6958 * bother to try shrinking the page if the new data 6959 * is smaller than the overflow threshold. 6960 */ 6961 if (level > 1) { 6962 /* It is writable only in a parent txn */ 6963 size_t sz = (size_t) env->me_psize * ovpages, off; 6964 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); 6965 MDB_ID2 id2; 6966 if (!np) 6967 return ENOMEM; 6968 id2.mid = pg; 6969 id2.mptr = np; 6970 /* Note - this page is already counted in parent's dirty_room */ 6971 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); 6972 mdb_cassert(mc, rc2 == 0); 6973 /* Currently we make the page look as with put() in the 6974 * parent txn, in case the user peeks at MDB_RESERVEd 6975 * or unused parts. Some users treat ovpages specially. 6976 */ 6977 if (!(flags & MDB_RESERVE)) { 6978 /* Skip the part where LMDB will put *data. 6979 * Copy end of page, adjusting alignment so 6980 * compiler may copy words instead of bytes. 
6981 */ 6982 off = (PAGEHDRSZ + data->mv_size) & -(int)sizeof(size_t); 6983 memcpy((size_t *)((char *)np + off), 6984 (size_t *)((char *)omp + off), sz - off); 6985 sz = PAGEHDRSZ; 6986 } 6987 memcpy(np, omp, sz); /* Copy beginning of page */ 6988 omp = np; 6989 } 6990 SETDSZ(leaf, data->mv_size); 6991 if (F_ISSET(flags, MDB_RESERVE)) 6992 data->mv_data = METADATA(omp); 6993 else 6994 memcpy(METADATA(omp), data->mv_data, data->mv_size); 6995 return MDB_SUCCESS; 6996 } 6997 } 6998 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) 6999 return rc2; 7000 } else if (data->mv_size == olddata.mv_size) { 7001 /* same size, just replace it. Note that we could 7002 * also reuse this node if the new data is smaller, 7003 * but instead we opt to shrink the node in that case. 7004 */ 7005 if (F_ISSET(flags, MDB_RESERVE)) 7006 data->mv_data = olddata.mv_data; 7007 else if (!(mc->mc_flags & C_SUB)) 7008 memcpy(olddata.mv_data, data->mv_data, data->mv_size); 7009 else { 7010 if (key->mv_size != NODEKSZ(leaf)) 7011 goto new_ksize; 7012 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); 7013 goto fix_parent; 7014 } 7015 return MDB_SUCCESS; 7016 } 7017 new_ksize: 7018 mdb_node_del(mc, 0); 7019 } 7020 7021 rdata = data; 7022 7023 new_sub: 7024 nflags = flags & NODE_ADD_FLAGS; 7025 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); 7026 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { 7027 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) 7028 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ 7029 if (!insert_key) 7030 nflags |= MDB_SPLIT_REPLACE; 7031 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); 7032 } else { 7033 /* There is room already in this leaf page. 
*/ 7034 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); 7035 if (rc == 0) { 7036 /* Adjust other cursors pointing to mp */ 7037 MDB_cursor *m2, *m3; 7038 MDB_dbi dbi = mc->mc_dbi; 7039 unsigned i = mc->mc_top; 7040 MDB_page *mp = mc->mc_pg[i]; 7041 7042 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7043 if (mc->mc_flags & C_SUB) 7044 m3 = &m2->mc_xcursor->mx_cursor; 7045 else 7046 m3 = m2; 7047 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; 7048 if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { 7049 m3->mc_ki[i]++; 7050 } 7051 XCURSOR_REFRESH(m3, i, mp); 7052 } 7053 } 7054 } 7055 7056 if (rc == MDB_SUCCESS) { 7057 /* Now store the actual data in the child DB. Note that we're 7058 * storing the user data in the keys field, so there are strict 7059 * size limits on dupdata. The actual data fields of the child 7060 * DB are all zero size. 7061 */ 7062 if (do_sub) { 7063 int xflags, new_dupdata; 7064 size_t ecount; 7065 put_sub: 7066 xdata.mv_size = 0; 7067 xdata.mv_data = ""; 7068 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 7069 if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) { 7070 xflags = MDB_CURRENT|MDB_NOSPILL; 7071 } else { 7072 mdb_xcursor_init1(mc, leaf); 7073 xflags = (flags & MDB_NODUPDATA) ? 
7074 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; 7075 } 7076 if (sub_root) 7077 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; 7078 new_dupdata = (int)dkey.mv_size; 7079 /* converted, write the original data first */ 7080 if (dkey.mv_size) { 7081 rc = _mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); 7082 if (rc) 7083 goto bad_sub; 7084 /* we've done our job */ 7085 dkey.mv_size = 0; 7086 } 7087 if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { 7088 /* Adjust other cursors pointing to mp */ 7089 MDB_cursor *m2; 7090 MDB_xcursor *mx = mc->mc_xcursor; 7091 unsigned i = mc->mc_top; 7092 MDB_page *mp = mc->mc_pg[i]; 7093 7094 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 7095 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 7096 if (!(m2->mc_flags & C_INITIALIZED)) continue; 7097 if (m2->mc_pg[i] == mp) { 7098 if (m2->mc_ki[i] == mc->mc_ki[i]) { 7099 mdb_xcursor_init2(m2, mx, new_dupdata); 7100 } else if (!insert_key) { 7101 XCURSOR_REFRESH(m2, i, mp); 7102 } 7103 } 7104 } 7105 } 7106 ecount = mc->mc_xcursor->mx_db.md_entries; 7107 if (flags & MDB_APPENDDUP) 7108 xflags |= MDB_APPEND; 7109 rc = _mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); 7110 if (flags & F_SUBDATA) { 7111 void *db = NODEDATA(leaf); 7112 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 7113 } 7114 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; 7115 } 7116 /* Increment count unless we just replaced an existing item. */ 7117 if (insert_data) 7118 mc->mc_db->md_entries++; 7119 if (insert_key) { 7120 /* Invalidate txn if we created an empty sub-DB */ 7121 if (rc) 7122 goto bad_sub; 7123 /* If we succeeded and the key didn't exist before, 7124 * make sure the cursor is marked valid. 
*/
			mc->mc_flags |= C_INITIALIZED;
		}
		if (flags & MDB_MULTIPLE) {
			if (!rc) {
				mcount++;
				/* let caller know how many succeeded, if any */
				data[1].mv_size = mcount;
				if (mcount < dcount) {
					/* advance to the next fixed-size item and go around again */
					data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
					insert_key = insert_data = 0;
					goto more;
				}
			}
		}
		return rc;
bad_sub:
		if (rc == MDB_KEYEXIST)	/* should not happen, we deleted that item */
			rc = MDB_CORRUPTED;
	}
	/* Every failure that reaches this point marks the transaction as failed. */
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Store a key/data pair through a cursor (public entry point).
 * Delegates all work to _mdb_cursor_put() and then emits an MDB_TRACE
 * record describing the call.
 * @param[in] mc The cursor to store through.
 * @param[in] key The key; may be NULL as far as the trace arguments are concerned.
 * @param[in,out] data The data; may be NULL as far as the trace arguments are concerned.
 * @param[in] flags Passed through unchanged to _mdb_cursor_put().
 * @return Whatever _mdb_cursor_put() returned.
 */
int
mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
    unsigned int flags)
{
	DKBUF;
	DDBUF;
	int rc = _mdb_cursor_put(mc, key, data, flags);
	/* NOTE(review): the trace arguments read mc->mc_txn whenever data is
	 * non-NULL, after the call has already returned; confirm that mc is
	 * guaranteed non-NULL here whenever tracing is compiled in.
	 */
	MDB_TRACE(("%p, %"Z"u[%s], %"Z"u%s, %u",
		mc, key ? key->mv_size:0, DKEY(key), data ? data->mv_size:0,
			data ? mdb_dval(mc->mc_txn, mc->mc_dbi, data, dbuf):"", flags));
	return rc;
}

/** Delete the item the cursor is positioned on.
 * For a node carrying #F_DUPDATA, either the whole duplicate set is removed
 * (#MDB_NODUPDATA in \b flags) or only the current duplicate, in which case
 * the key itself is removed only once its sub-database becomes empty.
 * @param[in] mc The cursor; must have #C_INITIALIZED set.
 * @param[in] flags #MDB_NODUPDATA to delete all duplicates of the key,
 *	#MDB_NOSPILL to skip the mdb_page_spill() call. The #F_SUBDATA bit is
 *	compared against the node's own flag for non-#F_DUPDATA nodes.
 * @return 0 on success, non-zero on failure. Errors produced directly here:
 * <ul>
 *	<li>EACCES - the transaction is read-only.
 *	<li>MDB_BAD_TXN - the transaction is blocked.
 *	<li>EINVAL - the cursor is not initialized.
 *	<li>MDB_NOTFOUND - the cursor index is past the last key on its page.
 *	<li>MDB_CORRUPTED - the page at the top of the cursor stack is not a leaf.
 *	<li>MDB_INCOMPATIBLE - \b flags and the node disagree about #F_SUBDATA.
 * </ul>
 * Errors from mdb_page_spill(), mdb_cursor_touch(), mdb_drop0(),
 * mdb_page_get(), mdb_ovpage_free() and mdb_cursor_del0() are passed through.
 */
static int
_mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_node	*leaf;
	MDB_page	*mp;
	int rc;

	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

	if (!(mc->mc_flags & C_INITIALIZED))
		return EINVAL;

	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		return MDB_NOTFOUND;

	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
		return rc;

	rc = mdb_cursor_touch(mc);
	if (rc)
		return rc;

	/* Read the page only after the touch call above. */
	mp = mc->mc_pg[mc->mc_top];
	if (!IS_LEAF(mp))
		return MDB_CORRUPTED;
	/* LEAF2 pages are not addressed through NODEPTR; go straight to key removal */
	if (IS_LEAF2(mp))
		goto del_key;
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		if (flags & MDB_NODUPDATA) {
			/* mdb_cursor_del0() will subtract the final entry */
			mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
			mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
		} else {
			/* Without F_SUBDATA the duplicates live inside the node itself:
			 * point the sub-cursor at the node's data before recursing.
			 */
			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
			}
			rc = _mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
			if (rc)
				return rc;
			/* If sub-DB still has entries, we're done */
			if (mc->mc_xcursor->mx_db.md_entries) {
				if (leaf->mn_flags & F_SUBDATA) {
					/* update subDB info */
					void *db = NODEDATA(leaf);
					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
				} else {
					MDB_cursor *m2;
					/* shrink fake page */
					mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
					/* re-read the node and its data after the shrink */
					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
					/* fix other sub-DB cursors pointed at fake pages on this page */
					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
						if (!(m2->mc_flags & C_INITIALIZED)) continue;
						if (m2->mc_pg[mc->mc_top] == mp) {
							XCURSOR_REFRESH(m2, mc->mc_top, mp);
						}
					}
				}
				mc->mc_db->md_entries--;
				return rc;
			} else {
				mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
			}
			/* otherwise fall thru and delete the sub-DB */
		}

		if (leaf->mn_flags & F_SUBDATA) {
			/* add all the child DB's pages to the free list */
			rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
			if (rc)
				goto fail;
		}
	}
	/* LMDB passes F_SUBDATA in 'flags' to delete a DB record */
	else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
		rc = MDB_INCOMPATIBLE;
		goto fail;
	}

	/* add overflow pages to free list */
	if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		MDB_page *omp;
		pgno_t pg;

		/* the node's data area holds the overflow page number */
		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = mdb_page_get(mc, pg, &omp, NULL)) ||
			(rc = mdb_ovpage_free(mc, omp)))
			goto fail;
	}

del_key:
	return mdb_cursor_del0(mc);

fail:
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Delete the item the cursor is positioned on (public entry point).
 * Emits an MDB_TRACE record and delegates to _mdb_cursor_del().
 * @param[in] mc The cursor.
 * @param[in] flags Passed through unchanged to _mdb_cursor_del().
 * @return Whatever _mdb_cursor_del() returned.
 */
int
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_TRACE(("%p, %u",
		mc, flags));
	return _mdb_cursor_del(mc, flags);
}

/** Allocate and initialize new pages for a database.
 * Set #MDB_TXN_ERROR on failure.
 * @param[in] mc a cursor on the database being added to.
 * @param[in] flags flags defining what type of page is being allocated.
 * @param[in] num the number of pages to allocate. This is usually 1,
 * unless allocating overflow pages for a large record.
 * @param[out] mp Address of a page, or NULL on failure.
 * @return 0 on success, non-zero on failure.
7281 */ 7282 static int 7283 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) 7284 { 7285 MDB_page *np; 7286 int rc; 7287 7288 if ((rc = mdb_page_alloc(mc, num, &np))) 7289 return rc; 7290 DPRINTF(("allocated new mpage %"Z"u, page size %u", 7291 np->mp_pgno, mc->mc_txn->mt_env->me_psize)); 7292 np->mp_flags = flags | P_DIRTY; 7293 np->mp_lower = (PAGEHDRSZ-PAGEBASE); 7294 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; 7295 7296 if (IS_BRANCH(np)) 7297 mc->mc_db->md_branch_pages++; 7298 else if (IS_LEAF(np)) 7299 mc->mc_db->md_leaf_pages++; 7300 else if (IS_OVERFLOW(np)) { 7301 mc->mc_db->md_overflow_pages += num; 7302 np->mp_pages = num; 7303 } 7304 *mp = np; 7305 7306 return 0; 7307 } 7308 7309 /** Calculate the size of a leaf node. 7310 * The size depends on the environment's page size; if a data item 7311 * is too large it will be put onto an overflow page and the node 7312 * size will only include the key and not the data. Sizes are always 7313 * rounded up to an even number of bytes, to guarantee 2-byte alignment 7314 * of the #MDB_node headers. 7315 * @param[in] env The environment handle. 7316 * @param[in] key The key for the node. 7317 * @param[in] data The data for the node. 7318 * @return The number of bytes needed to store the node. 7319 */ 7320 static size_t 7321 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) 7322 { 7323 size_t sz; 7324 7325 sz = LEAFSIZE(key, data); 7326 if (sz > env->me_nodemax) { 7327 /* put on overflow page */ 7328 sz -= data->mv_size - sizeof(pgno_t); 7329 } 7330 7331 return EVEN(sz + sizeof(indx_t)); 7332 } 7333 7334 /** Calculate the size of a branch node. 7335 * The size should depend on the environment's page size but since 7336 * we currently don't support spilling large keys onto overflow 7337 * pages, it's simply the size of the #MDB_node header plus the 7338 * size of the key. 
Sizes are always rounded up to an even number
 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
 * @param[in] env The environment handle.
 * @param[in] key The key for the node.
 * @return The number of bytes needed to store the node.
 */
static size_t
mdb_branch_size(MDB_env *env, MDB_val *key)
{
	size_t		 sz;

	sz = INDXSIZE(key);
	/* Placeholder only: the branch below changes nothing, so the
	 * result is independent of env->me_nodemax.
	 */
	if (sz > env->me_nodemax) {
		/* put on overflow page */
		/* not implemented */
		/* sz -= key->size - sizeof(pgno_t); */
	}

	/* account for the node's slot in the page's pointer array */
	return sz + sizeof(indx_t);
}

/** Add a node to the page pointed to by the cursor.
 * Set #MDB_TXN_ERROR on failure.
 * @param[in] mc The cursor for this operation.
 * @param[in] indx The index on the page where the new node should be added.
 * @param[in] key The key for the new node.
 * @param[in] data The data for the new node, if any.
 * @param[in] pgno The page number, if adding a branch node.
 * @param[in] flags Flags for the node.
 * @return 0 on success, non-zero on failure. Possible errors are:
 * <ul>
 *	<li>ENOMEM - failed to allocate overflow pages for the node.
 *	<li>MDB_PAGE_FULL - there is insufficient room in the page. This error
 *	should never happen since all callers already calculate the
 *	page's free space before calling this function.
 * </ul>
 */
static int
mdb_node_add(MDB_cursor *mc, indx_t indx,
    MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int	 i;
	size_t		 node_size = NODESIZE;
	ssize_t		 room;
	indx_t		 ofs;
	MDB_node	*node;
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	MDB_page	*ofp = NULL;		/* overflow page */
	void		*ndata;
	DKBUF;

	mdb_cassert(mc, MP_UPPER(mp) >= MP_LOWER(mp));

	DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
	    IS_LEAF(mp) ? "leaf" : "branch",
		IS_SUBP(mp) ? "sub-" : "",
		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null"));

	/* LEAF2 page: keys are md_pad-byte entries located via LEAF2KEY();
	 * no node header is written and the pointer array is not touched.
	 */
	if (IS_LEAF2(mp)) {
		/* Move higher keys up one slot. */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert new key */
		memcpy(ptr, key->mv_data, ksize);

		/* Just using these for counting */
		MP_LOWER(mp) += sizeof(indx_t);
		MP_UPPER(mp) -= ksize - sizeof(indx_t);
		return MDB_SUCCESS;
	}

	/* Free bytes available to the node once its pointer slot is reserved */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		mdb_cassert(mc, key && data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Put data on overflow page. */
			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
			    data->mv_size, node_size+data->mv_size));
			/* the node itself only stores the overflow page number */
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* check the fit before allocating anything */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
			flags |= F_BIGDATA;
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		MP_PTRS(mp)[i] = MP_PTRS(mp)[i - 1];

	/* Adjust free space offsets. */
	ofs = MP_UPPER(mp) - node_size;
	mdb_cassert(mc, ofs >= MP_LOWER(mp) + sizeof(indx_t));
	MP_PTRS(mp)[indx] = ofs;
	MP_UPPER(mp) = ofs;
	MP_LOWER(mp) += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	/* leaf nodes record the data size, branch nodes the child page number */
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		ndata = NODEDATA(node);
		if (ofp == NULL) {
			/* No overflow page was allocated here. With F_BIGDATA the
			 * caller's data is copied as a page number; with MDB_RESERVE
			 * the caller gets the in-page address back instead of a copy.
			 */
			if (F_ISSET(flags, F_BIGDATA))
				memcpy(ndata, data->mv_data, sizeof(pgno_t));
			else if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		} else {
			/* Node holds the new overflow page's number; the payload
			 * (or the reserved space) lives at METADATA(ofp).
			 */
			memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
			ndata = METADATA(ofp);
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		}
	}

	return MDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
		mdb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %"Z"d", MP_UPPER(mp),MP_LOWER(mp),room));
	DPRINTF(("node size = %"Z"u", node_size));
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return MDB_PAGE_FULL;
}

/** Delete the specified node from a page.
 * The page and index are taken from the top of the cursor's stack; the
 * cursor itself is not modified.
 * @param[in] mc Cursor pointing to the node to delete.
 * @param[in] ksize The size of a node. Only used if the page is
 * part of a #MDB_DUPFIXED database.
 */
static void
mdb_node_del(MDB_cursor *mc, int ksize)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t	indx = mc->mc_ki[mc->mc_top];
	unsigned int	 sz;
	indx_t		 i, j, numkeys, ptr;
	MDB_node	*node;
	char		*base;

	DPRINTF(("delete node %u on %s page %"Z"u", indx,
	    IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	mdb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* x = number of ksize-byte keys that follow the deleted one */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		/* mirror image of the LEAF2 bookkeeping done in mdb_node_add() */
		MP_LOWER(mp) -= sizeof(indx_t);
		MP_UPPER(mp) += ksize - sizeof(indx_t);
		return;
	}

	/* Size of the node being removed: header + key, plus either a page
	 * number (F_BIGDATA) or the data bytes on leaf pages, rounded up to even.
	 */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Compact the pointer array, dropping slot indx. Offsets below the
	 * deleted node's offset are raised by sz to match the memmove below.
	 */
	ptr = MP_PTRS(mp)[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			MP_PTRS(mp)[j] = MP_PTRS(mp)[i];
			if (MP_PTRS(mp)[i] < ptr)
				MP_PTRS(mp)[j] += sz;
			j++;
		}
	}

	/* Slide everything between upper and the deleted node up by sz bytes */
	base = (char *)mp + MP_UPPER(mp) + PAGEBASE;
	memmove(base + sz, base, ptr - MP_UPPER(mp));

	MP_LOWER(mp) -= sizeof(indx_t);
	MP_UPPER(mp) += sz;
}

/** Compact the main page after deleting a node on a subpage.
 * @param[in] mp The main page to operate on.
 * @param[in] indx The index of the subpage on the main page.
*/
static void
mdb_node_shrink(MDB_page *mp, indx_t indx)
{
	MDB_node *node;
	MDB_page *sp, *xp;
	char *base;
	indx_t delta, nsize, len, ptr;
	int i;

	node = NODEPTR(mp, indx);
	sp = (MDB_page *)NODEDATA(node);
	/* delta = unused bytes inside the subpage; nsize = node data size without them */
	delta = SIZELEFT(sp);
	nsize = NODEDSZ(node) - delta;

	/* Prepare to shift upward, set len = length(subpage part to shift) */
	if (IS_LEAF2(sp)) {
		len = nsize;
		if (nsize & 1)
			return;		/* do not make the node uneven-sized */
	} else {
		xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
		/* Write the pointer array at its destination, each offset reduced
		 * by delta. Iterating from the last slot down is deliberate:
		 * source and destination arrays overlap.
		 */
		for (i = NUMKEYS(sp); --i >= 0; )
			MP_PTRS(xp)[i] = MP_PTRS(sp)[i] - delta;
		len = PAGEHDRSZ;
	}
	/* the subpage now has no free space left */
	MP_UPPER(sp) = MP_LOWER(sp);
	COPY_PGNO(MP_PGNO(sp), mp->mp_pgno);
	SETDSZ(node, nsize);

	/* Shift <lower nodes...initial part of subpage> upward */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + delta, base, (char *)sp + len - base);

	/* Every node at or below this node's offset moved up by delta */
	ptr = mp->mp_ptrs[indx];
	for (i = NUMKEYS(mp); --i >= 0; ) {
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}
	mp->mp_upper += delta;
}

/** Initial setup of a sorted-dups cursor.
 * Sorted duplicates are implemented as a sub-database for the given key.
 * The duplicate data items are actually keys of the sub-database.
 * Operations on the duplicate data items are performed using a sub-cursor
 * initialized when the sub-database is first accessed. This function does
 * the preliminary setup of the sub-cursor, filling in the fields that
 * depend only on the parent DB.
 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
7607 */ 7608 static void 7609 mdb_xcursor_init0(MDB_cursor *mc) 7610 { 7611 MDB_xcursor *mx = mc->mc_xcursor; 7612 7613 mx->mx_cursor.mc_xcursor = NULL; 7614 mx->mx_cursor.mc_txn = mc->mc_txn; 7615 mx->mx_cursor.mc_db = &mx->mx_db; 7616 mx->mx_cursor.mc_dbx = &mx->mx_dbx; 7617 mx->mx_cursor.mc_dbi = mc->mc_dbi; 7618 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; 7619 mx->mx_cursor.mc_snum = 0; 7620 mx->mx_cursor.mc_top = 0; 7621 mx->mx_cursor.mc_flags = C_SUB; 7622 mx->mx_dbx.md_name.mv_size = 0; 7623 mx->mx_dbx.md_name.mv_data = NULL; 7624 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; 7625 mx->mx_dbx.md_dcmp = NULL; 7626 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; 7627 } 7628 7629 /** Final setup of a sorted-dups cursor. 7630 * Sets up the fields that depend on the data from the main cursor. 7631 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 7632 * @param[in] node The data containing the #MDB_db record for the 7633 * sorted-dup database. 7634 */ 7635 static void 7636 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) 7637 { 7638 MDB_xcursor *mx = mc->mc_xcursor; 7639 7640 if (node->mn_flags & F_SUBDATA) { 7641 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); 7642 mx->mx_cursor.mc_pg[0] = 0; 7643 mx->mx_cursor.mc_snum = 0; 7644 mx->mx_cursor.mc_top = 0; 7645 mx->mx_cursor.mc_flags = C_SUB; 7646 } else { 7647 MDB_page *fp = NODEDATA(node); 7648 mx->mx_db.md_pad = 0; 7649 mx->mx_db.md_flags = 0; 7650 mx->mx_db.md_depth = 1; 7651 mx->mx_db.md_branch_pages = 0; 7652 mx->mx_db.md_leaf_pages = 1; 7653 mx->mx_db.md_overflow_pages = 0; 7654 mx->mx_db.md_entries = NUMKEYS(fp); 7655 COPY_PGNO(mx->mx_db.md_root, MP_PGNO(fp)); 7656 mx->mx_cursor.mc_snum = 1; 7657 mx->mx_cursor.mc_top = 0; 7658 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; 7659 mx->mx_cursor.mc_pg[0] = fp; 7660 mx->mx_cursor.mc_ki[0] = 0; 7661 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 7662 mx->mx_db.md_flags = MDB_DUPFIXED; 7663 mx->mx_db.md_pad = fp->mp_pad; 7664 if (mc->mc_db->md_flags & 
MDB_INTEGERDUP) 7665 mx->mx_db.md_flags |= MDB_INTEGERKEY; 7666 } 7667 } 7668 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7669 mx->mx_db.md_root)); 7670 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7671 #if UINT_MAX < SIZE_MAX 7672 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) 7673 mx->mx_dbx.md_cmp = mdb_cmp_clong; 7674 #endif 7675 } 7676 7677 7678 /** Fixup a sorted-dups cursor due to underlying update. 7679 * Sets up some fields that depend on the data from the main cursor. 7680 * Almost the same as init1, but skips initialization steps if the 7681 * xcursor had already been used. 7682 * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. 7683 * @param[in] src_mx The xcursor of an up-to-date cursor. 7684 * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. 7685 */ 7686 static void 7687 mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) 7688 { 7689 MDB_xcursor *mx = mc->mc_xcursor; 7690 7691 if (new_dupdata) { 7692 mx->mx_cursor.mc_snum = 1; 7693 mx->mx_cursor.mc_top = 0; 7694 mx->mx_cursor.mc_flags |= C_INITIALIZED; 7695 mx->mx_cursor.mc_ki[0] = 0; 7696 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7697 #if UINT_MAX < SIZE_MAX 7698 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; 7699 #endif 7700 } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { 7701 return; 7702 } 7703 mx->mx_db = src_mx->mx_db; 7704 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; 7705 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7706 mx->mx_db.md_root)); 7707 } 7708 7709 /** Initialize a cursor for a given transaction and database. 
*/ 7710 static void 7711 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) 7712 { 7713 mc->mc_next = NULL; 7714 mc->mc_backup = NULL; 7715 mc->mc_dbi = dbi; 7716 mc->mc_txn = txn; 7717 mc->mc_db = &txn->mt_dbs[dbi]; 7718 mc->mc_dbx = &txn->mt_dbxs[dbi]; 7719 mc->mc_dbflag = &txn->mt_dbflags[dbi]; 7720 mc->mc_snum = 0; 7721 mc->mc_top = 0; 7722 mc->mc_pg[0] = 0; 7723 mc->mc_ki[0] = 0; 7724 mc->mc_flags = 0; 7725 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 7726 mdb_tassert(txn, mx != NULL); 7727 mc->mc_xcursor = mx; 7728 mdb_xcursor_init0(mc); 7729 } else { 7730 mc->mc_xcursor = NULL; 7731 } 7732 if (*mc->mc_dbflag & DB_STALE) { 7733 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); 7734 } 7735 } 7736 7737 int 7738 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) 7739 { 7740 MDB_cursor *mc; 7741 size_t size = sizeof(MDB_cursor); 7742 7743 if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) 7744 return EINVAL; 7745 7746 if (txn->mt_flags & MDB_TXN_BLOCKED) 7747 return MDB_BAD_TXN; 7748 7749 if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 7750 return EINVAL; 7751 7752 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) 7753 size += sizeof(MDB_xcursor); 7754 7755 if ((mc = malloc(size)) != NULL) { 7756 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); 7757 if (txn->mt_cursors) { 7758 mc->mc_next = txn->mt_cursors[dbi]; 7759 txn->mt_cursors[dbi] = mc; 7760 mc->mc_flags |= C_UNTRACK; 7761 } 7762 } else { 7763 return ENOMEM; 7764 } 7765 7766 MDB_TRACE(("%p, %u = %p", txn, dbi, mc)); 7767 *ret = mc; 7768 7769 return MDB_SUCCESS; 7770 } 7771 7772 int 7773 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) 7774 { 7775 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) 7776 return EINVAL; 7777 7778 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) 7779 return EINVAL; 7780 7781 if (txn->mt_flags & MDB_TXN_BLOCKED) 7782 return MDB_BAD_TXN; 7783 7784 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); 7785 return MDB_SUCCESS; 7786 } 
7787 7788 /* Return the count of duplicate data items for the current key */ 7789 int 7790 mdb_cursor_count(MDB_cursor *mc, size_t *countp) 7791 { 7792 MDB_node *leaf; 7793 7794 if (mc == NULL || countp == NULL) 7795 return EINVAL; 7796 7797 if (mc->mc_xcursor == NULL) 7798 return MDB_INCOMPATIBLE; 7799 7800 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 7801 return MDB_BAD_TXN; 7802 7803 if (!(mc->mc_flags & C_INITIALIZED)) 7804 return EINVAL; 7805 7806 if (!mc->mc_snum) 7807 return MDB_NOTFOUND; 7808 7809 if (mc->mc_flags & C_EOF) { 7810 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) 7811 return MDB_NOTFOUND; 7812 mc->mc_flags ^= C_EOF; 7813 } 7814 7815 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 7816 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 7817 *countp = 1; 7818 } else { 7819 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 7820 return EINVAL; 7821 7822 *countp = mc->mc_xcursor->mx_db.md_entries; 7823 } 7824 return MDB_SUCCESS; 7825 } 7826 7827 void 7828 mdb_cursor_close(MDB_cursor *mc) 7829 { 7830 MDB_TRACE(("%p", mc)); 7831 if (mc && !mc->mc_backup) { 7832 /* remove from txn, if tracked */ 7833 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { 7834 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; 7835 while (*prev && *prev != mc) prev = &(*prev)->mc_next; 7836 if (*prev == mc) 7837 *prev = mc->mc_next; 7838 } 7839 free(mc); 7840 } 7841 } 7842 7843 MDB_txn * 7844 mdb_cursor_txn(MDB_cursor *mc) 7845 { 7846 if (!mc) return NULL; 7847 return mc->mc_txn; 7848 } 7849 7850 MDB_dbi 7851 mdb_cursor_dbi(MDB_cursor *mc) 7852 { 7853 return mc->mc_dbi; 7854 } 7855 7856 /** Replace the key for a branch node with a new key. 7857 * Set #MDB_TXN_ERROR on failure. 7858 * @param[in] mc Cursor pointing to the node to operate on. 7859 * @param[in] key The new key to use. 7860 * @return 0 on success, non-zero on failure. 
*/
static int
mdb_update_key(MDB_cursor *mc, MDB_val *key)
{
	MDB_page		*mp;
	MDB_node		*node;
	char			*base;
	size_t			 len;
	int				 delta, ksize, oksize;
	indx_t			 ptr, i, numkeys, indx;
	DKBUF;

	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	ptr = mp->mp_ptrs[indx];
#if MDB_DEBUG
	{
		MDB_val	k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
			indx, ptr,
			mdb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Sizes must be 2-byte aligned. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	delta = ksize - oksize;

	/* Shift node contents if EVEN(key length) changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* not enough space left, do a delete and split */
			DPRINTF(("Not enough room, delta = %d, splitting...", delta));
			/* save the child page number before the node is deleted */
			pgno = NODEPGNO(node);
			mdb_node_del(mc, 0);
			return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
		}

		/* This node and every node at a lower offset move by delta
		 * (down when the key grows, up when it shrinks).
		 */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Move the bytes from upper through this node's header;
		 * the key bytes themselves are rewritten below.
		 */
		base = (char *)mp + mp->mp_upper + PAGEBASE;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* the node has moved; look it up again */
		node = NODEPTR(mp, indx);
	}

	/* But even if no shift was needed, update ksize */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return MDB_SUCCESS;
}

/* Forward declaration; the definition is not in this part of the file. */
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);

/** Perform \b act while tracking temporary cursor \b mn.
 * The cursor is pushed onto the head of its transaction's cursor list for
 * its dbi before \b act runs and popped again afterwards. When \b mn has
 * #C_SUB set, a dummy main cursor is pushed instead, with its mc_xcursor
 * pointing at \b mn.
 * NOTE(review): \b mn appears unparenthesized in mt_cursors[mn.mc_dbi] and
 * the body declares locals named dummy, tracked and tp; the uses in this
 * part of the file pass a plain variable named mn.
 */
#define WITH_CURSOR_TRACKING(mn, act) do { \
	MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \
	if ((mn).mc_flags & C_SUB) { \
		dummy.mc_flags =  C_INITIALIZED; \
		dummy.mc_xcursor = (MDB_xcursor *)&(mn);	\
		tracked = &dummy; \
	} else { \
		tracked = &(mn); \
	} \
	tracked->mc_next = *tp; \
	*tp = tracked; \
	{ act; } \
	*tp = tracked->mc_next; \
} while (0)

/** Move a node from csrc to cdst.
 * The node at the top of \b csrc is added to the page at the top of \b cdst
 * (at cdst's current index) and then deleted from the source page. Other
 * cursors in the transaction's list for this dbi that point at either page
 * are adjusted, and the parent separator keys of both pages are refreshed
 * when their first entry changed.
 * @param[in] csrc Cursor pointing to the node to move.
 * @param[in] cdst Cursor pointing to the slot that receives the node.
 * @param[in] fromleft Non-zero selects the "adding on the left" cursor
 *	fix-up, zero the "adding on the right" one.
 * @return 0 on success, non-zero on failure (errors from mdb_page_touch(),
 *	mdb_page_search_lowest(), mdb_update_key() and mdb_node_add() are
 *	passed through).
 */
static int
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
{
	MDB_node		*srcnode;
	MDB_val		 key, data;
	pgno_t	srcpg;
	MDB_cursor mn;
	int			 rc;
	unsigned short flags;

	DKBUF;

	/* Mark src and dst as dirty. */
	if ((rc = mdb_page_touch(csrc)) ||
	    (rc = mdb_page_touch(cdst)))
		return rc;

	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* fixed-size key, no data, no node header */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
		data.mv_size = 0;
		data.mv_data = NULL;
		srcpg = 0;
		flags = 0;
	} else {
		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
		mdb_cassert(csrc, !((size_t)srcnode & 1));
		srcpg = NODEPGNO(srcnode);
		flags = srcnode->mn_flags;
		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			unsigned int snum = csrc->mc_snum;
			MDB_node *s2;
			/* must find the lowest key below src */
			rc = mdb_page_search_lowest(csrc);
			if (rc)
				return rc;
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_size = csrc->mc_db->md_pad;
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(s2);
				key.mv_data = NODEKEY(s2);
			}
			/* restore the stack depth saved before the search:
			 * mc_snum gets the old value, mc_top one less
			 */
			csrc->mc_snum = snum--;
			csrc->mc_top = snum;
		} else {
			key.mv_size = NODEKSZ(srcnode);
			key.mv_data = NODEKEY(srcnode);
		}
		data.mv_size = NODEDSZ(srcnode);
		data.mv_data = NODEDATA(srcnode);
	}
	mn.mc_xcursor = NULL;
	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
		unsigned int snum = cdst->mc_snum;
		MDB_node *s2;
		MDB_val bkey;
		/* must find the lowest key below dst */
		mdb_cursor_copy(cdst, &mn);
		rc = mdb_page_search_lowest(&mn);
		if (rc)
			return rc;
		if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
			bkey.mv_size = mn.mc_db->md_pad;
			bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
		} else {
			s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
			bkey.mv_size = NODEKSZ(s2);
			bkey.mv_data = NODEKEY(s2);
		}
		/* back to dst's level, index 0: write that key into dst's first node */
		mn.mc_snum = snum--;
		mn.mc_top = snum;
		mn.mc_ki[snum] = 0;
		rc = mdb_update_key(&mn, &bkey);
		if (rc)
			return rc;
	}

	DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
	    IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
	    csrc->mc_ki[csrc->mc_top],
		DKEY(&key),
	    csrc->mc_pg[csrc->mc_top]->mp_pgno,
	    cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));

	/* Add the node to the destination page.
	 */
	rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
	if (rc != MDB_SUCCESS)
		return rc;

	/* Delete the node from the source page.
	 */
	mdb_node_del(csrc, key.mv_size);

	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		MDB_page *mpd, *mps;

		mps = csrc->mc_pg[csrc->mc_top];
		/* If we're adding on the left, bump others up */
		if (fromleft) {
			mpd = cdst->mc_pg[csrc->mc_top];
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				/* when working on a sub-DB, fix up the tracked cursors' sub-cursors */
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				/* on the destination page at or after the insert slot: shift right */
				if (m3 != cdst &&
					m3->mc_pg[csrc->mc_top] == mpd &&
					m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
					m3->mc_ki[csrc->mc_top]++;
				}
				/* was on the moved node: follow it to the destination page */
				if (m3 !=csrc &&
					m3->mc_pg[csrc->mc_top] == mps &&
					m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
					m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
					m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
					m3->mc_ki[csrc->mc_top-1]++;
				}
				if (IS_LEAF(mps))
					XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
			}
		} else
		/* Adding on the right, bump others down */
		{
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == csrc) continue;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				if (m3->mc_pg[csrc->mc_top] == mps) {
					if (!m3->mc_ki[csrc->mc_top]) {
						/* was on the moved (first) node: follow it to the destination */
						m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
						m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
						m3->mc_ki[csrc->mc_top-1]--;
					} else {
						m3->mc_ki[csrc->mc_top]--;
					}
					if (IS_LEAF(mps))
						XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
				}
			}
		}
	}

	/* Update the parent separators.
	 */
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for source page %"Z"u to [%s]",
				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
			/* work on a copy positioned one level up, at the parent */
			mdb_cursor_copy(csrc, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* give the branch page's first node an empty key */
			MDB_val	 nullkey;
			indx_t	ix = csrc->mc_ki[csrc->mc_top];
			nullkey.mv_size = 0;
			csrc->mc_ki[csrc->mc_top] = 0;
			rc = mdb_update_key(csrc, &nullkey);
			csrc->mc_ki[csrc->mc_top] = ix;
			mdb_cassert(csrc, rc == MDB_SUCCESS);
		}
	}

	if (cdst->mc_ki[cdst->mc_top] == 0) {
		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
			/* NOTE(review): the page kind is tested on the source page
			 * while the key is read from the destination page; presumably
			 * both pages are always of the same kind here - confirm.
			 */
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for destination page %"Z"u to [%s]",
				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
			mdb_cursor_copy(cdst, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
			MDB_val	 nullkey;
			indx_t	ix = cdst->mc_ki[cdst->mc_top];
			nullkey.mv_size = 0;
			cdst->mc_ki[cdst->mc_top] = 0;
			rc = mdb_update_key(cdst, &nullkey);
			cdst->mc_ki[cdst->mc_top] = ix;
			mdb_cassert(cdst, rc == MDB_SUCCESS);
		}
	}

	return MDB_SUCCESS;
}

/** Merge one page into another.
 *  The nodes from the page pointed to by \b csrc will
 *	be copied to the page pointed to by \b cdst and then
 *	the \b csrc page will be freed.
 * @param[in] csrc Cursor pointing to the source page.
 * @param[in] cdst Cursor pointing to the destination page.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
{
	MDB_page	*psrc, *pdst;
	MDB_node	*srcnode;
	MDB_val		 key, data;
	unsigned	 nkeys;
	int			 rc;
	indx_t		 i, j;

	psrc = csrc->mc_pg[csrc->mc_top];
	pdst = cdst->mc_pg[cdst->mc_top];

	DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));

	mdb_cassert(csrc, csrc->mc_snum > 1);	/* can't merge root page */
	mdb_cassert(csrc, cdst->mc_snum > 1);

	/* Mark dst as dirty. */
	if ((rc = mdb_page_touch(cdst)))
		return rc;

	/* get dst page again now that we've touched it. */
	pdst = cdst->mc_pg[cdst->mc_top];

	/* Move all nodes from src to dst.
8207 */ 8208 j = nkeys = NUMKEYS(pdst); 8209 if (IS_LEAF2(psrc)) { 8210 key.mv_size = csrc->mc_db->md_pad; 8211 key.mv_data = METADATA(psrc); 8212 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8213 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); 8214 if (rc != MDB_SUCCESS) 8215 return rc; 8216 key.mv_data = (char *)key.mv_data + key.mv_size; 8217 } 8218 } else { 8219 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8220 srcnode = NODEPTR(psrc, i); 8221 if (i == 0 && IS_BRANCH(psrc)) { 8222 MDB_cursor mn; 8223 MDB_node *s2; 8224 mdb_cursor_copy(csrc, &mn); 8225 mn.mc_xcursor = NULL; 8226 /* must find the lowest key below src */ 8227 rc = mdb_page_search_lowest(&mn); 8228 if (rc) 8229 return rc; 8230 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { 8231 key.mv_size = mn.mc_db->md_pad; 8232 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); 8233 } else { 8234 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); 8235 key.mv_size = NODEKSZ(s2); 8236 key.mv_data = NODEKEY(s2); 8237 } 8238 } else { 8239 key.mv_size = srcnode->mn_ksize; 8240 key.mv_data = NODEKEY(srcnode); 8241 } 8242 8243 data.mv_size = NODEDSZ(srcnode); 8244 data.mv_data = NODEDATA(srcnode); 8245 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); 8246 if (rc != MDB_SUCCESS) 8247 return rc; 8248 } 8249 } 8250 8251 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", 8252 pdst->mp_pgno, NUMKEYS(pdst), 8253 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); 8254 8255 /* Unlink the src page from parent and add to free list. 8256 */ 8257 csrc->mc_top--; 8258 mdb_node_del(csrc, 0); 8259 if (csrc->mc_ki[csrc->mc_top] == 0) { 8260 key.mv_size = 0; 8261 rc = mdb_update_key(csrc, &key); 8262 if (rc) { 8263 csrc->mc_top++; 8264 return rc; 8265 } 8266 } 8267 csrc->mc_top++; 8268 8269 psrc = csrc->mc_pg[csrc->mc_top]; 8270 /* If not operating on FreeDB, allow this page to be reused 8271 * in this txn. Otherwise just add to free list. 
8272 */ 8273 rc = mdb_page_loose(csrc, psrc); 8274 if (rc) 8275 return rc; 8276 if (IS_LEAF(psrc)) 8277 csrc->mc_db->md_leaf_pages--; 8278 else 8279 csrc->mc_db->md_branch_pages--; 8280 { 8281 /* Adjust other cursors pointing to mp */ 8282 MDB_cursor *m2, *m3; 8283 MDB_dbi dbi = csrc->mc_dbi; 8284 unsigned int top = csrc->mc_top; 8285 8286 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8287 if (csrc->mc_flags & C_SUB) 8288 m3 = &m2->mc_xcursor->mx_cursor; 8289 else 8290 m3 = m2; 8291 if (m3 == csrc) continue; 8292 if (m3->mc_snum < csrc->mc_snum) continue; 8293 if (m3->mc_pg[top] == psrc) { 8294 m3->mc_pg[top] = pdst; 8295 m3->mc_ki[top] += nkeys; 8296 m3->mc_ki[top-1] = cdst->mc_ki[top-1]; 8297 } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && 8298 m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { 8299 m3->mc_ki[top-1]--; 8300 } 8301 if (IS_LEAF(psrc)) 8302 XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); 8303 } 8304 } 8305 { 8306 unsigned int snum = cdst->mc_snum; 8307 uint16_t depth = cdst->mc_db->md_depth; 8308 mdb_cursor_pop(cdst); 8309 rc = mdb_rebalance(cdst); 8310 /* Did the tree height change? */ 8311 if (depth != cdst->mc_db->md_depth) 8312 snum += cdst->mc_db->md_depth - depth; 8313 cdst->mc_snum = snum; 8314 cdst->mc_top = snum-1; 8315 } 8316 return rc; 8317 } 8318 8319 /** Copy the contents of a cursor. 8320 * @param[in] csrc The cursor to copy from. 8321 * @param[out] cdst The cursor to copy to. 8322 */ 8323 static void 8324 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) 8325 { 8326 unsigned int i; 8327 8328 cdst->mc_txn = csrc->mc_txn; 8329 cdst->mc_dbi = csrc->mc_dbi; 8330 cdst->mc_db = csrc->mc_db; 8331 cdst->mc_dbx = csrc->mc_dbx; 8332 cdst->mc_snum = csrc->mc_snum; 8333 cdst->mc_top = csrc->mc_top; 8334 cdst->mc_flags = csrc->mc_flags; 8335 8336 for (i=0; i<csrc->mc_snum; i++) { 8337 cdst->mc_pg[i] = csrc->mc_pg[i]; 8338 cdst->mc_ki[i] = csrc->mc_ki[i]; 8339 } 8340 } 8341 8342 /** Rebalance the tree after a delete operation. 
8343 * @param[in] mc Cursor pointing to the page where rebalancing 8344 * should begin. 8345 * @return 0 on success, non-zero on failure. 8346 */ 8347 static int 8348 mdb_rebalance(MDB_cursor *mc) 8349 { 8350 MDB_node *node; 8351 int rc, fromleft; 8352 unsigned int ptop, minkeys, thresh; 8353 MDB_cursor mn; 8354 indx_t oldki; 8355 8356 if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { 8357 minkeys = 2; 8358 thresh = 1; 8359 } else { 8360 minkeys = 1; 8361 thresh = FILL_THRESHOLD; 8362 } 8363 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", 8364 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", 8365 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), 8366 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); 8367 8368 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && 8369 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { 8370 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", 8371 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); 8372 return MDB_SUCCESS; 8373 } 8374 8375 if (mc->mc_snum < 2) { 8376 MDB_page *mp = mc->mc_pg[0]; 8377 if (IS_SUBP(mp)) { 8378 DPUTS("Can't rebalance a subpage, ignoring"); 8379 return MDB_SUCCESS; 8380 } 8381 if (NUMKEYS(mp) == 0) { 8382 DPUTS("tree is completely empty"); 8383 mc->mc_db->md_root = P_INVALID; 8384 mc->mc_db->md_depth = 0; 8385 mc->mc_db->md_leaf_pages = 0; 8386 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8387 if (rc) 8388 return rc; 8389 /* Adjust cursors pointing to mp */ 8390 mc->mc_snum = 0; 8391 mc->mc_top = 0; 8392 mc->mc_flags &= ~C_INITIALIZED; 8393 { 8394 MDB_cursor *m2, *m3; 8395 MDB_dbi dbi = mc->mc_dbi; 8396 8397 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8398 if (mc->mc_flags & C_SUB) 8399 m3 = &m2->mc_xcursor->mx_cursor; 8400 else 8401 m3 = m2; 8402 if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) 8403 continue; 8404 if (m3->mc_pg[0] == mp) { 8405 m3->mc_snum = 0; 8406 m3->mc_top = 0; 8407 m3->mc_flags 
&= ~C_INITIALIZED; 8408 } 8409 } 8410 } 8411 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 8412 int i; 8413 DPUTS("collapsing root page!"); 8414 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8415 if (rc) 8416 return rc; 8417 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); 8418 rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); 8419 if (rc) 8420 return rc; 8421 mc->mc_db->md_depth--; 8422 mc->mc_db->md_branch_pages--; 8423 mc->mc_ki[0] = mc->mc_ki[1]; 8424 for (i = 1; i<mc->mc_db->md_depth; i++) { 8425 mc->mc_pg[i] = mc->mc_pg[i+1]; 8426 mc->mc_ki[i] = mc->mc_ki[i+1]; 8427 } 8428 { 8429 /* Adjust other cursors pointing to mp */ 8430 MDB_cursor *m2, *m3; 8431 MDB_dbi dbi = mc->mc_dbi; 8432 8433 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8434 if (mc->mc_flags & C_SUB) 8435 m3 = &m2->mc_xcursor->mx_cursor; 8436 else 8437 m3 = m2; 8438 if (m3 == mc) continue; 8439 if (!(m3->mc_flags & C_INITIALIZED)) 8440 continue; 8441 if (m3->mc_pg[0] == mp) { 8442 for (i=0; i<mc->mc_db->md_depth; i++) { 8443 m3->mc_pg[i] = m3->mc_pg[i+1]; 8444 m3->mc_ki[i] = m3->mc_ki[i+1]; 8445 } 8446 m3->mc_snum--; 8447 m3->mc_top--; 8448 } 8449 } 8450 } 8451 } else 8452 DPUTS("root page doesn't need rebalancing"); 8453 return MDB_SUCCESS; 8454 } 8455 8456 /* The parent (branch page) must have at least 2 pointers, 8457 * otherwise the tree is invalid. 8458 */ 8459 ptop = mc->mc_top-1; 8460 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); 8461 8462 /* Leaf page fill factor is below the threshold. 8463 * Try to move keys from left or right neighbor, or 8464 * merge with a neighbor page. 8465 */ 8466 8467 /* Find neighbors. 8468 */ 8469 mdb_cursor_copy(mc, &mn); 8470 mn.mc_xcursor = NULL; 8471 8472 oldki = mc->mc_ki[mc->mc_top]; 8473 if (mc->mc_ki[ptop] == 0) { 8474 /* We're the leftmost leaf in our parent. 
8475 */ 8476 DPUTS("reading right neighbor"); 8477 mn.mc_ki[ptop]++; 8478 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8479 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8480 if (rc) 8481 return rc; 8482 mn.mc_ki[mn.mc_top] = 0; 8483 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); 8484 fromleft = 0; 8485 } else { 8486 /* There is at least one neighbor to the left. 8487 */ 8488 DPUTS("reading left neighbor"); 8489 mn.mc_ki[ptop]--; 8490 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8491 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8492 if (rc) 8493 return rc; 8494 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; 8495 mc->mc_ki[mc->mc_top] = 0; 8496 fromleft = 1; 8497 } 8498 8499 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", 8500 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), 8501 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); 8502 8503 /* If the neighbor page is above threshold and has enough keys, 8504 * move one key from it. Otherwise we should try to merge them. 8505 * (A branch page must never have less than 2 keys.) 8506 */ 8507 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { 8508 rc = mdb_node_move(&mn, mc, fromleft); 8509 if (fromleft) { 8510 /* if we inserted on left, bump position up */ 8511 oldki++; 8512 } 8513 } else { 8514 if (!fromleft) { 8515 rc = mdb_page_merge(&mn, mc); 8516 } else { 8517 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); 8518 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; 8519 /* We want mdb_rebalance to find mn when doing fixups */ 8520 WITH_CURSOR_TRACKING(mn, 8521 rc = mdb_page_merge(mc, &mn)); 8522 mdb_cursor_copy(&mn, mc); 8523 } 8524 mc->mc_flags &= ~C_EOF; 8525 } 8526 mc->mc_ki[mc->mc_top] = oldki; 8527 return rc; 8528 } 8529 8530 /** Complete a delete operation started by #mdb_cursor_del(). 
*/ 8531 static int 8532 mdb_cursor_del0(MDB_cursor *mc) 8533 { 8534 int rc; 8535 MDB_page *mp; 8536 indx_t ki; 8537 unsigned int nkeys; 8538 MDB_cursor *m2, *m3; 8539 MDB_dbi dbi = mc->mc_dbi; 8540 8541 ki = mc->mc_ki[mc->mc_top]; 8542 mp = mc->mc_pg[mc->mc_top]; 8543 mdb_node_del(mc, mc->mc_db->md_pad); 8544 mc->mc_db->md_entries--; 8545 { 8546 /* Adjust other cursors pointing to mp */ 8547 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8548 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 8549 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8550 continue; 8551 if (m3 == mc || m3->mc_snum < mc->mc_snum) 8552 continue; 8553 if (m3->mc_pg[mc->mc_top] == mp) { 8554 if (m3->mc_ki[mc->mc_top] == ki) { 8555 m3->mc_flags |= C_DEL; 8556 if (mc->mc_db->md_flags & MDB_DUPSORT) { 8557 /* Sub-cursor referred into dataset which is gone */ 8558 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 8559 } 8560 continue; 8561 } else if (m3->mc_ki[mc->mc_top] > ki) { 8562 m3->mc_ki[mc->mc_top]--; 8563 } 8564 XCURSOR_REFRESH(m3, mc->mc_top, mp); 8565 } 8566 } 8567 } 8568 rc = mdb_rebalance(mc); 8569 if (rc) 8570 goto fail; 8571 8572 /* DB is totally empty now, just bail out. 8573 * Other cursors adjustments were already done 8574 * by mdb_rebalance and aren't needed here. 8575 */ 8576 if (!mc->mc_snum) { 8577 mc->mc_flags |= C_EOF; 8578 return rc; 8579 } 8580 8581 mp = mc->mc_pg[mc->mc_top]; 8582 nkeys = NUMKEYS(mp); 8583 8584 /* Adjust other cursors pointing to mp */ 8585 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { 8586 m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; 8587 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8588 continue; 8589 if (m3->mc_snum < mc->mc_snum) 8590 continue; 8591 if (m3->mc_pg[mc->mc_top] == mp) { 8592 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { 8593 /* if m3 points past last node in page, find next sibling */ 8594 if (m3->mc_ki[mc->mc_top] >= nkeys) { 8595 rc = mdb_cursor_sibling(m3, 1); 8596 if (rc == MDB_NOTFOUND) { 8597 m3->mc_flags |= C_EOF; 8598 rc = MDB_SUCCESS; 8599 continue; 8600 } 8601 if (rc) 8602 goto fail; 8603 } 8604 if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { 8605 MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); 8606 /* If this node has dupdata, it may need to be reinited 8607 * because its data has moved. 8608 * If the xcursor was not initd it must be reinited. 8609 * Else if node points to a subDB, nothing is needed. 8610 * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 8611 */ 8612 if (node->mn_flags & F_DUPDATA) { 8613 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 8614 if (!(node->mn_flags & F_SUBDATA)) 8615 m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); 8616 } else { 8617 mdb_xcursor_init1(m3, node); 8618 rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); 8619 if (rc) 8620 goto fail; 8621 } 8622 } 8623 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; 8624 } 8625 } 8626 } 8627 } 8628 mc->mc_flags |= C_DEL; 8629 8630 fail: 8631 if (rc) 8632 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 8633 return rc; 8634 } 8635 8636 int 8637 mdb_del(MDB_txn *txn, MDB_dbi dbi, 8638 MDB_val *key, MDB_val *data) 8639 { 8640 DKBUF; 8641 DDBUF; 8642 if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 8643 return EINVAL; 8644 8645 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 8646 return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; 8647 8648 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { 8649 /* must ignore any data */ 8650 data = NULL; 8651 } 8652 8653 MDB_TRACE(("%p, %u, %"Z"u[%s], %"Z"u%s", 8654 txn, dbi, key ? key->mv_size:0, DKEY(key), data ? data->mv_size:0, 8655 data ? mdb_dval(txn, dbi, data, dbuf):"")); 8656 return mdb_del0(txn, dbi, key, data, 0); 8657 } 8658 8659 static int 8660 mdb_del0(MDB_txn *txn, MDB_dbi dbi, 8661 MDB_val *key, MDB_val *data, unsigned flags) 8662 { 8663 MDB_cursor mc; 8664 MDB_xcursor mx; 8665 MDB_cursor_op op; 8666 MDB_val rdata, *xdata; 8667 int rc, exact = 0; 8668 DKBUF; 8669 8670 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); 8671 8672 mdb_cursor_init(&mc, txn, dbi, &mx); 8673 8674 if (data) { 8675 op = MDB_GET_BOTH; 8676 rdata = *data; 8677 xdata = &rdata; 8678 } else { 8679 op = MDB_SET; 8680 xdata = NULL; 8681 flags |= MDB_NODUPDATA; 8682 } 8683 rc = mdb_cursor_set(&mc, key, xdata, op, &exact); 8684 if (rc == 0) { 8685 /* let mdb_page_split know about this cursor if needed: 8686 * delete will trigger a rebalance; if it needs to move 8687 * a node from one page to another, it will have to 8688 * update the parent's separator key(s). If the new sepkey 8689 * is larger than the current one, the parent page may 8690 * run out of space, triggering a split. We need this 8691 * cursor to be consistent until the end of the rebalance. 8692 */ 8693 mc.mc_flags |= C_UNTRACK; 8694 mc.mc_next = txn->mt_cursors[dbi]; 8695 txn->mt_cursors[dbi] = &mc; 8696 rc = _mdb_cursor_del(&mc, flags); 8697 txn->mt_cursors[dbi] = mc.mc_next; 8698 } 8699 return rc; 8700 } 8701 8702 /** Split a page and insert a new node. 8703 * Set #MDB_TXN_ERROR on failure. 8704 * @param[in,out] mc Cursor pointing to the page and desired insertion index. 8705 * The cursor will be updated to point to the actual page and index where 8706 * the node got inserted after the split. 8707 * @param[in] newkey The key for the newly inserted node. 
8708 * @param[in] newdata The data for the newly inserted node. 8709 * @param[in] newpgno The page number, if the new node is a branch node. 8710 * @param[in] nflags The #NODE_ADD_FLAGS for the new node. 8711 * @return 0 on success, non-zero on failure. 8712 */ 8713 static int 8714 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, 8715 unsigned int nflags) 8716 { 8717 unsigned int flags; 8718 int rc = MDB_SUCCESS, new_root = 0, did_split = 0; 8719 indx_t newindx; 8720 pgno_t pgno = 0; 8721 int i, j, split_indx, nkeys, pmax; 8722 MDB_env *env = mc->mc_txn->mt_env; 8723 MDB_node *node; 8724 MDB_val sepkey, rkey, xdata, *rdata = &xdata; 8725 MDB_page *copy = NULL; 8726 MDB_page *mp, *rp, *pp; 8727 int ptop; 8728 MDB_cursor mn; 8729 DKBUF; 8730 8731 mp = mc->mc_pg[mc->mc_top]; 8732 newindx = mc->mc_ki[mc->mc_top]; 8733 nkeys = NUMKEYS(mp); 8734 8735 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", 8736 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, 8737 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); 8738 8739 /* Create a right sibling. */ 8740 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) 8741 return rc; 8742 rp->mp_pad = mp->mp_pad; 8743 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); 8744 8745 /* Usually when splitting the root page, the cursor 8746 * height is 1. But when called from mdb_update_key, 8747 * the cursor height may be greater because it walks 8748 * up the stack while finding the branch slot to update. 8749 */ 8750 if (mc->mc_top < 1) { 8751 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) 8752 goto done; 8753 /* shift current top to make room for new parent */ 8754 for (i=mc->mc_snum; i>0; i--) { 8755 mc->mc_pg[i] = mc->mc_pg[i-1]; 8756 mc->mc_ki[i] = mc->mc_ki[i-1]; 8757 } 8758 mc->mc_pg[0] = pp; 8759 mc->mc_ki[0] = 0; 8760 mc->mc_db->md_root = pp->mp_pgno; 8761 DPRINTF(("root split! 
new root = %"Z"u", pp->mp_pgno)); 8762 new_root = mc->mc_db->md_depth++; 8763 8764 /* Add left (implicit) pointer. */ 8765 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { 8766 /* undo the pre-push */ 8767 mc->mc_pg[0] = mc->mc_pg[1]; 8768 mc->mc_ki[0] = mc->mc_ki[1]; 8769 mc->mc_db->md_root = mp->mp_pgno; 8770 mc->mc_db->md_depth--; 8771 goto done; 8772 } 8773 mc->mc_snum++; 8774 mc->mc_top++; 8775 ptop = 0; 8776 } else { 8777 ptop = mc->mc_top-1; 8778 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); 8779 } 8780 8781 mdb_cursor_copy(mc, &mn); 8782 mn.mc_xcursor = NULL; 8783 mn.mc_pg[mn.mc_top] = rp; 8784 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; 8785 8786 if (nflags & MDB_APPEND) { 8787 mn.mc_ki[mn.mc_top] = 0; 8788 sepkey = *newkey; 8789 split_indx = newindx; 8790 nkeys = 0; 8791 } else { 8792 8793 split_indx = (nkeys+1) / 2; 8794 8795 if (IS_LEAF2(rp)) { 8796 char *split, *ins; 8797 int x; 8798 unsigned int lsize, rsize, ksize; 8799 /* Move half of the keys to the right sibling */ 8800 x = mc->mc_ki[mc->mc_top] - split_indx; 8801 ksize = mc->mc_db->md_pad; 8802 split = LEAF2KEY(mp, split_indx, ksize); 8803 rsize = (nkeys - split_indx) * ksize; 8804 lsize = (nkeys - split_indx) * sizeof(indx_t); 8805 mp->mp_lower -= lsize; 8806 rp->mp_lower += lsize; 8807 mp->mp_upper += rsize - lsize; 8808 rp->mp_upper -= rsize - lsize; 8809 sepkey.mv_size = ksize; 8810 if (newindx == split_indx) { 8811 sepkey.mv_data = newkey->mv_data; 8812 } else { 8813 sepkey.mv_data = split; 8814 } 8815 if (x<0) { 8816 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); 8817 memcpy(rp->mp_ptrs, split, rsize); 8818 sepkey.mv_data = rp->mp_ptrs; 8819 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); 8820 memcpy(ins, newkey->mv_data, ksize); 8821 mp->mp_lower += sizeof(indx_t); 8822 mp->mp_upper -= ksize - sizeof(indx_t); 8823 } else { 8824 if (x) 8825 memcpy(rp->mp_ptrs, split, x * ksize); 8826 ins = LEAF2KEY(rp, x, ksize); 8827 
memcpy(ins, newkey->mv_data, ksize); 8828 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); 8829 rp->mp_lower += sizeof(indx_t); 8830 rp->mp_upper -= ksize - sizeof(indx_t); 8831 mc->mc_ki[mc->mc_top] = x; 8832 } 8833 } else { 8834 int psize, nsize, k, keythresh; 8835 8836 /* Maximum free space in an empty page */ 8837 pmax = env->me_psize - PAGEHDRSZ; 8838 /* Threshold number of keys considered "small" */ 8839 keythresh = env->me_psize >> 7; 8840 8841 if (IS_LEAF(mp)) 8842 nsize = mdb_leaf_size(env, newkey, newdata); 8843 else 8844 nsize = mdb_branch_size(env, newkey); 8845 nsize = EVEN(nsize); 8846 8847 /* grab a page to hold a temporary copy */ 8848 copy = mdb_page_malloc(mc->mc_txn, 1); 8849 if (copy == NULL) { 8850 rc = ENOMEM; 8851 goto done; 8852 } 8853 copy->mp_pgno = mp->mp_pgno; 8854 copy->mp_flags = mp->mp_flags; 8855 copy->mp_lower = (PAGEHDRSZ-PAGEBASE); 8856 copy->mp_upper = env->me_psize - PAGEBASE; 8857 8858 /* prepare to insert */ 8859 for (i=0, j=0; i<nkeys; i++) { 8860 if (i == newindx) { 8861 copy->mp_ptrs[j++] = 0; 8862 } 8863 copy->mp_ptrs[j++] = mp->mp_ptrs[i]; 8864 } 8865 8866 /* When items are relatively large the split point needs 8867 * to be checked, because being off-by-one will make the 8868 * difference between success or failure in mdb_node_add. 8869 * 8870 * It's also relevant if a page happens to be laid out 8871 * such that one half of its nodes are all "small" and 8872 * the other half of its nodes are "large." If the new 8873 * item is also "large" and falls on the half with 8874 * "large" nodes, it also may not fit. 8875 * 8876 * As a final tweak, if the new item goes on the last 8877 * spot on the page (and thus, onto the new page), bias 8878 * the split so the new page is emptier than the old page. 8879 * This yields better packing during sequential inserts. 
8880 */ 8881 if (nkeys < keythresh || nsize > pmax/16 || newindx >= nkeys) { 8882 /* Find split point */ 8883 psize = 0; 8884 if (newindx <= split_indx || newindx >= nkeys) { 8885 i = 0; j = 1; 8886 k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); 8887 } else { 8888 i = nkeys; j = -1; 8889 k = split_indx-1; 8890 } 8891 for (; i!=k; i+=j) { 8892 if (i == newindx) { 8893 psize += nsize; 8894 node = NULL; 8895 } else { 8896 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8897 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); 8898 if (IS_LEAF(mp)) { 8899 if (F_ISSET(node->mn_flags, F_BIGDATA)) 8900 psize += sizeof(pgno_t); 8901 else 8902 psize += NODEDSZ(node); 8903 } 8904 psize = EVEN(psize); 8905 } 8906 if (psize > pmax || i == k-j) { 8907 split_indx = i + (j<0); 8908 break; 8909 } 8910 } 8911 } 8912 if (split_indx == newindx) { 8913 sepkey.mv_size = newkey->mv_size; 8914 sepkey.mv_data = newkey->mv_data; 8915 } else { 8916 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); 8917 sepkey.mv_size = node->mn_ksize; 8918 sepkey.mv_data = NODEKEY(node); 8919 } 8920 } 8921 } 8922 8923 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); 8924 8925 /* Copy separator key to the parent. 8926 */ 8927 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { 8928 int snum = mc->mc_snum; 8929 mn.mc_snum--; 8930 mn.mc_top--; 8931 did_split = 1; 8932 /* We want other splits to find mn when doing fixups */ 8933 WITH_CURSOR_TRACKING(mn, 8934 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); 8935 if (rc) 8936 goto done; 8937 8938 /* root split? */ 8939 if (mc->mc_snum > snum) { 8940 ptop++; 8941 } 8942 /* Right page might now have changed parent. 8943 * Check if left page also changed parent. 
8944 */ 8945 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 8946 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 8947 for (i=0; i<ptop; i++) { 8948 mc->mc_pg[i] = mn.mc_pg[i]; 8949 mc->mc_ki[i] = mn.mc_ki[i]; 8950 } 8951 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 8952 if (mn.mc_ki[ptop]) { 8953 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 8954 } else { 8955 /* find right page's left sibling */ 8956 mc->mc_ki[ptop] = mn.mc_ki[ptop]; 8957 mdb_cursor_sibling(mc, 0); 8958 } 8959 } 8960 } else { 8961 mn.mc_top--; 8962 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); 8963 mn.mc_top++; 8964 } 8965 if (rc != MDB_SUCCESS) { 8966 goto done; 8967 } 8968 if (nflags & MDB_APPEND) { 8969 mc->mc_pg[mc->mc_top] = rp; 8970 mc->mc_ki[mc->mc_top] = 0; 8971 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); 8972 if (rc) 8973 goto done; 8974 for (i=0; i<mc->mc_top; i++) 8975 mc->mc_ki[i] = mn.mc_ki[i]; 8976 } else if (!IS_LEAF2(mp)) { 8977 /* Move nodes */ 8978 mc->mc_pg[mc->mc_top] = rp; 8979 i = split_indx; 8980 j = 0; 8981 do { 8982 if (i == newindx) { 8983 rkey.mv_data = newkey->mv_data; 8984 rkey.mv_size = newkey->mv_size; 8985 if (IS_LEAF(mp)) { 8986 rdata = newdata; 8987 } else 8988 pgno = newpgno; 8989 flags = nflags; 8990 /* Update index for the new key. */ 8991 mc->mc_ki[mc->mc_top] = j; 8992 } else { 8993 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8994 rkey.mv_data = NODEKEY(node); 8995 rkey.mv_size = node->mn_ksize; 8996 if (IS_LEAF(mp)) { 8997 xdata.mv_data = NODEDATA(node); 8998 xdata.mv_size = NODEDSZ(node); 8999 rdata = &xdata; 9000 } else 9001 pgno = NODEPGNO(node); 9002 flags = node->mn_flags; 9003 } 9004 9005 if (!IS_LEAF(mp) && j == 0) { 9006 /* First branch index doesn't need key data. 
*/ 9007 rkey.mv_size = 0; 9008 } 9009 9010 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); 9011 if (rc) 9012 goto done; 9013 if (i == nkeys) { 9014 i = 0; 9015 j = 0; 9016 mc->mc_pg[mc->mc_top] = copy; 9017 } else { 9018 i++; 9019 j++; 9020 } 9021 } while (i != split_indx); 9022 9023 nkeys = NUMKEYS(copy); 9024 for (i=0; i<nkeys; i++) 9025 mp->mp_ptrs[i] = copy->mp_ptrs[i]; 9026 mp->mp_lower = copy->mp_lower; 9027 mp->mp_upper = copy->mp_upper; 9028 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), 9029 env->me_psize - copy->mp_upper - PAGEBASE); 9030 9031 /* reset back to original page */ 9032 if (newindx < split_indx) { 9033 mc->mc_pg[mc->mc_top] = mp; 9034 } else { 9035 mc->mc_pg[mc->mc_top] = rp; 9036 mc->mc_ki[ptop]++; 9037 /* Make sure mc_ki is still valid. 9038 */ 9039 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 9040 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 9041 for (i=0; i<=ptop; i++) { 9042 mc->mc_pg[i] = mn.mc_pg[i]; 9043 mc->mc_ki[i] = mn.mc_ki[i]; 9044 } 9045 } 9046 } 9047 if (nflags & MDB_RESERVE) { 9048 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 9049 if (!(node->mn_flags & F_BIGDATA)) 9050 newdata->mv_data = NODEDATA(node); 9051 } 9052 } else { 9053 if (newindx >= split_indx) { 9054 mc->mc_pg[mc->mc_top] = rp; 9055 mc->mc_ki[ptop]++; 9056 /* Make sure mc_ki is still valid. 
9057 */ 9058 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 9059 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 9060 for (i=0; i<=ptop; i++) { 9061 mc->mc_pg[i] = mn.mc_pg[i]; 9062 mc->mc_ki[i] = mn.mc_ki[i]; 9063 } 9064 } 9065 } 9066 } 9067 9068 { 9069 /* Adjust other cursors pointing to mp */ 9070 MDB_cursor *m2, *m3; 9071 MDB_dbi dbi = mc->mc_dbi; 9072 nkeys = NUMKEYS(mp); 9073 9074 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 9075 if (mc->mc_flags & C_SUB) 9076 m3 = &m2->mc_xcursor->mx_cursor; 9077 else 9078 m3 = m2; 9079 if (m3 == mc) 9080 continue; 9081 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 9082 continue; 9083 if (new_root) { 9084 int k; 9085 /* sub cursors may be on different DB */ 9086 if (m3->mc_pg[0] != mp) 9087 continue; 9088 /* root split */ 9089 for (k=new_root; k>=0; k--) { 9090 m3->mc_ki[k+1] = m3->mc_ki[k]; 9091 m3->mc_pg[k+1] = m3->mc_pg[k]; 9092 } 9093 if (m3->mc_ki[0] >= nkeys) { 9094 m3->mc_ki[0] = 1; 9095 } else { 9096 m3->mc_ki[0] = 0; 9097 } 9098 m3->mc_pg[0] = mc->mc_pg[0]; 9099 m3->mc_snum++; 9100 m3->mc_top++; 9101 } 9102 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { 9103 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) 9104 m3->mc_ki[mc->mc_top]++; 9105 if (m3->mc_ki[mc->mc_top] >= nkeys) { 9106 m3->mc_pg[mc->mc_top] = rp; 9107 m3->mc_ki[mc->mc_top] -= nkeys; 9108 for (i=0; i<mc->mc_top; i++) { 9109 m3->mc_ki[i] = mn.mc_ki[i]; 9110 m3->mc_pg[i] = mn.mc_pg[i]; 9111 } 9112 } 9113 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && 9114 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { 9115 m3->mc_ki[ptop]++; 9116 } 9117 if (IS_LEAF(mp)) 9118 XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); 9119 } 9120 } 9121 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); 9122 9123 done: 9124 if (copy) /* tmp page */ 9125 mdb_page_free(env, copy); 9126 if (rc) 9127 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 9128 return rc; 9129 } 9130 9131 int 9132 
mdb_put(MDB_txn *txn, MDB_dbi dbi, 9133 MDB_val *key, MDB_val *data, unsigned int flags) 9134 { 9135 MDB_cursor mc; 9136 MDB_xcursor mx; 9137 int rc; 9138 DKBUF; 9139 DDBUF; 9140 9141 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 9142 return EINVAL; 9143 9144 if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) 9145 return EINVAL; 9146 9147 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 9148 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 9149 9150 MDB_TRACE(("%p, %u, %"Z"u[%s], %"Z"u%s, %u", 9151 txn, dbi, key ? key->mv_size:0, DKEY(key), data->mv_size, mdb_dval(txn, dbi, data, dbuf), flags)); 9152 mdb_cursor_init(&mc, txn, dbi, &mx); 9153 mc.mc_next = txn->mt_cursors[dbi]; 9154 txn->mt_cursors[dbi] = &mc; 9155 rc = _mdb_cursor_put(&mc, key, data, flags); 9156 txn->mt_cursors[dbi] = mc.mc_next; 9157 return rc; 9158 } 9159 9160 #ifndef MDB_WBUF 9161 #define MDB_WBUF (1024*1024) 9162 #endif 9163 #define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ 9164 9165 /** State needed for a double-buffering compacting copy. */ 9166 typedef struct mdb_copy { 9167 MDB_env *mc_env; 9168 MDB_txn *mc_txn; 9169 pthread_mutex_t mc_mutex; 9170 pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ 9171 char *mc_wbuf[2]; 9172 char *mc_over[2]; 9173 int mc_wlen[2]; 9174 int mc_olen[2]; 9175 pgno_t mc_next_pgno; 9176 HANDLE mc_fd; 9177 int mc_toggle; /**< Buffer number in provider */ 9178 int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ 9179 /** Error code. Never cleared if set. Both threads can set nonzero 9180 * to fail the copy. Not mutex-protected, LMDB expects atomic int. 9181 */ 9182 volatile int mc_error; 9183 } mdb_copy; 9184 9185 /** Dedicated writer thread for compacting copy. 
*/ 9186 static THREAD_RET ESECT CALL_CONV 9187 mdb_env_copythr(void *arg) 9188 { 9189 mdb_copy *my = arg; 9190 char *ptr; 9191 int toggle = 0, wsize, rc; 9192 #ifdef _WIN32 9193 DWORD len; 9194 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) 9195 #else 9196 int len; 9197 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) 9198 #ifdef SIGPIPE 9199 sigset_t set; 9200 sigemptyset(&set); 9201 sigaddset(&set, SIGPIPE); 9202 if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) 9203 my->mc_error = rc; 9204 #endif 9205 #endif 9206 9207 pthread_mutex_lock(&my->mc_mutex); 9208 for(;;) { 9209 while (!my->mc_new) 9210 pthread_cond_wait(&my->mc_cond, &my->mc_mutex); 9211 if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ 9212 break; 9213 wsize = my->mc_wlen[toggle]; 9214 ptr = my->mc_wbuf[toggle]; 9215 again: 9216 rc = MDB_SUCCESS; 9217 while (wsize > 0 && !my->mc_error) { 9218 DO_WRITE(rc, my->mc_fd, ptr, wsize, len); 9219 if (!rc) { 9220 rc = ErrCode(); 9221 #if defined(SIGPIPE) && !defined(_WIN32) 9222 if (rc == EPIPE) { 9223 /* Collect the pending SIGPIPE, otherwise at least OS X 9224 * gives it to the process on thread-exit (ITS#8504). 
9225 */ 9226 int tmp; 9227 sigwait(&set, &tmp); 9228 } 9229 #endif 9230 break; 9231 } else if (len > 0) { 9232 rc = MDB_SUCCESS; 9233 ptr += len; 9234 wsize -= len; 9235 continue; 9236 } else { 9237 rc = EIO; 9238 break; 9239 } 9240 } 9241 if (rc) { 9242 my->mc_error = rc; 9243 } 9244 /* If there's an overflow page tail, write it too */ 9245 if (my->mc_olen[toggle]) { 9246 wsize = my->mc_olen[toggle]; 9247 ptr = my->mc_over[toggle]; 9248 my->mc_olen[toggle] = 0; 9249 goto again; 9250 } 9251 my->mc_wlen[toggle] = 0; 9252 toggle ^= 1; 9253 /* Return the empty buffer to provider */ 9254 my->mc_new--; 9255 pthread_cond_signal(&my->mc_cond); 9256 } 9257 pthread_mutex_unlock(&my->mc_mutex); 9258 return (THREAD_RET)0; 9259 #undef DO_WRITE 9260 } 9261 9262 /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. 9263 * 9264 * @param[in] my control structure. 9265 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 9266 */ 9267 static int ESECT 9268 mdb_env_cthr_toggle(mdb_copy *my, int adjust) 9269 { 9270 pthread_mutex_lock(&my->mc_mutex); 9271 my->mc_new += adjust; 9272 pthread_cond_signal(&my->mc_cond); 9273 while (my->mc_new & 2) /* both buffers in use */ 9274 pthread_cond_wait(&my->mc_cond, &my->mc_mutex); 9275 pthread_mutex_unlock(&my->mc_mutex); 9276 9277 my->mc_toggle ^= (adjust & 1); 9278 /* Both threads reset mc_wlen, to be safe from threading errors */ 9279 my->mc_wlen[my->mc_toggle] = 0; 9280 return my->mc_error; 9281 } 9282 9283 /** Depth-first tree traversal for compacting copy. 9284 * @param[in] my control structure. 9285 * @param[in,out] pg database root. 9286 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. 
9287 */ 9288 static int ESECT 9289 mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) 9290 { 9291 MDB_cursor mc = {0}; 9292 MDB_node *ni; 9293 MDB_page *mo, *mp, *leaf; 9294 char *buf, *ptr; 9295 int rc, toggle; 9296 unsigned int i; 9297 9298 /* Empty DB, nothing to do */ 9299 if (*pg == P_INVALID) 9300 return MDB_SUCCESS; 9301 9302 mc.mc_snum = 1; 9303 mc.mc_txn = my->mc_txn; 9304 9305 rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); 9306 if (rc) 9307 return rc; 9308 rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); 9309 if (rc) 9310 return rc; 9311 9312 /* Make cursor pages writable */ 9313 buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); 9314 if (buf == NULL) 9315 return ENOMEM; 9316 9317 for (i=0; i<mc.mc_top; i++) { 9318 mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); 9319 mc.mc_pg[i] = (MDB_page *)ptr; 9320 ptr += my->mc_env->me_psize; 9321 } 9322 9323 /* This is writable space for a leaf page. Usually not needed. */ 9324 leaf = (MDB_page *)ptr; 9325 9326 toggle = my->mc_toggle; 9327 while (mc.mc_snum > 0) { 9328 unsigned n; 9329 mp = mc.mc_pg[mc.mc_top]; 9330 n = NUMKEYS(mp); 9331 9332 if (IS_LEAF(mp)) { 9333 if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { 9334 for (i=0; i<n; i++) { 9335 ni = NODEPTR(mp, i); 9336 if (ni->mn_flags & F_BIGDATA) { 9337 MDB_page *omp; 9338 pgno_t pg; 9339 9340 /* Need writable leaf */ 9341 if (mp != leaf) { 9342 mc.mc_pg[mc.mc_top] = leaf; 9343 mdb_page_copy(leaf, mp, my->mc_env->me_psize); 9344 mp = leaf; 9345 ni = NODEPTR(mp, i); 9346 } 9347 9348 memcpy(&pg, NODEDATA(ni), sizeof(pg)); 9349 memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); 9350 rc = mdb_page_get(&mc, pg, &omp, NULL); 9351 if (rc) 9352 goto done; 9353 if (my->mc_wlen[toggle] >= MDB_WBUF) { 9354 rc = mdb_env_cthr_toggle(my, 1); 9355 if (rc) 9356 goto done; 9357 toggle = my->mc_toggle; 9358 } 9359 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); 9360 memcpy(mo, omp, my->mc_env->me_psize); 9361 mo->mp_pgno = 
my->mc_next_pgno; 9362 my->mc_next_pgno += omp->mp_pages; 9363 my->mc_wlen[toggle] += my->mc_env->me_psize; 9364 if (omp->mp_pages > 1) { 9365 my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); 9366 my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; 9367 rc = mdb_env_cthr_toggle(my, 1); 9368 if (rc) 9369 goto done; 9370 toggle = my->mc_toggle; 9371 } 9372 } else if (ni->mn_flags & F_SUBDATA) { 9373 MDB_db db; 9374 9375 /* Need writable leaf */ 9376 if (mp != leaf) { 9377 mc.mc_pg[mc.mc_top] = leaf; 9378 mdb_page_copy(leaf, mp, my->mc_env->me_psize); 9379 mp = leaf; 9380 ni = NODEPTR(mp, i); 9381 } 9382 9383 memcpy(&db, NODEDATA(ni), sizeof(db)); 9384 my->mc_toggle = toggle; 9385 rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); 9386 if (rc) 9387 goto done; 9388 toggle = my->mc_toggle; 9389 memcpy(NODEDATA(ni), &db, sizeof(db)); 9390 } 9391 } 9392 } 9393 } else { 9394 mc.mc_ki[mc.mc_top]++; 9395 if (mc.mc_ki[mc.mc_top] < n) { 9396 pgno_t pg; 9397 again: 9398 ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); 9399 pg = NODEPGNO(ni); 9400 rc = mdb_page_get(&mc, pg, &mp, NULL); 9401 if (rc) 9402 goto done; 9403 mc.mc_top++; 9404 mc.mc_snum++; 9405 mc.mc_ki[mc.mc_top] = 0; 9406 if (IS_BRANCH(mp)) { 9407 /* Whenever we advance to a sibling branch page, 9408 * we must proceed all the way down to its first leaf. 
9409 */ 9410 mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); 9411 goto again; 9412 } else 9413 mc.mc_pg[mc.mc_top] = mp; 9414 continue; 9415 } 9416 } 9417 if (my->mc_wlen[toggle] >= MDB_WBUF) { 9418 rc = mdb_env_cthr_toggle(my, 1); 9419 if (rc) 9420 goto done; 9421 toggle = my->mc_toggle; 9422 } 9423 mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); 9424 mdb_page_copy(mo, mp, my->mc_env->me_psize); 9425 mo->mp_pgno = my->mc_next_pgno++; 9426 my->mc_wlen[toggle] += my->mc_env->me_psize; 9427 if (mc.mc_top) { 9428 /* Update parent if there is one */ 9429 ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); 9430 SETPGNO(ni, mo->mp_pgno); 9431 mdb_cursor_pop(&mc); 9432 } else { 9433 /* Otherwise we're done */ 9434 *pg = mo->mp_pgno; 9435 break; 9436 } 9437 } 9438 done: 9439 free(buf); 9440 return rc; 9441 } 9442 9443 /** Copy environment with compaction. */ 9444 static int ESECT 9445 mdb_env_copyfd1(MDB_env *env, HANDLE fd) 9446 { 9447 MDB_meta *mm; 9448 MDB_page *mp; 9449 mdb_copy my = {0}; 9450 MDB_txn *txn = NULL; 9451 pthread_t thr; 9452 pgno_t root, new_root; 9453 int rc = MDB_SUCCESS; 9454 9455 #ifdef _WIN32 9456 if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || 9457 !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { 9458 rc = ErrCode(); 9459 goto done; 9460 } 9461 my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); 9462 if (my.mc_wbuf[0] == NULL) { 9463 /* _aligned_malloc() sets errno, but we use Windows error codes */ 9464 rc = ERROR_NOT_ENOUGH_MEMORY; 9465 goto done; 9466 } 9467 #else 9468 if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) 9469 return rc; 9470 if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) 9471 goto done2; 9472 #ifdef HAVE_MEMALIGN 9473 my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); 9474 if (my.mc_wbuf[0] == NULL) { 9475 rc = errno; 9476 goto done; 9477 } 9478 #else 9479 { 9480 void *p; 9481 if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) 9482 goto 
done; 9483 my.mc_wbuf[0] = p; 9484 } 9485 #endif 9486 #endif 9487 memset(my.mc_wbuf[0], 0, MDB_WBUF*2); 9488 my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; 9489 my.mc_next_pgno = NUM_METAS; 9490 my.mc_env = env; 9491 my.mc_fd = fd; 9492 rc = THREAD_CREATE(thr, mdb_env_copythr, &my); 9493 if (rc) 9494 goto done; 9495 9496 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); 9497 if (rc) 9498 goto finish; 9499 9500 mp = (MDB_page *)my.mc_wbuf[0]; 9501 memset(mp, 0, NUM_METAS * env->me_psize); 9502 mp->mp_pgno = 0; 9503 mp->mp_flags = P_META; 9504 mm = (MDB_meta *)METADATA(mp); 9505 mdb_env_init_meta0(env, mm); 9506 mm->mm_address = env->me_metas[0]->mm_address; 9507 9508 mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); 9509 mp->mp_pgno = 1; 9510 mp->mp_flags = P_META; 9511 *(MDB_meta *)METADATA(mp) = *mm; 9512 mm = (MDB_meta *)METADATA(mp); 9513 9514 /* Set metapage 1 with current main DB */ 9515 root = new_root = txn->mt_dbs[MAIN_DBI].md_root; 9516 if (root != P_INVALID) { 9517 /* Count free pages + freeDB pages. Subtract from last_pg 9518 * to find the new last_pg, which also becomes the new root. 9519 */ 9520 MDB_ID freecount = 0; 9521 MDB_cursor mc; 9522 MDB_val key, data; 9523 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 9524 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) 9525 freecount += *(MDB_ID *)data.mv_data; 9526 if (rc != MDB_NOTFOUND) 9527 goto finish; 9528 freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + 9529 txn->mt_dbs[FREE_DBI].md_leaf_pages + 9530 txn->mt_dbs[FREE_DBI].md_overflow_pages; 9531 9532 new_root = txn->mt_next_pgno - 1 - freecount; 9533 mm->mm_last_pg = new_root; 9534 mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; 9535 mm->mm_dbs[MAIN_DBI].md_root = new_root; 9536 } else { 9537 /* When the DB is empty, handle it specially to 9538 * fix any breakage like page leaks from ITS#8174. 
9539 */ 9540 mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; 9541 } 9542 if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { 9543 mm->mm_txnid = 1; /* use metapage 1 */ 9544 } 9545 9546 my.mc_wlen[0] = env->me_psize * NUM_METAS; 9547 my.mc_txn = txn; 9548 rc = mdb_env_cwalk(&my, &root, 0); 9549 if (rc == MDB_SUCCESS && root != new_root) { 9550 rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ 9551 } 9552 9553 finish: 9554 if (rc) 9555 my.mc_error = rc; 9556 mdb_env_cthr_toggle(&my, 1 | MDB_EOF); 9557 rc = THREAD_FINISH(thr); 9558 _mdb_txn_abort(txn); 9559 9560 done: 9561 #ifdef _WIN32 9562 if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); 9563 if (my.mc_cond) CloseHandle(my.mc_cond); 9564 if (my.mc_mutex) CloseHandle(my.mc_mutex); 9565 #else 9566 free(my.mc_wbuf[0]); 9567 pthread_cond_destroy(&my.mc_cond); 9568 done2: 9569 pthread_mutex_destroy(&my.mc_mutex); 9570 #endif 9571 return rc ? rc : my.mc_error; 9572 } 9573 9574 /** Copy environment as-is. */ 9575 static int ESECT 9576 mdb_env_copyfd0(MDB_env *env, HANDLE fd) 9577 { 9578 MDB_txn *txn = NULL; 9579 mdb_mutexref_t wmutex = NULL; 9580 int rc; 9581 size_t wsize, w3; 9582 char *ptr; 9583 #ifdef _WIN32 9584 DWORD len, w2; 9585 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) 9586 #else 9587 ssize_t len; 9588 size_t w2; 9589 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) 9590 #endif 9591 9592 /* Do the lock/unlock of the reader mutex before starting the 9593 * write txn. Otherwise other read txns could block writers. 
9594 */ 9595 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); 9596 if (rc) 9597 return rc; 9598 9599 if (env->me_txns) { 9600 /* We must start the actual read txn after blocking writers */ 9601 mdb_txn_end(txn, MDB_END_RESET_TMP); 9602 9603 /* Temporarily block writers until we snapshot the meta pages */ 9604 wmutex = env->me_wmutex; 9605 if (LOCK_MUTEX(rc, env, wmutex)) 9606 goto leave; 9607 9608 rc = mdb_txn_renew0(txn); 9609 if (rc) { 9610 UNLOCK_MUTEX(wmutex); 9611 goto leave; 9612 } 9613 } 9614 9615 wsize = env->me_psize * NUM_METAS; 9616 ptr = env->me_map; 9617 w2 = wsize; 9618 while (w2 > 0) { 9619 DO_WRITE(rc, fd, ptr, w2, len); 9620 if (!rc) { 9621 rc = ErrCode(); 9622 break; 9623 } else if (len > 0) { 9624 rc = MDB_SUCCESS; 9625 ptr += len; 9626 w2 -= len; 9627 continue; 9628 } else { 9629 /* Non-blocking or async handles are not supported */ 9630 rc = EIO; 9631 break; 9632 } 9633 } 9634 if (wmutex) 9635 UNLOCK_MUTEX(wmutex); 9636 9637 if (rc) 9638 goto leave; 9639 9640 w3 = txn->mt_next_pgno * env->me_psize; 9641 { 9642 size_t fsize = 0; 9643 if ((rc = mdb_fsize(env->me_fd, &fsize))) 9644 goto leave; 9645 if (w3 > fsize) 9646 w3 = fsize; 9647 } 9648 wsize = w3 - wsize; 9649 while (wsize > 0) { 9650 if (wsize > MAX_WRITE) 9651 w2 = MAX_WRITE; 9652 else 9653 w2 = wsize; 9654 DO_WRITE(rc, fd, ptr, w2, len); 9655 if (!rc) { 9656 rc = ErrCode(); 9657 break; 9658 } else if (len > 0) { 9659 rc = MDB_SUCCESS; 9660 ptr += len; 9661 wsize -= len; 9662 continue; 9663 } else { 9664 rc = EIO; 9665 break; 9666 } 9667 } 9668 9669 leave: 9670 _mdb_txn_abort(txn); 9671 return rc; 9672 } 9673 9674 int ESECT 9675 mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) 9676 { 9677 if (flags & MDB_CP_COMPACT) 9678 return mdb_env_copyfd1(env, fd); 9679 else 9680 return mdb_env_copyfd0(env, fd); 9681 } 9682 9683 int ESECT 9684 mdb_env_copyfd(MDB_env *env, HANDLE fd) 9685 { 9686 return mdb_env_copyfd2(env, fd, 0); 9687 } 9688 9689 int ESECT 9690 mdb_env_copy2(MDB_env *env, 
const char *path, unsigned int flags) 9691 { 9692 int rc; 9693 MDB_name fname; 9694 HANDLE newfd = INVALID_HANDLE_VALUE; 9695 9696 rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); 9697 if (rc == MDB_SUCCESS) { 9698 rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); 9699 mdb_fname_destroy(fname); 9700 } 9701 if (rc == MDB_SUCCESS) { 9702 rc = mdb_env_copyfd2(env, newfd, flags); 9703 if (close(newfd) < 0 && rc == MDB_SUCCESS) 9704 rc = ErrCode(); 9705 } 9706 return rc; 9707 } 9708 9709 int ESECT 9710 mdb_env_copy(MDB_env *env, const char *path) 9711 { 9712 return mdb_env_copy2(env, path, 0); 9713 } 9714 9715 int ESECT 9716 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) 9717 { 9718 if (flag & ~CHANGEABLE) 9719 return EINVAL; 9720 if (onoff) 9721 env->me_flags |= flag; 9722 else 9723 env->me_flags &= ~flag; 9724 return MDB_SUCCESS; 9725 } 9726 9727 int ESECT 9728 mdb_env_get_flags(MDB_env *env, unsigned int *arg) 9729 { 9730 if (!env || !arg) 9731 return EINVAL; 9732 9733 *arg = env->me_flags & (CHANGEABLE|CHANGELESS); 9734 return MDB_SUCCESS; 9735 } 9736 9737 int ESECT 9738 mdb_env_set_userctx(MDB_env *env, void *ctx) 9739 { 9740 if (!env) 9741 return EINVAL; 9742 env->me_userctx = ctx; 9743 return MDB_SUCCESS; 9744 } 9745 9746 void * ESECT 9747 mdb_env_get_userctx(MDB_env *env) 9748 { 9749 return env ? 
env->me_userctx : NULL; 9750 } 9751 9752 int ESECT 9753 mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) 9754 { 9755 if (!env) 9756 return EINVAL; 9757 #ifndef NDEBUG 9758 env->me_assert_func = func; 9759 #endif 9760 return MDB_SUCCESS; 9761 } 9762 9763 int ESECT 9764 mdb_env_get_path(MDB_env *env, const char **arg) 9765 { 9766 if (!env || !arg) 9767 return EINVAL; 9768 9769 *arg = env->me_path; 9770 return MDB_SUCCESS; 9771 } 9772 9773 int ESECT 9774 mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) 9775 { 9776 if (!env || !arg) 9777 return EINVAL; 9778 9779 *arg = env->me_fd; 9780 return MDB_SUCCESS; 9781 } 9782 9783 /** Common code for #mdb_stat() and #mdb_env_stat(). 9784 * @param[in] env the environment to operate in. 9785 * @param[in] db the #MDB_db record containing the stats to return. 9786 * @param[out] arg the address of an #MDB_stat structure to receive the stats. 9787 * @return 0, this function always succeeds. 9788 */ 9789 static int ESECT 9790 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) 9791 { 9792 arg->ms_psize = env->me_psize; 9793 arg->ms_depth = db->md_depth; 9794 arg->ms_branch_pages = db->md_branch_pages; 9795 arg->ms_leaf_pages = db->md_leaf_pages; 9796 arg->ms_overflow_pages = db->md_overflow_pages; 9797 arg->ms_entries = db->md_entries; 9798 9799 return MDB_SUCCESS; 9800 } 9801 9802 int ESECT 9803 mdb_env_stat(MDB_env *env, MDB_stat *arg) 9804 { 9805 MDB_meta *meta; 9806 9807 if (env == NULL || arg == NULL) 9808 return EINVAL; 9809 9810 meta = mdb_env_pick_meta(env); 9811 9812 return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); 9813 } 9814 9815 int ESECT 9816 mdb_env_info(MDB_env *env, MDB_envinfo *arg) 9817 { 9818 MDB_meta *meta; 9819 9820 if (env == NULL || arg == NULL) 9821 return EINVAL; 9822 9823 meta = mdb_env_pick_meta(env); 9824 arg->me_mapaddr = meta->mm_address; 9825 arg->me_last_pgno = meta->mm_last_pg; 9826 arg->me_last_txnid = meta->mm_txnid; 9827 9828 arg->me_mapsize = env->me_mapsize; 9829 arg->me_maxreaders = 
env->me_maxreaders; 9830 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; 9831 return MDB_SUCCESS; 9832 } 9833 9834 /** Set the default comparison functions for a database. 9835 * Called immediately after a database is opened to set the defaults. 9836 * The user can then override them with #mdb_set_compare() or 9837 * #mdb_set_dupsort(). 9838 * @param[in] txn A transaction handle returned by #mdb_txn_begin() 9839 * @param[in] dbi A database handle returned by #mdb_dbi_open() 9840 */ 9841 static void 9842 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) 9843 { 9844 uint16_t f = txn->mt_dbs[dbi].md_flags; 9845 9846 txn->mt_dbxs[dbi].md_cmp = 9847 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : 9848 (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; 9849 9850 txn->mt_dbxs[dbi].md_dcmp = 9851 !(f & MDB_DUPSORT) ? 0 : 9852 ((f & MDB_INTEGERDUP) 9853 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) 9854 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); 9855 } 9856 9857 int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) 9858 { 9859 MDB_val key, data; 9860 MDB_dbi i; 9861 MDB_cursor mc; 9862 MDB_db dummy; 9863 int rc, dbflag, exact; 9864 unsigned int unused = 0, seq; 9865 char *namedup; 9866 size_t len; 9867 9868 if (flags & ~VALID_FLAGS) 9869 return EINVAL; 9870 if (txn->mt_flags & MDB_TXN_BLOCKED) 9871 return MDB_BAD_TXN; 9872 9873 /* main DB? 
*/ 9874 if (!name) { 9875 *dbi = MAIN_DBI; 9876 if (flags & PERSISTENT_FLAGS) { 9877 uint16_t f2 = flags & PERSISTENT_FLAGS; 9878 /* make sure flag changes get committed */ 9879 if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { 9880 txn->mt_dbs[MAIN_DBI].md_flags |= f2; 9881 txn->mt_flags |= MDB_TXN_DIRTY; 9882 } 9883 } 9884 mdb_default_cmp(txn, MAIN_DBI); 9885 MDB_TRACE(("%p, (null), %u = %u", txn, flags, MAIN_DBI)); 9886 return MDB_SUCCESS; 9887 } 9888 9889 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { 9890 mdb_default_cmp(txn, MAIN_DBI); 9891 } 9892 9893 /* Is the DB already open? */ 9894 len = strlen(name); 9895 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 9896 if (!txn->mt_dbxs[i].md_name.mv_size) { 9897 /* Remember this free slot */ 9898 if (!unused) unused = i; 9899 continue; 9900 } 9901 if (len == txn->mt_dbxs[i].md_name.mv_size && 9902 !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { 9903 *dbi = i; 9904 return MDB_SUCCESS; 9905 } 9906 } 9907 9908 /* If no free slot and max hit, fail */ 9909 if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) 9910 return MDB_DBS_FULL; 9911 9912 /* Cannot mix named databases with some mainDB flags */ 9913 if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) 9914 return (flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; 9915 9916 /* Find the DB info */ 9917 dbflag = DB_NEW|DB_VALID|DB_USRVALID; 9918 exact = 0; 9919 key.mv_size = len; 9920 key.mv_data = (void *)name; 9921 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 9922 rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); 9923 if (rc == MDB_SUCCESS) { 9924 /* make sure this is actually a DB */ 9925 MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); 9926 if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) 9927 return MDB_INCOMPATIBLE; 9928 } else { 9929 if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) 9930 return rc; 9931 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 9932 return EACCES; 9933 } 9934 9935 /* Done here so we cannot fail after creating a new DB */ 9936 if ((namedup = strdup(name)) == NULL) 9937 return ENOMEM; 9938 9939 if (rc) { 9940 /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ 9941 data.mv_size = sizeof(MDB_db); 9942 data.mv_data = &dummy; 9943 memset(&dummy, 0, sizeof(dummy)); 9944 dummy.md_root = P_INVALID; 9945 dummy.md_flags = flags & PERSISTENT_FLAGS; 9946 WITH_CURSOR_TRACKING(mc, 9947 rc = _mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); 9948 dbflag |= DB_DIRTY; 9949 } 9950 9951 if (rc) { 9952 free(namedup); 9953 } else { 9954 /* Got info, register DBI in this txn */ 9955 unsigned int slot = unused ? 
unused : txn->mt_numdbs; 9956 txn->mt_dbxs[slot].md_name.mv_data = namedup; 9957 txn->mt_dbxs[slot].md_name.mv_size = len; 9958 txn->mt_dbxs[slot].md_rel = NULL; 9959 txn->mt_dbflags[slot] = dbflag; 9960 /* txn-> and env-> are the same in read txns, use 9961 * tmp variable to avoid undefined assignment 9962 */ 9963 seq = ++txn->mt_env->me_dbiseqs[slot]; 9964 txn->mt_dbiseqs[slot] = seq; 9965 9966 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); 9967 *dbi = slot; 9968 mdb_default_cmp(txn, slot); 9969 if (!unused) { 9970 txn->mt_numdbs++; 9971 } 9972 MDB_TRACE(("%p, %s, %u = %u", txn, name, flags, slot)); 9973 } 9974 9975 return rc; 9976 } 9977 9978 int ESECT 9979 mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) 9980 { 9981 if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) 9982 return EINVAL; 9983 9984 if (txn->mt_flags & MDB_TXN_BLOCKED) 9985 return MDB_BAD_TXN; 9986 9987 if (txn->mt_dbflags[dbi] & DB_STALE) { 9988 MDB_cursor mc; 9989 MDB_xcursor mx; 9990 /* Stale, must read the DB's root. cursor_init does it for us. */ 9991 mdb_cursor_init(&mc, txn, dbi, &mx); 9992 } 9993 return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); 9994 } 9995 9996 void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) 9997 { 9998 char *ptr; 9999 if (dbi < CORE_DBS || dbi >= env->me_maxdbs) 10000 return; 10001 ptr = env->me_dbxs[dbi].md_name.mv_data; 10002 /* If there was no name, this was already closed */ 10003 if (ptr) { 10004 MDB_TRACE(("%p, %u", env, dbi)); 10005 env->me_dbxs[dbi].md_name.mv_data = NULL; 10006 env->me_dbxs[dbi].md_name.mv_size = 0; 10007 env->me_dbflags[dbi] = 0; 10008 env->me_dbiseqs[dbi]++; 10009 free(ptr); 10010 } 10011 } 10012 10013 int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) 10014 { 10015 /* We could return the flags for the FREE_DBI too but what's the point? 
*/ 10016 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10017 return EINVAL; 10018 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; 10019 return MDB_SUCCESS; 10020 } 10021 10022 /** Add all the DB's pages to the free list. 10023 * @param[in] mc Cursor on the DB to free. 10024 * @param[in] subs non-Zero to check for sub-DBs in this DB. 10025 * @return 0 on success, non-zero on failure. 10026 */ 10027 static int 10028 mdb_drop0(MDB_cursor *mc, int subs) 10029 { 10030 int rc; 10031 10032 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); 10033 if (rc == MDB_SUCCESS) { 10034 MDB_txn *txn = mc->mc_txn; 10035 MDB_node *ni; 10036 MDB_cursor mx; 10037 unsigned int i; 10038 10039 /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. 10040 * This also avoids any P_LEAF2 pages, which have no nodes. 10041 * Also if the DB doesn't have sub-DBs and has no overflow 10042 * pages, omit scanning leaves. 10043 */ 10044 if ((mc->mc_flags & C_SUB) || 10045 (!subs && !mc->mc_db->md_overflow_pages)) 10046 mdb_cursor_pop(mc); 10047 10048 mdb_cursor_copy(mc, &mx); 10049 while (mc->mc_snum > 0) { 10050 MDB_page *mp = mc->mc_pg[mc->mc_top]; 10051 unsigned n = NUMKEYS(mp); 10052 if (IS_LEAF(mp)) { 10053 for (i=0; i<n; i++) { 10054 ni = NODEPTR(mp, i); 10055 if (ni->mn_flags & F_BIGDATA) { 10056 MDB_page *omp; 10057 pgno_t pg; 10058 memcpy(&pg, NODEDATA(ni), sizeof(pg)); 10059 rc = mdb_page_get(mc, pg, &omp, NULL); 10060 if (rc != 0) 10061 goto done; 10062 mdb_cassert(mc, IS_OVERFLOW(omp)); 10063 rc = mdb_midl_append_range(&txn->mt_free_pgs, 10064 pg, omp->mp_pages); 10065 if (rc) 10066 goto done; 10067 mc->mc_db->md_overflow_pages -= omp->mp_pages; 10068 if (!mc->mc_db->md_overflow_pages && !subs) 10069 break; 10070 } else if (subs && (ni->mn_flags & F_SUBDATA)) { 10071 mdb_xcursor_init1(mc, ni); 10072 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); 10073 if (rc) 10074 goto done; 10075 } 10076 } 10077 if (!subs && !mc->mc_db->md_overflow_pages) 10078 goto pop; 10079 } else { 10080 if 
((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) 10081 goto done; 10082 for (i=0; i<n; i++) { 10083 pgno_t pg; 10084 ni = NODEPTR(mp, i); 10085 pg = NODEPGNO(ni); 10086 /* free it */ 10087 mdb_midl_xappend(txn->mt_free_pgs, pg); 10088 } 10089 } 10090 if (!mc->mc_top) 10091 break; 10092 mc->mc_ki[mc->mc_top] = i; 10093 rc = mdb_cursor_sibling(mc, 1); 10094 if (rc) { 10095 if (rc != MDB_NOTFOUND) 10096 goto done; 10097 /* no more siblings, go back to beginning 10098 * of previous level. 10099 */ 10100 pop: 10101 mdb_cursor_pop(mc); 10102 mc->mc_ki[0] = 0; 10103 for (i=1; i<mc->mc_snum; i++) { 10104 mc->mc_ki[i] = 0; 10105 mc->mc_pg[i] = mx.mc_pg[i]; 10106 } 10107 } 10108 } 10109 /* free it */ 10110 rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); 10111 done: 10112 if (rc) 10113 txn->mt_flags |= MDB_TXN_ERROR; 10114 } else if (rc == MDB_NOTFOUND) { 10115 rc = MDB_SUCCESS; 10116 } 10117 mc->mc_flags &= ~C_INITIALIZED; 10118 return rc; 10119 } 10120 10121 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) 10122 { 10123 MDB_cursor *mc, *m2; 10124 int rc; 10125 10126 if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10127 return EINVAL; 10128 10129 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 10130 return EACCES; 10131 10132 if (TXN_DBI_CHANGED(txn, dbi)) 10133 return MDB_BAD_DBI; 10134 10135 rc = mdb_cursor_open(txn, dbi, &mc); 10136 if (rc) 10137 return rc; 10138 10139 MDB_TRACE(("%u, %d", dbi, del)); 10140 rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); 10141 /* Invalidate the dropped DB's cursors */ 10142 for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) 10143 m2->mc_flags &= ~(C_INITIALIZED|C_EOF); 10144 if (rc) 10145 goto leave; 10146 10147 /* Can't delete the main DB */ 10148 if (del && dbi >= CORE_DBS) { 10149 rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); 10150 if (!rc) { 10151 txn->mt_dbflags[dbi] = DB_STALE; 10152 mdb_dbi_close(txn->mt_env, dbi); 10153 } else { 10154 txn->mt_flags |= MDB_TXN_ERROR; 10155 
} 10156 } else { 10157 /* reset the DB record, mark it dirty */ 10158 txn->mt_dbflags[dbi] |= DB_DIRTY; 10159 txn->mt_dbs[dbi].md_depth = 0; 10160 txn->mt_dbs[dbi].md_branch_pages = 0; 10161 txn->mt_dbs[dbi].md_leaf_pages = 0; 10162 txn->mt_dbs[dbi].md_overflow_pages = 0; 10163 txn->mt_dbs[dbi].md_entries = 0; 10164 txn->mt_dbs[dbi].md_root = P_INVALID; 10165 10166 txn->mt_flags |= MDB_TXN_DIRTY; 10167 } 10168 leave: 10169 mdb_cursor_close(mc); 10170 return rc; 10171 } 10172 10173 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) 10174 { 10175 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10176 return EINVAL; 10177 10178 txn->mt_dbxs[dbi].md_cmp = cmp; 10179 return MDB_SUCCESS; 10180 } 10181 10182 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) 10183 { 10184 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10185 return EINVAL; 10186 10187 txn->mt_dbxs[dbi].md_dcmp = cmp; 10188 return MDB_SUCCESS; 10189 } 10190 10191 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) 10192 { 10193 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10194 return EINVAL; 10195 10196 txn->mt_dbxs[dbi].md_rel = rel; 10197 return MDB_SUCCESS; 10198 } 10199 10200 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) 10201 { 10202 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 10203 return EINVAL; 10204 10205 txn->mt_dbxs[dbi].md_relctx = ctx; 10206 return MDB_SUCCESS; 10207 } 10208 10209 int ESECT 10210 mdb_env_get_maxkeysize(MDB_env *env) 10211 { 10212 return ENV_MAXKEY(env); 10213 } 10214 10215 int ESECT 10216 mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) 10217 { 10218 unsigned int i, rdrs; 10219 MDB_reader *mr; 10220 char buf[64]; 10221 int rc = 0, first = 1; 10222 10223 if (!env || !func) 10224 return -1; 10225 if (!env->me_txns) { 10226 return func("(no reader locks)\n", ctx); 10227 } 10228 rdrs = env->me_txns->mti_numreaders; 10229 mr = env->me_txns->mti_readers; 10230 for (i=0; i<rdrs; i++) { 10231 if (mr[i].mr_pid) { 10232 txnid_t txnid 
= mr[i].mr_txnid; 10233 sprintf(buf, txnid == (txnid_t)-1 ? 10234 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", 10235 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); 10236 if (first) { 10237 first = 0; 10238 rc = func(" pid thread txnid\n", ctx); 10239 if (rc < 0) 10240 break; 10241 } 10242 rc = func(buf, ctx); 10243 if (rc < 0) 10244 break; 10245 } 10246 } 10247 if (first) { 10248 rc = func("(no active readers)\n", ctx); 10249 } 10250 return rc; 10251 } 10252 10253 /** Insert pid into list if not already present. 10254 * return -1 if already present. 10255 */ 10256 static int ESECT 10257 mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) 10258 { 10259 /* binary search of pid in list */ 10260 unsigned base = 0; 10261 unsigned cursor = 1; 10262 int val = 0; 10263 unsigned n = ids[0]; 10264 10265 while( 0 < n ) { 10266 unsigned pivot = n >> 1; 10267 cursor = base + pivot + 1; 10268 val = pid - ids[cursor]; 10269 10270 if( val < 0 ) { 10271 n = pivot; 10272 10273 } else if ( val > 0 ) { 10274 base = cursor; 10275 n -= pivot + 1; 10276 10277 } else { 10278 /* found, so it's a duplicate */ 10279 return -1; 10280 } 10281 } 10282 10283 if( val > 0 ) { 10284 ++cursor; 10285 } 10286 ids[0]++; 10287 for (n = ids[0]; n > cursor; n--) 10288 ids[n] = ids[n-1]; 10289 ids[n] = pid; 10290 return 0; 10291 } 10292 10293 int ESECT 10294 mdb_reader_check(MDB_env *env, int *dead) 10295 { 10296 if (!env) 10297 return EINVAL; 10298 if (dead) 10299 *dead = 0; 10300 return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; 10301 } 10302 10303 /** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ 10304 static int ESECT 10305 mdb_reader_check0(MDB_env *env, int rlocked, int *dead) 10306 { 10307 mdb_mutexref_t rmutex = rlocked ? 
NULL : env->me_rmutex; 10308 unsigned int i, j, rdrs; 10309 MDB_reader *mr; 10310 MDB_PID_T *pids, pid; 10311 int rc = MDB_SUCCESS, count = 0; 10312 10313 rdrs = env->me_txns->mti_numreaders; 10314 pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); 10315 if (!pids) 10316 return ENOMEM; 10317 pids[0] = 0; 10318 mr = env->me_txns->mti_readers; 10319 for (i=0; i<rdrs; i++) { 10320 pid = mr[i].mr_pid; 10321 if (pid && pid != env->me_pid) { 10322 if (mdb_pid_insert(pids, pid) == 0) { 10323 if (!mdb_reader_pid(env, Pidcheck, pid)) { 10324 /* Stale reader found */ 10325 j = i; 10326 if (rmutex) { 10327 if ((rc = LOCK_MUTEX0(rmutex)) != 0) { 10328 if ((rc = mdb_mutex_failed(env, rmutex, rc))) 10329 break; 10330 rdrs = 0; /* the above checked all readers */ 10331 } else { 10332 /* Recheck, a new process may have reused pid */ 10333 if (mdb_reader_pid(env, Pidcheck, pid)) 10334 j = rdrs; 10335 } 10336 } 10337 for (; j<rdrs; j++) 10338 if (mr[j].mr_pid == pid) { 10339 DPRINTF(("clear stale reader pid %u txn %"Z"d", 10340 (unsigned) pid, mr[j].mr_txnid)); 10341 mr[j].mr_pid = 0; 10342 count++; 10343 } 10344 if (rmutex) 10345 UNLOCK_MUTEX(rmutex); 10346 } 10347 } 10348 } 10349 } 10350 free(pids); 10351 if (dead) 10352 *dead = count; 10353 return rc; 10354 } 10355 10356 #ifdef MDB_ROBUST_SUPPORTED 10357 /** Handle #LOCK_MUTEX0() failure. 10358 * Try to repair the lock file if the mutex owner died. 10359 * @param[in] env the environment handle 10360 * @param[in] mutex LOCK_MUTEX0() mutex 10361 * @param[in] rc LOCK_MUTEX0() error (nonzero) 10362 * @return 0 on success with the mutex locked, or an error code on failure. 10363 */ 10364 static int ESECT 10365 mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) 10366 { 10367 int rlocked, rc2; 10368 MDB_meta *meta; 10369 10370 if (rc == MDB_OWNERDEAD) { 10371 /* We own the mutex. Clean up after dead previous owner. 
*/ 10372 rc = MDB_SUCCESS; 10373 rlocked = (mutex == env->me_rmutex); 10374 if (!rlocked) { 10375 /* Keep mti_txnid updated, otherwise next writer can 10376 * overwrite data which latest meta page refers to. 10377 */ 10378 meta = mdb_env_pick_meta(env); 10379 env->me_txns->mti_txnid = meta->mm_txnid; 10380 /* env is hosed if the dead thread was ours */ 10381 if (env->me_txn) { 10382 env->me_flags |= MDB_FATAL_ERROR; 10383 env->me_txn = NULL; 10384 rc = MDB_PANIC; 10385 } 10386 } 10387 DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), 10388 (rc ? "this process' env is hosed" : "recovering"))); 10389 rc2 = mdb_reader_check0(env, rlocked, NULL); 10390 if (rc2 == 0) 10391 rc2 = mdb_mutex_consistent(mutex); 10392 if (rc || (rc = rc2)) { 10393 DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc))); 10394 UNLOCK_MUTEX(mutex); 10395 } 10396 } else { 10397 #ifdef _WIN32 10398 rc = ErrCode(); 10399 #endif 10400 DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc))); 10401 } 10402 10403 return rc; 10404 } 10405 #endif /* MDB_ROBUST_SUPPORTED */ 10406 10407 #if defined(_WIN32) 10408 /** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ 10409 static int ESECT 10410 utf8_to_utf16(const char *src, MDB_name *dst, int xtra) 10411 { 10412 int rc, need = 0; 10413 wchar_t *result = NULL; 10414 for (;;) { /* malloc result, then fill it in */ 10415 need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); 10416 if (!need) { 10417 rc = ErrCode(); 10418 free(result); 10419 return rc; 10420 } 10421 if (!result) { 10422 result = malloc(sizeof(wchar_t) * (need + xtra)); 10423 if (!result) 10424 return ENOMEM; 10425 continue; 10426 } 10427 dst->mn_alloced = 1; 10428 dst->mn_len = need - 1; 10429 dst->mn_val = result; 10430 return MDB_SUCCESS; 10431 } 10432 } 10433 #endif /* defined(_WIN32) */ 10434 /** @} */