From 6d4002e12288368cdb0693074eb9f4be47f4fec5 Mon Sep 17 00:00:00 2001 From: Jorge Fuentes Date: Wed, 6 Apr 2022 14:47:24 +0200 Subject: [PATCH] CHG: Add kryszyp lmdb-js improvements --- dependencies/lmdb/libraries/liblmdb/COPYRIGHT | 2 +- dependencies/lmdb/libraries/liblmdb/Makefile | 30 +- dependencies/lmdb/libraries/liblmdb/chacha8.c | 183 ++ dependencies/lmdb/libraries/liblmdb/chacha8.h | 14 + dependencies/lmdb/libraries/liblmdb/crypto.c | 121 + dependencies/lmdb/libraries/liblmdb/intro.doc | 2 +- dependencies/lmdb/libraries/liblmdb/lmdb.h | 145 +- dependencies/lmdb/libraries/liblmdb/mdb.c | 2131 ++++++++++++----- .../lmdb/libraries/liblmdb/mdb_copy.1 | 17 +- .../lmdb/libraries/liblmdb/mdb_copy.c | 26 +- .../lmdb/libraries/liblmdb/mdb_drop.1 | 17 +- .../lmdb/libraries/liblmdb/mdb_drop.c | 25 +- .../lmdb/libraries/liblmdb/mdb_dump.1 | 17 +- .../lmdb/libraries/liblmdb/mdb_dump.c | 46 +- .../lmdb/libraries/liblmdb/mdb_load.1 | 17 +- .../lmdb/libraries/liblmdb/mdb_load.c | 48 +- .../lmdb/libraries/liblmdb/mdb_stat.1 | 17 +- .../lmdb/libraries/liblmdb/mdb_stat.c | 46 +- dependencies/lmdb/libraries/liblmdb/midl.c | 39 +- dependencies/lmdb/libraries/liblmdb/midl.h | 18 +- dependencies/lmdb/libraries/liblmdb/module.c | 101 + dependencies/lmdb/libraries/liblmdb/module.h | 16 + dependencies/lmdb/libraries/liblmdb/mtest.c | 3 +- dependencies/lmdb/libraries/liblmdb/mtest2.c | 2 +- dependencies/lmdb/libraries/liblmdb/mtest3.c | 2 +- dependencies/lmdb/libraries/liblmdb/mtest4.c | 2 +- dependencies/lmdb/libraries/liblmdb/mtest5.c | 2 +- dependencies/lmdb/libraries/liblmdb/mtest6.c | 2 +- .../lmdb/libraries/liblmdb/mtest_enc.c | 190 ++ .../lmdb/libraries/liblmdb/mtest_enc2.c | 189 ++ .../lmdb/libraries/liblmdb/mtest_remap.c | 177 ++ .../lmdb/libraries/liblmdb/sample-bdb.txt | 2 +- .../lmdb/libraries/liblmdb/sample-mdb.txt | 2 +- 33 files changed, 2978 insertions(+), 673 deletions(-) create mode 100644 dependencies/lmdb/libraries/liblmdb/chacha8.c create mode 100644 
dependencies/lmdb/libraries/liblmdb/chacha8.h create mode 100644 dependencies/lmdb/libraries/liblmdb/crypto.c create mode 100644 dependencies/lmdb/libraries/liblmdb/module.c create mode 100644 dependencies/lmdb/libraries/liblmdb/module.h create mode 100644 dependencies/lmdb/libraries/liblmdb/mtest_enc.c create mode 100644 dependencies/lmdb/libraries/liblmdb/mtest_enc2.c create mode 100644 dependencies/lmdb/libraries/liblmdb/mtest_remap.c diff --git a/dependencies/lmdb/libraries/liblmdb/COPYRIGHT b/dependencies/lmdb/libraries/liblmdb/COPYRIGHT index d9118b97c9..14eb1493d6 100644 --- a/dependencies/lmdb/libraries/liblmdb/COPYRIGHT +++ b/dependencies/lmdb/libraries/liblmdb/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2011-2020 Howard Chu, Symas Corp. +Copyright 2011-2021 Howard Chu, Symas Corp. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/Makefile b/dependencies/lmdb/libraries/liblmdb/Makefile index 942d0dbe4c..c252b50e21 100644 --- a/dependencies/lmdb/libraries/liblmdb/Makefile +++ b/dependencies/lmdb/libraries/liblmdb/Makefile @@ -24,9 +24,11 @@ W = -W -Wall -Wno-unused-parameter -Wbad-function-cast -Wuninitialized THREADS = -pthread OPT = -O2 -g CFLAGS = $(THREADS) $(OPT) $(W) $(XCFLAGS) +LDFLAGS = $(THREADS) LDLIBS = SOLIBS = SOEXT = .so +LDL = -ldl prefix = /usr/local exec_prefix = $(prefix) bindir = $(exec_prefix)/bin @@ -42,7 +44,11 @@ ILIBS = liblmdb.a liblmdb$(SOEXT) IPROGS = mdb_stat mdb_copy mdb_dump mdb_load mdb_drop IDOCS = mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 mdb_drop.1 PROGS = $(IPROGS) mtest mtest2 mtest3 mtest4 mtest5 +RPROGS = mtest_remap mtest_enc mtest_enc2 + all: $(ILIBS) $(PROGS) +# Requires CPPFLAGS=-DMDB_VL32 and/or -DMDB_RPAGE_CACHE +rall: all $(RPROGS) install: $(ILIBS) $(IPROGS) $(IHDRS) mkdir -p $(DESTDIR)$(bindir) @@ -55,7 +61,7 @@ install: $(ILIBS) $(IPROGS) $(IHDRS) for f in $(IDOCS); do cp $$f $(DESTDIR)$(mandir)/man1; done clean: - rm -rf 
$(PROGS) *.[ao] *.[ls]o *~ testdb + rm -rf $(PROGS) $(RPROGS) *.[ao] *.[ls]o *~ testdb test: all rm -rf testdb && mkdir testdb @@ -68,17 +74,29 @@ liblmdb$(SOEXT): mdb.lo midl.lo # $(CC) $(LDFLAGS) -pthread -shared -Wl,-Bsymbolic -o $@ mdb.o midl.o $(SOLIBS) $(CC) $(LDFLAGS) -pthread -shared -o $@ mdb.lo midl.lo $(SOLIBS) -mdb_stat: mdb_stat.o liblmdb.a -mdb_copy: mdb_copy.o liblmdb.a -mdb_dump: mdb_dump.o liblmdb.a -mdb_load: mdb_load.o liblmdb.a -mdb_drop: mdb_drop.o liblmdb.a +mdb_stat: mdb_stat.o module.o liblmdb.a + $(CC) $(LDFLAGS) -o $@ $^ $(LDL) +mdb_copy: mdb_copy.o module.o liblmdb.a + $(CC) $(LDFLAGS) -o $@ $^ $(LDL) +mdb_dump: mdb_dump.o module.o liblmdb.a + $(CC) $(LDFLAGS) -o $@ $^ $(LDL) +mdb_load: mdb_load.o module.o liblmdb.a + $(CC) $(LDFLAGS) -o $@ $^ $(LDL) +mdb_drop: mdb_drop.o module.o liblmdb.a + $(CC) $(LDFLAGS) -o $@ $^ $(LDL) mtest: mtest.o liblmdb.a mtest2: mtest2.o liblmdb.a mtest3: mtest3.o liblmdb.a mtest4: mtest4.o liblmdb.a mtest5: mtest5.o liblmdb.a mtest6: mtest6.o liblmdb.a +mtest_remap: mtest_remap.o liblmdb.a +mtest_enc: mtest_enc.o chacha8.o liblmdb.a +mtest_enc2: mtest_enc2.o module.o liblmdb.a crypto.lm + $(CC) $(LDFLAGS) -pthread -o $@ mtest_enc2.o module.o liblmdb.a $(LDL) + +crypto.lm: crypto.c + $(CC) $(CFLAGS) $(CPPFLAGS) -fPIC -shared -o $@ $< -lcrypto mdb.o: mdb.c lmdb.h midl.h $(CC) $(CFLAGS) $(CPPFLAGS) -c mdb.c diff --git a/dependencies/lmdb/libraries/liblmdb/chacha8.c b/dependencies/lmdb/libraries/liblmdb/chacha8.c new file mode 100644 index 0000000000..530eebccf0 --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/chacha8.c @@ -0,0 +1,183 @@ +/* +chacha-merged.c version 20080118 +D. J. Bernstein +Public domain. 
+*/ + +#include +#include +//#include + +#include "chacha8.h" +#if 0 +#include "common/int-util.h" +#include "warnings.h" +#endif + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SWAP32LE(x) (x) +#else +#define SWAP32LE(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \ + (((uint32_t) (x) & 0x0000ff00) << 8) | \ + (((uint32_t) (x) & 0x00ff0000) >> 8) | \ + (((uint32_t) (x) & 0xff000000) >> 24)) +#endif + +/* + * The following macros are used to obtain exact-width results. + */ +#define U8V(v) ((uint8_t)(v) & UINT8_C(0xFF)) +#define U32V(v) ((uint32_t)(v) & UINT32_C(0xFFFFFFFF)) + +/* + * The following macros load words from an array of bytes with + * different types of endianness, and vice versa. + */ +#define U8TO32_LITTLE(p) SWAP32LE(((uint32_t*)(p))[0]) +#define U32TO8_LITTLE(p, v) (((uint32_t*)(p))[0] = SWAP32LE(v)) + +#define ROTATE(v,c) (rol32(v,c)) +#define XOR(v,w) ((v) ^ (w)) +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define QUARTERROUND(a,b,c,d) \ + a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \ + a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \ + c = PLUS(c,d); b = ROTATE(XOR(b,c), 7); + +static const char sigma[] = "expand 32-byte k"; + +static uint32_t rol32(uint32_t x, int r) { + return (x << (r & 31)) | (x >> (-r & 31)); +} + +void chacha8(const void* data, size_t length, const uint8_t* key, const uint8_t* iv, char* cipher) { + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; + char* ctarget = 0; + char tmp[64]; + int i; + + if (!length) return; + + j0 = U8TO32_LITTLE(sigma + 0); + j1 = U8TO32_LITTLE(sigma + 4); + j2 = U8TO32_LITTLE(sigma + 8); + j3 = U8TO32_LITTLE(sigma + 12); + j4 = U8TO32_LITTLE(key + 0); + j5 = U8TO32_LITTLE(key + 4); + j6 = U8TO32_LITTLE(key + 8); + j7 = U8TO32_LITTLE(key + 12); + j8 = U8TO32_LITTLE(key + 16); + j9 = U8TO32_LITTLE(key + 20); + j10 = U8TO32_LITTLE(key 
+ 24); + j11 = U8TO32_LITTLE(key + 28); + j12 = 0; + j13 = 0; + j14 = U8TO32_LITTLE(iv + 0); + j15 = U8TO32_LITTLE(iv + 4); + + for (;;) { + if (length < 64) { + memcpy(tmp, data, length); + data = tmp; + ctarget = cipher; + cipher = tmp; + } + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + for (i = 8;i > 0;i -= 2) { + QUARTERROUND( x0, x4, x8,x12) + QUARTERROUND( x1, x5, x9,x13) + QUARTERROUND( x2, x6,x10,x14) + QUARTERROUND( x3, x7,x11,x15) + QUARTERROUND( x0, x5,x10,x15) + QUARTERROUND( x1, x6,x11,x12) + QUARTERROUND( x2, x7, x8,x13) + QUARTERROUND( x3, x4, x9,x14) + } + x0 = PLUS( x0, j0); + x1 = PLUS( x1, j1); + x2 = PLUS( x2, j2); + x3 = PLUS( x3, j3); + x4 = PLUS( x4, j4); + x5 = PLUS( x5, j5); + x6 = PLUS( x6, j6); + x7 = PLUS( x7, j7); + x8 = PLUS( x8, j8); + x9 = PLUS( x9, j9); + x10 = PLUS(x10,j10); + x11 = PLUS(x11,j11); + x12 = PLUS(x12,j12); + x13 = PLUS(x13,j13); + x14 = PLUS(x14,j14); + x15 = PLUS(x15,j15); + + x0 = XOR( x0,U8TO32_LITTLE((uint8_t*)data + 0)); + x1 = XOR( x1,U8TO32_LITTLE((uint8_t*)data + 4)); + x2 = XOR( x2,U8TO32_LITTLE((uint8_t*)data + 8)); + x3 = XOR( x3,U8TO32_LITTLE((uint8_t*)data + 12)); + x4 = XOR( x4,U8TO32_LITTLE((uint8_t*)data + 16)); + x5 = XOR( x5,U8TO32_LITTLE((uint8_t*)data + 20)); + x6 = XOR( x6,U8TO32_LITTLE((uint8_t*)data + 24)); + x7 = XOR( x7,U8TO32_LITTLE((uint8_t*)data + 28)); + x8 = XOR( x8,U8TO32_LITTLE((uint8_t*)data + 32)); + x9 = XOR( x9,U8TO32_LITTLE((uint8_t*)data + 36)); + x10 = XOR(x10,U8TO32_LITTLE((uint8_t*)data + 40)); + x11 = XOR(x11,U8TO32_LITTLE((uint8_t*)data + 44)); + x12 = XOR(x12,U8TO32_LITTLE((uint8_t*)data + 48)); + x13 = XOR(x13,U8TO32_LITTLE((uint8_t*)data + 52)); + x14 = XOR(x14,U8TO32_LITTLE((uint8_t*)data + 56)); + x15 = XOR(x15,U8TO32_LITTLE((uint8_t*)data + 60)); + + j12 = PLUSONE(j12); + if (!j12) + { + j13 = PLUSONE(j13); + /* stopping at 2^70 
bytes per iv is user's responsibility */ + } + + U32TO8_LITTLE(cipher + 0,x0); + U32TO8_LITTLE(cipher + 4,x1); + U32TO8_LITTLE(cipher + 8,x2); + U32TO8_LITTLE(cipher + 12,x3); + U32TO8_LITTLE(cipher + 16,x4); + U32TO8_LITTLE(cipher + 20,x5); + U32TO8_LITTLE(cipher + 24,x6); + U32TO8_LITTLE(cipher + 28,x7); + U32TO8_LITTLE(cipher + 32,x8); + U32TO8_LITTLE(cipher + 36,x9); + U32TO8_LITTLE(cipher + 40,x10); + U32TO8_LITTLE(cipher + 44,x11); + U32TO8_LITTLE(cipher + 48,x12); + U32TO8_LITTLE(cipher + 52,x13); + U32TO8_LITTLE(cipher + 56,x14); + U32TO8_LITTLE(cipher + 60,x15); + + if (length <= 64) { + if (length < 64) { + memcpy(ctarget, cipher, length); + } + return; + } + length -= 64; + cipher += 64; + data = (uint8_t*)data + 64; + } +} diff --git a/dependencies/lmdb/libraries/liblmdb/chacha8.h b/dependencies/lmdb/libraries/liblmdb/chacha8.h new file mode 100644 index 0000000000..40990ed96b --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/chacha8.h @@ -0,0 +1,14 @@ +#include +#include +#ifdef __cplusplus +extern "C" { +#endif +void chacha8(const void* data, size_t length, const uint8_t* key, const uint8_t* iv, char* cipher); + +#define CHACHA8_KEY_SIZE 32 +#define CHACHA8_IV_SIZE 8 + +#ifdef __cplusplus +} +#endif + diff --git a/dependencies/lmdb/libraries/liblmdb/crypto.c b/dependencies/lmdb/libraries/liblmdb/crypto.c new file mode 100644 index 0000000000..dadfa9244c --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/crypto.c @@ -0,0 +1,121 @@ +/* crypto.c - LMDB encryption helper module */ +/* + * Copyright 2020-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. 
+ */ +#include + +#include + +#include "lmdb.h" + +MDB_crypto_hooks MDB_crypto; + +static EVP_CIPHER *cipher; + +static int mcf_str2key(const char *passwd, MDB_val *key) +{ + unsigned int size; + EVP_MD_CTX *mdctx = EVP_MD_CTX_new(); + EVP_DigestInit_ex(mdctx, EVP_sha256(), NULL); + EVP_DigestUpdate(mdctx, "Just a Constant", sizeof("Just a Constant")); + EVP_DigestUpdate(mdctx, passwd, strlen(passwd)); + EVP_DigestFinal_ex(mdctx, key->mv_data, &size); + EVP_MD_CTX_free(mdctx); + return 0; +} + +/* cheats - internal OpenSSL 1.1 structures */ +typedef struct evp_cipher_ctx_st { + const EVP_CIPHER *cipher; + ENGINE *engine; /* functional reference if 'cipher' is + * ENGINE-provided */ + int encrypt; /* encrypt or decrypt */ + int buf_len; /* number we have left */ + unsigned char oiv[EVP_MAX_IV_LENGTH]; /* original iv */ + unsigned char iv[EVP_MAX_IV_LENGTH]; /* working iv */ + unsigned char buf[EVP_MAX_BLOCK_LENGTH]; /* saved partial block */ + int num; /* used by cfb/ofb/ctr mode */ + /* FIXME: Should this even exist? 
It appears unused */ + void *app_data; /* application stuff */ + int key_len; /* May change for variable length cipher */ + unsigned long flags; /* Various flags */ + void *cipher_data; /* per EVP data */ + int final_used; + int block_mask; + unsigned char final[EVP_MAX_BLOCK_LENGTH]; /* possible final block */ +} EVP_CIPHER_CTX; + +#define CHACHA_KEY_SIZE 32 +#define CHACHA_CTR_SIZE 16 +#define CHACHA_BLK_SIZE 64 +#define POLY1305_BLOCK_SIZE 16 + +typedef struct { + union { + double align; /* this ensures even sizeof(EVP_CHACHA_KEY)%8==0 */ + unsigned int d[CHACHA_KEY_SIZE / 4]; + } key; + unsigned int counter[CHACHA_CTR_SIZE / 4]; + unsigned char buf[CHACHA_BLK_SIZE]; + unsigned int partial_len; +} EVP_CHACHA_KEY; + +typedef struct { + EVP_CHACHA_KEY key; + unsigned int nonce[12/4]; + unsigned char tag[POLY1305_BLOCK_SIZE]; + unsigned char tls_aad[POLY1305_BLOCK_SIZE]; + struct { uint64_t aad, text; } len; + int aad, mac_inited, tag_len, nonce_len; + size_t tls_payload_length; +} EVP_CHACHA_AEAD_CTX; + +static int mcf_encfunc(const MDB_val *src, MDB_val *dst, const MDB_val *key, int encdec) +{ + unsigned char iv[12]; + int ivl, outl, rc; + mdb_size_t *ptr; + EVP_CIPHER_CTX ctx = {0}; + EVP_CHACHA_AEAD_CTX cactx; + + ctx.cipher_data = &cactx; + ptr = key[1].mv_data; + ivl = ptr[0] & 0xffffffff; + memcpy(iv, &ivl, 4); + memcpy(iv+4, ptr+1, sizeof(mdb_size_t)); + EVP_CipherInit_ex(&ctx, cipher, NULL, key[0].mv_data, iv, encdec); + EVP_CIPHER_CTX_set_padding(&ctx, 0); + if (!encdec) { + EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_SET_TAG, key[2].mv_size, key[2].mv_data); + } + rc = EVP_CipherUpdate(&ctx, dst->mv_data, &outl, src->mv_data, src->mv_size); + if (rc) + rc = EVP_CipherFinal_ex(&ctx, key[2].mv_data, &outl); + if (rc && encdec) { + EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_GET_TAG, key[2].mv_size, key[2].mv_data); + } + return rc == 0; +} + +static const MDB_crypto_funcs mcf_table = { + mcf_str2key, + mcf_encfunc, + NULL, + CHACHA_KEY_SIZE, + POLY1305_BLOCK_SIZE, 
+ 0 +}; + +MDB_crypto_funcs *MDB_crypto() +{ + cipher = (EVP_CIPHER *)EVP_chacha20_poly1305(); + return (MDB_crypto_funcs *)&mcf_table; +} diff --git a/dependencies/lmdb/libraries/liblmdb/intro.doc b/dependencies/lmdb/libraries/liblmdb/intro.doc index 4853af736f..b5bb06716a 100644 --- a/dependencies/lmdb/libraries/liblmdb/intro.doc +++ b/dependencies/lmdb/libraries/liblmdb/intro.doc @@ -1,5 +1,5 @@ /* - * Copyright 2015-2020 Howard Chu, Symas Corp. + * Copyright 2015-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/lmdb.h b/dependencies/lmdb/libraries/liblmdb/lmdb.h index 3f11f518da..f3db0d8f50 100644 --- a/dependencies/lmdb/libraries/liblmdb/lmdb.h +++ b/dependencies/lmdb/libraries/liblmdb/lmdb.h @@ -136,7 +136,7 @@ * * @author Howard Chu, Symas Corporation. * - * @copyright Copyright 2011-2020 Howard Chu, Symas Corp. All rights reserved. + * @copyright Copyright 2011-2021 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -187,6 +187,11 @@ typedef mode_t mdb_mode_t; # define MDB_FMT_Z "z" /**< printf/scanf format modifier for size_t */ #endif +#if !defined(MDB_RPAGE_CACHE) || (defined(MDB_VL32) && !(MDB_RPAGE_CACHE)) +/** Support #MDB_REMAP_CHUNKS. Implied by MDB_VL32. Define as 0 to disable. */ +#define MDB_RPAGE_CACHE 1 +#endif + #ifndef MDB_VL32 /** Unsigned type used for mapsize, entry counts and page/transaction IDs. 
* @@ -219,7 +224,7 @@ typedef int mdb_filehandle_t; /** @defgroup mdb LMDB API * @{ - * @brief OpenLDAP Lightning Memory-Mapped Database Manager + * @brief Symas Lightning Memory-Mapped Database Manager */ /** @defgroup Version Version Macros * @{ @@ -229,7 +234,7 @@ typedef int mdb_filehandle_t; /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 70 +#define MDB_VERSION_PATCH 90 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -239,7 +244,7 @@ typedef int mdb_filehandle_t; MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "December 19, 2015" +#define MDB_VERSION_DATE "May 1, 2017" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" @@ -288,6 +293,7 @@ typedef struct MDB_val { void *mv_data; /**< address of the data item */ } MDB_val; + /** @brief A callback function used to compare two keys in a database */ typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); @@ -307,11 +313,42 @@ typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); */ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); +#if MDB_RPAGE_CACHE +/** @brief A callback function used to encrypt/decrypt pages in the env. + * + * Encrypt or decrypt the data in src and store the result in dst using the + * provided key. The result must be the same number of bytes as the input. + * @param[in] src The input data to be transformed. + * @param[out] dst Storage for the result. + * @param[in] key An array of three values: key[0] is the encryption key, + * key[1] is the initialization vector, and key[2] is the authentication + * data, if any. + * @param[in] encdec 1 to encrypt, 0 to decrypt. + * @return A non-zero error value on failure and 0 on success. 
+ */ +typedef int (MDB_enc_func)(const MDB_val *src, MDB_val *dst, const MDB_val *key, int encdec); + +/** @brief A callback function used to checksum pages in the env. + * + * Compute the checksum of the data in src and store the result in dst. + * An optional key may be used with keyed hash algorithms. + * @param[in] src The input data to be transformed. + * @param[out] dst Storage for the result. + * @param[in] key An encryption key, if encryption was configured. This + * parameter will be NULL if there is no key. + */ +typedef void (MDB_sum_func)(const MDB_val *src, MDB_val *dst, const MDB_val *key); +#endif + /** @defgroup mdb_env Environment Flags * @{ */ /** mmap at a fixed address (experimental) */ #define MDB_FIXEDMAP 0x01 + /** Use overlapping sync approach */ +#define MDB_OVERLAPPINGSYNC 0x1000 + /** encrypted DB - read-only flag, set by #mdb_env_set_encrypt() */ +#define MDB_ENCRYPT 0x2000U /** no environment directory */ #define MDB_NOSUBDIR 0x4000 /** don't fsync after commit */ @@ -334,8 +371,12 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOMEMINIT 0x1000000 /** use the previous snapshot rather than the latest one */ #define MDB_PREVSNAPSHOT 0x2000000 + /** don't use a single mmap, remap individual chunks (needs MDB_RPAGE_CACHE) */ +#define MDB_REMAP_CHUNKS 0x4000000 /** @} */ +#define MDB_OVERLAPPINGSYNC_SYNC 0x02 + /** @defgroup mdb_dbi_open Database Flags * @{ */ @@ -482,8 +523,14 @@ typedef enum MDB_cursor_op { #define MDB_BAD_DBI (-30780) /** Unexpected problem - txn should abort */ #define MDB_PROBLEM (-30779) + /** Page checksum incorrect */ +#define MDB_BAD_CHECKSUM (-30778) + /** Encryption/decryption failed */ +#define MDB_CRYPTO_FAIL (-30777) + /** Environment encryption mismatch */ +#define MDB_ENV_ENCRYPTION (-30776) /** The last defined error code */ -#define MDB_LAST_ERRCODE MDB_PROBLEM +#define MDB_LAST_ERRCODE MDB_ENV_ENCRYPTION /** @} */ /** @brief Statistics for a database in 
the environment */ @@ -610,7 +657,7 @@ int mdb_env_create(MDB_env **env); *
  • #MDB_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps - * the slot reseved for the #MDB_txn object. A thread may use parallel + * the slot reserved for the #MDB_txn object. A thread may use parallel * read-only transactions. A read-only transaction may span threads if * the user synchronizes its use. Applications that multiplex many * user threads over individual OS threads need this option. Such an @@ -884,6 +931,16 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); */ int mdb_env_set_mapsize(MDB_env *env, mdb_size_t size); + + /** @brief Set the size of DB pages in bytes. + * + * The size defaults to the OS page size. Smaller or larger values may be + * desired depending on the size of keys and values being used. Also, an + * explicit size may need to be set when using filesystems like ZFS which + * don't use the OS page size. + */ +int mdb_env_set_pagesize(MDB_env *env, int size); + /** @brief Set the maximum number of threads/reader slots for the environment. * * This defines the number of slots in the lock table that is used to track readers in the @@ -968,7 +1025,7 @@ void *mdb_env_get_userctx(MDB_env *env); typedef void MDB_assert_func(MDB_env *env, const char *msg); /** Set or reset the assert() callback of the environment. - * Disabled if liblmdb is buillt with NDEBUG. + * Disabled if liblmdb is built with NDEBUG. * @note This hack should become obsolete as lmdb's error handling matures. * @param[in] env An environment handle returned by #mdb_env_create(). * @param[in] func An #MDB_assert_func function, or 0. @@ -976,6 +1033,32 @@ typedef void MDB_assert_func(MDB_env *env, const char *msg); */ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); +#if MDB_RPAGE_CACHE + /** @brief Set encryption on an environment. + * + * This must be called before #mdb_env_open(). + * It implicitly sets #MDB_REMAP_CHUNKS on the env. 
+ * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_enc_func function. + * @param[in] key The encryption key. + * @param[in] size The size of authentication data in bytes, if any. + * Set this to zero for unauthenticated encryption mechanisms. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_encrypt(MDB_env *env, MDB_enc_func *func, const MDB_val *key, unsigned int size); + + + /** @brief Set checksums on an environment. + * + * This must be called before #mdb_env_open(). + * @param[in] env An environment handle returned by #mdb_env_create(). + * @param[in] func An #MDB_sum_func function. + * @param[in] size The size of computed checksum values, in bytes. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_set_checksum(MDB_env *env, MDB_sum_func *func, unsigned int size); +#endif + /** @brief Create a transaction for use with the environment. * * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). @@ -1123,6 +1206,8 @@ int mdb_txn_renew(MDB_txn *txn); * To use named databases (with name != NULL), #mdb_env_set_maxdbs() * must be called before opening the environment. Database names are * keys in the unnamed database, and may be read but not written. + * @note Names are C strings and stored with their NUL terminator included. + * In LMDB 0.9 the NUL terminator was omitted. * * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] name The name of the database to open. If only a single @@ -1468,6 +1553,13 @@ MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); */ MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); + /** @brief Check if the cursor is pointing to a named database record. + * + * @param[in] cursor A cursor handle returned by #mdb_cursor_open() + * @return 1 if current record is a named database, 0 otherwise. + */ +int mdb_cursor_is_db(MDB_cursor *cursor); + /** @brief Retrieve by cursor. 
* * This function retrieves key/data pairs from the database. The address and length @@ -1639,15 +1731,54 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); int mdb_reader_check(MDB_env *env, int *dead); /** @} */ +/** @defgroup crypto LMDB Encryption Helper API + * @{ + * @brief Helpers for setting up encryption + */ + + /** @brief A function for converting a string into an encryption key. + * + * @param[in] passwd The string to be converted. + * @param[in,out] key The resulting key. The caller must + * provide the space for the key. + * @return 0 on success, non-zero on failure. + */ +typedef int (MDB_str2key_func)(const char *passwd, MDB_val *key); + + /** @brief A structure for dynamically loaded crypto modules. + * + * This is the information that the command line tools expect + * in order to operate on encrypted or checksummed environments. + */ +typedef struct MDB_crypto_funcs { + MDB_str2key_func *mcf_str2key; + MDB_enc_func *mcf_encfunc; + MDB_sum_func *mcf_sumfunc; + int mcf_keysize; /**< The size of an encryption key, in bytes */ + int mcf_esumsize; /**< The size of the MAC, for authenticated encryption */ + int mcf_sumsize; /**< The size of the checksum, for plain checksums */ +} MDB_crypto_funcs; + + /** @brief The function that returns the #MDB_crypto_funcs structure. + * + * The command line tools expect this function to be named "MDB_crypto". + * It must be exported by the dynamic module so that the tools can use it. + * @return A pointer to a #MDB_crypto_funcs structure. + */ +typedef MDB_crypto_funcs *(MDB_crypto_hooks)(void); +/** @} */ + #ifdef __cplusplus } #endif /** @page tools LMDB Command Line Tools The following describes the command line tools that are available for LMDB. 
\li \ref mdb_copy_1 + \li \ref mdb_drop_1 \li \ref mdb_dump_1 \li \ref mdb_load_1 \li \ref mdb_stat_1 */ #endif /* _LMDB_H_ */ + diff --git a/dependencies/lmdb/libraries/liblmdb/mdb.c b/dependencies/lmdb/libraries/liblmdb/mdb.c index a83ea12234..d0dee6cb67 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -164,6 +164,8 @@ typedef SSIZE_T ssize_t; #if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110 # define MDB_USE_POSIX_MUTEX 1 # define MDB_USE_ROBUST 1 +#elif defined(__APPLE__) && !defined(MDB_USE_ROBUST) +# define MDB_USE_POSIX_SEM 1 #elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) # if !(defined(MDB_USE_POSIX_MUTEX) || defined(MDB_USE_POSIX_SEM)) # define MDB_USE_SYSV_SEM 1 @@ -172,6 +174,11 @@ typedef SSIZE_T ssize_t; #elif defined(__ANDROID__) # define MDB_FDATASYNC fsync #endif +#if defined(__APPLE__) +# define MDB_FDATASYNC(fd) (fcntl((fd), F_FULLFSYNC) && \ + fcntl((fd), 85 /* F_BARRIERFSYNC */) && /* fsync + barrier */ \ + fsync(fd)) +#endif #ifndef _WIN32 #include @@ -486,7 +493,7 @@ typedef pthread_mutex_t *mdb_mutexref_t; #define MNAME_LEN (sizeof(pthread_mutex_t)) #endif -/** Initial part of #MDB_env.me_mutexname[]. +/** Initial part of #MDB_env.%me_mutexname[]. * Changes to this code must be reflected in #MDB_LOCK_FORMAT. */ #ifdef _WIN32 @@ -590,7 +597,7 @@ static txnid_t mdb_debug_start; * The string is printed literally, with no format processing. */ #define DPUTS(arg) DPRINTF(("%s", arg)) - /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ + /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ (((mc)->mc_flags & C_SUB) ? 
-(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) /** @} */ @@ -634,7 +641,7 @@ static txnid_t mdb_debug_start; #define MDB_MAGIC 0xBEEFC0DE /** The version number for a database's datafile format. */ -#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 2) /** The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 2) /** Number of bits representing #MDB_LOCK_VERSION in #MDB_LOCK_FORMAT. @@ -885,16 +892,20 @@ typedef struct MDB_txninfo { char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; #if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) - union { + union { struct { #ifdef MDB_USE_SYSV_SEM int mt2_wlocked; + int mt2_sync_locked; #define mti_wlocked mt2.mt2_wlocked +#define mti_sync_locked mt2.mt2_sync_locked #else mdb_mutex_t mt2_wmutex; + mdb_mutex_t mt2_sync_mutex; #define mti_wmutex mt2.mt2_wmutex +#define mti_sync_mutex mt2.mt2_sync_mutex #endif char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; - } mt2; + }; } mt2; #endif MDB_reader mti_readers[1]; } MDB_txninfo; @@ -958,7 +969,8 @@ enum { * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. * * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. + * first has a page header. They hold the real data of #F_BIGDATA nodes, + * and the node stores the pgno and number of pages used by the record. * * #P_SUBP sub-pages are small leaf "pages" with duplicate data. * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. @@ -969,14 +981,37 @@ enum { * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a freeDB record. 
*/ -typedef struct MDB_page { +typedef struct MDB_page_header { #define mp_pgno mp_p.p_pgno #define mp_next mp_p.p_next union { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ - } mp_p; - uint16_t mp_pad; /**< key size if this is a LEAF2 page */ + } mh_p; + + /** If page is clean: snapshot txnid, dirty: txn workid, metapage: unused. + * + * The value indicates which snapshot/#MDB_txn the page belongs to. + * Tested with #IS_MUTABLE(), #IS_WRITABLE(), #IS_DIRTY_NW(). + * + * (clean page).mp_txnid == txnid of creator < txnid of later txns. + * (dirty page).mp_txnid >= mt_workid of creator txn. + * (dirty page).mt_txnid < mt_workid of children of creator txn. + * + * Thus an #MDB_txn can write to pages with mp_txnid >= txn.mt_workid. + * A page with smaller mp_txnid is dirty in an ancestor txn or clean. + * + * Non-#MDB_WRITEMAP sets txn.mt_workid > txn.mt_txnid, to tell apart + * spilled and dirty pages. WRITEMAP sets mt_workid = mt_txnid, since + * it does not copy/spill pages. Thus (page.mp_txnid == txn.mt_txnid) + * says "spilled page" without WRITEMAP, "dirty page" with WRITEMAP. + * + * Finally, ((dirty page).mp_txnid & #MDB_PGTXNID_FLAGMASK) can be used + * for flags with non-WRITEMAP; it keeps low bits in workid = 0. + */ + txnid_t mh_txnid; + + uint16_t mh_pad; /**< key size if this is a LEAF2 page */ /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. 
@@ -986,13 +1021,15 @@ typedef struct MDB_page { #define P_LEAF 0x02 /**< leaf page */ #define P_OVERFLOW 0x04 /**< overflow page */ #define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_DIRTY_OVF 0x2000 /**< page has dirty overflow nodes */ #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ #define P_KEEP 0x8000 /**< leave this page alone during spill */ +/** Persistent flags for page administration rather than page contents */ +#define P_ADM_FLAGS 0 /* later... */ /** @} */ - uint16_t mp_flags; /**< @ref mdb_page */ + uint16_t mh_flags; /**< @ref mdb_page */ #define mp_lower mp_pb.pb.pb_lower #define mp_upper mp_pb.pb.pb_upper #define mp_pages mp_pb.pb_pages @@ -1002,18 +1039,27 @@ typedef struct MDB_page { indx_t pb_upper; /**< upper bound of free space */ } pb; uint32_t pb_pages; /**< number of overflow pages */ - } mp_pb; + } mh_pb; +} MDB_page_header; + +typedef struct MDB_page { + MDB_page_header mp_hdr; +#define mp_p mp_hdr.mh_p +#define mp_txnid mp_hdr.mh_txnid +#define mp_pad mp_hdr.mh_pad +#define mp_flags mp_hdr.mh_flags +#define mp_pb mp_hdr.mh_pb indx_t mp_ptrs[1]; /**< dynamic size */ } MDB_page; /** Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) +#define PAGEHDRSZ ((unsigned)sizeof(MDB_page_header)) /** Address of first usable data byte in a page, after the header */ #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDB_DEVEL) ? 
PAGEHDRSZ : 0) +#define PAGEBASE PAGEHDRSZ /** Number of nodes on a page */ #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) @@ -1040,6 +1086,28 @@ typedef struct MDB_page { /** Test if a page is a sub page */ #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + /** Test if (this non-sub page is dirty && env is non-#MDB_WRITEMAP) */ +#define IS_DIRTY_NW(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) + /** Test if this non-sub page belongs to the current snapshot */ +#define IS_MUTABLE(txn, p) ((p)->mp_txnid >= (txn)->mt_txnid) + /** Test if this non-sub page is writable in this txn (not an ancestor) */ +#define IS_WRITABLE(txn, p) ((p)->mp_txnid >= (txn)->mt_workid) + + /** Info about overflow page, stored in an F_BIGDATA node */ +typedef struct MDB_ovpage { + pgno_t op_pgno; + txnid_t op_txnid; + mdb_size_t op_pages; +} MDB_ovpage; + +#if OVERFLOW_NOTYET + /** Header for a dirty overflow page in memory */ +typedef struct MDB_dovpage { + MDB_page_header mp_hdr; + void *mp_ptr; +} MDB_dovpage; +#endif + /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) @@ -1048,6 +1116,27 @@ typedef struct MDB_page { */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + /** Mark the page as writable by this txn */ +#ifndef MDB_TEST +#define SET_PGTXNID(txn, mp) ((mp)->mp_txnid = (txn)->mt_workid) +#else +#define SET_PGTXNID(txn, mp) \ + ((mp)->mp_txnid = (txn)->mt_workid \ + /* random unused "flags" added when not WRITEMAP for debugging */ \ + | (((txn)->mt_flags & MDB_TXN_WRITEMAP) ? 0 : \ + (MDB_RAND((size_t)(txn)) >> (32-MDB_PGTXNID_FLAGBITS)))) +#define MDB_RAND(x) (mdb_rnd = (mdb_rnd + (unsigned)(x)) * 987654321 + 54321) +static volatile unsigned mdb_rnd; +#endif + + /** mp_txnid bits reserved in dirty pages for flags. + * TODO: For future code with header-free ovpages, if we omit mp_flags + * from the "header" kept elsewhere. Otherwise, drop this code. 
+ */ +#define MDB_PGTXNID_FLAGBITS 4 +#define MDB_PGTXNID_STEP ((txnid_t)1 << MDB_PGTXNID_FLAGBITS) +#define MDB_PGTXNID_FLAGMASK (MDB_PGTXNID_STEP-1) + /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. @@ -1058,7 +1147,7 @@ typedef struct MDB_page { * order in case some accesses can be optimized to 32-bit word access. * * Leaf node flags describe node contents. #F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. + * data part is an MDB_ovpage struct pointing to a page with actual data. * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in * a sub-page/sub-database, and named databases (just #F_SUBDATA). */ @@ -1261,7 +1350,7 @@ struct MDB_txn { /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ MDB_txn *mt_child; pgno_t mt_next_pgno; /**< next unallocated page */ -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE pgno_t mt_last_pgno; /**< last written page */ #endif /** The ID of this transaction. IDs are integers incrementing from 1. @@ -1269,6 +1358,23 @@ struct MDB_txn { * aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; + + /** Written to mp_txnid of dirty pages, to be fixed by #mdb_page_flush(). + * + * Value >= WRITEMAP ? txnid : 1 + (parent ? parent.last_workid : txnid). + * See #MDB_page.%mp_txnid. + * + * An MDB_txn can write to a page when page.mp_txnid >= txn.mt_workid. + * New children get bigger workid than pages dirty in their parent + * (i.e. bigger than parent.mt_last_workid). When children commit, + * they copy #mt_last_workid to the parent to match their pages. + */ + txnid_t mt_workid; + /** Current max mp_txnid of the MDB_txn's dirty pages: Starts as + * #mt_workid, then grows as it is copied from children who commit. 
+ */ + txnid_t mt_last_workid; + MDB_env *mt_env; /**< the DB environment */ /** The list of pages that became unused during this transaction. */ @@ -1282,14 +1388,19 @@ struct MDB_txn { /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. + * Unused with #MDB_WRITEMAP, which does not use a dirty list. */ MDB_IDL mt_spill_pgs; union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + /** For write txns: Modified pages, sorted. Unused when MDB_WRITEMAP. */ MDB_ID2L dirty_list; /** For read txns: This thread/txn's reader table slot, or NULL. */ MDB_reader *reader; } mt_u; +#if OVERFLOW_NOTYET + /** The sorted list of dirty overflow pages. */ + MDB_ID2L mt_dirty_ovs; +#endif /** Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; /** Array of MDB_db records for each known DB */ @@ -1311,7 +1422,7 @@ struct MDB_txn { MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE /** List of read-only pages (actually chunks) */ MDB_ID3L mt_rpages; /** We map chunks of 16 pages. 
Even though Windows uses 4KB pages, all @@ -1335,7 +1446,7 @@ struct MDB_txn { * @{ */ /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY|MDB_OVERLAPPINGSYNC) #define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ #define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ @@ -1346,6 +1457,7 @@ struct MDB_txn { #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ #define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ +#define MDB_TXN_DIRTYNUM 0x20 /**< dirty list uses nump list */ /** most operations on the txn are currently illegal */ #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) /** @} */ @@ -1354,6 +1466,7 @@ struct MDB_txn { * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirty_list into mt_parent after freeing hidden mt_parent pages. + * When #MDB_WRITEMAP, it is nonzero but otherwise irrelevant. */ unsigned int mt_dirty_room; }; @@ -1413,10 +1526,12 @@ struct MDB_cursor { unsigned int mc_flags; /**< @ref mdb_cursor */ MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE MDB_page *mc_ovpg; /**< a referenced overflow page */ # define MC_OVPG(mc) ((mc)->mc_ovpg) -# define MC_SET_OVPG(mc, pg) ((mc)->mc_ovpg = (pg)) +# define MC_SET_OVPG(mc, pg) \ + ((mc)->mc_ovpg = \ + (MDB_REMAPPING((mc)->mc_txn->mt_env->me_flags) ? 
(pg) : NULL)) #else # define MC_OVPG(mc) ((MDB_page *)0) # define MC_SET_OVPG(mc, pg) ((void)0) @@ -1461,20 +1576,29 @@ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ } MDB_pgstate; - +/**/ +struct MDB_last_map { + struct MDB_last_map *last_map; + char *map; + mdb_size_t mapsize; +}; +typedef struct MDB_last_map MDB_last_map; +/**/ /** The database environment. */ struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ HANDLE me_mfd; /**< For writing and syncing the meta pages */ #ifdef _WIN32 -#ifdef MDB_VL32 +#ifdef MDB_RPAGE_CACHE HANDLE me_fmh; /**< File Mapping handle */ -#endif /* MDB_VL32 */ +#endif HANDLE me_ovfd; /**< Overlapped/async with write-through file handle */ #endif /* _WIN32 */ /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U + /** using a raw block device */ +#define MDB_RAWPART 0x40000000U /** Some fields are initialized. */ #define MDB_ENV_ACTIVE 0x20000000U /** me_txkey is set */ @@ -1492,6 +1616,9 @@ struct MDB_env { MDB_PID_T me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ +/**/ + MDB_last_map *me_last_map; /**< the previous memory map of the data file after a resize */ +/**/ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ @@ -1511,8 +1638,11 @@ struct MDB_env { MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. + * Unused except for a dummy element when #MDB_WRITEMAP. 
+ */ MDB_ID2L me_dirty_list; + int *me_dirty_nump; /** Max number of freelist items that can fit in a single overflow page */ int me_maxfree_1pg; /** Max size of a node on a page */ @@ -1523,26 +1653,35 @@ struct MDB_env { int me_live_reader; /**< have liveness lock in reader table */ #ifdef _WIN32 int me_pidquery; /**< Used in OpenProcess */ - OVERLAPPED *ov; /**< Used for for overlapping I/O requests */ - int ovs; /**< Count of OVERLAPPEDs */ + OVERLAPPED *me_ov; /**< Used for overlapping I/O requests */ + int me_ovs; /**< Count of MDB_overlaps */ #endif #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ # define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ +# define me_sync_mutex me_txns->mti_sync_mutex /**< Shared sync lock */ #else mdb_mutex_t me_rmutex; mdb_mutex_t me_wmutex; + mdb_mutex_t me_sync_mutex; # if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) /** Half-initialized name of mutexes, to be completed by #MUTEXNAME() */ char me_mutexname[sizeof(MUTEXNAME_PREFIX) + 11]; # endif #endif -#ifdef MDB_VL32 + mdb_size_t me_synced_txn_id; +#if MDB_RPAGE_CACHE MDB_ID3L me_rpages; /**< like #mt_rpages, but global to env */ pthread_mutex_t me_rpmutex; /**< control access to #me_rpages */ + MDB_sum_func *me_sumfunc; /**< checksum env data */ + unsigned short me_sumsize; /**< size of per-page checksums */ #define MDB_ERPAGE_SIZE 16384 #define MDB_ERPAGE_MAX (MDB_ERPAGE_SIZE-1) + unsigned short me_esumsize; /**< size of per-page authentication data */ unsigned int me_rpcheck; + + MDB_enc_func *me_encfunc; /**< encrypt env data */ + MDB_val me_enckey; /**< key for env encryption */ #endif void *me_userctx; /**< User-settable context */ MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ @@ -1589,7 +1728,15 @@ enum { #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ static void mdb_txn_end(MDB_txn *txn, unsigned mode); -static 
int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +#if MDB_RPAGE_CACHE +#define MDB_PAGE_GET(mc, pg, numpgs, mp) mdb_page_get(mc, pg, numpgs, mp) +static void mdb_rpage_dispose(MDB_env *env, MDB_ID3 *id3); +#else +/* Drop unused numpgs argument when !MDB_RPAGE_CACHE */ +#define MDB_PAGE_GET(mc, pg, numpgs, mp) mdb_page_get(mc, pg, mp) +#endif +static int MDB_PAGE_GET(MDB_cursor *mc, pgno_t pgno, int numpgs, MDB_page **mp); + static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 @@ -1608,9 +1755,9 @@ static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta); static MDB_meta *mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); #if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */ -# define mdb_env_close0(env, excl) mdb_env_close1(env) +# define mdb_env_close_active(env, excl) mdb_env_close1(env) #endif -static void mdb_env_close0(MDB_env *env, int excl); +static void mdb_env_close_active(MDB_env *env, int excl); static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); static int mdb_node_add(MDB_cursor *mc, indx_t indx, @@ -1705,6 +1852,9 @@ static char *const mdb_errstr[] = { "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", "MDB_PROBLEM: Unexpected problem - txn should abort", + "MDB_BAD_CHECKSUM: Page checksum mismatch", + "MDB_CRYPTO_FAIL: Page encryption or decryption failed", + "MDB_ENV_ENCRYPTION: Environment encryption mismatch", }; char * @@ -1836,7 +1986,7 @@ void mdb_page_list(MDB_page *mp) { pgno_t pgno = mdb_dbg_pgno(mp); - const char *type, *state = (mp->mp_flags & P_DIRTY) ? 
", dirty" : ""; + const char *type; MDB_node *node; unsigned int i, nkeys, nsize, total = 0; MDB_val key; @@ -1849,8 +1999,7 @@ mdb_page_list(MDB_page *mp) case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; case P_OVERFLOW: - fprintf(stderr, "Overflow page %"Yu" pages %u%s\n", - pgno, mp->mp_pages, state); + fprintf(stderr, "Overflow page %"Yu" pages %u\n", pgno, mp->mp_pages); return; case P_META: fprintf(stderr, "Meta-page %"Yu" txnid %"Yu"\n", @@ -1862,7 +2011,7 @@ mdb_page_list(MDB_page *mp) } nkeys = NUMKEYS(mp); - fprintf(stderr, "%s %"Yu" numkeys %d%s\n", type, pgno, nkeys, state); + fprintf(stderr, "%s %"Yu" numkeys %d\n", type, pgno, nkeys); for (i=0; imn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); + nsize += sizeof(MDB_ovpage); else nsize += NODEDSZ(node); total += nsize; @@ -2000,7 +2149,7 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) * Set #MDB_TXN_ERROR on failure. */ static MDB_page * -mdb_page_malloc(MDB_txn *txn, unsigned num) +mdb_page_malloc(MDB_txn *txn, unsigned num, int init) { MDB_env *env = txn->mt_env; MDB_page *ret = env->me_dpages; @@ -2011,26 +2160,28 @@ mdb_page_malloc(MDB_txn *txn, unsigned num) * many pages they will be filling in at least up to the last page. */ if (num == 1) { + psize -= off = PAGEHDRSZ; if (ret) { VGMEMP_ALLOC(env, ret, sz); VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); env->me_dpages = ret->mp_next; - return ret; + goto init; } - psize -= off = PAGEHDRSZ; } else { sz *= num; off = sz - psize; } if ((ret = malloc(sz)) != NULL) { VGMEMP_ALLOC(env, ret, sz); - if (!(env->me_flags & MDB_NOMEMINIT)) { +init: + if (init && !(env->me_flags & MDB_NOMEMINIT)) { memset((char *)ret + off, 0, psize); ret->mp_pad = 0; } } else { txn->mt_flags |= MDB_TXN_ERROR; } + if (ret) ret->mp_flags = 0; /* ret is NULL here when malloc() failed */ return ret; } /** Free a single page. 
@@ -2058,6 +2209,24 @@ mdb_dpage_free(MDB_env *env, MDB_page *dp) } } +#if MDB_RPAGE_CACHE +/** Free an encrypted dirty page + * We can't check if it's an overflow page, + * caller must tell us how many are being freed. + */ +static void +mdb_dpage_free_n(MDB_env *env, MDB_page *dp, int num) +{ + if (num == 1) + mdb_page_free(env, dp); + else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} +#endif + /** Return all dirty pages to dpage list */ static void mdb_dlist_free(MDB_txn *txn) @@ -2066,20 +2235,34 @@ mdb_dlist_free(MDB_txn *txn) MDB_ID2L dl = txn->mt_u.dirty_list; unsigned i, n = dl[0].mid; - for (i = 1; i <= n; i++) { - mdb_dpage_free(env, dl[i].mptr); + if (txn->mt_flags & MDB_TXN_DIRTYNUM) { + int *dl_nump = env->me_dirty_nump; + for (i = 1; i <= n; i++) { + mdb_dpage_free_n(env, dl[i].mptr, dl_nump[i]); + } + txn->mt_flags ^= MDB_TXN_DIRTYNUM; + } else { + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } } dl[0].mid = 0; } -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE +#if defined(MDB_VL32) || ((MDB_RPAGE_CACHE) & 2) /* Always remap */ +#define MDB_REMAPPING(flags) 1 +#else +#define MDB_REMAPPING(flags) ((flags) & MDB_REMAP_CHUNKS) +#endif + static void mdb_page_unref(MDB_txn *txn, MDB_page *mp) { pgno_t pgno; MDB_ID3L tl = txn->mt_rpages; unsigned x, rem; - if (mp->mp_flags & (P_SUBP|P_DIRTY)) + if (IS_SUBP(mp) || IS_DIRTY_NW(txn, mp)) return; rem = mp->mp_pgno & (MDB_RPAGE_CHUNK-1); pgno = mp->mp_pgno ^ rem; @@ -2089,7 +2272,8 @@ mdb_page_unref(MDB_txn *txn, MDB_page *mp) if (tl[x].mref) tl[x].mref--; } -#define MDB_PAGE_UNREF(txn, mp) mdb_page_unref(txn, mp) +#define MDB_PAGE_UNREF(txn, mp) \ + (MDB_REMAPPING(txn->mt_env->me_flags) ? 
mdb_page_unref(txn, mp) : (void)0) static void mdb_cursor_unref(MDB_cursor *mc) @@ -2110,14 +2294,33 @@ mdb_cursor_unref(MDB_cursor *mc) mc->mc_flags &= ~C_INITIALIZED; } #define MDB_CURSOR_UNREF(mc, force) \ - (((force) || ((mc)->mc_flags & C_INITIALIZED)) \ + ((MDB_REMAPPING((mc)->mc_txn->mt_env->me_flags) && \ + ((force) || ((mc)->mc_flags & C_INITIALIZED))) \ ? mdb_cursor_unref(mc) \ : (void)0) +/* Unref ovpage \b omp in \b mc and tracked cursors */ +static void +mdb_ovpage_unref_all(MDB_cursor *mc, MDB_page *omp) +{ + MDB_txn *txn = mc->mc_txn; + MDB_cursor *next = txn->mt_cursors[mc->mc_dbi]; + for (;; mc = next, next = mc->mc_next) { + if (MC_OVPG(mc) == omp) { + mdb_page_unref(mc->mc_txn, omp); + MC_SET_OVPG(mc, NULL); + } + if (next == NULL) + break; + } +} + #else +#define MDB_REMAPPING(flags) 0 #define MDB_PAGE_UNREF(txn, mp) #define MDB_CURSOR_UNREF(mc, force) ((void)0) -#endif /* MDB_VL32 */ +#define mdb_ovpage_unref_all(mc, omp, pgno) ((void)0) +#endif /* MDB_RPAGE_CACHE */ /** Loosen or free a single page. * Saves single pages to a list for future reuse @@ -2132,66 +2335,41 @@ mdb_cursor_unref(MDB_cursor *mc) static int mdb_page_loose(MDB_cursor *mc, MDB_page *mp) { - int loose = 0; - pgno_t pgno = mp->mp_pgno; + int rc = MDB_SUCCESS; MDB_txn *txn = mc->mc_txn; - if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { - if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_u.dirty_list; - /* If txn has a parent, make sure the page is in our - * dirty list. - */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { /* bad cursor? 
*/ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - /* ok, it's ours */ - loose = 1; - } - } - } else { - /* no parent txn, so it's just ours */ - loose = 1; - } - } - if (loose) { + if (IS_WRITABLE(txn, mp) && mc->mc_dbi != FREE_DBI) { + /* Page is dirty in this txn, and is not in freeDB */ DPRINTF(("loosen db %d page %"Yu, DDBI(mc), mp->mp_pgno)); NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; txn->mt_loose_pgs = mp; txn->mt_loose_count++; mp->mp_flags |= P_LOOSE; } else { - int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); - if (rc) - return rc; + rc = mdb_midl_append(&txn->mt_free_pgs, mp->mp_pgno); } - return MDB_SUCCESS; + return rc; } /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * @param[in] mc A cursor handle for the current operation. * @param[in] pflags Flags of the pages to update: - * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * 0 to set P_KEEP, P_KEEP to clear it. * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). * @return 0 on success, non-zero on failure. */ static int mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { - enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; + enum { Mask = P_SUBP|P_LOOSE|P_KEEP }; MDB_txn *txn = mc->mc_txn; MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDB_page *dp, *mp; MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; + unsigned i, j, x; + int rc = MDB_SUCCESS; /* Mark pages seen by cursors: First m0, then tracked cursors */ for (i = txn->mt_numdbs;; ) { @@ -2201,8 +2379,12 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) for (j=0; jmc_snum; j++) { mp = m3->mc_pg[j]; if ((mp->mp_flags & Mask) == pflags) + if (IS_WRITABLE(txn, mp)) mp->mp_flags ^= P_KEEP; } + if (MC_OVPG(m3) && ((MC_OVPG(m3)->mp_flags & Mask) == pflags) && + IS_WRITABLE(txn, MC_OVPG(m3))) + MC_OVPG(m3)->mp_flags ^= P_KEEP; mx = m3->mc_xcursor; /* Proceed to mx if it is at a sub-database */ if (! 
(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) @@ -2223,14 +2405,17 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) mark_done: if (all) { /* Mark dirty root pages */ + MDB_ID2L dl = txn->mt_u.dirty_list; for (i=0; imt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) + x = mdb_mid2l_search(dl, pgno); + if (! (x <= dl[0].mid && dl[x].mid == pgno)) + continue; + dp = dl[x].mptr; + if ((dp->mp_flags & Mask) == pflags) dp->mp_flags ^= P_KEEP; } } @@ -2253,7 +2438,7 @@ static int mdb_page_flush(MDB_txn *txn, int keep); * * Otherwise, if not using nested txns, it is expected that apps will * not run into #MDB_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * the same way as for a txn commit. * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. 
* If the txn writes them again, they can be dirtied immediately without @@ -2282,7 +2467,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) unsigned int i, j, need; int rc; - if (m0->mc_flags & C_SUB) + if (m0->mc_flags & (C_SUB|C_WRITEMAP)) return MDB_SUCCESS; /* Estimate how much space this op will take */ @@ -2316,7 +2501,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) } /* Preserve pages which may soon be dirtied again */ - if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + if ((rc = mdb_pages_xkeep(m0, 0, 1)) != MDB_SUCCESS) goto done; /* Less aggressive spill - we originally spilled the entire dirty list, @@ -2364,7 +2549,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) goto done; /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + rc = mdb_pages_xkeep(m0, P_KEEP, i); done: txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; @@ -2376,7 +2561,11 @@ static txnid_t mdb_find_oldest(MDB_txn *txn) { int i; - txnid_t mr, oldest = txn->mt_txnid - 1; + /* */ + txnid_t mr, oldest = (txn->mt_env->me_flags & MDB_OVERLAPPINGSYNC) ? 
+ txn->mt_env->me_synced_txn_id : + (txn->mt_txnid - 1); + /* */ if (txn->mt_env->me_txns) { MDB_reader *r = txn->mt_env->me_txns->mti_readers; for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { @@ -2390,25 +2579,20 @@ mdb_find_oldest(MDB_txn *txn) return oldest; } -/** Add a page to the txn's dirty list */ +/** Add a page to the txn's dirty list, if there is one */ static void mdb_page_dirty(MDB_txn *txn, MDB_page *mp) { MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); -#ifdef _WIN32 /* With Windows we always write dirty pages with WriteFile, - * so we always want them ordered */ - insert = mdb_mid2l_insert; -#else /* but otherwise with writemaps, we just use msync, we - * don't need the ordering and just append */ - if (txn->mt_flags & MDB_TXN_WRITEMAP) - insert = mdb_mid2l_append; - else - insert = mdb_mid2l_insert; -#endif + int rc; + + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + txn->mt_flags |= MDB_TXN_DIRTY; + return; + } mid.mid = mp->mp_pgno; mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); + rc = mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); mdb_tassert(txn, rc == 0); txn->mt_dirty_room--; } @@ -2455,13 +2639,35 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) MDB_cursor m2; int found_old = 0; +#if OVERFLOW_NOTYET + MDB_dovpage *dph = NULL; + + if (ov) { + if (!txn->mt_dirty_ovs) { + txn->mt_dirty_ovs = mdb_mid2l_alloc(16); + if (!txn->mt_dirty_ovs) + return ENOMEM; + } else if (mdb_mid2l_need(&txn->mt_dirty_ovs, txn->mt_dirty_ovs[0].mid + 1)) + return ENOMEM; + dph = malloc(sizeof(MDB_dovpage)); + } +#endif + /* If there are any loose pages, just use them */ if (num == 1 && txn->mt_loose_pgs) { np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; DPRINTF(("db %d use loose page %"Yu, DDBI(mc), np->mp_pgno)); +#if OVERFLOW_NOTYET + if (ov) { + dph->mp_hdr = np->mp_hdr; + dph->mp_ptr = np; + np = (MDB_page *)dph; + } +#endif *mp = np; + np->mp_flags &= P_ADM_FLAGS; return MDB_SUCCESS; } @@ 
-2576,17 +2782,17 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) i = 0; pgno = txn->mt_next_pgno; if (pgno + num >= env->me_maxpg) { - DPUTS("DB size maxed out"); - rc = MDB_MAP_FULL; - goto fail; + /* */ + size_t new_size = ((size_t) (2 * (pgno + num) * env->me_psize / 0x40000 + 1)) * 0x40000; +// fprintf(stderr, "resizing from %u to %u", env->me_mapsize, new_size); + rc = mdb_env_set_mapsize(env, new_size); + /* */ } #if defined(_WIN32) && !defined(MDB_VL32) - if (!(env->me_flags & MDB_RDONLY)) { + if (env->me_flags & MDB_WRITEMAP) { void *p; p = (MDB_page *)(env->me_map + env->me_psize * pgno); - p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT, - (env->me_flags & MDB_WRITEMAP) ? PAGE_READWRITE: - PAGE_READONLY); + p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT, PAGE_READWRITE); if (!p) { DPUTS("VirtualAlloc failed"); rc = ErrCode(); @@ -2599,7 +2805,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (env->me_flags & MDB_WRITEMAP) { np = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { - if (!(np = mdb_page_malloc(txn, num))) { + if (!(np = mdb_page_malloc(txn, num, 1))) { rc = ENOMEM; goto fail; } @@ -2612,13 +2818,30 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } else { txn->mt_next_pgno = pgno + num; } +#if OVERFLOW_NOTYET + if (ov) { + dph->mp_hdr = np->mp_hdr; + dph->mp_ptr = np; + np = (MDB_page *)dph; + } +#endif np->mp_pgno = pgno; + SET_PGTXNID(txn, np); + np->mp_flags = 0; +#if OVERFLOW_NOTYET + mdb_page_dirty(txn, np, ov); +#else mdb_page_dirty(txn, np); +#endif *mp = np; return MDB_SUCCESS; fail: +#if OVERFLOW_NOTYET + if (dph) + free(dph); +#endif txn->mt_flags |= MDB_TXN_ERROR; return rc; } @@ -2647,39 +2870,40 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) } } -/** Pull a page off the txn's spill list, if present. - * If a page being referenced was spilled to disk in this txn, bring - * it back and make it dirty/writable again. 
+/** Bring back a page which this txn spilled to disk; make it writable again. * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. It must not be dirty. - * @param[out] ret the writable page, if any. ret is unchanged if - * mp wasn't spilled. + * @param[in] mp the spilled page. + * @param[out] ret the writable page. */ static int mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { MDB_env *env = txn->mt_env; - const MDB_txn *tx2; unsigned x; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + + /* x = position in current spill list, or 0 */ + x = 0; + if (txn->mt_spill_pgs) { + x = mdb_midl_search(txn->mt_spill_pgs, pn); + if (! (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn)) + x = 0; + } + if (x == 0 && !txn->mt_parent) + return MDB_PROBLEM; /* should be a spilled page */ + + { MDB_page *np; int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; if (IS_OVERFLOW(mp)) num = mp->mp_pages; else num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); + { + np = mdb_page_malloc(txn, num, 1); if (!np) return ENOMEM; if (num > 1) @@ -2687,7 +2911,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) else mdb_page_copy(np, mp, env->me_psize); } - if (tx2 == txn) { + if (x) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. 
@@ -2701,12 +2925,10 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) */ mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; + SET_PGTXNID(txn, np); *ret = np; - break; - } + return MDB_SUCCESS; } - return MDB_SUCCESS; } /** Touch a page: make it dirty and re-insert into tree with updated pgno. @@ -2720,18 +2942,15 @@ mdb_page_touch(MDB_cursor *mc) MDB_page *mp = mc->mc_pg[mc->mc_top], *np; MDB_txn *txn = mc->mc_txn; MDB_cursor *m2, *m3; + unsigned np_flags; pgno_t pgno; int rc; - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (rc) - goto fail; - if (np) - goto done; - } + if (IS_SUBP(mp) || IS_WRITABLE(txn, mp)) + return MDB_SUCCESS; + + if (!IS_MUTABLE(txn, mp)) { + /* Page from an older snapshot */ if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || (rc = mdb_page_alloc(mc, 1, &np))) goto fail; @@ -2748,39 +2967,36 @@ mdb_page_touch(MDB_cursor *mc) } else { mc->mc_db->md_root = pgno; } - } else if (txn->mt_parent && !IS_SUBP(mp)) { + } else if (!IS_DIRTY_NW(txn, mp)) { + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + goto done; + } else { + /* Writable in an ancestor txn */ MDB_ID2 mid, *dl = txn->mt_u.dirty_list; pgno = mp->mp_pgno; - /* If txn has a parent, make sure the page is in our - * dirty list. - */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { /* bad cursor? 
*/ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - return 0; - } + if (!txn->mt_parent) { + rc = MDB_PROBLEM; + goto fail; } mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - /* No - copy it */ - np = mdb_page_malloc(txn, 1); - if (!np) - return ENOMEM; + np = mdb_page_malloc(txn, 1, 1); + if (!np) { + rc = ENOMEM; + goto fail; + } mid.mid = pgno; mid.mptr = np; rc = mdb_mid2l_insert(dl, &mid); mdb_cassert(mc, rc == 0); - } else { - return 0; } + np_flags = np->mp_flags; /* P_ADM_FLAGS */ mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_flags |= np_flags; np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; + SET_PGTXNID(txn, np); done: /* Adjust cursors pointing to mp */ @@ -2818,9 +3034,9 @@ mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) int rc = 0; if (env->me_flags & MDB_RDONLY) return EACCES; - if (force -#ifndef _WIN32 /* Sync is normally achieved in Windows by doing WRITE_THROUGH writes */ - || !(env->me_flags & MDB_NOSYNC) + if (force || !(env->me_flags & MDB_NOSYNC) +#ifdef _WIN32 /* Sync is normally achieved in Windows by doing WRITE_THROUGH writes */ + && (env->me_flags & MDB_WRITEMAP) #endif ) { if (env->me_flags & MDB_WRITEMAP) { @@ -2850,7 +3066,46 @@ int mdb_env_sync(MDB_env *env, int force) { MDB_meta *m = mdb_env_pick_meta(env); - return mdb_env_sync0(env, force, m->mm_last_pg+1); + // + if (env->me_flags & MDB_OVERLAPPINGSYNC) { + MDB_txninfo *ti = env->me_txns; + size_t last_txn_id = ti->mti_txnid; + //fprintf(stderr,"syncing txn %u, ", last_txn_id); + int rc; + if (LOCK_MUTEX(rc, env, env->me_sync_mutex)) + return rc; + if (env->me_synced_txn_id >= last_txn_id) { + UNLOCK_MUTEX(env->me_sync_mutex); + return 0; + } + MDB_txn sync_txn; + MDB_db dbs[2]; + do { + m = mdb_env_pick_meta(env); + sync_txn.mt_env = env; + sync_txn.mt_flags = 2; + sync_txn.mt_dbs = dbs; + sync_txn.mt_dbs[FREE_DBI] = m->mm_dbs[FREE_DBI]; + sync_txn.mt_dbs[MAIN_DBI] = m->mm_dbs[MAIN_DBI]; + 
sync_txn.mt_dbs[FREE_DBI].md_flags &= ~MDB_OVERLAPPINGSYNC; // clear this to indicate it is flushed txn + sync_txn.mt_txnid = last_txn_id = m->mm_txnid; + sync_txn.mt_next_pgno = m->mm_last_pg + 1; + } while(ti->mti_txnid != last_txn_id); // avoid race condition in copying data by verifying that this is updated + rc = mdb_env_sync0(env, force, sync_txn.mt_next_pgno); + if (rc) { + UNLOCK_MUTEX(env->me_sync_mutex); + return rc; + } + rc = mdb_env_write_meta(&sync_txn); + if (rc == 0) + env->me_synced_txn_id = last_txn_id; + //fprintf(stderr,"finished syncing txn %u, ", last_txn_id); + UNLOCK_MUTEX(env->me_sync_mutex); + return rc; + } else { + return mdb_env_sync0(env, force, m->mm_last_pg+1); + } + // } /** Back up parent txn's cursors, then grab the originals for tracking */ @@ -3082,12 +3337,22 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_child = NULL; txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; - txn->mt_dirty_room = MDB_IDL_UM_MAX; + if (env->me_flags & MDB_WRITEMAP) { + txn->mt_workid = txn->mt_txnid; + txn->mt_dirty_room = 1; + } else { + txn->mt_workid = (txn->mt_txnid | MDB_PGTXNID_FLAGMASK) + 1; + txn->mt_dirty_room = MDB_IDL_UM_MAX; + } + txn->mt_last_workid = txn->mt_workid; txn->mt_u.dirty_list = env->me_dirty_list; txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; txn->mt_free_pgs[0] = 0; txn->mt_spill_pgs = NULL; +#if OVERFLOW_NOTYET + txn->mt_dirty_ovs = NULL; +#endif env->me_txn = txn; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); } @@ -3097,8 +3362,9 @@ mdb_txn_renew0(MDB_txn *txn) /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; -#ifdef MDB_VL32 - txn->mt_last_pgno = txn->mt_next_pgno - 1; +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) + txn->mt_last_pgno = txn->mt_next_pgno - 1; #endif txn->mt_flags = flags; @@ -3116,9 +3382,14 @@ mdb_txn_renew0(MDB_txn *txn) if (env->me_flags & MDB_FATAL_ERROR) { DPUTS("environment had fatal error, 
must shutdown!"); rc = MDB_PANIC; - } else if (env->me_maxpg < txn->mt_next_pgno) { - rc = MDB_MAP_RESIZED; } else { + /* */ + if (env->me_maxpg < txn->mt_next_pgno) { + // need to resize map + size_t new_size = ((size_t) (2 * (txn->mt_next_pgno) * env->me_psize / 0x40000 + 1)) * 0x40000; + mdb_env_set_mapsize(env, new_size); + } + /* */ return MDB_SUCCESS; } mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); @@ -3142,6 +3413,33 @@ mdb_txn_renew(MDB_txn *txn) return rc; } +/** Used up all workids. Rewind it and update dirty pages to match. */ +static txnid_t ESECT +mdb_workid_rewind(MDB_txn *txn) +{ + txnid_t workid, diff; + + while (txn->mt_parent) + txn = txn->mt_parent; + workid = txn->mt_txnid & ~MDB_PGTXNID_FLAGMASK; + do { + workid += MDB_PGTXNID_STEP; + diff = txn->mt_last_workid - workid; + if (diff) { + MDB_ID2L dl = txn->mt_u.dirty_list; + int i; + for (i = dl[0].mid; i; i--) { + if (MDB_PGTXNID_FLAGBITS) + ((MDB_page *)dl[i].mptr)->mp_txnid -= diff; + else + ((MDB_page *)dl[i].mptr)->mp_txnid = workid; + } + txn->mt_workid = txn->mt_last_workid = workid; + } + } while ((txn = txn->mt_child) != NULL); + return workid; +} + int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { @@ -3178,8 +3476,8 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) DPRINTF(("calloc: %s", strerror(errno))); return ENOMEM; } -#ifdef MDB_VL32 - if (!parent) { +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags) && !parent) { txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); if (!txn->mt_rpages) { free(txn); @@ -3197,6 +3495,10 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) if (parent) { unsigned int i; + txnid_t workid = parent->mt_last_workid + MDB_PGTXNID_STEP; + if (!workid) /* wraparound after lots of previous children */ + workid = mdb_workid_rewind(parent) + MDB_PGTXNID_STEP; + txn->mt_workid = txn->mt_last_workid = workid; txn->mt_cursors = 
(MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); @@ -3211,13 +3513,17 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; txn->mt_spill_pgs = NULL; +#if OVERFLOW_NOTYET + txn->mt_dirty_ovs = NULL; +#endif txn->mt_next_pgno = parent->mt_next_pgno; parent->mt_flags |= MDB_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; -#ifdef MDB_VL32 - txn->mt_rpages = parent->mt_rpages; +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) + txn->mt_rpages = parent->mt_rpages; #endif memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ @@ -3245,8 +3551,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } if (rc) { if (txn != env->me_txn0) { -#ifdef MDB_VL32 - free(txn->mt_rpages); +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) + free(txn->mt_rpages); #endif free(txn); } @@ -3350,6 +3657,10 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) txn->mt_numdbs = 0; txn->mt_flags = MDB_TXN_FINISHED; + mdb_midl_free(txn->mt_spill_pgs); +#if OVERFLOW_NOTYET + mdb_mid2l_free(txn->mt_dirty_ovs); +#endif if (!txn->mt_parent) { mdb_midl_shrink(&txn->mt_free_pgs); env->me_free_pgs = txn->mt_free_pgs; @@ -3370,12 +3681,10 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) mdb_midl_free(txn->mt_free_pgs); free(txn->mt_u.dirty_list); } - mdb_midl_free(txn->mt_spill_pgs); - mdb_midl_free(pghead); } -#ifdef MDB_VL32 - if (!txn->mt_parent) { +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags) && !txn->mt_parent) { MDB_ID3L el = env->me_rpages, tl = txn->mt_rpages; unsigned i, x, n = tl[0].mid; pthread_mutex_lock(&env->me_rpmutex); @@ -3383,13 +3692,23 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { /* tmp 
overflow pages that we didn't share in env */ munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + if (tl[i].menc) { + mdb_rpage_dispose(env, &tl[i]); + tl[i].menc = NULL; + } } else { x = mdb_mid3l_search(el, tl[i].mid); if (tl[i].mptr == el[x].mptr) { el[x].mref--; + if (!el[x].mref) + el[x].muse = 0; } else { /* another tmp overflow page */ munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + if (tl[i].menc) { + mdb_rpage_dispose(env, &tl[i]); + tl[i].menc = NULL; + } } } } @@ -3443,7 +3762,7 @@ mdb_freelist_save(MDB_txn *txn) */ MDB_cursor mc; MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1, lost_loose = 0; txnid_t pglast = 0, head_id = 0; pgno_t freecnt = 0, *free_pgs, *mop; ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; @@ -3466,6 +3785,7 @@ mdb_freelist_save(MDB_txn *txn) unsigned x; if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) return rc; + lost_loose = txn->mt_loose_count; for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* must also remove from dirty list */ @@ -3612,6 +3932,7 @@ mdb_freelist_save(MDB_txn *txn) /* Room for loose pages + temp IDL with same */ if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) return rc; + lost_loose += count; mop = env->me_pghead; loose = mop + MDB_IDL_ALLOCLEN(mop) - count; for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) @@ -3624,11 +3945,17 @@ mdb_freelist_save(MDB_txn *txn) mop_len = mop[0]; } - /* Fill in the reserved me_pghead records */ + /* Fill in the reserved me_pghead records. Everything is finally + * in place, so this will not allocate or free any DB pages. 
+ */ rc = MDB_SUCCESS; if (mop_len) { MDB_val key, data; + /* Protect DB env from any (buggy) freelist use when saving mop */ + env->me_pghead = NULL; + txn->mt_dirty_room = 0; + mop += mop_len; rc = mdb_cursor_first(&mc, &key, &data); for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { @@ -3642,18 +3969,33 @@ mdb_freelist_save(MDB_txn *txn) len = mop_len; data.mv_size = (len + 1) * sizeof(MDB_ID); } + mop_len -= len; data.mv_data = mop -= len; save = mop[0]; mop[0] = len; rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); mop[0] = save; - if (rc || !(mop_len -= len)) + if (rc || !mop_len) break; } + + env->me_pghead = mop - mop_len; } + + /* Restore this so we can check vs. dirty_list after mdb_page_flush() */ + if (! (txn->mt_flags & MDB_TXN_WRITEMAP)) + txn->mt_loose_count += lost_loose; + return rc; } +#if MDB_RPAGE_CACHE +static int mdb_rpage_decrypt(MDB_env *env, MDB_ID3 *id3, int rem, int numpgs); +static int mdb_page_encrypt(MDB_env *env, MDB_page *in, MDB_page *out, size_t size); +static int mdb_page_chk_checksum(MDB_env *env, MDB_page *mp, size_t size); +static void mdb_page_set_checksum(MDB_env *env, MDB_page *mp, size_t size); +#endif + /** Flush (some) dirty pages to the map, after clearing their dirty flag. * @param[in] txn the transaction that's being committed * @param[in] keep number of initial pages in dirty_list to keep dirty. @@ -3665,16 +4007,16 @@ mdb_page_flush(MDB_txn *txn, int keep) MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; + int i, pagecount = dl[0].mid, rc, *dl_nump, nump = 1; size_t size = 0; MDB_OFF_T pos = 0; pgno_t pgno = 0; MDB_page *dp = NULL; #ifdef _WIN32 - OVERLAPPED *ov = env->ov; + OVERLAPPED *ov, *this_ov; MDB_page *wdp; int async_i = 0; - HANDLE fd = (env->me_flags & MDB_NOSYNC) ? env->me_fd : env->me_ovfd; + HANDLE fd = (txn->mt_flags & MDB_NOSYNC) ? 
env->me_fd : env->me_ovfd; #else struct iovec iov[MDB_COMMIT_PAGES]; HANDLE fd = env->me_fd; @@ -3684,46 +4026,74 @@ mdb_page_flush(MDB_txn *txn, int keep) int n = 0; j = i = keep; - if (env->me_flags & MDB_WRITEMAP -#ifdef _WIN32 - /* In windows, we still do writes to the file (with write-through enabled in sync mode), - * as this is faster than FlushViewOfFile/FlushFileBuffers */ - && (env->me_flags & MDB_NOSYNC) -#endif - ) { - /* Clear dirty flags */ - while (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - } + + if (env->me_flags & MDB_WRITEMAP) { goto done; } + /* setup nump list, flag that it's in use */ + dl_nump = env->me_dirty_nump; + for (n=1; n<=pagecount; n++) { + dp = dl[n].mptr; + dl_nump[n] = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + } + txn->mt_flags |= MDB_TXN_DIRTYNUM; + /* */ + pgno = dl[pagecount].mid + dl_nump[pagecount]; + n = 0; + #ifdef _WIN32 - if (pagecount - keep >= env->ovs) { + if (!(env->me_flags & MDB_WRITEMAP) && pagecount > 0) { + DWORD file_high; + size_t file_size = GetFileSize(fd, &file_high); + file_size += (size_t) file_high << 32; + if (pgno * psize >= file_size) { + file_size = ((size_t) (pgno < 100 ? 2 : pgno < 1000 ? 1.5 : pgno < 10000 ? 1.25 : pgno < 100000 ? 
1.125 : 1.0625) * pgno * psize / 0x40000 + 1) * 0x40000; + LONG high_position = file_size >> 32; + if (SetFilePointer(fd, file_size & 0xffffffff, &high_position, FILE_BEGIN) == INVALID_SET_FILE_POINTER) { + fprintf(stderr, "SetFilePointer failed: %s\n", strerror(ErrCode())); + } else { + rc = SetEndOfFile(fd); + if (!rc) { + rc = ErrCode(); + fprintf(stderr, "SetEndOfFile error %s\n", strerror(rc)); + } + } + } + if (!MDB_REMAPPING(env->me_flags)) { + MDB_meta *m = mdb_env_pick_meta(env); + void *p; + p = (MDB_page *)(env->me_map + env->me_psize * m->mm_last_pg); + if (pgno > m->mm_last_pg) { + p = VirtualAlloc(p, env->me_psize * (pgno - m->mm_last_pg), MEM_COMMIT, PAGE_READONLY); + if (!p) { + fprintf(stderr, "VirtualAlloc failed\n"); + DPUTS("VirtualAlloc failed"); + return ErrCode(); + } + } + } + } + /* */ + if (pagecount - keep >= env->me_ovs) { /* ran out of room in ov array, and re-malloc, copy handles and free previous */ int ovs = (pagecount - keep) * 1.5; /* provide extra padding to reduce number of re-allocations */ int new_size = ovs * sizeof(OVERLAPPED); ov = malloc(new_size); if (ov == NULL) return ENOMEM; - int previous_size = env->ovs * sizeof(OVERLAPPED); - memcpy(ov, env->ov, previous_size); /* Copy previous OVERLAPPED data to retain event handles */ + int previous_size = env->me_ovs * sizeof(OVERLAPPED); + memcpy(ov, env->me_ov, previous_size); /* Copy previous OVERLAPPED data to retain event handles */ /* And clear rest of memory */ - memset(&ov[env->ovs], 0, new_size - previous_size); - if (env->ovs > 0) { - free(env->ov); /* release previous allocation */ + memset(&ov[env->me_ovs], 0, new_size - previous_size); + if (env->me_ovs > 0) { + free(env->me_ov); /* release previous allocation */ } - env->ov = ov; - env->ovs = ovs; + env->me_ov = ov; + env->me_ovs = ovs; } + ov = env->me_ov; #endif /* Write the pages */ @@ -3737,39 +4107,40 @@ mdb_page_flush(MDB_txn *txn, int keep) continue; } pgno = dl[i].mid; - /* clear dirty flag */ - dp->mp_flags 
&= ~P_DIRTY; + /* Mark the page as clean */ + dp->mp_txnid = txn->mt_txnid; pos = pgno * psize; size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + nump = dl_nump[i]; + size *= nump; } /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE #ifdef _WIN32 - /* If writemap is enabled, consecutive page positions infer - * contiguous (mapped) memory. - * Otherwise force write pages one at a time. - * Windows actually supports scatter/gather I/O, but only on + /* Windows actually supports scatter/gather I/O, but only on * unbuffered file handles. Since we're relying on the OS page * cache for all our data, that's self-defeating. So we just * write pages one at a time. We use the ov structure to set * the write offset, to at least save the overhead of a Seek * system call. + * If writemap is enabled, consecutive page positions infer + * contiguous (mapped) memory. + * Otherwise force write pages one at a time. */ || !(env->me_flags & MDB_WRITEMAP) #endif ) { if (n) { retry_write: + rc = 0; /* Write previous page(s) */ - DPRINTF(("committing page %"Z"u", pgno)); #ifdef _WIN32 - OVERLAPPED *this_ov = &ov[async_i]; + this_ov = &ov[async_i]; /* Clear status, and keep hEvent, we reuse that */ this_ov->Internal = 0; this_ov->Offset = wpos & 0xffffffff; this_ov->OffsetHigh = wpos >> 16 >> 16; - if (!F_ISSET(env->me_flags, MDB_NOSYNC) && !this_ov->hEvent) { + if (!F_ISSET(txn->mt_flags, MDB_NOSYNC) && !this_ov->hEvent) { HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL); if (!event) { rc = ErrCode(); @@ -3784,9 +4155,10 @@ mdb_page_flush(MDB_txn *txn, int keep) DPRINTF(("WriteFile: %d", rc)); return rc; } + rc = 0; } async_i++; -#else +#else /* _WIN32 */ #ifdef MDB_USE_PWRITEV wres = pwritev(fd, iov, n, wpos); #else @@ -3799,9 +4171,11 @@ mdb_page_flush(MDB_txn *txn, int keep) if (rc == EINTR) goto retry_seek; DPRINTF(("lseek: %s", strerror(rc))); - return rc; + wres = wsize; + } else { + rc = 0; + 
wres = writev(fd, iov, n); } - wres = writev(fd, iov, n); } #endif if (wres != wsize) { @@ -3814,35 +4188,58 @@ mdb_page_flush(MDB_txn *txn, int keep) rc = EIO; /* TODO: Use which error code? */ DPUTS("short write, filesystem full?"); } - return rc; } #endif /* _WIN32 */ + if (rc) + return rc; n = 0; } if (i > pagecount) break; wpos = pos; wsize = 0; -#ifdef _WIN32 - wdp = dp; } -#else +#if MDB_RPAGE_CACHE + if (env->me_sumfunc) { + mdb_page_set_checksum(env, dp, size); } + if (env->me_encfunc) { + MDB_page *encp = mdb_page_malloc(txn, nump, 0); + if (!encp) + return ENOMEM; + if (mdb_page_encrypt(env, dp, encp, size)) { + mdb_dpage_free_n(env, encp, nump); + return MDB_CRYPTO_FAIL; + } + mdb_dpage_free_n(env, dp, nump); + dp = encp; + dl[i].mptr = dp; + } +#endif +#ifdef _WIN32 + wdp = dp; +#else iov[n].iov_len = size; iov[n].iov_base = (char *)dp; -#endif /* _WIN32 */ +#endif DPRINTF(("committing page %"Yu, pgno)); next_pos = pos + size; wsize += size; n++; } -#ifdef MDB_VL32 - if (pgno > txn->mt_last_pgno) +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags) && pgno > txn->mt_last_pgno) txn->mt_last_pgno = pgno; #endif + /* MIPS has cache coherency issues, this is a no-op everywhere else + * Note: for any size >= on-chip cache size, entire on-chip cache is + * flushed. + */ + CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); + #ifdef _WIN32 - if (!F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (!(txn->mt_flags & MDB_NOSYNC)) { /* Now wait for all the asynchronous/overlapped sync/write-through writes to complete. 
* We start with the last one so that all the others should already be complete and * we reduce thread suspend/resuming (in practice, typically about 99.5% of writes are @@ -3861,14 +4258,9 @@ mdb_page_flush(MDB_txn *txn, int keep) } #endif /* _WIN32 */ - if (!(env->me_flags & MDB_WRITEMAP)) { - /* Don't free pages when using writemap (can only get here in NOSYNC mode in Windows) - * MIPS has cache coherency issues, this is a no-op everywhere else - * Note: for any size >= on-chip cache size, entire on-chip cache is - * flushed. - */ - CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); - + if (!(env->me_flags & MDB_WRITEMAP)) { + /* Don't free pages when using writemap (can only get here in NOSYNC mode in Windows) + */ for (i = keep; ++i <= pagecount; ) { dp = dl[i].mptr; /* This is a page we skipped above */ @@ -3877,14 +4269,16 @@ mdb_page_flush(MDB_txn *txn, int keep) dl[j].mid = dp->mp_pgno; continue; } - mdb_dpage_free(env, dp); + mdb_dpage_free_n(env, dp, dl_nump[i]); } } + txn->mt_flags ^= MDB_TXN_DIRTYNUM; done: i--; txn->mt_dirty_room += i - j; dl[0].mid = j; + return MDB_SUCCESS; } @@ -4048,6 +4442,7 @@ mdb_txn_commit(MDB_txn *txn) *lp = txn->mt_loose_pgs; parent->mt_loose_count += txn->mt_loose_count; + parent->mt_last_workid = txn->mt_last_workid; parent->mt_child = NULL; mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); free(txn); @@ -4106,11 +4501,29 @@ mdb_txn_commit(MDB_txn *txn) if ((rc = mdb_page_flush(txn, 0))) goto fail; + if ((unsigned)txn->mt_loose_count < txn->mt_u.dirty_list[0].mid) { + rc = MDB_PROBLEM; /* mt_loose_pgs does not match dirty_list */ + goto fail; + } + if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC) && (rc = mdb_env_sync0(env, 0, txn->mt_next_pgno))) goto fail; + + // + if ((txn->mt_flags & MDB_NOSYNC) && (env->me_flags & MDB_OVERLAPPINGSYNC)) + txn->mt_dbs[FREE_DBI].md_flags |= MDB_OVERLAPPINGSYNC; + else + txn->mt_dbs[FREE_DBI].md_flags &= ~MDB_OVERLAPPINGSYNC; + // + if ((rc = mdb_env_write_meta(txn))) 
goto fail; + + // + if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC)) + env->me_synced_txn_id = txn->mt_txnid; + // end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; if (env->me_flags & MDB_PREVSNAPSHOT) { if (!(env->me_flags & MDB_NOLOCK)) { @@ -4131,6 +4544,21 @@ mdb_txn_commit(MDB_txn *txn) return rc; } +MDB_meta* mdb_pick_meta(const MDB_env *env, MDB_meta* a, MDB_meta* b) { + if (env->me_flags & MDB_PREVSNAPSHOT) { + if (env->me_flags & MDB_OVERLAPPINGSYNC) { + if (!b->mm_txnid) + return a; + return (a->mm_txnid + (a->mm_flags & MDB_OVERLAPPINGSYNC ? 0 : 0x10000)) > + (b->mm_txnid + (b->mm_flags & MDB_OVERLAPPINGSYNC ? 0 : 0x10000)) ? a : b; + } + return a->mm_txnid > b->mm_txnid ? b : a; + } + return a->mm_txnid >= b->mm_txnid ? a : b; +} + +static int ESECT mdb_env_map(MDB_env *env, void *addr); + /** Read the environment parameters of a DB environment before * mapping it into memory. * @param[in] env the environment handle @@ -4147,11 +4575,38 @@ mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) int i, rc, off; enum { Size = sizeof(pbuf) }; + if (env->me_flags & MDB_RAWPART) { +#define VM_ALIGN 0x200000 + env->me_mapsize += VM_ALIGN-1; + env->me_mapsize &= ~(VM_ALIGN-1); + env->me_psize = env->me_os_psize; + rc = mdb_env_map(env, NULL); + if (rc) + return rc; + p = (MDB_page *)env->me_map; + for (i=0; imp_flags, P_META)) + return ENOENT; + if (env->me_metas[i]->mm_magic != MDB_MAGIC) + return MDB_INVALID; + if ((env->me_metas[i]->mm_version & 0xffff) != MDB_DATA_VERSION) + return MDB_VERSION_MISMATCH; + if (i == 0) + *meta = *env->me_metas[i]; + else + *meta = *mdb_pick_meta(env, meta, env->me_metas[i]); + p = (MDB_page *)((char *)p + env->me_psize); + } + return 0; + } + /* We don't know the page size yet, so use a minimum value. * Read both meta pages so we can use the latest one. */ + int num_metas = env->me_flags & MDB_OVERLAPPINGSYNC ? 3 : NUM_METAS; - for (i=off=0; imm_psize) { + for (i=off=0; ime_flags & MDB_OVERLAPPINGSYNC ? 
meta->mm_psize >> 1 : meta->mm_psize) { #ifdef _WIN32 DWORD len; OVERLAPPED ov; @@ -4172,26 +4627,31 @@ mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) } p = (MDB_page *)&pbuf; + if (off == 0) { + if (!F_ISSET(p->mp_flags, P_META)) { + if (env->me_flags & MDB_RAWPART) + return ENOENT; + DPRINTF(("page %"Yu" not a meta page", p->mp_pgno)); + return MDB_INVALID; + } - if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF(("page %"Yu" not a meta page", p->mp_pgno)); - return MDB_INVALID; - } - - m = METADATA(p); - if (m->mm_magic != MDB_MAGIC) { - DPUTS("meta has invalid magic"); - return MDB_INVALID; - } - - if (m->mm_version != MDB_DATA_VERSION) { - DPRINTF(("database is version %u, expected version %u", - m->mm_version, MDB_DATA_VERSION)); - return MDB_VERSION_MISMATCH; - } + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } - if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid)) + if ((m->mm_version & 0xffff) != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + (m->mm_version & 0xffff), MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } *meta = *m; + } else { + m = METADATA(p); + *meta = *mdb_pick_meta(env, meta, m); + } + // } return 0; } @@ -4240,6 +4700,18 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) psize = env->me_psize; + if ((env->me_flags & (MDB_RAWPART|MDB_WRITEMAP)) == (MDB_RAWPART|MDB_WRITEMAP)) { + p = (MDB_page *)env->me_map; + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + return 0; + } + p = calloc(NUM_METAS, psize); if (!p) return ENOMEM; @@ -4252,6 +4724,14 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) q->mp_flags = P_META; *(MDB_meta *)METADATA(q) = *meta; +#if MDB_RPAGE_CACHE + if (env->me_sumsize) { + /* save the checksum size in tail of page 0 */ + char 
*ptr = (char *)q; + unsigned short *u = (unsigned short *)(ptr-2); + *u = env->me_sumsize; + } +#endif DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); if (!rc) rc = ErrCode(); @@ -4332,11 +4812,15 @@ mdb_env_write_meta(MDB_txn *txn) meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; + meta.mm_version = MDB_DATA_VERSION; off = offsetof(MDB_meta, mm_mapsize); ptr = (char *)&meta + off; len = sizeof(MDB_meta) - off; - off += (char *)mp - env->me_map; + if (flags & 2) { + off += PAGEHDRSZ + (env->me_psize >> 1); + } else + off += (char *)mp - env->me_map; /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. * (me_mfd goes to the same file as me_fd, but writing to it @@ -4388,7 +4872,7 @@ mdb_env_write_meta(MDB_txn *txn) * readers will get consistent data regardless of how fresh or * how stale their view of these values is. */ - if (env->me_txns) + if (env->me_txns && !(flags & 2)) env->me_txns->mti_txnid = txn->mt_txnid; return MDB_SUCCESS; @@ -4402,8 +4886,15 @@ static MDB_meta * mdb_env_pick_meta(const MDB_env *env) { MDB_meta *const *metas = env->me_metas; - return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^ - ((env->me_flags & MDB_PREVSNAPSHOT) != 0) ]; + // + MDB_meta *latest = mdb_pick_meta(env, metas[0], metas[1]); + if (env->me_flags & MDB_PREVSNAPSHOT && env->me_flags & MDB_OVERLAPPINGSYNC) { + int offset = env->me_psize >> 1; + MDB_meta *flushed = ((MDB_meta*) (((char*)metas[0]) + offset)); + latest = mdb_pick_meta(env, latest, flushed); + } + // + return latest; } int ESECT @@ -4423,9 +4914,11 @@ mdb_env_create(MDB_env **env) #ifdef MDB_USE_POSIX_SEM e->me_rmutex = SEM_FAILED; e->me_wmutex = SEM_FAILED; + e->me_sync_mutex = SEM_FAILED; #elif defined MDB_USE_SYSV_SEM e->me_rmutex->semid = -1; e->me_wmutex->semid = -1; + e->me_sync_mutex->semid = -1; #endif e->me_pid = getpid(); GET_PAGESIZE(e->me_os_psize); @@ -4490,37 +4983,39 @@ mdb_env_map(MDB_env *env, void *addr) if 
(rc) return mdb_nt2win32(rc); map = addr; -#ifdef MDB_VL32 - msize = NUM_METAS * env->me_psize; -#endif + if (MDB_REMAPPING(env->me_flags)) + msize = NUM_METAS * env->me_psize; rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot); -#ifdef MDB_VL32 - env->me_fmh = mh; -#else - NtClose(mh); +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) + env->me_fmh = mh; + else #endif + NtClose(mh); if (rc) return mdb_nt2win32(rc); env->me_map = map; -#else +#else /* !_WIN32 */ int mmap_flags = MAP_SHARED; int prot = PROT_READ; + if (flags & MDB_WRITEMAP) + prot |= PROT_WRITE; #ifdef MAP_NOSYNC /* Used on FreeBSD */ if (flags & MDB_NOSYNC) mmap_flags |= MAP_NOSYNC; #endif -#ifdef MDB_VL32 - (void) flags; - env->me_map = mmap(addr, NUM_METAS * env->me_psize, prot, mmap_flags, - env->me_fd, 0); - if (env->me_map == MAP_FAILED) { - env->me_map = NULL; - return ErrCode(); - } -#else + if (MDB_REMAPPING(env->me_flags)) { + (void) flags; + env->me_map = mmap(addr, NUM_METAS * env->me_psize, prot, mmap_flags, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } + } else + { if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) + if (!(flags & MDB_RAWPART) && ftruncate(env->me_fd, env->me_mapsize) < 0) return ErrCode(); } env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags, @@ -4540,6 +5035,7 @@ mdb_env_map(MDB_env *env, void *addr) #endif /* POSIX_MADV_RANDOM */ #endif /* MADV_RANDOM */ } + } #endif /* _WIN32 */ /* Can happen because the address argument to mmap() is just a @@ -4549,7 +5045,6 @@ mdb_env_map(MDB_env *env, void *addr) */ if (addr && env->me_map != addr) return EBUSY; /* TODO: Make a new MDB_* error code? 
*/ -#endif p = (MDB_page *)env->me_map; env->me_metas[0] = METADATA(p); @@ -4566,12 +5061,15 @@ mdb_env_set_mapsize(MDB_env *env, mdb_size_t size) */ if (env->me_map) { MDB_meta *meta; -#ifndef MDB_VL32 void *old; int rc; -#endif - if (env->me_txn) + + + /* + if (env->me_txn) We are intentionally resizing during transactions now return EINVAL; + */ + meta = mdb_env_pick_meta(env); if (!size) size = meta->mm_mapsize; @@ -4581,17 +5079,25 @@ mdb_env_set_mapsize(MDB_env *env, mdb_size_t size) if (size < minsize) size = minsize; } -#ifndef MDB_VL32 - /* For MDB_VL32 this bit is a noop since we dynamically remap + if (!(MDB_REMAPPING(env->me_flags))) + { + /* For MDB_REMAP_CHUNKS this bit is a noop since we dynamically remap * chunks of the DB anyway. */ - munmap(env->me_map, env->me_mapsize); + /* We don't unmap right now because we intentionally want to leave old maps around for lingering read transactions and other threads that haven't resized yet */ + MDB_last_map* last_map = malloc(sizeof(MDB_last_map)); + last_map->last_map = env->me_last_map; + last_map->map = env->me_map; + last_map->mapsize = env->me_mapsize; + env->me_last_map = last_map; + //munmap(env->me_map, env->me_mapsize); + /* */ env->me_mapsize = size; old = (env->me_flags & MDB_FIXEDMAP) ? 
env->me_map : NULL; rc = mdb_env_map(env, old); if (rc) return rc; -#endif /* !MDB_VL32 */ + } } env->me_mapsize = size; if (env->me_psize) @@ -4871,7 +5377,7 @@ mdb_env_open2(MDB_env *env, int prev) if (!NtCreateSection) return MDB_PROBLEM; } - env->ovs = 0; + env->me_ovs = 0; #endif /* _WIN32 */ #ifdef BROKEN_FDATASYNC @@ -4989,6 +5495,18 @@ mdb_env_open2(MDB_env *env, int prev) return i; } } + if ((env->me_flags ^ env->me_metas[0]->mm_flags) & MDB_ENCRYPT) + return MDB_ENV_ENCRYPTION; + +#if MDB_RPAGE_CACHE + if (!newenv && env->me_sumfunc) { + /* for checksums, check sum size from tail of page 0 */ + char *ptr = env->me_map + env->me_psize; + unsigned short *u = (unsigned short *)(ptr - 2); + if (*u != env->me_sumsize) + return MDB_BAD_CHECKSUM; + } +#endif env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) @@ -5100,6 +5618,9 @@ static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { int rc = 0; + MDB_meta *meta = mdb_env_pick_meta(env); + + env->me_txns->mti_txnid = meta->mm_txnid; #ifdef _WIN32 { @@ -5382,6 +5903,8 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) if (!env->me_rmutex) goto fail_errno; env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 'w')); if (!env->me_wmutex) goto fail_errno; + env->me_sync_mutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 's')); + if (!env->me_sync_mutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) struct stat stbuf; struct { @@ -5410,10 +5933,13 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) */ sem_unlink(MUTEXNAME(env, 'r')); sem_unlink(MUTEXNAME(env, 'w')); + sem_unlink(MUTEXNAME(env, 's')); env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), O_CREAT|O_EXCL, mode, 1); if (env->me_rmutex == SEM_FAILED) goto fail_errno; env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), O_CREAT|O_EXCL, mode, 1); if (env->me_wmutex == SEM_FAILED) goto fail_errno; + 
env->me_sync_mutex = sem_open(MUTEXNAME(env, 's'), O_CREAT|O_EXCL, mode, 1); + if (env->me_sync_mutex == SEM_FAILED) goto fail_errno; #elif defined(MDB_USE_SYSV_SEM) unsigned short vals[2] = {1, 1}; key_t key = ftok(fname->mn_val, 'M'); /* fname is lockfile path now */ @@ -5428,6 +5954,7 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) env->me_txns->mti_semid = semid; env->me_txns->mti_rlocked = 0; env->me_txns->mti_wlocked = 0; + env->me_txns->mti_sync_locked = 0; #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; @@ -5437,6 +5964,7 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) */ memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); + memset(env->me_txns->mti_sync_mutex, 0, sizeof(*env->me_txns->mti_sync_mutex)); if ((rc = pthread_mutexattr_init(&mattr)) != 0) goto fail; @@ -5446,6 +5974,7 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) #endif if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_sync_mutex, &mattr); pthread_mutexattr_destroy(&mattr); if (rc) goto fail; @@ -5481,12 +6010,16 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) if (!env->me_rmutex) goto fail_errno; env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'w')); if (!env->me_wmutex) goto fail_errno; + env->me_sync_mutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 's')); + if (!env->me_sync_mutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) mdb_env_mname_init(env); env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), 0); if (env->me_rmutex == SEM_FAILED) goto fail_errno; env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), 0); if (env->me_wmutex == SEM_FAILED) goto fail_errno; + env->me_sync_mutex = sem_open(MUTEXNAME(env, 's'), 0); + if 
(env->me_sync_mutex == SEM_FAILED) goto fail_errno; #elif defined(MDB_USE_SYSV_SEM) semid = env->me_txns->mti_semid; semu.buf = &buf; @@ -5501,10 +6034,13 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) #ifdef MDB_USE_SYSV_SEM env->me_rmutex->semid = semid; env->me_wmutex->semid = semid; + env->me_sync_mutex->semid = semid; env->me_rmutex->semnum = 0; env->me_wmutex->semnum = 1; + env->me_sync_mutex->semnum = 2; env->me_rmutex->locked = &env->me_txns->mti_rlocked; env->me_wmutex->locked = &env->me_txns->mti_wlocked; + env->me_sync_mutex->locked = &env->me_txns->mti_sync_locked; #endif return MDB_SUCCESS; @@ -5515,15 +6051,76 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) return rc; } +#ifdef MDB_TEST +#if MDB_RPAGE_CACHE +/** Trivial encryption for testing */ +static void ESECT +mdb_enctest(const MDB_val *src, MDB_val *dst, const MDB_val *key, int encdec) +{ + mdb_size_t *sptr = src->mv_data, *dptr = dst->mv_data; + mdb_size_t x=*(mdb_size_t*)key[0].mv_data, v=*(mdb_size_t*)key[1].mv_data; + int i, len = dst->mv_size / sizeof(mdb_size_t); + + for (i = 0; i < len; i++) + x += v += i + sptr[i] + (dptr[i] = sptr[i] ^ x); +} +#endif /* MDB_RPAGE_CACHE */ + +/** Add #mdb_env_open() flags from environment variable $LMDB_FLAGS. + * + * Supports the normal flags plus 'e' = trivial encryption for testing. 
+ */ +static int ESECT +mdb_env_envflags(MDB_env *env) +{ + static const char names[] = "ace" "fhi" "lmn" "rst" "vw"; + static const unsigned f[] = { + /*a*/ MDB_MAPASYNC, /*c*/ MDB_REMAP_CHUNKS, /*e*/ MDB_ENCRYPT, + /*f*/ MDB_FIXEDMAP, /*h*/ MDB_NORDAHEAD, /*i*/ MDB_NOMEMINIT, + /*l*/ MDB_NOLOCK, /*m*/ MDB_NOMETASYNC, /*n*/ MDB_NOSUBDIR, + /*r*/ MDB_RDONLY, /*s*/ MDB_NOSYNC, /*t*/ MDB_NOTLS, + /*v*/ MDB_PREVSNAPSHOT, /*w*/ MDB_WRITEMAP, + }; + unsigned flags = 0; + const char *s, *opts = getenv("LMDB_FLAGS"); + if (opts) { + for (; *opts; opts++) { + if ((s = strchr(names, *opts)) == NULL) + return EINVAL; + flags |= f[s - names]; + } + if (flags & MDB_ENCRYPT) { +#if MDB_RPAGE_CACHE + if (!env->me_encfunc) { + static mdb_size_t k = (MDB_SIZE_MAX/67*73) | 1; + MDB_val key = {sizeof(k), &k}; + int rc; + rc = mdb_env_set_encrypt(env, mdb_enctest, &key, 0); + if (rc) + return rc; + } +#else + return EINVAL; +#endif + } + env->me_flags |= flags; + } + return MDB_SUCCESS; +} +#else +#define mdb_env_envflags(env) MDB_SUCCESS +#endif /* MDB_TEST */ + /** Only a subset of the @ref mdb_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. 
*/ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT) + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT|MDB_REMAP_CHUNKS|MDB_OVERLAPPINGSYNC) +#define EXPOSED (CHANGEABLE|CHANGELESS | MDB_ENCRYPT) -#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) +#if VALID_FLAGS & PERSISTENT_FLAGS & EXPOSED # error "Persistent DB flags & env flags overlap, but both go in mm_flags" #endif @@ -5532,27 +6129,34 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode { int rc, excl = -1; MDB_name fname; - if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; -#ifdef MDB_VL32 - if (flags & MDB_WRITEMAP) { - /* silently ignore WRITEMAP in 32 bit mode */ - flags ^= MDB_WRITEMAP; - } - if (flags & MDB_FIXEDMAP) { + if ((rc = mdb_env_envflags(env)) != MDB_SUCCESS) + return rc; + flags |= env->me_flags; + + if (MDB_REMAPPING(0)) /* if we always remap chunks */ + flags |= MDB_REMAP_CHUNKS; + if (MDB_REMAPPING(flags)) { + /* silently ignore WRITEMAP with REMAP_CHUNKS */ + flags &= ~MDB_WRITEMAP; +#if (MDB_RPAGE_CACHE) & 2 + /* TEST: silently ignore FIXEDMAP, so mtest*.c will work */ + flags &= ~MDB_FIXEDMAP; +#else /* cannot support FIXEDMAP */ - return EINVAL; - } + if (flags & MDB_FIXEDMAP) + return EINVAL; #endif - flags |= env->me_flags; + } rc = mdb_fname_init(path, flags, &fname); if (rc) return rc; -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(flags)) { #ifdef _WIN32 env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); if (!env->me_rpmutex) { @@ -5564,15 +6168,28 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode if (rc) goto leave; #endif + } +#endif +#ifndef _WIN32 + { + struct stat st; + flags &= ~MDB_RAWPART; + if (!stat(path, &st) && (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode))) + flags |= 
MDB_RAWPART | MDB_NOSUBDIR; + } #endif - flags |= MDB_ENV_ACTIVE; /* tell mdb_env_close0() to clean up */ + flags |= MDB_ENV_ACTIVE; /* tell mdb_env_close_active() to clean up */ if (flags & MDB_RDONLY) { /* silently ignore WRITEMAP when we're only getting read access */ flags &= ~MDB_WRITEMAP; } else { + /* WRITEMAP has a dummy element to match dirty_room = 1 */ + size_t dl_size = (flags & MDB_WRITEMAP) ? 2 : MDB_IDL_UM_SIZE; if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && - (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + (env->me_dirty_list = calloc(dl_size, sizeof(MDB_ID2))))) + rc = ENOMEM; + if (env->me_dirty_list && !(env->me_dirty_nump = malloc(dl_size * sizeof(int)))) rc = ENOMEM; } @@ -5580,7 +6197,8 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode if (rc) goto leave; -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(flags)) { env->me_rpages = malloc(MDB_ERPAGE_SIZE * sizeof(MDB_ID3)); if (!env->me_rpages) { @@ -5591,7 +6209,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode env->me_rpcheck = MDB_ERPAGE_SIZE/2; } #endif - + /**/ + env->me_last_map = NULL; + /**/ env->me_path = strdup(path); env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); @@ -5608,8 +6228,10 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode if (rc) goto leave; if ((flags & MDB_PREVSNAPSHOT) && !excl) { - rc = EAGAIN; - goto leave; + // + flags ^= MDB_PREVSNAPSHOT; + env->me_flags = flags; + // } } @@ -5637,13 +6259,34 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); if (rc) goto leave; - DPRINTF(("opened dbenv %p", (void *) env)); - if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { + // + if (excl > 0) { + if (flags & MDB_PREVSNAPSHOT) { + MDB_meta* safe_meta = mdb_env_pick_meta(env); + 
flags &= ~MDB_PREVSNAPSHOT; // clear the flag now, so we can compare to the latest + env->me_flags = flags; + MDB_meta* latest = mdb_env_pick_meta(env); + if (latest->mm_txnid != safe_meta->mm_txnid) { + MDB_txn rollback_txn; + MDB_db dbs[2]; + rollback_txn.mt_env = env; + rollback_txn.mt_flags = 0; + rollback_txn.mt_dbs = dbs; + rollback_txn.mt_dbs[FREE_DBI] = safe_meta->mm_dbs[FREE_DBI]; + rollback_txn.mt_dbs[MAIN_DBI] = safe_meta->mm_dbs[MAIN_DBI]; + rollback_txn.mt_txnid = safe_meta->mm_txnid; + rollback_txn.mt_next_pgno = safe_meta->mm_last_pg + 1; + mdb_env_write_meta(&rollback_txn); + rollback_txn.mt_txnid--; // overwrite both meta pages to safe meta data + mdb_env_write_meta(&rollback_txn); + } + } rc = mdb_env_share_locks(env, &excl); if (rc) goto leave; } + // if (!(flags & MDB_RDONLY)) { MDB_txn *txn; int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * @@ -5656,7 +6299,8 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) { txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); if (!txn->mt_rpages) { free(txn); @@ -5665,6 +6309,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode } txn->mt_rpages[0].mid = 0; txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; + } #endif txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDB_TXN_FINISHED; @@ -5677,15 +6322,15 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode leave: if (rc) { - mdb_env_close0(env, excl); + mdb_env_close_active(env, excl); } mdb_fname_destroy(fname); return rc; } -/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ +/** When #MDB_ENV_ACTIVE: Clear #mdb_env_open()ed resources, release readers */ static void ESECT -mdb_env_close0(MDB_env *env, int excl) 
+mdb_env_close_active(MDB_env *env, int excl) { int i; @@ -5704,7 +6349,9 @@ mdb_env_close0(MDB_env *env, int excl) free(env->me_dbflags); free(env->me_path); free(env->me_dirty_list); -#ifdef MDB_VL32 + free(env->me_dirty_nump); +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) { if (env->me_txn0 && env->me_txn0->mt_rpages) free(env->me_txn0->mt_rpages); if (env->me_rpages) { @@ -5714,6 +6361,7 @@ mdb_env_close0(MDB_env *env, int excl) munmap(el[x].mptr, el[x].mcnt * env->me_psize); free(el); } + } #endif free(env->me_txn0); mdb_midl_free(env->me_free_pgs); @@ -5732,20 +6380,29 @@ mdb_env_close0(MDB_env *env, int excl) } if (env->me_map) { -#ifdef MDB_VL32 - munmap(env->me_map, NUM_METAS*env->me_psize); -#else - munmap(env->me_map, env->me_mapsize); -#endif + if (MDB_REMAPPING(env->me_flags)) + munmap(env->me_map, NUM_METAS*env->me_psize); + else { + munmap(env->me_map, env->me_mapsize); + /**/ + MDB_last_map *last_map = env->me_last_map; + while(last_map) { // unmap all of the previous maps as well + munmap(last_map->map, last_map->mapsize); + MDB_last_map *last_last_map = last_map; + last_map = last_map->last_map; + free(last_last_map); + } + /**/ + } } if (env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); #ifdef _WIN32 - if (env->ovs > 0) { - for (i = 0; i < env->ovs; i++) { - CloseHandle(env->ov[i].hEvent); + if (env->me_ovs > 0) { + for (i = 0; i < env->me_ovs; i++) { + CloseHandle(env->me_ov[i].hEvent); } - free(env->ov); + free(env->me_ov); } if (env->me_ovfd != INVALID_HANDLE_VALUE) (void) close(env->me_ovfd); @@ -5768,6 +6425,7 @@ mdb_env_close0(MDB_env *env, int excl) if (env->me_rmutex) { CloseHandle(env->me_rmutex); if (env->me_wmutex) CloseHandle(env->me_wmutex); + if (env->me_sync_mutex) CloseHandle(env->me_sync_mutex); } /* Windows automatically destroys the mutexes when * the last handle closes. 
@@ -5777,6 +6435,8 @@ mdb_env_close0(MDB_env *env, int excl) sem_close(env->me_rmutex); if (env->me_wmutex != SEM_FAILED) sem_close(env->me_wmutex); + if (env->me_sync_mutex != SEM_FAILED) + sem_close(env->me_sync_mutex); /* If we have the filelock: If we are the * only remaining user, clean up semaphores. */ @@ -5785,6 +6445,7 @@ mdb_env_close0(MDB_env *env, int excl) if (excl > 0) { sem_unlink(MUTEXNAME(env, 'r')); sem_unlink(MUTEXNAME(env, 'w')); + sem_unlink(MUTEXNAME(env, 's')); } } #elif defined(MDB_USE_SYSV_SEM) @@ -5807,6 +6468,7 @@ mdb_env_close0(MDB_env *env, int excl) if (excl > 0) { pthread_mutex_destroy(env->me_txns->mti_rmutex); pthread_mutex_destroy(env->me_txns->mti_wmutex); + pthread_mutex_destroy(env->me_txns->mti_sync_mutex); } #endif munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); @@ -5822,13 +6484,16 @@ mdb_env_close0(MDB_env *env, int excl) #endif (void) close(env->me_lfd); } -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(env->me_flags)) + { #ifdef _WIN32 if (env->me_fmh) CloseHandle(env->me_fmh); if (env->me_rpmutex) CloseHandle(env->me_rpmutex); #else pthread_mutex_destroy(&env->me_rpmutex); #endif + } #endif env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); @@ -5849,7 +6514,10 @@ mdb_env_close(MDB_env *env) free(dp); } - mdb_env_close0(env, 0); + mdb_env_close_active(env, 0); +#if MDB_RPAGE_CACHE + free(env->me_enckey.mv_data); +#endif free(env); } @@ -6098,7 +6766,37 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) return MDB_SUCCESS; } -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + +static int +mdb_rpage_encsum(MDB_env *env, MDB_ID3 *id3, unsigned rem, int numpgs) +{ + int rc = 0; + if (env->me_encfunc) { + unsigned short muse = id3->muse; + rc = mdb_rpage_decrypt(env, id3, rem, numpgs); + if (!rc && env->me_sumfunc && muse != id3->muse) { + MDB_page *p = (MDB_page *)((char *)id3->menc + rem * env->me_psize); + rc = mdb_page_chk_checksum(env, p, numpgs * env->me_psize); + } + } else { + if 
(!(id3->muse & (1 << rem))) { + MDB_page *p; + int bit; + /* If this is an overflow page, set all use bits to the end */ + if (rem + numpgs > MDB_RPAGE_CHUNK) + bit = 0xffff; + else + bit = 1; + + id3->muse |= (bit << rem); + p = (MDB_page *)((char *)id3->mptr + rem * env->me_psize); + rc = mdb_page_chk_checksum(env, p, numpgs * env->me_psize); + } + } + return rc; +} + /** Map a read-only page. * There are two levels of tracking in use, a per-txn list and a per-env list. * ref'ing and unref'ing the per-txn list is faster since it requires no @@ -6144,18 +6842,20 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) * reference to one overflow page at a time. * * @param[in] txn the transaction for this access. - * @param[in] pgno the page number for the page to retrieve. + * @param[in] pg0 the page number for the page to retrieve. + * @param[in] numpgs number of database pages (can be > 1 for overflow pages) * @param[out] ret address of a pointer where the page's address will be stored. * @return 0 on success, non-zero on failure. 
*/ static int -mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) +mdb_rpage_get(MDB_txn *txn, pgno_t pg0, int numpgs, MDB_page **ret) { MDB_env *env = txn->mt_env; MDB_page *p; MDB_ID3L tl = txn->mt_rpages; MDB_ID3L el = env->me_rpages; MDB_ID3 id3; + char *base; unsigned x, rem; pgno_t pgno; int rc, retries = 1; @@ -6184,19 +6884,26 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) pgno = pg0 ^ rem; id3.mid = 0; + id3.menc = NULL; + id3.muse = 0; x = mdb_mid3l_search(tl, pgno); if (x <= tl[0].mid && tl[x].mid == pgno) { if (x != tl[0].mid && tl[x+1].mid == pg0) x++; /* check for overflow size */ - p = (MDB_page *)((char *)tl[x].mptr + rem * env->me_psize); - if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) { - id3.mcnt = p->mp_pages + rem; + if (numpgs + rem > tl[x].mcnt) { + id3.mcnt = numpgs + rem; len = id3.mcnt * env->me_psize; SET_OFF(off, pgno * env->me_psize); MAP(rc, env, id3.mptr, len, off); if (rc) return rc; + /* setup for encryption */ + if (env->me_encfunc) { + id3.menc = malloc(len); + if (!id3.menc) + return ENOMEM; + } /* check for local-only page */ if (rem) { mdb_tassert(txn, tl[x].mid != pg0); @@ -6210,6 +6917,10 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) /* ignore the mapping we got from env, use new one */ tl[x].mptr = id3.mptr; tl[x].mcnt = id3.mcnt; + if (tl[x].menc) + mdb_rpage_dispose(env, &tl[x]); + tl[x].menc = id3.menc; + tl[x].muse = id3.muse; /* if no active ref, see if we can replace in env */ if (!tl[x].mref) { unsigned i; @@ -6220,6 +6931,10 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) munmap(el[i].mptr, el[i].mcnt * env->me_psize); el[i].mptr = tl[x].mptr; el[i].mcnt = tl[x].mcnt; + if (el[i].menc) + mdb_rpage_dispose(env, &el[i]); + el[i].menc = tl[x].menc; + el[i].muse = tl[x].muse; } else { /* there are others, remove ourself */ el[i].mref--; @@ -6230,7 +6945,14 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) } id3.mptr = tl[x].mptr; id3.mcnt = tl[x].mcnt; + id3.menc 
= tl[x].menc; + id3.muse = tl[x].muse; tl[x].mref++; + if (env->me_encfunc || env->me_sumfunc) { + rc = mdb_rpage_encsum(env, &id3, rem, numpgs); + if (rc) return rc; + tl[x].muse = id3.muse; + } goto ok; } @@ -6247,6 +6969,8 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) /* tmp overflow pages don't go to env */ if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + if (tl[i].menc) + mdb_rpage_dispose(env, &tl[i]); continue; } x = mdb_mid3l_search(el, tl[i].mid); @@ -6285,7 +7009,7 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) if ((env->me_flags & MDB_RDONLY) && pgno + MDB_RPAGE_CHUNK-1 > txn->mt_last_pgno) id3.mcnt = txn->mt_last_pgno + 1 - pgno; else - id3.mcnt = MDB_RPAGE_CHUNK; + id3.mcnt = numpgs + rem > MDB_RPAGE_CHUNK ? numpgs + rem : MDB_RPAGE_CHUNK; len = id3.mcnt * env->me_psize; id3.mid = pgno; @@ -6294,27 +7018,49 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) x = mdb_mid3l_search(el, pgno); if (x <= el[0].mid && el[x].mid == pgno) { id3.mptr = el[x].mptr; - id3.mcnt = el[x].mcnt; + id3.menc = el[x].menc; + id3.muse = el[x].muse; /* check for overflow size */ - p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); - if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { - id3.mcnt = p->mp_pages + rem; - len = id3.mcnt * env->me_psize; + if (id3.mcnt > el[x].mcnt) { SET_OFF(off, pgno * env->me_psize); MAP(rc, env, id3.mptr, len, off); if (rc) goto fail; + if (env->me_encfunc) { + id3.menc = malloc(len); + if (!id3.menc) { + rc = ENOMEM; + goto fail; + } + id3.muse = 0; + } if (!el[x].mref) { munmap(el[x].mptr, env->me_psize * el[x].mcnt); el[x].mptr = id3.mptr; el[x].mcnt = id3.mcnt; + if (el[x].menc) + mdb_rpage_dispose(env, &el[x]); + el[x].menc = id3.menc; + el[x].muse = id3.muse; } else { id3.mid = pg0; + if (env->me_encfunc || env->me_sumfunc) { + rc = mdb_rpage_encsum(env, &id3, rem, numpgs); + if (rc) + goto fail; + el[x].muse = id3.muse; + } 
pthread_mutex_unlock(&env->me_rpmutex); goto found; } } el[x].mref++; + if (env->me_encfunc || env->me_sumfunc) { + rc = mdb_rpage_encsum(env, &id3, rem, numpgs); + if (rc) + goto fail; + el[x].muse = id3.muse; + } pthread_mutex_unlock(&env->me_rpmutex); goto found; } @@ -6325,6 +7071,8 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) if (!el[i].mref) { if (!y) y = i; munmap(el[i].mptr, env->me_psize * el[i].mcnt); + if (el[i].menc) + mdb_rpage_dispose(env, &el[i]); } } if (!y) { @@ -6357,13 +7105,15 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) pthread_mutex_unlock(&env->me_rpmutex); return rc; } - /* check for overflow size */ - p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); - if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { - id3.mcnt = p->mp_pages + rem; - munmap(id3.mptr, len); - len = id3.mcnt * env->me_psize; - MAP(rc, env, id3.mptr, len, off); + if (env->me_encfunc) { + id3.menc = malloc(len); + if (!id3.menc) { + rc = ENOMEM; + goto fail; + } + } + if (env->me_encfunc || env->me_sumfunc) { + rc = mdb_rpage_encsum(env, &id3, rem, numpgs); if (rc) goto fail; } @@ -6375,14 +7125,137 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) return MDB_TXN_FULL; } ok: - p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); + base = (char *)(env->me_encfunc ? 
id3.menc : id3.mptr); + p = (MDB_page *)(base + rem * env->me_psize); + rc = MDB_SUCCESS; + if (env->me_encfunc || env->me_sumfunc) { + rc = mdb_rpage_encsum(env, &id3, rem, numpgs); + } #if MDB_DEBUG /* we don't need this check any more */ if (IS_OVERFLOW(p)) { mdb_tassert(txn, p->mp_pages + rem <= id3.mcnt); } #endif *ret = p; - return MDB_SUCCESS; + return rc; +} + +static int mdb_page_encrypt(MDB_env *env, MDB_page *dp, MDB_page *encp, size_t size) +{ + MDB_val in, out, enckeys[3]; + int xsize = sizeof(pgno_t) + sizeof(txnid_t); + in.mv_size = size - xsize; + in.mv_data = (char *)dp + xsize; + out.mv_size = in.mv_size; + out.mv_data = (char *)encp + xsize; + if (env->me_esumsize) { + in.mv_size -= env->me_esumsize; + out.mv_size -= env->me_esumsize; + enckeys[2].mv_size = env->me_esumsize; + enckeys[2].mv_data = (char *)out.mv_data + out.mv_size; + } else { + enckeys[2].mv_size = 0; + enckeys[2].mv_data = 0; + } + encp->mp_pgno = dp->mp_pgno; + encp->mp_txnid = dp->mp_txnid; + enckeys[0] = env->me_enckey; + enckeys[1].mv_size = xsize; + enckeys[1].mv_data = dp; + return env->me_encfunc(&in, &out, enckeys, 1); +} + +static int mdb_rpage_decrypt(MDB_env *env, MDB_ID3 *id3, int rem, int numpgs) +{ + int rc = 0; + if (!(id3->muse & (1 << rem))) { + MDB_val in, out, enckeys[3]; + int bit; + int xsize = sizeof(pgno_t) + sizeof(txnid_t); + + /* If this is an overflow page, set all use bits to the end */ + if (rem + numpgs > MDB_RPAGE_CHUNK) + bit = 0xffff; + else + bit = 1; + + id3->muse |= (bit << rem); + in.mv_size = numpgs * env->me_psize - xsize; + in.mv_data = (char *)id3->mptr + rem * env->me_psize + xsize; + enckeys[0] = env->me_enckey; + enckeys[1].mv_size = xsize; + enckeys[1].mv_data = (char *)in.mv_data - xsize; + if (env->me_esumsize) { + in.mv_size -= env->me_esumsize; + enckeys[2].mv_size = env->me_esumsize; + enckeys[2].mv_data = (char *)in.mv_data + in.mv_size; + } else { + enckeys[2].mv_size = 0; + enckeys[2].mv_data = 0; + } + out.mv_size = 
in.mv_size; + out.mv_data = (char *)id3->menc + rem * env->me_psize + xsize; + if (env->me_encfunc(&in, &out, enckeys, 0)) + rc = MDB_CRYPTO_FAIL; + else { + MDB_page *penc, *pclr; + penc = (MDB_page *)enckeys[1].mv_data; + pclr = (MDB_page *)((char *)out.mv_data - xsize); + pclr->mp_pgno = penc->mp_pgno; + pclr->mp_txnid = penc->mp_txnid; + } + } + return rc; +} + +/** zero out decrypted pages before freeing them */ +static void mdb_rpage_dispose(MDB_env *env, MDB_ID3 *id3) +{ + char *base = id3->menc; + int i, j; + for (i=0, j=1; i<15; i++) { + if (id3->muse & j) + memset(base, 0, env->me_psize); + j <<= 1; + base += env->me_psize; + } + if (id3->muse & j) { + i = id3->mcnt - (MDB_RPAGE_CHUNK - 1); + memset(base, 0, i * env->me_psize); + } + free(id3->menc); +} + +static void mdb_page_set_checksum(MDB_env *env, MDB_page *mp, size_t size) +{ + MDB_val src, dst, *key; + src.mv_size = size - env->me_sumsize; + src.mv_data = mp; + dst.mv_size = env->me_sumsize; + dst.mv_data = (char *)src.mv_data + src.mv_size; + if (env->me_encfunc) + key = &env->me_enckey; + else + key = NULL; + env->me_sumfunc(&src, &dst, key); +} + +static int mdb_page_chk_checksum(MDB_env *env, MDB_page *mp, size_t size) +{ + MDB_val src, dst, chk, *key; + char sumbuf[256]; + src.mv_size = size - env->me_sumsize; + src.mv_data = mp; + chk.mv_size = env->me_sumsize; + chk.mv_data = (char *)src.mv_data + src.mv_size; + dst.mv_size = env->me_sumsize; + dst.mv_data = sumbuf; + if (env->me_encfunc) + key = &env->me_enckey; + else + key = NULL; + env->me_sumfunc(&src, &dst, key); + return memcmp(chk.mv_data, dst.mv_data, env->me_sumsize) ? MDB_BAD_CHECKSUM : 0; } #endif @@ -6390,35 +7263,28 @@ mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) * Set #MDB_TXN_ERROR on failure. * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. 
+ * @param[in] numpgs number of database pages (can be > 1 for overflow pages) * @param[out] ret address of a pointer where the page's address will be stored. - * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. * @return 0 on success, non-zero on failure. */ static int -mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) +mdb_page_get(MDB_cursor *mc, pgno_t pgno, +#if MDB_RPAGE_CACHE + int numpgs, +#endif + MDB_page **ret) { - MDB_txn *txn = mc->mc_txn; + MDB_txn *txn = mc->mc_txn, *tx2; MDB_page *p = NULL; - int level; if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { + for (tx2 = txn;; ) { MDB_ID2L dl = tx2->mt_u.dirty_list; + MDB_IDL sl; unsigned x; - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). + /* tx2 may have malloced its own "dirty" version of the + * page, with the same page number. */ - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - goto mapped; - } - } if (dl[0].mid) { unsigned x = mdb_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { @@ -6426,8 +7292,23 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) goto done; } } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); + /* Spilled pages were dirtied in this txn, then cleaned + * and flushed to the map when dirty_list got full. + * Check if tx2 spilled the page before moving on to + * search the parent. (But don't unspill here, leave + * that unless page_touch happens again.) 
+ */ + sl = tx2->mt_spill_pgs; + if ((tx2 = tx2->mt_parent) == NULL) + break; + if (sl) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(sl, pn); + if (x <= sl[0] && sl[x] == pn) { + goto mapped; + } + } + } } if (pgno >= txn->mt_next_pgno) { @@ -6436,26 +7317,23 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) return MDB_PAGE_NOTFOUND; } - level = 0; - mapped: - { -#ifdef MDB_VL32 - int rc = mdb_rpage_get(txn, pgno, &p); +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(txn->mt_env->me_flags)) { + int rc = mdb_rpage_get(txn, pgno, numpgs, &p); if (rc) { txn->mt_flags |= MDB_TXN_ERROR; return rc; } -#else + } else +#endif + { MDB_env *env = txn->mt_env; p = (MDB_page *)(env->me_map + env->me_psize * pgno); -#endif } done: *ret = p; - if (lvl) - *lvl = level; return MDB_SUCCESS; } @@ -6512,7 +7390,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, i < NUMKEYS(mp)); node = NODEPTR(mp, i); - if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) + if ((rc = MDB_PAGE_GET(mc, NODEPGNO(node), 1, &mp)) != 0) return rc; mc->mc_ki[mc->mc_top] = i; @@ -6555,7 +7433,7 @@ mdb_page_search_lowest(MDB_cursor *mc) MDB_node *node = NODEPTR(mp, 0); int rc; - if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) + if ((rc = MDB_PAGE_GET(mc, NODEPGNO(node), 1, &mp)) != 0) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -6631,19 +7509,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE if (mc->mc_pg[0]) MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]); #endif - if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) + if ((rc = MDB_PAGE_GET(mc, root, 1, &mc->mc_pg[0])) != 0) return rc; } -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE + if (MDB_REMAPPING(mc->mc_txn->mt_env->me_flags)) { int i; for (i=1; i<mc->mc_snum; i++) - MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]); + mdb_page_unref(mc->mc_txn, mc->mc_pg[i]); } #endif
mc->mc_snum = 1; @@ -6670,8 +7549,8 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) pgno_t pg = mp->mp_pgno; unsigned x = 0, ovpages = mp->mp_pages; MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; MDB_ID pn = pg << 1; + MDB_page *freeme = NULL; int rc; DPRINTF(("free ov page %"Yu" (%d)", pg, ovpages)); @@ -6683,24 +7562,26 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) * Unsupported in nested txns: They would need to hide the page * range in ancestor txns' dirty and spilled lists. */ - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { + if (IS_MUTABLE(txn, mp) && env->me_pghead && !txn->mt_parent) { unsigned i, j; pgno_t *mop; MDB_ID2 *dl, ix, iy; rc = mdb_midl_need(&env->me_pghead, ovpages); if (rc) return rc; - if (!(mp->mp_flags & P_DIRTY)) { + if (!IS_DIRTY_NW(txn, mp)) { /* spilled or WRITEMAP */ + MDB_IDL sl = txn->mt_spill_pgs; + if (sl) { + x = mdb_midl_search(sl, pn); + if (! 
(x <= sl[0] && sl[x] == pn)) + return MDB_PROBLEM; /* This page is no longer spilled */ if (x == sl[0]) sl[0]--; else sl[x] |= 1; - goto release; + } + goto release; } /* Remove from dirty list */ dl = txn->mt_u.dirty_list; @@ -6719,8 +7600,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) } } txn->mt_dirty_room++; - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); + freeme = mp; release: /* Insert in me_pghead */ mop = env->me_pghead; @@ -6735,11 +7615,15 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) if (rc) return rc; } -#ifdef MDB_VL32 - if (mc->mc_ovpg == mp) - mc->mc_ovpg = NULL; -#endif + if (MC_OVPG(mc) == mp) + MC_SET_OVPG(mc, NULL); mc->mc_db->md_overflow_pages -= ovpages; + + if (MDB_REMAPPING(env->me_flags)) + mdb_ovpage_unref_all(mc, mp); + if (freeme) + mdb_dpage_free(env, freeme); + return 0; } @@ -6753,27 +7637,28 @@ static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ - pgno_t pgno; + MDB_ovpage ovp; int rc; if (MC_OVPG(mc)) { MDB_PAGE_UNREF(mc->mc_txn, MC_OVPG(mc)); MC_SET_OVPG(mc, NULL); } + data->mv_size = NODEDSZ(leaf); if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); data->mv_data = NODEDATA(leaf); return MDB_SUCCESS; } /* Read overflow data. 
*/ - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Yu" failed", pgno)); + memcpy(&ovp, NODEDATA(leaf), sizeof(ovp)); + { + if ((rc = MDB_PAGE_GET(mc, ovp.op_pgno, ovp.op_pages, &omp)) != 0) { + DPRINTF(("read overflow page %"Yu" failed", ovp.op_pgno)); return rc; } + } data->mv_data = METADATA(omp); MC_SET_OVPG(mc, omp); @@ -6799,7 +7684,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, mdb_cursor_init(&mc, txn, dbi, &mx); rc = mdb_cursor_set(&mc, key, data, MDB_SET, &exact); - /* unref all the pages when MDB_VL32 - caller must copy the data + /* unref all the pages when MDB_REMAP_CHUNKS - caller must copy the data * before doing anything else */ MDB_CURSOR_UNREF(&mc, 1); @@ -6820,7 +7705,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) int rc; MDB_node *indx; MDB_page *mp; -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE MDB_page *op; #endif @@ -6828,7 +7713,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) return MDB_NOTFOUND; /* root has no siblings */ } -#ifdef MDB_VL32 +#if MDB_RPAGE_CACHE op = mc->mc_pg[mc->mc_top]; #endif mdb_cursor_pop(mc); @@ -6858,7 +7743,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) MDB_PAGE_UNREF(mc->mc_txn, op); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { + if ((rc = MDB_PAGE_GET(mc, NODEPGNO(indx), 1, &mp)) != 0) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; @@ -6947,16 +7832,12 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdb_xcursor_init1(mc, leaf); - } - if (data) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } else if (data) { if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; - - if 
(F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } } MDB_GET_KEY(leaf, key); @@ -6980,7 +7861,8 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_db->md_flags & MDB_DUPSORT) { + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { if (op == MDB_PREV || op == MDB_PREV_DUP) { @@ -7022,27 +7904,25 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + if (!IS_LEAF(mp)) + return MDB_CORRUPTED; + if (IS_LEAF2(mp)) { key->mv_size = mc->mc_db->md_pad; key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); return MDB_SUCCESS; } - mdb_cassert(mc, IS_LEAF(mp)); leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdb_xcursor_init1(mc, leaf); - } - if (data) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } else if (data) { if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } } MDB_GET_KEY(leaf, key); @@ -7200,24 +8080,22 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else 
{ + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; - } else { - ex2p = NULL; - } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (rc != MDB_SUCCESS) - return rc; + ex2p = NULL; } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (data) { + if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { MDB_val olddata; MDB_cmp_func *dcmp; if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) @@ -7275,22 +8153,23 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + if ( key ) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + } return MDB_SUCCESS; } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) - return rc; - } + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; } + MDB_GET_KEY(leaf, key); return MDB_SUCCESS; } @@ -7319,21 +8198,21 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + if (key) { + key->mv_size = 
mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + } return MDB_SUCCESS; } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) - return rc; - } + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else if (data) { + if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) + return rc; } MDB_GET_KEY(leaf, key); @@ -7700,7 +8579,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Too big for a node, insert in sub-DB. Set up an empty * "old sub-page" for prep_subDB to expand to a full page. */ - fp_flags = P_LEAF|P_DIRTY; + fp_flags = P_LEAF; fp = env->me_pbuf; fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); @@ -7754,6 +8633,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned i, offset = 0; mp = fp = xdata.mv_data = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + mp->mp_txnid = mc->mc_txn->mt_txnid; /* Was a single item before, must convert now */ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -7777,7 +8657,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; + fp->mp_flags = P_LEAF|P_SUBP; fp->mp_lower = (PAGEHDRSZ-PAGEBASE); xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; if (mc->mc_db->md_flags & MDB_DUPFIXED) { @@ -7811,7 +8691,6 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } /* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */ case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; COPY_PGNO(fp->mp_pgno, 
mp->mp_pgno); mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; @@ -7844,13 +8723,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, xdata.mv_data = &dummy; if ((rc = mdb_page_alloc(mc, 1, &mp))) return rc; + fp_flags |= mp->mp_flags; /* P_ADM_FLAGS */ offset = env->me_psize - olddata.mv_size; flags |= F_DUPDATA|F_SUBDATA; dummy.md_root = mp->mp_pgno; sub_root = mp; } if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_flags = fp_flags; mp->mp_pad = fp->mp_pad; mp->mp_lower = fp->mp_lower; mp->mp_upper = fp->mp_upper + offset; @@ -7879,38 +8759,33 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* overflow page overwrites need special handling */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + MDB_ovpage ovp; + int ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) + memcpy(&ovp, olddata.mv_data, sizeof(ovp)); + if ((rc2 = MDB_PAGE_GET(mc, ovp.op_pgno, ovp.op_pages, &omp)) != 0) return rc2; - ovpages = omp->mp_pages; - - /* Is the ov page large enough? */ - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) && - (level || (env->me_flags & MDB_WRITEMAP))) - { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (rc) - return rc; - level = 0; /* dirty in this txn or clean */ - } - /* Is it dirty? */ - if (omp->mp_flags & P_DIRTY) { + ovpages = ovp.op_pages; + + /* Is the ov page big enough and from this txn (or a parent)? */ + if (ovpages >= dpages && IS_MUTABLE(mc->mc_txn, omp)) { /* yes, overwrite it. Note in this case we don't * bother to try shrinking the page if the new data * is smaller than the overflow threshold. 
*/ - if (level > 1) { + if (!IS_WRITABLE(mc->mc_txn, omp)) { + if (!IS_DIRTY_NW(mc->mc_txn, omp)) { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + } else { /* It is writable only in a parent txn */ size_t sz = (size_t) env->me_psize * ovpages, off; - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages, 1); MDB_ID2 id2; if (!np) return ENOMEM; - id2.mid = pg; + id2.mid = ovp.op_pgno; id2.mptr = np; /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); @@ -7930,7 +8805,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, sz = PAGEHDRSZ; } memcpy(np, omp, sz); /* Copy beginning of page */ + SET_PGTXNID(mc->mc_txn, np); omp = np; + } } SETDSZ(leaf, data->mv_size); if (F_ISSET(flags, MDB_RESERVE)) @@ -7938,7 +8815,6 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, else memcpy(METADATA(omp), data->mv_data, data->mv_size); return MDB_SUCCESS; - } } if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) return rc2; @@ -8008,7 +8884,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, xdata.mv_size = 0; xdata.mv_data = ""; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags == MDB_CURRENT) { + if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) { xflags = MDB_CURRENT|MDB_NOSPILL; } else { mdb_xcursor_init1(mc, leaf); @@ -8112,6 +8988,8 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) return rc; mp = mc->mc_pg[mc->mc_top]; + if (!IS_LEAF(mp)) + return MDB_CORRUPTED; if (IS_LEAF2(mp)) goto del_key; leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); @@ -8173,10 +9051,10 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) /* add overflow pages to free list */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; - pgno_t pg; + MDB_ovpage ovp; - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || + memcpy(&ovp, NODEDATA(leaf), 
sizeof(ovp)); + if ((rc = MDB_PAGE_GET(mc, ovp.op_pgno, ovp.op_pages, &omp)) || (rc = mdb_ovpage_free(mc, omp))) goto fail; } @@ -8208,9 +9086,13 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) return rc; DPRINTF(("allocated new mpage %"Yu", page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize)); - np->mp_flags = flags | P_DIRTY; + np->mp_flags |= flags; np->mp_lower = (PAGEHDRSZ-PAGEBASE); np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; +#if MDB_RPAGE_CACHE + np->mp_upper -= mc->mc_txn->mt_env->me_sumsize; + np->mp_upper -= mc->mc_txn->mt_env->me_esumsize; +#endif if (IS_BRANCH(np)) mc->mc_db->md_branch_pages++; @@ -8244,7 +9126,7 @@ mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) sz = LEAFSIZE(key, data); if (sz > env->me_nodemax) { /* put on overflow page */ - sz -= data->mv_size - sizeof(pgno_t); + sz -= data->mv_size - sizeof(MDB_ovpage); } return EVEN(sz + sizeof(indx_t)); @@ -8269,7 +9151,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) if (sz > env->me_nodemax) { /* put on overflow page */ /* not implemented */ - /* sz -= key->size - sizeof(pgno_t); */ + /* sz -= key->size - sizeof(MDB_ovpage); */ } return sz + sizeof(indx_t); @@ -8336,14 +9218,14 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, mdb_cassert(mc, key && data); if (F_ISSET(flags, F_BIGDATA)) { /* Data already on overflow page. */ - node_size += sizeof(pgno_t); + node_size += sizeof(MDB_ovpage); } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. 
*/ DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", data->mv_size, node_size+data->mv_size)); - node_size = EVEN(node_size + sizeof(pgno_t)); + node_size = EVEN(node_size + sizeof(MDB_ovpage)); if ((ssize_t)node_size > room) goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) @@ -8387,13 +9269,17 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, ndata = NODEDATA(node); if (ofp == NULL) { if (F_ISSET(flags, F_BIGDATA)) - memcpy(ndata, data->mv_data, sizeof(pgno_t)); + memcpy(ndata, data->mv_data, sizeof(MDB_ovpage)); else if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = ndata; else memcpy(ndata, data->mv_data, data->mv_size); } else { - memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); + MDB_ovpage ovp; + ovp.op_pgno = ofp->mp_pgno; + ovp.op_txnid = mc->mc_txn->mt_txnid; + ovp.op_pages = ofp->mp_pages; + memcpy(ndata, &ovp, sizeof(MDB_ovpage)); ndata = METADATA(ofp); if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = ndata; @@ -8447,7 +9333,7 @@ mdb_node_del(MDB_cursor *mc, int ksize) sz = NODESIZE + node->mn_ksize; if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); + sz += sizeof(MDB_ovpage); else sz += NODEDSZ(node); } @@ -8776,6 +9662,18 @@ mdb_cursor_dbi(MDB_cursor *mc) return mc->mc_dbi; } +int +mdb_cursor_is_db(MDB_cursor *mc) +{ + + if (mc && (mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_SUBDATA) + return 1; + } + return 0; +} + /** Replace the key for a branch node with a new key. * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. 
@@ -9307,7 +10205,7 @@ mdb_rebalance(MDB_cursor *mc) mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdb_page_loose(mc, mp); if (rc) return rc; /* Adjust cursors pointing to mp */ @@ -9335,11 +10233,11 @@ mdb_rebalance(MDB_cursor *mc) } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { int i; DPUTS("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdb_page_loose(mc, mp); if (rc) return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); + rc = MDB_PAGE_GET(mc, mc->mc_db->md_root, 1, &mc->mc_pg[0]); if (rc) return rc; mc->mc_db->md_depth--; @@ -9400,7 +10298,7 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + rc = MDB_PAGE_GET(mc, NODEPGNO(node), 1, &mn.mc_pg[mn.mc_top]); if (rc) return rc; mn.mc_ki[mn.mc_top] = 0; @@ -9412,7 +10310,7 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + rc = MDB_PAGE_GET(mc, NODEPGNO(node), 1, &mn.mc_pg[mn.mc_top]); if (rc) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; @@ -9490,60 +10388,68 @@ mdb_cursor_del0(MDB_cursor *mc) } } rc = mdb_rebalance(mc); + if (rc) + goto fail; - if (rc == MDB_SUCCESS) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by mdb_rebalance and aren't needed here. - */ - if (!mc->mc_snum) - return rc; + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdb_rebalance and aren't needed here. 
+ */ + if (!mc->mc_snum) { + mc->mc_flags |= C_EOF; + return rc; + } - mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - continue; - } + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; } - if (mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. - * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 
- */ - if (node->mn_flags & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node->mn_flags & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } else { - mdb_xcursor_init1(m3, node); - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } + if (rc) + goto fail; + } + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. + */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + mdb_xcursor_init1(m3, node); + rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (rc) + goto fail; } } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; } } } - mc->mc_flags |= C_DEL; } + mc->mc_flags |= C_DEL; +fail: if (rc) mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return rc; @@ -9647,7 +10553,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); /* Create a right sibling. 
*/ - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + rc = mdb_page_new(mc, mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2), 1, &rp); + if (rc) return rc; rp->mp_pad = mp->mp_pad; DPRINTF(("new right sibling: page %"Yu, rp->mp_pgno)); @@ -9751,7 +10658,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno nsize = EVEN(nsize); /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc->mc_txn, 1); + copy = mdb_page_malloc(mc->mc_txn, 1, 1); if (copy == NULL) { rc = ENOMEM; goto done; @@ -9803,7 +10710,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); + psize += sizeof(MDB_ovpage); else psize += NODEDSZ(node); } @@ -10207,7 +11114,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) mc.mc_txn = my->mc_txn; mc.mc_flags = my->mc_txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); - rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); + rc = MDB_PAGE_GET(&mc, *pg, 1, &mc.mc_pg[0]); if (rc) return rc; rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); @@ -10240,7 +11147,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) ni = NODEPTR(mp, i); if (ni->mn_flags & F_BIGDATA) { MDB_page *omp; - pgno_t pg; + MDB_ovpage ovp; /* Need writable leaf */ if (mp != leaf) { @@ -10250,9 +11157,8 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) ni = NODEPTR(mp, i); } - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdb_page_get(&mc, pg, &omp, NULL); + memcpy(&ovp, NODEDATA(ni), sizeof(ovp)); + rc = MDB_PAGE_GET(&mc, ovp.op_pgno, ovp.op_pages, &omp); if (rc) goto done; if (my->mc_wlen[toggle] >= MDB_WBUF) { @@ -10263,11 +11169,13 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - 
my->mc_next_pgno += omp->mp_pages; + ovp.op_pgno = my->mc_next_pgno; + ovp.op_txnid = 1; + memcpy(NODEDATA(ni), &ovp, sizeof(ovp)); + my->mc_next_pgno += ovp.op_pages; my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + if (ovp.op_pages > 1) { + my->mc_olen[toggle] = my->mc_env->me_psize * (ovp.op_pages - 1); my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; rc = mdb_env_cthr_toggle(my, 1); if (rc) @@ -10302,7 +11210,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) again: ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); pg = NODEPGNO(ni); - rc = mdb_page_get(&mc, pg, &mp, NULL); + rc = MDB_PAGE_GET(&mc, pg, 1, &mp); if (rc) goto done; mc.mc_top++; @@ -10328,6 +11236,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdb_page_copy(mo, mp, my->mc_env->me_psize); mo->mp_pgno = my->mc_next_pgno++; + mo->mp_txnid = 1; my->mc_wlen[toggle] += my->mc_env->me_psize; if (mc.mc_top) { /* Update parent if there is one */ @@ -10635,7 +11544,7 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg) if (!env || !arg) return EINVAL; - *arg = env->me_flags & (CHANGEABLE|CHANGELESS); + *arg = env->me_flags & EXPOSED; return MDB_SUCCESS; } @@ -10665,6 +11574,43 @@ mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) return MDB_SUCCESS; } +#if MDB_RPAGE_CACHE +int ESECT +mdb_env_set_encrypt(MDB_env *env, MDB_enc_func *func, const MDB_val *key, unsigned int size) +{ + char *kdata; + + if (!env || !func || !key) + return EINVAL; + if (env->me_flags & MDB_ENV_ACTIVE) + return EINVAL; + if (! 
(kdata = malloc(key[0].mv_size))) + return ENOMEM; + + memcpy(kdata, key->mv_data, key->mv_size); + free(env->me_enckey.mv_data); + env->me_enckey.mv_data = kdata; + env->me_enckey.mv_size = key->mv_size; + env->me_encfunc = func; + if (size) + env->me_esumsize = size; + env->me_flags |= MDB_REMAP_CHUNKS | MDB_ENCRYPT; + return MDB_SUCCESS; +} + +int ESECT +mdb_env_set_checksum(MDB_env *env, MDB_sum_func *func, unsigned int size) +{ + if (!env || !func || !size) + return EINVAL; + if (env->me_flags & MDB_ENV_ACTIVE) + return EINVAL; + env->me_sumfunc = func; + env->me_sumsize = size; + return MDB_SUCCESS; +} +#endif + int ESECT mdb_env_get_path(MDB_env *env, const char **arg) { @@ -10685,6 +11631,19 @@ mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) return MDB_SUCCESS; } +int ESECT +mdb_env_set_pagesize(MDB_env *env, int size) +{ + if (!env || env->me_map) + return EINVAL; + if (size > MAX_PAGESIZE || size < 256) + return EINVAL; + if (size & (size-1)) + return EINVAL; + env->me_os_psize = size; + return MDB_SUCCESS; +} + /** Common code for #mdb_stat() and #mdb_env_stat(). * @param[in] env the environment to operate in. * @param[in] db the #MDB_db record containing the stats to return. @@ -10768,7 +11727,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db int rc, dbflag, exact; unsigned int unused = 0, seq; char *namedup; - size_t len; + size_t size; if (flags & ~VALID_FLAGS) return EINVAL; @@ -10795,15 +11754,15 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db } /* Is the DB already open? 
*/ - len = strlen(name); + size = strlen(name) + 1; for (i=CORE_DBS; imt_numdbs; i++) { if (!txn->mt_dbxs[i].md_name.mv_size) { /* Remember this free slot */ if (!unused) unused = i; continue; } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + if (size == txn->mt_dbxs[i].md_name.mv_size && + !strcmp(name, txn->mt_dbxs[i].md_name.mv_data)) { *dbi = i; return MDB_SUCCESS; } @@ -10820,7 +11779,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db /* Find the DB info */ dbflag = DB_NEW|DB_VALID|DB_USRVALID; exact = 0; - key.mv_size = len; + key.mv_size = size; key.mv_data = (void *)name; mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); @@ -10858,7 +11817,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db /* Got info, register DBI in this txn */ unsigned int slot = unused ? unused : txn->mt_numdbs; txn->mt_dbxs[slot].md_name.mv_data = namedup; - txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_name.mv_size = size; txn->mt_dbxs[slot].md_rel = NULL; txn->mt_dbflags[slot] = dbflag; /* txn-> and env-> are the same in read txns, use @@ -10948,11 +11907,12 @@ mdb_drop0(MDB_cursor *mc, int subs) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); -#ifdef MDB_VL32 + if (MDB_REMAPPING(mc->mc_txn->mt_env->me_flags)) { /* bump refcount for mx's pages */ for (i=0; imc_snum; i++) - mdb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL); -#endif + MDB_PAGE_GET(&mx, mc->mc_pg[i]->mp_pgno, 1, &mx.mc_pg[i]); + } + while (mc->mc_snum > 0) { MDB_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = NUMKEYS(mp); @@ -10960,18 +11920,13 @@ mdb_drop0(MDB_cursor *mc, int subs) for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(mc, pg, &omp, NULL); - if (rc != 0) - goto done; - mdb_cassert(mc, IS_OVERFLOW(omp)); + MDB_ovpage ovp; + 
memcpy(&ovp, NODEDATA(ni), sizeof(ovp)); rc = mdb_midl_append_range(&txn->mt_free_pgs, - pg, omp->mp_pages); + ovp.op_pgno, ovp.op_pages); if (rc) goto done; - mc->mc_db->md_overflow_pages -= omp->mp_pages; + mc->mc_db->md_overflow_pages -= ovp.op_pages; if (!mc->mc_db->md_overflow_pages && !subs) break; } else if (subs && (ni->mn_flags & F_SUBDATA)) { diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_copy.1 b/dependencies/lmdb/libraries/liblmdb/mdb_copy.1 index d2593e2f46..db0c973a5e 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_copy.1 +++ b/dependencies/lmdb/libraries/liblmdb/mdb_copy.1 @@ -1,5 +1,5 @@ -.TH MDB_COPY 1 "2017/07/31" "LMDB 0.9.70" -.\" Copyright 2012-2020 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_COPY 1 "2017/07/31" "LMDB 0.9.90" +.\" Copyright 2012-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_copy \- LMDB environment copy tool @@ -13,6 +13,10 @@ mdb_copy \- LMDB environment copy tool .BR \-n ] [\c .BR \-v ] +[\c +.BI \-m \ module +[\c +.BI \-w \ password\fR]] .B srcpath [\c .BR dstpath ] @@ -46,6 +50,15 @@ Open LDMB environment(s) which do not use subdirectories. .BR \-v Use the previous environment state instead of the latest state. This may be useful if the latest state has been corrupted. +.TP +.BI \-m \ module +Load the specified dynamic module to utilize cryptographic functions. +This is required to operate on environments that have been configured +with page-level checksums or encryption. +.TP +.BI \-w \ password +Specify the password for an encrypted environment. This is only +used if a cryptography module has been loaded. .SH DIAGNOSTICS Exit status is zero if no errors occur. 
diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_copy.c b/dependencies/lmdb/libraries/liblmdb/mdb_copy.c index cfcd13ac30..c230caa2cb 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_copy.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb_copy.c @@ -1,6 +1,6 @@ /* mdb_copy.c - memory-mapped database backup tool */ /* - * Copyright 2012-2020 Howard Chu, Symas Corp. + * Copyright 2012-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -21,6 +21,7 @@ #include #include #include "lmdb.h" +#include "module.h" static void sighandle(int sig) @@ -34,6 +35,9 @@ int main(int argc,char * argv[]) const char *progname = argv[0], *act; unsigned flags = MDB_RDONLY; unsigned cpflags = 0; + char *module = NULL, *password = NULL; + void *mlm = NULL; + char *errmsg; for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') @@ -45,15 +49,24 @@ int main(int argc,char * argv[]) else if (argv[1][1] == 'V' && argv[1][2] == '\0') { printf("%s\n", MDB_VERSION_STRING); exit(0); + } else if (argv[1][1] == 'm' && argv[1][2] == '\0') { + module = argv[2]; + argc--; + argv++; + } else if (argv[1][1] == 'w' && argv[1][2] == '\0') { + password = argv[2]; + argc--; + argv++; } else argc = 0; } if (argc<2 || argc>3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] [-v] srcpath [dstpath]\n", progname); + fprintf(stderr, "usage: %s [-V] [-c] [-n] [-v] [-m module [-w password]] srcpath [dstpath]\n", progname); exit(EXIT_FAILURE); } + #ifdef SIGPIPE signal(SIGPIPE, sighandle); #endif @@ -66,6 +79,13 @@ int main(int argc,char * argv[]) act = "opening environment"; rc = mdb_env_create(&env); if (rc == MDB_SUCCESS) { + if (module) { + mlm = mlm_setup(env, module, password, &errmsg); + if (!mlm) { + fprintf(stderr, "Failed to load crypto module: %s\n", errmsg); + exit(EXIT_FAILURE); + } + } rc = mdb_env_open(env, argv[1], flags, 0600); } if (rc == MDB_SUCCESS) { @@ -79,6 +99,8 @@ int 
main(int argc,char * argv[]) fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc, mdb_strerror(rc)); mdb_env_close(env); + if (mlm) + mlm_unload(mlm); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_drop.1 b/dependencies/lmdb/libraries/liblmdb/mdb_drop.1 index be54744070..36a73e1088 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_drop.1 +++ b/dependencies/lmdb/libraries/liblmdb/mdb_drop.1 @@ -1,5 +1,5 @@ -.TH MDB_DROP 1 "2017/11/19" "LMDB 0.9.70" -.\" Copyright 2014-2020 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_DROP 1 "2017/11/19" "LMDB 0.9.90" +.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_drop \- LMDB database delete tool @@ -12,6 +12,10 @@ mdb_drop \- LMDB database delete tool [\c .BR \-d ] [\c +.BI \-m \ module +[\c +.BI \-w \ password\fR]] +[\c .BI \-s \ subdb\fR] .BR \ envpath .SH DESCRIPTION @@ -30,6 +34,15 @@ Operate on an LMDB database which does not use subdirectories. .BR \-d Delete the specified database, don't just empty it. .TP +.BI \-m \ module +Load the specified dynamic module to utilize cryptographic functions. +This is required to operate on environments that have been configured +with page-level checksums or encryption. +.TP +.BI \-w \ password +Specify the password for an encrypted environment. This is only +used if a cryptography module has been loaded. +.TP .BR \-s \ subdb Operate on a specific subdatabase. If no database is specified, only the main database is dropped. .SH DIAGNOSTICS diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_drop.c b/dependencies/lmdb/libraries/liblmdb/mdb_drop.c index 3d9d779b5b..54e91332f0 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_drop.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb_drop.c @@ -1,6 +1,6 @@ /* mdb_drop.c - memory-mapped database delete tool */ /* - * Copyright 2016-2020 Howard Chu, Symas Corp. 
+ * Copyright 2016-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -19,6 +19,7 @@ #include #include #include "lmdb.h" +#include "module.h" static volatile sig_atomic_t gotsig; @@ -29,7 +30,7 @@ static void dumpsig( int sig ) static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-n] [-d] [-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-n] [-d] [-m module [-w password]] [-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -43,6 +44,9 @@ int main(int argc, char *argv[]) char *envname; char *subname = NULL; int envflags = 0, delete = 0; + char *module = NULL, *password = NULL; + void *mlm = NULL; + char *errmsg; if (argc < 2) { usage(prog); @@ -54,7 +58,7 @@ int main(int argc, char *argv[]) * -V: print version and exit * (default) empty the main DB */ - while ((i = getopt(argc, argv, "dns:V")) != EOF) { + while ((i = getopt(argc, argv, "dm:ns:w:V")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); @@ -69,6 +73,12 @@ int main(int argc, char *argv[]) case 's': subname = optarg; break; + case 'm': + module = optarg; + break; + case 'w': + password = optarg; + break; default: usage(prog); } @@ -92,6 +102,13 @@ int main(int argc, char *argv[]) fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); return EXIT_FAILURE; } + if (module) { + mlm = mlm_setup(env, module, password, &errmsg); + if (!mlm) { + fprintf(stderr, "Failed to load crypto module: %s\n", errmsg); + goto env_close; + } + } mdb_env_set_maxdbs(env, 2); @@ -130,6 +147,8 @@ int main(int argc, char *argv[]) mdb_txn_abort(txn); env_close: mdb_env_close(env); + if (mlm) + mlm_unload(mlm); return rc ? 
EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_dump.1 b/dependencies/lmdb/libraries/liblmdb/mdb_dump.1 index 1279c9c54c..acefe4e719 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_dump.1 +++ b/dependencies/lmdb/libraries/liblmdb/mdb_dump.1 @@ -1,5 +1,5 @@ -.TH MDB_DUMP 1 "2017/07/31" "LMDB 0.9.70" -.\" Copyright 2014-2020 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_DUMP 1 "2017/07/31" "LMDB 0.9.90" +.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_dump \- LMDB environment export tool @@ -18,6 +18,10 @@ mdb_dump \- LMDB environment export tool [\c .BR \-p ] [\c +.BI \-m \ module +[\c +.BI \-w \ password\fR]] +[\c .BR \-a \ | .BI \-s \ subdb\fR] .BR \ envpath @@ -57,6 +61,15 @@ Note: different systems may have different notions about what characters are considered printing characters, and databases dumped in this manner may be less portable to external systems. .TP +.BI \-m \ module +Load the specified dynamic module to utilize cryptographic functions. +This is required to operate on environments that have been configured +with page-level checksums or encryption. +.TP +.BI \-w \ password +Specify the password for an encrypted environment. This is only +used if a cryptography module has been loaded. +.TP .BR \-a Dump all of the subdatabases in the environment. .TP diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_dump.c b/dependencies/lmdb/libraries/liblmdb/mdb_dump.c index ce2e091a81..a3af117e02 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_dump.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb_dump.c @@ -1,6 +1,6 @@ /* mdb_dump.c - memory-mapped database dump tool */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -19,6 +19,7 @@ #include #include #include "lmdb.h" +#include "module.h" #define Yu MDB_PRIy(u) @@ -153,7 +154,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-v] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-v] [-m module [-w password]] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -167,6 +168,8 @@ int main(int argc, char *argv[]) char *envname; char *subname = NULL; int alldbs = 0, envflags = 0, list = 0; + char *module = NULL, *password = NULL, *errmsg; + void *mlm = NULL; if (argc < 2) { usage(prog); @@ -181,7 +184,7 @@ int main(int argc, char *argv[]) * -V: print version and exit * (default) dump only the main DB */ - while ((i = getopt(argc, argv, "af:lnps:vV")) != EOF) { + while ((i = getopt(argc, argv, "af:lm:nps:vw:V")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); @@ -189,7 +192,7 @@ int main(int argc, char *argv[]) break; case 'l': list = 1; - /*FALLTHROUGH*/; + /*FALLTHROUGH*/ case 'a': if (subname) usage(prog); @@ -216,6 +219,12 @@ int main(int argc, char *argv[]) usage(prog); subname = optarg; break; + case 'm': + module = optarg; + break; + case 'w': + password = optarg; + break; default: usage(prog); } @@ -240,6 +249,14 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + if (module) { + mlm = mlm_setup(env, module, password, &errmsg); + if (!mlm) { + fprintf(stderr, "Failed to load crypto module: %s\n", errmsg); + goto env_close; + } + } + if (alldbs || subname) { mdb_env_set_maxdbs(env, 2); } @@ -256,9 +273,9 @@ int main(int argc, char *argv[]) goto env_close; } - rc = mdb_open(txn, subname, 0, &dbi); + rc = mdb_dbi_open(txn, subname, 0, &dbi); if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); + fprintf(stderr, "mdb_dbi_open failed, error %d 
%s\n", rc, mdb_strerror(rc)); goto txn_abort; } @@ -273,27 +290,22 @@ int main(int argc, char *argv[]) goto txn_abort; } while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) + if (!mdb_cursor_is_db(cursor)) continue; count++; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); + rc = mdb_dbi_open(txn, key.mv_data, 0, &db2); if (rc == MDB_SUCCESS) { if (list) { - printf("%s\n", str); + printf("%s\n", (char *)key.mv_data); list++; } else { - rc = dumpit(txn, db2, str); + rc = dumpit(txn, db2, key.mv_data); if (rc) break; } - mdb_close(env, db2); + mdb_dbi_close(env, db2); } - free(str); if (rc) continue; } mdb_cursor_close(cursor); @@ -314,6 +326,8 @@ int main(int argc, char *argv[]) mdb_txn_abort(txn); env_close: mdb_env_close(env); + if (mlm) + mlm_unload(mlm); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_load.1 b/dependencies/lmdb/libraries/liblmdb/mdb_load.1 index a34f0380d8..aa25b03aad 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_load.1 +++ b/dependencies/lmdb/libraries/liblmdb/mdb_load.1 @@ -1,5 +1,5 @@ -.TH MDB_LOAD 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2014-2020 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_LOAD 1 "2015/09/30" "LMDB 0.9.90" +.\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_load \- LMDB environment import tool @@ -12,6 +12,10 @@ mdb_load \- LMDB environment import tool [\c .BR \-n ] [\c +.BI \-m \ module +[\c +.BI \-w \ password\fR]] +[\c .BI \-s \ subdb\fR] [\c .BR \-N ] @@ -50,6 +54,15 @@ Read from the specified file instead of from the standard input. .BR \-n Load an LMDB database which does not use subdirectories. .TP +.BI \-m \ module +Load the specified dynamic module to utilize cryptographic functions. 
+This is required to operate on environments that have been configured +with page-level checksums or encryption. +.TP +.BI \-w \ password +Specify the password for an encrypted environment. This is only +used if a cryptography module has been loaded. +.TP .BR \-s \ subdb Load a specific subdatabase. If no database is specified, data is loaded into the main database. .TP diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_load.c b/dependencies/lmdb/libraries/liblmdb/mdb_load.c index 54977d845e..62af83d883 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_load.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb_load.c @@ -1,6 +1,6 @@ /* mdb_load.c - memory-mapped database load tool */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -18,6 +18,7 @@ #include #include #include "lmdb.h" +#include "module.h" #define PRINT 1 #define NOHDR 2 @@ -39,6 +40,8 @@ static MDB_envinfo info; static MDB_val kbuf, dbuf; static MDB_val k0buf; +static unsigned int pagesize; + #define Yu MDB_PRIy(u) #define STRLENOF(s) (sizeof(s)-1) @@ -127,6 +130,17 @@ static void readhdr(void) prog, lineno, (char *)dbuf.mv_data+STRLENOF("maxreaders=")); exit(EXIT_FAILURE); } + } else if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data+STRLENOF("db_pagesize="), + "%u", &pagesize); + if (i != 1) { + fprintf(stderr, "%s: line %"Yu": invalid pagesize %s\n", + prog, lineno, (char *)dbuf.mv_data+STRLENOF("db_pagesize=")); + exit(EXIT_FAILURE); + } } else { int i; for (i=0; dbflags[i].bit; i++) { @@ -276,7 +290,7 @@ static int readline(MDB_val *out, MDB_val *buf) static void usage(void) { - fprintf(stderr, "usage: %s [-V] [-a] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-a] [-f input] 
[-n] [-m module [-w password]] [-s name] [-N] [-T] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -296,6 +310,8 @@ int main(int argc, char *argv[]) int envflags = MDB_NOSYNC, putflags = 0; int dohdr = 0, append = 0; MDB_val prevk; + char *module = NULL, *password = NULL, *errmsg; + void *mlm = NULL; prog = argv[0]; @@ -311,7 +327,7 @@ int main(int argc, char *argv[]) * -T: read plaintext * -V: print version and exit */ - while ((i = getopt(argc, argv, "af:ns:NTV")) != EOF) { + while ((i = getopt(argc, argv, "af:m:ns:w:NTV")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); @@ -339,6 +355,12 @@ int main(int argc, char *argv[]) case 'T': mode |= NOHDR | PRINT; break; + case 'm': + module = optarg; + break; + case 'w': + password = optarg; + break; default: usage(); } @@ -359,6 +381,13 @@ int main(int argc, char *argv[]) fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); return EXIT_FAILURE; } + if (module) { + mlm = mlm_setup(env, module, password, &errmsg); + if (!mlm) { + fprintf(stderr, "Failed to load crypto module: %s\n", errmsg); + goto env_close; + } + } mdb_env_set_maxdbs(env, 2); @@ -368,6 +397,9 @@ int main(int argc, char *argv[]) if (info.me_mapsize) mdb_env_set_mapsize(env, info.me_mapsize); + if (pagesize) + mdb_env_set_pagesize(env, pagesize); + if (info.me_mapaddr) envflags |= MDB_FIXEDMAP; @@ -399,9 +431,9 @@ int main(int argc, char *argv[]) goto env_close; } - rc = mdb_open(txn, subname, flags|MDB_CREATE, &dbi); + rc = mdb_dbi_open(txn, subname, flags|MDB_CREATE, &dbi); if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); + fprintf(stderr, "mdb_dbi_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } prevk.mv_size = 0; @@ -427,6 +459,10 @@ int main(int argc, char *argv[]) fprintf(stderr, "%s: line %"Yu": failed to read key value\n", prog, lineno); goto txn_abort; } + if (!key.mv_size) { + fprintf(stderr, "%s: line %"Yu": zero-length key(ignored)\n", prog, 
lineno); + continue; + } if (append) { appflag = MDB_APPEND; @@ -487,6 +523,8 @@ int main(int argc, char *argv[]) mdb_txn_abort(txn); env_close: mdb_env_close(env); + if (mlm) + mlm_unload(mlm); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_stat.1 b/dependencies/lmdb/libraries/liblmdb/mdb_stat.1 index 5b3ae6e36e..367bd6f36a 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_stat.1 +++ b/dependencies/lmdb/libraries/liblmdb/mdb_stat.1 @@ -1,5 +1,5 @@ -.TH MDB_STAT 1 "2017/07/31" "LMDB 0.9.70" -.\" Copyright 2012-2020 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_STAT 1 "2017/07/31" "LMDB 0.9.90" +.\" Copyright 2012-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_stat \- LMDB environment status tool @@ -16,6 +16,10 @@ mdb_stat \- LMDB environment status tool [\c .BR \-v ] [\c +.BI \-m \ module +[\c +.BI \-w \ password\fR]] +[\c .BR \-r [ r ]] [\c .BR \-a \ | @@ -45,6 +49,15 @@ Display the status of an LMDB database which does not use subdirectories. Use the previous environment state instead of the latest state. This may be useful if the latest state has been corrupted. .TP +.BI \-m \ module +Load the specified dynamic module to utilize cryptographic functions. +This is required to operate on environments that have been configured +with page-level checksums or encryption. +.TP +.BI \-w \ password +Specify the password for an encrypted environment. This is only +used if a cryptography module has been loaded. +.TP .BR \-r Display information about the environment reader table. 
Shows the process ID, thread ID, and transaction ID for each active diff --git a/dependencies/lmdb/libraries/liblmdb/mdb_stat.c b/dependencies/lmdb/libraries/liblmdb/mdb_stat.c index d49dde6dd4..caf519b341 100644 --- a/dependencies/lmdb/libraries/liblmdb/mdb_stat.c +++ b/dependencies/lmdb/libraries/liblmdb/mdb_stat.c @@ -1,6 +1,6 @@ /* mdb_stat.c - memory-mapped database status tool */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -16,15 +16,14 @@ #include #include #include "lmdb.h" +#include "module.h" #define Z MDB_FMT_Z #define Yu MDB_PRIy(u) static void prstat(MDB_stat *ms) { -#if 0 printf(" Page size: %u\n", ms->ms_psize); -#endif printf(" Tree depth: %u\n", ms->ms_depth); printf(" Branch pages: %"Yu"\n", ms->ms_branch_pages); printf(" Leaf pages: %"Yu"\n", ms->ms_leaf_pages); @@ -34,7 +33,7 @@ static void prstat(MDB_stat *ms) static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-v] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-v] [-m module [-w password]] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -50,6 +49,8 @@ int main(int argc, char *argv[]) char *envname; char *subname = NULL; int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; + char *module = NULL, *password = NULL, *errmsg; + void *mlm = NULL; if (argc < 2) { usage(prog); @@ -65,7 +66,7 @@ int main(int argc, char *argv[]) * -V: print version and exit * (default) print stat of only the main DB */ - while ((i = getopt(argc, argv, "Vaefnrs:v")) != EOF) { + while ((i = getopt(argc, argv, "Vaefm:nrs:vw:")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); @@ -96,6 +97,12 @@ int main(int argc, char *argv[]) usage(prog); subname = optarg; break; + case 'm': + module = optarg; + break; + case 'w': + password = optarg; + 
break; default: usage(prog); } @@ -111,6 +118,14 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + if (module) { + mlm = mlm_setup(env, module, password, &errmsg); + if (!mlm) { + fprintf(stderr, "Failed to load crypto module: %s\n", errmsg); + goto env_close; + } + } + if (alldbs || subname) { mdb_env_set_maxdbs(env, 4); } @@ -204,9 +219,9 @@ int main(int argc, char *argv[]) printf(" Free pages: %"Yu"\n", pages); } - rc = mdb_open(txn, subname, 0, &dbi); + rc = mdb_dbi_open(txn, subname, 0, &dbi); if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); + fprintf(stderr, "mdb_dbi_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } @@ -228,17 +243,12 @@ int main(int argc, char *argv[]) goto txn_abort; } while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) + if (!mdb_cursor_is_db(cursor)) continue; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); + rc = mdb_dbi_open(txn, key.mv_data, 0, &db2); if (rc == MDB_SUCCESS) - printf("Status of %s\n", str); - free(str); + printf("Status of %s\n", (char *)key.mv_data); if (rc) continue; rc = mdb_stat(txn, db2, &mst); if (rc) { @@ -246,7 +256,7 @@ int main(int argc, char *argv[]) goto txn_abort; } prstat(&mst); - mdb_close(env, db2); + mdb_dbi_close(env, db2); } mdb_cursor_close(cursor); } @@ -254,11 +264,13 @@ int main(int argc, char *argv[]) if (rc == MDB_NOTFOUND) rc = MDB_SUCCESS; - mdb_close(env, dbi); + mdb_dbi_close(env, dbi); txn_abort: mdb_txn_abort(txn); env_close: mdb_env_close(env); + if (mlm) + mlm_unload(mlm); return rc ? 
EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/dependencies/lmdb/libraries/liblmdb/midl.c b/dependencies/lmdb/libraries/liblmdb/midl.c index 4a5ca0778a..272e557e3f 100644 --- a/dependencies/lmdb/libraries/liblmdb/midl.c +++ b/dependencies/lmdb/libraries/liblmdb/midl.c @@ -3,8 +3,8 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2020 The OpenLDAP Foundation. - * Portions Copyright 2001-2020 Howard Chu, Symas Corp. + * Copyright 2000-2021 The OpenLDAP Foundation. + * Portions Copyright 2001-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -355,7 +355,38 @@ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) return 0; } -#ifdef MDB_VL32 +MDB_ID2L mdb_mid2l_alloc(int num) +{ + MDB_ID2L ids = malloc((num+2) * sizeof(MDB_ID2)); + if (ids) { + ids->mid = num; + ids++; + ids->mid = 0; + } + return ids; +} + +void mdb_mid2l_free(MDB_ID2L ids) +{ + if (ids) + free(ids-1); +} + +int mdb_mid2l_need( MDB_ID2L *idp, unsigned num ) +{ + MDB_ID2L ids = *idp; + num += ids[0].mid; + if (num > ids[-1].mid) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = realloc(ids-1, num * sizeof(MDB_ID2)))) + return ENOMEM; + ids[0].mid = num - 2; + *idp = ids+1; + } + return 0; +} + +#if MDB_RPAGE_CACHE unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ) { /* @@ -415,7 +446,7 @@ int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ) return 0; } -#endif /* MDB_VL32 */ +#endif /* MDB_RPAGE_CACHE */ /** @} */ /** @} */ diff --git a/dependencies/lmdb/libraries/liblmdb/midl.h b/dependencies/lmdb/libraries/liblmdb/midl.h index c0835be684..aa45c2e726 100644 --- a/dependencies/lmdb/libraries/liblmdb/midl.h +++ b/dependencies/lmdb/libraries/liblmdb/midl.h @@ -11,8 +11,8 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2020 The OpenLDAP Foundation. - * Portions Copyright 2001-2020 Howard Chu, Symas Corp. + * Copyright 2000-2021 The OpenLDAP Foundation. 
+ * Portions Copyright 2001-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -178,12 +178,20 @@ int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ); */ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ); -#ifdef MDB_VL32 +MDB_ID2L mdb_mid2l_alloc(int num); + +void mdb_mid2l_free(MDB_ID2L ids); + +int mdb_mid2l_need( MDB_ID2L *idp, unsigned num ); + +#if MDB_RPAGE_CACHE typedef struct MDB_ID3 { MDB_ID mid; /**< The ID */ void *mptr; /**< The pointer */ + void *menc; /**< Decrypted pointer */ unsigned int mcnt; /**< Number of pages */ - unsigned int mref; /**< Refcounter */ + unsigned short mref; /**< Refcounter */ + unsigned short muse; /**< Bitmap of used pages */ } MDB_ID3; typedef MDB_ID3 *MDB_ID3L; @@ -191,7 +199,7 @@ typedef MDB_ID3 *MDB_ID3L; unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ); int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ); -#endif /* MDB_VL32 */ +#endif /* MDB_RPAGE_CACHE */ /** @} */ /** @} */ #ifdef __cplusplus diff --git a/dependencies/lmdb/libraries/liblmdb/module.c b/dependencies/lmdb/libraries/liblmdb/module.c new file mode 100644 index 0000000000..52a24eb019 --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/module.c @@ -0,0 +1,101 @@ +/* module.c - helper for dynamically loading crypto module */ +/* + * Copyright 2020-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. 
+ */ +#ifdef _WIN32 +#include +#else +#include +#endif + +#include +#include + +#include "lmdb.h" +#include "module.h" + +void *mlm_load(const char *file, const char *name, MDB_crypto_funcs **mcf_ptr, char **errmsg) +{ + MDB_crypto_hooks *hookfunc; + void *ret = NULL; + if (!name) + name = "MDB_crypto"; + +#ifdef _WIN32 + { + HINSTANCE mlm = LoadLibrary(file); + if (mlm) { + hookfunc = GetProcAddress(mlm, name); + if (hookfunc) + *mcf_ptr = hookfunc(); + else { + *errmsg = "Crypto hook function not found"; + FreeLibrary(mlm); + mlm = NULL; + } + } else { + *errmsg = GetLastError(); + } + ret = (void *)mlm; + } +#else + { + void *mlm = dlopen(file, RTLD_NOW); + if (mlm) { + hookfunc = dlsym(mlm, name); + if (hookfunc) + *mcf_ptr = hookfunc(); + else { + *errmsg = "Crypto hook function not found"; + dlclose(mlm); + mlm = NULL; + } + } else { + *errmsg = dlerror(); + } + ret = mlm; + } +#endif + return ret; +} + +void mlm_unload(void *mlm) +{ +#ifdef _WIN32 + FreeLibrary((HINSTANCE)mlm); +#else + dlclose(mlm); +#endif +} + +void *mlm_setup(MDB_env *env, const char *file, const char *password, char **errmsg) +{ + MDB_crypto_funcs *cf; + MDB_val enckey = {0}; + void *mlm = mlm_load(file, NULL, &cf, errmsg); + if (mlm) { + if (cf->mcf_sumfunc) { + mdb_env_set_checksum(env, cf->mcf_sumfunc, cf->mcf_sumsize); + } + if (cf->mcf_encfunc && password) { + char keybuf[2048]; + enckey.mv_data = keybuf; + enckey.mv_size = cf->mcf_keysize; + if (cf->mcf_str2key) + cf->mcf_str2key(password, &enckey); + else + strncpy(enckey.mv_data, password, enckey.mv_size); + mdb_env_set_encrypt(env, cf->mcf_encfunc, &enckey, cf->mcf_esumsize); + memset(enckey.mv_data, 0, enckey.mv_size); + } + } + return mlm; +} diff --git a/dependencies/lmdb/libraries/liblmdb/module.h b/dependencies/lmdb/libraries/liblmdb/module.h new file mode 100644 index 0000000000..13d4494dd8 --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/module.h @@ -0,0 +1,16 @@ +/* module.h - helper for dynamically loading 
crypto module */ +/* + * Copyright 2020-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. + */ + +void *mlm_load(const char *file, const char *name, MDB_crypto_funcs **mcf_ptr, char **errmsg); +void mlm_unload(void *lm); +void *mlm_setup(MDB_env *env, const char *file, const char *password, char **errmsg); diff --git a/dependencies/lmdb/libraries/liblmdb/mtest.c b/dependencies/lmdb/libraries/liblmdb/mtest.c index c1c9abb8f7..2a45eb12c0 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest.c @@ -1,6 +1,6 @@ /* mtest.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -47,6 +47,7 @@ int main(int argc,char * argv[]) E(mdb_env_create(&env)); E(mdb_env_set_maxreaders(env, 1)); E(mdb_env_set_mapsize(env, 10485760)); + E(mdb_env_set_pagesize(env, 1024)); E(mdb_env_open(env, "./testdb", MDB_FIXEDMAP /*|MDB_NOSYNC*/, 0664)); E(mdb_txn_begin(env, NULL, 0, &txn)); diff --git a/dependencies/lmdb/libraries/liblmdb/mtest2.c b/dependencies/lmdb/libraries/liblmdb/mtest2.c index db32525c5b..1ce4c9442d 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest2.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest2.c @@ -1,6 +1,6 @@ /* mtest2.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/mtest3.c b/dependencies/lmdb/libraries/liblmdb/mtest3.c index bc471eeeaa..f8da0d331c 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest3.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest3.c @@ -1,6 +1,6 @@ /* mtest3.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/mtest4.c b/dependencies/lmdb/libraries/liblmdb/mtest4.c index b7531755a9..3d7476c455 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest4.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest4.c @@ -1,6 +1,6 @@ /* mtest4.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/mtest5.c b/dependencies/lmdb/libraries/liblmdb/mtest5.c index d6d1cf9cd7..d7a7307e29 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest5.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest5.c @@ -1,6 +1,6 @@ /* mtest5.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/mtest6.c b/dependencies/lmdb/libraries/liblmdb/mtest6.c index e4d4e6b27e..cf8ba961d0 100644 --- a/dependencies/lmdb/libraries/liblmdb/mtest6.c +++ b/dependencies/lmdb/libraries/liblmdb/mtest6.c @@ -1,6 +1,6 @@ /* mtest6.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2020 Howard Chu, Symas Corp. + * Copyright 2011-2021 Howard Chu, Symas Corp. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/mtest_enc.c b/dependencies/lmdb/libraries/liblmdb/mtest_enc.c new file mode 100644 index 0000000000..a8c11adfcb --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/mtest_enc.c @@ -0,0 +1,190 @@ +/* mtest_enc.c - memory-mapped database tester/toy with encryption */ +/* + * Copyright 2011-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. + */ +#include +#include +#include +#include "lmdb.h" +#include "chacha8.h" + +#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) +#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) +#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ + "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) + +static int encfunc(const MDB_val *src, MDB_val *dst, const MDB_val *key, int encdec) +{ + chacha8(src->mv_data, src->mv_size, key[0].mv_data, key[1].mv_data, dst->mv_data); + return 0; +} + +int main(int argc,char * argv[]) +{ + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDB_stat mst; + MDB_cursor *cursor, *cur2; + MDB_cursor_op op; + MDB_val enckey; + int count; + int *values; + char sval[32] = ""; + char ekey[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; + + srand(time(NULL)); + + count = (rand()%384) + 64; + values = (int *)malloc(count*sizeof(int)); + + for(i = 0;i in each iteration, since MDB_NOOVERWRITE may modify it */ + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_KEYEXIST, mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { 
+ j++; + data.mv_size = sizeof(sval); + data.mv_data = sval; + } + } + if (j) printf("%d duplicates skipped\n", j); + E(mdb_txn_commit(txn)); + E(mdb_env_stat(env, &mst)); + + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + j=0; + key.mv_data = sval; + for (i= count - 1; i > -1; i-= (rand()%5)) { + j++; + txn=NULL; + E(mdb_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { + j--; + mdb_txn_abort(txn); + } else { + E(mdb_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); + + E(mdb_env_stat(env, &mst)); + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + printf("Cursor last\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + printf("Cursor prev\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + printf("Cursor last/prev\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) 
data.mv_size, (char *) data.mv_data); + E(mdb_cursor_get(cursor, &key, &data, MDB_PREV)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + printf("Deleting with cursor\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cur2)); + for (i=0; i<50; i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, MDB_NEXT))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + E(mdb_del(txn, dbi, &key, NULL)); + } + + printf("Restarting cursor in txn\n"); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cur2); + E(mdb_txn_commit(txn)); + + printf("Restarting cursor outside txn\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cursor, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + mdb_dbi_close(env, dbi); + mdb_env_close(env); + + return 0; +} diff --git a/dependencies/lmdb/libraries/liblmdb/mtest_enc2.c b/dependencies/lmdb/libraries/liblmdb/mtest_enc2.c new file mode 100644 index 0000000000..853960ab52 --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/mtest_enc2.c @@ -0,0 +1,189 @@ +/* mtest_enc.c - memory-mapped database tester/toy with encryption */ +/* + * Copyright 2011-2021 Howard Chu, Symas Corp. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. + */ +#include +#include +#include +#include +#include "lmdb.h" +#include "module.h" + +#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) +#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) +#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ + "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) + +MDB_crypto_funcs *cf; + +int main(int argc,char * argv[]) +{ + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDB_stat mst; + MDB_cursor *cursor, *cur2; + MDB_cursor_op op; + int count; + int *values; + char sval[32] = ""; + char password[] = "This is my passphrase for now..."; + void *mlm; + char *errmsg; + + srand(time(NULL)); + + count = (rand()%384) + 64; + values = (int *)malloc(count*sizeof(int)); + + for(i = 0;i in each iteration, since MDB_NOOVERWRITE may modify it */ + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_KEYEXIST, mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { + j++; + data.mv_size = sizeof(sval); + data.mv_data = sval; + } + } + if (j) printf("%d duplicates skipped\n", j); + E(mdb_txn_commit(txn)); + E(mdb_env_stat(env, &mst)); + + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + j=0; + key.mv_data = sval; + for (i= count - 1; i > -1; i-= (rand()%5)) { + j++; + txn=NULL; + 
E(mdb_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { + j--; + mdb_txn_abort(txn); + } else { + E(mdb_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); + + E(mdb_env_stat(env, &mst)); + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + printf("Cursor last\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + printf("Cursor prev\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + printf("Cursor last/prev\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + E(mdb_cursor_get(cursor, &key, &data, MDB_PREV)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + printf("Deleting with cursor\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cur2)); + for (i=0; i<50; i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, MDB_NEXT))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + E(mdb_del(txn, dbi, &key, NULL)); + } + + 
printf("Restarting cursor in txn\n"); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cur2); + E(mdb_txn_commit(txn)); + + printf("Restarting cursor outside txn\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cursor, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + mdb_dbi_close(env, dbi); + mdb_env_close(env); + mlm_unload(mlm); + + return 0; +} diff --git a/dependencies/lmdb/libraries/liblmdb/mtest_remap.c b/dependencies/lmdb/libraries/liblmdb/mtest_remap.c new file mode 100644 index 0000000000..93c062eb06 --- /dev/null +++ b/dependencies/lmdb/libraries/liblmdb/mtest_remap.c @@ -0,0 +1,177 @@ +/* mtest_remap.c - memory-mapped database tester/toy with page remapping */ +/* + * Copyright 2011-2021 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the Symas + * Dual-Use License. + * + * A copy of this license is available in the file LICENSE in the + * source distribution. + */ +#include +#include +#include +#include "lmdb.h" +#include "chacha8.h" + +#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) +#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) +#define CHECK(test, msg) ((test) ? 
(void)0 : ((void)fprintf(stderr, \ + "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) + +int main(int argc,char * argv[]) +{ + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDB_stat mst; + MDB_cursor *cursor, *cur2; + MDB_cursor_op op; + int count; + int *values; + char sval[32] = ""; + + srand(time(NULL)); + + count = (rand()%384) + 64; + values = (int *)malloc(count*sizeof(int)); + + for(i = 0;i in each iteration, since MDB_NOOVERWRITE may modify it */ + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_KEYEXIST, mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { + j++; + data.mv_size = sizeof(sval); + data.mv_data = sval; + } + } + if (j) printf("%d duplicates skipped\n", j); + E(mdb_txn_commit(txn)); + E(mdb_env_stat(env, &mst)); + + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + j=0; + key.mv_data = sval; + for (i= count - 1; i > -1; i-= (rand()%5)) { + j++; + txn=NULL; + E(mdb_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { + j--; + mdb_txn_abort(txn); + } else { + E(mdb_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); + + E(mdb_env_stat(env, &mst)); + E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, 
"mdb_cursor_get"); + printf("Cursor last\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + printf("Cursor prev\n"); + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + printf("Cursor last/prev\n"); + E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + E(mdb_cursor_get(cursor, &key, &data, MDB_PREV)); + printf("key: %.*s, data: %.*s\n", + (int) key.mv_size, (char *) key.mv_data, + (int) data.mv_size, (char *) data.mv_data); + + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + printf("Deleting with cursor\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cur2)); + for (i=0; i<50; i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, MDB_NEXT))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + E(mdb_del(txn, dbi, &key, NULL)); + } + + printf("Restarting cursor in txn\n"); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cur2); + E(mdb_txn_commit(txn)); + + printf("Restarting cursor outside txn\n"); + E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdb_cursor_open(txn, dbi, &cursor)); + for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdb_cursor_get(cursor, &key, &data, op))) + break; + 
printf("key: %p %.*s, data: %p %.*s\n", + key.mv_data, (int) key.mv_size, (char *) key.mv_data, + data.mv_data, (int) data.mv_size, (char *) data.mv_data); + } + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + + mdb_dbi_close(env, dbi); + mdb_env_close(env); + + return 0; +} diff --git a/dependencies/lmdb/libraries/liblmdb/sample-bdb.txt b/dependencies/lmdb/libraries/liblmdb/sample-bdb.txt index c72078c722..8ca927c6cf 100644 --- a/dependencies/lmdb/libraries/liblmdb/sample-bdb.txt +++ b/dependencies/lmdb/libraries/liblmdb/sample-bdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-mdb.txt */ /* - * Copyright 2012-2020 Howard Chu, Symas Corp. + * Copyright 2012-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/dependencies/lmdb/libraries/liblmdb/sample-mdb.txt b/dependencies/lmdb/libraries/liblmdb/sample-mdb.txt index e54a847068..2e1731631d 100644 --- a/dependencies/lmdb/libraries/liblmdb/sample-mdb.txt +++ b/dependencies/lmdb/libraries/liblmdb/sample-mdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-bdb.txt */ /* - * Copyright 2012-2020 Howard Chu, Symas Corp. + * Copyright 2012-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without