diff options
Diffstat (limited to 'db2/btree')
-rw-r--r-- | db2/btree/bt_close.c | 184 | ||||
-rw-r--r-- | db2/btree/bt_compare.c | 205 | ||||
-rw-r--r-- | db2/btree/bt_conv.c | 83 | ||||
-rw-r--r-- | db2/btree/bt_cursor.c | 1577 | ||||
-rw-r--r-- | db2/btree/bt_delete.c | 607 | ||||
-rw-r--r-- | db2/btree/bt_open.c | 355 | ||||
-rw-r--r-- | db2/btree/bt_page.c | 312 | ||||
-rw-r--r-- | db2/btree/bt_put.c | 919 | ||||
-rw-r--r-- | db2/btree/bt_rec.c | 767 | ||||
-rw-r--r-- | db2/btree/bt_recno.c | 1195 | ||||
-rw-r--r-- | db2/btree/bt_rsearch.c | 347 | ||||
-rw-r--r-- | db2/btree/bt_search.c | 335 | ||||
-rw-r--r-- | db2/btree/bt_split.c | 952 | ||||
-rw-r--r-- | db2/btree/bt_stat.c | 257 | ||||
-rw-r--r-- | db2/btree/btree.src | 137 | ||||
-rw-r--r-- | db2/btree/btree_auto.c | 1279 |
16 files changed, 9511 insertions, 0 deletions
diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c new file mode 100644 index 0000000000..4e80634e86 --- /dev/null +++ b/db2/btree/bt_close.c @@ -0,0 +1,184 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_close.c 10.22 (Sleepycat) 8/23/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/mman.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static void __bam_upstat __P((DB *dbp)); + +/* + * __bam_close -- + * Close a btree. + * + * PUBLIC: int __bam_close __P((DB *)); + */ +int +__bam_close(dbp) + DB *dbp; +{ + BTREE *t; + + DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0); + + t = dbp->internal; + + /* Update tree statistics. */ + __bam_upstat(dbp); + + /* Free any allocated memory. */ + if (t->bt_rkey.data) + FREE(t->bt_rkey.data, t->bt_rkey.size); + if (t->bt_rdata.data) + FREE(t->bt_rdata.data, t->bt_rdata.ulen); + if (t->bt_sp != t->bt_stack) + FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG)); + + FREE(t, sizeof(BTREE)); + dbp->internal = NULL; + + return (0); +} + +/* + * __bam_sync -- + * Sync the btree to disk. + * + * PUBLIC: int __bam_sync __P((DB *, int)); + */ +int +__bam_sync(argdbp, flags) + DB *argdbp; + int flags; +{ + DB *dbp; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_syncchk(argdbp, flags)) != 0) + return (ret); + + /* If it wasn't possible to modify the file, we're done. */ + if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY)) + return (0); + + GETHANDLE(argdbp, NULL, &dbp, ret); + + /* Flush any dirty pages from the cache to the backing file. */ + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) + ret = 0; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_upstat -- + * Update tree statistics. + */ +static void +__bam_upstat(dbp) + DB *dbp; +{ + BTREE *t; + BTMETA *meta; + DB_LOCK mlock; + db_pgno_t pgno; + int flags, ret; + + /* + * We use a no-op log call to log the update of the statistics onto the + * metadata page. The dbp->close() call isn't transaction protected to + * start with, and I'm not sure what undoing a statistics update means, + * anyway. + */ + if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY)) + return; + + /* Lock the page. */ + if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock) != 0) + return; + + flags = 0; + pgno = PGNO_METADATA; + + /* Get the page. */ + if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) == 0) { + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __db_noop_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(meta), 0)) == 0) + goto err; + + /* Update the statistics. */ + t = dbp->internal; + __bam_add_mstat(&t->lstat, &meta->stat); + + flags = DB_MPOOL_DIRTY; + } + +err: (void)memp_fput(dbp->mpf, (PAGE *)meta, flags); + (void)__BT_LPUT(dbp, mlock); +} diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c new file mode 100644 index 0000000000..e802fd24ab --- /dev/null +++ b/db2/btree/bt_compare.c @@ -0,0 +1,205 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_compare.c 10.3 (Sleepycat) 7/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_cmp -- + * Compare a key to a given record. + * + * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *)); + */ +int +__bam_cmp(dbp, k1, e) + DB *dbp; + const DBT *k1; + EPG *e; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE *t; + DBT k2; + PAGE *h; + + t = dbp->internal; + + /* + * Returns: + * < 0 if k1 is < record + * = 0 if k1 is = record + * > 0 if k1 is > record + * + * The left-most key on internal pages, at any level of the tree, is + * guaranteed, by the following code, to be less than any user key. + * This saves us from having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything we've yet seen. + */ + h = e->page; + if (e->indx == 0 && + h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE) + return (1); + + bo = NULL; + if (TYPE(h) == P_LBTREE) { + bk = GET_BKEYDATA(h, e->indx); + if (bk->type == B_OVERFLOW) + bo = (BOVERFLOW *)bk; + else { + memset(&k2, 0, sizeof(k2)); + k2.data = bk->data; + k2.size = bk->len; + } + } else { + bi = GET_BINTERNAL(h, e->indx); + if (bi->type == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + memset(&k2, 0, sizeof(k2)); + k2.data = bi->data; + k2.size = bi->len; + } + } + + /* + * XXX + * We ignore system errors; the only recoverable one is ENOMEM, and we + * don't want to require that comparison routines handle random errors. + * We don't want to return a valid comparison, either, so we stop. + */ + if (bo != NULL) { + /* + * If using the default comparison routine, use __db_moff(), + * which compares the overflow key a page at a time. + */ + if (t->bt_compare == __bam_defcmp) + return (__db_moff(dbp, k1, bo->pgno)); + + /* + * Otherwise, we need a contiguous record so we can hand it + * to the user's routine. + */ + if (__db_goff(dbp, &k2, bo->tlen, + bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0) + abort(); + } + return ((*t->bt_compare)(k1, &k2)); +} + +/* + * __bam_defcmp -- + * Default comparison routine. + * + * PUBLIC: int __bam_defcmp __P((const DBT *, const DBT *)); + */ +int +__bam_defcmp(a, b) + const DBT *a, *b; +{ + size_t len; + u_int8_t *p1, *p2; + + /* + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + * + * XXX + * If a size_t doesn't fit into a long, or if the difference between + * any two characters doesn't fit into an int, this routine can lose. + * What we need is a signed integral type that's guaranteed to be at + * least as large as a size_t, and there is no such thing. + */ + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((long)*p1 - (long)*p2); + return ((long)a->size - (long)b->size); +} + +/* + * __bam_defpfx -- + * Default prefix routine. + * + * PUBLIC: size_t __bam_defpfx __P((const DBT *, const DBT *)); + */ +size_t +__bam_defpfx(a, b) + const DBT *a, *b; +{ + size_t cnt, len; + u_int8_t *p1, *p2; + + cnt = 1; + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* + * We know that a->size must be <= b->size, or they wouldn't be + * in this order. + */ + return (a->size < b->size ? a->size + 1 : a->size); +} diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c new file mode 100644 index 0000000000..537e2f98ec --- /dev/null +++ b/db2/btree/bt_conv.c @@ -0,0 +1,83 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_conv.c 10.3 (Sleepycat) 8/9/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "btree.h" + +/* + * __bam_pgin, __bam_pgout -- + * Convert host-specific page layout to/from the host-independent + * format stored on disk. + * + * PUBLIC: int __bam_pgin __P((db_pgno_t, void *, DBT *)); + * PUBLIC: int __bam_pgout __P((db_pgno_t, void *, DBT *)); + */ +int +__bam_pgin(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? __bam_mswap(pp) : __db_pgin(pg, pp)); +} + +int +__bam_pgout(pg, pp, cookie) + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + return (pg == PGNO_METADATA ? __bam_mswap(pp) : __db_pgout(pg, pp)); +} + +/* + * __bam_mswap -- + * Swap the bytes on the btree metadata page. + * + * PUBLIC: int __bam_mswap __P((PAGE *)); + */ +int +__bam_mswap(pg) + PAGE *pg; +{ + u_int8_t *p; + + p = (u_int8_t *)pg; + SWAP32(p); /* lsn.file */ + SWAP32(p); /* lsn.offset */ + SWAP32(p); /* pgno */ + SWAP32(p); /* magic */ + SWAP32(p); /* version */ + SWAP32(p); /* pagesize */ + SWAP32(p); /* maxkey */ + SWAP32(p); /* minkey */ + SWAP32(p); /* free */ + SWAP32(p); /* flags */ + return (0); +} diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c new file mode 100644 index 0000000000..592ec9b3ff --- /dev/null +++ b/db2/btree/bt_cursor.c @@ -0,0 +1,1577 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_cursor.c 10.26 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_c_close __P((DBC *)); +static int __bam_c_del __P((DBC *, int)); +static int __bam_c_first __P((DB *, CURSOR *)); +static int __bam_c_get __P((DBC *, DBT *, DBT *, int)); +static int __bam_c_last __P((DB *, CURSOR *)); +static int __bam_c_next __P((DB *, CURSOR *, int)); +static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *)); +static int __bam_c_prev __P((DB *, CURSOR *)); +static int __bam_c_put __P((DBC *, DBT *, DBT *, int)); +static int __bam_c_rget __P((DB *, CURSOR *, DBT *, DBT *, int)); +static int __bam_c_search __P((DB *, CURSOR *, const DBT *, u_int, int, int *)); + +/* Discard the current page/lock held by a cursor. */ +#undef DISCARD +#define DISCARD(dbp, cp) { \ + (void)memp_fput(dbp->mpf, (cp)->page, 0); \ + (cp)->page = NULL; \ + (void)__BT_TLPUT((dbp), (cp)->lock); \ + (cp)->lock = LOCK_INVALID; \ +} + +/* + * __bam_cursor -- + * Interface to the cursor functions. + * + * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **)); + */ +int +__bam_cursor(dbp, txn, dbcp) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; +{ + CURSOR *cp; + DBC *dbc; + + DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0); + + if ((dbc = (DBC *)calloc(1, sizeof(DBC))) == NULL) + return (ENOMEM); + if ((cp = (CURSOR *)calloc(1, sizeof(CURSOR))) == NULL) { + free(dbc); + return (ENOMEM); + } + + cp->dbc = dbc; + cp->pgno = cp->dpgno = PGNO_INVALID; + cp->lock = LOCK_INVALID; + + dbc->dbp = dbp; + dbc->txn = txn; + dbc->internal = cp; + dbc->c_close = __bam_c_close; + dbc->c_del = __bam_c_del; + dbc->c_get = __bam_c_get; + dbc->c_put = __bam_c_put; + + /* All cursor structures hang off the main DB structure. */ + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + *dbcp = dbc; + return (0); +} + +/* + * __bam_c_close -- + * Close a single cursor. + */ +static int +__bam_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + CURSOR *cp; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + cp = dbc->internal; + + /* If a cursor key was deleted do the actual deletion. */ + ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0; + + /* Discard any lock if we're not inside a transaction. */ + if (dbp->txn == NULL && cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + + /* Remove the cursor from the queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Discard the structures. */ + FREE(cp, sizeof(CURSOR)); + FREE(dbc, sizeof(DBC)); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_del -- + * Delete using a cursor. + */ +static int +__bam_c_del(dbc, flags) + DBC *dbc; + int flags; +{ + CURSOR *cp; + DB *dbp; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + db_indx_t indx; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cdelchk(dbc->dbp, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + /* If already deleted, return failure. */ + if (F_ISSET(cp, C_DELETED | C_REPLACE)) + return (DB_KEYEMPTY); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + + /* + * We don't physically delete the record until the cursor moves, + * so we have to have a long-lived write lock on the page instead + * of a long-lived read lock. Note, we have to have a read lock + * to even get here, so we simply discard it. + */ + if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) { + if ((ret = __bam_lget(dbp, + 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + goto err; + (void)__BT_TLPUT(dbp, cp->lock); + cp->lock = lock; + cp->mode = DB_LOCK_WRITE; + } + + /* + * Acquire the underlying page (which may be different from the above + * page because it may be a duplicate page), and set the on-page and + * in-cursor delete flags. We don't need to lock it as we've already + * write-locked the page leading to it. + */ + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) { + (void)memp_fput(dbp->mpf, h, 0); + goto err; + } + + /* Set the intent-to-delete flag on the page and in all cursors. */ + if (cp->dpgno == PGNO_INVALID) + GET_BKEYDATA(h, indx + O_INDX)->deleted = 1; + else + GET_BKEYDATA(h, indx)->deleted = 1; + (void)__bam_ca_delete(dbp, pgno, indx, NULL); + + ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + +err: PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_get -- + * Retrieve a key/data pair from the tree. + * + * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); + */ +int +__bam_get(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + DBC dbc; + CURSOR cp; + int ret; + + DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) + return (ret); + + /* Build a cursor. */ + memset(&cp, 0, sizeof(cp)); + cp.dbc = &dbc; + cp.pgno = cp.dpgno = PGNO_INVALID; + cp.lock = LOCK_INVALID; + + memset(&dbc, 0, sizeof(dbc)); + dbc.dbp = argdbp; + dbc.txn = txn; + dbc.internal = &cp; + + /* Get the key. */ + if ((ret = __bam_c_get(&dbc, + key, data, LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET)) != 0) + return (ret); + + /* Discard any lock, the cursor didn't really exist. */ + if (cp.lock != LOCK_INVALID) + (void)__BT_TLPUT(argdbp, cp.lock); + + return (0); +} + +/* + * __bam_c_get -- + * Get using a cursor (btree). + */ +static int +__bam_c_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR *cp, copy; + DB *dbp; + PAGE *h; + int exact, ret; + + DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, + NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cgetchk(dbc->dbp, + key, data, flags, cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* + * Break out the code to return a cursor's record number. It + * has nothing to do with the cursor get code except that it's + * been rammed into the interface. + */ + if (LF_ISSET(DB_GET_RECNO)) { + ret = __bam_c_rget(dbp, cp, key, data, flags); + PUTHANDLE(dbp); + return (ret); + } + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + cp->page = NULL; + cp->lock = LOCK_INVALID; + + switch (flags) { + case DB_CURRENT: + /* It's not possible to return a deleted record. */ + if (F_ISSET(cp, C_DELETED | C_REPLACE)) { + PUTHANDLE(dbp); + return (DB_KEYEMPTY); + } + + /* Get the page with the current item on it. */ + if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + goto err; + break; + case DB_NEXT: + if (cp->pgno != PGNO_INVALID) { + if ((ret = __bam_c_next(dbp, cp, 1)) != 0) + goto err; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + if ((ret = __bam_c_first(dbp, cp)) != 0) + goto err; + break; + case DB_PREV: + if (cp->pgno != PGNO_INVALID) { + if ((ret = __bam_c_prev(dbp, cp)) != 0) + goto err; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + if ((ret = __bam_c_last(dbp, cp)) != 0) + goto err; + break; + case DB_SET_RECNO: + exact = 1; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0) + goto err; + break; + case DB_SET: + exact = 1; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + goto err; + break; + case DB_SET_RANGE: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + goto err; + break; + } + + /* + * Return the key if the user didn't give us one. If we've moved to + * a duplicate page, we may no longer have a pointer to the main page, + * so we have to go get it. We know that it's already read-locked, + * however, so we don't have to acquire a new lock. + */ + if (flags != DB_SET) { + if (cp->dpgno != PGNO_INVALID) { + if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) + goto err; + } else + h = cp->page; + ret = __db_ret(dbp, + h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen); + if (cp->dpgno != PGNO_INVALID) + (void)memp_fput(dbp->mpf, h, 0); + if (ret) + goto err; + } + + /* Return the data. */ + if ((ret = __db_ret(dbp, cp->page, + cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx, + data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + goto err; + + /* + * If the previous cursor record has been deleted, delete it. The + * returned key isn't a deleted key, so clear the flag. + */ + if (F_ISSET(©, C_DELETED) && __bam_c_physdel(dbp, ©, cp->page)) + goto err; + F_CLR(cp, C_DELETED | C_REPLACE); + + /* Release the previous lock, if any. */ + if (copy.lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, copy.lock); + + /* Release the pinned page. */ + ret = memp_fput(dbp->mpf, cp->page, 0); + + ++t->lstat.bt_get; + + if (0) { +err: if (cp->page != NULL) + (void)memp_fput(dbp->mpf, cp->page, 0); + if (cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + *cp = copy; + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_rget -- + * Return the record number for a cursor. + */ +static int +__bam_c_rget(dbp, cp, key, data, flags) + DB *dbp; + CURSOR *cp; + DBT *key, *data; + int flags; +{ + BTREE *t; + DBT dbt; + db_recno_t recno; + int exact, ret; + + /* Get the page with the current item on it. */ + if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + return (ret); + + /* Get a copy of the key. */ + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; + if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0) + goto err; + + exact = 1; + if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0) + goto err; + + t = dbp->internal; + ret = __db_retcopy(data, &recno, sizeof(recno), + &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc); + + /* Release the stack. */ + __bam_stkrel(dbp); + +err: (void)memp_fput(dbp->mpf, cp->page, 0); + free(dbt.data); + return (ret); +} + +/* + * __bam_c_put -- + * Put using a cursor. + */ +static int +__bam_c_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR *cp, copy; + DB *dbp; + DBT dbt; + db_indx_t indx; + db_pgno_t pgno; + int exact, needkey, ret; + void *arg; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, + data, flags); + + cp = dbc->internal; + + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + cp->page = NULL; + cp->lock = LOCK_INVALID; + + /* + * To split, we need a valid key for the page. Since it's a cursor, + * we have to build one. + */ + if (0) { +split: if (needkey) { + memset(&dbt, 0, sizeof(DBT)); + ret = __db_ret(dbp, cp->page, indx, + &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen); + + DISCARD(dbp, cp); + + if (ret) + goto err; + arg = &dbt; + } else { + (void)__bam_stkrel(dbp); + arg = key; + } + if ((ret = __bam_split(dbp, arg)) != 0) + goto err; + } + + /* If there's no key supplied, use the cursor. */ + if (flags == DB_KEYFIRST || flags == DB_KEYLAST) + needkey = 0; + else { + needkey = 1; + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + /* Acquire the current page. */ + if ((ret = __bam_lget(dbp, + 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + goto err; + } + + ret = 0; + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + case DB_CURRENT: + if ((ret = __bam_iitem(dbp, &cp->page, + &indx, key, data, flags, 0)) == DB_NEEDSPLIT) + goto split; + break; + case DB_KEYFIRST: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0) + goto err; + + indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; + if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, + data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) + goto split; + if (ret) + goto err; + break; + case DB_KEYLAST: + exact = 0; + if ((ret = + __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0) + goto err; + + indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; + if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, + data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) + goto split; + break; + } + if (ret) + goto err; + + /* + * Update the cursor to point to the new entry. The new entry was + * stored on the current page, because we split pages until it was + * possible. + */ + if (cp->dpgno == PGNO_INVALID) + cp->indx = indx; + else + cp->dindx = indx; + + /* + * If the previous cursor record has been deleted, delete it. The + * returned key isn't a deleted key, so clear the flag. + */ + if (F_ISSET(©, C_DELETED) && + (ret = __bam_c_physdel(dbp, ©, cp->page)) != 0) + goto err; + F_CLR(cp, C_DELETED | C_REPLACE); + + /* Release the previous lock, if any. */ + if (copy.lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, copy.lock); + + /* Discard the pinned page. */ + ret = memp_fput(dbp->mpf, cp->page, 0); + if (0) { +err: if (cp->page != NULL) + (void)memp_fput(dbp->mpf, cp->page, 0); + if (cp->lock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, cp->lock); + *cp = copy; + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_c_first -- + * Return the first record. + */ +static int +__bam_c_first(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_pgno_t pgno; + int ret; + + /* Walk down the left-hand side of the tree. */ + for (pgno = PGNO_ROOT;;) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + /* If we find a leaf page, we're done. */ + if (ISLEAF(cp->page)) + break; + + pgno = GET_BINTERNAL(cp->page, 0)->pgno; + DISCARD(dbp, cp); + } + + cp->pgno = cp->page->pgno; + cp->indx = 0; + cp->dpgno = PGNO_INVALID; + + /* If it's an empty page or a deleted record, go to the next one. */ + if (NUM_ENT(cp->page) == 0 || + GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted) + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + + /* If it's a duplicate reference, go to the first entry. */ + if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0) + return (ret); + + /* If it's a deleted record, go to the next one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + return (0); +} + +/* + * __bam_c_last -- + * Return the last record. + */ +static int +__bam_c_last(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_pgno_t pgno; + int ret; + + /* Walk down the right-hand side of the tree. */ + for (pgno = PGNO_ROOT;;) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + /* If we find a leaf page, we're done. */ + if (ISLEAF(cp->page)) + break; + + pgno = + GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; + DISCARD(dbp, cp); + } + + cp->pgno = cp->page->pgno; + cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; + cp->dpgno = PGNO_INVALID; + + /* If it's an empty page or a deleted record, go to the previous one. */ + if (NUM_ENT(cp->page) == 0 || + GET_BKEYDATA(cp->page, cp->indx + O_INDX)->deleted) + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + + /* If it's a duplicate reference, go to the last entry. */ + if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0) + return (ret); + + /* If it's a deleted record, go to the previous one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + return (0); +} + +/* + * __bam_c_next -- + * Move to the next record. + */ +static int +__bam_c_next(dbp, cp, initial_move) + DB *dbp; + CURSOR *cp; + int initial_move; +{ + db_indx_t adjust, indx; + db_pgno_t pgno; + int ret; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + */ + if (cp->dpgno == PGNO_INVALID) { + adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX; + pgno = cp->pgno; + indx = cp->indx; + } else { + adjust = O_INDX; + pgno = cp->dpgno; + indx = cp->dindx; + } + if (cp->page == NULL) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + } + + /* + * If at the end of the page, move to a subsequent page. + * + * !!! + * Check for >= NUM_ENT. If we're here as the result of a search that + * landed us on NUM_ENT, we'll increment indx before we test. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (initial_move) + indx += adjust; + for (;;) { + if (indx >= NUM_ENT(cp->page)) { + pgno = cp->page->next_pgno; + DISCARD(dbp, cp); + + /* + * If we're in a btree leaf page, we've reached the end + * of the tree. If we've reached the end of a page of + * duplicates, continue from the btree leaf page where + * we found this page of duplicates. + */ + if (pgno == PGNO_INVALID) { + /* If in a btree leaf page, it's EOF. */ + if (cp->dpgno == PGNO_INVALID) + return (DB_NOTFOUND); + + /* Continue from the last btree leaf page. */ + cp->dpgno = PGNO_INVALID; + + adjust = P_INDX; + pgno = cp->pgno; + indx = cp->indx + P_INDX; + } else + indx = 0; + + if ((ret = __bam_lget(dbp, + 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + continue; + } + + /* Ignore deleted records. */ + if (dbp->type == DB_BTREE && + ((cp->dpgno == PGNO_INVALID && + GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) || + (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, indx)->deleted))) { + indx += adjust; + continue; + } + + /* + * If we're not in a duplicates page, check to see if we've + * found a page of duplicates, in which case we move to the + * first entry. + */ + if (cp->dpgno == PGNO_INVALID) { + cp->pgno = cp->page->pgno; + cp->indx = indx; + + if ((ret = + __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0) + return (ret); + if (cp->dpgno != PGNO_INVALID) { + indx = cp->dindx; + adjust = O_INDX; + continue; + } + } else { + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + } + break; + } + return (0); +} + +/* + * __bam_c_prev -- + * Move to the previous record. + */ +static int +__bam_c_prev(dbp, cp) + DB *dbp; + CURSOR *cp; +{ + db_indx_t indx, adjust; + db_pgno_t pgno; + int ret, set_indx; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + */ + if (cp->dpgno == PGNO_INVALID) { + adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX; + pgno = cp->pgno; + indx = cp->indx; + } else { + adjust = O_INDX; + pgno = cp->dpgno; + indx = cp->dindx; + } + if (cp->page == NULL) { + if ((ret = + __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + } + + /* + * If at the beginning of the page, move to any previous one. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + for (;;) { + if (indx == 0) { + pgno = cp->page->prev_pgno; + DISCARD(dbp, cp); + + /* + * If we're in a btree leaf page, we've reached the + * beginning of the tree. If we've reached the first + * of a page of duplicates, continue from the btree + * leaf page where we found this page of duplicates. + */ + if (pgno == PGNO_INVALID) { + /* If in a btree leaf page, it's SOF. */ + if (cp->dpgno == PGNO_INVALID) + return (DB_NOTFOUND); + + /* Continue from the last btree leaf page. */ + cp->dpgno = PGNO_INVALID; + + adjust = P_INDX; + pgno = cp->pgno; + indx = cp->indx; + set_indx = 0; + } else + set_indx = 1; + + if ((ret = __bam_lget(dbp, + 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + + if (set_indx) + indx = NUM_ENT(cp->page); + if (indx == 0) + continue; + } + + /* Ignore deleted records. */ + indx -= adjust; + if (dbp->type == DB_BTREE && + ((cp->dpgno == PGNO_INVALID && + GET_BKEYDATA(cp->page, indx + O_INDX)->deleted) || + (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, indx)->deleted))) + continue; + + /* + * If we're not in a duplicates page, check to see if we've + * found a page of duplicates, in which case we move to the + * last entry. + */ + if (cp->dpgno == PGNO_INVALID) { + cp->pgno = cp->page->pgno; + cp->indx = indx; + + if ((ret = + __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0) + return (ret); + if (cp->dpgno != PGNO_INVALID) { + indx = cp->dindx + O_INDX; + adjust = O_INDX; + continue; + } + } else { + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + } + break; + } + return (0); +} + +/* + * __bam_c_search -- + * Move to a specified record. + */ +static int +__bam_c_search(dbp, cp, key, flags, isrecno, exactp) + DB *dbp; + CURSOR *cp; + const DBT *key; + u_int flags; + int isrecno, *exactp; +{ + BTREE *t; + db_recno_t recno; + int needexact, ret; + + t = dbp->internal; + needexact = *exactp; + + /* + * Find any matching record; the search function pins the page. Make + * sure it's a valid key (__bam_search may return an index just past + * the end of a page) and return it. + */ + if (isrecno) { + if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + return (ret); + ret = __bam_rsearch(dbp, &recno, flags, 1, exactp); + } else + ret = __bam_search(dbp, key, flags, 1, NULL, exactp); + if (ret != 0) + return (ret); + + cp->page = t->bt_csp->page; + cp->pgno = cp->page->pgno; + cp->indx = t->bt_csp->indx; + cp->lock = t->bt_csp->lock; + cp->dpgno = PGNO_INVALID; + + /* + * If we have an exact match, make sure that we're not looking at a + * chain of duplicates -- if so, move to an entry in that chain. + */ + if (*exactp) { + if ((ret = __bam_ovfl_chk(dbp, + cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST))) != 0) + return (ret); + } else + if (needexact) + return (DB_NOTFOUND); + + /* If past the end of a page, find the next entry. */ + if (cp->indx == NUM_ENT(cp->page) && + (ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + + /* If it's a deleted record, go to the next or previous one. */ + if (cp->dpgno != PGNO_INVALID && + GET_BKEYDATA(cp->page, cp->dindx)->deleted) + if (flags == S_KEYLAST) { + if ((ret = __bam_c_prev(dbp, cp)) != 0) + return (ret); + } else + if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + return (ret); + return (0); +} + +/* + * __bam_ovfl_chk -- + * Check for an overflow record, and if found, move to the correct + * record. + * + * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); + */ +int +__bam_ovfl_chk(dbp, cp, indx, to_end) + DB *dbp; + CURSOR *cp; + u_int32_t indx; + int to_end; +{ + BOVERFLOW *bo; + db_pgno_t pgno; + int ret; + + /* Check for an overflow entry. */ + bo = GET_BOVERFLOW(cp->page, indx); + if (bo->type != B_DUPLICATE) + return (0); + + /* + * If we find one, go to the duplicates page, and optionally move + * to the last record on that page. + * + * XXX + * We don't lock duplicates pages, we've already got the correct + * lock on the main page. + */ + pgno = bo->pgno; + if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + return (ret); + cp->page = NULL; + if (to_end) { + if ((ret = __db_dend(dbp, pgno, &cp->page)) != 0) + return (ret); + indx = NUM_ENT(cp->page) - O_INDX; + } else { + if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + return (ret); + indx = 0; + } + + /* Update the duplicate entry in the cursor. */ + cp->dpgno = cp->page->pgno; + cp->dindx = indx; + + return (0); +} + +#ifdef DEBUG +/* + * __bam_cprint -- + * Display the current btree cursor list. + */ +int +__bam_cprint(dbp) + DB *dbp; +{ + CURSOR *cp; + DBC *dbc; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + fprintf(stderr, + "%#0x: page: %lu index: %lu dpage %lu dindex: %lu", + (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, + (u_long)cp->dpgno, (u_long)cp->dindx); + if (F_ISSET(cp, C_DELETED)) + fprintf(stderr, "(deleted)"); + fprintf(stderr, "\n"); + } + DB_THREAD_UNLOCK(dbp); + return (0); +} +#endif /* DEBUG */ + +/* + * __bam_ca_delete -- + * Check if any of the cursors refer to the item we are about to delete. + * We'll return the number of cursors that refer to the item in question. + * If a cursor does refer to the item, then we set its deleted bit. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *)); + */ +int +__bam_ca_delete(dbp, pgno, indx, curs) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + CURSOR *curs; +{ + DBC *dbc; + CURSOR *cp; + int count; + + /* + * Adjust the cursors. We don't have to review the cursors for any + * process other than the current one, because we have the page write + * locked at this point, and any other process had better be using a + * different locker ID, meaning that only cursors in our process can + * be on the page. + * + * It's possible for multiple cursors within the thread to have write + * locks on the same page, but, cursors within a thread must be single + * threaded, so all we're locking here is the cursor linked list. + * + * indx refers to the first of what might be a duplicate set. The + * cursor passed in is the one initiating the delete, so we don't + * want to count it. + */ + DB_THREAD_LOCK(dbp); + for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if ((curs != cp && + cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) { + ++count; + F_SET(cp, C_DELETED); + } + } + DB_THREAD_UNLOCK(dbp); + return (count); +} + +/* + * __bam_ca_di -- + * Adjust the cursors during a delete or insert. + * + * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_di(dbp, pgno, indx, value) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int value; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx >= indx) + cp->indx += value; + if (cp->dpgno == pgno && cp->dindx >= indx) + cp->dindx += value; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving data items to a duplicates page. + * + * PUBLIC: void __bam_ca_dup __P((DB *, + * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +void +__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) + DB *dbp; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + CURSOR *cp; + DBC *dbc; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * No need to test duplicates, this only gets called when moving + * leaf page data items onto a duplicates page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + /* + * Ignore matching entries that have already been moved, + * we move from the same location on the leaf page more + * than once. + */ + if (cp->dpgno == PGNO_INVALID && + cp->pgno == fpgno && cp->indx == fi) { + cp->indx = first; + cp->dpgno = tpgno; + cp->dindx = ti; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_move -- + * Adjust the cursors when moving data items to another page. + * + * PUBLIC: void __bam_ca_move __P((DB *, BTREE *, db_pgno_t, db_pgno_t)); + */ +void +__bam_ca_move(dbp, t, fpgno, tpgno) + DB *dbp; + BTREE *t; + db_pgno_t fpgno, tpgno; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * No need to test duplicates, this only gets called when copying + * over the root page with a leaf or internal page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == fpgno) + cp->pgno = tpgno; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_replace -- + * Check if any of the cursors refer to the item we are about to replace. + * If so, their flags should be changed from deleted to replaced. + * + * PUBLIC: void __bam_ca_replace + * PUBLIC: __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); + */ +void +__bam_ca_replace(dbp, pgno, indx, pass) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + ca_replace_arg pass; +{ + CURSOR *cp; + DBC *dbc; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * Find any cursors that have logically deleted a record we're about + * to overwrite. + * + * Pass == REPLACE_SETUP: + * Set C_REPLACE_SETUP so we can find the cursors again. + * + * Pass == REPLACE_SUCCESS: + * Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the + * overwrite was successful. + * + * Pass == REPLACE_FAILED: + * Clear C_REPLACE_SETUP, the overwrite failed. + * + * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value + * for the cursor as it may have been changed by other cursor update + * routines as the item was deleted/inserted. + */ + DB_THREAD_LOCK(dbp); + switch (pass) { + case REPLACE_SETUP: /* Setup. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if ((cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) + F_SET(cp, C_REPLACE_SETUP); + } + break; + case REPLACE_SUCCESS: /* Overwrite succeeded. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (F_ISSET(cp, C_REPLACE_SETUP)) { + if (cp->dpgno == pgno) + cp->dindx = indx; + if (cp->pgno == pgno) + cp->indx = indx; + F_SET(cp, C_REPLACE); + F_CLR(cp, C_DELETED | C_REPLACE_SETUP); + } + } + break; + case REPLACE_FAILED: /* Overwrite failed. */ + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (F_ISSET(cp, C_REPLACE_SETUP)) { + if (cp->dpgno == pgno) + cp->dindx = indx; + if (cp->pgno == pgno) + cp->indx = indx; + F_CLR(cp, C_REPLACE_SETUP); + } + } + break; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. + * + * PUBLIC: void __bam_ca_split __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) + DB *dbp; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DBC *dbc; + CURSOR *cp; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == ppgno) + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + if (cp->dpgno == ppgno) + if (cp->dindx < split_indx) { + if (cleft) + cp->dpgno = lpgno; + } else { + cp->dpgno = rpgno; + cp->dindx -= split_indx; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_c_physdel -- + * Actually do the cursor deletion. + */ +static int +__bam_c_physdel(dbp, cp, h) + DB *dbp; + CURSOR *cp; + PAGE *h; +{ + BOVERFLOW bo; + BTREE *t; + DBT dbt; + DB_LOCK lock; + db_indx_t indx; + db_pgno_t pgno, next_pgno, prev_pgno; + int local, ret; + + t = dbp->internal; + ret = 0; + + /* Figure out what we're deleting. */ + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } + + /* + * If the item is referenced by another cursor, leave it up to that + * cursor to do the delete. + */ + if (__bam_ca_delete(dbp, pgno, indx, cp) != 0) + return (0); + + /* + * If we don't already have the page locked, get it and delete the + * items. + */ + if ((h == NULL || h->pgno != pgno)) { + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + return (ret); + local = 1; + } else + local = 0; + + /* + * If we're deleting a duplicate entry, call the common code to do + * the work. + */ + if (TYPE(h) == P_DUPLICATE) { + pgno = PGNO(h); + prev_pgno = PREV_PGNO(h); + next_pgno = NEXT_PGNO(h); + if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0) + goto err; + + /* + * There are 4 cases: + * + * 1. We removed an item on a page, but there are more items + * on the page. + * 2. We removed the last item on a page, removing the last + * duplicate. + * 3. We removed the last item on a page, but there is a + * following page of duplicates. + * 4. We removed the last item on a page, but there is a + * previous page of duplicates. + * + * In case 1, h != NULL, h->pgno == pgno + * In case 2, h == NULL, + * prev_pgno == PGNO_INVALID, next_pgno == PGNO_INVALID + * In case 3, h != NULL, next_pgno != PGNO_INVALID + * In case 4, h == NULL, prev_pgno != PGNO_INVALID + * + * In case 1, there's nothing else to do. + * In case 2, remove the entry from the parent page. + * In case 3 or 4, if the deleted page was the first in a chain + * of duplicate pages, update the parent page's entry. + * + * Test: + * If there were previous pages of duplicates or we didn't + * empty the current page of duplicates, we don't need to + * touch the parent page. + */ + if (PREV_PGNO(h) != PGNO_INVALID || + (h != NULL && pgno == h->pgno)) + goto done; + + /* + * Release any page we're holding and the lock on the deleted + * page. + */ + if (local) { + if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + } + + /* Acquire the parent page. */ + if ((ret = + __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) { + (void)__BT_TLPUT(dbp, lock); + goto err; + } + local = 1; + + /* + * If we deleted the last duplicate, we can fall out and do a + * normal btree delete in the context of the parent page. If + * not, we have to update the parent's page. + */ + indx = cp->indx; + if (next_pgno != PGNO_INVALID) { + /* + * Copy, delete, update and re-insert the parent page's + * entry. + */ + bo = *GET_BOVERFLOW(h, indx); + (void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE); + bo.pgno = next_pgno; + memset(&dbt, 0, sizeof(dbt)); + dbt.data = &bo; + dbt.size = BOVERFLOW_SIZE; + (void)__db_pitem(dbp, + h, indx, BOVERFLOW_SIZE, &dbt, NULL); + + /* Discard the parent page. */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + + goto done; + } + } + + /* Otherwise, do a normal btree delete. */ + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + + /* + * If the page is empty, delete it. To delete a leaf page we need a + * copy of a key from the page. We use the first one that was there, + * since it's the last key that the page held. We malloc the page + * information instead of using the return key/data memory because + * we've already set them -- the reason that we've already set them + * is because we're (potentially) about to do a reverse split, which + * would make our saved page information useless. + * + * XXX + * The following operations to delete a page might deadlock. I think + * that's OK. The problem is if we're deleting an item because we're + * closing cursors because we've already deadlocked and want to call + * txn_abort(). If we fail due to deadlock, we'll leave an locked + * empty page in the tree, which won't be empty long because we're + * going to undo the delete. + */ + if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; + if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0) + goto err; + + if (local) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + local = 0; + } + + ret = __bam_dpage(dbp, &dbt); + free(dbt.data); + } + +err: +done: if (local) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + } + + if (ret == 0) + ++t->lstat.bt_deleted; + return (ret); +} diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c new file mode 100644 index 0000000000..e7ec4dfe3e --- /dev/null +++ b/db2/btree/bt_delete.c @@ -0,0 +1,607 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_delete.c 10.18 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_dpages __P((DB *, BTREE *)); + +/* + * __bam_delete -- + * Delete the items referenced by a key. + * + * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, int)); + */ +int +__bam_delete(argdbp, txn, key, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key; + int flags; +{ + BTREE *t; + DB *dbp; + PAGE *h; + db_indx_t cnt, i, indx; + int dpage, exact, ret, stack; + + DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags); + + stack = 0; + + /* Check for invalid flags. */ + if ((ret = + __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + + /* Search the tree for the key; delete only deletes exact matches. */ + if ((ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact)) != 0) + goto err; + stack = 1; + h = t->bt_csp->page; + indx = t->bt_csp->indx; + + /* Delete the key/data pair, including any duplicates. */ + for (cnt = 1, i = indx;; ++cnt) + if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx]) + break; + for (; cnt > 0; --cnt, ++t->lstat.bt_deleted) + if (__bam_ca_delete(dbp, h->pgno, indx, NULL) != 0) { + GET_BKEYDATA(h, indx + O_INDX)->deleted = 1; + indx += P_INDX; + } else if ((ret = __bam_ditem(dbp, h, indx)) != 0 || + (ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + + /* If we're using record numbers, update internal page record counts. */ + if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0) + goto err; + + /* If the page is now empty, delete it. */ + dpage = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT; + + __bam_stkrel(dbp); + stack = 0; + + ret = dpage ? __bam_dpage(dbp, key) : 0; + +err: if (stack) + __bam_stkrel(dbp); + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_delete -- + * Delete the items referenced by a key. + * + * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, int)); + */ +int +__ram_delete(argdbp, txn, key, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key; + int flags; +{ + BKEYDATA bk; + BTREE *t; + DB *dbp; + DBT hdr, data; + PAGE *h; + db_indx_t indx; + db_recno_t recno; + int exact, ret, stack; + + stack = 0; + + /* Check for invalid flags. */ + if ((ret = + __db_delchk(argdbp, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(argdbp, key, &recno, 0)) != 0) + goto err; + + /* Search the tree for the key; delete only deletes exact matches. */ + if ((ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + + h = t->bt_csp->page; + indx = t->bt_csp->indx; + stack = 1; + + /* If the record has already been deleted, we couldn't have found it. */ + if (GET_BKEYDATA(h, indx)->deleted) { + ret = DB_KEYEMPTY; + goto done; + } + + /* + * If we're not renumbering records, replace the record with a marker + * and return. + */ + if (!F_ISSET(dbp, DB_RE_RENUMBER)) { + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + + bk.deleted = 1; + bk.type = B_KEYDATA; + bk.len = 0; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bk; + hdr.size = SSZA(BKEYDATA, data); + memset(&data, 0, sizeof(data)); + data.data = (char *) ""; + data.size = 0; + if ((ret = __db_pitem(dbp, + h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) + goto err; + + ++t->lstat.bt_deleted; + goto done; + } + + /* Delete the item. */ + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + + ++t->lstat.bt_deleted; + if (t->bt_recno != NULL) + F_SET(t->bt_recno, RECNO_MODIFIED); + + /* Adjust the counts. */ + __bam_adjust(dbp, t, -1); + + /* Adjust the cursors. */ + __ram_ca(dbp, recno, CA_DELETE); + + /* + * If the page is now empty, delete it -- we have the whole tree + * locked, so there are no preparations to make. Else, release + * the pages. + */ + if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { + stack = 0; + ret = __bam_dpages(dbp, t); + } + +done: +err: if (stack) + __bam_stkrel(dbp); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_ditem -- + * Delete one or more entries from a page. + * + * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t)); + */ +int +__bam_ditem(dbp, h, indx) + DB *dbp; + PAGE *h; + u_int32_t indx; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + u_int32_t nbytes; + int ret; + + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(h, indx); + switch (bi->type) { + case B_DUPLICATE: + case B_OVERFLOW: + nbytes = BINTERNAL_SIZE(bi->len); + goto offpage; + case B_KEYDATA: + nbytes = BKEYDATA_SIZE(bi->len); + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + case P_LBTREE: + /* + * If it's a duplicate key, discard the index and don't touch + * the actual page item. This works because no data item can + * have an index that matches any other index so even if the + * data item is in an index "slot", it won't match any other + * index. + */ + if (!(indx % 2)) { + if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + return (__bam_adjindx(dbp, + h, indx, indx - P_INDX, 0)); + if (indx < (u_int32_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + return (__bam_adjindx(dbp, + h, indx, indx + O_INDX, 0)); + } + /* FALLTHROUGH */ + case P_LRECNO: + bk = GET_BKEYDATA(h, indx); + switch (bk->type) { + case B_DUPLICATE: + case B_OVERFLOW: + nbytes = BOVERFLOW_SIZE; + +offpage: /* Delete duplicate/offpage chains. */ + bo = GET_BOVERFLOW(h, indx); + if (bo->type == B_DUPLICATE) { + if ((ret = + __db_ddup(dbp, bo->pgno, __bam_free)) != 0) + return (ret); + } else + if ((ret = + __db_doff(dbp, bo->pgno, __bam_free)) != 0) + return (ret); + break; + case B_KEYDATA: + nbytes = BKEYDATA_SIZE(bk->len); + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + + /* Delete the item. */ + if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0) + return (ret); + + /* Mark the page dirty. */ + return (memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)); +} + +/* + * __bam_adjindx -- + * Adjust an index on the page. + * + * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int)); + */ +int +__bam_adjindx(dbp, h, indx, indx_copy, is_insert) + DB *dbp; + PAGE *h; + u_int32_t indx, indx_copy; + int is_insert; +{ + db_indx_t copy; + int ret; + + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy, + (u_int32_t)is_insert)) != 0) + return (ret); + + if (is_insert) { + copy = h->inp[indx_copy]; + if (indx != NUM_ENT(h)) + memmove(&h->inp[indx + O_INDX], &h->inp[indx], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + h->inp[indx] = copy; + ++NUM_ENT(h); + } else { + --NUM_ENT(h); + if (indx != NUM_ENT(h)) + memmove(&h->inp[indx], &h->inp[indx + O_INDX], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + } + + /* Mark the page dirty. */ + ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); + + /* Adjust the cursors. */ + __bam_ca_di(dbp, h->pgno, indx, is_insert ? 1 : -1); + return (0); +} + +/* + * __bam_dpage -- + * Delete a page from the tree. + * + * PUBLIC: int __bam_dpage __P((DB *, const DBT *)); + */ +int +__bam_dpage(dbp, key) + DB *dbp; + const DBT *key; +{ + BTREE *t; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int exact, level, ret; + + ret = 0; + t = dbp->internal; + + /* + * The locking protocol is that we acquire locks by walking down the + * tree, to avoid the obvious deadlocks. + * + * Call __bam_search to reacquire the empty leaf page, but this time + * get both the leaf page and it's parent, locked. Walk back up the + * tree, until we have the top pair of pages that we want to delete. + * Once we have the top page that we want to delete locked, lock the + * underlying pages and check to make sure they're still empty. If + * they are, delete them. + */ + for (level = LEAFLEVEL;; ++level) { + /* Acquire a page and its parent, locked. */ + if ((ret = + __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0) + return (ret); + + /* + * If we reach the root or the page isn't going to be empty + * when we delete one record, quit. + */ + h = t->bt_csp[-1].page; + if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1) + break; + + /* Release the two locked pages. */ + (void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0); + (void)__BT_TLPUT(dbp, t->bt_csp[-1].lock); + (void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0); + (void)__BT_TLPUT(dbp, t->bt_csp[0].lock); + } + + /* + * Leave the stack pointer one after the last entry, we may be about + * to push more items on the stack. + */ + ++t->bt_csp; + + /* + * t->bt_csp[-2].page is the top page, which we're not going to delete, + * and t->bt_csp[-1].page is the first page we are going to delete. + * + * Walk down the chain, acquiring the rest of the pages until we've + * retrieved the leaf page. If we find any pages that aren't going + * to be emptied by the delete, someone else added something while we + * were walking the tree, and we discontinue the delete. + */ + for (h = t->bt_csp[-1].page;;) { + if (ISLEAF(h)) { + if (NUM_ENT(h) != 0) + goto release; + break; + } else + if (NUM_ENT(h) != 1) + goto release; + + /* + * Get the next page, write lock it and push it onto the stack. + * We know it's index 0, because it can only have one element. + */ + pgno = TYPE(h) == P_IBTREE ? + GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno; + + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + goto release; + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + goto release; + BT_STK_PUSH(t, h, 0, lock, ret); + if (ret != 0) + goto release; + } + + BT_STK_POP(t); + return (__bam_dpages(dbp, t)); + +release: + /* Discard any locked pages and return. */ + BT_STK_POP(t); + __bam_stkrel(dbp); + return (ret); +} + +/* + * __bam_dpages -- + * Delete a set of locked pages. + */ +static int +__bam_dpages(dbp, t) + DB *dbp; + BTREE *t; +{ + DBT a, b; + DB_LOCK lock; + EPG *epg; + PAGE *h; + db_pgno_t pgno; + db_recno_t rcnt; + int ret; + + rcnt = 0; /* XXX: Shut the compiler up. */ + epg = t->bt_sp; + + /* + * !!! + * There is an interesting deadlock situation here. We have to relink + * the leaf page chain around the leaf page being deleted. Consider + * a cursor walking through the leaf pages, that has the previous page + * read-locked and is waiting on a lock for the page we're deleting. + * It will deadlock here. This is a problem, because if our process is + * selected to resolve the deadlock, we'll leave an empty leaf page + * that we can never again access by walking down the tree. So, before + * we unlink the subtree, we relink the leaf page chain. + */ + if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0) + goto release; + + /* + * We have the entire stack of deletable pages locked. Start from the + * top of the tree and move to the bottom, as it's better to release + * the inner pages as soon as possible. + */ + if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0) + goto release; + + /* + * If we deleted the next-to-last item from the root page, the tree + * has collapsed a level. Try and write lock the remaining root + 1 + * page and copy it onto the root page. If we can't get the lock, + * that's okay, the tree just stays a level deeper than we'd like. + */ + h = epg->page; + if (h->pgno == PGNO_ROOT && NUM_ENT(h) == 1) { + pgno = TYPE(epg->page) == P_IBTREE ? + GET_BINTERNAL(epg->page, 0)->pgno : + GET_RINTERNAL(epg->page, 0)->pgno; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + goto release; + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + goto release; + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + memset(&a, 0, sizeof(a)); + a.data = h; + a.size = dbp->pgsize; + memset(&b, 0, sizeof(b)); + b.data = P_ENTRY(epg->page, 0); + b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); + __bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn, + &h->lsn, 0, dbp->log_fileid, h->pgno, &a, &b, + &epg->page->lsn); + } + + /* + * Make the switch. + * + * One fixup -- if the tree has record numbers and we're not + * converting to a leaf page, we have to preserve the total + * record count. + */ + if (TYPE(h) == P_IRECNO || + (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + rcnt = RE_NREC(epg->page); + memcpy(epg->page, h, dbp->pgsize); + epg->page->pgno = PGNO_ROOT; + if (TYPE(h) == P_IRECNO || + (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + RE_NREC_SET(epg->page, rcnt); + + /* Free the last page in that level of the btree. */ + ++t->lstat.bt_freed; + (void)__bam_free(dbp, h); + + /* Adjust the cursors. */ + __bam_ca_move(dbp, t, h->pgno, PGNO_ROOT); + + (void)__BT_TLPUT(dbp, lock); + } + + /* Release the top page in the subtree. */ + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbp, epg->lock); + + /* + * Free the rest of the pages. + * + * XXX + * Don't bother checking for errors. We've unlinked the subtree from + * the tree, and there's no possibility of recovery. + */ + for (; ++epg <= t->bt_csp; ++t->lstat.bt_freed) { + if (NUM_ENT(epg->page) != 0) + (void)__bam_ditem(dbp, epg->page, epg->indx); + + (void)__bam_free(dbp, epg->page); + (void)__BT_TLPUT(dbp, epg->lock); + } + return (0); + +release: + /* Discard any remaining pages and return. */ + for (; epg <= t->bt_csp; ++epg) { + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbp, epg->lock); + } + return (ret); +} diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c new file mode 100644 index 0000000000..354888c6c2 --- /dev/null +++ b/db2/btree/bt_open.c @@ -0,0 +1,355 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_open.c 10.20 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +/* + * Implementation of btree access method for 4.4BSD. + * + * The design here was originally based on that of the btree access method + * used in the Postgres database system at UC Berkeley. This implementation + * is wholly independent of the Postgres code. + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "common_ext.h" + +static int __bam_keyalloc __P((BTREE *)); +static int __bam_setmeta __P((DB *, BTREE *)); + +/* + * __bam_open -- + * Open a btree. + * + * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *)); + */ +int +__bam_open(dbp, type, dbinfo) + DB *dbp; + DBTYPE type; + DB_INFO *dbinfo; +{ + BTREE *t; + int ret; + + /* Allocate the btree internal structure. */ + if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL) + return (ENOMEM); + + t->bt_sp = t->bt_csp = t->bt_stack; + t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); + + if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) && + (ret = __bam_keyalloc(t)) != 0) + goto err; + + /* + * Intention is to make sure all of the user's selections are okay + * here and then use them without checking. + */ + if (dbinfo != NULL) { + /* Minimum number of keys per page. */ + if (dbinfo->bt_minkey == 0) + t->bt_minkey = DEFMINKEYPAGE; + else { + if (dbinfo->bt_minkey < 2) + goto einval; + t->bt_minkey = dbinfo->bt_minkey; + } + + /* Maximum number of keys per page. */ + if (dbinfo->bt_maxkey == 0) + t->bt_maxkey = 0; + else { + if (dbinfo->bt_maxkey < 1) + goto einval; + t->bt_maxkey = dbinfo->bt_maxkey; + } + + /* + * If no comparison, use default comparison. If no comparison + * and no prefix, use default prefix. (We can't default the + * prefix if the user supplies a comparison routine; shortening + * the keys may break their comparison algorithm.) + */ + t->bt_compare = dbinfo->bt_compare == NULL ? + __bam_defcmp : dbinfo->bt_compare; + t->bt_prefix = dbinfo->bt_prefix == NULL ? + (dbinfo->bt_compare == NULL ? + __bam_defpfx : NULL) : dbinfo->bt_prefix; + } else { + t->bt_minkey = DEFMINKEYPAGE; + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + } + + /* Initialize the remaining fields of the DB. */ + dbp->type = type; + dbp->internal = t; + dbp->cursor = __bam_cursor; + dbp->del = __bam_delete; + dbp->get = __bam_get; + dbp->put = __bam_put; + dbp->stat = __bam_stat; + dbp->sync = __bam_sync; + + /* + * The btree data structure requires that at least two key/data pairs + * can fit on a page, but other than that there's no fixed requirement. + * Translate the minimum number of items into the bytes a key/data pair + * can use before being placed on an overflow page. We calculate for + * the worst possible alignment by assuming every item requires the + * maximum alignment for padding. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. + */ + t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX) + - (BKEYDATA_PSIZE(0) + ALIGN(1, 4)); + + /* Create a root page if new tree. */ + if ((ret = __bam_setmeta(dbp, t)) != 0) + goto err; + + return (0); + +einval: ret = EINVAL; + +err: if (t != NULL) { + /* If we allocated room for key/data return, discard it. */ + if (t->bt_rkey.data != NULL) + free(t->bt_rkey.data); + + FREE(t, sizeof(BTREE)); + } + return (ret); +} + +/* + * __bam_bdup -- + * Create a BTREE handle for a threaded DB handle. + * + * PUBLIC: int __bam_bdup __P((DB *, DB *)); + */ +int +__bam_bdup(orig, new) + DB *orig, *new; +{ + BTREE *t, *ot; + int ret; + + ot = orig->internal; + + if ((t = (BTREE *)calloc(1, sizeof(*t))) == NULL) + return (ENOMEM); + + /* + * !!! + * Ignore the cursor queue, only the first DB has attached cursors. + */ + + t->bt_sp = t->bt_csp = t->bt_stack; + t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); + + if ((orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) && + (ret = __bam_keyalloc(t)) != 0) { + FREE(t, sizeof(*t)); + return (ret); + } + + t->bt_maxkey = ot->bt_maxkey; + t->bt_minkey = ot->bt_minkey; + t->bt_compare = ot->bt_compare; + t->bt_prefix = ot->bt_prefix; + t->bt_ovflsize = ot->bt_ovflsize; + + /* + * !!! + * The entire RECNO structure is shared. If it breaks, the application + * was misusing it to start with. + */ + t->bt_recno = ot->bt_recno; + + new->internal = t; + + return (0); +} + +/* + * __bam_keyalloc -- + * Allocate return memory for recno keys. + */ +static int +__bam_keyalloc(t) + BTREE *t; +{ + /* + * Recno keys are always the same size, and we don't want to have + * to check for space on each return. Allocate it now. + */ + if ((t->bt_rkey.data = (void *)malloc(sizeof(db_recno_t))) == NULL) + return (ENOMEM); + t->bt_rkey.ulen = sizeof(db_recno_t); + return (0); +} + +/* + * __bam_setmeta -- + * Check (and optionally create) a tree. + */ +static int +__bam_setmeta(dbp, t) + DB *dbp; + BTREE *t; +{ + BTMETA *meta; + PAGE *root; + DB_LOCK mlock, rlock; + db_pgno_t pgno; + int ret; + + /* Get, and optionally create the metadata page. */ + pgno = PGNO_METADATA; + if ((ret = + __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &mlock)) != 0) + return (ret); + if ((ret = + __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) { + (void)__BT_LPUT(dbp, mlock); + return (ret); + } + + /* + * If the magic number is correct, we're not creating the tree. + * Correct any fields that may not be right. Note, all of the + * local flags were set by db_open(3). + */ + if (meta->magic != 0) { + t->bt_maxkey = meta->maxkey; + t->bt_minkey = meta->minkey; + + (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); + (void)__BT_LPUT(dbp, mlock); + return (0); + } + + /* Initialize the tree structure metadata information. */ + ZERO_LSN(meta->lsn); + meta->pgno = PGNO_METADATA; + meta->magic = DB_BTREEMAGIC; + meta->version = DB_BTREEVERSION; + meta->pagesize = dbp->pgsize; + meta->maxkey = t->bt_maxkey; + meta->minkey = t->bt_minkey; + meta->free = PGNO_INVALID; + meta->flags = 0; + if (dbp->type == DB_RECNO) + F_SET(meta, BTM_RECNO); + if (F_ISSET(dbp, DB_AM_DUP)) + F_SET(meta, BTM_DUP); + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) + F_SET(meta, BTM_FIXEDLEN); + if (F_ISSET(dbp, DB_BT_RECNUM)) + F_SET(meta, BTM_RECNUM); + if (F_ISSET(dbp, DB_RE_RENUMBER)) + F_SET(meta, BTM_RENUMBER); + meta->re_len = 0; + meta->re_pad = 0; + memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN); + + /* Create and initialize a root page. */ + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rlock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) { + (void)__BT_LPUT(dbp, rlock); + return (ret); + } + P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID, + PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE); + ZERO_LSN(root->lsn); + + /* Release the metadata and root pages. */ + if ((ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0) + return (ret); + if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0) + return (ret); + + /* + * Flush the metadata and root pages to disk -- since the user can't + * transaction protect open, the pages have to exist during recovery. + * + * XXX + * It's not useful to return not-yet-flushed here -- convert it to + * an error. + */ + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) + ret = EINVAL; + + /* Release the locks. */ + (void)__BT_LPUT(dbp, mlock); + (void)__BT_LPUT(dbp, rlock); + + return (ret); +} diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c new file mode 100644 index 0000000000..7ee74ffcf8 --- /dev/null +++ b/db2/btree/bt_page.c @@ -0,0 +1,312 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_page.c 10.5 (Sleepycat) 8/18/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_new -- + * Get a new page, preferably from the freelist. + * + * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **)); + */ +int +__bam_new(dbp, type, pagepp) + DB *dbp; + u_int32_t type; + PAGE **pagepp; +{ + BTMETA *meta; + DB_LOCK mlock; + PAGE *h; + db_pgno_t pgno; + int ret; + + meta = NULL; + h = NULL; + mlock = LOCK_INVALID; + + pgno = PGNO_METADATA; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + goto err; + + if (meta->free == PGNO_INVALID) { + if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0) + goto err; + ZERO_LSN(h->lsn); + h->pgno = pgno; + } else { + pgno = meta->free; + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + goto err; + meta->free = h->next_pgno; + } + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn, + &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn, + h->pgno, (u_int32_t)type, meta->free)) != 0) + goto err; + LSN(h) = LSN(meta); + } + + (void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, mlock); + + P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); + *pagepp = h; + return (0); + +err: if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + if (meta != NULL) + (void)memp_fput(dbp->mpf, meta, 0); + if (mlock != LOCK_INVALID) + (void)__BT_TLPUT(dbp, mlock); + return (ret); +} + +/* + * __bam_free -- + * Add a page to the head of the freelist. + * + * PUBLIC: int __bam_free __P((DB *, PAGE *)); + */ +int +__bam_free(dbp, h) + DB *dbp; + PAGE *h; +{ + BTMETA *meta; + DBT ldbt; + DB_LOCK mlock; + db_pgno_t pgno; + int is_dirty, ret, t_ret; + + /* + * Retrieve the metadata page and insert the page at the head of + * the free list. If either the lock get or page get routines + * fail, then we need to put the page with which we were called + * back because our caller assumes we take care of it. + */ + is_dirty = 0; + pgno = PGNO_METADATA; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &mlock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) { + (void)__BT_TLPUT(dbp, mlock); + goto err; + } + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + memset(&ldbt, 0, sizeof(ldbt)); + ldbt.data = h; + ldbt.size = P_OVERHEAD; + if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info, + dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno, + &meta->lsn, &ldbt, meta->free)) != 0) { + (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); + (void)__BT_TLPUT(dbp, mlock); + return (ret); + } + LSN(h) = LSN(meta); + } + + /* + * The page should have nothing interesting on it, re-initialize it, + * leaving only the page number and the LSN. + */ +#ifdef DEBUG + { db_pgno_t __pgno; DB_LSN __lsn; + __pgno = h->pgno; + __lsn = h->lsn; + memset(h, 0xff, dbp->pgsize); + h->pgno = __pgno; + h->lsn = __lsn; + } +#endif + P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, meta->free, 0, P_INVALID); + + /* Link the page on the metadata free list. */ + meta->free = h->pgno; + + /* Discard the metadata page. */ + ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); + if ((t_ret = __BT_TLPUT(dbp, mlock)) != 0) + ret = t_ret; + + /* Discard the caller's page reference. */ + is_dirty = DB_MPOOL_DIRTY; +err: if ((t_ret = memp_fput(dbp->mpf, h, is_dirty)) != 0 && ret == 0) + ret = t_ret; + + /* + * XXX + * We have to unlock the caller's page in the caller! + */ + return (ret); +} + +#ifdef DEBUG +/* + * __bam_lt -- + * Print out the list of currently held locks. + */ +int +__bam_lt(dbp) + DB *dbp; +{ + DB_LOCKREQ req; + + if (F_ISSET(dbp, DB_AM_LOCKING)) { + req.op = DB_LOCK_DUMP; + lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &req, 1, NULL); + } + return (0); +} +#endif + +/* + * __bam_lget -- + * The standard lock get call. + * + * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); + */ +int +__bam_lget(dbp, do_couple, pgno, mode, lockp) + DB *dbp; + int do_couple; + db_pgno_t pgno; + db_lockmode_t mode; + DB_LOCK *lockp; +{ + DB_LOCKREQ couple[2]; + u_int32_t locker; + int ret; + + if (!F_ISSET(dbp, DB_AM_LOCKING)) + return (0); + + locker = dbp->txn == NULL ? dbp->locker : dbp->txn->txnid; + dbp->lock.pgno = pgno; + + /* + * If the object not currently locked, acquire the lock and return, + * otherwise, lock couple. If we fail and it's not a system error, + * convert to EAGAIN. + */ + if (do_couple) { + couple[0].op = DB_LOCK_GET; + couple[0].obj = &dbp->lock_dbt; + couple[0].mode = mode; + couple[1].op = DB_LOCK_PUT; + couple[1].lock = *lockp; + + ret = lock_vec(dbp->dbenv->lk_info, locker, 0, couple, 2, NULL); + if (ret != 0) { + /* If we fail, discard the lock we held. */ + __bam_lput(dbp, *lockp); + + return (ret < 0 ? EAGAIN : ret); + } + *lockp = couple[0].lock; + } else { + ret = lock_get(dbp->dbenv->lk_info, + locker, 0, &dbp->lock_dbt, mode, lockp); + return (ret < 0 ? EAGAIN : ret); + } + return (0); +} + +/* + * __bam_lput -- + * The standard lock put call. + * + * PUBLIC: int __bam_lput __P((DB *, DB_LOCK)); + */ +int +__bam_lput(dbp, lock) + DB *dbp; + DB_LOCK lock; +{ + return (__BT_LPUT(dbp, lock)); +} + +/* + * __bam_pget -- + * The standard page get call. + * + * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, int)); + */ +int +__bam_pget(dbp, hp, pgnop, mflags) + DB *dbp; + PAGE **hp; + db_pgno_t *pgnop; + int mflags; +{ + return (memp_fget((dbp)->mpf, + pgnop, mflags, hp) == 0 ? 0 : __db_pgerr(dbp, *pgnop)); +} diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c new file mode 100644 index 0000000000..632c3d185b --- /dev/null +++ b/db2/btree/bt_put.c @@ -0,0 +1,919 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_put.c 10.23 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_fixed __P((BTREE *, DBT *)); +static int __bam_lookup __P((DB *, DBT *, int *)); +static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); +static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t)); + +/* + * __bam_put -- + * Add a new key/data pair or replace an existing pair (btree). + * + * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); + */ +int +__bam_put(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + CURSOR c; + DB *dbp; + PAGE *h; + db_indx_t indx; + int exact, iflags, newkey, replace, ret, stack; + + DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags); + + /* Check flags. */ + if ((ret = __db_putchk(argdbp, key, data, flags, + F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + +retry: /* + * Find the location at which to insert. The call to bt_lookup() + * leaves the returned page pinned. + */ + if ((ret = __bam_lookup(dbp, key, &exact)) != 0) { + PUTHANDLE(dbp); + return (ret); + } + h = t->bt_csp->page; + indx = t->bt_csp->indx; + stack = 1; + + /* + * If an identical key is already in the tree, and DB_NOOVERWRITE is + * set, an error is returned. If an identical key is already in the + * tree and DB_NOOVERWRITE is not set, the key is either added (when + * duplicates are permitted) or an error is returned. The exception + * is when the item located is referenced by a cursor and marked for + * deletion, in which case we permit the overwrite and flag the cursor. + */ + replace = 0; + if (exact && flags == DB_NOOVERWRITE) { + if (!GET_BKEYDATA(h, indx + O_INDX)->deleted) { + ret = DB_KEYEXIST; + goto err; + } + replace = 1; + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); + } + + /* + * If we're inserting into the first or last page of the tree, + * remember where we did it so we can do fast lookup next time. + * + * XXX + * Does reverse order still work (did it ever!?!?) + */ + t->bt_lpgno = + h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ? + h->pgno : PGNO_INVALID; + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, we're either adding a new duplicate at the + * end of the duplicate set, or we're replacing the data item with a + * new data item. If the key isn't an exact match, we're inserting + * a new key/data pair, before the search location. + */ + newkey = dbp->type == DB_BTREE && !exact; + if (exact) { + if (F_ISSET(dbp, DB_AM_DUP)) { + /* + * Make sure that we're not looking at a page of + * duplicates -- if so, move to the last entry on + * that page. + */ + c.page = h; + c.pgno = h->pgno; + c.indx = indx; + c.dpgno = PGNO_INVALID; + c.dindx = 0; + if ((ret = + __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0) + goto err; + if (c.dpgno != PGNO_INVALID) { + /* + * XXX + * The __bam_ovfl_chk() routine memp_fput() the + * current page and acquired a new one, but did + * not do anything about the lock we're holding. + */ + t->bt_csp->page = h = c.page; + indx = c.dindx; + } + iflags = DB_AFTER; + } else + iflags = DB_CURRENT; + } else + iflags = DB_BEFORE; + + /* + * The pages we're using may be modified by __bam_iitem(), so make + * sure we reset the stack. + */ + ret = __bam_iitem(dbp, + &h, &indx, key, data, iflags, newkey ? BI_NEWKEY : 0); + t->bt_csp->page = h; + t->bt_csp->indx = indx; + + switch (ret) { + case 0: + /* + * Done. Clean up the cursor, and, if we're doing record + * numbers, adjust the internal page counts. + */ + if (replace) + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); + + if (!replace && F_ISSET(dbp, DB_BT_RECNUM)) + ret = __bam_adjust(dbp, t, 1); + break; + case DB_NEEDSPLIT: + /* + * We have to split the page. Back out the cursor setup, + * discard the stack of pages, and do the split. + */ + if (replace) { + replace = 0; + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); + } + + (void)__bam_stkrel(dbp); + stack = 0; + + if ((ret = __bam_split(dbp, key)) != 0) + break; + + goto retry; + /* NOTREACHED */ + default: + if (replace) + __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); + break; + } + +err: if (stack) + (void)__bam_stkrel(dbp); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_lookup -- + * Find the right location in the tree for the key. + */ +static int +__bam_lookup(dbp, key, exactp) + DB *dbp; + DBT *key; + int *exactp; +{ + BTREE *t; + DB_LOCK lock; + EPG e; + PAGE *h; + db_indx_t indx; + int cmp, ret; + + t = dbp->internal; + h = NULL; + + /* + * Record numbers can't be fast-tracked, we have to lock the entire + * tree. + */ + if (F_ISSET(dbp, DB_BT_RECNUM)) + goto slow; + + /* Check to see if we've been seeing sorted input. */ + if (t->bt_lpgno == PGNO_INVALID) + goto slow; + + /* + * Retrieve the page on which we did the last insert. It's okay if + * it doesn't exist, or if it's not the page type we expect, it just + * means that the world changed. + */ + if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) + goto miss; + if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) { + (void)__BT_LPUT(dbp, lock); + goto miss; + } + if (TYPE(h) != P_LBTREE) + goto miss; + if (NUM_ENT(h) == 0) + goto miss; + + /* + * We have to be at the end or beginning of the tree to know that + * we're inserting in a sort order. If that's the case and we're + * in the right order in comparison to the first/last key/data pair, + * we have the right position. + */ + if (h->next_pgno == PGNO_INVALID) { + e.page = h; + e.indx = NUM_ENT(h) - P_INDX; + if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) { + if (cmp > 0) + e.indx += P_INDX; + goto fast; + } + } + if (h->prev_pgno == PGNO_INVALID) { + e.page = h; + e.indx = 0; + if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) { + /* + * We're doing a put, so we want to insert as the last + * of any set of duplicates. + */ + if (cmp == 0) { + for (indx = 0; + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]; + indx += P_INDX); + e.indx = indx; + } + goto fast; + } + } + goto miss; + + /* Set the exact match flag in case we've already inserted this key. */ +fast: *exactp = cmp == 0; + + /* Enter the entry in the stack. */ + BT_STK_CLR(t); + BT_STK_ENTER(t, e.page, e.indx, lock, ret); + if (ret != 0) + return (ret); + + ++t->lstat.bt_cache_hit; + return (0); + +miss: ++t->lstat.bt_cache_miss; + if (h != NULL) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + } + +slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp)); +} + +/* + * OVPUT -- + * Copy an overflow item onto a page. + */ +#undef OVPUT +#define OVPUT(h, indx, bo) do { \ + DBT __hdr; \ + memset(&__hdr, 0, sizeof(__hdr)); \ + __hdr.data = &bo; \ + __hdr.size = BOVERFLOW_SIZE; \ + if ((ret = __db_pitem(dbp, \ + h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ + return (ret); \ +} while (0) + +/* + * __bam_iitem -- + * Insert an item into the tree. + * + * PUBLIC: int __bam_iitem __P((DB *, + * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, int, int)); + */ +int +__bam_iitem(dbp, hp, indxp, key, data, op, flags) + DB *dbp; + PAGE **hp; + db_indx_t *indxp; + DBT *key, *data; + int op, flags; +{ + BTREE *t; + BKEYDATA *bk; + BOVERFLOW kbo, dbo; + DBT tdbt; + PAGE *h; + db_indx_t indx; + u_int32_t have_bytes, need_bytes, needed; + int bigkey, bigdata, dcopy, dupadjust, ret; + + t = dbp->internal; + h = *hp; + indx = *indxp; + + dupadjust = 0; + bk = NULL; /* XXX: Shut the compiler up. */ + + /* + * If it's a page of duplicates, call the common code to do the work. + * + * !!! + * Here's where the hp and indxp are important. The duplicate code + * may decide to rework/rearrange the pages and indices we're using, + * so the caller must understand that the stack has to change. + */ + if (TYPE(h) == P_DUPLICATE) { + /* Adjust the index for the new item if it's a DB_AFTER op. */ + if (op == DB_AFTER) + ++*indxp; + + /* Remove the current item if it's a DB_CURRENT op. */ + if (op == DB_CURRENT && (ret = __db_ditem(dbp, *hp, *indxp, + BKEYDATA_SIZE(GET_BKEYDATA(*hp, *indxp)->len))) != 0) + return (ret); + + /* Put the new/replacement item onto the page. */ + return (__db_dput(dbp, data, hp, indxp, __bam_new)); + } + + /* + * XXX + * Handle partial puts. + * + * This is truly awful from a performance standput. We don't optimize + * for partial puts at all, we delete the record and add it back in, + * regardless of size or if we're simply overwriting current data. + * The hash access method does this a lot better than we do, and we're + * eventually going to have to fix it. + */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + tdbt = *data; + if ((ret = __bam_partial(dbp, &tdbt, h, indx)) != 0) + return (ret); + data = &tdbt; + } + + /* If it's a short fixed-length record, fix it up. */ + if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) { + tdbt = *data; + if ((ret = __bam_fixed(t, &tdbt)) != 0) + return (ret); + data = &tdbt; + } + + /* + * If the key or data item won't fit on a page, store it in the + * overflow pages. + * + * !!! + * From this point on, we have to recover the allocated overflow + * pages on error. + */ + bigkey = bigdata = 0; + if (LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize) { + kbo.deleted = 0; + kbo.type = B_OVERFLOW; + kbo.tlen = key->size; + if ((ret = __db_poff(dbp, key, &kbo.pgno, __bam_new)) != 0) + goto err; + bigkey = 1; + } + if (data->size > t->bt_ovflsize) { + dbo.deleted = 0; + dbo.type = B_OVERFLOW; + dbo.tlen = data->size; + if ((ret = __db_poff(dbp, data, &dbo.pgno, __bam_new)) != 0) + goto err; + bigdata = 1; + } + + dcopy = 0; + needed = 0; + if (LF_ISSET(BI_NEWKEY)) { + /* If BI_NEWKEY is set we're adding a new key and data pair. */ + if (bigkey) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(key->size); + if (bigdata) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(data->size); + } else { + /* + * We're either overwriting the data item of a key/data pair + * or we're adding the data item only, i.e. a new duplicate. + */ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(h, + indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (bk->type == B_OVERFLOW) + have_bytes = BOVERFLOW_PSIZE; + else + have_bytes = BKEYDATA_PSIZE(bk->len); + need_bytes = 0; + } else { + have_bytes = 0; + need_bytes = sizeof(db_indx_t); + } + if (bigdata) + need_bytes += BOVERFLOW_PSIZE; + else + need_bytes += BKEYDATA_PSIZE(data->size); + + /* + * If we're overwriting a data item, we copy it if it's not a + * special record type and it's the same size (including any + * alignment) and do a delete/insert otherwise. + */ + if (op == DB_CURRENT && !bigdata && + bk->type == B_KEYDATA && have_bytes == need_bytes) + dcopy = 1; + if (have_bytes < need_bytes) + needed += need_bytes - have_bytes; + } + + /* + * If there's not enough room, or the user has put a ceiling on the + * number of keys permitted in the page, split the page. + * + * XXX + * The t->bt_maxkey test here may be insufficient -- do we have to + * check in the btree split code, so we don't undo it there!?!? + */ + if (P_FREESPACE(h) < needed || + (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) { + ret = DB_NEEDSPLIT; + goto err; + } + + /* + * The code breaks it up into six cases: + * + * 1. Append a new key/data pair. + * 2. Insert a new key/data pair. + * 3. Copy the data item. + * 4. Delete/insert the data item. + * 5. Append a new data item. + * 6. Insert a new data item. + */ + if (LF_ISSET(BI_NEWKEY)) { + switch (op) { + case DB_AFTER: /* 1. Append a new key/data pair. */ + indx += 2; + *indxp += 2; + break; + case DB_BEFORE: /* 2. Insert a new key/data pair. */ + break; + default: + abort(); + } + + /* Add the key. */ + if (bigkey) + OVPUT(h, indx, kbo); + else { + DBT __data; + memset(&__data, 0, sizeof(__data)); + __data.data = key->data; + __data.size = key->size; + if ((ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(key->size), NULL, &__data)) != 0) + goto err; + } + ++indx; + } else { + switch (op) { + case DB_CURRENT: /* 3. Copy the data item. */ + /* + * If we're not logging and it's possible, overwrite + * the current item. + * + * XXX + * We should add a separate logging message so that + * we can do this anytime it's possible, including + * for partial record puts. + */ + if (dcopy && !DB_LOGGING(dbp)) { + bk->len = data->size; + memcpy(bk->data, data->data, data->size); + goto done; + } + /* 4. Delete/insert the data item. */ + if (TYPE(h) == P_LBTREE) + ++indx; + if ((ret = __bam_ditem(dbp, h, indx)) != 0) + goto err; + break; + case DB_AFTER: /* 5. Append a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* + * Adjust the cursor and copy in the key for + * the duplicate. + */ + if ((ret = __bam_adjindx(dbp, + h, indx + P_INDX, indx, 1)) != 0) + goto err; + + indx += 3; + dupadjust = 1; + + *indxp += 2; + } else { + ++indx; + __bam_ca_di(dbp, h->pgno, indx, 1); + + *indxp += 1; + } + break; + case DB_BEFORE: /* 6. Insert a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* + * Adjust the cursor and copy in the key for + * the duplicate. + */ + if ((ret = + __bam_adjindx(dbp, h, indx, indx, 1)) != 0) + goto err; + + ++indx; + dupadjust = 1; + } else + __bam_ca_di(dbp, h->pgno, indx, 1); + break; + default: + abort(); + } + } + + /* Add the data. */ + if (bigdata) + OVPUT(h, indx, dbo); + else { + BKEYDATA __bk; + DBT __hdr, __data; + memset(&__data, 0, sizeof(__data)); + __data.data = data->data; + __data.size = data->size; + + if (LF_ISSET(BI_DELETED)) { + __bk.len = __data.size; + __bk.deleted = 1; + __bk.type = B_KEYDATA; + __hdr.data = &__bk; + __hdr.size = SSZA(BKEYDATA, data); + ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(__data.size), &__hdr, &__data); + } else + ret = __db_pitem(dbp, h, indx, + BKEYDATA_SIZE(data->size), NULL, &__data); + if (ret != 0) + goto err; + } + +done: ++t->lstat.bt_added; + + ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); + + /* + * If the page is at least 50% full, and we added a duplicate, see if + * that set of duplicates takes up at least 25% of the space. If it + * does, move it off onto its own page. + */ + if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { + --indx; + if ((ret = __bam_ndup(dbp, h, indx)) != 0) + goto err; + } + + if (t->bt_recno != NULL) + F_SET(t->bt_recno, RECNO_MODIFIED); + + if (0) { +err: if (bigkey) + (void)__db_doff(dbp, kbo.pgno, __bam_free); + if (bigdata) + (void)__db_doff(dbp, dbo.pgno, __bam_free); + } + return (ret); +} + +/* + * __bam_ndup -- + * Check to see if the duplicate set at indx should have its own page. + * If it should, create it. + */ +static int +__bam_ndup(dbp, h, indx) + DB *dbp; + PAGE *h; + u_int32_t indx; +{ + BKEYDATA *bk; + BOVERFLOW bo; + DBT hdr; + PAGE *cp; + db_indx_t cnt, cpindx, first, sz; + int ret; + + while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) { + if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx]) + break; + bk = GET_BKEYDATA(h, indx); + sz += bk->type == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + bk = GET_BKEYDATA(h, indx + O_INDX); + sz += bk->type == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + } + + /* + * If this set of duplicates is using more than 25% of the page, move + * them off. The choice of 25% is a WAG, but it has to be small enough + * that we can always split regardless of the presence of duplicates. + */ + if (sz < dbp->pgsize / 4) + return (0); + + /* Get a new page. */ + if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0) + return (ret); + + /* + * Move this set of duplicates off the page. First points to the first + * key of the first duplicate key/data pair, cnt is the number of pairs + * we're dealing with. + */ + memset(&hdr, 0, sizeof(hdr)); + for (indx = first + O_INDX, cpindx = 0;; ++cpindx) { + /* Copy the entry to the new page. */ + bk = GET_BKEYDATA(h, indx); + hdr.data = bk; + hdr.size = bk->type == B_KEYDATA ? + BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; + if ((ret = + __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0) + goto err; + + /* + * Move cursors referencing the old entry to the new entry. + * Done after the page put because __db_pitem() adjusts + * cursors on the new page, and before the delete because + * __db_ditem adjusts cursors on the old page. + */ + __bam_ca_dup(dbp, + PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx); + + /* Delete the data item. */ + if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0) + goto err; + + /* Delete all but the first reference to the key. */ + if (--cnt == 0) + break; + if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0) + goto err; + } + + /* Put in a new data item that points to the duplicates page. */ + bo.deleted = 0; + bo.type = B_DUPLICATE; + bo.pgno = cp->pgno; + bo.tlen = 0; + + OVPUT(h, indx, bo); + + return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY)); + +err: (void)__bam_free(dbp, cp); + return (ret); +} + +/* + * __bam_fixed -- + * Build the real record for a fixed length put. + */ +static int +__bam_fixed(t, dbt) + BTREE *t; + DBT *dbt; +{ + RECNO *rp; + + rp = t->bt_recno; + + /* + * If using fixed-length records, and the record is long, return + * EINVAL. If it's short, pad it out. Use the record data return + * memory, it's only short-term. + */ + if (dbt->size > rp->re_len) + return (EINVAL); + if (t->bt_rdata.ulen < rp->re_len) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = rp->re_len; + } + memcpy(t->bt_rdata.data, dbt->data, dbt->size); + memset((u_int8_t *)t->bt_rdata.data + dbt->size, + rp->re_pad, rp->re_len - dbt->size); + + /* Set the DBT to reference our new record. */ + t->bt_rdata.size = rp->re_len; + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + *dbt = t->bt_rdata; + return (0); +} + +/* + * __bam_partial -- + * Build the real record for a partial put. + */ +static int +__bam_partial(dbp, dbt, h, indx) + DB *dbp; + DBT *dbt; + PAGE *h; + u_int32_t indx; +{ + BTREE *t; + BKEYDATA *bk, tbk; + BOVERFLOW *bo; + DBT copy; + u_int32_t len, nbytes, tlen; + int ret; + u_int8_t *p; + + bo = NULL; /* XXX: Shut the compiler up. */ + t = dbp->internal; + + /* + * Figure out how much total space we'll need. Worst case is where + * the record is 0 bytes long, in which case doff causes the record + * to extend, and the put data is appended to it. + */ + if (indx < NUM_ENT(h)) { + bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (bk->type == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + nbytes = bo->tlen; + } else + nbytes = bk->len; + } else { + bk = &tbk; + bk->type = B_KEYDATA; + nbytes = bk->len = 0; + } + nbytes += dbt->doff + dbt->size + dbt->dlen; + + /* Allocate the space. */ + if (t->bt_rdata.ulen < nbytes) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(nbytes) : + (void *)realloc(t->bt_rdata.data, nbytes); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = nbytes; + } + + /* We use nul bytes for extending the record, get it over with. */ + memset(t->bt_rdata.data, 0, nbytes); + + tlen = 0; + if (bk->type == B_OVERFLOW) { + /* Take up to doff bytes from the record. */ + memset(©, 0, sizeof(copy)); + if ((ret = __db_goff(dbp, ©, bo->tlen, + bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + return (ret); + tlen += dbt->doff; + + /* + * If the original record was larger than the offset: + * If dlen > size, shift the remaining data down. + * If dlen < size, shift the remaining data up. + * Use memmove(), the regions may overlap. + */ + p = t->bt_rdata.data; + if (bo->tlen > dbt->doff) + if (dbt->dlen > dbt->size) { + tlen += len = bo->tlen - + dbt->doff - (dbt->dlen - dbt->size); + memmove(p + dbt->doff + dbt->size, + p + dbt->doff + dbt->dlen, len); + } else if (dbt->dlen < dbt->size) { + tlen += len = bo->tlen - + dbt->doff - (dbt->size - dbt->dlen); + memmove(p + dbt->doff + dbt->dlen, + p + dbt->doff + dbt->size, len); + } else + tlen += bo->tlen - dbt->doff; + + /* Copy in the user's data. */ + memcpy((u_int8_t *)t->bt_rdata.data + dbt->doff, + dbt->data, dbt->size); + tlen += dbt->size; + } else { + /* Take up to doff bytes from the record. */ + memcpy(t->bt_rdata.data, + bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); + tlen += dbt->doff; + + /* Copy in the user's data. */ + memcpy((u_int8_t *)t->bt_rdata.data + + dbt->doff, dbt->data, dbt->size); + tlen += dbt->size; + + /* Copy in any remaining data. */ + len = dbt->doff + dbt->dlen; + if (bk->len > len) { + memcpy((u_int8_t *)t->bt_rdata.data + dbt->doff + + dbt->size, bk->data + len, bk->len - len); + tlen += bk->len - len; + } + } + + /* Set the DBT to reference our new record. */ + t->bt_rdata.size = tlen; + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + *dbt = t->bt_rdata; + return (0); +} diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c new file mode 100644 index 0000000000..d4bc7f6824 --- /dev/null +++ b/db2/btree/bt_rec.c @@ -0,0 +1,767 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_rec.c 10.11 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "shqueue.h" +#include "hash.h" +#include "btree.h" +#include "log.h" +#include "db_dispatch.h" +#include "common_ext.h" + +/* + * __bam_pg_alloc_recover -- + * Recovery function for pg_alloc. + * + * PUBLIC: int __bam_pg_alloc_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_pg_alloc_args *argp; + BTMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + DB *file_dbp, *mdbp; + db_pgno_t pgno; + int cmp_n, cmp_p, created, modified, ret; + + REC_PRINT(__bam_pg_alloc_print); + REC_INTRO(__bam_pg_alloc_read); + + /* + * Fix up the allocated page. If we're redoing the operation, we have + * to get the page (creating it if it doesn't exist), and update its + * LSN. If we're undoing the operation, we have to reset the page's + * LSN and put it on the free list. + * + * Fix up the metadata page. If we're redoing the operation, we have + * to get the metadata page and update its LSN and its free pointer. + * If we're undoing the operation and the page was ever created, we put + * it on the freelist. + */ + pgno = PGNO_METADATA; + if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + goto out; + } + if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + (void)memp_fput(mpf, meta, 0); + goto out; + } + + /* Fix up the allocated page. */ + created = IS_ZERO_LSN(LSN(pagep)); + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->page_lsn); + if ((created || cmp_p == 0) && redo) { + /* Need to redo update described. */ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, PGNO_INVALID, 0, argp->ptype); + + pagep->lsn = *lsnp; + modified = 1; + } else if ((created || cmp_n == 0) && !redo) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, meta->free, 0, P_INVALID); + + pagep->lsn = argp->page_lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + (void)memp_fput(mpf, meta, 0); + goto out; + } + + /* Fix up the metadata page. */ + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + meta->lsn = *lsnp; + meta->free = argp->next; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + meta->lsn = argp->meta_lsn; + meta->free = argp->pgno; + modified = 1; + } + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_pg_free_recover -- + * Recovery function for pg_free. + * + * PUBLIC: int __bam_pg_free_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_pg_free_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_pg_free_args *argp; + BTMETA *meta; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_pg_free_print); + REC_INTRO(__bam_pg_free_read); + + /* + * Fix up the freed page. If we're redoing the operation we get the + * page and explicitly discard its contents, then update its LSN. If + * we're undoing the operation, we get the page and restore its header. + */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &LSN(argp->header.data)); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + P_INIT(pagep, file_dbp->pgsize, + pagep->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + pagep->lsn = *lsnp; + + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + memcpy(pagep, argp->header.data, argp->header.size); + + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + /* + * Fix up the metadata page. If we're redoing or undoing the operation + * we get the page and update its LSN and free pointer. + */ + pgno = PGNO_METADATA; + if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + meta->free = argp->pgno; + + meta->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + meta->free = argp->next; + + meta->lsn = argp->meta_lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_split_recover -- + * Recovery function for split. + * + * PUBLIC: int __bam_split_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_split_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_split_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; + db_pgno_t pgno; + int l_update, p_update, r_update, ret, rootsplit, t_ret; + + REC_PRINT(__bam_split_print); + + mpf = NULL; + _lp = lp = np = pp = _rp = rp = NULL; + + REC_INTRO(__bam_split_read); + + /* + * There are two kinds of splits that we have to recover from. The + * first is a root-page split, where the root page is split from a + * leaf page into an internal page and two new leaf pages are created. + * The second is where a page is split into two pages, and a new key + * is inserted into the parent page. + */ + sp = argp->pg.data; + pgno = PGNO(sp); + rootsplit = pgno == PGNO_ROOT; + if (memp_fget(mpf, &argp->left, 0, &lp) != 0) + lp = NULL; + if (memp_fget(mpf, &argp->right, 0, &rp) != 0) + rp = NULL; + + if (redo) { + l_update = r_update = p_update = 0; + /* + * Decide if we need to resplit the page. + * + * If this is a root split, then the root has to exist, it's + * the page we're splitting and it gets modified. If this is + * not a root split, then the left page has to exist, for the + * same reason. + */ + if (rootsplit) { + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + pp = NULL; + goto out; + } + p_update = + log_compare(&LSN(pp), &LSN(argp->pg.data)) == 0; + } else + if (lp == NULL) { + (void)__db_pgerr(file_dbp, argp->left); + goto out; + } + if (lp == NULL || log_compare(&LSN(lp), &argp->llsn) == 0) + l_update = 1; + if (rp == NULL || log_compare(&LSN(rp), &argp->rlsn) == 0) + r_update = 1; + if (!p_update && !l_update && !r_update) + goto done; + + /* Allocate and initialize new left/right child pages. */ + if ((_lp = (PAGE *)malloc(file_dbp->pgsize)) == NULL) + goto nomem; + if ((_rp = (PAGE *)malloc(file_dbp->pgsize)) == NULL) { +nomem: errno = ENOMEM; + __db_err(file_dbp->dbenv, "%s", strerror(errno)); + goto out; + } + if (rootsplit) { + P_INIT(_lp, file_dbp->pgsize, argp->left, + PGNO_INVALID, + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : argp->left, + PGNO_INVALID, LEVEL(sp), TYPE(sp)); + } else { + P_INIT(_lp, file_dbp->pgsize, PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno, + ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp), + LEVEL(sp), TYPE(sp)); + } + + /* Split the page. */ + if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 || + (ret = __bam_copy(file_dbp, sp, _rp, argp->indx, + NUM_ENT(sp))) != 0) + goto out; + + /* If the left child is wrong, update it. */ + if (lp == NULL && (ret = + memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) { + (void)__db_pgerr(file_dbp, argp->left); + lp = NULL; + goto out; + } + if (l_update) { + memcpy(lp, _lp, file_dbp->pgsize); + lp->lsn = *lsnp; + if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + lp = NULL; + } + + /* If the right child is wrong, update it. */ + if (rp == NULL && (ret = memp_fget(mpf, + &argp->right, DB_MPOOL_CREATE, &rp)) != 0) { + (void)__db_pgerr(file_dbp, argp->right); + rp = NULL; + goto out; + } + if (r_update) { + memcpy(rp, _rp, file_dbp->pgsize); + rp->lsn = *lsnp; + if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + rp = NULL; + } + + /* + * If the parent page is wrong, update it. This is of interest + * only if it was a root split, since root splits create parent + * pages. All other splits modify a parent page, but those are + * separately logged and recovered. + */ + if (rootsplit && p_update) { + if (file_dbp->type == DB_BTREE) + P_INIT(pp, file_dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, + _lp->level + 1, P_IBTREE); + else + P_INIT(pp, file_dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, + _lp->level + 1, P_IRECNO); + RE_NREC_SET(pp, + file_dbp->type == DB_RECNO || + F_ISSET(file_dbp, DB_BT_RECNUM) ? + __bam_total(_lp) + __bam_total(_rp) : 0); + pp->lsn = *lsnp; + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + pp = NULL; + } + + /* + * Finally, redo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page had better exist. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + (void)__db_pgerr(file_dbp, argp->npgno); + np = NULL; + goto out; + } + if (log_compare(&LSN(np), &argp->nlsn) == 0) { + PREV_PGNO(np) = argp->right; + np->lsn = *lsnp; + if ((ret = memp_fput(mpf, + np, DB_MPOOL_DIRTY)) != 0) + goto fatal; + np = NULL; + } + } + } else { + /* + * If the split page is wrong, replace its contents with the + * logged page contents. The split page had better exist. + */ + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + pp = NULL; + goto out; + } + if (log_compare(lsnp, &LSN(pp)) == 0) { + memcpy(pp, argp->pg.data, argp->pg.size); + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + pp = NULL; + } + + /* + * If it's a root split and the left child ever existed, put + * it on the free list. (If it's not a root split, we just + * updated the left page -- it's the same as the split page.) + * If the right child ever existed, root split or not, put it + * on the free list. + */ + if ((rootsplit && lp != NULL) || rp != NULL) { + if (rootsplit && lp != NULL && + log_compare(lsnp, &LSN(lp)) == 0) { + lp->lsn = argp->llsn; + if ((ret = + memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + lp = NULL; + } + if (rp != NULL && + log_compare(lsnp, &LSN(rp)) == 0) { + rp->lsn = argp->rlsn; + if ((ret = + memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto fatal; + rp = NULL; + } + } + + /* + * Finally, undo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page had better exist. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + (void)__db_pgerr(file_dbp, argp->npgno); + np = NULL; + goto out; + } + if (log_compare(lsnp, &LSN(np)) == 0) { + PREV_PGNO(np) = argp->left; + np->lsn = argp->nlsn; + if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) + goto fatal; + np = NULL; + } + } + } + +done: ret = 0; + *lsnp = argp->prev_lsn; + + if (0) { +fatal: (void)__db_panic(file_dbp); + } +out: /* Free any pages that weren't dirtied. */ + if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) + ret = t_ret; + if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0) + ret = t_ret; + if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0) + ret = t_ret; + if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0) + ret = t_ret; + + /* Free any allocated space. */ + if (_lp != NULL) + free(_lp); + if (_rp != NULL) + free(_rp); + + REC_CLOSE; +} + +/* + * __bam_rsplit_recover -- + * Recovery function for a reverse split. + * + * PUBLIC: int __bam_rsplit_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_rsplit_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_rsplit_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_rsplit_print); + REC_INTRO(__bam_rsplit_read); + + /* Fix the root page. */ + pgno = PGNO_ROOT; + if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) { + __db_pgerr(file_dbp, pgno); + pagep = NULL; + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->rootlsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + pagep->pgno = PGNO_ROOT; + pagep->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT, + PGNO_INVALID, PGNO_INVALID, pagep->level + 1, TYPE(pagep)); + if ((ret = __db_pitem(file_dbp, pagep, 0, + argp->rootent.size, &argp->rootent, NULL)) != 0) + goto out; + pagep->lsn = argp->rootlsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + /* Fix the page copied over the root page. */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &LSN(argp->pgdbt.data)); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + pagep->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { + (void)__db_panic(file_dbp); + goto out; + } + + ret = 0; + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __bam_adj_recover -- + * Recovery function for adj. + * + * PUBLIC: int __bam_adj_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_adj_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_adj_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_adj_print); + REC_INTRO(__bam_adj_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + if ((ret = __bam_adjindx(file_dbp, + pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) + goto err; + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + if ((ret = __bam_adjindx(file_dbp, + pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) + goto err; + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + + if (0) { +err: (void)memp_fput(mpf, pagep, 0); + } +out: REC_CLOSE; +} + +/* + * __bam_cadjust_recover -- + * Recovery function for the adjust of a count change in an internal + * page. + * + * PUBLIC: int __bam_cadjust_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_cadjust_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_cadjust_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_cadjust_print); + REC_INTRO(__bam_cadjust_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + errno = __db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + if (file_dbp->type == DB_BTREE && + F_ISSET(file_dbp, DB_BT_RECNUM)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + if (file_dbp->type == DB_RECNO) { + GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + if (file_dbp->type == DB_BTREE && + F_ISSET(file_dbp, DB_BT_RECNUM)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + if (file_dbp->type == DB_RECNO) { + GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->total && PGNO(pagep) == PGNO_ROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __bam_cdel_recover -- + * Recovery function for the intent-to-delete of a cursor record. + * + * PUBLIC: int __bam_cdel_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__bam_cdel_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __bam_cdel_args *argp; + DB *file_dbp, *mdbp; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + REC_PRINT(__bam_cdel_print); + REC_INTRO(__bam_cdel_read); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + (void)__db_pgerr(file_dbp, argp->pgno); + pagep = NULL; + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + if (cmp_p == 0 && redo) { + /* Need to redo update described. */ + GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 1; + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && !redo) { + /* Need to undo update described. */ + GET_BKEYDATA(pagep, argp->indx + O_INDX)->deleted = 0; + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c new file mode 100644 index 0000000000..cd8872a064 --- /dev/null +++ b/db2/btree/bt_recno.c @@ -0,0 +1,1195 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_recno.c 10.12 (Sleepycat) 8/25/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __ram_add __P((DB *, db_recno_t *, DBT *, int, int)); +static int __ram_c_close __P((DBC *)); +static int __ram_c_del __P((DBC *, int)); +static int __ram_c_get __P((DBC *, DBT *, DBT *, int)); +static int __ram_c_put __P((DBC *, DBT *, DBT *, int)); +static int __ram_fmap __P((DB *, db_recno_t)); +static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, int)); +static int __ram_source __P((DB *, RECNO *, const char *)); +static int __ram_sync __P((DB *, int)); +static int __ram_update __P((DB *, db_recno_t, int)); +static int __ram_vmap __P((DB *, db_recno_t)); +static int __ram_writeback __P((DB *)); + +/* + * If we're renumbering records, then we have to detect in the cursor that a + * record was deleted, and adjust the cursor as necessary. If not renumbering + * records, then we can detect this by looking at the actual record, so we + * ignore the cursor delete flag. + */ +#define CD_SET(dbp, cp) { \ + if (F_ISSET(dbp, DB_RE_RENUMBER)) \ + F_SET(cp, CR_DELETED); \ +} +#define CD_CLR(dbp, cp) { \ + if (F_ISSET(dbp, DB_RE_RENUMBER)) \ + F_CLR(cp, CR_DELETED); \ +} +#define CD_ISSET(dbp, cp) \ + (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED)) + +/* + * __ram_open -- + * Recno open function. + * + * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *)); + */ +int +__ram_open(dbp, type, dbinfo) + DB *dbp; + DBTYPE type; + DB_INFO *dbinfo; +{ + BTREE *t; + RECNO *rp; + int ret; + + ret = 0; + + /* Allocate and initialize the private RECNO structure. */ + if ((rp = (RECNO *)calloc(1, sizeof(*rp))) == NULL) + return (errno); + + if (dbinfo != NULL) { + /* + * If the user specified a source tree, open it and map it in. + * + * !!! + * We don't complain if the user specified transactions or + * threads. It's possible to make it work, but you'd better + * know what you're doing! + */ + if (dbinfo->re_source == NULL) { + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } else { + if ((ret = + __ram_source(dbp, rp, dbinfo->re_source)) != 0) + goto err; + } + + /* Copy delimiter, length and padding values. */ + rp->re_delim = + F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n'; + rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' '; + + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((rp->re_len = dbinfo->re_len) == 0) { + __db_err(dbp->dbenv, + "record length must be greater than 0"); + ret = EINVAL; + goto err; + } + } else + rp->re_len = 0; + } else { + rp->re_delim = '\n'; + rp->re_pad = ' '; + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } + + /* Open the underlying btree. */ + if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0) + goto err; + + /* Set the routines necessary to make it look like a recno tree. */ + dbp->cursor = __ram_cursor; + dbp->del = __ram_delete; + dbp->get = __ram_get; + dbp->put = __ram_put; + dbp->sync = __ram_sync; + + /* Link in the private recno structure. */ + ((BTREE *)dbp->internal)->bt_recno = rp; + + /* If we're snapshotting an underlying source file, do it now. */ + if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) + if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + goto err; + + return (0); + +err: /* If we mmap'd a source file, discard it. */ + if (rp->re_smap != NULL) + (void)__db_munmap(rp->re_smap, rp->re_msize); + + /* If we opened a source file, discard it. */ + if (rp->re_fd != -1) + (void)__db_close(rp->re_fd); + if (rp->re_source != NULL) + FREES(rp->re_source); + + /* If we allocated room for key/data return, discard it. */ + t = dbp->internal; + if (t->bt_rkey.data != NULL) + free(t->bt_rkey.data); + + FREE(rp, sizeof(*rp)); + + return (ret); +} + +/* + * __ram_cursor -- + * Recno db->cursor function. + * + * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **)); + */ +int +__ram_cursor(dbp, txn, dbcp) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; +{ + RCURSOR *cp; + DBC *dbc; + + DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0); + + if ((dbc = (DBC *)calloc(1, sizeof(DBC))) == NULL) + return (ENOMEM); + if ((cp = (RCURSOR *)calloc(1, sizeof(RCURSOR))) == NULL) { + free(dbc); + return (ENOMEM); + } + + cp->dbc = dbc; + cp->recno = RECNO_OOB; + + dbc->dbp = dbp; + dbc->txn = txn; + dbc->internal = cp; + dbc->c_close = __ram_c_close; + dbc->c_del = __ram_c_del; + dbc->c_get = __ram_c_get; + dbc->c_put = __ram_c_put; + + /* All cursor structures hang off the main DB structure. */ + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + *dbcp = dbc; + return (0); +} + +/* + * __ram_get -- + * Recno db->get function. + */ +static int +__ram_get(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + PAGE *h; + db_indx_t indx; + db_recno_t recno; + int exact, ret, stack; + + stack = 0; + + DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + t = dbp->internal; + + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + goto done; + + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0) + goto done; + if (!exact) + return (DB_NOTFOUND); + stack = 1; + + h = t->bt_csp->page; + indx = t->bt_csp->indx; + + /* If the record has already been deleted, we couldn't have found it. */ + if (GET_BKEYDATA(h, indx)->deleted) { + ret = DB_KEYEMPTY; + goto done; + } + + /* Return the data item. */ + ret = __db_ret(dbp, + h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen); + ++t->lstat.bt_get; + +done: /* Discard the stack. */ + if (stack) + __bam_stkrel(dbp); + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_put -- + * Recno db->put function. + */ +static int +__ram_put(argdbp, txn, key, data, flags) + DB *argdbp; + DB_TXN *txn; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + db_recno_t recno; + int ret; + + DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags); + + /* Check for invalid flags. */ + if ((ret = __db_putchk(argdbp, + key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0)) != 0) + return (ret); + + GETHANDLE(argdbp, txn, &dbp, ret); + + /* + * If we're appending to the tree, make sure we've read in all of + * the backing source file. Otherwise, check the user's record + * number and fill in as necessary. + */ + ret = LF_ISSET(DB_APPEND) ? + __ram_snapshot(dbp) : __ram_getno(dbp, key, &recno, 1); + + /* Add the record. */ + if (ret == 0) + ret = __ram_add(dbp, &recno, data, flags, 0); + + /* If we're appending to the tree, we have to return the record. */ + if (ret == 0 && LF_ISSET(DB_APPEND)) { + t = dbp->internal; + ret = __db_retcopy(key, &recno, sizeof(recno), + &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc); + } + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_sync -- + * Recno db->sync function. + */ +static int +__ram_sync(argdbp, flags) + DB *argdbp; + int flags; +{ + DB *dbp; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags); + + /* Sync the underlying btree. */ + if ((ret = __bam_sync(argdbp, flags)) != 0) + return (ret); + + /* Copy back the backing source file. */ + GETHANDLE(argdbp, NULL, &dbp, ret); + ret = __ram_writeback(dbp); + PUTHANDLE(dbp); + + return (ret); +} + +/* + * __ram_close -- + * Recno db->close function. + * + * PUBLIC: int __ram_close __P((DB *)); + */ +int +__ram_close(argdbp) + DB *argdbp; +{ + RECNO *rp; + + DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0); + + rp = ((BTREE *)argdbp->internal)->bt_recno; + + /* Close any underlying mmap region. */ + if (rp->re_smap != NULL) + (void)__db_munmap(rp->re_smap, rp->re_msize); + + /* Close any backing source file descriptor. */ + if (rp->re_fd != -1) + (void)__db_close(rp->re_fd); + + /* Free any backing source file name. */ + if (rp->re_source != NULL) + FREES(rp->re_source); + + /* Free allocated memory. */ + FREE(rp, sizeof(RECNO)); + ((BTREE *)argdbp->internal)->bt_recno = NULL; + + /* Close the underlying btree. */ + return (__bam_close(argdbp)); +} + +/* + * __ram_c_close -- + * Recno cursor->close function. + */ +static int +__ram_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_close", NULL, NULL, 0); + + dbp = dbc->dbp; + + /* Remove the cursor from the queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->curs_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Discard the structures. */ + FREE(dbc->internal, sizeof(RCURSOR)); + FREE(dbc, sizeof(DBC)); + + return (0); +} + +/* + * __ram_c_del -- + * Recno cursor->c_del function. + */ +static int +__ram_c_del(dbc, flags) + DBC *dbc; + int flags; +{ + DBT key; + RCURSOR *cp; + int ret; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_del", NULL, NULL, flags); + + cp = dbc->internal; + + /* Check for invalid flags. */ + if ((ret = __db_cdelchk(dbc->dbp, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + return (ret); + + /* If already deleted, return failure. */ + if (CD_ISSET(dbc->dbp, cp)) + return (DB_KEYEMPTY); + + /* Build a normal delete request. */ + memset(&key, 0, sizeof(key)); + key.data = &cp->recno; + key.size = sizeof(db_recno_t); + if ((ret = __ram_delete(dbc->dbp, dbc->txn, &key, 0)) == 0) + CD_SET(dbc->dbp, cp); + + return (ret); +} + +/* + * __ram_c_get -- + * Recno cursor->c_get function. + */ +static int +__ram_c_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + DB *dbp; + RCURSOR *cp, copy; + int ret; + + DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, + NULL, flags); + + cp = dbc->internal; + dbp = dbc->dbp; + + /* Check for invalid flags. */ + if ((ret = __db_cgetchk(dbc->dbp, + key, data, flags, cp->recno != RECNO_OOB)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + +retry: /* Update the record number. */ + switch (flags) { + case DB_CURRENT: + if (CD_ISSET(dbp, cp)) { + PUTHANDLE(dbp); + return (DB_KEYEMPTY); + } + break; + case DB_NEXT: + if (CD_ISSET(dbp, cp)) + break; + if (cp->recno != RECNO_OOB) { + ++cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + flags = DB_NEXT; + cp->recno = 1; + break; + case DB_PREV: + if (cp->recno != RECNO_OOB) { + if (cp->recno == 1) + return (DB_NOTFOUND); + --cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + flags = DB_PREV; + if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND) + goto err; + if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0) + goto err; + if (cp->recno == 0) + return (DB_NOTFOUND); + break; + case DB_SET: + case DB_SET_RANGE: + if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0) + goto err; + break; + } + + /* + * Return the key if the user didn't give us one, and then pass it + * into __ram_get(). + */ + if (flags != DB_SET && flags != DB_SET_RANGE && + (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno), + &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0) + return (ret); + + /* + * The cursor was reset, so the delete adjustment is no + * longer necessary. + */ + CD_CLR(dbp, cp); + + /* + * Retrieve the record. + * + * Skip any keys that don't really exist. + */ + if ((ret = __ram_get(dbp, dbc->txn, key, data, 0)) != 0) + if (ret == DB_KEYEMPTY && + (flags == DB_NEXT || flags == DB_PREV)) + goto retry; + +err: if (ret != 0) + *cp = copy; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_c_put -- + * Recno cursor->c_put function. + */ +static int +__ram_c_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + int flags; +{ + BTREE *t; + RCURSOR *cp, copy; + DB *dbp; + int exact, ret; + void *arg; + + DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags); + + cp = dbc->internal; + + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + return (ret); + + GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + t = dbp->internal; + + /* Initialize the cursor for a new retrieval. */ + copy = *cp; + + /* + * To split, we need a valid key for the page. Since it's a cursor, + * we have to build one. + * + * The split code discards all short-term locks and stack pages. + */ + if (0) { +split: arg = &cp->recno; + if ((ret = __bam_split(dbp, arg)) != 0) + goto err; + } + + if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + if ((ret = __bam_iitem(dbp, &t->bt_csp->page, + &t->bt_csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { + if ((ret = __bam_stkrel(dbp)) != 0) + goto err; + goto split; + } + if ((ret = __bam_stkrel(dbp)) != 0) + goto err; + + if (flags != DB_CURRENT) { + /* Adjust the counts. */ + if ((ret = __bam_adjust(dbp, t, 1)) != 0) + goto err; + + switch (flags) { + case DB_AFTER: + /* Adjust the cursors. */ + __ram_ca(dbp, cp->recno, CA_IAFTER); + + /* Set this cursor to reference the new record. */ + cp->recno = copy.recno + 1; + break; + case DB_BEFORE: + /* Adjust the cursors. */ + __ram_ca(dbp, cp->recno, CA_IBEFORE); + + /* Set this cursor to reference the new record. */ + cp->recno = copy.recno; + break; + } + + } + + /* + * The cursor was reset, so the delete adjustment is no + * longer necessary. + */ + CD_CLR(dbp, cp); + +err: if (ret != 0) + *cp = copy; + + PUTHANDLE(dbp); + return (ret); +} + +/* + * __ram_ca -- + * Adjust cursors. + * + * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg)); + */ +void +__ram_ca(dbp, recno, op) + DB *dbp; + db_recno_t recno; + ca_recno_arg op; +{ + DBC *dbc; + RCURSOR *cp; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (RCURSOR *)dbc->internal; + switch (op) { + case CA_DELETE: + if (recno > cp->recno) + --cp->recno; + break; + case CA_IAFTER: + if (recno > cp->recno) + ++cp->recno; + break; + case CA_IBEFORE: + if (recno >= cp->recno) + ++cp->recno; + break; + } + } + DB_THREAD_UNLOCK(dbp); +} + +#ifdef DEBUG +/* + * __ram_cprint -- + * Display the current recno cursor list. + */ +int +__ram_cprint(dbp) + DB *dbp; +{ + DBC *dbc; + RCURSOR *cp; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->curs_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (RCURSOR *)dbc->internal; + fprintf(stderr, + "%#0x: recno: %lu\n", (u_int)cp, (u_long)cp->recno); + } + DB_THREAD_UNLOCK(dbp); + return (0); +} +#endif /* DEBUG */ + +/* + * __ram_getno -- + * Check the user's record number, and make sure we've seen it. + * + * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); + */ +int +__ram_getno(dbp, key, rep, can_create) + DB *dbp; + const DBT *key; + db_recno_t *rep; + int can_create; +{ + db_recno_t recno; + + /* Check the user's record number. */ + if ((recno = *(db_recno_t *)key->data) == 0) { + __db_err(dbp->dbenv, "illegal record number of 0"); + return (EINVAL); + } + if (rep != NULL) + *rep = recno; + + /* + * Btree can neither create records or read them in. Recno can + * do both, see if we can find the record. + */ + return (dbp->type == DB_RECNO ? + __ram_update(dbp, recno, can_create) : 0); +} + +/* + * __ram_snapshot -- + * Read in any remaining records from the backing input file. + * + * PUBLIC: int __ram_snapshot __P((DB *)); + */ +int +__ram_snapshot(dbp) + DB *dbp; +{ + return (__ram_update(dbp, DB_MAX_RECORDS, 0)); +} + +/* + * __ram_update -- + * Ensure the tree has records up to and including the specified one. + */ +static int +__ram_update(dbp, recno, can_create) + DB *dbp; + db_recno_t recno; + int can_create; +{ + BTREE *t; + RECNO *rp; + db_recno_t nrecs; + int ret; + + t = dbp->internal; + rp = t->bt_recno; + + /* + * If we can't create records and we've read the entire backing input + * file, we're done. + */ + if (!can_create && F_ISSET(rp, RECNO_EOF)) + return (0); + + /* + * If we haven't seen this record yet, try to get it from the original + * file. + */ + if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + return (ret); + if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) { + if ((ret = rp->re_irec(dbp, recno)) != 0) + return (ret); + if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + return (ret); + } + + /* + * If we can create records, create empty ones up to the requested + * record. + */ + if (!can_create || recno <= nrecs + 1) + return (0); + + t->bt_rdata.dlen = 0; + t->bt_rdata.doff = 0; + t->bt_rdata.flags = 0; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if (t->bt_rdata.ulen < rp->re_len) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = rp->re_len; + } + t->bt_rdata.size = rp->re_len; + memset(t->bt_rdata.data, rp->re_pad, rp->re_len); + } else + t->bt_rdata.size = 0; + + while (recno > ++nrecs) + if ((ret = __ram_add(dbp, + &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0) + return (ret); + return (0); +} + +/* + * __ram_source -- + * Load information about the backing file. + */ +static int +__ram_source(dbp, rp, fname) + DB *dbp; + RECNO *rp; + const char *fname; +{ + off_t size; + int oflags, ret; + + if ((ret = __db_appname(dbp->dbenv, + DB_APP_DATA, NULL, fname, NULL, &rp->re_source)) != 0) + return (ret); + + oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0; + if ((ret = + __db_fdopen(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) { + __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); + goto err; + } + + /* + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ + if ((ret = + __db_stat(dbp->dbenv, rp->re_source, rp->re_fd, &size, NULL)) != 0) + goto err; + if (size == 0) { + F_SET(rp, RECNO_EOF); + return (0); + } + + if ((ret = __db_mmap(rp->re_fd, (size_t)size, 1, 1, &rp->re_smap)) != 0) + goto err; + rp->re_cmap = rp->re_smap; + rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size); + rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ? __ram_fmap : __ram_vmap; + return (0); + +err: FREES(rp->re_source) + return (ret); +} + +/* + * __ram_writeback -- + * Rewrite the backing file. + */ +static int +__ram_writeback(dbp) + DB *dbp; +{ + RECNO *rp; + DBT key, data; + db_recno_t keyno; + ssize_t nw; + int fd, ret, t_ret; + u_int8_t delim, *pad; + + rp = ((BTREE *)dbp->internal)->bt_recno; + + /* If the file wasn't modified, we're done. */ + if (!F_ISSET(rp, RECNO_MODIFIED)) + return (0); + + /* If there's no backing source file, we're done. */ + if (rp->re_source == NULL) { + F_CLR(rp, RECNO_MODIFIED); + return (0); + } + + /* + * Read any remaining records into the tree. + * + * XXX + * This is why we can't support transactions when applications specify + * backing (re_source) files. At this point we have to read in the + * rest of the records from the file so that we can write all of the + * records back out again, which could modify a page for which we'd + * have to log changes and which we don't have locked. This could be + * partially fixed by taking a snapshot of the entire file during the + * db_open(), or, since db_open() isn't transaction protected, as part + * of the first DB operation. But, if a checkpoint occurs then, the + * part of the log holding the copy of the file could be discarded, and + * that would make it impossible to recover in the face of disaster. + * This could all probably be fixed, but it would require transaction + * protecting the backing source file, i.e. mpool would have to know + * about it, and we don't want to go there. + */ + if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + return (ret); + + /* + * !!! + * Close any underlying mmap region. This is required for Windows NT + * (4.0, Service Pack 2) -- if the file is still mapped, the following + * open will fail. + */ + if (rp->re_smap != NULL) { + (void)__db_munmap(rp->re_smap, rp->re_msize); + rp->re_smap = NULL; + } + + /* Get rid of any backing file descriptor, just on GP's. */ + if (rp->re_fd != -1) { + (void)__db_close(rp->re_fd); + rp->re_fd = -1; + } + + /* Open the file, truncating it. */ + if ((ret = __db_fdopen(rp->re_source, + DB_SEQUENTIAL | DB_TRUNCATE, + DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) { + __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); + return (ret); + } + + /* + * We step through the records, writing each one out. Use the record + * number and the dbp->get() function, instead of a cursor, so we find + * and write out "deleted" or non-existent records. + */ + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + key.size = sizeof(db_recno_t); + key.data = &keyno; + + /* + * We'll need the delimiter if we're doing variable-length records, + * and the pad character if we're doing fixed-length records. + */ + delim = rp->re_delim; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((pad = malloc(rp->re_len)) == NULL) { + ret = ENOMEM; + goto err; + } + memset(pad, rp->re_pad, rp->re_len); + } else + pad = NULL; /* XXX: Shut the compiler up. */ + for (keyno = 1;; ++keyno) { + switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { + case 0: + if ((ret = + __db_write(fd, data.data, data.size, &nw)) != 0) + goto err; + if (nw != (ssize_t)data.size) { + ret = EIO; + goto err; + } + break; + case DB_KEYEMPTY: + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((ret = + __db_write(fd, pad, rp->re_len, &nw)) != 0) + goto err; + if (nw != (ssize_t) rp->re_len) { + ret = EIO; + goto err; + } + } + break; + case DB_NOTFOUND: + ret = 0; + goto done; + } + if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((ret = __db_write(fd, &delim, 1, &nw)) != 0) + goto err; + if (nw != 1) { + ret = EIO; + goto err; + } + } + } + +err: +done: /* Close the file descriptor. */ + if ((t_ret = __db_close(fd)) != 0 || ret == 0) + ret = t_ret; + + if (ret == 0) + F_CLR(rp, RECNO_MODIFIED); + return (ret); +} + +/* + * __ram_fmap -- + * Get fixed length records from a file. + */ +static int +__ram_fmap(dbp, top) + DB *dbp; + db_recno_t top; +{ + BTREE *t; + DBT data; + RECNO *rp; + db_recno_t recno; + u_int32_t len; + u_int8_t *sp, *ep, *p; + int ret; + + if ((ret = __bam_nrecs(dbp, &recno)) != 0) + return (ret); + + t = dbp->internal; + rp = t->bt_recno; + if (t->bt_rdata.ulen < rp->re_len) { + t->bt_rdata.data = t->bt_rdata.data == NULL ? + (void *)malloc(rp->re_len) : + (void *)realloc(t->bt_rdata.data, rp->re_len); + if (t->bt_rdata.data == NULL) { + t->bt_rdata.ulen = 0; + return (ENOMEM); + } + t->bt_rdata.ulen = rp->re_len; + } + + memset(&data, 0, sizeof(data)); + data.data = t->bt_rdata.data; + data.size = rp->re_len; + + sp = (u_int8_t *)rp->re_cmap; + ep = (u_int8_t *)rp->re_emap; + while (recno <= top) { + if (sp >= ep) { + F_SET(rp, RECNO_EOF); + return (DB_NOTFOUND); + } + len = rp->re_len; + for (p = t->bt_rdata.data; + sp < ep && len > 0; *p++ = *sp++, --len); + + /* + * Another process may have read some portion of the input + * file already, in which case we just want to discard the + * new record. + * + * XXX + * We should just do a seek, since the records are fixed + * length. + */ + if (rp->re_last >= recno) { + if (len != 0) + memset(p, rp->re_pad, len); + + ++recno; + if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + return (ret); + } + ++rp->re_last; + } + rp->re_cmap = sp; + return (0); +} + +/* + * __ram_vmap -- + * Get variable length records from a file. + */ +static int +__ram_vmap(dbp, top) + DB *dbp; + db_recno_t top; +{ + BTREE *t; + DBT data; + RECNO *rp; + db_recno_t recno; + u_int8_t *sp, *ep; + int delim, ret; + + t = dbp->internal; + rp = t->bt_recno; + + if ((ret = __bam_nrecs(dbp, &recno)) != 0) + return (ret); + + memset(&data, 0, sizeof(data)); + + delim = rp->re_delim; + + sp = (u_int8_t *)rp->re_cmap; + ep = (u_int8_t *)rp->re_emap; + while (recno <= top) { + if (sp >= ep) { + F_SET(rp, RECNO_EOF); + return (DB_NOTFOUND); + } + for (data.data = sp; sp < ep && *sp != delim; ++sp); + + /* + * Another process may have read some portion of the input + * file already, in which case we just want to discard the + * new record. + */ + if (rp->re_last >= recno) { + data.size = sp - (u_int8_t *)data.data; + ++recno; + if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + return (ret); + } + ++rp->re_last; + ++sp; + } + rp->re_cmap = sp; + return (0); +} + +/* + * __ram_add -- + * Add records into the tree. + */ +static int +__ram_add(dbp, recnop, data, flags, bi_flags) + DB *dbp; + db_recno_t *recnop; + DBT *data; + int flags, bi_flags; +{ + BTREE *t; + PAGE *h; + db_indx_t indx; + int exact, ret, stack; + + t = dbp->internal; + +retry: /* Find the slot for insertion. */ + if ((ret = __bam_rsearch(dbp, recnop, + S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0) + return (ret); + h = t->bt_csp->page; + indx = t->bt_csp->indx; + stack = 1; + + /* + * The recno access method doesn't currently support duplicates, so + * if an identical key is already in the tree we're either overwriting + * it or an error is returned. + */ + if (exact && LF_ISSET(DB_NOOVERWRITE)) { + ret = DB_KEYEXIST; + goto err; + } + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, or we're replacing the data item with a + * new data item. If the key isn't an exact match, we're inserting + * a new key/data pair, before the search location. + */ + if ((ret = __bam_iitem(dbp, &h, &indx, NULL, + data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) == DB_NEEDSPLIT) { + (void)__bam_stkrel(dbp); + stack = 0; + if ((ret = __bam_split(dbp, recnop)) != 0) + goto err; + goto retry; + } + + if (!exact && ret == 0) + __bam_adjust(dbp, t, 1); + +err: if (stack) + __bam_stkrel(dbp); + return (ret); +} diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c new file mode 100644 index 0000000000..ee26221e25 --- /dev/null +++ b/db2/btree/bt_rsearch.c @@ -0,0 +1,347 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_rsearch.c 10.8 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdio.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_rsearch -- + * Search a btree for a record number. + * + * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int, int, int *)); + */ +int +__bam_rsearch(dbp, recnop, flags, stop, exactp) + DB *dbp; + db_recno_t *recnop; + u_int flags; + int stop, *exactp; +{ + BINTERNAL *bi; + BTREE *t; + DB_LOCK lock; + PAGE *h; + RINTERNAL *ri; + db_indx_t indx, top; + db_pgno_t pg; + db_recno_t recno, total; + int isappend, ret, stack; + + t = dbp->internal; + + /* + * We test for groups of flags, S_APPEND is the only one that can be + * OR'd into the set. Clear it now so that the tests for equality + * will work. + */ + if ((isappend = LF_ISSET(S_APPEND)) != 0) + LF_CLR(S_APPEND); + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write locks and if we are + * locking pairs of pages. See btree.h for more details. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, + flags == S_INSERT || flags == S_DELETE ? + DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + total = RE_NREC(h); + + /* + * If appending to the tree, set the record number now -- we have the + * root page locked. + * + * Delete only deletes exact matches, read only returns exact matches. + * Note, this is different from __bam_search(), which returns non-exact + * matches for read. + * + * The record may not exist. We can only return the correct location + * for the record immediately after the last record in the tree, so do + * a fast check now. + */ + if (isappend) { + *exactp = 0; + *recnop = recno = total + 1; + } else { + recno = *recnop; + if (recno <= total) + *exactp = 1; + else { + *exactp = 0; + if (flags == S_DELETE || + flags == S_FIND || recno > total + 1) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + return (DB_NOTFOUND); + } + } + } + + /* Decide if we're building a stack based on the operation. */ + BT_STK_CLR(t); + stack = flags == S_DELETE || flags == S_INSERT; + + /* + * Decide if we need to save this page; if we do, write lock it, and + * start to build a stack. + */ + if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) { + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + stack = 1; + } + + /* Records in the tree are 0-based, and record numbers are 1-based. */ + --recno; + + for (total = 0;;) { + switch (TYPE(h)) { + case P_LBTREE: + BT_STK_ENTER(t, h, (recno - total) * P_INDX, lock, ret); + return (ret); + case P_IBTREE: + for (indx = 0, top = NUM_ENT(h);;) { + bi = GET_BINTERNAL(h, indx); + if (++indx == top || total + bi->nrecs > recno) + break; + total += bi->nrecs; + } + pg = bi->pgno; + break; + case P_LRECNO: + BT_STK_ENTER(t, h, recno - total, lock, ret); + return (ret); + case P_IRECNO: + for (indx = 0, top = NUM_ENT(h);;) { + ri = GET_RINTERNAL(h, indx); + if (++indx == top || total + ri->nrecs > recno) + break; + total += ri->nrecs; + } + pg = ri->pgno; + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + --indx; + + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + } + BT_STK_PUSH(t, h, indx, lock, ret); + if (ret) + goto err; + + if ((ret = __bam_lget(dbp, 0, pg, + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, + &lock)) != 0) + goto err; + } else { + (void)memp_fput(dbp->mpf, h, 0); + + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if (LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) + stack = 1; + + if ((ret = __bam_lget(dbp, 1, pg, + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, + &lock)) != 0) + goto err; + } + + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + goto err; + } + /* NOTREACHED */ + +err: BT_STK_POP(t); + __bam_stkrel(dbp); + return (ret); +} + +/* + * __bam_adjust -- + * Adjust the tree after adding or deleting a record. + * + * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int)); + */ +int +__bam_adjust(dbp, t, adjust) + DB *dbp; + BTREE *t; + int adjust; +{ + EPG *epg; + PAGE *h; + int ret; + + /* Update the record counts for the tree. */ + for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + h = epg->page; + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { + if (DB_LOGGING(dbp) && + (ret = __bam_cadjust_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(h), 0, dbp->log_fileid, + PGNO(h), &LSN(h), (u_int32_t)epg->indx, + (int32_t)adjust, 1)) != 0) + return (ret); + + if (TYPE(h) == P_IBTREE) + GET_BINTERNAL(h, epg->indx)->nrecs += adjust; + else + GET_RINTERNAL(h, epg->indx)->nrecs += adjust; + + if (PGNO(h) == PGNO_ROOT) + RE_NREC_ADJ(h, adjust); + + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + } + } + return (0); +} + +/* + * __bam_nrecs -- + * Return the number of records in the tree. + * + * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *)); + */ +int +__bam_nrecs(dbp, rep) + DB *dbp; + db_recno_t *rep; +{ + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int ret; + + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + return (ret); + + *rep = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbp, lock); + + return (0); +} + +/* + * __bam_total -- + * Return the number of records below a page. + * + * PUBLIC: db_recno_t __bam_total __P((PAGE *)); + */ +db_recno_t +__bam_total(h) + PAGE *h; +{ + db_recno_t recs; + db_indx_t nxt, top; + + switch (TYPE(h)) { + case P_LBTREE: + recs = NUM_ENT(h) / 2; + break; + case P_IBTREE: + for (recs = 0, nxt = 0, top = NUM_ENT(h); nxt < top; ++nxt) + recs += GET_BINTERNAL(h, nxt)->nrecs; + break; + case P_LRECNO: + recs = NUM_ENT(h); + break; + case P_IRECNO: + for (recs = 0, nxt = 0, top = NUM_ENT(h); nxt < top; ++nxt) + recs += GET_RINTERNAL(h, nxt)->nrecs; + break; + default: + abort(); + } + return (recs); +} diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c new file mode 100644 index 0000000000..d5f20d4c61 --- /dev/null +++ b/db2/btree/bt_search.c @@ -0,0 +1,335 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_search.c 10.6 (Sleepycat) 8/22/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_search -- + * Search a btree for a key. + * + * PUBLIC: int __bam_search __P((DB *, + * PUBLIC: const DBT *, u_int, int, db_recno_t *, int *)); + */ +int +__bam_search(dbp, key, flags, stop, recnop, exactp) + DB *dbp; + const DBT *key; + u_int flags; + int stop, *exactp; + db_recno_t *recnop; +{ + BTREE *t; + DB_LOCK lock; + EPG cur; + PAGE *h; + db_indx_t base, i, indx, lim; + db_pgno_t pg; + db_recno_t recno; + int cmp, jump, ret, stack; + + t = dbp->internal; + recno = 0; + + BT_STK_CLR(t); + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write locks, if we position + * to the first or last item in a set of duplicates, if we return + * deleted items, and if we are locking pairs of pages. See btree.h + * for more details. In addition, if we're doing record numbers, we + * have to lock the entire tree regardless. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = PGNO_ROOT; + stack = F_ISSET(dbp, DB_BT_RECNUM) && + (flags == S_INSERT || flags == S_DELETE); + if ((ret = __bam_lget(dbp, + 0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + + /* Decide if we need to save this page; if we do, write lock it. */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + return (ret); + } + + stack = 1; + } + + for (;;) { + /* + * Do a binary search on the current page. If we're searching + * a leaf page, we have to manipulate the indices in groups of + * two. If we're searching an internal page, they're an index + * per page item. If we find an exact match on a leaf page, + * we're done. + */ + cur.page = h; + jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; + for (base = 0, + lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) { + cur.indx = indx = base + ((lim >> 1) * jump); + if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) { + if (TYPE(h) == P_LBTREE) + goto match; + goto next; + } + if (cmp > 0) { + base = indx + jump; + --lim; + } + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + O_INDX index. + * + * If it's a leaf page, return base as the "found" value. + * Delete only deletes exact matches. + */ + if (TYPE(h) == P_LBTREE) { + *exactp = 0; + + if (LF_ISSET(S_EXACT)) + goto notfound; + + BT_STK_ENTER(t, h, base, lock, ret); + return (ret); + } + + /* + * If it's not a leaf page, record the internal page (which is + * a parent page for the key). Decrement the base by 1 if it's + * non-zero so that if a split later occurs, the inserted page + * will be to the right of the saved page. + */ + indx = base > 0 ? base - O_INDX : base; + + /* + * If we're trying to calculate the record number, sum up + * all the record numbers on this page up to the indx point. + */ + if (recnop != NULL) + for (i = 0; i < indx; ++i) + recno += GET_BINTERNAL(h, i)->nrecs; + +next: pg = GET_BINTERNAL(h, indx)->pgno; + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + } + BT_STK_PUSH(t, h, indx, lock, ret); + if (ret != 0) + goto err; + + if ((ret = + __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + goto err; + } else { + (void)memp_fput(dbp->mpf, h, 0); + + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) + stack = 1; + + if ((ret = + __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ? + DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + goto err; + } + if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + goto err; + } + + /* NOTREACHED */ +match: *exactp = 1; + + /* + * If we're trying to calculate the record number, add in the + * offset on this page and correct for the fact that records + * in the tree are 0-based. + */ + if (recnop != NULL) + *recnop = recno + (indx / P_INDX) + 1; + + /* + * If we got here, we know that we have a btree leaf page. + * + * If there are duplicates, go to the first/last one. + */ + if (LF_ISSET(S_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + indx += P_INDX; + else + while (indx > 0 && + h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + + /* + * Now check if we are allowed to return deleted item; if not + * find/last the first non-deleted item. + */ + if (LF_ISSET(S_DELNO)) { + if (LF_ISSET(S_DUPLAST)) + while (GET_BKEYDATA(h, indx + O_INDX)->deleted && + indx > 0 && + h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + else + while (GET_BKEYDATA(h, indx + O_INDX)->deleted && + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + indx += P_INDX; + + if (GET_BKEYDATA(h, indx + O_INDX)->deleted) + goto notfound; + } + + BT_STK_ENTER(t, h, indx, lock, ret); + return (ret); + +notfound: + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + ret = DB_NOTFOUND; + +err: if (t->bt_csp > t->bt_sp) { + BT_STK_POP(t); + __bam_stkrel(dbp); + } + return (ret); +} + +/* + * __bam_stkrel -- + * Release all pages currently held in the stack. + * + * PUBLIC: int __bam_stkrel __P((DB *)); + */ +int +__bam_stkrel(dbp) + DB *dbp; +{ + BTREE *t; + EPG *epg; + + t = dbp->internal; + for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbp, epg->lock); + } + return (0); +} + +/* + * __bam_stkgrow -- + * Grow the stack. + * + * PUBLIC: int __bam_stkgrow __P((BTREE *)); + */ +int +__bam_stkgrow(t) + BTREE *t; +{ + EPG *p; + size_t entries; + + entries = t->bt_esp - t->bt_sp; + + if ((p = (EPG *)calloc(entries * 2, sizeof(EPG))) == NULL) + return (ENOMEM); + memcpy(p, t->bt_sp, entries * sizeof(EPG)); + if (t->bt_sp != t->bt_stack) + FREE(t->bt_sp, entries * sizeof(EPG)); + t->bt_sp = p; + t->bt_csp = p + entries; + t->bt_esp = p + entries * 2; + return (0); +} diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c new file mode 100644 index 0000000000..89cfcb5a2e --- /dev/null +++ b/db2/btree/bt_split.c @@ -0,0 +1,952 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_split.c 10.12 (Sleepycat) 8/24/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static int __bam_page __P((DB *, EPG *, EPG *)); +static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *)); +static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int)); +static int __bam_root __P((DB *, EPG *)); + +/* + * __bam_split -- + * Split a page. + * + * PUBLIC: int __bam_split __P((DB *, void *)); + */ +int +__bam_split(dbp, arg) + DB *dbp; + void *arg; +{ + BTREE *t; + enum { UP, DOWN } dir; + int exact, level, ret; + + t = dbp->internal; + + /* + * The locking protocol we use to avoid deadlock to acquire locks by + * walking down the tree, but we do it as lazily as possible, locking + * the root only as a last resort. We expect all stack pages to have + * been discarded before we're called; we discard all short-term locks. + * + * When __bam_split is first called, we know that a leaf page was too + * full for an insert. We don't know what leaf page it was, but we + * have the key/recno that caused the problem. We call XX_search to + * reacquire the leaf page, but this time get both the leaf page and + * its parent, locked. We then split the leaf page and see if the new + * internal key will fit into the parent page. If it will, we're done. + * + * If it won't, we discard our current locks and repeat the process, + * only this time acquiring the parent page and its parent, locked. + * This process repeats until we succeed in the split, splitting the + * root page as the final resort. The entire process then repeats, + * as necessary, until we split a leaf page. + * + * XXX + * A traditional method of speeding this up is to maintain a stack of + * the pages traversed in the original search. You can detect if the + * stack is correct by storing the page's LSN when it was searched and + * comparing that LSN with the current one when it's locked during the + * split. This would be an easy change for this code, but I have no + * numbers that indicate it's worthwhile. + */ + for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { + /* + * Acquire a page and its parent, locked. + */ + if ((ret = (dbp->type == DB_BTREE ? + __bam_search(dbp, arg, S_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbp, + (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) + return (ret); + + /* Split the page. */ + ret = t->bt_csp[0].page->pgno == PGNO_ROOT ? + __bam_root(dbp, &t->bt_csp[0]) : + __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]); + + switch (ret) { + case 0: + /* Once we've split the leaf page, we're done. */ + if (level == LEAFLEVEL) + return (0); + + /* Switch directions. */ + if (dir == UP) + dir = DOWN; + break; + case DB_NEEDSPLIT: + /* + * It's possible to fail to split repeatedly, as other + * threads may be modifying the tree, or the page usage + * is sufficiently bad that we don't get enough space + * the first time. + */ + if (dir == DOWN) + dir = UP; + break; + default: + return (ret); + } + } + /* NOTREACHED */ +} + +/* + * __bam_root -- + * Split the root page of a btree. + */ +static int +__bam_root(dbp, cp) + DB *dbp; + EPG *cp; +{ + BTREE *t; + PAGE *lp, *rp; + int ret; + + t = dbp->internal; + + /* Yeah, right. */ + if (cp->page->level >= MAXBTREELEVEL) + return (ENOSPC); + + /* Create new left and right pages for the split. */ + lp = rp = NULL; + if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 || + (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + goto err; + P_INIT(lp, dbp->pgsize, lp->pgno, + PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, + cp->page->level, TYPE(cp->page)); + + /* Split the page. */ + if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + DBT __a; + DB_LSN __lsn; + memset(&__a, 0, sizeof(__a)); + __a.data = cp->page; + __a.size = dbp->pgsize; + ZERO_LSN(__lsn); + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), + PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn, + &__a)) != 0) + goto err; + LSN(lp) = LSN(rp) = LSN(cp->page); + } + + /* Clean up the new root page. */ + if ((ret = (dbp->type == DB_RECNO ? + __ram_root(dbp, cp->page, lp, rp) : + __bam_broot(dbp, cp->page, lp, rp))) != 0) + goto err; + + /* Success -- write the real pages back to the store. */ + (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, cp->lock); + (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); + (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + + ++t->lstat.bt_split; + ++t->lstat.bt_rootsplit; + return (0); + +err: if (lp != NULL) + (void)__bam_free(dbp, lp); + if (rp != NULL) + (void)__bam_free(dbp, rp); + (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbp, cp->lock); + return (ret); +} + +/* + * __bam_page -- + * Split the non-root page of a btree. + */ +static int +__bam_page(dbp, pp, cp) + DB *dbp; + EPG *pp, *cp; +{ + BTREE *t; + DB_LOCK tplock; + PAGE *lp, *rp, *tp; + int ret; + + t = dbp->internal; + lp = rp = tp = NULL; + ret = -1; + + /* Create new right page for the split. */ + if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + return (ret); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno, + cp->page->level, TYPE(cp->page)); + + /* Create new left page for the split. */ + if ((lp = (PAGE *)malloc(dbp->pgsize)) == NULL) { + ret = ENOMEM; + goto err; + } +#ifdef DEBUG + memset(lp, 0xff, dbp->pgsize); +#endif + P_INIT(lp, dbp->pgsize, cp->page->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + ZERO_LSN(lp->lsn); + + /* + * Split right. + * + * Only the indices are sorted on the page, i.e., the key/data pairs + * aren't, so it's simpler to copy the data from the split page onto + * two new pages instead of copying half the data to the right page + * and compacting the left page in place. Since the left page can't + * change, we swap the original and the allocated left page after the + * split. + */ + if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0) + goto err; + + /* + * Fix up the previous pointer of any leaf page following the split + * page. + * + * !!! + * There are interesting deadlock situations here as we write-lock a + * page that's not in our direct ancestry. Consider a cursor walking + * through the leaf pages, that has the previous page read-locked and + * is waiting on a lock for the page we just split. It will deadlock + * here. If this is a problem, we can fail in the split; it's not a + * problem as the split will succeed after the cursor passes through + * the page we're splitting. + */ + if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) { + if ((ret = __bam_lget(dbp, + 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0) + goto err; + } + + /* Insert the new pages into the parent page. */ + if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbp)) { + DBT __a; + DB_LSN __lsn; + memset(&__a, 0, sizeof(__a)); + __a.data = cp->page; + __a.size = dbp->pgsize; + if (tp == NULL) + ZERO_LSN(__lsn); + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page), + &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), + tp == NULL ? 0 : PGNO(tp), + tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0) + goto err; + + LSN(lp) = LSN(rp) = LSN(cp->page); + if (tp != NULL) + LSN(tp) = LSN(cp->page); + } + + /* Copy the allocated page into place. */ + memcpy(cp->page, lp, LOFFSET(lp)); + memcpy((u_int8_t *)cp->page + HOFFSET(lp), + (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); + FREE(lp, dbp->pgsize); + lp = NULL; + + /* Finish the next-page link. */ + if (tp != NULL) + tp->prev_pgno = rp->pgno; + + /* Success -- write the real pages back to the store. */ + (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, pp->lock); + (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, cp->lock); + (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + if (tp != NULL) { + (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY); + (void)__BT_TLPUT(dbp, tplock); + } + return (0); + +err: if (lp != NULL) + FREE(lp, dbp->pgsize); + if (rp != NULL) + (void)__bam_free(dbp, rp); + if (tp != NULL) { + (void)memp_fput(dbp->mpf, tp, 0); + (void)__BT_TLPUT(dbp, tplock); + } + (void)memp_fput(dbp->mpf, pp->page, 0); + (void)__BT_TLPUT(dbp, pp->lock); + (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbp, cp->lock); + return (ret); +} + +/* + * __bam_broot -- + * Fix up the btree root page after it has been split. + * + * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); + */ +int +__bam_broot(dbp, rootp, lp, rp) + DB *dbp; + PAGE *rootp, *lp, *rp; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk; + DBT hdr, data; + int ret; + + /* + * If the root page was a leaf page, change it into an internal page. + * We copy the key we split on (but not the key's data, in the case of + * a leaf page) to the new root page. + */ + P_INIT(rootp, dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE); + + /* + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + bi.len = 0; + bi.deleted = 0; + bi.type = B_KEYDATA; + bi.pgno = lp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(lp); + RE_NREC_SET(rootp, bi.nrecs); + } + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = (char *) ""; + data.size = 0; + if ((ret = + __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, &data)) != 0) + return (ret); + + switch (TYPE(rp)) { + case P_IBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bi = GET_BINTERNAL(rp, 0); + + bi.len = child_bi->len; + bi.deleted = 0; + bi.type = child_bi->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bi->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0) + return (ret); + break; + case P_LBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bk = GET_BKEYDATA(rp, 0); + switch (child_bk->type) { + case B_KEYDATA: + bi.len = child_bk->len; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk->data; + data.size = child_bk->len; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bi.len = BOVERFLOW_SIZE; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rp->pgno; + if (F_ISSET(dbp, DB_BT_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbp, rootp, 1, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bk->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + return (0); +} + +/* + * __ram_root -- + * Fix up the recno root page after it has been split. + * + * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); + */ +int +__ram_root(dbp, rootp, lp, rp) + DB *dbp; + PAGE *rootp, *lp, *rp; +{ + DBT hdr; + RINTERNAL ri; + int ret; + + /* Initialize the page. */ + P_INIT(rootp, dbp->pgsize, + PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); + + /* Initialize the header. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + + /* Insert the left and right keys, set the header information. */ + ri.pgno = lp->pgno; + ri.nrecs = __bam_total(lp); + if ((ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_SET(rootp, ri.nrecs); + ri.pgno = rp->pgno; + ri.nrecs = __bam_total(rp); + if ((ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_ADJ(rootp, ri.nrecs); + return (0); +} + +/* + * __bam_pinsert -- + * Insert a new key into a parent page, completing the split. + */ +static int +__bam_pinsert(dbp, parent, lchild, rchild) + DB *dbp; + EPG *parent; + PAGE *lchild, *rchild; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk, *tmp_bk; + BTREE *t; + DBT a, b, hdr, data; + PAGE *ppage; + RINTERNAL ri; + db_indx_t off; + db_recno_t nrecs; + u_int32_t n, nbytes, nksize; + int ret; + + t = dbp->internal; + ppage = parent->page; + + /* If handling record numbers, count records split to the right page. */ + nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ? + __bam_total(rchild) : 0; + + /* + * Now we insert the new page's first key into the parent page, which + * completes the split. The parent points to a PAGE and a page index + * offset, where the new key goes ONE AFTER the index, because we split + * to the right. + * + * XXX + * Some btree algorithms replace the key for the old page as well as + * the new page. We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. + */ + off = parent->indx + O_INDX; + + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack used when inserting into BINTERNAL pages. + * Retain only what's needed to distinguish between the new entry and + * the LAST entry on the page to its left. If the keys compare equal, + * retain the entire key. We ignore overflow keys, and the entire key + * must be retained for the next-to-leftmost key on the leftmost page + * of each level, or the search will fail. Applicable ONLY to internal + * pages that have leaf pages as children. Further reduction of the + * key between pairs of internal pages loses too much information. + */ + switch (TYPE(rchild)) { + case P_IBTREE: + child_bi = GET_BINTERNAL(rchild, 0); + nbytes = BINTERNAL_PSIZE(child_bi->len); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + /* Add a new record for the right page. */ + bi.len = child_bi->len; + bi.deleted = 0; + bi.type = child_bi->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bi->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)(child_bi->data))->pgno)) != 0) + return (ret); + break; + case P_LBTREE: + child_bk = GET_BKEYDATA(rchild, 0); + switch (child_bk->type) { + case B_KEYDATA: + nbytes = BINTERNAL_PSIZE(child_bk->len); + nksize = child_bk->len; + if (t->bt_prefix == NULL) + goto noprefix; + if (ppage->prev_pgno == PGNO_INVALID && off <= 1) + goto noprefix; + tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX); + if (tmp_bk->type != B_KEYDATA) + goto noprefix; + memset(&a, 0, sizeof(a)); + a.size = tmp_bk->len; + a.data = tmp_bk->data; + memset(&b, 0, sizeof(b)); + b.size = child_bk->len; + b.data = child_bk->data; + nksize = t->bt_prefix(&a, &b); + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) { + t->lstat.bt_pfxsaved += nbytes - n; + nbytes = n; + } else +noprefix: nksize = child_bk->len; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + bi.len = nksize; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk->data; + data.size = nksize; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + bi.len = BOVERFLOW_SIZE; + bi.deleted = 0; + bi.type = child_bk->type; + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbp, ppage, off, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (child_bk->type == B_OVERFLOW && (ret = + __db_ioff(dbp, ((BOVERFLOW *)child_bk)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + break; + case P_IRECNO: + case P_LRECNO: + nbytes = RINTERNAL_PSIZE; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + + /* Add a new record for the right page. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + ri.pgno = rchild->pgno; + ri.nrecs = nrecs; + if ((ret = __db_pitem(dbp, + ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + + /* Adjust the parent page's left page record count. */ + if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + /* Log the change. */ + if (DB_LOGGING(dbp) && + (ret = __bam_cadjust_log(dbp->dbenv->lg_info, + dbp->txn, &LSN(ppage), 0, dbp->log_fileid, + PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx, + -(int32_t)nrecs, (int32_t)0)) != 0) + return (ret); + + /* Update the left page count. */ + if (dbp->type == DB_RECNO) + GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + else + GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + } + + return (0); +} + +/* + * __bam_psplit -- + * Do the real work of splitting the page. + */ +static int +__bam_psplit(dbp, cp, lp, rp, cleft) + DB *dbp; + EPG *cp; + PAGE *lp, *rp; + int cleft; +{ + BTREE *t; + PAGE *pp; + db_indx_t half, nbytes, off, splitp, top; + int adjust, cnt, isbigkey, ret; + + t = dbp->internal; + pp = cp->page; + adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; + + /* + * If we're splitting the first (last) page on a level because we're + * inserting (appending) a key to it, it's likely that the data is + * sorted. Moving a single item to the new page is less work and can + * push the fill factor higher than normal. If we're wrong it's not + * a big deal, we'll just do the split the right way next time. + */ + off = 0; + if (NEXT_PGNO(pp) == PGNO_INVALID && + ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) || + (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page)))) + off = NUM_ENT(cp->page) - adjust; + else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) + off = adjust; + + ++t->lstat.bt_split; + if (off != 0) { + ++t->lstat.bt_fastsplit; + goto sort; + } + + /* + * Split the data to the left and right pages. Try not to split on + * an overflow key. (Overflow keys on internal pages will slow down + * searches.) Refuse to split in the middle of a set of duplicates. + * + * First, find the optimum place to split. + * + * It's possible to try and split past the last record on the page if + * there's a very large record at the end of the page. Make sure this + * doesn't happen by bounding the check at the next-to-last entry on + * the page. + * + * Note, we try and split half the data present on the page. This is + * because another process may have already split the page and left + * it half empty. We don't try and skip the split -- we don't know + * how much space we're going to need on the page, and we may need up + * to half the page for a big item, so there's no easy test to decide + * if we need to split or not. Besides, if two threads are inserting + * data into the same place in the database, we're probably going to + * need more space soon anyway. + */ + top = NUM_ENT(pp) - adjust; + half = (dbp->pgsize - HOFFSET(pp)) / 2; + for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) + switch (TYPE(pp)) { + case P_IBTREE: + if (GET_BINTERNAL(pp, off)->type == B_KEYDATA) + nbytes += + BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len); + else + nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + if (GET_BKEYDATA(pp, off)->type == B_KEYDATA) + nbytes += + BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; + + ++off; + if (GET_BKEYDATA(pp, off)->type == B_KEYDATA) + nbytes += + BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes += RINTERNAL_SIZE; + break; + case P_LRECNO: + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + break; + default: + return (__db_pgfmt(dbp, pp->pgno)); + } +sort: splitp = off; + + /* + * Splitp is either at or just past the optimum split point. If + * it's a big key, try and find something close by that's not. + */ + if (TYPE(pp) == P_IBTREE) + isbigkey = GET_BINTERNAL(pp, off)->type != B_KEYDATA; + else if (TYPE(pp) == P_LBTREE) + isbigkey = GET_BKEYDATA(pp, off)->type != B_KEYDATA; + else + isbigkey = 0; + if (isbigkey) + for (cnt = 1; cnt <= 3; ++cnt) { + off = splitp + cnt * adjust; + if (off < (db_indx_t)NUM_ENT(pp) && + ((TYPE(pp) == P_IBTREE && + GET_BINTERNAL(pp, off)->type == B_KEYDATA) || + GET_BKEYDATA(pp, off)->type == B_KEYDATA)) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (TYPE(pp) == P_IBTREE ? + GET_BINTERNAL(pp, off)->type == B_KEYDATA : + GET_BKEYDATA(pp, off)->type == B_KEYDATA) { + splitp = off; + break; + } + } + + /* + * We can't split in the middle a set of duplicates. We know that + * no duplicate set can take up more than about 25% of the page, + * because that's the point where we push it off onto a duplicate + * page set. So, this loop can't be unbounded. + */ + if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE && + pp->inp[splitp] == pp->inp[splitp - adjust]) + for (cnt = 1;; ++cnt) { + off = splitp + cnt * adjust; + if (off < NUM_ENT(pp) && + pp->inp[splitp] != pp->inp[off]) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (pp->inp[splitp] != pp->inp[off]) { + splitp = off + adjust; + break; + } + } + + + /* We're going to split at splitp. */ + if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) + return (ret); + if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) + return (ret); + + /* Adjust the cursors. */ + __bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft); + return (0); +} + +/* + * __bam_copy -- + * Copy a set of records from one page to another. + * + * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); + */ +int +__bam_copy(dbp, pp, cp, nxt, stop) + DB *dbp; + PAGE *pp, *cp; + u_int32_t nxt, stop; +{ + db_indx_t dup, nbytes, off; + + /* + * Copy the rest of the data to the right page. Nxt is the next + * offset placed on the target page. + */ + for (dup = off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { + switch (TYPE(pp)) { + case P_IBTREE: + if (GET_BINTERNAL(pp, nxt)->type == B_KEYDATA) + nbytes = + BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len); + else + nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + /* + * If we're on a key and it's a duplicate, just copy + * the offset. + */ + if (off != 0 && (nxt % P_INDX) == 0 && + pp->inp[nxt] == pp->inp[nxt - P_INDX]) { + cp->inp[off] = cp->inp[off - P_INDX]; + continue; + } + /* FALLTHROUGH */ + case P_LRECNO: + if (GET_BKEYDATA(pp, nxt)->type == B_KEYDATA) + nbytes = + BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len); + else + nbytes = BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + default: + return (__db_pgfmt(dbp, pp->pgno)); + } + cp->inp[off] = HOFFSET(cp) -= nbytes; + memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes); + } + return (0); +} diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c new file mode 100644 index 0000000000..ba71ea616d --- /dev/null +++ b/db2/btree/bt_stat.c @@ -0,0 +1,257 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_stat.c 10.11 (Sleepycat) 8/19/97"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); + +/* + * __bam_stat -- + * Gather/print the btree statistics + * + * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), int)); + */ +int +__bam_stat(argdbp, spp, db_malloc, flags) + DB *argdbp; + void *spp; + void *(*db_malloc) __P((size_t)); + int flags; +{ + BTMETA *meta; + BTREE *t; + DB *dbp; + DB_BTREE_STAT *sp; + DB_LOCK lock; + PAGE *h; + db_pgno_t lastpgno, pgno; + int ret; + + DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags); + + /* Check for invalid flags. */ + if ((ret = __db_statchk(argdbp, flags)) != 0) + return (ret); + + if (spp == NULL) + return (0); + + GETHANDLE(argdbp, NULL, &dbp, ret); + t = dbp->internal; + + /* Allocate and clear the structure. */ + if ((sp = db_malloc == NULL ? + (DB_BTREE_STAT *)malloc(sizeof(*sp)) : + (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) { + ret = ENOMEM; + goto err; + } + memset(sp, 0, sizeof(*sp)); + + /* If the app just wants the record count, make it fast. */ + if (LF_ISSET(DB_RECORDCOUNT)) { + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0) + goto err; + + sp->bt_nrecs = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + goto done; + } + + /* Get the meta-data page. */ + pgno = PGNO_METADATA; + if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + goto err; + + /* Translate the metadata flags. */ + if (F_ISSET(meta, BTM_DUP)) + sp->bt_flags |= DB_DUP; + if (F_ISSET(meta, BTM_FIXEDLEN)) + sp->bt_flags |= DB_FIXEDLEN; + if (F_ISSET(meta, BTM_RECNUM)) + sp->bt_flags |= DB_RECNUM; + if (F_ISSET(meta, BTM_RENUMBER)) + sp->bt_flags |= DB_RENUMBER; + + /* + * Get the maxkey, minkey, re_len and re_pad fields from the + * metadata. + */ + sp->bt_minkey = meta->minkey; + sp->bt_maxkey = meta->maxkey; + sp->bt_re_len = meta->re_len; + sp->bt_re_pad = meta->re_pad; + + /* Get the page size from the DB. */ + sp->bt_pagesize = dbp->pgsize; + + /* Initialize counters with the meta-data page information. */ + __bam_add_rstat(&meta->stat, sp); + + /* + * Add in the local information from this handle. + * + * !!! + * This is a bit odd, but it gets us closer to the truth. + */ + __bam_add_rstat(&t->lstat, sp); + + /* Walk the free list, counting pages. */ + for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) { + ++sp->bt_free; + + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + (void)memp_fput(dbp->mpf, meta, 0); + (void)__BT_TLPUT(dbp, lock); + goto err; + } + pgno = h->next_pgno; + (void)memp_fput(dbp->mpf, h, 0); + } + + /* Discard the meta-data page. */ + (void)memp_fput(dbp->mpf, meta, 0); + (void)__BT_TLPUT(dbp, lock); + + /* Get the root page. */ + pgno = PGNO_ROOT; + if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) + goto err; + if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + (void)__BT_LPUT(dbp, lock); + goto err; + } + + /* Get the levels from the root page. */ + sp->bt_levels = h->level; + + /* + * Determine the last page of the database, then walk it, counting + * things. + */ + if ((ret = memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &h)) != 0) + goto err; + (void)memp_fput(dbp->mpf, h, 0); + for (;;) { + switch (TYPE(h)) { + case P_INVALID: + break; + case P_IBTREE: + case P_IRECNO: + ++sp->bt_int_pg; + sp->bt_int_pgfree += HOFFSET(h) - LOFFSET(h); + break; + case P_LBTREE: + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h); + sp->bt_nrecs += NUM_ENT(h) / P_INDX; + break; + case P_LRECNO: + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += HOFFSET(h) - LOFFSET(h); + sp->bt_nrecs += NUM_ENT(h); + break; + case P_DUPLICATE: + ++sp->bt_dup_pg; + /* XXX MARGO: sp->bt_dup_pgfree; */ + break; + case P_OVERFLOW: + ++sp->bt_over_pg; + /* XXX MARGO: sp->bt_over_pgfree; */ + break; + default: + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + return (__db_pgfmt(dbp, pgno)); + } + + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbp, lock); + + if (++pgno > lastpgno) + break; + if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) + break; + if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) { + (void)__BT_LPUT(dbp, lock); + break; + } + } + +done: *(DB_BTREE_STAT **)spp = sp; + ret = 0; + +err: PUTHANDLE(dbp); + return (ret); +} + +/* + * __bam_add_mstat -- + * Add the local statistics to the meta-data page statistics. + * + * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); + */ +void +__bam_add_mstat(from, to) + DB_BTREE_LSTAT *from; + DB_BTREE_LSTAT *to; +{ + to->bt_freed += from->bt_freed; + to->bt_pfxsaved += from->bt_pfxsaved; + to->bt_split += from->bt_split; + to->bt_rootsplit += from->bt_rootsplit; + to->bt_fastsplit += from->bt_fastsplit; + to->bt_added += from->bt_added; + to->bt_deleted += from->bt_deleted; + to->bt_get += from->bt_get; + to->bt_cache_hit += from->bt_cache_hit; + to->bt_cache_miss += from->bt_cache_miss; +} + +/* + * __bam_add_rstat -- + * Add the local statistics to the returned statistics. + */ +static void +__bam_add_rstat(from, to) + DB_BTREE_LSTAT *from; + DB_BTREE_STAT *to; +{ + to->bt_freed += from->bt_freed; + to->bt_pfxsaved += from->bt_pfxsaved; + to->bt_split += from->bt_split; + to->bt_rootsplit += from->bt_rootsplit; + to->bt_fastsplit += from->bt_fastsplit; + to->bt_added += from->bt_added; + to->bt_deleted += from->bt_deleted; + to->bt_get += from->bt_get; + to->bt_cache_hit += from->bt_cache_hit; + to->bt_cache_miss += from->bt_cache_miss; +} diff --git a/db2/btree/btree.src b/db2/btree/btree.src new file mode 100644 index 0000000000..50cc0dd0ff --- /dev/null +++ b/db2/btree/btree.src @@ -0,0 +1,137 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)btree.src 10.3 (Sleepycat) 8/17/97"; +#endif /* not lint */ + +PREFIX bam + +/* + * BTREE-pg_alloc: used to record allocating a new page. + * + * meta_lsn: the meta-data page's original lsn. + * page_lsn: the allocated page's original lsn. + * pgno: the page allocated. + * next: the next page on the free list. + */ +BEGIN pg_alloc +ARG fileid u_int32_t lu +POINTER meta_lsn DB_LSN * lu +POINTER page_lsn DB_LSN * lu +ARG pgno db_pgno_t lu +ARG ptype u_int32_t lu +ARG next db_pgno_t lu +END + +/* + * BTREE-pg_free: used to record freeing a page. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * header: the header from the free'd page. + * next: the previous next pointer on the metadata page. + */ +BEGIN pg_free +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +DBT header DBT s +ARG next db_pgno_t lu +END + +/* + * BTREE-split: used to log a page split. + * + * left: the page number for the low-order contents. + * llsn: the left page's original LSN. + * right: the page number for the high-order contents. + * rlsn: the right page's original LSN. + * indx: the number of entries that went to the left page. + * npgno: the next page number + * nlsn: the next page's original LSN (or 0 if no next page). + * pg: the split page's contents before the split. + */ +BEGIN split +ARG fileid u_int32_t lu +ARG left db_pgno_t lu +POINTER llsn DB_LSN * lu +ARG right db_pgno_t lu +POINTER rlsn DB_LSN * lu +ARG indx u_int32_t lu +ARG npgno db_pgno_t lu +POINTER nlsn DB_LSN * lu +DBT pg DBT s +END + +/* + * BTREE-rsplit: used to log a reverse-split + * + * pgno: the page number of the page copied over the root. + * pgdbt: the page being copied on the root page. + * rootent: last entry on the root page. + * rootlsn: the root page's original lsn. + */ +BEGIN rsplit +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +DBT pgdbt DBT s +DBT rootent DBT s +POINTER rootlsn DB_LSN * lu +END + +/* + * BTREE-adj: used to log the adjustment of an index. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index adjusted. + * indx_copy: the index to copy if inserting. + * is_insert: 0 if a delete, 1 if an insert. + */ +BEGIN adj +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG indx_copy u_int32_t lu +ARG is_insert u_int32_t lu +END + +/* + * BTREE-cadjust: used to adjust the count change in an internal page. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be adjusted. + * adjust: the signed adjustment. + * total: if the total tree entries count should be adjusted + */ +BEGIN cadjust +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG adjust int32_t ld +ARG total int32_t ld +END + +/* + * BTREE-cdel: used to log the intent-to-delete of a cursor record. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be deleted. + */ +BEGIN cdel +ARG fileid u_int32_t lu +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +END diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c new file mode 100644 index 0000000000..e6b72252e5 --- /dev/null +++ b/db2/btree/btree_auto.c @@ -0,0 +1,1279 @@ +/* Do not edit: automatically built by dist/db_gen.sh. */ +#include "config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <ctype.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "btree.h" +#include "db_am.h" +#include "common_ext.h" + +/* + * PUBLIC: int __bam_pg_alloc_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t, + * PUBLIC: u_int32_t, db_pgno_t)); + */ +int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, + fileid, meta_lsn, page_lsn, pgno, ptype, next) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + DB_LSN * meta_lsn; + DB_LSN * page_lsn; + db_pgno_t pgno; + u_int32_t ptype; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_alloc; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(*meta_lsn) + + sizeof(*page_lsn) + + sizeof(pgno) + + sizeof(ptype) + + sizeof(next); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (page_lsn != NULL) + memcpy(bp, page_lsn, sizeof(*page_lsn)); + else + memset(bp, 0, sizeof(*page_lsn)); + bp += sizeof(*page_lsn); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ptype, sizeof(ptype)); + bp += sizeof(ptype); + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_pg_alloc_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_pg_alloc_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_pg_alloc_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_pg_alloc_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tptype: %lu\n", (u_long)argp->ptype); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_pg_alloc_read __P((void *, __bam_pg_alloc_args **)); + */ +int +__bam_pg_alloc_read(recbuf, argpp) + void *recbuf; + __bam_pg_alloc_args **argpp; +{ + __bam_pg_alloc_args *argp; + u_int8_t *bp; + + argp = (__bam_pg_alloc_args *)malloc(sizeof(__bam_pg_alloc_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); + bp += sizeof(argp->page_lsn); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ptype, bp, sizeof(argp->ptype)); + bp += sizeof(argp->ptype); + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_pg_free_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, DBT *, + * PUBLIC: db_pgno_t)); + */ +int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, meta_lsn, header, next) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * meta_lsn; + DBT *header; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_free; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + (header == NULL ? 0 : header->size) + + sizeof(next); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (header == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &header->size, sizeof(header->size)); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + bp += header->size; + } + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_pg_free_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_pg_free_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_pg_free_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_pg_free_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + c = ((char *)argp->header.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_pg_free_read __P((void *, __bam_pg_free_args **)); + */ +int +__bam_pg_free_read(recbuf, argpp) + void *recbuf; + __bam_pg_free_args **argpp; +{ + __bam_pg_free_args *argp; + u_int8_t *bp; + + argp = (__bam_pg_free_args *)malloc(sizeof(__bam_pg_free_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->header.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_split_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: DB_LSN *, u_int32_t, db_pgno_t, DB_LSN *, + * PUBLIC: DBT *)); + */ +int __bam_split_log(logp, txnid, ret_lsnp, flags, + fileid, left, llsn, right, rlsn, indx, + npgno, nlsn, pg) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t left; + DB_LSN * llsn; + db_pgno_t right; + DB_LSN * rlsn; + u_int32_t indx; + db_pgno_t npgno; + DB_LSN * nlsn; + DBT *pg; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_split; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(left) + + sizeof(*llsn) + + sizeof(right) + + sizeof(*rlsn) + + sizeof(indx) + + sizeof(npgno) + + sizeof(*nlsn) + + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &left, sizeof(left)); + bp += sizeof(left); + if (llsn != NULL) + memcpy(bp, llsn, sizeof(*llsn)); + else + memset(bp, 0, sizeof(*llsn)); + bp += sizeof(*llsn); + memcpy(bp, &right, sizeof(right)); + bp += sizeof(right); + if (rlsn != NULL) + memcpy(bp, rlsn, sizeof(*rlsn)); + else + memset(bp, 0, sizeof(*rlsn)); + bp += sizeof(*rlsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &npgno, sizeof(npgno)); + bp += sizeof(npgno); + if (nlsn != NULL) + memcpy(bp, nlsn, sizeof(*nlsn)); + else + memset(bp, 0, sizeof(*nlsn)); + bp += sizeof(*nlsn); + if (pg == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pg->size, sizeof(pg->size)); + bp += sizeof(pg->size); + memcpy(bp, pg->data, pg->size); + bp += pg->size; + } +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_split_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_split_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_split_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_split_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tleft: %lu\n", (u_long)argp->left); + printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + printf("\tright: %lu\n", (u_long)argp->right); + printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tnpgno: %lu\n", (u_long)argp->npgno); + printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + c = ((char *)argp->pg.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_split_read __P((void *, __bam_split_args **)); + */ +int +__bam_split_read(recbuf, argpp) + void *recbuf; + __bam_split_args **argpp; +{ + __bam_split_args *argp; + u_int8_t *bp; + + argp = (__bam_split_args *)malloc(sizeof(__bam_split_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->left, bp, sizeof(argp->left)); + bp += sizeof(argp->left); + memcpy(&argp->llsn, bp, sizeof(argp->llsn)); + bp += sizeof(argp->llsn); + memcpy(&argp->right, bp, sizeof(argp->right)); + bp += sizeof(argp->right); + memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); + bp += sizeof(argp->rlsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->npgno, bp, sizeof(argp->npgno)); + bp += sizeof(argp->npgno); + memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); + bp += sizeof(argp->nlsn); + memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_rsplit_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DBT *, DBT *, + * PUBLIC: DB_LSN *)); + */ +int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, pgdbt, rootent, rootlsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DBT *pgdbt; + DBT *rootent; + DB_LSN * rootlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_rsplit; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size) + + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size) + + sizeof(*rootlsn); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pgdbt == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pgdbt->size, sizeof(pgdbt->size)); + bp += sizeof(pgdbt->size); + memcpy(bp, pgdbt->data, pgdbt->size); + bp += pgdbt->size; + } + if (rootent == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &rootent->size, sizeof(rootent->size)); + bp += sizeof(rootent->size); + memcpy(bp, rootent->data, rootent->size); + bp += rootent->size; + } + if (rootlsn != NULL) + memcpy(bp, rootlsn, sizeof(*rootlsn)); + else + memset(bp, 0, sizeof(*rootlsn)); + bp += sizeof(*rootlsn); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_rsplit_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_rsplit_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_rsplit_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_rsplit_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpgdbt: "); + for (i = 0; i < argp->pgdbt.size; i++) { + c = ((char *)argp->pgdbt.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\trootent: "); + for (i = 0; i < argp->rootent.size; i++) { + c = ((char *)argp->rootent.data)[i]; + if (isprint(c) || c == 0xa) + putchar(c); + else + printf("%#x ", c); + } + printf("\n"); + printf("\trootlsn: [%lu][%lu]\n", + (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_rsplit_read __P((void *, __bam_rsplit_args **)); + */ +int +__bam_rsplit_read(recbuf, argpp) + void *recbuf; + __bam_rsplit_args **argpp; +{ + __bam_rsplit_args *argp; + u_int8_t *bp; + + argp = (__bam_rsplit_args *)malloc(sizeof(__bam_rsplit_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pgdbt.data = bp; + bp += argp->pgdbt.size; + memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->rootent.data = bp; + bp += argp->rootent.size; + memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); + bp += sizeof(argp->rootlsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_adj_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t)); + */ +int __bam_adj_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, indx_copy, is_insert) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t indx_copy; + u_int32_t is_insert; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_adj; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(indx_copy) + + sizeof(is_insert); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &indx_copy, sizeof(indx_copy)); + bp += sizeof(indx_copy); + memcpy(bp, &is_insert, sizeof(is_insert)); + bp += sizeof(is_insert); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_adj_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_adj_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_adj_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_adj_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); + printf("\tis_insert: %lu\n", (u_long)argp->is_insert); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_adj_read __P((void *, __bam_adj_args **)); + */ +int +__bam_adj_read(recbuf, argpp) + void *recbuf; + __bam_adj_args **argpp; +{ + __bam_adj_args *argp; + u_int8_t *bp; + + argp = (__bam_adj_args *)malloc(sizeof(__bam_adj_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->indx_copy, bp, sizeof(argp->indx_copy)); + bp += sizeof(argp->indx_copy); + memcpy(&argp->is_insert, bp, sizeof(argp->is_insert)); + bp += sizeof(argp->is_insert); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_cadjust_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: int32_t, int32_t)); + */ +int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, adjust, total) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + int32_t adjust; + int32_t total; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cadjust; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(adjust) + + sizeof(total); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &adjust, sizeof(adjust)); + bp += sizeof(adjust); + memcpy(bp, &total, sizeof(total)); + bp += sizeof(total); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_cadjust_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_cadjust_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_cadjust_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_cadjust_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tadjust: %ld\n", (long)argp->adjust); + printf("\ttotal: %ld\n", (long)argp->total); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_cadjust_read __P((void *, __bam_cadjust_args **)); + */ +int +__bam_cadjust_read(recbuf, argpp) + void *recbuf; + __bam_cadjust_args **argpp; +{ + __bam_cadjust_args *argp; + u_int8_t *bp; + + argp = (__bam_cadjust_args *)malloc(sizeof(__bam_cadjust_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->adjust, bp, sizeof(argp->adjust)); + bp += sizeof(argp->adjust); + memcpy(&argp->total, bp, sizeof(argp->total)); + bp += sizeof(argp->total); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_cdel_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t)); + */ +int __bam_cdel_log(logp, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cdel; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + null_lsn.file = 0; + null_lsn.offset = 0; + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx); + if ((logrec.data = (void *)malloc(logrec.size)) == NULL) + return (ENOMEM); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); +#ifdef DEBUG + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + free(logrec.data); + return (ret); +} + +/* + * PUBLIC: int __bam_cdel_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ + +int +__bam_cdel_print(notused1, dbtp, lsnp, notused3, notused4) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused3; + void *notused4; +{ + __bam_cdel_args *argp; + u_int32_t i; + int c, ret; + + i = 0; + c = 0; + notused1 = NULL; + notused3 = 0; + notused4 = NULL; + + if((ret = __bam_cdel_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %lu\n", (u_long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\n"); + free(argp); + return (0); +} + +/* + * PUBLIC: int __bam_cdel_read __P((void *, __bam_cdel_args **)); + */ +int +__bam_cdel_read(recbuf, argpp) + void *recbuf; + __bam_cdel_args **argpp; +{ + __bam_cdel_args *argp; + u_int8_t *bp; + + argp = (__bam_cdel_args *)malloc(sizeof(__bam_cdel_args) + + sizeof(DB_TXN)); + if (argp == NULL) + return (ENOMEM); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __bam_init_print __P((DB_ENV *)); + */ +int +__bam_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_print, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_print, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_print, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_print, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_print, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_print, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_print, DB_bam_cdel)) != 0) + return (ret); + return (0); +} + +/* + * PUBLIC: int __bam_init_recover __P((DB_ENV *)); + */ +int +__bam_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_recover, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_recover, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_recover, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_recover, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_recover, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_recover, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_recover, DB_bam_cdel)) != 0) + return (ret); + return (0); +} + |